From 9fc49679be65d6da815660d0e56fda36819dde58 Mon Sep 17 00:00:00 2001
From: xgrand <xavier.grand@inserm.fr>
Date: Tue, 30 May 2023 16:57:31 +0200
Subject: [PATCH] Add Dockerfile and scripts to fusion_parser module to
 arriba_fusion.nf

---
 .../fusion_parser/1.0/Dockerfile              |  16 ++
 .../fusion_parser/1.0/docker_init.sh          |   5 +
 .../fusion_parser/1.0/fusion_parser.R         | 163 ++++++++++++++++++
 .../fusion_parser/1.0/install_pkgs.R          |  26 +++
 src/arriba_fusion.nf                          |   9 +-
 src/nf_modules/arriba/main.nf                 |   2 +-
 src/nf_modules/fusion_parser/main.nf          |  25 +++
 7 files changed, 243 insertions(+), 3 deletions(-)
 create mode 100644 src/.docker_modules/fusion_parser/1.0/Dockerfile
 create mode 100755 src/.docker_modules/fusion_parser/1.0/docker_init.sh
 create mode 100644 src/.docker_modules/fusion_parser/1.0/fusion_parser.R
 create mode 100644 src/.docker_modules/fusion_parser/1.0/install_pkgs.R
 create mode 100644 src/nf_modules/fusion_parser/main.nf

diff --git a/src/.docker_modules/fusion_parser/1.0/Dockerfile b/src/.docker_modules/fusion_parser/1.0/Dockerfile
new file mode 100644
index 0000000..4ef9e45
--- /dev/null
+++ b/src/.docker_modules/fusion_parser/1.0/Dockerfile
@@ -0,0 +1,16 @@
+FROM rocker/r-base:4.2.3
+LABEL MAINTAINER "Xavier Grand <xavier.grand@ens-lyon.fr>"
+
+RUN apt update && apt-get install -y apt-transport-https
+RUN apt install -y libxml2-dev libfontconfig1-dev libssl-dev \
+                   libcurl4-openssl-dev libharfbuzz-dev \
+                   libfribidi-dev libfreetype6-dev libpng-dev \
+                   libtiff5-dev libjpeg-dev 
+
+## copy Rscript files
+COPY ./*.R .
+
+RUN Rscript install_pkgs.R
+
+# command to run on container start
+CMD [ "bash" ]
diff --git a/src/.docker_modules/fusion_parser/1.0/docker_init.sh b/src/.docker_modules/fusion_parser/1.0/docker_init.sh
new file mode 100755
index 0000000..0fa64b8
--- /dev/null
+++ b/src/.docker_modules/fusion_parser/1.0/docker_init.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+# docker pull xgrand/fusion_parser:1.0
+docker build src/.docker_modules/fusion_parser/1.0 -t 'xgrand/fusion_parser:1.0'
+docker push xgrand/fusion_parser:1.0
+# docker buildx build --platform linux/amd64,linux/arm64 -t "xgrand/fusion_parser:1.0" --push src/.docker_modules/fusion_parser/1.0
\ No newline at end of file
diff --git a/src/.docker_modules/fusion_parser/1.0/fusion_parser.R b/src/.docker_modules/fusion_parser/1.0/fusion_parser.R
new file mode 100644
index 0000000..cbc30e6
--- /dev/null
+++ b/src/.docker_modules/fusion_parser/1.0/fusion_parser.R
@@ -0,0 +1,163 @@
+#!/usr/bin/env Rscript
+# list.of.packages <- c("optparse", "dplyr", "tidyr", "stringr", "BiocManager", 
+#                       "tidyverse", "tibble", "purrr", "furrr", "future", "vsn",
+#                       "pacman")
+# new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
+# 
+# if (length(new.packages) > 0) {
+#   install.packages(new.packages, dependencies = TRUE, repos = "https://cran.r-project.org")
+# }
+# 
+# list.of.BiocPackages <- c("DESeq2", "BiocParallel")
+# new.BiocPackages <- list.of.BiocPackages[!(list.of.BiocPackages %in% installed.packages()[,"Package"])]
+# 
+# if (length(new.BiocPackages) > 0) {
+#   BiocManager::install(new.BiocPackages, dependencies = TRUE, ask = FALSE)
+# }
+
+pacman::p_load(optparse, tidyverse, stringr, DESeq2, tibble, BiocParallel,
+               furrr, future, future.batchtools, vsn)
+
+# Option parser and loading data:
+option_list = list(
+  make_option(c("-f", "--fusion"), type="character", default=NULL, 
+              help="fusions folder path", metavar="character"),
+  make_option(c("-c", "--count"), type="character", default=NULL,
+              help="htseq counts folder path.", metavar="character"),
+  make_option(c("-d", "--design"), type="character", default=NULL,
+              help="path to design table", metavar="character"),
+  make_option(c("-t", "--threads"), type="integer", default=4,
+              help="number of threads", metavar="integer"),
+  make_option(c("-m", "--memory"), type="integer", default=8,
+              help="memory in Gbytes", metavar="integer")
+)
+
+opt_parser = OptionParser(option_list=option_list)
+opt = parse_args(opt_parser)
+
+# Number of threads for DESeq2:
+register(MulticoreParam(opt$threads))
+
+# Set the number of threads/workers to use
+plan(multisession, workers = opt$threads)
+
+# Set memory limits for parallel workers
+options(future.globals.maxSize = ((opt$memory*1000)*1024^2))
+
+# read design input file:
+design <- read.table(file = opt$design, header = TRUE, row.names = NULL)
+list_sample <- as.list(design$sample)
+
+# Function to parse all file from a list:
+parse_fusion <- function(ech) {
+  # read fusion file:
+  fusion_file <- paste0(ech, "_fusions.tsv")
+  df1 <- read.table(file = paste0(opt$fusion, fusion_file))
+  colnames(df1) <- c("gene1","gene2","strand1",
+                     "strand2","breakpoint1","breakpoint2","site1",
+                     "site2","type","split_reads1","split_reads2",
+                     "discordant_mates","coverage1","coverage2","confidence",
+                     "reading_frame","tags","retained_protein_domains",
+                     "closest_genomic_breakpoint1",
+                     "closest_genomic_breakpoint2","gene_id1","gene_id2",
+                     "transcript_id1","transcript_id2","direction1",
+                     "direction2","filters","fusion_transcript",
+                     "peptide_sequence","read_identifiers")
+  # filter fusion file:
+  df1 <- filter(df1, type  == 'deletion/read-through' |
+                  type == "deletion/read-through/5'-5'" |
+                  type == "deletion/read-through/3'-3'")
+  df1 <- df1[df1$strand1 == df1$strand2,]
+  df1 <- df1 %>% filter(strand1 %in% c("+/+", "-/-"))
+  
+  df1 <- df1 %>% select(gene1, gene2, strand1, split_reads1, split_reads2, 
+                        discordant_mates, type, gene_id1, gene_id2)
+  
+  htseq_file <- paste0(ech, ".tsv")
+  htseq_df <- read.table(file = paste0(opt$count, htseq_file), header = FALSE)
+  colnames(htseq_df) <- c("gene_id", "count")
+  
+  df1 <- left_join(df1, htseq_df, by = join_by(gene_id1 == gene_id),
+                   suffix = c(".x", ".y"))
+  colnames(df1)[10] <- "count1"
+  df1 <- left_join(df1, htseq_df, by = join_by(gene_id2 == gene_id),
+                   suffix = c(".x", ".y"))
+  colnames(df1)[11] <- "count2"
+  colnames(df1)[3] <- "strand"
+
+  df1 <- df1 %>% dplyr::mutate(ID = paste0(gene1, "_", gene2))
+
+  df2 <- df1 %>% dplyr::group_by(ID) %>%
+    dplyr::summarise(reads1=sum(split_reads1),
+                     reads2=sum(split_reads2),
+                     discordant=sum(discordant_mates)) %>%
+    dplyr::rowwise() %>% 
+    dplyr::mutate(reads_total = sum(reads1, 
+                                  reads2, 
+                                  discordant)
+           ) %>%
+    dplyr::select(ID, reads_total)
+  
+  colnames(df2)[2] <- ech
+  
+  df1 <- df1 %>% select(all_of(c("ID", "gene_id1", "gene_id2", 
+                                 "split_reads1", "split_reads2", 
+                                 "discordant_mates", "type", "count1", "count2",
+                                 "strand")))
+  
+  write.table(df1, file = paste0(opt$output, ech, "_parsed_fusion.csv"), 
+              sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE)
+  
+  df2$ID <- as.factor(df2$ID)
+  return(df2)
+}
+
+# all_results <- map(list_sample, parse_fusion)
+all_results <- furrr::future_map(list_sample, parse_fusion)
+gc()
+concat_res <- purrr::reduce(all_results, full_join, by = "ID")
+concat_res[is.na(concat_res)] <- 0
+
+# DESeq2:
+## Prepare the dataset:
+deseq_df <- as.matrix(concat_res %>% column_to_rownames(var = "ID"))
+exp_design <- design %>% column_to_rownames(var = "sample")
+exp_design$condition <- as.factor(exp_design$condition)
+exp_design$rep <- as.factor((exp_design$rep))
+
+dds <- DESeq2::DESeqDataSetFromMatrix(countData=deseq_df, 
+                                      colData=exp_design, 
+                                      design=~condition + rep, 
+                                      tidy = FALSE)
+
+## Filtering:
+keep <- rowSums(counts(dds) > 1) >= 3
+dds <- dds[keep,]
+
+dds <- DESeq2::DESeq(dds)
+
+# Plots:
+pdf(paste0(opt$output, "rplot.pdf"))
+res <- results(dds)
+plotMA(res) #, ylim=c(-2,2))
+plotDispEsts(dds)
+
+# this gives log2(n + 1)
+# ntd <- normTransform(dds)
+# meanSdPlot(assay(ntd))
+dev.off()
+
+# save results:
+save_deseq <- function(comparaison, dds) {
+  tmp <- as.data.frame(DESeq2::results(dds, contrast=c("condition",comparaison)))
+  tmp <- tibble::rownames_to_column(tmp, "ID")
+  write.table(tmp, 
+              file = paste0(opt$output, comparaison[1], "_", comparaison[2], ".tsv"), 
+              row.names = FALSE,
+              dec = ",",
+              sep = "\t",
+              qmethod = "double"
+  )
+}
+
+sapply(list_comparaison, save_deseq, dds)
diff --git a/src/.docker_modules/fusion_parser/1.0/install_pkgs.R b/src/.docker_modules/fusion_parser/1.0/install_pkgs.R
new file mode 100644
index 0000000..37f282d
--- /dev/null
+++ b/src/.docker_modules/fusion_parser/1.0/install_pkgs.R
@@ -0,0 +1,26 @@
+#!/usr/bin/env Rscript
+list.of.packages <- c("optparse", "dplyr", "tidyr", "stringr", "BiocManager",
+                      "tidyverse", "tibble", "purrr", "furrr", "future", "vsn",
+                      "pacman", "gargle", "ids", "systemfonts", "textshaping",
+                      "googledrive", "googlesheets4", "httr", "ragg", "rvest",
+                      "xml2", "textshaping", "ragg")
+
+new.packages <- list.of.packages[!(list.of.packages %in%
+installed.packages()[,"Package"])]
+
+if (length(new.packages) > 0) {
+  install.packages(new.packages,
+                   dependencies = TRUE,
+                   repos = "https://cran.r-project.org")
+}
+
+list.of.BiocPackages <- c("DESeq2", "BiocParallel")
+new.BiocPackages <- list.of.BiocPackages[!(list.of.BiocPackages %in%
+                                             installed.packages()[,"Package"])]
+
+if (length(new.BiocPackages) > 0) {
+  BiocManager::install(new.BiocPackages, dependencies = TRUE, ask = FALSE)
+}
+
+pacman::p_load(optparse, tidyverse, stringr, DESeq2, tibble, BiocParallel,
+               furrr, future, future.batchtools, vsn)
\ No newline at end of file
diff --git a/src/arriba_fusion.nf b/src/arriba_fusion.nf
index 52b2f5c..0d5c7db 100644
--- a/src/arriba_fusion.nf
+++ b/src/arriba_fusion.nf
@@ -74,6 +74,7 @@ params.bam = ""
 params.genome = "/home/xavier/Data/Genome/hg19/Homo_sapiens.GRCh37.dna.primary_assembly.fa"
 params.gtf = "/home/xavier/Data/Genome/hg19/Homo_sapiens.GRCh37.87.gtf"
 params.index = "/home/xavier/Data/Genome/hg19/STAR"
+params.htseq_param = "yes"
 
 /* Params out */
 params.fastp_out = "02_fastp"
@@ -103,6 +104,7 @@ if(params.index) {
 } else {
   index_list = ""
 }
+log.info "htseq param: -s ${params.htseq_param}"
 
 /*
  ****************************************************************
@@ -157,6 +159,7 @@ include { filter_bam_quality } from './nf_modules/samtools/main.nf'
 include { index_with_gtf } from './nf_modules/star/main_2.7.8a.nf'
 include { mapping2fusion } from './nf_modules/star/main_2.7.8a.nf'
 include { index_bam } from './nf_modules/samtools/main.nf'
+include { htseq_count } from './nf_modules/htseq/main.nf' addParams(htseq_out: '09_htseq_count', htseq_param: "${params.htseq_param}" )
 include { arriba } from "./nf_modules/arriba/main.nf"
 include { draw_fusions } from "./nf_modules/arriba/main.nf"
 
@@ -176,13 +179,13 @@ workflow {
 
   if(params.fastq != ""){
     fastp(fastq_files)
-    fastqc_raw(fastq_files)
+    /* fastqc_raw(fastq_files)
     fastqc_preprocessed(fastp.out.fastq)
     multiqc(fastqc_raw.out.report)
      .mix(
        fastqc_preprocessed.out.report
        ).collect()
-
+  */
 /*
  ****************************************************************
                           Mapping
@@ -205,6 +208,7 @@ workflow {
 
     filter_bam_quality(mapping2fusion.out.bam)
     index_bam(filter_bam_quality.out.bam.collect())
+    htseq_count(index_bam.out.bam_idx.collect(), gtf_file.collect())
 
 /*
  ****************************************************************
@@ -217,6 +221,7 @@ workflow {
   }
   else {
     index_bam(bam_files.collect())
+    htseq_count(index_bam.out.bam_idx.collect(), gtf_file.collect())
     arriba(index_bam.out.bam_idx.collect(), gtf_file.collect(), genome_file.collect())
     draw_fusions(arriba.out.fusions, index_bam.out.bam_idx, gtf_file)
   }
diff --git a/src/nf_modules/arriba/main.nf b/src/nf_modules/arriba/main.nf
index be1630a..9ef3413 100644
--- a/src/nf_modules/arriba/main.nf
+++ b/src/nf_modules/arriba/main.nf
@@ -49,7 +49,7 @@ process draw_fusions{
 
   script:
 """
-draw_fusions.R \
+Rscript /usr/local/bin/arriba_v${version}/draw_fusions.R \
     --fusions=${fusions} \
     --alignments=${bam} \
     --output=${fusion_id}_fusions.pdf \
diff --git a/src/nf_modules/fusion_parser/main.nf b/src/nf_modules/fusion_parser/main.nf
new file mode 100644
index 0000000..ceac664
--- /dev/null
+++ b/src/nf_modules/fusion_parser/main.nf
@@ -0,0 +1,25 @@
+version = "1.0"
+container_url = "xgrand/fusion_parser:${version}"
+
+process parsefusion{
+  container = "${container_url}"
+  label "big_mem_multi_cpus"
+  tag "${bam_id}"
+  if (params.fusion_out != "") {
+    publishDir "results/${params.fusion_out}", mode: 'copy'
+  }
+
+  input:
+  path(fusion)
+  path(count)
+  path(design)
+
+  output:
+  tuple val(bam_id), path("*.tsv"), emit: dfg
+
+  script:
+  memory = "${task.memory}" - ~/\s*GB/
+"""
+Rscript fusion_parser.R --fusion ${fusion} --count ${count} --design ${design} --threads ${task.cpus} --memory ${memory}
+"""
+}
\ No newline at end of file
-- 
GitLab