Add STAR 2.7.8a module; modify RNAseq_XGR.nf and arriba_fusion.nf

fd4214ae · Xavier Grand · 82f27265 · fd4214ae · fd4214ae · fd4214ae
Commit fd4214ae authored 2 years ago by Xavier Grand
--- a/src/.docker_modules/star/2.7.8a/Dockerfile
+++ b/src/.docker_modules/star/2.7.8a/Dockerfile
+# STAR 2.7.8a image, based directly on the Biocontainers build.
+FROM quay.io/biocontainers/star:2.7.8a--0
+# The MAINTAINER instruction is deprecated; record the maintainer as an image label instead.
+LABEL maintainer="Xavier Grand"
--- a/src/.docker_modules/star/2.7.8a/docker_init.sh
+++ b/src/.docker_modules/star/2.7.8a/docker_init.sh
+#!/bin/sh
+docker pull xgrand/star:2.7.8a
+# docker build src/.docker_modules/star/2.7.8a/ -t 'xgrand/star:2.7.8a'
+# docker push xgrand/star:2.7.8a
+docker buildx build --platform linux/amd64,linux/arm64 -t "xgrand/star:2.7.8a" --push src/.docker_modules/star/2.7.8a
\ No newline at end of file
--- a/src/RNAseq_XGR.nf
+++ b/src/RNAseq_XGR.nf
@@ -22,7 +22,7 @@ def helpMessage() {
    Usage:
    The typical command for running the pipeline is as follows:
-      nextflow ./src/star_fusion.nf -c ./src/nextflow.config -profile singularity
+      nextflow ./src/RNAseq_XGR.nf -c ./src/nextflow.config -profile singularity
    Mandatory arguments:
      --project [path]                Path to the project folder. Results are saved in this folder.

--- a/src/arriba_fusion.nf
+++ b/src/arriba_fusion.nf
@@ -30,10 +30,10 @@ def helpMessage() {
                                      Available: docker, singularity, podman, psmn, ccin2p3
    Input:
-      --fastq [path]                  Path to fastq folder.
+      --fastq [path]                  Path to fastq files.
-      --bam [path]                    Path to the bam-containing folder.
+      --bam [path]                    Path to the bam files.
-    References:
+    References:                       Can be downloaded with download_references.sh (not implemented in pipeline).
      --genome [path]                 Path to genome reference fasta file.
      --gtf [path]                    Path to genome annotation gtf file.
@@ -61,10 +61,11 @@ if (params.help || params.h) {
 */
 params.project = ""
+params.bam_folder = ""
+params.genome = ""
+params.gtf = ""
 params.bam = ""
 params.fastq = ""
-if (params.genome) { params.genome = path(params.genome, checkIfExists: true) } else { exit 1, "No genome specified." }
-if (params.gtf) { params.gtf = path(params.gtf, checkIfExists: true) } else { exit 1, "No annotation specified." }
 /* Params out */
 params.fastp_out = "$params.project/fastp/"
@@ -80,6 +81,12 @@ params.index_bam_out = "$params.project/Bam_filt_sort_indexed/"
 log.info "Reference genome : ${params.genome}"
 log.info "Genome annotation : ${params.gtf}"
+// Report which input is used. The test is on params.bam, the parameter the
+// bam channel is built from and the one the workflow block checks.
+if(params.bam != "") {
+  log.info "bam files (--bam): ${params.bam}"
+}
+else {
+  log.info "fastq files (--fastq): ${params.fastq}"
+}
 /*
 ****************************************************************
@@ -87,23 +94,28 @@ log.info "Genome annotation : ${params.gtf}"
 ****************************************************************
 */
-if(params.bam != "") {
+// Build bam_files when --bam is given, fastq_files otherwise. The test must be
+// on params.bam (the value handed to fromPath and checked by the workflow
+// block); otherwise the channel the workflow consumes is never defined.
+if(params.bam != "") {
    Channel
        .fromPath( params.bam )
+        .ifEmpty { error "Cannot find any bam files in: ${params.bam}" }
+        .map { it -> [it.simpleName, it]}
        .set { bam_files }
 }
 else {
    Channel
-        .fromFilePairs( params.fastq, size = -1 )
+        .fromFilePairs( params.fastq, size: -1)
-        .set(fastq_files)    
+        .set { fastq_files }
 }
 Channel
  .fromPath( params.genome )
+  .ifEmpty { error "Cannot find any fasta files in: ${params.genome}" }
+  .map { it -> [it.simpleName, it]}
  .set { genome }
 Channel
  .fromPath( params.gtf )
+  .ifEmpty { error "Cannot find any annotation files in: ${params.gtf}" }
  .set { gtf }
 /*
@@ -113,9 +125,11 @@ Channel
 */
 include { fastp } from './nf_modules/fastp/main.nf'
-include { fastqc_fastq as fastqc_raw } from fastqc_mod addParams(fastqc_fastq_out: "$params.project/01_fastqc_raw/")
+include { fastqc_fastq as fastqc_raw } from './nf_modules/fastqc/main.nf' addParams(fastqc_fastq_out: "$params.project/01_fastqc_raw/")
-include { fastqc_fastq as fastqc_preprocessed } from fastqc_mod addParams(fastqc_fastq_out: "$params.project/02_fastqc_preprocessed/")
+include { fastqc_fastq as fastqc_preprocessed } from './nf_modules/fastqc/main.nf' addParams(fastqc_fastq_out: "$params.project/02_fastqc_preprocessed/")
 include { multiqc } from './nf_modules/multiqc/main.nf' addParams(multiqc_out: "$params.project/QC/")
+include { index_with_gtf } from './nf_modules/star/main_2.7.8a.nf' addParams(star_mapping_fastq_out: "$params.project/STAR_index/")
+include { mapping_fastq_withChimeric } from './nf_modules/star/main_2.7.8a.nf' addParams(star_mapping_fastq_out: "$params.project/STAR/")
 include { arriba } from "./nf_modules/arriba/main.nf"
 /*
@@ -127,26 +141,20 @@ include { arriba } from "./nf_modules/arriba/main.nf"
 workflow {
  if(params.bam == ""){
-    fastp()
+    fastp(fastq_files)
-    fastqc_raw()
+    // fastqc_raw(fastq_files.collect())
-    fastqc_preprocessed()
+    // fastqc_preprocessed(fastp_out.fastq.collect())
-    multiqc()
+    // multiqc(fastqc_raw_out.report)
-    .mix(
+    // .mix(
-      fastqc_preprocessed.out.report
+    //   fastqc_preprocessed.out.report
-      ).collect()
+    //   ).collect()
-    index_fasta()
+    index_with_gtf(genome, gtf)
-    mapping_fastq()
+    // mapping_fastq_withChimeric(index_fasta_out.index, fastp_out.fastq)
-    filter_bam_quality()
+    // filter_bam_quality(mapping_fastq_withChimeric_out.bam)
-    sort_bam()
+    // arriba()
-    index_bam()
+  }
+  else {
+    arriba(bam_files, gtf, genome)
  }
-  //###################### ARRIBA FUSION ########################
-  arriba(fastq_files, gtf, genome)
-  //################ GRAPHICAL REPRESENTATIONS ##################
 }
\ No newline at end of file
--- a/src/nextflow.config
+++ b/src/nextflow.config
@@ -18,7 +18,7 @@ profiles {
    docker.enabled = true
    process {
      errorStrategy = 'finish'
-      memory = '16GB'
+      memory = '15GB'
      withLabel: big_mem_mono_cpus {
        cpus = 1
      }
@@ -47,7 +47,7 @@ profiles {
    podman.enabled = true
    process {
      errorStrategy = 'finish'
-      memory = '16GB'
+      memory = '15GB'
      withLabel: big_mem_mono_cpus {
        cpus = 1
      }
@@ -77,7 +77,7 @@ profiles {
    singularity.cacheDir = "./bin/"
    process {
      errorStrategy = 'finish'
-      memory = '16GB'
+      memory = '15GB'
      withLabel: big_mem_mono_cpus {
        cpus = 1
      }

--- a/src/nf_modules/star/main_2.7.8a.nf
+++ b/src/nf_modules/star/main_2.7.8a.nf
+// STAR release handled by this module; interpolated into the container tag below.
+version = "2.7.8a"
+// Container image used by the STAR indexing and mapping processes of this file.
+container_url = "xgrand/star:${version}"
+// Sub-directory (under "results/") where mapping outputs are published.
+// While it stays empty, the mapping processes do not declare a publishDir.
+params.star_mapping_fastq_out = ""
+// gff3_2_gtf: convert a GFF3 annotation into GTF with gffread.
+//   input : tuple (genome_id, gff3_file)
+//   output: "<genome_id>.gtf", emitted as `gtf`
+// NOTE(review): the image "dceoy/cufflinks" has no tag, so the tool version
+// is not pinned — consider pinning one for reproducibility.
+process gff3_2_gtf {
+  // Directives use the documented `name value` form, not an assignment.
+  container "dceoy/cufflinks"
+  label "small_mem_mono_cpus"
+  input:
+    tuple val(genome_id), path(gff3_file)
+  output:
+    path "${genome_id}.gtf", emit: gtf
+  script:
+"""
+gffread ${gff3_file} -T -o ${genome_id}.gtf
+"""
+}
+// index_with_gtf: build a STAR genome index using a GTF annotation.
+//   input : tuple (genome_id, genome_fasta) and a GTF file
+//   output: tuple (genome_id, every file present in the task directory),
+//           emitted as `index`
+// STAR writes the index into the task directory (--genomeDir ./), which is
+// why the output is declared with the "*" glob.
+// --genomeSAindexNbases is fixed at 13; the trailing shell comment in the
+// script records the formula the value is derived from.
+process index_with_gtf {
+  // Directives use the documented `name value` form, not an assignment.
+  container "${container_url}"
+  label "big_mem_multi_cpus"
+  input:
+    tuple val(genome_id), path(genome_fasta)
+    path gtf_file
+  output:
+    tuple val(genome_id), path ("*"), emit: index
+  script:
+"""
+STAR --runThreadN ${task.cpus} --runMode genomeGenerate \
+--genomeDir ./ \
+--genomeFastaFiles ${genome_fasta}  \
+--sjdbGTFfile ${gtf_file} \
+--genomeSAindexNbases 13 # min(14, log2(GenomeLength)/2 - 1)
+"""
+}
+// index_with_gff: build a STAR index from an annotation supplied as GFF3.
+// The annotation is first converted to GTF by gff3_2_gtf, then handed to
+// index_with_gtf together with the genome.
+workflow index_with_gff {
+  take:
+    // channel of (genome_id, fasta) tuples, as expected by index_with_gtf
+    genome_fasta
+    // channel of (genome_id, gff3) tuples, as expected by gff3_2_gtf
+    gff_file
+  main:
+    gff3_2_gtf(gff_file)
+    index_with_gtf(genome_fasta,gff3_2_gtf.out.gtf)
+  emit:
+    // despite its name, `report` carries the STAR index tuple
+    report = index_with_gtf.out.index
+}
+// index_without_gff: build a STAR genome index from the fasta alone
+// (no annotation is passed to STAR).
+//   input : tuple (genome_id, genome_fasta)
+//   output: tuple (genome_id, every file present in the task directory),
+//           emitted as `index`
+// STAR writes the index into the task directory (--genomeDir ./), which is
+// why the output is declared with the "*" glob.
+process index_without_gff {
+  // Directives use the documented `name value` form, not an assignment.
+  container "${container_url}"
+  label "big_mem_multi_cpus"
+  input:
+    tuple val(genome_id), path(genome_fasta)
+  output:
+    tuple val(genome_id), path ("*"), emit: index
+  script:
+"""
+STAR --runThreadN ${task.cpus} --runMode genomeGenerate \
+--genomeDir ./ \
+--genomeFastaFiles ${genome_fasta}  \
+--genomeSAindexNbases 13 # min(14, log2(GenomeLength)/2 - 1)
+"""
+}
+// mapping_fastq: align gzipped fastq reads with STAR and produce a
+// coordinate-sorted BAM.
+//   input : tuple (index_id, index files) and tuple (reads_id, reads)
+//   output: "*.Log.final.out" emitted as `report`;
+//           tuple (reads_id, "*.bam") emitted as `bam`
+// Results are published under "results/<params.star_mapping_fastq_out>" only
+// when that parameter is not empty.
+process mapping_fastq {
+  // Directives use the documented `name value` form, not an assignment.
+  container "${container_url}"
+  label "big_mem_multi_cpus"
+  if (params.star_mapping_fastq_out != "") {
+    publishDir "results/${params.star_mapping_fastq_out}", mode: 'copy'
+  }
+  input:
+    tuple val(index_id), path(index)
+    tuple val(reads_id), path(reads) 
+  output:
+    path "*.Log.final.out", emit: report
+    tuple val(reads_id), path("*.bam"), emit: bam
+  script:
+// reads_id may be a List; its first element is then used as the file prefix.
+// The prefix (not reads_id itself) is what gets interpolated into the command,
+// so that a List id does not end up rendered into the shell script.
+def file_prefix = (reads_id instanceof List) ? reads_id[0] : reads_id
+// Two read files: both mates are passed explicitly to --readFilesIn.
+if (reads.size() == 2)
+"""
+mkdir -p index
+mv ${index} index/
+STAR --runThreadN ${task.cpus} \
+--genomeDir index/ \
+--readFilesCommand zcat \
+--readFilesIn ${reads[0]} ${reads[1]} \
+--outFileNamePrefix ${file_prefix}. \
+--alignIntronMax 10000 \
+--outSAMtype BAM SortedByCoordinate \
+--outSAMstrandField intronMotif
+mv ${file_prefix}.Aligned.sortedByCoord.out.bam ${file_prefix}.bam
+"""
+else
+"""
+mkdir -p index
+mv ${index} index/
+STAR --runThreadN ${task.cpus} \
+--genomeDir index/ \
+--readFilesCommand zcat \
+--readFilesIn ${reads} \
+--outFileNamePrefix ${file_prefix}. \
+--alignIntronMax 10000 \
+--outSAMtype BAM SortedByCoordinate \
+--outSAMstrandField intronMotif
+mv ${file_prefix}.Aligned.sortedByCoord.out.bam ${file_prefix}.bam
+"""
+}
+// mapping_fastq_withChimeric: same alignment as mapping_fastq, with
+// `--chimOutType WithinBAM` added to the STAR command line.
+//   input : tuple (index_id, index files) and tuple (reads_id, reads)
+//   output: "*.Log.final.out" emitted as `report`;
+//           tuple (reads_id, "*.bam") emitted as `bam`
+// Results are published under "results/<params.star_mapping_fastq_out>" only
+// when that parameter is not empty.
+// NOTE(review): the STAR manual documents --chimSegmentMin as defaulting to 0,
+// which is described as disabling chimeric detection; it is not set here.
+// Confirm whether it (and the other chim* options) must be added for the
+// chimeric reads to actually be written.
+process mapping_fastq_withChimeric {
+  // Directives use the documented `name value` form, not an assignment.
+  container "${container_url}"
+  label "big_mem_multi_cpus"
+  if (params.star_mapping_fastq_out != "") {
+    publishDir "results/${params.star_mapping_fastq_out}", mode: 'copy'
+  }
+  input:
+    tuple val(index_id), path(index)
+    tuple val(reads_id), path(reads) 
+  output:
+    path "*.Log.final.out", emit: report
+    tuple val(reads_id), path("*.bam"), emit: bam
+  script:
+// reads_id may be a List; its first element is then used as the file prefix.
+// The prefix (not reads_id itself) is what gets interpolated into the command,
+// so that a List id does not end up rendered into the shell script.
+def file_prefix = (reads_id instanceof List) ? reads_id[0] : reads_id
+// Two read files: both mates are passed explicitly to --readFilesIn.
+if (reads.size() == 2)
+"""
+mkdir -p index
+mv ${index} index/
+STAR --runThreadN ${task.cpus} \
+--genomeDir index/ \
+--readFilesCommand zcat \
+--readFilesIn ${reads[0]} ${reads[1]} \
+--outFileNamePrefix ${file_prefix}. \
+--alignIntronMax 10000 \
+--outSAMtype BAM SortedByCoordinate \
+--outSAMstrandField intronMotif \
+--chimOutType WithinBAM
+mv ${file_prefix}.Aligned.sortedByCoord.out.bam ${file_prefix}.bam
+"""
+else
+"""
+mkdir -p index
+mv ${index} index/
+STAR --runThreadN ${task.cpus} \
+--genomeDir index/ \
+--readFilesCommand zcat \
+--readFilesIn ${reads} \
+--outFileNamePrefix ${file_prefix}. \
+--alignIntronMax 10000 \
+--outSAMtype BAM SortedByCoordinate \
+--outSAMstrandField intronMotif \
+--chimOutType WithinBAM
+mv ${file_prefix}.Aligned.sortedByCoord.out.bam ${file_prefix}.bam
+"""
+}
\ No newline at end of file