Skip to content
Snippets Groups Projects
Select Git revision
  • a6ff34dddb7eaa3c4755bc1e7032cbaecab95bbd
  • master default protected
  • star
  • dev
  • v0.4.0
  • v0.3.0
  • v0.2.9
  • v0.2.8
  • v0.2.7
  • v0.2.6
  • v0.1.0
  • v0.2.5
  • v0.2.4
  • v0.2.3
  • v0.2.2
  • v0.2.1
  • v0.2.0
  • v0.1.2
18 results

RNAseq.nf

Blame
  • Forked from LBMC / nextflow
    Source project has a limited visibility.
    RNAseq.nf 13.32 KiB
    /*
    *	RNAseq Analysis pipeline
    */
    
    //////////////////////////////////////////////////////
    //                    PARAMETERS                    //
    //////////////////////////////////////////////////////
    
    ///////////////////////
    // PARMS : FILE PATH //
    ///////////////////////
    
    params.fastq_raw = "data/fastq/*{_R1,_R2}.fastq.gz"
    params.output = "results"
    params.filter = "data/filter/human_rRNA_tRNA/*.bt2"
    params.index_genome = "data/genome/*.ht2"
    params.gtf = "data/annotation/*.gtf"
    params.gtf_collapse = "data/annotation/*.gtf"
    params.index_postgenome = "data/post_genome/*.ht2"
    
    /////////////////////
    // PARMS : OPTIONS //
    /////////////////////
    
    params.do_fastqc = true
    params.do_dedup = true
    params.do_postgenome = true
    
    ////////////////////////
    // ADAPTORS SEQUENCES //
    ////////////////////////
    
    params.adaptorR1 = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCA"
    params.adaptorR2 = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT"
    
    //////////////
    // LOG INFO //
    //////////////
    
    log.info "input raw : ${params.fastq_raw}"
    log.info "outut directory : ${params.output}"
    log.info "filter index files : ${params.filter}"
    log.info "genome index : ${params.index_genome}"
    log.info "gtf file : ${params.gtf}"
    log.info "collapsed gtf file for rnaseqc: ${params.gtf_collapse}"
    log.info "post-genome index : ${params.index_postgenome}"
    log.info ""
    log.info "do fastqc ? : ${params.do_fastqc}"
    log.info "do deduplication ? : ${params.do_dedup}"
    log.info "do post genome alignement ? : ${params.do_postgenome}"
    log.info ""
    
    //////////////
    // CHANNELS //
    //////////////
    
    Channel
       .fromFilePairs(params.fastq_raw)
       .ifEmpty { error "Cannot find any file matching: ${params.fastq_raw}" }
       .into {INPUT_FASTQC_RAW;
              INPUT_CUTADAPT}
    
    Channel
       .fromPath( params.filter )
       .ifEmpty { error "Cannot find any index files matching: ${params.filter}" }
       .set { FILTER_INDEX }
    
    Channel
       .fromPath ( params.index_genome )
       .ifEmpty { error "Cannot find any index files matching: ${params.index_genome}" }
       .set { GENOME_INDEX }
    
    Channel
       .fromPath( params.gtf )
       .ifEmpty { error "Cannot find any gtf file matching: ${params.gtf}" }
       .set { GTF_FILE }
    
    Channel
      .fromPath( params.gtf_collapse )
      .ifEmpty { error "Cannot find any gtf file matching: ${params.gtf_collapse}" }
      .set { GTF_COLLAPSE }
    
    Channel
       .fromPath ( params.index_postgenome )
       .ifEmpty { error "Cannot find any index files matching: ${params.index_postgenome}" }
       .set { POSTGENOME_INDEX }
    
    //////////////////////////////////////////////////////
    //                     PROCESS                      //
    //////////////////////////////////////////////////////
    
    ////////////////////////////////////////////////////////////////////////
    //////////////////////////// PRE PROCESS ///////////////////////////////
    ////////////////////////////////////////////////////////////////////////
    
    
    /////////////////////////
    /* Fastqc of raw input */
    /////////////////////////
    
    process fastqc_raw {
      tag "$file_id"
      publishDir "${params.output}/00_fastqc/raw/", mode: 'copy'
    
      input:
      set file_id, file(reads) from INPUT_FASTQC_RAW
    
      output:
      file "*_fastqc.{html,zip}" into OUTPUT_FASTQC_RAW
    
      when:
      params.do_fastqc
    
        """
        fastqc ${file_id}* -t ${task.cpus}
        """
    }
    
    ///////////////////////
    /* Trimming adaptors */
    ///////////////////////
    
    process trimming {
      tag "$file_id"
      publishDir "${params.output}/01_cutadapt/", mode: 'copy'
      echo true
    
      input:
      set file_id, file(reads) from INPUT_CUTADAPT
    
      output:
      set file_id, "*cut_{R1,R2}.fastq.gz" into CUTADAPT_OUTPUT
      file "*first_report.txt" into CUTADAPT_LOG
      file "*{second,third}_report.txt" into CUTADAPT_LOG_2
    
    
      script:
      """
      cutadapt -j ${task.cpus} \
               -a "AGATCGGAAGAGCACACGTCTGAACTCCAGTCA" \
               -A "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT" \
               -o ${file_id}_tmp_R1.fastq.gz \
               -p ${file_id}_tmp_R2.fastq.gz \
               --minimum-length 70 \
               ${reads[0]} ${reads[1]} > ${file_id}_first_report.txt
    
      cutadapt -j ${task.cpus} \
               -a "A{100}" \
               -o ${file_id}_cut_R1.fastq.gz \
               ${file_id}_tmp_R1.fastq.gz \
                  > ${file_id}_second_report.txt
    
      cutadapt -j ${task.cpus} \
             -u -13 \
             -o ${file_id}_cut_R2.fastq.gz \
             ${file_id}_tmp_R2.fastq.gz \
               > ${file_id}_third_report.txt
      """
    }
    
    CUTADAPT_OUTPUT.into{CUTADAPT_OUTPUT_FASTQC;
                         CUTADAPT_OUTPUT_FILTER}
    
    /////////////////////////
    /* Fastqc of raw input */
    /////////////////////////
    
    process fastqc_cut {
      tag "$file_id"
      publishDir "${params.output}/00_fastqc/cut/", mode: 'copy'
    
      input:
      set file_id, file(reads) from CUTADAPT_OUTPUT_FASTQC
    
      output:
      file "*.{html,zip}" into OUTPUT_FASTQC_CUT
    
      when:
      params.do_fastqc
    
        """
        fastqc ${file_id}* -t ${task.cpus}
        """
    }
    
    /////////////////////////////
    /* rRNA and tRNA filtering */
    /////////////////////////////
    
    process rRNA_removal {
      tag "$file_id"
      publishDir "${params.output}/02_rRNA_depletion/", mode: 'copy'
    
      input:
      set file_id, file(reads) from CUTADAPT_OUTPUT_FILTER
      file index from FILTER_INDEX.toList()
    
      output:
      set file_id, "*.fastq.gz" into FILTER_FASTQ
      set file_id, "*.bam*" into FILTER_BAM
      file "*.{txt,stats}" into FILTER_LOG
    
      script:
      index_id = index[0]
      for (index_file in index) {
        if (index_file =~ /.*\.1\.bt2/ && !(index_file =~ /.*\.rev\.1\.bt2/)) {
            index_id = ( index_file =~ /(.*)\.1\.bt2/)[0][1]
        }
      }
    """
    bowtie2 --sensitive -p ${task.cpus} -x ${index_id} \
    -1 ${reads[0]} -2 ${reads[1]} --no-unal \
    --un-conc-gz ${file_id}_R%.fastq.gz 2> \
    ${file_id}_filter.txt | samtools view -bS - \
    | samtools sort -@ ${task.cpus} -o ${file_id}.filter.bam \
                && samtools index ${file_id}.filter.bam \
                && samtools idxstats ${file_id}.filter.bam  > \
                   ${file_id}.filter.stats
    
    if grep -q "Error " ${file_id}_filter.txt; then
      exit 1
    fi
    """
    }
    FILTER_FASTQ.into{FILTER_FASTQ_FASTQC;
                      FILTER_FASTQ_HISAT;
                      FILTER_FASTQ_POSTGENOME
                    }
    
    /////////////////////////////
    /* Fastqc of filtred reads */
    /////////////////////////////
    
    process fastqc_filter {
      tag "$file_id"
      publishDir "${params.output}/00_fastqc/filter/", mode: 'copy'
    
      input:
      set file_id, file(reads) from FILTER_FASTQ_FASTQC
    
      output:
      set file_id, "*.{html,zip}" into OUTPUT_FASTQC_FILTER
    
      when:
      params.do_fastqc
    
        """
        fastqc ${file_id}* -t ${task.cpus}
        """
    }
    
    
    ///////////////////////////////////////////////////////////////////
    //////////////////////////// GENOME ///////////////////////////////
    ///////////////////////////////////////////////////////////////////
    
    
    ///////////////////////////////////////////////
    /*	mapping against human genome with hisat2 */
    ///////////////////////////////////////////////
    
    process hisat2_human {
      tag "$file_id"
      publishDir "${params.output}/03_hisat2/", mode: 'copy'
    
      input:
        set file_id, file(fastq_filtred) from FILTER_FASTQ_HISAT
        file index from GENOME_INDEX.toList()
    
      output:
        set file_id, "*_notaligned_R{1,2}.fastq.gz" into HISAT_UNALIGNED
        set file_id, "*.{bam,bam.bai}" into HISAT_ALIGNED
        file "*.txt" into HISAT_LOG
    
      script:
      index_id = index[0]
      for (index_file in index) {
        if (index_file =~ /.*\.1\.ht2/ && !(index_file =~ /.*\.rev\.1\.ht2/)) {
            index_id = ( index_file =~ /(.*)\.1\.ht2/)[0][1]
        }
      }
    """
    hisat2 -x ${index_id} \
           -p ${task.cpus} \
           -1 ${fastq_filtred[0]} \
           -2 ${fastq_filtred[1]} \
           --un-conc-gz ${file_id}_notaligned_R%.fastq.gz \
           --rna-strandness 'FR' \
           --dta \
           2> ${file_id}_genome.txt \
    | samtools view -bS -F 4 - \
    | samtools sort -@ ${task.cpus} -o ${file_id}.bam \
    && samtools index ${file_id}.bam
    
    if grep -q "ERR" ${file_id}.txt; then
      exit 1
    fi
    """
    }
    
    HISAT_ALIGNED.into{HISAT_ALIGNED_FASTQC;
                       HISAT_ALIGNED_DEDUP}
    
    ////////////////////////////
    /* Fastqc of genome reads */
    ////////////////////////////
    
    process fastqc_genome {
      tag "$file_id"
      publishDir "${params.output}/00_fastqc/genome/", mode: 'copy'
    
      input:
      set file_id, file(reads) from HISAT_ALIGNED_FASTQC
    
      output:
      set file_id, "*.{html,zip}" into OUTPUT_FASTQC_GENOME
    
      when:
      params.do_fastqc
    
        """
        fastqc ${file_id}* -t ${task.cpus}
        """
    }
    
    ////////////////////////////
    /* Deduplication of reads */
    ////////////////////////////
    
    process dedup_genome {
      tag "$file_id"
      publishDir "${params.output}/03_hisat2/dedup/", mode: 'copy'
    
      input:
      set file_id, file(bam) from HISAT_ALIGNED_DEDUP
    
      output:
      set file_id, "*.{bam, bai}" into DEDUP_GENOME
      file "*.log" into DEDUP_LOG
    
      when:
      params.do_dedup
    
        """
        umi_tools dedup -I ${bam}[0] \
        -S ${file_id}.dedup.bam \
        2> dedup.log
        """
    }
    
    if (! params.do_dedup) {
      HISAT_ALIGNED_DEDUP.into{DEDUP_GENOME}
    }
    
    DEDUP_GENOME.into{DEDUP_GENOME_HTSEQ;
                      DEDUP_GENOME_RNASEQC
                    }
    
    ///////////
    /* HTseq */
    ///////////
    
    process sort_bam {
      tag "$file_id"
    
      input:
        set file_id, file(bam) from DEDUP_GENOME_HTSEQ
    
      output:
        set file_id, "*_htseq.bam" into SORTED_NAME_GENOME
    
      script:
    """
    samtools sort -@ ${task.cpus} -n -O BAM -o ${file_id}_htseq.bam ${bam[0]}
    """
    }
    
    process counting {
      tag "$file_id"
      publishDir "${params.output}/04_HTseq/", mode: 'copy'
    
      input:
      set file_id, file(bam) from SORTED_NAME_GENOME
      file gtf from GTF_FILE.toList()
    
      output:
      file "*.count" into HTSEQ_COUNT
    
      script:
    """
    htseq-count ${bam[0]} ${gtf} \
                --mode=intersection-nonempty \
                -a 10 \
                -s yes \
                -t CDS \
                -i gene_id \
                -f bam \
    > ${file_id}_CDS.count
    
    htseq-count ${bam[0]} ${gtf} \
                --mode=intersection-nonempty \
                -a 10 \
                -s yes \
                -t exon \
                -i gene_id \
                -f bam \
    > ${file_id}.count
    
    """
    }
    
    ///////////////
    /* RNASEQ QC */
    ///////////////
    
    process rnaseq_qc {
      tag "$file_id"
      publishDir "${params.output}/06_RNAseqQC/", mode: 'copy'
    
      input:
        set file_id, file(bam) from DEDUP_GENOME_RNASEQC
        file (gtf) from GTF_COLLAPSE.collect()
    
      output:
        file "*" into RNASEQC_OUTPUT
    
      script:
    """
    rnaseqc ${gtf} ${bam[0]} -s ${file_id} ./ \
    --stranded 'FR'
    """
    }
    
    ////////////////////////////////////////////////////////////////////////
    //////////////////////////// POST GENOME ///////////////////////////////
    ////////////////////////////////////////////////////////////////////////
    
    /////////////////////////
    /* HISAT2 POST_GENOMIC */
    /////////////////////////
    
    process hisat2_postGenomic {
      tag "$file_id"
      publishDir "${params.output}/05_post_genome_hisat2/", mode: 'copy'
    
      input:
        set file_id, file(fastq_unaligned) from FILTER_FASTQ_POSTGENOME
        file index2 from POSTGENOME_INDEX.collect()
    
      output:
        set file_id, "*.{bam,bam.bai}" into POSTGENOME_ALIGNED
        file "*_postgenome.txt" into POSTGENOME_LOG
    
      when:
      params.do_postgenome
    
      script:
      index2_id = index2[0]
      for (index2_file in index2) {
        if (index2_file =~ /.*\.1\.ht2/ && !(index2_file =~ /.*\.rev\.1\.ht2/)) {
            index2_id = ( index2_file =~ /(.*)\.1\.ht2/)[0][1]
        }
      }
    """
    hisat2 -x ${index2_id} \
           -p ${task.cpus} \
           -1 ${fastq_unaligned[0]} \
           -2 ${fastq_unaligned[1]} \
           --rna-strandness 'FR' \
           --dta\
           2> ${file_id}_postgenome.txt \
    | samtools view -bS -F 4 - \
    | samtools sort -@ ${task.cpus} -o ${file_id}.bam \
    && samtools index ${file_id}.bam
    
    if grep -q "ERR" ${file_id}_postgenome.txt; then
      exit 1
    fi
    """
    }
    
    POSTGENOME_ALIGNED.into{POSTGENOME_ALIGNED_FASTQC;
                       POSTGENOME_ALIGNED_DEDUP}
    
    ////////////////////////////
    /* Deduplication of reads */
    ////////////////////////////
    
    process dedup_postgenome {
      tag "$file_id"
      publishDir "${params.output}/05_post_genome_hisat2/dedup/", mode: 'copy'
    
      input:
      set file_id, file(bam) from POSTGENOME_ALIGNED_DEDUP
    
      output:
      set file_id, "*.{bam, bai}" into DEDUP_POSTGENOME
      file "*.log" into DEDUP_POSTGENOME_LOG
    
      when:
      params.do_dedup
    
      """
      umi_tools dedup -I ${bam}[0] \
      -S ${file_id}.dedup.bam \
      2> dedup.log
      """
    }
    
    process fastqc_postgenome {
      tag "$file_id"
      publishDir "${params.output}/00_fastqc/postgenome/", mode: 'copy'
    
      input:
      set file_id, file(reads) from POSTGENOME_ALIGNED_FASTQC
    
      output:
      set file_id, "*.{html,zip}" into OUTPUT_FASTQC_POSTGENOME
    
      when:
      params.do_fastqc
    
        """
        fastqc ${file_id}* -t ${task.cpus}
        """
    }
    
    ////////////////////////////////////////////////////////////////////////
    //////////////////////////// POST PROCESS //////////////////////////////
    ////////////////////////////////////////////////////////////////////////
    
    
    /////////////
    /* MultiQC */
    /////////////
    
    
    process multiqc {
      tag "multiqc"
      publishDir "${params.output}/multiqc", mode: 'copy'
    
      input:
      file ('fastqc/*') from OUTPUT_FASTQC_RAW.collect().ifEmpty([])
      file ('*') from CUTADAPT_LOG.collect().ifEmpty([])
      file ('fastqc/*') from OUTPUT_FASTQC_CUT.collect().ifEmpty([])
      file ('*') from FILTER_LOG.collect().ifEmpty([])
      file ('fastqc/*') from OUTPUT_FASTQC_FILTER.collect().ifEmpty([])
      file ('*') from HISAT_LOG.collect().ifEmpty([])
      file ('fastqc/*') from OUTPUT_FASTQC_GENOME.collect().ifEmpty([])
      file ('*') from HTSEQ_COUNT.collect().ifEmpty([])
      file ('*') from POSTGENOME_LOG.collect().ifEmpty([])
      file ('fastqc/*') from OUTPUT_FASTQC_POSTGENOME.collect().ifEmpty([])
      file ('*') from RNASEQC_OUTPUT.collect().ifEmpty([])
    
      output:
      file "multiqc_report.html" into multiqc_report
      file "multiqc_data"
    
      script:
    """
    multiqc ./
    """
    }