add multiQC report

ac7ab363 · nservant · 31af5d39 · ac7ab363 · ac7ab363
Commit ac7ab363 authored Apr 9, 2019 by nservant
--- a/bin/merge_statfiles.py
+++ b/bin/merge_statfiles.py
+#!/usr/bin/env python
+## HiC-Pro
+## Copyright (c) 2015 Institut Curie                               
+## Author(s): Nicolas Servant, Eric Viara
+## Contact: nicolas.servant@curie.fr
+## This software is distributed without any guarantee under the terms of the BSD-3 licence.
+## See the LICENCE file for details
+"""
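+Merge tab-separated stat files that share the same template, i.e. the same
+row labels in the first column. Lines starting with '#' are ignored.
+Integer values (counts) are summed and float values (percentages) are
+averaged. The merged table is written to stdout.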
+"""
+import argparse
+import sys
+import glob
+import os
+from collections import OrderedDict
+def num(s):
+    """Convert the string *s* to a number.
+    An int is returned when *s* parses as an integer, otherwise a float.
+    A ValueError is raised when *s* is not numeric at all.
+    """
+    try:
+        value = int(s)
+    except ValueError:
+        ## Not an integer literal - fall back to a float
+        value = float(s)
+    return value
+def merge_stat_files(infiles, verbose=False):
+    """Merge tab-separated stat files that share the same row labels.
+    The first file is the template: it defines which rows are kept and in
+    which order. Lines starting with '#' and blank lines are skipped.
+    Integer values (counts) are summed. As soon as a float (percentage) is
+    seen for a cell, that cell is reported as the mean over all the files
+    that provided it, rounded to 3 decimals.
+    infiles -- list of paths; the first one is used as template
+    verbose -- when True, write '##' progress lines to stdout
+    Returns an OrderedDict mapping each row label to its list of merged
+    values. It is empty when no data line was found in the template.
+    """
+    ## Per row label: one [total, nb_values, is_float] record per column
+    cells = OrderedDict()
+    if not infiles:
+        return cells
+    if verbose:
+        sys.stdout.write("## merge_statfiles.py\n")
+        sys.stdout.write("## Merging " + str(len(infiles)) + " files\n")
+        sys.stdout.write("## Use " + infiles[0] + " as template\n")
+    for fidx, fname in enumerate(infiles):
+        with open(fname) as f:
+            for line in f:
+                if line.startswith("#") or not line.strip():
+                    continue
+                lsp = line.strip().split("\t")
+                label = str(lsp[0])
+                values = [num(v) for v in lsp[1:]]
+                if fidx == 0:
+                    ## Reading first file to get the template
+                    cells[label] = [[v, 1, isinstance(v, float)] for v in values]
+                    continue
+                if label not in cells:
+                    sys.stderr.write("Warning : '"+label+"' not found in template ["+fname+"]\n")
+                    continue
+                row = cells[label]
+                if len(values) > len(row):
+                    ## Extra columns have no template cell to be merged into
+                    sys.stderr.write("Warning : '"+label+"' has more columns than the template ["+fname+"]\n")
+                for cell, v in zip(row, values):
+                    cell[0] += v
+                    cell[1] += 1
+                    cell[2] = cell[2] or isinstance(v, float)
+    merged = OrderedDict()
+    for label, row in cells.items():
+        out = []
+        for total, nb, is_float in row:
+            if is_float and nb > 1:
+                ## Float are percentage: report the mean over the files
+                out.append(round(float(total) / nb, 3))
+            else:
+                ## Int are counts: report the sum (a single value is kept as read)
+                out.append(total)
+        merged[label] = out
+    return merged
+if __name__ == "__main__":
+    ## Read command line arguments
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-f", "--files", help="List of input file(s)", type=str, nargs='+')
+    parser.add_argument("-v", "--verbose", help="verbose mode", action='store_true')
+    args = parser.parse_args()
+    ## args.files is None when -f is not given
+    infiles = args.files or []
+    ## stdout carries the merged table, so errors are reported on stderr
+    if not infiles:
+        sys.stderr.write("No files to merge - stop\n")
+        sys.exit(1)
+    merged = merge_stat_files(infiles, verbose=args.verbose)
+    if not merged:
+        sys.stderr.write("Cannot find template files !\n")
+        sys.exit(1)
+    ## Print template
+    for label, values in merged.items():
+        sys.stdout.write("\t".join([label] + [str(v) for v in values]) + "\n")
--- a/main.nf
+++ b/main.nf
@@ -471,11 +471,12 @@ process merge_mapping_steps{
   output:
      set val(sample), file("${prefix}_bwt2merged.bam") into bwt2_merged_bam
-      set val(prefix), file("${prefix}.mapstat") into all_mapstat
+      set val(oname), file("${prefix}.mapstat") into all_mapstat
   script:
      sample = prefix.toString() - ~/(_R1|_R2|_val_1|_val_2)/
      tag = prefix.toString() =~/_R1|_val_1/ ? "R1" : "R2"
+      oname = prefix.toString() - ~/(\.[0-9]+)$/
      """
      samtools merge -@ ${task.cpus} \\
@@ -511,13 +512,14 @@ process combine_mapped_files{
   output:
      set val(sample), file("${sample}_bwt2pairs.bam") into paired_bam
-      file "*.pairstat" into all_pairstat
+      set val(oname), file("*.pairstat") into all_pairstat
   script:
      r1_bam = aligned_bam[0]
      r1_prefix = r1_bam.toString() - ~/_bwt2merged.bam$/
      r2_bam = aligned_bam[1]
      r2_prefix = r2_bam.toString() - ~/_bwt2merged.bam$/
+      oname = sample.toString() - ~/(\.[0-9]+)$/
      def opts = "-t"
      opts = params.rm_singleton ? "${opts}" : "--single ${opts}"
@@ -546,7 +548,7 @@ process get_valid_interaction{
   output:
      set val(sample), file("*.validPairs") into valid_pairs
      set val(sample), file("*.validPairs") into valid_pairs_4cool
-      file "*RSstat" into all_rsstat
+      set val(sample), file("*RSstat") into all_rsstat
   script:
@@ -570,10 +572,10 @@ process get_valid_interaction{
 * STEP3 - BUILD MATRIX
 */
-if ( params.splitFastq ){
+process remove_duplicates {
-   process merge_sample {
   tag "$sample"
-      publishDir "${params.outdir}/hic_results/data", mode: 'copy'
+   publishDir "${params.outdir}/hic_results/data", mode: 'copy',
+   	      saveAs: {filename -> filename.indexOf("*stat") > 0 ? "stats/$sample/$filename" : "$filename"}	      
   input:
     set val(sample), file(vpairs) from valid_pairs.groupTuple()
@@ -581,16 +583,56 @@ if ( params.splitFastq ){
   output:
     set val(sample), file("*.allValidPairs") into all_valid_pairs
     set val(sample), file("*.allValidPairs") into all_valid_pairs_4cool
+     file("stats/") into all_mergestat
   script:
+   if ( params.rm_dup ){
   """
-      cat $vpairs > test.allValidPairs
+   mkdir -p stats/${sample}
+   sort -T /tmp/ -S 50% -k2,2V -k3,3n -k5,5V -k6,6n -m ${vpairs} | \
+   awk -F"\\t" 'BEGIN{c1=0;c2=0;s1=0;s2=0}(c1!=\$2 || c2!=\$5 || s1!=\$3 || s2!=\$6){print;c1=\$2;c2=\$5;s1=\$3;s2=\$6}' > ${sample}.allValidPairs                   
+   echo -n "valid_interaction\t" > stats/${sample}/${sample}_allValidPairs.mergestat
+   cat ${vpairs} | wc -l >> stats/${sample}/${sample}_allValidPairs.mergestat
+   echo -n "valid_interaction_rmdup\t" >> stats/${sample}/${sample}_allValidPairs.mergestat
+   cat ${sample}.allValidPairs | wc -l >> stats/${sample}/${sample}_allValidPairs.mergestat
+   awk 'BEGIN{cis=0;trans=0;sr=0;lr=0} \$2 == \$5{cis=cis+1; d=\$6>\$3?\$6-\$3:\$3-\$6; if (d<=20000){sr=sr+1}else{lr=lr+1}} \$2!=\$5{trans=trans+1}END{print "trans_interaction\\t"trans"\\ncis_interaction\\t"cis"\\ncis_shortRange\\t"sr"\\ncis_longRange\\t"lr}' ${sample}.allValidPairs >> stats/${sample}/${sample}_allValidPairs.mergestat
   """
-   }
   }else{
-   all_valid_pairs = valid_pairs
+   """
-   all_valid_pairs_4cool = valid_pairs	
+   mkdir -p stats/${sample}
+   cat ${vpairs} > ${sample}.allValidPairs
+   echo -n "valid_interaction\t" > stats/${sample}/${sample}_allValidPairs.mergestat
+   cat ${vpairs} | wc -l >> stats/${sample}/${sample}_allValidPairs.mergestat
+   echo -n "valid_interaction_rmdup\t" >> stats/${sample}/${sample}_allValidPairs.mergestat
+   cat ${sample}.allValidPairs | wc -l >> stats/${sample}/${sample}_allValidPairs.mergestat
+   awk 'BEGIN{cis=0;trans=0;sr=0;lr=0} \$2 == \$5{cis=cis+1; d=\$6>\$3?\$6-\$3:\$3-\$6; if (d<=20000){sr=sr+1}else{lr=lr+1}} \$2!=\$5{trans=trans+1}END{print "trans_interaction\\t"trans"\\ncis_interaction\\t"cis"\\ncis_shortRange\\t"sr"\\ncis_longRange\\t"lr}' ${sample}.allValidPairs >> stats/${sample}/${sample}_allValidPairs.mergestat
+   """
   }
+}
+// Merge the grouped stat files (mapstat, pairstat or RSstat) of one key into a
+// single file by calling merge_statfiles.py, and emit the mstats/ directory.
+process merge_sample {
+   tag "$ext"
+   publishDir "${params.outdir}/hic_results/stats/${sample}", mode: 'copy'
+   input:
+     // Each of the three stat channels is grouped by key, then all are concatenated into one stream
+     set val(prefix), file(fstat) from all_mapstat.groupTuple().concat(all_pairstat.groupTuple(), all_rsstat.groupTuple())
+  output:
+     file("mstats/") into all_mstats
+  script:
+     // Remove the first read suffix (_R1, _R2, _val_1 or _val_2) from the key to get the sample name
+     sample = prefix.toString() - ~/(_R1|_R2|_val_1|_val_2)/
+     // Choose the extension of the merged file from the kind of stat file received.
+     // NOTE(review): the '.' in these patterns is an unescaped regex wildcard, and
+     // ext is never assigned if none of the three patterns matches - confirm this cannot happen.
+     if ( (fstat =~ /.mapstat/) ){ ext = "mmapstat" }
+     if ( (fstat =~ /.pairstat/) ){ ext = "mpairstat" }
+     if ( (fstat =~ /.RSstat/) ){ ext = "mRSstat" }
+     """
+     mkdir -p mstats/${sample}
+     merge_statfiles.py -f ${fstat} > mstats/${sample}/${prefix}.${ext}
+     """
+}
 process build_contact_maps{
   tag "$sample - $mres"
@@ -659,14 +701,13 @@ process generate_cool{
 /*
 * STEP 5 - MultiQC
+ */ 
 process multiqc {
    publishDir "${params.outdir}/MultiQC", mode: 'copy'
    input:
    file multiqc_config from ch_multiqc_config
-    //file ('mapping/stats/*') from combine_mapping_results.collect()
+    file ('input_*/*') from all_mstats.concat(all_mergestat).collect()
-    //file ('hic_results/data/stats/*') from valid_interaction_results.collect()
    file ('software_versions/*') from software_versions_yaml
    file workflow_summary from create_workflow_summary(summary)
@@ -677,11 +718,12 @@ process multiqc {
    script:
    rtitle = custom_runName ? "--title \"$custom_runName\"" : ''
    rfilename = custom_runName ? "--filename " + custom_runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" : ''
    """
    multiqc -f $rtitle $rfilename --config $multiqc_config .
    """
 }
-*/
 /****************************************************
 * POST-PROCESSING