From ac7ab3639f8eaf603c14d1499fe9a58027ef6608 Mon Sep 17 00:00:00 2001
From: nservant <nservant@curie.fr>
Date: Tue, 9 Apr 2019 15:35:49 +0200
Subject: [PATCH] add multiQC report

---
 bin/merge_statfiles.py | 82 ++++++++++++++++++++++++++++++++++++
 main.nf                | 96 ++++++++++++++++++++++++++++++------------
 2 files changed, 151 insertions(+), 27 deletions(-)
 create mode 100755 bin/merge_statfiles.py

diff --git a/bin/merge_statfiles.py b/bin/merge_statfiles.py
new file mode 100755
index 0000000..ab3d078
--- /dev/null
+++ b/bin/merge_statfiles.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python
+
+## HiC-Pro
+## Copyright (c) 2015 Institut Curie                               
+## Author(s): Nicolas Servant, Eric Viara
+## Contact: nicolas.servant@curie.fr
+## This software is distributed without any guarantee under the terms of the BSD-3 licence.
+## See the LICENCE file for details
+
+"""
+Script to merge any files with the same template
+"""
+
+import argparse
+import sys
+import glob
+import os
+from collections import OrderedDict
+
+def num(s):
+    try:
+        return int(s)
+    except ValueError:
+        return float(s)
+
+
+if __name__ == "__main__":
+    ## Read command line arguments
+    parser = argparse.ArgumentParser()      
+    parser.add_argument("-f", "--files", help="List of input file(s)", type=str, nargs='+')
+    parser.add_argument("-v", "--verbose", help="verbose mode", action='store_true')
+    args = parser.parse_args()
+               
+    infiles = args.files
+    li = len(infiles)
+
+    if li > 0:
+        if args.verbose:
+            print "## merge_statfiles.py"
+            print "## Merging "+ str(li)+" files"
+ 
+        ## Reading first file to get the template
+        template = OrderedDict()
+        if args.verbose:
+            print "## Use "+infiles[0]+" as template"
+        with open(infiles[0]) as f:
+            for line in f:
+                if not line.startswith("#"):
+                    lsp = line.strip().split("\t")
+                    data = map(num, lsp[1:len(lsp)])
+                    template[str(lsp[0])] = data
+                
+        if len(template) == 0:
+            print "Cannot find template files !"
+            sys.exit(1)
+
+        ## Int are counts / Float are percentage
+        for fidx in xrange(1, li):
+            with open(infiles[fidx]) as f:
+                for line in f:
+                    if not line.startswith("#"):
+                        lsp = line.strip().split("\t")
+                        if lsp[0] in template:
+                            for i in xrange(1, len(lsp)):
+                                if isinstance(num(lsp[i]), int):
+                                    template[lsp[0]][i-1] += num(lsp[i])
+                                else:
+                                    template[lsp[0]][i-1] = round((template[lsp[0]][i-1] + num(lsp[i]))/2,3)
+                        else:
+                            sys.stderr.write("Warning : '"+lsp[0]+"' not found in template ["+infiles[fidx]+"]\n")
+                            
+        ## Print template
+        for x in template:
+            sys.stdout.write(x)
+            for y in template[x]:
+                sys.stdout.write("\t"+str(y))
+            sys.stdout.write("\n")
+
+    else:
+        print "No files to merge - stop"
+        sys.exit(1)
+
diff --git a/main.nf b/main.nf
index 8f7f61b..24783e2 100644
--- a/main.nf
+++ b/main.nf
@@ -471,11 +471,12 @@ process merge_mapping_steps{
 
    output:
       set val(sample), file("${prefix}_bwt2merged.bam") into bwt2_merged_bam
-      set val(prefix), file("${prefix}.mapstat") into all_mapstat
+      set val(oname), file("${prefix}.mapstat") into all_mapstat
 
    script:
       sample = prefix.toString() - ~/(_R1|_R2|_val_1|_val_2)/
       tag = prefix.toString() =~/_R1|_val_1/ ? "R1" : "R2"
+      oname = prefix.toString() - ~/(\.[0-9]+)$/
 
       """
       samtools merge -@ ${task.cpus} \\
@@ -505,20 +506,21 @@ process combine_mapped_files{
    tag "$sample = $r1_prefix + $r2_prefix"
    publishDir "${params.outdir}/mapping", mode: 'copy',
    	      saveAs: {filename -> filename.indexOf(".pairstat") > 0 ? "stats/$filename" : "$filename"}	      
-
+ 
    input:
       set val(sample), file(aligned_bam) from bwt2_merged_bam.groupTuple()
 
    output:
       set val(sample), file("${sample}_bwt2pairs.bam") into paired_bam
-      file "*.pairstat" into all_pairstat
+      set val(oname), file("*.pairstat") into all_pairstat
 
    script:
       r1_bam = aligned_bam[0]
       r1_prefix = r1_bam.toString() - ~/_bwt2merged.bam$/
       r2_bam = aligned_bam[1]
       r2_prefix = r2_bam.toString() - ~/_bwt2merged.bam$/
-      
+      oname = sample.toString() - ~/(\.[0-9]+)$/
+ 
       def opts = "-t"
       opts = params.rm_singleton ? "${opts}" : "--single ${opts}"
       opts = params.rm_multi ? "${opts}" : "--multi ${opts}"
@@ -546,7 +548,7 @@ process get_valid_interaction{
    output:
       set val(sample), file("*.validPairs") into valid_pairs
       set val(sample), file("*.validPairs") into valid_pairs_4cool
-      file "*RSstat" into all_rsstat
+      set val(sample), file("*RSstat") into all_rsstat
 
    script:
 	
@@ -570,28 +572,68 @@ process get_valid_interaction{
  * STEP3 - BUILD MATRIX
 */
 
-if ( params.splitFastq ){
-   process merge_sample {
-      tag "$sample"
-      publishDir "${params.outdir}/hic_results/data", mode: 'copy'
+process remove_duplicates {
+   tag "$sample"
+   publishDir "${params.outdir}/hic_results/data", mode: 'copy',
+   	      saveAs: {filename -> filename.indexOf("*stat") > 0 ? "stats/$sample/$filename" : "$filename"}	      
 
-      input:
-	set val(sample), file(vpairs) from valid_pairs.groupTuple()
-	
-      output:
-            set val(sample), file("*.allValidPairs") into all_valid_pairs
-	    set val(sample), file("*.allValidPairs") into all_valid_pairs_4cool
-      	    
-      script:
-      """
-      cat $vpairs > test.allValidPairs
-      """
+   input:
+     set val(sample), file(vpairs) from valid_pairs.groupTuple()
+
+   output:
+     set val(sample), file("*.allValidPairs") into all_valid_pairs
+     set val(sample), file("*.allValidPairs") into all_valid_pairs_4cool
+     file("stats/") into all_mergestat
+
+   script:
+   if ( params.rm_dup ){
+   """
+   mkdir -p stats/${sample}
+   sort -T /tmp/ -S 50% -k2,2V -k3,3n -k5,5V -k6,6n -m ${vpairs} | \
+   awk -F"\\t" 'BEGIN{c1=0;c2=0;s1=0;s2=0}(c1!=\$2 || c2!=\$5 || s1!=\$3 || s2!=\$6){print;c1=\$2;c2=\$5;s1=\$3;s2=\$6}' > ${sample}.allValidPairs                   
+   echo -n "valid_interaction\t" > stats/${sample}/${sample}_allValidPairs.mergestat
+   cat ${vpairs} | wc -l >> stats/${sample}/${sample}_allValidPairs.mergestat
+   echo -n "valid_interaction_rmdup\t" >> stats/${sample}/${sample}_allValidPairs.mergestat
+   cat ${sample}.allValidPairs | wc -l >> stats/${sample}/${sample}_allValidPairs.mergestat
+   awk 'BEGIN{cis=0;trans=0;sr=0;lr=0} \$2 == \$5{cis=cis+1; d=\$6>\$3?\$6-\$3:\$3-\$6; if (d<=20000){sr=sr+1}else{lr=lr+1}} \$2!=\$5{trans=trans+1}END{print "trans_interaction\\t"trans"\\ncis_interaction\\t"cis"\\ncis_shortRange\\t"sr"\\ncis_longRange\\t"lr}' ${sample}.allValidPairs >> stats/${sample}/${sample}_allValidPairs.mergestat
+
+   """
+   }else{
+   """
+   mkdir -p stats/${sample}
+   cat ${vpairs} > ${sample}.allValidPairs
+   echo -n "valid_interaction\t" > stats/${sample}/${sample}_allValidPairs.mergestat
+   cat ${vpairs} | wc -l >> stats/${sample}/${sample}_allValidPairs.mergestat
+   echo -n "valid_interaction_rmdup\t" >> stats/${sample}/${sample}_allValidPairs.mergestat
+   cat ${sample}.allValidPairs | wc -l >> stats/${sample}/${sample}_allValidPairs.mergestat
+   awk 'BEGIN{cis=0;trans=0;sr=0;lr=0} \$2 == \$5{cis=cis+1; d=\$6>\$3?\$6-\$3:\$3-\$6; if (d<=20000){sr=sr+1}else{lr=lr+1}} \$2!=\$5{trans=trans+1}END{print "trans_interaction\\t"trans"\\ncis_interaction\\t"cis"\\ncis_shortRange\\t"sr"\\ncis_longRange\\t"lr}' ${sample}.allValidPairs >> stats/${sample}/${sample}_allValidPairs.mergestat
+   """
    }
-}else{
-   all_valid_pairs = valid_pairs
-   all_valid_pairs_4cool = valid_pairs	
 }
 
+process merge_sample {
+   tag "$ext"
+   publishDir "${params.outdir}/hic_results/stats/${sample}", mode: 'copy'
+
+   input:
+     set val(prefix), file(fstat) from all_mapstat.groupTuple().concat(all_pairstat.groupTuple(), all_rsstat.groupTuple())
+
+  output:
+     file("mstats/") into all_mstats
+
+  script:
+     sample = prefix.toString() - ~/(_R1|_R2|_val_1|_val_2)/
+     if ( (fstat =~ /.mapstat/) ){ ext = "mmapstat" }
+     if ( (fstat =~ /.pairstat/) ){ ext = "mpairstat" }
+     if ( (fstat =~ /.RSstat/) ){ ext = "mRSstat" }
+
+     """
+     mkdir -p mstats/${sample}
+     merge_statfiles.py -f ${fstat} > mstats/${sample}/${prefix}.${ext}
+     """
+}
+
+
 process build_contact_maps{
    tag "$sample - $mres"
    publishDir "${params.outdir}/hic_results/matrix/raw", mode: 'copy'
@@ -659,14 +701,13 @@ process generate_cool{
 
 /*
  * STEP 5 - MultiQC
-
+ */ 
 process multiqc {
     publishDir "${params.outdir}/MultiQC", mode: 'copy'
 
     input:
     file multiqc_config from ch_multiqc_config
-    //file ('mapping/stats/*') from combine_mapping_results.collect()
-    //file ('hic_results/data/stats/*') from valid_interaction_results.collect()
+    file ('input_*/*') from all_mstats.concat(all_mergestat).collect()
     file ('software_versions/*') from software_versions_yaml
     file workflow_summary from create_workflow_summary(summary)
 
@@ -677,11 +718,12 @@ process multiqc {
     script:
     rtitle = custom_runName ? "--title \"$custom_runName\"" : ''
     rfilename = custom_runName ? "--filename " + custom_runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" : ''
+   
     """
     multiqc -f $rtitle $rfilename --config $multiqc_config .
     """
 }
-*/
+
 
 /****************************************************
  * POST-PROCESSING
-- 
GitLab