From f3ba46f4f324a068d58d39c5f53a48ab7af408f1 Mon Sep 17 00:00:00 2001
From: nservant <nicolas.servant@curie.fr>
Date: Mon, 18 Jul 2022 16:18:19 +0200
Subject: [PATCH] [MODIF] test on real dataset

---
 bin/hicpro_merge_validpairs.sh                | 35 +++++++++++++++++--
 conf/base.config                              |  2 +-
 conf/modules.config                           |  8 ++---
 modules/local/cooltools/eigs-cis.nf           |  2 +-
 modules/local/hicpro/build_contact_maps.nf    |  2 +-
 modules/local/hicpro/dnase_mapping_stats.nf   |  2 +-
 modules/local/hicpro/hicpro2pairs.nf          |  6 ++--
 .../local/hicpro/merge_valid_interaction.nf   |  2 +-
 modules/local/hicpro/run_ice.nf               |  2 +-
 subworkflows/local/cooler.nf                  |  3 +-
 workflows/hic.nf                              | 16 ++++++---
 11 files changed, 58 insertions(+), 22 deletions(-)

diff --git a/bin/hicpro_merge_validpairs.sh b/bin/hicpro_merge_validpairs.sh
index 09f9727..40b758c 100755
--- a/bin/hicpro_merge_validpairs.sh
+++ b/bin/hicpro_merge_validpairs.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+set -e
 
 ##
 ## HiC-Pro
@@ -17,11 +18,36 @@ done
 shift $(( OPTIND - 1 ))
 
 vpairs="$@"
+vpairs_sorted=$(echo $vpairs | sed -e 's/validPairs/sorted.validPairs/g')
+
+mkdir -p ./tmp/
 
 if [[ ${rmDup} == 1 ]]; then
-    ## Sort valid pairs and remove read pairs with same starts (i.e duplicated read pairs)
-    sort -S 50% -k2,2V -k3,3n -k5,5V -k6,6n -m ${vpairs} | \
-        awk -F"\t" 'BEGIN{c1=0;c2=0;s1=0;s2=0}(c1!=$2 || c2!=$5 || s1!=$3 || s2!=$6){print;c1=$2;c2=$5;s1=$3;s2=$6}' > ${prefix}.allValidPairs
+    ## Sort individual validPairs files
+    fcounts=0
+    for vfile in ${vpairs}
+    do
+	echo "Sorting ${vfile} ..."
+	fcounts=$((fcounts+1))
+	ofile=$(echo ${vfile} | sed -e 's/validPairs/sorted.validPairs/')
+	#sort -k2,2V -k3,3n -k5,5V -k6,6n -T ./tmp/ -o ${ofile} ${vfile}
+	sort -k2,2 -k5,5 -k3,3n -k6,6n -T ./tmp/ -o ${ofile} ${vfile}
+    done
+
+    if [[ $fcounts -gt 1 ]]
+    then
+	echo "Merging and removing the duplicates ..."
+	## Sort valid pairs and remove read pairs with same starts (i.e duplicated read pairs)
+	#sort -k2,2V -k3,3n -k5,5V -k6,6n -T ./tmp/ -m ${vpairs_sorted} | \
+	 sort -k2,2 -k5,5 -k3,3n -k6,6n -T ./tmp/ -m ${vpairs_sorted} | \
+            awk -F"\t" 'BEGIN{c1=0;c2=0;s1=0;s2=0}(c1!=$2 || c2!=$5 || s1!=$3 || s2!=$6){print;c1=$2;c2=$5;s1=$3;s2=$6}' > ${prefix}.allValidPairs
+    else
+	echo "Removing the duplicates ..."
+	cat ${vpairs_sorted} | awk -F"\t" 'BEGIN{c1=0;c2=0;s1=0;s2=0}(c1!=$2 || c2!=$5 || s1!=$3 || s2!=$6){print;c1=$2;c2=$5;s1=$3;s2=$6}' > ${prefix}.allValidPairs
+    fi
+
+    ## clean
+    /bin/rm -rf ${vpairs_sorted}
 else
     cat ${vpairs} > ${prefix}.allValidPairs
 fi
@@ -33,3 +59,6 @@ cat ${prefix}.allValidPairs | wc -l >> ${prefix}_allValidPairs.mergestat
 
 ## Count short range (<20000) vs long range contacts
 awk 'BEGIN{cis=0;trans=0;sr=0;lr=0} $2 == $5{cis=cis+1; d=$6>$3?$6-$3:$3-$6; if (d<=20000){sr=sr+1}else{lr=lr+1}} $2!=$5{trans=trans+1}END{print "trans_interaction\t"trans"\ncis_interaction\t"cis"\ncis_shortRange\t"sr"\ncis_longRange\t"lr}' ${prefix}.allValidPairs >> ${prefix}_allValidPairs.mergestat
+
+## clean
+/bin/rm -rf ./tmp/
diff --git a/conf/base.config b/conf/base.config
index 96e1742..e1895e3 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -43,7 +43,7 @@ process {
         time   = { check_max( 20.h  * task.attempt, 'time'    ) }
     }
     withLabel:process_high_memory {
-        memory = { check_max( 200.GB * task.attempt, 'memory' ) }
+        memory = { check_max( 24.GB * task.attempt, 'memory' ) }
     }
     withLabel:error_ignore {
         errorStrategy = 'ignore'
diff --git a/conf/modules.config b/conf/modules.config
index 06aaba2..5b279c5 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -49,7 +49,7 @@ process {
             mode: 'copy',
             enabled: params.save_aligned_intermediates
         ]
-        ext.prefix = { params.split_fastq ? "${meta.chunk}_${meta.mates}" : "${meta.id}_${meta.mates}" }
+        ext.prefix = { "${meta.id}_${meta.chunk}_${meta.mates}" }
         ext.args = params.bwt2_opts_end2end ?: ''
         ext.args2 = !params.dnase ? "-F 4" :""
     }
@@ -68,7 +68,7 @@ process {
             mode: 'copy',
             enabled: params.save_aligned_intermediates
         ]
-        ext.prefix = { params.split_fastq ? "${meta.chunk}_${meta.mates}_trimmed" : "${meta.id}_${meta.mates}_trimmed" }
+        ext.prefix = { "${meta.id}_${meta.chunk}_${meta.mates}_trimmed" }
         ext.args = params.bwt2_opts_trimmed ?: ''
         ext.args2 = ""
     }
@@ -79,7 +79,7 @@ process {
             mode: 'copy',
             enabled: params.save_aligned_intermediates
         ]
-        ext.prefix = { params.split_fastq ? "${meta.chunk}_${meta.mates}" : "${meta.id}_${meta.mates}" }
+        ext.prefix = { "${meta.id}_${meta.chunk}_${meta.mates}" }
     }
 
     withName: 'COMBINE_MATES' {
@@ -93,7 +93,7 @@ process {
             params.keep_multi ? "--multi" : "",
             params.min_mapq ? "-q ${params.min_mapq}" : ""
         ].join(' ').trim()
-        ext.prefix = { params.split_fastq ? "${meta.chunk}" : "${meta.id}" }
+        ext.prefix = { "${meta.id}_${meta.chunk}" }
     }
 
     withName: 'GET_VALID_INTERACTION' {
diff --git a/modules/local/cooltools/eigs-cis.nf b/modules/local/cooltools/eigs-cis.nf
index a63dee2..55d6d16 100644
--- a/modules/local/cooltools/eigs-cis.nf
+++ b/modules/local/cooltools/eigs-cis.nf
@@ -29,7 +29,7 @@ process CALL_COMPARTMENTS {
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
-        cooltools: \$(cooltools --version 2>&1 | sed 's/cooletools, version //')
+        cooltools: \$(cooltools --version 2>&1 | grep version | sed 's/cooltools, version //')
     END_VERSIONS
     """
 }
diff --git a/modules/local/hicpro/build_contact_maps.nf b/modules/local/hicpro/build_contact_maps.nf
index 1ba3550..097ff59 100644
--- a/modules/local/hicpro/build_contact_maps.nf
+++ b/modules/local/hicpro/build_contact_maps.nf
@@ -1,6 +1,6 @@
 process BUILD_CONTACT_MAPS{
   tag "$meta.id - $res"
-  label 'process_highmem'
+  label 'process_high_memory'
 
   conda (params.enable_conda ? "conda-forge::sed=4.7" : null)
   container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
diff --git a/modules/local/hicpro/dnase_mapping_stats.nf b/modules/local/hicpro/dnase_mapping_stats.nf
index edc40dc..9505298 100644
--- a/modules/local/hicpro/dnase_mapping_stats.nf
+++ b/modules/local/hicpro/dnase_mapping_stats.nf
@@ -16,7 +16,7 @@ process MAPPING_STATS_DNASE {
     tuple val(meta), path("${prefix}.mapstat"), emit:stats
 
     script:
-    prefix = meta.id + "_" + meta.mates
+    prefix = meta.id + "_" + meta.chunk + "_" + meta.mates
     tag = meta.mates
     """
     echo "## ${prefix}" > ${prefix}.mapstat
diff --git a/modules/local/hicpro/hicpro2pairs.nf b/modules/local/hicpro/hicpro2pairs.nf
index 4ee35b0..aa475c2 100644
--- a/modules/local/hicpro/hicpro2pairs.nf
+++ b/modules/local/hicpro/hicpro2pairs.nf
@@ -19,13 +19,13 @@ process HICPRO2PAIRS {
     prefix = "${meta.id}"
     """
     ##columns: readID chr1 pos1 chr2 pos2 strand1 strand2
-    awk '{OFS="\t";print \$1,\$2,\$3,\$5,\$6,\$4,\$7}' $vpairs > ${prefix}_contacts.pairs
-    sort -k2,2 -k4,4 -k3,3n -k5,5n ${prefix}_contacts.pairs | bgzip -c > ${prefix}_contacts.pairs.gz
+    awk '{OFS="\t";print \$1,\$2,\$3,\$5,\$6,\$4,\$7}' $vpairs | bgzip -c > ${prefix}_contacts.pairs.gz
+    ##sort -k2,2 -k4,4 -k3,3n -k5,5n ${prefix}_contacts.pairs | bgzip -c > ${prefix}_contacts.pairs.gz
     pairix -f ${prefix}_contacts.pairs.gz
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
-        pairix: \$(echo \$(pairix 2>&1 | grep Version | sed -e 's/Version: //')
+        pairix: \$(echo \$(pairix 2>&1 | grep Version | sed -e 's/Version: //'))
     END_VERSIONS
     """
 }
diff --git a/modules/local/hicpro/merge_valid_interaction.nf b/modules/local/hicpro/merge_valid_interaction.nf
index 6da4356..c5b716d 100644
--- a/modules/local/hicpro/merge_valid_interaction.nf
+++ b/modules/local/hicpro/merge_valid_interaction.nf
@@ -1,6 +1,6 @@
 process MERGE_VALID_INTERACTION {
     tag "$prefix"
-    label 'process_highmem'
+    label 'process_high_memory'
 
     conda (params.enable_conda ? "conda-forge::gawk=5.1.0" : null)
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
diff --git a/modules/local/hicpro/run_ice.nf b/modules/local/hicpro/run_ice.nf
index e77b5c2..996baf1 100644
--- a/modules/local/hicpro/run_ice.nf
+++ b/modules/local/hicpro/run_ice.nf
@@ -1,6 +1,6 @@
 process ICE_NORMALIZATION{
     tag "$rmaps"
-    label 'process_highmem'
+    label 'process_high_memory'
 
     conda (params.enable_conda ? "conda-forge::python=3.9  bioconda::iced=0.5.10 conda-forge::numpy=1.22.3" : null)
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
diff --git a/subworkflows/local/cooler.nf b/subworkflows/local/cooler.nf
index 8780b82..7ce15de 100644
--- a/subworkflows/local/cooler.nf
+++ b/subworkflows/local/cooler.nf
@@ -63,8 +63,9 @@ workflow COOLER {
   if (!params.res_zoomify){
     ch_res_zoomify = cool_bins.min()
   }else{
-    ch_res_zoomify = params.res_zoomify
+    ch_res_zoomify = Channel.from(params.res_zoomify).splitCsv().flatten().unique().toInteger()
   }
+
   ch_cool
     .combine(ch_res_zoomify)
     .filter{ it[2] == it[3] }
diff --git a/workflows/hic.nf b/workflows/hic.nf
index 60e54a0..a58b89c 100644
--- a/workflows/hic.nf
+++ b/workflows/hic.nf
@@ -36,13 +36,20 @@ if (params.digestion){
 }
 
 //****************************************
-// Maps resolution for downstream analysis
+// Combine all maps resolution for downstream analysis
+
+ch_map_res = Channel.from( params.bin_size ).splitCsv().flatten().toInteger()
+
+if (params.res_zoomify){
+  ch_zoom_res = Channel.from( params.res_zoomify ).splitCsv().flatten().toInteger()
+  ch_map_res = ch_map_res.concat(ch_zoom_res)
+}
 
-ch_map_res = Channel.from( params.bin_size ).splitCsv().flatten()
 if (params.res_tads && !params.skip_tads){
   Channel.from( "${params.res_tads}" )
     .splitCsv()
     .flatten()
+    .toInteger()
     .set {ch_tads_res}
   ch_map_res = ch_map_res.concat(ch_tads_res)
 }else{
@@ -56,6 +63,7 @@ if (params.res_dist_decay && !params.skip_dist_decay){
   Channel.from( "${params.res_dist_decay}" )
     .splitCsv()
     .flatten()
+    .toInteger()
     .set {ch_ddecay_res}
    ch_map_res = ch_map_res.concat(ch_ddecay_res)
 }else{
@@ -69,6 +77,7 @@ if (params.res_compartments && !params.skip_compartments){
   Channel.from( "${params.res_compartments}" )
     .splitCsv()
     .flatten()
+    .toInteger()
     .set {ch_comp_res}
    ch_map_res = ch_map_res.concat(ch_comp_res)
 }else{
@@ -154,8 +163,6 @@ workflow HIC {
     ch_input
   )
 
-  INPUT_CHECK.out.reads.view()
-
   //
   // SUBWORKFLOW: Prepare genome annotation
   //
@@ -202,7 +209,6 @@ workflow HIC {
   if (!params.skip_dist_decay){
     COOLER.out.cool
       .combine(ch_ddecay_res)
-      .view()
       .filter{ it[0].resolution == it[2] }
       .map { it -> [it[0], it[1]]}
       .set{ ch_distdecay }
-- 
GitLab