From 364489edd3129e2c145386770c43b984cee3a800 Mon Sep 17 00:00:00 2001
From: Mia Croiset <mia.croiset@ens-lyon.fr>
Date: Fri, 12 May 2023 09:42:08 +0200
Subject: [PATCH] hicstuff pcr filter module

---
 bin/hicstuff_filter_pcr.py           | 78 ++++++++++++++++++++++++++++
 conf/hicstuff.config                 | 11 ++++
 modules/local/hicstuff/filter_pcr.nf | 21 ++++++++
 subworkflows/local/hicstuff_sub.nf   | 10 +++-
 4 files changed, 119 insertions(+), 1 deletion(-)
 create mode 100755 bin/hicstuff_filter_pcr.py
 create mode 100644 modules/local/hicstuff/filter_pcr.nf

diff --git a/bin/hicstuff_filter_pcr.py b/bin/hicstuff_filter_pcr.py
new file mode 100755
index 0000000..40c4568
--- /dev/null
+++ b/bin/hicstuff_filter_pcr.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import hicstuff_io as hio
+from hicstuff_log import logger
+import argparse
+import os, time, csv, sys, re
+
+def filter_pcr_dup(pairs_idx_file, filtered_file):
+    """
+    Filter out PCR duplicates from a coordinate-sorted pairs file using
+    overrrepresented exact coordinates. If multiple fragments have two reads
+    with the exact same coordinates, only one of those fragments is kept.
+    Parameters
+    ----------
+    pairs_idx_file : str
+        Path to an indexed pairs file containing the Hi-C reads.
+    filtered_file : str
+        Path to the output pairs file after removing duplicates.
+    """
+    # Keep count of how many reads are filtered
+    filter_count = 0
+    reads_count = 0
+    # Store header lines
+    header = hio.get_pairs_header(pairs_idx_file)
+    with open(pairs_idx_file, "r") as pairs, open(filtered_file, "w") as filtered:
+        # Copy header lines to filtered file
+        for head_line in header:
+            filtered.write(head_line + "\n")
+            next(pairs)
+
+        # Use csv methods to easily access columns
+        paircols = [
+            "readID",
+            "chr1",
+            "pos1",
+            "chr2",
+            "pos2",
+            "strand1",
+            "strand2",
+            "frag1",
+            "frag2",
+        ]
+        # Columns used for comparison of coordinates
+        coord_cols = [col for col in paircols if col != "readID"]
+        pair_reader = csv.DictReader(pairs, delimiter="\t", fieldnames=paircols)
+        filt_writer = csv.DictWriter(filtered, delimiter="\t", fieldnames=paircols)
+
+        # Initialise a variable to store coordinates of reads in previous pair
+        prev = {k: 0 for k in paircols}
+        for pair in pair_reader:
+            reads_count += 1
+            # If coordinates are the same as before, skip pair
+            if all(pair[pair_var] == prev[pair_var] for pair_var in coord_cols):
+                filter_count += 1
+                continue
+            # Else write pair and store new coordinates as previous
+            else:
+                filt_writer.writerow(pair)
+                prev = pair
+        logger.info(
+            "%d%% PCR duplicates have been filtered out (%d / %d pairs) "
+            % (100 * round(filter_count / reads_count, 3), filter_count, reads_count)
+        )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-i", "--input")
+    parser.add_argument("-p", "--prefix")
+    parser.add_argument("-o" ,"--output_file")
+    args = parser.parse_args()
+
+    input = args.input
+    prefix = args.prefix
+    output_file = prefix+"_"+args.output_file
+
+    filter_pcr_dup(input, output_file)
diff --git a/conf/hicstuff.config b/conf/hicstuff.config
index d7cae49..0f27d39 100644
--- a/conf/hicstuff.config
+++ b/conf/hicstuff.config
@@ -141,6 +141,7 @@ params {
     hicstuff_rm_centro = 'None'         //int or 'None'
     hicstuff_distance_plot = 'false'
     hicstuff_distance_out_plot = 'plot_distance_law.pdf'
+    hicstuff_filter_pcr_out_file = 'valid_idx_pcrfree.pairs'
 
     //Hicstuff optional modules
     filter_event = true
@@ -223,6 +224,16 @@ process {
         ]
     }
 
+    withName: 'FILTER_PCR' {
+        ext.args = { [
+            "-o ${params.hicstuff_filter_pcr_out_file}"
+        ].join('').trim() }
+        publishDir = [
+            path: { "${params.outdir}/hicstuff/pairs" },
+            mode: 'copy'
+        ]
+    }
+
     withName: 'BUILD_MATRIX' {
         ext.args = params.hicstuff_matrix
         publishDir = [
diff --git a/modules/local/hicstuff/filter_pcr.nf b/modules/local/hicstuff/filter_pcr.nf
new file mode 100644
index 0000000..57ab0f4
--- /dev/null
+++ b/modules/local/hicstuff/filter_pcr.nf
@@ -0,0 +1,21 @@
+process FILTER_PCR {
+    tag "$meta1.id"
+    label 'process_high'
+
+    conda "conda-forge::python=3.9 conda-forge::biopython=1.80 conda-forge::numpy=1.22.3 conda-forge::matplotlib=3.6.3 conda-forge::pandas=1.5.3"
+    container = "lbmc/hicstuff:3.1.3"
+
+    input:
+    tuple val(meta1), path(idx_pairs)
+
+    output:
+    tuple val(meta1), path("*_valid_idx_pcrfree.pairs"), emit: idx_pairs_pcrfree
+
+    script:
+
+    def args = task.ext.args ?: ''
+
+    """
+    hicstuff_filter_pcr.py -i ${idx_pairs} -p ${meta1.id} ${args}
+    """
+}
diff --git a/subworkflows/local/hicstuff_sub.nf b/subworkflows/local/hicstuff_sub.nf
index 048c084..befd3e5 100644
--- a/subworkflows/local/hicstuff_sub.nf
+++ b/subworkflows/local/hicstuff_sub.nf
@@ -82,7 +82,15 @@ workflow HICSTUFF_SUB {
         )
     }
 
-
+    if (params.filter_pcr && params.filter_pcr_picard ){
+        error "Error: filter_pcr and filter_pcr_picard can't both be true at the same time! Set one of them false in the config file"
+    }
+    else if ( params.filter_pcr ){
+        FILTER_PCR(
+            ch_idx_pairs
+        )
+        FILTER_PCR.out.idx_pairs_pcrfree.set{ ch_idx_pairs }
+    }
 
     //TODO rajouter filtres + distance law + filtres PCR en options
     // pour les PCR filter, soit le hicstuff soit PICARD
-- 
GitLab