From 364489edd3129e2c145386770c43b984cee3a800 Mon Sep 17 00:00:00 2001 From: Mia Croiset <mia.croiset@ens-lyon.fr> Date: Fri, 12 May 2023 09:42:08 +0200 Subject: [PATCH] hicstuff pcr filter module --- bin/hicstuff_filter_pcr.py | 78 ++++++++++++++++++++++++++++ conf/hicstuff.config | 11 ++++ modules/local/hicstuff/filter_pcr.nf | 21 ++++++++ subworkflows/local/hicstuff_sub.nf | 10 +++- 4 files changed, 119 insertions(+), 1 deletion(-) create mode 100755 bin/hicstuff_filter_pcr.py create mode 100644 modules/local/hicstuff/filter_pcr.nf diff --git a/bin/hicstuff_filter_pcr.py b/bin/hicstuff_filter_pcr.py new file mode 100755 index 0000000..40c4568 --- /dev/null +++ b/bin/hicstuff_filter_pcr.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import hicstuff_io as hio +from hicstuff_log import logger +import argparse +import os, time, csv, sys, re + +def filter_pcr_dup(pairs_idx_file, filtered_file): + """ + Filter out PCR duplicates from a coordinate-sorted pairs file using + overrrepresented exact coordinates. If multiple fragments have two reads + with the exact same coordinates, only one of those fragments is kept. + Parameters + ---------- + pairs_idx_file : str + Path to an indexed pairs file containing the Hi-C reads. + filtered_file : str + Path to the output pairs file after removing duplicates. + """ + # Keep count of how many reads are filtered + filter_count = 0 + reads_count = 0 + # Store header lines + header = hio.get_pairs_header(pairs_idx_file) + with open(pairs_idx_file, "r") as pairs, open(filtered_file, "w") as filtered: + # Copy header lines to filtered file + for head_line in header: + filtered.write(head_line + "\n") + next(pairs) + + # Use csv methods to easily access columns + paircols = [ + "readID", + "chr1", + "pos1", + "chr2", + "pos2", + "strand1", + "strand2", + "frag1", + "frag2", + ] + # Columns used for comparison of coordinates + coord_cols = [col for col in paircols if col != "readID"] + pair_reader = csv.DictReader(pairs, delimiter="\t", fieldnames=paircols) + filt_writer = csv.DictWriter(filtered, delimiter="\t", fieldnames=paircols) + + # Initialise a variable to store coordinates of reads in previous pair + prev = {k: 0 for k in paircols} + for pair in pair_reader: + reads_count += 1 + # If coordinates are the same as before, skip pair + if all(pair[pair_var] == prev[pair_var] for pair_var in coord_cols): + filter_count += 1 + continue + # Else write pair and store new coordinates as previous + else: + filt_writer.writerow(pair) + prev = pair + logger.info( + "%d%% PCR duplicates have been filtered out (%d / %d pairs) " + % (100 * round(filter_count / reads_count, 3), filter_count, reads_count) + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--input") + parser.add_argument("-p", "--prefix") + parser.add_argument("-o" ,"--output_file") + args = parser.parse_args() + + input = args.input + prefix = args.prefix + output_file = prefix+"_"+args.output_file + + filter_pcr_dup(input, output_file) diff --git a/conf/hicstuff.config b/conf/hicstuff.config index d7cae49..0f27d39 100644 --- a/conf/hicstuff.config +++ b/conf/hicstuff.config @@ -141,6 +141,7 @@ params { hicstuff_rm_centro = 'None' //int or 'None' hicstuff_distance_plot = 'false' hicstuff_distance_out_plot = 'plot_distance_law.pdf' + hicstuff_filter_pcr_out_file = 'valid_idx_pcrfree.pairs' //Hicstuff optional modules filter_event = true @@ -223,6 +224,16 @@ process { ] } + withName: 'FILTER_PCR' { + ext.args = { [ + "-o ${params.hicstuff_filter_pcr_out_file}" + ].join('').trim() } + publishDir = [ + path: { "${params.outdir}/hicstuff/pairs" }, + mode: 'copy' + ] + } + withName: 'BUILD_MATRIX' { ext.args = params.hicstuff_matrix publishDir = [ diff --git a/modules/local/hicstuff/filter_pcr.nf b/modules/local/hicstuff/filter_pcr.nf new file mode 100644 index 0000000..57ab0f4 --- /dev/null +++ b/modules/local/hicstuff/filter_pcr.nf @@ -0,0 +1,21 @@ +process FILTER_PCR { + tag "$meta1.id" + label 'process_high' + + conda "conda-forge::python=3.9 conda-forge::biopython=1.80 conda-forge::numpy=1.22.3 conda-forge::matplotlib=3.6.3 conda-forge::pandas=1.5.3" + container = "lbmc/hicstuff:3.1.3" + + input: + tuple val(meta1), path(idx_pairs) + + output: + tuple val(meta1), path("*_valid_idx_pcrfree.pairs"), emit: idx_pairs_pcrfree + + script: + + def args = task.ext.args ?: '' + + """ + hicstuff_filter_pcr.py -i ${idx_pairs} -p ${meta1.id} ${args} + """ +} diff --git a/subworkflows/local/hicstuff_sub.nf b/subworkflows/local/hicstuff_sub.nf index 048c084..befd3e5 100644 --- a/subworkflows/local/hicstuff_sub.nf +++ b/subworkflows/local/hicstuff_sub.nf @@ -82,7 +82,15 @@ workflow HICSTUFF_SUB { ) } - + if (params.filter_pcr && params.filter_pcr_picard ){ + error "Error: filter_pcr and filter_pcr_picard can't both be true at the same time! Set one of them false in the config file" + } + else if ( params.filter_pcr ){ + FILTER_PCR( + ch_idx_pairs + ) + FILTER_PCR.out.idx_pairs_pcrfree.set{ ch_idx_pairs } + } //TODO rajouter filtres + distance law + filtres PCR en options // pour les PCR filter, soit le hicstuff soit PICARD -- GitLab