diff --git a/bin/hicstuff_cutsite.py b/bin/hicstuff_cutsite.py
new file mode 100755
index 0000000000000000000000000000000000000000..a53466aeaf03b8ec5f358709544f07b4752adeca
--- /dev/null
+++ b/bin/hicstuff_cutsite.py
@@ -0,0 +1,419 @@
+#!/usr/bin/env python3
+# coding: utf-8
+
+"""Cut the reads at their ligation sites.
+
+The module will cut at ligation sites of the given enzyme. It will, from the
+original fastq (gzipped or not) files, create new gzipped fastq files with
+combinations of the fragments of reads obtained by cutting at the ligation
+sites of the reads.
+
+There are three choices for how to combine the fragments. 1. "for_vs_rev": All
+the combinations are made between one forward fragment and one reverse
+fragment. It's the most relevant one for the usual workflow. 2. "all": All
+2-combinations are made. 3. "pile": Only combinations between adjacent
+fragments in the initial reads are made.
+
+This module contains the following functions:
+    - cut_ligation_sites
+    - cutsite_read
+    - write_pair
+    - _writer
+"""
+
+
+import gzip
+import multiprocessing
+import pyfastx
+import re
+import sys
+import argparse
+import hicstuff_digest as hcd
+from hicstuff_log import logger
+
+
+def cut_ligation_sites(
+    fq_for, fq_rev, digest_for, digest_rev, enzyme, mode, seed_size, n_cpu
+):
+    """Create new reads to manage pairs with a digestion and create multiple
+    pairs to take into account all the contacts present.
+
+    The function writes two files, for both the forward and reverse fastq,
+    with the new reads. The new reads have ":[0-9]+" appended to their ID to
+    differentiate the different pairs created from one read.
+
+    The function will look for all the sites present and create new pairs of
+    reads according to the mode given to retrieve as much as possible of the
+    HiC signal.
+
+    Parameters
+    ----------
+    fq_for : str
+        Path to the forward fastq file to digest.
+    fq_rev : str
+        Path to the reverse fastq file to digest.
+    digest_for : str
+        Path to the output digested forward fastq file to write.
+    digest_rev : str
+        Path to the output digested reverse fastq file to write.
+    enzyme : str
+        The list of restriction enzymes used to digest the genome separated by
+        a comma. Example: HpaII,MluCI.
+    mode : str
+        Mode to use to make the digestion. Three values possible: "all",
+        "for_vs_rev", "pile".
+    seed_size : int
+        Minimum size of a fragment (i.e. seed size used in mapping as reads
+        smaller won't be mapped.)
+    n_cpu : int
+        Number of CPUs.
+    """
+    # Process the ligation sites given
+    ligation_sites = hcd.gen_enzyme_religation_regex(enzyme)
+
+    # Defined stop_token which is used to mark the end of input file
+    stop_token = "STOP"
+    # A stack is a string containing multiple read pairs
+    max_stack_size = 1000
+
+    # Create count to have an idea of the digested pairs repartition.
+    original_number_of_pairs = 0
+    final_number_of_pairs = 0
+    new_reads_for = ""
+    new_reads_rev = ""
+    current_stack = 0
+
+    # Start a separate writer process consuming the queue, so that gzip
+    # compression and disk writes overlap with the digestion loop. Keep the
+    # queue bounded to limit memory use.
+    queue = multiprocessing.Queue(max(1, n_cpu - 1))
+    writer_process = multiprocessing.Process(
+        target=_writer, args=(digest_for, digest_rev, queue, stop_token)
+    )
+    writer_process.start()
+
+    # Iterate on all pairs
+    for read_for, read_rev in zip(
+        pyfastx.Fastq(fq_for, build_index=False),
+        pyfastx.Fastq(fq_rev, build_index=False),
+    ):
+
+        # Count the number of original reads processed.
+        original_number_of_pairs += 1
+
+        # Count for stack size.
+        current_stack += 1
+
+        # Extract components of the reads.
+        for_name, for_seq, for_qual = read_for
+        rev_name, rev_seq, rev_qual = read_rev
+
+        # Sanity check to be sure all reads are with their mate.
+        if for_name != rev_name:
+            logger.error(
+                "The fastq files contains reads not sorted :\n{0}\n{1}".format(
+                    for_name, rev_name
+                )
+            )
+            sys.exit(1)
+
+        # Cut the forward and reverse reads at the ligation sites.
+        for_seq_list, for_qual_list = cutsite_read(
+            ligation_sites,
+            for_seq,
+            for_qual,
+            seed_size,
+        )
+        rev_seq_list, rev_qual_list = cutsite_read(
+            ligation_sites,
+            rev_seq,
+            rev_qual,
+            seed_size,
+        )
+
+        # Write the new combinations of fragments.
+        new_reads_for, new_reads_rev, final_number_of_pairs = write_pair(
+            new_reads_for,
+            new_reads_rev,
+            for_name,
+            for_seq_list,
+            for_qual_list,
+            rev_seq_list,
+            rev_qual_list,
+            mode,
+            final_number_of_pairs,
+        )
+
+        # If stack full, add it in the queue.
+        if current_stack == max_stack_size:
+
+            # Add the pair in the queue.
+            pairs = (new_reads_for.encode(), new_reads_rev.encode())
+            queue.put(pairs)
+
+            # Empty the stack
+            current_stack = 0
+            new_reads_for = ""
+            new_reads_rev = ""
+
+    # Flush the last (possibly partial) stack, then end the writer process.
+    pairs = (new_reads_for.encode(), new_reads_rev.encode())
+    queue.put(pairs)
+    queue.put(stop_token)
+    writer_process.join()
+
+    # Return information on the different pairs created
+    logger.info(f"Library used: {fq_for} - {fq_rev}")
+    logger.info(f"Number of pairs before digestion: {original_number_of_pairs}")
+    logger.info(f"Number of pairs after digestion: {final_number_of_pairs}")
+
+
+def cutsite_read(ligation_sites, seq, qual, seed_size=0):
+    """Find ligation sites in a given sequence and return list of the
+    fragmented sequence and quality.
+
+    Parameters:
+    -----------
+    ligation_sites : re.Pattern
+        Regex of all possible ligations according to the given enzymes.
+    seq : str
+        Sequence where to search for ligation_sites.
+    qual : str
+        Quality values of the sequence given.
+    seed_size : int
+        Minimum size of a fragment (i.e. seed size used in mapping as reads
+        smaller won't be mapped.)
+
+    Returns:
+    --------
+    list of str
+        List of cut sequences. The split is made 4 bases after the start of
+        the ligation site.
+    list of str
+        List of string of the qualities.
+
+    Examples:
+    ---------
+    >>> cutsite_read(re.compile(r'GA.TA.TC'), "AAGAGTATTC", "FFF--FAFAF")
+    (['AAGAGT', 'ATTC'], ['FFF--F', 'AFAF'])
+    """
+
+    # Find the ligation sites.
+    ligation_sites_list = []
+    if ligation_sites.search(seq):
+        ligation_sites_list = [
+            site.start() for site in ligation_sites.finditer(seq)
+        ]
+    # The read end always closes the last fragment, even with no site found.
+    ligation_sites_list.append(len(seq))
+
+    # Split the sequences on the ligation sites. Fragments shorter than
+    # seed_size are skipped as they could not be mapped anyway.
+    seq_list = []
+    qual_list = []
+    left_site = 0
+    for right_site in ligation_sites_list:
+        if right_site + 4 - left_site > seed_size:
+            seq_list.append(seq[left_site : right_site + 4])
+            qual_list.append(qual[left_site : right_site + 4])
+            left_site = right_site + 4
+
+    return seq_list, qual_list
+
+
+def write_pair(
+    new_reads_for,
+    new_reads_rev,
+    name,
+    for_seq_list,
+    for_qual_list,
+    rev_seq_list,
+    rev_qual_list,
+    mode,
+    final_number_of_pairs,
+):
+    """Function to write one pair with the combinations of fragment depending
+    on the chosen mode.
+
+    Parameters:
+    -----------
+    new_reads_for : str
+        Stack of the new forward reads ready to be written.
+    new_reads_rev : str
+        Stack of the new reverse reads ready to be written.
+    name : str
+        Name of the fastq read.
+    for_seq_list : list
+        List of forward sequences of the fastq read.
+    for_qual_list : list
+        List of forward qualities of the fastq read.
+    rev_seq_list : list
+        List of reverse sequences of the fastq read.
+    rev_qual_list : list
+        List of reverse qualities of the fastq read.
+    mode : str
+        Mode to use to make the digestion. Three values possible: "all",
+        "for_vs_rev", "pile".
+    final_number_of_pairs : int
+        Count of pairs after cutting.
+
+    Returns:
+    --------
+    str
+        Stack of forward reads ready to be written with the last pairs added.
+    str
+        Stack of reverse reads ready to be written with the last pairs added.
+    int
+        Count of pairs after cutting.
+    """
+
+    # Mode "for_vs_rev": Make contacts only between fragments from different
+    # reads (one fragment from forward and one from reverse).
+    if mode == "for_vs_rev":
+        for i in range(len(for_seq_list)):
+            for j in range(len(rev_seq_list)):
+                final_number_of_pairs += 1
+                new_reads_for += "@%s\n%s\n+\n%s\n" % (
+                    name + ":" + str(i) + str(j),
+                    for_seq_list[i],
+                    for_qual_list[i],
+                )
+                new_reads_rev += "@%s\n%s\n+\n%s\n" % (
+                    name + ":" + str(i) + str(j),
+                    rev_seq_list[j],
+                    rev_qual_list[j],
+                )
+
+    # Mode "all": Make all the possible contacts between the fragments.
+    elif mode == "all":
+        seq_list = for_seq_list + rev_seq_list
+        qual_list = for_qual_list + rev_qual_list
+        for i in range(len(seq_list)):
+            for j in range(i + 1, len(seq_list)):
+                final_number_of_pairs += 1
+                if i < len(for_seq_list):
+                    new_reads_for += "@%s\n%s\n+\n%s\n" % (
+                        name + ":" + str(i) + str(j),
+                        seq_list[i],
+                        qual_list[i],
+                    )
+                # Reverse the forward read if comes from reverse.
+                else:
+                    new_reads_for += "@%s\n%s\n+\n%s\n" % (
+                        name + ":" + str(i) + str(j),
+                        seq_list[i][::-1],
+                        qual_list[i][::-1],
+                    )
+                # Reverse the reverse read if comes from forward.
+                if j < len(for_seq_list):
+                    new_reads_rev += "@%s\n%s\n+\n%s\n" % (
+                        name + ":" + str(i) + str(j),
+                        seq_list[j][::-1],
+                        qual_list[j][::-1],
+                    )
+                else:
+                    new_reads_rev += "@%s\n%s\n+\n%s\n" % (
+                        name + ":" + str(i) + str(j),
+                        seq_list[j],
+                        qual_list[j],
+                    )
+
+    # Mode "pile": Only make contacts between two adjacent fragments.
+    elif mode == "pile":
+        seq_list = for_seq_list + rev_seq_list
+        qual_list = for_qual_list + rev_qual_list
+        for i in range(len(seq_list) - 1):
+            final_number_of_pairs += 1
+            if i < len(for_seq_list) - 1:
+                new_reads_for += "@%s\n%s\n+\n%s\n" % (
+                    name + ":" + str(i) + str(i + 1),
+                    seq_list[i],
+                    qual_list[i],
+                )
+                new_reads_rev += "@%s\n%s\n+\n%s\n" % (
+                    name + ":" + str(i) + str(i + 1),
+                    seq_list[i + 1][::-1],
+                    qual_list[i + 1][::-1],
+                )
+            elif i == len(for_seq_list) - 1:
+                new_reads_for += "@%s\n%s\n+\n%s\n" % (
+                    name + ":" + str(i) + str(i + 1),
+                    seq_list[i],
+                    qual_list[i],
+                )
+                new_reads_rev += "@%s\n%s\n+\n%s\n" % (
+                    name + ":" + str(i) + str(i + 1),
+                    seq_list[i + 1],
+                    qual_list[i + 1],
+                )
+            else:
+                new_reads_for += "@%s\n%s\n+\n%s\n" % (
+                    name + ":" + str(i) + str(i + 1),
+                    seq_list[i][::-1],
+                    qual_list[i][::-1],
+                )
+                new_reads_rev += "@%s\n%s\n+\n%s\n" % (
+                    name + ":" + str(i) + str(i + 1),
+                    seq_list[i + 1],
+                    qual_list[i + 1],
+                )
+
+    return new_reads_for, new_reads_rev, final_number_of_pairs
+
+
+def _writer(output_for, output_rev, queue, stop_token):
+    """Function to write the pairs through parallel processing.
+
+    Parameters:
+    -----------
+    output_for : str
+        Path to the output forward compressed fastq file.
+    output_rev : str
+        Path to the output reverse compressed fastq file.
+    queue : multiprocessing.queues.Queue
+        Queue for the multiprocessing of the whole file.
+    stop_token : str
+        Token to signal that the end of the file have been reached.
+    """
+    with gzip.open(output_for, "wb") as for_fq, gzip.open(
+        output_rev, "wb"
+    ) as rev_fq:
+        while True:
+            line = queue.get()
+            if line == stop_token:
+                return 0
+            for_fq.write(line[0])
+            rev_fq.write(line[1])
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-r1", "--reads1")
+    parser.add_argument("-r2", "--reads2")
+    parser.add_argument("-e", "--enzyme")
+    parser.add_argument("-o1", "--output1")
+    parser.add_argument("-o2", "--output2")
+    parser.add_argument("-m", "--mode")
+    parser.add_argument("-s", "--seed")
+    parser.add_argument("-c", "--n_cpu")
+    args = parser.parse_args()
+
+    reads1 = args.reads1
+    reads2 = args.reads2
+    enzyme = args.enzyme
+    out1 = args.output1
+    out2 = args.output2
+    mode = args.mode
+    seed_size = int(args.seed)
+    n_cpu = int(args.n_cpu)
+
+    # hicstuff case sensitive enzymes adaptation
+    if enzyme == "hindiii":
+        enzyme = "HindIII"
+    elif enzyme == "dpnii":
+        enzyme = "DpnII"
+    elif enzyme == "bglii":
+        enzyme = "BglII"
+    elif enzyme == "mboi":
+        enzyme = "MboI"
+    elif enzyme == "arima":
+        # The Arima kit uses two enzymes. cut_ligation_sites expects a
+        # comma-separated string (see its docstring), not a Python list.
+        enzyme = "DpnII,HinfI"
+
+    cut_ligation_sites(reads1, reads2, out1, out2, enzyme, mode, seed_size, n_cpu)
diff --git a/conf/modules.config b/conf/modules.config
index b6bdcf184e25aa2656c8300a50ced3d7e9add4ab..afd5c3ebdb93f221c3ff6a5e7507c6f9d2eb5b12 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -434,4 +434,21 @@ process {
             "-n"
         ].join('').trim() }
     }
+
+    //********************************
+    withName: 'CUTSITE' {
+        ext.output_for = { "${meta1.id}_${meta1.chunk}_${meta1.mates}_digested.fastq" }
+        ext.output_rev = { "${meta2.id}_${meta2.chunk}_${meta2.mates}_digested.fastq" }
+        ext.args = { [
+            " -m ${params.cutsite_mode}",
+            " -s ${params.cutsite_seed}",
+            " -c ${params.cutsite_cpu}"
+        ].join('').trim()
+        }
+        publishDir = [
+            path: { "${params.outdir}/cutsite/digested"},
+            mode: 'copy',
+            enabled: params.save_digested
+        ]
+    }
 }
diff --git a/modules/local/hicstuff/cutsite.nf b/modules/local/hicstuff/cutsite.nf
new file mode 100644
index 0000000000000000000000000000000000000000..acba59ab566b83473a7b85bceee43f4d5d1ada2a
--- /dev/null
+++ b/modules/local/hicstuff/cutsite.nf
@@ -0,0 +1,29 @@
+process CUTSITE {
+    tag "$meta1.id"
+    label 'process_high'
+
+    conda "conda-forge::python=3.9 conda-forge::biopython=1.80 conda-forge::numpy=1.22.3 conda-forge::matplotlib=3.6.3 conda-forge::pandas=1.5.3"
+    container = "docker.io/lbmc/hicstuff:3.1.3"
+
+    input:
+    tuple val(meta1), path(reads1), val(meta2), path(reads2)
+    val(digestion)
+
+    output:
+    tuple val(meta1), path ("*.fastq"), emit: fastq
+    path "versions.yml", emit: versions
+
+    script:
+    def args = task.ext.args ?: ''
+    def output_for = task.ext.output_for ?: "${meta1.id}_${meta1.chunk}_${meta1.mates}.fastq"
+    def output_rev = task.ext.output_rev ?: "${meta2.id}_${meta2.chunk}_${meta2.mates}.fastq"
+
+    """
+    hicstuff_cutsite.py -r1 ${reads1} -r2 ${reads2} -e ${digestion} -o1 ${output_for} -o2 ${output_rev} ${args}
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        hicstuff: v3.1.3
+    END_VERSIONS
+    """
+}
+
diff --git a/nextflow.config b/nextflow.config
index 4c5f6337dd5d4c91411148914ec7d3302ecc5b0c..32c983c0971d1d59828ebc5682d7f72ea77db999 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -170,6 +170,12 @@ params {
     //Hicstuff save intermediates files
     save_fragments = false
 
+    //Cutsite
+    cutsite_mode = 'for_vs_rev'
+    cutsite_seed = 0
+    cutsite_cpu = 4
+    save_digested = false
+    cutsite = false
 }
 
 // Load base.config by default for all pipelines
diff --git a/subworkflows/local/hicstuff.nf b/subworkflows/local/hicstuff.nf
index 0b3072e1a5554be81a0540576722bc782d1ea31a..576522e087798103f57566f65e1e1bc4f149e2e4 100644
--- a/subworkflows/local/hicstuff.nf
+++ b/subworkflows/local/hicstuff.nf
@@ -49,6 +49,19 @@ workflow HICSTUFF {
     ch_reads_r2 = reads.map{ it -> pairToSingle(it,"R2") }
     ch_reads = ch_reads_r1.concat(ch_reads_r2)
 
+/*    if (params.cutsite){
+        ch_reads.combine(ch_reads)
+            .map {
+                meta1, reads1, meta2, reads2 ->
+                meta1.id == meta2.id && meta1.chunk == meta2.chunk && meta1.mates == "R1" && meta2.mates == "R2" ? [ meta1, bam1, meta2, bam2 ] : null
+            }.set{ new_ch_reads }
+        CUTSITE(
+            new_ch_reads,
+            digestion
+        )
+        ch_reads = CUTSITE.out.fastq
+    } */
+
     BOWTIE2_ALIGNMENT(
         ch_reads,
         index.collect()
diff --git a/workflows/hic.nf b/workflows/hic.nf
index 61111706855913db2ed8d3dc6056da7f935a84c6..9a4439385633886d4c55969179ad9f1e0350acd3 100644
--- a/workflows/hic.nf
+++ b/workflows/hic.nf
@@ -126,6 +126,7 @@ include { TADS } from '../subworkflows/local/tads'
 //
 include { FASTQC } from '../modules/nf-core/fastqc/main'
 include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main'
+include { CUTSITE } from '../modules/local/hicstuff/cutsite'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -147,6 +148,18 @@ Channel.fromPath( params.fasta )
 // Info required for completion email and summary
 def multiqc_report = []
 
+// Paired-end to Single-end
+def pairToSingle(row, mates) {
+    def meta = row[0].clone()
+    meta.single_end = true
+    meta.mates = mates
+    if (mates == "R1" || mates == "1") {
+        return [meta, [ row[1][0]] ]
+    }else if (mates == "R2" || mates == "2"){
+        return [meta, [ row[1][1]] ]
+    }
+}
+
 workflow HIC {
 
     ch_versions = Channel.empty()
@@ -157,6 +170,7 @@ workflow HIC {
     INPUT_CHECK (
         ch_input
     )
+    ch_reads = INPUT_CHECK.out.reads
 
     //
     // SUBWORKFLOW: Prepare genome annotation
@@ -175,12 +189,33 @@ workflow HIC {
     )
     ch_versions = ch_versions.mix(FASTQC.out.versions)
 
+
+    if (params.cutsite){
+        // Align each mates separately and add mates information in [meta]
+        ch_reads_r1 = ch_reads.map{ it -> pairToSingle(it,"R1") }
+        ch_reads_r2 = ch_reads.map{ it -> pairToSingle(it,"R2") }
+        ch_reads = ch_reads_r1.concat(ch_reads_r2)
+
+        ch_reads.combine(ch_reads)
+            .map {
+                meta1, reads1, meta2, reads2 ->
+                meta1.id == meta2.id && meta1.chunk == meta2.chunk && meta1.mates == "R1" && meta2.mates == "R2" ? [ meta1, reads1, meta2, reads2 ] : null
+            }.set{ new_ch_reads }
+        CUTSITE(
+            new_ch_reads,
+            params.digestion
+        )
+        ch_reads = CUTSITE.out.fastq
+        ch_reads.view()
+    }
+
+
     //
     // SUB-WORFLOW: HiC-Pro
     //
     if (params.workflow == "hicpro"){
         HICPRO (
-            INPUT_CHECK.out.reads,
+            ch_reads,
             PREPARE_GENOME.out.index,
             PREPARE_GENOME.out.res_frag,
             PREPARE_GENOME.out.chromosome_size,
@@ -221,7 +256,7 @@ workflow HIC {
 
     else {
         HICSTUFF(
-            INPUT_CHECK.out.reads,
+            ch_reads,
             PREPARE_GENOME.out.index,
             ch_ligation_site,
             params.digestion, //str enzyme