Verified Commit 42307c8b authored by Mia Croiset

add cutsite option

parent 7e270b75
#!/usr/bin/env python3
# coding: utf-8
"""Cut the reads at their ligation sites.
The module will cut at ligation sites of the given enzyme. It will from the
original fastq (gzipped or not) files create new gzipped fastq files with
combinations of the fragments of reads obtains by cutting at the ligation sites
of the reads.
There are three choices to how combine the fragments. 1. "for_vs_rev": All the
combinations are made between one forward fragment and one reverse fragment.
It's the most releveant one for usual workflow. 2. "all": All 2-combinations are
made. 3. "pile": Only combinations between adjacent fragments in the initial
reads are made.
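For illustration, a forward read cut into hypothetical fragments [F0, F1] and
its reverse mate cut into [R0, R1] would give the following pairs:
"for_vs_rev": (F0,R0) (F0,R1) (F1,R0) (F1,R1);
"all": (F0,F1) (F0,R0) (F0,R1) (F1,R0) (F1,R1) (R0,R1);
"pile": (F0,F1) (F1,R0) (R0,R1).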
This module contains the following functions:
- cut_ligation_sites
- cutsite_read
- write_pair
- _writer
"""
import gzip
import multiprocessing
import pyfastx
import re
import sys
import argparse
import hicstuff_digest as hcd
from hicstuff_log import logger
def cut_ligation_sites(
fq_for, fq_rev, digest_for, digest_rev, enzyme, mode, seed_size, n_cpu
):
"""Create new reads to manage pairs with a digestion and create multiple
pairs to take into account all the contact present.
The function write two files for both the forward and reverse fastq with the
new reads. The new reads have at the end of the ID ":[0-9]" added to
differentiate the different pairs created from one read.
The function will look for all the sites present and create new pairs of
reads according to the mode given to retreive as much as possible of the HiC
signal.
Parameters
----------
fq_for : str
Path to the forward fastq file to digest.
fq_rev : str
Path to the reverse fastq file to digest.
digest_for : str
Path to the output digested forward fastq file to write.
digest_rev : str
Path to the output digested reverse fastq file to write.
enzyme : str
Comma-separated list of the restriction enzymes used to digest the genome.
Example: HpaII,MluCI.
mode : str
Mode used for the digestion. Three possible values: "all", "for_vs_rev",
"pile".
seed_size : int
Minimum size of a fragment (i.e. the seed size used in mapping; smaller
reads won't be mapped).
n_cpu : int
Number of CPUs.
"""
# Process the ligation sites given
ligation_sites = hcd.gen_enzyme_religation_regex(enzyme)
# Define the stop_token used to mark the end of the input files.
stop_token = "STOP"
# A stack is a string containing multiple read pairs.
max_stack_size = 1000
# Counters to track how the pairs are distributed after digestion.
original_number_of_pairs = 0
final_number_of_pairs = 0
new_reads_for = ""
new_reads_rev = ""
current_stack = 0
# Start a parallel writer process fed through a bounded queue.
# ctx = multiprocessing.get_context("spawn")
queue = multiprocessing.Queue(max(1, n_cpu - 1))
writer_process = multiprocessing.Process(
target=_writer, args=(digest_for, digest_rev, queue, stop_token)
)
writer_process.start()
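# The loop below acts as the producer: it accumulates formatted read pairs
# into string stacks and pushes them onto the bounded queue, while the writer
# process consumes them and writes the gzipped fastq files.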
# Iterate on all pairs
for read_for, read_rev in zip(
pyfastx.Fastq(fq_for, build_index=False),
pyfastx.Fastq(fq_rev, build_index=False),
):
# Count the number of original pairs processed.
original_number_of_pairs += 1
# Count for stack size.
current_stack += 1
# Extract components of the reads.
for_name, for_seq, for_qual = read_for
rev_name, rev_seq, rev_qual = read_rev
# Sanity check to be sure all reads are with their mate.
if for_name != rev_name:
logger.error(
"The fastq files contain reads that are not sorted:\n{0}\n{1}".format(
for_name, rev_name
)
)
sys.exit(1)
# Cut the forward and reverse reads at the ligation sites.
for_seq_list, for_qual_list = cutsite_read(
ligation_sites,
for_seq,
for_qual,
seed_size,
)
rev_seq_list, rev_qual_list = cutsite_read(
ligation_sites,
rev_seq,
rev_qual,
seed_size,
)
# Write the new combinations of fragments.
new_reads_for, new_reads_rev, final_number_of_pairs = write_pair(
new_reads_for,
new_reads_rev,
for_name,
for_seq_list,
for_qual_list,
rev_seq_list,
rev_qual_list,
mode,
final_number_of_pairs,
)
# If the stack is full, add it to the queue.
if current_stack == max_stack_size:
# Add the pair in the queue.
pairs = (new_reads_for.encode(), new_reads_rev.encode())
queue.put(pairs)
# Empty the stack
current_stack = 0
new_reads_for = ""
new_reads_rev = ""
# End the parallel processing.
pairs = (new_reads_for.encode(), new_reads_rev.encode())
queue.put(pairs)
queue.put(stop_token)
writer_process.join()
# Log information on the different pairs created.
logger.info(f"Library used: {fq_for} - {fq_rev}")
logger.info(f"Number of pairs before digestion: {original_number_of_pairs}")
logger.info(f"Number of pairs after digestion: {final_number_of_pairs}")
def cutsite_read(ligation_sites, seq, qual, seed_size=0):
"""Find ligation sites in a given sequence and return list of the fragmented
sequence and quality.
Parameters:
-----------
ligation_sites : re.Pattern
Regex of all possible ligations according to the given enzymes.
seq : str
Sequence where to search for ligation_sites.
qual : str
Quality values of the sequence given.
seed_size : int
Minimum size of a fragment (i.e. the seed size used in mapping; smaller
reads won't be mapped).
Returns:
--------
list of str
List of cut sequences. The split is made 4 bases after the start of
the ligation site.
list of str
List of string of the qualities.
Examples:
---------
>>> cutsite_read(re.compile(r'GA.TA.TC'), "AAGAGTATTC", "FFF--FAFAF")
(['AAGAGT', 'ATTC'], ['FFF--F', 'AFAF'])
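A second example (expected output based on the splitting logic above): with
seed_size=7 the first 6 bp fragment is dropped, while the trailing fragment
is kept because its length is counted up to 4 bases past the end of the read.
>>> cutsite_read(re.compile(r'GA.TA.TC'), "AAGAGTATTC", "FFF--FAFAF", seed_size=7)
(['ATTC'], ['AFAF'])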
"""
# Find the ligation sites.
ligation_sites_list = []
if ligation_sites.search(seq):
ligation_sites_list = [
site.start() for site in ligation_sites.finditer(seq)
]
ligation_sites_list.append(len(seq))
# Split the sequences on the ligation sites.
seq_list = []
qual_list = []
left_site = 0
for right_site in ligation_sites_list:
if right_site + 4 - left_site > seed_size:
seq_list.append(seq[left_site : right_site + 4])
qual_list.append(qual[left_site : right_site + 4])
left_site = right_site + 4
return seq_list, qual_list
def write_pair(
new_reads_for,
new_reads_rev,
name,
for_seq_list,
for_qual_list,
rev_seq_list,
rev_qual_list,
mode,
final_number_of_pairs,
):
"""Function to write one pair with the combinations of fragment depending on
the chosen mode.
Parameters:
-----------
new_reads_for : str
Stack of the new forward reads ready to be written.
new_reads_rev : str
Stack of the new reverse reads ready to be written.
name : str
Name of the fastq read.
for_seq_list : list
List of forward sequences of the fastq read.
for_qual_list : list
List of forward qualities of the fastq read.
rev_seq_list : list
List of reverse sequences of the fastq read.
rev_qual_list : list
List of reverse qualities of the fastq read.
mode : str
Mode used for the digestion. Three possible values: "all", "for_vs_rev",
"pile".
final_number_of_pairs : int
Count of pairs after cutting.
Returns:
--------
str
Stack of forward reads ready to be written with the last pairs added.
str
Stack of reverse reads ready to be written with the last pairs added.
int
Count of pairs after cutting.
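Examples:
---------
For instance, a read pair cut into 2 forward and 2 reverse fragments gives
2 * 2 = 4 pairs in "for_vs_rev" mode, C(4, 2) = 6 pairs in "all" mode and
4 - 1 = 3 pairs in "pile" mode.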
"""
# Mode "for_vs_rev": Make contacts only between fragments from different
# reads (one fragment from forward and one from reverse).
if mode == "for_vs_rev":
for i in range(len(for_seq_list)):
for j in range(len(rev_seq_list)):
final_number_of_pairs += 1
new_reads_for += "@%s\n%s\n+\n%s\n" % (
name + ":" + str(i) + str(j),
for_seq_list[i],
for_qual_list[i],
)
new_reads_rev += "@%s\n%s\n+\n%s\n" % (
name + ":" + str(i) + str(j),
rev_seq_list[j],
rev_qual_list[j],
)
# Mode "all": Make all the possible contacts between the fragments.
elif mode == "all":
seq_list = for_seq_list + rev_seq_list
qual_list = for_qual_list + rev_qual_list
for i in range(len(seq_list)):
for j in range(i + 1, len(seq_list)):
final_number_of_pairs += 1
if i < len(for_seq_list):
new_reads_for += "@%s\n%s\n+\n%s\n" % (
name + ":" + str(i) + str(j),
seq_list[i],
qual_list[i],
)
# Reverse the forward read if it comes from the reverse read.
else:
new_reads_for += "@%s\n%s\n+\n%s\n" % (
name + ":" + str(i) + str(j),
seq_list[i][::-1],
qual_list[i][::-1],
)
# Reverse the reverse read if it comes from the forward read.
if j < len(for_seq_list):
new_reads_rev += "@%s\n%s\n+\n%s\n" % (
name + ":" + str(i) + str(j),
seq_list[j][::-1],
qual_list[j][::-1],
)
else:
new_reads_rev += "@%s\n%s\n+\n%s\n" % (
name + ":" + str(i) + str(j),
seq_list[j],
qual_list[j],
)
# Mode "pile": Only make contacts bewteen two adjacent fragments.
elif mode == "pile":
seq_list = for_seq_list + rev_seq_list
qual_list = for_qual_list + rev_qual_list
for i in range(len(seq_list) - 1):
final_number_of_pairs += 1
if i < len(for_seq_list) - 1:
new_reads_for += "@%s\n%s\n+\n%s\n" % (
name + ":" + str(i) + str(i + 1),
seq_list[i],
qual_list[i],
)
new_reads_rev += "@%s\n%s\n+\n%s\n" % (
name + ":" + str(i) + str(i + 1),
seq_list[i + 1][::-1],
qual_list[i + 1][::-1],
)
elif i == len(for_seq_list) - 1:
new_reads_for += "@%s\n%s\n+\n%s\n" % (
name + ":" + str(i) + str(i + 1),
seq_list[i],
qual_list[i],
)
new_reads_rev += "@%s\n%s\n+\n%s\n" % (
name + ":" + str(i) + str(i + 1),
seq_list[i + 1],
qual_list[i + 1],
)
else:
new_reads_for += "@%s\n%s\n+\n%s\n" % (
name + ":" + str(i) + str(i + 1),
seq_list[i][::-1],
qual_list[i][::-1],
)
new_reads_rev += "@%s\n%s\n+\n%s\n" % (
name + ":" + str(i) + str(i + 1),
seq_list[i + 1],
qual_list[i + 1],
)
return new_reads_for, new_reads_rev, final_number_of_pairs
def _writer(output_for, output_rev, queue, stop_token):
"""Function to write the pair throw parallel processing.
Parameters:
-----------
output_for : str
Path to the output forward compressed fastq file.
output_rev : str
Path to the output reverse compressed fastq file.
queue : multiprocessing.queues.Queue
Queue for the multiprocessing of the whole file.
stop_token : str
Token to signal that the end of the file has been reached.
"""
with gzip.open(output_for, "wb") as for_fq, gzip.open(
output_rev, "wb"
) as rev_fq:
while True:
line = queue.get()
if line == stop_token:
return 0
for_fq.write(line[0])
rev_fq.write(line[1])
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-r1", "--reads1")
parser.add_argument("-r2", "--reads2")
parser.add_argument("-e", "--enzyme")
parser.add_argument("-o1", "--output1")
parser.add_argument("-o2", "--output2")
parser.add_argument("-m", "--mode")
parser.add_argument("-s", "--seed")
parser.add_argument("-c", "--n_cpu")
args = parser.parse_args()
reads1 = args.reads1
reads2 = args.reads2
enzyme = args.enzyme
out1 = args.output1
out2 = args.output2
mode = args.mode
seed_size = int(args.seed)
n_cpu = int(args.n_cpu)
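# Example invocation (file names are illustrative, mirroring the CUTSITE
# Nextflow process below):
# hicstuff_cutsite.py -r1 R1.fastq.gz -r2 R2.fastq.gz -e DpnII \
#     -o1 R1_digested.fastq -o2 R2_digested.fastq -m for_vs_rev -s 0 -c 4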
# Adapt enzyme names to hicstuff's case-sensitive spelling.
if enzyme == "hindiii":
enzyme = "HindIII"
elif enzyme == "dpnii":
enzyme = "DpnII"
elif enzyme == "bglii":
enzyme = "BglII"
elif enzyme == "mboi":
enzyme = "MboI"
elif enzyme == "arima":
enzyme = ["DpnII","HinfI"]
cut_ligation_sites(reads1, reads2, out1, out2, enzyme, mode, seed_size, n_cpu)
@@ -434,4 +434,21 @@ process {
"-n"
].join('').trim() }
}
//********************************
withName: 'CUTSITE' {
ext.output_for = { "${meta1.id}_${meta1.chunk}_${meta1.mates}_digested.fastq" }
ext.output_rev = { "${meta2.id}_${meta2.chunk}_${meta2.mates}_digested.fastq" }
ext.args = { [
" -m ${params.cutsite_mode}",
" -s ${params.cutsite_seed}",
" -c ${params.cutsite_cpu}"
].join('').trim()
}
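// With the default params this resolves to "-m for_vs_rev -s 0 -c 4".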
publishDir = [
path: { "${params.outdir}/cutsite/digested"},
mode: 'copy',
enabled: params.save_digested
]
}
}
process CUTSITE {
tag "$meta1.id"
label 'process_high'
conda "conda-forge::python=3.9 conda-forge::biopython=1.80 conda-forge::numpy=1.22.3 conda-forge::matplotlib=3.6.3 conda-forge::pandas=1.5.3"
container = "docker.io/lbmc/hicstuff:3.1.3"
input:
tuple val(meta1), path(reads1), val(meta2), path(reads2)
val(digestion)
output:
tuple val(meta1), path ("*.fastq"), emit: fastq
path "versions.yml", emit: versions
script:
def args = task.ext.args ?: ''
def output_for = task.ext.output_for ?: "${meta1.id}_${meta1.chunk}_${meta1.mates}.fastq"
def output_rev = task.ext.output_rev ?: "${meta2.id}_${meta2.chunk}_${meta2.mates}.fastq"
"""
hicstuff_cutsite.py -r1 ${reads1} -r2 ${reads2} -e ${digestion} -o1 ${output_for} -o2 ${output_rev} ${args}
cat <<-END_VERSIONS > versions.yml
"${task.process}":
hicstuff: v3.1.3
END_VERSIONS
"""
}
@@ -170,6 +170,12 @@ params {
//Hicstuff save intermediates files
save_fragments = false
//Cutsite
cutsite_mode = 'for_vs_rev'
cutsite_seed = 0
cutsite_cpu = 4
save_digested = false
cutsite = false
}
// Load base.config by default for all pipelines
@@ -49,6 +49,19 @@ workflow HICSTUFF {
ch_reads_r2 = reads.map{ it -> pairToSingle(it,"R2") }
ch_reads = ch_reads_r1.concat(ch_reads_r2)
/* if (params.cutsite){
ch_reads.combine(ch_reads)
.map {
meta1, reads1, meta2, reads2 ->
meta1.id == meta2.id && meta1.chunk == meta2.chunk && meta1.mates == "R1" && meta2.mates == "R2" ? [ meta1, reads1, meta2, reads2 ] : null
}.set{ new_ch_reads }
CUTSITE(
new_ch_reads,
digestion
)
ch_reads = CUTSITE.out.fastq
} */
BOWTIE2_ALIGNMENT(
ch_reads,
index.collect()
@@ -126,6 +126,7 @@ include { TADS } from '../subworkflows/local/tads'
//
include { FASTQC } from '../modules/nf-core/fastqc/main'
include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main'
include { CUTSITE } from '../modules/local/hicstuff/cutsite'
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -147,6 +148,18 @@ Channel.fromPath( params.fasta )
// Info required for completion email and summary
def multiqc_report = []
// Paired-end to Single-end
def pairToSingle(row, mates) {
def meta = row[0].clone()
meta.single_end = true
meta.mates = mates
if (mates == "R1" | mates == "1") {
return [meta, [ row[1][0]] ]
}else if (mates == "R2" | mates == "2"){
return [meta, [ row[1][1]] ]
}
}
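// For example (illustrative names): [ meta, [ sample_R1.fastq, sample_R2.fastq ] ]
// with mates "R1" becomes [ meta + [single_end: true, mates: "R1"], [ sample_R1.fastq ] ].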
workflow HIC {
ch_versions = Channel.empty()
@@ -157,6 +170,7 @@ workflow HIC {
INPUT_CHECK (
ch_input
)
ch_reads = INPUT_CHECK.out.reads
//
// SUBWORKFLOW: Prepare genome annotation
@@ -175,12 +189,33 @@ workflow HIC {
)
ch_versions = ch_versions.mix(FASTQC.out.versions)
if (params.cutsite){
// Align each mate separately and add the mate information to [meta]
ch_reads_r1 = ch_reads.map{ it -> pairToSingle(it,"R1") }
ch_reads_r2 = ch_reads.map{ it -> pairToSingle(it,"R2") }
ch_reads = ch_reads_r1.concat(ch_reads_r2)
ch_reads.combine(ch_reads)
.map {
meta1, reads1, meta2, reads2 ->
meta1.id == meta2.id && meta1.chunk == meta2.chunk && meta1.mates == "R1" && meta2.mates == "R2" ? [ meta1, reads1, meta2, reads2 ] : null
}.set{ new_ch_reads }
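// new_ch_reads pairs the R1 and R2 entries of the same sample and chunk, e.g.
// (illustrative values): [ [id:'S1', chunk:0, mates:'R1'], [S1_R1.fastq], [id:'S1', chunk:0, mates:'R2'], [S1_R2.fastq] ]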
CUTSITE(
new_ch_reads,
params.digestion
)
ch_reads = CUTSITE.out.fastq
ch_reads.view()
}
//
// SUB-WORKFLOW: HiC-Pro
//
if (params.workflow == "hicpro"){
HICPRO (
INPUT_CHECK.out.reads,
ch_reads,
PREPARE_GENOME.out.index,
PREPARE_GENOME.out.res_frag,
PREPARE_GENOME.out.chromosome_size,
@@ -221,7 +256,7 @@ workflow HIC {
else {
HICSTUFF(
INPUT_CHECK.out.reads,
ch_reads,
PREPARE_GENOME.out.index,
ch_ligation_site,
params.digestion, //str enzyme