From 0b0764ede139c4c05aff3692a769c25a743781a9 Mon Sep 17 00:00:00 2001
From: nservant <nservant@curie.fr>
Date: Tue, 12 May 2020 10:32:30 +0200
Subject: [PATCH] [MODIF] update yml + python3

---
 CHANGELOG.md                 |   9 +-
 bin/digest_genome.py         | 150 +++++------
 bin/mapped_2hic_dnase.py     | 121 +++++-----
 bin/mapped_2hic_fragments.py | 188 ++++++++--------
 bin/mergeSAM.py              | 403 +++++++++++++++++------------------
 conf/base.config             |  15 +-
 environment.yml              |  23 +-
 main.nf                      | 160 +++++++-------
 8 files changed, 528 insertions(+), 541 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8de4bf1..7c1941a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,9 +7,16 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 ### `Added`
 
-* Bump v1.2.0
+* Bump v1.2.0dev
 * Merge template nf-core 1.9
+* Move some options to snake_case
+* Update conda environment file
+* Update python scripts for python3
 
+### `Deprecated`
+
+* --skipMaps, --skipIce, --skipCool, --skipMultiQC are deprecated and replaced by --skip_maps, --skip_ice, --skip_cool, --skip_multiqc
+* --saveReference, --saveAlignedIntermediates, --saveInteractionBAM are replaced by --save_reference, --save_aligned_intermediates, --save_interaction_bam
 
 ## v1.1.1

diff --git a/bin/digest_genome.py b/bin/digest_genome.py
index ac6d8da..2c29a49 100755
--- a/bin/digest_genome.py
+++ b/bin/digest_genome.py
@@ -26,48 +26,47 @@ RE_cutsite = {
 
 def find_re_sites(filename, sequences, offset):
-    infile = open(filename)
-    chr_id = None
-    big_str = ""
-    indices = []
-    all_indices = []
-    contig_names = []
-    c = 0
-    for line in infile:
-        c += 1
-        if line.startswith(">"):
-            print line.split()[0][1:], "..."
-            # If this is not the first chromosome, find the indices and append
-            # them to the list
-            if chr_id is not None:
-                for rs in range(len(sequences)):
-                    pattern = "(?=%s)" % sequences[rs].lower()
-                    indices += [m.start() + offset[rs]
-                                for m in re.finditer(pattern, big_str)]
-                indices.sort()
-                all_indices.append(indices)
-                indices = []
-
-            # This is a new chromosome. Empty the sequence string, and add the
-            # correct chrom id
-            big_str = ""
-            chr_id = line.split()[0][1:]
-            if chr_id in contig_names:
-                print "The fasta file contains several instance of",
-                print chr_id, ". Exit."
-                sys.exit(-1)
-            contig_names.append(chr_id)
-        else:
-            # As long as we don't change chromosomes, continue reading the
-            # file, and appending the sequences
-            big_str += line.lower().strip()
-    # Add the indices for the last chromosome
-    for rs in range(len(sequences)):
-        pattern = "(?=%s)" % sequences[rs].lower()
-        indices += [m.start() + offset[rs]
-                    for m in re.finditer(pattern, big_str)]
-    indices.sort()
-    all_indices.append(indices)
+    with open(filename, 'r') as infile:
+        chr_id = None
+        big_str = ""
+        indices = []
+        all_indices = []
+        contig_names = []
+        c = 0
+        for line in infile:
+            c += 1
+            if line.startswith(">"):
+                print("{}...".format(line.split()[0][1:]))
+                # If this is not the first chromosome, find the indices and append
+                # them to the list
+                if chr_id is not None:
+                    for rs in range(len(sequences)):
+                        pattern = "(?={})".format(sequences[rs].lower())
+                        indices += [m.start() + offset[rs]
+                                    for m in re.finditer(pattern, big_str)]
+                    indices.sort()
+                    all_indices.append(indices)
+                    indices = []
+
+                # This is a new chromosome. Empty the sequence string, and add the
+                # correct chrom id
+                big_str = ""
+                chr_id = line.split()[0][1:]
+                if chr_id in contig_names:
+                    print("The fasta file contains several instances of {}. Exit.".format(chr_id))
+                    sys.exit(-1)
+                contig_names.append(chr_id)
+            else:
+                # As long as we don't change chromosomes, continue reading the
+                # file, and appending the sequences
+                big_str += line.lower().strip()
+        # Add the indices for the last chromosome
+        for rs in range(len(sequences)):
+            pattern = "(?={})".format(sequences[rs].lower())
+            indices += [m.start() + offset[rs]
+                        for m in re.finditer(pattern, big_str)]
+        indices.sort()
+        all_indices.append(indices)
 
     return contig_names, all_indices

@@ -76,27 +75,27 @@ def find_chromsomose_lengths(reference_filename):
     chromosome_lengths = []
     chromosome_names = []
     length = None
-    infile = open(reference_filename)
-    for line in infile:
-        if line.startswith(">"):
-            chromosome_names.append(line[1:].strip())
-            if length is not None:
-                chromosome_lengths.append(length)
-            length = 0
-        else:
-            length += len(line.strip())
-    chromosome_lengths.append(length)
+    with open(reference_filename, 'r') as infile:
+        for line in infile:
+            if line.startswith(">"):
+                chromosome_names.append(line[1:].strip())
+                if length is not None:
+                    chromosome_lengths.append(length)
+                length = 0
+            else:
+                length += len(line.strip())
+        chromosome_lengths.append(length)
     return chromosome_names, np.array(chromosome_lengths)
 
 
 def replaceN(cs):
     npos = int(cs.find('N'))
     cseql = []
-    if npos!= -1:
+    if npos != -1:
         for nuc in ["A","C","G","T"]:
             tmp = cs.replace('N', nuc, 1)
             tmpl = replaceN(tmp)
-            if type(tmpl)==list:
+            if type(tmpl) == list:
                 cseql = cseql + tmpl
             else:
                 cseql.append(tmpl)

@@ -138,15 +137,15 @@ if __name__ == "__main__":
 
         offpos = int(cseq.find('^'))
         if offpos == -1:
-            print "Unable to detect offset for", cseq
-            print "Please, use '^' to specified the cutting position,",
-            print "i.e A^GATCT for HindIII digestion"
+            print("Unable to detect offset for {}. Please use '^' to specify the cutting position, "
+                  "i.e. A^GATCT for HindIII digestion.".format(cseq))
             sys.exit(-1)
 
         for nuc in list(set(cs)):
-            if nuc != 'A' and nuc != 'C' and nuc != 'G' and nuc != 'T' and nuc != 'N' and nuc != '^':
-                print "Find unexpected character ['",nuc,"']in restriction motif"
-                print "Note that multiple motifs should be separated by a space (not a comma !)"
+            if nuc not in ['A','T','G','C','N','^']:
+                print("Found unexpected character ['{}'] in restriction motif".format(nuc))
+                print("Note that multiple motifs should be separated by a space (not a comma !)")
+                sys.exit(-1)
 
         offset.append(offpos)

@@ -166,9 +165,9 @@ if __name__ == "__main__":
     if out is None:
         out = os.path.splitext(filename)[0] + "_fragments.bed"
 
-    print "Analyzing", filename
-    print "Restriction site(s)", ",".join(sequences)
-    print "Offset(s)", ','.join(str(x) for x in offset)
+    print("Analyzing", filename)
+    print("Restriction site(s)", ",".join(sequences))
+    print("Offset(s)", ','.join(str(x) for x in offset))
 
     # Read fasta file and look for rs per chromosome
     contig_names, all_indices = find_re_sites(filename, sequences, offset=offset)
@@ -183,17 +182,14 @@ if __name__ == "__main__":
         valid_fragments.append(valid_fragments_chr)
 
     # Write results
-    print "Writing to", out, "..."
-    outfile = open(out, "w")
-    for chrom_name, indices in zip(contig_names, valid_fragments):
-        frag_id = 0
-        for begin, end in indices:
-            # allow to remove cases where the enzyme cut at
-            # the first position of the chromosome
-            if end > begin:
-                frag_id += 1
-                frag_name = "HIC_%s_%d" % (chrom_name, frag_id)
-                outfile.write(
-                    "%s\t%d\t%d\t%s\t0\t+\n" % (chrom_name, begin,
-                                                end, frag_name))
-    outfile.close()
+    print("Writing to {} ...".format(out))
+    with open(out, 'w') as outfile:
+        for chrom_name, indices in zip(contig_names, valid_fragments):
+            frag_id = 0
+            for begin, end in indices:
+                # allow to remove cases where the enzyme cut at
+                # the first position of the chromosome
+                if end > begin:
+                    frag_id += 1
+                    frag_name = "HIC_{}_{}".format(str(chrom_name), int(frag_id))
+                    outfile.write("{}\t{}\t{}\t{}\t0\t+\n".format(str(chrom_name), int(begin), int(end), str(frag_name)))

diff --git a/bin/mapped_2hic_dnase.py b/bin/mapped_2hic_dnase.py
index 36c5a60..dd023b0 100755
--- a/bin/mapped_2hic_dnase.py
+++ b/bin/mapped_2hic_dnase.py
@@ -21,14 +21,14 @@ import pysam
 
 def usage():
     """Usage function"""
-    print "Usage : python mapped_2hic_dnase.py"
-    print "-r/--mappedReadsFile <BAM/SAM file of mapped reads>"
-    print "[-o/--outputDir] <Output directory. Default is current directory>"
-    print "[-d/--minCisDist] <Minimum distance between intrachromosomal contact to consider>"
-    print "[-g/--gtag] <Genotype tag. If specified, this tag will be reported in the valid pairs output for allele specific classification>"
-    print "[-a/--all] <Write all additional output files, with information about the discarded reads (self-circle, dangling end, etc.)>"
-    print "[-v/--verbose] <Verbose>"
-    print "[-h/--help] <Help>"
+    print("Usage : python mapped_2hic_dnase.py")
+    print("-r/--mappedReadsFile <BAM/SAM file of mapped reads>")
+    print("[-o/--outputDir] <Output directory. Default is current directory>")
+    print("[-d/--minCisDist] <Minimum distance between intrachromosomal contact to consider>")
+    print("[-g/--gtag] <Genotype tag. If specified, this tag will be reported in the valid pairs output for allele specific classification>")
+    print("[-a/--all] <Write all additional output files, with information about the discarded reads (self-circle, dangling end, etc.)>")
+    print("[-v/--verbose] <Verbose>")
+    print("[-h/--help] <Help>")
     return

@@ -78,11 +78,11 @@ def get_read_pos(read, st="start"):
         list of aligned reads
     """
     if st == "middle":
-        pos = read.pos + int(read.alen/2)
+        pos = read.reference_start + int(read.alen/2)
     elif st =="start":
         pos = get_read_start(read)
     elif st == "left":
-        pos = read.pos
+        pos = read.reference_start
 
     return pos

@@ -92,9 +92,9 @@ def get_read_start(read):
     Return the 5' end of the read
     """
     if read.is_reverse:
-        pos = read.pos + read.alen -1
+        pos = read.reference_start + read.alen -1
     else:
-        pos = read.pos
+        pos = read.reference_start
     return pos

@@ -108,20 +108,16 @@ def get_ordered_reads(read1, read2):
     read1 = [AlignedRead]
     read2 = [AlignedRead]
     """
-    if read1.tid == read2.tid:
+    if read1.reference_id == read2.reference_id:
         if get_read_pos(read1) < get_read_pos(read2):
-            r1 = read1
-            r2 = read2
+            r1, r2 = read1, read2
         else:
-            r1 = read2
-            r2 = read1
+            r1, r2 = read2, read1
     else:
-        if read1.tid < read2.tid:
-            r1 = read1
-            r2 = read2
+        if read1.reference_id < read2.reference_id:
+            r1, r2 = read1, read2
         else:
-            r1 = read2
-            r2 = read1
+            r1, r2 = read2, read1
 
     return r1, r2

@@ -134,7 +130,7 @@ def isIntraChrom(read1, read2):
     read2 : [AlignedRead]
 
     """
-    if read1.tid == read2.tid:
+    if read1.reference_id == read2.reference_id:
         return True
     else:
         return False

@@ -187,7 +183,7 @@ def get_cis_dist(read1, read2):
 
 
 def get_read_tag(read, tag):
-    for t in read.tags:
+    for t in read.get_tags():
         if t[0] == tag:
             return t[1]
     return None

@@ -229,11 +225,11 @@ if __name__ == "__main__":
 
     # Verbose mode
     if verbose:
-        print "## overlapMapped2HiCFragments.py"
-        print "## mappedReadsFile=", mappedReadsFile
-        print "## minCisDist=", minDist
-        print "## allOuput=", allOutput
-        print "## verbose=", verbose, "\n"
+        print("## overlapMapped2HiCFragments.py")
+        print("## mappedReadsFile=", mappedReadsFile)
+        print("## minCisDist=", minDist)
+        print("## allOutput=", allOutput)
+        print("## verbose={}\n".format(verbose))
 
     # Initialize variables
     reads_counter = 0

@@ -271,7 +267,7 @@ if __name__ == "__main__":
 
     # Read the SAM/BAM file
     if verbose:
-        print "## Opening SAM/BAM file '", mappedReadsFile, "'..."
+        print("## Opening SAM/BAM file {} ...".format(mappedReadsFile))
 
     samfile = pysam.Samfile(mappedReadsFile, "rb")
 
     # Reads are 0-based too (for both SAM and BAM format)
@@ -286,7 +282,7 @@ if __name__ == "__main__":
         if read.is_read1:
             r1 = read
             if not r1.is_unmapped:
-                r1_chrom = samfile.getrname(r1.tid)
+                r1_chrom = samfile.get_reference_name(r1.reference_id)
             else:
                 r1_chrom = None
 
@@ -294,11 +290,11 @@ if __name__ == "__main__":
         elif read.is_read2:
             r2 = read
             if not r2.is_unmapped:
-                r2_chrom = samfile.getrname(r2.tid)
+                r2_chrom = samfile.get_reference_name(r2.reference_id)
             else:
                 r2_chrom = None
 
-            if isIntraChrom(r1,r2):
+            if isIntraChrom(r1, r2):
                 dist = get_cis_dist(r1, r2)
             else:
                 dist = None
@@ -368,8 +364,8 @@ if __name__ == "__main__":
 
                 ##reorient reads to ease duplicates removal
                 or1, or2 = get_ordered_reads(r1, r2)
-                or1_chrom = samfile.getrname(or1.tid)
-                or2_chrom = samfile.getrname(or2.tid)
+                or1_chrom = samfile.get_reference_name(or1.reference_id)
+                or2_chrom = samfile.get_reference_name(or2.reference_id)
 
                 ##reset as tag now that the reads are oriented
                 r1as = get_read_tag(or1, gtag)
@@ -378,7 +374,7 @@ if __name__ == "__main__":
                     htag = str(r1as)+"-"+str(r2as)
 
                 cur_handler.write(
-                    or1.qname + "\t" +
+                    or1.query_name + "\t" +
                     or1_chrom + "\t" +
                     str(get_read_pos(or1)+1) + "\t" +
                     str(get_read_strand(or1)) + "\t" +
@@ -394,7 +390,7 @@ if __name__ == "__main__":
 
             elif r2.is_unmapped and not r1.is_unmapped:
                 cur_handler.write(
-                    r1.qname + "\t" +
+                    r1.query_name + "\t" +
                     r1_chrom + "\t" +
                     str(get_read_pos(r1)+1) + "\t" +
                     str(get_read_strand(r1)) + "\t" +
@@ -408,7 +404,7 @@ if __name__ == "__main__":
                     "*" + "\n")
             elif r1.is_unmapped and not r2.is_unmapped:
                 cur_handler.write(
-                    r2.qname + "\t" +
+                    r2.query_name + "\t" +
                     "*" + "\t" +
                     "*" + "\t" +
                     "*" + "\t" +
@@ -422,7 +418,7 @@ if __name__ == "__main__":
                     str(r2.mapping_quality) + "\n")
 
             if (reads_counter % 100000 == 0 and verbose):
-                print "##", reads_counter
+                print("##", reads_counter)
 
     # Close handler
     handle_valid.close()
@@ -432,33 +428,28 @@ if __name__ == "__main__":
         handle_filt.close()
 
     # Write stats file
-    handle_stat = open(outputDir + '/' + baseReadsFile + '.RSstat', 'w')
-    handle_stat.write("## Hi-C processing - no restriction fragments\n")
-    handle_stat.write("Valid_interaction_pairs\t" + str(valid_counter) + "\n")
-    handle_stat.write(
-        "Valid_interaction_pairs_FF\t" + str(valid_counter_FF) + "\n")
-    handle_stat.write(
-        "Valid_interaction_pairs_RR\t" + str(valid_counter_RR) + "\n")
-    handle_stat.write(
-        "Valid_interaction_pairs_RF\t" + str(valid_counter_RF) + "\n")
-    handle_stat.write(
-        "Valid_interaction_pairs_FR\t" + str(valid_counter_FR) + "\n")
-    handle_stat.write("Single-end_pairs\t" + str(single_counter) + "\n")
-    handle_stat.write("Filtered_pairs\t" + str(filt_counter) + "\n")
-    handle_stat.write("Dumped_pairs\t" + str(dump_counter) + "\n")
+    with open(outputDir + '/' + baseReadsFile + '.RSstat', 'w') as handle_stat:
+        handle_stat.write("## Hi-C processing - no restriction fragments\n")
+        handle_stat.write("Valid_interaction_pairs\t" + str(valid_counter) + "\n")
+        handle_stat.write("Valid_interaction_pairs_FF\t" + str(valid_counter_FF) + "\n")
+        handle_stat.write("Valid_interaction_pairs_RR\t" + str(valid_counter_RR) + "\n")
+        handle_stat.write("Valid_interaction_pairs_RF\t" + str(valid_counter_RF) + "\n")
+        handle_stat.write("Valid_interaction_pairs_FR\t" + str(valid_counter_FR) + "\n")
+        handle_stat.write("Single-end_pairs\t" + str(single_counter) + "\n")
+        handle_stat.write("Filtered_pairs\t" + str(filt_counter) + "\n")
+        handle_stat.write("Dumped_pairs\t" + str(dump_counter) + "\n")
 
         ## Write AS report
-    if gtag is not None:
-        handle_stat.write("## ======================================\n")
-        handle_stat.write("## Allele specific information\n")
-        handle_stat.write("Valid_pairs_from_ref_genome_(1-1)\t" + str(G1G1_ascounter) + "\n")
-        handle_stat.write("Valid_pairs_from_ref_genome_with_one_unassigned_mate_(0-1/1-0)\t" + str(UG1_ascounter+G1U_ascounter) + "\n")
-        handle_stat.write("Valid_pairs_from_alt_genome_(2-2)\t" + str(G2G2_ascounter) + "\n")
-        handle_stat.write("Valid_pairs_from_alt_genome_with_one_unassigned_mate_(0-2/2-0)\t" + str(UG2_ascounter+G2U_ascounter) + "\n")
-        handle_stat.write("Valid_pairs_from_alt_and_ref_genome_(1-2/2-1)\t" + str(G1G2_ascounter+G2G1_ascounter) + "\n")
-        handle_stat.write("Valid_pairs_with_both_unassigned_mated_(0-0)\t" + str(UU_ascounter) + "\n")
-        handle_stat.write("Valid_pairs_with_at_least_one_conflicting_mate_(3-)\t" + str(CF_ascounter) + "\n")
-
-    handle_stat.close()
+        if gtag is not None:
+            handle_stat.write("## ======================================\n")
+            handle_stat.write("## Allele specific information\n")
+            handle_stat.write("Valid_pairs_from_ref_genome_(1-1)\t" + str(G1G1_ascounter) + "\n")
+            handle_stat.write("Valid_pairs_from_ref_genome_with_one_unassigned_mate_(0-1/1-0)\t" + str(UG1_ascounter+G1U_ascounter) + "\n")
+            handle_stat.write("Valid_pairs_from_alt_genome_(2-2)\t" + str(G2G2_ascounter) + "\n")
+            handle_stat.write("Valid_pairs_from_alt_genome_with_one_unassigned_mate_(0-2/2-0)\t" + str(UG2_ascounter+G2U_ascounter) + "\n")
+            handle_stat.write("Valid_pairs_from_alt_and_ref_genome_(1-2/2-1)\t" + str(G1G2_ascounter+G2G1_ascounter) + "\n")
+            handle_stat.write("Valid_pairs_with_both_unassigned_mated_(0-0)\t" + str(UU_ascounter) + "\n")
+            handle_stat.write("Valid_pairs_with_at_least_one_conflicting_mate_(3-)\t" + str(CF_ascounter) + "\n")
+
diff --git a/bin/mapped_2hic_fragments.py b/bin/mapped_2hic_fragments.py
index d4790ee..e823ee0 100755
--- a/bin/mapped_2hic_fragments.py
+++ b/bin/mapped_2hic_fragments.py
@@ -12,7 +12,6 @@ Script to keep only valid 3C products - DE and SC are removed
 Output is : readname /
 """
-
 import time
 import getopt
 import sys
@@ -24,20 +23,20 @@ from bx.intervals.intersection import Intersecter, Interval
 
 def usage():
     """Usage function"""
-    print "Usage : python mapped_2hic_fragments.py"
-    print "-f/--fragmentFile <Restriction fragment file GFF3>"
-    print "-r/--mappedReadsFile <BAM/SAM file of mapped reads>"
-    print "[-o/--outputDir] <Output directory. Default is current directory>"
-    print "[-s/--shortestInsertSize] <Shortest insert size of mapped reads to consider>"
-    print "[-l/--longestInsertSize] <Longest insert size of mapped reads to consider>"
-    print "[-t/--shortestFragmentLength] <Shortest restriction fragment length to consider>"
-    print "[-m/--longestFragmentLength] <Longest restriction fragment length to consider>"
-    print "[-d/--minCisDist] <Minimum distance between intrachromosomal contact to consider>"
-    print "[-g/--gtag] <Genotype tag. If specified, this tag will be reported in the valid pairs output for allele specific classification>"
-    print "[-a/--all] <Write all additional output files, with information about the discarded reads (self-circle, dangling end, etc.)>"
-    print "[-S/--sam] <Output an additional SAM file with flag 'CT' for pairs classification>"
-    print "[-v/--verbose] <Verbose>"
-    print "[-h/--help] <Help>"
+    print("Usage : python mapped_2hic_fragments.py")
+    print("-f/--fragmentFile <Restriction fragment file GFF3>")
+    print("-r/--mappedReadsFile <BAM/SAM file of mapped reads>")
+    print("[-o/--outputDir] <Output directory. Default is current directory>")
+    print("[-s/--shortestInsertSize] <Shortest insert size of mapped reads to consider>")
+    print("[-l/--longestInsertSize] <Longest insert size of mapped reads to consider>")
+    print("[-t/--shortestFragmentLength] <Shortest restriction fragment length to consider>")
+    print("[-m/--longestFragmentLength] <Longest restriction fragment length to consider>")
+    print("[-d/--minCisDist] <Minimum distance between intrachromosomal contact to consider>")
+    print("[-g/--gtag] <Genotype tag. If specified, this tag will be reported in the valid pairs output for allele specific classification>")
+    print("[-a/--all] <Write all additional output files, with information about the discarded reads (self-circle, dangling end, etc.)>")
+    print("[-S/--sam] <Output an additional SAM file with flag 'CT' for pairs classification>")
+    print("[-v/--verbose] <Verbose>")
+    print("[-h/--help] <Help>")
     return

@@ -67,7 +66,7 @@ def timing(function, *args):
     """
     startTime = time.time()
     result = function(*args)
-    print '%s function took %0.3f ms' % (function.func_name, (time.time() - startTime) * 1000)
+    print('{} function took {:.3f}ms'.format(function.__name__, (time.time() - startTime) * 1000))
     return result

@@ -96,8 +95,7 @@ def isIntraChrom(read1, read2):
     """
     if read1.tid == read2.tid:
         return True
-    else:
-        return False
+    return False

@@ -114,8 +112,7 @@ def get_cis_dist(read1, read2):
     if not read1.is_unmapped and not read2.is_unmapped:
         ## Contact distances can be calculated for intrachromosomal reads only
         if isIntraChrom(read1, read2):
-            r1pos = get_read_pos(read1)
-            r2pos = get_read_pos(read2)
+            r1pos, r2pos = get_read_pos(read1), get_read_pos(read2)
             dist = abs(r1pos - r2pos)
     return dist

@@ -138,11 +135,11 @@ def get_read_pos(read, st="start"):
     """
 
     if st == "middle":
-        pos = read.pos + int(read.alen/2)
+        pos = read.reference_start + int(read.alen/2)
     elif st =="start":
         pos = get_read_start(read)
     elif st == "left":
-        pos = read.pos
+        pos = read.reference_start
 
     return pos

@@ -152,9 +149,9 @@ def get_read_start(read):
     Return the 5' end of the read
     """
     if read.is_reverse:
-        pos = read.pos + read.alen -1
+        pos = read.reference_start + read.alen -1
     else:
-        pos = read.pos
+        pos = read.reference_start
     return pos
 
 def get_ordered_reads(read1, read2):
@@ -178,18 +175,14 @@ def get_ordered_reads(read1, read2):
     """
     if read1.tid == read2.tid:
         if get_read_pos(read1) < get_read_pos(read2):
-            r1 = read1
-            r2 = read2
+            r1, r2 = read1, read2
         else:
-            r1 = read2
-            r2 = read1
+            r1, r2 = read2, read1
     else:
         if read1.tid < read2.tid:
-            r1 = read1
-            r2 = read2
+            r1, r2 = read1, read2
         else:
-            r1 = read2
-            r2 = read1
+            r1, r2 = read2, read1
 
     return r1, r2

@@ -206,46 +199,44 @@ def load_restriction_fragment(in_file, minfragsize=None, maxfragsize=None, verbo
     """
     resFrag = {}
     if verbose:
-        print "## Loading Restriction File Intervals '", in_file, "'..."
-
+        print("## Loading Restriction File Intervals {} ...".format(in_file))
     bed_handle = open(in_file)
     nline = 0
     nfilt = 0
     for line in bed_handle:
-       nline +=1
-       bedtab = line.split("\t")
-       try:
-           chromosome, start, end, name = bedtab[:4]
-       except ValueError:
-           print "Warning : wrong input format in line", nline,". Not a BED file !?"
-           continue
+        nline += 1
+        bedtab = line.split("\t")
+        try:
+            chromosome, start, end, name = bedtab[:4]
+        except ValueError:
+            print("Warning : wrong input format in line {}. Not a BED file ?!".format(nline))
+            continue
 
         # BED files are zero-based as Intervals objects
-       start = int(start)  # + 1
-       end = int(end)
-       fragl = abs(end - start)
-       name = name.strip()
-
-       ## Discard fragments outside the size range
-       filt=False
-       if minfragsize != None and int(fragl) < int(minfragsize):
-           nfilt+=1
-           filt=True
-       elif maxfragsize != None and int(fragl) > int(maxfragsize):
-           nfilt+=1
-           filt=True
+        start = int(start)  # + 1
+        end = int(end)
+        fragl = abs(end - start)
+        name = name.strip()
+
+        ## Discard fragments outside the size range
+        filt = False
+        if minfragsize != None and int(fragl) < int(minfragsize):
+            nfilt += 1
+            filt = True
+        elif maxfragsize != None and int(fragl) > int(maxfragsize):
+            nfilt += 1
+            filt = True
 
-       if chromosome in resFrag:
-           tree = resFrag[chromosome]
-           tree.add_interval(Interval(start, end, value={'name': name, 'filter': filt}))
-       else:
-           tree = Intersecter()
-           tree.add_interval(Interval(start, end, value={'name': name, 'filter': filt}))
-           resFrag[chromosome] = tree
+        if chromosome in resFrag:
+            tree = resFrag[chromosome]
+            tree.add_interval(Interval(start, end, value={'name': name, 'filter': filt}))
+        else:
+            tree = Intersecter()
+            tree.add_interval(Interval(start, end, value={'name': name, 'filter': filt}))
+            resFrag[chromosome] = tree
 
     if nfilt > 0:
-        print "Warning : ", nfilt ,"fragment(s) outside of range and discarded. ", nline - nfilt, " remaining."
-
+        print("Warning : {} fragment(s) outside of range and discarded. {} remaining.".format(nfilt, nline - nfilt))
     bed_handle.close()
     return resFrag

@@ -260,22 +251,22 @@ def get_overlapping_restriction_fragment(resFrag, chrom, read):
     read = the read to intersect [AlignedRead]
 
     """
-    # Get read position (middle or 5' end)
+    # Get read position (middle or start)
     pos = get_read_pos(read, st="middle")
     if chrom in resFrag:
         # Overlap with the position of the read (zero-based)
         resfrag = resFrag[chrom].find(pos, pos+1)
         if len(resfrag) > 1:
-            print "Warning : ", len(resfrag), " restriction fragments found for ", read.qname, "- skipped"
+            print("Warning : {} restriction fragments found for {} - skipped".format(len(resfrag), read.query_name))
             return None
         elif len(resfrag) == 0:
-            print "Warning - no restriction fragments for ", read.qname ," at ", chrom, ":", pos
+            print("Warning - no restriction fragments for {} at {} : {}".format(read.query_name, chrom, pos))
            return None
         else:
             return resfrag[0]
     else:
-        print "Warning - no restriction fragments for ", read.qname," at ", chrom, ":", pos
+        print("Warning - no restriction fragments for {} at {} : {}".format(read.query_name, chrom, pos))
         return None

@@ -301,11 +292,11 @@ def is_religation(read1, read2, frag1, frag2):
     Check the orientation of reads -><-
 
     """
-    ret=False
+    ret = False
     if are_contiguous_fragments(frag1, frag2, read1.tid, read2.tid):
         #r1, r2 = get_ordered_reads(read1, read2)
         #if get_read_strand(r1) == "+" and get_read_strand(r2) == "-":
-        ret=True
+        ret = True
     return ret

@@ -374,8 +365,8 @@ def get_PE_fragment_size(read1, read2, resFrag1, resFrag2, interactionType):
 
     read1 : [AlignedRead]
     read2 : [AlignedRead]
-    resfrag1 = restrictin fragment overlapping the R1 read [interval]
-    resfrag1 = restrictin fragment overlapping the R1 read [interval]
+    resfrag1 = restriction fragment overlapping the R1 read [interval]
+    resfrag2 = restriction fragment overlapping the R2 read [interval]
     interactionType : Type of interaction from get_interaction_type() [str]
 
     """

@@ -463,7 +454,7 @@ def get_interaction_type(read1, read1_chrom, resfrag1, read2,
 
 
 def get_read_tag(read, tag):
-    for t in read.tags:
+    for t in read.get_tags():
         if t[0] == tag:
             return t[1]
     return None

@@ -520,16 +511,16 @@ if __name__ == "__main__":
 
     # Verbose mode
     if verbose:
-        print "## overlapMapped2HiCFragments.py"
-        print "## mappedReadsFile=", mappedReadsFile
-        print "## fragmentFile=", fragmentFile
-        print "## minInsertSize=", minInsertSize
-        print "## maxInsertSize=", maxInsertSize
-        print "## minFragSize=", minFragSize
-        print "## maxFragSize=", maxFragSize
-        print "## allOuput=", allOutput
-        print "## SAM ouput=", samOut
-        print "## verbose=", verbose, "\n"
+        print("## overlapMapped2HiCFragments.py")
+        print("## mappedReadsFile=", mappedReadsFile)
+        print("## fragmentFile=", fragmentFile)
+        print("## minInsertSize=", minInsertSize)
+        print("## maxInsertSize=", maxInsertSize)
+        print("## minFragSize=", minFragSize)
+        print("## maxFragSize=", maxFragSize)
+        print("## allOutput=", allOutput)
+        print("## SAM output=", samOut)
+        print("## verbose={}\n".format(verbose))
 
     # Initialize variables
     reads_counter = 0

@@ -576,7 +567,7 @@ if __name__ == "__main__":
 
     # Read the SAM/BAM file
     if verbose:
-        print "## Opening SAM/BAM file '", mappedReadsFile, "'..."
+        print("## Opening SAM/BAM file {} ...".format(mappedReadsFile))
     samfile = pysam.Samfile(mappedReadsFile, "rb")
 
     if samOut:
@@ -585,7 +576,7 @@ if __name__ == "__main__":
     # Reads are 0-based too (for both SAM and BAM format)
     # Loop on all reads
     if verbose:
-        print "## Classifying Interactions ..."
+        print("## Classifying Interactions ...")
 
     for read in samfile.fetch(until_eof=True):
         reads_counter += 1
@@ -596,7 +587,7 @@ if __name__ == "__main__":
         if read.is_read1:
             r1 = read
             if not r1.is_unmapped:
-                r1_chrom = samfile.getrname(r1.tid)
+                r1_chrom = samfile.get_reference_name(r1.tid)
                 r1_resfrag = get_overlapping_restriction_fragment(resFrag, r1_chrom, r1)
             else:
                 r1_resfrag = None
@@ -606,7 +597,7 @@ if __name__ == "__main__":
         elif read.is_read2:
             r2 = read
             if not r2.is_unmapped:
-                r2_chrom = samfile.getrname(r2.tid)
+                r2_chrom = samfile.get_reference_name(r2.tid)
                 r2_resfrag = get_overlapping_restriction_fragment(resFrag, r2_chrom, r2)
             else:
                 r2_resfrag = None
@@ -706,8 +697,8 @@ if __name__ == "__main__":
             if not r1.is_unmapped and not r2.is_unmapped:
                 ##reorient reads to ease duplicates removal
                 or1, or2 = get_ordered_reads(r1, r2)
-                or1_chrom = samfile.getrname(or1.tid)
-                or2_chrom = samfile.getrname(or2.tid)
+                or1_chrom = samfile.get_reference_name(or1.tid)
+                or2_chrom = samfile.get_reference_name(or2.tid)
 
                 ##reset as tag now that the reads are oriented
                 r1as = get_read_tag(or1, gtag)
@@ -734,7 +725,7 @@ if __name__ == "__main__":
                     or2_fragname = 'None'
 
                 cur_handler.write(
-                    or1.qname + "\t" +
+                    or1.query_name + "\t" +
                     or1_chrom + "\t" +
                     str(get_read_pos(or1)+1) + "\t" +
                     str(get_read_strand(or1)) + "\t" +
@@ -753,7 +744,7 @@ if __name__ == "__main__":
                     r1_fragname = r1_resfrag.value['name']
 
                 cur_handler.write(
-                    r1.qname + "\t" +
+                    r1.query_name + "\t" +
                     r1_chrom + "\t" +
                     str(get_read_pos(r1)+1) + "\t" +
                     str(get_read_strand(r1)) + "\t" +
@@ -770,7 +761,7 @@ if __name__ == "__main__":
                     r2_fragname = r2_resfrag.value['name']
 
                 cur_handler.write(
-                    r2.qname + "\t" +
+                    r2.query_name + "\t" +
                     "*" + "\t" +
                     "*" + "\t" +
                     "*" + "\t" +
@@ -791,7 +782,7 @@ if __name__ == "__main__":
                     handle_sam.write(r2)
 
             if (reads_counter % 100000 == 0 and verbose):
-                print "##", reads_counter
+                print("##", reads_counter)
 
     # Close handler
     handle_valid.close()
@@ -808,14 +799,10 @@ if __name__ == "__main__":
     handle_stat = open(outputDir + '/' + baseReadsFile + '.RSstat', 'w')
     handle_stat.write("## Hi-C processing\n")
     handle_stat.write("Valid_interaction_pairs\t" + str(valid_counter) + "\n")
-    handle_stat.write(
-        "Valid_interaction_pairs_FF\t" + str(valid_counter_FF) + "\n")
-    handle_stat.write(
-        "Valid_interaction_pairs_RR\t" + str(valid_counter_RR) + "\n")
-    handle_stat.write(
-        "Valid_interaction_pairs_RF\t" + str(valid_counter_RF) + "\n")
-    handle_stat.write(
-        "Valid_interaction_pairs_FR\t" + str(valid_counter_FR) + "\n")
+    handle_stat.write("Valid_interaction_pairs_FF\t" + str(valid_counter_FF) + "\n")
+    handle_stat.write("Valid_interaction_pairs_RR\t" + str(valid_counter_RR) + "\n")
+    handle_stat.write("Valid_interaction_pairs_RF\t" + str(valid_counter_RF) + "\n")
+    handle_stat.write("Valid_interaction_pairs_FR\t" + str(valid_counter_FR) + "\n")
     handle_stat.write("Dangling_end_pairs\t" + str(de_counter) + "\n")
     handle_stat.write("Religation_pairs\t" + str(re_counter) + "\n")
     handle_stat.write("Self_Cycle_pairs\t" + str(sc_counter) + "\n")
@@ -839,4 +826,3 @@ if __name__ == "__main__":
 
     if samOut:
         samfile.close()
-
diff --git a/bin/mergeSAM.py b/bin/mergeSAM.py
index fdf0c67..12917b1 100755
--- a/bin/mergeSAM.py
+++ b/bin/mergeSAM.py
@@ -19,20 +19,19 @@ import sys
 import os
 import re
 import pysam
-from itertools import izip
 
 def usage():
     """Usage function"""
-    print "Usage : python mergeSAM.py"
-    print "-f/--forward <forward read mapped file>"
-    print "-r/--reverse <reverse read mapped file>"
-    print "[-o/--output] <Output file. Default is stdin>"
-    print "[-s/--single] <report singleton>"
-    print "[-m/--multi] <report multiple hits>"
-    print "[-q/--qual] <minimum reads mapping quality>"
-    print "[-t/--stat] <generate a stat file>"
-    print "[-v/--verbose] <Verbose>"
-    print "[-h/--help] <Help>"
+    print("Usage : python mergeSAM.py")
+    print("-f/--forward <forward read mapped file>")
+    print("-r/--reverse <reverse read mapped file>")
+    print("[-o/--output] <Output file. Default is stdout>")
+    print("[-s/--single] <report singleton>")
+    print("[-m/--multi] <report multiple hits>")
+    print("[-q/--qual] <minimum reads mapping quality>")
+    print("[-t/--stat] <generate a stat file>")
+    print("[-v/--verbose] <Verbose>")
+    print("[-h/--help] <Help>")
     return

@@ -53,37 +52,36 @@ def get_args():
 
 
 def is_unique_bowtie2(read):
-    ret = False
-    if not read.is_unmapped and read.has_tag('AS'):
-        if read.has_tag('XS'):
-            primary = read.get_tag('AS')
-            secondary = read.get_tag('XS')
-            if (primary > secondary):
-                ret = True
-        else:
-            ret = True
-
-    return ret
+    ret = False
+    if not read.is_unmapped and read.has_tag('AS'):
+        if read.has_tag('XS'):
+            primary = read.get_tag('AS')
+            secondary = read.get_tag('XS')
+            if (primary > secondary):
+                ret = True
+        else:
+            ret = True
+    return ret
 
 ## Remove everything after "/" or " " in read's name
 def get_read_name(read):
-    name = read.qname
+    name = read.query_name
     #return name.split("/",1)[0]
     return re.split('/| ', name)[0]
 
 
 def sam_flag(read1, read2, hr1, hr2):
+
+    f1 = read1.flag
+    f2 = read2.flag
 
-    f1 = read1.flag
-    f2 = read2.flag
-
-    if r1.is_unmapped == False:
-        r1_chrom = hr1.getrname(r1.tid)
-    else:
-        r1_chrom="*"
-    if r2.is_unmapped == False:
-        r2_chrom = hr2.getrname(r2.tid)
-    else:
-        r2_chrom="*"
+    if r1.is_unmapped == False:
+        r1_chrom = hr1.get_reference_name(r1.reference_id)
+    else:
+        r1_chrom = "*"
+    if r2.is_unmapped == False:
+        r2_chrom = hr2.get_reference_name(r2.reference_id)
+    else:
+        r2_chrom = "*"
 
     ##Relevant bitwise flags (flag in an 11-bit binary number)
 
     ##Output example: a paired-end read that aligns to the reverse strand
     ##and is the first mate in the pair will have flag 83 (= 64 + 16 + 2 + 1)
 
-    if f1 & 0x4:
-        f1 = f1 | 0x8
+    if f1 & 0x4:
+        f1 = f1 | 0x8
 
-    if f2 & 0x4:
-        f2 = f2 | 0x8
+    if f2 & 0x4:
+        f2 = f2 | 0x8
 
-    if (not (f1 & 0x4) and not (f2 & 0x4)):
+    if (not (f1 & 0x4) and not (f2 & 0x4)):
         ##The flag should now indicate this is paired-end data
-        f1 = f1 | 0x1
-        f1 = f1 | 0x2
-        f2 = f2 | 0x1
-        f2 = f2 | 0x2
+        f1 = f1 | 0x1
+        f1 = f1 | 0x2
+        f2 = f2 | 0x1
+        f2 = f2 | 0x2
 
     ##Indicate if the pair is on the reverse strand
-    if f1 & 0x10:
-        f2 = f2 | 0x20
+    if f1 & 0x10:
+        f2 = f2 | 0x20
 
-    if f2 & 0x10:
-        f1 = f1 | 0x20
+    if f2 & 0x10:
+        f1 = f1 | 0x20
 
     ##Is this first or the second pair?
-    f1 = f1 | 0x40
-    f2 = f2 | 0x80
+    f1 = f1 | 0x40
+    f2 = f2 | 0x80
 
     ##Insert the modified bitwise flags into the reads
-    read1.flag = f1
-    read2.flag = f2
-
-    ##Determine the RNEXT and PNEXT values (i.e. the positional values of a read's pair)
-    #RNEXT
-    if r1_chrom == r2_chrom:
-        read1.rnext = r1.tid
-        read2.rnext = r1.tid
-    else:
-        read1.rnext = r2.tid
-        read2.rnext = r1.tid
-
-    #PNEXT
-    read1.pnext = read2.pos
-    read2.pnext = read1.pos
-
-    return(read1, read2)
+    read1.flag = f1
+    read2.flag = f2
+
+    ##Determine the RNEXT and PNEXT values (i.e. the positional values of a read's pair)
+    #RNEXT
+    if r1_chrom == r2_chrom:
+        read1.next_reference_id = r1.reference_id
+        read2.next_reference_id = r1.reference_id
+    else:
+        read1.next_reference_id = r2.reference_id
+        read2.next_reference_id = r1.reference_id
+    #PNEXT
+    read1.next_reference_start = read2.reference_start
+    read2.next_reference_start = read1.reference_start
+
+    return(read1, read2)
 
 
 if __name__ == "__main__":
     ## Read command line arguments
-    opts = get_args()
-    inputFile = None
-    outputFile = None
-    mapq = None
-    report_single = False
-    report_multi = False
-    verbose = False
-    stat = False
-    output = "-"
-
-    if len(opts) == 0:
-        usage()
-        sys.exit()
-
-    for opt, arg in opts:
-        if opt in ("-h", "--help"):
-            usage()
-            sys.exit()
-        elif opt in ("-f", "--forward"):
-            R1file = arg
-        elif opt in ("-r", "--reverse"):
-            R2file = arg
-        elif opt in ("-o", "--output"):
-            output = arg
-        elif opt in ("-q", "--qual"):
-            mapq = arg
-        elif opt in ("-s", "--single"):
-            report_single = True
-        elif opt in ("-m", "--multi"):
-            report_multi = True
-        elif opt in ("-t", "--stat"):
-            stat = True
-        elif opt in ("-v", "--verbose"):
-            verbose = True
-        else:
-            assert False, "unhandled option"
+    opts = get_args()
+    inputFile = None
+    outputFile = None
+    mapq = None
+    report_single = False
+    report_multi = False
+    verbose = False
+    stat = False
+    output = "-"
+
+    if len(opts) == 0:
+        usage()
+        sys.exit()
+
+    for opt, arg in opts:
+        if opt in ("-h", "--help"):
+            usage()
+            sys.exit()
+        elif opt in ("-f", "--forward"):
+            R1file = arg
+        elif opt in ("-r", "--reverse"):
+            R2file = arg
+        elif opt in ("-o", "--output"):
+            output = arg
+        elif opt in ("-q", "--qual"):
+            mapq = arg
+        elif opt in ("-s", "--single"):
+            report_single = True
+        elif opt in ("-m", "--multi"):
+            report_multi = True
+        elif opt in ("-t", "--stat"):
+            stat = True
+        elif opt in ("-v", "--verbose"):
+            verbose = True
+        else:
+            assert False, "unhandled option"
 
     ## Verbose mode
-    if verbose:
-        print "## mergeBAM.py"
-        print "## forward=", R1file
-        print "## reverse=", R2file
-        print "## output=", output
-        print "## min mapq=", mapq
-        print "## report_single=", report_single
-        print "## report_multi=", report_multi
-        print "## verbose=", verbose
+    if verbose:
+        print("## mergeBAM.py")
+        print("## forward=", R1file)
+        print("## reverse=", R2file)
+        print("## output=", output)
+        print("## min mapq=", mapq)
+        print("## report_single=", report_single)
+        print("## report_multi=", report_multi)
+        print("## verbose=", verbose)
 
     ## Initialize variables
-    tot_pairs_counter = 0
-    multi_pairs_counter = 0
-    uniq_pairs_counter = 0
-    unmapped_pairs_counter = 0
-    lowq_pairs_counter = 0
-    multi_singles_counter = 0
-    uniq_singles_counter = 0
-    lowq_singles_counter = 0
+    tot_pairs_counter = 0
+    multi_pairs_counter = 0
+    uniq_pairs_counter = 0
+    unmapped_pairs_counter = 0
+    lowq_pairs_counter = 0
+    multi_singles_counter = 0
+    uniq_singles_counter = 0
+    lowq_singles_counter = 0
     #local_counter = 0
-    paired_reads_counter = 0
-    singleton_counter = 0
-    reads_counter = 0
-    r1 = None
-    r2 = None
+    paired_reads_counter = 0
+    singleton_counter = 0
+    reads_counter = 0
+    r1 = None
+    r2 = None
 
     ## Reads are 0-based too (for both SAM and BAM format)
     ## Loop on all reads
-    if verbose:
-        print "## Merging forward and reverse tags ..."
-
-    with pysam.Samfile(R1file, "rb") as hr1, pysam.Samfile(R2file, "rb") as hr2:
-        if output == "-":
-            outfile = pysam.AlignmentFile(output, "w", template=hr1)
-        else:
-            outfile = pysam.AlignmentFile(output, "wb", template=hr1)
-        for r1, r2 in izip(hr1.fetch(until_eof=True), hr2.fetch(until_eof=True)):
-            reads_counter +=1
+    if verbose:
+        print("## Merging forward and reverse tags ...")
+    with pysam.Samfile(R1file, "rb") as hr1, pysam.Samfile(R2file, "rb") as hr2:
+        if output == "-":
+            outfile = pysam.AlignmentFile(output, "w", template=hr1)
+        else:
+            outfile = pysam.AlignmentFile(output, "wb", template=hr1)
+        for r1, r2 in zip(hr1.fetch(until_eof=True), hr2.fetch(until_eof=True)):
+            reads_counter +=1
             #print r1
             #print r2
             #print hr1.getrname(r1.tid)
             #print hr2.getrname(r2.tid)
 
-            if (reads_counter % 1000000 == 0 and verbose):
-                print "##", reads_counter
+            if (reads_counter % 1000000 == 0 and verbose):
+                print("##", reads_counter)
 
-            if get_read_name(r1) == get_read_name(r2):
+            if get_read_name(r1) == get_read_name(r2):
 
                 ## both unmapped
-                if r1.is_unmapped == True and r2.is_unmapped == True:
-                    unmapped_pairs_counter += 1
-                    continue
+                if r1.is_unmapped == True and r2.is_unmapped == True:
+                    unmapped_pairs_counter += 1
+                    continue
 
                 ## both mapped
-                elif r1.is_unmapped == False and r2.is_unmapped == False:
+                elif r1.is_unmapped == False and r2.is_unmapped == False:
                     ## quality
-                    if mapq != None and (r1.mapping_quality < int(mapq) or r2.mapping_quality < int(mapq)):
-                        lowq_pairs_counter += 1
-                        continue
+                    if mapq != None and (r1.mapping_quality < int(mapq) or r2.mapping_quality < int(mapq)):
+                        lowq_pairs_counter += 1
+                        continue
 
                     ## Unique mapping
-                    if is_unique_bowtie2(r1) == True and is_unique_bowtie2(r2) == True:
-                        uniq_pairs_counter += 1
-                    else:
-                        multi_pairs_counter += 1
-                        if report_multi == False:
-                            continue
+                    if is_unique_bowtie2(r1) == True and is_unique_bowtie2(r2) == True:
+                        uniq_pairs_counter += 1
+                    else:
+                        multi_pairs_counter += 1
+                        if report_multi == False:
+                            continue
 
                 # one end mapped, other is not
-                else:
-                    singleton_counter += 1
-                    if report_single == False:
-                        continue
-                    if r1.is_unmapped == False:  ## first end is mapped, second is not
+                else:
+                    singleton_counter += 1
+                    if report_single == False:
+                        continue
+                    if r1.is_unmapped == False:  ## first end is mapped, second is not
                         ## quality
-                        if mapq != None and (r1.mapping_quality < int(mapq)):
-                            lowq_singles_counter += 1
-                            continue
+                        if mapq != None and (r1.mapping_quality < int(mapq)):
+                            lowq_singles_counter += 1
+                            continue
                         ## Unique mapping
-                        if is_unique_bowtie2(r1) == True:
-                            uniq_singles_counter += 1
-                        else:
-                            multi_singles_counter += 1
-                            if report_multi == False:
-                                continue
-                    else:  ## second end is mapped, first is not
+                        if is_unique_bowtie2(r1) == True:
+                            uniq_singles_counter += 1
+                        else:
+                            multi_singles_counter += 1
+                            if report_multi == False:
+                                continue
+                    else:  ## second end is mapped, first is not
                         ## quality
-                        if mapq != None and (r2.mapping_quality < int(mapq)):
-                            lowq_singles_counter += 1
-                            continue
+                        if mapq != None and (r2.mapping_quality < int(mapq)):
+                            lowq_singles_counter += 1
+                            continue
                         ## Unique mapping
-                        if is_unique_bowtie2(r2) == True:
-                            uniq_singles_counter += 1
-                        else:
-                            multi_singles_counter += 1
-                            if report_multi == False:
-                                continue
+                        if is_unique_bowtie2(r2) == True:
+                            uniq_singles_counter += 1
+                        else:
+                            multi_singles_counter += 1
+                            if report_multi == False:
+                                continue
 
-            tot_pairs_counter += 1
-            (r1, r2) = sam_flag(r1,r2, hr1, hr2)
+                tot_pairs_counter += 1
+                (r1, r2) = sam_flag(r1,r2, hr1, hr2)
 
                 #print hr1.getrname(r1.tid)
                 #print hr2.getrname(r2.tid)
                 #print r1
                 #print r2
 
                 ## Write output
-            outfile.write(r1)
-            outfile.write(r2)
-
-        else:
-            print "Forward and reverse reads not paired. Check that BAM files have the same read names and are sorted."
-            sys.exit(1)
-
-    if stat:
-        if output == '-':
-            statfile = "pairing.stat"
-        else:
-            statfile = re.sub('\.bam$', '.pairstat', output)
-        handle_stat = open(statfile, 'w')
-
-        handle_stat.write("Total_pairs_processed\t" + str(reads_counter) + "\t" + str(round(float(reads_counter)/float(reads_counter)*100,3)) + "\n")
-        handle_stat.write("Unmapped_pairs\t" + str(unmapped_pairs_counter) + "\t" + str(round(float(unmapped_pairs_counter)/float(reads_counter)*100,3)) + "\n")
-        handle_stat.write("Low_qual_pairs\t" + str(lowq_pairs_counter) + "\t" + str(round(float(lowq_pairs_counter)/float(reads_counter)*100,3)) + "\n")
-        handle_stat.write("Unique_paired_alignments\t" + str(uniq_pairs_counter) + "\t" + str(round(float(uniq_pairs_counter)/float(reads_counter)*100,3)) + "\n")
-        handle_stat.write("Multiple_pairs_alignments\t" + str(multi_pairs_counter) + "\t" + str(round(float(multi_pairs_counter)/float(reads_counter)*100,3)) + "\n")
-        handle_stat.write("Pairs_with_singleton\t" + str(singleton_counter) + "\t" + str(round(float(singleton_counter)/float(reads_counter)*100,3)) + "\n")
-        handle_stat.write("Low_qual_singleton\t" + str(lowq_singles_counter) + "\t" + str(round(float(lowq_singles_counter)/float(reads_counter)*100,3)) + "\n")
-        handle_stat.write("Unique_singleton_alignments\t" + str(uniq_singles_counter) + "\t" + str(round(float(uniq_singles_counter)/float(reads_counter)*100,3)) + "\n")
-        handle_stat.write("Multiple_singleton_alignments\t" + str(multi_singles_counter) + "\t" + str(round(float(multi_singles_counter)/float(reads_counter)*100,3)) + "\n")
-        handle_stat.write("Reported_pairs\t" + str(tot_pairs_counter) + "\t" + str(round(float(tot_pairs_counter)/float(reads_counter)*100,3)) + "\n")
-        handle_stat.close()
-
-    hr1.close()
-    hr2.close()
-    outfile.close()
+                outfile.write(r1)
+                outfile.write(r2)
+
+            else:
+                print("Forward and reverse reads not paired. Check that BAM files have the same read names and are sorted.")
+                sys.exit(1)
+
+    if stat:
+        if output == '-':
+            statfile = "pairing.stat"
+        else:
+            statfile = re.sub(r'\.bam$', '.pairstat', output)
+        with open(statfile, 'w') as handle_stat:
+            handle_stat.write("Total_pairs_processed\t" + str(reads_counter) + "\t" + str(round(float(reads_counter)/float(reads_counter)*100,3)) + "\n")
+            handle_stat.write("Unmapped_pairs\t" + str(unmapped_pairs_counter) + "\t" + str(round(float(unmapped_pairs_counter)/float(reads_counter)*100,3)) + "\n")
+            handle_stat.write("Low_qual_pairs\t" + str(lowq_pairs_counter) + "\t" + str(round(float(lowq_pairs_counter)/float(reads_counter)*100,3)) + "\n")
+            handle_stat.write("Unique_paired_alignments\t" + str(uniq_pairs_counter) + "\t" + str(round(float(uniq_pairs_counter)/float(reads_counter)*100,3)) + "\n")
+            handle_stat.write("Multiple_pairs_alignments\t" + str(multi_pairs_counter) + "\t" + str(round(float(multi_pairs_counter)/float(reads_counter)*100,3)) + "\n")
+            handle_stat.write("Pairs_with_singleton\t" + str(singleton_counter) + "\t" + str(round(float(singleton_counter)/float(reads_counter)*100,3)) + "\n")
+            handle_stat.write("Low_qual_singleton\t" + str(lowq_singles_counter) + "\t" + str(round(float(lowq_singles_counter)/float(reads_counter)*100,3)) + "\n")
+            handle_stat.write("Unique_singleton_alignments\t" + str(uniq_singles_counter) + "\t" + str(round(float(uniq_singles_counter)/float(reads_counter)*100,3)) + "\n")
+            handle_stat.write("Multiple_singleton_alignments\t" + str(multi_singles_counter) + "\t" + str(round(float(multi_singles_counter)/float(reads_counter)*100,3)) + "\n")
+            handle_stat.write("Reported_pairs\t" + str(tot_pairs_counter) + "\t" + str(round(float(tot_pairs_counter)/float(reads_counter)*100,3)) + "\n")
+    hr1.close()
+    hr2.close()
+    outfile.close()

diff --git a/conf/base.config b/conf/base.config
index 021b3f4..d655a76 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -25,23 +25,26 @@ process {
 
   // nf-core: Customise requirements for specific processes.
  // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
  withLabel:process_low {
-    cpus = { check_max( 2 * task.attempt, 'cpus' ) }
-    memory = { check_max( 14.GB * task.attempt, 'memory' ) }
+    cpus = { check_max( 1 * task.attempt, 'cpus' ) }
+    memory = { check_max( 4.GB * task.attempt, 'memory' ) }
     time = { check_max( 6.h * task.attempt, 'time' ) }
  }
  withLabel:process_medium {
-    cpus = { check_max( 6 * task.attempt, 'cpus' ) }
-    memory = { check_max( 42.GB * task.attempt, 'memory' ) }
+    cpus = { check_max( 4 * task.attempt, 'cpus' ) }
+    memory = { check_max( 8.GB * task.attempt, 'memory' ) }
     time = { check_max( 8.h * task.attempt, 'time' ) }
  }
  withLabel:process_high {
-    cpus = { check_max( 12 * task.attempt, 'cpus' ) }
-    memory = { check_max( 84.GB * task.attempt, 'memory' ) }
+    cpus = { check_max( 8 * task.attempt, 'cpus' ) }
+    memory = { check_max( 64.GB * task.attempt, 'memory' ) }
     time = { check_max( 10.h * task.attempt, 'time' ) }
  }
  withLabel:process_long {
    time = { check_max( 20.h * task.attempt, 'time' ) }
  }
+  withLabel:process_highmem {
+    memory = { check_max( 12.GB * task.attempt, 'memory' ) }
+  }
  withName:get_software_versions {
    cache = false
  }

diff --git a/environment.yml b/environment.yml
index c1e113e..9d0d609 100644
--- a/environment.yml
+++ b/environment.yml
@@ -6,16 +6,13 @@ channels:
   - bioconda
   - defaults
 dependencies:
-  - python=2.7.15
-  - pip=19.1
-  - scipy=1.2.1
-  - numpy=1.16.3
-  - r-markdown=0.9
-  - bx-python=0.8.2
-  - pysam=0.15.2
-  - cooler=0.8.5
-  - bowtie2=2.3.5
-  - samtools=1.9
-  - multiqc=1.7
-  - pip:
-    - iced==0.5.1
+  - conda-forge::python=3.7.6
+  - conda-forge::scipy=1.4.1
+  - conda-forge::numpy=1.18.1
+  - bioconda::iced=0.5.4
+  - bioconda::bx-python=0.8.8
+  - bioconda::pysam=0.15.4
+  - bioconda::cooler=0.8.6
+  - bioconda::bowtie2=2.3.5
+  - bioconda::samtools=1.9
+  - bioconda::multiqc=1.8

diff --git a/main.nf b/main.nf
index 18c0526..85b4154 100644
--- a/main.nf
+++ b/main.nf
@@ -21,62 +21,64 @@ def helpMessage() {
     nextflow run nf-core/hic --reads '*_R{1,2}.fastq.gz' -profile conda
 
     Mandatory arguments:
-      --reads                       Path to input data (must be surrounded with quotes)
-      -profile                      Configuration profile to use. Can use multiple (comma separated)
-                                    Available: conda, docker, singularity, awsbatch, test and more.
-
-    References                      If not specified in the configuration file or you wish to overwrite any of the references.
-      --genome                      Name of iGenomes reference
-      --bwt2_index                  Path to Bowtie2 index
-      --fasta                       Path to Fasta reference
-      --chromosome_size             Path to chromosome size file
-      --restriction_fragments       Path to restriction fragment file (bed)
-      --saveReference               Save reference genome to output folder. Default: False
-      --saveAlignedIntermediates    Save intermediates alignment files. Default: False
+      --reads [file]                            Path to input data (must be surrounded with quotes)
+      -profile [str]                            Configuration profile to use. Can use multiple (comma separated)
+                                                Available: conda, docker, singularity, awsbatch, test and more.
+
+    References                                  If not specified in the configuration file or you wish to overwrite any of the references.
+      --genome [str]                            Name of iGenomes reference
+      --bwt2_index [file]                       Path to Bowtie2 index
+      --fasta [file]                            Path to Fasta reference
+      --chromosome_size [file]                  Path to chromosome size file
+      --restriction_fragments [file]            Path to restriction fragment file (bed)
+      --save_reference [bool]                   Save reference genome to output folder. Default: False
+      --save_aligned_intermediates [bool]       Save intermediate alignment files. Default: False
 
     Alignments
-      --bwt2_opts_end2end           Options for bowtie2 end-to-end mappinf (first mapping step). See hic.config for default.
-      --bwt2_opts_trimmed           Options for bowtie2 mapping after ligation site trimming. See hic.config for default.
-      --min_mapq                    Minimum mapping quality values to consider. Default: 10
-      --restriction_site            Cutting motif(s) of restriction enzyme(s) (comma separated). Default: 'A^AGCTT'
-      --ligation_site               Ligation motifs to trim (comma separated). Default: 'AAGCTAGCTT'
-      --rm_singleton                Remove singleton reads. Default: true
-      --rm_multi                    Remove multi-mapped reads. Default: true
-      --rm_dup                      Remove duplicates. Default: true
+      --bwt2_opts_end2end [str]                 Options for bowtie2 end-to-end mapping (first mapping step). See hic.config for default.
+      --bwt2_opts_trimmed [str]                 Options for bowtie2 mapping after ligation site trimming. See hic.config for default.
+      --min_mapq [int]                          Minimum mapping quality values to consider. Default: 10
+      --restriction_site [str]                  Cutting motif(s) of restriction enzyme(s) (comma separated). Default: 'A^AGCTT'
+      --ligation_site [str]                     Ligation motifs to trim (comma separated). Default: 'AAGCTAGCTT'
+      --rm_singleton [bool]                     Remove singleton reads. Default: true
+      --rm_multi [bool]                         Remove multi-mapped reads. Default: true
+      --rm_dup [bool]                           Remove duplicates. Default: true
 
     Contacts calling
-      --min_restriction_fragment_size   Minimum size of restriction fragments to consider. Default: None
-      --max_restriction_fragment_size   Maximum size of restriction fragments to consider. Default: None
-      --min_insert_size             Minimum insert size of mapped reads to consider. Default: None
-      --max_insert_size             Maximum insert size of mapped reads to consider. Default: None
-      --saveInteractionBAM          Save BAM file with interaction tags (dangling-end, self-circle, etc.). Default: False
+      --min_restriction_fragment_size [int]     Minimum size of restriction fragments to consider. Default: None
+      --max_restriction_fragment_size [int]     Maximum size of restriction fragments to consider. Default: None
+      --min_insert_size [int]                   Minimum insert size of mapped reads to consider. Default: None
+      --max_insert_size [int]                   Maximum insert size of mapped reads to consider. Default: None
+      --save_interaction_bam [bool]             Save BAM file with interaction tags (dangling-end, self-circle, etc.). Default: False
 
-      --dnase                       Run DNase Hi-C mode. All options related to restriction fragments are not considered. Default: False
-      --min_cis_dist                Minimum intra-chromosomal distance to consider. Default: None
+      --dnase [bool]                            Run DNase Hi-C mode. All options related to restriction fragments are not considered. Default: False
+      --min_cis_dist [int]                      Minimum intra-chromosomal distance to consider. Default: None
 
     Contact maps
-      --bin_size                    Bin size for contact maps (comma separated). Default: '1000000,500000'
-      --ice_max_iter                Maximum number of iteration for ICE normalization. Default: 100
-      --ice_filter_low_count_perc   Percentage of low counts columns/rows to filter before ICE normalization. Default: 0.02
-      --ice_filter_high_count_perc  Percentage of high counts columns/rows to filter before ICE normalization. Default: 0
-      --ice_eps                     Convergence criteria for ICE normalization. Default: 0.1
+      --bin_size [int]                          Bin size for contact maps (comma separated). Default: '1000000,500000'
+      --ice_max_iter [int]                      Maximum number of iterations for ICE normalization. Default: 100
+      --ice_filter_low_count_perc [float]       Percentage of low counts columns/rows to filter before ICE normalization. Default: 0.02
+      --ice_filter_high_count_perc [float]      Percentage of high counts columns/rows to filter before ICE normalization. Default: 0
+      --ice_eps [float]                         Convergence criteria for ICE normalization. Default: 0.1
 
     Workflow
-      --skipMaps                    Skip generation of contact maps. Useful for capture-C. Default: False
-      --skipIce                     Skip ICE normalization. Default: False
-      --skipCool                    Skip generation of cool files. Default: False
-      --skipMultiQC                 Skip MultiQC. Default: False
+      --skip_maps [bool]                        Skip generation of contact maps. Useful for capture-C. Default: False
+      --skip_ice [bool]                         Skip ICE normalization. Default: False
+      --skip_cool [bool]                        Skip generation of cool files. Default: False
+      --skip_multiqc [bool]                     Skip MultiQC. Default: False
 
     Other
-      --splitFastq                  Size of read chuncks to use to speed up the workflow. Default: None
-      --outdir                      The output directory where the results will be saved. Default: './results'
-      --email                       Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. Default: None
-      -name                         Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic. Default: None
+      --split_fastq [int]                       Size of read chunks to use to speed up the workflow. Default: None
+      --outdir [file]                           The output directory where the results will be saved. Default: './results'
+      --email [email]                           Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. Default: None
+      --email_on_fail [email]                   Same as --email, except only send mail if the workflow is not successful
+      --max_multiqc_email_size [str]            Threshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB)
+      -name [str]                               Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic. Default: None
 
     AWSBatch
-      --awsqueue                    The AWSBatch JobQueue that needs to be set when running on AWSBatch
-      --awsregion                   The AWS Region for your AWS Batch job to run on
+      --awsqueue [str]                          The AWSBatch JobQueue that needs to be set when running on AWSBatch
+      --awsregion [str]                         The AWS Region for your AWS Batch job to run on
     """.stripIndent()
 }

@@ -152,7 +154,7 @@ if (params.readPaths){
         .separate( raw_reads, raw_reads_2 ) { a -> [tuple(a[0], a[1][0]), tuple(a[0], a[1][1])] }
 }
 
-if ( params.splitFastq ){
+if ( params.split_fastq ){
    raw_reads_full = raw_reads.concat( raw_reads_2 )
-   raw_reads = raw_reads_full.splitFastq( by: params.splitFastq , file: true)
+   raw_reads = raw_reads_full.splitFastq( by: params.split_fastq , file: true)
 }else{

@@ -191,7 +193,6 @@
 }
 
 // Chromosome size
-
 if ( params.chromosome_size ){
    Channel.fromPath( params.chromosome_size , checkIfExists: true)
          .into {chromosome_size; chromosome_size_cool}

@@ -236,7 +237,7 @@ def summary = [:]
 if(workflow.revision) summary['Pipeline Release'] = workflow.revision
 summary['Run Name']         = custom_runName ?: workflow.runName
 summary['Reads']            = params.reads
-summary['splitFastq']       = params.splitFastq
+summary['splitFastq']       = params.split_fastq
 summary['Fasta Ref']        = params.fasta
 summary['Restriction Motif']= params.restriction_site
 summary['Ligation Motif']   = params.ligation_site
@@ -249,7 +250,6 @@ summary['Min Insert Size']  = params.min_insert_size
 summary['Max Insert Size']  = params.max_insert_size
 summary['Min CIS dist']     = params.min_cis_dist
 summary['Maps resolution']  = params.bin_size
-
 summary['Max Memory']       = params.max_memory
 summary['Max CPUs']         = params.max_cpus
 summary['Max Time']         = params.max_time

@@ -333,8 +333,9 @@ process get_software_versions {
 if(!params.bwt2_index && params.fasta){
     process makeBowtie2Index {
         tag "$bwt2_base"
-        publishDir path: { params.saveReference ? "${params.outdir}/reference_genome" : params.outdir },
-                   saveAs: { params.saveReference ? it : null }, mode: 'copy'
+        label 'process_highmem'
+        publishDir path: { params.save_reference ? "${params.outdir}/reference_genome" : params.outdir },
+                   saveAs: { params.save_reference ? it : null }, mode: 'copy'
 
         input:
         file fasta from fasta_for_index

@@ -356,8 +357,9 @@ if(!params.bwt2_index && params.fasta){
 if(!params.chromosome_size && params.fasta){
     process makeChromSize {
         tag "$fasta"
-        publishDir path: { params.saveReference ? "${params.outdir}/reference_genome" : params.outdir },
-                   saveAs: { params.saveReference ? it : null }, mode: 'copy'
+        label 'process_low'
+        publishDir path: { params.save_reference ? "${params.outdir}/reference_genome" : params.outdir },
+                   saveAs: { params.save_reference ? it : null }, mode: 'copy'
 
         input:
         file fasta from fasta_for_chromsize

@@ -375,9 +377,10 @@ if(!params.chromosome_size && params.fasta){
 if(!params.restriction_fragments && params.fasta && !params.dnase){
     process getRestrictionFragments {
-        tag "$fasta - ${params.restriction_site}"
-        publishDir path: { params.saveReference ? "${params.outdir}/reference_genome" : params.outdir },
-                   saveAs: { params.saveReference ? it : null }, mode: 'copy'
+        tag "$fasta ${params.restriction_site}"
+        label 'process_low'
+        publishDir path: { params.save_reference ? "${params.outdir}/reference_genome" : params.outdir },
+                   saveAs: { params.save_reference ? it : null }, mode: 'copy'
 
         input:
         file fasta from fasta_for_resfrag

@@ -402,8 +405,9 @@
 process bowtie2_end_to_end {
    tag "$prefix"
-   publishDir path: { params.saveAlignedIntermediates ? "${params.outdir}/mapping" : params.outdir },
-              saveAs: { params.saveAlignedIntermediates ? it : null }, mode: 'copy'
+   label 'process_medium'
+   publishDir path: { params.save_aligned_intermediates ? "${params.outdir}/mapping" : params.outdir },
+              saveAs: { params.save_aligned_intermediates ? it : null }, mode: 'copy'
 
    input:
   set val(sample), file(reads) from raw_reads

@@ -440,8 +444,9 @@ process bowtie2_end_to_end {
 process trim_reads {
    tag "$prefix"
-   publishDir path: { params.saveAlignedIntermediates ? "${params.outdir}/mapping" : params.outdir },
-              saveAs: { params.saveAlignedIntermediates ? it : null }, mode: 'copy'
+   label 'process_low'
+   publishDir path: { params.save_aligned_intermediates ? "${params.outdir}/mapping" : params.outdir },
+              saveAs: { params.save_aligned_intermediates ? it : null }, mode: 'copy'
 
    when:
    !params.dnase

@@ -462,8 +467,9 @@ process trim_reads {
 process bowtie2_on_trimmed_reads {
    tag "$prefix"
-   publishDir path: { params.saveAlignedIntermediates ? "${params.outdir}/mapping" : params.outdir },
-              saveAs: { params.saveAlignedIntermediates ? it : null }, mode: 'copy'
+   label 'process_medium'
+   publishDir path: { params.save_aligned_intermediates ? "${params.outdir}/mapping" : params.outdir },
+              saveAs: { params.save_aligned_intermediates ? it : null }, mode: 'copy'
 
    when:
    !params.dnase

@@ -489,8 +495,9 @@ process bowtie2_on_trimmed_reads {
 if (!params.dnase){
    process merge_mapping_steps{
       tag "$sample = $bam1 + $bam2"
-      publishDir path: { params.saveAlignedIntermediates ? "${params.outdir}/mapping" : params.outdir },
-                 saveAs: { params.saveAlignedIntermediates ? it : null }, mode: 'copy'
+      label 'process_medium'
+      publishDir path: { params.save_aligned_intermediates ? "${params.outdir}/mapping" : params.outdir },
+                 saveAs: { params.save_aligned_intermediates ? it : null }, mode: 'copy'
 
      input:
      set val(prefix), file(bam1), file(bam2) from end_to_end_bam.join( trimmed_bam )

@@ -529,8 +536,9 @@ if (!params.dnase){
 }else{
    process dnase_mapping_stats{
       tag "$sample = $bam1"
-      publishDir path: { params.saveAlignedIntermediates ? "${params.outdir}/mapping" : params.outdir },
-                 saveAs: { params.saveAlignedIntermediates ? it : null }, mode: 'copy'
+      label 'process_medium'
+      publishDir path: { params.save_aligned_intermediates ? "${params.outdir}/mapping" : params.outdir },
+                 saveAs: { params.save_aligned_intermediates ? it : null }, mode: 'copy'
 
      input:
      set val(prefix), file(bam1) from end_to_end_bam

@@ -556,10 +564,9 @@ if (!params.dnase){
    }
 }
 
-println(bwt2_merged_bam)
-
 process combine_mapped_files{
    tag "$sample = $r1_prefix + $r2_prefix"
+   label 'process_low'
    publishDir "${params.outdir}/mapping", mode: 'copy',
              saveAs: {filename -> filename.indexOf(".pairstat") > 0 ? "stats/$filename" : "$filename"}

@@ -594,6 +601,7 @@ process combine_mapped_files{
 if (!params.dnase){
    process get_valid_interaction{
       tag "$sample"
+      label 'process_low'
      publishDir "${params.outdir}/hic_results/data", mode: 'copy',
                saveAs: {filename -> filename.indexOf("*stat") > 0 ? "stats/$filename" : "$filename"}

@@ -611,7 +619,7 @@ if (!params.dnase){
      set val(sample), file("*RSstat") into all_rsstat
 
      script:
-      if (params.splitFastq){
+      if (params.split_fastq){
         sample = sample.toString() - ~/(\.[0-9]+)$/
      }
 
@@ -621,7 +629,7 @@ if (!params.dnase){
      if ("$params.max_insert_size".isInteger()) opts="${opts} -l ${params.max_insert_size}"
      if ("$params.min_restriction_fragment_size".isInteger()) opts="${opts} -t ${params.min_restriction_fragment_size}"
      if ("$params.max_restriction_fragment_size".isInteger()) opts="${opts} -m ${params.max_restriction_fragment_size}"
-      if (params.saveInteractionBAM) opts="${opts} --sam"
+      if (params.save_interaction_bam) opts="${opts} --sam"
      """
      mapped_2hic_fragments.py -f ${frag_file} -r ${pe_bam} --all ${opts}
      """

@@ -630,6 +638,7 @@ if (!params.dnase){
 else{
    process get_valid_interaction_dnase{
       tag "$sample"
+      label 'process_low'
      publishDir "${params.outdir}/hic_results/data", mode: 'copy',
                saveAs: {filename -> filename.indexOf("*stat") > 0 ? "stats/$filename" : "$filename"}

@@ -642,7 +651,7 @@ else{
      set val(sample), file("*RSstat") into all_rsstat
 
      script:
-      if (params.splitFastq){
+      if (params.split_fastq){
         sample = sample.toString() - ~/(\.[0-9]+)$/
      }

@@ -661,6 +670,7 @@ else{
 process remove_duplicates {
    tag "$sample"
+   label 'process_highmem'
   publishDir "${params.outdir}/hic_results/data", mode: 'copy',
              saveAs: {filename -> filename.indexOf("*stat") > 0 ? "stats/$sample/$filename" : "$filename"}

@@ -707,6 +717,7 @@ process remove_duplicates {
 process merge_sample {
    tag "$ext"
+   label 'process_low'
   publishDir "${params.outdir}/hic_results/stats/${sample}", mode: 'copy'
 
   input:

@@ -726,13 +737,13 @@ process merge_sample {
    """
 }
 
-
 process build_contact_maps{
    tag "$sample - $mres"
+   label 'process_highmem'
   publishDir "${params.outdir}/hic_results/matrix/raw", mode: 'copy'
 
   when:
-   !params.skipMaps
+   !params.skip_maps
 
   input:
   set val(sample), file(vpairs), val(mres) from all_valid_pairs.combine(map_res)

@@ -754,10 +765,11 @@ process build_contact_maps{
 process run_ice{
    tag "$rmaps"
+   label 'process_highmem'
   publishDir "${params.outdir}/hic_results/matrix/iced", mode: 'copy'
 
   when:
-   !params.skipMaps && !params.skipIce
+   !params.skip_maps && !params.skip_ice
 
   input:
   file(rmaps) from raw_maps

@@ -782,10 +794,11 @@ process run_ice{
 */
 process generate_cool{
    tag "$sample"
+   label 'process_medium'
   publishDir "${params.outdir}/export/cool", mode: 'copy'
 
   when:
-   !params.skipCool
+   !params.skip_cool
 
   input:
   set val(sample), file(vpairs) from all_valid_pairs_4cool

@@ -805,10 +818,11 @@ process generate_cool{
 * STEP 6 - MultiQC
 */
 process multiqc {
+   label 'process_low'
   publishDir "${params.outdir}/MultiQC", mode: 'copy'
 
   when:
-   !params.skipMultiQC
+   !params.skip_multiqc
 
   input:
   file multiqc_config from ch_multiqc_config
-- 
GitLab