From 6fbdb2eb5aa1af6583dbe68bc343bfd866f60577 Mon Sep 17 00:00:00 2001 From: Edmund Miller <edmund.a.miller@protonmail.com> Date: Wed, 4 Jan 2023 08:17:08 -0600 Subject: [PATCH] [LINT] Run black --- bin/digest_genome.py | 101 +++++----- bin/mapped_2hic_dnase.py | 222 ++++++++++++--------- bin/mapped_2hic_fragments.py | 366 +++++++++++++++++++++-------------- bin/mergeSAM.py | 181 +++++++++++------ bin/merge_statfiles.py | 34 ++-- 5 files changed, 543 insertions(+), 361 deletions(-) diff --git a/bin/digest_genome.py b/bin/digest_genome.py index 2c29a49..9f05b45 100755 --- a/bin/digest_genome.py +++ b/bin/digest_genome.py @@ -18,15 +18,11 @@ import os import sys import numpy as np -RE_cutsite = { - "mboi": ["^GATC"], - "dpnii": ["^GATC"], - "bglii": ["A^GATCT"], - "hindiii": ["A^AGCTT"]} +RE_cutsite = {"mboi": ["^GATC"], "dpnii": ["^GATC"], "bglii": ["A^GATCT"], "hindiii": ["A^AGCTT"]} def find_re_sites(filename, sequences, offset): - with open(filename, 'r') as infile: + with open(filename, "r") as infile: chr_id = None big_str = "" indices = [] @@ -40,13 +36,12 @@ def find_re_sites(filename, sequences, offset): # If this is not the first chromosome, find the indices and append # them to the list if chr_id is not None: - for rs in range(len(sequences)): - pattern = "(?={})".format(sequences[rs].lower()) - indices += [m.start() + offset[rs]\ - for m in re.finditer(pattern, big_str)] - indices.sort() - all_indices.append(indices) - indices = [] + for rs in range(len(sequences)): + pattern = "(?={})".format(sequences[rs].lower()) + indices += [m.start() + offset[rs] for m in re.finditer(pattern, big_str)] + indices.sort() + all_indices.append(indices) + indices = [] # This is a new chromosome. Empty the sequence string, and add the # correct chrom id @@ -63,11 +58,10 @@ def find_re_sites(filename, sequences, offset): # Add the indices for the last chromosome for rs in range(len(sequences)): pattern = "(?={})".format(sequences[rs].lower()) - indices += [m.start() + offset[rs] - for m in re.finditer(pattern, big_str)] + indices += [m.start() + offset[rs] for m in re.finditer(pattern, big_str)] indices.sort() all_indices.append(indices) - + return contig_names, all_indices @@ -75,7 +69,7 @@ def find_chromsomose_lengths(reference_filename): chromosome_lengths = [] chromosome_names = [] length = None - with open(reference_filename, 'r') as infile: + with open(reference_filename, "r") as infile: for line in infile: if line.startswith(">"): chromosome_names.append(line[1:].strip()) @@ -89,11 +83,11 @@ def find_chromsomose_lengths(reference_filename): def replaceN(cs): - npos = int(cs.find('N')) + npos = int(cs.find("N")) cseql = [] if npos != -1: - for nuc in ["A","C","G","T"]: - tmp = cs.replace('N', nuc, 1) + for nuc in ["A", "C", "G", "T"]: + tmp = cs.replace("N", nuc, 1) tmpl = replaceN(tmp) if type(tmpl) == list: cseql = cseql + tmpl @@ -106,50 +100,59 @@ def replaceN(cs): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('fastafile') - parser.add_argument('-r', '--restriction_sites', - dest='res_sites', - nargs='+', - help=("The cutting position has to be specified using " - "'^'. For instance, -r A^AGCTT for HindIII " - "digestion. Several restriction enzyme can be " - "specified.")) - parser.add_argument('-o', '--out', default=None) + parser.add_argument("fastafile") + parser.add_argument( + "-r", + "--restriction_sites", + dest="res_sites", + nargs="+", + help=( + "The cutting position has to be specified using " + "'^'. For instance, -r A^AGCTT for HindIII " + "digestion. Several restriction enzyme can be " + "specified." + ), + ) + parser.add_argument("-o", "--out", default=None) args = parser.parse_args() filename = args.fastafile out = args.out - + # Split restriction sites if comma-separated - cutsites=[] + cutsites = [] for s in args.res_sites: - for m in s.split(','): + for m in s.split(","): cutsites.append(m) - + # process args and get restriction enzyme sequences sequences = [] offset = [] for cs in cutsites: if cs.lower() in RE_cutsite: - cseq = ''.join(RE_cutsite[cs.lower()]) + cseq = "".join(RE_cutsite[cs.lower()]) else: cseq = cs - offpos = int(cseq.find('^')) + offpos = int(cseq.find("^")) if offpos == -1: - print("Unable to detect offset for {}. Please, use '^' to specify the cutting position,\ - i.e A^GATCT for HindIII digestion.".format(cseq)) + print( + "Unable to detect offset for {}. Please, use '^' to specify the cutting position,\ + i.e A^GATCT for HindIII digestion.".format( + cseq + ) + ) sys.exit(-1) for nuc in list(set(cs)): - if nuc not in ['A','T','G','C','N','^']: + if nuc not in ["A", "T", "G", "C", "N", "^"]: print("Find unexpected character ['{}']in restriction motif".format(nuc)) print("Note that multiple motifs should be separated by a space (not a comma !)") sys.exit(-1) offset.append(offpos) - sequences.append(re.sub('\^', '', cseq)) + sequences.append(re.sub("\^", "", cseq)) # replace all N in restriction motif sequences_without_N = [] @@ -158,32 +161,32 @@ if __name__ == "__main__": nrs = replaceN(sequences[rs]) sequences_without_N = sequences_without_N + nrs offset_without_N = offset_without_N + [offset[rs]] * len(nrs) - + sequences = sequences_without_N offset = offset_without_N - + if out is None: out = os.path.splitext(filename)[0] + "_fragments.bed" print("Analyzing", filename) print("Restriction site(s)", ",".join(sequences)) - print("Offset(s)", ','.join(str(x) for x in offset)) + print("Offset(s)", ",".join(str(x) for x in offset)) # Read fasta file and look for rs per chromosome - contig_names, all_indices = find_re_sites(filename, sequences, offset=offset) + contig_names, all_indices = find_re_sites(filename, sequences, offset=offset) _, lengths = find_chromsomose_lengths(filename) valid_fragments = [] for i, indices in enumerate(all_indices): valid_fragments_chr = np.concatenate( - [np.concatenate([[0], indices])[:, np.newaxis], - np.concatenate([indices, [lengths[i]]])[:, np.newaxis]], - axis=1) + [np.concatenate([[0], indices])[:, np.newaxis], np.concatenate([indices, [lengths[i]]])[:, np.newaxis]], + axis=1, + ) valid_fragments.append(valid_fragments_chr) # Write results print("Writing to {} ...".format(out)) - with open(out, 'w') as outfile: + with open(out, "w") as outfile: for chrom_name, indices in zip(contig_names, valid_fragments): frag_id = 0 for begin, end in indices: @@ -192,4 +195,6 @@ if __name__ == "__main__": if end > begin: frag_id += 1 frag_name = "HIC_{}_{}".format(str(chrom_name), int(frag_id)) - outfile.write("{}\t{}\t{}\t{}\t0\t+\n".format(str(chrom_name), int(begin), int(end), str(frag_name))) + outfile.write( + "{}\t{}\t{}\t{}\t0\t+\n".format(str(chrom_name), int(begin), int(end), str(frag_name)) + ) diff --git a/bin/mapped_2hic_dnase.py b/bin/mapped_2hic_dnase.py index dd023b0..ff59366 100755 --- a/bin/mapped_2hic_dnase.py +++ b/bin/mapped_2hic_dnase.py @@ -25,8 +25,12 @@ def usage(): print("-r/--mappedReadsFile <BAM/SAM file of mapped reads>") print("[-o/--outputDir] <Output directory. Default is current directory>") print("[-d/--minCisDist] <Minimum distance between intrachromosomal contact to consider>") - print("[-g/--gtag] <Genotype tag. If specified, this tag will be reported in the valid pairs output for allele specific classification>") - print("[-a/--all] <Write all additional output files, with information about the discarded reads (self-circle, dangling end, etc.)>") + print( + "[-g/--gtag] <Genotype tag. If specified, this tag will be reported in the valid pairs output for allele specific classification>" + ) + print( + "[-a/--all] <Write all additional output files, with information about the discarded reads (self-circle, dangling end, etc.)>" + ) print("[-v/--verbose] <Verbose>") print("[-h/--help] <Help>") return @@ -38,8 +42,8 @@ def get_args(): opts, args = getopt.getopt( sys.argv[1:], "r:o:d:g:avh", - ["mappedReadsFile=", - "outputDir=", "minDist=", "gatg", "all", "verbose", "help"]) + ["mappedReadsFile=", "outputDir=", "minDist=", "gatg", "all", "verbose", "help"], + ) except getopt.GetoptError: usage() sys.exit(-1) @@ -78,8 +82,8 @@ def get_read_pos(read, st="start"): list of aligned reads """ if st == "middle": - pos = read.reference_start + int(read.alen/2) - elif st =="start": + pos = read.reference_start + int(read.alen / 2) + elif st == "start": pos = get_read_start(read) elif st == "left": pos = read.reference_start @@ -88,11 +92,11 @@ def get_read_pos(read, st="start"): def get_read_start(read): - """ - Return the 5' end of the read + """ + Return the 5' end of the read """ if read.is_reverse: - pos = read.reference_start + read.alen -1 + pos = read.reference_start + read.alen - 1 else: pos = read.reference_start return pos @@ -125,7 +129,7 @@ def get_ordered_reads(read1, read2): def isIntraChrom(read1, read2): """ Return true is the reads pair is intrachromosomal - + read1 : [AlignedRead] read2 : [AlignedRead] @@ -163,23 +167,23 @@ def get_valid_orientation(read1, read2): def get_cis_dist(read1, read2): - """ - Calculte the size of the DNA fragment library + """ + Calculte the size of the DNA fragment library - read1 : [AlignedRead] - read2 : [AlignedRead] + read1 : [AlignedRead] + read2 : [AlignedRead] - """ - # Get oriented reads - ##r1, r2 = get_ordered_reads(read1, read2) - dist = None - if not r1.is_unmapped and not r2.is_unmapped: - ## Contact distances can be calculated for intrachromosomal reads only - if isIntraChrom(read1, read2): - r1pos = get_read_pos(read1) - r2pos = get_read_pos(read2) - dist = abs(r1pos - r2pos) - return dist + """ + # Get oriented reads + ##r1, r2 = get_ordered_reads(read1, read2) + dist = None + if not r1.is_unmapped and not r2.is_unmapped: + ## Contact distances can be calculated for intrachromosomal reads only + if isIntraChrom(read1, read2): + r1pos = get_read_pos(read1) + r2pos = get_read_pos(read2) + dist = abs(r1pos - r2pos) + return dist def get_read_tag(read, tag): @@ -255,15 +259,15 @@ if __name__ == "__main__": CF_ascounter = 0 baseReadsFile = os.path.basename(mappedReadsFile) - baseReadsFile = re.sub(r'\.bam$|\.sam$', '', baseReadsFile) + baseReadsFile = re.sub(r"\.bam$|\.sam$", "", baseReadsFile) # Open handlers for output files - handle_valid = open(outputDir + '/' + baseReadsFile + '.validPairs', 'w') + handle_valid = open(outputDir + "/" + baseReadsFile + ".validPairs", "w") if allOutput: - handle_dump = open(outputDir + '/' + baseReadsFile + '.DumpPairs', 'w') - handle_single = open(outputDir + '/' + baseReadsFile + '.SinglePairs','w') - handle_filt = open(outputDir + '/' + baseReadsFile + '.FiltPairs','w') + handle_dump = open(outputDir + "/" + baseReadsFile + ".DumpPairs", "w") + handle_single = open(outputDir + "/" + baseReadsFile + ".SinglePairs", "w") + handle_filt = open(outputDir + "/" + baseReadsFile + ".FiltPairs", "w") # Read the SAM/BAM file if verbose: @@ -306,7 +310,7 @@ if __name__ == "__main__": cur_handler = handle_single if allOutput else None # Check Distance criteria - Filter - if (minDist is not None and dist is not None and dist < int(minDist)): + if minDist is not None and dist is not None and dist < int(minDist): interactionType = "FILT" filt_counter += 1 cur_handler = handle_filt if allOutput else None @@ -330,13 +334,11 @@ if __name__ == "__main__": dump_counter += 1 cur_handler = handle_dump if allOutput else None - - # Split valid pairs based on XA tag if gtag is not None: r1as = get_read_tag(r1, gtag) r2as = get_read_tag(r2, gtag) - + if r1as == 1 and r2as == 1: G1G1_ascounter += 1 elif r1as == 2 and r2as == 2: @@ -357,11 +359,10 @@ if __name__ == "__main__": CF_ascounter += 1 else: UU_ascounter += 1 - - + if cur_handler is not None: if not r1.is_unmapped and not r2.is_unmapped: - + ##reorient reads to ease duplicates removal or1, or2 = get_ordered_reads(r1, r2) or1_chrom = samfile.get_reference_name(or1.reference_id) @@ -371,53 +372,93 @@ if __name__ == "__main__": r1as = get_read_tag(or1, gtag) r2as = get_read_tag(or2, gtag) if gtag is not None: - htag = str(r1as)+"-"+str(r2as) - + htag = str(r1as) + "-" + str(r2as) + cur_handler.write( - or1.query_name + "\t" + - or1_chrom + "\t" + - str(get_read_pos(or1)+1) + "\t" + - str(get_read_strand(or1)) + "\t" + - or2_chrom + "\t" + - str(get_read_pos(or2)+1) + "\t" + - str(get_read_strand(or2)) + "\t" + - "NA" + "\t" + ##dist - "NA" + "\t" + ##resfrag1 - "NA" + "\t" + ##resfrag2 - str(or1.mapping_quality) + "\t" + - str(or2.mapping_quality) + "\t" + - str(htag) + "\n") - + or1.query_name + + "\t" + + or1_chrom + + "\t" + + str(get_read_pos(or1) + 1) + + "\t" + + str(get_read_strand(or1)) + + "\t" + + or2_chrom + + "\t" + + str(get_read_pos(or2) + 1) + + "\t" + + str(get_read_strand(or2)) + + "\t" + + "NA" + + "\t" + + "NA" ##dist + + "\t" + + "NA" ##resfrag1 + + "\t" + + str(or1.mapping_quality) ##resfrag2 + + "\t" + + str(or2.mapping_quality) + + "\t" + + str(htag) + + "\n" + ) + elif r2.is_unmapped and not r1.is_unmapped: cur_handler.write( - r1.query_name + "\t" + - r1_chrom + "\t" + - str(get_read_pos(r1)+1) + "\t" + - str(get_read_strand(r1)) + "\t" + - "*" + "\t" + - "*" + "\t" + - "*" + "\t" + - "*" + "\t" + - "*" + "\t" + - "*" + "\t" + - str(r1.mapping_quality) + "\t" + - "*" + "\n") + r1.query_name + + "\t" + + r1_chrom + + "\t" + + str(get_read_pos(r1) + 1) + + "\t" + + str(get_read_strand(r1)) + + "\t" + + "*" + + "\t" + + "*" + + "\t" + + "*" + + "\t" + + "*" + + "\t" + + "*" + + "\t" + + "*" + + "\t" + + str(r1.mapping_quality) + + "\t" + + "*" + + "\n" + ) elif r1.is_unmapped and not r2.is_unmapped: cur_handler.write( - r2.query_name + "\t" + - "*" + "\t" + - "*" + "\t" + - "*" + "\t" + - r2_chrom + "\t" + - str(get_read_pos(r2)+1) + "\t" + - str(get_read_strand(r2)) + "\t" + - "*" + "\t" + - "*" + "\t" + - "*" + "\t" + - "*" + "\t" + - str(r2.mapping_quality) + "\n") - - if (reads_counter % 100000 == 0 and verbose): + r2.query_name + + "\t" + + "*" + + "\t" + + "*" + + "\t" + + "*" + + "\t" + + r2_chrom + + "\t" + + str(get_read_pos(r2) + 1) + + "\t" + + str(get_read_strand(r2)) + + "\t" + + "*" + + "\t" + + "*" + + "\t" + + "*" + + "\t" + + "*" + + "\t" + + str(r2.mapping_quality) + + "\n" + ) + + if reads_counter % 100000 == 0 and verbose: print("##", reads_counter) # Close handler @@ -428,7 +469,7 @@ if __name__ == "__main__": handle_filt.close() # Write stats file - with open(outputDir + '/' + baseReadsFile + '.RSstat', 'w') as handle_stat: + with open(outputDir + "/" + baseReadsFile + ".RSstat", "w") as handle_stat: handle_stat.write("## Hi-C processing - no restriction fragments\n") handle_stat.write("Valid_interaction_pairs\t" + str(valid_counter) + "\n") handle_stat.write("Valid_interaction_pairs_FF\t" + str(valid_counter_FF) + "\n") @@ -439,17 +480,24 @@ if __name__ == "__main__": handle_stat.write("Filtered_pairs\t" + str(filt_counter) + "\n") handle_stat.write("Dumped_pairs\t" + str(dump_counter) + "\n") - ## Write AS report + ## Write AS report if gtag is not None: handle_stat.write("## ======================================\n") handle_stat.write("## Allele specific information\n") handle_stat.write("Valid_pairs_from_ref_genome_(1-1)\t" + str(G1G1_ascounter) + "\n") - handle_stat.write("Valid_pairs_from_ref_genome_with_one_unassigned_mate_(0-1/1-0)\t" + str(UG1_ascounter+G1U_ascounter) + "\n") + handle_stat.write( + "Valid_pairs_from_ref_genome_with_one_unassigned_mate_(0-1/1-0)\t" + + str(UG1_ascounter + G1U_ascounter) + + "\n" + ) handle_stat.write("Valid_pairs_from_alt_genome_(2-2)\t" + str(G2G2_ascounter) + "\n") - handle_stat.write("Valid_pairs_from_alt_genome_with_one_unassigned_mate_(0-2/2-0)\t" + str(UG2_ascounter+G2U_ascounter) + "\n") - handle_stat.write("Valid_pairs_from_alt_and_ref_genome_(1-2/2-1)\t" + str(G1G2_ascounter+G2G1_ascounter) + "\n") + handle_stat.write( + "Valid_pairs_from_alt_genome_with_one_unassigned_mate_(0-2/2-0)\t" + + str(UG2_ascounter + G2U_ascounter) + + "\n" + ) + handle_stat.write( + "Valid_pairs_from_alt_and_ref_genome_(1-2/2-1)\t" + str(G1G2_ascounter + G2G1_ascounter) + "\n" + ) handle_stat.write("Valid_pairs_with_both_unassigned_mated_(0-0)\t" + str(UU_ascounter) + "\n") handle_stat.write("Valid_pairs_with_at_least_one_conflicting_mate_(3-)\t" + str(CF_ascounter) + "\n") - - - diff --git a/bin/mapped_2hic_fragments.py b/bin/mapped_2hic_fragments.py index e823ee0..cc0e40b 100755 --- a/bin/mapped_2hic_fragments.py +++ b/bin/mapped_2hic_fragments.py @@ -32,8 +32,12 @@ def usage(): print("[-t/--shortestFragmentLength] <Shortest restriction fragment length to consider>") print("[-m/--longestFragmentLength] <Longest restriction fragment length to consider>") print("[-d/--minCisDist] <Minimum distance between intrachromosomal contact to consider>") - print("[-g/--gtag] <Genotype tag. If specified, this tag will be reported in the valid pairs output for allele specific classification>") - print("[-a/--all] <Write all additional output files, with information about the discarded reads (self-circle, dangling end, etc.)>") + print( + "[-g/--gtag] <Genotype tag. If specified, this tag will be reported in the valid pairs output for allele specific classification>" + ) + print( + "[-a/--all] <Write all additional output files, with information about the discarded reads (self-circle, dangling end, etc.)>" + ) print("[-S/--sam] <Output an additional SAM file with flag 'CT' for pairs classification>") print("[-v/--verbose] <Verbose>") print("[-h/--help] <Help>") @@ -46,13 +50,22 @@ def get_args(): opts, args = getopt.getopt( sys.argv[1:], "f:r:o:s:l:t:m:d:g:Svah", - ["fragmentFile=", - "mappedReadsFile=", - "outputDir=", - "minInsertSize=", "maxInsertSize", - "minFragSize", "maxFragSize", - "minDist", - "gatg", "sam", "verbose", "all", "help"]) + [ + "fragmentFile=", + "mappedReadsFile=", + "outputDir=", + "minInsertSize=", + "maxInsertSize", + "minFragSize", + "maxFragSize", + "minDist", + "gatg", + "sam", + "verbose", + "all", + "help", + ], + ) except getopt.GetoptError: usage() sys.exit(-1) @@ -66,7 +79,7 @@ def timing(function, *args): """ startTime = time.time() result = function(*args) - print('{} function took {:.3f}ms'.format(function.__name__, (time.time() - startTime) * 1000)) + print("{} function took {:.3f}ms".format(function.__name__, (time.time() - startTime) * 1000)) return result @@ -88,7 +101,7 @@ def get_read_strand(read): def isIntraChrom(read1, read2): """ Return true is the reads pair is intrachromosomal - + read1 : [AlignedRead] read2 : [AlignedRead] @@ -99,22 +112,22 @@ def isIntraChrom(read1, read2): def get_cis_dist(read1, read2): - """ - Calculte the contact distance between two intrachromosomal reads + """ + Calculte the contact distance between two intrachromosomal reads - read1 : [AlignedRead] - read2 : [AlignedRead] + read1 : [AlignedRead] + read2 : [AlignedRead] - """ - # Get oriented reads - ##r1, r2 = get_ordered_reads(read1, read2) - dist = None - if not read1.is_unmapped and not read2.is_unmapped: - ## Contact distances can be calculated for intrachromosomal reads only - if isIntraChrom(read1, read2): - r1pos, r2pos = get_read_pos(read1), get_read_pos(read2) - dist = abs(r1pos - r2pos) - return dist + """ + # Get oriented reads + ##r1, r2 = get_ordered_reads(read1, read2) + dist = None + if not read1.is_unmapped and not read2.is_unmapped: + ## Contact distances can be calculated for intrachromosomal reads only + if isIntraChrom(read1, read2): + r1pos, r2pos = get_read_pos(read1), get_read_pos(read2) + dist = abs(r1pos - r2pos) + return dist def get_read_pos(read, st="start"): @@ -135,12 +148,12 @@ def get_read_pos(read, st="start"): """ if st == "middle": - pos = read.reference_start + int(read.alen/2) - elif st =="start": + pos = read.reference_start + int(read.alen / 2) + elif st == "start": pos = get_read_start(read) elif st == "left": pos = read.reference_start - + return pos @@ -149,11 +162,12 @@ def get_read_start(read): Return the 5' end of the read """ if read.is_reverse: - pos = read.reference_start + read.alen -1 + pos = read.reference_start + read.alen - 1 else: pos = read.reference_start return pos + def get_ordered_reads(read1, read2): """ Reorient reads @@ -183,9 +197,10 @@ def get_ordered_reads(read1, read2): r1, r2 = read1, read2 else: r1, r2 = read2, read1 - + return r1, r2 + def load_restriction_fragment(in_file, minfragsize=None, maxfragsize=None, verbose=False): """ Read a BED file and store the intervals in a tree @@ -204,37 +219,37 @@ def load_restriction_fragment(in_file, minfragsize=None, maxfragsize=None, verbo nline = 0 nfilt = 0 for line in bed_handle: - nline += 1 - bedtab = line.split("\t") - try: - chromosome, start, end, name = bedtab[:4] - except ValueError: - print("Warning : wrong input format in line {}. Not a BED file ?!".format(nline)) - continue + nline += 1 + bedtab = line.split("\t") + try: + chromosome, start, end, name = bedtab[:4] + except ValueError: + print("Warning : wrong input format in line {}. Not a BED file ?!".format(nline)) + continue # BED files are zero-based as Intervals objects - start = int(start) # + 1 - end = int(end) - fragl = abs(end - start) - name = name.strip() - - ## Discard fragments outside the size range - filt = False - if minfragsize != None and int(fragl) < int(minfragsize): - nfilt += 1 - filt = True - elif maxfragsize != None and int(fragl) > int(maxfragsize): - nfilt += 1 - filt = True - - if chromosome in resFrag: - tree = resFrag[chromosome] - tree.add_interval(Interval(start, end, value={'name': name, 'filter': filt})) - else: - tree = Intersecter() - tree.add_interval(Interval(start, end, value={'name': name, 'filter': filt})) - resFrag[chromosome] = tree - + start = int(start) # + 1 + end = int(end) + fragl = abs(end - start) + name = name.strip() + + ## Discard fragments outside the size range + filt = False + if minfragsize != None and int(fragl) < int(minfragsize): + nfilt += 1 + filt = True + elif maxfragsize != None and int(fragl) > int(maxfragsize): + nfilt += 1 + filt = True + + if chromosome in resFrag: + tree = resFrag[chromosome] + tree.add_interval(Interval(start, end, value={"name": name, "filter": filt})) + else: + tree = Intersecter() + tree.add_interval(Interval(start, end, value={"name": name, "filter": filt})) + resFrag[chromosome] = tree + if nfilt > 0: print("Warning : {} fragment(s) outside of range and discarded. {} remaining.".format(nfilt, nline - nfilt)) bed_handle.close() @@ -253,10 +268,10 @@ def get_overlapping_restriction_fragment(resFrag, chrom, read): """ # Get read position (middle or start) pos = get_read_pos(read, st="middle") - + if chrom in resFrag: # Overlap with the position of the read (zero-based) - resfrag = resFrag[chrom].find(pos, pos+1) + resfrag = resFrag[chrom].find(pos, pos + 1) if len(resfrag) > 1: print("Warning : {} restictions fragments found for {} -skipped".format(len(resfrag), read.query_name)) return None @@ -271,21 +286,22 @@ def get_overlapping_restriction_fragment(resFrag, chrom, read): def are_contiguous_fragments(frag1, frag2, chr1, chr2): - ''' + """ Compare fragment positions to check if they are contiguous - ''' + """ ret = False if chr1 == chr2: if int(frag1.start) < int(frag2.start): d = int(frag2.start) - int(frag1.end) else: d = int(frag1.start) - int(frag2.end) - + if d == 0: ret = True - + return ret + def is_religation(read1, read2, frag1, frag2): """ Reads are expected to map adjacent fragments @@ -294,8 +310,8 @@ def is_religation(read1, read2, frag1, frag2): """ ret = False if are_contiguous_fragments(frag1, frag2, read1.tid, read2.tid): - #r1, r2 = get_ordered_reads(read1, read2) - #if get_read_strand(r1) == "+" and get_read_strand(r2) == "-": + # r1, r2 = get_ordered_reads(read1, read2) + # if get_read_strand(r1) == "+" and get_read_strand(r2) == "-": ret = True return ret @@ -405,8 +421,7 @@ def get_PE_fragment_size(read1, read2, resFrag1, resFrag2, interactionType): return fragmentsize -def get_interaction_type(read1, read1_chrom, resfrag1, read2, - read2_chrom, resfrag2, verbose): +def get_interaction_type(read1, read1_chrom, resfrag1, read2, read2_chrom, resfrag2, verbose): """ Returns the interaction type @@ -433,7 +448,7 @@ def get_interaction_type(read1, read1_chrom, resfrag1, read2, # If returned InteractionType=None -> Same restriction fragment # and same strand = Dump interactionType = None - + if not read1.is_unmapped and not read2.is_unmapped and resfrag1 is not None and resfrag2 is not None: # same restriction fragment if resfrag1 == resfrag2: @@ -549,29 +564,29 @@ if __name__ == "__main__": CF_ascounter = 0 baseReadsFile = os.path.basename(mappedReadsFile) - baseReadsFile = re.sub(r'\.bam$|\.sam$', '', baseReadsFile) + baseReadsFile = re.sub(r"\.bam$|\.sam$", "", baseReadsFile) # Open handlers for output files - handle_valid = open(outputDir + '/' + baseReadsFile + '.validPairs', 'w') + handle_valid = open(outputDir + "/" + baseReadsFile + ".validPairs", "w") if allOutput: - handle_de = open(outputDir + '/' + baseReadsFile + '.DEPairs', 'w') - handle_re = open(outputDir + '/' + baseReadsFile + '.REPairs', 'w') - handle_sc = open(outputDir + '/' + baseReadsFile + '.SCPairs', 'w') - handle_dump = open(outputDir + '/' + baseReadsFile + '.DumpPairs', 'w') - handle_single = open(outputDir + '/' + baseReadsFile + '.SinglePairs', 'w') - handle_filt = open(outputDir + '/' + baseReadsFile + '.FiltPairs', 'w') + handle_de = open(outputDir + "/" + baseReadsFile + ".DEPairs", "w") + handle_re = open(outputDir + "/" + baseReadsFile + ".REPairs", "w") + handle_sc = open(outputDir + "/" + baseReadsFile + ".SCPairs", "w") + handle_dump = open(outputDir + "/" + baseReadsFile + ".DumpPairs", "w") + handle_single = open(outputDir + "/" + baseReadsFile + ".SinglePairs", "w") + handle_filt = open(outputDir + "/" + baseReadsFile + ".FiltPairs", "w") # Read the BED file resFrag = timing(load_restriction_fragment, fragmentFile, minFragSize, maxFragSize, verbose) - + # Read the SAM/BAM file if verbose: print("## Opening SAM/BAM file {} ...".format(mappedReadsFile)) samfile = pysam.Samfile(mappedReadsFile, "rb") if samOut: - handle_sam = pysam.AlignmentFile(outputDir + '/' + baseReadsFile + '_interaction.bam', "wb", template=samfile) + handle_sam = pysam.AlignmentFile(outputDir + "/" + baseReadsFile + "_interaction.bam", "wb", template=samfile) # Reads are 0-based too (for both SAM and BAM format) # Loop on all reads @@ -608,22 +623,24 @@ if __name__ == "__main__": interactionType = get_interaction_type(r1, r1_chrom, r1_resfrag, r2, r2_chrom, r2_resfrag, verbose) dist = get_PE_fragment_size(r1, r2, r1_resfrag, r2_resfrag, interactionType) cdist = get_cis_dist(r1, r2) - + ## Filter based on restriction fragments - if (r1_resfrag is not None and r1_resfrag.value['filter'] == True) or (r2_resfrag is not None and r2_resfrag.value['filter']) == True: + if (r1_resfrag is not None and r1_resfrag.value["filter"] == True) or ( + r2_resfrag is not None and r2_resfrag.value["filter"] + ) == True: interactionType = "FILT" - + # Check Insert size criteria - FILT - if (minInsertSize is not None and dist is not None and - dist < int(minInsertSize)) or \ - (maxInsertSize is not None and dist is not None and dist > int(maxInsertSize)): + if (minInsertSize is not None and dist is not None and dist < int(minInsertSize)) or ( + maxInsertSize is not None and dist is not None and dist > int(maxInsertSize) + ): interactionType = "FILT" # Check Distance criteria - FILT # Done for VI otherwise this criteria will overwrite all other invalid classification - if (interactionType == "VI" and minDist is not None and cdist is not None and cdist < int(minDist)): + if interactionType == "VI" and minDist is not None and cdist is not None and cdist < int(minDist): interactionType = "FILT" - + if interactionType == "VI": valid_counter += 1 cur_handler = handle_valid @@ -677,11 +694,11 @@ if __name__ == "__main__": elif interactionType == "SI": single_counter += 1 cur_handler = handle_single if allOutput else None - + elif interactionType == "FILT": filt_counter += 1 cur_handler = handle_filt if allOutput else None - + else: interactionType = "DUMP" dump_counter += 1 @@ -694,17 +711,17 @@ if __name__ == "__main__": ## Write results in right handler if cur_handler is not None: - if not r1.is_unmapped and not r2.is_unmapped: + if not r1.is_unmapped and not r2.is_unmapped: ##reorient reads to ease duplicates removal or1, or2 = get_ordered_reads(r1, r2) or1_chrom = samfile.get_reference_name(or1.tid) or2_chrom = samfile.get_reference_name(or2.tid) - + ##reset as tag now that the reads are oriented r1as = get_read_tag(or1, gtag) r2as = get_read_tag(or2, gtag) if gtag is not None: - htag = str(r1as)+"-"+str(r2as) + htag = str(r1as) + "-" + str(r2as) ##get fragment name and reorient if necessary if or1 == r1 and or2 == r2: @@ -715,73 +732,113 @@ if __name__ == "__main__": or2_resfrag = r1_resfrag if or1_resfrag is not None: - or1_fragname = or1_resfrag.value['name'] + or1_fragname = or1_resfrag.value["name"] else: - or1_fragname = 'None' - + or1_fragname = "None" + if or2_resfrag is not None: - or2_fragname = or2_resfrag.value['name'] + or2_fragname = or2_resfrag.value["name"] else: - or2_fragname = 'None' - + or2_fragname = "None" + cur_handler.write( - or1.query_name + "\t" + - or1_chrom + "\t" + - str(get_read_pos(or1)+1) + "\t" + - str(get_read_strand(or1)) + "\t" + - or2_chrom + "\t" + - str(get_read_pos(or2)+1) + "\t" + - str(get_read_strand(or2)) + "\t" + - str(dist) + "\t" + - or1_fragname + "\t" + - or2_fragname + "\t" + - str(or1.mapping_quality) + "\t" + - str(or2.mapping_quality) + "\t" + - str(htag) + "\n") + or1.query_name + + "\t" + + or1_chrom + + "\t" + + str(get_read_pos(or1) + 1) + + "\t" + + str(get_read_strand(or1)) + + "\t" + + or2_chrom + + "\t" + + str(get_read_pos(or2) + 1) + + "\t" + + str(get_read_strand(or2)) + + "\t" + + str(dist) + + "\t" + + or1_fragname + + "\t" + + or2_fragname + + "\t" + + str(or1.mapping_quality) + + "\t" + + str(or2.mapping_quality) + + "\t" + + str(htag) + + "\n" + ) elif r2.is_unmapped and not r1.is_unmapped: if r1_resfrag is not None: - r1_fragname = r1_resfrag.value['name'] - + r1_fragname = r1_resfrag.value["name"] + cur_handler.write( - r1.query_name + "\t" + - r1_chrom + "\t" + - str(get_read_pos(r1)+1) + "\t" + - str(get_read_strand(r1)) + "\t" + - "*" + "\t" + - "*" + "\t" + - "*" + "\t" + - "*" + "\t" + - r1_fragname + "\t" + - "*" + "\t" + - str(r1.mapping_quality) + "\t" + - "*" + "\n") + r1.query_name + + "\t" + + r1_chrom + + "\t" + + str(get_read_pos(r1) + 1) + + "\t" + + str(get_read_strand(r1)) + + "\t" + + "*" + + "\t" + + "*" + + "\t" + + "*" + + "\t" + + "*" + + "\t" + + r1_fragname + + "\t" + + "*" + + "\t" + + str(r1.mapping_quality) + + "\t" + + "*" + + "\n" + ) elif r1.is_unmapped and not r2.is_unmapped: if r2_resfrag is not None: - r2_fragname = r2_resfrag.value['name'] - + r2_fragname = r2_resfrag.value["name"] + cur_handler.write( - r2.query_name + "\t" + - "*" + "\t" + - "*" + "\t" + - "*" + "\t" + - r2_chrom + "\t" + - str(get_read_pos(r2)+1) + "\t" + - str(get_read_strand(r2)) + "\t" + - "*" + "\t" + - "*" + "\t" + - r2_fragname + "\t" + - "*" + "\t" + - str(r2.mapping_quality) + "\n") - - ## Keep initial order + r2.query_name + + "\t" + + "*" + + "\t" + + "*" + + "\t" + + "*" + + "\t" + + r2_chrom + + "\t" + + str(get_read_pos(r2) + 1) + + "\t" + + str(get_read_strand(r2)) + + "\t" + + "*" + + "\t" + + "*" + + "\t" + + r2_fragname + + "\t" + + "*" + + "\t" + + str(r2.mapping_quality) + + "\n" + ) + + ## Keep initial order if samOut: - r1.tags = r1.tags + [('CT', str(interactionType))] - r2.tags = r2.tags + [('CT', str(interactionType))] + r1.tags = r1.tags + [("CT", str(interactionType))] + r2.tags = r2.tags + [("CT", str(interactionType))] handle_sam.write(r1) handle_sam.write(r2) - if (reads_counter % 100000 == 0 and verbose): + if reads_counter % 100000 == 0 and verbose: print("##", reads_counter) # Close handler @@ -794,9 +851,8 @@ if __name__ == "__main__": handle_single.close() handle_filt.close() - # Write stats file - handle_stat = open(outputDir + '/' + baseReadsFile + '.RSstat', 'w') + handle_stat = open(outputDir + "/" + baseReadsFile + ".RSstat", "w") handle_stat.write("## Hi-C processing\n") handle_stat.write("Valid_interaction_pairs\t" + str(valid_counter) + "\n") handle_stat.write("Valid_interaction_pairs_FF\t" + str(valid_counter_FF) + "\n") @@ -815,10 +871,20 @@ if __name__ == "__main__": handle_stat.write("## ======================================\n") handle_stat.write("## Allele specific information\n") handle_stat.write("Valid_pairs_from_ref_genome_(1-1)\t" + str(G1G1_ascounter) + "\n") - handle_stat.write("Valid_pairs_from_ref_genome_with_one_unassigned_mate_(0-1/1-0)\t" + str(UG1_ascounter+G1U_ascounter) + "\n") + handle_stat.write( + "Valid_pairs_from_ref_genome_with_one_unassigned_mate_(0-1/1-0)\t" + + str(UG1_ascounter + G1U_ascounter) + + "\n" + ) handle_stat.write("Valid_pairs_from_alt_genome_(2-2)\t" + str(G2G2_ascounter) + "\n") - handle_stat.write("Valid_pairs_from_alt_genome_with_one_unassigned_mate_(0-2/2-0)\t" + str(UG2_ascounter+G2U_ascounter) + "\n") - handle_stat.write("Valid_pairs_from_alt_and_ref_genome_(1-2/2-1)\t" + str(G1G2_ascounter+G2G1_ascounter) + "\n") + handle_stat.write( + "Valid_pairs_from_alt_genome_with_one_unassigned_mate_(0-2/2-0)\t" + + str(UG2_ascounter + G2U_ascounter) + + "\n" + ) + handle_stat.write( + "Valid_pairs_from_alt_and_ref_genome_(1-2/2-1)\t" + str(G1G2_ascounter + G2G1_ascounter) + "\n" + ) handle_stat.write("Valid_pairs_with_both_unassigned_mated_(0-0)\t" + str(UU_ascounter) + "\n") handle_stat.write("Valid_pairs_with_at_least_one_conflicting_mate_(3-)\t" + str(CF_ascounter) + "\n") diff --git a/bin/mergeSAM.py b/bin/mergeSAM.py index a907fd7..82ab8c3 100755 --- a/bin/mergeSAM.py +++ b/bin/mergeSAM.py @@ -1,7 +1,7 @@ #!/usr/bin/env python ## HiC-Pro -## Copyright (c) 2015 Institut Curie +## Copyright (c) 2015 Institut Curie ## Author(s): Nicolas Servant, Eric Viara ## Contact: nicolas.servant@curie.fr ## This software is distributed without any guarantee under the terms of the BSD-3 licence. @@ -20,6 +20,7 @@ import os import re import pysam + def usage(): """Usage function""" print("Usage : python mergeSAM.py") @@ -41,10 +42,8 @@ def get_args(): opts, args = getopt.getopt( sys.argv[1:], "f:r:o:q:smtvh", - ["forward=", - "reverse=", - "output=", "qual=", - "single", "multi", "stat", "verbose", "help"]) + ["forward=", "reverse=", "output=", "qual=", "single", "multi", "stat", "verbose", "help"], + ) except getopt.GetoptError: usage() sys.exit(-1) @@ -53,24 +52,26 @@ def get_args(): def is_unique_bowtie2(read): ret = False - if not read.is_unmapped and read.has_tag('AS'): - if read.has_tag('XS'): - primary = read.get_tag('AS') - secondary = read.get_tag('XS') - if (primary > secondary): + if not read.is_unmapped and read.has_tag("AS"): + if read.has_tag("XS"): + primary = read.get_tag("AS") + secondary = read.get_tag("XS") + if primary > secondary: ret = True else: ret = True return ret + ## Remove everything after "/" or " " in read's name def get_read_name(read): name = read.query_name - #return name.split("/",1)[0] - return re.split('/| ', name)[0] + # return name.split("/",1)[0] + return re.split("/| ", name)[0] + def sam_flag(read1, read2, hr1, hr2): - + f1 = read1.flag f2 = read2.flag @@ -81,7 +82,7 @@ def sam_flag(read1, read2, hr1, hr2): if r2.is_unmapped == False: r2_chrom = hr2.get_reference_name(r2.reference_id) else: - r2_chrom="*" + r2_chrom = "*" ##Relevant bitwise flags (flag in an 11-bit binary number) ##1 The read is one of a pair @@ -92,54 +93,53 @@ def sam_flag(read1, read2, hr1, hr2): ##32 The other mate in the paired-end alignment is aligned to the reverse reference strand ##64 The read is the first (#1) mate in a pair ##128 The read is the second (#2) mate in a pair - - ##The reads were mapped as single-end data, so should expect flags of + + ##The reads were mapped as single-end data, so should expect flags of ##0 (map to the '+' strand) or 16 (map to the '-' strand) - ##Output example: a paired-end read that aligns to the reverse strand + ##Output example: a paired-end read that aligns to the reverse strand ##and is the first mate in the pair will have flag 83 (= 64 + 16 + 2 + 1) - + if f1 & 0x4: f1 = f1 | 0x8 if f2 & 0x4: f2 = f2 | 0x8 - - if (not (f1 & 0x4) and not (f2 & 0x4)): + + if not (f1 & 0x4) and not (f2 & 0x4): ##The flag should now indicate this is paired-end data f1 = f1 | 0x1 f1 = f1 | 0x2 f2 = f2 | 0x1 - f2 = f2 | 0x2 - + f2 = f2 | 0x2 + ##Indicate if the pair is on the reverse strand if f1 & 0x10: f2 = f2 | 0x20 - + if f2 & 0x10: f1 = f1 | 0x20 - + ##Is this first or the second pair? f1 = f1 | 0x40 f2 = f2 | 0x80 - + ##Insert the modified bitwise flags into the reads read1.flag = f1 read2.flag = f2 - + ##Determine the RNEXT and PNEXT values (i.e. the positional values of a read's pair) - #RNEXT + # RNEXT if r1_chrom == r2_chrom: read1.next_reference_id = r1.reference_id read2.next_reference_id = r1.reference_id else: read1.next_reference_id = r2.reference_id read2.next_reference_id = r1.reference_id - #PNEXT + # PNEXT read1.next_reference_start = read2.reference_start read2.next_reference_start = read1.reference_start - return(read1, read2) - + return (read1, read2) if __name__ == "__main__": @@ -196,13 +196,13 @@ if __name__ == "__main__": tot_pairs_counter = 0 multi_pairs_counter = 0 uniq_pairs_counter = 0 - unmapped_pairs_counter = 0 + unmapped_pairs_counter = 0 lowq_pairs_counter = 0 multi_singles_counter = 0 uniq_singles_counter = 0 lowq_singles_counter = 0 - #local_counter = 0 + # local_counter = 0 paired_reads_counter = 0 singleton_counter = 0 reads_counter = 0 @@ -213,31 +213,31 @@ if __name__ == "__main__": ## Loop on all reads if verbose: print("## Merging forward and reverse tags ...") - - with pysam.Samfile(R1file, "rb") as hr1, pysam.Samfile(R2file, "rb") as hr2: + + with pysam.Samfile(R1file, "rb") as hr1, pysam.Samfile(R2file, "rb") as hr2: if output == "-": outfile = pysam.AlignmentFile(output, "w", template=hr1) else: outfile = pysam.AlignmentFile(output, "wb", template=hr1) - + for r1, r2 in zip(hr1.fetch(until_eof=True), hr2.fetch(until_eof=True)): - reads_counter +=1 - if (reads_counter % 1000000 == 0 and verbose): + reads_counter += 1 + if reads_counter % 1000000 == 0 and verbose: print("##", reads_counter) - + if get_read_name(r1) == get_read_name(r2): ## both unmapped if r1.is_unmapped == True and r2.is_unmapped == True: unmapped_pairs_counter += 1 continue - + ## both mapped elif r1.is_unmapped == False and r2.is_unmapped == False: ## quality if mapq != None and (r1.mapping_quality < int(mapq) or r2.mapping_quality < int(mapq)): lowq_pairs_counter += 1 continue - + ## Unique mapping if is_unique_bowtie2(r1) == True and is_unique_bowtie2(r2) == True: uniq_pairs_counter += 1 @@ -253,7 +253,7 @@ if __name__ == "__main__": continue if r1.is_unmapped == False: ## first end is mapped, second is not ## quality - if mapq != None and (r1.mapping_quality < int(mapq)): + if mapq != None and (r1.mapping_quality < int(mapq)): lowq_singles_counter += 1 continue ## Unique mapping @@ -265,7 +265,7 @@ if __name__ == "__main__": continue else: ## second end is mapped, first is not ## quality - if mapq != None and (r2.mapping_quality < int(mapq)): + if mapq != None and (r2.mapping_quality < int(mapq)): lowq_singles_counter += 1 continue ## Unique mapping @@ -276,34 +276,95 @@ if __name__ == "__main__": if report_multi == False: continue - tot_pairs_counter += 1 - (r1, r2) = sam_flag(r1,r2, hr1, hr2) + tot_pairs_counter += 1 + (r1, r2) = sam_flag(r1, r2, hr1, hr2) ## Write output outfile.write(r1) outfile.write(r2) - + else: - print("Forward and reverse reads not paired. Check that BAM files have the same read names and are sorted.") + print( + "Forward and reverse reads not paired. Check that BAM files have the same read names and are sorted." + ) sys.exit(1) if stat: - if output == '-': + if output == "-": statfile = "pairing.stat" else: - statfile = re.sub('\.bam$', '.pairstat', output) - with open(statfile, 'w') as handle_stat: - handle_stat.write("Total_pairs_processed\t" + str(reads_counter) + "\t" + str(round(float(reads_counter)/float(reads_counter)*100,3)) + "\n") - handle_stat.write("Unmapped_pairs\t" + str(unmapped_pairs_counter) + "\t" + str(round(float(unmapped_pairs_counter)/float(reads_counter)*100,3)) + "\n") - handle_stat.write("Low_qual_pairs\t" + str(lowq_pairs_counter) + "\t" + str(round(float(lowq_pairs_counter)/float(reads_counter)*100,3)) + "\n") - handle_stat.write("Unique_paired_alignments\t" + str(uniq_pairs_counter) + "\t" + str(round(float(uniq_pairs_counter)/float(reads_counter)*100,3)) + "\n") - handle_stat.write("Multiple_pairs_alignments\t" + str(multi_pairs_counter) + "\t" + str(round(float(multi_pairs_counter)/float(reads_counter)*100,3)) + "\n") - handle_stat.write("Pairs_with_singleton\t" + str(singleton_counter) + "\t" + str(round(float(singleton_counter)/float(reads_counter)*100,3)) + "\n") - handle_stat.write("Low_qual_singleton\t" + str(lowq_singles_counter) + "\t" + str(round(float(lowq_singles_counter)/float(reads_counter)*100,3)) + "\n") - handle_stat.write("Unique_singleton_alignments\t" + str(uniq_singles_counter) + "\t" + str(round(float(uniq_singles_counter)/float(reads_counter)*100,3)) + "\n") - handle_stat.write("Multiple_singleton_alignments\t" + str(multi_singles_counter) + "\t" + str(round(float(multi_singles_counter)/float(reads_counter)*100,3)) + "\n") - handle_stat.write("Reported_pairs\t" + str(tot_pairs_counter) + "\t" + str(round(float(tot_pairs_counter)/float(reads_counter)*100,3)) + "\n") + statfile = re.sub("\.bam$", ".pairstat", output) + with open(statfile, "w") as handle_stat: + handle_stat.write( + "Total_pairs_processed\t" + + str(reads_counter) + + "\t" + + str(round(float(reads_counter) / float(reads_counter) * 100, 3)) + + "\n" + ) + handle_stat.write( + "Unmapped_pairs\t" + + str(unmapped_pairs_counter) + + "\t" + + str(round(float(unmapped_pairs_counter) / float(reads_counter) * 100, 3)) + + "\n" + ) + handle_stat.write( + "Low_qual_pairs\t" + + str(lowq_pairs_counter) + + "\t" + + str(round(float(lowq_pairs_counter) / float(reads_counter) * 100, 3)) + + "\n" + ) + handle_stat.write( + "Unique_paired_alignments\t" + + str(uniq_pairs_counter) + + "\t" + + str(round(float(uniq_pairs_counter) / float(reads_counter) * 100, 3)) + + "\n" + ) + handle_stat.write( + "Multiple_pairs_alignments\t" + + str(multi_pairs_counter) + + "\t" + + str(round(float(multi_pairs_counter) / float(reads_counter) * 100, 3)) + + "\n" + ) + handle_stat.write( + "Pairs_with_singleton\t" + + str(singleton_counter) + + "\t" + + str(round(float(singleton_counter) / float(reads_counter) * 100, 3)) + + "\n" + ) + handle_stat.write( + "Low_qual_singleton\t" + + str(lowq_singles_counter) + + "\t" + + str(round(float(lowq_singles_counter) / float(reads_counter) * 100, 3)) + + "\n" + ) + handle_stat.write( + "Unique_singleton_alignments\t" + + str(uniq_singles_counter) + + "\t" + + str(round(float(uniq_singles_counter) / float(reads_counter) * 100, 3)) + + "\n" + ) + handle_stat.write( + "Multiple_singleton_alignments\t" + + str(multi_singles_counter) + + "\t" + + str(round(float(multi_singles_counter) / float(reads_counter) * 100, 3)) + + "\n" + ) + handle_stat.write( + "Reported_pairs\t" + + str(tot_pairs_counter) + + "\t" + + str(round(float(tot_pairs_counter) / float(reads_counter) * 100, 3)) + + "\n" + ) hr1.close() hr2.close() outfile.close() - diff --git a/bin/merge_statfiles.py b/bin/merge_statfiles.py index dc11bf7..c3986e1 100755 --- a/bin/merge_statfiles.py +++ b/bin/merge_statfiles.py @@ -1,7 +1,7 @@ #!/usr/bin/env python ## nf-core-hic -## Copyright (c) 2020 Institut Curie +## Copyright (c) 2020 Institut Curie ## Author(s): Nicolas Servant ## Contact: nicolas.servant@curie.fr ## This software is distributed without any guarantee under the terms of the BSD-3 licence. @@ -17,6 +17,7 @@ import glob import os from collections import OrderedDict + def num(s): try: return int(s) @@ -26,30 +27,30 @@ def num(s): if __name__ == "__main__": ## Read command line arguments - parser = argparse.ArgumentParser() - parser.add_argument("-f", "--files", help="List of input file(s)", type=str, nargs='+') - parser.add_argument("-v", "--verbose", help="verbose mode", action='store_true') + parser = argparse.ArgumentParser() + parser.add_argument("-f", "--files", help="List of input file(s)", type=str, nargs="+") + parser.add_argument("-v", "--verbose", help="verbose mode", action="store_true") args = parser.parse_args() - + infiles = args.files li = len(infiles) if li > 0: if args.verbose: print("## merge_statfiles.py") - print("## Merging "+ str(li)+" files") - + print("## Merging " + str(li) + " files") + ## Reading first file to get the template template = OrderedDict() if args.verbose: - print("## Use "+infiles[0]+" as template") + print("## Use " + infiles[0] + " as template") with open(infiles[0]) as f: for line in f: if not line.startswith("#"): lsp = line.strip().split("\t") - data = map(num, lsp[1:len(lsp)]) + data = map(num, lsp[1 : len(lsp)]) template[str(lsp[0])] = list(data) - + if len(template) == 0: print("Cannot find template files !") sys.exit(1) @@ -63,20 +64,21 @@ if __name__ == "__main__": if lsp[0] in template: for i in list(range(1, len(lsp))): if isinstance(num(lsp[i]), int): - template[lsp[0]][i-1] += num(lsp[i]) + template[lsp[0]][i - 1] += num(lsp[i]) else: - template[lsp[0]][i-1] = round((template[lsp[0]][i-1] + num(lsp[i]))/2,3) + template[lsp[0]][i - 1] = round((template[lsp[0]][i - 1] + num(lsp[i])) / 2, 3) else: - sys.stderr.write("Warning : '"+lsp[0]+"' not found in template ["+infiles[fidx]+"]\n") - + sys.stderr.write( + "Warning : '" + lsp[0] + "' not found in template [" + infiles[fidx] + "]\n" + ) + ## Print template for x in template: sys.stdout.write(x) for y in template[x]: - sys.stdout.write("\t"+str(y)) + sys.stdout.write("\t" + str(y)) sys.stdout.write("\n") else: print("No files to merge - stop") sys.exit(1) - -- GitLab