diff --git a/bin/build_matrix b/bin/build_matrix new file mode 100755 index 0000000000000000000000000000000000000000..c61c6176c46edf71a6be8dcd3d090c0c1a0b9c4a Binary files /dev/null and b/bin/build_matrix differ diff --git a/bin/cutsite_trimming b/bin/cutsite_trimming new file mode 100755 index 0000000000000000000000000000000000000000..aef62c5802acbb650dc7f60040cfde09f1e9d57f Binary files /dev/null and b/bin/cutsite_trimming differ diff --git a/bin/mapped_2hic_fragments.py b/bin/mapped_2hic_fragments.py new file mode 100755 index 0000000000000000000000000000000000000000..391a58b815b41f5d5cc6b32fea41e0db33e35c91 --- /dev/null +++ b/bin/mapped_2hic_fragments.py @@ -0,0 +1,837 @@ +#!/usr/bin/env python + +# HiC-Pro +# Copyleft 2015 Institut Curie +# Author(s): Nicolas Servant, Eric Viara +# Contact: nicolas.servant@curie.fr +# This software is distributed without any guarantee under the terms of the +# GNU General +# Public License, either Version 2, June 1991 or Version 3, June 2007. + +""" +Script to keep only valid 3C products - DE and SC are removed +Output is : readname / +""" + +import time +import getopt +import sys +import os +import re +import pysam +from bx.intervals.intersection import Intersecter, Interval + + +def usage(): + """Usage function""" + print "Usage : python mapped_2hic_fragments.py" + print "-f/--fragmentFile <Restriction fragment file GFF3>" + print "-r/--mappedReadsFile <BAM/SAM file of mapped reads>" + print "[-o/--outputDir] <Output directory. Default is current directory>" + print "[-s/--shortestInsertSize] <Shortest insert size of mapped reads to consider>" + print "[-l/--longestInsertSize] <Longest insert size of mapped reads to consider>" + print "[-t/--shortestFragmentLength] <Shortest restriction fragment length to consider>" + print "[-m/--longestFragmentLength] <Longest restriction fragment length to consider>" + print "[-d/--minCisDist] <Minimum distance between intrachromosomal contact to consider>" + print "[-g/--gtag] <Genotype tag. 
If specified, this tag will be reported in the valid pairs output for allele specific classification>"
+    print "[-a/--all] <Write all additional output files, with information about the discarded reads (self-circle, dangling end, etc.)>"
+    print "[-S/--sam] <Output an additional SAM file with flag 'CT' for pairs classification>"
+    print "[-v/--verbose] <Verbose>"
+    print "[-h/--help] <Help>"
+    return
+
+
+def get_args():
+    """Get argument"""
+    try:
+        opts, args = getopt.getopt(
+            sys.argv[1:],
+            "f:r:o:s:l:t:m:d:g:Svah",
+            ["fragmentFile=",
+             "mappedReadsFile=",
+             "outputDir=",
+             "shortestInsertSize=", "longestInsertSize=",
+             "shortestFragmentLength=", "longestFragmentLength=",
+             "minCisDist=",
+             "gtag=", "sam", "verbose", "all", "help"])
+    except getopt.GetoptError:
+        usage()
+        sys.exit(-1)
+    return opts
+
+
+def timing(function, *args):
+    """
+    Run a function and return the run time and the result of the function
+    If the function requires arguments, those can be passed in
+    """
+    startTime = time.time()
+    result = function(*args)
+    print '%s function took %0.3f ms' % (function.func_name, (time.time() - startTime) * 1000)
+    return result
+
+
+def get_read_strand(read):
+    """
+    Conversion of read position to naive strand representation
+
+    Parameters
+    ----------
+    read : [AlignedRead]
+        the aligned read
+    """
+    strand = "+"
+    if read.is_reverse:
+        strand = "-"
+    return strand
+
+
+def isIntraChrom(read1, read2):
+    """
+    Return True if the read pair is intrachromosomal
+
+    read1 : [AlignedRead]
+    read2 : [AlignedRead]
+
+    """
+    if read1.tid == read2.tid:
+        return True
+    else:
+        return False
+
+
+def get_cis_dist(read1, read2):
+    """
+    Calculate the contact distance between two intrachromosomal reads
+
+    read1 : [AlignedRead]
+    read2 : [AlignedRead]
+
+    """
+    # Get oriented reads
+    ##r1, r2 = get_ordered_reads(read1, read2)
+    dist = None
+    if not read1.is_unmapped and not read2.is_unmapped:
+        ## Contact distances can be calculated for intrachromosomal reads only
+        if isIntraChrom(read1, read2):
+            r1pos = get_read_pos(read1)
+            r2pos = get_read_pos(read2)
+            dist = abs(r1pos - r2pos)
+    return dist
+
+
+def get_read_pos(read, st="start"):
+    """
+    Return the read position (zero-based) used for the intersection with
+    the restriction fragment
+
+    The 5' end is not a good choice for the reverse reads (which contain part
+    of the restriction site, and thus overlap the next restriction fragment).
+    Using the left-most position (i.e. start, 5' for forward, 3' for reverse) or
+    the middle of the read should work, but the middle of the read is probably
+    safer.
+
+    Parameters
+    ----------
+    read : [AlignedRead]
+        the aligned read
+    """
+
+    if st == "middle":
+        pos = read.pos + int(read.alen/2)
+    elif st == "start":
+        pos = get_read_start(read)
+    elif st == "left":
+        pos = read.pos
+
+    return pos
+
+
+def get_read_start(read):
+    """
+    Return the 5' end of the read
+    """
+    if read.is_reverse:
+        pos = read.pos + read.alen - 1
+    else:
+        pos = read.pos
+    return pos
+
+def get_ordered_reads(read1, read2):
+    """
+    Reorient reads
+
+    The sequencing is usually not oriented. Reorient the reads so that r1 is
+    always before r2.
+ Sequencing is always performed from 5' to 3' end + So in unstranded case, we can have + + 1 2 + ---> ---> + ========== or ========= + <---- <--- + 2 1 + + Reordering the reads allow to always be in the first case + read1 = [AlignedRead] + read2 = [AlignedRead] + """ + if read1.tid == read2.tid: + if get_read_pos(read1) < get_read_pos(read2): + r1 = read1 + r2 = read2 + else: + r1 = read2 + r2 = read1 + else: + if read1.tid < read2.tid: + r1 = read1 + r2 = read2 + else: + r1 = read2 + r2 = read1 + + return r1, r2 + +def load_restriction_fragment(in_file, minfragsize=None, maxfragsize=None, verbose=False): + """ + Read a BED file and store the intervals in a tree + + Intervals are zero-based objects. The output object is a hash table with + one search tree per chromosome + + in_file = input file [character] + verbose = verbose mode [logical] + + """ + resFrag = {} + if verbose: + print "## Loading Restriction File Intervals '", in_file, "'..." + + bed_handle = open(in_file) + nline = 0 + nfilt = 0 + for line in bed_handle: + nline +=1 + bedtab = line.split("\t") + try: + chromosome, start, end, name = bedtab[:4] + except ValueError: + print "Warning : wrong input format in line", nline,". Not a BED file !?" + continue + + # BED files are zero-based as Intervals objects + start = int(start) # + 1 + end = int(end) + fragl = abs(end - start) + name = name.strip() + + ## Discard fragments outside the size range + filt=False + if minfragsize != None and int(fragl) < int(minfragsize): + nfilt+=1 + filt=True + elif maxfragsize != None and int(fragl) > int(maxfragsize): + nfilt+=1 + filt=True + + if chromosome in resFrag: + tree = resFrag[chromosome] + tree.add_interval(Interval(start, end, value={'name': name, 'filter': filt})) + else: + tree = Intersecter() + tree.add_interval(Interval(start, end, value={'name': name, 'filter': filt})) + resFrag[chromosome] = tree + + if nfilt > 0: + print "Warning : ", nfilt ,"fragment(s) outside of range and discarded. ", nline - nfilt, " remaining." 
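+    # The returned object maps each chromosome name to an Intersecter tree of
+    # fragment Intervals; fragments outside the requested size range are kept
+    # in the tree but flagged with 'filter': True so that the matching read
+    # pairs can later be reported as FILT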
+ + bed_handle.close() + return resFrag + + +def get_overlapping_restriction_fragment(resFrag, chrom, read): + """ + Intersect a given read with the set of restriction fragments + + ## + resFrag = the restriction fragments [hash] + chrom = the chromosome to look at [character] + read = the read to intersect [AlignedRead] + + """ + # Get read position (middle or 5' end) + pos = get_read_pos(read, st="middle") + + if chrom in resFrag: + # Overlap with the position of the read (zero-based) + resfrag = resFrag[chrom].find(pos, pos+1) + if len(resfrag) > 1: + print "Warning : ", len(resfrag), " restriction fragments found for ", read.qname, "- skipped" + return None + elif len(resfrag) == 0: + print "Warning - no restriction fragments for ", read.qname ," at ", chrom, ":", pos + return None + else: + return resfrag[0] + else: + print "Warning - no restriction fragments for ", read.qname," at ", chrom, ":", pos + return None + + +def are_contiguous_fragments(frag1, frag2, chr1, chr2): + ''' + Compare fragment positions to check if they are contiguous + ''' + ret = False + if chr1 == chr2: + if int(frag1.start) < int(frag2.start): + d = int(frag2.start) - int(frag1.end) + else: + d = int(frag1.start) - int(frag2.end) + + if d == 0: + ret = True + + return ret + +def is_religation(read1, read2, frag1, frag2): + """ + Reads are expected to map adjacent fragments + Check the orientation of reads -><- + + """ + ret=False + if are_contiguous_fragments(frag1, frag2, read1.tid, read2.tid): + #r1, r2 = get_ordered_reads(read1, read2) + #if get_read_strand(r1) == "+" and get_read_strand(r2) == "-": + ret=True + return ret + + +def is_self_circle(read1, read2): + """ + Both reads are expected to be on the same restriction fragments + Check the orientation of reads <--> + + read1 : [AlignedRead] + read2 : [AlignedRead] + """ + ret = False + # Get oriented reads + r1, r2 = get_ordered_reads(read1, read2) + # 1<- ->2 or 2<- ->1 + if get_read_strand(r1) == "-" and get_read_strand(r2) == "+": + ret = True + return ret + + +def is_dangling_end(read1, read2): + """ + Both reads are expected to be on the same restriction fragments + Check the orientation of reads -><- + + read1 : [AlignedRead] + read2 : [AlignedRead] + """ + ret = False + # Get oriented reads + r1, r2 = get_ordered_reads(read1, read2) + # 1-> <-2 or 2-> <-1 + if get_read_strand(r1) == "+" and get_read_strand(r2) == "-": + ret = True + return ret + + +def get_valid_orientation(read1, read2): + """ + Both reads are expected to be on the different restriction fragments + Check the orientation of reads ->-> / <-<- / -><- / <--> + + read1 : [AlignedRead] + read2 : [AlignedRead] + + """ + # Get oriented reads + r1, r2 = get_ordered_reads(read1, read2) + + direction = None + if get_read_strand(r1) == "+" and get_read_strand(r2) == "+": + direction = "FF" + elif get_read_strand(r1) == "-" and get_read_strand(r2) == "-": + direction = "RR" + elif get_read_strand(r1) == "+" and get_read_strand(r2) == "-": + direction = "FR" + elif get_read_strand(r1) == "-" and get_read_strand(r2) == "+": + direction = "RF" + + return direction + + +def get_PE_fragment_size(read1, read2, resFrag1, resFrag2, interactionType): + """ + Calculte the size of the DNA fragment library + + read1 : [AlignedRead] + read2 : [AlignedRead] + resfrag1 = restrictin fragment overlapping the R1 read [interval] + resfrag1 = restrictin fragment overlapping the R1 read [interval] + interactionType : Type of interaction from get_interaction_type() [str] + + """ + + fragmentsize = None + + # Get 
oriented reads
+    r1, r2 = get_ordered_reads(read1, read2)
+    if not r1.is_unmapped and not r2.is_unmapped:
+        if r1 == read2:
+            rfrag1 = resFrag2
+            rfrag2 = resFrag1
+        else:
+            rfrag1 = resFrag1
+            rfrag2 = resFrag2
+
+        ## In this case use the read start !
+        r1pos = get_read_start(r1)
+        r2pos = get_read_start(r2)
+
+        if interactionType == "DE" or interactionType == "RE":
+            fragmentsize = r2pos - r1pos
+        elif interactionType == "SC":
+            fragmentsize = (r1pos - rfrag1.start) + (rfrag2.end - r2pos)
+        elif interactionType == "VI":
+            if get_read_strand(r1) == "+":
+                dr1 = rfrag1.end - r1pos
+            else:
+                dr1 = r1pos - rfrag1.start
+            if get_read_strand(r2) == "+":
+                dr2 = rfrag2.end - r2pos
+            else:
+                dr2 = r2pos - rfrag2.start
+            fragmentsize = dr2 + dr1
+
+    return fragmentsize
+
+
+def get_interaction_type(read1, read1_chrom, resfrag1, read2,
+                         read2_chrom, resfrag2, verbose):
+    """
+    Returns the interaction type
+
+    For a given read pair and their related restriction fragments, classify
+    the 3C products as:
+
+    - Interaction
+    - Self circle
+    - Dangling end
+    - Religation
+    - Unknown
+
+    ##
+    read1 = the R1 read of the pair [AlignedRead]
+    read1_chrom = the chromosome of R1 read [character]
+    resfrag1 = restriction fragment overlapping the R1 read [interval]
+    read2 = the R2 read of the pair [AlignedRead]
+    read2_chrom = the chromosome of R2 read [character]
+    resfrag2 = restriction fragment overlapping the R2 read [interval]
+    verbose = verbose mode [logical]
+
+    """
+
+    # If returned InteractionType=None -> Same restriction fragment
+    # and same strand = Dump
+    interactionType = None
+
+    if not read1.is_unmapped and not read2.is_unmapped and resfrag1 is not None and resfrag2 is not None:
+        # same restriction fragment
+        if resfrag1 == resfrag2:
+            # Self_circle <- ->
+            if is_self_circle(read1, read2):
+                interactionType = "SC"
+            # Dangling_end -> <-
+            elif is_dangling_end(read1, read2):
+                interactionType = "DE"
+        elif is_religation(read1, read2, resfrag1, resfrag2):
+            interactionType = "RE"
+        else:
+            interactionType = "VI"
+    elif read1.is_unmapped or read2.is_unmapped:
+        interactionType = "SI"
+
+    return interactionType
+
+
+def get_read_tag(read, tag):
+    for t in read.tags:
+        if t[0] == tag:
+            return t[1]
+    return None
+
+
+if __name__ == "__main__":
+    # Read command line arguments
+    opts = get_args()
+    samOut = False
+    verbose = False
+    allOutput = False
+    minInsertSize = None
+    maxInsertSize = None
+    minFragSize = None
+    maxFragSize = None
+    minDist = None
+    outputDir = "."
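+    # genotype tag used for allele-specific classification; when set, it is
+    # reported as the last field of the valid pairs output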
+ gtag = None + + if len(opts) == 0: + usage() + sys.exit() + + for opt, arg in opts: + if opt in ("-h", "--help"): + usage() + sys.exit() + elif opt in ("-f", "--fragmentFile"): + fragmentFile = arg + elif opt in ("-r", "--mappedReadsFile"): + mappedReadsFile = arg + elif opt in ("-o", "--outputDir"): + outputDir = arg + elif opt in ("-s", "--shortestInsertSize"): + minInsertSize = arg + elif opt in ("-l", "--longestInsertSize"): + maxInsertSize = arg + elif opt in ("-t", "--shortestFragmentLength"): + minFragSize = arg + elif opt in ("-m", "--longestFragmentLength"): + maxFragSize = arg + elif opt in ("-d", "--minCisDist"): + minDist = arg + elif opt in ("-g", "--gtag"): + gtag = arg + elif opt in ("-a", "--all"): + allOutput = True + elif opt in ("-S", "--sam"): + samOut = True + elif opt in ("-v", "--verbose"): + verbose = True + else: + assert False, "unhandled option" + + # Verbose mode + if verbose: + print "## overlapMapped2HiCFragments.py" + print "## mappedReadsFile=", mappedReadsFile + print "## fragmentFile=", fragmentFile + print "## minInsertSize=", minInsertSize + print "## maxInsertSize=", maxInsertSize + print "## minFragSize=", minFragSize + print "## maxFragSize=", maxFragSize + print "## allOuput=", allOutput + print "## SAM ouput=", samOut + print "## verbose=", verbose, "\n" + + # Initialize variables + reads_counter = 0 + de_counter = 0 + re_counter = 0 + sc_counter = 0 + valid_counter = 0 + valid_counter_FF = 0 + valid_counter_RR = 0 + valid_counter_FR = 0 + valid_counter_RF = 0 + single_counter = 0 + dump_counter = 0 + filt_counter = 0 + + ## AS counter + G1G1_ascounter = 0 + G2G2_ascounter = 0 + G1U_ascounter = 0 + UG1_ascounter = 0 + G2U_ascounter = 0 + UG2_ascounter = 0 + G1G2_ascounter = 0 + G2G1_ascounter = 0 + UU_ascounter = 0 + CF_ascounter = 0 + + baseReadsFile = os.path.basename(mappedReadsFile) + baseReadsFile = re.sub(r'\.bam$|\.sam$', '', baseReadsFile) + + # Open handlers for output files + handle_valid = open(outputDir + '/' + baseReadsFile + '.validPairs', 'w') + + if allOutput: + handle_de = open(outputDir + '/' + baseReadsFile + '.DEPairs', 'w') + handle_re = open(outputDir + '/' + baseReadsFile + '.REPairs', 'w') + handle_sc = open(outputDir + '/' + baseReadsFile + '.SCPairs', 'w') + handle_dump = open(outputDir + '/' + baseReadsFile + '.DumpPairs', 'w') + handle_single = open(outputDir + '/' + baseReadsFile + '.SinglePairs', 'w') + handle_filt = open(outputDir + '/' + baseReadsFile + '.FiltPairs', 'w') + + # Read the BED file + resFrag = timing(load_restriction_fragment, fragmentFile, minFragSize, maxFragSize, verbose) + + # Read the SAM/BAM file + if verbose: + print "## Opening SAM/BAM file '", mappedReadsFile, "'..." + samfile = pysam.Samfile(mappedReadsFile, "rb") + + if samOut: + handle_sam = pysam.AlignmentFile(outputDir + '/' + baseReadsFile + '_interaction.bam', "wb", template=samfile) + + # Reads are 0-based too (for both SAM and BAM format) + # Loop on all reads + if verbose: + print "## Classifying Interactions ..." 
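+    # Note: the input BAM is expected to contain both mates of each pair on
+    # consecutive records (read1 immediately followed by read2), as produced
+    # by the pairing step, so that the (r1, r2) couple below is complete when
+    # a pair is classified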
+ + for read in samfile.fetch(until_eof=True): + reads_counter += 1 + cur_handler = None + htag = "" + + # First mate + if read.is_read1: + r1 = read + if not r1.is_unmapped: + r1_chrom = samfile.getrname(r1.tid) + r1_resfrag = get_overlapping_restriction_fragment(resFrag, r1_chrom, r1) + else: + r1_resfrag = None + r1_chrom = None + + # Second mate + elif read.is_read2: + r2 = read + if not r2.is_unmapped: + r2_chrom = samfile.getrname(r2.tid) + r2_resfrag = get_overlapping_restriction_fragment(resFrag, r2_chrom, r2) + else: + r2_resfrag = None + r2_chrom = None + + if r1_resfrag is not None or r2_resfrag is not None: + interactionType = get_interaction_type(r1, r1_chrom, r1_resfrag, r2, r2_chrom, r2_resfrag, verbose) + dist = get_PE_fragment_size(r1, r2, r1_resfrag, r2_resfrag, interactionType) + cdist = get_cis_dist(r1, r2) + + ## Filter based on restriction fragments + if (r1_resfrag is not None and r1_resfrag.value['filter'] == True) or (r2_resfrag is not None and r2_resfrag.value['filter']) == True: + interactionType = "FILT" + + # Check Insert size criteria - FILT + if (minInsertSize is not None and dist is not None and + dist < int(minInsertSize)) or \ + (maxInsertSize is not None and dist is not None and dist > int(maxInsertSize)): + interactionType = "FILT" + + # Check Distance criteria - FILT + # Done for VI otherwise this criteria will overwrite all other invalid classification + if (interactionType == "VI" and minDist is not None and cdist is not None and cdist < int(minDist)): + interactionType = "FILT" + + if interactionType == "VI": + valid_counter += 1 + cur_handler = handle_valid + validType = get_valid_orientation(r1, r2) + if validType == "RR": + valid_counter_RR += 1 + elif validType == "FF": + valid_counter_FF += 1 + elif validType == "FR": + valid_counter_FR += 1 + elif validType == "RF": + valid_counter_RF += 1 + + ## Counts valid pairs based on XA tag + if gtag is not None: + r1as = get_read_tag(r1, gtag) + r2as = get_read_tag(r2, gtag) + if r1as == 1 and r2as == 1: + G1G1_ascounter += 1 + elif r1as == 2 and r2as == 2: + G2G2_ascounter += 1 + elif r1as == 1 and r2as == 0: + G1U_ascounter += 1 + elif r1as == 0 and r2as == 1: + UG1_ascounter += 1 + elif r1as == 2 and r2as == 0: + G2U_ascounter += 1 + elif r1as == 0 and r2as == 2: + UG2_ascounter += 1 + elif r1as == 1 and r2as == 2: + G1G2_ascounter += 1 + elif r1as == 2 and r2as == 1: + G2G1_ascounter += 1 + elif r1as == 3 or r2as == 3: + CF_ascounter += 1 + else: + UU_ascounter += 1 + + elif interactionType == "DE": + de_counter += 1 + cur_handler = handle_de if allOutput else None + + elif interactionType == "RE": + re_counter += 1 + cur_handler = handle_re if allOutput else None + + elif interactionType == "SC": + sc_counter += 1 + cur_handler = handle_sc if allOutput else None + + elif interactionType == "SI": + single_counter += 1 + cur_handler = handle_single if allOutput else None + + elif interactionType == "FILT": + filt_counter += 1 + cur_handler = handle_filt if allOutput else None + + else: + interactionType = "DUMP" + dump_counter += 1 + cur_handler = handle_dump if allOutput else None + else: + interactionType = "DUMP" + dump_counter += 1 + cur_handler = handle_dump if allOutput else None + dist = None + + ## Write results in right handler + if cur_handler is not None: + if not r1.is_unmapped and not r2.is_unmapped: + ##reorient reads to ease duplicates removal + or1, or2 = get_ordered_reads(r1, r2) + or1_chrom = samfile.getrname(or1.tid) + or2_chrom = samfile.getrname(or2.tid) + + ##reset as tag now 
that the reads are oriented + r1as = get_read_tag(or1, gtag) + r2as = get_read_tag(or2, gtag) + if gtag is not None: + htag = str(r1as)+"-"+str(r2as) + + ##get fragment name and reorient if necessary + if or1 == r1 and or2 == r2: + or1_resfrag = r1_resfrag + or2_resfrag = r2_resfrag + elif or1 == r2 and or2 == r1: + or1_resfrag = r2_resfrag + or2_resfrag = r1_resfrag + + if or1_resfrag is not None: + or1_fragname = or1_resfrag.value['name'] + + if or2_resfrag is not None: + or2_fragname = or2_resfrag.value['name'] + + cur_handler.write( + or1.qname + "\t" + + or1_chrom + "\t" + + str(get_read_pos(or1)+1) + "\t" + + str(get_read_strand(or1)) + "\t" + + or2_chrom + "\t" + + str(get_read_pos(or2)+1) + "\t" + + str(get_read_strand(or2)) + "\t" + + str(dist) + "\t" + + or1_fragname + "\t" + + or2_fragname + "\t" + + str(or1.mapping_quality) + "\t" + + str(or2.mapping_quality) + "\t" + + str(htag) + "\n") + + elif r2.is_unmapped and not r1.is_unmapped: + if r1_resfrag is not None: + r1_fragname = r1_resfrag.value['name'] + + cur_handler.write( + r1.qname + "\t" + + r1_chrom + "\t" + + str(get_read_pos(r1)+1) + "\t" + + str(get_read_strand(r1)) + "\t" + + "*" + "\t" + + "*" + "\t" + + "*" + "\t" + + "*" + "\t" + + r1_fragname + "\t" + + "*" + "\t" + + str(r1.mapping_quality) + "\t" + + "*" + "\n") + elif r1.is_unmapped and not r2.is_unmapped: + if r2_resfrag is not None: + r2_fragname = r2_resfrag.value['name'] + + cur_handler.write( + r2.qname + "\t" + + "*" + "\t" + + "*" + "\t" + + "*" + "\t" + + r2_chrom + "\t" + + str(get_read_pos(r2)+1) + "\t" + + str(get_read_strand(r2)) + "\t" + + "*" + "\t" + + "*" + "\t" + + r2_fragname + "\t" + + "*" + "\t" + + str(r2.mapping_quality) + "\n") + + ## Keep initial order + if samOut: + r1.tags = r1.tags + [('CT', str(interactionType))] + r2.tags = r2.tags + [('CT', str(interactionType))] + handle_sam.write(r1) + handle_sam.write(r2) + + if (reads_counter % 100000 == 0 and verbose): + print "##", reads_counter + + # Close handler + handle_valid.close() + if allOutput: + handle_de.close() + handle_re.close() + handle_sc.close() + handle_dump.close() + handle_single.close() + handle_filt.close() + + + # Write stats file + handle_stat = open(outputDir + '/' + baseReadsFile + '.RSstat', 'w') + handle_stat.write("## Hi-C processing\n") + handle_stat.write("Valid_interaction_pairs\t" + str(valid_counter) + "\n") + handle_stat.write( + "Valid_interaction_pairs_FF\t" + str(valid_counter_FF) + "\n") + handle_stat.write( + "Valid_interaction_pairs_RR\t" + str(valid_counter_RR) + "\n") + handle_stat.write( + "Valid_interaction_pairs_RF\t" + str(valid_counter_RF) + "\n") + handle_stat.write( + "Valid_interaction_pairs_FR\t" + str(valid_counter_FR) + "\n") + handle_stat.write("Dangling_end_pairs\t" + str(de_counter) + "\n") + handle_stat.write("Religation_pairs\t" + str(re_counter) + "\n") + handle_stat.write("Self_Cycle_pairs\t" + str(sc_counter) + "\n") + handle_stat.write("Single-end_pairs\t" + str(single_counter) + "\n") + handle_stat.write("Filtered_pairs\t" + str(filt_counter) + "\n") + handle_stat.write("Dumped_pairs\t" + str(dump_counter) + "\n") + + ## Write AS report + if gtag is not None: + handle_stat.write("## ======================================\n") + handle_stat.write("## Allele specific information\n") + handle_stat.write("Valid_pairs_from_ref_genome_(1-1)\t" + str(G1G1_ascounter) + "\n") + handle_stat.write("Valid_pairs_from_ref_genome_with_one_unassigned_mate_(0-1/1-0)\t" + str(UG1_ascounter+G1U_ascounter) + "\n") + 
handle_stat.write("Valid_pairs_from_alt_genome_(2-2)\t" + str(G2G2_ascounter) + "\n") + handle_stat.write("Valid_pairs_from_alt_genome_with_one_unassigned_mate_(0-2/2-0)\t" + str(UG2_ascounter+G2U_ascounter) + "\n") + handle_stat.write("Valid_pairs_from_alt_and_ref_genome_(1-2/2-1)\t" + str(G1G2_ascounter+G2G1_ascounter) + "\n") + handle_stat.write("Valid_pairs_with_both_unassigned_mated_(0-0)\t" + str(UU_ascounter) + "\n") + handle_stat.write("Valid_pairs_with_at_least_one_conflicting_mate_(3-)\t" + str(CF_ascounter) + "\n") + + handle_stat.close() + + if samOut: + samfile.close() + diff --git a/bin/mergeSAM.py b/bin/mergeSAM.py new file mode 100755 index 0000000000000000000000000000000000000000..fdf0c67dfc24f161266c48506bdfda6b3eb7c899 --- /dev/null +++ b/bin/mergeSAM.py @@ -0,0 +1,326 @@ +#!/usr/bin/env python + +## HiC-Pro +## Copyright (c) 2015 Institut Curie +## Author(s): Nicolas Servant, Eric Viara +## Contact: nicolas.servant@curie.fr +## This software is distributed without any guarantee under the terms of the BSD-3 licence. +## See the LICENCE file for details + + +""" +Script to pair 2 SAM/BAM files into one PE BAM +- On 03/05/16 Ferhat made changes starting from ~/bin/HiC-Pro_2.7.2b/scripts/mergeSAM.py +to make singletons possible to be reported +""" + +import getopt +import sys +import os +import re +import pysam +from itertools import izip + +def usage(): + """Usage function""" + print "Usage : python mergeSAM.py" + print "-f/--forward <forward read mapped file>" + print "-r/--reverse <reverse read mapped file>" + print "[-o/--output] <Output file. Default is stdin>" + print "[-s/--single] <report singleton>" + print "[-m/--multi] <report multiple hits>" + print "[-q/--qual] <minimum reads mapping quality>" + print "[-t/--stat] <generate a stat file>" + print "[-v/--verbose] <Verbose>" + print "[-h/--help] <Help>" + return + + +def get_args(): + """Get argument""" + try: + opts, args = getopt.getopt( + sys.argv[1:], + "f:r:o:q:smtvh", + ["forward=", + "reverse=", + "output=", "qual=", + "single", "multi", "stat", "verbose", "help"]) + except getopt.GetoptError: + usage() + sys.exit(-1) + return opts + + +def is_unique_bowtie2(read): + ret = False + if not read.is_unmapped and read.has_tag('AS'): + if read.has_tag('XS'): + primary = read.get_tag('AS') + secondary = read.get_tag('XS') + if (primary > secondary): + ret = True + else: + ret = True + + return ret + +## Remove everything after "/" or " " in read's name +def get_read_name(read): + name = read.qname + #return name.split("/",1)[0] + return re.split('/| ', name)[0] + +def sam_flag(read1, read2, hr1, hr2): + + f1 = read1.flag + f2 = read2.flag + + if r1.is_unmapped == False: + r1_chrom = hr1.getrname(r1.tid) + else: + r1_chrom="*" + if r2.is_unmapped == False: + r2_chrom = hr2.getrname(r2.tid) + else: + r2_chrom="*" + + + ##Relevant bitwise flags (flag in an 11-bit binary number) + ##1 The read is one of a pair + ##2 The alignment is one end of a proper paired-end alignment + ##4 The read has no reported alignments + ##8 The read is one of a pair and has no reported alignments + ##16 The alignment is to the reverse reference strand + ##32 The other mate in the paired-end alignment is aligned to the reverse reference strand + ##64 The read is the first (#1) mate in a pair + ##128 The read is the second (#2) mate in a pair + + ##The reads were mapped as single-end data, so should expect flags of + ##0 (map to the '+' strand) or 16 (map to the '-' strand) + ##Output example: a paired-end read that aligns to the reverse 
strand + ##and is the first mate in the pair will have flag 83 (= 64 + 16 + 2 + 1) + + if f1 & 0x4: + f1 = f1 | 0x8 + + if f2 & 0x4: + f2 = f2 | 0x8 + + if (not (f1 & 0x4) and not (f2 & 0x4)): + ##The flag should now indicate this is paired-end data + f1 = f1 | 0x1 + f1 = f1 | 0x2 + f2 = f2 | 0x1 + f2 = f2 | 0x2 + + + ##Indicate if the pair is on the reverse strand + if f1 & 0x10: + f2 = f2 | 0x20 + + if f2 & 0x10: + f1 = f1 | 0x20 + + ##Is this first or the second pair? + f1 = f1 | 0x40 + f2 = f2 | 0x80 + + ##Insert the modified bitwise flags into the reads + read1.flag = f1 + read2.flag = f2 + + ##Determine the RNEXT and PNEXT values (i.e. the positional values of a read's pair) + #RNEXT + if r1_chrom == r2_chrom: + read1.rnext = r1.tid + read2.rnext = r1.tid + else: + read1.rnext = r2.tid + read2.rnext = r1.tid + + #PNEXT + read1.pnext = read2.pos + read2.pnext = read1.pos + + return(read1, read2) + + + +if __name__ == "__main__": + ## Read command line arguments + opts = get_args() + inputFile = None + outputFile = None + mapq = None + report_single = False + report_multi = False + verbose = False + stat = False + output = "-" + + if len(opts) == 0: + usage() + sys.exit() + + for opt, arg in opts: + if opt in ("-h", "--help"): + usage() + sys.exit() + elif opt in ("-f", "--forward"): + R1file = arg + elif opt in ("-r", "--reverse"): + R2file = arg + elif opt in ("-o", "--output"): + output = arg + elif opt in ("-q", "--qual"): + mapq = arg + elif opt in ("-s", "--single"): + report_single = True + elif opt in ("-m", "--multi"): + report_multi = True + elif opt in ("-t", "--stat"): + stat = True + elif opt in ("-v", "--verbose"): + verbose = True + else: + assert False, "unhandled option" + + ## Verbose mode + if verbose: + print "## mergeBAM.py" + print "## forward=", R1file + print "## reverse=", R2file + print "## output=", output + print "## min mapq=", mapq + print "## report_single=", report_single + print "## report_multi=", report_multi + print "## verbose=", verbose + + ## Initialize variables + tot_pairs_counter = 0 + multi_pairs_counter = 0 + uniq_pairs_counter = 0 + unmapped_pairs_counter = 0 + lowq_pairs_counter = 0 + multi_singles_counter = 0 + uniq_singles_counter = 0 + lowq_singles_counter = 0 + + #local_counter = 0 + paired_reads_counter = 0 + singleton_counter = 0 + reads_counter = 0 + r1 = None + r2 = None + + ## Reads are 0-based too (for both SAM and BAM format) + ## Loop on all reads + if verbose: + print "## Merging forward and reverse tags ..." 
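+    ## Note: the forward and reverse BAM files are traversed in parallel (izip),
+    ## so they must contain the same reads in the same order; the read-name
+    ## check below exits with an error if the two files get out of sync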
+ + with pysam.Samfile(R1file, "rb") as hr1, pysam.Samfile(R2file, "rb") as hr2: + if output == "-": + outfile = pysam.AlignmentFile(output, "w", template=hr1) + else: + outfile = pysam.AlignmentFile(output, "wb", template=hr1) + for r1, r2 in izip(hr1.fetch(until_eof=True), hr2.fetch(until_eof=True)): + reads_counter +=1 + + #print r1 + #print r2 + #print hr1.getrname(r1.tid) + #print hr2.getrname(r2.tid) + + if (reads_counter % 1000000 == 0 and verbose): + print "##", reads_counter + + if get_read_name(r1) == get_read_name(r2): + + ## both unmapped + if r1.is_unmapped == True and r2.is_unmapped == True: + unmapped_pairs_counter += 1 + continue + + ## both mapped + elif r1.is_unmapped == False and r2.is_unmapped == False: + ## quality + if mapq != None and (r1.mapping_quality < int(mapq) or r2.mapping_quality < int(mapq)): + lowq_pairs_counter += 1 + continue + + ## Unique mapping + if is_unique_bowtie2(r1) == True and is_unique_bowtie2(r2) == True: + uniq_pairs_counter += 1 + else: + multi_pairs_counter += 1 + if report_multi == False: + continue + # one end mapped, other is not + else: + singleton_counter += 1 + if report_single == False: + continue + if r1.is_unmapped == False: ## first end is mapped, second is not + ## quality + if mapq != None and (r1.mapping_quality < int(mapq)): + lowq_singles_counter += 1 + continue + ## Unique mapping + if is_unique_bowtie2(r1) == True: + uniq_singles_counter += 1 + else: + multi_singles_counter += 1 + if report_multi == False: + continue + else: ## second end is mapped, first is not + ## quality + if mapq != None and (r2.mapping_quality < int(mapq)): + lowq_singles_counter += 1 + continue + ## Unique mapping + if is_unique_bowtie2(r2) == True: + uniq_singles_counter += 1 + else: + multi_singles_counter += 1 + if report_multi == False: + continue + + tot_pairs_counter += 1 + (r1, r2) = sam_flag(r1,r2, hr1, hr2) + + #print hr1.getrname(r1.tid) + #print hr2.getrname(r2.tid) + #print r1 + #print r2 + ## Write output + outfile.write(r1) + outfile.write(r2) + + else: + print "Forward and reverse reads not paired. Check that BAM files have the same read names and are sorted." 
+ sys.exit(1) + + if stat: + if output == '-': + statfile = "pairing.stat" + else: + statfile = re.sub('\.bam$', '.pairstat', output) + handle_stat = open(statfile, 'w') + + handle_stat.write("Total_pairs_processed\t" + str(reads_counter) + "\t" + str(round(float(reads_counter)/float(reads_counter)*100,3)) + "\n") + handle_stat.write("Unmapped_pairs\t" + str(unmapped_pairs_counter) + "\t" + str(round(float(unmapped_pairs_counter)/float(reads_counter)*100,3)) + "\n") + handle_stat.write("Low_qual_pairs\t" + str(lowq_pairs_counter) + "\t" + str(round(float(lowq_pairs_counter)/float(reads_counter)*100,3)) + "\n") + handle_stat.write("Unique_paired_alignments\t" + str(uniq_pairs_counter) + "\t" + str(round(float(uniq_pairs_counter)/float(reads_counter)*100,3)) + "\n") + handle_stat.write("Multiple_pairs_alignments\t" + str(multi_pairs_counter) + "\t" + str(round(float(multi_pairs_counter)/float(reads_counter)*100,3)) + "\n") + handle_stat.write("Pairs_with_singleton\t" + str(singleton_counter) + "\t" + str(round(float(singleton_counter)/float(reads_counter)*100,3)) + "\n") + handle_stat.write("Low_qual_singleton\t" + str(lowq_singles_counter) + "\t" + str(round(float(lowq_singles_counter)/float(reads_counter)*100,3)) + "\n") + handle_stat.write("Unique_singleton_alignments\t" + str(uniq_singles_counter) + "\t" + str(round(float(uniq_singles_counter)/float(reads_counter)*100,3)) + "\n") + handle_stat.write("Multiple_singleton_alignments\t" + str(multi_singles_counter) + "\t" + str(round(float(multi_singles_counter)/float(reads_counter)*100,3)) + "\n") + handle_stat.write("Reported_pairs\t" + str(tot_pairs_counter) + "\t" + str(round(float(tot_pairs_counter)/float(reads_counter)*100,3)) + "\n") + handle_stat.close() + + hr1.close() + hr2.close() + outfile.close() + diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py index 0e98e07fea0933a455bd576100e8b026cb684e18..4a1747d86e627006574c326807fcb8ff7637c242 100755 --- a/bin/scrape_software_versions.py +++ b/bin/scrape_software_versions.py @@ -7,14 +7,10 @@ import re regexes = { 'nf-core/hic': ['v_pipeline.txt', r"(\S+)"], 'Nextflow': ['v_nextflow.txt', r"(\S+)"], - 'FastQC': ['v_fastqc.txt', r"FastQC v(\S+)"], - 'MultiQC': ['v_multiqc.txt', r"multiqc, version (\S+)"], } results = OrderedDict() results['nf-core/hic'] = '<span style="color:#999999;\">N/A</span>' results['Nextflow'] = '<span style="color:#999999;\">N/A</span>' -results['FastQC'] = '<span style="color:#999999;\">N/A</span>' -results['MultiQC'] = '<span style="color:#999999;\">N/A</span>' # Search each file using its regex for k, v in regexes.items(): diff --git a/bin/src/build_matrix.cpp b/bin/src/build_matrix.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e366d5b7649d3f9eb040a80eee5a5d10572f6593 --- /dev/null +++ b/bin/src/build_matrix.cpp @@ -0,0 +1,1037 @@ +// HiC-Pro +// Copyright 2015 Institut Curie +// Author(s): Eric Viara +// Contact: nicolas.servant@curie.fr +// This software is distributed without any guarantee under the terms of the BSD-3 License + +#include <iostream> +#include <iomanip> +#include <fstream> +#include <sstream> +#include <unordered_map> +#include <map> +#include <vector> +#include <stdlib.h> +#include <string.h> +#include <assert.h> +#include <math.h> +#include <fcntl.h> +#include <unistd.h> +#include <sys/stat.h> + + +static const int SPARSE_FMT = 0x1; +static const int BED_FMT = 0x2; +static const char* prog; +static bool progress = false; +static bool detail_progress = false; +static bool quiet = false; 
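+// Setting the NO_DICHO environment variable disables the dichotomic (binary)
+// search over bin intervals in AxisChromosome::assign_bin and falls back to a
+// linear scan over the intervals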
+ +static bool NO_DICHO = getenv("NO_DICHO") != NULL; + +typedef unsigned int chrsize_t; + +const std::string VERSION = "1.2 [2015-10-20]"; + +const static chrsize_t BIN_NOT_FOUND = (chrsize_t)-1; + +class AxisChromosome; + +static bool is_empty_line(const char* buffer) +{ + while (char c = *buffer++) { + if (c != ' ' || c != '\n' || c != '\t') { + return false; + } + } + return true; +} + +static int bed_line_parse(char* buffer, char chr[], chrsize_t& start, chrsize_t& end, const std::string& bedfile, size_t line_num) +{ + if (sscanf(buffer, "%s %u %u", chr, &start, &end) != 3) { + std::cerr << "bed file \"" << bedfile << "\" at line #" << line_num << " format error\n"; + return 1; + } + return 0; +} + +struct Interval { + chrsize_t start; + chrsize_t end; + + Interval(chrsize_t start = 0, chrsize_t end = 0) : start(start), end(end) { } +}; + +class ChrRegions { + + std::vector<std::string> chr_v; + std::map<std::string, std::vector<Interval>* > intervals; + +public: + ChrRegions() { } + + int readBedfile(const std::string& bedfile) { + std::ifstream ifs(bedfile.c_str()); + if (ifs.bad() || ifs.fail()) { + std::cerr << prog << " cannot open bed file: " << bedfile << " for reading\n"; + return 1; + } + char buffer[4096]; + size_t line_num = 0; + chrsize_t lastend = 0; + char lastchr[2048] = {0}; + while (!ifs.eof()) { + ifs.getline(buffer, sizeof(buffer)-1); + line_num++; + if (is_empty_line(buffer)) { + continue; + } + chrsize_t start = 0; + chrsize_t end = 0; + char chr[2048]; + if (bed_line_parse(buffer, chr, start, end, bedfile, line_num)) { + return 1; + } + if (intervals.find(chr) == intervals.end()) { + intervals[chr] = new std::vector<Interval>(); + chr_v.push_back(chr); + } + /* + if (lastend != 0 && !strcmp(lastchr, chr) && start != lastend) { + std::cerr << "warning: discontinuous segment for chromosome " << chr << " at position " << start << " " << end << std::endl; + } + */ + if (*lastchr && strcmp(lastchr, chr)) { + lastend = 0; + } + + if (lastend != 0 && start < lastend) { + std::cerr << "error: bedfile not sorted at line #" << line_num << std::endl; + exit(1); + } + strcpy(lastchr, chr); + lastend = end; + intervals[chr]->push_back(Interval(start, end)); + if (progress && (line_num % 100000) == 0) { + std::cerr << '.' << std::flush; + } + } + if (progress) { + std::cerr << std::endl; + } + return 0; + } + + void displayBed(std::ostream& ofs, const std::vector<AxisChromosome*>& axis_chr) const { + std::vector<std::string>::const_iterator begin = chr_v.begin(); + std::vector<std::string>::const_iterator end = chr_v.end(); + unsigned int num = 1; + while (begin != end) { + const std::string& chrname = *begin; + std::map<std::string, std::vector<Interval>* >::const_iterator iter = intervals.find(chrname); + assert(iter != intervals.end()); + const std::vector<Interval>* itv_vect = (*iter).second; + std::vector<Interval>::const_iterator itv_begin = itv_vect->begin(); + std::vector<Interval>::const_iterator itv_end = itv_vect->end(); + while (itv_begin != itv_end) { + const Interval& itv = (*itv_begin); + ofs << chrname << '\t' << itv.start << '\t' << itv.end << '\t' << num << '\n'; + if (progress && (num % 100000) == 0) { + std::cerr << '.' 
<< std::flush; + } + num++; + ++itv_begin; + } + ++begin; + } + if (progress) { + std::cerr << std::endl; + } + } + + const std::vector<Interval>* getIntervalsFromChr(const std::string& chr) const { + std::map<std::string, std::vector<Interval>* >::const_iterator iter = intervals.find(chr); + if (iter != intervals.end()) { + return (*iter).second; + } + return NULL; + } +}; + +class Dichotomic { + + int min, max; + const std::vector<Interval>& intervals; + +public: + Dichotomic(const std::vector<Interval>& intervals) : intervals(intervals) { + //min = middle(intervals[0]); + //max = middle(intervals[intervals.size()-1]); + min = 0; + max = intervals.size()-1; + } + + static chrsize_t middle(const Interval& itv) { + return (itv.start+1 + itv.end) / 2; + } + + int find(chrsize_t value) { + int l = min; + int r = max; + int n = 0; + while (l <= r) { + n = (l + r) >> 1; + const Interval& itv = intervals[n]; + if (value >= itv.start+1 && value <= itv.end) { + return n; + } + + int x = middle(itv) - value; + + if (x < 0) { + l = n + 1; + } else { + r = n - 1; + } + //std::cout << "l: " << l << '\n'; + //std::cout << "r: " << r << '\n'; + } + + return -1; + } +}; + +class Chromosome { + +private: + static std::unordered_map<std::string, Chromosome*> chr_map; + + void computeSizes(chrsize_t ori_binsize, chrsize_t step, bool binadjust, const ChrRegions* chr_regions); + + std::string name; + + chrsize_t chrsize; + + chrsize_t binsize; + chrsize_t stepsize; + chrsize_t bincount; + + const ChrRegions* chr_regions; + +public: + Chromosome(const std::string& name, chrsize_t chrsize, chrsize_t ori_binsize, chrsize_t step, bool binadjust, const ChrRegions* chr_regions) : name(name), chrsize(chrsize), chr_regions(chr_regions) { + computeSizes(ori_binsize, step, binadjust, chr_regions); + assert(chr_map.find(name) == chr_map.end()); + chr_map[name] = this; + } + + void adjustBinsize(chrsize_t ori_binsize, const chrsize_t step); + + const std::string& getName() const {return name;} + chrsize_t getChrsize() const {return chrsize;} + chrsize_t getBinsize() const {return binsize;} + chrsize_t getStepsize() const {return stepsize;} + chrsize_t getBincount() const {return bincount;} + + const ChrRegions* getChrRegions() const {return chr_regions;} + + static chrsize_t getCount() { + return chr_map.size(); + } + + static Chromosome* getByName(const std::string& name) { + return chr_map[name]; + } +}; + +class AxisChromosome { + int idx; // really needed ? 
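+  // Bins are numbered genome-wide along the axis: this chromosome covers the
+  // half-open bin index range [binstart, binend)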
+ const Chromosome* chr; + chrsize_t binstart; + chrsize_t binend; + +public: + AxisChromosome(int binoffset, const Chromosome* chr, const AxisChromosome* lastAxisChr) : chr(chr) { + if (lastAxisChr != NULL) { + binstart = lastAxisChr->getBinend(); + } else { + binstart = binoffset; + } + binend = binstart + chr->getBincount(); + /* + if (verbose) { + std::cerr << "AxisChromosome: " << chr->getName() << " " << binstart << " " << binend << " " << chr->getBincount() << std::endl; + } + */ + } + + chrsize_t getBinstart() const {return binstart;} + chrsize_t getBinend() const {return binend;} + chrsize_t getChrsize() const {return chr->getChrsize();} + chrsize_t getBinsize() const {return chr->getBinsize();} + chrsize_t getStepsize() const {return chr->getStepsize();} + chrsize_t getBincount() const {return chr->getBincount();} + + const Chromosome* getChromosome() const {return chr;} + + chrsize_t assign_bin(const std::string& org, chrsize_t start) const { + const ChrRegions* chr_regions = chr->getChrRegions(); + if (chr_regions != NULL) { + const std::vector<Interval>* intervals = chr_regions->getIntervalsFromChr(chr->getName()); + assert(intervals != NULL); + + if (!NO_DICHO) { + Dichotomic dicho(*intervals); + int where = dicho.find(start); + if (where < 0) { + if (!quiet) { + std::cerr << "warning: no bin at position " << chr->getName() << ":" << start << std::endl; + } + return BIN_NOT_FOUND; + } + return where + getBinstart(); + } + + std::vector<Interval>::const_iterator begin = intervals->begin(); + std::vector<Interval>::const_iterator end = intervals->end(); + + chrsize_t binidx = 1; + while (begin != end) { + const Interval& itv = *begin; + if (start >= itv.start+1 && start <= itv.end) { + break; + } + ++binidx; + ++begin; + } + + return binidx + getBinstart() - 1; + } + + int loc = (int)start; + int binsize = getBinsize(); + int stepsize = getStepsize(); + int cur_binidx = 1 + ceil((double)(loc-binsize)/stepsize); + int cur_binbeg = stepsize * (cur_binidx-1)+1; + int cur_binend = cur_binbeg + binsize-1; + int chrsize = getChrsize(); + if (cur_binend > chrsize) { + cur_binend = chrsize; + } + return cur_binidx + getBinstart() - 1; + } +}; + +class Matrix { + + std::vector<AxisChromosome*> axis_chr_abs; + std::vector<AxisChromosome*> axis_chr_ord; + std::unordered_map<std::string, AxisChromosome*> axis_chr_abs_map; + std::unordered_map<std::string, AxisChromosome*> axis_chr_ord_map; + + std::map<chrsize_t, std::map<chrsize_t, chrsize_t> > mat; + + void addAxisChromosome(const std::vector<const Chromosome*>& chr_v, std::vector<AxisChromosome*>& axis_chr, std::unordered_map<std::string, AxisChromosome*>& axis_chr_map); + + const AxisChromosome* getAxisChromosome(const std::string& chrname, const std::unordered_map<std::string, AxisChromosome*>& axis_chr_map) const { + std::unordered_map<std::string, AxisChromosome*>::const_iterator iter = axis_chr_map.find(chrname); + if (iter == axis_chr_map.end()) { + return NULL; + } + return (*iter).second; + } + + void displayBed(std::ostream& ofs, const std::vector<AxisChromosome*>& axis_chr) const { + std::vector<AxisChromosome*>::const_iterator begin = axis_chr.begin(); + std::vector<AxisChromosome*>::const_iterator end = axis_chr.end(); + while (begin != end) { + const AxisChromosome* axis_chr = *begin; + const std::string& name = axis_chr->getChromosome()->getName(); + chrsize_t binstart = axis_chr->getBinstart(); + chrsize_t binend = axis_chr->getBinend(); + chrsize_t binsize = axis_chr->getBinsize(); + chrsize_t chrsize = 
axis_chr->getChrsize(); + binend -= binstart; + for (chrsize_t bin = 0; bin < binend; ++bin) { + // bed are 0-based begin, 1-based end + chrsize_t beg = bin * binsize; + chrsize_t end = beg + binsize - 1; + if (end > chrsize) { + end = chrsize-1; + } + ofs << name << '\t' << beg << '\t' << (end+1) << '\t' << (bin+binstart) << '\n'; + } + ++begin; + } + } + + int binoffset; + +public: + Matrix(int binoffset) : binoffset(binoffset) {} + + void addXAxisChromosome(const std::vector<const Chromosome*>& chr_v); + void addYAxisChromosome(const std::vector<const Chromosome*>& chr_v); + + const AxisChromosome* getXAxisChromosome(const std::string& chrname) const { + return getAxisChromosome(chrname, axis_chr_abs_map); + } + + const AxisChromosome* getYAxisChromosome(const std::string& chrname) const { + return getAxisChromosome(chrname, axis_chr_ord_map); + } + + void add(chrsize_t abs_bin, chrsize_t ord_bin) { + std::map<chrsize_t, std::map<chrsize_t, chrsize_t> >::iterator iter = mat.find(abs_bin); + if (iter == mat.end()) { + mat[abs_bin] = std::map<chrsize_t, chrsize_t>(); + mat[abs_bin][ord_bin] = 1; + } else { + (*iter).second[ord_bin]++; + } + } + + void displayMatrix(std::ostream& ofs) const { + std::map<chrsize_t, std::map<chrsize_t, chrsize_t> >::const_iterator begin = mat.begin(); + std::map<chrsize_t, std::map<chrsize_t, chrsize_t> >::const_iterator end = mat.end(); + size_t line_total = 0; + if (progress) { + while (begin != end) { + const std::map<chrsize_t, chrsize_t>& line = (*begin).second; + line_total += line.size(); + ++begin; + } + begin = mat.begin(); + } + + size_t line_cnt = 1; + if (progress) { + std::cerr << "\n=================\n"; + std::cerr << " Dumping matrix\n"; + std::cerr << "=================\n\n"; + } + size_t modulo = line_total / 1000; + while (begin != end) { + chrsize_t abs = (*begin).first; + const std::map<chrsize_t, chrsize_t>& line = (*begin).second; + std::map<chrsize_t, chrsize_t>::const_iterator bb = line.begin(); + std::map<chrsize_t, chrsize_t>::const_iterator ee = line.end(); + while (bb != ee) { + if (progress && (line_cnt % modulo) == 0) { + double percent = (double(line_cnt)/line_total)*100; + std::cerr << "" << percent << "% " << line_cnt << " / " << line_total << std::endl; + } + ofs << abs << '\t' << (*bb).first << '\t' << (*bb).second << '\n'; + line_cnt++; + ++bb; + } + ++begin; + } + } + + void displayXBed(std::ostream& ofs) const { + displayBed(ofs, axis_chr_abs); + } + + void displayYBed(std::ostream& ofs) const { + displayBed(ofs, axis_chr_ord); + } + + const std::vector<AxisChromosome*>& getXAxisChromosomes() {return axis_chr_abs;} + const std::vector<AxisChromosome*>& getYAxisChromosomes() {return axis_chr_ord;} +}; + +void Matrix::addAxisChromosome(const std::vector<const Chromosome*>& chr_v, std::vector<AxisChromosome*>& axis_chr, std::unordered_map<std::string, AxisChromosome*>& axis_chr_map) +{ + std::vector<const Chromosome*>::const_iterator begin = chr_v.begin(); + std::vector<const Chromosome*>::const_iterator end = chr_v.end(); + + const AxisChromosome* lastAxisChr = NULL; + while (begin != end) { + const Chromosome* chr = *begin; + AxisChromosome* axisChr = new AxisChromosome(binoffset, chr, lastAxisChr); + axis_chr.push_back(axisChr); + axis_chr_map[chr->getName()] = axisChr; + lastAxisChr = axisChr; + ++begin; + } +} + +void Matrix::addXAxisChromosome(const std::vector<const Chromosome*>& chr_v) +{ + addAxisChromosome(chr_v, axis_chr_abs, axis_chr_abs_map); +} + +void Matrix::addYAxisChromosome(const std::vector<const 
Chromosome*>& chr_v) +{ + addAxisChromosome(chr_v, axis_chr_ord, axis_chr_ord_map); +} + +std::unordered_map<std::string, Chromosome*> Chromosome::chr_map; + +enum Format { + SPARSE_IND_FMT = SPARSE_FMT, + SPARSE_BED_FMT = SPARSE_FMT|BED_FMT, + EXPANDED_FMT = 0x4 +}; + +void Chromosome::adjustBinsize(chrsize_t ori_binsize, const chrsize_t step) +{ + bincount = 1 + (chrsize_t)floor( (double)(chrsize-ori_binsize) / (ori_binsize/step)); + binsize = chrsize / bincount; + stepsize = binsize / step; +} + +void Chromosome::computeSizes(chrsize_t ori_binsize, chrsize_t step, bool binadjust, const ChrRegions* chr_regions) +{ + if (NULL != chr_regions) { + const std::vector<Interval>* intervals = chr_regions->getIntervalsFromChr(name); + assert(intervals != NULL); + bincount = intervals->size(); + /* + if (verbose) { + std::cerr << name << " bincount: " << bincount << std::endl; + } + */ + } else { + if (chrsize < ori_binsize) { + binsize = chrsize; + stepsize = chrsize; + bincount = 1; + } else if (binadjust) { + adjustBinsize(ori_binsize, step); + } else { + binsize = ori_binsize; + stepsize = (chrsize_t)floor(ori_binsize/step); + chrsize_t remainder = (chrsize - ori_binsize) % stepsize; + chrsize_t tmp_bincount = 1 + (chrsize_t)floor(chrsize-ori_binsize)/stepsize; + bincount = remainder > 0 ? tmp_bincount+1 : tmp_bincount; + } + /* + if (verbose) { + std::cerr << name << " sizes: " << chrsize << " " << binsize << " " << stepsize << " " << bincount << std::endl; + } + */ + } +} + +static int usage(int ret = 1) +{ + std::cerr << "\nusage: " << prog << " --binsize BINSIZE|--binfile --chrsizes FILE --ifile FILE\n"; + std::cerr << " --oprefix PREFIX [--binadjust] [--step STEP] [--binoffset OFFSET]\n"; + std::cerr << " [--matrix-format asis|upper|lower|complete][--chrA CHR... --chrB CHR...] 
[--quiet] [--progress] [--detail-progress]\n"; + std::cerr << "\nusage: " << prog << " --version\n"; + std::cerr << "\nusage: " << prog << " --help\n"; + return ret; +} + +static int help() +{ + (void)usage(); + std::cerr << "\nOPTIONS\n\n"; + std::cerr << " --version : display version\n"; + std::cerr << " --binsize BINSIZE : bin size\n"; + std::cerr << " --binfile BEDFILE : bed file containing bins (chr start end)\n"; + std::cerr << " --chrsizes FILE : file containing chromosome sizes\n"; + std::cerr << " --ifile FILE : input interaction file\n"; + std::cerr << " --oprefix PREFIX : output prefix of generated files (matrix and bed)\n"; + std::cerr << " --binadjust : [optional] adjust bin sizes, default is false\n"; + std::cerr << " --step STEP : [optional] step size, default is 1\n"; + std::cerr << " --binoffset OFFSET : [optional] starting bin offset, default is 1\n"; + std::cerr << " --matrix-format FORMAT : [optional] FORMAT may be:\n"; + std::cerr << " - asis: matrix is generated according to input data (default)\n"; + std::cerr << " - upper: only the upper matrix is generated\n"; + std::cerr << " - lower: only the lower matrix is generated\n"; + std::cerr << " - complete: generate both parts of the matrix (upper and lower);\n"; + std::cerr << " input data must contain only one part (upper or lower) \n"; + std::cerr << " --chrA CHR : [optional] colon separated list of abscissa chromosomes; default is all chromosomes\n"; + std::cerr << " --chrB CHR : [optional] colon separated list of ordinate chromosomes; default is all chromosomes\n"; + std::cerr << " --quiet : do not display any warning\n"; + std::cerr << " --progress : display progress\n"; + std::cerr << " --detail-progress : display detail progress (needs preliminary steps consuming time)\n"; + return -1; +} + +enum MatrixFormat { + ASIS_MATRIX = 1, + UPPER_MATRIX, + LOWER_MATRIX, + COMPLETE_MATRIX +}; + +static int get_options(int argc, char* argv[], chrsize_t& binsize, const char*& binfile, const char*& chrsize_file, const char*& ifile, const char*& oprefix, Format& format, std::string& bed_prefix, bool& binadjust, MatrixFormat& matrix_format, chrsize_t& step, bool& whole_genome, int& binoffset, const char*& chrA, const char*& chrB) +{ + prog = argv[0]; + for (int ac = 1; ac < argc; ++ac) { + const char* opt = argv[ac]; + if (*opt == '-') { + if (!strcmp(opt, "--binadjust")) { + binadjust = true; + } else if (!strcmp(opt, "--version")) { + std::cout << "build_matrix version " << VERSION << "\n"; + exit(0); + } else if (!strcmp(opt, "--progress")) { + progress = true; + } else if (!strcmp(opt, "--quiet")) { + quiet = true; + } else if (!strcmp(opt, "--detail-progress")) { + progress = true; + detail_progress = true; + } else if (!strcmp(opt, "--matrix-format")) { + if (ac == argc-1) { + return usage(); + } + std::string matrix_format_str = argv[++ac]; + if (matrix_format_str == "asis") { + matrix_format = ASIS_MATRIX; + } else if (matrix_format_str == "upper") { + matrix_format = UPPER_MATRIX; + } else if (matrix_format_str == "lower") { + matrix_format = LOWER_MATRIX; + } else if (matrix_format_str == "complete") { + matrix_format = COMPLETE_MATRIX; + } else { + return usage(); + } + } else if (!strcmp(opt, "--step")) { + if (ac == argc-1) { + return usage(); + } + step = atoi(argv[++ac]); + } else if (!strcmp(opt, "--binfile")) { + if (ac == argc-1) { + return usage(); + } + binfile = argv[++ac]; + } else if (!strcmp(opt, "--binsize")) { + if (ac == argc-1) { + return usage(); + } + binsize = atoi(argv[++ac]); + } else if 
(!strcmp(opt, "--binoffset")) { + if (ac == argc-1) { + return usage(); + } + binoffset = atoi(argv[++ac]); + } else if (!strcmp(opt, "--ifile")) { + if (ac == argc-1) { + return usage(); + } + ifile = argv[++ac]; + } else if (!strcmp(opt, "--oprefix")) { + if (ac == argc-1) { + return usage(); + } + oprefix = argv[++ac]; + } else if (!strcmp(opt, "--chrsizes")) { + if (ac == argc-1) { + return usage(); + } + chrsize_file = argv[++ac]; + } else if (!strcmp(opt, "--chrA")) { + if (ac == argc-1) { + return usage(); + } + chrA = argv[++ac]; + whole_genome = false; + } else if (!strcmp(opt, "--chrB")) { + if (ac == argc-1) { + return usage(); + } + chrB = argv[++ac]; + whole_genome = false; + } else if (!strcmp(opt, "--help")) { + return help(); + } else { + std::cerr << '\n' << prog << ": unknown option " << opt << std::endl; + return usage(); + } + } + } + + return 0; +} + +static void split_in_vect(const std::string& str, std::vector<const Chromosome*>& vect) +{ + size_t last_pos = 0; + while (size_t pos = str.find(':', last_pos)) { + std::string chrname; + bool last = pos == std::string::npos; + if (last) { + chrname = str.substr(last_pos); + } else { + chrname = str.substr(last_pos, pos-last_pos); + } + const Chromosome* chr = Chromosome::getByName(chrname); + if (!chr) { + std::cerr << prog << ": unknown chromosome " << chrname << std::endl; + exit(1); + } + vect.push_back(chr); + if (last) { + break; + } + last_pos = pos+1; + } +} + +static int interaction_parse(char* buffer, char*& lchr, chrsize_t& lstart, char*& rchr, chrsize_t& rstart) +{ + char c; + char* str; + while ((c = *buffer++) != 0) { + if (c == '\t') { + lchr = buffer; + break; + } + } + while ((c = *buffer) != 0) { + if (c == '\t') { + *buffer++ = 0; + str = buffer; + break; + } + buffer++; + } + + while ((c = *buffer) != 0) { + if (c == '\t') { + *buffer++ = 0; + lstart = atoi(str); + break; + } + buffer++; + } + + while ((c = *buffer++) != 0) { + if (c == '\t') { + rchr = buffer; + break; + } + } + + while ((c = *buffer) != 0) { + if (c == '\t') { + *buffer++ = 0; + str = buffer; + break; + } + buffer++; + } + + while ((c = *buffer) != 0) { + if (c == '\t') { + *buffer++ = 0; + rstart = atoi(str); + break; + } + buffer++; + } + + return 0; +} + +static char p_buffer[512000]; + +static int build_matrix_init(Matrix& matrix, const char* ifile, std::ifstream& ifs, const std::string& oprefix, std::ofstream& matfs, std::ofstream& xbedfs, std::ofstream& ybedfs, const char* chrsize_file, bool whole_genome, const char* chrA, const char* chrB, chrsize_t ori_binsize, const char* binfile, chrsize_t step, bool binadjust, ChrRegions*& chr_regions, size_t& line_total) +{ + ifs.open(ifile); + if (ifs.bad() || ifs.fail()) { + std::cerr << prog << " cannot open interaction file: " << ifile << " for reading\n"; + return 1; + } + + if (detail_progress) { + if (progress) { + std::cerr << "\n======================================\n"; + std::cerr << " Getting information for progress bar\n"; + std::cerr << "======================================\n\n"; + } + std::cerr << std::setprecision(2) << std::fixed; + int fd = open(ifile, O_RDONLY); + struct stat st; + assert(fstat(fd, &st) == 0); + assert(fd >= 0); + int nn; + int cnt = 1; + while ((nn = read(fd, p_buffer, sizeof(p_buffer))) > 0) { + const char *p = p_buffer; + while (nn-- > 0) { + if (*p++ == '\n') { + line_total++; + } + } + if ((cnt % 200) == 0) { + std::cerr << '.' 
<< std::flush; + } + cnt++; + } + std::cerr << std::endl; + close(fd); + } + + std::ifstream chrsizefs; + chrsizefs.open(chrsize_file); + if (chrsizefs.bad() || chrsizefs.fail()) { + std::cerr << prog << " cannot open chrsizes file: " << chrsize_file << " for reading\n"; + return 1; + } + + std::string matfile = oprefix + ".matrix"; + matfs.open(matfile); + if (matfs.bad() || matfs.fail()) { + std::cerr << prog << " cannot open file: " << matfile << " for writing\n"; + return 1; + } + + std::string xbedfile = oprefix + "_abs.bed"; + xbedfs.open(xbedfile); + if (xbedfs.bad() || xbedfs.fail()) { + std::cerr << prog << " cannot open file: " << xbedfile << " for writing\n"; + return 1; + } + + std::string ybedfile = oprefix + "_ord.bed"; + if (!whole_genome) { + //std::string xbedlink; + //size_t pos = xbedfile.rfind('/'); + //if (pos != std::string::npos) { + // xbedlink = xbedfile.substr(pos+1); + //} else { + // xbedlink = xbedfile; + //} + //unlink(ybedfile.c_str()); + //if (symlink(xbedlink.c_str(), ybedfile.c_str())) { + // std::cerr << prog << " cannot created link: " << ybedfile << "\n"; + // return 1; + //} + //} else { + ybedfs.open(ybedfile); + if (ybedfs.bad() || ybedfs.fail()) { + std::cerr << prog << " cannot open file: " << ybedfile << " for writing\n"; + return 1; + } + } + + chr_regions = NULL; + if (NULL != binfile) { + chr_regions = new ChrRegions(); + if (progress) { + std::cerr << "\n=================\n"; + std::cerr << " Reading binfile\n"; + std::cerr << "=================\n\n"; + } + if (chr_regions->readBedfile(binfile)) { + return 1; + } + } + + std::vector<const Chromosome*> all_chr_v; + while (!chrsizefs.eof()) { + std::string buffer; + getline(chrsizefs, buffer); + + chrsize_t chrsize; + std::istringstream istr(buffer); + std::string name; + istr >> name >> chrsize; + if (!istr.fail()) { + Chromosome* chromosome = new Chromosome(name, chrsize, ori_binsize, step, binadjust, chr_regions); + all_chr_v.push_back(chromosome); + } + } + + chrsizefs.close(); + + if (chrA) { + assert(chrB != NULL); + std::vector<const Chromosome*> chrA_v; + std::vector<const Chromosome*> chrB_v; + split_in_vect(chrA, chrA_v); + split_in_vect(chrB, chrB_v); + matrix.addXAxisChromosome(chrA_v); + matrix.addYAxisChromosome(chrB_v); + } else { + matrix.addXAxisChromosome(all_chr_v); + matrix.addYAxisChromosome(all_chr_v); + } + + return 0; +} + +static int build_matrix(int binoffset, chrsize_t ori_binsize, const char* binfile, const char* chrsize_file, const char* ifile, const char* oprefix, Format _dummy_format, const std::string& _dummy_bed_prefix, bool binadjust, MatrixFormat matrix_format, chrsize_t step, bool whole_genome, const char* chrA, const char* chrB) +{ + std::ifstream ifs; + std::ofstream matfs, xbedfs, ybedfs; + + Matrix matrix(binoffset); + ChrRegions *chr_regions = NULL; + size_t line_total = 0; + if (int ret = build_matrix_init(matrix, ifile, ifs, oprefix, matfs, xbedfs, ybedfs, chrsize_file, whole_genome, chrA, chrB, ori_binsize, binfile, step, binadjust, chr_regions, line_total)) { + return ret; + } + + if (progress) { + std::cerr << "\n=================\n"; + std::cerr << " Building matrix\n"; + std::cerr << "=================\n\n"; + } + size_t line_cnt = 1; + size_t line_num = 0; + char buffer[4096]; + std::string lmark, rmark, lorg, rorg; + while (!ifs.eof()) { + ifs.getline(buffer, sizeof(buffer)-1); + line_num++; + if (is_empty_line(buffer)) { + continue; + } + chrsize_t lstart = 0; + chrsize_t rstart = 0; + char* lchr = NULL; + char* rchr = NULL; + 
interaction_parse(buffer, lchr, lstart, rchr, rstart); + const AxisChromosome* abs_chr = matrix.getXAxisChromosome(lchr); + if (!abs_chr) { + continue; + } + const AxisChromosome* ord_chr = matrix.getYAxisChromosome(rchr); + if (!ord_chr) { + continue; + } + chrsize_t abs_bin = abs_chr->assign_bin(lorg, lstart); + if (abs_bin == BIN_NOT_FOUND) { + continue; + } + chrsize_t ord_bin = ord_chr->assign_bin(rorg, rstart); + if (ord_bin == BIN_NOT_FOUND) { + continue; + } + switch(matrix_format) { + + case ASIS_MATRIX: + matrix.add(abs_bin, ord_bin); + break; + + case UPPER_MATRIX: + if (abs_bin < ord_bin) { + matrix.add(abs_bin, ord_bin); + } else { + matrix.add(ord_bin, abs_bin); + } + break; + + case LOWER_MATRIX: + if (abs_bin > ord_bin) { + matrix.add(abs_bin, ord_bin); + } else { + matrix.add(ord_bin, abs_bin); + } + break; + + case COMPLETE_MATRIX: + matrix.add(abs_bin, ord_bin); + if (abs_bin != ord_bin) { + matrix.add(ord_bin, abs_bin); + } + break; + } + line_cnt++; + if (progress && (line_cnt % 100000) == 0) { + if (detail_progress) { + double percent = (double(line_cnt)/line_total)*100; + std::cerr << "" << percent << "% " << line_cnt << " / " << line_total << std::endl; + } else { + std::cerr << line_cnt << std::endl; + } + } + } + + if (progress) { + std::cerr << "\n==================\n"; + std::cerr << " Dumping bedfiles\n"; + std::cerr << "==================\n\n"; + } + + if (NULL != chr_regions) { + chr_regions->displayBed(xbedfs, matrix.getXAxisChromosomes()); + if (!whole_genome) { + chr_regions->displayBed(ybedfs, matrix.getYAxisChromosomes()); + } + } else { + matrix.displayXBed(xbedfs); + if (!whole_genome) { + matrix.displayYBed(ybedfs); + } + } + matrix.displayMatrix(matfs); + xbedfs.close(); + ybedfs.close(); + matfs.close(); + return 0; +} + +int main(int argc, char* argv[]) +{ + chrsize_t step = 1; + bool binadjust = false; + MatrixFormat matrix_format = ASIS_MATRIX; + chrsize_t binsize = 0; + const char* ifile = NULL; + const char* oprefix = NULL; + const char* chrA = NULL; + const char* chrB = NULL; + const char* chrsize_file = NULL; + const char* binfile = NULL; + bool whole_genome = true; + int binoffset = 1; + std::string bed_prefix; + Format format = SPARSE_BED_FMT; + + if (int ret = get_options(argc, argv, binsize, binfile, chrsize_file, ifile, oprefix, format, bed_prefix, binadjust, matrix_format, step, whole_genome, binoffset, chrA, chrB)) { + if (ret < 0) { + return 0; + } + return ret; + } + + if (!binsize && !binfile) { + std::cerr << '\n'; + std::cerr << prog << ": missing --binsize or --binfile option\n"; + return usage(); + } + + if (!chrsize_file) { + std::cerr << '\n'; + std::cerr << prog << ": missing --chrsizes option\n"; + return usage(); + } + + if (!ifile) { + std::cerr << '\n'; + std::cerr << prog << ": missing --ifile option\n"; + return usage(); + } + + if (!oprefix) { + std::cerr << '\n'; + std::cerr << prog << ": missing --oprefix option\n"; + return usage(); + } + + if ((chrA && !chrB) || (!chrA && chrB)) { + std::cerr << '\n'; + std::cerr << prog << ": options --chrA and --chrB must be set simultanously\n"; + return usage(); + } + + if (binfile && binsize) { + std::cerr << '\n'; + std::cerr << prog << ": options --binfile and --binsize cannot be set simultanously\n"; + return usage(); + } + + return build_matrix(binoffset, binsize, binfile, chrsize_file, ifile, oprefix, format, bed_prefix, binadjust, matrix_format, step, whole_genome, chrA, chrB); +} diff --git a/bin/src/cutsite_trimming.cpp b/bin/src/cutsite_trimming.cpp new file mode 
100644
index 0000000000000000000000000000000000000000..ef3fa869cd3bfe5f4e473908224cb42c2b99cbfe
--- /dev/null
+++ b/bin/src/cutsite_trimming.cpp
@@ -0,0 +1,153 @@
+// HiC-Pro
+// Copyright 2015 Institut Curie
+// Author(s): Nicolas Servant
+// Contact: nicolas.servant@curie.fr
+// This software is distributed without any guarantee under the terms of the BSD-3 licence
+
+// g++ -std=c++0x -o cutsite_trimming cutsite_trimming.cpp
+// ./cutsite_trimming --fastq FASTQFILE --cutsite AGCTT --out OUTFILE
+
+
+#include <iostream>   // std::cout
+#include <stdlib.h>
+#include <string.h>
+#include <vector>
+#include <fstream>
+
+static const char* prog;
+
+static int usage(int ret=1)
+{
+  std::cerr << "usage: " << prog << " --fastq FASTQFILE --cutsite CUTSITE --out OUTFILE [--rmuntrim] \n";
+  std::cerr << "usage: " << prog << " --help\n";
+  return ret;
+}
+
+static int get_options(int argc, char* argv[], std::string& fastqFile,
+                       std::vector<std::string>& cutSites, std::string& output, bool& rmuntrim)
+{
+  prog = argv[0];
+  if (argc == 1){
+    exit(usage());
+  }
+  for (int ac = 1; ac < argc; ++ac) {
+    const char* opt = argv[ac];
+    if (*opt == '-') {
+      if (!strcmp(opt, "--fastq")) {
+        fastqFile = std::string(argv[++ac]);
+      } else if (!strcmp(opt, "--cutsite")) {
+
+        std::string cutSitesSequence;
+        cutSitesSequence = std::string(argv[++ac]);
+        size_t pos = cutSitesSequence.find(",");
+        size_t begin = 0;
+        while(pos != std::string::npos){
+          cutSites.push_back(cutSitesSequence.substr(begin, pos - begin));
+          begin = pos + 1;
+          pos = cutSitesSequence.find(",", begin);
+        }
+        cutSites.push_back(cutSitesSequence.substr(begin, pos));
+
+      }
+      else if (!strcmp(opt, "--out")) {
+        output = std::string(argv[++ac]);
+      }
+      else if (!strcmp(opt, "--rmuntrim")) {
+        rmuntrim = true;
+      }
+    } else {
+      std::cerr << prog << ": unknown option " << opt << std::endl;
+      return usage();
+    }
+  }
+  return 0;
+}
+
+static int trim_fastq(std::string& fastqFile,
+                      std::vector<std::string>& cutSites,
+                      std::string& outFile, bool& rmuntrim)
+{
+
+  int trim_count=0;
+  std::string ID;
+  std::ifstream ifs (fastqFile);
+  std::ofstream ofs (outFile);
+
+  if (ifs.is_open()){
+    while (getline(ifs, ID)) {
+      std::string seq;
+      std::string dummy;
+      std::string qual;
+
+      getline(ifs, seq);
+      getline(ifs, dummy);
+      getline(ifs, qual);
+
+      bool find_pos = false;
+      size_t pos = std::string::npos;
+      for (std::vector<std::string>::iterator it = cutSites.begin(); it != cutSites.end(); ++it){
+        size_t tmp_pos = seq.find(*it);
+        if (tmp_pos != std::string::npos) {
+          // If find_pos is already true, two cut sites were found in the
+          // same read; keep the left-most (smallest) position.
+ if (find_pos == true){ + if(tmp_pos < pos) { + pos = tmp_pos; + } + } else { + find_pos = true; + pos = tmp_pos; + } + } + } + + if (pos != std::string::npos) { + trim_count++; + ofs << ID << '\n'; + ofs << seq.substr(0, pos) << '\n'; + ofs << "+\n"; + ofs << qual.substr(0, pos) << '\n'; + } else { + if (!rmuntrim){ + ofs << ID << '\n'; + ofs << seq << '\n'; + ofs << "+\n"; + ofs << qual << '\n'; + } + } + find_pos = false; + } + }else{ + std::cerr << "Error : Cannot open file : " << fastqFile; + } + return trim_count; +} + +int main(int argc, char* argv[]) +{ + + std::string fastqFile; + std::vector<std::string> cutSites; + std::string outFile; + bool rmuntrim = false; + + int ret = get_options(argc, argv, fastqFile, cutSites, outFile, rmuntrim); + printf("##Fastq file: %s\n", fastqFile.c_str()); + printf("##Restriction sites:\n"); + for(std::vector<std::string>::iterator it = cutSites.begin(); it != cutSites.end(); ++it){ + std::cout << *it << std::endl; + } + printf("##Output File: %s\n", outFile.c_str()); + + if (fastqFile.empty() || cutSites.size() == 0 || outFile.empty()){ + usage(); + exit(ret); + } + + int trim_count=trim_fastq(fastqFile, cutSites, outFile, rmuntrim); + printf("\n##Trimmed reads: %d\n", trim_count); + return(0); + } + + + diff --git a/conf/base.config b/conf/base.config index 23c9e4a820a7261cdbd2379235a182492bed0499..a8413de93141eb08c6c6f6498cf45532b8c24164 100644 --- a/conf/base.config +++ b/conf/base.config @@ -1,6 +1,6 @@ /* * ------------------------------------------------- - * nf-core/hic Nextflow base config file + * Nextflow base config file * ------------------------------------------------- * A 'blank slate' config file, appropriate for general * use on most high performace compute environments. @@ -13,29 +13,40 @@ process { container = params.container - // TODO nf-core: Check the defaults for all processes - cpus = { check_max( 1 * task.attempt, 'cpus' ) } + cpus = { check_max( 2, 'cpus' ) } memory = { check_max( 8.GB * task.attempt, 'memory' ) } time = { check_max( 2.h * task.attempt, 'time' ) } - errorStrategy = { task.exitStatus in [143,137] ? 'retry' : 'finish' } + errorStrategy = { task.exitStatus in [1,143,137,104,134,139] ? 'retry' : 'terminate' } maxRetries = 1 maxErrors = '-1' // Process-specific resource requirements - // TODO nf-core: Customise requirements for specific processes - withName: fastqc { - errorStrategy = { task.exitStatus in [143,137] ? 'retry' : 'ignore' } + withName:bowtie2_end_to_end { + cpus = { check_max( 2, 'cpus' ) } + memory = { check_max( 16.GB * task.attempt, 'memory' ) } + time = { check_max( 5.h * task.attempt, 'time' ) } } - withName: multiqc { - errorStrategy = { task.exitStatus in [143,137] ? 
'retry' : 'ignore' } + withName:bowtie2_on_trimmed_reads { + cpus = { check_max( 2, 'cpus' ) } + memory = { check_max( 16.GB * task.attempt, 'memory' ) } + time = { check_max( 5.h * task.attempt, 'time' ) } + } + withName:merge_mapping_steps { + cpus = { check_max( 1, 'cpus' ) } + memory = { check_max( 20.GB * task.attempt, 'memory' ) } + time = { check_max( 5.h * task.attempt, 'time' ) } + } + withName:trim_reads { + cpus = { check_max (1, 'cpus')} + memory = { check_max( 10.GB * task.attempt, 'memory' ) } + time = { check_max( 5.h * task.attempt, 'time' ) } } } params { // Defaults only, expecting to be overwritten - max_memory = 128.GB - max_cpus = 16 - max_time = 240.h - igenomes_base = 's3://ngi-igenomes/igenomes/' + max_memory = 20.GB + max_cpus = 1 + max_time = 24.h } diff --git a/conf/curie.config b/conf/curie.config new file mode 100644 index 0000000000000000000000000000000000000000..ab85a2d9d778ac3ca875a273e9bbcb7eb966253d --- /dev/null +++ b/conf/curie.config @@ -0,0 +1,16 @@ +singularity { + enabled = false +} + +process { + executor = 'pbs' + queue = params.queue + //beforeScript = 'export PATH=/bioinfo/pipelines/sandbox/dev/nfcore/rnaseq/modules/conda/envs/nf-core-rnaseq-1.2/bin:$PATH' +} + +params { + clusterOptions = false + max_memory = 128.GB + max_cpus = 4 + max_time = 240.h +} diff --git a/conf/hicpro.config b/conf/hicpro.config new file mode 100644 index 0000000000000000000000000000000000000000..3eafbd1a447565a8b158b0ac7f08675995e163d0 --- /dev/null +++ b/conf/hicpro.config @@ -0,0 +1,17 @@ +/* + * ------------------------------------------------- + * Nextflow config file for Genomes paths + * ------------------------------------------------- + * Defines reference genomes + * Can be used by any config that customises the base + * path using $params.genomes_base / --genomes_base + */ + +params { + bwt2_index = '/data/annotations/pipelines/Human/hg19/indexes/bowtie2/hg19' + bwt2_opts_end2end = '--very-sensitive -L 30 --score-min L,-0.6,-0.2 --end-to-end --reorder' + bwt2_opts_trimmed = '--very-sensitive -L 20 --score-min L,-0.6,-0.2 --end-to-end --reorder' + restriction_fragment_bed = '/data/users/nservant/Apps/HiC-Pro_annotation/HindIII_resfrag_hg19.bed' + chromosome_size = '/data/users/nservant/Apps/HiC-Pro_annotation/chrom_hg19.sizes' +} + diff --git a/conf/test.config b/conf/test.config index a03678b36aa77f315c7e31909247c57242922d61..e8a8e0eb440f402fa1266522a0f5805bbbccffc9 100644 --- a/conf/test.config +++ b/conf/test.config @@ -13,11 +13,8 @@ params { max_memory = 6.GB max_time = 48.h // Input data - // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets - // TODO nf-core: Give any required params for the test so that command line flags are not needed - singleEnd = false readPaths = [ - ['Testdata', ['https://github.com/nf-core/test-datasets/raw/exoseq/testdata/Testdata_R1.tiny.fastq.gz', 'https://github.com/nf-core/test-datasets/raw/exoseq/testdata/Testdata_R2.tiny.fastq.gz']], - ['SRR389222', ['https://github.com/nf-core/test-datasets/raw/methylseq/testdata/SRR389222_sub1.fastq.gz', 'https://github.com/nf-core/test-datasets/raw/methylseq/testdata/SRR389222_sub2.fastq.gz']] + ['Testdata2', ['/bioinfo/users/nservant/GIT/HiCPro/test-op/hSRR400264_00_R1.fastq.gz', '/bioinfo/users/nservant/GIT/HiCPro/test-op/hSRR400264_00_R1.fastq.gz']], + ['Testdata1', ['/bioinfo/users/nservant/GIT/HiCPro/test-op/hSRR400264_01_R1.fastq.gz', '/bioinfo/users/nservant/GIT/HiCPro/test-op/hSRR400264_01_R1.fastq.gz']], ] } diff --git a/main.nf b/main.nf index 
97a07deedf81033e2d56bd4d66631446cecbe1cd..49b3d61153cc6120d629c3731af2a02503628bde 100644 --- a/main.nf +++ b/main.nf @@ -23,6 +23,9 @@ def helpMessage() { nf-core/hic v${workflow.manifest.version} ======================================================= + This pipeline is a Nextflow version of the HiC-Pro pipeline for Hi-C data processing. + See https://github.com/nservant/HiC-Pro for details. + Usage: The typical command for running the pipeline is as follows: @@ -30,13 +33,13 @@ def helpMessage() { nextflow run nf-core/hic --reads '*_R{1,2}.fastq.gz' -profile docker Mandatory arguments: - --reads Path to input data (must be surrounded with quotes) + --readsPath Path to input data (must be surrounded with quotes) --genome Name of iGenomes reference -profile Configuration profile to use. Can use multiple (comma separated) Available: conda, docker, singularity, awsbatch, test and more. Options: - --singleEnd Specifies that the input is single end reads + References If not specified in the configuration file or you wish to overwrite any of the references. --fasta Path to Fasta reference @@ -52,7 +55,7 @@ def helpMessage() { """.stripIndent() } -/* +/********************************************************** * SET UP CONFIGURATION VARIABLES */ @@ -64,17 +67,11 @@ if (params.help){ // TODO nf-core: Add any reference files that are needed // Configurable reference genomes -fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false -if ( params.fasta ){ - fasta = file(params.fasta) - if( !fasta.exists() ) exit 1, "Fasta file not found: ${params.fasta}" -} -// -// NOTE - THIS IS NOT USED IN THIS PIPELINE, EXAMPLE ONLY -// If you want to use the above in a process, define the following: -// input: -// file fasta from fasta -// +//fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false +//if ( params.fasta ){ +// fasta = file(params.fasta) +// if( !fasta.exists() ) exit 1, "Fasta file not found: ${params.fasta}" +//} // Has the run name been specified by the user? @@ -98,30 +95,51 @@ if( workflow.profile == 'awsbatch') { ch_multiqc_config = Channel.fromPath(params.multiqc_config) ch_output_docs = Channel.fromPath("$baseDir/docs/output.md") + + + +/********************************************************** + * SET UP CHANNELS + */ + +/* + * input read files + */ +Channel + .fromFilePairs( params.readPaths ) + .ifEmpty { exit 1, "params.readPaths was empty - no input files supplied" } + .set { raw_reads_pairs } + +raw_reads = Channel.create() +raw_reads_2 = Channel.create() +Channel + .fromFilePairs( params.readPaths ) + .separate( raw_reads, raw_reads_2 ) { a -> [tuple(a[0], a[1][0]), tuple(a[0], a[1][1])] } + + +// SPlit fastq files +// https://www.nextflow.io/docs/latest/operator.html#splitfastq + /* - * Create a channel for input read files + * Other input channels */ - if(params.readPaths){ - if(params.singleEnd){ - Channel - .from(params.readPaths) - .map { row -> [ row[0], [file(row[1][0])]] } - .ifEmpty { exit 1, "params.readPaths was empty - no input files supplied" } - .into { read_files_fastqc; read_files_trimming } - } else { - Channel - .from(params.readPaths) - .map { row -> [ row[0], [file(row[1][0]), file(row[1][1])]] } - .ifEmpty { exit 1, "params.readPaths was empty - no input files supplied" } - .into { read_files_fastqc; read_files_trimming } - } - } else { - Channel - .fromFilePairs( params.reads, size: params.singleEnd ? 
1 : 2 ) - .ifEmpty { exit 1, "Cannot find any reads matching: ${params.reads}\nNB: Path needs to be enclosed in quotes!\nIf this is single-end data, please specify --singleEnd on the command line." } - .into { read_files_fastqc; read_files_trimming } - } +// Bowtie2 Index +bwt2_file = file("${params.bwt2_index}.1.bt2") +if( !bwt2_file.exists() ) exit 1, "Reference genome Bowtie 2 not found: ${params.bwt2_index}" +bwt2_index = Channel.value( "${params.bwt2_index}" ) + +// Restriction fragment +res_frag_file = Channel.value( "${params.restriction_fragment_bed}" ) + +// Chromosome size +chr_size = Channel.value( "${params.chromosome_size}" ) + + + +/********************************************************** + * SET UP LOGS + */ // Header log info log.info """======================================================= @@ -140,7 +158,7 @@ summary['Run Name'] = custom_runName ?: workflow.runName // TODO nf-core: Report custom parameters here summary['Reads'] = params.reads summary['Fasta Ref'] = params.fasta -summary['Data Type'] = params.singleEnd ? 'Single-End' : 'Paired-End' +//summary['Data Type'] = params.singleEnd ? 'Single-End' : 'Paired-End' summary['Max Memory'] = params.max_memory summary['Max CPUs'] = params.max_cpus summary['Max Time'] = params.max_time @@ -191,43 +209,174 @@ process get_software_versions { file 'software_versions_mqc.yaml' into software_versions_yaml script: - // TODO nf-core: Get all tools to print their version number here - """ + """ echo $workflow.manifest.version > v_pipeline.txt echo $workflow.nextflow.version > v_nextflow.txt - fastqc --version > v_fastqc.txt - multiqc --version > v_multiqc.txt + bowtie2 --version > v_bowtie2.txt + python --version > v_python.txt + samtools --version > v_samtools.txt scrape_software_versions.py > software_versions_mqc.yaml """ } +/**************************************************** + * MAIN WORKFLOW + */ /* - * STEP 1 - FastQC - */ -process fastqc { - tag "$name" - publishDir "${params.outdir}/fastqc", mode: 'copy', - saveAs: {filename -> filename.indexOf(".zip") > 0 ? 
"zips/$filename" : "$filename"} + * STEP 1 - Two-steps Reads Mapping +*/ - input: - set val(name), file(reads) from read_files_fastqc +raw_reads = raw_reads.concat( raw_reads_2 ) + +process bowtie2_end_to_end { + tag "$prefix" + input: + set val(sample), file(reads) from raw_reads + val bt2_index from bwt2_index + + output: + set val(prefix), file("${prefix}_unmap.fastq") into unmapped_end_to_end + set val(prefix), file("${prefix}.bam") into end_to_end_bam + + script: + prefix = reads.toString() - ~/(\.fq)?(\.fastq)?(\.gz)?$/ + def bwt2_opts = params.bwt2_opts_end2end + """ + bowtie2 --rg-id BMG --rg SM:${prefix} \\ + ${bwt2_opts} \\ + -p ${task.cpus} \\ + -x ${bt2_index} \\ + --un ${prefix}_unmap.fastq \\ + -U ${reads} | samtools view -F 4 -bS - > ${prefix}.bam + """ +} - output: - file "*_fastqc.{zip,html}" into fastqc_results +process trim_reads { + tag "$prefix" + input: + set val(prefix), file(reads) from unmapped_end_to_end - script: - """ - fastqc -q $reads - """ + output: + set val(prefix), file("${prefix}_trimmed.fastq") into trimmed_reads + + script: + """ + cutsite_trimming --fastq $reads \\ + --cutsite params.ligation_motifs \\ + --out ${prefix}_trimmed.fastq + """ +} + +process bowtie2_on_trimmed_reads { + tag "$prefix" + input: + set val(prefix), file(reads) from trimmed_reads + val bt2_index from bwt2_index + + output: + set val(prefix), file("${prefix}_trimmed.bam") into trimmed_bam + + script: + prefix = reads.toString() - ~/(_trimmed)?(\.fq)?(\.fastq)?(\.gz)?$/ + def bwt2_opts = params.bwt2_opts_trimmed + """ + bowtie2 --rg-id BMG --rg SM:${prefix} \\ + ${bwt2_opts} \\ + -p ${task.cpus} \\ + -x ${bt2_index} \\ + -U ${reads} | samtools view -bS - > ${prefix}_trimmed.bam + """ +} + +process merge_mapping_steps{ + tag "$bam1 + $bam2" + input: + set val(prefix), file(bam1), file(bam2) from end_to_end_bam.join( trimmed_bam ) + + output: + set val(sample), file("${prefix}_bwt2merged.bam") into bwt2_merged_bam + + script: + sample = prefix.toString() - ~/(_R1)?(_R2)?(_val_1)?(_val_2)?$/ + """ + samtools merge -@ ${task.cpus} \\ + -f ${prefix}_bwt2merged.bam \\ + ${bam1} ${bam2} + + samtools sort -@ ${task.cpus} -m 800M \\ + -n -T /tmp/ \\ + -o ${prefix}_bwt2merged.sorted.bam \\ + ${prefix}_bwt2merged.bam + + mv ${prefix}_bwt2merged.sorted.bam ${prefix}_bwt2merged.bam + """ } +process combine_mapped_files{ + tag "$sample = $r1_prefix + $r2_prefix" + input: + set val(sample), file(aligned_bam) from bwt2_merged_bam.groupTuple() + + output: + set val(sample), file("${sample}_bwt2pairs.bam") into paired_bam + + script: + r1_bam = aligned_bam[0] + r1_prefix = r1_bam.toString() - ~/_bwt2merged.bam$/ + r2_bam = aligned_bam[1] + r2_prefix = r2_bam.toString() - ~/_bwt2merged.bam$/ + """ + mergeSAM.py -f ${r1_bam} -r ${r2_bam} -o ${sample}_bwt2pairs.bam + """ +} /* - * STEP 2 - MultiQC - */ + * STEP2 - DETECT VALID PAIRS +*/ + +process get_valid_interaction{ + tag "$sample" + input: + set val(sample), file(pe_bam) from paired_bam + val frag_file from res_frag_file + + output: + set val(sample), file("*.validPairs") into valid_pairs + + script: + """ + mapped_2hic_fragments.py -f ${frag_file} -r ${pe_bam} + """ +} + + +/* + * STEP3 - BUILD MATRIX +*/ + +process build_contact_maps{ + tag "$sample" + input: + set val(sample), file(vpairs) from valid_pairs + val chrsize from chr_size + + output: + set val(sample), file("*.matrix") into matrix_file + + script: + """ + build_matrix --matrix-format upper --binsize 1000000 --chrsizes ${chrsize} --ifile ${vpairs} --oprefix ${sample}_1000000 + """ + 
+} + + +/* + // STEP 2 - MultiQC + process multiqc { publishDir "${params.outdir}/MultiQC", mode: 'copy' @@ -252,10 +401,8 @@ process multiqc { } +// STEP 3 - Output Description HTML -/* - * STEP 3 - Output Description HTML - */ process output_documentation { publishDir "${params.outdir}/Documentation", mode: 'copy' @@ -270,7 +417,7 @@ process output_documentation { markdown_to_html.r $output_docs results_description.html """ } - +*/ /* diff --git a/nextflow.config b/nextflow.config index 5f363c6caace1d84440605b20a729f09a6dd0d29..0e1bb10c32d84244773e60e40c97528cc34704f9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -18,7 +18,6 @@ params { // Workflow flags // TODO nf-core: Specify your pipeline's command line flags reads = "data/*{1,2}.fastq.gz" - singleEnd = false outdir = './results' // Boilerplate options @@ -42,6 +41,8 @@ includeConfig 'conf/base.config' // Load nf-core custom profiles from different Institutions includeConfig "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}/nfcore_custom.config" +includeConfig 'conf/hicpro.config' + profiles { awsbatch { includeConfig 'conf/awsbatch.config' } conda { process.conda = "$baseDir/environment.yml" }