diff --git a/CHANGELOG.md b/CHANGELOG.md index d183e93ca8a28381fc3cbf29df9f607d8321c627..d01eb83c4893a44d41e9cb18dd547cc20892d50b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ ## v1.1dev +* Support 'N' base motif in restriction/ligation sites +* Support multiple restriction enzymes/ligattion sites (comma separated) (#31) * Add --saveInteractionBAM option * Add DOI (#29) * Fix bug for reads extension _1/_2 (#30) diff --git a/bin/digest_genome.py b/bin/digest_genome.py index db2d151602269c22f3b0a837446bba30a4b442b6..ac6d8da3d9f6faa3e9c0960fbecac52ea30da61a 100755 --- a/bin/digest_genome.py +++ b/bin/digest_genome.py @@ -47,6 +47,7 @@ def find_re_sites(filename, sequences, offset): indices.sort() all_indices.append(indices) indices = [] + # This is a new chromosome. Empty the sequence string, and add the # correct chrom id big_str = "" @@ -67,6 +68,7 @@ def find_re_sites(filename, sequences, offset): for m in re.finditer(pattern, big_str)] indices.sort() all_indices.append(indices) + return contig_names, all_indices @@ -87,6 +89,22 @@ def find_chromsomose_lengths(reference_filename): return chromosome_names, np.array(chromosome_lengths) +def replaceN(cs): + npos = int(cs.find('N')) + cseql = [] + if npos!= -1: + for nuc in ["A","C","G","T"]: + tmp = cs.replace('N', nuc, 1) + tmpl = replaceN(tmp) + if type(tmpl)==list: + cseql = cseql + tmpl + else: + cseql.append(tmpl) + else: + cseql.append(cs) + return cseql + + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('fastafile') @@ -102,8 +120,13 @@ if __name__ == "__main__": filename = args.fastafile out = args.out - cutsites = args.res_sites - + + # Split restriction sites if comma-separated + cutsites=[] + for s in args.res_sites: + for m in s.split(','): + cutsites.append(m) + # process args and get restriction enzyme sequences sequences = [] offset = [] @@ -112,15 +135,34 @@ if __name__ == "__main__": cseq = ''.join(RE_cutsite[cs.lower()]) else: cseq = cs + offpos = int(cseq.find('^')) if offpos == -1: print "Unable to detect offset for", cseq print "Please, use '^' to specified the cutting position,", print "i.e A^GATCT for HindIII digestion" sys.exit(-1) + + for nuc in list(set(cs)): + if nuc != 'A' and nuc != 'C' and nuc != 'G' and nuc != 'T' and nuc != 'N' and nuc != '^': + print "Find unexpected character ['",nuc,"']in restriction motif" + print "Note that multiple motifs should be separated by a space (not a comma !)" + sys.exit(-1) + offset.append(offpos) sequences.append(re.sub('\^', '', cseq)) + # replace all N in restriction motif + sequences_without_N = [] + offset_without_N = [] + for rs in range(len(sequences)): + nrs = replaceN(sequences[rs]) + sequences_without_N = sequences_without_N + nrs + offset_without_N = offset_without_N + [offset[rs]] * len(nrs) + + sequences = sequences_without_N + offset = offset_without_N + if out is None: out = os.path.splitext(filename)[0] + "_fragments.bed" @@ -129,8 +171,7 @@ if __name__ == "__main__": print "Offset(s)", ','.join(str(x) for x in offset) # Read fasta file and look for rs per chromosome - contig_names, all_indices = find_re_sites(filename, sequences, - offset=offset) + contig_names, all_indices = find_re_sites(filename, sequences, offset=offset) _, lengths = find_chromsomose_lengths(filename) valid_fragments = [] diff --git a/docs/usage.md b/docs/usage.md index 53d922a2fddf53e8b3924b16f7edc3fe97464d73..add8b47bd337c82bdb0b2f930d7fa677208caaf1 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -296,12 +296,14 @@ Minimum mapping quality. Reads with lower quality are discarded. Default: 10 Restriction motif(s) for Hi-C digestion protocol. The restriction motif(s) is(are) used to generate the list of restriction fragments. The precise cutting site of the restriction enzyme has to be specified using the '^' character. Default: 'A^AGCTT' Here are a few examples: -* MboI: '^GATC' -* DpnII: '^GATC' -* BglII: 'A^GATCT' -* HindIII: 'A^AGCTT' +* MboI: ^GATC +* DpnII: ^GATC +* BglII: A^GATCT +* HindIII: A^AGCTT +* ARIMA kit: ^GATC,^GANT + +Note that multiples restriction motifs can be provided (comma-separated) and that 'N' base are supported. -Note that multiples restriction motifs can be provided (comma-separated). ```bash --restriction_size '[Cutting motif]' @@ -310,12 +312,15 @@ Note that multiples restriction motifs can be provided (comma-separated). #### `--ligation_site` Ligation motif after reads ligation. This motif is used for reads trimming and depends on the fill in strategy. -Note that multiple ligation sites can be specified. Default: 'AAGCTAGCTT' +Note that multiple ligation sites can be specified (comma separated) and that 'N' base is interpreted and replaced by 'A','C','G','T'. +Default: 'AAGCTAGCTT' ```bash --ligation_site '[Ligation motif]' ``` +Exemple of the ARIMA kit: GATCGATC,GATCGANT,GANTGATC,GANTGANT + #### `--min_restriction_fragment_size` Minimum size of restriction fragments to consider for the Hi-C processing. Default: ''