Skip to content
Snippets Groups Projects
Commit 85a3af07 authored by nservant's avatar nservant
Browse files

digest_genome.py support N bases and multiple sites

parent c977a62c
No related branches found
No related tags found
No related merge requests found
......@@ -2,6 +2,8 @@
## v1.1dev
* Support 'N' base motif in restriction/ligation sites
* Support multiple restriction enzymes/ligattion sites (comma separated) (#31)
* Add --saveInteractionBAM option
* Add DOI (#29)
* Fix bug for reads extension _1/_2 (#30)
......
......@@ -47,6 +47,7 @@ def find_re_sites(filename, sequences, offset):
indices.sort()
all_indices.append(indices)
indices = []
# This is a new chromosome. Empty the sequence string, and add the
# correct chrom id
big_str = ""
......@@ -67,6 +68,7 @@ def find_re_sites(filename, sequences, offset):
for m in re.finditer(pattern, big_str)]
indices.sort()
all_indices.append(indices)
return contig_names, all_indices
......@@ -87,6 +89,22 @@ def find_chromsomose_lengths(reference_filename):
return chromosome_names, np.array(chromosome_lengths)
def replaceN(cs):
npos = int(cs.find('N'))
cseql = []
if npos!= -1:
for nuc in ["A","C","G","T"]:
tmp = cs.replace('N', nuc, 1)
tmpl = replaceN(tmp)
if type(tmpl)==list:
cseql = cseql + tmpl
else:
cseql.append(tmpl)
else:
cseql.append(cs)
return cseql
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('fastafile')
......@@ -102,8 +120,13 @@ if __name__ == "__main__":
filename = args.fastafile
out = args.out
cutsites = args.res_sites
# Split restriction sites if comma-separated
cutsites=[]
for s in args.res_sites:
for m in s.split(','):
cutsites.append(m)
# process args and get restriction enzyme sequences
sequences = []
offset = []
......@@ -112,15 +135,34 @@ if __name__ == "__main__":
cseq = ''.join(RE_cutsite[cs.lower()])
else:
cseq = cs
offpos = int(cseq.find('^'))
if offpos == -1:
print "Unable to detect offset for", cseq
print "Please, use '^' to specified the cutting position,",
print "i.e A^GATCT for HindIII digestion"
sys.exit(-1)
for nuc in list(set(cs)):
if nuc != 'A' and nuc != 'C' and nuc != 'G' and nuc != 'T' and nuc != 'N' and nuc != '^':
print "Find unexpected character ['",nuc,"']in restriction motif"
print "Note that multiple motifs should be separated by a space (not a comma !)"
sys.exit(-1)
offset.append(offpos)
sequences.append(re.sub('\^', '', cseq))
# replace all N in restriction motif
sequences_without_N = []
offset_without_N = []
for rs in range(len(sequences)):
nrs = replaceN(sequences[rs])
sequences_without_N = sequences_without_N + nrs
offset_without_N = offset_without_N + [offset[rs]] * len(nrs)
sequences = sequences_without_N
offset = offset_without_N
if out is None:
out = os.path.splitext(filename)[0] + "_fragments.bed"
......@@ -129,8 +171,7 @@ if __name__ == "__main__":
print "Offset(s)", ','.join(str(x) for x in offset)
# Read fasta file and look for rs per chromosome
contig_names, all_indices = find_re_sites(filename, sequences,
offset=offset)
contig_names, all_indices = find_re_sites(filename, sequences, offset=offset)
_, lengths = find_chromsomose_lengths(filename)
valid_fragments = []
......
......@@ -296,12 +296,14 @@ Minimum mapping quality. Reads with lower quality are discarded. Default: 10
Restriction motif(s) for Hi-C digestion protocol. The restriction motif(s) is(are) used to generate the list of restriction fragments.
The precise cutting site of the restriction enzyme has to be specified using the '^' character. Default: 'A^AGCTT'
Here are a few examples:
* MboI: '^GATC'
* DpnII: '^GATC'
* BglII: 'A^GATCT'
* HindIII: 'A^AGCTT'
* MboI: ^GATC
* DpnII: ^GATC
* BglII: A^GATCT
* HindIII: A^AGCTT
* ARIMA kit: ^GATC,^GANT
Note that multiples restriction motifs can be provided (comma-separated) and that 'N' base are supported.
Note that multiples restriction motifs can be provided (comma-separated).
```bash
--restriction_size '[Cutting motif]'
......@@ -310,12 +312,15 @@ Note that multiples restriction motifs can be provided (comma-separated).
#### `--ligation_site`
Ligation motif after reads ligation. This motif is used for reads trimming and depends on the fill in strategy.
Note that multiple ligation sites can be specified. Default: 'AAGCTAGCTT'
Note that multiple ligation sites can be specified (comma separated) and that 'N' base is interpreted and replaced by 'A','C','G','T'.
Default: 'AAGCTAGCTT'
```bash
--ligation_site '[Ligation motif]'
```
Exemple of the ARIMA kit: GATCGATC,GATCGANT,GANTGATC,GANTGANT
#### `--min_restriction_fragment_size`
Minimum size of restriction fragments to consider for the Hi-C processing. Default: ''
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment