Skip to content
Snippets Groups Projects
Commit 85a3af07 authored by nservant's avatar nservant
Browse files

digest_genome.py support N bases and multiple sites

parent c977a62c
No related branches found
No related tags found
No related merge requests found
...@@ -2,6 +2,8 @@ ...@@ -2,6 +2,8 @@
## v1.1dev ## v1.1dev
* Support 'N' base motif in restriction/ligation sites
* Support multiple restriction enzymes/ligattion sites (comma separated) (#31)
* Add --saveInteractionBAM option * Add --saveInteractionBAM option
* Add DOI (#29) * Add DOI (#29)
* Fix bug for reads extension _1/_2 (#30) * Fix bug for reads extension _1/_2 (#30)
......
...@@ -47,6 +47,7 @@ def find_re_sites(filename, sequences, offset): ...@@ -47,6 +47,7 @@ def find_re_sites(filename, sequences, offset):
indices.sort() indices.sort()
all_indices.append(indices) all_indices.append(indices)
indices = [] indices = []
# This is a new chromosome. Empty the sequence string, and add the # This is a new chromosome. Empty the sequence string, and add the
# correct chrom id # correct chrom id
big_str = "" big_str = ""
...@@ -67,6 +68,7 @@ def find_re_sites(filename, sequences, offset): ...@@ -67,6 +68,7 @@ def find_re_sites(filename, sequences, offset):
for m in re.finditer(pattern, big_str)] for m in re.finditer(pattern, big_str)]
indices.sort() indices.sort()
all_indices.append(indices) all_indices.append(indices)
return contig_names, all_indices return contig_names, all_indices
...@@ -87,6 +89,22 @@ def find_chromsomose_lengths(reference_filename): ...@@ -87,6 +89,22 @@ def find_chromsomose_lengths(reference_filename):
return chromosome_names, np.array(chromosome_lengths) return chromosome_names, np.array(chromosome_lengths)
def replaceN(cs):
npos = int(cs.find('N'))
cseql = []
if npos!= -1:
for nuc in ["A","C","G","T"]:
tmp = cs.replace('N', nuc, 1)
tmpl = replaceN(tmp)
if type(tmpl)==list:
cseql = cseql + tmpl
else:
cseql.append(tmpl)
else:
cseql.append(cs)
return cseql
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('fastafile') parser.add_argument('fastafile')
...@@ -102,7 +120,12 @@ if __name__ == "__main__": ...@@ -102,7 +120,12 @@ if __name__ == "__main__":
filename = args.fastafile filename = args.fastafile
out = args.out out = args.out
cutsites = args.res_sites
# Split restriction sites if comma-separated
cutsites=[]
for s in args.res_sites:
for m in s.split(','):
cutsites.append(m)
# process args and get restriction enzyme sequences # process args and get restriction enzyme sequences
sequences = [] sequences = []
...@@ -112,15 +135,34 @@ if __name__ == "__main__": ...@@ -112,15 +135,34 @@ if __name__ == "__main__":
cseq = ''.join(RE_cutsite[cs.lower()]) cseq = ''.join(RE_cutsite[cs.lower()])
else: else:
cseq = cs cseq = cs
offpos = int(cseq.find('^')) offpos = int(cseq.find('^'))
if offpos == -1: if offpos == -1:
print "Unable to detect offset for", cseq print "Unable to detect offset for", cseq
print "Please, use '^' to specified the cutting position,", print "Please, use '^' to specified the cutting position,",
print "i.e A^GATCT for HindIII digestion" print "i.e A^GATCT for HindIII digestion"
sys.exit(-1) sys.exit(-1)
for nuc in list(set(cs)):
if nuc != 'A' and nuc != 'C' and nuc != 'G' and nuc != 'T' and nuc != 'N' and nuc != '^':
print "Find unexpected character ['",nuc,"']in restriction motif"
print "Note that multiple motifs should be separated by a space (not a comma !)"
sys.exit(-1)
offset.append(offpos) offset.append(offpos)
sequences.append(re.sub('\^', '', cseq)) sequences.append(re.sub('\^', '', cseq))
# replace all N in restriction motif
sequences_without_N = []
offset_without_N = []
for rs in range(len(sequences)):
nrs = replaceN(sequences[rs])
sequences_without_N = sequences_without_N + nrs
offset_without_N = offset_without_N + [offset[rs]] * len(nrs)
sequences = sequences_without_N
offset = offset_without_N
if out is None: if out is None:
out = os.path.splitext(filename)[0] + "_fragments.bed" out = os.path.splitext(filename)[0] + "_fragments.bed"
...@@ -129,8 +171,7 @@ if __name__ == "__main__": ...@@ -129,8 +171,7 @@ if __name__ == "__main__":
print "Offset(s)", ','.join(str(x) for x in offset) print "Offset(s)", ','.join(str(x) for x in offset)
# Read fasta file and look for rs per chromosome # Read fasta file and look for rs per chromosome
contig_names, all_indices = find_re_sites(filename, sequences, contig_names, all_indices = find_re_sites(filename, sequences, offset=offset)
offset=offset)
_, lengths = find_chromsomose_lengths(filename) _, lengths = find_chromsomose_lengths(filename)
valid_fragments = [] valid_fragments = []
......
...@@ -296,12 +296,14 @@ Minimum mapping quality. Reads with lower quality are discarded. Default: 10 ...@@ -296,12 +296,14 @@ Minimum mapping quality. Reads with lower quality are discarded. Default: 10
Restriction motif(s) for Hi-C digestion protocol. The restriction motif(s) is(are) used to generate the list of restriction fragments. Restriction motif(s) for Hi-C digestion protocol. The restriction motif(s) is(are) used to generate the list of restriction fragments.
The precise cutting site of the restriction enzyme has to be specified using the '^' character. Default: 'A^AGCTT' The precise cutting site of the restriction enzyme has to be specified using the '^' character. Default: 'A^AGCTT'
Here are a few examples: Here are a few examples:
* MboI: '^GATC' * MboI: ^GATC
* DpnII: '^GATC' * DpnII: ^GATC
* BglII: 'A^GATCT' * BglII: A^GATCT
* HindIII: 'A^AGCTT' * HindIII: A^AGCTT
* ARIMA kit: ^GATC,^GANT
Note that multiples restriction motifs can be provided (comma-separated) and that 'N' base are supported.
Note that multiples restriction motifs can be provided (comma-separated).
```bash ```bash
--restriction_size '[Cutting motif]' --restriction_size '[Cutting motif]'
...@@ -310,12 +312,15 @@ Note that multiples restriction motifs can be provided (comma-separated). ...@@ -310,12 +312,15 @@ Note that multiples restriction motifs can be provided (comma-separated).
#### `--ligation_site` #### `--ligation_site`
Ligation motif after reads ligation. This motif is used for reads trimming and depends on the fill in strategy. Ligation motif after reads ligation. This motif is used for reads trimming and depends on the fill in strategy.
Note that multiple ligation sites can be specified. Default: 'AAGCTAGCTT' Note that multiple ligation sites can be specified (comma separated) and that 'N' base is interpreted and replaced by 'A','C','G','T'.
Default: 'AAGCTAGCTT'
```bash ```bash
--ligation_site '[Ligation motif]' --ligation_site '[Ligation motif]'
``` ```
Exemple of the ARIMA kit: GATCGATC,GATCGANT,GANTGATC,GANTGANT
#### `--min_restriction_fragment_size` #### `--min_restriction_fragment_size`
Minimum size of restriction fragments to consider for the Hi-C processing. Default: '' Minimum size of restriction fragments to consider for the Hi-C processing. Default: ''
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment