From 85a3af07bdc97ac40eb159147a331ff406ecee7a Mon Sep 17 00:00:00 2001
From: nservant <nservant@curie.fr>
Date: Fri, 4 Oct 2019 17:42:02 +0200
Subject: [PATCH] digest_genome.py support N bases and multiple sites

---
 CHANGELOG.md         |  2 ++
 bin/digest_genome.py | 49 ++++++++++++++++++++++++++++++++++++++++----
 docs/usage.md        | 17 +++++++++------
 3 files changed, 58 insertions(+), 10 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d183e93..d01eb83 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,8 @@
 
 ## v1.1dev
 
+* Support 'N' base motif in restriction/ligation sites
+* Support multiple restriction enzymes/ligation sites (comma separated) (#31)
 * Add --saveInteractionBAM option
 * Add DOI (#29)
 * Fix bug for reads extension _1/_2 (#30)
diff --git a/bin/digest_genome.py b/bin/digest_genome.py
index db2d151..ac6d8da 100755
--- a/bin/digest_genome.py
+++ b/bin/digest_genome.py
@@ -47,6 +47,7 @@ def find_re_sites(filename, sequences, offset):
                 indices.sort()
                 all_indices.append(indices)
                 indices = []
+
             # This is a new chromosome. Empty the sequence string, and add the
             # correct chrom id
             big_str = ""
@@ -67,6 +68,7 @@ def find_re_sites(filename, sequences, offset):
                     for m in re.finditer(pattern, big_str)]
     indices.sort()
     all_indices.append(indices)
+    
     return contig_names, all_indices
 
 
@@ -87,6 +89,22 @@ def find_chromsomose_lengths(reference_filename):
     return chromosome_names, np.array(chromosome_lengths)
 
 
+def replaceN(cs):
+    """Expand every 'N' wildcard of the motif `cs` into A, C, G and T.
+
+    Returns the list of all concrete motifs, i.e. 4**k strings for k 'N's
+    (just `[cs]` when the motif has no 'N'), ordered A, C, G, T with the
+    leftmost 'N' varying slowest.
+    """
+    motifs = [cs]
+    # Each pass substitutes the leftmost remaining 'N' of every motif, so
+    # all motifs always hold the same number of 'N's as motifs[0].
+    while 'N' in motifs[0]:
+        motifs = [m.replace('N', nuc, 1)
+                  for m in motifs for nuc in ["A", "C", "G", "T"]]
+    return motifs
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument('fastafile')
@@ -102,8 +120,13 @@ if __name__ == "__main__":
 
     filename = args.fastafile
     out = args.out
-    cutsites = args.res_sites
-
+    
+    # Split restriction sites if comma-separated
+    cutsites=[]
+    for s in args.res_sites:
+        for m in s.split(','):
+            cutsites.append(m)
+                
     # process args and get restriction enzyme sequences
     sequences = []
     offset = []
@@ -112,15 +135,34 @@ if __name__ == "__main__":
             cseq = ''.join(RE_cutsite[cs.lower()])
         else:
             cseq = cs
+
         offpos = int(cseq.find('^'))
         if offpos == -1:
             print "Unable to detect offset for", cseq
             print "Please, use '^' to specified the cutting position,",
             print "i.e A^GATCT for HindIII digestion"
             sys.exit(-1)
+
+        for nuc in set(cseq):  # check the resolved motif: cs may be an enzyme name from RE_cutsite
+            if nuc not in 'ACGTN^':
+                print "Find unexpected character ['",nuc,"']in restriction motif"
+                print "Note that multiple motifs should be separated by a space (not a comma !)"
+                sys.exit(-1)
+
         offset.append(offpos)
         sequences.append(re.sub('\^', '', cseq))
 
+    # replace all N in restriction motif
+    sequences_without_N = []
+    offset_without_N = []
+    for rs in range(len(sequences)):
+        nrs = replaceN(sequences[rs])
+        sequences_without_N = sequences_without_N + nrs
+        offset_without_N = offset_without_N + [offset[rs]] * len(nrs)
+          
+    sequences = sequences_without_N
+    offset = offset_without_N
+    
     if out is None:
         out = os.path.splitext(filename)[0] + "_fragments.bed"
 
@@ -129,8 +171,7 @@ if __name__ == "__main__":
     print "Offset(s)",  ','.join(str(x) for x in offset)
 
     # Read fasta file and look for rs per chromosome
-    contig_names, all_indices = find_re_sites(filename, sequences,
-                                              offset=offset)
+    contig_names, all_indices = find_re_sites(filename, sequences,  offset=offset)
     _, lengths = find_chromsomose_lengths(filename)
 
     valid_fragments = []
diff --git a/docs/usage.md b/docs/usage.md
index 53d922a..add8b47 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -296,12 +296,14 @@ Minimum mapping quality. Reads with lower quality are discarded. Default: 10
 Restriction motif(s) for Hi-C digestion protocol. The restriction motif(s) is(are) used to generate the list of restriction fragments.
 The precise cutting site of the restriction enzyme has to be specified using the '^' character. Default: 'A^AGCTT'
 Here are a few examples:
-* MboI: '^GATC'
-* DpnII: '^GATC'
-* BglII: 'A^GATCT'
-* HindIII: 'A^AGCTT'
+* MboI: ^GATC
+* DpnII: ^GATC
+* BglII: A^GATCT
+* HindIII: A^AGCTT
+* ARIMA kit: ^GATC,^GANT
+
+Note that multiple restriction motifs can be provided (comma-separated) and that 'N' bases are supported.
 
-Note that multiples restriction motifs can be provided (comma-separated).
 
 ```bash
 --restriction_size '[Cutting motif]'
@@ -310,12 +312,15 @@ Note that multiples restriction motifs can be provided (comma-separated).
 #### `--ligation_site`
 
 Ligation motif after reads ligation. This motif is used for reads trimming and depends on the fill in strategy.
-Note that multiple ligation sites can be specified. Default: 'AAGCTAGCTT'
+Note that multiple ligation sites can be specified (comma separated) and that 'N' base is interpreted and replaced by 'A','C','G','T'.
+Default: 'AAGCTAGCTT'
 
 ```bash
 --ligation_site '[Ligation motif]'
 ```
 
+Example for the ARIMA kit: GATCGATC,GATCGANT,GANTGATC,GANTGANT
+
 #### `--min_restriction_fragment_size`
 
 Minimum size of restriction fragments to consider for the Hi-C processing. Default: ''
-- 
GitLab