From 9f5e877412776879036f6ab947a24ad522710249 Mon Sep 17 00:00:00 2001
From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr>
Date: Mon, 9 Mar 2020 15:34:12 +0100
Subject: [PATCH] src/db_utils/frequency_scripts/frequency_function.py:
 modification of frequencies for better performances

---
 .../frequency_scripts/frequency_function.py   | 28 ++++++++-----------
 1 file changed, 11 insertions(+), 17 deletions(-)

diff --git a/src/db_utils/frequency_scripts/frequency_function.py b/src/db_utils/frequency_scripts/frequency_function.py
index 4e8c0cac..19bc7a2e 100644
--- a/src/db_utils/frequency_scripts/frequency_function.py
+++ b/src/db_utils/frequency_scripts/frequency_function.py
@@ -12,6 +12,7 @@ from typing import Iterable, Dict, List
 import logging
 from Bio.Seq import Seq
 import numpy as np
+import regex as re
 
 
 def frequencies(sequence: str, nt_list: Iterable[str]) -> Dict[str, float]:
@@ -24,23 +25,16 @@ def frequencies(sequence: str, nt_list: Iterable[str]) -> Dict[str, float]:
     """
     if not full_defined(sequence) or len(sequence) < 3:
         return {nt: np.nan for nt in nt_list}
-    dic = {nt: 0. for nt in nt_list}
     seql = len(sequence)
-    for i, n in enumerate(sequence):
-        for nt in nt_list:
-            ntl = len(nt)
-            if ntl == 1:
-                if nt in ["S", "W", "K", "M", "R", "Y"]:
-                    if n in Config_freq.iupac_dic[nt]:
-                        dic[nt] += 1 / seql * 100
-                else:
-                    if n == nt:
-                        dic[nt] += 1 / seql * 100
-            else:
-                if sequence[i:i + ntl] == nt:
-                    dic[nt] += 1 / (seql - ntl + 1) * 100
-    for nt in nt_list:
-        dic[nt] = round(dic[nt], 5)
+    dic = {}
+    for n in nt_list:
+        if n in Config_freq.iupac_dic.keys():
+            pat = re.compile(Config_freq.iupac_dic[n])
+        else:
+            pat = re.compile(n)
+        seqlen = seql - len(n) + 1
+        dic[n] = round((len(list(re.findall(pat, sequence, overlapped=True)))
+                        / seqlen) * 100, 5)
     return dic
 
 
@@ -57,7 +51,7 @@ def compute_dic(dic_seq: Dict[str, Seq], coord: List[str],
     sequence = dic_seq[coord[0]][int(coord[1]): int(coord[2])]
     if coord[3] == "-":
         sequence = sequence.reverse_complement()
-    return frequencies(sequence, nt_list)
+    return frequencies(str(sequence), nt_list)
 
 
 def get_ft_type(ft: str) -> str:
-- 
GitLab