diff --git a/src/db_utils/frequency_scripts/config_freq.py b/src/db_utils/frequency_scripts/config_freq.py index 4f70c3c4c8feb7663e9ee9b87e91210df02b1fdb..c30fdf5234c7ef681340331cffdeabc5056ea194 100644 --- a/src/db_utils/frequency_scripts/config_freq.py +++ b/src/db_utils/frequency_scripts/config_freq.py @@ -71,8 +71,8 @@ class Config_freq: output = Config.results / 'frequency_table' output_exon_file = output / 'exon_freq.txt' output_gene_file = output / 'gene_freq.txt' - iupac_dic = {"S": ["G", "C"], "W": ["A", "T"], "K": ["G", "T"], - "M": ["A", "C"], "R": ["A", "G"], "Y": ["C", "T"]} + iupac_dic = {"S": "[GC]", "W": "[AT]", "K": "[GT]", + "M": "[AC]", "R": "[AG]", "Y": "[CT]"} nt_list = compute_nt_list() aa_list = list("RHKDESTNQCGPAVILMFYW") codon_list = [nt for nt in nt_list if len(nt) == 3] diff --git a/src/db_utils/frequency_scripts/create_n_fill_exon_frequency_file.py b/src/db_utils/frequency_scripts/create_n_fill_exon_frequency_file.py index 532113693f70e3f42c5f957ffd511c79527195a5..966106e6c820d79ce2f55d6c6b84fedab3887340 100644 --- a/src/db_utils/frequency_scripts/create_n_fill_exon_frequency_file.py +++ b/src/db_utils/frequency_scripts/create_n_fill_exon_frequency_file.py @@ -119,14 +119,20 @@ def compute_gene_orf_data(df: pd.DataFrame) -> pd.DataFrame: :return: """ df['id_gene'] = df['id_exon'].str.replace(r'_\d+', '') - gene_len = df[['id_gene', 'length']].groupby('id_gene').sum().reset_index() + df_exon = df.loc[(df['ft_type'] == "aa") & (df['ft'] == 'A'), :] + res = df_exon.groupby('id_exon').count()['length'].values + if np.max(res) != np.min(res) != 1: + msg = "On exon must be present only once !" + logging.exception(msg) + raise ValueError(msg) + gene_len = df_exon[['id_gene', 'length']].groupby('id_gene').sum()\ + .reset_index() df = pd.merge(df, gene_len, how='left', on="id_gene", suffixes=['_exon', '_gene']) df['new_freq'] = df['frequency'] * (df['length_exon'] / df['length_gene']) df_final = df[['id_gene', 'ft_type', 'ft', 'new_freq']].\ groupby(['id_gene', 'ft_type', 'ft']).sum().reset_index() df_final.rename(columns={"new_freq": 'frequency'}, inplace=True) - df_final['frequency'] = round(df_final['frequency'], 5) return df_final @@ -174,6 +180,8 @@ def create_or_load_freq_table(ps: int) -> Tuple[pd.DataFrame, pd.DataFrame]: else: df_gene = pd.read_csv(Config_freq.output_gene_file, sep="\t") HG19 = None + df_exon['frequency'] = round(df_exon['frequency'], 5) + df_gene['frequency'] = round(df_gene['frequency'], 5) return df_exon, df_gene diff --git a/src/db_utils/frequency_scripts/frequency_function.py b/src/db_utils/frequency_scripts/frequency_function.py index 4e8c0caca92f83a5a9a20549e707b613dab54c5a..19bc7a2e548af8cb5f8c21ff923f7b58c1692f78 100644 --- a/src/db_utils/frequency_scripts/frequency_function.py +++ b/src/db_utils/frequency_scripts/frequency_function.py @@ -12,6 +12,7 @@ from typing import Iterable, Dict, List import logging from Bio.Seq import Seq import numpy as np +import regex as re def frequencies(sequence: str, nt_list: Iterable[str]) -> Dict[str, float]: @@ -24,23 +25,16 @@ def frequencies(sequence: str, nt_list: Iterable[str]) -> Dict[str, float]: """ if not full_defined(sequence) or len(sequence) < 3: return {nt: np.nan for nt in nt_list} - dic = {nt: 0. for nt in nt_list} seql = len(sequence) - for i, n in enumerate(sequence): - for nt in nt_list: - ntl = len(nt) - if ntl == 1: - if nt in ["S", "W", "K", "M", "R", "Y"]: - if n in Config_freq.iupac_dic[nt]: - dic[nt] += 1 / seql * 100 - else: - if n == nt: - dic[nt] += 1 / seql * 100 - else: - if sequence[i:i + ntl] == nt: - dic[nt] += 1 / (seql - ntl + 1) * 100 - for nt in nt_list: - dic[nt] = round(dic[nt], 5) + dic = {} + for n in nt_list: + if n in Config_freq.iupac_dic.keys(): + pat = re.compile(Config_freq.iupac_dic[n]) + else: + pat = re.compile(n) + seqlen = seql - len(n) + 1 + dic[n] = round((len(list(re.findall(pat, sequence, overlapped=True))) + / seqlen) * 100, 5) return dic @@ -57,7 +51,7 @@ def compute_dic(dic_seq: Dict[str, Seq], coord: List[str], sequence = dic_seq[coord[0]][int(coord[1]): int(coord[2])] if coord[3] == "-": sequence = sequence.reverse_complement() - return frequencies(sequence, nt_list) + return frequencies(str(sequence), nt_list) def get_ft_type(ft: str) -> str: