Skip to content
Snippets Groups Projects
Commit d95d4c87 authored by alapendr
Browse files

Merge branch 'master' of gitbio.ens-lyon.fr:LBMC/regards/chia-pet_network

parents c09cedd4 302c2c0e
No related branches found
No related tags found
No related merge requests found
......@@ -71,8 +71,8 @@ class Config_freq:
output = Config.results / 'frequency_table'
output_exon_file = output / 'exon_freq.txt'
output_gene_file = output / 'gene_freq.txt'
iupac_dic = {"S": ["G", "C"], "W": ["A", "T"], "K": ["G", "T"],
"M": ["A", "C"], "R": ["A", "G"], "Y": ["C", "T"]}
iupac_dic = {"S": "[GC]", "W": "[AT]", "K": "[GT]",
"M": "[AC]", "R": "[AG]", "Y": "[CT]"}
nt_list = compute_nt_list()
aa_list = list("RHKDESTNQCGPAVILMFYW")
codon_list = [nt for nt in nt_list if len(nt) == 3]
......
......@@ -119,14 +119,20 @@ def compute_gene_orf_data(df: pd.DataFrame) -> pd.DataFrame:
:return:
"""
df['id_gene'] = df['id_exon'].str.replace(r'_\d+', '')
gene_len = df[['id_gene', 'length']].groupby('id_gene').sum().reset_index()
df_exon = df.loc[(df['ft_type'] == "aa") & (df['ft'] == 'A'), :]
res = df_exon.groupby('id_exon').count()['length'].values
if np.max(res) != np.min(res) != 1:
msg = "On exon must be present only once !"
logging.exception(msg)
raise ValueError(msg)
gene_len = df_exon[['id_gene', 'length']].groupby('id_gene').sum()\
.reset_index()
df = pd.merge(df, gene_len, how='left', on="id_gene",
suffixes=['_exon', '_gene'])
df['new_freq'] = df['frequency'] * (df['length_exon'] / df['length_gene'])
df_final = df[['id_gene', 'ft_type', 'ft', 'new_freq']].\
groupby(['id_gene', 'ft_type', 'ft']).sum().reset_index()
df_final.rename(columns={"new_freq": 'frequency'}, inplace=True)
df_final['frequency'] = round(df_final['frequency'], 5)
return df_final
......@@ -174,6 +180,8 @@ def create_or_load_freq_table(ps: int) -> Tuple[pd.DataFrame, pd.DataFrame]:
else:
df_gene = pd.read_csv(Config_freq.output_gene_file, sep="\t")
HG19 = None
df_exon['frequency'] = round(df_exon['frequency'], 5)
df_gene['frequency'] = round(df_gene['frequency'], 5)
return df_exon, df_gene
......
......@@ -12,6 +12,7 @@ from typing import Iterable, Dict, List
import logging
from Bio.Seq import Seq
import numpy as np
import regex as re
def frequencies(sequence: str, nt_list: Iterable[str]) -> Dict[str, float]:
......@@ -24,23 +25,16 @@ def frequencies(sequence: str, nt_list: Iterable[str]) -> Dict[str, float]:
"""
if not full_defined(sequence) or len(sequence) < 3:
return {nt: np.nan for nt in nt_list}
dic = {nt: 0. for nt in nt_list}
seql = len(sequence)
for i, n in enumerate(sequence):
for nt in nt_list:
ntl = len(nt)
if ntl == 1:
if nt in ["S", "W", "K", "M", "R", "Y"]:
if n in Config_freq.iupac_dic[nt]:
dic[nt] += 1 / seql * 100
else:
if n == nt:
dic[nt] += 1 / seql * 100
else:
if sequence[i:i + ntl] == nt:
dic[nt] += 1 / (seql - ntl + 1) * 100
for nt in nt_list:
dic[nt] = round(dic[nt], 5)
dic = {}
for n in nt_list:
if n in Config_freq.iupac_dic.keys():
pat = re.compile(Config_freq.iupac_dic[n])
else:
pat = re.compile(n)
seqlen = seql - len(n) + 1
dic[n] = round((len(list(re.findall(pat, sequence, overlapped=True)))
/ seqlen) * 100, 5)
return dic
......@@ -57,7 +51,7 @@ def compute_dic(dic_seq: Dict[str, Seq], coord: List[str],
sequence = dic_seq[coord[0]][int(coord[1]): int(coord[2])]
if coord[3] == "-":
sequence = sequence.reverse_complement()
return frequencies(sequence, nt_list)
return frequencies(str(sequence), nt_list)
def get_ft_type(ft: str) -> str:
......
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment