Skip to content
Snippets Groups Projects
Commit 54555bf4 authored by nfontrod
Browse files

src/db_utils/frequency_scripts/create_n_fill_exon_frequency_file.py :...

src/db_utils/frequency_scripts/create_n_fill_exon_frequency_file.py : modification of compute_gene_orf_data to compute codon/aa/properties frequencies at gene level
parent 9f5e8774
No related branches found
No related tags found
No related merge requests found
......@@ -119,14 +119,20 @@ def compute_gene_orf_data(df: pd.DataFrame) -> pd.DataFrame:
:return:
"""
df['id_gene'] = df['id_exon'].str.replace(r'_\d+', '')
gene_len = df[['id_gene', 'length']].groupby('id_gene').sum().reset_index()
df_exon = df.loc[(df['ft_type'] == "aa") & (df['ft'] == 'A'), :]
res = df_exon.groupby('id_exon').count()['length'].values
if np.max(res) != np.min(res) != 1:
msg = "On exon must be present only once !"
logging.exception(msg)
raise ValueError(msg)
gene_len = df_exon[['id_gene', 'length']].groupby('id_gene').sum()\
.reset_index()
df = pd.merge(df, gene_len, how='left', on="id_gene",
suffixes=['_exon', '_gene'])
df['new_freq'] = df['frequency'] * (df['length_exon'] / df['length_gene'])
df_final = df[['id_gene', 'ft_type', 'ft', 'new_freq']].\
groupby(['id_gene', 'ft_type', 'ft']).sum().reset_index()
df_final.rename(columns={"new_freq": 'frequency'}, inplace=True)
df_final['frequency'] = round(df_final['frequency'], 5)
return df_final
......@@ -174,6 +180,8 @@ def create_or_load_freq_table(ps: int) -> Tuple[pd.DataFrame, pd.DataFrame]:
else:
df_gene = pd.read_csv(Config_freq.output_gene_file, sep="\t")
HG19 = None
df_exon['frequency'] = round(df_exon['frequency'], 5)
df_gene['frequency'] = round(df_gene['frequency'], 5)
return df_exon, df_gene
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment