diff --git a/src/db_utils/frequency_scripts/create_n_fill_exon_frequency_file.py b/src/db_utils/frequency_scripts/create_n_fill_exon_frequency_file.py index 4b6c8d382acd1a39a29079ac6e607f7cab32800d..14173c24c23a79a360ce572722879934ad8656c4 100644 --- a/src/db_utils/frequency_scripts/create_n_fill_exon_frequency_file.py +++ b/src/db_utils/frequency_scripts/create_n_fill_exon_frequency_file.py @@ -10,77 +10,90 @@ from Bio import SeqIO from Bio.Seq import Seq import logging import logging.config -from typing import Dict, List, Tuple +from typing import Dict, List, Tuple, Callable from .config_freq import Config_freq, Config from pathlib import Path import pandas as pd -from .frequency_function import concat_dic, compute_dic, get_ft_type +from .frequency_function import compute_dic, get_ft_type from .orf_frequency_functions import mytranslate, get_codon_freq, \ get_aa_freq, get_mean_propensity_score, get_orf_ft_type from ...logging_conf import logging_def from ..populate_database import populate_df from tqdm import tqdm +import multiprocessing as mp +import numpy as np -def get_freq_table(bed_file: Path, dic_seq: Dict[str, Seq]) -> pd.DataFrame: +HG19 = {"void": Seq("")} # Global variable thta will store hg19 genome + + +def multiprocess_freq(line: str) -> pd.Series: """ - Create a table with new frequency files. + Get the frequency for the region given by the line of a bed file. - :param bed_file: a bed file - :param dic_seq: Dictionary of sequences + :param line: A line of a bed file + :return: The frequency values """ - logging.info(f"Files {bed_file}") - id_col = f'id_{bed_file.name.replace(".bed", "")}' - dic = {id_col: []} - num_lines = sum(1 for _ in bed_file.open("r")) - with bed_file.open("r") as inbed: - pbar = tqdm(inbed, desc=f'Processing {bed_file.name}', total=num_lines) - for line in pbar: - if "#" not in line[0]: - line = line.replace("\n", "") - sline: List[str] = line.split("\t") - if int(sline[2]) - int(sline[1]) > 2: - coord = [sline[0], sline[1], sline[2], sline[5]] - dic_freq = compute_dic(dic_seq, coord, Config_freq.nt_list) - dic = concat_dic(dic, dic_freq, sline[3], id_col) - df = pd.DataFrame(dic) - df = df.melt(id_vars=[id_col], var_name="ft", value_name='frequency') - df['ft_type'] = df['ft'].apply(get_ft_type) - return df + line = line.replace("\n", "") + sline: List[str] = line.split("\t") + coord = [sline[0], sline[1], sline[2], sline[5]] + dic_freq = compute_dic(HG19, coord, Config_freq.nt_list) + dic_freq['id'] = sline[3] + dic_freq['length'] = int(sline[2]) - int(sline[1]) + return pd.Series(dic_freq) + + +def multiprocess_freq_orf(line: str) -> pd.Series: + exon: List = line.replace('\n', '').split('\t') + exon[1] = int(exon[1]) + exon[2] = int(exon[2]) + if (exon[2] - exon[1]) % 3 != 0: + msg = f"The length of {exon[3]} is not a multiple of 3 !" + logging.exception(msg) + raise IndexError(msg) + nt_seq, aa_seq = mytranslate(HG19, [exon[0], exon[1], + exon[2], exon[5]]) + if len(aa_seq) > 0 and "*" not in aa_seq: + dic_codon = get_codon_freq(nt_seq, Config_freq.codon_list) + dic_aa = get_aa_freq(aa_seq, Config_freq.aa_list) + dic_prop = get_mean_propensity_score(aa_seq, + Config_freq.property_dic) + dic_freq = dict(dic_codon, **dic_aa, **dic_prop) + else: + ft_list = Config_freq.codon_list + Config_freq.aa_list + \ + Config_freq.property_dic + dic_freq = {cft: np.nan for cft in ft_list} + dic_freq['id'] = exon[3] + dic_freq['length'] = exon[2] - exon[1] + return pd.Series(dic_freq) -def get_orf_freq_table(bed_orf: Path, dic_seq: Dict[str, Seq]): +def get_freq_table(ps: int, bed_file: Path, mutli_fnc: Callable, + fnc_ft_type: Callable) -> pd.DataFrame: """ - Write an orf bed exon file. + Create a table with new frequency files. - :param bed_orf: a bed file containing exons orf - :param dic_seq:a dictionary containing chromosome sequences. + :param bed_file: a bed file + :param ps: The number of process to create + :param mutli_fnc: Function to execute + :param fnc_ft_type: The function to get ft type name """ - id_col = 'id_exon' - dic = {id_col: []} - num_lines = sum(1 for _ in bed_orf.open("r")) - with bed_orf.open("r") as infile: - pbar = tqdm(infile, desc=f'Processing {bed_orf.name}', total=num_lines) - for line in pbar: - exon: List = line.replace('\n', '').split('\t') - exon[1] = int(exon[1]) - exon[2] = int(exon[2]) - if (exon[2] - exon[1]) % 3 != 0: - msg = f"The length of {exon[3]} is not a multiple of 3 !" - logging.exception(msg) - raise IndexError(msg) - nt_seq, aa_seq = mytranslate(dic_seq, [exon[0], exon[1], - exon[2], exon[5]]) - if len(aa_seq) > 0 and "*" not in aa_seq: - dic_codon = get_codon_freq(nt_seq, Config_freq.codon_list) - dic_aa = get_aa_freq(aa_seq, Config_freq.aa_list) - dic_prop = get_mean_propensity_score(aa_seq, - Config_freq.property_dic) - dic_freq = dict(dic_codon, **dic_aa, **dic_prop) - dic = concat_dic(dic, dic_freq, exon[3], id_col) - df = pd.DataFrame(dic) - df = df.melt(id_vars=[id_col], var_name="ft", value_name='frequency') - df['ft_type'] = df['ft'].apply(get_orf_ft_type) + id_col = f'id_{bed_file.name.replace(".bed", "").replace("_orf", "")}' + pool = mp.Pool(processes=ps) + processes = [] + with bed_file.open("r") as inbed: + for line in inbed: + arg = [line] + processes.append(pool.apply_async(mutli_fnc, arg)) + my_res = [] + pbar = tqdm(processes, desc=f'Processing {bed_file.name}') + for proc in pbar: + my_res.append(proc.get(timeout=None)) + df = pd.DataFrame(my_res) + df = df.melt(id_vars=["id", 'length'], var_name="ft", + value_name='frequency') + df['ft_type'] = df['ft'].apply(fnc_ft_type) + df.rename(columns={'id': id_col}, inplace=True) return df @@ -97,43 +110,79 @@ def load_hg19(hg19: Path) -> Dict[str, Seq]: return dic -def create_or_load_freq_table() -> Tuple[pd.DataFrame, pd.DataFrame]: +def compute_gene_orf_data(df: pd.DataFrame) -> pd.DataFrame: + """ + Compute the frequency of codons, amino_acid and properties within \ + genes (corresponds to the concatenation of exons). + + :param df: Dataframe fo exon + :return: + """ + df['id_gene'] = df['id_exon'].str.replace(r'_\d+', '') + gene_len = df[['id_gene', 'length']].groupby('id_gene').sum().reset_index() + df = pd.merge(df, gene_len, how='left', on="id_gene", + suffixes=['_exon', '_gene']) + df['new_freq'] = df['frequency'] * (df['length_exon'] / df['length_gene']) + df_final = df[['id_gene', 'ft_type', 'ft', 'new_freq']].\ + groupby(['id_gene', 'ft_type', 'ft']).sum().reset_index() + return df_final + + +def create_or_load_freq_table(ps: int) -> Tuple[pd.DataFrame, pd.DataFrame]: """ Create frequency tables for gene and exons. + + :param ps: The number of processes to use to compute the frequency \ + table """ + global HG19 if not Config_freq.output_exon_file.is_file() \ or not Config_freq.output_gene_file.is_file(): logging.debug('loading hg19 genome') - dic_seq = load_hg19(Config_freq.hg19) + HG19 = load_hg19(Config_freq.hg19) if not Config_freq.output_exon_file.is_file(): logging.debug('get frequency table for exon') - df_exon = get_freq_table(Config.bed_exon, dic_seq) + df_exon = get_freq_table(ps, Config.bed_exon, + multiprocess_freq, get_ft_type) logging.debug(df_exon.head()) logging.debug('Get orf frequency for exon') - df_orf_exon = get_orf_freq_table(Config_freq.exon_orf_bed, dic_seq) + df_orf_exon = get_freq_table(ps, Config_freq.exon_orf_bed, + multiprocess_freq_orf, get_orf_ft_type) logging.debug(df_orf_exon.head()) - df_exon = pd.concat([df_exon, df_orf_exon], axis=0, ignore_index=True) + df_exon = pd.concat([df_exon, df_orf_exon], axis=0, ignore_index=True, + sort=False) else: logging.debug('Loading exon_freq file ...') df_exon = pd.read_csv(Config_freq.output_exon_file, sep="\t") + df_orf_exon = df_exon[ + df_exon['ft_type'].isin('codon', 'aa', 'properties'), :] if not Config_freq.output_gene_file.is_file(): logging.debug('get frequency table gene') - df_gene = get_freq_table(Config.bed_gene, dic_seq) + df_gene = get_freq_table(ps, Config.bed_gene, + multiprocess_freq, get_ft_type) logging.debug(df_gene.head()) + logging.debug('Get orf frequency for gene') + df_orf_gene = compute_gene_orf_data(df_orf_exon) + logging.debug(df_orf_gene.head()) + df_gene = pd.concat([df_exon, df_gene], axis=0, ignore_index=True, + sort=False) else: df_gene = pd.read_csv(Config_freq.output_gene_file, sep="\t") + HG19 = None return df_exon, df_gene -def fill_frequency_tables(logging_level: str = 'DISABLE'): +def fill_frequency_tables(ps: int = 1, logging_level: str = 'DISABLE'): """ Fill the frequency tables of exon and gene. + :param ps: the number of process to create :param logging_level: The level of information to display """ logging_def(Config_freq.output, __file__, logging_level) - df_exon, df_gene = create_or_load_freq_table() - exit(1) + df_exon, df_gene = create_or_load_freq_table(ps) + df_exon.to_csv(Config_freq.output_exon_file, sep="\t", index=False) + df_gene.to_csv(Config_freq.output_gene_file, sep="\t", index=False) logging.debug('Filling cin_exon_frequency') populate_df(table='cin_exon_frequency', df=df_exon, clean='y') logging.debug('Filling cin_gene_frequency') @@ -141,4 +190,4 @@ def fill_frequency_tables(logging_level: str = 'DISABLE'): if __name__ == "__main__": - fill_frequency_tables('DEBUG') + fill_frequency_tables(1, 'DEBUG')