diff --git a/src/db_utils/db_creation.py b/src/db_utils/db_creation.py index 7a23303153909757527467ce8fc9e20818f4336a..128c20dd741d39427f16187e3115f5aed3b97ebe 100755 --- a/src/db_utils/db_creation.py +++ b/src/db_utils/db_creation.py @@ -86,6 +86,7 @@ def create_cin_gene_frequency_table(conn: sqlite3.Connection) -> None: [ft] VARCHAR(3) NOT NULL, [ft_type] VARCHAR(3) NOT NULL, [id_gene] INT NOT NULL, + [region] VARCHAR(10) NOT NULL, [frequency] FLOAT NULL, PRIMARY KEY ([id]), FOREIGN KEY ([id_gene]) REFERENCES cin_gene([id]))''') diff --git a/src/db_utils/frequency_scripts/config_freq.py b/src/db_utils/frequency_scripts/config_freq.py index c30fdf5234c7ef681340331cffdeabc5056ea194..56d0e3863093d8ee0822d69ca91a347186602c9c 100644 --- a/src/db_utils/frequency_scripts/config_freq.py +++ b/src/db_utils/frequency_scripts/config_freq.py @@ -68,9 +68,10 @@ class Config_freq: """A class containing configurations variables used in this subfolder.""" property_file = Config.data / 'chosen_scales.csv' exon_orf_bed = Config.data / 'bed' / 'exon_orf.bed' + intron_bed = Config.data / 'bed' / 'intron.bed' output = Config.results / 'frequency_table' - output_exon_file = output / 'exon_freq.txt' - output_gene_file = output / 'gene_freq.txt' + output_exon_file = output / 'exon_freq.txt.gz' + output_gene_file = output / 'gene_freq.txt.gz' iupac_dic = {"S": "[GC]", "W": "[AT]", "K": "[GT]", "M": "[AC]", "R": "[AG]", "Y": "[CT]"} nt_list = compute_nt_list() diff --git a/src/db_utils/frequency_scripts/create_n_fill_exon_frequency_file.py b/src/db_utils/frequency_scripts/create_n_fill_exon_frequency_file.py index 966106e6c820d79ce2f55d6c6b84fedab3887340..ad2b15f6fec5afef7dd8e2c69ee15bc593f0a18c 100644 --- a/src/db_utils/frequency_scripts/create_n_fill_exon_frequency_file.py +++ b/src/db_utils/frequency_scripts/create_n_fill_exon_frequency_file.py @@ -64,10 +64,24 @@ def multiprocess_freq_orf(line: str) -> pd.Series: list(Config_freq.property_dic.keys()) dic_freq = {cft: np.nan for cft in ft_list} dic_freq['id'] = exon[3] - dic_freq['length'] = exon[2] - exon[1] + dic_freq['length'] = (exon[2] - exon[1]) / 3 return pd.Series(dic_freq) +def adapt_exon_length(mseries: pd.Series) -> int: + """ + Resize exon length if needed. + + :param mseries: A dataframe line + :return: The size + """ + if mseries['ft_type'] not in ['dnt', 'tnt']: + return mseries['length'] + if mseries['ft_type'] == "dnt": + return mseries['length'] - 1 + return mseries['length'] - 2 + + def get_freq_table(ps: int, bed_file: Path, mutli_fnc: Callable, fnc_ft_type: Callable) -> pd.DataFrame: """ @@ -93,6 +107,7 @@ def get_freq_table(ps: int, bed_file: Path, mutli_fnc: Callable, df = df.melt(id_vars=["id", 'length'], var_name="ft", value_name='frequency') df['ft_type'] = df['ft'].apply(fnc_ft_type) + df['length'] = df.apply(adapt_exon_length, axis=1) df.rename(columns={'id': id_col}, inplace=True) return df @@ -110,24 +125,39 @@ def load_hg19(hg19: Path) -> Dict[str, Seq]: return dic -def compute_gene_orf_data(df: pd.DataFrame) -> pd.DataFrame: +def compute_gene_orf_data(df: pd.DataFrame, ft_type_check: str = "aa", + ft_check: str = "A") -> pd.DataFrame: """ Compute the frequency of codons, amino_acid and properties within \ genes (corresponds to the concatenation of exons). :param df: Dataframe fo exon + :param ft_type_check: The feature type used to check if exons are \ + not duplicated + :param ft_check: The feature used to check if exons are not duplicated :return: """ df['id_gene'] = df['id_exon'].str.replace(r'_\d+', '') - df_exon = df.loc[(df['ft_type'] == "aa") & (df['ft'] == 'A'), :] - res = df_exon.groupby('id_exon').count()['length'].values + df_exon = df.loc[((df['ft_type'] == 'nt') & (df['ft'] == 'A')) | + ((df['ft_type'] == 'dnt') & (df['ft'] == 'AA')) | + ((df['ft_type'] == 'tnt') & (df['ft'] == 'AAA')) | + ((df['ft_type'] == 'codon') & (df['ft'] == 'AAA')) | + ((df['ft_type'] == 'aa') & (df['ft'] == 'A')) | + ((df['ft_type'] == 'properties') & + (df['ft'] == 'Flexibility_Vihinen')), :] + df_exon_tmp = df.loc[((df['ft_type'] == 'nt') & (df['ft'] == 'A')) | + ((df['ft_type'] == 'codon') & (df['ft'] == 'AAA')), :] + res = df_exon_tmp.groupby('id_exon').count()['length'].values if np.max(res) != np.min(res) != 1: - msg = "On exon must be present only once !" + msg = "One exon must be present only once !" logging.exception(msg) raise ValueError(msg) - gene_len = df_exon[['id_gene', 'length']].groupby('id_gene').sum()\ + df_exon = df_exon[["id_gene", "ft_type", "length"]] + gene_len = df_exon[['id_gene', 'length', 'ft_type']]\ + .groupby(['id_gene', 'ft_type']).sum()\ .reset_index() - df = pd.merge(df, gene_len, how='left', on="id_gene", + print(gene_len.loc[gene_len["id_gene"] == '10006', :]) + df = pd.merge(df, gene_len, how='left', on=["id_gene", "ft_type"], suffixes=['_exon', '_gene']) df['new_freq'] = df['frequency'] * (df['length_exon'] / df['length_gene']) df_final = df[['id_gene', 'ft_type', 'ft', 'new_freq']].\ @@ -159,26 +189,53 @@ def create_or_load_freq_table(ps: int) -> Tuple[pd.DataFrame, pd.DataFrame]: logging.debug(df_orf_exon.head()) df_exon = pd.concat([df_exon, df_orf_exon], axis=0, ignore_index=True, sort=False) - df_exon.to_csv(Config_freq.output_exon_file, sep="\t", index=False) + df_exon.to_csv(Config_freq.output_exon_file, sep="\t", index=False, + compression='gzip') else: logging.debug('Loading exon_freq file ...') - df_exon = pd.read_csv(Config_freq.output_exon_file, sep="\t") + df_exon = pd.read_csv(Config_freq.output_exon_file, sep="\t", + compression='gzip') df_orf_exon = df_exon.loc[ - df_exon['ft_type'].isin(['codon', 'aa', 'properties']), :].copy() + df_exon['ft_type'].isin(['codon', 'aa', 'properties']), + :].copy() if not Config_freq.output_gene_file.is_file(): logging.debug('get frequency table gene') - df_gene = get_freq_table(ps, Config.bed_gene, - multiprocess_freq, get_ft_type) - df_gene.drop('length', inplace=True, axis=1) - logging.debug(df_gene.head()) - logging.debug('Get orf frequency for gene') - df_orf_gene = compute_gene_orf_data(df_orf_exon) - logging.debug(df_orf_gene.head()) - df_gene = pd.concat([df_gene, df_orf_gene], axis=0, ignore_index=True, + df_gene_tmp = get_freq_table(ps, Config.bed_gene, + multiprocess_freq, get_ft_type) + df_gene_tmp.drop('length', inplace=True, axis=1) + logging.debug(df_gene_tmp.head()) + df_nt_exon = df_exon.loc[ + df_exon['ft_type'].isin(['nt', 'dnt', 'tnt']), + :].copy() + logging.debug('Get mean exon frequency for gene') + df_exon_nt_gene = compute_gene_orf_data(df_nt_exon.copy(), + ft_type_check='nt', + ft_check='A') + logging.debug(df_exon_nt_gene.head()) + logging.debug('Get mean exon orf frequency for gene') + df_exon_orf_gene = compute_gene_orf_data(df_orf_exon) + logging.debug(df_exon_orf_gene.head()) + logging.debug('Get mean intron frequency for gene') + df_intron = get_freq_table(ps, Config_freq.intron_bed, + multiprocess_freq, get_ft_type) + df_intron.rename({'id_intron': 'id_exon'}, axis=1, inplace=True) + logging.debug(df_intron.head()) + df_intron_gene = compute_gene_orf_data(df_intron, ft_type_check='nt', + ft_check='A') + logging.debug(df_intron_gene.head()) + df_gene = pd.concat([df_gene_tmp, df_exon_nt_gene, df_exon_orf_gene, + df_intron_gene], + axis=0, ignore_index=True, sort=False) - df_gene.to_csv(Config_freq.output_gene_file, sep="\t", index=False) + df_gene['region'] = ['gene'] * df_gene_tmp.shape[0] \ + + ['exon'] * df_exon_nt_gene.shape[0] \ + + ['exon'] * df_exon_orf_gene.shape[0] \ + + ['intron'] * df_intron_gene.shape[0] + df_gene.to_csv(Config_freq.output_gene_file, sep="\t", index=False, + compression='gzip') else: - df_gene = pd.read_csv(Config_freq.output_gene_file, sep="\t") + df_gene = pd.read_csv(Config_freq.output_gene_file, sep="\t", + compression='gzip') HG19 = None df_exon['frequency'] = round(df_exon['frequency'], 5) df_gene['frequency'] = round(df_gene['frequency'], 5)