# NOTE(review): this is the post-change side of the patch, laid out as regular
# Python. The module header that precedes the hunk (shebang / module
# docstring) is outside the reviewed span and is not reproduced here.
import logging
from pathlib import Path

import pandas as pd

from ..logging_conf import logging_def
from .config import Config
from .populate_database import populate_df


def get_genomic_region(bed_file: Path) -> pd.DataFrame:
    """
    Load a bed file as a dataframe and keep regions longer than 2 nt.

    :param bed_file: A tab-separated bed file with six columns
    :return: The bed file as a dataframe with the columns chromosome, \
    start, stop, id, score and strand
    """
    # NOTE(review): read_csv is used with its default header inference, so
    # the first line of the file is consumed as a header row before the
    # column names are overwritten below. Confirm the bed files really start
    # with a header line, otherwise the first record is silently lost.
    df = pd.read_csv(bed_file, sep="\t")
    df.columns = ["chromosome", "start", "stop", "id", "score", "strand"]
    # Return an independent copy: callers add and drop columns on the result,
    # which must not be done on a view of the unfiltered frame.
    return df.loc[(df.stop - df.start > 2), :].copy()


def get_gene_table() -> pd.DataFrame:
    """
    Create the gene table.

    :return: The gene table, i.e. the content of Config.bed_gene where the \
    fifth column is named 'name' instead of 'score'
    """
    logging.debug("Load the bed file of genes")
    df_gene = get_genomic_region(Config.bed_gene)
    # In the gene bed file the fifth column is used as the gene name.
    df_gene.columns = ["chromosome", "start", "stop", "id", "name", "strand"]
    logging.debug(df_gene.head())
    return df_gene


def get_exon_table(df_gene_name: pd.DataFrame) -> pd.DataFrame:
    """
    Create the exon table.

    :param df_gene_name: A two-column dataframe: gene id then gene name
    :return: The exon table. It contains the columns of the exon bed file \
    (without 'score') plus 'id_gene', 'pos' and 'name' (gene name and exon \
    position joined by '_')
    """
    # Work on a copy so the caller's dataframe is left untouched.
    gene_names = df_gene_name.copy()
    gene_names.columns = ['id_gene', 'gene_name']
    # The cast must be assigned back: astype returns a new Series.
    gene_names['id_gene'] = gene_names['id_gene'].astype(int)
    logging.debug("Load the bed file of exons")
    # get_genomic_region already removed the regions of 2 nt or less.
    df_exon = get_genomic_region(Config.bed_exon)
    logging.debug(df_exon.head())
    df_exon = df_exon.drop(columns='score')
    logging.debug('Creating id_gene column ...')
    # Exon ids look like '<gene id>_<exon position>'. regex=True is required:
    # since pandas 2.0 str.replace treats the pattern literally by default.
    df_exon['id_gene'] = df_exon['id'].str.replace(
        r'_\d+$', '', regex=True).astype(int)
    logging.debug(df_exon.head())
    logging.debug("Creation of a column pos")
    df_exon['pos'] = df_exon['id'].str.replace(
        r'^\d+_', '', regex=True).astype(int)
    logging.debug(df_exon.head())
    logging.debug('Merging with gene name dataframe')
    # Left join: an exon whose gene is missing from gene_names is kept and
    # gets a missing (NaN) name.
    df_exon = df_exon.merge(gene_names, how="left", on='id_gene')
    df_exon['name'] = df_exon['gene_name'] + '_' + \
        df_exon['pos'].astype('str')
    return df_exon.drop(columns='gene_name')


def main_fill_exon_n_gene(logging_level: str = "DISABLE") -> None:
    """
    Create the cin_exon and cin_gene tables and insert them into \
    the ChIA-PET database.

    :param logging_level: The level of information to display
    """
    logging_def(Config.db_file.parent, __file__, logging_level)
    df_gene = get_gene_table()
    df_exon = get_exon_table(df_gene[['id', 'name']])
    logging.debug('Filling cin_gene table')
    populate_df(table='cin_gene', df=df_gene, clean='y')
    logging.debug('Filling cin_exon table')
    populate_df(table='cin_exon', df=df_exon, clean='y')


if __name__ == '__main__':
    main_fill_exon_n_gene('DEBUG')