diff --git a/src/db_utils/__main__.py b/src/db_utils/__main__.py index c6679832c5094d22f02a79a4ebed164354dd3346..55599b1901a34182150345e6a85f39ededde7620 100755 --- a/src/db_utils/__main__.py +++ b/src/db_utils/__main__.py @@ -10,8 +10,7 @@ from .config import Config from ..logging_conf import logging_def from .db_creation import main_create_db from .fill_exon_n_gene_table import main_fill_exon_n_gene -# from .create_freq_table import create_freq_table -# from .populate_database import populate +from .fill_splicing_lore_tables import fill_splicing_lore_data import logging @@ -24,21 +23,8 @@ def launcher(logging_level: str = "INFO"): main_create_db('DISABLE') logging.info('Filling cin_gene and cin_exon') main_fill_exon_n_gene('DISABLE') - # logging.info(f"Creation of {Config.ctrl_exon_file} file") - # get_ctrl(Config.exon_intern) - # logging.info(f"Creation of {Config.exon_file} file") - # get_exon_table(Config.ctrl_exon_file, Config.gene_file, logging_level) - # logging.info(f"Creation of {Config.frequency_file} file") - # create_freq_table(Config.bed_orf, Config.bed_exon, Config.ctrl_exon_file, - # logging_level) - # - #Â mpopulate = populate.__wrapped__ - # logging.info(f"Filling {Config.tables[0]} table") - # mpopulate(Config.tables[0], Config.gene_file, "y", logging_level) - # logging.info(f"Filling {Config.tables[1]} table") - # mpopulate(Config.tables[1], Config.exon_file, "y", logging_level) - # logging.info(f"Filling {Config.tables[2]} table") - # mpopulate(Config.tables[2], Config.frequency_file, "y", logging_level) + logging.info('Filling splicing lore tables') + fill_splicing_lore_data('DISABLE') -launcher(logging_level = "DEBUG") \ No newline at end of file +launcher(logging_level="DEBUG") diff --git a/src/db_utils/config.py b/src/db_utils/config.py index 8f9d60ba71004f18633749eb5d45fce0de6e70ff..6155a2f44a8e960db1e23d9c4b26b064c6f54d68 100755 --- a/src/db_utils/config.py +++ b/src/db_utils/config.py @@ -13,8 +13,11 @@ class Config: """ A class 
def load_splicing_projects(projects_file=None) -> pd.DataFrame:
    """
    Load the projects of the splicing lore database.

    :param projects_file: Tab-separated source to read: a path or any
        file-like object accepted by pandas.read_csv. When None (the
        default), Config.splicing_projects is used.
    :return: The dataframe of the splicing lore projects
    """
    if projects_file is None:
        projects_file = Config.splicing_projects
    df = pd.read_csv(projects_file, sep="\t")
    logging.debug(df.head())
    return df


def load_ase_event(ase_event_file=None) -> pd.DataFrame:
    """
    Load the alternative splicing events detected in the splicing lore
    database.

    The columns gene_symbol, chromosome, start, stop and exons_flanquants
    are removed, the column exon_skipped (when present) is renamed pos,
    and a column exon_id is added, built as "<gene_id>_<pos>".

    :param ase_event_file: Tab-separated source to read: a path or any
        file-like object accepted by pandas.read_csv. When None (the
        default), Config.ase_event_file is used.
    :return: The dataframe of splicing lore alternative splicing events
    :raises KeyError: If one of the columns to remove, gene_id or pos
        (after renaming) is missing from the input.
    """
    if ase_event_file is None:
        ase_event_file = Config.ase_event_file
    logging.debug('loading cin_ase_event')
    df = pd.read_csv(ase_event_file, sep="\t")
    df = df.drop(columns=["gene_symbol", "chromosome", "start", "stop",
                          "exons_flanquants"])
    # DataFrame.rename silently ignores labels that are absent, which is
    # the behaviour the former `cols.index(...) != -1` guard was aiming
    # for: list.index never returns -1, it raises ValueError instead.
    df = df.rename(columns={'exon_skipped': 'pos'})
    df['exon_id'] = df['gene_id'].astype(str) + '_' + df['pos'].astype(str)
    logging.debug(df.head())
    return df


def fill_splicing_lore_data(logging_level: str = 'DISABLE') -> None:
    """
    Fill the tables cin_ase_event and cin_project_splicing_lore.

    Both input files are loaded before anything is written, so a loading
    error leaves the two tables untouched.

    :param logging_level: The level of information to display, forwarded
        to logging_def.
    """
    logging_def(Config.results, __file__, logging_level)

    sf_projects = load_splicing_projects()
    ase_events = load_ase_event()
    logging.debug('Filling cin_project_splicing_lore')
    populate_df(table='cin_project_splicing_lore', df=sf_projects, clean='y')
    logging.debug('Filling cin_ase_event')
    populate_df(table='cin_ase_event', df=ase_events, clean='y')


if __name__ == "__main__":
    fill_splicing_lore_data('DEBUG')
+ :param cnx: Connection to ChIA-PET database :param table: The name of the table to fille :return: The same name with the prefix gin if it wasn't here. """ - if "cin" not in table: - table = f"cin_{table.lower()}" - if table not in Config.tables: + tables = get_table_names(cnx) + if table not in tables: msg = f"The name {table} is not available." \ f" If the table exist in the database, " \ f"change the config file to add the table name " \ @@ -175,8 +190,8 @@ def populate_df(table: str, df: pd.DataFrame, clean: str): tab. :param clean: y to remove the data in the table, n else. """ - table = check_table_name(table) cnx = sqlite3.connect(Config.db_file) + table = check_table_name(cnx, table) check_content_clean_and_insert(table, df, cnx, clean) @@ -195,8 +210,8 @@ def populate(table: str, file: str, clean: str, logging_level: str = """ logging_def(Config.db_file.parent, __file__, logging_level) mfile = Path(file) - table = check_table_name(table) cnx = sqlite3.connect(Config.db_file) + table = check_table_name(cnx, table) check_content_clean_and_insert(table, mfile, cnx, clean)