Skip to content
Snippets Groups Projects
Commit f0a79f9c authored by nfontrod's avatar nfontrod
Browse files

src/db_utils/creation_of_exon_table.py: major modification, this script will...

src/db_utils/creation_of_exon_table.py: major modification, this script will create and fill the cin_exon and cin_gene table of the new database model
parent 181f87ed
No related branches found
No related tags found
No related merge requests found
......@@ -8,67 +8,83 @@ Description: Create the exon_ctrl file and the exon file
from pathlib import Path
import pandas as pd
from typing import List
from .config import Config, logging_def
from ..logging_conf import logging_def
from .config import Config
import logging
from .populate_database import populate_df
def get_genomic_region(bed_file: Path) -> pd.DataFrame:
    """
    Load a bed file as a dataframe and keep the regions longer than 2 nt.

    :param bed_file: A tab-separated bed file with six columns
    :return: the bed file as a dataframe, with the columns chromosome, \
    start, stop, id, score and strand
    """
    # NOTE(review): read_csv is called with its default header handling, so
    # the first line of the file is consumed as a header and then replaced
    # by the names below - confirm the bed files really start with a header.
    df = pd.read_csv(bed_file, sep="\t")
    df.columns = ["chromosome", "start", "stop", "id", "score", "strand"]
    # Return an independent copy: callers add and drop columns on the
    # result, which would otherwise be done on a slice of the loaded frame.
    return df.loc[(df.stop - df.start > 2), :].copy()
def get_gene_table() -> pd.DataFrame:
    """
    Create the gene table.

    :return: The gene_table, with the columns chromosome, start, stop, \
    id, name and strand
    """
    logging.debug("Load the bed file of genes")
    df_gene = get_genomic_region(Config.bed_gene)
    # Same six columns as the generic loader, except that the fifth one is
    # labelled "name" instead of "score" for the gene table.
    df_gene.columns = ["chromosome", "start", "stop", "id", "name", "strand"]
    logging.debug(df_gene.head())
    return df_gene
def get_exon_table(df_gene_name: pd.DataFrame) -> pd.DataFrame:
    """
    Create the exon_table

    :param df_gene_name: A dataframe of gene id and gene name \
    (two columns, in that order)
    :return: The exon table, with the columns chromosome, start, stop, \
    id, strand, id_gene, pos and name
    """
    # Work on a copy so that the caller's dataframe is not renamed in place.
    df_gene_name = df_gene_name.copy()
    df_gene_name.columns = ['id_gene', 'gene_name']
    # astype returns a new Series: it must be assigned back, otherwise the
    # merge key keeps its original dtype.
    df_gene_name['id_gene'] = df_gene_name['id_gene'].astype(int)
    logging.debug("Load the bed file of exons")
    df_exon = get_genomic_region(Config.bed_exon)
    # Copy after filtering so that the column edits below are made on an
    # independent frame and not on a slice.
    df_exon = df_exon.loc[df_exon['stop'] - df_exon['start'] > 2, :].copy()
    logging.debug(df_exon.head())
    df_exon = df_exon.drop('score', axis=1)
    logging.debug('Creating id_gene column ...')
    # Exon ids look like "<gene id>_<position>". regex=True is required:
    # without it recent pandas versions treat the pattern as a literal
    # string and the integer conversion fails.
    df_exon['id_gene'] = df_exon['id'].str.replace(r'_\d+$', '', regex=True)
    df_exon['id_gene'] = df_exon['id_gene'].astype(int)
    logging.debug(df_exon.head())
    logging.debug("Creation of a column pos")
    df_exon['pos'] = df_exon['id'].str.replace(r'^\d+_', '', regex=True)
    df_exon['pos'] = df_exon['pos'].astype(int)
    logging.debug(df_exon.head())
    logging.debug('Merging with gene name dataframe')
    df_exon = df_exon.merge(df_gene_name, how="left", on='id_gene')
    # The exon name is "<gene name>_<position of the exon>".
    df_exon['name'] = df_exon['gene_name'] + '_' + \
        df_exon['pos'].astype('str')
    df_exon = df_exon.drop('gene_name', axis=1)
    return df_exon
def main_fill_exon_n_gene(logging_level: str = "DISABLE") -> None:
    """
    Create the cin_exon and cin_gene table and insert them into
    the ChIA-PET database.

    :param logging_level: The level of information to display
    """
    logging_def(Config.db_file.parent, __file__, logging_level)
    df_gene = get_gene_table()
    # Only the gene id and the gene name are needed to build the exon names.
    df_exon = get_exon_table(df_gene[['id', 'name']])
    # NOTE(review): clean='y' presumably empties the table before the
    # insertion - confirm against populate_df.
    logging.debug('Filling cin_gene table')
    populate_df(table='cin_gene', df=df_gene, clean='y')
    logging.debug('Filling cin_exon table')
    populate_df(table='cin_exon', df=df_exon, clean='y')
# When executed as a script, create and fill both tables with the
# logging level set to 'DEBUG'.
if __name__ == '__main__':
    main_fill_exon_n_gene('DEBUG')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment