Skip to content
Snippets Groups Projects
Commit c75b6ddf authored by nfontrod's avatar nfontrod
Browse files

src/db_utils/projects_metadata/get_fill_metadata.py: fix fill_projects_table...

src/db_utils/projects_metadata/get_fill_metadata.py: fix fill_projects_table to make it launch launch_pmagicGEO_script + pep8 modifications
parent c229c568
No related branches found
No related tags found
No related merge requests found
......@@ -9,7 +9,6 @@ Then it allows to get data to fill the projects table in the ChIA-PET database.
"""
import subprocess
from pathlib import Path
from .config_metadata_chiapet import Config_metadata as Config
import pandas as pd
from io import StringIO
......@@ -19,13 +18,11 @@ import logging
from ...logging_conf import logging_def
def launch_PmagicGEO_script() -> None:
def launch_pmagicGEO_script() -> None:
"""
Launch PmagicGEO.pl to obtain metadata associated to a list of projects \
that we want to study.
The file with the list of GSM or GSE for which we want to obtain metadata \
is taken from Config.geo_id_file (this function takes no parameter).
"""
subprocess.check_output(f"PmagicGEO.pl -i {Config.geo_id_file} "
f"-p {Config.output} " f"-o {Config.outfile}",
......@@ -52,8 +49,8 @@ def load_metadata() -> pd.DataFrame:
new_cols[0] = 'gsm'
df.columns = new_cols
logging.debug(df.head())
return df[["gsm", "GSM-GSE", "GSM-Title", "GSM-Characteristics", \
"GSM_FAMILYSOFT-contact_institute", "GSEA-Citation(s)"]]
return df[["gsm", "GSM-GSE", "GSM-Title", "GSM-Characteristics",
"GSM_FAMILYSOFT-contact_institute", "GSEA-Citation(s)"]]
def load_manual_metadata() -> pd.DataFrame:
......@@ -62,23 +59,25 @@ def load_manual_metadata() -> pd.DataFrame:
This information was retrieved manually.
"""
mfile = Config.metadata_file
return pd.read_csv(mfile, sep = "\t")
return pd.read_csv(mfile, sep="\t")
def merge_metadata() -> pd.DataFrame:
    """
    Combine the metadata produced by PmagicGEO.pl with the manual metadata.

    The two tables are joined on their GSM columns ("gsm" on the
    PmagicGEO.pl side, "GSM" on the manual side). Both join columns are
    then dropped, the remaining columns are renamed, and the row index is
    exposed as an "id" column.

    :return: The merged and renamed metadata table.
    """
    geo_metadata = load_metadata()
    manual_metadata = load_manual_metadata()
    merged = geo_metadata.merge(manual_metadata, left_on="gsm",
                                right_on="GSM")

    # The join keys are not kept in the final table.
    merged = merged.drop(["gsm", "GSM"], axis=1)

    new_names = {
        "GSM-GSE": "id_project",
        "GSM-Title": "name",
        "GSM-Characteristics": "description",
        "GSM_FAMILYSOFT-contact_institute": "institute",
        "GSEA-Citation(s)": "citation",
        "Cell line": "cell_line",
        "Real source": "database",
        "Kept GSM": "id_sample",
        "Antibody": "antibody",
    }
    merged = merged.rename(columns=new_names)

    # Turn the row index into an explicit "id" column.
    merged = merged.reset_index().rename(columns={"index": "id"})
    logging.debug(merged.head())
    return merged
......@@ -90,9 +89,11 @@ def fill_projects_table(logging_level: str = "DISABLE",) -> None:
:param logging_level: The level of data to display.
"""
logging_def(Config.output, __file__, logging_level)
logging.debug('Launching Pmagicgeo ...')
launch_pmagicGEO_script()
logging.debug('Filling cin_projects')
populate_df(table='cin_projects', df=merge_metadata(), clean='y')
if __name__ == "__main__":
    # Run as a script: fill the projects table with debug logging enabled.
    fill_projects_table(logging_level='DEBUG')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment