diff --git a/src/db_utils/projects_metadata/get_fill_metadata.py b/src/db_utils/projects_metadata/get_fill_metadata.py index 1886020dd8a268c7efe1eb2cff1b350d9e9f7726..eba211a2f74b64b1f8e647db1f0b39d9e255e146 100644 --- a/src/db_utils/projects_metadata/get_fill_metadata.py +++ b/src/db_utils/projects_metadata/get_fill_metadata.py @@ -9,7 +9,6 @@ Then it allows to get data to fill the projects table in the ChIA-PET database. """ import subprocess -from pathlib import Path from .config_metadata_chiapet import Config_metadata as Config import pandas as pd from io import StringIO @@ -19,13 +18,11 @@ import logging from ...logging_conf import logging_def -def launch_PmagicGEO_script() -> None: +def launch_pmagicGEO_script() -> None: """ Launch PmagicGEO.pl to obtain metadata associated to a list of projects \ that we want to study. - :param GSM_file: A file with the list of GSM or GSE for which we want to \ - obtain metadata. """ subprocess.check_output(f"PmagicGEO.pl -i {Config.geo_id_file} " f"-p {Config.output} " f"-o {Config.outfile}", @@ -52,8 +49,8 @@ def load_metadata() -> pd.DataFrame: new_cols[0] = 'gsm' df.columns = new_cols logging.debug(df.head()) - return df[["gsm", "GSM-GSE", "GSM-Title", "GSM-Characteristics", \ - "GSM_FAMILYSOFT-contact_institute", "GSEA-Citation(s)"]] + return df[["gsm", "GSM-GSE", "GSM-Title", "GSM-Characteristics", + "GSM_FAMILYSOFT-contact_institute", "GSEA-Citation(s)"]] def load_manual_metadata() -> pd.DataFrame: @@ -62,23 +59,25 @@ def load_manual_metadata() -> pd.DataFrame: These informations were retrieved manually. """ mfile = Config.metadata_file - return pd.read_csv(mfile, sep = "\t") + return pd.read_csv(mfile, sep="\t") def merge_metadata() -> pd.DataFrame: """ Merging of metadata obtained through PmagicGEO.pl and manually. """ - projects = load_metadata().merge(load_manual_metadata(), left_on = "gsm", - right_on = "GSM") + projects = load_metadata().merge(load_manual_metadata(), left_on="gsm", + right_on="GSM") projects = projects.drop(["gsm", "GSM"], axis=1) projects.rename(columns={"GSM-GSE": "id_project", "GSM-Title": "name", - "GSM-Characteristics": "description", - "GSM_FAMILYSOFT-contact_institute": "institute", - "GSEA-Citation(s)": "citation", "Cell line": "cell_line", - "Real source": "database", "Kept GSM": "id_sample", - "Antibody": "antibody"}, inplace=True) - projects = projects.reset_index().rename(columns={"index":"id"}) + "GSM-Characteristics": "description", + "GSM_FAMILYSOFT-contact_institute": "institute", + "GSEA-Citation(s)": "citation", + "Cell line": "cell_line", + "Real source": "database", + "Kept GSM": "id_sample", + "Antibody": "antibody"}, inplace=True) + projects = projects.reset_index().rename(columns={"index": "id"}) logging.debug(projects.head()) return projects @@ -90,9 +89,11 @@ def fill_projects_table(logging_level: str = "DISABLE",) -> None: :param logging_level: The level of data to display. """ logging_def(Config.output, __file__, logging_level) + logging.debug('Launching Pmagicgeo ...') + launch_pmagicGEO_script() logging.debug('Filling cin_projects') populate_df(table='cin_projects', df=merge_metadata(), clean='y') if __name__ == "__main__": - fill_projects_table('DEBUG') \ No newline at end of file + fill_projects_table('DEBUG')