Skip to content
Snippets Groups Projects
Commit 112e7a9d authored by alapendr's avatar alapendr
Browse files

creation of the projects_metadata folder

parent d59efe83
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
Description: Configuration variables for subfolder projects_metadata
"""
from ..config import Config
class Config_metadata:
    """
    Paths and file names used by the projects_metadata subfolder.

    All paths are built from the ``data`` and ``results`` locations exposed
    by the package-level ``Config`` class.
    """
    # Base name (without extension) used for the generated project files
    outfile = "chia_pet_projects"
    # Folder, under the global results location, dedicated to this subfolder
    output = Config.results / "projects_metadata"
    # Text file listing the GEO identifiers (file name suggests GSM ids)
    geo_id_file = Config.data / "metadata_files" / "chia_pet_list_GSM.txt"
    # CSV file holding the ChIA-PET metadata list
    metadata_file = Config.data / "metadata_files" / "chia_pet_list.csv"
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Description: This script allows to launch PmagicGEO.pl to obtain metadata \
associated to a list of projects that we want to study.
Then it allows to get data to fill the projects table in the ChIA-PET database.
"""
import logging
import re
import subprocess
from io import StringIO
from pathlib import Path

import pandas as pd

from ..populate_database import populate_df
from .config_metadata_chiapet import Config_metadata as Config
def launch_PmagicGEO_script() -> None:
    """
    Launch PmagicGEO.pl to obtain metadata associated with a list of
    projects that we want to study.

    The function takes no argument: the identifier file (``-i``), the
    value given to ``-p`` and the value given to ``-o`` are read from
    ``Config.geo_id_file``, ``Config.output`` and ``Config.outfile``.

    :raises subprocess.CalledProcessError: if PmagicGEO.pl exits with a
        non-zero status (its stderr is merged into the captured output).
    :raises FileNotFoundError: if PmagicGEO.pl cannot be found on the PATH.
    """
    command = [
        "PmagicGEO.pl",
        "-i", str(Config.geo_id_file),
        "-p", str(Config.output),
        "-o", str(Config.outfile),
    ]
    # Passing an argument list (no shell) keeps paths that contain spaces
    # or shell metacharacters intact instead of letting a shell re-split
    # or interpret them.
    subprocess.check_output(command, stderr=subprocess.STDOUT)
def load_metadata() -> pd.DataFrame:
    """
    Load the ``<outfile>.infos.csv`` metadata file located in
    ``Config.output``.

    Lines starting with ``#`` are ignored. The remaining content is read as
    a tab-separated table whose first data row acts as a second header
    level: each final column name is ``"<header>-<first row value>"``, and
    that row is then removed from the data.

    :return: The columns "gsm", "GSM-GSE", "GSM-Title",
        "GSM-Characteristics", "GSM_FAMILYSOFT-contact_institute" and
        "GSEA-Citation(s)" of the loaded table.
    """
    mfile = Config.output / f"{Config.outfile}.infos.csv"
    with mfile.open('r') as infile:
        # Build the text in a single join instead of repeated concatenation
        content = "".join(line for line in infile
                          if not line.startswith('#'))
    df = pd.read_csv(StringIO(content), sep="\t")
    # The first data row holds the second level of the header
    first_line = df.iloc[0, :].values
    df = df.drop(index=0)
    # Remove the ".<digits>" suffixes from the header names (such as the
    # ones pandas appends to duplicated column names)
    p = re.compile(r'\.\d+')
    cols = [p.sub('', c) for c in df.columns]
    new_cols = [col + "-" + sub for col, sub in zip(cols, first_line)]
    # A header-less first column is named "Unnamed..." by pandas
    if "Unnamed" in new_cols[0]:
        new_cols[0] = 'gsm'
    df.columns = new_cols
    logging.debug(df.head())
    return df[["gsm", "GSM-GSE", "GSM-Title", "GSM-Characteristics",
               "GSM_FAMILYSOFT-contact_institute", "GSEA-Citation(s)"]]
def load_manual_metadata() -> pd.DataFrame:
    """
    Read the manually curated metadata table.

    This file complements the metadata retrieved by PmagicGEO.pl with
    information that had to be collected by hand.

    :return: The content of ``Config.metadata_file``, read as a
        tab-separated table.
    """
    manual_table = pd.read_csv(Config.metadata_file, sep="\t")
    return manual_table
def merge_metadata() -> pd.DataFrame:
    """
    Combine the PmagicGEO.pl metadata with the manually collected one.

    Both tables are joined on their GSM column ("gsm" on the PmagicGEO.pl
    side, "GSM" on the manual side); the join keys are then dropped, the
    remaining columns are renamed and the row index is exposed as an
    "id" column.

    :return: The merged and renamed projects table.
    """
    renamed_columns = {
        "GSM-GSE": "id_project",
        "GSM-Title": "name",
        "GSM-Characteristics": "description",
        "GSM_FAMILYSOFT-contact_institute": "institute",
        "GSEA-Citation(s)": "citation",
        "Cell line": "cell_line",
        "Real source": "database",
        "Kept GSM": "id_sample",
        "Antibody": "antibody",
    }
    geo_metadata = load_metadata()
    manual_metadata = load_manual_metadata()
    merged = geo_metadata.merge(manual_metadata, left_on="gsm",
                                right_on="GSM")
    merged = merged.drop(columns=["gsm", "GSM"])
    merged = merged.rename(columns=renamed_columns)
    # Turn the row index into an explicit "id" column
    projects = merged.reset_index().rename(columns={"index": "id"})
    logging.debug(projects.head())
    return projects
def fill_projects_table() -> None:
    """
    Populate the ``cin_projects`` table with the merged metadata.

    The table content is built by ``merge_metadata`` and handed to
    ``populate_df`` together with ``clean='y'``.
    """
    logging.debug('Filling cin_projects')
    projects = merge_metadata()
    populate_df(table='cin_projects', df=projects, clean='y')
if __name__ == "__main__":
    # fill_projects_table() takes no parameter, so passing 'DEBUG' to it
    # raised a TypeError. The DEBUG level is set on the logging module
    # instead, which makes the logging.debug() calls visible.
    logging.basicConfig(level=logging.DEBUG)
    fill_projects_table()
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment