From f826d8d68ce8d1566e24e567b6b9c3bf69058122 Mon Sep 17 00:00:00 2001
From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr>
Date: Mon, 25 Jan 2021 17:19:45 +0100
Subject: [PATCH] src/download_encode_eclip/*.py: creation of a module used to
 download all the eclip data (on hg19) from Encode

---
 src/download_encode_eclip/__init__.py         |   7 +
 src/download_encode_eclip/__main__.py         |  21 ++
 src/download_encode_eclip/config.py           |  23 ++
 src/download_encode_eclip/get_encode_clips.py | 208 ++++++++++++++++++
 4 files changed, 259 insertions(+)
 create mode 100644 src/download_encode_eclip/__init__.py
 create mode 100644 src/download_encode_eclip/__main__.py
 create mode 100644 src/download_encode_eclip/config.py
 create mode 100644 src/download_encode_eclip/get_encode_clips.py

diff --git a/src/download_encode_eclip/__init__.py b/src/download_encode_eclip/__init__.py
new file mode 100644
index 00000000..60f3a118
--- /dev/null
+++ b/src/download_encode_eclip/__init__.py
@@ -0,0 +1,7 @@
+#!/usr/bin/env python3
+
+# -*- coding: UTF-8 -*-
+
+"""
+Description: Submodule used to download the eclip data (on hg19) from Encode.
+"""
diff --git a/src/download_encode_eclip/__main__.py b/src/download_encode_eclip/__main__.py
new file mode 100644
index 00000000..ffab5c68
--- /dev/null
+++ b/src/download_encode_eclip/__main__.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+
+# -*- coding: UTF-8 -*-
+
+"""
+Description: Launcher that will download all eclip experiment files.
+"""
+
+from .get_encode_clips import download_eclip
+from .config import ConfigEncodeClip
+
+
+def get_eclips() -> None:
+    """
+    Download every eclip bed file into the folder ``ConfigEncodeClip.output``.
+    """
+    download_eclip(ConfigEncodeClip.output)
+
+
+if __name__ == "__main__":
+    get_eclips()
diff --git a/src/download_encode_eclip/config.py b/src/download_encode_eclip/config.py
new file mode 100644
index 00000000..fca51369
--- /dev/null
+++ b/src/download_encode_eclip/config.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python3
+
+# -*- coding: UTF-8 -*-
+
+"""
+Description: Contains variables needed in this submodule
+"""
+
+from ..db_utils.config import Config
+
+
+class ConfigEncodeClip:
+    """
+    Contains the variables used in this submodule
+    """
+    # Data folder of the project, taken from the shared Config
+    data = Config.data
+    # Encode experiment report listing the eclip experiments to download
+    eclip_file = data / "CLIP_bed" / "experiment_report_2021_1_22_16h_39m.tsv"
+    # Folder where the downloaded bed files are stored
+    output = Config.results / "Encode_clips"
+    # Base URL prepended to the experiment ids and to the file hrefs
+    encode_website = "https://www.encodeproject.org"
diff --git a/src/download_encode_eclip/get_encode_clips.py b/src/download_encode_eclip/get_encode_clips.py
new file mode 100644
index 00000000..b1ba45b2
--- /dev/null
+++ b/src/download_encode_eclip/get_encode_clips.py
@@ -0,0 +1,208 @@
+#!/usr/bin/env python3
+
+# -*- coding: UTF-8 -*-
+
+"""
+Description: The goal of this script is to get every encode eclip.
+"""
+
+import json
+import logging
+import subprocess as sp
+from pathlib import Path
+from typing import Dict, Optional, Tuple
+
+import pandas as pd
+
+from .config import ConfigEncodeClip
+from ..logging_conf import logging_def
+
+
+# Keys that must be present in the description of an Encode file before it
+# can be filtered and downloaded.
+REQUIRED_FILE_KEYS = ("file_format", "file_format_type", "no_file_available",
+                      "assembly", "assay_term_name", "status",
+                      "biological_replicates", "title", "href")
+# Number of hg19 narrowPeak bed files expected for each eclip experiment
+# (presumably one per replicate plus the merged one -- to be confirmed).
+EXPECTED_BED_NUMBER = 3
+
+
+def load_encode_eclip_file() -> pd.DataFrame:
+    """
+    Load the report listing the encode eclip experiments.
+
+    :return: A table containing encode eclip experiment
+
+    >>> d = load_encode_eclip_file().head()
+    >>> d[d.columns[0:4]]  # doctest: +NORMALIZE_WHITESPACE
+                              ID  Target gene symbol  Biosample summary  Project
+    0  /experiments/ENCSR322HHA/               TIAL1              HepG2   ENCODE
+    1  /experiments/ENCSR023PKW/               EIF3G               K562   ENCODE
+    2  /experiments/ENCSR366DGX/               KHSRP              HepG2   ENCODE
+    3  /experiments/ENCSR337XGI/                SAFB              HepG2   ENCODE
+    4  /experiments/ENCSR977OXG/               PRPF4              HepG2   ENCODE
+    >>> d[d.columns[4:]]  # doctest: +NORMALIZE_WHITESPACE
+       Biological replicate  Technical replicate      Organism  Genome assembly
+    0                   1,2                    1  Homo sapiens      GRCh38,hg19
+    1                   2,1                    1  Homo sapiens      GRCh38,hg19
+    2                   2,1                    1  Homo sapiens      GRCh38,hg19
+    3                   2,1                    1  Homo sapiens      GRCh38,hg19
+    4                   1,2                    1  Homo sapiens      GRCh38,hg19
+    """
+    df = pd.read_csv(ConfigEncodeClip.eclip_file, sep="\t")
+    # The rest of the module reads the column "Biosample summary": rename
+    # "Biosample term name" when the report uses that header instead.
+    return df.rename(columns={"Biosample term name": "Biosample summary"})
+
+
+def get_encode_url(encode_id: str) -> str:
+    """
+    Build the complete encode URL of an experiment.
+
+    :param encode_id: A partial encode URL for an experiment
+    :return: The complete encode URL
+    :raises ValueError: If encode_id doesn't begin with /experiments/
+
+    >>> get_encode_url("/experiments/ENCSR464OSH/")
+    'https://www.encodeproject.org/experiments/ENCSR464OSH/'
+    >>> get_encode_url("lolipop")
+    Traceback (most recent call last):
+    ...
+    ValueError: The encode_id should begin with /experiments/
+    """
+    if not encode_id.startswith("/experiments/"):
+        raise ValueError("The encode_id should begin with /experiments/")
+    return f"{ConfigEncodeClip.encode_website}{encode_id}"
+
+
+def get_json(url: str) -> Dict:
+    """
+    Get the json embedded in the page of an encode experiment.
+
+    :param url: An encode URL pointing to an eclip experiment
+    :return: The json contained in the url
+    :raises subprocess.CalledProcessError: If curl fails
+    :raises json.JSONDecodeError: If the 10th line of the page isn't json
+
+    >>> d = get_json("https://www.encodeproject.org/experiments/ENCSR069EVH/")
+    >>> list(d.keys())[0:4]
+    ['assay_term_name', 'biosample_ontology', 'documents', 'references']
+    """
+    # curl is called without a shell so that the url is never interpreted
+    # as shell code.
+    page = sp.check_output(["curl", url, "--silent"]).decode("UTF-8")
+    # The json describing the experiment is on the 10th line of the page.
+    lines = page.split("\n")
+    return json.loads(lines[9] if len(lines) > 9 else "")
+
+
+def get_bed_eclip_file(dic_file: Dict) -> Optional[Tuple[str, Dict]]:
+    """
+    From a dictionary containing data on an experiment file, return \
+    the url to download the file if it's a bed of narrowpeak on hg19.
+
+    :param dic_file: A dictionary containing data on an experiment file
+    :return: The url of the file and a dictionary containing its \
+    biological replicates ('rep') and its title ('title') if the file is \
+    a released eCLIP bed file of narrowpeaks on hg19. None otherwise.
+
+    >>> d = get_json("https://www.encodeproject.org/experiments/ENCSR069EVH/")
+    >>> df = d["files"][24]
+    >>> url, data = get_bed_eclip_file(df)
+    >>> url.replace("https://www.encodeproject.org", "")
+    '/files/ENCFF777FHS/@@download/ENCFF777FHS.bed.gz'
+    >>> data == {'rep': [1, 2], 'title': 'ENCFF777FHS'}
+    True
+    >>> get_bed_eclip_file(d["files"][22])
+    >>> get_bed_eclip_file({"file_format": "bed"}) is None
+    True
+    """
+    # A file lacking one of the fields read below cannot be selected.
+    if any(key not in dic_file for key in REQUIRED_FILE_KEYS):
+        return None
+    is_wanted = (dic_file["file_format"] == "bed"
+                 and dic_file["file_format_type"] == "narrowPeak"
+                 and not dic_file["no_file_available"]
+                 and dic_file["assembly"] == "hg19"
+                 and dic_file["assay_term_name"] == "eCLIP"
+                 and dic_file["status"] == "released")
+    if not is_wanted:
+        return None
+    data = {"rep": dic_file["biological_replicates"],
+            "title": dic_file["title"]}
+    return f"{ConfigEncodeClip.encode_website}{dic_file['href']}", data
+
+
+def download_clip_file(url_clip: str, output: Path, data: Dict,
+                       row: pd.Series) -> Path:
+    """
+    Save the file located at url_clip and return the Path of the created file.
+
+    :param url_clip: An url containing a bed file
+    :param output: The folder where the result will be created
+    :param data: A dictionary containing the replicates used to create \
+    the file ('rep') and the title of the file ('title')
+    :param row: The row of an encode table containing eclip \
+    experiments.
+    :return: The Path where the file downloaded from url_clip is stored
+    :raises subprocess.CalledProcessError: If curl fails
+
+    >>> u = str(ConfigEncodeClip.encode_website) + \
+    "/files/ENCFF777FHS/@@download/ENCFF777FHS.bed.gz"
+    >>> data = {'rep': [1, 2], 'title': 'ENCFF777FHS'}
+    >>> s = pd.Series({"ID": "/experiments/ENCSR069EVH/",
+    ... "Target gene symbol": "FUS", "Biosample summary": "HepG2",
+    ... "Project": "ENCODE", "Biological replicate": "1,2",
+    ... "Technical replicate": "1", "Organism": "Homo sapiens",
+    ... "Genome assembly": "GRCh38,hg19"})
+    >>> outf = download_clip_file(u, Path("/tmp"), data, s)
+    >>> outf.is_file()
+    True
+    >>> outf.unlink()
+    """
+    bs = row['Biosample summary'].replace(" ", "-")
+    exp_name = row['ID'].replace('/experiments/', '')[:-1]
+    replicates = '-'.join(map(str, data['rep']))
+    outfile = output / (f"{row['Target gene symbol']}_{bs}_{row['Project']}_"
+                        f"{exp_name}_{data['title']}_"
+                        f"rep{replicates}.hg19.bed.gz")
+    # curl is called without a shell so that spaces or special characters
+    # in the url or in the file name are passed to it unchanged.
+    sp.check_call(["curl", "-L", url_clip, "-o", str(outfile), "--silent"])
+    return outfile
+
+
+def download_eclip(output: Path) -> None:
+    """
+    Download every eclip experiment of interest.
+
+    :param output: Folder where the eclip experiments will be stored
+    """
+    output.mkdir(parents=True, exist_ok=True)
+    # NOTE(review): the logger is configured with ConfigEncodeClip.output,
+    # not with the `output` argument -- confirm that this is intended.
+    logging_def(ConfigEncodeClip.output, __file__, "INFO")
+    df = load_encode_eclip_file()
+    for _, my_row in df.iterrows():
+        sample_ok = 0
+        exp_name = my_row['ID'].replace('/experiments/', '')[:-1]
+        logging.info(f"Working on experiment {exp_name}")
+        dic_exp = get_json(get_encode_url(my_row["ID"]))
+        if "files" not in dic_exp:
+            continue
+        for dic_file in dic_exp["files"]:
+            res = get_bed_eclip_file(dic_file)
+            if res is not None:
+                url, data = res
+                download_clip_file(url, output, data, my_row)
+                sample_ok += 1
+                logging.info(f" --> {dic_file['title']}, done")
+        if sample_ok != EXPECTED_BED_NUMBER:
+            logging.warning(f"experiment {exp_name} has {sample_ok} bed "
+                            f"reported !")
+
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
-- 
GitLab