Skip to content
Snippets Groups Projects
Commit f826d8d6 authored by nfontrod's avatar nfontrod
Browse files

src/download_encode_eclip/*.py: creation of a module used to download all the...

src/download_encode_eclip/*.py: creation of a module used to download all the eclip data (on hg19) from Encode
parent 8cbc86a4
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""
Description: Module used to download all the eclip data (on hg19) from Encode.
"""
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""
Description: Launcher that will download every eclip experiment file.
"""
from .get_encode_clips import download_eclip
from .config import ConfigEncodeClip
def get_eclips():
    """
    Download every eclip experiment file into the folder defined by
    ConfigEncodeClip.output.
    """
    destination = ConfigEncodeClip.output
    download_eclip(destination)


get_eclips()
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""
Description: Contains variables needed in this submodule
"""
from ..db_utils.config import Config
class ConfigEncodeClip:
    """
    Contains the variables used in this submodule: the input report
    listing the eclip experiments, the output folder and the encode
    website address.
    """
    # Data folder, taken from the Config class of db_utils
    data = Config.data
    # Experiment report, read as a tab-separated table by
    # load_encode_eclip_file
    eclip_file = data / "CLIP_bed" / "experiment_report_2021_1_22_16h_39m.tsv"
    # Folder given to download_eclip to store the downloaded files
    output = Config.results / "Encode_clips"
    # Prefix added to experiment ids and file hrefs to build complete URLs
    encode_website = "https://www.encodeproject.org"
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""
Description: The goal of this script is to get every encode eclip.
"""
from .config import ConfigEncodeClip
import pandas as pd
import json
import subprocess as sp
from typing import Dict, Optional, Tuple
from pathlib import Path
from ..logging_conf import logging_def
import logging
def load_encode_eclip_file() -> pd.DataFrame:
    """
    Read the tab-separated encode experiment report located at
    ConfigEncodeClip.eclip_file.

    The column "Biosample term name", when it exists, is renamed
    "Biosample summary".

    :return: A table containing encode eclip experiment

    >>> d = load_encode_eclip_file().head()
    >>> d[d.columns[0:4]]  # doctest: +NORMALIZE_WHITESPACE
    ID Target gene symbol Biosample summary Project
    0 /experiments/ENCSR322HHA/ TIAL1 HepG2 ENCODE
    1 /experiments/ENCSR023PKW/ EIF3G K562 ENCODE
    2 /experiments/ENCSR366DGX/ KHSRP HepG2 ENCODE
    3 /experiments/ENCSR337XGI/ SAFB HepG2 ENCODE
    4 /experiments/ENCSR977OXG/ PRPF4 HepG2 ENCODE
    >>> d[d.columns[4:]]  # doctest: +NORMALIZE_WHITESPACE
    Biological replicate Technical replicate Organism Genome assembly
    0 1,2 1 Homo sapiens GRCh38,hg19
    1 2,1 1 Homo sapiens GRCh38,hg19
    2 2,1 1 Homo sapiens GRCh38,hg19
    3 2,1 1 Homo sapiens GRCh38,hg19
    4 1,2 1 Homo sapiens GRCh38,hg19
    """
    report = pd.read_csv(ConfigEncodeClip.eclip_file, sep="\t")
    new_names = {"Biosample term name": "Biosample summary"}
    return report.rename(columns=new_names)
def get_encode_url(encode_id: str) -> str:
    """
    Build the complete encode URL of an experiment.

    :param encode_id: A partial encode URL for an experiment, it must
        begin with /experiments/
    :return: The complete encode URL
    :raises ValueError: if encode_id does not begin with /experiments/

    >>> get_encode_url("/experiments/ENCSR464OSH/")
    'https://www.encodeproject.org/experiments/ENCSR464OSH/'
    >>> get_encode_url("lolipop")
    Traceback (most recent call last):
        ...
    ValueError: The encode_id should begin with /experiments/
    """
    if encode_id.startswith("/experiments/"):
        return f"{ConfigEncodeClip.encode_website}{encode_id}"
    raise ValueError("The encode_id should begin with /experiments/")
def get_json(url: str) -> Dict:
    """
    Download an encode page with curl and parse the json document
    written on its 10th line.

    :param url: An encode URL pointing to an eclip experiment
    :return: The json contained in the url
    :raises subprocess.CalledProcessError: if curl exits with an error
    :raises json.JSONDecodeError: if the page has no 10th line or if
        this line is not a valid json document

    >>> d = get_json("https://www.encodeproject.org/experiments/ENCSR069EVH/")
    >>> list(d.keys())[0:4]
    ['assay_term_name', 'biosample_ontology', 'documents', 'references']
    """
    # curl is started from an argument list, without a shell: the url is
    # handed over verbatim, so characters such as '&', ';' or spaces can
    # neither truncate the request nor run another command.
    page = sp.check_output(["curl", url, "--silent"])
    # Keep the 10th line only (what `sed -n '10p'` used to select); lines
    # are delimited by "\n" exclusively, as sed does.
    # NOTE(review): relying on a fixed line number is fragile if the page
    # layout changes - confirm it is still valid before a new download.
    lines = page.split(b"\n", 10)
    tenth_line = lines[9] if len(lines) > 9 else b""
    return json.loads(tenth_line.decode("UTF-8"))
def get_bed_eclip_file(dic_file: Dict) -> Optional[Tuple[str, Dict]]:
    """
    From a dictionary containing data on an experiment file, return
    the url to download the file if it's a released bed of narrowPeak
    on hg19 produced by an eCLIP assay.

    :param dic_file: A dictionary containing data on an experiment file
    :return: None if the file is not of interest or if a field needed to
        decide is missing. Otherwise the url of the file and a dictionary
        with the biological replicates ('rep') and the title ('title')
        of the file.

    >>> d = get_json("https://www.encodeproject.org/experiments/ENCSR069EVH/")
    >>> df = d["files"][24]
    >>> url, data = get_bed_eclip_file(df)
    >>> url.replace("https://www.encodeproject.org", "")
    '/files/ENCFF777FHS/@@download/ENCFF777FHS.bed.gz'
    >>> data == {'rep': [1, 2], 'title': 'ENCFF777FHS'}
    True
    >>> get_bed_eclip_file(d["files"][22])
    """
    # Every field read by the filter must exist. "status" belongs to the
    # filter, so a record without it is skipped instead of raising a
    # KeyError.
    required = ("file_format", "file_format_type", "no_file_available",
                "assembly", "assay_term_name", "status")
    if any(key not in dic_file for key in required):
        return None
    if dic_file["no_file_available"]:
        return None
    wanted = {"file_format": "bed", "file_format_type": "narrowPeak",
              "assembly": "hg19", "assay_term_name": "eCLIP",
              "status": "released"}
    if any(dic_file[key] != value for key, value in wanted.items()):
        return None
    data = {"rep": dic_file["biological_replicates"],
            "title": dic_file["title"]}
    return f"{ConfigEncodeClip.encode_website}{dic_file['href']}", data
def download_clip_file(url_clip: str, output: Path, data: Dict,
                       row: pd.Series) -> Path:
    """
    Save the file located at url_clip and return the Path of the created file.

    The name of the created file is made of the target gene symbol, the
    biosample summary (spaces replaced by '-'), the project, the
    experiment id, the file title and the replicates, joined by '_' and
    followed by '.hg19.bed.gz'.

    :param url_clip: An url containing a bed file
    :param output: The folder where the result will be created
    :param data: A dictionary containing the number of replicates
        used to create the file ('rep') and the sample name ('title')
    :param row: The row of an encode table containing eclip
        experiments.
    :return: The Path where the file downloaded from clip_url is stored
    :raises subprocess.CalledProcessError: if curl exits with an error

    >>> u = (str(ConfigEncodeClip.encode_website) +
    ...      "/files/ENCFF777FHS/@@download/ENCFF777FHS.bed.gz")
    >>> data = {'rep': [1, 2], 'title': 'ENCFF777FHS'}
    >>> s = pd.Series({"ID": "/experiments/ENCSR069EVH/",
    ... "Target gene symbol": "FUS", "Biosample summary": "HepG2",
    ... "Project": "ENCODE", "Biological replicate": "1,2",
    ... "Technical replicate": "1", "Organism": "Homo sapiens",
    ... "Genome assembly": "GRCh38,hg19"})
    >>> outf = download_clip_file(u, Path("/tmp"), data, s)
    >>> outf.is_file()
    True
    >>> outf.unlink()
    """
    biosample = row['Biosample summary'].replace(" ", "-")
    target = row['Target gene symbol']
    project = row['Project']
    # Experiment id without the '/experiments/' prefix and its last
    # character (the trailing '/' of the ids found in the encode table).
    experiment = row['ID'].replace('/experiments/', '')[:-1]
    replicates = '-'.join(map(str, data['rep']))
    outfile = output / (f"{target}_{biosample}_{project}_{experiment}_"
                        f"{data['title']}_rep{replicates}.hg19.bed.gz")
    # curl is started from an argument list, without a shell: a space or a
    # shell metacharacter in the url or in the output path (folder name,
    # target gene, project...) is passed verbatim instead of breaking the
    # command line.
    sp.check_call(["curl", "-L", url_clip, "-o", str(outfile), "--silent"])
    return outfile
def download_eclip(output: Path) -> None:
    """
    Download every eclip experiment of interest.

    Each experiment listed in the encode report is queried on the encode
    website, and every file accepted by get_bed_eclip_file is downloaded.
    Experiments whose description has no "files" entry are skipped. A
    warning is logged when the number of files downloaded for an
    experiment is not 3.

    :param output: Folder where the eclip experiment will be stored
    """
    output.mkdir(exist_ok=True)
    logging_def(ConfigEncodeClip.output, __file__, "INFO")
    experiments = load_encode_eclip_file()
    for _, experiment in experiments.iterrows():
        exp_name = experiment['ID'].replace('/experiments/', '')[:-1]
        logging.info(f"Working on experiment {exp_name}")
        dic_exp = get_json(get_encode_url(experiment["ID"]))
        if "files" not in dic_exp:
            continue
        downloaded = 0
        for dic_file in dic_exp["files"]:
            found = get_bed_eclip_file(dic_file)
            if found is None:
                continue
            url_clip, data = found
            download_clip_file(url_clip, output, data, experiment)
            downloaded += 1
            logging.info(f" --> {dic_file['title']}, done")
        if downloaded != 3:
            logging.warning(f"experiment {exp_name} has {downloaded} bed "
                            f"reported !")
# Run the doctest examples written in the docstrings of this module when it
# is executed as a script.
if __name__ == "__main__":
    import doctest
    doctest.testmod()
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment