From f826d8d68ce8d1566e24e567b6b9c3bf69058122 Mon Sep 17 00:00:00 2001
From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr>
Date: Mon, 25 Jan 2021 17:19:45 +0100
Subject: [PATCH] src/download_encode_eclip/*.py: creation of a module used to
 download all the eclip data (on hg19) from Encode

---
 src/download_encode_eclip/__init__.py         |   7 +
 src/download_encode_eclip/__main__.py         |  17 ++
 src/download_encode_eclip/config.py           |  20 ++
 src/download_encode_eclip/get_encode_clips.py | 179 ++++++++++++++++++
 4 files changed, 223 insertions(+)
 create mode 100644 src/download_encode_eclip/__init__.py
 create mode 100644 src/download_encode_eclip/__main__.py
 create mode 100644 src/download_encode_eclip/config.py
 create mode 100644 src/download_encode_eclip/get_encode_clips.py

diff --git a/src/download_encode_eclip/__init__.py b/src/download_encode_eclip/__init__.py
new file mode 100644
index 00000000..60f3a118
--- /dev/null
+++ b/src/download_encode_eclip/__init__.py
@@ -0,0 +1,7 @@
+#!/usr/bin/env python3
+
+# -*- coding: UTF-8 -*-
+
+"""
+Description: Submodule used to download the eclip data (on hg19) from Encode.
+"""
diff --git a/src/download_encode_eclip/__main__.py b/src/download_encode_eclip/__main__.py
new file mode 100644
index 00000000..ffab5c68
--- /dev/null
+++ b/src/download_encode_eclip/__main__.py
@@ -0,0 +1,17 @@
+#!/usr/bin/env python3
+
+# -*- coding: UTF-8 -*-
+
+"""
+Description: Launcher that will download all eclip experiment files.
+"""
+
+from .get_encode_clips import download_eclip
+from .config import ConfigEncodeClip
+
+
+def get_eclips():  # launcher: fetch every eclip bed into the configured dir
+    download_eclip(output=ConfigEncodeClip.output)
+
+
+get_eclips()  # executed as soon as the package is run with `python -m`
diff --git a/src/download_encode_eclip/config.py b/src/download_encode_eclip/config.py
new file mode 100644
index 00000000..fca51369
--- /dev/null
+++ b/src/download_encode_eclip/config.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+
+# -*- coding: UTF-8 -*-
+
+"""
+Description: Contains variables needed in this submodule
+"""
+
+from ..db_utils.config import Config
+
+
+class ConfigEncodeClip:
+    """Paths built from Config (eclip experiment report to read, folder
+    receiving the downloads) and root URL of the Encode website.
+    """
+    data = Config.data
+    eclip_file = data / "CLIP_bed" / "experiment_report_2021_1_22_16h_39m.tsv"
+    output = Config.results / "Encode_clips"
+    encode_website = "https://www.encodeproject.org"
+
diff --git a/src/download_encode_eclip/get_encode_clips.py b/src/download_encode_eclip/get_encode_clips.py
new file mode 100644
index 00000000..b1ba45b2
--- /dev/null
+++ b/src/download_encode_eclip/get_encode_clips.py
@@ -0,0 +1,179 @@
+#!/usr/bin/env python3
+
+# -*- coding: UTF-8 -*-
+
+"""
+Description: The goal of this script is to get every encode eclip.
+"""
+
+from .config import ConfigEncodeClip
+import pandas as pd
+import json
+import subprocess as sp
+from typing import Dict, Optional, Tuple
+from pathlib import Path
+from ..logging_conf import logging_def
+import logging
+
+
+
+def load_encode_eclip_file() -> pd.DataFrame:
+    """Load the tab-separated Encode report listing eclip experiments.
+
+    :return: Report table; 'Biosample term name' becomes 'Biosample summary'
+
+    >>> d = load_encode_eclip_file().head()
+    >>> d[d.columns[0:4]]
+                              ID Target gene symbol Biosample summary Project
+    0  /experiments/ENCSR322HHA/              TIAL1             HepG2  ENCODE
+    1  /experiments/ENCSR023PKW/              EIF3G              K562  ENCODE
+    2  /experiments/ENCSR366DGX/              KHSRP             HepG2  ENCODE
+    3  /experiments/ENCSR337XGI/               SAFB             HepG2  ENCODE
+    4  /experiments/ENCSR977OXG/              PRPF4             HepG2  ENCODE
+    >>> d[d.columns[4:]]
+      Biological replicate  Technical replicate      Organism Genome assembly
+    0                  1,2                    1  Homo sapiens     GRCh38,hg19
+    1                  2,1                    1  Homo sapiens     GRCh38,hg19
+    2                  2,1                    1  Homo sapiens     GRCh38,hg19
+    3                  2,1                    1  Homo sapiens     GRCh38,hg19
+    4                  1,2                    1  Homo sapiens     GRCh38,hg19
+    """
+    report = pd.read_csv(ConfigEncodeClip.eclip_file, sep="\t")
+    return report.rename(
+        columns={"Biosample term name": "Biosample summary"})
+
+
+def get_encode_url(encode_id: str) -> str:
+    """Prefix a partial experiment URL with the encode website address.
+
+    :param encode_id: A partial encode URL for an experiment
+    :return: The complete encode URL
+
+    >>> get_encode_url("/experiments/ENCSR464OSH/")
+    'https://www.encodeproject.org/experiments/ENCSR464OSH/'
+    >>> get_encode_url("lolipop")
+    Traceback (most recent call last):
+    ...
+    ValueError: The encode_id should begin with /experiments/
+    """
+    if encode_id.startswith("/experiments/"):
+        return f"{ConfigEncodeClip.encode_website}{encode_id}"
+    raise ValueError("The encode_id should begin with /experiments/")
+
+
+def get_json(url: str) -> Dict:
+    """Fetch an encode page with curl and parse the json of its 10th line.
+    :param url: An encode URL pointing to an eclip experiment
+    :return: The json contained in the url
+
+    >>> d = get_json("https://www.encodeproject.org/experiments/ENCSR069EVH/")
+    >>> list(d.keys())[0:4]
+    ['assay_term_name', 'biosample_ontology', 'documents', 'references']
+    """
+    # No shell (url passed as is); a curl failure raises CalledProcessError
+    page = sp.check_output(["curl", url, "--silent"])
+    return json.loads(b"".join(page.split(b"\n")[9:10]).decode("UTF-8"))
+
+
+def get_bed_eclip_file(dic_file: Dict) -> Optional[Tuple[str, Dict]]:
+    """
+    From a dictionary containing data on an experiment file, return \
+    the url to download the file if it's a bed of narrowpeak on hg19.
+
+    :param dic_file: A dictionary containing data on an experiment file
+    :return: The url of the file with its replicates and title if it is \
+    a released eCLIP narrowPeak bed on hg19, else None (also on missing key).
+
+    >>> d = get_json("https://www.encodeproject.org/experiments/ENCSR069EVH/")
+    >>> df = d["files"][24]
+    >>> url, data = get_bed_eclip_file(df)
+    >>> url.replace("https://www.encodeproject.org", "")
+    '/files/ENCFF777FHS/@@download/ENCFF777FHS.bed.gz'
+    >>> data == {'rep': [1, 2], 'title': 'ENCFF777FHS'}
+    True
+    >>> get_bed_eclip_file(d["files"][22])
+    """
+    expected = {"file_format": "bed", "file_format_type": "narrowPeak",
+                "assembly": "hg19", "assay_term_name": "eCLIP",
+                "status": "released"}
+    # Every key read below must be present: files lacking one are skipped
+    needed = [*expected, "no_file_available", "href", "title",
+              "biological_replicates"]
+    if any(k not in dic_file for k in needed) or \
+            dic_file["no_file_available"] or \
+            any(dic_file[k] != v for k, v in expected.items()):
+        return None
+    url = f"{ConfigEncodeClip.encode_website}{dic_file['href']}"
+    return url, {"rep": dic_file["biological_replicates"],
+                 "title": dic_file["title"]}
+
+
+def download_clip_file(url_clip: str, output: Path, data: Dict,
+                       row: pd.Series) -> Path:
+    """
+    Save the file located at url_clip and return the Path of the created file.
+
+    :param url_clip: A url containing a bed file
+    :param output: The folder where the result will be created
+    :param data: A dictionary containing the replicates ('rep') \
+    used to create the file and the title of the file ('title')
+    :param row: The row of an encode table containing eclip \
+    experiments.
+    :return: The Path where the file downloaded from url_clip is stored
+
+    >>> u = str(ConfigEncodeClip.encode_website) + \
+    "/files/ENCFF777FHS/@@download/ENCFF777FHS.bed.gz"
+    >>> data = {'rep': [1, 2], 'title': 'ENCFF777FHS'}
+    >>> s = pd.Series({"ID": "/experiments/ENCSR069EVH/",
+    ... "Target gene symbol": "FUS", "Biosample summary": "HepG2",
+    ... "Project": "ENCODE", "Biological replicate": "1,2",
+    ... "Technical replicate": "1", "Organism": "Homo sapiens",
+    ... "Genome assembly": "GRCh38,hg19"})
+    >>> outf = download_clip_file(u, Path("/tmp"), data, s)
+    >>> outf.is_file()
+    True
+    >>> outf.unlink()
+    """
+    bs = row['Biosample summary'].replace(" ", "-")
+    outfile = output / f"{row['Target gene symbol']}_" \
+                       f"{bs}_{row['Project']}_" \
+                       f"{row['ID'].replace('/experiments/', '')[:-1]}_" \
+                       f"{data['title']}_" \
+                       f"rep{'-'.join(map(str, data['rep']))}.hg19.bed.gz"
+    cmd = ["curl", "-L", url_clip, "-o", str(outfile), "--silent"]
+    sp.check_call(cmd)  # no shell: spaces in names cannot split the command
+    return outfile
+
+
+def download_eclip(output: Path) -> None:
+    """
+    Download every eclip experiment of interest.
+
+    :param output: Folder where the eclip bed files will be stored, \
+    created with its parents when missing
+    """
+    output.mkdir(exist_ok=True, parents=True)
+    logging_def(ConfigEncodeClip.output, __file__, "INFO")
+    df = load_encode_eclip_file()
+    for _, my_row in df.iterrows():
+        sample_ok = 0
+        exp_name = my_row['ID'].replace('/experiments/', '')[:-1]
+        logging.info(f"Working on experiment {exp_name}")
+        dic_exp = get_json(get_encode_url(my_row["ID"]))
+        if "files" not in dic_exp:
+            continue
+        for dic_file in dic_exp["files"]:
+            res = get_bed_eclip_file(dic_file)
+            if res is not None:
+                download_clip_file(res[0], output, res[1], my_row)
+                sample_ok += 1
+                logging.info(f"  --> {dic_file['title']}, done")
+        # 3 beds expected (presumably rep 1, rep 2 and merged - TODO confirm)
+        if sample_ok != 3:
+            logging.warning(f"experiment {exp_name} has {sample_ok} bed "
+                            f"reported !")
+
+
+if __name__ == "__main__":
+    from doctest import testmod
+    testmod()  # runs the doctests above, several of them need the network
\ No newline at end of file
-- 
GitLab