From 2cd7037bbe9317a630ae537c7ec3e58e3bf42d49 Mon Sep 17 00:00:00 2001
From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr>
Date: Fri, 23 Oct 2020 16:12:40 +0200
Subject: [PATCH] src.bed_handler: creation of bed of regulated exons by
 DDX5/17

---
 src/bed_handler/__main__.py                   |   8 ++
 src/bed_handler/get_gene_regulated_by_ddx.py  |   8 +-
 .../select_regulated_near_ctcf_exons.py       | 122 ++++++++++++++++++
 3 files changed, 134 insertions(+), 4 deletions(-)
 create mode 100644 src/bed_handler/select_regulated_near_ctcf_exons.py

diff --git a/src/bed_handler/__main__.py b/src/bed_handler/__main__.py
index 4f87f0c..79bc3dc 100644
--- a/src/bed_handler/__main__.py
+++ b/src/bed_handler/__main__.py
@@ -10,6 +10,7 @@ visualisation
 from .filter_gene import create_filtered_bed
 from .get_gene_locations import create_region_bed
 from .get_gene_regulated_by_ddx import write_regulated_gene_file
+from .select_regulated_near_ctcf_exons import create_bed_ctcf_exon
 
 
 def launcher():
@@ -19,6 +20,13 @@ def launcher():
     write_regulated_gene_file()
     create_filtered_bed()
     create_region_bed()
+    create_bed_ctcf_exon("down", 0, "both")
+    create_bed_ctcf_exon("down", 1000, "both")
+    create_bed_ctcf_exon("down", 2000, "both")
+    create_bed_ctcf_exon("down", 1000, "upstream")
+    create_bed_ctcf_exon("down", 2000, "upstream")
+    create_bed_ctcf_exon("down", 1000, "downstream")
+    create_bed_ctcf_exon("down", 2000, "downstream")
 
 
 launcher()
diff --git a/src/bed_handler/get_gene_regulated_by_ddx.py b/src/bed_handler/get_gene_regulated_by_ddx.py
index b48e5b2..dd4b6f9 100644
--- a/src/bed_handler/get_gene_regulated_by_ddx.py
+++ b/src/bed_handler/get_gene_regulated_by_ddx.py
@@ -49,7 +49,7 @@ def format_exon_bed(exon_bed: Path, gene_bed: Path) -> pd.DataFrame:
                      'exon_name']]
 
 
-def load_sipp_vs_ctct(sipp_file: Path, exon_table: pd.DataFrame
+def load_sipp_vs_ctcf(sipp_file: Path, exon_table: pd.DataFrame
                       ) -> pd.DataFrame:
     """
     Load the file containing exons regulated by DDX5 17 and containing their \
@@ -60,7 +60,7 @@ def load_sipp_vs_ctct(sipp_file: Path, exon_table: pd.DataFrame
     containing their distance to CTCF.
     :return: Same input table as a dataframe
 
-    >>> df = load_sipp_vs_ctct(TestConfig.sipp_vs_ctcf,
+    >>> df = load_sipp_vs_ctcf(TestConfig.sipp_vs_ctcf,
     ... format_exon_bed(TestConfig.exon_bed, TestConfig.gene_bed))
     >>> df[['exon_name', "exon_id", 'group', 'id', 'gene_id']]
       exon_name               exon_id      group   id  gene_id
@@ -85,7 +85,7 @@ def get_ddx_genes(df: pd.DataFrame) -> List[int]:
     :param df: A dataframe of DDX regulated exons
     :return: Return the unique values in gene_id column.
 
-    >>> df = load_sipp_vs_ctct(TestConfig.sipp_vs_ctcf,
+    >>> df = load_sipp_vs_ctcf(TestConfig.sipp_vs_ctcf,
     ... format_exon_bed(TestConfig.exon_bed, TestConfig.gene_bed))
     >>> get_ddx_genes(df)
     [1]
@@ -99,7 +99,7 @@ def write_regulated_gene_file():
     """
     BedConfig.bed.output.mkdir(exist_ok=True, parents=True)
     exon_table = format_exon_bed(BedConfig.exon_bed, BedConfig.gene_bed)
-    final_table = load_sipp_vs_ctct(BedConfig.sipp_vs_ctcf, exon_table)
+    final_table = load_sipp_vs_ctcf(BedConfig.sipp_vs_ctcf, exon_table)
     list_gene = get_ddx_genes(final_table)
     with BedConfig.bed.ddx_genes.open('w')as outfile:
         for gene in list_gene:
diff --git a/src/bed_handler/select_regulated_near_ctcf_exons.py b/src/bed_handler/select_regulated_near_ctcf_exons.py
new file mode 100644
index 0000000..8c1bab0
--- /dev/null
+++ b/src/bed_handler/select_regulated_near_ctcf_exons.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+
+# -*- coding: UTF-8 -*-
+
+"""
+Description: Create bed of exons regulated by DDX5/17 and near CTCF
+"""
+
+from .config import TestConfig, BedConfig
+from .get_gene_regulated_by_ddx import load_sipp_vs_ctcf, format_exon_bed
+import pandas as pd
+from doctest import testmod
+
+
+def filter_ctcf_distance_table(df: pd.DataFrame, reg: str, threshold: int,
+                               location: str) -> pd.DataFrame:
+    """
+    Filter the dataframe to recover only regulated exons near CTCF.
+
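+    :param df: Dataframe of DDX5/17 regulated exons with their CTCF distance
+    :param reg: The regulation group to keep ('down', 'up' or 'all')
+    :param threshold: The maximum distance to CTCF, bounds included
+    :param location: 'upstream' (dist <= 0), 'downstream' (dist >= 0) or 'both'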
+    :return: The filtered dataframe
+
+    >>> cdf = load_sipp_vs_ctcf(TestConfig.sipp_vs_ctcf,
+    ... format_exon_bed(TestConfig.exon_bed, TestConfig.gene_bed))
+    >>> rdf = filter_ctcf_distance_table(cdf, 'down', 4, 'upstream')
+    >>> rdf[['exon_name', 'dist', 'group', 'id']]
+      exon_name  dist      group   id
+    0    DSC2_1    -4  siPP_DOWN  1_1
+    1    DSC2_2    -3  siPP_DOWN  1_2
+    2    DSC2_3    -2  siPP_DOWN  1_3
+    4    DSC2_5     0  siPP_DOWN  1_5
+    >>> rdf = filter_ctcf_distance_table(cdf, 'down', 2, 'upstream')
+    >>> rdf[['exon_name', 'dist', 'group', 'id']]
+      exon_name  dist      group   id
+    2    DSC2_3    -2  siPP_DOWN  1_3
+    4    DSC2_5     0  siPP_DOWN  1_5
+    >>> rdf = filter_ctcf_distance_table(cdf, 'down', 2, 'downstream')
+    >>> rdf[['exon_name', 'dist', 'group', 'id']]
+      exon_name  dist      group   id
+    4    DSC2_5     0  siPP_DOWN  1_5
+    5    DSC2_6     1  siPP_DOWN  1_6
+    6    DSC2_7     2  siPP_DOWN  1_7
+    >>> rdf = filter_ctcf_distance_table(cdf, 'down', 2, 'both')
+    >>> rdf[['exon_name', 'dist', 'group', 'id']]
+      exon_name  dist      group   id
+    2    DSC2_3    -2  siPP_DOWN  1_3
+    4    DSC2_5     0  siPP_DOWN  1_5
+    5    DSC2_6     1  siPP_DOWN  1_6
+    6    DSC2_7     2  siPP_DOWN  1_7
+    >>> rdf = filter_ctcf_distance_table(cdf, 'up', 2, 'both')
+    >>> rdf[['exon_name', 'dist', 'group', 'id']]
+      exon_name  dist    group   id
+    3    DSC2_4    -1  siPP_UP  1_4
+    >>> rdf = filter_ctcf_distance_table(cdf, 'up', 2, 'downstream')
+    >>> rdf[['exon_name', 'dist', 'group', 'id']]
+    Empty DataFrame
+    Columns: [exon_name, dist, group, id]
+    Index: []
+    >>> rdf = filter_ctcf_distance_table(cdf, 'all', 2, 'both')
+    >>> rdf[['exon_name', 'dist', 'group', 'id']]
+      exon_name  dist      group   id
+    2    DSC2_3    -2  siPP_DOWN  1_3
+    3    DSC2_4    -1    siPP_UP  1_4
+    4    DSC2_5     0  siPP_DOWN  1_5
+    5    DSC2_6     1  siPP_DOWN  1_6
+    6    DSC2_7     2  siPP_DOWN  1_7
+    >>> filter_ctcf_distance_table(cdf, 'lul', 2, 'both')
+    Traceback (most recent call last):
+    ...
+    ValueError: reg parameter should be one in: ['down', 'up', 'all']
+    >>> filter_ctcf_distance_table(cdf, 'up', 2, 'xd')
+    Traceback (most recent call last):
+    ...
+    ValueError: location parameter should be in ['upstream', \
+'downstream', 'both']
+    """
+    if reg not in ['down', 'up', 'all']:
+        raise ValueError(f"reg parameter should be one in: "
+                         f"['down', 'up', 'all']")
+    if location not in ['upstream', 'downstream', 'both']:
+        raise ValueError(f"location parameter should be in "
+                         f"['upstream', 'downstream', 'both']")
+    if reg != "all":
+        df = df.loc[df["group"] == f"siPP_{reg.upper()}", :]
+    if location == "upstream":
+        df = df.loc[(df["dist"] >= threshold * -1) &
+                    (df["dist"] <= 0), :]
+    elif location == "downstream":
+        df = df.loc[(df["dist"] <= threshold) &
+                    (df["dist"] >= 0), :]
+    else:
+        df = df.loc[abs(df["dist"]) <= threshold, :]
+    return df
+
+
+def create_bed_ctcf_exon(reg: str, threshold: int,
+                         location: str) -> None:
+    """
+    Write a bed file of the regulated exons located near CTCF.
+
+    :param reg: The regulation group to keep ('down', 'up' or 'all')
+    :param threshold: The maximum distance to CTCF (negative values become 0)
+    :param location: 'upstream', 'downstream' or 'both' ('both' if threshold=0)
+    """
+    if threshold < 0:
+        threshold = 0
+    if threshold == 0:
+        location = "both"
+    df = load_sipp_vs_ctcf(BedConfig.sipp_vs_ctcf,
+                           format_exon_bed(BedConfig.exon_bed,
+                                           BedConfig.gene_bed))
+    df = filter_ctcf_distance_table(df, reg, threshold, location)
+    df.to_csv(BedConfig.bed.output / f"CTCF_{threshold}_{location}_"
+                                     f"ddx_{reg}_exon.bed", sep="\t",
+              index=False)
+
+
+if __name__ == "__main__":
+    testmod()
-- 
GitLab