From aee24d7d0d288f7623533031e8c1bd3b15448c2c Mon Sep 17 00:00:00 2001
From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr>
Date: Tue, 3 Nov 2020 15:21:03 +0100
Subject: [PATCH] src/bed_handler/select_regulated_near_ctcf_exons.py: add a
 parameter include0 to include exons containing a CTCF site if wanted. Add a
 parameter near_ctcf that allows recovering regulated exons far from a CTCF
 site

---
 .../select_regulated_near_ctcf_exons.py       | 94 ++++++++++++++-----
 1 file changed, 71 insertions(+), 23 deletions(-)

diff --git a/src/bed_handler/select_regulated_near_ctcf_exons.py b/src/bed_handler/select_regulated_near_ctcf_exons.py
index 4389c9c..6894170 100644
--- a/src/bed_handler/select_regulated_near_ctcf_exons.py
+++ b/src/bed_handler/select_regulated_near_ctcf_exons.py
@@ -11,10 +11,11 @@ from .get_gene_regulated_by_ddx import load_sipp_vs_ctcf, format_exon_bed
 import pandas as pd
 from doctest import testmod
 from .filter_gene import filter_bed
+import warnings
 
 
 def filter_ctcf_distance_table(df: pd.DataFrame, reg: str, threshold: int,
-                               location: str) -> pd.DataFrame:
+                               location: str, include0: bool) -> pd.DataFrame:
     """
     Filter the dataframe to recover only regulated exons near CTCF.
 
@@ -22,57 +23,74 @@ def filter_ctcf_distance_table(df: pd.DataFrame, reg: str, threshold: int,
     :param reg: The regulation by CTCF
     :param threshold: The threshold distance
     :param location: The location of interest
+    :param include0: True to include exons containing a CTCF site if the \
+    threshold is greater than 0.
     :return: The filtered dataframe
 
     >>> cdf = load_sipp_vs_ctcf(TestConfig.sipp_vs_ctcf,
     ... format_exon_bed(TestConfig.exon_bed, TestConfig.gene_bed))
-    >>> rdf = filter_ctcf_distance_table(cdf, 'down', 4, 'upstream')
+    >>> rdf = filter_ctcf_distance_table(cdf, 'down', 4, 'upstream', False)
     >>> rdf[['exon_name', 'dist', 'group', 'id']]
       exon_name  dist      group   id
     0    DSC2_1    -4  siPP_DOWN  1_1
     1    DSC2_2    -3  siPP_DOWN  1_2
     2    DSC2_3    -2  siPP_DOWN  1_3
-    >>> rdf = filter_ctcf_distance_table(cdf, 'down', 2, 'upstream')
+    >>> rdf = filter_ctcf_distance_table(cdf, 'down', 4, 'upstream', True)
+    >>> rdf[['exon_name', 'dist', 'group', 'id']]
+      exon_name  dist      group   id
+    0    DSC2_1    -4  siPP_DOWN  1_1
+    1    DSC2_2    -3  siPP_DOWN  1_2
+    2    DSC2_3    -2  siPP_DOWN  1_3
+    4    DSC2_5     0  siPP_DOWN  1_5
+    >>> rdf = filter_ctcf_distance_table(cdf, 'down', 2, 'upstream', False)
     >>> rdf[['exon_name', 'dist', 'group', 'id']]
       exon_name  dist      group   id
     2    DSC2_3    -2  siPP_DOWN  1_3
-    >>> rdf = filter_ctcf_distance_table(cdf, 'down', 2, 'downstream')
+    >>> rdf = filter_ctcf_distance_table(cdf, 'down', 2, 'downstream', False)
     >>> rdf[['exon_name', 'dist', 'group', 'id']]
       exon_name  dist      group   id
     5    DSC2_6     1  siPP_DOWN  1_6
     6    DSC2_7     2  siPP_DOWN  1_7
-    >>> rdf = filter_ctcf_distance_table(cdf, 'down', 2, 'both')
+    >>> rdf = filter_ctcf_distance_table(cdf, 'down', 2, 'both', False)
     >>> rdf[['exon_name', 'dist', 'group', 'id']]
       exon_name  dist      group   id
     2    DSC2_3    -2  siPP_DOWN  1_3
     5    DSC2_6     1  siPP_DOWN  1_6
     6    DSC2_7     2  siPP_DOWN  1_7
-    >>> rdf = filter_ctcf_distance_table(cdf, 'up', 2, 'both')
+    >>> rdf = filter_ctcf_distance_table(cdf, 'up', 2, 'both', False)
     >>> rdf[['exon_name', 'dist', 'group', 'id']]
       exon_name  dist    group   id
     3    DSC2_4    -1  siPP_UP  1_4
-    >>> rdf = filter_ctcf_distance_table(cdf, 'up', 2, 'downstream')
+    >>> rdf = filter_ctcf_distance_table(cdf, 'up', 2, 'downstream', False)
     >>> rdf[['exon_name', 'dist', 'group', 'id']]
     Empty DataFrame
     Columns: [exon_name, dist, group, id]
     Index: []
-    >>> rdf = filter_ctcf_distance_table(cdf, 'all', 2, 'both')
+    >>> rdf = filter_ctcf_distance_table(cdf, 'all', 2, 'both', False)
     >>> rdf[['exon_name', 'dist', 'group', 'id']]
       exon_name  dist      group   id
     2    DSC2_3    -2  siPP_DOWN  1_3
     3    DSC2_4    -1    siPP_UP  1_4
     5    DSC2_6     1  siPP_DOWN  1_6
     6    DSC2_7     2  siPP_DOWN  1_7
-    >>> filter_ctcf_distance_table(cdf, 'lul', 2, 'both')
+    >>> rdf = filter_ctcf_distance_table(cdf, 'all', 2, 'both', True)
+    >>> rdf[['exon_name', 'dist', 'group', 'id']]
+      exon_name  dist      group   id
+    2    DSC2_3    -2  siPP_DOWN  1_3
+    3    DSC2_4    -1    siPP_UP  1_4
+    4    DSC2_5     0  siPP_DOWN  1_5
+    5    DSC2_6     1  siPP_DOWN  1_6
+    6    DSC2_7     2  siPP_DOWN  1_7
+    >>> filter_ctcf_distance_table(cdf, 'lul', 2, 'both', False)
     Traceback (most recent call last):
     ...
     ValueError: reg parameter should be one in: ['down', 'up', 'all']
-    >>> filter_ctcf_distance_table(cdf, 'up', 2, 'xd')
+    >>> filter_ctcf_distance_table(cdf, 'up', 2, 'xd', False)
     Traceback (most recent call last):
     ...
     ValueError: location parameter should be in ['upstream', \
 'downstream', 'both']
-    >>> rdf = filter_ctcf_distance_table(cdf, 'down', 0, 'both')
+    >>> rdf = filter_ctcf_distance_table(cdf, 'down', 0, 'both', False)
     >>> rdf[['exon_name', 'dist', 'group', 'id']]
       exon_name  dist      group   id
     4    DSC2_5     0  siPP_DOWN  1_5
@@ -87,45 +105,75 @@ def filter_ctcf_distance_table(df: pd.DataFrame, reg: str, threshold: int,
         df = df.loc[df["group"] == f"siPP_{reg.upper()}", :]
     if location == "upstream":
         df = df.loc[(df["dist"] >= threshold * -1) &
-                    (df["dist"] < 0), :]
+                    (df["dist"] < 0 if not include0
+                     else df["dist"] <= 0), :]
     elif location == "downstream":
         df = df.loc[(df["dist"] <= threshold) &
-                    (df["dist"] > 0), :]
+                    (df["dist"] > 0 if not include0
+                     else df["dist"] >= 0), :]
     else:
         if threshold == 0:
             df = df.loc[abs(df["dist"]) == 0, :]
         else:
-            df = df.loc[(abs(df["dist"]) <= threshold) &
-                        (df["dist"] != 0), :]
+            if not include0:
+                df = df.loc[(abs(df["dist"]) <= threshold) &
+                            (df["dist"] != 0), :]
+            else:
+                df = df.loc[abs(df["dist"]) <= threshold, :]
     return df
 
 
 def create_bed_ctcf_exon(reg: str, threshold: int,
-                         location: str) -> None:
+                         location: str, include0: bool = False,
+                         near_ctcf: bool = True) -> None:
     """
     Filter the dataframe to recover only regulated exons near CTCF.
 
     :param reg: The regulation by CTCF
     :param threshold: The threshold distance
     :param location: The location of interest
+    :param include0: True to include exons containing a CTCF site if the \
+    threshold is greater than 0.
+    :param near_ctcf: True to recover exons near CTCF, False to recover \
+    those far from CTCF
     """
     if threshold < 0:
         threshold = 0
     if threshold == 0:
         location = "both"
-    df = load_sipp_vs_ctcf(BedConfig.sipp_vs_ctcf,
-                           format_exon_bed(BedConfig.exon_bed,
-                                           BedConfig.gene_bed))
-    df = filter_ctcf_distance_table(df, reg, threshold, location)
-    list_exons = df['id'].to_list()
+    if not include0 and threshold == 0:
+        warnings.warn("include0 must be True when threshold = 0."
+                      "Setting include0 to true !")
+        include0 = True
+    if not near_ctcf and not include0:
+        warnings.warn("include0 must be True when near_ctcf is False")
+        include0 = True
+    i0 = "with0" if include0 else "without0"
+    df_reg = load_sipp_vs_ctcf(BedConfig.sipp_vs_ctcf,
+                               format_exon_bed(BedConfig.exon_bed,
+                                               BedConfig.gene_bed))
+    df = filter_ctcf_distance_table(df_reg, reg, threshold, location, include0)
+    if near_ctcf:
+        name_near = ""
+        list_exons = df['id'].to_list()
+    else:
+        name_near = "Far_"
+        if reg != "all":
+            tmp_exons = df_reg.loc[df_reg["group"] == f"siPP_{reg.upper()}",
+                                   "id"].to_list()
+        else:
+            tmp_exons = df_reg['id'].to_list()
+        list_exons = [e for e in tmp_exons if e not in df['id'].to_list()]
     list_genes = [int(exon.split('_')[0]) for exon in list_exons]
     df_exon = filter_bed(BedConfig.exon_bed, list_exons)
     df_gene = filter_bed(BedConfig.gene_bed, list_genes)
     df_exon.to_csv(BedConfig.bed.output /
-                   f"CTCF_{threshold}_{location}_ddx_{reg}_exon.bed", sep="\t",
+                   f"{name_near}CTCF_{threshold}_{location}_ddx_{reg}_{i0}_exon.bed",
+                   sep="\t",
                    index=False)
     df_gene.to_csv(BedConfig.bed.output /
-                   f"CTCF_{threshold}_{location}_ddx_{reg}_gene.bed", sep="\t",
+                   f"{name_near}CTCF_{threshold}_{location}_ddx_{reg}_{i0}_gene.bed",
+                   sep="\t",
                    index=False)
 
 
-- 
GitLab