From 0117f4ca6c0d4e1b3001cecc0ac11fb88dd90ec5 Mon Sep 17 00:00:00 2001
From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr>
Date: Mon, 15 Mar 2021 18:16:31 +0100
Subject: [PATCH] src/bed_handler/select_regulated_near_ctcf_exons.py: new
 function get_bed_ctcf_exon to only recover exons close to CTCF peaks in a
 given bed file containing exons

---
 .../select_regulated_near_ctcf_exons.py       | 86 ++++++++++++++++++-
 1 file changed, 82 insertions(+), 4 deletions(-)

diff --git a/src/bed_handler/select_regulated_near_ctcf_exons.py b/src/bed_handler/select_regulated_near_ctcf_exons.py
index a9c2fdb..11d37e4 100644
--- a/src/bed_handler/select_regulated_near_ctcf_exons.py
+++ b/src/bed_handler/select_regulated_near_ctcf_exons.py
@@ -13,6 +13,8 @@ from doctest import testmod
 from .filter_gene import filter_bed
 import warnings
 from .get_other_exon_in_same_gene import create_gene_bed4norm
+from pathlib import Path
+import lazyparser as lp
 
 
 def filter_ctcf_distance_table(df: pd.DataFrame, reg: str, threshold: int,
@@ -128,9 +130,10 @@ def create_bed_ctcf_exon(reg: str, threshold: int,
                          location: str, include0: bool = False,
                          near_ctcf: bool = True) -> None:
     """
-    Filter the dataframe to recover only regulated exons near CTCF.
+    Filter the dataframe of exon regulated by ddx5/17 to recover only \
+    regulated exons near CTCF.
 
-    :param reg: The regulation by CTCF
+    :param reg: The regulation by DDX5/17
     :param threshold: The threshold distance
     :param location: The location of interest
     :param include0: True to include exons containing a CTCF site if the \
@@ -163,7 +166,9 @@ def create_bed_ctcf_exon(reg: str, threshold: int,
                                    "id"].to_list()
         else:
             tmp_exons = df_reg['id'].to_list()
-        list_exons = [e for e in tmp_exons if e not in df['id'].to_list()]
+        bad_id = df['id'].to_list() if include0 \
+            else df['id'].to_list() + df.loc[df['dist'] == 0, 'id'].to_list()
+        list_exons = [e for e in tmp_exons if e not in bad_id]
     list_genes = [int(exon.split('_')[0]) for exon in list_exons]
     df_exon = filter_bed(BedConfig.exon_bed, list_exons)
     df_gene = filter_bed(BedConfig.gene_bed, list_genes)
@@ -185,6 +190,79 @@ def create_bed_ctcf_exon(reg: str, threshold: int,
                    index=False)
 
 
+@lp.parse(exon_bed="file", location=["upstream", "downstream", "both"],
+          threshold="threshold >= 0")
+def get_bed_ctcf_exon(exon_bed: str, threshold: int,
+                      location: str, include0: bool = False,
+                      near_ctcf: bool = True,
+                      name_near: str = "") -> None:
+    """
+    Filter a bed file containing exons to recover only regulated exons \
+    near or far from CTCF.
+
+    :param exon_bed: a bed containing exons
+    :param threshold: The threshold distance
+    :param location: The location of interest
+    :param include0: True to include exons containing a CTCF site if the \
+    threshold is greater than 0.
+    :param near_ctcf: True to recover exons near CTCF False to recover \
+    those far from CTCF
+    :param name_near: prefix of the output file name
+    """
+    if name_near == "":
+        name_near = Path(exon_bed).name.replace(".bed", "")
+    threshold = max(threshold, 0)
+    if threshold == 0:
+        location = "both"
+    if not include0 and threshold == 0:
+        warnings.warn("include0 must be True when threshold = 0."
+                      "Setting include0 to true !")
+        include0 = True
+    if not near_ctcf and not include0:
+        warnings.warn("include0 must be True when near_ctcf is False")
+        include0 = True
+    i0 = "with0" if include0 else "without0"
+    df_reg = load_sipp_vs_ctcf(BedConfig.all_vs_ctcf,
+                               format_exon_bed(Path(exon_bed),
+                                               BedConfig.gene_bed))
+    df_reg = df_reg[-df_reg["gene_id"].isna()]
+    df_reg["gene_id"] = df_reg["gene_id"].astype(int)
+    df_reg["exon_pos"] = df_reg["exon_pos"].astype(int)
+    df = filter_ctcf_distance_table(df_reg, 'all', threshold, location,
+                                    include0)
+    if near_ctcf:
+        name_near = name_near + "_near_"
+        list_exons = df['id'].to_list()
+    else:
+        name_near = name_near + "_far_"
+        tmp_exons = df_reg['id'].to_list()
+        bad_id = df['id'].to_list() if include0 \
+            else df['id'].to_list() + df.loc[df['dist'] == 0, 'id'].to_list()
+        list_exons = [e for e in tmp_exons if e not in bad_id]
+    list_genes = [int(exon.split('_')[0]) for exon in list_exons]
+    df_exon = filter_bed(BedConfig.exon_bed, list_exons)
+    df_gene = filter_bed(BedConfig.gene_bed, list_genes)
+    df_exon.to_csv(BedConfig.bed.output /
+                   f"{name_near}CTCF_{threshold}_{location}_ddx_"
+                   f"{i0}_exon.bed",
+                   sep="\t",
+                   index=False)
+    df_gene.to_csv(BedConfig.bed.output /
+                   f"{name_near}CTCF_{threshold}_{location}_ddx_"
+                   f"{i0}_gene.bed",
+                   sep="\t",
+                   index=False)
+    df_gene = create_gene_bed4norm(BedConfig.gene_bed, df_exon)
+    df_gene.to_csv(BedConfig.bed.output /
+                   f"{name_near}CTCF_{threshold}_{location}_ddx_"
+                   f"{i0}_gene-dup.bed",
+                   sep="\t",
+                   index=False)
+
 
 if __name__ == "__main__":
-    testmod()
+    import sys
+    if len(sys.argv) == 1:
+        testmod()
+    else:
+        get_bed_ctcf_exon()
-- 
GitLab