From c296fc1908e4adf30ac16dce9c0b8d4a9b5f45c8 Mon Sep 17 00:00:00 2001
From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr>
Date: Mon, 21 Feb 2022 09:36:29 +0100
Subject: [PATCH] src/bed_handler/select_regulated_near_ctcf_exons.py: add
 filter_expressed function

---
 .../select_regulated_near_ctcf_exons.py            | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/bed_handler/select_regulated_near_ctcf_exons.py b/src/bed_handler/select_regulated_near_ctcf_exons.py
index 11d37e4..6218984 100644
--- a/src/bed_handler/select_regulated_near_ctcf_exons.py
+++ b/src/bed_handler/select_regulated_near_ctcf_exons.py
@@ -15,6 +15,7 @@ import warnings
 from .get_other_exon_in_same_gene import create_gene_bed4norm
 from pathlib import Path
 import lazyparser as lp
+from typing import List
 
 
 def filter_ctcf_distance_table(df: pd.DataFrame, reg: str, threshold: int,
@@ -126,6 +127,17 @@ def filter_ctcf_distance_table(df: pd.DataFrame, reg: str, threshold: int,
     return df
 
 
+def filter_expressed(exon_list: List[str]) -> List[str]:
+    """Filter only exons whose gene id is in BedConfig.expressed_genes.
+
+    :param exon_list: A list of exons; the gene id is the text before '_'
+    :return: The list of expressed exons, in the order of exon_list
+    """
+    with BedConfig.expressed_genes.open('r') as infile:  # always closed
+        egenes = set(infile.read().splitlines())  # set: O(1) lookups
+    return [exon for exon in exon_list if exon.split("_")[0] in egenes]
+
+
 def create_bed_ctcf_exon(reg: str, threshold: int,
                          location: str, include0: bool = False,
                          near_ctcf: bool = True) -> None:
@@ -169,6 +181,7 @@ def create_bed_ctcf_exon(reg: str, threshold: int,
         bad_id = df['id'].to_list() if include0 \
             else df['id'].to_list() + df.loc[df['dist'] == 0, 'id'].to_list()
         list_exons = [e for e in tmp_exons if e not in bad_id]
+    list_exons = filter_expressed(list_exons)
     list_genes = [int(exon.split('_')[0]) for exon in list_exons]
     df_exon = filter_bed(BedConfig.exon_bed, list_exons)
     df_gene = filter_bed(BedConfig.gene_bed, list_genes)
@@ -239,6 +252,7 @@ def get_bed_ctcf_exon(exon_bed: str, threshold: int,
         bad_id = df['id'].to_list() if include0 \
             else df['id'].to_list() + df.loc[df['dist'] == 0, 'id'].to_list()
         list_exons = [e for e in tmp_exons if e not in bad_id]
+    list_exons = filter_expressed(list_exons)
     list_genes = [int(exon.split('_')[0]) for exon in list_exons]
     df_exon = filter_bed(BedConfig.exon_bed, list_exons)
     df_gene = filter_bed(BedConfig.gene_bed, list_genes)
-- 
GitLab