From c296fc1908e4adf30ac16dce9c0b8d4a9b5f45c8 Mon Sep 17 00:00:00 2001
From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr>
Date: Mon, 21 Feb 2022 09:36:29 +0100
Subject: [PATCH] src/bed_handler/select_regulated_near_ctcf_exons.py: add filter_expressed function

---
Review note: filter_expressed now closes the expressed-genes file through a
context manager and stores its lines in a set; hunk offsets and the diffstat
were adjusted accordingly (the blob hash on the index line was not
recomputed). Line breaks and indentation of this patch were restored from a
whitespace-collapsed copy, so applying it may require --ignore-whitespace.

 .../select_regulated_near_ctcf_exons.py       | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/src/bed_handler/select_regulated_near_ctcf_exons.py b/src/bed_handler/select_regulated_near_ctcf_exons.py
index 11d37e4..6218984 100644
--- a/src/bed_handler/select_regulated_near_ctcf_exons.py
+++ b/src/bed_handler/select_regulated_near_ctcf_exons.py
@@ -15,6 +15,7 @@ import warnings
 from .get_other_exon_in_same_gene import create_gene_bed4norm
 from pathlib import Path
 import lazyparser as lp
+from typing import List
 
 
 def filter_ctcf_distance_table(df: pd.DataFrame, reg: str, threshold: int,
@@ -126,6 +127,22 @@ def filter_ctcf_distance_table(df: pd.DataFrame, reg: str, threshold: int,
     return df
 
 
+def filter_expressed(exon_list: List[str]) -> List[str]:
+    """
+    Filter only expressed exons.
+
+    An exon is kept when the part of its name located before the first
+    underscore is one of the lines of the file BedConfig.expressed_genes.
+
+    :param exon_list: A list of exons
+    :return: The list of expressed exons
+    """
+    # The with-block closes the file; a set makes each lookup O(1).
+    with BedConfig.expressed_genes.open('r') as infile:
+        egenes = set(infile.read().splitlines())
+    return [exon for exon in exon_list if exon.split("_")[0] in egenes]
+
+
 def create_bed_ctcf_exon(reg: str, threshold: int,
                          location: str, include0: bool = False,
                          near_ctcf: bool = True) -> None:
@@ -169,6 +186,7 @@ def create_bed_ctcf_exon(reg: str, threshold: int,
     bad_id = df['id'].to_list() if include0 \
         else df['id'].to_list() + df.loc[df['dist'] == 0, 'id'].to_list()
     list_exons = [e for e in tmp_exons if e not in bad_id]
+    list_exons = filter_expressed(list_exons)
     list_genes = [int(exon.split('_')[0]) for exon in list_exons]
     df_exon = filter_bed(BedConfig.exon_bed, list_exons)
     df_gene = filter_bed(BedConfig.gene_bed, list_genes)
@@ -239,6 +257,7 @@ def get_bed_ctcf_exon(exon_bed: str, threshold: int,
     bad_id = df['id'].to_list() if include0 \
         else df['id'].to_list() + df.loc[df['dist'] == 0, 'id'].to_list()
     list_exons = [e for e in tmp_exons if e not in bad_id]
+    list_exons = filter_expressed(list_exons)
     list_genes = [int(exon.split('_')[0]) for exon in list_exons]
     df_exon = filter_bed(BedConfig.exon_bed, list_exons)
     df_gene = filter_bed(BedConfig.gene_bed, list_genes)
-- 
GitLab