From aee24d7d0d288f7623533031e8c1bd3b15448c2c Mon Sep 17 00:00:00 2001 From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr> Date: Tue, 3 Nov 2020 15:21:03 +0100 Subject: [PATCH] src/bed_handler/select_regulated_near_ctcf_exons.py: add a parameter include0 to optionally include exons containing a CTCF site. Add a parameter near_ctcf that allows recovering regulated exons far from a CTCF site --- .../select_regulated_near_ctcf_exons.py | 94 ++++++++++++++----- 1 file changed, 71 insertions(+), 23 deletions(-) diff --git a/src/bed_handler/select_regulated_near_ctcf_exons.py b/src/bed_handler/select_regulated_near_ctcf_exons.py index 4389c9c..6894170 100644 --- a/src/bed_handler/select_regulated_near_ctcf_exons.py +++ b/src/bed_handler/select_regulated_near_ctcf_exons.py @@ -11,10 +11,11 @@ from .get_gene_regulated_by_ddx import load_sipp_vs_ctcf, format_exon_bed import pandas as pd from doctest import testmod from .filter_gene import filter_bed +import warnings def filter_ctcf_distance_table(df: pd.DataFrame, reg: str, threshold: int, - location: str) -> pd.DataFrame: + location: str, include0: bool) -> pd.DataFrame: """ Filter the dataframe to recover only regulated exons near CTCF. @@ -22,57 +23,74 @@ def filter_ctcf_distance_table(df: pd.DataFrame, reg: str, threshold: int, :param reg: The regulation by CTCF :param threshold: The threshold distance :param location: The location of interest + :param include0: True to include exons containing a CTCF site if the \ + threshold is greater than 0. :return: The filtered dataframe >>> cdf = load_sipp_vs_ctcf(TestConfig.sipp_vs_ctcf, ...
format_exon_bed(TestConfig.exon_bed, TestConfig.gene_bed)) - >>> rdf = filter_ctcf_distance_table(cdf, 'down', 4, 'upstream') + >>> rdf = filter_ctcf_distance_table(cdf, 'down', 4, 'upstream', False) >>> rdf[['exon_name', 'dist', 'group', 'id']] exon_name dist group id 0 DSC2_1 -4 siPP_DOWN 1_1 1 DSC2_2 -3 siPP_DOWN 1_2 2 DSC2_3 -2 siPP_DOWN 1_3 - >>> rdf = filter_ctcf_distance_table(cdf, 'down', 2, 'upstream') + >>> rdf = filter_ctcf_distance_table(cdf, 'down', 4, 'upstream', True) + >>> rdf[['exon_name', 'dist', 'group', 'id']] + exon_name dist group id + 0 DSC2_1 -4 siPP_DOWN 1_1 + 1 DSC2_2 -3 siPP_DOWN 1_2 + 2 DSC2_3 -2 siPP_DOWN 1_3 + 4 DSC2_5 0 siPP_DOWN 1_5 + >>> rdf = filter_ctcf_distance_table(cdf, 'down', 2, 'upstream', False) >>> rdf[['exon_name', 'dist', 'group', 'id']] exon_name dist group id 2 DSC2_3 -2 siPP_DOWN 1_3 - >>> rdf = filter_ctcf_distance_table(cdf, 'down', 2, 'downstream') + >>> rdf = filter_ctcf_distance_table(cdf, 'down', 2, 'downstream', False) >>> rdf[['exon_name', 'dist', 'group', 'id']] exon_name dist group id 5 DSC2_6 1 siPP_DOWN 1_6 6 DSC2_7 2 siPP_DOWN 1_7 - >>> rdf = filter_ctcf_distance_table(cdf, 'down', 2, 'both') + >>> rdf = filter_ctcf_distance_table(cdf, 'down', 2, 'both', False) >>> rdf[['exon_name', 'dist', 'group', 'id']] exon_name dist group id 2 DSC2_3 -2 siPP_DOWN 1_3 5 DSC2_6 1 siPP_DOWN 1_6 6 DSC2_7 2 siPP_DOWN 1_7 - >>> rdf = filter_ctcf_distance_table(cdf, 'up', 2, 'both') + >>> rdf = filter_ctcf_distance_table(cdf, 'up', 2, 'both', False) >>> rdf[['exon_name', 'dist', 'group', 'id']] exon_name dist group id 3 DSC2_4 -1 siPP_UP 1_4 - >>> rdf = filter_ctcf_distance_table(cdf, 'up', 2, 'downstream') + >>> rdf = filter_ctcf_distance_table(cdf, 'up', 2, 'downstream', False) >>> rdf[['exon_name', 'dist', 'group', 'id']] Empty DataFrame Columns: [exon_name, dist, group, id] Index: [] - >>> rdf = filter_ctcf_distance_table(cdf, 'all', 2, 'both') + >>> rdf = filter_ctcf_distance_table(cdf, 'all', 2, 'both', False) >>> 
rdf[['exon_name', 'dist', 'group', 'id']] exon_name dist group id 2 DSC2_3 -2 siPP_DOWN 1_3 3 DSC2_4 -1 siPP_UP 1_4 5 DSC2_6 1 siPP_DOWN 1_6 6 DSC2_7 2 siPP_DOWN 1_7 - >>> filter_ctcf_distance_table(cdf, 'lul', 2, 'both') + >>> rdf = filter_ctcf_distance_table(cdf, 'all', 2, 'both', True) + >>> rdf[['exon_name', 'dist', 'group', 'id']] + exon_name dist group id + 2 DSC2_3 -2 siPP_DOWN 1_3 + 3 DSC2_4 -1 siPP_UP 1_4 + 4 DSC2_5 0 siPP_DOWN 1_5 + 5 DSC2_6 1 siPP_DOWN 1_6 + 6 DSC2_7 2 siPP_DOWN 1_7 + >>> filter_ctcf_distance_table(cdf, 'lul', 2, 'both', False) Traceback (most recent call last): ... ValueError: reg parameter should be one in: ['down', 'up', 'all'] - >>> filter_ctcf_distance_table(cdf, 'up', 2, 'xd') + >>> filter_ctcf_distance_table(cdf, 'up', 2, 'xd', False) Traceback (most recent call last): ... ValueError: location parameter should be in ['upstream', \ 'downstream', 'both'] - >>> rdf = filter_ctcf_distance_table(cdf, 'down', 0, 'both') + >>> rdf = filter_ctcf_distance_table(cdf, 'down', 0, 'both', False) >>> rdf[['exon_name', 'dist', 'group', 'id']] exon_name dist group id 4 DSC2_5 0 siPP_DOWN 1_5 @@ -87,45 +105,75 @@ def filter_ctcf_distance_table(df: pd.DataFrame, reg: str, threshold: int, df = df.loc[df["group"] == f"siPP_{reg.upper()}", :] if location == "upstream": df = df.loc[(df["dist"] >= threshold * -1) & - (df["dist"] < 0), :] + (df["dist"] < 0 if not include0 + else df["dist"] <= 0), :] elif location == "downstream": df = df.loc[(df["dist"] <= threshold) & - (df["dist"] > 0), :] + (df["dist"] > 0 if not include0 + else df["dist"] >= 0), :] else: if threshold == 0: df = df.loc[abs(df["dist"]) == 0, :] else: - df = df.loc[(abs(df["dist"]) <= threshold) & - (df["dist"] != 0), :] + if not include0: + df = df.loc[(abs(df["dist"]) <= threshold) & + (df["dist"] != 0), :] + else: + df = df.loc[abs(df["dist"]) <= threshold, :] return df def create_bed_ctcf_exon(reg: str, threshold: int, - location: str) -> None: + location: str, include0: bool = 
False, + near_ctcf: bool = True) -> None: """ Filter the dataframe to recover only regulated exons near CTCF. :param reg: The regulation by CTCF :param threshold: The threshold distance :param location: The location of interest + :param include0: True to include exons containing a CTCF site if the \ + threshold is greater than 0. + :param near_ctcf: True to recover exons near CTCF False to recover \ + those far from CTCF """ if threshold < 0: threshold = 0 if threshold == 0: location = "both" - df = load_sipp_vs_ctcf(BedConfig.sipp_vs_ctcf, - format_exon_bed(BedConfig.exon_bed, - BedConfig.gene_bed)) - df = filter_ctcf_distance_table(df, reg, threshold, location) - list_exons = df['id'].to_list() + if not include0 and threshold == 0: + warnings.warn("include0 must be True when threshold = 0." + "Setting include0 to true !") + include0 = True + if not near_ctcf and not include0: + warnings.warn("include0 must be True when near_ctcf is False") + include0 = True + i0 = "with0" if include0 else "without0" + df_reg = load_sipp_vs_ctcf(BedConfig.sipp_vs_ctcf, + format_exon_bed(BedConfig.exon_bed, + BedConfig.gene_bed)) + df = filter_ctcf_distance_table(df_reg, reg, threshold, location, include0) + if near_ctcf: + name_near = "" + list_exons = df['id'].to_list() + else: + name_near = "Far_" + if reg != "all": + tmp_exons = df_reg.loc[df_reg["group"] == f"siPP_{reg.upper()}", + "id"].to_list() + else: + tmp_exons = df_reg['id'].to_list() + list_exons = [e for e in tmp_exons if e not in df['id'].to_list()] list_genes = [int(exon.split('_')[0]) for exon in list_exons] df_exon = filter_bed(BedConfig.exon_bed, list_exons) df_gene = filter_bed(BedConfig.gene_bed, list_genes) df_exon.to_csv(BedConfig.bed.output / - f"CTCF_{threshold}_{location}_ddx_{reg}_exon.bed", sep="\t", + f"{name_near}CTCF_{threshold}_{location}_ddx_{reg}_{i0}_exon.bed", + sep="\t", index=False) df_gene.to_csv(BedConfig.bed.output / - f"CTCF_{threshold}_{location}_ddx_{reg}_gene.bed", sep="\t", + 
f"{name_near}CTCF_{threshold}_{location}_ddx_{reg}_{i0}_gene.bed", + sep="\t", index=False) -- GitLab