From 0117f4ca6c0d4e1b3001cecc0ac11fb88dd90ec5 Mon Sep 17 00:00:00 2001 From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr> Date: Mon, 15 Mar 2021 18:16:31 +0100 Subject: [PATCH] src/bed_handler/select_regulated_near_ctcf_exons.py: new function get_bed_ctcf_exon to only recover exons close to CTCF peaks in a given bed file containing exons --- .../select_regulated_near_ctcf_exons.py | 86 ++++++++++++++++++- 1 file changed, 82 insertions(+), 4 deletions(-) diff --git a/src/bed_handler/select_regulated_near_ctcf_exons.py b/src/bed_handler/select_regulated_near_ctcf_exons.py index a9c2fdb..11d37e4 100644 --- a/src/bed_handler/select_regulated_near_ctcf_exons.py +++ b/src/bed_handler/select_regulated_near_ctcf_exons.py @@ -13,6 +13,8 @@ from doctest import testmod from .filter_gene import filter_bed import warnings from .get_other_exon_in_same_gene import create_gene_bed4norm +from pathlib import Path +import lazyparser as lp def filter_ctcf_distance_table(df: pd.DataFrame, reg: str, threshold: int, @@ -128,9 +130,10 @@ def create_bed_ctcf_exon(reg: str, threshold: int, location: str, include0: bool = False, near_ctcf: bool = True) -> None: """ - Filter the dataframe to recover only regulated exons near CTCF. + Filter the dataframe of exon regulated by ddx5/17 to recover only \ + regulated exons near CTCF. - :param reg: The regulation by CTCF + :param reg: The regulation by DDX5/17 :param threshold: The threshold distance :param location: The location of interest :param include0: True to include exons containing a CTCF site if the \ @@ -163,7 +166,9 @@ def create_bed_ctcf_exon(reg: str, threshold: int, "id"].to_list() else: tmp_exons = df_reg['id'].to_list() - list_exons = [e for e in tmp_exons if e not in df['id'].to_list()] + bad_id = df['id'].to_list() if include0 \ + else df['id'].to_list() + df.loc[df['dist'] == 0, 'id'].to_list() + list_exons = [e for e in tmp_exons if e not in bad_id] list_genes = [int(exon.split('_')[0]) for exon in list_exons] df_exon = filter_bed(BedConfig.exon_bed, list_exons) df_gene = filter_bed(BedConfig.gene_bed, list_genes) @@ -185,6 +190,79 @@ def create_bed_ctcf_exon(reg: str, threshold: int, index=False) +@lp.parse(exon_bed="file", location=["upstream", "downstream", "both"], + threshold="threshold >= 0") +def get_bed_ctcf_exon(exon_bed: str, threshold: int, + location: str, include0: bool = False, + near_ctcf: bool = True, + name_near: str = "") -> None: + """ + Filter a bed file containing exons to recover only regulated exons \ + near or far from CTCF. + + :param exon_bed: a bed containing exons + :param threshold: The threshold distance + :param location: The location of interest + :param include0: True to include exons containing a CTCF site if the \ + threshold is greater than 0. + :param near_ctcf: True to recover exons near CTCF False to recover \ + those far from CTCF + :param name_near: prefix of the output file name + """ + if name_near == "": + name_near = Path(exon_bed).name.replace(".bed", "") + threshold = max(threshold, 0) + if threshold == 0: + location = "both" + if not include0 and threshold == 0: + warnings.warn("include0 must be True when threshold = 0." + "Setting include0 to true !") + include0 = True + if not near_ctcf and not include0: + warnings.warn("include0 must be True when near_ctcf is False") + include0 = True + i0 = "with0" if include0 else "without0" + df_reg = load_sipp_vs_ctcf(BedConfig.all_vs_ctcf, + format_exon_bed(Path(exon_bed), + BedConfig.gene_bed)) + df_reg = df_reg[-df_reg["gene_id"].isna()] + df_reg["gene_id"] = df_reg["gene_id"].astype(int) + df_reg["exon_pos"] = df_reg["exon_pos"].astype(int) + df = filter_ctcf_distance_table(df_reg, 'all', threshold, location, + include0) + if near_ctcf: + name_near = name_near + "_near_" + list_exons = df['id'].to_list() + else: + name_near = name_near + "_far_" + tmp_exons = df_reg['id'].to_list() + bad_id = df['id'].to_list() if include0 \ + else df['id'].to_list() + df.loc[df['dist'] == 0, 'id'].to_list() + list_exons = [e for e in tmp_exons if e not in bad_id] + list_genes = [int(exon.split('_')[0]) for exon in list_exons] + df_exon = filter_bed(BedConfig.exon_bed, list_exons) + df_gene = filter_bed(BedConfig.gene_bed, list_genes) + df_exon.to_csv(BedConfig.bed.output / + f"{name_near}CTCF_{threshold}_{location}_ddx_" + f"{i0}_exon.bed", + sep="\t", + index=False) + df_gene.to_csv(BedConfig.bed.output / + f"{name_near}CTCF_{threshold}_{location}_ddx_" + f"{i0}_gene.bed", + sep="\t", + index=False) + df_gene = create_gene_bed4norm(BedConfig.gene_bed, df_exon) + df_gene.to_csv(BedConfig.bed.output / + f"{name_near}CTCF_{threshold}_{location}_ddx_" + f"{i0}_gene-dup.bed", + sep="\t", + index=False) + if __name__ == "__main__": - testmod() + import sys + if len(sys.argv) == 1: + testmod() + else: + get_bed_ctcf_exon() -- GitLab