From 2cd7037bbe9317a630ae537c7ec3e58e3bf42d49 Mon Sep 17 00:00:00 2001
From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr>
Date: Fri, 23 Oct 2020 16:12:40 +0200
Subject: [PATCH] src.bed_handler: creation of bed of regulated exons by DDX5/17

---
Review notes:
- create_bed_ctcf_exon() now creates the output folder itself (same call as
  in write_regulated_gene_file()) and gets a docstring describing what it
  really does (it was a copy of the one of filter_ctcf_distance_table()).
- Docstrings now say that exons are regulated by DDX5/17 (not by CTCF).
- Removed f-strings without placeholder, grouped the imports
  (stdlib / third-party / local), simplified the distance comparisons.
- The line breaks and the indentation of this patch were rebuilt from a copy
  whose whitespace was collapsed: the whitespace of context lines is a best
  guess (use `git apply --ignore-whitespace` if a hunk is rejected) and the
  blob id on the `index` line of the new file is still the one of the first
  version.

 src/bed_handler/__main__.py                   |   8 ++
 src/bed_handler/get_gene_regulated_by_ddx.py  |   8 +-
 .../select_regulated_near_ctcf_exons.py       | 128 ++++++++++++++++++
 3 files changed, 140 insertions(+), 4 deletions(-)
 create mode 100644 src/bed_handler/select_regulated_near_ctcf_exons.py

diff --git a/src/bed_handler/__main__.py b/src/bed_handler/__main__.py
index 4f87f0c..79bc3dc 100644
--- a/src/bed_handler/__main__.py
+++ b/src/bed_handler/__main__.py
@@ -10,6 +10,7 @@ visualisation
 from .filter_gene import create_filtered_bed
 from .get_gene_locations import create_region_bed
 from .get_gene_regulated_by_ddx import write_regulated_gene_file
+from .select_regulated_near_ctcf_exons import create_bed_ctcf_exon
 
 
 def launcher():
@@ -19,6 +20,13 @@ def launcher():
     write_regulated_gene_file()
     create_filtered_bed()
     create_region_bed()
+    create_bed_ctcf_exon("down", 0, "both")
+    create_bed_ctcf_exon("down", 1000, "both")
+    create_bed_ctcf_exon("down", 2000, "both")
+    create_bed_ctcf_exon("down", 1000, "upstream")
+    create_bed_ctcf_exon("down", 2000, "upstream")
+    create_bed_ctcf_exon("down", 1000, "downstream")
+    create_bed_ctcf_exon("down", 2000, "downstream")
 
 
 launcher()
diff --git a/src/bed_handler/get_gene_regulated_by_ddx.py b/src/bed_handler/get_gene_regulated_by_ddx.py
index b48e5b2..dd4b6f9 100644
--- a/src/bed_handler/get_gene_regulated_by_ddx.py
+++ b/src/bed_handler/get_gene_regulated_by_ddx.py
@@ -49,7 +49,7 @@ def format_exon_bed(exon_bed: Path, gene_bed: Path) -> pd.DataFrame:
                'exon_name']]
 
 
-def load_sipp_vs_ctct(sipp_file: Path, exon_table: pd.DataFrame
+def load_sipp_vs_ctcf(sipp_file: Path, exon_table: pd.DataFrame
                       ) -> pd.DataFrame:
     """
     Load the file containing exons regulated by DDX5 17 and containing their \
@@ -60,7 +60,7 @@ def load_sipp_vs_ctct(sipp_file: Path, exon_table: pd.DataFrame
     containing their distance to CTCF.
     :return: Same input table as a dataframe
 
-    >>> df = load_sipp_vs_ctct(TestConfig.sipp_vs_ctcf,
+    >>> df = load_sipp_vs_ctcf(TestConfig.sipp_vs_ctcf,
     ... format_exon_bed(TestConfig.exon_bed, TestConfig.gene_bed))
     >>> df[['exon_name', "exon_id", 'group', 'id', 'gene_id']]
        exon_name  exon_id      group   id  gene_id
@@ -85,7 +85,7 @@ def get_ddx_genes(df: pd.DataFrame) -> List[int]:
     :param df: A dataframe of DDX regulated exons
     :return: Return the unique values in gene_id column.
 
-    >>> df = load_sipp_vs_ctct(TestConfig.sipp_vs_ctcf,
+    >>> df = load_sipp_vs_ctcf(TestConfig.sipp_vs_ctcf,
     ... format_exon_bed(TestConfig.exon_bed, TestConfig.gene_bed))
     >>> get_ddx_genes(df)
     [1]
@@ -99,7 +99,7 @@ def write_regulated_gene_file():
     """
     BedConfig.bed.output.mkdir(exist_ok=True, parents=True)
     exon_table = format_exon_bed(BedConfig.exon_bed, BedConfig.gene_bed)
-    final_table = load_sipp_vs_ctct(BedConfig.sipp_vs_ctcf, exon_table)
+    final_table = load_sipp_vs_ctcf(BedConfig.sipp_vs_ctcf, exon_table)
     list_gene = get_ddx_genes(final_table)
     with BedConfig.bed.ddx_genes.open('w')as outfile:
         for gene in list_gene:
diff --git a/src/bed_handler/select_regulated_near_ctcf_exons.py b/src/bed_handler/select_regulated_near_ctcf_exons.py
new file mode 100644
index 0000000..8c1bab0
--- /dev/null
+++ b/src/bed_handler/select_regulated_near_ctcf_exons.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+
+# -*- coding: UTF-8 -*-
+
+"""
+Description: Create bed of exons regulated by DDX5/17 and near CTCF
+"""
+
+from doctest import testmod
+
+import pandas as pd
+
+from .config import TestConfig, BedConfig
+from .get_gene_regulated_by_ddx import load_sipp_vs_ctcf, format_exon_bed
+
+
+def filter_ctcf_distance_table(df: pd.DataFrame, reg: str, threshold: int,
+                               location: str) -> pd.DataFrame:
+    """
+    Filter the dataframe to recover only regulated exons near CTCF.
+
+    :param df: The dataframe of exons regulated by DDX5/17
+    :param reg: The regulation by DDX5/17 to keep ('down', 'up' or 'all')
+    :param threshold: The maximum absolute distance to CTCF to keep an exon
+    :param location: The location of interest (upstream, downstream, both)
+    :return: The filtered dataframe
+    :raises ValueError: If reg or location is not an allowed value
+
+    >>> cdf = load_sipp_vs_ctcf(TestConfig.sipp_vs_ctcf,
+    ... format_exon_bed(TestConfig.exon_bed, TestConfig.gene_bed))
+    >>> rdf = filter_ctcf_distance_table(cdf, 'down', 4, 'upstream')
+    >>> rdf[['exon_name', 'dist', 'group', 'id']]
+       exon_name  dist      group   id
+    0     DSC2_1    -4  siPP_DOWN  1_1
+    1     DSC2_2    -3  siPP_DOWN  1_2
+    2     DSC2_3    -2  siPP_DOWN  1_3
+    4     DSC2_5     0  siPP_DOWN  1_5
+    >>> rdf = filter_ctcf_distance_table(cdf, 'down', 2, 'upstream')
+    >>> rdf[['exon_name', 'dist', 'group', 'id']]
+       exon_name  dist      group   id
+    2     DSC2_3    -2  siPP_DOWN  1_3
+    4     DSC2_5     0  siPP_DOWN  1_5
+    >>> rdf = filter_ctcf_distance_table(cdf, 'down', 2, 'downstream')
+    >>> rdf[['exon_name', 'dist', 'group', 'id']]
+       exon_name  dist      group   id
+    4     DSC2_5     0  siPP_DOWN  1_5
+    5     DSC2_6     1  siPP_DOWN  1_6
+    6     DSC2_7     2  siPP_DOWN  1_7
+    >>> rdf = filter_ctcf_distance_table(cdf, 'down', 2, 'both')
+    >>> rdf[['exon_name', 'dist', 'group', 'id']]
+       exon_name  dist      group   id
+    2     DSC2_3    -2  siPP_DOWN  1_3
+    4     DSC2_5     0  siPP_DOWN  1_5
+    5     DSC2_6     1  siPP_DOWN  1_6
+    6     DSC2_7     2  siPP_DOWN  1_7
+    >>> rdf = filter_ctcf_distance_table(cdf, 'up', 2, 'both')
+    >>> rdf[['exon_name', 'dist', 'group', 'id']]
+       exon_name  dist    group   id
+    3     DSC2_4    -1  siPP_UP  1_4
+    >>> rdf = filter_ctcf_distance_table(cdf, 'up', 2, 'downstream')
+    >>> rdf[['exon_name', 'dist', 'group', 'id']]
+    Empty DataFrame
+    Columns: [exon_name, dist, group, id]
+    Index: []
+    >>> rdf = filter_ctcf_distance_table(cdf, 'all', 2, 'both')
+    >>> rdf[['exon_name', 'dist', 'group', 'id']]
+       exon_name  dist      group   id
+    2     DSC2_3    -2  siPP_DOWN  1_3
+    3     DSC2_4    -1    siPP_UP  1_4
+    4     DSC2_5     0  siPP_DOWN  1_5
+    5     DSC2_6     1  siPP_DOWN  1_6
+    6     DSC2_7     2  siPP_DOWN  1_7
+    >>> filter_ctcf_distance_table(cdf, 'lul', 2, 'both')
+    Traceback (most recent call last):
+    ...
+    ValueError: reg parameter should be one in: ['down', 'up', 'all']
+    >>> filter_ctcf_distance_table(cdf, 'up', 2, 'xd')
+    Traceback (most recent call last):
+    ...
+    ValueError: location parameter should be in ['upstream', \
+'downstream', 'both']
+    """
+    if reg not in ['down', 'up', 'all']:
+        raise ValueError("reg parameter should be one in: "
+                         "['down', 'up', 'all']")
+    if location not in ['upstream', 'downstream', 'both']:
+        raise ValueError("location parameter should be in "
+                         "['upstream', 'downstream', 'both']")
+    if reg != "all":
+        df = df.loc[df["group"] == f"siPP_{reg.upper()}", :]
+    if location == "upstream":
+        df = df.loc[(df["dist"] >= -threshold) & (df["dist"] <= 0), :]
+    elif location == "downstream":
+        df = df.loc[(df["dist"] <= threshold) & (df["dist"] >= 0), :]
+    else:
+        df = df.loc[df["dist"].abs() <= threshold, :]
+    return df
+
+
+def create_bed_ctcf_exon(reg: str, threshold: int,
+                         location: str) -> None:
+    """
+    Create a bed file of the DDX5/17 regulated exons located near CTCF.
+
+    :param reg: The regulation by DDX5/17 to keep ('down', 'up' or 'all')
+    :param threshold: The maximum absolute distance to CTCF to keep an \
+    exon. Negative values are handled as 0
+    :param location: The location of interest (upstream, downstream, \
+    both). It is forced to 'both' when the threshold is 0
+    """
+    # A negative distance is meaningless: only keep exons at distance 0
+    threshold = max(threshold, 0)
+    if threshold == 0:
+        # upstream and downstream are the same at distance 0
+        location = "both"
+    # Same as write_regulated_gene_file: don't rely on a previous step
+    BedConfig.bed.output.mkdir(exist_ok=True, parents=True)
+    df = load_sipp_vs_ctcf(BedConfig.sipp_vs_ctcf,
+                           format_exon_bed(BedConfig.exon_bed,
+                                           BedConfig.gene_bed))
+    df = filter_ctcf_distance_table(df, reg, threshold, location)
+    df.to_csv(BedConfig.bed.output / f"CTCF_{threshold}_{location}_"
+                                     f"ddx_{reg}_exon.bed", sep="\t",
+              index=False)
+
+
+if __name__ == "__main__":
+    testmod()
-- 
GitLab