Skip to content
Snippets Groups Projects
Commit aee24d7d authored by nfontrod's avatar nfontrod
Browse files

src/bed_handler/select_regulated_near_ctcf_exons.py: add a parameter include0...

src/bed_handler/select_regulated_near_ctcf_exons.py: add a parameter include0 to optionally include exons containing a CTCF site. Add a parameter near_ctcf that allows recovering regulated exons far from a CTCF site.
parent a16f26ca
Branches
No related tags found
No related merge requests found
...@@ -11,10 +11,11 @@ from .get_gene_regulated_by_ddx import load_sipp_vs_ctcf, format_exon_bed ...@@ -11,10 +11,11 @@ from .get_gene_regulated_by_ddx import load_sipp_vs_ctcf, format_exon_bed
import pandas as pd import pandas as pd
from doctest import testmod from doctest import testmod
from .filter_gene import filter_bed from .filter_gene import filter_bed
import warnings
def filter_ctcf_distance_table(df: pd.DataFrame, reg: str, threshold: int, def filter_ctcf_distance_table(df: pd.DataFrame, reg: str, threshold: int,
location: str) -> pd.DataFrame: location: str, include0: bool) -> pd.DataFrame:
""" """
Filter the dataframe to recover only regulated exons near CTCF. Filter the dataframe to recover only regulated exons near CTCF.
...@@ -22,57 +23,74 @@ def filter_ctcf_distance_table(df: pd.DataFrame, reg: str, threshold: int, ...@@ -22,57 +23,74 @@ def filter_ctcf_distance_table(df: pd.DataFrame, reg: str, threshold: int,
:param reg: The regulation by CTCF :param reg: The regulation by CTCF
:param threshold: The threshold distance :param threshold: The threshold distance
:param location: The location of interest :param location: The location of interest
:param include0: True to include exons containing a CTCF site if the \
threshold is greater than 0.
:return: The filtered dataframe :return: The filtered dataframe
>>> cdf = load_sipp_vs_ctcf(TestConfig.sipp_vs_ctcf, >>> cdf = load_sipp_vs_ctcf(TestConfig.sipp_vs_ctcf,
... format_exon_bed(TestConfig.exon_bed, TestConfig.gene_bed)) ... format_exon_bed(TestConfig.exon_bed, TestConfig.gene_bed))
>>> rdf = filter_ctcf_distance_table(cdf, 'down', 4, 'upstream') >>> rdf = filter_ctcf_distance_table(cdf, 'down', 4, 'upstream', False)
>>> rdf[['exon_name', 'dist', 'group', 'id']] >>> rdf[['exon_name', 'dist', 'group', 'id']]
exon_name dist group id exon_name dist group id
0 DSC2_1 -4 siPP_DOWN 1_1 0 DSC2_1 -4 siPP_DOWN 1_1
1 DSC2_2 -3 siPP_DOWN 1_2 1 DSC2_2 -3 siPP_DOWN 1_2
2 DSC2_3 -2 siPP_DOWN 1_3 2 DSC2_3 -2 siPP_DOWN 1_3
>>> rdf = filter_ctcf_distance_table(cdf, 'down', 2, 'upstream') >>> rdf = filter_ctcf_distance_table(cdf, 'down', 4, 'upstream', True)
>>> rdf[['exon_name', 'dist', 'group', 'id']] >>> rdf[['exon_name', 'dist', 'group', 'id']]
exon_name dist group id exon_name dist group id
0 DSC2_1 -4 siPP_DOWN 1_1
1 DSC2_2 -3 siPP_DOWN 1_2
2 DSC2_3 -2 siPP_DOWN 1_3 2 DSC2_3 -2 siPP_DOWN 1_3
>>> rdf = filter_ctcf_distance_table(cdf, 'down', 2, 'downstream') 4 DSC2_5 0 siPP_DOWN 1_5
>>> rdf = filter_ctcf_distance_table(cdf, 'down', 2, 'upstream', False)
>>> rdf[['exon_name', 'dist', 'group', 'id']]
exon_name dist group id
2 DSC2_3 -2 siPP_DOWN 1_3
>>> rdf = filter_ctcf_distance_table(cdf, 'down', 2, 'downstream', False)
>>> rdf[['exon_name', 'dist', 'group', 'id']] >>> rdf[['exon_name', 'dist', 'group', 'id']]
exon_name dist group id exon_name dist group id
5 DSC2_6 1 siPP_DOWN 1_6 5 DSC2_6 1 siPP_DOWN 1_6
6 DSC2_7 2 siPP_DOWN 1_7 6 DSC2_7 2 siPP_DOWN 1_7
>>> rdf = filter_ctcf_distance_table(cdf, 'down', 2, 'both') >>> rdf = filter_ctcf_distance_table(cdf, 'down', 2, 'both', False)
>>> rdf[['exon_name', 'dist', 'group', 'id']] >>> rdf[['exon_name', 'dist', 'group', 'id']]
exon_name dist group id exon_name dist group id
2 DSC2_3 -2 siPP_DOWN 1_3 2 DSC2_3 -2 siPP_DOWN 1_3
5 DSC2_6 1 siPP_DOWN 1_6 5 DSC2_6 1 siPP_DOWN 1_6
6 DSC2_7 2 siPP_DOWN 1_7 6 DSC2_7 2 siPP_DOWN 1_7
>>> rdf = filter_ctcf_distance_table(cdf, 'up', 2, 'both') >>> rdf = filter_ctcf_distance_table(cdf, 'up', 2, 'both', False)
>>> rdf[['exon_name', 'dist', 'group', 'id']] >>> rdf[['exon_name', 'dist', 'group', 'id']]
exon_name dist group id exon_name dist group id
3 DSC2_4 -1 siPP_UP 1_4 3 DSC2_4 -1 siPP_UP 1_4
>>> rdf = filter_ctcf_distance_table(cdf, 'up', 2, 'downstream') >>> rdf = filter_ctcf_distance_table(cdf, 'up', 2, 'downstream', False)
>>> rdf[['exon_name', 'dist', 'group', 'id']] >>> rdf[['exon_name', 'dist', 'group', 'id']]
Empty DataFrame Empty DataFrame
Columns: [exon_name, dist, group, id] Columns: [exon_name, dist, group, id]
Index: [] Index: []
>>> rdf = filter_ctcf_distance_table(cdf, 'all', 2, 'both') >>> rdf = filter_ctcf_distance_table(cdf, 'all', 2, 'both', False)
>>> rdf[['exon_name', 'dist', 'group', 'id']] >>> rdf[['exon_name', 'dist', 'group', 'id']]
exon_name dist group id exon_name dist group id
2 DSC2_3 -2 siPP_DOWN 1_3 2 DSC2_3 -2 siPP_DOWN 1_3
3 DSC2_4 -1 siPP_UP 1_4 3 DSC2_4 -1 siPP_UP 1_4
5 DSC2_6 1 siPP_DOWN 1_6 5 DSC2_6 1 siPP_DOWN 1_6
6 DSC2_7 2 siPP_DOWN 1_7 6 DSC2_7 2 siPP_DOWN 1_7
>>> filter_ctcf_distance_table(cdf, 'lul', 2, 'both') >>> rdf = filter_ctcf_distance_table(cdf, 'all', 2, 'both', True)
>>> rdf[['exon_name', 'dist', 'group', 'id']]
exon_name dist group id
2 DSC2_3 -2 siPP_DOWN 1_3
3 DSC2_4 -1 siPP_UP 1_4
4 DSC2_5 0 siPP_DOWN 1_5
5 DSC2_6 1 siPP_DOWN 1_6
6 DSC2_7 2 siPP_DOWN 1_7
>>> filter_ctcf_distance_table(cdf, 'lul', 2, 'both', False)
Traceback (most recent call last): Traceback (most recent call last):
... ...
ValueError: reg parameter should be one in: ['down', 'up', 'all'] ValueError: reg parameter should be one in: ['down', 'up', 'all']
>>> filter_ctcf_distance_table(cdf, 'up', 2, 'xd') >>> filter_ctcf_distance_table(cdf, 'up', 2, 'xd', False)
Traceback (most recent call last): Traceback (most recent call last):
... ...
ValueError: location parameter should be in ['upstream', \ ValueError: location parameter should be in ['upstream', \
'downstream', 'both'] 'downstream', 'both']
>>> rdf = filter_ctcf_distance_table(cdf, 'down', 0, 'both') >>> rdf = filter_ctcf_distance_table(cdf, 'down', 0, 'both', False)
>>> rdf[['exon_name', 'dist', 'group', 'id']] >>> rdf[['exon_name', 'dist', 'group', 'id']]
exon_name dist group id exon_name dist group id
4 DSC2_5 0 siPP_DOWN 1_5 4 DSC2_5 0 siPP_DOWN 1_5
...@@ -87,45 +105,75 @@ def filter_ctcf_distance_table(df: pd.DataFrame, reg: str, threshold: int, ...@@ -87,45 +105,75 @@ def filter_ctcf_distance_table(df: pd.DataFrame, reg: str, threshold: int,
df = df.loc[df["group"] == f"siPP_{reg.upper()}", :] df = df.loc[df["group"] == f"siPP_{reg.upper()}", :]
if location == "upstream": if location == "upstream":
df = df.loc[(df["dist"] >= threshold * -1) & df = df.loc[(df["dist"] >= threshold * -1) &
(df["dist"] < 0), :] (df["dist"] < 0 if not include0
else df["dist"] <= 0), :]
elif location == "downstream": elif location == "downstream":
df = df.loc[(df["dist"] <= threshold) & df = df.loc[(df["dist"] <= threshold) &
(df["dist"] > 0), :] (df["dist"] > 0 if not include0
else df["dist"] >= 0), :]
else: else:
if threshold == 0: if threshold == 0:
df = df.loc[abs(df["dist"]) == 0, :] df = df.loc[abs(df["dist"]) == 0, :]
else: else:
if not include0:
df = df.loc[(abs(df["dist"]) <= threshold) & df = df.loc[(abs(df["dist"]) <= threshold) &
(df["dist"] != 0), :] (df["dist"] != 0), :]
else:
df = df.loc[abs(df["dist"]) <= threshold, :]
return df return df
def create_bed_ctcf_exon(reg: str, threshold: int, def create_bed_ctcf_exon(reg: str, threshold: int,
location: str) -> None: location: str, include0: bool = False,
near_ctcf: bool = True) -> None:
""" """
Filter the dataframe to recover only regulated exons near CTCF. Filter the dataframe to recover only regulated exons near CTCF.
:param reg: The regulation by CTCF :param reg: The regulation by CTCF
:param threshold: The threshold distance :param threshold: The threshold distance
:param location: The location of interest :param location: The location of interest
:param include0: True to include exons containing a CTCF site if the \
threshold is greater than 0.
:param near_ctcf: True to recover exons near CTCF False to recover \
those far from CTCF
""" """
if threshold < 0: if threshold < 0:
threshold = 0 threshold = 0
if threshold == 0: if threshold == 0:
location = "both" location = "both"
df = load_sipp_vs_ctcf(BedConfig.sipp_vs_ctcf, if not include0 and threshold == 0:
warnings.warn("include0 must be True when threshold = 0."
"Setting include0 to true !")
include0 = True
if not near_ctcf and not include0:
warnings.warn("include0 must be True when near_ctcf is False")
include0 = True
i0 = "with0" if include0 else "without0"
df_reg = load_sipp_vs_ctcf(BedConfig.sipp_vs_ctcf,
format_exon_bed(BedConfig.exon_bed, format_exon_bed(BedConfig.exon_bed,
BedConfig.gene_bed)) BedConfig.gene_bed))
df = filter_ctcf_distance_table(df, reg, threshold, location) df = filter_ctcf_distance_table(df_reg, reg, threshold, location, include0)
if near_ctcf:
name_near = ""
list_exons = df['id'].to_list() list_exons = df['id'].to_list()
else:
name_near = "Far_"
if reg != "all":
tmp_exons = df_reg.loc[df_reg["group"] == f"siPP_{reg.upper()}",
"id"].to_list()
else:
tmp_exons = df_reg['id'].to_list()
list_exons = [e for e in tmp_exons if e not in df['id'].to_list()]
list_genes = [int(exon.split('_')[0]) for exon in list_exons] list_genes = [int(exon.split('_')[0]) for exon in list_exons]
df_exon = filter_bed(BedConfig.exon_bed, list_exons) df_exon = filter_bed(BedConfig.exon_bed, list_exons)
df_gene = filter_bed(BedConfig.gene_bed, list_genes) df_gene = filter_bed(BedConfig.gene_bed, list_genes)
df_exon.to_csv(BedConfig.bed.output / df_exon.to_csv(BedConfig.bed.output /
f"CTCF_{threshold}_{location}_ddx_{reg}_exon.bed", sep="\t", f"{name_near}CTCF_{threshold}_{location}_ddx_{reg}_{i0}_exon.bed",
sep="\t",
index=False) index=False)
df_gene.to_csv(BedConfig.bed.output / df_gene.to_csv(BedConfig.bed.output /
f"CTCF_{threshold}_{location}_ddx_{reg}_gene.bed", sep="\t", f"{name_near}CTCF_{threshold}_{location}_ddx_{reg}_{i0}_gene.bed",
sep="\t",
index=False) index=False)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment