Skip to content
Snippets Groups Projects
Commit 2cd7037b authored by nfontrod's avatar nfontrod
Browse files

src.bed_handler: create a bed file of the exons regulated by DDX5/17

parent 96d104de
No related branches found
No related tags found
No related merge requests found
......@@ -10,6 +10,7 @@ visualisation
from .filter_gene import create_filtered_bed
from .get_gene_locations import create_region_bed
from .get_gene_regulated_by_ddx import write_regulated_gene_file
from .select_regulated_near_ctcf_exons import create_bed_ctcf_exon
def launcher():
......@@ -19,6 +20,13 @@ def launcher():
write_regulated_gene_file()
create_filtered_bed()
create_region_bed()
create_bed_ctcf_exon("down", 0, "both")
create_bed_ctcf_exon("down", 1000, "both")
create_bed_ctcf_exon("down", 2000, "both")
create_bed_ctcf_exon("down", 1000, "upstream")
create_bed_ctcf_exon("down", 2000, "upstream")
create_bed_ctcf_exon("down", 1000, "downstream")
create_bed_ctcf_exon("down", 2000, "downstream")
launcher()
......@@ -49,7 +49,7 @@ def format_exon_bed(exon_bed: Path, gene_bed: Path) -> pd.DataFrame:
'exon_name']]
def load_sipp_vs_ctct(sipp_file: Path, exon_table: pd.DataFrame
def load_sipp_vs_ctcf(sipp_file: Path, exon_table: pd.DataFrame
) -> pd.DataFrame:
"""
Load the file containing exons regulated by DDX5 17 and containing their \
......@@ -60,7 +60,7 @@ def load_sipp_vs_ctct(sipp_file: Path, exon_table: pd.DataFrame
containing their distance to CTCF.
:return: Same input table as a dataframe
>>> df = load_sipp_vs_ctct(TestConfig.sipp_vs_ctcf,
>>> df = load_sipp_vs_ctcf(TestConfig.sipp_vs_ctcf,
... format_exon_bed(TestConfig.exon_bed, TestConfig.gene_bed))
>>> df[['exon_name', "exon_id", 'group', 'id', 'gene_id']]
exon_name exon_id group id gene_id
......@@ -85,7 +85,7 @@ def get_ddx_genes(df: pd.DataFrame) -> List[int]:
:param df: A dataframe of DDX regulated exons
:return: Return the unique values in gene_id column.
>>> df = load_sipp_vs_ctct(TestConfig.sipp_vs_ctcf,
>>> df = load_sipp_vs_ctcf(TestConfig.sipp_vs_ctcf,
... format_exon_bed(TestConfig.exon_bed, TestConfig.gene_bed))
>>> get_ddx_genes(df)
[1]
......@@ -99,7 +99,7 @@ def write_regulated_gene_file():
"""
BedConfig.bed.output.mkdir(exist_ok=True, parents=True)
exon_table = format_exon_bed(BedConfig.exon_bed, BedConfig.gene_bed)
final_table = load_sipp_vs_ctct(BedConfig.sipp_vs_ctcf, exon_table)
final_table = load_sipp_vs_ctcf(BedConfig.sipp_vs_ctcf, exon_table)
list_gene = get_ddx_genes(final_table)
with BedConfig.bed.ddx_genes.open('w')as outfile:
for gene in list_gene:
......
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""
Description: Create bed of exons regulated by DDX5/17 and near CTCF
"""
from .config import TestConfig, BedConfig
from .get_gene_regulated_by_ddx import load_sipp_vs_ctcf, format_exon_bed
import pandas as pd
from doctest import testmod
def filter_ctcf_distance_table(df: pd.DataFrame, reg: str, threshold: int,
                               location: str) -> pd.DataFrame:
    """
    Select the exons of a regulation group that lie close to CTCF.

    Rows are first restricted to the requested regulation (column \
``group``, values of the form ``siPP_<REG>``), then to those whose ``dist`` \
value falls inside the window defined by ``threshold`` and ``location``. \
Bounds are inclusive.

    :param df: The dataframe of regulated exons, with at least the \
columns ``group`` and ``dist``
    :param reg: The regulation to keep: 'down', 'up' or 'all' \
(no filtering on ``group``)
    :param threshold: The maximum distance allowed (absolute value)
    :param location: 'upstream' keeps ``-threshold <= dist <= 0``, \
'downstream' keeps ``0 <= dist <= threshold`` and 'both' keeps \
``abs(dist) <= threshold``
    :return: The rows of ``df`` satisfying both filters
    :raises ValueError: if ``reg`` or ``location`` is not an allowed value

    >>> cdf = load_sipp_vs_ctcf(TestConfig.sipp_vs_ctcf,
    ... format_exon_bed(TestConfig.exon_bed, TestConfig.gene_bed))
    >>> rdf = filter_ctcf_distance_table(cdf, 'down', 4, 'upstream')
    >>> rdf[['exon_name', 'dist', 'group', 'id']]
       exon_name  dist      group   id
    0     DSC2_1    -4  siPP_DOWN  1_1
    1     DSC2_2    -3  siPP_DOWN  1_2
    2     DSC2_3    -2  siPP_DOWN  1_3
    4     DSC2_5     0  siPP_DOWN  1_5
    >>> rdf = filter_ctcf_distance_table(cdf, 'down', 2, 'upstream')
    >>> rdf[['exon_name', 'dist', 'group', 'id']]
       exon_name  dist      group   id
    2     DSC2_3    -2  siPP_DOWN  1_3
    4     DSC2_5     0  siPP_DOWN  1_5
    >>> rdf = filter_ctcf_distance_table(cdf, 'down', 2, 'downstream')
    >>> rdf[['exon_name', 'dist', 'group', 'id']]
       exon_name  dist      group   id
    4     DSC2_5     0  siPP_DOWN  1_5
    5     DSC2_6     1  siPP_DOWN  1_6
    6     DSC2_7     2  siPP_DOWN  1_7
    >>> rdf = filter_ctcf_distance_table(cdf, 'down', 2, 'both')
    >>> rdf[['exon_name', 'dist', 'group', 'id']]
       exon_name  dist      group   id
    2     DSC2_3    -2  siPP_DOWN  1_3
    4     DSC2_5     0  siPP_DOWN  1_5
    5     DSC2_6     1  siPP_DOWN  1_6
    6     DSC2_7     2  siPP_DOWN  1_7
    >>> rdf = filter_ctcf_distance_table(cdf, 'up', 2, 'both')
    >>> rdf[['exon_name', 'dist', 'group', 'id']]
       exon_name  dist    group   id
    3     DSC2_4    -1  siPP_UP  1_4
    >>> rdf = filter_ctcf_distance_table(cdf, 'up', 2, 'downstream')
    >>> rdf[['exon_name', 'dist', 'group', 'id']]
    Empty DataFrame
    Columns: [exon_name, dist, group, id]
    Index: []
    >>> rdf = filter_ctcf_distance_table(cdf, 'all', 2, 'both')
    >>> rdf[['exon_name', 'dist', 'group', 'id']]
       exon_name  dist      group   id
    2     DSC2_3    -2  siPP_DOWN  1_3
    3     DSC2_4    -1    siPP_UP  1_4
    4     DSC2_5     0  siPP_DOWN  1_5
    5     DSC2_6     1  siPP_DOWN  1_6
    6     DSC2_7     2  siPP_DOWN  1_7
    >>> filter_ctcf_distance_table(cdf, 'lul', 2, 'both')
    Traceback (most recent call last):
    ...
    ValueError: reg parameter should be one in: ['down', 'up', 'all']
    >>> filter_ctcf_distance_table(cdf, 'up', 2, 'xd')
    Traceback (most recent call last):
    ...
    ValueError: location parameter should be in ['upstream', \
'downstream', 'both']
    """
    allowed_regs = ['down', 'up', 'all']
    allowed_locations = ['upstream', 'downstream', 'both']
    # Validate reg before location so the first invalid argument is reported.
    if reg not in allowed_regs:
        raise ValueError(f"reg parameter should be one in: {allowed_regs}")
    if location not in allowed_locations:
        raise ValueError(f"location parameter should be in "
                         f"{allowed_locations}")

    selected = df
    if reg != "all":
        wanted_group = "siPP_" + reg.upper()
        selected = selected[selected["group"] == wanted_group]

    distance = selected["dist"]
    if location == "upstream":
        # Upstream positions carry a negative (or null) distance.
        in_window = (distance <= 0) & (distance >= -threshold)
    elif location == "downstream":
        # Downstream positions carry a positive (or null) distance.
        in_window = (distance >= 0) & (distance <= threshold)
    else:
        in_window = distance.abs() <= threshold
    return selected[in_window]
def create_bed_ctcf_exon(reg: str, threshold: int,
                         location: str) -> None:
    """
    Write a tab-separated file of the regulated exons located near CTCF.

    The exon table is loaded from ``BedConfig.sipp_vs_ctcf``, filtered \
with ``filter_ctcf_distance_table`` and written, without its index, in \
``BedConfig.bed.output`` under the name \
``CTCF_<threshold>_<location>_ddx_<reg>_exon.bed``.

    :param reg: The regulation of interest ('down', 'up' or 'all')
    :param threshold: The threshold distance; negative values are \
replaced by 0
    :param location: The location of interest ('upstream', 'downstream' \
or 'both'); replaced by 'both' when the threshold is 0
    :raises ValueError: if ``reg`` or ``location`` is not an allowed value \
(raised by ``filter_ctcf_distance_table``)
    """
    threshold = max(threshold, 0)
    if threshold == 0:
        # With a null threshold every location keeps only dist == 0, so
        # a single file name is used for this case.
        location = "both"
    df = load_sipp_vs_ctcf(BedConfig.sipp_vs_ctcf,
                           format_exon_bed(BedConfig.exon_bed,
                                           BedConfig.gene_bed))
    df = filter_ctcf_distance_table(df, reg, threshold, location)
    # Make sure the output folder exists so this function can be called
    # on its own, before any other step created it.
    BedConfig.bed.output.mkdir(exist_ok=True, parents=True)
    outfile = BedConfig.bed.output / (f"CTCF_{threshold}_{location}_"
                                      f"ddx_{reg}_exon.bed")
    df.to_csv(outfile, sep="\t", index=False)
# Run the doctests embedded in the docstrings of this module when it is
# executed as a script.
if __name__ == "__main__":
    testmod()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment