Commit 0117f4ca authored by nfontrod's avatar nfontrod
Browse files

src/bed_handler/select_regulated_near_ctcf_exons.py: new function...

src/bed_handler/select_regulated_near_ctcf_exons.py: new function get_bed_ctcf_exon to only recover exons close to CTCF peaks in a given bed file containing exons
parent 80868e56
......@@ -13,6 +13,8 @@ from doctest import testmod
from .filter_gene import filter_bed
import warnings
from .get_other_exon_in_same_gene import create_gene_bed4norm
from pathlib import Path
import lazyparser as lp
def filter_ctcf_distance_table(df: pd.DataFrame, reg: str, threshold: int,
......@@ -128,9 +130,10 @@ def create_bed_ctcf_exon(reg: str, threshold: int,
location: str, include0: bool = False,
near_ctcf: bool = True) -> None:
"""
Filter the dataframe to recover only regulated exons near CTCF.
Filter the dataframe of exon regulated by ddx5/17 to recover only \
regulated exons near CTCF.
:param reg: The regulation by CTCF
:param reg: The regulation by DDX5/17
:param threshold: The threshold distance
:param location: The location of interest
:param include0: True to include exons containing a CTCF site if the \
......@@ -163,7 +166,9 @@ def create_bed_ctcf_exon(reg: str, threshold: int,
"id"].to_list()
else:
tmp_exons = df_reg['id'].to_list()
list_exons = [e for e in tmp_exons if e not in df['id'].to_list()]
bad_id = df['id'].to_list() if include0 \
else df['id'].to_list() + df.loc[df['dist'] == 0, 'id'].to_list()
list_exons = [e for e in tmp_exons if e not in bad_id]
list_genes = [int(exon.split('_')[0]) for exon in list_exons]
df_exon = filter_bed(BedConfig.exon_bed, list_exons)
df_gene = filter_bed(BedConfig.gene_bed, list_genes)
......@@ -185,6 +190,79 @@ def create_bed_ctcf_exon(reg: str, threshold: int,
index=False)
@lp.parse(exon_bed="file", location=["upstream", "downstream", "both"],
threshold="threshold >= 0")
def get_bed_ctcf_exon(exon_bed: str, threshold: int,
location: str, include0: bool = False,
near_ctcf: bool = True,
name_near: str = "") -> None:
"""
Filter a bed file containing exons to recover only regulated exons \
near or far from CTCF.
:param exon_bed: a bed containing exons
:param threshold: The threshold distance
:param location: The location of interest
:param include0: True to include exons containing a CTCF site if the \
threshold is greater than 0.
:param near_ctcf: True to recover exons near CTCF False to recover \
those far from CTCF
:param name_near: prefix of the output file name
"""
if name_near == "":
name_near = Path(exon_bed).name.replace(".bed", "")
threshold = max(threshold, 0)
if threshold == 0:
location = "both"
if not include0 and threshold == 0:
warnings.warn("include0 must be True when threshold = 0."
"Setting include0 to true !")
include0 = True
if not near_ctcf and not include0:
warnings.warn("include0 must be True when near_ctcf is False")
include0 = True
i0 = "with0" if include0 else "without0"
df_reg = load_sipp_vs_ctcf(BedConfig.all_vs_ctcf,
format_exon_bed(Path(exon_bed),
BedConfig.gene_bed))
df_reg = df_reg[-df_reg["gene_id"].isna()]
df_reg["gene_id"] = df_reg["gene_id"].astype(int)
df_reg["exon_pos"] = df_reg["exon_pos"].astype(int)
df = filter_ctcf_distance_table(df_reg, 'all', threshold, location,
include0)
if near_ctcf:
name_near = name_near + "_near_"
list_exons = df['id'].to_list()
else:
name_near = name_near + "_far_"
tmp_exons = df_reg['id'].to_list()
bad_id = df['id'].to_list() if include0 \
else df['id'].to_list() + df.loc[df['dist'] == 0, 'id'].to_list()
list_exons = [e for e in tmp_exons if e not in bad_id]
list_genes = [int(exon.split('_')[0]) for exon in list_exons]
df_exon = filter_bed(BedConfig.exon_bed, list_exons)
df_gene = filter_bed(BedConfig.gene_bed, list_genes)
df_exon.to_csv(BedConfig.bed.output /
f"{name_near}CTCF_{threshold}_{location}_ddx_"
f"{i0}_exon.bed",
sep="\t",
index=False)
df_gene.to_csv(BedConfig.bed.output /
f"{name_near}CTCF_{threshold}_{location}_ddx_"
f"{i0}_gene.bed",
sep="\t",
index=False)
df_gene = create_gene_bed4norm(BedConfig.gene_bed, df_exon)
df_gene.to_csv(BedConfig.bed.output /
f"{name_near}CTCF_{threshold}_{location}_ddx_"
f"{i0}_gene-dup.bed",
sep="\t",
index=False)
if __name__ == "__main__":
testmod()
import sys
if len(sys.argv) == 1:
testmod()
else:
get_bed_ctcf_exon()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment