Skip to content
Snippets Groups Projects
Commit 52d4c43f authored by nfontrod's avatar nfontrod
Browse files

src/bed_handler/get_other_exon_in_same_gene.py: From a list of exons get the...

src/bed_handler/get_other_exon_in_same_gene.py: From a list of exons get the list of other exons in same gene far from a CTCF site
parent 01f7fc8e
Branches
No related tags found
No related merge requests found
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""
Description: The goal of this script is, given \
an bed file containing exon, recover all exons in every \
gene in the bed file, then remove the exons presents in the input bed \
and eliminate those near a CTCF site.
"""
from pathlib import Path
from .get_gene_regulated_by_ddx import load_sipp_vs_ctcf, format_exon_bed
from .config import BedConfig, TestConfig, OutputBed
import pandas as pd
from typing import List, Tuple
import lazyparser as lp
def get_gene2keep(df: pd.DataFrame) -> List[int]:
"""
Get the genes we want to keep from a dataframe corresponding to a \
bed file.
:param df: A dataframe corresponding to a bed of exons
:return: The list of gene to keep
>>> data = pd.DataFrame({"#ref": [18, 19], "start": [100, 200],
... "end": [200, 300], "id": ["1_1", "2_1"], "score": [0, 0],
... "strand": ["-", "+"]})
>>> get_gene2keep(data)
[1, 2]
"""
return [int(exon_id.split("_")[0]) for exon_id in df['id'].to_list()]
def get_exon2remove(df: pd.DataFrame) -> List[str]:
"""
Get the list of exon that we don't want to keep.
:param df: A dataframe corresponding to a bed file
:return: The list of exon to remove
>>> data = pd.DataFrame({"#ref": [18, 19], "start": [100, 200],
... "end": [200, 300], "id": ["1_1", "2_1"], "score": [0, 0],
... "strand": ["-", "+"]})
>>> get_exon2remove(data)
['1_1', '2_1']
"""
return df['id'].to_list()
def get_lists_from_bed(bed: str) -> Tuple[List[int], List[str]]:
"""
Get the List of gene to keep and the list of exon to remove from a \
bed file.
:param bed: A bed file containing exons.
:return: The list of gene to keep and the list of exons to remove.
>>> g2k, e2r = get_lists_from_bed(TestConfig.exon_bed)
>>> g2k
[1, 1, 1, 1, 1, 1, 1, 1, 1]
>>> e2r
['1_1', '1_2', '1_3', '1_4', '1_5', '1_6', '1_7', '1_8', '1_9']
"""
df = pd.read_csv(bed, sep="\t")
return get_gene2keep(df), get_exon2remove(df)
def filter_exon(bed: Path, gene2keep: List[int],
exon2remove: List[str]) -> pd.DataFrame:
"""
:param bed: The bed file containing every fasterDb exon.
:param gene2keep: The list of gene 2 keep
:param exon2remove: The list of exon 2 remove
:return: The dataframe of fasterDB exon filtered
>>> filter_exon(TestConfig.exon_bed, [1], ["1_1", "1_3", "1_5", "1_7"])
#ref start end id score strand
1 18 28681183 28681432 1_2 0 -
3 18 28672063 28672263 1_4 0 -
5 18 28670990 28671110 1_6 0 -
7 18 28667631 28667776 1_8 0 -
8 18 28666538 28666705 1_9 0 -
"""
df = pd.read_csv(bed, sep="\t")
df['gene_id'] = get_gene2keep(df)
df = df.loc[(df['gene_id'].isin(gene2keep)) &
(-df['id'].isin(exon2remove)), :].copy()
df.drop('gene_id', axis=1, inplace=True)
return df
def get_final_bed(df_dist: pd.DataFrame, df_exon: pd.DataFrame,
threshold: int) -> pd.DataFrame:
"""
Merge de dataframe of exon with the dataframe of distance to ctcf \
site and filter those that are far from CTCF.
:param df_dist: A dataframe containing the distance of every exon \
to ctcf.
:param df_exon: The dataframe of good exon
:param threshold: The minimum distance required to keep the exon.
:return: The dataframe of exon that are distant from at \
least `threshold` nucleotide than CTCF
>>> data_ctcf = pd.DataFrame({"id": ["1_1", "1_2"], "dist": [-20, 30]})
>>> data = pd.DataFrame({"#ref": [18, 19, 20], "start": [100, 200, 300],
... "end": [200, 300, 400], "id": ["1_1", "1_2", "3_1"],
... "score": [0, 0, 0], "strand": ["-", "+", "-"]})
>>> get_final_bed(data_ctcf, data, 10)
#ref start end id score strand
0 18 100 200 1_1 0 -
1 19 200 300 1_2 0 +
>>> get_final_bed(data_ctcf, data, 21)
#ref start end id score strand
1 19 200 300 1_2 0 +
"""
df = df_exon.merge(df_dist, how="left", on="id")
df = df.loc[abs(df["dist"]) >= threshold, :].copy()
df.drop("dist", axis=1, inplace=True)
return df
def create_gene_bed4norm(gene_bed: Path, df_exon: pd.DataFrame
) -> pd.DataFrame:
"""
Create a bed containing duplicated gene used to norm the exon bed.
:param gene_bed: A bed file containing all FasterDB genes
:param df_exon: The dataframe of exons of interest
:return: The dataframe containing duplicated gene
>>> data = pd.DataFrame({"#ref": [18, 19, 20, 21, 22],
... "start": [100, 200, 300, 5, 6],
... "end": [200, 300, 400, 5, 6],
... "id": ["1_1", "1_2", "1_3", "3_1", "3_2"],
... "score": [0, 0, 0, 0, 0], "strand": ["-", "+", "-", "-", "+"]})
>>> create_gene_bed4norm(TestConfig.gene_bed, data)
#ref start end id score strand
0 18 28645943 28682388 1 DSC2 -
1 18 28645943 28682388 1 DSC2 -
2 18 28645943 28682388 1 DSC2 -
3 18 28898050 28937394 3 DSG1 +
4 18 28898050 28937394 3 DSG1 +
"""
df_gene = pd.read_csv(gene_bed, sep="\t")
list_gene = get_gene2keep(df_exon)
tmp = pd.DataFrame({"id": list_gene})
df_gene = df_gene.loc[df_gene['id'].isin(list_gene), :]
return pd.merge(df_gene, tmp)
@lp.parse(bed="file", distance_threshold="distance_threshold > 0")
def get_other_exon_in_same_gene(bed: str,
distance_threshold: int,
outfile: str) -> None:
"""
From a bed of exon, create a bed contaiing other exons in the \
same gene far from CTCF sites.
:param bed: A bed file containing exons
:param distance_threshold: The minimum distance required to keep the exon.
:param outfile: The name of the output file
"""
exon_table = format_exon_bed(BedConfig.exon_bed, BedConfig.gene_bed)
ctcf_dist = load_sipp_vs_ctcf(BedConfig.exons_vs_ctcf, exon_table)
ctcf_dist = ctcf_dist[["dist", "id"]]
g2k, e2r = get_lists_from_bed(bed)
good_exon_df = filter_exon(BedConfig.exon_bed, g2k, e2r)
final_df = get_final_bed(ctcf_dist, good_exon_df, distance_threshold)
final_df.to_csv(OutputBed.output / outfile, sep="\t", index=False)
gene_df = create_gene_bed4norm(BedConfig.gene_bed, final_df)
gene_df.to_csv(OutputBed.output / (outfile.replace(".bed", "") +
"-gene-dup.bed"), sep="\t", index=False)
gene_df = gene_df.drop_duplicates()
gene_df.to_csv(OutputBed.output / (outfile.replace(".bed", "") +
"-gene.bed"), sep="\t", index=False)
if __name__ == "__main__":
get_other_exon_in_same_gene()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment