Skip to content
Snippets Groups Projects
Commit 80868e56 authored by nfontrod's avatar nfontrod
Browse files

src/bed_handler/get_last_exons.py: script to get the last exons of each gene...

src/bed_handler/get_last_exons.py: script to get the last exons of each gene in a bed file given in input
parent ddda12b1
Branches
No related tags found
No related merge requests found
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""
Description: From a bed file containing genes, identifies the last exons in \
genes.
"""
from pathlib import Path
import pandas as pd
from .config import TestConfig, BedConfig
from typing import List
from loguru import logger
import lazyparser as lp
def load_last_exons(exon_bed: Path) -> pd.DataFrame:
"""
Load the last exons found in the file `exon_bed`.
:param exon_bed: A bed file containing exons
:return: The corresponding dataframe with only last exon
>>> load_last_exons(TestConfig.exon_bed)
#ref start end id score strand id_gene
8 18 28666538 28666705 1_9 0 - 1
"""
df = pd.read_csv(exon_bed, sep="\t")
if "id" not in df.columns:
raise IndexError("The dataframe should have a column named 'id'.")
df["id_gene"] = df["id"].str.replace(r"_\d+", "").astype(int)
df["pos"] = df["id"].str.replace(r"\d+_", "").astype(int)
df.sort_values(["id_gene", "pos"], ascending=True, inplace=True)
return df.drop_duplicates(subset="id_gene", keep="last")\
.drop("pos", axis=1)
def get_id_gene(bed_gene: Path) -> List[int]:
"""
get the id of genes inside a bed file.
:param bed_gene: A bed file containing genes
:return: The list of gene id present in bed_gene file
>>> get_id_gene(TestConfig.gene_bed)
[1, 2, 3, 4, 5, 6, 7, 8, 9]
"""
df = pd.read_csv(bed_gene, sep="\t")
if "id" not in df.columns:
raise IndexError("The dataframe should have a column named 'id'.")
return df["id"].to_list()
def get_last_exons(df_last_exon: pd.DataFrame, list_gene: List[int]
) -> pd.DataFrame:
"""
Filter only last exons belonging to the genes located in \
list_gene.
:param df_last_exon: A dataframe containing only last exons
:param list_gene: A list of interest gene
:return: The list of last exons of interest
>>> d = pd.DataFrame({'#ref': {8: 18}, 'start': {8: 28666538},
... 'end': {8: 28666705}, 'id': {8: '1_9'}, 'score': {8: 0},
... 'strand': {8: '-'}, 'id_gene': {8: 1}})
>>> get_last_exons(d, [1])
#ref start end id score strand
8 18 28666538 28666705 1_9 0 -
>>> get_last_exons(d, [1, 2]) # displays a warning
#ref start end id score strand
8 18 28666538 28666705 1_9 0 -
>>> get_last_exons(d, [2]).empty # displays only a warning
True
"""
df_last_exon = df_last_exon[df_last_exon["id_gene"].isin(list_gene)]
if df_last_exon.shape[0] != len(list_gene):
logger.warning(f"{len(list_gene) - df_last_exon.shape[0]} last exons "
f"were not found for the given list of genes")
return df_last_exon.drop("id_gene", axis=1)
@lp.parse(gene_bed="file")
def get_last_exons_bed(gene_bed: str, outfile: str) -> None:
"""
Create a bed file containing the last exons for each genes in `gene_bed`.
:param gene_bed: A bed file containing genes
:param outfile: The output bed file containing last-exons.
"""
last_df = load_last_exons(BedConfig.exon_bed)
list_gene = get_id_gene(Path(gene_bed))
df_last = get_last_exons(last_df, list_gene)
df_last.to_csv(outfile, sep="\t", index=False)
if __name__ == "__main__":
import sys
if len(sys.argv) == 1:
import doctest
doctest.testmod()
else:
get_last_exons_bed()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment