From 80868e566659f96c445dc6b5a637dd33e33e9df4 Mon Sep 17 00:00:00 2001 From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr> Date: Mon, 15 Mar 2021 18:14:36 +0100 Subject: [PATCH] src/bed_handler/get_last_exons.py: script to get the last exons of each gene in a bed file given in input --- src/bed_handler/get_last_exons.py | 103 ++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 src/bed_handler/get_last_exons.py diff --git a/src/bed_handler/get_last_exons.py b/src/bed_handler/get_last_exons.py new file mode 100644 index 0000000..1d5d1c2 --- /dev/null +++ b/src/bed_handler/get_last_exons.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 + +# -*- coding: UTF-8 -*- + +""" +Description: From a bed file containing genes, identifies the last exons in \ +genes. +""" + +from pathlib import Path +import pandas as pd +from .config import TestConfig, BedConfig +from typing import List +from loguru import logger +import lazyparser as lp + + +def load_last_exons(exon_bed: Path) -> pd.DataFrame: + """ + Load the last exons found in the file `exon_bed`. + + :param exon_bed: A bed file containing exons + :return: The corresponding dataframe with only last exon + + >>> load_last_exons(TestConfig.exon_bed) + #ref start end id score strand id_gene + 8 18 28666538 28666705 1_9 0 - 1 + """ + df = pd.read_csv(exon_bed, sep="\t") + if "id" not in df.columns: + raise IndexError("The dataframe should have a column named 'id'.") + df["id_gene"] = df["id"].str.replace(r"_\d+", "").astype(int) + df["pos"] = df["id"].str.replace(r"\d+_", "").astype(int) + df.sort_values(["id_gene", "pos"], ascending=True, inplace=True) + return df.drop_duplicates(subset="id_gene", keep="last")\ + .drop("pos", axis=1) + + +def get_id_gene(bed_gene: Path) -> List[int]: + """ + get the id of genes inside a bed file. + :param bed_gene: A bed file containing genes + :return: The list of gene id present in bed_gene file + + >>> get_id_gene(TestConfig.gene_bed) + [1, 2, 3, 4, 5, 6, 7, 8, 9] + """ + df = pd.read_csv(bed_gene, sep="\t") + if "id" not in df.columns: + raise IndexError("The dataframe should have a column named 'id'.") + return df["id"].to_list() + + +def get_last_exons(df_last_exon: pd.DataFrame, list_gene: List[int] + ) -> pd.DataFrame: + """ + Filter only last exons belonging to the genes located in \ + list_gene. + + :param df_last_exon: A dataframe containing only last exons + :param list_gene: A list of interest gene + :return: The list of last exons of interest + + >>> d = pd.DataFrame({'#ref': {8: 18}, 'start': {8: 28666538}, + ... 'end': {8: 28666705}, 'id': {8: '1_9'}, 'score': {8: 0}, + ... 'strand': {8: '-'}, 'id_gene': {8: 1}}) + >>> get_last_exons(d, [1]) + #ref start end id score strand + 8 18 28666538 28666705 1_9 0 - + >>> get_last_exons(d, [1, 2]) # displays a warning + #ref start end id score strand + 8 18 28666538 28666705 1_9 0 - + >>> get_last_exons(d, [2]).empty # displays only a warning + True + """ + df_last_exon = df_last_exon[df_last_exon["id_gene"].isin(list_gene)] + if df_last_exon.shape[0] != len(list_gene): + logger.warning(f"{len(list_gene) - df_last_exon.shape[0]} last exons " + f"were not found for the given list of genes") + return df_last_exon.drop("id_gene", axis=1) + + +@lp.parse(gene_bed="file") +def get_last_exons_bed(gene_bed: str, outfile: str) -> None: + """ + Create a bed file containing the last exons for each genes in `gene_bed`. + + :param gene_bed: A bed file containing genes + :param outfile: The output bed file containing last-exons. + """ + last_df = load_last_exons(BedConfig.exon_bed) + list_gene = get_id_gene(Path(gene_bed)) + df_last = get_last_exons(last_df, list_gene) + df_last.to_csv(outfile, sep="\t", index=False) + + +if __name__ == "__main__": + import sys + if len(sys.argv) == 1: + import doctest + doctest.testmod() + else: + get_last_exons_bed() -- GitLab