src/bed_handler/get_last_exons.py: script to get the last exons of each gene...

src/bed_handler/get_last_exons.py: script to get the last exons of each gene in a bed file given in input

src/bed_handler/get_last_exons.py: script to get the last exons of each gene...
80868e56 · nfontrod · ddda12b1 · 80868e56
Commit 80868e56 authored 4 years ago by nfontrod
--- a/src/bed_handler/get_last_exons.py
+++ b/src/bed_handler/get_last_exons.py
+#!/usr/bin/env python3
+
+# -*- coding: UTF-8 -*-
+
+"""
+Description: From a bed file containing genes, identifies the last exons in \
+genes.
+"""
+
+from pathlib import Path
+import pandas as pd
+from .config import TestConfig, BedConfig
+from typing import List
+from loguru import logger
+import lazyparser as lp
+
+
+def load_last_exons(exon_bed: Path) -> pd.DataFrame:
+    """
+    Load the last exons found in the file `exon_bed`.
+
+    :param exon_bed: A bed file containing exons
+    :return: The corresponding dataframe with only last exon
+
+    >>> load_last_exons(TestConfig.exon_bed)
+       #ref     start       end   id  score strand  id_gene
+    8    18  28666538  28666705  1_9      0      -        1
+    """
+    df = pd.read_csv(exon_bed, sep="\t")
+    if "id" not in df.columns:
+        raise IndexError("The dataframe should have a column named 'id'.")
+    df["id_gene"] = df["id"].str.replace(r"_\d+", "").astype(int)
+    df["pos"] = df["id"].str.replace(r"\d+_", "").astype(int)
+    df.sort_values(["id_gene", "pos"], ascending=True, inplace=True)
+    return df.drop_duplicates(subset="id_gene", keep="last")\
+        .drop("pos", axis=1)
+
+
+def get_id_gene(bed_gene: Path) -> List[int]:
+    """
+    get the id of genes inside a bed file.
+    :param bed_gene: A bed file containing genes
+    :return: The list of gene id present in bed_gene file
+
+    >>> get_id_gene(TestConfig.gene_bed)
+    [1, 2, 3, 4, 5, 6, 7, 8, 9]
+    """
+    df = pd.read_csv(bed_gene, sep="\t")
+    if "id" not in df.columns:
+        raise IndexError("The dataframe should have a column named 'id'.")
+    return df["id"].to_list()
+
+
+def get_last_exons(df_last_exon: pd.DataFrame, list_gene: List[int]
+                   ) -> pd.DataFrame:
+    """
+    Filter only last exons belonging to the genes located in \
+    list_gene.
+
+    :param df_last_exon: A dataframe containing only last exons
+    :param list_gene: A list of interest gene
+    :return: The list of last exons of interest
+
+    >>> d = pd.DataFrame({'#ref': {8: 18}, 'start': {8: 28666538},
+    ... 'end': {8: 28666705}, 'id': {8: '1_9'}, 'score': {8: 0},
+    ... 'strand': {8: '-'}, 'id_gene': {8: 1}})
+    >>> get_last_exons(d, [1])
+       #ref     start       end   id  score strand
+    8    18  28666538  28666705  1_9      0      -
+    >>> get_last_exons(d, [1, 2]) # displays a warning
+       #ref     start       end   id  score strand
+    8    18  28666538  28666705  1_9      0      -
+    >>> get_last_exons(d, [2]).empty # displays only a warning
+    True
+    """
+    df_last_exon = df_last_exon[df_last_exon["id_gene"].isin(list_gene)]
+    if df_last_exon.shape[0] != len(list_gene):
+        logger.warning(f"{len(list_gene) - df_last_exon.shape[0]} last exons "
+                       f"were not found for the given list of genes")
+    return df_last_exon.drop("id_gene", axis=1)
+
+
+@lp.parse(gene_bed="file")
+def get_last_exons_bed(gene_bed: str, outfile: str) -> None:
+    """
+    Create a bed file containing the last exons for each genes in `gene_bed`.
+
+    :param gene_bed: A bed file containing genes
+    :param outfile: The output bed file containing last-exons.
+    """
+    last_df = load_last_exons(BedConfig.exon_bed)
+    list_gene = get_id_gene(Path(gene_bed))
+    df_last = get_last_exons(last_df, list_gene)
+    df_last.to_csv(outfile, sep="\t", index=False)
+
+
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) == 1:
+        import doctest
+        doctest.testmod()
+    else:
+        get_last_exons_bed()