src/bed_handler/filter_bed.py: script that allows filtering rows in a bed file

96d104de · nfontrod · 9ccf2666 · 96d104de
Commit 96d104de authored Oct 23, 2020 by nfontrod
--- a/src/bed_handler/filter_bed.py
+++ b/src/bed_handler/filter_bed.py
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
+"""
+Description: This script filters a bed file, keeping only the rows whose \
+value in a specific column is listed in a separate file
+"""
+from pathlib import Path
+from typing import List, Union
+from .config import TestConfig, OutputBed
+import pandas as pd
+import lazyparser as lp
+def select_ft_of_interest(gene_file: Path) -> List[Union[int, str]]:
+    """
+    Get the FasterDB gene ids (or other values) listed in `gene_file`.
+
+    The file is read line by line, one value per line. Surrounding \
+    whitespace is removed and blank lines are skipped. Values made only \
+    of decimal digits are converted to int, the others are kept as str.
+
+    :param gene_file: A file containing a list of genes of interest
+    :return: The list of genes of interest
+
+    >>> select_ft_of_interest(TestConfig.list_genes)
+    [73, 75, 89, 123, 128]
+    """
+    with gene_file.open('r') as infile:
+        # Strip every line so that stray spaces or tabs around an id do not
+        # prevent the numeric conversion below.
+        values = [line.strip() for line in infile.read().splitlines()]
+    # `isdecimal` is used rather than `isdigit`: the latter also accepts
+    # characters such as superscript digits that `int` cannot convert.
+    return [int(value) if value.isdecimal() else value
+            for value in values if value]
+def filter_bed(bed_file: Path, gene_list: List[Union[int, str]],
+               col_name: str) -> pd.DataFrame:
+    """
+    Load a bed containing FasterDB genes and only recover the genes of \
+    interest within it.
+
+    A row is kept when its value in `col_name` is found in `gene_list`, \
+    either with its native type or through its textual form (so the \
+    int 1 in `gene_list` also selects the text "1" in the bed file).
+
+    :param bed_file: A tab-separated bed file with a header line
+    :param gene_list: A list of values of interest
+    :param col_name: The column in the bed file containing the values \
+    stored in `gene_list`
+    :return: The rows of the bed file whose `col_name` value is in \
+    `gene_list` (original row index preserved)
+    :raises KeyError: If `col_name` is not a column of the bed file
+
+    >>> filter_bed(TestConfig.gene_bed, [1, 5, 9], "id")
+       #ref     start       end  id     score strand
+    0    18  28645943  28682388   1      DSC2      -
+    4    13  45766989  45775176   5     KCTD4      -
+    8    13  45967450  45992516   9  SLC25A30      -
+    """
+    df = pd.read_csv(bed_file, sep="\t")
+    if col_name not in df.columns:
+        raise KeyError("Column '{}' not found in {} (available: {})".format(
+            col_name, bed_file, ", ".join(map(str, df.columns))))
+    column = df[col_name]
+    # Comparison with the dtype inferred by pandas (original behaviour).
+    mask = column.isin(gene_list)
+    # pandas loads a column mixing numbers and text (e.g. chromosomes
+    # "1" and "X") as text, whereas numeric entries of `gene_list` are
+    # ints: also compare textual forms so those rows are not dropped.
+    mask |= column.astype(str).isin([str(value) for value in gene_list])
+    return df[mask]
+@lp.parse(bed_file="file", filter_file="file")
+def create_filtered_bed(bed_file: str, filter_file: str, col_name: str,
+                        outfile: str, output: str = str(OutputBed.output)):
+    """
+    Filter the bed_file given with --bed_file parameter to only keep the \
+    rows containing the values stored in the file given with \
+    the parameter --filter_file inside the column with the name --col_name \
+    of the bed file.
+
+    :param bed_file: A bed file
+    :param filter_file: A file containing the values used to filter the \
+    rows of the bed file
+    :param col_name: The column used to filter rows
+    :param outfile: The name of the filtered bed file
+    :param output: Folder where the filtered bed will be created (default \
+    results/bed_file)
+    """
+    ft_list = select_ft_of_interest(Path(filter_file))
+    df = filter_bed(Path(bed_file), ft_list, col_name)
+    out_dir = Path(output)
+    # Create the destination folder when it is missing: `to_csv` does not
+    # create parent directories and would fail otherwise.
+    out_dir.mkdir(parents=True, exist_ok=True)
+    df.to_csv(out_dir / outfile, sep="\t", index=False)
+if __name__ == "__main__":
+    # Called without arguments: the `lp.parse` decorator is presumably in
+    # charge of reading them from the command line -- TODO confirm against
+    # the lazyparser documentation.
+    create_filtered_bed()