#!/usr/bin/env python3

# -*- coding: UTF-8 -*-

"""
Description: This script aims to filter a bed file given the values stored \
in a file, which can be found in a specific column of the bed file
"""

from pathlib import Path
from typing import List, Union

import lazyparser as lp
import pandas as pd

from .config import TestConfig, OutputBed


def select_ft_of_interest(gene_file: Path) -> List[Union[int, str]]:
    """
    Get the values of interest (e.g. FasterDB gene ids) listed in the \
file `gene_file`, one value per line.

    Surrounding whitespace is stripped and blank lines are skipped. \
Values made only of decimal digits are converted to int, every other \
value is kept as a string.

    :param gene_file: A file containing a list of gene of interest
    :return: The list of gene of interest

    >>> select_ft_of_interest(TestConfig.list_genes)
    [73, 75, 89, 123, 128]
    """
    with gene_file.open('r') as infile:
        raw_values = infile.read().splitlines()
    features = []
    for raw_value in raw_values:
        value = raw_value.strip()
        if not value:
            # A blank line (e.g. a trailing newline) is not a feature.
            continue
        # isdecimal() only accepts characters int() can parse, whereas
        # isdigit() also accepts characters such as superscript digits
        # on which int() raises a ValueError.
        features.append(int(value) if value.isdecimal() else value)
    return features


def filter_bed(bed_file: Path, gene_list: List[Union[int, str]],
               col_name: str) -> pd.DataFrame:
    """
    Load a bed file containing FasterDB genes and only recover the genes \
of interest within it.

    :param bed_file: A bed file containing genes (tab separated, with a \
header line)
    :param gene_list: a list of gene of interest
    :param col_name: The column in the bed file containing the values \
stored in `gene_list`
    :return: The rows of the bed file whose `col_name` value is located \
in gene_list
    :raises KeyError: if `col_name` is not a column of the bed file

    >>> filter_bed(TestConfig.gene_bed, [1, 5, 9], "id")
    ... # doctest: +NORMALIZE_WHITESPACE
       #ref     start       end  id     score  strand
    0    18  28645943  28682388   1      DSC2       -
    4    13  45766989  45775176   5     KCTD4       -
    8    13  45967450  45992516   9  SLC25A30       -
    """
    df = pd.read_csv(bed_file, sep="\t")
    if col_name not in df.columns:
        # Same exception type as the plain pandas lookup, but tells the
        # user which columns are actually available.
        raise KeyError("Column {!r} not found in {}. Available columns: {}"
                       .format(col_name, bed_file, list(df.columns)))
    return df[df[col_name].isin(gene_list)]


# NOTE(review): lazyparser presumably builds the command line help from the
# docstring below - keep the `:param name:` layout when editing it.
@lp.parse(bed_file="file", filter_file="file")
def create_filtered_bed(bed_file: str, filter_file: str, col_name: str,
                        outfile: str, output: str = str(OutputBed.output)):
    """
    Filter the bed_file given with --bed_file parameter to only keep the \
    rows containing the values stored in the file given with \
    the parameter --filter_file inside the column with the name --col_name \
    of the bed file.

    :param bed_file: A bed file
    :param filter_file: A file containing the values used to filter the \
    rows of the bed file
    :param col_name: The column used to filter rows
    :param outfile: The name of the filtered bed file
    :param output: Folder where the filtered bed will be created (default \
    results/bed_file)
    """
    ft_list = select_ft_of_interest(Path(filter_file))
    df = filter_bed(Path(bed_file), ft_list, col_name)
    out_dir = Path(output)
    # The output folder may not exist yet on a fresh checkout; without this
    # to_csv fails with an OSError.
    out_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_dir / outfile, sep="\t", index=False)


if __name__ == "__main__":
    create_filtered_bed()