Skip to content
Snippets Groups Projects
Commit 96d104de authored by nfontrod's avatar nfontrod
Browse files

src/bed_handler/filter_bed.py: script that allows filtering the rows of a bed file

parent 9ccf2666
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""
Description: This script filters a bed file: it only keeps the rows whose \
value, in a given column of the bed file, is listed in another file
"""
from pathlib import Path
from typing import List, Union

import lazyparser as lp
import pandas as pd

from .config import TestConfig, OutputBed
def select_ft_of_interest(gene_file: Path) -> List[Union[int, str]]:
    """
    Get the values of interest (e.g. FasterDB gene ids) listed in the file \
    `gene_file`.

    The file is expected to contain one value per line. Surrounding \
    whitespace is removed and blank lines are skipped. Values made only of \
    decimal digits are converted to int, every other value is kept as a \
    string.

    :param gene_file: A file containing a list of gene of interest
    :return: The list of gene of interest, in the order of the file

    >>> select_ft_of_interest(TestConfig.list_genes)
    [73, 75, 89, 123, 128]
    """
    with gene_file.open('r') as infile:
        # Strip every line so that a padded id such as "73 " is still
        # recognised as a number instead of being kept as a string.
        values = [line.strip() for line in infile.read().splitlines()]
    # `isdecimal` rather than `isdigit`: the latter also accepts characters
    # (e.g. superscript digits) that `int` is unable to convert.
    return [int(value) if value.isdecimal() else value
            for value in values if value]
def filter_bed(bed_file: Path, gene_list: List[int], col_name: str
               ) -> pd.DataFrame:
    """
    Read a tab-separated bed file and only keep the rows whose value in \
    the column `col_name` is present in `gene_list`.

    :param bed_file: A bed file containing genes, with a header line
    :param gene_list: The values of interest
    :param col_name: The name of the bed column whose values are looked \
    up in `gene_list`
    :return: The rows of the bed file matching `gene_list` (the row index \
    of the original table is kept)

    >>> filter_bed(TestConfig.gene_bed, [1, 5, 9], "id")
       #ref     start       end  id     score  strand
    0    18  28645943  28682388   1      DSC2       -
    4    13  45766989  45775176   5     KCTD4       -
    8    13  45967450  45992516   9  SLC25A30       -
    """
    bed_content = pd.read_csv(bed_file, sep="\t")
    # Boolean mask: True for each row holding a value of interest.
    is_of_interest = bed_content[col_name].isin(gene_list)
    return bed_content.loc[is_of_interest]
@lp.parse(bed_file="file", filter_file="file")
def create_filtered_bed(bed_file: str, filter_file: str, col_name: str,
                        outfile: str, output: str = str(OutputBed.output)):
    """
    Filter the bed_file given with --bed_file parameter to only keep the \
    rows containing the values stored in the file given with \
    the parameter --filter_file inside the column with the name --col_name \
    of the bed file.
    :param bed_file: A bed file
    :param filter_file: A file containing the values used to filter the \
    rows of the bed file
    :param col_name: The column used to filter rows
    :param outfile: The name of the filtered bed file
    :param output: Folder where the filtered bed will be created (default \
    results/bed_file)
    """
    ft_list = select_ft_of_interest(Path(filter_file))
    df = filter_bed(Path(bed_file), ft_list, col_name)
    output_dir = Path(output)
    # `DataFrame.to_csv` does not create missing folders: make sure the
    # destination exists so the filtered table can always be written.
    output_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_dir / outfile, sep="\t", index=False)
if __name__ == "__main__":
    # Script entry point. No argument is passed here: they are expected to
    # be supplied by the `lp.parse` decorator wrapping the function.
    create_filtered_bed()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment