Skip to content
Snippets Groups Projects
Commit 2a094c95 authored by nfontrod's avatar nfontrod
Browse files

src/bed_handler/filter_bed.py: add an option keep to recover every other genes...

src/bed_handler/filter_bed.py: add an option keep to recover every other genes not in the filter_file
parent c22ad3fd
No related branches found
No related tags found
No related merge requests found
Pipeline #144 failed
......@@ -30,7 +30,7 @@ def select_ft_of_interest(gene_file: Path) -> List[int]:
gene_id for gene_id in gene_list]
def filter_bed(bed_file: Path, gene_list: List[int], col_name: str
def filter_bed(bed_file: Path, gene_list: List[int], col_name: str, keep: bool
) -> pd.DataFrame:
"""
load a bed containing FasterDB gene and only recover the gene of \
......@@ -40,21 +40,34 @@ def filter_bed(bed_file: Path, gene_list: List[int], col_name: str
:param gene_list: a list of gene of interest
:param col_name: The column in the bed file containing the values \
stored in `gene_list`
:param keep: if true, only the rows whose value is in `gene_list` are \
kept; otherwise those rows are dropped from the original bed file
:return: The bed file content restricted to the genes located in gene_list (or to every other gene when keep is false)
>>> filter_bed(TestConfig.gene_bed, [1, 5, 9], "id")
>>> filter_bed(TestConfig.gene_bed, [1, 5, 9], "id", True)
#ref start end id score strand
0 18 28645943 28682388 1 DSC2 -
4 13 45766989 45775176 5 KCTD4 -
8 13 45967450 45992516 9 SLC25A30 -
>>> filter_bed(TestConfig.gene_bed, [1, 5, 9], "id", False)
#ref start end id score strand
1 18 28709190 28742819 2 DSC1 -
2 18 28898050 28937394 3 DSG1 +
3 18 28956739 28994869 4 DSG4 +
5 13 45911001 45915347 6 TPT1 -
6 18 48918411 49088839 7 AC011260.1 +
7 18 49866541 51062273 8 DCC +
"""
df = pd.read_csv(bed_file, sep="\t")
return df[df[col_name].isin(gene_list)]
if keep:
return df[df[col_name].isin(gene_list)]
return df[-df[col_name].isin(gene_list)]
@lp.parse(bed_file="file", filter_file="file")
def create_filtered_bed(bed_file: str, filter_file: str, col_name: str,
outfile: str, output: str = str(OutputBed.output)):
outfile: str, output: str = str(OutputBed.output),
keep: str = 'y'):
"""
Filter the bed_file given with --bed_file parameter to only keep the \
rows containing the values stored in the file given with \
......@@ -68,9 +81,12 @@ def create_filtered_bed(bed_file: str, filter_file: str, col_name: str,
:param outfile: The name of the filtered bed file
:param output: Folder where the filtered bed will be created (default \
results/bed_file)
:param keep: if 'y', everything inside the filter_file is kept; \
otherwise it is dropped from the original bed file (default 'y')
"""
k = keep.lower() == 'y'
ft_list = select_ft_of_interest(Path(filter_file))
df = filter_bed(Path(bed_file), ft_list, col_name)
df = filter_bed(Path(bed_file), ft_list, col_name, k)
df.to_csv(Path(output) / outfile, sep="\t", index=False)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment