diff --git a/src/bed_handler/filter_bed.py b/src/bed_handler/filter_bed.py index 81fbf898658e29be88a8b1567311ff6fac5dacca..b88f833831ad1f828c4f3274a5ccd6b501761b26 100644 --- a/src/bed_handler/filter_bed.py +++ b/src/bed_handler/filter_bed.py @@ -30,7 +30,7 @@ def select_ft_of_interest(gene_file: Path) -> List[int]: gene_id for gene_id in gene_list] -def filter_bed(bed_file: Path, gene_list: List[int], col_name: str +def filter_bed(bed_file: Path, gene_list: List[int], col_name: str, keep: bool ) -> pd.DataFrame: """ load a bed containing FasterDB gene and only recover the gene of \ @@ -40,21 +40,34 @@ def filter_bed(bed_file: Path, gene_list: List[int], col_name: str :param gene_list: a list of gene of interest :param col_name: The column in the bed file containing the valeus \ stored in `gene_list` + :param keep: if true, rows whose value is in `gene_list` are kept, \ + otherwise they are dropped from the original bed file :return: The bed file bed containing only genes located in gene_list - >>> filter_bed(TestConfig.gene_bed, [1, 5, 9], "id") + >>> filter_bed(TestConfig.gene_bed, [1, 5, 9], "id", True) #ref start end id score strand 0 18 28645943 28682388 1 DSC2 - 4 13 45766989 45775176 5 KCTD4 - 8 13 45967450 45992516 9 SLC25A30 - + >>> filter_bed(TestConfig.gene_bed, [1, 5, 9], "id", False) + #ref start end id score strand + 1 18 28709190 28742819 2 DSC1 - + 2 18 28898050 28937394 3 DSG1 + + 3 18 28956739 28994869 4 DSG4 + + 5 13 45911001 45915347 6 TPT1 - + 6 18 48918411 49088839 7 AC011260.1 + + 7 18 49866541 51062273 8 DCC + """ df = pd.read_csv(bed_file, sep="\t") - return df[df[col_name].isin(gene_list)] + if keep: + return df[df[col_name].isin(gene_list)] + return df[-df[col_name].isin(gene_list)] @lp.parse(bed_file="file", filter_file="file") def create_filtered_bed(bed_file: str, filter_file: str, col_name: str, - outfile: str, output: str = str(OutputBed.output)): + outfile: str, output: str = str(OutputBed.output), + keep: str = 'y'): """ Filter the 
bed_file given with --bed_file parameter to only keep the \ rows containing the values stored in the file given with \ @@ -68,9 +81,12 @@ def create_filtered_bed(bed_file: str, filter_file: str, col_name: str, :param outfile: The name of the filtered ned file :param output: Folder where the filtered bam will be created (default \ results/bed_file) + :param keep: if 'y', rows matching the filter_file values are kept, \ + otherwise they are dropped from the original bed file (default 'y') """ + k = keep.lower() == 'y' ft_list = select_ft_of_interest(Path(filter_file)) - df = filter_bed(Path(bed_file), ft_list, col_name) + df = filter_bed(Path(bed_file), ft_list, col_name, k) df.to_csv(Path(output) / outfile, sep="\t", index=False)