#!/usr/bin/env python3

# -*- coding: UTF-8 -*-

"""
Description: The goal of this script is to resize the features of a bed file.

Every feature longer than a maximum size is trimmed down to that size, \
keeping either its strand-aware start or its strand-aware end.
"""

import pandas as pd
import lazyparser as lp
from doctest import testmod
from .config import OutputBed


def resize_row_from_start(rowo: pd.Series, size: int, resize_from: str
                          ) -> pd.Series:
    """
    Trim the feature described by a bed row down to at most ``size``.

    The input row is never modified: a copy is returned. Features that \
    are already ``size`` long or shorter are returned unchanged.

    Only the ``"-"`` strand is considered reverse. Any other strand value \
    (``"+"``, or ``"."`` for unstranded features) is handled like ``"+"``.

    :param rowo: A bed row with at least the fields start, end and strand
    :param size: The maximum size the feature must have
    :param resize_from: "start" to keep the strand-aware start of the \
    feature, "end" to keep its strand-aware end. Any value other than \
    "end" behaves like "start".
    :return: A resized copy of the row

    >>> crow = pd.Series({"#ref": 10, "start": 10, "end": 20, "id": 1,
    ...                   "score": 1, "strand": "+"})
    >>> resize_row_from_start(crow, 5, "start").to_dict()
    {'#ref': 10, 'start': 10, 'end': 15, 'id': 1, 'score': 1, 'strand': '+'}
    >>> resize_row_from_start(crow, 5, "end").to_dict()
    {'#ref': 10, 'start': 15, 'end': 20, 'id': 1, 'score': 1, 'strand': '+'}
    >>> resize_row_from_start(crow, 20, "start").to_dict()
    {'#ref': 10, 'start': 10, 'end': 20, 'id': 1, 'score': 1, 'strand': '+'}
    >>> crow = pd.Series({"#ref": 10, "start": 50, "end": 60, "id": 1,
    ...                   "score": 1, "strand": "-"})
    >>> resize_row_from_start(crow, 5, "start").to_dict()
    {'#ref': 10, 'start': 55, 'end': 60, 'id': 1, 'score': 1, 'strand': '-'}
    >>> resize_row_from_start(crow, 5, "end").to_dict()
    {'#ref': 10, 'start': 50, 'end': 55, 'id': 1, 'score': 1, 'strand': '-'}
    >>> crow = pd.Series({"#ref": 10, "start": 50, "end": 60, "id": 1,
    ...                   "score": 1, "strand": "."})
    >>> resize_row_from_start(crow, 5, "start").to_dict()
    {'#ref': 10, 'start': 50, 'end': 55, 'id': 1, 'score': 1, 'strand': '.'}
    >>> resize_row_from_start(crow, 5, "end").to_dict()
    {'#ref': 10, 'start': 55, 'end': 60, 'id': 1, 'score': 1, 'strand': '.'}
    """
    row = rowo.copy()
    if row["end"] - row["start"] <= size:
        return row
    # The genomic "end" coordinate is the one to keep when the feature is
    # on the reverse strand and we resize from its start, or when it is on
    # the forward strand and we resize from its end. Comparing the two
    # booleans avoids a strand lookup table that fails on "." strands.
    on_reverse_strand = row["strand"] == "-"
    keep_genomic_end = on_reverse_strand != (resize_from == "end")
    if keep_genomic_end:
        row["start"] = row["end"] - size
    else:
        row["end"] = row["start"] + size
    return row


def update_bed(df_bed: pd.DataFrame, size: int,
               resize_from: str) -> pd.DataFrame:
    """
    Trim every feature of a bed dataframe down to at most ``size``.

    The computation is done column-wise, so the columns, the index and \
    the column order of ``df_bed`` are preserved even when the dataframe \
    is empty. ``df_bed`` itself is not modified.

    Strands are interpreted as in ``resize_row_from_start``: only ``"-"`` \
    is reverse, every other value is handled like ``"+"``.

    :param df_bed: The dataframe corresponding to a bed to resize. It \
    must contain the columns start, end and strand.
    :param size: The maximum size the feature must have
    :param resize_from: "start" to keep the strand-aware start of each \
    feature, "end" to keep its strand-aware end. Any value other than \
    "end" behaves like "start".
    :return: The dataframe resized

    >>> cdf = pd.DataFrame({"#ref": [1, 1], "start": [10, 50],
    ...                     "end": [20, 60], "id": [1, 2],
    ...                     "strand": ["+", "-"]})
    >>> update_bed(cdf, 5, "start")
       #ref  start  end  id strand
    0     1     10   15   1      +
    1     1     55   60   2      -
    >>> update_bed(cdf, 5, "end")
       #ref  start  end  id strand
    0     1     15   20   1      +
    1     1     50   55   2      -
    >>> update_bed(cdf, 20, "start").equals(cdf)
    True
    >>> update_bed(cdf.iloc[:0], 5, "start").columns.tolist()
    ['#ref', 'start', 'end', 'id', 'strand']
    """
    resized = df_bed.copy()
    too_long = (df_bed["end"] - df_bed["start"]) > size
    # True where the genomic "end" coordinate is the one to keep.
    keep_genomic_end = df_bed["strand"] == "-"
    if resize_from == "end":
        keep_genomic_end = ~keep_genomic_end
    # Both replacements are computed from the untouched input columns, so
    # the order of the two assignments below does not matter.
    resized["end"] = df_bed["end"].mask(
        too_long & ~keep_genomic_end, df_bed["start"] + size
    )
    resized["start"] = df_bed["start"].mask(
        too_long & keep_genomic_end, df_bed["end"] - size
    )
    return resized


@lp.parse(bed="file", size="size > 0", resize_from=["start", "end"])
def bed_resizer(bed: str, size: int, outfile: str,
                resize_from: str = "start") -> None:
    """
    Resize bed features inside a bed file from their start or stop \
    position (according to their strand).

    :param bed: A bed file with the features to resize
    :param size: The maximum size the feature must have
    :param outfile: The output bed name
    :param resize_from: The coordinate for which we want to resize (default \
    "start")
    """
    # The first line of the file is used as the header: it must name at
    # least the start, end and strand columns.
    df = pd.read_csv(bed, sep="\t")
    ndf = update_bed(df, size, resize_from)
    # NOTE(review): OutputBed.output is presumably a pathlib.Path pointing
    # to an existing directory - confirm in the config module.
    ndf.to_csv(OutputBed.output / outfile, sep="\t", index=False)


if __name__ == "__main__":
    bed_resizer()