From bb8ce1bab3ad098548a2343ac458a4f5848658fa Mon Sep 17 00:00:00 2001 From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr> Date: Thu, 7 Jan 2021 19:32:46 +0100 Subject: [PATCH] src/bed_handler/bed_resize.py: add a parameter type --- src/bed_handler/bed_resize.py | 130 +++++++++++++++++++++++++++++----- 1 file changed, 112 insertions(+), 18 deletions(-) diff --git a/src/bed_handler/bed_resize.py b/src/bed_handler/bed_resize.py index 8c10bda..51b1e96 100644 --- a/src/bed_handler/bed_resize.py +++ b/src/bed_handler/bed_resize.py @@ -12,35 +12,36 @@ from doctest import testmod from .config import OutputBed -def resize_row_from_start(rowo: pd.Series, size: int, resize_from: str - ) -> pd.Series: +def resize_row_inner(rowo: pd.Series, size: int, resize_from: str + ) -> pd.Series: """ - Resize the bed feature from start inside a bed row. + Resize the bed feature inside a bed row. :param rowo: A bed row :param size: The maximum size the feature must have :param resize_from: The coordinate for which we want to resize (default \ "start") + :param type: The type of resize to make :return: The row resized >>> crow = pd.Series({"#ref": 10, "start": 10, "end": 20, "id": 1, ... "score": 1, "strand": "+"}) - >>> resize_row_from_start(crow, 5, "start").to_dict() + >>> resize_row_inner(crow, 5, "start").to_dict() {'#ref': 10, 'start': 10, 'end': 15, 'id': 1, 'score': 1, 'strand': '+'} - >>> resize_row_from_start(crow, 5, "end").to_dict() + >>> resize_row_inner(crow, 5, "end").to_dict() {'#ref': 10, 'start': 15, 'end': 20, 'id': 1, 'score': 1, 'strand': '+'} - >>> resize_row_from_start(crow, 20, "start").to_dict() - {'#ref': 10, 'start': 10, 'end': 20, 'id': 1, 'score': 1, 'strand': '+'} + >>> resize_row_inner(crow, 7, "start").to_dict() + {'#ref': 10, 'start': 10, 'end': 17, 'id': 1, 'score': 1, 'strand': '+'} >>> crow = pd.Series({"#ref": 10, "start": 50, "end": 60, "id": 1, ... "score": 1, "strand": "-"}) - >>> resize_row_from_start(crow, 5, "start").to_dict() + >>> resize_row_inner(crow, 5, "start").to_dict() {'#ref': 10, 'start': 55, 'end': 60, 'id': 1, 'score': 1, 'strand': '-'} - >>> resize_row_from_start(crow, 5, "end").to_dict() + >>> resize_row_inner(crow, 5, "end").to_dict() {'#ref': 10, 'start': 50, 'end': 55, 'id': 1, 'score': 1, 'strand': '-'} """ row = rowo.copy() row_strand = row["strand"] - d= {"+": "-", "-": "+"} + d = {"+": "-", "-": "+"} row_strand = d[row_strand] if resize_from == "end" else row_strand if row["end"] - row["start"] <= size: return row @@ -51,8 +52,90 @@ def resize_row_from_start(rowo: pd.Series, size: int, resize_from: str return row +def resize_row_outer(rowo: pd.Series, size: int, resize_from: str + ) -> pd.Series: + """ + Resize the bed feature inside a bed row. + + :param rowo: A bed row + :param size: The maximum size the feature must have + :param resize_from: The coordinate for which we want to resize (default \ + "start") + :param type: The type of resize to make + :return: The row resized + + >>> crow = pd.Series({"#ref": 10, "start": 10, "end": 20, "id": 1, + ... "score": 1, "strand": "+"}) + >>> resize_row_outer(crow, 5, "start").to_dict() + {'#ref': 10, 'start': 5, 'end': 10, 'id': 1, 'score': 1, 'strand': '+'} + >>> resize_row_outer(crow, 5, "end").to_dict() + {'#ref': 10, 'start': 20, 'end': 25, 'id': 1, 'score': 1, 'strand': '+'} + >>> crow = pd.Series({"#ref": 10, "start": 50, "end": 60, "id": 1, + ... "score": 1, "strand": "-"}) + >>> resize_row_outer(crow, 5, "start").to_dict() + {'#ref': 10, 'start': 60, 'end': 65, 'id': 1, 'score': 1, 'strand': '-'} + >>> resize_row_outer(crow, 5, "end").to_dict() + {'#ref': 10, 'start': 45, 'end': 50, 'id': 1, 'score': 1, 'strand': '-'} + """ + row = rowo.copy() + row_strand = row["strand"] + if ( + resize_from == "start" + and row_strand == "+" + or resize_from != "start" + and row_strand != "+" + ): + row['end'] = row['start'] + row['start'] -= size + else: + row['start'] = row["end"] + row["end"] += size + return row + + +def resize_row(rowo: pd.Series, size: int, resize_from: str, + type: str) -> pd.Series: + """ + Resize the bed feature inside a bed row. + + :param rowo: A bed row + :param size: The maximum size the feature must have + :param resize_from: The coordinate for which we want to resize (default \ + "start") + :param type: The type of resize to make + :return: The row resized + + >>> crow = pd.Series({"#ref": 10, "start": 10, "end": 20, "id": 1, + ... "score": 1, "strand": "+"}) + >>> resize_row(crow, 5, "start", "inner").to_dict() + {'#ref': 10, 'start': 10, 'end': 15, 'id': 1, 'score': 1, 'strand': '+'} + >>> resize_row(crow, 5, "end", "inner").to_dict() + {'#ref': 10, 'start': 15, 'end': 20, 'id': 1, 'score': 1, 'strand': '+'} + >>> resize_row(crow, 7, "start", "inner").to_dict() + {'#ref': 10, 'start': 10, 'end': 17, 'id': 1, 'score': 1, 'strand': '+'} + >>> resize_row(crow, 5, "start", "outer").to_dict() + {'#ref': 10, 'start': 5, 'end': 10, 'id': 1, 'score': 1, 'strand': '+'} + >>> resize_row(crow, 5, "end", "outer").to_dict() + {'#ref': 10, 'start': 20, 'end': 25, 'id': 1, 'score': 1, 'strand': '+'} + >>> crow = pd.Series({"#ref": 10, "start": 50, "end": 60, "id": 1, + ... "score": 1, "strand": "-"}) + >>> resize_row(crow, 5, "start", "inner").to_dict() + {'#ref': 10, 'start': 55, 'end': 60, 'id': 1, 'score': 1, 'strand': '-'} + >>> resize_row(crow, 5, "end", "inner").to_dict() + {'#ref': 10, 'start': 50, 'end': 55, 'id': 1, 'score': 1, 'strand': '-'} + >>> resize_row(crow, 5, "start", "outer").to_dict() + {'#ref': 10, 'start': 60, 'end': 65, 'id': 1, 'score': 1, 'strand': '-'} + >>> resize_row(crow, 5, "end", "outer").to_dict() + {'#ref': 10, 'start': 45, 'end': 50, 'id': 1, 'score': 1, 'strand': '-'} + """ + if type == "inner": + return resize_row_inner(rowo, size, resize_from) + else: + return resize_row_outer(rowo, size, resize_from) + + def update_bed(df_bed: pd.DataFrame, size: int, - resize_from: str) -> pd.DataFrame: + resize_from: str, type: str) -> pd.DataFrame: """ Resize each feature in a bed dataframe. @@ -61,28 +144,38 @@ def update_bed(df_bed: pd.DataFrame, size: int, :param resize_from: The coordinate for which we want to resize (default \ "start") :return: The dataframe resized + :param type: The type of resize to make >>> cdf = pd.DataFrame({"#ref": [1, 1], "start": [10, 50], ... "end": [20, 60], "id": [1, 2], "strand": ["+", "-"]}) - >>> update_bed(cdf, 5, "start") + >>> update_bed(cdf, 5, "start", "inner") #ref start end id strand 0 1 10 15 1 + 1 1 55 60 2 - - >>> update_bed(cdf, 5, "end") + >>> update_bed(cdf, 5, "end", "inner") #ref start end id strand 0 1 15 20 1 + 1 1 50 55 2 - + >>> update_bed(cdf, 5, "start", "outer") + #ref start end id strand + 0 1 5 10 1 + + 1 1 60 65 2 - + >>> update_bed(cdf, 5, "end", "outer") + #ref start end id strand + 0 1 20 25 1 + + 1 1 45 50 2 - """ list_s = [ - resize_row_from_start(df_bed.iloc[i, :], size, resize_from) + resize_row(df_bed.iloc[i, :], size, resize_from, type) for i in range(df_bed.shape[0]) ] return pd.DataFrame(list_s) -@lp.parse(bed="file", size="size > 0", resize_from=["start", "end"]) +@lp.parse(bed="file", size="size > 0", resize_from=["start", "end"], + type=["inner", "outer"]) def bed_resizer(bed: str, size: int, outfile: str, - resize_from: str = "start") -> None: + resize_from: str = "start", type: str = "inner") -> None: """ Resize bed features inside a bed file from their start or stop \ position (according to their strand). @@ -92,11 +185,12 @@ def bed_resizer(bed: str, size: int, outfile: str, :param outfile: The output bed name :param resize_from: The coordinate for which we want to resize (default \ "start") + :param type: The type of resize to make """ df = pd.read_csv(bed, sep="\t") - ndf = update_bed(df, size, resize_from) + ndf = update_bed(df, size, resize_from, type) ndf.to_csv(OutputBed.output / outfile, sep="\t", index=False) if __name__ == "__main__": - bed_resizer() \ No newline at end of file + bed_resizer() -- GitLab