src/bed_handler/bed_resize.py: allow to resize the bed file

1144c187 · nfontrod · a5e8bfbb · 1144c187
Commit 1144c187 authored 4 years ago by nfontrod
--- a/src/bed_handler/bed_resize.py
+++ b/src/bed_handler/bed_resize.py
+#!/usr/bin/env python3
+
+# -*- coding: UTF-8 -*-
+
+"""
+Description: The goal of this script is to resize the features of a bed file
+"""
+
+import pandas as pd
+import lazyparser as lp
+from doctest import testmod
+from .config import OutputBed
+
+
+def resize_row_from_start(rowo: pd.Series, size: int, resize_from: str
+                          ) -> pd.Series:
+    """
+    Resize the bed feature from start inside a bed row.
+
+    :param rowo: A bed row
+    :param size: The maximum size the feature must have
+    :param resize_from: The coordinate for which we want to resize (default \
+    "start")
+    :return: The row resized
+
+    >>> crow = pd.Series({"#ref": 10, "start": 10, "end": 20, "id": 1,
+    ... "score": 1, "strand": "+"})
+    >>> resize_row_from_start(crow, 5, "start").to_dict()
+    {'#ref': 10, 'start': 10, 'end': 15, 'id': 1, 'score': 1, 'strand': '+'}
+    >>> resize_row_from_start(crow, 5, "end").to_dict()
+    {'#ref': 10, 'start': 15, 'end': 20, 'id': 1, 'score': 1, 'strand': '+'}
+    >>> resize_row_from_start(crow, 20, "start").to_dict()
+    {'#ref': 10, 'start': 10, 'end': 20, 'id': 1, 'score': 1, 'strand': '+'}
+    >>> crow = pd.Series({"#ref": 10, "start": 50, "end": 60, "id": 1,
+    ... "score": 1, "strand": "-"})
+    >>> resize_row_from_start(crow, 5, "start").to_dict()
+    {'#ref': 10, 'start': 55, 'end': 60, 'id': 1, 'score': 1, 'strand': '-'}
+    >>> resize_row_from_start(crow, 5, "end").to_dict()
+    {'#ref': 10, 'start': 50, 'end': 55, 'id': 1, 'score': 1, 'strand': '-'}
+    """
+    row = rowo.copy()
+    row_strand = row["strand"]
+    d= {"+": "-", "-": "+"}
+    row_strand = d[row_strand] if resize_from == "end" else row_strand
+    if row["end"] - row["start"] <= size:
+        return row
+    if row_strand == "+":
+        row['end'] = row['start'] + size
+    else:
+        row['start'] = row['end'] - size
+    return row
+
+
+def update_bed(df_bed: pd.DataFrame, size: int,
+               resize_from: str) -> pd.DataFrame:
+    """
+    Resize each feature in a bed dataframe.
+
+    :param df_bed: The dataframe corresponding to a bed to resize
+    :param size: The maximum size the feature must have
+    :param resize_from: The coordinate for which we want to resize (default \
+    "start")
+    :return: The dataframe resized
+
+    >>> cdf = pd.DataFrame({"#ref": [1, 1], "start": [10, 50],
+    ... "end": [20, 60], "id": [1, 2], "strand": ["+", "-"]})
+    >>> update_bed(cdf, 5, "start")
+       #ref  start  end  id strand
+    0     1     10   15   1      +
+    1     1     55   60   2      -
+    >>> update_bed(cdf, 5, "end")
+       #ref  start  end  id strand
+    0     1     15   20   1      +
+    1     1     50   55   2      -
+    """
+    list_s = [
+        resize_row_from_start(df_bed.iloc[i, :], size, resize_from)
+        for i in range(df_bed.shape[0])
+    ]
+    return pd.DataFrame(list_s)
+
+
+@lp.parse(bed="file", size="size > 0", resize_from=["start", "end"])
+def bed_resizer(bed: str, size: int, outfile: str,
+                resize_from: str = "start") -> None:
+    """
+    Resize bed features inside a bed file from their start or stop \
+    position (according to their strand).
+
+    :param bed: A bed file with the features to resize
+    :param size: The maximum size the feature must have
+    :param outfile: The output bed name
+    :param resize_from: The coordinate for which we want to resize (default \
+    "start")
+    """
+    df = pd.read_csv(bed, sep="\t")
+    ndf = update_bed(df, size, resize_from)
+    ndf.to_csv(OutputBed.output / outfile, sep="\t", index=False)
+
+
+# Command-line entry point: bed_resizer is called without arguments here,
+# presumably leaving the lazyparser decorator to fill them from the command
+# line (confirm against the lazyparser documentation).
+if __name__ == "__main__":
+    bed_resizer()
\ No newline at end of file