From 7abb300e7639e3e721f605e9ccab1c9e552b920f Mon Sep 17 00:00:00 2001
From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr>
Date: Mon, 21 Feb 2022 09:33:32 +0100
Subject: [PATCH] src/visu/figure_maker.py: Add a stat parameter to perform
 statistical analysis on the metagene figure

---
 src/visu/figure_maker.py | 115 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 106 insertions(+), 9 deletions(-)

diff --git a/src/visu/figure_maker.py b/src/visu/figure_maker.py
index f499afb..1dc9183 100644
--- a/src/visu/figure_maker.py
+++ b/src/visu/figure_maker.py
@@ -18,6 +18,7 @@ from tqdm import tqdm
 from loguru import logger
 import matplotlib as mpl
 import matplotlib.font_manager
+from rpy2.robjects import r, pandas2ri
 
 
 def load_bed(bed: Path, bed_name: str) -> List[List[Union[int, str]]]:
@@ -259,12 +260,12 @@ def create_df_summary(df_cov: pd.DataFrame, figure_type: str, nb_bin: int,
         .mean() \
         .reset_index()
     if figure_type == "metagene":
+        if environment[0] != 0:
+            df_sum['location'] = df_sum['bin'].apply(
+                lambda x: f"before_{region_name}" if x < 0 else
+                f"after_{region_name}" if x >= nb_bin else region_name)
         df_sum, condition_col = merge_condition_region_col(df_sum)
         return df_sum, condition_col
-    if environment[0] != 0:
-        df_sum['location'] = df_sum['bin'].apply(
-            lambda x: f"before_{region_name}" if x < 0 else
-            f"after_{region_name}" if x >= nb_bin else region_name)
     df_sum.drop('bin', axis=1, inplace=True)
     if environment[0] != 0:
         col_merge = ['condition', 'region', 'replicate', 'location']
@@ -295,7 +296,7 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
                     border_names: List[str], nb_bin: int,
                     environment: List[int], bed_name: str,
                     output: Path, norm: Union[int, Path],
-                    condition_col: str, ylim: Optional[List[float]]) -> None:
+                    condition_col: str, ylim: Optional[List[float]]) -> Path:
     """
     Create a metagene figure on the region of interest.
 
@@ -313,6 +314,7 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
     each samples
     :param condition_col: The name of the condition columns
     :param ylim: The range of the y axis
+    :return: The name of the figure
     """
     font_files = matplotlib.font_manager.findSystemFonts(fontpaths=None,
                                                          fontext='ttf')
@@ -357,6 +359,7 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
     g.ax.tick_params(left=True, bottom=True)
     g.savefig(output / outfile_title)
     g.fig.clf()
+    return output / outfile_title
 
 
 def figure_barplot(df_sum: pd.DataFrame, show_replicate: bool,
@@ -445,6 +448,92 @@ def bin_normalisation(df: pd.DataFrame, norm: Union[int, Path],
     return df
 
 
+def compute_diff(df_sum: pd.DataFrame) -> pd.DataFrame:
+    """
+    Compute the average coverage differences between two conditions.
+
+    :param df_sum: A dataframe containing the coverage values between \
+    two conditions
+    :return: The dataframe with a coolumn diff
+
+    >>> di = pd.DataFrame({'bin': [-25] * 6,
+    ... 'condition': ['Ctrl', 'Ctrl', 'Ctrl', 'siDDX', 'siDDX', 'siDDX'],
+    ... 'replicate': ['R1', 'R2', 'R3', 'R1', 'R2', 'R3'],
+    ... 'coverage': [0.1,
+    ...  0.2,
+    ...  0.3,
+    ...  0.15,
+    ...  0.25,
+    ...  0.35],
+    ... 'location': ['before_ddx_down'] * 6})
+    >>> compute_diff(di)
+       bin         location  Ctrl  siDDX  diff
+    0  -25  before_ddx_down   0.2   0.25  0.05
+    """
+    cond = df_sum["condition"].unique()
+    df_sum = df_sum.groupby(["bin", "location", "condition"]).mean()
+    if len(cond) != 2:
+        raise IndexError("Only two different conditions must be given to "
+                         "perform stats")
+    df = df_sum.pivot_table(index=["bin", "location"],
+                            columns="condition",
+                            values="coverage").reset_index()
+    df.columns.name = None
+    df.groupby(["bin", "location"])
+    df["diff"] = df[cond[1]] - df[cond[0]]
+    return df
+
+
+def get_shapiro_an_lm_pvalue(df: pd.DataFrame, location: str) -> pd.Series:
+    """
+    Return a series containing the shapiro and the t-test p-value of \
+    the difference in coverage between 2 conditions at a particular location
+
+    :param df: A dataframe containing the difference in coverage \
+    between 2 conditions at a particular location
+    :param location: The region of interest
+    :return: a series containing the shapiro and the t-test p-value of \
+    the difference in coverage between 2 conditions at a particular location
+    """
+    df2 = df[df["location"] == location].copy()
+    df2.to_csv(f"{location}.txt", sep="\t", index=False)
+    pandas2ri.activate()
+    stat_s = r("""
+    function(data){
+        mod <- lm(diff ~ 1, data=data)
+        s <- summary(mod)
+        stat <- s$coefficients[1, "Pr(>|t|)"]
+        shapiro <- shapiro.test(data$diff)$p.value
+        return(c(shapiro, stat))
+    }
+    """)
+    res = stat_s(df2)
+    return pd.Series({"shapiro_p-value": res[0],
+                      "t-test-p-value": res[1],
+                      "location": location})
+
+
+def compute_stat_dataframe(df_sum: pd.DataFrame,
+                           region_name: str) -> pd.DataFrame:
+    """
+    Create a dataframe containing the statistical analysis of \
+    the difference of coverage between two conditions.
+
+    :param df_sum: A dataframe containing the coverage values between \
+    two conditions
+    :param region_name: The name of the region of interest
+    :return: The dataframe containing statistical analysis
+    """
+    df_diff = compute_diff(df_sum)
+    if 'location' not in df_diff.columns:
+        df_diff["location"] = [region_name] * df_diff.shape[0]
+    list_series = [
+        get_shapiro_an_lm_pvalue(df_diff, region)
+        for region in df_diff["location"].unique()
+    ]
+    return pd.DataFrame(list_series)
+
+
 def create_figure(design: Path, bw_folder: Path, region_beds: List[Path],
                   bed_names: List[str], nb_bin: int = 100,
                   figure_type: str = 'metagene',
@@ -452,7 +541,8 @@ def create_figure(design: Path, bw_folder: Path, region_beds: List[Path],
                   show_replicate: bool = True, environment: List[int] = (0, 0),
                   border_names: List[str] = ('', ''),
                   output: Path = Path('.'),
-                  ylim: List[float] = (None, None)) -> None:
+                  ylim: List[float] = (None, None), stat: bool = False
+                  ) -> None:
     """
     Create A metagene or a barplot figure from bigwig file on regions defined \
     in the bed file provided with 'region_bed' parameter.
@@ -479,6 +569,8 @@ def create_figure(design: Path, bw_folder: Path, region_beds: List[Path],
     :param border_names: The name of the borders
     :param output: Folder where the results will be created
     :param ylim: The range of y-axis
+    :param stat: A boolean indicating whether or not to perform \
+    statistical analysis
     """
     if len(region_beds) != len(bed_names):
         raise IndexError("Parameter region_beds and bed_names should "
@@ -508,9 +600,14 @@ def create_figure(design: Path, bw_folder: Path, region_beds: List[Path],
                                          environment, region_kind,
                                          ordered_condition, bed_names)
     if figure_type == "metagene":
-        figure_metagene(df_sum, show_replicate, border_names, nb_bin,
-                        environment, region_kind, output, norm, cond_col,
-                        ylim)
+        outfile = figure_metagene(df_sum, show_replicate, border_names, nb_bin,
+                                  environment, region_kind, output, norm,
+                                  cond_col, ylim)
+        if stat:
+            df_stat = compute_stat_dataframe(df_sum, region_kind)
+            df_stat.to_csv(outfile.parent / f"{outfile.stem}.txt", sep="\t",
+            index=False)
+
     else:
         if 'location' in df_sum.columns:
             for cur_region in df_sum['location'].unique():
-- 
GitLab