src/visu/figure_maker.py: add line at bins different between two conditions

d742be18 · nfontrod · 56afa7be · d742be18
Commit d742be18 authored 3 years ago by nfontrod
--- a/src/visu/figure_maker.py
+++ b/src/visu/figure_maker.py
@@ -18,7 +18,9 @@ from tqdm import tqdm
 from loguru import logger
 import matplotlib as mpl
 import matplotlib.font_manager
-from rpy2.robjects import r, pandas2ri
+from rpy2.robjects import r, pandas2ri, FloatVector
+from matplotlib import collections as mc
+import numpy as np


 def load_bed(bed: Path, bed_name: str) -> List[List[Union[int, str]]]:
@@ -292,11 +294,118 @@ def create_df_summary(df_cov: pd.DataFrame, figure_type: str, nb_bin: int,
    return df_sum, condition_col


+def line_maker(list_pval: List[float], up_value: float, position: int = 0,
+               threshold: float = 0.05
+               ) -> Tuple[List[List[Tuple[float, float]]], List[str]]:
+    """
+    Create the horizontal segments flagging the significant p-values.
+
+    For every index i such that list_pval[i] <= threshold, a segment of
+    length 1 centred on the abscissa i + position is created at the
+    ordinate up_value. Every segment is coloured in black.
+
+    :param list_pval: (list of float), the p-values, one per position
+    :param up_value: (float) the ordinate at which the segments are placed
+    :param position: (int) the abscissa of the first p-value of list_pval
+    :param threshold: (float) a segment is created when the p-value is
+    lower than or equal to this value
+    :return: lines - (list of list of 2 tuple), each list of 2 tuple is a
+    segment with the coordinates [(x1, y1), (x2, y2)]; lcolor - (list of
+    str), the colour of each segment, in the same order as lines
+    >>> line_maker([0.01, 0.5], 2, 10)
+    ([[(9.5, 2), (10.5, 2)]], ['#000000'])
+    """
+    lines = [
+        [(i + position - 0.5, up_value), (i + position + 0.5, up_value)]
+        for i, pval in enumerate(list_pval)
+        if pval <= threshold
+    ]
+    lcolor = ["#000000"] * len(lines)  # black
+    return lines, lcolor
+
+
+def paired_t_test(values1: List[float], values2: List[float]) -> float:
+    """
+    Compute the p-value of a paired t-test between two series of values.
+
+    The test is delegated to the R function t.test (paired=T).
+
+    :param values1: A list of values
+    :param values2: Another list of values, with the same length as values1
+    :return: The p-value of the paired t-test
+    :raises IndexError: If values1 and values2 don't have the same length
+    >>> paired_t_test([1, 2, 8], [5, 8, 15])
+    0.02337551764357566
+    """
+    n_first, n_second = len(values1), len(values2)
+    if n_first != n_second:
+        raise IndexError("values1 and values2 should have the same length")
+    r_source = """
+    function(values1, values2) {
+        return(t.test(values1, values2, paired=T)$p.value)
+    }
+    """
+    r_paired_test = r(r_source)
+    first, second = FloatVector(values1), FloatVector(values2)
+    # The R function returns a vector of length 1 holding the p-value.
+    p_value = r_paired_test(first, second)
+    return p_value[0]
+
+
+def compute_stats(dff: pd.DataFrame, y_line: float, group_col: str,
+                  outfile: Path) -> Tuple[List[List[Tuple]], List]:
+    """
+    Compare, bin by bin, the coverage of two groups with a paired t-test.
+
+    For each bin, the p-value, the mean coverage of each group and the \
+    coverage values of each group (rounded to 3 decimals and joined \
+    with ';') are written in outfile as a tab-separated table.
+
+    :param dff: A dataframe containing the coverage displayed in the \
+    figure. The columns 'bin', 'coverage' and group_col are used.
+    :param y_line: The height of the p-value line
+    :param group_col: A group column
+    :param outfile: The file to save the stats
+    :return: A list of lines coordinates in the form of [(x1, y1), (x2, y2)], \
+    and the color associated to each line
+    :raises NotImplementedError: If group_col doesn't contain exactly \
+    2 groups
+    """
+    df = dff.sort_values(["bin", group_col], ascending=True)
+    groups = df[group_col].unique()
+    if len(groups) != 2:
+        raise NotImplementedError("Statistical analysis only implemented for "
+                                  "2 groups of data")
+    # Computed once; the dataframe is sorted, so bins are in ascending order.
+    bins = df["bin"].unique()
+    p_values_ttp = []
+    grp1 = []
+    vgrp1 = []
+    grp2 = []
+    vgrp2 = []
+    for bin_id in bins:
+        tmp = df[df["bin"] == bin_id]
+        values1 = tmp.loc[tmp[group_col] == groups[0], "coverage"].values
+        values2 = tmp.loc[tmp[group_col] == groups[1], "coverage"].values
+        p_values_ttp.append(paired_t_test(values1, values2))
+        grp1.append(np.mean(values1))
+        grp2.append(np.mean(values2))
+        vgrp1.append(";".join(str(round(v, 3)) for v in values1))
+        vgrp2.append(";".join(str(round(v, 3)) for v in values2))
+    stats_df = pd.DataFrame({
+        "bin": bins,
+        "p_values": p_values_ttp,
+        groups[0]: grp1,
+        groups[1]: grp2,
+        f"val-{groups[0]}": vgrp1,
+        f"val-{groups[1]}": vgrp2,
+    })
+    stats_df.to_csv(outfile, sep="\t", index=False)
+    # NOTE(review): the i-th p-value is drawn at min(bins) + i, which matches
+    # the bin value only if bins are consecutive integers - confirm.
+    return line_maker(p_values_ttp, y_line, min(bins))
+
+
+def display_stat(g: sns.FacetGrid, dff: pd.DataFrame, y_line: float,
+                 group_col: str, stat: bool, outfile: Path) -> sns.FacetGrid:
+    """
+    Draw the statistical analysis lines on the figure, if requested.
+
+    :param g: A seaborn FacetGrid objects corresponding to the metagene \
+    figure
+    :param dff: A dataframe containing the coverage displayed in the figure
+    :param y_line: The height of the p-value line
+    :param group_col: The column containing the groups analyzed
+    :param stat: If False, g is returned as is and nothing is computed
+    :param outfile: The file where the metagene will be saved. The stats \
+    are saved in the same folder, in a file with the same stem and \
+    the '.txt' extension
+    :return: The facetGrid with the stats
+    """
+    if stat:
+        table_path = outfile.parent / f"{outfile.stem}.txt"
+        segments, segment_colors = compute_stats(dff, y_line, group_col,
+                                                 table_path)
+        g.ax.add_collection(
+            mc.LineCollection(segments, colors=segment_colors, linewidths=5)
+        )
+    return g
+
+
 def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
                    border_names: List[str], nb_bin: int,
                    environment: List[int], bed_name: str,
                    output: Path, norm: Union[int, Path],
-                    condition_col: str, ylim: Optional[List[float]]) -> Path:
+                    condition_col: str, ylim: Optional[List[float]],
+                    stat: bool = False) -> Path:
    """
    Create a metagene figure on the region of interest.

@@ -314,6 +423,7 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
    each samples
    :param condition_col: The name of the condition columns
    :param ylim: The range of the y axis
+    :param stat: A boolean indicating wether to perform statistical analysis
    :return: The name of the figure
    """
    font_files = matplotlib.font_manager.findSystemFonts(fontpaths=None,
@@ -356,6 +466,9 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
    outfile_title += ".pdf"
    if ylim[0] is not None:
        g.ax.set_ylim(ylim[0], ylim[1])
+    ymin, ymax = g.ax.get_ylim()
+    g = display_stat(g, df_sum, ymin + ((ymax - ymin) / 50), condition_col,
+                     stat, output / outfile_title)
    g.ax.tick_params(left=True, bottom=True)
    g.savefig(output / outfile_title)
    g.fig.clf()
@@ -513,27 +626,6 @@ def get_shapiro_an_lm_pvalue(df: pd.DataFrame, location: str) -> pd.Series:
                      "location": location})


-def compute_stat_dataframe(df_sum: pd.DataFrame,
-                           region_name: str) -> pd.DataFrame:
-    """
-    Create a dataframe containing the statistical analysis of \
-    the difference of coverage between two conditions.
-
-    :param df_sum: A dataframe containing the coverage values between \
-    two conditions
-    :param region_name: The name of the region of interest
-    :return: The dataframe containing statistical analysis
-    """
-    df_diff = compute_diff(df_sum)
-    if 'location' not in df_diff.columns:
-        df_diff["location"] = [region_name] * df_diff.shape[0]
-    list_series = [
-        get_shapiro_an_lm_pvalue(df_diff, region)
-        for region in df_diff["location"].unique()
-    ]
-    return pd.DataFrame(list_series)
-
-
 def create_figure(design: Path, bw_folder: Path, region_beds: List[Path],
                  bed_names: List[str], nb_bin: int = 100,
                  figure_type: str = 'metagene',
@@ -600,13 +692,9 @@ def create_figure(design: Path, bw_folder: Path, region_beds: List[Path],
                                         environment, region_kind,
                                         ordered_condition, bed_names)
    if figure_type == "metagene":
-        outfile = figure_metagene(df_sum, show_replicate, border_names, nb_bin,
-                                  environment, region_kind, output, norm,
-                                  cond_col, ylim)
-        if stat:
-            df_stat = compute_stat_dataframe(df_sum, region_kind)
-            df_stat.to_csv(outfile.parent / f"{outfile.stem}.txt", sep="\t",
-            index=False)
+        figure_metagene(df_sum, show_replicate, border_names, nb_bin,
+                        environment, region_kind, output, norm,
+                        cond_col, ylim, stat)

    else:
        if 'location' in df_sum.columns: