diff --git a/src/visu/figure_maker.py b/src/visu/figure_maker.py index 1dc9183cca72990216062fd190f895fe91fd28e7..965aaba06e8141b47cf66fceffdf2d837a2dde4b 100644 --- a/src/visu/figure_maker.py +++ b/src/visu/figure_maker.py @@ -18,7 +18,9 @@ from tqdm import tqdm from loguru import logger import matplotlib as mpl import matplotlib.font_manager -from rpy2.robjects import r, pandas2ri +from rpy2.robjects import r, pandas2ri, FloatVector +from matplotlib import collections as mc +import numpy as np def load_bed(bed: Path, bed_name: str) -> List[List[Union[int, str]]]: @@ -292,11 +294,118 @@ def create_df_summary(df_cov: pd.DataFrame, figure_type: str, nb_bin: int, return df_sum, condition_col +def line_maker(list_pval, up_value, position=0): + """ + Create a list of lines, with their colour. + + Create a line if the a p-value in list_pval[i] is below 0.05. + If up_mean[i] > down_mean[i] the line will be green, purple else. + :param list_pval: (list of float), list of pvalue get by the comparison + of a propensity scale in a particular sequence position in an + up-regulated and down_regulated set of sequences. + :param up_value: (int) the ordinate coordinates where the line will be + placed. + :param position: (int) the abscissa position to begin to draw the lines + :return: lines - (list of list of 2 tuple), the list of 2 tuple corresponds + to a lines with the coordinates [(x1, y1), (x2, y2)] + """ + lcolor = [] + lines = [] + for i in range(len(list_pval)): + if list_pval[i] <= 0.05: + val = i + position + lines.append([(val - 0.5, up_value), (val + 0.5, up_value)]) + lcolor.append("#000000") # red + return lines, lcolor + + +def paired_t_test(values1: List[float], values2: List[float]) -> float: + """ + Get the p-value of a paired t-test for each bin + + :param values1: A list of values + :param values2: Another list of values + :return: The p-values of the paired t-test + >>> paired_t_test([1, 2, 8], [5, 8, 15]) + 0.02337551764357566 + """ + if len(values1) != len(values2): + raise IndexError("values1 and values2 should have the same length") + ttp = r(""" + function(values1, values2) { + return(t.test(values1, values2, paired=T)$p.value) + } + """) + return ttp(FloatVector(values1), FloatVector(values2))[0] + + +def compute_stats(dff: pd.DataFrame, y_line: float, group_col: str, + outfile: Path) -> Tuple[List[List[Tuple]], List]: + """ + + :param dff: A dataframe containing the coverage displayed in the figure + :param y_line: The height of the p-value line + :param group_col: A group column + :param outfile: The file to save the stats + :return: A list of lines coordinates in the form of [(x1, y1), (x2, y2)], \ + and the color associated to each line + """ + df = dff.sort_values(["bin", group_col], ascending=True) + groups = df[group_col].unique() + if len(groups) != 2: + raise NotImplementedError("Statistical analysis only implemented for " + "2 groups of data") + p_values_ttp = [] + grp1 = [] + vgrp1 = [] + grp2 = [] + vgrp2 = [] + for bin in df["bin"].unique(): + tmp = df[df["bin"] == bin] + values1 = tmp.loc[tmp[group_col] == groups[0], "coverage"].values + values2 = tmp.loc[tmp[group_col] == groups[1], "coverage"].values + p_values_ttp.append(paired_t_test(values1, values2)) + grp1.append(np.mean(values1)) + grp2.append(np.mean(values2)) + vgrp1.append(";".join(map(str, [round(v, 3) for v in values1]))) + vgrp2.append(";".join(map(str, [round(v, 3) for v in values2]))) + stats_df = pd.DataFrame({"bin": df["bin"].unique(), + "p_values": p_values_ttp, + groups[0]: grp1, groups[1]: grp2, + f"val-{groups[0]}": vgrp1, + f"val-{groups[1]}": vgrp2,}) + stats_df.to_csv(outfile, sep="\t", index=False) + return line_maker(p_values_ttp, y_line, min(df["bin"].unique())) + + +def display_stat(g: sns.FacetGrid, dff: pd.DataFrame, y_line: float, + group_col: str, stat: bool, outfile: Path) -> sns.FacetGrid: + """ + Update the graphics by displaying stats. + + :param g: A seaborn FacetGrid objects corresponding to the metagene \ + figure + :param dff: A dataframe containing the coverage displayed in the figure + :param y_line: The height of the p-value line + :param group_col: The column containing the groups analyzed + :param outfile: The file where the metagene will be saved + :return: The facetGrid with the stats + """ + if not stat: + return g + stat_file = outfile.parent / f"{outfile.stem}.txt" + lines, lcolor = compute_stats(dff, y_line, group_col, stat_file) + lc = mc.LineCollection(lines, colors=lcolor, linewidths=5) + g.ax.add_collection(lc) + return g + + def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool, border_names: List[str], nb_bin: int, environment: List[int], bed_name: str, output: Path, norm: Union[int, Path], - condition_col: str, ylim: Optional[List[float]]) -> Path: + condition_col: str, ylim: Optional[List[float]], + stat: bool = False) -> Path: """ Create a metagene figure on the region of interest. @@ -314,6 +423,7 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool, each samples :param condition_col: The name of the condition columns :param ylim: The range of the y axis + :param stat: A boolean indicating wether to perform statistical analysis :return: The name of the figure """ font_files = matplotlib.font_manager.findSystemFonts(fontpaths=None, @@ -356,6 +466,9 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool, outfile_title += ".pdf" if ylim[0] is not None: g.ax.set_ylim(ylim[0], ylim[1]) + ymin, ymax = g.ax.get_ylim() + g = display_stat(g, df_sum, ymin + ((ymax - ymin) / 50), condition_col, + stat, output / outfile_title) g.ax.tick_params(left=True, bottom=True) g.savefig(output / outfile_title) g.fig.clf() @@ -513,27 +626,6 @@ def get_shapiro_an_lm_pvalue(df: pd.DataFrame, location: str) -> pd.Series: "location": location}) -def compute_stat_dataframe(df_sum: pd.DataFrame, - region_name: str) -> pd.DataFrame: - """ - Create a dataframe containing the statistical analysis of \ - the difference of coverage between two conditions. - - :param df_sum: A dataframe containing the coverage values between \ - two conditions - :param region_name: The name of the region of interest - :return: The dataframe containing statistical analysis - """ - df_diff = compute_diff(df_sum) - if 'location' not in df_diff.columns: - df_diff["location"] = [region_name] * df_diff.shape[0] - list_series = [ - get_shapiro_an_lm_pvalue(df_diff, region) - for region in df_diff["location"].unique() - ] - return pd.DataFrame(list_series) - - def create_figure(design: Path, bw_folder: Path, region_beds: List[Path], bed_names: List[str], nb_bin: int = 100, figure_type: str = 'metagene', @@ -600,13 +692,9 @@ def create_figure(design: Path, bw_folder: Path, region_beds: List[Path], environment, region_kind, ordered_condition, bed_names) if figure_type == "metagene": - outfile = figure_metagene(df_sum, show_replicate, border_names, nb_bin, - environment, region_kind, output, norm, - cond_col, ylim) - if stat: - df_stat = compute_stat_dataframe(df_sum, region_kind) - df_stat.to_csv(outfile.parent / f"{outfile.stem}.txt", sep="\t", - index=False) + figure_metagene(df_sum, show_replicate, border_names, nb_bin, + environment, region_kind, output, norm, + cond_col, ylim, stat) else: if 'location' in df_sum.columns: