From 7abb300e7639e3e721f605e9ccab1c9e552b920f Mon Sep 17 00:00:00 2001 From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr> Date: Mon, 21 Feb 2022 09:33:32 +0100 Subject: [PATCH] src/visu/figure_maker.py: Add a stat parameter to perform statistical analysis on the metagene figure --- src/visu/figure_maker.py | 115 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 106 insertions(+), 9 deletions(-) diff --git a/src/visu/figure_maker.py b/src/visu/figure_maker.py index f499afb..1dc9183 100644 --- a/src/visu/figure_maker.py +++ b/src/visu/figure_maker.py @@ -18,6 +18,7 @@ from tqdm import tqdm from loguru import logger import matplotlib as mpl import matplotlib.font_manager +from rpy2.robjects import r, pandas2ri def load_bed(bed: Path, bed_name: str) -> List[List[Union[int, str]]]: @@ -259,12 +260,12 @@ def create_df_summary(df_cov: pd.DataFrame, figure_type: str, nb_bin: int, .mean() \ .reset_index() if figure_type == "metagene": + if environment[0] != 0: + df_sum['location'] = df_sum['bin'].apply( + lambda x: f"before_{region_name}" if x < 0 else + f"after_{region_name}" if x >= nb_bin else region_name) df_sum, condition_col = merge_condition_region_col(df_sum) return df_sum, condition_col - if environment[0] != 0: - df_sum['location'] = df_sum['bin'].apply( - lambda x: f"before_{region_name}" if x < 0 else - f"after_{region_name}" if x >= nb_bin else region_name) df_sum.drop('bin', axis=1, inplace=True) if environment[0] != 0: col_merge = ['condition', 'region', 'replicate', 'location'] @@ -295,7 +296,7 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool, border_names: List[str], nb_bin: int, environment: List[int], bed_name: str, output: Path, norm: Union[int, Path], - condition_col: str, ylim: Optional[List[float]]) -> None: + condition_col: str, ylim: Optional[List[float]]) -> Path: """ Create a metagene figure on the region of interest. @@ -313,6 +314,7 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool, each samples :param condition_col: The name of the condition columns :param ylim: The range of the y axis + :return: The name of the figure """ font_files = matplotlib.font_manager.findSystemFonts(fontpaths=None, fontext='ttf') @@ -357,6 +359,7 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool, g.ax.tick_params(left=True, bottom=True) g.savefig(output / outfile_title) g.fig.clf() + return output / outfile_title def figure_barplot(df_sum: pd.DataFrame, show_replicate: bool, @@ -445,6 +448,92 @@ def bin_normalisation(df: pd.DataFrame, norm: Union[int, Path], return df +def compute_diff(df_sum: pd.DataFrame) -> pd.DataFrame: + """ + Compute the average coverage differences between two conditions. + + :param df_sum: A dataframe containing the coverage values between \ + two conditions + :return: The dataframe with a coolumn diff + + >>> di = pd.DataFrame({'bin': [-25] * 6, + ... 'condition': ['Ctrl', 'Ctrl', 'Ctrl', 'siDDX', 'siDDX', 'siDDX'], + ... 'replicate': ['R1', 'R2', 'R3', 'R1', 'R2', 'R3'], + ... 'coverage': [0.1, + ... 0.2, + ... 0.3, + ... 0.15, + ... 0.25, + ... 0.35], + ... 'location': ['before_ddx_down'] * 6}) + >>> compute_diff(di) + bin location Ctrl siDDX diff + 0 -25 before_ddx_down 0.2 0.25 0.05 + """ + cond = df_sum["condition"].unique() + df_sum = df_sum.groupby(["bin", "location", "condition"]).mean() + if len(cond) != 2: + raise IndexError("Only two different conditions must be given to " + "perform stats") + df = df_sum.pivot_table(index=["bin", "location"], + columns="condition", + values="coverage").reset_index() + df.columns.name = None + df.groupby(["bin", "location"]) + df["diff"] = df[cond[1]] - df[cond[0]] + return df + + +def get_shapiro_an_lm_pvalue(df: pd.DataFrame, location: str) -> pd.Series: + """ + Return a series containing the shapiro and the t-test p-value of \ + the difference in coverage between 2 conditions at a particular location + + :param df: A dataframe containing the difference in coverage \ + between 2 conditions at a particular location + :param location: The region of interest + :return: a series containing the shapiro and the t-test p-value of \ + the difference in coverage between 2 conditions at a particular location + """ + df2 = df[df["location"] == location].copy() + df2.to_csv(f"{location}.txt", sep="\t", index=False) + pandas2ri.activate() + stat_s = r(""" + function(data){ + mod <- lm(diff ~ 1, data=data) + s <- summary(mod) + stat <- s$coefficients[1, "Pr(>|t|)"] + shapiro <- shapiro.test(data$diff)$p.value + return(c(shapiro, stat)) + } + """) + res = stat_s(df2) + return pd.Series({"shapiro_p-value": res[0], + "t-test-p-value": res[1], + "location": location}) + + +def compute_stat_dataframe(df_sum: pd.DataFrame, + region_name: str) -> pd.DataFrame: + """ + Create a dataframe containing the statistical analysis of \ + the difference of coverage between two conditions. + + :param df_sum: A dataframe containing the coverage values between \ + two conditions + :param region_name: The name of the region of interest + :return: The dataframe containing statistical analysis + """ + df_diff = compute_diff(df_sum) + if 'location' not in df_diff.columns: + df_diff["location"] = [region_name] * df_diff.shape[0] + list_series = [ + get_shapiro_an_lm_pvalue(df_diff, region) + for region in df_diff["location"].unique() + ] + return pd.DataFrame(list_series) + + def create_figure(design: Path, bw_folder: Path, region_beds: List[Path], bed_names: List[str], nb_bin: int = 100, figure_type: str = 'metagene', @@ -452,7 +541,8 @@ def create_figure(design: Path, bw_folder: Path, region_beds: List[Path], show_replicate: bool = True, environment: List[int] = (0, 0), border_names: List[str] = ('', ''), output: Path = Path('.'), - ylim: List[float] = (None, None)) -> None: + ylim: List[float] = (None, None), stat: bool = False + ) -> None: """ Create A metagene or a barplot figure from bigwig file on regions defined \ in the bed file provided with 'region_bed' parameter. @@ -479,6 +569,8 @@ def create_figure(design: Path, bw_folder: Path, region_beds: List[Path], :param border_names: The name of the borders :param output: Folder where the results will be created :param ylim: The range of y-axis + :param stat: A boolean indicating whether or not to perform \ + statistical analysis """ if len(region_beds) != len(bed_names): raise IndexError("Parameter region_beds and bed_names should " @@ -508,9 +600,14 @@ def create_figure(design: Path, bw_folder: Path, region_beds: List[Path], environment, region_kind, ordered_condition, bed_names) if figure_type == "metagene": - figure_metagene(df_sum, show_replicate, border_names, nb_bin, - environment, region_kind, output, norm, cond_col, - ylim) + outfile = figure_metagene(df_sum, show_replicate, border_names, nb_bin, + environment, region_kind, output, norm, + cond_col, ylim) + if stat: + df_stat = compute_stat_dataframe(df_sum, region_kind) + df_stat.to_csv(outfile.parent / f"{outfile.stem}.txt", sep="\t", + index=False) + else: if 'location' in df_sum.columns: for cur_region in df_sum['location'].unique(): -- GitLab