From efbd8f90ddbbed6982c7f3e2d6b41275c3ceed8f Mon Sep 17 00:00:00 2001 From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr> Date: Fri, 7 Aug 2020 11:45:49 +0200 Subject: [PATCH] src/find_interaction_cluster/sf_and_communities.py: creation of expand_dataframe, glmm_maker and glmm_statistics to see if the regulation of exons by a splicing factor between communities is not random --- .../sf_and_communities.py | 141 +++++++++++++++++- 1 file changed, 135 insertions(+), 6 deletions(-) diff --git a/src/find_interaction_cluster/sf_and_communities.py b/src/find_interaction_cluster/sf_and_communities.py index 92209a18..724406d3 100644 --- a/src/find_interaction_cluster/sf_and_communities.py +++ b/src/find_interaction_cluster/sf_and_communities.py @@ -24,6 +24,8 @@ from functools import reduce from operator import add import statsmodels.formula.api as api from statsmodels.stats.multitest import multipletests +from rpy2.robjects import r, pandas2ri +from pathlib import Path def get_exon_regulated_in_communities(community: List[str], @@ -127,9 +129,125 @@ def logistic_regression(n1: int, t1: int, n2: int, t2: int) -> float: return api.logit('y ~ x', data=df).fit(disp=False).pvalues['x[T.test]'] +def expand_dataframe(df: pd.DataFrame) -> pd.DataFrame: + """ + Expend the dataframe df to be able to make a general mixed model on it. + + :param df: The dataframe to expand + :return: The dataframe expanded + + >>> d = pd.DataFrame({"community": ["C0", "C1"], + ... "reg_in_community": [2, 3], + ... "community_size": [5, 7], + ... "%reg in community": [40, 42.85], 'pval': [1, 0.5], 'padj': [1, 1]}) + >>> expand_dataframe(d) + is_reg community reg_in_community community_size + 0 1 C0 2 5 + 1 1 C0 2 5 + 2 0 C0 2 5 + 3 0 C0 2 5 + 4 0 C0 2 5 + 5 1 C1 3 7 + 6 1 C1 3 7 + 7 1 C1 3 7 + 8 0 C1 3 7 + 9 0 C1 3 7 + 10 0 C1 3 7 + 11 0 C1 3 7 + + """ + content = [] + for row in df.values: + row = list(row) + values = [1] * row[1] + [0] * (row[2] - row[1]) + for v in values: + content.append([v] + row) + df = pd.DataFrame(content, columns=["is_reg"] + list(df.columns)) + df.drop(["%reg in community", "pval", "padj"], axis=1, inplace=True) + return df + + +def glmm_maker(expanded_df: pd.DataFrame, outfile: Path) -> float: + """ + Make the glmm analysis to see if the exon regulated by a splicing factor \ + are equally distributed among the communities. + + :param expanded_df: The expanded dataframe + :param outfile: A name of a file + :return: the pvalue of glmm + + >>> d = pd.DataFrame({"community": ["C0", "C1"], + ... "reg_in_community": [2, 3], + ... "community_size": [5, 7], + ... "%reg in community": [40, 42.85], 'pval': [1, 0.5], 'padj': [1, 1]}) + >>> e_df = expand_dataframe(d) + >>> outfile = ConfigGraph.get_community_file("Test", 1, 1, True, + ... "_stat.txt", True) + >>> glmm_maker(e_df, outfile) + 1.0 + """ + pandas2ri.activate() + glmm = r( + """ + require("lme4") + require("DHARMa") + + function(data, folder, partial_name) { + null_mod <- glm(is_reg ~ log(community_size) , family=binomial, data=data) + mod <- glmer(is_reg ~ log(community_size) + (1 | community), data=data, family=binomial, nAGQ=0) + simulationOutput <- simulateResiduals(fittedModel = mod, n = 250) + png(paste0(folder, "/dignostics_", partial_name, ".png")) + plot(simulationOutput) + dev.off() + return(anova(mod, null_mod, test="Chisq")) + } + + """) + folder = outfile.parent / "diagnostics" + folder.mkdir(parents=True, exist_ok=True) + partial_name = outfile.name.replace('.txt', '') + return glmm(expanded_df, str(folder), partial_name)["Pr(>Chisq)"][1] + + +def glmm_statistics(df: pd.DataFrame, sf_name: str, reg: str, + project: str, weight: int, global_weight: int, + same_gene: bool) -> pd.Series: + """ + Create the glmm statistics for a given splicing factor with \ + given communities. + + :param df: dataframe contaning stats data. + :param sf_name: The splicing factor of interest + :param reg: The regulation of interest + :param project: The name of the project of interest + :param weight: The minimum weight of interaction to consider + :param global_weight: The global weight to consider. if \ + the global weight is equal to 0 then then density figure are calculated \ + by project, else all projet are merge together and the interaction \ + seen in `global_weight` project are taken into account + :param same_gene: Say if we consider as co-localised, exons within the \ + same gene (True) or not (False) (default False) + :return: The glmm pvalue among with other informations + """ + ndf = df.loc[-df['community'].isin(["All-community", "FASTERDB"]), + :].copy() + expanded_df = expand_dataframe(ndf) + outfile = ConfigGraph.get_community_file(project, weight, global_weight, + same_gene, + f"{sf_name}_{reg}_stat.txt", True) + noutfold = outfile.parent / "expanded_df" + noutfold.mkdir(exist_ok=True, parents=True) + noutfile = noutfold / outfile.name + expanded_df.to_csv(noutfile, sep="\t", index=False) + pval = glmm_maker(expanded_df, outfile) + return pd.Series({"project": project, "sf": sf_name, "reg": reg, + "pval": pval}) + + def get_stat4communities(sf_name: str, reg: str, project: str, weight: int, - global_weight: int, same_gene: bool) -> pd.DataFrame: + global_weight: int, same_gene: bool + ) -> Tuple[pd.DataFrame, pd.Series]: """ Get data (proportion of `reg` regulated exons by a splicing factor `sf_name` within a community) for every community. @@ -172,7 +290,10 @@ def get_stat4communities(sf_name: str, reg: str, d['sf'] = [sf_name] * len(d["community"]) d['reg'] = [reg] * len(d["community"]) d['project'] = [project] * len(d["community"]) - return pd.DataFrame(d) + df = pd.DataFrame(d) + s = glmm_statistics(df, sf_name, reg, project, weight, global_weight, + same_gene) + return df, s def get_sfname() -> List[str]: @@ -240,19 +361,27 @@ def multiple_stat_launcher(ps: int, weights: List[int], processes[ckey] = [pool.apply_async(get_stat4communities, args)] else: processes[ckey].append(pool.apply_async(get_stat4communities, args)) - for p in processes: + for p, value in processes.items(): project, weight = p.split("_") - list_df = [] - for proc in processes[p]: - list_df.append(proc.get(timeout=None)) + list_tuples = [proc.get(timeout=None) for proc in value] pool.close() pool.join() + list_df = [t[0] for t in list_tuples] + list_series = [t[1] for t in list_tuples] df = pd.concat(list_df, axis=0, ignore_index=True) outfile = ConfigGraph.get_community_file(project, weight, dic_project[project], same_gene, "_stat.txt", True) df.to_csv(outfile, sep="\t", index=False) + glm_df = pd.DataFrame(list_series) + glm_df["padj"] = multipletests(glm_df['pval'].values, + method='fdr_bh')[1] + outfile = ConfigGraph.get_community_file(project, weight, + dic_project[project], + same_gene, + "_glmm_stat.txt", True) + glm_df.to_csv(outfile, sep="\t", index=False) if __name__ == "__main__": -- GitLab