Skip to content
Snippets Groups Projects
Commit efbd8f90 authored by nfontrod's avatar nfontrod
Browse files

src/find_interaction_cluster/sf_and_communities.py: creation of...

src/find_interaction_cluster/sf_and_communities.py: creation of expand_dataframe, glmm_maker and glmm_statistics to see if the regulation of exons by a splicing factor between communities is not random
parent 114270b4
No related branches found
No related tags found
No related merge requests found
...@@ -24,6 +24,8 @@ from functools import reduce ...@@ -24,6 +24,8 @@ from functools import reduce
from operator import add from operator import add
import statsmodels.formula.api as api import statsmodels.formula.api as api
from statsmodels.stats.multitest import multipletests from statsmodels.stats.multitest import multipletests
from rpy2.robjects import r, pandas2ri
from pathlib import Path
def get_exon_regulated_in_communities(community: List[str], def get_exon_regulated_in_communities(community: List[str],
...@@ -127,9 +129,125 @@ def logistic_regression(n1: int, t1: int, n2: int, t2: int) -> float: ...@@ -127,9 +129,125 @@ def logistic_regression(n1: int, t1: int, n2: int, t2: int) -> float:
return api.logit('y ~ x', data=df).fit(disp=False).pvalues['x[T.test]'] return api.logit('y ~ x', data=df).fit(disp=False).pvalues['x[T.test]']
def expand_dataframe(df: pd.DataFrame) -> pd.DataFrame:
"""
Expend the dataframe df to be able to make a general mixed model on it.
:param df: The dataframe to expand
:return: The dataframe expanded
>>> d = pd.DataFrame({"community": ["C0", "C1"],
... "reg_in_community": [2, 3],
... "community_size": [5, 7],
... "%reg in community": [40, 42.85], 'pval': [1, 0.5], 'padj': [1, 1]})
>>> expand_dataframe(d)
is_reg community reg_in_community community_size
0 1 C0 2 5
1 1 C0 2 5
2 0 C0 2 5
3 0 C0 2 5
4 0 C0 2 5
5 1 C1 3 7
6 1 C1 3 7
7 1 C1 3 7
8 0 C1 3 7
9 0 C1 3 7
10 0 C1 3 7
11 0 C1 3 7
"""
content = []
for row in df.values:
row = list(row)
values = [1] * row[1] + [0] * (row[2] - row[1])
for v in values:
content.append([v] + row)
df = pd.DataFrame(content, columns=["is_reg"] + list(df.columns))
df.drop(["%reg in community", "pval", "padj"], axis=1, inplace=True)
return df
def glmm_maker(expanded_df: pd.DataFrame, outfile: Path) -> float:
"""
Make the glmm analysis to see if the exon regulated by a splicing factor \
are equally distributed among the communities.
:param expanded_df: The expanded dataframe
:param outfile: A name of a file
:return: the pvalue of glmm
>>> d = pd.DataFrame({"community": ["C0", "C1"],
... "reg_in_community": [2, 3],
... "community_size": [5, 7],
... "%reg in community": [40, 42.85], 'pval': [1, 0.5], 'padj': [1, 1]})
>>> e_df = expand_dataframe(d)
>>> outfile = ConfigGraph.get_community_file("Test", 1, 1, True,
... "_stat.txt", True)
>>> glmm_maker(e_df, outfile)
1.0
"""
pandas2ri.activate()
glmm = r(
"""
require("lme4")
require("DHARMa")
function(data, folder, partial_name) {
null_mod <- glm(is_reg ~ log(community_size) , family=binomial, data=data)
mod <- glmer(is_reg ~ log(community_size) + (1 | community), data=data, family=binomial, nAGQ=0)
simulationOutput <- simulateResiduals(fittedModel = mod, n = 250)
png(paste0(folder, "/dignostics_", partial_name, ".png"))
plot(simulationOutput)
dev.off()
return(anova(mod, null_mod, test="Chisq"))
}
""")
folder = outfile.parent / "diagnostics"
folder.mkdir(parents=True, exist_ok=True)
partial_name = outfile.name.replace('.txt', '')
return glmm(expanded_df, str(folder), partial_name)["Pr(>Chisq)"][1]
def glmm_statistics(df: pd.DataFrame, sf_name: str, reg: str,
project: str, weight: int, global_weight: int,
same_gene: bool) -> pd.Series:
"""
Create the glmm statistics for a given splicing factor with \
given communities.
:param df: dataframe contaning stats data.
:param sf_name: The splicing factor of interest
:param reg: The regulation of interest
:param project: The name of the project of interest
:param weight: The minimum weight of interaction to consider
:param global_weight: The global weight to consider. if \
the global weight is equal to 0 then then density figure are calculated \
by project, else all projet are merge together and the interaction \
seen in `global_weight` project are taken into account
:param same_gene: Say if we consider as co-localised, exons within the \
same gene (True) or not (False) (default False)
:return: The glmm pvalue among with other informations
"""
ndf = df.loc[-df['community'].isin(["All-community", "FASTERDB"]),
:].copy()
expanded_df = expand_dataframe(ndf)
outfile = ConfigGraph.get_community_file(project, weight, global_weight,
same_gene,
f"{sf_name}_{reg}_stat.txt", True)
noutfold = outfile.parent / "expanded_df"
noutfold.mkdir(exist_ok=True, parents=True)
noutfile = noutfold / outfile.name
expanded_df.to_csv(noutfile, sep="\t", index=False)
pval = glmm_maker(expanded_df, outfile)
return pd.Series({"project": project, "sf": sf_name, "reg": reg,
"pval": pval})
def get_stat4communities(sf_name: str, reg: str, def get_stat4communities(sf_name: str, reg: str,
project: str, weight: int, project: str, weight: int,
global_weight: int, same_gene: bool) -> pd.DataFrame: global_weight: int, same_gene: bool
) -> Tuple[pd.DataFrame, pd.Series]:
""" """
Get data (proportion of `reg` regulated exons by a splicing factor Get data (proportion of `reg` regulated exons by a splicing factor
`sf_name` within a community) for every community. `sf_name` within a community) for every community.
...@@ -172,7 +290,10 @@ def get_stat4communities(sf_name: str, reg: str, ...@@ -172,7 +290,10 @@ def get_stat4communities(sf_name: str, reg: str,
d['sf'] = [sf_name] * len(d["community"]) d['sf'] = [sf_name] * len(d["community"])
d['reg'] = [reg] * len(d["community"]) d['reg'] = [reg] * len(d["community"])
d['project'] = [project] * len(d["community"]) d['project'] = [project] * len(d["community"])
return pd.DataFrame(d) df = pd.DataFrame(d)
s = glmm_statistics(df, sf_name, reg, project, weight, global_weight,
same_gene)
return df, s
def get_sfname() -> List[str]: def get_sfname() -> List[str]:
...@@ -240,19 +361,27 @@ def multiple_stat_launcher(ps: int, weights: List[int], ...@@ -240,19 +361,27 @@ def multiple_stat_launcher(ps: int, weights: List[int],
processes[ckey] = [pool.apply_async(get_stat4communities, args)] processes[ckey] = [pool.apply_async(get_stat4communities, args)]
else: else:
processes[ckey].append(pool.apply_async(get_stat4communities, args)) processes[ckey].append(pool.apply_async(get_stat4communities, args))
for p in processes: for p, value in processes.items():
project, weight = p.split("_") project, weight = p.split("_")
list_df = [] list_tuples = [proc.get(timeout=None) for proc in value]
for proc in processes[p]:
list_df.append(proc.get(timeout=None))
pool.close() pool.close()
pool.join() pool.join()
list_df = [t[0] for t in list_tuples]
list_series = [t[1] for t in list_tuples]
df = pd.concat(list_df, axis=0, ignore_index=True) df = pd.concat(list_df, axis=0, ignore_index=True)
outfile = ConfigGraph.get_community_file(project, weight, outfile = ConfigGraph.get_community_file(project, weight,
dic_project[project], dic_project[project],
same_gene, same_gene,
"_stat.txt", True) "_stat.txt", True)
df.to_csv(outfile, sep="\t", index=False) df.to_csv(outfile, sep="\t", index=False)
glm_df = pd.DataFrame(list_series)
glm_df["padj"] = multipletests(glm_df['pval'].values,
method='fdr_bh')[1]
outfile = ConfigGraph.get_community_file(project, weight,
dic_project[project],
same_gene,
"_glmm_stat.txt", True)
glm_df.to_csv(outfile, sep="\t", index=False)
if __name__ == "__main__": if __name__ == "__main__":
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please to comment