Commit efbd8f90 authored by nfontrod

src/find_interaction_cluster/sf_and_communities.py: creation of...

src/find_interaction_cluster/sf_and_communities.py: creation of expand_dataframe, glmm_maker and glmm_statistics to see if the regulation of exons by a splicing factor between communities is not random
parent 114270b4
@@ -24,6 +24,8 @@ from functools import reduce
from operator import add
import statsmodels.formula.api as api
from statsmodels.stats.multitest import multipletests
from rpy2.robjects import r, pandas2ri
from pathlib import Path
def get_exon_regulated_in_communities(community: List[str],
@@ -127,9 +129,125 @@ def logistic_regression(n1: int, t1: int, n2: int, t2: int) -> float:
return api.logit('y ~ x', data=df).fit(disp=False).pvalues['x[T.test]']
def expand_dataframe(df: pd.DataFrame) -> pd.DataFrame:
"""
Expand the dataframe df to be able to fit a generalized linear mixed model on it.
:param df: The dataframe to expand
:return: The expanded dataframe
>>> d = pd.DataFrame({"community": ["C0", "C1"],
... "reg_in_community": [2, 3],
... "community_size": [5, 7],
... "%reg in community": [40, 42.85], 'pval': [1, 0.5], 'padj': [1, 1]})
>>> expand_dataframe(d)
is_reg community reg_in_community community_size
0 1 C0 2 5
1 1 C0 2 5
2 0 C0 2 5
3 0 C0 2 5
4 0 C0 2 5
5 1 C1 3 7
6 1 C1 3 7
7 1 C1 3 7
8 0 C1 3 7
9 0 C1 3 7
10 0 C1 3 7
11 0 C1 3 7
"""
content = []
for row in df.values:
row = list(row)
values = [1] * row[1] + [0] * (row[2] - row[1])
for v in values:
content.append([v] + row)
df = pd.DataFrame(content, columns=["is_reg"] + list(df.columns))
df.drop(["%reg in community", "pval", "padj"], axis=1, inplace=True)
return df
def glmm_maker(expanded_df: pd.DataFrame, outfile: Path) -> float:
"""
Run the GLMM analysis to see if the exons regulated by a splicing factor \
are equally distributed among the communities.
:param expanded_df: The expanded dataframe
:param outfile: The name of the output file
:return: The p-value of the GLMM
>>> d = pd.DataFrame({"community": ["C0", "C1"],
... "reg_in_community": [2, 3],
... "community_size": [5, 7],
... "%reg in community": [40, 42.85], 'pval': [1, 0.5], 'padj': [1, 1]})
>>> e_df = expand_dataframe(d)
>>> outfile = ConfigGraph.get_community_file("Test", 1, 1, True,
... "_stat.txt", True)
>>> glmm_maker(e_df, outfile)
1.0
"""
pandas2ri.activate()
glmm = r(
"""
require("lme4")
require("DHARMa")
function(data, folder, partial_name) {
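# Null model: fixed-effect logistic regression on log(community size) only;
# the mixed model adds a random intercept per community. anova() below
# compares the two fits with a likelihood-ratio (Chisq) test, and DHARMa
# simulated residuals are plotted as a model diagnostic.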
null_mod <- glm(is_reg ~ log(community_size), family=binomial, data=data)
mod <- glmer(is_reg ~ log(community_size) + (1 | community), data=data, family=binomial, nAGQ=0)
simulationOutput <- simulateResiduals(fittedModel = mod, n = 250)
png(paste0(folder, "/diagnostics_", partial_name, ".png"))
plot(simulationOutput)
dev.off()
return(anova(mod, null_mod, test="Chisq"))
}
""")
folder = outfile.parent / "diagnostics"
folder.mkdir(parents=True, exist_ok=True)
partial_name = outfile.name.replace('.txt', '')
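# Return the likelihood-ratio test p-value from the anova table (second row)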
return glmm(expanded_df, str(folder), partial_name)["Pr(>Chisq)"][1]
def glmm_statistics(df: pd.DataFrame, sf_name: str, reg: str,
project: str, weight: int, global_weight: int,
same_gene: bool) -> pd.Series:
"""
Compute the GLMM statistics for a given splicing factor with \
the given communities.
:param df: The dataframe containing the stats data.
:param sf_name: The splicing factor of interest
:param reg: The regulation of interest
:param project: The name of the project of interest
:param weight: The minimum weight of interaction to consider
:param global_weight: The global weight to consider. If \
the global weight is equal to 0 then the density figures are computed \
by project, else all projects are merged together and only the interactions \
seen in `global_weight` projects are taken into account
:param same_gene: Whether exons within the same gene are considered \
co-localised (True) or not (False) (default False)
:return: The GLMM p-value along with other information
"""
ndf = df.loc[~df['community'].isin(["All-community", "FASTERDB"]), :].copy()
expanded_df = expand_dataframe(ndf)
outfile = ConfigGraph.get_community_file(project, weight, global_weight,
same_gene,
f"{sf_name}_{reg}_stat.txt", True)
noutfold = outfile.parent / "expanded_df"
noutfold.mkdir(exist_ok=True, parents=True)
noutfile = noutfold / outfile.name
expanded_df.to_csv(noutfile, sep="\t", index=False)
pval = glmm_maker(expanded_df, outfile)
return pd.Series({"project": project, "sf": sf_name, "reg": reg,
"pval": pval})
def get_stat4communities(sf_name: str, reg: str,
project: str, weight: int,
global_weight: int, same_gene: bool) -> pd.DataFrame:
global_weight: int, same_gene: bool
) -> Tuple[pd.DataFrame, pd.Series]:
"""
Get data (the proportion of exons regulated (`reg`) by the splicing factor
`sf_name` within a community) for every community.
@@ -172,7 +290,10 @@ def get_stat4communities(sf_name: str, reg: str,
d['sf'] = [sf_name] * len(d["community"])
d['reg'] = [reg] * len(d["community"])
d['project'] = [project] * len(d["community"])
return pd.DataFrame(d)
df = pd.DataFrame(d)
s = glmm_statistics(df, sf_name, reg, project, weight, global_weight,
same_gene)
return df, s
def get_sfname() -> List[str]:
@@ -240,19 +361,27 @@ def multiple_stat_launcher(ps: int, weights: List[int],
processes[ckey] = [pool.apply_async(get_stat4communities, args)]
else:
processes[ckey].append(pool.apply_async(get_stat4communities, args))
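# Gather the async results; each key is '<project>_<weight>'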
for p in processes:
for p, value in processes.items():
project, weight = p.split("_")
list_df = []
for proc in processes[p]:
list_df.append(proc.get(timeout=None))
list_tuples = [proc.get(timeout=None) for proc in value]
pool.close()
pool.join()
list_df = [t[0] for t in list_tuples]
list_series = [t[1] for t in list_tuples]
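# Per-community stats are concatenated into one table per project/weight;
# the GLMM series are kept for the FDR correction below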
df = pd.concat(list_df, axis=0, ignore_index=True)
outfile = ConfigGraph.get_community_file(project, weight,
dic_project[project],
same_gene,
"_stat.txt", True)
df.to_csv(outfile, sep="\t", index=False)
glm_df = pd.DataFrame(list_series)
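# Benjamini-Hochberg correction of the GLMM p-values across splicing factors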
glm_df["padj"] = multipletests(glm_df['pval'].values,
method='fdr_bh')[1]
outfile = ConfigGraph.get_community_file(project, weight,
dic_project[project],
same_gene,
"_glmm_stat.txt", True)
glm_df.to_csv(outfile, sep="\t", index=False)
if __name__ == "__main__":