Skip to content
Snippets Groups Projects
Commit 263f00e6 authored by nfontrod's avatar nfontrod
Browse files

src/find_interaction_cluster/nt_and_community.py: add a script allowing to...

src/find_interaction_cluster/nt_and_community.py: add a script allowing to create a control community to see if the communities are different from the rest of the genes/exons not in a community
parent c0104bd4
No related branches found
No related tags found
No related merge requests found
......@@ -135,6 +135,86 @@ def lmm_maker(df: pd.DataFrame, outfile: Path, nt: str) -> float:
return res["Pr(>Chisq)"][1]
def lmm_maker_summary(df: pd.DataFrame, outfile: Path, nt: str
) -> pd.DataFrame:
"""
Make the lmm analysis to see if the exon regulated by a splicing factor \
are equally distributed among the communities.
:param df: The dataframe
:param outfile: A name of a file
:param nt: the nucleotide of interest
:return: the pvalue of lmm
"""
pandas2ri.activate()
lmm = r(
"""
require("DHARMa")
function(data, folder, partial_name) {
mod <- lm(%s ~ log(community_size) + community, data=data)
simulationOutput <- simulateResiduals(fittedModel = mod, n = 250)
png(paste0(folder, "/dignostics_summary", partial_name, ".png"))
plot(simulationOutput)
dev.off()
return(as.data.frame(summary(mod)$coefficients))
}
""" % nt)
folder = outfile.parent / "diagnostics"
folder.mkdir(parents=True, exist_ok=True)
partial_name = outfile.name.replace('.txt', '')
res_df = lmm(df, str(folder), partial_name).reset_index()
res_df.rename({'index': 'community'}, inplace=True, axis=1)
res_df['community'] = res_df['community'].str.replace('community', '')
res_df.loc[res_df['community'] == "(Intercept)", "community"] = "C-CTRL"
mean_df = df[[nt, "community", "community_size"]].\
groupby(["community", "community_size"]).mean().reset_index()
return res_df.merge(mean_df, how="left", on="community")
def get_ft_id(cnx: sqlite3.Connection, feature: str = "exon") -> List[str]:
"""
Return the id of every gene/exons in chia-pet database.
:param cnx: A connection to chiapet database
:param feature: The feature of interest
:return: The list of feature id
"""
query = f"SELECT DISTINCT id FROM cin_{feature}"
c = cnx.cursor()
c.execute(query)
res = c.fetchall()
return [str(cid[0]) for cid in res]
def create_ctrl_community(df: pd.DataFrame, outfile: Path,
feature: str = 'exon') -> pd.DataFrame:
"""
:param df: A dataframe containing the frequency of each feature in a \
community.
:param outfile: The output table containing frequencies
:param feature: The kind of feature to analyse
:return: A dataframe containing the frequency of every nucleotides \
of every exon in a large community
"""
if outfile.is_file():
return pd.read_csv(outfile, sep="\t")
size_threshold = 10 if feature == "gene" else 50
cnx = sqlite3.connect(ConfigGraph.db_file)
ft_id = get_ft_id(cnx, feature)
list_ft = [cid for cid in ft_id
if cid not in df[f"id_{feature}"].to_list()]
df_nt = get_nt_frequency(cnx, list_ft, feature)
df_com = get_community_table([list_ft], size_threshold, feature)
df_nt = df_nt.merge(df_com, how="left", on=f"id_{feature}")
df_nt['community'] = ['C-CTRL'] * df_nt.shape[0]
df = pd.concat([df, df_nt], axis=0, ignore_index=True)
df.to_csv(outfile, sep="\t", index=False)
return df
def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
global_weight: int, same_gene: bool,
nt: str, feature: str = "exon",
......@@ -172,6 +252,17 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
res['project'] = project
res['nt'] = nt
res['pval'] = pval
nt_ctrl_table = noutfile.parent / noutfile.name.replace("_stat.txt",
"_ctrl.txt")
ndf = create_ctrl_community(df, nt_ctrl_table, feature)
sum_df = lmm_maker_summary(ndf, outfile, nt)
outfile_ctrl = ConfigGraph.get_community_file(project, weight,
global_weight,
same_gene, feature,
f"{nt}_VS_CTRL_stat.txt",
True)
noutfile_ctrl = nfolder / outfile_ctrl.name
sum_df.to_csv(noutfile_ctrl, sep="\t", index=False)
return pd.Series(res)
......@@ -247,7 +338,7 @@ def multiple_nt_lmm_launcher(ps: int, weights: List[int],
nfile_table = ConfigGraph.get_community_file(project, weight,
global_weight,
same_gene, feature,
f"nt_table.txt",
f"_nt_table.txt",
True)
df.to_csv(nfile_table, sep="\t", index=False)
dic_df[ckey] = df
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment