Skip to content
Snippets Groups Projects
Commit c1ec2983 authored by nfontrod's avatar nfontrod
Browse files

src/find_interaction_cluster/nt_and_community.py: add functions for barplot...

src/find_interaction_cluster/nt_and_community.py: add functions for barplot creation and refactoring of get_stat_nt_communities functions
parent 4e1e5741
No related branches found
No related tags found
No related merge requests found
...@@ -24,6 +24,7 @@ import multiprocessing as mp ...@@ -24,6 +24,7 @@ import multiprocessing as mp
import numpy as np import numpy as np
from .radomization_test_ppi import get_pvalue from .radomization_test_ppi import get_pvalue
from tqdm import tqdm from tqdm import tqdm
import seaborn as sns
def get_nt_frequency(cnx: sqlite3.Connection, list_ft: List[str], def get_nt_frequency(cnx: sqlite3.Connection, list_ft: List[str],
...@@ -341,7 +342,7 @@ def perm_pvalues(df: pd.DataFrame, df_ctrl: pd.DataFrame, feature: str, ...@@ -341,7 +342,7 @@ def perm_pvalues(df: pd.DataFrame, df_ctrl: pd.DataFrame, feature: str,
returnsorted=False)[1] returnsorted=False)[1]
adj_regs = [list_reg[i] if adj_pvals[i] <= 0.05 else " . " adj_regs = [list_reg[i] if adj_pvals[i] <= 0.05 else " . "
for i in range(len(list_reg))] for i in range(len(list_reg))]
df[f'mean_{iteration}_ctrl'] = mean_ctrl df[f'{nt}_mean_{iteration}_ctrl'] = mean_ctrl
df[f'p-adj'] = adj_pvals df[f'p-adj'] = adj_pvals
df[f'reg-adj'] = adj_regs df[f'reg-adj'] = adj_regs
return df return df
...@@ -369,6 +370,157 @@ def perm_with_ctrl(df: pd.DataFrame, feature: str, ...@@ -369,6 +370,157 @@ def perm_with_ctrl(df: pd.DataFrame, feature: str,
return perm_pvalues(mean_df, df_ctrl, feature, nt, iteration, dic_com) return perm_pvalues(mean_df, df_ctrl, feature, nt, iteration, dic_com)
def prepare_dataframe(df: pd.DataFrame, test_type: str, nt: str,
iteration: int) -> pd.DataFrame:
"""
:param df: A dataframe with the enrichment of a \
nucleotide frequency for every community
:param test_type: The kind of test performed to
:param nt: The nucleotide of interest
:param iteration: The number of iteration permformed to \
produce the database
:return: The dataframe ready for barplot visualisation
"""
if test_type == "lmm":
# removing the size parameter
df = df[df["community"] != "log(_size)"].copy()
df.rename({"Pr(>|t|)": "p-adj"}, axis=1, inplace=True)
df_bar = df[['community', nt, 'p-adj']]
df_ctrl = df_bar[df_bar["community"] == "C-CTRL"]
df_bar = df_bar[df_bar["community"] != "C-CTRL"].copy()
df_bar.sort_values(nt, ascending=True, inplace=True)
else:
df_bar = df[["community", nt, f"{nt}_mean_{iteration}_ctrl",
"p-adj"]].copy()
ctrl_val = df_bar[f"{nt}_mean_{iteration}_ctrl"]
df_bar.drop(f"{nt}_mean_{iteration}_ctrl", axis=1, inplace=True)
df_bar.sort_values(nt, ascending=True, inplace=True)
df_ctrl = pd.DataFrame({"community": ["C-CTRL"] * len(ctrl_val),
nt: ctrl_val})
df_bar = pd.concat([df_ctrl, df_bar], axis=0, ignore_index=True)
return df_bar
def make_barplot(df_bar: pd.DataFrame, outfile: Path, nt: str, test_type: str,
feature: str) -> None:
"""
Create a barplot showing the frequency of `nt` for every community \
of exons/gene in `df_bar`.
:param df_bar: A dataframe with the enrichment of a \
nucleotide frequency for every community
:param outfile: File were the figure will be stored
:param nt: The nucleotide for which we are seeking enrichment
:param test_type: The kind of test make
:param feature: The king of feature of interest
"""
sns.set()
test_name = "permutation" if test_type == "perm" else "lmm"
g = sns.catplot(x="community", y=nt, data=df_bar, kind="bar",
ci="sd", aspect=2.5, height=12,
palette=["red"] + ["grey"] * (df_bar.shape[0] - 1))
g.fig.suptitle(f"Mean frequency of {nt} among community of {feature}s\n"
f"(stats obtained with as {test_name} test)")
g.set(xticklabels=[])
g.ax.set_ylabel(f'Frequency of {nt}')
if test_type == "perm":
df_bar = df_bar.drop_duplicates(subset="community", keep="last")
for i, p in enumerate(g.ax.patches):
stats = "*" if df_bar.iloc[i, :]["p-adj"] < 0.05 else ""
print(i, stats, df_bar.iloc[i, :]["p-adj"])
g.ax.annotate(stats,
(p.get_x() + p.get_width() / 2., p.get_height()),
ha='center', va='center', xytext=(0, 10),
textcoords='offset points')
g.savefig(outfile)
def create_and_save_ctrl_dataframe(df: pd.DataFrame, feature: str,
region: str, nt: str, outfile: Path,
test_type: str, df_ctrl: pd.DataFrame,
dic_com: Dict, iteration: int,
outfile_ctrl: Path) -> None:
"""
Create a dataframe with a control community, save it as a table and \
as a barplot figure.
:param df: A dataframe containing the frequency of each nucleotide \
in each exons belonging to a community.
:param feature: The kind of feature analysed
:param region: the region of interest to extract from gene
:param nt: The nucleotide of interest
:param outfile: File used to store diagnotics
:param test_type: The type of test to make (permutation or lmm)
:param df_ctrl: A dataframe containing the frequency of each nucleotide \
in each exons/gene in fasterdb.
:param dic_com: A dictionary linking each community to the exons \
it contains.
:param iteration: The number of sub samples to create
:param outfile_ctrl: file used to stored the table and the figure \
containing the test communities and the control community
"""
if test_type == "lmm":
rdf = lmm_with_ctrl(df, feature, region, nt,
outfile.parents[1] / outfile.name)
else:
rdf = perm_with_ctrl(df, feature, nt, df_ctrl, dic_com, iteration)
rdf.to_csv(outfile_ctrl, sep="\t", index=False)
barplot_creation(rdf, outfile_ctrl, nt,
test_type, feature, iteration)
def barplot_creation(rdf: pd.DataFrame, outfile: Path, nt: str,
test_type: str, feature: str, iteration: int) -> None:
"""
Reformat a dataframe with the enrichment of a nucleotide frequency \
for every community and then create a barplot showing those frequencies.
:param rdf: A dataframe with the enrichment of a \
nucleotide frequency for every community
:param outfile: File were rdf is stored
:param nt: The nucleotide for which we are seeking enrichment
:param test_type: The kind of test make
:param feature: The king of feature of interest
:param iteration: The number of sub samples to create
"""
rdf = prepare_dataframe(rdf, test_type, nt, iteration)
outfig = outfile.parent / outfile.name.replace(".txt", ".pdf")
make_barplot(rdf, outfig, nt, test_type, feature)
def create_outfiles(project: str, weight: int, global_weight: int,
same_gene: bool, feature: str, nt: str, test_type: str
) -> Tuple[Path, Path]:
"""
Create a file used to store diagnostics and a file used to store the \
table containing the test communities and the control community
:param project: The name of the project of interest
:param weight: The minimum weight of interaction to consider
:param global_weight: The global weight to consider. if \
the global weight is equal to 0 then then density figure are calculated \
by project, else all projet are merge together and the interaction \
seen in `global_weight` project are taken into account
:param same_gene: Say if we consider as co-localised, exons within the \
same gene (True) or not (False) (default False)
:param nt: The nucleotide of interest
:param feature: The kind of feature analysed
:param test_type: The type of test to make (permutation or lmm)
:return: file used to store diagnostics and a file used to store the \
table containing the test communities and the control community
"""
outfolder = "community_enrichment/nt_analysis"
outfile = ConfigGraph.\
get_community_file(project, weight, global_weight, same_gene, feature,
f"{nt}_stat_{test_type}.txt", outfolder)
outfile_ctrl = ConfigGraph.\
get_community_file(project, weight, global_weight, same_gene, feature,
f"{nt}_VS_CTRL_stat_{test_type}.txt", outfolder)
return outfile, outfile_ctrl
def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int, def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
global_weight: int, same_gene: bool, global_weight: int, same_gene: bool,
nt: str, df_ctrl: pd.DataFrame, nt: str, df_ctrl: pd.DataFrame,
...@@ -402,31 +554,12 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int, ...@@ -402,31 +554,12 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
""" """
logging.debug(f"{test_type} for {project}, w:{weight}, " logging.debug(f"{test_type} for {project}, w:{weight}, "
f"g:{global_weight} nt: {nt}") f"g:{global_weight} nt: {nt}")
res = {"project": [], "nt": [], 'pval': []} outfile, outfile_ctrl = create_outfiles(project, weight, global_weight,
outfile = ConfigGraph.get_community_file(project, weight, same_gene, feature, nt, test_type)
global_weight, res = {"project": project, "nt": nt, 'pval': lmm_maker(df, outfile, nt)}
same_gene, feature, create_and_save_ctrl_dataframe(df, feature, region, nt, outfile, test_type,
f"{nt}_stat_{test_type}.txt", df_ctrl, dic_com, iteration, outfile_ctrl)
"community_enrichment")
nfolder = outfile.parent / "nt_analysis"
nfolder.mkdir(exist_ok=True, parents=True)
noutfile = nfolder / outfile.name
pval = lmm_maker(df, noutfile, nt)
res['project'] = project
res['nt'] = nt
res['pval'] = pval
outfile_ctrl = ConfigGraph.get_community_file(project, weight,
global_weight,
same_gene, feature,
f"{nt}_VS_CTRL_stat_"
f"{test_type}.txt",
"community_enrichment")
if test_type == "lmm":
rdf = lmm_with_ctrl(df, feature, region, nt, outfile)
else:
rdf = perm_with_ctrl(df, feature, nt, df_ctrl, dic_com, iteration)
noutfile_ctrl = nfolder / outfile_ctrl.name
rdf.to_csv(noutfile_ctrl, sep="\t", index=False)
return pd.Series(res) return pd.Series(res)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment