diff --git a/src/find_interaction_cluster/nt_and_community.py b/src/find_interaction_cluster/nt_and_community.py index 9046ae96d4159f1fec2647201cde512f5c9e23c3..07d5e40e7701ac40d133061a0cf9857afc0cbbbe 100644 --- a/src/find_interaction_cluster/nt_and_community.py +++ b/src/find_interaction_cluster/nt_and_community.py @@ -24,6 +24,7 @@ import multiprocessing as mp import numpy as np from .radomization_test_ppi import get_pvalue from tqdm import tqdm +import seaborn as sns def get_nt_frequency(cnx: sqlite3.Connection, list_ft: List[str], @@ -341,7 +342,7 @@ def perm_pvalues(df: pd.DataFrame, df_ctrl: pd.DataFrame, feature: str, returnsorted=False)[1] adj_regs = [list_reg[i] if adj_pvals[i] <= 0.05 else " . " for i in range(len(list_reg))] - df[f'mean_{iteration}_ctrl'] = mean_ctrl + df[f'{nt}_mean_{iteration}_ctrl'] = mean_ctrl df[f'p-adj'] = adj_pvals df[f'reg-adj'] = adj_regs return df @@ -369,6 +370,157 @@ def perm_with_ctrl(df: pd.DataFrame, feature: str, return perm_pvalues(mean_df, df_ctrl, feature, nt, iteration, dic_com) +def prepare_dataframe(df: pd.DataFrame, test_type: str, nt: str, + iteration: int) -> pd.DataFrame: + """ + + :param df: A dataframe with the enrichment of a \ + nucleotide frequency for every community + :param test_type: The kind of test performed to + :param nt: The nucleotide of interest + :param iteration: The number of iteration permformed to \ + produce the database + :return: The dataframe ready for barplot visualisation + """ + if test_type == "lmm": + # removing the size parameter + df = df[df["community"] != "log(_size)"].copy() + df.rename({"Pr(>|t|)": "p-adj"}, axis=1, inplace=True) + df_bar = df[['community', nt, 'p-adj']] + df_ctrl = df_bar[df_bar["community"] == "C-CTRL"] + df_bar = df_bar[df_bar["community"] != "C-CTRL"].copy() + df_bar.sort_values(nt, ascending=True, inplace=True) + else: + df_bar = df[["community", nt, f"{nt}_mean_{iteration}_ctrl", + "p-adj"]].copy() + ctrl_val = df_bar[f"{nt}_mean_{iteration}_ctrl"] + df_bar.drop(f"{nt}_mean_{iteration}_ctrl", axis=1, inplace=True) + df_bar.sort_values(nt, ascending=True, inplace=True) + df_ctrl = pd.DataFrame({"community": ["C-CTRL"] * len(ctrl_val), + nt: ctrl_val}) + df_bar = pd.concat([df_ctrl, df_bar], axis=0, ignore_index=True) + return df_bar + + +def make_barplot(df_bar: pd.DataFrame, outfile: Path, nt: str, test_type: str, + feature: str) -> None: + """ + Create a barplot showing the frequency of `nt` for every community \ + of exons/gene in `df_bar`. + + :param df_bar: A dataframe with the enrichment of a \ + nucleotide frequency for every community + :param outfile: File were the figure will be stored + :param nt: The nucleotide for which we are seeking enrichment + :param test_type: The kind of test make + :param feature: The king of feature of interest + """ + sns.set() + test_name = "permutation" if test_type == "perm" else "lmm" + g = sns.catplot(x="community", y=nt, data=df_bar, kind="bar", + ci="sd", aspect=2.5, height=12, + palette=["red"] + ["grey"] * (df_bar.shape[0] - 1)) + g.fig.suptitle(f"Mean frequency of {nt} among community of {feature}s\n" + f"(stats obtained with as {test_name} test)") + g.set(xticklabels=[]) + g.ax.set_ylabel(f'Frequency of {nt}') + if test_type == "perm": + df_bar = df_bar.drop_duplicates(subset="community", keep="last") + for i, p in enumerate(g.ax.patches): + stats = "*" if df_bar.iloc[i, :]["p-adj"] < 0.05 else "" + print(i, stats, df_bar.iloc[i, :]["p-adj"]) + g.ax.annotate(stats, + (p.get_x() + p.get_width() / 2., p.get_height()), + ha='center', va='center', xytext=(0, 10), + textcoords='offset points') + g.savefig(outfile) + + +def create_and_save_ctrl_dataframe(df: pd.DataFrame, feature: str, + region: str, nt: str, outfile: Path, + test_type: str, df_ctrl: pd.DataFrame, + dic_com: Dict, iteration: int, + outfile_ctrl: Path) -> None: + """ + Create a dataframe with a control community, save it as a table and \ + as a barplot figure. + + :param df: A dataframe containing the frequency of each nucleotide \ + in each exons belonging to a community. + :param feature: The kind of feature analysed + :param region: the region of interest to extract from gene + :param nt: The nucleotide of interest + :param outfile: File used to store diagnotics + :param test_type: The type of test to make (permutation or lmm) + :param df_ctrl: A dataframe containing the frequency of each nucleotide \ + in each exons/gene in fasterdb. + :param dic_com: A dictionary linking each community to the exons \ + it contains. + :param iteration: The number of sub samples to create + :param outfile_ctrl: file used to stored the table and the figure \ + containing the test communities and the control community + """ + if test_type == "lmm": + rdf = lmm_with_ctrl(df, feature, region, nt, + outfile.parents[1] / outfile.name) + else: + rdf = perm_with_ctrl(df, feature, nt, df_ctrl, dic_com, iteration) + rdf.to_csv(outfile_ctrl, sep="\t", index=False) + barplot_creation(rdf, outfile_ctrl, nt, + test_type, feature, iteration) + + +def barplot_creation(rdf: pd.DataFrame, outfile: Path, nt: str, + test_type: str, feature: str, iteration: int) -> None: + """ + Reformat a dataframe with the enrichment of a nucleotide frequency \ + for every community and then create a barplot showing those frequencies. + + :param rdf: A dataframe with the enrichment of a \ + nucleotide frequency for every community + :param outfile: File were rdf is stored + :param nt: The nucleotide for which we are seeking enrichment + :param test_type: The kind of test make + :param feature: The king of feature of interest + :param iteration: The number of sub samples to create + """ + rdf = prepare_dataframe(rdf, test_type, nt, iteration) + outfig = outfile.parent / outfile.name.replace(".txt", ".pdf") + make_barplot(rdf, outfig, nt, test_type, feature) + + +def create_outfiles(project: str, weight: int, global_weight: int, + same_gene: bool, feature: str, nt: str, test_type: str + ) -> Tuple[Path, Path]: + """ + Create a file used to store diagnostics and a file used to store the \ + table containing the test communities and the control community + + :param project: The name of the project of interest + :param weight: The minimum weight of interaction to consider + :param global_weight: The global weight to consider. if \ + the global weight is equal to 0 then then density figure are calculated \ + by project, else all projet are merge together and the interaction \ + seen in `global_weight` project are taken into account + :param same_gene: Say if we consider as co-localised, exons within the \ + same gene (True) or not (False) (default False) + :param nt: The nucleotide of interest + :param feature: The kind of feature analysed + :param test_type: The type of test to make (permutation or lmm) + :return: file used to store diagnostics and a file used to store the \ + table containing the test communities and the control community + """ + + outfolder = "community_enrichment/nt_analysis" + outfile = ConfigGraph.\ + get_community_file(project, weight, global_weight, same_gene, feature, + f"{nt}_stat_{test_type}.txt", outfolder) + outfile_ctrl = ConfigGraph.\ + get_community_file(project, weight, global_weight, same_gene, feature, + f"{nt}_VS_CTRL_stat_{test_type}.txt", outfolder) + return outfile, outfile_ctrl + + def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int, global_weight: int, same_gene: bool, nt: str, df_ctrl: pd.DataFrame, @@ -402,31 +554,12 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int, """ logging.debug(f"{test_type} for {project}, w:{weight}, " f"g:{global_weight} nt: {nt}") - res = {"project": [], "nt": [], 'pval': []} - outfile = ConfigGraph.get_community_file(project, weight, - global_weight, - same_gene, feature, - f"{nt}_stat_{test_type}.txt", - "community_enrichment") - nfolder = outfile.parent / "nt_analysis" - nfolder.mkdir(exist_ok=True, parents=True) - noutfile = nfolder / outfile.name - pval = lmm_maker(df, noutfile, nt) - res['project'] = project - res['nt'] = nt - res['pval'] = pval - outfile_ctrl = ConfigGraph.get_community_file(project, weight, - global_weight, - same_gene, feature, - f"{nt}_VS_CTRL_stat_" - f"{test_type}.txt", - "community_enrichment") - if test_type == "lmm": - rdf = lmm_with_ctrl(df, feature, region, nt, outfile) - else: - rdf = perm_with_ctrl(df, feature, nt, df_ctrl, dic_com, iteration) - noutfile_ctrl = nfolder / outfile_ctrl.name - rdf.to_csv(noutfile_ctrl, sep="\t", index=False) + outfile, outfile_ctrl = create_outfiles(project, weight, global_weight, + same_gene, feature, nt, test_type) + res = {"project": project, "nt": nt, 'pval': lmm_maker(df, outfile, nt)} + create_and_save_ctrl_dataframe(df, feature, region, nt, outfile, test_type, + df_ctrl, dic_com, iteration, outfile_ctrl) + return pd.Series(res)