src/find_interaction_cluster/nt_and_community.py: add functions for barplot...

src/find_interaction_cluster/nt_and_community.py: add functions for barplot creation and refactoring of get_stat_nt_communities functions

src/find_interaction_cluster/nt_and_community.py: add functions for barplot...
c1ec2983 · nfontrod · 4e1e5741 · c1ec2983
Commit c1ec2983 authored Nov 24, 2020 by nfontrod
--- a/src/find_interaction_cluster/nt_and_community.py
+++ b/src/find_interaction_cluster/nt_and_community.py
@@ -24,6 +24,7 @@ import multiprocessing as mp
 import numpy as np
 from .radomization_test_ppi import get_pvalue
 from tqdm import tqdm
+import seaborn as sns
 def get_nt_frequency(cnx: sqlite3.Connection, list_ft: List[str],
@@ -341,7 +342,7 @@ def perm_pvalues(df: pd.DataFrame, df_ctrl: pd.DataFrame, feature: str,
                              returnsorted=False)[1]
    adj_regs = [list_reg[i] if adj_pvals[i] <= 0.05 else " . "
                for i in range(len(list_reg))]
-    df[f'mean_{iteration}_ctrl'] = mean_ctrl
+    df[f'{nt}_mean_{iteration}_ctrl'] = mean_ctrl
    df[f'p-adj'] = adj_pvals
    df[f'reg-adj'] = adj_regs
    return df
@@ -369,6 +370,157 @@ def perm_with_ctrl(df: pd.DataFrame, feature: str,
    return perm_pvalues(mean_df, df_ctrl, feature, nt, iteration, dic_com)
+def prepare_dataframe(df: pd.DataFrame, test_type: str, nt: str,
+                      iteration: int) -> pd.DataFrame:
+    """
+    :param df: A dataframe with the enrichment of a \
+    nucleotide frequency for every community
+    :param test_type: The kind of test performed to
+    :param nt: The nucleotide of interest
+    :param iteration: The number of iteration permformed to \
+    produce the database
+    :return: The dataframe ready for barplot visualisation
+    """
+    if test_type == "lmm":
+        # removing the size parameter
+        df = df[df["community"] != "log(_size)"].copy()
+        df.rename({"Pr(>|t|)": "p-adj"}, axis=1, inplace=True)
+        df_bar = df[['community', nt, 'p-adj']]
+        df_ctrl = df_bar[df_bar["community"] == "C-CTRL"]
+        df_bar = df_bar[df_bar["community"] != "C-CTRL"].copy()
+        df_bar.sort_values(nt, ascending=True, inplace=True)
+    else:
+        df_bar = df[["community", nt, f"{nt}_mean_{iteration}_ctrl",
+                     "p-adj"]].copy()
+        ctrl_val = df_bar[f"{nt}_mean_{iteration}_ctrl"]
+        df_bar.drop(f"{nt}_mean_{iteration}_ctrl", axis=1, inplace=True)
+        df_bar.sort_values(nt, ascending=True, inplace=True)
+        df_ctrl = pd.DataFrame({"community": ["C-CTRL"] * len(ctrl_val),
+                                nt: ctrl_val})
+    df_bar = pd.concat([df_ctrl, df_bar], axis=0, ignore_index=True)
+    return df_bar
+def make_barplot(df_bar: pd.DataFrame, outfile: Path, nt: str, test_type: str,
+                 feature: str) -> None:
+    """
+    Create a barplot showing the frequency of `nt` for every community \
+    of exons/gene in `df_bar`.
+    :param df_bar: A dataframe with the enrichment of a \
+    nucleotide frequency for every community
+    :param outfile: File were the figure will be stored
+    :param nt: The nucleotide for which we are seeking enrichment
+    :param test_type: The kind of test make
+    :param feature: The king of feature of interest
+    """
+    sns.set()
+    test_name = "permutation" if test_type == "perm" else "lmm"
+    g = sns.catplot(x="community", y=nt, data=df_bar, kind="bar",
+                    ci="sd", aspect=2.5, height=12,
+                    palette=["red"] + ["grey"] * (df_bar.shape[0] - 1))
+    g.fig.suptitle(f"Mean frequency of {nt} among community of {feature}s\n"
+                   f"(stats obtained with as {test_name} test)")
+    g.set(xticklabels=[])
+    g.ax.set_ylabel(f'Frequency of {nt}')
+    if test_type == "perm":
+        df_bar = df_bar.drop_duplicates(subset="community", keep="last")
+    for i, p in enumerate(g.ax.patches):
+        stats = "*" if df_bar.iloc[i, :]["p-adj"] < 0.05 else ""
+        print(i, stats, df_bar.iloc[i, :]["p-adj"])
+        g.ax.annotate(stats,
+                      (p.get_x() + p.get_width() / 2., p.get_height()),
+                      ha='center', va='center', xytext=(0, 10),
+                      textcoords='offset points')
+    g.savefig(outfile)
+def create_and_save_ctrl_dataframe(df: pd.DataFrame, feature: str,
+                                   region: str, nt: str, outfile: Path,
+                                   test_type: str, df_ctrl: pd.DataFrame,
+                                   dic_com: Dict, iteration: int,
+                                   outfile_ctrl: Path) -> None:
+    """
+    Create a dataframe with a control community, save it as a table and \
+    as a barplot figure.
+    :param df: A dataframe containing the frequency of each nucleotide \
+    in each exons belonging to a community.
+    :param feature: The kind of feature analysed
+    :param region: the region of interest to extract from gene
+    :param nt: The nucleotide of interest
+    :param outfile: File used to store diagnotics
+    :param test_type: The type of test to make (permutation or lmm)
+    :param df_ctrl: A dataframe containing the frequency of each nucleotide \
+    in each exons/gene in fasterdb.
+    :param dic_com: A dictionary linking each community to the exons \
+    it contains.
+    :param iteration: The number of sub samples to create
+    :param outfile_ctrl: file used to stored the table and the figure \
+    containing the test communities and the control community
+    """
+    if test_type == "lmm":
+        rdf = lmm_with_ctrl(df, feature, region, nt,
+                            outfile.parents[1] / outfile.name)
+    else:
+        rdf = perm_with_ctrl(df, feature, nt, df_ctrl, dic_com, iteration)
+    rdf.to_csv(outfile_ctrl, sep="\t", index=False)
+    barplot_creation(rdf, outfile_ctrl, nt,
+                     test_type, feature, iteration)
+def barplot_creation(rdf: pd.DataFrame, outfile: Path, nt: str,
+                     test_type: str, feature: str, iteration: int) -> None:
+    """
+    Reformat a dataframe with the enrichment of a nucleotide frequency \
+    for every community and then create a barplot showing those frequencies.
+    :param rdf: A dataframe with the enrichment of a \
+    nucleotide frequency for every community
+    :param outfile: File were rdf is stored
+    :param nt: The nucleotide for which we are seeking enrichment
+    :param test_type: The kind of test make
+    :param feature: The king of feature of interest
+    :param iteration: The number of sub samples to create
+    """
+    rdf = prepare_dataframe(rdf, test_type, nt, iteration)
+    outfig = outfile.parent / outfile.name.replace(".txt", ".pdf")
+    make_barplot(rdf, outfig, nt, test_type, feature)
+def create_outfiles(project: str, weight: int, global_weight: int,
+                    same_gene: bool, feature: str, nt: str, test_type: str
+                    ) -> Tuple[Path, Path]:
+    """
+    Create a file used to store diagnostics and a file used to store the \
+    table containing the test communities and the control community
+    :param project: The name of the project of interest
+    :param weight: The minimum weight of interaction to consider
+    :param global_weight: The global weight to consider. if \
+    the global weight is equal to 0 then then density figure are calculated \
+    by project, else all projet are merge together and the interaction \
+    seen in `global_weight` project are taken into account
+    :param same_gene: Say if we consider as co-localised, exons within the \
+    same gene (True) or not (False) (default False)
+    :param nt: The nucleotide of interest
+    :param feature: The kind of feature analysed
+    :param test_type: The type of test to make (permutation or lmm)
+    :return: file used to store diagnostics and a file used to store the \
+    table containing the test communities and the control community
+    """
+    outfolder = "community_enrichment/nt_analysis"
+    outfile = ConfigGraph.\
+        get_community_file(project, weight, global_weight, same_gene, feature,
+                           f"{nt}_stat_{test_type}.txt", outfolder)
+    outfile_ctrl = ConfigGraph.\
+        get_community_file(project, weight, global_weight, same_gene, feature,
+                           f"{nt}_VS_CTRL_stat_{test_type}.txt", outfolder)
+    return outfile, outfile_ctrl
 def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
                            global_weight: int, same_gene: bool,
                            nt: str, df_ctrl: pd.DataFrame,
@@ -402,31 +554,12 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
    """
    logging.debug(f"{test_type} for {project}, w:{weight}, "
                  f"g:{global_weight} nt: {nt}")
-    res = {"project": [], "nt": [], 'pval': []}
+    outfile, outfile_ctrl = create_outfiles(project, weight, global_weight,
-    outfile = ConfigGraph.get_community_file(project, weight,
+                                            same_gene, feature, nt, test_type)
-                                             global_weight,
+    res = {"project": project, "nt": nt, 'pval': lmm_maker(df, outfile, nt)}
-                                             same_gene, feature,
+    create_and_save_ctrl_dataframe(df, feature, region, nt, outfile, test_type,
-                                             f"{nt}_stat_{test_type}.txt",
+                                   df_ctrl, dic_com, iteration, outfile_ctrl)
-                                             "community_enrichment")
-    nfolder = outfile.parent / "nt_analysis"
-    nfolder.mkdir(exist_ok=True, parents=True)
-    noutfile = nfolder / outfile.name
-    pval = lmm_maker(df, noutfile, nt)
-    res['project'] = project
-    res['nt'] = nt
-    res['pval'] = pval
-    outfile_ctrl = ConfigGraph.get_community_file(project, weight,
-                                                  global_weight,
-                                                  same_gene, feature,
-                                                  f"{nt}_VS_CTRL_stat_"
-                                                  f"{test_type}.txt",
-                                                  "community_enrichment")
-    if test_type == "lmm":
-        rdf = lmm_with_ctrl(df, feature, region, nt, outfile)
-    else:
-        rdf = perm_with_ctrl(df, feature, nt, df_ctrl, dic_com, iteration)
-    noutfile_ctrl = nfolder / outfile_ctrl.name
-    rdf.to_csv(noutfile_ctrl, sep="\t", index=False)
    return pd.Series(res)