src/find_interaction_cluster/nt_and_community.py: error bar on barplots

a520d7c7 · nfontrod · c1ec2983 · a520d7c7
Commit a520d7c7 authored Nov 24, 2020 by nfontrod
--- a/src/find_interaction_cluster/nt_and_community.py
+++ b/src/find_interaction_cluster/nt_and_community.py
@@ -152,7 +152,6 @@ def lmm_maker_summary(df: pd.DataFrame, outfile: Path, nt: str
    :param outfile: A name of a file
    :param nt: the nucleotide of interest
    :return: the pvalue of lmm
    """
    pandas2ri.activate()
    lmm = r(
@@ -222,7 +221,8 @@ def create_ctrl_community(df: pd.DataFrame,
 def lmm_with_ctrl(df: pd.DataFrame, feature: str, region: str,
-                  nt: str, outfile_diag: Path) -> pd.DataFrame:
+                  nt: str, outfile_diag: Path
+                  ) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    :param df: df: A dataframe containing the frequency of each nucleotide \
@@ -232,11 +232,12 @@ def lmm_with_ctrl(df: pd.DataFrame, feature: str, region: str,
    :param nt: The nucleotide of interest
    :param outfile_diag: File from which the diagnostics folder will be \
    inferred
-    :return: The dataframe with the p-value compared to the control \
+    :return: The dataframe with ctrl exon and \
-    list of exons.
+    The dataframe with the p-value compared to the control \
+    list of feature.
    """
    ndf = create_ctrl_community(df, feature, region)
-    return lmm_maker_summary(ndf, outfile_diag, nt)
+    return ndf, lmm_maker_summary(ndf, outfile_diag, nt)
 def get_feature_by_community(df: pd.DataFrame, feature: str) -> Dict:
@@ -418,24 +419,78 @@ def make_barplot(df_bar: pd.DataFrame, outfile: Path, nt: str, test_type: str,
    sns.set()
    test_name = "permutation" if test_type == "perm" else "lmm"
    g = sns.catplot(x="community", y=nt, data=df_bar, kind="bar",
-                    ci="sd", aspect=2.5, height=12,
+                    ci="sd", aspect=2.5, height=12, errwidth=0.5, capsize=.4,
-                    palette=["red"] + ["grey"] * (df_bar.shape[0] - 1))
+                    palette=["red"] + ["lightgray"] * (df_bar.shape[0] - 1))
    g.fig.suptitle(f"Mean frequency of {nt} among community of {feature}s\n"
                   f"(stats obtained with as {test_name} test)")
    g.set(xticklabels=[])
    g.ax.set_ylabel(f'Frequency of {nt}')
-    if test_type == "perm":
+    df_bara = df_bar.drop_duplicates(subset="community", keep="first")
-        df_bar = df_bar.drop_duplicates(subset="community", keep="last")
    for i, p in enumerate(g.ax.patches):
-        stats = "*" if df_bar.iloc[i, :]["p-adj"] < 0.05 else ""
+        stats = "*" if df_bara.iloc[i, :]["p-adj"] < 0.05 else ""
-        print(i, stats, df_bar.iloc[i, :]["p-adj"])
+        com = df_bara.iloc[i, :]["community"]
+        csd = np.std(df_bar.loc[df_bar["community"] == com, nt])
        g.ax.annotate(stats,
-                      (p.get_x() + p.get_width() / 2., p.get_height()),
+                      (p.get_x() + p.get_width() / 2., p.get_height() + csd),
                      ha='center', va='center', xytext=(0, 10),
                      textcoords='offset points')
    g.savefig(outfile)
+def expand_results_lmm(df: pd.DataFrame, rdf: pd.DataFrame,
+                       nt: str, feature: str) -> pd.DataFrame:
+    """
+    Merge df and rdf together.
+    :param df: A dataframe containing the frequency of each nucleotide \
+    in each feature belonging to a community.
+    :param rdf: The dataframe containing the mean frequency for \
+    each community and the p-value of their enrichment compared to control \
+    exons.
+    :param nt: the nucleotide of interest
+    :param feature: The feature of interest
+    :return: The merged dataframe: i.e df with the stats columns
+    """
+    p_col = "Pr(>|t|)"
+    df = df[[f"id_{feature}", nt, "community", "community_size"]].copy()
+    rdf = rdf[["community", "community_size", p_col, nt]].copy()
+    rdf.rename({nt: f"mean_{nt}", p_col: "p-adj"}, axis=1, inplace=True)
+    df = df.merge(rdf, how="left", on=["community", "community_size"])
+    df_ctrl = df[df["community"] == "C-CTRL"]
+    df = df[df["community"] != "C-CTRL"].copy()
+    df.sort_values(f"mean_{nt}", ascending=True, inplace=True)
+    return pd.concat([df_ctrl, df], axis=0, ignore_index=True)
+def expand_results_perm(df: pd.DataFrame, rdf: pd.DataFrame,
+                        nt: str, feature: str, iteration: int) -> pd.DataFrame:
+    """
+    Merge df and rdf together.
+    :param df: A dataframe containing the frequency of each nucleotide \
+    in each feature belonging to a community.
+    :param rdf: The dataframe containing the mean frequency for \
+    each community and the p-value of their enrichment compared to control \
+    exons.
+    :param nt: the nucleotide of interest
+    :param feature: The feature of interest
+    :param iteration: The number of iteration
+    :return: The merged dataframe: i.e df with the stats columns
+    """
+    df = df[[f"id_{feature}", nt, "community", "community_size"]].copy()
+    ctrl_val = rdf[f"{nt}_mean_{iteration}_ctrl"]
+    rdf = rdf[["community", "community_size", nt, "p-adj"]].copy()
+    rdf.rename({nt: f"mean_{nt}"}, axis=1, inplace=True)
+    df = df.merge(rdf, how="left", on=["community", "community_size"])
+    df_ctrl = pd.DataFrame({nt: ctrl_val,
+                            f"mean_{nt}": [np.mean(ctrl_val)] * len(ctrl_val),
+                            f"id_{feature}": ['ctrl'] * len(ctrl_val),
+                            "community_size": [len(ctrl_val)] * len(ctrl_val),
+                            "community": ["C-CTRL"] * len(ctrl_val)})
+    df.sort_values(f"mean_{nt}", ascending=True, inplace=True)
+    return pd.concat([df_ctrl, df], axis=0, ignore_index=True)
 def create_and_save_ctrl_dataframe(df: pd.DataFrame, feature: str,
                                   region: str, nt: str, outfile: Path,
                                   test_type: str, df_ctrl: pd.DataFrame,
@@ -461,32 +516,34 @@ def create_and_save_ctrl_dataframe(df: pd.DataFrame, feature: str,
    containing the test communities and the control community
    """
    if test_type == "lmm":
-        rdf = lmm_with_ctrl(df, feature, region, nt,
+        ndf, rdf = lmm_with_ctrl(df, feature, region, nt,
                                 outfile.parents[1] / outfile.name)
+        df_bar = expand_results_lmm(ndf, rdf, nt, feature)
    else:
        rdf = perm_with_ctrl(df, feature, nt, df_ctrl, dic_com, iteration)
+        df_bar = expand_results_perm(df, rdf, nt, feature, iteration)
    rdf.to_csv(outfile_ctrl, sep="\t", index=False)
-    barplot_creation(rdf, outfile_ctrl, nt,
+    barplot_creation(df_bar, outfile_ctrl, nt,
-                     test_type, feature, iteration)
+                     test_type, feature)
-def barplot_creation(rdf: pd.DataFrame, outfile: Path, nt: str,
+def barplot_creation(df_bar: pd.DataFrame, outfile: Path, nt: str,
-                     test_type: str, feature: str, iteration: int) -> None:
+                     test_type: str, feature: str) -> None:
    """
    Reformat a dataframe with the enrichment of a nucleotide frequency \
-    for every community and then create a barplot showing those frequencies.
+    for every feature for every community and then create a \
+    barplot showing those frequencies.
-    :param rdf: A dataframe with the enrichment of a \
+    :param df_bar: A dataframe with the enrichment of a \
-    nucleotide frequency for every community
+    nucleotide frequency for every community and showing the frequency \
+    of each feature in each community
    :param outfile: File were rdf is stored
    :param nt: The nucleotide for which we are seeking enrichment
    :param test_type: The kind of test make
    :param feature: The king of feature of interest
-    :param iteration: The number of sub samples to create
    """
-    rdf = prepare_dataframe(rdf, test_type, nt, iteration)
    outfig = outfile.parent / outfile.name.replace(".txt", ".pdf")
-    make_barplot(rdf, outfig, nt, test_type, feature)
+    make_barplot(df_bar, outfig, nt, test_type, feature)
 def create_outfiles(project: str, weight: int, global_weight: int,