diff --git a/src/find_interaction_cluster/nt_and_community.py b/src/find_interaction_cluster/nt_and_community.py index 07d5e40e7701ac40d133061a0cf9857afc0cbbbe..a96e351303428e77a41660cc7ec31e0c7b0ac4c7 100644 --- a/src/find_interaction_cluster/nt_and_community.py +++ b/src/find_interaction_cluster/nt_and_community.py @@ -152,7 +152,6 @@ def lmm_maker_summary(df: pd.DataFrame, outfile: Path, nt: str :param outfile: A name of a file :param nt: the nucleotide of interest :return: the pvalue of lmm - """ pandas2ri.activate() lmm = r( @@ -222,7 +221,8 @@ def create_ctrl_community(df: pd.DataFrame, def lmm_with_ctrl(df: pd.DataFrame, feature: str, region: str, - nt: str, outfile_diag: Path) -> pd.DataFrame: + nt: str, outfile_diag: Path + ) -> Tuple[pd.DataFrame, pd.DataFrame]: """ :param df: df: A dataframe containing the frequency of each nucleotide \ @@ -232,11 +232,12 @@ def lmm_with_ctrl(df: pd.DataFrame, feature: str, region: str, :param nt: The nucleotide of interest :param outfile_diag: File from which the diagnostics folder will be \ inferred - :return: The dataframe with the p-value compared to the control \ - list of exons. + :return: The dataframe with ctrl exon and \ + The dataframe with the p-value compared to the control \ + list of feature. """ ndf = create_ctrl_community(df, feature, region) - return lmm_maker_summary(ndf, outfile_diag, nt) + return ndf, lmm_maker_summary(ndf, outfile_diag, nt) def get_feature_by_community(df: pd.DataFrame, feature: str) -> Dict: @@ -418,24 +419,78 @@ def make_barplot(df_bar: pd.DataFrame, outfile: Path, nt: str, test_type: str, sns.set() test_name = "permutation" if test_type == "perm" else "lmm" g = sns.catplot(x="community", y=nt, data=df_bar, kind="bar", - ci="sd", aspect=2.5, height=12, - palette=["red"] + ["grey"] * (df_bar.shape[0] - 1)) + ci="sd", aspect=2.5, height=12, errwidth=0.5, capsize=.4, + palette=["red"] + ["lightgray"] * (df_bar.shape[0] - 1)) g.fig.suptitle(f"Mean frequency of {nt} among community of {feature}s\n" f"(stats obtained with as {test_name} test)") g.set(xticklabels=[]) g.ax.set_ylabel(f'Frequency of {nt}') - if test_type == "perm": - df_bar = df_bar.drop_duplicates(subset="community", keep="last") + df_bara = df_bar.drop_duplicates(subset="community", keep="first") for i, p in enumerate(g.ax.patches): - stats = "*" if df_bar.iloc[i, :]["p-adj"] < 0.05 else "" - print(i, stats, df_bar.iloc[i, :]["p-adj"]) + stats = "*" if df_bara.iloc[i, :]["p-adj"] < 0.05 else "" + com = df_bara.iloc[i, :]["community"] + csd = np.std(df_bar.loc[df_bar["community"] == com, nt]) g.ax.annotate(stats, - (p.get_x() + p.get_width() / 2., p.get_height()), + (p.get_x() + p.get_width() / 2., p.get_height() + csd), ha='center', va='center', xytext=(0, 10), textcoords='offset points') g.savefig(outfile) +def expand_results_lmm(df: pd.DataFrame, rdf: pd.DataFrame, + nt: str, feature: str) -> pd.DataFrame: + """ + Merge df and rdf together. + + :param df: A dataframe containing the frequency of each nucleotide \ + in each feature belonging to a community. + :param rdf: The dataframe containing the mean frequency for \ + each community and the p-value of their enrichment compared to control \ + exons. + :param nt: the nucleotide of interest + :param feature: The feature of interest + :return: The merged dataframe: i.e df with the stats columns + """ + p_col = "Pr(>|t|)" + df = df[[f"id_{feature}", nt, "community", "community_size"]].copy() + rdf = rdf[["community", "community_size", p_col, nt]].copy() + rdf.rename({nt: f"mean_{nt}", p_col: "p-adj"}, axis=1, inplace=True) + df = df.merge(rdf, how="left", on=["community", "community_size"]) + df_ctrl = df[df["community"] == "C-CTRL"] + df = df[df["community"] != "C-CTRL"].copy() + df.sort_values(f"mean_{nt}", ascending=True, inplace=True) + return pd.concat([df_ctrl, df], axis=0, ignore_index=True) + + +def expand_results_perm(df: pd.DataFrame, rdf: pd.DataFrame, + nt: str, feature: str, iteration: int) -> pd.DataFrame: + """ + Merge df and rdf together. + + :param df: A dataframe containing the frequency of each nucleotide \ + in each feature belonging to a community. + :param rdf: The dataframe containing the mean frequency for \ + each community and the p-value of their enrichment compared to control \ + exons. + :param nt: the nucleotide of interest + :param feature: The feature of interest + :param iteration: The number of iteration + :return: The merged dataframe: i.e df with the stats columns + """ + df = df[[f"id_{feature}", nt, "community", "community_size"]].copy() + ctrl_val = rdf[f"{nt}_mean_{iteration}_ctrl"] + rdf = rdf[["community", "community_size", nt, "p-adj"]].copy() + rdf.rename({nt: f"mean_{nt}"}, axis=1, inplace=True) + df = df.merge(rdf, how="left", on=["community", "community_size"]) + df_ctrl = pd.DataFrame({nt: ctrl_val, + f"mean_{nt}": [np.mean(ctrl_val)] * len(ctrl_val), + f"id_{feature}": ['ctrl'] * len(ctrl_val), + "community_size": [len(ctrl_val)] * len(ctrl_val), + "community": ["C-CTRL"] * len(ctrl_val)}) + df.sort_values(f"mean_{nt}", ascending=True, inplace=True) + return pd.concat([df_ctrl, df], axis=0, ignore_index=True) + + def create_and_save_ctrl_dataframe(df: pd.DataFrame, feature: str, region: str, nt: str, outfile: Path, test_type: str, df_ctrl: pd.DataFrame, @@ -461,32 +516,34 @@ def create_and_save_ctrl_dataframe(df: pd.DataFrame, feature: str, containing the test communities and the control community """ if test_type == "lmm": - rdf = lmm_with_ctrl(df, feature, region, nt, - outfile.parents[1] / outfile.name) + ndf, rdf = lmm_with_ctrl(df, feature, region, nt, + outfile.parents[1] / outfile.name) + df_bar = expand_results_lmm(ndf, rdf, nt, feature) else: rdf = perm_with_ctrl(df, feature, nt, df_ctrl, dic_com, iteration) + df_bar = expand_results_perm(df, rdf, nt, feature, iteration) rdf.to_csv(outfile_ctrl, sep="\t", index=False) - barplot_creation(rdf, outfile_ctrl, nt, - test_type, feature, iteration) + barplot_creation(df_bar, outfile_ctrl, nt, + test_type, feature) -def barplot_creation(rdf: pd.DataFrame, outfile: Path, nt: str, - test_type: str, feature: str, iteration: int) -> None: +def barplot_creation(df_bar: pd.DataFrame, outfile: Path, nt: str, + test_type: str, feature: str) -> None: """ Reformat a dataframe with the enrichment of a nucleotide frequency \ - for every community and then create a barplot showing those frequencies. + for every feature for every community and then create a \ + barplot showing those frequencies. - :param rdf: A dataframe with the enrichment of a \ - nucleotide frequency for every community + :param df_bar: A dataframe with the enrichment of a \ + nucleotide frequency for every community and showing the frequency \ + of each feature in each community :param outfile: File were rdf is stored :param nt: The nucleotide for which we are seeking enrichment :param test_type: The kind of test make :param feature: The king of feature of interest - :param iteration: The number of sub samples to create """ - rdf = prepare_dataframe(rdf, test_type, nt, iteration) outfig = outfile.parent / outfile.name.replace(".txt", ".pdf") - make_barplot(rdf, outfig, nt, test_type, feature) + make_barplot(df_bar, outfig, nt, test_type, feature) def create_outfiles(project: str, weight: int, global_weight: int,