diff --git a/src/find_interaction_cluster/nt_and_community.py b/src/find_interaction_cluster/nt_and_community.py index f9ffb266414456024560e4a55d5a78abb4d25016..998d95b31b0df89890a7aea21d9075418581ee41 100644 --- a/src/find_interaction_cluster/nt_and_community.py +++ b/src/find_interaction_cluster/nt_and_community.py @@ -302,7 +302,8 @@ def get_permutation_mean(df_ctrl: pd.DataFrame, def perm_community_pval(row: pd.Series, df_ctrl: pd.DataFrame, - cpnt: str, iteration: int) -> Tuple[float, float, str]: + cpnt: str, iteration: int + ) -> Tuple[float, float, float, str]: """ Randomly sample `size` `feature` from `df_ctrl` to extract `iteration` \ of `nt` frequencies from it. @@ -313,14 +314,14 @@ def perm_community_pval(row: pd.Series, df_ctrl: pd.DataFrame, in each exons/gene in fasterdb. :param cpnt: The component (nt, aa, dnt) of interest :param iteration: The number of sub samples to create - :return: The ctrl mean frequency value of `nt` \ + :return: The ctrl mean frequency value of `nt`, its standard error \ the pvalue and the regulation of the enrichment/impoverishment \ of the community in `row` compared to control exons. """ list_values = get_permutation_mean(df_ctrl, cpnt, row["community_size"], iteration) pval, reg = get_pvalue(np.array(list_values), row[cpnt], iteration) - return float(np.mean(list_values)), pval, reg + return float(np.mean(list_values)), float(np.std(list_values)), pval, reg def perm_pvalues(df: pd.DataFrame, df_ctrl: pd.DataFrame, feature: str, @@ -341,7 +342,7 @@ def perm_pvalues(df: pd.DataFrame, df_ctrl: pd.DataFrame, feature: str, :return: The dataframe containing p-values and regulation \ indicating the enrichment of """ - list_pval, list_reg, mean_ctrl = ([] for _ in range(3)) + list_pval, list_reg, mean_ctrl, std_ctrl = ([] for _ in range(4)) for i in tqdm(range(df.shape[0]), desc="performing permutations"): row = df.iloc[i, :] res = perm_community_pval(row, @@ -350,7 +351,8 @@ def perm_pvalues(df: pd.DataFrame, df_ctrl: pd.DataFrame, feature: str, ].isin(dic_com[row['community']]), :], cpnt, iteration) - [x.append(y) for x, y in zip([mean_ctrl, list_pval, list_reg], res)] + [x.append(y) for x, y in zip([mean_ctrl, std_ctrl, list_pval, + list_reg], res)] adj_pvals = multipletests(list_pval, alpha=0.05, method='fdr_bh', is_sorted=False, @@ -358,6 +360,7 @@ def perm_pvalues(df: pd.DataFrame, df_ctrl: pd.DataFrame, feature: str, adj_regs = [list_reg[i] if adj_pvals[i] <= 0.05 else " . " for i in range(len(list_reg))] df[f'{cpnt}_mean_{iteration}_ctrl'] = mean_ctrl + df[f'{cpnt}_std_{iteration}_ctrl'] = std_ctrl df[f'p-adj'] = adj_pvals df[f'reg-adj'] = adj_regs return df @@ -418,7 +421,7 @@ def prepare_dataframe(df: pd.DataFrame, test_type: str, nt: str, def make_barplot(df_bar: pd.DataFrame, outfile: Path, cpnt_type: str, - cpnt: str, test_type: str, feature: str) -> None: + cpnt: str, feature: str) -> None: """ Create a barplot showing the frequency of `nt` for every community \ of exons/gene in `df_bar`. @@ -429,17 +432,59 @@ def make_barplot(df_bar: pd.DataFrame, outfile: Path, cpnt_type: str, :param cpnt_type: The type of component to analyse; It \ can be 'nt', 'dnt' or 'aa'. :param cpnt: The component (nt, aa, dnt) of interest - :param test_type: The kind of test make :param feature: The king of feature of interest """ - sns.set() - test_name = "permutation" if test_type == "perm" else "lm" + sns.set(context="poster") g = sns.catplot(x="community", y=cpnt, data=df_bar, kind="bar", ci="sd", aspect=2.5, height=12, errwidth=0.5, capsize=.4, - palette=["red"] + ["lightgray"] * (df_bar.shape[0] - 1)) + palette=["red"] + ["darkgray"] * (df_bar.shape[0] - 1)) + g.fig.subplots_adjust(top=0.9) + g.fig.suptitle(f"Mean frequency of {cpnt} ({cpnt_type}) " + f"among community of {feature}s\n" + f"(stats obtained with a lm test)") + g.set(xticklabels=[]) + g.ax.set_ylabel(f'Frequency of {cpnt}') + df_bara = df_bar.drop_duplicates(subset="community", keep="first") + for i, p in enumerate(g.ax.patches): + stats = "*" if df_bara.iloc[i, :]["p-adj"] < 0.05 else "" + com = df_bara.iloc[i, :]["community"] + csd = np.std(df_bar.loc[df_bar["community"] == com, cpnt]) + g.ax.annotate(stats, + (p.get_x() + p.get_width() / 2., p.get_height() + csd), + ha='center', va='center', xytext=(0, 10), fontsize=12, + textcoords='offset points') + g.savefig(outfile) + + +def make_barplot_perm(df_bar: pd.DataFrame, outfile: Path, cpnt_type: str, + cpnt: str, feature: str) -> None: + """ + Create a barplot showing the frequency of `nt` for every community \ + of exons/gene in `df_bar`. + + :param df_bar: A dataframe with the enrichment of a \ + nucleotide frequency for every community + :param outfile: File were the figure will be stored + :param cpnt_type: The type of component to analyse; It \ + can be 'nt', 'dnt' or 'aa'. + :param cpnt: The component (nt, aa, dnt) of interest + :param feature: The king of feature of interest + """ + sns.set(context="poster") + df_ctrl = df_bar.loc[df_bar[f"id_{feature}"] == 'ctrl', :] + df_bar = df_bar.loc[df_bar[f"id_{feature}"] != 'ctrl', :] + g = sns.catplot(x="community", y=cpnt, data=df_bar, kind="bar", + ci="sd", aspect=2.5, height=14, errwidth=0.5, capsize=.4, + palette= ["darkgray"] * (df_bar.shape[0])) + xrange = g.ax.get_xlim() + df_ctrl.plot(x="community", y=cpnt, kind="scatter", ax=g.ax, + yerr= "ctrl_std", legend=False, zorder=10, + color=(0.8, 0.2, 0.2, 0.4)) + g.ax.set_xlim(xrange) + g.fig.subplots_adjust(top=0.9) g.fig.suptitle(f"Mean frequency of {cpnt} ({cpnt_type}) " f"among community of {feature}s\n" - f"(stats obtained with as {test_name} test)") + f"(stats obtained with a permutation test)") g.set(xticklabels=[]) g.ax.set_ylabel(f'Frequency of {cpnt}') df_bara = df_bar.drop_duplicates(subset="community", keep="first") @@ -449,7 +494,7 @@ def make_barplot(df_bar: pd.DataFrame, outfile: Path, cpnt_type: str, csd = np.std(df_bar.loc[df_bar["community"] == com, cpnt]) g.ax.annotate(stats, (p.get_x() + p.get_width() / 2., p.get_height() + csd), - ha='center', va='center', xytext=(0, 10), + ha='center', va='center', xytext=(0, 10), fontsize=12, textcoords='offset points') g.savefig(outfile) @@ -479,6 +524,33 @@ def expand_results_lm(df: pd.DataFrame, rdf: pd.DataFrame, return pd.concat([df_ctrl, df], axis=0, ignore_index=True) +def create_perm_ctrl_df(ctrl_df: pd.DataFrame, order_df: pd.DataFrame, + cpnt: str, feature: str, iteration: int + ) -> pd.DataFrame: + """ + + :param ctrl_df: A dataframe containing the mean ctrl values, \ + the mean control std and the community from which those control \ + have been created + :param order_df: A dataframe containing the community and their final \ + order. + :param cpnt: The component (nt, aa, dnt) of interest + :param feature: The feature of interest + :param iteration: The number of iteration + :return: The ctrl_tmp_df in good order + """ + dsize = ctrl_df.shape[0] + ctrl_df[f"mean_{cpnt}"] = \ + [np.mean(ctrl_df[f"{cpnt}_mean_{iteration}_ctrl"])] * dsize + ctrl_df[f"id_{feature}"] = ['ctrl'] * dsize + ctrl_df["community_size"] = [dsize] * dsize + ctrl_df = ctrl_df.merge(order_df, how='left', on="community") + ctrl_df.rename({f"{cpnt}_mean_{iteration}_ctrl": cpnt, + f"{cpnt}_std_{iteration}_ctrl": 'ctrl_std'}, axis=1, + inplace=True) + return ctrl_df.sort_values("order", ascending=True) + + def expand_results_perm(df: pd.DataFrame, rdf: pd.DataFrame, cpnt: str, feature: str, iteration: int) -> pd.DataFrame: """ @@ -495,18 +567,15 @@ def expand_results_perm(df: pd.DataFrame, rdf: pd.DataFrame, cpnt: str, :return: The merged dataframe: i.e df with the stats columns """ df = df[[f"id_{feature}", cpnt, "community", "community_size"]].copy() - ctrl_val = rdf[f"{cpnt}_mean_{iteration}_ctrl"] + ctrl_df = rdf[[f"{cpnt}_mean_{iteration}_ctrl", + f"{cpnt}_std_{iteration}_ctrl", "community"]].copy() rdf = rdf[["community", "community_size", cpnt, "p-adj"]].copy() rdf.rename({cpnt: f"mean_{cpnt}"}, axis=1, inplace=True) df = df.merge(rdf, how="left", on=["community", "community_size"]) - df_ctrl = pd.DataFrame( - {cpnt: ctrl_val, - f"mean_{cpnt}": [np.mean(ctrl_val)] * len(ctrl_val), - f"id_{feature}": ['ctrl'] * len(ctrl_val), - "community_size": [len(ctrl_val)] * len(ctrl_val), - "community": ["C-CTRL"] * len(ctrl_val)} - ) df.sort_values(f"mean_{cpnt}", ascending=True, inplace=True) + order_df = df[["community"]].drop_duplicates().copy() + order_df["order"] = range(order_df.shape[0]) + df_ctrl = create_perm_ctrl_df(ctrl_df, order_df, cpnt, feature, iteration) return pd.concat([df_ctrl, df], axis=0, ignore_index=True) @@ -544,6 +613,8 @@ def create_and_save_ctrl_dataframe(df: pd.DataFrame, feature: str, rdf = perm_with_ctrl(df, feature, cpnt, df_ctrl, dic_com, iteration) df_bar = expand_results_perm(df, rdf, cpnt, feature, iteration) rdf.to_csv(outfile_ctrl, sep="\t", index=False) + bar_outfile = str(outfile_ctrl).replace(".txt", "_bar.txt") + df_bar.to_csv(bar_outfile, sep="\t", index=False) barplot_creation(df_bar, outfile_ctrl, cpnt_type, cpnt, test_type, feature) @@ -564,9 +635,13 @@ def barplot_creation(df_bar: pd.DataFrame, outfile: Path, cpnt_type: str, :param cpnt: The component (nt, aa, dnt) of interest :param test_type: The kind of test make :param feature: The king of feature of interest + :param test_type: The type of test to make (permutation or lm) """ outfig = outfile.parent / outfile.name.replace(".txt", ".pdf") - make_barplot(df_bar, outfig, cpnt_type, cpnt, test_type, feature) + if test_type == "lm": + make_barplot(df_bar, outfig, cpnt_type, cpnt, feature) + else: + make_barplot_perm(df_bar, outfig, cpnt_type, cpnt, feature) def create_outfiles(project: str, weight: int, global_weight: int, @@ -765,7 +840,7 @@ def multiple_nt_lm_launcher(ps: int, logging.info(f"Checking if communities as an effect on {component_type} " "frequency") project = get_projects(global_weight, project) - cpnt_list = get_features(component_type) + cpnt_list = [get_features(component_type)[0]] condition = list(product([project], [weight], cpnt_list)) processes = [] pool = mp.Pool(processes=min(ps, len(condition)))