diff --git a/src/find_interaction_cluster/nt_and_community.py b/src/find_interaction_cluster/nt_and_community.py index 7ca5a53d151ff62dae52ad9985bb6b6834b38b30..2f9d7d0e0e6608e58603d11585d8792e3c73749f 100644 --- a/src/find_interaction_cluster/nt_and_community.py +++ b/src/find_interaction_cluster/nt_and_community.py @@ -21,10 +21,7 @@ from .community_finder import get_projects from ..logging_conf import logging_def from itertools import product import multiprocessing as mp -import numpy as np -from .radomization_test_ppi import get_pvalue -from tqdm import tqdm -import seaborn as sns +from .community_figures.fig_functions import create_community_fig from ..nt_composition.config import get_features @@ -151,45 +148,6 @@ def lm_maker(df: pd.DataFrame, outfile: Path, nt: str) -> float: return res["Pr(>Chisq)"][1] -def lm_maker_summary(df: pd.DataFrame, outfile: Path, cpnt: str - ) -> pd.DataFrame: - """ - Make the lm analysis to see if the exon regulated by a splicing factor \ - are equally distributed among the communities. - - :param df: The dataframe - :param outfile: A name of a file - :param cpnt: The component (nt, aa, dnt) of interest - :return: the pvalue of lm - """ - pandas2ri.activate() - lmf = r( - """ - require("DHARMa") - - function(data, folder, partial_name) { - mod <- lm(%s ~ log(community_size) + community, data=data) - simulationOutput <- simulateResiduals(fittedModel = mod, n = 250) - png(paste0(folder, "/dignostics_summary", partial_name, ".png")) - plot(simulationOutput) - dev.off() - return(as.data.frame(summary(mod)$coefficients)) - } - - """ % cpnt) - folder = outfile.parent / "diagnostics" - folder.mkdir(parents=True, exist_ok=True) - partial_name = outfile.name.replace('.txt', '') - df.to_csv(f'frequency_{cpnt}.txt', sep="\t", index=False) - res_df = lmf(df, str(folder), partial_name).reset_index() - res_df.rename({'index': 'community'}, inplace=True, axis=1) - res_df['community'] = res_df['community'].str.replace('community', '') - res_df.loc[res_df['community'] == "(Intercept)", "community"] = "C-CTRL" - mean_df = df[[cpnt, "community", "community_size"]]. \ - groupby(["community", "community_size"]).mean().reset_index() - return res_df.merge(mean_df, how="left", on="community") - - def get_ft_id(cnx: sqlite3.Connection, feature: str = "exon") -> List[str]: """ Return the id of every gene/exons in chia-pet database. @@ -232,28 +190,6 @@ def create_ctrl_community(df: pd.DataFrame, return df -def lm_with_ctrl(df: pd.DataFrame, feature: str, region: str, - cpnt: str, outfile_diag: Path, cpnt_type: str - ) -> Tuple[pd.DataFrame, pd.DataFrame]: - """ - - :param df: df: A dataframe containing the frequency of each nucleotide \ - in each exons belonging to a community. - :param feature: The kind of feature analysed - :param region: the region of interest to extract from gene - :param cpnt: The component (nt, aa, dnt) of interest - :param outfile_diag: File from which the diagnostics folder will be \ - inferred - :param cpnt_type: The type of component to analyse; It \ - can be 'nt', 'dnt' or 'aa'. - :return: The dataframe with ctrl exon and \ - The dataframe with the p-value compared to the control \ - list of feature. - """ - ndf = create_ctrl_community(df, feature, region, cpnt_type) - return ndf, lm_maker_summary(ndf, outfile_diag, cpnt) - - def get_feature_by_community(df: pd.DataFrame, feature: str) -> Dict: """ Create a dictionary containing the exons contained in each community. @@ -282,112 +218,6 @@ def get_feature_by_community(df: pd.DataFrame, feature: str) -> Dict: return dic -def get_permutation_mean(df_ctrl: pd.DataFrame, - cpnt: str, size: int, iteration: int) -> List[float]: - """ - Randomly sample `size` `feature` from `df_ctrl` to extract `iteration` \ - of `nt` frequencies from it. - - :param df_ctrl: A dataframe containing the frequency of each nucleotide \ - in each exons/gene in fasterdb. - :param cpnt: The component (nt, aa, dnt) of interest - :param size: The size of each sub samples to create - :param iteration: The number of sub samples to create - :return: The list of mean frequencies of `nt` in each subsample - """ - return [ - float(np.mean(df_ctrl[cpnt].sample(size, replace=True).values)) - for _ in range(iteration) - ] - - -def perm_community_pval(row: pd.Series, df_ctrl: pd.DataFrame, - cpnt: str, iteration: int - ) -> Tuple[float, float, float, str]: - """ - Randomly sample `size` `feature` from `df_ctrl` to extract `iteration` \ - of `nt` frequencies from it. - - :param row: A line of a dataframe containing the frequency of \ - each feature inside a community. - :param df_ctrl: A dataframe containing the frequency of each nucleotide \ - in each exons/gene in fasterdb. - :param cpnt: The component (nt, aa, dnt) of interest - :param iteration: The number of sub samples to create - :return: The ctrl mean frequency value of `nt`, its standard error \ - the pvalue and the regulation of the enrichment/impoverishment \ - of the community in `row` compared to control exons. - """ - list_values = get_permutation_mean(df_ctrl, cpnt, row["community_size"], - iteration) - pval, reg = get_pvalue(np.array(list_values), row[cpnt], iteration) - return float(np.mean(list_values)), float(np.std(list_values)), pval, reg - - -def perm_pvalues(df: pd.DataFrame, df_ctrl: pd.DataFrame, feature: str, - cpnt: str, iteration: int, dic_com: Dict) -> pd.DataFrame: - """ - Randomly sample `size` `feature` from `df_ctrl` to extract `iteration` \ - of `nt` frequencies from it. - - :param df: A dataframe containing the frequency of each nucleotide \ - in each exons belonging to a community. - :param df_ctrl: A dataframe containing the frequency of each nucleotide \ - in each exons/gene in fasterdb. - :param feature: The feature of interest (gene, exon) - :param cpnt: The component (nt, aa, dnt) of interest - :param iteration: The number of sub samples to create - :param dic_com: A dictionary linking each community to the exons \ - it contains. - :return: The dataframe containing p-values and regulation \ - indicating the enrichment of - """ - list_pval, list_reg, mean_ctrl, std_ctrl = ([] for _ in range(4)) - for i in tqdm(range(df.shape[0]), desc="performing permutations"): - row = df.iloc[i, :] - res = perm_community_pval(row, - df_ctrl.loc[ - -df_ctrl[f'id_{feature}' - ].isin(dic_com[row['community']]), - :], - cpnt, iteration) - [x.append(y) for x, y in zip([mean_ctrl, std_ctrl, list_pval, - list_reg], res)] - adj_pvals = multipletests(list_pval, alpha=0.05, - method='fdr_bh', - is_sorted=False, - returnsorted=False)[1] - adj_regs = [list_reg[i] if adj_pvals[i] <= 0.05 else " . " - for i in range(len(list_reg))] - df[f'{cpnt}_mean_{iteration}_ctrl'] = mean_ctrl - df[f'{cpnt}_std_{iteration}_ctrl'] = std_ctrl - df[f'p-adj'] = adj_pvals - df[f'reg-adj'] = adj_regs - return df - - -def perm_with_ctrl(df: pd.DataFrame, feature: str, - cpnt: str, df_ctrl: pd.DataFrame, dic_com: Dict, - iteration: int) -> pd.DataFrame: - """ - - :param df: A dataframe containing the frequency of each nucleotide \ - in each exons belonging to a community. - :param feature: The kind of feature analysed - :param cpnt: The component (nt, aa, dnt) of interest - :param df_ctrl: A dataframe containing the frequency of each nucleotide \ - in each exons/gene in fasterdb. - :param dic_com: A dictionary linking each community to the exons \ - it contains. - :param iteration: The number of sub samples to create - :return: The dataframe with the p-value compared to the control \ - list of exons. - """ - mean_df = df[[cpnt, "community", "community_size"]]. \ - groupby(["community", "community_size"]).mean().reset_index() - return perm_pvalues(mean_df, df_ctrl, feature, cpnt, iteration, dic_com) - - def prepare_dataframe(df: pd.DataFrame, test_type: str, nt: str, iteration: int) -> pd.DataFrame: """ @@ -420,233 +250,6 @@ def prepare_dataframe(df: pd.DataFrame, test_type: str, nt: str, return df_bar -def make_barplot(df_bar: pd.DataFrame, outfile: Path, cpnt_type: str, - cpnt: str, feature: str) -> None: - """ - Create a barplot showing the frequency of `nt` for every community \ - of exons/gene in `df_bar`. - - :param df_bar: A dataframe with the enrichment of a \ - nucleotide frequency for every community - :param outfile: File were the figure will be stored - :param cpnt_type: The type of component to analyse; It \ - can be 'nt', 'dnt' or 'aa'. - :param cpnt: The component (nt, aa, dnt) of interest - :param feature: The king of feature of interest - """ - sns.set(context="poster") - g = sns.catplot(x="community", y=cpnt, data=df_bar, kind="bar", - ci="sd", aspect=2.5, height=12, errwidth=0.5, capsize=.4, - palette=["red"] + ["darkgray"] * (df_bar.shape[0] - 1)) - g.fig.subplots_adjust(top=0.9) - g.fig.suptitle(f"Mean frequency of {cpnt} ({cpnt_type}) " - f"among community of {feature}s\n" - f"(stats obtained with a lm test)") - g.set(xticklabels=[]) - g.ax.set_ylabel(f'Frequency of {cpnt}') - df_bara = df_bar.drop_duplicates(subset="community", keep="first") - for i, p in enumerate(g.ax.patches): - stats = "*" if df_bara.iloc[i, :]["p-adj"] < 0.05 else "" - com = df_bara.iloc[i, :]["community"] - csd = np.std(df_bar.loc[df_bar["community"] == com, cpnt]) - g.ax.annotate(stats, - (p.get_x() + p.get_width() / 2., p.get_height() + csd), - ha='center', va='center', xytext=(0, 10), fontsize=12, - textcoords='offset points') - g.savefig(outfile) - - -def make_barplot_perm(df_bar: pd.DataFrame, outfile: Path, cpnt_type: str, - cpnt: str, feature: str) -> None: - """ - Create a barplot showing the frequency of `nt` for every community \ - of exons/gene in `df_bar`. - - :param df_bar: A dataframe with the enrichment of a \ - nucleotide frequency for every community - :param outfile: File were the figure will be stored - :param cpnt_type: The type of component to analyse; It \ - can be 'nt', 'dnt' or 'aa'. - :param cpnt: The component (nt, aa, dnt) of interest - :param feature: The king of feature of interest - """ - sns.set(context="poster") - df_ctrl = df_bar.loc[df_bar[f"id_{feature}"] == 'ctrl', :] - df_bar = df_bar.loc[df_bar[f"id_{feature}"] != 'ctrl', :].copy() - g2 = sns.catplot(x="community", y=cpnt, data=df_bar, kind="bar", - ci="sd", aspect=2.5, height=14, errwidth=0.5, capsize=.4, - palette= ["darkgray"] * (df_bar.shape[0])) - g = sns.catplot(x="community", y=cpnt, data=df_bar, kind="point", - ci="sd", aspect=2.5, height=14, errwidth=0.5, capsize=.4, - scale=0.5, palette= ["darkgray"] * (df_bar.shape[0])) - xrange = g.ax.get_xlim() - df_ctrl.plot(x="community", y=cpnt, kind="scatter", ax=g.ax, - yerr= "ctrl_std", legend=False, zorder=10, - color=(0.8, 0.2, 0.2, 0.4)) - g.ax.set_xlim(xrange) - g.fig.subplots_adjust(top=0.9) - g.fig.suptitle(f"Mean frequency of {cpnt} ({cpnt_type}) " - f"among community of {feature}s\n" - f"(stats obtained with a permutation test)") - g.set(xticklabels=[]) - g.ax.set_ylabel(f'Frequency of {cpnt}') - df_bara = df_bar.drop_duplicates(subset="community", keep="first") - for i, p in enumerate(g2.ax.patches): - stats = "*" if df_bara.iloc[i, :]["p-adj"] < 0.05 else "" - com = df_bara.iloc[i, :]["community"] - csd = np.std(df_bar.loc[df_bar["community"] == com, cpnt]) - g.ax.annotate(stats, - (p.get_x() + p.get_width() / 2., p.get_height() + csd), - ha='center', va='center', xytext=(0, 10), fontsize=12, - textcoords='offset points') - g.savefig(outfile) - - -def expand_results_lm(df: pd.DataFrame, rdf: pd.DataFrame, - cpnt: str, feature: str) -> pd.DataFrame: - """ - Merge df and rdf together. - - :param df: A dataframe containing the frequency of each nucleotide \ - in each feature belonging to a community. - :param rdf: The dataframe containing the mean frequency for \ - each community and the p-value of their enrichment compared to control \ - exons. - :param cpnt: The component (nt, aa, dnt) of interest - :param feature: The feature of interest - :return: The merged dataframe: i.e df with the stats columns - """ - p_col = "Pr(>|t|)" - df = df[[f"id_{feature}", cpnt, "community", "community_size"]].copy() - rdf = rdf[["community", "community_size", p_col, cpnt]].copy() - rdf.rename({cpnt: f"mean_{cpnt}", p_col: "p-adj"}, axis=1, inplace=True) - df = df.merge(rdf, how="left", on=["community", "community_size"]) - df_ctrl = df[df["community"] == "C-CTRL"] - df = df[df["community"] != "C-CTRL"].copy() - df.sort_values(f"mean_{cpnt}", ascending=True, inplace=True) - return pd.concat([df_ctrl, df], axis=0, ignore_index=True) - - -def create_perm_ctrl_df(ctrl_df: pd.DataFrame, order_df: pd.DataFrame, - cpnt: str, feature: str, iteration: int - ) -> pd.DataFrame: - """ - - :param ctrl_df: A dataframe containing the mean ctrl values, \ - the mean control std and the community from which those control \ - have been created - :param order_df: A dataframe containing the community and their final \ - order. - :param cpnt: The component (nt, aa, dnt) of interest - :param feature: The feature of interest - :param iteration: The number of iteration - :return: The ctrl_tmp_df in good order - """ - dsize = ctrl_df.shape[0] - ctrl_df[f"mean_{cpnt}"] = \ - [np.mean(ctrl_df[f"{cpnt}_mean_{iteration}_ctrl"])] * dsize - ctrl_df[f"id_{feature}"] = ['ctrl'] * dsize - ctrl_df["community_size"] = [dsize] * dsize - ctrl_df = ctrl_df.merge(order_df, how='left', on="community") - ctrl_df.rename({f"{cpnt}_mean_{iteration}_ctrl": cpnt, - f"{cpnt}_std_{iteration}_ctrl": 'ctrl_std'}, axis=1, - inplace=True) - return ctrl_df.sort_values("order", ascending=True) - - -def expand_results_perm(df: pd.DataFrame, rdf: pd.DataFrame, cpnt: str, - feature: str, iteration: int) -> pd.DataFrame: - """ - Merge df and rdf together. - - :param df: A dataframe containing the frequency of each nucleotide \ - in each feature belonging to a community. - :param rdf: The dataframe containing the mean frequency for \ - each community and the p-value of their enrichment compared to control \ - exons. - :param cpnt: The component (nt, aa, dnt) of interest - :param feature: The feature of interest - :param iteration: The number of iteration - :return: The merged dataframe: i.e df with the stats columns - """ - df = df[[f"id_{feature}", cpnt, "community", "community_size"]].copy() - ctrl_df = rdf[[f"{cpnt}_mean_{iteration}_ctrl", - f"{cpnt}_std_{iteration}_ctrl", "community"]].copy() - rdf = rdf[["community", "community_size", cpnt, "p-adj"]].copy() - rdf.rename({cpnt: f"mean_{cpnt}"}, axis=1, inplace=True) - df = df.merge(rdf, how="left", on=["community", "community_size"]) - df.sort_values(f"mean_{cpnt}", ascending=True, inplace=True) - order_df = df[["community"]].drop_duplicates().copy() - order_df["order"] = range(order_df.shape[0]) - df_ctrl = create_perm_ctrl_df(ctrl_df, order_df, cpnt, feature, iteration) - return pd.concat([df_ctrl, df], axis=0, ignore_index=True) - - -def create_and_save_ctrl_dataframe(df: pd.DataFrame, feature: str, - region: str, cpnt_type: str, cpnt: str, - outfile_ctrl: Path, test_type: str, - df_ctrl: pd.DataFrame, dic_com: Dict, - iteration: int) -> None: - """ - Create a dataframe with a control community, save it as a table and \ - as a barplot figure. - - :param df: A dataframe containing the frequency of each nucleotide \ - in each exons belonging to a community. - :param feature: The kind of feature analysed - :param region: the region of interest to extract from gene - :param cpnt_type: The type of component to analyse; It \ - can be 'nt', 'dnt' or 'aa'. - :param cpnt: The component (nt, aa, dnt) of interest - :param outfile_ctrl: file used to stored the table and the figure \ - containing the test communities and the control community - :param test_type: The type of test to make (permutation or lm) - :param df_ctrl: A dataframe containing the frequency of each nucleotide \ - in each exons/gene in fasterdb. - :param dic_com: A dictionary linking each community to the exons \ - it contains. - :param iteration: The number of sub samples to create - """ - if test_type == "lm": - ndf, rdf = lm_with_ctrl(df, feature, region, cpnt, - outfile_ctrl.parents[1] / outfile_ctrl.name, - cpnt_type) - df_bar = expand_results_lm(ndf, rdf, cpnt, feature) - else: - rdf = perm_with_ctrl(df, feature, cpnt, df_ctrl, dic_com, iteration) - df_bar = expand_results_perm(df, rdf, cpnt, feature, iteration) - rdf.to_csv(outfile_ctrl, sep="\t", index=False) - bar_outfile = str(outfile_ctrl).replace(".txt", "_bar.txt") - df_bar.to_csv(bar_outfile, sep="\t", index=False) - barplot_creation(df_bar, outfile_ctrl, cpnt_type, cpnt, - test_type, feature) - - -def barplot_creation(df_bar: pd.DataFrame, outfile: Path, cpnt_type: str, - cpnt: str, test_type: str, feature: str) -> None: - """ - Reformat a dataframe with the enrichment of a nucleotide frequency \ - for every feature for every community and then create a \ - barplot showing those frequencies. - - :param df_bar: A dataframe with the enrichment of a \ - nucleotide frequency for every community and showing the frequency \ - of each feature in each community - :param outfile: File were rdf is stored - :param cpnt_type: The type of component to analyse; It \ - can be 'nt', 'dnt' or 'aa'. - :param cpnt: The component (nt, aa, dnt) of interest - :param test_type: The kind of test make - :param feature: The king of feature of interest - :param test_type: The type of test to make (permutation or lm) - """ - outfig = outfile.parent / outfile.name.replace(".txt", ".pdf") - if test_type == "lm": - make_barplot(df_bar, outfig, cpnt_type, cpnt, feature) - else: - make_barplot_perm(df_bar, outfig, cpnt_type, cpnt, feature) - - def create_outfiles(project: str, weight: int, global_weight: int, same_gene: bool, feature: str, cpnt_type: str, cpnt: str, test_type: str @@ -679,16 +282,16 @@ def create_outfiles(project: str, weight: int, global_weight: int, outfolder) outfile_ctrl = ConfigGraph.\ get_community_file(project, weight, global_weight, same_gene, feature, - f"{cpnt}-{cpnt_type}_VS_CTRL_stat_{test_type}.txt", + f"{cpnt}-{cpnt_type}_VS_CTRL_stat_{test_type}.pdf", outfolder) return outfile, outfile_ctrl def get_stat_cpnt_communities(df: pd.DataFrame, project: str, weight: int, global_weight: int, same_gene: bool, - cpnt_type: str, cpnt: str, df_ctrl: pd.DataFrame, + cpnt_type: str, cpnt: str, dic_com: Dict, feature: str = "exon", - region: str = "", test_type: str = "", + test_type: str = "", iteration: int = 1000) -> pd.Series: """ Get data (proportion of `reg` regulated exons by a splicing factor @@ -707,12 +310,9 @@ def get_stat_cpnt_communities(df: pd.DataFrame, project: str, weight: int, :param cpnt_type: The type of component to analyse; It \ can be 'nt', 'dnt' or 'aa'. :param cpnt: The component (nt, aa, dnt) of interest - :param df_ctrl: A dataframe containing the frequency of each nucleotide \ - in each exons/gene in fasterdb. :param dic_com: A dictionary linking each community to the exons \ it contains. :param feature: The kind of feature analysed - :param region: the region of interest to extract from gene :param test_type: The type of test to make (permutation or lm) :param iteration: The number of sub samples to create """ @@ -723,10 +323,8 @@ def get_stat_cpnt_communities(df: pd.DataFrame, project: str, weight: int, cpnt, test_type) res = {"project": project, "cpnt": cpnt, 'pval': lm_maker(df, outfile, cpnt)} - create_and_save_ctrl_dataframe(df, feature, region, cpnt_type, - cpnt, outfile_ctrl, test_type, df_ctrl, - dic_com, iteration) - + create_community_fig(df, feature, cpnt, outfile_ctrl, test_type, + dic_com, cpnt_type, iteration) return pd.Series(res) @@ -775,7 +373,7 @@ def create_dataframe(project: str, weight: int, global_weight: int, def create_dataframes(project, weight, global_weight, same_gene, feature, region, test_type, component_type: str - ) -> Tuple[pd.DataFrame, pd.DataFrame, Dict]: + ) -> Tuple[pd.DataFrame, Dict]: """ :param weight: The weight of interaction to consider :param global_weight: The global weighs to consider. if \ @@ -792,15 +390,15 @@ def create_dataframes(project, weight, global_weight, same_gene, feature, """ df = create_dataframe(project, weight, global_weight, same_gene, feature, region, component_type) - if test_type == 'lm': - df_ctrl = pd.DataFrame() - dic_com = {} - else: - df_ctrl = create_dataframe(project, weight, global_weight, same_gene, - feature, region, component_type, - from_communities=False) - dic_com = get_feature_by_community(df, feature) - return df, df_ctrl, dic_com + df_ctrl = create_dataframe(project, weight, global_weight, same_gene, + feature, region, component_type, + from_communities=False) + df_ctrl = df_ctrl.loc[-df_ctrl[f"id_{feature}"].isin(df[f"id_{feature}"]), + :].copy() + dic_com = {} if test_type == 'lm' \ + else get_feature_by_community(df, feature) + df = pd.concat([df, df_ctrl], axis=0, ignore_index=True) + return df, dic_com def multiple_nt_lm_launcher(ps: int, @@ -849,16 +447,16 @@ def multiple_nt_lm_launcher(ps: int, processes = [] pool = mp.Pool(processes=min(ps, len(condition))) logging.debug("Creating tables") - df, df_ctrl, dic_com = create_dataframes(project, weight, global_weight, - same_gene, feature, region, - test_type, component_type) + df, dic_com = create_dataframes(project, weight, global_weight, + same_gene, feature, region, + test_type, component_type) for project, weight, cpnt in condition: nfile_table = ConfigGraph.get_community_file( project, weight, global_weight, same_gene, feature, f"_{component_type}_table.txt", "community_enrichment") df.to_csv(nfile_table, sep="\t", index=False) args = [df, project, weight, global_weight, same_gene, component_type, - cpnt, df_ctrl, dic_com, feature, region, test_type, iteration] + cpnt, dic_com, feature, test_type, iteration] processes.append(pool.apply_async(get_stat_cpnt_communities, args)) results = [p.get(timeout=None) for p in processes] pool.close()