diff --git a/src/find_interaction_cluster/nt_and_community.py b/src/find_interaction_cluster/nt_and_community.py index a96e351303428e77a41660cc7ec31e0c7b0ac4c7..36044dc8265f2451041846b2a4b1d27eeba0767d 100644 --- a/src/find_interaction_cluster/nt_and_community.py +++ b/src/find_interaction_cluster/nt_and_community.py @@ -25,10 +25,12 @@ import numpy as np from .radomization_test_ppi import get_pvalue from tqdm import tqdm import seaborn as sns +from ..nt_composition.config import get_features -def get_nt_frequency(cnx: sqlite3.Connection, list_ft: List[str], - feature: str, region: str = "") -> pd.DataFrame: +def get_cpnt_frequency(cnx: sqlite3.Connection, list_ft: List[str], + feature: str, region: str = "", + component_type: str = "nt") -> pd.DataFrame: """ Get the frequency of every nucleotides for features in list_ft. @@ -36,20 +38,28 @@ def get_nt_frequency(cnx: sqlite3.Connection, list_ft: List[str], :param list_ft: The list of exons for which we want to get :param feature: the kind of feature analysed :param region: The region of gene analysed if feature is gene + :param component_type: The type of component to analyse; It \ + can be 'nt', 'dnt' or 'aa'. :return: the frequency of nucleotides for the list of exons. - >>> d = get_nt_frequency(sqlite3.connect(ConfigGraph.db_file), + >>> d = get_cpnt_frequency(sqlite3.connect(ConfigGraph.db_file), ... ["1_1", "1_2"], "exon") >>> d[["id_exon", 'A', 'C', 'G', 'T']] ft id_exon A C G T 0 1_1 16.63480 34.60803 32.12237 16.63480 1 1_2 16.06426 26.10442 39.75904 18.07229 - >>> d = get_nt_frequency(sqlite3.connect(ConfigGraph.db_file), + >>> d = get_cpnt_frequency(sqlite3.connect(ConfigGraph.db_file), ... ['1', '2'], "gene") >>> d[["id_gene", 'A', 'C', 'G', 'T']] ft id_gene A C G T 0 1 29.49376 18.34271 18.43874 33.72479 1 2 31.90401 16.40251 18.79033 32.90315 + >>> d = get_cpnt_frequency(sqlite3.connect(ConfigGraph.db_file), + ... ['1', '2'], "gene", 'exon', 'aa') + >>> d[["id_gene", "R", "K", "D", "Q", "E"]] + ft id_gene R K D Q E + 0 1 4.75247 5.19300 5.95391 4.07997 6.96189 + 1 2 4.34203 6.23736 6.77708 5.21984 7.01769 """ query_region = "" if feature == "gene": @@ -61,7 +71,7 @@ def get_nt_frequency(cnx: sqlite3.Connection, list_ft: List[str], SELECT ft, id_{feature}, frequency FROM cin_{feature}_frequency WHERE id_{feature} IN {tuple(list_ft)} - AND ft_type="nt" + AND ft_type = '{component_type}' {query_region} """ df = pd.read_sql_query(query, cnx) @@ -142,7 +152,7 @@ def lmm_maker(df: pd.DataFrame, outfile: Path, nt: str) -> float: return res["Pr(>Chisq)"][1] -def lmm_maker_summary(df: pd.DataFrame, outfile: Path, nt: str +def lmm_maker_summary(df: pd.DataFrame, outfile: Path, cpnt: str ) -> pd.DataFrame: """ Make the lmm analysis to see if the exon regulated by a splicing factor \ @@ -150,7 +160,7 @@ def lmm_maker_summary(df: pd.DataFrame, outfile: Path, nt: str :param df: The dataframe :param outfile: A name of a file - :param nt: the nucleotide of interest + :param cpnt: The component (nt, aa, dnt) of interest :return: the pvalue of lmm """ pandas2ri.activate() @@ -167,7 +177,7 @@ def lmm_maker_summary(df: pd.DataFrame, outfile: Path, nt: str return(as.data.frame(summary(mod)$coefficients)) } - """ % nt) + """ % cpnt) folder = outfile.parent / "diagnostics" folder.mkdir(parents=True, exist_ok=True) partial_name = outfile.name.replace('.txt', '') @@ -175,7 +185,7 @@ def lmm_maker_summary(df: pd.DataFrame, outfile: Path, nt: str res_df.rename({'index': 'community'}, inplace=True, axis=1) res_df['community'] = res_df['community'].str.replace('community', '') res_df.loc[res_df['community'] == "(Intercept)", "community"] = "C-CTRL" - mean_df = df[[nt, "community", "community_size"]]. \ + mean_df = df[[cpnt, "community", "community_size"]]. \ groupby(["community", "community_size"]).mean().reset_index() return res_df.merge(mean_df, how="left", on="community") @@ -196,14 +206,16 @@ def get_ft_id(cnx: sqlite3.Connection, feature: str = "exon") -> List[str]: def create_ctrl_community(df: pd.DataFrame, - feature: str = 'exon', region: str = "" - ) -> pd.DataFrame: + feature: str = 'exon', region: str = "", + cpnt_type: str = 'nt') -> pd.DataFrame: """ :param df: A dataframe containing the frequency of each feature in a \ community. :param feature: The kind of feature to analyse :param region: only use if feature is 'gene'. Used to focus on \ a given region in genes (can be gene, exon, intron). + :param cpnt_type: The type of component to analyse; It \ + can be 'nt', 'dnt' or 'aa'. :return: A dataframe containing the frequency of every nucleotides \ of every exon in a large community """ @@ -212,7 +224,7 @@ def create_ctrl_community(df: pd.DataFrame, ft_id = get_ft_id(cnx, feature) list_ft = [cid for cid in ft_id if cid not in df[f"id_{feature}"].to_list()] - df_nt = get_nt_frequency(cnx, list_ft, feature, region) + df_nt = get_cpnt_frequency(cnx, list_ft, feature, region, cpnt_type) df_com = get_community_table([list_ft], size_threshold, feature) df_nt = df_nt.merge(df_com, how="left", on=f"id_{feature}") df_nt['community'] = ['C-CTRL'] * df_nt.shape[0] @@ -221,7 +233,7 @@ def create_ctrl_community(df: pd.DataFrame, def lmm_with_ctrl(df: pd.DataFrame, feature: str, region: str, - nt: str, outfile_diag: Path + cpnt: str, outfile_diag: Path, cpnt_type: str ) -> Tuple[pd.DataFrame, pd.DataFrame]: """ @@ -229,15 +241,17 @@ def lmm_with_ctrl(df: pd.DataFrame, feature: str, region: str, in each exons belonging to a community. :param feature: The kind of feature analysed :param region: the region of interest to extract from gene - :param nt: The nucleotide of interest + :param cpnt: The component (nt, aa, dnt) of interest :param outfile_diag: File from which the diagnostics folder will be \ inferred + :param cpnt_type: The type of component to analyse; It \ + can be 'nt', 'dnt' or 'aa'. :return: The dataframe with ctrl exon and \ The dataframe with the p-value compared to the control \ list of feature. """ - ndf = create_ctrl_community(df, feature, region) - return ndf, lmm_maker_summary(ndf, outfile_diag, nt) + ndf = create_ctrl_community(df, feature, region, cpnt_type) + return ndf, lmm_maker_summary(ndf, outfile_diag, cpnt) def get_feature_by_community(df: pd.DataFrame, feature: str) -> Dict: @@ -269,26 +283,26 @@ def get_feature_by_community(df: pd.DataFrame, feature: str) -> Dict: def get_permutation_mean(df_ctrl: pd.DataFrame, - nt: str, size: int, iteration: int) -> List[float]: + cpnt: str, size: int, iteration: int) -> List[float]: """ Randomly sample `size` `feature` from `df_ctrl` to extract `iteration` \ of `nt` frequencies from it. :param df_ctrl: A dataframe containing the frequency of each nucleotide \ in each exons/gene in fasterdb. - :param nt: A nucleotide of interest + :param cpnt: The component (nt, aa, dnt) of interest :param size: The size of each sub samples to create :param iteration: The number of sub samples to create :return: The list of mean frequencies of `nt` in each subsample """ return [ - float(np.mean(df_ctrl[nt].sample(size, replace=True).values)) + float(np.mean(df_ctrl[cpnt].sample(size, replace=True).values)) for _ in range(iteration) ] def perm_community_pval(row: pd.Series, df_ctrl: pd.DataFrame, - nt: str, iteration: int) -> Tuple[float, float, str]: + cpnt: str, iteration: int) -> Tuple[float, float, str]: """ Randomly sample `size` `feature` from `df_ctrl` to extract `iteration` \ of `nt` frequencies from it. @@ -297,20 +311,20 @@ def perm_community_pval(row: pd.Series, df_ctrl: pd.DataFrame, each feature inside a community. :param df_ctrl: A dataframe containing the frequency of each nucleotide \ in each exons/gene in fasterdb. - :param nt: A nucleotide of interest + :param cpnt: The component (nt, aa, dnt) of interest :param iteration: The number of sub samples to create :return: The ctrl mean frequency value of `nt` \ the pvalue and the regulation of the enrichment/impoverishment \ of the community in `row` compared to control exons. """ - list_values = get_permutation_mean(df_ctrl, nt, row["community_size"], + list_values = get_permutation_mean(df_ctrl, cpnt, row["community_size"], iteration) - pval, reg = get_pvalue(np.array(list_values), row[nt], iteration) + pval, reg = get_pvalue(np.array(list_values), row[cpnt], iteration) return float(np.mean(list_values)), pval, reg def perm_pvalues(df: pd.DataFrame, df_ctrl: pd.DataFrame, feature: str, - nt: str, iteration: int, dic_com: Dict) -> pd.DataFrame: + cpnt: str, iteration: int, dic_com: Dict) -> pd.DataFrame: """ Randomly sample `size` `feature` from `df_ctrl` to extract `iteration` \ of `nt` frequencies from it. @@ -320,7 +334,7 @@ def perm_pvalues(df: pd.DataFrame, df_ctrl: pd.DataFrame, feature: str, :param df_ctrl: A dataframe containing the frequency of each nucleotide \ in each exons/gene in fasterdb. :param feature: The feature of interest (gene, exon) - :param nt: A nucleotide of interest + :param cpnt: The component (nt, aa, dnt) of interest :param iteration: The number of sub samples to create :param dic_com: A dictionary linking each community to the exons \ it contains. @@ -335,7 +349,7 @@ def perm_pvalues(df: pd.DataFrame, df_ctrl: pd.DataFrame, feature: str, -df_ctrl[f'id_{feature}' ].isin(dic_com[row['community']]), :], - nt, iteration) + cpnt, iteration) [x.append(y) for x, y in zip([mean_ctrl, list_pval, list_reg], res)] adj_pvals = multipletests(list_pval, alpha=0.05, method='fdr_bh', @@ -343,21 +357,21 @@ def perm_pvalues(df: pd.DataFrame, df_ctrl: pd.DataFrame, feature: str, returnsorted=False)[1] adj_regs = [list_reg[i] if adj_pvals[i] <= 0.05 else " . " for i in range(len(list_reg))] - df[f'{nt}_mean_{iteration}_ctrl'] = mean_ctrl + df[f'{cpnt}_mean_{iteration}_ctrl'] = mean_ctrl df[f'p-adj'] = adj_pvals df[f'reg-adj'] = adj_regs return df def perm_with_ctrl(df: pd.DataFrame, feature: str, - nt: str, df_ctrl: pd.DataFrame, dic_com: Dict, + cpnt: str, df_ctrl: pd.DataFrame, dic_com: Dict, iteration: int) -> pd.DataFrame: """ :param df: A dataframe containing the frequency of each nucleotide \ in each exons belonging to a community. :param feature: The kind of feature analysed - :param nt: The nucleotide of interest + :param cpnt: The component (nt, aa, dnt) of interest :param df_ctrl: A dataframe containing the frequency of each nucleotide \ in each exons/gene in fasterdb. :param dic_com: A dictionary linking each community to the exons \ @@ -366,9 +380,9 @@ def perm_with_ctrl(df: pd.DataFrame, feature: str, :return: The dataframe with the p-value compared to the control \ list of exons. """ - mean_df = df[[nt, "community", "community_size"]]. \ + mean_df = df[[cpnt, "community", "community_size"]]. \ groupby(["community", "community_size"]).mean().reset_index() - return perm_pvalues(mean_df, df_ctrl, feature, nt, iteration, dic_com) + return perm_pvalues(mean_df, df_ctrl, feature, cpnt, iteration, dic_com) def prepare_dataframe(df: pd.DataFrame, test_type: str, nt: str, @@ -403,8 +417,8 @@ def prepare_dataframe(df: pd.DataFrame, test_type: str, nt: str, return df_bar -def make_barplot(df_bar: pd.DataFrame, outfile: Path, nt: str, test_type: str, - feature: str) -> None: +def make_barplot(df_bar: pd.DataFrame, outfile: Path, cpnt_type: str, + cpnt: str, test_type: str, feature: str) -> None: """ Create a barplot showing the frequency of `nt` for every community \ of exons/gene in `df_bar`. @@ -412,24 +426,27 @@ def make_barplot(df_bar: pd.DataFrame, outfile: Path, nt: str, test_type: str, :param df_bar: A dataframe with the enrichment of a \ nucleotide frequency for every community :param outfile: File were the figure will be stored - :param nt: The nucleotide for which we are seeking enrichment + :param cpnt_type: The type of component to analyse; It \ + can be 'nt', 'dnt' or 'aa'. + :param cpnt: The component (nt, aa, dnt) of interest :param test_type: The kind of test make :param feature: The king of feature of interest """ sns.set() test_name = "permutation" if test_type == "perm" else "lmm" - g = sns.catplot(x="community", y=nt, data=df_bar, kind="bar", + g = sns.catplot(x="community", y=cpnt, data=df_bar, kind="bar", ci="sd", aspect=2.5, height=12, errwidth=0.5, capsize=.4, palette=["red"] + ["lightgray"] * (df_bar.shape[0] - 1)) - g.fig.suptitle(f"Mean frequency of {nt} among community of {feature}s\n" + g.fig.suptitle(f"Mean frequency of {cpnt} ({cpnt_type}) " + f"among community of {feature}s\n" f"(stats obtained with as {test_name} test)") g.set(xticklabels=[]) - g.ax.set_ylabel(f'Frequency of {nt}') + g.ax.set_ylabel(f'Frequency of {cpnt}') df_bara = df_bar.drop_duplicates(subset="community", keep="first") for i, p in enumerate(g.ax.patches): stats = "*" if df_bara.iloc[i, :]["p-adj"] < 0.05 else "" com = df_bara.iloc[i, :]["community"] - csd = np.std(df_bar.loc[df_bar["community"] == com, nt]) + csd = np.std(df_bar.loc[df_bar["community"] == com, cpnt]) g.ax.annotate(stats, (p.get_x() + p.get_width() / 2., p.get_height() + csd), ha='center', va='center', xytext=(0, 10), @@ -438,7 +455,7 @@ def make_barplot(df_bar: pd.DataFrame, outfile: Path, nt: str, test_type: str, def expand_results_lmm(df: pd.DataFrame, rdf: pd.DataFrame, - nt: str, feature: str) -> pd.DataFrame: + cpnt: str, feature: str) -> pd.DataFrame: """ Merge df and rdf together. @@ -447,23 +464,23 @@ def expand_results_lmm(df: pd.DataFrame, rdf: pd.DataFrame, :param rdf: The dataframe containing the mean frequency for \ each community and the p-value of their enrichment compared to control \ exons. - :param nt: the nucleotide of interest + :param cpnt: The component (nt, aa, dnt) of interest :param feature: The feature of interest :return: The merged dataframe: i.e df with the stats columns """ p_col = "Pr(>|t|)" - df = df[[f"id_{feature}", nt, "community", "community_size"]].copy() - rdf = rdf[["community", "community_size", p_col, nt]].copy() - rdf.rename({nt: f"mean_{nt}", p_col: "p-adj"}, axis=1, inplace=True) + df = df[[f"id_{feature}", cpnt, "community", "community_size"]].copy() + rdf = rdf[["community", "community_size", p_col, cpnt]].copy() + rdf.rename({cpnt: f"mean_{cpnt}", p_col: "p-adj"}, axis=1, inplace=True) df = df.merge(rdf, how="left", on=["community", "community_size"]) df_ctrl = df[df["community"] == "C-CTRL"] df = df[df["community"] != "C-CTRL"].copy() - df.sort_values(f"mean_{nt}", ascending=True, inplace=True) + df.sort_values(f"mean_{cpnt}", ascending=True, inplace=True) return pd.concat([df_ctrl, df], axis=0, ignore_index=True) -def expand_results_perm(df: pd.DataFrame, rdf: pd.DataFrame, - nt: str, feature: str, iteration: int) -> pd.DataFrame: +def expand_results_perm(df: pd.DataFrame, rdf: pd.DataFrame, cpnt: str, + feature: str, iteration: int) -> pd.DataFrame: """ Merge df and rdf together. @@ -472,30 +489,32 @@ def expand_results_perm(df: pd.DataFrame, rdf: pd.DataFrame, :param rdf: The dataframe containing the mean frequency for \ each community and the p-value of their enrichment compared to control \ exons. - :param nt: the nucleotide of interest + :param cpnt: The component (nt, aa, dnt) of interest :param feature: The feature of interest :param iteration: The number of iteration :return: The merged dataframe: i.e df with the stats columns """ - df = df[[f"id_{feature}", nt, "community", "community_size"]].copy() - ctrl_val = rdf[f"{nt}_mean_{iteration}_ctrl"] - rdf = rdf[["community", "community_size", nt, "p-adj"]].copy() - rdf.rename({nt: f"mean_{nt}"}, axis=1, inplace=True) + df = df[[f"id_{feature}", cpnt, "community", "community_size"]].copy() + ctrl_val = rdf[f"{cpnt}_mean_{iteration}_ctrl"] + rdf = rdf[["community", "community_size", cpnt, "p-adj"]].copy() + rdf.rename({cpnt: f"mean_{cpnt}"}, axis=1, inplace=True) df = df.merge(rdf, how="left", on=["community", "community_size"]) - df_ctrl = pd.DataFrame({nt: ctrl_val, - f"mean_{nt}": [np.mean(ctrl_val)] * len(ctrl_val), - f"id_{feature}": ['ctrl'] * len(ctrl_val), - "community_size": [len(ctrl_val)] * len(ctrl_val), - "community": ["C-CTRL"] * len(ctrl_val)}) - df.sort_values(f"mean_{nt}", ascending=True, inplace=True) + df_ctrl = pd.DataFrame( + {cpnt: ctrl_val, + f"mean_{cpnt}": [np.mean(ctrl_val)] * len(ctrl_val), + f"id_{feature}": ['ctrl'] * len(ctrl_val), + "community_size": [len(ctrl_val)] * len(ctrl_val), + "community": ["C-CTRL"] * len(ctrl_val)} + ) + df.sort_values(f"mean_{cpnt}", ascending=True, inplace=True) return pd.concat([df_ctrl, df], axis=0, ignore_index=True) def create_and_save_ctrl_dataframe(df: pd.DataFrame, feature: str, - region: str, nt: str, outfile: Path, - test_type: str, df_ctrl: pd.DataFrame, - dic_com: Dict, iteration: int, - outfile_ctrl: Path) -> None: + region: str, cpnt_type: str, cpnt: str, + outfile: Path, test_type: str, + df_ctrl: pd.DataFrame, dic_com: Dict, + iteration: int, outfile_ctrl: Path) -> None: """ Create a dataframe with a control community, save it as a table and \ as a barplot figure. @@ -504,7 +523,9 @@ def create_and_save_ctrl_dataframe(df: pd.DataFrame, feature: str, in each exons belonging to a community. :param feature: The kind of feature analysed :param region: the region of interest to extract from gene - :param nt: The nucleotide of interest + :param cpnt_type: The type of component to analyse; It \ + can be 'nt', 'dnt' or 'aa'. + :param cpnt: The component (nt, aa, dnt) of interest :param outfile: File used to store diagnotics :param test_type: The type of test to make (permutation or lmm) :param df_ctrl: A dataframe containing the frequency of each nucleotide \ @@ -516,19 +537,19 @@ def create_and_save_ctrl_dataframe(df: pd.DataFrame, feature: str, containing the test communities and the control community """ if test_type == "lmm": - ndf, rdf = lmm_with_ctrl(df, feature, region, nt, - outfile.parents[1] / outfile.name) - df_bar = expand_results_lmm(ndf, rdf, nt, feature) + ndf, rdf = lmm_with_ctrl(df, feature, region, cpnt, + outfile.parents[1] / outfile.name, cpnt_type) + df_bar = expand_results_lmm(ndf, rdf, cpnt, feature) else: - rdf = perm_with_ctrl(df, feature, nt, df_ctrl, dic_com, iteration) - df_bar = expand_results_perm(df, rdf, nt, feature, iteration) + rdf = perm_with_ctrl(df, feature, cpnt, df_ctrl, dic_com, iteration) + df_bar = expand_results_perm(df, rdf, cpnt, feature, iteration) rdf.to_csv(outfile_ctrl, sep="\t", index=False) - barplot_creation(df_bar, outfile_ctrl, nt, + barplot_creation(df_bar, outfile_ctrl, cpnt_type, cpnt, test_type, feature) -def barplot_creation(df_bar: pd.DataFrame, outfile: Path, nt: str, - test_type: str, feature: str) -> None: +def barplot_creation(df_bar: pd.DataFrame, outfile: Path, cpnt_type: str, + cpnt: str, test_type: str, feature: str) -> None: """ Reformat a dataframe with the enrichment of a nucleotide frequency \ for every feature for every community and then create a \ @@ -538,16 +559,19 @@ def barplot_creation(df_bar: pd.DataFrame, outfile: Path, nt: str, nucleotide frequency for every community and showing the frequency \ of each feature in each community :param outfile: File were rdf is stored - :param nt: The nucleotide for which we are seeking enrichment + :param cpnt_type: The type of component to analyse; It \ + can be 'nt', 'dnt' or 'aa'. + :param cpnt: The component (nt, aa, dnt) of interest :param test_type: The kind of test make :param feature: The king of feature of interest """ outfig = outfile.parent / outfile.name.replace(".txt", ".pdf") - make_barplot(df_bar, outfig, nt, test_type, feature) + make_barplot(df_bar, outfig, cpnt_type, cpnt, test_type, feature) def create_outfiles(project: str, weight: int, global_weight: int, - same_gene: bool, feature: str, nt: str, test_type: str + same_gene: bool, feature: str, cpnt_type: str, + cpnt: str, test_type: str ) -> Tuple[Path, Path]: """ Create a file used to store diagnostics and a file used to store the \ @@ -561,30 +585,33 @@ def create_outfiles(project: str, weight: int, global_weight: int, seen in `global_weight` project are taken into account :param same_gene: Say if we consider as co-localised, exons within the \ same gene (True) or not (False) (default False) - :param nt: The nucleotide of interest + :param cpnt_type: The type of component to analyse; It \ + can be 'nt', 'dnt' or 'aa'. + :param cpnt: The component (nt, aa, dnt) of interest :param feature: The kind of feature analysed :param test_type: The type of test to make (permutation or lmm) :return: file used to store diagnostics and a file used to store the \ table containing the test communities and the control community """ - outfolder = "community_enrichment/nt_analysis" + outfolder = f"community_enrichment/{cpnt_type}_analysis" outfile = ConfigGraph.\ get_community_file(project, weight, global_weight, same_gene, feature, - f"{nt}_stat_{test_type}.txt", outfolder) + f"{cpnt}-{cpnt_type}_stat_{test_type}.txt", + outfolder) outfile_ctrl = ConfigGraph.\ get_community_file(project, weight, global_weight, same_gene, feature, - f"{nt}_VS_CTRL_stat_{test_type}.txt", outfolder) + f"{cpnt}-{cpnt_type}_VS_CTRL_stat_{test_type}.txt", + outfolder) return outfile, outfile_ctrl -def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int, - global_weight: int, same_gene: bool, - nt: str, df_ctrl: pd.DataFrame, - dic_com: Dict, - feature: str = "exon", - region: str = "", test_type: str = "", - iteration: int = 1000) -> pd.Series: +def get_stat_cpnt_communities(df: pd.DataFrame, project: str, weight: int, + global_weight: int, same_gene: bool, + cpnt_type: str, cpnt: str, df_ctrl: pd.DataFrame, + dic_com: Dict, feature: str = "exon", + region: str = "", test_type: str = "", + iteration: int = 1000) -> pd.Series: """ Get data (proportion of `reg` regulated exons by a splicing factor `sf_name` within a community) for every community. @@ -599,7 +626,9 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int, seen in `global_weight` project are taken into account :param same_gene: Say if we consider as co-localised, exons within the \ same gene (True) or not (False) (default False) - :param nt: The nucleotide of interest + :param cpnt_type: The type of component to analyse; It \ + can be 'nt', 'dnt' or 'aa'. + :param cpnt: The component (nt, aa, dnt) of interest :param df_ctrl: A dataframe containing the frequency of each nucleotide \ in each exons/gene in fasterdb. :param dic_com: A dictionary linking each community to the exons \ @@ -610,19 +639,23 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int, :param iteration: The number of sub samples to create """ logging.debug(f"{test_type} for {project}, w:{weight}, " - f"g:{global_weight} nt: {nt}") + f"g:{global_weight} cpnt: {cpnt}({cpnt_type})") outfile, outfile_ctrl = create_outfiles(project, weight, global_weight, - same_gene, feature, nt, test_type) - res = {"project": project, "nt": nt, 'pval': lmm_maker(df, outfile, nt)} - create_and_save_ctrl_dataframe(df, feature, region, nt, outfile, test_type, - df_ctrl, dic_com, iteration, outfile_ctrl) + same_gene, feature, cpnt_type, + cpnt, test_type) + res = {"project": project, "cpnt": cpnt, + 'pval': lmm_maker(df, outfile, cpnt)} + create_and_save_ctrl_dataframe(df, feature, region, cpnt_type, + cpnt, outfile, test_type, df_ctrl, dic_com, + iteration, outfile_ctrl) return pd.Series(res) def create_dataframe(project: str, weight: int, global_weight: int, same_gene: bool, feature: str = 'exon', - region: str = "", from_communities: bool = True + region: str = "", component_type: str = 'nt', + from_communities: bool = True ) -> pd.DataFrame: """ :param project: The name of the project of interest @@ -636,6 +669,8 @@ def create_dataframe(project: str, weight: int, global_weight: int, :param feature: The kind of feature to analyse :param from_communities: True if we only select gene/exons :param region: the region of interest to extract from gene + :param component_type: The type of component to analyse; It \ + can be 'nt', 'dnt' or 'aa'. :return: A dataframe containing the frequency of every nucleotides \ of every exon in a large community """ @@ -650,16 +685,18 @@ def create_dataframe(project: str, weight: int, global_weight: int, list_exon = reduce(lambda x, y: x + y, ncommunities) df_com = get_community_table(communities, size_threshold, feature) - df = get_nt_frequency(cnx, list_exon, feature, region) + df = get_cpnt_frequency(cnx, list_exon, feature, region, + component_type) df = df.merge(df_com, how="left", on=f"id_{feature}") else: list_exon = get_ft_id(cnx, feature) - df = get_nt_frequency(cnx, list_exon, feature, region) + df = get_cpnt_frequency(cnx, list_exon, feature, region, + component_type) return df def create_dataframes(project, weight, global_weight, same_gene, feature, - region, test_type + region, test_type, component_type: str ) -> Tuple[pd.DataFrame, pd.DataFrame, Dict]: """ :param weight: The weight of interaction to consider @@ -672,15 +709,18 @@ def create_dataframes(project, weight, global_weight, same_gene, feature, :param feature: The kind of analysed feature :param region: the region of interest to extract from gene :param test_type: The type of test to make (permutation or lmm) + :param component_type: The type of component to analyse; It \ + can be 'nt', 'dnt' or 'aa'. """ df = create_dataframe(project, weight, global_weight, same_gene, - feature, region) + feature, region, component_type) if test_type == 'lmm': df_ctrl = pd.DataFrame() dic_com = {} else: df_ctrl = create_dataframe(project, weight, global_weight, same_gene, - feature, region, from_communities=False) + feature, region, component_type, + from_communities=False) dic_com = get_feature_by_community(df, feature) return df, df_ctrl, dic_com @@ -690,7 +730,9 @@ def multiple_nt_lmm_launcher(ps: int, global_weight: int, project: str, same_gene: bool, - feature: str = 'exon', region: str = '', + feature: str = 'exon', + region: str = '', + component_type: str = "nt", test_type: str = "lmm", iteration: int = 1000, logging_level: str = "DISABLE"): @@ -706,6 +748,8 @@ def multiple_nt_lmm_launcher(ps: int, :param same_gene: Say if we consider as co-localised exon within the \ same gene :param feature: The kind of analysed feature + :param component_type: The type of component to analyse; It \ + can be 'nt', 'dnt' or 'aa'. :param region: the region of interest to extract from gene :param test_type: The type of test to make (permutation or lmm) :param iteration: The number of iteration to perform @@ -713,25 +757,30 @@ def multiple_nt_lmm_launcher(ps: int, """ ConfigGraph.community_folder.mkdir(exist_ok=True, parents=True) logging_def(ConfigGraph.community_folder, __file__, logging_level) - logging.info("Checking if communities as an effect on nucleotide " + if feature == "gene" and region != "exon" and component_type == "aa": + msg = f"If you looking at amino acid at gene level, then the region " \ + f"must be 'exon' not '{region}'. Setting it to exon. " \ + f"(corresponds to the concatenation of exons for all genes)" + logging.warning(msg) + logging.info(f"Checking if communities as an effect on {component_type} " "frequency") project = get_projects(global_weight, project) - nt_list = ["A"] # , "C", "G", "T", "S", "W"] - condition = list(product([project], [weight], nt_list)) + cpnt_list = [get_features(component_type)[0]] + condition = list(product([project], [weight], cpnt_list)) processes = [] pool = mp.Pool(processes=min(ps, len(condition))) logging.debug("Creating tables") df, df_ctrl, dic_com = create_dataframes(project, weight, global_weight, same_gene, feature, region, - test_type) - for project, weight, nt in condition: + test_type, component_type) + for project, weight, cpnt in condition: nfile_table = ConfigGraph.get_community_file( project, weight, global_weight, same_gene, feature, - f"_nt_table.txt", "community_enrichment") + f"_{component_type}_table.txt", "community_enrichment") df.to_csv(nfile_table, sep="\t", index=False) - args = [df, project, weight, global_weight, same_gene, nt, df_ctrl, - dic_com, feature, region, test_type, iteration] - processes.append(pool.apply_async(get_stat_nt_communities, args)) + args = [df, project, weight, global_weight, same_gene, component_type, + cpnt, df_ctrl, dic_com, feature, region, test_type, iteration] + processes.append(pool.apply_async(get_stat_cpnt_communities, args)) results = [p.get(timeout=None) for p in processes] pool.close() pool.join() @@ -740,9 +789,9 @@ def multiple_nt_lmm_launcher(ps: int, outfile = ConfigGraph.get_community_file(project, weight, global_weight, same_gene, feature, - f"lmm-nt_stat.txt", + f"lmm-{component_type}_stat.txt", "community_enrichment") - nfolder = outfile.parent / "nt_analysis" + nfolder = outfile.parent / f"{component_type}_analysis" noutfile = nfolder / outfile.name fdf.to_csv(noutfile, sep="\t", index=False)