# ---------------------------------------------------------------------------
# File: src/find_interaction_cluster/community_figures/__file__.py (new file)
# ---------------------------------------------------------------------------
#!/usr/bin/env python3

# -*- coding: UTF-8 -*-

# NOTE(review): this module is empty; its name suggests that `__init__.py`
# was intended -- confirm before merging.

"""
Description:
"""


# ---------------------------------------------------------------------------
# File: src/find_interaction_cluster/community_figures/__main__.py (new file)
# ---------------------------------------------------------------------------
#!/usr/bin/env python3

# -*- coding: UTF-8 -*-

"""
Description: Create a barplot showing the frequency/value of a particular \
item in every community of genomic features (genes or exons) that are close \
in the nucleus compared to a control list of features.
"""

import lazyparser as lp
import pandas as pd
from pathlib import Path
from .fig_functions import create_community_fig


class FileNameError(Exception):
    """Raised when the requested output figure is not a pdf file."""


class MissingColumnError(Exception):
    """Raised when the input table lacks a required column."""


def load_and_check_table(table: str, feature: str,
                         target_col: str) -> pd.DataFrame:
    """
    Load a tab-separated file containing a dataframe. It must contain the \
    following columns: id_`feature`, `target_col`, community and \
    community_size.

    :param table: A file containing a table with the id of the chosen \
    `feature` (i.e FasterDB id of genes or exons), a column with data of \
    interest (this column must have the name *target_col*) and two columns \
    with the community and the size of the community of the feature if it \
    has one (None, else). The file may be gzipped ('.gz' extension).
    :param feature: The kind of feature analysed
    :param target_col: The name of the column containing the data of interest
    :return: The loaded dataframe
    :raises MissingColumnError: If a required column is absent
    """
    # pandas infers the gzip compression from the '.gz' extension, so no
    # dedicated branch is needed for compressed tables.
    df = pd.read_csv(table, sep="\t")
    required_cols = [f"id_{feature}", target_col, "community",
                     "community_size"]
    for rqd in required_cols:
        if rqd not in df.columns:
            raise MissingColumnError(f"The column {rqd} is missing !")
    return df


# The constraint on `iteration` only requires a positive value: the previous
# constraint ("0 < iteration < 20") rejected the documented default (10000).
@lp.parse(table="file", output="folder", test_type=["lm", "permutation"],
          iteration="iteration > 0")
def create_community_figures(table: str, feature: str, target_col: str,
                             output: str, outfile: str, test_type: str,
                             target_kind: str = "",
                             iteration: int = 10000) -> None:
    """
    Create a dataframe with a control community, save it as a table and \
    as a barplot figure.

    :param table: A file containing a table with the id of the chosen \
    `feature` (i.e FasterDB id of genes or exons), a column with data of \
    interest (this column must have the name *target_col*) and two columns \
    with the community and the size of the community of the feature if it \
    has one (None, else).
    :param feature: The kind of feature analysed (exons or genes)
    :param target_col: The name of the column containing the data of interest
    :param output: The output folder
    :param outfile: The name of the output figure file (pdf format)
    :param test_type: The type of test to make (permutation or lm)
    :param target_kind: An optional name that describe a bit further \
    target_col.
    :param iteration: The number of sub samples to create. This parameter \
    is only used if test_type = 'permutation' (default 10000).
    :raises FileNameError: If `outfile` does not end with '.pdf'
    """
    # Validate the cheap argument first so a bad file name fails before
    # the (possibly large) table is read.
    if not outfile.endswith(".pdf"):
        raise FileNameError("The output figure must be in pdf format !")
    df = load_and_check_table(table, feature, target_col)
    moutfile = Path(output) / outfile
    create_community_fig(df, feature, target_col, moutfile, test_type,
                         target_kind=target_kind, iteration=iteration)


if __name__ == "__main__":
    create_community_figures()


# ---------------------------------------------------------------------------
# File: src/find_interaction_cluster/community_figures/fig_functions.py
# (new file)
# ---------------------------------------------------------------------------
#!/usr/bin/env python3

# -*- coding: UTF-8 -*-

"""
Description: Functions used to compare a value of interest between \
communities of features and control features (with a linear model or a \
permutation test) and to draw the corresponding barplots.
"""

import pandas as pd
from pathlib import Path
from typing import Dict, Tuple, List, Optional
import sqlite3
from ..config import ConfigGraph
from tqdm import tqdm
from rpy2.robjects import r, pandas2ri
from statsmodels.stats.multitest import multipletests
import numpy as np
from ..radomization_test_ppi import get_pvalue
import seaborn as sns


# Number of ids bound per SQL query: kept well below the default limit of
# host parameters of older SQLite builds (999).
_SQL_BATCH_SIZE = 500


def get_cpnt_frequency(cnx: sqlite3.Connection, list_ft: List[str],
                       feature: str, region: str = "",
                       component_type: str = "nt") -> pd.DataFrame:
    """
    Get the frequency of every component (e.g. nucleotides) for the \
    features in list_ft.

    :param cnx: Connection to chia-pet database
    :param list_ft: The list of features for which we want to get the \
    frequencies
    :param feature: the kind of feature analysed
    :param region: The region of gene analysed if feature is gene
    :param component_type: The type of component to analyse; It \
    can be 'nt', 'dnt' or 'aa'.
    :return: the frequency of components for the list of features (one \
    line per feature, one column per component).

    >>> d = get_cpnt_frequency(sqlite3.connect(ConfigGraph.db_file),
    ... ["1_1", "1_2"], "exon")
    >>> d[["id_exon", 'A', 'C', 'G', 'T']]
    ft id_exon         A         C         G         T
    0      1_1  16.63480  34.60803  32.12237  16.63480
    1      1_2  16.06426  26.10442  39.75904  18.07229
    >>> d = get_cpnt_frequency(sqlite3.connect(ConfigGraph.db_file),
    ... ['1', '2'], "gene")
    >>> d[["id_gene", 'A', 'C', 'G', 'T']]
    ft id_gene         A         C         G         T
    0        1  29.49376  18.34271  18.43874  33.72479
    1        2  31.90401  16.40251  18.79033  32.90315
    >>> d = get_cpnt_frequency(sqlite3.connect(ConfigGraph.db_file),
    ... ['1', '2'], "gene", 'exon', 'aa')
    >>> d[["id_gene", "R", "K", "D", "Q", "E"]]
    ft id_gene        R        K        D        Q        E
    0        1  4.75247  5.19300  5.95391  4.07997  6.96189
    1        2  4.34203  6.23736  6.77708  5.21984  7.01769
    """
    ids = list(list_ft)
    query_region = ""
    extra_params = [component_type]
    if feature == "gene":
        ids = [int(ft) for ft in ids]
        if region == "":
            region = "gene"
        query_region = "AND region = ?"
        extra_params.append(region)
    frames = []
    # Values are bound as parameters (no string-built tuple, which yields
    # invalid SQL such as "IN ('1_1',)" for a single id) and the ids are
    # sent by batches to respect the SQLite limit on host parameters.
    # `max(..., 1)` keeps one (empty) query when no id is given.
    for start in range(0, max(len(ids), 1), _SQL_BATCH_SIZE):
        batch = ids[start:start + _SQL_BATCH_SIZE]
        placeholders = ", ".join("?" * len(batch))
        query = f"""
        SELECT ft, id_{feature}, frequency
        FROM cin_{feature}_frequency
        WHERE id_{feature} IN ({placeholders})
        AND ft_type = ?
        {query_region}
        """
        frames.append(pd.read_sql_query(query, cnx,
                                        params=batch + extra_params))
    df = pd.concat(frames, ignore_index=True)
    df = df.pivot_table(index=f"id_{feature}", columns="ft",
                        values="frequency").reset_index()
    df[f"id_{feature}"] = df[f"id_{feature}"].astype(str)
    return df


def get_ft_id(cnx: sqlite3.Connection, feature: str = "exon") -> List[str]:
    """
    Return the id of every gene/exons in chia-pet database.

    :param cnx: A connection to chiapet database
    :param feature: The feature of interest
    :return: The list of feature id (as strings)
    """
    query = f"SELECT DISTINCT id FROM cin_{feature}"
    cursor = cnx.cursor()
    cursor.execute(query)
    return [str(row[0]) for row in cursor.fetchall()]


def get_community_table(communities: List[List[str]],
                        size_threshold: int, feature: str) -> pd.DataFrame:
    """
    Return the table indicating the id of the features and the name and \
    the size of their community.

    Communities are named C1, C2, ... after their position in \
    `communities`; those smaller than `size_threshold` are skipped.

    :param communities: List of community of features
    :param size_threshold: The required size a community must \
    have to be considered
    :param feature: The kind of feature analysed
    :return: table of community

    >>> c = [['1_1', '2_5'], ['7_9', '4_19', '3_3']]
    >>> get_community_table(c, 3, 'exon')
      community id_exon  community_size
    0        C2     7_9               3
    1        C2    4_19               3
    2        C2     3_3               3
    >>> c = [['1', '2'], ['7', '49', '3']]
    >>> get_community_table(c, 3, 'gene')
      community id_gene  community_size
    0        C2       7               3
    1        C2      49               3
    2        C2       3               3
    """
    dic = {"community": [], f"id_{feature}": [], "community_size": []}
    for k, comm in enumerate(communities, start=1):
        clen = len(comm)
        if clen < size_threshold:
            continue
        dic["community"].extend([f'C{k}'] * clen)
        dic[f'id_{feature}'].extend(comm)
        dic["community_size"].extend([clen] * clen)
    return pd.DataFrame(dic)


def lm_maker_summary(df: pd.DataFrame, outfile: Path, target_col: str
                     ) -> pd.DataFrame:
    """
    Fit in R the linear model \
    `target_col ~ log(community_size) + community`, save a DHARMa \
    diagnostic plot and return the coefficients of the model.

    :param df: A dataframe containing the id of the chosen `feature` \
    (i.e FasterDB id of genes or exons) a column with data for interest (\
    this column must have the name *target_col*) and the community \
    and the size of the community of the feature if it has one (None, else).
    :param outfile: The figure file; the diagnostic plot is written in a \
    'diagnostics' folder located next to it
    :param target_col: The name of the column containing the data of interest
    :return: The coefficient table of the model (one line per term, the \
    intercept being renamed 'C-CTRL') merged with the mean of \
    `target_col` in each community
    """
    pandas2ri.activate()
    # NOTE(review): the png name below contains 'dignostics' (sic); it is
    # kept unchanged because it defines the name of a produced file.
    lmf = r(
        """
        require("DHARMa")

        function(data, folder, partial_name) {
            mod <- lm(%s ~ log(community_size) + community, data=data)
            simulationOutput <- simulateResiduals(fittedModel = mod, n = 250)
            png(paste0(folder, "/dignostics_summary", partial_name, ".png"))
            plot(simulationOutput)
            dev.off()
            return(as.data.frame(summary(mod)$coefficients))
        }

        """ % target_col)
    folder = outfile.parent / "diagnostics"
    folder.mkdir(parents=True, exist_ok=True)
    partial_name = outfile.name.replace('.pdf', '')
    # NOTE(review): this table is written in the current working directory,
    # not in the output folder -- confirm this is intended.
    df.to_csv(f'frequency_{target_col}.txt', sep="\t", index=False)
    res_df = lmf(df, str(folder), partial_name).reset_index()
    res_df.rename({'index': 'community'}, inplace=True, axis=1)
    # R names the terms 'community<name>': keep only the community name
    res_df['community'] = res_df['community'].str.replace('community', '')
    res_df.loc[res_df['community'] == "(Intercept)", "community"] = "C-CTRL"
    mean_df = df[[target_col, "community", "community_size"]] \
        .groupby(["community", "community_size"]).mean().reset_index()
    return res_df.merge(mean_df, how="left", on="community")


def lm_with_ctrl(df: pd.DataFrame,
                 target_col: str, outfile: Path,
                 ) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Label the features without community as control ('C-CTRL') and \
    run the linear model analysis.

    :param df: A dataframe containing the id of the chosen `feature` \
    (i.e FasterDB id of genes or exons) a column with data for interest (\
    this column must have the name *target_col*) and the community \
    and the size of the community of the feature if it has one (None, else).
    :param target_col: The name of the column containing the data of interest
    :param outfile: File that will contains the final figure
    :return: The dataframe with ctrl exon and \
    The dataframe with the p-value compared to the control \
    list of feature.
    """
    # Work on a new frame so the dataframe of the caller is left untouched
    ndf = df.assign(community=df['community'].fillna("C-CTRL"))
    return ndf, lm_maker_summary(ndf, outfile, target_col)


def expand_results_lm(df: pd.DataFrame, rdf: pd.DataFrame,
                      target_col: str, feature: str) -> pd.DataFrame:
    """
    Merge df and rdf together.

    :param df: A dataframe containing the id of the chosen `feature` \
    (i.e FasterDB id of genes or exons) a column with data for interest (\
    this column must have the name *target_col*) and the community \
    and the size of the community of the feature if it has one (None, else).
    :param rdf: The dataframe containing the mean frequency for \
    each community and the p-value of their enrichment compared to control \
    exons.
    :param target_col: The name of the column containing the data of interest
    :param feature: The feature of interest
    :return: The merged dataframe: i.e df with the stats columns. Control \
    lines come first, then the communities sorted by increasing mean of \
    `target_col`.
    """
    p_col = "Pr(>|t|)"
    df = df[[f"id_{feature}", target_col, "community",
             "community_size"]].copy()
    stats_df = rdf[["community", "community_size", p_col, target_col]] \
        .rename({target_col: f"mean_{target_col}", p_col: "p-adj"}, axis=1)
    df = df.merge(stats_df, how="left", on=["community", "community_size"])
    is_ctrl = df["community"] == "C-CTRL"
    df_ctrl = df[is_ctrl]
    df_com = df[~is_ctrl].sort_values(f"mean_{target_col}", ascending=True)
    return pd.concat([df_ctrl, df_com], axis=0, ignore_index=True)


def get_permutation_mean(df_ctrl: pd.DataFrame,
                         cpnt: str, size: int, iteration: int) -> List[float]:
    """
    Randomly sample (with replacement) `size` values of the column `cpnt` \
    of `df_ctrl`, `iteration` times, and return the mean of each sample.

    :param df_ctrl: A dataframe containing the values of control features
    :param cpnt: The name of the column of interest
    :param size: The size of each sub samples to create
    :param iteration: The number of sub samples to create
    :return: The list of mean values of `cpnt` in each subsample
    """
    values = df_ctrl[cpnt]
    return [
        float(np.mean(values.sample(size, replace=True).values))
        for _ in range(iteration)
    ]


def perm_community_pval(row: pd.Series, df_ctrl: pd.DataFrame,
                        cpnt: str, iteration: int
                        ) -> Tuple[float, float, float, str]:
    """
    Compare the value of `cpnt` in a community to the values obtained in \
    `iteration` random samples of control features of the same size.

    :param row: A line of a dataframe containing the mean value of `cpnt` \
    and the size ('community_size') of a community.
    :param df_ctrl: A dataframe containing the values of control features
    :param cpnt: The name of the column of interest
    :param iteration: The number of sub samples to create
    :return: The mean of the control samples, their standard deviation, \
    the pvalue and the regulation of the enrichment/impoverishment \
    of the community in `row` compared to control features.
    """
    # The size can be stored as a float (e.g. when the column contains
    # missing values), a sample size must be an integer.
    size = int(row["community_size"])
    list_values = get_permutation_mean(df_ctrl, cpnt, size, iteration)
    pval, reg = get_pvalue(np.array(list_values), row[cpnt], iteration)
    return float(np.mean(list_values)), float(np.std(list_values)), pval, reg


def perm_pvalues(df: pd.DataFrame, df_ctrl: pd.DataFrame, feature: str,
                 target_col: str, iteration: int,
                 dic_com: Dict) -> pd.DataFrame:
    """
    Run the permutation test for every community in `df` and add the \
    results to it.

    :param df: A dataframe containing one line per community with its \
    mean value of `target_col` and its size.
    :param df_ctrl: A dataframe containing the values of `target_col` \
    for every feature usable as control.
    :param feature: The feature of interest (gene, exon)
    :param target_col: The name of the column containing the data of interest
    :param iteration: The number of sub samples to create
    :param dic_com: A dictionary linking each community to the features \
    it contains.
    :return: `df` with four new columns: the mean and the standard \
    deviation of the control samples, the Benjamini-Hochberg adjusted \
    p-value ('p-adj') and the regulation ('reg-adj', ' . ' when the \
    adjusted p-value is above 0.05)
    """
    mean_ctrl, std_ctrl, list_pval, list_reg = [], [], [], []
    for _, row in tqdm(df.iterrows(), total=df.shape[0],
                       desc="performing permutations"):
        # The features of the tested community are removed from the
        # control pool.
        in_community = df_ctrl[f'id_{feature}'].isin(
            dic_com[row['community']])
        c_mean, c_std, pval, reg = perm_community_pval(
            row, df_ctrl.loc[~in_community, :], target_col, iteration)
        mean_ctrl.append(c_mean)
        std_ctrl.append(c_std)
        list_pval.append(pval)
        list_reg.append(reg)
    adj_pvals = multipletests(list_pval, alpha=0.05,
                              method='fdr_bh',
                              is_sorted=False,
                              returnsorted=False)[1]
    adj_regs = [reg if padj <= 0.05 else " . "
                for reg, padj in zip(list_reg, adj_pvals)]
    df[f'{target_col}_mean_{iteration}_ctrl'] = mean_ctrl
    df[f'{target_col}_std_{iteration}_ctrl'] = std_ctrl
    df['p-adj'] = adj_pvals
    df['reg-adj'] = adj_regs
    return df


def perm_with_ctrl(df: pd.DataFrame, feature: str,
                   target_col: str, dic_com: Dict,
                   iteration: int) -> pd.DataFrame:
    """
    Compute the mean of `target_col` in each community and test it \
    against control features with a permutation test.

    :param df: A dataframe containing the id of the chosen `feature` \
    (i.e FasterDB id of genes or exons) a column with data for interest (\
    this column must have the name *target_col*) and the community \
    and the size of the community of the feature if it has one (None, else).
    :param feature: The kind of feature analysed
    :param target_col: The name of the column containing the data of interest
    :param dic_com: A dictionary linking each community to the features \
    it contains.
    :param iteration: The number of sub samples to create
    :return: The dataframe with the p-value compared to the control \
    list of features.
    """
    in_community = df.loc[df["community"].notna(), :]
    mean_df = in_community[[target_col, "community", "community_size"]] \
        .groupby(["community", "community_size"]).mean().reset_index()
    return perm_pvalues(mean_df, df, feature, target_col,
                        iteration, dic_com)


def create_perm_ctrl_df(ctrl_df: pd.DataFrame, order_df: pd.DataFrame,
                        cpnt: str, feature: str, iteration: int
                        ) -> pd.DataFrame:
    """
    Format the control values of the permutation test like the lines of \
    the communities and sort them in the order of the communities.

    Note that `ctrl_df` is modified in place (columns are added to it).

    :param ctrl_df: A dataframe containing the mean ctrl values, \
    the mean control std and the community from which those control \
    have been created
    :param order_df: A dataframe containing the community and their final \
    order.
    :param cpnt: The name of the column of interest
    :param feature: The feature of interest
    :param iteration: The number of iteration
    :return: The ctrl_df in good order
    """
    dsize = ctrl_df.shape[0]
    ctrl_df[f"mean_{cpnt}"] = np.mean(ctrl_df[f"{cpnt}_mean_{iteration}_ctrl"])
    ctrl_df[f"id_{feature}"] = 'ctrl'
    ctrl_df["community_size"] = dsize
    ctrl_df = ctrl_df.merge(order_df, how='left', on="community")
    ctrl_df.rename({f"{cpnt}_mean_{iteration}_ctrl": cpnt,
                    f"{cpnt}_std_{iteration}_ctrl": 'ctrl_std'}, axis=1,
                   inplace=True)
    return ctrl_df.sort_values("order", ascending=True)


def expand_results_perm(df: pd.DataFrame, rdf: pd.DataFrame, target_col: str,
                        feature: str, iteration: int) -> pd.DataFrame:
    """
    Merge df and rdf together.

    :param df: A dataframe containing the id of the chosen `feature` \
    (i.e FasterDB id of genes or exons) a column with data for interest (\
    this column must have the name *target_col*) and the community \
    and the size of the community of the feature if it has one (None, else).
    :param rdf: The dataframe containing the mean frequency for \
    each community and the p-value of their enrichment compared to control \
    exons.
    :param target_col: The name of the column containing the data of interest
    :param feature: The feature of interest
    :param iteration: The number of iteration
    :return: The merged dataframe: i.e the control lines followed by df \
    with the stats columns, sorted by increasing mean of `target_col`
    """
    df = df.loc[df["community"].notna(),
                [f"id_{feature}", target_col,
                 "community", "community_size"]].copy()
    ctrl_df = rdf[[f"{target_col}_mean_{iteration}_ctrl",
                   f"{target_col}_std_{iteration}_ctrl", "community"]].copy()
    stats_df = rdf[["community", "community_size", target_col, "p-adj"]] \
        .rename({target_col: f"mean_{target_col}"}, axis=1)
    df = df.merge(stats_df, how="left", on=["community", "community_size"])
    df.sort_values(f"mean_{target_col}", ascending=True, inplace=True)
    # rank of each community once sorted by mean value
    order_df = df[["community"]].drop_duplicates().copy()
    order_df["order"] = range(order_df.shape[0])
    df_ctrl = create_perm_ctrl_df(ctrl_df, order_df, target_col, feature,
                                  iteration)
    return pd.concat([df_ctrl, df], axis=0, ignore_index=True)


def _annotate_significance(ax, patches, df_bar: pd.DataFrame,
                           target_col: str) -> None:
    """
    Write a star above the bar of every community whose 'p-adj' is \
    below 0.05.

    :param ax: The matplotlib axes receiving the annotations
    :param patches: The bars, in the order of appearance of the \
    communities in `df_bar`
    :param df_bar: The dataframe used to draw the bars
    :param target_col: The name of the column containing the data of interest
    """
    per_community = df_bar.drop_duplicates(subset="community", keep="first")
    for i, patch in enumerate(patches):
        row = per_community.iloc[i, :]
        stats = "*" if row["p-adj"] < 0.05 else ""
        # the star is placed above the error bar of the community
        csd = np.std(df_bar.loc[df_bar["community"] == row["community"],
                                target_col])
        ax.annotate(stats,
                    (patch.get_x() + patch.get_width() / 2.,
                     patch.get_height() + csd),
                    ha='center', va='center', xytext=(0, 10), fontsize=12,
                    textcoords='offset points')


def make_barplot(df_bar: pd.DataFrame, outfile: Path,
                 target_col: str, feature: str, target_kind: str = "") -> None:
    """
    Create a barplot showing the value of `target_col` for every community \
    of exons/gene in `df_bar` (lm test).

    :param df_bar: A dataframe with the value of `target_col` of every \
    feature, their community and the 'p-adj' of the community. The first \
    community (drawn in red) is expected to be the control one.
    :param outfile: File were the figure will be stored
    :param target_kind: An optional name that describe a bit further \
    target_col.
    :param target_col: The name of the column containing the data of interest
    :param feature: The kind of feature of interest
    """
    sns.set(context="poster")
    # one colour per bar, i.e. per community
    n_com = df_bar["community"].nunique()
    g = sns.catplot(x="community", y=target_col, data=df_bar, kind="bar",
                    ci="sd", aspect=2.5, height=12, errwidth=0.5, capsize=.4,
                    palette=["red"] + ["darkgray"] * (n_com - 1))
    g.fig.subplots_adjust(top=0.9)
    title_kind = f" ({target_kind})" if target_kind else ""
    g.fig.suptitle(f"Mean frequency of {target_col}{title_kind} "
                   f"among community of {feature}s\n"
                   f"(stats obtained with a lm test)")
    g.set(xticklabels=[])
    g.ax.set_ylabel(f'Frequency of {target_col}')
    _annotate_significance(g.ax, g.ax.patches, df_bar, target_col)
    g.savefig(outfile)


def make_barplot_perm(df_bar: pd.DataFrame, outfile: Path,
                      target_col: str, feature: str,
                      target_kind: str = "") -> None:
    """
    Create a point plot showing the value of `target_col` for every \
    community of exons/gene in `df_bar`, with the corresponding control \
    values (permutation test).

    :param df_bar: A dataframe with the value of `target_col` of every \
    feature, their community and the 'p-adj' of the community. Lines \
    whose feature id is 'ctrl' contain the control values.
    :param outfile: File were the figure will be stored
    :param target_kind: An optional name that describe a bit further \
    target_col.
    :param target_col: The name of the column containing the data of interest
    :param feature: The kind of feature of interest
    """
    sns.set(context="poster")
    is_ctrl = df_bar[f"id_{feature}"] == 'ctrl'
    df_ctrl = df_bar.loc[is_ctrl, :]
    df_bar = df_bar.loc[~is_ctrl, :].copy()
    palette = ["darkgray"] * df_bar["community"].nunique()
    # The bar plot is not saved: its bars are only used to know where the
    # significance stars must be written.
    g2 = sns.catplot(x="community", y=target_col, data=df_bar, kind="bar",
                     ci="sd", aspect=2.5, height=14, errwidth=0.5, capsize=.4,
                     palette=palette)
    g = sns.catplot(x="community", y=target_col, data=df_bar, kind="point",
                    ci="sd", aspect=2.5, height=14, errwidth=0.5, capsize=.4,
                    scale=0.5, palette=palette)
    xrange = g.ax.get_xlim()
    df_ctrl.plot(x="community", y=target_col, kind="scatter", ax=g.ax,
                 yerr="ctrl_std", legend=False, zorder=10,
                 color=(0.8, 0.2, 0.2, 0.4))
    # drawing the control values must not change the x limits
    g.ax.set_xlim(xrange)
    g.fig.subplots_adjust(top=0.9)
    title_kind = f" ({target_kind})" if target_kind else ""
    g.fig.suptitle(f"Mean frequency of {target_col}{title_kind} "
                   f"among community of {feature}s\n"
                   f"(stats obtained with a permutation test)")
    g.set(xticklabels=[])
    g.ax.set_ylabel(f'Frequency of {target_col}')
    _annotate_significance(g.ax, g2.ax.patches, df_bar, target_col)
    g.savefig(outfile)


def barplot_creation(df_bar: pd.DataFrame, outfig: Path,
                     cpnt: str, test_type: str, feature: str,
                     target_kind) -> None:
    """
    Create the figure showing the value of `cpnt` in every community \
    with the function matching the kind of test performed.

    :param df_bar: A dataframe with the enrichment of a \
    nucleotide frequency for every community and showing the frequency \
    of each feature in each community
    :param outfig: File were the figure will be stored
    :param cpnt: The name of the column of interest
    :param test_type: The type of test made ('lm'; any other value is \
    handled as a permutation test)
    :param feature: The kind of feature of interest
    :param target_kind: An optional name that describe a bit further \
    target_col.
    """
    plot_maker = make_barplot if test_type == "lm" else make_barplot_perm
    plot_maker(df_bar, outfig, cpnt, feature, target_kind)


def get_feature_by_community(df: pd.DataFrame, feature: str) -> Dict:
    """
    Create a dictionary containing the features contained in each community.

    Features without community (missing value) are ignored.

    :param df: A dataframe containing the id of the features and \
    their community.
    :param feature: the feature of interest (exon, gene)
    :return: A dictionary linking each community to the features it contains

    >>> dataf = pd.DataFrame({"id_gene": ['1', '2', '3', '4', '5'],
    ... 'community': ['C1', 'C1', 'C2', 'C2', np.nan]})
    >>> get_feature_by_community(dataf, 'gene')
    {'C1': ['1', '2'], 'C2': ['3', '4']}
    >>> dataf.rename({"id_gene": "id_exon"}, axis=1, inplace=True)
    >>> get_feature_by_community(dataf, 'exon')
    {'C1': ['1', '2'], 'C2': ['3', '4']}
    """
    dic: Dict = {}
    for com, id_ft in zip(df['community'], df[f'id_{feature}']):
        # pd.isna covers None and NaN: a missing community read from a
        # table is NaN, which `is not None` does not detect.
        if pd.isna(com):
            continue
        dic.setdefault(com, []).append(id_ft)
    return dic


def create_community_fig(df: pd.DataFrame, feature: str,
                         target_col: str,
                         outfile_ctrl: Path, test_type: str,
                         dic_com: Optional[Dict] = None,
                         target_kind: str = "",
                         iteration: int = 10000) -> None:
    """
    Create a dataframe with a control community, save it as a table and \
    as a barplot figure.

    Two tables are written next to the figure: '<name>.txt' (statistics \
    by community) and '<name>_bar.txt' (data used to draw the figure).

    :param df: A dataframe containing the id of the chosen `feature` \
    (i.e FasterDB id of genes or exons) a column with data for interest (\
    this column must have the name *target_col*) and the community \
    and the size of the community of the feature if it has one (None, else).
    :param feature: The kind of feature analysed
    :param target_col: The name of the column containing the data of interest
    :param outfile_ctrl: file used to stored the table and the figure \
    containing the test communities and the control community
    :param test_type: The type of test to make (permutation or lm)
    :param dic_com: A dictionary linking each community to the features \
    it contains. It is built from `df` if not given (only used by the \
    permutation test).
    :param target_kind: An optional name that describe a bit further \
    target_col.
    :param iteration: The number of sub samples to create
    """
    outfile_ctrl = Path(outfile_ctrl)
    if test_type == "lm":
        ndf, rdf = lm_with_ctrl(df, target_col, outfile_ctrl)
        df_bar = expand_results_lm(ndf, rdf, target_col, feature)
    else:
        if dic_com is None:
            dic_com = get_feature_by_community(df, feature)
        rdf = perm_with_ctrl(df, feature, target_col, dic_com, iteration)
        df_bar = expand_results_perm(df, rdf, target_col, feature, iteration)
    # Only the extension of the file is changed: a '.pdf' located elsewhere
    # in the path (e.g. in a folder name) is left untouched.
    rdf.to_csv(outfile_ctrl.with_suffix(".txt"), sep="\t", index=False)
    bar_outfile = outfile_ctrl.with_name(f"{outfile_ctrl.stem}_bar.txt")
    df_bar.to_csv(bar_outfile, sep="\t", index=False)
    barplot_creation(df_bar, outfile_ctrl, target_col, test_type, feature,
                     target_kind)