diff --git a/src/find_interaction_cluster/__main__.py b/src/find_interaction_cluster/__main__.py index 71a3bcc3bec01abe717ee3219cf54c9199f32a99..37102fa6918cacba2c6e068d85b068acc3e4c4c9 100644 --- a/src/find_interaction_cluster/__main__.py +++ b/src/find_interaction_cluster/__main__.py @@ -13,17 +13,25 @@ from .config import ConfigGraph from typing import List from .install_hipMCL import install_hipmcl from .sf_and_communities import multiple_stat_launcher -from .nt_and_community import multiple_nt_lmm_launcher +from .nt_and_community import multiple_nt_lm_launcher +from .create_ppi_files import ppi_stat_launcher import logging from ..logging_conf import logging_def +from .colocalisation_n_ppi_analysis import coloc_ppi_stat_main -@lp.parse -def launcher(weight: List[int] = (1), - global_weight: List[int] = (0), +@lp.parse(weight=range(1, 11), global_weight=range(11), + feature=('gene', 'exon'), region=('', 'exon', 'intron', 'gene'), + iteration="iteration > 20", test_type=('lm', 'perm'), + component_type=("nt", "aa", "dnt")) +def launcher(weight: int = 1, + global_weight: int = 0, same_gene: bool = True, ps: int = ConfigGraph.cpu, - html_fig: bool = False, feature: str = 'exon', + html_fig: bool = False, feature: str = 'exon', region: str = '', + component_type: str = 'nt', + iteration: int = 1000, test_type: str = "lm", + project: str = "GSM1018963_GSM1018964", logging_level: str = "DISABLE"): """ Script used to find communities inside exon co-localized within a project @@ -38,7 +46,17 @@ def launcher(weight: List[int] = (1), same gene (True) or not (False) (default True) :param html_fig: True to display the html figure (default False). :param feature: The feature we want to analyse (default 'exon') + :param region: The region of a gene to analyse (used only if feature \ + is 'gene') (default '') (can be 'gene', 'exon', 'intron'). + :param component_type: The type of component to analyse; It \ + can be 'nt', 'dnt' or 'aa' (default 'nt'). :param logging_level: The level of data to display (default 'DISABLE') + :param iteration: the number of iteration for ppi and frequency analysis + :param project: The project name of interest \ + (only used is global_weight = 0). + :param test_type: The king of test to perform for frequency analysis. \ + (default 'lmm') (choose from 'lmm', 'perm') + :param ps: The number of processes to use """ logging_def(ConfigGraph.community_folder, __file__, logging_level) if feature == "gene" and not same_gene: @@ -47,12 +65,20 @@ def launcher(weight: List[int] = (1), same_gene = True if not ConfigGraph.get_hipmcl_prog().is_file(): install_hipmcl("INFO") - multiple_community_launcher(1, weight, global_weight, same_gene, html_fig, - feature, logging_level) - # multiple_stat_launcher(ps, weight, global_weight, same_gene, feature, - # logging_level) - multiple_nt_lmm_launcher(ps, weight, global_weight, same_gene, feature, - logging_level) + multiple_community_launcher(weight, global_weight, project, same_gene, + html_fig, feature, logging_level) + multiple_stat_launcher(ps, weight, global_weight, project, same_gene, + feature, logging_level) + multiple_nt_lm_launcher(ps, weight, global_weight, project, + same_gene, feature, region, component_type, + test_type, iteration, logging_level) + if feature == "gene": + # ppi_stat_launcher(weight, global_weight, project, same_gene, + # ConfigGraph.ppi_threshold, iteration, + # logging_level) + coloc_ppi_stat_main(weight, global_weight, project, same_gene, + iteration, logging_level) + launcher() \ No newline at end of file diff --git a/src/find_interaction_cluster/colocalisation_n_ppi_analysis.py b/src/find_interaction_cluster/colocalisation_n_ppi_analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..6e7efb6518672e2677f13baf055091c288792567 --- /dev/null +++ b/src/find_interaction_cluster/colocalisation_n_ppi_analysis.py @@ -0,0 +1,353 @@ +#!/usr/bin/env python3 + +# -*- coding: UTF-8 -*- + +""" +Description: The goal of this script is to check whether close \ +gene produce protein that interact between each other +""" + +from .config import ConfigGraph +from ..logging_conf import logging_def +import logging +from .community_finder import get_projects +from pathlib import Path +import pandas as pd +from itertools import combinations, product +from typing import List, Tuple +from doctest import testmod +import numpy as np +from .ppi_scripts.config_ppi import ConfigPPI +import seaborn as sns +from scipy.stats import mannwhitneyu +from ..fig_utils.stat_annot import add_stat_annotation + + +def get_gene_list(gene_list: str) -> List[int]: + """ + From a string containing a list of id of genes separated by a comma, \ + return a list of those id. + + :param gene_list: A list of gene id separated by a comma + :return: The list of gene id + + >>> get_gene_list("1, 2, 3") + [1, 2, 3] + """ + id_gene_list = list(map(int, gene_list.split(', '))) + return sorted(id_gene_list) + + +def get_couples_in_community(row: pd.Series) -> pd.DataFrame: + """ + From a line of a dataframe containing a community, return each \ + couples of gene in that community. + + :param row: line of a dataframe containing a community + :return: A dataframe containing couples of gene in that community + + >>> data = pd.Series({"community": "C1", + ... "nodes": 3, + ... "genes": "1, 2, 3"}) + >>> get_couples_in_community(data) + gene1 gene2 community_size community + 0 1 2 3 C1 + 1 1 3 3 C1 + 2 2 3 3 C1 + """ + couples = combinations(get_gene_list(row['genes']), 2) + dic = {"gene1": [], "gene2": []} + for gene1, gene2 in couples: + dic["gene1"].append(gene1) + dic["gene2"].append(gene2) + dic['community_size'] = [row['nodes']] * len(dic["gene1"]) + dic['community'] = [row['community']] * len(dic["gene1"]) + return pd.DataFrame(dic) + + +def edge_community_table(df_com: pd.DataFrame) -> pd.DataFrame: + """ + From a dataframe containing the communities of co-localised genes in the \ + nucleus, create a table showing each couple of gene in each community. + + :param df_com: dataframe containing the communities of co-localised \ + genes in the nucleus + :return: table showing each couple of gene in each community. + + >>> data = pd.DataFrame({"community": ["C1", "C2"], + ... "nodes": [3, 2], + ... "genes": ["1, 2, 3", "4, 5"]}) + >>> edge_community_table(data) + gene1 gene2 community_size community + 0 1 2 3 C1 + 1 1 3 3 C1 + 2 2 3 3 C1 + 3 4 5 2 C2 + """ + list_df = [get_couples_in_community(df_com.iloc[i, :]) + for i in range(df_com.shape[0])] + return pd.concat(list_df, axis=0, ignore_index=True) + + +def ppi_good_order(line: str) -> List[int]: + """ + From a line of a file coming containing each couple of genes whose \ + proteins interact with each other return the line with the proper order. + + :param line: a line of a file coming containing each couple of genes \ + whose proteins interact with each other + :return: The lowest gene id, the highest gene id and their \ + interaction score + + >>> ppi_good_order("ENSP00000000233\\tENSP00000272298\\t490\\t" + + ... "5764\\t3926\\n") + [3926, 5764, 490] + """ + cline = line.strip().split("\t") + return sorted([int(cline[3]), int(cline[4])]) + [int(cline[2])] + + +def load_fasterdb_ppi(fasterdb_ppi: Path) -> pd.DataFrame: + """ + Return a file containing each couple of genes whose proteins \ + interact with each other. + + :param fasterdb_ppi: A file containing couples of genes whose proteins \ + interact with each other + :return: The dataframe of the interaction between genes \ + at protein level and their score. + + >>> from tempfile import NamedTemporaryFile + >>> f = NamedTemporaryFile(mode='w') + >>> _ = f.write("ENSP00000000233\\tENSP00000272298\\t490\\t" + + ... "5764\\t3926\\n" + "ENSP00000000233\\tENSP00000253401\\t198\\t" + + ... "5764\\t3273\\n") + >>> f.flush() + >>> mfile = Path(f.name) + >>> load_fasterdb_ppi(mfile) + gene1 gene2 string_score + 0 3926 5764 490 + 1 3273 5764 198 + """ + with fasterdb_ppi.open("r") as ppi_file: + content = [ + ppi_good_order(line) + for line in ppi_file + if not line.startswith("protein1") + ] + return pd.DataFrame(content, columns=['gene1', 'gene2', 'string_score']) + + +def merge_community_and_score(df_cpl: pd.DataFrame, ppi_score: pd.DataFrame + ) -> pd.DataFrame: + """ + :param df_cpl: table showing each couple of gene in each community \ + at gene level + :param ppi_score: The dataframe of the interaction between genes \ + at protein level and their score. + :return: df_com with string score + + >>> d1 = pd.DataFrame({"gene1": [1, 1, 2, 4], "gene2": [2, 3, 3, 5], + ... "community_size": [3, 3, 3, 2], "community": ["C1", "C1", "C1", "C2"]}) + >>> d2 = pd.DataFrame({"gene1": [1, 1, 2, 9], "gene2": [2, 3, 3, 8], + ... "string_score": [10, 20, 30, 40]}) + >>> merge_community_and_score(d1, d2) + gene1 gene2 community_size community string_score + 0 1 2 3 C1 10.0 + 1 1 3 3 C1 20.0 + 2 2 3 3 C1 30.0 + 3 4 5 2 C2 NaN + """ + return df_cpl.merge(ppi_score, how="left", on=["gene1", "gene2"]) + + +def get_gene_in_community(df_comp: pd.DataFrame) -> List[int]: + """ + Get all the couples of exons inter-community + + :param df_comp: table showing each couple of gene in each community \ + at gene level + :return: The list of gene inside community + + >>> data = pd.DataFrame({"community": ["C1", "C2"], + ... "nodes": [3, 2], + ... "genes": ["1, 2, 3", "4, 5"]}) + >>> get_gene_in_community(data) + [1, 2, 3, 4, 5] + """ + gene_list = [] + for i in range(df_comp.shape[0]): + gene_list += get_gene_list(df_comp.iloc[i, :]["genes"]) + return gene_list + + +def get_couple_of_ctrl_exon(community_gene: str, all_genes: List[int] + ) -> List[Tuple[int, int]]: + """ + Get every control couple of genes, (couples of genes inter-community). + + :param community_gene: A string containing the genes of interest. + :param all_genes: Every genes in every community at dna level + :return: Every inter-community couple + + >>> get_couple_of_ctrl_exon('1, 2', [1, 2, 3, 4, 5]) + [(1, 3), (1, 4), (1, 5), (2, 3), (2, 4), (2, 5)] + """ + cgenes = get_gene_list(community_gene) + all_genes = [g for g in all_genes if g not in cgenes] + values = list(product(cgenes, all_genes)) + return [v for v in values if v[0] < v[1]] + + +def get_very_ctrl_couple(df_comp: pd.DataFrame) -> pd.DataFrame: + """ + Get a dataframe containing all the control couple of genes + + :param df_comp: dataframe containing the communities of co-localised \ + genes in the nucleus + :return: A dataframe containing all the control couples of genes + + >>> data = pd.DataFrame({"community": ["C1", "C2"], + ... "nodes": [3, 2], + ... "genes": ["1, 2, 3", "4, 5"]}) + >>> get_very_ctrl_couple(data) + gene1 gene2 community_size community + 0 1 4 NaN C-CTRL + 1 1 5 NaN C-CTRL + 2 2 4 NaN C-CTRL + 3 2 5 NaN C-CTRL + 4 3 4 NaN C-CTRL + 5 3 5 NaN C-CTRL + """ + all_genes = get_gene_in_community(df_comp) + ctrl_couples = [] + for i in range(df_comp.shape[0]): + ctrl_couples += get_couple_of_ctrl_exon(df_comp.iloc[i, :]["genes"], + all_genes) + df = pd.DataFrame(ctrl_couples, columns=["gene1", "gene2"]) + df["community_size"] = [np.nan] * df.shape[0] + df["community"] = ["C-CTRL"] * df.shape[0] + return df + + +def create_scored_dataframe(df_comp: pd.DataFrame, fasterdb_ppi: Path + ) -> pd.DataFrame: + """ + Create a dataframe containing couple of exons \ + inside a community at dna level and the control couples between \ + communities and their string score. + + :param df_comp: dataframe containing the communities of co-localised \ + genes in the nucleus + :param fasterdb_ppi: A file containing couples of genes whose proteins \ + interact with each other + :return: a dataframe containing couple of exons \ + inside a community at dna level and the control couples between \ + communities and their string score. + + >>> from tempfile import NamedTemporaryFile + >>> f = NamedTemporaryFile(mode='w') + >>> _ = f.write("P1\\tP2\\t490\\t1\\t2\\n" + "P3\\tP4\\t198\\t1\\t3\\n" + ... "P3\\tP4\\t199\\t1\\t4\\n" + "P3\\tP4\\t200\\t1\\t5\\n" + ... "P3\\tP4\\t202\\t2\\t3\\n" + "P3\\tP4\\t208\\t2\\t4\\n" + ... "P3\\tP4\\t398\\t2\\t5\\n" + "P3\\tP4\\t298\\t3\\t4\\n" + ... "P3\\tP4\\t498\\t5\\t3\\n") + >>> f.flush() + >>> mfile = Path(f.name) + >>> data = pd.DataFrame({"community": ["C1", "C2"], + ... "nodes": [3, 2], + ... "genes": ["1, 2, 3", "4, 5"]}) + >>> create_scored_dataframe(data, mfile) + gene1 gene2 community_size community string_score group + 0 1 2 3.0 C1 490.0 community + 1 1 3 3.0 C1 198.0 community + 2 2 3 3.0 C1 202.0 community + 3 4 5 2.0 C2 NaN community + 4 1 4 NaN C-CTRL 199.0 ctrl + 5 1 5 NaN C-CTRL 200.0 ctrl + 6 2 4 NaN C-CTRL 208.0 ctrl + 7 2 5 NaN C-CTRL 398.0 ctrl + 8 3 4 NaN C-CTRL 298.0 ctrl + 9 3 5 NaN C-CTRL 498.0 ctrl + """ + df_ctrl = get_very_ctrl_couple(df_comp) + df_real = edge_community_table(df_comp) + full_df = pd.concat([df_real, df_ctrl], axis=0, ignore_index=True) + df_score = load_fasterdb_ppi(fasterdb_ppi) + full_df = merge_community_and_score(full_df, df_score) + full_df['group'] = ["community"] * full_df.shape[0] + full_df.loc[full_df['community'] == "C-CTRL", "group"] = "ctrl" + return full_df.drop_duplicates() + + +def create_figure(df_full: pd.DataFrame, outfile: Path) -> None: + """ + Create a barplot of string confidence score. + + :param df_full: A dataframe showing the interactions between \ + genes in same community. + + :param outfile: The file ythat will contains the barplot + """ + sns.set(context="talk") + g = sns.catplot(x="group", y="string_score", data=df_full, kind="bar", + ci="sd", aspect=1.7, height=12) + _, p = mannwhitneyu(df_full.loc[df_full['group'] == 'community', + 'string_score'].values, + df_full.loc[df_full['group'] == 'community', + 'string_score'].values) + add_stat_annotation(g.ax, data=df_full, x="group", y="string_score", + box_pair_list=[("community", "ctrl", p)]) + g.savefig(outfile) + + +def coloc_ppi_stat_main(weight: int, global_weight: int, + project: str, same_gene: bool, iteration: int = 1000, + logging_level: str = "DISABLE"): + """ + Launch the statistical tests allowing to determine if interaction between \ + genes at dna level as an influence on the interactions at protein level. + + :param weight: The weight of interaction to consider + :param global_weight: The global weighs to consider. if \ + the global weight is equal to 0 then the project `project` is \ + used. + :param project: The project name, used only if global_weight = 0 + :param same_gene: Say if we consider as co-localised exon within the \ + same gene + :param threshold: The minimum threshold needed to consider the interaction + :param iteration: The number of iteration to make + :param logging_level: Level of information to display + """ + ConfigGraph.community_folder.mkdir(exist_ok=True, parents=True) + logging_def(ConfigGraph.community_folder, __file__, logging_level) + logging.info("Checking if gene communities at DNA level have an " + "influence on communities at protein level") + project = get_projects(global_weight, project) + logging.debug("Calculating stats...") + community_file = ConfigGraph.get_community_file(project, weight, + global_weight, + same_gene, "gene", + f".txt") + + df_com = pd.read_csv(community_file, sep="\t") + df_com = df_com[df_com['nodes'] >= 10].copy() + full_df = create_scored_dataframe(df_com, ConfigPPI.fasterdb_ppi) + outfile = ConfigGraph.get_community_file(project, weight, global_weight, + same_gene, "gene", + f"_interation_gene-protein.pdf", + "community_gene_vs_protein") + create_figure(full_df, outfile) + logging.info(f"{full_df.shape[0]} total couples (with ctrl)") + logging.info(f"{full_df[-np.isnan(full_df['string_score'])].shape[0]} " + f"couples with a defined string score") + tmp = full_df[full_df['group'] == 'community'].shape[0] + logging.info(f"{tmp} tests couples") + tmp = full_df.loc[(-np.isnan(full_df['string_score'])) & + (full_df['group'] == 'community'), :].shape[0] + logging.info(f"{tmp} tests couples with a defined string score") + + + +if __name__ == "__main__": + testmod() \ No newline at end of file diff --git a/src/find_interaction_cluster/community_figures/__file__.py b/src/find_interaction_cluster/community_figures/__file__.py new file mode 100644 index 0000000000000000000000000000000000000000..60f3a118d41cf7494b1cd067bf5d649b42ba1e05 --- /dev/null +++ b/src/find_interaction_cluster/community_figures/__file__.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 + +# -*- coding: UTF-8 -*- + +""" +Description: +""" diff --git a/src/find_interaction_cluster/community_figures/__main__.py b/src/find_interaction_cluster/community_figures/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..7cef8dd4ac65fccb0d7296154482de8f1822c3dd --- /dev/null +++ b/src/find_interaction_cluster/community_figures/__main__.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 + +# -*- coding: UTF-8 -*- + +""" +Description: Create a barplot showing the frequency/value of a particular \ +item in every community of genomic features (genes of exons) that are close \ +in the nucleus compared to a control list of features. +""" + +import lazyparser as lp +import pandas as pd +from pathlib import Path +from .fig_functions import create_community_fig + + +class FileNameError(Exception): + pass + + +class MissingColumnError(Exception): + pass + + +def load_and_check_table(table: str, feature: str, target_col: str): + """ + Load a file containing a dataframe. It must contains the following \ + columns: id_feature, target_col, community and community_size. + + + :param table: A file containing a table with the id of the chosen \ + `feature` (i.e FasterDB id of genes or exons), a column with data of \ + interest ( this column must have the name *target_col*) and two columns \ + with the community and the size of the community of the feature if it \ + has one (None, else). + :param feature: The kind of feature analysed + :param target_col: The name of the column containing the data of interest + :return: The loaded dataframe + """ + if table.endswith(".gz"): + df = pd.read_csv(table, sep="\t", compression="gzip") + else: + df = pd.read_csv(table, sep="\t") + required_cols = [f"id_{feature}", target_col, "community", + "community_size"] + for rqd in required_cols: + if rqd not in df.columns: + raise MissingColumnError(f"The column {rqd} is missing !") + return df + + +@lp.parse(table="file", output="folder", test_type=["lm", "permutation"], + iteration="20 < iteration") +def create_community_figures(table: str, feature: str, target_col: str, + output: str, outfile: str, test_type: str, + target_kind: str = "", + iteration: int = 10000) -> None: + """ + Create a dataframe with a control community, save it as a table and \ + as a barplot figure. + + :param table: A file containing a table with the id of the chosen \ + `feature` (i.e FasterDB id of genes or exons), a column with data of \ + interest ( this column must have the name *target_col*) and two columns \ + with the community and the size of the community of the feature if it \ + has one (None, else). + :param feature: The kind of feature analysed (exon or gene) + :param target_col: The name of the column containing the data of interest + :param output: The output folder + :param outfile: The name of the output figure file (pdf format) + :param test_type: The type of test to make (permutation or lm) + :param target_kind: An optional name that describe a bit further \ + target_col. + :param iteration: The number of sub samples to create. This parameter \ + is only used if test_type = 'permutation' (default 10000). + """ + df = load_and_check_table(table, feature, target_col) + if not outfile.endswith(".pdf"): + raise FileNameError("The output figure must be in pdf format !") + moutfile = Path(output) / outfile + create_community_fig(df, feature, target_col, moutfile, test_type, + target_kind=target_kind, iteration=iteration) + + +if __name__ == "__main__": + create_community_figures() diff --git a/src/find_interaction_cluster/community_figures/fig_functions.py b/src/find_interaction_cluster/community_figures/fig_functions.py new file mode 100644 index 0000000000000000000000000000000000000000..124bf0db76fd59256cf69caf18cfedf6d497548b --- /dev/null +++ b/src/find_interaction_cluster/community_figures/fig_functions.py @@ -0,0 +1,498 @@ +#!/usr/bin/env python3 + +# -*- coding: UTF-8 -*- + +""" +Description: +""" + +import pandas as pd +from pathlib import Path +from typing import Dict, Tuple, List, Optional +from tqdm import tqdm +from rpy2.robjects import r, pandas2ri +from statsmodels.stats.multitest import multipletests +import numpy as np +from ..radomization_test_ppi import get_pvalue +import seaborn as sns + + +def get_community_table(communities: List[List[str]], + size_threshold: int, feature: str) -> pd.DataFrame: + """ + return the table indicating the name of the exons and the \ + the name of the community. + + :param communities: List of community of exons + :param size_threshold: The required size a community must \ + have to be considered + :param feature: The kind of feature analysed + :return: table of community + >>> c = [['1_1', '2_5'], ['7_9', '4_19', '3_3']] + >>> get_community_table(c, 3, 'exon') + community id_exon community_size + 0 C2 7_9 3 + 1 C2 4_19 3 + 2 C2 3_3 3 + >>> c = [['1', '2'], ['7', '49', '3']] + >>> get_community_table(c, 3, 'gene') + community id_gene community_size + 0 C2 7 3 + 1 C2 49 3 + 2 C2 3 3 + """ + dic = {"community": [], f"id_{feature}": [], "community_size": []} + for k, comm in enumerate(communities): + if len(comm) >= size_threshold: + name = f'C{k + 1}' + clen = len(comm) + for exon in comm: + dic["community"].append(name) + dic[f'id_{feature}'].append(exon) + dic["community_size"].append(clen) + return pd.DataFrame(dic) + + +def lm_maker_summary(df: pd.DataFrame, outfile: Path, target_col: str + ) -> pd.DataFrame: + """ + Make the lm analysis to see if the exon regulated by a splicing factor \ + are equally distributed among the communities. + + :param df: A dataframe containing the id of the chosen `feature` \ + (i.e FasterDB id of genes or exons) a column with data for interest (\ + this column must have the name *target_col*) and the community \ + and the size of the community of the feature if it has one (None, else). + :param outfile: A name of a file + :param target_col: The name of the column containing the data of interest + :return: the pvalue of lm + """ + pandas2ri.activate() + lmf = r( + """ + require("DHARMa") + + function(data, folder, partial_name) { + mod <- lm(%s ~ log(community_size) + community, data=data) + simulationOutput <- simulateResiduals(fittedModel = mod, n = 250) + png(paste0(folder, "/dignostics_summary", partial_name, ".png")) + plot(simulationOutput) + dev.off() + return(as.data.frame(summary(mod)$coefficients)) + } + + """ % target_col) + folder = outfile.parent / "diagnostics" + folder.mkdir(parents=True, exist_ok=True) + partial_name = outfile.name.replace('.pdf', '') + df.to_csv(f'frequency_{target_col}.txt', sep="\t", index=False) + res_df = lmf(df, str(folder), partial_name).reset_index() + res_df.rename({'index': 'community'}, inplace=True, axis=1) + res_df['community'] = res_df['community'].str.replace('community', '') + res_df.loc[res_df['community'] == "(Intercept)", "community"] = "C-CTRL" + mean_df = df[[target_col, "community", "community_size"]]. \ + groupby(["community", "community_size"]).mean().reset_index() + return res_df.merge(mean_df, how="left", on="community") + + +def lm_with_ctrl(df: pd.DataFrame, + target_col: str, outfile: Path, + ) -> Tuple[pd.DataFrame, pd.DataFrame]: + """ + :param df: A dataframe containing the id of the chosen `feature` \ + (i.e FasterDB id of genes or exons) a column with data for interest (\ + this column must have the name *target_col*) and the community \ + and the size of the community of the feature if it has one (None, else). + :param target_col: The name of the column containing the data of interest + :param outfile: File that will contains the final figure + :return: The dataframe with ctrl exon and \ + The dataframe with the p-value compared to the control \ + list of feature. + """ + df['community'] = df['community'].fillna("C-CTRL") + size = df.loc[df["community"] == "C-CTRL", :].shape[0] + df['community_size'] = df['community_size'].fillna(size) + df['community_size'] = df['community_size'].astype(int) + return df, lm_maker_summary(df, outfile, target_col) + + +def expand_results_lm(df: pd.DataFrame, rdf: pd.DataFrame, + target_col: str, feature: str) -> pd.DataFrame: + """ + Merge df and rdf together. + + :param df: A dataframe containing the id of the chosen `feature` \ + (i.e FasterDB id of genes or exons) a column with data for interest (\ + this column must have the name *target_col*) and the community \ + and the size of the community of the feature if it has one (None, else). + :param rdf: The dataframe containing the mean frequency for \ + each community and the p-value of their enrichment compared to control \ + exons. + :param target_col: The name of the column containing the data of interest + :param feature: The feature of interest + :return: The merged dataframe: i.e df with the stats columns + """ + p_col = "Pr(>|t|)" + df = df[[f"id_{feature}", target_col, "community", + "community_size"]].copy() + rdf = rdf[["community", "community_size", p_col, target_col]].copy() + rdf.rename({target_col: f"mean_{target_col}", p_col: "p-adj"}, + axis=1, inplace=True) + df = df.merge(rdf, how="left", on=["community", "community_size"]) + df_ctrl = df[df["community"] == "C-CTRL"] + df = df[df["community"] != "C-CTRL"].copy() + df.sort_values(f"mean_{target_col}", ascending=True, inplace=True) + return pd.concat([df_ctrl, df], axis=0, ignore_index=True) + + +def get_permutation_mean(df_ctrl: pd.DataFrame, + cpnt: str, size: int, iteration: int) -> List[float]: + """ + Randomly sample `size` `feature` from `df_ctrl` to extract `iteration` \ + of `nt` frequencies from it. + + :param df_ctrl: A dataframe containing the frequency of each nucleotide \ + in each exons/gene in fasterdb. + :param cpnt: The component (nt, aa, dnt) of interest + :param size: The size of each sub samples to create + :param iteration: The number of sub samples to create + :return: The list of mean frequencies of `nt` in each subsample + """ + return [ + float(np.mean(df_ctrl[cpnt].sample(int(size), replace=True).values)) + for _ in range(iteration) + ] + + +def perm_community_pval(row: pd.Series, df_ctrl: pd.DataFrame, + cpnt: str, iteration: int + ) -> Tuple[float, float, float, str]: + """ + Randomly sample `size` `feature` from `df_ctrl` to extract `iteration` \ + of `nt` frequencies from it. + + :param row: A line of a dataframe containing the frequency of \ + each feature inside a community. + :param df_ctrl: A dataframe containing the frequency of each nucleotide \ + in each exons/gene in fasterdb. + :param cpnt: The component (nt, aa, dnt) of interest + :param iteration: The number of sub samples to create + :return: The ctrl mean frequency value of `nt`, its standard error \ + the pvalue and the regulation of the enrichment/impoverishment \ + of the community in `row` compared to control exons. + """ + list_values = get_permutation_mean(df_ctrl, cpnt, row["community_size"], + iteration) + pval, reg = get_pvalue(np.array(list_values), row[cpnt], iteration) + return float(np.mean(list_values)), float(np.std(list_values)), pval, reg + + +def perm_pvalues(df: pd.DataFrame, df_ctrl: pd.DataFrame, feature: str, + target_col: str, iteration: int, + dic_com: Dict) -> pd.DataFrame: + """ + Randomly sample `size` `feature` from `df_ctrl` to extract `iteration` \ + of `nt` frequencies from it. + + :param df: A dataframe containing the frequency of each nucleotide \ + in each exons belonging to a community. + :param df_ctrl: A dataframe containing the frequency of each nucleotide \ + in each exons/gene in fasterdb. + :param feature: The feature of interest (gene, exon) + :param target_col: The name of the column containing the data of interest + :param iteration: The number of sub samples to create + :param dic_com: A dictionary linking each community to the exons \ + it contains. + :return: The dataframe containing p-values and regulation \ + indicating the enrichment of + """ + list_pval, list_reg, mean_ctrl, std_ctrl = ([] for _ in range(4)) + for i in tqdm(range(df.shape[0]), desc="performing permutations"): + row = df.iloc[i, :] + res = perm_community_pval(row, + df_ctrl.loc[ + -df_ctrl[f'id_{feature}' + ].isin(dic_com[row['community']]), + :], + target_col, iteration) + [x.append(y) for x, y in zip([mean_ctrl, std_ctrl, list_pval, + list_reg], res)] + adj_pvals = multipletests(list_pval, alpha=0.05, + method='fdr_bh', + is_sorted=False, + returnsorted=False)[1] + adj_regs = [list_reg[i] if adj_pvals[i] <= 0.05 else " . " + for i in range(len(list_reg))] + df[f'{target_col}_mean_{iteration}_ctrl'] = mean_ctrl + df[f'{target_col}_std_{iteration}_ctrl'] = std_ctrl + df[f'p-adj'] = adj_pvals + df[f'reg-adj'] = adj_regs + return df + + +def perm_with_ctrl(df: pd.DataFrame, feature: str, + target_col: str, dic_com: Dict, + iteration: int) -> pd.DataFrame: + """ + + :param df: A dataframe containing the id of the chosen `feature` \ + (i.e FasterDB id of genes or exons) a column with data for interest (\ + this column must have the name *target_col*) and the community \ + and the size of the community of the feature if it has one (None, else). + :param feature: The kind of feature analysed + :param target_col: The name of the column containing the data of interest + :param dic_com: A dictionary linking each community to the exons \ + it contains. + :param iteration: The number of sub samples to create + :return: The dataframe with the p-value compared to the control \ + list of exons. + """ + df_tmp = df.loc[-df["community"].isna(), :] + mean_df = df_tmp[[target_col, "community", "community_size"]]. \ + groupby(["community", "community_size"]).mean().reset_index() + return perm_pvalues(mean_df, df, feature, target_col, + iteration, dic_com) + + +def create_perm_ctrl_df(ctrl_df: pd.DataFrame, order_df: pd.DataFrame, + cpnt: str, feature: str, iteration: int + ) -> pd.DataFrame: + """ + + :param ctrl_df: A dataframe containing the mean ctrl values, \ + the mean control std and the community from which those control \ + have been created + :param order_df: A dataframe containing the community and their final \ + order. + :param cpnt: The component (nt, aa, dnt) of interest + :param feature: The feature of interest + :param iteration: The number of iteration + :return: The ctrl_tmp_df in good order + """ + dsize = ctrl_df.shape[0] + ctrl_df[f"mean_{cpnt}"] = \ + [np.mean(ctrl_df[f"{cpnt}_mean_{iteration}_ctrl"])] * dsize + ctrl_df[f"id_{feature}"] = ['ctrl'] * dsize + ctrl_df["community_size"] = [dsize] * dsize + ctrl_df = ctrl_df.merge(order_df, how='left', on="community") + ctrl_df.rename({f"{cpnt}_mean_{iteration}_ctrl": cpnt, + f"{cpnt}_std_{iteration}_ctrl": 'ctrl_std'}, axis=1, + inplace=True) + return ctrl_df.sort_values("order", ascending=True) + + +def expand_results_perm(df: pd.DataFrame, rdf: pd.DataFrame, target_col: str, + feature: str, iteration: int) -> pd.DataFrame: + """ + Merge df and rdf together. + + :param df: A dataframe containing the id of the chosen `feature` \ + (i.e FasterDB id of genes or exons) a column with data for interest (\ + this column must have the name *target_col*) and the community \ + and the size of the community of the feature if it has one (None, else). + :param rdf: The dataframe containing the mean frequency for \ + each community and the p-value of their enrichment compared to control \ + exons. + :param target_col: The name of the column containing the data of interest + :param feature: The feature of interest + :param iteration: The number of iteration + :return: The merged dataframe: i.e df with the stats columns + """ + df = df.loc[-df["community"].isna(), + [f"id_{feature}", target_col, + "community", "community_size"]].copy() + ctrl_df = rdf[[f"{target_col}_mean_{iteration}_ctrl", + f"{target_col}_std_{iteration}_ctrl", "community"]].copy() + rdf = rdf[["community", "community_size", target_col, "p-adj"]].copy() + rdf.rename({target_col: f"mean_{target_col}"}, axis=1, inplace=True) + df = df.merge(rdf, how="left", on=["community", "community_size"]) + df.sort_values(f"mean_{target_col}", ascending=True, inplace=True) + order_df = df[["community"]].drop_duplicates().copy() + order_df["order"] = range(order_df.shape[0]) + df_ctrl = create_perm_ctrl_df(ctrl_df, order_df, target_col, feature, + iteration) + return pd.concat([df_ctrl, df], axis=0, ignore_index=True) + + +def make_barplot(df_bar: pd.DataFrame, outfile: Path, + target_col: str, feature: str, target_kind: str = "") -> None: + """ + Create a barplot showing the frequency of `nt` for every community \ + of exons/gene in `df_bar`. + + :param df_bar: A dataframe with the enrichment of a \ + nucleotide frequency for every community + :param outfile: File were the figure will be stored + :param target_kind: An optional name that describe a bit further \ + target_col. + :param target_col: The name of the column containing the data of interest + :param feature: The king of feature of interest + """ + sns.set(context="poster") + g = sns.catplot(x="community", y=target_col, data=df_bar, kind="point", + ci="sd", aspect=2.5, height=14, errwidth=0.5, capsize=.4, + scale=0.5, + palette=["red"] + ["darkgray"] * (df_bar.shape[0] - 1)) + g2 = sns.catplot(x="community", y=target_col, data=df_bar, kind="bar", + ci="sd", aspect=2.5, height=14, errwidth=0.5, capsize=.4, + palette=["red"] + ["darkgray"] * (df_bar.shape[0] - 1)) + g.fig.subplots_adjust(top=0.9) + target_kind = f" ({target_kind})" if target_kind else "" + g.fig.suptitle(f"Mean frequency of {target_col}{target_kind}" + f"among community of {feature}s\n" + f"(stats obtained with a lm test)") + g.set(xticklabels=[]) + g.ax.set_ylabel(f'Frequency of {target_col}') + df_bara = df_bar.drop_duplicates(subset="community", keep="first") + for i, p in enumerate(g2.ax.patches): + stats = "*" if df_bara.iloc[i, :]["p-adj"] < 0.05 else "" + com = df_bara.iloc[i, :]["community"] + csd = np.std(df_bar.loc[df_bar["community"] == com, target_col]) + g.ax.annotate(stats, + (p.get_x() + p.get_width() / 2., p.get_height() + csd), + ha='center', va='center', xytext=(0, 10), fontsize=12, + textcoords='offset points') + g.savefig(outfile) + + +def make_barplot_perm(df_bar: pd.DataFrame, outfile: Path, + target_col: str, feature: str, + target_kind: str = "") -> None: + """ + Create a barplot showing the frequency of `nt` for every community \ + of exons/gene in `df_bar`. + + :param df_bar: A dataframe with the enrichment of a \ + nucleotide frequency for every community + :param outfile: File were the figure will be stored + :param target_kind: An optional name that describe a bit further \ + target_col. + :param target_col: The name of the column containing the data of interest + :param feature: The king of feature of interest + """ + sns.set(context="poster") + df_ctrl = df_bar.loc[df_bar[f"id_{feature}"] == 'ctrl', :] + df_bar = df_bar.loc[df_bar[f"id_{feature}"] != 'ctrl', :].copy() + g2 = sns.catplot(x="community", y=target_col, data=df_bar, kind="bar", + ci="sd", aspect=2.5, height=14, errwidth=0.5, capsize=.4, + palette=["darkgray"] * (df_bar.shape[0])) + g = sns.catplot(x="community", y=target_col, data=df_bar, kind="point", + ci="sd", aspect=2.5, height=14, errwidth=0.5, capsize=.4, + scale=0.5, palette=["darkgray"] * (df_bar.shape[0])) + xrange = g.ax.get_xlim() + df_ctrl.plot(x="community", y=target_col, kind="scatter", ax=g.ax, + yerr="ctrl_std", legend=False, zorder=10, + color=(0.8, 0.2, 0.2, 0.4)) + g.ax.set_xlim(xrange) + g.fig.subplots_adjust(top=0.9) + target_kind = f" ({target_kind})" if target_kind else "" + g.fig.suptitle(f"Mean frequency of {target_col}{target_kind}" + f"among community of {feature}s\n" + f"(stats obtained with a permutation test)") + g.set(xticklabels=[]) + g.ax.set_ylabel(f'Frequency of {target_col}') + df_bara = df_bar.drop_duplicates(subset="community", keep="first") + for i, p in enumerate(g2.ax.patches): + stats = "*" if df_bara.iloc[i, :]["p-adj"] < 0.05 else "" + com = df_bara.iloc[i, :]["community"] + csd = np.std(df_bar.loc[df_bar["community"] == com, target_col]) + g.ax.annotate(stats, + (p.get_x() + p.get_width() / 2., p.get_height() + csd), + ha='center', va='center', xytext=(0, 10), fontsize=12, + textcoords='offset points') + g.savefig(outfile) + + +def barplot_creation(df_bar: pd.DataFrame, outfig: Path, + cpnt: str, test_type: str, feature: str, + target_kind) -> None: + """ + Reformat a dataframe with the enrichment of a nucleotide frequency \ + for every feature for every community and then create a \ + barplot showing those frequencies. + + :param df_bar: A dataframe with the enrichment of a \ + nucleotide frequency for every community and showing the frequency \ + of each feature in each community + :param outfig: File were the figure will be stored + :param cpnt: The component (nt, aa, dnt) of interest + :param test_type: The kind of test make + :param feature: The king of feature of interest + :param test_type: The type of test to make (permutation or lm) + :param target_kind: An optional name that describe a bit further \ + target_col. + """ + if test_type == "lm": + make_barplot(df_bar, outfig, cpnt, feature, target_kind) + else: + make_barplot_perm(df_bar, outfig, cpnt, feature, target_kind) + + +def get_feature_by_community(df: pd.DataFrame, feature: str) -> Dict: + """ + Create a dictionary containing the exons contained in each community. + + :param df: A dataframe containing the frequency of each nucleotide \ + in each exons belonging to a community. + :param feature: the feature of interest (exon, gene) + :return: A dictionary linking each community to the exons it contains + + >>> dataf = pd.DataFrame({"id_gene": ['1', '2', '3', '4', '5'], + ... 'community': ['C1', 'C1', 'C2', 'C2', np.nan]}) + >>> get_feature_by_community(dataf, 'gene') + {'C1': ['1', '2'], 'C2': ['3', '4']} + >>> dataf.rename({"id_gene": "id_exon"}, axis=1, inplace=True) + >>> get_feature_by_community(dataf, 'exon') + {'C1': ['1', '2'], 'C2': ['3', '4']} + """ + dic = {} + for i in range(df.shape[0]): + com, id_ft = df.iloc[i, :][['community', f'id_{feature}']] + if com is not None: + if com in dic: + dic[com].append(id_ft) + else: + dic[com] = [id_ft] + return dic + + +def create_community_fig(df: pd.DataFrame, feature: str, + target_col: str, + outfile_ctrl: Path, test_type: str, + dic_com: Optional[Dict] = None, + target_kind: str = "", + iteration: int = 10000) -> None: + """ + Create a dataframe with a control community, save it as a table and \ + as a barplot figure. + + :param df: A dataframe containing the id of the chosen `feature` \ + (i.e FasterDB id of genes or exons) a column with data for interest (\ + this column must have the name *target_col*) and the community \ + and the size of the community of the feature if it has one (None, else). + :param feature: The kind of feature analysed + :param target_col: The name of the column containing the data of interest + :param outfile_ctrl: file used to stored the table and the figure \ + containing the test communities and the control community + :param test_type: The type of test to make (permutation or lm) + :param dic_com: A dictionary linking each community to the exons \ + it contains. + :param target_kind: An optional name that describe a bit further \ + target_col. + :param iteration: The number of sub samples to create + """ + if dic_com is None: + dic_com = {} if test_type == 'lm' \ + else get_feature_by_community(df, feature) + if test_type == "lm": + ndf, rdf = lm_with_ctrl(df, target_col, outfile_ctrl) + df_bar = expand_results_lm(ndf, rdf, target_col, feature) + else: + rdf = perm_with_ctrl(df, feature, target_col, dic_com, iteration) + df_bar = expand_results_perm(df, rdf, target_col, feature, iteration) + rdf.to_csv(str(outfile_ctrl).replace(".pdf", ".txt"), sep="\t", + index=False) + bar_outfile = str(outfile_ctrl).replace(".pdf", "_bar.txt") + df_bar.to_csv(bar_outfile, sep="\t", index=False) + barplot_creation(df_bar, outfile_ctrl, target_col, test_type, feature, + target_kind) diff --git a/src/find_interaction_cluster/community_finder.py b/src/find_interaction_cluster/community_finder.py index b4180848fd5efb1c27f06fe999bad04d58512d1a..d65d5a81cacaedc6dbe18b7885e4a89fc4568fa5 100644 --- a/src/find_interaction_cluster/community_finder.py +++ b/src/find_interaction_cluster/community_finder.py @@ -253,8 +253,8 @@ def get_figure_title(project, weight, global_weight, same_gene, feature): :param feature: The kind of analysed features :return: A figure title """ - title = f"Co-localisation between {feature}s having a weight greater than " \ - f"{weight} in " + title = f"Co-localisation between {feature}s having a weight greater " \ + f"than {weight} in " if global_weight == 0: title += f"the project {project}" else: @@ -347,7 +347,7 @@ def community_finder(weight: int, global_weight: int, project: str = "", logging.debug('Done !') -def get_projects(global_weight: int) -> List[str]: +def get_projects(global_weight: int, project: str) -> str: """ Get projects name. @@ -355,69 +355,38 @@ def get_projects(global_weight: int) -> List[str]: the global weight is equal to 0 then then density figure are calculated \ by project, else all projet are merge together and the interaction \ seen in `global_weight` project are taken into account - :return: The list of the project to consider + :param project: The name of a project + :return: The project to consider """ if global_weight != 0: - return [f'Global-weight-{global_weight}'] + return f'Global-weight-{global_weight}' else: - return ConfigGraph.good_projects + return project -def get_projects_name(global_weights: List[int]) -> Tuple[List[str], Dict]: - """ - Get projects name given a list of global_weight and a dictionary linking, - each project name to it's corresponding global weight. - - :param global_weights: The list of global weights to consider. if \ - the global weight is equal to 0 then then density figure are calculated \ - by project, else all projet are merge together and the interaction \ - seen in `global_weight` project are taken into account - :return: project names and a dictionary linking, - each name to it's corresponding global weight. - """ - dic = {} - projects = [] - for global_weight in global_weights: - tmp = get_projects(global_weight) - projects += tmp - for p in tmp: - dic[p] = global_weight - return projects, dic - - -def multiple_community_launcher(ps: int, weights: List[int], - global_weights: List[int], - same_gene: bool, html_fig: bool = False, +def multiple_community_launcher(weight: int, + global_weight: int, + project: str, + same_gene: bool, + html_fig: bool = False, feature: str = 'exon', logging_level: str = "DISABLE"): """ - :param ps: The number of processes we want to use. - :param weights: The list of weights of interaction to consider - :param global_weights: The list global weights to consider. if \ - the global weight is equal to 0 then then density figure are calculated \ - by project, else all projcet are merge together and the interaction \ - seen in `global_weight` project are taken into account + :param weight: The weight of interaction to consider + :param global_weight: The global weighs to consider. if \ + the global weight is equal to 0 then the project `project` is \ + used. :param same_gene: Say if we consider as co-localised exon within the \ same gene :param html_fig: True to create an html figure, false else :param feature: The feature we want to analyse (default 'exon') + :param project: The project name, used only if global_weight = 0 :param logging_level: Level of information to display """ ConfigGraph.community_folder.mkdir(exist_ok=True, parents=True) logging_def(ConfigGraph.community_folder, __file__, logging_level) - global_weights = list(np.unique(global_weights)) - weights = list(np.unique(weights)) - projects, dic_project = get_projects_name(global_weights) - condition = list(product(projects, weights)) - processes = [] - pool = mp.Pool(processes=min(ps, len(condition))) - for project, weight in condition: - global_weight = dic_project[project] - logging.info(f'Finding community for project : {project}, ' - f'global_weight : {global_weight}, weight: {weight}') - args = [weight, global_weight, project, same_gene, html_fig, feature] - processes.append(pool.apply_async(community_finder, args)) - for proc in processes: - proc.get(timeout=None) - pool.close() - pool.join() + project = get_projects(global_weight, project) + logging.info(f'Finding community for project : {project}, ' + f'global_weight : {global_weight}, weight: {weight}') + community_finder(weight, global_weight, project, same_gene, html_fig, + feature) diff --git a/src/find_interaction_cluster/config.py b/src/find_interaction_cluster/config.py index dd566401ef4ca4f4af9eb380e5cb7126904fc002..ac3ea4925797c1eb72ebac6f5cd7d170229d62b2 100644 --- a/src/find_interaction_cluster/config.py +++ b/src/find_interaction_cluster/config.py @@ -37,7 +37,7 @@ def get_weight_folder(weight: int, global_weight: int): def get_community_file(project: str, weight: int, global_weight: int, same_gene: bool, feature: str = 'exon', - ext: str = ".txt", stat: bool = False): + ext: str = ".txt", sub_fold: str = ''): """ Get the output file of interest. @@ -51,13 +51,12 @@ def get_community_file(project: str, weight: int, global_weight: int, same gene :param the kind of feature analyzed :param ext: The file extension - :param stat: True to place the result in 'sf_community_enrichment' \ - subfolder + :param subfolder: if filled, then the data are recovered from a subfolder :return: The filename of interest """ folder = get_weight_folder(weight, global_weight) - if stat: - folder = folder / 'sf_community_enrichment' + if sub_fold != '': + folder = folder / sub_fold folder.mkdir(exist_ok=True, parents=True) if global_weight != 0: project = f"global-weight-{global_weight}" @@ -116,6 +115,3 @@ class ConfigGraph: get_hip_folder = get_hipmcl_folder get_hipmcl_prog = get_hipmcl_prog good_projects = get_good_project() - genes_communities = data / \ - "global-weight-3_weight-3_same_gene-True_gene.txt" - genes_regulation_sf_community = results / "genes_regulation_sf_community" diff --git a/src/find_interaction_cluster/create_ppi_files.py b/src/find_interaction_cluster/create_ppi_files.py new file mode 100644 index 0000000000000000000000000000000000000000..1d401736c97007912f8ec3b84253e5c7613b6fb7 --- /dev/null +++ b/src/find_interaction_cluster/create_ppi_files.py @@ -0,0 +1,373 @@ +#!/usr/bin/env python3 + +# -*- coding: UTF-8 -*- + +""" +Description: create a file of gene interaction at gene and protein levels. +""" + +from .config import ConfigGraph +from .ppi_scripts.config_ppi import ConfigPPI +from typing import List +from ..logging_conf import logging_def +import numpy as np +import logging +from .community_finder import get_projects +from itertools import product +import multiprocessing as mp +from pathlib import Path +from typing import Dict, Union +import pandas as pd +from .community_finder import create_graph, find_communities, \ + write_cytoscape_graph +from doctest import testmod +from .radomization_test_ppi import get_ppi_community_gene, \ + get_dna_community_gene, update_overlap_df,summary_randomisation_test + + +def get_community_dic_fom_community_file(mfile: Union[Path, pd.DataFrame], + community_data: bool = True) -> Dict: + """ + Read a file containing community of genes and return a dictionary \ + linking each gene to other gene in it's community. + + :param mfile: A file containing community of genes + :return: A dictionary linking a gene to all the other in it's community + """ + df = pd.read_csv(mfile, sep="\t") if isinstance(mfile, Path) else mfile + dic = {} + for i in range(df.shape[0]): + cseries = df.iloc[i, :] + list_gene = list(map(int, cseries['genes'].split(", "))) + if len(list_gene) > 1: + for gene in list_gene: + if community_data: + dic[gene] = {"links": [g for g in list_gene if g != gene], + "community": cseries['community']} + else: + dic[gene] = [g for g in list_gene if g != gene] + return dic + + +def ppi_array(fasterdb_ppi: Path, threshold: int) -> np.array: + """ + Return an array of gene having their proteins interaction with each \ + other + :param fasterdb_ppi: A file containing ppi interaction + :param threshold: The minimum theshold needed to consider the interaction + :return: An array containing the interaction + """ + df = pd.read_csv(fasterdb_ppi, sep="\t") + df['gene1'] = df['gene1'].astype(str) + df['gene2'] = df['gene2'].astype(str) + return df.loc[df['combined_score'] >= threshold, + ['gene1', 'gene2', 'combined_score']]\ + .drop_duplicates(keep="first")\ + .values + + +def write_interaction_ppi(arr_interaction: np.array, project: str, + weight: int, global_weight: int, same_gene: bool, + use_weight: bool = False): + """ + + :param arr_interaction: Each couples of co-localized feature within a \ + project. + :param project: The name of the project of interest + :param weight: The minimum weight of interaction to consider + :param global_weight: The global weight to consider. if \ + the global weight is equal to 0 then then density figure are calculated \ + by project, else all projet are merge together and the interaction \ + seen in `global_weight` project are taken into account + :param same_gene: Say if we consider as co-localised, exons within the \ + same gene (True) or not (False) (default False) + :param use_weight: Say if we want to write the weight into the result file. + """ + logging.debug('Writing interaction files ...') + outfile = ConfigGraph.get_community_file(project, weight, global_weight, + same_gene, "gene", + f"_interation_PPI_tmp.txt", + "community_gene_vs_protein") + result = ConfigGraph.get_community_file(project, weight, global_weight, + same_gene, "gene", + f"_communities_PPI_tmp.txt", + "community_gene_vs_protein") + with outfile.open('w') as f: + for exon1, exon2, cweight in arr_interaction: + if not use_weight: + cweight = 1 + f.write(f"{exon1}\t{exon2}\t{cweight}\n") + return outfile, result + + +def ppi_community_finder(fasterdb_ppi: Path, project: str, + weight: int, global_weight: int, + same_gene: bool = True, + threshold: int = 700): + """ + Find communities inside protein-protein interaction file + + :param fasterdb_ppi: File containing ppi interaction + :param project: The name of the project of interest + :param weight: The minimum weight of interaction to consider + :param global_weight: The global weight to consider. if \ + the global weight is equal to 0 then then density figure are calculated \ + by project, else all projet are merge together and the interaction \ + seen in `global_weight` project are taken into account + :param same_gene: Say if we consider as co-localised, exons within the \ + same gene (True) or not (False) (default False) + :param threshold: The minimum threshold needed to consider the interaction + """ + interaction = ppi_array(fasterdb_ppi, threshold) + outfile, result_file = write_interaction_ppi(interaction, project, + weight, global_weight, + same_gene) + graph = create_graph(interaction) + df, dic_community = find_communities(graph, project, outfile, result_file, + "gene") + logging.debug('Writing results ...') + outfiles = [ConfigGraph.get_community_file( + project, weight, global_weight, same_gene, "gene", ext, + "community_gene_vs_protein") + for ext in [f'_graph_community_PPI.txt', f'_graph_community_PPI.cyjs']] + df.to_csv(outfiles[0], sep="\t", index=False) + logging.debug("Saving the graph ...") + write_cytoscape_graph(graph, dic_community, outfiles[1]) + logging.debug('Done !') + return df + + +def create_dataframe(gene_com_dic: Dict) -> pd.DataFrame: + """ + Create a dataframe from a dictionary containing a community of gene. + + :param gene_com_dic: A dictionary linking each gene, to it's neighbors \ + at DNA level + :return: A table containing the gene, the community name of it's gene \ + and the community size of this gene + """ + content = [[gene, gene_com_dic[gene]['community'], + len(gene_com_dic[gene]['links'])] + for gene in gene_com_dic] + return pd.DataFrame(content, columns=["id_gene", "DNA_community", + "community_size"]) + + +def add_size_and_common_gene_number(df: pd.DataFrame, dic_gene: Dict, + dic_ppi: Dict, nb_col: str, + size_col: str) -> pd.DataFrame: + """ + Add a column containing the number of shared interaction between genes \ + and their product. + + :param df: A dataframe containing + :param dic_gene: A dictionary containing the gene interacting at \ + DNA level + :param dic_ppi: A dictionary contaning the interaction between protein \ + of genes. + :param nb_col: The name of the column that will stored the shared \ + interactions between genes at protein and DNA level + :param size_col: The size of the community + :return: The dataframe with two more columns. + """ + nb_content_col = [] + size_content_col = [] + for i in range(df.shape[0]): + cgene = df.iloc[i, :]["id_gene"] + if cgene in dic_ppi: + common = [g for g in dic_ppi[cgene] + if g in dic_gene[cgene]['links']] + size_content_col.append(len(dic_ppi[cgene])) + nb_content_col.append(len(common)) + else: + size_content_col.append(0) + nb_content_col.append(0) + df[nb_col] = nb_content_col + df[size_col] = size_content_col + return df + + +def filter_most_overllaping_ppi(df: pd.DataFrame, size_threshold: int + ) -> pd.DataFrame: + """ + Get the DNA community and the ppi community have the larger overlap \ + possible. + + :param df: The dataframe containing the number of \ + genes that clusters in community at DNA and protein level + :param size_threshold: the minimum size required to keep gene + community at dna level. + :return: The dataframe with only one line per DNA community + + >>> test_df = pd.DataFrame({"id_gene": range(1, 12), + ... "DNA_community": ["C1"] * 5 + ["C2"] * 5 + ["C3"], + ... "community_size": [9] * 5 + [14] * 5 + [19], + ... "nb_com-ppi": [0, 1, 2, 2, 0, 0, 1, 0, 0, 0, 0], + ... "size_com-ppi": [30, 50, 105, 102, 25, 30, 42, 47, 89, 12, 0]}) + >>> filter_most_overllaping_ppi(test_df, 10) + DNA_community community_size nb_com-ppi size_com-ppi + 0 C1 10 3 105 + 1 C2 15 2 42 + 2 C3 20 0 0 + """ + df.drop('id_gene', axis=1, inplace=True) + dna_communities = df["DNA_community"].unique() + list_df = [] + for my_community in dna_communities: + tmp_df = df.loc[(df['DNA_community'] == my_community), :] + val = np.max(tmp_df['nb_com-ppi'].values) + list_df.append(tmp_df.loc[tmp_df['nb_com-ppi'] == val, :] + .sample(1, random_state=1)) + df = pd.concat(list_df, axis=0, ignore_index=True) + df['nb_com-ppi'] = df.apply(lambda x: x['nb_com-ppi'] + 1 + if x["size_com-ppi"] > 0 + else x['nb_com-ppi'], axis=1) + df["community_size"] = df["community_size"] + 1 + return df[df["community_size"] >= size_threshold] + + +def create_community_ppi_table(community_file: Path, fasterdb_ppi: Path, + project: str, + weight: int, global_weight: int, + same_gene: bool = True, + threshold: int = 700): + """ + + :param community_file: A file containing community of gene interacting \ + at DNA level + :param fasterdb_ppi: A file containing pairs of genes having their \ + protein product interacting at protein level. + :param project: The name of the project of interest + :param weight: The weight of interaction to consider + :param global_weight: The global weight to consider. if \ + the global weight is equal to 0 then then density figure are calculated \ + by project, else all project are merge together and the interaction \ + seen in `global_weight` project are taken into account + :param same_gene: Say if we consider as co-localised exon within the \ + same gene + :param threshold: The minimum threshold needed to consider the interaction + :return: + """ + outfile = ConfigGraph.get_community_file(project, weight, global_weight, + same_gene, "gene", + '_graph_community_PPI.txt', + "community_gene_vs_protein") + if not outfile.is_file(): + logging.debug("Creating the community file for PPI") + df_comm_ppi = ppi_community_finder(fasterdb_ppi, project, weight, + global_weight, same_gene, threshold) + else: + df_comm_ppi = pd.read_csv(outfile, sep="\t") + logging.debug("Turning ppi community file into a dic") + dic_ppi_com = get_community_dic_fom_community_file(df_comm_ppi, + community_data=False) + logging.debug("Loading gene community file") + dic_gene = get_community_dic_fom_community_file(community_file, + community_data=True) + logging.debug("Creation of the gene community ad DNA level") + df = create_dataframe(dic_gene) + logging.debug("Add a column for proteins community") + df = add_size_and_common_gene_number(df, dic_gene, dic_ppi_com, + "nb_com-ppi", "size_com-ppi") + return df + + +def ppi_stats_analysis(community_file: Path, fasterdb_ppi: Path, + project: str, + weight: int, global_weight: int, + same_gene: bool = True, + threshold: int = 700, iteration: int = 1000): + """ + :param community_file: A file containing community of gene interacting \ + at DNA level + :param fasterdb_ppi: A file containing pairs of genes having their \ + protein product interacting at protein level. + :param project: The name of the project of interest + :param weight: The weight of interaction to consider + :param global_weight: The global weight to consider. if \ + the global weight is equal to 0 then then density figure are calculated \ + by project, else all projcet are merge together and the interaction \ + seen in `global_weight` project are taken into account + :param same_gene: Say if we consider as co-localised exon within the \ + same gene + :param threshold: The minimum threshold needed to consider the interaction + :param iteration: The number of iteration to make + :return: + """ + outfile = ConfigGraph.get_community_file(project, weight, + global_weight, same_gene, "gene", + f"ppi_gene_complete_table.txt", + "community_gene_vs_protein") + ppi_outfile = ConfigGraph.get_community_file(project, weight, + global_weight, + same_gene, "gene", + '_graph_community_PPI.txt', + "community_gene_vs_protein") + if not outfile.is_file(): + df = create_community_ppi_table(community_file, fasterdb_ppi, + project, weight, global_weight, + same_gene, threshold) + df.to_csv(outfile, sep="\t", index=False) + else: + df = pd.read_csv(outfile, sep="\t") + size_threshold = 10 + df_overlap = filter_most_overllaping_ppi(df, size_threshold) + ppi_gene = get_ppi_community_gene(pd.read_csv(ppi_outfile, sep="\t")) + dic_dna_gene = get_dna_community_gene(pd.read_csv(community_file, + sep="\t")) + df, dic_values = update_overlap_df(df_overlap, dic_dna_gene, ppi_gene, + iteration) + outstat = ConfigGraph.get_community_file(project, weight, global_weight, + same_gene, "gene", + f"ppi_gene_table_{iteration}_" + f"stat.txt", + "community_gene_vs_protein") + df.to_csv(outstat, sep="\t", index=False) + final_df = summary_randomisation_test(df, dic_dna_gene, ppi_gene, + iteration, dic_values, + use_seed=False) + outstat = ConfigGraph.get_community_file(project, weight, global_weight, + same_gene, "gene", + f"ppi_gene_table_{iteration}_" + f"stat_recap.txt", + "community_gene_vs_protein") + final_df.to_csv(outstat, sep="\t", index=False) + + +def ppi_stat_launcher(weight: int, + global_weight: int, + project: str, + same_gene: bool, threshold: int = 700, + iteration: int = 1000, + logging_level: str = "DISABLE"): + """ + Launch the statistical allowing to determine if interaction between \ + genes at dna level as an influence on the interactions at protein level. + + :param weight: The weight of interaction to consider + :param global_weight: The global weighs to consider. if \ + the global weight is equal to 0 then the project `project` is \ + used. + :param project: The project name, used only if global_weight = 0 + :param same_gene: Say if we consider as co-localised exon within the \ + same gene + :param threshold: The minimum threshold needed to consider the interaction + :param iteration: The number of iteration to make + :param logging_level: Level of information to display + """ + ConfigGraph.community_folder.mkdir(exist_ok=True, parents=True) + logging_def(ConfigGraph.community_folder, __file__, logging_level) + logging.info("Checking if gene communities at DNA level have an " + "influence on communities at protein level") + project = get_projects(global_weight, project) + logging.debug("Calculating stats...") + community_file = ConfigGraph.get_community_file(project, weight, + global_weight, + same_gene, "gene", + f".txt") + ppi_stats_analysis(community_file, ConfigPPI.fasterdb_ppi, project, + weight, global_weight, same_gene, threshold, iteration) + + +if __name__ == "__main__": + testmod() diff --git a/src/find_interaction_cluster/nt_and_community.py b/src/find_interaction_cluster/nt_and_community.py index 48896d40879815fffa759794518de252c62a100b..2f9d7d0e0e6608e58603d11585d8792e3c73749f 100644 --- a/src/find_interaction_cluster/nt_and_community.py +++ b/src/find_interaction_cluster/nt_and_community.py @@ -7,59 +7,73 @@ Description: The goal of this script is to see if the nucleotide \ frequency is not random between communities """ - import pandas as pd import logging import sqlite3 from .config import ConfigGraph, get_communities -from typing import List +from typing import List, Tuple, Dict from doctest import testmod from functools import reduce from pathlib import Path from rpy2.robjects import r, pandas2ri from statsmodels.stats.multitest import multipletests -import numpy as np -from .community_finder import get_projects_name +from .community_finder import get_projects from ..logging_conf import logging_def from itertools import product import multiprocessing as mp -from .sf_and_communities import get_key +from .community_figures.fig_functions import create_community_fig +from ..nt_composition.config import get_features -def get_nt_frequency(cnx: sqlite3.Connection, list_ft: List[str], - feature: str) -> pd.DataFrame: +def get_cpnt_frequency(cnx: sqlite3.Connection, list_ft: List[str], + feature: str, region: str = "", + component_type: str = "nt") -> pd.DataFrame: """ Get the frequency of every nucleotides for features in list_ft. :param cnx: Connection to chia-pet database :param list_ft: The list of exons for which we want to get :param feature: the kind of feature analysed + :param region: The region of gene analysed if feature is gene + :param component_type: The type of component to analyse; It \ + can be 'nt', 'dnt' or 'aa'. :return: the frequency of nucleotides for the list of exons. - >>> d = get_nt_frequency(sqlite3.connect(ConfigGraph.db_file), + >>> d = get_cpnt_frequency(sqlite3.connect(ConfigGraph.db_file), ... ["1_1", "1_2"], "exon") >>> d[["id_exon", 'A', 'C', 'G', 'T']] ft id_exon A C G T 0 1_1 16.63480 34.60803 32.12237 16.63480 1 1_2 16.06426 26.10442 39.75904 18.07229 - >>> d = get_nt_frequency(sqlite3.connect(ConfigGraph.db_file), + >>> d = get_cpnt_frequency(sqlite3.connect(ConfigGraph.db_file), ... ['1', '2'], "gene") >>> d[["id_gene", 'A', 'C', 'G', 'T']] ft id_gene A C G T 0 1 29.49376 18.34271 18.43874 33.72479 1 2 31.90401 16.40251 18.79033 32.90315 + >>> d = get_cpnt_frequency(sqlite3.connect(ConfigGraph.db_file), + ... ['1', '2'], "gene", 'exon', 'aa') + >>> d[["id_gene", "R", "K", "D", "Q", "E"]] + ft id_gene R K D Q E + 0 1 4.75247 5.19300 5.95391 4.07997 6.96189 + 1 2 4.34203 6.23736 6.77708 5.21984 7.01769 """ + query_region = "" if feature == "gene": list_ft = [int(ft) for ft in list_ft] + if region == "": + region = "gene" + query_region = f"AND region = '{region}'" query = f""" SELECT ft, id_{feature}, frequency FROM cin_{feature}_frequency WHERE id_{feature} IN {tuple(list_ft)} - AND ft_type="nt" + AND ft_type = '{component_type}' + {query_region} """ df = pd.read_sql_query(query, cnx) - df = df.pivot_table(index=f"id_{feature}", columns="ft", values="frequency")\ - .reset_index() + df = df.pivot_table(index=f"id_{feature}", columns="ft", + values="frequency").reset_index() df[f"id_{feature}"] = df[f"id_{feature}"].astype(str) return df @@ -78,20 +92,20 @@ def get_community_table(communities: List[List[str]], >>> c = [['1_1', '2_5'], ['7_9', '4_19', '3_3']] >>> get_community_table(c, 3, 'exon') community id_exon community_size - 0 C1 7_9 3 - 1 C1 4_19 3 - 2 C1 3_3 3 + 0 C2 7_9 3 + 1 C2 4_19 3 + 2 C2 3_3 3 >>> c = [['1', '2'], ['7', '49', '3']] >>> get_community_table(c, 3, 'gene') community id_gene community_size - 0 C1 7 3 - 1 C1 49 3 - 2 C1 3 3 + 0 C2 7 3 + 1 C2 49 3 + 2 C2 3 3 """ dic = {"community": [], f"id_{feature}": [], "community_size": []} for k, comm in enumerate(communities): if len(comm) >= size_threshold: - name = f'C{k}' + name = f'C{k + 1}' clen = len(comm) for exon in comm: dic["community"].append(name) @@ -100,15 +114,15 @@ def get_community_table(communities: List[List[str]], return pd.DataFrame(dic) -def lmm_maker(df: pd.DataFrame, outfile: Path, nt: str) -> float: +def lm_maker(df: pd.DataFrame, outfile: Path, nt: str) -> float: """ - Make the lmm analysis to see if the exon regulated by a splicing factor \ + Make the lm analysis to see if the exon regulated by a splicing factor \ are equally distributed among the communities. :param df: The dataframe :param outfile: A name of a file :param nt: the nucleotide of interest - :return: the pvalue of lmm + :return: the pvalue of lm """ pandas2ri.activate() @@ -126,7 +140,6 @@ def lmm_maker(df: pd.DataFrame, outfile: Path, nt: str) -> float: dev.off() return(anova(mod, null_mod, test="Chisq")) } - """ % (nt, nt)) folder = outfile.parent / "diagnostics" folder.mkdir(parents=True, exist_ok=True) @@ -135,45 +148,6 @@ def lmm_maker(df: pd.DataFrame, outfile: Path, nt: str) -> float: return res["Pr(>Chisq)"][1] -def lmm_maker_summary(df: pd.DataFrame, outfile: Path, nt: str - ) -> pd.DataFrame: - """ - Make the lmm analysis to see if the exon regulated by a splicing factor \ - are equally distributed among the communities. - - :param df: The dataframe - :param outfile: A name of a file - :param nt: the nucleotide of interest - :return: the pvalue of lmm - - """ - pandas2ri.activate() - lmm = r( - """ - require("DHARMa") - - function(data, folder, partial_name) { - mod <- lm(%s ~ log(community_size) + community, data=data) - simulationOutput <- simulateResiduals(fittedModel = mod, n = 250) - png(paste0(folder, "/dignostics_summary", partial_name, ".png")) - plot(simulationOutput) - dev.off() - return(as.data.frame(summary(mod)$coefficients)) - } - - """ % nt) - folder = outfile.parent / "diagnostics" - folder.mkdir(parents=True, exist_ok=True) - partial_name = outfile.name.replace('.txt', '') - res_df = lmm(df, str(folder), partial_name).reset_index() - res_df.rename({'index': 'community'}, inplace=True, axis=1) - res_df['community'] = res_df['community'].str.replace('community', '') - res_df.loc[res_df['community'] == "(Intercept)", "community"] = "C-CTRL" - mean_df = df[[nt, "community", "community_size"]].\ - groupby(["community", "community_size"]).mean().reset_index() - return res_df.merge(mean_df, how="left", on="community") - - def get_ft_id(cnx: sqlite3.Connection, feature: str = "exon") -> List[str]: """ Return the id of every gene/exons in chia-pet database. @@ -189,36 +163,136 @@ def get_ft_id(cnx: sqlite3.Connection, feature: str = "exon") -> List[str]: return [str(cid[0]) for cid in res] -def create_ctrl_community(df: pd.DataFrame, outfile: Path, - feature: str = 'exon') -> pd.DataFrame: +def create_ctrl_community(df: pd.DataFrame, + feature: str = 'exon', region: str = "", + cpnt_type: str = 'nt') -> pd.DataFrame: """ :param df: A dataframe containing the frequency of each feature in a \ community. - :param outfile: The output table containing frequencies :param feature: The kind of feature to analyse + :param region: only use if feature is 'gene'. Used to focus on \ + a given region in genes (can be gene, exon, intron). + :param cpnt_type: The type of component to analyse; It \ + can be 'nt', 'dnt' or 'aa'. :return: A dataframe containing the frequency of every nucleotides \ of every exon in a large community """ - if outfile.is_file(): - return pd.read_csv(outfile, sep="\t") size_threshold = 10 if feature == "gene" else 50 cnx = sqlite3.connect(ConfigGraph.db_file) ft_id = get_ft_id(cnx, feature) list_ft = [cid for cid in ft_id if cid not in df[f"id_{feature}"].to_list()] - df_nt = get_nt_frequency(cnx, list_ft, feature) + df_nt = get_cpnt_frequency(cnx, list_ft, feature, region, cpnt_type) df_com = get_community_table([list_ft], size_threshold, feature) df_nt = df_nt.merge(df_com, how="left", on=f"id_{feature}") df_nt['community'] = ['C-CTRL'] * df_nt.shape[0] df = pd.concat([df, df_nt], axis=0, ignore_index=True) - df.to_csv(outfile, sep="\t", index=False) return df -def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int, - global_weight: int, same_gene: bool, - nt: str, feature: str = "exon", - ) -> pd.Series: +def get_feature_by_community(df: pd.DataFrame, feature: str) -> Dict: + """ + Create a dictionary containing the exons contained in each community. + + :param df: A dataframe containing the frequency of each nucleotide \ + in each exons belonging to a community. + :param feature: the feature of interest (exon, gene) + :return: A dictionary linking each community to the exons it contains + + >>> dataf = pd.DataFrame({"id_gene": ['1', '2', '3', '4'], + ... 'community': ['C1', 'C1', 'C2', 'C2']}) + >>> get_feature_by_community(dataf, 'gene') + {'C1': ['1', '2'], 'C2': ['3', '4']} + >>> dataf.rename({"id_gene": "id_exon"}, axis=1, inplace=True) + >>> get_feature_by_community(dataf, 'exon') + {'C1': ['1', '2'], 'C2': ['3', '4']} + """ + dic = {} + for i in range(df.shape[0]): + com, id_ft = df.iloc[i, :][['community', f'id_{feature}']] + + if com in dic: + dic[com].append(id_ft) + else: + dic[com] = [id_ft] + return dic + + +def prepare_dataframe(df: pd.DataFrame, test_type: str, nt: str, + iteration: int) -> pd.DataFrame: + """ + + :param df: A dataframe with the enrichment of a \ + nucleotide frequency for every community + :param test_type: The kind of test performed to + :param nt: The nucleotide of interest + :param iteration: The number of iteration permformed to \ + produce the database + :return: The dataframe ready for barplot visualisation + """ + if test_type == "lm": + # removing the size parameter + df = df[df["community"] != "log(_size)"].copy() + df.rename({"Pr(>|t|)": "p-adj"}, axis=1, inplace=True) + df_bar = df[['community', nt, 'p-adj']] + df_ctrl = df_bar[df_bar["community"] == "C-CTRL"] + df_bar = df_bar[df_bar["community"] != "C-CTRL"].copy() + df_bar.sort_values(nt, ascending=True, inplace=True) + else: + df_bar = df[["community", nt, f"{nt}_mean_{iteration}_ctrl", + "p-adj"]].copy() + ctrl_val = df_bar[f"{nt}_mean_{iteration}_ctrl"] + df_bar.drop(f"{nt}_mean_{iteration}_ctrl", axis=1, inplace=True) + df_bar.sort_values(nt, ascending=True, inplace=True) + df_ctrl = pd.DataFrame({"community": ["C-CTRL"] * len(ctrl_val), + nt: ctrl_val}) + df_bar = pd.concat([df_ctrl, df_bar], axis=0, ignore_index=True) + return df_bar + + +def create_outfiles(project: str, weight: int, global_weight: int, + same_gene: bool, feature: str, cpnt_type: str, + cpnt: str, test_type: str + ) -> Tuple[Path, Path]: + """ + Create a file used to store diagnostics and a file used to store the \ + table containing the test communities and the control community + + :param project: The name of the project of interest + :param weight: The minimum weight of interaction to consider + :param global_weight: The global weight to consider. if \ + the global weight is equal to 0 then then density figure are calculated \ + by project, else all projet are merge together and the interaction \ + seen in `global_weight` project are taken into account + :param same_gene: Say if we consider as co-localised, exons within the \ + same gene (True) or not (False) (default False) + :param cpnt_type: The type of component to analyse; It \ + can be 'nt', 'dnt' or 'aa'. + :param cpnt: The component (nt, aa, dnt) of interest + :param feature: The kind of feature analysed + :param test_type: The type of test to make (permutation or lm) + :return: file used to store diagnostics and a file used to store the \ + table containing the test communities and the control community + """ + + outfolder = f"community_enrichment/{cpnt_type}_analysis" + outfile = ConfigGraph.\ + get_community_file(project, weight, global_weight, same_gene, feature, + f"{cpnt}-{cpnt_type}_stat_{test_type}.txt", + outfolder) + outfile_ctrl = ConfigGraph.\ + get_community_file(project, weight, global_weight, same_gene, feature, + f"{cpnt}-{cpnt_type}_VS_CTRL_stat_{test_type}.pdf", + outfolder) + return outfile, outfile_ctrl + + +def get_stat_cpnt_communities(df: pd.DataFrame, project: str, weight: int, + global_weight: int, same_gene: bool, + cpnt_type: str, cpnt: str, + dic_com: Dict, feature: str = "exon", + test_type: str = "", + iteration: int = 1000) -> pd.Series: """ Get data (proportion of `reg` regulated exons by a splicing factor `sf_name` within a community) for every community. @@ -233,41 +307,32 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int, seen in `global_weight` project are taken into account :param same_gene: Say if we consider as co-localised, exons within the \ same gene (True) or not (False) (default False) - :param nt: The nucleotide of interest - :param feature: The king of feature analysed + :param cpnt_type: The type of component to analyse; It \ + can be 'nt', 'dnt' or 'aa'. + :param cpnt: The component (nt, aa, dnt) of interest + :param dic_com: A dictionary linking each community to the exons \ + it contains. + :param feature: The kind of feature analysed + :param test_type: The type of test to make (permutation or lm) + :param iteration: The number of sub samples to create """ - logging.debug(f"lmm for {project}, w:{weight}, " - f"g:{global_weight} nt: {nt}") - res = {"project": [], "nt": [], 'pval': []} - - outfile = ConfigGraph.get_community_file(project, weight, - global_weight, - same_gene, feature, - f"{nt}_stat.txt", - True) - nfolder = outfile.parent / "nt_analysis" - nfolder.mkdir(exist_ok=True, parents=True) - noutfile = nfolder / outfile.name - pval = lmm_maker(df, noutfile, nt) - res['project'] = project - res['nt'] = nt - res['pval'] = pval - nt_ctrl_table = noutfile.parent / noutfile.name.replace("_stat.txt", - "_ctrl.txt") - ndf = create_ctrl_community(df, nt_ctrl_table, feature) - sum_df = lmm_maker_summary(ndf, outfile, nt) - outfile_ctrl = ConfigGraph.get_community_file(project, weight, - global_weight, - same_gene, feature, - f"{nt}_VS_CTRL_stat.txt", - True) - noutfile_ctrl = nfolder / outfile_ctrl.name - sum_df.to_csv(noutfile_ctrl, sep="\t", index=False) + logging.debug(f"{test_type} for {project}, w:{weight}, " + f"g:{global_weight} cpnt: {cpnt}({cpnt_type})") + outfile, outfile_ctrl = create_outfiles(project, weight, global_weight, + same_gene, feature, cpnt_type, + cpnt, test_type) + res = {"project": project, "cpnt": cpnt, + 'pval': lm_maker(df, outfile, cpnt)} + create_community_fig(df, feature, cpnt, outfile_ctrl, test_type, + dic_com, cpnt_type, iteration) return pd.Series(res) def create_dataframe(project: str, weight: int, global_weight: int, - same_gene: bool, feature: str = 'exon') -> pd.DataFrame: + same_gene: bool, feature: str = 'exon', + region: str = "", component_type: str = 'nt', + from_communities: bool = True + ) -> pd.DataFrame: """ :param project: The name of the project of interest :param weight: The minimum weight of interaction to consider @@ -278,91 +343,134 @@ def create_dataframe(project: str, weight: int, global_weight: int, :param same_gene: Say if we consider as co-localised, exons within the \ same gene (True) or not (False) :param feature: The kind of feature to analyse + :param from_communities: True if we only select gene/exons + :param region: the region of interest to extract from gene + :param component_type: The type of component to analyse; It \ + can be 'nt', 'dnt' or 'aa'. :return: A dataframe containing the frequency of every nucleotides \ of every exon in a large community """ size_threshold = 10 if feature == "gene" else 50 - result = ConfigGraph.get_community_file(project, weight, global_weight, - same_gene, feature, - "_communities.txt") - communities = get_communities(result, 0) - ncommunities = [c for c in communities if len(c) >= size_threshold] cnx = sqlite3.connect(ConfigGraph.db_file) - list_exon = reduce(lambda x, y: x + y, ncommunities) - df = get_nt_frequency(cnx, list_exon, feature) - df_com = get_community_table(communities, size_threshold, feature) - df = df.merge(df_com, how="left", on=f"id_{feature}") + if from_communities: + result = ConfigGraph.get_community_file(project, weight, global_weight, + same_gene, feature, + "_communities.txt") + communities = get_communities(result, 0) + ncommunities = [c for c in communities if len(c) >= size_threshold] + list_exon = reduce(lambda x, y: x + y, ncommunities) + + df_com = get_community_table(communities, size_threshold, feature) + df = get_cpnt_frequency(cnx, list_exon, feature, region, + component_type) + df = df.merge(df_com, how="left", on=f"id_{feature}") + else: + list_exon = get_ft_id(cnx, feature) + df = get_cpnt_frequency(cnx, list_exon, feature, region, + component_type) return df -def multiple_nt_lmm_launcher(ps: int, weights: List[int], - global_weights: List[int], - same_gene: bool, - feature: str = 'exon', - logging_level: str = "DISABLE"): +def create_dataframes(project, weight, global_weight, same_gene, feature, + region, test_type, component_type: str + ) -> Tuple[pd.DataFrame, Dict]: + """ + :param weight: The weight of interaction to consider + :param global_weight: The global weighs to consider. if \ + the global weight is equal to 0 then the project `project` is \ + used. + :param project: The project name, used only if global_weight = 0 + :param same_gene: Say if we consider as co-localised exon within the \ + same gene + :param feature: The kind of analysed feature + :param region: the region of interest to extract from gene + :param test_type: The type of test to make (permutation or lm) + :param component_type: The type of component to analyse; It \ + can be 'nt', 'dnt' or 'aa'. + """ + df = create_dataframe(project, weight, global_weight, same_gene, + feature, region, component_type) + df_ctrl = create_dataframe(project, weight, global_weight, same_gene, + feature, region, component_type, + from_communities=False) + df_ctrl = df_ctrl.loc[-df_ctrl[f"id_{feature}"].isin(df[f"id_{feature}"]), + :].copy() + dic_com = {} if test_type == 'lm' \ + else get_feature_by_community(df, feature) + df = pd.concat([df, df_ctrl], axis=0, ignore_index=True) + return df, dic_com + + +def multiple_nt_lm_launcher(ps: int, + weight: int, + global_weight: int, + project: str, + same_gene: bool, + feature: str = 'exon', + region: str = '', + component_type: str = "nt", + test_type: str = "lm", + iteration: int = 1000, + logging_level: str = "DISABLE"): """ Launch the statistical analysis for every - :param ps: The number of processes we want to use. - :param weights: The list of weights of interaction to consider - :param global_weights: The list global weights to consider. if \ - the global weight is equal to 0 then then density figure are calculated \ - by project, else all projcet are merge together and the interaction \ - seen in `global_weight` project are taken into account + :param ps: The number of processes to use + :param weight: The weight of interaction to consider + :param global_weight: The global weighs to consider. if \ + the global weight is equal to 0 then the project `project` is \ + used. + :param project: The project name, used only if global_weight = 0 :param same_gene: Say if we consider as co-localised exon within the \ same gene :param feature: The kind of analysed feature + :param component_type: The type of component to analyse; It \ + can be 'nt', 'dnt' or 'aa'. + :param region: the region of interest to extract from gene + :param test_type: The type of test to make (permutation or lmm) + :param iteration: The number of iteration to perform :param logging_level: Level of information to display """ ConfigGraph.community_folder.mkdir(exist_ok=True, parents=True) logging_def(ConfigGraph.community_folder, __file__, logging_level) - logging.info("Checking if communities as an effect on nucleotide " + if feature == "gene" and region != "exon" and component_type == "aa": + msg = f"If you looking at amino acid at gene level, then the region " \ + f"must be 'exon' not '{region}'. Setting it to exon. " \ + f"(corresponds to the concatenation of exons for all genes)" + logging.warning(msg) + region = 'exon' + logging.info(f"Checking if communities as an effect on {component_type} " "frequency") - global_weights = list(np.unique(global_weights)) - weights = list(np.unique(weights)) - projects, dic_project = get_projects_name(global_weights) - nt_list = ["A", "C", "G", "T", "S", "W"] - condition = list(product(projects, weights, nt_list)) - processes = {} + project = get_projects(global_weight, project) + cpnt_list = get_features(component_type) + condition = list(product([project], [weight], cpnt_list)) + processes = [] pool = mp.Pool(processes=min(ps, len(condition))) - logging.debug("Calculating stats...") - dic_df = {} - for project, weight, nt in condition: - global_weight = dic_project[project] - ckey = get_key(project, weight) - if ckey in dic_df: - df = dic_df[ckey] - else: - df = create_dataframe(project, weight, global_weight, same_gene, - feature) - nfile_table = ConfigGraph.get_community_file(project, weight, - global_weight, - same_gene, feature, - f"_nt_table.txt", - True) - df.to_csv(nfile_table, sep="\t", index=False) - dic_df[ckey] = df - args = [df, project, weight, global_weight, same_gene, nt, feature] - if ckey not in processes.keys(): - processes[ckey] = [pool.apply_async(get_stat_nt_communities, args)] - else: - processes[ckey].append( - pool.apply_async(get_stat_nt_communities, args)) - for p, value in processes.items(): - project, weight = p.split("_") - results = [p.get(timeout=None) for p in value] - pool.close() - pool.join() - fdf = pd.DataFrame(results) - fdf["padj"] = multipletests(fdf['pval'].values, method='fdr_bh')[1] - outfile = ConfigGraph.get_community_file(project, weight, - dic_project[project], - same_gene, feature, - f"lmm-nt_stat.txt", - True) - nfolder = outfile.parent / "nt_analysis" - noutfile = nfolder / outfile.name - fdf.to_csv(noutfile, sep="\t", index=False) + logging.debug("Creating tables") + df, dic_com = create_dataframes(project, weight, global_weight, + same_gene, feature, region, + test_type, component_type) + for project, weight, cpnt in condition: + nfile_table = ConfigGraph.get_community_file( + project, weight, global_weight, same_gene, feature, + f"_{component_type}_table.txt", "community_enrichment") + df.to_csv(nfile_table, sep="\t", index=False) + args = [df, project, weight, global_weight, same_gene, component_type, + cpnt, dic_com, feature, test_type, iteration] + processes.append(pool.apply_async(get_stat_cpnt_communities, args)) + results = [p.get(timeout=None) for p in processes] + pool.close() + pool.join() + fdf = pd.DataFrame(results) + fdf["padj"] = multipletests(fdf['pval'].values, method='fdr_bh')[1] + outfile = ConfigGraph.get_community_file(project, weight, + global_weight, + same_gene, feature, + f"lmm-{component_type}_stat.txt", + "community_enrichment") + nfolder = outfile.parent / f"{component_type}_analysis" + noutfile = nfolder / outfile.name + fdf.to_csv(noutfile, sep="\t", index=False) if __name__ == "__main__": diff --git a/src/find_interaction_cluster/ppi_scripts/__init__.py b/src/find_interaction_cluster/ppi_scripts/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..60f3a118d41cf7494b1cd067bf5d649b42ba1e05 --- /dev/null +++ b/src/find_interaction_cluster/ppi_scripts/__init__.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 + +# -*- coding: UTF-8 -*- + +""" +Description: +""" diff --git a/src/find_interaction_cluster/ppi_scripts/__main__.py b/src/find_interaction_cluster/ppi_scripts/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..f2f10e03ded5cbaf1644ba3a5f4883f1c0189483 --- /dev/null +++ b/src/find_interaction_cluster/ppi_scripts/__main__.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 + +# -*- coding: UTF-8 -*- + +""" +Description: +""" + +from .get_ensembl_hg19_gene_ccoordinates import create_protein_gene_file +from .create_gene_interaction_file import get_protein_interaction_table + +def launcher_ppi(): + """ + launch every script in this submodules. + """ + create_protein_gene_file() + get_protein_interaction_table() + + +launcher_ppi() \ No newline at end of file diff --git a/src/find_interaction_cluster/ppi_scripts/config_ppi.py b/src/find_interaction_cluster/ppi_scripts/config_ppi.py new file mode 100644 index 0000000000000000000000000000000000000000..d61820b4b05874c7d86013efed9c5018f6407623 --- /dev/null +++ b/src/find_interaction_cluster/ppi_scripts/config_ppi.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 + +# -*- coding: UTF-8 -*- + +""" +Description: This script contains all the variables needed in this \ +submodules +""" + +from ..config import ConfigGraph + + +class ConfigPPI: + """ + contains all the variables used in this submodules + """ + output_folder = ConfigGraph.output_folder / "ppi_analysis" + data = ConfigGraph.data + bed_ensembl = data / "ppi_data" / "liftover_hg38gene2hg19.bed" + ensembl_gene = data / "ppi_data" / "mart_export_hg37_proteins_genes.txt" + ppi_string = data / "ppi_data" / "9606.protein.links.v11.0.txt.gz" + protein_gene = output_folder / "protein_gene.txt" + gene_ppi_file = output_folder / "ppi_gene_file.txt" + protein_gene_fasterdb = output_folder / "protein_gene_fasterdb.txt" + fasterdb_ppi = output_folder / "fasterdb_ppi.txt" \ No newline at end of file diff --git a/src/find_interaction_cluster/ppi_scripts/create_gene_interaction_file.py b/src/find_interaction_cluster/ppi_scripts/create_gene_interaction_file.py new file mode 100644 index 0000000000000000000000000000000000000000..3c3bebabfea017d47bcfa3ae0cdef0dc53949518 --- /dev/null +++ b/src/find_interaction_cluster/ppi_scripts/create_gene_interaction_file.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python3 + +# -*- coding: UTF-8 -*- + +""" +Description: Create a file of genes having their proteins interacting with \ +each other +""" + +from pathlib import Path +from ..config import Config +from .config_ppi import ConfigPPI +import pandas as pd +from typing import Dict, Optional, Tuple + + +def load_string_ppi_file(ppi_file: Path) -> pd.DataFrame: + """ + Load the string protein protein interaction file. + + :param ppi_file: A file containing the protein protein interaction. + :return: The loaded string ppi interaction file + """ + df = pd.read_csv(ppi_file, sep=" ") + df['protein1'] = df['protein1'].str.replace("9606.", "", regex=False) + df['protein2'] = df['protein2'].str.replace("9606.", "", regex=False) + return df + + +def load_protein_gene_links(prot_gene_file: Path) -> pd.DataFrame: + """ + Load the file linking each gene to it's hg19 coordinates and encoded \ + proteins. + + :param prot_gene_file: A file linking each gene to it's hg19 \ + coordinates and encoded proteins. + :return: The loaded file + """ + df = pd.read_csv(prot_gene_file, sep="\t") + df.rename({"Chromosome-scaffold_name": "chr", "Gene_start": "start", + "Gene_end": "stop", "Strand": "strand"}, axis=1, inplace=True) + df['Gene_name'] = df['Gene_name'].str.upper() + return df + + +def load_fasterdb_gene(bed_gene: Path) -> Dict: + """ + Load a bed file containing all fasterDB gene. + + :param bed_gene: A bed file containing all fasterdb gene + :return: A dictionary linking each fasterDB gene name to its coordinates + """ + dic = {} + with bed_gene.open("r") as inbed: + for line in inbed: + line = line.replace("\n", "").split("\t") + gene_name = line[4].upper() + if gene_name not in dic.keys(): + dic[gene_name] = [[line[0], int(line[1]), int(line[2]), + int(line[3]), gene_name, line[5]]] + else: + dic[gene_name].append( + [line[0], int(line[1]), int(line[2]), + int(line[3]), gene_name, line[5]]) + return dic + + +def find_fasterdb_id(mseries: pd.Series, dic: Dict) -> Optional[int]: + """ + Find the fasterDB gene id from a line corresponding to an ensembl gene. + + :param mseries: A line containing the ensembl gene id and gene name of \ + a gene + :param dic: A dictionary linking each gene name to it's coordinate + :return: The fasterDB id of the ensembl gene. + """ + if mseries['Gene_name'].upper() not in dic.keys(): + return None + genes = dic[mseries['Gene_name'].upper()] + if len(genes) == 1: + return genes[0][3] + elif len(genes) >= 1: + for gene in genes: + if ( + mseries['chr'] == gene[0] + and mseries['start'] == gene[1] + and mseries['stop'] == gene[2] + and mseries['strand'] == gene[5] + ): + return gene[3] + return None + return None + + +def add_gene_id_column(prot_gene_df: pd.DataFrame, dic: Dict) -> pd.DataFrame: + """ + Add the fasterDB gene id to ensembl gene then only return lines in \ + the dataframe with a fasterDB gene id. + + :param prot_gene_df: A table containing ensembl gene id. + :param dic: A dictionary linking gene name to is fasterdb id. + :return: The table prot_gene_df with a column FasterDB_id + """ + prot_gene_df['FasterDB_id'] = prot_gene_df.apply(find_fasterdb_id, + axis=1, dic=dic) + df = prot_gene_df.loc[-prot_gene_df['FasterDB_id'].isna(), :].copy() + df['FasterDB_id'] = df['FasterDB_id'].astype(int) + return df + + +def add_fasterdb_id_to_ppi_file(ppi_df: pd.DataFrame, + df_fasterdb: pd.DataFrame) -> pd.DataFrame: + """ + Add two columns to the ppi_df: one containing the fasterdb gene_id \ + of the gene that encode the first interacting protein and the second \ + containing the fasterdb gene id of the gene encoding the second \ + interacting protein. + + :param ppi_df: Dataframe of interaction + :param df_fasterdb: df with the fasterDB gene id of the gene en their \ + encoding protein identified by ensembl protein id. + :return: The dataframe with the gene id of genes encoding the interacting \ + protein. + """ + df_fasterdb = df_fasterdb[["Protein_stable_ID", "FasterDB_id"]].copy() + df_fasterdb.rename({"FasterDB_id": "gene"}, axis=1, inplace=True) + ppi_df = ppi_df.merge(df_fasterdb, how="left", left_on="protein1", + right_on="Protein_stable_ID") + ppi_df = ppi_df.merge(df_fasterdb, how="left", left_on="protein2", + right_on="Protein_stable_ID", suffixes=['1', '2']) + ppi_df = ppi_df.loc[(-ppi_df['gene1'].isna()) & + (-ppi_df['gene2'].isna()), :] + ppi_df.drop(["Protein_stable_ID1", "Protein_stable_ID2"], axis=1, + inplace=True) + ppi_df['gene1'] = ppi_df['gene1'].astype(int) + ppi_df['gene2'] = ppi_df['gene2'].astype(int) + return ppi_df + + +def get_protein_interaction_table(): + """ + link ensemble gene and their encoding protein to a fasterDB gene id. + """ + df_prot_gene = load_protein_gene_links(ConfigPPI.protein_gene) + dic_gene = load_fasterdb_gene(Config.bed_gene) + df_fasterdb_id = add_gene_id_column(df_prot_gene, dic_gene) + df_fasterdb_id.to_csv(ConfigPPI.protein_gene_fasterdb, sep="\t", + index=False) + ppi_df = load_string_ppi_file(ConfigPPI.ppi_string) + ppi_fasterdb = add_fasterdb_id_to_ppi_file(ppi_df, df_fasterdb_id) + ppi_fasterdb.to_csv(ConfigPPI.fasterdb_ppi, sep="\t", index=False) diff --git a/src/find_interaction_cluster/ppi_scripts/get_ensembl_hg19_gene_ccoordinates.py b/src/find_interaction_cluster/ppi_scripts/get_ensembl_hg19_gene_ccoordinates.py new file mode 100644 index 0000000000000000000000000000000000000000..0f2c9fa94c751d03bde3351dff9a3f5367ab6a3d --- /dev/null +++ b/src/find_interaction_cluster/ppi_scripts/get_ensembl_hg19_gene_ccoordinates.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 + +# -*- coding: UTF-8 -*- + +""" +Description: The goal of this script is to get only coding gene from \ +ensembl.. +""" + +from .config_ppi import ConfigPPI +import pandas as pd +from pathlib import Path +import numpy as np + + +def load_genes(ensembl_file: Path) -> pd.DataFrame: + """ + Load a table containing all the human protein of ensembl database \ + and the name and location of their genes. + + :param ensembl_file: A file containing all the human protein of \ + ensembl database and the name and location of their genes. + :return: table containing all the human protein of ensembl database \ + and the name and location of their genes. + """ + df = pd.read_csv(ensembl_file, sep="\t", + dtype={"Gene_stable_ID": str, + "Protein_stable_ID": str, + "Chromosome-scaffold_name": str, + "Gene_start": int, + "Gene_end": int, + "Strand": str, + "Gene_name": str}) + df = df.loc[-df['Protein_stable_ID'].isna(), :] + df = df.loc[df["Chromosome-scaffold_name"].isin( + list(map(str, range(1, 23))) + ["X", "Y"]), :] + return df.drop_duplicates() + + +def create_protein_gene_file(): + """ + Create a file linking each ensembl protein to it's gene and the hg19 \ + coordinate of this gene. + """ + + df_gene = load_genes(ConfigPPI.ensembl_gene) + ConfigPPI.protein_gene.parent.mkdir(parents=True, exist_ok=True) + df_gene.to_csv(ConfigPPI.protein_gene, sep="\t", index=False) diff --git a/src/find_interaction_cluster/radomization_test_ppi.py b/src/find_interaction_cluster/radomization_test_ppi.py new file mode 100644 index 0000000000000000000000000000000000000000..4a7e2e95c6a94ecae97d0193928a5da6f9f34c3c --- /dev/null +++ b/src/find_interaction_cluster/radomization_test_ppi.py @@ -0,0 +1,447 @@ +#!/usr/bin/env python3 + +# -*- coding: UTF-8 -*- + +""" +Description: Contains all the functions needed to make randomisation tests. +""" + + +import pandas as pd +import numpy as np +from doctest import testmod +from typing import Dict, Tuple +import logging +from tqdm import tqdm +from statsmodels.stats.multitest import multipletests + + +class IterationNumberError(Exception): + pass + + +def get_ppi_community_gene(df: pd.DataFrame) -> np.array: + """ + From a dataframe containing ppi communities return an array of \ + the gene id found in those community + + :param df: dataframe containing ppi communities + :return: An array containing genes belonging to PPI community + + >>> df_test = pd.DataFrame({"community": ["C1", "C2"], "node": [3, 2], + ... "edges": [3, 1], "cov": [0.77, 0.77], "modularity": [0.5, 0.5], + ... "genes": ["1, 2, 3", "5, 7"], "project": ["Global-weight-3"] * 2}) + >>> get_ppi_community_gene(df_test) + array([1, 2, 3, 5, 7]) + """ + gene_list = [] + for i in range(df.shape[0]): + gene_list += df.iloc[i, :]['genes'].split(", ") + return np.unique(np.array(gene_list, dtype=int)) + + +def get_dna_community_gene(df: pd.DataFrame) -> Dict[str, np.array]: + """ + Create a dictionary containing gene communities at DNA level. + + :param df: A dataframe containing the community at dna level + :return: A dictionary linking each community to the genes it contains + + >>> df_test = pd.DataFrame({"community": ["C1", "C2"], "node": [3, 2], + ... "edges": [3, 1], "cov": [0.77, 0.77], "modularity": [0.5, 0.5], + ... "genes": ["1, 2, 3", "5, 7"], "project": ["Global-weight-3"] * 2}) + >>> get_dna_community_gene(df_test) + {'C1': array([1, 2, 3]), 'C2': array([5, 7])} + >>> df_test = pd.DataFrame({"community": ["C1", "C1"], "node": [3, 2], + ... "edges": [3, 1], "cov": [0.77, 0.77], "modularity": [0.5, 0.5], + ... "genes": ["1, 2, 3", "5, 7"], "project": ["Global-weight-3"] * 2}) + >>> get_dna_community_gene(df_test) + Traceback (most recent call last): + ... + ValueError: A community name is duplicated + """ + dic_dna_gene = {} + if df.shape[0] != len(df['community'].unique()): + raise ValueError("A community name is duplicated") + for i in range(df.shape[0]): + ms = df.iloc[i, :] + dic_dna_gene[ms["community"]] = np.array(ms["genes"].split(', '), + dtype=int) + return dic_dna_gene + + +def get_common_genes(dna_gene: np.array, ppi_gene: np.array) -> int: + """ + Return the number of gene common between ppi_gene and size_ppi. + + :param ppi_gene: A community of gene whose protein product interact \ + between each other. + :param dna_gene: A community of gene at dna level + :return: The number of common exons between the two lists + + >>> get_common_genes(np.array([1, 5, 9, 15]), np.array([5, 9, 15, 17])) + 3 + >>> get_common_genes(np.array([1, 5, 9, 15]), np.array([5, 9, 15, 1])) + 4 + >>> get_common_genes(np.array([1, 5, 9, 15]), np.array([18, 19, 115, 0])) + 0 + """ + return len(np.intersect1d(dna_gene, ppi_gene)) + + +def get_list_common_gene(community: str, dic_dna_gene: Dict[str, np.array], + ppi_size: int, ppi_gene: np.array, iteration: int, + use_seed: bool = True) -> np.array: + """ + Return the number of common gene between genes in the community \ + `community`and a sublist of randomly chosen genes in `ppi_gene` of \ + size `ppi_size`. + + :param community: A name of a gene community at dna level + :param dic_dna_gene: The dictionary containing gene interacting at dna \ + level + :param ppi_size: The size of the ppi_gene subsample + :param ppi_gene: All the gene interacting at protein level + :param iteration: The number of time we want to repeat the process + :param use_seed: True to use a fixed seed, false else. + :return: The number of gene in common between the genes in community \ + `community` and the subsample of ppi_size + + >>> dna_gene = {"C1": [1, 2, 3, 4, 5]} + >>> ppi_g = list(range(1, 51)) + >>> get_list_common_gene("C1", dna_gene, 10, ppi_g, 5) + array([2, 1, 0, 1, 1]) + >>> ppi_g = list(range(1, 1001)) + >>> get_list_common_gene("C1", dna_gene, 100, ppi_g, 5) + array([0, 0, 1, 0, 0]) + >>> ppi_g = list(range(1, 11)) + >>> get_list_common_gene("C1", dna_gene, 5, ppi_g, 5) + array([3, 2, 2, 4, 3]) + >>> get_list_common_gene("C1", dna_gene, 10, ppi_g, 5) + array([5, 5, 5, 5, 5]) + """ + if use_seed: + np.random.seed(1) + common = [ + get_common_genes( + dic_dna_gene[community], + np.random.choice(ppi_gene, ppi_size, replace=False), + ) + for _ in range(iteration) + ] + return np.array(common) + + +def get_pvalue(values: np.array, target: float, iteration: int + ) -> Tuple[float, str]: + """ + Return The p-value and the regulation + :param values: The list of control values + :param target: The test value + :param iteration: The iteration of interest + :return: The p-value and the regulation + >>> get_pvalue(np.arange(10), 1.75, 10) + (0.2, ' . ') + >>> get_pvalue(np.arange(10), -1, 10) + (0.0, ' . ') + >>> get_pvalue(np.arange(10), 9.75, 10) + (0.0, ' . ') + >>> get_pvalue(np.arange(10), 8.8, 10) + (0.1, ' . ') + >>> get_pvalue(np.arange(100), 98.8, 100) + (0.01, ' + ') + >>> get_pvalue(np.arange(100), 100, 100) + (0.0, ' + ') + >>> get_pvalue(np.arange(100), -1, 100) + (0.0, ' - ') + >>> get_pvalue(np.arange(0, 0.10, 0.01), 0.05, 10) + (0.5, ' . ') + """ + values = np.sort(values) + val_len = len(values) + if val_len != iteration: + msg = f"Error the length of values vector {len(values)} " \ + f"is different than iteration {iteration}" + logging.exception(msg) + raise IterationNumberError(msg) + idx = values[values <= target].size + impov = idx / val_len + idx = values[values >= target].size + enrich = idx / val_len + regulation = " . " + if impov < enrich: + pval = impov + if pval <= 0.05: + regulation = " - " + else: + pval = enrich + if pval <= 0.05: + regulation = " + " + if iteration < 20: + regulation = " . " + return pval, regulation + + +def adjust_regulation(x: pd.Series) -> str: + """ + Return the adjusted regulation. + + :param x: A row a the dataframe + :return: The adjusted row + """ + return x["regulation"] if x["p-adjust"] <= 0.05 else " . " + + +def update_overlap_df(df_overlap: pd.DataFrame, + dic_dna_gene: Dict[str, np.array], + ppi_gene: np.array, iteration: int, + use_seed: bool = True, + ) -> Tuple[pd.DataFrame, np.array]: + """ + Perform a randomization test to check if communities at \ + DNA level are more similar at protein level than randomly expected. + + :param df_overlap: A dataframe containing every gene community at dna \ + level and the number of common gene with a gne community at protein \ + level + :param dic_dna_gene: The dictionary containing gene interacting at dna \ + level + :param ppi_gene: All the gene interacting at protein level + :param iteration: The number of time we want to repeat the process + :param use_seed: True to use a fixed seed, false else. + :return: The updated Df (column p-value, p-adjust and regulation) + \ + the list of common gene shared between gene community at DNA and \ + protein level obtained by randmization analysis + + >>> do = pd.DataFrame({"DNA_community": ["C1", "C2", "C3"], + ... "community_size": [2, 3, 2], "nb_com-ppi": [2, 3, 1], + ... "size_com-ppi": [10, 10, 10]}) + >>> dna_gene = {"C1": [1, 2], "C2": [3, 4, 5], "C3": [6, 7]} + >>> ppi_g = list(range(11)) + >>> d, di = update_overlap_df(do, dna_gene, ppi_g, 100) + >>> d.iloc[:, [0, 1, 2, 4, 5]] + DNA_community community_size nb_com-ppi p-value p-adjust + 0 C1 2 2 0.79 0.79 + 1 C2 3 3 0.72 0.79 + 2 C3 2 1 0.18 0.54 + >>> d.iloc[:, 6].to_list() == d.iloc[:, 7].to_list() == [" . "] * 3 + True + >>> d, di = update_overlap_df(do, dna_gene, list(range(1000)), 100) + >>> d.iloc[:, [0, 1, 2, 4, 5]] + DNA_community community_size nb_com-ppi p-value p-adjust + 0 C1 2 2 0.01 0.01 + 1 C2 3 3 0.00 0.00 + 2 C3 2 1 0.00 0.00 + >>> d.iloc[:, 6].to_list() == d.iloc[:, 7].to_list() == [" + "] * 3 + True + >>> len(di.keys()) == 3 + True + >>> len(di[list(di.keys())[0]]) == 100 + True + """ + pval_list = [] + reg_list = [] + dic = {} + for i in tqdm(range(df_overlap.shape[0])): + community = df_overlap.iloc[i, :]["DNA_community"] + values = get_list_common_gene(community, + dic_dna_gene, + df_overlap.iloc[i, :]["size_com-ppi"], + ppi_gene, + iteration, use_seed) + dic[community] = values + pval, reg = get_pvalue(values, df_overlap.iloc[i, :]["nb_com-ppi"], + iteration) + pval_list.append(pval) + reg_list.append(reg) + df_overlap["p-value"] = pval_list + df_overlap["p-adjust"] = multipletests(pval_list, alpha=0.05, + method='fdr_bh', + is_sorted=False, + returnsorted=False)[1] + df_overlap["regulation"] = reg_list + df_overlap["regulation-adjusted"] = \ + df_overlap.apply(adjust_regulation, axis=1) + return df_overlap, dic + + +def summarize_df_overlap(df_overlap: pd.DataFrame) -> pd.DataFrame: + """ + Create a summary of the dataframe `df_overlap` that indicates the number \ + of common gene shared between communities at protein and DNA level. + + :param df_overlap: The dataframe containing community at DNA and \ + protein level and their enrichment + :return: The dataframe showing the number of enrichment, impoverishment \ + or no regulation + + >>> do = pd.DataFrame({"DNA_community": ["C1", "C2", "C3"], + ... "community_size": [2, 3, 2], "nb_com-ppi": [2, 3, 1], + ... "size_com-ppi": [10, 10, 10], "p-value": [0.01, 0, 0], + ... "p-adjust": [0.01, 0.01, 0.01], "regulation": [" + "] * 3, + ... "regulation-adjusted": [" + "] * 3}) + >>> summarize_df_overlap(do) + regulation-adjusted community_size nb_com-ppi size_com-ppi number + 0 + 2.333333 2.0 10.0 3 + 1 - NaN NaN NaN 0 + 2 . NaN NaN NaN 0 + """ + df_overlap = df_overlap[["regulation-adjusted", "community_size", + "nb_com-ppi", "size_com-ppi", "p-adjust"]] + df_tmp = pd.DataFrame({"regulation-adjusted": [" + ", " - ", " . "], + "community_size": [np.nan] * 3, + "nb_com-ppi": [np.nan] * 3, + "size_com-ppi": [np.nan] * 3, + "p-adjust": [np.nan] * 3}) + df = pd.concat([df_overlap, df_tmp], axis=0, ignore_index=True) + df = df.groupby("regulation-adjusted").agg({"community_size": "mean", + "nb_com-ppi": "mean", + "size_com-ppi": "mean", + "p-adjust": "count"}) + return df.rename({"p-adjust": "number"}, axis=1).reset_index() + + +def get_enriched_impoverished_communities(df_overlap: pd.DataFrame, + dic_dna_gene: Dict[str, np.array], + ppi_gene: np.array, iteration: int, + dic_values: Dict[str, np.array], + use_seed: bool = True + ) -> Dict[str, int]: + """ + For each communities, randomly samples only once in the list \ + of `ppi_gene` to get the number of common gene betwwen this sample \ + and the community of gene at DNA level. Then this function determines \ + how many communites are enriched, impovershied thanks to `di_values`. + + :param df_overlap: A dataframe containing every gene community at dna \ + level and the number of common gene with a gne community at protein \ + level + :param dic_dna_gene: The dictionary containing gene interacting at dna \ + level + :param ppi_gene: All the gene interacting at protein level + :param iteration: The number of time we want to repeat the process + :param dic_values: A dictionary containing the number of common \ + gene between communities at DNA and protein level for each \ + subsampling made from the ppi_gene list. + :param use_seed: True to use a fixed seed, false else. + :return: The number of enriched, impoverished number of \ + genes shared betwwen gene at DNA and protein level. + + >>> do = pd.DataFrame({"DNA_community": ["C1", "C2", "C3"], + ... "community_size": [2, 3, 2], "nb_com-ppi": [2, 3, 1], + ... "size_com-ppi": [10, 10, 10], "p-value": [0.01, 0, 0], + ... "p-adjust": [0.01, 0.01, 0.01], "regulation": [" + "] * 3, + ... "regulation-adjusted": [" + "] * 3}) + >>> dna_gene = {"C1": [1, 2], "C2": [3, 4, 5], "C3": [6, 7]} + >>> ppi_g = list(range(11)) + >>> dv = {"C1": [0] * 95 + [1, 1, 1, 2, 2], + ... "C2": [0] * 95 + [1, 1, 1, 2, 2], + ... "C3": [0] * 75 + [1] * 10 + [2] * 10 + [3] * 5} + >>> get_enriched_impoverished_communities(do, dna_gene, ppi_g, 100, dv) + {' + ': 2, ' . ': 1, ' - ': 0} + >>> dv = {"C1": [0] * 95 + [1, 1, 1, 2, 2], + ... "C2": [0] * 95 + [1, 1, 1, 2, 2], + ... "C3": [3] * 75 + [4] * 23 + [1] * 2} + >>> get_enriched_impoverished_communities(do, dna_gene, ppi_g, 100, dv) + {' + ': 2, ' . ': 0, ' - ': 1} + """ + pval_list = [] + reg_list = [] + for i in range(df_overlap.shape[0]): + community = df_overlap.iloc[i, :]["DNA_community"] + my_value = get_list_common_gene(community, + dic_dna_gene, + df_overlap.iloc[i, :]["size_com-ppi"], + ppi_gene, + 1, use_seed)[0] + pval, reg = get_pvalue(dic_values[community], my_value, + iteration) + pval_list.append(pval) + reg_list.append(reg) + p_adjust = multipletests(pval_list, alpha=0.05, method='fdr_bh', + is_sorted=False, returnsorted=False)[1] + dic = {" + ": 0, " . ": 0, " - ": 0} + for i, reg in enumerate(reg_list): + dic[reg if p_adjust[i] <= 0.05 else " . "] += 1 + return dic + + +def summary_randomisation_test(df_overlap: pd.DataFrame, + dic_dna_gene: Dict[str, np.array], + ppi_gene: np.array, iteration: int, + dic_values: Dict[str, np.array], + use_seed: bool = True + ) -> pd.DataFrame: + """ + Says if the number of community that shared an enriched number \ + of common gene between the protein and DNA level is enriched as well. + + :param df_overlap: A dataframe containing every gene community at dna \ + level and the number of common gene with a gne community at protein \ + level + :param dic_dna_gene: The dictionary containing gene interacting at dna \ + level + :param ppi_gene: All the gene interacting at protein level + :param iteration: The number of time we want to repeat the process + :param dic_values: A dictionary containing the number of common \ + gene between communities at DNA and protein level for each \ + subsampling made from the ppi_gene list. + :param use_seed: True to use a fixed seed, false else. + :return: The updated summary table + + >>> do = pd.DataFrame({"DNA_community": ["C1", "C2", "C3"], + ... "community_size": [2, 3, 2], "nb_com-ppi": [2, 3, 1], + ... "size_com-ppi": [10, 10, 10], "p-value": [0.01, 0, 0], + ... "p-adjust": [0.01, 0.01, 0.01], "regulation": [" + "] * 3, + ... "regulation-adjusted": [" + "] * 3}) + >>> dna_gene = {"C1": [1, 2], "C2": [3, 4, 5], "C3": [6, 7]} + >>> ppi_g = list(range(11)) + >>> dv = {"C1": [0] * 95 + [1, 1, 1, 2, 2], + ... "C2": [0] * 95 + [1, 1, 1, 2, 2], + ... "C3": [0] * 75 + [1] * 10 + [2] * 10 + [3] * 5} + >>> ds = summary_randomisation_test(do, dna_gene, ppi_g, 100, dv) + >>> ds.iloc[:, range(4)] + regulation-adjusted community_size nb_com-ppi size_com-ppi + 0 + 2.333333 2.0 10.0 + 1 - NaN NaN NaN + 2 . NaN NaN NaN + >>> ds.iloc[:, range(4, 7)] + number mean_100_number p-adjust + 0 3 2.0 0.0 + 1 0 0.0 1.0 + 2 0 1.0 0.0 + >>> ds.iloc[:, 7].to_list() == [" + ", " . ", " - "] + True + """ + df_summarized = summarize_df_overlap(df_overlap) + dic_rand_sum = {k: [] for k in df_summarized["regulation-adjusted"].values} + for _ in tqdm(range(iteration), desc="Making stats to summarise global " + "enrichment"): + d = get_enriched_impoverished_communities(df_overlap, dic_dna_gene, + ppi_gene, iteration, + dic_values, use_seed) + for k, v in dic_rand_sum.items(): + v.append(d[k]) + pvalues = [] + regulations = [] + mean_number = [] + for i in range(df_summarized.shape[0]): + reg, val = df_summarized.iloc[i, :][["regulation-adjusted", "number"]] + pval, creg = get_pvalue(dic_rand_sum[reg], val, + iteration) + # print(reg, val, dic_rand_sum[reg]) + pvalues.append(pval) + regulations.append(creg) + mean_number.append(np.mean(dic_rand_sum[reg])) + p_adjust = multipletests(pvalues, alpha=0.05, method='fdr_bh', + is_sorted=False, returnsorted=False)[1] + reg_adj = [regulations[i] if p_adjust[i] <= 0.05 else " . " + for i in range(len(regulations))] + df_summarized[f"mean_{iteration}_number"] = mean_number + df_summarized[f"p-adjust"] = p_adjust + df_summarized[f"reg_adj"] = reg_adj + return df_summarized + + +if __name__ == "__main__": + testmod() diff --git a/src/find_interaction_cluster/sf_and_communities.py b/src/find_interaction_cluster/sf_and_communities.py index b950c8b2c7fdf0ce4d63916a53b564b0c60e1080..d81dc5931eaa789c0ef2a119a64cd3197077571b 100644 --- a/src/find_interaction_cluster/sf_and_communities.py +++ b/src/find_interaction_cluster/sf_and_communities.py @@ -15,7 +15,7 @@ from .community_finder import get_communities import pandas as pd import numpy as np from itertools import product -from .community_finder import get_projects_name +from .community_finder import get_projects import multiprocessing as mp import logging from ..logging_conf import logging_def @@ -182,7 +182,7 @@ def glmm_maker(expanded_df: pd.DataFrame, outfile: Path) -> float: ... "%reg in community": [40, 42.85], 'pval': [1, 0.5], 'padj': [1, 1]}) >>> e_df = expand_dataframe(d) >>> outfile = ConfigGraph.get_community_file("Test", 1, 1, True, - ... "_stat.txt", True) + ... "_stat.txt", "sf_community_enrichment") >>> glmm_maker(e_df, outfile) 1.0 """ @@ -235,7 +235,8 @@ def glmm_statistics(df: pd.DataFrame, sf_name: str, reg: str, expanded_df = expand_dataframe(ndf) outfile = ConfigGraph.get_community_file(project, weight, global_weight, same_gene, feature, - f"{sf_name}_{reg}_stat.txt", True) + f"{sf_name}_{reg}_stat.txt", + "sf_community_enrichment") noutfold = outfile.parent / "expanded_df" noutfold.mkdir(exist_ok=True, parents=True) noutfile = noutfold / outfile.name @@ -361,19 +362,21 @@ def get_key(project: str, weight: int) -> str: return f"{project}_{weight}" -def multiple_stat_launcher(ps: int, weights: List[int], - global_weights: List[int], +def multiple_stat_launcher(ps: int, + weight: int, + global_weight: int, + project: str, same_gene: bool, feature: str = 'exon', logging_level: str = "DISABLE"): """ Launch the statistical analysis for every - :param ps: The number of processes we want to use. - :param weights: The list of weights of interaction to consider - :param global_weights: The list global weights to consider. if \ - the global weight is equal to 0 then then density figure are calculated \ - by project, else all projcet are merge together and the interaction \ - seen in `global_weight` project are taken into account + :param ps: The number of processes to use + :param weight: The weight of interaction to consider + :param global_weight: The global weighs to consider. if \ + the global weight is equal to 0 then the project `project` is \ + used. + :param project: The project name, used only if global_weight = 0 :param same_gene: Say if we consider as co-localised exon within the \ same gene :param feature: The feature we want to analyse @@ -384,22 +387,19 @@ def multiple_stat_launcher(ps: int, weights: List[int], logging.info(f"Checking if communities contains often {feature}s " f"regulated by a splicing factor") sf_list = get_sfname() - global_weights = list(np.unique(global_weights)) - weights = list(np.unique(weights)) - projects, dic_project = get_projects_name(global_weights) - condition = list(product(projects, weights, sf_list, ['down', 'up'])) + project = get_projects(global_weight, project) + condition = list(product([project], [weight], sf_list, ['down', 'up'])) processes = {} pool = mp.Pool(processes=min(ps, len(condition))) logging.debug("Calculating stats...") for project, weight, sf_name, reg in condition: ckey = get_key(project, weight) - global_weight = dic_project[project] args = [sf_name, reg, project, weight, global_weight, same_gene, feature] - if ckey not in processes.keys(): - processes[ckey] = [pool.apply_async(get_stat4communities, args)] - else: + if ckey in processes: processes[ckey].append(pool.apply_async(get_stat4communities, args)) + else: + processes[ckey] = [pool.apply_async(get_stat4communities, args)] for p, value in processes.items(): project, weight = p.split("_") list_tuples = [proc.get(timeout=None) for proc in value] @@ -409,17 +409,19 @@ def multiple_stat_launcher(ps: int, weights: List[int], list_series = [t[1] for t in list_tuples] df = pd.concat(list_df, axis=0, ignore_index=True) outfile = ConfigGraph.get_community_file(project, weight, - dic_project[project], + global_weight, same_gene, feature, - "_stat.txt", True) + "_stat.txt", + "sf_community_enrichment") df.to_csv(outfile, sep="\t", index=False) glm_df = pd.DataFrame(list_series) glm_df["padj"] = multipletests(glm_df['pval'].values, method='fdr_bh')[1] outfile = ConfigGraph.get_community_file(project, weight, - dic_project[project], + global_weight, same_gene, feature, - "_glmm_stat.txt", True) + "_glmm_stat.txt", + "sf_community_enrichment") glm_df.to_csv(outfile, sep="\t", index=False) diff --git a/src/nt_composition/make_nt_correlation.py b/src/nt_composition/make_nt_correlation.py index ddbb5b18f23c5119ce860dd3fc7fa7c760374a88..58969b9e7c329f8729656d85dd995a688a0b2d47 100644 --- a/src/nt_composition/make_nt_correlation.py +++ b/src/nt_composition/make_nt_correlation.py @@ -61,7 +61,7 @@ def get_project_colocalisation(cnx: sqlite3.Connection, project: str, global_weight: int, same_gene: bool, get_weight: bool = False, exon_bc: Optional[np.array] = None, - inter_chr: bool = False, + inter_chr: bool = False, distance: int = 10000, level: str = "exon") -> np.array: """ Get the interactions in project `project` for gene and exons @@ -84,8 +84,11 @@ def get_project_colocalisation(cnx: sqlite3.Connection, project: str, """ logging.debug(f'Recovering interaction ({os.getpid()})') select_add = get_select_addition(global_weight, get_weight, same_gene) - inter_str = " AND level = 'inter'" if inter_chr else "" - inter_str_t = " AND t1.level = 'inter'" if inter_chr else "" + inter_str = " AND level = 'inter'" \ + if inter_chr else f"AND (level = 'inter' OR distance >= {distance})" + inter_str_t = " AND t1.level = 'inter'" \ + if inter_chr \ + else f"AND (t1.level = 'inter' OR t1.distance >= {distance})" if level == "gene" and not same_gene: logging.warning(f"When level is equal to gene then same_gene must " f"be true. Setting it to True.")