diff --git a/src/nt_composition/make_nt_correlation.py b/src/nt_composition/make_nt_correlation.py index fff9326f3480508dff2a5b2eb44f13964813ea1f..ef7ceb8b372371c9b417a52cdf35d3f115b045b0 100644 --- a/src/nt_composition/make_nt_correlation.py +++ b/src/nt_composition/make_nt_correlation.py @@ -16,7 +16,7 @@ import numpy as np import doctest from .config import ConfigNt from ..logging_conf import logging_def -from typing import Dict, Tuple, Any, List +from typing import Dict, Tuple, Any, List, Optional import seaborn as sns import matplotlib.pyplot as plt from scipy.stats import linregress @@ -24,6 +24,7 @@ from itertools import product from random import random import multiprocessing as mp import os +from ..find_interaction_cluster.config import get_communities class NoInteractionError(Exception): @@ -58,7 +59,8 @@ def get_select_addition(global_weight: int, get_weight: bool, same_gene: bool): def get_project_colocalisation(cnx: sqlite3.Connection, project: str, weight: int, global_weight: int, same_gene: bool, - get_weight: bool = False) -> np.array: + get_weight: bool = False, + exon_bc: Optional[np.array] = None) -> np.array: """ Get the interactions in project `project` @@ -72,16 +74,24 @@ def get_project_colocalisation(cnx: sqlite3.Connection, project: str, :param get_weight: Say if we want to recover the weight of the interaction :param same_gene: Say if we consider as co-localised exon within the \ same gene + :param exon_bc: exons in big communities :return: The table containing the number of interaction by projects """ logging.debug(f'Recovering interaction ({os.getpid()})') select_add = get_select_addition(global_weight, get_weight, same_gene) + if exon_bc is None: + filter_exons = '' + filter_exons_t = filter_exons + else: + tmp = tuple(exon_bc) + filter_exons = f' AND exon1 IN {tmp} AND exon2 IN {tmp}' + filter_exons_t = f' AND t1.exon1 IN {tmp} AND t1.exon2 IN {tmp}' if global_weight == 0: if same_gene: query = f"SELECT exon1, exon2{select_add} " \ 
"FROM cin_exon_interaction " \ f"WHERE weight >= {weight} " \ - f"AND id_project = '{project}' " + f"AND id_project = '{project}'" + filter_exons else: query = f"""SELECT t1.exon1, t1.exon2{select_add} FROM cin_exon_interaction t1, cin_exon t2, cin_exon t3 @@ -89,7 +99,7 @@ def get_project_colocalisation(cnx: sqlite3.Connection, project: str, AND id_project = '{project}' AND t1.exon1 = t2.id AND t1.exon2 = t3.id - AND t2.id_gene != t3.id_gene""" + AND t2.id_gene != t3.id_gene""" + filter_exons_t else: good_projects = tuple(ConfigNt.good_projects) if same_gene: @@ -97,6 +107,7 @@ def get_project_colocalisation(cnx: sqlite3.Connection, project: str, f"FROM cin_exon_interaction " \ f"WHERE weight >= {weight} " \ f"AND id_project IN {good_projects}" \ + f"{filter_exons}" \ f"GROUP BY exon1, exon2 " \ f"HAVING COUNT(*) >= {global_weight}" else: @@ -107,6 +118,7 @@ def get_project_colocalisation(cnx: sqlite3.Connection, project: str, AND t1.exon2 = t3.id AND t2.id_gene != t3.id_gene AND t1.id_project IN {good_projects} + {filter_exons_t} GROUP BY exon1, exon2 HAVING COUNT(*) >= {global_weight}""" @@ -245,7 +257,8 @@ def create_density_table(arr_interaction: np.array, dic_freq: Dict[str, float], def create_density_fig(df: pd.DataFrame, project: str, ft_type: str, ft: str, - weight: int, global_weight: int) -> Tuple[float, float]: + weight: int, global_weight: int, + community_size: Optional[int]) -> Tuple[float, float]: """ Compute a density file showing if the feature frequency of \ an exons correlates with the frequency in other co-localised exons. 
@@ -280,7 +293,8 @@ def create_density_fig(df: pd.DataFrame, project: str, ft_type: str, ft: str, plt.title(f'Freq of {ft} of exons and their co-localized partner in ' f'{project}') plt.savefig(ConfigNt.get_density_file(weight, global_weight, project, - ft_type, ft, fig=True)) + ft_type, ft, fig=True, + community_size=community_size)) plt.close() return r, p @@ -288,6 +302,7 @@ def create_density_fig(df: pd.DataFrame, project: str, ft_type: str, ft: str, def create_density_figure(nt: str, ft_type: str, project: str, weight: int, global_weight: int, same_gene: bool, compute_mean: bool, + community_size: Optional[int], logging_level: str = "DISABLE" ) -> Tuple[float, float]: """ @@ -306,21 +321,28 @@ def create_density_figure(nt: str, ft_type: str, same gene :param compute_mean: True to compute the mean frequency of co-localised \ exons, false to only compute the frequency of one co-localized exons. + :param community_size: The size of the community to consider in the \ + analysis. :param logging_level: The level of information to display :return: The correlation and the p-value """ logging_def(ConfigNt.interaction, __file__, logging_level) outfile = ConfigNt.get_density_file(weight, global_weight, project, - ft_type, nt, fig=False) + ft_type, nt, fig=False, + community_size=community_size) if not outfile.is_file(): + exons_bc = recover_exon_in_big_communities(community_size, project, + weight, + global_weight) cnx = sqlite3.connect(ConfigNt.db_file) arr_interaction = get_project_colocalisation(cnx, project, weight, - global_weight, same_gene) + global_weight, same_gene, + False, exons_bc) dic_freq = get_frequency_dic(cnx, nt, ft_type) df = create_density_table(arr_interaction, dic_freq, compute_mean) df.to_csv(outfile, sep="\t", index=False) r, p = create_density_fig(df, project, ft_type, nt, weight, - global_weight) + global_weight, community_size) else: logging.debug(f'The file {outfile} exist, recovering data ' f'({os.getpid()})') @@ -330,7 +352,8 @@ def 
create_density_figure(nt: str, ft_type: str, def create_scatterplot(df_cor: pd.DataFrame, ft_type: str, ft: str, - weight: int, global_weight: int): + weight: int, global_weight: int, + community_size: Optional[int]): """ Create a scatterplot showing the R-value according to the \ number of interaction. @@ -361,7 +384,8 @@ def create_scatterplot(df_cor: pd.DataFrame, ft_type: str, ft: str, plt.title(f'Project correlation for {ft} ({ft_type}) ' f'according to the number of interaction in projects') plt.savefig(ConfigNt.get_density_recap(weight, global_weight, ft_type, - ft, fig=True)) + ft, fig=True, + community_size=community_size)) plt.close() @@ -369,7 +393,8 @@ def execute_density_figure_function(di: pd.DataFrame, project : str, ft_type: str, ft: str, weight: int, global_weight: int, same_gene: bool, - compute_mean) -> Dict[str, Any]: + compute_mean: bool, + community_size: Optional[int]) -> Dict[str, Any]: """ Execute create_density_figure and organized the results in a dictionary. @@ -386,11 +411,14 @@ def execute_density_figure_function(di: pd.DataFrame, project : str, same gene :param compute_mean: True to compute the mean frequency of co-localised \ exons, false to only compute the frequency of one co-localized exons. + :param community_size: The size of the community to consider in the \ + analysis. 
:return: """ logging.info(f'Working on {project}, {ft_type}, {ft} - {os.getpid()}') r, p = create_density_figure(ft, ft_type, project, weight, - global_weight, same_gene, compute_mean) + global_weight, same_gene, compute_mean, + community_size) if global_weight == 0: tmp = {"project": project, "ft_type": ft_type, "ft": ft, "cor": r, "pval": p, @@ -415,9 +443,45 @@ def combine_dic(list_dic: List[Dict]) -> Dict: return dic +def recover_exon_in_big_communities(community_size: Optional[int], + project: str, weight: int, + global_weight: int + ) -> Optional[np.array]: + """ + Recover the list of exons present in communities with a larger size than \ + `community_size` + + :param community_size: The minimum size of community that we want to \ + consider + :param project: The name of the project of interest + :param global_weight: The global weight to consider. If \ + the global weight is equal to 0 then the density figures are calculated \ + by project, else all projects are merged together and the interactions \ + seen in `global_weight` projects are taken into account + :param weight: The weight of interaction to consider + :return: The list of exons of interest + """ + if community_size is None: + return None + outfile = ConfigNt.get_community_file(project, weight, global_weight, + True, "_communities.txt") + if not outfile.is_file(): + msg = f"The file {outfile} was not found ! You must try " \ + f"to launch find_interaction_cluster script with " \ + f"the same parameters !" + logging.exception(msg) + raise FileNotFoundError(msg) + communities = get_communities(outfile, community_size) + res = [] + for c in communities: + res += c + return res + + def create_all_frequency_figures(ps: int, weight: int = 1, global_weight: int = 0, ft_type: str = "nt", same_gene = True, compute_mean: bool = True, + community_size: Optional[int] = None, logging_level: str = "DISABLE"): """ Make density figure for every selected projects. 
@@ -433,6 +497,8 @@ def create_all_frequency_figures(ps: int, weight: int = 1, same gene :param compute_mean: True to compute the mean frequency of co-localised \ exons, false to only compute the frequency of one co-localized exons. + :param community_size: The size of the community to consider in the \ + analysis. :param ps: The number of processes to create """ logging_def(ConfigNt.interaction, __file__, logging_level) @@ -442,16 +508,13 @@ def create_all_frequency_figures(ps: int, weight: int = 1, else: di = pd.DataFrame() projects = [f"Global_projects_{global_weight}"] - # with open(ConfigNt.selected_project, 'r') as f: - # projects = f.read().splitlines() - # projects = projects[:-4] ft_list = ConfigNt.get_features(ft_type) param = product(projects, ft_list, [ft_type]) pool = mp.Pool(processes=ps) processes = [] for project, ft, ft_type in param: args = [di, project, ft_type, ft, weight, global_weight, same_gene, - compute_mean] + compute_mean, community_size] processes.append(pool.apply_async(execute_density_figure_function, args)) results = [] @@ -460,11 +523,13 @@ def create_all_frequency_figures(ps: int, weight: int = 1, dic = combine_dic(results) df_corr = pd.DataFrame(dic) df_corr.to_csv(ConfigNt.get_density_recap(weight, global_weight, ft_type, - 'ALL', fig=False), + 'ALL', fig=False, + community_size=community_size), sep="\t") if global_weight == 0: for ft in ft_list: - create_scatterplot(df_corr, ft_type, ft, weight, global_weight) + create_scatterplot(df_corr, ft_type, ft, weight, global_weight, + community_size) if __name__ == "__main__":