Skip to content
Snippets Groups Projects
Commit 1db24ea1 authored by nfontrod's avatar nfontrod
Browse files

src/nt_composition/make_nt_correlation.py: major modification corresponding to...

src/nt_composition/make_nt_correlation.py: major modification corresponding to the addition of the parameter community_size to limit nt frequency analysis to only exons located in large community of co-regulated exons. This parameter can be set to -1 for using the old behavior
parent cd4eef50
No related branches found
No related tags found
No related merge requests found
......@@ -16,7 +16,7 @@ import numpy as np
import doctest
from .config import ConfigNt
from ..logging_conf import logging_def
from typing import Dict, Tuple, Any, List
from typing import Dict, Tuple, Any, List, Optional
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import linregress
......@@ -24,6 +24,7 @@ from itertools import product
from random import random
import multiprocessing as mp
import os
from ..find_interaction_cluster.config import get_communities
class NoInteractionError(Exception):
......@@ -58,7 +59,8 @@ def get_select_addition(global_weight: int, get_weight: bool, same_gene: bool):
def get_project_colocalisation(cnx: sqlite3.Connection, project: str,
weight: int,
global_weight: int, same_gene: bool,
get_weight: bool = False) -> np.array:
get_weight: bool = False,
exon_bc: Optional[np.array] = None) -> np.array:
"""
Get the interactions in project `project`
......@@ -72,16 +74,24 @@ def get_project_colocalisation(cnx: sqlite3.Connection, project: str,
:param get_weight: Say if we want to recover the weight of the interaction
:param same_gene: Say if we consider as co-localised exon within the \
same gene
:param exon_bc: exons in big communities
:return: The table containing the number of interaction by projects
"""
logging.debug(f'Recovering interaction ({os.getpid()})')
select_add = get_select_addition(global_weight, get_weight, same_gene)
if exon_bc is None:
filter_exons = ''
filter_exons_t = filter_exons
else:
tmp = tuple(exon_bc)
filter_exons = f' AND exon1 IN {tmp} AND exon2 IN {tmp}'
filter_exons_t = f' AND t1.exon1 IN {tmp} AND t1.exon2 IN {tmp}'
if global_weight == 0:
if same_gene:
query = f"SELECT exon1, exon2{select_add} " \
"FROM cin_exon_interaction " \
f"WHERE weight >= {weight} " \
f"AND id_project = '{project}' "
f"AND id_project = '{project}'" + filter_exons
else:
query = f"""SELECT t1.exon1, t1.exon2{select_add}
FROM cin_exon_interaction t1, cin_exon t2, cin_exon t3
......@@ -89,7 +99,7 @@ def get_project_colocalisation(cnx: sqlite3.Connection, project: str,
AND id_project = '{project}'
AND t1.exon1 = t2.id
AND t1.exon2 = t3.id
AND t2.id_gene != t3.id_gene"""
AND t2.id_gene != t3.id_gene""" + filter_exons_t
else:
good_projects = tuple(ConfigNt.good_projects)
if same_gene:
......@@ -97,6 +107,7 @@ def get_project_colocalisation(cnx: sqlite3.Connection, project: str,
f"FROM cin_exon_interaction " \
f"WHERE weight >= {weight} " \
f"AND id_project IN {good_projects}" \
f"{filter_exons}" \
f"GROUP BY exon1, exon2 " \
f"HAVING COUNT(*) >= {global_weight}"
else:
......@@ -107,6 +118,7 @@ def get_project_colocalisation(cnx: sqlite3.Connection, project: str,
AND t1.exon2 = t3.id
AND t2.id_gene != t3.id_gene
AND t1.id_project IN {good_projects}
{filter_exons_t}
GROUP BY exon1, exon2
HAVING COUNT(*) >= {global_weight}"""
......@@ -245,7 +257,8 @@ def create_density_table(arr_interaction: np.array, dic_freq: Dict[str, float],
def create_density_fig(df: pd.DataFrame, project: str, ft_type: str, ft: str,
weight: int, global_weight: int) -> Tuple[float, float]:
weight: int, global_weight: int,
community_size: Optional[int]) -> Tuple[float, float]:
"""
Compute a density file showing if the feature frequency of \
an exons correlates with the frequency in other co-localised exons.
......@@ -280,7 +293,8 @@ def create_density_fig(df: pd.DataFrame, project: str, ft_type: str, ft: str,
plt.title(f'Freq of {ft} of exons and their co-localized partner in '
f'{project}')
plt.savefig(ConfigNt.get_density_file(weight, global_weight, project,
ft_type, ft, fig=True))
ft_type, ft, fig=True,
community_size=community_size))
plt.close()
return r, p
......@@ -288,6 +302,7 @@ def create_density_fig(df: pd.DataFrame, project: str, ft_type: str, ft: str,
def create_density_figure(nt: str, ft_type: str,
project: str, weight: int, global_weight: int,
same_gene: bool, compute_mean: bool,
community_size: Optional[int],
logging_level: str = "DISABLE"
) -> Tuple[float, float]:
"""
......@@ -306,21 +321,28 @@ def create_density_figure(nt: str, ft_type: str,
same gene
:param compute_mean: True to compute the mean frequency of co-localised \
exons, false to only compute the frequency of one co-localized exons.
:param community_size: The size of the community to consider in the \
analysis.
:param logging_level: The level of information to display
:return: The correlation and the p-value
"""
logging_def(ConfigNt.interaction, __file__, logging_level)
outfile = ConfigNt.get_density_file(weight, global_weight, project,
ft_type, nt, fig=False)
ft_type, nt, fig=False,
community_size=community_size)
if not outfile.is_file():
exons_bc = recover_exon_in_big_communities(community_size, project,
weight,
global_weight)
cnx = sqlite3.connect(ConfigNt.db_file)
arr_interaction = get_project_colocalisation(cnx, project, weight,
global_weight, same_gene)
global_weight, same_gene,
False, exons_bc)
dic_freq = get_frequency_dic(cnx, nt, ft_type)
df = create_density_table(arr_interaction, dic_freq, compute_mean)
df.to_csv(outfile, sep="\t", index=False)
r, p = create_density_fig(df, project, ft_type, nt, weight,
global_weight)
global_weight, community_size)
else:
logging.debug(f'The file {outfile} exist, recovering data '
f'({os.getpid()})')
......@@ -330,7 +352,8 @@ def create_density_figure(nt: str, ft_type: str,
def create_scatterplot(df_cor: pd.DataFrame, ft_type: str, ft: str,
weight: int, global_weight: int):
weight: int, global_weight: int,
community_size: Optional[int]):
"""
Create a scatterplot showing the R-value according to the \
number of interaction.
......@@ -361,7 +384,8 @@ def create_scatterplot(df_cor: pd.DataFrame, ft_type: str, ft: str,
plt.title(f'Project correlation for {ft} ({ft_type}) '
f'according to the number of interaction in projects')
plt.savefig(ConfigNt.get_density_recap(weight, global_weight, ft_type,
ft, fig=True))
ft, fig=True,
community_size=community_size))
plt.close()
......@@ -369,7 +393,8 @@ def execute_density_figure_function(di: pd.DataFrame, project : str,
ft_type: str, ft: str, weight: int,
global_weight: int,
same_gene: bool,
compute_mean) -> Dict[str, Any]:
compute_mean: bool,
community_size: Optional[int]) -> Dict[str, Any]:
"""
Execute create_density_figure and organized the results in a dictionary.
......@@ -386,11 +411,14 @@ def execute_density_figure_function(di: pd.DataFrame, project : str,
same gene
:param compute_mean: True to compute the mean frequency of co-localised \
exons, false to only compute the frequency of one co-localized exons.
    :param community_size: The size of the community to consider in the \
    analysis.
:return:
"""
logging.info(f'Working on {project}, {ft_type}, {ft} - {os.getpid()}')
r, p = create_density_figure(ft, ft_type, project, weight,
global_weight, same_gene, compute_mean)
global_weight, same_gene, compute_mean,
community_size)
if global_weight == 0:
tmp = {"project": project, "ft_type": ft_type,
"ft": ft, "cor": r, "pval": p,
......@@ -415,9 +443,45 @@ def combine_dic(list_dic: List[Dict]) -> Dict:
return dic
def recover_exon_in_big_communities(community_size: Optional[int],
                                    project: str, weight: int,
                                    global_weight: int
                                    ) -> Optional[List]:
    """
    Recover the list of exons present in communities with a larger size \
    than `community_size`.

    :param community_size: The minimum size of the communities that we \
    want to consider (None disables the filtering: the function then \
    returns None)
    :param project: The name of the project of interest
    :param weight: The weight of interaction to consider
    :param global_weight: The global weight to consider. If \
    the global weight is equal to 0 then the density figures are \
    calculated by project, else all projects are merged together and the \
    interactions seen in `global_weight` projects are taken into account
    :return: The flat list of exons belonging to the selected \
    communities, or None when no community filtering is requested
    :raises FileNotFoundError: when the community file (produced by the \
    find_interaction_cluster script) does not exist
    """
    if community_size is None:
        # Old behavior: no restriction on the exons analysed.
        return None
    outfile = ConfigNt.get_community_file(project, weight, global_weight,
                                          True, "_communities.txt")
    if not outfile.is_file():
        msg = f"The file {outfile} was not found ! You must try " \
              f"to launch find_interaction_cluster script with " \
              f"the same parameters !"
        logging.exception(msg)
        raise FileNotFoundError(msg)
    communities = get_communities(outfile, community_size)
    # Flatten the list of communities into a single list of exons.
    return [exon for community in communities for exon in community]
def create_all_frequency_figures(ps: int, weight: int = 1,
global_weight: int = 0, ft_type: str = "nt",
same_gene = True, compute_mean: bool = True,
community_size: Optional[int] = None,
logging_level: str = "DISABLE"):
"""
Make density figure for every selected projects.
......@@ -433,6 +497,8 @@ def create_all_frequency_figures(ps: int, weight: int = 1,
same gene
:param compute_mean: True to compute the mean frequency of co-localised \
exons, false to only compute the frequency of one co-localized exons.
:param community_size: The size of the community to consider in the \
analysis.
:param ps: The number of processes to create
"""
logging_def(ConfigNt.interaction, __file__, logging_level)
......@@ -442,16 +508,13 @@ def create_all_frequency_figures(ps: int, weight: int = 1,
else:
di = pd.DataFrame()
projects = [f"Global_projects_{global_weight}"]
# with open(ConfigNt.selected_project, 'r') as f:
# projects = f.read().splitlines()
# projects = projects[:-4]
ft_list = ConfigNt.get_features(ft_type)
param = product(projects, ft_list, [ft_type])
pool = mp.Pool(processes=ps)
processes = []
for project, ft, ft_type in param:
args = [di, project, ft_type, ft, weight, global_weight, same_gene,
compute_mean]
compute_mean, community_size]
processes.append(pool.apply_async(execute_density_figure_function,
args))
results = []
......@@ -460,11 +523,13 @@ def create_all_frequency_figures(ps: int, weight: int = 1,
dic = combine_dic(results)
df_corr = pd.DataFrame(dic)
df_corr.to_csv(ConfigNt.get_density_recap(weight, global_weight, ft_type,
'ALL', fig=False),
'ALL', fig=False,
community_size=community_size),
sep="\t")
if global_weight == 0:
for ft in ft_list:
create_scatterplot(df_corr, ft_type, ft, weight, global_weight)
create_scatterplot(df_corr, ft_type, ft, weight, global_weight,
community_size)
if __name__ == "__main__":
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment