Skip to content
Snippets Groups Projects
Commit 1db24ea1 authored by nfontrod's avatar nfontrod
Browse files

src/nt_composition/make_nt_correlation.py: major modification corresponding to...

src/nt_composition/make_nt_correlation.py: major modification corresponding to the addition of the parameter community_size to limit nt frequency analysis to only exons located in large community of co-regulated exons. This parameter can be set to -1 for using the old behavior
parent cd4eef50
No related branches found
No related tags found
No related merge requests found
......@@ -16,7 +16,7 @@ import numpy as np
import doctest
from .config import ConfigNt
from ..logging_conf import logging_def
from typing import Dict, Tuple, Any, List
from typing import Dict, Tuple, Any, List, Optional
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import linregress
......@@ -24,6 +24,7 @@ from itertools import product
from random import random
import multiprocessing as mp
import os
from ..find_interaction_cluster.config import get_communities
class NoInteractionError(Exception):
......@@ -58,7 +59,8 @@ def get_select_addition(global_weight: int, get_weight: bool, same_gene: bool):
def get_project_colocalisation(cnx: sqlite3.Connection, project: str,
weight: int,
global_weight: int, same_gene: bool,
get_weight: bool = False) -> np.array:
get_weight: bool = False,
exon_bc: Optional[np.array] = None) -> np.array:
"""
Get the interactions in project `project`
......@@ -72,16 +74,24 @@ def get_project_colocalisation(cnx: sqlite3.Connection, project: str,
:param get_weight: Say if we want to recover the weight of the interaction
:param same_gene: Say if we consider as co-localised exon within the \
same gene
:param exon_bc: exons in big communities
:return: The table containing the number of interaction by projects
"""
logging.debug(f'Recovering interaction ({os.getpid()})')
select_add = get_select_addition(global_weight, get_weight, same_gene)
if exon_bc is None:
filter_exons = ''
filter_exons_t = filter_exons
else:
tmp = tuple(exon_bc)
filter_exons = f' AND exon1 IN {tmp} AND exon2 IN {tmp}'
filter_exons_t = f' AND t1.exon1 IN {tmp} AND t1.exon2 IN {tmp}'
if global_weight == 0:
if same_gene:
query = f"SELECT exon1, exon2{select_add} " \
"FROM cin_exon_interaction " \
f"WHERE weight >= {weight} " \
f"AND id_project = '{project}' "
f"AND id_project = '{project}'" + filter_exons
else:
query = f"""SELECT t1.exon1, t1.exon2{select_add}
FROM cin_exon_interaction t1, cin_exon t2, cin_exon t3
......@@ -89,7 +99,7 @@ def get_project_colocalisation(cnx: sqlite3.Connection, project: str,
AND id_project = '{project}'
AND t1.exon1 = t2.id
AND t1.exon2 = t3.id
AND t2.id_gene != t3.id_gene"""
AND t2.id_gene != t3.id_gene""" + filter_exons_t
else:
good_projects = tuple(ConfigNt.good_projects)
if same_gene:
......@@ -97,6 +107,7 @@ def get_project_colocalisation(cnx: sqlite3.Connection, project: str,
f"FROM cin_exon_interaction " \
f"WHERE weight >= {weight} " \
f"AND id_project IN {good_projects}" \
f"{filter_exons}" \
f"GROUP BY exon1, exon2 " \
f"HAVING COUNT(*) >= {global_weight}"
else:
......@@ -107,6 +118,7 @@ def get_project_colocalisation(cnx: sqlite3.Connection, project: str,
AND t1.exon2 = t3.id
AND t2.id_gene != t3.id_gene
AND t1.id_project IN {good_projects}
{filter_exons_t}
GROUP BY exon1, exon2
HAVING COUNT(*) >= {global_weight}"""
......@@ -245,7 +257,8 @@ def create_density_table(arr_interaction: np.array, dic_freq: Dict[str, float],
def create_density_fig(df: pd.DataFrame, project: str, ft_type: str, ft: str,
weight: int, global_weight: int) -> Tuple[float, float]:
weight: int, global_weight: int,
community_size: Optional[int]) -> Tuple[float, float]:
"""
Compute a density file showing if the feature frequency of \
an exons correlates with the frequency in other co-localised exons.
......@@ -280,7 +293,8 @@ def create_density_fig(df: pd.DataFrame, project: str, ft_type: str, ft: str,
plt.title(f'Freq of {ft} of exons and their co-localized partner in '
f'{project}')
plt.savefig(ConfigNt.get_density_file(weight, global_weight, project,
ft_type, ft, fig=True))
ft_type, ft, fig=True,
community_size=community_size))
plt.close()
return r, p
......@@ -288,6 +302,7 @@ def create_density_fig(df: pd.DataFrame, project: str, ft_type: str, ft: str,
def create_density_figure(nt: str, ft_type: str,
project: str, weight: int, global_weight: int,
same_gene: bool, compute_mean: bool,
community_size: Optional[int],
logging_level: str = "DISABLE"
) -> Tuple[float, float]:
"""
......@@ -306,21 +321,28 @@ def create_density_figure(nt: str, ft_type: str,
same gene
:param compute_mean: True to compute the mean frequency of co-localised \
exons, false to only compute the frequency of one co-localized exons.
:param community_size: The size of the community to consider in the \
analysis.
:param logging_level: The level of information to display
:return: The correlation and the p-value
"""
logging_def(ConfigNt.interaction, __file__, logging_level)
outfile = ConfigNt.get_density_file(weight, global_weight, project,
ft_type, nt, fig=False)
ft_type, nt, fig=False,
community_size=community_size)
if not outfile.is_file():
exons_bc = recover_exon_in_big_communities(community_size, project,
weight,
global_weight)
cnx = sqlite3.connect(ConfigNt.db_file)
arr_interaction = get_project_colocalisation(cnx, project, weight,
global_weight, same_gene)
global_weight, same_gene,
False, exons_bc)
dic_freq = get_frequency_dic(cnx, nt, ft_type)
df = create_density_table(arr_interaction, dic_freq, compute_mean)
df.to_csv(outfile, sep="\t", index=False)
r, p = create_density_fig(df, project, ft_type, nt, weight,
global_weight)
global_weight, community_size)
else:
logging.debug(f'The file {outfile} exist, recovering data '
f'({os.getpid()})')
......@@ -330,7 +352,8 @@ def create_density_figure(nt: str, ft_type: str,
def create_scatterplot(df_cor: pd.DataFrame, ft_type: str, ft: str,
weight: int, global_weight: int):
weight: int, global_weight: int,
community_size: Optional[int]):
"""
Create a scatterplot showing the R-value according to the \
number of interaction.
......@@ -361,7 +384,8 @@ def create_scatterplot(df_cor: pd.DataFrame, ft_type: str, ft: str,
plt.title(f'Project correlation for {ft} ({ft_type}) '
f'according to the number of interaction in projects')
plt.savefig(ConfigNt.get_density_recap(weight, global_weight, ft_type,
ft, fig=True))
ft, fig=True,
community_size=community_size))
plt.close()
......@@ -369,7 +393,8 @@ def execute_density_figure_function(di: pd.DataFrame, project : str,
ft_type: str, ft: str, weight: int,
global_weight: int,
same_gene: bool,
compute_mean) -> Dict[str, Any]:
compute_mean: bool,
community_size: Optional[int]) -> Dict[str, Any]:
"""
Execute create_density_figure and organized the results in a dictionary.
......@@ -386,11 +411,14 @@ def execute_density_figure_function(di: pd.DataFrame, project : str,
same gene
:param compute_mean: True to compute the mean frequency of co-localised \
exons, false to only compute the frequency of one co-localized exons.
    :param community_size: The size of the community to consider in the \
    analysis.
:return:
"""
logging.info(f'Working on {project}, {ft_type}, {ft} - {os.getpid()}')
r, p = create_density_figure(ft, ft_type, project, weight,
global_weight, same_gene, compute_mean)
global_weight, same_gene, compute_mean,
community_size)
if global_weight == 0:
tmp = {"project": project, "ft_type": ft_type,
"ft": ft, "cor": r, "pval": p,
......@@ -415,9 +443,45 @@ def combine_dic(list_dic: List[Dict]) -> Dict:
return dic
def recover_exon_in_big_communities(community_size: Optional[int],
                                    project: str, weight: int,
                                    global_weight: int
                                    ) -> Optional[List]:
    """
    Recover the list of exons present in communities with a larger size \
    than `community_size`.

    :param community_size: The minimum size of the communities that we \
    want to consider (None disables the filtering: the function then \
    returns None)
    :param project: The name of the project of interest
    :param weight: The weight of interaction to consider
    :param global_weight: The global weight to consider. If \
    the global weight is equal to 0 then the density figures are \
    calculated by project, else all projects are merged together and the \
    interactions seen in `global_weight` projects are taken into account
    :return: The flat list of exons belonging to the selected \
    communities, or None when no community filtering is requested
    :raises FileNotFoundError: when the community file (produced by the \
    find_interaction_cluster script) does not exist
    """
    if community_size is None:
        # Old behavior: no restriction on the exons analysed.
        return None
    outfile = ConfigNt.get_community_file(project, weight, global_weight,
                                          True, "_communities.txt")
    if not outfile.is_file():
        msg = f"The file {outfile} was not found ! You must try " \
              f"to launch find_interaction_cluster script with " \
              f"the same parameters !"
        logging.exception(msg)
        raise FileNotFoundError(msg)
    communities = get_communities(outfile, community_size)
    # Flatten the list of communities into a single list of exons.
    return [exon for community in communities for exon in community]
def create_all_frequency_figures(ps: int, weight: int = 1,
global_weight: int = 0, ft_type: str = "nt",
same_gene = True, compute_mean: bool = True,
community_size: Optional[int] = None,
logging_level: str = "DISABLE"):
"""
Make density figure for every selected projects.
......@@ -433,6 +497,8 @@ def create_all_frequency_figures(ps: int, weight: int = 1,
same gene
:param compute_mean: True to compute the mean frequency of co-localised \
exons, false to only compute the frequency of one co-localized exons.
:param community_size: The size of the community to consider in the \
analysis.
:param ps: The number of processes to create
"""
logging_def(ConfigNt.interaction, __file__, logging_level)
......@@ -442,16 +508,13 @@ def create_all_frequency_figures(ps: int, weight: int = 1,
else:
di = pd.DataFrame()
projects = [f"Global_projects_{global_weight}"]
# with open(ConfigNt.selected_project, 'r') as f:
# projects = f.read().splitlines()
# projects = projects[:-4]
ft_list = ConfigNt.get_features(ft_type)
param = product(projects, ft_list, [ft_type])
pool = mp.Pool(processes=ps)
processes = []
for project, ft, ft_type in param:
args = [di, project, ft_type, ft, weight, global_weight, same_gene,
compute_mean]
compute_mean, community_size]
processes.append(pool.apply_async(execute_density_figure_function,
args))
results = []
......@@ -460,11 +523,13 @@ def create_all_frequency_figures(ps: int, weight: int = 1,
dic = combine_dic(results)
df_corr = pd.DataFrame(dic)
df_corr.to_csv(ConfigNt.get_density_recap(weight, global_weight, ft_type,
'ALL', fig=False),
'ALL', fig=False,
community_size=community_size),
sep="\t")
if global_weight == 0:
for ft in ft_list:
create_scatterplot(df_corr, ft_type, ft, weight, global_weight)
create_scatterplot(df_corr, ft_type, ft, weight, global_weight,
community_size)
if __name__ == "__main__":
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment