Skip to content
Snippets Groups Projects
Commit 71530c2c authored by nfontrod's avatar nfontrod
Browse files

src/nt_composition/make_nt_correlation.py: add a global weight parameters to...

src/nt_composition/make_nt_correlation.py: add a global weight parameters to combine projects co-localisations
parent 7f594d1f
No related branches found
No related tags found
No related merge requests found
......@@ -30,11 +30,10 @@ import os
class NoInteractionError(Exception):
pass
POSITION = 0
def get_project_colocalisation(cnx: sqlite3.Connection, project: str,
weight: int, expected_interaction: int
weight: int,
global_weight: int,
) -> np.array:
"""
Get the interactions in project `project`
......@@ -42,15 +41,25 @@ def get_project_colocalisation(cnx: sqlite3.Connection, project: str,
:param cnx: Connection to chia-pet database
:param project: The project of interest
:param weight: The minimum weight of interaction to consider
:param expected_interaction: The number of interaction we are expected \
to find
:param global_weight: The global weight to consider. if \
the global weight is equal to 0 then then density figure are calculated \
by project, else all projet are merge together and the interaction \
seen in `global_weight` project are taken into account
:return: The table containing the number of interaction by projects
"""
logging.debug(f'Recovering interaction ({os.getpid()})')
query = "SELECT exon1, exon2 " \
"FROM cin_exon_interaction " \
f"WHERE id_project = '{project}' " \
f"AND weight >= {weight}"
if global_weight == 0:
query = "SELECT exon1, exon2 " \
"FROM cin_exon_interaction " \
f"WHERE weight >= {weight} " \
f"AND id_project = '{project}' "
else:
query = f"SELECT exon1, exon2 " \
f"FROM cin_exon_interaction " \
f"WHERE weight >= {weight} " \
f"GROUP BY exon1, exon2 " \
f"HAVING COUNT(*) >= {global_weight}"
c = cnx.cursor()
c.execute(query)
result = np.asarray(c.fetchall())
......@@ -58,12 +67,6 @@ def get_project_colocalisation(cnx: sqlite3.Connection, project: str,
msg = f'No interaction found in project {project}'
logging.exception(msg)
raise NoInteractionError(msg)
if expected_interaction != len(result):
msg = f"Found an unanticipated number of interaction : " \
f"(Interaction found {len(result)}, " \
f"expected : {expected_interaction}"
logging.exception(msg)
raise ValueError(msg)
logging.debug(result[0:5])
return result
......@@ -162,7 +165,7 @@ def create_density_table(arr_interaction: np.array, dic_freq: Dict[str, float],
def create_density_fig(df: pd.DataFrame, project: str, ft_type: str, nt: str,
weight: int) -> Tuple[float, float]:
weight: int, global_weight: int) -> Tuple[float, float]:
"""
Compute a density file showing if the nucleotide frequency of \
an exons correlates with the frequency in other co-localised exons.
......@@ -174,6 +177,10 @@ def create_density_fig(df: pd.DataFrame, project: str, ft_type: str, nt: str,
:param ft_type: The type of feature of interest
:param nt: The nucleotide of interest
:param weight: The minimum weight of interaction to consider
:param global_weight: The global weight to consider. if \
the global weight is equal to 0 then then density figure are calculated \
by project, else all projet are merge together and the interaction \
seen in `global_weight` project are taken into account
:return: The correlation and the p-value
"""
logging.debug('Creating the density figure')
......@@ -192,13 +199,14 @@ def create_density_fig(df: pd.DataFrame, project: str, ft_type: str, nt: str,
plt.ylabel(f"Freq of {nt} in co-localized exons")
plt.title(f'Freq of {nt} of exons and their co-localized partner in '
f'{project}')
plt.savefig(ConfigNt.get_density_file(weight, project, ft_type, nt, fig=True))
plt.savefig(ConfigNt.get_density_file(weight, global_weight, project,
ft_type, nt, fig=True))
plt.close()
return r, p
def create_density_figure(nt: str, ft_type: str,
project: str, weight: int, expected_interaction: int,
project: str, weight: int, global_weight: int,
logging_level: str = "DISABLE"
) -> Tuple[float, float]:
"""
......@@ -209,22 +217,26 @@ def create_density_figure(nt: str, ft_type: str,
:param nt: The nucleotide used to build the figure
:param ft_type: The type of feature of interest
:param weight: The minimum weight of interaction to consider
:param expected_interaction: The number of interaction we're expecting
:param global_weight: The global weight to consider. if \
the global weight is equal to 0 then then density figure are calculated \
by project, else all projet are merge together and the interaction \
seen in `global_weight` project are taken into account
:param logging_level: The level of information to display
:return: The correlation and the p-value
"""
logging_def(ConfigNt.interaction, __file__, logging_level)
outfile = ConfigNt.get_density_file(weight, project, ft_type, nt,
fig=False)
outfile = ConfigNt.get_density_file(weight, global_weight, project,
ft_type, nt, fig=False)
if not outfile.is_file():
cnx = sqlite3.connect(ConfigNt.db_file)
arr_interaction = get_project_colocalisation(cnx, project, weight,
expected_interaction)
global_weight)
dic_freq = get_frequency_dic(cnx, nt, ft_type)
analyse_id = f"{project}_{ft_type}_{nt}"
df = create_density_table(arr_interaction, dic_freq, analyse_id)
df.to_csv(outfile, sep="\t", index=False)
r, p = create_density_fig(df, project, ft_type, nt, weight)
r, p = create_density_fig(df, project, ft_type, nt, weight,
global_weight)
else:
logging.debug(f'The file {outfile} exist, recovering data '
f'({os.getpid()})')
......@@ -234,7 +246,7 @@ def create_density_figure(nt: str, ft_type: str,
def create_scatterplot(df_cor: pd.DataFrame, ft_type: str, ft: str,
weight: int):
weight: int, global_weight: int):
"""
Create a scatterplot showing the R-value according to the \
number of interaction.
......@@ -243,6 +255,10 @@ def create_scatterplot(df_cor: pd.DataFrame, ft_type: str, ft: str,
:param ft_type: The feature type of interest
:param ft: The feature of interest
:param weight: The minimum weight of interaction to consider
:param global_weight: The global weight to consider. if \
the global weight is equal to 0 then then density figure are calculated \
by project, else all projet are merge together and the interaction \
seen in `global_weight` project are taken into account
"""
sns.set()
sns.set_context('talk')
......@@ -260,13 +276,14 @@ def create_scatterplot(df_cor: pd.DataFrame, ft_type: str, ft: str,
plt.ylabel("Number of total interaction in projects")
plt.title(f'Project correlation for {ft} ({ft_type}) '
f'according to the number of interaction in projects')
plt.savefig(ConfigNt.get_density_recap(weight, ft_type, ft, fig=True))
plt.savefig(ConfigNt.get_density_recap(weight, global_weight, ft_type,
ft, fig=True))
plt.close()
def execute_density_figure_function(di: pd.DataFrame, project : str,
ft_type: str, ft: str, weight: int
) -> Dict[str, Any]:
ft_type: str, ft: str, weight: int,
global_weight: int) -> Dict[str, Any]:
"""
Execute create_density_figure and organized the results in a dictionary.
......@@ -275,16 +292,22 @@ def execute_density_figure_function(di: pd.DataFrame, project : str,
:param ft_type: The feature type of interest
:param ft: The feature of interest
:param weight: The minimum weight of interaction to consider
:param global_weight: The global weight to consider. if \
the global weight is equal to 0 then then density figure are calculated \
by project, else all projet are merge together and the interaction \
seen in `global_weight` project are taken into account
:return:
"""
logging.info(f'Working on {project}, {ft_type}, {ft} - {os.getpid()}')
expected_interaction = di.loc[di['projects'] == project,
'interaction_count'].values[0]
r, p = create_density_figure(ft, ft_type, project, weight,
expected_interaction)
tmp = {"project": project, "ft_type": ft_type,
"ft": ft, "cor": r, "pval": p,
'nb_interaction': di[di['projects'] == project].iloc[0, 1]}
global_weight)
if global_weight == 0:
tmp = {"project": project, "ft_type": ft_type,
"ft": ft, "cor": r, "pval": p,
'nb_interaction': di[di['projects'] == project].iloc[0, 1]}
else:
tmp = {"project": project, "ft_type": ft_type,
"ft": ft, "cor": r, "pval": p}
return tmp
......@@ -302,18 +325,27 @@ def combine_dic(list_dic: List[Dict]) -> Dict:
return dic
def create_all_frequency_figures(ps: int, weight: int,
def create_all_frequency_figures(ps: int, weight: int = 1,
global_weight: int = 0,
logging_level: str = "DISABLE"):
"""
Make density figure for every selected projects.
:param logging_level: The level of data to display.
:param weight: The weight of interaction to consider
:param global_weight: The global weight to consider. if \
the global weight is equal to 0 then then density figure are calculated \
by project, else all projet are merge together and the interaction \
seen in `global_weight` project are taken into account
:param ps: The number of processes to create
"""
logging_def(ConfigNt.interaction, __file__, logging_level)
di = pd.read_csv(ConfigNt.get_interaction_file(weight), sep="\t")
projects = di.loc[di['interaction_count'] > 300, 'projects'].values
if global_weight == 0:
di = pd.read_csv(ConfigNt.get_interaction_file(weight), sep="\t")
projects = di.loc[di['interaction_count'] > 300, 'projects'].values
else:
di = pd.DataFrame()
projects = [f"Global_projects_{global_weight}"]
# with open(ConfigNt.selected_project, 'r') as f:
# projects = f.read().splitlines()
# projects = projects[:-4]
......@@ -322,7 +354,7 @@ def create_all_frequency_figures(ps: int, weight: int,
pool = mp.Pool(processes=ps)
processes = []
for project, nt, ft_type in param:
args = [di, project, ft_type, nt, weight]
args = [di, project, ft_type, nt, weight, global_weight]
processes.append(pool.apply_async(execute_density_figure_function,
args))
results = []
......@@ -330,9 +362,12 @@ def create_all_frequency_figures(ps: int, weight: int,
results.append(proc.get(timeout=None))
dic = combine_dic(results)
df_corr = pd.DataFrame(dic)
df_corr.to_csv(ConfigNt.get_density_recap(weight, 'ALL', 'ALL', fig=False),
df_corr.to_csv(ConfigNt.get_density_recap(weight, global_weight, 'ALL',
'ALL', fig=False),
sep="\t")
create_scatterplot(df_corr, "nt", "S", weight)
if global_weight == 0:
for nt in nt_list:
create_scatterplot(df_corr, "nt", nt, weight, global_weight)
if __name__ == "__main__":
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment