diff --git a/src/nt_composition/make_nt_correlation.py b/src/nt_composition/make_nt_correlation.py index 6af50f94e2c62bd23b984da0c63c8075ec52b238..0b2b82a5754c1d663ccf80368c2618fccedae5e8 100644 --- a/src/nt_composition/make_nt_correlation.py +++ b/src/nt_composition/make_nt_correlation.py @@ -25,7 +25,6 @@ from itertools import product from random import random import multiprocessing as mp import os -import re class NoInteractionError(Exception): @@ -34,7 +33,7 @@ class NoInteractionError(Exception): def get_project_colocalisation(cnx: sqlite3.Connection, project: str, weight: int, - global_weight: int, + global_weight: int, same_gene: bool ) -> np.array: """ Get the interactions in project `project` @@ -50,16 +49,35 @@ def get_project_colocalisation(cnx: sqlite3.Connection, project: str, """ logging.debug(f'Recovering interaction ({os.getpid()})') if global_weight == 0: - query = "SELECT exon1, exon2 " \ - "FROM cin_exon_interaction " \ - f"WHERE weight >= {weight} " \ - f"AND id_project = '{project}' " + if same_gene: + query = "SELECT exon1, exon2 " \ + "FROM cin_exon_interaction " \ + f"WHERE weight >= {weight} " \ + f"AND id_project = '{project}' " + else: + query = f"""SELECT t1.exon1, t1.exon2 + FROM cin_exon_interaction t1, cin_exon t2, cin_exon t3 + WHERE t1.weight >= {weight} + AND id_project = '{project}' + AND t1.exon1 = t2.id + AND t1.exon2 = t3.id + AND t2.id_gene != t3.id_gene""" else: - query = f"SELECT exon1, exon2 " \ - f"FROM cin_exon_interaction " \ - f"WHERE weight >= {weight} " \ - f"GROUP BY exon1, exon2 " \ - f"HAVING COUNT(*) >= {global_weight}" + if same_gene: + query = f"SELECT exon1, exon2 " \ + f"FROM cin_exon_interaction " \ + f"WHERE weight >= {weight} " \ + f"GROUP BY exon1, exon2 " \ + f"HAVING COUNT(*) >= {global_weight}" + else: + query = f"""SELECT t1.exon1, t1.exon2 + FROM cin_exon_interaction t1, cin_exon t2, cin_exon t3 + WHERE t1.weight >= {weight} + AND t1.exon1 = t2.id + AND t1.exon2 = t3.id + AND t2.id_gene != t3.id_gene + GROUP BY exon1, exon2 + HAVING COUNT(*) >= {global_weight}""" c = cnx.cursor() c.execute(query) @@ -103,21 +121,6 @@ def get_all_exon_interacting_with_another(exon: str, arr: np.array, return exons[exons != exon] -def remove_exon_in_same_gene(prefix: str, oexon: np.array): - """ - Remove all exons having the prefix `prefix` from `oexon`. - - :param prefix: A prefix - :param oexon: The list of interacting exons - :return: Removing all exon with the preifx in oexon - - >>> remove_exon_in_same_gene("19_", - ... np.asarray(["1_5", "1_3", "19_1", "19_19", "3_3"])) - array(['1_5', '1_3', '3_3'], dtype='<U5') - """ - return oexon[np.nonzero(np.core.defchararray.find(oexon, prefix))] - - def get_frequency_dic(cnx: sqlite3.Connection, ft: str, ft_type: str ) -> Dict[str, float]: """ @@ -149,7 +152,7 @@ def get_frequency_dic(cnx: sqlite3.Connection, ft: str, ft_type: str def create_density_table(arr_interaction: np.array, dic_freq: Dict[str, float], - analyse_id: str, same_gene: bool) -> pd.DataFrame: + analyse_id: str) -> pd.DataFrame: """ Create the density table, a table showing the frequency of \ a nucleotide in every exon in a chia-pet projet and the mean \ @@ -158,24 +161,16 @@ def create_density_table(arr_interaction: np.array, dic_freq: Dict[str, float], :param arr_interaction: array of interaction between exons. :param dic_freq: The frequency dataframe. :param analyse_id The id of the current analysis - :param same_gene: Say if we consider as co-localised exon within the \ - same gene :return: The density table """ logging.debug(f'Calculating density table ({os.getpid()})') exons_list = get_interacting_exon(arr_interaction) dic = {'exon': [], 'freq_exon': [], 'freq_coloc_exon': [], 'oexon': []} pbar = tqdm(exons_list, desc=f"Working on {analyse_id} ({os.getpid()}) :", - position=mp.current_process()._identity[0] - 1) - pat = None - if not same_gene: - pat = re.compile(r"_\d+") + position=mp.current_process()._identity[0] - 1, disable=None) for exon in pbar: freq_ex = dic_freq[exon] oexon = get_all_exon_interacting_with_another(exon, arr_interaction) - if not same_gene: - prefix = re.sub(pat, '_', exon) - oexon = remove_exon_in_same_gene(prefix, oexon) freq_oexon = np.nanmean(np.asarray([dic_freq[ex] for ex in oexon], dtype=float)) if freq_ex is not None and not np.isnan(freq_oexon): @@ -257,11 +252,10 @@ def create_density_figure(nt: str, ft_type: str, if not outfile.is_file(): cnx = sqlite3.connect(ConfigNt.db_file) arr_interaction = get_project_colocalisation(cnx, project, weight, - global_weight) + global_weight, same_gene) dic_freq = get_frequency_dic(cnx, nt, ft_type) analyse_id = f"{project}_{ft_type}_{nt}" - df = create_density_table(arr_interaction, dic_freq, analyse_id, - same_gene) + df = create_density_table(arr_interaction, dic_freq, analyse_id) df.to_csv(outfile, sep="\t", index=False) r, p = create_density_fig(df, project, ft_type, nt, weight, global_weight)