From e2d22cc1e60d01cb35f3fa46a53ba7a134cc2d24 Mon Sep 17 00:00:00 2001 From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr> Date: Thu, 18 Jun 2020 10:42:54 +0200 Subject: [PATCH] src/nt_composition/make_nt_correlation.py src/nt_composition/__main__.py: Creation of a new parameter same_gene allowing to remove or keep the co-localisation within the same gene --- src/nt_composition/__main__.py | 7 +++- src/nt_composition/make_nt_correlation.py | 46 ++++++++++++++++++++--- 2 files changed, 46 insertions(+), 7 deletions(-) diff --git a/src/nt_composition/__main__.py b/src/nt_composition/__main__.py index dd1bb8fa..3e0f220e 100644 --- a/src/nt_composition/__main__.py +++ b/src/nt_composition/__main__.py @@ -12,9 +12,12 @@ from .make_nt_correlation import create_all_frequency_figures from .get_projects_interaction import get_interactions_number import lazyparser as lp + +@lp.flag(same_gene=True) @lp.parse(weight='weight > 0', ft_type=['nt', 'dnt', 'tnt', 'codon', 'aa', 'properties']) def launcher(weight: int = 1, global_weight: int = 0, ft_type: str = 'nt', + same_gene: bool = False, logging_level: str = "DISABLE"): """ Launch the creation of density file. @@ -25,11 +28,13 @@ def launcher(weight: int = 1, global_weight: int = 0, ft_type: str = 'nt', by project, else all project are merge together and the interaction \ seen in `global_weight` project are taken into account (default 0) :param ft_type: The feature type of interest (default 'nt') + :param same_gene: Say if we consider as co-localised, exons within the \ + same gene (True) or not (False) (default False) :param logging_level: The level of data to display (default 'DISABLE') """ get_interactions_number(weight, logging_level) create_all_frequency_figures(ConfigNt.cpu, weight, global_weight, ft_type, - logging_level) + same_gene, logging_level) launcher() \ No newline at end of file diff --git a/src/nt_composition/make_nt_correlation.py b/src/nt_composition/make_nt_correlation.py index 6283ecb3..6af50f94 100644 --- a/src/nt_composition/make_nt_correlation.py +++ b/src/nt_composition/make_nt_correlation.py @@ -25,6 +25,7 @@ from itertools import product from random import random import multiprocessing as mp import os +import re class NoInteractionError(Exception): @@ -85,7 +86,7 @@ def get_interacting_exon(arr: np.array) -> np.array: return np.unique(arr.flatten()) -def get_all_exon_interacting_with_another(exon: str, arr: np.array +def get_all_exon_interacting_with_another(exon: str, arr: np.array, ) -> np.array: """ From an exons, get the other exons directly co-localised with it. @@ -102,6 +103,21 @@ def get_all_exon_interacting_with_another(exon: str, arr: np.array return exons[exons != exon] +def remove_exon_in_same_gene(prefix: str, oexon: np.array): + """ + Remove all exons having the prefix `prefix` from `oexon`. + + :param prefix: A prefix + :param oexon: The list of interacting exons + :return: Removing all exon with the preifx in oexon + + >>> remove_exon_in_same_gene("19_", + ... np.asarray(["1_5", "1_3", "19_1", "19_19", "3_3"])) + array(['1_5', '1_3', '3_3'], dtype='<U5') + """ + return oexon[np.nonzero(np.core.defchararray.find(oexon, prefix))] + + def get_frequency_dic(cnx: sqlite3.Connection, ft: str, ft_type: str ) -> Dict[str, float]: """ @@ -133,7 +149,7 @@ def get_frequency_dic(cnx: sqlite3.Connection, ft: str, ft_type: str def create_density_table(arr_interaction: np.array, dic_freq: Dict[str, float], - analyse_id) -> pd.DataFrame: + analyse_id: str, same_gene: bool) -> pd.DataFrame: """ Create the density table, a table showing the frequency of \ a nucleotide in every exon in a chia-pet projet and the mean \ @@ -142,6 +158,8 @@ def create_density_table(arr_interaction: np.array, dic_freq: Dict[str, float], :param arr_interaction: array of interaction between exons. :param dic_freq: The frequency dataframe. :param analyse_id The id of the current analysis + :param same_gene: Say if we consider as co-localised exon within the \ + same gene :return: The density table """ logging.debug(f'Calculating density table ({os.getpid()})') @@ -149,9 +167,15 @@ def create_density_table(arr_interaction: np.array, dic_freq: Dict[str, float], dic = {'exon': [], 'freq_exon': [], 'freq_coloc_exon': [], 'oexon': []} pbar = tqdm(exons_list, desc=f"Working on {analyse_id} ({os.getpid()}) :", position=mp.current_process()._identity[0] - 1) + pat = None + if not same_gene: + pat = re.compile(r"_\d+") for exon in pbar: freq_ex = dic_freq[exon] oexon = get_all_exon_interacting_with_another(exon, arr_interaction) + if not same_gene: + prefix = re.sub(pat, '_', exon) + oexon = remove_exon_in_same_gene(prefix, oexon) freq_oexon = np.nanmean(np.asarray([dic_freq[ex] for ex in oexon], dtype=float)) if freq_ex is not None and not np.isnan(freq_oexon): @@ -207,6 +231,7 @@ def create_density_fig(df: pd.DataFrame, project: str, ft_type: str, ft: str, def create_density_figure(nt: str, ft_type: str, project: str, weight: int, global_weight: int, + same_gene: bool, logging_level: str = "DISABLE" ) -> Tuple[float, float]: """ @@ -221,6 +246,8 @@ def create_density_figure(nt: str, ft_type: str, the global weight is equal to 0 then then density figure are calculated \ by project, else all projet are merge together and the interaction \ seen in `global_weight` project are taken into account + :param same_gene: Say if we consider as co-localised exon within the \ + same gene :param logging_level: The level of information to display :return: The correlation and the p-value """ @@ -233,7 +260,8 @@ def create_density_figure(nt: str, ft_type: str, global_weight) dic_freq = get_frequency_dic(cnx, nt, ft_type) analyse_id = f"{project}_{ft_type}_{nt}" - df = create_density_table(arr_interaction, dic_freq, analyse_id) + df = create_density_table(arr_interaction, dic_freq, analyse_id, + same_gene) df.to_csv(outfile, sep="\t", index=False) r, p = create_density_fig(df, project, ft_type, nt, weight, global_weight) @@ -283,7 +311,8 @@ def create_scatterplot(df_cor: pd.DataFrame, ft_type: str, ft: str, def execute_density_figure_function(di: pd.DataFrame, project : str, ft_type: str, ft: str, weight: int, - global_weight: int) -> Dict[str, Any]: + global_weight: int, + same_gene: bool) -> Dict[str, Any]: """ Execute create_density_figure and organized the results in a dictionary. @@ -296,11 +325,13 @@ def execute_density_figure_function(di: pd.DataFrame, project : str, the global weight is equal to 0 then then density figure are calculated \ by project, else all projet are merge together and the interaction \ seen in `global_weight` project are taken into account + :param same_gene: Say if we consider as co-localised exon within the \ + same gene :return: """ logging.info(f'Working on {project}, {ft_type}, {ft} - {os.getpid()}') r, p = create_density_figure(ft, ft_type, project, weight, - global_weight) + global_weight, same_gene) if global_weight == 0: tmp = {"project": project, "ft_type": ft_type, "ft": ft, "cor": r, "pval": p, @@ -327,6 +358,7 @@ def combine_dic(list_dic: List[Dict]) -> Dict: def create_all_frequency_figures(ps: int, weight: int = 1, global_weight: int = 0, ft_type: str = "nt", + same_gene = True, logging_level: str = "DISABLE"): """ Make density figure for every selected projects. @@ -338,6 +370,8 @@ def create_all_frequency_figures(ps: int, weight: int = 1, by project, else all projet are merge together and the interaction \ seen in `global_weight` project are taken into account :param ft_type: The kind of feature to analyse + :param same_gene: Say if we consider as co-localised exon within the \ + same gene :param ps: The number of processes to create """ logging_def(ConfigNt.interaction, __file__, logging_level) @@ -355,7 +389,7 @@ def create_all_frequency_figures(ps: int, weight: int = 1, pool = mp.Pool(processes=ps) processes = [] for project, ft, ft_type in param: - args = [di, project, ft_type, ft, weight, global_weight] + args = [di, project, ft_type, ft, weight, global_weight, same_gene] processes.append(pool.apply_async(execute_density_figure_function, args)) results = [] -- GitLab