diff --git a/src/nt_composition/make_nt_correlation.py b/src/nt_composition/make_nt_correlation.py index d0e13294736fa23602662a16813b9d4025c92771..286daa528b0ea3f34777fac8c755909b58e8a5eb 100644 --- a/src/nt_composition/make_nt_correlation.py +++ b/src/nt_composition/make_nt_correlation.py @@ -169,25 +169,23 @@ def get_dic_co_regulated_exon(arr_interaction: np.array): return d -def create_density_table(arr_interaction: np.array, dic_freq: Dict[str, float], - ) -> pd.DataFrame: +def density_mean(arr_interaction: np.array, dic_freq: Dict[str, float]): """ Create the density table, a table showing the frequency of \ - a nucleotide in every exon in a chia-pet projet and the mean \ - frequency of every other co-localised exons. + a nucleotide in every exon in a chia-pet project and the mean \ + frequency of every other co-localised exons. :param arr_interaction: array of interaction between exons. :param dic_freq: The frequency dataframe. :return: The density table """ - logging.debug(f'Calculating density table ({os.getpid()})') exons_dic = get_dic_co_regulated_exon(arr_interaction) dic = {'exon': [], 'freq_exon': [], 'freq_coloc_exon': [], 'oexon': []} for exon in exons_dic.keys(): freq_ex = dic_freq[exon] oexon = np.unique(exons_dic[exon]) freq_oexon = np.nanmean(np.asarray([dic_freq[ex] for ex in oexon], - dtype=float)) + dtype=float)) if freq_ex is not None and not np.isnan(freq_oexon): dic['exon'].append(exon) dic['freq_exon'].append(freq_ex) @@ -198,6 +196,51 @@ def create_density_table(arr_interaction: np.array, dic_freq: Dict[str, float], return df +def simple_density(arr_interaction: np.array, dic_freq: Dict[str, float]): + """ + Create the density table, a table showing the frequency of \ + a nucleotide in every exon in a chia-pet project and the \ + frequency of one co-localised exon. + + :param arr_interaction: array of interaction between exons. + :param dic_freq: The frequency dictionary. 
+ :return: The density table + """ + dic = {'exon1': [], 'freq_exon': [], 'freq_coloc_exon': [], 'exon2': []} + for exon1, exon2 in arr_interaction: + freq_ex = dic_freq[exon1] + freq_ex2 = dic_freq[exon2] + if freq_ex is not None and freq_ex2 is not None: + dic['exon1'].append(exon1) + dic['freq_exon'].append(freq_ex) + dic['freq_coloc_exon'].append(freq_ex2) + dic['exon2'].append(exon2) + df = pd.DataFrame(dic) + logging.debug(df.head()) + return df + + +def create_density_table(arr_interaction: np.array, dic_freq: Dict[str, float], + compute_mean: bool = False) -> pd.DataFrame: + """ + Create the density table, a table showing the frequency of \ + a nucleotide in every exon in a chia-pet project and the mean \ + frequency of every other co-localised exons or the frequency of one \ + co-localised exon. + + :param arr_interaction: array of interaction between exons. + :param dic_freq: The frequency dictionary. + :param compute_mean: True to compute the mean frequency of co-localised \ + exons, False to only use the frequency of one co-localised exon. 
+ :return: The density table + """ + logging.debug(f'Calculating density table ({os.getpid()})') + if compute_mean: + return density_mean(arr_interaction, dic_freq) + else: + return simple_density(arr_interaction, dic_freq) + + def create_density_fig(df: pd.DataFrame, project: str, ft_type: str, ft: str, weight: int, global_weight: int) -> Tuple[float, float]: """ @@ -241,7 +284,7 @@ def create_density_fig(df: pd.DataFrame, project: str, ft_type: str, ft: str, def create_density_figure(nt: str, ft_type: str, project: str, weight: int, global_weight: int, - same_gene: bool, + same_gene: bool, compute_mean: bool, logging_level: str = "DISABLE" ) -> Tuple[float, float]: """ @@ -258,6 +301,8 @@ def create_density_figure(nt: str, ft_type: str, seen in `global_weight` project are taken into account :param same_gene: Say if we consider as co-localised exon within the \ same gene + :param compute_mean: True to compute the mean frequency of co-localised \ + exons, False to only use the frequency of one co-localised exon. :param logging_level: The level of information to display :return: The correlation and the p-value """ @@ -269,7 +314,7 @@ def create_density_figure(nt: str, ft_type: str, arr_interaction = get_project_colocalisation(cnx, project, weight, global_weight, same_gene) dic_freq = get_frequency_dic(cnx, nt, ft_type) - df = create_density_table(arr_interaction, dic_freq) + df = create_density_table(arr_interaction, dic_freq, compute_mean) df.to_csv(outfile, sep="\t", index=False) r, p = create_density_fig(df, project, ft_type, nt, weight, global_weight) @@ -320,7 +365,8 @@ def create_scatterplot(df_cor: pd.DataFrame, ft_type: str, ft: str, def execute_density_figure_function(di: pd.DataFrame, project : str, ft_type: str, ft: str, weight: int, global_weight: int, - same_gene: bool) -> Dict[str, Any]: + same_gene: bool, + compute_mean) -> Dict[str, Any]: """ Execute create_density_figure and organized the results in a dictionary. 
@@ -335,11 +381,13 @@ def execute_density_figure_function(di: pd.DataFrame, project : str, seen in `global_weight` project are taken into account :param same_gene: Say if we consider as co-localised exon within the \ same gene + :param compute_mean: True to compute the mean frequency of co-localised \ + exons, False to only use the frequency of one co-localised exon. :return: """ logging.info(f'Working on {project}, {ft_type}, {ft} - {os.getpid()}') r, p = create_density_figure(ft, ft_type, project, weight, - global_weight, same_gene) + global_weight, same_gene, compute_mean) if global_weight == 0: tmp = {"project": project, "ft_type": ft_type, "ft": ft, "cor": r, "pval": p, @@ -366,7 +414,7 @@ def combine_dic(list_dic: List[Dict]) -> Dict: def create_all_frequency_figures(ps: int, weight: int = 1, global_weight: int = 0, ft_type: str = "nt", - same_gene = True, + same_gene = True, compute_mean: bool = True, logging_level: str = "DISABLE"): """ Make density figure for every selected projects. @@ -380,6 +428,8 @@ def create_all_frequency_figures(ps: int, weight: int = 1, :param ft_type: The kind of feature to analyse :param same_gene: Say if we consider as co-localised exon within the \ same gene + :param compute_mean: True to compute the mean frequency of co-localised \ + exons, False to only use the frequency of one co-localised exon. :param ps: The number of processes to create """ logging_def(ConfigNt.interaction, __file__, logging_level) @@ -397,7 +447,8 @@ def create_all_frequency_figures(ps: int, weight: int = 1, pool = mp.Pool(processes=ps) processes = [] for project, ft, ft_type in param: - args = [di, project, ft_type, ft, weight, global_weight, same_gene] + args = [di, project, ft_type, ft, weight, global_weight, same_gene, + compute_mean] processes.append(pool.apply_async(execute_density_figure_function, args)) results = []