diff --git a/src/nt_composition/__main__.py b/src/nt_composition/__main__.py index 8948a419cae14906965339c7d97260551c1f709e..9b0bd1187bf4a50fead22fe6f52938920a4ca6f7 100644 --- a/src/nt_composition/__main__.py +++ b/src/nt_composition/__main__.py @@ -22,6 +22,7 @@ def launcher(weight: int = 1, global_weight: int = 0, ft_type: str = 'nt', same_gene: bool = False, compute_mean: bool = True, iteration: int = 10000, kind: str = 'density', community_size: int = -1, inter_chr: bool = False, + level: str = "exon", logging_level: str = "DISABLE"): """ Launch the creation of density file. @@ -48,6 +49,7 @@ def launcher(weight: int = 1, global_weight: int = 0, ft_type: str = 'nt', than community size. if community_size = -1. This option is deactivated. :param inter_chr: True to only get inter-chromosomal interactions \ False else (default: False) + :param level: The kind of feature to analyse (exon or gene) :param logging_level: The level of data to display (default 'DISABLE') """ get_interactions_number(weight, same_gene, inter_chr, logging_level) @@ -56,12 +58,12 @@ def launcher(weight: int = 1, global_weight: int = 0, ft_type: str = 'nt', if kind == "density": create_all_frequency_figures(ConfigNt.cpu, weight, global_weight, ft_type, same_gene, compute_mean, - community_size, inter_chr, + community_size, inter_chr, level, logging_level) else: create_all_distance_figures(ConfigNt.cpu, weight, global_weight, ft_type, same_gene, iteration, inter_chr, - logging_level) + level, logging_level) launcher() \ No newline at end of file diff --git a/src/nt_composition/distance.py b/src/nt_composition/distance.py index 5ec3ebdbc1b0695e8f98cfcd80c5a3ed9a9b5e40..41c64c12bc0406061ecaa39c18b05b25f3f6b800 100644 --- a/src/nt_composition/distance.py +++ b/src/nt_composition/distance.py @@ -123,7 +123,11 @@ def compute_mean_distance(arr_interaction: np.array, # for exon1, exon2 in arr_interaction])) res = [] for exon1, exon2 in arr_interaction: - if exon1 != exon2: + if ( + exon1 != exon2 + and dic_freq[exon1] is not None + and dic_freq[exon2] is not None + ): val = abs(dic_freq[exon1] - dic_freq[exon2]) res.append(val) return float(np.nanmean(res)) @@ -204,7 +208,7 @@ def create_distance_table(arr_interaction: np.array, """ dic = {"iteration": [], 'kind': [], 'distance': [], 'coloc': []} - # Â Add ctrl distance to the dictionary + # Add ctrl distance to the dictionary ctrl_distance, nb_int = compute_controls_distances(arr_interaction, dic_freq, iteration) for k, d in enumerate(ctrl_distance): @@ -223,7 +227,7 @@ def create_distance_table(arr_interaction: np.array, def create_distance_figure(df: pd.DataFrame, project: str, figfile: Path, - ft: str, iteration: int): + ft: str, iteration: int, level: str): """ Create a barplot figure showing the nucleotide distance between \ co-localised exons in a ChIA-PET project and what this distance would be \ @@ -234,6 +238,7 @@ def create_distance_figure(df: pd.DataFrame, project: str, figfile: Path, :param figfile: The file where the figure will be created :param ft: The feature of interest :param iteration: Number of iteration + :param level: The kind of feature to analyse (exon or gene) """ sns.set(context='talk') pval = df.loc[-df['kind'].isin(['CTRL', project]), 'kind'].values[0] @@ -245,7 +250,7 @@ def create_distance_figure(df: pd.DataFrame, project: str, figfile: Path, loc='inside', box_pair_list=[['CTRL', project, pval]], linewidth=2, min_pval=1/iteration) plt.title(f"Difference of {ft} frequency distance between " - f"co-localized exons and\nthe one of those same exons " + f"co-localized {level} and\nthe one of those same {level} " f"if the co-localisation where random ({iteration} " f"samples).") g.savefig(figfile) @@ -255,7 +260,7 @@ def create_distance_figure(df: pd.DataFrame, project: str, figfile: Path, def create_figures(ft: str, ft_type: str, project: str, weight: int, global_weight: int, same_gene: bool, iteration: int, inter_chr: bool = False, - logging_level: str = "DISABLE" + level: str = "exon", logging_level: str = "DISABLE" ) -> None: """ Create a distance figure for the project `project` for the nucleotide \ @@ -274,6 +279,7 @@ def create_figures(ft: str, ft_type: str, :param inter_chr: True to only get inter-chromosomal interactions \ False else :param logging_level: The level of information to display + :param level: The kind of feature to analyse (exon or gene) :param iteration: The number of iteration """ logging_def(ConfigNt.interaction, __file__, logging_level) @@ -285,8 +291,9 @@ def create_figures(ft: str, ft_type: str, cnx = sqlite3.connect(ConfigNt.db_file) arr_interaction = get_project_colocalisation(cnx, project, weight, global_weight, same_gene, - inter_chr=inter_chr) - dic_freq = get_frequency_dic(cnx, ft, ft_type) + inter_chr=inter_chr, + level=level) + dic_freq = get_frequency_dic(cnx, ft, ft_type, level) df = create_distance_table(arr_interaction, dic_freq, project, iteration, ft, ft_type) df.to_csv(outfile, sep="\t", index=False) @@ -294,7 +301,7 @@ def create_figures(ft: str, ft_type: str, ft_type, ft, fig=True, kind="distance", inter_chr=inter_chr) - create_distance_figure(df, project, figfile, ft, iteration) + create_distance_figure(df, project, figfile, ft, iteration, level) def execute_density_figure_function(project: str, @@ -302,7 +309,8 @@ def execute_density_figure_function(project: str, global_weight: int, same_gene: bool, iteration: int, - inter_chr: bool = False) -> None: + inter_chr: bool = False, + level: str = "exon") -> None: """ Execute create_density_figure and organized the results in a dictionary. @@ -320,17 +328,18 @@ def execute_density_figure_function(project: str, the control set. :param inter_chr: True to only get inter-chromosomal interactions \ False else + :param level: The kind of feature to analyse (exon or gene) """ logging.info(f'Working on {project}, {ft_type}, {ft} - {os.getpid()}') create_figures(ft, ft_type, project, weight, - global_weight, same_gene, iteration, inter_chr) + global_weight, same_gene, iteration, inter_chr, level) def create_all_distance_figures(ps: int, weight: int = 1, global_weight: int = 0, ft_type: str = "nt", same_gene: bool = True, iteration: int = 10000, - inter_chr: bool = False, + inter_chr: bool = False, level: str = "exon", logging_level: str = "DISABLE"): """ Make density figure for every selected projects. @@ -348,6 +357,7 @@ def create_all_distance_figures(ps: int, weight: int = 1, :param inter_chr: True to only get inter-chromosomal interactions \ False else :param ps: The number of processes to create + :param level: The kind of feature to analyse (exon or gene) """ logging_def(ConfigNt.interaction, __file__, logging_level) if global_weight == 0: @@ -360,7 +370,7 @@ def create_all_distance_figures(ps: int, weight: int = 1, processes = [] for project, ft, ft_type in param: args = [project, ft_type, ft, weight, global_weight, same_gene, - iteration, inter_chr] + iteration, inter_chr, level] processes.append(pool.apply_async(execute_density_figure_function, args)) for proc in processes: diff --git a/src/nt_composition/make_nt_correlation.py b/src/nt_composition/make_nt_correlation.py index 58969b9e7c329f8729656d85dd995a688a0b2d47..f1fcfb709922c12bc1b56b6d333c044887e02b97 100644 --- a/src/nt_composition/make_nt_correlation.py +++ b/src/nt_composition/make_nt_correlation.py @@ -152,8 +152,8 @@ def get_project_colocalisation(cnx: sqlite3.Connection, project: str, return result -def get_frequency_dic(cnx: sqlite3.Connection, ft: str, ft_type: str - ) -> Dict[str, float]: +def get_frequency_dic(cnx: sqlite3.Connection, ft: str, ft_type: str, + level: str = "exon") -> Dict[str, float]: """ Get the frequency of a feature for every exon in \ the chia-pet database. @@ -161,23 +161,30 @@ def get_frequency_dic(cnx: sqlite3.Connection, ft: str, ft_type: str :param cnx: Connection to chia-pet database :param ft: The feature of interest :param ft_type: The type of feature of interest + :param level: The kind of feature to analyse (exon or gene) :return: The frequency dic """ logging.debug('Calculating frequency table') - query = "SELECT id_exon, frequency " \ - "FROM cin_exon_frequency " \ + reg = '' + if level == "gene": + reg = "AND region='gene'" + query = f"SELECT id_{level}, frequency " \ + f"FROM cin_{level}_frequency " \ f"WHERE ft = '{ft}' " \ - f"AND ft_type = '{ft_type}'" + f"AND ft_type = '{ft_type}' " \ + f"{reg}" + c = cnx.cursor() c.execute(query) result = c.fetchall() if len(result) == 0: - msg = f'No Frequencies found for {ft} ({ft_type})' + msg = f'No Frequencies found for {level} {ft} ({ft_type})' logging.exception(msg) raise NoInteractionError(msg) - dic = {} - for exon, freq in result: - dic[exon] = freq + if level == "exon": + dic = {exon: freq for exon, freq in result} + else: + dic = {str(exon): freq for exon, freq in result} logging.debug({k: dic[k] for k in list(dic.keys())[0:5]}) return dic @@ -203,7 +210,8 @@ def get_dic_co_regulated_exon(arr_interaction: np.array): return d -def density_mean(arr_interaction: np.array, dic_freq: Dict[str, float]): +def density_mean(arr_interaction: np.array, dic_freq: Dict[str, float], + level: str): """ Create the density table, a table showing the frequency of \ a nucleotide in every exon in a chia-pet project and the mean \ @@ -211,26 +219,29 @@ def density_mean(arr_interaction: np.array, dic_freq: Dict[str, float]): :param arr_interaction: array of interaction between exons. :param dic_freq: The frequency dataframe. + :param level: The kind of feature to analyse (exon or gene) :return: The density table """ exons_dic = get_dic_co_regulated_exon(arr_interaction) - dic = {'exon': [], 'freq_exon': [], 'freq_coloc_exon': [], 'oexon': []} + dic = {f'{level}': [], f'freq_{level}': [], f'freq_coloc_{level}': [], + f'o{level}': []} for exon in exons_dic.keys(): freq_ex = dic_freq[exon] oexon = np.unique(exons_dic[exon]) freq_oexon = np.nanmean(np.asarray([dic_freq[ex] for ex in oexon], dtype=float)) if freq_ex is not None and not np.isnan(freq_oexon): - dic['exon'].append(exon) - dic['freq_exon'].append(freq_ex) - dic['freq_coloc_exon'].append(freq_oexon) - dic['oexon'].append(", ".join(oexon)) + dic[level].append(exon) + dic[f'freq_{level}'].append(freq_ex) + dic[f'freq_coloc_{level}'].append(freq_oexon) + dic[f'o{level}'].append(", ".join(oexon)) df = pd.DataFrame(dic) logging.debug(df.head()) return df -def simple_density(arr_interaction: np.array, dic_freq: Dict[str, float]): +def simple_density(arr_interaction: np.array, dic_freq: Dict[str, float], + level: str): """ Create the density table, a table showing the frequency of \ a nucleotide in every exon in a chia-pet project and the \ @@ -238,24 +249,27 @@ def simple_density(arr_interaction: np.array, dic_freq: Dict[str, float]): :param arr_interaction: array of interaction between exons. :param dic_freq: The frequency dataframe. + :param level: The kind of feature to analyse (exon or gene) :return: The density table """ - dic = {'exon1': [], 'freq_exon': [], 'freq_coloc_exon': [], 'exon2': []} + dic = {f'{level}1': [], f'freq_{level}': [], + f'freq_coloc_{level}': [], f'{level}2': []} for exon1, exon2 in arr_interaction: freq_ex = dic_freq[exon1] freq_ex2 = dic_freq[exon2] if freq_ex is not None and freq_ex2 is not None: - dic['exon1'].append(exon1) - dic['freq_exon'].append(freq_ex) - dic['freq_coloc_exon'].append(freq_ex2) - dic['exon2'].append(exon2) + dic[f'{level}1'].append(exon1) + dic[f'freq_{level}'].append(freq_ex) + dic[f'freq_coloc_{level}'].append(freq_ex2) + dic[f'{level}2'].append(exon2) df = pd.DataFrame(dic) logging.debug(df.head()) return df def create_density_table(arr_interaction: np.array, dic_freq: Dict[str, float], - compute_mean: bool = False) -> pd.DataFrame: + compute_mean: bool = False, level: str = "exon" + ) -> pd.DataFrame: """ Create the density table, a table showing the frequency of \ a nucleotide in every exon in a chia-pet project and the mean \ @@ -266,19 +280,21 @@ def create_density_table(arr_interaction: np.array, dic_freq: Dict[str, float], :param dic_freq: The frequency dataframe. :param compute_mean: True to compute the mean frequency of co-localised \ exons, false to only compute the frequency of one co-localized exons. + :param level: The kind of feature to analyse (exon or gene) :return: The density table """ logging.debug(f'Calculating density table ({os.getpid()})') if compute_mean: - return density_mean(arr_interaction, dic_freq) + return density_mean(arr_interaction, dic_freq, level) else: - return simple_density(arr_interaction, dic_freq) + return simple_density(arr_interaction, dic_freq, level) def create_density_fig(df: pd.DataFrame, project: str, ft_type: str, ft: str, weight: int, global_weight: int, community_size: Optional[int], - inter_chr: bool) -> Tuple[float, float]: + inter_chr: bool, level: str + ) -> Tuple[float, float]: """ Compute a density file showing if the feature frequency of \ an exons correlates with the frequency in other co-localised exons. @@ -296,23 +312,26 @@ def create_density_fig(df: pd.DataFrame, project: str, ft_type: str, ft: str, seen in `global_weight` project are taken into account :param inter_chr: True to only get inter-chromosomal interactions \ False else + :param level: The kind of feature to analyse (exon or gene) :return: The correlation and the p-value """ logging.debug('Creating the density figure') sns.set() sns.set_context('talk') plt.figure(figsize=(20, 12)) - df2 = df[['freq_exon', 'freq_coloc_exon']].copy() - s, i, r, p, stderr = linregress(df.freq_exon, df.freq_coloc_exon) - sns.kdeplot(df2.freq_exon, df2.freq_coloc_exon, shade=True, + df2 = df[[f'freq_{level}', f'freq_coloc_{level}']].copy() + s, i, r, p, stderr = linregress(df[f"freq_{level}"], + df[f'freq_coloc_{level}']) + sns.kdeplot(df2[f'freq_{level}'], df2[f'freq_coloc_{level}'], shade=True, shade_lowest=False, cmap="YlOrBr") - plt.plot(df.freq_exon, i + s * df.freq_exon, 'r', + plt.plot(df[f"freq_{level}"], i + s * df[f"freq_{level}"], 'r', label=f'r={round(r,4)}, p={round(p, 4)}') plt.legend() - plt.xlabel(f"Freq of {ft} in an exon") - plt.ylabel(f"Freq of {ft} in co-localized exons") - title = f'Freq of {ft} of exons and their co-localized partner in ' \ + ml = "an exon" if level == "exon" else "a gene" + plt.xlabel(f"Freq of {ft} in {ml}") + plt.ylabel(f"Freq of {ft} in co-localized {level}s") + title = f'Freq of {ft} of {level}s and their co-localized partner in ' \ f'{project}' if inter_chr: title += '\n(inter chromosomal interactions)' @@ -329,7 +348,7 @@ def create_density_figure(nt: str, ft_type: str, project: str, weight: int, global_weight: int, same_gene: bool, compute_mean: bool, community_size: Optional[int], - inter_chr: bool = False, + inter_chr: bool = False, level: str = "exon", logging_level: str = "DISABLE" ) -> Tuple[float, float]: """ @@ -352,6 +371,7 @@ def create_density_figure(nt: str, ft_type: str, analysis. :param inter_chr: True to only get inter-chromosomal interactions \ False else + :param level: The kind of feature to analyse (exon or gene) :param logging_level: The level of information to display :return: The correlation and the p-value """ @@ -368,17 +388,20 @@ def create_density_figure(nt: str, ft_type: str, arr_interaction = get_project_colocalisation(cnx, project, weight, global_weight, same_gene, False, exons_bc, - inter_chr) - dic_freq = get_frequency_dic(cnx, nt, ft_type) - df = create_density_table(arr_interaction, dic_freq, compute_mean) + inter_chr, level=level) + dic_freq = get_frequency_dic(cnx, nt, ft_type, level) + df = create_density_table(arr_interaction, dic_freq, compute_mean, + level) df.to_csv(outfile, sep="\t", index=False) r, p = create_density_fig(df, project, ft_type, nt, weight, - global_weight, community_size, inter_chr) + global_weight, community_size, inter_chr, + level) else: logging.debug(f'The file {outfile} exist, recovering data ' f'({os.getpid()})') df = pd.read_csv(outfile, sep="\t") - s, i, r, p, stderr = linregress(df.freq_exon, df.freq_coloc_exon) + s, i, r, p, stderr = linregress(df[f"freq_{level}"], + df[f"freq_coloc_{level}"]) return r, p @@ -433,7 +456,8 @@ def execute_density_figure_function(di: pd.DataFrame, project : str, same_gene: bool, compute_mean: bool, community_size: Optional[int], - inter_chr: bool = False + inter_chr: bool = False, + level: str= "exon", ) -> Dict[str, Any]: """ Execute create_density_figure and organized the results in a dictionary. @@ -455,20 +479,20 @@ def execute_density_figure_function(di: pd.DataFrame, project : str, analysis. :param inter_chr: True to only get inter-chromosomal interactions \ False else + :param level: The kind of feature to analyse (exon or gene) :return: """ logging.info(f'Working on {project}, {ft_type}, {ft} - {os.getpid()}') r, p = create_density_figure(ft, ft_type, project, weight, global_weight, same_gene, compute_mean, - community_size, inter_chr) + community_size, inter_chr, level) if global_weight == 0: - tmp = {"project": project, "ft_type": ft_type, + return {"project": project, "ft_type": ft_type, "ft": ft, "cor": r, "pval": p, 'nb_interaction': di[di['projects'] == project].iloc[0, 1]} else: - tmp = {"project": project, "ft_type": ft_type, + return {"project": project, "ft_type": ft_type, "ft": ft, "cor": r, "pval": p} - return tmp def combine_dic(list_dic: List[Dict]) -> Dict: @@ -524,7 +548,7 @@ def create_all_frequency_figures(ps: int, weight: int = 1, global_weight: int = 0, ft_type: str = "nt", same_gene = True, compute_mean: bool = True, community_size: Optional[int] = None, - inter_chr: bool = False, + inter_chr: bool = False, level: str = "exon", logging_level: str = "DISABLE"): """ Make density figure for every selected projects. @@ -545,6 +569,7 @@ def create_all_frequency_figures(ps: int, weight: int = 1, :param ps: The number of processes to create :param inter_chr: True to only get inter-chromosomal interactions \ False else + :param level: The kind of feature to analyse (exon or gene) """ logging_def(ConfigNt.interaction, __file__, logging_level) if global_weight == 0: @@ -560,7 +585,7 @@ def create_all_frequency_figures(ps: int, weight: int = 1, processes = [] for project, ft, ft_type in param: args = [di, project, ft_type, ft, weight, global_weight, same_gene, - compute_mean, community_size, inter_chr] + compute_mean, community_size, inter_chr, level] processes.append(pool.apply_async(execute_density_figure_function, args)) results = [proc.get(timeout=None) for proc in processes]