diff --git a/src/nt_composition/make_nt_correlation.py b/src/nt_composition/make_nt_correlation.py index f506a23d09d145dab24dc7b60d87a37250cd6a5b..0123d0143560096aa057f8c0cad3415f5deb6a4d 100644 --- a/src/nt_composition/make_nt_correlation.py +++ b/src/nt_composition/make_nt_correlation.py @@ -33,19 +33,24 @@ class NoInteractionError(Exception): POSITION = 0 -def get_project_colocalisation(cnx: sqlite3.Connection, project: str +def get_project_colocalisation(cnx: sqlite3.Connection, project: str, + weight: int, expected_interaction: int ) -> np.array: """ Get the interactions in project `project` :param cnx: Connection to chia-pet database :param project: The project of interest + :param weight: The minimum weight of interaction to consider + :param expected_interaction: The number of interaction we are expected \ + to find :return: The table containing the number of interaction by projects """ logging.debug(f'Recovering interaction ({os.getpid()})') query = "SELECT exon1, exon2 " \ "FROM cin_exon_interaction " \ - f"WHERE id_project = '{project}'" + f"WHERE id_project = '{project}' " \ + f"AND weight >= {weight}" c = cnx.cursor() c.execute(query) result = np.asarray(c.fetchall()) @@ -53,6 +58,12 @@ def get_project_colocalisation(cnx: sqlite3.Connection, project: str msg = f'No interaction found in project {project}' logging.exception(msg) raise NoInteractionError(msg) + if expected_interaction != len(result): + msg = f"Found an unanticipated number of interaction : " \ + f"(Interaction found {len(result)}, " \ + f"expected : {expected_interaction}" + logging.exception(msg) + raise ValueError(msg) logging.debug(result[0:5]) return result @@ -150,8 +161,8 @@ def create_density_table(arr_interaction: np.array, dic_freq: Dict[str, float], return df -def create_density_fig(df: pd.DataFrame, project: str, ft_type: str, nt: str - ) -> Tuple[float, float]: +def create_density_fig(df: pd.DataFrame, project: str, ft_type: str, nt: str, + weight: int) -> Tuple[float, float]: """ Compute a density file showing if the nucleotide frequency of \ an exons correlates with the frequency in other co-localised exons. @@ -162,6 +173,7 @@ def create_density_fig(df: pd.DataFrame, project: str, ft_type: str, nt: str where observed :param ft_type: The type of feature of interest :param nt: The nucleotide of interest + :param weight: The minimum weight of interaction to consider :return: The correlation and the p-value """ logging.debug('Creating the density figure') @@ -180,13 +192,14 @@ def create_density_fig(df: pd.DataFrame, project: str, ft_type: str, nt: str plt.ylabel(f"Freq of {nt} in co-localized exons") plt.title(f'Freq of {nt} of exons and their co-localized partner in ' f'{project}') - plt.savefig(ConfigNt.get_density_file(project, ft_type, nt, fig=True)) + plt.savefig(ConfigNt.get_density_file(weight, project, ft_type, nt, fig=True)) plt.close() return r, p def create_density_figure(nt: str, ft_type: str, - project: str, logging_level: str = "DISABLE" + project: str, weight: int, expected_interaction: int, + logging_level: str = "DISABLE" ) -> Tuple[float, float]: """ Create a density figure for the project `project` for the nucleotide \ @@ -195,19 +208,23 @@ def create_density_figure(nt: str, ft_type: str, :param project: The chia-pet project of interest :param nt: The nucleotide used to build the figure :param ft_type: The type of feature of interest + :param weight: The minimum weight of interaction to consider + :param expected_interaction: The number of interaction we're expecting :param logging_level: The level of information to display :return: The correlation and the p-value """ logging_def(ConfigNt.interaction, __file__, logging_level) - outfile = ConfigNt.get_density_file(project, ft_type, nt, fig=False) + outfile = ConfigNt.get_density_file(weight, project, ft_type, nt, + fig=False) if not outfile.is_file(): cnx = sqlite3.connect(ConfigNt.db_file) - arr_interaction = get_project_colocalisation(cnx, project) + arr_interaction = get_project_colocalisation(cnx, project, weight, + expected_interaction) dic_freq = get_frequency_dic(cnx, nt, ft_type) analyse_id = f"{project}_{ft_type}_{nt}" df = create_density_table(arr_interaction, dic_freq, analyse_id) df.to_csv(outfile, sep="\t", index=False) - r, p = create_density_fig(df, project, ft_type, nt) + r, p = create_density_fig(df, project, ft_type, nt, weight) else: logging.debug(f'The file {outfile} exist, recovering data ' f'({os.getpid()})') @@ -216,7 +233,8 @@ def create_density_figure(nt: str, ft_type: str, return r, p -def create_scatterplot(df_cor: pd.DataFrame, ft_type: str, ft: str): +def create_scatterplot(df_cor: pd.DataFrame, ft_type: str, ft: str, + weight: int): """ Create a scatterplot showing the R-value according to the \ number of interaction. @@ -224,6 +242,7 @@ def create_scatterplot(df_cor: pd.DataFrame, ft_type: str, ft: str): :param df_cor: The dataframe of correlation. :param ft_type: The feature type of interest :param ft: The feature of interest + :param weight: The minimum weight of interaction to consider """ sns.set() sns.set_context('talk') @@ -241,29 +260,35 @@ def create_scatterplot(df_cor: pd.DataFrame, ft_type: str, ft: str): plt.ylabel("Number of total interaction in projects") plt.title(f'Project correlation for {ft} ({ft_type}) ' f'according to the number of interaction in projects') - plt.savefig(ConfigNt.get_density_recap(ft_type, ft, fig=True)) + plt.savefig(ConfigNt.get_density_recap(weight, ft_type, ft, fig=True)) plt.close() def execute_density_figure_function(di: pd.DataFrame, project : str, - ft_type: str, ft: str) -> Dict[str, Any]: + ft_type: str, ft: str, weight: int + ) -> Dict[str, Any]: """ Execute create_density_figure and organized the results in a dictionary. + :param di: The dataframe of interaction :param project: The project of interest :param ft_type: The feature type of interest :param ft: The feature of interest + :param weight: The minimum weight of interaction to consider :return: """ logging.info(f'Working on {project}, {ft_type}, {ft} - {os.getpid()}') - r, p = create_density_figure(ft, ft_type, project) + expected_interaction = di.loc[di['projects'] == project, + 'interaction_count'].values[0] + r, p = create_density_figure(ft, ft_type, project, weight, + expected_interaction) tmp = {"project": project, "ft_type": ft_type, "ft": ft, "cor": r, "pval": p, 'nb_interaction': di[di['projects'] == project].iloc[0, 1]} return tmp -def combine_dic(list_dic: Dict) -> Dict: +def combine_dic(list_dic: List[Dict]) -> Dict: """ Combine The dictionaries in list_dic. @@ -277,33 +302,37 @@ def combine_dic(list_dic: Dict) -> Dict: return dic -def create_all_frequency_figures(ps: int, +def create_all_frequency_figures(ps: int, weight: int, logging_level: str = "DISABLE"): """ Make density figure for every selected projects. :param logging_level: The level of data to display. + :param weight: The weight of interaction to consider :param ps: The number of processes to create """ logging_def(ConfigNt.interaction, __file__, logging_level) - di = pd.read_csv(ConfigNt.interaction_file, sep="\t") - projects = di['projects'].values + di = pd.read_csv(ConfigNt.get_interaction_file(weight), sep="\t") + projects = di.loc[di['interaction_count'] > 300, 'projects'].values # with open(ConfigNt.selected_project, 'r') as f: # projects = f.read().splitlines() + # projects = projects[:-4] nt_list = ['A', 'C', 'G', 'T', 'S', 'W'] param = product(projects, nt_list, ['nt']) pool = mp.Pool(processes=ps) processes = [] for project, nt, ft_type in param: - args = [di, project, ft_type, nt] - processes.append(pool.apply_async(execute_density_figure_function, args)) + args = [di, project, ft_type, nt, weight] + processes.append(pool.apply_async(execute_density_figure_function, + args)) results = [] for proc in processes: results.append(proc.get(timeout=None)) dic = combine_dic(results) df_corr = pd.DataFrame(dic) - df_corr.to_csv(ConfigNt.density_folder / "density_recap.txt", sep="\t") - create_scatterplot(df_corr, "nt", "S") + df_corr.to_csv(ConfigNt.get_density_recap(weight, 'ALL', 'ALL', fig=False), + sep="\t") + create_scatterplot(df_corr, "nt", "S", weight) if __name__ == "__main__":