#!/usr/bin/env python3

# -*- coding: UTF-8 -*-

"""
Build a bar plot comparing exon co-interaction for a splicing factor (SF).

For the exons down-regulated, then up-regulated, by a splicing factor, the
module computes the percentage of these exons that interact (according to the
ChIA-PET datasets stored in the database) with at least one other exon
regulated in the same way by the same factor. The same percentage is computed
on randomly drawn control exons and the four values are drawn as a bar plot.
"""

import random
import sqlite3
from contextlib import closing
from typing import Dict, List, Optional, Tuple

import pandas as pd
import seaborn as sns

from .config_figures import Config

SF = "HNRNPC"

# Project IDs (table cin_project_splicing_lore) that are always discarded.
# NOTE(review): the reason for excluding these projects is not visible in
# this file - confirm with the author.
EXCLUDED_PROJECTS = frozenset((139, 13, 164))

# Default location of the figure written by launch_figures_creation().
DEFAULT_FIGURE_FILE = '/home/audrey/Figures_test/Figure_1.png'

# Query selecting the regulated exons of a project; the name of the p-value
# column is filled in from a fixed tuple of column names (never user input).
_ASE_QUERY = """SELECT delta_psi, gene_id, exon_id
                FROM cin_ase_event
                WHERE id_project = ?
                AND (delta_psi >= 0.1 OR delta_psi <= -0.1)
                AND {pvalue_column} <= 0.05"""


# ########################GET DATA FROM THE DATABASE#########################
def get_info_from_database(cnx: sqlite3.Connection, query: str) -> list:
    """
    Get the first column returned by a query, each value converted to str.

    For example:
    - the ID of all fasterdb exons, e.g: ['9995_6', '9995_7',
    '9996_1', '9996_10']
    - the ID of all ChIA-PET experiments, e.g: ['GSM1327093',
    'GSM1327094', 'GSM1436259']

    :param cnx: connexion to the ChIA-PET database
    :param query: the SQL query that allows us to get data from the \
    database. Only the first selected column is kept.
    :return: the list of the values of the first column, as strings, e.g:
    ['9995_6', '9995_7', '9996_1', '9996_10']
    """
    cursor = cnx.cursor()
    cursor.execute(query)
    # Read the value directly from the row instead of stripping characters
    # from the repr of the tuple, which corrupts values that begin or end
    # with a quote, a comma or a parenthesis.
    return [str(row[0]) for row in cursor.fetchall()]


# ####################GET EXONS DOWN OR UP REGULATED BY SFs####################
def get_projects_links_to_a_splicing_factor(cnx: sqlite3.Connection,
                                            sf_name: str) -> list:
    """
    Get the ID of every projects corresponding to a particular splicing factor.

    The projects listed in EXCLUDED_PROJECTS are never returned.

    :param cnx: connexion to the ChIA-PET database
    :param sf_name: the splicing factor name, e.g: PTBP1
    :return: a list of id_project (table cin_project_splicing_lore),
    corresponding to a particular splicing factor. E.g: [7, 30, 96, 135]
    """
    cursor = cnx.cursor()
    query = """SELECT id
               FROM cin_project_splicing_lore
               WHERE sf_name = ?"""
    cursor.execute(query, (sf_name,))
    return [row[0] for row in cursor.fetchall()
            if row[0] not in EXCLUDED_PROJECTS]


def get_ase_events(cnx: sqlite3.Connection, id_project: int) -> list:
    """
    Get every exons regulated (down or up) according to a particular project.

    Exons are selected on |delta_psi| >= 0.1 and pvalue_glm_cor <= 0.05. When
    this selection returns nothing, the same selection is done with the
    column pvalue instead of pvalue_glm_cor.

    :param cnx: connexion to the ChIA-PET database
    :param id_project: a project ID of the table cin_project_splicing_lore
    :return: each sublist corresponds to an exon (exon_regulation +
    gene_id + exon_id), e.g: ['down', 18673, '18673_17']
    """
    cursor = cnx.cursor()
    res = []
    for pvalue_column in ("pvalue_glm_cor", "pvalue"):
        cursor.execute(_ASE_QUERY.format(pvalue_column=pvalue_column),
                       (id_project,))
        res = cursor.fetchall()
        if res:
            break
    # A negative delta_psi means that the exon is down-regulated
    return [["down" if delta_psi < 0 else "up", gene_id, exon_id]
            for delta_psi, gene_id, exon_id in res]


def washing_events(exon_list: list) -> list:
    """
    Remove redundant exons or remove exons showing different regulation.

    :param exon_list: each sublist corresponds to an exon (exon_regulation +
    gene_id + exon_id), e.g: ['down', 18673, '18673_17'].
    Every exon regulated by a splicing factor in different projects.
    :return: each sublist is the exon name (regulation, gene_id and exon_id
    joined by '_') split on '_', e.g: ['down', '18962', '18962', '14'].
    Every exon regulated by a splicing factor in different projects without
    redundancy. Exons found both up and down are discarded.
    """
    opposite = {"up": "down", "down": "up"}
    kept = {}  # used as an ordered set of exon names
    seen_prefixes = set()
    for exon in exon_list:
        regulation, gene_id, exon_id = exon[:3]
        prefix = f"{gene_id}_{exon_id}"
        name = f"{regulation}_{prefix}"
        if name in kept:
            continue  # redundant event
        if prefix not in seen_prefixes:
            kept[name] = True
            seen_prefixes.add(prefix)
        else:
            # The exon was already seen with the other regulation: drop it.
            # If it is not in kept anymore, it was deleted before because
            # of a different regulation.
            kept.pop(f"{opposite[regulation]}_{prefix}", None)
    return [name.split("_") for name in kept]


def get_every_events_4_a_sl(cnx: sqlite3.Connection, sf_name: str,
                            regulation: str) -> Tuple[dict, str]:
    """
    Get every splicing events for a given splicing factor.

    :param cnx: connexion to the ChIA-PET database
    :param sf_name: the splicing factor name, e.g: PTBP1
    :param regulation: up or down
    :return: a tuple (sf_reg, number_exons) with:
    - sf_reg: a dictionary with a list of regulated exons depending on a
    splicing factor and its regulation, e.g: {'PTBP1_up': ['345_38', '681_2',
    '781_4', '1090_16', '1291_12']}
    - number_exons: a str which is the concatenation of the sf_name, the
    regulation and the number of exons regulated by this SF according to the
    type of regulation, e.g: HNRNPC_down_2482.
    """
    exons_list = []
    for id_project in get_projects_links_to_a_splicing_factor(cnx, sf_name):
        exons_list += get_ase_events(cnx, id_project)

    # washed exons look like [regulation, gene_id, gene_id, position]:
    # the last two fields rebuild the exon ID
    reg_exon_list = [f"{exon[2]}_{exon[3]}"
                     for exon in washing_events(exons_list)
                     if exon[0] == regulation]
    name = f"{sf_name}_{regulation}"
    return {name: reg_exon_list}, f"{name}_{len(reg_exon_list)}"


# ##########GET ALL EXONS IN INTERACTION (for all ChIA-PET datasets)###########
def get_exons_in_interaction(cnx: sqlite3.Connection, id_project: str) -> dict:
    """
    Get, for a ChIA-PET dataset, the exons interacting with each exon.

    The pairs (exon1, exon2) of the table cin_exon_interaction are grouped
    by exon1.

    :param cnx: connexion to the ChIA-PET database
    :param id_project: the ID of the ENCODE or GEO project that will allow us
    to obtain the list of exons in interactions.
    :return: a dictionary, with for each exon (suffixed by '.' and the
    project ID) the list of exons with which it interacts, e.g:
    '9843_10.GSM1234': ['9843_2', '9843_3', '9843_4'], according to a
    ChIA-PET dataset.
    """
    cursor = cnx.cursor()
    query = """SELECT exon1, exon2
               FROM cin_exon_interaction
               WHERE id_project = ?"""
    cursor.execute(query, (id_project,))

    exons_interactions = {}
    for exon1, exon2 in cursor.fetchall():
        key = f"{exon1}.{id_project}"
        exons_interactions.setdefault(key, []).append(exon2)
    return exons_interactions


def get_every_exons_in_interaction() -> dict:
    """
    Get, for all the ChIA-PET datasets, the exons interacting with each exon.

    The interactions of every dataset listed in the table cin_projects are
    retrieved with get_exons_in_interaction() and merged by exon. A single
    database connection is opened, and closed when the function returns.

    :return: a dictionary with for each exon the complete list of exons with
    which it interacts, for all the datasets that we have, e.g:
    '4448_11': ['4448_20'], '4857_5': ['4878_1', '494_6'].
    """
    final_exons_interactions: Dict[str, List[str]] = {}
    with closing(sqlite3.connect(Config.db_file)) as cnx:
        id_chia_pets = get_info_from_database(cnx, """SELECT id_sample FROM
                                              cin_projects""")
        # dict.fromkeys: a dataset listed twice must only be merged once
        for id_chia_pet in dict.fromkeys(id_chia_pets):
            inter = get_exons_in_interaction(cnx, id_chia_pet)
            for key, partners in inter.items():
                exon = key.split(".")[0]  # drop the '.<dataset>' suffix
                final_exons_interactions.setdefault(exon, []).extend(partners)
    return final_exons_interactions


# ##############PRODUCE DATA FOR THE CONTROLs PART OF THE FIGURE###############
def find_controls_exons_in_interaction(final_exons_interactions: dict,
                                       list_exons_ctrl: list) -> int:
    """
    Count the control exons interacting with at least one other control exon.

    For each exon control, we look in the complete list of exons with which
    it interacts if at least one of the exons is also a exon control. An
    exon present several times in list_exons_ctrl is counted once.

    :param final_exons_interactions: a dictionary with for each exon the
    complete list of exons with which it interacts, for all the datasets that
    we have, e.g: '4448_11': ['4448_20'], '4857_5': ['4878_1', '494_6'], see
    get_every_exons_in_interaction().
    :param list_exons_ctrl: a list of exons control, e.g:
    ['15919_3', '7546_8', '4946_3', '3158_10'].
    :return: the number of exons control in interaction with at least one
    exon control.
    """
    ctrl_exons = set(list_exons_ctrl)
    return sum(
        1 for exon in ctrl_exons
        if any(partner in ctrl_exons
               for partner in final_exons_interactions.get(exon, ()))
    )


def random_exons_ctrl(sf_reg: dict, number_exons_ctrl: str,
                      final_exons_interactions: Optional[dict] = None,
                      nb_iterations: int = 3) -> float:
    """
    Get the mean percentage of control exons interacting together.

    - A list of exons control, of size identical to the number of exons
    regulated by the SF studied, is drawn without replacement among the exons
    of the table cin_exon that are not in sf_reg.
    - This exons control list is randomly generated nb_iterations times. For
    each time, we determine the percentage (rounded to 2 decimals) of exons
    control in interaction with at least one other exon control. Then we
    average the percentages obtained.

    :param sf_reg: a dictionary with a list of regulated exons depending on a
    splicing factor and its regulation, e.g: {'PTBP1_up': ['345_38', '681_2',
    '781_4', '1090_16', '1291_12']}, see get_every_events_4_a_sl().
    :param number_exons_ctrl: a str which is the concatenation of the sf_name,
    the regulation and the number of exons regulated by this SF according to
    the type of regulation, e.g: HNRNPC_down_2482, see
    get_every_events_4_a_sl().
    :param final_exons_interactions: the dictionary returned by
    get_every_exons_in_interaction(). It is computed once by this function
    when it is not given.
    :param nb_iterations: the number of exons control lists to generate.
    :return: the average of the percentages of exons control in
    interaction with at least one other exon control. 0.0 when the number of
    exons control requested is 0.
    :raises ValueError: if nb_iterations is lower than 1 or if there are
    fewer candidate control exons than the requested number.
    """
    if nb_iterations < 1:
        raise ValueError("nb_iterations must be at least 1")
    nb_ctrl = int(number_exons_ctrl.split("_")[-1])
    if nb_ctrl == 0:
        # No regulated exon: nothing to sample, the percentage is undefined
        return 0.0

    with closing(sqlite3.connect(Config.db_file)) as cnx:
        list_exons = get_info_from_database(cnx,
                                            """SELECT id FROM cin_exon""")
    regulated = {exon for exons in sf_reg.values() for exon in exons}
    candidates = [exon for exon in list_exons if exon not in regulated]

    # The interaction map does not depend on the sample: build it only once
    if final_exons_interactions is None:
        final_exons_interactions = get_every_exons_in_interaction()

    list_percentage_ctrl = []
    for _ in range(nb_iterations):
        # random.sample draws without replacement, so that the control list
        # really contains nb_ctrl different exons
        list_exons_ctrl = random.sample(candidates, nb_ctrl)
        nb_exons_ctrl_int = find_controls_exons_in_interaction(
            final_exons_interactions, list_exons_ctrl)
        list_percentage_ctrl.append(
            round((nb_exons_ctrl_int / nb_ctrl) * 100, 2))
    return sum(list_percentage_ctrl) / len(list_percentage_ctrl)


# ####KEPT INTERACTIONS FOR EXONS DOWN OR UP REGULATED BY THE SF STUDIED#####
def kept_interest_exons_sf(final_exons_interactions: dict, sf_reg: dict,
                           number_exons_sf: str) -> float:
    """
    Get the percentage of exons regulated by a SF that interact together.

    - For each exon of sf_reg, we look in the complete list of exons with
    which it interacts, if at least one of the exons is also in sf_reg.
    - Then, we determine the percentage of exons of sf_reg in interaction
    with at least one exon of sf_reg.

    :param final_exons_interactions: a dictionary with for each exon the
    complete list of exons with which it interacts, for all the datasets that
    we have, e.g: '4448_11': ['4448_20'], '4857_5': ['4878_1', '494_6'], see
    get_every_exons_in_interaction().
    :param sf_reg: a dictionary with a list of regulated exons depending on a
    splicing factor and its regulation, e.g: {'PTBP1_up': ['345_38', '681_2',
    '781_4', '1090_16', '1291_12']}, see get_every_events_4_a_sl().
    :param number_exons_sf: a str which is the concatenation of the sf_name,
    the regulation and the number of exons regulated by this SF according to
    the type of regulation, e.g: HNRNPC_down_2482,
    see get_every_events_4_a_sl().
    :return: the percentage, rounded to 2 decimals, of exons regulated by the
    SF of interest in interaction with at least one exon regulated by the
    same SF, in the same way. 0.0 when number_exons_sf ends with 0.
    """
    nb_exons_sf = int(number_exons_sf.split("_")[-1])
    if nb_exons_sf == 0:
        return 0.0

    regulated = {exon for exons in sf_reg.values() for exon in exons}

    # Interactions of the regulated exons only
    sf_final_inter = {}
    for key, partners in final_exons_interactions.items():
        exon = key.split(".")[0]
        if exon in regulated:
            sf_final_inter[exon] = partners

    nb_exons_sf_int = sum(
        1 for partners in sf_final_inter.values()
        if any(partner in regulated for partner in partners)
    )
    return round((nb_exons_sf_int / nb_exons_sf) * 100, 2)


def launch_figures_creation(sf_name: Optional[str] = None,
                            output_file: str = DEFAULT_FIGURE_FILE) -> None:
    """
    Main function to create figure, with:
    - the percentage of exons (down and up) regulated by the SF of interest in
    interaction with at least one other exon regulated by the same SF, in the
    same way.
    - the percentage of exons control in interaction with at least one other
    exon control. This percentage is calculated for the 3 exons control list,
    and then we average it. This manipulation is carried out twice, depending
    on the number of down and up regulated exons.

    :param sf_name: the splicing factor name. The module constant SF is used
    when it is not given.
    :param output_file: the file where the figure is saved.
    """
    if sf_name is None:
        sf_name = SF
    final_exons_interactions = get_every_exons_in_interaction()

    # Regulated exons, read with one connection that is always closed
    events = []
    with closing(sqlite3.connect(Config.db_file)) as cnx:
        for regulation in ("down", "up"):
            sf_reg, number_exons = get_every_events_4_a_sl(cnx, sf_name,
                                                           regulation)
            events.append((regulation, sf_reg, number_exons))

    labels = []
    percentages = []
    for regulation, sf_reg, number_exons in events:
        # SF
        labels.append(f"{sf_name}_{regulation}")
        percentages.append(kept_interest_exons_sf(final_exons_interactions,
                                                  sf_reg, number_exons))
        # CTRL
        labels.append(f"Control_{regulation}")
        percentages.append(random_exons_ctrl(sf_reg, number_exons,
                                             final_exons_interactions))

    # CREATE FIGURE
    df = pd.DataFrame({"A": labels, "B": percentages})
    data = sns.barplot(data=df, x="A", y="B", palette=["blue", "black",
                                                       "red", "grey"])
    fig = data.get_figure()
    fig.savefig(output_file)


if __name__ == "__main__":
    launch_figures_creation()