diff --git a/src/find_interaction_cluster/sf_and_communities.py b/src/find_interaction_cluster/sf_and_communities.py
new file mode 100644
index 0000000000000000000000000000000000000000..b14d6995ef2d55fc1e630c267256dbf727ca20d8
--- /dev/null
+++ b/src/find_interaction_cluster/sf_and_communities.py
@@ -0,0 +1,259 @@
+#!/usr/bin/env python3
+
+# -*- coding: UTF-8 -*-
+
+"""
+Description: The goal of this script is to check whether communities \
+of exons are often regulated by a splicing factor
+"""
+
+from typing import List, Tuple, Dict
+import sqlite3
+from .config import ConfigGraph
+from ..figures_utils.exons_interactions import get_every_events_4_a_sl
+from .community_finder import get_communities
+import pandas as pd
+import numpy as np
+from itertools import product
+from .community_finder import get_projects_name
+import multiprocessing as mp
+import logging
+from ..logging_conf import logging_def
+import doctest
+from functools import reduce
+from operator import add
+import statsmodels.formula.api as api
+from statsmodels.stats.multitest import multipletests
+
+
+def get_exon_regulated_in_communities(community: List[str],
+                                      regulated_exons: List[str]
+                                      ) -> Tuple[int, int, float]:
+    """
+    Compute how many exons of a community are regulated by a splicing factor.
+
+    :param community: Strongly co-localized group of exons belonging to \
+    the same community.
+    :param regulated_exons: List of exons regulated by a splicing factor
+    :return: The number of exons regulated by the splicing factor inside \
+    the community, the total number of exons within the community and \
+    the percentage of regulated exons within the community
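+
+    Illustrative example (the exon identifiers below are arbitrary and
+    only serve to show how membership is counted):
+
+    >>> get_exon_regulated_in_communities(["1_1", "2_5", "3_2", "4_7"],
+    ...                                   ["1_1", "9_3"])
+    (1, 4, 25.0)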
+    """
+    reg_community = len([e for e in community if e in regulated_exons])
+    tot_community = len(community)
+    prop = reg_community / tot_community * 100
+    return reg_community, tot_community, prop
+
+
+def get_ctrl_exon(cnx: sqlite3.Connection) -> List[str]:
+    """
+    Get the list of fasterDB exons.
+
+    :param cnx: Connection to ChIA-PET database
+    :return: The list of fasterDB exons
+    """
+    cursor = cnx.cursor()
+    query = 'SELECT id FROM cin_exon'
+    cursor.execute(query)
+    res = np.asarray(cursor.fetchall()).flatten()
+    cursor.close()
+    return list(res)
+
+
+def get_stat_community_exons(cnx: sqlite3.Connection, dic: Dict[str, List],
+                             communities: List[List[str]],
+                             regulated_exons: List[str]):
+    """
+    Get the data for the exons belonging to a community and update \
+    `dic` with it.
+
+    :param cnx: Connection to ChIA-PET database
+    :param dic: Dictionary containing data about each community and its \
+    content of exons regulated by a splicing factor.
+    :param communities: Strongly co-localized groups of exons, each group \
+    forming a community.
+    :param regulated_exons: List of exons regulated by a splicing factor
+    :return: The updated dictionary, the number of exons regulated by the \
+    splicing factor that belong to a large community and the total number \
+    of exons belonging to a large community.
+    """
+    full_com = reduce(add, communities)
+    ctrl_exons = get_ctrl_exon(cnx)
+    n_reg, n_tot = np.nan, np.nan
+    for cur_com, name in [[full_com, 'All-community'],
+                          [ctrl_exons, "FASTERDB"]]:
+        reg_community, tot_community, prop = \
+            get_exon_regulated_in_communities(cur_com, regulated_exons)
+        if name == 'All-community':
+            n_reg = reg_community
+            n_tot = tot_community
+        dic["community"].append(name)
+        dic['reg_in_community'].append(reg_community)
+        dic['community_size'].append(tot_community)
+        dic['%reg in community'].append(prop)
+        dic['pval'].append(np.nan)
+    return dic, n_reg, n_tot
+
+
+def get_vector(n: int, t: int) -> List[int]:
+    """
+    Get a vector containing n ones and t - n zeros.
+
+    :param n: The number of ones wanted
+    :param t: The size of the array
+    :return: The vector of n ones and t - n zeros
+
+    >>> get_vector(2, 5)
+    [1, 1, 0, 0, 0]
+    """
+    return [1] * n + [0] * (t - n)
+
+
+def logistic_regression(n1: int, t1: int, n2: int, t2: int) -> float:
+    """
+    Get the p-value of the logistic regression performed on the \
+    test vector defined by n1 ones and t1 - n1 zeros and the control \
+    vector defined by n2 ones and t2 - n2 zeros.
+
+    :param n1: Number of ones in the test vector
+    :param t1: Size of the test vector
+    :param n2: Number of ones in the control vector
+    :param t2: Size of the control vector
+    :return: The p-value of the logistic regression
+    """
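+    # When either vector is constant (only zeros or only ones), the logit
+    # fit would suffer from complete separation and could not be estimated
+    # reliably, so a non-significant p-value of 1 is returned directly.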
+    if n1 == 0 or n2 == 0 or n1 == t1 or n2 == t2:
+        return 1
+    df = pd.DataFrame({'y': get_vector(n1, t1) + get_vector(n2, t2),
+                       'x': ['test'] * t1 + ['ctrl'] * t2})
+    return api.logit('y ~ x', data=df).fit(disp=False).pvalues['x[T.test]']
+
+
+def get_stat4communities(sf_name: str, reg: str,
+                         project: str, weight: int,
+                         global_weight: int, same_gene: bool) -> pd.DataFrame:
+    """
+    Get, for every community, the proportion of exons regulated (`reg`)
+    by the splicing factor `sf_name`.
+
+    :param sf_name: The splicing factor of interest
+    :param reg: The regulation of interest
+    :param project: The name of the project of interest
+    :param weight: The minimum weight of interaction to consider
+    :param global_weight: The global weight to consider. If the global \
+    weight is equal to 0 then the analysis is done by project; otherwise \
+    all projects are merged together and the interactions seen in \
+    `global_weight` projects are taken into account
+    :param same_gene: True if exons within the same gene are considered \
+    as co-localised, False otherwise
+    :return: A DataFrame with, for each community, the number and \
+    proportion of regulated exons and the associated p-values
+    """
+    logging.debug(f"Working on {sf_name}-{reg}, for {project}, w:{weight}, "
+                  f"g:{global_weight}")
+    cnx = sqlite3.connect(ConfigGraph.db_file)
+    result = ConfigGraph.get_community_file(project, weight, global_weight,
+                                            same_gene, "_communities.txt")
+    communities = get_communities(result, 49)
+    regulated_dic, number = get_every_events_4_a_sl(cnx, sf_name, reg)
+    reg_exons = regulated_dic[sf_name + "_" + reg]
+    d = {"community": [], "reg_in_community": [], "community_size": [],
+         "%reg in community": [], 'pval': []}
+    d, n2, t2 = get_stat_community_exons(cnx, d, communities, reg_exons)
+    for k, community in enumerate(communities):
+        if len(community) > 49:
+            reg_community, tot_community, prop = \
+                get_exon_regulated_in_communities(community, reg_exons)
+            d["community"].append(f"C{k}")
+            d['reg_in_community'].append(reg_community)
+            d['community_size'].append(tot_community)
+            d['%reg in community'].append(prop)
+            d['pval'].append(logistic_regression(reg_community, tot_community,
+                                                 n2, t2))
+    d['padj'] = [np.nan, np.nan] + list(multipletests(d['pval'][2:],
+                                                      method='fdr_bh')[1])
+    d['sf'] = [sf_name] * (len(communities) + 2)
+    d['reg'] = [reg] * (len(communities) + 2)
+    d['project'] = [project] * (len(communities) + 2)
+    return pd.DataFrame(d)
+
+
+def get_sfname() -> List[str]:
+    """
+    Recover the name of every splicing factor of interest.
+
+    :return: The list of splicing factor names
+
+    >>> len(get_sfname()) == 42
+    True
+    """
+    logging.debug('recovering sf factors ...')
+    return ["PCBP2", "HNRNPA1", "HNRNPU", "QKI", "PTBP1", "TRA2A_B", "KHSRP",
+            "MBNL1", "HNRNPL", "HNRNPK", "SRSF7", "HNRNPA2B1", "SFPQ", "RBM15",
+            "HNRNPM", "FUS", "DAZAP1", "RBM39", "SRSF9", "RBM25", "RBM22",
+            "HNRNPF", "SRSF5", "PCBP1", "RBFOX2", "HNRNPH1", "RBMX", "SRSF6",
+            "MBNL2", "SRSF1", "SRSF2", "HNRNPC", "SRSF3", "U2AF1", "SF3B1",
+            "SNRNP70", "SNRPC", "DDX5_DDX17", "SF1", "SF3A3", "SF3B4", "U2AF2"]
+
+
+def get_key(project: str, weight: int) -> str:
+    """
+    Get the combination of the project and weight parameters.
+
+    :param project: A project name
+    :param weight: A weight
+    :return: The string combination of the two
+    """
+    return f"{project}_{weight}"
+
+
+def multiple_stat_launcher(ps: int, weights: List[int],
+                           global_weights: List[int],
+                           same_gene: bool, logging_level: str = "DISABLE"):
+    """
+    Launch the statistical analysis for every project, weight, splicing
+    factor and regulation.
+
+    :param ps: The number of processes we want to use
+    :param weights: The list of interaction weights to consider
+    :param global_weights: The list of global weights to consider. If a \
+    global weight is equal to 0 then the analysis is done by project; \
+    otherwise all projects are merged together and the interactions seen \
+    in `global_weight` projects are taken into account
+    :param same_gene: True if exons within the same gene are considered \
+    as co-localised, False otherwise
+    :param logging_level: Level of information to display
+    """
+    ConfigGraph.community_folder.mkdir(exist_ok=True, parents=True)
+    logging_def(ConfigGraph.community_folder, __file__, logging_level)
+    logging.info("Checking whether communities often contain exons "
+                 "regulated by a splicing factor")
+    sf_list = get_sfname()
+    global_weights = list(np.unique(global_weights))
+    weights = list(np.unique(weights))
+    projects, dic_project = get_projects_name(global_weights)
+    condition = list(product(projects, weights, sf_list, ['down', 'up']))
+    processes = {}
+    pool = mp.Pool(processes=min(ps, len(condition)))
+    logging.debug("Calculating stats...")
+    for project, weight, sf_name, reg in condition:
+        ckey = get_key(project, weight)
+        global_weight = dic_project[project]
+        args = [sf_name, reg, project, weight, global_weight, same_gene]
+        if ckey not in processes.keys():
+            processes[ckey] = [pool.apply_async(get_stat4communities, args)]
+        else:
+            processes[ckey].append(pool.apply_async(get_stat4communities,
+                                                    args))
+    for p in processes:
+        project, weight = p.split("_")
+        list_df = []
+        for proc in processes[p]:
+            list_df.append(proc.get(timeout=None))
+        pool.close()
+        pool.join()
+        df = pd.concat(list_df, axis=0, ignore_index=True)
+        outfile = ConfigGraph.get_community_file(project, weight,
+                                                 dic_project[project],
+                                                 same_gene,
+                                                 "_stat.txt", True)
+        df.to_csv(outfile, sep="\t", index=False)
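+
+
+# Minimal usage sketch (illustrative only): the process count, weights and
+# global weights below are arbitrary examples, and the ChIA-PET database and
+# community files referenced by ConfigGraph must already exist.
+# multiple_stat_launcher(ps=4, weights=[1, 2], global_weights=[0],
+#                        same_gene=True, logging_level="DEBUG")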
+
+
+if __name__ == "__main__":
+    doctest.testmod()
\ No newline at end of file