Commit aacaf00f authored by nfontrod

src/find_interaction_cluster/nt_and_community.py: add permutations tests

parent 10a59086
@@ -7,12 +7,11 @@ Description: The goal of this script is to see if the nucleotide \
 frequency is not random between communities
 """
 import pandas as pd
 import logging
 import sqlite3
 from .config import ConfigGraph, get_communities
-from typing import List
+from typing import List, Tuple, Dict
 from doctest import testmod
 from functools import reduce
 from pathlib import Path
@@ -22,6 +21,9 @@ from .community_finder import get_projects
 from ..logging_conf import logging_def
 from itertools import product
 import multiprocessing as mp
+import numpy as np
+from .radomization_test_ppi import get_pvalue
+from tqdm import tqdm


 def get_nt_frequency(cnx: sqlite3.Connection, list_ft: List[str],
@@ -82,20 +84,20 @@ def get_community_table(communities: List[List[str]],
     >>> c = [['1_1', '2_5'], ['7_9', '4_19', '3_3']]
     >>> get_community_table(c, 3, 'exon')
       community id_exon  community_size
-    0        C1     7_9               3
-    1        C1    4_19               3
-    2        C1     3_3               3
+    0        C2     7_9               3
+    1        C2    4_19               3
+    2        C2     3_3               3
     >>> c = [['1', '2'], ['7', '49', '3']]
     >>> get_community_table(c, 3, 'gene')
       community id_gene  community_size
-    0        C1       7               3
-    1        C1      49               3
-    2        C1       3               3
+    0        C2       7               3
+    1        C2      49               3
+    2        C2       3               3
     """
     dic = {"community": [], f"id_{feature}": [], "community_size": []}
     for k, comm in enumerate(communities):
         if len(comm) >= size_threshold:
-            name = f'C{k}'
+            name = f'C{k + 1}'
             clen = len(comm)
             for exon in comm:
                 dic["community"].append(name)
@@ -173,7 +175,7 @@ def lmm_maker_summary(df: pd.DataFrame, outfile: Path, nt: str
     res_df.rename({'index': 'community'}, inplace=True, axis=1)
     res_df['community'] = res_df['community'].str.replace('community', '')
     res_df.loc[res_df['community'] == "(Intercept)", "community"] = "C-CTRL"
-    mean_df = df[[nt, "community", "community_size"]].\
+    mean_df = df[[nt, "community", "community_size"]]. \
         groupby(["community", "community_size"]).mean().reset_index()
     return res_df.merge(mean_df, how="left", on="community")
@@ -193,21 +195,18 @@ def get_ft_id(cnx: sqlite3.Connection, feature: str = "exon") -> List[str]:
     return [str(cid[0]) for cid in res]


-def create_ctrl_community(df: pd.DataFrame, outfile: Path,
+def create_ctrl_community(df: pd.DataFrame,
                           feature: str = 'exon', region: str = ""
                           ) -> pd.DataFrame:
     """
     :param df: A dataframe containing the frequency of each feature in a \
     community.
-    :param outfile: The output table containing frequencies
     :param feature: The kind of feature to analyse
     :param region: only use if feature is 'gene'. Used to focus on \
     a given region in genes (can be gene, exon, intron).
     :return: A dataframe containing the frequency of every nucleotides \
     of every exon in a large community
     """
-    if outfile.is_file():
-        return pd.read_csv(outfile, sep="\t")
     size_threshold = 10 if feature == "gene" else 50
     cnx = sqlite3.connect(ConfigGraph.db_file)
     ft_id = get_ft_id(cnx, feature)
@@ -218,15 +217,165 @@ def create_ctrl_community(df: pd.DataFrame, outfile: Path,
     df_nt = df_nt.merge(df_com, how="left", on=f"id_{feature}")
     df_nt['community'] = ['C-CTRL'] * df_nt.shape[0]
     df = pd.concat([df, df_nt], axis=0, ignore_index=True)
-    df.to_csv(outfile, sep="\t", index=False)
     return df


+def lmm_with_ctrl(df: pd.DataFrame, feature: str, region: str,
+                  nt: str, outfile_diag: Path) -> pd.DataFrame:
+    """
+    :param df: A dataframe containing the frequency of each nucleotide \
+    in each exon belonging to a community.
+    :param feature: The kind of feature analysed
+    :param region: the region of interest to extract from gene
+    :param nt: The nucleotide of interest
+    :param outfile_diag: File from which the diagnostics folder will be \
+    inferred
+    :return: The dataframe with the p-value compared to the control \
+    list of exons.
+    """
+    ndf = create_ctrl_community(df, feature, region)
+    return lmm_maker_summary(ndf, outfile_diag, nt)
+
+
+def get_feature_by_community(df: pd.DataFrame, feature: str) -> Dict:
+    """
+    Create a dictionary containing the exons contained in each community.
+
+    :param df: A dataframe containing the frequency of each nucleotide \
+    in each exon belonging to a community.
+    :param feature: the feature of interest (exon, gene)
+    :return: A dictionary linking each community to the exons it contains
+
+    >>> dataf = pd.DataFrame({"id_gene": ['1', '2', '3', '4'],
+    ...                       'community': ['C1', 'C1', 'C2', 'C2']})
+    >>> get_feature_by_community(dataf, 'gene')
+    {'C1': ['1', '2'], 'C2': ['3', '4']}
+    >>> dataf.rename({"id_gene": "id_exon"}, axis=1, inplace=True)
+    >>> get_feature_by_community(dataf, 'exon')
+    {'C1': ['1', '2'], 'C2': ['3', '4']}
+    """
+    dic = {}
+    for i in range(df.shape[0]):
+        com, id_ft = df.iloc[i, :][['community', f'id_{feature}']]
+        if com in dic:
+            dic[com].append(id_ft)
+        else:
+            dic[com] = [id_ft]
+    return dic
+
+
+def get_permutation_mean(df_ctrl: pd.DataFrame,
+                         nt: str, size: int, iteration: int) -> List[float]:
+    """
+    Randomly sample `size` features from `df_ctrl` `iteration` times and \
+    return the mean `nt` frequency of each sample.
+
+    :param df_ctrl: A dataframe containing the frequency of each nucleotide \
+    in each exon/gene in fasterdb.
+    :param nt: A nucleotide of interest
+    :param size: The size of each sub-sample to create
+    :param iteration: The number of sub-samples to create
+    :return: The list of mean frequencies of `nt` in each sub-sample
+    """
+    return [
+        float(np.mean(df_ctrl[nt].sample(size, replace=True).values))
+        for _ in range(iteration)
+    ]
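Not part of this commit: a minimal usage sketch of get_permutation_mean on a made-up control table (the column names, sizes and seed below are assumptions chosen for illustration only):

    import numpy as np
    import pandas as pd

    np.random.seed(0)
    # Hypothetical control table: one row per exon with its A frequency (%).
    df_ctrl = pd.DataFrame({"id_exon": [f"{i}_1" for i in range(200)],
                            "A": np.random.uniform(15, 35, 200)})
    # 1000 means of random samples of 50 exons: the null distribution used
    # to judge a community of size 50.
    null_means = get_permutation_mean(df_ctrl, "A", size=50, iteration=1000)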
+
+
+def perm_community_pval(row: pd.Series, df_ctrl: pd.DataFrame,
+                        nt: str, iteration: int) -> Tuple[float, float, str]:
+    """
+    Compare the mean `nt` frequency of one community to `iteration` \
+    random sub-samples of the same size drawn from `df_ctrl`.
+
+    :param row: A line of a dataframe containing the frequency of \
+    each feature inside a community.
+    :param df_ctrl: A dataframe containing the frequency of each nucleotide \
+    in each exon/gene in fasterdb.
+    :param nt: A nucleotide of interest
+    :param iteration: The number of sub-samples to create
+    :return: The ctrl mean frequency value of `nt`, \
+    the pvalue and the regulation of the enrichment/impoverishment \
+    of the community in `row` compared to control exons.
+    """
+    list_values = get_permutation_mean(df_ctrl, nt, row["community_size"],
+                                       iteration)
+    pval, reg = get_pvalue(np.array(list_values), row[nt], iteration)
+    return float(np.mean(list_values)), pval, reg
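Not part of this commit: get_pvalue comes from the radomization_test_ppi module and its body is not shown in this diff; the sketch below is only a guess at the kind of empirical, direction-aware p-value such a helper returns (the function name and the " + "/" - " labels are assumptions):

    from typing import Tuple
    import numpy as np

    def empirical_pvalue(null_values: np.ndarray, observed: float,
                         iteration: int) -> Tuple[float, str]:
        # Count how many permutation means are at least as extreme as the
        # observed community mean, on each side of the null distribution.
        greater = int(np.sum(null_values >= observed))
        lower = int(np.sum(null_values <= observed))
        if greater < lower:
            # The observed mean sits in the upper tail: enrichment.
            return greater / iteration, " + "
        # Otherwise the observed mean sits in the lower tail: impoverishment.
        return lower / iteration, " - "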
+
+
+def perm_pvalues(df: pd.DataFrame, df_ctrl: pd.DataFrame, feature: str,
+                 nt: str, iteration: int, dic_com: Dict) -> pd.DataFrame:
+    """
+    Compute a permutation p-value for every community in `df` and adjust \
+    the p-values for multiple testing.
+
+    :param df: A dataframe containing the frequency of each nucleotide \
+    in each exon belonging to a community.
+    :param df_ctrl: A dataframe containing the frequency of each nucleotide \
+    in each exon/gene in fasterdb.
+    :param feature: The feature of interest (gene, exon)
+    :param nt: A nucleotide of interest
+    :param iteration: The number of sub-samples to create
+    :param dic_com: A dictionary linking each community to the exons \
+    it contains.
+    :return: The dataframe with the adjusted p-values and the regulation \
+    indicating the enrichment or impoverishment of each community \
+    compared to control features.
+    """
+    list_pval, list_reg, mean_ctrl = ([] for _ in range(3))
+    for i in tqdm(range(df.shape[0]), desc="performing permutations"):
+        row = df.iloc[i, :]
+        res = perm_community_pval(row,
+                                  df_ctrl.loc[
+                                      ~df_ctrl[f'id_{feature}'].isin(
+                                          dic_com[row['community']]), :],
+                                  nt, iteration)
+        [x.append(y) for x, y in zip([mean_ctrl, list_pval, list_reg], res)]
+    adj_pvals = multipletests(list_pval, alpha=0.05,
+                              method='fdr_bh',
+                              is_sorted=False,
+                              returnsorted=False)[1]
+    adj_regs = [list_reg[i] if adj_pvals[i] <= 0.05 else " . "
+                for i in range(len(list_reg))]
+    df[f'mean_{iteration}_ctrl'] = mean_ctrl
+    df['p-adj'] = adj_pvals
+    df['reg-adj'] = adj_regs
+    return df
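multipletests is the Benjamini-Hochberg correction from statsmodels.stats.multitest (its import sits outside the hunks shown here); element [1] of the tuple it returns holds the adjusted p-values, as in this small example:

    from statsmodels.stats.multitest import multipletests

    raw_pvals = [0.001, 0.02, 0.04, 0.30]
    adj = multipletests(raw_pvals, alpha=0.05, method='fdr_bh')[1]
    # Each adjusted value is >= its raw p-value; a community is then flagged
    # as regulated only when the adjusted value stays <= 0.05.
    print(adj)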
+
+
+def perm_with_ctrl(df: pd.DataFrame, feature: str,
+                   nt: str, df_ctrl: pd.DataFrame, dic_com: Dict,
+                   iteration: int) -> pd.DataFrame:
+    """
+    :param df: A dataframe containing the frequency of each nucleotide \
+    in each exon belonging to a community.
+    :param feature: The kind of feature analysed
+    :param nt: The nucleotide of interest
+    :param df_ctrl: A dataframe containing the frequency of each nucleotide \
+    in each exon/gene in fasterdb.
+    :param dic_com: A dictionary linking each community to the exons \
+    it contains.
+    :param iteration: The number of sub-samples to create
+    :return: The dataframe with the p-value compared to the control \
+    list of exons.
+    """
+    mean_df = df[[nt, "community", "community_size"]]. \
+        groupby(["community", "community_size"]).mean().reset_index()
+    return perm_pvalues(mean_df, df_ctrl, feature, nt, iteration, dic_com)
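Not part of this commit: a toy run of the whole permutation path (community labels, sizes and frequencies are invented; running it requires the helpers above plus the project's get_pvalue and the multipletests import):

    import numpy as np
    import pandas as pd

    np.random.seed(1)
    # Per-exon A frequencies annotated with an invented community label.
    df = pd.DataFrame({"id_exon": [f"{i}_1" for i in range(60)],
                       "A": np.r_[np.random.uniform(30, 40, 30),
                                  np.random.uniform(15, 25, 30)],
                       "community": ["C1"] * 30 + ["C2"] * 30,
                       "community_size": [30] * 60})
    # Control table standing in for every exon of the database.
    df_ctrl = pd.DataFrame({"id_exon": [f"{i}_1" for i in range(2000)],
                            "A": np.random.uniform(15, 35, 2000)})
    dic_com = get_feature_by_community(df, "exon")
    res = perm_with_ctrl(df, "exon", "A", df_ctrl, dic_com, iteration=1000)
    # res has one row per community with mean_1000_ctrl, p-adj and reg-adj.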
+
+
 def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
                             global_weight: int, same_gene: bool,
-                            nt: str, feature: str = "exon",
-                            region: str = "",
-                            ) -> pd.Series:
+                            nt: str, df_ctrl: pd.DataFrame,
+                            dic_com: Dict,
+                            feature: str = "exon",
+                            region: str = "", test_type: str = "",
+                            iteration: int = 1000) -> pd.Series:
     """
     Get data (proportion of `reg` regulated exons by a splicing factor
     `sf_name` within a community) for every community.
@@ -242,17 +391,22 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
     :param same_gene: Say if we consider as co-localised, exons within the \
     same gene (True) or not (False) (default False)
     :param nt: The nucleotide of interest
-    :param feature: The king of feature analysed
+    :param df_ctrl: A dataframe containing the frequency of each nucleotide \
+    in each exon/gene in fasterdb.
+    :param dic_com: A dictionary linking each community to the exons \
+    it contains.
+    :param feature: The kind of feature analysed
     :param region: the region of interest to extract from gene
+    :param test_type: The type of test to make (permutation or lmm)
+    :param iteration: The number of sub-samples to create
     """
-    logging.debug(f"lmm for {project}, w:{weight}, "
+    logging.debug(f"{test_type} for {project}, w:{weight}, "
                   f"g:{global_weight} nt: {nt}")
     res = {"project": [], "nt": [], 'pval': []}
     outfile = ConfigGraph.get_community_file(project, weight,
                                              global_weight,
                                              same_gene, feature,
-                                             f"{nt}_stat.txt",
+                                             f"{nt}_stat_{test_type}.txt",
                                              "sf_community_enrichment")
     nfolder = outfile.parent / "nt_analysis"
     nfolder.mkdir(exist_ok=True, parents=True)
@@ -261,23 +415,25 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
     res['project'] = project
     res['nt'] = nt
     res['pval'] = pval
-    nt_ctrl_table = noutfile.parent / noutfile.name.replace("_stat.txt",
-                                                            "_ctrl.txt")
-    print(df.head())
-    ndf = create_ctrl_community(df, nt_ctrl_table, feature, region)
-    sum_df = lmm_maker_summary(ndf, outfile, nt)
     outfile_ctrl = ConfigGraph.get_community_file(project, weight,
                                                   global_weight,
                                                   same_gene, feature,
-                                                  f"{nt}_VS_CTRL_stat.txt",
+                                                  f"{nt}_VS_CTRL_stat_"
+                                                  f"{test_type}.txt",
                                                   "sf_community_enrichment")
+    if test_type == "lmm":
+        rdf = lmm_with_ctrl(df, feature, region, nt, outfile)
+    else:
+        rdf = perm_with_ctrl(df, feature, nt, df_ctrl, dic_com, iteration)
     noutfile_ctrl = nfolder / outfile_ctrl.name
-    sum_df.to_csv(noutfile_ctrl, sep="\t", index=False)
+    rdf.to_csv(noutfile_ctrl, sep="\t", index=False)
     return pd.Series(res)


 def create_dataframe(project: str, weight: int, global_weight: int,
-                     same_gene: bool, feature: str = 'exon') -> pd.DataFrame:
+                     same_gene: bool, feature: str = 'exon',
+                     region: str = "", from_communities: bool = True
+                     ) -> pd.DataFrame:
     """
     :param project: The name of the project of interest
     :param weight: The minimum weight of interaction to consider
@@ -288,29 +444,65 @@ def create_dataframe(project: str, weight: int, global_weight: int,
     :param same_gene: Say if we consider as co-localised, exons within the \
     same gene (True) or not (False)
     :param feature: The kind of feature to analyse
+    :param from_communities: True to only keep genes/exons that belong to \
+    a community, False to recover every gene/exon in the database
+    :param region: the region of interest to extract from gene
     :return: A dataframe containing the frequency of every nucleotides \
     of every exon in a large community
     """
     size_threshold = 10 if feature == "gene" else 50
-    result = ConfigGraph.get_community_file(project, weight, global_weight,
-                                            same_gene, feature,
-                                            "_communities.txt")
-    communities = get_communities(result, 0)
-    ncommunities = [c for c in communities if len(c) >= size_threshold]
     cnx = sqlite3.connect(ConfigGraph.db_file)
-    list_exon = reduce(lambda x, y: x + y, ncommunities)
-    df = get_nt_frequency(cnx, list_exon, feature)
-    df_com = get_community_table(communities, size_threshold, feature)
-    df = df.merge(df_com, how="left", on=f"id_{feature}")
+    if from_communities:
+        result = ConfigGraph.get_community_file(project, weight, global_weight,
+                                                same_gene, feature,
+                                                "_communities.txt")
+        communities = get_communities(result, 0)
+        ncommunities = [c for c in communities if len(c) >= size_threshold]
+        list_exon = reduce(lambda x, y: x + y, ncommunities)
+        df_com = get_community_table(communities, size_threshold, feature)
+        df = get_nt_frequency(cnx, list_exon, feature, region)
+        df = df.merge(df_com, how="left", on=f"id_{feature}")
+    else:
+        list_exon = get_ft_id(cnx, feature)
+        df = get_nt_frequency(cnx, list_exon, feature, region)
     return df
+
+
+def create_dataframes(project, weight, global_weight, same_gene, feature,
+                      region, test_type
+                      ) -> Tuple[pd.DataFrame, pd.DataFrame, Dict]:
+    """
+    :param weight: The weight of interaction to consider
+    :param global_weight: The global weight to consider. If \
+    the global weight is equal to 0 then the project `project` is \
+    used.
+    :param project: The project name, used only if global_weight = 0
+    :param same_gene: Say if we consider as co-localised exons within the \
+    same gene
+    :param feature: The kind of analysed feature
+    :param region: the region of interest to extract from gene
+    :param test_type: The type of test to make (permutation or lmm)
+    """
+    df = create_dataframe(project, weight, global_weight, same_gene,
+                          feature, region)
+    if test_type == 'lmm':
+        df_ctrl = pd.DataFrame()
+        dic_com = {}
+    else:
+        df_ctrl = create_dataframe(project, weight, global_weight, same_gene,
+                                   feature, region, from_communities=False)
+        dic_com = get_feature_by_community(df, feature)
+    return df, df_ctrl, dic_com
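Not part of this commit: a sketch of how the launcher below consumes this helper; the parameter values are placeholders, the real ones come from the pipeline configuration:

    df, df_ctrl, dic_com = create_dataframes(project="example_project",
                                             weight=3, global_weight=0,
                                             same_gene=True, feature="exon",
                                             region="",
                                             test_type="permutation")
    # With test_type="lmm" the control table and the community dictionary
    # stay empty, since the mixed model does not need them.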
+
+
 def multiple_nt_lmm_launcher(ps: int,
                              weight: int,
                              global_weight: int,
                              project: str,
                              same_gene: bool,
                              feature: str = 'exon', region: str = '',
+                             test_type: str = "lmm",
+                             iteration: int = 1000,
                              logging_level: str = "DISABLE"):
     """
     Launch the statistical analysis for every
@@ -325,6 +517,8 @@ def multiple_nt_lmm_launcher(ps: int,
     same gene
     :param feature: The kind of analysed feature
     :param region: the region of interest to extract from gene
+    :param test_type: The type of test to make (permutation or lmm)
+    :param iteration: The number of iterations to perform
     :param logging_level: Level of information to display
     """
     ConfigGraph.community_folder.mkdir(exist_ok=True, parents=True)
@@ -332,22 +526,21 @@
     logging.info("Checking if communities as an effect on nucleotide "
                  "frequency")
     project = get_projects(global_weight, project)
-    nt_list = ["A", "C", "G", "T", "S", "W"]
+    nt_list = ["A"]  # , "C", "G", "T", "S", "W"]
     condition = list(product([project], [weight], nt_list))
     processes = []
     pool = mp.Pool(processes=min(ps, len(condition)))
-    logging.debug("Calculating stats...")
+    logging.debug("Creating tables")
+    df, df_ctrl, dic_com = create_dataframes(project, weight, global_weight,
+                                             same_gene, feature, region,
+                                             test_type)
     for project, weight, nt in condition:
-        df = create_dataframe(project, weight, global_weight, same_gene,
-                              feature)
         nfile_table = ConfigGraph.get_community_file(
             project, weight, global_weight, same_gene, feature,
             f"_nt_table.txt", "sf_community_enrichment")
         df.to_csv(nfile_table, sep="\t", index=False)
-        args = [df, project, weight, global_weight, same_gene, nt, feature,
-                region]
+        args = [df, project, weight, global_weight, same_gene, nt, df_ctrl,
+                dic_com, feature, region, test_type, iteration]
         processes.append(pool.apply_async(get_stat_nt_communities, args))
     results = [p.get(timeout=None) for p in processes]
     pool.close()