Skip to content
Snippets Groups Projects
Commit 93caebe1 authored by nfontrod's avatar nfontrod
Browse files

src/find_interaction_cluster/radomization_test_ppi.py: add 3 functions to...

src/find_interaction_cluster/radomization_test_ppi.py: add 3 functions to check whether the number of communities enriched in common genes between the DNA and protein levels is itself enriched
parent dffdc190
No related branches found
No related tags found
No related merge requests found
...@@ -90,8 +90,8 @@ def get_common_genes(dna_gene: np.array, ppi_gene: np.array) -> int: ...@@ -90,8 +90,8 @@ def get_common_genes(dna_gene: np.array, ppi_gene: np.array) -> int:
def get_list_common_gene(community: str, dic_dna_gene: Dict[str, np.array], def get_list_common_gene(community: str, dic_dna_gene: Dict[str, np.array],
ppi_size: int, ppi_gene: np.array, iteration: int ppi_size: int, ppi_gene: np.array, iteration: int,
) -> np.array: use_seed: bool = True) -> np.array:
""" """
Return the number of common gene between genes in the community \ Return the number of common gene between genes in the community \
`community`and a sublist of randomly chosen genes in `ppi_gene` of \ `community`and a sublist of randomly chosen genes in `ppi_gene` of \
...@@ -103,6 +103,7 @@ def get_list_common_gene(community: str, dic_dna_gene: Dict[str, np.array], ...@@ -103,6 +103,7 @@ def get_list_common_gene(community: str, dic_dna_gene: Dict[str, np.array],
:param ppi_size: The size of the ppi_gene subsample :param ppi_size: The size of the ppi_gene subsample
:param ppi_gene: All the gene interacting at protein level :param ppi_gene: All the gene interacting at protein level
:param iteration: The number of time we want to repeat the process :param iteration: The number of time we want to repeat the process
:param use_seed: True to use a fixed seed, false else.
:return: The number of gene in common between the genes in community \ :return: The number of gene in common between the genes in community \
`community` and the subsample of ppi_size `community` and the subsample of ppi_size
...@@ -119,7 +120,8 @@ def get_list_common_gene(community: str, dic_dna_gene: Dict[str, np.array], ...@@ -119,7 +120,8 @@ def get_list_common_gene(community: str, dic_dna_gene: Dict[str, np.array],
>>> get_list_common_gene("C1", dna_gene, 10, ppi_g, 5) >>> get_list_common_gene("C1", dna_gene, 10, ppi_g, 5)
array([5, 5, 5, 5, 5]) array([5, 5, 5, 5, 5])
""" """
np.random.seed(1) if use_seed:
np.random.seed(1)
common = [ common = [
get_common_genes( get_common_genes(
dic_dna_gene[community], dic_dna_gene[community],
...@@ -180,11 +182,24 @@ def get_pvalue(values: np.array, target: float, iteration: int ...@@ -180,11 +182,24 @@ def get_pvalue(values: np.array, target: float, iteration: int
return pval, regulation return pval, regulation
def adjust_regulation(x: pd.Series) -> str:
    """
    Give the regulation label of a row after multiple-testing correction.

    :param x: A row of the dataframe; it must expose the keys
    "regulation" and "p-adjust"
    :return: The value stored under "regulation" when "p-adjust" is at
    most 0.05, the neutral label " . " otherwise

    >>> adjust_regulation({"regulation": " + ", "p-adjust": 0.01})
    ' + '
    >>> adjust_regulation({"regulation": " + ", "p-adjust": 0.2})
    ' . '
    """
    # The comparison is kept as "<=" so that a NaN adjusted p-value
    # falls through to the neutral label.
    if x["p-adjust"] <= 0.05:
        return x["regulation"]
    return " . "
def update_overlap_df(df_overlap: pd.DataFrame, def update_overlap_df(df_overlap: pd.DataFrame,
dic_dna_gene: Dict[str, np.array], dic_dna_gene: Dict[str, np.array],
ppi_gene: np.array, iteration: int ppi_gene: np.array, iteration: int,
) -> pd.DataFrame: use_seed: bool = True,
) -> Tuple[pd.DataFrame, np.array]:
""" """
Perform a randomization test to check if communities at \
DNA level are more similar at protein level than randomly expected.
:param df_overlap: A dataframe containing every gene community at dna \ :param df_overlap: A dataframe containing every gene community at dna \
level and the number of common gene with a gne community at protein \ level and the number of common gene with a gne community at protein \
...@@ -193,36 +208,48 @@ def update_overlap_df(df_overlap: pd.DataFrame, ...@@ -193,36 +208,48 @@ def update_overlap_df(df_overlap: pd.DataFrame,
level level
:param ppi_gene: All the gene interacting at protein level :param ppi_gene: All the gene interacting at protein level
:param iteration: The number of time we want to repeat the process :param iteration: The number of time we want to repeat the process
:return: The updated Df (column p-value, p-adjust and regulation :param use_seed: True to use a fixed seed, false else.
:return: The updated Df (column p-value, p-adjust and regulation) + \
the list of common gene shared between gene community at DNA and \
protein level obtained by randmization analysis
>>> do = pd.DataFrame({"DNA_community": ["C1", "C2", "C3"], >>> do = pd.DataFrame({"DNA_community": ["C1", "C2", "C3"],
... "community_size": [2, 3, 2], "nb_com-ppi": [2, 3, 1], ... "community_size": [2, 3, 2], "nb_com-ppi": [2, 3, 1],
... "size_com-ppi": [10, 10, 10]}) ... "size_com-ppi": [10, 10, 10]})
>>> dna_gene = {"C1": [1, 2], "C2": [3, 4, 5], "C3": [6, 7]} >>> dna_gene = {"C1": [1, 2], "C2": [3, 4, 5], "C3": [6, 7]}
>>> ppi_g = list(range(11)) >>> ppi_g = list(range(11))
>>> update_overlap_df(do, dna_gene, ppi_g, 100).iloc[:, >>> d, di = update_overlap_df(do, dna_gene, ppi_g, 100)
... [0, 1, 2, 4, 5]] >>> d.iloc[:, [0, 1, 2, 4, 5]]
DNA_community community_size nb_com-ppi p-value p-adjust DNA_community community_size nb_com-ppi p-value p-adjust
0 C1 2 2 0.79 0.79 0 C1 2 2 0.79 0.79
1 C2 3 3 0.72 0.79 1 C2 3 3 0.72 0.79
2 C3 2 1 0.18 0.54 2 C3 2 1 0.18 0.54
>>> d = update_overlap_df(do, dna_gene, list(range(1000)), 100) >>> d.iloc[:, 6].to_list() == d.iloc[:, 7].to_list() == [" . "] * 3
True
>>> d, di = update_overlap_df(do, dna_gene, list(range(1000)), 100)
>>> d.iloc[:, [0, 1, 2, 4, 5]] >>> d.iloc[:, [0, 1, 2, 4, 5]]
DNA_community community_size nb_com-ppi p-value p-adjust DNA_community community_size nb_com-ppi p-value p-adjust
0 C1 2 2 0.01 0.01 0 C1 2 2 0.01 0.01
1 C2 3 3 0.00 0.00 1 C2 3 3 0.00 0.00
2 C3 2 1 0.00 0.00 2 C3 2 1 0.00 0.00
>>> d["regulation"].to_list() == [" + "] * 3 >>> d.iloc[:, 6].to_list() == d.iloc[:, 7].to_list() == [" + "] * 3
True
>>> len(di.keys()) == 3
True
>>> len(di[list(di.keys())[0]]) == 100
True True
""" """
pval_list = [] pval_list = []
reg_list = [] reg_list = []
dic = {}
for i in tqdm(range(df_overlap.shape[0])): for i in tqdm(range(df_overlap.shape[0])):
values = get_list_common_gene(df_overlap.iloc[i, :]["DNA_community"], community = df_overlap.iloc[i, :]["DNA_community"]
values = get_list_common_gene(community,
dic_dna_gene, dic_dna_gene,
df_overlap.iloc[i, :]["size_com-ppi"], df_overlap.iloc[i, :]["size_com-ppi"],
ppi_gene, ppi_gene,
iteration) iteration, use_seed)
dic[community] = values
pval, reg = get_pvalue(values, df_overlap.iloc[i, :]["nb_com-ppi"], pval, reg = get_pvalue(values, df_overlap.iloc[i, :]["nb_com-ppi"],
iteration) iteration)
pval_list.append(pval) pval_list.append(pval)
...@@ -233,7 +260,187 @@ def update_overlap_df(df_overlap: pd.DataFrame, ...@@ -233,7 +260,187 @@ def update_overlap_df(df_overlap: pd.DataFrame,
is_sorted=False, is_sorted=False,
returnsorted=False)[1] returnsorted=False)[1]
df_overlap["regulation"] = reg_list df_overlap["regulation"] = reg_list
return df_overlap df_overlap["regulation-adjusted"] = \
df_overlap.apply(adjust_regulation, axis=1)
return df_overlap, dic
def summarize_df_overlap(df_overlap: pd.DataFrame) -> pd.DataFrame:
    """
    Summarise `df_overlap` per adjusted regulation label.

    One all-NaN placeholder row is appended for each of the labels
    " + ", " - " and " . " so that the three labels always appear in the
    result, even when no community carries them. The placeholders do not
    contribute to the means nor to the count because they only hold NaN.

    :param df_overlap: The dataframe containing community at DNA and
    protein level and their enrichment. The columns "regulation-adjusted",
    "community_size", "nb_com-ppi", "size_com-ppi" and "p-adjust" are read.
    :return: One row per regulation label with the mean of
    "community_size", "nb_com-ppi" and "size_com-ppi", and a "number"
    column holding the count of non-null "p-adjust" values.

    >>> do = pd.DataFrame({"DNA_community": ["C1", "C2", "C3"],
    ... "community_size": [2, 3, 2], "nb_com-ppi": [2, 3, 1],
    ... "size_com-ppi": [10, 10, 10], "p-value": [0.01, 0, 0],
    ... "p-adjust": [0.01, 0.01, 0.01], "regulation": [" + "] * 3,
    ... "regulation-adjusted": [" + "] * 3})
    >>> s = summarize_df_overlap(do)
    >>> s["regulation-adjusted"].to_list() == [" + ", " - ", " . "]
    True
    >>> s["number"].to_list()
    [3, 0, 0]
    >>> s["community_size"].round(2).to_list()
    [2.33, nan, nan]
    """
    label_col = "regulation-adjusted"
    averaged_cols = ["community_size", "nb_com-ppi", "size_com-ppi"]
    counted_col = "p-adjust"
    value_cols = averaged_cols + [counted_col]

    observed = df_overlap[[label_col] + value_cols]
    # Placeholder rows guarantee that every label is a group key.
    placeholders = pd.DataFrame(
        {label_col: [" + ", " - ", " . "],
         **{col: [np.nan] * 3 for col in value_cols}}
    )
    stacked = pd.concat([observed, placeholders], axis=0, ignore_index=True)

    aggregations = {col: "mean" for col in averaged_cols}
    # "count" ignores NaN, so placeholder rows add nothing to the total.
    aggregations[counted_col] = "count"
    summary = stacked.groupby(label_col).agg(aggregations)
    return summary.rename(columns={counted_col: "number"}).reset_index()
def get_enriched_impoverished_communities(df_overlap: pd.DataFrame,
                                          dic_dna_gene: Dict[str, np.array],
                                          ppi_gene: np.array, iteration: int,
                                          dic_values: Dict[str, np.array],
                                          use_seed: bool = True
                                          ) -> Dict[str, int]:
    """
    Count how many communities look enriched or impoverished when their
    observed overlap is replaced by a single random draw.

    For each community, one sample is drawn from `ppi_gene` (through
    `get_list_common_gene` with a single repetition) and the number of
    genes it shares with the community at DNA level is compared, with
    `get_pvalue`, to the values stored for that community in `dic_values`.
    The p-values are then corrected with the Benjamini-Hochberg procedure
    and every community is counted under its regulation label if its
    adjusted p-value is at most 0.05, under " . " otherwise.

    :param df_overlap: A dataframe containing every gene community at dna
    level and the number of common gene with a gene community at protein
    level. The columns "DNA_community" and "size_com-ppi" are read.
    :param dic_dna_gene: The dictionary containing gene interacting at dna
    level
    :param ppi_gene: All the gene interacting at protein level
    :param iteration: The number of repetitions used to build
    `dic_values`; it is forwarded to `get_pvalue`
    :param dic_values: A dictionary containing the number of common
    gene between communities at DNA and protein level for each
    subsampling made from the ppi_gene list.
    :param use_seed: True to use a fixed seed, false else.
    :return: The number of communities counted as enriched (" + "),
    impoverished (" - ") or not significant (" . ")

    >>> do = pd.DataFrame({"DNA_community": ["C1", "C2", "C3"],
    ... "community_size": [2, 3, 2], "nb_com-ppi": [2, 3, 1],
    ... "size_com-ppi": [10, 10, 10], "p-value": [0.01, 0, 0],
    ... "p-adjust": [0.01, 0.01, 0.01], "regulation": [" + "] * 3,
    ... "regulation-adjusted": [" + "] * 3})
    >>> dna_gene = {"C1": [1, 2], "C2": [3, 4, 5], "C3": [6, 7]}
    >>> ppi_g = list(range(11))
    >>> dv = {"C1": [0] * 95 + [1, 1, 1, 2, 2],
    ... "C2": [0] * 95 + [1, 1, 1, 2, 2],
    ... "C3": [0] * 75 + [1] * 10 + [2] * 10 + [3] * 5}
    >>> get_enriched_impoverished_communities(do, dna_gene, ppi_g, 100, dv)
    {' + ': 2, ' . ': 1, ' - ': 0}
    >>> dv = {"C1": [0] * 95 + [1, 1, 1, 2, 2],
    ... "C2": [0] * 95 + [1, 1, 1, 2, 2],
    ... "C3": [3] * 75 + [4] * 23 + [1] * 2}
    >>> get_enriched_impoverished_communities(do, dna_gene, ppi_g, 100, dv)
    {' + ': 2, ' . ': 0, ' - ': 1}
    """
    pvalues = []
    regulations = []
    for row_idx in range(df_overlap.shape[0]):
        row = df_overlap.iloc[row_idx, :]
        name = row["DNA_community"]
        # A single repetition is requested, hence one value in the result.
        drawn = get_list_common_gene(name, dic_dna_gene,
                                     row["size_com-ppi"], ppi_gene,
                                     1, use_seed)
        pvalue, regulation = get_pvalue(dic_values[name], drawn[0],
                                        iteration)
        pvalues.append(pvalue)
        regulations.append(regulation)

    adjusted = multipletests(pvalues, alpha=0.05, method='fdr_bh',
                             is_sorted=False, returnsorted=False)[1]

    counts = {" + ": 0, " . ": 0, " - ": 0}
    for regulation, padj in zip(regulations, adjusted):
        if padj <= 0.05:
            counts[regulation] += 1
        else:
            counts[" . "] += 1
    return counts
def summary_randomisation_test(df_overlap: pd.DataFrame,
                               dic_dna_gene: Dict[str, np.array],
                               ppi_gene: np.array, iteration: int,
                               dic_values: Dict[str, np.array],
                               use_seed: bool = True
                               ) -> pd.DataFrame:
    """
    Tell whether the number of communities sharing an enriched number of
    common genes between the protein and DNA level is itself enriched.

    The summary built by `summarize_df_overlap` gives, for each regulation
    label, the observed number of communities. That number is compared,
    with `get_pvalue`, to the numbers obtained from `iteration` calls to
    `get_enriched_impoverished_communities`. The resulting p-values are
    corrected with the Benjamini-Hochberg procedure.

    NOTE(review): `use_seed` is forwarded unchanged to every call; with
    the default True each repetition presumably re-uses the same fixed
    seed and therefore yields the same counts - confirm this is intended
    outside of the doctests.

    :param df_overlap: A dataframe containing every gene community at dna
    level and the number of common gene with a gene community at protein
    level
    :param dic_dna_gene: The dictionary containing gene interacting at dna
    level
    :param ppi_gene: All the gene interacting at protein level
    :param iteration: The number of time we want to repeat the process
    :param dic_values: A dictionary containing the number of common
    gene between communities at DNA and protein level for each
    subsampling made from the ppi_gene list.
    :param use_seed: True to use a fixed seed, false else.
    :return: The summary table with three extra columns:
    "mean_<iteration>_number" (mean count over the repetitions),
    "p-adjust" (corrected p-value) and "reg_adj" (regulation label, " . "
    when the corrected p-value is above 0.05)

    >>> do = pd.DataFrame({"DNA_community": ["C1", "C2", "C3"],
    ... "community_size": [2, 3, 2], "nb_com-ppi": [2, 3, 1],
    ... "size_com-ppi": [10, 10, 10], "p-value": [0.01, 0, 0],
    ... "p-adjust": [0.01, 0.01, 0.01], "regulation": [" + "] * 3,
    ... "regulation-adjusted": [" + "] * 3})
    >>> dna_gene = {"C1": [1, 2], "C2": [3, 4, 5], "C3": [6, 7]}
    >>> ppi_g = list(range(11))
    >>> dv = {"C1": [0] * 95 + [1, 1, 1, 2, 2],
    ... "C2": [0] * 95 + [1, 1, 1, 2, 2],
    ... "C3": [0] * 75 + [1] * 10 + [2] * 10 + [3] * 5}
    >>> ds = summary_randomisation_test(do, dna_gene, ppi_g, 100, dv)
    >>> ds["regulation-adjusted"].to_list() == [" + ", " - ", " . "]
    True
    >>> ds["number"].to_list()
    [3, 0, 0]
    >>> ds["mean_100_number"].to_list()
    [2.0, 0.0, 1.0]
    >>> ds["p-adjust"].to_list()
    [0.0, 1.0, 0.0]
    >>> ds.iloc[:, 7].to_list() == [" + ", " . ", " - "]
    True
    """
    df_summarized = summarize_df_overlap(df_overlap)

    # One list of simulated counts per regulation label of the summary.
    dic_rand_sum = {k: [] for k in df_summarized["regulation-adjusted"].values}
    for _ in tqdm(range(iteration),
                  desc="Making stats to summarise global enrichment"):
        d = get_enriched_impoverished_communities(df_overlap, dic_dna_gene,
                                                  ppi_gene, iteration,
                                                  dic_values, use_seed)
        for k, v in dic_rand_sum.items():
            v.append(d[k])

    pvalues = []
    regulations = []
    mean_number = []
    # Compare each observed count with its simulated distribution.
    for reg, val in zip(df_summarized["regulation-adjusted"],
                        df_summarized["number"]):
        pval, creg = get_pvalue(dic_rand_sum[reg], val, iteration)
        pvalues.append(pval)
        regulations.append(creg)
        mean_number.append(np.mean(dic_rand_sum[reg]))

    p_adjust = multipletests(pvalues, alpha=0.05, method='fdr_bh',
                             is_sorted=False, returnsorted=False)[1]
    # Keep the regulation label only when it survives the correction.
    reg_adj = [creg if padj <= 0.05 else " . "
               for creg, padj in zip(regulations, p_adjust)]

    df_summarized[f"mean_{iteration}_number"] = mean_number
    df_summarized["p-adjust"] = p_adjust
    df_summarized["reg_adj"] = reg_adj
    return df_summarized
if __name__ == "__main__": if __name__ == "__main__":
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment