Skip to content
Snippets Groups Projects
Commit 7fea9838 authored by nfontrod's avatar nfontrod
Browse files

src/find_interaction_cluster/graph_figures/graph_functions.py: add the...

src/find_interaction_cluster/graph_figures/graph_functions.py: add the possibility of selecting communities by doing a permutation test
parent a99189bf
No related branches found
No related tags found
No related merge requests found
......@@ -6,7 +6,6 @@
Description:
"""
import sqlite3
import networkx as nx
import pandas as pd
from .config_graph_figure import ConfigGraph, Config
......@@ -20,6 +19,9 @@ import numpy as np
from ..community_finder import lighten_color, create_figure
from ..sf_and_communities import get_sfname
from itertools import product
from ..community_figures.fig_functions import get_feature_by_community, \
perm_with_ctrl
import multiprocessing as mp
class Parameters:
......@@ -98,11 +100,32 @@ def check_if_exist(graph_file: Path) -> None:
f"parameter to build it")
def get_regulated_community(df: pd.DataFrame, threshold: float,
min_reg: int = 2) -> List[str]:
def get_regulated_community_perm(df: pd.DataFrame, iteration: int,
                                 feature: str) -> List[str]:
    """
    Select, with a permutation test, the communities containing a \
    significant amount of regulated genes/exons.

    :param df: A dataframe containing for each gene, the community \
    to which it belongs and if it is regulated by a splicing factor.
    :param iteration: The number of iterations to perform
    :param feature: The kind of feature to use
    :return: The list of communities containing a significant amount of \
    genes/exons regulated by a splicing factor
    """
    community_features = get_feature_by_community(df, feature)
    stats = perm_with_ctrl(df, feature, "regulation", community_features,
                           iteration)
    # A community is returned only when the three conditions below hold
    # together on its row of the statistics table.
    has_low_pvalue = stats['p-adj'] <= 0.05
    has_plus_flag = stats["reg-adj"] == " + "
    has_min_size = stats["community_size"] >= 10
    selected_rows = has_low_pvalue & has_plus_flag & has_min_size
    return stats.loc[selected_rows, "community"].to_list()
def get_regulated_community_thresholds(df: pd.DataFrame, threshold: float,
min_reg: int = 2) -> List[str]:
"""
Get the list of community containing a significant amount of \
regulated genes/exons.
regulated genes/exons by using thresholds.
:param df: A dataframe containing for each gene, the community \
to which it belong and if it is regulated by a splicing factor.
......@@ -110,22 +133,22 @@ def get_regulated_community(df: pd.DataFrame, threshold: float,
select it (but it must also contains at least min_reg gene regulated)
:param min_reg: The minimum of regulated exon in a community to \
take it into account
:return: The list of colony containing a significant amount of \
:return: The list of community containing a significant amount of \
genes/exons regulated by a splicing factor
>>> d = pd.DataFrame({"id_gene": [1, 2, 3, 4, 5, 6, 7, 8, 9],
... "community": ["C1"] * 3 + ["C2"] * 3 + ["C3"] * 3,
... "community_size": [3] * 9,
... "regulation": [1, 0, 0, 1, 1, 0, 0, 0, 0]})
>>> get_regulated_community(d, 0.1, 2)
>>> get_regulated_community_thresholds(d, 0.1, 2)
['C2']
>>> get_regulated_community(d, 0.1, 3)
>>> get_regulated_community_thresholds(d, 0.1, 3)
[]
>>> get_regulated_community(d, 0.1, 1)
>>> get_regulated_community_thresholds(d, 0.1, 1)
['C1', 'C2']
>>> get_regulated_community(d, 0.5, 1)
>>> get_regulated_community_thresholds(d, 0.5, 1)
['C2']
>>> get_regulated_community(d, 0.0, 0)
>>> get_regulated_community_thresholds(d, 0.0, 0)
['C1', 'C2', 'C3']
"""
ndf = df[["community", "community_size", "regulation"]]\
......@@ -136,6 +159,29 @@ def get_regulated_community(df: pd.DataFrame, threshold: float,
"community"].to_list()
def get_regulated_community(df: pd.DataFrame, feature: str,
                            threshold: float, min_reg: int = 2,
                            iteration: int = 0) -> List[str]:
    """
    Get the list of communities containing a significant amount of \
    regulated genes/exons with the wanted method.

    When ``iteration`` is greater than or equal to 20 the selection is \
    delegated to ``get_regulated_community_perm`` (only ``df``, \
    ``iteration`` and ``feature`` are used). Otherwise it is delegated to \
    ``get_regulated_community_thresholds`` (only ``df``, ``threshold`` and \
    ``min_reg`` are used).

    :param df: A dataframe containing for each gene, the community \
    to which it belongs and if it is regulated by a splicing factor.
    :param feature: The kind of feature to use
    :param threshold: Minimum frequency of gene regulated in a community \
    to select it (but it must also contain at least min_reg gene regulated)
    :param min_reg: The minimum of regulated exon in a community to \
    take it into account
    :param iteration: The number of iterations to perform; values below \
    20 disable the permutation test
    :return: The list of communities containing a significant amount of \
    genes/exons regulated by a splicing factor
    """
    # The same cut-off of 20 iterations is tested by get_title and
    # get_outfile; keep the three in agreement when changing it.
    if iteration >= 20:
        return get_regulated_community_perm(df, iteration, feature)
    return get_regulated_community_thresholds(df, threshold, min_reg)
def select_gene_in_selected_communities(df: pd.DataFrame,
list_communities: List[str],
feature: str
......@@ -232,8 +278,8 @@ def build_community_dic(list_ft: List[List], list_community: List[str],
return dic_community
def get_title(sf_name: str, reg: str, threshold: float, min_reg, feature: str
) -> str:
def get_title(sf_name: str, reg: str, threshold: float, min_reg, feature: str,
iteration: int) -> str:
"""
Return a title
......@@ -244,20 +290,25 @@ def get_title(sf_name: str, reg: str, threshold: float, min_reg, feature: str
:param min_reg: The minimum of regulated exon in a community to \
take it into account
:param feature: The kind of feature we want to analyse
:param iteration: The number of iteration to perform
:return: The title of the figure
>>> get_title("SF1", "down", 0.1, 2, "gene")
>>> get_title("SF1", "down", 0.1, 2, "gene", 0)
'Figure of the communities containing at least 10.0 % of their genes \
(or more than 2 genes) down-regulated by SF1'
"""
reg = "regulated" if reg == "reg" else f"{reg}-regulated"
return f"Figure of the communities containing at least " \
f"{threshold * 100} % of their {feature}s (or more than " \
f"{min_reg} {feature}s) {reg} by {sf_name}"
if iteration < 20:
return f"Figure of the communities containing at least " \
f"{threshold * 100} % of their {feature}s (or more than " \
f"{min_reg} {feature}s) {reg} by {sf_name}"
return f"Figure of the communities enriched in {feature}s" \
f" {reg} by {sf_name} (permutation test {iteration} iteration)"
def get_outfile(graph_file: Path, sf_name: str, reg: str, threshold: float,
min_reg, feature: str) -> Path:
min_reg, feature: str, iteration: int, num_community: int
) -> Path:
"""
Return The outfile of the figure
......@@ -269,16 +320,29 @@ def get_outfile(graph_file: Path, sf_name: str, reg: str, threshold: float,
:param min_reg: The minimum of regulated exon in a community to \
take it into account
:param feature: The kind of feature we want to analyse
:param iteration: The number of iteration to perform
:param num_community: The number of community in the figure
"""
output_dir = graph_file.parent / "graph_figures"
output_dir.mkdir(exist_ok=True)
return output_dir / f"{sf_name}_{reg}_{threshold}_{min_reg}_{feature}.html"
output_tmp = graph_file.parent / "graph_figures"
if iteration < 20:
output_dir = output_tmp / "graph_threshold"
output_dir.mkdir(exist_ok=True, parents=True)
return output_dir / \
f"{sf_name}_{reg}_{threshold}_{min_reg}_{feature}" \
f"_cluster-{num_community}.html"
else:
output_dir = output_tmp / "graph_permutation"
output_dir.mkdir(exist_ok=True, parents=True)
return output_dir / \
f"{sf_name}_{reg}_perm-{iteration}_{feature}_" \
f"cluster-{num_community}.html"
def create_graph_figure(project: str, weight: int, global_weight: int,
same_gene: bool, inflation: float, cell_line: str,
feature: str, sf_name: str, reg: str, threshold: float,
min_reg: int = 2):
min_reg: int = 2, iteration: int = 0,
min_community: int = 3):
"""
:param project: A project name of interest. Used only if \
global_weight is 0
......@@ -299,7 +363,13 @@ def create_graph_figure(project: str, weight: int, global_weight: int,
select it (but it must also contains at least min_reg gene regulated)
:param min_reg: The minimum of regulated exon in a community to \
take it into account
:param iteration: If this parameter is greater than or equal to 20, a \
permutation test is made to find the significantly enriched communities. \
Below 20, the communities are selected with the threshold and min_reg \
parameters instead
:param min_community: The minimum number of enriched communities \
required to produce a figure
"""
print(f"Working on {sf_name}, {reg}")
p = Parameters(project, weight, global_weight, same_gene, inflation,
cell_line, feature)
graph_file, comm_file = recover_json_graph_of_interest(p)
......@@ -309,32 +379,39 @@ def create_graph_figure(project: str, weight: int, global_weight: int,
df_com_file = pd.read_csv(comm_file, sep="\t")
df_com = get_community_tables(df_com_file, feature)
full_com = merge_dataframes(reg_table, df_com, feature)
full_com = full_com[full_com["community_size"] >= 10.0]
reg_ft = full_com.loc[full_com["regulation"] == 1, f"id_{feature}"].to_list()
list_communities = get_regulated_community(full_com, threshold, min_reg)
if len(list_communities) >= 3:
reg_ft = full_com.loc[full_com["regulation"] == 1,
f"id_{feature}"].to_list()
full_com.loc[full_com["community_size"] < 10.0,
["community", "community_size"]] = [np.nan, np.nan]
list_communities = get_regulated_community(full_com, feature, threshold,
min_reg, iteration)
num_communities = len(list_communities)
if num_communities >= min_community:
list_ft = select_gene_in_selected_communities(full_com,
list_communities,
feature)
graph = load_graphic(graph_file)
sub_graph = subgraph_creation(graph, list_ft)
dic_community = build_community_dic(list_ft, list_communities, reg_ft)
my_title = get_title(sf_name, reg, threshold, min_reg, feature)
my_title = get_title(sf_name, reg, threshold, min_reg, feature,
iteration)
outfile = get_outfile(graph_file, sf_name, reg, threshold, min_reg,
feature)
feature, iteration, num_communities)
create_figure(sub_graph, outfile, dic_community, my_title)
else:
print(f"only {len(list_communities)} cluster(s) found ! "
print(f"only {num_communities} cluster(s) found ! "
f"no figure created")
def create_many_graph_figures(project: str, weight: int, global_weight: int,
same_gene: bool, inflation: float, cell_line: str,
def create_many_graph_figures(ps: int, project: str, weight: int,
global_weight: int, same_gene: bool,
inflation: float, cell_line: str,
feature: str, threshold: float,
min_reg: int = 2):
min_reg: int = 2, iteration: int = 0,
min_community: int = 3):
"""
:param ps: The number of processes to create
:param project: A project name of interest. Used only if \
global_weight is 0
:param weight: The weight of interaction to consider
......@@ -350,14 +427,24 @@ def create_many_graph_figures(project: str, weight: int, global_weight: int,
select it (but it must also contains at least min_reg gene regulated)
:param min_reg: The minimum of regulated exon in a community to \
take it into account
:param iteration: If this parameter is greater than or equal to 20, a \
permutation test is made to find the significantly enriched communities. \
Below 20, the communities are selected with the threshold and min_reg \
parameters instead
:param min_community: The minimum number of enriched communities \
required to produce a figure
"""
sf_list = get_sfname()
my_prod = product(sf_list, ["down", "up"])
my_prod = list(product(sf_list, ["down", "up"]))
processes = []
pool = mp.Pool(processes=min(len(my_prod), ps))
for sf_name, reg in my_prod:
print(f"Working on {sf_name}, {reg}")
create_graph_figure(project, weight, global_weight, same_gene,
inflation, cell_line, feature, sf_name, reg,
threshold, min_reg)
args = [project, weight, global_weight, same_gene, inflation,
cell_line, feature, sf_name, reg, threshold, min_reg,
iteration, min_community]
processes.append(pool.apply_async(create_graph_figure, args))
pool.close()
[p.get(timeout=None) for p in processes]
pool.join()
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment