Skip to content
Snippets Groups Projects
Commit c06baf31 authored by nfontrod's avatar nfontrod
Browse files

src/find_interaction_cluster/clip_figures/clip_analyser.py: add sl_reg parameter

parent 0effc7d1
No related branches found
No related tags found
No related merge requests found
......@@ -19,6 +19,9 @@ from ...logging_conf import logging_def
import logging
from typing import Tuple
import multiprocessing as mp
from ...figures_utils.exons_interactions import get_every_events_4_a_sl
import sqlite3
from ..sf_and_communities import get_sfname
def bedtools_intersect(gene_bed: Path, clip_bed: Path,
......@@ -230,15 +233,74 @@ def select_community_file(project: str, weight: int, global_weight: int,
raise FileNotFoundError(f"File {com_file} was not found !")
tmp_name = com_file.name.replace(".txt", "")
output = ConfigClip.output_folder / \
f"CLIP_community_figures-{feature}-{tmp_name}"
f"CLIP_community_figures-{feature}-{tmp_name}"
return com_file, output
def add_regulation_column(df_table: pd.DataFrame, sf_name: str, feature: str,
) -> pd.DataFrame:
"""
Add a column community_data on df_table corresponding to the mean \
number of exons regulated in each gene if feature is gene or 1 if \
the exon is regulated.
:param df_table: A dataframe containing the peak density for each gene \
or exons.
:param sf_name: The splicing factor of interest
:param feature: The kind of feature analysed
:return: The dataframe updated
>>> dexon = {"id_exon": ['11553_16', '3222_30', '1001_3'],
... 'clip_peak': [0, 1, 0], "peak_density": [0, 0.1, 0],
... 'community': ['C1', 'C1', 'C2']}
>>> add_regulation_column(pd.DataFrame(dexon), 'TRA2A_B', 'exon')
id_exon clip_peak peak_density community community_data
0 11553_16 0 0.0 C1 1
1 3222_30 1 0.1 C1 1
2 1001_3 0 0.0 C2 0
>>> dgene = {"id_gene": [11553, 3222, 1001],
... 'clip_peak': [0, 1, 0], "peak_density": [0, 0.1, 0],
... 'community': ['C1', 'C1', 'C2']}
>>> add_regulation_column(pd.DataFrame(dgene), 'TRA2A_B', 'gene')
id_gene clip_peak peak_density community community_data
0 11553 0 0.0 C1 0.117647
1 3222 1 0.1 C1 0.020408
2 1001 0 0.0 C2 0.000000
>>> add_regulation_column(pd.DataFrame(dgene), 'TRAgdghfh', 'gene')
id_gene clip_peak peak_density community
0 11553 0 0.0 C1
1 3222 1 0.1 C1
2 1001 0 0.0 C2
"""
if sf_name not in get_sfname():
return df_table
up_exon, x = get_every_events_4_a_sl(sqlite3.connect(ConfigGraph.db_file),
sf_name, "up")
down_exon, x = get_every_events_4_a_sl(
sqlite3.connect(ConfigGraph.db_file), sf_name, "down")
exons = up_exon[f"{sf_name}_up"] + down_exon[f"{sf_name}_down"]
if feature == "exon":
df_table["community_data"] = [0] * len(df_table)
df_table.loc[df_table[f"id_{feature}"].isin(exons),
"community_data"] = 1
else:
df = pd.read_csv(ConfigClip.bed_exon, sep="\t",
names=["chr", "start", "end", "id_exon", "sc", "s"])
df = df[["id_exon"]]
df["id_gene"] = df["id_exon"].str.replace(r"_\d+", "").astype(int)
df["community_data"] = [0] * df.shape[0]
df.loc[df["id_exon"].isin(exons), "community_data"] = 1
df.drop("id_exon", axis=1, inplace=True)
df = df.groupby("id_gene").mean().reset_index()
df_table = df_table.merge(df, how="left", on="id_gene")
return df_table
def create_figure(project: str, weight: int, global_weight: int,
same_gene: bool, feature: str, clip_file: Path,
feature_bed: Path, test_type: str = "permutation",
iteration: int = 10000, display_size: bool=False,
community_file: str = "") -> None:
iteration: int = 10000, display_size: bool = False,
community_file: str = "", sl_reg: bool = False) -> None:
"""
Create the final figure
:param project: The name of the project of interest
......@@ -261,7 +323,8 @@ def create_figure(project: str, weight: int, global_weight: int,
used to find the community files computed with ChIA-PET data.
:param display_size: True to display the size of the community. \
False to display nothing. (default False)
:param ps: The number of processes to create (default 1)
:param sl_reg: True to display the FaRLine regulation of the \
same factor, False to not display it.
"""
logging.info(f"Working on {clip_file}")
com_file, output = select_community_file(project, weight, global_weight,
......@@ -270,6 +333,10 @@ def create_figure(project: str, weight: int, global_weight: int,
output.mkdir(exist_ok=True, parents=True)
outfile = output / f"{clip_file.name.split('.')[0]}.pdf"
final_table = create_table(feature, clip_file, feature_bed, com_file)
if sl_reg:
final_table = add_regulation_column(final_table,
clip_file.name.split("_")[0],
feature)
create_community_fig(final_table, feature, "peak_density", outfile,
test_type, iteration=iteration,
display_size=display_size)
......@@ -279,8 +346,8 @@ def clip_folder_analysis(clip_folder: Path, project: str, weight: int,
global_weight: int, same_gene: bool, feature: str,
test_type: str = "permutation",
iteration: int = 10000, display_size: bool=False,
community_file: str = "", ps: int = 1,
logging_level: str = "DEBUG") -> None:
community_file: str = "", sl_reg: bool = False,
ps: int = 1, logging_level: str = "DEBUG") -> None:
"""
Create the final figure
:param project: The name of the project of interest
......@@ -301,6 +368,8 @@ def clip_folder_analysis(clip_folder: Path, project: str, weight: int,
:param community_file: A file containing custom communities. If \
it equals to '' then weight, global weight and same genes parameter are \
used to find the community files computed with ChIA-PET data.
:param sl_reg: True to display the FaRLine regulation of the \
same factor, False to not display it.
:param ps: The number of processes to create (default 1)
:param logging_level: The level of data to display (default 'DISABLE')
"""
......@@ -309,11 +378,12 @@ def clip_folder_analysis(clip_folder: Path, project: str, weight: int,
else ConfigClip.bed_exon
files = list(clip_folder.glob("*.bed")) + \
list(clip_folder.glob("*.bed.gz"))
files = [files[0]]
pool = mp.Pool(processes=min(len(files), ps))
processes = []
for mfile in files:
args = [project, weight, global_weight, same_gene, feature,
mfile, feature_bed, test_type, iteration, display_size,
community_file]
community_file, sl_reg]
processes.append(pool.apply_async(create_figure, args))
[p.get(timeout=None) for p in processes]
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment