src/find_interaction_cluster/clip_figures/clip_compo_*: creation of a script...

src/find_interaction_cluster/clip_figures/clip_compo_*: creation of a script to analyse the nucleotide composition of communities enrichied or impoverished in peak density from clip data

src/find_interaction_cluster/clip_figures/clip_compo_*: creation of a script...
496626e8 · nfontrod · b01ba86e · 496626e8 · 496626e8
Commit 496626e8 authored Jan 13, 2021 by nfontrod
--- a/src/find_interaction_cluster/clip_figures/clip_compo_analyser.py
+++ b/src/find_interaction_cluster/clip_figures/clip_compo_analyser.py
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
+"""
+Description: The goal of this script is to check whether a communities \
+of genes that are more frequently bind by a splicing factor \
+has a composition biais.
+"""
+from pathlib import Path
+from typing import List, Tuple, Optional
+import pandas as pd
+from .config import ConfigClip, Config, ConfigGraph
+import doctest
+from math import floor, ceil
+from ..nt_and_community import get_ft_id, get_cpnt_frequency, get_features
+import sqlite3
+import numpy as np
+from scipy.stats import mannwhitneyu
+from itertools import combinations, product
+from statsmodels.stats.multitest import multipletests
+import seaborn as sns
+import matplotlib.pyplot as plt
+from ...logging_conf import logging_def
+import logging
+import multiprocessing as mp
+def find_final_clip_tables(folder: Path) -> List[Path]:
+    """
+    From a folder return every files ending with *_bar.txt.
+    :param folder: A folder containing final tables used to produce \
+    community figures for clip data (figure produced by the function \
+    clip_folder_analysis of the script clip_analyse.py).
+    :return: The list of files ending with *_bar.txt
+    >>> find_final_clip_tables(Path("fziufriubdsibf"))
+    Traceback (most recent call last):
+    ...
+    NotADirectoryError: Folder fziufriubdsibf doesn't exists !
+    >>> find_final_clip_tables(Path("src/"))
+    []
+    """
+    if not folder.is_dir():
+        raise NotADirectoryError(f"Folder {folder} doesn't exists !")
+    return list(folder.glob("*_bar.txt"))
+def prepare_df(clip_df: pd.DataFrame, feature: str) -> pd.DataFrame:
+    """
+    Prepare the table containing data to produce the clip community figure.
+    :param clip_df: table containing data to produce the clip \
+     community figure.
+    :param feature: The feature of interest (gene or exon)
+    :return: The filtered table
+    >>> d = prepare_df(pd.read_table(ConfigClip.test_clip_file, sep="\\t"),
+    ... "gene")
+    >>> d.head()
+      community  community_size     p-adj reg-adj  peak_density
+    8      C852             1.0  0.015356      -       0.000000
+    9       C91             2.0  0.027338      -       0.000000
+    0       C10             3.0  0.228296      .       0.046749
+    1      C165            12.0  0.252854      .       0.271487
+    3      C325            18.0  0.228296      .       0.272363
+    >>> d["community"].to_list() == ['C852', 'C91', 'C10', 'C165', 'C325',
+    ... 'C232', 'C397', 'C952', 'C365', 'C680', 'C916', 'C690']
+    True
+    """
+    df = clip_df.copy()
+    df = df.loc[df[f"id_{feature}"] != "ctrl",
+                ["peak_density", "community", "community_size", "p-adj",
+                 "reg-adj", f"id_{feature}"]]
+    df = df.groupby(["community", "community_size", "p-adj", "reg-adj"]
+                    ).mean().reset_index()
+    df = df.sort_values("peak_density", ascending=True)
+    return df
+def get_feature_id(clip_df: pd.DataFrame, feature: str,
+                   list_community: List[str]) -> pd.DataFrame:
+    """
+    Get the list of gene contained in the list of communities given.
+    :param clip_df: table containing data to produce the clip \
+     community figure.
+    :param feature: The feature of interest (gene or exon)
+    :param list_community: A list of community
+    :return: The filtered table)
+    >>> d = pd.read_table(ConfigClip.test_clip_file, sep="\\t")
+    >>> get_feature_id(d, "gene", ['C916', 'C690'])
+    ['11489', '11723', '9289', '9734', '9708']
+    >>> get_feature_id(d, "gene", ['C680', 'C916',
+    ... 'C690'])
+    ['16283', '16852', '11489', '11723', '9289', '9734', '9708']
+    >>> get_feature_id(d, "gene", ['C91', 'C852'])
+    ['13856', '13360', '7831']
+    >>> get_feature_id(d, "gene", ['C91', 'C852',
+    ... 'C10'])
+    ['13856', '13360', '7831', '16763', '16676', '16817']
+    """
+    df = clip_df.copy()
+    df = df[df[f"id_{feature}"] != "ctrl"]
+    return df[df["community"].isin(list_community)][f"id_{feature}"].to_list()
+def get_extreme_groups(df: pd.DataFrame, table_file: Path, group: str,
+                       threshold_feature: int,
+                       default_groups: int) -> Tuple[List, str]:
+    """
+    Recover the impoverished or enriched communities for peaks \
+    in a given splicing factor if they exists.
+    :param df: A dataframe used to build the community figures \
+    of clip density.
+    :param table_file: The file from  which the dataframe df was created
+    :param group: The kind of community to recover (enriched or impoverished)
+    :param threshold_feature: The minimum number of feature to \
+    recover from enriched/impoverished communities
+    :param default_groups: The number of community to get if none \
+    (or not enough) community are enriched or impoverished \
+    (depending on group parameter)
+    :return: The list of enriched or impoverished community of feature \
+    and the name of the list
+    >>> d = prepare_df(pd.read_table(ConfigClip.test_clip_file, sep="\\t"),
+    ... "gene")
+    >>> get_extreme_groups(d, ConfigClip.test_clip_file, "enriched", 5, 3)
+    (['C916', 'C690'], 'SF_test_enriched')
+    >>> get_extreme_groups(d, ConfigClip.test_clip_file, "enriched", 6, 3)
+    (['C680', 'C916', 'C690'], 'SF_test_enriched_nosig')
+    >>> get_extreme_groups(d, ConfigClip.test_clip_file, "impoverished", 3, 3)
+    (['C852', 'C91'], 'SF_test_impoverished')
+    >>> get_extreme_groups(d, ConfigClip.test_clip_file, "impoverished", 4, 3)
+    (['C852', 'C91', 'C10'], 'SF_test_impoverished_nosig')
+    """
+    outname = table_file.name.rstrip("bar.txt") + group
+    my_reg = " + " if group == "enriched" else " - "
+    tmp = df[df["reg-adj"] == my_reg]
+    if not tmp.empty and sum(tmp["community_size"]) >= threshold_feature:
+        return tmp["community"].to_list(), outname
+    outname += "_nosig"
+    list_com = df["community"].to_list()
+    if group == "enriched":
+        return list_com[-default_groups:], outname
+    else:
+        return list_com[:default_groups], outname
+def get_middle_group(df: pd.DataFrame, table_file: Path,
+                     default_groups: int) -> Tuple[List, str]:
+    """
+    Recover the impoverished or enriched communities for peaks \
+    in a given splicing factor if they exists.
+    :param df: A dataframe used to build the community figures \
+    of clip density.
+    :param table_file: The file from  which the dataframe df was created
+    :param default_groups: The number of community to get if none \
+    (or not enough) community are enriched or impoverished \
+    (depending on group parameter)
+    :return: The list of enriched or impoverished community of feature \
+    and the name of the list
+    >>> d = prepare_df(pd.read_table(ConfigClip.test_clip_file, sep="\\t"),
+    ... "gene")
+    >>> get_middle_group(d, ConfigClip.test_clip_file, 3)
+    (['C232', 'C397', 'C952'], 'SF_test_middle')
+    >>> get_middle_group(d, ConfigClip.test_clip_file, 4)
+    (['C325', 'C232', 'C397', 'C952'], 'SF_test_middle')
+    """
+    outname = table_file.name.rstrip("bar.txt") + "middle"
+    list_com = df["community"].to_list()
+    index = floor(len(list_com) / 2)
+    return list_com[index - floor(default_groups / 2):
+                    index + ceil(default_groups / 2)], outname
+def add_ctrl_df(df: pd.DataFrame, feature: str) -> pd.DataFrame:
+    """
+    Create a dataframe containing every genes/exons not in a community.
+    :param df: The dataframe used to create a community figure
+    :param feature: The feature of interest (exon or gene)
+    :return: A table containing 2 columns: id_{feature} and group \
+    (containing 'ctrl')
+    >>> import time
+    >>> s = time.time()
+    >>> add_ctrl_df(pd.DataFrame({"id_gene": list(range(1, 19420)) + ["ctrl"]
+    ... }), "gene")
+       id_gene group
+    0    19420  ctrl
+    1    19421  ctrl
+    2    19422  ctrl
+    3    19423  ctrl
+    4    19424  ctrl
+    5    19425  ctrl
+    6    19426  ctrl
+    """
+    list_id = get_ft_id(sqlite3.connect(Config.db_file), feature)
+    com_id = df[df[f"id_{feature}"] != "ctrl"][f"id_{feature}"].to_list()
+    if feature == "gene":
+        list_id = list(map(int, list_id))
+        com_id = list(map(int, com_id))
+    ctrl_id = [mid for mid in list_id if mid not in com_id]
+    return pd.DataFrame({f"id_{feature}": ctrl_id,
+                         "group": ["ctrl"] * len(ctrl_id)})
+def get_interest_groups(clip_table_file: Path, feature: str,
+                        threshold_feature: int, default_groups: int
+                        ) -> Optional[pd.DataFrame]:
+    """
+    :param clip_table_file: table containing data to produce the clip \
+     community figure.
+    :param feature: The feature of interest (gene or exon)
+    :param threshold_feature: The minimum number of feature to \
+    recover from enriched/impoverished communities
+    :param default_groups: The number of community to get if none \
+    (or not enough) community are enriched or impoverished \
+    (depending on group parameter)
+    :return: The dataframe indicating for a id of a feature, the groups where \
+    if belong.
+    >>> get_interest_groups(ConfigClip.test_clip_file, "gene", 3, 3)
+       id_gene                 group
+    0    13856  SF_test_impoverished
+    1    13360  SF_test_impoverished
+    2     7831  SF_test_impoverished
+    3    17226        SF_test_middle
+    4    17418        SF_test_middle
+    5    16985        SF_test_middle
+    6    16897        SF_test_middle
+    7    16536        SF_test_middle
+    8    17214        SF_test_middle
+    9    17271        SF_test_middle
+    10   17360        SF_test_middle
+    11   16022        SF_test_middle
+    12   16660        SF_test_middle
+    13   17080        SF_test_middle
+    14   17038        SF_test_middle
+    15   10558        SF_test_middle
+    16   11520        SF_test_middle
+    17   10569        SF_test_middle
+    18   10641        SF_test_middle
+    19   10333        SF_test_middle
+    20   12509        SF_test_middle
+    21   11687        SF_test_middle
+    22   10886        SF_test_middle
+    23   10784        SF_test_middle
+    24   10997        SF_test_middle
+    25   10907        SF_test_middle
+    26   12115        SF_test_middle
+    27   10922        SF_test_middle
+    28   10586        SF_test_middle
+    29   12030        SF_test_middle
+    30   10627        SF_test_middle
+    31   12525        SF_test_middle
+    32   12172        SF_test_middle
+    33   11346        SF_test_middle
+    34   11949        SF_test_middle
+    35   15390        SF_test_middle
+    36   15348        SF_test_middle
+    37   14966        SF_test_middle
+    38   15272        SF_test_middle
+    39   15287        SF_test_middle
+    40   15135        SF_test_middle
+    41   14992        SF_test_middle
+    42   15006        SF_test_middle
+    43   15199        SF_test_middle
+    44   15285        SF_test_middle
+    45   11489      SF_test_enriched
+    46   11723      SF_test_enriched
+    47    9289      SF_test_enriched
+    48    9734      SF_test_enriched
+    49    9708      SF_test_enriched
+    >>> r = get_interest_groups(ConfigClip.test_clip_file, "gene", 6, 3)
+    >>> r is None
+    True
+    """
+    ini_df = pd.read_csv(clip_table_file, sep="\t")
+    df = prepare_df(ini_df, feature)
+    enrich_com, name_enrich = get_extreme_groups(
+        df, clip_table_file, "enriched", threshold_feature, default_groups)
+    impov_com, name_impov = get_extreme_groups(
+        df, clip_table_file, "impoverished", threshold_feature, default_groups)
+    if "nosig" in name_enrich and "nosig" in name_impov:
+        return None
+    middle_com, name_middle = get_middle_group(df, clip_table_file,
+                                               default_groups)
+    enrich_grp = get_feature_id(ini_df, feature, enrich_com)
+    impov_grp = get_feature_id(ini_df, feature, impov_com)
+    middle_grp = get_feature_id(ini_df, feature, middle_com)
+    return pd.DataFrame({f"id_{feature}": impov_grp + middle_grp + enrich_grp,
+                         "group": [name_impov] * len(impov_grp) +
+                                  [name_middle] * len(middle_grp) +
+                                  [name_enrich] * len(enrich_grp)})
+def create_stat_df(df_comp: pd.DataFrame, cpnt: str) -> pd.DataFrame:
+    """
+    Compute statistical analysis for every group in df_comp.
+    :param df_comp: The dataframe containing the frequency of every `cpnt_type`
+    :param cpnt: The component studied
+    :return: Dataframe of p-values
+    >>> d = pd.DataFrame({"id_gene": [1, 2, 3, 4, 5, 6, 7, 8],
+    ... "A": [1, 2, 3, 4, 5, 6, 7, 8], "group": ["test"] * 4 + ["ctrl"] * 4})
+    >>> create_stat_df(d, "A")
+       grp1  grp2     p-val
+    0  ctrl  test  0.030383
+    >>> d = pd.DataFrame({"id_gene": list(range(1, 13)),
+    ... "A": list(range(1, 12)) + [5], "group": ["test"] * 4 + ["ctrl"] * 4 +
+    ... ["test2"] * 4})
+    >>> create_stat_df(d, "A")
+       grp1   grp2     p-val     p-adj
+    0  ctrl   test  0.030383  0.045574
+    1  ctrl  test2  0.245383  0.245383
+    2  test  test2  0.030383  0.045574
+    """
+    comparison = combinations(np.unique(df_comp["group"]), 2)
+    content = []
+    for grp1, grp2 in comparison:
+        p_val = mannwhitneyu(
+            df_comp.loc[df_comp["group"] == grp1, cpnt].values,
+            df_comp.loc[df_comp["group"] == grp2, cpnt].values,
+            alternative="two-sided")[1]
+        content.append([grp1, grp2, p_val])
+    df = pd.DataFrame(content, columns=["grp1", "grp2", "p-val"])
+    if df.shape[0] > 1:
+        df["p-adj"] = multipletests(df['p-val'].values, method="fdr_bh")[1]
+    return df
+def make_barplot(df_comp: pd.DataFrame, outfile: Path, cpnt: str,
+                 cpnt_type: str, clip_name: str) -> None:
+    """
+    Create a barplot showing the composition for different groups of feature.
+    :param df_comp: The dataframe containing the frequency of every `cpnt_type`
+    :param outfile: File were the figure will be created
+    :param cpnt: The component of interest
+    :param cpnt_type: The type of component of interest
+    :param clip_name: The name of the clip studied
+    """
+    sns.set(context="talk")
+    g = sns.catplot(x="group", y=cpnt, data=df_comp, height=12, aspect=1.7,
+                    kind="violin")
+    g.fig.suptitle(f"Mean frequency of {cpnt}({cpnt_type})"
+                   f"among different community groups\n"
+                   f"created from the peak_density from clip {clip_name}")
+    g.ax.set_ylabel(f'Frequency of {cpnt} ({cpnt_type})')
+    g.savefig(outfile)
+    plt.clf()
+    plt.close()
+def create_figure_4_clip_table(clip_table_file: Path, feature: str,
+                               threshold_feature: int, default_groups: int,
+                               df_comp: pd.DataFrame, cpnt_type: str,
+                               cpnt: str, df_id_ctrl: pd.DataFrame,
+                               write_comp_table: bool) -> None:
+    """
+    Create a component figure for a clip file
+    :param clip_table_file: table containing data to produce the clip \
+     community figure.
+    :param feature: The feature of interest (gene or exon)
+    :param threshold_feature: The minimum number of feature to \
+    recover from enriched/impoverished communities
+    :param default_groups: The number of community to get if none \
+    (or not enough) community are enriched or impoverished \
+    (depending on group parameter)
+    :param df_comp: The dataframe containing the frequency of every `cpnt_type`
+    :param cpnt_type: The kind of component of interest
+    :param cpnt: The kind of component of interest
+    :param df_id_ctrl: A dataframe indicating control exons
+    :param write_comp_table: True to save comp table, false else
+    """
+    df_group = get_interest_groups(clip_table_file, feature,
+                                   threshold_feature, default_groups)
+    if feature == "gene":
+        df_group["id_gene"] = df_group["id_gene"].astype(int)
+        df_comp["id_gene"] = df_comp["id_gene"].astype(int)
+    if df_group is None:
+        return None
+    df_group = pd.concat([df_group, df_id_ctrl], ignore_index=True)
+    if len(np.unique(df_group[f"id_{feature}"].values)) != df_group.shape[0]:
+        raise ValueError("Found duplicates value in df_group, exiting...")
+    df_comp = df_comp.merge(df_group, how="left", on=f"id_{feature}")
+    df_comp = df_comp[-df_comp["group"].isna()]
+    sf_name = clip_table_file.name.rstrip("_bar.txt")
+    outfiles = [clip_table_file.parent / sf_name /
+                f"{cpnt_type}_{cpnt}_{sf_name}_thd" 
+                f"{threshold_feature}_dft{default_groups}.{ext}"
+                for ext in ['txt', 'pdf', 'bar.txt']]
+    outfiles[0].parent.mkdir(exist_ok=True)
+    df_stat = create_stat_df(df_comp, cpnt)
+    df_stat.to_csv(outfiles[0], sep="\t", index=False)
+    make_barplot(df_comp, outfiles[1], cpnt, cpnt_type, sf_name)
+    if write_comp_table:
+        df_comp.to_csv(outfiles[2], sep="\t", index=False)
+def create_multiple_figures(clip_result_folder: Path, feature: str,
+                            threshold_feature: int, default_groups: int,
+                            cpnt_type: str, region: str = "gene",
+                            ps: int = 1,
+                            logging_level: str = "DEBUG") -> None:
+    """
+    Create multiple composition figures from a result clip folder.
+    :param clip_result_folder: Folder containing tables used to \
+    create clip community figures
+    :param feature: The feature of interest (exon, or gene)
+    :param threshold_feature: The minimum number of feature needed \
+    inside communities enriched or impoverished in peak_density to consider \
+    those community (otherwise `default_groups` default number of community \
+    are taken
+    :param default_groups: The number of communities to take as default.
+    :param cpnt_type: The kind of component analysed
+    :param region: Only used when feature is gene. Corresponds to \
+    the region studied within genes.
+    :param ps: The number of processes to create
+    :param logging_level: The level of data to display (default 'DISABLE')
+    """
+    logging_def(ConfigGraph.community_folder, __file__, logging_level)
+    list_tables = find_final_clip_tables(clip_result_folder)
+    list_ft = get_features(cpnt_type)
+    if len(list_ft) == 0:
+        logging.warning(f"No clip table found in {clip_result_folder} !\n"
+                        f"Exiting...")
+        return None
+    cnx = sqlite3.connect(Config.db_file)
+    df_comp = get_cpnt_frequency(cnx, get_ft_id(cnx, feature), feature, region,
+                                 cpnt_type)
+    df_ctrl = add_ctrl_df(pd.read_csv(list_tables[0], sep="\t"), feature)
+    conditions = list(product(list_tables, list_ft))
+    ps = min(len(conditions), ps)
+    pool = mp.Pool(processes=ps if ps > 0 else 1)
+    processes = []
+    conditions = [(Path("results/community_of_co-localized-exons/communities/weight-3_global_weight-3/CLIP_community_figures/PCBP1_merged_bar.txt"), "A")]
+    write_bar_table = True
+    for clip_file, cpnt in conditions:
+        args = [clip_file, feature, threshold_feature, default_groups,
+                df_comp, cpnt_type, cpnt, df_ctrl, write_bar_table]
+        write_bar_table = False
+        processes.append(pool.apply_async(create_figure_4_clip_table, args))
+    [p.get(timeout=None) for p in processes]
+    pool.close()
+    pool.join()
+if __name__ == "__main__":
+    doctest.testmod()
--- a/src/find_interaction_cluster/clip_figures/clip_compo_launcher.py
+++ b/src/find_interaction_cluster/clip_figures/clip_compo_launcher.py
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
+"""
+Description: Launch clip_compo_analyser
+"""
+import lazyparser as lp
+from .clip_compo_analyser import create_multiple_figures
+from pathlib import Path
+import multiprocessing as mp
+@lp.parse(ps = range(1, mp.cpu_count() + 1), feature=["gene", "exon"],
+          threshold_feature="9 < threshold_feature < 10000",
+          default_groups=range(10, 101), cpnt_type=["nt", "dnt", "tnt", "aa"],
+          region=["gene", "exon", "intron"])
+def launcher(clip_result_folder: str, feature: str,
+             threshold_feature: int, default_groups: int,
+             cpnt_type: str, region: str = "gene",
+             ps: int = 1, logging_level: str = "DISABLE") -> None:
+    """
+    Create multiple composition figures from a result clip folder.
+    :param clip_result_folder: Folder containing tables used to \
+    create clip community figures
+    :param feature: The feature of interest (exon, or gene)
+    :param threshold_feature: The minimum number of feature needed \
+    inside communities enriched or impoverished in peak_density to consider \
+    those community (otherwise `default_groups` default number of community \
+    are taken
+    :param default_groups: The number of communities to take as default.
+    :param cpnt_type: The kind of component analysed
+    :param region: Only used when feature is gene. Corresponds to \
+    the region studied within genes. (default: 'gene')
+    :param ps: The number of processes to create (default 1)
+    :param logging_level: The level of data to display (default 'DISABLE')
+    """
+    clip_result_folder = Path(clip_result_folder)
+    if not clip_result_folder.is_dir():
+        raise NotADirectoryError(f"Directory {clip_result_folder} don't exist")
+    create_multiple_figures(clip_result_folder, feature, threshold_feature,
+                            default_groups, cpnt_type, region, ps,
+                            logging_level)
+# launcher()