src/find_interaction_cluster/community_figures/create_table_4_community_figure...

src/find_interaction_cluster/community_figures/create_table_4_community_figures.py: script to build the input table used in src/find_interaction_cluster/community_figures/__main__.py script

src/find_interaction_cluster/community_figures/create_table_4_community_figure...
src/find_interaction_cluster/community_figures/create_table_4_community_figures.py: script to build the input table used in src/find_interaction_cluster/community_figures/__main__.py script
abc0f948 · nfontrod · b53b91fe · abc0f948
Commit abc0f948 authored 3 years ago by nfontrod
--- a/src/find_interaction_cluster/community_figures/create_table_4_community_figures.py
+++ b/src/find_interaction_cluster/community_figures/create_table_4_community_figures.py
+#!/usr/bin/env python3
+
+# -*- coding: UTF-8 -*-
+
+"""
+Description: The goal of this script is to create a table that can be used \
+as an input to create community figures (with the __main__.py script of \
+this module). You need a community table, a file containing a list of \
+fasterDB ids and a bed file containing fasterDB exons or genes.
+"""
+
+from pathlib import Path
+from typing import List, Union
+import pandas as pd
+from ..clip_figures.config import Config
+import lazyparser as lp
+
+
+def get_fasterdb_id(bed_file: Path, feature: str) -> List[Union[int, str]]:
+    """
+    Read the fasterDB ids stored in the fourth column of a bed file.
+
+    Only the first id of the file is inspected to check that the content \
+    matches the requested feature.
+
+    :param bed_file: A tab-separated bed file (without header) containing \
+    fasterDB exons or genes
+    :param feature: The kind of feature of interest ('gene' or 'exon')
+    :return: The list of gene ids (as integers) or of exon ids (as strings).
+    :raises ValueError: If the first id does not look like an id of the \
+    requested feature
+
+    >>> mfile = Path(Config.tests_files / "exons.bed")
+    >>> get_fasterdb_id(mfile, "exon")
+    ['1_1', '1_2', '1_3', '1_4', '1_5', '1_6', '1_7', '1_8', '1_9']
+    >>> get_fasterdb_id(Path(Config.tests_files / "genes.bed"), "gene")
+    [1, 2, 3, 4, 5, 6, 7, 8, 9, 415, 10123]
+    >>> get_fasterdb_id(Path(Config.tests_files / "genes.bed"), "exon")
+    Traceback (most recent call last):
+    ...
+    ValueError: Id of exons should contains '_' !
+    >>> get_fasterdb_id(Path(Config.tests_files / "exons.bed"), "gene")
+    Traceback (most recent call last):
+    ...
+    ValueError: Error, the id of genes should be a number
+    """
+    bed_columns = ["chr", "start", "stop", "id", "names", "strand"]
+    bed = pd.read_csv(bed_file, sep="\t", names=bed_columns)
+    str_ids = bed["id"].astype(str).to_list()
+    if feature == "exon":
+        # Exon ids are expected to contain an underscore (e.g. '1_1')
+        if "_" not in str_ids[0]:
+            raise ValueError("Id of exons should contains '_' !")
+        return str_ids
+    if feature != "gene":
+        # Any other feature name: ids are returned as strings, unchecked
+        return str_ids
+    if not str_ids[0].isdigit():
+        raise ValueError("Error, the id of genes should be a number")
+    # Gene ids are returned as integers
+    return bed["id"].astype(int).to_list()
+
+
+def load_list_feature(mfile: Path, feature: str) -> List[Union[int, str]]:
+    """
+    Load every fasterdb id in mfile (one id per line).
+
+    Blank lines are ignored and surrounding whitespace is removed from \
+    each id.
+
+    :param mfile: A file containing a list of fasterDB id.
+    :param feature: The kind of feature analysed. Ids are converted to \
+    integers when feature is 'gene', and kept as strings otherwise.
+    :return: The list of id in the file
+    :raises ValueError: If feature is 'gene' and a line is not an integer
+
+    >>> load_list_feature(Path(Config.tests_files / "gene_list.txt"), "gene")
+    [1, 25, 30, 78]
+    """
+    # The context manager closes the file handle as soon as it is read
+    with mfile.open('r') as handle:
+        lines = handle.read().splitlines()
+    ids = [line.strip() for line in lines]
+    # Empty lines carry no id: skipping them avoids int('') failures
+    return [int(x) if feature == "gene" else x for x in ids if x]
+
+
+def simplify_community_table(mfile: Path, feature: str) -> pd.DataFrame:
+    """
+    Create a dataframe that indicates for each feature, the community to \
+    which it belongs.
+
+    :param mfile: A tab-separated file containing a community table. It \
+    must contain the columns 'community', 'nodes', 'project' and \
+    '<feature>s' (a ', '-separated list of feature ids).
+    :param feature: The kind of feature in the community file (mfile). \
+    exon or gene
+    :return: A dataframe indicating for each feature to which community it \
+    belongs and the size of this community (taken from the 'nodes' column)
+    :raises KeyError: If one of the expected columns is missing
+
+    >>> mfile = Path(Config.tests_files / "test_community_file.txt")
+    >>> simplify_community_table(mfile, 'gene')
+        id_gene community  community_size
+    0       415        C1              11
+    1       416        C1              11
+    2       421        C1              11
+    3       422        C1              11
+    4       423        C1              11
+    5       433        C1              11
+    6       441        C1              11
+    7       475        C1              11
+    8       481        C1              11
+    9       502        C1              11
+    10      511        C1              11
+    11    10123        C2               5
+    12    10209        C2               5
+    13     8812        C2               5
+    14     9140        C2               5
+    15     9166        C2               5
+    """
+    members_column = f"{feature}s"
+    table = pd.read_csv(mfile, sep="\t")
+    # Selecting the expected columns makes a malformed file fail early
+    table = table[["community", "nodes", members_column, "project"]]
+    content = []
+    for community, size, members in zip(table["community"], table["nodes"],
+                                        table[members_column]):
+        # str(): when every community holds a single gene, pandas parses
+        # the column as integers and the cell has no split method
+        for member in str(members).split(", "):
+            if feature == "gene":
+                member = int(member)
+            content.append([member, community, size])
+    return pd.DataFrame(content, columns=[f"id_{feature}", "community",
+                                          "community_size"])
+
+
+def build_table(list_feature: List[Union[int, str]],
+                list_all_id: List[Union[int, str]], df_community: pd.DataFrame,
+                feature: str, name_column: str):
+    """
+    Flag every fasterDB feature found in the list of regulated features \
+    and attach to each feature the community it belongs to.
+
+    :param list_feature: The list of regulated features
+    :param list_all_id:  The list of every fasterDB feature
+    :param df_community: A dataframe indicating which feature belongs to \
+    which community (it must contain the column 'id_<feature>')
+    :param feature: The feature of interest
+    :param name_column: The name of the column that will contain 1 if the \
+    feature was found in list_feature and 0 otherwise
+    :return: The complete dataframe. Features without community get \
+    missing values in the columns coming from df_community.
+
+    >>> mdf = pd.DataFrame({
+    ... 'id_gene': {0: 415, 1: 416, 2: 421, 3: 422, 4: 423, 5: 433, 6: 441},
+    ... 'community': {0: 'C1', 1: 'C1', 2: 'C1', 3: 'C1', 4: 'C1', 5: 'C1',
+    ... 6: 'C1'},
+    ... 'community_size': {0: 11, 1: 11, 2: 11, 3: 11, 4: 11, 5: 11, 6: 11}})
+    >>> build_table([415, 416, 421], [1, 415, 416, 421, 422, 423, 433, 441],
+    ... mdf, 'gene', 'test')
+       id_gene  test community  community_size
+    0        1     0       NaN             NaN
+    1      415     1        C1            11.0
+    2      416     1        C1            11.0
+    3      421     1        C1            11.0
+    4      422     0        C1            11.0
+    5      423     0        C1            11.0
+    6      433     0        C1            11.0
+    7      441     0        C1            11.0
+    """
+    id_column = f"id_{feature}"
+    regulated = set(list_feature)
+    # 1 for the features of interest, 0 for every other fasterDB feature
+    flags = [1 if current_id in regulated else 0 for current_id in list_all_id]
+    table = pd.DataFrame({id_column: list_all_id, name_column: flags})
+    # Left join: every fasterDB feature is kept, even without community
+    return pd.merge(table, df_community, how="left", on=id_column)
+
+
+def filter_table(df_community: pd.DataFrame, threshold: int):
+    """
+    Keep only the rows whose community size is greater than or equal to \
+    the threshold of interest.
+
+    :param df_community: A dataframe of community containing the column \
+    'community_size'
+    :param threshold: The minimum size of the community to keep.
+    :return: The filtered dataframe (original index labels are kept)
+
+    >>> mdf = pd.DataFrame({
+    ... 'id_gene': {0: 415, 1: 416, 2: 421, 3: 422, 4: 423, 5: 433, 6: 441},
+    ... 'community': {0: 'C1', 1: 'C1', 2: 'C1', 3: 'C2', 4: 'C2', 5: 'C2',
+    ... 6: 'C2'},
+    ... 'community_size': {0: 11, 1: 11, 2: 11, 3: 5, 4: 5, 5: 5, 6: 5}})
+    >>> filter_table(mdf, 7)
+       id_gene community  community_size
+    0      415        C1              11
+    1      416        C1              11
+    2      421        C1              11
+    """
+    large_enough = df_community["community_size"] >= threshold
+    return df_community.loc[large_enough]
+
+
+def table_make(input_file: Path, bed_file: Path, community_file: Path,
+               name_input: str, feature: str = "gene", threshold: int = -1,
+               output: Path = "."):
+    """
+    Create the input table used to create figures of communities.
+
+    The table is written in the file `<name_input>_table.txt` \
+    (tab-separated) located in the `output` folder.
+
+    :param input_file: A file containing a list of genes/exons of \
+    interest. Every feature in this file will have the value 1 in \
+    the column `name_input` of the result table.
+    :param bed_file: A bed file containing exons or genes (if it \
+    contains genes, feature parameter must be equal to gene. The same \
+    method applies for exons).
+    :param community_file: A file containing spatial communities
+    :param name_input: The name of the column that highlights the \
+    features in input_file
+    :param feature: The kind of feature of interest (exon or gene). Note \
+    that you must use a community file, a bed file and an input file \
+    containing this type of feature. (default gene)
+    :param threshold: The minimum size used to keep the communities. \
+    With the default value (-1), 10 is used for genes and 50 otherwise.
+    :param output: The folder where the table is written (default '.')
+    """
+    if threshold == -1:
+        threshold = 10 if feature == "gene" else 50
+    # The default value of output is a plain string: it is converted to
+    # a Path so that the '/' operator below always works
+    output_folder = Path(output)
+    feature_interest = load_list_feature(input_file, feature)
+    all_id = get_fasterdb_id(bed_file, feature)
+    df_community = simplify_community_table(community_file, feature)
+    table_community = build_table(feature_interest, all_id, df_community,
+                                  feature, name_input)
+    final_table = filter_table(table_community, threshold)
+    final_table.to_csv(output_folder / f"{name_input}_table.txt", sep="\t",
+                       index=False)
+
+
+@lp.parse(input_file="file", bed_file="file", community_file="file",
+          feature=["gene", "exon"], threshold=range(-1, 1001))
+def launcher(input_file: str, bed_file: str, community_file: str,
+             name_input: str, feature: str = "gene", threshold: int = -1,
+             output: str = "."):
+    """
+    Create the input table used to create figures of communities.
+
+    :param input_file: A file containing a list of genes/exons of \
+    interest. Every exons in this file will have the value 1.0 in \
+    the column `name_input`of the result column.
+    :param bed_file: A bed file containing exons or genes (if it \
+    contains genes, feature parameter must be equals to gene. The same \
+    method applies for exons).
+    :param community_file: A file containing spacial communities
+    :param name_input: The name of the column that highlight the \
+     genes in input_file
+    :param feature: The kind of feature of interest (exon or gene). Note \
+    that you must use a community file, a bed file and an input file \
+    containing this type of feature. (default gene)
+    :param threshold: The minimum threshold used to keep the communities. \
+    (10 for genes and 50 for exons).
+    """
+    # NOTE(review): lazyparser presumably builds the command line help from
+    # the docstring above - confirm before rewording it.
+    # Command line values are plain strings: paths are converted to Path
+    # objects before being handed over to table_make.
+    table_make(input_file=Path(input_file), bed_file=Path(bed_file),
+               community_file=Path(community_file), name_input=name_input,
+               feature=feature, threshold=threshold, output=Path(output))
+
+
+
+if __name__ == "__main__":
+    import sys
+    # With command line arguments the launcher is executed; when the
+    # script is called alone, the doctests of this module are run instead.
+    if len(sys.argv) != 1:
+        launcher()
+    else:
+        import doctest
+        doctest.testmod()
\ No newline at end of file