Skip to content
Snippets Groups Projects
Commit abc0f948 authored by nfontrod's avatar nfontrod
Browse files

src/find_interaction_cluster/community_figures/create_table_4_community_figure...

src/find_interaction_cluster/community_figures/create_table_4_community_figures.py: script to build the input table used in src/find_interaction_cluster/community_figures/__main__.py script
parent b53b91fe
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""
Description: The goal of this script is to create a table that can be used \
as an input to create community figures (with the __main__.py script of \
this module). You need a table of communities, \
a file containing a list of fasterDB ids and a bed file containing \
fasterDB exons or genes.
"""
from pathlib import Path
from typing import List, Union
import pandas as pd
from ..clip_figures.config import Config
import lazyparser as lp
def get_fasterdb_id(bed_file: Path, feature: str) -> List[Union[int, str]]:
    """
    Read the fasterDB ids stored in the fourth column of a bed file.

    The file is read as a tab-separated table without header line. Only \
    the first id is inspected to check that the content of the file is \
    coherent with the requested feature.

    :param bed_file: A bed file containing fasterDB exons or genes
    :param feature: The kind of feature of interest ('gene' or 'exon')
    :return: The list of gene ids (as integers) when feature is 'gene', \
    the list of ids as strings otherwise.
    :raises ValueError: If feature is 'gene' and the first id is not only \
    made of digits, or if feature is 'exon' and the first id has no '_'.

    >>> mfile = Path(Config.tests_files / "exons.bed")
    >>> get_fasterdb_id(mfile, "exon")
    ['1_1', '1_2', '1_3', '1_4', '1_5', '1_6', '1_7', '1_8', '1_9']
    >>> get_fasterdb_id(Path(Config.tests_files / "genes.bed"), "gene")
    [1, 2, 3, 4, 5, 6, 7, 8, 9, 415, 10123]
    >>> get_fasterdb_id(Path(Config.tests_files / "genes.bed"), "exon")
    Traceback (most recent call last):
    ...
    ValueError: Id of exons should contains '_' !
    >>> get_fasterdb_id(Path(Config.tests_files / "exons.bed"), "gene")
    Traceback (most recent call last):
    ...
    ValueError: Error, the id of genes should be a number
    """
    bed_columns = ["chr", "start", "stop", "id", "names", "strand"]
    bed = pd.read_csv(bed_file, sep="\t", names=bed_columns)
    ids_as_text = bed["id"].astype(str).to_list()
    if feature == "gene":
        # Gene ids are returned as integers once the first one looks numeric.
        if ids_as_text[0].isdigit():
            return bed["id"].astype(int).to_list()
        raise ValueError("Error, the id of genes should be a number")
    if feature == "exon" and "_" not in ids_as_text[0]:
        raise ValueError("Id of exons should contains '_' !")
    return ids_as_text
def load_list_feature(mfile: Path, feature: str) -> List[Union[int, str]]:
    """
    Load every fasterdb id in mfile.

    Each line of the file is expected to hold one id. Surrounding \
    whitespaces are removed and blank lines are ignored.

    :param mfile: A file containing a list of fasterDB id (one per line).
    :param feature: The kind of feature analysed. Ids are converted to \
    integers when feature is 'gene', they are kept as strings otherwise.
    :return: The list of id in the file
    :raises ValueError: If feature is 'gene' and a non-blank line cannot \
    be converted to an integer.

    >>> load_list_feature(Path(Config.tests_files / "gene_list.txt"), "gene")
    [1, 25, 30, 78]
    """
    # The context manager guarantees that the file handle is closed.
    with mfile.open("r") as handle:
        stripped_lines = [line.strip() for line in handle.read().splitlines()]
    # Blank lines carry no id: skipping them avoids int('') errors for genes
    # and empty ids for exons.
    return [int(cur_id) if feature == "gene" else cur_id
            for cur_id in stripped_lines if cur_id]
def simplify_community_table(mfile: Path, feature: str) -> pd.DataFrame:
    """
    Create a dataframe that indicates for each feature, the community to \
    which it belongs.

    The community file is a tab-separated table with a header that must \
    contain the columns 'community', 'nodes', '<feature>s' and 'project'. \
    The '<feature>s' column holds the comma-separated ids of the features \
    of each community; spaces around the ids are ignored, as well as empty \
    ids and rows without any id.

    :param mfile: A file containing a community table.
    :param feature: The kind of feature in the community file (mfile). \
    exon or gene
    :return: A dataframe indicating for each feature to which community it \
    belong. The column 'community_size' contains the value of the 'nodes' \
    column of the community file.
    :raises KeyError: If an expected column is missing in mfile.

    >>> mfile = Path(Config.tests_files / "test_community_file.txt")
    >>> simplify_community_table(mfile, 'gene')  # doctest: +NORMALIZE_WHITESPACE
    id_gene community community_size
    0 415 C1 11
    1 416 C1 11
    2 421 C1 11
    3 422 C1 11
    4 423 C1 11
    5 433 C1 11
    6 441 C1 11
    7 475 C1 11
    8 481 C1 11
    9 502 C1 11
    10 511 C1 11
    11 10123 C2 5
    12 10209 C2 5
    13 8812 C2 5
    14 9140 C2 5
    15 9166 C2 5
    """
    feature_column = f"{feature}s"
    df = pd.read_csv(mfile, sep="\t")
    # Selecting the expected columns also checks that they all exist.
    df = df[["community", "nodes", feature_column, "project"]]
    content = []
    for community, size, raw_ids in zip(df["community"], df["nodes"],
                                        df[feature_column]):
        if pd.isna(raw_ids):
            # A community without any listed feature adds no row.
            continue
        # str() handles id columns that pandas parsed as numbers (this
        # happens when every community contains a single gene).
        for token in str(raw_ids).split(","):
            cur_id = token.strip()
            if not cur_id:
                continue
            content.append([int(cur_id) if feature == "gene" else cur_id,
                            community, size])
    return pd.DataFrame(content, columns=[f"id_{feature}", "community",
                                          "community_size"])
def build_table(list_feature: List[Union[int, str]],
                list_all_id: List[Union[int, str]], df_community: pd.DataFrame,
                feature: str, name_column: str) -> pd.DataFrame:
    """
    Flag the features of interest among every fasterDB feature and add \
    their community.

    :param list_feature: The list of regulated features
    :param list_all_id: The list of every fasterDB feature
    :param df_community: A dataframe indicating which gene belong to which \
    community. It must contain the column 'id_<feature>'.
    :param feature: The feature of interest
    :param name_column: The name of the column that contains 1 if the \
    feature is in list_feature and 0 otherwise.
    :return: The complete dataframe: one row per id of list_all_id \
    left-joined with df_community (features without community get missing \
    values in the columns coming from df_community).

    >>> mdf = pd.DataFrame({
    ...     'id_gene': {0: 415, 1: 416, 2: 421, 3: 422, 4: 423, 5: 433, 6: 441},
    ...     'community': {0: 'C1', 1: 'C1', 2: 'C1', 3: 'C1', 4: 'C1', 5: 'C1',
    ...                   6: 'C1'},
    ...     'community_size': {0: 11, 1: 11, 2: 11, 3: 11, 4: 11, 5: 11, 6: 11}})
    >>> build_table([415, 416, 421], [1, 415, 416, 421, 422, 423, 433, 441],
    ...             mdf, 'gene', 'test')  # doctest: +NORMALIZE_WHITESPACE
    id_gene test community community_size
    0 1 0 NaN NaN
    1 415 1 C1 11.0
    2 416 1 C1 11.0
    3 421 1 C1 11.0
    4 422 0 C1 11.0
    5 423 0 C1 11.0
    6 433 0 C1 11.0
    7 441 0 C1 11.0
    """
    id_column = f"id_{feature}"
    # Every feature starts unflagged ...
    table = pd.DataFrame({id_column: list_all_id,
                          name_column: [0] * len(list_all_id)})
    # ... then the features of interest are switched to 1.
    is_of_interest = table[id_column].isin(list_feature)
    table.loc[is_of_interest, name_column] = 1
    return pd.merge(table, df_community, how="left", on=id_column)
def filter_table(df_community: pd.DataFrame, threshold: int) -> pd.DataFrame:
    """
    Keep only the rows whose community size reaches the threshold.

    :param df_community: A dataframe of community containing the column \
    'community_size'
    :param threshold: The minimum size of the community to keep.
    :return: The rows of df_community having a 'community_size' greater \
    than or equal to threshold (the original index is kept).

    >>> mdf = pd.DataFrame({
    ...     'id_gene': {0: 415, 1: 416, 2: 421, 3: 422, 4: 423, 5: 433, 6: 441},
    ...     'community': {0: 'C1', 1: 'C1', 2: 'C1', 3: 'C2', 4: 'C2', 5: 'C2',
    ...                   6: 'C2'},
    ...     'community_size': {0: 11, 1: 11, 2: 11, 3: 5, 4: 5, 5: 5, 6: 5}})
    >>> filter_table(mdf, 7)  # doctest: +NORMALIZE_WHITESPACE
    id_gene community community_size
    0 415 C1 11
    1 416 C1 11
    2 421 C1 11
    """
    big_enough = df_community["community_size"].ge(threshold)
    return df_community.loc[big_enough]
def table_make(input_file: Path, bed_file: Path, community_file: Path,
               name_input: str, feature: str = "gene", threshold: int = -1,
               output: Union[Path, str] = ".") -> None:
    """
    Create the input table used to create figures of communities.

    The table is written in the file `<name_input>_table.txt` (tab \
    separated, without index) located in the `output` folder.

    :param input_file: A file containing a list of genes/exons of \
    interest. Every feature in this file will have the value 1 in \
    the column `name_input` of the result table.
    :param bed_file: A bed file containing exons or genes (if it \
    contains genes, feature parameter must be equals to gene. The same \
    method applies for exons).
    :param community_file: A file containing spatial communities
    :param name_input: The name of the column that highlight the \
    genes in input_file. It is also used as prefix of the output file name.
    :param feature: The kind of feature of interest (exon or gene). Note \
    that you must use a community file, a bed file and an input file \
    containing this type of feature. (default gene)
    :param threshold: The minimum size a community must have to be kept. \
    The value -1 (default) selects 10 for genes and 50 otherwise.
    :param output: The folder where the result table is written \
    (default: current directory).
    """
    # The default value is a plain string and '/' is not defined between
    # two strings: convert to Path before building the output file name.
    output = Path(output)
    if threshold == -1:
        threshold = 10 if feature == "gene" else 50
    feature_interest = load_list_feature(input_file, feature)
    all_id = get_fasterdb_id(bed_file, feature)
    df_community = simplify_community_table(community_file, feature)
    table_community = build_table(feature_interest, all_id, df_community,
                                  feature, name_input)
    final_table = filter_table(table_community, threshold)
    final_table.to_csv(output / f"{name_input}_table.txt", sep="\t",
                       index=False)
@lp.parse(input_file="file", bed_file="file", community_file="file",
          feature=["gene", "exon"], threshold=range(-1, 1001))
def launcher(input_file: str, bed_file: str, community_file: str,
             name_input: str, feature: str = "gene", threshold: int = -1,
             output: str = "."):
    """
    Create the input table used to create figures of communities.
    :param input_file: A file containing a list of genes/exons of \
    interest. Every exons in this file will have the value 1.0 in \
    the column `name_input`of the result column.
    :param bed_file: A bed file containing exons or genes (if it \
    contains genes, feature parameter must be equals to gene. The same \
    method applies for exons).
    :param community_file: A file containing spacial communities
    :param name_input: The name of the column that highlight the \
    genes in input_file
    :param feature: The kind of feature of interest (exon or gene). Note \
    that you must use a community file, a bed file and an input file \
    containing this type of feature. (default gene)
    :param threshold: The minimum threshold used to keep the communities. \
    (10 for genes and 50 for exons).
    :param output: The folder where the result table will be created \
    (default .)
    """
    # NOTE(review): lazyparser presumably builds the command line help from
    # the ':param' entries above, so their text is kept on one logical line
    # each (hence the trailing backslashes) - confirm before reformatting.
    # Command line values are strings: paths are converted before the call.
    table_make(input_file=Path(input_file), bed_file=Path(bed_file),
               community_file=Path(community_file), name_input=name_input,
               feature=feature, threshold=threshold, output=Path(output))
if __name__ == "__main__":
    import sys

    # With at least one command line argument the table is built; a bare
    # call (program name only) runs the doctests of this module instead.
    if len(sys.argv) != 1:
        launcher()
    else:
        import doctest

        doctest.testmod()
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment