Select Git revision
docker_init.sh
graph_2d_3d.py 6.93 KiB
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""
Script that allows to produce three dimension plot, with distribution of
spatial clusters according to their normalized M, R and S gene composition.
It can be done for prerna, gene, exon or intron region and for hic, chiapet or
tad data.
"""
import pandas as pd
from .config_figures import Config
import argparse
import sqlite3
import pathlib
import plotly.graph_objects as go
# Command-line interface. The parsed namespace is kept in the module-level
# name `args`, which the functions below read directly.
parser = argparse.ArgumentParser(description="See script description")
# Only these three data types are handled when the cluster file is selected;
# rejecting anything else here gives a clear usage error instead of a later
# failure when trying to open an empty file name.
parser.add_argument("--data_type", help="chiapet, hic or tad", required=True,
                    choices=["chiapet", "hic", "tad"])
# The region is used as a filter value in the SQL query; it is left
# unrestricted because the set of regions stored in the database is not
# visible from this script.
parser.add_argument("--region", help="prerna, gene, exon or intron",
                    required=True)
args = parser.parse_args()
def obtain_percent_by_gene(cnx: sqlite3.Connection, query: str) -> \
        pd.DataFrame:
    """
    Run a query and return its first two columns as a gene/percent dataframe,
    e.g.:
    id_gene percent
    1 36.78145
    2 35.19284
    3 36.10462

    :param cnx: open SQLite connection on which the query is executed
    :param query: SQL query whose rows hold the gene identifier in their \
    first column and the percentage in their second column
    :return: dataframe with the columns id_gene (identifiers converted to \
    str) and percent (values as returned by the database)
    """
    rows = cnx.cursor().execute(query).fetchall()
    gene_ids = []
    percents = []
    for row in rows:
        # Identifiers are stored as text so they can later be merged with
        # identifiers read from text files.
        gene_ids.append(str(row[0]))
        percents.append(row[1])
    return pd.DataFrame({"id_gene": gene_ids, "percent": percents})
def obtain_community_info() -> pd.DataFrame:
    """
    Read the cluster file matching args.data_type (chiapet, hic or tad) and
    list, for every kept community, the genes it contains, e.g.:
    id_gene community
    13234 C111
    13333 C111
    9936 C29

    Each non-header line is tab-separated: the first field is the community
    name, the second field is an integer (presumably the community size) and
    the second-to-last field is a comma-separated list of genes. Only lines
    whose second field is at least 10 are kept.

    :return: dataframe with one row per (gene, community) pair
    """
    # Name of the Config attribute holding the cluster file of each data type.
    attribute_by_data_type = {
        "chiapet": "chiapet_clusters",
        "hic": "hic_clusters",
        "tad": "tad_clusters",
    }
    attribute = attribute_by_data_type.get(args.data_type)
    cluster_file = getattr(Config, attribute) if attribute is not None else ""
    pairs = []
    with open(cluster_file) as handle:
        for raw_line in handle:
            # Header lines start with the column name "community".
            if raw_line.startswith("community"):
                continue
            fields = raw_line.rstrip().split("\t")
            if int(fields[1]) < 10:
                continue
            community = fields[0]
            for gene in fields[-2].split(','):
                pairs.append((gene.replace(" ", ""), community))
    return pd.DataFrame({
        "id_gene": [gene for gene, _ in pairs],
        "community": [community for _, community in pairs],
    })
def obtain_percent_by_community(nt: str) -> pd.DataFrame:
    """
    Produce a dataframe giving, for each community, the median percentage of
    the studied nucleotide over its genes, for the region given on the
    command line (args.region), e.g.:
    community percent_A
    C1 49.341263
    C10 49.238067
    C100 49.358244

    The per-gene frequencies are read from the cin_gene_frequency table of
    the database Config.db_file and joined on id_gene with the gene/community
    table returned by obtain_community_info().

    :param nt: the nucleotide whose percentage is studied in the communities
    :return df_per_com: see above for dataframe description
    """
    # NOTE: nt and args.region are inserted as-is in the SQL text, so they
    # must come from a trusted source.
    query = f"""SELECT id_gene, frequency
                FROM cin_gene_frequency
                WHERE (ft_type == 'nt' and
                       ft == '{nt}' and
                       region == '{args.region}')"""
    cnx = sqlite3.connect(Config.db_file)
    try:
        df_percent = obtain_percent_by_gene(cnx, query)
    finally:
        # Always release the database handle, even if the query fails.
        cnx.close()
    df_comu = obtain_community_info()
    df_comu['id_gene'] = df_comu['id_gene'].astype(str)
    new_df = pd.merge(df_comu, df_percent, on="id_gene")
    # Take the median of the numeric column only: the merged frame still
    # holds the string column id_gene, and recent pandas versions raise a
    # TypeError instead of silently dropping non-numeric columns.
    df_per_com = new_df.groupby("community")["percent"].median().reset_index()
    df_per_com.rename(columns={'percent': f'percent_{nt}'}, inplace=True)
    return df_per_com
def normalise_percent(nt: str) -> pd.DataFrame:
    """
    Get the median percentage of a nucleotide for every community with
    obtain_percent_by_community(), then express each value as its relative
    deviation, in percent, from the median of all communities:
    (value - median) / median * 100, e.g.:
    community new_percent_R
    C1 0.017281
    C10 -0.085915
    C100 0.034263

    :param nt: the nucleotide whose percentage is studied in the communities
    :return: dataframe where the column percent_<nt> has been replaced by \
    the normalized column new_percent_<nt>
    """
    raw_column = f"percent_{nt}"
    table = obtain_percent_by_community(nt)
    # Reference value: median of the per-community medians.
    reference = table[raw_column].median()
    deviation = (table[raw_column] - reference) / reference
    table[f"new_percent_{nt}"] = deviation * 100
    # Only the normalized column is kept in the result.
    table.drop(columns=[raw_column], inplace=True)
    return table
def three_d_plot():
    """
    Build the three dimension plot showing the distribution of the spatial
    clusters according to their normalized S (x axis), R (y axis) and M
    (z axis) composition, and save it as an HTML file.

    The file is written in the directory Config.three_d_repo (created when
    missing) and is named after args.region and args.data_type.
    """
    def axis_extent(values):
        # Ends of the segment drawn along one axis: it covers the data range
        # and always reaches the origin.
        return min(min(values), 0), max(max(values), 0)

    per_nt = {nt: normalise_percent(nt) for nt in ("R", "S", "M")}
    merged = per_nt["R"].merge(per_nt["S"], on="community")
    merged = merged.merge(per_nt["M"], on="community")
    s_values = merged["new_percent_S"]
    r_values = merged["new_percent_R"]
    m_values = merged["new_percent_M"]

    # One marker per community.
    points = go.Scatter3d(x=s_values, y=r_values, z=m_values,
                          mode="markers",
                          marker={"size": 6, "opacity": .9})

    # Three segments through the origin, one per axis; the None entries
    # separate the segments inside a single trace.
    s_low, s_high = axis_extent(s_values)
    r_low, r_high = axis_extent(r_values)
    m_low, m_high = axis_extent(m_values)
    axes = go.Scatter3d(x=[s_low, s_high, None, 0, 0, None, 0, 0],
                        y=[0, 0, None, r_low, r_high, None, 0, 0],
                        z=[0, 0, None, 0, 0, None, m_low, m_high],
                        mode='lines', name='lines')

    title = f"Répartition des clusters spatiaux (type de données : " \
            f"{args.data_type}) en fonction de la composition de leurs " \
            f"{args.region} en R, S et M"
    fig = go.Figure(data=[points, axes])
    fig.update_traces(line={"color": 'black', "width": 5})
    fig.update_layout(showlegend=False,
                      scene={"xaxis_title": 'S', "yaxis_title": 'R',
                             "zaxis_title": 'M'},
                      title=title)

    pathlib.Path(Config.three_d_repo).mkdir(exist_ok=True, parents=True)
    file_name = f"3d_figures_{args.region}_{args.data_type}.html"
    fig.write_html(Config.three_d_repo / file_name)
if __name__ == "__main__":
    # Build and save the figure only when the file is executed as a script.
    three_d_plot()