graph_2d_3d.py

#!/usr/bin/env python3

# -*- coding: UTF-8 -*-

"""
Script that produces a three-dimensional plot showing the distribution of
spatial clusters according to their normalized M, R and S gene composition.
It can be run for the prerna, gene, exon or intron region and for hic, chiapet
or tad data.
"""

import pandas as pd
from .config_figures import Config
import argparse
import sqlite3
import pathlib
import plotly.graph_objects as go


# Command-line interface. Both options are mandatory; the parsed values are
# read by the functions below through the module-level ``args`` namespace.
parser = argparse.ArgumentParser(description="See script description")
# --data_type is restricted to the three values the script knows how to map
# to a cluster file, so that a typo is rejected by argparse with a clear
# usage message instead of failing later when an empty path is opened.
parser.add_argument("--data_type", help="chiapet, hic or tad", required=True,
                    choices=["chiapet", "hic", "tad"])
# --region is interpolated into the SQL filter on the ``region`` column.
parser.add_argument("--region", help="prerna, gene, exon or intron",
                    required=True)
args = parser.parse_args()


def obtain_percent_by_gene(cnx: sqlite3.Connection, query: str) -> \
        pd.DataFrame:
    """
    Run a two-column SQL query and return its rows as a dataframe, e.g.:
    id_gene         percent
    1               36.78145
    2               35.19284
    3               36.10462

    The first selected column is converted to a string and stored as
    ``id_gene``; the second one is stored unchanged as ``percent``.

    :param cnx: an open connection to the sqlite database to query
    :param query: the SQL query to execute; it must select the gene id first
    and the value second
    :return: the dataframe described above, one row per fetched record
    """
    records = cnx.cursor().execute(query).fetchall()
    gene_ids = []
    percents = []
    for record in records:
        # Gene ids are kept as text so they can be merged with ids read
        # from text files.
        gene_ids.append(str(record[0]))
        percents.append(record[1])
    return pd.DataFrame({"id_gene": gene_ids, "percent": percents})


def obtain_community_info() -> pd.DataFrame:
    """
    Read the cluster file matching ``args.data_type`` (hic, chiapet or tad)
    and return, for each gene, the community in which it is found. Only
    communities whose second column is at least 10 are kept. The resulting
    dataframe has these columns, e.g.:
    id_gene     community
    13234       C111
    13333       C111
    9936        C29

    :return df_comu: see above for dataframe description
    :raises ValueError: if ``args.data_type`` is not chiapet, hic or tad
    """
    if args.data_type == "chiapet":
        my_file = Config.chiapet_clusters
    elif args.data_type == "hic":
        my_file = Config.hic_clusters
    elif args.data_type == "tad":
        my_file = Config.tad_clusters
    else:
        # Fail with an explicit message rather than trying to open an
        # empty path.
        raise ValueError(f"Unknown data_type '{args.data_type}': expected "
                         f"chiapet, hic or tad")
    list_gene_cluster = []
    list_communities = []
    with open(my_file) as my_file_i:
        for line in my_file_i:
            l_clean = line.rstrip()
            # Skip blank lines (e.g. a trailing empty line) and the header
            # line, which starts with "community".
            if not l_clean or l_clean.startswith("community"):
                continue
            elmt = l_clean.split("\t")
            # Second column is presumably the community size (number of
            # genes) -- confirm against the cluster file format.
            if int(elmt[1]) < 10:
                continue
            # The comma-separated gene list is the second-to-last column.
            for gene in elmt[-2].split(','):
                list_gene_cluster.append(gene.replace(" ", ""))
                list_communities.append(elmt[0])
    data = {"id_gene": list_gene_cluster, "community": list_communities}
    df_comu = pd.DataFrame(data)
    return df_comu


def obtain_percent_by_community(nt: str) -> pd.DataFrame:
    """
    Build a dataframe giving, for each community, the median over its genes
    of the percentage of the studied nt, for the region given on the command
    line (prerna, gene, exon or intron), e.g.:
    community       percent_A
    C1              49.341263
    C10             49.238067
    C100            49.358244

    :param nt: the nucleotide whose percentage is studied in the communities
    :return df_per_com: see above for dataframe description
    """
    # NOTE(review): the query is built by string interpolation of ``nt`` and
    # ``args.region``; obtain_percent_by_gene() only accepts a finished query
    # string, so the values cannot be bound as SQL parameters from here.
    cnx = sqlite3.connect(Config.db_file)
    try:
        df_percent = obtain_percent_by_gene(
            cnx,
            f"""SELECT id_gene, frequency
                                            FROM cin_gene_frequency
                                            WHERE (ft_type == 'nt' and
                                            ft == '{nt}' and
                                            region == '{args.region}')""")
    finally:
        # All rows have been fetched at this point, so the connection can be
        # released instead of being left open until garbage collection.
        cnx.close()
    df_comu = obtain_community_info()
    df_comu['id_gene'] = df_comu['id_gene'].astype(str)
    # Inner join: only genes present in both the clusters and the database.
    new_df = pd.merge(df_comu, df_percent, on="id_gene")
    # Take the median of the numeric ``percent`` column only: ``id_gene``
    # holds strings, and recent pandas versions raise a TypeError when asked
    # for the median of a non-numeric column instead of silently dropping it.
    df_per_com = new_df.groupby("community")["percent"].median().reset_index()
    df_per_com.rename(columns={'percent': f'percent_{nt}'}, inplace=True)
    return df_per_com


def normalise_percent(nt: str) -> pd.DataFrame:
    """
    Call obtain_percent_by_community() for the studied nt to get the median
    percentage of each community, then express that value as a relative
    deviation (multiplied by 100) from the median of all communities. The raw
    percentage column is replaced by the normalized one, e.g.:
    community           new_percent_R
    C1                  0.017281
    C10                 -0.085915
    C100                0.034263

    :param nt: the nucleotide whose percentage is studied in the communities
    :return: see above for dataframe description
    """
    per_community = obtain_percent_by_community(nt)
    # Take the raw column out of the frame; only the normalized one remains
    # in the result.
    raw_percent = per_community.pop(f"percent_{nt}")
    reference = raw_percent.median()
    deviation = raw_percent - reference
    per_community[f"new_percent_{nt}"] = (deviation / reference) * 100
    return per_community


def three_d_plot():
    """
    Build the three-dimensional scatter plot of the spatial clusters placed
    according to their normalized S (x axis), R (y axis) and M (z axis)
    composition, draw the three axes through the origin, and write the figure
    as an HTML file in the Config.three_d_repo directory.
    """

    def _extent(values):
        # Range covered by the values, widened if needed to include 0.
        return min(min(values), 0), max(max(values), 0)

    norm_r = normalise_percent("R")
    norm_s = normalise_percent("S")
    norm_m = normalise_percent("M")
    clusters = pd.merge(pd.merge(norm_r, norm_s, on="community"), norm_m,
                        on="community")

    points = go.Scatter3d(x=clusters["new_percent_S"],
                          y=clusters["new_percent_R"],
                          z=clusters["new_percent_M"],
                          mode="markers", marker=dict(size=6, opacity=.9))

    s_low, s_high = _extent(clusters['new_percent_S'])
    r_low, r_high = _extent(clusters['new_percent_R'])
    m_low, m_high = _extent(clusters['new_percent_M'])
    # Three segments in one trace, one per axis; None separates the segments.
    axes = go.Scatter3d(x=[s_low, s_high, None, 0, 0, None, 0, 0],
                        y=[0, 0, None, r_low, r_high, None, 0, 0],
                        z=[0, 0, None, 0, 0, None, m_low, m_high],
                        mode='lines', name='lines')

    title = f"Répartition des clusters spatiaux (type de données : " \
            f"{args.data_type}) en fonction de la composition de leurs " \
            f"{args.region} en R, S et M"
    fig = go.Figure(data=[points, axes])
    fig.update_traces(line=dict(color='black', width=5))
    fig.update_layout(showlegend=False,
                      scene=dict(xaxis_title='S', yaxis_title='R',
                                 zaxis_title='M'),
                      title=title)

    # Create the output directory (and its parents) if it does not exist yet.
    pathlib.Path(Config.three_d_repo).mkdir(exist_ok=True, parents=True)
    html_name = f"3d_figures_{args.region}_{args.data_type}.html"
    fig.write_html(Config.three_d_repo / html_name)


if __name__ == "__main__":
    three_d_plot()