Skip to content
Snippets Groups Projects
Select Git revision
  • 4d8776b97b7737e0fcf58c8ed09f2d5d7c393882
  • master default protected
  • dev
  • v2.0.0
  • v0.4.0
  • v0.3.0
  • v0.2.9
  • v0.2.8
  • v0.2.7
  • v0.2.6
  • v0.1.0
  • v0.2.5
  • v0.2.4
  • v0.2.3
  • v0.2.2
  • v0.2.1
  • v0.2.0
  • v0.1.2
18 results

docker_init.sh

Blame
  • graph_2d_3d.py 6.93 KiB
    #!/usr/bin/env python3
    
    # -*- coding: UTF-8 -*-
    
    """
    Script that produces a three-dimensional plot showing the distribution of
    spatial clusters according to their normalized M, R and S gene composition.
    It can be run for the prerna, gene, exon or intron region and for hic,
    chiapet or tad data.
    """
    
    import pandas as pd
    from .config_figures import Config
    import argparse
    import sqlite3
    import pathlib
    import plotly.graph_objects as go
    
    
    # Command-line interface. The arguments are parsed at module level, so the
    # resulting ``args`` namespace is read as a global by the functions below.
    parser = argparse.ArgumentParser(description="See script description")
    # Kind of spatial data whose cluster file is loaded (chiapet, hic or tad).
    parser.add_argument("--data_type", help="chiapet, hic or tad", required=True)
    # Region used to filter the nucleotide frequencies read from the database.
    parser.add_argument("--region", help="prerna, gene, exon or intron",
                        required=True)
    # NOTE: this statement runs whenever the module is executed or imported;
    # argparse exits with a usage error when a required argument is missing.
    args = parser.parse_args()
    
    
    def obtain_percent_by_gene(cnx: sqlite3.Connection, query: str) -> \
            pd.DataFrame:
        """
        Run ``query`` on ``cnx`` and return its rows as a two-column dataframe.

        The first value of every row is converted to ``str`` and stored in the
        ``id_gene`` column; the second value is stored unchanged in the
        ``percent`` column, e.g.:
        id_gene         percent
        1               36.78145
        2               35.19284
        3               36.10462

        :param cnx: open connection to the SQLite database to query
        :param query: SQL query whose rows hold a gene id followed by a
        percentage
        :return: dataframe with the columns ``id_gene`` and ``percent``
        """
        rows = cnx.cursor().execute(query).fetchall()
        gene_ids, percents = [], []
        for row in rows:
            # Gene ids are kept as strings so they can be merged with the ids
            # read from the cluster files.
            gene_ids.append(str(row[0]))
            percents.append(row[1])
        return pd.DataFrame({"id_gene": gene_ids, "percent": percents})
    
    
    def obtain_community_info() -> pd.DataFrame:
        """
        Build the gene-to-community table for the data type selected on the
        command line (``args.data_type``: hic, chiapet or tad).

        The matching cluster file declared in ``Config`` is read as
        tab-separated text. Lines starting with "community" (the header) and
        blank lines are skipped. On every other line the first field is the
        community name, the second field is an integer size and the
        second-to-last field is a comma-separated list of gene ids. Only
        communities whose size field is at least 10 are kept. The result has
        one row per gene, e.g.:
        id_gene     community
        13234       C111
        13333       C111
        9936        C29

        :return df_comu: see above for dataframe description
        :raises ValueError: if ``args.data_type`` is not chiapet, hic or tad
        """
        # Attribute names are resolved lazily so that only the cluster file of
        # the requested data type has to be declared in Config.
        config_attr_by_data_type = {
            "chiapet": "chiapet_clusters",
            "hic": "hic_clusters",
            "tad": "tad_clusters",
        }
        try:
            config_attr = config_attr_by_data_type[args.data_type]
        except KeyError:
            # Fail early with a clear message instead of trying to open "".
            raise ValueError(
                f"Unknown data_type {args.data_type!r}: expected one of "
                f"{', '.join(config_attr_by_data_type)}") from None
        my_file = getattr(Config, config_attr)

        list_gene_cluster = []
        list_communities = []
        with open(my_file) as my_file_i:
            for line in my_file_i:
                l_clean = line.rstrip()
                # Skip the header line and blank lines (a blank line would
                # otherwise fail on the field access below).
                if not l_clean or l_clean.startswith("community"):
                    continue
                elmt = l_clean.split("\t")
                if int(elmt[1]) < 10:
                    continue
                # Gene ids may be written as "a, b, c": drop the spaces.
                genes = [gene.replace(" ", "") for gene in elmt[-2].split(",")]
                list_gene_cluster.extend(genes)
                list_communities.extend([elmt[0]] * len(genes))
        data = {"id_gene": list_gene_cluster, "community": list_communities}
        df_comu = pd.DataFrame(data)
        return df_comu
    
    
    def obtain_percent_by_community(nt: str) -> pd.DataFrame:
        """
        Compute, for every community, the median percentage of the studied
        nucleotide over the genes of that community, for the region given on
        the command line (prerna, gene, exon or intron), e.g.:
        community       percent_A
        C1              49.341263
        C10             49.238067
        C100            49.358244

        The per-gene percentages are read from the ``cin_gene_frequency`` table
        of the database ``Config.db_file`` and joined with the gene-to-community
        table returned by obtain_community_info(). Genes missing from either
        side are dropped by the (inner) merge.

        :param nt: the nucleotide that we want to study the percentage in the
        communities
        :return df_per_com: dataframe with the columns ``community`` and
        ``percent_<nt>``
        """
        cnx = sqlite3.connect(Config.db_file)
        try:
            # NOTE(review): nt and args.region are interpolated directly into
            # the SQL text; obtain_percent_by_gene() only accepts a finished
            # query string, so they cannot be bound as parameters here. Only
            # pass trusted values.
            df_percent = obtain_percent_by_gene(cnx,
                                                f"""SELECT id_gene, frequency
                                                FROM cin_gene_frequency
                                                WHERE (ft_type == 'nt' and 
                                                ft == '{nt}' and 
                                                region == '{args.region}')""")
        finally:
            # All rows are already loaded in memory: release the connection.
            cnx.close()
        df_comu = obtain_community_info()
        df_comu['id_gene'] = df_comu['id_gene'].astype(str)
        new_df = pd.merge(df_comu, df_percent, on="id_gene")
        # Select the numeric column explicitly: taking the median of the whole
        # frame would also try to aggregate the string column id_gene, which
        # raises a TypeError with pandas >= 2.0.
        df_per_com = (new_df.groupby(["community"])["percent"]
                      .median().reset_index())
        df_per_com.rename(columns={'percent': f'percent_{nt}'}, inplace=True)
        return df_per_com
    
    
    def normalise_percent(nt: str) -> pd.DataFrame:
        """
        Express the median percentage of ``nt`` of every community relative to
        the median of those values over all communities.

        The per-community medians come from obtain_percent_by_community(). Each
        one is replaced by ``(value - median) / median * 100``, where ``median``
        is the median of the per-community values, e.g.:
        community           new_percent_R
        C1                  0.017281
        C10                 -0.085915
        C100                0.034263

        :param nt: the nucleotide that we want to study the percentage in the
        communities
        :return df_per_com_nt: dataframe with the columns ``community`` and
        ``new_percent_<nt>``
        """
        raw_column = f"percent_{nt}"
        normalised_column = f"new_percent_{nt}"

        df_per_com_nt = obtain_percent_by_community(nt)
        # Take the raw column out of the frame: only its normalized version is
        # returned to the caller.
        raw_percent = df_per_com_nt.pop(raw_column)
        reference = raw_percent.median()
        df_per_com_nt[normalised_column] = \
            ((raw_percent - reference) / reference) * 100
        return df_per_com_nt
    
    
    def three_d_plot():
        """
        Draw the three-dimensional scatter plot of the spatial clusters and save
        it as an HTML file.

        Every cluster is placed according to its normalized S (x axis), R
        (y axis) and M (z axis) composition, as returned by normalise_percent().
        One segment per axis is added as a visual reference; it runs from the
        smallest to the largest value found on that axis, extended when needed
        so that it always contains 0. The figure is written into
        ``Config.three_d_repo``, which is created when it does not exist.
        """
        # One normalized dataframe per nucleotide group, joined on "community".
        per_nt = {nt: normalise_percent(nt) for nt in ("R", "S", "M")}
        clusters = per_nt["R"].merge(per_nt["S"], on="community")
        clusters = clusters.merge(per_nt["M"], on="community")

        def extent(column):
            # Range covered by one axis segment, always including the origin.
            values = clusters[column]
            return min(min(values), 0), max(max(values), 0)

        s_low, s_high = extent("new_percent_S")
        r_low, r_high = extent("new_percent_R")
        m_low, m_high = extent("new_percent_M")

        points = go.Scatter3d(
            x=clusters["new_percent_S"],
            y=clusters["new_percent_R"],
            z=clusters["new_percent_M"],
            mode="markers",
            marker=dict(size=6, opacity=.9),
        )
        # A single trace holds the three segments; the None entries separate
        # them so that the end of one axis is not joined to the next one.
        axes = go.Scatter3d(
            x=[s_low, s_high, None, 0, 0, None, 0, 0],
            y=[0, 0, None, r_low, r_high, None, 0, 0],
            z=[0, 0, None, 0, 0, None, m_low, m_high],
            mode='lines',
            name='lines',
        )

        title = (
            f"Répartition des clusters spatiaux (type de données : "
            f"{args.data_type}) en fonction de la composition de leurs "
            f"{args.region} en R, S et M"
        )
        fig = go.Figure(data=[points, axes])
        fig.update_traces(line=dict(color='black', width=5))
        fig.update_layout(
            showlegend=False,
            scene=dict(xaxis_title='S', yaxis_title='R', zaxis_title='M'),
            title=title,
        )

        output_dir = Config.three_d_repo
        pathlib.Path(output_dir).mkdir(exist_ok=True, parents=True)
        file_name = f"3d_figures_{args.region}_{args.data_type}.html"
        fig.write_html(output_dir / file_name)
    
    
    # Build and save the figure only when the file is run as a script.
    if __name__ == "__main__":
        three_d_plot()