#!/usr/bin/env python3

# -*- coding: utf-8 -*-

"""
Description: The goal of this script is to be able to get n most enriched \
components in communities given by the user and create a circle packing \
visualization of these components.
"""

import circlify
import lazyparser as lp
import pandas as pd
import polars as pl

from .circle_packing import (
    create_bubble_packing,
    create_colors,
    create_hierarchical_dictionnary,
)
from .community_hubs_umap import (
    create_dataframe,
)
from .config_figures import Config


def _enriched_components(
    df_freq: pd.DataFrame, groups: list, top_motif: int
) -> pd.DataFrame:
    """
    Build the long table of positively enriched components per group.

    The per-gene table is averaged per group, every component column is
    turned into a log2 fold-change against its mean over the groups, and
    only components reaching the ``top_motif`` best dense ranks in at
    least one group are kept, restricted to rows with a log2fc > 0.

    :param df_freq: Table with an 'id_gene', a 'group' and a
        'community_size' column; every other column is handled as a
        component value
    :param groups: Group names, in the order used to sort the result
    :param top_motif: Number of best ranks kept in each group
    :return: A table with the columns 'group', 'cpnt' and 'log2fc',
        sorted by group (following ``groups``) then decreasing log2fc
    """
    id_cols = ["group", "community_size"]
    df = pl.from_pandas(df_freq).drop("id_gene").group_by(id_cols).mean()
    value_cols = [c for c in df.columns if c not in id_cols]
    # Each expression only reads its own column, so all the fold-changes
    # can be computed in a single pass instead of one call per column.
    df = df.with_columns(
        [(pl.col(c) / pl.col(c).mean()).log(base=2) for c in value_cols]
    )
    df = (
        df.melt(
            id_vars=["community_size", "group"],
            value_name="log2fc",
            variable_name="cpnt",
        )
        # log2 yields NaN for 0/0 or a negative ratio and -inf for a null
        # value. Polars orders NaN above every other float: it would take
        # the first rank and pass the `log2fc > 0` filter below.
        .filter(pl.col("log2fc").is_finite())
        .sort(["group", "log2fc"], descending=[False, True])
        .with_columns(
            pl.col("log2fc")
            .rank(method="dense", descending=True)
            .over("group")
            .alias("rank"),
            pl.col("log2fc").round(2),
        )
    )
    # A component in the top of any group is displayed in every group
    # where it is enriched.
    cpnt_2_keep = (
        df.filter(pl.col("rank") <= top_motif)["cpnt"].unique().to_list()
    )
    df = df.filter(
        (pl.col("cpnt").is_in(cpnt_2_keep)) & (pl.col("log2fc") > 0)
    )
    dfp = df.to_pandas().drop(["community_size", "rank"], axis=1)
    # The ordered categorical makes the sort follow the order of `groups`
    # instead of the alphabetical order.
    dfp["group"] = pd.Categorical(
        dfp["group"], categories=groups, ordered=True
    )
    return dfp.sort_values(["group", "log2fc"], ascending=[True, False])


@lp.parse(
    cpnt_type=[
        "5mer",
        "4mer",
        "nt",
        "aa",
        "codon",
        "dnt",
        "properties",
        "properties_lvl2",
        "prop",
    ],
    region=["gene", "premrna", "mrna", "cds", "prot"],
)
def main_circle_pack_hub(
    com_file: str,
    cpnt_type: str = "nt",
    region: str = "gene",
    top_motif: int = 5,
) -> None:
    """
    The goal of this script is to be able to get n most enriched \
    components in communities given by the user and create a circle packing \
    visualization of these components.

    :param com_file: A tab-separated community file containing a \
    'community' column
    :param cpnt_type: The component type of interest
    :param region: The region of the gene considered
    :param top_motif: The number of top motif to consider in each community
    """
    # NOTE: the backslash continuations above keep each description on a
    # single line of the docstring (presumably required by lazyparser,
    # which builds the command line help from it).
    Config.output_hubs_umap.mkdir(parents=True, exist_ok=True)
    groups = pd.read_csv(com_file, sep="\t")["community"].unique().tolist()
    df_freq = create_dataframe(
        com_file, "gene", region, cpnt_type, size_threshold=0
    ).rename({"community": "group"}, axis=1)
    dfp = _enriched_components(df_freq, groups, top_motif)
    res = [
        create_hierarchical_dictionnary(
            dfp,
            name_group="group",
            name_weight="log2fc",
        )
    ]
    colors_base = create_colors(groups)
    circles = circlify.circlify(
        res,
        show_enclosure=False,
        target_enclosure=circlify.Circle(x=0, y=0, r=1),
    )
    create_bubble_packing(
        circles,
        Config.output_hubs_umap
        / f"Circle_packing_log2fc_{region}_{cpnt_type}_{top_motif}.pdf",
        region,
        cpnt_type,
        colors_base,
    )


if __name__ == "__main__":
    main_circle_pack_hub()