Skip to content
Snippets Groups Projects
Select Git revision
  • 660b46b9747b6f062d908406ef54321a497bc81d
  • master default protected
  • dev
  • non-b_dev
  • distance
  • Ali_branch
6 results

pointplot_component_comparison.py

Blame
  • pointplot_component_comparison.py 3.89 KiB
    #!/usr/bin/env python3
    
    # -*- coding: UTF-8 -*-
    
    """
    Description: The goal of this script is to create a pointplot showing the \
    ratio between nucleotides/codon/aa in every community of a community \
    file
    """
    from pathlib import Path
    from typing import List
    
    import lazyparser as lp
    import matplotlib.pyplot as plt
    import pandas as pd
    import seaborn as sns
    
    from ..find_interaction_cluster.nt_and_community import create_dataframe
    from .config_figures import Config
    
    
    def build_frequency_dataframe(
        community_file: str,
        comparison: List[str],
        region: str = "",
        component_type: str = "nt",
        size_threshold: int = -1,
    ) -> pd.DataFrame:
        """
        Build a table holding, for every gene, the requested ratios of \
        component frequencies.

        :param community_file: A community file
        :param comparison: The ratios to compute, each one written as two \
        component names separated by "/" (example: A/T)
        :param region: the region of interest to extract from gene
        :param component_type: The type of component to analyse; It \
        can be 'nt', 'dnt' or 'aa'.
        :param size_threshold: Value forwarded unchanged to \
        create_dataframe (presumably a community size filter, to be \
        confirmed in that function)
        :return: A copy of the "community" and "id_gene" columns plus one \
        column per comparison containing numerator / denominator
        :raises ValueError: If a comparison does not contain exactly one "/"
        """
        # Validate every comparison up front so that a malformed request
        # fails immediately, before the frequency table is loaded.
        ratios = []
        for c in comparison:
            if c.count("/") != 1:
                raise ValueError(
                    "Wrong comparison: it should always contain a / inside"
                    f" (problem with {c})"
                )
            numerator, denominator = c.split("/")
            ratios.append((c, numerator, denominator))
        df = create_dataframe(
            community_file, "gene", region, component_type, True, size_threshold
        )
        for c, numerator, denominator in ratios:
            # Each ratio is stored in a column named after the comparison.
            df[c] = df[numerator] / df[denominator]
        # list(...) lets any sequence of comparisons (tuple, ...) be used.
        return df[["community", "id_gene"] + list(comparison)].copy()
    
    
    def create_pointplot(
        df: pd.DataFrame, target_col: str, outfile: Path
    ) -> None:
        """
        Create a pointplot showing the value defined in target_col

        Communities are displayed on the x axis by decreasing mean of \
        target_col and a red horizontal line is drawn at y = 1.

        :param df: A dataframe containing ratio of frequencies of \
        components of interest. It must contain a "community" column and \
        the target_col column
        :param target_col: The name of the column that will be represented \
        on y axis
        :param outfile: File where the figure will be stored
        """
        # Rank communities by their mean ratio. The column is selected
        # before aggregating: the first positional parameter of
        # GroupBy.mean is ``numeric_only``, not a column name.
        community_order = (
            df.groupby("community")[target_col]
            .mean()
            .sort_values(ascending=False)
            .index.to_list()
        )
        # Work on a copy so that the caller's dataframe is left untouched.
        plot_df = df.copy()
        # An ordered categorical makes the sort (and so the x axis) follow
        # the ranking computed above.
        plot_df["community"] = pd.Categorical(
            plot_df["community"], categories=community_order, ordered=True
        )
        plot_df = plot_df.sort_values("community")
        sns.set(style="whitegrid", font_scale=2)
        g = sns.catplot(
            x="community",
            y=target_col,
            data=plot_df,
            kind="point",
            errorbar="sd",
            aspect=1.7,
            height=10,
        )
        # Reference line: a ratio of 1 means both components are as frequent.
        g.ax.axhline(1, color="red")
        g.set(ylabel=f"Ratio of {target_col}", xlabel="", title="", xticklabels=[])
        g.savefig(outfile)
        # Release the figure so repeated calls do not accumulate figures.
        plt.clf()
        plt.close()
    
    
    def make_figures(
        community_file: str,
        comparison: List[str],
        component_type: str,
        region: str,
    ) -> None:
        """
        Create the pointplot figure of the ratio of a component

        One pdf is written in the Config.poinplot_ratio folder for each \
        comparison. Its name is made of the community file stem, the \
        region, the component type and the comparison with "/" replaced \
        by "vs".

        :param community_file: The community file to use for this analysis
        :param comparison: The comparisons to make
        :param component_type: The component type used
        :param region: The target gene regions
        """
        # parents=True: also create missing parent folders, so the first run
        # on a fresh result tree does not fail with FileNotFoundError.
        Config.poinplot_ratio.mkdir(parents=True, exist_ok=True)
        df = build_frequency_dataframe(
            community_file, comparison, region, component_type
        )
        file_prefix = f"{Path(community_file).stem}_{region}_{component_type}"
        for c in comparison:
            # "/" cannot be used inside a file name.
            cn = c.replace("/", "vs")
            outfile = Config.poinplot_ratio / f"{file_prefix}_{cn}.pdf"
            create_pointplot(df, c, outfile)
    
    
    # Command-line entry point: a thin wrapper that forwards its four
    # arguments to make_figures.
    # NOTE(review): lazyparser presumably builds the command-line options and
    # their help texts from this signature and from the ":param" lines of
    # the docstring - confirm before rewording the docstring below.
    @lp.parse
    def main(
        community_file: str,
        comparison: List[str],
        component_type: str,
        region: str,
    ) -> None:
        """
        Create the pointplot figure of the ratio of a component
    
        :param community_file: The community file to use for this analysis
        :param comparison: The comparisons to make
        :param component_type: The component type used
        :param region: The target gene regions
        """
        make_figures(community_file, comparison, component_type, region)
    
    
    # Launch the command-line entry point only when this file is executed,
    # not when it is imported. The relative imports at the top of the file
    # require it to be run as a module of its package (python -m ...).
    if __name__ == "__main__":
        main()