# pointplot_component_comparison.py

#!/usr/bin/env python3

# -*- coding: UTF-8 -*-

"""
Description: The goal of this script is to create a pointplot showing, \
for every community of a community file, the ratio between two \
components (nucleotides, codons or amino acids).
"""
from pathlib import Path
from typing import List

import lazyparser as lp
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from ..find_interaction_cluster.nt_and_community import create_dataframe
from .config_figures import Config


def build_frequency_dataframe(
    community_file: str,
    comparison: List[str],
    region: str = "",
    component_type: str = "nt",
    size_threshold: int = -1,
) -> pd.DataFrame:
    """
    Build a dataframe holding, for every requested comparison, the ratio \
    between the two compared components.

    :param community_file: A community file
    :param comparison: The components to compare, each one written as two \
    component names separated by a single "/" (example: A/T)
    :param region: the region of interest to extract from gene
    :param component_type: The type of component to analyse; It \
    can be 'nt', 'dnt' or 'aa'.
    :param size_threshold: Value forwarded unchanged to create_dataframe \
    (presumably a community size cut-off -- confirm in create_dataframe)
    :return: A dataframe with the columns 'community' and 'id_gene' plus \
    one ratio column per comparison, named after the comparison itself
    :raises ValueError: If a comparison does not contain exactly one "/"
    """
    # Validate every comparison before loading anything, so that a malformed
    # request fails immediately instead of after the dataframe was built.
    pairs = {}
    for c in comparison:
        if c.count("/") != 1:
            raise ValueError(
                "Wrong comparison: it should always contain a / inside"
                + f" (problem with {c})"
            )
        pairs[c] = c.split("/")
    df = create_dataframe(
        community_file, "gene", region, component_type, True, size_threshold
    )
    # Each ratio is stored in a column named like the comparison (e.g. "A/T").
    for c, (nt1, nt2) in pairs.items():
        df[c] = df[nt1] / df[nt2]
    return df[["community", "id_gene"] + comparison].copy()


def create_pointplot(
    df: pd.DataFrame, target_col: str, outfile: Path
) -> None:
    """
    Create a pointplot showing the value defined in target_col

    Communities are placed on the x axis by decreasing mean of \
    target_col, a red horizontal line is drawn at y = 1 and the figure \
    is saved in outfile.

    :param df: A dataframe containing ratio of frequencies of \
    components of interest. It must contain a 'community' column and \
    the column named target_col
    :param target_col: The name of the column that will be represented \
    on y axis
    :param outfile: File where the figure will be stored
    """
    # Rank the communities by decreasing mean of the target column only.
    # The column is selected explicitly: the first positional argument of
    # DataFrameGroupBy.mean is `numeric_only`, not a column name.
    community_order = (
        df.groupby("community")[target_col]
        .mean()
        .sort_values(ascending=False)
        .index.to_list()
    )
    # Work on a copy so the caller's dataframe is left untouched.
    plot_data = df.copy()
    # An ordered categorical makes the sort below (and so the x axis)
    # follow the ranking computed above.
    plot_data["community"] = pd.Categorical(
        plot_data["community"], categories=community_order, ordered=True
    )
    plot_data = plot_data.sort_values("community")
    sns.set(style="whitegrid", font_scale=2)
    g = sns.catplot(
        x="community",
        y=target_col,
        data=plot_data,
        kind="point",
        errorbar="sd",
        aspect=1.7,
        height=10,
    )
    # Reference line at a ratio of 1.
    g.ax.axhline(1, color="red")
    g.set(ylabel=f"Ratio of {target_col}", xlabel="", title="", xticklabels=[])
    g.savefig(outfile)
    # Clear and close the current figure once it has been saved.
    plt.clf()
    plt.close()


def make_figures(
    community_file: str,
    comparison: List[str],
    component_type: str,
    region: str,
) -> None:
    """
    Create the pointplot figure of the ratio of a component

    One pdf figure is written per comparison in the folder \
    Config.poinplot_ratio, which is created if needed.

    :param community_file: The community file to use for this analysis
    :param comparison: The comparisons to make
    :param component_type: The component type used
    :param region: The target gene regions
    """
    # parents=True: also create missing parent folders instead of failing
    # with FileNotFoundError when the output tree does not exist yet.
    Config.poinplot_ratio.mkdir(parents=True, exist_ok=True)
    df = build_frequency_dataframe(
        community_file, comparison, region, component_type
    )
    for c in comparison:
        # "/" cannot appear in a file name, hence e.g. "A/T" -> "AvsT".
        cn = c.replace("/", "vs")
        outfile = (
            Config.poinplot_ratio
            / f"{Path(community_file).stem}_{region}_{component_type}_{cn}.pdf"
        )
        create_pointplot(df, c, outfile)


# NOTE(review): the lazyparser `parse` decorator presumably derives the
# command-line options and their help text from this signature and from the
# `:param` entries of the docstring -- keep both in sync and confirm against
# the lazyparser documentation before rewording the docstring.
@lp.parse
def main(
    community_file: str,
    comparison: List[str],
    component_type: str,
    region: str,
) -> None:
    """
    Create the pointplot figure of the ratio of a component

    :param community_file: The community file to use for this analysis
    :param comparison: The comparisons to make
    :param component_type: The component type used
    :param region: The target gene regions
    """
    # Thin command-line wrapper: all the work is done by make_figures.
    make_figures(community_file, comparison, component_type, region)


# Only start the command-line interface when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()