Select Git revision
pointplot_component_comparison.py

nfontrod authored
pointplot_component_comparison.py 3.89 KiB
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""
Description: The goal of this script is to create a pointplot showing the \
ratio between nucleotides/codon/aa in every community of a community \
file
"""
from pathlib import Path
from typing import List
import lazyparser as lp
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from ..find_interaction_cluster.nt_and_community import create_dataframe
from .config_figures import Config
def build_frequency_dataframe(
    community_file: str,
    comparison: List[str],
    region: str = "",
    component_type: str = "nt",
    size_threshold: int = -1,
) -> pd.DataFrame:
    """
    Build a dataframe holding, for every gene, the ratio between two \
    component frequencies for each requested comparison.

    :param community_file: A community file
    :param comparison: The components to compare, each written as \
    "numerator/denominator" (example: A/T)
    :param region: the region of interest to extract from gene
    :param component_type: The type of component to analyse; It \
    can be 'nt', 'dnt' or 'aa'.
    :param size_threshold: Value forwarded unchanged to create_dataframe \
    (NOTE(review): presumably a community size cutoff - confirm in \
    create_dataframe)
    :return: A copy of the columns "community" and "id_gene" plus one \
    ratio column per comparison (named after the comparison string)
    :raises ValueError: if a comparison does not contain exactly one "/"
    """
    # Validate every comparison before building the dataframe so that a
    # malformed request fails immediately instead of after the (possibly
    # costly) call to create_dataframe.
    pairs = []
    for c in comparison:
        if c.count("/") != 1:
            raise ValueError(
                "Wrong comparison: it should always contain a / inside"
                + f" (problem with {c})"
            )
        numerator, denominator = c.split("/")
        pairs.append((c, numerator, denominator))

    df = create_dataframe(
        community_file, "gene", region, component_type, True, size_threshold
    )
    for c, numerator, denominator in pairs:
        # NOTE(review): a zero denominator is not handled here; with pandas
        # float division it would presumably give inf/NaN - confirm.
        df[c] = df[numerator] / df[denominator]
    return df[["community", "id_gene", *comparison]].copy()
def create_pointplot(
    df: pd.DataFrame, target_col: str, outfile: Path
) -> None:
    """
    Create a pointplot showing the value defined in target_col for every \
    community, with communities ordered by decreasing mean of target_col.

    :param df: A dataframe containing ratio of frequencies of \
    components of interest; it must have a "community" column and a \
    column named target_col
    :param target_col: The name of the column that will be represented \
    on y axis
    :param outfile: File where the figure will be stored
    """
    # Select the column *before* aggregating: GroupBy.mean takes
    # `numeric_only` as its first positional parameter, so passing the
    # column name to mean() does not select that column.
    community_order = (
        df.groupby("community")[target_col]
        .mean()
        .sort_values(ascending=False)
        .index.to_list()
    )
    # An ordered categorical makes the x axis follow the mean-based order.
    ordered_df = df.copy()
    ordered_df["community"] = pd.Categorical(
        ordered_df["community"], categories=community_order, ordered=True
    )
    ordered_df = ordered_df.sort_values("community")

    sns.set(style="whitegrid", font_scale=2)
    g = sns.catplot(
        x="community",
        y=target_col,
        data=ordered_df,
        kind="point",
        errorbar="sd",
        aspect=1.7,
        height=10,
    )
    # Reference line at ratio 1 (numerator equals denominator).
    g.ax.axhline(1, color="red")
    g.set(ylabel=f"Ratio of {target_col}", xlabel="", title="", xticklabels=[])
    g.savefig(outfile)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.clf()
    plt.close()
def make_figures(
    community_file: str,
    comparison: List[str],
    component_type: str,
    region: str,
) -> None:
    """
    Create one pointplot figure per comparison, showing the ratio of \
    two components in every community.

    Figures are written as PDF files inside Config.poinplot_ratio and are \
    named <community file stem>_<region>_<component_type>_<comparison>.pdf \
    where the "/" of the comparison is replaced by "vs".

    :param community_file: The community file to use for this analysis
    :param comparison: The comparisons to make
    :param component_type: The component type used
    :param region: The target gene regions
    """
    # parents=True so that a missing parent output directory does not make
    # the whole run fail with FileNotFoundError.
    Config.poinplot_ratio.mkdir(parents=True, exist_ok=True)
    df = build_frequency_dataframe(
        community_file, comparison, region, component_type
    )
    # The file stem does not depend on the comparison: compute it once.
    stem = Path(community_file).stem
    for c in comparison:
        # "/" cannot appear in a file name, hence the "vs" substitution.
        cn = c.replace("/", "vs")
        outfile = (
            Config.poinplot_ratio
            / f"{stem}_{region}_{component_type}_{cn}.pdf"
        )
        create_pointplot(df, c, outfile)
# NOTE(review): lazyparser appears to build the command line help from the
# docstring below, so its ":param" lines are kept as they are - confirm
# before rewording them.
@lp.parse
def main(
    community_file: str,
    comparison: List[str],
    component_type: str,
    region: str,
) -> None:
    """
    Create the pointplot figure of the ratio of a component
    :param community_file: The community file to use for this analysis
    :param comparison: The comparisons to make
    :param component_type: The component type used
    :param region: The target gene regions
    """
    # Command line entry point: forward the parsed arguments as they are.
    make_figures(
        community_file=community_file,
        comparison=comparison,
        component_type=component_type,
        region=region,
    )
# Run the command line interface only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()