Skip to content
Snippets Groups Projects
Commit d742be18 authored by nfontrod's avatar nfontrod
Browse files

src/visu/figure_maker.py: add line at bins different between two conditions

parent 56afa7be
Branches
No related tags found
No related merge requests found
......@@ -18,7 +18,9 @@ from tqdm import tqdm
from loguru import logger
import matplotlib as mpl
import matplotlib.font_manager
from rpy2.robjects import r, pandas2ri
from rpy2.robjects import r, pandas2ri, FloatVector
from matplotlib import collections as mc
import numpy as np
def load_bed(bed: Path, bed_name: str) -> List[List[Union[int, str]]]:
......@@ -292,11 +294,118 @@ def create_df_summary(df_cov: pd.DataFrame, figure_type: str, nb_bin: int,
return df_sum, condition_col
def line_maker(list_pval, up_value, position=0):
"""
Create a list of lines, with their colour.
Create a line if the a p-value in list_pval[i] is below 0.05.
If up_mean[i] > down_mean[i] the line will be green, purple else.
:param list_pval: (list of float), list of pvalue get by the comparison
of a propensity scale in a particular sequence position in an
up-regulated and down_regulated set of sequences.
:param up_value: (int) the ordinate coordinates where the line will be
placed.
:param position: (int) the abscissa position to begin to draw the lines
:return: lines - (list of list of 2 tuple), the list of 2 tuple corresponds
to a lines with the coordinates [(x1, y1), (x2, y2)]
"""
lcolor = []
lines = []
for i in range(len(list_pval)):
if list_pval[i] <= 0.05:
val = i + position
lines.append([(val - 0.5, up_value), (val + 0.5, up_value)])
lcolor.append("#000000") # red
return lines, lcolor
def paired_t_test(values1: List[float], values2: List[float]) -> float:
"""
Get the p-value of a paired t-test for each bin
:param values1: A list of values
:param values2: Another list of values
:return: The p-values of the paired t-test
>>> paired_t_test([1, 2, 8], [5, 8, 15])
0.02337551764357566
"""
if len(values1) != len(values2):
raise IndexError("values1 and values2 should have the same length")
ttp = r("""
function(values1, values2) {
return(t.test(values1, values2, paired=T)$p.value)
}
""")
return ttp(FloatVector(values1), FloatVector(values2))[0]
def compute_stats(dff: pd.DataFrame, y_line: float, group_col: str,
outfile: Path) -> Tuple[List[List[Tuple]], List]:
"""
:param dff: A dataframe containing the coverage displayed in the figure
:param y_line: The height of the p-value line
:param group_col: A group column
:param outfile: The file to save the stats
:return: A list of lines coordinates in the form of [(x1, y1), (x2, y2)], \
and the color associated to each line
"""
df = dff.sort_values(["bin", group_col], ascending=True)
groups = df[group_col].unique()
if len(groups) != 2:
raise NotImplementedError("Statistical analysis only implemented for "
"2 groups of data")
p_values_ttp = []
grp1 = []
vgrp1 = []
grp2 = []
vgrp2 = []
for bin in df["bin"].unique():
tmp = df[df["bin"] == bin]
values1 = tmp.loc[tmp[group_col] == groups[0], "coverage"].values
values2 = tmp.loc[tmp[group_col] == groups[1], "coverage"].values
p_values_ttp.append(paired_t_test(values1, values2))
grp1.append(np.mean(values1))
grp2.append(np.mean(values2))
vgrp1.append(";".join(map(str, [round(v, 3) for v in values1])))
vgrp2.append(";".join(map(str, [round(v, 3) for v in values2])))
stats_df = pd.DataFrame({"bin": df["bin"].unique(),
"p_values": p_values_ttp,
groups[0]: grp1, groups[1]: grp2,
f"val-{groups[0]}": vgrp1,
f"val-{groups[1]}": vgrp2,})
stats_df.to_csv(outfile, sep="\t", index=False)
return line_maker(p_values_ttp, y_line, min(df["bin"].unique()))
def display_stat(g: sns.FacetGrid, dff: pd.DataFrame, y_line: float,
group_col: str, stat: bool, outfile: Path) -> sns.FacetGrid:
"""
Update the graphics by displaying stats.
:param g: A seaborn FacetGrid objects corresponding to the metagene \
figure
:param dff: A dataframe containing the coverage displayed in the figure
:param y_line: The height of the p-value line
:param group_col: The column containing the groups analyzed
:param outfile: The file where the metagene will be saved
:return: The facetGrid with the stats
"""
if not stat:
return g
stat_file = outfile.parent / f"{outfile.stem}.txt"
lines, lcolor = compute_stats(dff, y_line, group_col, stat_file)
lc = mc.LineCollection(lines, colors=lcolor, linewidths=5)
g.ax.add_collection(lc)
return g
def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
border_names: List[str], nb_bin: int,
environment: List[int], bed_name: str,
output: Path, norm: Union[int, Path],
condition_col: str, ylim: Optional[List[float]]) -> Path:
condition_col: str, ylim: Optional[List[float]],
stat: bool = False) -> Path:
"""
Create a metagene figure on the region of interest.
......@@ -314,6 +423,7 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
each samples
:param condition_col: The name of the condition columns
:param ylim: The range of the y axis
:param stat: A boolean indicating wether to perform statistical analysis
:return: The name of the figure
"""
font_files = matplotlib.font_manager.findSystemFonts(fontpaths=None,
......@@ -356,6 +466,9 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
outfile_title += ".pdf"
if ylim[0] is not None:
g.ax.set_ylim(ylim[0], ylim[1])
ymin, ymax = g.ax.get_ylim()
g = display_stat(g, df_sum, ymin + ((ymax - ymin) / 50), condition_col,
stat, output / outfile_title)
g.ax.tick_params(left=True, bottom=True)
g.savefig(output / outfile_title)
g.fig.clf()
......@@ -513,27 +626,6 @@ def get_shapiro_an_lm_pvalue(df: pd.DataFrame, location: str) -> pd.Series:
"location": location})
def compute_stat_dataframe(df_sum: pd.DataFrame,
region_name: str) -> pd.DataFrame:
"""
Create a dataframe containing the statistical analysis of \
the difference of coverage between two conditions.
:param df_sum: A dataframe containing the coverage values between \
two conditions
:param region_name: The name of the region of interest
:return: The dataframe containing statistical analysis
"""
df_diff = compute_diff(df_sum)
if 'location' not in df_diff.columns:
df_diff["location"] = [region_name] * df_diff.shape[0]
list_series = [
get_shapiro_an_lm_pvalue(df_diff, region)
for region in df_diff["location"].unique()
]
return pd.DataFrame(list_series)
def create_figure(design: Path, bw_folder: Path, region_beds: List[Path],
bed_names: List[str], nb_bin: int = 100,
figure_type: str = 'metagene',
......@@ -600,13 +692,9 @@ def create_figure(design: Path, bw_folder: Path, region_beds: List[Path],
environment, region_kind,
ordered_condition, bed_names)
if figure_type == "metagene":
outfile = figure_metagene(df_sum, show_replicate, border_names, nb_bin,
environment, region_kind, output, norm,
cond_col, ylim)
if stat:
df_stat = compute_stat_dataframe(df_sum, region_kind)
df_stat.to_csv(outfile.parent / f"{outfile.stem}.txt", sep="\t",
index=False)
figure_metagene(df_sum, show_replicate, border_names, nb_bin,
environment, region_kind, output, norm,
cond_col, ylim, stat)
else:
if 'location' in df_sum.columns:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment