Skip to content
Snippets Groups Projects
Commit a520d7c7 authored by nfontrod's avatar nfontrod
Browse files

src/find_interaction_cluster/nt_and_community.py: error bar on barplots

parent c1ec2983
No related branches found
No related tags found
No related merge requests found
...@@ -152,7 +152,6 @@ def lmm_maker_summary(df: pd.DataFrame, outfile: Path, nt: str ...@@ -152,7 +152,6 @@ def lmm_maker_summary(df: pd.DataFrame, outfile: Path, nt: str
:param outfile: A name of a file :param outfile: A name of a file
:param nt: the nucleotide of interest :param nt: the nucleotide of interest
:return: the pvalue of lmm :return: the pvalue of lmm
""" """
pandas2ri.activate() pandas2ri.activate()
lmm = r( lmm = r(
...@@ -222,7 +221,8 @@ def create_ctrl_community(df: pd.DataFrame, ...@@ -222,7 +221,8 @@ def create_ctrl_community(df: pd.DataFrame,
def lmm_with_ctrl(df: pd.DataFrame, feature: str, region: str, def lmm_with_ctrl(df: pd.DataFrame, feature: str, region: str,
nt: str, outfile_diag: Path) -> pd.DataFrame: nt: str, outfile_diag: Path
) -> Tuple[pd.DataFrame, pd.DataFrame]:
""" """
:param df: df: A dataframe containing the frequency of each nucleotide \ :param df: df: A dataframe containing the frequency of each nucleotide \
...@@ -232,11 +232,12 @@ def lmm_with_ctrl(df: pd.DataFrame, feature: str, region: str, ...@@ -232,11 +232,12 @@ def lmm_with_ctrl(df: pd.DataFrame, feature: str, region: str,
:param nt: The nucleotide of interest :param nt: The nucleotide of interest
:param outfile_diag: File from which the diagnostics folder will be \ :param outfile_diag: File from which the diagnostics folder will be \
inferred inferred
:return: The dataframe with the p-value compared to the control \ :return: The dataframe with ctrl exon and \
list of exons. The dataframe with the p-value compared to the control \
list of feature.
""" """
ndf = create_ctrl_community(df, feature, region) ndf = create_ctrl_community(df, feature, region)
return lmm_maker_summary(ndf, outfile_diag, nt) return ndf, lmm_maker_summary(ndf, outfile_diag, nt)
def get_feature_by_community(df: pd.DataFrame, feature: str) -> Dict: def get_feature_by_community(df: pd.DataFrame, feature: str) -> Dict:
...@@ -418,24 +419,78 @@ def make_barplot(df_bar: pd.DataFrame, outfile: Path, nt: str, test_type: str, ...@@ -418,24 +419,78 @@ def make_barplot(df_bar: pd.DataFrame, outfile: Path, nt: str, test_type: str,
sns.set() sns.set()
test_name = "permutation" if test_type == "perm" else "lmm" test_name = "permutation" if test_type == "perm" else "lmm"
g = sns.catplot(x="community", y=nt, data=df_bar, kind="bar", g = sns.catplot(x="community", y=nt, data=df_bar, kind="bar",
ci="sd", aspect=2.5, height=12, ci="sd", aspect=2.5, height=12, errwidth=0.5, capsize=.4,
palette=["red"] + ["grey"] * (df_bar.shape[0] - 1)) palette=["red"] + ["lightgray"] * (df_bar.shape[0] - 1))
g.fig.suptitle(f"Mean frequency of {nt} among community of {feature}s\n" g.fig.suptitle(f"Mean frequency of {nt} among community of {feature}s\n"
f"(stats obtained with as {test_name} test)") f"(stats obtained with as {test_name} test)")
g.set(xticklabels=[]) g.set(xticklabels=[])
g.ax.set_ylabel(f'Frequency of {nt}') g.ax.set_ylabel(f'Frequency of {nt}')
if test_type == "perm": df_bara = df_bar.drop_duplicates(subset="community", keep="first")
df_bar = df_bar.drop_duplicates(subset="community", keep="last")
for i, p in enumerate(g.ax.patches): for i, p in enumerate(g.ax.patches):
stats = "*" if df_bar.iloc[i, :]["p-adj"] < 0.05 else "" stats = "*" if df_bara.iloc[i, :]["p-adj"] < 0.05 else ""
print(i, stats, df_bar.iloc[i, :]["p-adj"]) com = df_bara.iloc[i, :]["community"]
csd = np.std(df_bar.loc[df_bar["community"] == com, nt])
g.ax.annotate(stats, g.ax.annotate(stats,
(p.get_x() + p.get_width() / 2., p.get_height()), (p.get_x() + p.get_width() / 2., p.get_height() + csd),
ha='center', va='center', xytext=(0, 10), ha='center', va='center', xytext=(0, 10),
textcoords='offset points') textcoords='offset points')
g.savefig(outfile) g.savefig(outfile)
def expand_results_lmm(df: pd.DataFrame, rdf: pd.DataFrame,
nt: str, feature: str) -> pd.DataFrame:
"""
Merge df and rdf together.
:param df: A dataframe containing the frequency of each nucleotide \
in each feature belonging to a community.
:param rdf: The dataframe containing the mean frequency for \
each community and the p-value of their enrichment compared to control \
exons.
:param nt: the nucleotide of interest
:param feature: The feature of interest
:return: The merged dataframe: i.e df with the stats columns
"""
p_col = "Pr(>|t|)"
df = df[[f"id_{feature}", nt, "community", "community_size"]].copy()
rdf = rdf[["community", "community_size", p_col, nt]].copy()
rdf.rename({nt: f"mean_{nt}", p_col: "p-adj"}, axis=1, inplace=True)
df = df.merge(rdf, how="left", on=["community", "community_size"])
df_ctrl = df[df["community"] == "C-CTRL"]
df = df[df["community"] != "C-CTRL"].copy()
df.sort_values(f"mean_{nt}", ascending=True, inplace=True)
return pd.concat([df_ctrl, df], axis=0, ignore_index=True)
def expand_results_perm(df: pd.DataFrame, rdf: pd.DataFrame,
nt: str, feature: str, iteration: int) -> pd.DataFrame:
"""
Merge df and rdf together.
:param df: A dataframe containing the frequency of each nucleotide \
in each feature belonging to a community.
:param rdf: The dataframe containing the mean frequency for \
each community and the p-value of their enrichment compared to control \
exons.
:param nt: the nucleotide of interest
:param feature: The feature of interest
:param iteration: The number of iteration
:return: The merged dataframe: i.e df with the stats columns
"""
df = df[[f"id_{feature}", nt, "community", "community_size"]].copy()
ctrl_val = rdf[f"{nt}_mean_{iteration}_ctrl"]
rdf = rdf[["community", "community_size", nt, "p-adj"]].copy()
rdf.rename({nt: f"mean_{nt}"}, axis=1, inplace=True)
df = df.merge(rdf, how="left", on=["community", "community_size"])
df_ctrl = pd.DataFrame({nt: ctrl_val,
f"mean_{nt}": [np.mean(ctrl_val)] * len(ctrl_val),
f"id_{feature}": ['ctrl'] * len(ctrl_val),
"community_size": [len(ctrl_val)] * len(ctrl_val),
"community": ["C-CTRL"] * len(ctrl_val)})
df.sort_values(f"mean_{nt}", ascending=True, inplace=True)
return pd.concat([df_ctrl, df], axis=0, ignore_index=True)
def create_and_save_ctrl_dataframe(df: pd.DataFrame, feature: str, def create_and_save_ctrl_dataframe(df: pd.DataFrame, feature: str,
region: str, nt: str, outfile: Path, region: str, nt: str, outfile: Path,
test_type: str, df_ctrl: pd.DataFrame, test_type: str, df_ctrl: pd.DataFrame,
...@@ -461,32 +516,34 @@ def create_and_save_ctrl_dataframe(df: pd.DataFrame, feature: str, ...@@ -461,32 +516,34 @@ def create_and_save_ctrl_dataframe(df: pd.DataFrame, feature: str,
containing the test communities and the control community containing the test communities and the control community
""" """
if test_type == "lmm": if test_type == "lmm":
rdf = lmm_with_ctrl(df, feature, region, nt, ndf, rdf = lmm_with_ctrl(df, feature, region, nt,
outfile.parents[1] / outfile.name) outfile.parents[1] / outfile.name)
df_bar = expand_results_lmm(ndf, rdf, nt, feature)
else: else:
rdf = perm_with_ctrl(df, feature, nt, df_ctrl, dic_com, iteration) rdf = perm_with_ctrl(df, feature, nt, df_ctrl, dic_com, iteration)
df_bar = expand_results_perm(df, rdf, nt, feature, iteration)
rdf.to_csv(outfile_ctrl, sep="\t", index=False) rdf.to_csv(outfile_ctrl, sep="\t", index=False)
barplot_creation(rdf, outfile_ctrl, nt, barplot_creation(df_bar, outfile_ctrl, nt,
test_type, feature, iteration) test_type, feature)
def barplot_creation(rdf: pd.DataFrame, outfile: Path, nt: str, def barplot_creation(df_bar: pd.DataFrame, outfile: Path, nt: str,
test_type: str, feature: str, iteration: int) -> None: test_type: str, feature: str) -> None:
""" """
Reformat a dataframe with the enrichment of a nucleotide frequency \ Reformat a dataframe with the enrichment of a nucleotide frequency \
for every community and then create a barplot showing those frequencies. for every feature for every community and then create a \
barplot showing those frequencies.
:param rdf: A dataframe with the enrichment of a \ :param df_bar: A dataframe with the enrichment of a \
nucleotide frequency for every community nucleotide frequency for every community and showing the frequency \
of each feature in each community
:param outfile: File were rdf is stored :param outfile: File were rdf is stored
:param nt: The nucleotide for which we are seeking enrichment :param nt: The nucleotide for which we are seeking enrichment
:param test_type: The kind of test make :param test_type: The kind of test make
:param feature: The king of feature of interest :param feature: The king of feature of interest
:param iteration: The number of sub samples to create
""" """
rdf = prepare_dataframe(rdf, test_type, nt, iteration)
outfig = outfile.parent / outfile.name.replace(".txt", ".pdf") outfig = outfile.parent / outfile.name.replace(".txt", ".pdf")
make_barplot(rdf, outfig, nt, test_type, feature) make_barplot(df_bar, outfig, nt, test_type, feature)
def create_outfiles(project: str, weight: int, global_weight: int, def create_outfiles(project: str, weight: int, global_weight: int,
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment