Skip to content
Snippets Groups Projects
Commit 38a6e219 authored by nfontrod's avatar nfontrod
Browse files

src/07_ribosome_density.py: build density figure

parent 1e80378d
No related branches found
No related tags found
No related merge requests found
#!/usr/bine/env
# -*- condig: UTF-8 -*-
"""
The goal of this script is to compute the ribosomal density for genes in \
experiment 245-6-7
"""
from multiprocessing import context
from pathlib import Path
from typing import List
import doctest
import pandas as pd
from functools import reduce
import seaborn as sns
import numpy as np
from rpy2.robjects import r, pandas2ri
class Config_file:
data = Path(__file__).parents[1] / "data"
htseq_fold = data / "htseq_count_manip_245-6-7_density"
htseq_rnaseq = htseq_fold / "htseq_count_rna"
htseq_ribo = htseq_fold / "htseq_count_ribo"
result = Path(__file__).parents[1] / "results"
tdd_braf = result / "TDD_analysis" / "correlation" / "TDD_BRAF.txt"
tdd_dmso = result / "TDD_analysis" / "correlation" / "TDD_DMSO.txt"
output = result / "TDD_analysis" / "density_figures"
def load_htseqfiles(mfolder: Path) -> List[Path]:
"""
Load the list of htseq files located in that folder
:param mfolder: A folder containing htseq count file
:return: The list of files in `mfolder`
>>> res = load_htseqfiles(Config_file.htseq_ribo)
>>> len(res)
6
>>> [x.stem for x in res[0:2]]
['DMSO_245_ribo', 'DMSO_247_ribo']
"""
return list(Path(mfolder).glob("*.tsv"))
def load_df(list_files: List[Path]) -> pd.DataFrame:
"""
load all count files
:param list_files: A list of htseq files
:return: A dataframe contaning counts for every files
>>> mfiles = [Config_file.htseq_ribo / "DMSO_245_ribo.tsv",
... Config_file.htseq_ribo / "DMSO_247_ribo.tsv"]
>>> load_df(mfiles).head()
gene DMSO_245_ribo DMSO_247_ribo
0 A1BG 44 84
1 A1CF 0 0
2 A2M 4376 5682
3 A2ML1 0 0
4 A3GALT2 0 0
"""
list_df = [pd.read_csv(x, sep="\t", names=["gene", x.stem])
for x in list_files]
return reduce(lambda x, y: x.merge(y, how="inner"), list_df)
def compute_cpm(df: pd.DataFrame) -> pd.DataFrame:
"""
Compute the cpm of the dataframe df
:param df: A dataframe containing raw read count
:return: The dataframe with the cpm read counts
>>> dsdf = pd.DataFrame({"gene": [f"gn_{i}" for i in range(1, 6)],
... "x1": [10000, 20000, 50000, 12000, 8000],
... "x2": [200000, 400000, 1000000, 240000, 160000]})
>>> compute_cpm(dsdf)
gene x1 x2
0 gn_1 100000.0 100000.0
1 gn_2 200000.0 200000.0
2 gn_3 500000.0 500000.0
3 gn_4 120000.0 120000.0
4 gn_5 80000.0 80000.0
"""
col = [c for c in df.columns if c != "gene"]
for cc in col:
fctr = 1e6 / df[cc].sum()
df[cc] = df[cc] * fctr
return df
def compute_ribosome_density(df_cpm: pd.DataFrame):
"""
compute the density of ribosomes
:param df_cpm: A dataframe containing CPM
:return: The ribosomes density for every replicates
>>> dsdf = pd.DataFrame({"gene": list("ABCDE"),
... "DMSO_245_ribo": [5, 10, 15, 20, 25],
... "DMSO_245_rna": [5, 5, 5, 5, 5],
... "DMSO_246_ribo": [5, 5, 5, 5, 5],
... "DMSO_246_rna": [5, 10, 15, 20, 0]})
>>> compute_ribosome_density(dsdf)
gene treatment replicate density
0 A DMSO 245 1.000000
1 A DMSO 246 1.000000
2 B DMSO 245 2.000000
3 B DMSO 246 0.500000
4 C DMSO 245 3.000000
5 C DMSO 246 0.333333
6 D DMSO 245 4.000000
7 D DMSO 246 0.250000
8 E DMSO 245 5.000000
"""
df = df_cpm.melt(id_vars=["gene"], value_name="cpm", var_name="sample")
df["condition"] = df["sample"].str.replace("_(ribo|rna)", "", regex=True)
df["kind"] = df["sample"].str.extract(r'(ribo|rna)')
df = df.pivot(index=["gene", "condition"], columns="kind", values="cpm"
).reset_index()
df.columns.name = None
df["density"] = df["ribo"] / df["rna"]
df["treatment"] = df["condition"].str.extract(r"(DMSO|BRAF)_")
df["replicate"] = df["condition"].str.extract(r"_(24.)")
df.drop(["ribo", "rna", "condition"], inplace=True, axis=1)
df = df[(-df["density"].isna() & -np.isinf(df["density"]))]
return df[["gene", "treatment", "replicate", "density"]]
def create_density_df() -> pd.DataFrame:
"""
Compute the density dataframe
"""
file_ribo = load_htseqfiles(Config_file.htseq_ribo)
file_rna = load_htseqfiles(Config_file.htseq_rnaseq)
mfiles = file_ribo + file_rna
df = load_df(mfiles)
df = compute_cpm(df)
df = compute_ribosome_density(df)
return df
def add_group_column(df_density: pd.DataFrame, gene_file: Path
) -> pd.DataFrame:
"""
Add a column containing gene having a TDD increasing/desreasing in BRAF
condition
:param df_density: A dataframe containing densities
:param gene_file: A file containing gene whose TDD changes in BRAF
condition
>>> genes_l = list("ABC")
>>> temp_file = Path("/tmp/tmp.genes")
>>> tf = temp_file.open("w")
>>> x = tf.write("\\n".join(genes_l))
>>> tf.close()
>>> dsdd = pd.DataFrame({
... 'gene': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D', 'E', 'E'],
... 'treatment': ['DMSO'] * 10,
... 'replicate': [245, 246, 245, 246, 245, 246, 245, 246, 245, 246],
... 'density': [1.0, 1.0, 2.0, 0.5, 3.0, 0.333333, 4.0, 0.25, 5.0, 0.2]})
>>> add_group_column(dsdd, temp_file)
gene treatment replicate density group
0 A DMSO 245 1.000000 tmp
1 A DMSO 246 1.000000 tmp
2 B DMSO 245 2.000000 tmp
3 B DMSO 246 0.500000 tmp
4 C DMSO 245 3.000000 tmp
5 C DMSO 246 0.333333 tmp
6 D DMSO 245 4.000000 CTRL
7 D DMSO 246 0.250000 CTRL
8 E DMSO 245 5.000000 CTRL
9 E DMSO 246 0.200000 CTRL
>>> temp_file.unlink()
"""
genes = gene_file.open("r").read().splitlines()
df_density["group"] = ["CTRL"] * df_density.shape[0]
df_density.loc[df_density["gene"].isin(genes), "group"] = gene_file.stem
return df_density
def avg_replicate(df_density: pd.DataFrame) -> pd.DataFrame:
"""
Averages the density by gene and drop replicate column
:param df_density: A dataframe containing ribosomes densities
>>> dsdd = pd.DataFrame(
... {'gene': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D', 'E', 'E'],
... 'treatment': ['DMSO'] * 10 ,
... 'replicate': [245, 246] * 5,
... 'density': [1.0, 1.0, 2.0, 0.5, 3.0, 0.333333, 4.0, 0.25, 5.0, 0.2],
... 'group': ['tmp'] * 6 + ['CTRL'] * 4})
>>> avg_replicate(dsdd)
gene treatment group density
0 A DMSO tmp 1.000000
1 B DMSO tmp 1.250000
2 C DMSO tmp 1.666667
3 D DMSO CTRL 2.125000
4 E DMSO CTRL 2.600000
"""
df = df_density.drop("replicate", axis=1) \
.groupby(["gene", "treatment", "group"]) \
.mean().reset_index()
return df
def statistical_analysis(df_density: pd.DataFrame, output: Path) -> float:
"""Get the density pvalue
:param df_density: A density dataframe
:type df_density: pd.DataFrame
:param outout: The folder where the diagnostics will be produced
:return: The pvalue needed
"""
pandas2ri.activate()
output = output / "diag"
output.mkdir(exist_ok=True)
treat = df_density["treatment"].unique()[0]
outfig = output / f"diag_{treat}.pdf"
outpval = output / f"diag_pval_{treat}.txt"
stat_s = r(f"""
require("DHARMa")
require("glmmTMB")
function(data, outfig, outpval){{
mod <- glmmTMB(log1p(density) ~ group + replicate, ziformula = ~1,
family="ziGamma", data=data)
simulationOutput <- simulateResiduals(fittedModel = mod, n = 200)
pdf(outfig)
plot(simulationOutput)
dev.off()
res <- summary(mod)
tmp <- as.data.frame(res$coefficients$cond)
tmp['col'] <- rownames(tmp)
write.table(tmp, outpval, sep="\t")
return(res$coefficients$cond[2, 4])
}}
""")
res = stat_s(df_density, str(outfig), str(outpval))
return float(res)
def create_density_figure(df_density: pd.DataFrame, gene_file: Path,
output_file: Path) -> None:
"""
Create a violinplot indicating the ribosome densities between the groups
of genes studied between each treatment
:param df_density: A DataFrame of density
:param gene_file: The gene_file used to build the group column of the \
df_density dataframe
:param output_file: File where the figure will be created
"""
sns.set(context="talk")
df_mean = avg_replicate(df_density)
df_mean["log10_density"] = np.log(df_mean["density"] + 1)
g = sns.catplot(x="treatment", y="log10_density", hue="group",
data=df_mean, kind="violin", height=10, aspect=1.7,
palette={"TDD_BRAF": "#a600ff", "CTRL": "dimgrey",
"TDD_DMSO": "#ff8400"}, cut=True)
pdmso = statistical_analysis(df_density[df_density["treatment"] == "DMSO"].copy(),
output_file.parent)
pbraf = statistical_analysis(df_density[df_density["treatment"] == "BRAF"].copy(),
output_file.parent)
pdmso = "%.2e" % pdmso
pbraf = "%.2e" % pbraf
g.fig.suptitle(f"Ribosome density between {gene_file.stem} and CTRL"
f"genes in BRAF and DMSO condition\n"
f"(p_treatment_BRAF = {pbraf} & p_treatment_BRAF = {pdmso}")
g.set(ylabel="log10(density +1)")
g.savefig(output_file)
def density_figure_maker():
df = create_density_df()
list_files = [Config_file.tdd_braf, Config_file.tdd_dmso]
for mfile in list_files:
dfc = add_group_column(df.copy(), mfile)
Config_file.output.mkdir(exist_ok=True)
output_fig = Config_file.output / \
f"density_figure_gene_{mfile.stem}.pdf"
dfc.to_csv(output_fig.parent /
f"{output_fig.stem}.txt", sep="\t", index=False)
create_density_figure(dfc, mfile, output_fig)
if __name__ == "__main__":
import sys
if len(sys.argv) >= 2 and sys.argv[1] == "test":
doctest.testmod()
else:
density_figure_maker()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment