diff --git a/src/06_compute_tdd_index.R b/src/06_compute_tdd_index.R index 5be0059be728fd1500fed1980fb380f4b8f33d92..2c00edd6299c112e476662d726cbbbb0b187c4ca 100644 --- a/src/06_compute_tdd_index.R +++ b/src/06_compute_tdd_index.R @@ -8,6 +8,7 @@ library(genefilter) library(gridExtra) library(DHARMa) library(emmeans) +library(glmmTMB) source("src/common_functions.R") @@ -543,14 +544,20 @@ produce_tdd_cor_figure <- function(mean_gene_count = 10, ercc = F, #' @return the p-value of the test glm_nb_pvalue <- function(tab, pic_col, output_folder = "./results/TDD_analysis") { - mod <- glm.nb(paste0(pic_col, "~ group"), data = tab) # nolint output_diag <- paste0(output_folder, "/diagnotics") dir.create(output_diag, showWarnings = F) output_fig <- paste0(output_diag, "/glm_nb_", pic_col, ".pdf") + if (pic_col == "peak") { + mod <- glm.nb(paste0(pic_col, "~ group"), data = tab) # nolint + pval <- summary(mod)$coeff[2, 4] + } else { + mod <- glmmTMB(log1p(peak_surface) ~ group, data = tab, zi = ~1, family = "gaussian") # nolint + pval <- summary(mod)$coeff$cond[2, 4] + } pdf(output_fig, width = 12, height = 8) plot(simulateResiduals(mod)) # nolint dev.off() - return(summary(mod)$coeff[2, 4]) + return(pval) } #' Get the negative binomial regression p-value @@ -601,7 +608,6 @@ create_peaks_barplots <- function(tdd_filt, pval_glm_nb, output_fig, !!as.symbol(has_col) := ifelse(!!as.symbol(data_col) > 0, 1, 0) ) - pval_log <- glm_log_pvalue(tdd_filt, has_col, dirname(output_fig)) tdd_fig <- tdd_filt %>% select(!!as.symbol(data_col), !!as.symbol(has_col), group) %>% group_by(group) %>% @@ -609,26 +615,41 @@ create_peaks_barplots <- function(tdd_filt, pval_glm_nb, output_fig, !!as.symbol(res_col) := mean(!!as.symbol(data_col)), !!as.symbol(has_res_col) := mean(!!as.symbol(has_col)) ) + if (data_col == "peak") { + title <- paste0( + "Average proportion of ", treatment, " peaks in gene", regulation, + "-regulated by BRAF (p = ", pval_glm_nb, ")" + ) + } else { + title <- paste0( + "Average surface of ", treatment, " peaks in gene", regulation, + "-regulated by BRAF (p = ", pval_glm_nb, ")" + ) + } p1 <- ggplot(tdd_fig, mapping = aes_string(x = "group", y = res_col, fill = "group") ) + geom_bar(stat = "identity") + - ggtitle(paste0( - "Average proportion of ", treatment, " peaks in gene", regulation, - "-regulated by BRAF (p = ", pval_glm_nb, ")" - )) - p2 <- ggplot(tdd_fig, mapping = aes_string( - x = "group", y = has_res_col, fill = "group" - )) + - geom_bar(stat = "identity") + - ggtitle(paste0( - "Average proportion of ", regulation, - "-regulated gene by BRAF having at least a ", treatment, - " peak (p = ", pval_log, ")" - )) - pdf(output_fig, width = 12, height = 12) - grid.arrange(p1, p2, ncol = 1) # nolint - dev.off() + ggtitle(title) + if (data_col == "peak") { + pval_log <- glm_log_pvalue(tdd_filt, has_col, dirname(output_fig)) + p2 <- ggplot(tdd_fig, mapping = aes_string( + x = "group", y = has_res_col, fill = "group" + )) + + geom_bar(stat = "identity") + + ggtitle(paste0( + "Average proportion of ", regulation, + "-regulated gene by BRAF having at least a ", treatment, + " peak (p = ", pval_log, ")" + )) + pdf(output_fig, width = 12, height = 12) + grid.arrange(p1, p2, ncol = 1) # nolint + dev.off() + } else { + pdf(output_fig, width = 12, height = 6) + print(p1) + dev.off() + } } @@ -643,27 +664,44 @@ create_peaks_barplots <- function(tdd_filt, pval_glm_nb, output_fig, #' @param regulation (string) The regulation selected create_peaks_density <- function(tdd_filt, pval_glm_nb, outfile, data_col, treatment, regulation) { + if (data_col == "peak") { + title_p <- "" + bw <- 1 + xlab <- "Number of ribosome peaks" + xcol <- data_col + } else { + title_p <- "the surface of " + bw <- 0.2 + xlab <- "log1p ribosome surface" + xcol <- paste0("log1p(",data_col,")") + } tdd_filt <- rename(tdd_filt, tmp_grp = group) # nolint - p <- ggplot(tdd_filt, mapping = aes(x = !!as.symbol(data_col))) + - geom_density(mapping = aes(fill = tmp_grp), bw = 1, alpha = 0.5) + + p <- ggplot(tdd_filt, mapping = aes_string(x = xcol)) + + geom_density(mapping = aes(fill = tmp_grp), bw = bw, alpha = 0.5) + ggtitle(paste0( - "Density of ", treatment, " peaks in gene", regulation, + "Density of ", title_p, treatment, " peaks in gene", regulation, "-regulated by BRAF (p = ", pval_glm_nb, ")" )) + - labs(x = "Number of ribosome peaks") - p1 <- ggplot(tdd_filt, mapping = aes(x = !!as.symbol(data_col))) + - geom_bar( - position = position_dodge2(preserve = "single"), - mapping = aes(fill = tmp_grp, y = ..prop..) - ) + - ggtitle(paste0( - "Density of ", treatment, " peaks in gene", regulation, - "-regulated by BRAF (p = ", pval_glm_nb, ")" - )) + - labs(x = "Number of ribosome peaks") - pdf(outfile, width = 24, height = 12) - print(grid.arrange(p, p1, ncol = 2)) # nolint - dev.off() + labs(x = xlab) + if (data_col == "peak") { + p1 <- ggplot(tdd_filt, mapping = aes(x = !!as.symbol(data_col))) + + geom_bar( + position = position_dodge2(preserve = "single"), + mapping = aes(fill = tmp_grp, y = ..prop..) + ) + + ggtitle(paste0( + "Density of ", title_p, treatment, " peaks in gene", regulation, + "-regulated by BRAF (p = ", pval_glm_nb, ")" + )) + + labs(x = "Number of ribosome peaks") + pdf(outfile, width = 24, height = 12) + print(grid.arrange(p, p1, ncol = 2)) # nolint + dev.off() + } else { + pdf(outfile, width = 24, height = 12) + print(p) + dev.off() + } } #' Create figures showing if different groups of genes have a different number @@ -674,9 +712,11 @@ create_peaks_density <- function(tdd_filt, pval_glm_nb, outfile, data_col, #' analysis #' @param output_folder Folder where the results will be created #' @param peak_type The type of the peak +#' @param kind The kind of feature to analyse in the feature, either 'peak' or +#' 'peak_surface' create_pic_figs <- function(tdd_full, gene_type = "BRAF_DOWN", output_folder = "./results/TDD_analysis", - peak_type = "RiboRNApeak") { + peak_type = "RiboRNApeak", kind = "peak") { treatment <- sub("_.*", "", gene_type) regulation <- sub(".*_", "", gene_type) if (regulation == "UP") { @@ -687,14 +727,14 @@ create_pic_figs <- function(tdd_full, gene_type = "BRAF_DOWN", tdd_filt <- tdd_filt %>% mutate(group = ifelse(!!as.symbol(target), paste0(group, "_TDD"), group )) - data_col <- "peak" + data_col <- kind res_col <- paste0("mean_", data_col) has_col <- paste0("has_", data_col) has_res_col <- paste0("mean_", has_col) pval <- glm_nb_pvalue(tdd_filt, data_col, output_folder) output_fig <- paste0( output_folder, "/barplot_peak_analysis", gene_type, "_", - peak_type, ".pdf" + peak_type, "_", kind, ".pdf" ) create_peaks_barplots( tdd_filt, pval, output_fig, @@ -703,7 +743,7 @@ create_pic_figs <- function(tdd_full, gene_type = "BRAF_DOWN", ) outfile <- paste0( output_folder, "/density_peak_analysis", gene_type, "_", - peak_type, ".pdf" + peak_type, "_", kind, ".pdf" ) create_peaks_density( tdd_filt, pval, outfile, data_col, @@ -712,6 +752,34 @@ create_pic_figs <- function(tdd_full, gene_type = "BRAF_DOWN", } +#' load peak file +#' +#' @param peak_file (string) A bed file containing peaks for a given condition +#' @param kind (string) The kind of table to create: peak or peak_surface +get_peak_df <- function(peak_file, kind = "peak") { + if (kind != "peak" & kind != "peak_surface") { + stop(paste0("kind must be either 'peak' or 'peak_surface' not ", kind)) + } + if (kind == "peak") { + peak_list <- read.table(peak_file)$V1 + peak_table <- tibble( + gene = names(table(peak_list)), + peak = as.vector(table(peak_list)) + ) + } else { + peak_table <- read_tsv(peak_file, + col_names = c("gene", "start", "stop", "name", "score", "strand") + ) + peak_table <- peak_table %>% + mutate(size = stop - start) %>% + select(c(gene, size)) %>% + group_by(gene) %>% + summarise(peak_surface = sum(size)) + } + return(peak_table) +} + + #' function that calcultaed if genes are TDD (more in BRAF or DMSO condition) #' and if they have peaks #' @@ -720,12 +788,8 @@ create_pic_figs <- function(tdd_full, gene_type = "BRAF_DOWN", #' @param ercc Indicate if the normalisation with ercc should be used to compute #' the TDD index get_tdd_pic_table <- function(peak_file, mean_gene_count = 10, ercc = F, - output_folder) { - peak_list <- read.table(peak_file)$V1 - peak_table <- tibble( - gene = names(table(peak_list)), - peak = as.vector(table(peak_list)) - ) + output_folder, kind = "peak") { + peak_table <- get_peak_df(peak_file, kind) tdd_full <- get_final_tdd_table(mean_gene_count, ercc) tdd_full <- tdd_full[ , @@ -746,7 +810,7 @@ get_tdd_pic_table <- function(peak_file, mean_gene_count = 10, ercc = F, ) %>% select(gene, TDD_BRAF, TDD_DMSO) # nolint tdd_full <- tdd_full %>% left_join(peak_table) # nolint - tdd_full <- tdd_full %>% replace_na(list(peak = 0)) # nolint + tdd_full <- tdd_full %>% replace_na(list(peak = 0, peak_surface = 0)) # nolint return(tdd_full) } @@ -760,13 +824,22 @@ get_tdd_pic_table <- function(peak_file, mean_gene_count = 10, ercc = F, #' @param gene_type The selected group of gene on which we want to perform the #' analysis #' @param peak_type The type of peak inside the figure +#' @param kind The kind of feature to analyse in the feature, either 'peak' or +#' 'peak_surface' build_peaks_fig <- function(peak_file, mean_gene_count = 10, ercc = F, gene_type = "BRAF_DOWN", output_folder = "./results/TDD_analysis", - peak_type = "RiboRNApeak") { - tdd_table <- get_tdd_pic_table(peak_file, mean_gene_count, ercc, output_folder) - tdd_table <- get_group_columns(tdd_table) - create_pic_figs(tdd_table, gene_type, output_folder, peak_type = peak_type) + peak_type = "RiboRNApeak", + kind = "peak") { + tdd_table <- get_tdd_pic_table(peak_file, mean_gene_count, ercc, + output_folder, + kind = kind + ) # nolint + tdd_table <- get_group_columns(tdd_table) # nolint + create_pic_figs(tdd_table, gene_type, output_folder, + peak_type = peak_type, + kind = kind + ) } create_figures(mean_gene_count = 0, ercc = F, output_folder = "./results/TDD_analysis") @@ -789,4 +862,8 @@ for (i in seq_len(length(files))) { peak_file = peak_file, gene_type = gene_types[i], peak_type = peak_types[i] ) + build_peaks_fig( + peak_file = peak_file, gene_type = gene_types[i], + peak_type = peak_types[i], kind = "peak_surface" + ) }