diff --git a/src/03_ERCC_analysis_function.R b/src/03_ERCC_analysis_function.R new file mode 100644 index 0000000000000000000000000000000000000000..53b4784f574bf6acbae950a5aae72bd335c47da1 --- /dev/null +++ b/src/03_ERCC_analysis_function.R @@ -0,0 +1,145 @@ +#!/bin/Rscript + +# This script aims to compute the relative abundance of ERCC spike-in + + +library(tidyverse) +library(DESeq2) +library(gridExtra) + +#' Create a count matrix from HTSEQ file +#' +#' @param directory A directory where the htseq count file are defined +#' @param selection_vec A vector containing string to subselect the HTSEQ files +#' @param suffix_to_remove Suffix to remove in file name +#' @param read_number_filter A filter to eliminate sample with low read counts +#' @import DESeq2 +#' @import tidyverse +get_count_matrix <- function(directory, condition_pattern, selection_vec = "", + suffix_to_remove = "", read_number_filter = 0, + filtering_transcript = F) { + # load files + count_files <- list.files( + path = directory, + pattern = ".tsv", + full.names = TRUE + ) + # select file + if (selection_vec != "") { + selected_files <- NULL + for (i in seq_len(length(selection_vec))) { + selected_files <- c( + grep(selection_vec[i], count_files, value = TRUE), + selected_files + ) + } + } else { + selected_files <- count_files + } + sample_name <- sub( + paste0(directory, "/"), "", + sub(suffix_to_remove, "", selected_files) + ) + condition <- as.factor(str_extract(sample_name, condition_pattern)) # nolint + + # Build the count matrix + sample_table <- data.frame( + sampleName = sample_name, + fileName = selected_files, + condition = condition + ) + dds_input <- DESeqDataSetFromHTSeqCount( + sampleTable = sample_table, + directory = ".", + design = ~condition + ) + dds <- DESeq(dds_input) # nolint + cts <- counts(dds, norm = F) # nolint + if (filtering_transcript) { + coding_gene <- read_tsv("results/coding_genes/coding_gene.txt")$gene_id + cts <- cts[which(rownames(cts) %in% coding_gene), ] + } + if (read_number_filter > 0) { + # filter by read number + sample_size <- cts %>% + as_tibble() %>% + summarise_if(is.numeric, sum) + cts <- cts[, colnames(cts)[(sample_size > 5e6)]] + } + return(cts) +} + +#' Get the total counts in each sample +#' +#' @param count_df A dataframe containing read counts +get_sample_count <- function(count_df) { + sample_count <- count_df %>% + as_tibble() %>% + summarise_if(is.numeric, sum) + new_df <- sample_count %>% pivot_longer(everything(), + names_to = "samples", + values_to = "counts" + ) + return(new_df %>% mutate(name = sub("276_|277_|278_", "", samples))) # nolint +} + +#' Get relative ercc count +get_relative_count <- function(data_count, ercc_count) { + res <- inner_join(data_count, ercc_count, + by = c("name", "samples"), + suffix = c("", "_ercc") + ) + res <- res %>% mutate(relative_count = counts_ercc / counts) + return(res) +} + +#' get ERCC correlation plot for one sample +#' +#' @param ercc_raw a dataframe containing raw ERCC count +#' @param col The selected sample for which we want to display the correlation figure +#' @param size_list a named vector containing the number of reads in coding gene for each samples +create_ercc_correlation_plots <- function(ercc_raw, col, size_list) { + selected <- which(ercc_raw[[col]] > 0 & ercc_raw$concentration > 0) + size <- size_list[col] + coli <- ercc_raw[[col]][selected] + colc <- ercc_raw$concentration[selected] + cor_val <- cor.test(log2(colc), log2(coli))$estimate + ercc <- nrow(ercc_raw) + p <- ggplot(ercc_raw, mapping = aes(x = log2(concentration), y = log2(!!as.symbol(col)))) + + geom_point() + + ggtitle(paste("R = ", round(cor_val, 3), "- n =", ercc, "- s = ", round(size / 1e6, 1), "M")) + + stat_smooth(method = "lm", se = FALSE, color = "red") + p$my_cor <- cor_val + return(p) +} + +#' Create ERCC correlation figure for all samples +#' +#' @param ercc_count_matrix The ercc count matrix +#' @param count_threshold The average count threshold an ercc must have to be displayed in the figure +create_correlation_figures <- function(ercc_count_matrix, count_threshold) { + ercc_raw <- ercc_count_matrix + ercc_raw <- as_tibble(data.frame(gene = rownames(ercc_count_matrix), ercc_raw)) + ercc_raw <- ercc_raw %>% left_join(ercc_real_concentration, by = "gene") + ercc_raw2 <- ercc_raw %>% + select(-concentration) %>% + pivot_longer(-gene, names_to = "sample", values_to = "counts") %>% + group_by(gene) %>% + summarise(min_count = mean(counts)) + ercc_raw <- left_join(ercc_raw, ercc_raw2, by = "gene") %>% filter(min_count > count_threshold) + + sample_size <- apply(data_count_matrix %>% as_tibble() %>% select(-gene), 2, sum) + names_c <- ercc_raw %>% + select(-gene, -concentration, -min_count) %>% + colnames() + sum(names(sample_size) == names_c) # 96 they are in the same order + my_plots <- lapply(names_c, create_ercc_correlation_plots, ercc_raw = ercc_raw, size_list = sample_size) + correlation_sup20 <- sapply(my_plots, function(x) { + return(x$my_cor) + }) + dir.create("./results/ERCC_analysis") + pdf(paste0("./results/ERCC_analysis/correlations_ERCC-meancount>", count_threshold, "_quant_vs_concentration.pdf"), width = 12, height = 100) + print(do.call("grid.arrange", c(my_plots, ncol = 3))) + dev.off() + return(correlation_sup20) +}