src/03_ERCC_analysis_function.R: contains functions used to analyze ERCC spike-in

6bdd2882 · nfontrod · a41cccac · 6bdd2882
Commit 6bdd2882 authored 3 years ago by nfontrod
--- a/src/03_ERCC_analysis_function.R
+++ b/src/03_ERCC_analysis_function.R
+#!/bin/Rscript
+
+# This script aims to compute the relative abundance of ERCC spike-in
+
+
+library(tidyverse)
+library(DESeq2)
+library(gridExtra)
+
+#' Create a count matrix from HTSEQ file
+#'
+#' @param directory A directory where the htseq count file are defined
+#' @param selection_vec A vector containing string to subselect the HTSEQ files
+#' @param suffix_to_remove Suffix to remove in file name
+#' @param read_number_filter A filter to eliminate sample with low read counts
+#' @import DESeq2
+#' @import tidyverse
+get_count_matrix <- function(directory, condition_pattern, selection_vec = "",
+                             suffix_to_remove = "", read_number_filter = 0,
+                             filtering_transcript = F) {
+    # load files
+    count_files <- list.files(
+        path = directory,
+        pattern = ".tsv",
+        full.names = TRUE
+    )
+    # select file
+    if (selection_vec != "") {
+        selected_files <- NULL
+        for (i in seq_len(length(selection_vec))) {
+            selected_files <- c(
+                grep(selection_vec[i], count_files, value = TRUE),
+                selected_files
+            )
+        }
+    } else {
+        selected_files <- count_files
+    }
+    sample_name <- sub(
+        paste0(directory, "/"), "",
+        sub(suffix_to_remove, "", selected_files)
+    )
+    condition <- as.factor(str_extract(sample_name, condition_pattern)) # nolint
+
+    # Build the count matrix
+    sample_table <- data.frame(
+        sampleName = sample_name,
+        fileName = selected_files,
+        condition = condition
+    )
+    dds_input <- DESeqDataSetFromHTSeqCount(
+        sampleTable = sample_table,
+        directory = ".",
+        design = ~condition
+    )
+    dds <- DESeq(dds_input) # nolint
+    cts <- counts(dds, norm = F) # nolint
+    if (filtering_transcript) {
+        coding_gene <- read_tsv("results/coding_genes/coding_gene.txt")$gene_id
+        cts <- cts[which(rownames(cts) %in% coding_gene), ]
+    }
+    if (read_number_filter > 0) {
+        # filter by read number
+        sample_size <- cts %>%
+            as_tibble() %>%
+            summarise_if(is.numeric, sum)
+        cts <- cts[, colnames(cts)[(sample_size > 5e6)]]
+    }
+    return(cts)
+}
+
+#' Get the total counts in each sample
+#'
+#' @param count_df A dataframe containing read counts
+get_sample_count <- function(count_df) {
+    sample_count <- count_df %>%
+        as_tibble() %>%
+        summarise_if(is.numeric, sum)
+    new_df <- sample_count %>% pivot_longer(everything(),
+        names_to = "samples",
+        values_to = "counts"
+    )
+    return(new_df %>% mutate(name = sub("276_|277_|278_", "", samples))) # nolint
+}
+
+#' Get relative ercc count
+get_relative_count <- function(data_count, ercc_count) {
+    res <- inner_join(data_count, ercc_count,
+        by = c("name", "samples"),
+        suffix = c("", "_ercc")
+    )
+    res <- res %>% mutate(relative_count = counts_ercc / counts)
+    return(res)
+}
+
+#' get ERCC correlation plot for one sample
+#'
+#' @param ercc_raw a dataframe containing raw ERCC count
+#' @param col The selected sample for which we want to display the correlation figure
+#' @param size_list a named vector containing the number of reads  in coding gene for each samples
+create_ercc_correlation_plots <- function(ercc_raw, col, size_list) {
+    selected <- which(ercc_raw[[col]] > 0 & ercc_raw$concentration > 0)
+    size <- size_list[col]
+    coli <- ercc_raw[[col]][selected]
+    colc <- ercc_raw$concentration[selected]
+    cor_val <- cor.test(log2(colc), log2(coli))$estimate
+    ercc <- nrow(ercc_raw)
+    p <- ggplot(ercc_raw, mapping = aes(x = log2(concentration), y = log2(!!as.symbol(col)))) +
+        geom_point() +
+        ggtitle(paste("R = ", round(cor_val, 3), "- n =", ercc, "- s = ", round(size / 1e6, 1), "M")) +
+        stat_smooth(method = "lm", se = FALSE, color = "red")
+    p$my_cor <- cor_val
+    return(p)
+}
+
+#' Create ERCC correlation figure for all samples
+#'
+#' @param ercc_count_matrix The ercc count matrix
+#' @param count_threshold The average count threshold an ercc must have to be displayed in the figure
+create_correlation_figures <- function(ercc_count_matrix, count_threshold) {
+    ercc_raw <- ercc_count_matrix
+    ercc_raw <- as_tibble(data.frame(gene = rownames(ercc_count_matrix), ercc_raw))
+    ercc_raw <- ercc_raw %>% left_join(ercc_real_concentration, by = "gene")
+    ercc_raw2 <- ercc_raw %>%
+        select(-concentration) %>%
+        pivot_longer(-gene, names_to = "sample", values_to = "counts") %>%
+        group_by(gene) %>%
+        summarise(min_count = mean(counts))
+    ercc_raw <- left_join(ercc_raw, ercc_raw2, by = "gene") %>% filter(min_count > count_threshold)
+
+    sample_size <- apply(data_count_matrix %>% as_tibble() %>% select(-gene), 2, sum)
+    names_c <- ercc_raw %>%
+        select(-gene, -concentration, -min_count) %>%
+        colnames()
+    sum(names(sample_size) == names_c) # 96 they are in the same order
+    my_plots <- lapply(names_c, create_ercc_correlation_plots, ercc_raw = ercc_raw, size_list = sample_size)
+    correlation_sup20 <- sapply(my_plots, function(x) {
+        return(x$my_cor)
+    })
+    dir.create("./results/ERCC_analysis")
+    pdf(paste0("./results/ERCC_analysis/correlations_ERCC-meancount>", count_threshold, "_quant_vs_concentration.pdf"), width = 12, height = 100)
+    print(do.call("grid.arrange", c(my_plots, ncol = 3)))
+    dev.off()
+    return(correlation_sup20)
+}