Skip to content
Snippets Groups Projects
Commit 6bdd2882 authored by nfontrod's avatar nfontrod
Browse files

src/03_ERCC_analysis_function.R: contains functions used to analyze ERCC spike-in

parent a41cccac
No related branches found
No related tags found
No related merge requests found
#!/bin/Rscript
# This script aims to compute the relative abundance of ERCC spike-in
library(tidyverse)
library(DESeq2)
library(gridExtra)
#' Create a count matrix from HTSEQ file
#'
#' @param directory A directory where the htseq count file are defined
#' @param selection_vec A vector containing string to subselect the HTSEQ files
#' @param suffix_to_remove Suffix to remove in file name
#' @param read_number_filter A filter to eliminate sample with low read counts
#' @import DESeq2
#' @import tidyverse
get_count_matrix <- function(directory, condition_pattern, selection_vec = "",
suffix_to_remove = "", read_number_filter = 0,
filtering_transcript = F) {
# load files
count_files <- list.files(
path = directory,
pattern = ".tsv",
full.names = TRUE
)
# select file
if (selection_vec != "") {
selected_files <- NULL
for (i in seq_len(length(selection_vec))) {
selected_files <- c(
grep(selection_vec[i], count_files, value = TRUE),
selected_files
)
}
} else {
selected_files <- count_files
}
sample_name <- sub(
paste0(directory, "/"), "",
sub(suffix_to_remove, "", selected_files)
)
condition <- as.factor(str_extract(sample_name, condition_pattern)) # nolint
# Build the count matrix
sample_table <- data.frame(
sampleName = sample_name,
fileName = selected_files,
condition = condition
)
dds_input <- DESeqDataSetFromHTSeqCount(
sampleTable = sample_table,
directory = ".",
design = ~condition
)
dds <- DESeq(dds_input) # nolint
cts <- counts(dds, norm = F) # nolint
if (filtering_transcript) {
coding_gene <- read_tsv("results/coding_genes/coding_gene.txt")$gene_id
cts <- cts[which(rownames(cts) %in% coding_gene), ]
}
if (read_number_filter > 0) {
# filter by read number
sample_size <- cts %>%
as_tibble() %>%
summarise_if(is.numeric, sum)
cts <- cts[, colnames(cts)[(sample_size > 5e6)]]
}
return(cts)
}
#' Get the total counts in each sample
#'
#' @param count_df A dataframe containing read counts
get_sample_count <- function(count_df) {
sample_count <- count_df %>%
as_tibble() %>%
summarise_if(is.numeric, sum)
new_df <- sample_count %>% pivot_longer(everything(),
names_to = "samples",
values_to = "counts"
)
return(new_df %>% mutate(name = sub("276_|277_|278_", "", samples))) # nolint
}
#' Get relative ercc count
get_relative_count <- function(data_count, ercc_count) {
res <- inner_join(data_count, ercc_count,
by = c("name", "samples"),
suffix = c("", "_ercc")
)
res <- res %>% mutate(relative_count = counts_ercc / counts)
return(res)
}
#' get ERCC correlation plot for one sample
#'
#' @param ercc_raw a dataframe containing raw ERCC count
#' @param col The selected sample for which we want to display the correlation figure
#' @param size_list a named vector containing the number of reads in coding gene for each samples
create_ercc_correlation_plots <- function(ercc_raw, col, size_list) {
selected <- which(ercc_raw[[col]] > 0 & ercc_raw$concentration > 0)
size <- size_list[col]
coli <- ercc_raw[[col]][selected]
colc <- ercc_raw$concentration[selected]
cor_val <- cor.test(log2(colc), log2(coli))$estimate
ercc <- nrow(ercc_raw)
p <- ggplot(ercc_raw, mapping = aes(x = log2(concentration), y = log2(!!as.symbol(col)))) +
geom_point() +
ggtitle(paste("R = ", round(cor_val, 3), "- n =", ercc, "- s = ", round(size / 1e6, 1), "M")) +
stat_smooth(method = "lm", se = FALSE, color = "red")
p$my_cor <- cor_val
return(p)
}
#' Create ERCC correlation figure for all samples
#'
#' @param ercc_count_matrix The ercc count matrix
#' @param count_threshold The average count threshold an ercc must have to be displayed in the figure
create_correlation_figures <- function(ercc_count_matrix, count_threshold) {
ercc_raw <- ercc_count_matrix
ercc_raw <- as_tibble(data.frame(gene = rownames(ercc_count_matrix), ercc_raw))
ercc_raw <- ercc_raw %>% left_join(ercc_real_concentration, by = "gene")
ercc_raw2 <- ercc_raw %>%
select(-concentration) %>%
pivot_longer(-gene, names_to = "sample", values_to = "counts") %>%
group_by(gene) %>%
summarise(min_count = mean(counts))
ercc_raw <- left_join(ercc_raw, ercc_raw2, by = "gene") %>% filter(min_count > count_threshold)
sample_size <- apply(data_count_matrix %>% as_tibble() %>% select(-gene), 2, sum)
names_c <- ercc_raw %>%
select(-gene, -concentration, -min_count) %>%
colnames()
sum(names(sample_size) == names_c) # 96 they are in the same order
my_plots <- lapply(names_c, create_ercc_correlation_plots, ercc_raw = ercc_raw, size_list = sample_size)
correlation_sup20 <- sapply(my_plots, function(x) {
return(x$my_cor)
})
dir.create("./results/ERCC_analysis")
pdf(paste0("./results/ERCC_analysis/correlations_ERCC-meancount>", count_threshold, "_quant_vs_concentration.pdf"), width = 12, height = 100)
print(do.call("grid.arrange", c(my_plots, ncol = 3)))
dev.off()
return(correlation_sup20)
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment