diff --git a/src/clonality_paper.R b/src/clonality_paper.R index 19acfe5bab689915969422b19582e4b7d6800af8..040e155e245e97c6dd250309924c8f59a2b14b9a 100644 --- a/src/clonality_paper.R +++ b/src/clonality_paper.R @@ -417,40 +417,6 @@ for (day in names(sce_day)) { save(sce_day, file = "results/2020_01_02_clonality_paper_sce.Rdata") load(file = "results/2020_01_02_clonality_paper_sce.Rdata") - - -DEA_DEA_clone_cell_type_size <- list() -min_clone_size <- 3 -for (day in names(sce_day)) { - colData(sce_day[[day]])$clone_id <- colData(sce_day[[day]]) %>% - as_tibble() %>% - mutate(clone_id = ifelse( - clone_id == 0, - (max(clone_id) + 1):(max(clone_id) + length(which(clone_id == 0))), - clone_id)) %>% - pull(clone_id) - colData(sce_day[[day]])$clone_size <- colData(sce_day[[day]]) %>% - as_tibble() %>% - left_join( - colData(sce_day[[day]]) %>% - as_tibble() %>% - group_by(clone_id) %>% - dplyr::summarise(clone_size = n()) - ) %>% - pull(clone_size) - DEA_DEA_clone_cell_type_size[[day]] <- DEA( - sce_day[[day]][, colData(sce_day[[day]])$clone_size >= min_clone_size], - test = "~ (1|clone_id)", - formula = "count ~ p_PLS_DEA_cell_type + (1|clone_id)", - assay_name = "counts_vst", - cpus = 10 - ) - save( - DEA_DEA_clone_cell_type_size, - file = "results/2020_01_01_DEA_DEA_clone_size.Rdata" - ) -} - genes_PLS <- read_csv("data/2017_11_28_List_Laurent_Genes_PLS.csv") %>% pivot_longer(cols = c("Genes_EFF", "Genes_MEM"), names_to = "type", @@ -509,25 +475,25 @@ for (day in names(sce_day)) { min_clone_size <- 3 load(file = "results/2020_01_02_clonality_paper_sce.Rdata") load(file = "results/2020_01_01_DEA_DEA_clone_size.Rdata") - +load(file = "results/2020_01_01_DEA_clone_PCA_cell_type_size.Rdata", v=T) ## adj pvalue for (day in names(sce_day)) { print(day) - rowData(sce_day[[day]])$pval_DEA_clone_cell_type_size <- NA - rowData(sce_day[[day]])$pval_DEA_clone_cell_type_size <- - get_genes_pval(DEA_DEA_clone_cell_type_size[[day]], sce_day[[day]]) + rowData(sce_day[[day]])$pval_DEA_clone_PCA_cell_type_size <- NA + rowData(sce_day[[day]])$pval_DEA_clone_PCA_cell_type_size <- + get_genes_pval(DEA_clone_PCA_cell_type_size[[day]], sce_day[[day]]) - rowData(sce_day[[day]])$pval_DEA_clone_cell_type_size %>% + rowData(sce_day[[day]])$pval_DEA_clone_PCA_cell_type_size %>% is.na() %>% table() %>% print() - rowData(sce_day[[day]])$pval_DEA_clone_cell_type_size_adj <- p.adjust( - rowData(sce_day[[day]])$pval_DEA_clone_cell_type_size, + rowData(sce_day[[day]])$pval_DEA_clone_PCA_cell_type_size_adj <- p.adjust( + rowData(sce_day[[day]])$pval_DEA_clone_PCA_cell_type_size, method = "BH" ) - table(rowData(sce_day[[day]])$pval_DEA_clone_cell_type_size_adj < 0.05) %>% print() + table(rowData(sce_day[[day]])$pval_DEA_clone_PCA_cell_type_size_adj < 0.05) %>% print() } day <- "593" assays(sce_day[[day]])$logcounts <- scater::logNormCounts( @@ -538,8 +504,8 @@ for (day in names(sce_day)) { assay(., "logcounts") %>% Matrix::Matrix(sparse = T) sce_DEA_hm <- sce_day[[day]][ - !is.na(rowData(sce_day[[day]])$pval_DEA_clone_cell_type_size_adj) & - rowData(sce_day[[day]])$pval_DEA_clone_cell_type_size_adj < 0.05 + !is.na(rowData(sce_day[[day]])$pval_DEA_clone_PCA_cell_type_size_adj) & + rowData(sce_day[[day]])$pval_DEA_clone_PCA_cell_type_size_adj < 0.05 ] sce_DEA_hm %>% dim() colData(sce_DEA_hm) <- colData(sce_DEA_hm) %>% @@ -568,7 +534,6 @@ rm_genes <- readr::read_delim( future::plan("multiprocess", workers = 10) test <- assay(sce_DEA_hm, "logcounts") %>% - log1p() %>% as.matrix() %>% as_tibble(rownames = "gene_name") %>% tidyr::nest(counts = !c(gene_name)) %>% @@ -584,17 +549,23 @@ rm_genes <- readr::read_delim( mutate( count_var = purrr::map(.x = counts, .f = function(.x){ var(.x$count) + }), + count_mean = purrr::map(.x = counts, .f = function(.x){ + (.x$count) }) ) %>% unnest(count_var) %>% - filter(count_var > 0.2) %>% + unnest(count_mean) %>% + select(count_var, count_mean) %>% summary() + filter(!(gene_name %in% rm_genes$geneid)) %>% + filter(count_var > 0.1) %>% mutate( model = furrr::future_map(.x = counts, .f = function(.x){ model.matrix(~-1 + clone_id, data = .x) %>% - glmnet( + glmnet::glmnet( x = ., y = (.x %>% pull(count)), - lambda = cv.glmnet( + lambda = glmnet::cv.glmnet( ., (.x %>% pull(count)) )$lambda.1se @@ -605,58 +576,104 @@ rm_genes <- readr::read_delim( mutate(coefs = map(model, tidy)) %>% select(-c(counts, model)) %>% tidyr::unnest(coefs) - - gene_name <- test %>% - janitor::clean_names() %>% - dplyr::filter( - dev_ratio > 0 - ) %>% - group_by(gene_name) %>% - dplyr::summarize( - estimate = max(abs(estimate)), - ) %>% - # filter(estimate >= quantile(estimate, 0.90)) %>% - pull(gene_name) - - # cluster_row <- - test %>% + + + cluster_row <- assay(sce_DEA_hm, "logcounts") %>% + as.matrix() %>% + as_tibble(rownames = "gene_name") %>% + filter(!(gene_name %in% rm_genes$geneid)) %>% + tidyr::nest(counts = !c(gene_name)) %>% + mutate( + counts = purrr::map(.x = counts, .f = function(.x){ + tibble( + id = colnames(.x), + count = t(.x)[, 1], + clone_id = colData(sce_DEA_hm)$clone_id) + } + ) + ) %>% + mutate( + count_var = purrr::map(.x = counts, .f = function(.x){ + var(.x$count) + }) + ) %>% + unnest(count_var) %>% + mutate( + model = furrr::future_map(.x = counts, .f = function(.x){ + lm(count ~ clone_id, data = .x) + }, + .progress = TRUE) + ) %>% + mutate(coefs = map(model, tidy)) %>% + select(-c(counts, model)) %>% + tidyr::unnest(coefs) %>% janitor::clean_names() %>% - filter(gene_name %in% gene_name) %>% - mutate(term = ifelse( + dplyr::mutate(term = ifelse( term == "(Intercept)", colData(sce_DEA_hm)$clone_id %>% as.factor() %>% levels() %>% .[1] %>% str_c("clone_id", .), term) ) %>% - mutate( + dplyr::mutate( term = str_replace(term, "clone_id(.*)", "\\1") ) %>% - select(gene_name, term, estimate) %>% - mutate(id = 1:nrow(.), - estimate = ifelse(is.na(estimate), 0, estimate)) %>% - group_by(c(gene_name, id) %>% - summarise(estimate = sum(estimate)) - summarize() - pivot_wider( - id_cols = c(id, gene_name), + dplyr::select(gene_name, term, estimate) %>% + tidyr::pivot_wider( + id_cols = gene_name, names_from = term, - values_from = estimate + values_from = estimate, + values_fill = 0, + values_fn = sum ) %>% - select(-id) %>% - - as.data.frame() + as.data.frame() rownames(cluster_row) <- cluster_row$gene_name - cluster_row %>% dim() - sce_DEA_hm %>% dim() +cluster_row <- + assay(sce_DEA_hm, "logcounts") %>% + as.matrix() %>% + as_tibble(rownames = "gene_name") %>% + filter(!(gene_name %in% rm_genes$geneid)) %>% + tidyr::nest(counts = !c(gene_name)) %>% + mutate( + counts = purrr::map(.x = counts, .f = function(.x){ + tibble( + id = colnames(.x), + count = t(.x)[, 1], + clone_id = as.factor(colData(sce_DEA_hm)$clone_id) + ) %>% + group_by(clone_id) %>% + dplyr::summarise( + id = id, + count = count, + clone_id = clone_id, + count_mean = max(count) + ) %>% + ungroup() + } + ) + ) %>% + tidyr::unnest(counts) %>% + janitor::clean_names() %>% + dplyr::select(gene_name, clone_id, count_mean) %>% + tidyr::pivot_wider( + id_cols = gene_name, + names_from = clone_id, + values_from = count_mean, + values_fill = 0, + values_fn = sum + ) %>% + as.data.frame() +rownames(cluster_row) <- cluster_row$gene_name -sce_DEA_hm_plot <- sce_DEA_hm[rownames(sce_DEA_hm) %in% gene_name, ] +sce_DEA_hm_plot <- sce_DEA_hm[rownames(sce_DEA_hm) %in% cluster_row$gene_name, ] rowData(sce_DEA_hm_plot)$gene_order <- cluster_row[, -1] %>% - dist() %>% + dist(method = "canberra") %>% hclust() %>% .$order +rownames(sce_DEA_hm_plot) <- rowData(sce_DEA_hm_plot)$gene_name +sce_DEA_hm_plot <- sce_DEA_hm_plot[rowData(sce_DEA_hm_plot)$gene_order, ] plotHeatmap( - sce_DEA_hm_plot, + sce_DEA_hm_plot[!(rowData(sce_DEA_hm_plot)$gene_name %in% rm_genes),], features = rownames(sce_DEA_hm_plot), order_columns_by = c("clone_id", "p_PLS_DEA_cell_type"), colour_columns_by = c("clone_id", "p_PLS_DEA_cell_type"), @@ -665,7 +682,4 @@ plotHeatmap( zlim = c(-5, 5), main = day, cluster_rows = F, - # order_rows_by = c("gene_order") ) - -