From 01b816a2d1749ec4341a6cc26e503ab965303a40 Mon Sep 17 00:00:00 2001 From: Gilquin <laurent.gilquin@ens-lyon.fr> Date: Tue, 8 Nov 2022 09:58:54 +0100 Subject: [PATCH] added scripts folder and test script --- .gitignore | 10 ++- 5hAPF.R => scripts/5hAPF.R | 33 ++++++--- 9396.R => scripts/9396.R | 41 +++++++---- WP.R => scripts/WP.R | 33 ++++++--- analysis.R => scripts/analysis.R | 44 ++++++----- export.R => scripts/export.R | 8 +- ggtheme.R => scripts/ggtheme.R | 0 statistics.R => scripts/statistics.R | 1 + scripts/test.R | 106 +++++++++++++++++++++++++++ utilities.R => scripts/utilities.R | 0 umap.yml => seurat.yml | 2 +- 11 files changed, 223 insertions(+), 55 deletions(-) rename 5hAPF.R => scripts/5hAPF.R (84%) rename 9396.R => scripts/9396.R (81%) rename WP.R => scripts/WP.R (85%) rename analysis.R => scripts/analysis.R (83%) rename export.R => scripts/export.R (98%) rename ggtheme.R => scripts/ggtheme.R (100%) rename statistics.R => scripts/statistics.R (99%) create mode 100644 scripts/test.R rename utilities.R => scripts/utilities.R (100%) rename umap.yml => seurat.yml (95%) diff --git a/.gitignore b/.gitignore index 802b6f8..c4d5e6b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,14 @@ -# specific files +# specific files and folders mnist.R pbmc.R +data/ +results/ +scripts/Python/ + +# files at root +*.pdf +*.pptx +*.xlsm # History files .Rhistory diff --git a/5hAPF.R b/scripts/5hAPF.R similarity index 84% rename from 5hAPF.R rename to scripts/5hAPF.R index c3b2697..a612da9 100644 --- a/5hAPF.R +++ b/scripts/5hAPF.R @@ -29,20 +29,20 @@ import::from("SeuratObject", "Misc<-", .character_only=TRUE) import::from("SeuratDisk", "SaveH5Seurat", .character_only=TRUE) # relative import -import::from("ggtheme.R", "theme_custom", .character_only=TRUE) -import::from("statistics.R", "varPCA", .character_only=TRUE) +import::from("./scripts/ggtheme.R", "theme_custom", .character_only=TRUE) +import::from("./scripts/statistics.R", "varPCA", .character_only=TRUE) # set global ggplot2 theme ggplot2::theme_set(theme_custom()) # load reticulate env -reticulate::use_condaenv("umap", required = TRUE) +reticulate::use_condaenv("seurat", required = TRUE) # path to read raw data -datadir <- "/data" +datadir <- "./data" # path to save figure and data -savedir <- "../results/5hAPF" +savedir <- "./results/5hAPF" # --------------------------------------------------------------- # DATA LOADING AND PREPROCESSING @@ -68,7 +68,7 @@ Sobject <- PercentageFeatureSet( gg <- VlnPlot( Sobject, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3 ) -ggsave(file=paste0(savedir, "_vlnplot_before_QC.svg"), device="svg", gg) +ggsave(file=file.path(savedir, "_vlnplot_before_QC.svg"), device="svg", gg) # empirical filtering based on QC metrics Sobject <- subset(Sobject, subset = nFeature_RNA > 600 & nFeature_RNA < 5000) @@ -82,7 +82,7 @@ Sobject <- subset(Sobject, subset = GFP != 0 | Mef2 != 0 | twi != 0) gg <- VlnPlot( Sobject, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3 ) -ggsave(file=paste0(savedir, "_vlnplot_after_QC.svg"), device="svg", gg) +ggsave(file=file.path(savedir, "_vlnplot_after_QC.svg"), device="svg", gg) # --------------------------------------------------------------- # DATA NORMALIZATION THROUGH SCTRANSFORM @@ -115,7 +115,7 @@ gg <- vfp_labels & scale_y_continuous(trans='log10') & theme_custom() & theme(legend.position=c(0.5, 0.2)) -ggsave(file=paste0(savedir, "top_variable_features.svg"), device = "svg", +ggsave(file=file.path(savedir, "top_variable_features.svg"), device = "svg", width = 10, height = 8, gg) # --------------------------------------------------------------- @@ -135,7 +135,7 @@ ncomp <- ncol(Sobject@reductions$pca) # view the PCA threshold gg <- ElbowPlot(Sobject, ndims=ncomp) & ylab("Variance explained (%)") & theme_custom() -ggsave(file=paste0(savedir, "pca.svg"), device="svg", gg) +ggsave(file=file.path(savedir, "pca.svg"), device="svg", gg) ncomp <- 20 # export PCA data to text file @@ -170,7 +170,7 @@ gg <- clustree::clustree( scale_colour_gradientn( colors = rev(pals::plasma(256)[120:256]) ) -ggsave(file=paste0(savedir, "clustree.svg"), device="svg", +ggsave(file=file.path(savedir, "clustree.svg"), device="svg", width = 8, height = 12, gg) # construct UMAP embedding with custom parameters @@ -188,5 +188,18 @@ write.table( row.names = FALSE, col.names=FALSE ) +# export list of missing genes +sheets <- c( + "TF_with_names", "Secreted_proteins", "GPI_anchored protein", + "multi-pass_transmembrane", "transmembrane_Cterm", "transmembrane_Nterm" +) +for (sheet in sheets){ + features <- readxl::read_excel( + path = file.path(datadir, "List_genes.xlsm"), + sheet = sheet, + col_names = TRUE)$SYMBOL + export_missing_markers(Sobject, features, savedir, prefix = sheet) +} + # save Seurat object SaveH5Seurat(Sobject, filename = file.path(savedir, "5hAPF.h5Seurat")) diff --git a/9396.R b/scripts/9396.R similarity index 81% rename from 9396.R rename to scripts/9396.R index b942af8..eb3dd2e 100644 --- a/9396.R +++ b/scripts/9396.R @@ -29,20 +29,20 @@ import::from("SeuratObject", "Misc<-", .character_only=TRUE) import::from("SeuratDisk", "SaveH5Seurat", .character_only=TRUE) # relative import -import::from("ggtheme.R", "theme_custom", .character_only=TRUE) -import::from("statistics.R", "varPCA", .character_only=TRUE) +import::from("./scripts/ggtheme.R", "theme_custom", .character_only=TRUE) +import::from("./scripts/statistics.R", "varPCA", .character_only=TRUE) # set global ggplot2 theme ggplot2::theme_set(theme_custom()) # load reticulate env -reticulate::use_condaenv("umap", required = TRUE) +reticulate::use_condaenv("seurat", required = TRUE) # path to read raw data -datadir <- "/data" +datadir <- "./data" # path to save figure and data -savedir <- "../results/9396" +savedir <- "./results/9396" # --------------------------------------------------------------- # DATA LOADING AND PREPROCESSING @@ -68,7 +68,7 @@ Sobject <- PercentageFeatureSet( gg <- VlnPlot( Sobject, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3 ) -ggsave(file=paste0(savedir, "_vlnplot_before_QC.svg"), device="svg", gg) +ggsave(file=file.path(savedir, "_vlnplot_before_QC.svg"), device="svg", gg) # empirical filtering based on QC metrics Sobject <- subset(Sobject, subset = nFeature_RNA > 600 & nFeature_RNA < 5000) @@ -82,7 +82,7 @@ Sobject <- subset(Sobject, subset = GFP != 0 | Mef2 != 0 | twi != 0) gg <- VlnPlot( Sobject, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3 ) -ggsave(file=paste0(savedir, "_vlnplot_after_QC.svg"), device="svg", gg) +ggsave(file=file.path(savedir, "_vlnplot_after_QC.svg"), device="svg", gg) # --------------------------------------------------------------- # DATA NORMALIZATION THROUGH SCTRANSFORM @@ -115,7 +115,7 @@ gg <- vfp_labels & scale_y_continuous(trans='log10') & theme_custom() & theme(legend.position=c(0.5, 0.2)) -ggsave(file=paste0(savedir, "top_variable_features.svg"), device = "svg", +ggsave(file=file.path(savedir, "top_variable_features.svg"), device = "svg", width = 10, height = 8, gg) # --------------------------------------------------------------- @@ -135,8 +135,8 @@ ncomp <- ncol(Sobject@reductions$pca) # view the PCA threshold gg <- ElbowPlot(Sobject, ndims=ncomp) & ylab("Variance explained (%)") & theme_custom() -ggsave(file=paste0(savedir, "pca.svg"), device="svg", gg) -ncomp <- 12 +ggsave(file=file.path(savedir, "pca.svg"), device="svg", gg) +ncomp <- 25 # export PCA data to text file write.table( @@ -170,23 +170,36 @@ gg <- clustree::clustree( scale_colour_gradientn( colors = rev(pals::plasma(256)[120:256]) ) -ggsave(file=paste0(savedir, "clustree.svg"), device="svg", +ggsave(file=file.path(savedir, "clustree.svg"), device="svg", width = 8, height = 12, gg) # construct UMAP embedding with custom parameters Sobject <- RunUMAP( Sobject , dims = 1:ncomp, n.neighbors = n_neighbors, metric="cosine", reduction = "pca", umap.method = "umap-learn", n.epochs = 1000, - learning.rate = 1.0, negative.sample.rate = 2, local.connectivity = 2, - set.op.mix.ratio = 1.0, a = 1.0, b = 1.0, seed.use = 1672071369 + learning.rate = 1, negative.sample.rate = 9, local.connectivity = 1, + set.op.mix.ratio = 0.5, min.dist = 0.05, seed.use = 1672071369 ) # export clusters ids to text file write.table( - x = matrix(as.numeric(Sobject@meta.data$SCT_snn_res.0.45)-1, nrow=1), + x = matrix(as.numeric(Sobject@meta.data$SCT_snn_res.0.4)-1, nrow=1), file = file.path(savedir, "clustid.txt"), sep = "\t", row.names = FALSE, col.names=FALSE ) +# export list of missing genes +sheets <- c( + "TF_with_names", "Secreted_proteins", "GPI_anchored protein", + "multi-pass_transmembrane", "transmembrane_Cterm", "transmembrane_Nterm" +) +for (sheet in sheets){ + features <- readxl::read_excel( + path = file.path(datadir, "List_genes.xlsm"), + sheet = sheet, + col_names = TRUE)$SYMBOL + export_missing_markers(Sobject, features, savedir, prefix = sheet) +} + # save Seurat object SaveH5Seurat(Sobject, filename = file.path(savedir, "9396.h5Seurat")) diff --git a/WP.R b/scripts/WP.R similarity index 85% rename from WP.R rename to scripts/WP.R index 530b94d..b204216 100644 --- a/WP.R +++ b/scripts/WP.R @@ -29,20 +29,20 @@ import::from("SeuratObject", "Misc<-", .character_only=TRUE) import::from("SeuratDisk", "SaveH5Seurat", .character_only=TRUE) # relative import -import::from("ggtheme.R", "theme_custom", .character_only=TRUE) -import::from("statistics.R", "varPCA", .character_only=TRUE) +import::from("./scripts/ggtheme.R", "theme_custom", .character_only=TRUE) +import::from("./scripts/statistics.R", "varPCA", .character_only=TRUE) # set global ggplot2 theme ggplot2::theme_set(theme_custom()) # load reticulate env -reticulate::use_condaenv("umap", required = TRUE) +reticulate::use_condaenv("seurat", required = TRUE) # path to read raw data -datadir <- "/data" +datadir <- "./data" # path to save figure and data -savedir <- "../results/WP" +savedir <- "./results/WP" # --------------------------------------------------------------- # DATA LOADING AND PREPROCESSING @@ -68,7 +68,7 @@ Sobject <- PercentageFeatureSet( gg <- VlnPlot( Sobject, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3 ) -ggsave(file=paste0(savedir, "_vlnplot_before_QC.svg"), device="svg", gg) +ggsave(file=file.path(savedir, "_vlnplot_before_QC.svg"), device="svg", gg) # empirical filtering based on QC metrics Sobject <- subset(Sobject, subset = nFeature_RNA > 600 & nFeature_RNA < 5000) @@ -82,7 +82,7 @@ Sobject <- subset(Sobject, subset = GFP != 0 | Mef2 != 0 | twi != 0) gg <- VlnPlot( Sobject, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3 ) -ggsave(file=paste0(savedir, "_vlnplot_after_QC.svg"), device="svg", gg) +ggsave(file=file.path(savedir, "_vlnplot_after_QC.svg"), device="svg", gg) # --------------------------------------------------------------- # DATA NORMALIZATION THROUGH SCTRANSFORM @@ -115,7 +115,7 @@ gg <- vfp_labels & scale_y_continuous(trans='log10') & theme_custom() & theme(legend.position=c(0.5, 0.2)) -ggsave(file=paste0(savedir, "top_variable_features.svg"), device = "svg", +ggsave(file=file.path(savedir, "top_variable_features.svg"), device = "svg", width = 10, height = 8, gg) # --------------------------------------------------------------- @@ -135,7 +135,7 @@ ncomp <- ncol(Sobject@reductions$pca) # view the PCA threshold gg <- ElbowPlot(Sobject, ndims=ncomp) & ylab("Variance explained (%)") & theme_custom() -ggsave(file=paste0(savedir, "pca.svg"), device="svg", gg) +ggsave(file=file.path(savedir, "pca.svg"), device="svg", gg) ncomp <- 14 # export PCA data to text file @@ -170,7 +170,7 @@ gg <- clustree::clustree( scale_colour_gradientn( colors = rev(pals::plasma(256)[120:256]) ) -ggsave(file=paste0(savedir, "clustree.svg"), device="svg", +ggsave(file=file.path(savedir, "clustree.svg"), device="svg", width = 8, height = 12, gg) # construct UMAP embedding with custom parameters @@ -192,5 +192,18 @@ write.table( row.names = FALSE, col.names=FALSE ) +# export list of missing genes +sheets <- c( + "TF_with_names", "Secreted_proteins", "GPI_anchored protein", + "multi-pass_transmembrane", "transmembrane_Cterm", "transmembrane_Nterm" +) +for (sheet in sheets){ + features <- readxl::read_excel( + path = file.path(datadir, "List_genes.xlsm"), + sheet = sheet, + col_names = TRUE)$SYMBOL + export_missing_markers(Sobject, features, savedir, prefix = sheet) +} + # save Seurat object SaveH5Seurat(Sobject, filename = file.path(savedir, "WP.h5Seurat")) diff --git a/analysis.R b/scripts/analysis.R similarity index 83% rename from analysis.R rename to scripts/analysis.R index 7b8ac43..e9b884d 100644 --- a/analysis.R +++ b/scripts/analysis.R @@ -13,7 +13,7 @@ import::from("SeuratDisk", "LoadH5Seurat", .character_only=TRUE) # local imports import::from( - "export.R", + "./scripts/export.R", c( "export_dotplots", "export_embeddings", "export_heatmap", "export_markers", "export_markers_unique", "FindFullMarkers", @@ -21,6 +21,8 @@ import::from( ), .character_only=TRUE ) +import::from("./scripts/ggtheme.R", "theme_custom", .character_only=TRUE) + # ------------------------------------------------------------------------- @@ -28,21 +30,17 @@ import::from( ggplot2::theme_set(theme_custom()) # path to the folder where the raw data are stored -datadir <- "/data" +datadir <- "./data" # path to the folder where to export figures and save data -savedir <- "/results/..." +savedir <- "./results/9396" # Load Seurat object -Sobject <- LoadH5Seurat(file = file.path(savedir, "[...].h5Seurat")) +Sobject <- LoadH5Seurat(file = file.path(savedir, "9396.h5Seurat")) # ------------------------------------------------------------------------- -# choose clustering resolution (the number of clusters) -ident <- "SCT_snn_res.0.35" -Idents(Sobject) <- ident - -# set colormap +# set colormap (a valid string from the pals package) cmap <- "alphabet" # export UMAP embedding with custom coloring based on clustering @@ -55,25 +53,34 @@ export_embeddings(Sobject, res_vec, cmap, savedir, assay="SCT", device = "png") # ------------------------------------------------------------------------- +# choose clustering resolution (the number of clusters) +ident <- "SCT_snn_res.0.4" +Idents(Sobject) <- ident + +# ------------------------------------------------------------------------- + # find all relevant markers with constrained parameters markers <- FindAllMarkers( Sobject, only.pos = TRUE, min.pct = 0.1, min.diff.pct = 0.50, - logfc.threshold = 0.25) + logfc.threshold = 0.25 + ) # save result table export_markers(markers, ident, 10, savedir) # save heatmap of top50 genes top_genes <- head(VariableFeatures(object = Sobject, assay = "SCT"), 50) -plot_heatmap(Sobject, top_genes, "SCT", TRUE, cmap, "top50", savedir, - device = "png") +export_heatmap( + Sobject, top_genes, "SCT", TRUE, cmap, "top50", savedir, device = "png" + ) # ------------------------------------------------------------------------- # visualization of genes: "GFP", "Mef2" and "twi" features <- c("GFP","Mef2","twi") -visualize_markers(Sobject, features, ident, cmap, "GMt", savedir, - device = "png") +visualize_markers( + Sobject, features, ident, cmap, "GMt", savedir, device = "png" + ) # ------------------------------------------------------------------------- @@ -94,8 +101,9 @@ for (sheet in sheets){ path = file.path(datadir, "List_genes.xlsm"), sheet = sheet, col_names = TRUE)$SYMBOL - export_dotplots(Sobject, features, ident, savedir, prefix = sheet, - device = "png") + export_dotplots( + Sobject, features, ident, savedir, prefix = sheet, device = "png" + ) } # ------------------------------------------------------------------------- @@ -108,7 +116,7 @@ export_markers(markers, ident, NULL, savedir, prefix = "full_") top5_genes <- markers %>% dplyr::group_by(cluster) %>% dplyr::top_n(n = 5, wt = avg_log2FC) -plot_heatmap( +export_heatmap( Sobject, top5_genes$gene, "SCT", TRUE, cmap, "top5_perclust", savedir, device = "png" ) @@ -117,7 +125,7 @@ plot_heatmap( top10_genes <- markers %>% dplyr::group_by(cluster) %>% dplyr::top_n(n = 10, wt = avg_log2FC) -plot_heatmap( +export_heatmap( Sobject, top10_genes$gene, "SCT", TRUE, cmap, "top10_perclust", savedir, device = "png" ) diff --git a/export.R b/scripts/export.R similarity index 98% rename from export.R rename to scripts/export.R index 65e27cc..3f8114b 100644 --- a/export.R +++ b/scripts/export.R @@ -15,6 +15,7 @@ import::here( "ggsave", "guide_axis", "scale_color_gradientn", + "scale_color_manual", "scale_fill_gradientn", "scale_x_discrete" ), @@ -490,7 +491,12 @@ export_heatmap <- function( # save the heatmap filepath <- file.path(savedir, subdir, prefix) gg <- suppressWarnings(suppressMessages( - DoHeatmap(object, assay=assay, features=features, group.colors=cols) + DoHeatmap( + object, assay=assay, features=features, group.colors=cols + ) & + scale_color_manual( + values = cols, name = "Identity", na.translate = FALSE + ) )) if (!legend) { gg <- gg + NoLengend()} ggsave(file=paste0(filepath, "_heatmap", ".", device), device = device, diff --git a/ggtheme.R b/scripts/ggtheme.R similarity index 100% rename from ggtheme.R rename to scripts/ggtheme.R diff --git a/statistics.R b/scripts/statistics.R similarity index 99% rename from statistics.R rename to scripts/statistics.R index 05339c0..2830183 100644 --- a/statistics.R +++ b/scripts/statistics.R @@ -6,6 +6,7 @@ import::here("dplyr", c("left_join", "rename"), .character_only=TRUE) import::here("janitor", "adorn_totals", .character_only=TRUE) import::here("magrittr", "%>%", .character_only=TRUE) +import::here("matrixStats", "rowVars", .character_only=TRUE) import::here("glue", "glue_collapse", .character_only=TRUE) import::here( "Seurat", diff --git a/scripts/test.R b/scripts/test.R new file mode 100644 index 0000000..7327ba2 --- /dev/null +++ b/scripts/test.R @@ -0,0 +1,106 @@ +# ------------------------------------------------------------------------- +# Test clustering analysis on pbmc_small dataset +# ------------------------------------------------------------------------- + +# global imports +import::from("magrittr", "%>%", .character_only=TRUE) +import::from( + "Seurat", + c("FindAllMarkers", "FindMarkers", "Idents<-", "VariableFeatures"), + .character_only=TRUE +) +import::from("SeuratDisk", "LoadH5Seurat", .character_only=TRUE) + +# local imports +import::from( + "./scripts/export.R", + c( + "export_dotplots", "export_embeddings", "export_heatmap", + "export_markers", "export_markers_unique", "FindFullMarkers", + "visualize_markers" + ), + .character_only=TRUE +) +import::from("./scripts/ggtheme.R", "theme_custom", .character_only=TRUE) + + +# ------------------------------------------------------------------------- + +# set global ggplot2 theme +ggplot2::theme_set(theme_custom()) + +# path to the folder where the raw data are stored +datadir <- "./data" + +# path to the folder where to export figures and save data +savedir <- "./results/pbmc" + +# Load Seurat object +Sobject <- LoadH5Seurat(file = file.path(savedir, "pbmc_small.h5Seurat")) + +# ------------------------------------------------------------------------- + +# set colormap (a valid string from the pals package) +cmap <- "alphabet" + +# export UMAP embedding with custom coloring based on clustering +res_vec <- colnames(Sobject@meta.data) %>% + strsplit(., split="SCT_snn_res.") %>% + sapply(., function(x) x[2]) %>% + na.omit(.) %>% + as.numeric(.) +export_embeddings(Sobject, res_vec, cmap, savedir, assay="SCT", device = "png") + +# ------------------------------------------------------------------------- + +# choose clustering resolution (the number of clusters) +ident <- "SCT_snn_res.1" +Idents(Sobject) <- ident + +# ------------------------------------------------------------------------- + +# find all relevant markers with constrained parameters +markers <- FindAllMarkers( + Sobject, only.pos = TRUE, min.pct = 0.1, min.diff.pct = 0.50, + logfc.threshold = 0.25 +) + +# save result table +export_markers(markers, ident, 10, savedir) + +# save heatmap of top50 genes +top_genes <- head(VariableFeatures(object = Sobject, assay = "SCT"), 50) +export_heatmap( + Sobject, top_genes, "SCT", TRUE, cmap, "top50", savedir, device = "png" +) + +# ------------------------------------------------------------------------- + +# export table of markers that are mostly expressed in a single cluster +# min_pct = 5 means expressed in at least 5% of the cells of a cluster +# threshold = 1 means expressed in less than 1% of the remaining cells +export_markers_unique(Sobject, ident, savedir, threshold = 1, min_pct = 5) + +# ------------------------------------------------------------------------- + +# get all markers per cluster irregardless of their expression power +markers <- FindFullMarkers(Sobject, ident, savedir) +export_markers(markers, ident, NULL, savedir, prefix = "full_") + +# heatmap of top5 genes per cluster +top5_genes <- markers %>% + dplyr::group_by(cluster) %>% + dplyr::top_n(n = 5, wt = avg_log2FC) +export_heatmap( + Sobject, top5_genes$gene, "SCT", TRUE, cmap, "top5_perclust", savedir, + device = "png" +) + +# heatmap of top10 genes per cluster +top10_genes <- markers %>% + dplyr::group_by(cluster) %>% + dplyr::top_n(n = 10, wt = avg_log2FC) +export_heatmap( + Sobject, top10_genes$gene, "SCT", TRUE, cmap, "top10_perclust", savedir, + device = "png" +) diff --git a/utilities.R b/scripts/utilities.R similarity index 100% rename from utilities.R rename to scripts/utilities.R diff --git a/umap.yml b/seurat.yml similarity index 95% rename from umap.yml rename to seurat.yml index 44ff267..005159e 100644 --- a/umap.yml +++ b/seurat.yml @@ -1,4 +1,4 @@ -name: umap +name: seurat channels: - bokeh - conda-forge -- GitLab