diff --git a/.gitignore b/.gitignore index 802b6f83e3879ecf30ba1e659d5be3dfed8f5f2a..c4d5e6be2b304ff01f928ff55799ec62e465c8d0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,14 @@ -# specific files +# specific files and folders mnist.R pbmc.R +data/ +results/ +scripts/Python/ + +# files at root +*.pdf +*.pptx +*.xlsm # History files .Rhistory diff --git a/5hAPF.R b/scripts/5hAPF.R similarity index 84% rename from 5hAPF.R rename to scripts/5hAPF.R index c3b26979938137ff12cdf6d0bcff741dee0b3651..a612da93de939f65d179c7a146253990ecfd9d91 100644 --- a/5hAPF.R +++ b/scripts/5hAPF.R @@ -29,20 +29,20 @@ import::from("SeuratObject", "Misc<-", .character_only=TRUE) import::from("SeuratDisk", "SaveH5Seurat", .character_only=TRUE) # relative import -import::from("ggtheme.R", "theme_custom", .character_only=TRUE) -import::from("statistics.R", "varPCA", .character_only=TRUE) +import::from("./scripts/ggtheme.R", "theme_custom", .character_only=TRUE) +import::from("./scripts/statistics.R", "varPCA", .character_only=TRUE) # set global ggplot2 theme ggplot2::theme_set(theme_custom()) # load reticulate env -reticulate::use_condaenv("umap", required = TRUE) +reticulate::use_condaenv("seurat", required = TRUE) # path to read raw data -datadir <- "/data" +datadir <- "./data" # path to save figure and data -savedir <- "../results/5hAPF" +savedir <- "./results/5hAPF" # --------------------------------------------------------------- # DATA LOADING AND PREPROCESSING @@ -68,7 +68,7 @@ Sobject <- PercentageFeatureSet( gg <- VlnPlot( Sobject, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3 ) -ggsave(file=paste0(savedir, "_vlnplot_before_QC.svg"), device="svg", gg) +ggsave(file=file.path(savedir, "_vlnplot_before_QC.svg"), device="svg", gg) # empirical filtering based on QC metrics Sobject <- subset(Sobject, subset = nFeature_RNA > 600 & nFeature_RNA < 5000) @@ -82,7 +82,7 @@ Sobject <- subset(Sobject, subset = GFP != 0 | Mef2 != 0 | twi != 0) gg <- VlnPlot( Sobject, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3 ) -ggsave(file=paste0(savedir, "_vlnplot_after_QC.svg"), device="svg", gg) +ggsave(file=file.path(savedir, "_vlnplot_after_QC.svg"), device="svg", gg) # --------------------------------------------------------------- # DATA NORMALIZATION THROUGH SCTRANSFORM @@ -115,7 +115,7 @@ gg <- vfp_labels & scale_y_continuous(trans='log10') & theme_custom() & theme(legend.position=c(0.5, 0.2)) -ggsave(file=paste0(savedir, "top_variable_features.svg"), device = "svg", +ggsave(file=file.path(savedir, "top_variable_features.svg"), device = "svg", width = 10, height = 8, gg) # --------------------------------------------------------------- @@ -135,7 +135,7 @@ ncomp <- ncol(Sobject@reductions$pca) # view the PCA threshold gg <- ElbowPlot(Sobject, ndims=ncomp) & ylab("Variance explained (%)") & theme_custom() -ggsave(file=paste0(savedir, "pca.svg"), device="svg", gg) +ggsave(file=file.path(savedir, "pca.svg"), device="svg", gg) ncomp <- 20 # export PCA data to text file @@ -170,7 +170,7 @@ gg <- clustree::clustree( scale_colour_gradientn( colors = rev(pals::plasma(256)[120:256]) ) -ggsave(file=paste0(savedir, "clustree.svg"), device="svg", +ggsave(file=file.path(savedir, "clustree.svg"), device="svg", width = 8, height = 12, gg) # construct UMAP embedding with custom parameters @@ -188,5 +188,18 @@ write.table( row.names = FALSE, col.names=FALSE ) +# export list of missing genes +sheets <- c( + "TF_with_names", "Secreted_proteins", "GPI_anchored protein", + "multi-pass_transmembrane", "transmembrane_Cterm", "transmembrane_Nterm" +) +for (sheet in sheets){ + features <- readxl::read_excel( + path = file.path(datadir, "List_genes.xlsm"), + sheet = sheet, + col_names = TRUE)$SYMBOL + export_missing_markers(Sobject, features, savedir, prefix = sheet) +} + # save Seurat object SaveH5Seurat(Sobject, filename = file.path(savedir, "5hAPF.h5Seurat")) diff --git a/9396.R b/scripts/9396.R similarity index 81% rename from 9396.R rename to scripts/9396.R index b942af82cfe5601f2ccfc237d0c2ea56d2a7447d..eb3dd2e20e9140f0c3b3144e27e3fa0748739a0e 100644 --- a/9396.R +++ b/scripts/9396.R @@ -29,20 +29,20 @@ import::from("SeuratObject", "Misc<-", .character_only=TRUE) import::from("SeuratDisk", "SaveH5Seurat", .character_only=TRUE) # relative import -import::from("ggtheme.R", "theme_custom", .character_only=TRUE) -import::from("statistics.R", "varPCA", .character_only=TRUE) +import::from("./scripts/ggtheme.R", "theme_custom", .character_only=TRUE) +import::from("./scripts/statistics.R", "varPCA", .character_only=TRUE) # set global ggplot2 theme ggplot2::theme_set(theme_custom()) # load reticulate env -reticulate::use_condaenv("umap", required = TRUE) +reticulate::use_condaenv("seurat", required = TRUE) # path to read raw data -datadir <- "/data" +datadir <- "./data" # path to save figure and data -savedir <- "../results/9396" +savedir <- "./results/9396" # --------------------------------------------------------------- # DATA LOADING AND PREPROCESSING @@ -68,7 +68,7 @@ Sobject <- PercentageFeatureSet( gg <- VlnPlot( Sobject, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3 ) -ggsave(file=paste0(savedir, "_vlnplot_before_QC.svg"), device="svg", gg) +ggsave(file=file.path(savedir, "_vlnplot_before_QC.svg"), device="svg", gg) # empirical filtering based on QC metrics Sobject <- subset(Sobject, subset = nFeature_RNA > 600 & nFeature_RNA < 5000) @@ -82,7 +82,7 @@ Sobject <- subset(Sobject, subset = GFP != 0 | Mef2 != 0 | twi != 0) gg <- VlnPlot( Sobject, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3 ) -ggsave(file=paste0(savedir, "_vlnplot_after_QC.svg"), device="svg", gg) +ggsave(file=file.path(savedir, "_vlnplot_after_QC.svg"), device="svg", gg) # --------------------------------------------------------------- # DATA NORMALIZATION THROUGH SCTRANSFORM @@ -115,7 +115,7 @@ gg <- vfp_labels & scale_y_continuous(trans='log10') & theme_custom() & theme(legend.position=c(0.5, 0.2)) -ggsave(file=paste0(savedir, "top_variable_features.svg"), device = "svg", +ggsave(file=file.path(savedir, "top_variable_features.svg"), device = "svg", width = 10, height = 8, gg) # --------------------------------------------------------------- @@ -135,8 +135,8 @@ ncomp <- ncol(Sobject@reductions$pca) # view the PCA threshold gg <- ElbowPlot(Sobject, ndims=ncomp) & ylab("Variance explained (%)") & theme_custom() -ggsave(file=paste0(savedir, "pca.svg"), device="svg", gg) -ncomp <- 12 +ggsave(file=file.path(savedir, "pca.svg"), device="svg", gg) +ncomp <- 25 # export PCA data to text file write.table( @@ -170,23 +170,36 @@ gg <- clustree::clustree( scale_colour_gradientn( colors = rev(pals::plasma(256)[120:256]) ) -ggsave(file=paste0(savedir, "clustree.svg"), device="svg", +ggsave(file=file.path(savedir, "clustree.svg"), device="svg", width = 8, height = 12, gg) # construct UMAP embedding with custom parameters Sobject <- RunUMAP( Sobject , dims = 1:ncomp, n.neighbors = n_neighbors, metric="cosine", reduction = "pca", umap.method = "umap-learn", n.epochs = 1000, - learning.rate = 1.0, negative.sample.rate = 2, local.connectivity = 2, - set.op.mix.ratio = 1.0, a = 1.0, b = 1.0, seed.use = 1672071369 + learning.rate = 1, negative.sample.rate = 9, local.connectivity = 1, + set.op.mix.ratio = 0.5, min.dist = 0.05, seed.use = 1672071369 ) # export clusters ids to text file write.table( - x = matrix(as.numeric(Sobject@meta.data$SCT_snn_res.0.45)-1, nrow=1), + x = matrix(as.numeric(Sobject@meta.data$SCT_snn_res.0.4)-1, nrow=1), file = file.path(savedir, "clustid.txt"), sep = "\t", row.names = FALSE, col.names=FALSE ) +# export list of missing genes +sheets <- c( + "TF_with_names", "Secreted_proteins", "GPI_anchored protein", + "multi-pass_transmembrane", "transmembrane_Cterm", "transmembrane_Nterm" +) +for (sheet in sheets){ + features <- readxl::read_excel( + path = file.path(datadir, "List_genes.xlsm"), + sheet = sheet, + col_names = TRUE)$SYMBOL + export_missing_markers(Sobject, features, savedir, prefix = sheet) +} + # save Seurat object SaveH5Seurat(Sobject, filename = file.path(savedir, "9396.h5Seurat")) diff --git a/WP.R b/scripts/WP.R similarity index 85% rename from WP.R rename to scripts/WP.R index 530b94dfd8885dc1b1d537406aa4472fd320e2a5..b2042163d1997beb82457da2ef0485c77c66db90 100644 --- a/WP.R +++ b/scripts/WP.R @@ -29,20 +29,20 @@ import::from("SeuratObject", "Misc<-", .character_only=TRUE) import::from("SeuratDisk", "SaveH5Seurat", .character_only=TRUE) # relative import -import::from("ggtheme.R", "theme_custom", .character_only=TRUE) -import::from("statistics.R", "varPCA", .character_only=TRUE) +import::from("./scripts/ggtheme.R", "theme_custom", .character_only=TRUE) +import::from("./scripts/statistics.R", "varPCA", .character_only=TRUE) # set global ggplot2 theme ggplot2::theme_set(theme_custom()) # load reticulate env -reticulate::use_condaenv("umap", required = TRUE) +reticulate::use_condaenv("seurat", required = TRUE) # path to read raw data -datadir <- "/data" +datadir <- "./data" # path to save figure and data -savedir <- "../results/WP" +savedir <- "./results/WP" # --------------------------------------------------------------- # DATA LOADING AND PREPROCESSING @@ -68,7 +68,7 @@ Sobject <- PercentageFeatureSet( gg <- VlnPlot( Sobject, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3 ) -ggsave(file=paste0(savedir, "_vlnplot_before_QC.svg"), device="svg", gg) +ggsave(file=file.path(savedir, "_vlnplot_before_QC.svg"), device="svg", gg) # empirical filtering based on QC metrics Sobject <- subset(Sobject, subset = nFeature_RNA > 600 & nFeature_RNA < 5000) @@ -82,7 +82,7 @@ Sobject <- subset(Sobject, subset = GFP != 0 | Mef2 != 0 | twi != 0) gg <- VlnPlot( Sobject, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3 ) -ggsave(file=paste0(savedir, "_vlnplot_after_QC.svg"), device="svg", gg) +ggsave(file=file.path(savedir, "_vlnplot_after_QC.svg"), device="svg", gg) # --------------------------------------------------------------- # DATA NORMALIZATION THROUGH SCTRANSFORM @@ -115,7 +115,7 @@ gg <- vfp_labels & scale_y_continuous(trans='log10') & theme_custom() & theme(legend.position=c(0.5, 0.2)) -ggsave(file=paste0(savedir, "top_variable_features.svg"), device = "svg", +ggsave(file=file.path(savedir, "top_variable_features.svg"), device = "svg", width = 10, height = 8, gg) # --------------------------------------------------------------- @@ -135,7 +135,7 @@ ncomp <- ncol(Sobject@reductions$pca) # view the PCA threshold gg <- ElbowPlot(Sobject, ndims=ncomp) & ylab("Variance explained (%)") & theme_custom() -ggsave(file=paste0(savedir, "pca.svg"), device="svg", gg) +ggsave(file=file.path(savedir, "pca.svg"), device="svg", gg) ncomp <- 14 # export PCA data to text file @@ -170,7 +170,7 @@ gg <- clustree::clustree( scale_colour_gradientn( colors = rev(pals::plasma(256)[120:256]) ) -ggsave(file=paste0(savedir, "clustree.svg"), device="svg", +ggsave(file=file.path(savedir, "clustree.svg"), device="svg", width = 8, height = 12, gg) # construct UMAP embedding with custom parameters @@ -192,5 +192,18 @@ write.table( row.names = FALSE, col.names=FALSE ) +# export list of missing genes +sheets <- c( + "TF_with_names", "Secreted_proteins", "GPI_anchored protein", + "multi-pass_transmembrane", "transmembrane_Cterm", "transmembrane_Nterm" +) +for (sheet in sheets){ + features <- readxl::read_excel( + path = file.path(datadir, "List_genes.xlsm"), + sheet = sheet, + col_names = TRUE)$SYMBOL + export_missing_markers(Sobject, features, savedir, prefix = sheet) +} + # save Seurat object SaveH5Seurat(Sobject, filename = file.path(savedir, "WP.h5Seurat")) diff --git a/analysis.R b/scripts/analysis.R similarity index 83% rename from analysis.R rename to scripts/analysis.R index 7b8ac4309beaff6e5fdcbdb2dfbcddfc73478a43..e9b884d31caad087186ae849c7c7e3f3e953e08a 100644 --- a/analysis.R +++ b/scripts/analysis.R @@ -13,7 +13,7 @@ import::from("SeuratDisk", "LoadH5Seurat", .character_only=TRUE) # local imports import::from( - "export.R", + "./scripts/export.R", c( "export_dotplots", "export_embeddings", "export_heatmap", "export_markers", "export_markers_unique", "FindFullMarkers", @@ -21,6 +21,8 @@ import::from( ), .character_only=TRUE ) +import::from("./scripts/ggtheme.R", "theme_custom", .character_only=TRUE) + # ------------------------------------------------------------------------- @@ -28,21 +30,17 @@ import::from( ggplot2::theme_set(theme_custom()) # path to the folder where the raw data are stored -datadir <- "/data" +datadir <- "./data" # path to the folder where to export figures and save data -savedir <- "/results/..." +savedir <- "./results/9396" # Load Seurat object -Sobject <- LoadH5Seurat(file = file.path(savedir, "[...].h5Seurat")) +Sobject <- LoadH5Seurat(file = file.path(savedir, "9396.h5Seurat")) # ------------------------------------------------------------------------- -# choose clustering resolution (the number of clusters) -ident <- "SCT_snn_res.0.35" -Idents(Sobject) <- ident - -# set colormap +# set colormap (a valid string from the pals package) cmap <- "alphabet" # export UMAP embedding with custom coloring based on clustering @@ -55,25 +53,34 @@ export_embeddings(Sobject, res_vec, cmap, savedir, assay="SCT", device = "png") # ------------------------------------------------------------------------- +# choose clustering resolution (the number of clusters) +ident <- "SCT_snn_res.0.4" +Idents(Sobject) <- ident + +# ------------------------------------------------------------------------- + # find all relevant markers with constrained parameters markers <- FindAllMarkers( Sobject, only.pos = TRUE, min.pct = 0.1, min.diff.pct = 0.50, - logfc.threshold = 0.25) + logfc.threshold = 0.25 + ) # save result table export_markers(markers, ident, 10, savedir) # save heatmap of top50 genes top_genes <- head(VariableFeatures(object = Sobject, assay = "SCT"), 50) -plot_heatmap(Sobject, top_genes, "SCT", TRUE, cmap, "top50", savedir, - device = "png") +export_heatmap( + Sobject, top_genes, "SCT", TRUE, cmap, "top50", savedir, device = "png" + ) # ------------------------------------------------------------------------- # visualization of genes: "GFP", "Mef2" and "twi" features <- c("GFP","Mef2","twi") -visualize_markers(Sobject, features, ident, cmap, "GMt", savedir, - device = "png") +visualize_markers( + Sobject, features, ident, cmap, "GMt", savedir, device = "png" + ) # ------------------------------------------------------------------------- @@ -94,8 +101,9 @@ for (sheet in sheets){ path = file.path(datadir, "List_genes.xlsm"), sheet = sheet, col_names = TRUE)$SYMBOL - export_dotplots(Sobject, features, ident, savedir, prefix = sheet, - device = "png") + export_dotplots( + Sobject, features, ident, savedir, prefix = sheet, device = "png" + ) } # ------------------------------------------------------------------------- @@ -108,7 +116,7 @@ export_markers(markers, ident, NULL, savedir, prefix = "full_") top5_genes <- markers %>% dplyr::group_by(cluster) %>% dplyr::top_n(n = 5, wt = avg_log2FC) -plot_heatmap( +export_heatmap( Sobject, top5_genes$gene, "SCT", TRUE, cmap, "top5_perclust", savedir, device = "png" ) @@ -117,7 +125,7 @@ plot_heatmap( top10_genes <- markers %>% dplyr::group_by(cluster) %>% dplyr::top_n(n = 10, wt = avg_log2FC) -plot_heatmap( +export_heatmap( Sobject, top10_genes$gene, "SCT", TRUE, cmap, "top10_perclust", savedir, device = "png" ) diff --git a/export.R b/scripts/export.R similarity index 98% rename from export.R rename to scripts/export.R index 65e27cc1744effb850dee79f30be10cad418d423..3f8114bbf8d16610cc93145dc06dd4c0ee1f9c94 100644 --- a/export.R +++ b/scripts/export.R @@ -15,6 +15,7 @@ import::here( "ggsave", "guide_axis", "scale_color_gradientn", + "scale_color_manual", "scale_fill_gradientn", "scale_x_discrete" ), @@ -490,7 +491,12 @@ export_heatmap <- function( # save the heatmap filepath <- file.path(savedir, subdir, prefix) gg <- suppressWarnings(suppressMessages( - DoHeatmap(object, assay=assay, features=features, group.colors=cols) + DoHeatmap( + object, assay=assay, features=features, group.colors=cols + ) & + scale_color_manual( + values = cols, name = "Identity", na.translate = FALSE + ) )) if (!legend) { gg <- gg + NoLengend()} ggsave(file=paste0(filepath, "_heatmap", ".", device), device = device, diff --git a/ggtheme.R b/scripts/ggtheme.R similarity index 100% rename from ggtheme.R rename to scripts/ggtheme.R diff --git a/statistics.R b/scripts/statistics.R similarity index 99% rename from statistics.R rename to scripts/statistics.R index 05339c0dfd3f6f42f5d0b17367b31035052fe767..28301836c077bdf7f3f1326580697a8f257f7969 100644 --- a/statistics.R +++ b/scripts/statistics.R @@ -6,6 +6,7 @@ import::here("dplyr", c("left_join", "rename"), .character_only=TRUE) import::here("janitor", "adorn_totals", .character_only=TRUE) import::here("magrittr", "%>%", .character_only=TRUE) +import::here("matrixStats", "rowVars", .character_only=TRUE) import::here("glue", "glue_collapse", .character_only=TRUE) import::here( "Seurat", diff --git a/scripts/test.R b/scripts/test.R new file mode 100644 index 0000000000000000000000000000000000000000..7327ba2653357cff97f52efb1ac2e8962db31e47 --- /dev/null +++ b/scripts/test.R @@ -0,0 +1,106 @@ +# ------------------------------------------------------------------------- +# Test clustering analysis on pbmc_small dataset +# ------------------------------------------------------------------------- + +# global imports +import::from("magrittr", "%>%", .character_only=TRUE) +import::from( + "Seurat", + c("FindAllMarkers", "FindMarkers", "Idents<-", "VariableFeatures"), + .character_only=TRUE +) +import::from("SeuratDisk", "LoadH5Seurat", .character_only=TRUE) + +# local imports +import::from( + "./scripts/export.R", + c( + "export_dotplots", "export_embeddings", "export_heatmap", + "export_markers", "export_markers_unique", "FindFullMarkers", + "visualize_markers" + ), + .character_only=TRUE +) +import::from("./scripts/ggtheme.R", "theme_custom", .character_only=TRUE) + + +# ------------------------------------------------------------------------- + +# set global ggplot2 theme +ggplot2::theme_set(theme_custom()) + +# path to the folder where the raw data are stored +datadir <- "./data" + +# path to the folder where to export figures and save data +savedir <- "./results/pbmc" + +# Load Seurat object +Sobject <- LoadH5Seurat(file = file.path(savedir, "pbmc_small.h5Seurat")) + +# ------------------------------------------------------------------------- + +# set colormap (a valid string from the pals package) +cmap <- "alphabet" + +# export UMAP embedding with custom coloring based on clustering +res_vec <- colnames(Sobject@meta.data) %>% + strsplit(., split="SCT_snn_res.") %>% + sapply(., function(x) x[2]) %>% + na.omit(.) %>% + as.numeric(.) +export_embeddings(Sobject, res_vec, cmap, savedir, assay="SCT", device = "png") + +# ------------------------------------------------------------------------- + +# choose clustering resolution (the number of clusters) +ident <- "SCT_snn_res.1" +Idents(Sobject) <- ident + +# ------------------------------------------------------------------------- + +# find all relevant markers with constrained parameters +markers <- FindAllMarkers( + Sobject, only.pos = TRUE, min.pct = 0.1, min.diff.pct = 0.50, + logfc.threshold = 0.25 +) + +# save result table +export_markers(markers, ident, 10, savedir) + +# save heatmap of top50 genes +top_genes <- head(VariableFeatures(object = Sobject, assay = "SCT"), 50) +export_heatmap( + Sobject, top_genes, "SCT", TRUE, cmap, "top50", savedir, device = "png" +) + +# ------------------------------------------------------------------------- + +# export table of markers that are mostly expressed in a single cluster +# min_pct = 5 means expressed in at least 5% of the cells of a cluster +# threshold = 1 means expressed in less than 1% of the remaining cells +export_markers_unique(Sobject, ident, savedir, threshold = 1, min_pct = 5) + +# ------------------------------------------------------------------------- + +# get all markers per cluster irregardless of their expression power +markers <- FindFullMarkers(Sobject, ident, savedir) +export_markers(markers, ident, NULL, savedir, prefix = "full_") + +# heatmap of top5 genes per cluster +top5_genes <- markers %>% + dplyr::group_by(cluster) %>% + dplyr::top_n(n = 5, wt = avg_log2FC) +export_heatmap( + Sobject, top5_genes$gene, "SCT", TRUE, cmap, "top5_perclust", savedir, + device = "png" +) + +# heatmap of top10 genes per cluster +top10_genes <- markers %>% + dplyr::group_by(cluster) %>% + dplyr::top_n(n = 10, wt = avg_log2FC) +export_heatmap( + Sobject, top10_genes$gene, "SCT", TRUE, cmap, "top10_perclust", savedir, + device = "png" +) diff --git a/utilities.R b/scripts/utilities.R similarity index 100% rename from utilities.R rename to scripts/utilities.R diff --git a/umap.yml b/seurat.yml similarity index 95% rename from umap.yml rename to seurat.yml index 44ff2670a956162a92b4b22077a1e9b8a446cd7a..005159e3950ceb9540b559271b808fd23f051394 100644 --- a/umap.yml +++ b/seurat.yml @@ -1,4 +1,4 @@ -name: umap +name: seurat channels: - bokeh - conda-forge