From b68cce25a788f908e345efda0e3da7663aba1ad5 Mon Sep 17 00:00:00 2001 From: Yusuke Koga Date: Fri, 16 Aug 2024 14:09:36 -0400 Subject: [PATCH 1/3] Adding Celda module-based analysis tutorial --- .../articles/SCTK_Modules_Seurat_Tutorial.Rmd | 123 ++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 vignettes/articles/SCTK_Modules_Seurat_Tutorial.Rmd diff --git a/vignettes/articles/SCTK_Modules_Seurat_Tutorial.Rmd b/vignettes/articles/SCTK_Modules_Seurat_Tutorial.Rmd new file mode 100644 index 00000000..78dcfe3e --- /dev/null +++ b/vignettes/articles/SCTK_Modules_Seurat_Tutorial.Rmd @@ -0,0 +1,123 @@ +--- +title: "Celda module analysis tutorial" +author: "Salam AlAbdullatif" +date: "2024-07-19" +output: html_document +--- + +```{r setup, include=TRUE} +suppressPackageStartupMessages({ + #Load library + library(dplyr) + library(Seurat) + library(patchwork) + library(celda) + library(singleCellTK) + library(knitr) + library(kableExtra) + library(ggplot2) + library(dendextend) +}) +knitr::opts_chunk$set(echo = TRUE) + +``` + +## Convert Seurat Object to an SingleCellExperiment (SCE) object (Optional) + +While Celda provides the functionality for cell cluster generation, some users may opt to import cluster labels generated from the popular Seurat pipeline. (https://pubmed.ncbi.nlm.nih.gov/29608179/) We can apply the `convertSeuratToSCE` function contained within the SingleCellTK package for the object conversion. + +```{r, warning=FALSE} +#Seurat pipeline (Taken from https://satijalab.org/seurat/articles/pbmc3k_tutorial): +##Read in 10X output, PBMC 3K +pbmc.data <- Read10X(data.dir = "./filtered_gene_bc_matrices/hg19/") +##Create Seurat object +pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200) +##Normalize data +pbmc <- NormalizeData(pbmc, verbose = FALSE) +##Feature selection +pbmc <- FindVariableFeatures(pbmc, verbose = FALSE) +##Linear transformation +pbmc <- ScaleData(pbmc, verbose = FALSE) +pbmc <- RunPCA(pbmc, verbose = FALSE) +pbmc <- RunUMAP(pbmc, dims = 1:10, verbose = FALSE) +##Generate clusters +pbmc <- FindNeighbors(pbmc, dims = 1:10, verbose = FALSE) +pbmc <- FindClusters(pbmc, resolution = 0.5, verbose = FALSE) +``` + +```{r} +#Convert the Seurat object to a SingleCellExperiment object +pbmcSce <- convertSeuratToSCE(pbmc, normAssayName = "logcounts") +``` + +## Generating feature modules from pre-determined clusters + +Generate Celda feature modules based on cell clusters that you have provided through the `recursiveSplitModule` function: + +```{r} +#Convert Seurat cluster IDs, as Celda requires clusters be numeric vectors starting from 1 (Seurat cluster 0 = Celda cluster 1) +pbmcSce$seurat_clusters <- as.numeric(as.character(pbmcSce$seurat_clusters)) + 1 + +pbmcSce <- selectFeatures(pbmcSce) +#Run recursiveSplitModule; Seurat clusters will be imported in the zInit parameter. +rsmRes <- recursiveSplitModule(x = pbmcSce, useAssay = "counts", altExpName = "featureSubset", initialL = 2, maxL = 30, zInit = pbmcSce$seurat_clusters, verbose = FALSE) +``` + +```{r} +#Identify the optimal L (number of modules) using the elbow plot +plotRPC(rsmRes) +``` + +A heuristic method that may be applied to identify the optimal number of modules. Based on the elbow plot, the rate of the perplexity change levels off around L = 12 ~ 15. For this tutorial, we will choose L = 15 for the number of modules. + +```{r} +#Subset Celda object +sce <- subsetCeldaList(rsmRes, list(L = 15)) +``` + +```{r} +featureTable <- featureModuleTable(sce, useAssay = "counts", altExpName = "featureSubset") + +kable(featureTable, style = "html", row.names = FALSE) %>% + kable_styling(bootstrap_options = "striped") %>% + scroll_box(width = "100%", height = "500px") +``` + +For instance, we can see that each module represents a set of co-expressing features, such as module L7 (B lymphocytes - CD79A, MS4A1, TCL1A) and module L8 (NK cells - NKG7, GNLY) + +## Run DE on Modules + +Differential expression tools can also be used on the gene modules in order to identify the modules that define each cell cluster. To do this, first run the `factorizeMatrix` function, which will generate a matrix that measures the contribution of each gene module to each cell population. + +```{r DE modules, message=FALSE} +factorize <- factorizeMatrix(sce, useAssay = "counts", altExpName = "featureSubset") +### Take the module counts, log-normalize, then use for DE, violin plots, etc +factorizedCounts <- factorize$counts$cell +factorizedLogcounts <- normalizeCounts(factorizedCounts) +reducedDim(sce, "factorizeMatrix") <- t(factorizedLogcounts) +sce <- runFindMarker(sce, useReducedDim = "factorizeMatrix", cluster = "seurat_clusters", useAssay = NULL) +``` + +The differential expression results are contained within `sce@metadata$findMarker`. The `Gene` column denotes the differentially expressed module, whilst the `Log2_FC` column denotes the level of upregulation of the module in the cluster. + +```{r DE module table, message=FALSE} +kable(sce@metadata$findMarker, style = "html", row.names = FALSE) %>% + kable_styling(bootstrap_options = "striped") %>% + scroll_box(width = "100%", height = "500px") +``` + + +## Generate a Decision Tree + +To identify which modules distinguish which clusters, we can use use decision trees implemented in the `findMarkersTree` function. Upon applying the celda algorithm, we will use the generated factorized counts for the modules (a matrix of modules x cells) and cluster labels, as an input to the findMarkersTree function. + +```{r, message=FALSE} +classes <- as.integer(celdaClusters(sce)) # needs to be an integer vector +DecTree <- findMarkersTree(factorizedCounts, classes) +``` + +The output decision tree can be visualized through a dendrogram using the `plotDendro` function: + +```{r, warning=FALSE} +plotDendro(DecTree) +``` From 4c1d843bda2882f286c463f8131d0099769f4e1e Mon Sep 17 00:00:00 2001 From: Yusuke Koga Date: Fri, 16 Aug 2024 16:33:04 -0400 Subject: [PATCH 2/3] Edits to tutorial --- ...utorial.Rmd => seurat_to_celda_pbmc3k.Rmd} | 72 ++++++++++++++----- 1 file changed, 53 insertions(+), 19 deletions(-) rename vignettes/articles/{SCTK_Modules_Seurat_Tutorial.Rmd => seurat_to_celda_pbmc3k.Rmd} (70%) diff --git a/vignettes/articles/SCTK_Modules_Seurat_Tutorial.Rmd b/vignettes/articles/seurat_to_celda_pbmc3k.Rmd similarity index 70% rename from vignettes/articles/SCTK_Modules_Seurat_Tutorial.Rmd rename to vignettes/articles/seurat_to_celda_pbmc3k.Rmd index 78dcfe3e..418dad20 100644 --- a/vignettes/articles/SCTK_Modules_Seurat_Tutorial.Rmd +++ b/vignettes/articles/seurat_to_celda_pbmc3k.Rmd @@ -5,31 +5,32 @@ date: "2024-07-19" output: html_document --- -```{r setup, include=TRUE} -suppressPackageStartupMessages({ - #Load library - library(dplyr) - library(Seurat) - library(patchwork) - library(celda) - library(singleCellTK) - library(knitr) - library(kableExtra) - library(ggplot2) - library(dendextend) -}) +```{r setup, include=TRUE, message=FALSE} +#Load library +library(dplyr) +library(Seurat) +library(patchwork) +library(celda) +library(singleCellTK) +library(knitr) +library(kableExtra) +library(ggplot2) +library(dendextend) + knitr::opts_chunk$set(echo = TRUE) +source("/restricted/projectnb/camplab/projects/20240719_Codathon/Tutorial/findMarkersTree.R") ``` -## Convert Seurat Object to an SingleCellExperiment (SCE) object (Optional) +## Applying the Seurat clustering algorithm (Optional) While Celda provides the functionality for cell cluster generation, some users may opt to import cluster labels generated from the popular Seurat pipeline. (https://pubmed.ncbi.nlm.nih.gov/29608179/) We can apply the `convertSeuratToSCE` function contained within the SingleCellTK package for the object conversion. ```{r, warning=FALSE} #Seurat pipeline (Taken from https://satijalab.org/seurat/articles/pbmc3k_tutorial): ##Read in 10X output, PBMC 3K -pbmc.data <- Read10X(data.dir = "./filtered_gene_bc_matrices/hg19/") +## Check seurat fxn if exists: +pbmc.data <- Read10X(data.dir = "/restricted/projectnb/camplab/projects/20240719_Codathon/Tutorial/filtered_gene_bc_matrices/hg19/") ##Create Seurat object pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200) ##Normalize data @@ -45,6 +46,8 @@ pbmc <- FindNeighbors(pbmc, dims = 1:10, verbose = FALSE) pbmc <- FindClusters(pbmc, resolution = 0.5, verbose = FALSE) ``` +## Convert Seurat Object to an SingleCellExperiment (SCE) object (Optional) + ```{r} #Convert the Seurat object to a SingleCellExperiment object pbmcSce <- convertSeuratToSCE(pbmc, normAssayName = "logcounts") @@ -54,6 +57,12 @@ pbmcSce <- convertSeuratToSCE(pbmc, normAssayName = "logcounts") Generate Celda feature modules based on cell clusters that you have provided through the `recursiveSplitModule` function: +```{r} +useAssay <- "counts" +altExpName <- "featureSubset" +``` + + ```{r} #Convert Seurat cluster IDs, as Celda requires clusters be numeric vectors starting from 1 (Seurat cluster 0 = Celda cluster 1) pbmcSce$seurat_clusters <- as.numeric(as.character(pbmcSce$seurat_clusters)) + 1 @@ -76,7 +85,7 @@ sce <- subsetCeldaList(rsmRes, list(L = 15)) ``` ```{r} -featureTable <- featureModuleTable(sce, useAssay = "counts", altExpName = "featureSubset") +featureTable <- featureModuleTable(sce, useAssay = useAssay, altExpName = altExpName) kable(featureTable, style = "html", row.names = FALSE) %>% kable_styling(bootstrap_options = "striped") %>% @@ -90,7 +99,7 @@ For instance, we can see that each module represents a set of co-expressing feat Differential expression tools can also be used on the gene modules in order to identify the modules that define each cell cluster. To do this, first run the `factorizeMatrix` function, which will generate a matrix that measures the contribution of each gene module to each cell population. ```{r DE modules, message=FALSE} -factorize <- factorizeMatrix(sce, useAssay = "counts", altExpName = "featureSubset") +factorize <- factorizeMatrix(sce, useAssay = useAssay, altExpName = altExpName) ### Take the module counts, log-normalize, then use for DE, violin plots, etc factorizedCounts <- factorize$counts$cell factorizedLogcounts <- normalizeCounts(factorizedCounts) @@ -98,14 +107,39 @@ reducedDim(sce, "factorizeMatrix") <- t(factorizedLogcounts) sce <- runFindMarker(sce, useReducedDim = "factorizeMatrix", cluster = "seurat_clusters", useAssay = NULL) ``` -The differential expression results are contained within `sce@metadata$findMarker`. The `Gene` column denotes the differentially expressed module, whilst the `Log2_FC` column denotes the level of upregulation of the module in the cluster. +The differential expression results are contained within `sce@metadata$findMarker`. The `Gene` column denotes the differentially expressed module, whilst the `Log2_FC` column denotes the level of upregulation of the module in the cluster. These results can be accessed through the `getFindMarkerTopTable` function. + +```{r, eval = FALSE} +getFindMarkerTopTable(sce) +``` + -```{r DE module table, message=FALSE} +```{r DE module table, message=FALSE, echo = FALSE} kable(sce@metadata$findMarker, style = "html", row.names = FALSE) %>% kable_styling(bootstrap_options = "striped") %>% scroll_box(width = "100%", height = "500px") ``` +### Module exploration + +Celda provides several methods for the visualization of the module expression within the dataset. + +#### UMAP + +The function `plotDimReduceModule` can be used visualize the probabilities of a particular module or sets of modules on a reduced dimensional plot such as a UMAP. This can be another quick method to see how modules are expressed across various cells in 2-D space. As an example, we can look at modules L7 and L8: + +```{r, fig.width = 5.5, fig.height = 4.5} +sce <- celdaUmap(sce, useAssay = useAssay, altExpName = altExpName) +plotDimReduceModule(sce, modules = 7:8, useAssay = useAssay, altExpName = altExpName, reducedDimName = "celda_UMAP") +``` + +#### Heatmap + +The function `moduleHeatmap` can be used to view the expression of features across cells for a specific module. + +```{r, fig.width = 5, fig.height = 8} +moduleHeatmap(sce, featureModule = 8, useAssay = useAssay, topFeatures = 25) +``` ## Generate a Decision Tree From 2f548606a4246e46d65991ae53b9b61c06fb0e36 Mon Sep 17 00:00:00 2001 From: Yusuke Koga Date: Fri, 16 Aug 2024 16:55:53 -0400 Subject: [PATCH 3/3] Minor edit to tutorial, getFindMarkerTopTable workaround --- vignettes/articles/seurat_to_celda_pbmc3k.Rmd | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/vignettes/articles/seurat_to_celda_pbmc3k.Rmd b/vignettes/articles/seurat_to_celda_pbmc3k.Rmd index 418dad20..54f3ac03 100644 --- a/vignettes/articles/seurat_to_celda_pbmc3k.Rmd +++ b/vignettes/articles/seurat_to_celda_pbmc3k.Rmd @@ -107,15 +107,11 @@ reducedDim(sce, "factorizeMatrix") <- t(factorizedLogcounts) sce <- runFindMarker(sce, useReducedDim = "factorizeMatrix", cluster = "seurat_clusters", useAssay = NULL) ``` -The differential expression results are contained within `sce@metadata$findMarker`. The `Gene` column denotes the differentially expressed module, whilst the `Log2_FC` column denotes the level of upregulation of the module in the cluster. These results can be accessed through the `getFindMarkerTopTable` function. - -```{r, eval = FALSE} -getFindMarkerTopTable(sce) -``` +The differential expression results are contained within the metadata slot `findMarker`. The `Gene` column denotes the differentially expressed module, whilst the `Log2_FC` column denotes the level of upregulation of the module in the cluster. ```{r DE module table, message=FALSE, echo = FALSE} -kable(sce@metadata$findMarker, style = "html", row.names = FALSE) %>% +kable(metadata(sce)$findMarker, style = "html", row.names = FALSE) %>% kable_styling(bootstrap_options = "striped") %>% scroll_box(width = "100%", height = "500px") ```