From b68cce25a788f908e345efda0e3da7663aba1ad5 Mon Sep 17 00:00:00 2001 From: Yusuke Koga Date: Fri, 16 Aug 2024 14:09:36 -0400 Subject: [PATCH] Adding Celda module-based analysis tutorial --- .../articles/SCTK_Modules_Seurat_Tutorial.Rmd | 123 ++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 vignettes/articles/SCTK_Modules_Seurat_Tutorial.Rmd diff --git a/vignettes/articles/SCTK_Modules_Seurat_Tutorial.Rmd b/vignettes/articles/SCTK_Modules_Seurat_Tutorial.Rmd new file mode 100644 index 00000000..78dcfe3e --- /dev/null +++ b/vignettes/articles/SCTK_Modules_Seurat_Tutorial.Rmd @@ -0,0 +1,123 @@ +--- +title: "Celda module analysis tutorial" +author: "Salam AlAbdullatif" +date: "2024-07-19" +output: html_document +--- + +```{r setup, include=TRUE} +suppressPackageStartupMessages({ + #Load library + library(dplyr) + library(Seurat) + library(patchwork) + library(celda) + library(singleCellTK) + library(knitr) + library(kableExtra) + library(ggplot2) + library(dendextend) +}) +knitr::opts_chunk$set(echo = TRUE) + +``` + +## Convert Seurat Object to an SingleCellExperiment (SCE) object (Optional) + +While Celda provides the functionality for cell cluster generation, some users may opt to import cluster labels generated from the popular Seurat pipeline. (https://pubmed.ncbi.nlm.nih.gov/29608179/) We can apply the `convertSeuratToSCE` function contained within the SingleCellTK package for the object conversion. + +```{r, warning=FALSE} +#Seurat pipeline (Taken from https://satijalab.org/seurat/articles/pbmc3k_tutorial): +##Read in 10X output, PBMC 3K +pbmc.data <- Read10X(data.dir = "./filtered_gene_bc_matrices/hg19/") +##Create Seurat object +pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200) +##Normalize data +pbmc <- NormalizeData(pbmc, verbose = FALSE) +##Feature selection +pbmc <- FindVariableFeatures(pbmc, verbose = FALSE) +##Linear transformation +pbmc <- ScaleData(pbmc, verbose = FALSE) +pbmc <- RunPCA(pbmc, verbose = FALSE) +pbmc <- RunUMAP(pbmc, dims = 1:10, verbose = FALSE) +##Generate clusters +pbmc <- FindNeighbors(pbmc, dims = 1:10, verbose = FALSE) +pbmc <- FindClusters(pbmc, resolution = 0.5, verbose = FALSE) +``` + +```{r} +#Convert the Seurat object to a SingleCellExperiment object +pbmcSce <- convertSeuratToSCE(pbmc, normAssayName = "logcounts") +``` + +## Generating feature modules from pre-determined clusters + +Generate Celda feature modules based on cell clusters that you have provided through the `recursiveSplitModule` function: + +```{r} +#Convert Seurat cluster IDs, as Celda requires clusters be numeric vectors starting from 1 (Seurat cluster 0 = Celda cluster 1) +pbmcSce$seurat_clusters <- as.numeric(as.character(pbmcSce$seurat_clusters)) + 1 + +pbmcSce <- selectFeatures(pbmcSce) +#Run recursiveSplitModule; Seurat clusters will be imported in the zInit parameter. +rsmRes <- recursiveSplitModule(x = pbmcSce, useAssay = "counts", altExpName = "featureSubset", initialL = 2, maxL = 30, zInit = pbmcSce$seurat_clusters, verbose = FALSE) +``` + +```{r} +#Identify the optimal L (number of modules) using the elbow plot +plotRPC(rsmRes) +``` + +A heuristic method that may be applied to identify the optimal number of modules. Based on the elbow plot, the rate of the perplexity change levels off around L = 12 ~ 15. For this tutorial, we will choose L = 15 for the number of modules. + +```{r} +#Subset Celda object +sce <- subsetCeldaList(rsmRes, list(L = 15)) +``` + +```{r} +featureTable <- featureModuleTable(sce, useAssay = "counts", altExpName = "featureSubset") + +kable(featureTable, style = "html", row.names = FALSE) %>% + kable_styling(bootstrap_options = "striped") %>% + scroll_box(width = "100%", height = "500px") +``` + +For instance, we can see that each module represents a set of co-expressing features, such as module L7 (B lymphocytes - CD79A, MS4A1, TCL1A) and module L8 (NK cells - NKG7, GNLY) + +## Run DE on Modules + +Differential expression tools can also be used on the gene modules in order to identify the modules that define each cell cluster. To do this, first run the `factorizeMatrix` function, which will generate a matrix that measures the contribution of each gene module to each cell population. + +```{r DE modules, message=FALSE} +factorize <- factorizeMatrix(sce, useAssay = "counts", altExpName = "featureSubset") +### Take the module counts, log-normalize, then use for DE, violin plots, etc +factorizedCounts <- factorize$counts$cell +factorizedLogcounts <- normalizeCounts(factorizedCounts) +reducedDim(sce, "factorizeMatrix") <- t(factorizedLogcounts) +sce <- runFindMarker(sce, useReducedDim = "factorizeMatrix", cluster = "seurat_clusters", useAssay = NULL) +``` + +The differential expression results are contained within `sce@metadata$findMarker`. The `Gene` column denotes the differentially expressed module, whilst the `Log2_FC` column denotes the level of upregulation of the module in the cluster. + +```{r DE module table, message=FALSE} +kable(sce@metadata$findMarker, style = "html", row.names = FALSE) %>% + kable_styling(bootstrap_options = "striped") %>% + scroll_box(width = "100%", height = "500px") +``` + + +## Generate a Decision Tree + +To identify which modules distinguish which clusters, we can use use decision trees implemented in the `findMarkersTree` function. Upon applying the celda algorithm, we will use the generated factorized counts for the modules (a matrix of modules x cells) and cluster labels, as an input to the findMarkersTree function. + +```{r, message=FALSE} +classes <- as.integer(celdaClusters(sce)) # needs to be an integer vector +DecTree <- findMarkersTree(factorizedCounts, classes) +``` + +The output decision tree can be visualized through a dendrogram using the `plotDendro` function: + +```{r, warning=FALSE} +plotDendro(DecTree) +```