From b68cce25a788f908e345efda0e3da7663aba1ad5 Mon Sep 17 00:00:00 2001
From: Yusuke Koga <ykoga07@bu.edu>
Date: Fri, 16 Aug 2024 14:09:36 -0400
Subject: [PATCH 1/3] Adding Celda module-based analysis tutorial

---
 .../articles/SCTK_Modules_Seurat_Tutorial.Rmd | 123 ++++++++++++++++++
 1 file changed, 123 insertions(+)
 create mode 100644 vignettes/articles/SCTK_Modules_Seurat_Tutorial.Rmd

diff --git a/vignettes/articles/SCTK_Modules_Seurat_Tutorial.Rmd b/vignettes/articles/SCTK_Modules_Seurat_Tutorial.Rmd
new file mode 100644
index 00000000..78dcfe3e
--- /dev/null
+++ b/vignettes/articles/SCTK_Modules_Seurat_Tutorial.Rmd
@@ -0,0 +1,123 @@
+---
+title: "Celda module analysis tutorial"
+author: "Salam AlAbdullatif"
+date: "2024-07-19"
+output: html_document
+---
+
+```{r setup, include=TRUE}
+suppressPackageStartupMessages({
+    #Load library
+    library(dplyr)
+    library(Seurat)
+    library(patchwork)
+    library(celda)
+    library(singleCellTK)
+    library(knitr)
+    library(kableExtra)
+    library(ggplot2)
+    library(dendextend)
+})
+knitr::opts_chunk$set(echo = TRUE)
+
+```
+
+## Convert Seurat Object to an SingleCellExperiment (SCE) object (Optional)
+
+While Celda provides the functionality for cell cluster generation, some users may opt to import cluster labels generated from the popular Seurat pipeline. (https://pubmed.ncbi.nlm.nih.gov/29608179/) We can apply the `convertSeuratToSCE` function contained within the SingleCellTK package for the object conversion.
+
+```{r, warning=FALSE}
+#Seurat pipeline (Taken from https://satijalab.org/seurat/articles/pbmc3k_tutorial):
+##Read in 10X output, PBMC 3K
+pbmc.data <- Read10X(data.dir = "./filtered_gene_bc_matrices/hg19/")
+##Create Seurat object
+pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)
+##Normalize data
+pbmc <- NormalizeData(pbmc, verbose = FALSE)
+##Feature selection
+pbmc <- FindVariableFeatures(pbmc, verbose = FALSE)
+##Linear transformation
+pbmc <- ScaleData(pbmc, verbose = FALSE)
+pbmc <- RunPCA(pbmc, verbose = FALSE)
+pbmc <- RunUMAP(pbmc, dims = 1:10, verbose = FALSE)
+##Generate clusters
+pbmc <- FindNeighbors(pbmc, dims = 1:10, verbose = FALSE)
+pbmc <- FindClusters(pbmc, resolution = 0.5, verbose = FALSE)
+```
+
+```{r}
+#Convert the Seurat object to a SingleCellExperiment object 
+pbmcSce <- convertSeuratToSCE(pbmc, normAssayName = "logcounts")
+```
+
+## Generating feature modules from pre-determined clusters
+
+Generate Celda feature modules based on cell clusters that you have provided through the `recursiveSplitModule` function:
+
+```{r}
+#Convert Seurat cluster IDs, as Celda requires clusters be numeric vectors starting from 1 (Seurat cluster 0 = Celda cluster 1)
+pbmcSce$seurat_clusters <- as.numeric(as.character(pbmcSce$seurat_clusters)) + 1
+
+pbmcSce <- selectFeatures(pbmcSce)
+#Run recursiveSplitModule; Seurat clusters will be imported in the zInit parameter.
+rsmRes <- recursiveSplitModule(x = pbmcSce, useAssay = "counts", altExpName = "featureSubset", initialL = 2, maxL = 30, zInit = pbmcSce$seurat_clusters, verbose = FALSE)
+```
+
+```{r}
+#Identify the optimal L (number of modules) using the elbow plot
+plotRPC(rsmRes)
+```
+
+A heuristic method that may be applied to identify the optimal number of modules. Based on the elbow plot, the rate of the perplexity change levels off around L = 12 ~ 15. For this tutorial, we will choose L = 15 for the number of modules.
+
+```{r}
+#Subset Celda object
+sce <- subsetCeldaList(rsmRes, list(L = 15))
+```
+
+```{r}
+featureTable <- featureModuleTable(sce, useAssay = "counts", altExpName = "featureSubset")
+
+kable(featureTable, style = "html", row.names = FALSE) %>%
+                 kable_styling(bootstrap_options = "striped") %>%
+                 scroll_box(width = "100%", height = "500px")
+```
+
+For instance, we can see that each module represents a set of co-expressing features, such as module L7 (B lymphocytes - CD79A, MS4A1, TCL1A) and module L8 (NK cells - NKG7, GNLY)
+
+## Run DE on Modules
+
+Differential expression tools can also be used on the gene modules in order to identify the modules that define each cell cluster. To do this, first run the `factorizeMatrix` function, which will generate a matrix that measures the contribution of each gene module to each cell population.
+
+```{r DE modules, message=FALSE}
+factorize <- factorizeMatrix(sce, useAssay = "counts", altExpName = "featureSubset")
+### Take the module counts, log-normalize, then use for DE, violin plots, etc
+factorizedCounts <- factorize$counts$cell
+factorizedLogcounts <- normalizeCounts(factorizedCounts)
+reducedDim(sce, "factorizeMatrix") <- t(factorizedLogcounts)
+sce <- runFindMarker(sce, useReducedDim = "factorizeMatrix", cluster = "seurat_clusters", useAssay = NULL)
+```
+
+The differential expression results are contained within `sce@metadata$findMarker`. The `Gene` column denotes the differentially expressed module, whilst the `Log2_FC` column denotes the level of upregulation of the module in the cluster.
+
+```{r DE module table, message=FALSE}
+kable(sce@metadata$findMarker, style = "html", row.names = FALSE) %>%
+                 kable_styling(bootstrap_options = "striped") %>%
+                 scroll_box(width = "100%", height = "500px")
+```
+
+
+## Generate a Decision Tree
+
+To identify which modules distinguish which clusters, we can use use decision trees implemented in the `findMarkersTree` function. Upon applying the celda algorithm, we will use the generated factorized counts for the modules (a matrix of modules x cells) and cluster labels, as an input to the findMarkersTree function. 
+
+```{r, message=FALSE}
+classes <- as.integer(celdaClusters(sce)) # needs to be an integer vector 
+DecTree <- findMarkersTree(factorizedCounts, classes)
+```
+
+The output decision tree can be visualized through a dendrogram using the `plotDendro` function:
+
+```{r, warning=FALSE}
+plotDendro(DecTree)
+```

From 4c1d843bda2882f286c463f8131d0099769f4e1e Mon Sep 17 00:00:00 2001
From: Yusuke Koga <ykoga07@bu.edu>
Date: Fri, 16 Aug 2024 16:33:04 -0400
Subject: [PATCH 2/3] Edits to tutorial

---
 ...utorial.Rmd => seurat_to_celda_pbmc3k.Rmd} | 72 ++++++++++++++-----
 1 file changed, 53 insertions(+), 19 deletions(-)
 rename vignettes/articles/{SCTK_Modules_Seurat_Tutorial.Rmd => seurat_to_celda_pbmc3k.Rmd} (70%)

diff --git a/vignettes/articles/SCTK_Modules_Seurat_Tutorial.Rmd b/vignettes/articles/seurat_to_celda_pbmc3k.Rmd
similarity index 70%
rename from vignettes/articles/SCTK_Modules_Seurat_Tutorial.Rmd
rename to vignettes/articles/seurat_to_celda_pbmc3k.Rmd
index 78dcfe3e..418dad20 100644
--- a/vignettes/articles/SCTK_Modules_Seurat_Tutorial.Rmd
+++ b/vignettes/articles/seurat_to_celda_pbmc3k.Rmd
@@ -5,31 +5,32 @@ date: "2024-07-19"
 output: html_document
 ---
 
-```{r setup, include=TRUE}
-suppressPackageStartupMessages({
-    #Load library
-    library(dplyr)
-    library(Seurat)
-    library(patchwork)
-    library(celda)
-    library(singleCellTK)
-    library(knitr)
-    library(kableExtra)
-    library(ggplot2)
-    library(dendextend)
-})
+```{r setup, include=TRUE, message=FALSE}
+#Load library
+library(dplyr)
+library(Seurat)
+library(patchwork)
+library(celda)
+library(singleCellTK)
+library(knitr)
+library(kableExtra)
+library(ggplot2)
+library(dendextend)
+
 knitr::opts_chunk$set(echo = TRUE)
 
+source("/restricted/projectnb/camplab/projects/20240719_Codathon/Tutorial/findMarkersTree.R")
 ```
 
-## Convert Seurat Object to an SingleCellExperiment (SCE) object (Optional)
+## Applying the Seurat clustering algorithm (Optional)
 
 While Celda provides the functionality for cell cluster generation, some users may opt to import cluster labels generated from the popular Seurat pipeline. (https://pubmed.ncbi.nlm.nih.gov/29608179/) We can apply the `convertSeuratToSCE` function contained within the SingleCellTK package for the object conversion.
 
 ```{r, warning=FALSE}
 #Seurat pipeline (Taken from https://satijalab.org/seurat/articles/pbmc3k_tutorial):
 ##Read in 10X output, PBMC 3K
-pbmc.data <- Read10X(data.dir = "./filtered_gene_bc_matrices/hg19/")
+## Check seurat fxn if exists:
+pbmc.data <- Read10X(data.dir = "/restricted/projectnb/camplab/projects/20240719_Codathon/Tutorial/filtered_gene_bc_matrices/hg19/")
 ##Create Seurat object
 pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)
 ##Normalize data
@@ -45,6 +46,8 @@ pbmc <- FindNeighbors(pbmc, dims = 1:10, verbose = FALSE)
 pbmc <- FindClusters(pbmc, resolution = 0.5, verbose = FALSE)
 ```
 
+## Convert Seurat Object to an SingleCellExperiment (SCE) object (Optional)
+
 ```{r}
 #Convert the Seurat object to a SingleCellExperiment object 
 pbmcSce <- convertSeuratToSCE(pbmc, normAssayName = "logcounts")
@@ -54,6 +57,12 @@ pbmcSce <- convertSeuratToSCE(pbmc, normAssayName = "logcounts")
 
 Generate Celda feature modules based on cell clusters that you have provided through the `recursiveSplitModule` function:
 
+```{r}
+useAssay <- "counts"
+altExpName <- "featureSubset"
+```
+
+
 ```{r}
 #Convert Seurat cluster IDs, as Celda requires clusters be numeric vectors starting from 1 (Seurat cluster 0 = Celda cluster 1)
 pbmcSce$seurat_clusters <- as.numeric(as.character(pbmcSce$seurat_clusters)) + 1
@@ -76,7 +85,7 @@ sce <- subsetCeldaList(rsmRes, list(L = 15))
 ```
 
 ```{r}
-featureTable <- featureModuleTable(sce, useAssay = "counts", altExpName = "featureSubset")
+featureTable <- featureModuleTable(sce, useAssay = useAssay, altExpName = altExpName)
 
 kable(featureTable, style = "html", row.names = FALSE) %>%
                  kable_styling(bootstrap_options = "striped") %>%
@@ -90,7 +99,7 @@ For instance, we can see that each module represents a set of co-expressing feat
 Differential expression tools can also be used on the gene modules in order to identify the modules that define each cell cluster. To do this, first run the `factorizeMatrix` function, which will generate a matrix that measures the contribution of each gene module to each cell population.
 
 ```{r DE modules, message=FALSE}
-factorize <- factorizeMatrix(sce, useAssay = "counts", altExpName = "featureSubset")
+factorize <- factorizeMatrix(sce, useAssay = useAssay, altExpName = altExpName)
 ### Take the module counts, log-normalize, then use for DE, violin plots, etc
 factorizedCounts <- factorize$counts$cell
 factorizedLogcounts <- normalizeCounts(factorizedCounts)
@@ -98,14 +107,39 @@ reducedDim(sce, "factorizeMatrix") <- t(factorizedLogcounts)
 sce <- runFindMarker(sce, useReducedDim = "factorizeMatrix", cluster = "seurat_clusters", useAssay = NULL)
 ```
 
-The differential expression results are contained within `sce@metadata$findMarker`. The `Gene` column denotes the differentially expressed module, whilst the `Log2_FC` column denotes the level of upregulation of the module in the cluster.
+The differential expression results are contained within `sce@metadata$findMarker`. The `Gene` column denotes the differentially expressed module, whilst the `Log2_FC` column denotes the level of upregulation of the module in the cluster. These results can be accessed through the `getFindMarkerTopTable` function.
+
+```{r, eval = FALSE}
+getFindMarkerTopTable(sce)
+```
+
 
-```{r DE module table, message=FALSE}
+```{r DE module table, message=FALSE, echo = FALSE}
 kable(sce@metadata$findMarker, style = "html", row.names = FALSE) %>%
                  kable_styling(bootstrap_options = "striped") %>%
                  scroll_box(width = "100%", height = "500px")
 ```
 
+### Module exploration
+
+Celda provides several methods for the visualization of the module expression within the dataset.
+
+#### UMAP
+
+The function `plotDimReduceModule` can be used visualize the probabilities of a particular module or sets of modules on a reduced dimensional plot such as a UMAP. This can be another quick method to see how modules are expressed across various cells in 2-D space. As an example, we can look at modules L7 and L8:
+
+```{r, fig.width = 5.5, fig.height = 4.5}
+sce <- celdaUmap(sce, useAssay = useAssay, altExpName = altExpName)
+plotDimReduceModule(sce, modules = 7:8, useAssay = useAssay, altExpName = altExpName, reducedDimName = "celda_UMAP")
+```
+
+#### Heatmap
+
+The function `moduleHeatmap` can be used to view the expression of features across cells for a specific module.
+
+```{r, fig.width = 5, fig.height = 8}
+moduleHeatmap(sce, featureModule = 8, useAssay = useAssay, topFeatures = 25)
+```
 
 ## Generate a Decision Tree
 

From 2f548606a4246e46d65991ae53b9b61c06fb0e36 Mon Sep 17 00:00:00 2001
From: Yusuke Koga <ykoga07@bu.edu>
Date: Fri, 16 Aug 2024 16:55:53 -0400
Subject: [PATCH 3/3] Minor edit to tutorial, getFindMarkerTopTable workaround

---
 vignettes/articles/seurat_to_celda_pbmc3k.Rmd | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/vignettes/articles/seurat_to_celda_pbmc3k.Rmd b/vignettes/articles/seurat_to_celda_pbmc3k.Rmd
index 418dad20..54f3ac03 100644
--- a/vignettes/articles/seurat_to_celda_pbmc3k.Rmd
+++ b/vignettes/articles/seurat_to_celda_pbmc3k.Rmd
@@ -107,15 +107,11 @@ reducedDim(sce, "factorizeMatrix") <- t(factorizedLogcounts)
 sce <- runFindMarker(sce, useReducedDim = "factorizeMatrix", cluster = "seurat_clusters", useAssay = NULL)
 ```
 
-The differential expression results are contained within `sce@metadata$findMarker`. The `Gene` column denotes the differentially expressed module, whilst the `Log2_FC` column denotes the level of upregulation of the module in the cluster. These results can be accessed through the `getFindMarkerTopTable` function.
-
-```{r, eval = FALSE}
-getFindMarkerTopTable(sce)
-```
+The differential expression results are contained within the metadata slot `findMarker`. The `Gene` column denotes the differentially expressed module, whilst the `Log2_FC` column denotes the level of upregulation of the module in the cluster. 
 
 
 ```{r DE module table, message=FALSE, echo = FALSE}
-kable(sce@metadata$findMarker, style = "html", row.names = FALSE) %>%
+kable(metadata(sce)$findMarker, style = "html", row.names = FALSE) %>%
                  kable_styling(bootstrap_options = "striped") %>%
                  scroll_box(width = "100%", height = "500px")
 ```