diff --git a/_site/cca.html b/_site/cca.html new file mode 100644 index 0000000..fc83ad4 --- /dev/null +++ b/_site/cca.html @@ -0,0 +1,796 @@ + + + + + + + + + + + + + + + +cca.R + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
library( tidyverse )
+
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
+## ✔ dplyr     1.1.4     ✔ readr     2.1.5
+## ✔ forcats   1.0.0     ✔ stringr   1.5.1
+## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
+## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
+## ✔ purrr     1.0.2     
+## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
+## ✖ dplyr::filter() masks stats::filter()
+## ✖ dplyr::lag()    masks stats::lag()
+## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
+
library( Seurat )
+
## Loading required package: SeuratObject
+## Loading required package: sp
+## 
+## Attaching package: 'SeuratObject'
+## 
+## The following objects are masked from 'package:base':
+## 
+##     intersect, t
+
# Load `ifnb` data set
+ifnb <- SeuratData::LoadData("ifnb")
+
## Validating object structure
+## Updating object slots
+## Ensuring keys are in the proper structure
+
## Warning: Assay RNA changing from Assay to Assay
+
## Ensuring keys are in the proper structure
+## Ensuring feature names don't have underscores or pipes
+## Updating slots in RNA
+## Validating object structure for Assay 'RNA'
+## Object representation is consistent with the most current Seurat version
+
## Warning: Assay RNA changing from Assay to Assay5
+
ifnb
+
## An object of class Seurat 
+## 14053 features across 13999 samples within 1 assay 
+## Active assay: RNA (14053 features, 0 variable features)
+##  2 layers present: counts, data
+
# Split by `stim`:
+ifnbs <- ifnb
+ifnbs[["RNA"]] <- split( ifnbs[["RNA"]], f = ifnbs$stim )
+ifnbs
+
## An object of class Seurat 
+## 14053 features across 13999 samples within 1 assay 
+## Active assay: RNA (14053 features, 0 variable features)
+##  4 layers present: counts.CTRL, counts.STIM, data.CTRL, data.STIM
+
# Run standard analysis workflow
+ifnbs %>%
+NormalizeData() %>%
+FindVariableFeatures() %>%
+ScaleData() %>%
+RunPCA() %>%
+FindNeighbors( dims=1:30, reduction="pca" ) %>%
+FindClusters( resolution=2, cluster.name="unintegrated_clusters" ) %>%
+RunUMAP( dims=1:30, reduction="pca", reduction.name="umap.unintegrated" ) -> ifnbs
+
## Normalizing layer: counts.CTRL
+## Normalizing layer: counts.STIM
+## Finding variable features for layer counts.CTRL
+## Finding variable features for layer counts.STIM
+## Centering and scaling data matrix
+## PC_ 1 
+## Positive:  TYROBP, C15orf48, FCER1G, CST3, SOD2, ANXA5, FTL, TYMP, TIMP1, CD63 
+##     LGALS1, CTSB, S100A4, LGALS3, KYNU, PSAP, FCN1, NPC2, ANXA2, S100A11 
+##     IGSF6, LYZ, SPI1, APOBEC3A, CD68, CTSL, SDCBP, NINJ1, HLA-DRA, CCL2 
+## Negative:  NPM1, CCR7, GIMAP7, LTB, CD7, SELL, CD2, TMSB4X, TRAT1, IL7R 
+##     IL32, RHOH, ITM2A, RGCC, LEF1, CD3G, ALOX5AP, CREM, NHP2, PASK 
+##     MYC, SNHG8, TSC22D3, GPR171, BIRC3, NOP58, RARRES3, CD27, SRM, CD8B 
+## PC_ 2 
+## Positive:  ISG15, ISG20, IFIT3, IFIT1, LY6E, TNFSF10, IFIT2, MX1, IFI6, RSAD2 
+##     CXCL10, OAS1, CXCL11, IFITM3, MT2A, OASL, TNFSF13B, IDO1, IL1RN, APOBEC3A 
+##     GBP1, CCL8, HERC5, FAM26F, GBP4, HES4, WARS, VAMP5, DEFB1, XAF1 
+## Negative:  IL8, CLEC5A, CD14, VCAN, S100A8, IER3, MARCKSL1, IL1B, PID1, CD9 
+##     GPX1, PHLDA1, INSIG1, PLAUR, PPIF, THBS1, OSM, SLC7A11, GAPDH, CTB-61M7.2 
+##     LIMS1, S100A9, GAPT, CXCL3, ACTB, C19orf59, CEBPB, OLR1, MGST1, FTH1 
+## PC_ 3 
+## Positive:  HLA-DQA1, CD83, HLA-DQB1, CD74, HLA-DPA1, HLA-DRA, HLA-DRB1, HLA-DPB1, SYNGR2, IRF8 
+##     CD79A, MIR155HG, HERPUD1, REL, HLA-DMA, MS4A1, HSP90AB1, FABP5, TVP23A, ID3 
+##     CCL22, EBI3, TSPAN13, PMAIP1, TCF4, PRMT1, NME1, HSPE1, HSPD1, CD70 
+## Negative:  ANXA1, GIMAP7, TMSB4X, CD7, CD2, RARRES3, MT2A, IL32, GNLY, PRF1 
+##     NKG7, CCL5, TRAT1, RGCC, S100A9, KLRD1, CCL2, GZMH, GZMA, CD3G 
+##     S100A8, CTSW, CCL7, ITM2A, HPSE, FGFBP2, CTSL, GPR171, CCL8, OASL 
+## PC_ 4 
+## Positive:  NKG7, GZMB, GNLY, CST7, CCL5, PRF1, CLIC3, KLRD1, GZMH, GZMA 
+##     APOBEC3G, CTSW, FGFBP2, KLRC1, FASLG, C1orf21, HOPX, CXCR3, SH2D1B, LINC00996 
+##     TNFRSF18, SPON2, RARRES3, GCHFR, SH2D2A, IGFBP7, ID2, C12orf75, XCL2, RAMP1 
+## Negative:  CCR7, LTB, SELL, LEF1, IL7R, ADTRP, TRAT1, PASK, MYC, NPM1 
+##     SOCS3, TSC22D3, TSHZ2, HSP90AB1, SNHG8, GIMAP7, PIM2, HSPD1, CD3G, TXNIP 
+##     RHOH, GBP1, C12orf57, CA6, PNRC1, CMSS1, CD27, SESN3, NHP2, BIRC3 
+## PC_ 5 
+## Positive:  VMO1, FCGR3A, MS4A4A, CXCL16, MS4A7, PPM1N, HN1, LST1, SMPDL3A, ATP1B3 
+##     CASP5, CDKN1C, CH25H, AIF1, PLAC8, SERPINA1, LRRC25, CD86, HCAR3, GBP5 
+##     TMSB4X, RP11-290F20.3, RGS19, VNN2, ADA, LILRA5, STXBP2, C3AR1, PILRA, FCGR3B 
+## Negative:  CCL2, CCL7, CCL8, PLA2G7, LMNA, S100A9, SDS, TXN, CSTB, ATP6V1F 
+##     CCR1, EMP1, CAPG, CCR5, TPM4, IDO1, MGST1, HPSE, CTSB, LILRB4 
+##     RSAD2, HSPA1A, VIM, CCNA1, CTSL, GCLM, PDE4DIP, SGTB, SLC7A11, FABP5 
+## Computing nearest neighbor graph
+## Computing SNN
+
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
+## 
+## Number of nodes: 13999
+## Number of edges: 555146
+## 
+## Running Louvain algorithm...
+## Maximum modularity in 10 random starts: 0.8153
+## Number of communities: 26
+## Elapsed time: 3 seconds
+
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
+## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
+## This message will be shown once per session
+
## 15:24:47 UMAP embedding parameters a = 0.9922 b = 1.112
+## 15:24:47 Read 13999 rows and found 30 numeric columns
+## 15:24:47 Using Annoy for neighbor search, n_neighbors = 30
+## 15:24:47 Building Annoy index with metric = cosine, n_trees = 50
+## 0%   10   20   30   40   50   60   70   80   90   100%
+## [----|----|----|----|----|----|----|----|----|----|
+## **************************************************|
+## 15:24:49 Writing NN index file to temp file /tmp/Rtmppe4ngQ/file16b5013fa6700
+## 15:24:49 Searching Annoy index using 1 thread, search_k = 3000
+## 15:24:55 Annoy recall = 100%
+## 15:24:56 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
+## 15:24:57 Initializing from normalized Laplacian + noise (using RSpectra)
+## 15:24:58 Commencing optimization for 200 epochs, with 614378 positive edges
+## 15:25:07 Optimization finished
+
# Plot UMAP, colour by `stim` and by Leiden clusters
+DimPlot( ifnbs, reduction = "umap.unintegrated", group.by = "stim" ) + coord_equal()
+

+
DimPlot( ifnbs, reduction = "umap.unintegrated", group.by = "seurat_clusters" ) + coord_equal()
+

+
# Also show UMAP using colours for annotation from previous analysis
+DimPlot( ifnbs, reduction = "umap.unintegrated", group.by = "seurat_annotations", label=TRUE ) + coord_equal()
+

+
# Now, we use Sleepwalk to compare the two cell subsets:
+sleepwalk::sleepwalk(
+  list( 
+    Embeddings(ifnbs,"umap.unintegrated")[ ifnbs$stim=="CTRL", ],
+    Embeddings(ifnbs,"umap.unintegrated")[ ifnbs$stim=="STIM", ] ),
+  list( 
+    Embeddings(ifnbs,"pca")[ ifnbs$stim=="CTRL", ],
+    Embeddings(ifnbs,"pca")[ ifnbs$stim=="STIM", ] ),
+  same="features" )
+
## Estimating 'maxdist' for feature matrix 1
+## Estimating 'maxdist' for feature matrix 2
+
# Perform CCA integration (This takes a few minutes)
+# ifnbs <- IntegrateLayers( object = ifnbs, method = CCAIntegration, 
+#             orig.reduction = "pca", new.reduction = "integrated.cca" )
+
+# Save result of this lengthy calculation
+#save( ifnbs, file="~/tmp/ifnbs_integrated.rda" )
+load( "~/tmp/ifnbs_integrated.rda" )
+
+# Re-join layers after integration
+ifnb <- ifnbs
+ifnb[["RNA"]] <- JoinLayers( ifnb[["RNA"]] )
+
+# Redo neighbor search, clustering, and UMAP, now based on CCA instead of PCA:
+ifnb %>% 
+FindNeighbors( reduction = "integrated.cca", dims = 1:30 ) %>%
+FindClusters( resolution = 1 ) %>%
+RunUMAP( dims = 1:30, reduction = "integrated.cca" ) -> ifnb
+
## Computing nearest neighbor graph
+## Computing SNN
+
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
+## 
+## Number of nodes: 13999
+## Number of edges: 591032
+## 
+## Running Louvain algorithm...
+## Maximum modularity in 10 random starts: 0.8448
+## Number of communities: 18
+## Elapsed time: 3 seconds
+
## 15:25:27 UMAP embedding parameters a = 0.9922 b = 1.112
+## 15:25:27 Read 13999 rows and found 30 numeric columns
+## 15:25:27 Using Annoy for neighbor search, n_neighbors = 30
+## 15:25:27 Building Annoy index with metric = cosine, n_trees = 50
+## 0%   10   20   30   40   50   60   70   80   90   100%
+## [----|----|----|----|----|----|----|----|----|----|
+## **************************************************|
+## 15:25:30 Writing NN index file to temp file /tmp/Rtmppe4ngQ/file16b501ab9f9f1
+## 15:25:30 Searching Annoy index using 1 thread, search_k = 3000
+## 15:25:38 Annoy recall = 100%
+## 15:25:38 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
+## 15:25:39 Initializing from normalized Laplacian + noise (using RSpectra)
+## 15:25:40 Commencing optimization for 200 epochs, with 629042 positive edges
+## 15:25:49 Optimization finished
+
# Plot UMAP again, coloured as before by `stim` covariate, clusters (from new clusterin),
+# and cell types (from original analysis):
+DimPlot( ifnb, reduction = "umap", group.by = "stim" ) + coord_equal()
+

+
DimPlot( ifnb, reduction = "umap", group.by = "seurat_clusters" ) + coord_equal()
+

+
DimPlot( ifnb, reduction = "umap", group.by = "seurat_annotations", label=TRUE ) + coord_equal()
+

+
# The plot coloured by `stim` is misleading, as the blue points are plotted after 
+# (and hence on top of) the red points.
+# Let's redo this plot manually and use `sample_frac` to permute the table rows and
+# thus shuffle the order in which the points are placed
+Embeddings(ifnb,"umap") %>%
+as_tibble() %>%
+add_column( cond = ifnb$stim ) %>%
+sample_frac() %>%
+ggplot() +
+  geom_point(aes(x=umap_1,y=umap_2,col=cond),size=.01) +
+  coord_equal()
+

+
## Now we perform a simplified CCA manually, to compare
+
+# First, here's the number of cells of each condition:
+table( ifnb$stim )
+
## 
+## CTRL STIM 
+## 6548 7451
+
# Let's calculate a cross correlation between the control and the
+# the stimulated cells. The following matrix has one row for each
+# control cell and one column for each stimulated cells. The matrix entries
+# are the dot products of the corresponding cells' expression vectors,
+# as formed from the log-normalized expressions for the 2000 most variable genes.
+
+crosscor <- as.matrix(
+  t(LayerData(ifnb)[,ifnb$stim=="CTRL"][VariableFeatures(ifnb),]) %*% 
+    LayerData(ifnb)[,ifnb$stim=="STIM"][VariableFeatures(ifnb),] ) 
+
+# Strictly speaking, this is not a cross-correlation yet, as we should have
+# standardized the two expression matrices before multiplying them by subtracting
+# for each gene its mean and dividing by its standard deviation. I'll add this when
+# I go through this code once more. 
+
+# Now, do an SVD on the cross-correlation matrix
+# svd <- irlba::svdr( crosscor, 20 )
+
+#save( svd, file="~/tmp/svd.rda" )
+load( "~/tmp/svd.rda" )
+
+# We use the left and right singular vectors as new coordinates, to replace the
+# PCA coordinates. The diagonal matrix in the middle is split and each square root
+# multiplied to one side. This gives us better distances, as can be seen with Sleepwalk:
+
+sleepwalk::sleepwalk(
+  list( 
+    Embeddings(ifnb,"umap.unintegrated")[ ifnb$stim=="CTRL", ],
+    Embeddings(ifnb,"umap.unintegrated")[ ifnb$stim=="STIM", ] ),
+  list(
+    t( t(svd$u) * sqrt(svd$d) ),
+    t( t(svd$v) * sqrt(svd$d) ) ),
+  same="features" )
+
## Estimating 'maxdist' for feature matrix 1
+## Estimating 'maxdist' for feature matrix 2
+## Server has been stopped.
+
# Therefore, we use these as new reduction coordinates
+
+cca <- rbind( 
+  t( t(svd$u) * sqrt(svd$d) ),
+  t( t(svd$v) * sqrt(svd$d) ) )
+rownames(cca) <- c( colnames(ifnb)[ifnb$stim=="CTRL"], colnames(ifnb)[ifnb$stim=="STIM"] )
+all( rownames(cca) == colnames(ifnb) )
+
## [1] TRUE
+
# Add these to the Seurat object
+ifnb@reductions$svd <- CreateDimReducObject( cca, assay="RNA", key="svd_" )
+
## Warning: No columnames present in cell embeddings, setting to 'svd_1:20'
+
# Let's make a UMAP from these:
+RunUMAP( ifnb, dims=1:20, reduction="svd", reduction.name = "umap.svd" ) -> ifnb
+
## 15:26:05 UMAP embedding parameters a = 0.9922 b = 1.112
+## 15:26:05 Read 13999 rows and found 20 numeric columns
+## 15:26:05 Using Annoy for neighbor search, n_neighbors = 30
+## 15:26:05 Building Annoy index with metric = cosine, n_trees = 50
+## 0%   10   20   30   40   50   60   70   80   90   100%
+## [----|----|----|----|----|----|----|----|----|----|
+## **************************************************|
+## 15:26:07 Writing NN index file to temp file /tmp/Rtmppe4ngQ/file16b5071182fb9
+## 15:26:07 Searching Annoy index using 1 thread, search_k = 3000
+## 15:26:14 Annoy recall = 100%
+## 15:26:14 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
+## 15:26:15 Initializing from normalized Laplacian + noise (using RSpectra)
+## 15:26:15 Commencing optimization for 200 epochs, with 612288 positive edges
+## 15:26:23 Optimization finished
+
# Here's a plot for the new UMAP:
+perm <- sample.int(ncol(ifnb))
+plot( Embeddings(ifnb,"umap.svd")[perm,], cex=.2, asp=1, col=1+as.integer(factor(ifnb$stim[perm])) )
+

+
# And the usual colourings
+DimPlot( ifnb, reduction = "umap.svd", group.by = "seurat_clusters" ) + coord_equal()
+

+
DimPlot( ifnb, reduction = "umap.svd", group.by = "seurat_annotations", label=TRUE ) + coord_equal()
+

+
# Of course, the clustering does not fit that well, as it was done using another embedding.
+
+# And here a comparison between Seurat's CCA integration and our simplified one:
+sleepwalk::sleepwalk(
+  list( Embeddings(ifnb,"umap"), Embeddings(ifnb,"umap.svd") ),
+  list( Embeddings(ifnb,"integrated.cca"), Embeddings(ifnb,"svd"))
+)
+
## Estimating 'maxdist' for feature matrix 1
+## Estimating 'maxdist' for feature matrix 2
+## Server has been stopped.
+
### Integration via mutual nearest neighbours
+
+# (FROM HERE ON: only draft)
+
+# We start with the PCA that was computed at the very beginning, using all cells
+# and ignoring the stim covariate
+Embeddings( ifnb, "pca" ) -> pca
+
+# Here ios the number of cells 
+table( ifnb$stim )
+
## 
+## CTRL STIM 
+## 6548 7451
+
# cross-NN search:
+nn_cs <- FNN::get.knnx( pca[ifnb$stim=="CTRL",], pca[ifnb$stim=="STIM",], k=25 )
+nn_sc <- FNN::get.knnx( pca[ifnb$stim=="STIM",], pca[ifnb$stim=="CTRL",], k=25 )
+
+bind_rows(
+  nn_cs$nn.index %>%
+    `colnames<-`( 1:ncol(.) ) %>%
+    as_tibble() %>%
+    add_column( center_cell = names(which(ifnb$stim=="STIM")) ) %>%
+    pivot_longer( -`center_cell`, names_to="ngb_index", values_to="ngb_cell" ) %>%
+    mutate( ngb_index = as.integer(ngb_index) ) %>%
+    mutate( ngb_cell = names(which(ifnb$stim=="CTRL"))[ ngb_cell ] ) %>%
+    add_column( center_is = "STIM" ),
+  nn_sc$nn.index %>%
+    `colnames<-`( 1:ncol(.) ) %>%
+    as_tibble() %>%
+    add_column( center_cell = names(which(ifnb$stim=="CTRL")) ) %>%
+    pivot_longer( -`center_cell`, names_to="ngb_index", values_to="ngb_cell" ) %>%
+    mutate( ngb_index = as.integer(ngb_index) ) %>%
+    mutate( ngb_cell = names(which(ifnb$stim=="STIM"))[ ngb_cell ] ) %>%
+    add_column( center_is = "CTRL" ) ) -> knn_tbl
+
+head( knn_tbl )
+
## # A tibble: 6 × 4
+##   center_cell      ngb_index ngb_cell         center_is
+##   <chr>                <int> <chr>            <chr>    
+## 1 AAACATACCAAGCT.1         1 AGAGATGATCCCAC.1 STIM     
+## 2 AAACATACCAAGCT.1         2 CCATGCTGGGATCT.1 STIM     
+## 3 AAACATACCAAGCT.1         3 CCCGAACTACCAAC.1 STIM     
+## 4 AAACATACCAAGCT.1         4 GTTAAATGCGCCTT.1 STIM     
+## 5 AAACATACCAAGCT.1         5 TAATCGCTACCATG.1 STIM     
+## 6 AAACATACCAAGCT.1         6 TCCCTACTACTGTG.1 STIM
+
# How many of the neighbours are mutual?
+knn_tbl %>%
+mutate( stim_cell = ifelse( center_is=="STIM", center_cell, ngb_cell ) ) %>%
+mutate( ctrl_cell = ifelse( center_is=="CTRL", center_cell, ngb_cell ) ) %>%
+group_by( stim_cell, ctrl_cell ) %>%
+summarise( n=n() ) %>% ungroup() %>%
+count( n ) 
+
## `summarise()` has grouped output by 'stim_cell'. You can override using the
+## `.groups` argument.
+## Storing counts in `nn`, as `n` already present in input
+
## # A tibble: 2 × 2
+##       n     nn
+##   <int>  <int>
+## 1     1 297781
+## 2     2  26097
+
# Make table of MNNs
+knn_tbl %>%
+mutate( stim_cell = ifelse( center_is=="STIM", center_cell, ngb_cell ) ) %>%
+mutate( ctrl_cell = ifelse( center_is=="CTRL", center_cell, ngb_cell ) ) %>%
+group_by( stim_cell, ctrl_cell ) %>%
+summarise( n=n(), .groups="drop" ) %>%
+filter( n==2 ) %>% select(-n) -> mnn_tbl
+head( mnn_tbl )
+
## # A tibble: 6 × 2
+##   stim_cell        ctrl_cell       
+##   <chr>            <chr>           
+## 1 AAACATACCCCTAC.1 ATAGTTGATAAAGG.1
+## 2 AAACATACCCGTAA.1 AGAATTTGCTCAGA.1
+## 3 AAACATACCCGTAA.1 CCAACCTGTGCCAA.1
+## 4 AAACATACCCGTAA.1 CGAATCGAGCAGAG.1
+## 5 AAACATACCCGTAA.1 CTCAGGCTGGACAG.1
+## 6 AAACATACCCGTAA.1 GAGATAGACTGACA.1
+
# How many MNNs?
+mnn_tbl %>%
+group_by( ctrl_cell ) %>%
+summarise(n())
+
## # A tibble: 3,752 × 2
+##    ctrl_cell        `n()`
+##    <chr>            <int>
+##  1 AAACATACATTTCC.1     1
+##  2 AAACATACCAGAAA.1     1
+##  3 AAACATACCTGGTA.1    19
+##  4 AAACATACGATGAA.1     9
+##  5 AAACATACTGCGTA.1     5
+##  6 AAACCGTGAGCCAT.1     1
+##  7 AAACCGTGTGCTAG.1     8
+##  8 AAACGCACACTTTC.1     3
+##  9 AAACGCACAGTACC.1     6
+## 10 AAACGCACCAACCA.1    19
+## # ℹ 3,742 more rows
+
## Save image
+#save.image( file="~/tmp/cca.rda" )
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/_site/index.html b/_site/index.html index 286d202..4c5801c 100644 --- a/_site/index.html +++ b/_site/index.html @@ -206,6 +206,7 @@

Material and
  • prepared for following lectures
  • diff --git a/_site/search.json b/_site/search.json index 29e19eb..f6873c1 100644 --- a/_site/search.json +++ b/_site/search.json @@ -123,7 +123,7 @@ "href": "index.html#material-and-video-recodings", "title": "Mathematics of Single-cell Omics", "section": "Material and Video Recodings", - "text": "Material and Video Recodings\n\nLecture of 2024-04-22:\n\nVideo\nBiology Primer\nOverview High-throughput Sequencing\nA simple analysis with Seurat\nExample data: PBMC3k (available on 10X web site, and Moodle)\n\nExercise class of 2024-04-24\n\nVideo\nDoing the Seurat analysis “on foot”\n\nLecture of 2024-05-06\n\nVideo\nModularity clustering\nblackboard photo\n\nExercise class of 2024-05-08\n\nVideo\n\nImplementing modularity score in Python\nlouvain.py\n\nLecture of 2024-05-13\n\nVideo\nMathematics of PCA\nBlackboard photos: 1 2\n\nExercise class of 2024-05-15\n\nVideo:\nfitting pseudotimes and splines\nR code: pseudotime_1.R, splines_1.R\nexample data: cmss.rda (on Moodle)\n\nLecture of 2024-05-22\n\nVideo\nExpression dynamics along pseudotime trajectories\nSmoothing\nblackboard photo\n\nLecture of 2024-05-27\n\nVideo\nSmoothing with principal curves\n\nLecture of 2024-06-03\n\nVideo\nBlackboard photos: 1 2\nSpectra of graphs (Skript still missing)\nDiffusion on graphs\n\nLecture of 2024-06-10\n\nVideo\nLaplacian eigenmaps\n\nLecture of 2024-06-17\n\nVideo (not yet uploaded)\nComparison of t-SNE and UMAP for our example data\nMathematics of t-SNE and UMAP\nReimplementing UMAP in Python\n\nprepared for following lectures\n\nCell cycle regression" + "text": "Material and Video Recodings\n\nLecture of 2024-04-22:\n\nVideo\nBiology Primer\nOverview High-throughput Sequencing\nA simple analysis with Seurat\nExample data: PBMC3k (available on 10X web site, and Moodle)\n\nExercise class of 2024-04-24\n\nVideo\nDoing the Seurat analysis “on foot”\n\nLecture of 2024-05-06\n\nVideo\nModularity clustering\nblackboard photo\n\nExercise class of 2024-05-08\n\nVideo\n\nImplementing modularity score in Python\nlouvain.py\n\nLecture of 2024-05-13\n\nVideo\nMathematics of PCA\nBlackboard photos: 1 2\n\nExercise class of 2024-05-15\n\nVideo:\nfitting pseudotimes and splines\nR code: pseudotime_1.R, splines_1.R\nexample data: cmss.rda (on Moodle)\n\nLecture of 2024-05-22\n\nVideo\nExpression dynamics along pseudotime trajectories\nSmoothing\nblackboard photo\n\nLecture of 2024-05-27\n\nVideo\nSmoothing with principal curves\n\nLecture of 2024-06-03\n\nVideo\nBlackboard photos: 1 2\nSpectra of graphs (Skript still missing)\nDiffusion on graphs\n\nLecture of 2024-06-10\n\nVideo\nLaplacian eigenmaps\n\nLecture of 2024-06-17\n\nVideo (not yet uploaded)\nComparison of t-SNE and UMAP for our example data\nMathematics of t-SNE and UMAP\nReimplementing UMAP in Python\n\nprepared for following lectures\n\nCell cycle regression\nCCA Integration" }, { "objectID": "index.html#source-code", diff --git a/cca.R b/cca.R index 78f79a1..11f5545 100644 --- a/cca.R +++ b/cca.R @@ -1,41 +1,71 @@ library( tidyverse ) library( Seurat ) +# Load `ifnb` data set ifnb <- SeuratData::LoadData("ifnb") - -ifnb[["RNA"]] <- split(ifnb[["RNA"]], f = ifnb$stim) ifnb -# run standard anlaysis workflow -ifnb <- NormalizeData(ifnb) -ifnb <- FindVariableFeatures(ifnb) -ifnb <- ScaleData(ifnb) -ifnb <- RunPCA(ifnb) -ifnb <- FindNeighbors(ifnb, dims = 1:30, reduction = "pca") -ifnb <- FindClusters(ifnb, resolution = 2, cluster.name = "unintegrated_clusters") - -ifnb <- RunUMAP(ifnb, dims = 1:30, reduction = "pca", reduction.name = "umap.unintegrated") -DimPlot(ifnb, reduction = "umap.unintegrated", group.by = c("stim", "seurat_clusters")) - -ifnb <- IntegrateLayers(object = ifnb, method = CCAIntegration, - orig.reduction = "pca", new.reduction = "integrated.cca" ) -# re-join layers after integration -ifnb[["RNA"]] <- JoinLayers(ifnb[["RNA"]]) +# Split by `stim`: +ifnbs <- ifnb +ifnbs[["RNA"]] <- split( ifnbs[["RNA"]], f = ifnbs$stim ) +ifnbs + +# Run standard analysis workflow +ifnbs %>% +NormalizeData() %>% +FindVariableFeatures() %>% +ScaleData() %>% +RunPCA() %>% +FindNeighbors( dims=1:30, reduction="pca" ) %>% +FindClusters( resolution=2, cluster.name="unintegrated_clusters" ) %>% +RunUMAP( dims=1:30, reduction="pca", reduction.name="umap.unintegrated" ) -> ifnbs + +# Plot UMAP, colour by `stim` and by Leiden clusters +DimPlot( ifnbs, reduction = "umap.unintegrated", group.by = "stim" ) + coord_equal() +DimPlot( ifnbs, reduction = "umap.unintegrated", group.by = "seurat_clusters" ) + coord_equal() + +# Also show UMAP using colours for annotation from previous analysis +DimPlot( ifnbs, reduction = "umap.unintegrated", group.by = "seurat_annotations", label=TRUE ) + coord_equal() + +# Now, we use Sleepwalk to compare the two cell subsets: +sleepwalk::sleepwalk( + list( + Embeddings(ifnbs,"umap.unintegrated")[ ifnbs$stim=="CTRL", ], + Embeddings(ifnbs,"umap.unintegrated")[ ifnbs$stim=="STIM", ] ), + list( + Embeddings(ifnbs,"pca")[ ifnbs$stim=="CTRL", ], + Embeddings(ifnbs,"pca")[ ifnbs$stim=="STIM", ] ), + same="features" ) -ifnb <- FindNeighbors(ifnb, reduction = "integrated.cca", dims = 1:30) -ifnb <- FindClusters(ifnb, resolution = 1) -ifnb <- RunUMAP(ifnb, dims = 1:30, reduction = "pca", reduction.name = "umap.unintegrated") -DimPlot(ifnb, reduction = "umap.unintegrated", group.by = c("stim", "seurat_clusters")) +# Perform CCA integration (This takes a few minutes) +# ifnbs <- IntegrateLayers( object = ifnbs, method = CCAIntegration, +# orig.reduction = "pca", new.reduction = "integrated.cca" ) -ifnb <- RunUMAP(ifnb, dims = 1:30, reduction = "integrated.cca") +# Save result of this lengthy calculation +#save( ifnbs, file="~/tmp/ifnbs_integrated.rda" ) +load( "~/tmp/ifnbs_integrated.rda" ) -# Visualization -DimPlot(ifnb, reduction = "umap", group.by = c("stim", "seurat_annotations")) +# Re-join layers after integration +ifnb <- ifnbs +ifnb[["RNA"]] <- JoinLayers( ifnb[["RNA"]] ) -save( ifnb, file="~/tmp/ifnb_integrated.rda" ) +# Redo neighbor search, clustering, and UMAP, now based on CCA instead of PCA: +ifnb %>% +FindNeighbors( reduction = "integrated.cca", dims = 1:30 ) %>% +FindClusters( resolution = 1 ) %>% +RunUMAP( dims = 1:30, reduction = "integrated.cca" ) -> ifnb +# Plot UMAP again, coloured as before by `stim` covariate, clusters (from new clusterin), +# and cell types (from original analysis): +DimPlot( ifnb, reduction = "umap", group.by = "stim" ) + coord_equal() +DimPlot( ifnb, reduction = "umap", group.by = "seurat_clusters" ) + coord_equal() +DimPlot( ifnb, reduction = "umap", group.by = "seurat_annotations", label=TRUE ) + coord_equal() +# The plot coloured by `stim` is misleading, as the blue points are plotted after +# (and hence on top of) the red points. +# Let's redo this plot manually and use `sample_frac` to permute the table rows and +# thus shuffle the order in which the points are placed Embeddings(ifnb,"umap") %>% as_tibble() %>% add_column( cond = ifnb$stim ) %>% @@ -44,23 +74,88 @@ ggplot() + geom_point(aes(x=umap_1,y=umap_2,col=cond),size=.01) + coord_equal() +## Now we perform a simplified CCA manually, to compare +# First, here's the number of cells of each condition: +table( ifnb$stim ) +# Let's calculate a cross correlation between the control and the +# the stimulated cells. The following matrix has one row for each +# control cell and one column for each stimulated cells. The matrix entries +# are the dot products of the corresponding cells' expression vectors, +# as formed from the log-normalized expressions for the 2000 most variable genes. + +crosscor <- as.matrix( + t(LayerData(ifnb)[,ifnb$stim=="CTRL"][VariableFeatures(ifnb),]) %*% + LayerData(ifnb)[,ifnb$stim=="STIM"][VariableFeatures(ifnb),] ) +# Strictly speaking, this is not a cross-correlation yet, as we should have +# standardized the two expression matrices before multiplying them by subtracting +# for each gene its mean and dividing by its standard deviation. I'll add this when +# I go through this code once more. +# Now, do an SVD on the cross-correlation matrix +# svd <- irlba::svdr( crosscor, 20 ) +#save( svd, file="~/tmp/svd.rda" ) +load( "~/tmp/svd.rda" ) +# We use the left and right singular vectors as new coordinates, to replace the +# PCA coordinates. The diagonal matrix in the middle is split and each square root +# multiplied to one side. This gives us better distances, as can be seen with Sleepwalk: + +sleepwalk::sleepwalk( + list( + Embeddings(ifnb,"umap.unintegrated")[ ifnb$stim=="CTRL", ], + Embeddings(ifnb,"umap.unintegrated")[ ifnb$stim=="STIM", ] ), + list( + t( t(svd$u) * sqrt(svd$d) ), + t( t(svd$v) * sqrt(svd$d) ) ), + same="features" ) -### Manual +# Therefore, we use these as new reduction coordinates -# PCA performend on all cells together, regardless of origin +cca <- rbind( + t( t(svd$u) * sqrt(svd$d) ), + t( t(svd$v) * sqrt(svd$d) ) ) +rownames(cca) <- c( colnames(ifnb)[ifnb$stim=="CTRL"], colnames(ifnb)[ifnb$stim=="STIM"] ) +all( rownames(cca) == colnames(ifnb) ) + +# Add these to the Seurat object +ifnb@reductions$svd <- CreateDimReducObject( cca, assay="RNA", key="svd_" ) + +# Let's make a UMAP from these: +RunUMAP( ifnb, dims=1:20, reduction="svd", reduction.name = "umap.svd" ) -> ifnb + +# Here's a plot for the new UMAP: +perm <- sample.int(ncol(ifnb)) +plot( Embeddings(ifnb,"umap.svd")[perm,], cex=.2, asp=1, col=1+as.integer(factor(ifnb$stim[perm])) ) + +# And the usual colourings +DimPlot( ifnb, reduction = "umap.svd", group.by = "seurat_clusters" ) + coord_equal() +DimPlot( ifnb, reduction = "umap.svd", group.by = "seurat_annotations", label=TRUE ) + coord_equal() + +# Of course, the clustering does not fit that well, as it was done using another embedding. + +# And here a comparison between Seurat's CCA integration and our simplified one: +sleepwalk::sleepwalk( + list( Embeddings(ifnb,"umap"), Embeddings(ifnb,"umap.svd") ), + list( Embeddings(ifnb,"integrated.cca"), Embeddings(ifnb,"svd")) +) + +### Integration via mutual nearest neighbours + +# (FROM HERE ON: only draft) + +# We start with the PCA that was computed at the very beginning, using all cells +# and ignoring the stim covariate Embeddings( ifnb, "pca" ) -> pca -# Origin of the cells: +# Here ios the number of cells table( ifnb$stim ) -# NN search: +# cross-NN search: nn_cs <- FNN::get.knnx( pca[ifnb$stim=="CTRL",], pca[ifnb$stim=="STIM",], k=25 ) nn_sc <- FNN::get.knnx( pca[ifnb$stim=="STIM",], pca[ifnb$stim=="CTRL",], k=25 ) @@ -82,8 +177,9 @@ bind_rows( mutate( ngb_cell = names(which(ifnb$stim=="STIM"))[ ngb_cell ] ) %>% add_column( center_is = "CTRL" ) ) -> knn_tbl +head( knn_tbl ) + # How many of the neighbours are mutual? - knn_tbl %>% mutate( stim_cell = ifelse( center_is=="STIM", center_cell, ngb_cell ) ) %>% mutate( ctrl_cell = ifelse( center_is=="CTRL", center_cell, ngb_cell ) ) %>% @@ -91,82 +187,21 @@ group_by( stim_cell, ctrl_cell ) %>% summarise( n=n() ) %>% ungroup() %>% count( n ) +# Make table of MNNs knn_tbl %>% mutate( stim_cell = ifelse( center_is=="STIM", center_cell, ngb_cell ) ) %>% mutate( ctrl_cell = ifelse( center_is=="CTRL", center_cell, ngb_cell ) ) %>% group_by( stim_cell, ctrl_cell ) %>% summarise( n=n(), .groups="drop" ) %>% filter( n==2 ) %>% select(-n) -> mnn_tbl +head( mnn_tbl ) -Embeddings(ifnb,"umap.unintegrated") %>% -as_tibble( rownames="cell" ) %>% -add_column( cond = ifnb$stim ) %>% -mutate( is_among_mnn = cell %in% c( mnn_tbl$ctrl_cell, mnn_tbl$stim_cell ) ) %>% -sample_frac() %>% -ggplot( aes( x=umapunintegrated_1, y=umapunintegrated_2, col=cond ) ) + - geom_point( size=.1, alpha=.2 ) + - geom_point( size=.1, alpha=1, data=function(x) filter(x,is_among_mnn) ) + - coord_equal() - - - +# How many MNNs? mnn_tbl %>% group_by( ctrl_cell ) %>% summarise(n()) -sleepwalk::sleepwalk( - list( - Embeddings(ifnb,"umap.unintegrated")[ ifnb$stim=="CTRL", ], - Embeddings(ifnb,"umap.unintegrated")[ ifnb$stim=="STIM", ] ), - list( - Embeddings(ifnb,"pca")[ ifnb$stim=="CTRL", ], - Embeddings(ifnb,"pca")[ ifnb$stim=="STIM", ] ), - same="features" ) - - - -#### SVD - -crosscor <- as.matrix( - t(LayerData(ifnb)[,ifnb$stim=="CTRL"][VariableFeatures(ifnb),]) %*% - LayerData(ifnb)[,ifnb$stim=="STIM"][VariableFeatures(ifnb),] ) - -svd <- irlba::svdr( crosscor, 20 ) - -sleepwalk::sleepwalk( - list( - Embeddings(ifnb,"umap.unintegrated")[ ifnb$stim=="CTRL", ], - Embeddings(ifnb,"umap.unintegrated")[ ifnb$stim=="STIM", ] ), - list( - t( t(svd$u) * sqrt(svd$d) ), - t( t(svd$v) * sqrt(svd$d) ) ), - same="features" ) - -cca <- rbind( - t( t(svd$u) * sqrt(svd$d) ), - t( t(svd$v) * sqrt(svd$d) ) ) -rownames(cca) <- c( colnames(ifnb)[ifnb$stim=="CTRL"], colnames(ifnb)[ifnb$stim=="STIM"] ) -ump_cca <- uwot::umap(cca) - -all( rownames(ump_cca) == colnames(ifnb) ) - -perm <- sample.int(nrow(ump_cca)) -plot( ump_cca[perm,], cex=.2, asp=1, col=1+as.integer(factor(ifnb$stim[perm])) ) - -sleepwalk::sleepwalk( - list( Embeddings(ifnb,"umap"), ump_cca ), - list( Embeddings(ifnb,"integrated.cca"), cca) -) - - -qqplot( svd$u[,1], svd$v[,1] ) -u1_sorted <- sort( svd$u[,1] ) -v1_sorted <- sort( svd$v[,1] ) -plot( - u1_sorted[ ceiling( seq(0,1,length.out=300)*length(u1_sorted) ) ], - v1_sorted[ ceiling( seq(0,1,length.out=300)*length(v1_sorted) ) ] ) -plot( - cor( t(as.matrix(LayerData(ifnb)[,ifnb$stim=="CTRL"][VariableFeatures(ifnb),])), svd$u[,1] ), - cor( t(as.matrix(LayerData(ifnb)[,ifnb$stim=="STIM"][VariableFeatures(ifnb),])), svd$v[,1] ) ) +## Save image +#save.image( file="~/tmp/cca.rda" ) diff --git a/index.qmd b/index.qmd index 71ced29..8969e60 100644 --- a/index.qmd +++ b/index.qmd @@ -90,6 +90,7 @@ Place: - [Reimplementing UMAP in Python](UMAP_pedestrian.html) - prepared for following lectures - [Cell cycle regression](cell_cycle.html) + - [CCA Integration](cca.html) ## Source code