diff --git a/_site/cca.html b/_site/cca.html new file mode 100644 index 0000000..fc83ad4 --- /dev/null +++ b/_site/cca.html @@ -0,0 +1,796 @@ + + + + +
+ + + + + + + + + + +library( tidyverse )
+## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
+## ✔ dplyr 1.1.4 ✔ readr 2.1.5
+## ✔ forcats 1.0.0 ✔ stringr 1.5.1
+## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
+## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
+## ✔ purrr 1.0.2
+## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
+## ✖ dplyr::filter() masks stats::filter()
+## ✖ dplyr::lag() masks stats::lag()
+## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
+library( Seurat )
+## Loading required package: SeuratObject
+## Loading required package: sp
+##
+## Attaching package: 'SeuratObject'
+##
+## The following objects are masked from 'package:base':
+##
+## intersect, t
+# Load `ifnb` data set
+ifnb <- SeuratData::LoadData("ifnb")
+## Validating object structure
+## Updating object slots
+## Ensuring keys are in the proper structure
+## Warning: Assay RNA changing from Assay to Assay
+## Ensuring keys are in the proper structure
+## Ensuring feature names don't have underscores or pipes
+## Updating slots in RNA
+## Validating object structure for Assay 'RNA'
+## Object representation is consistent with the most current Seurat version
+## Warning: Assay RNA changing from Assay to Assay5
+ifnb
+## An object of class Seurat
+## 14053 features across 13999 samples within 1 assay
+## Active assay: RNA (14053 features, 0 variable features)
+## 2 layers present: counts, data
+# Split by `stim`:
+ifnbs <- ifnb
+ifnbs[["RNA"]] <- split( ifnbs[["RNA"]], f = ifnbs$stim )
+ifnbs
+## An object of class Seurat
+## 14053 features across 13999 samples within 1 assay
+## Active assay: RNA (14053 features, 0 variable features)
+## 4 layers present: counts.CTRL, counts.STIM, data.CTRL, data.STIM
+# Run standard analysis workflow
+ifnbs %>%
+NormalizeData() %>%
+FindVariableFeatures() %>%
+ScaleData() %>%
+RunPCA() %>%
+FindNeighbors( dims=1:30, reduction="pca" ) %>%
+FindClusters( resolution=2, cluster.name="unintegrated_clusters" ) %>%
+RunUMAP( dims=1:30, reduction="pca", reduction.name="umap.unintegrated" ) -> ifnbs
+## Normalizing layer: counts.CTRL
+## Normalizing layer: counts.STIM
+## Finding variable features for layer counts.CTRL
+## Finding variable features for layer counts.STIM
+## Centering and scaling data matrix
+## PC_ 1
+## Positive: TYROBP, C15orf48, FCER1G, CST3, SOD2, ANXA5, FTL, TYMP, TIMP1, CD63
+## LGALS1, CTSB, S100A4, LGALS3, KYNU, PSAP, FCN1, NPC2, ANXA2, S100A11
+## IGSF6, LYZ, SPI1, APOBEC3A, CD68, CTSL, SDCBP, NINJ1, HLA-DRA, CCL2
+## Negative: NPM1, CCR7, GIMAP7, LTB, CD7, SELL, CD2, TMSB4X, TRAT1, IL7R
+## IL32, RHOH, ITM2A, RGCC, LEF1, CD3G, ALOX5AP, CREM, NHP2, PASK
+## MYC, SNHG8, TSC22D3, GPR171, BIRC3, NOP58, RARRES3, CD27, SRM, CD8B
+## PC_ 2
+## Positive: ISG15, ISG20, IFIT3, IFIT1, LY6E, TNFSF10, IFIT2, MX1, IFI6, RSAD2
+## CXCL10, OAS1, CXCL11, IFITM3, MT2A, OASL, TNFSF13B, IDO1, IL1RN, APOBEC3A
+## GBP1, CCL8, HERC5, FAM26F, GBP4, HES4, WARS, VAMP5, DEFB1, XAF1
+## Negative: IL8, CLEC5A, CD14, VCAN, S100A8, IER3, MARCKSL1, IL1B, PID1, CD9
+## GPX1, PHLDA1, INSIG1, PLAUR, PPIF, THBS1, OSM, SLC7A11, GAPDH, CTB-61M7.2
+## LIMS1, S100A9, GAPT, CXCL3, ACTB, C19orf59, CEBPB, OLR1, MGST1, FTH1
+## PC_ 3
+## Positive: HLA-DQA1, CD83, HLA-DQB1, CD74, HLA-DPA1, HLA-DRA, HLA-DRB1, HLA-DPB1, SYNGR2, IRF8
+## CD79A, MIR155HG, HERPUD1, REL, HLA-DMA, MS4A1, HSP90AB1, FABP5, TVP23A, ID3
+## CCL22, EBI3, TSPAN13, PMAIP1, TCF4, PRMT1, NME1, HSPE1, HSPD1, CD70
+## Negative: ANXA1, GIMAP7, TMSB4X, CD7, CD2, RARRES3, MT2A, IL32, GNLY, PRF1
+## NKG7, CCL5, TRAT1, RGCC, S100A9, KLRD1, CCL2, GZMH, GZMA, CD3G
+## S100A8, CTSW, CCL7, ITM2A, HPSE, FGFBP2, CTSL, GPR171, CCL8, OASL
+## PC_ 4
+## Positive: NKG7, GZMB, GNLY, CST7, CCL5, PRF1, CLIC3, KLRD1, GZMH, GZMA
+## APOBEC3G, CTSW, FGFBP2, KLRC1, FASLG, C1orf21, HOPX, CXCR3, SH2D1B, LINC00996
+## TNFRSF18, SPON2, RARRES3, GCHFR, SH2D2A, IGFBP7, ID2, C12orf75, XCL2, RAMP1
+## Negative: CCR7, LTB, SELL, LEF1, IL7R, ADTRP, TRAT1, PASK, MYC, NPM1
+## SOCS3, TSC22D3, TSHZ2, HSP90AB1, SNHG8, GIMAP7, PIM2, HSPD1, CD3G, TXNIP
+## RHOH, GBP1, C12orf57, CA6, PNRC1, CMSS1, CD27, SESN3, NHP2, BIRC3
+## PC_ 5
+## Positive: VMO1, FCGR3A, MS4A4A, CXCL16, MS4A7, PPM1N, HN1, LST1, SMPDL3A, ATP1B3
+## CASP5, CDKN1C, CH25H, AIF1, PLAC8, SERPINA1, LRRC25, CD86, HCAR3, GBP5
+## TMSB4X, RP11-290F20.3, RGS19, VNN2, ADA, LILRA5, STXBP2, C3AR1, PILRA, FCGR3B
+## Negative: CCL2, CCL7, CCL8, PLA2G7, LMNA, S100A9, SDS, TXN, CSTB, ATP6V1F
+## CCR1, EMP1, CAPG, CCR5, TPM4, IDO1, MGST1, HPSE, CTSB, LILRB4
+## RSAD2, HSPA1A, VIM, CCNA1, CTSL, GCLM, PDE4DIP, SGTB, SLC7A11, FABP5
+## Computing nearest neighbor graph
+## Computing SNN
+## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
+##
+## Number of nodes: 13999
+## Number of edges: 555146
+##
+## Running Louvain algorithm...
+## Maximum modularity in 10 random starts: 0.8153
+## Number of communities: 26
+## Elapsed time: 3 seconds
+## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
+## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
+## This message will be shown once per session
+## 15:24:47 UMAP embedding parameters a = 0.9922 b = 1.112
+## 15:24:47 Read 13999 rows and found 30 numeric columns
+## 15:24:47 Using Annoy for neighbor search, n_neighbors = 30
+## 15:24:47 Building Annoy index with metric = cosine, n_trees = 50
+## 0% 10 20 30 40 50 60 70 80 90 100%
+## [----|----|----|----|----|----|----|----|----|----|
+## **************************************************|
+## 15:24:49 Writing NN index file to temp file /tmp/Rtmppe4ngQ/file16b5013fa6700
+## 15:24:49 Searching Annoy index using 1 thread, search_k = 3000
+## 15:24:55 Annoy recall = 100%
+## 15:24:56 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
+## 15:24:57 Initializing from normalized Laplacian + noise (using RSpectra)
+## 15:24:58 Commencing optimization for 200 epochs, with 614378 positive edges
+## 15:25:07 Optimization finished
+# Plot UMAP, colour by `stim` and by Leiden clusters
+DimPlot( ifnbs, reduction = "umap.unintegrated", group.by = "stim" ) + coord_equal()
+
+DimPlot( ifnbs, reduction = "umap.unintegrated", group.by = "seurat_clusters" ) + coord_equal()
+
+# Also show UMAP using colours for annotation from previous analysis
+DimPlot( ifnbs, reduction = "umap.unintegrated", group.by = "seurat_annotations", label=TRUE ) + coord_equal()
+
+# Now, we use Sleepwalk to compare the two cell subsets:
+sleepwalk::sleepwalk(
+ list(
+ Embeddings(ifnbs,"umap.unintegrated")[ ifnbs$stim=="CTRL", ],
+ Embeddings(ifnbs,"umap.unintegrated")[ ifnbs$stim=="STIM", ] ),
+ list(
+ Embeddings(ifnbs,"pca")[ ifnbs$stim=="CTRL", ],
+ Embeddings(ifnbs,"pca")[ ifnbs$stim=="STIM", ] ),
+ same="features" )
+## Estimating 'maxdist' for feature matrix 1
+## Estimating 'maxdist' for feature matrix 2
+# Perform CCA integration (This takes a few minutes)
+# ifnbs <- IntegrateLayers( object = ifnbs, method = CCAIntegration,
+# orig.reduction = "pca", new.reduction = "integrated.cca" )
+
+# Save result of this lengthy calculation
+#save( ifnbs, file="~/tmp/ifnbs_integrated.rda" )
+load( "~/tmp/ifnbs_integrated.rda" )
+
+# Re-join layers after integration
+ifnb <- ifnbs
+ifnb[["RNA"]] <- JoinLayers( ifnb[["RNA"]] )
+
+# Redo neighbor search, clustering, and UMAP, now based on CCA instead of PCA:
+ifnb %>%
+FindNeighbors( reduction = "integrated.cca", dims = 1:30 ) %>%
+FindClusters( resolution = 1 ) %>%
+RunUMAP( dims = 1:30, reduction = "integrated.cca" ) -> ifnb
+## Computing nearest neighbor graph
+## Computing SNN
+## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
+##
+## Number of nodes: 13999
+## Number of edges: 591032
+##
+## Running Louvain algorithm...
+## Maximum modularity in 10 random starts: 0.8448
+## Number of communities: 18
+## Elapsed time: 3 seconds
+## 15:25:27 UMAP embedding parameters a = 0.9922 b = 1.112
+## 15:25:27 Read 13999 rows and found 30 numeric columns
+## 15:25:27 Using Annoy for neighbor search, n_neighbors = 30
+## 15:25:27 Building Annoy index with metric = cosine, n_trees = 50
+## 0% 10 20 30 40 50 60 70 80 90 100%
+## [----|----|----|----|----|----|----|----|----|----|
+## **************************************************|
+## 15:25:30 Writing NN index file to temp file /tmp/Rtmppe4ngQ/file16b501ab9f9f1
+## 15:25:30 Searching Annoy index using 1 thread, search_k = 3000
+## 15:25:38 Annoy recall = 100%
+## 15:25:38 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
+## 15:25:39 Initializing from normalized Laplacian + noise (using RSpectra)
+## 15:25:40 Commencing optimization for 200 epochs, with 629042 positive edges
+## 15:25:49 Optimization finished
+# Plot UMAP again, coloured as before by `stim` covariate, clusters (from new clusterin),
+# and cell types (from original analysis):
+DimPlot( ifnb, reduction = "umap", group.by = "stim" ) + coord_equal()
+
+DimPlot( ifnb, reduction = "umap", group.by = "seurat_clusters" ) + coord_equal()
+
+DimPlot( ifnb, reduction = "umap", group.by = "seurat_annotations", label=TRUE ) + coord_equal()
+
+# The plot coloured by `stim` is misleading, as the blue points are plotted after
+# (and hence on top of) the red points.
+# Let's redo this plot manually and use `sample_frac` to permute the table rows and
+# thus shuffle the order in which the points are placed
+Embeddings(ifnb,"umap") %>%
+as_tibble() %>%
+add_column( cond = ifnb$stim ) %>%
+sample_frac() %>%
+ggplot() +
+ geom_point(aes(x=umap_1,y=umap_2,col=cond),size=.01) +
+ coord_equal()
+
+## Now we perform a simplified CCA manually, to compare
+
+# First, here's the number of cells of each condition:
+table( ifnb$stim )
+##
+## CTRL STIM
+## 6548 7451
+# Let's calculate a cross correlation between the control and the
+# the stimulated cells. The following matrix has one row for each
+# control cell and one column for each stimulated cells. The matrix entries
+# are the dot products of the corresponding cells' expression vectors,
+# as formed from the log-normalized expressions for the 2000 most variable genes.
+
+crosscor <- as.matrix(
+ t(LayerData(ifnb)[,ifnb$stim=="CTRL"][VariableFeatures(ifnb),]) %*%
+ LayerData(ifnb)[,ifnb$stim=="STIM"][VariableFeatures(ifnb),] )
+
+# Strictly speaking, this is not a cross-correlation yet, as we should have
+# standardized the two expression matrices before multiplying them by subtracting
+# for each gene its mean and dividing by its standard deviation. I'll add this when
+# I go through this code once more.
+
+# Now, do an SVD on the cross-correlation matrix
+# svd <- irlba::svdr( crosscor, 20 )
+
+#save( svd, file="~/tmp/svd.rda" )
+load( "~/tmp/svd.rda" )
+
+# We use the left and right singular vectors as new coordinates, to replace the
+# PCA coordinates. The diagonal matrix in the middle is split and each square root
+# multiplied to one side. This gives us better distances, as can be seen with Sleepwalk:
+
+sleepwalk::sleepwalk(
+ list(
+ Embeddings(ifnb,"umap.unintegrated")[ ifnb$stim=="CTRL", ],
+ Embeddings(ifnb,"umap.unintegrated")[ ifnb$stim=="STIM", ] ),
+ list(
+ t( t(svd$u) * sqrt(svd$d) ),
+ t( t(svd$v) * sqrt(svd$d) ) ),
+ same="features" )
+## Estimating 'maxdist' for feature matrix 1
+## Estimating 'maxdist' for feature matrix 2
+## Server has been stopped.
+# Therefore, we use these as new reduction coordinates
+
+cca <- rbind(
+ t( t(svd$u) * sqrt(svd$d) ),
+ t( t(svd$v) * sqrt(svd$d) ) )
+rownames(cca) <- c( colnames(ifnb)[ifnb$stim=="CTRL"], colnames(ifnb)[ifnb$stim=="STIM"] )
+all( rownames(cca) == colnames(ifnb) )
+## [1] TRUE
+# Add these to the Seurat object
+ifnb@reductions$svd <- CreateDimReducObject( cca, assay="RNA", key="svd_" )
+## Warning: No columnames present in cell embeddings, setting to 'svd_1:20'
+# Let's make a UMAP from these:
+RunUMAP( ifnb, dims=1:20, reduction="svd", reduction.name = "umap.svd" ) -> ifnb
+## 15:26:05 UMAP embedding parameters a = 0.9922 b = 1.112
+## 15:26:05 Read 13999 rows and found 20 numeric columns
+## 15:26:05 Using Annoy for neighbor search, n_neighbors = 30
+## 15:26:05 Building Annoy index with metric = cosine, n_trees = 50
+## 0% 10 20 30 40 50 60 70 80 90 100%
+## [----|----|----|----|----|----|----|----|----|----|
+## **************************************************|
+## 15:26:07 Writing NN index file to temp file /tmp/Rtmppe4ngQ/file16b5071182fb9
+## 15:26:07 Searching Annoy index using 1 thread, search_k = 3000
+## 15:26:14 Annoy recall = 100%
+## 15:26:14 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
+## 15:26:15 Initializing from normalized Laplacian + noise (using RSpectra)
+## 15:26:15 Commencing optimization for 200 epochs, with 612288 positive edges
+## 15:26:23 Optimization finished
+# Here's a plot for the new UMAP:
+perm <- sample.int(ncol(ifnb))
+plot( Embeddings(ifnb,"umap.svd")[perm,], cex=.2, asp=1, col=1+as.integer(factor(ifnb$stim[perm])) )
+
+# And the usual colourings
+DimPlot( ifnb, reduction = "umap.svd", group.by = "seurat_clusters" ) + coord_equal()
+
+DimPlot( ifnb, reduction = "umap.svd", group.by = "seurat_annotations", label=TRUE ) + coord_equal()
+
+# Of course, the clustering does not fit that well, as it was done using another embedding.
+
+# And here a comparison between Seurat's CCA integration and our simplified one:
+sleepwalk::sleepwalk(
+ list( Embeddings(ifnb,"umap"), Embeddings(ifnb,"umap.svd") ),
+ list( Embeddings(ifnb,"integrated.cca"), Embeddings(ifnb,"svd"))
+)
+## Estimating 'maxdist' for feature matrix 1
+## Estimating 'maxdist' for feature matrix 2
+## Server has been stopped.
+### Integration via mutual nearest neighbours
+
+# (FROM HERE ON: only draft)
+
+# We start with the PCA that was computed at the very beginning, using all cells
+# and ignoring the stim covariate
+Embeddings( ifnb, "pca" ) -> pca
+
+# Here ios the number of cells
+table( ifnb$stim )
+##
+## CTRL STIM
+## 6548 7451
+# cross-NN search:
+nn_cs <- FNN::get.knnx( pca[ifnb$stim=="CTRL",], pca[ifnb$stim=="STIM",], k=25 )
+nn_sc <- FNN::get.knnx( pca[ifnb$stim=="STIM",], pca[ifnb$stim=="CTRL",], k=25 )
+
+bind_rows(
+ nn_cs$nn.index %>%
+ `colnames<-`( 1:ncol(.) ) %>%
+ as_tibble() %>%
+ add_column( center_cell = names(which(ifnb$stim=="STIM")) ) %>%
+ pivot_longer( -`center_cell`, names_to="ngb_index", values_to="ngb_cell" ) %>%
+ mutate( ngb_index = as.integer(ngb_index) ) %>%
+ mutate( ngb_cell = names(which(ifnb$stim=="CTRL"))[ ngb_cell ] ) %>%
+ add_column( center_is = "STIM" ),
+ nn_sc$nn.index %>%
+ `colnames<-`( 1:ncol(.) ) %>%
+ as_tibble() %>%
+ add_column( center_cell = names(which(ifnb$stim=="CTRL")) ) %>%
+ pivot_longer( -`center_cell`, names_to="ngb_index", values_to="ngb_cell" ) %>%
+ mutate( ngb_index = as.integer(ngb_index) ) %>%
+ mutate( ngb_cell = names(which(ifnb$stim=="STIM"))[ ngb_cell ] ) %>%
+ add_column( center_is = "CTRL" ) ) -> knn_tbl
+
+head( knn_tbl )
+## # A tibble: 6 × 4
+## center_cell ngb_index ngb_cell center_is
+## <chr> <int> <chr> <chr>
+## 1 AAACATACCAAGCT.1 1 AGAGATGATCCCAC.1 STIM
+## 2 AAACATACCAAGCT.1 2 CCATGCTGGGATCT.1 STIM
+## 3 AAACATACCAAGCT.1 3 CCCGAACTACCAAC.1 STIM
+## 4 AAACATACCAAGCT.1 4 GTTAAATGCGCCTT.1 STIM
+## 5 AAACATACCAAGCT.1 5 TAATCGCTACCATG.1 STIM
+## 6 AAACATACCAAGCT.1 6 TCCCTACTACTGTG.1 STIM
+# How many of the neighbours are mutual?
+knn_tbl %>%
+mutate( stim_cell = ifelse( center_is=="STIM", center_cell, ngb_cell ) ) %>%
+mutate( ctrl_cell = ifelse( center_is=="CTRL", center_cell, ngb_cell ) ) %>%
+group_by( stim_cell, ctrl_cell ) %>%
+summarise( n=n() ) %>% ungroup() %>%
+count( n )
+## `summarise()` has grouped output by 'stim_cell'. You can override using the
+## `.groups` argument.
+## Storing counts in `nn`, as `n` already present in input
+## # A tibble: 2 × 2
+## n nn
+## <int> <int>
+## 1 1 297781
+## 2 2 26097
+# Make table of MNNs
+knn_tbl %>%
+mutate( stim_cell = ifelse( center_is=="STIM", center_cell, ngb_cell ) ) %>%
+mutate( ctrl_cell = ifelse( center_is=="CTRL", center_cell, ngb_cell ) ) %>%
+group_by( stim_cell, ctrl_cell ) %>%
+summarise( n=n(), .groups="drop" ) %>%
+filter( n==2 ) %>% select(-n) -> mnn_tbl
+head( mnn_tbl )
+## # A tibble: 6 × 2
+## stim_cell ctrl_cell
+## <chr> <chr>
+## 1 AAACATACCCCTAC.1 ATAGTTGATAAAGG.1
+## 2 AAACATACCCGTAA.1 AGAATTTGCTCAGA.1
+## 3 AAACATACCCGTAA.1 CCAACCTGTGCCAA.1
+## 4 AAACATACCCGTAA.1 CGAATCGAGCAGAG.1
+## 5 AAACATACCCGTAA.1 CTCAGGCTGGACAG.1
+## 6 AAACATACCCGTAA.1 GAGATAGACTGACA.1
+# How many MNNs?
+mnn_tbl %>%
+group_by( ctrl_cell ) %>%
+summarise(n())
+## # A tibble: 3,752 × 2
+## ctrl_cell `n()`
+## <chr> <int>
+## 1 AAACATACATTTCC.1 1
+## 2 AAACATACCAGAAA.1 1
+## 3 AAACATACCTGGTA.1 19
+## 4 AAACATACGATGAA.1 9
+## 5 AAACATACTGCGTA.1 5
+## 6 AAACCGTGAGCCAT.1 1
+## 7 AAACCGTGTGCTAG.1 8
+## 8 AAACGCACACTTTC.1 3
+## 9 AAACGCACAGTACC.1 6
+## 10 AAACGCACCAACCA.1 19
+## # ℹ 3,742 more rows
+## Save image
+#save.image( file="~/tmp/cca.rda" )
+
+
+
+
+