Skip to content

Commit

Permalink
cca
Browse files Browse the repository at this point in the history
  • Loading branch information
simon-anders committed Jun 24, 2024
1 parent a6fad51 commit 3688430
Show file tree
Hide file tree
Showing 5 changed files with 930 additions and 97 deletions.
796 changes: 796 additions & 0 deletions _site/cca.html

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions _site/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,7 @@ <h2 class="anchored" data-anchor-id="material-and-video-recodings">Material and
<li>prepared for following lectures
<ul>
<li><a href="cell_cycle.html">Cell cycle regression</a></li>
<li><a href="cca.html">CCA Integration</a></li>
</ul></li>
</ul>
</section>
Expand Down
2 changes: 1 addition & 1 deletion _site/search.json
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@
"href": "index.html#material-and-video-recodings",
"title": "Mathematics of Single-cell Omics",
"section": "Material and Video Recodings",
"text": "Material and Video Recodings\n\nLecture of 2024-04-22:\n\nVideo\nBiology Primer\nOverview High-throughput Sequencing\nA simple analysis with Seurat\nExample data: PBMC3k (available on 10X web site, and Moodle)\n\nExercise class of 2024-04-24\n\nVideo\nDoing the Seurat analysis “on foot”\n\nLecture of 2024-05-06\n\nVideo\nModularity clustering\nblackboard photo\n\nExercise class of 2024-05-08\n\nVideo\n\nImplementing modularity score in Python\nlouvain.py\n\nLecture of 2024-05-13\n\nVideo\nMathematics of PCA\nBlackboard photos: 1 2\n\nExercise class of 2024-05-15\n\nVideo:\nfitting pseudotimes and splines\nR code: pseudotime_1.R, splines_1.R\nexample data: cmss.rda (on Moodle)\n\nLecture of 2024-05-22\n\nVideo\nExpression dynamics along pseudotime trajectories\nSmoothing\nblackboard photo\n\nLecture of 2024-05-27\n\nVideo\nSmoothing with principal curves\n\nLecture of 2024-06-03\n\nVideo\nBlackboard photos: 1 2\nSpectra of graphs (Skript still missing)\nDiffusion on graphs\n\nLecture of 2024-06-10\n\nVideo\nLaplacian eigenmaps\n\nLecture of 2024-06-17\n\nVideo (not yet uploaded)\nComparison of t-SNE and UMAP for our example data\nMathematics of t-SNE and UMAP\nReimplementing UMAP in Python\n\nprepared for following lectures\n\nCell cycle regression"
"text": "Material and Video Recodings\n\nLecture of 2024-04-22:\n\nVideo\nBiology Primer\nOverview High-throughput Sequencing\nA simple analysis with Seurat\nExample data: PBMC3k (available on 10X web site, and Moodle)\n\nExercise class of 2024-04-24\n\nVideo\nDoing the Seurat analysis “on foot”\n\nLecture of 2024-05-06\n\nVideo\nModularity clustering\nblackboard photo\n\nExercise class of 2024-05-08\n\nVideo\n\nImplementing modularity score in Python\nlouvain.py\n\nLecture of 2024-05-13\n\nVideo\nMathematics of PCA\nBlackboard photos: 1 2\n\nExercise class of 2024-05-15\n\nVideo:\nfitting pseudotimes and splines\nR code: pseudotime_1.R, splines_1.R\nexample data: cmss.rda (on Moodle)\n\nLecture of 2024-05-22\n\nVideo\nExpression dynamics along pseudotime trajectories\nSmoothing\nblackboard photo\n\nLecture of 2024-05-27\n\nVideo\nSmoothing with principal curves\n\nLecture of 2024-06-03\n\nVideo\nBlackboard photos: 1 2\nSpectra of graphs (Skript still missing)\nDiffusion on graphs\n\nLecture of 2024-06-10\n\nVideo\nLaplacian eigenmaps\n\nLecture of 2024-06-17\n\nVideo (not yet uploaded)\nComparison of t-SNE and UMAP for our example data\nMathematics of t-SNE and UMAP\nReimplementing UMAP in Python\n\nprepared for following lectures\n\nCell cycle regression\nCCA Integration"
},
{
"objectID": "index.html#source-code",
Expand Down
227 changes: 131 additions & 96 deletions cca.R
Original file line number Diff line number Diff line change
@@ -1,41 +1,71 @@
library( tidyverse )
library( Seurat )

# Load `ifnb` data set
ifnb <- SeuratData::LoadData("ifnb")

ifnb[["RNA"]] <- split(ifnb[["RNA"]], f = ifnb$stim)
ifnb
# run standard anlaysis workflow
ifnb <- NormalizeData(ifnb)
ifnb <- FindVariableFeatures(ifnb)
ifnb <- ScaleData(ifnb)
ifnb <- RunPCA(ifnb)
ifnb <- FindNeighbors(ifnb, dims = 1:30, reduction = "pca")
ifnb <- FindClusters(ifnb, resolution = 2, cluster.name = "unintegrated_clusters")

ifnb <- RunUMAP(ifnb, dims = 1:30, reduction = "pca", reduction.name = "umap.unintegrated")
DimPlot(ifnb, reduction = "umap.unintegrated", group.by = c("stim", "seurat_clusters"))

ifnb <- IntegrateLayers(object = ifnb, method = CCAIntegration,
orig.reduction = "pca", new.reduction = "integrated.cca" )

# re-join layers after integration
ifnb[["RNA"]] <- JoinLayers(ifnb[["RNA"]])
# Split by `stim`:
ifnbs <- ifnb
ifnbs[["RNA"]] <- split( ifnbs[["RNA"]], f = ifnbs$stim )
ifnbs

# Run standard analysis workflow
ifnbs %>%
NormalizeData() %>%
FindVariableFeatures() %>%
ScaleData() %>%
RunPCA() %>%
FindNeighbors( dims=1:30, reduction="pca" ) %>%
FindClusters( resolution=2, cluster.name="unintegrated_clusters" ) %>%
RunUMAP( dims=1:30, reduction="pca", reduction.name="umap.unintegrated" ) -> ifnbs

# Plot UMAP, colour by `stim` and by Leiden clusters
DimPlot( ifnbs, reduction = "umap.unintegrated", group.by = "stim" ) + coord_equal()
DimPlot( ifnbs, reduction = "umap.unintegrated", group.by = "seurat_clusters" ) + coord_equal()

# Also show UMAP using colours for annotation from previous analysis
DimPlot( ifnbs, reduction = "umap.unintegrated", group.by = "seurat_annotations", label=TRUE ) + coord_equal()

# Now, we use Sleepwalk to compare the two cell subsets:
sleepwalk::sleepwalk(
list(
Embeddings(ifnbs,"umap.unintegrated")[ ifnbs$stim=="CTRL", ],
Embeddings(ifnbs,"umap.unintegrated")[ ifnbs$stim=="STIM", ] ),
list(
Embeddings(ifnbs,"pca")[ ifnbs$stim=="CTRL", ],
Embeddings(ifnbs,"pca")[ ifnbs$stim=="STIM", ] ),
same="features" )

ifnb <- FindNeighbors(ifnb, reduction = "integrated.cca", dims = 1:30)
ifnb <- FindClusters(ifnb, resolution = 1)

ifnb <- RunUMAP(ifnb, dims = 1:30, reduction = "pca", reduction.name = "umap.unintegrated")
DimPlot(ifnb, reduction = "umap.unintegrated", group.by = c("stim", "seurat_clusters"))
# Perform CCA integration (This takes a few minutes)
# ifnbs <- IntegrateLayers( object = ifnbs, method = CCAIntegration,
# orig.reduction = "pca", new.reduction = "integrated.cca" )

ifnb <- RunUMAP(ifnb, dims = 1:30, reduction = "integrated.cca")
# Save result of this lengthy calculation
#save( ifnbs, file="~/tmp/ifnbs_integrated.rda" )
load( "~/tmp/ifnbs_integrated.rda" )

# Visualization
DimPlot(ifnb, reduction = "umap", group.by = c("stim", "seurat_annotations"))
# Re-join layers after integration
ifnb <- ifnbs
ifnb[["RNA"]] <- JoinLayers( ifnb[["RNA"]] )

save( ifnb, file="~/tmp/ifnb_integrated.rda" )
# Redo neighbor search, clustering, and UMAP, now based on CCA instead of PCA:
ifnb %>%
FindNeighbors( reduction = "integrated.cca", dims = 1:30 ) %>%
FindClusters( resolution = 1 ) %>%
RunUMAP( dims = 1:30, reduction = "integrated.cca" ) -> ifnb

# Plot UMAP again, coloured as before by `stim` covariate, clusters (from new clusterin),
# and cell types (from original analysis):
DimPlot( ifnb, reduction = "umap", group.by = "stim" ) + coord_equal()
DimPlot( ifnb, reduction = "umap", group.by = "seurat_clusters" ) + coord_equal()
DimPlot( ifnb, reduction = "umap", group.by = "seurat_annotations", label=TRUE ) + coord_equal()

# The plot coloured by `stim` is misleading, as the blue points are plotted after
# (and hence on top of) the red points.
# Let's redo this plot manually and use `sample_frac` to permute the table rows and
# thus shuffle the order in which the points are placed
Embeddings(ifnb,"umap") %>%
as_tibble() %>%
add_column( cond = ifnb$stim ) %>%
Expand All @@ -44,23 +74,88 @@ ggplot() +
geom_point(aes(x=umap_1,y=umap_2,col=cond),size=.01) +
coord_equal()

## Now we perform a simplified CCA manually, to compare

# First, here's the number of cells of each condition:
table( ifnb$stim )

# Let's calculate a cross correlation between the control and the
# the stimulated cells. The following matrix has one row for each
# control cell and one column for each stimulated cells. The matrix entries
# are the dot products of the corresponding cells' expression vectors,
# as formed from the log-normalized expressions for the 2000 most variable genes.

crosscor <- as.matrix(
t(LayerData(ifnb)[,ifnb$stim=="CTRL"][VariableFeatures(ifnb),]) %*%
LayerData(ifnb)[,ifnb$stim=="STIM"][VariableFeatures(ifnb),] )

# Strictly speaking, this is not a cross-correlation yet, as we should have
# standardized the two expression matrices before multiplying them by subtracting
# for each gene its mean and dividing by its standard deviation. I'll add this when
# I go through this code once more.

# Now, do an SVD on the cross-correlation matrix
# svd <- irlba::svdr( crosscor, 20 )

#save( svd, file="~/tmp/svd.rda" )
load( "~/tmp/svd.rda" )

# We use the left and right singular vectors as new coordinates, to replace the
# PCA coordinates. The diagonal matrix in the middle is split and each square root
# multiplied to one side. This gives us better distances, as can be seen with Sleepwalk:

sleepwalk::sleepwalk(
list(
Embeddings(ifnb,"umap.unintegrated")[ ifnb$stim=="CTRL", ],
Embeddings(ifnb,"umap.unintegrated")[ ifnb$stim=="STIM", ] ),
list(
t( t(svd$u) * sqrt(svd$d) ),
t( t(svd$v) * sqrt(svd$d) ) ),
same="features" )


### Manual
# Therefore, we use these as new reduction coordinates

# PCA performend on all cells together, regardless of origin
cca <- rbind(
t( t(svd$u) * sqrt(svd$d) ),
t( t(svd$v) * sqrt(svd$d) ) )
rownames(cca) <- c( colnames(ifnb)[ifnb$stim=="CTRL"], colnames(ifnb)[ifnb$stim=="STIM"] )
all( rownames(cca) == colnames(ifnb) )

# Add these to the Seurat object
ifnb@reductions$svd <- CreateDimReducObject( cca, assay="RNA", key="svd_" )

# Let's make a UMAP from these:
RunUMAP( ifnb, dims=1:20, reduction="svd", reduction.name = "umap.svd" ) -> ifnb

# Here's a plot for the new UMAP:
perm <- sample.int(ncol(ifnb))
plot( Embeddings(ifnb,"umap.svd")[perm,], cex=.2, asp=1, col=1+as.integer(factor(ifnb$stim[perm])) )

# And the usual colourings
DimPlot( ifnb, reduction = "umap.svd", group.by = "seurat_clusters" ) + coord_equal()
DimPlot( ifnb, reduction = "umap.svd", group.by = "seurat_annotations", label=TRUE ) + coord_equal()

# Of course, the clustering does not fit that well, as it was done using another embedding.

# And here a comparison between Seurat's CCA integration and our simplified one:
sleepwalk::sleepwalk(
list( Embeddings(ifnb,"umap"), Embeddings(ifnb,"umap.svd") ),
list( Embeddings(ifnb,"integrated.cca"), Embeddings(ifnb,"svd"))
)

### Integration via mutual nearest neighbours

# (FROM HERE ON: only draft)

# We start with the PCA that was computed at the very beginning, using all cells
# and ignoring the stim covariate
Embeddings( ifnb, "pca" ) -> pca

# Origin of the cells:
# Here ios the number of cells
table( ifnb$stim )

# NN search:
# cross-NN search:
nn_cs <- FNN::get.knnx( pca[ifnb$stim=="CTRL",], pca[ifnb$stim=="STIM",], k=25 )
nn_sc <- FNN::get.knnx( pca[ifnb$stim=="STIM",], pca[ifnb$stim=="CTRL",], k=25 )

Expand All @@ -82,91 +177,31 @@ bind_rows(
mutate( ngb_cell = names(which(ifnb$stim=="STIM"))[ ngb_cell ] ) %>%
add_column( center_is = "CTRL" ) ) -> knn_tbl

head( knn_tbl )

# How many of the neighbours are mutual?

knn_tbl %>%
mutate( stim_cell = ifelse( center_is=="STIM", center_cell, ngb_cell ) ) %>%
mutate( ctrl_cell = ifelse( center_is=="CTRL", center_cell, ngb_cell ) ) %>%
group_by( stim_cell, ctrl_cell ) %>%
summarise( n=n() ) %>% ungroup() %>%
count( n )

# Make table of MNNs
knn_tbl %>%
mutate( stim_cell = ifelse( center_is=="STIM", center_cell, ngb_cell ) ) %>%
mutate( ctrl_cell = ifelse( center_is=="CTRL", center_cell, ngb_cell ) ) %>%
group_by( stim_cell, ctrl_cell ) %>%
summarise( n=n(), .groups="drop" ) %>%
filter( n==2 ) %>% select(-n) -> mnn_tbl
head( mnn_tbl )

Embeddings(ifnb,"umap.unintegrated") %>%
as_tibble( rownames="cell" ) %>%
add_column( cond = ifnb$stim ) %>%
mutate( is_among_mnn = cell %in% c( mnn_tbl$ctrl_cell, mnn_tbl$stim_cell ) ) %>%
sample_frac() %>%
ggplot( aes( x=umapunintegrated_1, y=umapunintegrated_2, col=cond ) ) +
geom_point( size=.1, alpha=.2 ) +
geom_point( size=.1, alpha=1, data=function(x) filter(x,is_among_mnn) ) +
coord_equal()



# How many MNNs?
mnn_tbl %>%
group_by( ctrl_cell ) %>%
summarise(n())

sleepwalk::sleepwalk(
list(
Embeddings(ifnb,"umap.unintegrated")[ ifnb$stim=="CTRL", ],
Embeddings(ifnb,"umap.unintegrated")[ ifnb$stim=="STIM", ] ),
list(
Embeddings(ifnb,"pca")[ ifnb$stim=="CTRL", ],
Embeddings(ifnb,"pca")[ ifnb$stim=="STIM", ] ),
same="features" )



#### SVD

crosscor <- as.matrix(
t(LayerData(ifnb)[,ifnb$stim=="CTRL"][VariableFeatures(ifnb),]) %*%
LayerData(ifnb)[,ifnb$stim=="STIM"][VariableFeatures(ifnb),] )

svd <- irlba::svdr( crosscor, 20 )

sleepwalk::sleepwalk(
list(
Embeddings(ifnb,"umap.unintegrated")[ ifnb$stim=="CTRL", ],
Embeddings(ifnb,"umap.unintegrated")[ ifnb$stim=="STIM", ] ),
list(
t( t(svd$u) * sqrt(svd$d) ),
t( t(svd$v) * sqrt(svd$d) ) ),
same="features" )

cca <- rbind(
t( t(svd$u) * sqrt(svd$d) ),
t( t(svd$v) * sqrt(svd$d) ) )
rownames(cca) <- c( colnames(ifnb)[ifnb$stim=="CTRL"], colnames(ifnb)[ifnb$stim=="STIM"] )
ump_cca <- uwot::umap(cca)

all( rownames(ump_cca) == colnames(ifnb) )

perm <- sample.int(nrow(ump_cca))
plot( ump_cca[perm,], cex=.2, asp=1, col=1+as.integer(factor(ifnb$stim[perm])) )

sleepwalk::sleepwalk(
list( Embeddings(ifnb,"umap"), ump_cca ),
list( Embeddings(ifnb,"integrated.cca"), cca)
)


qqplot( svd$u[,1], svd$v[,1] )

u1_sorted <- sort( svd$u[,1] )
v1_sorted <- sort( svd$v[,1] )
plot(
u1_sorted[ ceiling( seq(0,1,length.out=300)*length(u1_sorted) ) ],
v1_sorted[ ceiling( seq(0,1,length.out=300)*length(v1_sorted) ) ] )

plot(
cor( t(as.matrix(LayerData(ifnb)[,ifnb$stim=="CTRL"][VariableFeatures(ifnb),])), svd$u[,1] ),
cor( t(as.matrix(LayerData(ifnb)[,ifnb$stim=="STIM"][VariableFeatures(ifnb),])), svd$v[,1] ) )
## Save image
#save.image( file="~/tmp/cca.rda" )
1 change: 1 addition & 0 deletions index.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ Place:
- [Reimplementing UMAP in Python](UMAP_pedestrian.html)
- prepared for following lectures
- [Cell cycle regression](cell_cycle.html)
- [CCA Integration](cca.html)

## Source code

Expand Down

0 comments on commit 3688430

Please sign in to comment.