-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
7732a3d
commit 987effb
Showing
2 changed files
with
245 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,182 @@ | ||
library( Matrix ) | ||
library( sparseMatrixStats ) | ||
|
||
|
||
## Read count matrix | ||
|
||
Seurat::ReadMtx( | ||
"~/tmp/ifnagrko/ifnagrko_raw_counts.mtx.gz", | ||
"~/tmp/ifnagrko/ifnagrko_obs.csv.gz", | ||
"~/tmp/ifnagrko/ifnagrko_var.csv", | ||
feature.column=2, skip.cell=1, skip.feature=1, | ||
cell.sep=",", feature.sep=",", | ||
mtx.transpose=TRUE ) -> count_matrix | ||
|
||
count_matrix[ 1:5, 1:5 ] | ||
|
||
|
||
## Calculate fraction | ||
|
||
fractions <- t( t(count_matrix) / colSums(count_matrix) ) | ||
|
||
|
||
## Get highly variable genes | ||
|
||
hvg <- names( head( sort( rowVars(fractions) / rowMeans(fractions), decreasing=TRUE ), 2000 ) ) | ||
|
||
plot( rowMeans(fractions), rowVars(fractions) / rowMeans(fractions), | ||
cex=.3, log="xy", | ||
col = scales::alpha( ifelse( rownames(fractions) %in% hvg, "red", "black" ), .3 ) ) | ||
abline( h = mean( 1/colSums(count_matrix)), col="orange") | ||
|
||
|
||
# Log transform | ||
|
||
expr <- log1p( fractions * 1e4 ) | ||
|
||
|
||
# PCA | ||
|
||
pca <- irlba::prcomp_irlba( t( expr[hvg,] ), n=20, center=TRUE, scale.=TRUE ) | ||
rownames(pca$rotation) <- hvg | ||
rownames(pca$x) <- colnames(expr) | ||
|
||
|
||
# Distance matrix | ||
|
||
dm <- as.matrix( dist( pca$x ) ) | ||
|
||
t(sapply( 1:ncol(expr), function(cell) | ||
order( dm[cell,] )[1:20] )) -> nn | ||
head( nn ) | ||
|
||
# Distance matrix, faster (using kd-tree algorithm: https://en.wikipedia.org/wiki/K-d_tree) | ||
FNN::get.knn( pca$x, 20 ) -> nn2 | ||
|
||
|
||
# Make UMAP | ||
|
||
uwot::umap( pca$x, nn_method = list( idx=nn2$nn.index, dist=nn2$nn.dist ) ) -> ump | ||
colnames(ump) <- c( "U1", "U2" ) | ||
|
||
plot( ump, cex=.1, asp=1, col="#00000050" ) | ||
|
||
sleepwalk::sleepwalk( ump, pca$x ) | ||
|
||
|
||
# Feature plots | ||
library( tidyverse ) | ||
as_tibble( ump ) %>% | ||
add_column( expr=expr["Mki67",] ) %>% | ||
ggplot + geom_point( aes( x=U1, y=U2, col=expr ), size=.3 ) + coord_equal() | ||
|
||
|
||
# Make a k-NN graph | ||
|
||
library( igraph ) | ||
|
||
# Construct edge list: each cell to it's k-th nearest neighbor | ||
k <- 3 | ||
# start of the edge list: | ||
( rbind( 1:nrow(nn), nn[,k+1] ) )[ , 1:20 ] | ||
# the same, flattened (this is the format that igraph wants) | ||
as.vector( rbind( 1:nrow(nn), nn[,k+1] ) )[ 1:40 ] | ||
# now for all values of k | ||
do.call( c, lapply( 1:9, function(k) as.vector( rbind( 1:nrow(nn), nn[,k+1] ) ) ) ) -> edgelist | ||
# make a graph | ||
make_graph( edges=edgelist, n=nrow(nn), directed=FALSE ) -> nn_graph | ||
|
||
# Check the vertex degrees | ||
hist( degree(nn_graph) ) | ||
|
||
|
||
# Leiden clustering | ||
cluster_leiden( nn_graph, objective_function="modularity", resolution_parameter=.2 ) -> cm | ||
|
||
table( membership(cm) ) | ||
|
||
as_tibble( ump ) %>% | ||
add_column( cluster = factor( membership(cm) ) ) %>% | ||
ggplot + geom_point( aes( x=U1, y=U2, col=cluster ), size=.3 ) + coord_equal() | ||
|
||
|
||
# Check modularity score | ||
modularity( nn_graph, membership(cm) ) | ||
|
||
# Calculate by hand: | ||
|
||
# How many of the edges are connecting two vertices in the same group? | ||
get.data.frame( nn_graph, what = "edges" ) %>% | ||
as_tibble() %>% | ||
mutate( | ||
from_cluster = membership(cm)[from], | ||
to_cluster = membership(cm)[to] ) %>% | ||
summarise( mean( from_cluster == to_cluster ) ) | ||
# -> 0.923 | ||
|
||
# And how many would we expect under random permutation of connections? | ||
get.data.frame( nn_graph, what = "edges" ) %>% | ||
as_tibble() %>% | ||
mutate( to = sample(to) ) %>% | ||
mutate( | ||
from_cluster = membership(cm)[from], | ||
to_cluster = membership(cm)[to] ) %>% | ||
summarise( mean( from_cluster == to_cluster ) ) | ||
# -> 0.077 | ||
|
||
# The modularity score is the difference between these two: | ||
0.923 - 0.077 | ||
|
||
# Can we calculated the expectation after permutation without | ||
# actually performing the permutations? | ||
|
||
# We can see for each community how many edge stubs it contains | ||
tibble( | ||
vertex = 1:vcount(nn_graph), | ||
degree = degree(nn_graph), | ||
group = membership(cm) ) %>% | ||
group_by( group ) %>% | ||
summarise( n_stubs = sum(degree) ) -> stub_counts | ||
stub_counts | ||
|
||
# Let's write s_g for the number of stubs in cluster g. If we | ||
# randomly pick to stubs to connect, what is the probability | ||
# for them to be in the same group? | ||
|
||
sum(stub_counts$n_stubs^2) / sum(stub_counts$n_stubs)^2 | ||
|
||
# Here, we have the 0.077 from above | ||
|
||
|
||
# Calculation with matrix | ||
|
||
get.data.frame( nn_graph, what = "edges" ) %>% | ||
as_tibble() %>% | ||
mutate( | ||
from_cluster = membership(cm)[from], | ||
to_cluster = membership(cm)[to] ) %>% | ||
group_by( from_cluster, to_cluster ) %>% | ||
summarise( n=n() ) %>% | ||
mutate( | ||
from_cluster = factor( from_cluster, 1:max(from_cluster) ), | ||
to_cluster = factor( to_cluster, 1:max(to_cluster) ) ) %>% | ||
pivot_wider( id_cols = "from_cluster", names_from = "to_cluster", values_from = "n", values_fill = 0, names_sort=TRUE ) %>% | ||
column_to_rownames("from_cluster") %>% as.matrix -> ccm # cluster connection matrix | ||
ccm + t(ccm) -> ccm # both directions | ||
|
||
|
||
# Fraction of within-group edges | ||
sum(diag(ccm)) / sum(ccm) | ||
|
||
# Expected fraction after permuting edge conenctions | ||
colSums(ccm) -> vertex_sums | ||
sum( vertex_sums^2 ) / sum(vertex_sums)^2 | ||
|
||
# Modularity | ||
sum(diag(ccm)) / sum(ccm) - sum( colSums(ccm)^2 ) / sum( colSums(ccm) )^2 | ||
|
||
|
||
sum(diag(ccm)) / sum(ccm) - sum( ( vertex_sums/sum(vertex_sums) )^2 ) | ||
|
||
# Compare to igraph's calculation | ||
modularity( nn_graph, membership(cm) ) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
# Comparison of four types of smoothing splines | ||
|
||
# Make example data | ||
n <- 15 | ||
x <- runif( n, 0, 10 ) | ||
y <- sin(x) + rnorm( n, sd=.1 ) | ||
|
||
# Make grid to plot smooth curves | ||
xg <- seq( 0, 10, length.out=1000 ) | ||
|
||
# Determine knot positions: | ||
# The knots should have the same range as the data x values | ||
# and the number of data points between knots should be similar | ||
nknots <- 10 | ||
stopifnot( knots < n ) | ||
f <- approxfun( 1:length(x), sort(x) ) | ||
knots <- f( seq( 1, length(x), length.out=nknots ) ) | ||
|
||
# Make plot to justify knot positions | ||
plot( 1:length(x), sort(x) ) | ||
lines( seq( 0,length(x), length.out=1000), f(seq( 0,length(x), length.out=1000)), type="l" ) | ||
abline( v = seq( 1, length(x), length.out=nknots ), col="gray" ) | ||
abline( h = knots, col="gray" ) | ||
|
||
# Plot data and knot positions | ||
plot( x, y, xlim=c(-1, 11), ylim=c(-1.3,1.3) ) | ||
abline( v = knots, col="gray" ) | ||
abline( v = range(x) ) | ||
|
||
# With bs | ||
mysp <- function(x) bs( x, intercept=TRUE, | ||
Boundary.knots=knots[c(1,length(knots))], knots=knots[2:(length(knots)-1)] ) | ||
fit <- lm.fit( mysp(x), y ) | ||
yg <- mysp(xg) %*% fit$coefficients | ||
lines( xg, yg, col="pink" ) | ||
|
||
# With ns (like bs, but zero curvature at boundaries) | ||
mysp <- function(x) ns( x, intercept=TRUE, | ||
Boundary.knots=knots[c(1,length(knots))], knots=knots[2:(length(knots)-1)] ) | ||
fit <- lm.fit( mysp(x), y ) | ||
yg <- mysp(xg) %*% fit$coefficients | ||
lines( xg, yg, col="darkgreen" ) | ||
|
||
# With smooth.spline (like bs, but with curvature penalty) | ||
fit <- smooth.spline( x, y, df=df ) | ||
yg <- predict( fit, xg )$y | ||
lines( xg, yg, col="blue" ) | ||
|
||
# With P-splines | ||
m <- 50 | ||
mysp <- function(xx) bs( xx, intercept=TRUE, | ||
Boundary.knots=range(x), knots=seq(min(x),max(x),length.out=m-4) ) | ||
|
||
pty <- diag(2,m) | ||
pty[ row(pty) == col(pty)+1 ] <- -1 | ||
pty[ row(pty) == col(pty)-1 ] <- -1 | ||
pty[1,1] <- 1 | ||
pty[m,m] <- 1 | ||
|
||
lambda_scale <- sum(diag( t(mysp(x)) %*% mysp(x) )) / sum(diag(pty)) | ||
lambda <- 3 * lambda_scale # <- needs to be optimized | ||
beta <- solve( t(mysp(x)) %*% mysp(x) + lambda * pty , t(mysp(x)) %*% y ) | ||
lines( xg, mysp(xg) %*% beta, col="brown" ) |