some older test code

anders-biostat · Jun 5, 2024 · 987effb · 987effb
1 parent 7732a3d
commit 987effb
Show file tree

Hide file tree

Showing 2 changed files with 245 additions and 0 deletions.
diff --git a/ped_ifnagrko.R b/ped_ifnagrko.R
@@ -0,0 +1,182 @@
+library( Matrix )
+library( sparseMatrixStats )
+
+
+## Read count matrix
+
+Seurat::ReadMtx( 
+  "~/tmp/ifnagrko/ifnagrko_raw_counts.mtx.gz",
+  "~/tmp/ifnagrko/ifnagrko_obs.csv.gz",
+  "~/tmp/ifnagrko/ifnagrko_var.csv", 
+  feature.column=2, skip.cell=1, skip.feature=1, 
+  cell.sep=",", feature.sep=",",
+  mtx.transpose=TRUE ) -> count_matrix
+
+count_matrix[ 1:5, 1:5 ]
+
+
+## Calculate fraction
+
+fractions <- t( t(count_matrix) / colSums(count_matrix) )
+
+
+## Get highly variable genes
+
+hvg <- names( head( sort( rowVars(fractions) / rowMeans(fractions), decreasing=TRUE ), 2000 ) )
+
+plot( rowMeans(fractions), rowVars(fractions) / rowMeans(fractions), 
+      cex=.3, log="xy", 
+      col = scales::alpha( ifelse( rownames(fractions) %in% hvg, "red", "black" ), .3 ) )
+abline( h = mean( 1/colSums(count_matrix)), col="orange")
+
+
+# Log transform
+
+expr <- log1p( fractions * 1e4 )
+
+
+# PCA
+
+pca <- irlba::prcomp_irlba( t( expr[hvg,] ), n=20, center=TRUE, scale.=TRUE )
+rownames(pca$rotation) <- hvg
+rownames(pca$x) <- colnames(expr)
+
+
+# Distance matrix
+
+dm <- as.matrix( dist( pca$x ) )
+
+t(sapply( 1:ncol(expr), function(cell) 
+  order( dm[cell,] )[1:20] )) -> nn
+head( nn )
+
+# Distance matrix, faster (using kd-tree algorithm: https://en.wikipedia.org/wiki/K-d_tree)
+FNN::get.knn( pca$x, 20 ) -> nn2
+
+
+# Make UMAP
+
+uwot::umap( pca$x, nn_method = list( idx=nn2$nn.index, dist=nn2$nn.dist ) ) -> ump
+colnames(ump) <- c( "U1", "U2" )
+
+plot( ump, cex=.1, asp=1, col="#00000050" )
+
+sleepwalk::sleepwalk( ump, pca$x )
+
+
+# Feature plots
+library( tidyverse )
+as_tibble( ump ) %>%
+add_column( expr=expr["Mki67",] ) %>%
+ggplot + geom_point( aes( x=U1, y=U2, col=expr ), size=.3 ) + coord_equal()
+
+
+# Make a k-NN graph
+
+library( igraph )
+
+# Construct edge list: each cell to it's k-th nearest neighbor
+k <- 3
+#  start of the edge list:
+( rbind( 1:nrow(nn), nn[,k+1] ) )[ , 1:20 ]
+# the same, flattened (this is the format that igraph wants)
+as.vector( rbind( 1:nrow(nn), nn[,k+1] ) )[ 1:40 ]
+# now for all values of k
+do.call( c, lapply( 1:9, function(k) as.vector( rbind( 1:nrow(nn), nn[,k+1] ) ) ) ) -> edgelist
+# make a graph
+make_graph( edges=edgelist, n=nrow(nn), directed=FALSE ) -> nn_graph
+
+# Check the vertex degrees
+hist( degree(nn_graph) )
+
+
+# Leiden clustering
+cluster_leiden( nn_graph, objective_function="modularity", resolution_parameter=.2 ) -> cm
+
+table( membership(cm) )
+
+as_tibble( ump ) %>%
+add_column( cluster = factor( membership(cm) ) ) %>%
+ggplot + geom_point( aes( x=U1, y=U2, col=cluster ), size=.3 ) + coord_equal()
+
+
+# Check modularity score
+modularity( nn_graph, membership(cm) )
+
+# Calculate by hand:
+
+# How many of the edges are connecting two vertices in the same group?
+get.data.frame( nn_graph, what = "edges" ) %>%
+as_tibble() %>%
+mutate( 
+  from_cluster = membership(cm)[from],
+  to_cluster = membership(cm)[to] ) %>%
+summarise( mean( from_cluster == to_cluster ) )
+# -> 0.923
+
+# And how many would we expect under random permutation of connections?
+get.data.frame( nn_graph, what = "edges" ) %>%
+as_tibble() %>%
+mutate( to = sample(to) ) %>%
+mutate(
+  from_cluster = membership(cm)[from],
+  to_cluster = membership(cm)[to] ) %>%
+summarise( mean( from_cluster == to_cluster ) )
+# -> 0.077
+
+# The modularity score is the difference between these two:
+0.923 - 0.077
+
+# Can we calculated the expectation after permutation without
+# actually performing the permutations?
+
+# We can see for each community how many edge stubs it contains
+tibble( 
+  vertex = 1:vcount(nn_graph),
+  degree = degree(nn_graph),
+  group = membership(cm) ) %>%
+group_by( group ) %>%
+summarise( n_stubs = sum(degree) ) -> stub_counts
+stub_counts
+
+# Let's write s_g for the number of stubs in cluster g. If we
+# randomly pick to stubs to connect, what is the probability
+# for them to be in the same group?
+
+sum(stub_counts$n_stubs^2) / sum(stub_counts$n_stubs)^2
+
+# Here, we have the 0.077 from above
+
+
+# Calculation with matrix
+
+get.data.frame( nn_graph, what = "edges" ) %>%
+as_tibble() %>%
+mutate(
+  from_cluster = membership(cm)[from],
+  to_cluster = membership(cm)[to] ) %>%
+group_by( from_cluster, to_cluster ) %>%
+summarise( n=n() ) %>%
+mutate(
+  from_cluster = factor( from_cluster, 1:max(from_cluster) ),
+  to_cluster = factor( to_cluster, 1:max(to_cluster) ) ) %>%
+pivot_wider( id_cols = "from_cluster", names_from = "to_cluster", values_from = "n", values_fill = 0, names_sort=TRUE ) %>%
+column_to_rownames("from_cluster") %>% as.matrix -> ccm  # cluster connection matrix
+ccm + t(ccm) -> ccm   # both directions
+
+
+# Fraction of within-group edges
+sum(diag(ccm)) / sum(ccm)
+
+# Expected fraction after permuting edge conenctions
+colSums(ccm) -> vertex_sums
+sum( vertex_sums^2 ) / sum(vertex_sums)^2
+
+# Modularity
+sum(diag(ccm)) / sum(ccm) - sum( colSums(ccm)^2 ) / sum( colSums(ccm) )^2
+
+
+sum(diag(ccm)) / sum(ccm) - sum( ( vertex_sums/sum(vertex_sums) )^2 )
+
+# Compare to igraph's calculation
+modularity( nn_graph, membership(cm) )
diff --git a/smooth_comp.R b/smooth_comp.R
@@ -0,0 +1,63 @@
+# Comparison of four types of smoothing splines
+
+# Make example data
+n <- 15
+x <- runif( n, 0, 10 )
+y <- sin(x) + rnorm( n, sd=.1 )
+
+# Make grid to plot smooth curves
+xg <- seq( 0, 10, length.out=1000 )
+
+# Determine knot positions:
+#   The knots should have the same range as the data x values
+#   and the number of data points between knots should be similar
+nknots <- 10
+stopifnot( knots < n )
+f <- approxfun( 1:length(x), sort(x) )
+knots <- f( seq( 1, length(x), length.out=nknots ) )  
+
+# Make plot to justify knot positions
+plot( 1:length(x), sort(x) )
+lines( seq( 0,length(x), length.out=1000), f(seq( 0,length(x), length.out=1000)), type="l" )
+abline( v = seq( 1, length(x), length.out=nknots ), col="gray" )
+abline( h = knots, col="gray" )
+
+# Plot data and knot positions
+plot( x, y, xlim=c(-1, 11), ylim=c(-1.3,1.3) )
+abline( v = knots, col="gray" )
+abline( v = range(x) )
+
+# With bs
+mysp <- function(x) bs( x, intercept=TRUE, 
+    Boundary.knots=knots[c(1,length(knots))], knots=knots[2:(length(knots)-1)] )
+fit <- lm.fit( mysp(x), y )
+yg <- mysp(xg) %*% fit$coefficients
+lines( xg, yg, col="pink" )
+
+# With ns (like bs, but zero curvature at boundaries)
+mysp <- function(x) ns( x, intercept=TRUE, 
+    Boundary.knots=knots[c(1,length(knots))], knots=knots[2:(length(knots)-1)] )
+fit <- lm.fit( mysp(x), y )
+yg <- mysp(xg) %*% fit$coefficients
+lines( xg, yg, col="darkgreen" )
+
+# With smooth.spline (like bs, but with curvature penalty)
+fit <- smooth.spline( x, y, df=df )
+yg <- predict( fit, xg )$y
+lines( xg, yg, col="blue" )
+
+# With P-splines
+m <- 50
+mysp <- function(xx) bs( xx, intercept=TRUE, 
+    Boundary.knots=range(x), knots=seq(min(x),max(x),length.out=m-4) )
+
+pty <- diag(2,m)
+pty[ row(pty) == col(pty)+1 ] <- -1
+pty[ row(pty) == col(pty)-1 ] <- -1
+pty[1,1] <- 1
+pty[m,m] <- 1
+
+lambda_scale <- sum(diag( t(mysp(x)) %*% mysp(x) )) / sum(diag(pty))
+lambda <- 3 * lambda_scale   # <- needs to be optimized
+beta <- solve( t(mysp(x)) %*% mysp(x) + lambda * pty , t(mysp(x)) %*% y )
+lines( xg, mysp(xg) %*% beta, col="brown" )