-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path.Rhistory
512 lines (512 loc) · 24.1 KB
/
.Rhistory
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
query_cluster <- split(query@meta.data, query@meta.data[, x1])
query_means <- data.frame()
for (i in 1:length(query_cluster)) {
cluster_name <- names(query_cluster)[i]
cluster_cell <- data.frame(cells=rownames(query_cluster[[i]]))
cluster_embedding <- dplyr::left_join(cluster_cell, embedding,by="cells")
rownames(cluster_embedding) <- cluster_embedding[, 1]
cluster_embedding <- cluster_embedding[, -1]
cluster_embedding <- as.data.frame(lapply(cluster_embedding, as.numeric))
cluster_vec <- colMeans(cluster_embedding)
cluster_vec <- data.frame(cluster_vec)
colnames(cluster_vec)[1] <- cluster_name
if (i==1) {
query_means <- cluster_vec
}
else
query_means <- cbind(query_means, cluster_vec)
}
query_embedding_list[[x1-3]] <- query_means
}
#correlation of CCA cluster representation
CCA_PCC_list <- list()
for (x1 in 1:length(reference_embedding_list)) {
PCC_list <- list()
for (y1 in 1:length(query_embedding_list)) {
CCA_PCC <- cor(query_embedding_list[[y1]], reference_embedding_list[[x1]])
PCC_list[[y1]] <- CCA_PCC
}
CCA_PCC_list[[x1]] <- PCC_list
}
#Epsilon
reference_all_means <- embedding[1:ncol(reference_matrix), ]
reference_all_means <- reference_all_means[, -1]
reference_all_means <- as.data.frame(lapply(reference_all_means, as.numeric))
reference_all_means_vec <- colMeans(reference_all_means)
query_all_means <- embedding[(ncol(reference_matrix)+1):nrow(embedding), ]
query_all_means <- query_all_means[, -1]
query_all_means <- as.data.frame(lapply(query_all_means, as.numeric))
query_all_means_vec <- colMeans(query_all_means)
Epsilon <- cor(reference_all_means_vec, query_all_means_vec)
#match
strongest_match_avg <- data.frame()
for (x1 in 1:length(CCA_PCC_list)) {
for (y1 in 1:length(CCA_PCC_list[[x1]])) {
match_matrix <- Match(data = CCA_PCC_list[[x1]][[y1]], top = 1, threshold = Epsilon)
strongest_match_PCC <- CCA_PCC_list[[x1]][[y1]][match_matrix == 2]
strongest_match_avg[x1,y1] <- mean(strongest_match_PCC)
}
}
#output
name_row <- data.frame(res=seq(from = start.resolution, to = (ncol(reference@meta.data)-3)*step, by = step))
name_row$name <- "D1_"
name_row$res <- stringr::str_c(name_row$name, name_row$res)
rownames(strongest_match_avg) <- name_row$res
name_col <- data.frame(res=seq(from = start.resolution, to = (ncol(query@meta.data)-3)*step, by = step))
name_col$name <- "D2_"
name_col$res <- stringr::str_c(name_col$name, name_col$res)
colnames(strongest_match_avg) <- name_col$res
a <- as.matrix(strongest_match_avg)
b <- which(a==max(a),arr.ind=T)
b <- as.data.frame(b)
c <- nrow(b)
D1_res <- start.resolution + step*(b[c, 1]-1)
D2_res <- start.resolution + step*(b[c, 2]-1)
ClusterMatch_resolution <- list(Resolution_cor = strongest_match_avg, D1_ref_res = D1_res, D2_que_res =D2_res)
return(ClusterMatch_resolution)
}
#' Matching clusters of two datasets
#' @param data the correlation between clusters of two datasets
#' @param top the maximum number of matches for each cluster
#' @param threshold the minimum threshold for matching
#' @return matching matrix
Match<-function(data,top=3,threshold=1.96){
match_matrix<-data
for (i in 1:nrow(data)) {
for (j in 1:ncol(data)) {
top_row<-data[i,order(data[i,],decreasing = TRUE)[1:top]]
top_col<-data[order(data[,j],decreasing = TRUE)[1:top],j]
if (data[i,j] %in% top_row & data[i,j] %in% top_col & data[i,j]>threshold) {
match_matrix[i,j]<-1
}
else
match_matrix[i,j]<-0
}
}
for (i in 1:nrow(data)) {
for (j in 1:ncol(data)) {
top_row<-data[i,order(data[i,],decreasing = TRUE)[1]]
top_col<-data[order(data[,j],decreasing = TRUE)[1],j]
if (data[i,j] %in% top_row & data[i,j] %in% top_col & data[i,j]>threshold) {
match_matrix[i,j]<-2
}
}
}
return(match_matrix)
}
#' Calculating the matching matrix between two datasets at the optimal resolutions
#' @param reference_matrix the gene expression matrix of dataset 1
#' @param query_matrix the gene expression matrix of dataset 2
#' @param ref.res the optimal clustering resolution of dataset 1
#' @param que.res the optimal clustering resolution of dataset 2
#' @param ref.norm whether dataset 1 is normalized
#' @param que.norm whether dataset 2 is normalized
#' @param dim.pca number of PCs to calculate (50 by default)
#' @param dim.cca number of canonical vectors to calculate (30 by default)
#' @param min.cluster.cells the minimum number of cells in a cluster
#' @param random_PCC the minimum value of random noise
#' @param min.pct Seurat FindAllMarkers min.pct
#' @param logfc.threshold Seurat FindAllMarkers logfc.threshold
#' @return a list containing the following elements:
#' - D1_reference: SeuratObject of dataset 1
#' - D2_query: SeuratObject of dataset 2
#' - Matching_matrix: matching matrix, 2 is mutually most correlated clusters, 1 is mutually correlated clusters
#' - strongest_matching_marker_top10: top 10 common marker genes of mutually most correlated clusters
#' - com_marker_list: common marker genes of mutually most correlated clusters
#' - Epsilon: random noise
#' - CCA_cor: PCC between clusters based on CCA representation
#' - Marker_cor:Jaccard coefficient between clusters based on CCA representation
#' - ClusterMatch_cor = ClusterMatch correlation between clusters
#' @import Seurat CreateSeuratObject NormalizeData ScaleData RunPCA FindNeighbors FindClusters RunCCA L2Dim SplitObject FindAllMarkers Idents
#' @import dplyr select left_join
#' @import stringr str_c
#' @export
ClusterMatch_matching <- function(reference_matrix, query_matrix, ref.res, que.res, ref.norm = TRUE, que.norm = TRUE,
dim.pca = 50, dim.cca = 30, min.cluster.cells = 20, random_PCC = 0, min.pct = 0.25, logfc.threshold = 0.25){
#object and clustering
reference <- Seurat::CreateSeuratObject(counts = reference_matrix)
if (ref.norm == TRUE) {
reference <- Seurat::NormalizeData(reference)
}
reference <- Seurat::FindVariableFeatures(reference, selection.method = "vst", nfeatures = 2000, verbose = FALSE)
reference <- Seurat::ScaleData(reference, verbose = FALSE)
reference <- Seurat::RunPCA(reference, features = VariableFeatures(object = reference), verbose = FALSE)
reference <- Seurat::FindNeighbors(reference, dims = 1:dim.pca,verbose = FALSE)
reference <- Seurat::FindClusters(reference, resolution = ref.res)
reference@meta.data$name <- "D1_"
reference$seurat_clusters <- stringr::str_c(reference$name, reference$seurat_clusters)
reference@meta.data <- dplyr::select(reference@meta.data, -c("name"))
query <- Seurat::CreateSeuratObject(counts = query_matrix)
if (que.norm == TRUE) {
query <- Seurat::NormalizeData(query)
}
query <- Seurat::FindVariableFeatures(query, selection.method = "vst", nfeatures = 2000, verbose = FALSE)
query <- Seurat::ScaleData(query, verbose = FALSE)
query <- Seurat::RunPCA(query, features = VariableFeatures(object = query), verbose = FALSE)
query <- Seurat::FindNeighbors(query, dims = 1:dim.pca, verbose = FALSE)
query <- Seurat::FindClusters(query, resolution = que.res)
query@meta.data$name <- "D2_"
query$seurat_clusters <- stringr::str_c(query$name,query$seurat_clusters)
query@meta.data <- dplyr::select(query@meta.data, -c("name"))
#CCA embedding
CCA <- Seurat::RunCCA(object1 = reference, object2 = query, num.cc = dim.cca)
L2CCA <- Seurat::L2Dim(CCA, reduction = "cca")
embedding <- L2CCA@reductions[["cca.l2"]]@cell.embeddings
embedding <- data.frame(cbind(cells=rownames(embedding), embedding))
#CCA cluster representation
#reference
reference_cluster <- Seurat::SplitObject(reference, split.by = "seurat_clusters")
reference_means <- data.frame()
reference_cell_cluster_embedding <- list()
for (i in 1:length(reference_cluster)) {
cluster_name <- names(reference_cluster)[i]
cluster_cell <- data.frame(cells = rownames(reference_cluster[[i]]@meta.data))
cluster_embedding <- dplyr::left_join(cluster_cell, embedding, by="cells")
rownames(cluster_embedding) <- cluster_embedding[, 1]
cluster_embedding <- cluster_embedding[, -1]
reference_cell_cluster_embedding[[i]] <- as.data.frame(lapply(cluster_embedding, as.numeric))
rownames(reference_cell_cluster_embedding[[i]]) <- cluster_cell$cells
reference_cell_cluster_embedding[[i]] <- t(reference_cell_cluster_embedding[[i]])
names(reference_cell_cluster_embedding)[i] <- cluster_name
cluster_embedding <- as.data.frame(lapply(cluster_embedding, as.numeric))
cluster_vec <- colMeans(cluster_embedding)
cluster_vec <- data.frame(cluster_vec)
colnames(cluster_vec)[1] <- cluster_name
if (i==1) {
reference_means <- cluster_vec
}
else
reference_means <- cbind(reference_means, cluster_vec)
}
#query
query_cluster <- Seurat::SplitObject(query, split.by = "seurat_clusters")
query_means <- data.frame()
query_cell_cluster_embedding <- list()
for (i in 1:length(query_cluster)) {
cluster_name <- names(query_cluster)[i]
cluster_cell <- data.frame(cells = rownames(query_cluster[[i]]@meta.data))
cluster_embedding <- dplyr::left_join(cluster_cell, embedding, by="cells")
rownames(cluster_embedding) <- cluster_embedding[, 1]
cluster_embedding <- cluster_embedding[, -1]
query_cell_cluster_embedding[[i]] <- as.data.frame(lapply(cluster_embedding, as.numeric))
rownames(query_cell_cluster_embedding[[i]]) <- cluster_cell$cells
query_cell_cluster_embedding[[i]] <- t(query_cell_cluster_embedding[[i]])
names(query_cell_cluster_embedding)[i] <- cluster_name
cluster_embedding <- as.data.frame(lapply(cluster_embedding, as.numeric))
cluster_vec <- colMeans(cluster_embedding)
cluster_vec <- data.frame(cluster_vec)
colnames(cluster_vec)[1] <- cluster_name
if (i==1) {
query_means<-cluster_vec
}
else
query_means<-cbind(query_means, cluster_vec)
}
#marker cluster representation
#reference
Seurat::Idents(object = reference) <- reference@meta.data$seurat_clusters
reference_marker <- Seurat::FindAllMarkers(reference, only.pos = TRUE, min.pct = min.pct, logfc.threshold = logfc.threshold)
reference_marker_cluster <- split(reference_marker, reference_marker$cluster)
#query
Seurat::Idents(object = query) <- query@meta.data$seurat_clusters
query_marker <- Seurat::FindAllMarkers(query, only.pos = TRUE, min.pct = min.pct, logfc.threshold = logfc.threshold)
query_marker_cluster <- split(query_marker,query_marker$cluster)
#correlation of clusters
#CCA
CCA_PCC <- cor(query_means,reference_means)
CCA_PCC[CCA_PCC<0] <- 0
#marker
jaccard <- function(a, b) {
intersection = length(intersect(a, b))
union = length(a) + length(b) - intersection
return (intersection/union)
}
marker_jaccard<-data.frame()
for (i in 1:length(query_marker_cluster)) {
for (j in 1:length(reference_marker_cluster)) {
marker_jaccard[i,j] <- jaccard(query_marker_cluster[[i]]$gene,reference_marker_cluster[[j]]$gene)
}
}
rownames(marker_jaccard)<-names(query_marker_cluster)
colnames(marker_jaccard)<-names(reference_marker_cluster)
marker_jaccard[marker_jaccard=='NaN']<-0
#Epsilon and beta
reference_all_means <- embedding[1:ncol(reference_matrix), ]
reference_all_means <- reference_all_means[, -1]
reference_all_means <- as.data.frame(lapply(reference_all_means, as.numeric))
reference_all_means_vec <- colMeans(reference_all_means)
query_all_means <- embedding[(ncol(reference_matrix)+1):nrow(embedding), ]
query_all_means <- query_all_means[, -1]
query_all_means <- as.data.frame(lapply(query_all_means, as.numeric))
query_all_means_vec <- colMeans(query_all_means)
Epsilon <- cor(reference_all_means_vec, query_all_means_vec)
beta <- max(CCA_PCC)/max(marker_jaccard)
Epsilon <- max(Epsilon, random_PCC)
CCA_marker_similarity <- CCA_PCC + beta*marker_jaccard
CCA_marker_similarity[CCA_marker_similarity < Epsilon] <- 0
match_matrix <- Match(data = CCA_marker_similarity, top = 3, threshold = Epsilon)
CCA_PCC_match_matrix<-Match(data = CCA_PCC,top = 1,threshold = Epsilon)
#strongest matching cluster marker
overlap<-data.frame()
flag=1
for (i in 1:nrow(match_matrix)) {
for (j in 1:ncol(match_matrix)) {
if (match_matrix[i,j]==2) {
overlap[flag,1] <- colnames(match_matrix)[j]
overlap[flag,2] <- rownames(match_matrix)[i]
flag<-flag+1
}
}
}
overlap$name <- c('/')
overlap$name <- stringr::str_c(overlap$V1, overlap$name)
overlap$name <- stringr::str_c(overlap$name, overlap$V2)
reference_marker_strongest_matching_list <- subset(reference_marker_cluster, names(reference_marker_cluster) %in% overlap$V1)
reference_marker_strongest_matching_list <- reference_marker_strongest_matching_list[overlap$V1]
query_marker_strongest_matching_list <- subset(query_marker_cluster,names(query_marker_cluster) %in% overlap$V2)
query_marker_strongest_matching_list <- query_marker_strongest_matching_list[overlap$V2]
com_marker <- data.frame()
com_marker_list <- list()
com_marker_top_10 <- data.frame()
for (i in 1:nrow(overlap)) {
com_marker_list[[i]] <- merge(reference_marker_strongest_matching_list[[i]], query_marker_strongest_matching_list[[i]], by = "gene" )
com_marker_list[[i]]$avg_log2FC <- (com_marker_list[[i]]$avg_log2FC.x+com_marker_list[[i]]$avg_log2FC.y)/2
com_marker_list[[i]] <- com_marker_list[[i]][order(com_marker_list[[i]]$avg_log2FC, decreasing = TRUE), ]
if (i==1) {
com_marker_top_10 <- data.frame(com_marker_list[[i]]$gene[1:10])
}
else
com_marker_top_10 <- cbind(com_marker_top_10, data.frame(com_marker_list[[i]]$gene[1:10]))
}
names(com_marker_list) <- overlap$name
colnames(com_marker_top_10) <- overlap$name
#output
ClusterMatch_matching <- list(D1_reference = reference, D2_query = query, Matching_matrix = match_matrix,
strongest_matching_marker_top10 = com_marker_top_10, com_marker_list = com_marker_list, Epsilon = Epsilon, CCA_cor = CCA_PCC, Marker_cor = marker_jaccard, ClusterMatch_cor = CCA_marker_similarity )
}
#' Matching clusters and cell types
#' @param data the correlation between clusters and cell types
#' @param threshold the minimum threshold for matching
#' @return the matching matrix of clusters and cell types
#' @import matrixStats rowMaxs
Match_annotation<-function(data,threshold=0){
data<-as.matrix(data)
row_max<-matrixStats::rowMaxs(data)
match_matrix<-data
for (i in 1:nrow(data)) {
for (j in 1:ncol(data)) {
if (data[i,j] == row_max[i] & data[i,j]>threshold) {
match_matrix[i,j]<-1
}
else
match_matrix[i,j]<-0
}
}
return(match_matrix)
}
#' Annotate the query data based on the reference data labels
#' @param reference_matrix the gene expression matrix of reference
#' @param query_matrix the gene expression matrix of query
#' @param reference_label reference data labels
#' @param que.res the query clustering resolution, default 2
#' @param ref.norm whether reference is normalized
#' @param que.norm whether query is normalized
#' @param dim.pca number of PCs to calculate (50 by default)
#' @param dim.cca number of canonical vectors to calculate (30 by default)
#' @param min.cluster.cells the minimum number of cells in a cluster
#' @param random_PCC the minimum value of random noise
#' @return a list containing the following elements:
#' - D1_reference: SeuratObject of reference
#' - D2_query: SeuratObject of query
#' - Matching_matrix: Matching matrix of cell types and clusters
#' @import Seurat CreateSeuratObject NormalizeData ScaleData RunPCA FindNeighbors FindClusters RunCCA L2Dim SplitObject FindAllMarkers Idents
#' @import dplyr select left_join
#' @import stringr str_c
#' @import reshape2 melt
#' @export
ClusterMatch_annotation <- function(reference_matrix, query_matrix, reference_label, que.res=2, ref.norm = TRUE, que.norm = TRUE,
dim.pca = 50, dim.cca = 30, min.cluster.cells = 20, random_PCC = 0){
#object and clustering
reference <- Seurat::CreateSeuratObject(counts = reference_matrix, meta.data = reference_label)
if (ref.norm == TRUE) {
reference <- Seurat::NormalizeData(reference)
}
reference <- Seurat::FindVariableFeatures(reference, selection.method = "vst", nfeatures = 2000, verbose = FALSE)
reference <- Seurat::ScaleData(reference, verbose = FALSE)
reference <- Seurat::RunPCA(reference, features = VariableFeatures(object = reference), verbose = FALSE)
query <- Seurat::CreateSeuratObject(counts = query_matrix)
if (que.norm == TRUE) {
query <- Seurat::NormalizeData(query)
}
query <- Seurat::FindVariableFeatures(query, selection.method = "vst", nfeatures = 2000, verbose = FALSE)
query <- Seurat::ScaleData(query, verbose = FALSE)
query <- Seurat::RunPCA(query, features = VariableFeatures(object = query), verbose = FALSE)
query <- Seurat::FindNeighbors(query, dims = 1:dim.pca, verbose = FALSE)
query <- Seurat::FindClusters(query, resolution = que.res)
query@meta.data$name <- "D2_"
query$seurat_clusters <- stringr::str_c(query$name,query$seurat_clusters)
query@meta.data <- dplyr::select(query@meta.data, -c("name"))
query_cells <- rownames(query@meta.data)
#CCA embedding
CCA <- Seurat::RunCCA(object1 = reference, object2 = query, num.cc = dim.cca)
L2CCA <- Seurat::L2Dim(CCA, reduction = "cca")
embedding <- L2CCA@reductions[["cca.l2"]]@cell.embeddings
embedding <- data.frame(cbind(cells=rownames(embedding), embedding))
#CCA cluster representation
#reference
reference_cluster <- Seurat::SplitObject(reference, split.by = "celltype")
reference_means <- data.frame()
reference_cell_cluster_embedding <- list()
for (i in 1:length(reference_cluster)) {
cluster_name <- names(reference_cluster)[i]
cluster_cell <- data.frame(cells = rownames(reference_cluster[[i]]@meta.data))
cluster_embedding <- dplyr::left_join(cluster_cell, embedding, by="cells")
rownames(cluster_embedding) <- cluster_embedding[, 1]
cluster_embedding <- cluster_embedding[, -1]
reference_cell_cluster_embedding[[i]] <- as.data.frame(lapply(cluster_embedding, as.numeric))
rownames(reference_cell_cluster_embedding[[i]]) <- cluster_cell$cells
reference_cell_cluster_embedding[[i]] <- t(reference_cell_cluster_embedding[[i]])
names(reference_cell_cluster_embedding)[i] <- cluster_name
cluster_embedding <- as.data.frame(lapply(cluster_embedding, as.numeric))
cluster_vec <- colMeans(cluster_embedding)
cluster_vec <- data.frame(cluster_vec)
colnames(cluster_vec)[1] <- cluster_name
if (i==1) {
reference_means <- cluster_vec
}
else
reference_means <- cbind(reference_means, cluster_vec)
}
#query
query_cluster <- Seurat::SplitObject(query, split.by = "seurat_clusters")
query_means <- data.frame()
query_cell_cluster_embedding <- list()
for (i in 1:length(query_cluster)) {
cluster_name <- names(query_cluster)[i]
cluster_cell <- data.frame(cells = rownames(query_cluster[[i]]@meta.data))
cluster_embedding <- dplyr::left_join(cluster_cell, embedding, by="cells")
rownames(cluster_embedding) <- cluster_embedding[, 1]
cluster_embedding <- cluster_embedding[, -1]
query_cell_cluster_embedding[[i]] <- as.data.frame(lapply(cluster_embedding, as.numeric))
rownames(query_cell_cluster_embedding[[i]]) <- cluster_cell$cells
query_cell_cluster_embedding[[i]] <- t(query_cell_cluster_embedding[[i]])
names(query_cell_cluster_embedding)[i] <- cluster_name
cluster_embedding <- as.data.frame(lapply(cluster_embedding, as.numeric))
cluster_vec <- colMeans(cluster_embedding)
cluster_vec <- data.frame(cluster_vec)
colnames(cluster_vec)[1] <- cluster_name
if (i==1) {
query_means<-cluster_vec
}
else
query_means<-cbind(query_means, cluster_vec)
}
#marker cluster representation
#reference
Seurat::Idents(object = reference) <- reference@meta.data$celltype
reference_marker <- Seurat::FindAllMarkers(reference, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
reference_marker_cluster <- split(reference_marker, reference_marker$cluster)
#query
Seurat::Idents(object = query) <- query@meta.data$seurat_clusters
query_marker <- Seurat::FindAllMarkers(query, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
query_marker_cluster <- split(query_marker,query_marker$cluster)
#correlation of clusters
#CCA
CCA_PCC <- cor(query_means,reference_means)
CCA_PCC[CCA_PCC<0] <- 0
#marker
jaccard <- function(a, b) {
intersection = length(intersect(a, b))
union = length(a) + length(b) - intersection
return (intersection/union)
}
marker_jaccard<-data.frame()
for (i in 1:length(query_marker_cluster)) {
for (j in 1:length(reference_marker_cluster)) {
marker_jaccard[i,j] <- jaccard(query_marker_cluster[[i]]$gene,reference_marker_cluster[[j]]$gene)
}
}
rownames(marker_jaccard)<-names(query_marker_cluster)
colnames(marker_jaccard)<-names(reference_marker_cluster)
marker_jaccard[marker_jaccard=='NaN']<-0
#Epsilon and beta
reference_all_means <- embedding[1:ncol(reference_matrix), ]
reference_all_means <- reference_all_means[, -1]
reference_all_means <- as.data.frame(lapply(reference_all_means, as.numeric))
reference_all_means_vec <- colMeans(reference_all_means)
query_all_means <- embedding[(ncol(reference_matrix)+1):nrow(embedding), ]
query_all_means <- query_all_means[, -1]
query_all_means <- as.data.frame(lapply(query_all_means, as.numeric))
query_all_means_vec <- colMeans(query_all_means)
Epsilon <- cor(reference_all_means_vec, query_all_means_vec)
beta <- max(CCA_PCC)/max(marker_jaccard)
Epsilon <- max(Epsilon, random_PCC)
CCA_marker_similarity <- CCA_PCC + beta*marker_jaccard
CCA_marker_similarity[CCA_marker_similarity < Epsilon] <- 0
match_predict_matrix<-Match_annotation(data = CCA_marker_similarity)
match_predict_matrix_melt<-reshape2::melt(match_predict_matrix)
match_predict_matrix_melt<-subset(match_predict_matrix_melt,match_predict_matrix_melt$value>0)
match_predict_matrix_melt<-match_predict_matrix_melt[,1:2]
colnames(match_predict_matrix_melt)<-c("seurat_clusters","predicted_celltypes")
query@meta.data<-dplyr::left_join(query@meta.data,match_predict_matrix_melt)
query@meta.data$predicted_celltypes <- as.character(query@meta.data$predicted_celltypes)
query@meta.data[is.na(query@meta.data)]<-'unknown'
rownames(query@meta.data)<-query_cells
#output
ClusterMatch_transfer <- list(D1_reference = reference, D2_query = query, Matching_matrix = match_predict_matrix)
}
#2.1 Load data
dendritic_batch1 <- read.csv("/Users/bingbao/Desktop/paper_R_package /ClusterMatch/data/dendritic/batch1.csv", row.names = 1)
dendritic_batch2 <- read.csv("/Users/bingbao/Desktop/paper_R_package /ClusterMatch/data/dendritic/batch2.csv", row.names = 1)
dendritic_celltype <- read.csv("/Users/bingbao/Desktop/paper_R_package /ClusterMatch/data/dendritic/celltype.csv")
human_dLGN <- read.csv("/Users/bingbao/Desktop/paper_R_package /ClusterMatch/data/dLGN/human_dLGN.csv", row.names = 1)
macaque_dLGN <- read.csv("/Users/bingbao/Desktop/paper_R_package /ClusterMatch/data/dLGN/macaque_dLGN.csv", row.names = 1)
human_celltype <- read.csv("/Users/bingbao/Desktop/paper_R_package /ClusterMatch/data/dLGN/human_celltype.csv")
rownames(human_celltype) <- human_celltype$cells
macaque_celltype <- read.csv("/Users/bingbao/Desktop/paper_R_package /ClusterMatch/data/dLGN/macaque_celltype.csv")
rownames(macaque_celltype) <- macaque_celltype$cells
#2.2 Find optimal resolutions
dendritic_res <- ClusterMatch_resolution(dendritic_batch1, dendritic_batch2, ref.norm = FALSE, que.norm = FALSE)
#2.3 Match scRNA-seq data at the cluster level
dendritic_matching <- ClusterMatch_matching(dendritic_batch1, dendritic_batch2, ref.res = dendritic_res$D1_ref_res, que.res = dendritic_res$D2_que_res, ref.norm = FALSE, que.norm = FALSE, random_PCC = 1.3)
#2.4 Integrate scRNA-seq datasets
dendritic_integration <- ClusterMatch_integration(dendritic_batch1, dendritic_batch2, ref.res = dendritic_res$D1_ref_res,
que.res = dendritic_res$D2_que_res, ref.norm = FALSE, que.norm = FALSE, random_PCC = 1.3, distance_diff = 3, distance_same = 1)
umap_df <- ClusterMatch_UMAP(embedding = dendritic_integration$cell_embedding, cell = dendritic_celltype)
batch_colour=c("#E64540","#3F81BB")
celltype_colour=c("#E64136","#5F78A3","#EDA6C3","#96C561")
library(ggplot2)
ggplot(umap_df,aes(X1,X2,color=batch)) +
scale_color_manual(values = batch_colour)+
geom_point() + theme_bw() +
theme(panel.grid=element_blank(),plot.title = element_text(hjust = 0.5),text = element_text(size = 20)) +
labs(x="UMAP_1",y="UMAP_2",
title = "ClusterMatch")
ggplot(umap_df,aes(X1,X2,color=label)) +
scale_color_manual(values = celltype_colour)+
geom_point() + theme_bw() +
theme(panel.grid=element_blank(),plot.title = element_text(hjust = 0.5),text = element_text(size = 20)) +
labs(x="UMAP_1",y="UMAP_2",
title = "ClusterMatch")
#2.5 Annotate the query data based on the reference data labels
dLGN_annotation <- ClusterMatch_annotation(human_dLGN, macaque_dLGN, human_celltype, que.res=2, ref.norm = FALSE, que.norm = FALSE)
table(dLGN_annotation$D2_query$predicted_celltypes==macaque_celltype$celltype)
#FALSE TRUE
#31 1693
devtools::document()
library(ClusterMatch)
library(usethis)
use_package(package = "Seurat")
use_package(package = "Seurat", ">= 3.0.0")
use_package(package = "Seurat")
use_package(package = "dplyr")
use_package(package = "stringr")
use_package(package = "umap")
use_package(package = "reshape2")
use_package(package = "matrixStats")
?dplyr
?matrixStats
?reshape2
?reshape2
?stringr
?umap