-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathPseudodepth_scores.R
81 lines (70 loc) · 3.09 KB
/
Pseudodepth_scores.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
## Bulk pseudodepth approach
pseudobulk <- pblapply(unique(sce$Short_Sample), function(x){
message(x)
rs <- Matrix::rowSums(counts(sce)[, sce$Short_Sample == x])
return(rs)
})
pseudobulk <- do.call(cbind, pseudobulk)
colnames(pseudobulk) <- unique(sce$Short_Sample)
#then calculate TPM
TPM <- calculateTPM(pseudobulk)
pseudobulk <- log2(TPM + 1)
#read in the TCGA data
#now get the bulk TCGA data
TCGA <- readRDS(file.path(dirname(getwd()), "Kidney RNAseq datasets", "Data/TCGA kidney normal_processed.RDS"))
#subset this to samples which have a spatial label
TCGA <- TCGA[, colData(TCGA)$cluster %in% c("Cortex enriched", "Corticomedulla enriched", "Medulla enriched")]
require(gam)
t <- colData(TCGA)$pt
# for time, only look at the most variable genes
Y <- assay(TCGA, "exprs")[metadata(TCGA)$hvg, ]
#select genes that have high p values testing against pseudotime
gam.pval <- apply(Y,1,function(z){
d <- data.frame(z=z, t=t)
tmp <- gam(z ~ lo(t), data=d)
p <- summary(tmp)[4][[1]][1,5]
p
})
topgenes <- names(sort(gam.pval, decreasing = FALSE))[1:1000]
#RF predictions
set.seed(100)
common_genes <- intersect(rownames(pseudobulk), topgenes)
cluster <- factor(colData(TCGA)$cluster)
data_train <- data.frame(t(assay(TCGA, "exprs")[common_genes, ]), cluster = cluster)
#set up test data
data_test <- data.frame(t(pseudobulk[common_genes, ]))
cs <- colSums(data_test) == 0
cleandat <- sva::ComBat(t(as.matrix(data_test[ !cs])), droplevels(annot$Project))
data_test <- t(cleandat)
#random forests model training
rf_model <- ranger::ranger(formula = cluster~., data =data_train[, !cs] , probability = TRUE)
predictions <- predict(rf_model, data =data_test)
pt <- get_pseudotime(pc$x, 1)
pseudospace_score <- pt$Pseudotime
names(pseudospace_score) <- colnames(pseudobulk)
plot_pseudotime(pt)
#categorical predictions
common_genes <- intersect(rownames(pseudobulk), topgenes)
cluster <- factor(colData(TCGA)$cluster)
data_train <- data.frame(t(assay(TCGA, "exprs")[common_genes, ]), cluster = cluster)
data_test <- cleandat
rf_model <- ranger::ranger(formula = cluster~., data =data_train[, !cs] , probability = FALSE)
predictions <- predict(rf_model, data = t(data_test))
pseudospace_category <- predictions$predictions
names(pseudospace_category) <- colnames(pseudobulk)
#### Bulk GSVA scores
eset <- readRDS( file.path(dirname(getwd()), "Kidney RNAseq datasets", "Data/Lindgren processed data.RDS"))
pt <- readRDS(file.path(dirname(getwd()), "Kidney RNAseq datasets", "Data/Lindgren pt.RDS"))
pt <- pt$Pseudotime
library(GSVA)
gsva_in <- Y <- exprs(eset)
rownames(gsva_in) <- fData(eset)$Symbol
nephron_markers <- readRDS("Data/nephron_markers.RDS")
markers_use <- pblapply(nephron_markers, function(m){m[1:50, "Gene"]})
gsva_out <- as.matrix(gsva(gsva_in, markers_use, method = "gsva", parallel.sz=1))
gsva_plot <- pblapply(c(rownames(gsva_out)), function(g){
ggplot(data.frame("Pseudodepth" = pt,
"Score" = gsva_out[g, ]),
aes(x = Pseudodepth, y = Score)) + geom_point(size = 1, color = "grey") + geom_smooth(color = "black", fill = "grey") + ggtitle(g)
})
plot_grid(plotlist = gsva_plot, ncol = 4)