-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMCubeUtils.R
321 lines (305 loc) · 11.6 KB
/
MCubeUtils.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
#' Filter out lowly expressed genes in spatial transcriptomics data
#'
#' @description
#' Fuction for data preprocessing.
#' Credit goes to the R package `spacexr` (\url{https://github.com/dmcable/spacexr}).
#'
#' @param counts A matrix. Each row represents a spot and each column represents a gene.
#' @param library_sizes A numeric vector containing library sizes of all spots.
#' @param threshold A numeric value. Genes with average relative expression below this threshold are filtered out.
#' @param batch_size An integer.
#' The number of genes to process in each batch.
#'
#' @return A character vector containing the names of genes that pass the threshold.
#'
#' @export
mcubeFilterGenes <- function(
counts, library_sizes = NULL,
threshold = 5e-5, batch_size = 1000) {
message(
"mcubeFilterGenes: Filter genes based on relative expression",
" with threshold = ", threshold, "."
)
if (is.null(library_sizes)) {
library_sizes <- rowSums(as.matrix(counts))
}
n_genes <- ncol(counts)
gene_means <- numeric(n_genes)
names(gene_means) <- colnames(counts)
n_batches <- ceiling(n_genes / batch_size)
for (j in 1:n_batches) {
if (j < n_batches) {
index_range <- (1:batch_size) + (j - 1) * batch_size
} else {
index_range <- (1 + (n_batches - 1) * batch_size):n_genes
}
norm_counts <- as.matrix(counts[, index_range, drop = FALSE]) / library_sizes
gene_means[index_range] <- colMeans(norm_counts)
}
gene_list_total <- names(which(gene_means > threshold))
mito_genes_idx <- c(
grep("^MT-", gene_list_total),
grep("^mt-", gene_list_total)
)
if (length(mito_genes_idx) > 0) {
gene_list_total <- gene_list_total[-mito_genes_idx]
}
return(gene_list_total)
}
#' Filter out lowly expressed genes for a specific cell type
#'
#' @description
#' Fuction for data preprocessing.
#' Credit goes to the R package `spacexr` (\url{https://github.com/dmcable/spacexr}).
#'
#' @importFrom stats median
#'
#' @param celltype A character specifying the cell type.
#' @param celltype_all A character vector containing all cell types.
#' @param gene_test A character vector specifying the genes to test.
#' @param library_sizes A numeric vector containing library sizes of all spots.
#' @param proportions A numeric matrix containing the proportion of each cell type at each spot.
#' @param reference A numeric matrix containing the reference expression of genes for each cell type.
#' @param reference_threshold A numeric value. Genes with relative expression below this threshold will be filtered out.
#' @param platform_effects A numeric vector containing the platform effects for each gene.
#'
#' @return A character vector containing the names of genes that pass the threshold.
#'
#' @export
mcubeFilterGenesCellType <- function(
celltype, celltype_all, gene_test,
library_sizes, proportions, reference,
reference_threshold = 0.5, platform_effects) {
C <- 15
N_cells <- colSums(proportions)[celltype]
library_sizes_list <- library_sizes[
which(proportions[, celltype] >= .99)
]
if (length(library_sizes_list) < 10) {
library_sizes_list <- library_sizes[
which(proportions[, celltype] >= .80)
]
}
if (length(library_sizes_list) < 10) {
library_sizes_list <- library_sizes[
which(proportions[, celltype] >= .50)
]
}
if (length(library_sizes_list) < 10) {
library_sizes_list <- library_sizes[
which(proportions[, celltype] >= .01)
]
}
library_sizes_median <- median(library_sizes_list)
expr_thresh <- C / (N_cells * library_sizes_median)
reference_renorm <- sweep(
reference[celltype_all, gene_test, drop = FALSE],
MARGIN = 2,
STATS = exp(platform_effects[gene_test]),
FUN = "*"
)
gene_list_type <- setdiff(
gene_test,
gene_test[which(reference_renorm[celltype, ] < expr_thresh)]
)
# Compare with the reference of other cell types
celltype_means <- reference_renorm[celltype_all, gene_list_type, drop = FALSE]
if (ncol(proportions) > 1) {
celltype_mean_ratio <- celltype_means[celltype, ] / apply(celltype_means, 2, max)
gene_list_type <- gene_list_type[which(celltype_mean_ratio >= reference_threshold)]
}
return(gene_list_type)
}
#' Get the platform effects for each gene
#'
#' @description
#' Fuction for data preprocessing.
#' Credit goes to the R package `spacexr` (\url{https://github.com/dmcable/spacexr}).
#'
#' @param counts A matrix. Each row represents a spot and each column represents a gene.
#' @param library_sizes A numeric vector containing library sizes of all spots.
#' @param proportions A numeric matrix containing the proportion of each cell type at each spot.
#' @param reference A numeric matrix containing the reference expression of genes for each cell type.
#' @param spot_effects A numeric vector containing the spot effects for each spot.
#'
#' @return A numeric vector containing the platform effects for each gene.
#'
#' @export
mcubeGetPlatformEffects <- function(
counts, library_sizes, proportions,
reference, spot_effects) {
bulk_vec <- colSums(as.matrix(counts))
weight_celltype <- colSums(
library_sizes * exp(spot_effects) * proportions
) / sum(library_sizes)
weight_avg <- colSums(
reference * weight_celltype / sum(weight_celltype),
)
target_means <- bulk_vec / sum(library_sizes)
platform_effects <- log(target_means / weight_avg)
platform_effects[!is.finite(platform_effects)] <- 0
names(platform_effects) <- colnames(reference)
return(platform_effects)
}
#' Get main cell types
#'
#' @description
#' Fuction for data preprocessing.
#' Credit goes to the R package `spacexr` (\url{https://github.com/dmcable/spacexr}).
#'
#' @param proportions A numeric matrix containing the proportion of each cell type at each spot.
#' @param celltype_test A character vector specifying the cell types to test.
#' @param proportion_threshold A numeric value. The proportions below this threshold will be ignored.
#' @param celltype_threshold A numeric value. Cell types with column sums less than this number will be filtered out.
#'
#' @return A character vector containing the names of cell types that pass the threshold.
#'
#' @export
mcubeFilterCellTypes <- function(
proportions, celltype_test = NULL,
proportion_threshold = 0.1, celltype_threshold = 100) {
proportions[proportions < proportion_threshold] <- 0
celltype_default <- names(which(colSums(proportions) >= celltype_threshold))
if (!is.null(celltype_test)) {
diff_celltypes <- setdiff(celltype_test, celltype_default)
if (length(diff_celltypes) > 0) {
message(
"mcubeFilterCellTypes: Cell types ",
paste0(diff_celltypes, collapse = ", "),
" have less than the minimum celltype_threshold = ", celltype_threshold,
". To include these cell-types, please reduce the celltype_threshold."
)
}
celltype_test <- intersect(celltype_test, celltype_default)
} else {
celltype_test <- celltype_default
}
if (length(celltype_test) == 0) {
stop(
"mcubeFilterCellTypes: No cell types occure greater than",
" celltype_threshold ", celltype_threshold, "!"
)
}
message(
"mcubeFilterCellTypes: Cell types ",
paste(celltype_default, collapse = ", "),
" pass the celltype_threshold = ", celltype_threshold, "."
)
return(celltype_test)
}
#' Get cell-type-specific spatially variable genes
#'
#' @description Get spatially variable genes specific to a cell type based on the adjusted p-values.
#'
#' @importFrom stats p.adjust
#'
#' @param pvalues_list A list of data frames. Each data frame contains p-values of genes for a certain cell type.
#' @param which_pvalue A character specifying the column name of p-values in each data frame.
#' @param adjust_method A character specifying the method for multiple testing correction.
#' @param alpha A numeric value. The significance level.
#'
#' @return A list of data frames. Each data frame contains the significant genes for a certain cell type.
#'
#' @export
mcubeGetSigGenes <- function(
pvalues_list, which_pvalue = "combined_pvalue",
adjust_method = "BH", alpha = 0.05) {
all_methods <- c(
"holm", "hochberg", "hommel", "bonferroni",
"BH", "BY", "fdr", "none"
)
if (!(adjust_method %in% all_methods)) {
stop(
"mcubeGetSigGenes: Adjust method must be one of ",
paste(all_methods, collapse = ", "), "!"
) # End
}
message(
"mcubeGetSigGenes: Set adjust_method as ", adjust_method,
" and alpha as ", alpha, "."
)
sig_genes_list <- lapply(
pvalues_list,
FUN = function(x) {
x <- x[order(x[, which_pvalue]), ]
adjusted_pvalues <- stats::p.adjust(x[, which_pvalue], method = adjust_method)
sig_genes_idx <- which(adjusted_pvalues <= alpha)
data.frame(
pvalue = x[sig_genes_idx, which_pvalue],
adjusted_pvalue = adjusted_pvalues[sig_genes_idx],
row.names = rownames(x)[sig_genes_idx]
)
}
)
names(sig_genes_list) <- names(pvalues_list)
return(sig_genes_list)
}
#' Aggregated Cauchy Assocaition Test
#'
#' @description
#' A p-value combination method using the Cauchy distribution.
#' Credit goes to the R package `ACAT` (\url{https://github.com/yaowuliu/ACAT}).
#'
#' @param Weights a numeric vector/matrix of non-negative weights for the combined p-values. When it is NULL, the equal weights are used.
#' @param Pvals a numeric vector/matrix of p-values. When it is a matrix, each column of p-values is combined by ACAT.
#' @param threshold a numeric value of the numerical precision of the Cauchy cumulative density function.
#'
#' @return The p-value(s) of ACAT.
#'
#' @examples p.values <- c(2e-02, 4e-04, 0.2, 0.1, 0.8)
#' ACAT(Pvals = p.values)
#' @examples ACAT(matrix(runif(1000), ncol = 10))
#'
#' @references Liu, Y., & Xie, J. (2019). Cauchy combination test: a powerful test with analytic p-value calculation
#' under arbitrary dependency structures. \emph{Journal of American Statistical Association},115(529), 393-402. (\href{https://amstat.tandfonline.com/doi/abs/10.1080/01621459.2018.1554485}{pub})
#'
#' @export
ACAT <- function(Pvals, Weights = NULL, threshold = 5.55e-17) {
#### check if there is NA
if (sum(is.na(Pvals)) > 0) {
stop("ACAT: Cannot have NAs in the p-values!")
}
#### check if Pvals are between 0 and 1
if ((sum(Pvals < 0) + sum(Pvals > 1)) > 0) {
stop("ACAT: P-values must be between 0 and 1!")
}
#### check if there are pvals that are either exactly 0 or 1.
is.zero <- (sum(Pvals == 0) >= 1)
is.one <- (sum(Pvals == 1) >= 1)
if (is.zero && is.one) {
# stop("ACAT: Cannot have both 0 and 1 p-values!")
return(NA)
}
if (is.zero) {
Pvals[which(Pvals == 0)] <- 5.55e-17
}
if (is.one) {
Pvals[which((1 - Pvals) < 1e-3)] <- 0.99
}
#### Default: equal weights. If not, check the validity of the user supplied weights and standadize them.
if (is.null(Weights)) {
Weights <- rep(1 / length(Pvals), length(Pvals))
} else if (length(Weights) != length(Pvals)) {
stop("ACAT: The length of weights should be the same as that of the p-values")
} else if (sum(Weights < 0) > 0) {
stop("ACAT: All the weights must be positive!")
} else {
Weights <- Weights / sum(Weights)
}
#### check if there are very small non-zero p values
is.small <- (Pvals < 1e-16)
if (sum(is.small) == 0) {
cct.stat <- sum(Weights * tan((0.5 - Pvals) * pi))
} else {
cct.stat <- sum((Weights[is.small] / Pvals[is.small]) / pi)
cct.stat <- cct.stat + sum(Weights[!is.small] * tan((0.5 - Pvals[!is.small]) * pi))
}
#### check if the test statistic is very large.
if (cct.stat > 1e+15) {
pval <- (1 / cct.stat) / pi
} else {
pval <- 1 - stats::pcauchy(cct.stat)
}
pval[pval < threshold] <- min(c(Pvals, threshold))
return(pval)
}