-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMCubeHelpers.R
466 lines (400 loc) · 15.9 KB
/
MCubeHelpers.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
#' Convert a list of p-values to a long table
#'
#' @param pvalues_list A list recording the p-values from the SVG test.
#' Each element is a data.frame corresponding to a cell type.
#' @param which_pvalue A character specifying the column name of p-values. The default is `combined_pvalue`.
#'
#' @return A data.frame for the p-values in a long table.
#'
#' @export
mcubePvalueList2LongTable <- function(pvalues_list, which_pvalue = "combined_pvalue") {
pvalues_df <- do.call(
rbind,
lapply(
names(pvalues_list),
function(x) {
data.frame(
celltype = x,
gene = rownames(pvalues_list[[x]]),
pvalue = pvalues_list[[x]][, which_pvalue]
)
}
)
)
return(pvalues_df)
}
#' Generate a data.frame for QQ plot
#'
#' @param pvalues_df A data.frame with p-values.
#' @param which_pvalue A character string specifying the column name of p-values. The default is `combined_pvalue`.
#' @param minus_log10p_max A numeric value specifying the maximum value of -log10(p-value). The default is `5`.
#'
#' @return A data.frame for QQ plot.
#'
#' @export
mcubeQQPlotDF <- function(pvalues_df, which_pvalue = "combined_pvalue", minus_log10p_max = 5) {
pvalues_order <- order(pvalues_df[, which_pvalue])
pvalues <- pvalues_df[pvalues_order, which_pvalue]
n_pvalues <- length(pvalues)
pvalues_theoretical <- ppoints(n_pvalues)
minus_log10p <- -log10(pvalues)
minus_log10p[minus_log10p > minus_log10p_max] <- minus_log10p_max
minus_log10p_theoretical <- -log10(pvalues_theoretical)
qqplot_df <- data.frame(
gene = rownames(pvalues_df)[pvalues_order],
pvalue = pvalues,
pvalue_theoretical = pvalues_theoretical,
minus_log10p = minus_log10p,
minus_log10p_theoretical = minus_log10p_theoretical
)
return(qqplot_df)
}
#' Rotate a 2D spatial coordinate matrix
#'
#' @param theta The angle of rotation in degrees.
#'
#' @return A rotation matrix of the given angle.
#'
#' @export
rotation_matrix_2d <- function(theta) {
theta_rad <- theta * (pi / 180) # Convert degrees to radians
matrix(
c(
cos(theta_rad), -sin(theta_rad),
sin(theta_rad), cos(theta_rad)
),
nrow = 2,
ncol = 2,
byrow = TRUE
)
}
#' Format the GWAS summary statistics data
#'
#' @details
#' Format the GWAS summary statistics data for linkage disequilibrium score regression (LDSC) (\url{https://doi.org/10.1038/ng.3211}).
#' Credit goes to `ldsc` (\url{https://github.com/bulik/ldsc}) and the R package `MR-APSS` (\url{https://github.com/YangLabHKUST/MR-APSS}).
#'
#' @importFrom readr read_delim
#' @importFrom dplyr inner_join select filter distinct one_of
#' @importFrom tidyr drop_na
#' @importFrom stats qchisq pchisq sd median
#'
#' @param sumstats Summary statistics data or name of the summary statistics data file.
#' @param delim A single character used to separate fields within a record.
#' @param snps_merge A data.frame with SNPs to extract.
#' @param snps_mhc A set of SNPs that needed to be removed. For example, SNPs in MHC region.
#' @param dbSNP A tibble with two columns `rsID` = (rs) identifiers and `SNP` = chr:location. The data is from the dbSNP153 database (https://genome.ucsc.edu/FAQ/FAQdownloads.html#snp).
#' @param snp_col Name of a column with rs numbers. The default is `SNP`.
#' @param beta_col Name of a column with effect sizes (= log(or)). The default is `BETA`.
#' @param or_col Name of a column with odds ratios. The default is `OR`.
#' @param se_col Name of a column with standard errors. The default is `SE`.
#' @param freq_col Name of a column with effect allele frequencies. The default is `FREQ`.
#' @param a1_col Name of a column with effect alleles. The default is `A1`.
#' @param a2_col Name of a column with the other alleles. The default is `A2`.
#' @param p_col Name of a column with p-values. The default is `P`.
#' @param n_col Name of a column with sample sizes. The default is `N`.
#' @param ncase_col Name of a column with the number of cases. The default is `N_CASE`.
#' @param ncontrol_col Name of a column with the number of controls. The default is `N_CONTROL`.
#' @param n Sample size. If the column for sample size is not available, users can use argument "n" to specify the total sample size for each SNP.
#' @param z_col Name of a column with z scores. The default is `Z`.
#' @param info_col Name of a column with imputation INFO. The default is `INFO`.
#' @param log_pval Logical, whether the p-value is in -log10 scale. The default is `FALSE`.
#' @param chi2_max SNPs with tested chi^2 statistics large than chi2_max will be removed. The default is `80`.
#' @param min_freq SNPs with allele frequency less than min_freq will be removed. The default is `0.05`.
#'
#' @return Formatted summary statistics data
#'
#' @export
format_sumstats <- function(
sumstats, delim = NULL,
snps_merge = NULL, snps_mhc = NULL, dbSNP = NULL,
snp_col = "SNP", beta_col = "BETA", or_col = "OR", se_col = "SE",
freq_col = "FREQ", a1_col = "A1", a2_col = "A2", p_col = "P",
n_col = "N", ncase_col = "N_CASE", ncontrol_col = "N_CONTROL", n = NULL,
z_col = "Z", info_col = "INFO", log_pval = FALSE,
chi2_max = NULL, min_freq = 0.05) {
if (is.null(sumstats)) {
stop("please provide the information on summary statistics!")
}
if (is.character(sumstats)) {
message("Begin reading in summary statistics data...")
sumstats <- readr::read_delim(sumstats, delim = delim, comment = "##")
}
message("Begin formatting...")
message("The raw dataset has ", nrow(sumstats), " lines.")
cols <- c(
snp_col, beta_col, or_col, se_col,
freq_col, a1_col, a2_col, p_col,
ncase_col, ncontrol_col, n_col,
z_col, info_col
)
cols <- names(sumstats)[names(sumstats) %in% cols]
out <- c("SNP", "A1", "A2", "Z", "N", "CHI2", "P")
sumstats <- sumstats[, cols]
# check SNP
if (!snp_col %in% names(sumstats)) {
stop("SNP not found!")
}
# check A1/A2
if ((!a1_col %in% names(sumstats)) | (!a2_col %in% names(sumstats))) {
stop("A1/A2 not found!")
}
# check signed statistics
if ((!beta_col %in% names(sumstats)) &
(!or_col %in% names(sumstats)) &
(!z_col %in% names(sumstats))) {
stop("Signed statistics not found!")
}
# check sample size n
if ((!n_col %in% names(sumstats)) & is.null(n) &
(!(ncase_col %in% names(sumstats) & ncontrol_col %in% names(sumstats)))) {
stop("information for sample size not found!")
}
# check sample size n
if (ncase_col %in% names(sumstats)) {
names(sumstats)[which(names(sumstats) == ncase_col)[1]] <- "N_CASE"
if (!is.numeric(sumstats$N_CASE)) {
message("n_case column is not numeric data. Coercing...")
sumstats$N_CASE <- as.numeric(as.character(sumstats$N_CASE))
}
}
if (ncontrol_col %in% names(sumstats)) {
names(sumstats)[which(names(sumstats) == ncontrol_col)[1]] <- "N_CONTROL"
if (!is.numeric(sumstats$N_CONTROL)) {
message("n_control column is not numeric data. Coercing...")
sumstats$N_CONTROL <- as.numeric(as.character(sumstats$N_CONTROL))
}
}
if (n_col %in% names(sumstats)) {
names(sumstats)[which(names(sumstats) == n_col)[1]] <- "N"
if (!is.numeric(sumstats$N)) {
message("Sample size column is not numeric data. Coercing...")
sumstats$N <- as.numeric(as.character(sumstats$N))
}
} else if ("N_CASE" %in% names(sumstats) & "N_CONTROL" %in% names(sumstats)) {
message("Generate sample size from sample size of case and control.")
sumstats$N <- sumstats$N_CASE + sumstats$N_CONTROL
} else if (!is.null(n)) {
message("Generate sample size from specified sample size.")
sumstats$N <- n
}
sumstats$N <- as.integer(sumstats$N)
names(sumstats)[which(names(sumstats) == snp_col)[1]] <- "SNP"
# Format SNP id
sumstats$SNP <- tolower(sumstats$SNP)
sumstats$SNP <- gsub("[[:space:]]", "", sumstats$SNP)
if (grepl("^rs[0-9]+:", sumstats$SNP[1])) {
sumstats$SNP <- gsub(":.*", "", sumstats$SNP)
}
# Merge with dbSNP
if (!is.null(dbSNP)) {
sumstats <- dplyr::inner_join(sumstats, dbSNP, by = "SNP")
sumstats <- dplyr::select(sumstats, -("SNP"))
names(sumstats)[which(names(sumstats) == "rsID")[1]] <- "SNP"
message(
"Merge the data with dbSNP..., remaining ",
nrow(sumstats), " SNPs."
)
}
# Remove NAs
sumstats <- tidyr::drop_na(sumstats, SNP)
message("Remove missing data..., remaining ", nrow(sumstats), " SNPs.")
# Merge with HapMap 3 SNPs lists
if (!is.null(snps_merge)) {
snps_merge <- dplyr::select(snps_merge, SNP, A1, A2)
names(snps_merge) <- c("SNP", "ref_A1", "ref_A2")
sumstats <- dplyr::inner_join(sumstats, snps_merge, by = "SNP")
message(
"Merge the data with the HapMap 3 SNPs list by SNP..., remaining ",
nrow(sumstats), " SNPs."
)
}
# remove MHC SNPs
if (!is.null(snps_mhc)) {
sumstats <- dplyr::filter(sumstats, !SNP %in% snps_mhc)
message("Remove the SNPs in MHC region..., remaining ", nrow(sumstats), " SNPs.")
}
# check imputation INFO
if (info_col %in% names(sumstats)) {
names(sumstats)[which(names(sumstats) == info_col)[1]] <- "INFO"
sumstats <- dplyr::filter(sumstats, INFO > 0.9 | is.na(INFO))
message(
"Remove the SNPs with imputation INFO less than 0.9 (keeping NAs)..., remaining ",
nrow(sumstats), " SNPs."
)
}
# check effect allele (A1)
if (a1_col %in% names(sumstats)) {
names(sumstats)[which(names(sumstats) == a1_col)[1]] <- "A1"
if (is.logical(sumstats$A1)) {
sumstats$A1 <- substr(as.character(sumstats$A1), 1, 1)
}
if (!is.character(sumstats$A1)) {
message("Effect allele column is not character data. Coercing...")
sumstats$A1 <- as.character(sumstats$A1)
}
sumstats$A1 <- toupper(sumstats$A1)
sumstats <- dplyr::filter(sumstats, grepl("^[ACTG]+$", A1))
message(
"Remove the SNPs whose effect allele has the value that is not A/C/T/G...., remaining ",
nrow(sumstats), " SNPs."
)
}
# return(sumstats)
# check the other allele (A2)
if (a2_col %in% names(sumstats)) {
names(sumstats)[which(names(sumstats) == a2_col)[1]] <- "A2"
if (is.logical(sumstats$A2)) {
sumstats$A2 <- substr(as.character(sumstats$A2), 1, 1)
}
if (!is.character(sumstats$A2)) {
message("The other allele column is not character data. Coercing...")
sumstats$A2 <- as.character(sumstats$A2)
}
sumstats$A2 <- toupper(sumstats$A2)
sumstats <- dplyr::filter(sumstats, grepl("^[ACTG]+$", A2))
message(
"Remove the SNPs whose non effect allele has the value that is not A/C/T/G...., remaining ",
nrow(sumstats), " SNPs."
)
}
# remove ambiguous SNPs
# A-T, C-G
sumstats <- dplyr::filter(
sumstats,
(A1 == "A" & A2 == "C") |
(A1 == "A" & A2 == "G") |
(A1 == "T" & A2 == "C") |
(A1 == "T" & A2 == "G") |
(A1 == "C" & A2 == "A") |
(A1 == "C" & A2 == "T") |
(A1 == "G" & A2 == "A") |
(A1 == "G" & A2 == "T")
)
message("Remove the ambiguous SNPs..., remaining ", nrow(sumstats), " SNPs.")
# duplicated SNPs
sumstats <- dplyr::distinct(sumstats, SNP, .keep_all = TRUE)
message("Remove the duplicated SNPs..., remaining ", nrow(sumstats), " SNPs.")
# check estimate of effect size (BETA) and standard error (SE)
if (beta_col %in% names(sumstats)) {
names(sumstats)[which(names(sumstats) == beta_col)[1]] <- "BETA"
if (!is.numeric(sumstats$BETA)) {
message("Effect size column is not numeric data. Coercing...")
sumstats$BETA <- as.numeric(as.character(sumstats$BETA))
}
sumstats <- dplyr::filter(sumstats, is.finite(BETA))
# check se
if (se_col %in% names(sumstats)) {
names(sumstats)[which(names(sumstats) == se_col)[1]] <- "SE"
if (!is.numeric(sumstats$SE)) {
message("Standard error column is not numeric data. Coercing...")
sumstats$SE <- as.numeric(as.character(sumstats$SE))
}
sumstats <- dplyr::filter(sumstats, is.finite(SE) & SE > 0)
}
}
# set BETA as log(OR) when BETA is not available
if (!beta_col %in% names(sumstats) & or_col %in% names(sumstats)) {
names(sumstats)[which(names(sumstats) == or_col)[1]] <- "OR"
message("Infer effect size column from log(OR)...")
sumstats$BETA <- log(sumstats$OR)
# check se
if (se_col %in% names(sumstats)) {
names(sumstats)[which(names(sumstats) == se_col)[1]] <- "SE"
if (!is.numeric(sumstats$SE)) {
message("Standard error column is not numeric data. Coercing...")
sumstats$SE <- as.numeric(as.character(sumstats$SE))
}
sumstats <- dplyr::filter(sumstats, is.finite(SE) & SE > 0)
}
}
# check FREQ
if (freq_col %in% names(sumstats)) {
names(sumstats)[which(names(sumstats) == freq_col)[1]] <- "FREQ"
if (!is.numeric(sumstats$FREQ)) {
message("Frequency column is not numeric data. Coercing...")
sumstats$FREQ <- as.numeric(as.character(sumstats$FREQ))
}
# remove SNP with allele frequency less than min_freq
sumstats <- dplyr::filter(sumstats, (FREQ > min_freq & FREQ < (1 - min_freq)) | is.na(FREQ))
message(
"Remove the SNPs with allele frequency less than min_freq (keeping NAs)..., remaining ",
nrow(sumstats), " SNPs."
)
}
# check p-value
if (p_col %in% names(sumstats) & log_pval) {
names(sumstats)[which(names(sumstats) == p_col)[1]] <- "P"
if (!is.numeric(sumstats$P)) {
message("p-value column is not numeric data. Coercing...")
sumstats$P <- as.numeric(sumstats$P)
}
sumstats$P <- 10^(-sumstats$P)
} else if (p_col %in% names(sumstats) & (!log_pval)) {
names(sumstats)[which(names(sumstats) == p_col)[1]] <- "P"
if (!is.numeric(sumstats$P)) {
message("p-value column is not numeric data. Coercing...")
sumstats$P <- as.numeric(sumstats$P)
}
sumstats <- dplyr::filter(sumstats, P >= 0 & P <= 1)
message(
"Remove the SNPs with p-value < 0 or p-value > 1..., remaining ",
nrow(sumstats), " SNPs."
)
}
# check z-score
if (z_col %in% names(sumstats)) {
names(sumstats)[which(names(sumstats) == z_col)[1]] <- "Z"
if (!is.numeric(sumstats$Z)) {
message("z-score column is not numeric data. Coercing...")
sumstats$Z <- as.numeric(sumstats$Z)
}
message("Calculate chi-square column from z-score.")
sumstats$CHI2 <- (sumstats$Z)^2
}
# calculate z-score from p-value
if ((!"Z" %in% names(sumstats)) & "P" %in% names(sumstats) & "BETA" %in% names(sumstats)) {
message("Infer z-score from p-value and effect size.")
sumstats$CHI2 <- stats::qchisq(sumstats$P, 1, lower.tail = FALSE)
sumstats$Z <- sign(sumstats$BETA) * sqrt(sumstats$CHI2)
}
# calculate z-score if no Z, but with BETA and SE
if ((!"Z" %in% names(sumstats)) &
("BETA" %in% names(sumstats) & "SE" %in% names(sumstats))) {
message("Infer z-score from effect size and standard error.")
sumstats$Z <- sumstats$BETA / sumstats$SE
sumstats$CHI2 <- (sumstats$Z)^2
}
# calculate chi-square if no CHI2
if (!"CHI2" %in% names(sumstats)) {
sumstats$CHI2 <- (sumstats$Z)^2
}
# calculate p-value if no P
if (!"P" %in% names(sumstats)) {
sumstats$P <- stats::pchisq(sumstats$CHI2, 1, lower.tail = FALSE)
}
if (!"Z" %in% names(sumstats)) {
stop("no information for z-score!")
}
# check missing data
sumstats <- dplyr::select(sumstats, dplyr::one_of(out))
sumstats <- tidyr::drop_na(sumstats)
message("Remove missing values..., remaining ", nrow(sumstats), " SNPs.")
n_min <- mean(sumstats$N) - 5 * stats::sd(sumstats$N)
n_max <- mean(sumstats$N) + 5 * stats::sd(sumstats$N)
sumstats <- dplyr::filter(sumstats, N >= n_min & N <= n_max)
message(
"Remove SNPs with sample size 5 standard deviations away from the mean..., remaining ",
nrow(sumstats), " SNPs."
)
if (is.null(chi2_max)) {
chi2_max <- max(c(80, stats::median(sumstats$N) / 1000))
}
sumstats <- dplyr::filter(sumstats, CHI2 < chi2_max)
message(
"Remove SNPs with chi-square > chi2_max..., remaining ",
nrow(sumstats), " SNPs."
)
message("The formatted data has ", nrow(sumstats), " lines.")
return(sumstats)
}