-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Tutku YARAŞ
authored and
Tutku YARAŞ
committed
Jan 8, 2025
0 parents
commit 5aa7733
Showing
100 changed files
with
2,544 additions
and
0 deletions.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
#!/bin/bash -ue | ||
zcat phasedvcf_chr1.vcf.gz | grep -v '^#' | awk '{print $1, $2, $3}' > rsid_map_chr1.txt |
Empty file.
1 change: 1 addition & 0 deletions
1
.nextflow/cache/047bdf78-a8b9-456c-bf7a-fa792e31cc93/db/CURRENT
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
MANIFEST-000002 |
Empty file.
Binary file added
BIN
+50 Bytes
.nextflow/cache/047bdf78-a8b9-456c-bf7a-fa792e31cc93/db/MANIFEST-000002
Binary file not shown.
Empty file.
Binary file not shown.
1 change: 1 addition & 0 deletions
1
.nextflow/cache/4d6a8e6c-25c0-4bad-86b8-715eb5a5fbdf/db/CURRENT
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
MANIFEST-000002 |
Empty file.
Binary file added
BIN
+50 Bytes
.nextflow/cache/4d6a8e6c-25c0-4bad-86b8-715eb5a5fbdf/db/MANIFEST-000002
Binary file not shown.
Binary file added
BIN
+833 Bytes
.nextflow/cache/4d6a8e6c-25c0-4bad-86b8-715eb5a5fbdf/index.kickass_euler
Binary file not shown.
Empty file.
1 change: 1 addition & 0 deletions
1
.nextflow/cache/8a32e597-0fae-4fe6-bde4-332d5f87fdea/db/CURRENT
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
MANIFEST-000002 |
Empty file.
Binary file added
BIN
+50 Bytes
.nextflow/cache/8a32e597-0fae-4fe6-bde4-332d5f87fdea/db/MANIFEST-000002
Binary file not shown.
Empty file.
Binary file not shown.
1 change: 1 addition & 0 deletions
1
.nextflow/cache/8b7dcc42-cfa0-4267-993c-d92d4e2ec0bf/db/CURRENT
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
MANIFEST-000002 |
Empty file.
Binary file not shown.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
2024-12-09 16:49:55 - gigantic_keller - 348db7354f08ee080d5ed7021b1e2114 8b7dcc42-cfa0-4267-993c-d92d4e2ec0bf nextflow run main.nf --run_plink false --run_prsice false --run_LDpred2grid false --run_LDpred2auto false --run_Lassosum2 false --run_prscsx false --run_megaprs false | ||
2024-12-09 16:51:20 6.6s berserk_faggin ERR f324416b50445b817c88a9a700016174 8a32e597-0fae-4fe6-bde4-332d5f87fdea nextflow run main.nf --run_plink false --run_prsice false --run_LDpred2grid false --run_LDpred2auto false --run_Lassosum2 false --run_prscsx false --run_megaprs false | ||
2024-12-09 16:56:12 7.6s evil_legentil ERR db0d5f7f64fe21ad8d61a059f22a9e9e 047bdf78-a8b9-456c-bf7a-fa792e31cc93 nextflow run main.nf --run_plink false --run_prsice false --run_LDpred2grid false --run_LDpred2auto false --run_Lassosum2 false --run_prscsx false --run_megaprs false | ||
2024-12-09 17:07:42 6m 4s kickass_euler ERR b1cc4c25d53196387214f31567700398 4d6a8e6c-25c0-4bad-86b8-715eb5a5fbdf nextflow run main.nf --run_plink false --run_prsice false --run_LDpred2grid false --run_LDpred2auto false --run_Lassosum2 false --run_prscsx false --run_megaprs false |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
FROM tutkuyaras/pgsxplorer_image:latest | ||
|
||
RUN apt-get update && apt-get install -y \ | ||
wget \ | ||
curl \ | ||
bash \ | ||
bzip2 \ | ||
build-essential \ | ||
libbz2-dev \ | ||
zlib1g-dev \ | ||
liblzma-dev \ | ||
libcurl4-openssl-dev \ | ||
libssl-dev \ | ||
&& apt-get clean && rm -rf /var/lib/apt/lists/* | ||
|
||
RUN wget -O /usr/local/bin/beagle.jar https://faculty.washington.edu/browning/beagle/beagle.28Jun21.220.jar && \ | ||
echo '#!/bin/bash\njava -jar /usr/local/bin/beagle.jar "$@"' > /usr/local/bin/beagle && \ | ||
chmod +x /usr/local/bin/beagle | ||
|
||
RUN wget -O /tmp/eagle.tar.gz https://storage.googleapis.com/broad-alkesgroup-public/Eagle/downloads/Eagle_v2.4.1.tar.gz && \ | ||
tar -xzvf /tmp/eagle.tar.gz -C /usr/local/bin/ && \ | ||
chmod +x /usr/local/bin/Eagle_v2.4.1/eagle && \ | ||
rm /tmp/eagle.tar.gz | ||
|
||
RUN wget https://github.com/samtools/bcftools/releases/download/1.17/bcftools-1.17.tar.bz2 && \ | ||
tar -xjf bcftools-1.17.tar.bz2 && \ | ||
cd bcftools-1.17 && \ | ||
./configure && \ | ||
make && \ | ||
make install && \ | ||
cd .. && \ | ||
rm -rf bcftools-1.17 bcftools-1.17.tar.bz2 | ||
|
||
ENV PATH="/usr/local/bin/Eagle_v2.4.1:$PATH" | ||
|
||
RUN apt-get clean && \ | ||
rm -rf /var/lib/apt/lists/* /tmp/* |
Binary file not shown.
Binary file not shown.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,233 @@ | ||
# Set CRAN mirror | ||
options(repos = c(CRAN = "https://cran.rstudio.com")) | ||
options(bigstatsr.check.parallel.blas = FALSE) | ||
options(default.nproc.blas = NULL) | ||
|
||
# Install necessary packages if not already installed | ||
if (!requireNamespace("remotes", quietly = TRUE)) install.packages("remotes") | ||
if (!requireNamespace("bigsnpr", quietly = TRUE)) remotes::install_github("privefl/bigsnpr") | ||
if (!requireNamespace("dplyr", quietly = TRUE)) install.packages("dplyr") | ||
if (!requireNamespace("R.utils", quietly = TRUE)) install.packages("R.utils") | ||
if (!requireNamespace("data.table", quietly = TRUE)) install.packages("data.table") | ||
if (!requireNamespace("magrittr", quietly = TRUE)) install.packages("magrittr") | ||
|
||
library(bigsnpr) | ||
library(dplyr) | ||
library(R.utils) | ||
library(data.table) | ||
library(magrittr) | ||
library(fmsb) | ||
|
||
# Parse arguments | ||
args <- commandArgs(trailingOnly = TRUE) | ||
phenotype_file <- args[1] | ||
eigenvec_file <- args[2] | ||
gwas_sumstat <- args[3] | ||
bed_file <- args[4] | ||
|
||
# 1. Read in the phenotype and covariate files | ||
phenotype <- fread(phenotype_file) | ||
pcs <- fread(eigenvec_file) | ||
colnames(pcs) <- c('FID', 'IID', paste0('PC', 1:10)) | ||
pheno <- merge(phenotype, pcs) | ||
pheno$Phenotype <- ifelse(pheno$Phenotype == 2, 1, 0) | ||
|
||
# 2. Information about the HapMap3+ variants | ||
info <- readRDS(runonce::download_file( | ||
'https://figshare.com/ndownloader/files/37802721', | ||
dir = 'tmp-data2', fname = 'map_hm3_plus.rds')) | ||
str(info) | ||
|
||
# 3. Read external summary statistics | ||
sumstats <- bigreadr::fread2(gwas_sumstat) | ||
sumstats <- sumstats %>% rename('chr' = 'CHR', 'pos' = 'BP', 'a1' = 'A1', 'a0' = 'A2', 'A1_FREQ' = 'A1_FREQ', 'n_eff' = 'N', 'OR' = 'OR', 'beta_se' = 'SE', 'L95' = 'L95', 'U95' = 'U95', 'Z_STAT' = 'Z_STAT', 'p' = 'P', 'rsid' = 'SNP') | ||
sumstats <- sumstats %>% select(chr, pos, rsid, a1, a0, n_eff, beta_se, p, OR) | ||
sumstats$beta <- log(sumstats$OR) | ||
sumstats <- sumstats[sumstats$rsid %in% info$rsid,] | ||
fwrite(sumstats, 'ldpred2auto_output.txt') | ||
|
||
# Calculate LD Matrix | ||
NCORES <- nb_cores() | ||
tmp <- tempfile(tmpdir = 'tmp-data2') | ||
on.exit(file.remove(paste0(tmp, '.sbk')), add = TRUE) | ||
corr <- NULL | ||
ld <- NULL | ||
fam.order <- NULL | ||
|
||
# Debug messages | ||
cat("Reading bed file...\n") | ||
# Read from bed/bim/fam, it generates .bk and .rds files. | ||
snp_readBed(bed_file) | ||
cat("Finished reading bed file.\n") | ||
# Debug message | ||
cat("Attaching bigSNP object...\n") | ||
# Attach the "bigSNP" object in R session | ||
rds_file <- sub(".bed$", ".rds", bed_file) | ||
cat("RDS file path:", rds_file, "\n") | ||
obj.bigSNP <- snp_attach(rds_file) | ||
# extract the SNP information from the genotype | ||
map <- setNames(obj.bigSNP$map[-3], c('chr', 'rsid', 'pos', 'a1', 'a0')) | ||
# perform SNP matching | ||
df_beta <- snp_match(sumstats, map) | ||
|
||
# Get aliases for useful slots | ||
G <- obj.bigSNP$genotypes | ||
CHR <- obj.bigSNP$map$chromosome | ||
POS <- obj.bigSNP$map$physical.pos | ||
# Change affection column values into 0 and 1 | ||
obj.bigSNP$fam$affection[obj.bigSNP$fam$affection == 1] <- 0 | ||
obj.bigSNP$fam$affection[obj.bigSNP$fam$affection == 2] <- 1 | ||
y <- obj.bigSNP$fam$affection | ||
|
||
#correlation (take 3cm) | ||
# To convert physical positions (in bp) to genetic positions (in cM), use | ||
POS2 <- snp_asGeneticPos(CHR, POS, dir = "tmp-data2", ncores = NCORES) | ||
# Before computing the LD matrices, let us filter out variants with small minor allele frequencies (MAFs): | ||
ind.row <- rows_along(G) | ||
maf <- snp_MAF(G, ind.row = ind.row, ind.col = df_beta$`_NUM_ID_`, ncores = NCORES) | ||
maf_thr <- 1 / sqrt(length(ind.row)) # threshold I like to use | ||
df_beta <- df_beta[maf > maf_thr, ] | ||
#df_beta <- snp_match(sumstats, map, join_by_pos = FALSE) # use rsid instead of pos | ||
|
||
## Prepare data as validation and test | ||
set.seed(1) | ||
# Calculate the number of samples for validation data | ||
n_val <- floor(0.7 * nrow(G)) | ||
# Sample indices for validation data | ||
ind.val <- sample(nrow(G), n_val) | ||
# Remaining indices for test data | ||
ind.test <- setdiff(rows_along(G), ind.val) | ||
# Debug message | ||
cat("Split data as validation (70%) and test (30%)\n") | ||
|
||
# Debug message | ||
cat("Calculating LD matrix...\n") | ||
# calculate LD | ||
for (chr in unique(df_beta$chr)) { | ||
|
||
# print(chr) | ||
# indices in 'df_beta' | ||
ind.chr <- which(df_beta$chr == chr) | ||
|
||
# indices in 'G' | ||
ind.chr2 <- df_beta$`_NUM_ID_`[ind.chr] | ||
|
||
# here we compute LD matrices ourselves, BUT | ||
# we recall that we provide pre-computed LD matrices that are | ||
# usually much better (larger N, LD blocks for robustness, etc) | ||
corr0 <- snp_cor(G, ind.col = ind.chr2, size = 3 / 1000, | ||
infos.pos = POS2[ind.chr2], ncores = NCORES) | ||
|
||
if (chr == 1) { | ||
ld <- Matrix::colSums(corr0^2) | ||
corr <- as_SFBM(corr0, tmp, compact = TRUE) | ||
} else { | ||
ld <- c(ld, Matrix::colSums(corr0^2)) | ||
corr$add_columns(corr0, nrow(corr)) | ||
} | ||
} | ||
|
||
# We assume the fam order is the same across different chromosomes | ||
fam.order <- as.data.table(obj.bigSNP$fam) | ||
# Rename fam order | ||
setnames(fam.order,c("family.ID", "sample.ID"), c("FID", "IID")) | ||
|
||
# Debug message | ||
cat("Perform LD score regression\n") | ||
|
||
# Perform LD score regression | ||
ldsc <- snp_ldsc( ld, | ||
length(ld), | ||
chi2 = (df_beta$beta / df_beta$beta_se)^2, | ||
sample_size = df_beta$n_eff, | ||
blocks = NULL) | ||
h2_est <- ldsc[["h2"]] | ||
|
||
# Calculate the null R2 (binary trait | ||
# Reformat the phenotype file such that y is of the same order as the | ||
# sample ordering in the genotype file | ||
a <- pheno[fam.order, on = c("FID", "IID")] | ||
# Calculate the null R2 | ||
# use glm for binary trait | ||
# (will also need the fmsb package to calculate the pseudo R2) | ||
null.model <- paste("PC", 1:10, sep = "", collapse = "+") %>% | ||
paste0("Phenotype~", .) %>% | ||
as.formula %>% | ||
glm(data = a, family=binomial) | ||
null.r2 <- fmsb::NagelkerkeR2(null.model) | ||
|
||
### LD-Pred:auto | ||
coef_shrink <- 0.95 # reduce this up to 0.4 if you have some (large) mismatch with the LD ref | ||
set.seed(1) # to get the same result every time | ||
|
||
# Get adjusted beta from the auto model | ||
multi_auto <- snp_ldpred2_auto( | ||
corr, | ||
df_beta, | ||
h2_init = h2_est, | ||
vec_p_init = seq_log(1e-4, 0.9, length.out = 30), | ||
ncores = NCORES, allow_jump_sign = FALSE, shrink_corr = coef_shrink) | ||
|
||
str(multi_auto, max.level = 1) | ||
|
||
str(multi_auto[[1]], max.level = 1) | ||
# Debug message | ||
cat("Visualization of auto model \n") | ||
|
||
# Visualization of auto | ||
library(ggplot2) | ||
auto <- multi_auto[[1]] # first chain | ||
|
||
pdf("LDpred2 auto graph.pdf") | ||
plot_grid( | ||
qplot(y = auto$path_p_est) + | ||
theme_bigstatsr() + | ||
geom_hline(yintercept = auto$p_est, col = "blue") + | ||
scale_y_log10() + | ||
labs(y = "p"), | ||
qplot(y = auto$path_h2_est) + | ||
theme_bigstatsr() + | ||
geom_hline(yintercept = auto$h2_est, col = "blue") + | ||
labs(y = "h2"), | ||
ncol = 1, align = "hv") | ||
dev.off() | ||
|
||
|
||
(range <- sapply(multi_auto, function(auto) diff(range(auto$corr_est)))) | ||
(keep <- which(range > (0.95 * quantile(range, 0.95, na.rm = TRUE)))) | ||
|
||
# Debug message | ||
cat("Obtain Model PRS \n") | ||
|
||
# (Obtain Model PRS) | ||
beta_auto <- rowMeans(sapply(multi_auto[keep], function(auto) auto$beta_est)) | ||
pred_auto <- big_prodVec(G, beta_auto, ind.row = ind.test, ind.col = df_beta[["_NUM_ID_"]]) | ||
pcor(pred_auto, y[ind.test], NULL) | ||
|
||
# Get final effects(for all) | ||
|
||
# Get final effects (for all data) | ||
beta_auto_all <- rowMeans(sapply(multi_auto[keep], function(auto) auto$beta_est)) | ||
pred_auto_all <- big_prodVec(G, beta_auto, ind.row = ind.row, ind.col = df_beta[["_NUM_ID_"]]) | ||
|
||
reg.formula <- paste("PC", 1:10, sep = "", collapse = "+") %>% | ||
paste0("Phenotype~", .) %>% | ||
as.formula | ||
reg.dat <- a | ||
reg.dat$PRS <- pred_auto_all | ||
auto.model <- lm(reg.formula, dat=reg.dat) %>% | ||
summary | ||
(result <- data.table( | ||
auto = auto.model$r.squared - null.r2$R2, | ||
null = null.r2$R2 | ||
)) | ||
|
||
write.table(reg.dat, "LDpred2_auto.txt", sep = "\t", row.names = FALSE, col.names = TRUE) | ||
|
||
# Debug message | ||
cat("Auto results was written as LDpred2_auto.txt file \n") | ||
|
||
# Debug message | ||
cat("Some cleaning \n") | ||
# Some cleaning | ||
rm(corr); gc(); file.remove(paste0(tmp, ".sbk")) |
Oops, something went wrong.