Relase 2 for PGSXplorer

tutkuyaras · Jan 8, 2025 · 5aa7733 · 5aa7733
commit 5aa7733
Show file tree

Hide file tree

Showing 100 changed files with 2,544 additions and 0 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/.command.sh b/.command.sh
@@ -0,0 +1,2 @@
+#!/bin/bash -ue
+zcat phasedvcf_chr1.vcf.gz | grep -v '^#' | awk '{print $1, $2, $3}' > rsid_map_chr1.txt
diff --git a/.nextflow/cache/047bdf78-a8b9-456c-bf7a-fa792e31cc93/db/000003.log b/.nextflow/cache/047bdf78-a8b9-456c-bf7a-fa792e31cc93/db/000003.log
diff --git a/.nextflow/cache/047bdf78-a8b9-456c-bf7a-fa792e31cc93/db/CURRENT b/.nextflow/cache/047bdf78-a8b9-456c-bf7a-fa792e31cc93/db/CURRENT
@@ -0,0 +1 @@
+MANIFEST-000002
diff --git a/.nextflow/cache/047bdf78-a8b9-456c-bf7a-fa792e31cc93/db/LOCK b/.nextflow/cache/047bdf78-a8b9-456c-bf7a-fa792e31cc93/db/LOCK
diff --git a/.nextflow/cache/047bdf78-a8b9-456c-bf7a-fa792e31cc93/db/MANIFEST-000002 b/.nextflow/cache/047bdf78-a8b9-456c-bf7a-fa792e31cc93/db/MANIFEST-000002
diff --git a/.nextflow/cache/047bdf78-a8b9-456c-bf7a-fa792e31cc93/index.evil_legentil b/.nextflow/cache/047bdf78-a8b9-456c-bf7a-fa792e31cc93/index.evil_legentil
diff --git a/.nextflow/cache/4d6a8e6c-25c0-4bad-86b8-715eb5a5fbdf/db/000003.log b/.nextflow/cache/4d6a8e6c-25c0-4bad-86b8-715eb5a5fbdf/db/000003.log
diff --git a/.nextflow/cache/4d6a8e6c-25c0-4bad-86b8-715eb5a5fbdf/db/CURRENT b/.nextflow/cache/4d6a8e6c-25c0-4bad-86b8-715eb5a5fbdf/db/CURRENT
@@ -0,0 +1 @@
+MANIFEST-000002
diff --git a/.nextflow/cache/4d6a8e6c-25c0-4bad-86b8-715eb5a5fbdf/db/LOCK b/.nextflow/cache/4d6a8e6c-25c0-4bad-86b8-715eb5a5fbdf/db/LOCK
diff --git a/.nextflow/cache/4d6a8e6c-25c0-4bad-86b8-715eb5a5fbdf/db/MANIFEST-000002 b/.nextflow/cache/4d6a8e6c-25c0-4bad-86b8-715eb5a5fbdf/db/MANIFEST-000002
diff --git a/.nextflow/cache/4d6a8e6c-25c0-4bad-86b8-715eb5a5fbdf/index.kickass_euler b/.nextflow/cache/4d6a8e6c-25c0-4bad-86b8-715eb5a5fbdf/index.kickass_euler
diff --git a/.nextflow/cache/8a32e597-0fae-4fe6-bde4-332d5f87fdea/db/000003.log b/.nextflow/cache/8a32e597-0fae-4fe6-bde4-332d5f87fdea/db/000003.log
diff --git a/.nextflow/cache/8a32e597-0fae-4fe6-bde4-332d5f87fdea/db/CURRENT b/.nextflow/cache/8a32e597-0fae-4fe6-bde4-332d5f87fdea/db/CURRENT
@@ -0,0 +1 @@
+MANIFEST-000002
diff --git a/.nextflow/cache/8a32e597-0fae-4fe6-bde4-332d5f87fdea/db/LOCK b/.nextflow/cache/8a32e597-0fae-4fe6-bde4-332d5f87fdea/db/LOCK
diff --git a/.nextflow/cache/8a32e597-0fae-4fe6-bde4-332d5f87fdea/db/MANIFEST-000002 b/.nextflow/cache/8a32e597-0fae-4fe6-bde4-332d5f87fdea/db/MANIFEST-000002
diff --git a/.nextflow/cache/8a32e597-0fae-4fe6-bde4-332d5f87fdea/index.berserk_faggin b/.nextflow/cache/8a32e597-0fae-4fe6-bde4-332d5f87fdea/index.berserk_faggin
diff --git a/.nextflow/cache/8b7dcc42-cfa0-4267-993c-d92d4e2ec0bf/db/000003.log b/.nextflow/cache/8b7dcc42-cfa0-4267-993c-d92d4e2ec0bf/db/000003.log
diff --git a/.nextflow/cache/8b7dcc42-cfa0-4267-993c-d92d4e2ec0bf/db/CURRENT b/.nextflow/cache/8b7dcc42-cfa0-4267-993c-d92d4e2ec0bf/db/CURRENT
@@ -0,0 +1 @@
+MANIFEST-000002
diff --git a/.nextflow/cache/8b7dcc42-cfa0-4267-993c-d92d4e2ec0bf/db/LOCK b/.nextflow/cache/8b7dcc42-cfa0-4267-993c-d92d4e2ec0bf/db/LOCK
diff --git a/.nextflow/cache/8b7dcc42-cfa0-4267-993c-d92d4e2ec0bf/db/MANIFEST-000002 b/.nextflow/cache/8b7dcc42-cfa0-4267-993c-d92d4e2ec0bf/db/MANIFEST-000002
diff --git a/.nextflow/cache/8b7dcc42-cfa0-4267-993c-d92d4e2ec0bf/index.gigantic_keller b/.nextflow/cache/8b7dcc42-cfa0-4267-993c-d92d4e2ec0bf/index.gigantic_keller
diff --git a/.nextflow/history b/.nextflow/history
@@ -0,0 +1,4 @@
+2024-12-09 16:49:55	-	gigantic_keller	-	348db7354f08ee080d5ed7021b1e2114	8b7dcc42-cfa0-4267-993c-d92d4e2ec0bf	nextflow run main.nf --run_plink false --run_prsice false --run_LDpred2grid false --run_LDpred2auto false --run_Lassosum2 false --run_prscsx false --run_megaprs false
+2024-12-09 16:51:20	6.6s	berserk_faggin	ERR	f324416b50445b817c88a9a700016174	8a32e597-0fae-4fe6-bde4-332d5f87fdea	nextflow run main.nf --run_plink false --run_prsice false --run_LDpred2grid false --run_LDpred2auto false --run_Lassosum2 false --run_prscsx false --run_megaprs false
+2024-12-09 16:56:12	7.6s	evil_legentil	ERR	db0d5f7f64fe21ad8d61a059f22a9e9e	047bdf78-a8b9-456c-bf7a-fa792e31cc93	nextflow run main.nf --run_plink false --run_prsice false --run_LDpred2grid false --run_LDpred2auto false --run_Lassosum2 false --run_prscsx false --run_megaprs false
+2024-12-09 17:07:42	6m 4s	kickass_euler	ERR	b1cc4c25d53196387214f31567700398	4d6a8e6c-25c0-4bad-86b8-715eb5a5fbdf	nextflow run main.nf --run_plink false --run_prsice false --run_LDpred2grid false --run_LDpred2auto false --run_Lassosum2 false --run_prscsx false --run_megaprs false
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,37 @@
+FROM tutkuyaras/pgsxplorer_image:latest
+
+RUN apt-get update && apt-get install -y \
+    wget \
+    curl \
+    bash \
+    bzip2 \
+    build-essential \
+    libbz2-dev \
+    zlib1g-dev \
+    liblzma-dev \
+    libcurl4-openssl-dev \
+    libssl-dev \
+    && apt-get clean && rm -rf /var/lib/apt/lists/*
+
+RUN wget -O /usr/local/bin/beagle.jar https://faculty.washington.edu/browning/beagle/beagle.28Jun21.220.jar && \
+    echo '#!/bin/bash\njava -jar /usr/local/bin/beagle.jar "$@"' > /usr/local/bin/beagle && \
+    chmod +x /usr/local/bin/beagle
+
+RUN wget -O /tmp/eagle.tar.gz https://storage.googleapis.com/broad-alkesgroup-public/Eagle/downloads/Eagle_v2.4.1.tar.gz && \
+    tar -xzvf /tmp/eagle.tar.gz -C /usr/local/bin/ && \
+    chmod +x /usr/local/bin/Eagle_v2.4.1/eagle && \
+    rm /tmp/eagle.tar.gz
+
+RUN wget https://github.com/samtools/bcftools/releases/download/1.17/bcftools-1.17.tar.bz2 && \
+    tar -xjf bcftools-1.17.tar.bz2 && \
+    cd bcftools-1.17 && \
+    ./configure && \
+    make && \
+    make install && \
+    cd .. && \
+    rm -rf bcftools-1.17 bcftools-1.17.tar.bz2
+
+ENV PATH="/usr/local/bin/Eagle_v2.4.1:$PATH"
+
+RUN apt-get clean && \
+    rm -rf /var/lib/apt/lists/* /tmp/*
diff --git a/QC_graphs/.DS_Store b/QC_graphs/.DS_Store
diff --git a/bin/.DS_Store b/bin/.DS_Store
diff --git a/bin/.Rhistory b/bin/.Rhistory
diff --git a/bin/LDpred2auto.R b/bin/LDpred2auto.R
@@ -0,0 +1,233 @@
+# Set CRAN mirror
+options(repos = c(CRAN = "https://cran.rstudio.com"))
+options(bigstatsr.check.parallel.blas = FALSE)
+options(default.nproc.blas = NULL)
+
+# Install necessary packages if not already installed
+if (!requireNamespace("remotes", quietly = TRUE)) install.packages("remotes")
+if (!requireNamespace("bigsnpr", quietly = TRUE)) remotes::install_github("privefl/bigsnpr") 
+if (!requireNamespace("dplyr", quietly = TRUE)) install.packages("dplyr")
+if (!requireNamespace("R.utils", quietly = TRUE)) install.packages("R.utils")
+if (!requireNamespace("data.table", quietly = TRUE)) install.packages("data.table") 
+if (!requireNamespace("magrittr", quietly = TRUE)) install.packages("magrittr")
+
+library(bigsnpr)
+library(dplyr)
+library(R.utils)
+library(data.table)
+library(magrittr)
+library(fmsb)
+
+# Parse arguments
+args <- commandArgs(trailingOnly = TRUE)
+phenotype_file <- args[1]
+eigenvec_file <- args[2]
+gwas_sumstat <- args[3]
+bed_file <- args[4]
+
+# 1. Read in the phenotype and covariate files
+phenotype <- fread(phenotype_file)
+pcs <- fread(eigenvec_file)
+colnames(pcs) <- c('FID', 'IID', paste0('PC', 1:10))
+pheno <- merge(phenotype, pcs)
+pheno$Phenotype <- ifelse(pheno$Phenotype == 2, 1, 0)
+
+# 2. Information about the HapMap3+ variants
+info <- readRDS(runonce::download_file(
+  'https://figshare.com/ndownloader/files/37802721',
+  dir = 'tmp-data2', fname = 'map_hm3_plus.rds'))
+str(info)
+
+# 3. Read external summary statistics
+sumstats <- bigreadr::fread2(gwas_sumstat)
+sumstats <- sumstats %>% rename('chr' = 'CHR', 'pos' = 'BP', 'a1' = 'A1', 'a0' = 'A2', 'A1_FREQ' = 'A1_FREQ', 'n_eff' = 'N', 'OR' = 'OR', 'beta_se' = 'SE', 'L95' = 'L95', 'U95' = 'U95', 'Z_STAT' = 'Z_STAT', 'p' = 'P', 'rsid' = 'SNP')
+sumstats <- sumstats %>% select(chr, pos, rsid, a1, a0, n_eff, beta_se, p, OR)
+sumstats$beta <- log(sumstats$OR)
+sumstats <- sumstats[sumstats$rsid %in% info$rsid,]
+fwrite(sumstats, 'ldpred2auto_output.txt')
+
+# Calculate LD Matrix
+NCORES <- nb_cores()
+tmp <- tempfile(tmpdir = 'tmp-data2')
+on.exit(file.remove(paste0(tmp, '.sbk')), add = TRUE)
+corr <- NULL
+ld <- NULL
+fam.order <- NULL
+
+# Debug messages
+cat("Reading bed file...\n")
+# Read from bed/bim/fam, it generates .bk and .rds files.
+snp_readBed(bed_file)
+cat("Finished reading bed file.\n")
+# Debug message
+cat("Attaching bigSNP object...\n")
+# Attach the "bigSNP" object in R session
+rds_file <- sub(".bed$", ".rds", bed_file)
+cat("RDS file path:", rds_file, "\n")
+obj.bigSNP <- snp_attach(rds_file)
+# extract the SNP information from the genotype
+map <- setNames(obj.bigSNP$map[-3], c('chr', 'rsid', 'pos', 'a1', 'a0'))
+# perform SNP matching
+df_beta <- snp_match(sumstats, map)
+
+# Get aliases for useful slots
+G <- obj.bigSNP$genotypes
+CHR <- obj.bigSNP$map$chromosome
+POS <- obj.bigSNP$map$physical.pos
+# Change affection column values into 0 and 1 
+obj.bigSNP$fam$affection[obj.bigSNP$fam$affection == 1] <- 0
+obj.bigSNP$fam$affection[obj.bigSNP$fam$affection == 2] <- 1
+y <- obj.bigSNP$fam$affection
+
+#correlation (take 3cm)
+# To convert physical positions (in bp) to genetic positions (in cM), use
+POS2 <- snp_asGeneticPos(CHR, POS, dir = "tmp-data2", ncores = NCORES)
+# Before computing the LD matrices, let us filter out variants with small minor allele frequencies (MAFs):
+ind.row <- rows_along(G)
+maf <- snp_MAF(G, ind.row = ind.row, ind.col = df_beta$`_NUM_ID_`, ncores = NCORES)
+maf_thr <- 1 / sqrt(length(ind.row))  # threshold I like to use
+df_beta <- df_beta[maf > maf_thr, ]
+#df_beta <- snp_match(sumstats, map, join_by_pos = FALSE)  # use rsid instead of pos
+
+## Prepare data as validation and test
+set.seed(1)
+# Calculate the number of samples for validation data
+n_val <- floor(0.7 * nrow(G))
+# Sample indices for validation data
+ind.val <- sample(nrow(G), n_val)
+# Remaining indices for test data
+ind.test <- setdiff(rows_along(G), ind.val)
+# Debug message
+cat("Split data as validation (70%) and test (30%)\n")
+
+# Debug message
+cat("Calculating LD matrix...\n")
+# calculate LD
+for (chr in unique(df_beta$chr)) {
+
+  # print(chr)
+  # indices in 'df_beta'
+  ind.chr <- which(df_beta$chr == chr)
+
+  # indices in 'G'
+  ind.chr2 <- df_beta$`_NUM_ID_`[ind.chr]
+
+  # here we compute LD matrices ourselves, BUT
+  # we recall that we provide pre-computed LD matrices that are 
+  # usually much better (larger N, LD blocks for robustness, etc)
+  corr0 <- snp_cor(G, ind.col = ind.chr2, size = 3 / 1000,
+                   infos.pos = POS2[ind.chr2], ncores = NCORES)
+
+  if (chr == 1) {
+    ld <- Matrix::colSums(corr0^2)
+    corr <- as_SFBM(corr0, tmp, compact = TRUE)
+  } else {
+    ld <- c(ld, Matrix::colSums(corr0^2))
+    corr$add_columns(corr0, nrow(corr))
+  }
+}
+
+# We assume the fam order is the same across different chromosomes
+fam.order <- as.data.table(obj.bigSNP$fam)
+# Rename fam order
+setnames(fam.order,c("family.ID", "sample.ID"), c("FID", "IID"))
+
+# Debug message
+cat("Perform LD score regression\n")
+
+# Perform LD score regression
+ldsc <- snp_ldsc(   ld, 
+                    length(ld), 
+                    chi2 = (df_beta$beta / df_beta$beta_se)^2,
+                    sample_size = df_beta$n_eff, 
+                    blocks = NULL)
+h2_est <- ldsc[["h2"]]
+
+# Calculate the null R2 (binary trait
+# Reformat the phenotype file such that y is of the same order as the 
+# sample ordering in the genotype file
+a <- pheno[fam.order, on = c("FID", "IID")]
+# Calculate the null R2
+# use glm for binary trait 
+# (will also need the fmsb package to calculate the pseudo R2)
+null.model <- paste("PC", 1:10, sep = "", collapse = "+") %>%
+  paste0("Phenotype~", .) %>%
+  as.formula %>%
+  glm(data = a, family=binomial) 
+null.r2 <- fmsb::NagelkerkeR2(null.model)
+
+### LD-Pred:auto
+coef_shrink <- 0.95  # reduce this up to 0.4 if you have some (large) mismatch with the LD ref
+set.seed(1)  # to get the same result every time
+
+ # Get adjusted beta from the auto model
+ multi_auto <- snp_ldpred2_auto(
+ corr,
+ df_beta,
+ h2_init = h2_est,
+ vec_p_init = seq_log(1e-4, 0.9, length.out = 30),
+ ncores = NCORES,  allow_jump_sign = FALSE, shrink_corr = coef_shrink)
+
+ str(multi_auto, max.level = 1)
+
+ str(multi_auto[[1]], max.level = 1)
+# Debug message
+cat("Visualization of auto model \n")
+
+# Visualization of auto
+library(ggplot2)
+auto <- multi_auto[[1]]  # first chain
+
+pdf("LDpred2 auto graph.pdf")
+plot_grid(
+qplot(y = auto$path_p_est) + 
+theme_bigstatsr() + 
+geom_hline(yintercept = auto$p_est, col = "blue") +
+scale_y_log10() +
+labs(y = "p"),
+qplot(y = auto$path_h2_est) + 
+theme_bigstatsr() + 
+geom_hline(yintercept = auto$h2_est, col = "blue") +
+labs(y = "h2"),
+ncol = 1, align = "hv")
+dev.off()
+
+
+(range <- sapply(multi_auto, function(auto) diff(range(auto$corr_est))))
+(keep <- which(range > (0.95 * quantile(range, 0.95, na.rm = TRUE))))
+
+# Debug message
+cat("Obtain Model PRS \n")
+
+# (Obtain Model PRS)
+beta_auto <- rowMeans(sapply(multi_auto[keep], function(auto) auto$beta_est))
+pred_auto <- big_prodVec(G, beta_auto, ind.row = ind.test, ind.col = df_beta[["_NUM_ID_"]])
+pcor(pred_auto, y[ind.test], NULL)
+
+# Get final effects(for all) 
+
+# Get final effects (for all data)
+beta_auto_all <- rowMeans(sapply(multi_auto[keep], function(auto) auto$beta_est))
+pred_auto_all <- big_prodVec(G, beta_auto, ind.row = ind.row, ind.col = df_beta[["_NUM_ID_"]])
+
+reg.formula <- paste("PC", 1:10, sep = "", collapse = "+") %>%
+paste0("Phenotype~", .) %>%
+as.formula
+reg.dat <- a
+reg.dat$PRS <- pred_auto_all
+auto.model <- lm(reg.formula, dat=reg.dat) %>%
+summary
+(result <- data.table(
+  auto = auto.model$r.squared - null.r2$R2,
+  null = null.r2$R2
+))
+
+write.table(reg.dat, "LDpred2_auto.txt", sep = "\t", row.names = FALSE, col.names = TRUE)
+
+# Debug message
+cat("Auto results was written as LDpred2_auto.txt file \n")
+
+# Debug message
+cat("Some cleaning \n")
+# Some cleaning
+rm(corr); gc(); file.remove(paste0(tmp, ".sbk"))
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		#!/bin/bash -ue
		zcat phasedvcf_chr1.vcf.gz \| grep -v '^#' \| awk '{print $1, $2, $3}' > rsid_map_chr1.txt