diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..692f1d2 Binary files /dev/null and b/.DS_Store differ diff --git a/.command.sh b/.command.sh new file mode 100644 index 0000000..e258707 --- /dev/null +++ b/.command.sh @@ -0,0 +1,2 @@ +#!/bin/bash -ue +zcat phasedvcf_chr1.vcf.gz | grep -v '^#' | awk '{print $1, $2, $3}' > rsid_map_chr1.txt diff --git a/.nextflow/cache/047bdf78-a8b9-456c-bf7a-fa792e31cc93/db/000003.log b/.nextflow/cache/047bdf78-a8b9-456c-bf7a-fa792e31cc93/db/000003.log new file mode 100644 index 0000000..e69de29 diff --git a/.nextflow/cache/047bdf78-a8b9-456c-bf7a-fa792e31cc93/db/CURRENT b/.nextflow/cache/047bdf78-a8b9-456c-bf7a-fa792e31cc93/db/CURRENT new file mode 100644 index 0000000..1a84852 --- /dev/null +++ b/.nextflow/cache/047bdf78-a8b9-456c-bf7a-fa792e31cc93/db/CURRENT @@ -0,0 +1 @@ +MANIFEST-000002 diff --git a/.nextflow/cache/047bdf78-a8b9-456c-bf7a-fa792e31cc93/db/LOCK b/.nextflow/cache/047bdf78-a8b9-456c-bf7a-fa792e31cc93/db/LOCK new file mode 100644 index 0000000..e69de29 diff --git a/.nextflow/cache/047bdf78-a8b9-456c-bf7a-fa792e31cc93/db/MANIFEST-000002 b/.nextflow/cache/047bdf78-a8b9-456c-bf7a-fa792e31cc93/db/MANIFEST-000002 new file mode 100644 index 0000000..bbbc585 Binary files /dev/null and b/.nextflow/cache/047bdf78-a8b9-456c-bf7a-fa792e31cc93/db/MANIFEST-000002 differ diff --git a/.nextflow/cache/047bdf78-a8b9-456c-bf7a-fa792e31cc93/index.evil_legentil b/.nextflow/cache/047bdf78-a8b9-456c-bf7a-fa792e31cc93/index.evil_legentil new file mode 100644 index 0000000..e69de29 diff --git a/.nextflow/cache/4d6a8e6c-25c0-4bad-86b8-715eb5a5fbdf/db/000003.log b/.nextflow/cache/4d6a8e6c-25c0-4bad-86b8-715eb5a5fbdf/db/000003.log new file mode 100644 index 0000000..748d4e8 Binary files /dev/null and b/.nextflow/cache/4d6a8e6c-25c0-4bad-86b8-715eb5a5fbdf/db/000003.log differ diff --git a/.nextflow/cache/4d6a8e6c-25c0-4bad-86b8-715eb5a5fbdf/db/CURRENT b/.nextflow/cache/4d6a8e6c-25c0-4bad-86b8-715eb5a5fbdf/db/CURRENT new 
file mode 100644 index 0000000..1a84852 --- /dev/null +++ b/.nextflow/cache/4d6a8e6c-25c0-4bad-86b8-715eb5a5fbdf/db/CURRENT @@ -0,0 +1 @@ +MANIFEST-000002 diff --git a/.nextflow/cache/4d6a8e6c-25c0-4bad-86b8-715eb5a5fbdf/db/LOCK b/.nextflow/cache/4d6a8e6c-25c0-4bad-86b8-715eb5a5fbdf/db/LOCK new file mode 100644 index 0000000..e69de29 diff --git a/.nextflow/cache/4d6a8e6c-25c0-4bad-86b8-715eb5a5fbdf/db/MANIFEST-000002 b/.nextflow/cache/4d6a8e6c-25c0-4bad-86b8-715eb5a5fbdf/db/MANIFEST-000002 new file mode 100644 index 0000000..bbbc585 Binary files /dev/null and b/.nextflow/cache/4d6a8e6c-25c0-4bad-86b8-715eb5a5fbdf/db/MANIFEST-000002 differ diff --git a/.nextflow/cache/4d6a8e6c-25c0-4bad-86b8-715eb5a5fbdf/index.kickass_euler b/.nextflow/cache/4d6a8e6c-25c0-4bad-86b8-715eb5a5fbdf/index.kickass_euler new file mode 100644 index 0000000..fcafd86 Binary files /dev/null and b/.nextflow/cache/4d6a8e6c-25c0-4bad-86b8-715eb5a5fbdf/index.kickass_euler differ diff --git a/.nextflow/cache/8a32e597-0fae-4fe6-bde4-332d5f87fdea/db/000003.log b/.nextflow/cache/8a32e597-0fae-4fe6-bde4-332d5f87fdea/db/000003.log new file mode 100644 index 0000000..e69de29 diff --git a/.nextflow/cache/8a32e597-0fae-4fe6-bde4-332d5f87fdea/db/CURRENT b/.nextflow/cache/8a32e597-0fae-4fe6-bde4-332d5f87fdea/db/CURRENT new file mode 100644 index 0000000..1a84852 --- /dev/null +++ b/.nextflow/cache/8a32e597-0fae-4fe6-bde4-332d5f87fdea/db/CURRENT @@ -0,0 +1 @@ +MANIFEST-000002 diff --git a/.nextflow/cache/8a32e597-0fae-4fe6-bde4-332d5f87fdea/db/LOCK b/.nextflow/cache/8a32e597-0fae-4fe6-bde4-332d5f87fdea/db/LOCK new file mode 100644 index 0000000..e69de29 diff --git a/.nextflow/cache/8a32e597-0fae-4fe6-bde4-332d5f87fdea/db/MANIFEST-000002 b/.nextflow/cache/8a32e597-0fae-4fe6-bde4-332d5f87fdea/db/MANIFEST-000002 new file mode 100644 index 0000000..bbbc585 Binary files /dev/null and b/.nextflow/cache/8a32e597-0fae-4fe6-bde4-332d5f87fdea/db/MANIFEST-000002 differ diff --git 
a/.nextflow/cache/8a32e597-0fae-4fe6-bde4-332d5f87fdea/index.berserk_faggin b/.nextflow/cache/8a32e597-0fae-4fe6-bde4-332d5f87fdea/index.berserk_faggin new file mode 100644 index 0000000..e69de29 diff --git a/.nextflow/cache/8b7dcc42-cfa0-4267-993c-d92d4e2ec0bf/db/000003.log b/.nextflow/cache/8b7dcc42-cfa0-4267-993c-d92d4e2ec0bf/db/000003.log new file mode 100644 index 0000000..9e0f96a Binary files /dev/null and b/.nextflow/cache/8b7dcc42-cfa0-4267-993c-d92d4e2ec0bf/db/000003.log differ diff --git a/.nextflow/cache/8b7dcc42-cfa0-4267-993c-d92d4e2ec0bf/db/CURRENT b/.nextflow/cache/8b7dcc42-cfa0-4267-993c-d92d4e2ec0bf/db/CURRENT new file mode 100644 index 0000000..1a84852 --- /dev/null +++ b/.nextflow/cache/8b7dcc42-cfa0-4267-993c-d92d4e2ec0bf/db/CURRENT @@ -0,0 +1 @@ +MANIFEST-000002 diff --git a/.nextflow/cache/8b7dcc42-cfa0-4267-993c-d92d4e2ec0bf/db/LOCK b/.nextflow/cache/8b7dcc42-cfa0-4267-993c-d92d4e2ec0bf/db/LOCK new file mode 100644 index 0000000..e69de29 diff --git a/.nextflow/cache/8b7dcc42-cfa0-4267-993c-d92d4e2ec0bf/db/MANIFEST-000002 b/.nextflow/cache/8b7dcc42-cfa0-4267-993c-d92d4e2ec0bf/db/MANIFEST-000002 new file mode 100644 index 0000000..594b326 Binary files /dev/null and b/.nextflow/cache/8b7dcc42-cfa0-4267-993c-d92d4e2ec0bf/db/MANIFEST-000002 differ diff --git a/.nextflow/cache/8b7dcc42-cfa0-4267-993c-d92d4e2ec0bf/index.gigantic_keller b/.nextflow/cache/8b7dcc42-cfa0-4267-993c-d92d4e2ec0bf/index.gigantic_keller new file mode 100644 index 0000000..e69de29 diff --git a/.nextflow/history b/.nextflow/history new file mode 100644 index 0000000..89a5ca6 --- /dev/null +++ b/.nextflow/history @@ -0,0 +1,4 @@ +2024-12-09 16:49:55 - gigantic_keller - 348db7354f08ee080d5ed7021b1e2114 8b7dcc42-cfa0-4267-993c-d92d4e2ec0bf nextflow run main.nf --run_plink false --run_prsice false --run_LDpred2grid false --run_LDpred2auto false --run_Lassosum2 false --run_prscsx false --run_megaprs false +2024-12-09 16:51:20 6.6s berserk_faggin ERR f324416b50445b817c88a9a700016174 
8a32e597-0fae-4fe6-bde4-332d5f87fdea nextflow run main.nf --run_plink false --run_prsice false --run_LDpred2grid false --run_LDpred2auto false --run_Lassosum2 false --run_prscsx false --run_megaprs false +2024-12-09 16:56:12 7.6s evil_legentil ERR db0d5f7f64fe21ad8d61a059f22a9e9e 047bdf78-a8b9-456c-bf7a-fa792e31cc93 nextflow run main.nf --run_plink false --run_prsice false --run_LDpred2grid false --run_LDpred2auto false --run_Lassosum2 false --run_prscsx false --run_megaprs false +2024-12-09 17:07:42 6m 4s kickass_euler ERR b1cc4c25d53196387214f31567700398 4d6a8e6c-25c0-4bad-86b8-715eb5a5fbdf nextflow run main.nf --run_plink false --run_prsice false --run_LDpred2grid false --run_LDpred2auto false --run_Lassosum2 false --run_prscsx false --run_megaprs false diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..1e83897 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,37 @@ +FROM tutkuyaras/pgsxplorer_image:latest + +RUN apt-get update && apt-get install -y \ + wget \ + curl \ + bash \ + bzip2 \ + build-essential \ + libbz2-dev \ + zlib1g-dev \ + liblzma-dev \ + libcurl4-openssl-dev \ + libssl-dev \ + && apt-get clean && rm -rf /var/lib/apt/lists/* + +RUN wget -O /usr/local/bin/beagle.jar https://faculty.washington.edu/browning/beagle/beagle.28Jun21.220.jar && \ + echo '#!/bin/bash\njava -jar /usr/local/bin/beagle.jar "$@"' > /usr/local/bin/beagle && \ + chmod +x /usr/local/bin/beagle + +RUN wget -O /tmp/eagle.tar.gz https://storage.googleapis.com/broad-alkesgroup-public/Eagle/downloads/Eagle_v2.4.1.tar.gz && \ + tar -xzvf /tmp/eagle.tar.gz -C /usr/local/bin/ && \ + chmod +x /usr/local/bin/Eagle_v2.4.1/eagle && \ + rm /tmp/eagle.tar.gz + +RUN wget https://github.com/samtools/bcftools/releases/download/1.17/bcftools-1.17.tar.bz2 && \ + tar -xjf bcftools-1.17.tar.bz2 && \ + cd bcftools-1.17 && \ + ./configure && \ + make && \ + make install && \ + cd .. 
&& \ + rm -rf bcftools-1.17 bcftools-1.17.tar.bz2 + +ENV PATH="/usr/local/bin/Eagle_v2.4.1:$PATH" + +RUN apt-get clean && \ + rm -rf /var/lib/apt/lists/* /tmp/* diff --git a/QC_graphs/.DS_Store b/QC_graphs/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/QC_graphs/.DS_Store differ diff --git a/bin/.DS_Store b/bin/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/bin/.DS_Store differ diff --git a/bin/.Rhistory b/bin/.Rhistory new file mode 100644 index 0000000..e69de29 diff --git a/bin/LDpred2auto.R b/bin/LDpred2auto.R new file mode 100644 index 0000000..6b88e35 --- /dev/null +++ b/bin/LDpred2auto.R @@ -0,0 +1,233 @@ +# Set CRAN mirror +options(repos = c(CRAN = "https://cran.rstudio.com")) +options(bigstatsr.check.parallel.blas = FALSE) +options(default.nproc.blas = NULL) + +# Install necessary packages if not already installed +if (!requireNamespace("remotes", quietly = TRUE)) install.packages("remotes") +if (!requireNamespace("bigsnpr", quietly = TRUE)) remotes::install_github("privefl/bigsnpr") +if (!requireNamespace("dplyr", quietly = TRUE)) install.packages("dplyr") +if (!requireNamespace("R.utils", quietly = TRUE)) install.packages("R.utils") +if (!requireNamespace("data.table", quietly = TRUE)) install.packages("data.table") +if (!requireNamespace("magrittr", quietly = TRUE)) install.packages("magrittr") + +library(bigsnpr) +library(dplyr) +library(R.utils) +library(data.table) +library(magrittr) +library(fmsb) + +# Parse arguments +args <- commandArgs(trailingOnly = TRUE) +phenotype_file <- args[1] +eigenvec_file <- args[2] +gwas_sumstat <- args[3] +bed_file <- args[4] + +# 1. Read in the phenotype and covariate files +phenotype <- fread(phenotype_file) +pcs <- fread(eigenvec_file) +colnames(pcs) <- c('FID', 'IID', paste0('PC', 1:10)) +pheno <- merge(phenotype, pcs) +pheno$Phenotype <- ifelse(pheno$Phenotype == 2, 1, 0) + +# 2. 
Information about the HapMap3+ variants +info <- readRDS(runonce::download_file( + 'https://figshare.com/ndownloader/files/37802721', + dir = 'tmp-data2', fname = 'map_hm3_plus.rds')) +str(info) + +# 3. Read external summary statistics +sumstats <- bigreadr::fread2(gwas_sumstat) +sumstats <- sumstats %>% rename('chr' = 'CHR', 'pos' = 'BP', 'a1' = 'A1', 'a0' = 'A2', 'A1_FREQ' = 'A1_FREQ', 'n_eff' = 'N', 'OR' = 'OR', 'beta_se' = 'SE', 'L95' = 'L95', 'U95' = 'U95', 'Z_STAT' = 'Z_STAT', 'p' = 'P', 'rsid' = 'SNP') +sumstats <- sumstats %>% select(chr, pos, rsid, a1, a0, n_eff, beta_se, p, OR) +sumstats$beta <- log(sumstats$OR) +sumstats <- sumstats[sumstats$rsid %in% info$rsid,] +fwrite(sumstats, 'ldpred2auto_output.txt') + +# Calculate LD Matrix +NCORES <- nb_cores() +tmp <- tempfile(tmpdir = 'tmp-data2') +on.exit(file.remove(paste0(tmp, '.sbk')), add = TRUE) +corr <- NULL +ld <- NULL +fam.order <- NULL + +# Debug messages +cat("Reading bed file...\n") +# Read from bed/bim/fam, it generates .bk and .rds files. 
+snp_readBed(bed_file) +cat("Finished reading bed file.\n") +# Debug message +cat("Attaching bigSNP object...\n") +# Attach the "bigSNP" object in R session +rds_file <- sub(".bed$", ".rds", bed_file) +cat("RDS file path:", rds_file, "\n") +obj.bigSNP <- snp_attach(rds_file) +# extract the SNP information from the genotype +map <- setNames(obj.bigSNP$map[-3], c('chr', 'rsid', 'pos', 'a1', 'a0')) +# perform SNP matching +df_beta <- snp_match(sumstats, map) + +# Get aliases for useful slots +G <- obj.bigSNP$genotypes +CHR <- obj.bigSNP$map$chromosome +POS <- obj.bigSNP$map$physical.pos +# Change affection column values into 0 and 1 +obj.bigSNP$fam$affection[obj.bigSNP$fam$affection == 1] <- 0 +obj.bigSNP$fam$affection[obj.bigSNP$fam$affection == 2] <- 1 +y <- obj.bigSNP$fam$affection + +#correlation (take 3cm) +# To convert physical positions (in bp) to genetic positions (in cM), use +POS2 <- snp_asGeneticPos(CHR, POS, dir = "tmp-data2", ncores = NCORES) +# Before computing the LD matrices, let us filter out variants with small minor allele frequencies (MAFs): +ind.row <- rows_along(G) +maf <- snp_MAF(G, ind.row = ind.row, ind.col = df_beta$`_NUM_ID_`, ncores = NCORES) +maf_thr <- 1 / sqrt(length(ind.row)) # threshold I like to use +df_beta <- df_beta[maf > maf_thr, ] +#df_beta <- snp_match(sumstats, map, join_by_pos = FALSE) # use rsid instead of pos + +## Prepare data as validation and test +set.seed(1) +# Calculate the number of samples for validation data +n_val <- floor(0.7 * nrow(G)) +# Sample indices for validation data +ind.val <- sample(nrow(G), n_val) +# Remaining indices for test data +ind.test <- setdiff(rows_along(G), ind.val) +# Debug message +cat("Split data as validation (70%) and test (30%)\n") + +# Debug message +cat("Calculating LD matrix...\n") +# calculate LD +for (chr in unique(df_beta$chr)) { + + # print(chr) + # indices in 'df_beta' + ind.chr <- which(df_beta$chr == chr) + + # indices in 'G' + ind.chr2 <- df_beta$`_NUM_ID_`[ind.chr] + + # here 
we compute LD matrices ourselves, BUT + # we recall that we provide pre-computed LD matrices that are + # usually much better (larger N, LD blocks for robustness, etc) + corr0 <- snp_cor(G, ind.col = ind.chr2, size = 3 / 1000, + infos.pos = POS2[ind.chr2], ncores = NCORES) + + if (chr == 1) { + ld <- Matrix::colSums(corr0^2) + corr <- as_SFBM(corr0, tmp, compact = TRUE) + } else { + ld <- c(ld, Matrix::colSums(corr0^2)) + corr$add_columns(corr0, nrow(corr)) + } +} + +# We assume the fam order is the same across different chromosomes +fam.order <- as.data.table(obj.bigSNP$fam) +# Rename fam order +setnames(fam.order,c("family.ID", "sample.ID"), c("FID", "IID")) + +# Debug message +cat("Perform LD score regression\n") + +# Perform LD score regression +ldsc <- snp_ldsc( ld, + length(ld), + chi2 = (df_beta$beta / df_beta$beta_se)^2, + sample_size = df_beta$n_eff, + blocks = NULL) +h2_est <- ldsc[["h2"]] + +# Calculate the null R2 (binary trait +# Reformat the phenotype file such that y is of the same order as the +# sample ordering in the genotype file +a <- pheno[fam.order, on = c("FID", "IID")] +# Calculate the null R2 +# use glm for binary trait +# (will also need the fmsb package to calculate the pseudo R2) +null.model <- paste("PC", 1:10, sep = "", collapse = "+") %>% + paste0("Phenotype~", .) 
%>% + as.formula %>% + glm(data = a, family=binomial) +null.r2 <- fmsb::NagelkerkeR2(null.model) + +### LD-Pred:auto +coef_shrink <- 0.95 # reduce this up to 0.4 if you have some (large) mismatch with the LD ref +set.seed(1) # to get the same result every time + + # Get adjusted beta from the auto model + multi_auto <- snp_ldpred2_auto( + corr, + df_beta, + h2_init = h2_est, + vec_p_init = seq_log(1e-4, 0.9, length.out = 30), + ncores = NCORES, allow_jump_sign = FALSE, shrink_corr = coef_shrink) + + str(multi_auto, max.level = 1) + + str(multi_auto[[1]], max.level = 1) +# Debug message +cat("Visualization of auto model \n") + +# Visualization of auto +library(ggplot2) +auto <- multi_auto[[1]] # first chain + +pdf("LDpred2 auto graph.pdf") +plot_grid( +qplot(y = auto$path_p_est) + +theme_bigstatsr() + +geom_hline(yintercept = auto$p_est, col = "blue") + +scale_y_log10() + +labs(y = "p"), +qplot(y = auto$path_h2_est) + +theme_bigstatsr() + +geom_hline(yintercept = auto$h2_est, col = "blue") + +labs(y = "h2"), +ncol = 1, align = "hv") +dev.off() + + +(range <- sapply(multi_auto, function(auto) diff(range(auto$corr_est)))) +(keep <- which(range > (0.95 * quantile(range, 0.95, na.rm = TRUE)))) + +# Debug message +cat("Obtain Model PRS \n") + +# (Obtain Model PRS) +beta_auto <- rowMeans(sapply(multi_auto[keep], function(auto) auto$beta_est)) +pred_auto <- big_prodVec(G, beta_auto, ind.row = ind.test, ind.col = df_beta[["_NUM_ID_"]]) +pcor(pred_auto, y[ind.test], NULL) + +# Get final effects(for all) + +# Get final effects (for all data) +beta_auto_all <- rowMeans(sapply(multi_auto[keep], function(auto) auto$beta_est)) +pred_auto_all <- big_prodVec(G, beta_auto, ind.row = ind.row, ind.col = df_beta[["_NUM_ID_"]]) + +reg.formula <- paste("PC", 1:10, sep = "", collapse = "+") %>% +paste0("Phenotype~", .) 
%>% +as.formula +reg.dat <- a +reg.dat$PRS <- pred_auto_all +auto.model <- lm(reg.formula, dat=reg.dat) %>% +summary +(result <- data.table( + auto = auto.model$r.squared - null.r2$R2, + null = null.r2$R2 +)) + +write.table(reg.dat, "LDpred2_auto.txt", sep = "\t", row.names = FALSE, col.names = TRUE) + +# Debug message +cat("Auto results was written as LDpred2_auto.txt file \n") + +# Debug message +cat("Some cleaning \n") +# Some cleaning +rm(corr); gc(); file.remove(paste0(tmp, ".sbk")) diff --git a/bin/LDpred2grid.R b/bin/LDpred2grid.R new file mode 100644 index 0000000..0d828d1 --- /dev/null +++ b/bin/LDpred2grid.R @@ -0,0 +1,254 @@ +# Set CRAN mirror +options(repos = c(CRAN = "https://cran.rstudio.com")) +options(bigstatsr.check.parallel.blas = FALSE) +options(default.nproc.blas = NULL) + +# Install necessary packages if not already installed +if (!requireNamespace("remotes", quietly = TRUE)) install.packages("remotes") +if (!requireNamespace("bigsnpr", quietly = TRUE)) remotes::install_github("privefl/bigsnpr") +if (!requireNamespace("dplyr", quietly = TRUE)) install.packages("dplyr") +if (!requireNamespace("R.utils", quietly = TRUE)) install.packages("R.utils") +if (!requireNamespace("data.table", quietly = TRUE)) install.packages("data.table") +if (!requireNamespace("magrittr", quietly = TRUE)) install.packages("magrittr") +if (!requireNamespace("fmsb", quietly = TRUE)) install.packages("fmsb") + +library(bigsnpr) +library(dplyr) +library(R.utils) +library(data.table) +library(magrittr) +library(fmsb) + +# Parse arguments +args <- commandArgs(trailingOnly = TRUE) +phenotype_file <- args[1] +eigenvec_file <- args[2] +gwas_sumstat <- args[3] +bed_file <- args[4] + +# 1. Read in the phenotype and covariate files +phenotype <- fread(phenotype_file) +pcs <- fread(eigenvec_file) +colnames(pcs) <- c('FID', 'IID', paste0('PC', 1:10)) +pheno <- merge(phenotype, pcs) +pheno$Phenotype <- ifelse(pheno$Phenotype == 2, 1, 0) + +# 2. 
Information about the HapMap3+ variants +info <- readRDS(runonce::download_file( + 'https://figshare.com/ndownloader/files/37802721', + dir = 'tmp-data', fname = 'map_hm3_plus.rds')) +str(info) + +# 3. Read external summary statistics +sumstats <- bigreadr::fread2(gwas_sumstat) +sumstats <- sumstats %>% rename('chr' = 'CHR', 'pos' = 'BP', 'a1' = 'A1', 'a0' = 'A2', 'A1_FREQ' = 'A1_FREQ', 'n_eff' = 'N', 'OR' = 'OR', 'beta_se' = 'SE', 'L95' = 'L95', 'U95' = 'U95', 'Z_STAT' = 'Z_STAT', 'p' = 'P', 'rsid' = 'SNP') +sumstats <- sumstats %>% select(chr, pos, rsid, a1, a0, n_eff, beta_se, p, OR) +sumstats$beta <- log(sumstats$OR) +sumstats <- sumstats[sumstats$rsid %in% info$rsid,] +fwrite(sumstats, 'ldpred2grid_output.txt') + +# Calculate LD Matrix +NCORES <- nb_cores() +tmp <- tempfile(tmpdir = 'tmp-data') +on.exit(file.remove(paste0(tmp, '.sbk')), add = TRUE) +corr <- NULL +ld <- NULL +fam.order <- NULL + +# Debug messages +cat("Reading bed file...\n") +# Read from bed/bim/fam, it generates .bk and .rds files. 
+snp_readBed(bed_file) +cat("Finished reading bed file.\n") +# Debug message +cat("Attaching bigSNP object...\n") +# Attach the "bigSNP" object in R session +rds_file <- sub(".bed$", ".rds", bed_file) +cat("RDS file path:", rds_file, "\n") +obj.bigSNP <- snp_attach(rds_file) +# extract the SNP information from the genotype +map <- setNames(obj.bigSNP$map[-3], c('chr', 'rsid', 'pos', 'a1', 'a0')) +# perform SNP matching +df_beta <- snp_match(sumstats, map) + +# Get aliases for useful slots +G <- obj.bigSNP$genotypes +CHR <- obj.bigSNP$map$chromosome +POS <- obj.bigSNP$map$physical.pos +# Change affection column values into 0 and 1 +obj.bigSNP$fam$affection[obj.bigSNP$fam$affection == 1] <- 0 +obj.bigSNP$fam$affection[obj.bigSNP$fam$affection == 2] <- 1 +y <- obj.bigSNP$fam$affection + +#correlation (take 3cm) +# To convert physical positions (in bp) to genetic positions (in cM), use +POS2 <- snp_asGeneticPos(CHR, POS, dir = "tmp-data", ncores = NCORES) +# Before computing the LD matrices, let us filter out variants with small minor allele frequencies (MAFs): +ind.row <- rows_along(G) +maf <- snp_MAF(G, ind.row = ind.row, ind.col = df_beta$`_NUM_ID_`, ncores = NCORES) +maf_thr <- 1 / sqrt(length(ind.row)) # threshold I like to use +df_beta <- df_beta[maf > maf_thr, ] +#df_beta <- snp_match(sumstats, map, join_by_pos = FALSE) # use rsid instead of pos + +## Prepare data as validation and test +set.seed(1) +# Calculate the number of samples for validation data +n_val <- floor(0.7 * nrow(G)) +# Sample indices for validation data +ind.val <- sample(nrow(G), n_val) +# Remaining indices for test data +ind.test <- setdiff(rows_along(G), ind.val) +# Debug message +cat("Split data as validation (70%) and test (30%)\n") + +# Debug message +cat("Calculating LD matrix...\n") +# calculate LD +for (chr in unique(df_beta$chr)) { + + # print(chr) + + ## indices in 'df_beta' + ind.chr <- which(df_beta$chr == chr) + ## indices in 'G' + ind.chr2 <- df_beta$`_NUM_ID_`[ind.chr] + + # 
here we compute LD matrices ourselves, BUT + # we recall that we provide pre-computed LD matrices that are + # usually much better (larger N, LD blocks for robustness, etc) + corr0 <- snp_cor(G, ind.col = ind.chr2, size = 3 / 1000, + infos.pos = POS2[ind.chr2], ncores = NCORES) + + if (chr == 1) { + ld <- Matrix::colSums(corr0^2) + corr <- as_SFBM(corr0, tmp, compact = TRUE) + } else { + ld <- c(ld, Matrix::colSums(corr0^2)) + corr$add_columns(corr0, nrow(corr)) + } +} + +# We assume the fam order is the same across different chromosomes +fam.order <- as.data.table(obj.bigSNP$fam) +# Rename fam order +setnames(fam.order,c("family.ID", "sample.ID"), c("FID", "IID")) + +# Debug message +cat("Perform LD score regression\n") + +# Perform LD score regression +ldsc <- snp_ldsc( ld, + length(ld), + chi2 = (df_beta$beta / df_beta$beta_se)^2, + sample_size = df_beta$n_eff, + blocks = NULL) +h2_est <- ldsc[["h2"]] + +# Calculate the null R2 (binary trait +# Reformat the phenotype file such that y is of the same order as the +# sample ordering in the genotype file +a <- pheno[fam.order, on = c("FID", "IID")] +# Calculate the null R2 +# use glm for binary trait +# (will also need the fmsb package to calculate the pseudo R2) +null.model <- paste("PC", 1:10, sep = "", collapse = "+") %>% + paste0("Phenotype~", .) %>% + as.formula %>% + glm(data = a, family=binomial) +null.r2 <- fmsb::NagelkerkeR2(null.model) + +# Debug message +cat("Prepare data for grid model... 
\n") + +# Prepare data for grid model +p_seq <- signif(seq_log(1e-5, 1, length.out = 21), 2) +h2_seq <- round(h2_est * c(0.3,0.7, 1, 1.4), 4) +grid.param <- expand.grid(p = p_seq, h2 = h2_seq,sparse = c(FALSE, TRUE)) + +# Debug message +cat("Prepared data for grid model \n") + +# Get adjusted beta from grid model +set.seed(1) # to get the same result every time +beta_grid <- snp_ldpred2_grid(corr, df_beta, grid.param, ncores = NCORES) + +# Debug message +cat("Get adjusted beta from grid model \n") + +## Obtain Model PRS (Validation set) +# calculate PRS for samples +pred_grid <- big_prodMat( G, beta_grid, ind.col = df_beta$`_NUM_ID_`) + +## Obtain scores For validation data +grid.param$score <- apply(pred_grid[ind.val, ], 2, function(x) { +if (all(is.na(x))) return(NA) +# summary(lm(y[ind.val] ~ x))$coef["x", 3] +summary(glm(y[ind.val] ~ x, family = "binomial"))$coef["x", 3] +}) + +# Debug message +cat("Obtain scores For validation data \n") + +## Visualization of grid glm-z (For validation set) +library(ggplot2) + +pdf("LDpred2 grid glm-z.pdf") +ggplot(grid.param, aes(x = p, y = score, color = as.factor(h2))) + +theme_bigstatsr() + +geom_point() + +geom_line() + +scale_x_log10(breaks = 10^(-5:0), minor_breaks = grid.param$p) + +facet_wrap(grid.param$sparse, labeller = label_both) + +labs(y = "GLM Z-Score", color = "h2") + +theme(legend.position = "top", panel.spacing = unit(1, "lines")) +dev.off() + +# Debug message +cat("Choose the best model \n") + +## Choose the best model +library(dplyr) +grid.param %>% +mutate(sparsity = colMeans(beta_grid == 0), id = row_number()) %>% +arrange(desc(score)) %>% +mutate_at(c("score", "sparsity"), round, digits = 3) %>% +slice(1:10) + + +best_beta_grid <- grid.param %>% +mutate(id = row_number()) %>% +filter(sparse) %>% +arrange(desc(score)) %>% +slice(1) %>% +print() %>% +pull(id) %>% +beta_grid[, .] 
+ +pred <- big_prodVec(G, best_beta_grid, ind.row = ind.test, +ind.col = df_beta[["_NUM_ID_"]]) +pcor(pred, y[ind.test], NULL) + +## Get the final performance +reg.formula <- paste("PC", 1:10, sep = "", collapse = "+") %>% + paste0("Phenotype~", .) %>% +as.formula +reg.dat <- a +max.r2 <- 0 +for(i in 1:ncol(pred_grid)){ +reg.dat$PRS <- pred_grid[,i] +grid.model <- lm(reg.formula, dat=reg.dat) %>% +summary() +if(max.r2 < grid.model$r.squared){ + max.r2 <- grid.model$r.squared +} +} +write.table(reg.dat, "LDpred2_grid.txt", sep = "\t", row.names = FALSE, col.names = TRUE) +(result <- data.table(grid = max.r2 - null.r2$R2, null = null.r2$R2)) + +# Debug message +cat("Grid results was written as LDpred2_grid.txt file \n") + +# Debug message +cat("Some cleaning \n") +# Some cleaning +rm(corr); gc(); file.remove(paste0(tmp, ".sbk")) \ No newline at end of file diff --git a/bin/Lassosum2.R b/bin/Lassosum2.R new file mode 100644 index 0000000..e57c5be --- /dev/null +++ b/bin/Lassosum2.R @@ -0,0 +1,252 @@ +# Set CRAN mirror +options(repos = c(CRAN = "https://cran.rstudio.com")) +options(bigstatsr.check.parallel.blas = FALSE) +options(default.nproc.blas = NULL) + +# Install necessary packages if not already installed +if (!requireNamespace("remotes", quietly = TRUE)) install.packages("remotes") +if (!requireNamespace("bigsnpr", quietly = TRUE)) remotes::install_github("privefl/bigsnpr") +if (!requireNamespace("dplyr", quietly = TRUE)) install.packages("dplyr") +if (!requireNamespace("R.utils", quietly = TRUE)) install.packages("R.utils") +if (!requireNamespace("data.table", quietly = TRUE)) install.packages("data.table") +if (!requireNamespace("magrittr", quietly = TRUE)) install.packages("magrittr") +if (!requireNamespace("fmsb", quietly = TRUE)) install.packages("fmsb") + +library(bigsnpr) +library(dplyr) +library(R.utils) +library(data.table) +library(magrittr) +library(fmsb) + +# Parse arguments +args <- commandArgs(trailingOnly = TRUE) +phenotype_file <- args[1] 
+eigenvec_file <- args[2] +gwas_sumstat <- args[3] +bed_file <- args[4] + +# 1. Read in the phenotype and covariate files +phenotype <- fread(phenotype_file) +pcs <- fread(eigenvec_file) +colnames(pcs) <- c('FID', 'IID', paste0('PC', 1:10)) +pheno <- merge(phenotype, pcs) +pheno$Phenotype <- ifelse(pheno$Phenotype == 2, 1, 0) + +# 2. Information about the HapMap3+ variants +info <- readRDS(runonce::download_file( + 'https://figshare.com/ndownloader/files/37802721', + dir = 'tmp-data3', fname = 'map_hm3_plus.rds')) +str(info) + +# 3. Read external summary statistics +sumstats <- bigreadr::fread2(gwas_sumstat) +sumstats <- sumstats %>% rename('chr' = 'CHR', 'pos' = 'BP', 'a1' = 'A1', 'a0' = 'A2', 'A1_FREQ' = 'A1_FREQ', 'n_eff' = 'N', 'OR' = 'OR', 'beta_se' = 'SE', 'L95' = 'L95', 'U95' = 'U95', 'Z_STAT' = 'Z_STAT', 'p' = 'P', 'rsid' = 'SNP') +sumstats <- sumstats %>% select(chr, pos, rsid, a1, a0, n_eff, beta_se, p, OR) +sumstats$beta <- log(sumstats$OR) +sumstats <- sumstats[sumstats$rsid %in% info$rsid,] +fwrite(sumstats, 'lassosum2_output.txt') + +# Calculate LD Matrix +NCORES <- nb_cores() +tmp <- tempfile(tmpdir = 'tmp-data3') +on.exit(file.remove(paste0(tmp, '.sbk')), add = TRUE) +corr <- NULL +ld <- NULL +fam.order <- NULL + +# Debug messages +cat("Reading bed file...\n") +# Read from bed/bim/fam, it generates .bk and .rds files. 
+snp_readBed(bed_file) +cat("Finished reading bed file.\n") +# Debug message +cat("Attaching bigSNP object...\n") +# Attach the "bigSNP" object in R session +rds_file <- sub(".bed$", ".rds", bed_file) +cat("RDS file path:", rds_file, "\n") +obj.bigSNP <- snp_attach(rds_file) +# extract the SNP information from the genotype +map <- setNames(obj.bigSNP$map[-3], c('chr', 'rsid', 'pos', 'a1', 'a0')) +# perform SNP matching +df_beta <- snp_match(sumstats, map) + +# Get aliases for useful slots +G <- obj.bigSNP$genotypes +CHR <- obj.bigSNP$map$chromosome +POS <- obj.bigSNP$map$physical.pos +# Change affection column values into 0 and 1 +obj.bigSNP$fam$affection[obj.bigSNP$fam$affection == 1] <- 0 +obj.bigSNP$fam$affection[obj.bigSNP$fam$affection == 2] <- 1 +y <- obj.bigSNP$fam$affection + +#correlation (take 3cm) +# To convert physical positions (in bp) to genetic positions (in cM), use +POS2 <- snp_asGeneticPos(CHR, POS, dir = "tmp-data3", ncores = NCORES) +# Before computing the LD matrices, let us filter out variants with small minor allele frequencies (MAFs): +ind.row <- rows_along(G) +maf <- snp_MAF(G, ind.row = ind.row, ind.col = df_beta$`_NUM_ID_`, ncores = NCORES) +maf_thr <- 1 / sqrt(length(ind.row)) # threshold I like to use +df_beta <- df_beta[maf > maf_thr, ] +#df_beta <- snp_match(sumstats, map, join_by_pos = FALSE) # use rsid instead of pos + +## Prepare data as validation and test +set.seed(1) +# Calculate the number of samples for validation data +n_val <- floor(0.7 * nrow(G)) +# Sample indices for validation data +ind.val <- sample(nrow(G), n_val) +# Remaining indices for test data +ind.test <- setdiff(rows_along(G), ind.val) +# Debug message +cat("Split data as validation (70%) and test (30%)\n") + +# Debug message +cat("Calculating LD matrix...\n") +# calculate LD +for (chr in unique(df_beta$chr)) { + + # print(chr) + + ## indices in 'df_beta' + ind.chr <- which(df_beta$chr == chr) + ## indices in 'G' + ind.chr2 <- df_beta$`_NUM_ID_`[ind.chr] + + # 
here we compute LD matrices ourselves, BUT + # we recall that we provide pre-computed LD matrices that are + # usually much better (larger N, LD blocks for robustness, etc) + corr0 <- snp_cor(G, ind.col = ind.chr2, size = 3 / 1000, + infos.pos = POS2[ind.chr2], ncores = NCORES) + + if (chr == 1) { + ld <- Matrix::colSums(corr0^2) + corr <- as_SFBM(corr0, tmp, compact = TRUE) + } else { + ld <- c(ld, Matrix::colSums(corr0^2)) + corr$add_columns(corr0, nrow(corr)) + } +} + +# We assume the fam order is the same across different chromosomes +fam.order <- as.data.table(obj.bigSNP$fam) +# Rename fam order +setnames(fam.order,c("family.ID", "sample.ID"), c("FID", "IID")) + +# Debug message +cat("Perform LD score regression\n") + +# Perform LD score regression +ldsc <- snp_ldsc( ld, + length(ld), + chi2 = (df_beta$beta / df_beta$beta_se)^2, + sample_size = df_beta$n_eff, + blocks = NULL) +h2_est <- ldsc[["h2"]] + +# Calculate the null R2 (binary trait +# Reformat the phenotype file such that y is of the same order as the +# sample ordering in the genotype file +a <- pheno[fam.order, on = c("FID", "IID")] +# Calculate the null R2 +# use glm for binary trait +# (will also need the fmsb package to calculate the pseudo R2) +null.model <- paste("PC", 1:10, sep = "", collapse = "+") %>% + paste0("Phenotype~", .) %>% + as.formula %>% + glm(data = a, family=binomial) +null.r2 <- fmsb::NagelkerkeR2(null.model) + +# Debug message +cat("Prepare data for grid model... 
\n") + +# Prepare data for grid model +p_seq <- signif(seq_log(1e-5, 1, length.out = 21), 2) +h2_seq <- round(h2_est * c(0.3,0.7, 1, 1.4), 4) +grid.param <- expand.grid(p = p_seq, h2 = h2_seq,sparse = c(FALSE, TRUE)) + + +# Debug message +cat("Start calculations for Lassosum2 \n") +beta_lassosum2 <- snp_lassosum2(corr, df_beta, ncores = NCORES) +(params2 <- attr(beta_lassosum2, "grid_param")) +pred_grid2 <- big_prodMat(G, beta_lassosum2, ind.col = df_beta[["_NUM_ID_"]]) + +params2$score <- apply(pred_grid2[ind.val, ], 2, function(x) { + if (all(is.na(x))) return(NA) + #summary(lm(y[ind.val] ~ x))$coef["x", 3] + summary(glm(y[ind.val] ~ x, family = "binomial"))$coef["x", 3] +}) + +library(ggplot2) +pdf("Lassosum2_graph.pdf") +ggplot(params2, aes(x = lambda, y = score, color = as.factor(delta))) + + theme_bigstatsr() + + geom_point() + + geom_line() + + scale_x_log10(breaks = 10^(-5:0)) + + labs(y = "GLM Z-Score", color = "delta") + + theme(legend.position = "top") + + guides(colour = guide_legend(nrow = 1)) +dev.off() + +best_grid_lassosum2 <- params2 %>% + mutate(id = row_number()) %>% + arrange(desc(score)) %>% + print() %>% + slice(1) %>% + pull(id) %>% + beta_lassosum2[, .] 
+ +# Debug message +cat("Choose the best among all LDpred2-grid and lassosum2 models \n") + +best_grid_overall <- + `if`(max(params2$score, na.rm = TRUE) > max(grid.param$score, na.rm = TRUE), + best_grid_lassosum2, best_beta_grid) + +coef_shrink <- 0.95 + + +multi_auto <- snp_ldpred2_auto( + corr, df_beta, h2_init = h2_est, + vec_p_init = seq_log(1e-4, 0.9, length.out = 50), ncores = NCORES, + burn_in = 500, num_iter = 500, report_step = 20, + # use_MLE = FALSE, # uncomment if you have convergence issues or when power is low (need v1.11.9) + allow_jump_sign = FALSE, shrink_corr = coef_shrink) + +(range <- sapply(multi_auto, function(auto) diff(range(auto$corr_est)))) + +(keep <- which(range > (0.95 * quantile(range, 0.95, na.rm = TRUE)))) +all_h2 <- sapply(multi_auto[keep], function(auto) tail(auto$path_h2_est, 500)) +quantile(all_h2, c(0.5, 0.025, 0.975)) + +all_p <- sapply(multi_auto[keep], function(auto) tail(auto$path_p_est, 500)) +quantile(all_p, c(0.5, 0.025, 0.975)) + +all_alpha <- sapply(multi_auto[keep], function(auto) tail(auto$path_alpha_est, 500)) +quantile(all_alpha, c(0.5, 0.025, 0.975)) + +bsamp <- lapply(multi_auto[keep], function(auto) auto$sample_beta) +all_r2 <- do.call("cbind", lapply(seq_along(bsamp), function(ic) { + b1 <- bsamp[[ic]] + Rb1 <- apply(b1, 2, function(x) + coef_shrink * bigsparser::sp_prodVec(corr, x) + (1 - coef_shrink) * x) + b2 <- do.call("cbind", bsamp[-ic]) + b2Rb1 <- as.matrix(Matrix::crossprod(b2, Rb1)) +})) +quantile(all_r2, c(0.5, 0.025, 0.975)) + +beta_auto <- rowMeans(sapply(multi_auto[keep], function(auto) auto$beta_est)) +pred_auto <- big_prodVec(G, beta_auto, ind.col = df_beta[["_NUM_ID_"]]) +pcor(pred_auto, y, NULL)^2 + +postp <- rowMeans(sapply(multi_auto[keep], function(auto) auto$postp_est)) +pdf("Lassosum2_qplot.pdf") +qplot(y = postp, alpha = I(0.2)) + theme_bigstatsr() +dev.off() + +# Debug message +cat("Some cleaning \n") +# Some cleaning +rm(corr); gc(); file.remove(paste0(tmp, ".sbk")) \ No newline at end 
of file diff --git a/bin/MAFcheck.R b/bin/MAFcheck.R new file mode 100644 index 0000000..f6daf42 --- /dev/null +++ b/bin/MAFcheck.R @@ -0,0 +1,9 @@ + #!/usr/bin/env Rscript + library(ggplot2) + library(data.table) + args <- commandArgs(trailingOnly = TRUE) + maf_freq <- read.table(args[1], header =TRUE, as.is=T) + pdf("Target_MAF distribution.pdf") + MAF <- hist(maf_freq[,5],main = "MAF distribution", xlab = "MAF",col="#d6604d") + dev.off() + \ No newline at end of file diff --git a/bin/PRSice_mac b/bin/PRSice_mac new file mode 100644 index 0000000..5e72f5f Binary files /dev/null and b/bin/PRSice_mac differ diff --git a/bin/format_gwas.R b/bin/format_gwas.R new file mode 100644 index 0000000..7d0f552 --- /dev/null +++ b/bin/format_gwas.R @@ -0,0 +1,27 @@ +#!/usr/bin/env Rscript +args <- commandArgs(trailingOnly = TRUE) +input_file <- args[1] + +library(dplyr) +library(data.table) + +# Read GWAS +gwas_data <- read.table(input_file, header = TRUE, stringsAsFactors = FALSE) + +# Kolonları yeniden adlandırma ve seçme +formatted_gwas <- gwas_data %>% + dplyr::select(SNP, A1, A2, A1_FREQ, OR, SE, P, N) %>% + dplyr::rename( + freq = A1_FREQ, + b = OR, + se = SE, + p = P, + N = N + ) + +# Odds Ratio'yu log Odds Ratio'ya dönüştürme +formatted_gwas <- formatted_gwas %>% + dplyr::mutate(b = log(b)) + +# Yeni formattaki dosyayı dışa aktarma +write.table(formatted_gwas, "formatted_gwas.ma", quote = FALSE, row.names = FALSE, sep = "\t") diff --git a/bin/heterozygosity.R b/bin/heterozygosity.R new file mode 100644 index 0000000..a420cd7 --- /dev/null +++ b/bin/heterozygosity.R @@ -0,0 +1,8 @@ + #!/usr/bin/env Rscript + library(data.table) + args <- commandArgs(trailingOnly = TRUE) + het <- read.table(args[1], header =TRUE, as.is=T) + het_fail = subset(het, (het$HET_RATE < mean(het$HET_RATE)-3*sd(het$HET_RATE)) | (het$HET_RATE > mean(het$HET_RATE)+3*sd(het$HET_RATE))); + het_fail$HET_DST = (het_fail$HET_RATE-mean(het$HET_RATE))/sd(het$HET_RATE); + write.table(het_fail, 
"target_fail-het-qc.txt", row.names=FALSE) + \ No newline at end of file diff --git a/bin/ldak6.mac b/bin/ldak6.mac new file mode 100755 index 0000000..4f1ac80 Binary files /dev/null and b/bin/ldak6.mac differ diff --git a/bin/plot_het.R b/bin/plot_het.R new file mode 100644 index 0000000..9ddbfaf --- /dev/null +++ b/bin/plot_het.R @@ -0,0 +1,15 @@ +args <- commandArgs(trailingOnly = TRUE) +het_file <- args[1] + +library(ggplot2) +library(data.table) +library(dplyr) + +# Read data into R +het <- read.table(het_file, header = TRUE) +het$HET_RATE = (het$"N.NM." - het$"O.HOM.") / het$"N.NM." + +# Generate plot +pdf("het.pdf") +hist(het$HET_RATE, xlab = "Heterozygosity Rate", ylab = "Frequency", main = "Heterozygosity Rate", col = "#92c5de") +dev.off() diff --git a/bin/plot_hwe.R b/bin/plot_hwe.R new file mode 100644 index 0000000..24735cb --- /dev/null +++ b/bin/plot_hwe.R @@ -0,0 +1,14 @@ +args <- commandArgs(trailingOnly = TRUE) +hwe_file <- args[1] + +library(ggplot2) +library(data.table) +library(dplyr) + +# Read data into R +hwe <- read.table(hwe_file, header = TRUE) + +# Generate plot +pdf("hwe.pdf") +hist(hwe[, 9], main = "Histogram HWE", col = "#f4a582") +dev.off() diff --git a/bin/plot_ibd.R b/bin/plot_ibd.R new file mode 100644 index 0000000..cd64625 --- /dev/null +++ b/bin/plot_ibd.R @@ -0,0 +1,14 @@ +args <- commandArgs(trailingOnly = TRUE) +ibd_file <- args[1] + +library(ggplot2) +library(data.table) +library(dplyr) + +# Read data into R +ibd_data <- read.table(ibd_file, header = TRUE) + +# Generate plot +pdf("ibd_plot.pdf") +hist(ibd_data[, "PI_HAT"], main = "Histogram of IBD (PI_HAT)", col = "#66c2a4") +dev.off() diff --git a/bin/plot_maf.R b/bin/plot_maf.R new file mode 100644 index 0000000..423bf7f --- /dev/null +++ b/bin/plot_maf.R @@ -0,0 +1,12 @@ +library(ggplot2) +library(data.table) +library(dplyr) +args <- commandArgs(trailingOnly = TRUE) +maf_freq_file <- args[1] +# Read data into R +maf_freq <- read.table("MAF_check.frq", header =TRUE, 
as.is=T) + + # Generate plot + pdf("MAF_dist.pdf") + MAF <- hist(maf_freq[,5], main = "MAF distribution", xlab = "MAF", col="#d6604d") + dev.off() diff --git a/bin/process_het.R b/bin/process_het.R new file mode 100644 index 0000000..9ddbfaf --- /dev/null +++ b/bin/process_het.R @@ -0,0 +1,15 @@ +args <- commandArgs(trailingOnly = TRUE) +het_file <- args[1] + +library(ggplot2) +library(data.table) +library(dplyr) + +# Read data into R +het <- read.table(het_file, header = TRUE) +het$HET_RATE = (het$"N.NM." - het$"O.HOM.") / het$"N.NM." + +# Generate plot +pdf("het.pdf") +hist(het$HET_RATE, xlab = "Heterozygosity Rate", ylab = "Frequency", main = "Heterozygosity Rate", col = "#92c5de") +dev.off() diff --git a/eagle b/eagle new file mode 100644 index 0000000..e87cf74 Binary files /dev/null and b/eagle differ diff --git a/main.nf b/main.nf new file mode 100644 index 0000000..335e6cd --- /dev/null +++ b/main.nf @@ -0,0 +1,442 @@ +/* + * HELP MESSAGE + */ + +if (params.help) { + log.info """ + How to use : + + Required Arguments: + --target prefix of plink files (.bed, .bim, .fam) + + Optional Arguments: + --mind Threshold value to delete individuals with missingness + --geno Threshold value to delete SNPs with missingness + --maf Threshold value for minimum MAF frequencies of SNPs + --hwe_case Threshold value for Hardy-Weinberg Equilibrium for cases + --hwe_ctrl Threshold value for Hardy-Weinberg Equilibrium for controls + --indep_window_size The window size + --indep_step_size The number of SNPs to shift the window at each step + --indep_threshold The multiple correlation coefficient for a SNP being regressed on all other SNPs simultaneously.
+ --pihat The default threshold 0.1875 represents the half-way point between 2nd and 3rd degree relatives + --relatedness The same threshold with pihat value + --ref_phase Path of Reference BCF folder for Phasing + --genetic_map Path of genetic_map file for Phasing + --ref_imp Path of Reference bref3 folder for Imputation + --g_map Path of Reference genetic map folder for Imputation + --pca Number of principal components to compute, default is 10 + --pheno_file Name of the phenotype file located under the target folder + --run_plink Run the PLINK part of the workflow, default = true + --run_prsice Run the PRSice-2 part of the workflow, default = true + --run_pca Run the PCA part of the workflow if set to true, creates ".eigenvec" file + --run_LDpred2grid Run the LDPred2 Grid Model of the workflow , default = true + --run_LDpred2auto Run the LDPred2 Auto Model of the workflow, default = true + --run_Lassosum2 Run the Lassosum2 model of the workflow, default = true + --run_megaprs Run the MegaPRS model of the workflow, default = true + --ldak_executable Path to the LDAK executable (For MegaPRS) + --mega_model Model type for MegaPRS. Options: lasso, ridge, bayesr, etc.
Default = bayesr + --run_prscsx Run the PRScsx of the workflow, default = true + --run_mussel Run the MUSSEL of the workflow if set to true, default = false + --prsice_script Path to the PRSice R script + --prsice_executable Path to the PRSice executable + --prscsxref_dir Path to PRScsx reference panel, could be UKBB or 1KG + --prscsx_gwas1 1st GWAS sum stat for PRScsx analysis + --prscsx_gwas2 2nd GWAS sum stat for PRScsx analysis + --n_gwas1 Sample size for GWAS1 + --n_gwas2 Sample size for GWAS2 + --pop1 Ancestry of 1st Gwas sum stat, could be AFR,AMR,EAS,EUR,SAS + --pop2 Ancestry of 2nd Gwas sum stat, could be AFR,AMR,EAS,EUR,SAS + --phi Global shrinkage parameter phi, fixing phi to 1e-2(for highly polygenic traits) or 1e-4(for less polygenic traits) + --meta Return combined SNP effect sizes across populations using inverse variance weighted meta-analysis of population-specific posterior effect size estimates. Default is True. + --pack Path to MUSSEL folder + --data Path to MUSSEL data + --LDref Path to LDref folder for MUSSEL module + --sst Path to summary statistic data files for MUSSEL module + --pop Used populations for MUSSEL module, could be EUR, AFR, AMR, EAS or SAS + --mussel_chrom Specify the chromosomes to be analyzed by MUSSEL module , by default all chromosomes are analyzed + --bfile_tuning Path to PLINK binary input file prefix for tuning of MUSSEL module + --pheno_tuning Path to phenotype file (PLINK format) for tuning of MUSSEL module + --covar_tuning Path to quantitative covariates (PLINK format) for tuning + --bfile_testing Path to PLINK binary input file prefix for testing of MUSSEL module + --pheno_testing Path to phenotype file (PLINK format) for testing of MUSSEL module + --covar_testing Path to quantitative covariates (PLINK format) for testing + --trait_type Type of phenotype, continuous or binary for MUSSEL module.
Default: continuous + --NCORES How many cores to use for MUSSEL modules + --plink path to plink2 for MUSSEL module + """ + exit 1 +} + +println """ + + Quality Control Steps of Target Data + + ========================================= + target: ${params.target} + outdir: ${params.outdir} + visualization : ${params.graphs} + + --help parameter could be used to see arguments + +""".stripIndent() +include {ConvertVCFtoPLINK } from './modules/ConvertVCFtoPLINK' +include { GWAS_QC } from './modules/GWAS_QC' +include { QCSNPmissingness } from './modules/QCSNPmissingness' +include { QCindmissingness } from './modules/QCindmissingness' +include { FilterMAF } from './modules/FilterMAF' +include { PlotMAF } from './modules/PlotMAF' +include { RemoveLowMAF } from './modules/RemoveLowMAF' +include { CalculateHWE } from './modules/CalculateHWE' +include { PlotHWE } from './modules/PlotHWE' +include { InitialHWEFilter } from './modules/InitialHWEFilter' +include { SecondHWEFilter } from './modules/SecondHWEFilter' +include { IBDLDPruning } from './modules/IBDLDPruning' +include { GenerateIBD } from './modules/GenerateIBD' +include { PlotIBD } from './modules/PlotIBD' +include { RelCutoff } from './modules/RelCutoff' +include { FilterIndividuals } from './modules/FilterIndividuals' +include { CalculateHet } from './modules/CalculateHet' +include { PlotHet } from './modules/PlotHet' +include { ProcessHet } from './modules/ProcessHet' +include { CleanHetFailInd } from './modules/CleanHetFailInd' +include { RemoveHetOutliers } from './modules/RemoveHetOutliers' +include { FindDuplicateSNPs } from './modules/FindDuplicateSNPs' +include { RemoveDuplicateSNPs } from './modules/RemoveDuplicateSNPs' +include { ConvertPLINKtoVCF } from './modules/ConvertPLINKtoVCF' +include { SortVCF } from './modules/SortVCF.nf' +include { SplitVCF } from './modules/SplitVCF.nf' +include { PhaseVCF } from './modules/PhaseVCF' +include { Takersid } from './modules/Takersid' +include { ImputeVCF } from 
'./modules/ImputeVCF' +include { MergedVCF } from './modules/MergedVCF' +include { fastmixture } from './modules/fastmixture' +include { VCFtoPLINK } from './modules/VCFtoPLINK' +include { FindDuplicates } from './modules/FindDuplicates' +include { RemoveDuplicates } from './modules/RemoveDuplicates' +include { ClumpSNPs } from './modules/ClumpSNPs' +include { CreateValidSNPs } from './modules/CreateValidSNPs' +include { CreateSNPpvalue } from './modules/CreateSNPpvalue' +include { CreateRangeList } from './modules/CreateRangeList' +include { PruneSNPs } from './modules/PruneSNPs' +include { CalculatePCA } from './modules/CalculatePCA' +include { CalculatePGS } from './modules/CalculatePGS' +include { PRSice2 } from './modules/PRSice2.nf' +include { LDpred2grid } from './modules/LDpred2grid.nf' +include { LDpred2auto } from './modules/LDpred2auto.nf' +include { Lassosum2 } from './modules/Lassosum2.nf' +include { PRSCSx } from './modules/PRSCSx.nf' +include { Step1_Run_LDpred2 } from './modules/Step1_Run_LDpred2.nf' +include { Step2_LDpred2_Tuning } from './modules/Step2_LDpred2_Tuning.nf' +include { Step3_Run_MUSS } from './modules/Step3_Run_MUSS.nf' +include { Step4_Combine_PRS_Models } from './modules/Step4_Combine_PRS_Models.nf' +include { MegaPRS } from './modules/MegaPRS.nf' +include { FormatGWAS } from './modules/FormatGWAS.nf' +include { Impute } from './modules/Impute.nf' +include { Sbayesrc } from './modules/Sbayesrc.nf' + +workflow { + // Choose File Type + + if (params.vcf_to_plink) { + println "Files are VCF, Lets convert to PLINK format.." 
+ // Take VCF file + vcf_ch = Channel.fromPath("${params.vcf}").ifEmpty { error "VCF file not found: ${params.vcf}" } + converted_plink_ch = ConvertVCFtoPLINK(vcf_ch) + target_ch = converted_plink_ch + } else { + println "Files are PLINK format" + // Kullanıcı PLINK formatında dosya belirttiğinde bu süreç çalışır + target_ch = Channel.fromPath("${params.target_prefix}").ifEmpty { error "PLINK files not found: ${params.target_prefix}" } + } + + // Define the input channel + fail_het_qc_ch = Channel.fromPath("fail-het-qc.txt").ifEmpty { error "Input file not found: fail-het-qc.txt" } + gwas_sumstat_ch = Channel.fromPath("${params.target}/GWAS_sumstat_t1.txt").ifEmpty { error "Input file not found: ${params.target}/GWAS_sumstat_t1.txt" } + pheno_file_ch = Channel.fromPath("${params.pheno_file}").ifEmpty { error "Input file not found: ${params.pheno_file}"} + mega_summaries_ch = Channel.fromPath("${params.outdir}/quant.summaries").ifEmpty { error "Input file not found: ${params.outdir}/quant.summaries" } + ref_phase_ch = Channel.fromPath("${params.ref_phase}/Ref_chr${params.chr}_hg38.bcf.gz").ifEmpty { error "Input file not found: ${params.ref_phase}/Ref_chr${params.chr}_hg38.bcf.gz" } + genetic_map_ch = Channel.fromPath("$PWD/genetic_map_hg38_withX.txt.gz").ifEmpty { error "Input file not found: $PWD/genetic_map_hg38_withX.txt.gz"} + formatted_gwas_ch = Channel.fromPath("${params.outdir}/formatted_gwas.ma").ifEmpty { error "Input file not found: ${params.outdir}/formatted_gwas.ma" } + impute_out_ch = Channel.fromPath("$PWD/sbayesrc.imputed.ma").ifEmpty { error "Input file not found: $PWD/sbayesrc.imputed.ma"} + + // Execute QC Part + + // Execute GWAS QC + gwas_qc = GWAS_QC(gwas_sumstat_ch) + + // Execute QCSNPmissingness + qcsnp_out = QCSNPmissingness(target_ch) + + // Execute QCindmissingness + qcind_out = QCindmissingness(qcsnp_out) + + // Execute FilterMAF + maf_out = FilterMAF(qcind_out) + + // Separate the outputs + merged_bfile_ch = maf_out.map { it[0] } + 
maf_frq_ch = maf_out.map { it[1] } + + // Execute RemoveLowMAF + remove_low_maf_out = RemoveLowMAF(merged_bfile_ch.first()) + + // Execute PlotMAF + plot_out = PlotMAF(maf_frq_ch) + + // Execute CalculateHWE + hwe_out = CalculateHWE(remove_low_maf_out) + + // Execute PlotHWE + plot_hwe_out = PlotHWE(hwe_out) + + // Execute InitialHWEFilter + initial_hwe_filter_out = InitialHWEFilter(remove_low_maf_out) + + // Execute SecondHWEFilter + second_hwe_filter_out = SecondHWEFilter(initial_hwe_filter_out) + + // Execute IBD-LD-pruning + ibd_ld_pruning_out = IBDLDPruning(second_hwe_filter_out) + + // Extract the prune.in file + prune_in_file_ch = ibd_ld_pruning_out.prune_in + + // Execute GenerateIBD + generate_ibd_out = GenerateIBD(second_hwe_filter_out, prune_in_file_ch) + + // Execute PlotIBD + plot_ibd_out = PlotIBD(generate_ibd_out) + + // Execute RelCutoff + rel_cutoff_out = RelCutoff(second_hwe_filter_out, prune_in_file_ch) + + // Execute FilterIndividuals + filter_individuals_out = FilterIndividuals(second_hwe_filter_out, rel_cutoff_out) + + // Execute CalculateHet + calculate_het_out = CalculateHet(filter_individuals_out, prune_in_file_ch) + + // Execute PlotHet + plot_het_out = PlotHet(calculate_het_out) + + // Execute ProcessHet + process_het_out = ProcessHet(calculate_het_out) + + // Execute CleanHetFailInd + clean_het_fail_ind_out = CleanHetFailInd(fail_het_qc_ch) + + // Execute RemoveHetOutliers + remove_het_outliers_out = RemoveHetOutliers(filter_individuals_out, clean_het_fail_ind_out ) + + // Execute FindDuplicatesSNPs + find_duplicate_snps_out = FindDuplicateSNPs(remove_het_outliers_out) + + // Execute RemoveDuplicateSNPs + remove_duplicate_snps_out = RemoveDuplicateSNPs(remove_het_outliers_out, find_duplicate_snps_out ) + + // LD Pruning + prune_snps_out = PruneSNPs(remove_duplicate_snps_out) + + // Convert PLINK to VCF + vcf_out = ConvertPLINKtoVCF(remove_duplicate_snps_out) + + // Sort VCF Files + sorted_vcf_ch = SortVCF(input_vcf = vcf_out) + + // 
Split VCF into chromosomes + chr_ch = Channel.of(1..22) + + split_ch = SplitVCF(vcf_file = sorted_vcf_ch.first(), + chr = chr_ch.flatten()) + + // Phase VCF file + // Map split VCF files to corresponding reference files + split_ref_ch = split_ch + .map { splitted_vcf -> + // Extract chromosome number from the split VCF filename + def chr = splitted_vcf.name.replaceAll(/.*vcfbychrom_(\d+)\.vcf\.gz/, '$1') + def ref_bcf = file("${params.ref_phase}/Ref_chr${chr}_hg38.bcf.gz") + [splitted_vcf, ref_bcf, chr] + } + .filter { tuple -> tuple[0] && tuple[1].exists() } // Ensure both files exist + + phased_vcf_ch = PhaseVCF( + splitted_vcf = split_ref_ch.map { it[0] }, + ref_bcf = split_ref_ch.map { it[1] }, + genetic_map = file("${params.genetic_map}"), + chr = split_ref_ch.map { it[2] } + ) + phased_vcf_ch.view { "Processing (Phasing): Phased VCF=${it[0]}, Ref=${it[1]}, Chr=${it[2]}" } + + // Takersid + takersid_input_ch = phased_vcf_ch + .map { phased_vcf -> + // Extract chromosome number from phased VCF filename + def chr = phased_vcf.name.replaceAll(/.*phasedvcf_chr(\d+)\.vcf\.gz/, '$1') + [phased_vcf, chr] + } + .filter { tuple -> tuple[0].exists() } + + rsid_map_ch = Takersid ( + phased_vcf = takersid_input_ch.map { it[0] }, + chr = takersid_input_ch.map { it[1] } + ) + + // Imputation + // Map phased VCF files and reference files for imputations and combine rsid + imputation_input_ch = phased_vcf_ch + .map { phased_vcf -> + def chr = phased_vcf.name.replaceAll(/.*phasedvcf_chr(\d+)\.vcf\.gz/, '$1') + def ref_bref = file("${params.ref_imp}/Ref_chr${chr}_hg38.bref3") + def g_map = file("${params.g_map}/plink.chr${chr}.GRCh38.map") + [phased_vcf, ref_bref, g_map, chr] + } + .filter { tuple -> tuple[0] && tuple[1].exists() && tuple[2].exists() } // Ensure all files exist + + // Debug to view matched files + imputation_input_ch.view { + "Processing (Imputation): Phased VCF=${it[0]}, Ref=${it[1]}, Map=${it[2]}, Chr=${it[3]}" + } + + // Run imputation process + 
imputed_vcf_ch = ImputeVCF( + phased_vcf = imputation_input_ch.map { it[0] }, + ref_bref = imputation_input_ch.map { it[1] }, + g_map = imputation_input_ch.map { it[2] }, + chr = imputation_input_ch.map { it[3] }, + rsid_map = rsid_map_ch + ) + + + // Merge VCF Files + merged_vcf_ch = MergedVCF(imputed_vcf_ch.collect()) + + // Convert Imputed VCF to PLINK format + plink_out = VCFtoPLINK(merged_vcf_ch) + + // Remove Duplicates + find_duplicates = FindDuplicates(plink_out) + + last_plink_out = RemoveDuplicates(plink_out, find_duplicates) + + // Target Ancestry Inference + ancestry_inf = fastmixture(last_plink_out) + + // Conditionally execute PLINK processes + if (params.run_plink) { + clump_snps_out = ClumpSNPs(last_plink_out, gwas_qc) + create_valid_snps_out = CreateValidSNPs(clump_snps_out) + create_snp_pvalue_out = CreateSNPpvalue(gwas_qc) + create_range_list_out = CreateRangeList() + calculate_pgs_out = CalculatePGS( + last = last_plink_out, + gwas = gwas_qc, + range_list = create_range_list_out, + valid_snp = create_valid_snps_out, + snp_pvalue = create_snp_pvalue_out + ) + } + + if (params.run_pca) { + // Execute CalculatePCA and extract .eigenvec file + calculate_pca_out = CalculatePCA(last_plink_out, prune_snps_out) + + // Separate the outputs + eigenvec_ch = calculate_pca_out.map { it[0] } + eigenval_ch = calculate_pca_out.map { it[1] } + } + + // Conditionally execute PRSice-2 processes + if (params.run_prsice) { + prsice_out = PRSice2( + last = last_plink_out, + gwas = gwas_qc, + pheno_file = pheno_file_ch, + eigenvec = eigenvec_ch, + ) + } + + // Conditionally execute LDpred grid process + if (params.run_LDpred2grid) { + bed_ch = last_plink_out.map { it[0] } + + LDpred2grid( + pheno_file = pheno_file_ch, + eigenvec = eigenvec_ch, + gwas = gwas_qc, + last = last_plink_out + + ) + } + + // Conditionally execute LDpred auto process + if (params.run_LDpred2auto) { + bed_ch = last_plink_out.map { it[0] } + + LDpred2auto( + pheno_file = pheno_file_ch, + 
eigenvec = eigenvec_ch, + gwas = gwas_qc, + last = last_plink_out, + + ) + } + + // Conditionally execute Lassosum2 process + if (params.run_Lassosum2) { + bed_ch = last_plink_out.map { it[0] } + + Lassosum2( + pheno_file = pheno_file_ch, + eigenvec = eigenvec_ch, + gwas = gwas_qc, + last = last_plink_out, + + ) + } + + // Conditionally execute PRS-CSx process + if (params.run_prscsx) { + PRSCSx( + last = last_plink_out + ) + } + + // Conditionally execute MUSSEL process + if (params.run_mussel) { + ldpred2_outputs = Step1_Run_LDpred2() + ldpred2_tuned_outputs = Step2_LDpred2_Tuning(ldpred2_outputs) + muss_outputs = Step3_Run_MUSS() + Step4_Combine_PRS_Models() + } + + // Conditionally execute MegaPRS process + + if (params.run_megaprs) { + mega_prs_out = MegaPRS( + target_qc_prefix = last_plink_out, + pheno_file = pheno_file_ch, + mega_summaries = mega_summaries_ch) + } + + if (params.run_sbayesrc) { + formatted_gwas = FormatGWAS(gwas_qc) + impute_out = Impute( + ld_folder = file(params.ld_folder), + ma_file = formatted_gwas_ch, + out_prefix = params.out_prefix, + threads = params.threads + ) + sbayesr_out = Sbayesrc( + ld_folder = file(params.ld_folder), + imp_file = impute_out_ch, + annot = file(params.annot_file), + out_prefix = params.out_prefix, + threads = params.threads + ) + } +} + + + + diff --git a/modules/.DS_Store b/modules/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/modules/.DS_Store differ diff --git a/modules/CalculateHWE.nf b/modules/CalculateHWE.nf new file mode 100644 index 0000000..732b2e3 --- /dev/null +++ b/modules/CalculateHWE.nf @@ -0,0 +1,16 @@ +process CalculateHWE { + tag "Calculating Hardy-Weinberg Equilibrium" + debug true + publishDir "${params.outdir}", mode: 'copy' + + input: + path target_4 + + output: + path "HWE.hwe" + + script: + """ + plink --bfile target_4 --hardy --out HWE + """ +} diff --git a/modules/CalculateHet.nf b/modules/CalculateHet.nf new file mode 100644 index 0000000..3d3e7e0 
--- /dev/null +++ b/modules/CalculateHet.nf @@ -0,0 +1,17 @@ +process CalculateHet { + tag "Calculate heterozygosity rate" + debug true + publishDir "${params.outdir}", mode: 'copy' + + input: + path target_7 + path prune_in_file + + output: + path "het_R_check.het" + + script: + """ + plink --bfile target_7 --extract ${prune_in_file} --het --out het_R_check + """ +} diff --git a/modules/CalculatePCA.nf b/modules/CalculatePCA.nf new file mode 100644 index 0000000..e91e2a9 --- /dev/null +++ b/modules/CalculatePCA.nf @@ -0,0 +1,20 @@ +process CalculatePCA { + tag "CalculatePCA" + publishDir "${params.outdir}", mode: 'copy' + + input: + path last + path prune_in_file + + output: + tuple path("*.eigenvec"), path("*.eigenval") + + + script: + """ + plink --bfile last \ + --extract ${prune_in_file} \ + --pca ${params.pca} \ + --out last_OR + """ +} diff --git a/modules/CalculatePGS.nf b/modules/CalculatePGS.nf new file mode 100644 index 0000000..3642bed --- /dev/null +++ b/modules/CalculatePGS.nf @@ -0,0 +1,25 @@ +process CalculatePGS { + tag "Calculate PGS by PLINK" + debug true + publishDir "${params.outdir}", mode: 'copy' + + input: + path last + path gwas + path range_list + path valid_snp + path snp_pvalue + + output: + path "target_9_OR.*" + + script: + """ + plink \ + --bfile last \ + --score ${gwas} 3 5 8 header \ + --q-score-range ${range_list} ${snp_pvalue} \ + --extract ${valid_snp} \ + --out target_9_OR + """ +} diff --git a/modules/CleanHetFailInd.nf b/modules/CleanHetFailInd.nf new file mode 100644 index 0000000..2f8fca2 --- /dev/null +++ b/modules/CleanHetFailInd.nf @@ -0,0 +1,16 @@ +process CleanHetFailInd { + tag "Clean het fail individuals file" + debug true + publishDir "${params.outdir}", mode: 'copy' + + input: + path fail_het_qc_file + + output: + path "het_fail_ind.txt" + + script: + """ + sed 's/\"//g' ${fail_het_qc_file} | awk '{print \$1, \$2}' > het_fail_ind.txt + """ +} diff --git a/modules/ClumpSNPs.nf b/modules/ClumpSNPs.nf new file mode 
100644 index 0000000..9bb7585 --- /dev/null +++ b/modules/ClumpSNPs.nf @@ -0,0 +1,24 @@ +process ClumpSNPs { + tag "ClumpSNPs" + debug true + publishDir "${params.outdir}", mode: 'copy' + + input: + path last + path gwas + + output: + path "last.clumped" + + script: + """ + plink --bfile last \ + --clump-p1 1 \ + --clump-r2 0.1 \ + --clump-kb 250 \ + --clump ${gwas} \ + --clump-snp-field SNP \ + --clump-field P \ + --out last + """ +} diff --git a/modules/ConvertPLINKtoVCF.nf b/modules/ConvertPLINKtoVCF.nf new file mode 100644 index 0000000..5e56809 --- /dev/null +++ b/modules/ConvertPLINKtoVCF.nf @@ -0,0 +1,15 @@ +process ConvertPLINKtoVCF { + tag "Convert PLINK to VCF and index files" + publishDir "${params.outdir}", mode: 'copy' + + input: + path target_9 + + output: + path "*.vcf" + + script: + """ + plink --bfile target_9 --recode vcf --out target_9 + """ +} diff --git a/modules/ConvertVCFtoPLINK.nf b/modules/ConvertVCFtoPLINK.nf new file mode 100644 index 0000000..3864a9b --- /dev/null +++ b/modules/ConvertVCFtoPLINK.nf @@ -0,0 +1,20 @@ +process ConvertVCFtoPLINK { + tag "Convert VCF to PLINK" + publishDir "${params.target}", mode: 'copy' + + input: + path vcf_file + + output: + path "target.*" + + script: + """ + # Step 1: Index the VCF file + bcftools sort ${vcf_file} -Oz -o target.vcf.gz + tabix -p vcf target.vcf.gz || { echo "Tabix indexing failed"; exit 1; } + + # Step 2: Convert VCF to PLINK format + plink --vcf target.vcf.gz --allow-extra-chr --make-bed --out target || { echo "PLINK conversion failed"; exit 1; } + """ +} diff --git a/modules/CreateRangeList.nf b/modules/CreateRangeList.nf new file mode 100644 index 0000000..8954240 --- /dev/null +++ b/modules/CreateRangeList.nf @@ -0,0 +1,19 @@ +process CreateRangeList { + tag "CreateRangeList for P+T" + debug true + publishDir "${params.outdir}", mode: 'copy' + + output: + path "range_list" + + script: + """ + echo "0.001 0 0.001" > range_list + echo "0.05 0 0.05" >> range_list + echo "0.1 0 0.1" >> 
range_list + echo "0.2 0 0.2" >> range_list + echo "0.3 0 0.3" >> range_list + echo "0.4 0 0.4" >> range_list + echo "0.5 0 0.5" >> range_list + """ +} diff --git a/modules/CreateSNPpvalue.nf b/modules/CreateSNPpvalue.nf new file mode 100644 index 0000000..ae7f5e7 --- /dev/null +++ b/modules/CreateSNPpvalue.nf @@ -0,0 +1,16 @@ +process CreateSNPpvalue { + tag "CreateSNPpvalue" + debug true + publishDir "${params.outdir}", mode: 'copy' + + input: + path gwas_sumstat + + output: + path "SNP.pvalue" + + script: + """ + awk '{print \$3,\$13}' ${gwas_sumstat} > SNP.pvalue + """ +} diff --git a/modules/CreateValidSNPs.nf b/modules/CreateValidSNPs.nf new file mode 100644 index 0000000..4a545a6 --- /dev/null +++ b/modules/CreateValidSNPs.nf @@ -0,0 +1,16 @@ +process CreateValidSNPs { + tag "CreateValidSNPs" + debug true + publishDir "${params.outdir}", mode: 'copy' + + input: + path clumped_file + + output: + path "last.valid.snp" + + script: + """ + awk 'NR!=1{print \$3}' ${clumped_file} > last.valid.snp + """ +} diff --git a/modules/FilterIndividuals.nf b/modules/FilterIndividuals.nf new file mode 100644 index 0000000..cc8f18a --- /dev/null +++ b/modules/FilterIndividuals.nf @@ -0,0 +1,17 @@ +process FilterIndividuals { + tag "Filter individuals based on relatedness" + debug true + publishDir "${params.outdir}", mode: 'copy' + + input: + path target_6 + path rel_id_file + + output: + path "target_7.*" + + script: + """ + plink --bfile target_6 --keep ${rel_id_file} --make-bed --out target_7 + """ +} diff --git a/modules/FilterMAF.nf b/modules/FilterMAF.nf new file mode 100644 index 0000000..06d9b0a --- /dev/null +++ b/modules/FilterMAF.nf @@ -0,0 +1,18 @@ +process FilterMAF { + tag "Filtering autosomal SNPs and calculating MAF" + debug true + publishDir "${params.outdir}", mode: 'copy' + + input: + path target_2 + + output: + tuple path("target_3.*"), path("MAF_check.frq") + + script: + """ + awk '{ if (\$1 >= 1 && \$1 <= 22) print \$2 }' target_2.bim > snp_1_22.txt + 
plink --bfile target_2 --extract snp_1_22.txt --make-bed --out target_3 + plink --bfile target_3 --freq --out MAF_check + """ +} diff --git a/modules/FindDuplicateSNPs.nf b/modules/FindDuplicateSNPs.nf new file mode 100644 index 0000000..8c483d0 --- /dev/null +++ b/modules/FindDuplicateSNPs.nf @@ -0,0 +1,16 @@ +process FindDuplicateSNPs { + tag "Find duplicate SNPs" + debug true + publishDir "${params.outdir}", mode: 'copy' + + input: + path target_8 + + output: + path "target_8_duplicate_snps.list" + + script: + """ + awk '{print \$2}' target_8.bim | sort | uniq -d > target_8_duplicate_snps.list + """ +} diff --git a/modules/FindDuplicates.nf b/modules/FindDuplicates.nf new file mode 100644 index 0000000..5e372c2 --- /dev/null +++ b/modules/FindDuplicates.nf @@ -0,0 +1,16 @@ +process FindDuplicates { + tag "Find duplicate SNPs " + debug true + publishDir "${params.outdir}", mode: 'copy' + + input: + path merged + + output: + path "last_duplicate_snps.list" + + script: + """ + awk '{print \$2}' merged.bim | sort | uniq -d > last_duplicate_snps.list + """ +} diff --git a/modules/FormatGWAS.nf b/modules/FormatGWAS.nf new file mode 100644 index 0000000..694480a --- /dev/null +++ b/modules/FormatGWAS.nf @@ -0,0 +1,18 @@ +process FormatGWAS { + tag "Format GWAS for SBayesR-C" + debug true + publishDir "${params.outdir}", mode: 'copy' + + input: + path gwas + + output: + path "formatted_gwas.ma" + + script: + """ + Rscript ${params.formatgwas} ${gwas} + """ +} + + diff --git a/modules/GWAS_QC.nf b/modules/GWAS_QC.nf new file mode 100644 index 0000000..eecdc52 --- /dev/null +++ b/modules/GWAS_QC.nf @@ -0,0 +1,24 @@ +process GWAS_QC { + tag "GWAS QC Steps" + debug true + publishDir "${params.outdir}", mode: 'copy' + + input: + path gwas_sumstat + + output: + path 'GWAS_QC.*' + + script: + """ + echo "Input file: ${gwas_sumstat}" + awk 'NR==1 || (\$6 > 0.01) && (\$14 > 0.8) {print}' < ${gwas_sumstat} |\ + awk '{seen[\$3]++; if(seen[\$3]==1){ print}}' > GWAS_QC.txt + """ + 
+} + + + + + diff --git a/modules/GenerateIBD.nf b/modules/GenerateIBD.nf new file mode 100644 index 0000000..38fe5b5 --- /dev/null +++ b/modules/GenerateIBD.nf @@ -0,0 +1,16 @@ +process GenerateIBD { + tag "Generate genome-wide IBD estimates" + publishDir "${params.outdir}", mode: 'copy' + + input: + path target_6 + path prune_in_file + + output: + path "ibd_prune.*" + + script: + """ + plink --bfile target_6 --extract ${prune_in_file} --genome --out ibd_prune + """ +} diff --git a/modules/IBDLDPruning.nf b/modules/IBDLDPruning.nf new file mode 100644 index 0000000..a488916 --- /dev/null +++ b/modules/IBDLDPruning.nf @@ -0,0 +1,16 @@ +process IBDLDPruning { + tag "IBD-LD-pruning" + publishDir "${params.outdir}", mode: 'copy' + + input: + path target_6 + + output: + path "merged_indepSNP.prune.in", emit: prune_in + path "merged_indepSNP.prune.out" + + script: + """ + plink --bfile target_6 --indep-pairwise ${params.indep_window_size} ${params.indep_step_size} ${params.indep_threshold} --out merged_indepSNP + """ +} diff --git a/modules/Impute.nf b/modules/Impute.nf new file mode 100644 index 0000000..83c62ff --- /dev/null +++ b/modules/Impute.nf @@ -0,0 +1,19 @@ +process Impute { + tag "Run SBayesR-C" + debug true + publishDir "${params.outdir}", mode: 'copy' + + input: + path ld_folder + path ma_file + val out_prefix + val threads + + script: + """ + # Main: SBayesRC + docker pull zhiliz/sbayesrc + docker run -v ${PWD}:/data zhiliz/sbayesrc --ldm-eigen /data/${ld_folder} --gwas-summary /data/${ma_file} --impute-summary --out /data/${out_prefix} --threads ${threads} + """ + +} \ No newline at end of file diff --git a/modules/ImputeVCF.nf b/modules/ImputeVCF.nf new file mode 100644 index 0000000..9990687 --- /dev/null +++ b/modules/ImputeVCF.nf @@ -0,0 +1,37 @@ +process ImputeVCF { + tag "Imputation for chromosome ${chr} and QC" + publishDir "${params.outdir}", mode: 'copy' + + input: + path phased_vcf + path ref_bref + path g_map + val chr + path rsid_map + + 
output: + path "imputed_chr${chr}_with_rsid.vcf.gz" + + script: + """ + tabix -p vcf ${phased_vcf} + beagle ref=${ref_bref} gt=${phased_vcf} map=${g_map} out=imputed_chr${chr} + bcftools index imputed_chr${chr}.vcf.gz + bcftools view -O z -o imputed_chr${chr}_QC.vcf.gz -e 'INFO/DR2<0.8' imputed_chr${chr}.vcf.gz + bcftools index imputed_chr${chr}_QC.vcf.gz + + awk 'BEGIN {OFS="\t"} {if (NR > 1) print \$1, \$2, \$3, ".", ".", ".", ".", "."}' ${rsid_map} > rsid${chr}.vcf + bgzip rsid${chr}.vcf + tabix -p vcf rsid${chr}.vcf.gz + bcftools annotate -a rsid${chr}.vcf.gz -c CHROM,POS,ID -o imputed_chr${chr}_with_rsid.vcf -O v imputed_chr${chr}_QC.vcf.gz + bgzip imputed_chr${chr}_with_rsid.vcf + bcftools index imputed_chr${chr}_with_rsid.vcf.gz + + + sleep 200 + echo "Files are ready, proceeding." + """ +} + + + diff --git a/modules/InitialHWEFilter.nf b/modules/InitialHWEFilter.nf new file mode 100644 index 0000000..16420b0 --- /dev/null +++ b/modules/InitialHWEFilter.nf @@ -0,0 +1,16 @@ +process InitialHWEFilter { + tag "Initial HWE filtering" + debug true + publishDir "${params.outdir}", mode: 'copy' + + input: + path target_4 + + output: + path "target_5*" + + script: + """ + plink --bfile target_4 --hwe ${params.hwe_case} --make-bed --out target_5 + """ +} diff --git a/modules/LDpred2auto.nf b/modules/LDpred2auto.nf new file mode 100644 index 0000000..112a467 --- /dev/null +++ b/modules/LDpred2auto.nf @@ -0,0 +1,20 @@ +process LDpred2auto { + tag "Calculate PGS with LDpred2 Auto Model" + debug true + publishDir "${params.outdir}", mode: 'copy' + + input: + path pheno_file + path eigenvec + path gwas + path last + + output: + path "*" + + script: + """ + Rscript ${params.ldpred_auto_script} ${pheno_file} ${eigenvec} ${gwas} ${last} + """ +} + diff --git a/modules/LDpred2grid.nf b/modules/LDpred2grid.nf new file mode 100644 index 0000000..be34620 --- /dev/null +++ b/modules/LDpred2grid.nf @@ -0,0 +1,20 @@ +process LDpred2grid { + tag "Calculate PGS with LDpred2 Grid 
Model" + debug true + publishDir "${params.outdir}", mode: 'copy' + + input: + path pheno_file + path eigenvec + path gwas + path last + + output: + path "*" + + script: + """ + Rscript ${params.ldpred_grid_script} ${pheno_file} ${eigenvec} ${gwas} ${last} + """ +} + diff --git a/modules/Lassosum2.nf b/modules/Lassosum2.nf new file mode 100644 index 0000000..76919fb --- /dev/null +++ b/modules/Lassosum2.nf @@ -0,0 +1,20 @@ +process Lassosum2 { + tag "Lassosum2" + debug true + publishDir "${params.outdir}", mode: 'copy' + + input: + path pheno_file + path eigenvec + path gwas + path last + + output: + path "*" + + script: + """ + Rscript ${params.lassosum2_script} ${pheno_file} ${eigenvec} ${gwas} ${last} + """ +} + diff --git a/modules/MegaPRS.nf b/modules/MegaPRS.nf new file mode 100644 index 0000000..ba28bf6 --- /dev/null +++ b/modules/MegaPRS.nf @@ -0,0 +1,32 @@ +process MegaPRS { + tag "Run MegaPRS Workflow" + debug true + publishDir "${params.outdir}", mode: 'copy' + + input: + path target_qc_prefix + path pheno_file + path mega_summaries + + output: + path "*" + + script: + """ + # Run MegaPRS logistic regression + ${params.ldak_executable} --logistic quant --bfile ${params.target_qc_prefix} --pheno ${params.pheno_file} + sleep 60 + ${params.ldak_executable} --calc-cors cors --bfile ${params.target_qc_prefix} + ${params.ldak_executable} --mega-prs mega --model ${params.mega_model} --summary ${params.mega_summaries} --power -0.25 --cors cors --allow-ambiguous YES + + """ +} + + + + + + + + + diff --git a/modules/MergedVCF.nf b/modules/MergedVCF.nf new file mode 100644 index 0000000..8dd976a --- /dev/null +++ b/modules/MergedVCF.nf @@ -0,0 +1,23 @@ +process MergedVCF { + tag "Convert Last QC ed VCF files into PLINK format" + publishDir "${params.outdir}", mode: 'copy' + + input: + path imputed_vcfs + + output: + path "merged_output.vcf.gz" + + script: + """ + for vcf in ${imputed_vcfs}; do + echo "Indexing \$vcf" + bcftools index \$vcf + done + + bcftools 
concat ${imputed_vcfs.join(' ')} -Oz -o merged_output.vcf.gz + """ +} + + + diff --git a/modules/PRSCSx.nf b/modules/PRSCSx.nf new file mode 100644 index 0000000..4a31129 --- /dev/null +++ b/modules/PRSCSx.nf @@ -0,0 +1,28 @@ +process PRSCSx { + tag "PRS-CSx" + debug true + publishDir "${params.outdir}", mode: 'copy' + + input: + path last + + + output: + path "target_ukbb_meta*" + + script: + """ + echo "Output directory: ${params.outdir}" + python /sw/bip_apps/PRScsx/PRScsx.py \ + --ref_dir=${params.prscsxref_dir} \ + --bim_prefix=last \ + --sst_file=${params.prscsx_gwas1},${params.prscsx_gwas2} \ + --n_gwas=${params.n_gwas1},${params.n_gwas2} \ + --pop=${params.pop1},${params.pop2} \ + --chrom=${params.chrom} \ + --phi=${params.phi} \ + --meta=${params.meta} \ + --out_dir=./ \ + --out_name=${params.out_name} + """ +} diff --git a/modules/PRSice2.nf b/modules/PRSice2.nf new file mode 100644 index 0000000..df17647 --- /dev/null +++ b/modules/PRSice2.nf @@ -0,0 +1,29 @@ +process PRSice2 { + tag "CalculatePGS with PRSice2" + debug true + publishDir "${params.outdir}", mode: 'copy' + + input: + path last + path gwas + path pheno_file + path eigenvec + + output: + path "last_PRSice2*" + + script: + """ + Rscript ${params.prsice_script} \ + --prsice ${params.prsice_executable} \ + --base ${gwas} \ + --target last \ + --binary-target T \ + --pheno ${pheno_file} \ + --cov ${eigenvec} \ + --stat OR \ + --or \ + --out last_PRSice2 + """ +} + diff --git a/modules/PhaseVCF.nf b/modules/PhaseVCF.nf new file mode 100644 index 0000000..980f648 --- /dev/null +++ b/modules/PhaseVCF.nf @@ -0,0 +1,31 @@ +process PhaseVCF { + tag "Phasing of VCF file" + tag { "chr${chr}" } + publishDir "${params.outdir}", mode: 'copy' + + input: + path splitted_vcf + path ref_bcf + path genetic_map + val chr + + output: + path "phasedvcf_chr${chr}.vcf.gz" + + + script: + """ + bcftools index ${ref_bcf} + tabix -p vcf ${splitted_vcf} + $PWD/eagle \ + --vcfTarget ${splitted_vcf} \ + --vcfRef 
${ref_bcf} \ + --geneticMapFile ${genetic_map} \ + --vcfOutFormat z \ + --numThreads 24 \ + --outPrefix phasedvcf_chr${chr} + + + + """ +} diff --git a/modules/PlotHWE.nf b/modules/PlotHWE.nf new file mode 100644 index 0000000..cc2235f --- /dev/null +++ b/modules/PlotHWE.nf @@ -0,0 +1,16 @@ +process PlotHWE { + tag "Plotting Hardy-Weinberg Equilibrium" + debug true + publishDir "${params.graphs}", mode: 'copy' + + input: + path hwe_file + + output: + path "hwe.pdf" + + script: + """ + Rscript $PWD/bin/plot_hwe.R ${hwe_file} + """ +} diff --git a/modules/PlotHet.nf b/modules/PlotHet.nf new file mode 100644 index 0000000..c90d6b6 --- /dev/null +++ b/modules/PlotHet.nf @@ -0,0 +1,15 @@ +process PlotHet { + tag "Plot heterozygosity rate" + publishDir "${params.graphs}", mode: 'copy' + + input: + path het_file + + output: + path "het.pdf" + + script: + """ + Rscript $PWD/bin/plot_het.R ${het_file} + """ +} diff --git a/modules/PlotIBD.nf b/modules/PlotIBD.nf new file mode 100644 index 0000000..fc44287 --- /dev/null +++ b/modules/PlotIBD.nf @@ -0,0 +1,15 @@ +process PlotIBD { + tag "Plot IBD distribution" + publishDir "${params.graphs}", mode: 'copy' + + input: + path ibd_file + + output: + path "ibd_plot.pdf" + + script: + """ + Rscript $PWD/bin/plot_ibd.R ${ibd_file} + """ +} diff --git a/modules/PlotMAF.nf b/modules/PlotMAF.nf new file mode 100644 index 0000000..8a0b46a --- /dev/null +++ b/modules/PlotMAF.nf @@ -0,0 +1,16 @@ +process PlotMAF { + tag "Generate MAF distribution plot" + debug true + publishDir "${params.graphs}", mode: 'copy' + + input: + path maf_freq_file + + output: + path 'MAF_dist.pdf' + + script: + """ + Rscript $PWD/bin/plot_maf.R ${maf_freq_file} + """ +} diff --git a/modules/ProcessHet.nf b/modules/ProcessHet.nf new file mode 100644 index 0000000..13b472f --- /dev/null +++ b/modules/ProcessHet.nf @@ -0,0 +1,16 @@ +process ProcessHet { + tag "Process heterozygosity data" + debug true + publishDir "${params.graphs}", mode: 'copy' + + input: + path 
het_file + + output: + path "het.pdf" + + script: + """ + Rscript $PWD/bin/process_het.R ${het_file} + """ +} diff --git a/modules/PruneSNPs.nf b/modules/PruneSNPs.nf new file mode 100644 index 0000000..45f63ae --- /dev/null +++ b/modules/PruneSNPs.nf @@ -0,0 +1,18 @@ +process PruneSNPs { + tag "PruneSNPs by PLINK" + publishDir "${params.outdir}", mode: 'copy' + + input: + path target_9 + + output: + path "target_9_OR.prune.in" + + script: + """ + plink \ + --bfile target_9 \ + --indep-pairwise ${params.indep_window_size} ${params.indep_step_size} ${params.indep_threshold} \ + --out target_9_OR + """ +} diff --git a/modules/QCSNPmissingness.nf b/modules/QCSNPmissingness.nf new file mode 100644 index 0000000..ffc912b --- /dev/null +++ b/modules/QCSNPmissingness.nf @@ -0,0 +1,16 @@ +process QCSNPmissingness { + tag "Removal of missing SNPs according to the specified threshold value" + debug true + publishDir "${params.outdir}", mode: 'copy' + + input: + path target + + output: + path "target_*" + + script: + """ + plink --bfile ${params.target_prefix} --geno ${params.geno} --make-bed --out target_1 + """ +} diff --git a/modules/QCindmissingness.nf b/modules/QCindmissingness.nf new file mode 100644 index 0000000..ab4a624 --- /dev/null +++ b/modules/QCindmissingness.nf @@ -0,0 +1,16 @@ +process QCindmissingness { + tag "Removal of missing individuals according to the specified threshold value" + debug true + publishDir "${params.outdir}", mode: 'copy' + + input: + path target_1 + + output: + path 'target_2.*' + + script: + """ + plink --bfile target_1 --mind ${params.mind} --make-bed --out target_2 + """ +} diff --git a/modules/RelCutoff.nf b/modules/RelCutoff.nf new file mode 100644 index 0000000..55c02ca --- /dev/null +++ b/modules/RelCutoff.nf @@ -0,0 +1,16 @@ +process RelCutoff { + tag "Create file with non-relative individuals" + publishDir "${params.outdir}", mode: 'copy' + + input: + path target_6 + path prune_in_file + + output: + path "pihat.rel.id" + + 
script: + """ + plink --bfile target_6 --extract ${prune_in_file} --rel-cutoff ${params.pihat} --out pihat + """ +} diff --git a/modules/RemoveDuplicateSNPs.nf b/modules/RemoveDuplicateSNPs.nf new file mode 100644 index 0000000..ba074cd --- /dev/null +++ b/modules/RemoveDuplicateSNPs.nf @@ -0,0 +1,17 @@ +process RemoveDuplicateSNPs { + tag "Remove duplicate SNPs" + debug true + publishDir "${params.outdir}", mode: 'copy' + + input: + path target_8 + path duplicate_snps_list + + output: + path "target_9.*" + + script: + """ + plink --bfile target_8 --exclude ${duplicate_snps_list} --make-bed --out target_9 + """ +} diff --git a/modules/RemoveDuplicates.nf b/modules/RemoveDuplicates.nf new file mode 100644 index 0000000..8f881e9 --- /dev/null +++ b/modules/RemoveDuplicates.nf @@ -0,0 +1,17 @@ +process RemoveDuplicates { + tag "Remove duplicates" + debug true + publishDir "${params.outdir}", mode: 'copy' + + input: + path last + path duplicate_snps_list + + output: + path "last.*" + + script: + """ + plink --bfile merged --exclude ${duplicate_snps_list} --make-bed --out last + """ +} diff --git a/modules/RemoveHetOutliers.nf b/modules/RemoveHetOutliers.nf new file mode 100644 index 0000000..f2640f7 --- /dev/null +++ b/modules/RemoveHetOutliers.nf @@ -0,0 +1,17 @@ +process RemoveHetOutliers { + tag "Remove heterozygosity rate outliers" + debug true + publishDir "${params.outdir}", mode: 'copy' + + input: + path target_7 + path het_fail_ind_file + + output: + path "target_8.*" + + script: + """ + plink --bfile target_7 --allow-no-sex --remove ${het_fail_ind_file} --make-bed --out target_8 + """ +} diff --git a/modules/RemoveLowMAF.nf b/modules/RemoveLowMAF.nf new file mode 100644 index 0000000..ed8917b --- /dev/null +++ b/modules/RemoveLowMAF.nf @@ -0,0 +1,16 @@ +process RemoveLowMAF { + tag "Removing SNPs with low MAF frequency" + debug true + publishDir "${params.outdir}", mode: 'copy' + + input: + path target_3 + + output: + path "target_4.*" + + script: + """ + 
plink --bfile target_3 --maf ${params.maf} --make-bed --out target_4 + """ +} diff --git a/modules/Sbayesrc.nf b/modules/Sbayesrc.nf new file mode 100644 index 0000000..77d5805 --- /dev/null +++ b/modules/Sbayesrc.nf @@ -0,0 +1,19 @@ +process Sbayesrc { + tag "Run SBayesR-C" + debug true + publishDir "${params.outdir}", mode: 'copy' + + input: + path ld_folder + path imp_file + path annot + val out_prefix + val threads + + script: + """ + # Main: SBayesRC + docker pull zhiliz/sbayesrc + docker run -v ${PWD}:/data zhiliz/sbayesrc --ldm-eigen /data/${params.ld_folder} --gwas-summary /data/${imp_file} --sbayes RC --annot /data/${params.annot_file} --out /data/${params.out_prefix} --threads ${params.threads} + """ +} diff --git a/modules/SecondHWEFilter.nf b/modules/SecondHWEFilter.nf new file mode 100644 index 0000000..ee716fe --- /dev/null +++ b/modules/SecondHWEFilter.nf @@ -0,0 +1,16 @@ +process SecondHWEFilter { + tag "Second HWE filtering" + debug true + publishDir "${params.outdir}", mode: 'copy' + + input: + path target_5 + + output: + path "target_6.*" + + script: + """ + plink --bfile target_5 --hwe ${params.hwe_ctrl} --hwe-all --make-bed --out target_6 + """ +} diff --git a/modules/SortVCF.nf b/modules/SortVCF.nf new file mode 100644 index 0000000..0932c0e --- /dev/null +++ b/modules/SortVCF.nf @@ -0,0 +1,16 @@ +process SortVCF { + tag "Sort VCF File" + publishDir "${params.outdir}", mode: 'copy' + + input: + path input_vcf + + output: + path "*" + + script: + """ + bcftools sort ${input_vcf} -Oz -o sorted.vcf.gz + tabix -p vcf sorted.vcf.gz + """ +} diff --git a/modules/SplitVCF.nf b/modules/SplitVCF.nf new file mode 100644 index 0000000..8fba167 --- /dev/null +++ b/modules/SplitVCF.nf @@ -0,0 +1,23 @@ +process SplitVCF { + tag "Split VCF files into Chromosomes" + tag { "chr${chr}" } + publishDir "${params.outdir}", mode: 'copy' + + input: + path vcf_file + val chr + + output: + path "vcfbychrom_${chr}.vcf.gz" + + + script: + """ + tabix -p vcf ${vcf_file} 
+ bcftools view ${vcf_file} --regions ${chr} -o vcfbychrom_${chr}.vcf.gz -Oz + """ +} + + + + diff --git a/modules/Step1_Run_LDpred2.nf b/modules/Step1_Run_LDpred2.nf new file mode 100644 index 0000000..00c2a9e --- /dev/null +++ b/modules/Step1_Run_LDpred2.nf @@ -0,0 +1,28 @@ +process Step1_Run_LDpred2 { + tag "Step 1: LDpred2 is running" + publishDir "${params.out}", mode: 'copy' + + output: + path "*", emit: ldpred2_outputs + + script: + """ + Rscript ${params.pack}/R/LDpred2_jobs.R \\ + --PATH_package ${params.pack} \\ + --PATH_data ${params.data} \\ + --PATH_LDref ${params.LDref} \\ + --PATH_out ${params.outdir} \\ + --FILE_sst ${params.sst} \\ + --pop ${params.pop} \\ + --bfile_tuning ${params.bfile_tuning} \\ + --NCORES 20 + + while [ ! -f "${params.outdir}/EUR/tmp/beta_files/beta_in_all_settings/params.txt" ] || [ ! -f "${params.outdir}/AFR/tmp/beta_files/beta_in_all_settings/params.txt" ]; do + echo "Files are writing..." + sleep 60 + done + + echo "Files are ready, proceeding." + + """ +} \ No newline at end of file diff --git a/modules/Step2_LDpred2_Tuning.nf b/modules/Step2_LDpred2_Tuning.nf new file mode 100644 index 0000000..2842f5b --- /dev/null +++ b/modules/Step2_LDpred2_Tuning.nf @@ -0,0 +1,35 @@ +process Step2_LDpred2_Tuning { + tag "Step 2: LDpred2 tuning is running" + publishDir "${params.outdir}", mode: 'copy' + + input: + path ldpred2_outputs + + output: + tuple path ("EUR_optim_params.txt"), + path ("AFR_optim_params.txt"), emit: ldpred2_tuned_outputs + + script: + """ + Rscript ${params.pack}/R/LDpred2_tuning.R \\ + --PATH_package ${params.pack} \\ + --PATH_out ${params.outdir} \\ + --PATH_plink ${params.plink} \\ + --FILE_sst ${params.sst} \\ + --pop ${params.pop} \\ + --chrom ${params.mussel_chrom} \\ + --bfile_tuning ${params.bfile_tuning} \\ + --pheno_tuning ${params.pheno_tuning} \\ + --covar_tuning ${params.covar_tuning} \\ + --bfile_testing ${params.bfile_testing} \\ + --pheno_testing ${params.pheno_testing} \\ + --covar_testing 
${params.covar_testing} \\ + --trait_type ${params.trait_type} \\ + --testing TRUE \\ + --NCORES 4 + + # Dosyaları işlem dizinine kopyalayın + cp ${params.outdir}/LDpred2/EUR_optim_params.txt EUR_optim_params.txt + cp ${params.outdir}/LDpred2/AFR_optim_params.txt AFR_optim_params.txt + """ +} diff --git a/modules/Step3_Run_MUSS.nf b/modules/Step3_Run_MUSS.nf new file mode 100644 index 0000000..d1aafe1 --- /dev/null +++ b/modules/Step3_Run_MUSS.nf @@ -0,0 +1,33 @@ +process Step3_Run_MUSS { + tag "Step 3: Running MUSS" + publishDir "${params.outdir}", mode: 'copy' + + output: + path "MUSS_beta_in_all_settings_bychrom/*", emit: muss_outputs + + script: + """ + Rscript ${params.pack}/R/MUSS_jobs.R \\ + --PATH_package ${params.pack} \\ + --PATH_data ${params.data} \\ + --PATH_LDref ${params.LDref} \\ + --PATH_out ${params.outdir} \\ + --FILE_sst ${params.sst} \\ + --pop ${params.pop} \\ + --LDpred2_params ${params.outdir}/LDpred2/EUR_optim_params.txt,${params.outdir}/LDpred2/AFR_optim_params.txt \\ + --chrom ${params.mussel_chrom} \\ + --bfile_tuning ${params.bfile_tuning} \\ + --NCORES 5 + + while [ ! -f "${params.outdir}/tmp/MUSS_beta_in_all_settings_bychrom/settings_22.txt" ] ; do + echo "Files are writing..." + sleep 60 + done + + echo "Files are ready, proceeding." 
+ + # Copy the output files to the current working directory + mkdir -p MUSS_beta_in_all_settings_bychrom + cp ${params.outdir}/tmp/MUSS_beta_in_all_settings_bychrom/*.txt MUSS_beta_in_all_settings_bychrom/ + """ +} diff --git a/modules/Step4_Combine_PRS_Models.nf b/modules/Step4_Combine_PRS_Models.nf new file mode 100644 index 0000000..58c474c --- /dev/null +++ b/modules/Step4_Combine_PRS_Models.nf @@ -0,0 +1,28 @@ +process Step4_Combine_PRS_Models { + tag "Step 4: Combining PRS models with SuperLearner" + publishDir "${params.outdir}", mode: 'copy' + + output: + path "*" + + + script: + """ + Rscript ${params.pack}/R/MUSSEL.R \\ + --PATH_package ${params.pack} \\ + --PATH_out ${params.outdir} \\ + --PATH_plink ${params.plink} \\ + --pop ${params.pop} \\ + --target_pop ${params.target_pop} \\ + --chrom ${params.mussel_chrom} \\ + --bfile_tuning ${params.bfile_tuning} \\ + --pheno_tuning ${params.pheno_tuning} \\ + --covar_tuning ${params.covar_tuning} \\ + --bfile_testing ${params.bfile_testing} \\ + --pheno_testing ${params.pheno_testing} \\ + --covar_testing ${params.covar_testing} \\ + --trait_type ${params.trait_type} \\ + --testing TRUE \\ + --NCORES 4 + """ +} diff --git a/modules/Takersid.nf b/modules/Takersid.nf new file mode 100644 index 0000000..4c2209a --- /dev/null +++ b/modules/Takersid.nf @@ -0,0 +1,25 @@ +process Takersid { + tag "Takingrsid" + publishDir "${params.outdir}", mode: 'copy' + + input: + path phased_vcf + val chr + + output: + path "rsid_map_chr${chr}.txt" + + script: + """ + zcat ${phased_vcf} | grep -v '^#' | awk '{print \$1, \$2, \$3}' > rsid_map_chr${chr}.txt + """ +} + + + + + + + + + diff --git a/modules/VCFtoPLINK.nf b/modules/VCFtoPLINK.nf new file mode 100644 index 0000000..79261d0 --- /dev/null +++ b/modules/VCFtoPLINK.nf @@ -0,0 +1,20 @@ +process VCFtoPLINK { + tag "Convert Imputed VCF to PLINK" + publishDir "${params.outdir}", mode: 'copy' + + input: + path merged + + output: + path "merged.*" + + script: + """ + # Step 1: 
Index the VCF file + bcftools sort ${merged} -Oz -o merged_sorted.vcf.gz + tabix -p vcf merged_sorted.vcf.gz || { echo "Tabix indexing failed"; exit 1; } + + # Step 2: Convert VCF to PLINK format + plink --vcf merged_sorted.vcf.gz --allow-extra-chr --make-bed --out merged + """ +} diff --git a/modules/fastmixture.nf b/modules/fastmixture.nf new file mode 100644 index 0000000..8bd766f --- /dev/null +++ b/modules/fastmixture.nf @@ -0,0 +1,16 @@ +process fastmixture { + tag "Target Ancestry Inference" + debug true + publishDir "${params.outdir}", mode: 'copy' + + input: + path last + + output: + path "*" + + script: + """ + fastmixture --bfile last --K ${params.num_of_ancestry} --threads 8 --seed 1 --out ancestry + """ +} diff --git a/nextflow.config b/nextflow.config new file mode 100644 index 0000000..62fc891 --- /dev/null +++ b/nextflow.config @@ -0,0 +1,112 @@ +/* + * User Input Parameters + * The specified parameters and input files can be given to the pipeline via command line options + */ +process { + errorStrategy = 'ignore' +} + +params { + // Required parameters + vcf_to_plink = false + target = "$PWD/target/" + target_prefix = "$PWD/target/target" + target_qc_prefix = "$PWD/outputs/last" + pheno_file = "$PWD/target/phenotype_file_t2.txt" // Default phenotype file + gwas_sumstat = "$PWD/target/GWAS_sumstat_t1.txt" // Default GWAS summary statistics file + mega_summaries = "$PWD/outputs/quant.summaries" + + // Optional parameters with default values + geno = 0.02 + mind = 0.02 + maf = 0.05 + hwe_case = 1e-6 + hwe_ctrl = 1e-10 + pihat = 0.1875 + relatedness = 0.1875 + indep_window_size = 100 + indep_step_size = 5 + indep_threshold = 0.2 + pca = 10 + + // Help flag + help = false + + // Workflow control flags + run_plink = true // Set to false if PLINK part should not be run + run_prsice = true // Set to false if PRSice-2 part should not be run + run_pca = true // To create .eigenvec file to be used as covariate file for further analysis + run_LDpred2grid = 
true // Set to false if LDPred2 Grid part should not be run + run_LDpred2auto = true // Set to false if LDPred2 Auto part should not be run + run_Lassosum2 = true // Set to false if Lassosum2 part should not be run + run_prscsx = true // Set to false if PRS-CSx part should not be run + run_mussel = false // Set to true if MUSSEL part should not be run + run_megaprs = true // Set to false if MegaPRS part should not be run + run_sbayesrc = true // Set to false if SBayesR-C part should not be run + + // Phasing - Imputation + ref_phase = "/archive/yarast/1KG_hg38_BCF/" + genetic_map = "$PWD/genetic_map_hg38_withX.txt.gz" + ref_imp = "/archive/yarast/1KG_hg38_bref3/" + g_map = "/archive/yarast/plink.GRCh38.map/" // Genetic map directory + + // Target Ancesty Inference + num_of_ancestry = 3 + + // PRSice-2 parameters + prsice_script = "$PWD/PRSice.R" + prsice_executable = "$PWD/bin/PRSice_linux" + + // LDpred2 parameters + ldpred_grid_script = "$PWD/bin/LDpred2grid.R" + ldpred_auto_script = "$PWD/bin/LDpred2auto.R" + + // Lassosum2 + lassosum2_script = "$PWD/bin/Lassosum2.R" + + // PRScsx parameters + prscsxref_dir = "$PWD/PRScsx/UKBB" // Default reference directory (can be '$PWD/PRScsx/UKBB' or '$PWD/PRScsx/1KG') + prscsx_gwas1 = "$PWD/target/EUR_GWAS_PRScsx.txt" // First GWAS summary statistics file in the target folder + prscsx_gwas2 = "$PWD/target/EAS_GWAS_PRScsx.txt" // Second GWAS summary statistics file in the target folder + n_gwas1 = '10000' // Sample sizes for GWAS1 + n_gwas2 = '10000' // Sample sizes for GWAS1 + pop1 = 'EUR' // Population1 used in PRS-CSx + pop2 = 'EAS' // Population2 used in PRS-CSx + phi = '1e-2' // Tuning parameter for PRS-CSx + chrom = '1' // the chromosome on which model is fitted, can be separated by comma e.g., --chrom=1,3,5. 
+ meta = true // Whether to run meta-analysis + out_name = 'target_ukbb_meta' // Specify output prefix + + // MUSSEL parameters + pack = "$PWD/MUSSEL" + data = "$PWD/MUSSEL/example" + LDref = "$PWD/MUSSEL/LDref" + sst = "$PWD/MUSSEL/example/summdata/EUR.txt,$PWD/MUSSEL/example/summdata/AFR.txt" + pop = "EUR,AFR" + mussel_chrom = "1-22" + bfile_tuning = "$PWD/MUSSEL/example/sample_data/EUR/tuning_geno,$PWD/MUSSEL/example/sample_data/AFR/tuning_geno" + pheno_tuning = "$PWD/example/sample_data/EUR/pheno.txt,$PWD/MUSSEL/example/sample_data/AFR/pheno.txt" + covar_tuning = "$PWD/MUSSEL/example/sample_data/EUR/covar.txt,$PWD/MUSSEL/example/sample_data/AFR/covar.txt" + bfile_testing = "$PWD/MUSSEL/example/sample_data/EUR/testing_geno,$PWD/MUSSEL/example/sample_data/AFR/testing_geno" + pheno_testing = "$PWD/MUSSEL/example/sample_data/EUR/pheno.txt,$PWD/MUSSEL/example/sample_data/AFR/pheno.txt" + covar_testing = "$PWD/MUSSEL/example/sample_data/EUR/covar.txt,$PWD/MUSSEL/example/sample_data/AFR/covar.txt" + trait_type = "continuous" + NCORES = 4 + plink = "/sw/bip_apps/plink2/2.0.0.alpha/bin/plink2" + + // MegaPRS Parameters + ldak_executable = "$PWD/bin/ldak6.linux" //Path to the LDAK executable (e.g., ldak6.mac or ldak6.linux) + mega_model = "bayesr" // Model type for MegaPRS. Options: lasso, ridge, bayesr, etc. Default = bayesr + + //SBayesR-C Parameters + formatgwas = "$PWD/bin/format_gwas.R" + ld_folder = 'ukbEAS_Imputed' + annot_file = 'annot_baseline2.2.txt' + out_prefix = 'sbayesrc' + threads = 16 + + + // Output directories + outdir = "$PWD/outputs" + graphs = "$PWD/QC_graphs" +} diff --git a/outputs/.DS_Store b/outputs/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/outputs/.DS_Store differ diff --git a/target/.DS_Store b/target/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/target/.DS_Store differ