diff --git a/index.html b/index.html index 79f310c..7201307 100644 --- a/index.html +++ b/index.html @@ -171,6 +171,7 @@

OpenMP

Known Issues

+

If the error is ld: library "crypto" not found, install openssl (e.g. brew install openssl).

Compiling the package requires the compiler to find the libraries for the dependencies. For unix systems, the libraries are typically installed at /usr/local/lib and /usr/local/include. For users using OS X and homebrew, the libraries are typically installed at /opt/homebrew/lib and /opt/homebrew/include.

Non-standard library paths need to be configured. The src/Makevars file configures the compiler flags and considers the LDFLAGS and CPPFLAGS from the ~/.R/Makevars file.

diff --git a/pkgdown.yml b/pkgdown.yml index 17c2d01..db90add 100644 --- a/pkgdown.yml +++ b/pkgdown.yml @@ -8,7 +8,7 @@ articles: tutorial-create-mask-file: tutorial-create-mask-file.html tutorial-memory-optimization: tutorial-memory-optimization.html tutorial-simulate-traits: tutorial-simulate-traits.html -last_built: 2025-01-17T13:33Z +last_built: 2025-01-17T14:32Z urls: reference: https://lcrawlab.github.io/sme/reference article: https://lcrawlab.github.io/sme/articles diff --git a/reference/simulate_traits.html b/reference/simulate_traits.html index a2a36c6..7edb6cc 100644 --- a/reference/simulate_traits.html +++ b/reference/simulate_traits.html @@ -162,12 +162,12 @@

Examplesfrom_file <- read.table(out_file, header = TRUE) head(from_file) #> FID IID TRAIT -#> 1 1 1 -0.96907670 -#> 2 2 1 1.04343383 -#> 3 3 1 0.38835392 -#> 4 4 1 -0.02194594 -#> 5 5 1 -0.27492285 -#> 6 6 1 0.68657810 +#> 1 1 1 0.07327543 +#> 2 2 1 -0.82216034 +#> 3 3 1 -2.31319310 +#> 4 4 1 0.74270008 +#> 5 5 1 -2.53693229 +#> 6 6 1 1.26249797 diff --git a/reference/sme.html b/reference/sme.html index 2e7bd9e..7119a71 100644 --- a/reference/sme.html +++ b/reference/sme.html @@ -241,14 +241,14 @@

Examples) head(sme_result$summary) #> # A tibble: 6 × 8 -#> id index chromosome position p pve vc se -#> <chr> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> -#> 1 rs1 1 1 1 0.724 -0.0233 -0.0228 0.0382 -#> 2 rs2 2 1 2 0.201 0.0407 0.0395 0.0472 -#> 3 rs3 3 1 3 0.273 0.0464 0.0455 0.0754 -#> 4 rs4 4 1 4 0.401 0.0161 0.0157 0.0624 -#> 5 rs5 5 1 5 0.855 -0.0389 -0.0377 0.0357 -#> 6 rs6 6 1 6 0.323 0.0306 0.0299 0.0650 +#> id index chromosome position p pve vc se +#> <chr> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> +#> 1 rs1 1 1 1 0.879 -0.0395 -0.0388 0.0332 +#> 2 rs2 2 1 2 0.276 0.0218 0.0212 0.0358 +#> 3 rs3 3 1 3 0.539 -0.00572 -0.00557 0.0565 +#> 4 rs4 4 1 4 0.398 0.0151 0.0146 0.0565 +#> 5 rs5 5 1 5 0.860 -0.0441 -0.0428 0.0397 +#> 6 rs6 6 1 6 0.524 -0.00360 -0.00351 0.0575 diff --git a/search.json b/search.json index 09824ae..adbc5da 100644 --- a/search.json +++ b/search.json @@ -1 +1 @@ -[{"path":"https://lcrawlab.github.io/sme/LICENSE.html","id":null,"dir":"","previous_headings":"","what":"MIT License","title":"MIT License","text":"Copyright (c) 2024 smer authors Permission hereby granted, free charge, person obtaining copy software associated documentation files (“Software”), deal Software without restriction, including without limitation rights use, copy, modify, merge, publish, distribute, sublicense, /sell copies Software, permit persons Software furnished , subject following conditions: copyright notice permission notice shall included copies substantial portions Software. SOFTWARE PROVIDED “”, WITHOUT WARRANTY KIND, EXPRESS IMPLIED, INCLUDING LIMITED WARRANTIES MERCHANTABILITY, FITNESS PARTICULAR PURPOSE NONINFRINGEMENT. EVENT SHALL AUTHORS COPYRIGHT HOLDERS LIABLE CLAIM, DAMAGES LIABILITY, WHETHER ACTION CONTRACT, TORT OTHERWISE, ARISING , CONNECTION SOFTWARE USE DEALINGS SOFTWARE.","code":""},{"path":"https://lcrawlab.github.io/sme/articles/smer.html","id":"run-sme-with-plink-data","dir":"Articles","previous_headings":"","what":"Run SME with PLINK data","title":"How To Use the Sparse Marginal Epistasis Test","text":"","code":"library(smer) library(dplyr) library(ggplot2)"},{"path":"https://lcrawlab.github.io/sme/articles/smer.html","id":"data-requirements-and-file-formats","dir":"Articles","previous_headings":"Run SME with PLINK data","what":"Data Requirements and File Formats","title":"How To Use the Sparse Marginal Epistasis Test","text":"SME implemented R requires genetic data PLINK format, consists three files: .bim: Contains SNP information .bed: Contains genetic data .fam: Contains sample information Note: sme() handle missing genotypes. Prior running sme(), use PLINK remove variants missing genotypes impute . Additionally, phenotype (trait) data separate file following PLINK’s phenotype format. sme() function includes parameters let control memory usage computational resources. detailed guidance optimizing settings system, please see tutorial Optimize Memory Requirements SME.","code":""},{"path":"https://lcrawlab.github.io/sme/articles/smer.html","id":"specifying-snps-for-analysis","dir":"Articles","previous_headings":"Run SME with PLINK data","what":"Specifying SNPs for Analysis","title":"How To Use the Sparse Marginal Epistasis Test","text":"selecting SNPs analyze, must provide positions 1-based indices. indices correspond row numbers .bim file, first SNP index 1, second index 2, . complete details function parameters, please refer documentation sme() function. analysis uses simulated data demonstration purposes. simulated synthetic phenotypes 5000 synthetic genotypes 6000 SNPs. like learn simulate data, please refer tutorial Simulate Traits.","code":"# File inputs plink_file <- \"path/to/plink/file\" pheno_file <- \"path/to/pheno/file\" mask_file <- \"path/to/mask/file\" # Parameter inputs chun_ksize <- 10 n_randvecs <- 10 n_blocks <- 10 n_threads <- 5 # 1-based Indices of SNPs to be analyzed n_snps <- 100 snp_indices <- 1:n_snps sme_result <- sme( plink_file, pheno_file, mask_file, snp_indices, chunk_size, n_randvecs, n_blocks, n_threads )"},{"path":"https://lcrawlab.github.io/sme/articles/smer.html","id":"understanding-the-results","dir":"Articles","previous_headings":"Run SME with PLINK data","what":"Understanding the Results","title":"How To Use the Sparse Marginal Epistasis Test","text":"call sme() function, returns list containing multiple elements. important one called summary, contains main analysis results. results formatted tidy data, making compatible popular R packages like ggplot2 dplyr analysis visualization.","code":""},{"path":"https://lcrawlab.github.io/sme/articles/smer.html","id":"visualizing-genomic-associations","dir":"Articles","previous_headings":"Run SME with PLINK data","what":"Visualizing Genomic Associations","title":"How To Use the Sparse Marginal Epistasis Test","text":"use Manhattan plots visualize genome-wide analyses effectively highlight strong associations genetic variants traits. case, Manhattan plot specifically shows statistical epistasis (interactions genes). reference, ’ve marked true causal SNPs (Single Nucleotide Polymorphisms) green plot - genetic variants included simulation real effects.","code":"sme_result$summary %>% ggplot(aes( x = index, y = -log10(p), color = true_gxg_snp )) + geom_point() + xlab(\"Position\") + labs(color = \"Epistatic SNP\")"},{"path":"https://lcrawlab.github.io/sme/articles/smer.html","id":"understanding-variance-components-and-effect-sizes","dir":"Articles","previous_headings":"Run SME with PLINK data","what":"Understanding Variance Components and Effect Sizes","title":"How To Use the Sparse Marginal Epistasis Test","text":"SME estimates much total trait variation can explained genetic interactions. simulation, set Phenotypic Variance Explained (PVE) 5% SNP involved epistatic interactions. plot shows well method recovered effects. displays two distributions: estimated PVE SNPs know truly involved epistatic interactions estimated PVE SNPs real epistatic effects dashed line marks true 5% PVE level used simulation, allowing see accurately method estimated actual effect sizes.","code":"sme_result$summary %>% ggplot(aes(x = true_gxg_snp, y = pve, fill = true_gxg_snp)) + geom_boxplot() + geom_hline(yintercept = 0.05, color = \"grey40\", linetype = \"dashed\") + annotate(\"text\", x = 0.8, y = 0.055, label = \"True per SNP epistatic PVE\", color = \"black\") + xlab(\"Epistatic SNP\") + ylab(\"Phenotypic Variance Explained\") + theme(legend.position = \"none\")"},{"path":"https://lcrawlab.github.io/sme/articles/smer.html","id":"narrow-sense-heritability-estimates","dir":"Articles","previous_headings":"Run SME with PLINK data","what":"Narrow Sense Heritability Estimates","title":"How To Use the Sparse Marginal Epistasis Test","text":"SME method uses linear mixed model separate different sources trait variation. One key component estimates narrow sense heritability (h2h^2), measures much trait variation can explained additive genetic effects. plot breaks estimated sources trait variation: “grm”: Shows narrow sense heritability (h2h^2) “gxg”: Shows variance due gene--gene interactions “error”: Shows unexplained variance, environmental effects simulation, set true narrow sense heritability 30%, shown dashed line plot. reference line helps evaluate accurately SME estimated genetic components trait variation. estimate narrow-sense heritability h2h^2 much less variable always informed genetic relatedness matrix. small data example overestimates heritability unbiased general.","code":"sme_result$vc_estimate %>% ggplot(aes(x = component, y = vc_estimate, fill = component)) + geom_boxplot() + geom_hline(yintercept = 0.3, color = \"grey40\", linetype = \"dashed\") + annotate(\"text\", x = 0.7, y = 0.33, label = expression(\"True \" * h^2), color = \"black\") + xlab(\"Component\") + ylab(\"Variance Component Estimate\") + theme(legend.position = \"none\")"},{"path":"https://lcrawlab.github.io/sme/articles/smer.html","id":"sessioninfo","dir":"Articles","previous_headings":"","what":"SessionInfo","title":"How To Use the Sparse Marginal Epistasis Test","text":"","code":"sessionInfo() #> R version 4.4.2 (2024-10-31) #> Platform: x86_64-pc-linux-gnu #> Running under: Ubuntu 24.04.1 LTS #> #> Matrix products: default #> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 #> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0 #> #> locale: #> [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8 #> [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8 #> [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C #> [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C #> #> time zone: UTC #> tzcode source: system (glibc) #> #> attached base packages: #> [1] stats graphics grDevices utils datasets methods base #> #> other attached packages: #> [1] ggplot2_3.5.1 dplyr_1.1.4 smer_0.0.1 #> #> loaded via a namespace (and not attached): #> [1] gtable_0.3.6 jsonlite_1.8.9 compiler_4.4.2 #> [4] tidyselect_1.2.1 Rcpp_1.0.14 FMStable_0.1-4 #> [7] parallel_4.4.2 tidyr_1.3.1 jquerylib_0.1.4 #> [10] scales_1.3.0 systemfonts_1.2.0 textshaping_0.4.1 #> [13] harmonicmeanp_3.0.1 yaml_2.3.10 fastmap_1.2.0 #> [16] R6_2.5.1 labeling_0.4.3 generics_0.1.3 #> [19] knitr_1.49 genio_1.1.2 iterators_1.0.14 #> [22] backports_1.5.0 checkmate_2.3.2 tibble_3.2.1 #> [25] desc_1.4.3 munsell_0.5.1 bslib_0.8.0 #> [28] pillar_1.10.1 rlang_1.1.4 cachem_1.1.0 #> [31] xfun_0.50 fs_1.6.5 sass_0.4.9 #> [34] cli_3.6.3 withr_3.0.2 pkgdown_2.1.1 #> [37] magrittr_2.0.3 grid_4.4.2 digest_0.6.37 #> [40] mvMAPIT_2.0.3 foreach_1.5.2 mvtnorm_1.3-3 #> [43] lifecycle_1.0.4 CompQuadForm_1.4.3 vctrs_0.6.5 #> [46] evaluate_1.0.3 glue_1.8.0 farver_2.1.2 #> [49] codetools_0.2-20 ragg_1.3.3 colorspace_2.1-1 #> [52] purrr_1.0.2 rmarkdown_2.29 tools_4.4.2 #> [55] pkgconfig_2.0.3 htmltools_0.5.8.1"},{"path":"https://lcrawlab.github.io/sme/articles/study-erythroid-differentiation-data.html","id":"dnase-i-hypersensitive-sites-of-erythroid-differentiation-reveal-statistical-epistasis-in-human-hematology-traits","dir":"Articles","previous_headings":"","what":"DNAse I hypersensitive sites of erythroid differentiation reveal statistical epistasis in human hematology traits","title":"Conditioning Epistasis Search on Open Chromatin","text":"apply SME hematology traits white British individuals UK Biobank. quality control, remaining data 349,411 individuals 543,813 SNPs common variants. select traits mean corpuscular hemoglobin (MCH), mean corpuscular hemoglobin concentration, mean corpuscular volume (MCV), hematocrit (HCT). external sparse data source, leverage DNase -hypersensitive sites (DHSs) data measured 12 days ex-vivo erythroid differentiation (Georgolopoulos et al. 2024). DHS enriched transcriptional activity used identify regulatory DNA. first three traits, MCH, MCHC, MCV traits red blood cells (RBC). Previous GWAS studies found genes associated traits implicated erythroid differentiation. Therefore, expect genomic data indicates regulatory regions gathered erythropoiesis informative traits. HCT measures percentage red blood cells blood. maturation erythroid progenitor cells regulated oxygen-sensing mechanism. hypothesise HCT, informed functional data erythropoiesis.","code":""},{"path":"https://lcrawlab.github.io/sme/articles/study-erythroid-differentiation-data.html","id":"mask-file-preparation","dir":"Articles","previous_headings":"DNAse I hypersensitive sites of erythroid differentiation reveal statistical epistasis in human hematology traits","what":"Mask File Preparation","title":"Conditioning Epistasis Search on Open Chromatin","text":"external data sources used study represented genomic intervals DHS regions LD blocks. following mock data illustrate create mask file sme(). See Create Mask File details. make use package GenomicRanges efficiently map PLINK data intervals DHS LD data. 543,813 SNPs data, 4,932 located DHS regions. DHS regions data distributed along whole genome. test marginal epistasis SME consider variants DHS regions important. Next map PLINK data LD blocks. objects, can create mask file. larger data, recommend splitting PLINK variants analyzed batches, create one mask file per batch, submit one job per batch High Peformance Cluster. mask file can run SME. genome-wide association test marginal epistasis red blood cell traits MCH MCV finds genome-wide significant statistical epistasis (P<5 × 10−8P < \\num{5e-8}) chromosome 6 (Fig. 1). Importantly, SNPs genes map previously discovered non-additive gene action related erythropoiesis RBC traits.","code":"bim_data <- data.frame( chromosome = c(1, 1, 1, 2, 2, 2, 3, 3, 3), variant_id = c(\"rs1\", \"rs2\", \"rs3\", \"rs4\", \"rs5\", \"rs6\", \"rs7\", \"rs8\", \"rs9\"), cm_position = c(0, 0, 0, 0, 0, 0, 0, 0, 0), bp_position = c(10, 20, 30, 40, 50, 60, 70, 80, 90), allele1 = c(\"A\", \"A\", \"A\", \"G\", \"C\", \"C\", \"T\", \"T\", \"A\"), allele1 = c(\"G\", \"G\", \"G\", \"A\", \"T\", \"T\", \"A\", \"A\", \"G\") ) bim_data$index <- 1:nrow(bim_data) # DHS intervals hg19_dhs_regions <- data.frame( chromosome = c(1, 2, 3), start = c(5, 45, 85), stop = c(15, 55, 95) ) # LD block intervals hg19_ld_blocks <- data.frame( chromosome = c(1, 1, 2, 2, 3, 3, 3), start = c(5, 25, 35, 45, 65, 75, 85), stop = c(25, 35, 45, 65, 75, 85, 95) ) # Convert .bim to GRanges object bim_gr <- GRanges( seqnames = paste0(\"chr\", bim_data$chromosome), ranges = IRanges(start = bim_data$bp_position, end = bim_data$bp_position), variant_id = bim_data$variant_id, genome = \"hg19\" ) # Convert DHS to GRanges object dhs_gr <- GRanges( seqnames = paste0(\"chr\", hg19_dhs_regions$chromosome), ranges = IRanges(start = hg19_dhs_regions$start, end = hg19_dhs_regions$stop), genome = \"hg19\" ) # Find overlaps of BIM variants and DHS intervals overlaps <- findOverlaps(bim_gr, dhs_gr, maxgap = 0) # Extract overlapping variants dhs_data <- bim_data[queryHits(overlaps), ] dhs_data <- dhs_data[!duplicated(dhs_data$index), ] # Convert to GRanges object ld_gr <- GRanges( seqnames = paste0(\"chr\", hg19_ld_blocks$chromosome), ranges = IRanges(start = hg19_ld_blocks$start, end = hg19_ld_blocks$stop), genome = \"hg19\" ) # Find LD block of bim variants ld_overlaps <- findOverlaps(query = bim_gr, subject = ld_gr) output_file <- tempfile() gxg_group <- \"gxg\" ld_group <- \"ld\" gxg_variants <- dhs_data$index - 1 # 0-base index for C++ create_hdf5_file(output_file) for (j in bim_data$index - 1) { # 0-base index for C++ # Write DHS mask gxg_ds <- sprintf(\"%s/%d\", gxg_group, j) write_hdf5_dataset(file_name = output_file, dataset_name = gxg_ds, gxg_variants) # Find LD block of focal SNP focal_gr <- ld_gr[subjectHits(ld_overlaps[j,])] # Find variants in LD block of focal SNP focal_ld <- findOverlaps(query = bim_gr, subject = focal_gr) ld_data <- bim_data[queryHits(focal_ld),] ld_variants <- ld_data$index - 1 # 0-base index for C++ # Write LD mask ld_ds <- sprintf(\"%s/%d\", ld_group, j) write_hdf5_dataset(file_name = output_file, dataset_name = ld_ds, ld_variants) } dhs_indices <- read_hdf5_dataset(file_name = output_file, dataset_name = gxg_ds) print(sprintf(\"DHS indices: %s\", paste(dhs_indices, collapse = \", \"))) #> [1] \"DHS indices: 0, 4, 8\" sme_result <- sme( plink_file = \"/path/to/plink/data\", pheno_file = \"/path/to/pheno/data\", mask_file = \"/path/to/mask/file\", gxg_indices = c(1, 2, 3), chunk_size = 250, n_randvecs = 10, n_blocks = 200, n_threads = 6 )"},{"path":"https://lcrawlab.github.io/sme/articles/study-erythroid-differentiation-data.html","id":"section","dir":"Articles","previous_headings":"","what":"Conditioning Epistasis Search on Open Chromatin","title":"Conditioning Epistasis Search on Open Chromatin","text":"Figure 1. Manhattan plot SME analysis. dashed blue line significance threshold Bonferroni correction. strongest association trait MCH, SNP rs4711092 (P=1.41 × 10−11P = \\num{1.41e-11}, PVE 0.7%), maps gene secretagogin (). gene regulates exocytosis interacting two soluble NSF adaptor proteins ( ) critical cell growth tissues. total five SNPs SME significantly associates MCH (strongest association rs9366624 P=1.8 × 10−9P = \\num{1.8e-9}, PVE 1.1%) gene capping protein regulator myosin 1 linker 1 (). gene known interact regulate caping protein (). plays role via protein-protein interaction regulating erythrpoiesis. Specifically, proteins regulate actin dynamics regulating activity . Erythropoiesis leads modifications expression membrane cytoskeletal proteins, whose interactions impact cell structure function. genes previously associated hemoglobin concentration. strongest association trait MCV, SNP rs9276 (P=9.09 × 10−10P = \\num{9.09e-10}, PVE 0.24%) maps gene major histocompatibility complex. SNP rs9366624 (P=1.86 × 10−8P = \\num{1.86e-8}, PVE 0.8%), also gene significantly associated trait marginal epistasis. complete list significant associations produced SME reported Tab. 1. Table 1. Significant trait associations marginal epistasis. Fitting linear mixed model SME also produces narrow-sense heritability estimates equivalent RHE regression. heritability estimates SME four traits study similar heritability estimates found literature(Tab. 2). Table 2. Narrow-sense heritability (h2h^2) estimates SME analysis.","code":""},{"path":"https://lcrawlab.github.io/sme/articles/study-erythroid-differentiation-data.html","id":"references","dir":"Articles","previous_headings":"","what":"References","title":"Conditioning Epistasis Search on Open Chromatin","text":"Stamp J, Smith Pattillo S, Weinreich D, Crawford L (2025). Sparse modeling interactions enables fast detection genome-wide epistasis biobank-scale studies. biorxiv, https://doi.org/10.1101/2025.01.11.632557 Georgolopoulos, G. GEO Data Set: Discrete regulatory modules instruct hematopoietic lineage commitment differentiation https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE182816 (2024).","code":""},{"path":"https://lcrawlab.github.io/sme/articles/study-erythroid-differentiation-data.html","id":"sessioninfo","dir":"Articles","previous_headings":"","what":"SessionInfo","title":"Conditioning Epistasis Search on Open Chromatin","text":"","code":"sessionInfo() #> R version 4.4.2 (2024-10-31) #> Platform: x86_64-pc-linux-gnu #> Running under: Ubuntu 24.04.1 LTS #> #> Matrix products: default #> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 #> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0 #> #> locale: #> [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8 #> [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8 #> [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C #> [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C #> #> time zone: UTC #> tzcode source: system (glibc) #> #> attached base packages: #> [1] stats4 stats graphics grDevices utils datasets methods #> [8] base #> #> other attached packages: #> [1] smer_0.0.1 GenomicRanges_1.58.0 GenomeInfoDb_1.42.1 #> [4] IRanges_2.40.1 S4Vectors_0.44.0 BiocGenerics_0.52.0 #> #> loaded via a namespace (and not attached): #> [1] tidyr_1.3.1 sass_0.4.9 generics_0.1.3 #> [4] digest_0.6.37 magrittr_2.0.3 evaluate_1.0.3 #> [7] iterators_1.0.14 CompQuadForm_1.4.3 mvtnorm_1.3-3 #> [10] fastmap_1.2.0 foreach_1.5.2 genio_1.1.2 #> [13] jsonlite_1.8.9 backports_1.5.0 httr_1.4.7 #> [16] purrr_1.0.2 UCSC.utils_1.2.0 codetools_0.2-20 #> [19] textshaping_0.4.1 jquerylib_0.1.4 cli_3.6.3 #> [22] rlang_1.1.4 XVector_0.46.0 cachem_1.1.0 #> [25] yaml_2.3.10 FMStable_0.1-4 tools_4.4.2 #> [28] parallel_4.4.2 checkmate_2.3.2 dplyr_1.1.4 #> [31] GenomeInfoDbData_1.2.13 vctrs_0.6.5 R6_2.5.1 #> [34] lifecycle_1.0.4 mvMAPIT_2.0.3 zlibbioc_1.52.0 #> [37] fs_1.6.5 ragg_1.3.3 pkgconfig_2.0.3 #> [40] desc_1.4.3 pkgdown_2.1.1 pillar_1.10.1 #> [43] bslib_0.8.0 glue_1.8.0 Rcpp_1.0.14 #> [46] harmonicmeanp_3.0.1 systemfonts_1.2.0 xfun_0.50 #> [49] tibble_3.2.1 tidyselect_1.2.1 knitr_1.49 #> [52] htmltools_0.5.8.1 rmarkdown_2.29 compiler_4.4.2"},{"path":"https://lcrawlab.github.io/sme/articles/tutorial-cite-us.html","id":"the-sparse-marginal-epistasis-test-sme","dir":"Articles","previous_headings":"","what":"The Sparse Marginal Epistasis Test (SME)","title":"How To Cite Our Work","text":"Stamp J, Smith Pattillo S, Weinreich D, Crawford L (2025). Sparse modeling interactions enables fast detection genome-wide epistasis biobank-scale studies. biorxiv, https://doi.org/10.1101/2025.01.11.632557 Stamp J & Crawford L (2025). smer: Sparse Marginal Epistasis Test. https://github.com/lcrawlab/sme, https://lcrawlab.github.io/sme/","code":""},{"path":"https://lcrawlab.github.io/sme/articles/tutorial-cite-us.html","id":"the-multivariate-marginal-epistasis-test-mvmapit","dir":"Articles","previous_headings":"","what":"The multivariate Marginal Epistasis Test (mvMAPIT)","title":"How To Cite Our Work","text":"Stamp J, DenAdel , Weinreich D, Crawford, L (2023). Leveraging Genetic Correlation Traits Improves Detection Epistasis Genome-wide Association Studies. G3 Genes|Genomes|Genetics 13(8), jkad118; doi: https://doi.org/10.1093/g3journal/jkad118 Stamp J, Crawford L (2022). mvMAPIT: Multivariate Genome Wide Marginal Epistasis Test. https://github.com/lcrawlab/mvMAPIT, https://lcrawlab.github.io/mvMAPIT/","code":""},{"path":"https://lcrawlab.github.io/sme/articles/tutorial-cite-us.html","id":"the-marginal-epistasis-test-mapit","dir":"Articles","previous_headings":"","what":"The Marginal Epistasis Test (MAPIT)","title":"How To Cite Our Work","text":"Crawford L, Zeng P, Mukherjee S, & Zhou X (2017). Detecting epistasis marginal epistasis test genetic mapping studies quantitative traits. PLoS genetics, 13(7), e1006869. https://doi.org/10.1371/journal.pgen.1006869","code":""},{"path":"https://lcrawlab.github.io/sme/articles/tutorial-create-mask-file.html","id":"mask-file-format","dir":"Articles","previous_headings":"","what":"Mask File Format","title":"How To Create a Mask File","text":"sme() function expects mask data HDF5 file. HDF5 format includes two primary object types: Datasets - typed multidimensional arrays Groups - container structures can hold datasets groups","code":""},{"path":"https://lcrawlab.github.io/sme/articles/tutorial-create-mask-file.html","id":"mask-format-requirements","dir":"Articles","previous_headings":"","what":"Mask Format Requirements","title":"How To Create a Mask File","text":"mask data organized following groups datasets: Groups: ld: Contains SNPs linkage disequilibrium (LD) focal SNP, excluded. gxg: Contains indices SNPs used condition marginal epistasis test, included. required group names can configured input parameters sme(). defaults ld gxg. Datasets: ld/: focal SNP , dataset contains indices SNPs LD block SNP. SNPs excluded gene--gene interaction covariance matrix. gxg/: focal SNP , dataset contains indices SNPs include gene--gene interaction covariance matrix focal SNP . Important: indices mask file must zero-based correspond zero-based row indices PLINK .bim file. includes dataset index ( gxg/) data . zero-based indexing necessary mask data read C++ subroutine sme(), uses zero-based indexing, unlike R’s one-based indexing SNP indices function call.","code":""},{"path":"https://lcrawlab.github.io/sme/articles/tutorial-create-mask-file.html","id":"creating-and-using-mask-files","dir":"Articles","previous_headings":"","what":"Creating and Using Mask Files","title":"How To Create a Mask File","text":"package provides utility functions create, write, read valid mask files sme(). can check data written correctly.","code":"hdf5_file <- tempfile() # Group names gxg_h5_group <- \"gxg\" ld_h5_group <- \"ld\" # Data (still in 1-based R indexing) include_gxg_snps <- 1:10 exclude_ld_snps <- 5:6 # Focal SNP (still in 1-based R indexing) focal_snp <- 4 # Dataset names dataset_name_pattern <- \"%s/%s\" # 0-based index! gxg_dataset <- sprintf(dataset_name_pattern, gxg_h5_group, focal_snp - 1) ld_dataset <- sprintf(dataset_name_pattern, ld_h5_group, focal_snp - 1) # Create an empty HDF5 file create_hdf5_file(hdf5_file) # Write LD data write_hdf5_dataset(hdf5_file, ld_dataset, exclude_ld_snps - 1) # 0-based index! # Write GXG data write_hdf5_dataset(hdf5_file, gxg_dataset, include_gxg_snps - 1) ld_read <- read_hdf5_dataset(hdf5_file, ld_dataset) gxg_read <- read_hdf5_dataset(hdf5_file, gxg_dataset) print(sprintf(\"Zero-based indices of SNPs to exclude: %s\", str(ld_read))) #> int [1:2] 4 5 #> character(0) print(sprintf(\"Zero-based indices of SNPs to include: %s\", str(gxg_read))) #> int [1:10] 0 1 2 3 4 5 6 7 8 9 #> character(0)"},{"path":"https://lcrawlab.github.io/sme/articles/tutorial-create-mask-file.html","id":"sessioninfo","dir":"Articles","previous_headings":"","what":"SessionInfo","title":"How To Create a Mask File","text":"","code":"sessionInfo() #> R version 4.4.2 (2024-10-31) #> Platform: x86_64-pc-linux-gnu #> Running under: Ubuntu 24.04.1 LTS #> #> Matrix products: default #> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 #> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0 #> #> locale: #> [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8 #> [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8 #> [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C #> [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C #> #> time zone: UTC #> tzcode source: system (glibc) #> #> attached base packages: #> [1] stats graphics grDevices utils datasets methods base #> #> other attached packages: #> [1] smer_0.0.1 #> #> loaded via a namespace (and not attached): #> [1] jsonlite_1.8.9 dplyr_1.1.4 compiler_4.4.2 #> [4] tidyselect_1.2.1 Rcpp_1.0.14 FMStable_0.1-4 #> [7] parallel_4.4.2 tidyr_1.3.1 jquerylib_0.1.4 #> [10] systemfonts_1.2.0 textshaping_0.4.1 harmonicmeanp_3.0.1 #> [13] yaml_2.3.10 fastmap_1.2.0 R6_2.5.1 #> [16] generics_0.1.3 knitr_1.49 genio_1.1.2 #> [19] iterators_1.0.14 backports_1.5.0 checkmate_2.3.2 #> [22] tibble_3.2.1 desc_1.4.3 bslib_0.8.0 #> [25] pillar_1.10.1 rlang_1.1.4 cachem_1.1.0 #> [28] xfun_0.50 fs_1.6.5 sass_0.4.9 #> [31] cli_3.6.3 pkgdown_2.1.1 magrittr_2.0.3 #> [34] digest_0.6.37 mvMAPIT_2.0.3 foreach_1.5.2 #> [37] mvtnorm_1.3-3 lifecycle_1.0.4 CompQuadForm_1.4.3 #> [40] vctrs_0.6.5 evaluate_1.0.3 glue_1.8.0 #> [43] codetools_0.2-20 ragg_1.3.3 purrr_1.0.2 #> [46] rmarkdown_2.29 tools_4.4.2 pkgconfig_2.0.3 #> [49] htmltools_0.5.8.1"},{"path":"https://lcrawlab.github.io/sme/articles/tutorial-memory-optimization.html","id":"genotype-data-size-and-number-of-blocks","dir":"Articles","previous_headings":"","what":"Genotype Data Size and Number of Blocks","title":"How To Optimize the Memory Requirements of SME","text":"sample size primary factor influencing memory requirements. phenotype genotype data need loaded memory computation. large datasets, like Biobank-scale data (350k samples 500k SNPs), loading entire dataset memory requires 1.4TB (assuming double precision data matrix), exceeds machines’ capacities. manage large datasets efficiently, sme() reads genotype data smaller blocks. parameter n_blocks controls number blocks. instance, 500k SNPs, setting n_blocks = 100 load 5000 SNPs memory time, reducing memory load allowing computations proceed block block.","code":""},{"path":"https://lcrawlab.github.io/sme/articles/tutorial-memory-optimization.html","id":"number-of-random-vectors","dir":"Articles","previous_headings":"","what":"Number of Random Vectors","title":"How To Optimize the Memory Requirements of SME","text":"sme() function uses stochastic trace estimator approximate trace matrix products efficiently. number random vectors impacts accuracy trace estimates memory computational efficiency. blockwise computation, algorithm stores intermediate matrices sized sample_size x n_randvecs. Increasing number random vectors improves accuracy also increases memory usage computation time. Typically, using around 10 random vectors provides reasonably accurate results.","code":""},{"path":"https://lcrawlab.github.io/sme/articles/tutorial-memory-optimization.html","id":"number-of-snps-sharing-random-vectors","dir":"Articles","previous_headings":"","what":"Number of SNPs Sharing Random Vectors","title":"How To Optimize the Memory Requirements of SME","text":"chunk_size parameter controls many SNPs share set random vectors, enhancing efficiency genome-wide data processing. method reduces redundant calculations genetic relatedness covariance matrix minimizes time spent reading genotype data memory. set SNPs analyzed together (“chunk”), intermediate results must stored. Consequently, memory requirement grows chunk size, calculated : chunk_size x (sample_size x n_randvecs). Figure 1. Schematic overview illustrating compuational speedup resulting sharing random vectors. () randomized trace estimates can identify reusable matrix vector products. Computing exact trace product two covariance matrices prohibitively computationally expensive. Instead, sparse marginal epistasis (SME) test approximates traces using random vectors zz. full MQS computation point estimates variance components, see matrix--vector products form AzAz ∈{K,G}\\\\{K, G\\} appear repeatedly. (b) genetic relatedness matrix KK focal SNPs. Using unique random vectors computation every focal SNP, compute quantity repeatedly. Computing matrix--vector products KzKz constitutes almost half computation time point estimates. (c) sharing random vectors zz focal SNPs, computing KzKz can done focal SNPs share random vectors. , computation time KzKz becomes negligible.","code":""},{"path":"https://lcrawlab.github.io/sme/articles/tutorial-memory-optimization.html","id":"genotype-masking-for-the-gene-by-gene-interaction-covariance","dir":"Articles","previous_headings":"","what":"Genotype Masking for the Gene-by-Gene Interaction Covariance","title":"How To Optimize the Memory Requirements of SME","text":"Masking genotypes contribute epistasis can help reduce memory usage computation time. masked, genotypes need stored memory, significantly decreasing memory requirements. Note approximate_memory_requirements() function account reduction.","code":""},{"path":"https://lcrawlab.github.io/sme/articles/tutorial-memory-optimization.html","id":"explore-the-memory-requirements","dir":"Articles","previous_headings":"","what":"Explore the Memory Requirements","title":"How To Optimize the Memory Requirements of SME","text":"estimate memory needs based chosen parameters, use approximate_memory_requirements() function. function helps determine planned settings fit within available memory identify parameters can adjusted meet resource constraints. parameters n_blocks, n_randvecs, chunk_size particularly flexible significant impact memory usage. Note however, account masking therefore likely overestimates required memory.","code":"n_samples <- c(350000) n_snps <- c(500000) n_blocks <- c(1, 100, 1000) n_randvecs <- c(10, 100) chunk_size <- c(10, 100) parameters <- crossing( n_samples = n_samples, n_snps = n_snps, n_blocks = n_blocks, n_randvecs = n_randvecs, chunk_size = chunk_size ) estimated_memory <- parameters %>% mutate(memory_gb = round( approximate_memory_requirements(n_samples, n_snps, n_blocks, n_randvecs, chunk_size), 2 )) kable(estimated_memory)"},{"path":"https://lcrawlab.github.io/sme/articles/tutorial-memory-optimization.html","id":"a-note-on-the-runtime-of-sme","dir":"Articles","previous_headings":"","what":"A Note on the Runtime of SME","title":"How To Optimize the Memory Requirements of SME","text":"Despite computational efficiency SME, genome-wide testing requires considerable resources. recommend analyze data batches, launch multiple processes simultaneously high-performance cluster (HPC). study, analyzed 544k SNPs genotype 350k individuals. launched 544 slurm jobs requesting 43GB memory 6 CPUs analyze batches 1000 SNPs chunk sizes 250 SNPs. Genome-wide testing single trait HPC 960 CPUs 6840GB memory available took 3.5 days. Figure 2. SME improved power detect marginal epistasis runs 10x 90x faster state---art methods. CPU time measured 350,000 individuals.","code":""},{"path":"https://lcrawlab.github.io/sme/articles/tutorial-memory-optimization.html","id":"sessioninfo","dir":"Articles","previous_headings":"","what":"SessionInfo","title":"How To Optimize the Memory Requirements of SME","text":"","code":"sessionInfo() #> R version 4.4.2 (2024-10-31) #> Platform: x86_64-pc-linux-gnu #> Running under: Ubuntu 24.04.1 LTS #> #> Matrix products: default #> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 #> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0 #> #> locale: #> [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8 #> [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8 #> [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C #> [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C #> #> time zone: UTC #> tzcode source: system (glibc) #> #> attached base packages: #> [1] stats graphics grDevices utils datasets methods base #> #> other attached packages: #> [1] knitr_1.49 dplyr_1.1.4 tidyr_1.3.1 smer_0.0.1 #> #> loaded via a namespace (and not attached): #> [1] jsonlite_1.8.9 compiler_4.4.2 tidyselect_1.2.1 #> [4] Rcpp_1.0.14 FMStable_0.1-4 parallel_4.4.2 #> [7] jquerylib_0.1.4 systemfonts_1.2.0 textshaping_0.4.1 #> [10] harmonicmeanp_3.0.1 yaml_2.3.10 fastmap_1.2.0 #> [13] R6_2.5.1 generics_0.1.3 genio_1.1.2 #> [16] iterators_1.0.14 backports_1.5.0 checkmate_2.3.2 #> [19] tibble_3.2.1 desc_1.4.3 bslib_0.8.0 #> [22] pillar_1.10.1 rlang_1.1.4 cachem_1.1.0 #> [25] xfun_0.50 fs_1.6.5 sass_0.4.9 #> [28] cli_3.6.3 pkgdown_2.1.1 magrittr_2.0.3 #> [31] digest_0.6.37 mvMAPIT_2.0.3 foreach_1.5.2 #> [34] mvtnorm_1.3-3 lifecycle_1.0.4 CompQuadForm_1.4.3 #> [37] vctrs_0.6.5 evaluate_1.0.3 glue_1.8.0 #> [40] codetools_0.2-20 ragg_1.3.3 purrr_1.0.2 #> [43] rmarkdown_2.29 tools_4.4.2 pkgconfig_2.0.3 #> [46] htmltools_0.5.8.1"},{"path":"https://lcrawlab.github.io/sme/articles/tutorial-simulate-traits.html","id":"sessioninfo","dir":"Articles","previous_headings":"","what":"SessionInfo","title":"How To Simulate Traits","text":"","code":"sessionInfo() #> R version 4.4.2 (2024-10-31) #> Platform: x86_64-pc-linux-gnu #> Running under: Ubuntu 24.04.1 LTS #> #> Matrix products: default #> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 #> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0 #> #> locale: #> [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8 #> [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8 #> [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C #> [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C #> #> time zone: UTC #> tzcode source: system (glibc) #> #> attached base packages: #> [1] stats graphics grDevices utils datasets methods base #> #> other attached packages: #> [1] genio_1.1.2 smer_0.0.1 #> #> loaded via a namespace (and not attached): #> [1] jsonlite_1.8.9 dplyr_1.1.4 compiler_4.4.2 #> [4] tidyselect_1.2.1 Rcpp_1.0.14 FMStable_0.1-4 #> [7] parallel_4.4.2 tidyr_1.3.1 jquerylib_0.1.4 #> [10] systemfonts_1.2.0 textshaping_0.4.1 harmonicmeanp_3.0.1 #> [13] yaml_2.3.10 fastmap_1.2.0 R6_2.5.1 #> [16] generics_0.1.3 knitr_1.49 iterators_1.0.14 #> [19] backports_1.5.0 checkmate_2.3.2 tibble_3.2.1 #> [22] desc_1.4.3 bslib_0.8.0 pillar_1.10.1 #> [25] rlang_1.1.4 cachem_1.1.0 xfun_0.50 #> [28] fs_1.6.5 sass_0.4.9 cli_3.6.3 #> [31] pkgdown_2.1.1 magrittr_2.0.3 digest_0.6.37 #> [34] mvMAPIT_2.0.3 foreach_1.5.2 mvtnorm_1.3-3 #> [37] lifecycle_1.0.4 CompQuadForm_1.4.3 vctrs_0.6.5 #> [40] evaluate_1.0.3 glue_1.8.0 codetools_0.2-20 #> [43] ragg_1.3.3 purrr_1.0.2 rmarkdown_2.29 #> [46] tools_4.4.2 pkgconfig_2.0.3 htmltools_0.5.8.1"},{"path":"https://lcrawlab.github.io/sme/authors.html","id":null,"dir":"","previous_headings":"","what":"Authors","title":"Authors and Citation","text":"Julian Stamp. Maintainer, author. Lorin Crawford. Author. sriramlab. Copyright holder. Author included mailman algorithm Blue Brain Project/EPFL. Copyright holder. Author included HighFive library","code":""},{"path":"https://lcrawlab.github.io/sme/authors.html","id":"citation","dir":"","previous_headings":"","what":"Citation","title":"Authors and Citation","text":"Stamp J, Crawford L (2025). smer: Sparse Marginal Epistasis Test. R package version 0.0.1, https://lcrawlab.github.io/sme/, https://github.com/lcrawlab/sme.","code":"@Manual{, title = {smer: Sparse Marginal Epistasis Test}, author = {Julian Stamp and Lorin Crawford}, year = {2025}, note = {R package version 0.0.1, https://lcrawlab.github.io/sme/}, url = {https://github.com/lcrawlab/sme}, }"},{"path":"https://lcrawlab.github.io/sme/index.html","id":"the-sparse-marginal-epistasis-test-","dir":"","previous_headings":"","what":"Sparse Marginal Epistasis Test","title":"Sparse Marginal Epistasis Test","text":"smer package implements computationally statistically efficient method detecting marginal epistasis genome-wide association studies (GWAS). Find full package documentation including examples articles : Sparse Marginal Epistasis test Documentation.","code":""},{"path":"https://lcrawlab.github.io/sme/index.html","id":"key-features","dir":"","previous_headings":"","what":"Key Features","title":"Sparse Marginal Epistasis Test","text":"Hutchinson’s stochastic trace estimator: efficient scalable computation Mailman algorithm: fast vector--matrix operation Linear mixed model: controls population structure Multimodal Input: incorporates additional data HDF5 files improve power detecting gene--gene interactions. Optimize Memory Constraints: Highly configurable block wise processing data allows make available resources. See also Optimize Memory Requirements SME. Parallelization: Utilizes OpenMP multi-threaded processing.","code":""},{"path":[]},{"path":"https://lcrawlab.github.io/sme/index.html","id":"installation-from-cran","dir":"","previous_headings":"Installation","what":"Installation from CRAN","title":"Sparse Marginal Epistasis Test","text":"can install latest release CRAN","code":"install.packages(\"smer\")"},{"path":"https://lcrawlab.github.io/sme/index.html","id":"installation-from-source","dir":"","previous_headings":"Installation","what":"Installation from source","title":"Sparse Marginal Epistasis Test","text":"can install development version smer GitHub :","code":"install.packages(\"devtools\") devtools::install_github(\"lcrawlab/sme\")"},{"path":"https://lcrawlab.github.io/sme/index.html","id":"dependencies","dir":"","previous_headings":"","what":"Dependencies","title":"Sparse Marginal Epistasis Test","text":"System requirements package: GNU make R (>= 4.4) Rhdf5lib (BioConductor) OpenMP (optional) install Rhdf5lib, first install tool BiocManager CRAN, install library using tool. full list R dependencies can found DESCRIPTION file.","code":"if (!require(\"BiocManager\", quietly = TRUE)) install.packages(\"BiocManager\") BiocManager::install(\"Rhdf5lib\")"},{"path":"https://lcrawlab.github.io/sme/index.html","id":"openmp","dir":"","previous_headings":"Dependencies","what":"OpenMP","title":"Sparse Marginal Epistasis Test","text":"OS X Linux, OpenMP library can installed via one (shell) commands specified : enable openMP, may necessary configure compiler flags SHLIB_OPENMP_CXXFLAGS LDFLAGS ~/.R/Makevars file.","code":""},{"path":"https://lcrawlab.github.io/sme/index.html","id":"known-issues","dir":"","previous_headings":"","what":"Known Issues","title":"Sparse Marginal Epistasis Test","text":"Compiling package requires compiler find libraries dependencies. unix systems, libraries typically installed /usr/local/lib /usr/local/include. users using OS X homebrew, libraries typically installed /opt/homebrew/lib /opt/homebrew/include. Non-standard library paths need configured. src/Makevars file configures compiler flags considers LDFLAGS CPPFLAGS ~/.R/Makevars file.","code":""},{"path":"https://lcrawlab.github.io/sme/index.html","id":"references","dir":"","previous_headings":"","what":"References","title":"Sparse Marginal Epistasis Test","text":"Stamp J, Crawford L (2025). smer: Sparse Marginal Epistasis Test. R package version 0.0.1, https://lcrawlab.github.io/sme/, https://github.com/lcrawlab/sme. Stamp J, Smith Pattillo S, Weinreich D, Crawford L (2025). Sparse modeling interactions enables fast detection genome-wide epistasis biobank-scale studies. biorxiv, https://doi.org/10.1101/2025.01.11.632557 Stamp J, Crawford L (2024). mvMAPIT: Multivariate Genome Wide Marginal Epistasis Test. R package version 2.0.3, https://lcrawlab.github.io/mvMAPIT/, https://github.com/lcrawlab/mvMAPIT. Stamp et al. (2023): Leveraging genetic correlation traits epistasis detection GWAS. G3: Genes, Genomes, Genetics. Fu, B., Pazokitoroudi, ., Xue, ., Anand, ., Anand, P., Zaitlen, N., & Sankararaman, S. (2023). biobank-scale test marginal epistasis reveals genome-wide signals polygenic epistasis. bioRxiv. Crawford et al. (2017): Detecting epistasis marginal epistasis test. PLoS Genetics. Devresse et al. (2024): HighFive - Header-C++ HDF5 interface. https://zenodo.org/records/13120799","code":""},{"path":"https://lcrawlab.github.io/sme/reference/approximate_memory_requirements.html","id":null,"dir":"Reference","previous_headings":"","what":"Estimate Memory Requirements for SME Routine — approximate_memory_requirements","title":"Estimate Memory Requirements for SME Routine — approximate_memory_requirements","text":"function provides approximate estimate memory requirements (gigabytes) running Sparse Marginal Epistasis (SME) routine based input parameters number samples, SNPs, configurations.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/approximate_memory_requirements.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"Estimate Memory Requirements for SME Routine — approximate_memory_requirements","text":"","code":"approximate_memory_requirements( n_samples, n_snps, n_blocks, n_randvecs, chunksize )"},{"path":"https://lcrawlab.github.io/sme/reference/approximate_memory_requirements.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"Estimate Memory Requirements for SME Routine — approximate_memory_requirements","text":"n_samples Integer. number samples dataset. n_snps Integer. total number SNPs dataset. n_blocks Integer. number genotype blocks used partition SNPs. Affects size encoded genotype segments. n_randvecs Integer. number random vectors used stochastic trace estimation. Affects memory operations involving random vectors. chunksize Integer. number focal SNPs processed per chunk.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/approximate_memory_requirements.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"Estimate Memory Requirements for SME Routine — approximate_memory_requirements","text":"Numeric. approximate memory requirement (gigabytes) SME routine.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/approximate_memory_requirements.html","id":"details","dir":"Reference","previous_headings":"","what":"Details","title":"Estimate Memory Requirements for SME Routine — approximate_memory_requirements","text":"function calculates memory usage summing contributions various components used SME routine, including: Variance component estimates (vc_estimates) Phenotype-related matrices Random vector-based computations Genotype objects block statistics Gene--gene interaction masks estimated memory requirement derived data dimensions operational needs, provides guideline configuring resources analysis.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/approximate_memory_requirements.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"Estimate Memory Requirements for SME Routine — approximate_memory_requirements","text":"","code":"n_samples <- 1e5 n_snps <- 1e6 n_blocks <- 100 n_randvecs <- 100 chunksize <- 10 approximate_memory_requirements(n_samples, n_snps, n_blocks, n_randvecs, chunksize) #> [1] 6.447136"},{"path":"https://lcrawlab.github.io/sme/reference/create_hdf5_file.html","id":null,"dir":"Reference","previous_headings":"","what":"Create an HDF5 File — create_hdf5_file","title":"Create an HDF5 File — create_hdf5_file","text":"function creates new, empty HDF5 file specified location.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/create_hdf5_file.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"Create an HDF5 File — create_hdf5_file","text":"","code":"create_hdf5_file(hdf5_file)"},{"path":"https://lcrawlab.github.io/sme/reference/create_hdf5_file.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"Create an HDF5 File — create_hdf5_file","text":"hdf5_file character string specifying path name HDF5 file created.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/create_hdf5_file.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"Create an HDF5 File — create_hdf5_file","text":"return value; function creates HDF5 file specified location.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/create_hdf5_file.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"Create an HDF5 File — create_hdf5_file","text":"","code":"# Create an empty HDF5 file hdf5_file <- tempfile() create_hdf5_file(hdf5_file)"},{"path":"https://lcrawlab.github.io/sme/reference/getting_started.html","id":null,"dir":"Reference","previous_headings":"","what":"Simulated Dataset for Genome-Wide Interaction Analysis — getting_started","title":"Simulated Dataset for Genome-Wide Interaction Analysis — getting_started","text":"getting_started simulated dataset created demonstrate use sme() function genome-wide interaction analyses. contains results simulated analysis involving additive genetic effects gene--gene (GxG) interactions.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/getting_started.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"Simulated Dataset for Genome-Wide Interaction Analysis — getting_started","text":"","code":"data(\"getting_started\")"},{"path":"https://lcrawlab.github.io/sme/reference/getting_started.html","id":"format","dir":"Reference","previous_headings":"","what":"Format","title":"Simulated Dataset for Genome-Wide Interaction Analysis — getting_started","text":"list results sme(), including following components: summary data frame summarizing analysis results, including p-values SNP associations (p). pve data frame containing per SNP variance component estimates normalized phenotypic variance explained (PVE). vc data frame containing per SNP variance component estimates. gxg_snps vector containing indices SNPs assigned epistatic interactions trait simulations.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/getting_started.html","id":"source","dir":"Reference","previous_headings":"","what":"Source","title":"Simulated Dataset for Genome-Wide Interaction Analysis — getting_started","text":"data-raw/getting_started.R","code":""},{"path":"https://lcrawlab.github.io/sme/reference/getting_started.html","id":"details","dir":"Reference","previous_headings":"","what":"Details","title":"Simulated Dataset for Genome-Wide Interaction Analysis — getting_started","text":"dataset generated follows: Genotype Simulation: Genotype data 5000 individuals 6,000 SNPs simulated synthetic allele counts. Phenotype Simulation: Phenotypic values simulated additive heritability 0.3 GxG interaction heritability 0.25. set 100 SNPs selected additive effects, two groups 5 SNPs used GxG interactions. PLINK-Compatible Files: simulated data saved PLINK-compatible .bed, .fam, .bim files. Interaction Analysis: sme() function used perform genome-wide interaction analyses subset SNP indices, including GxG SNP groups 100 additional additive SNPs. Memory-efficient computation parameters (e.g., chun_ksize, n_randvecs, n_blocks) applied.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/getting_started.html","id":"key-parameters","dir":"Reference","previous_headings":"","what":"Key Parameters","title":"Simulated Dataset for Genome-Wide Interaction Analysis — getting_started","text":"Additive Heritability: 0.3 GxG Heritability: 0.25 Number Samples: 5000 Number SNPs: 6,000 Selected Additive SNPs: 100 Selected GxG SNP Groups: Group 1: 5 SNPs Group 2: 5 SNPs","code":""},{"path":[]},{"path":"https://lcrawlab.github.io/sme/reference/getting_started.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"Simulated Dataset for Genome-Wide Interaction Analysis — getting_started","text":"","code":"data(\"getting_started\") head(getting_started$summary) #> # A tibble: 6 × 9 #> id index chromosome position p pve vc se true_gxg_snp #> #> 1 rs1498 1498 1 1498 0.000581 0.0447 0.0446 0.0137 TRUE #> 2 rs2032 2032 1 2032 0.00722 0.0377 0.0377 0.0154 TRUE #> 3 rs2364 2364 1 2364 0.00178 0.0450 0.0450 0.0154 TRUE #> 4 rs2867 2867 1 2867 0.000496 0.0519 0.0518 0.0157 TRUE #> 5 rs4610 4610 1 4610 0.0000783 0.0581 0.0580 0.0153 TRUE #> 6 rs822 822 1 822 0.00522 0.0367 0.0367 0.0143 TRUE"},{"path":"https://lcrawlab.github.io/sme/reference/read_hdf5_dataset.html","id":null,"dir":"Reference","previous_headings":"","what":"Read Dataset from an HDF5 File — read_hdf5_dataset","title":"Read Dataset from an HDF5 File — read_hdf5_dataset","text":"function reads dataset existing HDF5 file.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/read_hdf5_dataset.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"Read Dataset from an HDF5 File — read_hdf5_dataset","text":"","code":"read_hdf5_dataset(file_name, dataset_name)"},{"path":"https://lcrawlab.github.io/sme/reference/read_hdf5_dataset.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"Read Dataset from an HDF5 File — read_hdf5_dataset","text":"file_name character string specifying path HDF5 file. dataset_name character string specifying name dataset within HDF5 file read.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/read_hdf5_dataset.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"Read Dataset from an HDF5 File — read_hdf5_dataset","text":"content dataset HDF5 file, typically form R object.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/read_hdf5_dataset.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"Read Dataset from an HDF5 File — read_hdf5_dataset","text":"","code":"data_to_write <- 1:10 # Create an empty HDF5 file hdf5_file <- tempfile() create_hdf5_file(hdf5_file) # Write new data to a dataset in the HDF5 file write_hdf5_dataset(hdf5_file, \"group/dataset\", data_to_write) # Read a dataset from an HDF5 file hdf5_data <- read_hdf5_dataset(hdf5_file, \"group/dataset\") print(hdf5_data) #> [1] 1 2 3 4 5 6 7 8 9 10"},{"path":"https://lcrawlab.github.io/sme/reference/simulate_traits.html","id":null,"dir":"Reference","previous_headings":"","what":"Simulate Quantitative Traits from PLINK Genotypes — simulate_traits","title":"Simulate Quantitative Traits from PLINK Genotypes — simulate_traits","text":"function simulates quantitative trait based additive epistatic genetic effects using genotype data PLINK dataset. simulated trait saved specified output file phenotype format compatible PLINK.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/simulate_traits.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"Simulate Quantitative Traits from PLINK Genotypes — simulate_traits","text":"","code":"simulate_traits( plink_file, output_file, additive_heritability, gxg_heritability, additive_indices, gxg_indices_1, gxg_indices_2, log_level = \"WARNING\" )"},{"path":"https://lcrawlab.github.io/sme/reference/simulate_traits.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"Simulate Quantitative Traits from PLINK Genotypes — simulate_traits","text":"plink_file Character. Path PLINK dataset (without file extension). function append .bed, .bim, .fam extensions needed. output_file Character. Path output file simulated trait saved. additive_heritability Numeric. value 0 1 specifying proportion trait variance due additive genetic effects. gxg_heritability Numeric. value 0 1 specifying proportion trait variance due gene--gene (epistatic) interactions. sum additive_heritability gxg_heritability must exceed 1. additive_indices Integer vector. Indices SNPs contributing additive genetic effects. gxg_indices_1 Integer vector. Indices SNPs first group epistatic interactions. gxg_indices_2 Integer vector. Indices SNPs second group epistatic interactions. log_level Character. Logging level messages (e.g., \"DEBUG\", \"INFO\", \"WARNING\"). Default \"WARNING\".","code":""},{"path":"https://lcrawlab.github.io/sme/reference/simulate_traits.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"Simulate Quantitative Traits from PLINK Genotypes — simulate_traits","text":"None. simulated trait written specified output_file.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/simulate_traits.html","id":"details","dir":"Reference","previous_headings":"","what":"Details","title":"Simulate Quantitative Traits from PLINK Genotypes — simulate_traits","text":"function uses following components simulate trait: Additive genetic effects: Determined additive_indices specified additive_heritability. Epistatic interactions: Simulated using pairs SNPs gxg_indices_1 gxg_indices_2, contributing gxg_heritability. Environmental effects: remaining variance explained genetic effects assigned random environmental noise. output file PLINK-compatible phenotype format three columns: Family ID (FID), Individual ID (IID), simulated trait (TRAIT).","code":""},{"path":"https://lcrawlab.github.io/sme/reference/simulate_traits.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"Simulate Quantitative Traits from PLINK Genotypes — simulate_traits","text":"","code":"plink_file <- gsub(\"\\\\.bed\", \"\", system.file(\"testdata\", \"test.bed\", package = \"smer\")) out_file <- tempfile() additive_heritability <- 0.3 gxg_heritability <- 0.1 additive_snps <- sort(sample(1:100, 50, replace = FALSE)) gxg_group_1 <- sort(sample(additive_snps, 10, replace = FALSE)) gxg_group_2 <- sort(sample(setdiff(additive_snps, gxg_group_1), 10, replace = FALSE)) n_samples <- 200 simulate_traits( plink_file, out_file, additive_heritability, gxg_heritability, additive_snps, gxg_group_1, gxg_group_2 ) from_file <- read.table(out_file, header = TRUE) head(from_file) #> FID IID TRAIT #> 1 1 1 -0.96907670 #> 2 2 1 1.04343383 #> 3 3 1 0.38835392 #> 4 4 1 -0.02194594 #> 5 5 1 -0.27492285 #> 6 6 1 0.68657810"},{"path":"https://lcrawlab.github.io/sme/reference/sme.html","id":null,"dir":"Reference","previous_headings":"","what":"Sparse Marginal Epistasis Test (SME) — sme","title":"Sparse Marginal Epistasis Test (SME) — sme","text":"SME fits linear mixed model order test marginal epistasis. concentrates scans epistasis regions genome known functional enrichment trait interest.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/sme.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"Sparse Marginal Epistasis Test (SME) — sme","text":"","code":"sme( plink_file, pheno_file, mask_file = NULL, gxg_indices = NULL, chunk_size = NULL, n_randvecs = 10, n_blocks = 100, n_threads = 1, gxg_h5_group = \"gxg\", ld_h5_group = \"ld\", rand_seed = -1, log_level = \"WARNING\" )"},{"path":"https://lcrawlab.github.io/sme/reference/sme.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"Sparse Marginal Epistasis Test (SME) — sme","text":"plink_file Character. File path PLINK dataset (without *.bed extension). function append .bim, .bed, .fam extensions automatically. genotype data must missing genotypes. Use PLINK remove variants missing genotypes impute . pheno_file Character. File path phenotype file PLINK format. file contain exactly one phenotype column. mask_file Character NULL. File path HDF5 file specifying per-SNP masks gene--gene interaction tests. file informs SNPs tested marginal epistasis. Defaults NULL, indicating masking. Masking impacts scaling memory time. gxg_indices Integer vector NULL. List indices corresponding SNPs test marginal epistasis. NULL, SNPs dataset tested. indices 1-based. chunk_size Integer NULL. Number SNPs processed per chunk. influences memory usage can left NULL automatically determine chunk size based gxg_indices number threads. n_randvecs Integer. Number random vectors used stochastic trace estimation. Higher values yield accurate estimates increase computational cost. Default 10. n_blocks Integer. Number blocks SNPs divided processing. parameter affects memory requirements. Default 100. n_threads Integer. Number threads OpenMP parallel processing. Default 1. gxg_h5_group Character. Name HDF5 group within mask file containing gene--gene interaction masks. SNPs group included gene--gene interactions. Defaults \"gxg\". ld_h5_group Character. Name HDF5 group within mask file containing linkage disequilibrium masks. SNPs group excluded analysis. Defaults \"ld\". rand_seed Integer. Seed random vector generation. -1, seed set. Default -1. log_level Character. Logging level messages. Must uppercase (e.g., \"DEBUG\", \"INFO\", \"WARNING\", \"ERROR\"). Default \"WARNING\".","code":""},{"path":"https://lcrawlab.github.io/sme/reference/sme.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"Sparse Marginal Epistasis Test (SME) — sme","text":"list containing: summary: tibble summarizing results tested SNP, including: id: Variant ID. index: Index SNP dataset. chromosome: Chromosome number. position: Genomic position SNP. p: P value gene--gene interaction test. pve: Proportion variance explained (PVE) gene--gene interactions. vc: Variance component estimate. se: Standard error variance component. pve: long-format tibble PVE variance components. vc_estimate: long-format tibble variance component estimates. vc_se: long-format tibble standard errors variance components. average_duration: Average computation time per SNP.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/sme.html","id":"details","dir":"Reference","previous_headings":"","what":"Details","title":"Sparse Marginal Epistasis Test (SME) — sme","text":"function integrates PLINK-formatted genotype phenotype data perform marginal epistasis tests set SNPs. Using stochastic trace estimation, method computes variance components gene--gene interaction genetic relatedness using MQS estimator. process parallelized using OpenMP n_threads > 1. memory requirements computation time scaling can optimized parameters chunk_size, n_randvecs, n_blocks. Mask Format Requirements mask file format HDF5 file used storing index data masking process. format supports data retrieval index. required groups datasets within HDF5 file: required group names can configured input parameters. defaults described . Groups: ld: Stores SNPs LD focal SNP. SNPs excluded. gxg: Stores indices SNPs marginal epistasis test conditioned . SNPs included. Datasets: ld/: focal SNP , dataset contains indices SNPs LD block SNP. SNPs excluded gene--gene interaction covariance matrix. gxg/: focal SNP , dataset contains indices SNPs include gene--gene interaction covariance matrix focal SNP . Important: indices mask file data zero-based, matching zero-based indices PLINK .bim file.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/sme.html","id":"references","dir":"Reference","previous_headings":"","what":"References","title":"Sparse Marginal Epistasis Test (SME) — sme","text":"Stamp, J., Pattillo Smith, S., Weinreich, D., & Crawford, L. (2025). Sparse modeling interactions enables fast detection genome-wide epistasis biobank-scale studies. bioRxiv, 2025.01.11.632557. Stamp, J., DenAdel, ., Weinreich, D., & Crawford, L. (2023). Leveraging genetic correlation traits improves detection epistasis genome-wide association studies. G3: Genes, Genomes, Genetics, 13(8), jkad118. Crawford, L., Zeng, P., Mukherjee, S., & Zhou, X. (2017). Detecting epistasis marginal epistasis test genetic mapping studies quantitative traits. PLoS genetics, 13(7), e1006869.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/sme.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"Sparse Marginal Epistasis Test (SME) — sme","text":"","code":"plink_file <- gsub(\"\\\\.bed\", \"\", system.file(\"testdata\", \"test.bed\", package=\"smer\")) pheno_file <- system.file(\"testdata\", \"test_h2_0.5.pheno\", package=\"smer\") mask_file <- \"\" # Parameter inputs chunk_size <- 10 n_randvecs <- 10 n_blocks <- 10 n_threads <- 1 # 1-based Indices of SNPs to be analyzed n_snps <- 100 snp_indices <- 1:n_snps sme_result <- sme( plink_file, pheno_file, mask_file, snp_indices, chunk_size, n_randvecs, n_blocks, n_threads ) head(sme_result$summary) #> # A tibble: 6 × 8 #> id index chromosome position p pve vc se #> #> 1 rs1 1 1 1 0.724 -0.0233 -0.0228 0.0382 #> 2 rs2 2 1 2 0.201 0.0407 0.0395 0.0472 #> 3 rs3 3 1 3 0.273 0.0464 0.0455 0.0754 #> 4 rs4 4 1 4 0.401 0.0161 0.0157 0.0624 #> 5 rs5 5 1 5 0.855 -0.0389 -0.0377 0.0357 #> 6 rs6 6 1 6 0.323 0.0306 0.0299 0.0650"},{"path":"https://lcrawlab.github.io/sme/reference/write_hdf5_dataset.html","id":null,"dir":"Reference","previous_headings":"","what":"Write Data to an HDF5 Dataset — write_hdf5_dataset","title":"Write Data to an HDF5 Dataset — write_hdf5_dataset","text":"function writes new data existing HDF5 file. dataset already exists, replaced new data.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/write_hdf5_dataset.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"Write Data to an HDF5 Dataset — write_hdf5_dataset","text":"","code":"write_hdf5_dataset(file_name, dataset_name, new_data)"},{"path":"https://lcrawlab.github.io/sme/reference/write_hdf5_dataset.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"Write Data to an HDF5 Dataset — write_hdf5_dataset","text":"file_name character string specifying path HDF5 file. dataset_name character string specifying name dataset written HDF5 file. new_data new data write dataset. data must compatible dataset's structure.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/write_hdf5_dataset.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"Write Data to an HDF5 Dataset — write_hdf5_dataset","text":"return value; function modifies specified dataset HDF5 file.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/write_hdf5_dataset.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"Write Data to an HDF5 Dataset — write_hdf5_dataset","text":"","code":"data_to_write <- 1:10 # Create an empty HDF5 file hdf5_file <- tempfile() create_hdf5_file(hdf5_file) # Write new data to a dataset in the HDF5 file write_hdf5_dataset(hdf5_file, \"group/dataset\", data_to_write)"},{"path":"https://lcrawlab.github.io/sme/news/index.html","id":"smer-001","dir":"Changelog","previous_headings":"","what":"smer 0.0.1","title":"smer 0.0.1","text":"CRAN release: 2025-01-16 Version used publication SME.","code":""}] +[{"path":"https://lcrawlab.github.io/sme/LICENSE.html","id":null,"dir":"","previous_headings":"","what":"MIT License","title":"MIT License","text":"Copyright (c) 2024 smer authors Permission hereby granted, free charge, person obtaining copy software associated documentation files (“Software”), deal Software without restriction, including without limitation rights use, copy, modify, merge, publish, distribute, sublicense, /sell copies Software, permit persons Software furnished , subject following conditions: copyright notice permission notice shall included copies substantial portions Software. SOFTWARE PROVIDED “”, WITHOUT WARRANTY KIND, EXPRESS IMPLIED, INCLUDING LIMITED WARRANTIES MERCHANTABILITY, FITNESS PARTICULAR PURPOSE NONINFRINGEMENT. EVENT SHALL AUTHORS COPYRIGHT HOLDERS LIABLE CLAIM, DAMAGES LIABILITY, WHETHER ACTION CONTRACT, TORT OTHERWISE, ARISING , CONNECTION SOFTWARE USE DEALINGS SOFTWARE.","code":""},{"path":"https://lcrawlab.github.io/sme/articles/smer.html","id":"run-sme-with-plink-data","dir":"Articles","previous_headings":"","what":"Run SME with PLINK data","title":"How To Use the Sparse Marginal Epistasis Test","text":"","code":"library(smer) library(dplyr) library(ggplot2)"},{"path":"https://lcrawlab.github.io/sme/articles/smer.html","id":"data-requirements-and-file-formats","dir":"Articles","previous_headings":"Run SME with PLINK data","what":"Data Requirements and File Formats","title":"How To Use the Sparse Marginal Epistasis Test","text":"SME implemented R requires genetic data PLINK format, consists three files: .bim: Contains SNP information .bed: Contains genetic data .fam: Contains sample information Note: sme() handle missing genotypes. Prior running sme(), use PLINK remove variants missing genotypes impute . Additionally, phenotype (trait) data separate file following PLINK’s phenotype format. sme() function includes parameters let control memory usage computational resources. detailed guidance optimizing settings system, please see tutorial Optimize Memory Requirements SME.","code":""},{"path":"https://lcrawlab.github.io/sme/articles/smer.html","id":"specifying-snps-for-analysis","dir":"Articles","previous_headings":"Run SME with PLINK data","what":"Specifying SNPs for Analysis","title":"How To Use the Sparse Marginal Epistasis Test","text":"selecting SNPs analyze, must provide positions 1-based indices. indices correspond row numbers .bim file, first SNP index 1, second index 2, . complete details function parameters, please refer documentation sme() function. analysis uses simulated data demonstration purposes. simulated synthetic phenotypes 5000 synthetic genotypes 6000 SNPs. like learn simulate data, please refer tutorial Simulate Traits.","code":"# File inputs plink_file <- \"path/to/plink/file\" pheno_file <- \"path/to/pheno/file\" mask_file <- \"path/to/mask/file\" # Parameter inputs chun_ksize <- 10 n_randvecs <- 10 n_blocks <- 10 n_threads <- 5 # 1-based Indices of SNPs to be analyzed n_snps <- 100 snp_indices <- 1:n_snps sme_result <- sme( plink_file, pheno_file, mask_file, snp_indices, chunk_size, n_randvecs, n_blocks, n_threads )"},{"path":"https://lcrawlab.github.io/sme/articles/smer.html","id":"understanding-the-results","dir":"Articles","previous_headings":"Run SME with PLINK data","what":"Understanding the Results","title":"How To Use the Sparse Marginal Epistasis Test","text":"call sme() function, returns list containing multiple elements. important one called summary, contains main analysis results. results formatted tidy data, making compatible popular R packages like ggplot2 dplyr analysis visualization.","code":""},{"path":"https://lcrawlab.github.io/sme/articles/smer.html","id":"visualizing-genomic-associations","dir":"Articles","previous_headings":"Run SME with PLINK data","what":"Visualizing Genomic Associations","title":"How To Use the Sparse Marginal Epistasis Test","text":"use Manhattan plots visualize genome-wide analyses effectively highlight strong associations genetic variants traits. case, Manhattan plot specifically shows statistical epistasis (interactions genes). reference, ’ve marked true causal SNPs (Single Nucleotide Polymorphisms) green plot - genetic variants included simulation real effects.","code":"sme_result$summary %>% ggplot(aes( x = index, y = -log10(p), color = true_gxg_snp )) + geom_point() + xlab(\"Position\") + labs(color = \"Epistatic SNP\")"},{"path":"https://lcrawlab.github.io/sme/articles/smer.html","id":"understanding-variance-components-and-effect-sizes","dir":"Articles","previous_headings":"Run SME with PLINK data","what":"Understanding Variance Components and Effect Sizes","title":"How To Use the Sparse Marginal Epistasis Test","text":"SME estimates much total trait variation can explained genetic interactions. simulation, set Phenotypic Variance Explained (PVE) 5% SNP involved epistatic interactions. plot shows well method recovered effects. displays two distributions: estimated PVE SNPs know truly involved epistatic interactions estimated PVE SNPs real epistatic effects dashed line marks true 5% PVE level used simulation, allowing see accurately method estimated actual effect sizes.","code":"sme_result$summary %>% ggplot(aes(x = true_gxg_snp, y = pve, fill = true_gxg_snp)) + geom_boxplot() + geom_hline(yintercept = 0.05, color = \"grey40\", linetype = \"dashed\") + annotate(\"text\", x = 0.8, y = 0.055, label = \"True per SNP epistatic PVE\", color = \"black\") + xlab(\"Epistatic SNP\") + ylab(\"Phenotypic Variance Explained\") + theme(legend.position = \"none\")"},{"path":"https://lcrawlab.github.io/sme/articles/smer.html","id":"narrow-sense-heritability-estimates","dir":"Articles","previous_headings":"Run SME with PLINK data","what":"Narrow Sense Heritability Estimates","title":"How To Use the Sparse Marginal Epistasis Test","text":"SME method uses linear mixed model separate different sources trait variation. One key component estimates narrow sense heritability (h2h^2), measures much trait variation can explained additive genetic effects. plot breaks estimated sources trait variation: “grm”: Shows narrow sense heritability (h2h^2) “gxg”: Shows variance due gene--gene interactions “error”: Shows unexplained variance, environmental effects simulation, set true narrow sense heritability 30%, shown dashed line plot. reference line helps evaluate accurately SME estimated genetic components trait variation. estimate narrow-sense heritability h2h^2 much less variable always informed genetic relatedness matrix. small data example overestimates heritability unbiased general.","code":"sme_result$vc_estimate %>% ggplot(aes(x = component, y = vc_estimate, fill = component)) + geom_boxplot() + geom_hline(yintercept = 0.3, color = \"grey40\", linetype = \"dashed\") + annotate(\"text\", x = 0.7, y = 0.33, label = expression(\"True \" * h^2), color = \"black\") + xlab(\"Component\") + ylab(\"Variance Component Estimate\") + theme(legend.position = \"none\")"},{"path":"https://lcrawlab.github.io/sme/articles/smer.html","id":"sessioninfo","dir":"Articles","previous_headings":"","what":"SessionInfo","title":"How To Use the Sparse Marginal Epistasis Test","text":"","code":"sessionInfo() #> R version 4.4.2 (2024-10-31) #> Platform: x86_64-pc-linux-gnu #> Running under: Ubuntu 24.04.1 LTS #> #> Matrix products: default #> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 #> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0 #> #> locale: #> [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8 #> [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8 #> [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C #> [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C #> #> time zone: UTC #> tzcode source: system (glibc) #> #> attached base packages: #> [1] stats graphics grDevices utils datasets methods base #> #> other attached packages: #> [1] ggplot2_3.5.1 dplyr_1.1.4 smer_0.0.1 #> #> loaded via a namespace (and not attached): #> [1] gtable_0.3.6 jsonlite_1.8.9 compiler_4.4.2 #> [4] tidyselect_1.2.1 Rcpp_1.0.14 FMStable_0.1-4 #> [7] parallel_4.4.2 tidyr_1.3.1 jquerylib_0.1.4 #> [10] scales_1.3.0 systemfonts_1.2.0 textshaping_0.4.1 #> [13] harmonicmeanp_3.0.1 yaml_2.3.10 fastmap_1.2.0 #> [16] R6_2.5.1 labeling_0.4.3 generics_0.1.3 #> [19] knitr_1.49 genio_1.1.2 iterators_1.0.14 #> [22] backports_1.5.0 checkmate_2.3.2 tibble_3.2.1 #> [25] desc_1.4.3 munsell_0.5.1 bslib_0.8.0 #> [28] pillar_1.10.1 rlang_1.1.4 cachem_1.1.0 #> [31] xfun_0.50 fs_1.6.5 sass_0.4.9 #> [34] cli_3.6.3 withr_3.0.2 pkgdown_2.1.1 #> [37] magrittr_2.0.3 grid_4.4.2 digest_0.6.37 #> [40] mvMAPIT_2.0.3 foreach_1.5.2 mvtnorm_1.3-3 #> [43] lifecycle_1.0.4 CompQuadForm_1.4.3 vctrs_0.6.5 #> [46] evaluate_1.0.3 glue_1.8.0 farver_2.1.2 #> [49] codetools_0.2-20 ragg_1.3.3 colorspace_2.1-1 #> [52] purrr_1.0.2 rmarkdown_2.29 tools_4.4.2 #> [55] pkgconfig_2.0.3 htmltools_0.5.8.1"},{"path":"https://lcrawlab.github.io/sme/articles/study-erythroid-differentiation-data.html","id":"dnase-i-hypersensitive-sites-of-erythroid-differentiation-reveal-statistical-epistasis-in-human-hematology-traits","dir":"Articles","previous_headings":"","what":"DNAse I hypersensitive sites of erythroid differentiation reveal statistical epistasis in human hematology traits","title":"Conditioning Epistasis Search on Open Chromatin","text":"apply SME hematology traits white British individuals UK Biobank. quality control, remaining data 349,411 individuals 543,813 SNPs common variants. select traits mean corpuscular hemoglobin (MCH), mean corpuscular hemoglobin concentration, mean corpuscular volume (MCV), hematocrit (HCT). external sparse data source, leverage DNase -hypersensitive sites (DHSs) data measured 12 days ex-vivo erythroid differentiation (Georgolopoulos et al. 2024). DHS enriched transcriptional activity used identify regulatory DNA. first three traits, MCH, MCHC, MCV traits red blood cells (RBC). Previous GWAS studies found genes associated traits implicated erythroid differentiation. Therefore, expect genomic data indicates regulatory regions gathered erythropoiesis informative traits. HCT measures percentage red blood cells blood. maturation erythroid progenitor cells regulated oxygen-sensing mechanism. hypothesise HCT, informed functional data erythropoiesis.","code":""},{"path":"https://lcrawlab.github.io/sme/articles/study-erythroid-differentiation-data.html","id":"mask-file-preparation","dir":"Articles","previous_headings":"DNAse I hypersensitive sites of erythroid differentiation reveal statistical epistasis in human hematology traits","what":"Mask File Preparation","title":"Conditioning Epistasis Search on Open Chromatin","text":"external data sources used study represented genomic intervals DHS regions LD blocks. following mock data illustrate create mask file sme(). See Create Mask File details. make use package GenomicRanges efficiently map PLINK data intervals DHS LD data. 543,813 SNPs data, 4,932 located DHS regions. DHS regions data distributed along whole genome. test marginal epistasis SME consider variants DHS regions important. Next map PLINK data LD blocks. objects, can create mask file. larger data, recommend splitting PLINK variants analyzed batches, create one mask file per batch, submit one job per batch High Peformance Cluster. mask file can run SME. genome-wide association test marginal epistasis red blood cell traits MCH MCV finds genome-wide significant statistical epistasis (P<5 × 10−8P < \\num{5e-8}) chromosome 6 (Fig. 1). Importantly, SNPs genes map previously discovered non-additive gene action related erythropoiesis RBC traits.","code":"bim_data <- data.frame( chromosome = c(1, 1, 1, 2, 2, 2, 3, 3, 3), variant_id = c(\"rs1\", \"rs2\", \"rs3\", \"rs4\", \"rs5\", \"rs6\", \"rs7\", \"rs8\", \"rs9\"), cm_position = c(0, 0, 0, 0, 0, 0, 0, 0, 0), bp_position = c(10, 20, 30, 40, 50, 60, 70, 80, 90), allele1 = c(\"A\", \"A\", \"A\", \"G\", \"C\", \"C\", \"T\", \"T\", \"A\"), allele1 = c(\"G\", \"G\", \"G\", \"A\", \"T\", \"T\", \"A\", \"A\", \"G\") ) bim_data$index <- 1:nrow(bim_data) # DHS intervals hg19_dhs_regions <- data.frame( chromosome = c(1, 2, 3), start = c(5, 45, 85), stop = c(15, 55, 95) ) # LD block intervals hg19_ld_blocks <- data.frame( chromosome = c(1, 1, 2, 2, 3, 3, 3), start = c(5, 25, 35, 45, 65, 75, 85), stop = c(25, 35, 45, 65, 75, 85, 95) ) # Convert .bim to GRanges object bim_gr <- GRanges( seqnames = paste0(\"chr\", bim_data$chromosome), ranges = IRanges(start = bim_data$bp_position, end = bim_data$bp_position), variant_id = bim_data$variant_id, genome = \"hg19\" ) # Convert DHS to GRanges object dhs_gr <- GRanges( seqnames = paste0(\"chr\", hg19_dhs_regions$chromosome), ranges = IRanges(start = hg19_dhs_regions$start, end = hg19_dhs_regions$stop), genome = \"hg19\" ) # Find overlaps of BIM variants and DHS intervals overlaps <- findOverlaps(bim_gr, dhs_gr, maxgap = 0) # Extract overlapping variants dhs_data <- bim_data[queryHits(overlaps), ] dhs_data <- dhs_data[!duplicated(dhs_data$index), ] # Convert to GRanges object ld_gr <- GRanges( seqnames = paste0(\"chr\", hg19_ld_blocks$chromosome), ranges = IRanges(start = hg19_ld_blocks$start, end = hg19_ld_blocks$stop), genome = \"hg19\" ) # Find LD block of bim variants ld_overlaps <- findOverlaps(query = bim_gr, subject = ld_gr) output_file <- tempfile() gxg_group <- \"gxg\" ld_group <- \"ld\" gxg_variants <- dhs_data$index - 1 # 0-base index for C++ create_hdf5_file(output_file) for (j in bim_data$index - 1) { # 0-base index for C++ # Write DHS mask gxg_ds <- sprintf(\"%s/%d\", gxg_group, j) write_hdf5_dataset(file_name = output_file, dataset_name = gxg_ds, gxg_variants) # Find LD block of focal SNP focal_gr <- ld_gr[subjectHits(ld_overlaps[j,])] # Find variants in LD block of focal SNP focal_ld <- findOverlaps(query = bim_gr, subject = focal_gr) ld_data <- bim_data[queryHits(focal_ld),] ld_variants <- ld_data$index - 1 # 0-base index for C++ # Write LD mask ld_ds <- sprintf(\"%s/%d\", ld_group, j) write_hdf5_dataset(file_name = output_file, dataset_name = ld_ds, ld_variants) } dhs_indices <- read_hdf5_dataset(file_name = output_file, dataset_name = gxg_ds) print(sprintf(\"DHS indices: %s\", paste(dhs_indices, collapse = \", \"))) #> [1] \"DHS indices: 0, 4, 8\" sme_result <- sme( plink_file = \"/path/to/plink/data\", pheno_file = \"/path/to/pheno/data\", mask_file = \"/path/to/mask/file\", gxg_indices = c(1, 2, 3), chunk_size = 250, n_randvecs = 10, n_blocks = 200, n_threads = 6 )"},{"path":"https://lcrawlab.github.io/sme/articles/study-erythroid-differentiation-data.html","id":"section","dir":"Articles","previous_headings":"","what":"Conditioning Epistasis Search on Open Chromatin","title":"Conditioning Epistasis Search on Open Chromatin","text":"Figure 1. Manhattan plot SME analysis. dashed blue line significance threshold Bonferroni correction. strongest association trait MCH, SNP rs4711092 (P=1.41 × 10−11P = \\num{1.41e-11}, PVE 0.7%), maps gene secretagogin (). gene regulates exocytosis interacting two soluble NSF adaptor proteins ( ) critical cell growth tissues. total five SNPs SME significantly associates MCH (strongest association rs9366624 P=1.8 × 10−9P = \\num{1.8e-9}, PVE 1.1%) gene capping protein regulator myosin 1 linker 1 (). gene known interact regulate caping protein (). plays role via protein-protein interaction regulating erythrpoiesis. Specifically, proteins regulate actin dynamics regulating activity . Erythropoiesis leads modifications expression membrane cytoskeletal proteins, whose interactions impact cell structure function. genes previously associated hemoglobin concentration. strongest association trait MCV, SNP rs9276 (P=9.09 × 10−10P = \\num{9.09e-10}, PVE 0.24%) maps gene major histocompatibility complex. SNP rs9366624 (P=1.86 × 10−8P = \\num{1.86e-8}, PVE 0.8%), also gene significantly associated trait marginal epistasis. complete list significant associations produced SME reported Tab. 1. Table 1. Significant trait associations marginal epistasis. Fitting linear mixed model SME also produces narrow-sense heritability estimates equivalent RHE regression. heritability estimates SME four traits study similar heritability estimates found literature(Tab. 2). Table 2. Narrow-sense heritability (h2h^2) estimates SME analysis.","code":""},{"path":"https://lcrawlab.github.io/sme/articles/study-erythroid-differentiation-data.html","id":"references","dir":"Articles","previous_headings":"","what":"References","title":"Conditioning Epistasis Search on Open Chromatin","text":"Stamp J, Smith Pattillo S, Weinreich D, Crawford L (2025). Sparse modeling interactions enables fast detection genome-wide epistasis biobank-scale studies. biorxiv, https://doi.org/10.1101/2025.01.11.632557 Georgolopoulos, G. GEO Data Set: Discrete regulatory modules instruct hematopoietic lineage commitment differentiation https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE182816 (2024).","code":""},{"path":"https://lcrawlab.github.io/sme/articles/study-erythroid-differentiation-data.html","id":"sessioninfo","dir":"Articles","previous_headings":"","what":"SessionInfo","title":"Conditioning Epistasis Search on Open Chromatin","text":"","code":"sessionInfo() #> R version 4.4.2 (2024-10-31) #> Platform: x86_64-pc-linux-gnu #> Running under: Ubuntu 24.04.1 LTS #> #> Matrix products: default #> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 #> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0 #> #> locale: #> [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8 #> [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8 #> [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C #> [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C #> #> time zone: UTC #> tzcode source: system (glibc) #> #> attached base packages: #> [1] stats4 stats graphics grDevices utils datasets methods #> [8] base #> #> other attached packages: #> [1] smer_0.0.1 GenomicRanges_1.58.0 GenomeInfoDb_1.42.1 #> [4] IRanges_2.40.1 S4Vectors_0.44.0 BiocGenerics_0.52.0 #> #> loaded via a namespace (and not attached): #> [1] tidyr_1.3.1 sass_0.4.9 generics_0.1.3 #> [4] digest_0.6.37 magrittr_2.0.3 evaluate_1.0.3 #> [7] iterators_1.0.14 CompQuadForm_1.4.3 mvtnorm_1.3-3 #> [10] fastmap_1.2.0 foreach_1.5.2 genio_1.1.2 #> [13] jsonlite_1.8.9 backports_1.5.0 httr_1.4.7 #> [16] purrr_1.0.2 UCSC.utils_1.2.0 codetools_0.2-20 #> [19] textshaping_0.4.1 jquerylib_0.1.4 cli_3.6.3 #> [22] rlang_1.1.4 XVector_0.46.0 cachem_1.1.0 #> [25] yaml_2.3.10 FMStable_0.1-4 tools_4.4.2 #> [28] parallel_4.4.2 checkmate_2.3.2 dplyr_1.1.4 #> [31] GenomeInfoDbData_1.2.13 vctrs_0.6.5 R6_2.5.1 #> [34] lifecycle_1.0.4 mvMAPIT_2.0.3 zlibbioc_1.52.0 #> [37] fs_1.6.5 ragg_1.3.3 pkgconfig_2.0.3 #> [40] desc_1.4.3 pkgdown_2.1.1 pillar_1.10.1 #> [43] bslib_0.8.0 glue_1.8.0 Rcpp_1.0.14 #> [46] harmonicmeanp_3.0.1 systemfonts_1.2.0 xfun_0.50 #> [49] tibble_3.2.1 tidyselect_1.2.1 knitr_1.49 #> [52] htmltools_0.5.8.1 rmarkdown_2.29 compiler_4.4.2"},{"path":"https://lcrawlab.github.io/sme/articles/tutorial-cite-us.html","id":"the-sparse-marginal-epistasis-test-sme","dir":"Articles","previous_headings":"","what":"The Sparse Marginal Epistasis Test (SME)","title":"How To Cite Our Work","text":"Stamp J, Smith Pattillo S, Weinreich D, Crawford L (2025). Sparse modeling interactions enables fast detection genome-wide epistasis biobank-scale studies. biorxiv, https://doi.org/10.1101/2025.01.11.632557 Stamp J & Crawford L (2025). smer: Sparse Marginal Epistasis Test. https://github.com/lcrawlab/sme, https://lcrawlab.github.io/sme/","code":""},{"path":"https://lcrawlab.github.io/sme/articles/tutorial-cite-us.html","id":"the-multivariate-marginal-epistasis-test-mvmapit","dir":"Articles","previous_headings":"","what":"The multivariate Marginal Epistasis Test (mvMAPIT)","title":"How To Cite Our Work","text":"Stamp J, DenAdel , Weinreich D, Crawford, L (2023). Leveraging Genetic Correlation Traits Improves Detection Epistasis Genome-wide Association Studies. G3 Genes|Genomes|Genetics 13(8), jkad118; doi: https://doi.org/10.1093/g3journal/jkad118 Stamp J, Crawford L (2022). mvMAPIT: Multivariate Genome Wide Marginal Epistasis Test. https://github.com/lcrawlab/mvMAPIT, https://lcrawlab.github.io/mvMAPIT/","code":""},{"path":"https://lcrawlab.github.io/sme/articles/tutorial-cite-us.html","id":"the-marginal-epistasis-test-mapit","dir":"Articles","previous_headings":"","what":"The Marginal Epistasis Test (MAPIT)","title":"How To Cite Our Work","text":"Crawford L, Zeng P, Mukherjee S, & Zhou X (2017). Detecting epistasis marginal epistasis test genetic mapping studies quantitative traits. PLoS genetics, 13(7), e1006869. https://doi.org/10.1371/journal.pgen.1006869","code":""},{"path":"https://lcrawlab.github.io/sme/articles/tutorial-create-mask-file.html","id":"mask-file-format","dir":"Articles","previous_headings":"","what":"Mask File Format","title":"How To Create a Mask File","text":"sme() function expects mask data HDF5 file. HDF5 format includes two primary object types: Datasets - typed multidimensional arrays Groups - container structures can hold datasets groups","code":""},{"path":"https://lcrawlab.github.io/sme/articles/tutorial-create-mask-file.html","id":"mask-format-requirements","dir":"Articles","previous_headings":"","what":"Mask Format Requirements","title":"How To Create a Mask File","text":"mask data organized following groups datasets: Groups: ld: Contains SNPs linkage disequilibrium (LD) focal SNP, excluded. gxg: Contains indices SNPs used condition marginal epistasis test, included. required group names can configured input parameters sme(). defaults ld gxg. Datasets: ld/: focal SNP , dataset contains indices SNPs LD block SNP. SNPs excluded gene--gene interaction covariance matrix. gxg/: focal SNP , dataset contains indices SNPs include gene--gene interaction covariance matrix focal SNP . Important: indices mask file must zero-based correspond zero-based row indices PLINK .bim file. includes dataset index ( gxg/) data . zero-based indexing necessary mask data read C++ subroutine sme(), uses zero-based indexing, unlike R’s one-based indexing SNP indices function call.","code":""},{"path":"https://lcrawlab.github.io/sme/articles/tutorial-create-mask-file.html","id":"creating-and-using-mask-files","dir":"Articles","previous_headings":"","what":"Creating and Using Mask Files","title":"How To Create a Mask File","text":"package provides utility functions create, write, read valid mask files sme(). can check data written correctly.","code":"hdf5_file <- tempfile() # Group names gxg_h5_group <- \"gxg\" ld_h5_group <- \"ld\" # Data (still in 1-based R indexing) include_gxg_snps <- 1:10 exclude_ld_snps <- 5:6 # Focal SNP (still in 1-based R indexing) focal_snp <- 4 # Dataset names dataset_name_pattern <- \"%s/%s\" # 0-based index! gxg_dataset <- sprintf(dataset_name_pattern, gxg_h5_group, focal_snp - 1) ld_dataset <- sprintf(dataset_name_pattern, ld_h5_group, focal_snp - 1) # Create an empty HDF5 file create_hdf5_file(hdf5_file) # Write LD data write_hdf5_dataset(hdf5_file, ld_dataset, exclude_ld_snps - 1) # 0-based index! # Write GXG data write_hdf5_dataset(hdf5_file, gxg_dataset, include_gxg_snps - 1) ld_read <- read_hdf5_dataset(hdf5_file, ld_dataset) gxg_read <- read_hdf5_dataset(hdf5_file, gxg_dataset) print(sprintf(\"Zero-based indices of SNPs to exclude: %s\", str(ld_read))) #> int [1:2] 4 5 #> character(0) print(sprintf(\"Zero-based indices of SNPs to include: %s\", str(gxg_read))) #> int [1:10] 0 1 2 3 4 5 6 7 8 9 #> character(0)"},{"path":"https://lcrawlab.github.io/sme/articles/tutorial-create-mask-file.html","id":"sessioninfo","dir":"Articles","previous_headings":"","what":"SessionInfo","title":"How To Create a Mask File","text":"","code":"sessionInfo() #> R version 4.4.2 (2024-10-31) #> Platform: x86_64-pc-linux-gnu #> Running under: Ubuntu 24.04.1 LTS #> #> Matrix products: default #> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 #> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0 #> #> locale: #> [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8 #> [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8 #> [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C #> [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C #> #> time zone: UTC #> tzcode source: system (glibc) #> #> attached base packages: #> [1] stats graphics grDevices utils datasets methods base #> #> other attached packages: #> [1] smer_0.0.1 #> #> loaded via a namespace (and not attached): #> [1] jsonlite_1.8.9 dplyr_1.1.4 compiler_4.4.2 #> [4] tidyselect_1.2.1 Rcpp_1.0.14 FMStable_0.1-4 #> [7] parallel_4.4.2 tidyr_1.3.1 jquerylib_0.1.4 #> [10] systemfonts_1.2.0 textshaping_0.4.1 harmonicmeanp_3.0.1 #> [13] yaml_2.3.10 fastmap_1.2.0 R6_2.5.1 #> [16] generics_0.1.3 knitr_1.49 genio_1.1.2 #> [19] iterators_1.0.14 backports_1.5.0 checkmate_2.3.2 #> [22] tibble_3.2.1 desc_1.4.3 bslib_0.8.0 #> [25] pillar_1.10.1 rlang_1.1.4 cachem_1.1.0 #> [28] xfun_0.50 fs_1.6.5 sass_0.4.9 #> [31] cli_3.6.3 pkgdown_2.1.1 magrittr_2.0.3 #> [34] digest_0.6.37 mvMAPIT_2.0.3 foreach_1.5.2 #> [37] mvtnorm_1.3-3 lifecycle_1.0.4 CompQuadForm_1.4.3 #> [40] vctrs_0.6.5 evaluate_1.0.3 glue_1.8.0 #> [43] codetools_0.2-20 ragg_1.3.3 purrr_1.0.2 #> [46] rmarkdown_2.29 tools_4.4.2 pkgconfig_2.0.3 #> [49] htmltools_0.5.8.1"},{"path":"https://lcrawlab.github.io/sme/articles/tutorial-memory-optimization.html","id":"genotype-data-size-and-number-of-blocks","dir":"Articles","previous_headings":"","what":"Genotype Data Size and Number of Blocks","title":"How To Optimize the Memory Requirements of SME","text":"sample size primary factor influencing memory requirements. phenotype genotype data need loaded memory computation. large datasets, like Biobank-scale data (350k samples 500k SNPs), loading entire dataset memory requires 1.4TB (assuming double precision data matrix), exceeds machines’ capacities. manage large datasets efficiently, sme() reads genotype data smaller blocks. parameter n_blocks controls number blocks. instance, 500k SNPs, setting n_blocks = 100 load 5000 SNPs memory time, reducing memory load allowing computations proceed block block.","code":""},{"path":"https://lcrawlab.github.io/sme/articles/tutorial-memory-optimization.html","id":"number-of-random-vectors","dir":"Articles","previous_headings":"","what":"Number of Random Vectors","title":"How To Optimize the Memory Requirements of SME","text":"sme() function uses stochastic trace estimator approximate trace matrix products efficiently. number random vectors impacts accuracy trace estimates memory computational efficiency. blockwise computation, algorithm stores intermediate matrices sized sample_size x n_randvecs. Increasing number random vectors improves accuracy also increases memory usage computation time. Typically, using around 10 random vectors provides reasonably accurate results.","code":""},{"path":"https://lcrawlab.github.io/sme/articles/tutorial-memory-optimization.html","id":"number-of-snps-sharing-random-vectors","dir":"Articles","previous_headings":"","what":"Number of SNPs Sharing Random Vectors","title":"How To Optimize the Memory Requirements of SME","text":"chunk_size parameter controls many SNPs share set random vectors, enhancing efficiency genome-wide data processing. method reduces redundant calculations genetic relatedness covariance matrix minimizes time spent reading genotype data memory. set SNPs analyzed together (“chunk”), intermediate results must stored. Consequently, memory requirement grows chunk size, calculated : chunk_size x (sample_size x n_randvecs). Figure 1. Schematic overview illustrating compuational speedup resulting sharing random vectors. () randomized trace estimates can identify reusable matrix vector products. Computing exact trace product two covariance matrices prohibitively computationally expensive. Instead, sparse marginal epistasis (SME) test approximates traces using random vectors zz. full MQS computation point estimates variance components, see matrix--vector products form AzAz ∈{K,G}\\\\{K, G\\} appear repeatedly. (b) genetic relatedness matrix KK focal SNPs. Using unique random vectors computation every focal SNP, compute quantity repeatedly. Computing matrix--vector products KzKz constitutes almost half computation time point estimates. (c) sharing random vectors zz focal SNPs, computing KzKz can done focal SNPs share random vectors. , computation time KzKz becomes negligible.","code":""},{"path":"https://lcrawlab.github.io/sme/articles/tutorial-memory-optimization.html","id":"genotype-masking-for-the-gene-by-gene-interaction-covariance","dir":"Articles","previous_headings":"","what":"Genotype Masking for the Gene-by-Gene Interaction Covariance","title":"How To Optimize the Memory Requirements of SME","text":"Masking genotypes contribute epistasis can help reduce memory usage computation time. masked, genotypes need stored memory, significantly decreasing memory requirements. Note approximate_memory_requirements() function account reduction.","code":""},{"path":"https://lcrawlab.github.io/sme/articles/tutorial-memory-optimization.html","id":"explore-the-memory-requirements","dir":"Articles","previous_headings":"","what":"Explore the Memory Requirements","title":"How To Optimize the Memory Requirements of SME","text":"estimate memory needs based chosen parameters, use approximate_memory_requirements() function. function helps determine planned settings fit within available memory identify parameters can adjusted meet resource constraints. parameters n_blocks, n_randvecs, chunk_size particularly flexible significant impact memory usage. Note however, account masking therefore likely overestimates required memory.","code":"n_samples <- c(350000) n_snps <- c(500000) n_blocks <- c(1, 100, 1000) n_randvecs <- c(10, 100) chunk_size <- c(10, 100) parameters <- crossing( n_samples = n_samples, n_snps = n_snps, n_blocks = n_blocks, n_randvecs = n_randvecs, chunk_size = chunk_size ) estimated_memory <- parameters %>% mutate(memory_gb = round( approximate_memory_requirements(n_samples, n_snps, n_blocks, n_randvecs, chunk_size), 2 )) kable(estimated_memory)"},{"path":"https://lcrawlab.github.io/sme/articles/tutorial-memory-optimization.html","id":"a-note-on-the-runtime-of-sme","dir":"Articles","previous_headings":"","what":"A Note on the Runtime of SME","title":"How To Optimize the Memory Requirements of SME","text":"Despite computational efficiency SME, genome-wide testing requires considerable resources. recommend analyze data batches, launch multiple processes simultaneously high-performance cluster (HPC). study, analyzed 544k SNPs genotype 350k individuals. launched 544 slurm jobs requesting 43GB memory 6 CPUs analyze batches 1000 SNPs chunk sizes 250 SNPs. Genome-wide testing single trait HPC 960 CPUs 6840GB memory available took 3.5 days. Figure 2. SME improved power detect marginal epistasis runs 10x 90x faster state---art methods. CPU time measured 350,000 individuals.","code":""},{"path":"https://lcrawlab.github.io/sme/articles/tutorial-memory-optimization.html","id":"sessioninfo","dir":"Articles","previous_headings":"","what":"SessionInfo","title":"How To Optimize the Memory Requirements of SME","text":"","code":"sessionInfo() #> R version 4.4.2 (2024-10-31) #> Platform: x86_64-pc-linux-gnu #> Running under: Ubuntu 24.04.1 LTS #> #> Matrix products: default #> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 #> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0 #> #> locale: #> [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8 #> [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8 #> [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C #> [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C #> #> time zone: UTC #> tzcode source: system (glibc) #> #> attached base packages: #> [1] stats graphics grDevices utils datasets methods base #> #> other attached packages: #> [1] knitr_1.49 dplyr_1.1.4 tidyr_1.3.1 smer_0.0.1 #> #> loaded via a namespace (and not attached): #> [1] jsonlite_1.8.9 compiler_4.4.2 tidyselect_1.2.1 #> [4] Rcpp_1.0.14 FMStable_0.1-4 parallel_4.4.2 #> [7] jquerylib_0.1.4 systemfonts_1.2.0 textshaping_0.4.1 #> [10] harmonicmeanp_3.0.1 yaml_2.3.10 fastmap_1.2.0 #> [13] R6_2.5.1 generics_0.1.3 genio_1.1.2 #> [16] iterators_1.0.14 backports_1.5.0 checkmate_2.3.2 #> [19] tibble_3.2.1 desc_1.4.3 bslib_0.8.0 #> [22] pillar_1.10.1 rlang_1.1.4 cachem_1.1.0 #> [25] xfun_0.50 fs_1.6.5 sass_0.4.9 #> [28] cli_3.6.3 pkgdown_2.1.1 magrittr_2.0.3 #> [31] digest_0.6.37 mvMAPIT_2.0.3 foreach_1.5.2 #> [34] mvtnorm_1.3-3 lifecycle_1.0.4 CompQuadForm_1.4.3 #> [37] vctrs_0.6.5 evaluate_1.0.3 glue_1.8.0 #> [40] codetools_0.2-20 ragg_1.3.3 purrr_1.0.2 #> [43] rmarkdown_2.29 tools_4.4.2 pkgconfig_2.0.3 #> [46] htmltools_0.5.8.1"},{"path":"https://lcrawlab.github.io/sme/articles/tutorial-simulate-traits.html","id":"sessioninfo","dir":"Articles","previous_headings":"","what":"SessionInfo","title":"How To Simulate Traits","text":"","code":"sessionInfo() #> R version 4.4.2 (2024-10-31) #> Platform: x86_64-pc-linux-gnu #> Running under: Ubuntu 24.04.1 LTS #> #> Matrix products: default #> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 #> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0 #> #> locale: #> [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8 #> [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8 #> [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C #> [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C #> #> time zone: UTC #> tzcode source: system (glibc) #> #> attached base packages: #> [1] stats graphics grDevices utils datasets methods base #> #> other attached packages: #> [1] genio_1.1.2 smer_0.0.1 #> #> loaded via a namespace (and not attached): #> [1] jsonlite_1.8.9 dplyr_1.1.4 compiler_4.4.2 #> [4] tidyselect_1.2.1 Rcpp_1.0.14 FMStable_0.1-4 #> [7] parallel_4.4.2 tidyr_1.3.1 jquerylib_0.1.4 #> [10] systemfonts_1.2.0 textshaping_0.4.1 harmonicmeanp_3.0.1 #> [13] yaml_2.3.10 fastmap_1.2.0 R6_2.5.1 #> [16] generics_0.1.3 knitr_1.49 iterators_1.0.14 #> [19] backports_1.5.0 checkmate_2.3.2 tibble_3.2.1 #> [22] desc_1.4.3 bslib_0.8.0 pillar_1.10.1 #> [25] rlang_1.1.4 cachem_1.1.0 xfun_0.50 #> [28] fs_1.6.5 sass_0.4.9 cli_3.6.3 #> [31] pkgdown_2.1.1 magrittr_2.0.3 digest_0.6.37 #> [34] mvMAPIT_2.0.3 foreach_1.5.2 mvtnorm_1.3-3 #> [37] lifecycle_1.0.4 CompQuadForm_1.4.3 vctrs_0.6.5 #> [40] evaluate_1.0.3 glue_1.8.0 codetools_0.2-20 #> [43] ragg_1.3.3 purrr_1.0.2 rmarkdown_2.29 #> [46] tools_4.4.2 pkgconfig_2.0.3 htmltools_0.5.8.1"},{"path":"https://lcrawlab.github.io/sme/authors.html","id":null,"dir":"","previous_headings":"","what":"Authors","title":"Authors and Citation","text":"Julian Stamp. Maintainer, author. Lorin Crawford. Author. sriramlab. Copyright holder. Author included mailman algorithm Blue Brain Project/EPFL. Copyright holder. Author included HighFive library","code":""},{"path":"https://lcrawlab.github.io/sme/authors.html","id":"citation","dir":"","previous_headings":"","what":"Citation","title":"Authors and Citation","text":"Stamp J, Crawford L (2025). smer: Sparse Marginal Epistasis Test. R package version 0.0.1, https://lcrawlab.github.io/sme/, https://github.com/lcrawlab/sme.","code":"@Manual{, title = {smer: Sparse Marginal Epistasis Test}, author = {Julian Stamp and Lorin Crawford}, year = {2025}, note = {R package version 0.0.1, https://lcrawlab.github.io/sme/}, url = {https://github.com/lcrawlab/sme}, }"},{"path":"https://lcrawlab.github.io/sme/index.html","id":"the-sparse-marginal-epistasis-test-","dir":"","previous_headings":"","what":"Sparse Marginal Epistasis Test","title":"Sparse Marginal Epistasis Test","text":"smer package implements computationally statistically efficient method detecting marginal epistasis genome-wide association studies (GWAS). Find full package documentation including examples articles : Sparse Marginal Epistasis test Documentation.","code":""},{"path":"https://lcrawlab.github.io/sme/index.html","id":"key-features","dir":"","previous_headings":"","what":"Key Features","title":"Sparse Marginal Epistasis Test","text":"Hutchinson’s stochastic trace estimator: efficient scalable computation Mailman algorithm: fast vector--matrix operation Linear mixed model: controls population structure Multimodal Input: incorporates additional data HDF5 files improve power detecting gene--gene interactions. Optimize Memory Constraints: Highly configurable block wise processing data allows make available resources. See also Optimize Memory Requirements SME. Parallelization: Utilizes OpenMP multi-threaded processing.","code":""},{"path":[]},{"path":"https://lcrawlab.github.io/sme/index.html","id":"installation-from-cran","dir":"","previous_headings":"Installation","what":"Installation from CRAN","title":"Sparse Marginal Epistasis Test","text":"can install latest release CRAN","code":"install.packages(\"smer\")"},{"path":"https://lcrawlab.github.io/sme/index.html","id":"installation-from-source","dir":"","previous_headings":"Installation","what":"Installation from source","title":"Sparse Marginal Epistasis Test","text":"can install development version smer GitHub :","code":"install.packages(\"devtools\") devtools::install_github(\"lcrawlab/sme\")"},{"path":"https://lcrawlab.github.io/sme/index.html","id":"dependencies","dir":"","previous_headings":"","what":"Dependencies","title":"Sparse Marginal Epistasis Test","text":"System requirements package: GNU make R (>= 4.4) Rhdf5lib (BioConductor) OpenMP (optional) install Rhdf5lib, first install tool BiocManager CRAN, install library using tool. full list R dependencies can found DESCRIPTION file.","code":"if (!require(\"BiocManager\", quietly = TRUE)) install.packages(\"BiocManager\") BiocManager::install(\"Rhdf5lib\")"},{"path":"https://lcrawlab.github.io/sme/index.html","id":"openmp","dir":"","previous_headings":"Dependencies","what":"OpenMP","title":"Sparse Marginal Epistasis Test","text":"OS X Linux, OpenMP library can installed via one (shell) commands specified : enable openMP, may necessary configure compiler flags SHLIB_OPENMP_CXXFLAGS LDFLAGS ~/.R/Makevars file.","code":""},{"path":"https://lcrawlab.github.io/sme/index.html","id":"known-issues","dir":"","previous_headings":"","what":"Known Issues","title":"Sparse Marginal Epistasis Test","text":"error ld: library \"crypto\" found, install openssl (e.g. brew install openssl). Compiling package requires compiler find libraries dependencies. unix systems, libraries typically installed /usr/local/lib /usr/local/include. users using OS X homebrew, libraries typically installed /opt/homebrew/lib /opt/homebrew/include. Non-standard library paths need configured. src/Makevars file configures compiler flags considers LDFLAGS CPPFLAGS ~/.R/Makevars file.","code":""},{"path":"https://lcrawlab.github.io/sme/index.html","id":"references","dir":"","previous_headings":"","what":"References","title":"Sparse Marginal Epistasis Test","text":"Stamp J, Crawford L (2025). smer: Sparse Marginal Epistasis Test. R package version 0.0.1, https://lcrawlab.github.io/sme/, https://github.com/lcrawlab/sme. Stamp J, Smith Pattillo S, Weinreich D, Crawford L (2025). Sparse modeling interactions enables fast detection genome-wide epistasis biobank-scale studies. biorxiv, https://doi.org/10.1101/2025.01.11.632557 Stamp J, Crawford L (2024). mvMAPIT: Multivariate Genome Wide Marginal Epistasis Test. R package version 2.0.3, https://lcrawlab.github.io/mvMAPIT/, https://github.com/lcrawlab/mvMAPIT. Stamp et al. (2023): Leveraging genetic correlation traits epistasis detection GWAS. G3: Genes, Genomes, Genetics. Fu, B., Pazokitoroudi, ., Xue, ., Anand, ., Anand, P., Zaitlen, N., & Sankararaman, S. (2023). biobank-scale test marginal epistasis reveals genome-wide signals polygenic epistasis. bioRxiv. Crawford et al. (2017): Detecting epistasis marginal epistasis test. PLoS Genetics. Devresse et al. (2024): HighFive - Header-C++ HDF5 interface. https://zenodo.org/records/13120799","code":""},{"path":"https://lcrawlab.github.io/sme/reference/approximate_memory_requirements.html","id":null,"dir":"Reference","previous_headings":"","what":"Estimate Memory Requirements for SME Routine — approximate_memory_requirements","title":"Estimate Memory Requirements for SME Routine — approximate_memory_requirements","text":"function provides approximate estimate memory requirements (gigabytes) running Sparse Marginal Epistasis (SME) routine based input parameters number samples, SNPs, configurations.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/approximate_memory_requirements.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"Estimate Memory Requirements for SME Routine — approximate_memory_requirements","text":"","code":"approximate_memory_requirements( n_samples, n_snps, n_blocks, n_randvecs, chunksize )"},{"path":"https://lcrawlab.github.io/sme/reference/approximate_memory_requirements.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"Estimate Memory Requirements for SME Routine — approximate_memory_requirements","text":"n_samples Integer. number samples dataset. n_snps Integer. total number SNPs dataset. n_blocks Integer. number genotype blocks used partition SNPs. Affects size encoded genotype segments. n_randvecs Integer. number random vectors used stochastic trace estimation. Affects memory operations involving random vectors. chunksize Integer. number focal SNPs processed per chunk.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/approximate_memory_requirements.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"Estimate Memory Requirements for SME Routine — approximate_memory_requirements","text":"Numeric. approximate memory requirement (gigabytes) SME routine.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/approximate_memory_requirements.html","id":"details","dir":"Reference","previous_headings":"","what":"Details","title":"Estimate Memory Requirements for SME Routine — approximate_memory_requirements","text":"function calculates memory usage summing contributions various components used SME routine, including: Variance component estimates (vc_estimates) Phenotype-related matrices Random vector-based computations Genotype objects block statistics Gene--gene interaction masks estimated memory requirement derived data dimensions operational needs, provides guideline configuring resources analysis.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/approximate_memory_requirements.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"Estimate Memory Requirements for SME Routine — approximate_memory_requirements","text":"","code":"n_samples <- 1e5 n_snps <- 1e6 n_blocks <- 100 n_randvecs <- 100 chunksize <- 10 approximate_memory_requirements(n_samples, n_snps, n_blocks, n_randvecs, chunksize) #> [1] 6.447136"},{"path":"https://lcrawlab.github.io/sme/reference/create_hdf5_file.html","id":null,"dir":"Reference","previous_headings":"","what":"Create an HDF5 File — create_hdf5_file","title":"Create an HDF5 File — create_hdf5_file","text":"function creates new, empty HDF5 file specified location.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/create_hdf5_file.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"Create an HDF5 File — create_hdf5_file","text":"","code":"create_hdf5_file(hdf5_file)"},{"path":"https://lcrawlab.github.io/sme/reference/create_hdf5_file.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"Create an HDF5 File — create_hdf5_file","text":"hdf5_file character string specifying path name HDF5 file created.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/create_hdf5_file.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"Create an HDF5 File — create_hdf5_file","text":"return value; function creates HDF5 file specified location.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/create_hdf5_file.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"Create an HDF5 File — create_hdf5_file","text":"","code":"# Create an empty HDF5 file hdf5_file <- tempfile() create_hdf5_file(hdf5_file)"},{"path":"https://lcrawlab.github.io/sme/reference/getting_started.html","id":null,"dir":"Reference","previous_headings":"","what":"Simulated Dataset for Genome-Wide Interaction Analysis — getting_started","title":"Simulated Dataset for Genome-Wide Interaction Analysis — getting_started","text":"getting_started simulated dataset created demonstrate use sme() function genome-wide interaction analyses. contains results simulated analysis involving additive genetic effects gene--gene (GxG) interactions.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/getting_started.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"Simulated Dataset for Genome-Wide Interaction Analysis — getting_started","text":"","code":"data(\"getting_started\")"},{"path":"https://lcrawlab.github.io/sme/reference/getting_started.html","id":"format","dir":"Reference","previous_headings":"","what":"Format","title":"Simulated Dataset for Genome-Wide Interaction Analysis — getting_started","text":"list results sme(), including following components: summary data frame summarizing analysis results, including p-values SNP associations (p). pve data frame containing per SNP variance component estimates normalized phenotypic variance explained (PVE). vc data frame containing per SNP variance component estimates. gxg_snps vector containing indices SNPs assigned epistatic interactions trait simulations.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/getting_started.html","id":"source","dir":"Reference","previous_headings":"","what":"Source","title":"Simulated Dataset for Genome-Wide Interaction Analysis — getting_started","text":"data-raw/getting_started.R","code":""},{"path":"https://lcrawlab.github.io/sme/reference/getting_started.html","id":"details","dir":"Reference","previous_headings":"","what":"Details","title":"Simulated Dataset for Genome-Wide Interaction Analysis — getting_started","text":"dataset generated follows: Genotype Simulation: Genotype data 5000 individuals 6,000 SNPs simulated synthetic allele counts. Phenotype Simulation: Phenotypic values simulated additive heritability 0.3 GxG interaction heritability 0.25. set 100 SNPs selected additive effects, two groups 5 SNPs used GxG interactions. PLINK-Compatible Files: simulated data saved PLINK-compatible .bed, .fam, .bim files. Interaction Analysis: sme() function used perform genome-wide interaction analyses subset SNP indices, including GxG SNP groups 100 additional additive SNPs. Memory-efficient computation parameters (e.g., chun_ksize, n_randvecs, n_blocks) applied.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/getting_started.html","id":"key-parameters","dir":"Reference","previous_headings":"","what":"Key Parameters","title":"Simulated Dataset for Genome-Wide Interaction Analysis — getting_started","text":"Additive Heritability: 0.3 GxG Heritability: 0.25 Number Samples: 5000 Number SNPs: 6,000 Selected Additive SNPs: 100 Selected GxG SNP Groups: Group 1: 5 SNPs Group 2: 5 SNPs","code":""},{"path":[]},{"path":"https://lcrawlab.github.io/sme/reference/getting_started.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"Simulated Dataset for Genome-Wide Interaction Analysis — getting_started","text":"","code":"data(\"getting_started\") head(getting_started$summary) #> # A tibble: 6 × 9 #> id index chromosome position p pve vc se true_gxg_snp #> #> 1 rs1498 1498 1 1498 0.000581 0.0447 0.0446 0.0137 TRUE #> 2 rs2032 2032 1 2032 0.00722 0.0377 0.0377 0.0154 TRUE #> 3 rs2364 2364 1 2364 0.00178 0.0450 0.0450 0.0154 TRUE #> 4 rs2867 2867 1 2867 0.000496 0.0519 0.0518 0.0157 TRUE #> 5 rs4610 4610 1 4610 0.0000783 0.0581 0.0580 0.0153 TRUE #> 6 rs822 822 1 822 0.00522 0.0367 0.0367 0.0143 TRUE"},{"path":"https://lcrawlab.github.io/sme/reference/read_hdf5_dataset.html","id":null,"dir":"Reference","previous_headings":"","what":"Read Dataset from an HDF5 File — read_hdf5_dataset","title":"Read Dataset from an HDF5 File — read_hdf5_dataset","text":"function reads dataset existing HDF5 file.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/read_hdf5_dataset.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"Read Dataset from an HDF5 File — read_hdf5_dataset","text":"","code":"read_hdf5_dataset(file_name, dataset_name)"},{"path":"https://lcrawlab.github.io/sme/reference/read_hdf5_dataset.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"Read Dataset from an HDF5 File — read_hdf5_dataset","text":"file_name character string specifying path HDF5 file. dataset_name character string specifying name dataset within HDF5 file read.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/read_hdf5_dataset.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"Read Dataset from an HDF5 File — read_hdf5_dataset","text":"content dataset HDF5 file, typically form R object.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/read_hdf5_dataset.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"Read Dataset from an HDF5 File — read_hdf5_dataset","text":"","code":"data_to_write <- 1:10 # Create an empty HDF5 file hdf5_file <- tempfile() create_hdf5_file(hdf5_file) # Write new data to a dataset in the HDF5 file write_hdf5_dataset(hdf5_file, \"group/dataset\", data_to_write) # Read a dataset from an HDF5 file hdf5_data <- read_hdf5_dataset(hdf5_file, \"group/dataset\") print(hdf5_data) #> [1] 1 2 3 4 5 6 7 8 9 10"},{"path":"https://lcrawlab.github.io/sme/reference/simulate_traits.html","id":null,"dir":"Reference","previous_headings":"","what":"Simulate Quantitative Traits from PLINK Genotypes — simulate_traits","title":"Simulate Quantitative Traits from PLINK Genotypes — simulate_traits","text":"function simulates quantitative trait based additive epistatic genetic effects using genotype data PLINK dataset. simulated trait saved specified output file phenotype format compatible PLINK.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/simulate_traits.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"Simulate Quantitative Traits from PLINK Genotypes — simulate_traits","text":"","code":"simulate_traits( plink_file, output_file, additive_heritability, gxg_heritability, additive_indices, gxg_indices_1, gxg_indices_2, log_level = \"WARNING\" )"},{"path":"https://lcrawlab.github.io/sme/reference/simulate_traits.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"Simulate Quantitative Traits from PLINK Genotypes — simulate_traits","text":"plink_file Character. Path PLINK dataset (without file extension). function append .bed, .bim, .fam extensions needed. output_file Character. Path output file simulated trait saved. additive_heritability Numeric. value 0 1 specifying proportion trait variance due additive genetic effects. gxg_heritability Numeric. value 0 1 specifying proportion trait variance due gene--gene (epistatic) interactions. sum additive_heritability gxg_heritability must exceed 1. additive_indices Integer vector. Indices SNPs contributing additive genetic effects. gxg_indices_1 Integer vector. Indices SNPs first group epistatic interactions. gxg_indices_2 Integer vector. Indices SNPs second group epistatic interactions. log_level Character. Logging level messages (e.g., \"DEBUG\", \"INFO\", \"WARNING\"). Default \"WARNING\".","code":""},{"path":"https://lcrawlab.github.io/sme/reference/simulate_traits.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"Simulate Quantitative Traits from PLINK Genotypes — simulate_traits","text":"None. simulated trait written specified output_file.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/simulate_traits.html","id":"details","dir":"Reference","previous_headings":"","what":"Details","title":"Simulate Quantitative Traits from PLINK Genotypes — simulate_traits","text":"function uses following components simulate trait: Additive genetic effects: Determined additive_indices specified additive_heritability. Epistatic interactions: Simulated using pairs SNPs gxg_indices_1 gxg_indices_2, contributing gxg_heritability. Environmental effects: remaining variance explained genetic effects assigned random environmental noise. output file PLINK-compatible phenotype format three columns: Family ID (FID), Individual ID (IID), simulated trait (TRAIT).","code":""},{"path":"https://lcrawlab.github.io/sme/reference/simulate_traits.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"Simulate Quantitative Traits from PLINK Genotypes — simulate_traits","text":"","code":"plink_file <- gsub(\"\\\\.bed\", \"\", system.file(\"testdata\", \"test.bed\", package = \"smer\")) out_file <- tempfile() additive_heritability <- 0.3 gxg_heritability <- 0.1 additive_snps <- sort(sample(1:100, 50, replace = FALSE)) gxg_group_1 <- sort(sample(additive_snps, 10, replace = FALSE)) gxg_group_2 <- sort(sample(setdiff(additive_snps, gxg_group_1), 10, replace = FALSE)) n_samples <- 200 simulate_traits( plink_file, out_file, additive_heritability, gxg_heritability, additive_snps, gxg_group_1, gxg_group_2 ) from_file <- read.table(out_file, header = TRUE) head(from_file) #> FID IID TRAIT #> 1 1 1 0.07327543 #> 2 2 1 -0.82216034 #> 3 3 1 -2.31319310 #> 4 4 1 0.74270008 #> 5 5 1 -2.53693229 #> 6 6 1 1.26249797"},{"path":"https://lcrawlab.github.io/sme/reference/sme.html","id":null,"dir":"Reference","previous_headings":"","what":"Sparse Marginal Epistasis Test (SME) — sme","title":"Sparse Marginal Epistasis Test (SME) — sme","text":"SME fits linear mixed model order test marginal epistasis. concentrates scans epistasis regions genome known functional enrichment trait interest.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/sme.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"Sparse Marginal Epistasis Test (SME) — sme","text":"","code":"sme( plink_file, pheno_file, mask_file = NULL, gxg_indices = NULL, chunk_size = NULL, n_randvecs = 10, n_blocks = 100, n_threads = 1, gxg_h5_group = \"gxg\", ld_h5_group = \"ld\", rand_seed = -1, log_level = \"WARNING\" )"},{"path":"https://lcrawlab.github.io/sme/reference/sme.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"Sparse Marginal Epistasis Test (SME) — sme","text":"plink_file Character. File path PLINK dataset (without *.bed extension). function append .bim, .bed, .fam extensions automatically. genotype data must missing genotypes. Use PLINK remove variants missing genotypes impute . pheno_file Character. File path phenotype file PLINK format. file contain exactly one phenotype column. mask_file Character NULL. File path HDF5 file specifying per-SNP masks gene--gene interaction tests. file informs SNPs tested marginal epistasis. Defaults NULL, indicating masking. Masking impacts scaling memory time. gxg_indices Integer vector NULL. List indices corresponding SNPs test marginal epistasis. NULL, SNPs dataset tested. indices 1-based. chunk_size Integer NULL. Number SNPs processed per chunk. influences memory usage can left NULL automatically determine chunk size based gxg_indices number threads. n_randvecs Integer. Number random vectors used stochastic trace estimation. Higher values yield accurate estimates increase computational cost. Default 10. n_blocks Integer. Number blocks SNPs divided processing. parameter affects memory requirements. Default 100. n_threads Integer. Number threads OpenMP parallel processing. Default 1. gxg_h5_group Character. Name HDF5 group within mask file containing gene--gene interaction masks. SNPs group included gene--gene interactions. Defaults \"gxg\". ld_h5_group Character. Name HDF5 group within mask file containing linkage disequilibrium masks. SNPs group excluded analysis. Defaults \"ld\". rand_seed Integer. Seed random vector generation. -1, seed set. Default -1. log_level Character. Logging level messages. Must uppercase (e.g., \"DEBUG\", \"INFO\", \"WARNING\", \"ERROR\"). Default \"WARNING\".","code":""},{"path":"https://lcrawlab.github.io/sme/reference/sme.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"Sparse Marginal Epistasis Test (SME) — sme","text":"list containing: summary: tibble summarizing results tested SNP, including: id: Variant ID. index: Index SNP dataset. chromosome: Chromosome number. position: Genomic position SNP. p: P value gene--gene interaction test. pve: Proportion variance explained (PVE) gene--gene interactions. vc: Variance component estimate. se: Standard error variance component. pve: long-format tibble PVE variance components. vc_estimate: long-format tibble variance component estimates. vc_se: long-format tibble standard errors variance components. average_duration: Average computation time per SNP.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/sme.html","id":"details","dir":"Reference","previous_headings":"","what":"Details","title":"Sparse Marginal Epistasis Test (SME) — sme","text":"function integrates PLINK-formatted genotype phenotype data perform marginal epistasis tests set SNPs. Using stochastic trace estimation, method computes variance components gene--gene interaction genetic relatedness using MQS estimator. process parallelized using OpenMP n_threads > 1. memory requirements computation time scaling can optimized parameters chunk_size, n_randvecs, n_blocks. Mask Format Requirements mask file format HDF5 file used storing index data masking process. format supports data retrieval index. required groups datasets within HDF5 file: required group names can configured input parameters. defaults described . Groups: ld: Stores SNPs LD focal SNP. SNPs excluded. gxg: Stores indices SNPs marginal epistasis test conditioned . SNPs included. Datasets: ld/: focal SNP , dataset contains indices SNPs LD block SNP. SNPs excluded gene--gene interaction covariance matrix. gxg/: focal SNP , dataset contains indices SNPs include gene--gene interaction covariance matrix focal SNP . Important: indices mask file data zero-based, matching zero-based indices PLINK .bim file.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/sme.html","id":"references","dir":"Reference","previous_headings":"","what":"References","title":"Sparse Marginal Epistasis Test (SME) — sme","text":"Stamp, J., Pattillo Smith, S., Weinreich, D., & Crawford, L. (2025). Sparse modeling interactions enables fast detection genome-wide epistasis biobank-scale studies. bioRxiv, 2025.01.11.632557. Stamp, J., DenAdel, ., Weinreich, D., & Crawford, L. (2023). Leveraging genetic correlation traits improves detection epistasis genome-wide association studies. G3: Genes, Genomes, Genetics, 13(8), jkad118. Crawford, L., Zeng, P., Mukherjee, S., & Zhou, X. (2017). Detecting epistasis marginal epistasis test genetic mapping studies quantitative traits. PLoS genetics, 13(7), e1006869.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/sme.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"Sparse Marginal Epistasis Test (SME) — sme","text":"","code":"plink_file <- gsub(\"\\\\.bed\", \"\", system.file(\"testdata\", \"test.bed\", package=\"smer\")) pheno_file <- system.file(\"testdata\", \"test_h2_0.5.pheno\", package=\"smer\") mask_file <- \"\" # Parameter inputs chunk_size <- 10 n_randvecs <- 10 n_blocks <- 10 n_threads <- 1 # 1-based Indices of SNPs to be analyzed n_snps <- 100 snp_indices <- 1:n_snps sme_result <- sme( plink_file, pheno_file, mask_file, snp_indices, chunk_size, n_randvecs, n_blocks, n_threads ) head(sme_result$summary) #> # A tibble: 6 × 8 #> id index chromosome position p pve vc se #> #> 1 rs1 1 1 1 0.879 -0.0395 -0.0388 0.0332 #> 2 rs2 2 1 2 0.276 0.0218 0.0212 0.0358 #> 3 rs3 3 1 3 0.539 -0.00572 -0.00557 0.0565 #> 4 rs4 4 1 4 0.398 0.0151 0.0146 0.0565 #> 5 rs5 5 1 5 0.860 -0.0441 -0.0428 0.0397 #> 6 rs6 6 1 6 0.524 -0.00360 -0.00351 0.0575"},{"path":"https://lcrawlab.github.io/sme/reference/write_hdf5_dataset.html","id":null,"dir":"Reference","previous_headings":"","what":"Write Data to an HDF5 Dataset — write_hdf5_dataset","title":"Write Data to an HDF5 Dataset — write_hdf5_dataset","text":"function writes new data existing HDF5 file. dataset already exists, replaced new data.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/write_hdf5_dataset.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"Write Data to an HDF5 Dataset — write_hdf5_dataset","text":"","code":"write_hdf5_dataset(file_name, dataset_name, new_data)"},{"path":"https://lcrawlab.github.io/sme/reference/write_hdf5_dataset.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"Write Data to an HDF5 Dataset — write_hdf5_dataset","text":"file_name character string specifying path HDF5 file. dataset_name character string specifying name dataset written HDF5 file. new_data new data write dataset. data must compatible dataset's structure.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/write_hdf5_dataset.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"Write Data to an HDF5 Dataset — write_hdf5_dataset","text":"return value; function modifies specified dataset HDF5 file.","code":""},{"path":"https://lcrawlab.github.io/sme/reference/write_hdf5_dataset.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"Write Data to an HDF5 Dataset — write_hdf5_dataset","text":"","code":"data_to_write <- 1:10 # Create an empty HDF5 file hdf5_file <- tempfile() create_hdf5_file(hdf5_file) # Write new data to a dataset in the HDF5 file write_hdf5_dataset(hdf5_file, \"group/dataset\", data_to_write)"},{"path":"https://lcrawlab.github.io/sme/news/index.html","id":"smer-001","dir":"Changelog","previous_headings":"","what":"smer 0.0.1","title":"smer 0.0.1","text":"CRAN release: 2025-01-16 Version used publication SME.","code":""}]