From 38bda5cfa20b53e83eafe8946db2842fd00133ca Mon Sep 17 00:00:00 2001 From: Al-Murphy Date: Tue, 17 Dec 2024 17:48:42 +0000 Subject: [PATCH] Update IEU GWAS API --- .github/workflows/rworkflows.yml | 11 +- DESCRIPTION | 2 +- docs/404.html | 121 ++ docs/articles/MungeSumstats.html | 1418 +++++++++++++++++ docs/articles/OpenGWAS.html | 341 ++++ docs/articles/docker.html | 278 ++++ docs/articles/index.html | 96 ++ docs/authors.html | 122 ++ docs/bootstrap-toc.css | 60 + docs/bootstrap-toc.js | 159 ++ docs/docsearch.css | 148 ++ docs/docsearch.js | 85 + docs/index.html | 249 +++ docs/link.svg | 12 + docs/news/index.html | 1098 +++++++++++++ docs/pkgdown.css | 384 +++++ docs/pkgdown.js | 108 ++ docs/pkgdown.yml | 9 + docs/reference/DF_to_dt.html | 118 ++ docs/reference/axel.html | 156 ++ docs/reference/check_allele_flip.html | 231 +++ docs/reference/check_allele_merge.html | 116 ++ docs/reference/check_bi_allelic.html | 164 ++ docs/reference/check_bp_range.html | 163 ++ docs/reference/check_chr.html | 155 ++ docs/reference/check_col_order.html | 118 ++ docs/reference/check_drop_indels.html | 159 ++ docs/reference/check_dup_bp.html | 162 ++ docs/reference/check_dup_col.html | 116 ++ docs/reference/check_dup_row.html | 149 ++ docs/reference/check_dup_snp.html | 162 ++ .../check_effect_columns_nonzero.html | 150 ++ docs/reference/check_empty_cols.html | 117 ++ docs/reference/check_four_step_col.html | 116 ++ docs/reference/check_frq.html | 150 ++ docs/reference/check_frq_maf.html | 116 ++ docs/reference/check_info_score.html | 142 ++ docs/reference/check_ldsc_format.html | 174 ++ docs/reference/check_miss_data.html | 153 ++ docs/reference/check_multi_gwas.html | 136 ++ docs/reference/check_multi_rs_snp.html | 163 ++ docs/reference/check_n_int.html | 129 ++ docs/reference/check_n_num.html | 156 ++ docs/reference/check_no_allele.html | 185 +++ docs/reference/check_no_chr_bp.html | 171 ++ docs/reference/check_no_rs_snp.html | 182 +++ docs/reference/check_no_snp.html | 178 
+++ docs/reference/check_numeric.html | 120 ++ docs/reference/check_on_ref_genome.html | 184 +++ docs/reference/check_pos_se.html | 174 ++ docs/reference/check_range_p_val.html | 146 ++ docs/reference/check_row_snp.html | 143 ++ docs/reference/check_save_path.html | 149 ++ docs/reference/check_signed_col.html | 165 ++ docs/reference/check_small_p_val.html | 140 ++ docs/reference/check_strand_ambiguous.html | 156 ++ docs/reference/check_tabular.html | 111 ++ docs/reference/check_two_step_col.html | 117 ++ docs/reference/check_vcf.html | 111 ++ docs/reference/check_vital_col.html | 111 ++ docs/reference/check_zscore.html | 173 ++ docs/reference/column_dictionary.html | 128 ++ docs/reference/compute_nsize.html | 180 +++ docs/reference/compute_sample_size.html | 165 ++ docs/reference/compute_sample_size_n.html | 140 ++ docs/reference/compute_sample_size_neff.html | 150 ++ docs/reference/convert_sumstats.html | 116 ++ docs/reference/download_vcf.html | 169 ++ docs/reference/downloader.html | 188 +++ docs/reference/drop_duplicate_cols.html | 111 ++ docs/reference/drop_duplicate_rows.html | 115 ++ docs/reference/find_sumstats.html | 233 +++ docs/reference/format_sumstats.html | 663 ++++++++ docs/reference/formatted_example.html | 140 ++ docs/reference/get_chain_file.html | 140 ++ docs/reference/get_eff_frq_allele_combns.html | 124 ++ docs/reference/get_genome_build.html | 178 +++ docs/reference/get_genome_builds.html | 204 +++ docs/reference/get_unique_name_log_file.html | 117 ++ docs/reference/get_vcf_sample_ids.html | 113 ++ docs/reference/granges_to_dt.html | 116 ++ docs/reference/hg19ToHg38.html | 119 ++ docs/reference/hg38ToHg19.html | 119 ++ docs/reference/ieu-a-298.html | 116 ++ docs/reference/import_sumstats.html | 515 ++++++ docs/reference/index.html | 209 +++ docs/reference/index_tabular.html | 179 +++ docs/reference/index_vcf.html | 151 ++ docs/reference/infer_effect_column.html | 231 +++ docs/reference/is_tabix.html | 114 ++ docs/reference/liftover.html | 221 
+++ docs/reference/list_sumstats.html | 142 ++ docs/reference/load_ref_genome_data.html | 146 ++ docs/reference/load_snp_loc_data.html | 129 ++ docs/reference/logs_example.html | 135 ++ docs/reference/make_allele_upper.html | 112 ++ docs/reference/message_parallel.html | 105 ++ docs/reference/messager.html | 115 ++ docs/reference/parse_dropped_INFO.html | 111 ++ docs/reference/parse_dropped_chrom.html | 111 ++ docs/reference/parse_dropped_duplicates.html | 111 ++ docs/reference/parse_dropped_nonA1A2.html | 111 ++ .../reference/parse_dropped_nonBiallelic.html | 111 ++ docs/reference/parse_dropped_nonRef.html | 111 ++ docs/reference/parse_flipped.html | 111 ++ docs/reference/parse_genome_build.html | 111 ++ docs/reference/parse_idStandard.html | 111 ++ docs/reference/parse_logs.html | 137 ++ docs/reference/parse_pval_large.html | 111 ++ docs/reference/parse_pval_neg.html | 111 ++ docs/reference/parse_pval_small.html | 111 ++ docs/reference/parse_report.html | 111 ++ docs/reference/parse_snps_freq_05.html | 111 ++ docs/reference/parse_snps_not_formatted.html | 111 ++ docs/reference/parse_time.html | 111 ++ docs/reference/preview_sumstats.html | 112 ++ docs/reference/raw_ALSvcf.html | 129 ++ docs/reference/raw_eduAttainOkbay.html | 141 ++ docs/reference/read_header.html | 134 ++ docs/reference/read_log_pval.html | 129 ++ docs/reference/read_sumstats.html | 167 ++ docs/reference/read_vcf.html | 277 ++++ docs/reference/read_vcf_genome.html | 130 ++ docs/reference/read_vcf_info.html | 111 ++ docs/reference/read_vcf_markername.html | 111 ++ docs/reference/read_vcf_parallel.html | 235 +++ docs/reference/register_cores.html | 123 ++ docs/reference/remove_empty_cols.html | 117 ++ docs/reference/report_summary.html | 112 ++ docs/reference/select_vcf_fields.html | 144 ++ docs/reference/sort_coord_genomicranges.html | 114 ++ docs/reference/sort_coords.html | 130 ++ docs/reference/sort_coords_datatable.html | 127 ++ docs/reference/standardise_header.html | 163 ++ 
docs/reference/sumstatsColHeaders.html | 168 ++ docs/reference/supported_suffixes.html | 128 ++ docs/reference/to_granges.html | 147 ++ docs/reference/to_vranges.html | 112 ++ docs/reference/unlist_dt.html | 115 ++ docs/reference/validate_parameters.html | 457 ++++++ docs/reference/vcf2df.html | 197 +++ docs/reference/write_sumstats.html | 196 +++ docs/sitemap.xml | 399 +++++ 143 files changed, 23969 insertions(+), 5 deletions(-) create mode 100644 docs/404.html create mode 100644 docs/articles/MungeSumstats.html create mode 100644 docs/articles/OpenGWAS.html create mode 100644 docs/articles/docker.html create mode 100644 docs/articles/index.html create mode 100644 docs/authors.html create mode 100644 docs/bootstrap-toc.css create mode 100644 docs/bootstrap-toc.js create mode 100644 docs/docsearch.css create mode 100644 docs/docsearch.js create mode 100644 docs/index.html create mode 100644 docs/link.svg create mode 100644 docs/news/index.html create mode 100644 docs/pkgdown.css create mode 100644 docs/pkgdown.js create mode 100644 docs/pkgdown.yml create mode 100644 docs/reference/DF_to_dt.html create mode 100644 docs/reference/axel.html create mode 100644 docs/reference/check_allele_flip.html create mode 100644 docs/reference/check_allele_merge.html create mode 100644 docs/reference/check_bi_allelic.html create mode 100644 docs/reference/check_bp_range.html create mode 100644 docs/reference/check_chr.html create mode 100644 docs/reference/check_col_order.html create mode 100644 docs/reference/check_drop_indels.html create mode 100644 docs/reference/check_dup_bp.html create mode 100644 docs/reference/check_dup_col.html create mode 100644 docs/reference/check_dup_row.html create mode 100644 docs/reference/check_dup_snp.html create mode 100644 docs/reference/check_effect_columns_nonzero.html create mode 100644 docs/reference/check_empty_cols.html create mode 100644 docs/reference/check_four_step_col.html create mode 100644 docs/reference/check_frq.html create mode 
100644 docs/reference/check_frq_maf.html create mode 100644 docs/reference/check_info_score.html create mode 100644 docs/reference/check_ldsc_format.html create mode 100644 docs/reference/check_miss_data.html create mode 100644 docs/reference/check_multi_gwas.html create mode 100644 docs/reference/check_multi_rs_snp.html create mode 100644 docs/reference/check_n_int.html create mode 100644 docs/reference/check_n_num.html create mode 100644 docs/reference/check_no_allele.html create mode 100644 docs/reference/check_no_chr_bp.html create mode 100644 docs/reference/check_no_rs_snp.html create mode 100644 docs/reference/check_no_snp.html create mode 100644 docs/reference/check_numeric.html create mode 100644 docs/reference/check_on_ref_genome.html create mode 100644 docs/reference/check_pos_se.html create mode 100644 docs/reference/check_range_p_val.html create mode 100644 docs/reference/check_row_snp.html create mode 100644 docs/reference/check_save_path.html create mode 100644 docs/reference/check_signed_col.html create mode 100644 docs/reference/check_small_p_val.html create mode 100644 docs/reference/check_strand_ambiguous.html create mode 100644 docs/reference/check_tabular.html create mode 100644 docs/reference/check_two_step_col.html create mode 100644 docs/reference/check_vcf.html create mode 100644 docs/reference/check_vital_col.html create mode 100644 docs/reference/check_zscore.html create mode 100644 docs/reference/column_dictionary.html create mode 100644 docs/reference/compute_nsize.html create mode 100644 docs/reference/compute_sample_size.html create mode 100644 docs/reference/compute_sample_size_n.html create mode 100644 docs/reference/compute_sample_size_neff.html create mode 100644 docs/reference/convert_sumstats.html create mode 100644 docs/reference/download_vcf.html create mode 100644 docs/reference/downloader.html create mode 100644 docs/reference/drop_duplicate_cols.html create mode 100644 docs/reference/drop_duplicate_rows.html create mode 
100644 docs/reference/find_sumstats.html create mode 100644 docs/reference/format_sumstats.html create mode 100644 docs/reference/formatted_example.html create mode 100644 docs/reference/get_chain_file.html create mode 100644 docs/reference/get_eff_frq_allele_combns.html create mode 100644 docs/reference/get_genome_build.html create mode 100644 docs/reference/get_genome_builds.html create mode 100644 docs/reference/get_unique_name_log_file.html create mode 100644 docs/reference/get_vcf_sample_ids.html create mode 100644 docs/reference/granges_to_dt.html create mode 100644 docs/reference/hg19ToHg38.html create mode 100644 docs/reference/hg38ToHg19.html create mode 100644 docs/reference/ieu-a-298.html create mode 100644 docs/reference/import_sumstats.html create mode 100644 docs/reference/index.html create mode 100644 docs/reference/index_tabular.html create mode 100644 docs/reference/index_vcf.html create mode 100644 docs/reference/infer_effect_column.html create mode 100644 docs/reference/is_tabix.html create mode 100644 docs/reference/liftover.html create mode 100644 docs/reference/list_sumstats.html create mode 100644 docs/reference/load_ref_genome_data.html create mode 100644 docs/reference/load_snp_loc_data.html create mode 100644 docs/reference/logs_example.html create mode 100644 docs/reference/make_allele_upper.html create mode 100644 docs/reference/message_parallel.html create mode 100644 docs/reference/messager.html create mode 100644 docs/reference/parse_dropped_INFO.html create mode 100644 docs/reference/parse_dropped_chrom.html create mode 100644 docs/reference/parse_dropped_duplicates.html create mode 100644 docs/reference/parse_dropped_nonA1A2.html create mode 100644 docs/reference/parse_dropped_nonBiallelic.html create mode 100644 docs/reference/parse_dropped_nonRef.html create mode 100644 docs/reference/parse_flipped.html create mode 100644 docs/reference/parse_genome_build.html create mode 100644 docs/reference/parse_idStandard.html create mode 
100644 docs/reference/parse_logs.html create mode 100644 docs/reference/parse_pval_large.html create mode 100644 docs/reference/parse_pval_neg.html create mode 100644 docs/reference/parse_pval_small.html create mode 100644 docs/reference/parse_report.html create mode 100644 docs/reference/parse_snps_freq_05.html create mode 100644 docs/reference/parse_snps_not_formatted.html create mode 100644 docs/reference/parse_time.html create mode 100644 docs/reference/preview_sumstats.html create mode 100644 docs/reference/raw_ALSvcf.html create mode 100644 docs/reference/raw_eduAttainOkbay.html create mode 100644 docs/reference/read_header.html create mode 100644 docs/reference/read_log_pval.html create mode 100644 docs/reference/read_sumstats.html create mode 100644 docs/reference/read_vcf.html create mode 100644 docs/reference/read_vcf_genome.html create mode 100644 docs/reference/read_vcf_info.html create mode 100644 docs/reference/read_vcf_markername.html create mode 100644 docs/reference/read_vcf_parallel.html create mode 100644 docs/reference/register_cores.html create mode 100644 docs/reference/remove_empty_cols.html create mode 100644 docs/reference/report_summary.html create mode 100644 docs/reference/select_vcf_fields.html create mode 100644 docs/reference/sort_coord_genomicranges.html create mode 100644 docs/reference/sort_coords.html create mode 100644 docs/reference/sort_coords_datatable.html create mode 100644 docs/reference/standardise_header.html create mode 100644 docs/reference/sumstatsColHeaders.html create mode 100644 docs/reference/supported_suffixes.html create mode 100644 docs/reference/to_granges.html create mode 100644 docs/reference/to_vranges.html create mode 100644 docs/reference/unlist_dt.html create mode 100644 docs/reference/validate_parameters.html create mode 100644 docs/reference/vcf2df.html create mode 100644 docs/reference/write_sumstats.html create mode 100644 docs/sitemap.xml diff --git a/.github/workflows/rworkflows.yml 
b/.github/workflows/rworkflows.yml index cffd6a3e..9f91cafe 100644 --- a/.github/workflows/rworkflows.yml +++ b/.github/workflows/rworkflows.yml @@ -22,13 +22,16 @@ jobs: fail-fast: ${{ false }} matrix: config: - - os: ubuntu-latest + #- os: ubuntu-latest + # r: devel + # bioc: devel + cont: bioconductor/bioconductor_docker:devel + - os: macOS-latest + #r: latest + #bioc: release r: devel bioc: devel cont: bioconductor/bioconductor_docker:devel - - os: macOS-latest - r: latest - bioc: release # - os: windows-latest # r: latest # bioc: release diff --git a/DESCRIPTION b/DESCRIPTION index 241a2abc..041004f7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: MungeSumstats Type: Package Title: Standardise summary statistics from GWAS -Version: 1.15.3 +Version: 1.15.4 Authors@R: c(person(given = "Alan", family = "Murphy", diff --git a/docs/404.html b/docs/404.html new file mode 100644 index 00000000..7cd223ac --- /dev/null +++ b/docs/404.html @@ -0,0 +1,121 @@ + + + + + + + +Page not found (404) • MungeSumstats + + + + + + + + + + + +
+
+ + + + +
+
+ + +Content not found. Please use links in the navbar. + +
+ + + +
+ + + +
+ +
+

+

Site built with pkgdown 2.0.9.

+
+ +
+
+ + + + + + + + diff --git a/docs/articles/MungeSumstats.html b/docs/articles/MungeSumstats.html new file mode 100644 index 00000000..d0fc4c27 --- /dev/null +++ b/docs/articles/MungeSumstats.html @@ -0,0 +1,1418 @@ + + + + + + + +`MungeSumstats`: Getting started • MungeSumstats + + + + + + + + + + + + +
+
+ + + + +
+
+ + + + + +
+

Overview +

+

The MungeSumstats package is designed to facilitate the +standardisation of GWAS summary statistics as utilised in our Nature +Genetics paper1.

+

The package is designed to handle the lack of standardisation of +output files by the GWAS community. There is a group who have now +manually standardised many GWAS: R interface to the IEU GWAS +database API • ieugwasr and gwasvcf but because a lot +of GWAS remain closed access, these repositories are not all +encompassing.

+

The GWAS-Download +project has collated summary statistics from 200+ GWAS. This +repository has been utilised to identify the most common formats, all of +which can be standardised with MungeSumstats.

+

Moreover, there is an emerging standard of VCF format for summary +statistics files with multiple, useful, associated R packages such as +vcfR. However, there is currently no method to convert VCF +formats to a standardised format that matches older approaches.

+

The MungeSumstats package standardises both VCF and the most +common summary statistic file formats to enable downstream integration +and analysis.

+

MungeSumstats also offers comprehensive Quality Control (QC) +steps which are important prerequisites for downstream analysis like +Linkage disequilibrium score regression (LDSC) and MAGMA.

+

Moreover, MungeSumstats is efficiently written resulting in +all reformatting and quality control checks completing in minutes for +GWAS summary statistics with 500k SNPs on a standard desktop machine. +This speed can be increased further by increasing the number of threads +(nThread) for data.table to use.

+

Currently MungeSumstats only works on data from humans, as +it uses human-based genome references.

+
+
+

Aim +

+

MungeSumstats will ensure that all the essential columns for +analysis are present and syntactically correct. Generally, summary +statistic files include (but are not limited to) the columns:

+
    +
  • SNP : SNP ID (rs IDs)
  • +
  • CHR : Chromosome number
  • +
  • BP : Base pair positions
  • +
  • A1 : reference allele
  • +
  • A2 : alternative allele
  • +
  • Z : Z-score
  • +
  • BETA : Effect size estimate relative to the alternative allele
  • +
  • P : Unadjusted p-value for SNP
  • +
  • SE : The standard error
  • +
  • N : Sample size
  • +
  • INFO: The imputation information score
  • +
  • FRQ: The minor/effect allele frequency (MAF/EAF) of the SNP
  • +
+

MungeSumstats uses a mapping file to infer the inputted +column names (run data("sumstatsColHeaders") to view +these). This mapping file is far more comprehensive than any other +publicly available munging tool containing more than 200 unique mappings +at the time of writing this vignette. However, if your column headers +are missing or if you want to change the mapping, you can do so by +passing your own mapping file (see +format_sumstats(mapping_file)).

+

MungeSumstats offers unmatched levels of quality control to +ensure, for example, consistency of allele assignment and direction of +effects. Tests run by MungeSumstats include:

+
    +
  • Check VCF format
  • +
  • Check tab, space or comma delimited, zipped, csv or tsv file
  • +
  • Check for header name synonyms
  • +
  • Check for multiple models or traits in GWAS
  • +
  • Check for uniformity in SNP ID - no mix of rs/missing rs/chr:bp
  • +
  • Check for CHR:BP:A2:A1 all in one column
  • +
  • Check for CHR:BP in one column
  • +
  • Check for A1/A2 in one column
  • +
  • Check if CHR and/or BP is missing (infer from reference genome)
  • +
  • Check if SNP ID is missing (infer from reference genome)
  • +
  • Check if A1 and/or A2 are missing (infer from reference genome)
  • +
  • Check that vital columns are present (SNP,CHR,BP,P,A1,A2)
  • +
  • Check for one signed/effect column +(Z,OR,BETA,LOG_ODDS,SIGNED_SUMSTAT)
  • +
  • Check for missing data
  • +
  • Check for duplicated columns
  • +
  • Check for small p-values (lower than 5e-324)
  • +
  • Check N column is an integer
  • +
  • Check for SNPs with N greater than 5 times standard dev. plus the +mean
  • +
  • Check SNPs are RS IDs
  • +
  • Check for uniformity of SNP ID format
  • +
  • Check for duplicated rows, based on SNP ID
  • +
  • Check for duplicated rows, based on base-pair position
  • +
  • Check for SNPs on reference genome. Correct not found SNP IDs using +CHR and BP (infer from reference genome)
  • +
  • Check INFO score
  • +
  • Check FRQ value
  • +
  • Check FRQ is minor allele frequency (MAF)
  • +
  • Check that the SNPs’ standard error (SE) is positive
  • +
  • Check that SNPs’ effect columns (like BETA) aren’t equal to 0
  • +
  • Check for strand-ambiguous SNPs
  • +
  • Check for non-biallelic SNPs (infer from reference genome)
  • +
  • Check for allele flipping
  • +
  • Check for SNPs with nonstandard chromosome names
  • +
  • Check for SNPs on excluded chromosomes (removes non-autosomal SNPs +by default)
  • +
  • Check for z-score (Z) and impute if missing
  • +
  • Check for N and impute if missing
  • +
  • Check output format is LDSC ready
  • +
  • Check output format is IEU OpenGWAS ready
  • +
  • Check and perform liftover to desired reference genome if +necessary
  • +
  • Check for indels in the sumstats and drop them if found (not run by +default)
  • +
+

Users can specify which checks to run on their data. A +note on the allele flipping check: +MungeSumstats infers that the effect allele will always be +the A2 allele; this is the approach taken for IEU GWAS +VCF and as such has also been adopted here. This inference is first made +from the inputted file’s column headers; however, the allele flipping +check ensures this by comparing A1, what should be the reference allele, +to the reference genome. If a SNP’s A1 DNA base doesn’t match the +reference genome but its A2 (what should be the alternative allele) +does, the alleles will be flipped along with the effect information +(e.g. Beta, Odds Ratio, signed summary statistics, FRQ, Z-score*).

+

*-by default the Z-score is assumed to be calculated off the effect +size not the P-value and so will be flipped. This can be changed by a +user.

+

If a test is failed, the user will be notified and if possible, the +input will be corrected. The QC steps from the checks above can also be +adjusted to suit the user’s analysis, see +MungeSumstats::format_sumstats.

+

MungeSumstats can handle VCF, txt, tsv, csv file types or +.gz/.bgz versions of these file types. The package also gives the user +the flexibility to export the reformatted file as tab-delimited, VCF or +R native objects such as data.table, GRanges or VRanges objects. The +output can also be written in an LDSC ready format +which means the file can be fed directly into LDSC without the need for +additional munging. NOTE - If LDSC format is used, the +naming convention of A1 as the reference (genome build) allele and A2 as +the effect allele will be reversed to match LDSC (A1 will now be the +effect allele). See more info on this here. +Note that any effect columns (e.g. Z) will be in relation to A1 now +instead of A2.

+

Please read carefully through our FAQ +Website to gain insight on how best to run MungeSumstats on your +data.

+
+
+

Data +

+

The MungeSumstats package contains small subsets of GWAS +summary statistics files. Firstly, on Educational Attainment by Okbay et +al 2016: PMID: 27898078 PMCID: PMC5509058 DOI: 10.1038/ng1216-1587b.

+

Secondly, a VCF file (VCFv4.2) relating to the Amyotrophic +lateral sclerosis GWAS from the IEU Open GWAS project. Dataset: ebi-a-GCST005647: +https://gwas.mrcieu.ac.uk/datasets/ebi-a-GCST005647/

+

These datasets will be used to showcase MungeSumstats +functionality.

+
+
+

Running MungeSumstats +

+

MungeSumstats is available on Bioconductor. To install the +package on Bioconductor run the following lines of code:

+
if (!require("BiocManager")) install.packages("BiocManager")
+BiocManager::install("MungeSumstats")
+

Once installed, load the package:

+ +

To standardise the summary statistics’ file format, simply call +format_sumstats() passing in the path to your summary +statistics file or directly pass the summary statistics as a dataframe +or datatable. You can specify which genome build was used in the +GWAS (GRCh37 or GRCh38) or, as default, infer the genome build from the +data. The reference genome is used for multiple checks like deriving +missing data such as SNP/BP/CHR/A1/A2 and for QC steps like removing +non-biallelic SNPs, strand-ambiguous SNPs or ensuring correct allele and +direction of SNP effects. The path to the reformatted summary statistics +file can be returned by the function call, the user can specify a +location to save the file or the user can return an R native object for +the data: data.table, VRanges or GRanges object.

+

Note that for a number of the checks employed by +MungeSumstats a reference genome is used. If your GWAS summary +statistics file of interest relates to GRCh38, you will need to +install SNPlocs.Hsapiens.dbSNP155.GRCh38 and +BSgenome.Hsapiens.NCBI.GRCh38 from Bioconductor as +follows:

+
#increase permissible time to download data, in case of slow internet access
+options(timeout=2000)
+BiocManager::install("SNPlocs.Hsapiens.dbSNP155.GRCh38")
+BiocManager::install("BSgenome.Hsapiens.NCBI.GRCh38")
+

If your GWAS summary statistics file of interest relates to +GRCh37, you will need to install +SNPlocs.Hsapiens.dbSNP155.GRCh37 and +BSgenome.Hsapiens.1000genomes.hs37d5 from Bioconductor as +follows:

+
BiocManager::install("SNPlocs.Hsapiens.dbSNP155.GRCh37")
+BiocManager::install("BSgenome.Hsapiens.1000genomes.hs37d5")
+

These may take some time to install and are not included in the +package as some users may only need one of +GRCh37/GRCh38.

+

The Educational Attainment by Okbay GWAS summary statistics file is +saved as a text document in the package’s external data folder so we can +just pass the file path to it straight to MungeSumstats.

+

NOTE - By default, formatted results will be saved +to tempdir(). This means all formatted summary stats will +be deleted upon ending the R session if not copied to a local file path. +Otherwise, to keep formatted summary stats, change +save_path ( +e.g. file.path('./formatted',basename(path))), or make sure +to copy files elsewhere after processing ( +e.g. file.copy(save_path, './formatted/')).

+
+eduAttainOkbayPth <- system.file("extdata","eduAttainOkbay.txt",
+                                  package="MungeSumstats")
+reformatted <- 
+  MungeSumstats::format_sumstats(path=eduAttainOkbayPth,
+                                 ref_genome="GRCh37")
+
## 
+## 
+## ******::NOTE::******
+##  - Formatted results will be saved to `tempdir()` by default.
+##  - This means all formatted summary stats will be deleted upon ending the R session.
+##  - To keep formatted summary stats, change `save_path`  ( e.g. `save_path=file.path('./formatted',basename(path))` ),   or make sure to copy files elsewhere after processing  ( e.g. `file.copy(save_path, './formatted/' )`.
+##  ********************
+
## Formatted summary statistics will be saved to ==>  /var/folders/hd/jm8lzp7s4dl_wlkykzhz66x80000gn/T//RtmpJonkzo/filec4ec6d3b393.tsv.gz
+
## Warning: replacing previous import 'utils::findMatches' by
+## 'S4Vectors::findMatches' when loading 'SNPlocs.Hsapiens.dbSNP155.GRCh37'
+
## Importing tabular file: /private/var/folders/hd/jm8lzp7s4dl_wlkykzhz66x80000gn/T/RtmpKLvRpi/temp_libpath17f3d19176b21/MungeSumstats/extdata/eduAttainOkbay.txt
+
## Checking for empty columns.
+
## Infer Effect Column
+
## First line of summary statistics file:
+
## MarkerName   CHR POS A1  A2  EAF Beta    SE  Pval    
+
## Allele columns are ambiguous, attempting to infer direction
+
## Can't infer allele columns from sumstats
+
## Standardising column headers.
+
## First line of summary statistics file:
+
## MarkerName   CHR POS A1  A2  EAF Beta    SE  Pval    
+
## Summary statistics report:
+##    - 93 rows
+##    - 93 unique variants
+##    - 70 genome-wide significant variants (P<5e-8)
+##    - 20 chromosomes
+
## Checking for multi-GWAS.
+
## Checking for multiple RSIDs on one row.
+
## Checking SNP RSIDs.
+
## Checking for merged allele column.
+
## Checking A1 is uppercase
+
## Checking A2 is uppercase
+
## Checking for incorrect base-pair positions
+
## Checking for missing data.
+
## Checking for duplicate columns.
+
## Checking for duplicated rows.
+
## INFO column not available. Skipping INFO score filtering step.
+
## Filtering SNPs, ensuring SE>0.
+
## Ensuring all SNPs have N<5 std dev above mean.
+
## 47 SNPs (50.5%) have FRQ values > 0.5. Conventionally the FRQ column is intended to show the minor/effect allele frequency.
+## The FRQ column was mapped from one of the following from the inputted  summary statistics file:
+## FRQ, EAF, FREQUENCY, FRQ_U, F_U, MAF, FREQ, FREQ_TESTED_ALLELE, FRQ_TESTED_ALLELE, FREQ_EFFECT_ALLELE, FRQ_EFFECT_ALLELE, EFFECT_ALLELE_FREQUENCY, EFFECT_ALLELE_FREQ, EFFECT_ALLELE_FRQ, A2FREQ, A2FRQ, ALLELE_FREQUENCY, ALLELE_FREQ, ALLELE_FRQ, AF, MINOR_AF, EFFECT_AF, A2_AF, EFF_AF, ALT_AF, ALTERNATIVE_AF, INC_AF, A_2_AF, TESTED_AF, ALLELEFREQ, ALT_FREQ, EAF_HRC, EFFECTALLELEFREQ, FREQ.B, FREQ_EUROPEAN_1000GENOMES, FREQ_HAPMAP, FREQ_TESTED_ALLELE_IN_HRS, FRQ_U_113154, FRQ_U_31358, FRQ_U_344901, FRQ_U_43456, POOLED_ALT_AF, AF_ALT, AF.ALT, AF-ALT, ALT.AF, ALT-AF, A2.AF, A2-AF, AF.EFF, AF_EFF, ALL_AF
+
## As frq_is_maf=TRUE, the FRQ column will not be renamed. If the FRQ values were intended to represent major allele frequency,
+## set frq_is_maf=FALSE to rename the column as MAJOR_ALLELE_FRQ and differentiate it from minor/effect allele frequency.
+
## Sorting coordinates with 'data.table'.
+
## Writing in tabular format ==> /var/folders/hd/jm8lzp7s4dl_wlkykzhz66x80000gn/T//RtmpJonkzo/filec4ec6d3b393.tsv.gz
+
## Summary statistics report:
+##    - 93 rows (100% of original 93 rows)
+##    - 93 unique variants
+##    - 70 genome-wide significant variants (P<5e-8)
+##    - 20 chromosomes
+
## Done munging in 0.051 minutes.
+
## Successfully finished preparing sumstats file, preview:
+
## Reading header.
+
##           SNP   CHR       BP     A1     A2     FRQ   BETA    SE         P
+##        <char> <int>    <int> <char> <char>   <num>  <num> <num>     <num>
+## 1:   rs301800     1  8490603      T      C 0.17910  0.019 0.003 1.794e-08
+## 2: rs11210860     1 43982527      A      G 0.36940  0.017 0.003 2.359e-10
+## 3: rs34305371     1 72733610      A      G 0.08769  0.035 0.005 3.762e-14
+## 4:  rs2568955     1 72762169      T      C 0.23690 -0.017 0.003 1.797e-08
+
## Returning path to saved data.
+

Here we know the summary statistics are based on the reference genome +GRCh37, GRCh38 can also be inputted. Moreover, if you are unsure of the +genome build, leave it as NULL and MungeSumstats will infer +it from the data.

+

Also note that the default dbSNP version used along with the +reference genome is the latest version available on Bioconductor +(currently dbSNP 155) but older versions are also available. Use the +dbSNP input parameter to control this.

+

The arguments in format_sumstats that control the level +of QC conducted by MungeSumstats are:

+
    +
  • +convert_small_p Binary, should +p-values < 5e-324 be converted to 0? Small p-values pass +the R limit and can cause errors with LDSC/MAGMA and should be +converted. Default is TRUE.
  • +
  • +convert_large_p Binary, should p-values >1 be +converted to 1? P-values >1 should not be possible and can cause +errors with LDSC/MAGMA and should be converted. Default is TRUE.
  • +
  • +convert_neg_p Binary, should p-values <0 be +converted to 0? Negative p-values should not be possible and can cause +errors with LDSC/MAGMA and should be converted. Default is TRUE.
  • +
  • +compute_z Whether to compute Z-score column from P. +Default is FALSE. Note that imputing the Z-score for +every SNP will not be perfectly correct and may result in a loss +of power. This should only be done as a last resort.
  • +
  • +force_new_z When a “Z” column already exists, it +will be used by default. To override and compute a new Z-score column +from P set to TRUE.
  • +
  • +compute_n Whether to impute N. Default of 0 won’t +impute, any other integer will be imputed as the N (sample size) for +every SNP in the dataset. Note that imputing the sample +size for every SNP is not correct and should only be done as a last +resort. N can also be inputted with “ldsc”, “sum”, “giant” or “metal” by +passing one of these for this field or a vector of multiple. Sum and an +integer value creates an N column in the output whereas giant, metal or +ldsc create an Neff or effective sample size. If multiples are passed, +the formula used to derive it will be indicated.
  • +
  • +convert_n_int Binary, if N (the number of samples) +is not an integer, should this be rounded? Default is TRUE. +analysis_trait If multiple traits were studied, name of the trait for +analysis from the GWAS. Default is NULL.
  • +
  • +impute_beta Binary, whether BETA should be imputed +using other effect data if it isn’t present in the sumstats. Note that +this imputation is an approximation so could have an effect on +downstream analysis. Use with caution. The different methods +MungeSumstats will try and impute beta (in this order of priority) are: +1. log(OR) 2. Z x SE. Default value is FALSE.
  • +
  • +es_is_beta Binary, whether to map ES to BETA. We +take BETA to be any BETA-like value (including Effect Size). If this is +not the case for your sumstats, change this to FALSE. Default is +TRUE.
  • +
  • +impute_se Binary, whether the standard error should +be imputed using other effect data if it isn’t present in the sumstats. +Note that this imputation is an approximation so could have an effect on +downstream analysis. Use with caution. The different methods +MungeSumstats will try and impute se (in this order of priority) are: 1. +BETA / Z +
      +
    1. abs(BETA/ qnorm(P/2)). Default value is FALSE.
    2. +
    +
  • +
  • +analysis_trait If multiple traits were studied, +name of the trait for analysis from the GWAS. Default is NULL.
  • +
  • +INFO_filter 0-1 The minimum value permissible of +the imputation information score (if present in sumstats file). Default +0.9
  • +
  • +FRQ_filter 0-1 The minimum value permissible of the +frequency(FRQ) of the SNP (i.e. Allele Frequency (AF)) (if present in +sumstats file). By default no filtering is done, i.e. value of 0.
    +
  • +
  • +pos_se Binary Should the standard Error (SE) column +be checked to ensure it is greater than 0? Those that are not are removed +(if present in sumstats file). Default TRUE.
  • +
  • +effect_columns_nonzero Binary should the effect +columns in the data BETA,OR (odds ratio),LOG_ODDS,SIGNED_SUMSTAT be +checked to ensure no SNP=0. Those that do are removed(if present in +sumstats file). Default TRUE.
    +
  • +
  • +N_std Numeric, the number of standard deviations +above the mean a SNP’s N is needed to be removed. Default is 5. +N_dropNA controls whether the SNPs with a missing N +value are dropped or not (Default is TRUE).
  • +
  • +N_dropNA Drop rows where N is missing. Default is +TRUE.
  • +
  • +chr_style Chromosome naming style to use in the +formatted summary statistics file (“NCBI”, “UCSC”, “dbSNP”, or +“Ensembl”). The NCBI and Ensembl styles both code chromosomes as +1-22, X, Y, MT; the UCSC style is +chr1-chr22, chrX, chrY, chrM; and the dbSNP style is +ch1-ch22, chX, chY, chMT. Default is Ensembl.
  • +
  • +rmv_chr Chromosomes to exclude from the formatted +summary statistics file. Use NULL if no filtering is necessary. Default +is c("X", "Y", "MT") which removes all non-autosomal +SNPs.
  • +
  • +on_ref_genome Binary, should a check take place +that all SNPs are on the reference genome by SNP ID. Any SNPs not on the +reference genome, will be corrected from the reference genome (if +possible) using the chromosome and base pair position data. Default is +TRUE
  • +
  • +convert_ref_genome name of the reference genome to +convert to (“GRCh37” or “GRCh38”). This will only occur if the current +genome build does not match. Default is not to convert the genome build +(NULL).
    +
  • +
  • +strand_ambig_filter Binary, should SNPs with +strand-ambiguous alleles be removed. Default is FALSE
  • +
  • +allele_flip_check Binary, should the allele columns +be checked against reference genome to infer if flipping is necessary. +Default is TRUE. allele_flip_drop controls whether the +SNPs for which neither their A1 or A2 base pair values match a reference +genome be dropped. Default is TRUE. allele_flip_z +controls whether the Z-score value should be flipped along with effect +and FRQ columns (e.g. Beta). Default is TRUE. +allele_flip_frq controls whether the frequency (FRQ) +value should be flipped along with effect and Z-score columns +(e.g. Beta). Default is TRUE.
  • +
  • +bi_allelic_filter Binary, should non-biallelic SNPs +be removed. Default is TRUE
  • +
  • +flip_frq_as_biallelic Binary, Should non-bi-allelic +SNPs frequency values be flipped as 1-p despite there being other +alternative alleles? Default is FALSE but if set to TRUE, this allows +non-bi-allelic SNPs to be kept despite needing flipping.
    +
  • +
  • +snp_ids_are_rs_ids Binary, should the SNP IDs +inputted be inferred as RS IDs or some arbitrary ID. Default is +TRUE.
    +
  • +
  • +remove_multi_rs_snp Binary Sometimes summary +statistics can have multiple RSIDs on one row (i.e. related to one SNP), +for example “rs5772025_rs397784053”. This can cause an error so by +default, the first RS ID will be kept and the rest removed +e.g.”rs5772025”. If you want to just remove these SNPs entirely, set it +to TRUE. Default is FALSE.
  • +
  • +frq_is_maf Binary, conventionally the FRQ column is +intended to show the minor/effect allele frequency (MAF) but sometimes +the major allele frequency can be inferred as the FRQ column. This +logical variable indicates that the FRQ column should be renamed to +MAJOR_ALLELE_FRQ if the frequency values appear to relate to the major +allele i.e. >0.5. By default mapping won’t occur i.e. is TRUE.
  • +
  • +indels Binary does your Sumstats file contain +Indels? These don’t exist in our reference file so they will be excluded +from checks if this value is TRUE. Further information -the reference +dataset we use in MSS (dbSNP) does not include indels so any checks like +is the SNP on the reference genome, attempts to impute any missing data +for indels or check the direction of the effect columns can not be done +for indels. Indels will be kept in the dataset if possible but certain +situations (like if there is missing data) can cause an indel to be +removed. See the printed information by MSS during your run to know if +this affects you. Default is TRUE.
  • +
  • +drop_indels Binary should any indels found in the +sumstats be dropped? These can not be checked against a reference +dataset and will have the same RS ID and position as SNPs which can +affect downstream analysis. Default is False.
    +
  • +
  • +drop_na_cols A character vector of column names to +be checked for missing values. Rows with missing values in any of these +columns (if present in the dataset) will be dropped. If +NULL, all columns will be checked for missing values. +Default columns are SNP, chromosome, position, allele 1, allele 2, +effect columns (frequency, beta, Z-score, standard error, log odds, +signed sumstats, odds ratio), p value and N columns.
  • +
  • +dbSNP The dbSNP version to use as a reference - +defaults to the most recent version available (155). Note that with the +9x more SNPs in dbSNP 155 vs 144, run times will increase.
  • +
  • +sort_coordinates Whether to sort by coordinates of +resulting sumstats.
    +
  • +
  • +nThread Number of threads to use for parallel +processes.
  • +
  • +write_vcf Whether to write as VCF (TRUE) or tabular +file (FALSE). While tabix_index is a binary input for +whether to index the formatted summary statistics with tabix for fast +querying.
  • +
  • +return_data Return data.table, +GRanges or VRanges directly to user. Otherwise, +return the path to the saved data. Default is FALSE.
  • +
  • +return_format If return_data is TRUE. Object type +to be returned (“data.table”,“vranges”,“granges”).
  • +
  • +save_format Ensure that output format meets all +requirements to be passed directly into LDSC (“ldsc”) without the need +for additional munging or for IEU OpenGWAS format (“opengwas”) before +saving as a VCF. NOTE - If LDSC format is used, the +naming convention of A1 as the reference (genome build) allele and A2 as +the effect allele will be reversed to match LDSC (A1 will now be the +effect allele). See more info on this here. +Note that any effect columns (e.g. Z) will be in relation to A1 now +instead of A2.
  • +
  • +log_folder_ind Should log files be stored +containing all filtered out SNPs (separate file per filter). The data is +outputted in the same format specified for the resulting sumstats +file.
  • +
  • +log_mungesumstats_msgs Binary Should a log be +stored containing all messages and errors printed by MungeSumstats in a +run.
  • +
  • +imputation_ind Binary Should a column be added for +each imputation step to show what SNPs have imputed values for differing +fields. This includes a field denoting SNP allele flipping (flipped). On +the flipped value, this denotes whether the alleles were switched based +on MungeSumstats initial choice of A1, A2 from the input column headers +and thus may not align with what the creator +intended. Note these columns will be in the formatted +summary statistics returned.
  • +
  • +log_folder File path to the directory for the log +files and the log of MungeSumstats messages to be stored. Default is a +temporary directory.
  • +
  • +force_new If a formatted file of the same name as +save_path already exists, formatting will be skipped and this file will be imported +instead (default). Set to TRUE to override this.
  • +
  • +mapping_file MungeSumstats has a pre-defined +column-name mapping file which should cover the most common column +headers and their interpretations. However, if a column header that is +in your file is missing or the mapping we give is incorrect you can +supply your own mapping file. Must be a 2 column dataframe with column +names “Uncorrected” and “Corrected”. See +data(sumstatsColHeaders) for default mapping and necessary +format.
  • +
+

See ?MungeSumstats::format_sumstats() for the full list +of parameters to control MungeSumstats QC and standardisation steps.

+

VCF files can also be standardised to the same format as other +summary statistic files. A subset of the Amyotrophic lateral sclerosis +GWAS from the ieu open GWAS project (a .vcf file) has been added to +MungeSumstats to demonstrate this functionality. Simply pass the +path to the file in the same manner you would for other summary +statistic files:

+
+#save ALS GWAS from the ieu open GWAS project to a temp directory
+ALSvcfPth <- system.file("extdata","ALSvcf.vcf", package="MungeSumstats")
+
+reformatted_vcf <- 
+  MungeSumstats::format_sumstats(path=ALSvcfPth, 
+                                 ref_genome="GRCh37")
+

You can also get more information on the SNPs which have had data +imputed or have been filtered out by MungeSumstats by using the +imputation_ind and log_folder_ind parameters +respectively. For example:

+
+#set
+reformatted_vcf_2 <- 
+  MungeSumstats::format_sumstats(path=ALSvcfPth,
+                                 ref_genome="GRCh37",
+                                 log_folder_ind=TRUE,
+                                 imputation_ind=TRUE,
+                                 log_mungesumstats_msgs=TRUE)
+
## Time difference of 0.1 secs
+
## Time difference of 0.4 secs
+

Check the file snp_bi_allelic.tsv.gz in the +log_folder directory you supply (by default a temp +directory), for a list of SNPs removed as they are non-bi-allelic. The +text files containing the console output and messages are also stored in +the same directory.

+

Note you can also control the dbSNP version used as a reference +dataset by MungeSumstats using the dbSNP parameter. By +default this will be set to the most recent dbSNP version available +(155).

+

Note that using log_folder_ind returns a list from +format_sumstats which includes the file locations of the +differing classes of removed SNPs. Using +log_mungesumstats_msgs saves the messages to the console to +a file which is returned in the same list. Note that not all the +messages will also print to screen anymore when you set +log_mungesumstats_msgs:

+
+names(reformatted_vcf_2)
+
## [1] "sumstats"  "log_files"
+

A user can load a file to view the excluded SNPs.

+

In this case, SNPs were filtered based on non-bi-allelic +criterion:

+
+print(reformatted_vcf_2$log_files$snp_bi_allelic)
+
## NULL
+

The different types of exclusion which lead to the names are +explained below:

+
    +
  • +snp_multi_rs_one_row - Where the SNP (RS ID) +contained more than one RS ID.
  • +
  • +snp_missing_rs - Where the SNP (RS ID) was missing +the rs prefix. Note that these are only removed when other snps have an +rs prefix.
  • +
  • +snp_multi_colon - Where the SNP ID has multiple +colons (“:”) in one SNP.
  • +
  • +snp_not_found_from_bp_chr - Where the RS ID was +attempted to be imputed from the CHR and BP (Base-Pair) information, +using the reference genome, but wasn’t successful.
  • +
  • +chr_bp_not_found_from_snp - Where the CHR and BP +(Base-Pair) was attempted to be imputed from the SNP (RS ID), using the +reference genome, but wasn’t successful.
  • +
  • +alleles_not_found_from_snp - Where the alleles (A1 +and/or A2) was attempted to be imputed from the SNP (RS ID), using the +reference genome, but wasn’t successful.
  • +
  • +alleles_dont_match_ref_gen - Where the alleles (A1 +and/or A2) don’t match what’s on the reference genome.
  • +
  • +missing_data - Where there is data missing across +the inputted columns.
  • +
  • +dup_snp_id - Where the SNP ID is duplicated in the +input.
  • +
  • +dup_base_pair_position - Where the base-pair +position is duplicated in the input.
  • +
  • +info_filter - SNP INFO value below the specified +threshold.
  • +
  • +se_neg - SNPs SE (Standard Error) value is 0 or +negative.
  • +
  • +effect_col_zero - SNPs effect column(s) value is +zero e.g. BETA=0.
  • +
  • +n_large - SNPs N is N standard deviations greater +than the mean.
  • +
  • +n_null - SNPs N is null.
  • +
  • +chr_excl - SNP has an unrecognized chromosome name +or is on a chromosome that was specified to be excluded.
  • +
  • +snp_strand_ambiguous - SNP is strand +ambiguous.
  • +
  • +snp_bi_allelic - SNP is not bi-allelic.
  • +
  • +MungeSumstats_log_msg - Text file of all messages +to the console created during MungeSumstats run.
  • +
  • +MungeSumstats_log_output - Text file of all errors +to the console created during MungeSumstats run.
  • +
+

Note to export to another type such as R native objects including +data.table, GRanges, VRanges or save as a VCF file, set +return_data=TRUE and choose your +return_format:

+
+#set
+reformatted_vcf_2 <- 
+  MungeSumstats::format_sumstats(path=ALSvcfPth,
+                                 ref_genome="GRCh37", 
+                                 log_folder_ind=TRUE,
+                                 imputation_ind=TRUE,
+                                 log_mungesumstats_msgs=TRUE,
+                                 return_data=TRUE,
+                                 return_format="GRanges")
+

Also you can now output a VCF compatible with IEU OpenGWAS format (Note that +currently all IEU OpenGWAS sumstats are GRCh37, MungeSumstats will throw +a warning if your data isn’t GRCh37 when saving):

+
+#set
+reformatted_vcf_2 <- 
+  MungeSumstats::format_sumstats(path=ALSvcfPth,
+                                 ref_genome="GRCh37", 
+                                 write_vcf=TRUE,
+                                 save_format ="openGWAS")
+

See our publication for further discussion of these checks and +options:

+

Murphy +et al. MungeSumstats: A Bioconductor package for the standardisation and +quality control of many GWAS summary statistics.

+
+
+

Extra Functionality +

+
+

Get genome builds +

+

MungeSumstats also contains a function to quickly infer the +genome build of multiple summary statistic files. This can be called +separately to format_sumstats() which is useful if you want +to just quickly check the genome build:

+
+# Pass path to Educational Attainment Okbay sumstat file to a temp directory
+eduAttainOkbayPth <- system.file("extdata", "eduAttainOkbay.txt",
+                                  package = "MungeSumstats")
+ALSvcfPth <- system.file("extdata","ALSvcf.vcf", package="MungeSumstats")
+sumstats_list <- list(ss1 = eduAttainOkbayPth, ss2 = ALSvcfPth)
+
+ref_genomes <- MungeSumstats::get_genome_builds(sumstats_list = sumstats_list)
+
+
+

Liftover +

+

MungeSumstats exposes the liftover() function +as a general utility for users.

+

Useful features include: - Automatic standardisation of genome build +names (i.e. “hg19”, “hg37”, and “GRCh37” will all be recognized as the +same genome build.) - Ability to specify chrom_col as well +as both start_col and end_col (for variants +that span >1bp). - Ability to return in data.table or +GRanges format. - Ability to specify which chromosome +format (e.g. “chr1” vs. 1) to return GRanges as.

+
+sumstats_dt <- MungeSumstats::formatted_example()
+
## Standardising column headers.
+
## First line of summary statistics file:
+
## MarkerName   CHR POS A1  A2  EAF Beta    SE  Pval    
+
## Sorting coordinates with 'data.table'.
+
+sumstats_dt_hg38 <- MungeSumstats::liftover(sumstats_dt = sumstats_dt, 
+                                            ref_genome = "hg19",
+                                            convert_ref_genome = "hg38")
+
## Performing data liftover from hg19 to hg38.
+
## Converting summary statistics to GenomicRanges.
+
## Downloading chain file...
+
## Downloading chain file from Ensembl.
+
## /var/folders/hd/jm8lzp7s4dl_wlkykzhz66x80000gn/T//RtmpJonkzo/GRCh37_to_GRCh38.chain.gz
+
## Reordering so first three column headers are SNP, CHR and BP in this order.
+
## Reordering so the fourth and fifth columns are A1 and A2.
+
+knitr::kable(head(sumstats_dt_hg38))
+ ++++++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
SNPCHRBPA1A2FRQBETASEPIMPUTATION_gen_build
rs30180018430543TC0.179100.0190.0030e+00TRUE
rs11210860143516856AG0.369400.0170.0030e+00TRUE
rs34305371172267927AG0.087690.0350.0050e+00TRUE
rs2568955172296486TC0.23690-0.0170.0030e+00TRUE
rs1008078190724174TC0.37310-0.0160.0030e+00TRUE
rs61787263198153158TC0.761200.0160.0031e-07TRUE
+
+
+

Quick formatting +

+

In some cases, users may not want to run the full munging pipeline +provided by
MungeSumstats::format_sumstats, but still would like to +take advantage of the file type conversion and column header +standardisation features. This will not be nearly as robust as the full +pipeline, but can still be helpful.

+
+

From disk +

+

To do this, simply run the following:

+
+eduAttainOkbayPth <- system.file("extdata", "eduAttainOkbay.txt",
+                                  package = "MungeSumstats")
+formatted_path <- tempfile(fileext = "_eduAttainOkbay_standardised.tsv.gz")
+
+
+#### 1. Read in the data and standardise header names ####
+dat <- MungeSumstats::read_sumstats(path = eduAttainOkbayPth, 
+                                    standardise_headers = TRUE)
+
## Importing tabular file: /private/var/folders/hd/jm8lzp7s4dl_wlkykzhz66x80000gn/T/RtmpKLvRpi/temp_libpath17f3d19176b21/MungeSumstats/extdata/eduAttainOkbay.txt
+
## Checking for empty columns.
+
## Standardising column headers.
+
## First line of summary statistics file:
+
## MarkerName   CHR POS A1  A2  EAF Beta    SE  Pval    
+
+knitr::kable(head(dat))
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
SNPCHRBPA1A2FRQBETASEP
rs10061788587934707AG0.21640.0210.0040e+00
rs10078831651163406TC0.3713-0.0150.0031e-07
rs1008078191189731TC0.3731-0.0160.0030e+00
rs10432091423373986AG0.60260.0180.0030e+00
rs10496091261482261AG0.2705-0.0180.0030e+00
rs109300082161854736AG0.7183-0.0160.0031e-07
+
+#### 2. Write to disk as a compressed, tab-delimited, tabix-indexed file ####
+formatted_path <- MungeSumstats::write_sumstats(sumstats_dt = dat,
+                                                save_path = formatted_path,
+                                                tabix_index = TRUE,
+                                                write_vcf = FALSE,
+                                                return_path = TRUE)   
+
## Sorting coordinates with 'data.table'.
+
## Writing in tabular format ==> /var/folders/hd/jm8lzp7s4dl_wlkykzhz66x80000gn/T//RtmpJonkzo/filec4ec45c572ed_eduAttainOkbay_standardised.tsv
+
## Writing uncompressed instead of gzipped to enable tabix indexing.
+
## Converting full summary stats file to tabix format for fast querying...
+
## Reading header.
+
## Ensuring file is bgzipped.
+
## Tabix-indexing file.
+
## Removing temporary .tsv file.
+
+
+

From data.table +

+

If you already have your data imported as a data.table, +you can also standardise its headers like so:

+
+#### Mess up some column names ####
+dat_raw <- data.table::copy(dat)
+data.table::setnames(dat_raw, c("SNP","CHR"), c("rsID","Seqnames"))
+#### Add a non-standard column that I want to keep the casing for ####
+dat_raw$Support <- runif(nrow(dat_raw))
+
+dat2 <- MungeSumstats::standardise_header(sumstats_dt = dat_raw,
+                                          uppercase_unmapped = FALSE, 
+                                          return_list = FALSE )
+
## Standardising column headers.
+
## First line of summary statistics file:
+
## rsID Seqnames    BP  A1  A2  FRQ BETA    SE  P   Support 
+
## Returning unmapped column names without making them uppercase.
+
+knitr::kable(head(dat2))
+ ++++++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
SNPCHRBPA1A2FRQBETASEPSupport
rs30180018490603TC0.179100.0190.0030e+000.9805397
rs11210860143982527AG0.369400.0170.0030e+000.7415215
rs34305371172733610AG0.087690.0350.0050e+000.0514463
rs2568955172762169TC0.23690-0.0170.0030e+000.5302125
rs1008078191189731TC0.37310-0.0160.0030e+000.6958239
rs61787263198618714TC0.761200.0160.0031e-070.6885560
+
+
+
+
+

Future Enhancements +

+

The MungeSumstats package aims to be able to handle the most +common summary statistic file formats including VCF. If your file can +not be formatted by MungeSumstats feel free to report the bug +on github: https://github.com/neurogenomics/MungeSumstats along +with your summary statistic file header.

+

We also encourage people to edit the code to resolve their particular +issues too and are happy to incorporate these through pull requests on +github. If your summary statistic file headers are not recognised by +MungeSumstats but correspond to one of:

+
SNP, BP, CHR, A1, A2, P, Z, OR, BETA, LOG_ODDS,
+SIGNED_SUMSTAT, N, N_CAS, N_CON, NSTUDY, INFO or FRQ 
+

feel free to update the +MungeSumstats::sumstatsColHeaders following the approach in +the data.R file and add your mapping. Then use a pull request on github +and we will incorporate this change into the package.

+

A note on MungeSumstats::sumstatsColHeaders for summary +statistic files with A0/A1. The mapping in +MungeSumstats::sumstatsColHeaders converts A0 to A*, this +is a special case so that the code knows to map A0/A1 to A1/A2 +(ref/alt). The special case is needed since ordinarily A1 refers to the +reference not the alternative allele.

+

A note on MungeSumstats::sumstatsColHeaders for summary +statistic files with Effect Size (ES). By default, MSS takes BETA to be +any BETA-like value (including ES). This is coded into the mapping file +- MungeSumstats::sumstatsColHeaders. If this isn’t the case +for your sumstats, you can set the es_is_beta parameter in +MungeSumstats::format_sumstats() to FALSE to avoid this. +Note this is done to try and capture most use cases of MSS.

+
+
+

Further functionality +

+

See the Open +GWAS vignette for how MungeSumstats can be used along with data from +the MRC IEU Open GWAS Project and also MungeSumstats’ functionality to +handle lists of summary statistics files.

+
+
+

Session Information +

+
## R version 4.3.0 (2023-04-21)
+## Platform: x86_64-apple-darwin20 (64-bit)
+## Running under: macOS 15.1.1
+## 
+## Matrix products: default
+## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRblas.0.dylib 
+## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
+## 
+## locale:
+## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
+## 
+## time zone: Europe/London
+## tzcode source: internal
+## 
+## attached base packages:
+## [1] stats     graphics  grDevices utils     datasets  methods   base     
+## 
+## other attached packages:
+## [1] MungeSumstats_1.15.4 BiocStyle_2.30.0    
+## 
+## loaded via a namespace (and not attached):
+##   [1] tidyselect_1.2.1                           
+##   [2] dplyr_1.1.4                                
+##   [3] blob_1.2.4                                 
+##   [4] filelock_1.0.3                             
+##   [5] R.utils_2.12.3                             
+##   [6] Biostrings_2.70.3                          
+##   [7] bitops_1.0-9                               
+##   [8] fastmap_1.2.0                              
+##   [9] RCurl_1.98-1.16                            
+##  [10] BiocFileCache_2.10.2                       
+##  [11] VariantAnnotation_1.48.1                   
+##  [12] GenomicAlignments_1.38.2                   
+##  [13] XML_3.99-0.17                              
+##  [14] digest_0.6.37                              
+##  [15] lifecycle_1.0.4                            
+##  [16] KEGGREST_1.42.0                            
+##  [17] RSQLite_2.3.7                              
+##  [18] magrittr_2.0.3                             
+##  [19] compiler_4.3.0                             
+##  [20] rlang_1.1.4                                
+##  [21] sass_0.4.9                                 
+##  [22] progress_1.2.3                             
+##  [23] tools_4.3.0                                
+##  [24] utf8_1.2.4                                 
+##  [25] yaml_2.3.10                                
+##  [26] data.table_1.16.0                          
+##  [27] rtracklayer_1.62.0                         
+##  [28] knitr_1.48                                 
+##  [29] prettyunits_1.2.0                          
+##  [30] S4Arrays_1.2.1                             
+##  [31] htmlwidgets_1.6.4                          
+##  [32] curl_5.2.3                                 
+##  [33] bit_4.5.0                                  
+##  [34] DelayedArray_0.28.0                        
+##  [35] ieugwasr_1.0.1                             
+##  [36] xml2_1.3.6                                 
+##  [37] abind_1.4-8                                
+##  [38] BiocParallel_1.36.0                        
+##  [39] purrr_1.0.2                                
+##  [40] BiocGenerics_0.48.1                        
+##  [41] desc_1.4.3                                 
+##  [42] R.oo_1.26.0                                
+##  [43] grid_4.3.0                                 
+##  [44] stats4_4.3.0                               
+##  [45] fansi_1.0.6                                
+##  [46] biomaRt_2.58.2                             
+##  [47] SummarizedExperiment_1.32.0                
+##  [48] cli_3.6.3                                  
+##  [49] rmarkdown_2.28                             
+##  [50] crayon_1.5.3                               
+##  [51] generics_0.1.3                             
+##  [52] ragg_1.3.1                                 
+##  [53] BSgenome.Hsapiens.1000genomes.hs37d5_0.99.1
+##  [54] rstudioapi_0.16.0                          
+##  [55] httr_1.4.7                                 
+##  [56] rjson_0.2.23                               
+##  [57] DBI_1.2.3                                  
+##  [58] cachem_1.1.0                               
+##  [59] stringr_1.5.1                              
+##  [60] zlibbioc_1.48.2                            
+##  [61] parallel_4.3.0                             
+##  [62] AnnotationDbi_1.64.1                       
+##  [63] BiocManager_1.30.25                        
+##  [64] XVector_0.42.0                             
+##  [65] restfulr_0.0.15                            
+##  [66] matrixStats_1.4.1                          
+##  [67] vctrs_0.6.5                                
+##  [68] Matrix_1.6-5                               
+##  [69] jsonlite_1.8.9                             
+##  [70] bookdown_0.39                              
+##  [71] IRanges_2.36.0                             
+##  [72] hms_1.1.3                                  
+##  [73] S4Vectors_0.40.2                           
+##  [74] bit64_4.5.2                                
+##  [75] GenomicFiles_1.38.0                        
+##  [76] systemfonts_1.0.6                          
+##  [77] GenomicFeatures_1.54.4                     
+##  [78] jquerylib_0.1.4                            
+##  [79] glue_1.8.0                                 
+##  [80] pkgdown_2.0.9                              
+##  [81] codetools_0.2-20                           
+##  [82] stringi_1.8.4                              
+##  [83] GenomeInfoDb_1.38.8                        
+##  [84] BiocIO_1.12.0                              
+##  [85] GenomicRanges_1.54.1                       
+##  [86] tibble_3.2.1                               
+##  [87] pillar_1.9.0                               
+##  [88] SNPlocs.Hsapiens.dbSNP155.GRCh37_0.99.24   
+##  [89] rappdirs_0.3.3                             
+##  [90] htmltools_0.5.8.1                          
+##  [91] GenomeInfoDbData_1.2.11                    
+##  [92] BSgenome_1.70.2                            
+##  [93] dbplyr_2.5.0                               
+##  [94] R6_2.5.1                                   
+##  [95] textshaping_0.3.7                          
+##  [96] evaluate_1.0.0                             
+##  [97] lattice_0.22-6                             
+##  [98] Biobase_2.62.0                             
+##  [99] R.methodsS3_1.8.2                          
+## [100] png_0.1-8                                  
+## [101] Rsamtools_2.18.0                           
+## [102] memoise_2.0.1                              
+## [103] bslib_0.8.0                                
+## [104] SparseArray_1.2.4                          
+## [105] xfun_0.48                                  
+## [106] fs_1.6.4                                   
+## [107] MatrixGenerics_1.14.0                      
+## [108] pkgconfig_2.0.3
+
+
+
+

References +

+
+
+
1.
+
Nathan G. Skene, T. E. B., Julien Bryois. +Genetic identification of brain cell types underlying schizophrenia. +Nature Genetics (2018). doi:10.1038/s41588-018-0129-5 +
+
+
+
+
+ + + +
+ + + +
+ +
+

+

Site built with pkgdown 2.0.9.

+
+ +
+
+ + + + + + + + diff --git a/docs/articles/OpenGWAS.html b/docs/articles/OpenGWAS.html new file mode 100644 index 00000000..771f3cd6 --- /dev/null +++ b/docs/articles/OpenGWAS.html @@ -0,0 +1,341 @@ + + + + + + + +Import GWAS summary statistics from Open GWAS • MungeSumstats + + + + + + + + + + + + +
+
+ + + + +
+
+ + + + + +

MungeSumstats now offers high throughput query and import +functionality to data from the MRC IEU Open GWAS Project.

+

This is made possible by the use of the IEU OpenGWAS R package: +ieugwasr.

+

Before you can use this functionality however, please complete the +following steps:

+
+

Authenticate Access IEU OpenGWAS API +

+

To authenticate, you need to generate a token from the OpenGWAS +website. The token behaves like a password, and it will be used to +authorise the requests you make to the OpenGWAS API. Here are the steps +to generate the token and then have ieugwasr automatically +use it for your queries:

+
    +
  1. Login to https://api.opengwas.io/profile/ +
  2. +
  3. Generate a new token
  4. +
  5. Add OPENGWAS_JWT=<token> to your .Renviron file, +this can be edited in R by running +usethis::edit_r_environ() +
  6. +
  7. Restart your R session
  8. +
  9. To check that your token is being recognised, run +ieugwasr::get_opengwas_jwt(). If it returns a long random +string then you are authenticated.
  10. +
  11. To check that your token is working, run +ieugwasr::user(). It will make a request to the API for +your user information using your token. It should return a list with +your user information. If it returns an error, then your token is not +working.
  12. +
  13. Make sure you have submitted user information to increase your API +limit at https://api.opengwas.io/profile/.
  14. +
+
+
+

Find GWAS datasets +

+

We can search by terms and with other filters like sample size:

+
+#### Search for datasets ####
+metagwas <- MungeSumstats::find_sumstats(traits = c("parkinson","alzheimer"), 
+                                         min_sample_size = 1000)
+head(metagwas,3)
+ids <- (dplyr::arrange(metagwas, nsnp))$id  
+
##          id               trait group_name year    author
+## 1 ieu-a-298 Alzheimer's disease     public 2013   Lambert
+## 2   ieu-b-2 Alzheimer's disease     public 2019 Kunkle BW
+## 3 ieu-a-297 Alzheimer's disease     public 2013   Lambert
+##                                                                                                                                                                                                                                                                                                                    consortium
+## 1                                                                                                                                                                                                                                                                                                                        IGAP
+## 2 Alzheimer Disease Genetics Consortium (ADGC), European Alzheimer's Disease Initiative (EADI), Cohorts for Heart and Aging Research in Genomic Epidemiology Consortium (CHARGE), Genetic and Environmental Risk in AD/Defining Genetic, Polygenic and Environmental Risk for Alzheimer's Disease Consortium (GERAD/PERADES),
+## 3                                                                                                                                                                                                                                                                                                                        IGAP
+##                 sex population     unit     nsnp sample_size       build
+## 1 Males and Females   European log odds    11633       74046 HG19/GRCh37
+## 2 Males and Females   European       NA 10528610       63926 HG19/GRCh37
+## 3 Males and Females   European log odds  7055882       54162 HG19/GRCh37
+##   category                subcategory ontology mr priority     pmid sd
+## 1  Disease Psychiatric / neurological       NA  1        1 24162737 NA
+## 2   Binary Psychiatric / neurological       NA  1        0 30820047 NA
+## 3  Disease Psychiatric / neurological       NA  1        2 24162737 NA
+##                                                                      note ncase
+## 1 Exposure only; Effect allele frequencies are missing; forward(+) strand 25580
+## 2                                                                      NA 21982
+## 3                Effect allele frequencies are missing; forward(+) strand 17008
+##   ncontrol     N
+## 1    48466 74046
+## 2    41944 63926
+## 3    37154 54162
+

You can also search by ID:

+
+### By ID and sample size
+metagwas <- find_sumstats(
+  ids = c("ieu-b-4760", "prot-a-1725", "prot-a-664"),
+  min_sample_size = 5000
+)
+
+
+

Import full results +

+

You can supply import_sumstats() with a list of as many +OpenGWAS IDs as you want, but we’ll just give one to save time.

+
+datasets <- MungeSumstats::import_sumstats(ids = "ieu-a-298",
+                                           ref_genome = "GRCH37")
+
+

Summarise results +

+

By default, import_sumstats returns a named list where +the names are the Open GWAS dataset IDs and the items are the respective +paths to the formatted summary statistics.

+
+print(datasets)
+
## $`ieu-a-298`
+## [1] "/var/folders/hd/jm8lzp7s4dl_wlkykzhz66x80000gn/T//RtmpIEX1aF/ieu-a-298.tsv.gz"
+

You can easily turn this into a data.frame as well.

+
+results_df <- data.frame(id=names(datasets), 
+                         path=unlist(datasets))
+print(results_df)
+
##                  id
+## ieu-a-298 ieu-a-298
+##                                                                                    path
+## ieu-a-298 /var/folders/hd/jm8lzp7s4dl_wlkykzhz66x80000gn/T//RtmpIEX1aF/ieu-a-298.tsv.gz
+
+
+
+

Import full results (parallel) +

+

Optional: Speed up with multi-threaded download via axel.

+
+datasets <- MungeSumstats::import_sumstats(ids = ids, 
+                                           vcf_download = TRUE, 
+                                           download_method = "axel", 
+                                           nThread = max(2,future::availableCores()-2))
+
+
+

Further functionality +

+

See the Getting +started vignette for more information on how to use MungeSumstats +and its functionality.

+
+
+

Session Info +

+
+utils::sessionInfo()
+
## R version 4.3.0 (2023-04-21)
+## Platform: x86_64-apple-darwin20 (64-bit)
+## Running under: macOS 15.1.1
+## 
+## Matrix products: default
+## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRblas.0.dylib 
+## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
+## 
+## locale:
+## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
+## 
+## time zone: Europe/London
+## tzcode source: internal
+## 
+## attached base packages:
+## [1] stats     graphics  grDevices utils     datasets  methods   base     
+## 
+## other attached packages:
+## [1] MungeSumstats_1.15.4 BiocStyle_2.30.0    
+## 
+## loaded via a namespace (and not attached):
+##   [1] tidyselect_1.2.1            dplyr_1.1.4                
+##   [3] blob_1.2.4                  filelock_1.0.3             
+##   [5] R.utils_2.12.3              Biostrings_2.70.3          
+##   [7] bitops_1.0-9                fastmap_1.2.0              
+##   [9] RCurl_1.98-1.16             BiocFileCache_2.10.2       
+##  [11] VariantAnnotation_1.48.1    GenomicAlignments_1.38.2   
+##  [13] XML_3.99-0.17               digest_0.6.37              
+##  [15] lifecycle_1.0.4             KEGGREST_1.42.0            
+##  [17] RSQLite_2.3.7               magrittr_2.0.3             
+##  [19] compiler_4.3.0              rlang_1.1.4                
+##  [21] sass_0.4.9                  progress_1.2.3             
+##  [23] tools_4.3.0                 utf8_1.2.4                 
+##  [25] yaml_2.3.10                 data.table_1.16.0          
+##  [27] rtracklayer_1.62.0          knitr_1.48                 
+##  [29] prettyunits_1.2.0           S4Arrays_1.2.1             
+##  [31] htmlwidgets_1.6.4           curl_5.2.3                 
+##  [33] bit_4.5.0                   DelayedArray_0.28.0        
+##  [35] ieugwasr_1.0.1              xml2_1.3.6                 
+##  [37] abind_1.4-8                 BiocParallel_1.36.0        
+##  [39] purrr_1.0.2                 BiocGenerics_0.48.1        
+##  [41] desc_1.4.3                  R.oo_1.26.0                
+##  [43] grid_4.3.0                  stats4_4.3.0               
+##  [45] fansi_1.0.6                 biomaRt_2.58.2             
+##  [47] SummarizedExperiment_1.32.0 cli_3.6.3                  
+##  [49] rmarkdown_2.28              crayon_1.5.3               
+##  [51] generics_0.1.3              ragg_1.3.1                 
+##  [53] rstudioapi_0.16.0           httr_1.4.7                 
+##  [55] rjson_0.2.23                DBI_1.2.3                  
+##  [57] cachem_1.1.0                stringr_1.5.1              
+##  [59] zlibbioc_1.48.2             parallel_4.3.0             
+##  [61] AnnotationDbi_1.64.1        BiocManager_1.30.25        
+##  [63] XVector_0.42.0              restfulr_0.0.15            
+##  [65] matrixStats_1.4.1           vctrs_0.6.5                
+##  [67] Matrix_1.6-5                jsonlite_1.8.9             
+##  [69] bookdown_0.39               IRanges_2.36.0             
+##  [71] hms_1.1.3                   S4Vectors_0.40.2           
+##  [73] bit64_4.5.2                 systemfonts_1.0.6          
+##  [75] GenomicFeatures_1.54.4      jquerylib_0.1.4            
+##  [77] glue_1.8.0                  pkgdown_2.0.9              
+##  [79] codetools_0.2-20            stringi_1.8.4              
+##  [81] GenomeInfoDb_1.38.8         BiocIO_1.12.0              
+##  [83] GenomicRanges_1.54.1        tibble_3.2.1               
+##  [85] pillar_1.9.0                rappdirs_0.3.3             
+##  [87] htmltools_0.5.8.1           GenomeInfoDbData_1.2.11    
+##  [89] BSgenome_1.70.2             dbplyr_2.5.0               
+##  [91] R6_2.5.1                    textshaping_0.3.7          
+##  [93] evaluate_1.0.0              lattice_0.22-6             
+##  [95] Biobase_2.62.0              R.methodsS3_1.8.2          
+##  [97] png_0.1-8                   Rsamtools_2.18.0           
+##  [99] memoise_2.0.1               bslib_0.8.0                
+## [101] SparseArray_1.2.4           xfun_0.48                  
+## [103] fs_1.6.4                    MatrixGenerics_1.14.0      
+## [105] pkgconfig_2.0.3
+
+
+
+ + + +
+ + + +
+ +
+

+

Site built with pkgdown 2.0.9.

+
+ +
+
+ + + + + + + + diff --git a/docs/articles/docker.html b/docs/articles/docker.html new file mode 100644 index 00000000..b4ded813 --- /dev/null +++ b/docs/articles/docker.html @@ -0,0 +1,278 @@ + + + + + + + +Docker/Singularity Containers • MungeSumstats + + + + + + + + + + + + +
+
+ + + + +
+
+ + + + +
+

Installation +

+

MungeSumstats is now available via ghcr.io +as a containerised environment with Rstudio and all necessary +dependencies pre-installed.

+
+

Method 1: via Docker +

+

First, install +Docker if you have not already.

+

Create an image of the Docker +container in command line:

+
docker pull ghcr.io/neurogenomics/MungeSumstats
+

Once the image has been created, you can launch it with:

+
docker run \
+  -d \
+  -e ROOT=true \
+  -e PASSWORD="<your_password>" \
+  -v ~/Desktop:/Desktop \
+  -v /Volumes:/Volumes \
+  -p 8900:8787 \
+  ghcr.io/neurogenomics/MungeSumstats
+
+

NOTES +

+
    +
  • Make sure to replace <your_password> above with +whatever you want your password to be.
    +
  • +
  • Change the paths supplied to the -v flags for your +particular use case.
  • +
  • The -d ensures the container will run in “detached” +mode, which means it will persist even after you’ve closed your command +line session.
    +
  • +
  • The username will be “rstudio” by default.
    +
  • +
  • Optionally, you can also install the Docker +Desktop to easily manage your containers.
  • +
+
+
+
+

Method 2: via Singularity +

+

If you are using a system that does not allow Docker (as is the case +for many institutional computing clusters), you can instead install +Docker images via Singularity.

+
singularity pull docker://ghcr.io/neurogenomics/MungeSumstats
+

For troubleshooting, see the Singularity +documentation.

+
+
+
+

Usage +

+

Finally, launch the containerised Rstudio by entering the following +URL in any web browser: http://localhost:8900/

+

Login using the credentials set during the Installation steps.

+
+
+

Session Info +

+
+utils::sessionInfo()
+
## R version 4.3.0 (2023-04-21)
+## Platform: x86_64-apple-darwin20 (64-bit)
+## Running under: macOS 15.1.1
+## 
+## Matrix products: default
+## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRblas.0.dylib 
+## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
+## 
+## locale:
+## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
+## 
+## time zone: Europe/London
+## tzcode source: internal
+## 
+## attached base packages:
+## [1] stats     graphics  grDevices utils     datasets  methods   base     
+## 
+## other attached packages:
+## [1] MungeSumstats_1.15.4 BiocStyle_2.30.0    
+## 
+## loaded via a namespace (and not attached):
+##   [1] tidyselect_1.2.1            dplyr_1.1.4                
+##   [3] blob_1.2.4                  filelock_1.0.3             
+##   [5] R.utils_2.12.3              Biostrings_2.70.3          
+##   [7] bitops_1.0-9                fastmap_1.2.0              
+##   [9] RCurl_1.98-1.16             BiocFileCache_2.10.2       
+##  [11] VariantAnnotation_1.48.1    GenomicAlignments_1.38.2   
+##  [13] XML_3.99-0.17               digest_0.6.37              
+##  [15] lifecycle_1.0.4             KEGGREST_1.42.0            
+##  [17] RSQLite_2.3.7               magrittr_2.0.3             
+##  [19] compiler_4.3.0              rlang_1.1.4                
+##  [21] sass_0.4.9                  progress_1.2.3             
+##  [23] tools_4.3.0                 utf8_1.2.4                 
+##  [25] yaml_2.3.10                 data.table_1.16.0          
+##  [27] rtracklayer_1.62.0          knitr_1.48                 
+##  [29] prettyunits_1.2.0           S4Arrays_1.2.1             
+##  [31] htmlwidgets_1.6.4           curl_5.2.3                 
+##  [33] bit_4.5.0                   DelayedArray_0.28.0        
+##  [35] ieugwasr_1.0.1              xml2_1.3.6                 
+##  [37] abind_1.4-8                 BiocParallel_1.36.0        
+##  [39] purrr_1.0.2                 BiocGenerics_0.48.1        
+##  [41] desc_1.4.3                  R.oo_1.26.0                
+##  [43] grid_4.3.0                  stats4_4.3.0               
+##  [45] fansi_1.0.6                 biomaRt_2.58.2             
+##  [47] SummarizedExperiment_1.32.0 cli_3.6.3                  
+##  [49] rmarkdown_2.28              crayon_1.5.3               
+##  [51] generics_0.1.3              ragg_1.3.1                 
+##  [53] rstudioapi_0.16.0           httr_1.4.7                 
+##  [55] rjson_0.2.23                DBI_1.2.3                  
+##  [57] cachem_1.1.0                stringr_1.5.1              
+##  [59] zlibbioc_1.48.2             parallel_4.3.0             
+##  [61] AnnotationDbi_1.64.1        BiocManager_1.30.25        
+##  [63] XVector_0.42.0              restfulr_0.0.15            
+##  [65] matrixStats_1.4.1           vctrs_0.6.5                
+##  [67] Matrix_1.6-5                jsonlite_1.8.9             
+##  [69] bookdown_0.39               IRanges_2.36.0             
+##  [71] hms_1.1.3                   S4Vectors_0.40.2           
+##  [73] bit64_4.5.2                 systemfonts_1.0.6          
+##  [75] GenomicFeatures_1.54.4      jquerylib_0.1.4            
+##  [77] glue_1.8.0                  pkgdown_2.0.9              
+##  [79] codetools_0.2-20            stringi_1.8.4              
+##  [81] GenomeInfoDb_1.38.8         BiocIO_1.12.0              
+##  [83] GenomicRanges_1.54.1        tibble_3.2.1               
+##  [85] pillar_1.9.0                rappdirs_0.3.3             
+##  [87] htmltools_0.5.8.1           GenomeInfoDbData_1.2.11    
+##  [89] BSgenome_1.70.2             dbplyr_2.5.0               
+##  [91] R6_2.5.1                    textshaping_0.3.7          
+##  [93] evaluate_1.0.0              lattice_0.22-6             
+##  [95] Biobase_2.62.0              R.methodsS3_1.8.2          
+##  [97] png_0.1-8                   Rsamtools_2.18.0           
+##  [99] memoise_2.0.1               bslib_0.8.0                
+## [101] SparseArray_1.2.4           xfun_0.48                  
+## [103] fs_1.6.4                    MatrixGenerics_1.14.0      
+## [105] pkgconfig_2.0.3
+


+
+
+ + + +
+ + + +
+ +
+

+

Site built with pkgdown 2.0.9.

+
+ +
+
+ + + + + + + + diff --git a/docs/articles/index.html b/docs/articles/index.html new file mode 100644 index 00000000..aaadf5e5 --- /dev/null +++ b/docs/articles/index.html @@ -0,0 +1,96 @@ + +Articles • MungeSumstats + + +
+
+ + + +
+ + +
+ +
+

Site built with pkgdown 2.0.9.

+
+ +
+ + + + + + + + diff --git a/docs/authors.html b/docs/authors.html new file mode 100644 index 00000000..e76c2382 --- /dev/null +++ b/docs/authors.html @@ -0,0 +1,122 @@ + +Authors and Citation • MungeSumstats + + +
+
+ + + +
+
+
+ + + +
  • +

    Alan Murphy. Author, maintainer. +

    +
  • +
  • +

    Brian Schilder. Author, contributor. +

    +
  • +
  • +

    Nathan Skene. Author. +

    +
  • +
+
+
+

Citation

+ Source: DESCRIPTION +
+
+ + +

Murphy A, Schilder B, Skene N (2024). +MungeSumstats: Standardise summary statistics from GWAS. +R package version 1.15.4, https://github.com/neurogenomics/MungeSumstats. +

+
@Manual{,
+  title = {MungeSumstats: Standardise summary statistics from GWAS},
+  author = {Alan Murphy and Brian Schilder and Nathan Skene},
+  year = {2024},
+  note = {R package version 1.15.4},
+  url = {https://github.com/neurogenomics/MungeSumstats},
+}
+ +
+ +
+ + + +
+ +
+

Site built with pkgdown 2.0.9.

+
+ +
+ + + + + + + + diff --git a/docs/bootstrap-toc.css b/docs/bootstrap-toc.css new file mode 100644 index 00000000..5a859415 --- /dev/null +++ b/docs/bootstrap-toc.css @@ -0,0 +1,60 @@ +/*! + * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) + * Copyright 2015 Aidan Feldman + * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ + +/* modified from https://github.com/twbs/bootstrap/blob/94b4076dd2efba9af71f0b18d4ee4b163aa9e0dd/docs/assets/css/src/docs.css#L548-L601 */ + +/* All levels of nav */ +nav[data-toggle='toc'] .nav > li > a { + display: block; + padding: 4px 20px; + font-size: 13px; + font-weight: 500; + color: #767676; +} +nav[data-toggle='toc'] .nav > li > a:hover, +nav[data-toggle='toc'] .nav > li > a:focus { + padding-left: 19px; + color: #563d7c; + text-decoration: none; + background-color: transparent; + border-left: 1px solid #563d7c; +} +nav[data-toggle='toc'] .nav > .active > a, +nav[data-toggle='toc'] .nav > .active:hover > a, +nav[data-toggle='toc'] .nav > .active:focus > a { + padding-left: 18px; + font-weight: bold; + color: #563d7c; + background-color: transparent; + border-left: 2px solid #563d7c; +} + +/* Nav: second level (shown on .active) */ +nav[data-toggle='toc'] .nav .nav { + display: none; /* Hide by default, but at >768px, show it */ + padding-bottom: 10px; +} +nav[data-toggle='toc'] .nav .nav > li > a { + padding-top: 1px; + padding-bottom: 1px; + padding-left: 30px; + font-size: 12px; + font-weight: normal; +} +nav[data-toggle='toc'] .nav .nav > li > a:hover, +nav[data-toggle='toc'] .nav .nav > li > a:focus { + padding-left: 29px; +} +nav[data-toggle='toc'] .nav .nav > .active > a, +nav[data-toggle='toc'] .nav .nav > .active:hover > a, +nav[data-toggle='toc'] .nav .nav > .active:focus > a { + padding-left: 28px; + font-weight: 500; +} + +/* from https://github.com/twbs/bootstrap/blob/e38f066d8c203c3e032da0ff23cd2d6098ee2dd6/docs/assets/css/src/docs.css#L631-L634 */ 
+nav[data-toggle='toc'] .nav > .active > ul { + display: block; +} diff --git a/docs/bootstrap-toc.js b/docs/bootstrap-toc.js new file mode 100644 index 00000000..1cdd573b --- /dev/null +++ b/docs/bootstrap-toc.js @@ -0,0 +1,159 @@ +/*! + * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) + * Copyright 2015 Aidan Feldman + * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ +(function() { + 'use strict'; + + window.Toc = { + helpers: { + // return all matching elements in the set, or their descendants + findOrFilter: function($el, selector) { + // http://danielnouri.org/notes/2011/03/14/a-jquery-find-that-also-finds-the-root-element/ + // http://stackoverflow.com/a/12731439/358804 + var $descendants = $el.find(selector); + return $el.filter(selector).add($descendants).filter(':not([data-toc-skip])'); + }, + + generateUniqueIdBase: function(el) { + var text = $(el).text(); + var anchor = text.trim().toLowerCase().replace(/[^A-Za-z0-9]+/g, '-'); + return anchor || el.tagName.toLowerCase(); + }, + + generateUniqueId: function(el) { + var anchorBase = this.generateUniqueIdBase(el); + for (var i = 0; ; i++) { + var anchor = anchorBase; + if (i > 0) { + // add suffix + anchor += '-' + i; + } + // check if ID already exists + if (!document.getElementById(anchor)) { + return anchor; + } + } + }, + + generateAnchor: function(el) { + if (el.id) { + return el.id; + } else { + var anchor = this.generateUniqueId(el); + el.id = anchor; + return anchor; + } + }, + + createNavList: function() { + return $(''); + }, + + createChildNavList: function($parent) { + var $childList = this.createNavList(); + $parent.append($childList); + return $childList; + }, + + generateNavEl: function(anchor, text) { + var $a = $(''); + $a.attr('href', '#' + anchor); + $a.text(text); + var $li = $('
  • '); + $li.append($a); + return $li; + }, + + generateNavItem: function(headingEl) { + var anchor = this.generateAnchor(headingEl); + var $heading = $(headingEl); + var text = $heading.data('toc-text') || $heading.text(); + return this.generateNavEl(anchor, text); + }, + + // Find the first heading level (`

    `, then `

    `, etc.) that has more than one element. Defaults to 1 (for `

    `). + getTopLevel: function($scope) { + for (var i = 1; i <= 6; i++) { + var $headings = this.findOrFilter($scope, 'h' + i); + if ($headings.length > 1) { + return i; + } + } + + return 1; + }, + + // returns the elements for the top level, and the next below it + getHeadings: function($scope, topLevel) { + var topSelector = 'h' + topLevel; + + var secondaryLevel = topLevel + 1; + var secondarySelector = 'h' + secondaryLevel; + + return this.findOrFilter($scope, topSelector + ',' + secondarySelector); + }, + + getNavLevel: function(el) { + return parseInt(el.tagName.charAt(1), 10); + }, + + populateNav: function($topContext, topLevel, $headings) { + var $context = $topContext; + var $prevNav; + + var helpers = this; + $headings.each(function(i, el) { + var $newNav = helpers.generateNavItem(el); + var navLevel = helpers.getNavLevel(el); + + // determine the proper $context + if (navLevel === topLevel) { + // use top level + $context = $topContext; + } else if ($prevNav && $context === $topContext) { + // create a new level of the tree and switch to it + $context = helpers.createChildNavList($prevNav); + } // else use the current $context + + $context.append($newNav); + + $prevNav = $newNav; + }); + }, + + parseOps: function(arg) { + var opts; + if (arg.jquery) { + opts = { + $nav: arg + }; + } else { + opts = arg; + } + opts.$scope = opts.$scope || $(document.body); + return opts; + } + }, + + // accepts a jQuery object, or an options object + init: function(opts) { + opts = this.helpers.parseOps(opts); + + // ensure that the data attribute is in place for styling + opts.$nav.attr('data-toggle', 'toc'); + + var $topContext = this.helpers.createChildNavList(opts.$nav); + var topLevel = this.helpers.getTopLevel(opts.$scope); + var $headings = this.helpers.getHeadings(opts.$scope, topLevel); + this.helpers.populateNav($topContext, topLevel, $headings); + } + }; + + $(function() { + $('nav[data-toggle="toc"]').each(function(i, el) { + var $nav = $(el); + 
Toc.init($nav); + }); + }); +})(); diff --git a/docs/docsearch.css b/docs/docsearch.css new file mode 100644 index 00000000..e5f1fe1d --- /dev/null +++ b/docs/docsearch.css @@ -0,0 +1,148 @@ +/* Docsearch -------------------------------------------------------------- */ +/* + Source: https://github.com/algolia/docsearch/ + License: MIT +*/ + +.algolia-autocomplete { + display: block; + -webkit-box-flex: 1; + -ms-flex: 1; + flex: 1 +} + +.algolia-autocomplete .ds-dropdown-menu { + width: 100%; + min-width: none; + max-width: none; + padding: .75rem 0; + background-color: #fff; + background-clip: padding-box; + border: 1px solid rgba(0, 0, 0, .1); + box-shadow: 0 .5rem 1rem rgba(0, 0, 0, .175); +} + +@media (min-width:768px) { + .algolia-autocomplete .ds-dropdown-menu { + width: 175% + } +} + +.algolia-autocomplete .ds-dropdown-menu::before { + display: none +} + +.algolia-autocomplete .ds-dropdown-menu [class^=ds-dataset-] { + padding: 0; + background-color: rgb(255,255,255); + border: 0; + max-height: 80vh; +} + +.algolia-autocomplete .ds-dropdown-menu .ds-suggestions { + margin-top: 0 +} + +.algolia-autocomplete .algolia-docsearch-suggestion { + padding: 0; + overflow: visible +} + +.algolia-autocomplete .algolia-docsearch-suggestion--category-header { + padding: .125rem 1rem; + margin-top: 0; + font-size: 1.3em; + font-weight: 500; + color: #00008B; + border-bottom: 0 +} + +.algolia-autocomplete .algolia-docsearch-suggestion--wrapper { + float: none; + padding-top: 0 +} + +.algolia-autocomplete .algolia-docsearch-suggestion--subcategory-column { + float: none; + width: auto; + padding: 0; + text-align: left +} + +.algolia-autocomplete .algolia-docsearch-suggestion--content { + float: none; + width: auto; + padding: 0 +} + +.algolia-autocomplete .algolia-docsearch-suggestion--content::before { + display: none +} + +.algolia-autocomplete .ds-suggestion:not(:first-child) .algolia-docsearch-suggestion--category-header { + padding-top: .75rem; + margin-top: .75rem; + 
border-top: 1px solid rgba(0, 0, 0, .1) +} + +.algolia-autocomplete .ds-suggestion .algolia-docsearch-suggestion--subcategory-column { + display: block; + padding: .1rem 1rem; + margin-bottom: 0.1; + font-size: 1.0em; + font-weight: 400 + /* display: none */ +} + +.algolia-autocomplete .algolia-docsearch-suggestion--title { + display: block; + padding: .25rem 1rem; + margin-bottom: 0; + font-size: 0.9em; + font-weight: 400 +} + +.algolia-autocomplete .algolia-docsearch-suggestion--text { + padding: 0 1rem .5rem; + margin-top: -.25rem; + font-size: 0.8em; + font-weight: 400; + line-height: 1.25 +} + +.algolia-autocomplete .algolia-docsearch-footer { + width: 110px; + height: 20px; + z-index: 3; + margin-top: 10.66667px; + float: right; + font-size: 0; + line-height: 0; +} + +.algolia-autocomplete .algolia-docsearch-footer--logo { + background-image: url("data:image/svg+xml;utf8,"); + background-repeat: no-repeat; + background-position: 50%; + background-size: 100%; + overflow: hidden; + text-indent: -9000px; + width: 100%; + height: 100%; + display: block; + transform: translate(-8px); +} + +.algolia-autocomplete .algolia-docsearch-suggestion--highlight { + color: #FF8C00; + background: rgba(232, 189, 54, 0.1) +} + + +.algolia-autocomplete .algolia-docsearch-suggestion--text .algolia-docsearch-suggestion--highlight { + box-shadow: inset 0 -2px 0 0 rgba(105, 105, 105, .5) +} + +.algolia-autocomplete .ds-suggestion.ds-cursor .algolia-docsearch-suggestion--content { + background-color: rgba(192, 192, 192, .15) +} diff --git a/docs/docsearch.js b/docs/docsearch.js new file mode 100644 index 00000000..b35504cd --- /dev/null +++ b/docs/docsearch.js @@ -0,0 +1,85 @@ +$(function() { + + // register a handler to move the focus to the search bar + // upon pressing shift + "/" (i.e. 
"?") + $(document).on('keydown', function(e) { + if (e.shiftKey && e.keyCode == 191) { + e.preventDefault(); + $("#search-input").focus(); + } + }); + + $(document).ready(function() { + // do keyword highlighting + /* modified from https://jsfiddle.net/julmot/bL6bb5oo/ */ + var mark = function() { + + var referrer = document.URL ; + var paramKey = "q" ; + + if (referrer.indexOf("?") !== -1) { + var qs = referrer.substr(referrer.indexOf('?') + 1); + var qs_noanchor = qs.split('#')[0]; + var qsa = qs_noanchor.split('&'); + var keyword = ""; + + for (var i = 0; i < qsa.length; i++) { + var currentParam = qsa[i].split('='); + + if (currentParam.length !== 2) { + continue; + } + + if (currentParam[0] == paramKey) { + keyword = decodeURIComponent(currentParam[1].replace(/\+/g, "%20")); + } + } + + if (keyword !== "") { + $(".contents").unmark({ + done: function() { + $(".contents").mark(keyword); + } + }); + } + } + }; + + mark(); + }); +}); + +/* Search term highlighting ------------------------------*/ + +function matchedWords(hit) { + var words = []; + + var hierarchy = hit._highlightResult.hierarchy; + // loop to fetch from lvl0, lvl1, etc. + for (var idx in hierarchy) { + words = words.concat(hierarchy[idx].matchedWords); + } + + var content = hit._highlightResult.content; + if (content) { + words = words.concat(content.matchedWords); + } + + // return unique words + var words_uniq = [...new Set(words)]; + return words_uniq; +} + +function updateHitURL(hit) { + + var words = matchedWords(hit); + var url = ""; + + if (hit.anchor) { + url = hit.url_without_anchor + '?q=' + escape(words.join(" ")) + '#' + hit.anchor; + } else { + url = hit.url + '?q=' + escape(words.join(" ")); + } + + return url; +} diff --git a/docs/index.html b/docs/index.html new file mode 100644 index 00000000..2ad649cf --- /dev/null +++ b/docs/index.html @@ -0,0 +1,249 @@ + + + + + + + +Standardise summary statistics from GWAS • MungeSumstats + + + + + + + + + + + + +
    +
    + + + + +
    +
    + +
    + +
    +Authors: Alan Murphy, Brian Schilder and Nathan Skene + +
    +
    +Updated: Oct-30-2024 + +
    + + + +
    +
    +

    Introduction +

    +

    The MungeSumstats package is designed to facilitate the standardisation of GWAS summary statistics.

    +
    +

    Overview +

    +

    The package is designed to handle the lack of standardisation of output files by the GWAS community. The MRC IEU Open GWAS team have provided full summary statistics for >10k GWAS, which are API-accessible via the ieugwasr and gwasvcf packages. But these GWAS are only standardised in the sense that they are VCF format, and can be fully standardised with MungeSumstats.

    +

    MungeSumstats provides a framework to standardise the format for any GWAS summary statistics, including those in VCF format, enabling downstream integration and analysis. It addresses the most common discrepancies across summary statistic files, and offers a range of adjustable Quality Control (QC) steps.

    +
    +
    +

    Citation +

    +

    If you use MungeSumstats, please cite the original authors of the GWAS as well as:

    +
    +

    Alan E Murphy, Brian M Schilder, Nathan G Skene (2021) MungeSumstats: A Bioconductor package for the standardisation and quality control of many GWAS summary statistics. Bioinformatics, btab665, https://doi.org/10.1093/bioinformatics/btab665

    +
    +
    +
    +
    +

    Installing MungeSumstats + +

    +

    MungeSumstats is available on Bioconductor. To install MungeSumstats on Bioconductor run:

    +
    +if (!require("BiocManager")) install.packages("BiocManager")
    +
    +BiocManager::install("MungeSumstats")
    +

    You can then load the package and data package:

    + +

    Note that there is also a docker image for MungeSumstats.

    +

    Note that for a number of the checks employed by MungeSumstats a reference genome is used. If your GWAS summary statistics file of interest relates to GRCh38, you will need to install SNPlocs.Hsapiens.dbSNP155.GRCh38 and BSgenome.Hsapiens.NCBI.GRCh38 from Bioconductor as follows:

    +
    +BiocManager::install("SNPlocs.Hsapiens.dbSNP155.GRCh38")
    +BiocManager::install("BSgenome.Hsapiens.NCBI.GRCh38")
    +

    If your GWAS summary statistics file of interest relates to GRCh37, you will need to install SNPlocs.Hsapiens.dbSNP155.GRCh37 and BSgenome.Hsapiens.1000genomes.hs37d5 from Bioconductor as follows:

    +
    +BiocManager::install("SNPlocs.Hsapiens.dbSNP155.GRCh37")
    +BiocManager::install("BSgenome.Hsapiens.1000genomes.hs37d5")
    +

    These may take some time to install and are not included in the package as some users may only need one of GRCh37/GRCh38. If you are unsure of the genome build, MungeSumstats can also infer this information from your data.

    +
    +
    +

    Getting started +

    +

    See the Getting started vignette website for up-to-date instructions on usage.

    +

    See the OpenGWAS vignette website for information on how to use MungeSumstats to access, standardise and perform quality control on GWAS Summary Statistics from the MRC IEU Open GWAS Project.

    +

    Please read carefully through the FAQ website for any queries about running MungeSumstats. If you have any problems outside of this, please do file an Issue here on GitHub.

    +
    +
    +

    Future Enhancements +

    +

    The MungeSumstats package aims to be able to handle the most common summary statistic file formats including VCF. If your file can not be formatted by MungeSumstats feel free to report the Issue on GitHub along with your summary statistics file header.

    +

    We also encourage people to edit the code to resolve their particular issues too and are happy to incorporate these through pull requests on github. If your summary statistic file headers are not recognised by MungeSumstats but correspond to one of

    +
    SNP, BP, CHR, A1, A2, P, Z, OR, BETA, LOG_ODDS, SIGNED_SUMSTAT, N, N_CAS, N_CON, 
    +NSTUDY, INFO or FRQ, 
    +

    Feel free to update the data("sumstatsColHeaders") following the approach in the data.R file and add your mapping. Then use a Pull Request on GitHub and we will incorporate this change into the package.

    +
    +
    +

    Contributors +

    +

    We would like to acknowledge all those who have contributed to MungeSumstats development:

    + +
    + +
    + + +
    + + +
    + +
    +

    +

    Site built with pkgdown 2.0.9.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/link.svg b/docs/link.svg new file mode 100644 index 00000000..88ad8276 --- /dev/null +++ b/docs/link.svg @@ -0,0 +1,12 @@ + + + + + + diff --git a/docs/news/index.html b/docs/news/index.html new file mode 100644 index 00000000..b8666df0 --- /dev/null +++ b/docs/news/index.html @@ -0,0 +1,1098 @@ + +Changelog • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    + +
    +

    Bug fix

    +

    *Updated retrieval of IEU OpenGWAS to new approach requiring login. Also updated to use the IEU OpenGWAS R package as a dependency.

    +
    +
    +
    + +
    +

    New features

    +

    *FAQ Website updated.

    +
    +
    +
    + +
    +

    New features

    +

    *FAQ Website added.

    +
    +
    +
    + +
    +

    Bug fix

    +
    • +infer_eff_direction now includes A0 as an ambiguous case as well as A1/A2.
    • +
    +
    +

    New features

    +
    • +eff_on_minor_alleles parameter added (off by default) - controls whether MungeSumstats should assume that the effects are predominantly measured on the minor alleles. Default is FALSE as this is an assumption that won’t be appropriate in all cases. However, the benefit is that if we know the majority of SNPs have their effects based on the minor alleles, we can catch cases where the allele columns have been mislabelled.
    • +
    +
    +
    + +
    +

    New features

    +
    • Mappings added to mapping file for risk and non risk allele.
    • +
    +
    +
    + +
    +

    Bug fix

    +
    • Bug fix for check 3 in infer effect column - previously A1 & A2 were swapped when there were more matches for the ref genome in A1 rather than A2 which was incorrect. Corrected now so it will only be flipped when A2 has more matches to the reference genome.
    • +
    +
    +
    + +
    +

    New features

    +
    • Handling of -log10 p-values (outside of VCFs) added.
    • +
    +
    +
    + +
    +

    New features

    +
    • Mapping for OA (other Allele) added to A1.
    • +
    +
    +
    + +
    +

    New features

    +
    +
    +
    + +
    +

    New features

    +
    • Can now control what columns are checked for missing data (drop_na_cols in format_sumstats()). By default, SNP, effect columns and P/N columns are checked. Set to Null to check all columns or choose specific columns.
    • +
    +
    +
    + +
    +

    Bug fix

    +
    • Force no tab indexing when writing removed rows of SNPs. This avoids any issues where missing data causes sort errors.
    • +
    • Issue fixed when sorting CHR column based on a format when CHR column is a factor.
    • +
    +
    +
    + +
    +

    Bug fix

    +
    • Catch for overflow when NA’s in SNP col for check_no_rs_snp() check with imputation_ind=TRUE.
    • +
    +
    +
    + +
    +

    Bug fix

    +
    • Minor fix to get_genome_builds() to help with RAM & CPU usage during unit tests. No change in functionality for end user.
    • +
    +
    +
    + +
    +

    Bug fix

    +
    • For LDSC format, rename A1 and A2 as LDSC expects A1 to be the effect column rather than A2 (the opposite to MSS’s default) - see more here. Although, this didn’t seem to make any difference to results in tests, see more here.
    • +
    +
    +
    + +
    +

    Bug fix

    +
    • Remove unused argument make_ordered from sort_coords() +
    • +
    • Issue fixed with check ldsc format when compute_n type chosen
    • +
    +
    +
    + +
    +

    Bug fix

    +
    • Speed up unit test timing for bioc checks (predominately for linux tests)
    • +
    +
    +
    + +
    +

    New features

    +
    • +infer_eff_direction parameter added so user can decide whether to run the check
    • +
    +
    +

    Bug fix

    +
    • Typo in unit test for infer effect direction.
    • +
    • IEU GWAS unit tests updated to account for server outages.
    • +
    +
    +
    + +
    +

    Bug fix

    +
    • Fixed column header mappings +
      • Made all uncorrected header names uppercase and removed duplicates
      • +
      • “TOTALSAMPLESIZE” now maps to “N” instead of “NSTUDY”
      • +
      • “MAJORALLELE”, “MAJOR_ALLELE”, “MAJOR-ALLELE”, and “MAJOR ALLELE” now map to “A1” instead of “A2”
      • +
      • Removed the mappings for “OR-A1”, “OR.A1”, “OR_A1”, and “BETA1” because MSS assumes that A2 is the effect allele
      • +
      • Removed mappings for “A1FREQ”, “A1FRQ”, “AF1”, “FREQ.A1.1000G.EUR”, “FREQ.A1.ESP.EUR”, “FREQ.ALLELE1.HAPMAPCEU”, “FREQ1”, “FREQ1.HAPMAP”, and “FRQ_A1” because MSS defines “FRQ” to be the allele frequency of A2
      • +
      • Removed mappings for “CHR36”, “BASE_GRCH36”, “POSITION36”, “POSGRCH36”, “BASEGRCH36”, “POS36”, “POS GRCH36”, “POS.GRCH36”, “POS-GRCH36”, and “POS_GRCH36” because MSS does not support the GRCh36 genome build
      • +
      • Removed the ambiguous mapping “NMISS” -> “N” because “NMISS” can refer to the number of samples with missing data
      • +
      • Removed the ambiguous mapping “WEIGHT” -> “N” because “WEIGHT” can refer to coefficient weights
      • +
    • +
    • Fixed inference of allele where ambiguous (A1, A2) naming used (see infer_effect_column.R for code) but in short: +
      • Three checks now made to infer which allele the effect/frequency information relates to. See infer_effect_column.R for further details.
      • +
      • See get_eff_frq_allele_combns.R for how effect/frequency columns that infer the allele are captured in the mapping file
      • +
    • +
    +
    +

    New features

    +
    • New column header mappings: +
      • “VARIANT_ID” and “RSIDS” –> “SNP”
      • +
      • “P_BOLT_LMM” –> “P”
      • +
      • “NCASES” –> “N_CAS”
      • +
      • “N_EFFECTIVE”, “N_INFORMATIVE”, and “TOTAL_N” –> “N”
      • +
      • “HET_P” –> “HETPVAL”
      • +
      • “HET_ISQ” –> “HETISQT”
      • +
      • “ALL_AF” –> “FRQ”
      • +
      • “DIRECT” –> “DIRECTION”
      • +
      • “ALT_EFFSIZE” –> “BETA”
      • +
      • “INFORMATIVE_ALT_AC” –> “AC”
      • +
    • +
    +
    +
    + +
    +

    Bug fix

    +
    • Cases checking ref genome where there are no indels would sometimes cause an error when joining. This issue has now been resolved.
    • +
    +
    +
    + +
    +

    New features

    +
    • flip_frq_as_biallelic parameter added enabling frequencies of non-bi-allelic SNPs to be flipped as if they were bi-allelic (1 - frequency) i.e. ignoring the frequencies of other alternative alleles (assuming these will be negligible). Note this will not be done as default as it is not fully correct but may be useful for some users.
    • +
    +
    +
    + +
    +

    Bug fix

    +
    • Fix for imputation column when imputing RS ID from CHR:BP. Avoids crash and ensures correct identification of imputed SNPs.
    • +
    • Avoid running compute_nsize function when no imputation is wanted by user - also avoids message output in this situation.
    • +
    +
    +
    + +
    +

    Bug fix

    +
    • Fix reporting of genome-wide significant variants before formatting.
    • +
    +
    +
    + +
    +

    Bug fix

    +
    • In check_bp_range ensure that the BP column is numeric.
    • +
    +
    +
    + +
    +

    Bug fix

    +
    • In check_no_rs_snp the order of operations had to be reversed to ensure all values were present before sorting column headers when imputation_ind=TRUE and imputing rsIDs.
    • +
    +
    +
    + +
    +

    New features

    +
    • The rmv_chrPrefix parameter in format_sumstats() has been replaced with the new chr_style parameter, which allows users to specify their desired chromosome name style. The supported chromosome styles are “NCBI”, “UCSC”, “dbSNP”, and “Ensembl” with “Ensembl” being the default.
    • +
    • +check_chr() now automatically removes all SNPs with nonstandard CHR entries (anything other than 1-22, X, Y, and MT in the Ensembl naming style).
    • +
    +
    +
    + +
    +

    Bug fix

    +
    • Better method to detect vcf files - looks for vcf in extension not in name.
    • +
    +
    +
    + +
    +

    Bug fix

    +
    • Check ref genome change - if no match is found for either genome build, an error will now be thrown.
    • +
    • A check has been added so that if chrom col has chr as a prefix, this will be removed before testing genome build.
    • +
    +
    +
    + +
    +

    Bug fix

    +
    • Bug fix when using imputation_ind with NA in chr column.
    • +
    +
    +
    + +
    +

    New features

    +
    • +ignore_multi_trait parameter added which will ignore any multi-trait p-values if set to TRUE. By default it is false to maintain the current default running conditions for MSS.
    • +
    +
    +
    + +
    +

    New features

    +
    • Check added, ensure BP is between 1 - length of chromosome using reference chromosome.
    • +
    +
    +
    + +
    +

    New features

    +
    • extra mapping for base-pair position (BP) column added
    • +
    +
    +
    + +
    +

    Bug fix

    +
    • Fix ensembl chain file retrieval so works on all environments
    • +
    +
    +
    + +
    +

    Bug fix

    +
    • +write_sumstats: +
    • +
    • +sort_coord: +
      • Renamed .R file from sort_coordinates to match current function name.
      • +
      • Add multiple sort_methods, including improved/more robust data.table-native method.
      • +
      • Added dedicated unit tests within test-index_tabular.R.
      • +
    • +
    • New helper function: check_numeric: +
      • Ensures relevant sumstats cols are numeric.
      • +
      • Added internally to: sort_coord, read_header +
      • +
    • +
    • +rworkflows.yml: +
      • Omit Windows runner.
      • +
      • Turn on run_biocheck +
      • +
    • +
    • +to_GRanges.R / to_VRanges.R: +
      • Rename files to match current function names.
      • +
    • +
    • Remove extra extdata files (I think these were created by accident): +
      • ALSvcf.vcf.bgz
      • +
      • ALSvcf.vcf.bgz.bgz
      • +
      • ALSvcf.vcf.bgz.bgz.tbi
      • +
      • ALSvcf.vcf.bgz.tbi
      • +
      • ALSvcf.vcf.gz
      • +
    • +
    • Remove .DS_Store files throughout.
    • +
    • Don’t check for duplicates based on RS ID with Indels, remove these first.
    • +
    +
    +

    New features

    +
    • Implement rworkflows. +
      • Removed old Dockerfile (not needed anymore) and workflow yaml.
      • +
    • +
    • Add drop_indels parameter so a user can decide to remove indels from sumstats.
    • +
    +
    +
    + +
    +

    Bug fix

    +
    • For downloading files use sed -E rather than sed -r as its compatible with mac which has issues with sed -r +
    • +
    +
    +

    New features

    +
    • For instances where a single column contains CHR, BP, A1 and A2. The default order has been updated to CHR:BP:A1:A2 to align with
      SPDI format. If your format differs and MSS doesn’t pick up on it, update the column name to the true format e.g. CHR:BP:A2:A1
    • +
    +
    +
    + +
    +

    New features

    +
    • Update to where SNP column is given by the four CHR, BP, A1, A2. Now, if A1 or A2 is also a separate column, these will be used to infer the order.
    • +
    +
    +
    + +
    +

    Bug fix

    +
    • further fix for Latex issues when rendering PDF of examples.
    • +
    +
    +
    + +
    +

    Bug fix

    +
    • fix for Latex issues when rendering PDF of examples.
    • +
    +
    +
    + +
    +

    Bug fix

    +
    • fix for offline runs and accessing chain files from 1.7.2.
    • +
    +
    +
    + +
    +

    New features

    +
    • New chain files used for lifting over the genome build from Ensembl have now been added. These will now be set as the default chain file instead of UCSC due to licensing issues. The choice to use UCSC files will still be there but the files will not be stored in the package themselves, they will instead be downloaded for use on the fly.
    • +
    +
    +
    + +
    +

    New features

    +
    • The use of the log_folder parameter in format_sumstats() has been updated. It is still used to point to the directory for the log files and the log of MungeSumstats messages to be stored. And the default is still a temporary directory. However, now the name of the log files (log messages and log outputs) are the same as the name of the file specified in the save_path parameter with the extension ’_log_msg.txt’ and ’_log_output.txt’ respectively.
    • +
    +
    +
    + +
    +

    Bug fix

    +
    • GHA fix.
    • +
    +
    +
    + +
    +

    New features

    +
    • By default ES is taken as BETA; a new parameter (es_is_beta) has been added so users can specify if this isn’t the case. If set to FALSE, the mapping is removed.
    • +
    • Imputing BETA ordering has been changed so log(OR) will be used before calculating from Z, SE.
    • +
    +
    +
    + +
    +

    New features

    +
    • A new method for computing the Z-score of a sumstats (compute_z input) has been added: BETA/SE. To use it, set compute_z = 'BETA'; to continue to use the P-value calculation, use compute_z = 'P'. Note the default is still compute_z = FALSE.
    • +
    +
    +

    Bug fix

    +
    • Remove erroneous print statement.
    • +
    +
    +
    + +
    +

    Bug fixes

    +
    • Fix NA representation for tabular outputs - By default, data.table::fread() leaves NAs blank instead of including a literal NA. That’s fine for CSVs and if the output is read in by fread, but it breaks other tools for TSVs and is hard to read. Updated that and added a message when the table is switched to uncompressed for indexing.
    • +
    +
    +
    + +
    +

    New features

    +
    • +read_header: +
      • Can now read entire files by setting n=NULL.
      • +
      • Improved reading in of VCF files (can read .vcf.bgz now).
      • +
      • Now exported.
      • +
      • Added unit tests.
      • +
    • +
    • Remove seqminer from all code (too buggy).
    • +
    • Automatically remove residual .tsv files after tabix indexing.
    • +
    • +import_sumstats: +
      • Use @inheritDotParams format_sumstats for better documentation.
      • +
    • +
    • +parse_logs: Added new fields.
    • +
    • +format_sumstats: Added time report at the end (minutes taken total). Since this is a message, will be included in the logs, and is now parsed by parse_logs and put into the column “time”.
    • +
    +
    +

    Bug fixes

    +
    • +index_tabular: Fixed by replacing seqminer with Rsamtools.
    • +
    • When SNP ID’s passed with format 1:123456789, it will now be dealt with appropriately.
    • +
    • +compute_n can’t handle SNP-level N values for imputation, only population-level values. An explanatory error message has now been added.
    • +
    +
    +
    + +
    +

    Bug fixes

    +
    • Special characters causing issues with find empty columns function. Now fixed.
    • +
    +
    +
    + +
    +

    Bug fixes

    +
    • Mitochondrial (MT) SNPs’ chromosome values were being forced to NA by the sort_coords function. This has been fixed.
    • +
    +
    +
    + +
    +

    Bug fixes

    +
    • Had to pass check_dups to other checks so they also wouldn’t be run. Now independent of non-biallelic check.
    • +
    +
    +
    + +
    +

    New features

    +
    • check_dups parameter added so duplicates won’t be removed if formatting QTL datasets
    • +
    +
    +
    + +
    +

    Bug fixes

    +
    • validate_parameters checks for incorrect version of dbSNP package, corrected.
    • +
    +
    +
    + +
    +

    Bug fixes

    +
    • MSS can now impute CHR, BP at a SNP level. For cases where CHR and/or BP are NA but the RS ID is present, these will now be imputed from the reference genome. Note previously, this imputation was done when the chr and/or bp column was missing.
    • +
    • Print statement from liftover silenced when no liftover required
    • +
    • check missing data function will no longer remove cases with NA’s in SNP_INFO column. The SNP_INFO column is created by MSS for cases with RS ID and some other information in the same SNP column (like rs1234:…..). Rather than throw out this info, it is stored in a new column - SNP_INFO. However, the remove missing data function was also looking in this column to remove SNPs. This has been corrected.
    • +
    • +find_sumstats(): +
      • Fix N column in metadata.
      • +
    • +
    +
    +
    + +
    +

    New features

    +
    • save_format parameter created for format_sumstats. This will replace ldsc_format which is now deprecated. Use save_format=“LDSC” instead. Other options for save_format are generic standardised (NULL) and IEU Open GWAS VCF format (“openGWAS”).
    • +
    • dbSNP version 155 has now been added. Users can now control the version of dbSNP to be used for imputation (144 or 155). Note that with the 9x more SNPs in dbSNP 155 vs 144, run times will increase.
    • +
    +
    +

    Bug fixes

    +
    • Removed the change where sex chromosomes were made lower case, to match UCSC
    • +
    +
    +
    + +
    +

    New features

    +
    • Further mappings added
    • +
    +
    +

    Bug fixes

    +
    • Duplication of non-bi-allelic and indels fixed
    • +
    • Correct compute_nsize documentation
    • +
    +
    +
    + +
    +

    New features

    +
    • Export vcf2df. +
      • Move some post-processing function inside this function (e.g. drop duplicate cols/rows).
      • +
    • +
    • +read_vcf can now be parallelised: splits query into chunks, imports them, and (optionally) converts them to data.table before rbinding them back into one object. +
      • Added report of VCF size (variants x samples) before processing to give user an idea of how long it will take to process.
      • +
      • Added arg mt_thresh to avoid using parallelisation when VCFs are small, due to the overhead outweighing the benefits in these cases.
      • +
    • +
    • Added Linux installation instructions for axel downloader.
    • +
    • Added 2nd tryCatch to downloader with different download.file parameters that may work better on certain machines.
    • +
    • Avoid using file.path to specify URL in: +
      • get_chain_file
      • +
      • import_sumstats
      • +
    • +
    • Allow download_vcf to pass URLs directly (without downloading the files) when vcf_download=FALSE.
    • +
    • +download_vcf: +
      • Make timeout 10min instead of 30min.
      • +
      • Make axel verbose.
      • +
    • +
    • +load_ref_genome_data: +
      • Give more informative messages that let user know which steps take a long time.
      • +
      • Speed up substring preprocessing.
      • +
    • +
    • +read_vcf_genome: more robust way to get genome build from VCF.
    • +
    • +read_sumstats: Speed up by using remove_empty_cols(sampled_rows=), and only run for tabular file (read_vcf already does this internally).
    • +
    +
    +

    Bug fixes

    +
    • +select_vcf_field: Got rid of “REF col doesn’t exists” warning by omitting rowRanges.
    • +
    • Ensured several unevaluated code chunks in vignettes/MungeSumstats.Rmd were surrounded by ticks.
    • +
    • +vcf2df: Accounted for scenarios where writeVcf accidentally converts geno data into redundant 3D matrices. +
      • Use data.table::rbindlist(fill=TRUE) to bind chunks back together.
      • +
    • +
    • Remove unused functions after read_vcf upgrades: +
      • infer_vcf_sample_ids
      • +
      • is_vcf_parsed
      • +
      • check_tab_delimited
      • +
      • read_vcf_data
      • +
      • remove_nonstandard_vcf_cols
      • +
    • +
    • Remove redundant dt_to_granges by merging functionality into to_granges. +
      • Adjusted liftover to accommodate the slight change.
      • +
    • +
    • Fix is_tabix (I had incorrectly made path all lowercase).
    • +
    • Let index_vcf recognize all compressed vcf suffixes. +
      • Add extra error handling when .gz is not actually bgz-compressed.
      • +
    • +
    • Set BiocParallel registered threads back to 1 after read_vcf_parallel finishes, to avoid potential conflicts with downstream steps.
    • +
    +
    +
    + +
    +

    New features

    +
    • Added “query” column to find_sumstats output to keep track of search parameters.
    • +
    • +import_sumstats: +
      • Check if formatted file (save_path) exists before downloading to save time.
      • +
      • Pass up force_new in addition to force_new_vcf.
      • +
    • +
    • Updated Description tag in DESCRIPTION file to better reflect the scope of MungeSumstats.
    • +
    • Upgraded read_vcf to be more robust.
    • +
    • Edited Deps/Suggests +
      • Elevate IRanges to Imports.
      • +
      • Remove stringr (no longer used)
      • +
    • +
    • Add new internal function is_tabix to check whether a file is already tabix-indexed.
    • +
    • +read_sumstats: +
      • now takes samples as an arg.
      • +
      • Parallelises reading VCF using GenomicFiles.
      • +
    • +
    • +read_sumstats: now takes samples as an arg.
      +By default, only uses first sample (if multiple are present in file).
    • +
    • Remove INFO_filter= from ALS VCF examples in vignettes (no longer necessary now that INFO parsing has been corrected).
    • +
    • +download_vcf can now handle situations where vcf_url= is actually a local file (not remote).
    • +
    +
    +

    Bug fixes

    +
    • AF (allele frequency) was accidentally being assigned as INFO column in VCFs where the INFO rows started with “AF”. This caused a large number of SNPs to be incorrectly dropped during the check_info_score step.
    • +
    • If INFO score is not available, INFO column is now dropped entirely (rather than assigning all 1s). +
      • Adjusted test-vcf_formatting to reflect this. This avoids ambiguity about whether the INFO score is real or not.
      • +
    • +
    • +check_info_score: +
      • Added extra messages in various conditions where INFO is not used for filtering, and don’t add log_files$info_filter in these instances.
      • +
      • Added unit tests.
      • +
    • +
    • +check_empty_cols was accidentally dropping more columns than it should have.
    • +
    • Fix GHA pkgdown building: +
    • +
    • Fix write_sumstats when indexing VCF.
    • +
    • Ensure read_sumstats can read in any VCF files (local/remote, indexed/non-indexed).
    • +
    • Fix test-vcf_formatting.R +
      • line 51: had wrong AF value in string
      • +
      • line 109: encountering error? due to duplicate SNPs?
      • +
    • +
    • Fix test-check_impute_se_beta +
      • lines 51/52: setkey on SNP (now automatically renamed from ID by read_vcf).
      • +
    • +
    • Fix test-read_sumstats: +
      • standardising of headers is now handled internally by read_sumstats.
      • +
      • Ensure CHR is a character vector when being read in.
      • +
      • line 44: Ensure extra cols in vcf_ss are dropped.
      • +
    • +
    • +parse_logs: Add lines to parsing subfunctions to allow handling of logs that don’t contain certain info (thus avoid warnings when creating the final data.table).
    • +
    • “Avoid the use of ‘paste’ in condition signals” fixed: +
      • check_pos_se
      • +
      • check_signed_col
      • +
    • +
    • Used to rely on gunzip to read bgz files, but apparently this functionality is no longer supported (possibly due to changes to how Rsamtools::bgzip does compression in Bioc 3.15). Switched to using fread + readLines in: +
      • read_header
      • +
      • read_sumstats
      • +
    • +
    • +read_header: wasn’t reading in enough lines to get past the VCF header. Increase to readLines(n=1000).
    • +
    • +read_vcf: Would sometimes induce duplicate rows. Now only unique rows are used (after sample and columns filtering).
    • +
    • Issue with mix of chr:bp:a1:a2 and chr:bp and rs id resolved
    • +
    +
    +
    + +
    +

    Bug fixes

    +
    • +format_sumstats can now import remote files (other than OpenGWAS).
    • +
    +
    +

    New features

    +
    • New sumstatsColHeaders entries: +
      • “PosGRCh37” –> “BP”
      • +
      • “testedAllele” –> “A1”
      • +
    • +
    +
    +
    + +
    +

    New features

    +
    • Can now handle general remote sumstats not just IEU GWAS
    • +
    • More column header mappings
    • +
    +
    +
    + +
    +

    New features

    +
    • Clean up of column header mapping file, including FREQUENCY given priority over MAF and addition of new CHR mappings.
    • +
    +
    +
    + +
    +

    Bug fixes

    +
    • Handle cases for multi-trait GWAS when a P column exists separate from the trait-specific P value, so that when renaming occurs there aren’t two P columns. The inputted P column will be renamed to ‘P_input’.
    • +
    • Issue where ‘check allele flip’ wasn’t running when the sumstats had all SNP IDs missing and incorrect direction of A1/A2 and effect columns has now been fixed.
    • +
    +
    +
    + +
    +

    New features

    +
    • +liftover +
      • Now exported function.
      • +
      • Added args for more user flexibility.
      • +
      • Uses GenomeInfoDb::mapGenomeBuilds to standardise build names.
      • +
      • Warns users when mapped builds do not match one of the conversion options.
      • +
      • Choice to output as data.table or GRanges.
      • +
      • Added unit tests for exported version.
      • +
    • +
    • +standardise_sumstats_column_headers_crossplatform +
      • Exported as standardise_header while keeping the original function name as an internal function (they call the same code).
      • +
      • Added unit tests for exported version.
      • +
    • +
    • Added chunks to the Getting started vignette: +
      • liftover tutorial.
      • +
      • “Quick formatting” of headers and file formats.
      • +
    • +
    +
    +

    Bug fixes

    +
    • +check_pos_se: Remove extra message() call around string.
    • +
    • +check_signed_col: Remove extra message() call around string.
    • +
    • +write_sumstats +
      • Added extra round of sorting when tabix_index=TRUE because this is required for tabix.
      • +
    • +
    +
    +
    + +
    +

    New Features

    +
    • Additional mappings for CHR
    • +
    • Make A1, A2 upper-case
    • +
    +
    +

    Bug fixes

    +
    • Bug fix for dealing with imputing SNP ID when there are indels
    • +
    +
    +
    + +
    +

    New Features

    +
    • MungeSumstats can now handle Indels better. It will: +
      • Not impute the RS ID of a SNP for an Indel
      • +
      • Not remove the Indel based on the RS ID not being present in the SNP ref dataset.
      • +
      • Not remove the Indel if it has the same base-pair location as a SNP in the sumstats.
      • +
    • +
    • Can now handle vcfs with extensions .vcf.tsv, .vcf.tsv.gz and .vcf.tsv.bgz
    • +
    +
    +

    Bug fixes

    +
    • For non-bi-allelic SNP runs, no longer remove duplicated SNPs based on their base-pair position or their RS ID.
    • +
    +
    +
    + +
    +

    New Features

    +
    • Exported functions. Added examples and unit tests: +
      • compute_nsize
      • +
      • standardise_sumstats_column_headers_crossplatform
      • +
      • formatted_example
      • +
    • +
    • New arguments: +
      • +standardise_sumstats_column_headers_crossplatform: Added arg uppercase_unmapped to allow users to specify whether they want to make the columns that could not be mapped to a standard name uppercase (default=TRUE for backcompatibility). Added arg return_list to specify whether to return a named list (default) or just the data.table.
      • +
      • +formatted_example: Added arg formatted to specify whether the file should have its colnames standardised. Added arg sorted to specify whether the file should sort the data by coordinates. Added arg return_list to specify whether to return a named list (default) or just the data.table.
      • +
    • +
    • Removed codecode.yml and *_pkgdown.yml* files (no longer necessary).
    • +
    • Added Issues templates for Bugs and Feature requests.
    • +
    • Added .datatable.aware=TRUE to .zzz as extra precaution.
    • +
    • +vcf2df: Documented arguments.
    • +
    • Made v2 of hex sticker: inst/hex/hex.png +
    • +
    +
    +

    Bug fixes

    +
    • Regenerated the gh-pages branch after it accidentally got deleted.
    • +
    • Remove temporary docs/ folder.
    • +
    • Updated GitHub Actions.
    • +
    • Updated Dockerfile so it doesn’t run checks (this is now taken care of by the GHA workflow).
    • +
    • Added Windows-specific folders to .Rbuildignore.
    • +
    • Made to_GRanges.R and to_VRanges.R file names lowercase to be congruent with function names.
    • +
    +
    +
    + +
    +

    Bug fixes

    +
    • Bug in checking for bad characters in RSID fixed
    • +
    +
    +
    + +
    +

    New Features

    +
    • Columns Beta and Standard Error can now be imputed. However note that this imputation is an approximation so could have an effect on downstream analysis. Use with caution.
    • +
    +
    +
    + +
    +

    Bug fixes

    +
    • Flipping of Odds Ratio corrected (1/OR rather than -1*OR)
    • +
    +
    +
    + +
    +

    Bug fixes

    +
    • Issue downloading chain file resolved
    • +
    +
    +
    + +
    +

    New Features

    +
    • More mappings added to default mapping file.
    • +
    +
    +
    + +
    +

    Bug fixes

    +
    • Previously rsids with characters added (e.g. rs1234567w) would cause an error when checking for the rsid on the reference genome. This has been fixed and the correct rsid will now be imputed from the reference genome for these cases.
    • +
    +
    +
    + +
    +

    New Features

    +
    • +import_sumstats: Create individual folders for each GWAS dataset, with a respective logs subfolder to avoid overwriting log files when processing multiple GWAS.
    • +
    • +parse_logs: New function to convert logs from one or more munged GWAS into a data.table.
    • +
    • +list_sumstats: New function to recursively search for local summary stats files previously munged with MungeSumstats.
    • +
    • Added new dataset inst/extdata/MungeSumstats_log_msg.txt to test logs files.
    • +
    • Added unit tests for list_sumstats and parse_logs.
    • +
    • Added new Docker vignette.
    • +
    • Updated GHA workflows using r_workflows.
    • +
    • Remove docs/ folder as the website will now be pushed to the gh-pages branch automatically by new GHA workflow.
    • +
    • Made documentation in README more clear and concise.
    • +
    • Added checks for p-values >1 or <0 via args convert_large_p and convert_neg_p, respectively. These are both handled by the new internal function check_range_p_val, which also reports the number of SNPs found meeting these criteria to the console/logs.
    • +
    • +check_small_p_val records which SNPs were imputed in a more robust way, by recording which SNPs met the criteria before making the changes (as opposed to inferring this info from which columns are 0 after making the changes). This function now only handles non-negative p-values, so that rows with negative p-values can be recorded/reported separately in the check_range_p_val step.
    • +
    • +check_small_p_val now reports the number of SNPs <= 5e-324 to console/logs.
    • +
    • Unit tests have been added for both check_range_p_val and check_small_p_val.
    • +
    • +parse_logs can now extract information reported by check_range_p_val and check_small_p_val.
    • +
    • New internal function logs_example provides easy access to log file stored in inst/extdata, and includes documentation on how it was created.
    • +
    • Both check_range_p_val and check_small_p_val now use #' @inheritParams format_sumstats to improve consistency of documentation.
    • +
    +
    +

    Bug fixes

    +
    • Reduced vignette sizes.
    • +
    • Removed usage of suppressWarnings where possible.
    • +
    • Deleted old .Rproj file and hidden folder (contained large files).
    • +
    • Configured .Rproj so it doesn’t store large data files.
    • +
    • Fix badger issues: https://github.com/GuangchuangYu/badger/issues/34 +
    • +
    • Prevent test-index_tabix.R from running due to errors (for now).
    • +
    +
    +
    + +
    +

    New Features

    +
    • Version bump to align with Bioconductor release 3.14.
    • +
    +
    +
    + +
    +

    Bug fixes

    +
    • +validate_parameters can now handle ref_genome=NULL
    • +
    • +.tsv.gz no longer assigned suffix .tsv.
    • +
    • Made code width <80 characters.
    • +
    • Changed to_GRanges/to_VRanges functions to all-lowercase functions (for consistency with other functions).
    • +
    • Set nThread=1 in data.table test functions.
    • +
    +
    +

    New Features

    +
    • Added tests for get_genome_builds
    • +
    • Added early check for making sure the directory save_path is in was actually created (as opposed to finding out at the very end of the pipeline).
    • +
    • Tabix-indexing now available for tabular output data.
    • +
    • +read_header and read_sumstats now both work with .bgz files.
    • +
    +
    +
    + +
    +

    New Features

    +
    • Extra mappings for FRQ column, see data("sumstatsColHeaders") for details
    • +
    +
    +
    + +
    +

    New Features

    +
    • +format_sumstats(FRQ_filter) added so SNPs can now be filtered by allele frequency
    • +
    • Mapping file now has mappings for allele frequency (AF) to FRQ
    • +
    • VCF files with AF in INFO column e.g. ‘AF=…’ now converted to AF column
    • +
    • +format_sumstats(frq_is_maf) check added to infer if FRQ column values are minor/effect allele frequencies or not. frq_is_maf allows users to rename the FRQ column as MAJOR_ALLELE_FRQ if some values appear to be major allele frequencies
    • +
    +
    +
    + +
    +

    New Features

    +
    • +get_genome_builds() can now be called to quickly get the genome build without running the whole reformatting.
    • +
    • +format_sumstats(compute_n) now has more methods to compute the effective sample size with “ldsc”, “sum”, “giant” or “metal”.
    • +
    • +format_sumstats(convert_ref_genome) now implemented, which can perform liftover to GRCh38 from GRCh37 and vice-versa, enabling better cohesion between different studies’ summary statistics.
    • +
    +
    +
    + +
    +

    Bug fixes

    +
    • +check_no_rs_snp can now handle extra information after an RS ID. So if you have rs1234:A:G that will be separated into two columns.
    • +
    • +check_two_step_col and check_four_step_col, the two checks for when multiple columns are in one, have been updated so if not all SNPs have multiple columns or some have more than the expected number, this can now be handled.
    • +
    • Extra mappings for the FRQ column have been added to the mapping file
    • +
    +
    +
    + +
    +

    New Features

    +
    • +check_multi_rs_snp can now handle all punctuation with/without spaces. So if a row contains rs1234,rs5678 or rs1234, rs5678 or any other punctuation character other than , these can be handled.
    • +
    • +format_sumstats(path) can now be passed a dataframe/datatable of the summary statistics directly as well as a path to their saved location.
    • +
    • Input summary statistics with A0/A1 corresponding to ref/alt can now be handled by the mapping file as well as A1/A2 corresponding to ref/alt.
    • +
    +
    +
    + +
    +

    New Features

    +
    • +import_sumstats reads GWAS sum stats directly from Open GWAS. Now parallelised and reports how long each dataset took to import/format in total.
    • +
    • +find_sumstats searches Open GWAS for datasets.
    • +
    • +compute_z computes Z-score from P.
    • +
    • +compute_n computes N for all SNPs from user-defined sample size.
    • +
    • +format_sumstats(ldsc_format=TRUE) ensures sum stats can be fed directly into LDSC without any additional munging.
    • +
    • +read_sumstats, write_sumstats, and download_vcf functions now exported.
    • +
    • +format_sumstats(sort_coordinates=TRUE) sorts results by their genomic coordinates.
    • +
    • +format_sumstats(return_data=TRUE) returns data directly to user. Can be returned in either data.table (default), GRanges or VRanges format using format_sumstats(return_format="granges").
    • +
    • +format_sumstats(N_dropNA=TRUE) (default) drops rows where N is missing.
    • +
    • +format_sumstats(snp_ids_are_rs_ids=TRUE) (default) Should the SNP IDs inputted be inferred as RS IDs or some arbitrary ID.
    • +
    • +format_sumstats(write_vcf=TRUE) writes a tabix-indexed VCF file instead of tabular format.
    • +
    • +format_sumstats(save_path=...) lets users decide where their results are saved and what they’re named.
    • +
    • When the save_path indicates it’s in tempdir(), message warns users that these files will be deleted when R session ends.
    • +
    • Summary of data is given at the beginning and the end of format_sumstats via report_summary().
    • +
    • Readability of preview_sumstats() messages improved.
    • +
    • New checks: standard error (SE) must be >0 and BETA (and other effect columns) must not equal 0: format_sumstats(pos_se=TRUE,effect_columns_nonzero=TRUE) +
    • +
    • Log directory containing all removed SNPs is now available and can be changed to a different directory by setting: format_sumstats(log_folder_ind=TRUE,log_folder=tempdir()) +
    • +
    • All imputed data can now be identified with a column in the output using: format_sumstats(imputation_ind=TRUE) +
    • +
    • Users can now input their own mapping file to be used for the column header mapping in place of data(sumstatsColHeaders). See format_sumstats(mapping_file = mapping_file).
    • +
    +
    +

    Bug fixes

    +
    • CHR column now standardised (X and Y caps, no “chr” prefix).
    • +
    • Allele flipping done on a per-SNP basis (instead of whole-column).
    • +
    • Allele flipping now includes FRQ column as well as effect columns.
    • +
    • The effect allele is now interpreted as the A2 allele consistent with IEU GWAS VCF approach. A1 will always be the reference allele.
    • +
    • +read_vcf upgraded to account for more VCF formats.
    • +
    • +check_n_num now accounts for situations where N is a character vector and converts to numeric.
    • +
    +
    +
    + +
    +

    Bug fixes

    +
    • Preprint publication citation added.
    • +
    +
    +
    + +
    +

    New Features

    +
    • MungeSumstats released to Bioconductor.
    • +
    +
    +
    + + + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/pkgdown.css b/docs/pkgdown.css new file mode 100644 index 00000000..80ea5b83 --- /dev/null +++ b/docs/pkgdown.css @@ -0,0 +1,384 @@ +/* Sticky footer */ + +/** + * Basic idea: https://philipwalton.github.io/solved-by-flexbox/demos/sticky-footer/ + * Details: https://github.com/philipwalton/solved-by-flexbox/blob/master/assets/css/components/site.css + * + * .Site -> body > .container + * .Site-content -> body > .container .row + * .footer -> footer + * + * Key idea seems to be to ensure that .container and __all its parents__ + * have height set to 100% + * + */ + +html, body { + height: 100%; +} + +body { + position: relative; +} + +body > .container { + display: flex; + height: 100%; + flex-direction: column; +} + +body > .container .row { + flex: 1 0 auto; +} + +footer { + margin-top: 45px; + padding: 35px 0 36px; + border-top: 1px solid #e5e5e5; + color: #666; + display: flex; + flex-shrink: 0; +} +footer p { + margin-bottom: 0; +} +footer div { + flex: 1; +} +footer .pkgdown { + text-align: right; +} +footer p { + margin-bottom: 0; +} + +img.icon { + float: right; +} + +/* Ensure in-page images don't run outside their container */ +.contents img { + max-width: 100%; + height: auto; +} + +/* Fix bug in bootstrap (only seen in firefox) */ +summary { + display: list-item; +} + +/* Typographic tweaking ---------------------------------*/ + +.contents .page-header { + margin-top: calc(-60px + 1em); +} + +dd { + margin-left: 3em; +} + +/* Section anchors ---------------------------------*/ + +a.anchor { + display: none; + margin-left: 5px; + width: 20px; + height: 20px; + + background-image: url(./link.svg); + background-repeat: no-repeat; + background-size: 20px 20px; + background-position: center center; +} + +h1:hover .anchor, +h2:hover .anchor, +h3:hover .anchor, +h4:hover .anchor, +h5:hover .anchor, +h6:hover .anchor { + display: inline-block; +} + +/* Fixes for fixed navbar --------------------------*/ + +.contents h1, 
.contents h2, .contents h3, .contents h4 { + padding-top: 60px; + margin-top: -40px; +} + +/* Navbar submenu --------------------------*/ + +.dropdown-submenu { + position: relative; +} + +.dropdown-submenu>.dropdown-menu { + top: 0; + left: 100%; + margin-top: -6px; + margin-left: -1px; + border-radius: 0 6px 6px 6px; +} + +.dropdown-submenu:hover>.dropdown-menu { + display: block; +} + +.dropdown-submenu>a:after { + display: block; + content: " "; + float: right; + width: 0; + height: 0; + border-color: transparent; + border-style: solid; + border-width: 5px 0 5px 5px; + border-left-color: #cccccc; + margin-top: 5px; + margin-right: -10px; +} + +.dropdown-submenu:hover>a:after { + border-left-color: #ffffff; +} + +.dropdown-submenu.pull-left { + float: none; +} + +.dropdown-submenu.pull-left>.dropdown-menu { + left: -100%; + margin-left: 10px; + border-radius: 6px 0 6px 6px; +} + +/* Sidebar --------------------------*/ + +#pkgdown-sidebar { + margin-top: 30px; + position: -webkit-sticky; + position: sticky; + top: 70px; +} + +#pkgdown-sidebar h2 { + font-size: 1.5em; + margin-top: 1em; +} + +#pkgdown-sidebar h2:first-child { + margin-top: 0; +} + +#pkgdown-sidebar .list-unstyled li { + margin-bottom: 0.5em; +} + +/* bootstrap-toc tweaks ------------------------------------------------------*/ + +/* All levels of nav */ + +nav[data-toggle='toc'] .nav > li > a { + padding: 4px 20px 4px 6px; + font-size: 1.5rem; + font-weight: 400; + color: inherit; +} + +nav[data-toggle='toc'] .nav > li > a:hover, +nav[data-toggle='toc'] .nav > li > a:focus { + padding-left: 5px; + color: inherit; + border-left: 1px solid #878787; +} + +nav[data-toggle='toc'] .nav > .active > a, +nav[data-toggle='toc'] .nav > .active:hover > a, +nav[data-toggle='toc'] .nav > .active:focus > a { + padding-left: 5px; + font-size: 1.5rem; + font-weight: 400; + color: inherit; + border-left: 2px solid #878787; +} + +/* Nav: second level (shown on .active) */ + +nav[data-toggle='toc'] .nav .nav { + 
display: none; /* Hide by default, but at >768px, show it */ + padding-bottom: 10px; +} + +nav[data-toggle='toc'] .nav .nav > li > a { + padding-left: 16px; + font-size: 1.35rem; +} + +nav[data-toggle='toc'] .nav .nav > li > a:hover, +nav[data-toggle='toc'] .nav .nav > li > a:focus { + padding-left: 15px; +} + +nav[data-toggle='toc'] .nav .nav > .active > a, +nav[data-toggle='toc'] .nav .nav > .active:hover > a, +nav[data-toggle='toc'] .nav .nav > .active:focus > a { + padding-left: 15px; + font-weight: 500; + font-size: 1.35rem; +} + +/* orcid ------------------------------------------------------------------- */ + +.orcid { + font-size: 16px; + color: #A6CE39; + /* margins are required by official ORCID trademark and display guidelines */ + margin-left:4px; + margin-right:4px; + vertical-align: middle; +} + +/* Reference index & topics ----------------------------------------------- */ + +.ref-index th {font-weight: normal;} + +.ref-index td {vertical-align: top; min-width: 100px} +.ref-index .icon {width: 40px;} +.ref-index .alias {width: 40%;} +.ref-index-icons .alias {width: calc(40% - 40px);} +.ref-index .title {width: 60%;} + +.ref-arguments th {text-align: right; padding-right: 10px;} +.ref-arguments th, .ref-arguments td {vertical-align: top; min-width: 100px} +.ref-arguments .name {width: 20%;} +.ref-arguments .desc {width: 80%;} + +/* Nice scrolling for wide elements --------------------------------------- */ + +table { + display: block; + overflow: auto; +} + +/* Syntax highlighting ---------------------------------------------------- */ + +pre, code, pre code { + background-color: #f8f8f8; + color: #333; +} +pre, pre code { + white-space: pre-wrap; + word-break: break-all; + overflow-wrap: break-word; +} + +pre { + border: 1px solid #eee; +} + +pre .img, pre .r-plt { + margin: 5px 0; +} + +pre .img img, pre .r-plt img { + background-color: #fff; +} + +code a, pre a { + color: #375f84; +} + +a.sourceLine:hover { + text-decoration: none; +} + +.fl 
{color: #1514b5;} +.fu {color: #000000;} /* function */ +.ch,.st {color: #036a07;} /* string */ +.kw {color: #264D66;} /* keyword */ +.co {color: #888888;} /* comment */ + +.error {font-weight: bolder;} +.warning {font-weight: bolder;} + +/* Clipboard --------------------------*/ + +.hasCopyButton { + position: relative; +} + +.btn-copy-ex { + position: absolute; + right: 0; + top: 0; + visibility: hidden; +} + +.hasCopyButton:hover button.btn-copy-ex { + visibility: visible; +} + +/* headroom.js ------------------------ */ + +.headroom { + will-change: transform; + transition: transform 200ms linear; +} +.headroom--pinned { + transform: translateY(0%); +} +.headroom--unpinned { + transform: translateY(-100%); +} + +/* mark.js ----------------------------*/ + +mark { + background-color: rgba(255, 255, 51, 0.5); + border-bottom: 2px solid rgba(255, 153, 51, 0.3); + padding: 1px; +} + +/* vertical spacing after htmlwidgets */ +.html-widget { + margin-bottom: 10px; +} + +/* fontawesome ------------------------ */ + +.fab { + font-family: "Font Awesome 5 Brands" !important; +} + +/* don't display links in code chunks when printing */ +/* source: https://stackoverflow.com/a/10781533 */ +@media print { + code a:link:after, code a:visited:after { + content: ""; + } +} + +/* Section anchors --------------------------------- + Added in pandoc 2.11: https://github.com/jgm/pandoc-templates/commit/9904bf71 +*/ + +div.csl-bib-body { } +div.csl-entry { + clear: both; +} +.hanging-indent div.csl-entry { + margin-left:2em; + text-indent:-2em; +} +div.csl-left-margin { + min-width:2em; + float:left; +} +div.csl-right-inline { + margin-left:2em; + padding-left:1em; +} +div.csl-indent { + margin-left: 2em; +} diff --git a/docs/pkgdown.js b/docs/pkgdown.js new file mode 100644 index 00000000..6f0eee40 --- /dev/null +++ b/docs/pkgdown.js @@ -0,0 +1,108 @@ +/* http://gregfranko.com/blog/jquery-best-practices/ */ +(function($) { + $(function() { + + $('.navbar-fixed-top').headroom(); + + 
$('body').css('padding-top', $('.navbar').height() + 10); + $(window).resize(function(){ + $('body').css('padding-top', $('.navbar').height() + 10); + }); + + $('[data-toggle="tooltip"]').tooltip(); + + var cur_path = paths(location.pathname); + var links = $("#navbar ul li a"); + var max_length = -1; + var pos = -1; + for (var i = 0; i < links.length; i++) { + if (links[i].getAttribute("href") === "#") + continue; + // Ignore external links + if (links[i].host !== location.host) + continue; + + var nav_path = paths(links[i].pathname); + + var length = prefix_length(nav_path, cur_path); + if (length > max_length) { + max_length = length; + pos = i; + } + } + + // Add class to parent
  • , and enclosing
  • if in dropdown + if (pos >= 0) { + var menu_anchor = $(links[pos]); + menu_anchor.parent().addClass("active"); + menu_anchor.closest("li.dropdown").addClass("active"); + } + }); + + function paths(pathname) { + var pieces = pathname.split("/"); + pieces.shift(); // always starts with / + + var end = pieces[pieces.length - 1]; + if (end === "index.html" || end === "") + pieces.pop(); + return(pieces); + } + + // Returns -1 if not found + function prefix_length(needle, haystack) { + if (needle.length > haystack.length) + return(-1); + + // Special case for length-0 haystack, since for loop won't run + if (haystack.length === 0) { + return(needle.length === 0 ? 0 : -1); + } + + for (var i = 0; i < haystack.length; i++) { + if (needle[i] != haystack[i]) + return(i); + } + + return(haystack.length); + } + + /* Clipboard --------------------------*/ + + function changeTooltipMessage(element, msg) { + var tooltipOriginalTitle=element.getAttribute('data-original-title'); + element.setAttribute('data-original-title', msg); + $(element).tooltip('show'); + element.setAttribute('data-original-title', tooltipOriginalTitle); + } + + if(ClipboardJS.isSupported()) { + $(document).ready(function() { + var copyButton = ""; + + $("div.sourceCode").addClass("hasCopyButton"); + + // Insert copy buttons: + $(copyButton).prependTo(".hasCopyButton"); + + // Initialize tooltips: + $('.btn-copy-ex').tooltip({container: 'body'}); + + // Initialize clipboard: + var clipboardBtnCopies = new ClipboardJS('[data-clipboard-copy]', { + text: function(trigger) { + return trigger.parentNode.textContent.replace(/\n#>[^\n]*/g, ""); + } + }); + + clipboardBtnCopies.on('success', function(e) { + changeTooltipMessage(e.trigger, 'Copied!'); + e.clearSelection(); + }); + + clipboardBtnCopies.on('error', function() { + changeTooltipMessage(e.trigger,'Press Ctrl+C or Command+C to copy'); + }); + }); + } +})(window.jQuery || window.$) diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml new file mode 100644 
index 00000000..05915ce2 --- /dev/null +++ b/docs/pkgdown.yml @@ -0,0 +1,9 @@ +pandoc: '3.2' +pkgdown: 2.0.9 +pkgdown_sha: ~ +articles: + docker: docker.html + MungeSumstats: MungeSumstats.html + OpenGWAS: OpenGWAS.html +last_built: 2024-12-17T17:43Z + diff --git a/docs/reference/DF_to_dt.html b/docs/reference/DF_to_dt.html new file mode 100644 index 00000000..2c208491 --- /dev/null +++ b/docs/reference/DF_to_dt.html @@ -0,0 +1,118 @@ + +DataFrame to data.table — DF_to_dt • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Efficiently convert DataFrame to +data.table.

    +
    + +
    +
    DF_to_dt(DF)
    +
    + + +
    +

    Arguments

    +
    DF
    +

    DataFrame object.

    + +
    +
    +

    Value

    + + +

    VCF data in data.table format.

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/axel.html b/docs/reference/axel.html new file mode 100644 index 00000000..222fc556 --- /dev/null +++ b/docs/reference/axel.html @@ -0,0 +1,156 @@ + +axel downloader — axel • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    R wrapper for axel, which enables multi-threaded download +of a single large file.

    +
    + +
    +
    axel(
    +  input_url,
    +  output_path,
    +  background = FALSE,
    +  nThread = 1,
    +  force_overwrite = FALSE,
    +  quiet = TRUE,
    +  alternate = TRUE,
    +  check_certificates = FALSE
    +)
    +
    + +
    +

    Arguments

    +
    input_url
    +

    input_url.

    + + +
    output_path
    +

    output_path.

    + + +
    background
    +

    Run in background

    + + +
    nThread
    +

    Number of threads to parallelize over.

    + + +
    force_overwrite
    +

    Overwrite existing file.

    + + +
    quiet
    +

    Run quietly.

    + + +
    alternate
    +

    alternate,

    + + +
    check_certificates
    +

    check_certificates

    + +
    +
    +

    Value

    + + +

    Path where the file has been downloaded

    +
    +
    +

    See also

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_allele_flip.html b/docs/reference/check_allele_flip.html new file mode 100644 index 00000000..eb37c692 --- /dev/null +++ b/docs/reference/check_allele_flip.html @@ -0,0 +1,231 @@ + +Ensure A1 & A2 are correctly named, if GWAS SNP constructed as Alternative/Reference or Risk/Nonrisk alleles these SNPs will need to be converted to Reference/Alternative or Nonrisk/Risk. Here non-risk is defined as what's on the reference genome (this may not always be the case). — check_allele_flip • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Ensure A1 & A2 are correctly named, if GWAS SNP constructed as +Alternative/Reference or Risk/Nonrisk alleles these SNPs will need to be +converted to Reference/Alternative or Nonrisk/Risk. Here non-risk is defined +as what's on the reference genome (this may not always be the case).

    +
    + +
    +
    check_allele_flip(
    +  sumstats_dt,
    +  path,
    +  ref_genome,
    +  rsids,
    +  allele_flip_check,
    +  allele_flip_drop,
    +  allele_flip_z,
    +  allele_flip_frq,
    +  bi_allelic_filter,
    +  flip_frq_as_biallelic,
    +  imputation_ind,
    +  log_folder_ind,
    +  check_save_out,
    +  tabix_index,
    +  nThread,
    +  log_files,
    +  standardise_headers = FALSE,
    +  mapping_file,
    +  dbSNP
    +)
    +
    + +
    +

    Arguments

    +
    path
    +

    Filepath for the summary statistics file to be formatted. A +dataframe or datatable of the summary statistics file can also be passed +directly to MungeSumstats using the path parameter.

    + + +
    ref_genome
    +

    name of the reference genome used for the GWAS ("GRCh37" or +"GRCh38"). Argument is case-insensitive. Default is NULL which infers the +reference genome from the data.

    + + +
    allele_flip_check
    +

    Binary Should the allele columns be checked against +reference genome to infer if flipping is necessary. Default is TRUE.

    + + +
    allele_flip_drop
    +

    Binary Should the SNPs for which neither their A1 nor +A2 base pair values match a reference genome be dropped. Default is TRUE.

    + + +
    allele_flip_z
    +

    Binary should the Z-score be flipped along with effect +and FRQ columns like Beta? It is assumed to be calculated off the effect size +not the P-value and so will be flipped i.e. default TRUE.

    + + +
    allele_flip_frq
    +

    Binary should the frequency (FRQ) column be flipped +along with effect and z-score columns like Beta? Default TRUE.

    + + +
    bi_allelic_filter
    +

    Binary Should non-bi-allelic SNPs be removed. +Default is TRUE.

    + + +
    flip_frq_as_biallelic
    +

    Binary Should non-bi-allelic SNPs frequency +values be flipped as 1-p despite there being other alternative alleles? +Default is FALSE but if set to TRUE, this allows non-bi-allelic SNPs to be +kept despite needing flipping.

    + + +
    imputation_ind
    +

    Binary Should a column be added for each imputation +step to show what SNPs have imputed values for differing fields. This +includes a field denoting SNP allele flipping (flipped). On the flipped +value, this denotes whether the alleles were switched based on +MungeSumstats' initial choice of A1, A2 from the input column headers and thus +may not align with what the creator intended. Note these columns will be +in the formatted summary statistics returned. Default is FALSE.

    + + +
    log_folder_ind
    +

    Binary Should log files be stored containing all +filtered out SNPs (separate file per filter). The data is outputted in the +same format specified for the resulting sumstats file. The only exception to +this rule is if output is vcf, then log file saved as .tsv.gz. Default is +FALSE.

    + + +
    tabix_index
    +

    Index the formatted summary statistics with +tabix for fast querying.

    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + + +
    log_files
    +

    list of log file locations

    + + +
    standardise_headers
    +

    Run +standardise_sumstats_column_headers_crossplatform first.

    + + +
    mapping_file
    +

    MungeSumstats has a pre-defined column-name mapping file +which should cover the most common column headers and their interpretations. +However, if a column header that is in your file is missing or the mapping we +give is incorrect, you can supply your own mapping file. Must be a 2 column +dataframe with column names "Uncorrected" and "Corrected". See +data(sumstatsColHeaders) for default mapping and necessary format.

    + + +
    dbSNP
    +

    version of dbSNP to be used for imputation (144 or 155).

    + +
    +
    +

    Value

    + + +

    A list containing two data tables:

    • sumstats_dt: the modified summary statistics +data.table object.

    • +
    • rsids: snpsById, filtered to SNPs of interest if +loaded already. Or else NULL.

    • +
    • log_files: log file list

    • +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_allele_merge.html b/docs/reference/check_allele_merge.html new file mode 100644 index 00000000..9bd0d30e --- /dev/null +++ b/docs/reference/check_allele_merge.html @@ -0,0 +1,116 @@ + +Ensure that A1:A2 or A1/A2 or A1>A2 or A2>A1 aren't merged into 1 column — check_allele_merge • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Ensure that A1:A2 or A1/A2 or A1>A2 or A2>A1 aren't merged into 1 column

    +
    + +
    +
    check_allele_merge(sumstats_dt, path)
    +
    + +
    +

    Arguments

    +
    sumstats_dt
    +

    data table obj of the summary statistics file for the GWAS

    + + +
    path
    +

    Filepath for the summary statistics file to be formatted

    + +
    +
    +

    Value

    + + +

    list containing sumstats_dt, the modified summary +statistics data table object.

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_bi_allelic.html b/docs/reference/check_bi_allelic.html new file mode 100644 index 00000000..aab0b48f --- /dev/null +++ b/docs/reference/check_bi_allelic.html @@ -0,0 +1,164 @@ + +Remove non-biallelic SNPs — check_bi_allelic • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Remove non-biallelic SNPs

    +
    + +
    +
    check_bi_allelic(
    +  sumstats_dt,
    +  path,
    +  ref_genome,
    +  bi_allelic_filter,
    +  rsids,
    +  log_folder_ind,
    +  check_save_out,
    +  tabix_index,
    +  nThread,
    +  log_files,
    +  dbSNP
    +)
    +
    + +
    +

    Arguments

    +
    path
    +

    Filepath for the summary statistics file to be formatted. A +dataframe or datatable of the summary statistics file can also be passed +directly to MungeSumstats using the path parameter.

    + + +
    ref_genome
    +

    name of the reference genome used for the GWAS ("GRCh37" or +"GRCh38"). Argument is case-insensitive. Default is NULL which infers the +reference genome from the data.

    + + +
    bi_allelic_filter
    +

    Binary Should non-bi-allelic SNPs be removed. +Default is TRUE.

    + + +
    log_folder_ind
    +

    Binary Should log files be stored containing all +filtered out SNPs (separate file per filter). The data is outputted in the +same format specified for the resulting sumstats file. The only exception to +this rule is if output is vcf, then log file saved as .tsv.gz. Default is +FALSE.

    + + +
    tabix_index
    +

    Index the formatted summary statistics with +tabix for fast querying.

    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + + +
    log_files
    +

    list of log file locations

    + + +
    dbSNP
    +

    version of dbSNP to be used for imputation (144 or 155).

    + +
    +
    +

    Value

    + + +

    A list containing two data tables:

    • sumstats_dt: the modified summary statistics data table object

    • +
    • rsids: snpsById, filtered to SNPs of interest +if loaded already. Or else NULL.

    • +
    • log_files: log file list

    • +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_bp_range.html b/docs/reference/check_bp_range.html new file mode 100644 index 00000000..90800f3c --- /dev/null +++ b/docs/reference/check_bp_range.html @@ -0,0 +1,163 @@ + +Ensure that the Base-pair column values are all within the range for the chromosome — check_bp_range • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Ensure that the Base-pair column values are all within the range for the +chromosome

    +
    + +
    +
    check_bp_range(
    +  sumstats_dt,
    +  path,
    +  ref_genome,
    +  log_folder_ind,
    +  imputation_ind,
    +  check_save_out,
    +  tabix_index,
    +  nThread,
    +  log_files
    +)
    +
    + +
    +

    Arguments

    +
    path
    +

    Filepath for the summary statistics file to be formatted. A +dataframe or datatable of the summary statistics file can also be passed +directly to MungeSumstats using the path parameter.

    + + +
    ref_genome
    +

    name of the reference genome used for the GWAS ("GRCh37" or +"GRCh38"). Argument is case-insensitive. Default is NULL which infers the +reference genome from the data.

    + + +
    log_folder_ind
    +

    Binary Should log files be stored containing all +filtered out SNPs (separate file per filter). The data is outputted in the +same format specified for the resulting sumstats file. The only exception to +this rule is if output is vcf, then log file saved as .tsv.gz. Default is +FALSE.

    + + +
    imputation_ind
    +

    Binary Should a column be added for each imputation +step to show what SNPs have imputed values for differing fields. This +includes a field denoting SNP allele flipping (flipped). On the flipped +value, this denotes whether the alleles were switched based on +MungeSumstats' initial choice of A1, A2 from the input column headers and thus +may not align with what the creator intended. Note these columns will be +in the formatted summary statistics returned. Default is FALSE.

    + + +
    tabix_index
    +

    Index the formatted summary statistics with +tabix for fast querying.

    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + + +
    log_files
    +

    list of log file locations

    + +
    +
    +

    Value

    + + +

    list containing sumstats_dt, the modified summary statistics data +table object and the log file list

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_chr.html b/docs/reference/check_chr.html new file mode 100644 index 00000000..0d2b5a6e --- /dev/null +++ b/docs/reference/check_chr.html @@ -0,0 +1,155 @@ + +Standardize the CHR column — check_chr • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Maps chromosome names to the default Ensembl/NCBI naming style and removes +SNPs with nonstandard CHR entries. Optionally, also removes SNPs on +user-specified chromosomes.

    +
    + +
    +
    check_chr(
    +  sumstats_dt,
    +  log_files,
    +  check_save_out,
    +  rmv_chr,
    +  nThread,
    +  tabix_index,
    +  log_folder_ind
    +)
    +
    + +
    +

    Arguments

    +
    sumstats_dt
    +

    data.table with summary statistics

    + + +
    log_files
    +

    list of locations for all log files

    + + +
    check_save_out
    +

    list of parameters for saved files

    + + +
    rmv_chr
    +

    Chromosomes to exclude from the formatted summary statistics +file. Use NULL if no filtering is necessary. Default is c("X", "Y", "MT") +which removes all non-autosomal SNPs.

    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + + +
    tabix_index
    +

    Index the formatted summary statistics with +tabix for fast querying.

    + + +
    log_folder_ind
    +

    Binary Should log files be stored containing all +filtered out SNPs (separate file per filter). The data is outputted in the +same format specified for the resulting sumstats file. The only exception to +this rule is if output is vcf, then log file saved as .tsv.gz. Default is +FALSE.

    + +
    +
    +

    Value

    + + +

    list containing the updated summary statistics data.table and the +updated log file locations list

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_col_order.html b/docs/reference/check_col_order.html new file mode 100644 index 00000000..2e0e5a4c --- /dev/null +++ b/docs/reference/check_col_order.html @@ -0,0 +1,118 @@ + +Ensure that the first three columns are SNP, CHR, BP in that order and then A1, A2 if present — check_col_order • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Ensure that the first three columns are SNP, CHR, BP in that order and +then A1, A2 if present

    +
    + +
    +
    check_col_order(sumstats_dt, path)
    +
    + +
    +

    Arguments

    +
    sumstats_dt
    +

    data table obj of the summary statistics file for the GWAS

    + + +
    path
    +

    Filepath for the summary statistics file to be formatted

    + +
    +
    +

    Value

    + + +

    list containing sumstats_dt, the modified summary statistics +data table object

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_drop_indels.html b/docs/reference/check_drop_indels.html new file mode 100644 index 00000000..cc96d31a --- /dev/null +++ b/docs/reference/check_drop_indels.html @@ -0,0 +1,159 @@ + +Drop Indels from summary statistics — check_drop_indels • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Drop Indels from summary statistics

    +
    + +
    +
    check_drop_indels(
    +  sumstats_dt,
    +  drop_indels,
    +  path,
    +  log_folder_ind,
    +  check_save_out,
    +  tabix_index,
    +  nThread,
    +  log_files
    +)
    +
    + +
    +

    Source

    +

    +sumstats_dt <- MungeSumstats:::formatted_example() +sumstats <- check_drop_indels(sumstats_dt = sumstats_dt, + drop_indels = TRUE) +

    +
    +
    +

    Arguments

    +
    sumstats_dt
    +

    data table obj of the summary statistics file for the GWAS

    + + +
    drop_indels
    +

    Binary, should any indels found in the sumstats be +dropped? These can not be checked against a reference dataset and will have +the same RS ID and position as SNPs which can affect downstream analysis. +Default is False.

    + + +
    path
    +

    Filepath for the summary statistics file to be formatted. A +dataframe or datatable of the summary statistics file can also be passed +directly to MungeSumstats using the path parameter.

    + + +
    log_folder_ind
    +

    Binary Should log files be stored containing all +filtered out SNPs (separate file per filter). The data is outputted in the +same format specified for the resulting sumstats file. The only exception to +this rule is if output is vcf, then log file saved as .tsv.gz. Default is +FALSE.

    + + +
    tabix_index
    +

    Index the formatted summary statistics with +tabix for fast querying.

    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + +
    +
    +

    Value

    + + +

    list containing sumstats_dt, +the modified summary statistics data table object

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_dup_bp.html b/docs/reference/check_dup_bp.html new file mode 100644 index 00000000..3ec64309 --- /dev/null +++ b/docs/reference/check_dup_bp.html @@ -0,0 +1,162 @@ + +Ensure all rows have unique positions, drop those that don't — check_dup_bp • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Ensure all rows have unique positions, drop those that don't

    +
    + +
    +
    check_dup_bp(
    +  sumstats_dt,
    +  bi_allelic_filter,
    +  check_dups,
    +  indels,
    +  path,
    +  log_folder_ind,
    +  check_save_out,
    +  tabix_index,
    +  nThread,
    +  log_files
    +)
    +
    + +
    +

    Arguments

    +
    bi_allelic_filter
    +

    Binary Should non-bi-allelic SNPs be removed. +Default is TRUE.

    + + +
    check_dups
    +

    whether to check for duplicates - if formatting QTL +datasets this should be set to FALSE otherwise keep as TRUE. Default is TRUE.

    + + +
    indels
    +

    Binary does your Sumstats file contain Indels? These don't +exist in our reference file so they will be excluded from checks if this +value is TRUE. Default is TRUE.

    + + +
    path
    +

    Filepath for the summary statistics file to be formatted. A +dataframe or datatable of the summary statistics file can also be passed +directly to MungeSumstats using the path parameter.

    + + +
    log_folder_ind
    +

    Binary Should log files be stored containing all +filtered out SNPs (separate file per filter). The data is outputted in the +same format specified for the resulting sumstats file. The only exception to +this rule is if output is vcf, then log file saved as .tsv.gz. Default is +FALSE.

    + + +
    tabix_index
    +

    Index the formatted summary statistics with +tabix for fast querying.

    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + + +
    log_files
    +

    list of log file locations

    + +
    +
    +

    Value

    + + +

    list containing sumstats_dt, the modified summary statistics data +table object and log files list

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_dup_col.html b/docs/reference/check_dup_col.html new file mode 100644 index 00000000..37236ad1 --- /dev/null +++ b/docs/reference/check_dup_col.html @@ -0,0 +1,116 @@ + +Ensure that no columns are duplicated — check_dup_col • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Ensure that no columns are duplicated

    +
    + +
    +
    check_dup_col(sumstats_dt, path)
    +
    + +
    +

    Arguments

    +
    sumstats_dt
    +

    data table obj of the summary statistics file for the GWAS

    + + +
    path
    +

    Filepath for the summary statistics file to be formatted

    + +
    +
    +

    Value

    + + +

    list containing sumstats_dt, the modified +summary statistics data table object

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_dup_row.html b/docs/reference/check_dup_row.html new file mode 100644 index 00000000..8d0c05ea --- /dev/null +++ b/docs/reference/check_dup_row.html @@ -0,0 +1,149 @@ + +Ensure all rows are unique based on SNP,CHR,BP,A1,A2, drop those that aren't — check_dup_row • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Ensure all rows are unique based on SNP,CHR,BP,A1,A2, drop those that aren't

    +
    + +
    +
    check_dup_row(
    +  sumstats_dt,
    +  check_dups,
    +  path,
    +  log_folder_ind,
    +  check_save_out,
    +  tabix_index,
    +  nThread,
    +  log_files
    +)
    +
    + +
    +

    Arguments

    +
    check_dups
    +

    whether to check for duplicates - if formatting QTL +datasets this should be set to FALSE otherwise keep as TRUE. Default is TRUE.

    + + +
    path
    +

    Filepath for the summary statistics file to be formatted. A +dataframe or datatable of the summary statistics file can also be passed +directly to MungeSumstats using the path parameter.

    + + +
    log_folder_ind
    +

    Binary Should log files be stored containing all +filtered out SNPs (separate file per filter). The data is outputted in the +same format specified for the resulting sumstats file. The only exception to +this rule is if output is vcf, then log file saved as .tsv.gz. Default is +FALSE.

    + + +
    tabix_index
    +

    Index the formatted summary statistics with +tabix for fast querying.

    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + + +
    log_files
    +

    list of log file locations

    + +
    +
    +

    Value

    + + +

    list containing sumstats_dt, the modified summary statistics data +table object and log files list

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_dup_snp.html b/docs/reference/check_dup_snp.html new file mode 100644 index 00000000..ad4732f5 --- /dev/null +++ b/docs/reference/check_dup_snp.html @@ -0,0 +1,162 @@ + +Ensure all rows have unique SNP IDs, drop those that don't — check_dup_snp • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Ensure all rows have unique SNP IDs, drop those that don't

    +
    + +
    +
    check_dup_snp(
    +  sumstats_dt,
    +  indels,
    +  path,
    +  log_folder_ind,
    +  check_save_out,
    +  tabix_index,
    +  nThread,
    +  log_files,
    +  bi_allelic_filter,
    +  check_dups
    +)
    +
    + +
    +

    Arguments

    +
    indels
    +

    Binary does your Sumstats file contain Indels? These don't +exist in our reference file so they will be excluded from checks if this +value is TRUE. Default is TRUE.

    + + +
    path
    +

    Filepath for the summary statistics file to be formatted. A +dataframe or datatable of the summary statistics file can also be passed +directly to MungeSumstats using the path parameter.

    + + +
    log_folder_ind
    +

    Binary Should log files be stored containing all +filtered out SNPs (separate file per filter). The data is outputted in the +same format specified for the resulting sumstats file. The only exception to +this rule is if output is vcf, then log file saved as .tsv.gz. Default is +FALSE.

    + + +
    tabix_index
    +

    Index the formatted summary statistics with +tabix for fast querying.

    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + + +
    log_files
    +

    list of log file locations

    + + +
    bi_allelic_filter
    +

    Binary Should non-bi-allelic SNPs be removed. +Default is TRUE.

    + + +
    check_dups
    +

    whether to check for duplicates - if formatting QTL +datasets this should be set to FALSE otherwise keep as TRUE. Default is TRUE.

    + +
    +
    +

    Value

    + + +

    list containing sumstats_dt, the modified summary statistics data +table object and log files list

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_effect_columns_nonzero.html b/docs/reference/check_effect_columns_nonzero.html new file mode 100644 index 00000000..648fcfdf --- /dev/null +++ b/docs/reference/check_effect_columns_nonzero.html @@ -0,0 +1,150 @@ + +Ensure that the standard error (se) is positive for all SNPs — check_effect_columns_nonzero • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Ensure that the effect columns (BETA, OR, LOG_ODDS, SIGNED_SUMSTAT) are non-zero for all SNPs

    +
    + +
    +
    check_effect_columns_nonzero(
    +  sumstats_dt,
    +  path,
    +  effect_columns_nonzero,
    +  log_folder_ind,
    +  check_save_out,
    +  tabix_index,
    +  nThread,
    +  log_files
    +)
    +
    + +
    +

    Arguments

    +
    path
    +

    Filepath for the summary statistics file to be formatted. A +dataframe or datatable of the summary statistics file can also be passed +directly to MungeSumstats using the path parameter.

    + + +
    effect_columns_nonzero
    +

    Binary should the effect columns in the data +BETA,OR (odds ratio),LOG_ODDS,SIGNED_SUMSTAT be checked to ensure no SNP=0. +Those that do are removed(if present in sumstats file). Default FALSE.

    + + +
    log_folder_ind
    +

    Binary Should log files be stored containing all +filtered out SNPs (separate file per filter). The data is outputted in the +same format specified for the resulting sumstats file. The only exception to +this rule is if output is vcf, then log file saved as .tsv.gz. Default is +FALSE.

    + + +
    tabix_index
    +

    Index the formatted summary statistics with +tabix for fast querying.

    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + + +
    log_files
    +

    list of log file locations

    + +
    +
    +

    Value

    + + +

    list containing sumstats_dt, the modified summary statistics data +table object and the log file list

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_empty_cols.html b/docs/reference/check_empty_cols.html new file mode 100644 index 00000000..aa560992 --- /dev/null +++ b/docs/reference/check_empty_cols.html @@ -0,0 +1,117 @@ + +Check for empty columns — check_empty_cols • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Empty columns contain only ".", NA, or 0

    +
    + +
    +
    check_empty_cols(sumstats_dt, sampled_rows = NULL, verbose = TRUE)
    +
    + +
    +

    Arguments

    +
    sampled_rows
    +

    First N rows to sample. +Set NULL to use full sumstats_file +when determining whether cols are empty.

    + + +
    verbose
    +

    Print messages.

    + +
    +
    +

    Value

    + + +

    empty_cols

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_four_step_col.html b/docs/reference/check_four_step_col.html new file mode 100644 index 00000000..a6dc7376 --- /dev/null +++ b/docs/reference/check_four_step_col.html @@ -0,0 +1,116 @@ + +Ensure that CHR:BP:A2:A1 aren't merged into 1 column — check_four_step_col • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Ensure that CHR:BP:A2:A1 aren't merged into 1 column

    +
    + +
    +
    check_four_step_col(sumstats_dt, path)
    +
    + +
    +

    Arguments

    +
    sumstats_dt
    +

    data table obj of the summary statistics file for the GWAS

    + + +
    path
    +

    Filepath for the summary statistics file to be formatted

    + +
    +
    +

    Value

    + + +

    list containing sumstats_dt, the modified +summary statistics data table object

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_frq.html b/docs/reference/check_frq.html new file mode 100644 index 00000000..4db7ce49 --- /dev/null +++ b/docs/reference/check_frq.html @@ -0,0 +1,150 @@ + +Ensure all SNPs have frq score above threshold — check_frq • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Ensure all SNPs have frq score above threshold

    +
    + +
    +
    check_frq(
    +  sumstats_dt,
    +  path,
    +  FRQ_filter,
    +  log_folder_ind,
    +  check_save_out,
    +  tabix_index,
    +  nThread,
    +  log_files
    +)
    +
    + +
    +

    Arguments

    +
    path
    +

    Filepath for the summary statistics file to be formatted. A +dataframe or datatable of the summary statistics file can also be passed +directly to MungeSumstats using the path parameter.

    + + +
    FRQ_filter
    +

    numeric The minimum value permissible of the frequency(FRQ) +of the SNP (i.e. Allele Frequency (AF)) (if present in sumstats file). By +default no filtering is done, i.e. value of 0.

    + + +
    log_folder_ind
    +

    Binary Should log files be stored containing all +filtered out SNPs (separate file per filter). The data is outputted in the +same format specified for the resulting sumstats file. The only exception to +this rule is if output is vcf, then log file saved as .tsv.gz. Default is +FALSE.

    + + +
    tabix_index
    +

    Index the formatted summary statistics with +tabix for fast querying.

    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + + +
    log_files
    +

    list of log file locations

    + +
    +
    +

    Value

    + + +

    list containing sumstats_dt, the modified summary statistics data +table object and the log file list

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_frq_maf.html b/docs/reference/check_frq_maf.html new file mode 100644 index 00000000..484ff47a --- /dev/null +++ b/docs/reference/check_frq_maf.html @@ -0,0 +1,116 @@ + +Check that FRQ column refers to minor/effect allele frequency not major — check_frq_maf • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Check that FRQ column refers to minor/effect allele frequency not major

    +
    + +
    +
    check_frq_maf(sumstats_dt, frq_is_maf)
    +
    + +
    +

    Arguments

    +
    frq_is_maf
    +

    Conventionally the FRQ column is intended to show the +minor/effect allele frequency (MAF) but sometimes the major allele frequency +can be inferred as the FRQ column. This logical variable indicates that the +FRQ column should be renamed to MAJOR_ALLELE_FRQ if the frequency values +appear to relate to the major allele i.e. >0.5. By default this mapping won't +occur i.e. is TRUE.

    + +
    +
    +

    Value

    + + +

    sumstats_dt, the modified summary statistics data table object

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_info_score.html b/docs/reference/check_info_score.html new file mode 100644 index 00000000..fe028a2e --- /dev/null +++ b/docs/reference/check_info_score.html @@ -0,0 +1,142 @@ + +Ensure all SNPs have info score above threshold — check_info_score • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Ensure all SNPs have info score above threshold

    +
    + +
    +
    check_info_score(
    +  sumstats_dt,
    +  INFO_filter,
    +  log_folder_ind,
    +  check_save_out,
    +  tabix_index,
    +  nThread,
    +  log_files
    +)
    +
    + +
    +

    Arguments

    +
    INFO_filter
    +

    numeric The minimum value permissible of the imputation +information score (if present in sumstats file). Default 0.9.

    + + +
    log_folder_ind
    +

    Binary Should log files be stored containing all +filtered out SNPs (separate file per filter). The data is outputted in the +same format specified for the resulting sumstats file. The only exception to +this rule is if output is vcf, then log file saved as .tsv.gz. Default is +FALSE.

    + + +
    tabix_index
    +

    Index the formatted summary statistics with +tabix for fast querying.

    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + + +
    log_files
    +

    list of log file locations.

    + +
    +
    +

    Value

    + + +

    list containing sumstats_dt, the modified summary statistics data +table object and the log file list

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_ldsc_format.html b/docs/reference/check_ldsc_format.html new file mode 100644 index 00000000..92ce92c9 --- /dev/null +++ b/docs/reference/check_ldsc_format.html @@ -0,0 +1,174 @@ + +Ensures that parameters are compatible with LDSC format — check_ldsc_format • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Format summary statistics for direct input to +Linkage Disequilibrium SCore (LDSC) regression without the need +to use their munge_sumstats.py script first.

    +
    + +
    +
    check_ldsc_format(
    +  sumstats_dt,
    +  save_format,
    +  convert_n_int,
    +  allele_flip_check,
    +  compute_z,
    +  compute_n
    +)
    +
    + +
    +

    Source

    +

    LDSC GitHub

    +
    +
    +

    Arguments

    +
    sumstats_dt
    +

    data table obj of the summary statistics file for the +GWAS.

    + + +
    save_format
    +

    Output format of sumstats. Options are NULL - standardised +output format from MungeSumstats, LDSC - output format compatible with LDSC +and openGWAS - output compatible with openGWAS VCFs. Default is NULL. +NOTE - If LDSC format is used, the naming convention of A1 as the +reference (genome build) allele and A2 as the effect allele will be reversed +to match LDSC (A1 will now be the effect allele). See more info on this +here. Note that any +effect columns (e.g. Z) will be in relation to A1 now instead of A2.

    + + +
    convert_n_int
    +

    Binary, if N (the number of samples) is not an integer, +should this be rounded? Default is TRUE.

    + + +
    allele_flip_check
    +

    Binary Should the allele columns be checked against +reference genome to infer if flipping is necessary. Default is TRUE.

    + + +
    compute_z
    +

    Whether to compute Z-score column. Default is FALSE. This +can be computed from Beta and SE with (Beta/SE) or P +(Z:=sign(BETA)*sqrt(stats::qchisq(P,1,lower=FALSE))). +Note that imputing the Z-score from P for every SNP will not be +perfectly correct and may result in a loss of power. This should only be done +as a last resort. Use 'BETA' to impute by BETA/SE and 'P' to impute by SNP +p-value.

    + + +
    compute_n
    +

    Whether to impute N. Default of 0 won't impute, any other +integer will be imputed as the N (sample size) for every SNP in the dataset. +Note that imputing the sample size for every SNP is not correct and +should only be done as a last resort. N can also be inputted with "ldsc", +"sum", "giant" or "metal" by passing one of these for this field or a vector +of multiple. Sum and an integer value creates an N column in the output +whereas giant, metal or ldsc create an Neff or effective sample size. If +multiples are passed, the formula used to derive it will be indicated.

    + +
    +
    +

    Value

    + + +

    Formatted summary statistics

    +
    +
    +

    Details

    +

    +LDSC documentation.

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_miss_data.html b/docs/reference/check_miss_data.html new file mode 100644 index 00000000..4f3cb2e5 --- /dev/null +++ b/docs/reference/check_miss_data.html @@ -0,0 +1,153 @@ + +Remove SNPs with missing data — check_miss_data • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Remove SNPs with missing data

    +
    + +
    +
    check_miss_data(
    +  sumstats_dt,
    +  path,
    +  log_folder_ind,
    +  check_save_out,
    +  tabix_index,
    +  nThread,
    +  log_files,
    +  drop_na_cols
    +)
    +
    + +
    +

    Arguments

    +
    path
    +

    Filepath for the summary statistics file to be formatted. A +dataframe or datatable of the summary statistics file can also be passed +directly to MungeSumstats using the path parameter.

    + + +
    log_folder_ind
    +

    Binary Should log files be stored containing all +filtered out SNPs (separate file per filter). The data is outputted in the +same format specified for the resulting sumstats file. The only exception to +this rule is if output is vcf, then log file saved as .tsv.gz. Default is +FALSE.

    + + +
    tabix_index
    +

    Index the formatted summary statistics with +tabix for fast querying.

    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + + +
    log_files
    +

    list of log file locations

    + + +
    drop_na_cols
    +

    A character vector of column names to be checked for +missing values. Rows with missing values in any of these columns (if present +in the dataset) will be dropped. If NULL, all columns will be checked for +missing values. Default columns are SNP, chromosome, position, allele 1, +allele2, effect columns (frequency, beta, Z-score, standard error, log odds, +signed sumstats, odds ratio), p value and N columns.

    + +
    +
    +

    Value

    + + +

    list containing sumstats_dt, the modified summary statistics data +table object and a log file list.

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_multi_gwas.html b/docs/reference/check_multi_gwas.html new file mode 100644 index 00000000..20330398 --- /dev/null +++ b/docs/reference/check_multi_gwas.html @@ -0,0 +1,136 @@ + +Ensure that only one model in GWAS sumstats or only one trait tested — check_multi_gwas • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Ensure that only one model in GWAS sumstats or only one trait tested

    +
    + +
    +
    check_multi_gwas(
    +  sumstats_dt,
    +  path,
    +  analysis_trait,
    +  ignore_multi_trait,
    +  mapping_file
    +)
    +
    + +
    +

    Arguments

    +
    sumstats_dt
    +

    data table obj of the summary statistics file for the GWAS

    + + +
    path
    +

    Filepath for the summary statistics file to be formatted

    + + +
    analysis_trait
    +

    If multiple traits were studied, name of the trait for +analysis from the GWAS. Default is NULL

    + + +
    mapping_file
    +

    MungeSumstats has a pre-defined column-name mapping file +which should cover the most common column headers and their interpretations. +However, if a column header that is in your file is missing or the mapping we +give is incorrect, you can supply your own mapping file. Must be a 2 column +dataframe with column names "Uncorrected" and "Corrected". See +data(sumstatsColHeaders) for default mapping and necessary format.

    + +
    +
    +

    Value

    + + +

    list containing sumstats_dt, the modified summary statistics data +table object

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_multi_rs_snp.html b/docs/reference/check_multi_rs_snp.html new file mode 100644 index 00000000..52ddb635 --- /dev/null +++ b/docs/reference/check_multi_rs_snp.html @@ -0,0 +1,163 @@ + +Ensure that SNP ids don't have multiple rs ids on one line — check_multi_rs_snp • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Ensure that SNP ids don't have multiple rs ids on one line

    +
    + +
    +
    check_multi_rs_snp(
    +  sumstats_dt,
    +  path,
    +  remove_multi_rs_snp,
    +  imputation_ind,
    +  log_folder_ind,
    +  check_save_out,
    +  tabix_index,
    +  nThread,
    +  log_files
    +)
    +
    + +
    +

    Arguments

    +
    path
    +

    Filepath for the summary statistics file to be formatted. A +dataframe or datatable of the summary statistics file can also be passed +directly to MungeSumstats using the path parameter.

    + + +
    remove_multi_rs_snp
    +

    Binary Sometimes summary statistics can have +multiple RSIDs on one row (i.e. related to one SNP), for example +"rs5772025_rs397784053". This can cause an error so by default, the first +RS ID will be kept and the rest removed e.g."rs5772025". If you want to just +remove these SNPs entirely, set it to TRUE. Default is FALSE.

    + + +
    imputation_ind
    +

    Binary Should a column be added for each imputation +step to show what SNPs have imputed values for differing fields. This +includes a field denoting SNP allele flipping (flipped). On the flipped +value, this denotes whether the alleles were switched based on +MungeSumstats' initial choice of A1, A2 from the input column headers and thus +may not align with what the creator intended. Note these columns will be +in the formatted summary statistics returned. Default is FALSE.

    + + +
    log_folder_ind
    +

    Binary Should log files be stored containing all +filtered out SNPs (separate file per filter). The data is outputted in the +same format specified for the resulting sumstats file. The only exception to +this rule is if output is vcf, then log file saved as .tsv.gz. Default is +FALSE.

    + + +
    tabix_index
    +

    Index the formatted summary statistics with +tabix for fast querying.

    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + + +
    log_files
    +

    list of log file locations

    + +
    +
    +

    Value

    + + +

    list containing sumstats_dt, the modified summary statistics data +table object and the log file list.

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_n_int.html b/docs/reference/check_n_int.html new file mode 100644 index 00000000..8cfa9188 --- /dev/null +++ b/docs/reference/check_n_int.html @@ -0,0 +1,129 @@ + +Ensure that the N column is all integers — check_n_int • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Ensure that the N column is all integers

    +
    + +
    +
    check_n_int(sumstats_dt, path, convert_n_int, imputation_ind)
    +
    + +
    +

    Arguments

    +
    sumstats_dt
    +

    data table obj of the summary statistics file for the GWAS

    + + +
    path
    +

    Filepath for the summary statistics file to be formatted

    + + +
    convert_n_int
    +

    Binary, if N (the number of samples) is not an integer, +should this be rounded? Default is TRUE.

    + + +
    imputation_ind
    +

    Binary Should a column be added for each imputation +step to show what SNPs have imputed values for differing fields. This +includes a field denoting SNP allele flipping (flipped). Note +these columns will be in the formatted summary statistics returned. Default +is FALSE.

    + +
    +
    +

    Value

    + + +

    list containing sumstats_dt, the modified summary +statistics data table object.

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_n_num.html b/docs/reference/check_n_num.html new file mode 100644 index 00000000..75cb3e32 --- /dev/null +++ b/docs/reference/check_n_num.html @@ -0,0 +1,156 @@ + +Ensure all SNPs have N less than X std dev below mean — check_n_num • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    In case some SNPs were genotyped by a specialized genotyping array and +have substantially more samples than others. These will be removed.

    +
    + +
    +
    check_n_num(
    +  sumstats_dt,
    +  path,
    +  N_std,
    +  N_dropNA = FALSE,
    +  log_folder_ind,
    +  check_save_out,
    +  tabix_index,
    +  nThread,
    +  log_files
    +)
    +
    + +
    +

    Arguments

    +
    path
    +

    Filepath for the summary statistics file to be formatted. A +dataframe or datatable of the summary statistics file can also be passed +directly to MungeSumstats using the path parameter.

    + + +
    N_std
    +

    numeric The number of standard deviations above the mean that a SNP's +N must exceed for the SNP to be removed. Default is 5.

    + + +
    N_dropNA
    +

    Drop rows where N is missing. Default is TRUE.

    + + +
    log_folder_ind
    +

    Binary Should log files be stored containing all +filtered out SNPs (separate file per filter). The data is outputted in the +same format specified for the resulting sumstats file. The only exception to +this rule is if output is vcf, then log file saved as .tsv.gz. Default is +FALSE.

    + + +
    tabix_index
    +

    Index the formatted summary statistics with +tabix for fast querying.

    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + + +
    log_files
    +

    list of log file locations

    + +
    +
    +

    Value

    + + +

    list containing sumstats_dt, the modified summary statistics data +table object and the log file list

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_no_allele.html b/docs/reference/check_no_allele.html new file mode 100644 index 00000000..c67ddc7f --- /dev/null +++ b/docs/reference/check_no_allele.html @@ -0,0 +1,185 @@ + +Ensure that A1 & A2 are present, if not can find it with SNP and other allele — check_no_allele • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    More care needs to be taken if one of A1/A2 is present: before imputing the +other, allele flipping needs to be checked.

    +
    + +
    +
    check_no_allele(
    +  sumstats_dt,
    +  path,
    +  ref_genome,
    +  rsids,
    +  imputation_ind,
    +  allele_flip_check,
    +  log_folder_ind,
    +  check_save_out,
    +  tabix_index,
    +  nThread,
    +  log_files,
    +  bi_allelic_filter,
    +  dbSNP
    +)
    +
    + +
    +

    Arguments

    +
    path
    +

    Filepath for the summary statistics file to be formatted. A +dataframe or datatable of the summary statistics file can also be passed +directly to MungeSumstats using the path parameter.

    + + +
    ref_genome
    +

    name of the reference genome used for the GWAS ("GRCh37" or +"GRCh38"). Argument is case-insensitive. Default is NULL which infers the +reference genome from the data.

    + + +
    imputation_ind
    +

    Binary Should a column be added for each imputation +step to show what SNPs have imputed values for differing fields. This +includes a field denoting SNP allele flipping (flipped). On the flipped +value, this denotes whether the alleles were switched based on +MungeSumstats' initial choice of A1, A2 from the input column headers and thus +may not align with what the creator intended. Note these columns will be +in the formatted summary statistics returned. Default is FALSE.

    + + +
    allele_flip_check
    +

    Binary Should the allele columns be checked against +reference genome to infer if flipping is necessary. Default is TRUE.

    + + +
    log_folder_ind
    +

    Binary Should log files be stored containing all +filtered out SNPs (separate file per filter). The data is outputted in the +same format specified for the resulting sumstats file. The only exception to +this rule is if output is vcf, then log file saved as .tsv.gz. Default is +FALSE.

    + + +
    tabix_index
    +

    Index the formatted summary statistics with +tabix for fast querying.

    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + + +
    log_files
    +

    list of log file locations

    + + +
    bi_allelic_filter
    +

    Binary Should non-bi-allelic SNPs be removed. +Default is TRUE.

    + + +
    dbSNP
    +

    version of dbSNP to be used for imputation (144 or 155).

    + +
    +
    +

    Value

    + + +

    A list containing two data tables:

    • sumstats_dt: the modified summary statistics data table object

    • +
    • rsids: snpsById, filtered to SNPs of interest +if loaded already. Or else NULL.

    • +
    • allele_flip_check: does the dataset require allele flip check

    • +
    • log_files: log file list

    • +
    • bi_allelic_filter: should multi-allelic SNPs be filtered out

    • +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_no_chr_bp.html b/docs/reference/check_no_chr_bp.html new file mode 100644 index 00000000..d09a87eb --- /dev/null +++ b/docs/reference/check_no_chr_bp.html @@ -0,0 +1,171 @@ + +Ensure that CHR and BP are missing if SNP is present, can find them — check_no_chr_bp • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Ensure that CHR and BP are missing if SNP is present, can find them

    +
    + +
    +
    check_no_chr_bp(
    +  sumstats_dt,
    +  path,
    +  ref_genome,
    +  rsids,
    +  imputation_ind,
    +  log_folder_ind,
    +  check_save_out,
    +  tabix_index,
    +  nThread,
    +  log_files,
    +  dbSNP
    +)
    +
    + +
    +

    Arguments

    +
    path
    +

    Filepath for the summary statistics file to be formatted. A +dataframe or datatable of the summary statistics file can also be passed +directly to MungeSumstats using the path parameter.

    + + +
    ref_genome
    +

    name of the reference genome used for the GWAS ("GRCh37" or +"GRCh38"). Argument is case-insensitive. Default is NULL which infers the +reference genome from the data.

    + + +
    imputation_ind
    +

    Binary Should a column be added for each imputation +step to show what SNPs have imputed values for differing fields. This +includes a field denoting SNP allele flipping (flipped). On the flipped +value, this denotes whether the alleles were switched based on +MungeSumstats' initial choice of A1, A2 from the input column headers and thus +may not align with what the creator intended. Note these columns will be +in the formatted summary statistics returned. Default is FALSE.

    + + +
    log_folder_ind
    +

    Binary Should log files be stored containing all +filtered out SNPs (separate file per filter). The data is outputted in the +same format specified for the resulting sumstats file. The only exception to +this rule is if output is vcf, then log file saved as .tsv.gz. Default is +FALSE.

    + + +
    tabix_index
    +

    Index the formatted summary statistics with +tabix for fast querying.

    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + + +
    log_files
    +

    list of log file locations

    + + +
    dbSNP
    +

    version of dbSNP to be used for imputation (144 or 155).

    + +
    +
    +

    Value

    + + +

    A list containing two data tables:

    • sumstats_dt + : the modified summary statistics data table object

    • +
    • rsids + : snpsById, filtered to SNPs of interest if loaded already. Or else NULL

    • +
    • log_files + : log file list

    • +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_no_rs_snp.html b/docs/reference/check_no_rs_snp.html new file mode 100644 index 00000000..d030e7f0 --- /dev/null +++ b/docs/reference/check_no_rs_snp.html @@ -0,0 +1,182 @@ + +Ensure that SNP appears to be valid RSIDs (starts with rs) — check_no_rs_snp • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Ensure that SNP appears to be valid RSIDs (starts with rs)

    +
    + +
    +
    check_no_rs_snp(
    +  sumstats_dt,
    +  path,
    +  ref_genome,
    +  snp_ids_are_rs_ids,
    +  indels,
    +  imputation_ind,
    +  log_folder_ind,
    +  check_save_out,
    +  tabix_index,
    +  nThread,
    +  log_files,
    +  dbSNP
    +)
    +
    + +
    +

    Arguments

    +
    path
    +

    Filepath for the summary statistics file to be formatted. A +dataframe or datatable of the summary statistics file can also be passed +directly to MungeSumstats using the path parameter.

    + + +
    ref_genome
    +

    name of the reference genome used for the GWAS ("GRCh37" or +"GRCh38"). Argument is case-insensitive. Default is NULL which infers the +reference genome from the data.

    + + +
    snp_ids_are_rs_ids
    +

    Binary Should the supplied SNP ID's be assumed to +be RSIDs. If not, imputation using the SNP ID for other columns like +base-pair position or chromosome will not be possible. If set to FALSE, the +SNP RS ID will be imputed from the reference genome if possible. Default is +TRUE.

    + + +
    indels
    +

    Binary does your Sumstats file contain Indels? These don't +exist in our reference file so they will be excluded from checks if this +value is TRUE. Default is TRUE.

    + + +
    imputation_ind
    +

    Binary Should a column be added for each imputation +step to show what SNPs have imputed values for differing fields. This +includes a field denoting SNP allele flipping (flipped). On the flipped +value, this denotes whether the alleles were switched based on +MungeSumstats' initial choice of A1, A2 from the input column headers and thus +may not align with what the creator intended. Note these columns will be +in the formatted summary statistics returned. Default is FALSE.

    + + +
    log_folder_ind
    +

    Binary Should log files be stored containing all +filtered out SNPs (separate file per filter). The data is outputted in the +same format specified for the resulting sumstats file. The only exception to +this rule is if output is vcf, then log file saved as .tsv.gz. Default is +FALSE.

    + + +
    tabix_index
    +

    Index the formatted summary statistics with +tabix for fast querying.

    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + + +
    log_files
    +

    list of log file locations

    + + +
    dbSNP
    +

    version of dbSNP to be used for imputation (144 or 155).

    + +
    +
    +

    Value

    + + +

    list containing sumstats_dt, the modified summary statistics data +table object and the log file list.

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_no_snp.html b/docs/reference/check_no_snp.html new file mode 100644 index 00000000..dc81a705 --- /dev/null +++ b/docs/reference/check_no_snp.html @@ -0,0 +1,178 @@ + +Ensure that SNP is present if not can find it with CHR and BP — check_no_snp • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Ensure that SNP is present if not can find it with CHR and BP

    +
    + +
    +
    check_no_snp(
    +  sumstats_dt,
    +  path,
    +  ref_genome,
    +  indels,
    +  imputation_ind,
    +  log_folder_ind,
    +  check_save_out,
    +  tabix_index,
    +  nThread,
    +  log_files,
    +  dbSNP,
    +  verbose = TRUE
    +)
    +
    + +
    +

    Arguments

    +
    path
    +

    Filepath for the summary statistics file to be formatted. A +dataframe or datatable of the summary statistics file can also be passed +directly to MungeSumstats using the path parameter.

    + + +
    ref_genome
    +

    name of the reference genome used for the GWAS ("GRCh37" or +"GRCh38"). Argument is case-insensitive. Default is NULL which infers the +reference genome from the data.

    + + +
    indels
    +

    Binary does your Sumstats file contain Indels? These don't +exist in our reference file so they will be excluded from checks if this +value is TRUE. Default is TRUE.

    + + +
    imputation_ind
    +

    Binary Should a column be added for each imputation +step to show what SNPs have imputed values for differing fields. This +includes a field denoting SNP allele flipping (flipped). On the flipped +value, this denotes whether the alleles were switched based on +MungeSumstats' initial choice of A1, A2 from the input column headers and thus +may not align with what the creator intended. Note these columns will be +in the formatted summary statistics returned. Default is FALSE.

    + + +
    log_folder_ind
    +

    Binary Should log files be stored containing all +filtered out SNPs (separate file per filter). The data is outputted in the +same format specified for the resulting sumstats file. The only exception to +this rule is if output is vcf, then log file saved as .tsv.gz. Default is +FALSE.

    + + +
    tabix_index
    +

    Index the formatted summary statistics with +tabix for fast querying.

    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + + +
    log_files
    +

    list of log file locations

    + + +
    dbSNP
    +

    version of dbSNP to be used for imputation (144 or 155).

    + + +
    verbose
    +

    should messages be printed. Default is TRUE.

    + +
    +
    +

    Value

    + + +

    list containing sumstats_dt, the modified summary statistics data +table object and the log files list

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_numeric.html b/docs/reference/check_numeric.html new file mode 100644 index 00000000..6be823b1 --- /dev/null +++ b/docs/reference/check_numeric.html @@ -0,0 +1,120 @@ + +Check numeric columns — check_numeric • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Checks for any columns that should be numeric, +and ensures that they are indeed numeric.

    +
    + +
    +
    check_numeric(sumstats_dt, cols = c("P", "SE", "FRQ", "MAF", "BETA"))
    +
    + +
    +

    Arguments

    +
    sumstats_dt
    +

    Summary stats with column names already standardised by +format_sumstats.

    + + +
    cols
    +

    Names of columns that should be numeric. +If any of these columns are not actually present in sumstats_dt, +they will be skipped.

    + +
    +
    +

    Value

    + + +

    sumstats_dt

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_on_ref_genome.html b/docs/reference/check_on_ref_genome.html new file mode 100644 index 00000000..04880ce6 --- /dev/null +++ b/docs/reference/check_on_ref_genome.html @@ -0,0 +1,184 @@ + +Ensure all SNPs are on the reference genome — check_on_ref_genome • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Ensure all SNPs are on the reference genome

    +
    + +
    +
    check_on_ref_genome(
    +  sumstats_dt,
    +  path,
    +  ref_genome,
    +  on_ref_genome,
    +  indels = indels,
    +  rsids,
    +  imputation_ind,
    +  log_folder_ind,
    +  check_save_out,
    +  tabix_index,
    +  nThread,
    +  log_files,
    +  dbSNP
    +)
    +
    + +
    +

    Arguments

    +
    path
    +

    Filepath for the summary statistics file to be formatted. A +dataframe or datatable of the summary statistics file can also be passed +directly to MungeSumstats using the path parameter.

    + + +
    ref_genome
    +

    name of the reference genome used for the GWAS ("GRCh37" or +"GRCh38"). Argument is case-insensitive. Default is NULL which infers the +reference genome from the data.

    + + +
    on_ref_genome
    +

    Binary Should a check take place that all SNPs are on +the reference genome by SNP ID. Default is TRUE.

    + + +
    indels
    +

    Binary does your Sumstats file contain Indels? These don't +exist in our reference file so they will be excluded from checks if this +value is TRUE. Default is TRUE.

    + + +
    imputation_ind
    +

    Binary Should a column be added for each imputation +step to show what SNPs have imputed values for differing fields. This +includes a field denoting SNP allele flipping (flipped). On the flipped +value, this denotes whether the alleles were switched based on +MungeSumstats' initial choice of A1, A2 from the input column headers and thus +may not align with what the creator intended. Note these columns will be +in the formatted summary statistics returned. Default is FALSE.

    + + +
    log_folder_ind
    +

    Binary Should log files be stored containing all +filtered out SNPs (separate file per filter). The data is outputted in the +same format specified for the resulting sumstats file. The only exception to +this rule is if output is vcf, then log file saved as .tsv.gz. Default is +FALSE.

    + + +
    tabix_index
    +

    Index the formatted summary statistics with +tabix for fast querying.

    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + + +
    log_files
    +

    list of log file locations

    + + +
    dbSNP
    +

    version of dbSNP to be used for imputation (144 or 155).

    + +
    +
    +

    Value

    + + +

    A list containing two data tables:

    • sumstats_dt + : the modified summary statistics data table object

    • +
    • rsids + : snpsById, filtered to SNPs of interest if loaded already. Or else NULL

    • +
    • log_files + : log file list

    • +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_pos_se.html b/docs/reference/check_pos_se.html new file mode 100644 index 00000000..347e6b9b --- /dev/null +++ b/docs/reference/check_pos_se.html @@ -0,0 +1,174 @@ + +Ensure that the standard error (se) is positive for all SNPs Also impute se if missing — check_pos_se • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Ensure that the standard error (se) is positive for all SNPs +Also impute se if missing

    +
    + +
    +
    check_pos_se(
    +  sumstats_dt,
    +  path,
    +  pos_se,
    +  log_folder_ind,
    +  imputation_ind,
    +  check_save_out,
    +  tabix_index,
    +  nThread,
    +  log_files,
    +  impute_se
    +)
    +
    + +
    +

    Arguments

    +
    path
    +

    Filepath for the summary statistics file to be formatted. A +dataframe or datatable of the summary statistics file can also be passed +directly to MungeSumstats using the path parameter.

    + + +
    pos_se
    +

    Binary Should the standard Error (SE) column be checked to +ensure it is greater than 0? Those that are not are removed (if present in +sumstats file). Default TRUE.

    + + +
    log_folder_ind
    +

    Binary Should log files be stored containing all +filtered out SNPs (separate file per filter). The data is outputted in the +same format specified for the resulting sumstats file. The only exception to +this rule is if output is vcf, then log file saved as .tsv.gz. Default is +FALSE.

    + + +
    imputation_ind
    +

    Binary Should a column be added for each imputation +step to show what SNPs have imputed values for differing fields. This +includes a field denoting SNP allele flipping (flipped). On the flipped +value, this denotes whether the alleles were switched based on +MungeSumstats initial choice of A1, A2 from the input column headers and thus +may not align with what the creator intended. Note these columns will be +in the formatted summary statistics returned. Default is FALSE.

    + + +
    tabix_index
    +

    Index the formatted summary statistics with +tabix for fast querying.

    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + + +
    log_files
    +

    list of log file locations

    + + +
    impute_se
    +

    Binary, whether the standard error should be imputed using +other effect data if it isn't present in the sumstats. Note that this +imputation is an approximation so could have an effect on downstream +analysis. Use with caution. The different methods MungeSumstats will try and +impute se (in this order of priority) are:

    1. BETA / Z 2. abs(BETA/ qnorm(P/2)) +Default is FALSE.

    2. +
    + +
    +
    +

    Value

    + + +

    list containing sumstats_dt, the modified summary statistics data +table object and the log file list

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_range_p_val.html b/docs/reference/check_range_p_val.html new file mode 100644 index 00000000..f80b6bd8 --- /dev/null +++ b/docs/reference/check_range_p_val.html @@ -0,0 +1,146 @@ + +Ensure that the p values are not >1 and if so set to 1 — check_range_p_val • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Ensure that the p values are not >1 and if so set to 1

    +
    + +
    +
    check_range_p_val(sumstats_dt, convert_large_p, convert_neg_p, imputation_ind)
    +
    + +
    +

    Source

    +

    +sumstats_dt <- MungeSumstats:::formatted_example() +sumstats_dt$P[1:3] <- 5 +sumstats_dt$P[6:10] <- -5 +sumstats <- check_range_p_val(sumstats_dt = sumstats_dt, + convert_large_p = TRUE, + convert_neg_p = TRUE, + imputation_ind = TRUE) +

    +
    +
    +

    Arguments

    +
    sumstats_dt
    +

    data table obj of the summary statistics file for the GWAS

    + + +
    convert_large_p
    +

    Binary, should p-values >1 be converted to 1? +P-values >1 should not be possible and can cause errors with LDSC/MAGMA and +should be converted. Default is TRUE.

    + + +
    convert_neg_p
    +

    Binary, should p-values <0 be converted to 0? +Negative p-values should not be possible and can cause errors +with LDSC/MAGMA and should be converted. Default is TRUE.

    + + +
    imputation_ind
    +

    Binary Should a column be added for each imputation +step to show what SNPs have imputed values for differing fields. This +includes a field denoting SNP allele flipping (flipped). On the flipped +value, this denotes whether the alleles were switched based on +MungeSumstats initial choice of A1, A2 from the input column headers and thus +may not align with what the creator intended. Note these columns will be +in the formatted summary statistics returned. Default is FALSE.

    + +
    +
    +

    Value

    + + +

    list containing sumstats_dt, +the modified summary statistics data table object

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_row_snp.html b/docs/reference/check_row_snp.html new file mode 100644 index 00000000..92e7f941 --- /dev/null +++ b/docs/reference/check_row_snp.html @@ -0,0 +1,143 @@ + +Ensure all rows have SNPs beginning with rs or SNP, drop those that don't — check_row_snp • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Ensure all rows have SNPs beginning with rs or SNP, drop those that don't

    +
    + +
    +
    check_row_snp(
    +  sumstats_dt,
    +  path,
    +  log_folder_ind,
    +  check_save_out,
    +  tabix_index,
    +  nThread,
    +  log_files
    +)
    +
    + +
    +

    Arguments

    +
    path
    +

    Filepath for the summary statistics file to be formatted. A +dataframe or datatable of the summary statistics file can also be passed +directly to MungeSumstats using the path parameter.

    + + +
    log_folder_ind
    +

    Binary Should log files be stored containing all +filtered out SNPs (separate file per filter). The data is outputted in the +same format specified for the resulting sumstats file. The only exception to +this rule is if output is vcf, then log file saved as .tsv.gz. Default is +FALSE.

    + + +
    tabix_index
    +

    Index the formatted summary statistics with +tabix for fast querying.

    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + + +
    log_files
    +

    list of log file locations

    + +
    +
    +

    Value

    + + +

    list containing sumstats_dt, the modified summary statistics data +table object and log file list

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_save_path.html b/docs/reference/check_save_path.html new file mode 100644 index 00000000..8856b19d --- /dev/null +++ b/docs/reference/check_save_path.html @@ -0,0 +1,149 @@ + +Check if save path and log folder is appropriate — check_save_path • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Check if save path and log folder is appropriate

    +
    + +
    +
    check_save_path(
    +  save_path,
    +  log_folder,
    +  log_folder_ind,
    +  tabix_index,
    +  write_vcf = FALSE,
    +  verbose = TRUE
    +)
    +
    + +
    +

    Arguments

    +
    save_path
    +

    File path to save formatted data. Defaults to +tempfile(fileext=".tsv.gz").

    + + +
    log_folder
    +

    Filepath to the directory for the log files and the log of +MungeSumstats messages to be stored. Default is a temporary directory. Note +the name of the log files (log messages and log outputs) are now the same as +the name of the file specified in the save path parameter with the extension +'_log_msg.txt' and '_log_output.txt' respectively.

    + + +
    log_folder_ind
    +

    Binary Should log files be stored containing all +filtered out SNPs (separate file per filter). The data is outputted in the +same format specified for the resulting sumstats file. The only exception to +this rule is if output is vcf, then log file saved as .tsv.gz. Default is +FALSE.

    + + +
    tabix_index
    +

    Index the formatted summary statistics with +tabix for fast querying.

    + + +
    write_vcf
    +

    Whether to write as VCF (TRUE) or tabular file (FALSE).

    + + +
    verbose
    +

    Print messages.

    + +
    +
    +

    Value

    + + +

    Corrected save_path, the file type, the separator, corrected +log_folder, the log file extension.

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_signed_col.html b/docs/reference/check_signed_col.html new file mode 100644 index 00000000..1dff6814 --- /dev/null +++ b/docs/reference/check_signed_col.html @@ -0,0 +1,165 @@ + +Ensure that there is at least one signed column in summary statistics file Impute beta if user requests — check_signed_col • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Ensure that there is at least one signed column in summary statistics file +Impute beta if user requests

    +
    + +
    +
    check_signed_col(
    +  sumstats_dt,
    +  impute_beta,
    +  log_folder_ind,
    +  rsids,
    +  imputation_ind,
    +  check_save_out,
    +  tabix_index,
    +  log_files,
    +  nThread
    +)
    +
    + +
    +

    Arguments

    +
    sumstats_dt
    +

    data table obj of the summary statistics +file for the GWAS

    + + +
    impute_beta
    +

    Binary, whether BETA should be imputed using other effect +data if it isn't present in the sumstats. Note that this imputation is an +approximation (for Z & SE approach) so could have an effect on downstream +analysis. Use with caution. The different methods MungeSumstats will try and +impute beta (in this order of priority) are:

    1. log(OR) 2. Z x SE +Default value is FALSE.

    2. +
    + + +
    log_folder_ind
    +

    Binary Should log files be stored containing all +filtered out SNPs (separate file per filter). The data is outputted in the +same format specified for the resulting sumstats file. The only exception to +this rule is if output is vcf, then log file saved as .tsv.gz. Default is +FALSE.

    + + +
    imputation_ind
    +

    Binary Should a column be added for each imputation +step to show what SNPs have imputed values for differing fields. This +includes a field denoting SNP allele flipping (flipped). On the flipped +value, this denotes whether the alleles were switched based on +MungeSumstats initial choice of A1, A2 from the input column headers and thus +may not align with what the creator intended. Note these columns will be +in the formatted summary statistics returned. Default is FALSE.

    + + +
    tabix_index
    +

    Index the formatted summary statistics with +tabix for fast querying.

    + + +
    log_files
    +

    list of log file locations

    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + +
    +
    +

    Value

    + + +

    null

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_small_p_val.html b/docs/reference/check_small_p_val.html new file mode 100644 index 00000000..764a73d8 --- /dev/null +++ b/docs/reference/check_small_p_val.html @@ -0,0 +1,140 @@ + +Ensure that the non-negative p-values are not 5e-324 or lower, if so set to 0 — check_small_p_val • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Ensure that the non-negative p-values are not 5e-324 or lower, if so set to 0

    +
    + +
    +
    check_small_p_val(sumstats_dt, convert_small_p, imputation_ind)
    +
    + +
    +

    Source

    +

    +sumstats_dt <- MungeSumstats:::formatted_example() +sumstats_dt$P[1:3] <- 5e-324 +sumstats_dt$P[6:10] <- "5e-324" +sumstats <- check_small_p_val(sumstats_dt = sumstats_dt, + convert_small_p = TRUE, + imputation_ind = TRUE) +

    +
    +
    +

    Arguments

    +
    sumstats_dt
    +

    data table obj of the summary statistics file for the GWAS

    + + +
    convert_small_p
    +

    Binary, should non-negative +p-values <= 5e-324 be converted to 0? +Small p-values pass the R limit and can cause errors with LDSC/MAGMA and +should be converted. Default is TRUE.

    + + +
    imputation_ind
    +

    Binary Should a column be added for each imputation +step to show what SNPs have imputed values for differing fields. This +includes a field denoting SNP allele flipping (flipped). On the flipped +value, this denotes whether the alleles were switched based on +MungeSumstats initial choice of A1, A2 from the input column headers and thus +may not align with what the creator intended. Note these columns will be +in the formatted summary statistics returned. Default is FALSE.

    + +
    +
    +

    Value

    + + +

    list containing sumstats_dt, +the modified summary statistics data table object

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_strand_ambiguous.html b/docs/reference/check_strand_ambiguous.html new file mode 100644 index 00000000..8da860ae --- /dev/null +++ b/docs/reference/check_strand_ambiguous.html @@ -0,0 +1,156 @@ + +Remove SNPs with strand-ambiguous alleles — check_strand_ambiguous • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Remove SNPs with strand-ambiguous alleles

    +
    + +
    +
    check_strand_ambiguous(
    +  sumstats_dt,
    +  path,
    +  ref_genome,
    +  strand_ambig_filter,
    +  log_folder_ind,
    +  check_save_out,
    +  tabix_index,
    +  nThread,
    +  log_files
    +)
    +
    + +
    +

    Arguments

    +
    path
    +

    Filepath for the summary statistics file to be formatted. A +dataframe or datatable of the summary statistics file can also be passed +directly to MungeSumstats using the path parameter.

    + + +
    ref_genome
    +

    name of the reference genome used for the GWAS ("GRCh37" or +"GRCh38"). Argument is case-insensitive. Default is NULL which infers the +reference genome from the data.

    + + +
    strand_ambig_filter
    +

    Binary Should SNPs with strand-ambiguous alleles +be removed. Default is FALSE.

    + + +
    log_folder_ind
    +

    Binary Should log files be stored containing all +filtered out SNPs (separate file per filter). The data is outputted in the +same format specified for the resulting sumstats file. The only exception to +this rule is if output is vcf, then log file saved as .tsv.gz. Default is +FALSE.

    + + +
    tabix_index
    +

    Index the formatted summary statistics with +tabix for fast querying.

    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + + +
    log_files
    +

    list of log file locations

    + +
    +
    +

    Value

    + + +

    list containing sumstats_dt, the modified summary statistics data +table object and the log file list

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_tabular.html b/docs/reference/check_tabular.html new file mode 100644 index 00000000..2c317a74 --- /dev/null +++ b/docs/reference/check_tabular.html @@ -0,0 +1,111 @@ + +Ensure valid tabular format — check_tabular • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Ensure valid tabular format

    +
    + +
    +
    check_tabular(header)
    +
    + +
    +

    Arguments

    +
    header
    +

    The summary statistics file for the GWAS

    + +
    +
    +

    Value

    + + +

    Whether the file is tabular

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_two_step_col.html b/docs/reference/check_two_step_col.html new file mode 100644 index 00000000..c89b8786 --- /dev/null +++ b/docs/reference/check_two_step_col.html @@ -0,0 +1,117 @@ + +Ensure that CHR:BP aren't merged into 1 column — check_two_step_col • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Ensure that CHR:BP aren't merged into 1 column

    +
    + +
    +
    check_two_step_col(sumstats_dt, path)
    +
    + +
    +

    Arguments

    +
    sumstats_dt
    +

    data table obj of the summary statistics +file for the GWAS

    + + +
    path
    +

    Filepath for the summary statistics file to be formatted

    + +
    +
    +

    Value

    + + +

    list containing sumstats_dt, the modified summary +statistics data table object

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_vcf.html b/docs/reference/check_vcf.html new file mode 100644 index 00000000..90cd5164 --- /dev/null +++ b/docs/reference/check_vcf.html @@ -0,0 +1,111 @@ + +Check if the inputted file is in VCF format — check_vcf • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Check if the inputted file is in VCF format

    +
    + +
    +
    check_vcf(header)
    +
    + +
    +

    Arguments

    +
    header
    +

    Header of the GWAS summary statistics file.

    + +
    +
    +

    Value

    + + +

    Whether the file is vcf or not

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_vital_col.html b/docs/reference/check_vital_col.html new file mode 100644 index 00000000..c3579263 --- /dev/null +++ b/docs/reference/check_vital_col.html @@ -0,0 +1,111 @@ + +Ensure that all necessary columns are in the summary statistics file — check_vital_col • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Ensure that all necessary columns are in the summary statistics file

    +
    + +
    +
    check_vital_col(sumstats_dt)
    +
    + +
    +

    Arguments

    +
    sumstats_dt
    +

    data table obj of the summary statistics file for the GWAS

    + +
    +
    +

    Value

    + + +

    null

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/check_zscore.html b/docs/reference/check_zscore.html new file mode 100644 index 00000000..a0f7f4e3 --- /dev/null +++ b/docs/reference/check_zscore.html @@ -0,0 +1,173 @@ + +Check for Z-score column — check_zscore • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    The following ensures that a Z-score column is present. +The Z-score formula we used here is an R implementation of the formula +used in LDSC's munge_sumstats.py:

    +
    + +
    +
    check_zscore(
    +  sumstats_dt,
    +  imputation_ind,
    +  compute_z = "BETA",
    +  force_new_z = FALSE,
    +  standardise_headers = FALSE,
    +  mapping_file
    +)
    +
    + +
    +

    Arguments

    +
    sumstats_dt
    +

    data table obj of the summary statistics file for the +GWAS.

    + + +
    imputation_ind
    +

    Binary Should a column be added for each imputation +step to show what SNPs have imputed values for differing fields. This +includes a field denoting SNP allele flipping (flipped). Note +these columns will be in the formatted summary statistics returned. Default +is FALSE.

    + + +
    compute_z
    +

    Whether to compute Z-score column. Default is FALSE. This +can be computed from Beta and SE with (Beta/SE) or P +(Z:=sign(BETA)*sqrt(stats::qchisq(P,1,lower=FALSE))). +Note that imputing the Z-score from P for every SNP will not be +perfectly correct and may result in a loss of power. This should only be done +as a last resort. Use 'BETA' to impute by BETA/SE and 'P' to impute by SNP +p-value.

    + + +
    force_new_z
    +

    When a "Z" column already exists, it will be used by +default. To override and compute a new Z-score column from P set +force_new_z=TRUE.

    + + +
    standardise_headers
    +

    Run +standardise_sumstats_column_headers_crossplatform first.

    + + +
    mapping_file
    +

    MungeSumstats has a pre-defined column-name mapping file +which should cover the most common column headers and their interpretations. +However, if a column header that is in your file is missing or the mapping we +give is incorrect you can supply your own mapping file. Must be a 2 column +dataframe with column names "Uncorrected" and "Corrected". See +data(sumstatsColHeaders) for default mapping and necessary format.

    + +
    +
    +

    Value

    + + +

    list("sumstats_dt"=sumstats_dt)

    + + +
    +
    +

    Details

    +

    np.sqrt(chi2.isf(P, 1))

    +

    The R implementation is adapted from the GenomicSEM::munge function, +after optimizing for speed using data.table:

    +

    sumstats_dt[,Z:=sign(BETA)*sqrt(stats::qchisq(P,1,lower=FALSE))]

    +

    NOTE: compute_z is set to TRUE by +default to ensure standardisation +of the "Z" column (which can be computed differently in different datasets).

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/column_dictionary.html b/docs/reference/column_dictionary.html new file mode 100644 index 00000000..6cabe359 --- /dev/null +++ b/docs/reference/column_dictionary.html @@ -0,0 +1,128 @@ + +Map column names to positions. — column_dictionary • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Useful in situations where you need to specify columns by +index instead of name (e.g. awk queries).

    +
    + +
    +
    column_dictionary(file_path)
    +
    + +
    +

    Source

    +

    Borrowed function from + +echotabix.

    +

    +eduAttainOkbayPth <- system.file("extdata", "eduAttainOkbay.txt", + package = "MungeSumstats" +) +tmp <- tempfile(fileext = ".tsv") +file.copy(eduAttainOkbayPth, tmp) +cdict <- MungeSumstats:::column_dictionary(file_path = tmp) +

    +
    +
    +

    Arguments

    +
    file_path
    +

    Path to full summary stats file +(or really any file you want to make a column dictionary for).

    + +
    +
    +

    Value

    + + +

    Named list of column positions.

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/compute_nsize.html b/docs/reference/compute_nsize.html new file mode 100644 index 00000000..4a0629b8 --- /dev/null +++ b/docs/reference/compute_nsize.html @@ -0,0 +1,180 @@ + +Check for N column if not present and user wants, impute N based on user's sample size. NOTE this will be the same value for each SNP which is not necessarily correct and may cause issues down the line. N can also be inputted with "ldsc", "sum", "giant" or "metal" by passing one or multiple of these. — compute_nsize • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Check for N column if not present and user wants, impute N based on user's +sample size. NOTE this will be the same value for each SNP which is not +necessarily correct and may cause issues down the line. N can also be +inputted with "ldsc", "sum", "giant" or "metal" by passing one or +multiple of these.

    +
    + +
    +
    compute_nsize(
    +  sumstats_dt,
    +  imputation_ind = FALSE,
    +  compute_n = c("ldsc", "giant", "metal", "sum"),
    +  standardise_headers = FALSE,
    +  force_new = FALSE,
    +  return_list = TRUE
    +)
    +
    + +
    +

    Arguments

    +
    sumstats_dt
    +

    data table obj of the summary statistics file for the +GWAS.

    + + +
    imputation_ind
    +

    Binary Should a column be added for each imputation +step to show what SNPs have imputed values for differing fields. This +includes a field denoting SNP allele flipping (flipped). Note +these columns will be in the formatted summary statistics returned. Default +is FALSE.

    + + +
    compute_n
    +

    How to compute per-SNP sample size (new column "N").

    • 0: N will not be computed.

    • +
    • >0: If any number >0 is provided, +that value will be set as N for every row. +Note: Computing N this way is incorrect and should be avoided +if at all possible.

    • +
    • "sum": N will be computed as: +cases (N_CAS) + controls (N_CON), so long as both columns are present.

    • +
    • "ldsc": N will be computed as effective sample size: +Neff =(N_CAS+N_CON)*(N_CAS/(N_CAS+N_CON)) / mean((N_CAS/(N_CAS+N_CON))[(N_CAS+N_CON)==max(N_CAS+N_CON)]).

    • +
    • "giant": N will be computed as effective sample size: +Neff = 2 / (1/N_CAS + 1/N_CON).

    • +
    • "metal": N will be computed as effective sample size: +Neff = 4 / (1/N_CAS + 1/N_CON).

    • +
    + + +
    standardise_headers
    +

    Standardise headers first.

    + + +
    force_new
    +

    If "Neff" (or "N") already exists in sumstats_dt, +replace it with the recomputed version.

    + + +
    return_list
    +

    Return the sumstats_dt within a named list +(default: TRUE).

    + +
    +
    +

    Value

    + + +

    list("sumstats_dt"=sumstats_dt)

    + + +
    + +
    +

    Examples

    +
    sumstats_dt <- MungeSumstats::formatted_example()
    +#> Standardising column headers.
    +#> First line of summary statistics file: 
    +#> MarkerName	CHR	POS	A1	A2	EAF	Beta	SE	Pval	
    +#> Sorting coordinates with 'data.table'.
    +sumstats_dt2 <- MungeSumstats::compute_nsize(sumstats_dt=sumstats_dt,
    +                                             compute_n=10000)
    +#> Assigning N=10000 for all SNPs.
    +
    +
    +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/compute_sample_size.html b/docs/reference/compute_sample_size.html new file mode 100644 index 00000000..7f2134cf --- /dev/null +++ b/docs/reference/compute_sample_size.html @@ -0,0 +1,165 @@ + +Compute (effective) sample size — compute_sample_size • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Computes sample sum (as new column "N") or +effective sample size (ESS) (as new column "Neff"). +Computing ESS is important as it takes into account +the proportion of cases to controls (i.e. class imbalance) so as not to +overestimate your statistical power.

    +
    + +
    +
    compute_sample_size(
    +  sumstats_dt,
    +  method = c("ldsc", "giant", "metal", "sum"),
    +  force_new = FALSE,
    +  append_method_name = FALSE
    +)
    +
    + +
    +

    Arguments

    +
    sumstats_dt
    +

    Summary statistics data.table.

    + + +
    method
    +

    Method for computing (effective) sample size.

    +
    + + +
    force_new
    +

    If "Neff" (or "N") already exists in sumstats_dt, +replace it with the recomputed version.

    + + +
    append_method_name
    +

    Should the Neff column have an indicator to explain the +method that makes it? Default is FALSE unless multiple methods are passed.

    + +
    +
    +

    Value

    + + +

    A data.table with a new column "Neff" or "N"

    +
    +
    +

    Details

    +

    There are many different formulas for calculating ESS, +but LDSC is probably the best method available here, as it +doesn't assume that the proportion of controls:cases +is 2:1 (as in GIANT) or 4:1 (as in METAL).

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/compute_sample_size_n.html b/docs/reference/compute_sample_size_n.html new file mode 100644 index 00000000..e566bd2b --- /dev/null +++ b/docs/reference/compute_sample_size_n.html @@ -0,0 +1,140 @@ + +Add user supplied sample size — compute_sample_size_n • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Add user supplied sample size

    +
    + +
    +
    compute_sample_size_n(sumstats_dt, method, force_new = FALSE)
    +
    + +
    +

    Arguments

    +
    sumstats_dt
    +

    Summary statistics data.table.

    + + +
    method
    +

    Method for computing (effective) sample size.

    +
    + + +
    force_new
    +

    If "Neff" (or "N") already exists in sumstats_dt, +replace it with the recomputed version.

    + +
    +
    +

    Value

    + + +

    No return

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/compute_sample_size_neff.html b/docs/reference/compute_sample_size_neff.html new file mode 100644 index 00000000..c1f122b8 --- /dev/null +++ b/docs/reference/compute_sample_size_neff.html @@ -0,0 +1,150 @@ + +Compute Neff/N — compute_sample_size_neff • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Compute Neff/N

    +
    + +
    +
    compute_sample_size_neff(
    +  sumstats_dt,
    +  method,
    +  force_new = FALSE,
    +  append_method_name = FALSE
    +)
    +
    + +
    +

    Arguments

    +
    sumstats_dt
    +

    Summary statistics data.table.

    + + +
    method
    +

    Method for computing (effective) sample size.

    +
    + + +
    force_new
    +

    If "Neff" (or "N") already exists in sumstats_dt, +replace it with the recomputed version.

    + + +
    append_method_name
    +

    Should the Neff column have an indicator to explain the +method that makes it? Default is FALSE unless multiple methods are passed.

    + +
    +
    +

    Value

    + + +

    No return

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/convert_sumstats.html b/docs/reference/convert_sumstats.html new file mode 100644 index 00000000..1f05eb77 --- /dev/null +++ b/docs/reference/convert_sumstats.html @@ -0,0 +1,116 @@ + +Convert summary statistics to desired object type — convert_sumstats • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Convert summary statistics to desired object type

    +
    + +
    +
    convert_sumstats(
    +  sumstats_dt,
    +  return_format = c("data.table", "vranges", "granges")
    +)
    +
    + +
    +

    Arguments

    +
    return_format
    +

    Object type to convert to; +"data.table", "GenomicRanges" or +"VRanges"(default is "data.table").

    + +
    +
    +

    Value

    + + +

    Summary statistics in the converted format

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/download_vcf.html b/docs/reference/download_vcf.html new file mode 100644 index 00000000..2e9112be --- /dev/null +++ b/docs/reference/download_vcf.html @@ -0,0 +1,169 @@ + +Download VCF file and its index file from Open GWAS — download_vcf • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Ideally, we would use gwasvcf +instead but it hasn't been made available on CRAN or Bioconductor yet, +so we can't include it as a dep.

    +
    + +
    +
    download_vcf(
    +  vcf_url,
    +  vcf_dir = tempdir(),
    +  vcf_download = TRUE,
    +  download_method = "download.file",
    +  force_new = FALSE,
    +  quiet = FALSE,
    +  timeout = 10 * 60,
    +  nThread = 1
    +)
    +
    + +
    +

    Arguments

    +
    vcf_url
    +

    Remote URL to VCF file.

    + + +
    vcf_dir
    +

    Where to download the original VCF from Open GWAS. +WARNING: This is set to tempdir() by default. +This means the raw (pre-formatted) VCFs will be deleted upon ending the R session. +Change this to keep the raw VCF file on disk +(e.g. vcf_dir="./raw_vcf").

    + + +
    vcf_download
    +

    Download the original VCF from Open GWAS.

    + + +
    download_method
    +

    "axel" (multi-threaded) or +"download.file" (single-threaded) .

    + + +
    force_new
    +

    Overwrite a previously downloaded VCF +with the same path name.

    + + +
    quiet
    +

    Run quietly.

    + + +
    timeout
    +

    How many seconds before giving up on download. +Passed to download.file. Default: 10*60 (10min).

    + + +
    nThread
    +

    Number of threads to parallelize over.

    + +
    +
    +

    Value

    + + +

    List containing the paths to the downloaded VCF and its index file.

    +
    + +
    +

    Examples

    +
    #only run the examples if user has internet access:
    +if(try(is.character(getURL("www.google.com")))==TRUE){
    +vcf_url <- "https://gwas.mrcieu.ac.uk/files/ieu-a-298/ieu-a-298.vcf.gz"
    +out_paths <- download_vcf(vcf_url = vcf_url)
    +}
    +#> Error in getURL("www.google.com") : could not find function "getURL"
    +
    +
    +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/downloader.html b/docs/reference/downloader.html new file mode 100644 index 00000000..9894eb11 --- /dev/null +++ b/docs/reference/downloader.html @@ -0,0 +1,188 @@ + +downloader wrapper — downloader • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    R wrapper for +axel +(multi-threaded) and +download.file (single-threaded) +download functions.

    +
    + +
    +
    downloader(
    +  input_url,
    +  output_path,
    +  download_method = "axel",
    +  background = FALSE,
    +  force_overwrite = FALSE,
    +  quiet = TRUE,
    +  show_progress = TRUE,
    +  continue = TRUE,
    +  nThread = 1,
    +  alternate = TRUE,
    +  check_certificates = TRUE,
    +  timeout = 10 * 60
    +)
    +
    + + +
    +

    Arguments

    +
    input_url
    +

    input_url.

    + + +
    output_path
    +

    output_path.

    + + +
    download_method
    +

    "axel" (multi-threaded) or +"download.file" (single-threaded) .

    + + +
    background
    +

    Run in background

    + + +
    force_overwrite
    +

    Overwrite existing file.

    + + +
    quiet
    +

    Run quietly.

    + + +
    show_progress
    +

    show_progress.

    + + +
    continue
    +

    continue.

    + + +
    nThread
    +

    Number of threads to parallelize over.

    + + +
    alternate
    +

    alternate.

    + + +
    check_certificates
    +

    check_certificates

    + + +
    timeout
    +

    How many seconds before giving up on download. +Passed to download.file. Default: 10*60 (10min).

    + +
    +
    +

    Value

    + + +

    Local path to downloaded file.

    +
    +
    +

    See also

    +

    Other downloaders: +axel()

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/drop_duplicate_cols.html b/docs/reference/drop_duplicate_cols.html new file mode 100644 index 00000000..7ff4c2bf --- /dev/null +++ b/docs/reference/drop_duplicate_cols.html @@ -0,0 +1,111 @@ + +Drop duplicate columns — drop_duplicate_cols • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Drop columns with identical names (if any exist) within a data.table.

    +
    + +
    +
    drop_duplicate_cols(dt)
    +
    + +
    +

    Arguments

    +
    dt
    +

    data.table

    + +
    +
    +

    Value

    + + +

    Null output

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/drop_duplicate_rows.html b/docs/reference/drop_duplicate_rows.html new file mode 100644 index 00000000..876a5296 --- /dev/null +++ b/docs/reference/drop_duplicate_rows.html @@ -0,0 +1,115 @@ + +Drop duplicate rows — drop_duplicate_rows • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Drop rows with duplicate values across all columns.

    +
    + +
    +
    drop_duplicate_rows(dt, verbose = TRUE)
    +
    + +
    +

    Arguments

    +
    dt
    +

    data.table

    + + +
    verbose
    +

    Print messages.

    + +
    +
    +

    Value

    + + +

    Filtered dt.

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/find_sumstats.html b/docs/reference/find_sumstats.html new file mode 100644 index 00000000..61d7097e --- /dev/null +++ b/docs/reference/find_sumstats.html @@ -0,0 +1,233 @@ + +Search Open GWAS for datasets matching criteria — find_sumstats • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    For each argument, searches for any datasets matching +a case-insensitive substring search in the respective metadata column. +Users can supply a single character string or a +list/vector of character strings.

    +
    + +
    +
    find_sumstats(
    +  ids = NULL,
    +  traits = NULL,
    +  years = NULL,
    +  consortia = NULL,
    +  authors = NULL,
    +  populations = NULL,
    +  categories = NULL,
    +  subcategories = NULL,
    +  builds = NULL,
    +  pmids = NULL,
    +  min_sample_size = NULL,
    +  min_ncase = NULL,
    +  min_ncontrol = NULL,
    +  min_nsnp = NULL,
    +  include_NAs = FALSE
    +)
    +
    + +
    +

    Arguments

    +
    ids
    +

    List of Open GWAS study IDs +(e.g. c("prot-a-664", "ieu-b-4760")).

    + + +
    traits
    +

    List of traits +(e.g. c("parkinson", "Alzheimer")).

    + + +
    years
    +

    List of years +(e.g. seq(2015,2021) or c(2010, 2012, 2021)).

    + + +
    consortia
    +

    List of consortia +(e.g. c("MRC-IEU","Neale Lab")).

    + + +
    authors
    +

    List of authors +(e.g. c("Elsworth","Kunkle","Neale")).

    + + +
    populations
    +

    List of populations +(e.g. c("European","Asian")).

    + + +
    categories
    +

    List of categories +(e.g. c("Binary","Continuous","Disease","Risk factor")).

    + + +
    subcategories
    +

    List of subcategories +(e.g. c("neurological","Immune","cardio")).

    + + +
    builds
    +

    List of genome builds +(e.g. c("hg19","grch37")).

    + + +
    pmids
    +

    List of PubMed ID (exact matches only) +(e.g. c(29875488, 30305740, 28240269)).

    + + +
    min_sample_size
    +

    Minimum total number of study participants +(e.g. 5000).

    + + +
    min_ncase
    +

    Minimum number of case participants +(e.g. 1000).

    + + +
    min_ncontrol
    +

    Minimum number of control participants +(e.g. 1000).

    + + +
    min_nsnp
    +

    Minimum number of SNPs +(e.g. 200000).

    + + +
    include_NAs
    +

    Include datasets with missing metadata for size criteria +(i.e. min_sample_size, min_ncase, or min_ncontrol).

    + +
    +
    +

    Value

    + + +

    (Filtered) GWAS metadata table.

    +
    +
    +

    Details

    +

    By default, returns metadata for all studies currently in Open GWAS database.

    +
    + +
    +

    Examples

    +
    # Only run the examples if user has internet access
    +# and if access token has been added
    +if(try(is.character(getURL("www.google.com")))==TRUE && ieugwasr::get_opengwas_jwt()!=""){
    +### By ID
    +metagwas <- find_sumstats(ids = c(
    +    "ieu-b-4760",
    +    "prot-a-1725",
    +    "prot-a-664"
    +))
    +### By ID and sample size
    +metagwas <- find_sumstats(
    +    ids = c("ieu-b-4760", "prot-a-1725", "prot-a-664"),
    +    min_sample_size = 5000
    +)
    +### By criteria
    +metagwas <- find_sumstats(
    +    traits = c("alzheimer", "parkinson"),
    +    years = seq(2015, 2021)
    +)
    +}
    +#> Error in getURL("www.google.com") : could not find function "getURL"
    +
    +
    +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/format_sumstats.html b/docs/reference/format_sumstats.html new file mode 100644 index 00000000..7264faff --- /dev/null +++ b/docs/reference/format_sumstats.html @@ -0,0 +1,663 @@ + +Check that summary statistics from GWAS are in a homogeneous format — format_sumstats • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Check that summary statistics from GWAS are in a homogeneous format

    +
    + +
    +
    format_sumstats(
    +  path,
    +  ref_genome = NULL,
    +  convert_ref_genome = NULL,
    +  chain_source = "ensembl",
    +  local_chain = NULL,
    +  convert_small_p = TRUE,
    +  convert_large_p = TRUE,
    +  convert_neg_p = TRUE,
    +  compute_z = FALSE,
    +  force_new_z = FALSE,
    +  compute_n = 0L,
    +  convert_n_int = TRUE,
    +  impute_beta = FALSE,
    +  es_is_beta = TRUE,
    +  impute_se = FALSE,
    +  analysis_trait = NULL,
    +  ignore_multi_trait = FALSE,
    +  INFO_filter = 0.9,
    +  FRQ_filter = 0,
    +  pos_se = TRUE,
    +  effect_columns_nonzero = FALSE,
    +  N_std = 5,
    +  N_dropNA = TRUE,
    +  chr_style = "Ensembl",
    +  rmv_chr = c("X", "Y", "MT"),
    +  on_ref_genome = TRUE,
    +  infer_eff_direction = TRUE,
    +  eff_on_minor_alleles = FALSE,
    +  strand_ambig_filter = FALSE,
    +  allele_flip_check = TRUE,
    +  allele_flip_drop = TRUE,
    +  allele_flip_z = TRUE,
    +  allele_flip_frq = TRUE,
    +  bi_allelic_filter = TRUE,
    +  flip_frq_as_biallelic = FALSE,
    +  snp_ids_are_rs_ids = TRUE,
    +  remove_multi_rs_snp = FALSE,
    +  frq_is_maf = TRUE,
    +  indels = TRUE,
    +  drop_indels = FALSE,
    +  drop_na_cols = c("SNP", "CHR", "BP", "A1", "A2", "FRQ", "BETA", "Z", "OR", "LOG_ODDS",
    +    "SIGNED_SUMSTAT", "SE", "P", "N"),
    +  dbSNP = 155,
    +  check_dups = TRUE,
    +  sort_coordinates = TRUE,
    +  nThread = 1,
    +  save_path = tempfile(fileext = ".tsv.gz"),
    +  write_vcf = FALSE,
    +  tabix_index = FALSE,
    +  return_data = FALSE,
    +  return_format = "data.table",
    +  ldsc_format = FALSE,
    +  save_format = NULL,
    +  log_folder_ind = FALSE,
    +  log_mungesumstats_msgs = FALSE,
    +  log_folder = tempdir(),
    +  imputation_ind = FALSE,
    +  force_new = FALSE,
    +  mapping_file = sumstatsColHeaders,
    +  rmv_chrPrefix = NULL
    +)
    +
    + +
    +

    Arguments

    +
    path
    +

    Filepath for the summary statistics file to be formatted. A +dataframe or datatable of the summary statistics file can also be passed +directly to MungeSumstats using the path parameter.

    + + +
    ref_genome
    +

    name of the reference genome used for the GWAS ("GRCh37" or +"GRCh38"). Argument is case-insensitive. Default is NULL which infers the +reference genome from the data.

    + + +
    convert_ref_genome
    +

    name of the reference genome to convert to +("GRCh37" or "GRCh38"). This will only occur if the current genome build does +not match. Default is not to convert the genome build (NULL).

    + + +
    chain_source
    +

    source of the chain file to use in liftover, if converting +genome build ("ucsc" or "ensembl"). Note that the UCSC chain files require a +license for commercial use. The Ensembl chain is used by default ("ensembl").

    + + +
    local_chain
    +

    Path to local chain file to use instead of downloading. +Default of NULL i.e. no local file to use. NOTE if passing a local chain file +make sure to specify the path to convert from and to the correct build like +GRCh37 to GRCh38. We can not sense check this for local files. The chain file +can be submitted as a gz file (as downloaded from source) or unzipped.

    + + +
    convert_small_p
    +

    Binary, should non-negative +p-values <= 5e-324 be converted to 0? +Small p-values pass the R limit and can cause errors with LDSC/MAGMA and +should be converted. Default is TRUE.

    + + +
    convert_large_p
    +

    Binary, should p-values >1 be converted to 1? +P-values >1 should not be possible and can cause errors with LDSC/MAGMA and +should be converted. Default is TRUE.

    + + +
    convert_neg_p
    +

    Binary, should p-values <0 be converted to 0? +Negative p-values should not be possible and can cause errors +with LDSC/MAGMA and should be converted. Default is TRUE.

    + + +
    compute_z
    +

    Whether to compute Z-score column. Default is FALSE. This +can be computed from Beta and SE with (Beta/SE) or P +(Z:=sign(BETA)*sqrt(stats::qchisq(P,1,lower=FALSE))). +Note that imputing the Z-score from P for every SNP will not be +perfectly correct and may result in a loss of power. This should only be done +as a last resort. Use 'BETA' to impute by BETA/SE and 'P' to impute by SNP +p-value.

    + + +
    force_new_z
    +

    When a "Z" column already exists, it will be used by +default. To override and compute a new Z-score column from P set +force_new_z=TRUE.

    + + +
    compute_n
    +

    Whether to impute N. Default of 0 won't impute, any other +integer will be imputed as the N (sample size) for every SNP in the dataset. +Note that imputing the sample size for every SNP is not correct and +should only be done as a last resort. N can also be inputted with "ldsc", +"sum", "giant" or "metal" by passing one of these for this field or a vector +of multiple. Sum and an integer value creates an N column in the output +whereas giant, metal or ldsc create an Neff or effective sample size. If +multiples are passed, the formula used to derive it will be indicated.

    + + +
    convert_n_int
    +

    Binary, if N (the number of samples) is not an integer, +should this be rounded? Default is TRUE.

    + + +
    impute_beta
    +

    Binary, whether BETA should be imputed using other effect +data if it isn't present in the sumstats. Note that this imputation is an +approximation (for Z & SE approach) so could have an effect on downstream +analysis. Use with caution. The different methods MungeSumstats will try and +impute beta (in this order of priority) are:

    1. log(OR) 2. Z x SE +Default value is FALSE.

    2. +
    + + +
    es_is_beta
    +

    Binary, whether to map ES to BETA. We take BETA to be any +BETA-like value (including Effect Size). If this is not the case for your +sumstats, change this to FALSE. Default is TRUE.

    + + +
    impute_se
    +

    Binary, whether the standard error should be imputed using +other effect data if it isn't present in the sumstats. Note that this +imputation is an approximation so could have an effect on downstream +analysis. Use with caution. The different methods MungeSumstats will try and +impute se (in this order of priority) are:

    1. BETA / Z 2. abs(BETA/ qnorm(P/2)) +Default is FALSE.

    2. +
    + + +
    analysis_trait
    +

    If multiple traits were studied, name of the trait for +analysis from the GWAS. Default is NULL.

    + + +
    ignore_multi_trait
    +

    If you have multiple traits (p-values) in the study +but you want to ignore these and instead use a standard named p-value, set +to TRUE. By default is FALSE which will check for multi-traits.

    + + +
    INFO_filter
    +

    numeric The minimum value permissible of the imputation +information score (if present in sumstats file). Default 0.9.

    + + +
    FRQ_filter
    +

    numeric The minimum value permissible of the frequency(FRQ) +of the SNP (i.e. Allele Frequency (AF)) (if present in sumstats file). By +default no filtering is done, i.e. value of 0.

    + + +
    pos_se
    +

    Binary Should the standard Error (SE) column be checked to +ensure it is greater than 0? Those that are not are removed (if present in +sumstats file). Default TRUE.

    + + +
    effect_columns_nonzero
    +

    Binary should the effect columns in the data +BETA,OR (odds ratio),LOG_ODDS,SIGNED_SUMSTAT be checked to ensure no SNP=0. +Those that do are removed (if present in sumstats file). Default FALSE.

    + + +
    N_std
    +

    numeric The number of standard deviations above the mean a SNP's +N is needed to be removed. Default is 5.

    + + +
    N_dropNA
    +

    Drop rows where N is missing. Default is TRUE.

    + + +
    chr_style
    +

    Chromosome naming style to use in the formatted summary +statistics file ("NCBI", "UCSC", "dbSNP", or "Ensembl"). The NCBI and +Ensembl styles both code chromosomes as 1-22, X, Y, MT; the UCSC style is +chr1-chr22, chrX, chrY, chrM; and the dbSNP style is +ch1-ch22, chX, chY, chMT. Default is Ensembl.

    + + +
    rmv_chr
    +

    Chromosomes to exclude from the formatted summary statistics +file. Use NULL if no filtering is necessary. Default is c("X", "Y", "MT") +which removes all non-autosomal SNPs.

    + + +
    on_ref_genome
    +

    Binary Should a check take place that all SNPs are on +the reference genome by SNP ID. Default is TRUE.

    + + +
    infer_eff_direction
    +

    Binary Should a check take place to ensure the +alleles match the effect direction? Default is TRUE.

    + + +
    eff_on_minor_alleles
    +

    Binary Should MungeSumstats assume that the +effects are majoritively measured on the minor alleles? Default is FALSE as +this is an assumption that won't be appropriate in all cases. However, the +benefit is that if we know the majority of SNPs have their effects based on +the minor alleles, we can catch cases where the allele columns have been +mislabelled.

    + + +
    strand_ambig_filter
    +

    Binary Should SNPs with strand-ambiguous alleles +be removed. Default is FALSE.

    + + +
    allele_flip_check
    +

    Binary Should the allele columns be checked against +reference genome to infer if flipping is necessary. Default is TRUE.

    + + +
    allele_flip_drop
    +

    Binary Should the SNPs for which neither their A1 or +A2 base pair values match a reference genome be dropped. Default is TRUE.

    + + +
    allele_flip_z
    +

    Binary should the Z-score be flipped along with effect +and FRQ columns like Beta? It is assumed to be calculated off the effect size +not the P-value and so will be flipped i.e. default TRUE.

    + + +
    allele_flip_frq
    +

    Binary should the frequency (FRQ) column be flipped +along with effect and z-score columns like Beta? Default TRUE.

    + + +
    bi_allelic_filter
    +

    Binary Should non-bi-allelic SNPs be removed. +Default is TRUE.

    + + +
    flip_frq_as_biallelic
    +

    Binary Should non-bi-allelic SNPs frequency +values be flipped as 1-p despite there being other alternative alleles? +Default is FALSE but if set to TRUE, this allows non-bi-allelic SNPs to be +kept despite needing flipping.

    + + +
    snp_ids_are_rs_ids
    +

    Binary Should the supplied SNP ID's be assumed to +be RSIDs. If not, imputation using the SNP ID for other columns like +base-pair position or chromosome will not be possible. If set to FALSE, the +SNP RS ID will be imputed from the reference genome if possible. Default is +TRUE.

    + + +
    remove_multi_rs_snp
    +

    Binary Sometimes summary statistics can have +multiple RSIDs on one row (i.e. related to one SNP), for example +"rs5772025_rs397784053". This can cause an error so by default, the first +RS ID will be kept and the rest removed e.g."rs5772025". If you want to just +remove these SNPs entirely, set it to TRUE. Default is FALSE.

    + + +
    frq_is_maf
    +

    Conventionally the FRQ column is intended to show the +minor/effect allele frequency (MAF) but sometimes the major allele frequency +can be inferred as the FRQ column. This logical variable indicates that the +FRQ column should be renamed to MAJOR_ALLELE_FRQ if the frequency values +appear to relate to the major allele i.e. >0.5. By default this mapping won't +occur i.e. is TRUE.

    + + +
    indels
    +

    Binary does your Sumstats file contain Indels? These don't +exist in our reference file so they will be excluded from checks if this +value is TRUE. Default is TRUE.

    + + +
    drop_indels
    +

    Binary, should any indels found in the sumstats be +dropped? These can not be checked against a reference dataset and will have +the same RS ID and position as SNPs which can affect downstream analysis. +Default is False.

    + + +
    drop_na_cols
    +

    A character vector of column names to be checked for +missing values. Rows with missing values in any of these columns (if present +in the dataset) will be dropped. If NULL, all columns will be checked for +missing values. Default columns are SNP, chromosome, position, allele 1, +allele2, effect columns (frequency, beta, Z-score, standard error, log odds, +signed sumstats, odds ratio), p value and N columns.

    + + +
    dbSNP
    +

    version of dbSNP to be used for imputation (144 or 155).

    + + +
    check_dups
    +

    whether to check for duplicates - if formatting QTL +datasets this should be set to FALSE otherwise keep as TRUE. Default is TRUE.

    + + +
    sort_coordinates
    +

    Whether to sort by coordinates of resulting sumstats

    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + + +
    save_path
    +

    File path to save formatted data. Defaults to +tempfile(fileext=".tsv.gz").

    + + +
    write_vcf
    +

    Whether to write as VCF (TRUE) or tabular file (FALSE).

    + + +
    tabix_index
    +

    Index the formatted summary statistics with +tabix for fast querying.

    + + +
    return_data
    +

    Return data.table, GRanges or VRanges +directly to user. Otherwise, return the path to the save data. Default is +FALSE.

    + + +
    return_format
    +

    If return_data is TRUE. Object type to be returned +("data.table","vranges","granges").

    + + +
    ldsc_format
    +

    DEPRECATED, do not use. Use save_format="LDSC" instead.

    + + +
    save_format
    +

    Output format of sumstats. Options are NULL - standardised +output format from MungeSumstats, LDSC - output format compatible with LDSC +and openGWAS - output compatible with openGWAS VCFs. Default is NULL. +NOTE - If LDSC format is used, the naming convention of A1 as the +reference (genome build) allele and A2 as the effect allele will be reversed +to match LDSC (A1 will now be the effect allele). See more info on this +here. Note that any +effect columns (e.g. Z) will be in relation to A1 now instead of A2.

    + + +
    log_folder_ind
    +

    Binary Should log files be stored containing all +filtered out SNPs (separate file per filter). The data is outputted in the +same format specified for the resulting sumstats file. The only exception to +this rule is if output is vcf, then log file saved as .tsv.gz. Default is +FALSE.

    + + +
    log_mungesumstats_msgs
    +

    Binary Should a log be stored containing all +messages and errors printed by MungeSumstats in a run. Default is FALSE

    + + +
    log_folder
    +

    Filepath to the directory for the log files and the log of +MungeSumstats messages to be stored. Default is a temporary directory. Note +the name of the log files (log messages and log outputs) are now the same as +the name of the file specified in the save path parameter with the extension +'_log_msg.txt' and '_log_output.txt' respectively.

    + + +
    imputation_ind
    +

    Binary Should a column be added for each imputation +step to show what SNPs have imputed values for differing fields. This +includes a field denoting SNP allele flipping (flipped). On the flipped +value, this denotes whether the alleles were switched based on +MungeSumstats initial choice of A1, A2 from the input column headers and thus +may not align with what the creator intended. Note these columns will be +in the formatted summary statistics returned. Default is FALSE.

    + + +
    force_new
    +

    If a formatted file of the same name as save_path +exists, formatting will be skipped and this file will be imported instead +(default). Set force_new=TRUE to override this.

    + + +
    mapping_file
    +

    MungeSumstats has a pre-defined column-name mapping file +which should cover the most common column headers and their interpretations. +However, if a column header that is in your file is missing or the mapping we +give is incorrect you can supply your own mapping file. Must be a 2 column +dataframe with column names "Uncorrected" and "Corrected". See +data(sumstatsColHeaders) for default mapping and necessary format.

    + + +
    rmv_chrPrefix
    +

    Is now deprecated, do not use. Use chr_style instead - +chr_style = 'Ensembl' will give the same result as rmv_chrPrefix=TRUE used to +give.

    + +
    +
    +

    Value

    + + +

    The address for the modified sumstats file or the actual data +dependent on user choice. Also, if log files are wanted by the user, the return +in both above instances is a list.

    +
    + +
    +

    Examples

    +
    # Pass path to Educational Attainment Okbay sumstat file to a temp directory
    +
    +eduAttainOkbayPth <- system.file("extdata", "eduAttainOkbay.txt",
    +    package = "MungeSumstats"
    +)
    +
    +## Call uses reference genome as default with more than 2GB of memory,
    +## which is more than what 32-bit Windows can handle so remove certain checks
    +## Using dbSNP = 144 for speed as it's smaller but you should use 155 unless
    +## you know what you are doing and need 144
    +
    +is_32bit_windows <-
    +    .Platform$OS.type == "windows" && .Platform$r_arch == "i386"
    +if (!is_32bit_windows) {
    +    reformatted <- format_sumstats(
    +        path = eduAttainOkbayPth,
    +        ref_genome = "GRCh37",
    +        dbSNP = 144
    +    )
    +} else {
    +    reformatted <- format_sumstats(
    +        path = eduAttainOkbayPth,
    +        ref_genome = "GRCh37",
    +        on_ref_genome = FALSE,
    +        strand_ambig_filter = FALSE,
    +        bi_allelic_filter = FALSE,
    +        allele_flip_check = FALSE,
    +        dbSNP=144
    +    )
    +}
    +#> 
    +#> 
    +#> ******::NOTE::******
    +#>  - Formatted results will be saved to `tempdir()` by default.
    +#>  - This means all formatted summary stats will be deleted upon ending the R session.
    +#>  - To keep formatted summary stats, change `save_path`  ( e.g. `save_path=file.path('./formatted',basename(path))` ),   or make sure to copy files elsewhere after processing  ( e.g. `file.copy(save_path, './formatted/' )`.
    +#>  ******************** 
    +#> Formatted summary statistics will be saved to ==>  /var/folders/hd/jm8lzp7s4dl_wlkykzhz66x80000gn/T//Rtmp4DII6I/filec16d74d43914.tsv.gz
    +#> Warning: replacing previous import ‘utils::findMatches’ by ‘S4Vectors::findMatches’ when loading ‘SNPlocs.Hsapiens.dbSNP144.GRCh37’
    +#> Importing tabular file: /private/var/folders/hd/jm8lzp7s4dl_wlkykzhz66x80000gn/T/RtmpKLvRpi/temp_libpath17f3d19176b21/MungeSumstats/extdata/eduAttainOkbay.txt
    +#> Checking for empty columns.
    +#> Infer Effect Column
    +#> First line of summary statistics file: 
    +#> MarkerName	CHR	POS	A1	A2	EAF	Beta	SE	Pval	
    +#> Allele columns are ambiguous, attempting to infer direction
    +#> Standardising column headers.
    +#> First line of summary statistics file: 
    +#> MarkerName	CHR	POS	A1	A2	EAF	Beta	SE	Pval	
    +#> Loading SNPlocs data.
    +#> Loading reference genome data.
    +#> Preprocessing RSIDs.
    +#> Validating RSIDs of 93 SNPs using BSgenome::snpsById...
    +#> BSgenome::snpsById done in 25 seconds.
    +#> Effect/frq column(s) relate to A2 in the inputted sumstats
    +#> Found direction from matching reference genome - NOTE this assumes non-effect allele will match the reference genome
    +#> Standardising column headers.
    +#> First line of summary statistics file: 
    +#> MarkerName	CHR	POS	A1	A2	EAF	Beta	SE	Pval	
    +#> Summary statistics report:
    +#>    - 93 rows
    +#>    - 93 unique variants
    +#>    - 70 genome-wide significant variants (P<5e-8)
    +#>    - 20 chromosomes
    +#> Checking for multi-GWAS.
    +#> Checking for multiple RSIDs on one row.
    +#> Checking SNP RSIDs.
    +#> Checking for merged allele column.
    +#> Checking A1 is uppercase
    +#> Checking A2 is uppercase
    +#> Checking for incorrect base-pair positions
    +#> Ensuring all SNPs are on the reference genome.
    +#> Loading SNPlocs data.
    +#> Loading reference genome data.
    +#> Preprocessing RSIDs.
    +#> Validating RSIDs of 93 SNPs using BSgenome::snpsById...
    +#> BSgenome::snpsById done in 14 seconds.
    +#> Checking for correct direction of A1 (reference) and A2 (alternative allele).
    +#> There are 46 SNPs where A1 doesn't match the reference genome.
    +#> These will be flipped with their effect columns.
    +#> Checking for missing data.
    +#> Checking for duplicate columns.
    +#> Checking for duplicate SNPs from SNP ID.
    +#> Checking for SNPs with duplicated base-pair positions.
    +#> INFO column not available. Skipping INFO score filtering step.
    +#> Filtering SNPs, ensuring SE>0.
    +#> Ensuring all SNPs have N<5 std dev above mean.
    +#> Checking for bi-allelic SNPs.
    +#> 67 SNPs (72%) have FRQ values > 0.5. Conventionally the FRQ column is intended to show the minor/effect allele frequency.
    +#> The FRQ column was mapped from one of the following from the inputted  summary statistics file:
    +#> FRQ, EAF, FREQUENCY, FRQ_U, F_U, MAF, FREQ, FREQ_TESTED_ALLELE, FRQ_TESTED_ALLELE, FREQ_EFFECT_ALLELE, FRQ_EFFECT_ALLELE, EFFECT_ALLELE_FREQUENCY, EFFECT_ALLELE_FREQ, EFFECT_ALLELE_FRQ, A2FREQ, A2FRQ, ALLELE_FREQUENCY, ALLELE_FREQ, ALLELE_FRQ, AF, MINOR_AF, EFFECT_AF, A2_AF, EFF_AF, ALT_AF, ALTERNATIVE_AF, INC_AF, A_2_AF, TESTED_AF, ALLELEFREQ, ALT_FREQ, EAF_HRC, EFFECTALLELEFREQ, FREQ.B, FREQ_EUROPEAN_1000GENOMES, FREQ_HAPMAP, FREQ_TESTED_ALLELE_IN_HRS, FRQ_U_113154, FRQ_U_31358, FRQ_U_344901, FRQ_U_43456, POOLED_ALT_AF, AF_ALT, AF.ALT, AF-ALT, ALT.AF, ALT-AF, A2.AF, A2-AF, AF.EFF, AF_EFF, ALL_AF
    +#> As frq_is_maf=TRUE, the FRQ column will not be renamed. If the FRQ values were intended to represent major allele frequency,
    +#> set frq_is_maf=FALSE to rename the column as MAJOR_ALLELE_FRQ and differentiate it from minor/effect allele frequency.
    +#> Sorting coordinates with 'data.table'.
    +#> Writing in tabular format ==> /var/folders/hd/jm8lzp7s4dl_wlkykzhz66x80000gn/T//Rtmp4DII6I/filec16d74d43914.tsv.gz
    +#> Summary statistics report:
    +#>    - 93 rows (100% of original 93 rows)
    +#>    - 93 unique variants
    +#>    - 70 genome-wide significant variants (P<5e-8)
    +#>    - 20 chromosomes
    +#> Done munging in 0.753 minutes.
    +#> Successfully finished preparing sumstats file, preview:
    +#> Reading header.
    +#>           SNP   CHR       BP     A1     A2     FRQ   BETA    SE         P
    +#>        <char> <int>    <int> <char> <char>   <num>  <num> <num>     <num>
    +#> 1:   rs301800     1  8490603      T      C 0.17910  0.019 0.003 1.794e-08
    +#> 2: rs11210860     1 43982527      G      A 0.63060 -0.017 0.003 2.359e-10
    +#> 3: rs34305371     1 72733610      G      A 0.91231 -0.035 0.005 3.762e-14
    +#> 4:  rs2568955     1 72762169      T      C 0.23690 -0.017 0.003 1.797e-08
    +#> Returning path to saved data.
    +# returned location has the updated summary statistics file
    +
    +
    +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/formatted_example.html b/docs/reference/formatted_example.html new file mode 100644 index 00000000..2850342d --- /dev/null +++ b/docs/reference/formatted_example.html @@ -0,0 +1,140 @@ + +Formatted example — formatted_example • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Returns an example of summary stats that have had their column names +already standardised with +standardise_header.

    +
    + +
    +
    formatted_example(
    +  path = system.file("extdata", "eduAttainOkbay.txt", package = "MungeSumstats"),
    +  formatted = TRUE,
    +  sorted = TRUE
    +)
    +
    + +
    +

    Arguments

    +
    path
    +

    Path to raw example file. Default to built-in dataset.

    + + +
    formatted
    +

    Whether the column names should be formatted +(default:TRUE).

    + + +
    sorted
    +

    Whether the rows should be sorted by genomic coordinates +(default:TRUE).

    + +
    +
    +

    Value

    + + +

    sumstats_dt

    + + +
    + +
    +

    Examples

    +
    sumstats_dt <- MungeSumstats::formatted_example()
    +#> Standardising column headers.
    +#> First line of summary statistics file: 
    +#> MarkerName	CHR	POS	A1	A2	EAF	Beta	SE	Pval	
    +#> Sorting coordinates with 'data.table'.
    +
    +
    +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/get_chain_file.html b/docs/reference/get_chain_file.html new file mode 100644 index 00000000..8fa1e5a9 --- /dev/null +++ b/docs/reference/get_chain_file.html @@ -0,0 +1,140 @@ + +Download chain file for liftover — get_chain_file • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Download chain file for liftover

    +
    + +
    +
    get_chain_file(
    +  from = c("hg38", "hg19"),
    +  to = c("hg19", "hg38"),
    +  chain_source = c("ucsc", "ensembl"),
    +  save_dir = tempdir(),
    +  verbose = TRUE
    +)
    +
    + + +
    +

    Arguments

    +
    from
    +

    genome build converted from ("hg38", "hg19")

    + + +
    to
    +

    genome build converted to ("hg19", "hg38")

    + + +
    chain_source
    +

    chain file source used ("ucsc" as default, or "ensembl")

    + + +
    save_dir
    +

    where is the chain file saved? Default is a temp directory

    + + +
    verbose
    +

    extra messages printed? Default is TRUE

    + +
    +
    +

    Value

    + + +

    loaded chain file for liftover

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/get_eff_frq_allele_combns.html b/docs/reference/get_eff_frq_allele_combns.html new file mode 100644 index 00000000..a1dc57b7 --- /dev/null +++ b/docs/reference/get_eff_frq_allele_combns.html @@ -0,0 +1,124 @@ + +Get combinations of uncorrected allele and effect (and frq) columns — get_eff_frq_allele_combns • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Get combinations of uncorrected allele and effect (and frq) columns

    +
    + +
    +
    get_eff_frq_allele_combns(
    +  mapping_file = sumstatsColHeaders,
    +  eff_frq_cols = c("BETA", "OR", "LOG_ODDS", "SIGNED_SUMSTAT", "Z", "FRQ")
    +)
    +
    + +
    +

    Arguments

    +
    mapping_file
    +

    MungeSumstats has a pre-defined column-name mapping file +which should cover the most common column headers and their interpretations. +However, if a column header that is in your file is missing or the mapping we +give is incorrect you can supply your own mapping file. Must be a 2 column +dataframe with column names "Uncorrected" and "Corrected". See +data(sumstatsColHeaders) for default mapping and necessary format.

    + + +
    eff_frq_cols
    +

    Corrected effect or frequency column names found in a +sumstats. Default of BETA, OR, LOG_ODDS, SIGNED_SUMSTAT, Z and FRQ.

    + +
    +
    +

    Value

    + + +

    datatable containing uncorrected and corrected combinations

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/get_genome_build.html b/docs/reference/get_genome_build.html new file mode 100644 index 00000000..139e5729 --- /dev/null +++ b/docs/reference/get_genome_build.html @@ -0,0 +1,178 @@ + +Infers the genome build of the summary statistics file (GRCh37 or GRCh38) from the data. Uses SNP (RSID) & CHR & BP to get genome build. — get_genome_build • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Infers the genome build of the summary statistics file (GRCh37 or GRCh38) +from the data. Uses SNP (RSID) & CHR & BP to get genome build.

    +
    + +
    +
    get_genome_build(
    +  sumstats,
    +  nThread = 1,
    +  sampled_snps = 10000,
    +  standardise_headers = TRUE,
    +  mapping_file = sumstatsColHeaders,
    +  dbSNP = 155,
    +  header_only = FALSE,
    +  allele_match_ref = FALSE,
    +  ref_genome = NULL,
    +  chr_filt = NULL
    +)
    +
    + +
    +

    Arguments

    +
    sumstats
    +

    data table/data frame obj of the summary statistics file for +the GWAS, or file path to summary statistics file.

    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + + +
    sampled_snps
    +

    Downsample the number of SNPs used when inferring genome +build to save time.

    + + +
    standardise_headers
    +

    Run +standardise_sumstats_column_headers_crossplatform.

    + + +
    mapping_file
    +

    MungeSumstats has a pre-defined +column-name mapping file +which should cover the most common column headers and their interpretations. +However, if a column header that is in your file is missing or the mapping we +give is incorrect you can supply your own mapping file. Must be a 2 column +dataframe with column names "Uncorrected" and "Corrected". See +data(sumstatsColHeaders) for default mapping and necessary format.

    + + +
    dbSNP
    +

    version of dbSNP to be used (144 or 155). Default is 155.

    + + +
    header_only
    +

    Instead of reading in the entire sumstats file, +only read in the first N rows where N=sampled_snps. +This should help speed up cases where you have to read in sumstats +from disk each time.

    + + +
    allele_match_ref
    +

    Instead of returning the genome_build this will +return the proportion of matches to each genome build for each allele +(A1,A2).

    + + +
    ref_genome
    +

    name of the reference genome used for the GWAS ("GRCh37" or +"GRCh38"). Argument is case-insensitive. Default is NULL which infers the +reference genome from the data.

    + + +
    chr_filt
    +

    Internal for testing - filter reference genomes and sumstats +to specific chromosomes for testing. Pass a list of chroms in format: +c("1","2"). Default is NULL i.e. no filtering

    + +
    +
    +

    Value

    + + +

    ref_genome the genome build of the data

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/get_genome_builds.html b/docs/reference/get_genome_builds.html new file mode 100644 index 00000000..0bc6e014 --- /dev/null +++ b/docs/reference/get_genome_builds.html @@ -0,0 +1,204 @@ + +Infer genome builds — get_genome_builds • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Infers the genome build of summary statistics files (GRCh37 or GRCh38) +from the data. Uses SNP (RSID) & CHR & BP to get genome build.

    +
    + +
    +
    get_genome_builds(
    +  sumstats_list,
    +  header_only = TRUE,
    +  sampled_snps = 10000,
    +  names_from_paths = FALSE,
    +  dbSNP = 155,
    +  nThread = 1,
    +  chr_filt = NULL
    +)
    +
    + +
    +

    Arguments

    +
    sumstats_list
    +

    A named list of paths to summary statistics, +or a named list of data.table objects.

    + + +
    header_only
    +

    Instead of reading in the entire sumstats file, +only read in the first N rows where N=sampled_snps. +This should help speed up cases where you have to read in sumstats +from disk each time.

    + + +
    sampled_snps
    +

    Downsample the number of SNPs used when inferring genome +build to save time.

    + + +
    names_from_paths
    +

    Infer the name of each item in sumstats_list +from its respective file path. +Only works if sumstats_list is a list of paths.

    + + +
    dbSNP
    +

    version of dbSNP to be used (144 or 155). Default is 155.

    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + + +
    chr_filt
    +

    Internal for testing - filter reference genomes and sumstats +to specific chromosomes for testing. Pass a list of chroms in format: +c("1","2"). Default is NULL i.e. no filtering

    + +
    +
    +

    Value

    + + +

    ref_genome the genome build of the data

    +
    +
    +

    Details

    +

    Iterative version of get_genome_build.

    +
    + +
    +

    Examples

    +
    # Pass path to Educational Attainment Okbay sumstat file to a temp directory
    +
    +eduAttainOkbayPth <- system.file("extdata", "eduAttainOkbay.txt",
    +    package = "MungeSumstats"
    +)
    +sumstats_list <- list(ss1 = eduAttainOkbayPth, ss2 = eduAttainOkbayPth)
    +
    +## Call uses reference genome as default with more than 2GB of memory,
    +## which is more than what 32-bit Windows can handle so remove certain checks
    +is_32bit_windows <-
    +    .Platform$OS.type == "windows" && .Platform$r_arch == "i386"
    +if (!is_32bit_windows) {
    +    
    +    #multiple sumstats can be passed at once to get all their genome builds:
    +    #ref_genomes <- get_genome_builds(sumstats_list = sumstats_list)
    +    #just passing first here for speed
    +    sumstats_list_quick <- list(ss1 = eduAttainOkbayPth)
    +    ref_genomes <- get_genome_builds(sumstats_list = sumstats_list_quick,
    +                                     dbSNP=144)
    +}
    +#> Inferring genome build of 1 sumstats file(s).
    +#> Inferring genome build.
    +#> Reading in only the first 10000 rows of sumstats.
    +#> Importing tabular file: /private/var/folders/hd/jm8lzp7s4dl_wlkykzhz66x80000gn/T/RtmpKLvRpi/temp_libpath17f3d19176b21/MungeSumstats/extdata/eduAttainOkbay.txt
    +#> Checking for empty columns.
    +#> Standardising column headers.
    +#> First line of summary statistics file: 
    +#> MarkerName	CHR	POS	A1	A2	EAF	Beta	SE	Pval	
    +#> Loading SNPlocs data.
    +#> Loading reference genome data.
    +#> Preprocessing RSIDs.
    +#> Validating RSIDs of 93 SNPs using BSgenome::snpsById...
    +#> BSgenome::snpsById done in 16 seconds.
    +#> Loading SNPlocs data.
    +#> Warning: replacing previous import ‘utils::findMatches’ by ‘S4Vectors::findMatches’ when loading ‘SNPlocs.Hsapiens.dbSNP144.GRCh38’
    +#> Loading reference genome data.
    +#> Preprocessing RSIDs.
    +#> Validating RSIDs of 93 SNPs using BSgenome::snpsById...
    +#> BSgenome::snpsById done in 29 seconds.
    +#> Inferred genome build: GRCH37
    +#> Time difference of 47.96309 secs
    +#> GRCH37: 1 file(s)
    +
    +
    +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/get_unique_name_log_file.html b/docs/reference/get_unique_name_log_file.html new file mode 100644 index 00000000..bff09d15 --- /dev/null +++ b/docs/reference/get_unique_name_log_file.html @@ -0,0 +1,117 @@ + +Simple function to ensure the new entry name to a list doesn't have the same name as another entry — get_unique_name_log_file • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Simple function to ensure the new entry name to a list doesn't have the same +name as another entry

    +
    + +
    +
    get_unique_name_log_file(name, log_files)
    +
    + +
    +

    Arguments

    +
    name
    +

    proposed name for the entry

    + + +
    log_files
    +

    list of log file locations

    + +
    +
    +

    Value

    + + +

    a unique name (character)

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/get_vcf_sample_ids.html b/docs/reference/get_vcf_sample_ids.html new file mode 100644 index 00000000..cad4cfab --- /dev/null +++ b/docs/reference/get_vcf_sample_ids.html @@ -0,0 +1,113 @@ + +Get VCF sample ID(s) — get_vcf_sample_ids • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Get VCF sample ID(s)

    +
    + +
    +
    get_vcf_sample_ids(path)
    +
    + +
    +

    Arguments

    +
    path
    +

    Filepath for the summary statistics file to be formatted. A +dataframe or datatable of the summary statistics file can also be passed +directly to MungeSumstats using the path parameter.

    + +
    +
    +

    Value

    + + +

    sample_id

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/granges_to_dt.html b/docs/reference/granges_to_dt.html new file mode 100644 index 00000000..658eee8c --- /dev/null +++ b/docs/reference/granges_to_dt.html @@ -0,0 +1,116 @@ + +GenomicRanges to data.table — granges_to_dt • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Convert a GRanges into a data.table.

    +
    + +
    +
    granges_to_dt(gr)
    +
    + + +
    +

    Arguments

    +
    gr
    +

    A GRanges object.

    + +
    +
    +

    Value

    + + +

    A data.table object.

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/hg19ToHg38.html b/docs/reference/hg19ToHg38.html new file mode 100644 index 00000000..3ad0770e --- /dev/null +++ b/docs/reference/hg19ToHg38.html @@ -0,0 +1,119 @@ + +UCSC Chain file hg19 to hg38 — hg19ToHg38 • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    UCSC Chain file hg19 to hg38, .chain.gz file, downloaded from +https://hgdownload.cse.ucsc.edu/goldenpath/hg19/liftOver/ on 09/10/21

    +
    + + +
    +

    Format

    +

    gunzipped chain file

    +
    +
    +

    Source

    +

    The chain file was downloaded from +https://hgdownload.cse.ucsc.edu/goldenpath/hg19/liftOver/ + +utils::download.file('ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz',tempdir()) +

    +
    +
    +

    Details

    +

    UCSC Chain file hg19 to hg38, .chain.gz file, downloaded on 09/10/21 +To be used as a back up if the download from UCSC fails.

    +
    +
    +

    hg19ToHg38.over.chain.gz

    +

    NA

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/hg38ToHg19.html b/docs/reference/hg38ToHg19.html new file mode 100644 index 00000000..dcdf1293 --- /dev/null +++ b/docs/reference/hg38ToHg19.html @@ -0,0 +1,119 @@ + +UCSC Chain file hg38 to hg19 — hg38ToHg19 • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    UCSC Chain file hg38 to hg19, .chain.gz file, downloaded from +https://hgdownload.cse.ucsc.edu/goldenpath/hg38/liftOver/ on 09/10/21

    +
    + + +
    +

    Format

    +

    gunzipped chain file

    +
    +
    +

    Source

    +

    The chain file was downloaded from +https://hgdownload.cse.ucsc.edu/goldenpath/hg38/liftOver/ + +utils::download.file('ftp://hgdownload.cse.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz',tempdir()) +

    +
    +
    +

    Details

    +

    UCSC Chain file hg38 to hg19, .chain.gz file, downloaded on 09/10/21 +To be used as a back up if the download from UCSC fails.

    +
    +
    +

    hg38ToHg19.over.chain.gz

    +

    NA

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/ieu-a-298.html b/docs/reference/ieu-a-298.html new file mode 100644 index 00000000..416593f7 --- /dev/null +++ b/docs/reference/ieu-a-298.html @@ -0,0 +1,116 @@ + +Local ieu-a-298 file from IEU Open GWAS — ieu-a-298 • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Local ieu-a-298 file from IEU Open GWAS, downloaded on 09/10/21.

    +
    + + +
    +

    Format

    +

    gunzipped tsv file

    +
    +
    +

    Source

    +

    The file was downloaded with: + +MungeSumstats::import_sumstats(ids = "ieu-a-298",ref_genome = "GRCH37") +

    +
    +
    +

    Details

    +

    Local ieu-a-298 file from IEU Open GWAS, downloaded on 09/10/21. +This is done in case the download in the package vignette fails.

    +
    +
    +

    ieu-a-298.tsv.gz

    +

    NA

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/import_sumstats.html b/docs/reference/import_sumstats.html new file mode 100644 index 00000000..2d0c22db --- /dev/null +++ b/docs/reference/import_sumstats.html @@ -0,0 +1,515 @@ + +Import full genome-wide GWAS summary statistics from Open GWAS — import_sumstats • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Requires internet access to run.

    +
    + +
    +
    import_sumstats(
    +  ids,
    +  vcf_dir = tempdir(),
    +  vcf_download = TRUE,
    +  save_dir = tempdir(),
    +  write_vcf = FALSE,
    +  download_method = "download.file",
    +  quiet = TRUE,
    +  force_new = FALSE,
    +  force_new_vcf = FALSE,
    +  nThread = 1,
    +  parallel_across_ids = FALSE,
    +  ...
    +)
    +
    + +
    +

    Arguments

    +
    ids
    +

    List of Open GWAS study IDs +(e.g. c("prot-a-664", "ieu-b-4760")).

    + + +
    vcf_dir
    +

    Where to download the original VCF from Open GWAS. +WARNING: This is set to tempdir() by default. +This means the raw (pre-formatted) VCFs will be deleted upon ending the R session. +Change this to keep the raw VCF file on disk +(e.g. vcf_dir="./raw_vcf").

    + + +
    vcf_download
    +

    Download the original VCF from Open GWAS.

    + + +
    save_dir
    +

    Directory to save formatted summary statistics in.

    + + +
    write_vcf
    +

    Whether to write as VCF (TRUE) or tabular file (FALSE).

    + + +
    download_method
    +

    "axel" (multi-threaded) or +"download.file" (single-threaded) .

    + + +
    quiet
    +

    Run quietly.

    + + +
    force_new
    +

    If a formatted file of the same name as save_path +exists, formatting will be skipped and this file will be imported instead +(default). Set force_new=TRUE to override this.

    + + +
    force_new_vcf
    +

    Overwrite a previously downloaded VCF +with the same path name.

    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + + +
    parallel_across_ids
    +

    If parallel_across_ids=TRUE +and nThread>1, +then each ID in ids will be processed in parallel.

    + + +
    ...
    +

    Arguments passed on to format_sumstats

    path
    +

    Filepath for the summary statistics file to be formatted. A +dataframe or datatable of the summary statistics file can also be passed +directly to MungeSumstats using the path parameter.

    + +
    ref_genome
    +

    name of the reference genome used for the GWAS ("GRCh37" or +"GRCh38"). Argument is case-insensitive. Default is NULL which infers the +reference genome from the data.

    + +
    convert_ref_genome
    +

    name of the reference genome to convert to +("GRCh37" or "GRCh38"). This will only occur if the current genome build does +not match. Default is not to convert the genome build (NULL).

    + +
    chain_source
    +

    source of the chain file to use in liftover, if converting +genome build ("ucsc" or "ensembl"). Note that the UCSC chain files require a +license for commercial use. The Ensembl chain is used by default ("ensembl").

    + +
    local_chain
    +

    Path to local chain file to use instead of downloading. +Default of NULL i.e. no local file to use. NOTE if passing a local chain file +make sure to specify the path to convert from and to the correct build like +GRCh37 to GRCh38. We can not sense check this for local files. The chain file +can be submitted as a gz file (as downloaded from source) or unzipped.

    + +
    convert_small_p
    +

    Binary, should non-negative +p-values <= 5e-324 be converted to 0? +Small p-values pass the R limit and can cause errors with LDSC/MAGMA and +should be converted. Default is TRUE.

    + +
    convert_large_p
    +

    Binary, should p-values >1 be converted to 1? +P-values >1 should not be possible and can cause errors with LDSC/MAGMA and +should be converted. Default is TRUE.

    + +
    convert_neg_p
    +

    Binary, should p-values <0 be converted to 0? +Negative p-values should not be possible and can cause errors +with LDSC/MAGMA and should be converted. Default is TRUE.

    + +
    compute_z
    +

    Whether to compute Z-score column. Default is FALSE. This +can be computed from Beta and SE with (Beta/SE) or P +(Z:=sign(BETA)*sqrt(stats::qchisq(P,1,lower=FALSE))). +Note that imputing the Z-score from P for every SNP will not be +perfectly correct and may result in a loss of power. This should only be done +as a last resort. Use 'BETA' to impute by BETA/SE and 'P' to impute by SNP +p-value.

    + +
    force_new_z
    +

    When a "Z" column already exists, it will be used by +default. To override and compute a new Z-score column from P set +force_new_z=TRUE.

    + +
    compute_n
    +

    Whether to impute N. Default of 0 won't impute, any other +integer will be imputed as the N (sample size) for every SNP in the dataset. +Note that imputing the sample size for every SNP is not correct and +should only be done as a last resort. N can also be inputted with "ldsc", +"sum", "giant" or "metal" by passing one of these for this field or a vector +of multiple. Sum and an integer value creates an N column in the output +whereas giant, metal or ldsc create an Neff or effective sample size. If +multiples are passed, the formula used to derive it will be indicated.

    + +
    convert_n_int
    +

    Binary, if N (the number of samples) is not an integer, +should this be rounded? Default is TRUE.

    + +
    impute_beta
    +

    Binary, whether BETA should be imputed using other effect +data if it isn't present in the sumstats. Note that this imputation is an +approximation (for Z & SE approach) so could have an effect on downstream +analysis. Use with caution. The different methods MungeSumstats will try and +impute beta (in this order or priority) are:

    1. log(OR) 2. Z x SE +Default value is FALSE.

    2. +
    + +
    es_is_beta
    +

    Binary, whether to map ES to BETA. We take BETA to be any +BETA-like value (including Effect Size). If this is not the case for your +sumstats, change this to FALSE. Default is TRUE.

    + +
    impute_se
    +

    Binary, whether the standard error should be imputed using +other effect data if it isn't present in the sumstats. Note that this +imputation is an approximation so could have an effect on downstream +analysis. Use with caution. The different methods MungeSumstats will try and +impute se (in this order or priority) are:

    1. BETA / Z 2. abs(BETA/ qnorm(P/2)) +Default is FALSE.

    2. +
    + +
    analysis_trait
    +

    If multiple traits were studied, name of the trait for +analysis from the GWAS. Default is NULL.

    + +
    ignore_multi_trait
    +

    If you have multiple traits (p-values) in the study +but you want to ignore these and instead use a standard named p-value, set +to TRUE. By default is FALSE which will check for multi-traits.

    + +
    INFO_filter
    +

    numeric The minimum value permissible of the imputation +information score (if present in sumstats file). Default 0.9.

    + +
    FRQ_filter
    +

    numeric The minimum value permissible of the frequency(FRQ) +of the SNP (i.e. Allele Frequency (AF)) (if present in sumstats file). By +default no filtering is done, i.e. value of 0.

    + +
    pos_se
    +

    Binary Should the standard Error (SE) column be checked to +ensure it is greater than 0? Those that are not are removed (if present in +sumstats file). Default TRUE.

    + +
    effect_columns_nonzero
    +

    Binary should the effect columns in the data +BETA,OR (odds ratio),LOG_ODDS,SIGNED_SUMSTAT be checked to ensure no SNP=0. +Those that are are removed (if present in sumstats file). Default FALSE.

    + +
    N_std
    +

    numeric The number of standard deviations above the mean a SNP's +N is needed to be removed. Default is 5.

    + +
    N_dropNA
    +

    Drop rows where N is missing. Default is TRUE.

    + +
    chr_style
    +

    Chromosome naming style to use in the formatted summary +statistics file ("NCBI", "UCSC", "dbSNP", or "Ensembl"). The NCBI and +Ensembl styles both code chromosomes as 1-22, X, Y, MT; the UCSC style is +chr1-chr22, chrX, chrY, chrM; and the dbSNP style is +ch1-ch22, chX, chY, chMT. Default is Ensembl.

    + +
    rmv_chrPrefix
    +

    Is now deprecated, do not use. Use chr_style instead - +chr_style = 'Ensembl' will give the same result as rmv_chrPrefix=TRUE used to +give.

    + +
    rmv_chr
    +

    Chromosomes to exclude from the formatted summary statistics +file. Use NULL if no filtering is necessary. Default is c("X", "Y", "MT") +which removes all non-autosomal SNPs.

    + +
    on_ref_genome
    +

    Binary Should a check take place that all SNPs are on +the reference genome by SNP ID. Default is TRUE.

    + +
    infer_eff_direction
    +

    Binary Should a check take place to ensure the +alleles match the effect direction? Default is TRUE.

    + +
    eff_on_minor_alleles
    +

    Binary Should MungeSumstats assume that the +effects are predominantly measured on the minor alleles? Default is FALSE as +this is an assumption that won't be appropriate in all cases. However, the +benefit is that if we know the majority of SNPs have their effects based on +the minor alleles, we can catch cases where the allele columns have been +mislabelled.

    + +
    strand_ambig_filter
    +

    Binary Should SNPs with strand-ambiguous alleles +be removed. Default is FALSE.

    + +
    allele_flip_check
    +

    Binary Should the allele columns be checked against +reference genome to infer if flipping is necessary. Default is TRUE.

    + +
    allele_flip_drop
    +

    Binary Should the SNPs for which neither their A1 or +A2 base pair values match a reference genome be dropped. Default is TRUE.

    + +
    allele_flip_z
    +

    Binary should the Z-score be flipped along with effect +and FRQ columns like Beta? It is assumed to be calculated off the effect size +not the P-value and so will be flipped i.e. default TRUE.

    + +
    allele_flip_frq
    +

    Binary should the frequency (FRQ) column be flipped +along with effect and z-score columns like Beta? Default TRUE.

    + +
    bi_allelic_filter
    +

    Binary Should non-bi-allelic SNPs be removed. +Default is TRUE.

    + +
    flip_frq_as_biallelic
    +

    Binary Should non-bi-allelic SNPs frequency +values be flipped as 1-p despite there being other alternative alleles? +Default is FALSE but if set to TRUE, this allows non-bi-allelic SNPs to be +kept despite needing flipping.

    + +
    snp_ids_are_rs_ids
    +

    Binary Should the supplied SNP ID's be assumed to +be RSIDs. If not, imputation using the SNP ID for other columns like +base-pair position or chromosome will not be possible. If set to FALSE, the +SNP RS ID will be imputed from the reference genome if possible. Default is +TRUE.

    + +
    remove_multi_rs_snp
    +

    Binary Sometimes summary statistics can have +multiple RSIDs on one row (i.e. related to one SNP), for example +"rs5772025_rs397784053". This can cause an error so by default, the first +RS ID will be kept and the rest removed e.g."rs5772025". If you want to just +remove these SNPs entirely, set it to TRUE. Default is FALSE.

    + +
    frq_is_maf
    +

    Conventionally the FRQ column is intended to show the +minor/effect allele frequency (MAF) but sometimes the major allele frequency +can be inferred as the FRQ column. This logical variable indicates that the +FRQ column should be renamed to MAJOR_ALLELE_FRQ if the frequency values +appear to relate to the major allele i.e. >0.5. By default this mapping won't +occur i.e. is TRUE.

    + +
    indels
    +

    Binary does your Sumstats file contain Indels? These don't +exist in our reference file so they will be excluded from checks if this +value is TRUE. Default is TRUE.

    + +
    drop_indels
    +

    Binary, should any indels found in the sumstats be +dropped? These can not be checked against a reference dataset and will have +the same RS ID and position as SNPs which can affect downstream analysis. +Default is False.

    + +
    drop_na_cols
    +

    A character vector of column names to be checked for +missing values. Rows with missing values in any of these columns (if present +in the dataset) will be dropped. If NULL, all columns will be checked for +missing values. Default columns are SNP, chromosome, position, allele 1, +allele2, effect columns (frequency, beta, Z-score, standard error, log odds, +signed sumstats, odds ratio), p value and N columns.

    + +
    dbSNP
    +

    version of dbSNP to be used for imputation (144 or 155).

    + +
    check_dups
    +

    whether to check for duplicates - if formatting QTL +datasets this should be set to FALSE otherwise keep as TRUE. Default is TRUE.

    + +
    sort_coordinates
    +

    Whether to sort by coordinates of resulting sumstats

    + +
    save_path
    +

    File path to save formatted data. Defaults to +tempfile(fileext=".tsv.gz").

    + +
    tabix_index
    +

    Index the formatted summary statistics with +tabix for fast querying.

    + +
    return_data
    +

    Return data.table, GRanges or VRanges +directly to user. Otherwise, return the path to the save data. Default is +FALSE.

    + +
    return_format
    +

    If return_data is TRUE. Object type to be returned +("data.table","vranges","granges").

    + +
    ldsc_format
    +

    DEPRECATED, do not use. Use save_format="LDSC" instead.

    + +
    save_format
    +

    Output format of sumstats. Options are NULL - standardised +output format from MungeSumstats, LDSC - output format compatible with LDSC +and openGWAS - output compatible with openGWAS VCFs. Default is NULL. +NOTE - If LDSC format is used, the naming convention of A1 as the +reference (genome build) allele and A2 as the effect allele will be reversed +to match LDSC (A1 will now be the effect allele). See more info on this +here. Note that any +effect columns (e.g. Z) will be in relation to A1 now instead of A2.

    + +
    log_folder_ind
    +

    Binary Should log files be stored containing all +filtered out SNPs (separate file per filter). The data is outputted in the +same format specified for the resulting sumstats file. The only exception to +this rule is if output is vcf, then log file saved as .tsv.gz. Default is +FALSE.

    + +
    log_mungesumstats_msgs
    +

    Binary Should a log be stored containing all +messages and errors printed by MungeSumstats in a run. Default is FALSE

    + +
    log_folder
    +

    Filepath to the directory for the log files and the log of +MungeSumstats messages to be stored. Default is a temporary directory. Note +the name of the log files (log messages and log outputs) are now the same as +the name of the file specified in the save path parameter with the extension +'_log_msg.txt' and '_log_output.txt' respectively.

    + +
    imputation_ind
    +

    Binary Should a column be added for each imputation +step to show what SNPs have imputed values for differing fields. This +includes a field denoting SNP allele flipping (flipped). On the flipped +value, this denotes whether the alleles were switched based on +MungeSumstats' initial choice of A1, A2 from the input column headers and thus +may not align with what the creator intended. Note these columns will be +in the formatted summary statistics returned. Default is FALSE.

    + +
    mapping_file
    +

    MungeSumstats has a pre-defined column-name mapping file +which should cover the most common column headers and their interpretations. +However, if a column header that is in your file is missing or the mapping we +give is incorrect you can supply your own mapping file. Must be a 2 column +dataframe with column names "Uncorrected" and "Corrected". See +data(sumstatsColHeaders) for default mapping and necessary format.

    + + +
    + +
    +
    +

    Value

    + + +

    Either a named list of data objects or paths, +depending on the arguments passed to format_sumstats.

    +
    + +
    +

    Examples

    +
    #only run the examples if user has internet access:
    +if(try(is.character(getURL("www.google.com")))==TRUE){
    +### Search by criteria
    +metagwas <- find_sumstats(
    +    traits = c("parkinson", "alzheimer"),
    +    min_sample_size = 5000
    +)
    +### Only use a subset for testing purposes
    +ids <- (dplyr::arrange(metagwas, nsnp))$id
    +
    +### Default usage
    +## You can supply \code{import_sumstats()}
    +## with a list of as many OpenGWAS IDs as you want,
    +## but we'll just give one to save time.
    +
    +## Call uses reference genome as default with more than 2GB of memory,
    +## which is more than what 32-bit Windows can handle so remove certain checks
    +## commented out down to runtime
    +# datasets <- import_sumstats(ids = ids[1])
    +}
    +#> Error in getURL("www.google.com") : could not find function "getURL"
    +
    +
    +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/index.html b/docs/reference/index.html new file mode 100644 index 00000000..409ae819 --- /dev/null +++ b/docs/reference/index.html @@ -0,0 +1,209 @@ + +Function reference • MungeSumstats + + +
    +
    + + + +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +

    All functions

    +

    +
    +

    check_ldsc_format()

    +

    Ensures that parameters are compatible with LDSC format

    +

    compute_nsize()

    +

    Check for N column if not present and user wants, impute N based on user's sample size. NOTE this will be the same value for each SNP which is not necessarily correct and may cause issues down the line. N can also be inputted with "ldsc", "sum", "giant" or "metal" by passing one or multiple of these.

    +

    download_vcf()

    +

    Download VCF file and its index file from Open GWAS

    +

    find_sumstats()

    +

    Search Open GWAS for datasets matching criteria

    +

    format_sumstats()

    +

    Check that summary statistics from GWAS are in a homogeneous format

    +

    formatted_example()

    +

    Formatted example

    +

    get_eff_frq_allele_combns()

    +

    Get combinations of uncorrected allele and effect (and frq) columns

    +

    get_genome_builds()

    +

    Infer genome builds

    +

    hg19ToHg38

    +

    UCSC Chain file hg19 to hg38

    +

    hg38ToHg19

    +

    UCSC Chain file hg38 to hg19

    +

    ieu-a-298

    +

    Local ieu-a-298 file from IEU Open GWAS

    +

    import_sumstats()

    +

    Import full genome-wide GWAS summary statistics from Open GWAS

    +

    index_tabular()

    +

    Tabix-index file: table

    +

    infer_effect_column()

    +

    Infer if effect relates to A1 or A2 if ambiguously named

    +

    liftover()

    +

    Genome build liftover

    +

    list_sumstats()

    +

    List munged summary statistics

    +

    load_ref_genome_data()

    +

    Load the reference genome data for SNPs of interest

    +

    load_snp_loc_data()

    +

    Loads the SNP locations and alleles for Homo sapiens extracted from NCBI dbSNP Build 144. Reference genome version is dependent on user input.

    +

    parse_logs()

    +

    Parse data from log files

    +

    raw_ALSvcf

    +

    GWAS Amyotrophic lateral sclerosis ieu open GWAS project - Subset

    +

    raw_eduAttainOkbay

    +

    GWAS Educational Attainment Okbay 2016 - Subset

    +

    read_header()

    +

    Read in file header

    +

    read_sumstats()

    +

    Determine summary statistics file type and read them into memory

    +

    read_vcf()

    +

    Read in VCF file

    +

    register_cores()

    +

    Register cores

    +

    standardise_header()

    +

    Standardise the column headers in the Summary Statistics files

    +

    sumstatsColHeaders

    +

    Summary Statistics Column Headers

    +

    vcf2df()

    +

    VCF to DF

    +

    write_sumstats()

    +

    Write sum stats file to disk

    + + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/index_tabular.html b/docs/reference/index_tabular.html new file mode 100644 index 00000000..513d46f6 --- /dev/null +++ b/docs/reference/index_tabular.html @@ -0,0 +1,179 @@ + +Tabix-index file: table — index_tabular • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Convert summary stats file to tabix format.

    +
    + +
    +
    index_tabular(
    +  path,
    +  chrom_col = "CHR",
    +  start_col = "BP",
    +  end_col = start_col,
    +  overwrite = TRUE,
    +  remove_tmp = TRUE,
    +  verbose = TRUE
    +)
    +
    + +
    +

    Source

    +

    Borrowed function from + +echotabix.

    +
    +
    +

    Arguments

    +
    path
    +

    Path to GWAS summary statistics file.

    + + +
    chrom_col
    +

    Name of the chromosome column in +sumstats_dt (e.g. "CHR").

    + + +
    start_col
    +

    Name of the starting genomic position +column in sumstats_dt (e.g. "POS","start").

    + + +
    end_col
    +

    Name of the ending genomic position +column in sumstats_dt (e.g. "POS","end"). +Can be the same as start_col when sumstats_dt +only contains SNPs that span 1 base pair (bp) each.

    + + +
    overwrite
    +

    A logical(1) indicating whether dest should + be over-written, if it already exists.

    + + +
    remove_tmp
    +

    Remove the temporary uncompressed version of the file +(.tsv).

    + + +
    verbose
    +

    Print messages.

    + +
    +
    +

    Value

    + + +

    Path to tabix-indexed tabular file

    +
    +
    +

    See also

    +

    Other tabix: +index_vcf()

    +
    + +
    +

    Examples

    +
    sumstats_dt <- MungeSumstats::formatted_example() 
    +#> Standardising column headers.
    +#> First line of summary statistics file: 
    +#> MarkerName	CHR	POS	A1	A2	EAF	Beta	SE	Pval	
    +#> Sorting coordinates with 'data.table'.
    +path <- tempfile(fileext = ".tsv")
    +MungeSumstats::write_sumstats(sumstats_dt = sumstats_dt, save_path = path)
    +#> Writing in tabular format ==> /var/folders/hd/jm8lzp7s4dl_wlkykzhz66x80000gn/T//Rtmp4DII6I/filec16d4d6776ee.tsv
    +indexed_file <- MungeSumstats::index_tabular(path = path)
    +#> Converting full summary stats file to tabix format for fast querying...
    +#> Reading header.
    +#> Ensuring file is bgzipped.
    +#> Tabix-indexing file.
    +#> Removing temporary .tsv file.
    +
    +
    +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/index_vcf.html b/docs/reference/index_vcf.html new file mode 100644 index 00000000..5b669e66 --- /dev/null +++ b/docs/reference/index_vcf.html @@ -0,0 +1,151 @@ + +Tabix-index file: VCF — index_vcf • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Convert summary stats file to tabix format

    +
    + +
    +
    index_vcf(path, verbose = TRUE)
    +
    + +
    +

    Source

    +

    Borrowed function from + +echotabix.

    +
    +
    +

    Arguments

    +
    path
    +

    Path to VCF.

    + + +
    verbose
    +

    Print messages.

    + +
    +
    +

    Value

    + + +

    Path to tabix-indexed tabular file

    +
    +
    +

    See also

    +

    Other tabix: +index_tabular()

    +
    + +
    +

    Examples

    +
    eduAttainOkbayPth <- system.file("extdata", "eduAttainOkbay.txt",
    +                                 package = "MungeSumstats")
    +sumstats_dt <- data.table::fread(eduAttainOkbayPth, nThread = 1)
    +sumstats_dt <- 
    +MungeSumstats:::standardise_sumstats_column_headers_crossplatform(
    +    sumstats_dt = sumstats_dt)$sumstats_dt
    +#> Standardising column headers.
    +#> First line of summary statistics file: 
    +#> MarkerName	CHR	POS	A1	A2	EAF	Beta	SE	Pval	
    +sumstats_dt <- MungeSumstats:::sort_coords(sumstats_dt = sumstats_dt)
    +#> Sorting coordinates with 'data.table'.
    +path <- tempfile(fileext = ".tsv")
    +MungeSumstats::write_sumstats(sumstats_dt = sumstats_dt, save_path = path)
    +#> Writing in tabular format ==> /var/folders/hd/jm8lzp7s4dl_wlkykzhz66x80000gn/T//Rtmp4DII6I/filec16d1d8e92cf.tsv
    +    
    +indexed_file <- MungeSumstats::index_tabular(path = path)
    +#> Converting full summary stats file to tabix format for fast querying...
    +#> Reading header.
    +#> Ensuring file is bgzipped.
    +#> Tabix-indexing file.
    +#> Removing temporary .tsv file.
    +
    +
    +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/infer_effect_column.html b/docs/reference/infer_effect_column.html new file mode 100644 index 00000000..c14003b1 --- /dev/null +++ b/docs/reference/infer_effect_column.html @@ -0,0 +1,231 @@ + +Infer if effect relates to a1 or A2 if ambiguously named — infer_effect_column • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Three checks are made to infer which allele the effect/frequency information +relates to if they are ambiguous (named A0, A1 and A2 or equivalent):

    1. Check if ambiguous naming conventions are used (i.e. allele 0, 1 and 2 or +equivalent). If not, exit; otherwise continue to the next checks. This can be +checked by using the mapping file and splitting A1/A2 mappings by those that +contain 0, 1 or 2 (ambiguous) or don't contain 0, 1 or 2 e.g. effect, +tested (unambiguous so fine for MSS to handle as is).

    +
    2. Look for effect column/frequency column where the A0/A1/A2 is explicitly +mentioned, if found then we know the direction and should update A0/A1/A2 +naming so A2 is the effect column. We can look for such columns by getting +every combination of A0/A1/A2 naming and effect/frq naming.

    +
    3. If not found in 2, a final check should be against the reference genome, +whichever of A0, A1 and A2 has more of a match with the reference genome +should be taken as not the effect allele. There is an assumption in this +but it is still better than guessing the ambiguous allele naming.

    +
    + +
    +
    infer_effect_column(
    +  sumstats_dt,
    +  dbSNP = 155,
    +  sampled_snps = 10000,
    +  mapping_file = sumstatsColHeaders,
    +  nThread = nThread,
    +  ref_genome = NULL,
    +  on_ref_genome = TRUE,
    +  infer_eff_direction = TRUE,
    +  eff_on_minor_alleles = FALSE,
    +  return_list = TRUE
    +)
    +
    + +
    +

    Arguments

    +
    sumstats_dt
    +

    data table obj of the summary statistics file for the +GWAS.

    + + +
    dbSNP
    +

    version of dbSNP to be used for imputation (144 or 155).

    + + +
    sampled_snps
    +

    Downsample the number of SNPs used when inferring genome +build to save time.

    + + +
    mapping_file
    +

    MungeSumstats has a pre-defined column-name mapping file +which should cover the most common column headers and their interpretations. +However, if a column header that is in your file is missing or the mapping we +give is incorrect you can supply your own mapping file. Must be a 2 column +dataframe with column names "Uncorrected" and "Corrected". See +data(sumstatsColHeaders) for default mapping and necessary format.

    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + + +
    ref_genome
    +

    name of the reference genome used for the GWAS ("GRCh37" or +"GRCh38"). Argument is case-insensitive. Default is NULL which infers the +reference genome from the data.

    + + +
    on_ref_genome
    +

    Binary Should a check take place that all SNPs are on +the reference genome by SNP ID. Default is TRUE.

    + + +
    infer_eff_direction
    +

    Binary Should a check take place to ensure the +alleles match the effect direction? Default is TRUE.

    + + +
    eff_on_minor_alleles
    +

    Binary Should MungeSumstats assume that the +effects are predominantly measured on the minor alleles? Default is FALSE as +this is an assumption that won't be appropriate in all cases. However, the +benefit is that if we know the majority of SNPs have their effects based on +the minor alleles, we can catch cases where the allele columns have been +mislabelled.

    + + +
    return_list
    +

    Return the sumstats_dt within a named list +(default: TRUE).

    + +
    +
    +

    Value

    + + +

    list containing sumstats_dt, the modified summary statistics data +table object

    +
    +
    +

    Details

    +

    Also, if eff_on_minor_alleles=TRUE, check 3 will be used in all cases. +However, this assumes that the effects are predominantly measured on the +minor alleles and should be used with caution as this is an assumption that +won't be appropriate in all cases. However, the benefit is that if we know +the majority of SNPs have their effects based on the minor alleles, we can +catch cases where the allele columns have been mislabelled. If +eff_on_minor_alleles=TRUE, checks 1 and 2 will be skipped.

    +
    + +
    +

    Examples

    +
    sumstats <- MungeSumstats::formatted_example()
    +#> Standardising column headers.
    +#> First line of summary statistics file: 
    +#> MarkerName	CHR	POS	A1	A2	EAF	Beta	SE	Pval	
    +#> Sorting coordinates with 'data.table'.
    +#for speed, don't run on_ref_genome part of check (on_ref_genome = FALSE)
    +sumstats_dt2<-infer_effect_column(sumstats_dt=sumstats,on_ref_genome = FALSE)
    +#> Infer Effect Column
    +#> First line of summary statistics file: 
    +#> SNP	CHR	BP	A1	A2	FRQ	BETA	SE	P	
    +#> Allele columns are ambiguous, attempting to infer direction
    +#> Can't infer allele columns from sumstats
    +
    +
    +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/is_tabix.html b/docs/reference/is_tabix.html new file mode 100644 index 00000000..f77a0dcb --- /dev/null +++ b/docs/reference/is_tabix.html @@ -0,0 +1,114 @@ + +Is tabix — is_tabix • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Is a file bgz-compressed and tabix-indexed.

    +
    + +
    +
    is_tabix(path)
    +
    + +
    +

    Arguments

    +
    path
    +

    Path to file.

    + +
    +
    +

    Value

    + + +

    logical: whether the file is tabix-indexed or not.

    + + +

    logical

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/liftover.html b/docs/reference/liftover.html new file mode 100644 index 00000000..3ad70a38 --- /dev/null +++ b/docs/reference/liftover.html @@ -0,0 +1,221 @@ + +Genome build liftover — liftover • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Transfer genomic coordinates from one genome build to another.

    +
    + +
    +
    liftover(
    +  sumstats_dt,
    +  convert_ref_genome,
    +  ref_genome,
    +  chain_source = "ensembl",
    +  imputation_ind = TRUE,
    +  chrom_col = "CHR",
    +  start_col = "BP",
    +  end_col = start_col,
    +  as_granges = FALSE,
    +  style = "NCBI",
    +  local_chain = NULL,
    +  verbose = TRUE
    +)
    +
    + + +
    +

    Arguments

    +
    sumstats_dt
    +

    data table obj of the summary statistics +file for the GWAS.

    + + +
    convert_ref_genome
    +

    name of the reference genome to convert to +("GRCh37" or "GRCh38"). This will only occur if the current genome build does +not match. Default is not to convert the genome build (NULL).

    + + +
    ref_genome
    +

    name of the reference genome used for the GWAS ("GRCh37" or +"GRCh38"). Argument is case-insensitive. Default is NULL which infers the +reference genome from the data.

    + + +
    chain_source
    +

    chain file source used ("ensembl" as default, or "ucsc")

    + + +
    imputation_ind
    +

    Binary Should a column be added for each imputation +step to show what SNPs have imputed values for differing fields. This +includes a field denoting SNP allele flipping (flipped). On the flipped +value, this denotes whether the alleles were switched based on +MungeSumstats' initial choice of A1, A2 from the input column headers and thus +may not align with what the creator intended. Note these columns will be +in the formatted summary statistics returned. Default is FALSE.

    + + +
    chrom_col
    +

    Name of the chromosome column in +sumstats_dt (e.g. "CHR").

    + + +
    start_col
    +

    Name of the starting genomic position +column in sumstats_dt (e.g. "POS","start").

    + + +
    end_col
    +

    Name of the ending genomic position +column in sumstats_dt (e.g. "POS","end"). +Can be the same as start_col when sumstats_dt +only contains SNPs that span 1 base pair (bp) each.

    + + +
    as_granges
    +

    Return results as GRanges +instead of a data.table (default: FALSE).

    + + +
    style
    +

    Style to return GRanges object in +(e.g. "NCBI" = 4; "UCSC" = "chr4";) (default: "NCBI").

    + + +
    local_chain
    +

    Path to local chain file to use instead of downloading. +Default of NULL i.e. no local file to use. NOTE if passing a local chain file +make sure to specify the path to convert from and to the correct build like +GRCh37 to GRCh38. We cannot sense check this for local files. The chain file +can be submitted as a gz file (as downloaded from source) or unzipped.

    + + +
    verbose
    +

    Print messages.

    + +
    +
    +

    Value

    + + +

    Lifted summary stats in data.table

    + + +

    or GRanges format.

    +
    + +
    +

    Examples

    +
    sumstats_dt <- MungeSumstats::formatted_example()
    +#> Standardising column headers.
    +#> First line of summary statistics file: 
    +#> MarkerName	CHR	POS	A1	A2	EAF	Beta	SE	Pval	
    +#> Sorting coordinates with 'data.table'.
    +
    +sumstats_dt_hg38 <- liftover(sumstats_dt=sumstats_dt, 
    +                             ref_genome = "hg19",
    +                             convert_ref_genome="hg38")
    +#> Performing data liftover from hg19 to hg38.
    +#> Converting summary statistics to GenomicRanges.
    +#> Downloading chain file...
    +#> Downloading chain file from Ensembl.
    +#> /var/folders/hd/jm8lzp7s4dl_wlkykzhz66x80000gn/T//Rtmp4DII6I/GRCh37_to_GRCh38.chain.gz
    +#> Reordering so first three column headers are SNP, CHR and BP in this order.
    +#> Reordering so the fourth and fifth columns are A1 and A2.
    +
    +
    +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/list_sumstats.html b/docs/reference/list_sumstats.html new file mode 100644 index 00000000..b801f69f --- /dev/null +++ b/docs/reference/list_sumstats.html @@ -0,0 +1,142 @@ + +List munged summary statistics — list_sumstats • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Searches for and lists local GWAS summary statistics files munged by +format_sumstats or +import_sumstats.

    +
    + +
    +
    list_sumstats(
    +  save_dir = getwd(),
    +  pattern = "*.tsv.gz$",
    +  ids_from_file = TRUE,
    +  verbose = TRUE
    +)
    +
    + +
    +

    Arguments

    +
    save_dir
    +

    Top-level directory to recursively search +for summary statistics files within.

    + + +
    pattern
    +

    Regex pattern to search for files with.

    + + +
    ids_from_file
    +

    Try to extract dataset IDs from file names. +If FALSE, will infer IDs from the directory names instead.

    + + +
    verbose
    +

    Print messages.

    + +
    +
    +

    Value

    + + +

    Named vector of summary stats paths.

    +
    + +
    +

    Examples

    +
    save_dir <- system.file("extdata",package = "MungeSumstats")
    +munged_files <- MungeSumstats::list_sumstats(save_dir = save_dir)
    +#> 1 file(s) found.
    +
    +
    +
    +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/load_ref_genome_data.html b/docs/reference/load_ref_genome_data.html new file mode 100644 index 00000000..4d2241e3 --- /dev/null +++ b/docs/reference/load_ref_genome_data.html @@ -0,0 +1,146 @@ + +Load the reference genome data for SNPs of interest — load_ref_genome_data • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Load the reference genome data for SNPs of interest

    +
    + +
    +
    load_ref_genome_data(
    +  snps,
    +  ref_genome,
    +  dbSNP = c(144, 155),
    +  msg = NULL,
    +  chr_filt = NULL
    +)
    +
    + +
    +

    Source

    +

    +sumstats_dt <- formatted_example() +rsids <- MungeSumstats:::load_ref_genome_data(snps = sumstats_dt$SNP, + ref_genome = "GRCH37", + dbSNP=144) +

    +
    +
    +

    Arguments

    +
    snps
    +

    Character vector SNPs by rs_id from sumstats file of interest.

    + + +
    ref_genome
    +

    Name of the reference genome used for the GWAS +(GRCh37 or GRCh38)

    + + +
    dbSNP
    +

    version of dbSNP to be used (144 or 155)

    + + +
    msg
    +

    Optional name of the column missing from the dataset in question. +Default is NULL

    + + +
    chr_filt
    +

    Internal for testing - filter reference genomes and sumstats +to specific chromosomes for testing. Pass a list of chroms in format: +c("1","2"). Default is NULL i.e. no filtering.

    + +
    +
    +

    Value

    + + +

    data table of snpsById, filtered to SNPs of interest.

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/load_snp_loc_data.html b/docs/reference/load_snp_loc_data.html new file mode 100644 index 00000000..a997bf8c --- /dev/null +++ b/docs/reference/load_snp_loc_data.html @@ -0,0 +1,129 @@ + +Loads the SNP locations and alleles for Homo sapiens extracted from NCBI dbSNP Build 144. Reference genome version is dependent on user input. — load_snp_loc_data • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Loads the SNP locations and alleles for Homo sapiens extracted from +NCBI dbSNP Build 144. Reference genome version is dependent on user input.

    +
    + +
    +
    load_snp_loc_data(ref_genome, dbSNP = c(144, 155), msg = NULL)
    +
    + +
    +

    Arguments

    +
    ref_genome
    +

    name of the reference genome used for the GWAS +(GRCh37 or GRCh38)

    + + +
    dbSNP
    +

    version of dbSNP to be used (144 or 155)

    + + +
    msg
    +

    Optional name of the column missing from the dataset in question

    + +
    +
    +

    Value

    + + +

    SNP_LOC_DATA SNP positions and alleles for Homo sapiens extracted +from NCBI dbSNP Build 144

    +
    + +
    +

    Examples

    +
    SNP_LOC_DATA <- load_snp_loc_data("GRCH37",dbSNP=144)
    +#> Loading SNPlocs data.
    +
    +
    +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/logs_example.html b/docs/reference/logs_example.html new file mode 100644 index 00000000..e6606303 --- /dev/null +++ b/docs/reference/logs_example.html @@ -0,0 +1,135 @@ + +Example logs file — logs_example • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Example logs file produced by format_sumstats.

    +
    + +
    +
    logs_example(read = FALSE)
    +
    + +
    +

    Source

    +

    +eduAttainOkbayPth <- system.file("extdata", "eduAttainOkbay.txt", + package = "MungeSumstats") +sumstats_dt <- data.table::fread(eduAttainOkbayPth) +#### Introduce values that need to be fixed #### +sumstats_dt$Pval[10:15] <- 5 +sumstats_dt$Pval[20:22] <- -5 +sumstats_dt$Pval[23:25] <- "5e-324" +ss_path <- tempfile() +data.table::fwrite(sumstats_dt, ss_path) +log_folder <- tempdir() +reformatted <- MungeSumstats::format_sumstats( + path = ss_path, + ref_genome = "GRCh37", + log_folder = log_folder, + log_mungesumstats_msgs = TRUE, + log_folder_ind = TRUE, +) +file.copy(reformatted$log_files$MungeSumstats_log_msg, + "inst/extdata",overwrite = TRUE) +

    +
    +
    +

    Arguments

    +
    read
    +

    Whether to read the logs file into memory.

    + +
    +
    +

    Value

    + + +

    Path to logs file.

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/make_allele_upper.html b/docs/reference/make_allele_upper.html new file mode 100644 index 00000000..b4364ecc --- /dev/null +++ b/docs/reference/make_allele_upper.html @@ -0,0 +1,112 @@ + +Ensure A1 and A2 are upper case — make_allele_upper • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Ensure A1 and A2 are upper case

    +
    + +
    +
    make_allele_upper(sumstats_dt, log_files)
    +
    + +
    +

    Arguments

    +
    log_files
    +

    list of log file locations

    + +
    +
    +

    Value

    + + +

    list containing sumstats_dt, the modified summary statistics data +table object and the log file list

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/message_parallel.html b/docs/reference/message_parallel.html new file mode 100644 index 00000000..6b6bbe74 --- /dev/null +++ b/docs/reference/message_parallel.html @@ -0,0 +1,105 @@ + +Send messages to console even from within parallel processes — message_parallel • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Send messages to console even from within parallel processes

    +
    + +
    +
    message_parallel(...)
    +
    + +
    +

    Value

    + + +

    A message

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/messager.html b/docs/reference/messager.html new file mode 100644 index 00000000..d53ecd4b --- /dev/null +++ b/docs/reference/messager.html @@ -0,0 +1,115 @@ + +Print messages — messager • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Print messages with option to silence.

    +
    + +
    +
    messager(..., v = TRUE)
    +
    + +
    +

    Arguments

    +
    ...
    +

    Message input.

    + + +
    v
    +

    Whether to print messages.

    + +
    +
    +

    Value

    + + +

    Null output.

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/parse_dropped_INFO.html b/docs/reference/parse_dropped_INFO.html new file mode 100644 index 00000000..235ef98e --- /dev/null +++ b/docs/reference/parse_dropped_INFO.html @@ -0,0 +1,111 @@ + +Parse number of SNPs dropped due to being below the INFO threshold — parse_dropped_INFO • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Support function for parse_logs.

    +
    + +
    +
    parse_dropped_INFO(l)
    +
    + +
    +

    Arguments

    +
    l
    +

    Lines of text from log file.

    + +
    +
    +

    Value

    + + +

    Numeric

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/parse_dropped_chrom.html b/docs/reference/parse_dropped_chrom.html new file mode 100644 index 00000000..581686ad --- /dev/null +++ b/docs/reference/parse_dropped_chrom.html @@ -0,0 +1,111 @@ + +Parse number of SNPs dropped due to being on chrom X, Y or MT — parse_dropped_chrom • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Support function for parse_logs.

    +
    + +
    +
    parse_dropped_chrom(l)
    +
    + +
    +

    Arguments

    +
    l
    +

    Lines of text from log file.

    + +
    +
    +

    Value

    + + +

    Numeric

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/parse_dropped_duplicates.html b/docs/reference/parse_dropped_duplicates.html new file mode 100644 index 00000000..60e41442 --- /dev/null +++ b/docs/reference/parse_dropped_duplicates.html @@ -0,0 +1,111 @@ + +Parse number of SNPs dropped due to being duplicates — parse_dropped_duplicates • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Support function for parse_logs.

    +
    + +
    +
    parse_dropped_duplicates(l)
    +
    + +
    +

    Arguments

    +
    l
    +

    Lines of text from log file.

    + +
    +
    +

    Value

    + + +

    Numeric

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/parse_dropped_nonA1A2.html b/docs/reference/parse_dropped_nonA1A2.html new file mode 100644 index 00000000..a9efa156 --- /dev/null +++ b/docs/reference/parse_dropped_nonA1A2.html @@ -0,0 +1,111 @@ + +Parse number of SNPs dropped due to not matching the ref genome A1 or A2 — parse_dropped_nonA1A2 • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Support function for parse_logs.

    +
    + +
    +
    parse_dropped_nonA1A2(l)
    +
    + +
    +

    Arguments

    +
    l
    +

    Lines of text from log file.

    + +
    +
    +

    Value

    + + +

    Numeric

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/parse_dropped_nonBiallelic.html b/docs/reference/parse_dropped_nonBiallelic.html new file mode 100644 index 00000000..32f06b16 --- /dev/null +++ b/docs/reference/parse_dropped_nonBiallelic.html @@ -0,0 +1,111 @@ + +Parse number of SNPs dropped due to not being bi-allelic — parse_dropped_nonBiallelic • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Support function for parse_logs.

    +
    + +
    +
    parse_dropped_nonBiallelic(l)
    +
    + +
    +

    Arguments

    +
    l
    +

    Lines of text from log file.

    + +
    +
    +

    Value

    + + +

    Numeric

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/parse_dropped_nonRef.html b/docs/reference/parse_dropped_nonRef.html new file mode 100644 index 00000000..cc356055 --- /dev/null +++ b/docs/reference/parse_dropped_nonRef.html @@ -0,0 +1,111 @@ + +Parse number of SNPs dropped due to being in the ref genome — parse_dropped_nonRef • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Support function for parse_logs.

    +
    + +
    +
    parse_dropped_nonRef(l)
    +
    + +
    +

    Arguments

    +
    l
    +

    Lines of text from log file.

    + +
    +
    +

    Value

    + + +

    Numeric

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/parse_flipped.html b/docs/reference/parse_flipped.html new file mode 100644 index 00000000..478cce55 --- /dev/null +++ b/docs/reference/parse_flipped.html @@ -0,0 +1,111 @@ + +Parse number of SNPs flipped to align with the ref genome — parse_flipped • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Support function for parse_logs.

    +
    + +
    +
    parse_flipped(l)
    +
    + +
    +

    Arguments

    +
    l
    +

    Lines of text from log file.

    + +
    +
    +

    Value

    + + +

    Numeric

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/parse_genome_build.html b/docs/reference/parse_genome_build.html new file mode 100644 index 00000000..bd3ed198 --- /dev/null +++ b/docs/reference/parse_genome_build.html @@ -0,0 +1,111 @@ + +Genome build inferred from the summary statistics — parse_genome_build • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Support function for parse_logs.

    +
    + +
    +
    parse_genome_build(l)
    +
    + +
    +

    Arguments

    +
    l
    +

    Lines of text from log file.

    + +
    +
    +

    Value

    + + +

    Character

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/parse_idStandard.html b/docs/reference/parse_idStandard.html new file mode 100644 index 00000000..4d67c65e --- /dev/null +++ b/docs/reference/parse_idStandard.html @@ -0,0 +1,111 @@ + +Standardised IEU MRC OpenGWAS ID — parse_idStandard • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Support function for parse_logs.

    +
    + +
    +
    parse_idStandard(l)
    +
    + +
    +

    Arguments

    +
    l
    +

    Lines of text from log file.

    + +
    +
    +

    Value

    + + +

    Character

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/parse_logs.html b/docs/reference/parse_logs.html new file mode 100644 index 00000000..9192fcaa --- /dev/null +++ b/docs/reference/parse_logs.html @@ -0,0 +1,137 @@ + +Parse data from log files — parse_logs • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Parses data from the log files generated by +format_sumstats or +import_sumstats when the argument +log_mungesumstats_msgs is set to TRUE.

    +
    + +
    +
    parse_logs(
    +  save_dir = getwd(),
    +  pattern = "MungeSumstats_log_msg.txt$",
    +  verbose = TRUE
    +)
    +
    + +
    +

    Arguments

    +
    save_dir
    +

    Top-level directory to recursively search +for log files within.

    + + +
    pattern
    +

    Regex pattern to search for files with.

    + + +
    verbose
    +

    Print messages.

    + +
    +
    +

    Value

    + + +

    data.table of parsed log data.

    +
    + +
    +

    Examples

    +
    save_dir <- system.file("extdata",package = "MungeSumstats")
    +log_data <- MungeSumstats::parse_logs(save_dir = save_dir)
    +#> Parsing info from 1 log file(s).
    +
    +
    +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/parse_pval_large.html b/docs/reference/parse_pval_large.html new file mode 100644 index 00000000..149966bd --- /dev/null +++ b/docs/reference/parse_pval_large.html @@ -0,0 +1,111 @@ + +Parse number of SNPs with p-values >1 — parse_pval_large • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Support function for parse_logs.

    +
    + +
    +
    parse_pval_large(l)
    +
    + +
    +

    Arguments

    +
    l
    +

    Lines of text from log file.

    + +
    +
    +

    Value

    + + +

    Numeric

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/parse_pval_neg.html b/docs/reference/parse_pval_neg.html new file mode 100644 index 00000000..12010a5c --- /dev/null +++ b/docs/reference/parse_pval_neg.html @@ -0,0 +1,111 @@ + +Parse number of SNPs with p-values <0 — parse_pval_neg • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Support function for parse_logs.

    +
    + +
    +
    parse_pval_neg(l)
    +
    + +
    +

    Arguments

    +
    l
    +

    Lines of text from log file.

    + +
    +
    +

    Value

    + + +

    Numeric

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/parse_pval_small.html b/docs/reference/parse_pval_small.html new file mode 100644 index 00000000..1ee7ab26 --- /dev/null +++ b/docs/reference/parse_pval_small.html @@ -0,0 +1,111 @@ + +Parse number of SNPs with non-negative p-values <=5e-324 — parse_pval_small • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Support function for parse_logs.

    +
    + +
    +
    parse_pval_small(l)
    +
    + +
    +

    Arguments

    +
    l
    +

    Lines of text from log file.

    + +
    +
    +

    Value

    + + +

    Numeric

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/parse_report.html b/docs/reference/parse_report.html new file mode 100644 index 00000000..dcbf97aa --- /dev/null +++ b/docs/reference/parse_report.html @@ -0,0 +1,111 @@ + +Parse "Summary statistics report" metrics — parse_report • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Support function for parse_logs.

    +
    + +
    +
    parse_report(l, entry = 1, line = 1)
    +
    + +
    +

    Arguments

    +
    l
    +

    Lines of text from log file.

    + +
    +
    +

    Value

    + + +

    Numeric

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/parse_snps_freq_05.html b/docs/reference/parse_snps_freq_05.html new file mode 100644 index 00000000..e89a42b1 --- /dev/null +++ b/docs/reference/parse_snps_freq_05.html @@ -0,0 +1,111 @@ + +Parse number/percent of SNPs with FREQ values >0.5 — parse_snps_freq_05 • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Support function for parse_logs.

    +
    + +
    +
    parse_snps_freq_05(l, percent = FALSE)
    +
    + +
    +

    Arguments

    +
    l
    +

    Lines of text from log file.

    + +
    +
    +

    Value

    + + +

    Numeric

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/parse_snps_not_formatted.html b/docs/reference/parse_snps_not_formatted.html new file mode 100644 index 00000000..d96e84ad --- /dev/null +++ b/docs/reference/parse_snps_not_formatted.html @@ -0,0 +1,111 @@ + +Parse number of SNPs not correctly formatted — parse_snps_not_formatted • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Support function for parse_logs.

    +
    + +
    +
    parse_snps_not_formatted(l)
    +
    + +
    +

    Arguments

    +
    l
    +

    Lines of text from log file.

    + +
    +
    +

    Value

    + + +

    Numeric

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/parse_time.html b/docs/reference/parse_time.html new file mode 100644 index 00000000..e2e3ebce --- /dev/null +++ b/docs/reference/parse_time.html @@ -0,0 +1,111 @@ + +Parse the total time taken to munge the file — parse_time • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Support function for parse_logs.

    +
    + +
    +
    parse_time(l)
    +
    + +
    +

    Arguments

    +
    l
    +

    Lines of text from log file.

    + +
    +
    +

    Value

    + + +

    Character

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/preview_sumstats.html b/docs/reference/preview_sumstats.html new file mode 100644 index 00000000..4b3f2d5e --- /dev/null +++ b/docs/reference/preview_sumstats.html @@ -0,0 +1,112 @@ + +Preview formatted sum stats saved to disk — preview_sumstats • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Prints the first n lines of the sum stats.

    +
    + +
    +
    preview_sumstats(save_path, nrows = 5L)
    +
    + +
    +

    Arguments

    +
    save_path
    +

    File path to save formatted data. Defaults to +tempfile(fileext=".tsv.gz").

    + +
    +
    +

    Value

    + + +

    No return

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/raw_ALSvcf.html b/docs/reference/raw_ALSvcf.html new file mode 100644 index 00000000..cf1b2a15 --- /dev/null +++ b/docs/reference/raw_ALSvcf.html @@ -0,0 +1,129 @@ + +GWAS Amyotrophic lateral sclerosis ieu open GWAS project - Subset — raw_ALSvcf • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    VCF (VCFv4.2) of the GWAS Amyotrophic lateral sclerosis ieu +open GWAS project Dataset: ebi-a-GCST005647. +A subset of 99 SNPs

    +
    + + +
    +

    Format

    +

    vcf document with 528 items relating to 99 SNPs

    +
    +
    +

    Source

    +

    The summary statistics VCF (VCFv4.2) file was downloaded from +https://gwas.mrcieu.ac.uk/datasets/ebi-a-GCST005647/ +and formatted to a .rda with the following: + +#Get example VCF dataset, use GWAS Amyotrophic lateral sclerosis +ALS_GWAS_VCF <- readLines("ebi-a-GCST005647.vcf.gz") +#Subset to just the first 99 SNPs +ALSvcf <- ALS_GWAS_VCF[1:528] +writeLines(ALSvcf,"inst/extdata/ALSvcf.vcf") +

    +
    +
    +

    Details

    +

    A VCF file (VCFv4.2) of the GWAS Amyotrophic lateral sclerosis ieu +open GWAS project has been subsetted here to act as an example summary +statistic file in VCF format which has some issues in the formatting. +MungeSumstats can correct these issues and produce a standardised summary +statistics format.

    +
    +
    +

    ALSvcf.vcf

    +

    NA

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/raw_eduAttainOkbay.html b/docs/reference/raw_eduAttainOkbay.html new file mode 100644 index 00000000..40f191b5 --- /dev/null +++ b/docs/reference/raw_eduAttainOkbay.html @@ -0,0 +1,141 @@ + +GWAS Educational Attainment Okbay 2016 - Subset — raw_eduAttainOkbay • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    GWAS Summary Statistics on Educational Attainment by Okbay et +al 2016: +PMID: 27898078 PMCID: PMC5509058 DOI: 10.1038/ng1216-1587b. +A subset of 93 SNPs

    +
    + + +
    +

    Format

    +

    txt document with 94 items

    +
    +
    +

    Source

    +

    The summary statistics file was downloaded from +https://www.nature.com/articles/ng.3552 +and formatted to a .rda with the following: + +#Get example dataset, use Educational-Attainment_Okbay_2016 +link<-"Educational-Attainment_Okbay_2016/EduYears_Discovery_5000.txt" +eduAttainOkbay<-readLines(link,n=100) +#There is an issue where values end with .0, this 0 is removed in func +#There are also SNPs not on ref genome or are bi/tri allelic +#So need to remove these in this dataset as it's used for testing +tmp <- tempfile() +writeLines(eduAttainOkbay,con=tmp) +eduAttainOkbay <- data.table::fread(tmp) #DT read removes the .0's +#remove those not on ref genome and with bi/tri allelic +rmv <- c("rs192818565","rs79925071","rs1606974","rs1871109", + "rs73074378","rs7955289") +eduAttainOkbay <- eduAttainOkbay[!MarkerName %in% rmv,] +data.table::fwrite(eduAttainOkbay,file=tmp,sep="\t") +eduAttainOkbay <- readLines(tmp) +writeLines(eduAttainOkbay,"inst/extdata/eduAttainOkbay.txt") +

    +
    +
    +

    Details

    +

    GWAS Summary Statistics on Educational Attainment by Okbay et +al 2016 has been subsetted here to act as an example summary statistic file +which has some issues in the formatting. MungeSumstats can correct these +issues.

    +
    +
    +

    eduAttainOkbay.txt

    +

    NA

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/read_header.html b/docs/reference/read_header.html new file mode 100644 index 00000000..87102526 --- /dev/null +++ b/docs/reference/read_header.html @@ -0,0 +1,134 @@ + +Read in file header — read_header • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Read in file header

    +
    + +
    +
    read_header(path, n = 2L, skip_vcf_metadata = FALSE, nThread = 1)
    +
    + +
    +

    Arguments

    +
    path
    +

    Filepath for the summary statistics file to be formatted. A +dataframe or datatable of the summary statistics file can also be passed +directly to MungeSumstats using the path parameter.

    + + +
    n
    +

    integer. The (maximal) number of lines to read. Negative values +indicate that one should read up to the end of input on the connection.

    + + +
    skip_vcf_metadata
    +

    logical, should VCF metadata be ignored

    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + +
    +
    +

    Value

    + + +

    First n lines of the VCF header

    +
    + +
    +

    Examples

    +
    path <- system.file("extdata", "eduAttainOkbay.txt", 
    +                    package = "MungeSumstats") 
    +header <- read_header(path = path)                    
    +#> Reading header.
    +
    +
    +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/read_log_pval.html b/docs/reference/read_log_pval.html new file mode 100644 index 00000000..ff382699 --- /dev/null +++ b/docs/reference/read_log_pval.html @@ -0,0 +1,129 @@ + +Read -log10 p-value column — read_log_pval • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Parse p-value column in VCF file or other general -log10 p-values

    +
    + +
    +
    read_log_pval(
    +  sumstats_dt,
    +  mapping_file = sumstatsColHeaders,
    +  return_list = TRUE
    +)
    +
    + +
    +

    Arguments

    +
    sumstats_dt
    +

    Summary stats data.table.

    + + +
    mapping_file
    +

    MungeSumstats has a pre-defined column-name mapping file +which should cover the most common column headers and their interpretations. +However, if a column header that is in your file is missing or the mapping we +give is incorrect you can supply your own mapping file. Must be a 2 column +dataframe with column names "Uncorrected" and "Corrected". See +data(sumstatsColHeaders) for default mapping and necessary format.

    + + +
    return_list
    +

    Binary, whether to return the dt in a list or not - list +is standard for the format_sumstats() function.

    + +
    +
    +

    Value

    + + +

    Null output.

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/read_sumstats.html b/docs/reference/read_sumstats.html new file mode 100644 index 00000000..5a87dd1d --- /dev/null +++ b/docs/reference/read_sumstats.html @@ -0,0 +1,167 @@ + +Determine summary statistics file type and read them into memory — read_sumstats • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Determine summary statistics file type and read them into memory

    +
    + +
    +
    read_sumstats(
    +  path,
    +  nrows = Inf,
    +  standardise_headers = FALSE,
    +  samples = 1,
    +  sampled_rows = 10000L,
    +  nThread = 1,
    +  mapping_file = sumstatsColHeaders
    +)
    +
    + +
    +

    Arguments

    +
    path
    +

    Filepath for the summary statistics file to be formatted. A +dataframe or datatable of the summary statistics file can also be passed +directly to MungeSumstats using the path parameter.

    + + +
    nrows
    +

    integer. The (maximal) number of lines to read. +If Inf, will read in all rows.

    + + +
    standardise_headers
    +

    Standardise headers first.

    + + +
    samples
    +

    Which samples to use:

    • 1 : Only the first sample will be used (DEFAULT).

    • +
    • NULL : All samples will be used.

    • +
    • c("<sample_id1>","<sample_id2>",...) : +Only user-selected samples will be used (case-insensitive).

    • +
    + + +
    sampled_rows
    +

    First N rows to sample +when determining whether cols are empty. +Set NULL to use full sumstats_file.

    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + + +
    mapping_file
    +

    MungeSumstats has a pre-defined column-name mapping file +which should cover the most common column headers and their interpretations. +However, if a column header that is in your file is missing or the mapping we +give is incorrect you can supply your own mapping file. Must be a 2 column +dataframe with column names "Uncorrected" and "Corrected". See +data(sumstatsColHeaders) for default mapping and necessary format.

    + +
    +
    +

    Value

    + + +

    data.table of formatted summary statistics

    +
    + +
    +

    Examples

    +
    path <- system.file("extdata", "eduAttainOkbay.txt",
    +    package = "MungeSumstats"
    +)
    +eduAttainOkbay <- read_sumstats(path = path)
    +#> Importing tabular file: /private/var/folders/hd/jm8lzp7s4dl_wlkykzhz66x80000gn/T/RtmpKLvRpi/temp_libpath17f3d19176b21/MungeSumstats/extdata/eduAttainOkbay.txt
    +#> Checking for empty columns.
    +
    +
    +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/read_vcf.html b/docs/reference/read_vcf.html new file mode 100644 index 00000000..3a4c5532 --- /dev/null +++ b/docs/reference/read_vcf.html @@ -0,0 +1,277 @@ + +Read in VCF file — read_vcf • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Read in a VCF file as a VCF or a +data.table. +Can optionally save the VCF/data.table as well.

    +
    + +
    +
    read_vcf(
    +  path,
    +  as_datatable = TRUE,
    +  save_path = NULL,
    +  tabix_index = FALSE,
    +  samples = 1,
    +  which = NULL,
    +  use_params = TRUE,
    +  sampled_rows = 10000L,
    +  download = TRUE,
    +  vcf_dir = tempdir(),
    +  download_method = "download.file",
    +  force_new = FALSE,
    +  mt_thresh = 100000L,
    +  nThread = 1,
    +  verbose = TRUE
    +)
    +
    + + +
    +

    Arguments

    +
    path
    +

    Path to local or remote VCF file.

    + + +
    as_datatable
    +

    Return the data as a +data.table (default: TRUE) +or a VCF (FALSE).

    + + +
    save_path
    +

    File path to save formatted data. Defaults to +tempfile(fileext=".tsv.gz").

    + + +
    tabix_index
    +

    Index the formatted summary statistics with +tabix for fast querying.

    + + +
    samples
    +

    Which samples to use:

    • 1 : Only the first sample will be used (DEFAULT).

    • +
    • NULL : All samples will be used.

    • +
    • c("<sample_id1>","<sample_id2>",...) : +Only user-selected samples will be used (case-insensitive).

    • +
    + + +
    which
    +

    Genomic ranges to be added if supplied. Default is NULL.

    + + +
    use_params
    +

    When TRUE (default), increases the speed of reading in the VCF by +omitting columns that are empty based on the head of the VCF (NAs only). +NOTE that this requires the VCF to be sorted, bgzip-compressed, +and tabix-indexed, which read_vcf will attempt to do.

    + + +
    sampled_rows
    +

    First N rows to sample +when determining whether cols are empty. +Set NULL to use full sumstats_file.

    + + +
    download
    +

    Download the VCF (and its index file) +to a temp folder before reading it into R. +This is important to keep TRUE when nThread>1 to avoid +making too many queries to remote file.

    + + +
    vcf_dir
    +

    Where to download the original VCF from Open GWAS. +WARNING: This is set to tempdir() by default. +This means the raw (pre-formatted) VCFs will be deleted upon ending the R session. +Change this to keep the raw VCF file on disk +(e.g. vcf_dir="./raw_vcf").

    + + +
    download_method
    +

    "axel" (multi-threaded) or +"download.file" (single-threaded) .

    + + +
    force_new
    +

    If a formatted file of the same name as save_path +exists, formatting will be skipped and this file will be imported instead +(default). Set force_new=TRUE to override this.

    + + +
    mt_thresh
    +

    When the number of rows (variants) in the VCF is +< mt_thresh, only use single-threading for reading in the VCF. +This is because the overhead of parallelisation outweighs the speed benefits +when VCFs are small.

    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + + +
    verbose
    +

    Print messages.

    + +
    +
    +

    Value

    + + +

    The VCF file in data.table format.

    +
    + +
    +

    Examples

    +
    #### Local file ####
    +path <- system.file("extdata","ALSvcf.vcf", package="MungeSumstats")
    +sumstats_dt <- read_vcf(path = path)
    +#> Loading required namespace: GenomicFiles
    +#> Using local VCF.
    +#> bgzip-compressing VCF file.
    +#> Finding empty VCF columns based on first 10,000 rows.
    +#> Dropping 1 duplicate column(s).
    +#> 1 sample detected: EBI-a-GCST005647
    +#> Constructing ScanVcfParam object.
    +#> VCF contains: 39,630,630 variant(s) x 1 sample(s)
    +#> Reading VCF file: single-threaded
    +#> Converting VCF to data.table.
    +#> Expanding VCF first, so number of rows may increase.
    +#> Dropping 1 duplicate column(s).
    +#> Checking for empty columns.
    +#> Unlisting 3 columns.
    +#> Dropped 314 duplicate rows.
    +#> Time difference of 0.1 secs
    +#> VCF data.table contains: 101 rows x 11 columns.
    +#> Time difference of 0.4 secs
    +#> Renaming ID as SNP.
    +#> sumstats has -log10 P-values; these will be converted to unadjusted p-values in the 'P' column.
    +#> No INFO (SI) column detected.
    +
    +#### Remote file ####
    +## Small GWAS (0.2Mb)
    +# path <- "https://gwas.mrcieu.ac.uk/files/ieu-a-298/ieu-a-298.vcf.gz"
    +# sumstats_dt2 <- read_vcf(path = path)
    +
    +## Large GWAS (250Mb)
    +# path <- "https://gwas.mrcieu.ac.uk/files/ubm-a-2929/ubm-a-2929.vcf.gz"
    +# sumstats_dt3 <- read_vcf(path = path, nThread=11)
    +
    +### Very large GWAS (500Mb)
    +# path <- "https://gwas.mrcieu.ac.uk/files/ieu-a-1124/ieu-a-1124.vcf.gz"
    +# sumstats_dt4 <- read_vcf(path = path, nThread=11)
    +
    +
    +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/read_vcf_genome.html b/docs/reference/read_vcf_genome.html new file mode 100644 index 00000000..d11ba543 --- /dev/null +++ b/docs/reference/read_vcf_genome.html @@ -0,0 +1,130 @@ + +Read VCF genome — read_vcf_genome • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Get the genome build of a remote or local VCF file.

    +
    + +
    +
    read_vcf_genome(
    +  header = NULL,
    +  validate = FALSE,
    +  default_genome = "HG19/GRCh37",
    +  verbose = TRUE
    +)
    +
    + +
    +

    Arguments

    +
    header
    +

    Header extracted by scanVcfHeader.

    + + +
    validate
    +

    Validate genome name using +mapGenomeBuilds.

    + + +
    default_genome
    +

    When no genome can be extracted, +default to this genome build.

    + + +
    verbose
    +

    Print messages.

    + +
    +
    +

    Value

    + + +

    genome

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/read_vcf_info.html b/docs/reference/read_vcf_info.html new file mode 100644 index 00000000..f57cb012 --- /dev/null +++ b/docs/reference/read_vcf_info.html @@ -0,0 +1,111 @@ + +Read VCF: INFO column — read_vcf_info • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Parse INFO column in VCF file.

    +
    + +
    +
    read_vcf_info(sumstats_dt)
    +
    + +
    +

    Arguments

    +
    sumstats_dt
    +

    Summary stats data.table.

    + +
    +
    +

    Value

    + + +

    Null output.

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/read_vcf_markername.html b/docs/reference/read_vcf_markername.html new file mode 100644 index 00000000..333693c8 --- /dev/null +++ b/docs/reference/read_vcf_markername.html @@ -0,0 +1,111 @@ + +Read VCF: MarkerName column — read_vcf_markername • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Parse MarkerName/SNP column in VCF file.

    +
    + +
    +
    read_vcf_markername(sumstats_dt)
    +
    + +
    +

    Arguments

    +
    sumstats_dt
    +

    Summary stats data.table.

    + +
    +
    +

    Value

    + + +

    Null output.

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/read_vcf_parallel.html b/docs/reference/read_vcf_parallel.html new file mode 100644 index 00000000..c5b053cc --- /dev/null +++ b/docs/reference/read_vcf_parallel.html @@ -0,0 +1,235 @@ + +Read VCF: parallel — read_vcf_parallel • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Read a VCF file across 1 or more threads in parallel. +If tilewidth is not specified, the size of each chunk will be +determined by total genome size divided by ntile. +By default, ntile is equal to the number of threads, nThread. +For further discussion on how this function was optimised, +see +here +and +here.

    +
    + +
    +
    read_vcf_parallel(
    +  path,
    +  samples = 1,
    +  which = NULL,
    +  use_params = TRUE,
    +  as_datatable = TRUE,
    +  sampled_rows = 10000L,
    +  include_xy = FALSE,
    +  download = TRUE,
    +  vcf_dir = tempdir(),
    +  download_method = "download.file",
    +  force_new = FALSE,
    +  tilewidth = NULL,
    +  mt_thresh = 100000L,
    +  nThread = 1,
    +  ntile = nThread,
    +  verbose = TRUE
    +)
    +
    + +
    +

    Source

    +

    +path <- "https://gwas.mrcieu.ac.uk/files/ieu-a-298/ieu-a-298.vcf.gz" +#### Single-threaded #### +vcf <- MungeSumstats:::read_vcf_parallel(path = path) +#### Parallel #### +vcf2 <- MungeSumstats:::read_vcf_parallel(path = path, nThread=11) +

    +
    +
    +

    Arguments

    +
    path
    +

    Path to local or remote VCF file.

    + + +
    samples
    +

    Which samples to use:

    • 1 : Only the first sample will be used (DEFAULT).

    • +
    • NULL : All samples will be used.

    • +
    • c("<sample_id1>","<sample_id2>",...) : +Only user-selected samples will be used (case-insensitive).

    • +
    + + +
    which
    +

    Genomic ranges to be added if supplied. Default is NULL.

    + + +
    use_params
    +

    When TRUE (default), increases the speed of reading in the VCF by +omitting columns that are empty based on the head of the VCF (NAs only). +NOTE that this requires the VCF to be sorted, bgzip-compressed, +and tabix-indexed, which read_vcf will attempt to do.

    + + +
    as_datatable
    +

    Return the data as a +data.table (default: TRUE) +or a VCF (FALSE).

    + + +
    sampled_rows
    +

    First N rows to sample +when determining whether cols are empty. +Set NULL to use full sumstats_file.

    + + +
    download
    +

    Download the VCF (and its index file) +to a temp folder before reading it into R. +This is important to keep TRUE when nThread>1 to avoid +making too many queries to remote file.

    + + +
    vcf_dir
    +

    Where to download the original VCF from Open GWAS. +WARNING: This is set to tempdir() by default. +This means the raw (pre-formatted) VCFs will be deleted upon ending the R session. +Change this to keep the raw VCF file on disk +(e.g. vcf_dir="./raw_vcf").

    + + +
    download_method
    +

    "axel" (multi-threaded) or +"download.file" (single-threaded) .

    + + +
    force_new
    +

    If a formatted file of the same name as save_path +exists, formatting will be skipped and this file will be imported instead +(default). Set force_new=TRUE to override this.

    + + +
    tilewidth
    +

    The desired tile width. The effective tile width might be slightly + different but is guaranteed to never be more than the desired width.

    + + +
    mt_thresh
    +

    When the number of rows (variants) in the VCF is +< mt_thresh, only use single-threading for reading in the VCF. +This is because the overhead of parallelisation outweighs the speed benefits +when VCFs are small.

    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + + +
    ntile
    +

    The number of tiles to generate.

    + + +
    verbose
    +

    Print messages.

    + +
    +
    +

    Value

    + + +

    VCF file.

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/register_cores.html b/docs/reference/register_cores.html new file mode 100644 index 00000000..48857ba4 --- /dev/null +++ b/docs/reference/register_cores.html @@ -0,0 +1,123 @@ + +Register cores — register_cores • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Register a multi-threaded instance using BiocParallel.

    +
    + +
    +
    register_cores(workers = 1, progressbar = TRUE)
    +
    + +
    +

    Arguments

    +
    workers
    +
    + +

    integer(1) Number of workers. Defaults to the maximum of 1 or + the number of cores determined by detectCores minus 2 unless + environment variables R_PARALLELLY_AVAILABLECORES_FALLBACK or + BIOCPARALLEL_WORKER_NUMBER are set otherwise. For a + SOCK cluster, workers can be a character() + vector of host names.

    +

    + + +
    progressbar
    +

    logical(1) Enable progress bar (based on plyr:::progress_text).

    + +
    +
    +

    Value

    + + +

    Null output.

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/remove_empty_cols.html b/docs/reference/remove_empty_cols.html new file mode 100644 index 00000000..e77dd483 --- /dev/null +++ b/docs/reference/remove_empty_cols.html @@ -0,0 +1,117 @@ + +Remove empty columns — remove_empty_cols • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Remove columns that are empty or contain all the same values in a data.table.

    +
    + +
    +
    remove_empty_cols(sumstats_dt, sampled_rows = NULL, verbose = TRUE)
    +
    + +
    +

    Arguments

    +
    sampled_rows
    +

    First N rows to sample +when determining whether cols are empty. +Set NULL to use full sumstats_file.

    + + +
    verbose
    +

    Print messages.

    + +
    +
    +

    Value

    + + +

    Null output.

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/report_summary.html b/docs/reference/report_summary.html new file mode 100644 index 00000000..e6526ef2 --- /dev/null +++ b/docs/reference/report_summary.html @@ -0,0 +1,112 @@ + +Report info on current state of the summary statistics — report_summary • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Prints report.

    +
    + +
    +
    report_summary(sumstats_dt, orig_dims = NULL)
    +
    + +
    +

    Arguments

    +
    sumstats_dt
    +

    data table obj of the summary +statistics file for the GWAS.

    + +
    +
    +

    Value

    + + +

    No return

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/select_vcf_fields.html b/docs/reference/select_vcf_fields.html new file mode 100644 index 00000000..3cb3d05b --- /dev/null +++ b/docs/reference/select_vcf_fields.html @@ -0,0 +1,144 @@ + +Select VCF fields — select_vcf_fields • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Select non-empty columns from each VCF field type.

    +
    + +
    +
    select_vcf_fields(
    +  path,
    +  sampled_rows = 10000L,
    +  which = NULL,
    +  samples = NULL,
    +  nThread = 1,
    +  verbose = TRUE
    +)
    +
    + +
    +

    Arguments

    +
    path
    +

    Path to local or remote VCF file.

    + + +
    sampled_rows
    +

    First N rows to sample +when determining whether cols are empty. +Set NULL to use full sumstats_file.

    + + +
    which
    +

    Genomic ranges to be added if supplied. Default is NULL.

    + + +
    samples
    +

    Which samples to use:

    • 1 : Only the first sample will be used (DEFAULT).

    • +
    • NULL : All samples will be used.

    • +
    • c("<sample_id1>","<sample_id2>",...) : +Only user-selected samples will be used (case-insensitive).

    • +
    + + +
    nThread
    +

    Number of threads to use for parallel processes.

    + + +
    verbose
    +

    Print messages.

    + +
    +
    +

    Value

    + + +

    ScanVcfParam object.

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/sort_coord_genomicranges.html b/docs/reference/sort_coord_genomicranges.html new file mode 100644 index 00000000..43c941cd --- /dev/null +++ b/docs/reference/sort_coord_genomicranges.html @@ -0,0 +1,114 @@ + +Sort sum stats: GenomicRanges — sort_coord_genomicranges • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Sort summary statistics table by genomic coordinates using a slower +(but in some cases more robust) GenomicRanges strategy

    +
    + +
    +
    sort_coord_genomicranges(sumstats_dt)
    +
    + +
    +

    Arguments

    +
    sumstats_dt
    +

    data.table obj of the +summary statistics file for the GWAS.

    + +
    +
    +

    Value

    + + +

    Sorted sumstats_dt

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/sort_coords.html b/docs/reference/sort_coords.html new file mode 100644 index 00000000..0b42be7f --- /dev/null +++ b/docs/reference/sort_coords.html @@ -0,0 +1,130 @@ + +Sort sum stats — sort_coords • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Sort summary statistics table by genomic coordinates.

    +
    + +
    +
    sort_coords(
    +  sumstats_dt,
    +  sort_coordinates = TRUE,
    +  sort_method = c("data.table", "GenomicRanges")
    +)
    +
    + +
    +

    Arguments

    +
    sumstats_dt
    +

    data.table obj of the +summary statistics file for the GWAS.

    + + +
    sort_method
    +

    Method to sort coordinates by:

    • "data.table" (default)Uses setorderv, +which is much faster than "GenomicRanges" +but less robust to variations in some sum stats files.

    • +
    • "GenomicRanges"Uses sort.GenomicRanges, +which is more robust to variations in sum stats files +but much slower than the "data.table" method.

    • +
    + + +
    sort_coordinates
    +

    Whether to sort by coordinates.

    + +
    +
    +

    Value

    + + +

    Sorted sumstats_dt

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/sort_coords_datatable.html b/docs/reference/sort_coords_datatable.html new file mode 100644 index 00000000..57c6513a --- /dev/null +++ b/docs/reference/sort_coords_datatable.html @@ -0,0 +1,127 @@ + +Sort sum stats: data.table — sort_coords_datatable • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Sort summary statistics table by genomic coordinates using a fast +data.table-native strategy

    +
    + +
    +
    sort_coords_datatable(
    +  sumstats_dt,
    +  chr_col = "CHR",
    +  start_col = "BP",
    +  end_col = start_col
    +)
    +
    + +
    +

    Arguments

    +
    sumstats_dt
    +

    data.table obj of the +summary statistics file for the GWAS.

    + + +
    chr_col
    +

    Chromosome column name.

    + + +
    start_col
    +

    Genomic start position column name.

    + +
    +
    +

    Value

    + + +

    Sorted sumstats_dt

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/standardise_header.html b/docs/reference/standardise_header.html new file mode 100644 index 00000000..c0fd35f9 --- /dev/null +++ b/docs/reference/standardise_header.html @@ -0,0 +1,163 @@ + +Standardise the column headers in the Summary Statistics files — standardise_header • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Use a reference data table of common column header names (stored in +sumstatsColHeaders or user inputted mapping file) to convert them to a +standard set, i.e. chromosome -> CHR. This function does not check that all +the required column headers are present. The amended header is written +directly back into the file

    +
    + +
    +
    standardise_header(
    +  sumstats_dt,
    +  mapping_file = sumstatsColHeaders,
    +  uppercase_unmapped = TRUE,
    +  convert_A0 = TRUE,
    +  return_list = TRUE
    +)
    +
    + +
    +

    Arguments

    +
    sumstats_dt
    +

    data table obj of the summary statistics file for the +GWAS.

    + + +
    mapping_file
    +

    MungeSumstats has a pre-defined column-name mapping file +which should cover the most common column headers and their interpretations. +However, if a column header that is in your file is missing or the mapping we +give is incorrect you can supply your own mapping file. Must be a 2 column +dataframe with column names "Uncorrected" and "Corrected". See +data(sumstatsColHeaders) for default mapping and necessary format.

    + + +
    uppercase_unmapped
    +

    For columns that could not be identified in +the mapping_file, return them in the same format they were input as +(without forcing them to uppercase).

    + + +
    convert_A0
    +

    Whether to convert A* (representing A0) to A1/A2. This +should be done unless checking if A0 was present in the input as if you do +it you can't infer this. Default is TRUE

    + + +
    return_list
    +

    Return the sumstats_dt within a named list +(default: TRUE).

    + +
    +
    +

    Value

    + + +

    list containing sumstats_dt, the modified summary statistics data +table object

    +
    + +
    +

    Examples

    +
    sumstats_dt <- data.table::fread(system.file("extdata", "eduAttainOkbay.txt",
    +                                             package = "MungeSumstats"))
    +sumstats_dt2 <- standardise_header(sumstats_dt=sumstats_dt)
    +#> Standardising column headers.
    +#> First line of summary statistics file: 
    +#> MarkerName	CHR	POS	A1	A2	EAF	Beta	SE	Pval	
    +
    +
    +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/sumstatsColHeaders.html b/docs/reference/sumstatsColHeaders.html new file mode 100644 index 00000000..daf7df41 --- /dev/null +++ b/docs/reference/sumstatsColHeaders.html @@ -0,0 +1,168 @@ + +Summary Statistics Column Headers — sumstatsColHeaders • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    List of uncorrected column headers often found in GWAS Summary +Statistics column headers. Note the effect allele will always be the A2 +allele, this is the approach done for +VCF(https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7805039/). This is enforced +with the column header corrections here and also the check allele flipping +test.

    +
    + +
    +
    data("sumstatsColHeaders")
    +
    + +
    +

    Format

    +

    dataframe with 2 columns

    +
    +
    +

    Source

    +

    The code to prepare the .Rda file from the marker file is: + +# Most of the data in the below table comes from the LDSC github wiki +data("sumstatsColHeaders") +# Make additions to sumstatsColHeaders using github version of MungeSumstats- +# Shown is an example of adding new A1 and A2 naming +a1_name <- c("NON","RISK","ALLELE") +a2_name <- c("RISK","ALLELE") +all_delims <- c("_",".",""," ","-") +all_uncorr_a1 <- vector(mode="list",length = length(all_delims)) +all_corr_a1 <- vector(mode="list",length = length(all_delims)) +all_uncorr_a2 <- vector(mode="list",length = length(all_delims)) +all_corr_a2 <- vector(mode="list",length = length(all_delims)) +for(i in seq_along(all_delims)){ +delim <- all_delims[i] +a1 <- unlist(paste(a1_name,collapse=delim)) +a2 <- unlist(paste(a2_name,collapse=delim)) +all_uncorr_a1[[i]] <- a1 +all_uncorr_a2[[i]] <- a2 +all_corr_a1[[i]] <- "A1" + all_corr_a2[[i]] <- "A2" +} +se_cols <- data.frame("Uncorrected"=c(unlist(all_uncorr_a1),unlist(all_uncorr_a2)), + "Corrected"=c(unlist(all_corr_a1),unlist(all_corr_a2))) +# Or another example ..... 
+# shown is an example of adding columns for Standard Error (SE) +se_cols <- data.frame("Uncorrected"=c("SE","se","STANDARD.ERROR", + "STANDARD_ERROR","STANDARD-ERROR"), + "Corrected"=rep("SE",5)) +sumstatsColHeaders <- rbind(sumstatsColHeaders,se_cols) +#Once additions are made, order & save the new mapping dataset +#now sort ordering -important for logic that +# uncorrected=corrected comes first +sumstatsColHeaders$ordering <- + sumstatsColHeaders$Uncorrected==sumstatsColHeaders$Corrected +sumstatsColHeaders <- + sumstatsColHeaders[order(sumstatsColHeaders$Corrected, + sumstatsColHeaders$ordering,decreasing = TRUE),] +rownames(sumstatsColHeaders)<-1:nrow(sumstatsColHeaders) +sumstatsColHeaders$ordering <- NULL +#manually move FREQUENCY to above MAF - github issue 95 +frequency <- sumstatsColHeaders[sumstatsColHeaders$Uncorrected=="FREQUENCY",] +maf <- sumstatsColHeaders[sumstatsColHeaders$Uncorrected=="MAF",] +if(as.integer(rownames(frequency))>as.integer(rownames(maf))){ + sumstatsColHeaders[as.integer(rownames(frequency)),] <- maf + sumstatsColHeaders[as.integer(rownames(maf)),] <- frequency +} +usethis::use_data(sumstatsColHeaders,overwrite = TRUE, internal=TRUE) +save(sumstatsColHeaders, + file="data/sumstatsColHeaders.rda") +# You will need to restart your R session for changes to take effect +

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/supported_suffixes.html b/docs/reference/supported_suffixes.html new file mode 100644 index 00000000..677e85ef --- /dev/null +++ b/docs/reference/supported_suffixes.html @@ -0,0 +1,128 @@ + +List supported file formats — supported_suffixes • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    List supported file formats

    +
    + +
    +
    supported_suffixes(
    +  tabular = TRUE,
    +  tabular_compressed = TRUE,
    +  vcf = TRUE,
    +  vcf_compressed = TRUE
    +)
    +
    + +
    +

    Arguments

    +
    tabular
    +

    Include tabular formats.

    + + +
    tabular_compressed
    +

    Include compressed tabular formats.

    + + +
    vcf
    +

    Include Variant Call Format.

    + + +
    vcf_compressed
    +

    Include compressed Variant Call Format.

    + +
    +
    +

    Value

    + + +

    File formats

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/to_granges.html b/docs/reference/to_granges.html new file mode 100644 index 00000000..f8f16d2e --- /dev/null +++ b/docs/reference/to_granges.html @@ -0,0 +1,147 @@ + +To GRanges — to_granges • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Convert a data.table to GRanges.

    +
    + +
    +
    to_granges(
    +  sumstats_dt,
    +  seqnames.field = "CHR",
    +  start.field = "BP",
    +  end.field = "BP",
    +  style = c("NCBI", "UCSC")
    +)
    +
    + +
    +

    Arguments

    +
    sumstats_dt
    +

    data table obj of the summary statistics file +for the GWAS.

    + + +
    seqnames.field
    +

    A character vector of recognized names for the column in df + that contains the chromosome name (a.k.a. sequence name) associated + with each genomic range. + Only the first name in seqnames.field that is found + in colnames(df) is used. + If no one is found, then an error is raised.

    + + +
    start.field
    +

    A character vector of recognized names for the column in df + that contains the start positions of the genomic ranges. + Only the first name in start.field that is found + in colnames(df) is used. + If no one is found, then an error is raised.

    + + +
    end.field
    +

    A character vector of recognized names for the column in df + that contains the end positions of the genomic ranges. + Only the first name in end.field that is found + in colnames(df) is used. + If no one is found, then an error is raised.

    + + +
    style
    +

    GRanges style to convert to, "NCBI" or "UCSC".

    + +
    +
    +

    Value

    + + +

    GRanges object

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/to_vranges.html b/docs/reference/to_vranges.html new file mode 100644 index 00000000..ac0821f0 --- /dev/null +++ b/docs/reference/to_vranges.html @@ -0,0 +1,112 @@ + +Convert to VRanges — to_vranges • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Convert to VRanges

    +
    + +
    +
    to_vranges(sumstats_dt)
    +
    + +
    +

    Arguments

    +
    sumstats_dt
    +

    data table obj of the summary statistics +file for the GWAS.

    + +
    +
    +

    Value

    + + +

    VRanges object

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/unlist_dt.html b/docs/reference/unlist_dt.html new file mode 100644 index 00000000..35c1ce3e --- /dev/null +++ b/docs/reference/unlist_dt.html @@ -0,0 +1,115 @@ + +Unlist a data.table — unlist_dt • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Identify columns that are lists and turn them into vectors.

    +
    + +
    +
    unlist_dt(dt, verbose = TRUE)
    +
    + +
    +

    Arguments

    +
    dt
    +

    data.table

    + + +
    verbose
    +

    Print messages.

    + +
    +
    +

    Value

    + + +

    dt with list columns turned into vectors.

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/validate_parameters.html b/docs/reference/validate_parameters.html new file mode 100644 index 00000000..91cf2cae --- /dev/null +++ b/docs/reference/validate_parameters.html @@ -0,0 +1,457 @@ + +Ensure that the input parameters are logical — validate_parameters • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Ensure that the input parameters are logical

    +
    + +
    +
    validate_parameters(
    +  path,
    +  ref_genome,
    +  convert_ref_genome,
    +  convert_small_p,
    +  es_is_beta,
    +  compute_z,
    +  compute_n,
    +  convert_n_int,
    +  analysis_trait,
    +  INFO_filter,
    +  FRQ_filter,
    +  pos_se,
    +  effect_columns_nonzero,
    +  N_std,
    +  N_dropNA,
    +  chr_style,
    +  rmv_chr,
    +  on_ref_genome,
    +  infer_eff_direction,
    +  eff_on_minor_alleles,
    +  strand_ambig_filter,
    +  allele_flip_check,
    +  allele_flip_drop,
    +  allele_flip_z,
    +  allele_flip_frq,
    +  bi_allelic_filter,
    +  flip_frq_as_biallelic,
    +  snp_ids_are_rs_ids,
    +  remove_multi_rs_snp,
    +  frq_is_maf,
    +  indels,
    +  drop_indels,
    +  check_dups,
    +  dbSNP,
    +  write_vcf,
    +  return_format,
    +  ldsc_format,
    +  save_format,
    +  imputation_ind,
    +  log_folder_ind,
    +  log_mungesumstats_msgs,
    +  mapping_file,
    +  tabix_index,
    +  chain_source,
    +  local_chain,
    +  drop_na_cols,
    +  rmv_chrPrefix
    +)
    +
    + +
    +

    Arguments

    +
    path
    +

    Filepath for the summary statistics file to be formatted. A +dataframe or datatable of the summary statistics file can also be passed +directly to MungeSumstats using the path parameter.

    + + +
    ref_genome
    +

    name of the reference genome used for the GWAS ("GRCh37" or +"GRCh38"). Argument is case-insensitive. Default is NULL which infers the +reference genome from the data.

    + + +
    convert_ref_genome
    +

    name of the reference genome to convert to +("GRCh37" or "GRCh38"). This will only occur if the current genome build does +not match. Default is not to convert the genome build (NULL).

    + + +
    convert_small_p
    +

    Binary, should non-negative +p-values <= 5e-324 be converted to 0? +Small p-values pass the R limit and can cause errors with LDSC/MAGMA and +should be converted. Default is TRUE.

    + + +
    es_is_beta
    +

    Binary, whether to map ES to BETA. We take BETA to be any +BETA-like value (including Effect Size). If this is not the case for your +sumstats, change this to FALSE. Default is TRUE.

    + + +
    compute_z
    +

    Whether to compute Z-score column. Default is FALSE. This +can be computed from Beta and SE with (Beta/SE) or P +(Z:=sign(BETA)*sqrt(stats::qchisq(P,1,lower=FALSE))). +Note that imputing the Z-score from P for every SNP will not be +perfectly correct and may result in a loss of power. This should only be done +as a last resort. Use 'BETA' to impute by BETA/SE and 'P' to impute by SNP +p-value.

    + + +
    compute_n
    +

    Whether to impute N. Default of 0 won't impute, any other +integer will be imputed as the N (sample size) for every SNP in the dataset. +Note that imputing the sample size for every SNP is not correct and +should only be done as a last resort. N can also be inputted with "ldsc", +"sum", "giant" or "metal" by passing one of these for this field or a vector +of multiple. Sum and an integer value creates an N column in the output +whereas giant, metal or ldsc create an Neff or effective sample size. If +multiples are passed, the formula used to derive it will be indicated.

    + + +
    convert_n_int
    +

    Binary, if N (the number of samples) is not an integer, +should this be rounded? Default is TRUE.

    + + +
    analysis_trait
    +

    If multiple traits were studied, name of the trait for +analysis from the GWAS. Default is NULL.

    + + +
    INFO_filter
    +

    numeric The minimum value permissible of the imputation +information score (if present in sumstats file). Default 0.9.

    + + +
    FRQ_filter
    +

    numeric The minimum value permissible of the frequency(FRQ) +of the SNP (i.e. Allele Frequency (AF)) (if present in sumstats file). By +default no filtering is done, i.e. value of 0.

    + + +
    pos_se
    +

    Binary Should the standard Error (SE) column be checked to +ensure it is greater than 0? Those that are not are removed (if present in +sumstats file). Default TRUE.

    + + +
    effect_columns_nonzero
    +

    Binary should the effect columns in the data +BETA,OR (odds ratio),LOG_ODDS,SIGNED_SUMSTAT be checked to ensure no SNP=0. +Those that do are removed (if present in sumstats file). Default FALSE.

    + + +
    N_std
    +

    numeric The number of standard deviations above the mean a SNP's +N is needed to be removed. Default is 5.

    + + +
    N_dropNA
    +

    Drop rows where N is missing. Default is TRUE.

    + + +
    chr_style
    +

    Chromosome naming style to use in the formatted summary +statistics file ("NCBI", "UCSC", "dbSNP", or "Ensembl"). The NCBI and +Ensembl styles both code chromosomes as 1-22, X, Y, MT; the UCSC style is +chr1-chr22, chrX, chrY, chrM; and the dbSNP style is +ch1-ch22, chX, chY, chMT. Default is Ensembl.

    + + +
    rmv_chr
    +

    Chromosomes to exclude from the formatted summary statistics +file. Use NULL if no filtering is necessary. Default is c("X", "Y", "MT") +which removes all non-autosomal SNPs.

    + + +
    on_ref_genome
    +

    Binary Should a check take place that all SNPs are on +the reference genome by SNP ID. Default is TRUE.

    + + +
    infer_eff_direction
    +

    Binary Should a check take place to ensure the +alleles match the effect direction? Default is TRUE.

    + + +
    eff_on_minor_alleles
    +

    Binary Should MungeSumstats assume that the +effects are predominantly measured on the minor alleles? Default is FALSE as +this is an assumption that won't be appropriate in all cases. However, the +benefit is that if we know the majority of SNPs have their effects based on +the minor alleles, we can catch cases where the allele columns have been +mislabelled.

    + + +
    strand_ambig_filter
    +

    Binary Should SNPs with strand-ambiguous alleles +be removed. Default is FALSE.

    + + +
    allele_flip_check
    +

    Binary Should the allele columns be checked against +reference genome to infer if flipping is necessary. Default is TRUE.

    + + +
    allele_flip_drop
    +

    Binary Should the SNPs for which neither their A1 or +A2 base pair values match a reference genome be dropped. Default is TRUE.

    + + +
    allele_flip_z
    +

    Binary should the Z-score be flipped along with effect +and FRQ columns like Beta? It is assumed to be calculated off the effect size +not the P-value and so will be flipped i.e. default TRUE.

    + + +
    allele_flip_frq
    +

    Binary should the frequency (FRQ) column be flipped +along with effect and z-score columns like Beta? Default TRUE.

    + + +
    bi_allelic_filter
    +

    Binary Should non-bi-allelic SNPs be removed. +Default is TRUE.

    + + +
    flip_frq_as_biallelic
    +

    Binary Should non-bi-allelic SNPs frequency +values be flipped as 1-p despite there being other alternative alleles? +Default is FALSE but if set to TRUE, this allows non-bi-allelic SNPs to be +kept despite needing flipping.

    + + +
    snp_ids_are_rs_ids
    +

    Binary Should the supplied SNP ID's be assumed to +be RSIDs. If not, imputation using the SNP ID for other columns like +base-pair position or chromosome will not be possible. If set to FALSE, the +SNP RS ID will be imputed from the reference genome if possible. Default is +TRUE.

    + + +
    remove_multi_rs_snp
    +

    Binary Sometimes summary statistics can have +multiple RSIDs on one row (i.e. related to one SNP), for example +"rs5772025_rs397784053". This can cause an error so by default, the first +RS ID will be kept and the rest removed e.g."rs5772025". If you want to just +remove these SNPs entirely, set it to TRUE. Default is FALSE.

    + + +
    frq_is_maf
    +

    Conventionally the FRQ column is intended to show the +minor/effect allele frequency (MAF) but sometimes the major allele frequency +can be inferred as the FRQ column. This logical variable indicates that the +FRQ column should be renamed to MAJOR_ALLELE_FRQ if the frequency values +appear to relate to the major allele i.e. >0.5. By default this mapping won't +occur i.e. is TRUE.

    + + +
    indels
    +

    Binary does your Sumstats file contain Indels? These don't +exist in our reference file so they will be excluded from checks if this +value is TRUE. Default is TRUE.

    + + +
    drop_indels
    +

    Binary, should any indels found in the sumstats be +dropped? These can not be checked against a reference dataset and will have +the same RS ID and position as SNPs which can affect downstream analysis. +Default is False.

    + + +
    check_dups
    +

    whether to check for duplicates - if formatting QTL +datasets this should be set to FALSE otherwise keep as TRUE. Default is TRUE.

    + + +
    dbSNP
    +

    version of dbSNP to be used for imputation (144 or 155).

    + + +
    write_vcf
    +

    Whether to write as VCF (TRUE) or tabular file (FALSE).

    + + +
    return_format
    +

    If return_data is TRUE. Object type to be returned +("data.table","vranges","granges").

    + + +
    ldsc_format
    +

    DEPRECATED, do not use. Use save_format="LDSC" instead.

    + + +
    save_format
    +

    Output format of sumstats. Options are NULL - standardised +output format from MungeSumstats, LDSC - output format compatible with LDSC +and openGWAS - output compatible with openGWAS VCFs. Default is NULL. +NOTE - If LDSC format is used, the naming convention of A1 as the +reference (genome build) allele and A2 as the effect allele will be reversed +to match LDSC (A1 will now be the effect allele). See more info on this +here. Note that any +effect columns (e.g. Z) will be in relation to A1 now instead of A2.

    + + +
    imputation_ind
    +

    Binary Should a column be added for each imputation +step to show what SNPs have imputed values for differing fields. This +includes a field denoting SNP allele flipping (flipped). On the flipped +value, this denotes whether the alleles were switched based on +MungeSumstats' initial choice of A1, A2 from the input column headers and thus +may not align with what the creator intended. Note these columns will be +in the formatted summary statistics returned. Default is FALSE.

    + + +
    log_folder_ind
    +

    Binary Should log files be stored containing all +filtered out SNPs (separate file per filter). The data is outputted in the +same format specified for the resulting sumstats file. The only exception to +this rule is if output is vcf, then log file saved as .tsv.gz. Default is +FALSE.

    + + +
    log_mungesumstats_msgs
    +

    Binary Should a log be stored containing all +messages and errors printed by MungeSumstats in a run. Default is FALSE

    + + +
    mapping_file
    +

    MungeSumstats has a pre-defined column-name mapping file +which should cover the most common column headers and their interpretations. +However, if a column header that is in your file is missing or the mapping we +give is incorrect you can supply your own mapping file. Must be a 2 column +dataframe with column names "Uncorrected" and "Corrected". See +data(sumstatsColHeaders) for default mapping and necessary format.

    + + +
    tabix_index
    +

    Index the formatted summary statistics with +tabix for fast querying.

    + + +
    chain_source
    +

    source of the chain file to use in liftover, if converting +genome build ("ucsc" or "ensembl"). Note that the UCSC chain files require a +license for commercial use. The Ensembl chain is used by default ("ensembl").

    + + +
    local_chain
    +

    Path to local chain file to use instead of downloading. +Default of NULL i.e. no local file to use. NOTE if passing a local chain file +make sure to specify the path to convert from and to the correct build like +GRCh37 to GRCh38. We can not sense check this for local files. The chain file +can be submitted as a gz file (as downloaded from source) or unzipped.

    + + +
    drop_na_cols
    +

    A character vector of column names to be checked for +missing values. Rows with missing values in any of these columns (if present +in the dataset) will be dropped. If NULL, all columns will be checked for +missing values. Default columns are SNP, chromosome, position, allele 1, +allele2, effect columns (frequency, beta, Z-score, standard error, log odds, +signed sumstats, odds ratio), p value and N columns.

    + + +
    rmv_chrPrefix
    +

    Is now deprecated, do not use. Use chr_style instead - +chr_style = 'Ensembl' will give the same result as rmv_chrPrefix=TRUE used to +give.

    + +
    +
    +

    Value

    + + +

    No return

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/vcf2df.html b/docs/reference/vcf2df.html new file mode 100644 index 00000000..e44d3177 --- /dev/null +++ b/docs/reference/vcf2df.html @@ -0,0 +1,197 @@ + +VCF to DF — vcf2df • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Function to convert a VariantAnnotation +CollapsedVCF/ExpandedVCF +object to a data.frame.

    +
    + +
    +
    vcf2df(
    +  vcf,
    +  add_sample_names = TRUE,
    +  add_rowranges = TRUE,
    +  drop_empty_cols = TRUE,
    +  unique_cols = TRUE,
    +  unique_rows = TRUE,
    +  unlist_cols = TRUE,
    +  sampled_rows = NULL,
    +  verbose = TRUE
    +)
    +
    + +
    +

    Source

    +

    +Original code source

    +

    +

    vcfR

    if(!require("pinfsc50")) install.packages("pinfsc50") +vcf_file <- system.file("extdata", "pinf_sc50.vcf.gz", package = "pinfsc50") +vcf <- read.vcfR( vcf_file, verbose = FALSE ) +vcf_df_list <- vcfR::vcfR2tidy(vcf, single_frame=TRUE) +vcf_df <- data.table::data.table(vcf_df_list$dat)

    +
    +
    +
    +

    Arguments

    +
    vcf
    +

    Variant Call Format (VCF) file imported into R +as a VariantAnnotation +CollapsedVCF/ +ExpandedVCF object.

    + + +
    add_sample_names
    +

    Append sample names to column names +(e.g. "EZ" --> "EZ_ubm-a-2929").

    + + +
    add_rowranges
    +

    Include rowRanges from VCF as well.

    + + +
    drop_empty_cols
    +

    Drop columns that are filled entirely with: +NA, ".", or "".

    + + +
    unique_cols
    +

    Only keep uniquely named columns.

    + + +
    unique_rows
    +

    Only keep unique rows.

    + + +
    unlist_cols
    +

    If any columns are lists instead of vectors, unlist them. +Required to be TRUE when unique_rows=TRUE.

    + + +
    sampled_rows
    +

    First N rows to sample. +Set NULL to use full sumstats_file +when determining whether cols are empty.

    + + +
    verbose
    +

    Print messages.

    + +
    +
    +

    Value

    + + +

    data.frame version of VCF

    +
    + +
    +

    Examples

    +
      
    +#### VariantAnnotation ####
    +# path <- "https://github.com/brentp/vcfanno/raw/master/example/exac.vcf.gz"
    +path <- system.file("extdata", "ALSvcf.vcf",
    +                    package = "MungeSumstats")
    +                    
    +vcf <- VariantAnnotation::readVcf(file = path)
    +vcf_df <- MungeSumstats:::vcf2df(vcf = vcf)
    +#> Converting VCF to data.table.
    +#> Expanding VCF first, so number of rows may increase.
    +#> Checking for empty columns.
    +#> Removing 2 empty columns.
    +#> Unlisting 4 columns.
    +#> Dropped 314 duplicate rows.
    +#> Time difference of 0.1 secs
    +#> VCF data.table contains: 101 rows x 12 columns.
    +
    +
    +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/write_sumstats.html b/docs/reference/write_sumstats.html new file mode 100644 index 00000000..2d74d2c0 --- /dev/null +++ b/docs/reference/write_sumstats.html @@ -0,0 +1,196 @@ + +Write sum stats file to disk — write_sumstats • MungeSumstats + + +
    +
    + + + +
    +
    + + +
    +

    Write sum stats file to disk

    +
    + +
    +
    write_sumstats(
    +  sumstats_dt,
    +  save_path,
    +  ref_genome = NULL,
    +  sep = "\t",
    +  write_vcf = FALSE,
    +  save_format = NULL,
    +  tabix_index = FALSE,
    +  nThread = 1,
    +  return_path = FALSE,
    +  save_path_check = FALSE
    +)
    +
    + + +
    +

    Arguments

    +
    sumstats_dt
    +

    data table obj of the summary statistics +file for the GWAS.

    + + +
    save_path
    +

    File path to save formatted data. Defaults to +tempfile(fileext=".tsv.gz").

    + + +
    ref_genome
    +

    name of the reference genome used for the GWAS ("GRCh37" or +"GRCh38"). Argument is case-insensitive. Default is NULL which infers the +reference genome from the data.

    + + +
    sep
    +

    The separator between columns. Defaults to the character in the set [,\t |;:] that separates the sample of rows into the most number of lines with the same number of fields. Use NULL or "" to specify no separator; i.e. each line a single character column like base::readLines does.

    + + +
    write_vcf
    +

    Whether to write as VCF (TRUE) or tabular file (FALSE).

    + + +
    save_format
    +

    Output format of sumstats. Options are NULL - standardised +output format from MungeSumstats, LDSC - output format compatible with LDSC +and openGWAS - output compatible with openGWAS VCFs. Default is NULL. +NOTE - If LDSC format is used, the naming convention of A1 as the +reference (genome build) allele and A2 as the effect allele will be reversed +to match LDSC (A1 will now be the effect allele). See more info on this +here. Note that any +effect columns (e.g. Z) will be in relation to A1 now instead of A2.

    + + +
    tabix_index
    +

    Index the formatted summary statistics with +tabix for fast querying.

    + + +
    nThread
    +

    The number of threads to use. Experiment to see what works best for your data on your hardware.

    + + +
    return_path
    +

    Return save_path. +This will have been modified in some cases +(e.g. after compressing and tabix-indexing a +previously un-compressed file).

    + + +
    save_path_check
    +

    Ensure path name is valid (given the other arguments) +before writing (default: FALSE).

    + +
    +
    +

    Value

    + + +

    If return_path=TRUE, returns save_path. +Else returns NULL.

    +
    + +
    +

    Examples

    +
    path <- system.file("extdata", "eduAttainOkbay.txt",
    +    package = "MungeSumstats"
    +)
    +eduAttainOkbay <- read_sumstats(path = path)
    +#> Importing tabular file: /private/var/folders/hd/jm8lzp7s4dl_wlkykzhz66x80000gn/T/RtmpKLvRpi/temp_libpath17f3d19176b21/MungeSumstats/extdata/eduAttainOkbay.txt
    +#> Checking for empty columns.
    +write_sumstats(
    +    sumstats_dt = eduAttainOkbay,
    +    save_path = tempfile(fileext = ".tsv.gz")
    +)
    +#> Writing in tabular format ==> /var/folders/hd/jm8lzp7s4dl_wlkykzhz66x80000gn/T//Rtmp4DII6I/filec16d7adaa0e3.tsv.gz
    +
    +
    +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/sitemap.xml b/docs/sitemap.xml new file mode 100644 index 00000000..ac17bd90 --- /dev/null +++ b/docs/sitemap.xml @@ -0,0 +1,399 @@ + + + + /404.html + + + /articles/MungeSumstats.html + + + /articles/OpenGWAS.html + + + /articles/docker.html + + + /articles/index.html + + + /authors.html + + + /index.html + + + /news/index.html + + + /reference/DF_to_dt.html + + + /reference/axel.html + + + /reference/check_allele_flip.html + + + /reference/check_allele_merge.html + + + /reference/check_bi_allelic.html + + + /reference/check_bp_range.html + + + /reference/check_chr.html + + + /reference/check_col_order.html + + + /reference/check_drop_indels.html + + + /reference/check_dup_bp.html + + + /reference/check_dup_col.html + + + /reference/check_dup_row.html + + + /reference/check_dup_snp.html + + + /reference/check_effect_columns_nonzero.html + + + /reference/check_empty_cols.html + + + /reference/check_four_step_col.html + + + /reference/check_frq.html + + + /reference/check_frq_maf.html + + + /reference/check_info_score.html + + + /reference/check_ldsc_format.html + + + /reference/check_miss_data.html + + + /reference/check_multi_gwas.html + + + /reference/check_multi_rs_snp.html + + + /reference/check_n_int.html + + + /reference/check_n_num.html + + + /reference/check_no_allele.html + + + /reference/check_no_chr_bp.html + + + /reference/check_no_rs_snp.html + + + /reference/check_no_snp.html + + + /reference/check_numeric.html + + + /reference/check_on_ref_genome.html + + + /reference/check_pos_se.html + + + /reference/check_range_p_val.html + + + /reference/check_row_snp.html + + + /reference/check_save_path.html + + + /reference/check_signed_col.html + + + /reference/check_small_p_val.html + + + /reference/check_strand_ambiguous.html + + + /reference/check_tabular.html + + + /reference/check_two_step_col.html + + + /reference/check_vcf.html + + + /reference/check_vital_col.html + + + /reference/check_zscore.html + + + 
/reference/column_dictionary.html + + + /reference/compute_nsize.html + + + /reference/compute_sample_size.html + + + /reference/compute_sample_size_n.html + + + /reference/compute_sample_size_neff.html + + + /reference/convert_sumstats.html + + + /reference/download_vcf.html + + + /reference/downloader.html + + + /reference/drop_duplicate_cols.html + + + /reference/drop_duplicate_rows.html + + + /reference/find_sumstats.html + + + /reference/format_sumstats.html + + + /reference/formatted_example.html + + + /reference/get_chain_file.html + + + /reference/get_eff_frq_allele_combns.html + + + /reference/get_genome_build.html + + + /reference/get_genome_builds.html + + + /reference/get_unique_name_log_file.html + + + /reference/get_vcf_sample_ids.html + + + /reference/granges_to_dt.html + + + /reference/hg19ToHg38.html + + + /reference/hg38ToHg19.html + + + /reference/ieu-a-298.html + + + /reference/import_sumstats.html + + + /reference/index.html + + + /reference/index_tabular.html + + + /reference/index_vcf.html + + + /reference/infer_effect_column.html + + + /reference/is_tabix.html + + + /reference/liftover.html + + + /reference/list_sumstats.html + + + /reference/load_ref_genome_data.html + + + /reference/load_snp_loc_data.html + + + /reference/logs_example.html + + + /reference/make_allele_upper.html + + + /reference/message_parallel.html + + + /reference/messager.html + + + /reference/parse_dropped_INFO.html + + + /reference/parse_dropped_chrom.html + + + /reference/parse_dropped_duplicates.html + + + /reference/parse_dropped_nonA1A2.html + + + /reference/parse_dropped_nonBiallelic.html + + + /reference/parse_dropped_nonRef.html + + + /reference/parse_flipped.html + + + /reference/parse_genome_build.html + + + /reference/parse_idStandard.html + + + /reference/parse_logs.html + + + /reference/parse_pval_large.html + + + /reference/parse_pval_neg.html + + + /reference/parse_pval_small.html + + + /reference/parse_report.html + + + 
/reference/parse_snps_freq_05.html + + + /reference/parse_snps_not_formatted.html + + + /reference/parse_time.html + + + /reference/preview_sumstats.html + + + /reference/raw_ALSvcf.html + + + /reference/raw_eduAttainOkbay.html + + + /reference/read_header.html + + + /reference/read_log_pval.html + + + /reference/read_sumstats.html + + + /reference/read_vcf.html + + + /reference/read_vcf_genome.html + + + /reference/read_vcf_info.html + + + /reference/read_vcf_markername.html + + + /reference/read_vcf_parallel.html + + + /reference/register_cores.html + + + /reference/remove_empty_cols.html + + + /reference/report_summary.html + + + /reference/select_vcf_fields.html + + + /reference/sort_coord_genomicranges.html + + + /reference/sort_coords.html + + + /reference/sort_coords_datatable.html + + + /reference/standardise_header.html + + + /reference/sumstatsColHeaders.html + + + /reference/supported_suffixes.html + + + /reference/to_granges.html + + + /reference/to_vranges.html + + + /reference/unlist_dt.html + + + /reference/validate_parameters.html + + + /reference/vcf2df.html + + + /reference/write_sumstats.html + +