From 8a87fe285535b35f2fd3ee8e8e6c3bf2384e0af7 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Mon, 23 Dec 2024 23:49:28 +0100 Subject: [PATCH] nice groff --- external/popl/popl.hpp | 12 ++-- src/Cmd.cpp | 133 ++++++++++++++++++++--------------------- src/FilePlink.cpp | 4 +- src/Halko.cpp | 31 ++++------ src/Halko.hpp | 2 +- src/Main.cpp | 6 +- 6 files changed, 92 insertions(+), 96 deletions(-) diff --git a/external/popl/popl.hpp b/external/popl/popl.hpp index 9cce497..ef6b10b 100644 --- a/external/popl/popl.hpp +++ b/external/popl/popl.hpp @@ -1168,7 +1168,7 @@ inline std::string ConsoleOptionPrinter::print(const Attribute& max_attribute) c std::stringstream s; if (!option_parser_->description().empty()) - s << option_parser_->description() << ":\n"; + s << option_parser_->description() << "\n"; size_t optionRightMargin(20); const size_t maxDescriptionLeftMargin(40); @@ -1263,15 +1263,19 @@ inline std::string GroffOptionPrinter::print(const Attribute& max_attribute) con std::stringstream s; if (!option_parser_->description().empty()) - s << ".SS " << option_parser_->description() << ":\n"; + s << ".SH DESCRIPTION\n.PP\n" << option_parser_->description() << "\n"; for (const auto& option : option_parser_->options()) { if ((option->attribute() <= Attribute::hidden) || (option->attribute() > max_attribute)) continue; - s << ".TP\n\\fB" << to_string(option) << "\\fR\n"; - if (!option->description().empty()) + if (option->attribute() != Attribute::headline){ + s << ".TP\n\\fB" << to_string(option) << "\\fR\n"; + if (!option->description().empty()) s << option->description() << "\n"; + } else{ + s << ".SH\n" << option->description() << "\n"; + } } return s.str(); diff --git a/src/Cmd.cpp b/src/Cmd.cpp index edda6b6..b8c0505 100644 --- a/src/Cmd.cpp +++ b/src/Cmd.cpp @@ -15,96 +15,93 @@ Param::Param(int argc, char **argv) { bool haploid = false; std::string copyr{"PCA All In One (v" + (std::string)VERSION + ") https://github.com/Zilong-Li/PCAone\n" + "(C) 2021-2024 Zilong Li GNU General Public License v3\n" + - "\x1B[32m\n" + + "\n" + "Usage: use plink files as input and apply default window-based RSVD method\n" + " PCAone --bfile plink -n 20 \n\n" + " use csv file as input and apply the Implicitly Restarted Arnoldi Method\n" + " PCAone --csv csv.zst --svd 0 \n" + - "\033[0m\n"}; - OptionParser opts(copyr + "General options"); + "\n"}; + OptionParser opts(copyr); + opts.add, Attribute::headline>("","PCAone","General options:"); auto help_opt = opts.add("h", "help", "print all options including hidden advanced options"); opts.add>("m", "memory", "RAM usage in GB unit for out-of-core mode. default is in-core mode", memory, &memory); opts.add>("n", "threads", "the number of threads to be used", threads, &threads); - opts.add>("v", "verbose", "verbose level.\n" - "0: no message on screen\n" - "1: print messages to screen\n" - "2: enable verbose information\n" - "3: enable debug information" + opts.add>("v", "verbose", "verbosity level for logs. any level x includes messages for all levels (1...x).\n" + "0: silent. no message on screen;\n" + "1: concise messages to screen;\n" + "2: more verbose information;\n" + "3: enable debug information." , verbose, &verbose); opts.add, Attribute::headline>("","PCA","PCA algorithms:"); auto svd_opt = opts.add>("d", "svd", "SVD method to be applied. default 2 is recommended for big data.\n" - "0: the Implicitly Restarted Arnoldi Method (IRAM)\n" - "1: the Yu's single-pass Randomized SVD with power iterations\n" - "2: the accurate window-based Randomized SVD method (PCAone)\n" - "3: the full Singular Value Decomposition.", 2); + "0: the Implicitly Restarted Arnoldi Method (IRAM);\n" + "1: the Yu's single-pass Randomized SVD with power iterations;\n" + "2: the accurate window-based Randomized SVD method (PCAone);\n" + "3: the full Singular Value Decomposition.", 2); opts.add>("k", "pc", "top k principal components (PCs) to be calculated", k, &k); opts.add>("C", "scale", "do scaling for input file.\n" - "0: do just centering\n" - "1: do log transformation eg. log(x+0.01) for RNA-seq data\n" - "2: do count per median log transformation (CPMED) for scRNAs", - scale, &scale); - opts.add>("p", "maxp", "maximum number of power iterations for RSVD algorithm", maxp, &maxp); - opts.add("S", "no-shuffle", "do not shuffle columns of data for --svd 2 (if not locally correlated)", &noshuffle); - opts.add, Attribute::advanced>("w", "batches", "the number of mini-batches used by --svd 2", bands, &bands); - opts.add("", "emu", "use EMU algorithm for genotype input with missingness", &emu); - opts.add("", "pcangsd", "use PCAngsd algorithm for genotype likelihood input", &pcangsd); - opts.add, Attribute::advanced>("", "M", "the number of features (eg. SNPs) if already known", 0, &nsnps); - opts.add, Attribute::advanced>("", "N", "the number of samples if already known", 0, &nsamples); - opts.add, Attribute::advanced>("", "buffer", "memory buffer in GB unit for permuting the data", buffer, &buffer); - opts.add, Attribute::advanced>("", "imaxiter", "maximum number of IRAM iterations", imaxiter, &imaxiter); - opts.add, Attribute::advanced>("", "itol", "stopping tolerance for IRAM algorithm", itol, &itol); - opts.add, Attribute::advanced>("", "ncv", "the number of Lanzcos basis vectors for IRAM", ncv, &ncv); - opts.add, Attribute::advanced>("", "oversamples", "the number of oversampling columns for RSVD", oversamples, &oversamples); - opts.add, Attribute::advanced>("", "rand", "the random matrix type. 0: uniform, 1: guassian", rand, &rand); - opts.add, Attribute::advanced>("", "maxiter", "maximum number of EM iterations", maxiter, &maxiter); - opts.add, Attribute::advanced>("", "tol-rsvd", "tolerance for RSVD algorithm", tol, &tol); - opts.add, Attribute::advanced>("", "tol-em", "tolerance for EMU/PCAngsd algorithm", tolem, &tolem); - opts.add, Attribute::advanced>("", "tol-maf", "tolerance for MAF estimation by EM", tolmaf, &tolmaf); + "0: do just centering;\n" + "1: do log transformation eg. log(x+0.01) for RNA-seq data;\n" + "2: do count per median log transformation (CPMED) for scRNAs.", scale, &scale); + opts.add>("p", "maxp", "maximum number of power iterations for RSVD algorithm.", maxp, &maxp); + opts.add("S", "no-shuffle", "do not shuffle columns of data for --svd 2 (if not locally correlated).", &noshuffle); + opts.add, Attribute::advanced>("w", "batches", "the number of mini-batches used by --svd 2.", bands, &bands); + opts.add("", "emu", "use EMU algorithm for genotype input with missingness.", &emu); + opts.add("", "pcangsd", "use PCAngsd algorithm for genotype likelihood input.", &pcangsd); + opts.add, Attribute::advanced>("", "M", "the number of features (eg. SNPs) if already known.", 0, &nsnps); + opts.add, Attribute::advanced>("", "N", "the number of samples if already known.", 0, &nsamples); + opts.add, Attribute::advanced>("", "buffer", "memory buffer in GB unit for permuting the data.", buffer, &buffer); + opts.add, Attribute::advanced>("", "imaxiter", "maximum number of IRAM iterations.", imaxiter, &imaxiter); + opts.add, Attribute::advanced>("", "itol", "stopping tolerance for IRAM algorithm.", itol, &itol); + opts.add, Attribute::advanced>("", "ncv", "the number of Lanzcos basis vectors for IRAM.", ncv, &ncv); + opts.add, Attribute::advanced>("", "oversamples", "the number of oversampling columns for RSVD.", oversamples, &oversamples); + opts.add, Attribute::advanced>("", "rand", "the random matrix type. 0: uniform; 1: guassian.", rand, &rand); + opts.add, Attribute::advanced>("", "maxiter", "maximum number of EM iterations.", maxiter, &maxiter); + opts.add, Attribute::advanced>("", "tol-rsvd", "tolerance for RSVD algorithm.", tol, &tol); + opts.add, Attribute::advanced>("", "tol-em", "tolerance for EMU/PCAngsd algorithm.", tolem, &tolem); + opts.add, Attribute::advanced>("", "tol-maf", "tolerance for MAF estimation by EM.", tolmaf, &tolmaf); opts.add, Attribute::headline>("","INPUT","Input options:"); - auto plinkfile = opts.add>("b", "bfile", "prefix of PLINK .bed/.bim/.fam files", "", &filein); - opts.add("", "haploid", "the plink format represents haploid data", &haploid); - auto binfile = opts.add>("B", "binary", "path of binary file", "", &filein); - auto csvfile = opts.add>("c", "csv", "path of comma seperated CSV file compressed by zstd", "", &filein); - auto bgenfile = opts.add>("g", "bgen", "path of BGEN file compressed by gzip/zstd", "", &filein); - auto beaglefile = opts.add>("G", "beagle", "path of BEAGLE file compressed by gzip", "", &filein); - opts.add>("f", "match-bim", "the .mbim file to be matched, where the 7th column is allele frequency", "", &filebim); - auto usvprefix = opts.add>("", "USV", "prefix of PCAone .eigvecs/.eigvals/.loadings/.mbim"); - opts.add, Attribute::hidden>("", "read-U", "path of file with left singular vectors (.eigvecs)", "", &fileU); - opts.add, Attribute::hidden>("", "read-V", "path of file with right singular vectors (.loadings)", "", &fileV); - opts.add, Attribute::hidden>("", "read-S", "path of file with eigen values (.eigvals)", "", &fileS); + auto plinkfile = opts.add>("b", "bfile", "prefix of PLINK .bed/.bim/.fam files.", "", &filein); + opts.add("", "haploid", "the plink format represents haploid data.", &haploid); + auto binfile = opts.add>("B", "binary", "path of binary file.", "", &filein); + auto csvfile = opts.add>("c", "csv", "path of comma seperated CSV file compressed by zstd.", "", &filein); + auto bgenfile = opts.add>("g", "bgen", "path of BGEN file compressed by gzip/zstd.", "", &filein); + auto beaglefile = opts.add>("G", "beagle", "path of BEAGLE file compressed by gzip.", "", &filein); + opts.add>("f", "match-bim", "the .mbim file to be matched, where the 7th column is allele frequency.", "", &filebim); + auto usvprefix = opts.add>("", "USV", "prefix of PCAone .eigvecs/.eigvals/.loadings/.mbim."); + opts.add, Attribute::hidden>("", "read-U", "path of file with left singular vectors (.eigvecs).", "", &fileU); + opts.add, Attribute::hidden>("", "read-V", "path of file with right singular vectors (.loadings).", "", &fileV); + opts.add, Attribute::hidden>("", "read-S", "path of file with eigen values (.eigvals).", "", &fileS); opts.add, Attribute::headline>("","OUTPUT","Output options:"); - opts.add>("o", "out", "prefix of output files. default [pcaone]", fileout, &fileout); - opts.add("V", "printv", "output the right eigenvectors with suffix .loadings", &printv); - opts.add("D", "ld", "output a binary matrix for downstream LD related analysis", &ld); - opts.add("R", "print-r2", "print LD r2 to *.ld.gz file for pairwise SNPs within a window", &print_r2); + opts.add>("o", "out", "prefix of output files. default [pcaone].", fileout, &fileout); + opts.add("V", "printv", "output the right eigenvectors with suffix .loadings.", &printv); + opts.add("D", "ld", "output a binary matrix for downstream LD related analysis.", &ld); + opts.add("R", "print-r2", "print LD r2 to *.ld.gz file for pairwise SNPs within a window.", &print_r2); opts.add, Attribute::headline>("","MISC","Misc options:"); opts.add>("", "maf", "exclude variants with MAF lower than this value", maf, &maf); opts.add>("", "project", "project the new samples onto the existing PCs.\n" - "0: disabled\n" - "1: by multiplying the loadings with mean imputation for missing genotypes\n" - "2: by solving the least squares system Vx=g. skip sites with missingness\n" - "3: by Augmentation, Decomposition and Procrusters transformation\n", - project, &project); + "0: disabled;\n" + "1: by multiplying the loadings with mean imputation for missing genotypes;\n" + "2: by solving the least squares system Vx=g. skip sites with missingness;\n" + "3: by Augmentation, Decomposition and Procrusters transformation.\n", project, &project); opts.add>("", "inbreed", "compute the inbreeding coefficient accounting for population structure.\n" - "0: disabled\n" - "1: compute per-site inbreeding coefficient and HWE test\n", - inbreed, &inbreed); - opts.add>("", "ld-r2", "r2 cutoff for LD-based pruning. (usually 0.2)", ld_r2, &ld_r2); - opts.add>("", "ld-bp", "physical distance threshold in bases for LD. (usually 1000000)", ld_bp, &ld_bp); + "0: disabled;\n" + "1: compute per-site inbreeding coefficient and HWE test.\n", inbreed, &inbreed); + opts.add>("", "ld-r2", "r2 cutoff for LD-based pruning (usually 0.2).", ld_r2, &ld_r2); + opts.add>("", "ld-bp", "physical distance threshold in bases for LD (usually 1000000).", ld_bp, &ld_bp); opts.add>("", "ld-stats", "statistics to calculate LD r2 for pairwise SNPs.\n" - "0: the ancestry adjusted, i.e. correlation between residuals\n" - "1: the standard, i.e. correlation between two alleles\n", - ld_stats, &ld_stats); - auto clumpfile = opts.add>("", "clump", "assoc-like file with target variants and pvalues for clumping", "", &clump); - auto assocnames = opts.add>("", "clump-names", "column names in assoc-like file for locating chr, pos and pvalue", "CHR,BP,P", &assoc_colnames); - opts.add>("", "clump-p1", "significance threshold for index SNPs", clump_p1, &clump_p1); - opts.add>("", "clump-p2", "secondary significance threshold for clumped SNPs", clump_p2, &clump_p2); - opts.add>("", "clump-r2", "r2 cutoff for LD-based clumping", clump_r2, &clump_r2); - opts.add>("", "clump-bp", "physical distance threshold in bases for clumping", clump_bp, &clump_bp); - opts.add("", "groff", "print groff formatted help message", &groff); + "0: the ancestry adjusted, i.e. correlation between residuals;\n" + "1: the standard, i.e. correlation between two alleles.\n", ld_stats, &ld_stats); + auto clumpfile = opts.add>("", "clump", "assoc-like file with target variants and pvalues for clumping.", "", &clump); + auto assocnames = opts.add>("", "clump-names", "column names in assoc-like file for locating chr, pos and pvalue.", "CHR,BP,P", &assoc_colnames); + opts.add>("", "clump-p1", "significance threshold for index SNPs.", clump_p1, &clump_p1); + opts.add>("", "clump-p2", "secondary significance threshold for clumped SNPs.", clump_p2, &clump_p2); + opts.add>("", "clump-r2", "r2 cutoff for LD-based clumping.", clump_r2, &clump_r2); + opts.add>("", "clump-bp", "physical distance threshold in bases for clumping.", clump_bp, &clump_bp); + opts.add("", "groff", "print groff formatted help message.", &groff); // collect command line options acutal in effect ss << (std::string) "PCAone (v" + VERSION + ") https://github.com/Zilong-Li/PCAone\n"; diff --git a/src/FilePlink.cpp b/src/FilePlink.cpp index f889aca..bdfcf0d 100644 --- a/src/FilePlink.cpp +++ b/src/FilePlink.cpp @@ -174,8 +174,8 @@ void FileBed::read_block_initial(uint64 start_idx, uint64 stop_idx, bool standar // should remove sites with F=0 and 1.0 if (F(snp_idx) == 0.0 || F(snp_idx) == 1.0) cao.error("sites with MAF=0 found! remove them first!"); // in LD r2,F=0.5 means sample standard deviation is 0 - if (params.verbose && F(snp_idx) == 0.5) - cao.warn("sites with MAF=0.5 found. NaN values expected in LD r2."); + if (params.verbose > 1 && F(snp_idx) == 0.5) + cao.warn("MAF for site ", snp_idx, "is 0.5. NaN values expected in calculating LD R2."); // do centering and initialing centered_geno_lookup(1, snp_idx) = 0.0; // missing centered_geno_lookup(0, snp_idx) = BED2GENO[0] - F(snp_idx); // minor hom diff --git a/src/Halko.cpp b/src/Halko.cpp index 4740689..9948a78 100644 --- a/src/Halko.cpp +++ b/src/Halko.cpp @@ -17,6 +17,7 @@ void RsvdOpData::initOmg() { Omg = PCAone::StandardNormalRandom(cols(), size(), rng); else Omg = PCAone::UniformRandom(cols(), size(), rng); + Omg2 = Omg; } Mat2D RsvdOpData::computeU(const Mat2D& G, const Mat2D& H) { @@ -195,7 +196,7 @@ void FancyRsvdOpData::computeGandH(Mat2D& G, Mat2D& H, int pi) { H = H1 + H2; Eigen::HouseholderQR qr(H); Omg.noalias() = qr.householderQ() * Mat2D::Identity(cols(), size); - // PCAone::flipOmg(Omg2, Omg); + PCAone::flipOmg(Omg2, Omg); H2.setZero(); } } else if (i <= bandsize / 2) { @@ -212,6 +213,7 @@ void FancyRsvdOpData::computeGandH(Mat2D& G, Mat2D& H, int pi) { H = H1 + H2; Eigen::HouseholderQR qr(H); Omg.noalias() = qr.householderQ() * Mat2D::Identity(cols(), size); + PCAone::flipOmg(Omg2, Omg); if (i == bandsize) { H1.setZero(); i = 0; @@ -248,7 +250,7 @@ void FancyRsvdOpData::computeGandH(Mat2D& G, Mat2D& H, int pi) { H = H1 + H2; Eigen::HouseholderQR qr(H); Omg.noalias() = qr.householderQ() * Mat2D::Identity(cols(), size); - // PCAone::flipOmg(Omg2, Omg); + PCAone::flipOmg(Omg2, Omg); H2.setZero(); } } else if (i <= bandsize / 2) { @@ -257,25 +259,18 @@ void FancyRsvdOpData::computeGandH(Mat2D& G, Mat2D& H, int pi) { H2.noalias() += data->G * G.middleRows(start_idx, actual_block_size); } if ((b + 1) < bandsize) continue; - - if (i == bandsize) { + // add up H and update Omg + if ((i == bandsize) || (i == bandsize / 2)) { H = H1 + H2; Eigen::HouseholderQR qr(H); Omg.noalias() = qr.householderQ() * Mat2D::Identity(cols(), size); - // PCAone::flipOmg(Omg2, Omg); - H1 = Mat2D::Zero(cols(), size); - i = 0; - } else if (i == bandsize / 2) { - H = H1 + H2; - Eigen::HouseholderQR qr(H); - Omg.noalias() = qr.householderQ() * Mat2D::Identity(cols(), size); - // PCAone::flipOmg(Omg2, Omg); - H2 = Mat2D::Zero(cols(), size); - } else if ((b + 1) == data->nblocks) { - H = H1 + H2; - Eigen::HouseholderQR qr(H); - Omg.noalias() = qr.householderQ() * Mat2D::Identity(cols(), size); - // PCAone::flipOmg(Omg2, Omg); + PCAone::flipOmg(Omg2, Omg); + if (i == bandsize) { + H1.setZero(); + i = 0; + } else if (i == bandsize / 2) { + H2.setZero(); + } } } } diff --git a/src/Halko.hpp b/src/Halko.hpp index 8f7d1e0..e858909 100644 --- a/src/Halko.hpp +++ b/src/Halko.hpp @@ -9,7 +9,7 @@ class RsvdOpData { Data* data; using Index = Eigen::Index; bool update = false, standardize = false, reset = true; - Mat2D U, Omg; // nsamples x nk + Mat2D U, Omg, Omg2; // nsamples x nk Mat2D V; // nsnps x nk Mat1D S; // nk x 1 diff --git a/src/Main.cpp b/src/Main.cpp index f04e5a8..9e65f6e 100755 --- a/src/Main.cpp +++ b/src/Main.cpp @@ -27,7 +27,7 @@ using namespace std; static int bye() { - cao.print(tick.date(), "total elapsed wall time:", tick.abstime(), "seconds"); + cao.print(tick.date(), "total elapsed wall time:", tick.abstime(), " seconds"); cao.print(tick.date(), "have a nice day. bye!"); return 0; } @@ -92,7 +92,7 @@ int main(int argc, char* argv[]) { } else { cao.error("wrong file type used!"); } - cao.print(tick.date(), "elapsed time of permuting data:", tick.reltime(), "seconds"); + cao.print(tick.date(), "elapsed time of permuting data:", tick.reltime(), " seconds"); } else { if (params.file_t == FileType::PLINK) { data = new FileBed(params); @@ -132,6 +132,6 @@ int main(int argc, char* argv[]) { make_plink2_eigenvec_file(params.k, params.fileout + ".eigvecs2", params.fileout + ".eigvecs", params.filein + ".fam"); - cao.print(tick.date(), "total elapsed reading time: ", data->readtime, "seconds"); + cao.print(tick.date(), "total elapsed reading time: ", data->readtime, " seconds"); return bye(); }