From 1468e5480bc7335bf8d46dd4d3a73c610ed20cde Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Sun, 22 Dec 2024 12:34:53 +0100 Subject: [PATCH] add stuff --- src/Cmd.cpp | 14 ++-- src/Cmd.hpp | 1 - src/FilePlink.cpp | 12 ++-- src/Halko.cpp | 162 ++++++++++++++++++++++++---------------------- src/Halko.hpp | 6 ++ 5 files changed, 102 insertions(+), 93 deletions(-) diff --git a/src/Cmd.cpp b/src/Cmd.cpp index cc0a175..aaf5de2 100644 --- a/src/Cmd.cpp +++ b/src/Cmd.cpp @@ -28,7 +28,8 @@ Param::Param(int argc, char **argv) { opts.add>("v", "verbose", "verbose level.\n" "0: no message on screen\n" "1: print messages to screen\n" - "2: enable debug information" + "2: enable verbose information\n" + "3: enable debug information" , verbose, &verbose); opts.add, Attribute::headline>("","PCA","PCA algorithms:"); auto svd_opt = opts.add>("d", "svd", "SVD method to be applied. default 2 is recommended for big data.\n" @@ -59,7 +60,6 @@ Param::Param(int argc, char **argv) { opts.add, Attribute::advanced>("", "tol-rsvd", "tolerance for RSVD algorithm", tol, &tol); opts.add, Attribute::advanced>("", "tol-em", "tolerance for EMU/PCAngsd algorithm", tolem, &tolem); opts.add, Attribute::advanced>("", "tol-maf", "tolerance for MAF estimation by EM", tolmaf, &tolmaf); - opts.add("", "printu", "output eigen vector of each epoch (for tests)", &printu); opts.add, Attribute::headline>("","INPUT","Input options:"); auto plinkfile = opts.add>("b", "bfile", "prefix of PLINK .bed/.bim/.fam files", "", &filein); @@ -70,15 +70,15 @@ Param::Param(int argc, char **argv) { auto beaglefile = opts.add>("G", "beagle", "path of BEAGLE file compressed by gzip", "", &filein); opts.add>("f", "match-bim", "the .mbim file to be matched, where the 7th column is allele frequency", "", &filebim); auto usvprefix = opts.add>("", "USV", "prefix of PCAone .eigvecs/.eigvals/.loadings/.mbim"); - opts.add, Attribute::advanced>("", "read-U", "path of file with left singular vectors (.eigvecs)", "", &fileU); - opts.add, Attribute::advanced>("", "read-V", "path of file with right singular vectors (.loadings)", "", &fileV); - opts.add, Attribute::advanced>("", "read-S", "path of file with eigen values (.eigvals)", "", &fileS); + opts.add, Attribute::hidden>("", "read-U", "path of file with left singular vectors (.eigvecs)", "", &fileU); + opts.add, Attribute::hidden>("", "read-V", "path of file with right singular vectors (.loadings)", "", &fileV); + opts.add, Attribute::hidden>("", "read-S", "path of file with eigen values (.eigvals)", "", &fileS); opts.add, Attribute::headline>("","OUTPUT","Output options:"); opts.add>("o", "out", "prefix of output files. default [pcaone]", fileout, &fileout); opts.add("V", "printv", "output the right eigenvectors with suffix .loadings", &printv); - opts.add("", "ld", "output a binary matrix for downstream LD related analysis", &ld); - opts.add("", "print-r2", "print LD r2 to *.ld.gz file for pairwise SNPs within a window", &print_r2); + opts.add("D", "ld", "output a binary matrix for downstream LD related analysis", &ld); + opts.add("R", "print-r2", "print LD r2 to *.ld.gz file for pairwise SNPs within a window", &print_r2); opts.add, Attribute::headline>("","MISC","Misc options:"); opts.add>("", "maf", "exclude variants with MAF lower than this value", maf, &maf); diff --git a/src/Cmd.hpp b/src/Cmd.hpp index b338cbb..413a334 100644 --- a/src/Cmd.hpp +++ b/src/Cmd.hpp @@ -73,7 +73,6 @@ class Param { bool groff = false; bool cpmed = false; bool printv = false; - bool printu = false; bool impute = false; bool noshuffle = false; bool emu = false; diff --git a/src/FilePlink.cpp b/src/FilePlink.cpp index f0ed7f5..f889aca 100644 --- a/src/FilePlink.cpp +++ b/src/FilePlink.cpp @@ -77,14 +77,14 @@ void FileBed::read_all() { for (k = 0; k < 4; ++k, ++j) { if (j < nsamples) { G(j, i) = BED2GENO[buf & 3]; - if (G(j, i) != BED_MISSING_VALUE) { - // 0 indicate G(i,j) don't need to be predicted. - if (params.impute) C[i * nsamples + j] = 0; - } else { + buf >>= 2; + if (params.impute) { // 1 indicate G(i,j) need to be predicted and updated. - if (params.impute) C[i * nsamples + j] = 1; + if (G(j, i) != BED_MISSING_VALUE) + C[i * nsamples + j] = 0; + else + C[i * nsamples + j] = 1; } - buf >>= 2; } } } diff --git a/src/Halko.cpp b/src/Halko.cpp index 7aac80d..23aba6a 100644 --- a/src/Halko.cpp +++ b/src/Halko.cpp @@ -6,9 +6,9 @@ #include "Halko.hpp" -#include -#include +#include // std::this_thread::sleep_for +#include "Common.hpp" #include "Utils.hpp" using namespace std; @@ -45,6 +45,17 @@ void RsvdOpData::computeUSV(int p, double tol) { for (int pi = 0; pi <= p; ++pi) { computeGandH(G, H, pi); if (data->params.fancyem) { + if (data->params.verbose > 2) { + std::filesystem::path dirpath = "fancy-tests"; + if (!std::filesystem::exists(dirpath)) { + std::filesystem::create_directory(dirpath); + cao.print("dir created:", dirpath); + } + std::filesystem::path curfile = dirpath / tick.intime(); + std::ofstream logu(curfile); + logu << U; + std::this_thread::sleep_for(std::chrono::seconds(1)); + } continue; } // check if converged @@ -153,8 +164,7 @@ void FancyRsvdOpData::computeGandH(Mat2D& G, Mat2D& H, int pi) { if (H.cols() != size || H.rows() != cols() || G.cols() != size || G.rows() != rows()) { cao.error("the size of G or H doesn't match with each other."); } - if (pi == 0) { - auto rng = std::default_random_engine{}; + if (pi == 0 && !data->params.fancyem) { if (data->params.rand) Omg = PCAone::StandardNormalRandom(data->nsamples, size, rng); else @@ -187,6 +197,7 @@ void FancyRsvdOpData::computeGandH(Mat2D& G, Mat2D& H, int pi) { } // bandsize: how many blocks in each band, 2, 4, 8, 16, 32, 64, ... bandsize = fmin(bandsize * 2, data->params.bands); + bool saving = false; // b: the index of current block for (uint b = 0, i = 1, j = 1; b < data->params.bands; ++b, ++i, ++j) { start_idx = b * blocksize; @@ -199,6 +210,7 @@ void FancyRsvdOpData::computeGandH(Mat2D& G, Mat2D& H, int pi) { data->G.middleCols(start_idx, actual_block_size) * G.middleRows(start_idx, actual_block_size); // additional complementary power iteration for last read if (j == std::pow(2, pi - 1)) { + saving = true; H = H1 + H2; Eigen::HouseholderQR qr(H); Omg.noalias() = qr.householderQ() * Mat2D::Identity(cols(), size); @@ -216,38 +228,30 @@ void FancyRsvdOpData::computeGandH(Mat2D& G, Mat2D& H, int pi) { if ((b + 1) < bandsize) continue; // add up H and update Omg if ((i == bandsize) || (i == bandsize / 2)) { + saving = true; H = H1 + H2; Eigen::HouseholderQR qr(H); Omg.noalias() = qr.householderQ() * Mat2D::Identity(cols(), size); PCAone::flipOmg(Omg2, Omg); - // first find out which blocks/bands have been used - uint wb = b / bandsize, s = 0, e = 0; if (i == bandsize) { H1.setZero(); i = 0; - s = blocksize * bandsize * wb; - e = s + blocksize * bandsize; } else if (i == bandsize / 2) { H2.setZero(); - s = blocksize * bandsize * wb - blocksize * bandsize / 2; - e = s + blocksize * bandsize; } + } + + if (saving) { + saving = false; // reset + // find out which blocks/bands have been used + uint e = blocksize * j; // end point + uint s = e - blocksize * bandsize; e = e >= data->nsnps ? data->nsnps - 1 : e - 1; // bound check if (data->params.verbose > 1) - cao.print(b, ",", i, ",", j, ",", bandsize, ",s:", s, ",e:", e, ",wb:", wb); + cao.print(b, ",", bandsize, ",", i == 0 ? bandsize : i, ",", j, ",s:", s, ",e:", e, ",pi:", pi); if (data->params.fancyem) { // update estimates rightaway U = computeU(G.middleRows(s, e - s + 1), H); // only use G in sliding window - if (data->params.printu) { - std::filesystem::path dirpath = "fancy-tests"; - if (!std::filesystem::exists(dirpath)) { - std::filesystem::create_directory(dirpath); - cao.print("dir created:", dirpath); - } - std::filesystem::path curfile = dirpath / tick.intime(); - std::ofstream logu(curfile); - logu << U; - } data->predict_missing_E(U, s, e); } } @@ -262,63 +266,62 @@ void FancyRsvdOpData::computeGandH(Mat2D& G, Mat2D& H, int pi) { } #endif } - } else { - if (pi == 0) { - bandsize = data->bandFactor; + + return; + } + + // out-of-core implementation + if (pi == 0) bandsize = data->bandFactor; + data->check_file_offset_first_var(); + // band : 2, 4, 8, 16, 32, 64 + bandsize = fmin(bandsize * 2, data->nblocks); + for (uint b = 0, i = 1, j = 1; b < data->nblocks; ++b, ++i, ++j) { + start_idx = data->start[b]; + stop_idx = data->stop[b]; + actual_block_size = stop_idx - start_idx + 1; + tick.clock(); + if (update) { + data->read_block_update(start_idx, stop_idx, U, S, V.transpose(), standardize); + } else { + data->read_block_initial(start_idx, stop_idx, standardize); } - { - data->check_file_offset_first_var(); - // band : 2, 4, 8, 16, 32, 64 - bandsize = fmin(bandsize * 2, data->nblocks); - for (uint b = 0, i = 1, j = 1; b < data->nblocks; ++b, ++i, ++j) { - start_idx = data->start[b]; - stop_idx = data->stop[b]; - actual_block_size = stop_idx - start_idx + 1; - tick.clock(); - if (update) { - data->read_block_update(start_idx, stop_idx, U, S, V.transpose(), standardize); - } else { - data->read_block_initial(start_idx, stop_idx, standardize); - } - data->readtime += tick.reltime(); - G.middleRows(start_idx, actual_block_size).noalias() = data->G.transpose() * Omg; - if (pi > 0 && j <= std::pow(2, pi - 1) * data->bandFactor && std::pow(2, pi) < data->params.bands) { - H1.noalias() += data->G * G.middleRows(start_idx, actual_block_size); - // additional complementary power iteration for last read - if (j == std::pow(2, pi - 1)) { - H = H1 + H2; - Eigen::HouseholderQR qr(H); - Omg.noalias() = qr.householderQ() * Mat2D::Identity(cols(), size); - PCAone::flipOmg(Omg2, Omg); - H2.setZero(); - } - } else if (i <= bandsize / 2) { - H1.noalias() += data->G * G.middleRows(start_idx, actual_block_size); - } else if (i > bandsize / 2 && i <= bandsize) { - H2.noalias() += data->G * G.middleRows(start_idx, actual_block_size); - } - if ((b + 1) >= bandsize) { - if (i == bandsize) { - H = H1 + H2; - Eigen::HouseholderQR qr(H); - Omg.noalias() = qr.householderQ() * Mat2D::Identity(cols(), size); - PCAone::flipOmg(Omg2, Omg); - H1 = Mat2D::Zero(cols(), size); - i = 0; - } else if (i == bandsize / 2) { - H = H1 + H2; - Eigen::HouseholderQR qr(H); - Omg.noalias() = qr.householderQ() * Mat2D::Identity(cols(), size); - PCAone::flipOmg(Omg2, Omg); - H2 = Mat2D::Zero(cols(), size); - } else if ((b + 1) == data->nblocks) { - H = H1 + H2; - Eigen::HouseholderQR qr(H); - Omg.noalias() = qr.householderQ() * Mat2D::Identity(cols(), size); - PCAone::flipOmg(Omg2, Omg); - } - } + data->readtime += tick.reltime(); + G.middleRows(start_idx, actual_block_size).noalias() = data->G.transpose() * Omg; + if (pi > 0 && j <= std::pow(2, pi - 1) * data->bandFactor && std::pow(2, pi) < data->params.bands) { + H1.noalias() += data->G * G.middleRows(start_idx, actual_block_size); + // additional complementary power iteration for last read + if (j == std::pow(2, pi - 1)) { + H = H1 + H2; + Eigen::HouseholderQR qr(H); + Omg.noalias() = qr.householderQ() * Mat2D::Identity(cols(), size); + PCAone::flipOmg(Omg2, Omg); + H2.setZero(); } + } else if (i <= bandsize / 2) { + H1.noalias() += data->G * G.middleRows(start_idx, actual_block_size); + } else if (i > bandsize / 2 && i <= bandsize) { + H2.noalias() += data->G * G.middleRows(start_idx, actual_block_size); + } + if ((b + 1) < bandsize) continue; + + if (i == bandsize) { + H = H1 + H2; + Eigen::HouseholderQR qr(H); + Omg.noalias() = qr.householderQ() * Mat2D::Identity(cols(), size); + PCAone::flipOmg(Omg2, Omg); + H1 = Mat2D::Zero(cols(), size); + i = 0; + } else if (i == bandsize / 2) { + H = H1 + H2; + Eigen::HouseholderQR qr(H); + Omg.noalias() = qr.householderQ() * Mat2D::Identity(cols(), size); + PCAone::flipOmg(Omg2, Omg); + H2 = Mat2D::Zero(cols(), size); + } else if ((b + 1) == data->nblocks) { + H = H1 + H2; + Eigen::HouseholderQR qr(H); + Omg.noalias() = qr.householderQ() * Mat2D::Identity(cols(), size); + PCAone::flipOmg(Omg2, Omg); } } } @@ -354,7 +357,7 @@ void run_pca_with_halko(Data* data, const Param& params) { // flip_UV(rsvd->U, rsvd->V, false); double diff; rsvd->setFlags(true, false); - cao.print(tick.date(), "do EM-PCA algorithms for data with uncertainty."); + cao.print(tick.date(), "run EM-PCA for data with uncertainty. maxiter =", params.maxiter); for (uint i = 0; i < params.maxiter; ++i) { Vpre = rsvd->V; rsvd->computeUSV(params.maxp, params.tol); @@ -370,8 +373,8 @@ void run_pca_with_halko(Data* data, const Param& params) { } } - // if pcangsd, estimate GRM. if (params.pcangsd) { + cao.print(tick.date(), "estimate GRM for pcangsd"); data->pcangsd_standardize_E(rsvd->U, rsvd->S, rsvd->V.transpose()); // data->write_eigs_files(rsvd->S.array().square() / data->nsnps, rsvd->U, rsvd->V); Mat2D C = data->G * data->G.transpose(); @@ -385,8 +388,9 @@ void run_pca_with_halko(Data* data, const Param& params) { // output real eigenvectors of covariance in eigvecs2 write_eigvecs2_beagle(svd.matrixU(), params.filein, params.fileout + ".eigvecs2"); } else { + cao.print(tick.date(), "standardize the final matrix for EMU"); rsvd->setFlags(true, true); - rsvd->computeUSV(params.maxp, params.tol); + // rsvd->computeUSV(params.maxp, params.tol); } } // output PI diff --git a/src/Halko.hpp b/src/Halko.hpp index 1303b06..fcf0bf8 100644 --- a/src/Halko.hpp +++ b/src/Halko.hpp @@ -59,11 +59,17 @@ class FancyRsvdOpData : public RsvdOpData { const Index nk, os, size; uint64 bandsize, blocksize, actual_block_size, start_idx, stop_idx; Mat2D Omg, Omg2, H1, H2; + std::default_random_engine rng{}; public: FancyRsvdOpData(Data* data_, int k_, int os_ = 10) : RsvdOpData(data_), nk(k_), os(os_), size(k_ + os_) { H1 = Mat2D::Zero(cols(), size); H2 = Mat2D::Zero(cols(), size); + if (data->params.rand) + Omg = PCAone::StandardNormalRandom(data->nsamples, size, rng); + else + Omg = PCAone::UniformRandom(data->nsamples, size, rng); + Omg2 = Omg; } ~FancyRsvdOpData() {}