Skip to content

Commit

Permalink
add stuff
Browse files Browse the repository at this point in the history
  • Loading branch information
Zilong-Li committed Dec 22, 2024
1 parent 16c54b7 commit 1468e54
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 93 deletions.
14 changes: 7 additions & 7 deletions src/Cmd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ Param::Param(int argc, char **argv) {
opts.add<Value<uint>>("v", "verbose", "verbose level.\n"
"0: no message on screen\n"
"1: print messages to screen\n"
"2: enable debug information"
"2: enable verbose information\n"
"3: enable debug information"
, verbose, &verbose);
opts.add<Value<std::string>, Attribute::headline>("","PCA","PCA algorithms:");
auto svd_opt = opts.add<Value<uint>>("d", "svd", "SVD method to be applied. default 2 is recommended for big data.\n"
Expand Down Expand Up @@ -59,7 +60,6 @@ Param::Param(int argc, char **argv) {
opts.add<Value<double>, Attribute::advanced>("", "tol-rsvd", "tolerance for RSVD algorithm", tol, &tol);
opts.add<Value<double>, Attribute::advanced>("", "tol-em", "tolerance for EMU/PCAngsd algorithm", tolem, &tolem);
opts.add<Value<double>, Attribute::advanced>("", "tol-maf", "tolerance for MAF estimation by EM", tolmaf, &tolmaf);
opts.add<Switch, Attribute::hidden>("", "printu", "output eigen vector of each epoch (for tests)", &printu);

opts.add<Value<std::string>, Attribute::headline>("","INPUT","Input options:");
auto plinkfile = opts.add<Value<std::string>>("b", "bfile", "prefix of PLINK .bed/.bim/.fam files", "", &filein);
Expand All @@ -70,15 +70,15 @@ Param::Param(int argc, char **argv) {
auto beaglefile = opts.add<Value<std::string>>("G", "beagle", "path of BEAGLE file compressed by gzip", "", &filein);
opts.add<Value<std::string>>("f", "match-bim", "the .mbim file to be matched, where the 7th column is allele frequency", "", &filebim);
auto usvprefix = opts.add<Value<std::string>>("", "USV", "prefix of PCAone .eigvecs/.eigvals/.loadings/.mbim");
opts.add<Value<std::string>, Attribute::advanced>("", "read-U", "path of file with left singular vectors (.eigvecs)", "", &fileU);
opts.add<Value<std::string>, Attribute::advanced>("", "read-V", "path of file with right singular vectors (.loadings)", "", &fileV);
opts.add<Value<std::string>, Attribute::advanced>("", "read-S", "path of file with eigen values (.eigvals)", "", &fileS);
opts.add<Value<std::string>, Attribute::hidden>("", "read-U", "path of file with left singular vectors (.eigvecs)", "", &fileU);
opts.add<Value<std::string>, Attribute::hidden>("", "read-V", "path of file with right singular vectors (.loadings)", "", &fileV);
opts.add<Value<std::string>, Attribute::hidden>("", "read-S", "path of file with eigen values (.eigvals)", "", &fileS);

opts.add<Value<std::string>, Attribute::headline>("","OUTPUT","Output options:");
opts.add<Value<std::string>>("o", "out", "prefix of output files. default [pcaone]", fileout, &fileout);
opts.add<Switch>("V", "printv", "output the right eigenvectors with suffix .loadings", &printv);
opts.add<Switch>("", "ld", "output a binary matrix for downstream LD related analysis", &ld);
opts.add<Switch>("", "print-r2", "print LD r2 to *.ld.gz file for pairwise SNPs within a window", &print_r2);
opts.add<Switch>("D", "ld", "output a binary matrix for downstream LD related analysis", &ld);
opts.add<Switch>("R", "print-r2", "print LD r2 to *.ld.gz file for pairwise SNPs within a window", &print_r2);

opts.add<Value<std::string>, Attribute::headline>("","MISC","Misc options:");
opts.add<Value<double>>("", "maf", "exclude variants with MAF lower than this value", maf, &maf);
Expand Down
1 change: 0 additions & 1 deletion src/Cmd.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,6 @@ class Param {
bool groff = false;
bool cpmed = false;
bool printv = false;
bool printu = false;
bool impute = false;
bool noshuffle = false;
bool emu = false;
Expand Down
12 changes: 6 additions & 6 deletions src/FilePlink.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,14 +77,14 @@ void FileBed::read_all() {
for (k = 0; k < 4; ++k, ++j) {
if (j < nsamples) {
G(j, i) = BED2GENO[buf & 3];
if (G(j, i) != BED_MISSING_VALUE) {
// 0 indicate G(i,j) don't need to be predicted.
if (params.impute) C[i * nsamples + j] = 0;
} else {
buf >>= 2;
if (params.impute) {
// 1 indicate G(i,j) need to be predicted and updated.
if (params.impute) C[i * nsamples + j] = 1;
if (G(j, i) != BED_MISSING_VALUE)
C[i * nsamples + j] = 0;
else
C[i * nsamples + j] = 1;
}
buf >>= 2;
}
}
}
Expand Down
162 changes: 83 additions & 79 deletions src/Halko.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@

#include "Halko.hpp"

#include <filesystem>
#include <string>
#include <thread> // std::this_thread::sleep_for

#include "Common.hpp"
#include "Utils.hpp"

using namespace std;
Expand Down Expand Up @@ -45,6 +45,17 @@ void RsvdOpData::computeUSV(int p, double tol) {
for (int pi = 0; pi <= p; ++pi) {
computeGandH(G, H, pi);
if (data->params.fancyem) {
if (data->params.verbose > 2) {
std::filesystem::path dirpath = "fancy-tests";
if (!std::filesystem::exists(dirpath)) {
std::filesystem::create_directory(dirpath);
cao.print("dir created:", dirpath);
}
std::filesystem::path curfile = dirpath / tick.intime();
std::ofstream logu(curfile);
logu << U;
std::this_thread::sleep_for(std::chrono::seconds(1));
}
continue;
}
// check if converged
Expand Down Expand Up @@ -153,8 +164,7 @@ void FancyRsvdOpData::computeGandH(Mat2D& G, Mat2D& H, int pi) {
if (H.cols() != size || H.rows() != cols() || G.cols() != size || G.rows() != rows()) {
cao.error("the size of G or H doesn't match with each other.");
}
if (pi == 0) {
auto rng = std::default_random_engine{};
if (pi == 0 && !data->params.fancyem) {
if (data->params.rand)
Omg = PCAone::StandardNormalRandom<Mat2D, std::default_random_engine>(data->nsamples, size, rng);
else
Expand Down Expand Up @@ -187,6 +197,7 @@ void FancyRsvdOpData::computeGandH(Mat2D& G, Mat2D& H, int pi) {
}
// bandsize: how many blocks in each band, 2, 4, 8, 16, 32, 64, ...
bandsize = fmin(bandsize * 2, data->params.bands);
bool saving = false;
// b: the index of current block
for (uint b = 0, i = 1, j = 1; b < data->params.bands; ++b, ++i, ++j) {
start_idx = b * blocksize;
Expand All @@ -199,6 +210,7 @@ void FancyRsvdOpData::computeGandH(Mat2D& G, Mat2D& H, int pi) {
data->G.middleCols(start_idx, actual_block_size) * G.middleRows(start_idx, actual_block_size);
// additional complementary power iteration for last read
if (j == std::pow(2, pi - 1)) {
saving = true;
H = H1 + H2;
Eigen::HouseholderQR<Mat2D> qr(H);
Omg.noalias() = qr.householderQ() * Mat2D::Identity(cols(), size);
Expand All @@ -216,38 +228,30 @@ void FancyRsvdOpData::computeGandH(Mat2D& G, Mat2D& H, int pi) {
if ((b + 1) < bandsize) continue;
// add up H and update Omg
if ((i == bandsize) || (i == bandsize / 2)) {
saving = true;
H = H1 + H2;
Eigen::HouseholderQR<Mat2D> qr(H);
Omg.noalias() = qr.householderQ() * Mat2D::Identity(cols(), size);
PCAone::flipOmg(Omg2, Omg);
// first find out which blocks/bands have been used
uint wb = b / bandsize, s = 0, e = 0;
if (i == bandsize) {
H1.setZero();
i = 0;
s = blocksize * bandsize * wb;
e = s + blocksize * bandsize;
} else if (i == bandsize / 2) {
H2.setZero();
s = blocksize * bandsize * wb - blocksize * bandsize / 2;
e = s + blocksize * bandsize;
}
}

if (saving) {
saving = false; // reset
// find out which blocks/bands have been used
uint e = blocksize * j; // end point
uint s = e - blocksize * bandsize;
e = e >= data->nsnps ? data->nsnps - 1 : e - 1; // bound check
if (data->params.verbose > 1)
cao.print(b, ",", i, ",", j, ",", bandsize, ",s:", s, ",e:", e, ",wb:", wb);
cao.print(b, ",", bandsize, ",", i == 0 ? bandsize : i, ",", j, ",s:", s, ",e:", e, ",pi:", pi);
if (data->params.fancyem) {
// update estimates rightaway
U = computeU(G.middleRows(s, e - s + 1), H); // only use G in sliding window
if (data->params.printu) {
std::filesystem::path dirpath = "fancy-tests";
if (!std::filesystem::exists(dirpath)) {
std::filesystem::create_directory(dirpath);
cao.print("dir created:", dirpath);
}
std::filesystem::path curfile = dirpath / tick.intime();
std::ofstream logu(curfile);
logu << U;
}
data->predict_missing_E(U, s, e);
}
}
Expand All @@ -262,63 +266,62 @@ void FancyRsvdOpData::computeGandH(Mat2D& G, Mat2D& H, int pi) {
}
#endif
}
} else {
if (pi == 0) {
bandsize = data->bandFactor;

return;
}

// out-of-core implementation
if (pi == 0) bandsize = data->bandFactor;
data->check_file_offset_first_var();
// band : 2, 4, 8, 16, 32, 64
bandsize = fmin(bandsize * 2, data->nblocks);
for (uint b = 0, i = 1, j = 1; b < data->nblocks; ++b, ++i, ++j) {
start_idx = data->start[b];
stop_idx = data->stop[b];
actual_block_size = stop_idx - start_idx + 1;
tick.clock();
if (update) {
data->read_block_update(start_idx, stop_idx, U, S, V.transpose(), standardize);
} else {
data->read_block_initial(start_idx, stop_idx, standardize);
}
{
data->check_file_offset_first_var();
// band : 2, 4, 8, 16, 32, 64
bandsize = fmin(bandsize * 2, data->nblocks);
for (uint b = 0, i = 1, j = 1; b < data->nblocks; ++b, ++i, ++j) {
start_idx = data->start[b];
stop_idx = data->stop[b];
actual_block_size = stop_idx - start_idx + 1;
tick.clock();
if (update) {
data->read_block_update(start_idx, stop_idx, U, S, V.transpose(), standardize);
} else {
data->read_block_initial(start_idx, stop_idx, standardize);
}
data->readtime += tick.reltime();
G.middleRows(start_idx, actual_block_size).noalias() = data->G.transpose() * Omg;
if (pi > 0 && j <= std::pow(2, pi - 1) * data->bandFactor && std::pow(2, pi) < data->params.bands) {
H1.noalias() += data->G * G.middleRows(start_idx, actual_block_size);
// additional complementary power iteration for last read
if (j == std::pow(2, pi - 1)) {
H = H1 + H2;
Eigen::HouseholderQR<Mat2D> qr(H);
Omg.noalias() = qr.householderQ() * Mat2D::Identity(cols(), size);
PCAone::flipOmg(Omg2, Omg);
H2.setZero();
}
} else if (i <= bandsize / 2) {
H1.noalias() += data->G * G.middleRows(start_idx, actual_block_size);
} else if (i > bandsize / 2 && i <= bandsize) {
H2.noalias() += data->G * G.middleRows(start_idx, actual_block_size);
}
if ((b + 1) >= bandsize) {
if (i == bandsize) {
H = H1 + H2;
Eigen::HouseholderQR<Mat2D> qr(H);
Omg.noalias() = qr.householderQ() * Mat2D::Identity(cols(), size);
PCAone::flipOmg(Omg2, Omg);
H1 = Mat2D::Zero(cols(), size);
i = 0;
} else if (i == bandsize / 2) {
H = H1 + H2;
Eigen::HouseholderQR<Mat2D> qr(H);
Omg.noalias() = qr.householderQ() * Mat2D::Identity(cols(), size);
PCAone::flipOmg(Omg2, Omg);
H2 = Mat2D::Zero(cols(), size);
} else if ((b + 1) == data->nblocks) {
H = H1 + H2;
Eigen::HouseholderQR<Mat2D> qr(H);
Omg.noalias() = qr.householderQ() * Mat2D::Identity(cols(), size);
PCAone::flipOmg(Omg2, Omg);
}
}
data->readtime += tick.reltime();
G.middleRows(start_idx, actual_block_size).noalias() = data->G.transpose() * Omg;
if (pi > 0 && j <= std::pow(2, pi - 1) * data->bandFactor && std::pow(2, pi) < data->params.bands) {
H1.noalias() += data->G * G.middleRows(start_idx, actual_block_size);
// additional complementary power iteration for last read
if (j == std::pow(2, pi - 1)) {
H = H1 + H2;
Eigen::HouseholderQR<Mat2D> qr(H);
Omg.noalias() = qr.householderQ() * Mat2D::Identity(cols(), size);
PCAone::flipOmg(Omg2, Omg);
H2.setZero();
}
} else if (i <= bandsize / 2) {
H1.noalias() += data->G * G.middleRows(start_idx, actual_block_size);
} else if (i > bandsize / 2 && i <= bandsize) {
H2.noalias() += data->G * G.middleRows(start_idx, actual_block_size);
}
if ((b + 1) < bandsize) continue;

if (i == bandsize) {
H = H1 + H2;
Eigen::HouseholderQR<Mat2D> qr(H);
Omg.noalias() = qr.householderQ() * Mat2D::Identity(cols(), size);
PCAone::flipOmg(Omg2, Omg);
H1 = Mat2D::Zero(cols(), size);
i = 0;
} else if (i == bandsize / 2) {
H = H1 + H2;
Eigen::HouseholderQR<Mat2D> qr(H);
Omg.noalias() = qr.householderQ() * Mat2D::Identity(cols(), size);
PCAone::flipOmg(Omg2, Omg);
H2 = Mat2D::Zero(cols(), size);
} else if ((b + 1) == data->nblocks) {
H = H1 + H2;
Eigen::HouseholderQR<Mat2D> qr(H);
Omg.noalias() = qr.householderQ() * Mat2D::Identity(cols(), size);
PCAone::flipOmg(Omg2, Omg);
}
}
}
Expand Down Expand Up @@ -354,7 +357,7 @@ void run_pca_with_halko(Data* data, const Param& params) {
// flip_UV(rsvd->U, rsvd->V, false);
double diff;
rsvd->setFlags(true, false);
cao.print(tick.date(), "do EM-PCA algorithms for data with uncertainty.");
cao.print(tick.date(), "run EM-PCA for data with uncertainty. maxiter =", params.maxiter);
for (uint i = 0; i < params.maxiter; ++i) {
Vpre = rsvd->V;
rsvd->computeUSV(params.maxp, params.tol);
Expand All @@ -370,8 +373,8 @@ void run_pca_with_halko(Data* data, const Param& params) {
}
}

// if pcangsd, estimate GRM.
if (params.pcangsd) {
cao.print(tick.date(), "estimate GRM for pcangsd");
data->pcangsd_standardize_E(rsvd->U, rsvd->S, rsvd->V.transpose());
// data->write_eigs_files(rsvd->S.array().square() / data->nsnps, rsvd->U, rsvd->V);
Mat2D C = data->G * data->G.transpose();
Expand All @@ -385,8 +388,9 @@ void run_pca_with_halko(Data* data, const Param& params) {
// output real eigenvectors of covariance in eigvecs2
write_eigvecs2_beagle(svd.matrixU(), params.filein, params.fileout + ".eigvecs2");
} else {
cao.print(tick.date(), "standardize the final matrix for EMU");
rsvd->setFlags(true, true);
rsvd->computeUSV(params.maxp, params.tol);
// rsvd->computeUSV(params.maxp, params.tol);
}
}
// output PI
Expand Down
6 changes: 6 additions & 0 deletions src/Halko.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,17 @@ class FancyRsvdOpData : public RsvdOpData {
const Index nk, os, size;
uint64 bandsize, blocksize, actual_block_size, start_idx, stop_idx;
Mat2D Omg, Omg2, H1, H2;
std::default_random_engine rng{};

public:
FancyRsvdOpData(Data* data_, int k_, int os_ = 10) : RsvdOpData(data_), nk(k_), os(os_), size(k_ + os_) {
H1 = Mat2D::Zero(cols(), size);
H2 = Mat2D::Zero(cols(), size);
if (data->params.rand)
Omg = PCAone::StandardNormalRandom<Mat2D, std::default_random_engine>(data->nsamples, size, rng);
else
Omg = PCAone::UniformRandom<Mat2D, std::default_random_engine>(data->nsamples, size, rng);
Omg2 = Omg;
}

~FancyRsvdOpData() {}
Expand Down

0 comments on commit 1468e54

Please sign in to comment.