Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement ICE balancing #11

Merged
merged 34 commits into from
Sep 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
81f5123
Initial implementation of VC balancing
robomics Sep 18, 2023
7e3fc1d
Initial implementation of ICE balancing
robomics Sep 18, 2023
fc703a1
Properly implement ICE cis balancing
robomics Sep 19, 2023
abbffe3
Properly implement ICE trans balancing. Add tests
robomics Sep 19, 2023
d520b14
Perf optimizations
robomics Sep 20, 2023
7d94262
Refactor
robomics Sep 20, 2023
4463608
Implement chunked ICE balancing
robomics Sep 22, 2023
d39230c
Bugfix
robomics Sep 22, 2023
5c7840c
Bugfix
robomics Sep 22, 2023
de16e6c
Bugfix
robomics Sep 23, 2023
60a50f7
Bugfix
robomics Sep 24, 2023
a138acd
Add more tests
robomics Sep 24, 2023
4a624fe
Bugfix
robomics Sep 24, 2023
071d9d9
Improve logging
robomics Sep 24, 2023
6660df4
Update serde code to use zstd contexts
robomics Sep 24, 2023
f004eb9
Refactor tests
robomics Sep 24, 2023
d9514c3
Bugfix
robomics Sep 24, 2023
c2aac06
Implement parallel ICE balancing
robomics Sep 26, 2023
cffaf9e
Merge branch 'main' into balancing
robomics Sep 27, 2023
d7a9053
Initial implementation of hictk balance
robomics Sep 27, 2023
a48e37a
Bugfix
robomics Sep 27, 2023
c55ba81
Add integration tests for hictk balance
robomics Sep 27, 2023
a61d309
Update .clang-tidy
robomics Sep 27, 2023
12ab184
Fix typos
robomics Sep 27, 2023
114a7be
Fix macos builds
robomics Sep 27, 2023
c35c9a8
Fix windows builds
robomics Sep 28, 2023
6a9f050
Fix integration tests on macOS
robomics Sep 28, 2023
13e01af
Bugfix
robomics Sep 28, 2023
35ab0e2
Update test dataset
robomics Sep 28, 2023
094582a
Fix macos builds
robomics Sep 28, 2023
b7180f3
Fix macos tests
robomics Sep 28, 2023
3399bb4
Fix ubuntu builds
robomics Sep 28, 2023
a12a1d6
Bugfix
robomics Sep 28, 2023
3b694cb
Fix ubuntu builds
robomics Sep 28, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .clang-tidy
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@ Checks: >
-cppcoreguidelines-pro-bounds-constant-array-index,
-hicpp-no-array-decay,
-misc-no-recursion,
-misc-use-anonymous-namespace,
-modernize-use-trailing-return-type,
-readability-identifier-length,
-readability-magic-numbers
-readability-magic-numbers,
-readability-static-definition-in-anonymous-namespace
WarningsAsErrors: ''
HeaderFilterRegex: ''
FormatStyle: none
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/codecov.yml
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,8 @@ jobs:

- name: Run integration tests
run: |
test/scripts/hictk_balance.sh build/src/hictk/hictk hic_tools.jar

test/scripts/hictk_dump_chroms.sh build/src/hictk/hictk
test/scripts/hictk_dump_bins.sh build/src/hictk/hictk
test/scripts/hictk_dump_resolutions.sh build/src/hictk/hictk
Expand Down
4 changes: 4 additions & 0 deletions .github/workflows/macos-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,10 @@ jobs:
zstd -dcf binaries.tar.zst | tar -xf -
tar -xf test/data/hictk_test_data.tar.xz

- name: Test hictk balance
run: |
test/scripts/hictk_balance.sh bin/hictk hic_tools.jar

- name: Test hictk dump chroms
run: |
test/scripts/hictk_dump_chroms.sh bin/hictk
Expand Down
4 changes: 4 additions & 0 deletions .github/workflows/ubuntu-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,10 @@ jobs:
zstd -dcf binaries.tar.zst | tar -xf -
tar -xf test/data/hictk_test_data.tar.xz

- name: Test hictk balance
run: |
test/scripts/hictk_balance.sh bin/hictk hic_tools.jar

- name: Test hictk dump chroms
run: |
test/scripts/hictk_dump_chroms.sh bin/hictk
Expand Down
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,8 @@ target_compile_features(hictk_project_options INTERFACE "cxx_std_${CMAKE_CXX_STA
target_compile_definitions(hictk_project_options INTERFACE FMT_HEADER_ONLY FMT_ENFORCE_COMPILE_STRING)
# Tweak spdlog
target_compile_definitions(hictk_project_options INTERFACE SPDLOG_FMT_EXTERNAL)
#Tweak xxHash
target_compile_definitions(hictk_project_options INTERFACE XXH_INLINE_ALL)

if(WIN32)
target_compile_definitions(hictk_project_options INTERFACE NOMINMAX _CRT_SECURE_NO_WARNINGS)
Expand Down
4 changes: 2 additions & 2 deletions cmake/FetchTestDataset.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

# cmake-format: off
file(
DOWNLOAD https://zenodo.org/record/8204309/files/hictk_test_data.tar.xz?download=1
EXPECTED_HASH SHA256=f9d0ccc8f8c7bade7f9049ea0abb6b26c061e6fed9e1e7bb8f2da0e1a402cfa4
DOWNLOAD https://zenodo.org/record/8386066/files/hictk_test_data.tar.xz?download=1
EXPECTED_HASH SHA256=77e2b9186d9edb90a8436fc41d6c0632727df3af95acb74e9635d1c0f29f2b8d
"${PROJECT_SOURCE_DIR}/test/data/hictk_test_data.tar.xz")
# cmake-format: on

Expand Down
5 changes: 5 additions & 0 deletions conanfile.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

[requires]
boost/1.83.0#f0c3932db7f65b606ed78357ecbdcbef
bshoshany-thread-pool/3.5.0#3c9fd1e21a688432b7f31b40d2d168ee
cli11/2.3.2#1424b9b1d9e3682a7122f415b078b4d7
eigen/3.4.0#2e192482a8acff96fe34766adca2b24c
fast_float/5.2.0#9bf1a3fac625789f2b571d43efb8013b
Expand All @@ -13,7 +14,10 @@ highfive/2.7.1#a73bc6937c9add30c9d47a7a70a466eb
libdeflate/1.18#3697b637656a9af04cabcbed50db9a7e
parallel-hashmap/1.3.11#719aed501c271a34e2347a7731ab3bfb
readerwriterqueue/1.0.6#aaa5ff6fac60c2aee591e9e51b063b83
span-lite/0.10.3#1967d71abb32b314387c2ab9c558dd22
spdlog/1.12.0#248c215bc5f0718402fbf1de126ef847
xxhash/0.8.1#7bf1cd1fe3d31f1fbcc758d93c907e8d
zstd/1.5.5#93372fe14bb7883bd4de82914e0a1841

[generators]
CMakeDeps
Expand Down Expand Up @@ -69,3 +73,4 @@ highfive*:with_eigen=False
highfive*:with_opencv=False
highfive*:with_xtensor=False
spdlog*:header_only=True
xxhash*:utility=False
2 changes: 2 additions & 0 deletions src/hictk/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,14 @@ target_sources(
hictk
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_balance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_convert.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_dump.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_load.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_merge.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_validate.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_zoomify.cpp
${CMAKE_CURRENT_SOURCE_DIR}/balance/balance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/convert/convert.cpp
${CMAKE_CURRENT_SOURCE_DIR}/convert/cool_to_hic.cpp
${CMAKE_CURRENT_SOURCE_DIR}/convert/hic_to_cool.cpp
Expand Down
209 changes: 209 additions & 0 deletions src/hictk/balance/balance.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
// Copyright (C) 2023 Roberto Rossini <roberros@uio.no>
//
// SPDX-License-Identifier: MIT

#include <spdlog/spdlog.h>

#include <variant>

#include "hictk/balancing/ice.hpp"
#include "hictk/balancing/methods.hpp"
#include "hictk/cooler.hpp"
#include "hictk/file.hpp"
#include "hictk/hic.hpp"
#include "hictk/tools/common.hpp"
#include "hictk/tools/config.hpp"
#include "hictk/tools/juicer_tools.hpp"

namespace hictk::tools {

static void write_weights_hic(const hic::File& hf, const BalanceConfig& c,
const std::vector<double>& weights) {
auto tmpfile = c.tmp_dir / std::filesystem::path{hf.name()}.filename();
for (std::size_t i = 0; i < 1024; ++i) {
if (!std::filesystem::exists(tmpfile)) {
break;
}

tmpfile.replace_extension(".tmp" + std::to_string(i));

Check warning on line 28 in src/hictk/balance/balance.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/balance/balance.cpp#L28

Added line #L28 was not covered by tests
}

if (std::filesystem::exists(tmpfile)) {
throw std::runtime_error(
fmt::format(FMT_STRING("unable to create temporary file {}"), tmpfile));

Check warning on line 33 in src/hictk/balance/balance.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/balance/balance.cpp#L32-L33

Added lines #L32 - L33 were not covered by tests
}

try {
{
const std::unique_ptr<FILE> f(std::fopen(tmpfile.string().c_str(), "ae"));
if (!bool(f)) {
throw fmt::system_error(errno, FMT_STRING("cannot open file {}"), tmpfile);

Check warning on line 40 in src/hictk/balance/balance.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/balance/balance.cpp#L40

Added line #L40 was not covered by tests
}

std::ptrdiff_t i0 = 0;

for (const auto& chrom : hf.chromosomes()) {
if (chrom.is_all()) {
continue;
}
fmt::print(f.get(), FMT_STRING("vector\t{}\t{}\t{}\tBP\n"), c.name, chrom.name(),
hf.bin_size());

const auto num_bins = (chrom.size() + hf.bin_size() - 1) / hf.bin_size();
const auto i1 = i0 + static_cast<std::ptrdiff_t>(num_bins);
std::for_each(weights.begin() + i0, weights.begin() + i1, [&](const double w) {
std::isnan(w) ? fmt::print(f.get(), FMT_COMPILE(".\n"))
: fmt::print(f.get(), FMT_COMPILE("{}\n"), 1.0 / w);
if (!bool(f)) { // NOLINT
throw fmt::system_error(
errno, FMT_STRING("an error occurred while writing weights to file {}"), tmpfile);

Check warning on line 59 in src/hictk/balance/balance.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/balance/balance.cpp#L58-L59

Added lines #L58 - L59 were not covered by tests
}
});

i0 = i1;
}
}

auto jt = run_juicer_tools_add_norm(c.juicer_tools_jar, tmpfile, hf.url(), c.juicer_tools_xmx);
jt->wait();
if (jt->exit_code() != 0) {
throw std::runtime_error(
fmt::format(FMT_STRING("juicer_tools pre failed with exit code {}"), jt->exit_code()));

Check warning on line 71 in src/hictk/balance/balance.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/balance/balance.cpp#L70-L71

Added lines #L70 - L71 were not covered by tests
}
} catch (...) {
std::error_code ec{};
std::filesystem::remove(tmpfile, ec);

Check warning on line 75 in src/hictk/balance/balance.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/balance/balance.cpp#L74-L75

Added lines #L74 - L75 were not covered by tests
}
std::filesystem::remove(tmpfile);
}

static void write_weights_cooler(std::string_view uri, const BalanceConfig& c,
const std::vector<double>& weights,
const std::vector<double>& variance,
const std::vector<double>& scale) {
const auto& [file, grp] = cooler::parse_cooler_uri(uri);
const auto path = fmt::format(FMT_STRING("{}/bins/{}"), grp, c.name);
SPDLOG_INFO(FMT_STRING("Writing weights to {}::{}..."), uri, path);

const HighFive::File clr(file, HighFive::File::ReadWrite);

if (clr.exist(path)) {
assert(c.force);
clr.unlink(path);
}

cooler::Dataset dset(cooler::RootGroup{clr.getGroup(grp)}, path, 0.0);
dset.append(weights);

dset.write_attribute("cis_only", c.mode == "cis");
dset.write_attribute("divisive_weights", false);
dset.write_attribute("ignore_diags", std::int64_t(c.masked_diags));
dset.write_attribute("mad_max", std::int64_t(c.mad_max));
dset.write_attribute("min_count", std::int64_t(c.min_count));
dset.write_attribute("min_nnz", std::int64_t(c.min_nnz));
dset.write_attribute("tol", c.tolerance);

if (c.mode != "cis") {
dset.write_attribute("converged", variance.front() < c.tolerance);
dset.write_attribute("scale", scale.front());
dset.write_attribute("var", variance.front());

Check warning on line 109 in src/hictk/balance/balance.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/balance/balance.cpp#L107-L109

Added lines #L107 - L109 were not covered by tests
} else {
std::vector<bool> converged{};
for (const auto& var : variance) {
converged.push_back(var < c.tolerance); // NOLINT
}
dset.write_attribute("converged", converged);
dset.write_attribute("scale", scale);
dset.write_attribute("var", variance);
}
}

static int balance_singleres_file(File&& f, const BalanceConfig& c) {
std::filesystem::path tmpfile{};

if (!c.force && !c.stdout_ && f.has_normalization(c.name)) {
throw std::runtime_error(
fmt::format(FMT_STRING("Normalization weights for \"{}\" already exist in file {}. Pass "

Check warning on line 126 in src/hictk/balance/balance.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/balance/balance.cpp#L125-L126

Added lines #L125 - L126 were not covered by tests
"--force to overwrite existing weights."),
c.name, f.path()));

Check warning on line 128 in src/hictk/balance/balance.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/balance/balance.cpp#L128

Added line #L128 was not covered by tests
}

if (!c.in_memory) {
tmpfile = c.tmp_dir / std::filesystem::path{f.path()}.filename();
for (std::size_t i = 0; i < 1024; ++i) {
if (!std::filesystem::exists(tmpfile)) {
break;
}

tmpfile.replace_extension(".tmp" + std::to_string(i));

Check warning on line 138 in src/hictk/balance/balance.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/balance/balance.cpp#L138

Added line #L138 was not covered by tests
}

if (std::filesystem::exists(tmpfile)) {
throw std::runtime_error(
fmt::format(FMT_STRING("unable to create temporary file {}"), tmpfile));

Check warning on line 143 in src/hictk/balance/balance.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/balance/balance.cpp#L142-L143

Added lines #L142 - L143 were not covered by tests
}
}

const balancing::ICE::Params params{c.tolerance, c.max_iters, c.masked_diags,
c.min_nnz, c.min_count, c.mad_max,
tmpfile, c.chunk_size, c.threads};
balancing::ICE::Type mode{};
if (c.mode == "gw") {
mode = balancing::ICE::Type::gw;

Check warning on line 152 in src/hictk/balance/balance.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/balance/balance.cpp#L152

Added line #L152 was not covered by tests
} else if (c.mode == "cis") {
mode = balancing::ICE::Type::cis;
} else {
mode = balancing::ICE::Type::trans;

Check warning on line 156 in src/hictk/balance/balance.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/balance/balance.cpp#L156

Added line #L156 was not covered by tests
}

const auto balancer =
std::visit([&](const auto& ff) { return balancing::ICE(ff, mode, params); }, f.get());
const auto weights = balancer.get_weights(c.rescale_marginals);

if (c.stdout_) {
std::for_each(weights.begin(), weights.end(),
[&](const auto w) { fmt::print(FMT_COMPILE("{}\n"), w); });
return 0;

Check warning on line 166 in src/hictk/balance/balance.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/balance/balance.cpp#L164-L166

Added lines #L164 - L166 were not covered by tests
}

if (f.is_cooler()) {
const auto uri = f.uri();
f.get<cooler::File>().close();
write_weights_cooler(uri, c, weights, balancer.variance(), balancer.scale());
return 0;
}

write_weights_hic(f.get<hic::File>(), c, weights);

return 0;
}

static int balance_multires(const BalanceConfig& c) {
const auto resolutions = cooler::MultiResFile(c.path_to_input.string()).resolutions();

Check warning on line 182 in src/hictk/balance/balance.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/balance/balance.cpp#L182

Added line #L182 was not covered by tests

for (const auto& res : resolutions) {
balance_singleres_file(
File(fmt::format(FMT_STRING("{}::/resolutions/{}"), c.path_to_input.string(), res)), c);

Check warning on line 186 in src/hictk/balance/balance.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/balance/balance.cpp#L184-L186

Added lines #L184 - L186 were not covered by tests
}
return 0;

Check warning on line 188 in src/hictk/balance/balance.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/balance/balance.cpp#L188

Added line #L188 was not covered by tests
}

int balance_subcmd(const BalanceConfig& c) {
if (cooler::utils::is_multires_file(c.path_to_input.string())) {
return balance_multires(c);

Check warning on line 193 in src/hictk/balance/balance.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/balance/balance.cpp#L193

Added line #L193 was not covered by tests
}

std::vector<std::uint32_t> resolutions{};
if (hic::utils::is_hic_file(c.path_to_input)) {
resolutions = hic::utils::list_resolutions(c.path_to_input);
} else {
resolutions.push_back(File(c.path_to_input.string()).bin_size());
}

for (const auto& res : resolutions) {
balance_singleres_file(File(c.path_to_input.string(), res), c);
}

return 0;
}
} // namespace hictk::tools
13 changes: 12 additions & 1 deletion src/hictk/cli/cli.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@
_cli.name(_exec_name);
_cli.parse(_argc, _argv);

if (_cli.get_subcommand("convert")->parsed()) {
if (_cli.get_subcommand("balance")->parsed()) {
_subcommand = subcommand::balance;
} else if (_cli.get_subcommand("convert")->parsed()) {
_subcommand = subcommand::convert;
} else if (_cli.get_subcommand("dump")->parsed()) {
_subcommand = subcommand::dump;
Expand Down Expand Up @@ -69,6 +71,8 @@

std::string_view Cli::subcommand_to_str(subcommand s) noexcept {
switch (s) {
case balance:
return "balance";

Check warning on line 75 in src/hictk/cli/cli.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/cli/cli.cpp#L74-L75

Added lines #L74 - L75 were not covered by tests
case convert:
return "convert";
case dump:
Expand All @@ -93,6 +97,7 @@
_cli.set_version_flag("-V,--version", std::string{hictk::config::version::str_long()});
_cli.require_subcommand(1);

make_balance_subcommand();
make_convert_subcommand();
make_dump_subcommand();
make_load_subcommand();
Expand All @@ -103,6 +108,9 @@

void Cli::validate_args() const {
switch (_subcommand) {
case balance:
validate_balance_subcommand();
break;
case convert:
validate_convert_subcommand();
break;
Expand All @@ -127,6 +135,9 @@

void Cli::transform_args() {
switch (_subcommand) {
case balance:
transform_args_balance_subcommand();
break;
case convert:
transform_args_convert_subcommand();
break;
Expand Down
Loading