Skip to content

Commit

Permalink
Merge pull request #11 from paulsengroup/balancing
Browse files Browse the repository at this point in the history
Implement ICE balancing
  • Loading branch information
robomics authored Sep 28, 2023
2 parents 0ef9d13 + 3b694cb commit b839765
Show file tree
Hide file tree
Showing 43 changed files with 2,735 additions and 93 deletions.
4 changes: 3 additions & 1 deletion .clang-tidy
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@ Checks: >
-cppcoreguidelines-pro-bounds-constant-array-index,
-hicpp-no-array-decay,
-misc-no-recursion,
-misc-use-anonymous-namespace,
-modernize-use-trailing-return-type,
-readability-identifier-length,
-readability-magic-numbers
-readability-magic-numbers,
-readability-static-definition-in-anonymous-namespace
WarningsAsErrors: ''
HeaderFilterRegex: ''
FormatStyle: none
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/codecov.yml
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,8 @@ jobs:
- name: Run integration tests
run: |
test/scripts/hictk_balance.sh build/src/hictk/hictk hic_tools.jar
test/scripts/hictk_dump_chroms.sh build/src/hictk/hictk
test/scripts/hictk_dump_bins.sh build/src/hictk/hictk
test/scripts/hictk_dump_resolutions.sh build/src/hictk/hictk
Expand Down
4 changes: 4 additions & 0 deletions .github/workflows/macos-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,10 @@ jobs:
zstd -dcf binaries.tar.zst | tar -xf -
tar -xf test/data/hictk_test_data.tar.xz
- name: Test hictk balance
run: |
test/scripts/hictk_balance.sh bin/hictk hic_tools.jar
- name: Test hictk dump chroms
run: |
test/scripts/hictk_dump_chroms.sh bin/hictk
Expand Down
4 changes: 4 additions & 0 deletions .github/workflows/ubuntu-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,10 @@ jobs:
zstd -dcf binaries.tar.zst | tar -xf -
tar -xf test/data/hictk_test_data.tar.xz
- name: Test hictk balance
run: |
test/scripts/hictk_balance.sh bin/hictk hic_tools.jar
- name: Test hictk dump chroms
run: |
test/scripts/hictk_dump_chroms.sh bin/hictk
Expand Down
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,8 @@ target_compile_features(hictk_project_options INTERFACE "cxx_std_${CMAKE_CXX_STA
target_compile_definitions(hictk_project_options INTERFACE FMT_HEADER_ONLY FMT_ENFORCE_COMPILE_STRING)
# Tweak spdlog
target_compile_definitions(hictk_project_options INTERFACE SPDLOG_FMT_EXTERNAL)
#Tweak xxHash
target_compile_definitions(hictk_project_options INTERFACE XXH_INLINE_ALL)

if(WIN32)
target_compile_definitions(hictk_project_options INTERFACE NOMINMAX _CRT_SECURE_NO_WARNINGS)
Expand Down
4 changes: 2 additions & 2 deletions cmake/FetchTestDataset.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

# cmake-format: off
file(
DOWNLOAD https://zenodo.org/record/8204309/files/hictk_test_data.tar.xz?download=1
EXPECTED_HASH SHA256=f9d0ccc8f8c7bade7f9049ea0abb6b26c061e6fed9e1e7bb8f2da0e1a402cfa4
DOWNLOAD https://zenodo.org/record/8386066/files/hictk_test_data.tar.xz?download=1
EXPECTED_HASH SHA256=77e2b9186d9edb90a8436fc41d6c0632727df3af95acb74e9635d1c0f29f2b8d
"${PROJECT_SOURCE_DIR}/test/data/hictk_test_data.tar.xz")
# cmake-format: on

Expand Down
5 changes: 5 additions & 0 deletions conanfile.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

[requires]
boost/1.83.0#f0c3932db7f65b606ed78357ecbdcbef
bshoshany-thread-pool/3.5.0#3c9fd1e21a688432b7f31b40d2d168ee
cli11/2.3.2#1424b9b1d9e3682a7122f415b078b4d7
eigen/3.4.0#2e192482a8acff96fe34766adca2b24c
fast_float/5.2.0#9bf1a3fac625789f2b571d43efb8013b
Expand All @@ -13,7 +14,10 @@ highfive/2.7.1#a73bc6937c9add30c9d47a7a70a466eb
libdeflate/1.18#3697b637656a9af04cabcbed50db9a7e
parallel-hashmap/1.3.11#719aed501c271a34e2347a7731ab3bfb
readerwriterqueue/1.0.6#aaa5ff6fac60c2aee591e9e51b063b83
span-lite/0.10.3#1967d71abb32b314387c2ab9c558dd22
spdlog/1.12.0#248c215bc5f0718402fbf1de126ef847
xxhash/0.8.1#7bf1cd1fe3d31f1fbcc758d93c907e8d
zstd/1.5.5#93372fe14bb7883bd4de82914e0a1841

[generators]
CMakeDeps
Expand Down Expand Up @@ -69,3 +73,4 @@ highfive*:with_eigen=False
highfive*:with_opencv=False
highfive*:with_xtensor=False
spdlog*:header_only=True
xxhash*:utility=False
2 changes: 2 additions & 0 deletions src/hictk/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,14 @@ target_sources(
hictk
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_balance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_convert.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_dump.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_load.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_merge.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_validate.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_zoomify.cpp
${CMAKE_CURRENT_SOURCE_DIR}/balance/balance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/convert/convert.cpp
${CMAKE_CURRENT_SOURCE_DIR}/convert/cool_to_hic.cpp
${CMAKE_CURRENT_SOURCE_DIR}/convert/hic_to_cool.cpp
Expand Down
209 changes: 209 additions & 0 deletions src/hictk/balance/balance.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
// Copyright (C) 2023 Roberto Rossini <roberros@uio.no>
//
// SPDX-License-Identifier: MIT

#include <spdlog/spdlog.h>

#include <variant>

#include "hictk/balancing/ice.hpp"
#include "hictk/balancing/methods.hpp"
#include "hictk/cooler.hpp"
#include "hictk/file.hpp"
#include "hictk/hic.hpp"
#include "hictk/tools/common.hpp"
#include "hictk/tools/config.hpp"
#include "hictk/tools/juicer_tools.hpp"

namespace hictk::tools {

static void write_weights_hic(const hic::File& hf, const BalanceConfig& c,
const std::vector<double>& weights) {
auto tmpfile = c.tmp_dir / std::filesystem::path{hf.name()}.filename();
for (std::size_t i = 0; i < 1024; ++i) {
if (!std::filesystem::exists(tmpfile)) {
break;
}

tmpfile.replace_extension(".tmp" + std::to_string(i));
}

if (std::filesystem::exists(tmpfile)) {
throw std::runtime_error(
fmt::format(FMT_STRING("unable to create temporary file {}"), tmpfile));
}

try {
{
const std::unique_ptr<FILE> f(std::fopen(tmpfile.string().c_str(), "ae"));
if (!bool(f)) {
throw fmt::system_error(errno, FMT_STRING("cannot open file {}"), tmpfile);
}

std::ptrdiff_t i0 = 0;

for (const auto& chrom : hf.chromosomes()) {
if (chrom.is_all()) {
continue;
}
fmt::print(f.get(), FMT_STRING("vector\t{}\t{}\t{}\tBP\n"), c.name, chrom.name(),
hf.bin_size());

const auto num_bins = (chrom.size() + hf.bin_size() - 1) / hf.bin_size();
const auto i1 = i0 + static_cast<std::ptrdiff_t>(num_bins);
std::for_each(weights.begin() + i0, weights.begin() + i1, [&](const double w) {
std::isnan(w) ? fmt::print(f.get(), FMT_COMPILE(".\n"))
: fmt::print(f.get(), FMT_COMPILE("{}\n"), 1.0 / w);
if (!bool(f)) { // NOLINT
throw fmt::system_error(
errno, FMT_STRING("an error occurred while writing weights to file {}"), tmpfile);
}
});

i0 = i1;
}
}

auto jt = run_juicer_tools_add_norm(c.juicer_tools_jar, tmpfile, hf.url(), c.juicer_tools_xmx);
jt->wait();
if (jt->exit_code() != 0) {
throw std::runtime_error(
fmt::format(FMT_STRING("juicer_tools pre failed with exit code {}"), jt->exit_code()));
}
} catch (...) {
std::error_code ec{};
std::filesystem::remove(tmpfile, ec);
}
std::filesystem::remove(tmpfile);
}

static void write_weights_cooler(std::string_view uri, const BalanceConfig& c,
const std::vector<double>& weights,
const std::vector<double>& variance,
const std::vector<double>& scale) {
const auto& [file, grp] = cooler::parse_cooler_uri(uri);
const auto path = fmt::format(FMT_STRING("{}/bins/{}"), grp, c.name);
SPDLOG_INFO(FMT_STRING("Writing weights to {}::{}..."), uri, path);

const HighFive::File clr(file, HighFive::File::ReadWrite);

if (clr.exist(path)) {
assert(c.force);
clr.unlink(path);
}

cooler::Dataset dset(cooler::RootGroup{clr.getGroup(grp)}, path, 0.0);
dset.append(weights);

dset.write_attribute("cis_only", c.mode == "cis");
dset.write_attribute("divisive_weights", false);
dset.write_attribute("ignore_diags", std::int64_t(c.masked_diags));
dset.write_attribute("mad_max", std::int64_t(c.mad_max));
dset.write_attribute("min_count", std::int64_t(c.min_count));
dset.write_attribute("min_nnz", std::int64_t(c.min_nnz));
dset.write_attribute("tol", c.tolerance);

if (c.mode != "cis") {
dset.write_attribute("converged", variance.front() < c.tolerance);
dset.write_attribute("scale", scale.front());
dset.write_attribute("var", variance.front());
} else {
std::vector<bool> converged{};
for (const auto& var : variance) {
converged.push_back(var < c.tolerance); // NOLINT
}
dset.write_attribute("converged", converged);
dset.write_attribute("scale", scale);
dset.write_attribute("var", variance);
}
}

static int balance_singleres_file(File&& f, const BalanceConfig& c) {
std::filesystem::path tmpfile{};

if (!c.force && !c.stdout_ && f.has_normalization(c.name)) {
throw std::runtime_error(
fmt::format(FMT_STRING("Normalization weights for \"{}\" already exist in file {}. Pass "
"--force to overwrite existing weights."),
c.name, f.path()));
}

if (!c.in_memory) {
tmpfile = c.tmp_dir / std::filesystem::path{f.path()}.filename();
for (std::size_t i = 0; i < 1024; ++i) {
if (!std::filesystem::exists(tmpfile)) {
break;
}

tmpfile.replace_extension(".tmp" + std::to_string(i));
}

if (std::filesystem::exists(tmpfile)) {
throw std::runtime_error(
fmt::format(FMT_STRING("unable to create temporary file {}"), tmpfile));
}
}

const balancing::ICE::Params params{c.tolerance, c.max_iters, c.masked_diags,
c.min_nnz, c.min_count, c.mad_max,
tmpfile, c.chunk_size, c.threads};
balancing::ICE::Type mode{};
if (c.mode == "gw") {
mode = balancing::ICE::Type::gw;
} else if (c.mode == "cis") {
mode = balancing::ICE::Type::cis;
} else {
mode = balancing::ICE::Type::trans;
}

const auto balancer =
std::visit([&](const auto& ff) { return balancing::ICE(ff, mode, params); }, f.get());
const auto weights = balancer.get_weights(c.rescale_marginals);

if (c.stdout_) {
std::for_each(weights.begin(), weights.end(),
[&](const auto w) { fmt::print(FMT_COMPILE("{}\n"), w); });
return 0;
}

if (f.is_cooler()) {
const auto uri = f.uri();
f.get<cooler::File>().close();
write_weights_cooler(uri, c, weights, balancer.variance(), balancer.scale());
return 0;
}

write_weights_hic(f.get<hic::File>(), c, weights);

return 0;
}

static int balance_multires(const BalanceConfig& c) {
const auto resolutions = cooler::MultiResFile(c.path_to_input.string()).resolutions();

for (const auto& res : resolutions) {
balance_singleres_file(
File(fmt::format(FMT_STRING("{}::/resolutions/{}"), c.path_to_input.string(), res)), c);
}
return 0;
}

int balance_subcmd(const BalanceConfig& c) {
if (cooler::utils::is_multires_file(c.path_to_input.string())) {
return balance_multires(c);
}

std::vector<std::uint32_t> resolutions{};
if (hic::utils::is_hic_file(c.path_to_input)) {
resolutions = hic::utils::list_resolutions(c.path_to_input);
} else {
resolutions.push_back(File(c.path_to_input.string()).bin_size());
}

for (const auto& res : resolutions) {
balance_singleres_file(File(c.path_to_input.string(), res), c);
}

return 0;
}
} // namespace hictk::tools
13 changes: 12 additions & 1 deletion src/hictk/cli/cli.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ auto Cli::parse_arguments() -> Config {
_cli.name(_exec_name);
_cli.parse(_argc, _argv);

if (_cli.get_subcommand("convert")->parsed()) {
if (_cli.get_subcommand("balance")->parsed()) {
_subcommand = subcommand::balance;
} else if (_cli.get_subcommand("convert")->parsed()) {
_subcommand = subcommand::convert;
} else if (_cli.get_subcommand("dump")->parsed()) {
_subcommand = subcommand::dump;
Expand Down Expand Up @@ -69,6 +71,8 @@ int Cli::exit(const CLI::ParseError& e) const { return _cli.exit(e); }

std::string_view Cli::subcommand_to_str(subcommand s) noexcept {
switch (s) {
case balance:
return "balance";
case convert:
return "convert";
case dump:
Expand All @@ -93,6 +97,7 @@ void Cli::make_cli() {
_cli.set_version_flag("-V,--version", std::string{hictk::config::version::str_long()});
_cli.require_subcommand(1);

make_balance_subcommand();
make_convert_subcommand();
make_dump_subcommand();
make_load_subcommand();
Expand All @@ -103,6 +108,9 @@ void Cli::make_cli() {

void Cli::validate_args() const {
switch (_subcommand) {
case balance:
validate_balance_subcommand();
break;
case convert:
validate_convert_subcommand();
break;
Expand All @@ -127,6 +135,9 @@ void Cli::validate_args() const {

void Cli::transform_args() {
switch (_subcommand) {
case balance:
transform_args_balance_subcommand();
break;
case convert:
transform_args_convert_subcommand();
break;
Expand Down
Loading

0 comments on commit b839765

Please sign in to comment.