Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support variable bin sizes #81

Merged
merged 29 commits into from
Dec 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
e43ecfd
Refactor BinTable and initial implementation of BinTableVariable
robomics Dec 6, 2023
8eb8507
Update cooler::File to support reading files with variable bin size
robomics Dec 6, 2023
7776376
Initial support for creating coolers with variable bin size (C++ API)
robomics Dec 6, 2023
98e671b
Update test dataset
robomics Dec 6, 2023
ad8766b
Update hictk load to support variable bin sizes
robomics Dec 6, 2023
3c4da66
Bugfix
robomics Dec 6, 2023
3c537d6
Bugfix
robomics Dec 6, 2023
8e20097
Bugfix
robomics Dec 6, 2023
01778b8
Bugfix
robomics Dec 7, 2023
3c68959
Add more tests for BinTable
robomics Dec 7, 2023
0f1f8f9
Update pixel parsers
robomics Dec 7, 2023
44e93f3
Update hictk load
robomics Dec 7, 2023
b83a7e4
Detect when a fixed bin table is passed through --bin-table
robomics Dec 7, 2023
2199789
Update integration tests for hictk load
robomics Dec 7, 2023
13865cb
Add script to generate bin tables with variable bin sizes
robomics Dec 7, 2023
8d692dd
Update test dataset
robomics Dec 7, 2023
003d0f4
Fix off-by-one bug
robomics Dec 7, 2023
af0644c
Bugfix
robomics Dec 7, 2023
3c48ee3
Set sensible defaults for one/zero based coords
robomics Dec 7, 2023
1f89d83
Bugfix
robomics Dec 7, 2023
ca1fa65
Bugfix
robomics Dec 7, 2023
4e88525
Fix GCC12 builds
robomics Dec 7, 2023
b03ce51
Fix clang builds
robomics Dec 7, 2023
1cc71df
Address clang-tidy warnings
robomics Dec 8, 2023
a003682
Validate bins upon constructing an instance of BinTableVariable
robomics Dec 8, 2023
8017d56
Update hictk merge to support coolers with variable bin size
robomics Dec 8, 2023
e121363
Update hictk convert to prevent conversion of cooler with variable bin size
robomics Dec 8, 2023
05dc1bb
Update hictk zoomify to prevent processing of coolers with variable bin size
robomics Dec 8, 2023
80965a7
Address clang-tidy warnings
robomics Dec 8, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .github/workflows/codecov.yml
Original file line number Diff line number Diff line change
Expand Up @@ -212,8 +212,7 @@ jobs:
test/scripts/hictk_load_coo.sh build/src/hictk/hictk unsorted
test/scripts/hictk_load_bg2.sh build/src/hictk/hictk sorted
test/scripts/hictk_load_bg2.sh build/src/hictk/hictk unsorted
test/scripts/hictk_load_4dn.sh build/src/hictk/hictk sorted
test/scripts/hictk_load_4dn.sh build/src/hictk/hictk unsorted
test/scripts/hictk_load_4dn.sh build/src/hictk/hictk

test/scripts/hictk_merge.sh build/src/hictk/hictk

Expand Down
8 changes: 2 additions & 6 deletions .github/workflows/macos-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -430,13 +430,9 @@ jobs:
run: |
test/scripts/hictk_load_bg2.sh bin/hictk unsorted

- name: Test hictk load 4dn sorted
- name: Test hictk load 4dn
run: |
test/scripts/hictk_load_4dn.sh bin/hictk sorted

- name: Test hictk load 4dn unsorted
run: |
test/scripts/hictk_load_4dn.sh bin/hictk unsorted
test/scripts/hictk_load_4dn.sh bin/hictk

- name: Test hictk merge
run: |
Expand Down
8 changes: 2 additions & 6 deletions .github/workflows/ubuntu-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -483,13 +483,9 @@ jobs:
run: |
test/scripts/hictk_load_bg2.sh bin/hictk unsorted

- name: Test hictk load 4dn sorted
- name: Test hictk load 4dn
run: |
test/scripts/hictk_load_4dn.sh bin/hictk sorted

- name: Test hictk load 4dn unsorted
run: |
test/scripts/hictk_load_4dn.sh bin/hictk unsorted
test/scripts/hictk_load_4dn.sh bin/hictk

- name: Test hictk merge
run: |
Expand Down
4 changes: 2 additions & 2 deletions cmake/FetchTestDataset.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

# cmake-format: off
file(
DOWNLOAD https://zenodo.org/record/8392109/files/hictk_test_data.tar.xz?download=1
EXPECTED_HASH SHA256=43d05077082603cf03dc5ce2bfc0f81b2422c7249ee987e09512d2e3a56afff1
DOWNLOAD https://zenodo.org/records/10289491/files/hictk_test_data.tar.xz?download=1
EXPECTED_HASH SHA256=5e69dceb8789d923a38aed7add8fc18abfdfe531aea6effcdb7efe3c9bcf5246
"${PROJECT_SOURCE_DIR}/test/data/hictk_test_data.tar.xz")
# cmake-format: on

Expand Down
62 changes: 54 additions & 8 deletions src/hictk/cli/cli_load.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,19 +35,25 @@
->check(CLI::ExistingFile)
->required();

sc.add_option(
"bin-size",
c.bin_size,
"Bin size (bp).")
->check(CLI::PositiveNumber)
->required();

sc.add_option(
"output-uri",
c.uri,
"Path to output Cooler (URI syntax supported).")
->required();

sc.add_option(
"-b,--bin-size",
c.bin_size,
"Bin size (bp).\n"
"Required when --bin-table is not used.")
->check(CLI::PositiveNumber);

sc.add_option(
"-t,--bin-table",
c.path_to_bin_table,
"Path to a BED3+ file with the bin table.")
->check(CLI::ExistingFile);

sc.add_option(
"-f,--format",
c.format,
Expand All @@ -67,6 +73,13 @@
"Assembly name.")
->capture_default_str();

// Registers the paired --one-based / --zero-based flag, stored in c.one_based.
// The help text names the formats exactly as they are spelled on the command
// line ("4dn", "validpairs") so users can match them against --format.
sc.add_flag(
  "--one-based,!--zero-based",
  c.one_based,
  "Interpret genomic coordinates or bins as one/zero based.\n"
  "By default coordinates are assumed to be one-based for interactions in\n"
  "4dn and validpairs formats and zero-based otherwise.");

sc.add_flag(
"--count-as-float",
c.count_as_float,
Expand All @@ -89,24 +102,48 @@
sc.add_option(
"--batch-size",
c.batch_size,
"Number of pixels to buffer in memory. Only used when processing unsorted interactions or pairs")
"Number of pixels to buffer in memory.\n"
"Only used when processing unsorted interactions or pairs.")
->capture_default_str();
// clang-format on

sc.get_option("--bin-size")->excludes(sc.get_option("--bin-table"));
_config = std::monostate{};
}

void Cli::validate_load_subcommand() const {
assert(_cli.get_subcommand("load")->parsed());

std::vector<std::string> warnings;
std::vector<std::string> errors;
const auto& c = std::get<LoadConfig>(_config);
const auto& sc = *_cli.get_subcommand("load");

if (!c.force && std::filesystem::exists(c.uri)) {
errors.emplace_back(fmt::format(
FMT_STRING("Refusing to overwrite file {}. Pass --force to overwrite."), c.uri));
}

if (c.path_to_bin_table.empty() && c.path_to_chrom_sizes.empty()) {
assert(c.bin_size == 0);
errors.emplace_back("--bin-size is required when --bin-table is not specified.");

Check warning on line 129 in src/hictk/cli/cli_load.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/cli/cli_load.cpp#L128-L129

Added lines #L128 - L129 were not covered by tests
}

if ((c.format == "bg2" || c.format == "coo") && !sc.get_option("--bin-table")->empty()) {
errors.emplace_back(

Check warning on line 133 in src/hictk/cli/cli_load.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/cli/cli_load.cpp#L133

Added line #L133 was not covered by tests
"specifying bins through the --bin-table is not supported when ingesting pre-binned "
"interactions.");
}

// Warn when --assume-sorted is combined with a pairs-based input format.
// The two format comparisons must be OR-ed: a single string can never equal
// both "4dn" and "validpairs", so an AND-ed test would make this warning
// unreachable.
if ((c.format == "4dn" || c.format == "validpairs") && c.assume_sorted) {
warnings.emplace_back(

Check warning on line 139 in src/hictk/cli/cli_load.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/cli/cli_load.cpp#L139

Added line #L139 was not covered by tests
"--assume-sorted has no effect when ingesting interactions in 4dn or validpairs format.");
}

for (const auto& w : warnings) {
SPDLOG_WARN(FMT_STRING("{}"), w);

Check warning on line 144 in src/hictk/cli/cli_load.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/cli/cli_load.cpp#L144

Added line #L144 was not covered by tests
}

if (!errors.empty()) {
throw std::runtime_error(
fmt::format(FMT_STRING("the following error(s) where encountered while validating CLI "
Expand All @@ -117,6 +154,15 @@

void Cli::transform_args_load_subcommand() {
auto& c = std::get<LoadConfig>(_config);
const auto& sc = *_cli.get_subcommand("load");

if (sc.get_option("--one-based")->empty()) {
if (c.format == "4dn" || c.format == "validpairs") {
c.offset = -1;
}
} else {
c.offset = c.one_based ? -1 : 0;

Check warning on line 164 in src/hictk/cli/cli_load.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/cli/cli_load.cpp#L164

Added line #L164 was not covered by tests
}

// in spdlog, high numbers correspond to low log levels
assert(c.verbosity > 0 && c.verbosity < 5);
Expand Down
6 changes: 6 additions & 0 deletions src/hictk/cli/cli_zoomify.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,12 @@
"--resolutions.");
}

if (clr.bin_size() == 0) { // Variable bin size
errors.clear();
warnings.clear();
errors.emplace_back("zoomifying files with variable bin size is not currently supported.");

Check warning on line 151 in src/hictk/cli/cli_zoomify.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/cli/cli_zoomify.cpp#L149-L151

Added lines #L149 - L151 were not covered by tests
}

for (const auto& w : warnings) {
SPDLOG_WARN(FMT_STRING("{}"), w);
}
Expand Down
6 changes: 6 additions & 0 deletions src/hictk/convert/cool_to_hic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,12 @@
c.path_to_input.string(), c.resolutions.front());

const cooler::File clr(uri);

if (clr.bin_size() == 0) {
throw std::runtime_error(
"converting cooler files with variable bin size is not supported.");

Check warning on line 320 in src/hictk/convert/cool_to_hic.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/convert/cool_to_hic.cpp#L319-L320

Added lines #L319 - L320 were not covered by tests
}

dump_chrom_sizes(clr, chrom_sizes);
dump_pixels(clr, pixels, c.gzip_compression_lvl, c.threads);
}
Expand Down
4 changes: 4 additions & 0 deletions src/hictk/include/hictk/tools/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,13 @@ struct LoadConfig {
std::string uri{};

std::filesystem::path path_to_chrom_sizes{};
std::filesystem::path path_to_bin_table{};
std::uint32_t bin_size{};

std::string format{};
std::string assembly{"unknown"};
bool one_based{true};
std::int64_t offset{0};
bool count_as_float{false};
bool assume_sorted{false};
bool force{false};
Expand Down
10 changes: 5 additions & 5 deletions src/hictk/load/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,20 +31,20 @@

template <typename N>
[[nodiscard]] inline ThinPixel<N> parse_pixel(const BinTable& bins, std::string_view line,
Format format) {
Format format, std::int64_t offset) {
ThinPixel<N> pixel{};
switch (format) {
case Format::COO:
pixel = ThinPixel<N>::from_coo(bins, line);
pixel = ThinPixel<N>::from_coo(bins, line, offset);
break;
case Format::BG2:
pixel = Pixel<N>::from_bg2(bins, line).to_thin();
pixel = Pixel<N>::from_bg2(bins, line, offset).to_thin();
break;
case Format::VP:
pixel = Pixel<N>::from_validpair(bins, line).to_thin();
pixel = Pixel<N>::from_validpair(bins, line, offset).to_thin();

Check warning on line 44 in src/hictk/load/common.hpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/load/common.hpp#L44

Added line #L44 was not covered by tests
break;
case Format::_4DN:
pixel = Pixel<N>::from_4dn_pairs(bins, line).to_thin();
pixel = Pixel<N>::from_4dn_pairs(bins, line, offset).to_thin();
break;
}
if (pixel.bin1_id > pixel.bin2_id) {
Expand Down
Loading