Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update hictk dump to support dumping resolutions, cells and normalizations #62

Merged
merged 7 commits into from
Sep 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .clang-tidy
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ Checks: >
-cppcoreguidelines-pro-bounds-array-to-pointer-decay,
-cppcoreguidelines-pro-bounds-constant-array-index,
-hicpp-no-array-decay,
-misc-no-recursion,
-modernize-use-trailing-return-type,
-readability-identifier-length,
-readability-magic-numbers
Expand Down
4 changes: 4 additions & 0 deletions .github/workflows/codecov.yml
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,10 @@ jobs:
run: |
test/scripts/hictk_dump_chroms.sh build/src/hictk/hictk
test/scripts/hictk_dump_bins.sh build/src/hictk/hictk
test/scripts/hictk_dump_resolutions.sh build/src/hictk/hictk
test/scripts/hictk_dump_normalizations.sh build/src/hictk/hictk
test/scripts/hictk_dump_cells.sh build/src/hictk/hictk

test/scripts/hictk_dump_gw.sh build/src/hictk/hictk
test/scripts/hictk_dump_cis.sh build/src/hictk/hictk
test/scripts/hictk_dump_trans.sh build/src/hictk/hictk
Expand Down
12 changes: 12 additions & 0 deletions .github/workflows/macos-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,18 @@ jobs:
run: |
test/scripts/hictk_dump_bins.sh bin/hictk

- name: Test hictk dump resolutions
run: |
test/scripts/hictk_dump_resolutions.sh bin/hictk

- name: Test hictk dump normalizations
run: |
test/scripts/hictk_dump_normalizations.sh bin/hictk

- name: Test hictk dump cells
run: |
test/scripts/hictk_dump_cells.sh bin/hictk

- name: Test hictk dump genome-wide
run: |
test/scripts/hictk_dump_gw.sh bin/hictk
Expand Down
12 changes: 12 additions & 0 deletions .github/workflows/ubuntu-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -423,6 +423,18 @@ jobs:
run: |
test/scripts/hictk_dump_bins.sh bin/hictk

- name: Test hictk dump resolutions
run: |
test/scripts/hictk_dump_resolutions.sh bin/hictk

- name: Test hictk dump normalizations
run: |
test/scripts/hictk_dump_normalizations.sh bin/hictk

- name: Test hictk dump cells
run: |
test/scripts/hictk_dump_cells.sh bin/hictk

- name: Test hictk dump genome-wide
run: |
test/scripts/hictk_dump_gw.sh bin/hictk
Expand Down
50 changes: 29 additions & 21 deletions src/hictk/cli/cli_dump.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,10 @@ void Cli::make_dump_subcommand() {
"uri",
c.uri,
"Path to a .hic, .cool or .mcool file (Cooler URI syntax supported).")
->check(IsValidHiCFile | IsValidCoolerFile)
->check(IsValidHiCFile |
IsValidCoolerFile |
IsValidMultiresCoolerFile |
IsValidSingleCellCoolerFile)
->required();

sc.add_option(
Expand All @@ -57,7 +60,8 @@ void Cli::make_dump_subcommand() {
"-t,--table",
c.table,
"Name of the table to dump.\n")
->check(CLI::IsMember({"chroms", "bins", "pixels"}))
->check(CLI::IsMember({"chroms", "bins", "pixels", "normalizations",
"resolutions", "cells"}))
->capture_default_str();

sc.add_option(
Expand Down Expand Up @@ -106,6 +110,7 @@ void Cli::make_dump_subcommand() {

// clang-format on

sc.get_option("--range2")->needs(sc.get_option("--range"));
sc.get_option("--query-file")->excludes(sc.get_option("--range"));
sc.get_option("--query-file")->excludes(sc.get_option("--range2"));

Expand All @@ -119,38 +124,40 @@ void Cli::validate_dump_subcommand() const {
std::vector<std::string> errors;
const auto& c = std::get<DumpConfig>(_config);

if (!errors.empty()) {
throw std::runtime_error(
fmt::format(FMT_STRING("the following error(s) where encountered while validating CLI "
"arguments and input file(s):\n - {}"),
fmt::join(errors, "\n - ")));
}
const auto& subcmd = *_cli.get_subcommand("dump");

const auto is_hic = hic::utils::is_hic_file(c.uri);
const auto is_cooler = cooler::utils::is_cooler(c.uri);
const auto is_mcooler = cooler::utils::is_multires_file(c.uri);
const auto is_scool = cooler::utils::is_scool_file(c.uri);

if (is_hic && c.resolution == 0 && c.table != "chroms") {
errors.emplace_back("--resolution is mandatory when file is in .hic format.");
if ((is_hic || is_mcooler) && c.resolution == 0 && (c.table == "pixels" || c.table == "bins")) {
errors.emplace_back("--resolution is mandatory when file is in .hic or .mcool format.");
}

const auto resolution_parsed = !_cli.get_subcommand("dump")->get_option("--resolution")->empty();
const auto resolution_parsed = !subcmd.get_option("--resolution")->empty();

if ((is_cooler || is_mcooler) && resolution_parsed) {
warnings.emplace_back("--resolution is ignored when file is in .cool or .mcool format.");
if ((is_cooler || is_scool) && resolution_parsed) {
warnings.emplace_back("--resolution is ignored when file is in .[s]cool format.");
}

const auto weight_type_parsed =
!_cli.get_subcommand("dump")->get_option("--weight-type")->empty();
const auto weight_type_parsed = !subcmd.get_option("--weight-type")->empty();

if (is_hic && weight_type_parsed) {
warnings.emplace_back("--weight-type is ignored when file is in .hic format.");
}

const auto matrix_type_parsed =
!_cli.get_subcommand("dump")->get_option("--matrix-type")->empty();
const auto matrix_unit_parsed =
!_cli.get_subcommand("dump")->get_option("--matrix-unit")->empty();
const auto range_parsed = !subcmd.get_option("--range")->empty();
if (range_parsed && c.table != "bins" && c.table != "pixels") {
warnings.emplace_back("--range and --range2 are ignore when --table is not bins or pixels");
}
const auto query_file_parsed = !subcmd.get_option("--query-file")->empty();
if (query_file_parsed && c.table != "bins" && c.table != "pixels") {
warnings.emplace_back("--query-file is ignored when --table is not bins or pixels");
}

const auto matrix_type_parsed = !subcmd.get_option("--matrix-type")->empty();
const auto matrix_unit_parsed = !subcmd.get_option("--matrix-unit")->empty();

if (!is_hic && (matrix_type_parsed || matrix_unit_parsed)) {
warnings.emplace_back(
Expand Down Expand Up @@ -181,9 +188,10 @@ void Cli::transform_args_dump_subcommand() {
c.verbosity = static_cast<std::uint8_t>(spdlog::level::critical) - c.verbosity;

c.format = infer_input_format(c.uri);
if (c.format == "hic" && c.resolution == 0) {
assert(c.table == "chroms");
if (c.format == "hic" && c.resolution == 0 && c.table == "chroms") {
c.resolution = hic::utils::list_resolutions(c.uri).back();
} else if (c.format == "mcool" && c.resolution == 0 && c.table == "chroms") {
c.resolution = cooler::utils::list_resolutions(c.uri).back();
}

if (_cli.get_subcommand("dump")->get_option("--range2")->empty()) {
Expand Down
4 changes: 2 additions & 2 deletions src/hictk/cli/cli_validate.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// Copyright (C) 2023 Roberto Rossini <roberros@uio.no>
//
// Created by roby on 7/13/23.
//
// SPDX-License-Identifier: MIT

#include <fmt/format.h>
#include <fmt/std.h>
Expand Down
166 changes: 145 additions & 21 deletions src/hictk/dump/dump.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <variant>

#include "hictk/balancing/methods.hpp"
#include "hictk/cooler.hpp"
#include "hictk/file.hpp"
#include "hictk/tools/config.hpp"
#include "hictk/transformers.hpp"
Expand All @@ -18,23 +19,6 @@ static void print(const ThinPixel<double>& pixel) {
fmt::print(FMT_COMPILE("{:d}\t{:d}\t{:.16g}\n"), pixel.bin1_id, pixel.bin2_id, pixel.count);
}

static void dump_chroms(const File& f, std::string_view range) {
if (range == "all") {
for (const Chromosome& chrom : f.chromosomes()) {
if (!chrom.is_all()) {
fmt::print(FMT_COMPILE("{:s}\t{:d}\n"), chrom.name(), chrom.size());
}
}
return;
}

const auto coords = GenomicInterval::parse_ucsc(f.chromosomes(), std::string{range});
auto it = f.chromosomes().find(coords.chrom());
if (it != f.chromosomes().end()) {
fmt::print(FMT_COMPILE("{:s}\t{:d}\n"), it->name(), it->size());
}
}

template <typename File>
static void dump_bins(const File& f, std::string_view range) {
if (range == "all") {
Expand Down Expand Up @@ -127,9 +111,6 @@ static void dump_pixels(hic::File& f, std::string_view range1, std::string_view
static void process_query(File& f, std::string_view table, std::string_view range1,
std::string_view range2, std::string_view normalization, bool join,
bool sorted) {
if (table == "chroms") {
return dump_chroms(f, range1);
}
if (table == "bins") {
return dump_bins(f, range1);
}
Expand All @@ -139,7 +120,128 @@ static void process_query(File& f, std::string_view table, std::string_view rang
f.get());
}

int dump_subcmd(const DumpConfig& c) {
static int dump_chroms(std::string_view uri, std::string_view format, std::uint32_t resolution) {
Reference ref{};

if (format == "mcool") {
ref = cooler::MultiResFile{std::string{uri}}.chromosomes();
} else if (format == "scool") {
ref = cooler::SingleCellFile{std::string{uri}}.chromosomes();
} else {
ref = File{std::string{uri}, resolution}.chromosomes();
}

for (const Chromosome& chrom : ref) {
if (!chrom.is_all()) {
fmt::print(FMT_COMPILE("{:s}\t{:d}\n"), chrom.name(), chrom.size());
}
}
return 0;
}

// Collect the names of the normalizations available in a file, as a sorted set.
//
// uri:        path/URI of the file to inspect.
// format:     "hic", "cool" or "scool". Must not be "mcool": callers are
//             expected to iterate over resolutions and pass "cool" instead.
// resolution: resolution to open. Must be non-zero when format is "hic".
//
// For "scool" files the normalizations are read from the first cell returned
// by cells() (an empty set is returned when the file has no cells).
static phmap::btree_set<std::string> get_normalizations(std::string_view uri,
                                                        std::string_view format,
                                                        std::uint32_t resolution) {
  assert(format != "mcool");
  assert(format != "hic" || resolution != 0);
  if (format == "scool") {
    const auto cell_ids = cooler::SingleCellFile{uri}.cells();
    if (cell_ids.empty()) {
      return {};
    }

    // Re-enter with the URI of a single cell, which is read as a plain cooler.
    const auto scool_uri = fmt::format(FMT_STRING("{}::/cells/{}"), uri, *cell_ids.begin());
    return get_normalizations(scool_uri, "cool", 0);
  }

  phmap::btree_set<std::string> norms{};
  // The branch must key on the file format. The previous condition compared the
  // URI itself with "hic" and required resolution == 0, which contradicts the
  // assertion above, so this branch could never be taken.
  if (format == "hic") {
    const hic::File hf{std::string{uri}, resolution};

    for (const auto& norm : hf.avail_normalizations()) {
      norms.emplace(std::string{norm.to_string()});
    }
    return norms;
  }

  const auto norms_ = File{std::string{uri}, resolution}.avail_normalizations();
  std::transform(norms_.begin(), norms_.end(), std::inserter(norms, norms.begin()),
                 [](const auto& n) { return std::string{n.to_string()}; });

  return norms;
}

// Print the normalizations available in the file at `uri`, one per line.
//
// uri:        path/URI of the file to inspect.
// format:     file format ("hic", "cool", "mcool" or "scool").
// resolution: resolution to inspect; for "hic" files a value of 0 means that
//             all resolutions listed in the file are inspected.
//
// For "mcool" files and for "hic" files with resolution == 0 the result is the
// union of the normalizations found at every resolution. Nothing is printed
// when no normalization (or no resolution) is found. Always returns 0.
static int dump_normalizations(std::string_view uri, std::string_view format,
                               std::uint32_t resolution) {
  const bool is_mcool = format == "mcool";
  const bool is_hic_all_res = !is_mcool && format == "hic" && resolution == 0;
  const bool scan_all_resolutions = is_mcool || is_hic_all_res;

  std::vector<std::uint32_t> resolutions{};
  if (is_mcool) {
    resolutions = cooler::MultiResFile{uri}.resolutions();
  } else if (is_hic_all_res) {
    resolutions = hic::utils::list_resolutions(std::string{uri});
  }

  if (scan_all_resolutions && resolutions.empty()) {
    return 0;
  }

  phmap::btree_set<std::string> norms{};
  if (scan_all_resolutions) {
    // Each resolution of a .mcool is inspected as a plain cooler.
    const std::string_view per_res_format = format == "hic" ? "hic" : "cool";
    for (const auto res : resolutions) {
      norms.merge(get_normalizations(uri, per_res_format, res));
    }
  } else {
    norms = get_normalizations(uri, format, resolution);
  }

  if (norms.empty()) {
    return 0;
  }

  fmt::print(FMT_STRING("{}\n"), fmt::join(norms, "\n"));
  return 0;
}

// Print the resolution(s) available in the file at `uri`, one per line.
//
// uri:        path/URI of the file to inspect.
// format:     "hic", "mcool", "scool" or "cool".
// resolution: only consulted for "hic" files. When non-zero, the output is
//             restricted to that resolution, and nothing is printed if the
//             file does not list it.
//
// "scool" and "cool" files report their single bin size. Always returns 0.
static int dump_resolutions(std::string_view uri, std::string_view format,
                            std::uint32_t resolution) {
  std::vector<std::uint32_t> resolutions{};

  if (format == "hic") {
    resolutions = hic::utils::list_resolutions(uri);
    if (resolution != 0) {
      // Narrow the list down to the requested resolution, if present.
      const auto match = std::find(resolutions.begin(), resolutions.end(), resolution);
      if (match == resolutions.end()) {
        resolutions.clear();
      } else {
        resolutions.assign(1, resolution);
      }
    }
  } else if (format == "mcool") {
    resolutions = cooler::MultiResFile{uri}.resolutions();
  } else if (format == "scool") {
    resolutions.push_back(cooler::SingleCellFile{uri}.bin_size());
  } else {
    assert(format == "cool");
    resolutions.push_back(cooler::File{uri}.bin_size());
  }

  if (resolutions.empty()) {
    return 0;
  }

  fmt::print(FMT_STRING("{}\n"), fmt::join(resolutions, "\n"));
  return 0;
}

// Print the cells stored in a .scool file, one per line.
//
// uri:    path/URI of the file to inspect.
// format: must be "scool".
//
// Throws std::runtime_error when format is not "scool". Nothing is printed
// when the file has no cells. Returns 0 on success.
static int dump_cells(std::string_view uri, std::string_view format) {
  const bool is_scool = format == "scool";
  if (!is_scool) {
    throw std::runtime_error(fmt::format(FMT_STRING("\"{}\" is not a .scool file"), uri));
  }

  const auto cell_names = cooler::SingleCellFile{uri}.cells();
  if (cell_names.empty()) {
    return 0;
  }

  fmt::print(FMT_STRING("{}\n"), fmt::join(cell_names, "\n"));
  return 0;
}

static int dump_tables(const DumpConfig& c) {
hictk::File f{c.uri, c.resolution, c.matrix_type, c.matrix_unit};

if (c.query_file.empty()) {
Expand All @@ -164,4 +266,26 @@ int dump_subcmd(const DumpConfig& c) {

return 0;
}

// Entry point of the dump subcommand: dispatch on the requested table.
//
// "bins" and "pixels" go through dump_tables(); "chroms", "resolutions",
// "normalizations" and "cells" are served by their dedicated helpers, which
// receive the URI, format and (where relevant) resolution from the config.
// Any table name other than the ones above is expected to be "cells".
//
// Returns the exit code produced by the selected helper.
int dump_subcmd(const DumpConfig& c) {
  const auto& table = c.table;

  const bool reads_bins_or_pixels = table == "bins" || table == "pixels";
  if (reads_bins_or_pixels) {
    return dump_tables(c);
  }

  if (table == "chroms") {
    return dump_chroms(c.uri, c.format, c.resolution);
  }
  if (table == "resolutions") {
    return dump_resolutions(c.uri, c.format, c.resolution);
  }
  if (table == "normalizations") {
    return dump_normalizations(c.uri, c.format, c.resolution);
  }

  assert(c.table == "cells");
  return dump_cells(c.uri, c.format);
}
} // namespace hictk::tools
Loading