diff --git a/src/libhictk/hic/include/hictk/hic.hpp b/src/libhictk/hic/include/hictk/hic.hpp index d5b123ba..53d12c34 100644 --- a/src/libhictk/hic/include/hictk/hic.hpp +++ b/src/libhictk/hic/include/hictk/hic.hpp @@ -85,6 +85,13 @@ class File { std::uint64_t first_bin2, std::uint64_t last_bin2, balancing::Method norm = balancing::Method::NONE()) const; + [[nodiscard]] balancing::Weights normalization(balancing::Method norm, + const Chromosome &chrom) const; + [[nodiscard]] balancing::Weights normalization(std::string_view norm, + const Chromosome &chrom) const; + [[nodiscard]] balancing::Weights normalization(balancing::Method norm) const; + [[nodiscard]] balancing::Weights normalization(std::string_view norm) const; + [[nodiscard]] std::size_t num_cached_footers() const noexcept; void purge_footer_cache(); diff --git a/src/libhictk/hic/include/hictk/hic/binary_buffer.hpp b/src/libhictk/hic/include/hictk/hic/binary_buffer.hpp index 671fd883..00a992b7 100644 --- a/src/libhictk/hic/include/hictk/hic/binary_buffer.hpp +++ b/src/libhictk/hic/include/hictk/hic/binary_buffer.hpp @@ -20,7 +20,13 @@ class BinaryBuffer { // NOLINTNEXTLINE template ::value>::type* = nullptr> T read(); + template ::value>::type* = nullptr> + void read(T& buff); + template ::value>::type* = nullptr> + void read(std::vector& data); void read(std::string& buff, std::size_t n); + void read(char* buff, std::size_t n); + std::string getline(char delim = '\n'); // NOLINTNEXTLINE template ::value>::type* = nullptr> void write(T data); diff --git a/src/libhictk/hic/include/hictk/hic/file_writer.hpp b/src/libhictk/hic/include/hictk/hic/file_writer.hpp index e52a8074..ed3d829f 100644 --- a/src/libhictk/hic/include/hictk/hic/file_writer.hpp +++ b/src/libhictk/hic/include/hictk/hic/file_writer.hpp @@ -24,6 +24,7 @@ #include #include +#include "hictk/balancing/weights.hpp" #include "hictk/bin_table.hpp" #include "hictk/default_delete.hpp" #include "hictk/hash.hpp" @@ -112,7 +113,7 @@ class HiCFileWriter { BlockMappers _block_mappers{}; using StatsTank = phmap::flat_hash_map; - using FooterTank = phmap::btree_map, FooterV5>; + using FooterTank = phmap::btree_map, FooterMasterIndex>; MatrixBodyMetadataTank _matrix_metadata{}; FooterTank _footers{}; @@ -123,6 +124,9 @@ class HiCFileWriter { std::unique_ptr _compressor{}; std::string _compression_buffer{}; + phmap::btree_set _normalized_expected_values{}; + phmap::btree_map> _normalization_vectors{}; + HiCSectionOffsets _header_section{}; HiCSectionOffsets _data_block_section{}; HiCSectionOffsets _body_metadata_section{}; @@ -137,12 +141,12 @@ class HiCFileWriter { public: HiCFileWriter() = default; - explicit HiCFileWriter( - std::string_view path_, Reference chromosomes_, std::vector resolutions_, - std::string_view assembly_ = "unknown", std::size_t n_threads = 1, - std::size_t chunk_size = 10'000'000, - const std::filesystem::path& tmpdir = std::filesystem::temp_directory_path(), - std::uint32_t compression_lvl = 12, std::size_t buffer_size = 32'000'000); + explicit HiCFileWriter(std::string_view path_); + HiCFileWriter(std::string_view path_, Reference chromosomes_, + std::vector resolutions_, std::string_view assembly_ = "unknown", + std::size_t n_threads = 1, std::size_t chunk_size = 10'000'000, + const std::filesystem::path& tmpdir = std::filesystem::temp_directory_path(), + std::uint32_t compression_lvl = 12, std::size_t buffer_size = 32'000'000); [[nodiscard]] std::string_view url() const noexcept; [[nodiscard]] const Reference& chromosomes() const noexcept; @@ -176,13 +180,26 @@ class HiCFileWriter { // Write expected/normalization values void compute_and_write_expected_values(); + // Write normalization vectors + void add_norm_vector(const NormalizationVectorIndexBlock& blk, const std::vector& weights); + void add_norm_vector(std::string_view type, const Chromosome& chrom, std::string_view unit, + std::uint32_t bin_size, const std::vector& weights, + std::size_t position = std::numeric_limits::max(), + std::size_t n_bytes = std::numeric_limits::max()); + void add_norm_vector(const NormalizationVectorIndexBlock& blk, const balancing::Weights& weights); + void add_norm_vector(std::string_view type, const Chromosome& chrom, std::string_view unit, + std::uint32_t bin_size, const balancing::Weights& weights, + std::size_t position = std::numeric_limits::max(), + std::size_t n_bytes = std::numeric_limits::max()); + void write_norm_vectors(); + void write_empty_expected_values(); void write_empty_normalized_expected_values(); - void write_empty_norm_vectors(); void finalize(); private: + [[nodiscard]] static HiCHeader read_header(std::string_view path); [[nodiscard]] static HiCHeader init_header(std::string_view path, Reference chromosomes, std::vector resolutions, std::string_view assembly); @@ -211,6 +228,10 @@ class HiCFileWriter { [[nodiscard]] std::size_t compute_num_bins(const Chromosome& chrom1, const Chromosome& chrom2, std::uint32_t resolution); + void read_normalized_expected_values(); + void read_norm_vectors(); + [[nodiscard]] std::vector read_norm_vector(const NormalizationVectorIndexBlock& blk); + // Methods to be called from worker threads auto merge_and_compress_blocks_thr( HiCInteractionToBlockMapper& mapper, std::mutex& mapper_mtx, diff --git a/src/libhictk/hic/include/hictk/hic/file_writer_data_structures.hpp b/src/libhictk/hic/include/hictk/hic/file_writer_data_structures.hpp index 64780031..ff6d7f05 100644 --- a/src/libhictk/hic/include/hictk/hic/file_writer_data_structures.hpp +++ b/src/libhictk/hic/include/hictk/hic/file_writer_data_structures.hpp @@ -15,6 +15,7 @@ #include #include "hictk/hic/binary_buffer.hpp" +#include "hictk/hic/filestream.hpp" #include "hictk/pixel.hpp" namespace hictk::hic::internal { @@ -124,7 +125,7 @@ struct MatrixInteractionBlock { }; // https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md#master-index -struct MasterIndex { +struct FooterMasterIndex { std::string key; std::int64_t position; std::int32_t size; @@ -134,54 +135,103 @@ struct MasterIndex { struct ExpectedValuesBlock { std::string unit{}; std::int32_t binSize{}; - std::int64_t nValues{}; + [[nodiscard]] std::int64_t nValues() const noexcept; std::vector value{}; - std::int32_t nChrScaleFactors{}; + [[nodiscard]] std::int32_t nChrScaleFactors() const noexcept; std::vector chrIndex{}; std::vector chrScaleFactor{}; + ExpectedValuesBlock() = default; ExpectedValuesBlock(std::string_view unit_, std::uint32_t bin_size, const std::vector& weights, const std::vector& chrom_ids, const std::vector& scale_factors); + + [[nodiscard]] bool operator<(const ExpectedValuesBlock& other) const noexcept; + [[nodiscard]] std::string serialize(BinaryBuffer& buffer, bool clear = true) const; + [[nodiscard]] static ExpectedValuesBlock deserialize(filestream::FileStream& fs); }; // https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md#expected-value-vectors -struct ExpectedValues { - std::int32_t nExpectedValueVectors = 0; - std::vector expectedValues; +class ExpectedValues { + std::vector _expected_values; + + public: + [[nodiscard]] std::int32_t nExpectedValueVectors() const noexcept; + [[nodiscard]] const std::vector& expectedValues() const noexcept; + void emplace_back(ExpectedValuesBlock evb); [[nodiscard]] std::string serialize(BinaryBuffer& buffer, bool clear = true) const; + [[nodiscard]] static ExpectedValues deserialize(filestream::FileStream& fs); }; -// https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md#normalized-expected-value-vectors -struct NormalizedExpectedValues { - std::int32_t nNormExpectedValueVectors = 0; +struct NormalizedExpectedValuesBlock { + std::string type{}; + std::string unit{}; + std::int32_t binSize{}; + [[nodiscard]] std::int64_t nValues() const noexcept; + std::vector value{}; + [[nodiscard]] std::int32_t nChrScaleFactors() const noexcept; + std::vector chrIndex{}; + std::vector chrScaleFactor{}; + + NormalizedExpectedValuesBlock() = default; + NormalizedExpectedValuesBlock(std::string_view type_, std::string_view unit_, + std::uint32_t bin_size, const std::vector& weights, + const std::vector& chrom_ids, + const std::vector& scale_factors); + + [[nodiscard]] bool operator<(const NormalizedExpectedValuesBlock& other) const noexcept; + [[nodiscard]] std::string serialize(BinaryBuffer& buffer, bool clear = true) const; + [[nodiscard]] static NormalizedExpectedValuesBlock deserialize(filestream::FileStream& fs); }; -// https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md#normalization-vector-index -struct NormalizationVectorIndex { - std::int32_t nNormVectors = 0; +// https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md#normalized-expected-value-vectors +class NormalizedExpectedValues { + std::vector _normalized_expected_values; + + public: + [[nodiscard]] std::int32_t nNormExpectedValueVectors() const noexcept; + [[nodiscard]] const std::vector& normExpectedValues() + const noexcept; + void emplace_back(NormalizedExpectedValuesBlock evb); [[nodiscard]] std::string serialize(BinaryBuffer& buffer, bool clear = true) const; + [[nodiscard]] static NormalizedExpectedValues deserialize(filestream::FileStream& fs); }; -// https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md#normalization-vector-arrays-1-per-normalization-vector -struct NormalizationVectorArray { - std::int64_t nValues = 0; +struct NormalizationVectorIndexBlock { + std::string type{}; + std::int32_t chrIdx{}; + std::string unit{}; + std::int32_t binSize{}; + std::int64_t position{}; + std::int64_t nBytes{}; + + private: + public: + NormalizationVectorIndexBlock() = default; + NormalizationVectorIndexBlock(std::string type_, std::uint32_t chrom_idx, std::string unit_, + std::uint32_t bin_size, std::size_t position_, std::size_t n_bytes); + + [[nodiscard]] bool operator<(const NormalizationVectorIndexBlock& other) const noexcept; + [[nodiscard]] std::string serialize(BinaryBuffer& buffer, bool clear = true) const; + [[nodiscard]] static NormalizationVectorIndexBlock deserialize(filestream::FileStream& fs); }; -struct FooterV5 { - MasterIndex masterIndex{}; +// https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md#normalization-vector-index +class NormalizationVectorIndex { + std::vector _norm_vect_idx{}; - ExpectedValues expectedValues{}; - NormalizedExpectedValues normExpectedValues{}; - NormalizationVectorIndex normVectIndex{}; - std::vector normVectArray{}; + public: + [[nodiscard]] std::int32_t nNormVectors() const noexcept; + [[nodiscard]] const std::vector normalizationVectorIndex() + const noexcept; + void emplace_back(NormalizationVectorIndexBlock blk); - FooterV5() = default; [[nodiscard]] std::string serialize(BinaryBuffer& buffer, bool clear = true) const; + [[nodiscard]] static NormalizationVectorIndex deserialize(filestream::FileStream& fs); }; } // namespace hictk::hic::internal diff --git a/src/libhictk/hic/include/hictk/hic/filestream.hpp b/src/libhictk/hic/include/hictk/hic/filestream.hpp index 3e27e2e0..2cfd2d16 100644 --- a/src/libhictk/hic/include/hictk/hic/filestream.hpp +++ b/src/libhictk/hic/include/hictk/hic/filestream.hpp @@ -25,7 +25,7 @@ class FileStream { public: FileStream() = default; - explicit FileStream(std::string path); + explicit FileStream(std::string path, std::ios::openmode mode = std::ios::in); static FileStream create(std::string path); [[nodiscard]] const std::string &path() const noexcept; diff --git a/src/libhictk/hic/include/hictk/hic/impl/binary_buffer_impl.hpp b/src/libhictk/hic/include/hictk/hic/impl/binary_buffer_impl.hpp index 1343eb41..05d23bca 100644 --- a/src/libhictk/hic/include/hictk/hic/impl/binary_buffer_impl.hpp +++ b/src/libhictk/hic/include/hictk/hic/impl/binary_buffer_impl.hpp @@ -21,14 +21,34 @@ inline T BinaryBuffer::read() { return x; } +template ::value>::type *> +inline void BinaryBuffer::read(T &buff) { + buff = read(); +} + +template ::value>::type *> +inline void BinaryBuffer::read(std::vector &buff) { + read(reinterpret_cast(buff.data()), sizeof(T) * buff.size()); +} + inline void BinaryBuffer::read(std::string &buff, std::size_t n) { + buff.resize(n); + read(buff.data(), n); +} + +inline void BinaryBuffer::read(char *buff, std::size_t n) { static_assert(sizeof(char) == 1); assert(_i < _buffer.size()); - buff.resize(n); - std::memcpy(static_cast(buff.data()), _buffer.data() + _i, sizeof(char)); + std::memcpy(static_cast(buff), _buffer.data() + _i, n * sizeof(char)); _i += sizeof(char); } +inline std::string BinaryBuffer::getline(char delim) { + std::string_view view{_buffer}; + const auto pos = view.substr(_i).find(delim); + return std::string{view.substr(0, pos)}; +} + template ::value>::type *> inline void BinaryBuffer::write(T data) { static_assert(sizeof(char) == 1); diff --git a/src/libhictk/hic/include/hictk/hic/impl/file_writer_data_structures_impl.hpp b/src/libhictk/hic/include/hictk/hic/impl/file_writer_data_structures_impl.hpp index 2689c43c..eca204c5 100644 --- a/src/libhictk/hic/include/hictk/hic/impl/file_writer_data_structures_impl.hpp +++ b/src/libhictk/hic/include/hictk/hic/impl/file_writer_data_structures_impl.hpp @@ -147,7 +147,8 @@ inline void MatrixInteractionBlock::finalize() { const auto size_dense = compute_size_dense_repr(); const auto width = compute_dense_width(); - const auto use_lor = (size_lor < size_dense) || (width > std::numeric_limits::max()); + const auto use_lor = + (size_lor < size_dense) || (width > std::numeric_limits::max()); useFloatContact = 1; useIntXPos = 1; @@ -321,7 +322,7 @@ inline void MatrixInteractionBlock::compress(const std::string &buffer_in, } } -inline std::string MasterIndex::serialize(BinaryBuffer &buffer, bool clear) const { +inline std::string FooterMasterIndex::serialize(BinaryBuffer &buffer, bool clear) const { if (clear) { buffer.clear(); } @@ -333,15 +334,22 @@ inline std::string MasterIndex::serialize(BinaryBuffer &buffer, bool clear) cons return buffer.get(); } +inline std::int64_t ExpectedValuesBlock::nValues() const noexcept { + return static_cast(value.size()); +} + +inline std::int32_t ExpectedValuesBlock::nChrScaleFactors() const noexcept { + assert(chrIndex.size() == chrScaleFactor.size()); + return static_cast(chrIndex.size()); +} + inline ExpectedValuesBlock::ExpectedValuesBlock(std::string_view unit_, std::uint32_t bin_size, const std::vector &weights, const std::vector &chrom_ids, const std::vector &scale_factors) : unit(std::string{unit_}), binSize(static_cast(bin_size)), - nValues(static_cast(weights.size())), value(weights.size()), - nChrScaleFactors(static_cast(chrom_ids.size())), chrIndex(chrom_ids.size()), chrScaleFactor(chrom_ids.size()) { std::transform(weights.begin(), weights.end(), value.begin(), @@ -352,6 +360,14 @@ inline ExpectedValuesBlock::ExpectedValuesBlock(std::string_view unit_, std::uin [](const auto n) { return static_cast(n); }); } +inline bool ExpectedValuesBlock::operator<(const ExpectedValuesBlock &other) const noexcept { + if (unit != other.unit) { + return unit < other.unit; + } + + return binSize < other.binSize; +} + inline std::string ExpectedValuesBlock::serialize(BinaryBuffer &buffer, bool clear) const { if (clear) { buffer.clear(); @@ -359,9 +375,9 @@ inline std::string ExpectedValuesBlock::serialize(BinaryBuffer &buffer, bool cle buffer.write(unit); buffer.write(binSize); - buffer.write(nValues); + buffer.write(nValues()); buffer.write(value); - buffer.write(nChrScaleFactors); + buffer.write(nChrScaleFactors()); assert(chrIndex.size() == chrScaleFactor.size()); for (std::size_t i = 0; i < chrIndex.size(); ++i) { @@ -372,59 +388,269 @@ inline std::string ExpectedValuesBlock::serialize(BinaryBuffer &buffer, bool cle return buffer.get(); } +inline ExpectedValuesBlock ExpectedValuesBlock::deserialize(filestream::FileStream &fs) { + ExpectedValuesBlock evb{}; + evb.unit = fs.getline('\0'); + fs.read(evb.binSize); + const auto nValues = static_cast(fs.read()); + evb.value.resize(nValues); + fs.read(evb.value); + const auto nChrScaleFactors = static_cast(fs.read()); + evb.chrIndex.resize(nChrScaleFactors); + evb.chrScaleFactor.resize(nChrScaleFactors); + + for (std::size_t i = 0; i < nChrScaleFactors; ++i) { + evb.chrIndex.emplace_back(fs.read()); + evb.chrScaleFactor.emplace_back(fs.read()); + } + + return evb; +} + +inline std::int32_t ExpectedValues::nExpectedValueVectors() const noexcept { + return static_cast(expectedValues().size()); +} + +inline const std::vector &ExpectedValues::expectedValues() const noexcept { + return _expected_values; +} + +inline void ExpectedValues::emplace_back(ExpectedValuesBlock evb) { + _expected_values.emplace_back(std::move(evb)); +} + inline std::string ExpectedValues::serialize(BinaryBuffer &buffer, bool clear) const { if (clear) { buffer.clear(); } - buffer.write(nExpectedValueVectors); + buffer.write(nExpectedValueVectors()); - if (nExpectedValueVectors == 0) { + if (nExpectedValueVectors() == 0) { return buffer.get(); } - for (const auto &ev : expectedValues) { + for (const auto &ev : expectedValues()) { std::ignore = ev.serialize(buffer, false); } return buffer.get(); } -inline std::string NormalizedExpectedValues::serialize(BinaryBuffer &buffer, bool clear) const { +inline ExpectedValues ExpectedValues::deserialize(filestream::FileStream &fs) { + ExpectedValues evs{}; + const auto nExpectedValueVectors = static_cast(fs.read()); + for (std::size_t i = 0; i < nExpectedValueVectors; ++i) { + evs.emplace_back(ExpectedValuesBlock::deserialize(fs)); + } + return evs; +} + +inline std::int64_t NormalizedExpectedValuesBlock::nValues() const noexcept { + return static_cast(value.size()); +} + +inline std::int32_t NormalizedExpectedValuesBlock::nChrScaleFactors() const noexcept { + assert(chrIndex.size() == chrScaleFactor.size()); + return static_cast(chrIndex.size()); +} + +inline NormalizedExpectedValuesBlock::NormalizedExpectedValuesBlock( + std::string_view type_, std::string_view unit_, std::uint32_t bin_size, + const std::vector &weights, const std::vector &chrom_ids, + const std::vector &scale_factors) + : type(std::string{type_}), + unit(std::string{unit_}), + binSize(static_cast(bin_size)), + value(weights.size()), + chrIndex(chrom_ids.size()), + chrScaleFactor(chrom_ids.size()) { + std::transform(weights.begin(), weights.end(), value.begin(), + [](const auto n) { return static_cast(n); }); + std::transform(chrom_ids.begin(), chrom_ids.end(), chrIndex.begin(), + [](const auto n) { return static_cast(n); }); + std::transform(scale_factors.begin(), scale_factors.end(), chrScaleFactor.begin(), + [](const auto n) { return static_cast(n); }); +} + +inline bool NormalizedExpectedValuesBlock::operator<( + const NormalizedExpectedValuesBlock &other) const noexcept { + if (type != other.type) { + return type < other.type; + } + if (unit != other.unit) { + return unit < other.unit; + } + return binSize < other.binSize; +} + +inline std::string NormalizedExpectedValuesBlock::serialize(BinaryBuffer &buffer, + bool clear) const { if (clear) { buffer.clear(); } - buffer.write(nNormExpectedValueVectors); + buffer.write(type); + buffer.write(unit); + buffer.write(binSize); + buffer.write(nValues()); + buffer.write(value); + buffer.write(nChrScaleFactors()); + + assert(chrIndex.size() == chrScaleFactor.size()); + for (std::size_t i = 0; i < chrIndex.size(); ++i) { + buffer.write(chrIndex[i]); + buffer.write(chrScaleFactor[i]); + } return buffer.get(); } -inline std::string NormalizationVectorIndex::serialize(BinaryBuffer &buffer, bool clear) const { +inline NormalizedExpectedValuesBlock NormalizedExpectedValuesBlock::deserialize( + filestream::FileStream &fs) { + NormalizedExpectedValuesBlock nevb{}; + nevb.type = fs.getline('\0'); + nevb.unit = fs.getline('\0'); + fs.read(nevb.binSize); + const auto nValues = static_cast(fs.read()); + nevb.value.resize(nValues); + fs.read(nevb.value); + const auto nChrScaleFactors = static_cast(fs.read()); + nevb.chrIndex.resize(nChrScaleFactors); + nevb.chrScaleFactor.resize(nChrScaleFactors); + + for (std::size_t i = 0; i < nChrScaleFactors; ++i) { + nevb.chrIndex.emplace_back(fs.read()); + nevb.chrScaleFactor.emplace_back(fs.read()); + } + + return nevb; +} + +inline std::int32_t NormalizedExpectedValues::nNormExpectedValueVectors() const noexcept { + return static_cast(_normalized_expected_values.size()); +} + +inline const std::vector & +NormalizedExpectedValues::normExpectedValues() const noexcept { + return _normalized_expected_values; +} + +inline void NormalizedExpectedValues::emplace_back(NormalizedExpectedValuesBlock evb) { + _normalized_expected_values.emplace_back(std::move(evb)); +} + +inline std::string NormalizedExpectedValues::serialize(BinaryBuffer &buffer, bool clear) const { if (clear) { buffer.clear(); } - buffer.write(nNormVectors); + buffer.write(nNormExpectedValueVectors()); + for (const auto &nev : _normalized_expected_values) { + std::ignore = nev.serialize(buffer, false); + } return buffer.get(); } -inline std::string NormalizationVectorArray::serialize(BinaryBuffer &buffer, bool clear) const { +inline NormalizedExpectedValues NormalizedExpectedValues::deserialize(filestream::FileStream &fs) { + NormalizedExpectedValues nevs{}; + const auto nNormExpectedValueVectors = static_cast(fs.read()); + nevs._normalized_expected_values.reserve(nNormExpectedValueVectors); + for (std::size_t i = 0; i < nNormExpectedValueVectors; ++i) { + nevs.emplace_back(NormalizedExpectedValuesBlock::deserialize(fs)); + } + return nevs; +} + +inline NormalizationVectorIndexBlock::NormalizationVectorIndexBlock( + std::string type_, std::uint32_t chrom_idx, std::string unit_, std::uint32_t bin_size, + std::size_t position_, std::size_t n_bytes) + : type(std::move(type_)), + chrIdx(static_cast(chrom_idx)), + unit(std::move(unit_)), + binSize(static_cast(bin_size)), + position(static_cast(position_)), + nBytes(static_cast(n_bytes)) {} + +inline bool NormalizationVectorIndexBlock::operator<( + const NormalizationVectorIndexBlock &other) const noexcept { + if (type != other.type) { + return type < other.type; + } + if (chrIdx != other.chrIdx) { + return chrIdx < other.chrIdx; + } + if (unit != other.unit) { + return unit < other.unit; + } + return binSize < other.binSize; +} + +inline std::string NormalizationVectorIndexBlock::serialize(BinaryBuffer &buffer, + bool clear) const { if (clear) { buffer.clear(); } - buffer.write(nValues); + buffer.write(type); + buffer.write(chrIdx); + buffer.write(unit); + buffer.write(binSize); + buffer.write(position); + buffer.write(nBytes); return buffer.get(); } -inline std::string FooterV5::serialize(BinaryBuffer &buffer, bool clear) const { +inline NormalizationVectorIndexBlock NormalizationVectorIndexBlock::deserialize( + filestream::FileStream &fs) { + NormalizationVectorIndexBlock nvib{}; + + nvib.type = fs.getline('\0'); + nvib.chrIdx = fs.read(); + nvib.unit = fs.getline('\0'); + nvib.binSize = fs.read(); + nvib.position = fs.read(); + nvib.nBytes = fs.read(); + + return nvib; +} + +inline std::int32_t NormalizationVectorIndex::nNormVectors() const noexcept { + return static_cast(_norm_vect_idx.size()); +} + +inline const std::vector +NormalizationVectorIndex::normalizationVectorIndex() const noexcept { + return _norm_vect_idx; +} + +inline void NormalizationVectorIndex::emplace_back(NormalizationVectorIndexBlock blk) { + _norm_vect_idx.emplace_back(std::move(blk)); +} + +inline std::string NormalizationVectorIndex::serialize(BinaryBuffer &buffer, bool clear) const { if (clear) { buffer.clear(); } - return masterIndex.serialize(buffer); + buffer.write(nNormVectors()); + + for (const auto &nv : _norm_vect_idx) { + std::ignore = nv.serialize(buffer, false); + } + + return buffer.get(); +} + +inline NormalizationVectorIndex NormalizationVectorIndex::deserialize(filestream::FileStream &fs) { + NormalizationVectorIndex nvi{}; + const auto nNormVectors = static_cast(fs.read()); + for (std::size_t i = 0; i < nNormVectors; ++i) { + nvi.emplace_back(NormalizationVectorIndexBlock::deserialize(fs)); + } + return nvi; } + } // namespace hictk::hic::internal diff --git a/src/libhictk/hic/include/hictk/hic/impl/file_writer_impl.hpp b/src/libhictk/hic/include/hictk/hic/impl/file_writer_impl.hpp index bf439be7..e6e3a12f 100644 --- a/src/libhictk/hic/include/hictk/hic/impl/file_writer_impl.hpp +++ b/src/libhictk/hic/include/hictk/hic/impl/file_writer_impl.hpp @@ -131,6 +131,15 @@ inline auto MatrixBodyMetadataTank::operator()() const noexcept return _tank; } +inline HiCFileWriter::HiCFileWriter(std::string_view path_) + : _header(read_header(path_)), + _fs(_header.url, std::ios::in | std::ios::out), + _norm_vectors_section(_header.normVectorIndexPosition, + static_cast(_header.normVectorIndexLength)) { + // read_normalized_expected_values(); + read_norm_vectors(); +} + inline HiCFileWriter::HiCFileWriter(std::string_view path_, Reference chromosomes_, std::vector resolutions_, std::string_view assembly_, std::size_t n_threads, @@ -253,9 +262,8 @@ inline void HiCFileWriter::write_norm_vector_index() { static_cast(sizeof("HIC") + sizeof(_header.version) + sizeof(_header.footerPosition) + _header.genomeID.size() + 1); const auto normVectorIndexPosition = - conditional_static_cast(_expected_values_norm_section.start()); - const auto normVectorIndexLength = static_cast( - _expected_values_norm_section.size() + _norm_vectors_section.size()); + conditional_static_cast(_norm_vectors_section.start()); + const auto normVectorIndexLength = static_cast(_norm_vectors_section.size()); SPDLOG_DEBUG(FMT_STRING("writing normVectorIndex {}:{} at offset {}..."), normVectorIndexPosition, normVectorIndexLength, offset); @@ -492,10 +500,10 @@ inline void HiCFileWriter::write_footers() { for (auto &[chroms, footer] : _footers) { const auto offset = _matrix_metadata.offset(chroms.first, chroms.second); - footer.masterIndex.position = conditional_static_cast(offset.start()); - footer.masterIndex.size = static_cast(offset.size()); - SPDLOG_DEBUG(FMT_STRING("writing FooterV5 for {}:{} at offset {}"), chroms.first.name(), - chroms.second.name(), _fs.tellp()); + footer.position = conditional_static_cast(offset.start()); + footer.size = static_cast(offset.size()); + SPDLOG_DEBUG(FMT_STRING("writing FooterMasterIndex for {}:{} at offset {}"), + chroms.first.name(), chroms.second.name(), _fs.tellp()); _fs.write(footer.serialize(_bbuffer)); } @@ -509,10 +517,10 @@ inline void HiCFileWriter::add_footer(const Chromosome &chrom1, const Chromosome return; } - FooterV5 footer{}; - footer.masterIndex.key = fmt::format(FMT_STRING("{}_{}"), chrom1.id(), chrom2.id()); - footer.masterIndex.position = -1; - footer.masterIndex.size = -1; + FooterMasterIndex footer{}; + footer.key = fmt::format(FMT_STRING("{}_{}"), chrom1.id(), chrom2.id()); + footer.position = -1; + footer.size = -1; auto [it, inserted] = _footers.emplace(std::make_pair(chrom1, chrom2), footer); if (!inserted) { @@ -522,7 +530,6 @@ inline void HiCFileWriter::add_footer(const Chromosome &chrom1, const Chromosome inline void HiCFileWriter::write_empty_expected_values() { ExpectedValues ev{}; - ev.nExpectedValueVectors = 0; const auto offset = _fs.tellp(); SPDLOG_DEBUG(FMT_STRING("writing empty expected values section at offset {}..."), offset); @@ -543,20 +550,8 @@ inline void HiCFileWriter::write_empty_normalized_expected_values() { _expected_values_norm_section = {offset, _fs.tellp() - static_cast(offset)}; } -inline void HiCFileWriter::write_empty_norm_vectors() { - const auto offset = _expected_values_norm_section.end(); - SPDLOG_DEBUG(FMT_STRING("writing empty normalization vector section at offset {}..."), offset); - _fs.seekp(offset); - DISABLE_WARNING_PUSH - DISABLE_WARNING_USELESS_CAST - _fs.write(std::int32_t(0)); - DISABLE_WARNING_POP - _norm_vectors_section = {offset, _fs.tellp() - static_cast(offset)}; -} - inline void HiCFileWriter::compute_and_write_expected_values() { ExpectedValues ev{}; - ev.nExpectedValueVectors = static_cast(resolutions().size()); for (const auto &resolution : resolutions()) { SPDLOG_DEBUG(FMT_STRING("computing expected values at resolution {}..."), resolution); @@ -579,29 +574,127 @@ inline void HiCFileWriter::compute_and_write_expected_values() { scaling_factors.push_back(kv.second); }); - ev.expectedValues.emplace_back( + ev.emplace_back( ExpectedValuesBlock{"BP", resolution, aggr.weights(), chrom_ids, scaling_factors}); } - const auto offset = - _footer_section.end() - static_cast(sizeof(ev.nExpectedValueVectors)); + const auto offset = _footer_section.end() - + conditional_static_cast(sizeof(ev.nExpectedValueVectors())); _fs.seekp(offset); _fs.write(ev.serialize(_bbuffer)); _expected_values_section = {offset, _fs.tellp() - static_cast(offset)}; - _footer_section.size() += _expected_values_section.size() - sizeof(ev.nExpectedValueVectors); + _footer_section.size() += _expected_values_section.size() - sizeof(ev.nExpectedValueVectors()); +} + +inline void HiCFileWriter::add_norm_vector(const NormalizationVectorIndexBlock &blk, + const std::vector &weights) { + const auto &chrom = chromosomes().at(static_cast(blk.chrIdx)); + SPDLOG_INFO(FMT_STRING("adding {} normalization vector for {} at resolution {} ({}): {} values"), + blk.type, chrom.name(), blk.binSize, blk.unit, weights.size()); + + const auto bin_size = static_cast(blk.binSize); + const auto expected_shape = (chrom.size() + bin_size - 1) / bin_size; + + if (weights.size() != expected_shape) { + throw std::runtime_error( + fmt::format(FMT_STRING("weight shape mismatch: expected {} values, found {}"), + expected_shape, weights.size())); + } + + const auto [_, inserted] = _normalization_vectors.emplace(blk, weights); + if (!inserted) { + throw std::runtime_error(fmt::format( + FMT_STRING("file already contains {} normalization vector for {} at resolution {} ({})"), + blk.type, chrom.name(), blk.binSize, blk.unit)); + } +} + +inline void HiCFileWriter::add_norm_vector(std::string_view type, const Chromosome &chrom, + std::string_view unit, std::uint32_t bin_size, + const std::vector &weights, std::size_t position, + std::size_t n_bytes) { + add_norm_vector(NormalizationVectorIndexBlock{std::string{type}, chrom.id(), std::string{unit}, + bin_size, position, n_bytes}, + weights); +} + +inline void HiCFileWriter::add_norm_vector(const NormalizationVectorIndexBlock &blk, + const balancing::Weights &weights) { + std::vector weights_f(weights().size()); + if (weights.type() == balancing::Weights::Type::MULTIPLICATIVE) { + std::transform(weights().begin(), weights().end(), weights_f.begin(), + [](const double w) { return static_cast(1.0 / w); }); + } else { + std::transform(weights().begin(), weights().end(), weights_f.begin(), + [](const double w) { return static_cast(w); }); + } + add_norm_vector(blk, weights_f); +} + +inline void HiCFileWriter::add_norm_vector(std::string_view type, const Chromosome &chrom, + std::string_view unit, std::uint32_t bin_size, + const balancing::Weights &weights, std::size_t position, + std::size_t n_bytes) { + add_norm_vector(NormalizationVectorIndexBlock{std::string{type}, chrom.id(), std::string{unit}, + bin_size, position, n_bytes}, + weights); } inline void HiCFileWriter::finalize() { write_footer_offset(); write_footer_size(); write_empty_normalized_expected_values(); - write_empty_norm_vectors(); + write_norm_vectors(); write_norm_vector_index(); _fs.flush(); _fs.seekp(0, std::ios::end); } +inline void HiCFileWriter::write_norm_vectors() { + const auto offset1 = std::max(_expected_values_norm_section.end(), _norm_vectors_section.start()); + SPDLOG_DEBUG(FMT_STRING("writing {} normalization vectors at offset {}..."), + _normalization_vectors.size(), offset1); + _fs.seekp(offset1); + + const auto nNormVectors = static_cast(_normalization_vectors.size()); + _fs.write(nNormVectors); + + phmap::btree_map index_offsets{}; + for (const auto &[blk, _] : _normalization_vectors) { + const auto offset2 = _fs.tellp(); + _fs.write(blk.serialize(_bbuffer)); + index_offsets.emplace(blk, HiCSectionOffsets{offset2, _fs.tellp() - offset2}); + } + + phmap::btree_map vector_offsets{}; + for (const auto &[blk, weights] : _normalization_vectors) { + const auto offset2 = _fs.tellp(); + const auto nValues = static_cast(weights.size()); + _fs.write(nValues); + _fs.write(weights); + vector_offsets.emplace(blk, HiCSectionOffsets{offset2, _fs.tellp() - offset2}); + } + const auto offset3 = _fs.tellp(); + + for (const auto &[blk, idx_offsets] : index_offsets) { + const auto &vect_offsets = vector_offsets.at(blk); + auto new_blk = blk; + new_blk.position = vect_offsets.start(); + new_blk.nBytes = static_cast(vect_offsets.size()); + _fs.seekp(idx_offsets.start()); + _fs.write(new_blk.serialize(_bbuffer)); + } + + _fs.flush(); + + _norm_vectors_section = {offset1, offset3 - static_cast(offset1)}; +} + +inline HiCHeader HiCFileWriter::read_header(std::string_view path) { + return HiCFileReader(std::string{path}).header(); +} + inline HiCHeader HiCFileWriter::init_header(std::string_view path, Reference chromosomes, std::vector resolutions, std::string_view assembly) { @@ -792,6 +885,49 @@ inline std::size_t HiCFileWriter::compute_num_bins(const Chromosome &chrom1, return HiCInteractionToBlockMapper::compute_num_bins(chrom1, chrom2, resolution); } +inline void HiCFileWriter::read_normalized_expected_values() { + const auto offset = _header.normVectorIndexPosition; + _fs.seekg(offset); + const auto nev = NormalizedExpectedValues::deserialize(_fs); + + _expected_values_norm_section = {_header.normVectorIndexPosition, + _fs.tellg() - static_cast(offset)}; + + std::copy(nev.normExpectedValues().begin(), nev.normExpectedValues().end(), + std::inserter(_normalized_expected_values, _normalized_expected_values.begin())); +} + +inline void HiCFileWriter::read_norm_vectors() { + assert(_norm_vectors_section.start() != 0); + const auto offset = _norm_vectors_section.start(); + _fs.seekg(offset); + const auto nvi = NormalizationVectorIndex::deserialize(_fs); + + for (const auto &blk : nvi.normalizationVectorIndex()) { + add_norm_vector(blk, read_norm_vector(blk)); + } +} + +inline std::vector HiCFileWriter::read_norm_vector( + const NormalizationVectorIndexBlock &blk) { + const auto offset = blk.position; + _fs.seekg(offset); + + // https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md#normalization-vector-arrays-1-per-normalization-vector + const auto nValues = static_cast(_fs.read()); + std::vector buffer(nValues); + _fs.read(buffer); + const auto bytes_read = _fs.tellg() - static_cast(offset); + if (bytes_read != static_cast(blk.nBytes)) { + throw std::runtime_error( + fmt::format(FMT_STRING("{} normalization vector for {} at {} resolution is corrupted: " + "expected to read {} bytes but read {}"), + blk.type, _header.chromosomes.at(static_cast(blk.chrIdx)).name(), + blk.binSize, blk.nBytes, bytes_read)); + } + return buffer; +} + inline std::size_t HiCFileWriter::compute_block_column_count(const Chromosome &chrom1, const Chromosome &chrom2, std::uint32_t resolution) { diff --git a/src/libhictk/hic/include/hictk/hic/impl/filestream_impl.hpp b/src/libhictk/hic/include/hictk/hic/impl/filestream_impl.hpp index 0ec3238e..ab21c32d 100644 --- a/src/libhictk/hic/include/hictk/hic/impl/filestream_impl.hpp +++ b/src/libhictk/hic/include/hictk/hic/impl/filestream_impl.hpp @@ -23,9 +23,12 @@ namespace hictk::hic::internal::filestream { -inline FileStream::FileStream(std::string path) +inline FileStream::FileStream(std::string path, std::ios::openmode mode) : _path(std::move(path)), - _ifs(open_file_read(_path, std::ios::binary | std::ios::ate)), + _ifs(open_file_read(_path, std::ios::in | std::ios::binary | std::ios::ate)), + _ofs(mode & std::ios::out + ? open_file_write(_path, std::ios::in | std::ios::out | std::ios::binary) + : std::ofstream{}), _file_size(static_cast(_ifs.tellg())) { _ifs.seekg(0, std::ios::beg); } diff --git a/src/libhictk/hic/include/hictk/hic/impl/hic_file_impl.hpp b/src/libhictk/hic/include/hictk/hic/impl/hic_file_impl.hpp index 164ad6e1..d648e60c 100644 --- a/src/libhictk/hic/include/hictk/hic/impl/hic_file_impl.hpp +++ b/src/libhictk/hic/include/hictk/hic/impl/hic_file_impl.hpp @@ -241,6 +241,66 @@ inline PixelSelector File::fetch(std::uint64_t first_bin1, std::uint64_t last_bi std::move(norm)); } +inline balancing::Weights File::normalization(balancing::Method norm, + const Chromosome& chrom) const { + std::vector weights_{}; + const auto expected_length = (chrom.size() + bins().bin_size() - 1) / bins().bin_size(); + try { + auto weights = fetch(chrom.name(), norm).weights1(); + if (!!weights && weights().size() != expected_length) { + throw std::runtime_error( + fmt::format(FMT_STRING("{} normalization vector for {} appears to be corrupted: " + "expected {} values, found {}"), + norm, chrom.name(), expected_length, weights().size())); + } + weights_ = weights(); + } catch (const std::exception& e) { + const std::string_view msg{e.what()}; + + const auto missing_interactions = + msg.find("unable to read file offset") != std::string_view::npos; + + const auto missing_norm_vect = + msg.find(fmt::format(FMT_STRING("unable to find {} normalization vector"), norm)) != + std::string_view::npos; + + if (!missing_interactions && !missing_norm_vect) { + throw; + } + } + + if (weights_.empty()) { + weights_.resize(expected_length, std::numeric_limits::quiet_NaN()); + } + + return {weights_, balancing::Weights::Type::DIVISIVE}; +} + +inline balancing::Weights File::normalization(std::string_view norm, + const Chromosome& chrom) const { + return normalization(balancing::Method{norm}, chrom); +} + +inline balancing::Weights File::normalization(balancing::Method norm) const { + std::vector weights{}; + weights.reserve(bins().size()); + for (const auto& chrom : chromosomes()) { + if (chrom.is_all()) { + continue; + } + + const auto chrom_weights = normalization(norm, chrom); + weights.insert(weights.end(), chrom_weights().begin(), chrom_weights().end()); + } + + assert(weights.size() == bins().size()); + return {weights, balancing::Weights::Type::DIVISIVE}; +} + +inline balancing::Weights File::normalization(std::string_view norm) const { + return normalization(balancing::Method{norm}); +} + inline std::size_t File::num_cached_footers() const noexcept { return _footers.size(); } inline void File::purge_footer_cache() { _footers.clear(); } diff --git a/test/units/hic/file_writer_test.cpp b/test/units/hic/file_writer_test.cpp index 23de0e81..4cba153c 100644 --- a/test/units/hic/file_writer_test.cpp +++ b/test/units/hic/file_writer_test.cpp @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -87,35 +88,81 @@ TEST_CASE("HiC: HiCInteractionToBlockMapper", "[hic][v9][short]") { CHECK(num_interactions == pixels1.size() + pixels2.size()); } -TEST_CASE("HiC: HiCFileWriter", "[hic][v9][medium]") { +TEST_CASE("HiC: HiCFileWriter", "[hic][v9][short]") { const auto path1 = (datadir / "4DNFIZ1ZVXC8.hic9").string(); - const auto path2 = (testdir() / "hic_writer.hic").string(); + const auto path2 = (testdir() / "hic_writer_001.hic").string(); + const auto path3 = (testdir() / "hic_writer_002.hic").string(); const std::vector resolutions{500'000, 1'000'000, 2'500'000}; - { - const auto chromosomes = hic::File(path1, resolutions.front()).chromosomes(); - HiCFileWriter w(path2, chromosomes, resolutions, "dm6", 16); - for (std::size_t i = 0; i < resolutions.size(); ++i) { - if (i % 2 == 0) { - const auto resolution = resolutions[i]; - const hic::File f((datadir / "4DNFIZ1ZVXC8.hic9").string(), resolution); - const auto sel = f.fetch(); - w.add_pixels(resolution, sel.begin(), sel.end()); + SECTION("create file") { + { + const auto chromosomes = hic::File(path1, resolutions.front()).chromosomes(); + HiCFileWriter w(path2, chromosomes, resolutions, "dm6", 3); + for (std::size_t i = 0; i < resolutions.size(); ++i) { + if (i % 2 == 0) { + const auto resolution = resolutions[i]; + const hic::File f((datadir / "4DNFIZ1ZVXC8.hic9").string(), resolution); + const auto sel = f.fetch(); + w.add_pixels(resolution, sel.begin(), sel.end()); + } + } + w.serialize(); + } + + for (const auto& resolution : resolutions) { + fmt::print(FMT_STRING("Comparing {}...\n"), resolution); + const hic::File f1(path1, resolution); + const hic::File f2(path2, resolution); + const auto expected_pixels = f1.fetch().read_all(); + const auto pixels = f2.fetch().read_all(); + + REQUIRE(expected_pixels.size() == pixels.size()); + for (std::size_t i = 0; i < pixels.size(); ++i) { + CHECK(expected_pixels[i] == pixels[i]); } } - w.serialize(); } - for (const auto& resolution : resolutions) { - fmt::print(FMT_STRING("Comparing {}...\n"), resolution); - const hic::File f1(path1, resolution); - const hic::File f2(path2, resolution); - const auto expected_pixels = f1.fetch().read_all(); - const auto pixels = f2.fetch().read_all(); + SECTION("add weights") { + const std::uint32_t resolution = 500'000; + const hic::File hf1(path1, resolution); - REQUIRE(expected_pixels.size() == pixels.size()); - for (std::size_t i = 0; i < pixels.size(); ++i) { - CHECK(expected_pixels[i] == pixels[i]); + { + // init file + HiCFileWriter w(path3, hf1.chromosomes(), {hf1.resolution()}, "dm6"); + const auto sel = hf1.fetch(); + w.add_pixels(resolution, sel.begin(), sel.end()); + w.serialize(); + } + + // add normalization weights + { + HiCFileWriter w(path3); + for (const auto& chrom : w.chromosomes()) { + if (chrom.is_all()) { + continue; + } + w.add_norm_vector("SCALE", chrom, "BP", hf1.resolution(), + hf1.normalization("SCALE", chrom)); + } + w.write_norm_vectors(); + CHECK_THROWS(w.add_norm_vector("VC", w.chromosomes().at("chr2L"), "BP", hf1.resolution(), + std::vector{1, 2, 3})); + } + + // compare + const hic::File hf2(path3, resolution); + const auto pixels1 = hf1.fetch(balancing::Method::SCALE()).read_all(); + const auto pixels2 = hf2.fetch(balancing::Method::SCALE()).read_all(); + + REQUIRE(pixels1.size() == pixels2.size()); + for (std::size_t i = 0; i < pixels1.size(); ++i) { + CHECK(pixels1[i].coords == pixels2[i].coords); + if (std::isnan(pixels1[i].count)) { + CHECK(std::isnan(pixels2[i].count)); + } else { + CHECK_THAT(pixels1[i].count, Catch::Matchers::WithinRel(pixels2[i].count)); + } } } }