diff --git a/include/pisa/block_inverted_index.hpp b/include/pisa/block_inverted_index.hpp index ba2e3e26..37ff503c 100644 --- a/include/pisa/block_inverted_index.hpp +++ b/include/pisa/block_inverted_index.hpp @@ -11,6 +11,7 @@ #include "mappable/mappable_vector.hpp" #include "memory_source.hpp" #include "temporary_directory.hpp" +#include "util/block_profiler.hpp" namespace pisa { @@ -24,7 +25,12 @@ namespace index::block { */ class BlockInvertedIndexCursor { public: - BlockInvertedIndexCursor(BlockCodec const* block_codec, std::uint8_t const* data, std::uint64_t universe) + BlockInvertedIndexCursor( + BlockCodec const* block_codec, + std::uint8_t const* data, + std::uint64_t universe, + std::optional profile_term + ) : m_base(TightVariableByte::decode(data, &m_n, 1)), m_blocks(ceil_div(m_n, block_codec->block_size())), m_block_maxs(m_base), @@ -38,6 +44,10 @@ class BlockInvertedIndexCursor { && concepts::SortedPostingCursor) ); + if (profile_term.has_value()) { + m_profiler = block_profiler::open_list(*profile_term, m_blocks); + } + m_docs_buf.resize(m_block_size); m_freqs_buf.resize(m_block_size); reset(); @@ -218,6 +228,10 @@ class BlockInvertedIndexCursor { m_pos_in_block = 0; m_cur_docid = m_docs_buf[0]; m_freqs_decoded = false; + + if (m_profiler != nullptr) { + ++m_profiler[2 * m_cur_block]; + } } void PISA_NOINLINE decode_freqs_block() { @@ -226,6 +240,10 @@ class BlockInvertedIndexCursor { ); intrinsics::prefetch(next_block); m_freqs_decoded = true; + + if (m_profiler != nullptr) { + ++m_profiler[2 * m_cur_block + 1]; + } } uint32_t m_n{0}; @@ -249,6 +267,7 @@ class BlockInvertedIndexCursor { std::vector m_freqs_buf; BlockCodec const* m_block_codec; std::size_t m_block_size; + block_profiler::counter_type* m_profiler = nullptr; }; class BlockInvertedIndex { @@ -259,6 +278,7 @@ class BlockInvertedIndex { mapper::mappable_vector m_lists; MemorySource m_source; std::unique_ptr m_block_codec; + bool m_profile; void check_term_range(std::size_t term_id) const; @@ -268,7 +288,7 @@ class BlockInvertedIndex { public: using document_enumerator = BlockInvertedIndexCursor; - explicit BlockInvertedIndex(MemorySource source, std::unique_ptr block_codec); + BlockInvertedIndex(MemorySource source, std::unique_ptr block_codec, bool profile = false); template void map(Visitor& visit) { diff --git a/include/pisa/codec/block_codec_registry.hpp b/include/pisa/codec/block_codec_registry.hpp new file mode 100644 index 00000000..a3b454ea --- /dev/null +++ b/include/pisa/codec/block_codec_registry.hpp @@ -0,0 +1,15 @@ +#pragma once + +#include +#include + +#include + +#include "codec/block_codec.hpp" + +namespace pisa { + +[[nodiscard]] auto get_block_codec(std::string_view name) -> std::unique_ptr; +[[nodiscard]] auto get_block_codec_names() -> gsl::span; + +} // namespace pisa diff --git a/include/pisa/codec/interpolative.hpp b/include/pisa/codec/interpolative.hpp index b4f6ff10..c59002df 100644 --- a/include/pisa/codec/interpolative.hpp +++ b/include/pisa/codec/interpolative.hpp @@ -1,5 +1,6 @@ #pragma once +#include #include #include "codec/block_codec.hpp" @@ -10,6 +11,8 @@ class InterpolativeBlockCodec: public BlockCodec { static constexpr std::uint64_t m_block_size = 128; public: + constexpr static std::string_view name = "block_interpolative"; + void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out) const; uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const; auto block_size() const noexcept -> std::size_t { return m_block_size; } diff --git a/include/pisa/codec/maskedvbyte.hpp b/include/pisa/codec/maskedvbyte.hpp index 13adf3a8..86f0686a 100644 --- a/include/pisa/codec/maskedvbyte.hpp +++ b/include/pisa/codec/maskedvbyte.hpp @@ -37,6 +37,8 @@ class MaskedVByteBlockCodec: public BlockCodec { static constexpr std::uint64_t m_overflow = 512; public: + constexpr static std::string_view name = "block_maskedvbyte"; + void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out) const; uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const; auto block_size() const noexcept -> std::size_t { return m_block_size; } diff --git a/include/pisa/codec/optpfor.hpp b/include/pisa/codec/optpfor.hpp index 7d209539..f534ed5d 100644 --- a/include/pisa/codec/optpfor.hpp +++ b/include/pisa/codec/optpfor.hpp @@ -46,6 +46,8 @@ class OptPForBlockCodec: public BlockCodec { static const uint64_t m_block_size = Codec::BlockSize; public: + constexpr static std::string_view name = "block_optpfor"; + void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out) const; uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const; auto block_size() const noexcept -> std::size_t { return m_block_size; } diff --git a/include/pisa/codec/qmx.hpp b/include/pisa/codec/qmx.hpp index 393a7ff9..f6c9d412 100644 --- a/include/pisa/codec/qmx.hpp +++ b/include/pisa/codec/qmx.hpp @@ -54,6 +54,8 @@ class QmxBlockCodec: public BlockCodec { static constexpr std::uint64_t m_overflow = 512; public: + constexpr static std::string_view name = "block_qmx"; + void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out) const; uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const; auto block_size() const noexcept -> std::size_t { return m_block_size; } diff --git a/include/pisa/codec/simdbp.hpp b/include/pisa/codec/simdbp.hpp index 1c35b854..468f2bbf 100644 --- a/include/pisa/codec/simdbp.hpp +++ b/include/pisa/codec/simdbp.hpp @@ -41,6 +41,8 @@ class SimdBpBlockCodec: public BlockCodec { static constexpr std::uint64_t m_block_size = 128; public: + constexpr static std::string_view name = "block_simdbp"; + void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out) const; uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const; auto block_size() const noexcept -> std::size_t { return m_block_size; } diff --git a/include/pisa/codec/simple16.hpp b/include/pisa/codec/simple16.hpp index 3d73537a..f145a0f2 100644 --- a/include/pisa/codec/simple16.hpp +++ b/include/pisa/codec/simple16.hpp @@ -40,6 +40,8 @@ class Simple16BlockCodec: public BlockCodec { static constexpr std::uint64_t m_block_size = 128; public: + constexpr static std::string_view name = "block_simple16"; + void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out) const; uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const; auto block_size() const noexcept -> std::size_t { return m_block_size; } diff --git a/include/pisa/codec/simple8b.hpp b/include/pisa/codec/simple8b.hpp index 859a1da4..17dc1deb 100644 --- a/include/pisa/codec/simple8b.hpp +++ b/include/pisa/codec/simple8b.hpp @@ -35,6 +35,8 @@ class Simple8bBlockCodec: public BlockCodec { static constexpr std::uint64_t m_block_size = 128; public: + constexpr static std::string_view name = "block_simple8b"; + void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out) const; uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const; auto block_size() const noexcept -> std::size_t { return m_block_size; } diff --git a/include/pisa/codec/streamvbyte.hpp b/include/pisa/codec/streamvbyte.hpp index 7fd1b0b5..e9c4817b 100644 --- a/include/pisa/codec/streamvbyte.hpp +++ b/include/pisa/codec/streamvbyte.hpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include "codec/block_codec.hpp" @@ -43,6 +44,8 @@ class StreamVByteBlockCodec: public BlockCodec { pisa::streamvbyte_max_compressedbytes(m_block_size); public: + constexpr static std::string_view name = "block_streamvbyte"; + void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out) const; uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const; auto block_size() const noexcept -> std::size_t { return m_block_size; } diff --git a/include/pisa/codec/varint_g8iu.hpp b/include/pisa/codec/varint_g8iu.hpp index acc62732..f2e48b24 100644 --- a/include/pisa/codec/varint_g8iu.hpp +++ b/include/pisa/codec/varint_g8iu.hpp @@ -1,5 +1,6 @@ #pragma once +#include #include #include "codec/block_codec.hpp" @@ -10,6 +11,8 @@ class VarintG8IUBlockCodec: public BlockCodec { static const uint64_t m_block_size = 128; public: + constexpr static std::string_view name = "block_varintg8iu"; + void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out) const; uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const; auto block_size() const noexcept -> std::size_t { return m_block_size; } diff --git a/include/pisa/codec/varintgb.hpp b/include/pisa/codec/varintgb.hpp index 84b370ad..b501821b 100644 --- a/include/pisa/codec/varintgb.hpp +++ b/include/pisa/codec/varintgb.hpp @@ -261,6 +261,8 @@ class VarintGbBlockCodec: public BlockCodec { static constexpr std::uint64_t m_block_size = 128; public: + constexpr static std::string_view name = "block_varintgb"; + void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out) const; uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const; auto block_size() const noexcept -> std::size_t { return m_block_size; } diff --git a/src/block_inverted_index.cpp b/src/block_inverted_index.cpp index 0bf21117..7bbcfdd6 100644 --- a/src/block_inverted_index.cpp +++ b/src/block_inverted_index.cpp @@ -4,8 +4,10 @@ namespace pisa { -BlockInvertedIndex::BlockInvertedIndex(MemorySource source, std::unique_ptr block_codec) - : m_source(std::move(source)), m_block_codec(std::move(block_codec)) { +BlockInvertedIndex::BlockInvertedIndex( + MemorySource source, std::unique_ptr block_codec, bool profile +) + : m_source(std::move(source)), m_block_codec(std::move(block_codec)), m_profile(profile) { PISA_ASSERT_CONCEPT((concepts::SortedInvertedIndex)); mapper::map(*this, m_source.data(), mapper::map_flags::warmup); } @@ -14,7 +16,13 @@ auto BlockInvertedIndex::operator[](std::size_t term_id) const -> BlockInvertedI check_term_range(term_id); compact_elias_fano::enumerator endpoints(m_endpoints, 0, m_lists.size(), m_size, m_params); auto endpoint = endpoints.move(term_id).second; - return BlockInvertedIndexCursor(m_block_codec.get(), m_lists.data() + endpoint, num_docs()); + std::optional profile_term = std::nullopt; + if (m_profile) { + profile_term = term_id; + } + return BlockInvertedIndexCursor( + m_block_codec.get(), m_lists.data() + endpoint, num_docs(), profile_term + ); } void BlockInvertedIndex::check_term_range(std::size_t term_id) const { diff --git a/src/codec/block_codec_registry.cpp b/src/codec/block_codec_registry.cpp new file mode 100644 index 00000000..2f4b8f12 --- /dev/null +++ b/src/codec/block_codec_registry.cpp @@ -0,0 +1,69 @@ +#include "codec/block_codec_registry.hpp" + +#include +#include +#include +#include + +#include +#include + +#include "codec/block_codec.hpp" +#include "codec/interpolative.hpp" +#include "codec/maskedvbyte.hpp" +#include "codec/optpfor.hpp" +#include "codec/qmx.hpp" +#include "codec/simdbp.hpp" +#include "codec/simple16.hpp" +#include "codec/simple8b.hpp" +#include "codec/streamvbyte.hpp" +#include "codec/varint_g8iu.hpp" +#include "codec/varintgb.hpp" + +namespace pisa { + +template +struct BlockCodecRegistry { + using BlockCodecConstructor = std::unique_ptr (*)(); + + constexpr static std::array names = + std::array{C::name...}; + + constexpr static std::array constructors = + std::array{[]() -> std::unique_ptr { + return std::make_unique(); + }...}; + + constexpr static auto count() -> std::size_t { return sizeof...(C); } + + static auto get(std::string_view name) -> std::unique_ptr { + auto pos = std::find(names.begin(), names.end(), name); + if (pos == names.end()) { + throw std::domain_error(fmt::format("invalid codec: {}", name)); + } + auto constructor = constructors[std::distance(names.begin(), pos)]; + return constructor(); + } +}; + +using BlockCodecs = BlockCodecRegistry< + InterpolativeBlockCodec, + MaskedVByteBlockCodec, + OptPForBlockCodec, + QmxBlockCodec, + SimdBpBlockCodec, + Simple16BlockCodec, + Simple8bBlockCodec, + StreamVByteBlockCodec, + VarintG8IUBlockCodec, + VarintGbBlockCodec>; + +auto get_block_codec(std::string_view name) -> std::unique_ptr { + return BlockCodecs::get(name); +} + +auto get_block_codec_names() -> gsl::span { + return gsl::make_span(&BlockCodecs::names[0], BlockCodecs::count()); +} + +} // namespace pisa diff --git a/tools/queries_dynamic.cpp b/tools/queries_dynamic.cpp index 9f335033..6055aa14 100644 --- a/tools/queries_dynamic.cpp +++ b/tools/queries_dynamic.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -16,17 +17,7 @@ #include "accumulator/simple_accumulator.hpp" #include "app.hpp" #include "block_inverted_index.hpp" -#include "codec/block_codec.hpp" -#include "codec/interpolative.hpp" -#include "codec/maskedvbyte.hpp" -#include "codec/optpfor.hpp" -#include "codec/qmx.hpp" -#include "codec/simdbp.hpp" -#include "codec/simple16.hpp" -#include "codec/simple8b.hpp" -#include "codec/streamvbyte.hpp" -#include "codec/varint_g8iu.hpp" -#include "codec/varintgb.hpp" +#include "codec/block_codec_registry.hpp" #include "cursor/block_max_scored_cursor.hpp" #include "cursor/cursor.hpp" #include "cursor/max_scored_cursor.hpp" @@ -319,44 +310,11 @@ using wand_raw_index = wand_data; using wand_uniform_index = wand_data>; using wand_uniform_index_quantized = wand_data>; -auto resolve_codec(std::string_view encoding) -> std::unique_ptr { - if (encoding == "block_interpolative") { - return std::make_unique(); - } - if (encoding == "block_maskedvbyte") { - return std::make_unique(); - } - if (encoding == "block_optpfor") { - return std::make_unique(); - } - if (encoding == "block_qmx") { - return std::make_unique(); - } - if (encoding == "block_simdbp") { - return std::make_unique(); - } - if (encoding == "block_simple16") { - return std::make_unique(); - } - if (encoding == "block_simple8b") { - return std::make_unique(); - } - if (encoding == "block_streamvbyte") { - return std::make_unique(); - } - if (encoding == "block_varintg8iu") { - return std::make_unique(); - } - if (encoding == "block_varintgb") { - return std::make_unique(); - } - throw std::domain_error("invalid encoding type"); -} - int main(int argc, const char** argv) { bool extract = false; bool safe = false; bool quantized = false; + bool list_encodings = false; App, @@ -370,6 +328,10 @@ int main(int argc, const char** argv) { app.add_flag("--extract", extract, "Extract individual query times"); app.add_flag("--safe", safe, "Rerun if not enough results with pruning.") ->needs(app.thresholds_option()); + + // TODO: only block ones for now + app.add_flag("--list-encodings", list_encodings, "List all available encodings."); + CLI11_PARSE(app, argc, argv); spdlog::set_default_logger(spdlog::stderr_color_mt("stderr")); @@ -378,8 +340,15 @@ int main(int argc, const char** argv) { std::cout << "qid\tusec\n"; } + if (list_encodings) { + for (auto encoding: get_block_codec_names()) { + std::cout << encoding << '\n'; + } + return 0; + } + BlockInvertedIndex index( - MemorySource::mapped_file(app.index_filename()), resolve_codec(app.index_encoding()) + MemorySource::mapped_file(app.index_filename()), get_block_codec(app.index_encoding()) ); auto params = std::make_tuple(