diff --git a/include/pisa/block_inverted_index.hpp b/include/pisa/block_inverted_index.hpp index 775d273d..ba2e3e26 100644 --- a/include/pisa/block_inverted_index.hpp +++ b/include/pisa/block_inverted_index.hpp @@ -1,18 +1,27 @@ #pragma once +#include + #include "bit_vector.hpp" #include "codec/block_codec.hpp" #include "codec/block_codecs.hpp" -#include "codec/compact_elias_fano.hpp" #include "concepts.hpp" #include "concepts/inverted_index.hpp" #include "global_parameters.hpp" #include "mappable/mappable_vector.hpp" -#include "mappable/mapper.hpp" #include "memory_source.hpp" +#include "temporary_directory.hpp" namespace pisa { +namespace index::block { + class InMemoryBuilder; + class StreamBuilder; +} // namespace index::block + +/** + * Cursor for a block-encoded posting list. + */ class BlockInvertedIndexCursor { public: BlockInvertedIndexCursor(BlockCodec const* block_codec, std::uint8_t const* data, std::uint64_t universe) @@ -243,7 +252,6 @@ class BlockInvertedIndexCursor { }; class BlockInvertedIndex { - private: global_parameters m_params; std::size_t m_size{0}; std::size_t m_num_docs{0}; @@ -252,16 +260,15 @@ class BlockInvertedIndex { MemorySource m_source; std::unique_ptr m_block_codec; + void check_term_range(std::size_t term_id) const; + + friend class index::block::InMemoryBuilder; + friend class index::block::StreamBuilder; + public: using document_enumerator = BlockInvertedIndexCursor; - explicit BlockInvertedIndex(MemorySource source, std::unique_ptr block_codec) - : m_source(std::move(source)), m_block_codec(std::move(block_codec)) { - PISA_ASSERT_CONCEPT( - (concepts::SortedInvertedIndex) - ); - mapper::map(*this, m_source.data(), mapper::map_flags::warmup); - } + explicit BlockInvertedIndex(MemorySource source, std::unique_ptr block_codec); template void map(Visitor& visit) { @@ -269,39 +276,289 @@ class BlockInvertedIndex { m_endpoints, "m_endpoints")(m_lists, "m_lists"); } - [[nodiscard]] auto operator[](std::size_t term_id) const -> BlockInvertedIndexCursor { - // check_term_range(term_id); - compact_elias_fano::enumerator endpoints(m_endpoints, 0, m_lists.size(), m_size, m_params); - auto endpoint = endpoints.move(term_id).second; - return BlockInvertedIndexCursor(m_block_codec.get(), m_lists.data() + endpoint, num_docs()); - } + [[nodiscard]] auto operator[](std::size_t term_id) const -> BlockInvertedIndexCursor; /** - * \returns The size of the index, i.e., the number of terms (posting lists). + * The size of the index, i.e., the number of terms (posting lists). */ - [[nodiscard]] std::size_t size() const noexcept { return m_size; } + [[nodiscard]] auto size() const noexcept -> std::size_t { return m_size; } /** - * \returns The number of distinct documents in the index. + * The number of distinct documents in the index. */ - [[nodiscard]] std::uint64_t num_docs() const noexcept { return m_num_docs; } + [[nodiscard]] auto num_docs() const noexcept -> std::uint64_t { return m_num_docs; } - void warmup(std::size_t term_id) const { - // check_term_range(term_id); - compact_elias_fano::enumerator endpoints(m_endpoints, 0, m_lists.size(), m_size, m_params); + void warmup(std::size_t term_id) const; +}; - auto begin = endpoints.move(term_id).second; - auto end = m_lists.size(); - if (term_id + 1 != size()) { - end = endpoints.move(term_id + 1).second; +namespace index::block { + + class BlockPostingWriter { + BlockCodec const* m_block_codec; + + public: + explicit BlockPostingWriter(BlockCodec const* block_codec) : m_block_codec(block_codec) {} + + template + void write( + std::vector& out, uint32_t n, DocsIterator docs_begin, FreqsIterator freqs_begin + ) { + TightVariableByte::encode_single(n, out); + + uint64_t block_size = m_block_codec->block_size(); + uint64_t blocks = ceil_div(n, block_size); + size_t begin_block_maxs = out.size(); + size_t begin_block_endpoints = begin_block_maxs + 4 * blocks; + size_t begin_blocks = begin_block_endpoints + 4 * (blocks - 1); + out.resize(begin_blocks); + + DocsIterator docs_it(docs_begin); + FreqsIterator freqs_it(freqs_begin); + std::vector docs_buf(block_size); + std::vector freqs_buf(block_size); + int32_t last_doc(-1); + uint32_t block_base = 0; + for (size_t b = 0; b < blocks; ++b) { + uint32_t cur_block_size = ((b + 1) * block_size <= n) ? block_size : (n % block_size); + + for (size_t i = 0; i < cur_block_size; ++i) { + uint32_t doc(*docs_it++); + docs_buf[i] = doc - last_doc - 1; + last_doc = doc; + + freqs_buf[i] = *freqs_it++ - 1; + } + *((uint32_t*)&out[begin_block_maxs + 4 * b]) = last_doc; + + m_block_codec->encode( + docs_buf.data(), last_doc - block_base - (cur_block_size - 1), cur_block_size, out + ); + m_block_codec->encode(freqs_buf.data(), uint32_t(-1), cur_block_size, out); + if (b != blocks - 1) { + *((uint32_t*)&out[begin_block_endpoints + 4 * b]) = out.size() - begin_blocks; + } + block_base = last_doc + 1; + } } + }; - volatile std::uint32_t tmp; - for (std::size_t i = begin; i != end; ++i) { - tmp = m_lists[i]; + template + void write_blocks(std::vector& out, uint32_t n, BlockDataRange const& input_blocks) { + TightVariableByte::encode_single(n, out); + assert(input_blocks.front().index == 0); // first block must remain first + + uint64_t blocks = input_blocks.size(); + size_t begin_block_maxs = out.size(); + size_t begin_block_endpoints = begin_block_maxs + 4 * blocks; + size_t begin_blocks = begin_block_endpoints + 4 * (blocks - 1); + out.resize(begin_blocks); + + for (auto const& block: input_blocks) { + size_t b = block.index; + // write endpoint + if (b != 0) { + *((uint32_t*)&out[begin_block_endpoints + 4 * (b - 1)]) = out.size() - begin_blocks; + } + + // write max + *((uint32_t*)&out[begin_block_maxs + 4 * b]) = block.max; + + // copy block + block.append_docs_block(out); + block.append_freqs_block(out); } - (void)tmp; } -}; + + /** + * In-memory block index builder. + * + * Builds the index in memory, which is eventually flushed to disk at the very end. + */ + class InMemoryBuilder { + global_parameters m_params; + std::size_t m_num_docs; + BlockPostingWriter m_posting_writer; + std::vector m_endpoints{}; + std::vector m_lists{}; + + public: + /** + * Constructs a builder for an index containing the given number of documents. + */ + InMemoryBuilder( + std::uint64_t num_docs, global_parameters const& params, BlockPostingWriter posting_writer + ); + + /** + * Records a new posting list. + * + * Postings are written to an in-memory buffer. + * + * \param n Posting list length. + * \param docs_begin Iterator that points at the first document ID. + * \param freqs_begin Iterator that points at the first frequency. + * \param occurrences Not used in this builder. + * + * \throws std::invalid_argument Thrown if `n == 0`. + */ + template + void + add_posting_list(std::uint64_t n, DocsIterator docs_begin, FreqsIterator freqs_begin, std::uint64_t /* occurrences */) { + if (!n) { + throw std::invalid_argument("List must be nonempty"); + } + m_posting_writer.write(m_lists, n, docs_begin, freqs_begin); + m_endpoints.push_back(m_lists.size()); + } + + /** + * Adds multiple posting list blocks. + * + * \tparam BlockDataRange This intended to be a vector of structs of the type + * block_posting_list::document_enumerator::block_data + * + * \param n Posting list length. + * \param blocks Encoded blocks. + * + * \throws std::invalid_argument Thrown if `n == 0`. + */ + template + void add_posting_list(std::uint64_t n, BlockDataRange const& blocks) { + if (!n) { + throw std::invalid_argument("List must be nonempty"); + } + write_blocks(m_lists, n, blocks); + m_endpoints.push_back(m_lists.size()); + } + + /** + * Adds a posting list that is already fully encoded. + * + * \tparam BytesRange A collection of bytes, e.g., std::vector + * + * \param data Encoded data. + */ + template + void add_posting_list(BytesRange const& data) { + m_lists.insert(m_lists.end(), std::begin(data), std::end(data)); + m_endpoints.push_back(m_lists.size()); + } + + /** + * Builds an index. + * + * \param sq Inverted index object that will take ownership of the data. + */ + void build(BlockInvertedIndex& sq); + }; + + /** + * Stream block index builder. + * + * Buffers postings on disk in order to support building indexes larger than memory. + */ + class StreamBuilder { + global_parameters m_params{}; + std::size_t m_num_docs = 0; + std::vector m_endpoints{}; + TemporaryDirectory tmp{}; + std::ofstream m_postings_output; + std::size_t m_postings_bytes_written{0}; + std::unique_ptr m_block_codec; + BlockPostingWriter m_posting_writer; + + public: + /** + * Constructs a builder for an index containing the given number of documents. + * + * This constructor opens a temporary file to write. This file is used for + * buffering postings. This buffer is flushed to the right destination when the + * `build` member function is called. + * + * \throws std::ios_base::failure Thrown if the the temporary buffer file cannot be opened. + */ + StreamBuilder( + std::uint64_t num_docs, global_parameters const& params, BlockPostingWriter posting_writer + ); + + /** + * Records a new posting list. + * + * Postings are written to the temporary file, while some other data is accumulated + * within the builder struct. + * + * \param n Posting list length. + * \param docs_begin Iterator that points at the first document ID. + * \param freqs_begin Iterator that points at the first frequency. + * \param occurrences Not used in this builder. + * + * \throws std::invalid_argument Thrown if `n == 0`. + * \throws std::ios_base::failure Thrown if failed to write to the temporary file buffer. + */ + template + void + add_posting_list(std::uint64_t n, DocsIterator docs_begin, FreqsIterator freqs_begin, std::uint64_t /* occurrences */) { + if (!n) { + throw std::invalid_argument("List must be nonempty"); + } + std::vector buf; + m_posting_writer.write(buf, n, docs_begin, freqs_begin); + m_postings_bytes_written += buf.size(); + m_postings_output.write(reinterpret_cast(buf.data()), buf.size()); + m_endpoints.push_back(m_postings_bytes_written); + } + + /** + * Adds multiple posting list blocks. + * + * \tparam BlockDataRange This intended to be a vector of structs of the type + * block_posting_list::document_enumerator::block_data + * + * \param n Posting list length. + * \param blocks Encoded blocks. + * + * \throws std::invalid_argument Thrown if `n == 0`. + * \throws std::ios_base::failure Thrown if failed to write to the temporary file buffer. + */ + template + void add_posting_list(std::uint64_t n, BlockDataRange const& blocks) { + if (!n) { + throw std::invalid_argument("List must be nonempty"); + } + std::vector buf; + write_blocks(buf, n, blocks); + m_postings_bytes_written += buf.size(); + m_postings_output.write(reinterpret_cast(buf.data()), buf.size()); + m_endpoints.push_back(m_postings_bytes_written); + } + + /** + * Adds a posting list that is already fully encoded. + * + * \tparam BytesRange A collection of bytes, e.g., std::vector + * + * \param data Encoded data. + * + * \throws std::ios_base::failure Thrown if failed to write to the temporary file buffer. + */ + template + void add_posting_list(BytesRange const& data) { + m_postings_bytes_written += data.size(); + m_postings_output.write(reinterpret_cast(data.data()), data.size()); + m_endpoints.push_back(m_postings_bytes_written); + } + + /** + * Flushes index data to disk. + * + * \param index_path Output index path. + * + * \throws std::ios_base::failure Thrown if failed to write to any file + * or failed to read from the temporary buffer. + */ + void build(std::string const& index_path); + }; + +}; // namespace index::block }; // namespace pisa diff --git a/include/pisa/codec/block_codec.hpp b/include/pisa/codec/block_codec.hpp index 12124f82..28c5495e 100644 --- a/include/pisa/codec/block_codec.hpp +++ b/include/pisa/codec/block_codec.hpp @@ -3,13 +3,6 @@ #include #include -#include "codec/block_codecs.hpp" -#include "util/util.hpp" - -extern "C" { -#include "simdcomp/include/simdbitpacking.h" -} - namespace pisa { class BlockCodec { @@ -25,65 +18,4 @@ class BlockCodec { [[nodiscard]] virtual auto block_size() const noexcept -> std::size_t = 0; }; -class SimdBpBlockCodec: public BlockCodec { - static constexpr std::uint64_t m_block_size = 128; - - void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out) const { - assert(n <= m_block_size); - auto* src = const_cast(in); - if (n < m_block_size) { - interpolative_block::encode(src, sum_of_values, n, out); - return; - } - uint32_t b = maxbits(in); - thread_local std::vector buf(8 * n); - uint8_t* buf_ptr = buf.data(); - *buf_ptr++ = b; - simdpackwithoutmask(src, (__m128i*)buf_ptr, b); - out.insert(out.end(), buf.data(), buf.data() + b * sizeof(__m128i) + 1); - } - - uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const { - assert(n <= m_block_size); - if PISA_UNLIKELY (n < m_block_size) { - return interpolative_block::decode(in, out, sum_of_values, n); - } - uint32_t b = *in++; - simdunpack((const __m128i*)in, out, b); - return in + b * sizeof(__m128i); - } - - auto block_size() const noexcept -> std::size_t { return m_block_size; } -}; - -class Simple16BlockCodec: public BlockCodec { - static constexpr std::uint64_t m_block_size = 128; - - void - encode(uint32_t const* in, uint32_t /* sum_of_values */, size_t n, std::vector& out) const { - assert(n <= m_block_size); - thread_local FastPForLib::Simple16 codec; - thread_local std::array buf{}; - size_t out_len = buf.size(); - codec.encodeArray(in, n, reinterpret_cast(buf.data()), out_len); - out_len *= 4; - out.insert(out.end(), buf.data(), buf.data() + out_len); - } - - uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const { - assert(n <= m_block_size); - FastPForLib::Simple16 codec; - std::array buf{}; - - auto const* ret = reinterpret_cast( - codec.decodeArray(reinterpret_cast(in), 8 * n, buf.data(), n) - ); - - std::copy(buf.begin(), std::next(buf.begin(), n), out); - return ret; - } - - auto block_size() const noexcept -> std::size_t { return m_block_size; } -}; - }; // namespace pisa diff --git a/include/pisa/codec/simdbp.hpp b/include/pisa/codec/simdbp.hpp index 200aadba..1c35b854 100644 --- a/include/pisa/codec/simdbp.hpp +++ b/include/pisa/codec/simdbp.hpp @@ -1,7 +1,7 @@ #pragma once +#include "codec/block_codec.hpp" #include "codec/block_codecs.hpp" -#include "util/util.hpp" #include extern "C" { @@ -36,4 +36,14 @@ struct simdbp_block { return in + b * sizeof(__m128i); } }; + +class SimdBpBlockCodec: public BlockCodec { + static constexpr std::uint64_t m_block_size = 128; + + public: + void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out) const; + uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const; + auto block_size() const noexcept -> std::size_t { return m_block_size; } +}; + } // namespace pisa diff --git a/include/pisa/codec/simple16.hpp b/include/pisa/codec/simple16.hpp index bd937e8c..303305ee 100644 --- a/include/pisa/codec/simple16.hpp +++ b/include/pisa/codec/simple16.hpp @@ -1,8 +1,10 @@ #pragma once -#include "FastPFor/headers/simple16.h" #include +#include "FastPFor/headers/simple16.h" +#include "codec/block_codec.hpp" + namespace pisa { struct simple16_block { @@ -34,4 +36,34 @@ struct simple16_block { } }; +class Simple16BlockCodec: public BlockCodec { + static constexpr std::uint64_t m_block_size = 128; + + void + encode(uint32_t const* in, uint32_t /* sum_of_values */, size_t n, std::vector& out) const { + assert(n <= m_block_size); + thread_local FastPForLib::Simple16 codec; + thread_local std::array buf{}; + size_t out_len = buf.size(); + codec.encodeArray(in, n, reinterpret_cast(buf.data()), out_len); + out_len *= 4; + out.insert(out.end(), buf.data(), buf.data() + out_len); + } + + uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const { + assert(n <= m_block_size); + FastPForLib::Simple16 codec; + std::array buf{}; + + auto const* ret = reinterpret_cast( + codec.decodeArray(reinterpret_cast(in), 8 * n, buf.data(), n) + ); + + std::copy(buf.begin(), std::next(buf.begin(), n), out); + return ret; + } + + auto block_size() const noexcept -> std::size_t { return m_block_size; } +}; + } // namespace pisa diff --git a/src/block_inverted_index.cpp b/src/block_inverted_index.cpp new file mode 100644 index 00000000..0bf21117 --- /dev/null +++ b/src/block_inverted_index.cpp @@ -0,0 +1,109 @@ +#include "block_inverted_index.hpp" +#include "codec/compact_elias_fano.hpp" +#include "mappable/mapper.hpp" + +namespace pisa { + +BlockInvertedIndex::BlockInvertedIndex(MemorySource source, std::unique_ptr block_codec) + : m_source(std::move(source)), m_block_codec(std::move(block_codec)) { + PISA_ASSERT_CONCEPT((concepts::SortedInvertedIndex)); + mapper::map(*this, m_source.data(), mapper::map_flags::warmup); +} + +auto BlockInvertedIndex::operator[](std::size_t term_id) const -> BlockInvertedIndexCursor { + check_term_range(term_id); + compact_elias_fano::enumerator endpoints(m_endpoints, 0, m_lists.size(), m_size, m_params); + auto endpoint = endpoints.move(term_id).second; + return BlockInvertedIndexCursor(m_block_codec.get(), m_lists.data() + endpoint, num_docs()); +} + +void BlockInvertedIndex::check_term_range(std::size_t term_id) const { + if (term_id >= size()) { + throw std::out_of_range( + fmt::format("given term ID ({}) is out of range, must be < {}", term_id, size()) + ); + } +} + +void BlockInvertedIndex::warmup(std::size_t term_id) const { + check_term_range(term_id); + compact_elias_fano::enumerator endpoints(m_endpoints, 0, m_lists.size(), m_size, m_params); + + auto begin = endpoints.move(term_id).second; + auto end = m_lists.size(); + if (term_id + 1 != size()) { + end = endpoints.move(term_id + 1).second; + } + + volatile std::uint32_t tmp; + for (std::size_t i = begin; i != end; ++i) { + tmp = m_lists[i]; + } + (void)tmp; +} + +index::block::InMemoryBuilder::InMemoryBuilder( + std::uint64_t num_docs, global_parameters const& params, BlockPostingWriter posting_writer +) + : m_params(params), m_num_docs(num_docs), m_posting_writer(posting_writer) { + m_endpoints.push_back(0); +} + +void index::block::InMemoryBuilder::build(BlockInvertedIndex& sq) { + sq.m_params = m_params; + sq.m_size = m_endpoints.size() - 1; + sq.m_num_docs = m_num_docs; + + // This is a workaround to QMX codex having to sometimes look beyond the buffer + // due to some SIMD loads. + std::array padding{}; + m_lists.insert(m_lists.end(), padding.begin(), padding.end()); + sq.m_lists.steal(m_lists); + + bit_vector_builder bvb; + compact_elias_fano::write(bvb, m_endpoints.begin(), sq.m_lists.size(), sq.m_size, m_params); + bit_vector(&bvb).swap(sq.m_endpoints); +} + +index::block::StreamBuilder::StreamBuilder( + std::uint64_t num_docs, global_parameters const& params, BlockPostingWriter posting_writer +) + : m_params(params), + m_postings_output((tmp.path() / "buffer").c_str()), + m_posting_writer(posting_writer) { + m_postings_output.exceptions(std::ios::badbit | std::ios::failbit); + m_num_docs = num_docs; + m_endpoints.push_back(0); +} + +void index::block::StreamBuilder::build(std::string const& index_path) { + // This is a workaround to QMX codex having to sometimes look beyond the buffer + // due to some SIMD loads. + std::array padding{}; + m_postings_output.write(padding.data(), padding.size()); + m_postings_bytes_written += padding.size(); + + std::ofstream os(index_path.c_str()); + std::cout << index_path.c_str() << "\n"; + os.exceptions(std::ios::badbit | std::ios::failbit); + mapper::detail::freeze_visitor freezer(os, 0); + freezer(m_params, "m_params"); + std::size_t size = m_endpoints.size() - 1; + freezer(size, "size"); + freezer(m_num_docs, "m_num_docs"); + + bit_vector_builder bvb; + compact_elias_fano::write(bvb, m_endpoints.begin(), m_postings_bytes_written, size, m_params); + bit_vector endpoints(&bvb); + freezer(endpoints, "endpoints"); + + m_postings_output.close(); + std::ifstream buf((tmp.path() / "buffer").c_str()); + buf.exceptions(std::ios::badbit); + os.write( + reinterpret_cast(&m_postings_bytes_written), sizeof(m_postings_bytes_written) + ); + os << buf.rdbuf(); +} + +} // namespace pisa diff --git a/src/codec/simdbp.cpp b/src/codec/simdbp.cpp new file mode 100644 index 00000000..86574cf1 --- /dev/null +++ b/src/codec/simdbp.cpp @@ -0,0 +1,33 @@ +#include "codec/simdbp.hpp" + +namespace pisa { + +void SimdBpBlockCodec::encode( + uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out +) const { + assert(n <= m_block_size); + auto* src = const_cast(in); + if (n < m_block_size) { + interpolative_block::encode(src, sum_of_values, n, out); + return; + } + uint32_t b = maxbits(in); + thread_local std::vector buf(8 * n); + uint8_t* buf_ptr = buf.data(); + *buf_ptr++ = b; + simdpackwithoutmask(src, (__m128i*)buf_ptr, b); + out.insert(out.end(), buf.data(), buf.data() + b * sizeof(__m128i) + 1); +} + +uint8_t const* +SimdBpBlockCodec::decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const { + assert(n <= m_block_size); + if PISA_UNLIKELY (n < m_block_size) { + return interpolative_block::decode(in, out, sum_of_values, n); + } + uint32_t b = *in++; + simdunpack((const __m128i*)in, out, b); + return in + b * sizeof(__m128i); +} + +} // namespace pisa diff --git a/tools/queries_dynamic.cpp b/tools/queries_dynamic.cpp index f2acb9ad..7289723e 100644 --- a/tools/queries_dynamic.cpp +++ b/tools/queries_dynamic.cpp @@ -310,6 +310,16 @@ using wand_raw_index = wand_data; using wand_uniform_index = wand_data>; using wand_uniform_index_quantized = wand_data>; +auto resolve_codec(std::string_view encoding) -> std::unique_ptr { + if (encoding == "block_simdbp") { + return std::make_unique(); + } + if (encoding == "simple16_simdbp") { + return std::make_unique(); + } + throw std::domain_error("invalid encoding type"); +} + int main(int argc, const char** argv) { bool extract = false; bool safe = false; @@ -335,20 +345,9 @@ int main(int argc, const char** argv) { std::cout << "qid\tusec\n"; } - auto index = [&]() { - if (app.index_encoding() == "block_simdbp") { - return BlockInvertedIndex( - MemorySource::mapped_file(app.index_filename()), std::make_unique() - ); - } - if (app.index_encoding() == "simple16_simdbp") { - return BlockInvertedIndex( - MemorySource::mapped_file(app.index_filename()), - std::make_unique() - ); - } - throw std::domain_error("invalid encoding type"); - }(); + BlockInvertedIndex index( + MemorySource::mapped_file(app.index_filename()), resolve_codec(app.index_encoding()) + ); auto params = std::make_tuple( &index,