diff --git a/.clang-format b/.clang-format index 06738d38..d28e5f99 100644 --- a/.clang-format +++ b/.clang-format @@ -1,6 +1,6 @@ --- AccessModifierOffset: '-2' -AlignAfterOpenBracket: AlwaysBreak +AlignAfterOpenBracket: BlockIndent AlignConsecutiveAssignments: 'false' AlignConsecutiveDeclarations: 'false' AlignEscapedNewlines: Left @@ -26,7 +26,7 @@ BreakBeforeTernaryOperators: 'true' BreakConstructorInitializers: BeforeColon BreakInheritanceList: BeforeColon BraceWrapping: - AfterFunction: true + AfterFunction: false SplitEmptyFunction: false ColumnLimit: '100' CompactNamespaces: 'true' diff --git a/.github/workflows/commit.yml b/.github/workflows/commit.yml index d2c8bdce..bb8d3337 100644 --- a/.github/workflows/commit.yml +++ b/.github/workflows/commit.yml @@ -100,7 +100,7 @@ jobs: - name: Install clang-format shell: bash run: | - sudo apt-get install -y clang-format-9 + sudo apt-get install -y clang-format - name: Set up Python uses: actions/setup-python@v1 @@ -112,7 +112,7 @@ jobs: run: | wget https://raw.githubusercontent.com/Sarcasm/run-clang-format/master/run-clang-format.py python run-clang-format.py \ - --clang-format-executable clang-format-9 \ + --clang-format-executable clang-format \ -r src/**/*.cpp include/pisa/**/*.hpp tools/*.cpp tools/*.hpp test/*.cpp headers: diff --git a/include/pisa/accumulator/lazy_accumulator.hpp b/include/pisa/accumulator/lazy_accumulator.hpp index 8bd4ec66..1842041e 100644 --- a/include/pisa/accumulator/lazy_accumulator.hpp +++ b/include/pisa/accumulator/lazy_accumulator.hpp @@ -24,7 +24,8 @@ class LazyAccumulator { using reference = float&; static_assert( - std::is_integral_v && std::is_unsigned_v, "must be unsigned number"); + std::is_integral_v && std::is_unsigned_v, "must be unsigned number" + ); constexpr static auto descriptor_size_in_bits = sizeof(Descriptor) * 8; constexpr static auto counters_in_descriptor = descriptor_size_in_bits / counter_bit_size; constexpr static auto cycle = (1U << counter_bit_size); @@ -34,8 +35,7 @@ class LazyAccumulator { Descriptor descriptor{}; std::array accumulators{}; - [[nodiscard]] auto counter(int pos) const noexcept -> int - { + [[nodiscard]] auto counter(int pos) const noexcept -> int { if constexpr (counter_bit_size == 8) { // NOLINT(readability-braces-around-statements) return static_cast(*(reinterpret_cast(&descriptor) + pos)); } else { @@ -43,8 +43,7 @@ class LazyAccumulator { } } - void reset_counter(int pos, int counter) - { + void reset_counter(int pos, int counter) { if constexpr (counter_bit_size == 8) { // NOLINT(readability-braces-around-statements) *(reinterpret_cast(&descriptor) + pos) = static_cast(counter); } else { @@ -58,13 +57,11 @@ class LazyAccumulator { public: explicit LazyAccumulator(std::size_t size) - : m_size(size), m_accumulators((size + counters_in_descriptor - 1) / counters_in_descriptor) - { + : m_size(size), m_accumulators((size + counters_in_descriptor - 1) / counters_in_descriptor) { PISA_ASSERT_CONCEPT(PartialScoreAccumulator); } - void reset() - { + void reset() { if (m_counter == 0) { auto first = reinterpret_cast(&m_accumulators.front()); auto last = @@ -73,8 +70,7 @@ class LazyAccumulator { } } - void accumulate(std::size_t document, float score) - { + void accumulate(std::size_t document, float score) { auto const block = document / counters_in_descriptor; auto const pos_in_block = document % counters_in_descriptor; if (m_accumulators[block].counter(pos_in_block) != m_counter) { @@ -83,8 +79,7 @@ class LazyAccumulator { m_accumulators[block].accumulators[pos_in_block] += score; } - void collect(topk_queue& topk) - { + void collect(topk_queue& topk) { uint64_t docid = 0U; for (auto const& block: m_accumulators) { int pos = 0; diff --git a/include/pisa/accumulator/simple_accumulator.hpp b/include/pisa/accumulator/simple_accumulator.hpp index 604e11ce..cd374d88 100644 --- a/include/pisa/accumulator/simple_accumulator.hpp +++ b/include/pisa/accumulator/simple_accumulator.hpp @@ -32,8 +32,7 @@ namespace pisa { */ class SimpleAccumulator: public std::vector { public: - explicit SimpleAccumulator(std::size_t size) : std::vector(size) - { + explicit SimpleAccumulator(std::size_t size) : std::vector(size) { PISA_ASSERT_CONCEPT(PartialScoreAccumulator); } @@ -41,8 +40,7 @@ class SimpleAccumulator: public std::vector { void accumulate(std::uint32_t doc, float score) { operator[](doc) += score; } - void collect(topk_queue& topk) - { + void collect(topk_queue& topk) { std::uint32_t docid = 0U; std::for_each(begin(), end(), [&](auto score) { if (topk.would_enter(score)) { diff --git a/include/pisa/algorithm.hpp b/include/pisa/algorithm.hpp index a35c23bc..62244a7d 100644 --- a/include/pisa/algorithm.hpp +++ b/include/pisa/algorithm.hpp @@ -9,12 +9,9 @@ namespace pisa { namespace execution { - class sequenced_policy { - }; - class parallel_policy { - }; - class parallel_unsequenced_policy { - }; + class sequenced_policy {}; + class parallel_policy {}; + class parallel_unsequenced_policy {}; inline constexpr sequenced_policy seq{}; inline constexpr parallel_policy par{}; @@ -23,20 +20,17 @@ namespace execution { #if defined(_LIBCPP_HAS_PARALLEL_ALGORITHMS) [[nodiscard]] constexpr auto to_std(pisa::execution::sequenced_policy /* policy */) - -> std::execution::sequenced_policy - { + -> std::execution::sequenced_policy { return std::execution::seq; } [[nodiscard]] constexpr auto to_std(pisa::execution::parallel_policy /* policy */) - -> std::execution::parallel_policy - { + -> std::execution::parallel_policy { return std::execution::par; } [[nodiscard]] constexpr auto to_std(pisa::execution::parallel_unsequenced_policy /* policy */) - -> std::execution::parallel_unsequenced_policy - { + -> std::execution::parallel_unsequenced_policy { return std::execution::par_unseq; } @@ -52,8 +46,8 @@ OutputIt transform( ForwardIt1 first, ForwardIt1 last, OutputIt d_first, - UnaryOperation unary_op) -{ + UnaryOperation unary_op +) { #if defined(_LIBCPP_HAS_PARALLEL_ALGORITHMS) auto std_policy = pisa::execution::to_std(policy); return std::transform(std_policy, first, last, d_first, unary_op); @@ -71,8 +65,8 @@ OutputIt transform( ForwardIt1 last1, ForwardIt2 first2, OutputIt d_first, - BinaryOperation binary_op) -{ + BinaryOperation binary_op +) { #if defined(_LIBCPP_HAS_PARALLEL_ALGORITHMS) auto std_policy = pisa::execution::to_std(policy); return std::transform(std_policy, first1, last1, first2, d_first, binary_op); @@ -82,8 +76,7 @@ OutputIt transform( } template -void sort([[maybe_unused]] ExecutionPolicy&& policy, RandomIt first, RandomIt last) -{ +void sort([[maybe_unused]] ExecutionPolicy&& policy, RandomIt first, RandomIt last) { #if defined(_LIBCPP_HAS_PARALLEL_ALGORITHMS) auto std_policy = pisa::execution::to_std(policy); return std::sort(std_policy, first, last); @@ -93,8 +86,7 @@ void sort([[maybe_unused]] ExecutionPolicy&& policy, RandomIt first, RandomIt la } template -void sort([[maybe_unused]] ExecutionPolicy&& policy, RandomIt first, RandomIt last, Compare comp) -{ +void sort([[maybe_unused]] ExecutionPolicy&& policy, RandomIt first, RandomIt last, Compare comp) { #if defined(_LIBCPP_HAS_PARALLEL_ALGORITHMS) auto std_policy = pisa::execution::to_std(policy); return std::sort(std_policy, first, last, comp); @@ -105,8 +97,8 @@ void sort([[maybe_unused]] ExecutionPolicy&& policy, RandomIt first, RandomIt la template void for_each( - [[maybe_unused]] ExecutionPolicy&& policy, ForwardIt first, ForwardIt last, UnaryFunction2 f) -{ + [[maybe_unused]] ExecutionPolicy&& policy, ForwardIt first, ForwardIt last, UnaryFunction2 f +) { #if defined(_LIBCPP_HAS_PARALLEL_ALGORITHMS) auto std_policy = pisa::execution::to_std(policy); std::for_each(std_policy, first, last, f); diff --git a/include/pisa/binary_collection.hpp b/include/pisa/binary_collection.hpp index 6274f825..c317a4bc 100644 --- a/include/pisa/binary_collection.hpp +++ b/include/pisa/binary_collection.hpp @@ -24,8 +24,7 @@ class base_binary_collection { using pointer = typename std:: conditional::value, posting_type const, posting_type>::type*; - explicit base_binary_collection(const char* filename) - { + explicit base_binary_collection(const char* filename) { std::error_code error; m_file.map(filename, error); if (error) { @@ -56,8 +55,7 @@ class base_binary_collection { pointer end() const { return m_end; } size_t size() const { return m_end - m_begin; } - posting_type back() const - { + posting_type back() const { assert(size()); return *(m_end - 1); } @@ -100,15 +98,13 @@ class base_binary_collection { auto const* operator->() const { return &m_cur_seq; } - base_iterator& operator++() - { + base_iterator& operator++() { m_pos = m_next_pos; read(); return *this; } - bool operator==(base_iterator const& other) const - { + bool operator==(base_iterator const& other) const { assert(m_data == other.m_data); assert(m_data_size == other.m_data_size); return m_pos == other.m_pos; @@ -120,13 +116,11 @@ class base_binary_collection { friend class base_binary_collection; base_iterator(base_binary_collection const* coll, size_t pos) - : m_data(coll->m_data), m_data_size(coll->m_data_size), m_pos(pos) - { + : m_data(coll->m_data), m_data_size(coll->m_data_size), m_pos(pos) { read(); } - void read() - { + void read() { assert(m_pos <= m_data_size); if (m_pos == m_data_size) { return; diff --git a/include/pisa/binary_freq_collection.hpp b/include/pisa/binary_freq_collection.hpp index 081ae7d5..8ececc0b 100644 --- a/include/pisa/binary_freq_collection.hpp +++ b/include/pisa/binary_freq_collection.hpp @@ -12,8 +12,7 @@ class binary_freq_collection { public: explicit binary_freq_collection(const char* basename) : m_docs((std::string(basename) + ".docs").c_str()), - m_freqs((std::string(basename) + ".freqs").c_str()) - { + m_freqs((std::string(basename) + ".freqs").c_str()) { auto firstseq = *m_docs.begin(); if (firstseq.size() != 1) { throw std::invalid_argument("First sequence should only contain number of documents"); @@ -23,8 +22,7 @@ class binary_freq_collection { class iterator; - iterator begin() const - { + iterator begin() const { auto docs_it = m_docs.begin(); return iterator(++docs_it, m_freqs.begin()); } @@ -54,8 +52,7 @@ class binary_freq_collection { sequence const* operator->() const { return &m_cur_seq; } - iterator& operator++() - { + iterator& operator++() { m_cur_seq.docs = *++m_docs_it; m_cur_seq.freqs = *++m_freqs_it; return *this; @@ -69,8 +66,7 @@ class binary_freq_collection { friend class binary_freq_collection; iterator(binary_collection::const_iterator docs_it, binary_collection::const_iterator freqs_it) - : m_docs_it(docs_it), m_freqs_it(freqs_it) - { + : m_docs_it(docs_it), m_freqs_it(freqs_it) { m_cur_seq.docs = *m_docs_it; m_cur_seq.freqs = *m_freqs_it; } diff --git a/include/pisa/bit_vector.hpp b/include/pisa/bit_vector.hpp index d03aac63..3640bb3c 100644 --- a/include/pisa/bit_vector.hpp +++ b/include/pisa/bit_vector.hpp @@ -16,8 +16,7 @@ class bit_vector { bit_vector() = default; template - explicit bit_vector(Range const& from) - { + explicit bit_vector(Range const& from) { std::vector bits; auto const first_mask = uint64_t(1); uint64_t mask = first_mask; @@ -44,21 +43,18 @@ class bit_vector { explicit bit_vector(bit_vector_builder* from); template - void map(Visitor& visit) - { + void map(Visitor& visit) { visit(m_size, "m_size")(m_bits, "m_bits"); } - void swap(bit_vector& other) - { + void swap(bit_vector& other) { std::swap(other.m_size, m_size); other.m_bits.swap(m_bits); } inline size_t size() const { return m_size; } - inline bool operator[](uint64_t pos) const - { + inline bool operator[](uint64_t pos) const { assert(pos < m_size); uint64_t block = pos / 64; assert(block < m_bits.size()); @@ -66,8 +62,7 @@ class bit_vector { return ((m_bits[block] >> shift) & 1) != 0U; } - inline uint64_t get_bits(uint64_t pos, uint64_t len) const - { + inline uint64_t get_bits(uint64_t pos, uint64_t len) const { assert(pos + len <= size()); assert(len <= 64); if (len == 0U) { @@ -84,8 +79,7 @@ class bit_vector { } // same as get_bits(pos, 64) but it can extend further size(), padding with zeros - inline uint64_t get_word(uint64_t pos) const - { + inline uint64_t get_word(uint64_t pos) const { assert(pos < size()); uint64_t block = pos / 64; uint64_t shift = pos % 64; @@ -97,8 +91,7 @@ class bit_vector { } // unsafe and fast version of get_word, it retrieves at least 56 bits - inline uint64_t get_word56(uint64_t pos) const - { + inline uint64_t get_word56(uint64_t pos) const { // XXX check endianness? const char* ptr = reinterpret_cast(m_bits.data()); std::uint64_t word; @@ -106,8 +99,7 @@ class bit_vector { return word >> (pos % 8); } - inline uint64_t predecessor0(uint64_t pos) const - { + inline uint64_t predecessor0(uint64_t pos) const { assert(pos < m_size); uint64_t block = pos / 64; uint64_t shift = 64 - pos % 64 - 1; @@ -122,8 +114,7 @@ class bit_vector { return block * 64 + ret; } - inline uint64_t successor0(uint64_t pos) const - { + inline uint64_t successor0(uint64_t pos) const { assert(pos < m_size); uint64_t block = pos / 64; uint64_t shift = pos % 64; @@ -138,8 +129,7 @@ class bit_vector { return block * 64 + ret; } - inline uint64_t predecessor1(uint64_t pos) const - { + inline uint64_t predecessor1(uint64_t pos) const { assert(pos < m_size); uint64_t block = pos / 64; uint64_t shift = 64 - pos % 64 - 1; @@ -154,8 +144,7 @@ class bit_vector { return block * 64 + ret; } - inline uint64_t successor1(uint64_t pos) const - { + inline uint64_t successor1(uint64_t pos) const { assert(pos < m_size); uint64_t block = pos / 64; uint64_t shift = pos % 64; @@ -175,13 +164,11 @@ class bit_vector { struct enumerator { enumerator() = default; - enumerator(bit_vector const& bv, size_t pos) : m_bv(&bv), m_pos(pos), m_buf(0), m_avail(0) - { + enumerator(bit_vector const& bv, size_t pos) : m_bv(&bv), m_pos(pos), m_buf(0), m_avail(0) { m_bv->data().prefetch(m_pos / 64); } - inline bool next() - { + inline bool next() { if (m_avail == 0U) { fill_buf(); } @@ -192,8 +179,7 @@ class bit_vector { return b; } - inline uint64_t take(size_t l) - { + inline uint64_t take(size_t l) { if (m_avail < l) { fill_buf(); } @@ -209,8 +195,7 @@ class bit_vector { return val; } - inline uint64_t skip_zeros() - { + inline uint64_t skip_zeros() { uint64_t zs = 0; // XXX the loop may be optimized by aligning access while (m_buf == 0U) { @@ -231,8 +216,7 @@ class bit_vector { inline uint64_t position() const { return m_pos; } private: - inline void fill_buf() - { + inline void fill_buf() { m_buf = m_bv->get_word(m_pos); m_avail = 64; } @@ -246,8 +230,7 @@ class bit_vector { struct unary_enumerator { unary_enumerator() : m_data(0), m_position(0), m_buf(0) {} - unary_enumerator(bit_vector const& bv, uint64_t pos) - { + unary_enumerator(bit_vector const& bv, uint64_t pos) { m_data = bv.data().data(); m_position = pos; m_buf = m_data[pos / 64]; @@ -257,8 +240,7 @@ class bit_vector { uint64_t position() const { return m_position; } - uint64_t next() - { + uint64_t next() { unsigned long pos_in_word; uint64_t buf = m_buf; while (broadword::lsb(buf, pos_in_word) == 0U) { @@ -272,8 +254,7 @@ class bit_vector { } // skip to the k-th one after the current position - void skip(uint64_t k) - { + void skip(uint64_t k) { uint64_t skipped = 0; uint64_t buf = m_buf; uint64_t w = 0; @@ -289,8 +270,7 @@ class bit_vector { } // return the position of the k-th one after the current position. - uint64_t skip_no_move(uint64_t k) - { + uint64_t skip_no_move(uint64_t k) { uint64_t position = m_position; uint64_t skipped = 0; uint64_t buf = m_buf; @@ -307,8 +287,7 @@ class bit_vector { } // skip to the k-th zero after the current position - void skip0(uint64_t k) - { + void skip0(uint64_t k) { uint64_t skipped = 0; uint64_t pos_in_word = m_position % 64; uint64_t buf = ~m_buf & (uint64_t(-1) << pos_in_word); diff --git a/include/pisa/bit_vector_builder.hpp b/include/pisa/bit_vector_builder.hpp index 9dcdd07c..2146145e 100644 --- a/include/pisa/bit_vector_builder.hpp +++ b/include/pisa/bit_vector_builder.hpp @@ -10,7 +10,9 @@ namespace pisa { namespace detail { /// Returns the number of 64-bit words needed to store `n` bits. - inline std::size_t words_for(uint64_t n) { return ceil_div(n, 64); } + inline std::size_t words_for(uint64_t n) { + return ceil_div(n, 64); + } } // namespace detail @@ -30,8 +32,7 @@ class bit_vector_builder { void reserve(uint64_t size); /// Appends one bit to the end of the vector. - inline void push_back(bool b) - { + inline void push_back(bool b) { uint64_t pos_in_word = m_size % 64; if (pos_in_word == 0) { m_bits.push_back(0); @@ -42,8 +43,7 @@ class bit_vector_builder { } /// Sets a bit at the position `pos` to the given value. - inline void set(uint64_t pos, bool b) - { + inline void set(uint64_t pos, bool b) { uint64_t word = pos / 64; uint64_t pos_in_word = pos % 64; @@ -52,8 +52,7 @@ class bit_vector_builder { } /// Overrides `len` bits, starting from `pos`, with the first `len` bits from `bits`. - inline void set_bits(uint64_t pos, uint64_t bits, size_t len) - { + inline void set_bits(uint64_t pos, uint64_t bits, size_t len) { assert(pos + len <= size()); // check there are no spurious bits assert(len == 64 || (bits >> len) == 0); @@ -75,8 +74,7 @@ class bit_vector_builder { } /// Appends the first `len` bits from `bits`. - inline void append_bits(uint64_t bits, size_t len) - { + inline void append_bits(uint64_t bits, size_t len) { // check there are no spurious bits assert(len == 64 || (bits >> len) == 0); if (len == 0U) { @@ -96,8 +94,7 @@ class bit_vector_builder { } /// Extends the vector with n zeroes. - inline void zero_extend(uint64_t n) - { + inline void zero_extend(uint64_t n) { m_size += n; uint64_t needed = detail::words_for(m_size) - m_bits.size(); if (needed != 0U) { @@ -107,8 +104,7 @@ class bit_vector_builder { } /// Extends the vector with n ones. - inline void one_extend(uint64_t n) - { + inline void one_extend(uint64_t n) { while (n >= 64) { append_bits(uint64_t(-1), 64); n -= 64; @@ -125,8 +121,7 @@ class bit_vector_builder { void reverse(); /// Returns a reference to the underlying data buffer. - bits_type& move_bits() - { + bits_type& move_bits() { assert(detail::words_for(m_size) == m_bits.size()); return m_bits; } diff --git a/include/pisa/bitvector_collection.hpp b/include/pisa/bitvector_collection.hpp index 73eb734b..712ea1eb 100644 --- a/include/pisa/bitvector_collection.hpp +++ b/include/pisa/bitvector_collection.hpp @@ -13,19 +13,16 @@ class bitvector_collection { class builder { public: - explicit builder(global_parameters const& params) : m_params(params) - { + explicit builder(global_parameters const& params) : m_params(params) { m_endpoints.push_back(0); } - void append(bit_vector_builder& bvb) - { + void append(bit_vector_builder& bvb) { m_bitvectors.append(bvb); m_endpoints.push_back(m_bitvectors.size()); } - void build(bitvector_collection& sq) - { + void build(bitvector_collection& sq) { sq.m_size = m_endpoints.size() - 1; // padding is necessary to not read after buffer m_bitvectors.append_bits(0, 64); @@ -33,7 +30,8 @@ class bitvector_collection { bit_vector_builder bvb; compact_elias_fano::write( - bvb, m_endpoints.begin(), m_bitvectors.size(), sq.m_size, m_params); + bvb, m_endpoints.begin(), m_bitvectors.size(), sq.m_size, m_params + ); // padding is necessary to not read after buffer bvb.append_bits(0, 64); bit_vector(&bvb).swap(sq.m_endpoints); @@ -49,8 +47,7 @@ class bitvector_collection { bit_vector const& bits() const { return m_bitvectors; } - bit_vector::enumerator get(global_parameters const& params, size_t i) const - { + bit_vector::enumerator get(global_parameters const& params, size_t i) const { assert(i < size()); compact_elias_fano::enumerator endpoints(m_endpoints, 0, m_bitvectors.size(), m_size, params); @@ -58,16 +55,14 @@ class bitvector_collection { return bit_vector::enumerator(m_bitvectors, endpoint); } - void swap(bitvector_collection& other) - { + void swap(bitvector_collection& other) { std::swap(m_size, other.m_size); m_endpoints.swap(other.m_endpoints); m_bitvectors.swap(other.m_bitvectors); } template - void map(Visitor& visit) - { + void map(Visitor& visit) { visit(m_size, "m_size")(m_endpoints, "m_endpoints")(m_bitvectors, "m_bitvectors"); } diff --git a/include/pisa/block_freq_index.hpp b/include/pisa/block_freq_index.hpp index 9fbdd365..6bf1a552 100644 --- a/include/pisa/block_freq_index.hpp +++ b/include/pisa/block_freq_index.hpp @@ -43,8 +43,7 @@ class block_freq_index { * throughout the life of the index. Once the source gets deallocated, * any index operations may result in undefined behavior. */ - explicit block_freq_index(MemorySource source) : m_source(std::move(source)) - { + explicit block_freq_index(MemorySource source) : m_source(std::move(source)) { mapper::map(*this, m_source.data(), mapper::map_flags::warmup); } @@ -58,8 +57,7 @@ class block_freq_index { /** * Constructs a builder for an index containing the given number of documents. */ - builder(std::uint64_t num_docs, global_parameters const& params) : m_params(params) - { + builder(std::uint64_t num_docs, global_parameters const& params) : m_params(params) { m_num_docs = num_docs; m_endpoints.push_back(0); } @@ -77,12 +75,8 @@ class block_freq_index { * \throws std::invalid_argument Thrown if `n == 0`. */ template - void add_posting_list( - std::uint64_t n, - DocsIterator docs_begin, - FreqsIterator freqs_begin, - std::uint64_t /* occurrences */) - { + void + add_posting_list(std::uint64_t n, DocsIterator docs_begin, FreqsIterator freqs_begin, std::uint64_t /* occurrences */) { if (!n) { throw std::invalid_argument("List must be nonempty"); } @@ -102,8 +96,7 @@ class block_freq_index { * \throws std::invalid_argument Thrown if `n == 0`. */ template - void add_posting_list(std::uint64_t n, BlockDataRange const& blocks) - { + void add_posting_list(std::uint64_t n, BlockDataRange const& blocks) { if (!n) { throw std::invalid_argument("List must be nonempty"); } @@ -119,8 +112,7 @@ class block_freq_index { * \param data Encoded data. */ template - void add_posting_list(BytesRange const& data) - { + void add_posting_list(BytesRange const& data) { m_lists.insert(m_lists.end(), std::begin(data), std::end(data)); m_endpoints.push_back(m_lists.size()); } @@ -130,8 +122,7 @@ class block_freq_index { * * \param sq Inverted index object that will take ownership of the data. */ - void build(block_freq_index& sq) - { + void build(block_freq_index& sq) { sq.m_params = m_params; sq.m_size = m_endpoints.size() - 1; sq.m_num_docs = m_num_docs; @@ -143,8 +134,7 @@ class block_freq_index { sq.m_lists.steal(m_lists); bit_vector_builder bvb; - compact_elias_fano::write( - bvb, m_endpoints.begin(), sq.m_lists.size(), sq.m_size, m_params); + compact_elias_fano::write(bvb, m_endpoints.begin(), sq.m_lists.size(), sq.m_size, m_params); bit_vector(&bvb).swap(sq.m_endpoints); } @@ -172,8 +162,7 @@ class block_freq_index { * \throws std::ios_base::failure Thrown if the the temporary buffer file cannot be opened. */ stream_builder(std::uint64_t num_docs, global_parameters const& params) - : m_params(params), m_postings_output((tmp.path() / "buffer").c_str()) - { + : m_params(params), m_postings_output((tmp.path() / "buffer").c_str()) { m_postings_output.exceptions(std::ios::badbit | std::ios::failbit); m_num_docs = num_docs; m_endpoints.push_back(0); @@ -194,12 +183,8 @@ class block_freq_index { * \throws std::ios_base::failure Thrown if failed to write to the temporary file buffer. */ template - void add_posting_list( - std::uint64_t n, - DocsIterator docs_begin, - FreqsIterator freqs_begin, - std::uint64_t /* occurrences */) - { + void + add_posting_list(std::uint64_t n, DocsIterator docs_begin, FreqsIterator freqs_begin, std::uint64_t /* occurrences */) { if (!n) { throw std::invalid_argument("List must be nonempty"); } @@ -223,8 +208,7 @@ class block_freq_index { * \throws std::ios_base::failure Thrown if failed to write to the temporary file buffer. */ template - void add_posting_list(std::uint64_t n, BlockDataRange const& blocks) - { + void add_posting_list(std::uint64_t n, BlockDataRange const& blocks) { if (!n) { throw std::invalid_argument("List must be nonempty"); } @@ -245,8 +229,7 @@ class block_freq_index { * \throws std::ios_base::failure Thrown if failed to write to the temporary file buffer. */ template - void add_posting_list(BytesRange const& data) - { + void add_posting_list(BytesRange const& data) { m_postings_bytes_written += data.size(); m_postings_output.write(reinterpret_cast(data.data()), data.size()); m_endpoints.push_back(m_postings_bytes_written); @@ -260,8 +243,7 @@ class block_freq_index { * \throws std::ios_base::failure Thrown if failed to write to any file * or failed to read from the temporary buffer. */ - void build(std::string const& index_path) - { + void build(std::string const& index_path) { // This is a workaround to QMX codex having to sometimes look beyond the buffer // due to some SIMD loads. std::array padding{}; @@ -279,7 +261,8 @@ class block_freq_index { bit_vector_builder bvb; compact_elias_fano::write( - bvb, m_endpoints.begin(), m_postings_bytes_written, size, m_params); + bvb, m_endpoints.begin(), m_postings_bytes_written, size, m_params + ); bit_vector endpoints(&bvb); freezer(endpoints, "endpoints"); @@ -288,7 +271,8 @@ class block_freq_index { buf.exceptions(std::ios::badbit); os.write( reinterpret_cast(&m_postings_bytes_written), - sizeof(m_postings_bytes_written)); + sizeof(m_postings_bytes_written) + ); os << buf.rdbuf(); } @@ -315,11 +299,11 @@ class block_freq_index { using document_enumerator = typename block_posting_list::document_enumerator; private: - void check_term_range(std::size_t term_id) const - { + void check_term_range(std::size_t term_id) const { if (term_id >= size()) { throw std::out_of_range( - fmt::format("given term ID ({}) is out of range, must be < {}", term_id, size())); + fmt::format("given term ID ({}) is out of range, must be < {}", term_id, size()) + ); } } @@ -334,8 +318,7 @@ class block_freq_index { * * \returns The cursor over the posting list. */ - [[nodiscard]] document_enumerator operator[](std::size_t term_id) const - { + [[nodiscard]] document_enumerator operator[](std::size_t term_id) const { check_term_range(term_id); compact_elias_fano::enumerator endpoints(m_endpoints, 0, m_lists.size(), m_size, m_params); auto endpoint = endpoints.move(term_id).second; @@ -350,8 +333,7 @@ class block_freq_index { * \throws std::out_of_range Thrown if term ID is greater than or equal to * number of terms in the index. */ - void warmup(std::size_t term_id) const - { + void warmup(std::size_t term_id) const { check_term_range(term_id); compact_elias_fano::enumerator endpoints(m_endpoints, 0, m_lists.size(), m_size, m_params); @@ -371,8 +353,7 @@ class block_freq_index { /** * Swaps all data with another index. */ - void swap(block_freq_index& other) - { + void swap(block_freq_index& other) { std::swap(m_params, other.m_params); std::swap(m_size, other.m_size); m_endpoints.swap(other.m_endpoints); @@ -380,8 +361,7 @@ class block_freq_index { } template - void map(Visitor& visit) - { + void map(Visitor& visit) { visit(m_params, "m_params")(m_size, "m_size")(m_num_docs, "m_num_docs")( m_endpoints, "m_endpoints")(m_lists, "m_lists"); } diff --git a/include/pisa/block_posting_list.hpp b/include/pisa/block_posting_list.hpp index a46ab1a5..2b05e2ca 100644 --- a/include/pisa/block_posting_list.hpp +++ b/include/pisa/block_posting_list.hpp @@ -10,8 +10,7 @@ template struct block_posting_list { template static void - write(std::vector& out, uint32_t n, DocsIterator docs_begin, FreqsIterator freqs_begin) - { + write(std::vector& out, uint32_t n, DocsIterator docs_begin, FreqsIterator freqs_begin) { TightVariableByte::encode_single(n, out); uint64_t block_size = BlockCodec::block_size; @@ -40,7 +39,8 @@ struct block_posting_list { *((uint32_t*)&out[begin_block_maxs + 4 * b]) = last_doc; BlockCodec::encode( - docs_buf.data(), last_doc - block_base - (cur_block_size - 1), cur_block_size, out); + docs_buf.data(), last_doc - block_base - (cur_block_size - 1), cur_block_size, out + ); BlockCodec::encode(freqs_buf.data(), uint32_t(-1), cur_block_size, out); if (b != blocks - 1) { *((uint32_t*)&out[begin_block_endpoints + 4 * b]) = out.size() - begin_blocks; @@ -50,8 +50,8 @@ struct block_posting_list { } template - static void write_blocks(std::vector& out, uint32_t n, BlockDataRange const& input_blocks) - { + static void + write_blocks(std::vector& out, uint32_t n, BlockDataRange const& input_blocks) { TightVariableByte::encode_single(n, out); assert(input_blocks.front().index == 0); // first block must remain first @@ -85,8 +85,7 @@ struct block_posting_list { m_block_maxs(m_base), m_block_endpoints(m_block_maxs + 4 * m_blocks), m_blocks_data(m_block_endpoints + 4 * (m_blocks - 1)), - m_universe(universe) - { + m_universe(universe) { if (Profile) { // std::cout << "OPEN\t" << m_term_id << "\t" << m_blocks << "\n"; m_block_profile = block_profiler::open_list(term_id, m_blocks); @@ -98,8 +97,7 @@ struct block_posting_list { void reset() { decode_docs_block(0); } - void PISA_ALWAYSINLINE next() - { + void PISA_ALWAYSINLINE next() { ++m_pos_in_block; if (PISA_UNLIKELY(m_pos_in_block == m_cur_block_size)) { if (m_cur_block + 1 == m_blocks) { @@ -119,8 +117,7 @@ struct block_posting_list { * In particular, if called with a value that is less than or equal * to the current document ID, the position will not change. */ - void PISA_ALWAYSINLINE next_geq(uint64_t lower_bound) - { + void PISA_ALWAYSINLINE next_geq(uint64_t lower_bound) { if (PISA_UNLIKELY(lower_bound > m_cur_block_max)) { // binary search seems to perform worse here if (lower_bound > block_max(m_blocks - 1)) { @@ -142,8 +139,7 @@ struct block_posting_list { } } - void PISA_ALWAYSINLINE move(uint64_t pos) - { + void PISA_ALWAYSINLINE move(uint64_t pos) { assert(pos >= position()); uint64_t block = pos / BlockCodec::block_size; if (PISA_UNLIKELY(block != m_cur_block)) { @@ -156,8 +152,7 @@ struct block_posting_list { uint64_t docid() const { return m_cur_docid; } - uint64_t PISA_ALWAYSINLINE freq() - { + uint64_t PISA_ALWAYSINLINE freq() { if (!m_freqs_decoded) { decode_freqs_block(); } @@ -170,8 +165,7 @@ struct block_posting_list { uint64_t num_blocks() const { return m_blocks; } - uint64_t stats_freqs_size() const - { + uint64_t stats_freqs_size() const { // XXX rewrite in terms of get_blocks() uint64_t bytes = 0; uint8_t const* ptr = m_blocks_data; @@ -183,7 +177,8 @@ struct block_posting_list { uint32_t cur_base = (b != 0U ? block_max(b - 1) : uint32_t(-1)) + 1; uint8_t const* freq_ptr = BlockCodec::decode( - ptr, buf.data(), block_max(b) - cur_base - (cur_block_size - 1), cur_block_size); + ptr, buf.data(), block_max(b) - cur_base - (cur_block_size - 1), cur_block_size + ); ptr = BlockCodec::decode(freq_ptr, buf.data(), uint32_t(-1), cur_block_size); bytes += ptr - freq_ptr; } @@ -197,24 +192,20 @@ struct block_posting_list { uint32_t size; uint32_t doc_gaps_universe; - void append_docs_block(std::vector& out) const - { + void append_docs_block(std::vector& out) const { out.insert(out.end(), docs_begin, freqs_begin); } - void append_freqs_block(std::vector& out) const - { + void append_freqs_block(std::vector& out) const { out.insert(out.end(), freqs_begin, end); } - void decode_doc_gaps(std::vector& out) const - { + void decode_doc_gaps(std::vector& out) const { out.resize(size); BlockCodec::decode(docs_begin, out.data(), doc_gaps_universe, size); } - void decode_freqs(std::vector& out) const - { + void decode_freqs(std::vector& out) const { out.resize(size); BlockCodec::decode(freqs_begin, out.data(), uint32_t(-1), size); } @@ -227,8 +218,7 @@ struct block_posting_list { uint8_t const* end; }; - std::vector get_blocks() - { + std::vector get_blocks() { std::vector blocks; uint8_t const* ptr = m_blocks_data; @@ -262,8 +252,7 @@ struct block_posting_list { private: uint32_t block_max(uint32_t block) const { return ((uint32_t const*)m_block_maxs)[block]; } - void PISA_NOINLINE decode_docs_block(uint64_t block) - { + void PISA_NOINLINE decode_docs_block(uint64_t block) { static const uint64_t block_size = BlockCodec::block_size; uint32_t endpoint = block != 0U ? ((uint32_t const*)m_block_endpoints)[block - 1] : 0; uint8_t const* block_data = m_blocks_data + endpoint; @@ -275,7 +264,8 @@ struct block_posting_list { block_data, m_docs_buf.data(), m_cur_block_max - cur_base - (m_cur_block_size - 1), - m_cur_block_size); + m_cur_block_size + ); intrinsics::prefetch(m_freqs_block_data); m_docs_buf[0] += cur_base; @@ -289,10 +279,10 @@ struct block_posting_list { } } - void PISA_NOINLINE decode_freqs_block() - { + void PISA_NOINLINE decode_freqs_block() { uint8_t const* next_block = BlockCodec::decode( - m_freqs_block_data, m_freqs_buf.data(), uint32_t(-1), m_cur_block_size); + m_freqs_block_data, m_freqs_buf.data(), uint32_t(-1), m_cur_block_size + ); intrinsics::prefetch(next_block); m_freqs_decoded = true; diff --git a/include/pisa/codec/all_ones_sequence.hpp b/include/pisa/codec/all_ones_sequence.hpp index 16ff8f22..aa1a610b 100644 --- a/include/pisa/codec/all_ones_sequence.hpp +++ b/include/pisa/codec/all_ones_sequence.hpp @@ -7,15 +7,14 @@ namespace pisa { struct all_ones_sequence { - inline static uint64_t bitsize(global_parameters const& /* params */, uint64_t universe, uint64_t n) - { + inline static uint64_t + bitsize(global_parameters const& /* params */, uint64_t universe, uint64_t n) { return (universe == n) ? 0 : uint64_t(-1); } template static void - write(bit_vector_builder&, Iterator, uint64_t universe, uint64_t n, global_parameters const&) - { + write(bit_vector_builder&, Iterator, uint64_t universe, uint64_t n, global_parameters const&) { assert(universe == n); (void)universe; (void)n; @@ -26,36 +25,31 @@ struct all_ones_sequence { using value_type = std::pair; // (position, value) enumerator(bit_vector const&, uint64_t, uint64_t universe, uint64_t n, global_parameters const&) - : m_universe(universe), m_position(size()) - { + : m_universe(universe), m_position(size()) { assert(universe == n); (void)n; } - value_type move(uint64_t position) - { + value_type move(uint64_t position) { assert(position <= size()); m_position = position; return value_type(m_position, m_position); } - value_type next_geq(uint64_t lower_bound) - { + value_type next_geq(uint64_t lower_bound) { assert(lower_bound <= size()); m_position = lower_bound; return value_type(m_position, m_position); } - value_type next() - { + value_type next() { m_position += 1; return value_type(m_position, m_position); } uint64_t size() const { return m_universe; } - uint64_t prev_value() const - { + uint64_t prev_value() const { if (m_position == 0) { return 0; } diff --git a/include/pisa/codec/block_codecs.hpp b/include/pisa/codec/block_codecs.hpp index 301db855..04a8a5b4 100644 --- a/include/pisa/codec/block_codecs.hpp +++ b/include/pisa/codec/block_codecs.hpp @@ -19,19 +19,16 @@ namespace pisa { class TightVariableByte { public: template - static uint8_t extract7bits(const uint32_t val) - { + static uint8_t extract7bits(const uint32_t val) { return static_cast((val >> (7 * i)) & ((1U << 7) - 1)); } template - static uint8_t extract7bitsmaskless(const uint32_t val) - { + static uint8_t extract7bitsmaskless(const uint32_t val) { return static_cast((val >> (7 * i))); } - static void encode(const uint32_t* in, const size_t length, uint8_t* out, size_t& nvalue) - { + static void encode(const uint32_t* in, const size_t length, uint8_t* out, size_t& nvalue) { uint8_t* bout = out; for (size_t k = 0; k < length; ++k) { const uint32_t val(in[k]); @@ -79,16 +76,14 @@ class TightVariableByte { nvalue = bout - out; } - static void encode_single(uint32_t val, std::vector& out) - { + static void encode_single(uint32_t val, std::vector& out) { uint8_t buf[5]; size_t nvalue; encode(&val, 1, buf, nvalue); out.insert(out.end(), buf, buf + nvalue); } - static uint8_t const* decode(const uint8_t* in, uint32_t* out, size_t n) - { + static uint8_t const* decode(const uint8_t* in, uint32_t* out, size_t n) { const uint8_t* inbyte = in; for (size_t i = 0; i < n; ++i) { unsigned int shift = 0; @@ -104,8 +99,7 @@ class TightVariableByte { return inbyte; } - static void decode(const uint8_t* in, uint32_t* out, size_t len, size_t& n) - { + static void decode(const uint8_t* in, uint32_t* out, size_t len, size_t& n) { const uint8_t* inbyte = in; while (inbyte < in + len) { unsigned int shift = 0; @@ -125,8 +119,8 @@ class TightVariableByte { struct interpolative_block { static constexpr std::uint64_t block_size = 128; - static void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out) - { + static void + encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out) { assert(n <= block_size); thread_local std::array inbuf{}; thread_local std::vector outbuf; // TODO: Can we use array? How long does it need @@ -148,8 +142,7 @@ struct interpolative_block { } static uint8_t const* PISA_NOINLINE - decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) - { + decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) { assert(n <= block_size); if (sum_of_values == std::numeric_limits::max()) { in = TightVariableByte::decode(in, &sum_of_values, 1); @@ -174,8 +167,7 @@ struct optpfor_block { struct codec_type: FastPForLib::OPTPFor<4, FastPForLib::Simple16> { uint8_t const* force_b{nullptr}; - uint32_t findBestB(const uint32_t* in, uint32_t len) - { + uint32_t findBestB(const uint32_t* in, uint32_t len) { // trick to force the choice of b from a parameter if (force_b != nullptr) { return *force_b; @@ -213,7 +205,8 @@ struct optpfor_block { uint32_t sum_of_values, size_t n, std::vector& out, - uint8_t const* b = nullptr) // if non-null forces b + uint8_t const* b = nullptr + ) // if non-null forces b { thread_local codec_type optpfor_codec; thread_local std::array buf{}; @@ -233,8 +226,7 @@ struct optpfor_block { } static uint8_t const* PISA_NOINLINE - decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) - { + decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) { thread_local codec_type optpfor_codec; // pfor decoding is *not* thread-safe assert(n <= block_size); @@ -246,7 +238,8 @@ struct optpfor_block { uint8_t const* ret; ret = reinterpret_cast( - optpfor_codec.decodeBlock(reinterpret_cast(in), out, out_len)); + optpfor_codec.decodeBlock(reinterpret_cast(in), out, out_len) + ); assert(out_len == n); return ret; } @@ -260,8 +253,7 @@ struct varint_G8IU_block { // size is known rather than the input // the buffers pointed by src and dst must be respectively at least // 9 and 8 elements large - uint32_t decodeBlock(uint8_t const*& src, uint32_t* dst) const - { + uint32_t decodeBlock(uint8_t const*& src, uint32_t* dst) const { uint8_t desc = *src; src += 1; const __m128i data = _mm_lddqu_si128(reinterpret_cast<__m128i const*>(src)); @@ -275,16 +267,17 @@ struct varint_G8IU_block { _mm_shuffle_epi8(data, vecmask[desc][1]); //__builtin_ia32_pshufb128(data, // shf2); _mm_storeu_si128( - reinterpret_cast<__m128i*>(dst + 4), result2); //__builtin_ia32_storedqu(dst - //+ (16), result2); + reinterpret_cast<__m128i*>(dst + 4), result2 + ); //__builtin_ia32_storedqu(dst + //+ (16), result2); } return readSize; } }; - static void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out) - { + static void + encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out) { thread_local codec_type varint_codec; thread_local std::array buf{}; assert(n <= block_size); @@ -309,8 +302,7 @@ struct varint_G8IU_block { } // we only allow varint to be inlined (others have PISA_NOILINE) - static uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) - { + static uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) { static codec_type varint_codec; // decodeBlock is thread-safe assert(n <= block_size); diff --git a/include/pisa/codec/compact_elias_fano.hpp b/include/pisa/codec/compact_elias_fano.hpp index eb8fcea8..6bd99ce8 100644 --- a/include/pisa/codec/compact_elias_fano.hpp +++ b/include/pisa/codec/compact_elias_fano.hpp @@ -12,8 +12,7 @@ namespace pisa { -[[nodiscard]] constexpr auto positive(std::uint64_t n) -> std::uint64_t -{ +[[nodiscard]] constexpr auto positive(std::uint64_t n) -> std::uint64_t { if (n == 0) { throw std::logic_error("argument must be positive"); } @@ -40,8 +39,7 @@ struct compact_elias_fano { pointers1_offset(pointers0_offset + pointers0 * pointer_size), higher_bits_offset(pointers1_offset + pointers1 * pointer_size), lower_bits_offset(higher_bits_offset + higher_bits_length), - end(lower_bits_offset + n * lower_bits) - {} + end(lower_bits_offset + n * lower_bits) {} uint64_t universe; uint64_t n; @@ -63,8 +61,7 @@ struct compact_elias_fano { }; static PISA_FLATTEN_FUNC uint64_t - bitsize(global_parameters const& params, uint64_t universe, uint64_t n) - { + bitsize(global_parameters const& params, uint64_t universe, uint64_t n) { return offsets(0, universe, n, params).end; } @@ -74,8 +71,8 @@ struct compact_elias_fano { Iterator begin, uint64_t universe, uint64_t n, - global_parameters const& params) - { + global_parameters const& params + ) { uint64_t base_offset = bvb.size(); offsets of(base_offset, universe, n, params); // initialize all the bits to 0 @@ -151,12 +148,14 @@ struct compact_elias_fano { uint64_t offset, uint64_t universe, uint64_t n, - global_parameters const& params) - : m_bv(&bv), m_of(offset, universe, n, params), m_position(size()), m_value(m_of.universe) - {} - - value_type move(uint64_t position) - { + global_parameters const& params + ) + : m_bv(&bv), + m_of(offset, universe, n, params), + m_position(size()), + m_value(m_of.universe) {} + + value_type move(uint64_t position) { assert(position <= m_of.n); if (position == m_position) { @@ -185,8 +184,7 @@ struct compact_elias_fano { return slow_move(position); } - value_type next_geq(uint64_t lower_bound) - { + value_type next_geq(uint64_t lower_bound) { if (lower_bound == m_value) { return value(); } @@ -218,8 +216,7 @@ struct compact_elias_fano { uint64_t size() const { return m_of.n; } - value_type next() - { + value_type next() { m_position += 1; assert(m_position <= size()); @@ -231,8 +228,7 @@ struct compact_elias_fano { return value(); } - uint64_t prev_value() const - { + uint64_t prev_value() const { if (m_position == 0) { return 0; } @@ -256,8 +252,7 @@ struct compact_elias_fano { inline value_type value() const { return value_type(m_position, m_value); } private: - value_type PISA_NOINLINE slow_move(uint64_t position) - { + value_type PISA_NOINLINE slow_move(uint64_t position) { if (PISA_UNLIKELY(position == size())) { m_position = position; m_value = m_of.universe; @@ -283,8 +278,7 @@ struct compact_elias_fano { return value(); } - value_type PISA_NOINLINE slow_next_geq(uint64_t lower_bound) - { + value_type PISA_NOINLINE slow_next_geq(uint64_t lower_bound) { if (PISA_UNLIKELY(lower_bound >= m_of.universe)) { return move(size()); } @@ -330,14 +324,12 @@ struct compact_elias_fano { static const uint64_t linear_scan_threshold = 8; - inline uint64_t read_low() - { + inline uint64_t read_low() { return m_bv->get_word56(m_of.lower_bits_offset + m_position * m_of.lower_bits) & m_of.mask; } - inline uint64_t read_next() - { + inline uint64_t read_next() { assert(m_position < size()); uint64_t high = m_high_enumerator.next() - m_of.higher_bits_offset; return ((high - m_position - 1) << m_of.lower_bits) | read_low(); @@ -351,16 +343,14 @@ struct compact_elias_fano { lower_bits(e.m_of.lower_bits), lower_base(e.m_of.lower_bits_offset + position * lower_bits), mask(e.m_of.mask), - bv(*e.m_bv) - {} + bv(*e.m_bv) {} next_reader(next_reader const&) = delete; next_reader(next_reader&&) = delete; next_reader& operator=(next_reader const&) = delete; next_reader& operator=(next_reader&&) = delete; ~next_reader() { e.m_high_enumerator = high_enumerator; } - uint64_t operator()() - { + uint64_t operator()() { uint64_t high = high_enumerator.next() - high_base; uint64_t low = bv.get_word56(lower_base) & mask; high_base += 1; @@ -374,8 +364,7 @@ struct compact_elias_fano { bit_vector const& bv; }; - inline uint64_t pointer(uint64_t offset, uint64_t i) const - { + inline uint64_t pointer(uint64_t offset, uint64_t i) const { if (i == 0) { return 0; } diff --git a/include/pisa/codec/compact_ranked_bitvector.hpp b/include/pisa/codec/compact_ranked_bitvector.hpp index 1aac00e3..c06c8f9b 100644 --- a/include/pisa/codec/compact_ranked_bitvector.hpp +++ b/include/pisa/codec/compact_ranked_bitvector.hpp @@ -30,8 +30,7 @@ struct compact_ranked_bitvector { rank1_samples_offset(base_offset), pointers1_offset(rank1_samples_offset + rank1_samples * rank1_sample_size), bits_offset(pointers1_offset + pointers1 * pointer_size), - end(bits_offset + universe) - {} + end(bits_offset + universe) {} uint64_t universe; uint64_t n; @@ -51,8 +50,7 @@ struct compact_ranked_bitvector { }; static PISA_FLATTEN_FUNC uint64_t - bitsize(global_parameters const& params, uint64_t universe, uint64_t n) - { + bitsize(global_parameters const& params, uint64_t universe, uint64_t n) { return offsets(0, universe, n, params).end; } @@ -62,8 +60,8 @@ struct compact_ranked_bitvector { Iterator begin, uint64_t universe, uint64_t n, - global_parameters const& params) - { + global_parameters const& params + ) { uint64_t base_offset = bvb.size(); offsets of(base_offset, universe, n, params); // initialize all the bits to 0 @@ -125,12 +123,14 @@ struct compact_ranked_bitvector { uint64_t offset, uint64_t universe, uint64_t n, - global_parameters const& params) - : m_bv(&bv), m_of(offset, universe, n, params), m_position(size()), m_value(m_of.universe) - {} - - value_type move(uint64_t position) - { + global_parameters const& params + ) + : m_bv(&bv), + m_of(offset, universe, n, params), + m_position(size()), + m_value(m_of.universe) {} + + value_type move(uint64_t position) { assert(position <= size()); if (position == m_position) { @@ -158,8 +158,7 @@ struct compact_ranked_bitvector { return slow_move(position); } - value_type next_geq(uint64_t lower_bound) - { + value_type next_geq(uint64_t lower_bound) { if (lower_bound == m_value) { return value(); } @@ -187,8 +186,7 @@ struct compact_ranked_bitvector { return slow_next_geq(lower_bound); } - value_type next() - { + value_type next() { m_position += 1; assert(m_position <= size()); @@ -202,8 +200,7 @@ struct compact_ranked_bitvector { uint64_t size() const { return m_of.n; } - uint64_t prev_value() const - { + uint64_t prev_value() const { if (m_position == 0) { return 0; } @@ -219,8 +216,7 @@ struct compact_ranked_bitvector { } private: - value_type PISA_NOINLINE slow_move(uint64_t position) - { + value_type PISA_NOINLINE slow_move(uint64_t position) { uint64_t skip = position - m_position; if (PISA_UNLIKELY(position == size())) { m_position = position; @@ -246,8 +242,7 @@ struct compact_ranked_bitvector { return value(); } - value_type PISA_NOINLINE slow_next_geq(uint64_t lower_bound) - { + value_type PISA_NOINLINE slow_next_geq(uint64_t lower_bound) { using broadword::popcount; if (PISA_UNLIKELY(lower_bound >= m_of.universe)) { @@ -297,21 +292,18 @@ struct compact_ranked_bitvector { inline uint64_t read_next() { return m_enumerator.next() - m_of.bits_offset; } - inline uint64_t pointer(uint64_t offset, uint64_t i, uint64_t size) const - { + inline uint64_t pointer(uint64_t offset, uint64_t i, uint64_t size) const { if (i == 0) { return 0; } return m_bv->get_word56(offset + (i - 1) * size) & ((uint64_t(1) << size) - 1); } - inline uint64_t pointer1(uint64_t i) const - { + inline uint64_t pointer1(uint64_t i) const { return pointer(m_of.pointers1_offset, i, m_of.pointer_size); } - inline uint64_t rank1_sample(uint64_t i) const - { + inline uint64_t rank1_sample(uint64_t i) const { return pointer(m_of.rank1_samples_offset, i, m_of.rank1_sample_size); } diff --git a/include/pisa/codec/integer_codes.hpp b/include/pisa/codec/integer_codes.hpp index d6fe793f..7690a94b 100644 --- a/include/pisa/codec/integer_codes.hpp +++ b/include/pisa/codec/integer_codes.hpp @@ -6,8 +6,7 @@ namespace pisa { // note: n can be 0 -inline void write_gamma(bit_vector_builder& bvb, uint64_t n) -{ +inline void write_gamma(bit_vector_builder& bvb, uint64_t n) { uint64_t nn = n + 1; uint64_t l = broadword::msb(nn); uint64_t hb = uint64_t(1) << l; @@ -15,26 +14,22 @@ inline void write_gamma(bit_vector_builder& bvb, uint64_t n) bvb.append_bits(nn ^ hb, l); } -inline void write_gamma_nonzero(bit_vector_builder& bvb, uint64_t n) -{ +inline void write_gamma_nonzero(bit_vector_builder& bvb, uint64_t n) { assert(n > 0); write_gamma(bvb, n - 1); } -inline uint64_t read_gamma(bit_vector::enumerator& it) -{ +inline uint64_t read_gamma(bit_vector::enumerator& it) { uint64_t l = it.skip_zeros(); assert(l < 64); return (it.take(l) | (uint64_t(1) << l)) - 1; } -inline uint64_t read_gamma_nonzero(bit_vector::enumerator& it) -{ +inline uint64_t read_gamma_nonzero(bit_vector::enumerator& it) { return read_gamma(it) + 1; } -inline void write_delta(bit_vector_builder& bvb, uint64_t n) -{ +inline void write_delta(bit_vector_builder& bvb, uint64_t n) { uint64_t nn = n + 1; uint64_t l = broadword::msb(nn); uint64_t hb = uint64_t(1) << l; @@ -42,8 +37,7 @@ inline void write_delta(bit_vector_builder& bvb, uint64_t n) bvb.append_bits(nn ^ hb, l); } -inline uint64_t read_delta(bit_vector::enumerator& it) -{ +inline uint64_t read_delta(bit_vector::enumerator& it) { uint64_t l = read_gamma(it); return (it.take(l) | (uint64_t(1) << l)) - 1; } diff --git a/include/pisa/codec/interpolative_coding.hpp b/include/pisa/codec/interpolative_coding.hpp index e179e3a8..ba0d2820 100644 --- a/include/pisa/codec/interpolative_coding.hpp +++ b/include/pisa/codec/interpolative_coding.hpp @@ -9,13 +9,11 @@ namespace pisa { class bit_writer { public: - explicit bit_writer(std::vector& buf) : m_buf(buf), m_size(0), m_cur_word(nullptr) - { + explicit bit_writer(std::vector& buf) : m_buf(buf), m_size(0), m_cur_word(nullptr) { m_buf.clear(); } - void write(uint32_t bits, uint32_t len) - { + void write(uint32_t bits, uint32_t len) { if (len == 0U) { return; } @@ -34,8 +32,7 @@ class bit_writer { size_t size() const { return m_size; } - void write_int(uint32_t val, uint32_t u) - { + void write_int(uint32_t val, uint32_t u) { assert(u > 0); assert(val < u); auto b = broadword::msb(u); @@ -51,8 +48,7 @@ class bit_writer { } } - void write_interpolative(uint32_t const* in, size_t n, uint32_t low, uint32_t high) - { + void write_interpolative(uint32_t const* in, size_t n, uint32_t low, uint32_t high) { if (n == 0U) { return; } @@ -77,8 +73,7 @@ class bit_reader { size_t position() const { return m_pos; } - uint32_t read(uint32_t len) - { + uint32_t read(uint32_t len) { if (len == 0U) { return 0; } @@ -96,8 +91,7 @@ class bit_reader { return val; } - uint32_t read_int(uint32_t u) - { + uint32_t read_int(uint32_t u) { assert(u > 0); auto b = broadword::msb(u); uint64_t m = (uint64_t(1) << (b + 1)) - u; @@ -111,8 +105,7 @@ class bit_reader { return val; } - void read_interpolative(uint32_t* out, size_t n, uint32_t low, uint32_t high) - { + void read_interpolative(uint32_t* out, size_t n, uint32_t low, uint32_t high) { assert(low <= high); assert(n > 0); diff --git a/include/pisa/codec/maskedvbyte.hpp b/include/pisa/codec/maskedvbyte.hpp index 1c535eba..05e9e587 100644 --- a/include/pisa/codec/maskedvbyte.hpp +++ b/include/pisa/codec/maskedvbyte.hpp @@ -10,8 +10,8 @@ namespace pisa { struct maskedvbyte_block { static constexpr std::uint64_t block_size = 128; - static void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out) - { + static void + encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out) { assert(n <= block_size); auto* src = const_cast(in); if (n < block_size) { @@ -22,8 +22,7 @@ struct maskedvbyte_block { size_t out_len = vbyte_encode(src, n, buf.data()); out.insert(out.end(), buf.data(), buf.data() + out_len); } - static uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) - { + static uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) { assert(n <= block_size); if (PISA_UNLIKELY(n < block_size)) { return interpolative_block::decode(in, out, sum_of_values, n); diff --git a/include/pisa/codec/qmx.hpp b/include/pisa/codec/qmx.hpp index 7be27318..16781be1 100644 --- a/include/pisa/codec/qmx.hpp +++ b/include/pisa/codec/qmx.hpp @@ -8,8 +8,8 @@ struct qmx_block { static constexpr std::uint64_t block_size = 128; static constexpr std::uint64_t overflow = 512; - static void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out) - { + static void + encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out) { assert(n <= block_size); auto* src = const_cast(in); if (n < block_size) { @@ -30,8 +30,7 @@ struct qmx_block { * This is NOT enforced by `encode`, because it would be very wasteful to add 15 bytes to each * block. Instead, the padding is added to the index, after all postings. */ - static uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) - { + static uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) { static QMX::compress_integer_qmx_improved qmx_codec; // decodeBlock is thread-safe assert(n <= block_size); if (PISA_UNLIKELY(n < block_size)) { diff --git a/include/pisa/codec/simdbp.hpp b/include/pisa/codec/simdbp.hpp index 42ba654a..a3955cbd 100644 --- a/include/pisa/codec/simdbp.hpp +++ b/include/pisa/codec/simdbp.hpp @@ -11,8 +11,8 @@ extern "C" { namespace pisa { struct simdbp_block { static const uint64_t block_size = 128; - static void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out) - { + static void + encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out) { assert(n <= block_size); auto* src = const_cast(in); if (n < block_size) { @@ -26,8 +26,7 @@ struct simdbp_block { simdpackwithoutmask(src, (__m128i*)buf_ptr, b); out.insert(out.end(), buf.data(), buf.data() + b * sizeof(__m128i) + 1); } - static uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) - { + static uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) { assert(n <= block_size); if (PISA_UNLIKELY(n < block_size)) { return interpolative_block::decode(in, out, sum_of_values, n); diff --git a/include/pisa/codec/simple16.hpp b/include/pisa/codec/simple16.hpp index 10d6c5a6..bd937e8c 100644 --- a/include/pisa/codec/simple16.hpp +++ b/include/pisa/codec/simple16.hpp @@ -9,8 +9,7 @@ struct simple16_block { static constexpr std::uint64_t block_size = 128; static void - encode(uint32_t const* in, uint32_t /* sum_of_values */, size_t n, std::vector& out) - { + encode(uint32_t const* in, uint32_t /* sum_of_values */, size_t n, std::vector& out) { assert(n <= block_size); thread_local FastPForLib::Simple16 codec; thread_local std::array buf{}; @@ -21,14 +20,14 @@ struct simple16_block { } static uint8_t const* - decode(uint8_t const* in, uint32_t* out, uint32_t /* sum_of_values */, size_t n) - { + decode(uint8_t const* in, uint32_t* out, uint32_t /* sum_of_values */, size_t n) { assert(n <= block_size); FastPForLib::Simple16 codec; std::array buf{}; auto const* ret = reinterpret_cast( - codec.decodeArray(reinterpret_cast(in), 8 * n, buf.data(), n)); + codec.decodeArray(reinterpret_cast(in), 8 * n, buf.data(), n) + ); std::copy(buf.begin(), std::next(buf.begin(), n), out); return ret; diff --git a/include/pisa/codec/simple8b.hpp b/include/pisa/codec/simple8b.hpp index 71576413..5928d5da 100644 --- a/include/pisa/codec/simple8b.hpp +++ b/include/pisa/codec/simple8b.hpp @@ -9,8 +9,7 @@ struct simple8b_block { static constexpr std::uint64_t block_size = 128; static void - encode(uint32_t const* in, uint32_t /* sum_of_values */, size_t n, std::vector& out) - { + encode(uint32_t const* in, uint32_t /* sum_of_values */, size_t n, std::vector& out) { assert(n <= block_size); thread_local FastPForLib::Simple8b codec; thread_local std::array buf{}; @@ -21,12 +20,12 @@ struct simple8b_block { } static uint8_t const* - decode(uint8_t const* in, uint32_t* out, uint32_t /* sum_of_values */, size_t n) - { + decode(uint8_t const* in, uint32_t* out, uint32_t /* sum_of_values */, size_t n) { assert(n <= block_size); FastPForLib::Simple8b codec; return reinterpret_cast( - codec.decodeArray(reinterpret_cast(in), 8 * n, out, n)); + codec.decodeArray(reinterpret_cast(in), 8 * n, out, n) + ); } }; } // namespace pisa diff --git a/include/pisa/codec/streamvbyte.hpp b/include/pisa/codec/streamvbyte.hpp index 4f8f8089..8cd41ac8 100644 --- a/include/pisa/codec/streamvbyte.hpp +++ b/include/pisa/codec/streamvbyte.hpp @@ -10,8 +10,7 @@ namespace pisa { // This is a constexpr version of the function in the streamvbyte library. -constexpr std::size_t streamvbyte_max_compressedbytes(std::uint32_t length) -{ +constexpr std::size_t streamvbyte_max_compressedbytes(std::uint32_t length) { // number of control bytes: size_t cb = (length + 3) / 4; // maximum number of control bytes: @@ -22,8 +21,7 @@ constexpr std::size_t streamvbyte_max_compressedbytes(std::uint32_t length) struct streamvbyte_block { static const uint64_t block_size = 128; static void - encode(uint32_t const* in, uint32_t /* sum_of_values */, size_t n, std::vector& out) - { + encode(uint32_t const* in, uint32_t /* sum_of_values */, size_t n, std::vector& out) { assert(n <= block_size); auto* src = const_cast(in); thread_local std::array buf{}; @@ -31,8 +29,7 @@ struct streamvbyte_block { out.insert(out.end(), buf.data(), buf.data() + out_len); } static uint8_t const* - decode(uint8_t const* in, uint32_t* out, uint32_t /* sum_of_values */, size_t n) - { + decode(uint8_t const* in, uint32_t* out, uint32_t /* sum_of_values */, size_t n) { assert(n <= block_size); auto read = streamvbyte_decode(in, out, n); return in + read; diff --git a/include/pisa/codec/strict_elias_fano.hpp b/include/pisa/codec/strict_elias_fano.hpp index 4d578d6c..f9c33758 100644 --- a/include/pisa/codec/strict_elias_fano.hpp +++ b/include/pisa/codec/strict_elias_fano.hpp @@ -9,8 +9,7 @@ namespace pisa { struct strict_elias_fano { static PISA_FLATTEN_FUNC uint64_t - bitsize(global_parameters const& params, uint64_t universe, uint64_t n) - { + bitsize(global_parameters const& params, uint64_t universe, uint64_t n) { assert(universe >= n); return compact_elias_fano::bitsize(params, universe - n + 1, n); } @@ -21,8 +20,8 @@ struct strict_elias_fano { Iterator begin, uint64_t universe, uint64_t n, - global_parameters const& params) - { + global_parameters const& params + ) { uint64_t new_universe = universe - n + 1; using value_type = typename std::iterator_traits::value_type; auto new_begin = make_function_iterator( @@ -31,7 +30,8 @@ struct strict_elias_fano { ++state.first; ++state.second; }, - +[](std::pair const& state) { return *state.second - state.first; }); + +[](std::pair const& state) { return *state.second - state.first; } + ); compact_elias_fano::write(bvb, new_begin, new_universe, n, params); } @@ -46,26 +46,23 @@ struct strict_elias_fano { uint64_t offset, uint64_t universe, uint64_t n, - global_parameters const& params) - : m_ef_enum(bv, offset, universe - n + 1, n, params) - {} + global_parameters const& params + ) + : m_ef_enum(bv, offset, universe - n + 1, n, params) {} - value_type move(uint64_t position) - { + value_type move(uint64_t position) { auto val = m_ef_enum.move(position); return value_type(val.first, val.second + val.first); } - value_type next() - { + value_type next() { auto val = m_ef_enum.next(); return value_type(val.first, val.second + val.first); } uint64_t size() const { return m_ef_enum.size(); } - uint64_t prev_value() const - { + uint64_t prev_value() const { if (m_ef_enum.position() != 0U) { return m_ef_enum.prev_value() + m_ef_enum.position() - 1; } diff --git a/include/pisa/codec/varintgb.hpp b/include/pisa/codec/varintgb.hpp index 34178f2e..6ebaee23 100644 --- a/include/pisa/codec/varintgb.hpp +++ b/include/pisa/codec/varintgb.hpp @@ -14,8 +14,7 @@ namespace pisa { template class VarIntGB { public: - size_t encodeArray(const uint32_t* in, const size_t length, uint8_t* out) - { + size_t encodeArray(const uint32_t* in, const size_t length, uint8_t* out) { uint32_t prev = 0; // for delta const uint8_t* const initbout = out; @@ -144,8 +143,7 @@ class VarIntGB { return storageinbytes; } - size_t decodeArray(const uint8_t* in, const size_t n, uint32_t* out) - { + size_t decodeArray(const uint8_t* in, const size_t n, uint32_t* out) { uint32_t prev = 0; // for delta const uint8_t* initin = in; uint32_t val; @@ -179,8 +177,7 @@ class VarIntGB { } protected: - const uint8_t* decodeGroupVarInt(const uint8_t* in, uint32_t* out) - { + const uint8_t* decodeGroupVarInt(const uint8_t* in, uint32_t* out) { const uint32_t sel = *in++; if (sel == 0) { @@ -205,8 +202,7 @@ class VarIntGB { return in; } - const uint8_t* decodeGroupVarIntDelta(const uint8_t* in, uint32_t* val, uint32_t* out) - { + const uint8_t* decodeGroupVarIntDelta(const uint8_t* in, uint32_t* val, uint32_t* out) { const uint32_t sel = *in++; if (sel == 0) { out[0] = (*val += static_cast(in[0])); @@ -238,8 +234,8 @@ class VarIntGB { struct varintgb_block { static const uint64_t block_size = 128; - static void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out) - { + static void + encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector& out) { thread_local VarIntGB varintgb_codec; assert(n <= block_size); if (n < block_size) { @@ -251,8 +247,7 @@ struct varintgb_block { out.insert(out.end(), buf.data(), buf.data() + out_len); } - static uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) - { + static uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) { thread_local VarIntGB varintgb_codec; assert(n <= block_size); if (PISA_UNLIKELY(n < block_size)) { diff --git a/include/pisa/compress.hpp b/include/pisa/compress.hpp index d2cc7c88..e8c21a00 100644 --- a/include/pisa/compress.hpp +++ b/include/pisa/compress.hpp @@ -15,6 +15,7 @@ void compress( std::string const& output_filename, ScorerParams const& scorer_params, std::optional quantization_bits, - bool check); + bool check +); } // namespace pisa diff --git a/include/pisa/concepts.hpp b/include/pisa/concepts.hpp index 288634de..ea0fc373 100644 --- a/include/pisa/concepts.hpp +++ b/include/pisa/concepts.hpp @@ -16,18 +16,16 @@ limitations under the License. */ #ifdef PISA_ENABLE_CONCEPTS -#include + #include -#define PISA_REQUIRES(x) \ - requires (x) + #define PISA_REQUIRES(x) requires(x) -#define PISA_ASSERT_CONCEPT(x) \ - static_assert(x) + #define PISA_ASSERT_CONCEPT(x) static_assert(x) #else -#define PISA_REQUIRES(x) /**/ + #define PISA_REQUIRES(x) /**/ -#define PISA_ASSERT_CONCEPT(x) /**/ + #define PISA_ASSERT_CONCEPT(x) /**/ #endif diff --git a/include/pisa/cursor/block_max_scored_cursor.hpp b/include/pisa/cursor/block_max_scored_cursor.hpp index 0dd3d7d4..a6a3eb15 100644 --- a/include/pisa/cursor/block_max_scored_cursor.hpp +++ b/include/pisa/cursor/block_max_scored_cursor.hpp @@ -19,23 +19,21 @@ class BlockMaxScoredCursor: public MaxScoredCursor { TermScorer term_scorer, float weight, float max_score, - typename Wand::wand_data_enumerator wdata) + typename Wand::wand_data_enumerator wdata + ) : MaxScoredCursor(std::move(cursor), std::move(term_scorer), weight, max_score), - m_wdata(std::move(wdata)) - {} + m_wdata(std::move(wdata)) {} BlockMaxScoredCursor(BlockMaxScoredCursor const&) = delete; BlockMaxScoredCursor(BlockMaxScoredCursor&&) = default; BlockMaxScoredCursor& operator=(BlockMaxScoredCursor const&) = delete; BlockMaxScoredCursor& operator=(BlockMaxScoredCursor&&) = default; ~BlockMaxScoredCursor() = default; - [[nodiscard]] PISA_ALWAYSINLINE auto block_max_score() -> float - { + [[nodiscard]] PISA_ALWAYSINLINE auto block_max_score() -> float { return m_wdata.score() * this->query_weight(); } - [[nodiscard]] PISA_ALWAYSINLINE auto block_max_docid() -> std::uint32_t - { + [[nodiscard]] PISA_ALWAYSINLINE auto block_max_docid() -> std::uint32_t { return m_wdata.docid(); } @@ -47,15 +45,18 @@ class BlockMaxScoredCursor: public MaxScoredCursor { template [[nodiscard]] auto make_block_max_scored_cursors( - Index const& index, WandType const& wdata, Scorer const& scorer, Query query, bool weighted = false) -{ + Index const& index, WandType const& wdata, Scorer const& scorer, Query query, bool weighted = false +) { auto terms = query.terms; auto query_term_freqs = query_freqs(terms); std::vector> cursors; cursors.reserve(query_term_freqs.size()); std::transform( - query_term_freqs.begin(), query_term_freqs.end(), std::back_inserter(cursors), [&](auto&& term) { + query_term_freqs.begin(), + query_term_freqs.end(), + std::back_inserter(cursors), + [&](auto&& term) { auto term_weight = 1.0F; auto term_id = term.first; auto max_weight = wdata.max_term_weight(term_id); @@ -66,19 +67,19 @@ template return BlockMaxScoredCursor( index[term_id], [scorer = scorer.term_scorer(term_id), weight = term_weight]( - uint32_t doc, uint32_t freq) { return weight * scorer(doc, freq); }, + uint32_t doc, uint32_t freq + ) { return weight * scorer(doc, freq); }, term_weight, max_weight, - wdata.getenum(term_id)); + wdata.getenum(term_id) + ); } return BlockMaxScoredCursor( - index[term_id], - scorer.term_scorer(term_id), - term_weight, - max_weight, - wdata.getenum(term_id)); - }); + index[term_id], scorer.term_scorer(term_id), term_weight, max_weight, wdata.getenum(term_id) + ); + } + ); return cursors; } diff --git a/include/pisa/cursor/cursor.hpp b/include/pisa/cursor/cursor.hpp index c27c6772..7703a268 100644 --- a/include/pisa/cursor/cursor.hpp +++ b/include/pisa/cursor/cursor.hpp @@ -6,8 +6,7 @@ namespace pisa { template -[[nodiscard]] auto make_cursors(Index const& index, Query query) -{ +[[nodiscard]] auto make_cursors(Index const& index, Query query) { auto terms = query.terms; remove_duplicate_terms(terms); using cursor = typename Index::document_enumerator; diff --git a/include/pisa/cursor/max_scored_cursor.hpp b/include/pisa/cursor/max_scored_cursor.hpp index b6d653e3..efacf001 100644 --- a/include/pisa/cursor/max_scored_cursor.hpp +++ b/include/pisa/cursor/max_scored_cursor.hpp @@ -15,8 +15,7 @@ class MaxScoredCursor: public ScoredCursor { MaxScoredCursor(Cursor cursor, TermScorer term_scorer, float query_weight, float max_score) : ScoredCursor(std::move(cursor), std::move(term_scorer), query_weight), - m_max_score(max_score) - {} + m_max_score(max_score) {} MaxScoredCursor(MaxScoredCursor const&) = delete; MaxScoredCursor(MaxScoredCursor&&) = default; MaxScoredCursor& operator=(MaxScoredCursor const&) = delete; @@ -31,15 +30,18 @@ class MaxScoredCursor: public ScoredCursor { template [[nodiscard]] auto make_max_scored_cursors( - Index const& index, WandType const& wdata, Scorer const& scorer, Query query, bool weighted = false) -{ + Index const& index, WandType const& wdata, Scorer const& scorer, Query query, bool weighted = false +) { auto terms = query.terms; auto query_term_freqs = query_freqs(terms); std::vector> cursors; cursors.reserve(query_term_freqs.size()); std::transform( - query_term_freqs.begin(), query_term_freqs.end(), std::back_inserter(cursors), [&](auto&& term) { + query_term_freqs.begin(), + query_term_freqs.end(), + std::back_inserter(cursors), + [&](auto&& term) { auto term_weight = 1.0F; auto term_id = term.first; auto max_weight = wdata.max_term_weight(term_id); @@ -50,14 +52,18 @@ template return MaxScoredCursor( index[term_id], [scorer = scorer.term_scorer(term_id), weight = term_weight]( - uint32_t doc, uint32_t freq) { return weight * scorer(doc, freq); }, + uint32_t doc, uint32_t freq + ) { return weight * scorer(doc, freq); }, term_weight, - max_weight); + max_weight + ); } return MaxScoredCursor( - index[term_id], scorer.term_scorer(term_id), term_weight, max_weight); - }); + index[term_id], scorer.term_scorer(term_id), term_weight, max_weight + ); + } + ); return cursors; } diff --git a/include/pisa/cursor/scored_cursor.hpp b/include/pisa/cursor/scored_cursor.hpp index afe24699..b0671338 100644 --- a/include/pisa/cursor/scored_cursor.hpp +++ b/include/pisa/cursor/scored_cursor.hpp @@ -16,20 +16,17 @@ class ScoredCursor { ScoredCursor(Cursor cursor, TermScorer term_scorer, float query_weight) : m_base_cursor(std::move(cursor)), m_term_scorer(std::move(term_scorer)), - m_query_weight(query_weight) - {} + m_query_weight(query_weight) {} ScoredCursor(ScoredCursor const&) = delete; ScoredCursor(ScoredCursor&&) = default; ScoredCursor& operator=(ScoredCursor const&) = delete; ScoredCursor& operator=(ScoredCursor&&) = default; ~ScoredCursor() = default; - [[nodiscard]] PISA_ALWAYSINLINE auto query_weight() const noexcept -> float - { + [[nodiscard]] PISA_ALWAYSINLINE auto query_weight() const noexcept -> float { return m_query_weight; } - [[nodiscard]] PISA_ALWAYSINLINE auto docid() const -> std::uint32_t - { + [[nodiscard]] PISA_ALWAYSINLINE auto docid() const -> std::uint32_t { return m_base_cursor.docid(); } [[nodiscard]] PISA_ALWAYSINLINE auto freq() -> std::uint32_t { return m_base_cursor.freq(); } @@ -46,15 +43,17 @@ class ScoredCursor { template [[nodiscard]] auto -make_scored_cursors(Index const& index, Scorer const& scorer, Query query, bool weighted = false) -{ +make_scored_cursors(Index const& index, Scorer const& scorer, Query query, bool weighted = false) { auto terms = query.terms; auto query_term_freqs = query_freqs(terms); std::vector> cursors; cursors.reserve(query_term_freqs.size()); std::transform( - query_term_freqs.begin(), query_term_freqs.end(), std::back_inserter(cursors), [&](auto&& term) { + query_term_freqs.begin(), + query_term_freqs.end(), + std::back_inserter(cursors), + [&](auto&& term) { auto term_weight = 1.0F; auto term_id = term.first; @@ -63,12 +62,16 @@ make_scored_cursors(Index const& index, Scorer const& scorer, Query query, bool return ScoredCursor( index[term_id], [scorer = scorer.term_scorer(term_id), weight = term_weight]( - uint32_t doc, uint32_t freq) { return weight * scorer(doc, freq); }, - term_weight); + uint32_t doc, uint32_t freq + ) { return weight * scorer(doc, freq); }, + term_weight + ); } return ScoredCursor( - index[term_id], scorer.term_scorer(term_id), term_weight); - }); + index[term_id], scorer.term_scorer(term_id), term_weight + ); + } + ); return cursors; } diff --git a/include/pisa/dec_time_prediction.hpp b/include/pisa/dec_time_prediction.hpp index d304fddd..fd09c057 100644 --- a/include/pisa/dec_time_prediction.hpp +++ b/include/pisa/dec_time_prediction.hpp @@ -19,14 +19,12 @@ namespace pisa { namespace time_prediction { enum class feature_type { BOOST_PP_SEQ_ENUM(PISA_FEATURE_TYPES), end }; - inline feature_type parse_feature_type(std::string const& name) - { + inline feature_type parse_feature_type(std::string const& name) { if (false) { -#define LOOP_BODY(R, DATA, T) \ - } \ - else if (name == BOOST_PP_STRINGIZE(T)) \ - { \ - return feature_type::T; \ +#define LOOP_BODY(R, DATA, T) \ + } \ + else if (name == BOOST_PP_STRINGIZE(T)) { \ + return feature_type::T; \ /**/ BOOST_PP_SEQ_FOR_EACH(LOOP_BODY, _, PISA_FEATURE_TYPES); #undef LOOP_BODY @@ -35,8 +33,7 @@ namespace pisa { namespace time_prediction { } } - inline std::string feature_name(feature_type f) - { + inline std::string feature_name(feature_type f) { switch (f) { #define LOOP_BODY(R, DATA, T) \ case feature_type::T: \ @@ -56,8 +53,7 @@ namespace pisa { namespace time_prediction { float& operator[](feature_type f) { return m_features[(size_t)f]; } float const& operator[](feature_type f) const { return m_features[(size_t)f]; } - stats_line& dump(stats_line& sl) const - { + stats_line& dump(stats_line& sl) const { for (size_t i = 0; i < num_features; ++i) { auto ft = static_cast(i); sl(feature_name(ft), (*this)[ft]); @@ -73,8 +69,7 @@ namespace pisa { namespace time_prediction { public: predictor() = default; - explicit predictor(std::vector> const& values) - { + explicit predictor(std::vector> const& values) { for (auto const& kv: values) { if (kv.first == "bias") { bias() = kv.second; @@ -87,8 +82,7 @@ namespace pisa { namespace time_prediction { float& bias() { return m_bias; } float const& bias() const { return m_bias; } - float operator()(feature_vector const& f) const - { + float operator()(feature_vector const& f) const { float result = bias(); for (size_t i = 0; i < num_features; ++i) { auto ft = static_cast(i); @@ -101,8 +95,7 @@ namespace pisa { namespace time_prediction { float m_bias{0.0}; }; - inline void values_statistics(std::vector values, feature_vector& f) - { + inline void values_statistics(std::vector values, feature_vector& f) { std::sort(values.begin(), values.end()); f[feature_type::n] = values.size(); if (values.empty()) { @@ -141,8 +134,7 @@ namespace pisa { namespace time_prediction { } inline bool - read_block_stats(std::istream& is, uint32_t& list_id, std::vector& block_counts) - { + read_block_stats(std::istream& is, uint32_t& list_id, std::vector& block_counts) { thread_local std::string line; uint32_t count; block_counts.clear(); diff --git a/include/pisa/document_record.hpp b/include/pisa/document_record.hpp index e6ed99ca..8c363706 100644 --- a/include/pisa/document_record.hpp +++ b/include/pisa/document_record.hpp @@ -8,8 +8,7 @@ namespace pisa { struct Document_Record { Document_Record(std::string title, std::string content, std::string url) - : title_(std::move(title)), content_(std::move(content)), url_(std::move(url)) - {} + : title_(std::move(title)), content_(std::move(content)), url_(std::move(url)) {} [[nodiscard]] auto title() noexcept -> std::string& { return title_; } [[nodiscard]] auto title() const noexcept -> std::string const& { return title_; } [[nodiscard]] auto content() noexcept -> std::string& { return content_; } @@ -27,8 +26,7 @@ class Plaintext_Record { public: Plaintext_Record() = default; Plaintext_Record(std::string trecid, std::string content) - : m_trecid(std::move(trecid)), m_content(std::move(content)) - {} + : m_trecid(std::move(trecid)), m_content(std::move(content)) {} [[nodiscard]] auto content() -> std::string& { return m_content; } [[nodiscard]] auto content() const -> std::string const& { return m_content; } [[nodiscard]] auto trecid() -> std::string& { return m_trecid; } @@ -46,8 +44,7 @@ class Plaintext_Record { } // namespace pisa -inline auto operator>>(std::istream& is, pisa::Plaintext_Record& record) -> std::istream& -{ +inline auto operator>>(std::istream& is, pisa::Plaintext_Record& record) -> std::istream& { is >> record.trecid(); std::getline(is, record.content()); return is; diff --git a/include/pisa/ensure.hpp b/include/pisa/ensure.hpp index c2640349..19a7ff6e 100644 --- a/include/pisa/ensure.hpp +++ b/include/pisa/ensure.hpp @@ -15,23 +15,20 @@ class Ensure { explicit Ensure(bool condition) : m_condition(condition) {} template - PISA_ALWAYSINLINE auto or_throw(Error&& error) - { + PISA_ALWAYSINLINE auto or_throw(Error&& error) { if (not m_condition) { throw std::forward(error); } } template - PISA_ALWAYSINLINE auto or_else(Fn&& fn) - { + PISA_ALWAYSINLINE auto or_else(Fn&& fn) { if (not m_condition) { fn(); } } - PISA_ALWAYSINLINE auto or_panic(std::string_view error_msg) - { + PISA_ALWAYSINLINE auto or_panic(std::string_view error_msg) { if (not m_condition) { spdlog::error(error_msg); std::exit(EXIT_FAILURE); @@ -39,8 +36,7 @@ class Ensure { } template - PISA_ALWAYSINLINE auto or_panic_with(Fn&& fn) - { + PISA_ALWAYSINLINE auto or_panic_with(Fn&& fn) { if (not m_condition) { fn(); std::exit(EXIT_FAILURE); @@ -51,14 +47,13 @@ class Ensure { bool m_condition; }; -[[nodiscard]] inline auto ensure(bool condition) -> Ensure -{ +[[nodiscard]] inline auto ensure(bool condition) -> Ensure { return Ensure(condition); } template -[[nodiscard]] inline auto unwrap(std::optional value, const char* msg = "no value to unwrap") -> T -{ +[[nodiscard]] inline auto unwrap(std::optional value, const char* msg = "no value to unwrap") + -> T { if (!value.has_value()) { throw std::domain_error(msg); } diff --git a/include/pisa/filesystem.hpp b/include/pisa/filesystem.hpp index 2cb5ddd2..39ae88eb 100644 --- a/include/pisa/filesystem.hpp +++ b/include/pisa/filesystem.hpp @@ -7,8 +7,7 @@ namespace pisa { -[[nodiscard]] auto ls(std::filesystem::path dir, std::function predicate) -{ +[[nodiscard]] auto ls(std::filesystem::path dir, std::function predicate) { std::vector files; for (auto it = std::filesystem::directory_iterator(dir); it != std::filesystem::directory_iterator{}; diff --git a/include/pisa/forward_index.hpp b/include/pisa/forward_index.hpp index 4a4af032..ac1b8629 100644 --- a/include/pisa/forward_index.hpp +++ b/include/pisa/forward_index.hpp @@ -27,15 +27,13 @@ class forward_index: public std::vector> { : std::vector(document_count), m_term_count(term_count), m_term_counts(document_count), - m_compressed(compressed) - {} + m_compressed(compressed) {} const std::size_t& term_count() const { return m_term_count; } const std::size_t& term_count(id_type document) const { return m_term_counts[document]; } //! Compresses each document in `fwd` with a faster codec. - static forward_index read(const std::string& input_file) - { + static forward_index read(const std::string& input_file) { std::ifstream in(input_file.c_str()); bool compressed; size_t term_count, docs_count; @@ -54,8 +52,7 @@ class forward_index: public std::vector> { return fwd; } - static forward_index& compress(forward_index& fwd) - { + static forward_index& compress(forward_index& fwd) { progress p("Compressing forward index", fwd.size()); for (id_type doc = 0U; doc < fwd.size(); ++doc) { auto& encoded_terms = fwd[doc]; @@ -75,8 +72,7 @@ class forward_index: public std::vector> { } static forward_index - from_inverted_index(const std::string& input_basename, size_t min_len, bool use_compression) - { + from_inverted_index(const std::string& input_basename, size_t min_len, bool use_compression) { binary_collection coll((input_basename + ".docs").c_str()); auto firstseq = *coll.begin(); @@ -110,8 +106,7 @@ class forward_index: public std::vector> { return fwd; } - static void write(const forward_index& fwd, const std::string& output_file) - { + static void write(const forward_index& fwd, const std::string& output_file) { std::ofstream out(output_file.c_str()); size_t size = fwd.size(); out.write(reinterpret_cast(&fwd.m_compressed), sizeof(fwd.m_compressed)); @@ -120,16 +115,15 @@ class forward_index: public std::vector> { for (id_type doc = 0; doc < fwd.size(); ++doc) { size = fwd[doc].size(); out.write( - reinterpret_cast(&fwd.m_term_counts[doc]), - sizeof(fwd.m_term_counts[doc])); + reinterpret_cast(&fwd.m_term_counts[doc]), sizeof(fwd.m_term_counts[doc]) + ); out.write(reinterpret_cast(&size), sizeof(size)); out.write(reinterpret_cast(fwd[doc].data()), size); } } //! Decodes and returns the list of terms for a given document. - std::vector terms(id_type document) const - { + std::vector terms(id_type document) const { const entry_type& encoded_terms = (*this)[document]; std::vector terms; if (m_compressed) { diff --git a/include/pisa/forward_index_builder.hpp b/include/pisa/forward_index_builder.hpp index a3875bda..f98e6cc4 100644 --- a/include/pisa/forward_index_builder.hpp +++ b/include/pisa/forward_index_builder.hpp @@ -29,8 +29,7 @@ class Forward_Index_Builder { }; template - static std::ostream& write_document(std::ostream& os, Iterator first, Iterator last) - { + static std::ostream& write_document(std::ostream& os, Iterator first, Iterator last) { std::uint32_t length = std::distance(first, last); os.write(reinterpret_cast(&length), sizeof(length)); os.write(reinterpret_cast(&(*first)), length * sizeof(*first)); @@ -66,7 +65,8 @@ class Forward_Index_Builder { read_record_function_type next_record, std::shared_ptr text_analyzer, std::ptrdiff_t batch_size, - std::size_t threads) const; + std::size_t threads + ) const; /// Removes all intermediate batches. void remove_batches(std::string const& basename, std::ptrdiff_t batch_count) const; diff --git a/include/pisa/freq_index.hpp b/include/pisa/freq_index.hpp index ae228fe9..65877244 100644 --- a/include/pisa/freq_index.hpp +++ b/include/pisa/freq_index.hpp @@ -40,8 +40,7 @@ class freq_index { * throughout the life of the index. Once the source gets deallocated, * any index operations may result in undefined behavior. */ - explicit freq_index(MemorySource source) : m_source(std::move(source)) - { + explicit freq_index(MemorySource source) : m_source(std::move(source)) { mapper::map(*this, m_source.data(), mapper::map_flags::warmup); } @@ -59,8 +58,7 @@ class freq_index { : m_params(params), m_num_docs(num_docs), m_docs_sequences(params), - m_freqs_sequences(params) - {} + m_freqs_sequences(params) {} /** * Records a new posting list. @@ -76,8 +74,8 @@ class freq_index { */ template void add_posting_list( - uint64_t n, DocsIterator docs_begin, FreqsIterator freqs_begin, uint64_t occurrences) - { + uint64_t n, DocsIterator docs_begin, FreqsIterator freqs_begin, uint64_t occurrences + ) { if (!n) { throw std::invalid_argument("List must be nonempty"); } @@ -96,7 +94,8 @@ class freq_index { bit_vector_builder freqs_bits; FreqsSequence::write(freqs_bits, freqs_begin, occurrences + 1, n, m_params); m_freqs_sequences.append(freqs_bits); - }); + } + ); } /** @@ -104,8 +103,7 @@ class freq_index { * * \param sq Inverted index object that will take ownership of the data. */ - void build(freq_index& sq) - { + void build(freq_index& sq) { sq.m_num_docs = m_num_docs; sq.m_params = m_params; @@ -132,28 +130,24 @@ class freq_index { class document_enumerator { public: - void reset() - { + void reset() { m_cur_pos = 0; m_cur_docid = m_docs_enum.move(0).second; } - void PISA_FLATTEN_FUNC next() - { + void PISA_FLATTEN_FUNC next() { auto val = m_docs_enum.next(); m_cur_pos = val.first; m_cur_docid = val.second; } - void PISA_FLATTEN_FUNC next_geq(uint64_t lower_bound) - { + void PISA_FLATTEN_FUNC next_geq(uint64_t lower_bound) { auto val = m_docs_enum.next_geq(lower_bound); m_cur_pos = val.first; m_cur_docid = val.second; } - void PISA_FLATTEN_FUNC move(uint64_t position) - { + void PISA_FLATTEN_FUNC move(uint64_t position) { auto val = m_docs_enum.move(position); m_cur_pos = val.first; m_cur_docid = val.second; @@ -175,9 +169,9 @@ class freq_index { friend class freq_index; document_enumerator( - typename DocsSequence::enumerator docs_enum, typename FreqsSequence::enumerator freqs_enum) - : m_docs_enum(docs_enum), m_freqs_enum(freqs_enum) - { + typename DocsSequence::enumerator docs_enum, typename FreqsSequence::enumerator freqs_enum + ) + : m_docs_enum(docs_enum), m_freqs_enum(freqs_enum) { reset(); } @@ -197,11 +191,11 @@ class freq_index { * * \returns The cursor over the posting list. */ - [[nodiscard]] document_enumerator operator[](size_t term_id) const - { + [[nodiscard]] document_enumerator operator[](size_t term_id) const { if (term_id >= size()) { throw std::out_of_range( - fmt::format("given term ID ({}) is out of range, must be < {}", term_id, size())); + fmt::format("given term ID ({}) is out of range, must be < {}", term_id, size()) + ); } auto docs_it = m_docs_sequences.get(m_params, term_id); uint64_t occurrences = read_gamma_nonzero(docs_it); @@ -211,11 +205,13 @@ class freq_index { } typename DocsSequence::enumerator docs_enum( - m_docs_sequences.bits(), docs_it.position(), num_docs(), n, m_params); + m_docs_sequences.bits(), docs_it.position(), num_docs(), n, m_params + ); auto freqs_it = m_freqs_sequences.get(m_params, term_id); typename FreqsSequence::enumerator freqs_enum( - m_freqs_sequences.bits(), freqs_it.position(), occurrences + 1, n, m_params); + m_freqs_sequences.bits(), freqs_it.position(), occurrences + 1, n, m_params + ); return document_enumerator(docs_enum, freqs_enum); } @@ -223,8 +219,7 @@ class freq_index { /** * No-op. */ - void warmup(size_t /* i */) const - { + void warmup(size_t /* i */) const { // XXX implement this } @@ -233,8 +228,7 @@ class freq_index { /** * Swaps all data with another index. */ - void swap(freq_index& other) - { + void swap(freq_index& other) { std::swap(m_params, other.m_params); std::swap(m_num_docs, other.m_num_docs); m_docs_sequences.swap(other.m_docs_sequences); @@ -242,8 +236,7 @@ class freq_index { } template - void map(Visitor& visit) - { + void map(Visitor& visit) { visit(m_params, "m_params") // (m_num_docs, "m_num_docs") // (m_docs_sequences, "m_docs_sequences") // diff --git a/include/pisa/global_parameters.hpp b/include/pisa/global_parameters.hpp index 4a5febbc..25228aae 100644 --- a/include/pisa/global_parameters.hpp +++ b/include/pisa/global_parameters.hpp @@ -8,12 +8,10 @@ struct global_parameters { ef_log_sampling1(8), rb_log_rank1_sampling(9), rb_log_sampling1(8), - log_partition_size(7) - {} + log_partition_size(7) {} template - void map(Visitor& visit) - { + void map(Visitor& visit) { visit(ef_log_sampling0, "ef_log_sampling0")(ef_log_sampling1, "ef_log_sampling1")( rb_log_rank1_sampling, "rb_log_rank1_sampling")(rb_log_sampling1, "rb_log_sampling1")( log_partition_size, "log_partition_size"); diff --git a/include/pisa/index_types.hpp b/include/pisa/index_types.hpp index e199cd58..88bbd9bb 100644 --- a/include/pisa/index_types.hpp +++ b/include/pisa/index_types.hpp @@ -45,10 +45,9 @@ using block_simdbp_index = block_freq_index; } // namespace pisa -#define PISA_INDEX_TYPES \ - (ef)(single)(pefuniform)(pefopt)(block_optpfor)(block_varintg8iu)(block_streamvbyte)( \ - block_maskedvbyte)(block_interpolative)(block_qmx)(block_varintgb)(block_simple8b)( \ - block_simple16)(block_simdbp) -#define PISA_BLOCK_INDEX_TYPES \ - (block_optpfor)(block_varintg8iu)(block_streamvbyte)(block_maskedvbyte)(block_interpolative)( \ - block_qmx)(block_varintgb)(block_simple8b)(block_simple16)(block_simdbp) +#define PISA_INDEX_TYPES \ + (ef)(single)(pefuniform)(pefopt)(block_optpfor)(block_varintg8iu)(block_streamvbyte)(block_maskedvbyte)(block_interpolative)(block_qmx)(block_varintgb)(block_simple8b)(block_simple16)(block_simdbp \ + ) +#define PISA_BLOCK_INDEX_TYPES \ + (block_optpfor)(block_varintg8iu)(block_streamvbyte)(block_maskedvbyte)(block_interpolative)(block_qmx)(block_varintgb)(block_simple8b)(block_simple16)(block_simdbp \ + ) diff --git a/include/pisa/intersection.hpp b/include/pisa/intersection.hpp index ea6dba80..8f88f996 100644 --- a/include/pisa/intersection.hpp +++ b/include/pisa/intersection.hpp @@ -23,8 +23,7 @@ namespace intersection { using Mask = std::bitset; /// Returns a filtered copy of `query` containing only terms indicated by ones in the bit mask. - [[nodiscard]] inline auto filter(Query const& query, Mask mask) -> Query - { + [[nodiscard]] inline auto filter(Query const& query, Mask mask) -> Query { if (query.terms.size() > MAX_QUERY_LEN) { throw std::invalid_argument("Queries can be at most 2^32 terms long"); } @@ -54,14 +53,14 @@ struct Intersection { Index const& index, Wand const& wand, Query const& query, - std::optional term_mask = std::nullopt) -> Intersection; + std::optional term_mask = std::nullopt + ) -> Intersection; }; template inline auto Intersection::compute( - Index const& index, Wand const& wand, Query const& query, std::optional term_mask) - -> Intersection -{ + Index const& index, Wand const& wand, Query const& query, std::optional term_mask +) -> Intersection { auto filtered_query = term_mask ? intersection::filter(query, *term_mask) : query; scored_and_query retrieve{}; auto scorer = scorer::from_params(ScorerParams("bm25"), wand); @@ -80,8 +79,7 @@ inline auto Intersection::compute( /// Do `func` for all intersections in a query that have a given maximum number of terms. /// `Fn` takes `Query` and `Mask`. template -auto for_all_subsets(Query const& query, std::optional max_term_count, Fn func) -{ +auto for_all_subsets(Query const& query, std::optional max_term_count, Fn func) { auto subset_count = 1U << query.terms.size(); for (auto subset = 1U; subset < subset_count; ++subset) { auto mask = intersection::Mask(subset); diff --git a/include/pisa/invert.hpp b/include/pisa/invert.hpp index facbccff..e06b3574 100644 --- a/include/pisa/invert.hpp +++ b/include/pisa/invert.hpp @@ -33,9 +33,8 @@ namespace pisa { namespace invert { Inverted_Index() = default; Inverted_Index(Inverted_Index&, tbb::split); Inverted_Index( - Documents documents, - Frequencies frequencies, - std::vector document_sizes = {}); + Documents documents, Frequencies frequencies, std::vector document_sizes = {} + ); void operator()(tbb::blocked_range const& r); void join(Inverted_Index& rhs); @@ -58,7 +57,8 @@ namespace pisa { namespace invert { std::vector& lower_doc, std::vector& lower_freq, std::vector& higher_doc, - std::vector& higher_freq); + std::vector& higher_freq + ); /// Creates an in-memory inverted index for a single document range. auto invert_range(DocumentRange documents, Document_Id first_document_id, size_t threads) @@ -73,6 +73,7 @@ namespace pisa { namespace invert { /// Creates an inverted index (simple, uncompressed binary format) from a forward index. void invert_forward_index( - std::string const& input_basename, std::string const& output_basename, InvertParams params); + std::string const& input_basename, std::string const& output_basename, InvertParams params + ); }} // namespace pisa::invert diff --git a/include/pisa/io.hpp b/include/pisa/io.hpp index 5aca37d2..a1028309 100644 --- a/include/pisa/io.hpp +++ b/include/pisa/io.hpp @@ -34,8 +34,7 @@ class Line: public std::string { [[nodiscard]] auto read_string_vector(std::string const& filename) -> std::vector; template -void for_each_line(std::istream& is, Function fn) -{ +void for_each_line(std::istream& is, Function fn) { std::string line; while (std::getline(is, line)) { fn(line); diff --git a/include/pisa/linear_quantizer.hpp b/include/pisa/linear_quantizer.hpp index b6bb4e06..5be852ee 100644 --- a/include/pisa/linear_quantizer.hpp +++ b/include/pisa/linear_quantizer.hpp @@ -7,15 +7,14 @@ namespace pisa { struct LinearQuantizer { explicit LinearQuantizer(float max, uint8_t bits) - : m_max(max), m_scale(static_cast(1U << (bits)) / max) - { + : m_max(max), m_scale(static_cast(1U << (bits)) / max) { if (bits > 32 or bits == 0) { throw std::runtime_error(fmt::format( - "Linear quantizer must take a number of bits between 1 and 32 but {} passed", bits)); + "Linear quantizer must take a number of bits between 1 and 32 but {} passed", bits + )); } } - [[nodiscard]] auto operator()(float value) const -> std::uint32_t - { + [[nodiscard]] auto operator()(float value) const -> std::uint32_t { Expects(value <= m_max); return std::ceil(value * m_scale); } diff --git a/include/pisa/mappable/mappable_vector.hpp b/include/pisa/mappable/mappable_vector.hpp index 5b4c6a5a..dd791bf9 100644 --- a/include/pisa/mappable/mappable_vector.hpp +++ b/include/pisa/mappable/mappable_vector.hpp @@ -35,8 +35,7 @@ namespace pisa { namespace mapper { mappable_vector& operator=(mappable_vector&&) = delete; template - explicit mappable_vector(Range const& from) : m_data(0), m_size(0) - { + explicit mappable_vector(Range const& from) : m_data(0), m_size(0) { size_t size = boost::size(from); T* data = new T[size]; m_deleter = boost::lambda::bind(boost::lambda::delete_array(), data); @@ -46,15 +45,13 @@ namespace pisa { namespace mapper { m_size = size; } - ~mappable_vector() - { + ~mappable_vector() { if (not m_deleter.empty()) { m_deleter(); } } - void swap(mappable_vector& other) - { + void swap(mappable_vector& other) { using std::swap; swap(m_data, other.m_data); swap(m_size, other.m_size); @@ -63,8 +60,7 @@ namespace pisa { namespace mapper { void clear() { mappable_vector().swap(*this); } - void steal(std::vector& vec) - { + void steal(std::vector& vec) { clear(); m_size = vec.size(); if (m_size > 0) { @@ -76,8 +72,7 @@ namespace pisa { namespace mapper { } template - void assign(Range const& from) - { + void assign(Range const& from) { clear(); mappable_vector(from).swap(*this); } @@ -88,8 +83,7 @@ namespace pisa { namespace mapper { inline const_iterator end() const { return m_data + m_size; } - inline T const& operator[](uint64_t i) const - { + inline T const& operator[](uint64_t i) const { assert(i < m_size); return m_data[i]; } diff --git a/include/pisa/mappable/mapper.hpp b/include/pisa/mappable/mapper.hpp index f57f5dc0..3e6638fc 100644 --- a/include/pisa/mappable/mapper.hpp +++ b/include/pisa/mappable/mapper.hpp @@ -24,8 +24,7 @@ namespace pisa { namespace mapper { size_t size; std::vector children; - void dump(std::ostream& os = std::cerr, size_t depth = 0) - { + void dump(std::ostream& os = std::cerr, size_t depth = 0) { os << std::string(depth * 4, ' ') << name << ": " << size << '\n'; for (auto&& child: children) { child->dump(os, depth + 1); @@ -37,8 +36,7 @@ namespace pisa { namespace mapper { class freeze_visitor { public: freeze_visitor(std::ofstream& fout, uint64_t flags) - : m_fout(fout), m_flags(flags), m_written(0) - { + : m_fout(fout), m_flags(flags), m_written(0) { // Save freezing flags m_fout.write(reinterpret_cast(&m_flags), sizeof(m_flags)); m_written += sizeof(m_flags); @@ -52,24 +50,21 @@ namespace pisa { namespace mapper { template typename std::enable_if::value, freeze_visitor&>::type - operator()(T& val, const char* /* friendly_name */) - { + operator()(T& val, const char* /* friendly_name */) { val.map(*this); return *this; } template typename std::enable_if::value, freeze_visitor&>::type - operator()(T& val, const char* /* friendly_name */) - { + operator()(T& val, const char* /* friendly_name */) { m_fout.write(reinterpret_cast(&val), sizeof(T)); m_written += sizeof(T); return *this; } template - freeze_visitor& operator()(mappable_vector& vec, const char* /* friendly_name */) - { + freeze_visitor& operator()(mappable_vector& vec, const char* /* friendly_name */) { (*this)(vec.m_size, "size"); auto n_bytes = static_cast(vec.m_size * sizeof(T)); @@ -90,8 +85,7 @@ namespace pisa { namespace mapper { class map_visitor { public: map_visitor(const char* base_address, uint64_t flags) - : m_base(base_address), m_cur(m_base), m_flags(flags) - { + : m_base(base_address), m_cur(m_base), m_flags(flags) { m_freeze_flags = *reinterpret_cast(m_cur); m_cur += sizeof(m_freeze_flags); } @@ -104,24 +98,21 @@ namespace pisa { namespace mapper { template typename std::enable_if::value, map_visitor&>::type - operator()(T& val, const char* /* friendly_name */) - { + operator()(T& val, const char* /* friendly_name */) { val.map(*this); return *this; } template typename std::enable_if::value, map_visitor&>::type - operator()(T& val, const char* /* friendly_name */) - { + operator()(T& val, const char* /* friendly_name */) { std::memmove(&val, m_cur, sizeof(T)); m_cur += sizeof(T); return *this; } template - map_visitor& operator()(mappable_vector& vec, const char* /* friendly_name */) - { + map_visitor& operator()(mappable_vector& vec, const char* /* friendly_name */) { vec.clear(); (*this)(vec.m_size, "size"); @@ -151,8 +142,7 @@ namespace pisa { namespace mapper { class sizeof_visitor { public: - explicit sizeof_visitor(bool with_tree = false) : m_size(0) - { + explicit sizeof_visitor(bool with_tree = false) : m_size(0) { if (with_tree) { m_cur_size_node = std::make_shared(); } @@ -166,8 +156,7 @@ namespace pisa { namespace mapper { template typename std::enable_if::value, sizeof_visitor&>::type - operator()(T& val, const char* friendly_name) - { + operator()(T& val, const char* friendly_name) { size_t checkpoint = m_size; size_node_ptr parent_node; if (m_cur_size_node) { @@ -186,16 +175,14 @@ namespace pisa { namespace mapper { template typename std::enable_if::value, sizeof_visitor&>::type - operator()(T& /* val */, const char* /* friendly_name */) - { + operator()(T& /* val */, const char* /* friendly_name */) { // don't track PODs in the size tree (they are constant sized) m_size += sizeof(T); return *this; } template - sizeof_visitor& operator()(mappable_vector& vec, const char* friendly_name) - { + sizeof_visitor& operator()(mappable_vector& vec, const char* friendly_name) { size_t checkpoint = m_size; (*this)(vec.m_size, "size"); m_size += static_cast(vec.m_size * sizeof(T)); @@ -209,15 +196,13 @@ namespace pisa { namespace mapper { size_t size() const { return m_size; } - size_node_ptr size_tree() const - { + size_node_ptr size_tree() const { assert(m_cur_size_node); return m_cur_size_node; } protected: - size_node_ptr make_node(const char* name) - { + size_node_ptr make_node(const char* name) { size_node_ptr node = std::make_shared(); m_cur_size_node->children.push_back(node); node->name = name; @@ -245,8 +230,7 @@ namespace pisa { namespace mapper { */ template std::size_t - freeze(T& val, std::ofstream& fout, uint64_t flags = 0, const char* friendly_name = "") - { + freeze(T& val, std::ofstream& fout, uint64_t flags = 0, const char* friendly_name = "") { detail::freeze_visitor freezer(fout, flags); freezer(val, friendly_name); return freezer.written(); @@ -266,8 +250,7 @@ namespace pisa { namespace mapper { */ template std::size_t - freeze(T& val, const char* filename, uint64_t flags = 0, const char* friendly_name = "") - { + freeze(T& val, const char* filename, uint64_t flags = 0, const char* friendly_name = "") { std::ofstream fout(filename, std::ios::binary); fout.exceptions(std::ios::badbit | std::ios::failbit); return freeze(val, fout, flags, friendly_name); @@ -288,8 +271,7 @@ namespace pisa { namespace mapper { */ template size_t - map(T& val, const char* base_address, uint64_t flags = 0, const char* friendly_name = "") - { + map(T& val, const char* base_address, uint64_t flags = 0, const char* friendly_name = "") { detail::map_visitor mapper(base_address, flags); mapper(val, friendly_name); return mapper.bytes_read(); @@ -309,22 +291,19 @@ namespace pisa { namespace mapper { */ template size_t - map(T& val, const mio::mmap_source& m, uint64_t flags = 0, const char* friendly_name = "") - { + map(T& val, const mio::mmap_source& m, uint64_t flags = 0, const char* friendly_name = "") { return map(val, m.data(), flags, friendly_name); } template - std::size_t size_of(T& val) - { + std::size_t size_of(T& val) { detail::sizeof_visitor sizer; sizer(val, ""); return sizer.size(); } template - size_node_ptr size_tree_of(T& val, const char* friendly_name = "") - { + size_node_ptr size_tree_of(T& val, const char* friendly_name = "") { detail::sizeof_visitor sizer(true); sizer(val, friendly_name); assert(not sizer.size_tree()->children.empty()); diff --git a/include/pisa/memory.hpp b/include/pisa/memory.hpp index f2eac5aa..0677ca29 100644 --- a/include/pisa/memory.hpp +++ b/include/pisa/memory.hpp @@ -14,8 +14,8 @@ #pragma once -#include #include +#include #include namespace pisa { @@ -34,8 +34,7 @@ class ReinterpretProxy { void operator=(T const& value) { std::memcpy(m_ptr, &value, m_len); } - [[nodiscard]] auto operator*() const -> T - { + [[nodiscard]] auto operator*() const -> T { T dst{0}; std::memcpy(&dst, m_ptr, m_len); return dst; @@ -55,8 +54,7 @@ class ReinterpretProxy { * It will copy 4 bytes representing 789 to the memory location starting at `byte_ptr`. */ template -auto bitwise_reinterpret(std::uint8_t* dst) -> ReinterpretProxy -{ +auto bitwise_reinterpret(std::uint8_t* dst) -> ReinterpretProxy { return ReinterpretProxy{dst, sizeof(T)}; } @@ -76,8 +74,7 @@ auto bitwise_reinterpret(std::uint8_t* dst) -> ReinterpretProxy */ template auto bitwise_reinterpret(std::uint8_t const* dst, std::size_t len = sizeof(T)) - -> ReinterpretProxy -{ + -> ReinterpretProxy { return ReinterpretProxy{dst, len}; } diff --git a/include/pisa/memory_source.hpp b/include/pisa/memory_source.hpp index 6a3f0e4e..d6411759 100644 --- a/include/pisa/memory_source.hpp +++ b/include/pisa/memory_source.hpp @@ -106,8 +106,7 @@ class MemorySource { private: template - explicit MemorySource(T source) : m_source(std::make_unique>(std::move(source))) - {} + explicit MemorySource(T source) : m_source(std::make_unique>(std::move(source))) {} std::unique_ptr m_source; }; diff --git a/include/pisa/optimal_partition.hpp b/include/pisa/optimal_partition.hpp index c379dac1..e54b3969 100644 --- a/include/pisa/optimal_partition.hpp +++ b/include/pisa/optimal_partition.hpp @@ -28,22 +28,23 @@ struct optimal_partition { cost_t cost_upper_bound; // The maximum cost for this window cost_window(ForwardIterator begin, posting_t base, cost_t cost_upper_bound) - : start_it(begin), end_it(begin), min_p(base), max_p(0), cost_upper_bound(cost_upper_bound) - {} + : start_it(begin), + end_it(begin), + min_p(base), + max_p(0), + cost_upper_bound(cost_upper_bound) {} uint64_t universe() const { return max_p - min_p + 1; } uint64_t size() const { return end - start; } - void advance_start() - { + void advance_start() { min_p = *start_it + 1; ++start; ++start_it; } - void advance_end() - { + void advance_end() { max_p = *end_it; ++end; ++end_it; @@ -60,8 +61,8 @@ struct optimal_partition { uint64_t size, CostFunction cost_fun, double eps1, - double eps2) - { + double eps2 + ) { cost_t single_block_cost = cost_fun(universe - base, size); std::vector min_cost(size + 1, single_block_cost); min_cost[0] = 0; diff --git a/include/pisa/payload_vector.hpp b/include/pisa/payload_vector.hpp index 75193315..2e2f9c14 100644 --- a/include/pisa/payload_vector.hpp +++ b/include/pisa/payload_vector.hpp @@ -27,15 +27,13 @@ namespace detail { typename gsl::span::iterator offset_iter; typename gsl::span::iterator payload_iter; - constexpr auto operator++() -> Payload_Vector_Iterator& - { + constexpr auto operator++() -> Payload_Vector_Iterator& { ++offset_iter; std::advance(payload_iter, *offset_iter - *std::prev(offset_iter)); return *this; } - [[nodiscard]] constexpr auto operator++(int) -> Payload_Vector_Iterator - { + [[nodiscard]] constexpr auto operator++(int) -> Payload_Vector_Iterator { Payload_Vector_Iterator next_iter{offset_iter, payload_iter}; ++(*this); return next_iter; @@ -43,58 +41,52 @@ namespace detail { constexpr auto operator--() -> Payload_Vector_Iterator& { return *this -= 1; } - [[nodiscard]] constexpr auto operator--(int) -> Payload_Vector_Iterator - { + [[nodiscard]] constexpr auto operator--(int) -> Payload_Vector_Iterator { Payload_Vector_Iterator next_iter{offset_iter, payload_iter}; --(*this); return next_iter; } - [[nodiscard]] constexpr auto operator+(size_type n) const -> Payload_Vector_Iterator - { + [[nodiscard]] constexpr auto operator+(size_type n) const -> Payload_Vector_Iterator { return { std::next(offset_iter, n), - std::next(payload_iter, *std::next(offset_iter, n) - *offset_iter)}; + std::next(payload_iter, *std::next(offset_iter, n) - *offset_iter) + }; } - [[nodiscard]] constexpr auto operator+=(size_type n) -> Payload_Vector_Iterator& - { + [[nodiscard]] constexpr auto operator+=(size_type n) -> Payload_Vector_Iterator& { std::advance(payload_iter, *std::next(offset_iter, n) - *offset_iter); std::advance(offset_iter, n); return *this; } - [[nodiscard]] constexpr auto operator-(size_type n) const -> Payload_Vector_Iterator - { + [[nodiscard]] constexpr auto operator-(size_type n) const -> Payload_Vector_Iterator { return { std::prev(offset_iter, n), - std::prev(payload_iter, *offset_iter - *std::prev(offset_iter, n))}; + std::prev(payload_iter, *offset_iter - *std::prev(offset_iter, n)) + }; } - [[nodiscard]] constexpr auto operator-=(size_type n) -> Payload_Vector_Iterator& - { + [[nodiscard]] constexpr auto operator-=(size_type n) -> Payload_Vector_Iterator& { return *this = *this - n; } - [[nodiscard]] constexpr auto operator-(Payload_Vector_Iterator const& other) -> difference_type - { + [[nodiscard]] constexpr auto operator-(Payload_Vector_Iterator const& other) + -> difference_type { return offset_iter - other.offset_iter; } - [[nodiscard]] constexpr auto operator*() -> value_type - { + [[nodiscard]] constexpr auto operator*() -> value_type { return value_type( - reinterpret_cast(&*payload_iter), - *std::next(offset_iter) - *offset_iter); + reinterpret_cast(&*payload_iter), *std::next(offset_iter) - *offset_iter + ); } - [[nodiscard]] constexpr auto operator==(Payload_Vector_Iterator const& other) const -> bool - { + [[nodiscard]] constexpr auto operator==(Payload_Vector_Iterator const& other) const -> bool { return offset_iter == other.offset_iter; } - [[nodiscard]] constexpr auto operator!=(Payload_Vector_Iterator const& other) const -> bool - { + [[nodiscard]] constexpr auto operator!=(Payload_Vector_Iterator const& other) const -> bool { return offset_iter != other.offset_iter; } }; @@ -132,13 +124,13 @@ namespace detail { }; template - [[nodiscard]] static constexpr auto unpack(std::byte const* ptr) -> std::tuple - { + [[nodiscard]] static constexpr auto unpack(std::byte const* ptr) -> std::tuple { if constexpr (sizeof...(Ts) == 0U) { // NOLINT(readability-braces-around-statements) return std::tuple(*reinterpret_cast(ptr)); } else { return std::tuple_cat( - std::tuple(*reinterpret_cast(ptr)), unpack(ptr + sizeof(T))); + std::tuple(*reinterpret_cast(ptr)), unpack(ptr + sizeof(T)) + ); } } @@ -150,8 +142,7 @@ struct Payload_Vector_Buffer { std::vector const offsets; std::vector const payloads; - [[nodiscard]] static auto from_file(std::string const& filename) -> Payload_Vector_Buffer - { + [[nodiscard]] static auto from_file(std::string const& filename) -> Payload_Vector_Buffer { std::error_code ec; auto file_size = std::filesystem::file_size(std::filesystem::path(filename)); std::ifstream is(filename); @@ -170,14 +161,12 @@ struct Payload_Vector_Buffer { return Payload_Vector_Buffer{std::move(offsets), std::move(payloads)}; } - void to_file(std::string const& filename) const - { + void to_file(std::string const& filename) const { std::ofstream is(filename); to_stream(is); } - void to_stream(std::ostream& is) const - { + void to_stream(std::ostream& is) const { size_type length = offsets.size() - 1U; is.write(reinterpret_cast(&length), sizeof(length)); is.write(reinterpret_cast(offsets.data()), offsets.size() * sizeof(offsets[0])); @@ -187,8 +176,7 @@ struct Payload_Vector_Buffer { template [[nodiscard]] static auto make(InputIterator first, InputIterator last, PayloadEncodingFn encoding_fn) - -> Payload_Vector_Buffer - { + -> Payload_Vector_Buffer { std::vector offsets; offsets.push_back(0U); std::vector payloads; @@ -201,62 +189,59 @@ struct Payload_Vector_Buffer { }; template -auto encode_payload_vector(InputIterator first, InputIterator last, PayloadEncodingFn encoding_fn) -{ +auto encode_payload_vector(InputIterator first, InputIterator last, PayloadEncodingFn encoding_fn) { return Payload_Vector_Buffer::make(first, last, encoding_fn); } template -auto encode_payload_vector(gsl::span values, PayloadEncodingFn encoding_fn) -{ +auto encode_payload_vector(gsl::span values, PayloadEncodingFn encoding_fn) { return encode_payload_vector(values.begin(), values.end(), encoding_fn); } template -auto encode_payload_vector(InputIterator first, InputIterator last) -{ +auto encode_payload_vector(InputIterator first, InputIterator last) { return Payload_Vector_Buffer::make(first, last, [](auto str, auto out_iter) { - std::transform( - str.begin(), str.end(), out_iter, [](auto ch) { return static_cast(ch); }); + std::transform(str.begin(), str.end(), out_iter, [](auto ch) { + return static_cast(ch); + }); }); } -inline auto encode_payload_vector(gsl::span values) -{ +inline auto encode_payload_vector(gsl::span values) { return encode_payload_vector(values.begin(), values.end()); } template constexpr auto unpack_head(gsl::span mem) - -> std::tuple> -{ + -> std::tuple> { static_assert(detail::all_pod::value); auto offset = detail::sizeofs::value; if (offset > mem.size()) { throw std::runtime_error(fmt::format( - "Cannot unpack span of size {} into structure of size {}", mem.size(), offset)); + "Cannot unpack span of size {} into structure of size {}", mem.size(), offset + )); } auto tail = mem.subspan(offset); auto head = detail::unpack(mem.data()); return std::tuple_cat(head, std::tuple>(tail)); } -[[nodiscard]] inline auto split(gsl::span mem, std::size_t offset) -{ +[[nodiscard]] inline auto split(gsl::span mem, std::size_t offset) { if (offset > mem.size()) { throw std::runtime_error( - fmt::format("Cannot split span of size {} at position {}", mem.size(), offset)); + fmt::format("Cannot split span of size {} at position {}", mem.size(), offset) + ); } return std::tuple(mem.first(offset), mem.subspan(offset)); } template -[[nodiscard]] auto cast_span(gsl::span mem) -> gsl::span -{ +[[nodiscard]] auto cast_span(gsl::span mem) -> gsl::span { auto type_size = sizeof(T); if (mem.size() % type_size != 0) { throw std::runtime_error( - fmt::format("Failed to cast byte-span to span of T of size {}", type_size)); + fmt::format("Failed to cast byte-span to span of T of size {}", type_size) + ); } return gsl::make_span(reinterpret_cast(mem.data()), mem.size() / type_size); } @@ -270,28 +255,25 @@ class Payload_Vector { using iterator = detail::Payload_Vector_Iterator; explicit Payload_Vector(Payload_Vector_Buffer const& container) - : offsets_(container.offsets), payloads_(container.payloads) - {} + : offsets_(container.offsets), payloads_(container.payloads) {} Payload_Vector(gsl::span offsets, gsl::span payloads) - : offsets_(offsets), payloads_(payloads) - {} + : offsets_(offsets), payloads_(payloads) {} template - [[nodiscard]] constexpr static auto from(ContiguousContainer&& mem) -> Payload_Vector - { + [[nodiscard]] constexpr static auto from(ContiguousContainer&& mem) -> Payload_Vector { return from(gsl::make_span(reinterpret_cast(mem.data()), mem.size())); } - [[nodiscard]] static auto from(gsl::span mem) -> Payload_Vector - { + [[nodiscard]] static auto from(gsl::span mem) -> Payload_Vector { size_type length; gsl::span tail; try { std::tie(length, tail) = unpack_head(mem); } catch (std::runtime_error const& err) { throw std::runtime_error( - std::string("Failed to parse payload vector length: ") + err.what()); + std::string("Failed to parse payload vector length: ") + err.what() + ); } gsl::span offsets, payloads; @@ -299,36 +281,35 @@ class Payload_Vector { std::tie(offsets, payloads) = split(tail, (length + 1U) * sizeof(size_type)); } catch (std::runtime_error const& err) { throw std::runtime_error( - std::string("Failed to parse payload vector offset table: ") + err.what()); + std::string("Failed to parse payload vector offset table: ") + err.what() + ); } return Payload_Vector(cast_span(offsets), payloads); } - [[nodiscard]] constexpr auto operator[](size_type idx) const -> payload_type - { + [[nodiscard]] constexpr auto operator[](size_type idx) const -> payload_type { if (idx >= offsets_.size()) { - throw std::out_of_range(fmt::format( - "Index {} too large for payload vector of size {}", idx, offsets_.size())); + throw std::out_of_range( + fmt::format("Index {} too large for payload vector of size {}", idx, offsets_.size()) + ); } if (offsets_[idx] >= payloads_.size()) { throw std::runtime_error(fmt::format( - "Offset {} too large for payload array of {} bytes", offsets_[idx], payloads_.size())); + "Offset {} too large for payload array of {} bytes", offsets_[idx], payloads_.size() + )); } return *(begin() + idx); } - [[nodiscard]] constexpr auto begin() const -> iterator - { + [[nodiscard]] constexpr auto begin() const -> iterator { return {offsets_.begin(), payloads_.begin()}; } - [[nodiscard]] constexpr auto end() const -> iterator - { + [[nodiscard]] constexpr auto end() const -> iterator { return {std::prev(offsets_.end()), payloads_.end()}; } [[nodiscard]] constexpr auto cbegin() const -> iterator { return begin(); } [[nodiscard]] constexpr auto cend() const -> iterator { return end(); } - [[nodiscard]] constexpr auto size() const -> size_type - { + [[nodiscard]] constexpr auto size() const -> size_type { return offsets_.size() - size_type{1}; } @@ -346,8 +327,7 @@ class Payload_Vector { /// The function assumes that the elements between `begin` and `end` are sorted according to `cmp`. template > auto binary_search(Iter begin, Iter end, T value, Compare cmp = std::less<>{}) - -> std::optional::difference_type> -{ + -> std::optional::difference_type> { if (auto pos = std::lower_bound(begin, end, value, cmp); pos != end and *pos == value) { return std::distance(begin, pos); } @@ -360,8 +340,7 @@ auto binary_search(Iter begin, Iter end, T value, Compare cmp = std::less<>{}) /// information. template > auto binary_search(gsl::span> range, T value, Compare cmp = std::less{}) - -> std::optional -{ + -> std::optional { return pisa::binary_search(range.begin(), range.end(), value, cmp); } diff --git a/include/pisa/query/algorithm/and_query.hpp b/include/pisa/query/algorithm/and_query.hpp index 65f052d2..7d2077d1 100644 --- a/include/pisa/query/algorithm/and_query.hpp +++ b/include/pisa/query/algorithm/and_query.hpp @@ -18,8 +18,7 @@ namespace pisa { */ struct and_query { template - auto operator()(CursorRange&& cursors, uint32_t max_docid) const - { + auto operator()(CursorRange&& cursors, uint32_t max_docid) const { using Cursor = typename std::decay_t::value_type; using Result_t = uint32_t; @@ -72,8 +71,7 @@ struct and_query { */ struct scored_and_query { template - auto operator()(CursorRange&& cursors, uint32_t max_docid) const - { + auto operator()(CursorRange&& cursors, uint32_t max_docid) const { using Cursor = typename std::decay_t::value_type; using Document = uint32_t; diff --git a/include/pisa/query/algorithm/block_max_maxscore_query.hpp b/include/pisa/query/algorithm/block_max_maxscore_query.hpp index b750bc99..c247d4ae 100644 --- a/include/pisa/query/algorithm/block_max_maxscore_query.hpp +++ b/include/pisa/query/algorithm/block_max_maxscore_query.hpp @@ -11,8 +11,7 @@ struct block_max_maxscore_query { explicit block_max_maxscore_query(topk_queue& topk) : m_topk(topk) {} template - void operator()(CursorRange&& cursors, uint64_t max_docid) - { + void operator()(CursorRange&& cursors, uint64_t max_docid) { using Cursor = typename std::decay_t::value_type; if (cursors.empty()) { return; diff --git a/include/pisa/query/algorithm/block_max_ranked_and_query.hpp b/include/pisa/query/algorithm/block_max_ranked_and_query.hpp index 7c5069ea..71c59998 100644 --- a/include/pisa/query/algorithm/block_max_ranked_and_query.hpp +++ b/include/pisa/query/algorithm/block_max_ranked_and_query.hpp @@ -10,8 +10,7 @@ struct block_max_ranked_and_query { explicit block_max_ranked_and_query(topk_queue& topk) : m_topk(topk) {} template - void operator()(CursorRange&& cursors, uint64_t max_docid) - { + void operator()(CursorRange&& cursors, uint64_t max_docid) { using Cursor = typename std::decay_t::value_type; if (cursors.empty()) { diff --git a/include/pisa/query/algorithm/block_max_wand_query.hpp b/include/pisa/query/algorithm/block_max_wand_query.hpp index 0a7445c2..6f081767 100644 --- a/include/pisa/query/algorithm/block_max_wand_query.hpp +++ b/include/pisa/query/algorithm/block_max_wand_query.hpp @@ -9,8 +9,7 @@ struct block_max_wand_query { explicit block_max_wand_query(topk_queue& topk) : m_topk(topk) {} template - void operator()(CursorRange&& cursors, uint64_t max_docid) - { + void operator()(CursorRange&& cursors, uint64_t max_docid) { using Cursor = typename std::decay_t::value_type; if (cursors.empty()) { return; diff --git a/include/pisa/query/algorithm/maxscore_query.hpp b/include/pisa/query/algorithm/maxscore_query.hpp index 6167d99f..6d575b4e 100644 --- a/include/pisa/query/algorithm/maxscore_query.hpp +++ b/include/pisa/query/algorithm/maxscore_query.hpp @@ -15,8 +15,7 @@ struct maxscore_query { template [[nodiscard]] PISA_ALWAYSINLINE auto sorted(Cursors&& cursors) - -> std::vector::value_type> - { + -> std::vector::value_type> { std::vector term_positions(cursors.size()); std::iota(term_positions.begin(), term_positions.end(), 0); std::sort(term_positions.begin(), term_positions.end(), [&](auto&& lhs, auto&& rhs) { @@ -30,8 +29,7 @@ struct maxscore_query { } template - [[nodiscard]] PISA_ALWAYSINLINE auto calc_upper_bounds(Cursors&& cursors) -> std::vector - { + [[nodiscard]] PISA_ALWAYSINLINE auto calc_upper_bounds(Cursors&& cursors) -> std::vector { std::vector upper_bounds(cursors.size()); auto out = upper_bounds.rbegin(); float bound = 0.0; @@ -43,21 +41,19 @@ struct maxscore_query { } template - [[nodiscard]] PISA_ALWAYSINLINE auto min_docid(Cursors&& cursors) -> std::uint32_t - { + [[nodiscard]] PISA_ALWAYSINLINE auto min_docid(Cursors&& cursors) -> std::uint32_t { return std::min_element( cursors.begin(), cursors.end(), - [](auto&& lhs, auto&& rhs) { return lhs.docid() < rhs.docid(); }) - ->docid(); + [](auto&& lhs, auto&& rhs) { return lhs.docid() < rhs.docid(); } + )->docid(); } enum class UpdateResult : bool { Continue, ShortCircuit }; enum class DocumentStatus : bool { Insert, Skip }; template - PISA_ALWAYSINLINE void run_sorted(Cursors&& cursors, uint64_t max_docid) - { + PISA_ALWAYSINLINE void run_sorted(Cursors&& cursors, uint64_t max_docid) { auto upper_bounds = calc_upper_bounds(cursors); auto above_threshold = [&](auto score) { return m_topk.would_enter(score); }; @@ -126,8 +122,7 @@ struct maxscore_query { } template - void operator()(Cursors&& cursors_, uint64_t max_docid) - { + void operator()(Cursors&& cursors_, uint64_t max_docid) { if (cursors_.empty()) { return; } diff --git a/include/pisa/query/algorithm/or_query.hpp b/include/pisa/query/algorithm/or_query.hpp index 75e9be4c..465d21ab 100644 --- a/include/pisa/query/algorithm/or_query.hpp +++ b/include/pisa/query/algorithm/or_query.hpp @@ -8,8 +8,7 @@ namespace pisa { template struct or_query { template - uint64_t operator()(CursorRange&& cursors, uint64_t max_docid) const - { + uint64_t operator()(CursorRange&& cursors, uint64_t max_docid) const { using Cursor = typename std::decay_t::value_type; if (cursors.empty()) { return 0; diff --git a/include/pisa/query/algorithm/range_query.hpp b/include/pisa/query/algorithm/range_query.hpp index 74efd4c1..c32907b5 100644 --- a/include/pisa/query/algorithm/range_query.hpp +++ b/include/pisa/query/algorithm/range_query.hpp @@ -10,8 +10,7 @@ struct range_query { explicit range_query(topk_queue& topk) : m_topk(topk) {} template - void operator()(CursorRange&& cursors, uint64_t max_docid, size_t range_size) - { + void operator()(CursorRange&& cursors, uint64_t max_docid, size_t range_size) { m_topk.clear(); if (cursors.empty()) { return; @@ -26,8 +25,7 @@ struct range_query { std::vector const& topk() const { return m_topk.topk(); } template - void process_range(CursorRange&& cursors, size_t end) - { + void process_range(CursorRange&& cursors, size_t end) { QueryAlg query_alg(m_topk); query_alg(cursors, end); } diff --git a/include/pisa/query/algorithm/range_taat_query.hpp b/include/pisa/query/algorithm/range_taat_query.hpp index 0ba7306b..eaee9f5a 100644 --- a/include/pisa/query/algorithm/range_taat_query.hpp +++ b/include/pisa/query/algorithm/range_taat_query.hpp @@ -13,8 +13,7 @@ struct range_taat_query { template PISA_REQUIRES(PartialScoreAccumulator) - void operator()(CursorRange&& cursors, uint64_t max_docid, size_t range_size, Acc&& accumulator) - { + void operator()(CursorRange&& cursors, uint64_t max_docid, size_t range_size, Acc&& accumulator) { if (cursors.empty()) { return; } @@ -30,8 +29,7 @@ struct range_taat_query { std::vector const& topk() const { return m_topk.topk(); } template - void process_range(CursorRange&& cursors, size_t end, Acc&& accumulator) - { + void process_range(CursorRange&& cursors, size_t end, Acc&& accumulator) { QueryAlg query_alg(m_topk); query_alg(cursors, end, accumulator); } diff --git a/include/pisa/query/algorithm/ranked_and_query.hpp b/include/pisa/query/algorithm/ranked_and_query.hpp index 2c0ddfdf..4c4e9d39 100644 --- a/include/pisa/query/algorithm/ranked_and_query.hpp +++ b/include/pisa/query/algorithm/ranked_and_query.hpp @@ -10,8 +10,7 @@ struct ranked_and_query { explicit ranked_and_query(topk_queue& topk) : m_topk(topk) {} template - void operator()(CursorRange&& cursors, uint64_t max_docid) - { + void operator()(CursorRange&& cursors, uint64_t max_docid) { using Cursor = typename std::decay_t::value_type; if (cursors.empty()) { return; diff --git a/include/pisa/query/algorithm/ranked_or_query.hpp b/include/pisa/query/algorithm/ranked_or_query.hpp index 0c94e5b0..3fd5eef7 100644 --- a/include/pisa/query/algorithm/ranked_or_query.hpp +++ b/include/pisa/query/algorithm/ranked_or_query.hpp @@ -18,8 +18,7 @@ struct ranked_or_query { explicit ranked_or_query(topk_queue& topk) : m_topk(topk) {} template - void operator()(CursorRange&& cursors, uint64_t max_docid) - { + void operator()(CursorRange&& cursors, uint64_t max_docid) { using Cursor = typename std::decay_t::value_type; if (cursors.empty()) { return; diff --git a/include/pisa/query/algorithm/ranked_or_taat_query.hpp b/include/pisa/query/algorithm/ranked_or_taat_query.hpp index d4214617..f2338596 100644 --- a/include/pisa/query/algorithm/ranked_or_taat_query.hpp +++ b/include/pisa/query/algorithm/ranked_or_taat_query.hpp @@ -13,8 +13,7 @@ class ranked_or_taat_query { template PISA_REQUIRES(PartialScoreAccumulator) - void operator()(CursorRange&& cursors, uint64_t max_docid, Acc&& accumulator) - { + void operator()(CursorRange&& cursors, uint64_t max_docid, Acc&& accumulator) { if (cursors.empty()) { return; } diff --git a/include/pisa/query/algorithm/wand_query.hpp b/include/pisa/query/algorithm/wand_query.hpp index 7cbeee02..dd610cd1 100644 --- a/include/pisa/query/algorithm/wand_query.hpp +++ b/include/pisa/query/algorithm/wand_query.hpp @@ -11,8 +11,7 @@ struct wand_query { explicit wand_query(topk_queue& topk) : m_topk(topk) {} template - void operator()(CursorRange&& cursors, uint64_t max_docid) - { + void operator()(CursorRange&& cursors, uint64_t max_docid) { using Cursor = typename std::decay_t::value_type; if (cursors.empty()) { return; diff --git a/include/pisa/query/aol_reader.hpp b/include/pisa/query/aol_reader.hpp index f7ca34e0..dcdc010c 100644 --- a/include/pisa/query/aol_reader.hpp +++ b/include/pisa/query/aol_reader.hpp @@ -17,8 +17,7 @@ class aol_reader { public: explicit aol_reader(std::istream& is) : m_is(is) {} - std::optional next_query() - { + std::optional next_query() { m_is >> std::ws; while (not m_is.eof()) { std::string line; diff --git a/include/pisa/query/queries.hpp b/include/pisa/query/queries.hpp index 9e23e40f..3c804f9b 100644 --- a/include/pisa/query/queries.hpp +++ b/include/pisa/query/queries.hpp @@ -27,8 +27,8 @@ struct Query { -> std::pair, std::string_view>; [[nodiscard]] auto parse_query_terms( - std::string const& query_string, Tokenizer const& tokenizer, TermProcessor term_processor) - -> Query; + std::string const& query_string, Tokenizer const& tokenizer, TermProcessor term_processor +) -> Query; [[nodiscard]] auto parse_query_ids(std::string const& query_string) -> Query; @@ -37,7 +37,8 @@ struct Query { std::unique_ptr tokenizer, std::optional const& terms_file, std::optional const& stopwords_filename, - std::optional const& stemmer_type); + std::optional const& stemmer_type +); bool read_query(term_id_vec& ret, std::istream& is = std::cin); diff --git a/include/pisa/query/query_stemmer.hpp b/include/pisa/query/query_stemmer.hpp index 3a8d4ecc..ad817c62 100644 --- a/include/pisa/query/query_stemmer.hpp +++ b/include/pisa/query/query_stemmer.hpp @@ -14,10 +14,8 @@ namespace pisa { class QueryStemmer { public: explicit QueryStemmer(std::optional const& stemmer_name) - : m_stemmer(term_transformer_builder(stemmer_name)()) - {} - std::string operator()(std::string const& query_string) - { + : m_stemmer(term_transformer_builder(stemmer_name)()) {} + std::string operator()(std::string const& query_string) { std::stringstream tokenized_query; auto [id, raw_query] = split_query_at_colon(query_string); std::vector stemmed_terms; diff --git a/include/pisa/query/term_processor.hpp b/include/pisa/query/term_processor.hpp index 3706de84..f1716285 100644 --- a/include/pisa/query/term_processor.hpp +++ b/include/pisa/query/term_processor.hpp @@ -27,8 +27,8 @@ class TermProcessor { TermProcessor( std::optional const& terms_file, std::optional const& stopwords_filename, - std::optional const& stemmer_type) - { + std::optional const& stemmer_type + ) { auto source = std::make_shared(MemorySource::mapped_file(*terms_file)); auto terms = Payload_Vector<>::from(*source); auto to_id = [source = std::move(source), terms](auto str) -> std::optional { @@ -53,8 +53,7 @@ class TermProcessor { bool is_stopword(const term_id_type term) { return stopwords.find(term) != stopwords.end(); } - std::vector get_stopwords() - { + std::vector get_stopwords() { std::vector v; v.insert(v.end(), stopwords.begin(), stopwords.end()); sort(v.begin(), v.end()); diff --git a/include/pisa/query/trec_topic_reader.hpp b/include/pisa/query/trec_topic_reader.hpp index 14983cb2..73d421e4 100644 --- a/include/pisa/query/trec_topic_reader.hpp +++ b/include/pisa/query/trec_topic_reader.hpp @@ -25,8 +25,7 @@ namespace { static std::string const NARR_ATT = "Narrative:"; static std::string const NARR_END = ""; - static void consume(std::istream& is, std::string const& token, bool strict = true) - { + static void consume(std::istream& is, std::string const& token, bool strict = true) { is >> std::ws; for (auto pos = token.begin(); pos != token.end(); ++pos) { if (is.get() != *pos) { @@ -43,8 +42,7 @@ namespace { } template - static std::ostream& read_until(std::istream& is, Pred pred, std::ostream& os) - { + static std::ostream& read_until(std::istream& is, Pred pred, std::ostream& os) { is >> std::ws; while (not is.eof()) { if (is.peek() == std::istream::traits_type::eof() or pred(is.peek())) { @@ -67,8 +65,7 @@ class trec_topic_reader { public: explicit trec_topic_reader(std::istream& is) : m_is(is) {} - std::optional next_topic() - { + std::optional next_topic() { m_is >> std::ws; if (m_is.eof()) { return std::nullopt; @@ -82,14 +79,16 @@ class trec_topic_reader { consume(m_is, NUM); consume(m_is, NUM_ATT); read_until( - m_is, [](auto ch) { return ch == '<'; }, os); + m_is, [](auto ch) { return ch == '<'; }, os + ); topic.num = boost::algorithm::trim_copy(os.str()); consume(m_is, NUM_END, false); os.str(""); consume(m_is, TITLE); read_until( - m_is, [](auto ch) { return ch == '<'; }, os); + m_is, [](auto ch) { return ch == '<'; }, os + ); topic.title = boost::algorithm::trim_copy(os.str()); boost::replace_all(topic.title, "\n", " "); consume(m_is, TITLE_END, false); @@ -98,7 +97,8 @@ class trec_topic_reader { consume(m_is, DESC); consume(m_is, DESC_ATT, false); read_until( - m_is, [](auto ch) { return ch == '<'; }, os); + m_is, [](auto ch) { return ch == '<'; }, os + ); topic.desc = boost::algorithm::trim_copy(os.str()); boost::replace_all(topic.desc, "\n", " "); consume(m_is, DESC_END, false); @@ -107,7 +107,8 @@ class trec_topic_reader { consume(m_is, NARR); consume(m_is, NARR_ATT, false); read_until( - m_is, [](auto ch) { return ch == '<'; }, os); + m_is, [](auto ch) { return ch == '<'; }, os + ); topic.narr = boost::algorithm::trim_copy(os.str()); boost::replace_all(topic.narr, "\n", " "); consume(m_is, NARR_END, false); diff --git a/include/pisa/recursive_graph_bisection.hpp b/include/pisa/recursive_graph_bisection.hpp index 9eb9a37c..56db570e 100644 --- a/include/pisa/recursive_graph_bisection.hpp +++ b/include/pisa/recursive_graph_bisection.hpp @@ -32,8 +32,7 @@ namespace bp { ThreadLocalDegrees right_degrees; }; - PISA_ALWAYSINLINE double expb(double logn1, double logn2, size_t deg1, size_t deg2) - { + PISA_ALWAYSINLINE double expb(double logn1, double logn2, size_t deg1, size_t deg2) { __m128 _deg = _mm_cvtepi32_ps(_mm_set_epi32(deg1, deg1, deg2, deg2)); __m128 _log = _mm_set_ps(logn1, log2(deg1 + 1), logn2, log2(deg2 + 1)); __m128 _result = _mm_mul_ps(_deg, _log); @@ -44,8 +43,7 @@ namespace bp { template [[nodiscard]] PISA_ALWAYSINLINE auto& - clear_or_init(ThreadLocalContainer&& container, std::size_t size) - { + clear_or_init(ThreadLocalContainer&& container, std::size_t size) { bool exists = false; auto& ref = container.local(exists); if (exists) { @@ -70,39 +68,37 @@ class document_range { Iterator first, Iterator last, std::reference_wrapper fwdidx, - std::reference_wrapper> gains) - : m_first(first), m_last(last), m_fwdidx(fwdidx), m_gains(gains) - {} + std::reference_wrapper> gains + ) + : m_first(first), m_last(last), m_fwdidx(fwdidx), m_gains(gains) {} Iterator begin() { return m_first; } Iterator end() { return m_last; } std::ptrdiff_t size() const { return std::distance(m_first, m_last); } - PISA_ALWAYSINLINE document_partition split() const - { + PISA_ALWAYSINLINE document_partition split() const { Iterator mid = std::next(m_first, size() / 2); - return {document_range(m_first, mid, m_fwdidx, m_gains), - document_range(mid, m_last, m_fwdidx, m_gains), - term_count()}; + return { + document_range(m_first, mid, m_fwdidx, m_gains), + document_range(mid, m_last, m_fwdidx, m_gains), + term_count() + }; } - PISA_ALWAYSINLINE document_range operator()(std::ptrdiff_t left, std::ptrdiff_t right) const - { + PISA_ALWAYSINLINE document_range operator()(std::ptrdiff_t left, std::ptrdiff_t right) const { assert(left < right); assert(right <= size()); return document_range(std::next(m_first, left), std::next(m_first, right), m_fwdidx, m_gains); } std::size_t term_count() const { return m_fwdidx.get().term_count(); } - std::vector terms(value_type document) const - { + std::vector terms(value_type document) const { return m_fwdidx.get().terms(document); } double gain(value_type document) const { return m_gains.get()[document]; } double& gain(value_type document) { return m_gains.get()[document]; } - auto by_gain() - { + auto by_gain() { return [this](const value_type& lhs, const value_type& rhs) { return m_gains.get()[lhs] > m_gains.get()[rhs]; }; @@ -131,14 +127,14 @@ struct computation_node { document_partition partition; bool cache; - static computation_node from_stream(std::istream& is, const document_range& range) - { + static computation_node from_stream(std::istream& is, const document_range& range) { int level, iteration_count; std::ptrdiff_t left_first, right_first, left_last, right_last; bool cache; is >> level >> iteration_count >> left_first >> left_last >> right_first >> right_last; document_partition partition{ - range(left_first, left_last), range(right_first, right_last), range.term_count()}; + range(left_first, left_last), range(right_first, right_last), range.term_count() + }; if (not(is >> std::noboolalpha >> cache)) { cache = partition.size() > 64; } @@ -158,8 +154,7 @@ auto get_mapping = [](const auto& collection) { }; template -void compute_degrees(document_range& range, single_init_vector& deg_map) -{ +void compute_degrees(document_range& range, single_init_vector& deg_map) { for (const auto& document: range) { auto terms = range.terms(document); auto deg_map_inc = [&](const auto& t) { deg_map.set(t, deg_map[t] + 1); }; @@ -174,8 +169,8 @@ void compute_move_gains_caching( const std::ptrdiff_t to_n, const single_init_vector& from_lex, const single_init_vector& to_lex, - bp::ThreadLocal& thread_local_data) -{ + bp::ThreadLocal& thread_local_data +) { const auto logn1 = log2(from_n); const auto logn2 = log2(to_n); @@ -213,8 +208,8 @@ void compute_gains( document_partition& partition, const degree_map_pair& degrees, GainF gain_function, - bp::ThreadLocal& thread_local_data) -{ + bp::ThreadLocal& thread_local_data +) { auto n1 = partition.left.size(); auto n2 = partition.right.size(); gain_function(partition.left, n1, n2, degrees.left, degrees.right, thread_local_data); @@ -222,8 +217,7 @@ void compute_gains( } template -void swap(document_partition& partition, degree_map_pair& degrees) -{ +void swap(document_partition& partition, degree_map_pair& degrees) { auto left = partition.left; auto right = partition.right; auto lit = left.begin(); @@ -256,8 +250,8 @@ void process_partition( document_partition& partition, GainF gain_function, bp::ThreadLocal& thread_local_data, - int iterations = 20) -{ + int iterations = 20 +) { auto& left_degree = bp::clear_or_init(thread_local_data.left_degrees, partition.left.term_count()); auto& right_degree = @@ -274,15 +268,18 @@ void process_partition( pisa::execution::par_unseq, partition.left.begin(), partition.left.end(), - partition.left.by_gain()); + partition.left.by_gain() + ); }, [&] { pisa::sort( pisa::execution::par_unseq, partition.right.begin(), partition.right.end(), - partition.right.by_gain()); - }); + partition.right.by_gain() + ); + } + ); swap(partition, degrees); } } @@ -293,8 +290,8 @@ void recursive_graph_bisection( size_t depth, size_t cache_depth, progress& p, - std::shared_ptr thread_local_data = nullptr) -{ + std::shared_ptr thread_local_data = nullptr +) { if (thread_local_data == nullptr) { thread_local_data = std::make_shared(); } @@ -311,13 +308,14 @@ void recursive_graph_bisection( if (depth > 1 && documents.size() > 2) { tbb::parallel_invoke( [&, thread_local_data] { - recursive_graph_bisection( - partition.left, depth - 1, cache_depth, p, thread_local_data); + recursive_graph_bisection(partition.left, depth - 1, cache_depth, p, thread_local_data); }, [&, thread_local_data] { recursive_graph_bisection( - partition.right, depth - 1, cache_depth, p, thread_local_data); - }); + partition.right, depth - 1, cache_depth, p, thread_local_data + ); + } + ); } else { std::sort(partition.left.begin(), partition.left.end()); std::sort(partition.right.begin(), partition.right.end()); @@ -330,15 +328,15 @@ void recursive_graph_bisection( /// The caller must ensure that no range on the same level intersects with another. /// Failure to do so leads to undefined behavior. template -void recursive_graph_bisection(std::vector> nodes, progress& p) -{ +void recursive_graph_bisection(std::vector> nodes, progress& p) { bp::ThreadLocal thread_local_data; std::sort(nodes.begin(), nodes.end()); auto first = nodes.begin(); auto end = nodes.end(); while (first != end) { - auto last = std::find_if( - first, end, [&first](const auto& node) { return node.level > first->level; }); + auto last = std::find_if(first, end, [&first](const auto& node) { + return node.level > first->level; + }); bool last_level = last == end; tbb::task_group level_group; std::for_each(first, last, [&thread_local_data, &level_group, last_level, &p](auto& node) { @@ -350,13 +348,15 @@ void recursive_graph_bisection(std::vector> nodes, pr node.partition, compute_move_gains_caching, thread_local_data, - node.iteration_count); + node.iteration_count + ); } else { process_partition( node.partition, compute_move_gains_caching, thread_local_data, - node.iteration_count); + node.iteration_count + ); } if (last_level) { std::sort(node.partition.left.begin(), node.partition.left.end()); diff --git a/include/pisa/reorder_docids.hpp b/include/pisa/reorder_docids.hpp index a7b62569..55e0ae09 100644 --- a/include/pisa/reorder_docids.hpp +++ b/include/pisa/reorder_docids.hpp @@ -38,8 +38,7 @@ namespace detail { using node_type = computation_node; inline std::vector - read_node_config(const std::string& config_file, const range_type& initial_range) - { + read_node_config(const std::string& config_file, const range_type& initial_range) { std::vector nodes; std::ifstream is(config_file); std::string line; @@ -50,20 +49,20 @@ namespace detail { return nodes; } - inline void run_with_config(const std::string& config_file, const range_type& initial_range) - { + inline void run_with_config(const std::string& config_file, const range_type& initial_range) { auto nodes = read_node_config(config_file, initial_range); auto total_count = std::accumulate( - nodes.begin(), nodes.end(), std::ptrdiff_t(0), [](auto acc, const auto& node) { - return acc + node.partition.size(); - }); + nodes.begin(), + nodes.end(), + std::ptrdiff_t(0), + [](auto acc, const auto& node) { return acc + node.partition.size(); } + ); pisa::progress bp_progress("Graph bisection", total_count); bp_progress.update(0); recursive_graph_bisection(std::move(nodes), bp_progress); } - inline void run_default_tree(size_t depth, const range_type& initial_range) - { + inline void run_default_tree(size_t depth, const range_type& initial_range) { spdlog::info("Default tree with depth {}", depth); pisa::progress bp_progress("Graph bisection", initial_range.size() * depth); bp_progress.update(0); @@ -72,8 +71,7 @@ namespace detail { } // namespace detail -[[nodiscard]] auto recursive_graph_bisection(RecursiveGraphBisectionOptions const& options) -> int -{ +[[nodiscard]] auto recursive_graph_bisection(RecursiveGraphBisectionOptions const& options) -> int { if (not options.output_basename && not options.output_fwd) { spdlog::error("Must define at least one output parameter."); return 1; @@ -82,7 +80,8 @@ namespace detail { forward_index fwd = options.input_fwd ? forward_index::read(*options.input_fwd) : forward_index::from_inverted_index( - options.input_basename, options.min_length, options.compress_fwd); + options.input_basename, options.min_length, options.compress_fwd + ); if (options.output_fwd) { forward_index::write(fwd, *options.output_fwd); @@ -98,8 +97,8 @@ namespace detail { detail::run_with_config(*options.node_config, initial_range); } else { detail::run_default_tree( - options.depth.value_or(static_cast(std::log2(fwd.size()) - 5)), - initial_range); + options.depth.value_or(static_cast(std::log2(fwd.size()) - 5)), initial_range + ); } if (options.print_args) { @@ -138,8 +137,8 @@ struct ReorderOptions { inline auto reorder_postings( binary_freq_collection const& input, std::string_view output_basename, - gsl::span mapping) -{ + gsl::span mapping +) { pisa::progress progress("Reassigning IDs in posting lists", input.size()); std::ofstream output_docs(fmt::format("{}.docs", output_basename)); @@ -170,7 +169,8 @@ inline auto reorder_postings( inline auto reorder_lexicon( std::string const& input_lexicon, std::string const& output_lexicon, - gsl::span mapping) + gsl::span mapping +) { auto doc_buffer = Payload_Vector_Buffer::from_file(input_lexicon); @@ -188,8 +188,8 @@ inline auto reorder_sizes( binary_collection const& input_sizes, std::uint64_t num_docs, gsl::span mapping, - std::string_view output_basename) -{ + std::string_view output_basename +) { pisa::progress progress("Reordering document sizes", num_docs); auto sizes = *input_sizes.begin(); if (sizes.size() != num_docs) { @@ -212,8 +212,8 @@ inline void reorder_from_mapping( binary_freq_collection const& input_collection, binary_collection const& input_sizes, ReorderOptions const& options, - gsl::span mapping) -{ + gsl::span mapping +) { auto num_docs = input_collection.num_docs(); reorder_sizes(input_sizes, num_docs, mapping, options.output_basename); reorder_postings(input_collection, options.output_basename, mapping); @@ -222,8 +222,7 @@ inline void reorder_from_mapping( } } -inline auto reorder_random(ReorderOptions options, unsigned int seed) -> int -{ +inline auto reorder_random(ReorderOptions options, unsigned int seed) -> int { spdlog::info("Computing random permutation"); binary_freq_collection input_collection(options.input_basename.c_str()); auto num_docs = input_collection.num_docs(); @@ -237,8 +236,7 @@ inline auto reorder_random(ReorderOptions options, unsigned int seed) -> int } /// Feature file must contain the same number of labels as there are documents in the collection. -inline auto reorder_by_feature(ReorderOptions options, std::string const& feature_file) -> int -{ +inline auto reorder_by_feature(ReorderOptions options, std::string const& feature_file) -> int { spdlog::info("Sorting URLs"); binary_freq_collection input_collection(options.input_basename.c_str()); auto const mapping = [&] { @@ -265,8 +263,7 @@ inline auto reorder_by_feature(ReorderOptions options, std::string const& featur return 0; } -inline auto reorder_from_mapping(ReorderOptions options, std::string mapping_file) -> int -{ +inline auto reorder_from_mapping(ReorderOptions options, std::string mapping_file) -> int { spdlog::info("Reading mapping"); binary_freq_collection input_collection(options.input_basename.c_str()); auto const mapping = [&] { diff --git a/include/pisa/score_opt_partition.hpp b/include/pisa/score_opt_partition.hpp index 9fe107f8..bbe73857 100644 --- a/include/pisa/score_opt_partition.hpp +++ b/include/pisa/score_opt_partition.hpp @@ -40,11 +40,8 @@ struct score_opt_partition { uint64_t element_count; score_window( - ForwardIterator begin, - posting_t base, - wand_cost_t cost_upper_bound, - float fixed_cost, - size_t size) + ForwardIterator begin, posting_t base, wand_cost_t cost_upper_bound, float fixed_cost, size_t size + ) : start_it(begin), end_it(begin), min_p(base), @@ -53,15 +50,13 @@ struct score_opt_partition { m_fixed_cost(fixed_cost), sum(0), end_sequence(size), - element_count(0) - {} + element_count(0) {} uint64_t universe() const { return max_p - min_p + 1; } uint64_t size() const { return end - start; } - void advance_start() - { + void advance_start() { float v = std::get<1>(*start_it); if (std::get<1>(*start_it) == max_queue.front()) { max_queue.pop_front(); @@ -75,8 +70,7 @@ struct score_opt_partition { } } - void advance_end() - { + void advance_end() { float v = std::get<1>(*end_it); sum += v; @@ -93,8 +87,7 @@ struct score_opt_partition { } } - float cost() - { + float cost() { if (size() < 2) { return m_fixed_cost; } @@ -108,13 +101,12 @@ struct score_opt_partition { template score_opt_partition( - I begin, std::uint32_t base, std::uint64_t size, double eps1, double eps2, float fixed_cost) + I begin, std::uint32_t base, std::uint64_t size, double eps1, double eps2, float fixed_cost + ) PISA_REQUIRES( std::forward_iterator - && (std::convertible_to< - typename std::iterator_traits::value_type, - std::pair>)) - { + && (std::convertible_to::value_type, std::pair>) + ) { // compute cost of single block. float max = 0; float sum = 0; @@ -142,8 +134,8 @@ struct score_opt_partition { std::vector path(size + 1, 0); std::vector maxs(size + 1, 0); - auto max1 = std::max_element( - begin, begin + size, [](const auto& lhs, const auto& rhs) -> auto { + auto max1 = + std::max_element(begin, begin + size, [](const auto& lhs, const auto& rhs) -> auto { return std::get<1>(lhs) < std::get<1>(rhs); }); maxs[size] = std::get<1>(*max1); diff --git a/include/pisa/scorer/bm25.hpp b/include/pisa/scorer/bm25.hpp index ccb31fd1..7e586d3c 100644 --- a/include/pisa/scorer/bm25.hpp +++ b/include/pisa/scorer/bm25.hpp @@ -18,26 +18,22 @@ struct bm25: public index_scorer { using index_scorer::index_scorer; bm25(const Wand& wdata, const float b, const float k1) - : index_scorer(wdata), m_b(b), m_k1(k1) - {} + : index_scorer(wdata), m_b(b), m_k1(k1) {} - float doc_term_weight(uint64_t freq, float norm_len) const - { + float doc_term_weight(uint64_t freq, float norm_len) const { auto f = static_cast(freq); return f / (f + m_k1 * (1.0F - m_b + m_b * norm_len)); } // IDF (inverse document frequency) - float query_term_weight(uint64_t df, uint64_t num_docs) const - { + float query_term_weight(uint64_t df, uint64_t num_docs) const { auto fdf = static_cast(df); float idf = std::log((float(num_docs) - fdf + 0.5F) / (fdf + 0.5F)); static const float epsilon_score = 1.0E-6; return std::max(epsilon_score, idf) * (1.0F + m_k1); } - term_scorer_t term_scorer(uint64_t term_id) const override - { + term_scorer_t term_scorer(uint64_t term_id) const override { auto term_len = this->m_wdata.term_posting_count(term_id); auto term_weight = query_term_weight(term_len, this->m_wdata.num_docs()); auto s = [&, term_weight](uint32_t doc, uint32_t freq) { diff --git a/include/pisa/scorer/dph.hpp b/include/pisa/scorer/dph.hpp index ae13a0fb..1db508b1 100644 --- a/include/pisa/scorer/dph.hpp +++ b/include/pisa/scorer/dph.hpp @@ -21,8 +21,7 @@ template struct dph: public index_scorer { using index_scorer::index_scorer; - term_scorer_t term_scorer(uint64_t term_id) const override - { + term_scorer_t term_scorer(uint64_t term_id) const override { auto s = [&, term_id](uint32_t doc, uint32_t freq) { float f = (float)freq / this->m_wdata.doc_len(doc); float norm = (1.F - f) * (1.F - f) / (freq + 1.F); @@ -31,7 +30,8 @@ struct dph: public index_scorer { * std::log2( (freq * this->m_wdata.avg_len() / this->m_wdata.doc_len(doc)) * ((float)this->m_wdata.num_docs() - / this->m_wdata.term_occurrence_count(term_id))) + / this->m_wdata.term_occurrence_count(term_id)) + ) + .5F * std::log2(2.F * M_PI * freq * (1.F - f))); }; return s; diff --git a/include/pisa/scorer/pl2.hpp b/include/pisa/scorer/pl2.hpp index 44dbe0d8..91865be6 100644 --- a/include/pisa/scorer/pl2.hpp +++ b/include/pisa/scorer/pl2.hpp @@ -22,8 +22,7 @@ struct pl2: public index_scorer { pl2(const Wand& wdata, const float c) : index_scorer(wdata), m_c(c) {} - term_scorer_t term_scorer(uint64_t term_id) const override - { + term_scorer_t term_scorer(uint64_t term_id) const override { auto s = [&, term_id](uint32_t doc, uint32_t freq) { float tfn = freq * std::log2(1.F + (m_c * this->m_wdata.avg_len()) / this->m_wdata.doc_len(doc)); diff --git a/include/pisa/scorer/qld.hpp b/include/pisa/scorer/qld.hpp index 90330f32..610cca2a 100644 --- a/include/pisa/scorer/qld.hpp +++ b/include/pisa/scorer/qld.hpp @@ -22,8 +22,7 @@ struct qld: public index_scorer { qld(const Wand& wdata, const float mu) : index_scorer(wdata), m_mu(mu) {} - term_scorer_t term_scorer(uint64_t term_id) const override - { + term_scorer_t term_scorer(uint64_t term_id) const override { auto s = [&, term_id](uint32_t doc, uint32_t freq) { float numerator = 1 + freq diff --git a/include/pisa/scorer/quantized.hpp b/include/pisa/scorer/quantized.hpp index 26c1a8f9..89eeed6d 100644 --- a/include/pisa/scorer/quantized.hpp +++ b/include/pisa/scorer/quantized.hpp @@ -11,8 +11,7 @@ namespace pisa { template struct quantized: public index_scorer { using index_scorer::index_scorer; - term_scorer_t term_scorer([[maybe_unused]] uint64_t term_id) const - { + term_scorer_t term_scorer([[maybe_unused]] uint64_t term_id) const { return []([[maybe_unused]] uint32_t doc, uint32_t freq) { return freq; }; } }; diff --git a/include/pisa/scorer/scorer.hpp b/include/pisa/scorer/scorer.hpp index 61fa4977..2ab63016 100644 --- a/include/pisa/scorer/scorer.hpp +++ b/include/pisa/scorer/scorer.hpp @@ -22,12 +22,12 @@ struct ScorerParams { }; namespace pisa { namespace scorer { - inline auto from_params = - [](const ScorerParams& params, - auto const& wdata) -> std::unique_ptr>> { + inline auto from_params = [](const ScorerParams& params, auto const& wdata + ) -> std::unique_ptr>> { if (params.name == "bm25") { return std::make_unique>>( - wdata, params.bm25_b, params.bm25_k1); + wdata, params.bm25_b, params.bm25_k1 + ); } if (params.name == "qld") { return std::make_unique>>(wdata, params.qld_mu); diff --git a/include/pisa/sequence/indexed_sequence.hpp b/include/pisa/sequence/indexed_sequence.hpp index 937bad29..949e0396 100644 --- a/include/pisa/sequence/indexed_sequence.hpp +++ b/include/pisa/sequence/indexed_sequence.hpp @@ -22,8 +22,7 @@ struct indexed_sequence { static const uint64_t type_bits = 1; // all_ones is implicit static PISA_FLATTEN_FUNC uint64_t - bitsize(global_parameters const& params, uint64_t universe, uint64_t n) - { + bitsize(global_parameters const& params, uint64_t universe, uint64_t n) { uint64_t best_cost = all_ones_sequence::bitsize(params, universe, n); uint64_t ef_cost = compact_elias_fano::bitsize(params, universe, n) + type_bits; @@ -45,8 +44,8 @@ struct indexed_sequence { Iterator begin, uint64_t universe, uint64_t n, - global_parameters const& params) - { + global_parameters const& params + ) { uint64_t best_cost = all_ones_sequence::bitsize(params, universe, n); int best_type = all_ones; @@ -87,8 +86,8 @@ struct indexed_sequence { uint64_t offset, uint64_t universe, uint64_t n, - global_parameters const& params) - { + global_parameters const& params + ) { if (all_ones_sequence::bitsize(params, universe, n) == 0) { m_type = all_ones; } else { @@ -101,8 +100,8 @@ struct indexed_sequence { compact_elias_fano::enumerator(bv, offset + type_bits, universe, n, params); break; case ranked_bitvector: - m_enumerator = compact_ranked_bitvector::enumerator( - bv, offset + type_bits, universe, n, params); + m_enumerator = + compact_ranked_bitvector::enumerator(bv, offset + type_bits, universe, n, params); break; case all_ones: m_enumerator = @@ -112,27 +111,23 @@ struct indexed_sequence { } } - value_type move(uint64_t position) - { + value_type move(uint64_t position) { return std::visit([&position](auto&& e) { return e.move(position); }, m_enumerator); } - value_type next_geq(uint64_t lower_bound) - { + value_type next_geq(uint64_t lower_bound) { return std::visit( - [&lower_bound](auto&& e) { return e.next_geq(lower_bound); }, m_enumerator); + [&lower_bound](auto&& e) { return e.next_geq(lower_bound); }, m_enumerator + ); } - value_type next() - { + value_type next() { return std::visit([](auto&& e) { return e.next(); }, m_enumerator); } - uint64_t size() const - { + uint64_t size() const { return std::visit([](auto&& e) { return e.size(); }, m_enumerator); } - uint64_t prev_value() const - { + uint64_t prev_value() const { return std::visit([](auto&& e) { return e.prev_value(); }, m_enumerator); } diff --git a/include/pisa/sequence/partitioned_sequence.hpp b/include/pisa/sequence/partitioned_sequence.hpp index fd1d12bc..5d15ef24 100644 --- a/include/pisa/sequence/partitioned_sequence.hpp +++ b/include/pisa/sequence/partitioned_sequence.hpp @@ -25,8 +25,8 @@ struct partitioned_sequence { Iterator begin, uint64_t universe, uint64_t n, - global_parameters const& params) - { + global_parameters const& params + ) { assert(n > 0); auto partition = compute_partition(begin, universe, n, params); @@ -60,7 +60,8 @@ struct partitioned_sequence { } base_sequence_type::write( - bvb, cur_partition.begin(), cur_partition.back() + 1, cur_partition.size(), params); + bvb, cur_partition.begin(), cur_partition.back() + 1, cur_partition.size(), params + ); } else { bit_vector_builder bv_sequences; std::vector endpoints; @@ -86,7 +87,8 @@ struct partitioned_sequence { cur_partition.begin(), cur_partition.back() + 1, cur_partition.size(), // XXX skip last one? - params); + params + ); endpoints.push_back(bv_sequences.size()); upper_bounds.push_back(upper_bound); cur_base = upper_bound + 1; @@ -97,7 +99,8 @@ struct partitioned_sequence { bit_vector_builder bv_upper_bounds; compact_elias_fano::write( - bv_upper_bounds, upper_bounds.begin(), universe, partitions + 1, params); + bv_upper_bounds, upper_bounds.begin(), universe, partitions + 1, params + ); uint64_t endpoint_bits = ceil_log2(bv_sequences.size() + 1); write_gamma(bvb, endpoint_bits); @@ -124,9 +127,9 @@ struct partitioned_sequence { uint64_t offset, uint64_t universe, uint64_t n, - global_parameters const& params) - : m_params(params), m_size(n), m_universe(universe), m_bv(&bv) - { + global_parameters const& params + ) + : m_params(params), m_size(n), m_universe(universe), m_bv(&bv) { bit_vector::enumerator it(bv, offset); m_partitions = read_gamma_nonzero(it); if (m_partitions == 1) { @@ -153,8 +156,8 @@ struct partitioned_sequence { m_sizes = compact_elias_fano::enumerator(bv, cur_offset, n, m_partitions - 1, params); cur_offset += compact_elias_fano::bitsize(params, n, m_partitions - 1); - m_upper_bounds = compact_elias_fano::enumerator( - bv, cur_offset, universe, m_partitions + 1, params); + m_upper_bounds = + compact_elias_fano::enumerator(bv, cur_offset, universe, m_partitions + 1, params); cur_offset += compact_elias_fano::bitsize(params, universe, m_partitions + 1); m_endpoints_offset = cur_offset; @@ -168,8 +171,7 @@ struct partitioned_sequence { slow_move(); } - value_type PISA_ALWAYSINLINE move(uint64_t position) - { + value_type PISA_ALWAYSINLINE move(uint64_t position) { assert(position <= size()); m_position = position; @@ -183,8 +185,7 @@ struct partitioned_sequence { // note: this is instantiated oly if BaseSequence has next_geq template > - value_type PISA_ALWAYSINLINE next_geq(uint64_t lower_bound) - { + value_type PISA_ALWAYSINLINE next_geq(uint64_t lower_bound) { if (PISA_LIKELY(lower_bound >= m_cur_base && lower_bound <= m_cur_upper_bound)) { auto val = m_partition_enum.next_geq(lower_bound - m_cur_base); m_position = m_cur_begin + val.first; @@ -193,8 +194,7 @@ struct partitioned_sequence { return slow_next_geq(lower_bound); } - value_type PISA_ALWAYSINLINE next() - { + value_type PISA_ALWAYSINLINE next() { ++m_position; if (PISA_LIKELY(m_position < m_cur_end)) { @@ -206,8 +206,7 @@ struct partitioned_sequence { uint64_t size() const { return m_size; } - uint64_t prev_value() const - { + uint64_t prev_value() const { if (PISA_UNLIKELY(m_position == m_cur_begin)) { return m_cur_partition != 0U ? m_cur_base - 1 : 0; } @@ -224,8 +223,7 @@ struct partitioned_sequence { // next(), causing the code to grow. Since next is called in very // tight loops, on microbenchmarks this causes an improvement of // about 3ns on my i7 3Ghz - value_type PISA_NOINLINE slow_next() - { + value_type PISA_NOINLINE slow_next() { if (PISA_UNLIKELY(m_position == m_size)) { assert(m_cur_partition == m_partitions - 1); auto val = m_partition_enum.next(); @@ -239,8 +237,7 @@ struct partitioned_sequence { return value_type(m_position, val); } - value_type PISA_NOINLINE slow_move() - { + value_type PISA_NOINLINE slow_move() { if (m_position == size()) { if (m_partitions > 1) { switch_partition(m_partitions - 1); @@ -254,8 +251,7 @@ struct partitioned_sequence { return value_type(m_position, val); } - value_type PISA_NOINLINE slow_next_geq(uint64_t lower_bound) - { + value_type PISA_NOINLINE slow_next_geq(uint64_t lower_bound) { if (m_partitions == 1) { if (lower_bound < m_cur_base) { return move(0); @@ -276,8 +272,7 @@ struct partitioned_sequence { return next_geq(lower_bound); } - void switch_partition(uint64_t partition) - { + void switch_partition(uint64_t partition) { assert(m_partitions > 1); uint64_t endpoint = partition != 0U @@ -298,11 +293,8 @@ struct partitioned_sequence { m_cur_base = m_upper_bounds.prev_value() + (partition != 0U ? 1 : 0); m_partition_enum = base_sequence_enumerator( - *m_bv, - partition_begin, - m_cur_upper_bound - m_cur_base + 1, - m_cur_end - m_cur_begin, - m_params); + *m_bv, partition_begin, m_cur_upper_bound - m_cur_base + 1, m_cur_end - m_cur_begin, m_params + ); } global_parameters m_params; @@ -338,8 +330,8 @@ struct partitioned_sequence { uint64_t fix_cost = 64, double eps1 = 0.03, double eps2 = 0.3, - double eps3 = 0.01) - { + double eps3 = 0.01 + ) { std::vector partition; if (base_sequence_type::bitsize(params, universe, n) < 2 * fix_cost) { @@ -386,7 +378,8 @@ struct partitioned_sequence { superblock_size, cost_fun, eps1, - eps2); + eps2 + ); superblock_partition.reserve(opt.partition.size()); for (auto& endpoint: opt.partition) { @@ -402,7 +395,8 @@ struct partitioned_sequence { for (const auto& superblock_partition: superblock_partitions) { partition.insert( - partition.end(), superblock_partition.begin(), superblock_partition.end()); + partition.end(), superblock_partition.begin(), superblock_partition.end() + ); } return partition; diff --git a/include/pisa/sequence/positive_sequence.hpp b/include/pisa/sequence/positive_sequence.hpp index ea8a0bad..838fe4f9 100644 --- a/include/pisa/sequence/positive_sequence.hpp +++ b/include/pisa/sequence/positive_sequence.hpp @@ -17,13 +17,14 @@ struct positive_sequence { Iterator begin, uint64_t universe, uint64_t n, - global_parameters const& params) - { + global_parameters const& params + ) { assert(n > 0); auto cumulative_begin = make_function_iterator( std::make_pair(uint64_t(0), begin), +[](std::pair& state) { state.first += *state.second++; }, - +[](std::pair const& state) { return state.first + *state.second; }); + +[](std::pair const& state) { return state.first + *state.second; } + ); base_sequence_type::write(bvb, cumulative_begin, universe, n, params); } @@ -38,12 +39,11 @@ struct positive_sequence { uint64_t offset, uint64_t universe, uint64_t n, - global_parameters const& params) - : m_base_enum(bv, offset, universe, n, params), m_position(m_base_enum.size()) - {} + global_parameters const& params + ) + : m_base_enum(bv, offset, universe, n, params), m_position(m_base_enum.size()) {} - value_type move(uint64_t position) - { + value_type move(uint64_t position) { // we cache m_position and m_cur to avoid the call overhead in // the most common cases uint64_t prev = m_cur; diff --git a/include/pisa/sequence/strict_sequence.hpp b/include/pisa/sequence/strict_sequence.hpp index 5d8f6e29..a3565e65 100644 --- a/include/pisa/sequence/strict_sequence.hpp +++ b/include/pisa/sequence/strict_sequence.hpp @@ -21,8 +21,7 @@ struct strict_sequence { static const uint64_t type_bits = 1; // all_ones is implicit - static global_parameters strict_params(global_parameters params) - { + static global_parameters strict_params(global_parameters params) { // we do not need to index the zeros params.ef_log_sampling0 = 63; params.rb_log_rank1_sampling = 63; @@ -30,8 +29,7 @@ struct strict_sequence { } static PISA_FLATTEN_FUNC uint64_t - bitsize(global_parameters const& params, uint64_t universe, uint64_t n) - { + bitsize(global_parameters const& params, uint64_t universe, uint64_t n) { uint64_t best_cost = all_ones_sequence::bitsize(params, universe, n); auto sparams = strict_params(params); @@ -54,8 +52,8 @@ struct strict_sequence { Iterator begin, uint64_t universe, uint64_t n, - global_parameters const& params) - { + global_parameters const& params + ) { auto sparams = strict_params(params); uint64_t best_cost = all_ones_sequence::bitsize(params, universe, n); int best_type = all_ones; @@ -97,8 +95,8 @@ struct strict_sequence { uint64_t offset, uint64_t universe, uint64_t n, - global_parameters const& params) - { + global_parameters const& params + ) { auto sparams = strict_params(params); if (all_ones_sequence::bitsize(params, universe, n) == 0) { @@ -113,8 +111,8 @@ struct strict_sequence { strict_elias_fano::enumerator(bv, offset + type_bits, universe, n, sparams); break; case ranked_bitvector: - m_enumerator = compact_ranked_bitvector::enumerator( - bv, offset + type_bits, universe, n, sparams); + m_enumerator = + compact_ranked_bitvector::enumerator(bv, offset + type_bits, universe, n, sparams); break; case all_ones: m_enumerator = @@ -124,23 +122,19 @@ struct strict_sequence { } } - value_type move(uint64_t position) - { + value_type move(uint64_t position) { return std::visit([&position](auto&& e) { return e.move(position); }, m_enumerator); } - value_type next() - { + value_type next() { return std::visit([](auto&& e) { return e.next(); }, m_enumerator); } - uint64_t size() const - { + uint64_t size() const { return std::visit([](auto&& e) { return e.size(); }, m_enumerator); } - uint64_t prev_value() const - { + uint64_t prev_value() const { return std::visit([](auto&& e) { return e.prev_value(); }, m_enumerator); } diff --git a/include/pisa/sequence/uniform_partitioned_sequence.hpp b/include/pisa/sequence/uniform_partitioned_sequence.hpp index 684736cb..94f8b471 100644 --- a/include/pisa/sequence/uniform_partitioned_sequence.hpp +++ b/include/pisa/sequence/uniform_partitioned_sequence.hpp @@ -21,8 +21,8 @@ struct uniform_partitioned_sequence { Iterator begin, uint64_t universe, uint64_t n, - global_parameters const& params) - { + global_parameters const& params + ) { assert(n > 0); uint64_t partition_size = uint64_t(1) << params.log_partition_size; size_t partitions = ceil_div(n, partition_size); @@ -51,7 +51,8 @@ struct uniform_partitioned_sequence { } base_sequence_type::write( - bvb, cur_partition.begin(), cur_partition.back() + 1, cur_partition.size(), params); + bvb, cur_partition.begin(), cur_partition.back() + 1, cur_partition.size(), params + ); } else { bit_vector_builder bv_sequences; std::vector endpoints; @@ -79,7 +80,8 @@ struct uniform_partitioned_sequence { cur_partition.begin(), cur_partition.back() + 1, cur_partition.size(), // XXX skip last one? - params); + params + ); endpoints.push_back(bv_sequences.size()); upper_bounds.push_back(upper_bound); cur_base = upper_bound + 1; @@ -87,7 +89,8 @@ struct uniform_partitioned_sequence { bit_vector_builder bv_upper_bounds; compact_elias_fano::write( - bv_upper_bounds, upper_bounds.begin(), universe, partitions + 1, params); + bv_upper_bounds, upper_bounds.begin(), universe, partitions + 1, params + ); uint64_t endpoint_bits = ceil_log2(bv_sequences.size() + 1); write_gamma(bvb, endpoint_bits); @@ -112,9 +115,9 @@ struct uniform_partitioned_sequence { uint64_t offset, uint64_t universe, uint64_t n, - global_parameters const& params) - : m_params(params), m_size(n), m_universe(universe), m_bv(&bv) - { + global_parameters const& params + ) + : m_params(params), m_size(n), m_universe(universe), m_bv(&bv) { bit_vector::enumerator it(bv, offset); m_partitions = read_gamma_nonzero(it); if (m_partitions == 1) { @@ -138,8 +141,8 @@ struct uniform_partitioned_sequence { m_endpoint_bits = read_gamma(it); uint64_t cur_offset = it.position(); - m_upper_bounds = compact_elias_fano::enumerator( - bv, cur_offset, universe, m_partitions + 1, params); + m_upper_bounds = + compact_elias_fano::enumerator(bv, cur_offset, universe, m_partitions + 1, params); cur_offset += compact_elias_fano::offsets(0, universe, m_partitions + 1, params).end; m_endpoints_offset = cur_offset; @@ -153,8 +156,7 @@ struct uniform_partitioned_sequence { slow_move(); } - value_type PISA_ALWAYSINLINE move(uint64_t position) - { + value_type PISA_ALWAYSINLINE move(uint64_t position) { assert(position <= size()); m_position = position; @@ -168,8 +170,7 @@ struct uniform_partitioned_sequence { // note: this is instantiated oly if BaseSequence has next_geq template > - value_type PISA_ALWAYSINLINE next_geq(uint64_t lower_bound) - { + value_type PISA_ALWAYSINLINE next_geq(uint64_t lower_bound) { if (PISA_LIKELY(lower_bound >= m_cur_base && lower_bound <= m_cur_upper_bound)) { auto val = m_partition_enum.next_geq(lower_bound - m_cur_base); m_position = m_cur_begin + val.first; @@ -178,8 +179,7 @@ struct uniform_partitioned_sequence { return slow_next_geq(lower_bound); } - value_type PISA_ALWAYSINLINE next() - { + value_type PISA_ALWAYSINLINE next() { ++m_position; if (PISA_LIKELY(m_position < m_cur_end)) { @@ -191,8 +191,7 @@ struct uniform_partitioned_sequence { uint64_t size() const { return m_size; } - uint64_t prev_value() const - { + uint64_t prev_value() const { if (PISA_UNLIKELY(m_position == m_cur_begin)) { return m_cur_partition != 0U ? m_cur_base - 1 : 0; } @@ -205,8 +204,7 @@ struct uniform_partitioned_sequence { // next(), causing the code to grow. Since next is called in very // tight loops, on microbenchmarks this causes an improvement of // about 3ns on my i7 3Ghz - value_type PISA_NOINLINE slow_next() - { + value_type PISA_NOINLINE slow_next() { if (PISA_UNLIKELY(m_position == m_size)) { assert(m_cur_partition == m_partitions - 1); auto val = m_partition_enum.next(); @@ -220,8 +218,7 @@ struct uniform_partitioned_sequence { return value_type(m_position, val); } - value_type PISA_NOINLINE slow_move() - { + value_type PISA_NOINLINE slow_move() { if (m_position == size()) { if (m_partitions > 1) { switch_partition(m_partitions - 1); @@ -235,8 +232,7 @@ struct uniform_partitioned_sequence { return value_type(m_position, val); } - value_type PISA_NOINLINE slow_next_geq(uint64_t lower_bound) - { + value_type PISA_NOINLINE slow_next_geq(uint64_t lower_bound) { if (m_partitions == 1) { if (lower_bound < m_cur_base) { return move(0); @@ -257,13 +253,11 @@ struct uniform_partitioned_sequence { return next_geq(lower_bound); } - void switch_partition(uint64_t partition) - { + void switch_partition(uint64_t partition) { assert(m_partitions > 1); uint64_t endpoint = partition != 0U - ? m_bv->get_bits( - m_endpoints_offset + (partition - 1) * m_endpoint_bits, m_endpoint_bits) + ? m_bv->get_bits(m_endpoints_offset + (partition - 1) * m_endpoint_bits, m_endpoint_bits) : 0; m_bv->data().prefetch((m_sequences_offset + endpoint) / 64); @@ -280,7 +274,8 @@ struct uniform_partitioned_sequence { m_sequences_offset + endpoint, m_cur_upper_bound - m_cur_base + 1, m_cur_end - m_cur_begin, - m_params); + m_params + ); } global_parameters m_params; diff --git a/include/pisa/sequence_collection.hpp b/include/pisa/sequence_collection.hpp index 0a66304d..310277d2 100644 --- a/include/pisa/sequence_collection.hpp +++ b/include/pisa/sequence_collection.hpp @@ -18,24 +18,22 @@ class sequence_collection { class builder { public: explicit builder(global_parameters const& params) - : m_queue(1 << 24), m_params(params), m_sequences(params) - {} + : m_queue(1 << 24), m_params(params), m_sequences(params) {} template - void add_sequence(Iterator begin, uint64_t last_element, uint64_t n) - { + void add_sequence(Iterator begin, uint64_t last_element, uint64_t n) { if (!n) { throw std::invalid_argument("Sequence must be nonempty"); } // make_shared does not seem to work std::shared_ptr> ptr( - new sequence_adder(*this, begin, last_element, n)); + new sequence_adder(*this, begin, last_element, n) + ); m_queue.add_job(ptr, n); } - void build(sequence_collection& sq) - { + void build(sequence_collection& sq) { m_queue.complete(); sq.m_params = m_params; m_sequences.build(sq.m_sequences); @@ -45,18 +43,15 @@ class sequence_collection { template struct sequence_adder: semiasync_queue::job { sequence_adder(builder& b, Iterator begin, uint64_t last_element, uint64_t n) - : b(b), begin(begin), last_element(last_element), n(n) - {} + : b(b), begin(begin), last_element(last_element), n(n) {} - virtual void prepare() - { + virtual void prepare() { // store approximation of the universe as smallest power of two // that can represent last_element uint64_t universe_bits = ceil_log2(last_element); write_gamma(bits, universe_bits); write_gamma_nonzero(bits, n); - IndexedSequence::write( - bits, begin, (uint64_t(1) << universe_bits) + 1, n, b.m_params); + IndexedSequence::write(bits, begin, (uint64_t(1) << universe_bits) + 1, n, b.m_params); } virtual void commit() { b.m_sequences.append(bits); } @@ -75,27 +70,25 @@ class sequence_collection { size_t size() const { return m_sequences.size(); } - enumerator_type operator[](size_t i) const - { + enumerator_type operator[](size_t i) const { assert(i < size()); auto it = m_sequences.get(m_params, i); uint64_t universe_bits = read_gamma(it); uint64_t n = read_gamma_nonzero(it); return enumerator_type( - m_sequences.bits(), it.position(), (uint64_t(1) << universe_bits) + 1, n, m_params); + m_sequences.bits(), it.position(), (uint64_t(1) << universe_bits) + 1, n, m_params + ); } - void swap(sequence_collection& other) - { + void swap(sequence_collection& other) { std::swap(m_params, other.m_params); std::swap(m_size, other.m_size); m_sequences.swap(other.m_sequences); } template - void map(Visitor& visit) - { + void map(Visitor& visit) { visit(m_params, "m_params")(m_size, "m_size")(m_sequences, "m_sequences"); } diff --git a/include/pisa/sharding.hpp b/include/pisa/sharding.hpp index b6c1e6f0..27c6dd5d 100644 --- a/include/pisa/sharding.hpp +++ b/include/pisa/sharding.hpp @@ -25,13 +25,12 @@ auto mapping_from_files(std::string const& full_titles, gsl::span VecMap; auto create_random_mapping( - int document_count, int shard_count, std::optional seed = std::nullopt) - -> VecMap; + int document_count, int shard_count, std::optional seed = std::nullopt +) -> VecMap; auto create_random_mapping( - std::string const& input_basename, - int shard_count, - std::optional seed = std::nullopt) -> VecMap; + std::string const& input_basename, int shard_count, std::optional seed = std::nullopt +) -> VecMap; void copy_sequence(std::istream& is, std::ostream& os); @@ -39,17 +38,20 @@ void rearrange_sequences( std::string const& input_basename, std::string const& output_basename, VecMap& mapping, - std::optional shard_count = std::nullopt); + std::optional shard_count = std::nullopt +); void process_shard( std::string const& input_basename, std::string const& output_basename, Shard_Id shard_id, - VecMap const& terms); + VecMap const& terms +); void partition_fwd_index( std::string const& input_basename, std::string const& output_basename, - VecMap& mapping); + VecMap& mapping +); } // namespace pisa diff --git a/include/pisa/taily_stats.hpp b/include/pisa/taily_stats.hpp index f3219d42..05d92371 100644 --- a/include/pisa/taily_stats.hpp +++ b/include/pisa/taily_stats.hpp @@ -25,35 +25,33 @@ class TailyStats { public: explicit TailyStats(MemorySource source) : m_source(std::move(source)) {} - static auto from_mapped(std::string const& path) -> TailyStats - { + static auto from_mapped(std::string const& path) -> TailyStats { return TailyStats(MemorySource::mapped_file(path)); } [[nodiscard]] auto num_documents() const -> std::uint64_t { return read_at(0); } [[nodiscard]] auto num_terms() const -> std::uint64_t { return read_at(8); } - [[nodiscard]] auto term_stats(term_id_type term_id) const -> taily::Feature_Statistics - { + [[nodiscard]] auto term_stats(term_id_type term_id) const -> taily::Feature_Statistics { std::size_t offset = 16 + term_id * 24; auto expected_value = read_at(offset); auto variance = read_at(offset + sizeof(double)); auto frequency = read_at(offset + 2 * sizeof(double)); return taily::Feature_Statistics{expected_value, variance, frequency}; } - [[nodiscard]] auto query_stats(pisa::Query const& query) const -> taily::Query_Statistics - { + [[nodiscard]] auto query_stats(pisa::Query const& query) const -> taily::Query_Statistics { std::vector stats; std::transform( - query.terms.begin(), query.terms.end(), std::back_inserter(stats), [this](auto&& term_id) { - return this->term_stats(term_id); - }); + query.terms.begin(), + query.terms.end(), + std::back_inserter(stats), + [this](auto&& term_id) { return this->term_stats(term_id); } + ); return taily::Query_Statistics{std::move(stats), static_cast(num_documents())}; } private: template - [[nodiscard]] PISA_ALWAYSINLINE auto read_at(std::size_t pos) const -> T - { + [[nodiscard]] PISA_ALWAYSINLINE auto read_at(std::size_t pos) const -> T { static_assert(std::is_pod::value, "The value type must be POD."); T value{}; auto bytes = this->bytes(pos, sizeof(T)); @@ -62,8 +60,7 @@ class TailyStats { } [[nodiscard]] PISA_ALWAYSINLINE auto bytes(std::size_t start, std::size_t size) const - -> gsl::span - { + -> gsl::span { try { return m_source.subspan(start, size); } catch (std::out_of_range const&) { @@ -71,7 +68,8 @@ class TailyStats { "Tried to read bytes {}-{} but memory source is of size {}", start, start + size, - m_source.size())); + m_source.size() + )); } } @@ -82,8 +80,7 @@ class TailyStats { /// `scorer`. template [[nodiscard]] auto extract_feature_stats(pisa::binary_freq_collection const& collection, Scorer scorer) - -> std::vector -{ + -> std::vector { std::vector term_stats; { pisa::progress progress("Processing posting lists", collection.size()); @@ -108,10 +105,8 @@ template } void write_feature_stats( - gsl::span stats, - std::size_t num_documents, - std::string const& output_path) -{ + gsl::span stats, std::size_t num_documents, std::string const& output_path +) { std::ofstream ofs(output_path); ofs.write(reinterpret_cast(&num_documents), sizeof(num_documents)); std::size_t num_terms = stats.size(); @@ -129,19 +124,21 @@ void taily_score_shards( std::vector<::pisa::Query> const& global_queries, VecMap> const& shard_queries, std::size_t k, - Fn func) -{ + Fn func +) { if (shard_stats_paths.size() != shard_queries.size()) { throw std::invalid_argument(fmt::format( "Number of discovered shard stats paths ({}) does not match number of " "parsed query lists ({})", shard_stats_paths.size(), - shard_queries.size())); + shard_queries.size() + )); } std::for_each(shard_queries.begin(), shard_queries.end(), [&global_queries](auto&& sq) { if (global_queries.size() != sq.size()) { throw std::invalid_argument( - "Global queries and shard queries do not all have the same size."); + "Global queries and shard queries do not all have the same size." + ); } }); @@ -151,7 +148,8 @@ void taily_score_shards( shard_stats_paths.begin(), shard_stats_paths.end(), std::back_inserter(shard_stats), - [](auto&& path) { return pisa::TailyStats::from_mapped(path); }); + [](auto&& path) { return pisa::TailyStats::from_mapped(path); } + ); for (std::size_t query_idx = 0; query_idx < global_queries.size(); query_idx += 1) { auto global = global_stats.query_stats(global_queries[query_idx]); std::vector shards; @@ -160,10 +158,13 @@ void taily_score_shards( shard_stats.end(), shard_queries.begin(), std::back_inserter(shards), - [query_idx]( - auto&& shard, auto&& queries) { return shard.query_stats(queries[query_idx]); }); - auto [scores, time] = run_with_timer_ret( - [&] { return taily::score_shards(global, shards, k); }); + [query_idx](auto&& shard, auto&& queries) { + return shard.query_stats(queries[query_idx]); + } + ); + auto [scores, time] = run_with_timer_ret([&] { + return taily::score_shards(global, shards, k); + }); func(scores, time); } } diff --git a/include/pisa/text_analyzer.hpp b/include/pisa/text_analyzer.hpp index 835afe16..1b6d0446 100644 --- a/include/pisa/text_analyzer.hpp +++ b/include/pisa/text_analyzer.hpp @@ -21,14 +21,12 @@ class TextAnalyzer { void add_token_filter(std::unique_ptr token_filter); template - void emplace_text_filter(Args... args) - { + void emplace_text_filter(Args... args) { m_text_filters.emplace_back(std::make_unique(args...)); } template - void emplace_token_filter(Args... args) - { + void emplace_token_filter(Args... args) { m_token_filters.emplace_back(std::make_unique(args...)); } diff --git a/include/pisa/timer.hpp b/include/pisa/timer.hpp index 53166c2e..c77c86b6 100644 --- a/include/pisa/timer.hpp +++ b/include/pisa/timer.hpp @@ -17,8 +17,7 @@ namespace pisa { /// }); /// ``` template -Unit run_with_timer(std::function fn) -{ +Unit run_with_timer(std::function fn) { auto start_time = std::chrono::steady_clock::now(); fn(); auto end_time = std::chrono::steady_clock::now(); @@ -34,8 +33,7 @@ Unit run_with_timer(std::function fn) /// [](const auto& time) { log(time); }); /// ``` template -void run_with_timer(std::function fn, Handler handler) -{ +void run_with_timer(std::function fn, Handler handler) { auto start_time = std::chrono::steady_clock::now(); fn(); auto end_time = std::chrono::steady_clock::now(); @@ -59,8 +57,7 @@ struct TimedResult { /// [](const auto& time) { log(time); }); /// ``` template -auto run_with_timer_ret(Function fn, Handler handler) -> decltype(fn()) -{ +auto run_with_timer_ret(Function fn, Handler handler) -> decltype(fn()) { auto start_time = std::chrono::steady_clock::now(); auto result = fn(); auto end_time = std::chrono::steady_clock::now(); @@ -79,19 +76,18 @@ auto run_with_timer_ret(Function fn, Handler handler) -> decltype(fn()) /// []() { return get_stuff(); }); /// ``` template -auto run_with_timer_ret(Function fn) -> TimedResult -{ +auto run_with_timer_ret(Function fn) -> TimedResult { auto start_time = std::chrono::steady_clock::now(); auto result = fn(); auto end_time = std::chrono::steady_clock::now(); return TimedResult{ - result, std::chrono::duration_cast(end_time - start_time)}; + result, std::chrono::duration_cast(end_time - start_time) + }; } /// Formats time as hh:mm:ss:mil template -std::string format_time(Unit time) -{ +std::string format_time(Unit time) { int64_t hours = std::chrono::floor(time).count(); int64_t minutes = std::chrono::floor(time).count(); int64_t seconds = std::chrono::floor(time).count(); diff --git a/include/pisa/tokenizer.hpp b/include/pisa/tokenizer.hpp index e87d6782..64d3c997 100644 --- a/include/pisa/tokenizer.hpp +++ b/include/pisa/tokenizer.hpp @@ -36,7 +36,6 @@ using token_type = using lexer_type = lex::lexertl::actor_lexer; class EnglishTokenStream: public TokenStream { - using iterator = typename lexer_type::iterator_type; CowString m_input; diff --git a/include/pisa/topk_queue.hpp b/include/pisa/topk_queue.hpp index f53cada7..6cf22bed 100644 --- a/include/pisa/topk_queue.hpp +++ b/include/pisa/topk_queue.hpp @@ -25,8 +25,7 @@ struct topk_queue { /// the k-th highest score would be, then some top-k result will be missing /// from the final result, replaced by lower-scoring documents. explicit topk_queue(std::size_t k, Score initial_threshold = 0.0F) - : m_k(k), m_initial_threshold(initial_threshold) - { + : m_k(k), m_initial_threshold(initial_threshold) { m_effective_threshold = std::nextafter(m_initial_threshold, 0.0F); m_q.reserve(m_k + 1); } @@ -43,8 +42,7 @@ struct topk_queue { /// will be returned. Otherwise, the entry will be inserted, and `true` returned. /// If the heap is full, the entry with the lowest value will be removed, i.e., /// the heap will maintain its size. - auto insert(Score score, DocId docid = 0) -> bool - { + auto insert(Score score, DocId docid = 0) -> bool { if (PISA_UNLIKELY(not would_enter(score))) { return false; } @@ -71,14 +69,14 @@ struct topk_queue { /// /// After calling this function, the heap should be no longer modified, as /// the heap order will not be preserved. - void finalize() - { + void finalize() { std::sort_heap(m_q.begin(), m_q.end(), min_heap_order); size_t size = std::lower_bound( m_q.begin(), m_q.end(), 0, - [](std::pair l, Score r) { return l.first > r; }) + [](std::pair l, Score r) { return l.first > r; } + ) - m_q.begin(); m_q.resize(size); } @@ -91,30 +89,26 @@ struct topk_queue { /// Returns the threshold based on the heap state, defined as the score of the `k`-th document, /// or 0.0 if the heap is not full. - [[nodiscard]] auto true_threshold() const noexcept -> Score - { + [[nodiscard]] auto true_threshold() const noexcept -> Score { return capacity() == size() ? m_q.front().first : 0.0; } /// Returns the threshold set at the start (by default 0.0). [[nodiscard]] auto initial_threshold() const noexcept -> Score { return m_initial_threshold; } /// Returns the maximum of `true_threshold()` and `initial_threshold()`. - [[nodiscard]] auto effective_threshold() const noexcept -> Score - { + [[nodiscard]] auto effective_threshold() const noexcept -> Score { return m_effective_threshold; } /// Returns `true` if no documents have been missed up to this point. /// The reason why document could be missed is forcing a threshold that is too high /// (overestimated). - [[nodiscard]] auto is_safe() noexcept -> bool - { + [[nodiscard]] auto is_safe() noexcept -> bool { return m_effective_threshold >= m_initial_threshold; } /// Empties the queue and resets the threshold to 0 (or the given value). - void clear(Score initial_threshold = 0.0) noexcept - { + void clear(Score initial_threshold = 0.0) noexcept { m_q.clear(); m_effective_threshold = std::nextafter(m_initial_threshold, 0.0); m_initial_threshold = initial_threshold; @@ -128,8 +122,7 @@ struct topk_queue { private: [[nodiscard]] constexpr static auto - min_heap_order(entry_type const& lhs, entry_type const& rhs) noexcept -> bool - { + min_heap_order(entry_type const& lhs, entry_type const& rhs) noexcept -> bool { return lhs.first > rhs.first; } @@ -139,8 +132,7 @@ struct topk_queue { /// /// For justification for using it instead of STL functions, see /// https://github.com/pisa-engine/pisa/issues/504. - static void sift_down(entry_iterator_type first, entry_iterator_type last) - { + static void sift_down(entry_iterator_type first, entry_iterator_type last) { auto cmp = [first](std::size_t lhs, std::size_t rhs) { return (first + lhs)->first > (first + rhs)->first; }; diff --git a/include/pisa/type_safe.hpp b/include/pisa/type_safe.hpp index a5f098ae..474078a1 100644 --- a/include/pisa/type_safe.hpp +++ b/include/pisa/type_safe.hpp @@ -26,8 +26,7 @@ class Integer { template < typename U, typename std::enable_if && !std::is_same_v, bool>::type = true> - explicit operator U() const - { + explicit operator U() const { return m_val; } @@ -43,31 +42,26 @@ class Integer { [[nodiscard]] bool operator>(Integer const& other) const { return m_val > other.m_val; } [[nodiscard]] bool operator>=(Integer const& other) const { return m_val >= other.m_val; } - Integer& operator++() - { + Integer& operator++() { ++m_val; return *this; } Integer operator++(int) { return Integer{m_val++}; } [[nodiscard]] Integer operator+(T difference) const { return Integer(m_val + difference); } - Integer& operator+=(T difference) - { + Integer& operator+=(T difference) { m_val += difference; return *this; } - [[nodiscard]] Integer operator+(Integer const& other) const - { + [[nodiscard]] Integer operator+(Integer const& other) const { return Integer(m_val + other.m_val); } - Integer& operator+=(Integer const& other) - { + Integer& operator+=(Integer const& other) { m_val += other.m_val; return *this; } Integer operator-(Integer const& other) const { return Integer(m_val - other.m_val); } - Integer& operator-=(Integer const& other) - { + Integer& operator-=(Integer const& other) { m_val -= other.m_val; return *this; } @@ -82,8 +76,7 @@ namespace std { template struct hash> { - constexpr auto operator()(pisa::Integer const& key) const noexcept - { + constexpr auto operator()(pisa::Integer const& key) const noexcept { return hash{}(static_cast(key)); } }; @@ -93,8 +86,7 @@ struct hash> { namespace pisa { template -std::ostream& operator<<(std::ostream& os, Integer id) -{ +std::ostream& operator<<(std::ostream& os, Integer id) { return os << static_cast(id); } @@ -139,8 +131,7 @@ namespace literals { template struct fmt::formatter>: public fmt::formatter { template - auto format(pisa::Integer value, FormatContext& ctx) const - { + auto format(pisa::Integer value, FormatContext& ctx) const { return formatter::format(value.as_int(), ctx); } }; diff --git a/include/pisa/util/block_profiler.hpp b/include/pisa/util/block_profiler.hpp index c6c8d2b1..5994f7f2 100644 --- a/include/pisa/util/block_profiler.hpp +++ b/include/pisa/util/block_profiler.hpp @@ -13,8 +13,7 @@ class block_profiler { block_profiler(block_profiler&&) = delete; block_profiler operator=(block_profiler const&) = delete; block_profiler operator=(block_profiler&&) = delete; - ~block_profiler() - { + ~block_profiler() { std::lock_guard lock(m_mutex); for (auto const& it: m_block_freqs) { delete[] it.second.second; @@ -23,14 +22,12 @@ class block_profiler { using counter_type = std::atomic_uint_fast32_t; - static block_profiler& get() - { + static block_profiler& get() { static block_profiler instance; return instance; } - static counter_type* open_list(uint32_t term_id, uint32_t blocks) - { + static counter_type* open_list(uint32_t term_id, uint32_t blocks) { block_profiler& instance = get(); std::lock_guard lock(instance.m_mutex); auto& v = instance.m_block_freqs[term_id]; @@ -42,8 +39,7 @@ class block_profiler { return v.second; } - static void dump(std::ostream& os) - { + static void dump(std::ostream& os) { block_profiler& instance = get(); std::lock_guard lock(instance.m_mutex); diff --git a/include/pisa/util/broadword.hpp b/include/pisa/util/broadword.hpp index 7a6ff11b..70c25277 100644 --- a/include/pisa/util/broadword.hpp +++ b/include/pisa/util/broadword.hpp @@ -25,38 +25,34 @@ namespace pisa { namespace broadword { static const uint64_t magic_mask_5 = 0x0000FFFF0000FFFFULL; static const uint64_t magic_mask_6 = 0x00000000FFFFFFFFULL; - inline uint64_t leq_step_8(uint64_t x, uint64_t y) - { + inline uint64_t leq_step_8(uint64_t x, uint64_t y) { return ((((y | msbs_step_8) - (x & ~msbs_step_8)) ^ (x ^ y)) & msbs_step_8) >> 7; } - inline uint64_t uleq_step_8(uint64_t x, uint64_t y) - { + inline uint64_t uleq_step_8(uint64_t x, uint64_t y) { return (((((y | msbs_step_8) - (x & ~msbs_step_8)) ^ (x ^ y)) ^ (x & ~y)) & msbs_step_8) >> 7; } - inline uint64_t zcompare_step_8(uint64_t x) - { + inline uint64_t zcompare_step_8(uint64_t x) { return ((x | ((x | msbs_step_8) - ones_step_8)) & msbs_step_8) >> 7; } - inline uint64_t uleq_step_9(uint64_t x, uint64_t y) - { + inline uint64_t uleq_step_9(uint64_t x, uint64_t y) { return (((((y | msbs_step_9) - (x & ~msbs_step_9)) | (x ^ y)) ^ (x & ~y)) & msbs_step_9) >> 8; } - inline uint64_t byte_counts(uint64_t x) - { + inline uint64_t byte_counts(uint64_t x) { x = x - ((x & 0xa * ones_step_4) >> 1); x = (x & 3 * ones_step_4) + ((x >> 2) & 3 * ones_step_4); x = (x + (x >> 4)) & 0x0f * ones_step_8; return x; } - inline uint64_t bytes_sum(uint64_t x) { return x * ones_step_8 >> 56; } + inline uint64_t bytes_sum(uint64_t x) { + return x * ones_step_8 >> 56; + } - inline uint64_t popcount(uint64_t x) - { + inline uint64_t popcount(uint64_t x) { #if USE_POPCNT return intrinsics::popcount(x); #else @@ -64,18 +60,18 @@ namespace pisa { namespace broadword { #endif } - inline uint64_t reverse_bytes(uint64_t x) { return intrinsics::byteswap64(x); } + inline uint64_t reverse_bytes(uint64_t x) { + return intrinsics::byteswap64(x); + } - inline uint64_t reverse_bits(uint64_t x) - { + inline uint64_t reverse_bits(uint64_t x) { x = ((x >> 1) & magic_mask_1) | ((x & magic_mask_1) << 1); x = ((x >> 2) & magic_mask_2) | ((x & magic_mask_2) << 2); x = ((x >> 4) & magic_mask_3) | ((x & magic_mask_3) << 4); return reverse_bytes(x); } - inline uint64_t select_in_word(const uint64_t x, const uint64_t k) - { + inline uint64_t select_in_word(const uint64_t x, const uint64_t k) { assert(k < popcount(x)); uint64_t byte_sums = byte_counts(x) * ones_step_8; @@ -91,8 +87,7 @@ namespace pisa { namespace broadword { return place + tables::select_in_byte[((x >> place) & 0xFF) | (byte_rank << 8)]; } - inline uint64_t same_msb(uint64_t x, uint64_t y) - { + inline uint64_t same_msb(uint64_t x, uint64_t y) { return static_cast((x ^ y) <= (x & y)); } @@ -101,37 +96,33 @@ namespace pisa { namespace broadword { static const uint8_t debruijn64_mapping[64] = { 63, 0, 58, 1, 59, 47, 53, 2, 60, 39, 48, 27, 54, 33, 42, 3, 61, 51, 37, 40, 49, 18, 28, 20, 55, 30, 34, 11, 43, 14, 22, 4, 62, 57, 46, 52, 38, 26, 32, 41, 50, 36, 17, 19, - 29, 10, 13, 21, 56, 45, 25, 31, 35, 16, 9, 12, 44, 24, 15, 8, 23, 7, 6, 5}; + 29, 10, 13, 21, 56, 45, 25, 31, 35, 16, 9, 12, 44, 24, 15, 8, 23, 7, 6, 5 + }; static const uint64_t debruijn64 = 0x07EDD5E59A4E28C2ULL; } // namespace detail // return the position of the single bit set in the word x - inline uint8_t bit_position(uint64_t x) - { + inline uint8_t bit_position(uint64_t x) { assert(popcount(x) == 1); return detail::debruijn64_mapping[(x * detail::debruijn64) >> 58]; } - inline uint8_t msb(uint64_t x, unsigned long& ret) - { + inline uint8_t msb(uint64_t x, unsigned long& ret) { return static_cast(intrinsics::bsr64(&ret, x)); } - inline uint8_t msb(uint64_t x) - { + inline uint8_t msb(uint64_t x) { assert(x); unsigned long ret = -1U; msb(x, ret); return (uint8_t)ret; } - inline uint8_t lsb(uint64_t x, unsigned long& ret) - { + inline uint8_t lsb(uint64_t x, unsigned long& ret) { return static_cast(intrinsics::bsf64(&ret, x)); } - inline uint8_t lsb(uint64_t x) - { + inline uint8_t lsb(uint64_t x) { assert(x); unsigned long ret = -1U; lsb(x, ret); diff --git a/include/pisa/util/do_not_optimize_away.hpp b/include/pisa/util/do_not_optimize_away.hpp index a5722e82..26db192e 100644 --- a/include/pisa/util/do_not_optimize_away.hpp +++ b/include/pisa/util/do_not_optimize_away.hpp @@ -8,8 +8,7 @@ inline void do_not_optimize_dependency_sink(const void*) {} #pragma optimize("", on) template -void do_not_optimize_away(const T& datum) -{ +void do_not_optimize_away(const T& datum) { doNotOptimizeDependencySink(&datum); } @@ -26,15 +25,13 @@ namespace detail { template auto do_not_optimize_away(const T& datum) -> - typename std::enable_if::value>::type -{ + typename std::enable_if::value>::type { asm volatile("" ::"r"(datum)); } template auto do_not_optimize_away(const T& datum) -> - typename std::enable_if::value>::type -{ + typename std::enable_if::value>::type { asm volatile("" ::"m"(datum) : "memory"); } #endif diff --git a/include/pisa/util/index_build_utils.hpp b/include/pisa/util/index_build_utils.hpp index d5728722..be3c63b8 100644 --- a/include/pisa/util/index_build_utils.hpp +++ b/include/pisa/util/index_build_utils.hpp @@ -12,8 +12,8 @@ namespace pisa { template void get_size_stats( - freq_index& coll, uint64_t& docs_size, uint64_t& freqs_size) -{ + freq_index& coll, uint64_t& docs_size, uint64_t& freqs_size +) { auto size_tree = mapper::size_tree_of(coll); size_tree->dump(); for (auto const& node: size_tree->children) { @@ -26,8 +26,9 @@ void get_size_stats( } template -void get_size_stats(block_freq_index& coll, uint64_t& docs_size, uint64_t& freqs_size) -{ +void get_size_stats( + block_freq_index& coll, uint64_t& docs_size, uint64_t& freqs_size +) { auto size_tree = mapper::size_tree_of(coll); size_tree->dump(); uint64_t total_size = 0; @@ -45,8 +46,7 @@ void get_size_stats(block_freq_index& coll, uint64_t& docs_ } template -void dump_stats(Collection& coll, std::string const& type, uint64_t postings) -{ +void dump_stats(Collection& coll, std::string const& type, uint64_t postings) { uint64_t docs_size = 0, freqs_size = 0; get_size_stats(coll, docs_size, freqs_size); diff --git a/include/pisa/util/intrinsics.hpp b/include/pisa/util/intrinsics.hpp index b8db2484..946b8499 100644 --- a/include/pisa/util/intrinsics.hpp +++ b/include/pisa/util/intrinsics.hpp @@ -18,8 +18,7 @@ namespace pisa { namespace intrinsics { - __INTRIN_INLINE uint64_t byteswap64(uint64_t value) - { + __INTRIN_INLINE uint64_t byteswap64(uint64_t value) { #if defined(__GNUC__) || defined(__clang__) return __builtin_bswap64(value); #elif defined(_MSC_VER) @@ -29,8 +28,7 @@ namespace pisa { namespace intrinsics { #endif } - __INTRIN_INLINE bool bsf64(unsigned long* const index, const uint64_t mask) - { + __INTRIN_INLINE bool bsf64(unsigned long* const index, const uint64_t mask) { #if defined(__GNUC__) || defined(__clang__) if (mask != 0U) { *index = (unsigned long)__builtin_ctzll(mask); @@ -45,8 +43,7 @@ namespace pisa { namespace intrinsics { #endif } - __INTRIN_INLINE bool bsr64(unsigned long* const index, const uint64_t mask) - { + __INTRIN_INLINE bool bsr64(unsigned long* const index, const uint64_t mask) { #if defined(__GNUC__) || defined(__clang__) if (mask != 0U) { *index = (unsigned long)(63 - __builtin_clzll(mask)); @@ -61,8 +58,7 @@ namespace pisa { namespace intrinsics { } template - __INTRIN_INLINE void prefetch(T const* ptr) - { + __INTRIN_INLINE void prefetch(T const* ptr) { #if defined(__SSE__) _mm_prefetch((const char*)ptr, _MM_HINT_T0); #endif @@ -70,7 +66,9 @@ namespace pisa { namespace intrinsics { #if USE_POPCNT - __INTRIN_INLINE uint64_t popcount(uint64_t x) { return uint64_t(_mm_popcnt_u64(x)); } + __INTRIN_INLINE uint64_t popcount(uint64_t x) { + return uint64_t(_mm_popcnt_u64(x)); + } #endif /* USE_POPCNT */ diff --git a/include/pisa/util/inverted_index_utils.hpp b/include/pisa/util/inverted_index_utils.hpp index 8a432756..f60d94c7 100644 --- a/include/pisa/util/inverted_index_utils.hpp +++ b/include/pisa/util/inverted_index_utils.hpp @@ -12,21 +12,18 @@ namespace pisa { template -std::ostream& write_sequence(std::ostream& os, gsl::span sequence) -{ +std::ostream& write_sequence(std::ostream& os, gsl::span sequence) { auto length = static_cast(sequence.size()); os.write(reinterpret_cast(&length), sizeof(length)); os.write(reinterpret_cast(sequence.data()), length * sizeof(T)); return os; } -inline void emit(std::ostream& os, const uint32_t* vals, size_t n) -{ +inline void emit(std::ostream& os, const uint32_t* vals, size_t n) { os.write(reinterpret_cast(vals), sizeof(*vals) * n); } -inline void emit(std::ostream& os, uint32_t val) -{ +inline void emit(std::ostream& os, uint32_t val) { emit(os, &val, 1); } @@ -36,14 +33,15 @@ void sample_inverted_index( std::string const& input_basename, std::string const& output_basename, SampleFn&& sample_fn, - std::unordered_set& terms_to_drop) -{ + std::unordered_set& terms_to_drop +) { binary_freq_collection input(input_basename.c_str()); std::filesystem::copy_file( fmt::format("{}.sizes", input_basename), fmt::format("{}.sizes", output_basename), - std::filesystem::copy_options::overwrite_existing); + std::filesystem::copy_options::overwrite_existing + ); std::ofstream dos(output_basename + ".docs"); std::ofstream fos(output_basename + ".freqs"); @@ -80,8 +78,8 @@ void sample_inverted_index( inline void reorder_inverted_index( const std::string& input_basename, const std::string& output_basename, - const std::vector& mapping) -{ + const std::vector& mapping +) { std::ofstream output_mapping(output_basename + ".mapping"); emit(output_mapping, mapping.data(), mapping.size()); @@ -107,7 +105,8 @@ inline void reorder_inverted_index( std::vector> pl; pisa::progress reorder_progress( - "Reorder inverted index", std::distance(input.begin(), input.end())); + "Reorder inverted index", std::distance(input.begin(), input.end()) + ); for (const auto& seq: input) { for (size_t i = 0; i < seq.docs.size(); ++i) { diff --git a/include/pisa/util/log.hpp b/include/pisa/util/log.hpp index 996430e3..5b7d14c0 100644 --- a/include/pisa/util/log.hpp +++ b/include/pisa/util/log.hpp @@ -11,15 +11,13 @@ class Log2 { static_assert(N >= 1, "number of precomputed values must be positive"); public: - constexpr Log2() - { + constexpr Log2() { m_values[0] = -std::numeric_limits::infinity(); for (std::size_t n = 1; n < N; ++n) { m_values[n] = std::log2(n); } } - constexpr double operator()(std::size_t n) const - { + constexpr double operator()(std::size_t n) const { if (n >= m_values.size()) { return std::log2(n); } diff --git a/include/pisa/util/semiasync_queue.hpp b/include/pisa/util/semiasync_queue.hpp index 7708d180..eb78ef32 100644 --- a/include/pisa/util/semiasync_queue.hpp +++ b/include/pisa/util/semiasync_queue.hpp @@ -12,8 +12,7 @@ namespace pisa { class semiasync_queue { public: explicit semiasync_queue(double work_per_thread) - : m_expected_work(0), m_work_per_thread(work_per_thread) - { + : m_expected_work(0), m_work_per_thread(work_per_thread) { m_max_threads = std::thread::hardware_concurrency(); spdlog::info("semiasync_queue using {} worker threads", m_max_threads); } @@ -32,8 +31,7 @@ class semiasync_queue { using job_ptr_type = std::shared_ptr; - void add_job(job_ptr_type j, double expected_work) - { + void add_job(job_ptr_type j, double expected_work) { if (m_max_threads != 0U) { m_next_thread.first.push_back(j); m_expected_work += expected_work; @@ -47,8 +45,7 @@ class semiasync_queue { } } - void complete() - { + void complete() { if (!m_next_thread.first.empty()) { spawn_next_thread(); } @@ -58,8 +55,7 @@ class semiasync_queue { } private: - void spawn_next_thread() - { + void spawn_next_thread() { if (m_running_threads.size() == m_max_threads) { commit_thread(); } @@ -77,8 +73,7 @@ class semiasync_queue { m_expected_work = 0; } - void commit_thread() - { + void commit_thread() { assert(!m_running_threads.empty()); m_running_threads.front().second.join(); for (auto& j: m_running_threads.front().first) { diff --git a/include/pisa/util/single_init_vector.hpp b/include/pisa/util/single_init_vector.hpp index 1dbad474..ce5bf876 100644 --- a/include/pisa/util/single_init_vector.hpp +++ b/include/pisa/util/single_init_vector.hpp @@ -9,8 +9,7 @@ class single_init_entry { const T& value() const { return m_value; } bool has_value(std::size_t generation) const { return m_generation == generation; } - void set(std::size_t generation, const T& v) - { + void set(std::size_t generation, const T& v) { m_value = v; m_generation = generation; } @@ -37,21 +36,19 @@ template class single_init_vector: public std::vector> { public: using std::vector>::vector; - const T& operator[](std::size_t i) const - { + const T& operator[](std::size_t i) const { return ( std::vector>::operator[](i).has_value(m_generation) ? std::vector>::operator[](i).value() - : m_defaultValue); + : m_defaultValue + ); } - bool has_value(std::size_t i) const - { + bool has_value(std::size_t i) const { return (std::vector>::operator[](i).has_value(m_generation)); } - void set(std::size_t i, const T& v) - { + void set(std::size_t i, const T& v) { std::vector>::operator[](i).set(m_generation, v); } diff --git a/include/pisa/util/tables.hpp b/include/pisa/util/tables.hpp index 83062103..1323c2c8 100644 --- a/include/pisa/util/tables.hpp +++ b/include/pisa/util/tables.hpp @@ -71,6 +71,7 @@ namespace pisa { namespace tables { 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 7}; + 8, 7 + }; }} // namespace pisa::tables diff --git a/include/pisa/util/util.hpp b/include/pisa/util/util.hpp index f3c882de..aa765888 100644 --- a/include/pisa/util/util.hpp +++ b/include/pisa/util/util.hpp @@ -16,27 +16,23 @@ namespace pisa { template -inline IntType1 ceil_div(IntType1 dividend, IntType2 divisor) -{ +inline IntType1 ceil_div(IntType1 dividend, IntType2 divisor) { // XXX(ot): put some static check that IntType1 >= IntType2 auto d = IntType1(divisor); return IntType1(dividend + d - 1) / d; } template -inline void dispose(T& t) -{ +inline void dispose(T& t) { T().swap(t); } -inline uint64_t ceil_log2(const uint64_t x) -{ +inline uint64_t ceil_log2(const uint64_t x) { assert(x > 0); return (x > 1) ? broadword::msb(x - 1) + 1 : 0; } -inline double get_time_usecs() -{ +inline double get_time_usecs() { auto now = std::chrono::system_clock::now(); auto duration = now.time_since_epoch(); return std::chrono::duration_cast(duration).count(); @@ -73,28 +69,25 @@ class function_iterator { function_iterator() = default; explicit function_iterator( - State&& initial_state, AdvanceFunctor&& advance_functor, ValueFunctor&& value_functor) + State&& initial_state, AdvanceFunctor&& advance_functor, ValueFunctor&& value_functor + ) : m_state(std::forward(initial_state)), m_advance_functor(std::forward(advance_functor)), - m_value_functor(std::forward(value_functor)) - {} + m_value_functor(std::forward(value_functor)) {} - friend inline void swap(function_iterator& lhs, function_iterator& rhs) - { + friend inline void swap(function_iterator& lhs, function_iterator& rhs) { using std::swap; swap(lhs.m_state, rhs.m_state); } value_type operator*() const { return m_value_functor(m_state); } - function_iterator& operator++() - { + function_iterator& operator++() { m_advance_functor(m_state); return *this; } - function_iterator operator++(int) - { + function_iterator operator++(int) { function_iterator it(*this); operator++(); return it; @@ -112,12 +105,13 @@ class function_iterator { template function_iterator make_function_iterator( - State&& initial_state, AdvanceFunctor&& advance_functor, ValueFunctor&& value_functor) -{ + State&& initial_state, AdvanceFunctor&& advance_functor, ValueFunctor&& value_functor +) { return function_iterator( std::forward(initial_state), std::forward(advance_functor), - std::forward(value_functor)); + std::forward(value_functor) + ); } struct stats_line { @@ -129,8 +123,7 @@ struct stats_line { ~stats_line() { std::cout << "}" << std::endl; } template - stats_line& operator()(K const& key, T const& value) - { + stats_line& operator()(K const& key, T const& value) { if (!first) { std::cout << ", "; } else { @@ -144,15 +137,13 @@ struct stats_line { } template - stats_line& operator()(T const& obj) - { + stats_line& operator()(T const& obj) { return obj.dump(*this); } private: template - void emit(T const& v) const - { + void emit(T const& v) const { std::cout << v; } @@ -162,8 +153,7 @@ struct stats_line { void emit(std::string const& s) const { emit(s.c_str()); } template - void emit(std::vector const& v) const - { + void emit(std::vector const& v) const { std::cout << "["; bool first = true; for (auto const& i: v) { @@ -178,37 +168,32 @@ struct stats_line { } template - void emit(std::map const& m) const - { + void emit(std::map const& m) const { std::vector> v(m.begin(), m.end()); emit(v); } template - typename std::enable_if::type emit_tuple_helper(Tuple const& t) const - { + typename std::enable_if::type emit_tuple_helper(Tuple const& t) const { emit_tuple_helper(t); std::cout << ", "; emit(std::get(t)); } template - typename std::enable_if::type emit_tuple_helper(Tuple const& t) const - { + typename std::enable_if::type emit_tuple_helper(Tuple const& t) const { emit(std::get<0>(t)); } template - void emit(std::tuple const& t) const - { + void emit(std::tuple const& t) const { std::cout << "["; emit_tuple_helper, sizeof...(Tp) - 1>(t); std::cout << "]"; } template - void emit(std::pair const& p) const - { + void emit(std::pair const& p) const { emit(std::make_tuple(p.first, p.second)); } diff --git a/include/pisa/util/verify_collection.hpp b/include/pisa/util/verify_collection.hpp index c57d1e64..1a628018 100644 --- a/include/pisa/util/verify_collection.hpp +++ b/include/pisa/util/verify_collection.hpp @@ -9,8 +9,7 @@ namespace pisa { template -void verify_collection(InputCollection const& input, const char* filename) -{ +void verify_collection(InputCollection const& input, const char* filename) { Collection coll; auto source = MemorySource::mapped_file(std::filesystem::path(filename)); pisa::mapper::map(coll, source.data()); diff --git a/include/pisa/vec_map.hpp b/include/pisa/vec_map.hpp index 2df5b447..24969701 100644 --- a/include/pisa/vec_map.hpp +++ b/include/pisa/vec_map.hpp @@ -19,35 +19,29 @@ struct EnumerateIterator { using iterator_category = std::forward_iterator_tag; constexpr EnumerateIterator(Iterator iter, Index init) - : m_current_index(std::move(init)), m_value_iterator(std::move(iter)) - {} + : m_current_index(std::move(init)), m_value_iterator(std::move(iter)) {} ~EnumerateIterator() = default; constexpr EnumerateIterator(EnumerateIterator const&) = default; constexpr EnumerateIterator(EnumerateIterator&&) noexcept = default; constexpr auto operator=(EnumerateIterator const&) -> EnumerateIterator& = default; constexpr auto operator=(EnumerateIterator&&) noexcept -> EnumerateIterator& = default; - constexpr auto operator++() -> EnumerateIterator& - { + constexpr auto operator++() -> EnumerateIterator& { ++m_value_iterator; ++m_current_index; return *this; } - constexpr auto operator++(int) -> EnumerateIterator - { + constexpr auto operator++(int) -> EnumerateIterator { EnumerateIterator retval = *this; ++(*this); return retval; } - [[nodiscard]] constexpr auto operator==(EnumerateIterator other) const -> bool - { + [[nodiscard]] constexpr auto operator==(EnumerateIterator other) const -> bool { return m_value_iterator == other.m_value_iterator; } - [[nodiscard]] constexpr auto operator!=(EnumerateIterator other) const -> bool - { + [[nodiscard]] constexpr auto operator!=(EnumerateIterator other) const -> bool { return !(m_value_iterator == other.m_value_iterator); } - [[nodiscard]] constexpr auto operator*() const -> reference - { + [[nodiscard]] constexpr auto operator*() const -> reference { return reference(m_current_index, *m_value_iterator); } @@ -60,47 +54,38 @@ template struct Enumerate { template constexpr explicit Enumerate(Container&& container, Index init = Index{}) - : m_init(std::move(init)), m_value_begin(container.begin()), m_value_end(container.end()) - {} + : m_init(std::move(init)), m_value_begin(container.begin()), m_value_end(container.end()) {} - [[nodiscard]] constexpr auto begin() -> EnumerateIterator - { + [[nodiscard]] constexpr auto begin() -> EnumerateIterator { return EnumerateIterator(m_value_begin, m_init); } - [[nodiscard]] constexpr auto end() -> EnumerateIterator - { + [[nodiscard]] constexpr auto end() -> EnumerateIterator { return EnumerateIterator(m_value_end, m_init); } - [[nodiscard]] constexpr auto begin() const -> EnumerateIterator - { + [[nodiscard]] constexpr auto begin() const -> EnumerateIterator { return EnumerateIterator(m_value_begin, m_init); } - [[nodiscard]] constexpr auto end() const -> EnumerateIterator - { + [[nodiscard]] constexpr auto end() const -> EnumerateIterator { return EnumerateIterator(m_value_end, m_init); } - [[nodiscard]] constexpr auto cbegin() const -> EnumerateIterator - { + [[nodiscard]] constexpr auto cbegin() const -> EnumerateIterator { return begin(); } - [[nodiscard]] constexpr auto cend() const -> EnumerateIterator - { + [[nodiscard]] constexpr auto cend() const -> EnumerateIterator { return end(); } - [[nodiscard]] constexpr auto size() const -> std::size_t - { + [[nodiscard]] constexpr auto size() const -> std::size_t { return std::distance(m_value_begin, m_value_end); } [[nodiscard]] constexpr auto collect() const - -> std::vector::value_type>> - { + -> std::vector::value_type>> { std::vector::value_type>> vec(size()); std::copy(begin(), end(), vec.begin()); return vec; @@ -183,109 +168,89 @@ class VecMap: protected std::vector { VecMap() noexcept(noexcept(Allocator())) : std::vector() {} explicit VecMap(Allocator const& alloc) noexcept : std::vector(alloc) {} VecMap(size_type count, V const& value, Allocator const& alloc = Allocator()) - : std::vector(count, value, alloc) - {} + : std::vector(count, value, alloc) {} explicit VecMap(size_type count, Allocator const& alloc = Allocator()) - : std::vector(count, alloc) - {} + : std::vector(count, alloc) {} template VecMap(InputIt first, InputIt last, Allocator const& alloc = Allocator()) - : std::vector(first, last, alloc) - {} + : std::vector(first, last, alloc) {} VecMap(VecMap const& other) : std::vector(other) {} VecMap(VecMap const& other, const Allocator& alloc) : std::vector(other, alloc) {} VecMap(VecMap&& other) noexcept : std::vector(other) {} VecMap(VecMap&& other, Allocator const& alloc) : std::vector(other, alloc) {} VecMap(std::initializer_list init, Allocator const& alloc = Allocator()) - : std::vector(init, alloc) - {} + : std::vector(init, alloc) {} ~VecMap() = default; - auto operator=(VecMap const& other) -> VecMap& - { + auto operator=(VecMap const& other) -> VecMap& { if (this != &other) { std::vector::operator=(other); } return *this; }; - auto operator=(VecMap&& other) noexcept -> VecMap& - { + auto operator=(VecMap&& other) noexcept -> VecMap& { std::vector::operator=(other); return *this; }; - auto operator=(std::initializer_list init) -> VecMap& - { + auto operator=(std::initializer_list init) -> VecMap& { std::vector::operator=(init); return *this; } - auto operator[](K key) -> reference - { + auto operator[](K key) -> reference { return std::vector::operator[](static_cast(key)); } - auto operator[](K key) const -> const_reference - { + auto operator[](K key) const -> const_reference { return std::vector::operator[](static_cast(key)); } - auto at(K key) -> reference - { + auto at(K key) -> reference { return std::vector::at(static_cast(key)); } - auto at(K key) const -> const_reference - { + auto at(K key) const -> const_reference { return std::vector::at(static_cast(key)); } auto as_vector() const -> std::vector const& { return *this; } auto as_vector() -> std::vector& { return *this; } - [[nodiscard]] auto entries() const -> Enumerate::const_iterator> - { + [[nodiscard]] auto entries() const -> Enumerate::const_iterator> { return Enumerate::const_iterator>(*this, static_cast(0U)); } }; template -void swap(VecMap& lhs, VecMap& rhs) noexcept(noexcept(lhs.swap(rhs))) -{ +void swap(VecMap& lhs, VecMap& rhs) noexcept(noexcept(lhs.swap(rhs))) { return lhs.swap(rhs); } template -auto operator==(const VecMap& lhs, const VecMap& rhs) -> bool -{ +auto operator==(const VecMap& lhs, const VecMap& rhs) -> bool { return lhs.as_vector() == rhs.as_vector(); } template -auto operator!=(const VecMap& lhs, const VecMap& rhs) -> bool -{ +auto operator!=(const VecMap& lhs, const VecMap& rhs) -> bool { return lhs.as_vector() != rhs.as_vector(); } template -auto operator<(const VecMap& lhs, const VecMap& rhs) -> bool -{ +auto operator<(const VecMap& lhs, const VecMap& rhs) -> bool { return lhs.as_vector() < rhs.as_vector(); } template -auto operator<=(const VecMap& lhs, const VecMap& rhs) -> bool -{ +auto operator<=(const VecMap& lhs, const VecMap& rhs) -> bool { return lhs.as_vector() <= rhs.as_vector(); } template -auto operator>(const VecMap& lhs, const VecMap& rhs) -> bool -{ +auto operator>(const VecMap& lhs, const VecMap& rhs) -> bool { return lhs.as_vector() > rhs.as_vector(); } template -auto operator>=(const VecMap& lhs, const VecMap& rhs) -> bool -{ +auto operator>=(const VecMap& lhs, const VecMap& rhs) -> bool { return lhs.as_vector() >= rhs.as_vector(); } template [[nodiscard]] inline auto read_string_vec_map(std::string const& filename) - -> VecMap -{ + -> VecMap { VecMap vec; std::ifstream is(filename); std::string line; diff --git a/include/pisa/wand_data.hpp b/include/pisa/wand_data.hpp index 65584201..a87da801 100644 --- a/include/pisa/wand_data.hpp +++ b/include/pisa/wand_data.hpp @@ -29,8 +29,7 @@ class wand_data { using wand_data_enumerator = typename block_wand_type::enumerator; wand_data() = default; - explicit wand_data(MemorySource source) : m_source(std::move(source)) - { + explicit wand_data(MemorySource source) : m_source(std::move(source)) { mapper::map(*this, m_source.data(), mapper::map_flags::warmup); } @@ -42,9 +41,9 @@ class wand_data { const ScorerParams& scorer_params, BlockSize block_size, std::optional quantization_bits, - std::unordered_set const& terms_to_drop) - : m_num_docs(num_docs) - { + std::unordered_set const& terms_to_drop + ) + : m_num_docs(num_docs) { std::vector doc_lens(num_docs); std::vector max_term_weight; std::vector term_occurrence_counts; @@ -95,7 +94,8 @@ class wand_data { continue; } auto v = builder.add_sequence( - seq, coll, doc_lens, m_avg_len, scorer->term_scorer(new_term_id), block_size); + seq, coll, doc_lens, m_avg_len, scorer->term_scorer(new_term_id), block_size + ); max_term_weight.push_back(v); m_index_max_term_weight = std::max(m_index_max_term_weight, v); term_id += 1; @@ -118,8 +118,7 @@ class wand_data { size_t doc_len(uint64_t doc_id) const { return m_doc_lens[doc_id]; } - size_t term_occurrence_count(uint64_t term_id) const - { + size_t term_occurrence_count(uint64_t term_id) const { return m_term_occurrence_counts[term_id]; } @@ -135,16 +134,14 @@ class wand_data { float max_term_weight(uint64_t list) const { return m_max_term_weight[list]; } - wand_data_enumerator getenum(size_t i) const - { + wand_data_enumerator getenum(size_t i) const { return m_block_wand.get_enum(i, index_max_term_weight()); } const block_wand_type& get_block_wand() const { return m_block_wand; } template - void map(Visitor& visit) - { + void map(Visitor& visit) { visit(m_block_wand, "m_block_wand")(m_doc_lens, "m_doc_lens")( m_term_occurrence_counts, "m_term_occurrence_counts")( @@ -175,8 +172,8 @@ inline void create_wand_data( bool range, bool compress, std::optional quantization_bits, - std::unordered_set const& dropped_term_ids) -{ + std::unordered_set const& dropped_term_ids +) { spdlog::info("Dropping {} terms", dropped_term_ids.size()); binary_collection sizes_coll((input_basename + ".sizes").c_str()); binary_freq_collection coll(input_basename.c_str()); @@ -189,7 +186,8 @@ inline void create_wand_data( scorer_params, block_size, quantization_bits, - dropped_term_ids); + dropped_term_ids + ); mapper::freeze(wdata, output.c_str()); } else if (range) { wand_data> wdata( @@ -199,7 +197,8 @@ inline void create_wand_data( scorer_params, block_size, quantization_bits, - dropped_term_ids); + dropped_term_ids + ); mapper::freeze(wdata, output.c_str()); } else { wand_data wdata( @@ -209,7 +208,8 @@ inline void create_wand_data( scorer_params, block_size, quantization_bits, - dropped_term_ids); + dropped_term_ids + ); mapper::freeze(wdata, output.c_str()); } } diff --git a/include/pisa/wand_data_compressed.hpp b/include/pisa/wand_data_compressed.hpp index 0ff35130..ebcdb737 100644 --- a/include/pisa/wand_data_compressed.hpp +++ b/include/pisa/wand_data_compressed.hpp @@ -25,11 +25,9 @@ class uniform_score_compressor { : m_params(params), m_quantization_bits(quantization_bits), m_num_docs((num_docs + 1) << quantization_bits.as_int()), - m_docs_sequences(params) - {} + m_docs_sequences(params) {} - std::vector compress_data(std::vector effective_scores, float max_score) - { + std::vector compress_data(std::vector effective_scores, float max_score) { // Partition scores. LinearQuantizer quantizer(max_score, m_quantization_bits.as_int()); std::vector score_indexes; @@ -41,8 +39,7 @@ class uniform_score_compressor { } template - void add_posting_list(uint64_t n, DocsIterator docs_begin, DocsIterator score_begin) - { + void add_posting_list(uint64_t n, DocsIterator docs_begin, DocsIterator score_begin) { std::vector temp; for (size_t pos = 0; pos < n; ++pos) { uint64_t elem = *(docs_begin + pos); @@ -54,7 +51,8 @@ class uniform_score_compressor { "({}) lower than its predecessor ({})", pos, elem, - temp.back())); + temp.back() + )); } temp.push_back(elem); } @@ -81,8 +79,7 @@ class uniform_score_compressor { bitvector_collection::builder m_docs_sequences; }; - static float inline score(uint32_t quantized_score, Size quantization_bits) - { + static float inline score(uint32_t quantized_score, Size quantization_bits) { const float quant = 1.F / (1U << quantization_bits.as_int()); return quant * (quantized_score + 1); } @@ -98,14 +95,15 @@ class wand_data_compressed { builder( binary_freq_collection const& coll, global_parameters const& params, - std::optional quantization_bits) + std::optional quantization_bits + ) : total_elements(0), total_blocks(0), params(params), m_quantization_bits( - unwrap(quantization_bits, "compressed wand data needs quantization bits")), - compressor_builder(coll.num_docs(), params, m_quantization_bits) - { + unwrap(quantization_bits, "compressed wand data needs quantization bits") + ), + compressor_builder(coll.num_docs(), params, m_quantization_bits) { spdlog::info("Storing max weight for each list and for each block..."); } @@ -116,12 +114,13 @@ class wand_data_compressed { [[maybe_unused]] std::vector const& doc_lens, float avg_len, Scorer scorer, - BlockSize block_size) - { + BlockSize block_size + ) { auto t = std::holds_alternative(block_size) ? static_block_partition(seq, scorer, std::get(block_size).size) : variable_block_partition( - coll, seq, scorer, std::get(block_size).lambda); + coll, seq, scorer, std::get(block_size).lambda + ); float max_score = *(std::max_element(t.second.begin(), t.second.end())); max_term_weight.push_back(max_score); @@ -136,8 +135,7 @@ class wand_data_compressed { void quantize_block_max_term_weights([[maybe_unused]] float index_max_term_weight) {} - void build(wand_data_compressed& wdata) - { + void build(wand_data_compressed& wdata) { auto index_max_term_weight = *(std::max_element(max_term_weight.begin(), max_term_weight.end())); for (auto&& [docs, scores]: @@ -145,15 +143,16 @@ class wand_data_compressed { auto quantized_scores = compressor_builder.compress_data(scores, index_max_term_weight); compressor_builder.add_posting_list( - quantized_scores.size(), docs.begin(), quantized_scores.begin()); + quantized_scores.size(), docs.begin(), quantized_scores.begin() + ); } wdata.m_num_docs = compressor_builder.num_docs(); wdata.m_params = compressor_builder.params(); wdata.m_quantization_bits = m_quantization_bits; compressor_builder.build(wdata.m_docs_sequences); spdlog::info( - "number of elements / number of blocks: {}", - (float)total_elements / (float)total_blocks); + "number of elements / number of blocks: {}", (float)total_elements / (float)total_blocks + ); } uint64_t total_elements; @@ -171,26 +170,22 @@ class wand_data_compressed { public: enumerator( - compact_elias_fano::enumerator const& docs_enum, - float max_term_weight, - Size quantization_bits) + compact_elias_fano::enumerator const& docs_enum, float max_term_weight, Size quantization_bits + ) : m_docs_enum(docs_enum), m_max_term_weight(max_term_weight), - m_quantization_bits(quantization_bits) - { + m_quantization_bits(quantization_bits) { reset(); } - void reset() - { + void reset() { uint64_t val = m_docs_enum.move(0).second; m_cur_docid = val >> m_quantization_bits.as_int(); uint64_t mask = (1U << m_quantization_bits.as_int()) - 1; m_cur_score_index = (val & mask); } - void PISA_FLATTEN_FUNC next_geq(uint64_t lower_bound) - { + void PISA_FLATTEN_FUNC next_geq(uint64_t lower_bound) { if (docid() != lower_bound) { lower_bound = lower_bound << m_quantization_bits.as_int(); auto val = m_docs_enum.next_geq(lower_bound); @@ -200,8 +195,7 @@ class wand_data_compressed { } } - float PISA_FLATTEN_FUNC score() - { + float PISA_FLATTEN_FUNC score() { // NOLINTNEXTLINE(readability-braces-around-statements) if constexpr (IndexPayloadType == PayloadType::Quantized) { return m_cur_score_index; @@ -225,23 +219,23 @@ class wand_data_compressed { uint64_t num_docs() const { return m_num_docs; } - enumerator get_enum(size_t i, float max_term_weight) const - { + enumerator get_enum(size_t i, float max_term_weight) const { assert(i < size()); auto docs_it = m_docs_sequences.get(m_params, i); uint64_t n = read_gamma_nonzero(docs_it); typename compact_elias_fano::enumerator docs_enum( - m_docs_sequences.bits(), docs_it.position(), num_docs(), n, m_params); + m_docs_sequences.bits(), docs_it.position(), num_docs(), n, m_params + ); return enumerator(docs_enum, max_term_weight, m_quantization_bits); } template - void map(Visitor& visit) - { + void map(Visitor& visit) { visit(m_params, "m_params")(m_num_docs, "m_num_docs")(m_docs_sequences, "m_docs_sequences")( - m_quantization_bits, "m_quantization_bits"); + m_quantization_bits, "m_quantization_bits" + ); } private: diff --git a/include/pisa/wand_data_range.hpp b/include/pisa/wand_data_range.hpp index f5b16c6e..2fc61eea 100644 --- a/include/pisa/wand_data_range.hpp +++ b/include/pisa/wand_data_range.hpp @@ -17,8 +17,7 @@ template class wand_data_range { public: template - void for_each_posting(List& list, Fn func) const - { + void for_each_posting(List& list, Fn func) const { while (list.position() < list.size()) { func(list.docid(), list.freq()); list.next(); @@ -26,8 +25,7 @@ class wand_data_range { } template - auto compute_block_max_scores(List& list, Fn scorer) const - { + auto compute_block_max_scores(List& list, Fn scorer) const { std::vector block_max_scores(m_blocks_num, 0.0F); for_each_posting(list, [&](auto docid, auto freq) { float& current_max = block_max_scores[docid / range_size]; @@ -41,13 +39,13 @@ class wand_data_range { builder( binary_freq_collection const& coll, [[maybe_unused]] global_parameters const& params, - std::optional quantization_bits) + std::optional quantization_bits + ) : blocks_num(ceil_div(coll.num_docs(), range_size)), total_elements(0), blocks_start{0}, block_max_term_weight{}, - m_quantization_bits(quantization_bits) - { + m_quantization_bits(quantization_bits) { auto posting_lists = std::distance(coll.begin(), coll.end()); spdlog::info("Storing max weight for each list and for each block..."); spdlog::info( @@ -56,7 +54,8 @@ class wand_data_range { range_size, coll.num_docs(), blocks_num, - posting_lists); + posting_lists + ); } template @@ -66,8 +65,8 @@ class wand_data_range { [[maybe_unused]] std::vector const& doc_lens, float avg_len, Scorer scorer, - [[maybe_unused]] BlockSize block_size) - { + [[maybe_unused]] BlockSize block_size + ) { float max_score = 0.0F; std::vector b_max(blocks_num, 0.0F); @@ -90,22 +89,21 @@ class wand_data_range { return max_score; } - void quantize_block_max_term_weights(float index_max_term_weight) - { + void quantize_block_max_term_weights(float index_max_term_weight) { LinearQuantizer quantizer(index_max_term_weight, m_quantization_bits->as_int()); for (auto&& w: block_max_term_weight) { w = quantizer(w); } } - void build(wand_data_range& wdata) - { + void build(wand_data_range& wdata) { wdata.m_blocks_num = blocks_num; wdata.m_blocks_start.steal(blocks_start); wdata.m_block_max_term_weight.steal(block_max_term_weight); spdlog::info( "number of elements / number of blocks: {}", - static_cast(total_elements) / wdata.m_block_max_term_weight.size()); + static_cast(total_elements) / wdata.m_block_max_term_weight.size() + ); } uint64_t blocks_num; @@ -120,15 +118,13 @@ class wand_data_range { public: enumerator(uint32_t _block_start, mapper::mappable_vector const& block_max_term_weight) - : cur_pos(0), block_start(_block_start), m_block_max_term_weight(block_max_term_weight) - {} + : cur_pos(0), block_start(_block_start), m_block_max_term_weight(block_max_term_weight) {} void PISA_NOINLINE next_block() { cur_pos += 1; } void PISA_NOINLINE next_geq(uint64_t lower_bound) { cur_pos = lower_bound / range_size; } uint64_t PISA_FLATTEN_FUNC docid() const { return (cur_pos + 1) * range_size; } - float PISA_FLATTEN_FUNC score() const - { + float PISA_FLATTEN_FUNC score() const { return m_block_max_term_weight[block_start + cur_pos]; } @@ -138,14 +134,13 @@ class wand_data_range { mapper::mappable_vector const& m_block_max_term_weight; }; - enumerator get_enum(uint32_t i, float) const - { + enumerator get_enum(uint32_t i, float) const { return enumerator(m_blocks_start[i], m_block_max_term_weight); } static std::vector compute_live_blocks( - std::vector& enums, float threshold, std::pair document_range) - { + std::vector& enums, float threshold, std::pair document_range + ) { size_t len = ceil_div((document_range.second - document_range.first), range_size); std::vector live_blocks(len); for (auto&& e: enums) { @@ -163,10 +158,10 @@ class wand_data_range { } template - void map(Visitor& visit) - { + void map(Visitor& visit) { visit(m_blocks_num, "m_blocks_num")(m_blocks_start, "m_blocks_start")( - m_block_max_term_weight, "m_block_max_term_weight"); + m_block_max_term_weight, "m_block_max_term_weight" + ); } private: diff --git a/include/pisa/wand_data_raw.hpp b/include/pisa/wand_data_raw.hpp index 95f38b6a..a69b51a0 100644 --- a/include/pisa/wand_data_raw.hpp +++ b/include/pisa/wand_data_raw.hpp @@ -23,9 +23,9 @@ class wand_data_raw { builder( binary_freq_collection const& coll, global_parameters const& params, - std::optional quantization_bits) - : m_quantization_bits(quantization_bits) - { + std::optional quantization_bits + ) + : m_quantization_bits(quantization_bits) { (void)coll; (void)params; spdlog::info("Storing max weight for each list and for each block..."); @@ -42,15 +42,17 @@ class wand_data_raw { [[maybe_unused]] std::vector const& doc_lens, float avg_len, Scorer scorer, - BlockSize block_size) - { + BlockSize block_size + ) { auto t = std::holds_alternative(block_size) ? static_block_partition(seq, scorer, std::get(block_size).size) : variable_block_partition( - coll, seq, scorer, std::get(block_size).lambda); + coll, seq, scorer, std::get(block_size).lambda + ); block_max_term_weight.insert( - block_max_term_weight.end(), t.second.begin(), t.second.end()); + block_max_term_weight.end(), t.second.begin(), t.second.end() + ); block_docid.insert(block_docid.end(), t.first.begin(), t.first.end()); max_term_weight.push_back(*(std::max_element(t.second.begin(), t.second.end()))); blocks_start.push_back(t.first.size() + blocks_start.back()); @@ -61,22 +63,21 @@ class wand_data_raw { return max_term_weight.back(); } - void quantize_block_max_term_weights(float index_max_term_weight) - { + void quantize_block_max_term_weights(float index_max_term_weight) { LinearQuantizer quantizer(index_max_term_weight, m_quantization_bits->as_int()); for (auto&& w: block_max_term_weight) { w = quantizer(w); } } - void build(wand_data_raw& wdata) - { + void build(wand_data_raw& wdata) { wdata.m_block_max_term_weight.steal(block_max_term_weight); wdata.m_blocks_start.steal(blocks_start); wdata.m_block_docid.steal(block_docid); spdlog::info( "number of elements / number of blocks: {}", - static_cast(total_elements) / static_cast(total_blocks)); + static_cast(total_elements) / static_cast(total_blocks) + ); } std::optional m_quantization_bits; @@ -96,23 +97,21 @@ class wand_data_raw { uint32_t _block_start, uint32_t _block_number, mapper::mappable_vector const& max_term_weight, - mapper::mappable_vector const& block_docid) + mapper::mappable_vector const& block_docid + ) : cur_pos(0), block_start(_block_start), block_number(_block_number), m_block_max_term_weight(max_term_weight), - m_block_docid(block_docid) - {} + m_block_docid(block_docid) {} - void PISA_NOINLINE next_geq(uint64_t lower_bound) - { + void PISA_NOINLINE next_geq(uint64_t lower_bound) { while (cur_pos + 1 < block_number && m_block_docid[block_start + cur_pos] < lower_bound) { cur_pos++; } } - float PISA_FLATTEN_FUNC score() const - { + float PISA_FLATTEN_FUNC score() const { return m_block_max_term_weight[block_start + cur_pos]; } @@ -128,20 +127,17 @@ class wand_data_raw { mapper::mappable_vector const& m_block_docid; }; - enumerator get_enum(uint32_t i, float) const - { + enumerator get_enum(uint32_t i, float) const { return enumerator( - m_blocks_start[i], - m_blocks_start[i + 1] - m_blocks_start[i], - m_block_max_term_weight, - m_block_docid); + m_blocks_start[i], m_blocks_start[i + 1] - m_blocks_start[i], m_block_max_term_weight, m_block_docid + ); } template - void map(Visitor& visit) - { + void map(Visitor& visit) { visit(m_blocks_start, "m_blocks_start")(m_block_max_term_weight, "m_block_max_term_weight")( - m_block_docid, "m_block_docid"); + m_block_docid, "m_block_docid" + ); } private: diff --git a/include/pisa/wand_utils.hpp b/include/pisa/wand_utils.hpp index d64a0657..62a07aa4 100644 --- a/include/pisa/wand_utils.hpp +++ b/include/pisa/wand_utils.hpp @@ -21,8 +21,8 @@ using BlockSize = std::variant; template std::pair, std::vector> static_block_partition( - binary_freq_collection::sequence const& seq, Scorer scorer, const uint64_t block_size) -{ + binary_freq_collection::sequence const& seq, Scorer scorer, const uint64_t block_size +) { std::vector block_docid; std::vector block_max_term_weight; @@ -61,8 +61,8 @@ std::pair, std::vector> variable_block_partition( // Antonio Mallia, Giuseppe Ottaviano, Elia Porciani, Nicola Tonellotto, and Rossano Venturini. // 2017. Faster BlockMax WAND with Variable-sized Blocks. In Proc. SIGIR double eps1 = 0.01, - double eps2 = 0.4) -{ + double eps2 = 0.4 +) { // Auxiliary vector using doc_score_t = std::pair; std::vector doc_score; @@ -74,7 +74,8 @@ std::pair, std::vector> variable_block_partition( std::back_inserter(doc_score), [&](const uint64_t& doc, const uint64_t& freq) -> doc_score_t { return {doc, scorer(doc, freq)}; - }); + } + ); auto p = score_opt_partition(doc_score.begin(), 0, doc_score.size(), eps1, eps2, lambda); diff --git a/src/bit_vector.cpp b/src/bit_vector.cpp index 9fada7ca..a7d818a6 100644 --- a/src/bit_vector.cpp +++ b/src/bit_vector.cpp @@ -3,8 +3,7 @@ namespace pisa { -bit_vector::bit_vector(bit_vector_builder* from) -{ +bit_vector::bit_vector(bit_vector_builder* from) { m_size = from->size(); m_bits.steal(from->move_bits()); } diff --git a/src/bit_vector_builder.cpp b/src/bit_vector_builder.cpp index 71d2a209..60ef4a18 100644 --- a/src/bit_vector_builder.cpp +++ b/src/bit_vector_builder.cpp @@ -5,8 +5,7 @@ namespace pisa { -bit_vector_builder::bit_vector_builder(uint64_t size, bool init) : m_size(size) -{ +bit_vector_builder::bit_vector_builder(uint64_t size, bool init) : m_size(size) { m_bits.resize(detail::words_for(size), init ? std::numeric_limits::max() : 0U); if (size != 0U) { m_cur_word = &m_bits.back(); @@ -17,13 +16,11 @@ bit_vector_builder::bit_vector_builder(uint64_t size, bool init) : m_size(size) } } -void bit_vector_builder::reserve(uint64_t size) -{ +void bit_vector_builder::reserve(uint64_t size) { m_bits.reserve(detail::words_for(size)); } -void bit_vector_builder::append(bit_vector_builder const& rhs) -{ +void bit_vector_builder::append(bit_vector_builder const& rhs) { if (rhs.size() == 0U) { return; } @@ -50,8 +47,7 @@ void bit_vector_builder::append(bit_vector_builder const& rhs) m_cur_word = &m_bits.back(); } -void bit_vector_builder::reverse() -{ +void bit_vector_builder::reverse() { uint64_t shift = 64 - (size() % 64); uint64_t remainder = 0; @@ -69,8 +65,7 @@ void bit_vector_builder::reverse() std::reverse(m_bits.begin(), m_bits.end()); } -void bit_vector_builder::swap(bit_vector_builder& other) -{ +void bit_vector_builder::swap(bit_vector_builder& other) { m_bits.swap(other.m_bits); std::swap(m_size, other.m_size); std::swap(m_cur_word, other.m_cur_word); diff --git a/src/compress.cpp b/src/compress.cpp index 484973f9..38e8c875 100644 --- a/src/compress.cpp +++ b/src/compress.cpp @@ -18,16 +18,13 @@ namespace pisa { template -void dump_index_specific_stats(Collection const&, std::string const&) -{} +void dump_index_specific_stats(Collection const&, std::string const&) {} -void dump_index_specific_stats(pisa::pefuniform_index const& coll, std::string const& type) -{ +void dump_index_specific_stats(pisa::pefuniform_index const& coll, std::string const& type) { pisa::stats_line()("type", type)("log_partition_size", int(coll.params().log_partition_size)); } -void dump_index_specific_stats(pisa::pefopt_index const& coll, std::string const& type) -{ +void dump_index_specific_stats(pisa::pefopt_index const& coll, std::string const& type) { std::uint64_t length_threshold = 4096; double long_postings = 0; double docs_partitions = 0; @@ -43,14 +40,14 @@ void dump_index_specific_stats(pisa::pefopt_index const& coll, std::string const } pisa::stats_line()("type", type)("docs_avg_part", long_postings / docs_partitions)( - "freqs_avg_part", long_postings / freqs_partitions); + "freqs_avg_part", long_postings / freqs_partitions + ); } template struct QuantizedScorer { QuantizedScorer(std::unique_ptr> scorer, LinearQuantizer quantizer) - : scorer(std::move(scorer)), quantizer(quantizer) - {} + : scorer(std::move(scorer)), quantizer(quantizer) {} std::unique_ptr> scorer; LinearQuantizer quantizer; }; @@ -61,8 +58,8 @@ void compress_index_streaming( pisa::global_parameters const& params, std::string const& output_filename, std::optional> quantized_scorer, - bool check) -{ + bool check +) { spdlog::info("Processing {} documents (streaming)", input.num_docs()); double tick = get_time_usecs(); @@ -84,7 +81,8 @@ void compress_index_streaming( quantized_scores.push_back(quantizer(score)); } auto sum = std::accumulate( - quantized_scores.begin(), quantized_scores.end(), std::uint64_t(0)); + quantized_scores.begin(), quantized_scores.end(), std::uint64_t(0) + ); builder.add_posting_list(size, plist.docs.begin(), quantized_scores.begin(), sum); term_id += 1; quantized_scores.clear(); @@ -122,8 +120,8 @@ void compress_index( std::string const& seq_type, std::optional const& wand_data_filename, ScorerParams const& scorer_params, - std::optional quantization_bits) -{ + std::optional quantization_bits +) { if constexpr (std::is_same_v) { std::optional> quantized_scorer{}; WandType wdata; @@ -143,7 +141,8 @@ void compress_index( quantized_scorer = QuantizedScorer(std::move(scorer), quantizer); } compress_index_streaming( - input, params, *output_filename, std::move(quantized_scorer), check); + input, params, *output_filename, std::move(quantized_scorer), check + ); return; } @@ -203,7 +202,8 @@ void compress_index( spdlog::info("{} collection built in {} seconds", seq_type, elapsed_secs); stats_line()("type", seq_type)("worker_threads", std::thread::hardware_concurrency())( - "construction_time", elapsed_secs); + "construction_time", elapsed_secs + ); dump_stats(coll, seq_type, postings); dump_index_specific_stats(coll, seq_type); @@ -215,7 +215,8 @@ void compress_index( } if (check and not quantization_bits.has_value()) { verify_collection( - input, (*output_filename).c_str()); + input, (*output_filename).c_str() + ); } } } @@ -227,25 +228,18 @@ void compress( std::string const& output_filename, ScorerParams const& scorer_params, std::optional quantization_bits, - bool check) -{ + bool check +) { binary_freq_collection input(input_basename.c_str()); global_parameters params; if (false) { -#define LOOP_BODY(R, DATA, T) \ - } \ - else if (index_encoding == BOOST_PP_STRINGIZE(T)) \ - { \ - compress_index>( \ - input, \ - params, \ - output_filename, \ - check, \ - index_encoding, \ - wand_data_filename, \ - scorer_params, \ - quantization_bits); \ +#define LOOP_BODY(R, DATA, T) \ + } \ + else if (index_encoding == BOOST_PP_STRINGIZE(T)) { \ + compress_index>( \ + input, params, output_filename, check, index_encoding, wand_data_filename, scorer_params, quantization_bits \ + ); \ /**/ BOOST_PP_SEQ_FOR_EACH(LOOP_BODY, _, PISA_INDEX_TYPES); #undef LOOP_BODY diff --git a/src/cow_string.cpp b/src/cow_string.cpp index c97bae82..c4e8e6ad 100644 --- a/src/cow_string.cpp +++ b/src/cow_string.cpp @@ -14,16 +14,14 @@ CowString& CowString::operator=(CowString const&) = default; CowString& CowString::operator=(CowString&&) = default; CowString::~CowString() = default; -auto CowString::as_view() const -> std::string_view -{ +auto CowString::as_view() const -> std::string_view { if (auto* value = std::get_if(&m_value); value != nullptr) { return *value; } return std::string_view(*std::get_if(&m_value)); } -auto CowString::to_owned() && -> std::string -{ +auto CowString::to_owned() && -> std::string { if (auto* value = std::get_if(&m_value); value != nullptr) { return std::string(*value); } diff --git a/src/forward_index_builder.cpp b/src/forward_index_builder.cpp index 9b32f3a7..1f7df326 100644 --- a/src/forward_index_builder.cpp +++ b/src/forward_index_builder.cpp @@ -13,26 +13,24 @@ namespace pisa { -std::ostream& Forward_Index_Builder::write_header(std::ostream& os, uint32_t document_count) -{ +std::ostream& Forward_Index_Builder::write_header(std::ostream& os, uint32_t document_count) { return Forward_Index_Builder::write_document(os, &document_count, std::next(&document_count)); } auto Forward_Index_Builder::batch_file(std::string const& output_file, std::ptrdiff_t batch_number) noexcept - -> std::string -{ + -> std::string { std::ostringstream os; os << output_file << ".batch." << batch_number; return os.str(); } -void Forward_Index_Builder::run(Batch_Process bp, TextAnalyzer const& text_analyzer) const -{ +void Forward_Index_Builder::run(Batch_Process bp, TextAnalyzer const& text_analyzer) const { spdlog::debug( "[Batch {}] Processing documents [{}, {})", bp.batch_number, bp.first_document, - bp.first_document + bp.records.size()); + bp.first_document + bp.records.size() + ); auto basename = batch_file(bp.output_file, bp.batch_number); std::ofstream os(basename); @@ -67,12 +65,12 @@ void Forward_Index_Builder::run(Batch_Process bp, TextAnalyzer const& text_analy "[Batch {}] Processed documents [{}, {})", bp.batch_number, bp.first_document, - bp.first_document + bp.records.size()); + bp.first_document + bp.records.size() + ); } auto Forward_Index_Builder::reverse_mapping(std::vector&& terms) - -> std::unordered_map -{ + -> std::unordered_map { std::unordered_map mapping; Term_Id term_id{0}; for (std::string& term: terms) { @@ -83,8 +81,7 @@ auto Forward_Index_Builder::reverse_mapping(std::vector&& terms) } auto Forward_Index_Builder::collect_terms(std::string const& basename, std::ptrdiff_t batch_count) - -> std::vector -{ + -> std::vector { struct Term_Span { size_t first; size_t last; @@ -133,8 +130,8 @@ auto Forward_Index_Builder::collect_terms(std::string const& basename, std::ptrd } void Forward_Index_Builder::merge( - std::string const& basename, std::ptrdiff_t document_count, std::ptrdiff_t batch_count) const -{ + std::string const& basename, std::ptrdiff_t document_count, std::ptrdiff_t batch_count +) const { std::ofstream term_os(basename + ".terms"); { @@ -150,7 +147,8 @@ void Forward_Index_Builder::merge( spdlog::info("Creating document lexicon"); std::ifstream title_is(basename + ".documents"); encode_payload_vector( - std::istream_iterator(title_is), std::istream_iterator()) + std::istream_iterator(title_is), std::istream_iterator() + ) .to_file(basename + ".doclex"); } { @@ -179,10 +177,9 @@ void Forward_Index_Builder::merge( spdlog::debug("[Remapping IDs] Batch {}/{}", batch, batch_count); auto batch_terms = io::read_string_vector(batch_file(basename, batch) + ".terms"); std::vector mapping(batch_terms.size()); - std::transform( - batch_terms.begin(), batch_terms.end(), mapping.begin(), [&](auto const& bterm) { - return term_mapping[bterm]; - }); + std::transform(batch_terms.begin(), batch_terms.end(), mapping.begin(), [&](auto const& bterm) { + return term_mapping[bterm]; + }); writable_binary_collection coll(batch_file(basename, batch).c_str()); for (auto doc_iter = ++coll.begin(); doc_iter != coll.end(); ++doc_iter) { for (auto& term_id: *doc_iter) { @@ -211,8 +208,8 @@ void Forward_Index_Builder::build( read_record_function_type next_record, std::shared_ptr text_analyzer, std::ptrdiff_t batch_size, - std::size_t threads) const -{ + std::size_t threads +) const { if (threads < 2) { spdlog::error("Building forward index requires at least 2 threads"); std::abort(); @@ -230,7 +227,8 @@ void Forward_Index_Builder::build( auto last_batch_size = record_batch.size(); auto batch_process = [&] { return Batch_Process{ - batch_number, std::move(record_batch), first_document, output_file}; + batch_number, std::move(record_batch), first_document, output_file + }; }; queue.push(0); batch_group.run([bp = batch_process(), this, &queue, text_analyzer]() { @@ -248,7 +246,8 @@ void Forward_Index_Builder::build( if (record_batch.size() == batch_size) { auto batch_process = [&] { return Batch_Process{ - batch_number, std::move(record_batch), first_document, output_file}; + batch_number, std::move(record_batch), first_document, output_file + }; }; queue.push(0); batch_group.run([bp = batch_process(), this, &queue, text_analyzer]() { @@ -266,8 +265,7 @@ void Forward_Index_Builder::build( remove_batches(output_file, batch_number); } -void try_remove(std::filesystem::path const& file) -{ +void try_remove(std::filesystem::path const& file) { try { std::filesystem::remove(file); } catch (...) { @@ -275,8 +273,7 @@ void try_remove(std::filesystem::path const& file) } } -void Forward_Index_Builder::remove_batches(std::string const& basename, std::ptrdiff_t batch_count) const -{ +void Forward_Index_Builder::remove_batches(std::string const& basename, std::ptrdiff_t batch_count) const { for (auto batch: ranges::views::iota(0, batch_count)) { auto batch_basename = batch_file(basename, batch); try_remove(std::filesystem::path{batch_basename + ".documents"}); diff --git a/src/invert.cpp b/src/invert.cpp index 9de5495a..d0f5036b 100644 --- a/src/invert.cpp +++ b/src/invert.cpp @@ -13,12 +13,13 @@ #include "pisa/util/inverted_index_utils.hpp" template -std::vector concatenate(std::vector> const& containers) -{ +std::vector concatenate(std::vector> const& containers) { auto full_size = std::accumulate( - containers.begin(), containers.end(), 0, [](auto const& acc, auto const& container) { - return acc + container.size(); - }); + containers.begin(), + containers.end(), + 0, + [](auto const& acc, auto const& container) { return acc + container.size(); } + ); std::vector vec(full_size); auto next_begin = std::begin(vec); for (auto const& container: containers) { @@ -29,8 +30,7 @@ std::vector concatenate(std::vector> const& containers) } template -std::istream& read_sequence(std::istream& is, std::vector& out) -{ +std::istream& read_sequence(std::istream& is, std::vector& out) { uint32_t length; is.read(reinterpret_cast(&length), sizeof(length)); auto size = out.size(); @@ -41,8 +41,7 @@ std::istream& read_sequence(std::istream& is, std::vector& out) namespace pisa { namespace invert { - auto map_to_postings(ForwardIndexSlice batch) -> std::vector - { + auto map_to_postings(ForwardIndexSlice batch) -> std::vector { auto docid = batch.document_ids.begin(); std::vector> postings; for (auto const& document: batch.documents) { @@ -58,8 +57,8 @@ namespace pisa { namespace invert { std::vector& lower_doc, std::vector& lower_freq, std::vector& higher_doc, - std::vector& higher_freq) - { + std::vector& higher_freq + ) { if (lower_doc.back() == higher_doc.front()) { lower_freq.back() += higher_freq.front(); lower_doc.insert(lower_doc.end(), std::next(higher_doc.begin()), higher_doc.end()); @@ -71,15 +70,15 @@ namespace pisa { namespace invert { } auto invert_range(DocumentRange documents, Document_Id first_document_id, size_t threads) - -> Inverted_Index - { + -> Inverted_Index { std::vector document_sizes(documents.size()); pisa::transform( pisa::execution::par_unseq, documents.begin(), documents.end(), document_sizes.begin(), - [](auto const& terms) { return terms.size(); }); + [](auto const& terms) { return terms.size(); } + ); gsl::index batch_size = (documents.size() + threads - 1) / threads; std::vector batches; for (gsl::index first_idx_in_batch = 0; first_idx_in_batch < documents.size(); @@ -90,16 +89,14 @@ namespace pisa { namespace invert { auto current_batch_size = last_idx_in_batch - first_idx_in_batch; batches.push_back(ForwardIndexSlice{ documents.subspan(first_idx_in_batch, current_batch_size), - ranges::views::iota(first_document_in_batch, last_document_in_batch)}); + ranges::views::iota(first_document_in_batch, last_document_in_batch) + }); } std::vector>> posting_vectors(batches.size()); pisa::transform( - pisa::execution::par_unseq, - batches.begin(), - batches.end(), - std::begin(posting_vectors), - map_to_postings); + pisa::execution::par_unseq, batches.begin(), batches.end(), std::begin(posting_vectors), map_to_postings + ); auto postings = concatenate(posting_vectors); posting_vectors.clear(); @@ -114,8 +111,7 @@ namespace pisa { namespace invert { } void - write(std::string const& basename, invert::Inverted_Index const& index, std::uint32_t term_count) - { + write(std::string const& basename, invert::Inverted_Index const& index, std::uint32_t term_count) { std::ofstream dstream(basename + ".docs"); std::ofstream fstream(basename + ".freqs"); std::ofstream sstream(basename + ".sizes"); @@ -136,10 +132,8 @@ namespace pisa { namespace invert { } [[nodiscard]] auto build_batches( - std::string const& input_basename, - std::string const& output_basename, - InvertParams const& params) -> uint32_t - { + std::string const& input_basename, std::string const& output_basename, InvertParams const& params + ) -> uint32_t { uint32_t batch = 0; binary_collection coll(input_basename.c_str()); auto doc_iter = ++coll.begin(); @@ -150,10 +144,12 @@ namespace pisa { namespace invert { auto document_sequence = *doc_iter; documents.emplace_back( reinterpret_cast(document_sequence.begin()), - document_sequence.size()); + document_sequence.size() + ); } spdlog::info( - "Inverting [{}, {})", documents_processed, documents_processed + documents.size()); + "Inverting [{}, {})", documents_processed, documents_processed + documents.size() + ); auto index = invert_range(documents, Document_Id(documents_processed), params.num_threads); write(fmt::format("{}.batch.{}", output_basename, batch), index, *params.term_count); @@ -163,8 +159,7 @@ namespace pisa { namespace invert { return batch; } - void merge_batches(std::string const& output_basename, uint32_t batch_count, uint32_t term_count) - { + void merge_batches(std::string const& output_basename, uint32_t batch_count, uint32_t term_count) { std::vector doc_collections; std::vector freq_collections; std::vector document_sizes; @@ -186,12 +181,14 @@ namespace pisa { namespace invert { doc_collections.begin(), doc_collections.end(), std::back_inserter(doc_iterators), - [](auto const& coll) { return ++coll.begin(); }); + [](auto const& coll) { return ++coll.begin(); } + ); std::transform( freq_collections.begin(), freq_collections.end(), std::back_inserter(freq_iterators), - [](auto const& coll) { return coll.begin(); }); + [](auto const& coll) { return coll.begin(); } + ); std::ofstream dos(output_basename + ".docs"); std::ofstream fos(output_basename + ".freqs"); @@ -217,7 +214,8 @@ namespace pisa { namespace invert { "but are {} and {} (term {})", dlist.size(), flist.size(), - term_id); + term_id + ); spdlog::error(msg); throw std::runtime_error(msg); } @@ -237,8 +235,8 @@ namespace pisa { namespace invert { } void invert_forward_index( - std::string const& input_basename, std::string const& output_basename, InvertParams params) - { + std::string const& input_basename, std::string const& output_basename, InvertParams params + ) { if (not params.term_count) { auto source = MemorySource::mapped_file(fmt::format("{}.termlex", input_basename)); auto terms = Payload_Vector<>::from(source); @@ -260,14 +258,13 @@ namespace pisa { namespace invert { Inverted_Index::Inverted_Index(Inverted_Index&, tbb::split) {} Inverted_Index::Inverted_Index( - Documents documents, Frequencies frequencies, std::vector document_sizes) + Documents documents, Frequencies frequencies, std::vector document_sizes + ) : documents(std::move(documents)), frequencies(std::move(frequencies)), - document_sizes(std::move(document_sizes)) - {} + document_sizes(std::move(document_sizes)) {} - void Inverted_Index::operator()(tbb::blocked_range const& r) - { + void Inverted_Index::operator()(tbb::blocked_range const& r) { if (auto first = r.begin(); first != r.end()) { if (auto current_term = first->first; not documents[current_term].empty()) { auto current_doc = documents[current_term].back(); @@ -292,10 +289,10 @@ namespace pisa { namespace invert { } } - void Inverted_Index::join(Inverted_Index& rhs) - { + void Inverted_Index::join(Inverted_Index& rhs) { document_sizes.insert( - document_sizes.end(), rhs.document_sizes.begin(), rhs.document_sizes.end()); + document_sizes.end(), rhs.document_sizes.begin(), rhs.document_sizes.end() + ); for (auto&& [term_id, document_ids]: rhs.documents) { if (auto pos = documents.find(term_id); pos == documents.end()) { std::swap(documents[term_id], document_ids); diff --git a/src/io.cpp b/src/io.cpp index 182c81b0..b3b15e39 100644 --- a/src/io.cpp +++ b/src/io.cpp @@ -6,16 +6,14 @@ namespace pisa::io { -NoSuchFile::NoSuchFile(std::string const& file) : m_message(fmt::format("No such file: {}", file)) -{} +NoSuchFile::NoSuchFile(std::string const& file) + : m_message(fmt::format("No such file: {}", file)) {} -[[nodiscard]] auto NoSuchFile::what() const noexcept -> char const* -{ +[[nodiscard]] auto NoSuchFile::what() const noexcept -> char const* { return m_message.c_str(); } -auto resolve_path(std::string const& file) -> std::filesystem::path -{ +auto resolve_path(std::string const& file) -> std::filesystem::path { std::filesystem::path p(file); if (not std::filesystem::exists(p)) { throw NoSuchFile(file); @@ -23,8 +21,7 @@ auto resolve_path(std::string const& file) -> std::filesystem::path return p; } -auto read_string_vector(std::string const& filename) -> std::vector -{ +auto read_string_vector(std::string const& filename) -> std::vector { std::vector vec; std::ifstream is(filename); std::string line; @@ -34,8 +31,7 @@ auto read_string_vector(std::string const& filename) -> std::vector return vec; } -auto load_data(std::string const& data_file) -> std::vector -{ +auto load_data(std::string const& data_file) -> std::vector { std::vector data; std::ifstream in(data_file.c_str(), std::ios::binary); in.seekg(0, std::ios::end); @@ -48,8 +44,7 @@ auto load_data(std::string const& data_file) -> std::vector return data; } -void write_data(std::string const& data_file, gsl::span bytes) -{ +void write_data(std::string const& data_file, gsl::span bytes) { std::ofstream os(data_file); os.write(reinterpret_cast(bytes.data()), bytes.size()); } diff --git a/src/memory_source.cpp b/src/memory_source.cpp index 1873513d..930dc001 100644 --- a/src/memory_source.cpp +++ b/src/memory_source.cpp @@ -8,76 +8,65 @@ namespace pisa { constexpr std::string_view EMPTY_MEMORY = "Empty memory source"; -auto MemorySource::from_vector(std::vector vec) -> MemorySource -{ +auto MemorySource::from_vector(std::vector vec) -> MemorySource { return MemorySource(std::move(vec)); } -auto MemorySource::from_span(gsl::span span) -> MemorySource -{ +auto MemorySource::from_span(gsl::span span) -> MemorySource { return MemorySource(span); } -auto MemorySource::mapped_file(std::string const& file) -> MemorySource -{ +auto MemorySource::mapped_file(std::string const& file) -> MemorySource { return MemorySource::mapped_file(io::resolve_path(file)); } -auto MemorySource::mapped_file(std::filesystem::path file) -> MemorySource -{ +auto MemorySource::mapped_file(std::filesystem::path file) -> MemorySource { if (not std::filesystem::exists(file)) { throw io::NoSuchFile(file.string()); } return MemorySource(mio::mmap_source(file.string().c_str())); } -auto MemorySource::is_mapped() noexcept -> bool -{ +auto MemorySource::is_mapped() noexcept -> bool { return m_source != nullptr; } -auto MemorySource::data() const -> pointer -{ +auto MemorySource::data() const -> pointer { if (m_source == nullptr) { throw std::domain_error(std::string(EMPTY_MEMORY)); } return m_source->data(); } -auto MemorySource::begin() const -> pointer -{ +auto MemorySource::begin() const -> pointer { if (m_source == nullptr) { throw std::domain_error(std::string(EMPTY_MEMORY)); } return m_source->data(); } -auto MemorySource::end() const -> pointer -{ +auto MemorySource::end() const -> pointer { if (m_source == nullptr) { throw std::domain_error(std::string(EMPTY_MEMORY)); } return std::next(m_source->data(), m_source->size()); } -auto MemorySource::size() const -> size_type -{ +auto MemorySource::size() const -> size_type { if (m_source == nullptr) { return 0; } return m_source->size(); } -auto MemorySource::span() const -> gsl::span -{ +auto MemorySource::span() const -> gsl::span { if (m_source == nullptr) { return gsl::span(); } return gsl::span(begin(), size()); } -auto MemorySource::subspan(size_type offset, size_type size) const -> gsl::span -{ +auto MemorySource::subspan(size_type offset, size_type size) const -> gsl::span { if (m_source == nullptr) { if (offset == 0 && (size == 0 || size == gsl::dynamic_extent)) { return gsl::span(); diff --git a/src/parser.cpp b/src/parser.cpp index df2b14ec..c531d036 100644 --- a/src/parser.cpp +++ b/src/parser.cpp @@ -18,8 +18,7 @@ namespace pisa { // Changes should be made to the upstream libraries to support the new API, // which also means upgrading fmt version upstream. template -[[nodiscard]] auto to_string(T&& value) -> std::string -{ +[[nodiscard]] auto to_string(T&& value) -> std::string { std::ostringstream os; os << value; return os.str(); @@ -28,20 +27,21 @@ template using namespace std::string_view_literals; template -[[nodiscard]] auto trec_record_parser(ReadSubsequentRecordFn read_subsequent_record) -{ +[[nodiscard]] auto trec_record_parser(ReadSubsequentRecordFn read_subsequent_record) { return [=](std::istream& in) -> std::optional { while (not in.eof()) { auto record = trecpp::match( read_subsequent_record(in), [](trecpp::Record rec) { return std::make_optional( - std::move(rec.trecid()), std::move(rec.content()), std::move(rec.url())); + std::move(rec.trecid()), std::move(rec.content()), std::move(rec.url()) + ); }, [](trecpp::Error const& error) { spdlog::warn("Skipped invalid record: {}", to_string(error)); return std::optional{}; - }); + } + ); if (record) { return record; } @@ -51,14 +51,14 @@ template } std::function(std::istream&)> -record_parser(std::string const& type, std::istream& is) -{ +record_parser(std::string const& type, std::istream& is) { if (type == "plaintext") { return [](std::istream& in) -> std::optional { Plaintext_Record record; if (in >> record) { return std::make_optional( - std::move(record.trecid()), std::move(record.content()), std::move(record.url())); + std::move(record.trecid()), std::move(record.content()), std::move(record.url()) + ); } return std::nullopt; }; @@ -67,19 +67,21 @@ record_parser(std::string const& type, std::istream& is) return trec_record_parser(trecpp::text::read_subsequent_record); } if (type == "trecweb") { - return [=, parser = std::make_shared(is)]( - std::istream& in) -> std::optional { + return [=, parser = std::make_shared(is)](std::istream& in + ) -> std::optional { while (not in.eof()) { auto record = trecpp::match( parser->read_record(), [](trecpp::Record rec) { return std::make_optional( - rec.trecid(), rec.content(), rec.url()); + rec.trecid(), rec.content(), rec.url() + ); }, [](trecpp::Error const& error) { spdlog::warn("Skipped invalid record: {}", to_string(error)); return std::optional{}; - }); + } + ); if (record) { return record; } @@ -99,21 +101,23 @@ record_parser(std::string const& type, std::istream& is) // TODO(michal): use std::move if (rec.has_trecid()) { return std::make_optional( - rec.trecid(), rec.content(), rec.url()); + rec.trecid(), rec.content(), rec.url() + ); } if (rec.has_recordid()) { return std::make_optional( - rec.recordid(), rec.content(), rec.url()); + rec.recordid(), rec.content(), rec.url() + ); } // This should be unreachable - spdlog::warn( - "Skipped invalid record: No warc-trec-id or warc-record-id..."); + spdlog::warn("Skipped invalid record: No warc-trec-id or warc-record-id..."); return std::optional{}; }, [](warcpp::Error const& error) { spdlog::warn("Skipped invalid record: {}", to_string(error)); return std::optional{}; - }); + } + ); if (record) { return record; } @@ -128,7 +132,8 @@ record_parser(std::string const& type, std::istream& is) if (std::get_if(&result) != nullptr) { spdlog::warn( "Skpped invalid record. Reason: {}", - to_string(std::get_if(&result)->msg)); + to_string(std::get_if(&result)->msg) + ); spdlog::debug("Invalid record: {}", std::get_if(&result)->json); } else { std::ostringstream os; @@ -162,24 +167,22 @@ record_parser(std::string const& type, std::istream& is) std::abort(); } -void parse_plaintext_content(std::string&& content, std::function process) -{ +void parse_plaintext_content(std::string&& content, std::function process) { EnglishTokenStream tokens(content); std::for_each(tokens.begin(), tokens.end(), process); } -[[nodiscard]] auto is_http(std::string_view content) -> bool -{ - auto start = std::find_if( - content.begin(), content.end(), [](unsigned char c) { return std::isspace(c) == 0; }); +[[nodiscard]] auto is_http(std::string_view content) -> bool { + auto start = std::find_if(content.begin(), content.end(), [](unsigned char c) { + return std::isspace(c) == 0; + }); if (start == content.end()) { return false; } return std::string_view(&*start, 4) == "HTTP"sv; } -void parse_html_content(std::string&& content, std::function process) -{ +void parse_html_content(std::string&& content, std::function process) { content = parsing::html::cleantext([&]() { auto pos = content.begin(); if (is_http(content)) { @@ -204,8 +207,7 @@ void parse_html_content(std::string&& content, std::function)> -content_parser(std::optional const& type) -{ +content_parser(std::optional const& type) { if (not type) { return parse_plaintext_content; } diff --git a/src/parsing/html.cpp b/src/parsing/html.cpp index c8fa397c..4a608543 100644 --- a/src/parsing/html.cpp +++ b/src/parsing/html.cpp @@ -4,8 +4,7 @@ namespace pisa::parsing::html { -[[nodiscard]] auto cleantext(GumboNode* node) -> std::string -{ +[[nodiscard]] auto cleantext(GumboNode* node) -> std::string { if (node->type == GUMBO_NODE_TEXT) { return std::string(node->v.text.text); } @@ -25,8 +24,7 @@ namespace pisa::parsing::html { return std::string(); } -[[nodiscard]] auto cleantext(std::string_view html) -> std::string -{ +[[nodiscard]] auto cleantext(std::string_view html) -> std::string { GumboOptions options = kGumboDefaultOptions; options.max_errors = 1000; GumboOutput* output = gumbo_parse_with_options(&options, html.data(), html.size()); diff --git a/src/progress.cpp b/src/progress.cpp index c9011143..47692a43 100644 --- a/src/progress.cpp +++ b/src/progress.cpp @@ -2,8 +2,7 @@ namespace pisa { -progress::progress(std::string const& name, size_t goal, bool always_enable) : m_name(name) -{ +progress::progress(std::string const& name, size_t goal, bool always_enable) : m_name(name) { if (goal == 0) { throw std::runtime_error("Progress bar must have a positive goal but 0 given"); } @@ -13,8 +12,7 @@ progress::progress(std::string const& name, size_t goal, bool always_enable) : m } } -progress::~progress() -{ +progress::~progress() { if (!m_disabled) { m_status.notify_one(); std::unique_lock lock(m_mut); @@ -23,8 +21,7 @@ progress::~progress() } } -void progress::update(std::size_t inc) -{ +void progress::update(std::size_t inc) { if (!m_disabled) { std::unique_lock lock(m_mut); m_count += inc; @@ -32,8 +29,7 @@ void progress::update(std::size_t inc) } } -void progress::print_status() -{ +void progress::print_status() { size_t progress = (100 * m_count) / m_goal; std::chrono::seconds elapsed = std::chrono::duration_cast(std::chrono::steady_clock::now() - m_start); @@ -46,8 +42,7 @@ void progress::print_status() } } -std::ostream& progress::format_interval(std::ostream& out, std::chrono::seconds time) -{ +std::ostream& progress::format_interval(std::ostream& out, std::chrono::seconds time) { using std::chrono::hours; using std::chrono::minutes; using std::chrono::seconds; diff --git a/src/query/queries.cpp b/src/query/queries.cpp index 72301d55..6d763271 100644 --- a/src/query/queries.cpp +++ b/src/query/queries.cpp @@ -12,8 +12,7 @@ namespace pisa { auto split_query_at_colon(std::string_view query_string) - -> std::pair, std::string_view> -{ + -> std::pair, std::string_view> { // query id : terms (or ids) auto colon = std::find(query_string.begin(), query_string.end(), ':'); std::optional id; @@ -26,8 +25,8 @@ auto split_query_at_colon(std::string_view query_string) } auto parse_query_terms( - std::string const& query_string, Tokenizer const& tokenizer, TermProcessor term_processor) -> Query -{ + std::string const& query_string, Tokenizer const& tokenizer, TermProcessor term_processor +) -> Query { auto [id, raw_query] = split_query_at_colon(query_string); auto tokens = tokenizer.tokenize(raw_query); std::vector parsed_query; @@ -46,8 +45,7 @@ auto parse_query_terms( return {std::move(id), std::move(parsed_query), {}}; } -auto parse_query_ids(std::string const& query_string) -> Query -{ +auto parse_query_ids(std::string const& query_string) -> Query { auto [id, raw_query] = split_query_at_colon(query_string); std::vector parsed_query; std::vector term_ids; @@ -72,8 +70,8 @@ std::function resolve_query_parser( std::unique_ptr tokenizer, std::optional const& terms_file, std::optional const& stopwords_filename, - std::optional const& stemmer_type) -{ + std::optional const& stemmer_type +) { if (terms_file) { auto term_processor = TermProcessor(terms_file, stopwords_filename, stemmer_type); return [&queries, @@ -87,8 +85,7 @@ std::function resolve_query_parser( }; } -bool read_query(term_id_vec& ret, std::istream& is) -{ +bool read_query(term_id_vec& ret, std::istream& is) { ret.clear(); std::string line; if (!std::getline(is, line)) { @@ -98,14 +95,12 @@ bool read_query(term_id_vec& ret, std::istream& is) return true; } -void remove_duplicate_terms(term_id_vec& terms) -{ +void remove_duplicate_terms(term_id_vec& terms) { std::sort(terms.begin(), terms.end()); terms.erase(std::unique(terms.begin(), terms.end()), terms.end()); } -term_freq_vec query_freqs(term_id_vec terms) -{ +term_freq_vec query_freqs(term_id_vec terms) { term_freq_vec query_term_freqs; std::sort(terms.begin(), terms.end()); // count query term frequencies diff --git a/src/query/query_parser.cpp b/src/query/query_parser.cpp index 0bb58500..7ecb7048 100644 --- a/src/query/query_parser.cpp +++ b/src/query/query_parser.cpp @@ -4,11 +4,9 @@ namespace pisa { QueryParser::QueryParser(TextAnalyzer analyzer, std::unique_ptr term_map) - : m_analyzer(std::move(analyzer)), m_term_map(std::move(term_map)) -{} + : m_analyzer(std::move(analyzer)), m_term_map(std::move(term_map)) {} -auto QueryParser::parse(std::string_view query) -> Query -{ +auto QueryParser::parse(std::string_view query) -> Query { auto [id, raw_query] = split_query_at_colon(query); auto tokens = m_analyzer.analyze(raw_query); std::vector query_ids; @@ -20,8 +18,7 @@ auto QueryParser::parse(std::string_view query) -> Query return {std::move(id), std::move(query_ids), {}}; } -auto QueryParser::parse(std::string const& query) -> Query -{ +auto QueryParser::parse(std::string const& query) -> Query { return parse(std::string_view(query)); } diff --git a/src/query/term_processor.cpp b/src/query/term_processor.cpp index f36dff10..dd64639a 100644 --- a/src/query/term_processor.cpp +++ b/src/query/term_processor.cpp @@ -6,8 +6,7 @@ namespace pisa { -auto term_transformer_builder(std::optional const& type) -> TermTransformerBuilder -{ +auto term_transformer_builder(std::optional const& type) -> TermTransformerBuilder { if (not type) { return [] { return [](std::string&& term) -> std::string { @@ -26,8 +25,8 @@ auto term_transformer_builder(std::optional const& type) -> TermTra } if (*type == "krovetz") { return []() { - return [kstemmer = std::make_shared()]( - std::string&& term) mutable -> std::string { + return [kstemmer = std::make_shared()](std::string&& term + ) mutable -> std::string { boost::algorithm::to_lower(term); return kstemmer->kstem_stemmer(term); }; diff --git a/src/sharding.cpp b/src/sharding.cpp index 453c7050..e462d6cd 100644 --- a/src/sharding.cpp +++ b/src/sharding.cpp @@ -21,22 +21,20 @@ namespace pisa { using pisa::literals::operator""_d; using pisa::literals::operator""_s; -auto format_shard(std::string_view basename, Shard_Id shard, std::string_view suffix) -> std::string -{ +auto format_shard(std::string_view basename, Shard_Id shard, std::string_view suffix) -> std::string { return fmt::format("{}.{:03d}{}", basename, shard.as_int(), suffix); } -auto expand_shard(std::string_view basename, Shard_Id shard) -> std::string -{ +auto expand_shard(std::string_view basename, Shard_Id shard) -> std::string { if (auto pos = basename.find("{}"); pos != std::string_view::npos) { return fmt::format( - "{}{:03d}{}", basename.substr(0, pos), shard.as_int(), basename.substr(pos + 2)); + "{}{:03d}{}", basename.substr(0, pos), shard.as_int(), basename.substr(pos + 2) + ); } return format_shard(basename, shard); } -auto resolve_shards(std::string_view basename, std::string_view suffix) -> std::vector -{ +auto resolve_shards(std::string_view basename, std::string_view suffix) -> std::vector { Shard_Id shard{0}; std::vector shards; while (true) { @@ -55,8 +53,7 @@ auto resolve_shards(std::string_view basename, std::string_view suffix) -> std:: } auto mapping_from_files(std::istream* full_titles, gsl::span shard_titles) - -> VecMap -{ + -> VecMap { std::unordered_map map; auto shard_id = Shard_Id(0); for (auto* is: shard_titles) { @@ -68,7 +65,8 @@ auto mapping_from_files(std::istream* full_titles, gsl::span shar "Document {} already belongs to shard {}: mapping for shard {} ignored", title, pos->second.as_int(), - shard_id.as_int()); + shard_id.as_int() + ); } }); shard_id += 1; @@ -88,8 +86,7 @@ auto mapping_from_files(std::istream* full_titles, gsl::span shar } auto mapping_from_files(std::string const& full_titles, gsl::span shard_titles) - -> VecMap -{ + -> VecMap { std::ifstream fis(full_titles); std::vector> shard_is; for (auto const& shard_file: shard_titles) { @@ -104,8 +101,7 @@ auto mapping_from_files(std::string const& full_titles, gsl::span seed) - -> VecMap -{ + -> VecMap { std::random_device rd; std::mt19937 g(seed.value_or(rd())); VecMap mapping(document_count); @@ -115,27 +111,26 @@ auto create_random_mapping(int document_count, int shard_count, std::optional seed) - -> VecMap -{ + std::string const& input_basename, int shard_count, std::optional seed +) -> VecMap { auto document_count = *(*binary_collection(input_basename.c_str()).begin()).begin(); return create_random_mapping(document_count, shard_count, seed); } -void copy_sequence(std::istream& is, std::ostream& os) -{ +void copy_sequence(std::istream& is, std::ostream& os) { uint32_t len; is.read(reinterpret_cast(&len), sizeof(len)); os.write(reinterpret_cast(&len), sizeof(len)); @@ -148,8 +143,8 @@ void rearrange_sequences( std::string const& input_basename, std::string const& output_basename, VecMap& mapping, - std::optional shard_count) -{ + std::optional shard_count +) { spdlog::info("Rearranging documents"); if (not shard_count) { *shard_count = *std::max_element(mapping.begin(), mapping.end()) + 1; @@ -201,8 +196,8 @@ void process_shard( std::string const& input_basename, std::string const& output_basename, Shard_Id shard_id, - VecMap const& terms) -{ + VecMap const& terms +) { auto basename = fmt::format("{}.{:03d}", output_basename, shard_id.as_int()); auto shard = writable_binary_collection(basename.c_str()); @@ -228,7 +223,8 @@ void process_shard( spdlog::debug("[Shard {}] Creating term lexicon", shard_id.as_int()); std::ifstream title_is(fmt::format("{}.terms", basename)); encode_payload_vector( - std::istream_iterator(title_is), std::istream_iterator()) + std::istream_iterator(title_is), std::istream_iterator() + ) .to_file(fmt::format("{}.termlex", basename)); } @@ -256,8 +252,8 @@ void process_shard( void partition_fwd_index( std::string const& input_basename, std::string const& output_basename, - VecMap& mapping) -{ + VecMap& mapping +) { auto terms = read_string_vec_map(fmt::format("{}.terms", input_basename)); auto shard_count = *std::max_element(mapping.begin(), mapping.end()) + 1; auto shard_ids = ranges::views::iota(0_s, shard_count) | ranges::to(); diff --git a/src/temporary_directory.cpp b/src/temporary_directory.cpp index ac69ebea..b0cae731 100644 --- a/src/temporary_directory.cpp +++ b/src/temporary_directory.cpp @@ -8,8 +8,7 @@ namespace pisa { -auto random_name(std::size_t length = 64UL) -> std::string -{ +auto random_name(std::size_t length = 64UL) -> std::string { thread_local std::random_device rd{}; thread_local std::mt19937 gen(rd()); std::uniform_int_distribution<> distrib('a', 'z'); @@ -19,8 +18,7 @@ auto random_name(std::size_t length = 64UL) -> std::string } TemporaryDirectory::TemporaryDirectory() - : dir_(std::filesystem::temp_directory_path() / random_name()) -{ + : dir_(std::filesystem::temp_directory_path() / random_name()) { std::filesystem::create_directory(dir_); spdlog::debug("Created a tmp dir {}", dir_.c_str()); } @@ -30,16 +28,14 @@ TemporaryDirectory::TemporaryDirectory(TemporaryDirectory&&) noexcept = default; TemporaryDirectory& TemporaryDirectory::operator=(TemporaryDirectory const&) = default; TemporaryDirectory& TemporaryDirectory::operator=(TemporaryDirectory&&) noexcept = default; -TemporaryDirectory::~TemporaryDirectory() -{ +TemporaryDirectory::~TemporaryDirectory() { if (std::filesystem::exists(dir_)) { std::filesystem::remove_all(dir_); } spdlog::debug("Removed a tmp dir {}", dir_.c_str()); } -auto TemporaryDirectory::path() -> std::filesystem::path const& -{ +auto TemporaryDirectory::path() -> std::filesystem::path const& { return dir_; } diff --git a/src/term_map.cpp b/src/term_map.cpp index 10fea06e..3499bdd6 100644 --- a/src/term_map.cpp +++ b/src/term_map.cpp @@ -15,8 +15,7 @@ TermMap& TermMap::operator=(TermMap const&) = default; TermMap& TermMap::operator=(TermMap&&) = default; TermMap::~TermMap() = default; -auto IntMap::operator()(std::string_view term) -> std::optional -{ +auto IntMap::operator()(std::string_view term) -> std::optional { std::uint32_t value; auto [ptr, ec] = std::from_chars(term.begin(), term.end(), value, 10); if (ec == std::errc::result_out_of_range || ec == std::errc::invalid_argument @@ -26,26 +25,21 @@ auto IntMap::operator()(std::string_view term) -> std::optional return value; } -auto IntMap::operator()(std::string const& term) -> std::optional -{ +auto IntMap::operator()(std::string const& term) -> std::optional { return (*this)(std::string_view(term)); } LexiconMap::LexiconMap(std::string const& file) - : m_buffer(Payload_Vector_Buffer::from_file(file)), m_lexicon(*m_buffer) -{} + : m_buffer(Payload_Vector_Buffer::from_file(file)), m_lexicon(*m_buffer) {} LexiconMap::LexiconMap(Payload_Vector lexicon) - : m_buffer(std::nullopt), m_lexicon(lexicon) -{} + : m_buffer(std::nullopt), m_lexicon(lexicon) {} -auto LexiconMap::operator()(std::string_view term) -> std::optional -{ +auto LexiconMap::operator()(std::string_view term) -> std::optional { return pisa::binary_search(m_lexicon.begin(), m_lexicon.end(), term); } -auto LexiconMap::operator()(std::string const& term) -> std::optional -{ +auto LexiconMap::operator()(std::string const& term) -> std::optional { return pisa::binary_search(m_lexicon.begin(), m_lexicon.end(), term); } diff --git a/src/text_analyzer.cpp b/src/text_analyzer.cpp index 7baabf53..f6e51e14 100644 --- a/src/text_analyzer.cpp +++ b/src/text_analyzer.cpp @@ -4,8 +4,8 @@ namespace pisa { -TextAnalyzer::TextAnalyzer(std::unique_ptr tokenizer) : m_tokenizer(std::move(tokenizer)) -{} +TextAnalyzer::TextAnalyzer(std::unique_ptr tokenizer) + : m_tokenizer(std::move(tokenizer)) {} class FlatMapStream: public TokenStream { std::unique_ptr m_input_stream; @@ -14,11 +14,9 @@ class FlatMapStream: public TokenStream { public: explicit FlatMapStream(std::unique_ptr input_stream, TokenFilter& map) - : m_input_stream(std::move(input_stream)), m_map(map) - {} + : m_input_stream(std::move(input_stream)), m_map(map) {} - auto next() -> std::optional override - { + auto next() -> std::optional override { std::optional token = std::nullopt; while (!token) { if (m_inner_stream == nullptr || !(token = m_inner_stream->next())) { @@ -34,18 +32,15 @@ class FlatMapStream: public TokenStream { } }; -void TextAnalyzer::add_text_filter(std::unique_ptr text_filter) -{ +void TextAnalyzer::add_text_filter(std::unique_ptr text_filter) { m_text_filters.emplace_back(std::move(text_filter)); } -void TextAnalyzer::add_token_filter(std::unique_ptr token_filter) -{ +void TextAnalyzer::add_token_filter(std::unique_ptr token_filter) { m_token_filters.emplace_back(std::move(token_filter)); } -auto TextAnalyzer::analyze(std::string_view input) const -> std::unique_ptr -{ +auto TextAnalyzer::analyze(std::string_view input) const -> std::unique_ptr { CowString text(input); for (auto& text_filter: m_text_filters) { text = CowString(text_filter->filter(text.as_view())); diff --git a/src/text_filter.cpp b/src/text_filter.cpp index a3593219..5869a6b0 100644 --- a/src/text_filter.cpp +++ b/src/text_filter.cpp @@ -11,8 +11,7 @@ TextFilter& TextFilter::operator=(TextFilter const&) = default; TextFilter& TextFilter::operator=(TextFilter&&) = default; TextFilter::~TextFilter() = default; -[[nodiscard]] auto cleantext(GumboNode* node) -> std::string -{ +[[nodiscard]] auto cleantext(GumboNode* node) -> std::string { if (node->type == GUMBO_NODE_TEXT) { return std::string(node->v.text.text); } @@ -32,8 +31,7 @@ TextFilter::~TextFilter() = default; return std::string(); } -auto StripHtmlFilter::filter(std::string_view input) -> std::string -{ +auto StripHtmlFilter::filter(std::string_view input) -> std::string { GumboOptions options = kGumboDefaultOptions; options.max_errors = 1000; GumboOutput* output = gumbo_parse_with_options(&options, input.data(), input.size()); diff --git a/src/token_filter.cpp b/src/token_filter.cpp index b7eb5de1..278d286a 100644 --- a/src/token_filter.cpp +++ b/src/token_filter.cpp @@ -13,72 +13,59 @@ TokenFilter& TokenFilter::operator=(TokenFilter const&) = default; TokenFilter& TokenFilter::operator=(TokenFilter&&) = default; TokenFilter::~TokenFilter() = default; -auto Porter2Stemmer::filter(std::string_view input) const -> std::unique_ptr -{ +auto Porter2Stemmer::filter(std::string_view input) const -> std::unique_ptr { return filter(std::string(input)); } -auto Porter2Stemmer::filter(std::string input) const -> std::unique_ptr -{ +auto Porter2Stemmer::filter(std::string input) const -> std::unique_ptr { thread_local porter2::Stemmer stemmer{}; return std::make_unique(stemmer.stem(input)); } -auto Porter2Stemmer::filter(CowString input) const -> std::unique_ptr -{ +auto Porter2Stemmer::filter(CowString input) const -> std::unique_ptr { return filter(std::move(input).to_owned()); } -auto KrovetzStemmer::filter(std::string_view input) const -> std::unique_ptr -{ +auto KrovetzStemmer::filter(std::string_view input) const -> std::unique_ptr { return filter(std::string(input)); } -auto KrovetzStemmer::filter(std::string input) const -> std::unique_ptr -{ +auto KrovetzStemmer::filter(std::string input) const -> std::unique_ptr { return std::make_unique(m_stemmer->kstem_stemmer(input)); } -auto KrovetzStemmer::filter(CowString input) const -> std::unique_ptr -{ +auto KrovetzStemmer::filter(CowString input) const -> std::unique_ptr { return filter(std::move(input).to_owned()); } -auto LowercaseFilter::filter(std::string_view input) const -> std::unique_ptr -{ +auto LowercaseFilter::filter(std::string_view input) const -> std::unique_ptr { return filter(std::string(input)); } -auto LowercaseFilter::filter(std::string input) const -> std::unique_ptr -{ +auto LowercaseFilter::filter(std::string input) const -> std::unique_ptr { boost::algorithm::to_lower(input); return std::make_unique(std::move(input)); } -auto LowercaseFilter::filter(CowString input) const -> std::unique_ptr -{ +auto LowercaseFilter::filter(CowString input) const -> std::unique_ptr { return filter(std::move(input).to_owned()); } StopWordRemover::StopWordRemover(std::unordered_set stopwords) - : m_stopwords(std::move(stopwords)) -{} + : m_stopwords(std::move(stopwords)) {} -auto StopWordRemover::filter(std::string_view input) const -> std::unique_ptr -{ +auto StopWordRemover::filter(std::string_view input) const -> std::unique_ptr { return filter(std::string(input)); } -auto StopWordRemover::filter(std::string input) const -> std::unique_ptr -{ +auto StopWordRemover::filter(std::string input) const -> std::unique_ptr { if (m_stopwords.find(input) != m_stopwords.end()) { return std::make_unique(); } return std::make_unique(std::move(input)); } -auto StopWordRemover::filter(CowString input) const -> std::unique_ptr -{ +auto StopWordRemover::filter(CowString input) const -> std::unique_ptr { return filter(std::move(input).to_owned()); } diff --git a/src/token_stream.cpp b/src/token_stream.cpp index 15322772..94764689 100644 --- a/src/token_stream.cpp +++ b/src/token_stream.cpp @@ -2,18 +2,15 @@ namespace pisa { -TokenIterator::TokenIterator(TokenStream* tokenizer) : m_tokenizer(tokenizer), m_pos(0) -{ +TokenIterator::TokenIterator(TokenStream* tokenizer) : m_tokenizer(tokenizer), m_pos(0) { m_token = m_tokenizer == nullptr ? std::nullopt : m_tokenizer->next(); } -[[nodiscard]] auto TokenIterator::operator*() const -> value_type -{ +[[nodiscard]] auto TokenIterator::operator*() const -> value_type { return *m_token; } -auto TokenIterator::operator++() -> TokenIterator& -{ +auto TokenIterator::operator++() -> TokenIterator& { if (m_token.has_value()) { m_token = m_tokenizer->next(); ++m_pos; @@ -21,23 +18,20 @@ auto TokenIterator::operator++() -> TokenIterator& return *this; } -[[nodiscard]] auto TokenIterator::operator++(int) -> TokenIterator -{ +[[nodiscard]] auto TokenIterator::operator++(int) -> TokenIterator { auto copy = *this; ++(*this); return copy; } -[[nodiscard]] auto TokenIterator::operator==(TokenIterator const& other) const -> bool -{ +[[nodiscard]] auto TokenIterator::operator==(TokenIterator const& other) const -> bool { if (m_token.has_value() && other.m_token.has_value()) { return m_pos == other.m_pos; } return m_token.has_value() == other.m_token.has_value(); } -[[nodiscard]] auto TokenIterator::operator!=(TokenIterator const& other) const -> bool -{ +[[nodiscard]] auto TokenIterator::operator!=(TokenIterator const& other) const -> bool { return !(*this == other); } @@ -48,30 +42,25 @@ TokenStream& TokenStream::operator=(TokenStream const&) = default; TokenStream& TokenStream::operator=(TokenStream&&) = default; TokenStream::~TokenStream() = default; -auto TokenStream::begin() -> TokenIterator -{ +auto TokenStream::begin() -> TokenIterator { return TokenIterator(this); } -auto TokenStream::end() -> TokenIterator -{ +auto TokenStream::end() -> TokenIterator { return TokenIterator(nullptr); } -auto TokenStream::collect() -> std::vector -{ +auto TokenStream::collect() -> std::vector { return std::vector(begin(), end()); } -auto EmptyTokenStream::next() -> std::optional -{ +auto EmptyTokenStream::next() -> std::optional { return std::nullopt; } SingleTokenStream::SingleTokenStream(std::string token) : m_next(std::move(token)) {} -auto SingleTokenStream::next() -> std::optional -{ +auto SingleTokenStream::next() -> std::optional { if (!m_next) { return std::nullopt; } diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 68ef49dd..c0a7312c 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -7,13 +7,11 @@ namespace pisa { enum TokenType { Abbreviature = 1, Possessive = 2, Term = 3, NotValid = 4 }; -[[nodiscard]] auto is_space(char symbol) -> bool -{ +[[nodiscard]] auto is_space(char symbol) -> bool { return std::isspace(static_cast(symbol)) != 0; } -auto is_valid(token_type const& tok) -> bool -{ +auto is_valid(token_type const& tok) -> bool { return tok.id() != TokenType::NotValid; } @@ -24,17 +22,13 @@ WhitespaceTokenStream& WhitespaceTokenStream::operator=(WhitespaceTokenStream&&) WhitespaceTokenStream::~WhitespaceTokenStream() = default; WhitespaceTokenStream::WhitespaceTokenStream(std::string_view input) - : m_input(input), m_view(m_input.as_view()) -{} + : m_input(input), m_view(m_input.as_view()) {} WhitespaceTokenStream::WhitespaceTokenStream(std::string input) - : m_input(std::move(input)), m_view(m_input.as_view()) -{} + : m_input(std::move(input)), m_view(m_input.as_view()) {} WhitespaceTokenStream::WhitespaceTokenStream(CowString input) - : m_input(std::move(input)), m_view(m_input.as_view()) -{} + : m_input(std::move(input)), m_view(m_input.as_view()) {} -auto WhitespaceTokenStream::next() -> std::optional -{ +auto WhitespaceTokenStream::next() -> std::optional { auto pos = std::find_if_not(m_view.begin(), m_view.end(), is_space); m_view = m_view.substr(std::distance(m_view.begin(), pos)); if (m_view.empty()) { @@ -53,14 +47,14 @@ WhitespaceTokenizer& WhitespaceTokenizer::operator=(WhitespaceTokenizer const&) WhitespaceTokenizer& WhitespaceTokenizer::operator=(WhitespaceTokenizer&&) = default; WhitespaceTokenizer::~WhitespaceTokenizer() = default; -auto transform_token(token_type const& tok) -> std::string -{ +auto transform_token(token_type const& tok) -> std::string { auto& val = tok.value(); switch (tok.id()) { case TokenType::Abbreviature: { std::string term; - std::copy_if( - val.begin(), val.end(), std::back_inserter(term), [](char ch) { return ch != '.'; }); + std::copy_if(val.begin(), val.end(), std::back_inserter(term), [](char ch) { + return ch != '.'; + }); return term; } case TokenType::Possessive: @@ -73,8 +67,7 @@ struct Lexer: lex::lexer { Lexer(); }; -Lexer::Lexer() -{ +Lexer::Lexer() { // Note: parsing process takes the first match from left to right. this->self = lex::token_def<>("([a-zA-Z]+\\.){2,}", TokenType::Abbreviature) | lex::token_def<>("[a-zA-Z0-9]+('[a-zA-Z]+)", TokenType::Possessive) @@ -89,24 +82,21 @@ EnglishTokenStream::EnglishTokenStream(std::string_view input) m_begin(input.begin()), m_end(input.end()), m_pos(LEXER.begin(m_begin, m_end)), - m_sentinel(LEXER.end()) -{} + m_sentinel(LEXER.end()) {} EnglishTokenStream::EnglishTokenStream(std::string input) : m_input(std::move(input)), m_begin(m_input.as_view().begin()), m_end(m_input.as_view().end()), m_pos(LEXER.begin(m_begin, m_end)), - m_sentinel(LEXER.end()) -{} + m_sentinel(LEXER.end()) {} EnglishTokenStream::EnglishTokenStream(CowString input) : m_input(std::move(input)), m_begin(m_input.as_view().begin()), m_end(m_input.as_view().end()), m_pos(LEXER.begin(m_begin, m_end)), - m_sentinel(LEXER.end()) -{} + m_sentinel(LEXER.end()) {} EnglishTokenStream::EnglishTokenStream(EnglishTokenStream const&) = default; EnglishTokenStream::EnglishTokenStream(EnglishTokenStream&&) = default; @@ -114,8 +104,7 @@ EnglishTokenStream& EnglishTokenStream::operator=(EnglishTokenStream const&) = d EnglishTokenStream& EnglishTokenStream::operator=(EnglishTokenStream&&) = default; EnglishTokenStream::~EnglishTokenStream() = default; -auto EnglishTokenStream::next() -> std::optional -{ +auto EnglishTokenStream::next() -> std::optional { while (m_pos != m_sentinel && !is_valid(*m_pos)) { ++m_pos; } @@ -134,33 +123,27 @@ Tokenizer& Tokenizer::operator=(Tokenizer const&) = default; Tokenizer& Tokenizer::operator=(Tokenizer&&) = default; Tokenizer::~Tokenizer() = default; -auto WhitespaceTokenizer::tokenize(std::string_view input) const -> std::unique_ptr -{ +auto WhitespaceTokenizer::tokenize(std::string_view input) const -> std::unique_ptr { return std::make_unique(input); } -auto WhitespaceTokenizer::tokenize(std::string input) const -> std::unique_ptr -{ +auto WhitespaceTokenizer::tokenize(std::string input) const -> std::unique_ptr { return std::make_unique(std::move(input)); } -auto WhitespaceTokenizer::tokenize(CowString input) const -> std::unique_ptr -{ +auto WhitespaceTokenizer::tokenize(CowString input) const -> std::unique_ptr { return std::make_unique(std::move(input)); } -auto EnglishTokenizer::tokenize(std::string_view input) const -> std::unique_ptr -{ +auto EnglishTokenizer::tokenize(std::string_view input) const -> std::unique_ptr { return std::make_unique(input); } -auto EnglishTokenizer::tokenize(std::string input) const -> std::unique_ptr -{ +auto EnglishTokenizer::tokenize(std::string input) const -> std::unique_ptr { return std::make_unique(std::move(input)); } -auto EnglishTokenizer::tokenize(CowString input) const -> std::unique_ptr -{ +auto EnglishTokenizer::tokenize(CowString input) const -> std::unique_ptr { return std::make_unique(std::move(input)); } diff --git a/test/in_memory_index.cpp b/test/in_memory_index.cpp index a70a9cee..b7adede7 100644 --- a/test/in_memory_index.cpp +++ b/test/in_memory_index.cpp @@ -1,30 +1,25 @@ #include "in_memory_index.hpp" -auto VectorCursor::size() const noexcept -> std::size_t -{ +auto VectorCursor::size() const noexcept -> std::size_t { return documents.size(); } -auto VectorCursor::docid() const noexcept -> std::uint32_t -{ +auto VectorCursor::docid() const noexcept -> std::uint32_t { return documents[0]; } -auto VectorCursor::freq() const noexcept -> float -{ +auto VectorCursor::freq() const noexcept -> float { return frequencies[0]; } -void VectorCursor::next() -{ +void VectorCursor::next() { if (documents[0] < max_docid) { documents = documents.subspan(1); frequencies = frequencies.subspan(1); try_finish(); } } -void VectorCursor::next_geq(std::uint32_t docid) -{ +void VectorCursor::next_geq(std::uint32_t docid) { if (documents[0] < max_docid) { auto new_pos = std::lower_bound(documents.begin(), documents.end(), docid); auto skip = std::distance(documents.begin(), new_pos); @@ -34,71 +29,62 @@ void VectorCursor::next_geq(std::uint32_t docid) } } -void VectorCursor::try_finish() -{ +void VectorCursor::try_finish() { if (documents.empty()) { documents = gsl::make_span(sentinel_document); } } -auto InMemoryIndex::operator[](std::uint32_t term_id) const -> VectorCursor -{ +auto InMemoryIndex::operator[](std::uint32_t term_id) const -> VectorCursor { if (term_id >= size()) { throw std::out_of_range( - fmt::format("Term {} is out of range; index contains {} terms", term_id, size())); + fmt::format("Term {} is out of range; index contains {} terms", term_id, size()) + ); } - return {gsl::make_span(documents[term_id]), - gsl::make_span(frequencies[term_id]), - num_documents, - {num_documents}}; + return { + gsl::make_span(documents[term_id]), + gsl::make_span(frequencies[term_id]), + num_documents, + {num_documents} + }; } -auto InMemoryIndex::size() const noexcept -> std::size_t -{ +auto InMemoryIndex::size() const noexcept -> std::size_t { return documents.size(); } -auto InMemoryIndex::num_docs() const noexcept -> std::size_t -{ +auto InMemoryIndex::num_docs() const noexcept -> std::size_t { return num_documents; } -auto InMemoryWand::max_term_weight(std::uint32_t term_id) const noexcept -> float -{ +auto InMemoryWand::max_term_weight(std::uint32_t term_id) const noexcept -> float { return max_weights[term_id]; } -auto InMemoryWand::term_posting_count(std::uint32_t term_id) const noexcept -> std::uint32_t -{ +auto InMemoryWand::term_posting_count(std::uint32_t term_id) const noexcept -> std::uint32_t { return 1; } -auto InMemoryWand::term_occurrence_count(std::uint32_t term_id) const noexcept -> std::uint32_t -{ +auto InMemoryWand::term_occurrence_count(std::uint32_t term_id) const noexcept -> std::uint32_t { return 1; } -auto InMemoryWand::norm_len(std::uint32_t docid) const noexcept -> float -{ +auto InMemoryWand::norm_len(std::uint32_t docid) const noexcept -> float { return 1.0; } -auto InMemoryWand::doc_len(std::uint32_t docid) const noexcept -> std::uint32_t -{ +auto InMemoryWand::doc_len(std::uint32_t docid) const noexcept -> std::uint32_t { return 1; } -auto InMemoryWand::avg_len() const noexcept -> float -{ +auto InMemoryWand::avg_len() const noexcept -> float { return 1.0; } -auto InMemoryWand::num_docs() const noexcept -> std::size_t -{ +auto InMemoryWand::num_docs() const noexcept -> std::size_t { return num_documents; } -auto InMemoryWand::collection_len() const noexcept -> std::size_t -{ +auto InMemoryWand::collection_len() const noexcept -> std::size_t { return 1; } diff --git a/test/test_algorithm.cpp b/test/test_algorithm.cpp index e92d4ea0..f96a421c 100644 --- a/test/test_algorithm.cpp +++ b/test/test_algorithm.cpp @@ -12,25 +12,24 @@ using namespace rc; const auto genlists = gen::mapcat(gen::inRange(0, 1000), [](std::size_t length) { return gen::pair( gen::container>(length, gen::arbitrary()), - gen::container>(length, gen::arbitrary())); + gen::container>(length, gen::arbitrary()) + ); }); -TEST_CASE("pisa::transform", "[algorithm][prop]") -{ - SECTION("Add 1") - { +TEST_CASE("pisa::transform", "[algorithm][prop]") { + SECTION("Add 1") { rc::check([](std::vector vals) { auto inc = [](auto val) { return val + 1; }; std::vector actual; pisa::transform( - pisa::execution::par_unseq, vals.begin(), vals.end(), std::back_inserter(actual), inc); + pisa::execution::par_unseq, vals.begin(), vals.end(), std::back_inserter(actual), inc + ); std::vector expected; std::transform(vals.begin(), vals.end(), std::back_inserter(expected), inc); REQUIRE(actual == expected); }); } - SECTION("Add two sequences") - { + SECTION("Add two sequences") { rc::check([&]() { auto [lhs, rhs] = *genlists; std::vector actual; @@ -40,19 +39,19 @@ TEST_CASE("pisa::transform", "[algorithm][prop]") lhs.end(), rhs.begin(), std::back_inserter(actual), - std::plus<>{}); + std::plus<>{} + ); std::vector expected; std::transform( - lhs.begin(), lhs.end(), rhs.begin(), std::back_inserter(expected), std::plus<>{}); + lhs.begin(), lhs.end(), rhs.begin(), std::back_inserter(expected), std::plus<>{} + ); REQUIRE(actual == expected); }); } } -TEST_CASE("pisa::sort", "[algorithm][prop]") -{ - SECTION("Default sort") - { +TEST_CASE("pisa::sort", "[algorithm][prop]") { + SECTION("Default sort") { rc::check([](std::vector vals) { std::vector actual = vals; pisa::sort(pisa::execution::par_unseq, actual.begin(), actual.end()); @@ -61,8 +60,7 @@ TEST_CASE("pisa::sort", "[algorithm][prop]") REQUIRE(actual == expected); }); } - SECTION("Reverse sort") - { + SECTION("Reverse sort") { rc::check([](std::vector vals) { std::vector actual = vals; pisa::sort(pisa::execution::par_unseq, actual.begin(), actual.end(), std::greater<>{}); @@ -73,8 +71,7 @@ TEST_CASE("pisa::sort", "[algorithm][prop]") } } -TEST_CASE("pisa::for_each", "[algorithm][prop]") -{ +TEST_CASE("pisa::for_each", "[algorithm][prop]") { rc::check([](std::vector vals) { auto inc = [](auto& val) { return val += 1; }; std::vector actual = vals; diff --git a/test/test_bit_vector.cpp b/test/test_bit_vector.cpp index 7d8e0652..9b25ddb8 100644 --- a/test/test_bit_vector.cpp +++ b/test/test_bit_vector.cpp @@ -11,8 +11,7 @@ #include "test_common.hpp" #include "test_rank_select_common.hpp" -TEST_CASE("bit_vector") -{ +TEST_CASE("bit_vector") { rc::check([](std::vector v) { { pisa::bit_vector_builder bvb; @@ -38,21 +37,23 @@ TEST_CASE("bit_vector") test_equal_bits(v, bitmap, "Random bits (set)"); } - auto ints = std::array{uint64_t(-1), - uint64_t(1) << 63U, - 1, - 1, - 1, - 3, - 5, - 7, - 0xFFF, - 0xF0F, - 1, - 0xFFFFFF, - 0x123456, - uint64_t(1) << 63U, - uint64_t(-1)}; + auto ints = std::array{ + uint64_t(-1), + uint64_t(1) << 63U, + 1, + 1, + 1, + 3, + 5, + 7, + 0xFFF, + 0xF0F, + 1, + 0xFFFFFF, + 0x123456, + uint64_t(1) << 63U, + uint64_t(-1) + }; { pisa::bit_vector_builder bvb; for (uint64_t i: ints) { @@ -93,8 +94,7 @@ TEST_CASE("bit_vector") }); } -TEST_CASE("bit_vector_enumerator") -{ +TEST_CASE("bit_vector_enumerator") { rc::check([](std::vector v) { pisa::bit_vector bitmap(v); @@ -114,8 +114,7 @@ TEST_CASE("bit_vector_enumerator") }); } -TEST_CASE("bit_vector_unary_enumerator") -{ +TEST_CASE("bit_vector_unary_enumerator") { std::random_device rd; std::mt19937 gen(rd()); std::bernoulli_distribution d(0.5); @@ -131,7 +130,8 @@ TEST_CASE("bit_vector_unary_enumerator") posv.end(), std::back_inserter(intervals), 40, - std::mt19937{std::random_device{}()}); + std::mt19937{std::random_device{}()} + ); REQUIRE(intervals.size() % 2 == 0); for (auto left = intervals.begin(); left != intervals.end(); std::advance(left, 2)) { auto right = std::next(left); @@ -212,8 +212,7 @@ TEST_CASE("bit_vector_unary_enumerator") } } -TEST_CASE("bvb_reverse") -{ +TEST_CASE("bvb_reverse") { rc::check([](std::vector v) { pisa::bit_vector_builder bvb; for (auto elem: v) { diff --git a/test/test_block_codecs.cpp b/test/test_block_codecs.cpp index f0dd2aeb..17a3a6f2 100644 --- a/test/test_block_codecs.cpp +++ b/test/test_block_codecs.cpp @@ -22,8 +22,7 @@ using namespace rc; template -void test_case(std::vector values, bool use_sum_of_values) -{ +void test_case(std::vector values, bool use_sum_of_values) { std::uint32_t sum_of_values = use_sum_of_values ? std::accumulate(values.begin(), values.end(), 0) : std::uint32_t(-1); @@ -49,13 +48,14 @@ void test_case(std::vector values, bool use_sum_of_values) } template -void test_block_codec() -{ +void test_block_codec() { const auto lengths = gen::elementOf( - std::vector{1, 2, BlockCodec::block_size - 1, BlockCodec::block_size}); + std::vector{1, 2, BlockCodec::block_size - 1, BlockCodec::block_size} + ); const auto genlist = gen::mapcat(lengths, [](std::size_t len) { return gen::container>( - len, gen::inRange(1, 1 << 12)); + len, gen::inRange(1, 1 << 12) + ); }); std::size_t use_sum_of_values = GENERATE(true, false); @@ -76,15 +76,16 @@ TEMPLATE_TEST_CASE( pisa::varintgb_block, pisa::simple8b_block, pisa::simple16_block, - pisa::simdbp_block) -{ + pisa::simdbp_block +) { std::size_t use_sum_of_values = GENERATE(true, false); - std::vector values{ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 259}; + std::vector values{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 259}; test_case(values, use_sum_of_values); } @@ -101,8 +102,8 @@ TEMPLATE_TEST_CASE( pisa::varintgb_block, pisa::simple8b_block, pisa::simple16_block, - pisa::simdbp_block) -{ + pisa::simdbp_block +) { std::size_t use_sum_of_values = GENERATE(true, false); test_block_codec(); } diff --git a/test/test_block_freq_index.cpp b/test/test_block_freq_index.cpp index f68d8cb7..de977cdc 100644 --- a/test/test_block_freq_index.cpp +++ b/test/test_block_freq_index.cpp @@ -22,8 +22,7 @@ #include template -void test_block_freq_index() -{ +void test_block_freq_index() { pisa::global_parameters params; uint64_t universe = 20000; using collection_type = pisa::block_freq_index; @@ -64,8 +63,7 @@ void test_block_freq_index() } } -TEST_CASE("block_freq_index") -{ +TEST_CASE("block_freq_index") { test_block_freq_index(); test_block_freq_index(); test_block_freq_index(); diff --git a/test/test_block_posting_list.cpp b/test/test_block_posting_list.cpp index 5b9fe626..ab262072 100644 --- a/test/test_block_posting_list.cpp +++ b/test/test_block_posting_list.cpp @@ -25,8 +25,8 @@ void test_block_posting_list_ops( uint64_t n, uint64_t universe, std::vector const& docs, - std::vector const& freqs) -{ + std::vector const& freqs +) { typename PostingList::document_enumerator e(data, universe); REQUIRE(n == e.size()); for (size_t i = 0; i < n; ++i, e.next()) { @@ -49,16 +49,15 @@ void test_block_posting_list_ops( } void random_posting_data( - uint64_t n, uint64_t universe, std::vector& docs, std::vector& freqs) -{ + uint64_t n, uint64_t universe, std::vector& docs, std::vector& freqs +) { docs = random_sequence(universe, n, true); freqs.resize(n); std::generate(freqs.begin(), freqs.end(), []() { return (rand() % 256) + 1; }); } template -void test_block_posting_list() -{ +void test_block_posting_list() { using posting_list_type = pisa::block_posting_list; uint64_t universe = 20000; for (size_t t = 0; t < 20; ++t) { @@ -75,8 +74,7 @@ void test_block_posting_list() } template -void test_block_posting_list_reordering() -{ +void test_block_posting_list_reordering() { using posting_list_type = pisa::block_posting_list; uint64_t universe = 20000; for (size_t t = 0; t < 20; ++t) { @@ -94,18 +92,17 @@ void test_block_posting_list_reordering() std::shuffle( blocks.begin() + 1, blocks.end(), - std::mt19937(std::random_device()())); // leave first block in place + std::mt19937(std::random_device()()) + ); // leave first block in place std::vector reordered_data; posting_list_type::write_blocks(reordered_data, n, blocks); - test_block_posting_list_ops( - reordered_data.data(), n, universe, docs, freqs); + test_block_posting_list_ops(reordered_data.data(), n, universe, docs, freqs); } } -TEST_CASE("block_posting_list") -{ +TEST_CASE("block_posting_list") { test_block_posting_list(); test_block_posting_list(); test_block_posting_list(); @@ -117,7 +114,6 @@ TEST_CASE("block_posting_list") test_block_posting_list(); test_block_posting_list(); } -TEST_CASE("block_posting_list_reordering") -{ +TEST_CASE("block_posting_list_reordering") { test_block_posting_list_reordering(); } diff --git a/test/test_bmw_queries.cpp b/test/test_bmw_queries.cpp index 8ec2cace..ecbaa8a9 100644 --- a/test/test_bmw_queries.cpp +++ b/test/test_bmw_queries.cpp @@ -36,14 +36,16 @@ struct IndexData { ScorerParams(scorer_name), BlockSize(VariableBlock(12.0)), std::nullopt, - dropped_term_ids) + dropped_term_ids + ) { typename Index::builder builder(collection.num_docs(), params); for (auto const& plist: collection) { uint64_t freqs_sum = std::accumulate(plist.freqs.begin(), plist.freqs.end(), uint64_t(0)); builder.add_posting_list( - plist.docs.size(), plist.docs.begin(), plist.freqs.begin(), freqs_sum); + plist.docs.size(), plist.docs.begin(), plist.freqs.begin(), freqs_sum + ); } builder.build(index); term_id_vec q; @@ -62,8 +64,7 @@ struct IndexData { WandTypePlain wdata; [[nodiscard]] static auto - get(std::string const& s_name, std::unordered_set const& dropped_term_ids) - { + get(std::string const& s_name, std::unordered_set const& dropped_term_ids) { if (IndexData::data.find(s_name) == IndexData::data.end()) { IndexData::data[s_name] = std::make_unique>(s_name, dropped_term_ids); } @@ -75,8 +76,7 @@ template std::unordered_map>> IndexData::data = {}; template -auto test(Wand& wdata, std::string const& s_name) -{ +auto test(Wand& wdata, std::string const& s_name) { std::unordered_set dropped_term_ids; auto data = IndexData::get(s_name, dropped_term_ids); topk_queue topk_1(10); @@ -93,26 +93,25 @@ auto test(Wand& wdata, std::string const& s_name) REQUIRE(topk_2.topk().size() == topk_1.topk().size()); for (size_t i = 0; i < wand_q.topk().size(); ++i) { - REQUIRE( - topk_2.topk()[i].first == Approx(topk_1.topk()[i].first).epsilon(0.01)); // tolerance - // is - // % - // relative + REQUIRE(topk_2.topk()[i].first == Approx(topk_1.topk()[i].first).epsilon(0.01)); // tolerance + // is + // % + // relative } topk_1.clear(); topk_2.clear(); } } -TEST_CASE("block_max_wand", "[bmw][query][ranked][integration]", ) -{ +TEST_CASE("block_max_wand", "[bmw][query][ranked][integration]", ) { for (auto&& s_name: {"bm25", "qld"}) { std::unordered_set dropped_term_ids; auto data = IndexData::get(s_name, dropped_term_ids); - SECTION("Regular") { test(data->wdata, s_name); } - SECTION("Fixed") - { + SECTION("Regular") { + test(data->wdata, s_name); + } + SECTION("Fixed") { std::unordered_set dropped_term_ids; WandTypePlain wdata_fixed( data->document_sizes.begin()->begin(), @@ -121,11 +120,11 @@ TEST_CASE("block_max_wand", "[bmw][query][ranked][integration]", ) ScorerParams(s_name), BlockSize(FixedBlock(5)), std::nullopt, - dropped_term_ids); + dropped_term_ids + ); test(wdata_fixed, s_name); } - SECTION("Uniform") - { + SECTION("Uniform") { std::unordered_set dropped_term_ids; WandTypeUniform wdata_uniform( data->document_sizes.begin()->begin(), @@ -134,7 +133,8 @@ TEST_CASE("block_max_wand", "[bmw][query][ranked][integration]", ) ScorerParams(s_name), BlockSize(VariableBlock(12.0)), Size(8), - dropped_term_ids); + dropped_term_ids + ); test(wdata_uniform, s_name); } } diff --git a/test/test_compact_elias_fano.cpp b/test/test_compact_elias_fano.cpp index 74c01e24..4856bcd4 100644 --- a/test/test_compact_elias_fano.cpp +++ b/test/test_compact_elias_fano.cpp @@ -8,8 +8,7 @@ #include struct sequence_initialization { - sequence_initialization() - { + sequence_initialization() { n = 100000; universe = n * 1024; seq = random_sequence(universe, n); @@ -31,8 +30,7 @@ struct sequence_initialization { pisa::bit_vector bv; }; -TEST_CASE_METHOD(sequence_initialization, "compact_elias_fano_singleton") -{ +TEST_CASE_METHOD(sequence_initialization, "compact_elias_fano_singleton") { // test singleton sequences std::vector short_seq; short_seq.push_back(0); @@ -41,8 +39,7 @@ TEST_CASE_METHOD(sequence_initialization, "compact_elias_fano_singleton") test_sequence(pisa::compact_elias_fano(), params, 2, short_seq); } -TEST_CASE_METHOD(sequence_initialization, "compact_elias_fano_construction") -{ +TEST_CASE_METHOD(sequence_initialization, "compact_elias_fano_construction") { // test pointers and low-level values pisa::compact_elias_fano::offsets of(0, universe, seq.size(), params); uint64_t rank = 0; @@ -71,14 +68,12 @@ TEST_CASE_METHOD(sequence_initialization, "compact_elias_fano_construction") } } -TEST_CASE_METHOD(sequence_initialization, "compact_elias_fano_enumerator") -{ +TEST_CASE_METHOD(sequence_initialization, "compact_elias_fano_enumerator") { pisa::compact_elias_fano::enumerator r(bv, 0, universe, seq.size(), params); test_sequence(r, seq); } -TEST_CASE_METHOD(sequence_initialization, "compact_elias_fano_weakly_monotone") -{ +TEST_CASE_METHOD(sequence_initialization, "compact_elias_fano_weakly_monotone") { n = 100000; universe = n * 3; std::vector seq = random_sequence(universe, n, false); diff --git a/test/test_compact_ranked_bitvector.cpp b/test/test_compact_ranked_bitvector.cpp index 75a5f979..e0052c3f 100644 --- a/test/test_compact_ranked_bitvector.cpp +++ b/test/test_compact_ranked_bitvector.cpp @@ -9,8 +9,7 @@ #include struct sequence_initialization { - sequence_initialization() : seq(random_sequence(universe, n, true)) - { + sequence_initialization() : seq(random_sequence(universe, n, true)) { // high granularity to test more corner cases params.rb_log_rank1_sampling = 6; params.rb_log_sampling1 = 5; @@ -26,8 +25,7 @@ struct sequence_initialization { pisa::bit_vector bv; }; -TEST_CASE_METHOD(sequence_initialization, "compact_ranked_bitvector_construction") -{ +TEST_CASE_METHOD(sequence_initialization, "compact_ranked_bitvector_construction") { // test pointers and rank samples pisa::compact_ranked_bitvector::offsets of(0, universe, seq.size(), params); uint64_t rank = 0; @@ -54,8 +52,7 @@ TEST_CASE_METHOD(sequence_initialization, "compact_ranked_bitvector_construction } } -TEST_CASE_METHOD(sequence_initialization, "compact_ranked_bitvector_singleton") -{ +TEST_CASE_METHOD(sequence_initialization, "compact_ranked_bitvector_singleton") { // test singleton sequences std::vector short_seq; short_seq.push_back(0); @@ -64,8 +61,7 @@ TEST_CASE_METHOD(sequence_initialization, "compact_ranked_bitvector_singleton") test_sequence(pisa::compact_ranked_bitvector(), params, 2, short_seq); } -TEST_CASE_METHOD(sequence_initialization, "compact_ranked_bitvector_enumerator") -{ +TEST_CASE_METHOD(sequence_initialization, "compact_ranked_bitvector_enumerator") { pisa::compact_ranked_bitvector::enumerator r(bv, 0, universe, seq.size(), params); test_sequence(r, seq); } diff --git a/test/test_cow_string.cpp b/test/test_cow_string.cpp index c79110f8..8bba2c69 100644 --- a/test/test_cow_string.cpp +++ b/test/test_cow_string.cpp @@ -10,67 +10,81 @@ constexpr char const* VALUE = "This is a long enough string so that when in std::string, it is allocated, " "and short-string optimization is not used."; -TEST_CASE("CowString") -{ - GIVEN("An owned CowString") - { +TEST_CASE("CowString") { + GIVEN("An owned CowString") { std::string value = VALUE; CowString cow(value); - WHEN("Accessed with as_view") - { + WHEN("Accessed with as_view") { auto view = cow.as_view(); - THEN("Equal to original value") { REQUIRE(view == value); } - THEN("Data location is different") { REQUIRE(view.data() != value.data()); } + THEN("Equal to original value") { + REQUIRE(view == value); + } + THEN("Data location is different") { + REQUIRE(view.data() != value.data()); + } } - WHEN("Accessed with to_owned") - { + WHEN("Accessed with to_owned") { auto owned = std::move(cow).to_owned(); - THEN("Equal to original value") { REQUIRE(owned == value); } - THEN("Data location is different") { REQUIRE(owned.data() != value.data()); } + THEN("Equal to original value") { + REQUIRE(owned == value); + } + THEN("Data location is different") { + REQUIRE(owned.data() != value.data()); + } } } - GIVEN("An owned CowString (moved from value)") - { + GIVEN("An owned CowString (moved from value)") { std::string value = VALUE; // We will check that we never copy the value, thus the data pointer is the same char const* data_ptr = value.data(); CowString cow(std::move(value)); - WHEN("Accessed with as_view") - { + WHEN("Accessed with as_view") { auto view = cow.as_view(); - THEN("Equal to original value") { REQUIRE(view == VALUE); } - THEN("Data location is the same as initially") { REQUIRE(view.data() == data_ptr); } + THEN("Equal to original value") { + REQUIRE(view == VALUE); + } + THEN("Data location is the same as initially") { + REQUIRE(view.data() == data_ptr); + } } - WHEN("Accessed with to_owned") - { + WHEN("Accessed with to_owned") { auto owned = std::move(cow).to_owned(); - THEN("Equal to original value") { REQUIRE(owned == VALUE); } - THEN("Data location is the same as initially") { REQUIRE(owned.data() == data_ptr); } + THEN("Equal to original value") { + REQUIRE(owned == VALUE); + } + THEN("Data location is the same as initially") { + REQUIRE(owned.data() == data_ptr); + } } } - GIVEN("A borrowed CowString") - { + GIVEN("A borrowed CowString") { std::string value = VALUE; // We will check that we never copy the value, thus the data pointer is the same char const* data_ptr = value.data(); CowString cow(std::string_view{value}); - WHEN("Accessed with as_view") - { + WHEN("Accessed with as_view") { auto view = cow.as_view(); - THEN("Equal to original value") { REQUIRE(view == VALUE); } - THEN("Data location is the same as initially") { REQUIRE(view.data() == data_ptr); } + THEN("Equal to original value") { + REQUIRE(view == VALUE); + } + THEN("Data location is the same as initially") { + REQUIRE(view.data() == data_ptr); + } } - WHEN("Accessed with to_owned") - { + WHEN("Accessed with to_owned") { auto owned = std::move(cow).to_owned(); - THEN("Equal to original value") { REQUIRE(owned == VALUE); } - THEN("Data location is the same as initially") { REQUIRE(owned.data() != data_ptr); } + THEN("Equal to original value") { + REQUIRE(owned == VALUE); + } + THEN("Data location is the same as initially") { + REQUIRE(owned.data() != data_ptr); + } } } } diff --git a/test/test_cursors.cpp b/test/test_cursors.cpp index 254d8fbe..761a6879 100644 --- a/test/test_cursors.cpp +++ b/test/test_cursors.cpp @@ -12,27 +12,28 @@ using namespace pisa; -TEST_CASE("TODO") -{ - InMemoryIndex index{{ - {0}, // 0 - {0, 1, 2}, // 1 - {0}, // 2 - {0}, // 3 - {0}, // 4 - {0, 1, 4}, // 5 - {1, 4, 8}, // 6 - }, - { - {1}, // 0 - {1, 1, 1}, // 1 - {1}, // 2 - {1}, // 3 - {1}, // 4 - {1, 1, 1}, // 5 - {1, 1, 1}, // 6 - }, - 10}; +TEST_CASE("TODO") { + InMemoryIndex index{ + { + {0}, // 0 + {0, 1, 2}, // 1 + {0}, // 2 + {0}, // 3 + {0}, // 4 + {0, 1, 4}, // 5 + {1, 4, 8}, // 6 + }, + { + {1}, // 0 + {1, 1, 1}, // 1 + {1}, // 2 + {1}, // 3 + {1}, // 4 + {1, 1, 1}, // 5 + {1, 1, 1}, // 6 + }, + 10 + }; InMemoryWand wand{{1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}, 10}; quantized scorer(wand); Query query{"Q1", {0, 1, 1, 2}, {}}; @@ -46,50 +47,40 @@ TEST_CASE("TODO") return scores; }; - SECTION("Scored cursor") - { - WHEN("No weights are requested") - { + SECTION("Scored cursor") { + WHEN("No weights are requested") { std::vector> scores; auto cursors = make_scored_cursors(index, scorer, query); - std::transform( - cursors.begin(), cursors.end(), std::back_inserter(scores), collect_scores); + std::transform(cursors.begin(), cursors.end(), std::back_inserter(scores), collect_scores); CHECK(scores == std::vector>{{1}, {1, 1, 1}, {1}}); } - WHEN("Weights are requested") - { + WHEN("Weights are requested") { std::vector> scores; auto cursors = make_scored_cursors(index, scorer, query, true); - std::transform( - cursors.begin(), cursors.end(), std::back_inserter(scores), collect_scores); + std::transform(cursors.begin(), cursors.end(), std::back_inserter(scores), collect_scores); CHECK(scores == std::vector>{{1}, {2, 2, 2}, {1}}); } } - SECTION("Max-scored cursor") - { - WHEN("No weights are requested") - { + SECTION("Max-scored cursor") { + WHEN("No weights are requested") { std::vector> scores; auto cursors = make_max_scored_cursors(index, wand, scorer, query); CHECK(cursors[0].max_score() == 1.0); CHECK(cursors[1].max_score() == 1.0); CHECK(cursors[2].max_score() == 1.0); - std::transform( - cursors.begin(), cursors.end(), std::back_inserter(scores), collect_scores); + std::transform(cursors.begin(), cursors.end(), std::back_inserter(scores), collect_scores); CHECK(scores == std::vector>{{1}, {1, 1, 1}, {1}}); } - WHEN("Weights are requested") - { + WHEN("Weights are requested") { std::vector> scores; auto cursors = make_max_scored_cursors(index, wand, scorer, query, true); CHECK(cursors[0].max_score() == 1.0); CHECK(cursors[1].max_score() == 2.0); CHECK(cursors[2].max_score() == 1.0); - std::transform( - cursors.begin(), cursors.end(), std::back_inserter(scores), collect_scores); + std::transform(cursors.begin(), cursors.end(), std::back_inserter(scores), collect_scores); CHECK(scores == std::vector>{{1}, {2, 2, 2}, {1}}); } } diff --git a/test/test_forward_index.cpp b/test/test_forward_index.cpp index ea3d13fd..5f229250 100644 --- a/test/test_forward_index.cpp +++ b/test/test_forward_index.cpp @@ -7,8 +7,7 @@ #include -TEST_CASE("write_and_read") -{ +TEST_CASE("write_and_read") { // given using namespace pisa; std::string invind_input("test_data/test_collection"); diff --git a/test/test_forward_index_builder.cpp b/test/test_forward_index_builder.cpp index 27889666..78e82291 100644 --- a/test/test_forward_index_builder.cpp +++ b/test/test_forward_index_builder.cpp @@ -20,47 +20,44 @@ using namespace pisa; -TEST_CASE("Batch file name", "[parsing][forward_index]") -{ +TEST_CASE("Batch file name", "[parsing][forward_index]") { std::string basename = "basename"; REQUIRE(Forward_Index_Builder::batch_file(basename, 0) == basename + ".batch.0"); REQUIRE(Forward_Index_Builder::batch_file(basename, 10) == basename + ".batch.10"); } -TEST_CASE("Write document to stream", "[parsing][forward_index]") -{ +TEST_CASE("Write document to stream", "[parsing][forward_index]") { std::ostringstream os; auto [term_ids, encoded_sequence] = GENERATE(table, std::string>( - {{{0, 1, 2, 3, 4, 3, 2, 1, 0}, - {9, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, - 4, 0, 0, 0, 3, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0}}, - {{}, {0, 0, 0, 0}}})); - WHEN("List of term IDs is written to stream") - { + {{{0, 1, 2, 3, 4, 3, 2, 1, 0}, {9, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, + 4, 0, 0, 0, 3, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0}}, + {{}, {0, 0, 0, 0}}} + )); + WHEN("List of term IDs is written to stream") { Forward_Index_Builder::write_document(os, term_ids.begin(), term_ids.end()); - THEN("Encoded sequence is " << encoded_sequence) { REQUIRE(os.str() == encoded_sequence); } + THEN("Encoded sequence is " << encoded_sequence) { + REQUIRE(os.str() == encoded_sequence); + } } } -TEST_CASE("Write header", "[parsing][forward_index]") -{ +TEST_CASE("Write header", "[parsing][forward_index]") { std::ostringstream os; - auto [document_count, encoded_header] = - GENERATE(table({{0, {1, 0, 0, 0, 0, 0, 0, 0}}, - {1, {1, 0, 0, 0, 1, 0, 0, 0}}, - {10, {1, 0, 0, 0, 10, 0, 0, 0}}})); + auto [document_count, encoded_header] = GENERATE(table( + {{0, {1, 0, 0, 0, 0, 0, 0, 0}}, {1, {1, 0, 0, 0, 1, 0, 0, 0}}, {10, {1, 0, 0, 0, 10, 0, 0, 0}}} + )); GIVEN("Document count is " << document_count) - WHEN("Header is written to stream") - { + WHEN("Header is written to stream") { Forward_Index_Builder::write_header(os, document_count); - THEN("Encoded header is " << encoded_header) { REQUIRE(os.str() == encoded_header); } + THEN("Encoded header is " << encoded_header) { + REQUIRE(os.str() == encoded_header); + } } } -[[nodiscard]] std::vector load_lines(std::istream& is) -{ +[[nodiscard]] std::vector load_lines(std::istream& is) { std::string line; std::vector vec; while (std::getline(is, line)) { @@ -69,67 +66,59 @@ TEST_CASE("Write header", "[parsing][forward_index]") return vec; } -[[nodiscard]] std::vector load_lines(std::string const& filename) -{ +[[nodiscard]] std::vector load_lines(std::string const& filename) { std::ifstream is(filename); return load_lines(is); } template -void write_lines(std::ostream& os, gsl::span&& elements) -{ +void write_lines(std::ostream& os, gsl::span&& elements) { for (auto const& element: elements) { os << element << '\n'; } } template -void write_lines(std::string const& filename, gsl::span&& elements) -{ +void write_lines(std::string const& filename, gsl::span&& elements) { std::ofstream os(filename); write_lines(os, std::forward>(elements)); } -TEST_CASE("Build forward index batch", "[parsing][forward_index]") -{ - GIVEN("a few test records") - { +TEST_CASE("Build forward index batch", "[parsing][forward_index]") { + GIVEN("a few test records") { std::vector records{ Document_Record("Doc10", "lorem ipsum dolor sit amet consectetur adipiscing elit", ""), Document_Record("Doc11", "integer rutrum felis et sagittis dapibus", ""), Document_Record("Doc12", "vivamus ac velit nec purus molestie tincidunt", ""), Document_Record("Doc13", "vivamus eu quam vitae lacus porta tempus quis eu metus", ""), - Document_Record( - "Doc14", "curabitur a justo vitae turpis feugiat molestie eu ac nunc", "")}; - WHEN("write a batch to temp directory") - { + Document_Record("Doc14", "curabitur a justo vitae turpis feugiat molestie eu ac nunc", "") + }; + WHEN("write a batch to temp directory") { pisa::TemporaryDirectory tmpdir; auto output_file = tmpdir.path() / "fwd"; - Forward_Index_Builder::Batch_Process bp{ - 7, records, Document_Id{10}, output_file.string()}; + Forward_Index_Builder::Batch_Process bp{7, records, Document_Id{10}, output_file.string()}; Forward_Index_Builder builder; builder.run(bp, TextAnalyzer(std::make_unique())); - THEN("documents are in check") - { + THEN("documents are in check") { std::vector expected_documents{ - "Doc10", "Doc11", "Doc12", "Doc13", "Doc14"}; + "Doc10", "Doc11", "Doc12", "Doc13", "Doc14" + }; auto documents = load_lines(output_file.string() + ".batch.7.documents"); REQUIRE(documents == expected_documents); } - THEN("terms are in check") - { + THEN("terms are in check") { std::vector expected_terms{ "lorem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing", "elit", "integer", "rutrum", "felis", "et", "sagittis", "dapibus", "vivamus", "ac", "velit", "nec", "purus", "molestie", "tincidunt", "eu", "quam", "vitae", "lacus", "porta", "tempus", "quis", "metus", "curabitur", - "a", "justo", "turpis", "feugiat", "nunc"}; + "a", "justo", "turpis", "feugiat", "nunc" + }; auto terms = load_lines(output_file.string() + ".batch.7.terms"); REQUIRE(terms == expected_terms); } - THEN("term IDs") - { + THEN("term IDs") { binary_collection coll((output_file.string() + ".batch.7").c_str()); std::vector> documents; for (auto seq_iter = ++coll.begin(); seq_iter != coll.end(); ++seq_iter) { @@ -141,7 +130,8 @@ TEST_CASE("Build forward index batch", "[parsing][forward_index]") {8, 9, 10, 11, 12, 13}, {14, 15, 16, 17, 18, 19, 20}, {14, 21, 22, 23, 24, 25, 26, 27, 21, 28}, - {29, 30, 31, 23, 32, 33, 19, 21, 15, 34}}; + {29, 30, 31, 23, 32, 33, 19, 21, 15, 34} + }; REQUIRE(documents == expected_documents); } } @@ -152,8 +142,8 @@ void write_batch( std::string const& basename, std::vector const& documents, std::vector const& terms, - std::vector> const& collection) -{ + std::vector> const& collection +) { std::string document_file = basename + ".documents"; std::string term_file = basename + ".terms"; write_lines(document_file, gsl::make_span(documents)); @@ -165,14 +155,13 @@ void write_batch( } } -TEST_CASE("Merge forward index batches", "[parsing][forward_index]") -{ +TEST_CASE("Merge forward index batches", "[parsing][forward_index]") { pisa::TemporaryDirectory tmpdir; auto dir = tmpdir.path(); - GIVEN("Three batches on disk") - { + GIVEN("Three batches on disk") { std::vector batch_paths{ - dir / "fwd.batch.0", dir / "fwd.batch.1", dir / "fwd.batch.2"}; + dir / "fwd.batch.0", dir / "fwd.batch.1", dir / "fwd.batch.2" + }; write_batch( batch_paths[0].string(), {"Doc10", "Doc11"}, @@ -190,7 +179,8 @@ TEST_CASE("Merge forward index batches", "[parsing][forward_index]") "et", "sagittis", "dapibus"}, - {{0, 1, 2, 3, 4, 5, 6, 7}, {8, 9, 10, 11, 12, 13}}); + {{0, 1, 2, 3, 4, 5, 6, 7}, {8, 9, 10, 11, 12, 13}} + ); write_batch( batch_paths[1].string(), {"Doc12", "Doc13"}, @@ -209,49 +199,40 @@ TEST_CASE("Merge forward index batches", "[parsing][forward_index]") "tempus", "quis", "metus"}, - {{0, 1, 2, 3, 4, 5, 6}, {0, 7, 8, 9, 10, 11, 12, 13, 7, 14}}); + {{0, 1, 2, 3, 4, 5, 6}, {0, 7, 8, 9, 10, 11, 12, 13, 7, 14}} + ); write_batch( batch_paths[2].string(), {"Doc14"}, - {"curabitur", - "a", - "justo", - "vitae", - "turpis", - "feugiat", - "molestie", - "eu", - "ac", - "nunc"}, - {{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}}); + {"curabitur", "a", "justo", "vitae", "turpis", "feugiat", "molestie", "eu", "ac", "nunc"}, + {{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}} + ); - WHEN("Merging function is called") - { + WHEN("Merging function is called") { auto output_file = (dir / "fwd").string(); Forward_Index_Builder builder; builder.merge(output_file, 5, 3); - THEN("documents are in check") - { + THEN("documents are in check") { std::vector expected_documents{ - "Doc10", "Doc11", "Doc12", "Doc13", "Doc14"}; + "Doc10", "Doc11", "Doc12", "Doc13", "Doc14" + }; auto documents = load_lines(output_file + ".documents"); REQUIRE(documents == expected_documents); } - THEN("terms are in check") - { + THEN("terms are in check") { std::vector expected_terms{ "a", "ac", "adipiscing", "amet", "consectetur", "curabitur", "dapibus", "dolor", "elit", "et", "eu", "felis", "feugiat", "integer", "ipsum", "justo", "lacus", "lorem", "metus", "molestie", "nec", "nunc", "porta", "purus", "quam", "quis", "rutrum", "sagittis", "sit", "tempus", - "tincidunt", "turpis", "velit", "vitae", "vivamus"}; + "tincidunt", "turpis", "velit", "vitae", "vivamus" + }; auto terms = load_lines(output_file + ".terms"); REQUIRE(terms == expected_terms); } - THEN("term IDs") - { + THEN("term IDs") { binary_collection coll((output_file).c_str()); std::vector> documents; for (auto seq_iter = ++coll.begin(); seq_iter != coll.end(); ++seq_iter) { @@ -263,48 +244,47 @@ TEST_CASE("Merge forward index batches", "[parsing][forward_index]") {13, 26, 11, 9, 27, 6}, {34, 1, 32, 20, 23, 19, 30}, {34, 10, 24, 33, 16, 22, 29, 25, 10, 18}, - {5, 0, 15, 33, 31, 12, 19, 10, 1, 21}}; + {5, 0, 15, 33, 31, 12, 19, 10, 1, 21} + }; REQUIRE(documents == expected_documents); } } } } -TEST_CASE("Parse HTML content", "[parsing][forward_index][unit]") -{ +TEST_CASE("Parse HTML content", "[parsing][forward_index][unit]") { std::vector vec; auto map_word = [&](std::string&& word) { vec.push_back(word); }; - SECTION("empty") - { + SECTION("empty") { parse_html_content( "HTTP/1.1 200 OK\n" "Content-Length: 16254\n\n" "", - map_word); + map_word + ); REQUIRE(vec.empty()); } - SECTION("non-empty") - { + SECTION("non-empty") { parse_html_content( "HTTP/1.1 200 OK\n" "Content-Length: 16254\n\n" "loremipsum", - map_word); + map_word + ); REQUIRE(vec == std::vector{"lorem", "ipsum"}); } - SECTION("non-empty with CR") - { + SECTION("non-empty with CR") { parse_html_content( "HTTP/1.1 200 OK\n" "Content-Length: 16254\n\r\n" "loremipsum", - map_word); + map_word + ); REQUIRE(vec == std::vector{"lorem", "ipsum"}); } } -[[nodiscard]] auto load_term_map(std::string const& basename) -> std::vector -{ +[[nodiscard]] auto load_term_map(std::string const& basename) -> std::vector { std::vector map; std::ifstream is(basename + ".terms"); std::string str; @@ -314,8 +294,7 @@ TEST_CASE("Parse HTML content", "[parsing][forward_index][unit]") return map; } -TEST_CASE("Build forward index", "[parsing][forward_index][integration]") -{ +TEST_CASE("Build forward index", "[parsing][forward_index][integration]") { auto next_record = [](std::istream& in) -> std::optional { Plaintext_Record record; if (in >> record) { @@ -324,14 +303,12 @@ TEST_CASE("Build forward index", "[parsing][forward_index][integration]") return std::nullopt; }; - GIVEN("A plaintext collection file") - { + GIVEN("A plaintext collection file") { std::string input(PISA_SOURCE_DIR "/test/test_data/clueweb1k.plaintext"); REQUIRE(std::filesystem::exists(std::filesystem::path(input)) == true); int thread_count = GENERATE(2, 8); int batch_size = GENERATE(123, 1000); - WHEN("Build a forward index") - { + WHEN("Build a forward index") { pisa::TemporaryDirectory tmpdir; auto dir = tmpdir.path(); std::string output = (dir / "fwd").string(); @@ -344,15 +321,14 @@ TEST_CASE("Build forward index", "[parsing][forward_index][integration]") next_record, std::make_shared(std::make_unique()), batch_size, - thread_count); + thread_count + ); - THEN("The collection mapped to terms matches input") - { + THEN("The collection mapped to terms matches input") { auto term_map = load_term_map(output); auto term_lexicon_buffer = Payload_Vector_Buffer::from_file(output + ".termlex"); auto term_lexicon = Payload_Vector(term_lexicon_buffer); - REQUIRE( - std::vector(term_lexicon.begin(), term_lexicon.end()) == term_map); + REQUIRE(std::vector(term_lexicon.begin(), term_lexicon.end()) == term_map); binary_collection coll((output).c_str()); auto seq_iter = coll.begin(); REQUIRE(*seq_iter->begin() == 1000); @@ -381,8 +357,7 @@ TEST_CASE("Build forward index", "[parsing][forward_index][integration]") }); REQUIRE(batch_files.empty()); } - AND_THEN("Document lexicon contains the same titles as text file") - { + AND_THEN("Document lexicon contains the same titles as text file") { auto documents = io::read_string_vector(output + ".documents"); auto doc_lexicon_buffer = Payload_Vector_Buffer::from_file(output + ".doclex"); auto doc_lexicon = Payload_Vector(doc_lexicon_buffer); diff --git a/test/test_freq_index.cpp b/test/test_freq_index.cpp index 1634838e..4c3df903 100644 --- a/test/test_freq_index.cpp +++ b/test/test_freq_index.cpp @@ -19,8 +19,7 @@ #include "temporary_directory.hpp" template -void test_freq_index() -{ +void test_freq_index() { pisa::TemporaryDirectory tmpdir; auto idx_path = (tmpdir.path() / "coll.bin").string(); @@ -63,8 +62,7 @@ void test_freq_index() } } -TEST_CASE("freq_index") -{ +TEST_CASE("freq_index") { using pisa::indexed_sequence; using pisa::partitioned_sequence; using pisa::positive_sequence; diff --git a/test/test_html.cpp b/test/test_html.cpp index ef2d51d6..648ffa87 100644 --- a/test/test_html.cpp +++ b/test/test_html.cpp @@ -7,13 +7,15 @@ using namespace pisa::parsing::html; -TEST_CASE("Parse HTML", "[html][unit]") -{ - auto [input, expected] = - GENERATE(table({{"text", "text"}, - {"text", "text"}, - {"texttext", "text text"}, - {"", ""}, - {"", ""}})); - GIVEN("Input: " << input) { CHECK(cleantext(input) == expected); } +TEST_CASE("Parse HTML", "[html][unit]") { + auto [input, expected] = GENERATE(table( + {{"text", "text"}, + {"text", "text"}, + {"texttext", "text text"}, + {"", ""}, + {"", ""}} + )); + GIVEN("Input: " << input) { + CHECK(cleantext(input) == expected); + } } diff --git a/test/test_indexed_sequence.cpp b/test/test_indexed_sequence.cpp index 4855c5b4..fa4b0171 100644 --- a/test/test_indexed_sequence.cpp +++ b/test/test_indexed_sequence.cpp @@ -7,8 +7,7 @@ #include #include -TEST_CASE("indexed_sequence") -{ +TEST_CASE("indexed_sequence") { pisa::global_parameters params; std::vector avg_gaps = {1.1, 1.9, 2.5, 3, 4, 5, 10}; diff --git a/test/test_intersection.cpp b/test/test_intersection.cpp index c193e1ba..17ffc38a 100644 --- a/test/test_intersection.cpp +++ b/test/test_intersection.cpp @@ -11,10 +11,8 @@ using namespace pisa; using namespace pisa::intersection; -TEST_CASE("filter query", "[intersection][unit]") -{ - GIVEN("Four-term query") - { +TEST_CASE("filter query", "[intersection][unit]") { + GIVEN("Four-term query") { Query query{ "Q1", // query ID {6, 1, 5}, // terms @@ -29,8 +27,7 @@ TEST_CASE("filter query", "[intersection][unit]") {0b110, Query{"Q1", {1, 5}, {0.4, 1.0}}}, {0b111, Query{"Q1", {6, 1, 5}, {0.1, 0.4, 1.0}}}, })); - WHEN("Filtered with mask " << mask) - { + WHEN("Filtered with mask " << mask) { auto actual = filter(query, mask); CHECK(actual.id == expected.id); CHECK(actual.terms == expected.terms); @@ -39,8 +36,7 @@ TEST_CASE("filter query", "[intersection][unit]") } } -TEST_CASE("Vector cursor", "[intersection][unit]") -{ +TEST_CASE("Vector cursor", "[intersection][unit]") { std::vector documents{0, 3, 5, 6, 87, 111}; std::vector frequencies{1, 4, 6, 7, 88, 112}; @@ -95,29 +91,29 @@ TEST_CASE("Vector cursor", "[intersection][unit]") REQUIRE(cursor.docid() == 200); } -TEST_CASE("compute intersection", "[intersection][unit]") -{ - GIVEN("Four-term query, index, and wand data object") - { - InMemoryIndex index{{ - {0}, // 0 - {0, 1, 2}, // 1 - {0}, // 2 - {0}, // 3 - {0}, // 4 - {0, 1, 4}, // 5 - {1, 4, 8}, // 6 - }, - { - {1}, // 0 - {1, 1, 1}, // 1 - {1}, // 2 - {1}, // 3 - {1}, // 4 - {1, 1, 1}, // 5 - {1, 1, 1}, // 6 - }, - 10}; +TEST_CASE("compute intersection", "[intersection][unit]") { + GIVEN("Four-term query, index, and wand data object") { + InMemoryIndex index{ + { + {0}, // 0 + {0, 1, 2}, // 1 + {0}, // 2 + {0}, // 3 + {0}, // 4 + {0, 1, 4}, // 5 + {1, 4, 8}, // 6 + }, + { + {1}, // 0 + {1, 1, 1}, // 1 + {1}, // 2 + {1}, // 3 + {1}, // 4 + {1, 1, 1}, // 5 + {1, 1, 1}, // 6 + }, + 10 + }; InMemoryWand wand{{0.0, 1.0, 0.0, 0.0, 0.0, 5.0, 6.0}, 10}; Query query{ @@ -134,8 +130,7 @@ TEST_CASE("compute intersection", "[intersection][unit]") {0b110, 2, 3.69165F}, {0b111, 1, 5.53748F}, })); - WHEN("Computed intersection with mask " << mask) - { + WHEN("Computed intersection with mask " << mask) { auto intersection = Intersection::compute(index, wand, query, mask); CHECK(intersection.length == len); CHECK(intersection.max_score == Approx(max)); @@ -143,10 +138,8 @@ TEST_CASE("compute intersection", "[intersection][unit]") } } -TEST_CASE("for_all_subsets", "[intersection][unit]") -{ - GIVEN("A query and a mock function that accumulates arguments") - { +TEST_CASE("for_all_subsets", "[intersection][unit]") { + GIVEN("A query and a mock function that accumulates arguments") { std::vector masks; auto accumulate = [&](Query const&, Mask const& mask) { masks.push_back(mask); }; Query query{ @@ -154,44 +147,36 @@ TEST_CASE("for_all_subsets", "[intersection][unit]") {6, 1, 5}, // terms {0.1, 0.4, 1.0} // weights }; - WHEN("Executed with limit 0") - { + WHEN("Executed with limit 0") { for_all_subsets(query, 0, accumulate); - THEN("No elements accumulated") { CHECK(masks.empty()); } + THEN("No elements accumulated") { + CHECK(masks.empty()); + } } - WHEN("Executed with limit 1") - { + WHEN("Executed with limit 1") { for_all_subsets(query, 1, accumulate); - THEN("Unigrams accumulated") - { + THEN("Unigrams accumulated") { CHECK(masks == std::vector{Mask(0b001), Mask(0b010), Mask(0b100)}); } } - WHEN("Executed with limit 2") - { + WHEN("Executed with limit 2") { for_all_subsets(query, 2, accumulate); - THEN("Unigrams and bigrams accumulated") - { + THEN("Unigrams and bigrams accumulated") { CHECK( masks - == std::vector{ - Mask(0b001), Mask(0b010), Mask(0b011), Mask(0b100), Mask(0b101), Mask(0b110)}); + == std::vector< + Mask>{Mask(0b001), Mask(0b010), Mask(0b011), Mask(0b100), Mask(0b101), Mask(0b110)} + ); } } - WHEN("Executed with limit 3") - { + WHEN("Executed with limit 3") { for_all_subsets(query, 3, accumulate); - THEN("All combinations accumulated") - { + THEN("All combinations accumulated") { CHECK( masks - == std::vector{Mask(0b001), - Mask(0b010), - Mask(0b011), - Mask(0b100), - Mask(0b101), - Mask(0b110), - Mask(0b111)}); + == std::vector< + Mask>{Mask(0b001), Mask(0b010), Mask(0b011), Mask(0b100), Mask(0b101), Mask(0b110), Mask(0b111)} + ); } } } diff --git a/test/test_invert.cpp b/test/test_invert.cpp index e52e8f12..f080a08d 100644 --- a/test/test_invert.cpp +++ b/test/test_invert.cpp @@ -17,24 +17,24 @@ using namespace pisa; using namespace pisa::literals; -TEST_CASE("Map sequence of document terms to sequence of postings", "[invert][unit]") -{ +TEST_CASE("Map sequence of document terms to sequence of postings", "[invert][unit]") { std::vector> documents = {{0_t, 1_t, 2_t, 3_t}, {1_t, 2_t, 3_t, 8_t}}; std::vector> spans = { - gsl::make_span(documents[0]), gsl::make_span(documents[1])}; + gsl::make_span(documents[0]), gsl::make_span(documents[1]) + }; auto postings = invert::map_to_postings(invert::ForwardIndexSlice{spans, ranges::views::iota(0_d, 2_d)}); REQUIRE( postings - == std::vector>{ - {0_t, 0_d}, {1_t, 0_d}, {2_t, 0_d}, {3_t, 0_d}, {1_t, 1_d}, {2_t, 1_d}, {3_t, 1_d}, {8_t, 1_d}}); + == std::vector>{{0_t, 0_d}, {1_t, 0_d}, {2_t, 0_d}, {3_t, 0_d}, {1_t, 1_d}, {2_t, 1_d}, {3_t, 1_d}, {8_t, 1_d}} + ); } -TEST_CASE("Join term from one index to the same term from another", "[invert][unit]") -{ - SECTION("Disjoint") - { +TEST_CASE("Join term from one index to the same term from another", "[invert][unit]") { + SECTION("Disjoint") { std::vector lower_doc{0_d, 3_d, 5_d}; std::vector lower_freq{3_f, 4_f, 5_f}; std::vector higher_doc{6_d, 7_d, 9_d}; @@ -43,8 +43,7 @@ TEST_CASE("Join term from one index to the same term from another", "[invert][un REQUIRE(lower_doc == std::vector{0_d, 3_d, 5_d, 6_d, 7_d, 9_d}); REQUIRE(lower_freq == std::vector{3_f, 4_f, 5_f, 6_f, 7_f, 8_f}); } - SECTION("With an overlaping document") - { + SECTION("With an overlaping document") { std::vector lower_doc{0_d, 3_d, 5_d}; std::vector lower_freq{3_f, 4_f, 5_f}; std::vector higher_doc{5_d, 7_d, 9_d}; @@ -55,38 +54,43 @@ TEST_CASE("Join term from one index to the same term from another", "[invert][un } } -TEST_CASE("Accumulate postings to Inverted_Index", "[invert][unit]") -{ - std::vector> postings = {{0_t, 0_d}, - {0_t, 1_d}, - {0_t, 2_d}, - {1_t, 0_d}, - {1_t, 0_d}, - {1_t, 0_d}, - {1_t, 0_d}, - {1_t, 1_d}, - {2_t, 5_d}}; +TEST_CASE("Accumulate postings to Inverted_Index", "[invert][unit]") { + std::vector> postings = { + {0_t, 0_d}, + {0_t, 1_d}, + {0_t, 2_d}, + {1_t, 0_d}, + {1_t, 0_d}, + {1_t, 0_d}, + {1_t, 0_d}, + {1_t, 1_d}, + {2_t, 5_d} + }; using iterator_type = decltype(postings.begin()); invert::Inverted_Index index; index(tbb::blocked_range(postings.begin(), postings.end())); REQUIRE( index.documents - == std::unordered_map>{ - {0_t, {0_d, 1_d, 2_d}}, {1_t, {0_d, 1_d}}, {2_t, {5_d}}}); + == std::unordered_map< + Term_Id, + std::vector>{{0_t, {0_d, 1_d, 2_d}}, {1_t, {0_d, 1_d}}, {2_t, {5_d}}} + ); REQUIRE( index.frequencies - == std::unordered_map>{ - {0_t, {1_f, 1_f, 1_f}}, {1_t, {4_f, 1_f}}, {2_t, {1_f}}}); + == std::unordered_map< + Term_Id, + std::vector>{{0_t, {1_f, 1_f, 1_f}}, {1_t, {4_f, 1_f}}, {2_t, {1_f}}} + ); } -TEST_CASE("Accumulate postings to Inverted_Index one by one", "[invert][unit]") -{ +TEST_CASE("Accumulate postings to Inverted_Index one by one", "[invert][unit]") { std::vector> postings = { {0_t, 0_d}, {0_t, 0_d}, {0_t, 1_d}, {0_t, 4_d}, {1_t, 2_d}, {1_t, 4_d}, {2_t, 0_d}, {2_t, 1_d}, {3_t, 0_d}, {3_t, 1_d}, {3_t, 4_d}, {4_t, 1_d}, {4_t, 1_d}, {4_t, 4_d}, {5_t, 1_d}, {5_t, 1_d}, {5_t, 2_d}, {5_t, 3_d}, {5_t, 4_d}, {6_t, 1_d}, {6_t, 4_d}, {6_t, 4_d}, {6_t, 4_d}, {6_t, 4_d}, {7_t, 1_d}, {8_t, 2_d}, {8_t, 2_d}, {8_t, 2_d}, - {8_t, 3_d}, {8_t, 4_d}, {9_t, 0_d}, {9_t, 2_d}, {9_t, 3_d}, {9_t, 4_d}}; + {8_t, 3_d}, {8_t, 4_d}, {9_t, 0_d}, {9_t, 2_d}, {9_t, 3_d}, {9_t, 4_d} + }; using iterator_type = decltype(postings.begin()); invert::Inverted_Index index; for (auto iter = postings.begin(); iter != postings.end(); ++iter) { @@ -94,41 +98,32 @@ TEST_CASE("Accumulate postings to Inverted_Index one by one", "[invert][unit]") } REQUIRE( index.documents - == std::unordered_map>{{0_t, {0_d, 1_d, 4_d}}, - {1_t, {2_d, 4_d}}, - {2_t, {0_d, 1_d}}, - {3_t, {0_d, 1_d, 4_d}}, - {4_t, {1_d, 4_d}}, - {5_t, {1_d, 2_d, 3_d, 4_d}}, - {6_t, {1_d, 4_d}}, - {7_t, {1_d}}, - {8_t, {2_d, 3_d, 4_d}}, - {9_t, {0_d, 2_d, 3_d, 4_d}}}); + == std::unordered_map< + Term_Id, + std::vector< + Document_Id>>{{0_t, {0_d, 1_d, 4_d}}, {1_t, {2_d, 4_d}}, {2_t, {0_d, 1_d}}, {3_t, {0_d, 1_d, 4_d}}, {4_t, {1_d, 4_d}}, {5_t, {1_d, 2_d, 3_d, 4_d}}, {6_t, {1_d, 4_d}}, {7_t, {1_d}}, {8_t, {2_d, 3_d, 4_d}}, {9_t, {0_d, 2_d, 3_d, 4_d}}} + ); REQUIRE( index.frequencies - == std::unordered_map>{{0_t, {2_f, 1_f, 1_f}}, - {1_t, {1_f, 1_f}}, - {2_t, {1_f, 1_f}}, - {3_t, {1_f, 1_f, 1_f}}, - {4_t, {2_f, 1_f}}, - {5_t, {2_f, 1_f, 1_f, 1_f}}, - {6_t, {1_f, 4_f}}, - {7_t, {1_f}}, - {8_t, {3_f, 1_f, 1_f}}, - {9_t, {1_f, 1_f, 1_f, 1_f}}}); + == std::unordered_map< + Term_Id, + std::vector< + Frequency>>{{0_t, {2_f, 1_f, 1_f}}, {1_t, {1_f, 1_f}}, {2_t, {1_f, 1_f}}, {3_t, {1_f, 1_f, 1_f}}, {4_t, {2_f, 1_f}}, {5_t, {2_f, 1_f, 1_f, 1_f}}, {6_t, {1_f, 4_f}}, {7_t, {1_f}}, {8_t, {3_f, 1_f, 1_f}}, {9_t, {1_f, 1_f, 1_f, 1_f}}} + ); } -TEST_CASE("Join Inverted_Index to another", "[invert][unit]") -{ +TEST_CASE("Join Inverted_Index to another", "[invert][unit]") { using index_type = invert::Inverted_Index; auto [lhs, rhs, expected_joined, message] = GENERATE(table( {{index_type( {{0_t, {0_d, 1_d, 2_d}}, {1_t, {0_d, 1_d}}, {2_t, {5_d}}}, - {{0_t, {1_f, 1_f, 1_f}}, {1_t, {4_f, 1_f}}, {2_t, {1_f}}}), + {{0_t, {1_f, 1_f, 1_f}}, {1_t, {4_f, 1_f}}, {2_t, {1_f}}} + ), index_type( {{3_t, {0_d, 1_d, 2_d}}, {4_t, {0_d, 1_d}}, {5_t, {5_d}}}, - {{3_t, {1_f, 1_f, 1_f}}, {4_t, {4_f, 1_f}}, {5_t, {1_f}}}), + {{3_t, {1_f, 1_f, 1_f}}, {4_t, {4_f, 1_f}}, {5_t, {1_f}}} + ), index_type( {{0_t, {0_d, 1_d, 2_d}}, {1_t, {0_d, 1_d}}, @@ -141,14 +136,17 @@ TEST_CASE("Join Inverted_Index to another", "[invert][unit]") {2_t, {1_f}}, {3_t, {1_f, 1_f, 1_f}}, {4_t, {4_f, 1_f}}, - {5_t, {1_f}}}), + {5_t, {1_f}}} + ), "disjoint terms"}, {index_type( {{0_t, {0_d, 1_d, 2_d}}, {1_t, {0_d, 1_d}}, {2_t, {5_d}}}, - {{0_t, {1_f, 1_f, 1_f}}, {1_t, {4_f, 1_f}}, {2_t, {1_f}}}), + {{0_t, {1_f, 1_f, 1_f}}, {1_t, {4_f, 1_f}}, {2_t, {1_f}}} + ), index_type( {{2_t, {6_d, 7_d, 8_d}}, {3_t, {0_d, 1_d}}, {4_t, {5_d}}}, - {{2_t, {1_f, 1_f, 1_f}}, {3_t, {4_f, 1_f}}, {4_t, {1_f}}}), + {{2_t, {1_f, 1_f, 1_f}}, {3_t, {4_f, 1_f}}, {4_t, {1_f}}} + ), index_type( {{0_t, {0_d, 1_d, 2_d}}, {1_t, {0_d, 1_d}}, @@ -159,14 +157,17 @@ TEST_CASE("Join Inverted_Index to another", "[invert][unit]") {1_t, {4_f, 1_f}}, {2_t, {1_f, 1_f, 1_f, 1_f}}, {3_t, {4_f, 1_f}}, - {4_t, {1_f}}}), + {4_t, {1_f}}} + ), "disjoint documents"}, {index_type( {{0_t, {0_d, 1_d, 2_d}}, {1_t, {0_d, 1_d}}, {2_t, {5_d}}}, - {{0_t, {1_f, 1_f, 1_f}}, {1_t, {4_f, 1_f}}, {2_t, {1_f}}}), + {{0_t, {1_f, 1_f, 1_f}}, {1_t, {4_f, 1_f}}, {2_t, {1_f}}} + ), index_type( {{2_t, {5_d, 7_d, 8_d}}, {3_t, {0_d, 1_d}}, {4_t, {5_d}}}, - {{2_t, {1_f, 1_f, 1_f}}, {3_t, {4_f, 1_f}}, {4_t, {1_f}}}), + {{2_t, {1_f, 1_f, 1_f}}, {3_t, {4_f, 1_f}}, {4_t, {1_f}}} + ), index_type( {{0_t, {0_d, 1_d, 2_d}}, {1_t, {0_d, 1_d}}, @@ -177,41 +178,43 @@ TEST_CASE("Join Inverted_Index to another", "[invert][unit]") {1_t, {4_f, 1_f}}, {2_t, {2_f, 1_f, 1_f}}, {3_t, {4_f, 1_f}}, - {4_t, {1_f}}}), + {4_t, {1_f}}} + ), "overlapping term and document"}, {index_type({{0_t, {0_d}}}, {{0_t, {1_f}}}), index_type({{0_t, {0_d}}}, {{0_t, {1_f}}}), index_type({{0_t, {0_d}}}, {{0_t, {2_f}}}), - "single posting"}})); - WHEN("Join left to right -- " << message) - { + "single posting"}} + )); + WHEN("Join left to right -- " << message) { lhs.join(rhs); REQUIRE(lhs.documents == expected_joined.documents); REQUIRE(lhs.frequencies == expected_joined.frequencies); } - WHEN("Join right to left -- " << message) - { + WHEN("Join right to left -- " << message) { rhs.join(lhs); REQUIRE(rhs.documents == expected_joined.documents); REQUIRE(rhs.frequencies == expected_joined.frequencies); } } -TEST_CASE("Invert a range of documents from a collection", "[invert][unit]") -{ +TEST_CASE("Invert a range of documents from a collection", "[invert][unit]") { using index_type = invert::Inverted_Index; std::vector> collection = { /* Doc 0 */ {2_t, 0_t, 3_t, 9_t, 0_t}, /* Doc 1 */ {5_t, 0_t, 3_t, 4_t, 2_t, 6_t, 7_t, 4_t, 5_t}, /* Doc 2 */ {5_t, 1_t, 8_t, 9_t, 8_t, 8_t}, /* Doc 3 */ {8_t, 5_t, 9_t}, - /* Doc 4 */ {8_t, 6_t, 9_t, 6_t, 6_t, 5_t, 4_t, 3_t, 1_t, 0_t, 6_t}}; + /* Doc 4 */ {8_t, 6_t, 9_t, 6_t, 6_t, 5_t, 4_t, 3_t, 1_t, 0_t, 6_t} + }; std::vector> document_range; std::transform( - collection.begin(), collection.end(), std::back_inserter(document_range), [](auto const& vec) { - return gsl::span(vec); - }); + collection.begin(), + collection.end(), + std::back_inserter(document_range), + [](auto const& vec) { return gsl::span(vec); } + ); size_t threads = 1; auto index = invert::invert_range(document_range, 0_d, threads); @@ -237,16 +240,15 @@ TEST_CASE("Invert a range of documents from a collection", "[invert][unit]") {7_t, {1_f}}, {8_t, {3_f, 1_f, 1_f}}, {9_t, {1_f, 1_f, 1_f, 1_f}}}, - {5, 9, 6, 3, 11}); + {5, 9, 6, 3, 11} + ); REQUIRE(index.documents == expected.documents); REQUIRE(index.frequencies == expected.frequencies); REQUIRE(index.document_sizes == expected.document_sizes); } -TEST_CASE("Invert collection", "[invert][unit]") -{ - GIVEN("A binary collection") - { +TEST_CASE("Invert collection", "[invert][unit]") { + GIVEN("A binary collection") { pisa::TemporaryDirectory tmpdir; uint32_t batch_size = GENERATE(1, 2, 3, 4, 5); uint32_t threads = GENERATE(1, 2, 3, 4, 5); @@ -262,26 +264,26 @@ TEST_CASE("Invert collection", "[invert][unit]") /* size */ 9, /* Doc 1 */ 5, 0, 3, 4, 2, 6, 7, 4, 5, /* size */ 6, /* Doc 2 */ 5, 1, 8, 9, 8, 8, /* size */ 3, /* Doc 3 */ 8, 5, 9, - /* size */ 11, /* Doc 4 */ 8, 6, 9, 6, 6, 5, 4, 3, 1, 0, 6}; + /* size */ 11, /* Doc 4 */ 8, 6, 9, 6, 6, 5, 4, 3, 1, 0, 6 + }; std::ofstream os(collection_filename); os.write( reinterpret_cast(collection_data.data()), - collection_data.size() * sizeof(uint32_t)); + collection_data.size() * sizeof(uint32_t) + ); if (with_lex) { std::vector terms{"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}; encode_payload_vector(terms.begin(), terms.end()) .to_file((tmpdir.path() / "fwd.termlex").string()); } } - WHEN("Run inverting with batch size " << batch_size << " and " << threads << " threads") - { + WHEN("Run inverting with batch size " << batch_size << " and " << threads << " threads") { auto index_basename = (tmpdir.path() / "idx").string(); if (not with_lex) { params.term_count = 10; } invert::invert_forward_index(collection_filename, index_basename, params); - THEN("Index is stored in binary_freq_collection format") - { + THEN("Index is stored in binary_freq_collection format") { std::vector document_data{ /* size */ 1, /* count */ 5, /* size */ 3, /* Term 0 */ 0, 1, 4, @@ -293,7 +295,8 @@ TEST_CASE("Invert collection", "[invert][unit]") /* size */ 2, /* Term 6 */ 1, 4, /* size */ 1, /* Term 7 */ 1, /* size */ 3, /* Term 8 */ 2, 3, 4, - /* size */ 4, /* Term 9 */ 0, 2, 3, 4}; + /* size */ 4, /* Term 9 */ 0, 2, 3, 4 + }; std::vector frequency_data{ /* size */ 3, /* Term 0 */ 2, 1, 1, /* size */ 2, /* Term 1 */ 1, 1, @@ -304,24 +307,28 @@ TEST_CASE("Invert collection", "[invert][unit]") /* size */ 2, /* Term 6 */ 1, 4, /* size */ 1, /* Term 7 */ 1, /* size */ 3, /* Term 8 */ 3, 1, 1, - /* size */ 4, /* Term 9 */ 1, 1, 1, 1}; + /* size */ 4, /* Term 9 */ 1, 1, 1, 1 + }; std::vector size_data{/* size */ 5, /* sizes */ 5, 9, 6, 3, 11}; mio::mmap_source mm; std::error_code error; mm.map((index_basename + ".docs").c_str(), error); std::vector d( reinterpret_cast(mm.data()), - reinterpret_cast(mm.data()) + mm.size() / sizeof(uint32_t)); + reinterpret_cast(mm.data()) + mm.size() / sizeof(uint32_t) + ); mio::mmap_source mmf; mmf.map((index_basename + ".freqs").c_str(), error); std::vector f( reinterpret_cast(mmf.data()), - reinterpret_cast(mmf.data()) + mmf.size() / sizeof(uint32_t)); + reinterpret_cast(mmf.data()) + mmf.size() / sizeof(uint32_t) + ); mio::mmap_source mms; mms.map((index_basename + ".sizes").c_str(), error); std::vector s( reinterpret_cast(mms.data()), - reinterpret_cast(mms.data()) + mms.size() / sizeof(uint32_t)); + reinterpret_cast(mms.data()) + mms.size() / sizeof(uint32_t) + ); REQUIRE(d == document_data); REQUIRE(f == frequency_data); REQUIRE(s == size_data); diff --git a/test/test_mapper.cpp b/test/test_mapper.cpp index c1fde91a..51725c7c 100644 --- a/test/test_mapper.cpp +++ b/test/test_mapper.cpp @@ -7,8 +7,7 @@ #include "mappable/mapper.hpp" -TEST_CASE("basic_map") -{ +TEST_CASE("basic_map") { pisa::mapper::mappable_vector vec; REQUIRE(vec.size() == 0U); @@ -36,16 +35,14 @@ class complex_struct { public: complex_struct() : m_a(0) {} - void init() - { + void init() { m_a = 42; uint32_t b[] = {1, 2}; m_b.assign(b); } template - void map(Visitor& visit) - { + void map(Visitor& visit) { visit(m_a, "m_a")(m_b, "m_b"); } @@ -53,8 +50,7 @@ class complex_struct { pisa::mapper::mappable_vector m_b; }; -TEST_CASE("complex_struct_map") -{ +TEST_CASE("complex_struct_map") { complex_struct s; s.init(); pisa::mapper::freeze(s, "temp.bin"); diff --git a/test/test_memory.cpp b/test/test_memory.cpp index e4ec44aa..1303c2fc 100644 --- a/test/test_memory.cpp +++ b/test/test_memory.cpp @@ -5,55 +5,42 @@ #include "memory.hpp" -TEST_CASE("bitwise_reinterpret") -{ - GIVEN("4 bytes in an array") - { +TEST_CASE("bitwise_reinterpret") { + GIVEN("4 bytes in an array") { std::array memory{1, 2, 3, 4}; - WHEN("Reinterpreting as 4-byte int") - { + WHEN("Reinterpreting as 4-byte int") { auto value = *pisa::bitwise_reinterpret(memory.data()); - THEN("Equal to 4 bytes reinterpreted as int") - { + THEN("Equal to 4 bytes reinterpreted as int") { REQUIRE(value == (4U << 24) + (3U << 16U) + (2U << 8) + 1U); } } - WHEN("Reinterpreting 3 first bytes as 4-byte int") - { + WHEN("Reinterpreting 3 first bytes as 4-byte int") { auto value = *pisa::bitwise_reinterpret(memory.data(), 3); - THEN("Equal to the first 3 bytes reinterpreted as int") - { + THEN("Equal to the first 3 bytes reinterpreted as int") { REQUIRE(value == (3U << 16) + (2U << 8) + 1U); } } - WHEN("Reinterpreting as 2-byte int") - { + WHEN("Reinterpreting as 2-byte int") { auto value = *pisa::bitwise_reinterpret(memory.data()); - THEN("Equal to the first 2 bytes reinterpreted as int") - { + THEN("Equal to the first 2 bytes reinterpreted as int") { REQUIRE(value == (2U << 8) + 1U); } } } - GIVEN("4-byte integer and array") - { + GIVEN("4-byte integer and array") { std::array memory{0, 0, 0, 0}; std::uint32_t value = (4U << 24) + (3U << 16U) + (2U << 8) + 1U; - WHEN("Reinterpreting as 4-byte int and assigning value") - { + WHEN("Reinterpreting as 4-byte int and assigning value") { pisa::bitwise_reinterpret(memory.data()) = value; - THEN("Array equal to all bytes of the value") - { + THEN("Array equal to all bytes of the value") { REQUIRE(memory == std::array{1, 2, 3, 4}); } } - WHEN("Reinterpreting as 2-byte int and assigning value") - { + WHEN("Reinterpreting as 2-byte int and assigning value") { pisa::bitwise_reinterpret(memory.data()) = value; - THEN("Array equal to all bytes of the value") - { + THEN("Array equal to all bytes of the value") { REQUIRE(memory == std::array{1, 2, 0, 0}); } } diff --git a/test/test_memory_source.cpp b/test/test_memory_source.cpp index 77bc4a77..ebf6abc8 100644 --- a/test/test_memory_source.cpp +++ b/test/test_memory_source.cpp @@ -10,8 +10,7 @@ using pisa::MemorySource; using pisa::io::NoSuchFile; -TEST_CASE("Empty memory source", "[mmap][io]") -{ +TEST_CASE("Empty memory source", "[mmap][io]") { MemorySource source; REQUIRE_FALSE(source.is_mapped()); REQUIRE(source.size() == 0); @@ -22,15 +21,13 @@ TEST_CASE("Empty memory source", "[mmap][io]") REQUIRE_THROWS_AS(source.subspan(1), std::out_of_range); } -TEST_CASE("Error when mapping non-existent file", "[mmap][io]") -{ +TEST_CASE("Error when mapping non-existent file", "[mmap][io]") { pisa::TemporaryDirectory temp; auto file_path = (temp.path() / "file"); REQUIRE_THROWS_AS(MemorySource::mapped_file(file_path), NoSuchFile); } -TEST_CASE("Non-empty memory source", "[mmap][io]") -{ +TEST_CASE("Non-empty memory source", "[mmap][io]") { pisa::TemporaryDirectory temp; auto file_path = (temp.path() / "file"); { diff --git a/test/test_partition_fwd_index.cpp b/test/test_partition_fwd_index.cpp index 365e0207..a5ba7fff 100644 --- a/test/test_partition_fwd_index.cpp +++ b/test/test_partition_fwd_index.cpp @@ -31,51 +31,47 @@ using namespace pisa::literals; using index_type = invert::Inverted_Index; -[[nodiscard]] auto next_plaintext_record(std::istream& in) -> std::optional -{ +[[nodiscard]] auto next_plaintext_record(std::istream& in) -> std::optional { pisa::Plaintext_Record record; if (in >> record) { return std::make_optional( - std::move(record.trecid()), std::move(record.content()), std::move(record.url())); + std::move(record.trecid()), std::move(record.content()), std::move(record.url()) + ); } return std::nullopt; } -TEST_CASE("Expand shard", "[sharding]") -{ +TEST_CASE("Expand shard", "[sharding]") { REQUIRE(pisa::expand_shard("path", 17_s) == "path.017"); REQUIRE(pisa::expand_shard("path.{}", 17_s) == "path.017"); REQUIRE(pisa::expand_shard("path.{}.ext", 17_s) == "path.017.ext"); } -TEST_CASE("Resolve shards", "[sharding]") -{ +TEST_CASE("Resolve shards", "[sharding]") { pisa::TemporaryDirectory dir; - SECTION("No suffix") - { + SECTION("No suffix") { for (auto f: std::vector{"shard.000", "shard.001", "shard.002"}) { std::ofstream os((dir.path() / f).string()); os << "."; } REQUIRE( pisa::resolve_shards((dir.path() / "shard.{}").string()) - == std::vector{Shard_Id(0), Shard_Id(1), Shard_Id(2)}); + == std::vector{Shard_Id(0), Shard_Id(1), Shard_Id(2)} + ); } - SECTION("With suffix") - { - for (auto f: - std::vector{"shard.000.docs", "shard.001.docs", "shard.002.docs"}) { + SECTION("With suffix") { + for (auto f: std::vector{"shard.000.docs", "shard.001.docs", "shard.002.docs"}) { std::ofstream os((dir.path() / f).string()); os << "."; } REQUIRE( pisa::resolve_shards((dir.path() / "shard.{}").string(), ".docs") - == std::vector{Shard_Id(0), Shard_Id(1), Shard_Id(2)}); + == std::vector{Shard_Id(0), Shard_Id(1), Shard_Id(2)} + ); } } -TEST_CASE("mapping_from_files", "[invert][unit]") -{ +TEST_CASE("mapping_from_files", "[invert][unit]") { std::istringstream full("D00\nD01\nD02\nD03\nD04\nD05\nD06\nD07\nD08\nD09\nD010\nD11"); std::vector> shards; shards.push_back(std::make_unique("D00\nD01\nD02")); @@ -85,11 +81,11 @@ TEST_CASE("mapping_from_files", "[invert][unit]") | ranges::to(); REQUIRE( mapping_from_files(&full, gsl::span(stream_pointers)).as_vector() - == std::vector{0_s, 0_s, 0_s, 1_s, 1_s, 0_s, 2_s, 2_s, 2_s, 2_s, 2_s, 2_s}); + == std::vector{0_s, 0_s, 0_s, 1_s, 1_s, 0_s, 2_s, 2_s, 2_s, 2_s, 2_s, 2_s} + ); } -TEST_CASE("create_random_mapping", "[invert][unit]") -{ +TEST_CASE("create_random_mapping", "[invert][unit]") { uint64_t seed = 88887; auto mapping = pisa::create_random_mapping(1000U, 13U, seed); VecMap counts(13, 0); @@ -102,13 +98,12 @@ TEST_CASE("create_random_mapping", "[invert][unit]") REQUIRE( documents.as_vector() - == ranges::to(ranges::views::iota(Document_Id{}, Document_Id{1000U}))); - REQUIRE( - counts.as_vector() == std::vector{77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 76}); + == ranges::to(ranges::views::iota(Document_Id{}, Document_Id{1000U})) + ); + REQUIRE(counts.as_vector() == std::vector{77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 76}); } -auto round_robin_mapping(int document_count, int shard_count) -{ +auto round_robin_mapping(int document_count, int shard_count) { VecMap mapping(document_count); Shard_Id shard = 0_s; for (auto doc: ranges::views::iota(0_d, Document_Id{document_count})) { @@ -120,8 +115,7 @@ auto round_robin_mapping(int document_count, int shard_count) return mapping; } -void build_fwd_index(std::string const& output) -{ +void build_fwd_index(std::string const& output) { std::string input(PISA_SOURCE_DIR "/test/test_data/clueweb1k.plaintext"); std::ifstream is(input); pisa::Forward_Index_Builder builder; @@ -131,12 +125,12 @@ void build_fwd_index(std::string const& output) next_plaintext_record, std::make_shared(std::make_unique()), 20'000, - 2); + 2 + ); } template -auto shard_elements(Container const& container, Shard_Id shard_id, int shard_count) -{ +auto shard_elements(Container const& container, Shard_Id shard_id, int shard_count) { Container elems; for (auto const& val: ranges::views::drop(container, shard_id.as_int()) | ranges::views::stride(shard_count)) { @@ -145,18 +139,15 @@ auto shard_elements(Container const& container, Shard_Id shard_id, int shard_cou return elems; } -TEST_CASE("copy_sequence", "[invert][unit]") -{ - GIVEN("A test forward index") - { +TEST_CASE("copy_sequence", "[invert][unit]") { + GIVEN("A test forward index") { pisa::TemporaryDirectory dir; std::string fwd_basename = (dir.path() / "fwd").string(); std::string output = (dir.path() / "copy").string(); int document_count = 1'000; build_fwd_index(fwd_basename); - WHEN("All sequences are copied") - { + WHEN("All sequences are copied") { { std::ifstream is(fwd_basename); std::ofstream os(output); @@ -165,8 +156,7 @@ TEST_CASE("copy_sequence", "[invert][unit]") } } - THEN("Files are identical") - { + THEN("Files are identical") { auto actual = io::load_data(output); auto expected = io::load_data(fwd_basename); expected.resize(actual.size()); @@ -176,32 +166,27 @@ TEST_CASE("copy_sequence", "[invert][unit]") } } -TEST_CASE("Rearrange sequences", "[invert][integration]") -{ - GIVEN("A test forward index") - { +TEST_CASE("Rearrange sequences", "[invert][integration]") { + GIVEN("A test forward index") { pisa::TemporaryDirectory dir; std::string fwd_basename = (dir.path() / "fwd").string(); std::string output_basename = (dir.path() / "shards").string(); int document_count = 1'000; build_fwd_index(fwd_basename); - WHEN("Rearrange the sequences in a round-robin manner") - { + WHEN("Rearrange the sequences in a round-robin manner") { auto mapping = round_robin_mapping(document_count, 13); REQUIRE(mapping.size() == document_count); rearrange_sequences(fwd_basename, output_basename, mapping); auto shard_ids = ranges::views::iota(0_s, 13_s); - THEN("Sequences are properly rearranged") - { + THEN("Sequences are properly rearranged") { auto full = binary_collection(fwd_basename.c_str()); auto full_iter = ++full.begin(); std::vector> expected; - std::transform( - full_iter, full.end(), std::back_inserter(expected), [](auto const& seq) { - return std::vector(seq.begin(), seq.end()); - }); + std::transform(full_iter, full.end(), std::back_inserter(expected), [](auto const& seq) { + return std::vector(seq.begin(), seq.end()); + }); auto sorted_mapping = mapping.entries().collect(); ranges::stable_sort(sorted_mapping, [](auto const& lhs, auto const& rhs) { return std::make_pair(lhs.second, lhs.first) @@ -209,7 +194,8 @@ TEST_CASE("Rearrange sequences", "[invert][integration]") }); expected = ranges::views::transform( - sorted_mapping, [&](auto&& entry) { return expected[entry.first.as_int()]; }) + sorted_mapping, [&](auto&& entry) { return expected[entry.first.as_int()]; } + ) | ranges::to(); auto pos = expected.begin(); @@ -218,7 +204,8 @@ TEST_CASE("Rearrange sequences", "[invert][integration]") spdlog::info("Testing shard {}", shard.as_int()); spdlog::default_logger()->flush(); auto shard_coll = binary_collection( - fmt::format("{}.{:03d}", output_basename, shard.as_int()).c_str()); + fmt::format("{}.{:03d}", output_basename, shard.as_int()).c_str() + ); size_t doc = 0U; CAPTURE(shard); CAPTURE(doc); @@ -232,36 +219,32 @@ TEST_CASE("Rearrange sequences", "[invert][integration]") } } -TEST_CASE("partition_fwd_index", "[invert][integration]") -{ - GIVEN("A test forward index") - { +TEST_CASE("partition_fwd_index", "[invert][integration]") { + GIVEN("A test forward index") { pisa::TemporaryDirectory dir; std::string fwd_basename = (dir.path() / "fwd").string(); std::string output_basename = (dir.path() / "shards").string(); int document_count = 1'000; build_fwd_index(fwd_basename); - WHEN("Partition the forward index in a round-robin manner") - { + WHEN("Partition the forward index in a round-robin manner") { auto mapping = round_robin_mapping(document_count, 13); REQUIRE(mapping.size() == document_count); partition_fwd_index(fwd_basename, output_basename, mapping); auto shard_ids = ranges::views::iota(0_s, 13_s); - THEN("Document titles are correctly partitioned") - { + THEN("Document titles are correctly partitioned") { auto original_titles = io::read_string_vector(fmt::format("{}.documents", fwd_basename)); for (auto shard_id: shard_ids) { auto expected_titles = shard_elements(original_titles, shard_id, 13); auto actual_titles = io::read_string_vector( - fmt::format("{}.{:03d}.documents", output_basename, shard_id.as_int())); + fmt::format("{}.{:03d}.documents", output_basename, shard_id.as_int()) + ); REQUIRE(actual_titles == expected_titles); } } - AND_THEN("Document contents are identical wrt terms") - { + AND_THEN("Document contents are identical wrt terms") { auto full = binary_collection(fwd_basename.c_str()); auto full_iter = ++full.begin(); auto full_terms = io::read_string_vector(fmt::format("{}.terms", fwd_basename)); @@ -270,9 +253,11 @@ TEST_CASE("partition_fwd_index", "[invert][integration]") std::vector> shard_terms; for (auto shard: shard_ids) { shards.push_back(binary_collection( - fmt::format("{}.{:03d}", output_basename, shard.as_int()).c_str())); + fmt::format("{}.{:03d}", output_basename, shard.as_int()).c_str() + )); shard_terms.push_back(io::read_string_vector( - fmt::format("{}.{:03d}.terms", output_basename, shard.as_int()).c_str())); + fmt::format("{}.{:03d}.terms", output_basename, shard.as_int()).c_str() + )); shard_iterators.push_back(++shards.back().begin()); } Shard_Id shard = 0_s; @@ -286,12 +271,14 @@ TEST_CASE("partition_fwd_index", "[invert][integration]") full_seq.begin(), full_seq.end(), expected_documents.begin(), - [&](auto const& id) { return full_terms[id]; }); + [&](auto const& id) { return full_terms[id]; } + ); std::transform( shard_seq.begin(), shard_seq.end(), actual_documents.begin(), - [&](auto const& id) { return shard_terms[shard.as_int()][id]; }); + [&](auto const& id) { return shard_terms[shard.as_int()][id]; } + ); REQUIRE(actual_documents == expected_documents); ++full_iter; ++shard_iterators[shard.as_int()]; @@ -301,24 +288,26 @@ TEST_CASE("partition_fwd_index", "[invert][integration]") } } } - AND_THEN("Terms and term lexicon match") - { + AND_THEN("Terms and term lexicon match") { for (auto shard: shard_ids) { auto terms = io::read_string_vector( - fmt::format("{}.{:03d}.terms", output_basename, shard.as_int()).c_str()); + fmt::format("{}.{:03d}.terms", output_basename, shard.as_int()).c_str() + ); mio::mmap_source m( - fmt::format("{}.{:03d}.termlex", output_basename, shard.as_int()).c_str()); + fmt::format("{}.{:03d}.termlex", output_basename, shard.as_int()).c_str() + ); auto lexicon = Payload_Vector<>::from(m); REQUIRE(terms == std::vector(lexicon.begin(), lexicon.end())); } } - AND_THEN("Documents and document lexicon match") - { + AND_THEN("Documents and document lexicon match") { for (auto shard: shard_ids) { auto documents = io::read_string_vector( - fmt::format("{}.{:03d}.documents", output_basename, shard.as_int()).c_str()); + fmt::format("{}.{:03d}.documents", output_basename, shard.as_int()).c_str() + ); mio::mmap_source m( - fmt::format("{}.{:03d}.doclex", output_basename, shard.as_int()).c_str()); + fmt::format("{}.{:03d}.doclex", output_basename, shard.as_int()).c_str() + ); auto lexicon = Payload_Vector<>::from(m); REQUIRE(documents == std::vector(lexicon.begin(), lexicon.end())); } diff --git a/test/test_partitioned_sequence.cpp b/test/test_partitioned_sequence.cpp index 8503fd36..a08ff3e2 100644 --- a/test/test_partitioned_sequence.cpp +++ b/test/test_partitioned_sequence.cpp @@ -15,8 +15,7 @@ namespace pisa { class partitioned_sequence_test { public: template - static void test_construction(Enumerator& r, std::vector const& seq) - { + static void test_construction(Enumerator& r, std::vector const& seq) { if (r.m_partitions == 1) { // nothing to test here return; } @@ -42,8 +41,7 @@ class partitioned_sequence_test { } // namespace pisa template -void test_partitioned_sequence(uint64_t universe, std::vector const& seq) -{ +void test_partitioned_sequence(uint64_t universe, std::vector const& seq) { pisa::global_parameters params; using sequence_type = pisa::partitioned_sequence; @@ -58,8 +56,7 @@ void test_partitioned_sequence(uint64_t universe, std::vector const& s test_sequence(r, seq); } -TEST_CASE("partitioned_sequence") -{ +TEST_CASE("partitioned_sequence") { using pisa::indexed_sequence; using pisa::strict_sequence; diff --git a/test/test_payload_vector.cpp b/test/test_payload_vector.cpp index e4fe0de2..fc784b96 100644 --- a/test/test_payload_vector.cpp +++ b/test/test_payload_vector.cpp @@ -13,21 +13,19 @@ using namespace pisa; using namespace std::literals::string_view_literals; -inline std::byte operator"" _b(unsigned long long n) -{ +inline std::byte operator"" _b(unsigned long long n) { return std::byte(n); } -inline std::byte operator"" _b(char c) -{ +inline std::byte operator"" _b(char c) { return std::byte(c); } -TEST_CASE("Unpack head", "[payload_vector][unit]") -{ +TEST_CASE("Unpack head", "[payload_vector][unit]") { std::vector bytes{0_b, 1_b, 2_b, 3_b, 4_b, 5_b}; REQUIRE( unpack_head(bytes) - == std::tuple(0_b, gsl::make_span(std::vector{1_b, 2_b, 3_b, 4_b, 5_b}))); + == std::tuple(0_b, gsl::make_span(std::vector{1_b, 2_b, 3_b, 4_b, 5_b})) + ); auto [b, i, s] = unpack_head(bytes); CHECK(b == 0_b); CHECK(i == uint32_t(67305985)); @@ -38,37 +36,43 @@ TEST_CASE("Unpack head", "[payload_vector][unit]") Catch::Predicate([](std::runtime_error const& err) -> bool { return std::string(err.what()) == "Cannot unpack span of size 6 into structure of size 7"; - })); + }) + ); } -TEST_CASE("Split span", "[payload_vector][unit]") -{ +TEST_CASE("Split span", "[payload_vector][unit]") { std::vector bytes{0_b, 1_b, 2_b, 3_b, 4_b, 5_b}; REQUIRE( split(bytes, 0) == std::tuple( gsl::make_span(std::vector{}), - gsl::make_span(std::vector{0_b, 1_b, 2_b, 3_b, 4_b, 5_b}))); + gsl::make_span(std::vector{0_b, 1_b, 2_b, 3_b, 4_b, 5_b}) + ) + ); REQUIRE( split(bytes, 4) == std::tuple( gsl::make_span(std::vector{0_b, 1_b, 2_b, 3_b}), - gsl::make_span(std::vector{4_b, 5_b}))); + gsl::make_span(std::vector{4_b, 5_b}) + ) + ); REQUIRE( split(bytes, 6) == std::tuple( gsl::make_span(std::vector{0_b, 1_b, 2_b, 3_b, 4_b, 5_b}), - gsl::make_span(std::vector{}))); + gsl::make_span(std::vector{}) + ) + ); REQUIRE_THROWS_MATCHES( split(bytes, 7), std::runtime_error, Catch::Predicate([](std::runtime_error const& err) -> bool { return std::string(err.what()) == "Cannot split span of size 6 at position 7"; - })); + }) + ); } -TEST_CASE("Cast span", "[payload_vector][unit]") -{ +TEST_CASE("Cast span", "[payload_vector][unit]") { std::vector bytes{0_b, 1_b, 2_b, 3_b, 4_b, 5_b}; REQUIRE(cast_span(bytes) == gsl::make_span(std::vector{256, 770, 1284})); REQUIRE_THROWS_MATCHES( @@ -76,25 +80,26 @@ TEST_CASE("Cast span", "[payload_vector][unit]") std::runtime_error, Catch::Predicate([](std::runtime_error const& err) -> bool { return std::string(err.what()) == "Failed to cast byte-span to span of T of size 4"; - })); + }) + ); } -TEST_CASE("Test string-payload vector", "[payload_vector][unit]") -{ +TEST_CASE("Test string-payload vector", "[payload_vector][unit]") { std::vector offsets{0, 3, 6, 10, 13}; std::vector payloads{ - 'a'_b, 'b'_b, 'c'_b, 'd'_b, 'e'_b, 'f'_b, 'g'_b, 'h'_b, 'i'_b, 'j'_b, 'k'_b, 'l'_b, 'm'_b}; + 'a'_b, 'b'_b, 'c'_b, 'd'_b, 'e'_b, 'f'_b, 'g'_b, 'h'_b, 'i'_b, 'j'_b, 'k'_b, 'l'_b, 'm'_b + }; Payload_Vector vec(gsl::make_span(offsets), gsl::make_span(payloads)); - SECTION("size") { REQUIRE(vec.size() == 4); } - SECTION("iterator equality") - { + SECTION("size") { + REQUIRE(vec.size() == 4); + } + SECTION("iterator equality") { REQUIRE(vec.begin() == vec.begin()); REQUIRE(std::next(vec.begin()) != vec.begin()); REQUIRE(std::next(vec.begin()) != vec.end()); REQUIRE(vec.end() == vec.end()); } - SECTION("dereference with ++iter") - { + SECTION("dereference with ++iter") { auto iter = vec.begin(); REQUIRE(*iter == "abc"sv); ++iter; @@ -106,8 +111,7 @@ TEST_CASE("Test string-payload vector", "[payload_vector][unit]") ++iter; REQUIRE(iter == vec.end()); } - SECTION("dereference with iter++") - { + SECTION("dereference with iter++") { auto iter = vec.begin(); CHECK(*iter++ == "abc"sv); CHECK(*iter++ == "def"sv); @@ -115,52 +119,45 @@ TEST_CASE("Test string-payload vector", "[payload_vector][unit]") CHECK(*iter++ == "klm"sv); CHECK(iter == vec.end()); } - SECTION("dereference with begin() + n") - { + SECTION("dereference with begin() + n") { CHECK(*(vec.begin() + 0) == "abc"sv); CHECK(*(vec.begin() + 1) == "def"sv); CHECK(*(vec.begin() + 2) == "ghij"sv); CHECK(*(vec.begin() + 3) == "klm"sv); CHECK(vec.begin() + 4 == vec.end()); } - SECTION("dereference with next()") - { + SECTION("dereference with next()") { CHECK(*std::next(vec.begin(), 0) == "abc"sv); CHECK(*std::next(vec.begin(), 1) == "def"sv); CHECK(*std::next(vec.begin(), 2) == "ghij"sv); CHECK(*std::next(vec.begin(), 3) == "klm"sv); CHECK(std::next(vec.begin(), 4) == vec.end()); } - SECTION("dereference with begin() - n") - { + SECTION("dereference with begin() - n") { CHECK(*(vec.end() - 4) == "abc"sv); CHECK(*(vec.end() - 3) == "def"sv); CHECK(*(vec.end() - 2) == "ghij"sv); CHECK(*(vec.end() - 1) == "klm"sv); CHECK(vec.end() - 0 == vec.end()); } - SECTION("to vector") - { + SECTION("to vector") { std::vector v(vec.begin(), vec.end()); REQUIRE(v == std::vector{"abc"sv, "def"sv, "ghij"sv, "klm"sv}); } - SECTION("operator[]") - { + SECTION("operator[]") { CHECK(vec[0] == "abc"sv); CHECK(vec[1] == "def"sv); CHECK(vec[2] == "ghij"sv); CHECK(vec[3] == "klm"sv); } - SECTION("binary search") - { + SECTION("binary search") { CHECK(std::lower_bound(vec.begin(), vec.end(), "de"sv) == std::next(vec.begin())); CHECK(std::lower_bound(vec.begin(), vec.end(), "def"sv) == std::next(vec.begin())); CHECK(std::lower_bound(vec.begin(), vec.end(), "dew"sv) == std::next(vec.begin(), 2)); } } -TEST_CASE("Test payload vector container", "[payload_vector][unit]") -{ +TEST_CASE("Test payload vector container", "[payload_vector][unit]") { std::vector vec{"abc", "def", "ghij", "klm"}; std::ostringstream str; auto container = encode_payload_vector(gsl::span(vec)); @@ -168,8 +165,7 @@ TEST_CASE("Test payload vector container", "[payload_vector][unit]") REQUIRE(std::vector(vec.begin(), vec.end()) == vec); } -TEST_CASE("Test payload vector encoding", "[payload_vector][unit]") -{ +TEST_CASE("Test payload vector encoding", "[payload_vector][unit]") { std::vector vec{"abc", "def", "ghij", "klm"}; std::ostringstream str; encode_payload_vector(gsl::span(vec)).to_stream(str); @@ -186,8 +182,7 @@ TEST_CASE("Test payload vector encoding", "[payload_vector][unit]") // clang-format on } -TEST_CASE("Test payload vector decoding", "[payload_vector][unit]") -{ +TEST_CASE("Test payload vector decoding", "[payload_vector][unit]") { // clang-format off std::vector data{ /* length */ 4, 0, 0, 0, 0, 0, 0, 0, @@ -199,14 +194,15 @@ TEST_CASE("Test payload vector decoding", "[payload_vector][unit]") 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm'}; // clang-format on auto vec = Payload_Vector::from( - gsl::make_span(reinterpret_cast(data.data()), data.size())); + gsl::make_span(reinterpret_cast(data.data()), data.size()) + ); REQUIRE( std::vector(vec.begin(), vec.end()) - == std::vector{"abc", "def", "ghij", "klm"}); + == std::vector{"abc", "def", "ghij", "klm"} + ); } -TEST_CASE("Test binary search", "[payload_vector][unit]") -{ +TEST_CASE("Test binary search", "[payload_vector][unit]") { std::vector elements{0, 1, 2, 4, 5, 7, 8, 100}; REQUIRE(pisa::binary_search(elements.begin(), elements.end(), 0).value() == 0); REQUIRE(pisa::binary_search(elements.begin(), elements.end(), 1).value() == 1); @@ -219,8 +215,7 @@ TEST_CASE("Test binary search", "[payload_vector][unit]") REQUIRE(pisa::binary_search(elements.begin(), elements.end(), 101).has_value() == false); } -TEST_CASE("Binary search for sorted values is correct", "[payload_vector][prop]") -{ +TEST_CASE("Binary search for sorted values is correct", "[payload_vector][prop]") { rc::check([](std::vector elements, std::vector lookups) { std::sort(elements.begin(), elements.end()); for (auto v: lookups) { @@ -233,8 +228,7 @@ TEST_CASE("Binary search for sorted values is correct", "[payload_vector][prop]" }); } -TEST_CASE("Binary search for unsorted values doesn't crash", "[payload_vector][prop]") -{ +TEST_CASE("Binary search for unsorted values doesn't crash", "[payload_vector][prop]") { rc::check([](std::vector elements, std::vector lookups) { for (auto v: lookups) { pisa::binary_search(elements, 0); diff --git a/test/test_positive_sequence.cpp b/test/test_positive_sequence.cpp index ef05555f..d3126b23 100644 --- a/test/test_positive_sequence.cpp +++ b/test/test_positive_sequence.cpp @@ -13,8 +13,7 @@ #include "sequence/uniform_partitioned_sequence.hpp" template -void test_positive_sequence() -{ +void test_positive_sequence() { srand(42); pisa::global_parameters params; size_t n = 50000; @@ -37,8 +36,7 @@ void test_positive_sequence() } } -TEST_CASE("positive_sequence") -{ +TEST_CASE("positive_sequence") { test_positive_sequence(); test_positive_sequence>(); test_positive_sequence>(); diff --git a/test/test_queries.cpp b/test/test_queries.cpp index 80e9b279..9a521e9a 100644 --- a/test/test_queries.cpp +++ b/test/test_queries.cpp @@ -7,29 +7,27 @@ using namespace pisa; -TEST_CASE("Parse query term ids without query id") -{ +TEST_CASE("Parse query term ids without query id") { auto raw_query = "1 2\t3 4"; auto q = parse_query_ids(raw_query); REQUIRE(q.id.has_value() == false); REQUIRE(q.terms == std::vector{1, 2, 3, 4}); } -TEST_CASE("Parse query term ids with query id") -{ +TEST_CASE("Parse query term ids with query id") { auto raw_query = "1: 1\t2 3\t4"; auto q = parse_query_ids(raw_query); REQUIRE(q.id == "1"); REQUIRE(q.terms == std::vector{1, 2, 3, 4}); } -TEST_CASE("Compute parsing function") -{ +TEST_CASE("Compute parsing function") { pisa::TemporaryDirectory tmpdir; auto lexfile = tmpdir.path() / "lex"; encode_payload_vector( - gsl::make_span(std::vector{"a", "account", "he", "she", "usa", "world"})) + gsl::make_span(std::vector{"a", "account", "he", "she", "usa", "world"}) + ) .to_file(lexfile.string()); auto stopwords_filename = tmpdir.path() / "stop"; { @@ -39,82 +37,72 @@ TEST_CASE("Compute parsing function") std::vector queries; - WHEN("No stopwords, terms, or stemmer") - { + WHEN("No stopwords, terms, or stemmer") { // Note we don't need a tokenizer because ID parsing does not use it auto parse = resolve_query_parser(queries, nullptr, std::nullopt, std::nullopt, std::nullopt); - THEN("Parse query IDs") - { + THEN("Parse query IDs") { parse("1:0 2 4"); REQUIRE(queries[0].id == std::optional("1")); REQUIRE(queries[0].terms == std::vector{0, 2, 4}); REQUIRE(queries[0].term_weights.empty()); } } - WHEN("With terms and stopwords. No stemmer") - { + WHEN("With terms and stopwords. No stemmer") { auto parse = resolve_query_parser( queries, std::make_unique(), lexfile.string(), stopwords_filename.string(), - std::nullopt); - THEN("Parse query IDs") - { + std::nullopt + ); + THEN("Parse query IDs") { parse("1:a he usa"); REQUIRE(queries[0].id == std::optional("1")); REQUIRE(queries[0].terms == std::vector{2, 4}); REQUIRE(queries[0].term_weights.empty()); } } - WHEN("With terms, stopwords, and stemmer") - { + WHEN("With terms, stopwords, and stemmer") { auto parse = resolve_query_parser( queries, std::make_unique(), lexfile.string(), stopwords_filename.string(), - "porter2"); - THEN("Parse query IDs") - { + "porter2" + ); + THEN("Parse query IDs") { parse("1:a he usa"); REQUIRE(queries[0].id == std::optional("1")); REQUIRE(queries[0].terms == std::vector{2, 4}); REQUIRE(queries[0].term_weights.empty()); } } - WHEN("Parser with whitespace tokenizer") - { + WHEN("Parser with whitespace tokenizer") { auto parse = resolve_query_parser( - queries, - std::make_unique(), - lexfile.string(), - std::nullopt, - std::nullopt); - THEN("Parses usa's as usa's (and does not find it in lexicon)") - { + queries, std::make_unique(), lexfile.string(), std::nullopt, std::nullopt + ); + THEN("Parses usa's as usa's (and does not find it in lexicon)") { parse("1:a he usa's"); REQUIRE(queries[0].terms == std::vector{0, 2}); } } - WHEN("Parser with English tokenizer") - { + WHEN("Parser with English tokenizer") { auto parse = resolve_query_parser( - queries, std::make_unique(), lexfile.string(), std::nullopt, std::nullopt); - THEN("Parses usa's as usa (and finds it in lexicon)") - { + queries, std::make_unique(), lexfile.string(), std::nullopt, std::nullopt + ); + THEN("Parses usa's as usa (and finds it in lexicon)") { parse("1:a he usa's"); REQUIRE(queries[0].terms == std::vector{0, 2, 4}); } } } -TEST_CASE("Load stopwords in term processor with all stopwords present in the lexicon") -{ +TEST_CASE("Load stopwords in term processor with all stopwords present in the lexicon") { pisa::TemporaryDirectory tmpdir; auto lexfile = tmpdir.path() / "lex"; encode_payload_vector( - gsl::make_span(std::vector{"a", "account", "he", "she", "usa", "world"})) + gsl::make_span(std::vector{"a", "account", "he", "she", "usa", "world"}) + ) .to_file(lexfile.string()); auto stopwords_filename = (tmpdir.path() / "stopwords").string(); @@ -123,16 +111,17 @@ TEST_CASE("Load stopwords in term processor with all stopwords present in the le is.close(); TermProcessor tprocessor( - std::make_optional(lexfile.string()), std::make_optional(stopwords_filename), std::nullopt); + std::make_optional(lexfile.string()), std::make_optional(stopwords_filename), std::nullopt + ); REQUIRE(tprocessor.get_stopwords() == std::vector{0, 2, 3}); } -TEST_CASE("Load stopwords in term processor with some stopwords not present in the lexicon") -{ +TEST_CASE("Load stopwords in term processor with some stopwords not present in the lexicon") { pisa::TemporaryDirectory tmpdir; auto lexfile = tmpdir.path() / "lex"; encode_payload_vector( - gsl::make_span(std::vector{"account", "coffee", "he", "she", "usa", "world"})) + gsl::make_span(std::vector{"account", "coffee", "he", "she", "usa", "world"}) + ) .to_file(lexfile.string()); auto stopwords_filename = (tmpdir.path() / "stopwords").string(); @@ -141,16 +130,17 @@ TEST_CASE("Load stopwords in term processor with some stopwords not present in t is.close(); TermProcessor tprocessor( - std::make_optional(lexfile.string()), std::make_optional(stopwords_filename), std::nullopt); + std::make_optional(lexfile.string()), std::make_optional(stopwords_filename), std::nullopt + ); REQUIRE(tprocessor.get_stopwords() == std::vector{2, 3}); } -TEST_CASE("Check if term is stopword") -{ +TEST_CASE("Check if term is stopword") { pisa::TemporaryDirectory tmpdir; auto lexfile = tmpdir.path() / "lex"; encode_payload_vector( - gsl::make_span(std::vector{"account", "coffee", "he", "she", "usa", "world"})) + gsl::make_span(std::vector{"account", "coffee", "he", "she", "usa", "world"}) + ) .to_file(lexfile.string()); auto stopwords_filename = (tmpdir.path() / "stopwords").string(); @@ -159,7 +149,8 @@ TEST_CASE("Check if term is stopword") is.close(); TermProcessor tprocessor( - std::make_optional(lexfile.string()), std::make_optional(stopwords_filename), std::nullopt); + std::make_optional(lexfile.string()), std::make_optional(stopwords_filename), std::nullopt + ); REQUIRE(!tprocessor.is_stopword(0)); REQUIRE(!tprocessor.is_stopword(1)); REQUIRE(tprocessor.is_stopword(2)); diff --git a/test/test_query_stemmer.cpp b/test/test_query_stemmer.cpp index 1c2820c4..dffbc0c1 100644 --- a/test/test_query_stemmer.cpp +++ b/test/test_query_stemmer.cpp @@ -6,13 +6,15 @@ using namespace pisa; -TEST_CASE("Stem query", "[stemming][unit]") -{ - auto [input, expected] = - GENERATE(table({{"1:playing cards", "1:play card"}, - {"playing cards", "play card"}, - {"play card", "play card"}, - {"1:this:that", "1:this that"}})); +TEST_CASE("Stem query", "[stemming][unit]") { + auto [input, expected] = GENERATE(table( + {{"1:playing cards", "1:play card"}, + {"playing cards", "play card"}, + {"play card", "play card"}, + {"1:this:that", "1:this that"}} + )); QueryStemmer query_stemmer("porter2"); - GIVEN("Input: " << input) { CHECK(query_stemmer(input) == expected); } + GIVEN("Input: " << input) { + CHECK(query_stemmer(input) == expected); + } } diff --git a/test/test_ranked_queries.cpp b/test/test_ranked_queries.cpp index 7e436940..85194e87 100644 --- a/test/test_ranked_queries.cpp +++ b/test/test_ranked_queries.cpp @@ -41,14 +41,16 @@ struct IndexData { ScorerParams(scorer_name), BlockSize(FixedBlock(5)), quantized ? std::optional(Size(8)) : std::nullopt, - dropped_term_ids) + dropped_term_ids + ) { typename Index::builder builder(collection.num_docs(), params); for (auto const& plist: collection) { uint64_t freqs_sum = std::accumulate(plist.freqs.begin(), plist.freqs.end(), uint64_t(0)); builder.add_posting_list( - plist.docs.size(), plist.docs.begin(), plist.freqs.begin(), freqs_sum); + plist.docs.size(), plist.docs.begin(), plist.freqs.begin(), freqs_sum + ); } builder.build(index); @@ -63,8 +65,8 @@ struct IndexData { } [[nodiscard]] static auto - get(std::string const& s_name, bool quantized, std::unordered_set const& dropped_term_ids) - { + get(std::string const& s_name, bool quantized, std::unordered_set const& dropped_term_ids + ) { if (IndexData::data.find(s_name) == IndexData::data.end()) { IndexData::data[s_name] = std::make_unique>(s_name, quantized, dropped_term_ids); @@ -89,8 +91,7 @@ class ranked_or_taat_query_acc: public ranked_or_taat_query { using ranked_or_taat_query::ranked_or_taat_query; template - void operator()(CursorRange&& cursors, uint64_t max_docid) - { + void operator()(CursorRange&& cursors, uint64_t max_docid) { Acc accumulator(max_docid); ranked_or_taat_query::operator()(cursors, max_docid, accumulator); } @@ -102,29 +103,13 @@ class range_query_128: public range_query { using range_query::range_query; template - void operator()(CursorRange&& cursors, uint64_t max_docid) - { + void operator()(CursorRange&& cursors, uint64_t max_docid) { range_query::operator()(cursors, max_docid, 128); } }; // NOLINTNEXTLINE(hicpp-explicit-conversions) -TEMPLATE_TEST_CASE( - "Ranked query test", - "[query][ranked][integration]", - ranked_or_taat_query_acc, - ranked_or_taat_query_acc>, - wand_query, - maxscore_query, - block_max_wand_query, - block_max_maxscore_query, - range_query_128>, - range_query_128>>, - range_query_128, - range_query_128, - range_query_128, - range_query_128) -{ +TEMPLATE_TEST_CASE("Ranked query test", "[query][ranked][integration]", ranked_or_taat_query_acc, ranked_or_taat_query_acc>, wand_query, maxscore_query, block_max_wand_query, block_max_maxscore_query, range_query_128>, range_query_128>>, range_query_128, range_query_128, range_query_128, range_query_128) { for (auto quantized: {false, true}) { for (auto&& s_name: {"bm25", "qld"}) { std::unordered_set dropped_term_ids; @@ -139,15 +124,16 @@ TEMPLATE_TEST_CASE( or_q(make_scored_cursors(data->index, *scorer, q), data->index.num_docs()); op_q( make_block_max_scored_cursors(data->index, data->wdata, *scorer, q), - data->index.num_docs()); + data->index.num_docs() + ); topk_1.finalize(); topk_2.finalize(); REQUIRE(topk_2.topk().size() == topk_1.topk().size()); for (size_t i = 0; i < topk_2.topk().size(); ++i) { REQUIRE( - topk_2.topk()[i].first - == Approx(topk_1.topk()[i].first).epsilon(0.1)); // tolerance is % - // relative + topk_2.topk()[i].first == Approx(topk_1.topk()[i].first).epsilon(0.1) + ); // tolerance is % + // relative } topk_1.clear(); topk_2.clear(); @@ -157,8 +143,7 @@ TEMPLATE_TEST_CASE( } // NOLINTNEXTLINE(hicpp-explicit-conversions) -TEMPLATE_TEST_CASE("Ranked AND query test", "[query][ranked][integration]", block_max_ranked_and_query) -{ +TEMPLATE_TEST_CASE("Ranked AND query test", "[query][ranked][integration]", block_max_ranked_and_query) { for (auto quantized: {false, true}) { for (auto&& s_name: {"bm25", "qld"}) { std::unordered_set dropped_term_ids; @@ -174,15 +159,16 @@ TEMPLATE_TEST_CASE("Ranked AND query test", "[query][ranked][integration]", bloc and_q(make_scored_cursors(data->index, *scorer, q), data->index.num_docs()); op_q( make_block_max_scored_cursors(data->index, data->wdata, *scorer, q), - data->index.num_docs()); + data->index.num_docs() + ); topk_1.finalize(); topk_2.finalize(); REQUIRE(topk_1.topk().size() == topk_2.topk().size()); for (size_t i = 0; i < and_q.topk().size(); ++i) { REQUIRE( - topk_1.topk()[i].first - == Approx(topk_2.topk()[i].first).epsilon(0.1)); // tolerance is % - // relative + topk_1.topk()[i].first == Approx(topk_2.topk()[i].first).epsilon(0.1) + ); // tolerance is % + // relative } topk_1.clear(); topk_2.clear(); @@ -191,8 +177,7 @@ TEMPLATE_TEST_CASE("Ranked AND query test", "[query][ranked][integration]", bloc } } -TEST_CASE("Top k") -{ +TEST_CASE("Top k") { for (auto&& s_name: {"bm25", "qld"}) { std::unordered_set dropped_term_ids; auto data = IndexData::get(s_name, false, dropped_term_ids); diff --git a/test/test_recursive_graph_bisection.cpp b/test/test_recursive_graph_bisection.cpp index 8c510e7e..4e5b4b86 100644 --- a/test/test_recursive_graph_bisection.cpp +++ b/test/test_recursive_graph_bisection.cpp @@ -13,8 +13,7 @@ using namespace pisa; using StrColl = std::vector>>; [[nodiscard]] auto coll_to_strings(std::string const& coll_file, std::string const& doclex_file) - -> StrColl -{ + -> StrColl { auto doclex_buf = Payload_Vector_Buffer::from_file(doclex_file); pisa::Payload_Vector<> doclex(doclex_buf); pisa::binary_freq_collection coll(coll_file.c_str()); @@ -28,15 +27,15 @@ using StrColl = std::vector>>; std::back_inserter(pl), [&doclex](auto&& doc, auto&& freq) { return std::pair(doclex[doc], freq); - }); + } + ); std::sort(pl.begin(), pl.end()); strcoll.push_back(pl); } return strcoll; } -void compare_strcolls(StrColl const& expected, StrColl const& actual) -{ +void compare_strcolls(StrColl const& expected, StrColl const& actual) { REQUIRE(expected.size() == actual.size()); for (int list_idx = 0; list_idx < expected.size(); list_idx += 1) { REQUIRE(expected[list_idx].size() == actual[list_idx].size()); @@ -47,8 +46,7 @@ void compare_strcolls(StrColl const& expected, StrColl const& actual) } } -TEST_CASE("Reorder documents with BP") -{ +TEST_CASE("Reorder documents with BP") { pisa::TemporaryDirectory tmp; auto next_record = [](std::istream& in) -> std::optional { @@ -64,8 +62,7 @@ TEST_CASE("Reorder documents with BP") auto bp_fwd_path = (tmp.path() / "fwd.bp").string(); auto bp_inv_path = (tmp.path() / "inv.bp").string(); - GIVEN("Built a forward index and inverted") - { + GIVEN("Built a forward index and inverted") { std::string collection_input(PISA_SOURCE_DIR "/test/test_data/clueweb1k.plaintext"); REQUIRE(std::filesystem::exists(std::filesystem::path(collection_input)) == true); int thread_count = 2; @@ -82,16 +79,17 @@ TEST_CASE("Reorder documents with BP") next_record, std::make_shared(std::make_unique()), batch_size, - thread_count); + thread_count + ); pisa::invert::invert_forward_index(fwd_path, inv_path, params); - WHEN("Reordered documents with BP") - { + WHEN("Reordered documents with BP") { auto cache_depth = GENERATE( std::optional{}, std::make_optional(1), - std::make_optional(2)); + std::make_optional(2) + ); int code = recursive_graph_bisection(RecursiveGraphBisectionOptions{ .input_basename = inv_path, .output_basename = bp_inv_path, @@ -106,16 +104,14 @@ TEST_CASE("Reorder documents with BP") .print_args = false, }); REQUIRE(code == 0); - THEN("Both collections are equal when mapped to strings") - { + THEN("Both collections are equal when mapped to strings") { auto expected = coll_to_strings(inv_path, fmt::format("{}.doclex", fwd_path)); auto actual = coll_to_strings(bp_inv_path, fmt::format("{}.doclex", bp_fwd_path)); compare_strcolls(expected, actual); } } - WHEN("Reordered documents with BP node version") - { + WHEN("Reordered documents with BP node version") { int code = recursive_graph_bisection(RecursiveGraphBisectionOptions{ .input_basename = inv_path, .output_basename = bp_inv_path, @@ -130,8 +126,7 @@ TEST_CASE("Reorder documents with BP") .print_args = false, }); REQUIRE(code == 0); - THEN("Both collections are equal when mapped to strings") - { + THEN("Both collections are equal when mapped to strings") { auto expected = coll_to_strings(inv_path, fmt::format("{}.doclex", fwd_path)); auto actual = coll_to_strings(bp_inv_path, fmt::format("{}.doclex", bp_fwd_path)); compare_strcolls(expected, actual); diff --git a/test/test_sample_inverted_index.cpp b/test/test_sample_inverted_index.cpp index e460ae0b..2b9fd29f 100644 --- a/test/test_sample_inverted_index.cpp +++ b/test/test_sample_inverted_index.cpp @@ -12,8 +12,7 @@ #include "temporary_directory.hpp" #include "util/inverted_index_utils.hpp" -TEST_CASE("sample_inverted_index") -{ +TEST_CASE("sample_inverted_index") { // given using pisa::binary_freq_collection; std::string input(PISA_SOURCE_DIR "/test/test_data/test_collection"); @@ -31,7 +30,8 @@ TEST_CASE("sample_inverted_index") std::iota(sample.begin(), sample.end(), 0); return sample; }, - terms_to_drop); + terms_to_drop + ); auto sampled = binary_freq_collection(output.c_str()); // then @@ -58,8 +58,7 @@ TEST_CASE("sample_inverted_index") } } -TEST_CASE("sample_inverted_index_one_sample") -{ +TEST_CASE("sample_inverted_index_one_sample") { // given using pisa::binary_freq_collection; std::string input(PISA_SOURCE_DIR "/test/test_data/test_collection"); @@ -77,7 +76,8 @@ TEST_CASE("sample_inverted_index_one_sample") std::iota(sample.begin(), sample.end(), 0); return sample; }, - terms_to_drop); + terms_to_drop + ); auto sampled = binary_freq_collection(output.c_str()); // then @@ -106,8 +106,7 @@ TEST_CASE("sample_inverted_index_one_sample") } } -TEST_CASE("sample_inverted_index_reverse") -{ +TEST_CASE("sample_inverted_index_reverse") { // given using pisa::binary_freq_collection; std::string input(PISA_SOURCE_DIR "/test/test_data/test_collection"); @@ -129,7 +128,8 @@ TEST_CASE("sample_inverted_index_reverse") std::sort(sample.begin(), sample.end()); return sample; }, - terms_to_drop); + terms_to_drop + ); auto sampled = binary_freq_collection(output.c_str()); // then diff --git a/test/test_scorer.cpp b/test/test_scorer.cpp index 4af36951..9095ba5a 100644 --- a/test/test_scorer.cpp +++ b/test/test_scorer.cpp @@ -7,19 +7,16 @@ using namespace pisa; struct WandData { - [[nodiscard]] auto term_posting_count(std::uint32_t term_id) const -> std::size_t - { + [[nodiscard]] auto term_posting_count(std::uint32_t term_id) const -> std::size_t { switch (term_id) { case 0: return 10; default: return 20; } } - [[nodiscard]] auto norm_len(std::uint32_t docid) const -> float - { + [[nodiscard]] auto norm_len(std::uint32_t docid) const -> float { return doc_len(docid) / avg_len(); } - [[nodiscard]] auto doc_len(std::uint32_t docid) const -> std::size_t - { + [[nodiscard]] auto doc_len(std::uint32_t docid) const -> std::size_t { switch (docid) { case 0: return 50; case 1: return 40; @@ -27,8 +24,7 @@ struct WandData { default: return 50; } } - [[nodiscard]] auto term_occurrence_count(std::uint32_t term_id) const -> std::size_t - { + [[nodiscard]] auto term_occurrence_count(std::uint32_t term_id) const -> std::size_t { return 100; } [[nodiscard]] auto num_docs() const -> std::size_t { return 1000; } @@ -36,8 +32,7 @@ struct WandData { [[nodiscard]] auto collection_len() const -> std::size_t { return 10000; } }; -TEST_CASE("BM25", "[scorer][unit]") -{ +TEST_CASE("BM25", "[scorer][unit]") { WandData wdata; auto scorer = scorer::from_params(ScorerParams("bm25"), wdata); auto term_scorer = scorer->term_scorer(0); @@ -47,8 +42,7 @@ TEST_CASE("BM25", "[scorer][unit]") CHECK(term_scorer(1, 20) == Approx(8.29555)); } -TEST_CASE("QLD", "[scorer][unit]") -{ +TEST_CASE("QLD", "[scorer][unit]") { WandData wdata; auto scorer = scorer::from_params(ScorerParams("qld"), wdata); auto term_scorer = scorer->term_scorer(0); @@ -58,8 +52,7 @@ TEST_CASE("QLD", "[scorer][unit]") CHECK(term_scorer(1, 20) == Approx(1.05939)); } -TEST_CASE("PL2", "[scorer][unit]") -{ +TEST_CASE("PL2", "[scorer][unit]") { WandData wdata; auto scorer = scorer::from_params(ScorerParams("pl2"), wdata); auto term_scorer = scorer->term_scorer(0); @@ -69,8 +62,7 @@ TEST_CASE("PL2", "[scorer][unit]") CHECK(term_scorer(1, 20) == Approx(8.35714)); } -TEST_CASE("DPH", "[scorer][unit]") -{ +TEST_CASE("DPH", "[scorer][unit]") { WandData wdata; auto scorer = scorer::from_params(ScorerParams("dph"), wdata); auto term_scorer = scorer->term_scorer(0); @@ -80,8 +72,7 @@ TEST_CASE("DPH", "[scorer][unit]") CHECK(term_scorer(1, 20) == Approx(1.93217)); } -TEST_CASE("Quantized", "[scorer][unit]") -{ +TEST_CASE("Quantized", "[scorer][unit]") { WandData wdata; auto scorer = scorer::from_params(ScorerParams("quantized"), wdata); auto term_scorer = scorer->term_scorer(0); diff --git a/test/test_sequence_collection.cpp b/test/test_sequence_collection.cpp index 849eeac0..a46db24f 100644 --- a/test/test_sequence_collection.cpp +++ b/test/test_sequence_collection.cpp @@ -15,8 +15,7 @@ #include template -void test_sequence_collection() -{ +void test_sequence_collection() { pisa::global_parameters params; uint64_t universe = 10000; using collection_type = pisa::sequence_collection; @@ -49,8 +48,7 @@ void test_sequence_collection() } } -TEST_CASE("sequence_collection") -{ +TEST_CASE("sequence_collection") { test_sequence_collection(); test_sequence_collection>(); test_sequence_collection>(); diff --git a/test/test_stream_builder.cpp b/test/test_stream_builder.cpp index 0c55f981..a4d96819 100644 --- a/test/test_stream_builder.cpp +++ b/test/test_stream_builder.cpp @@ -17,8 +17,7 @@ using namespace pisa; // NOLINTNEXTLINE(hicpp-explicit-conversions) -TEST_CASE("Stream builder for block index", "[index]") -{ +TEST_CASE("Stream builder for block index", "[index]") { using index_type = block_simdbp_index; binary_freq_collection collection(PISA_SOURCE_DIR "/test/test_data/test_collection"); @@ -30,8 +29,7 @@ TEST_CASE("Stream builder for block index", "[index]") typename index_type::builder builder(collection.num_docs(), global_parameters{}); for (auto const& plist: collection) { uint64_t freqs_sum = std::accumulate(plist.freqs.begin(), plist.freqs.end(), uint64_t(0)); - builder.add_posting_list( - plist.docs.size(), plist.docs.begin(), plist.freqs.begin(), freqs_sum); + builder.add_posting_list(plist.docs.size(), plist.docs.begin(), plist.freqs.begin(), freqs_sum); } index_type index; builder.build(index); @@ -42,7 +40,8 @@ TEST_CASE("Stream builder for block index", "[index]") for (auto const& plist: collection) { uint64_t freqs_sum = std::accumulate(plist.freqs.begin(), plist.freqs.end(), uint64_t(0)); sbuilder.add_posting_list( - plist.docs.size(), plist.docs.begin(), plist.freqs.begin(), freqs_sum); + plist.docs.size(), plist.docs.begin(), plist.freqs.begin(), freqs_sum + ); } sbuilder.build(actual_path.string()); diff --git a/test/test_strict_elias_fano.cpp b/test/test_strict_elias_fano.cpp index a718257f..2f6b4fc0 100644 --- a/test/test_strict_elias_fano.cpp +++ b/test/test_strict_elias_fano.cpp @@ -7,8 +7,7 @@ #include #include -TEST_CASE("strict_elias_fano") -{ +TEST_CASE("strict_elias_fano") { pisa::global_parameters params; uint64_t n = 10000; diff --git a/test/test_taily_stats.cpp b/test/test_taily_stats.cpp index 9186f0d8..1776a8ee 100644 --- a/test/test_taily_stats.cpp +++ b/test/test_taily_stats.cpp @@ -19,8 +19,7 @@ using taily::Feature_Statistics; -void write_documents(std::filesystem::path const& path) -{ +void write_documents(std::filesystem::path const& path) { pisa::io::write_data( path.string(), gsl::span(std::array{ @@ -35,11 +34,11 @@ void write_documents(std::filesystem::path const& path) std::byte{4}, std::byte{0}, std::byte{0}, std::byte{0}, //< term 1 std::byte{1}, std::byte{0}, std::byte{0}, std::byte{0}, //< term 2 std::byte{5}, std::byte{0}, std::byte{0}, std::byte{0}, //< term 2 - })); + }) + ); } -void write_frequencies(std::filesystem::path const& path) -{ +void write_frequencies(std::filesystem::path const& path) { pisa::io::write_data( path.string(), gsl::span(std::array{ @@ -52,11 +51,11 @@ void write_frequencies(std::filesystem::path const& path) std::byte{5}, std::byte{0}, std::byte{0}, std::byte{0}, //< term 1 std::byte{1}, std::byte{0}, std::byte{0}, std::byte{0}, //< term 2 std::byte{4}, std::byte{0}, std::byte{0}, std::byte{0}, //< term 2 - })); + }) + ); } -void write_sizes(std::filesystem::path const& path) -{ +void write_sizes(std::filesystem::path const& path) { pisa::io::write_data( path.string(), gsl::span(std::array{ @@ -67,13 +66,12 @@ void write_sizes(std::filesystem::path const& path) std::byte{1}, std::byte{0}, std::byte{0}, std::byte{0}, // std::byte{1}, std::byte{0}, std::byte{0}, std::byte{0}, // std::byte{1}, std::byte{0}, std::byte{0}, std::byte{0}, // - })); + }) + ); } -TEST_CASE("Extract Taily feature stats", "[taily][unit]") -{ - GIVEN("Collection") - { +TEST_CASE("Extract Taily feature stats", "[taily][unit]") { + GIVEN("Collection") { pisa::TemporaryDirectory tmpdir; write_documents(tmpdir.path() / "coll.docs"); write_frequencies(tmpdir.path() / "coll.freqs"); @@ -89,18 +87,18 @@ TEST_CASE("Extract Taily feature stats", "[taily][unit]") false, false, pisa::Size(8), - {}); + {} + ); pisa::binary_freq_collection collection(collection_path.c_str()); pisa::wand_data wdata(pisa::MemorySource::mapped_file(wand_data_path)); - WHEN("Extract feature stats") - { + WHEN("Extract feature stats") { auto stats = pisa::extract_feature_stats( - collection, pisa::scorer::from_params(ScorerParams("quantized"), wdata)); + collection, pisa::scorer::from_params(ScorerParams("quantized"), wdata) + ); - THEN("Correct stats") - { + THEN("Correct stats") { REQUIRE(stats.size() == 3); REQUIRE(stats[0].frequency == 2); @@ -119,22 +117,20 @@ TEST_CASE("Extract Taily feature stats", "[taily][unit]") } } -TEST_CASE("Write Taily feature stats", "[taily][unit]") -{ +TEST_CASE("Write Taily feature stats", "[taily][unit]") { pisa::TemporaryDirectory tmpdir; auto stats_path = tmpdir.path() / "taily"; - GIVEN("Feature statistics") - { - std::vector stats{Feature_Statistics{1.0, 2.0, 10}, - Feature_Statistics{3.0, 4.0, 20}, - Feature_Statistics{5.0, 6.0, 30}}; - - WHEN("Stats written to a file") - { + GIVEN("Feature statistics") { + std::vector stats{ + Feature_Statistics{1.0, 2.0, 10}, + Feature_Statistics{3.0, 4.0, 20}, + Feature_Statistics{5.0, 6.0, 30} + }; + + WHEN("Stats written to a file") { pisa::write_feature_stats(stats, 10, stats_path.string()); - THEN("Stats can be read back") - { + THEN("Stats can be read back") { auto stats = pisa::TailyStats::from_mapped(stats_path.string()); REQUIRE(stats.num_documents() == 10); REQUIRE(stats.num_terms() == 3); diff --git a/test/test_text_analyzer.cpp b/test/test_text_analyzer.cpp index edce4625..26bf3414 100644 --- a/test/test_text_analyzer.cpp +++ b/test/test_text_analyzer.cpp @@ -6,26 +6,25 @@ using namespace pisa; -TEST_CASE("No token filters") -{ +TEST_CASE("No token filters") { TextAnalyzer analyzer(std::make_unique()); REQUIRE( analyzer.analyze("Lorem ipsum dolor sit amet")->collect() - == std::vector{"Lorem", "ipsum", "dolor", "sit", "amet"}); + == std::vector{"Lorem", "ipsum", "dolor", "sit", "amet"} + ); } -TEST_CASE("One filter") -{ +TEST_CASE("One filter") { std::unordered_set stopwords{"sit"}; TextAnalyzer analyzer(std::make_unique()); analyzer.emplace_token_filter(); REQUIRE( analyzer.analyze("Lorem ipsum dolor sit amet")->collect() - == std::vector{"lorem", "ipsum", "dolor", "sit", "amet"}); + == std::vector{"lorem", "ipsum", "dolor", "sit", "amet"} + ); } -TEST_CASE("Multiple filters") -{ +TEST_CASE("Multiple filters") { std::unordered_set stopwords{"sit", "and", "the"}; TextAnalyzer analyzer(std::make_unique()); analyzer.emplace_token_filter(); @@ -33,22 +32,22 @@ TEST_CASE("Multiple filters") analyzer.emplace_token_filter(); REQUIRE( analyzer.analyze("Lorem ipsum dolor sit amet and going the")->collect() - == std::vector{"lorem", "ipsum", "dolor", "amet", "go"}); + == std::vector{"lorem", "ipsum", "dolor", "amet", "go"} + ); } -TEST_CASE("Removing first and last token") -{ +TEST_CASE("Removing first and last token") { std::unordered_set stopwords{"lorem", "amet"}; TextAnalyzer analyzer(std::make_unique()); analyzer.emplace_token_filter(); analyzer.emplace_token_filter(std::move(stopwords)); REQUIRE( analyzer.analyze("Lorem ipsum dolor sit amet")->collect() - == std::vector{"ipsum", "dolor", "sit"}); + == std::vector{"ipsum", "dolor", "sit"} + ); } -TEST_CASE("Multiple token filters + html filter") -{ +TEST_CASE("Multiple token filters + html filter") { std::unordered_set stopwords{"sit", "and", "the"}; TextAnalyzer analyzer(std::make_unique()); analyzer.emplace_token_filter(); @@ -57,5 +56,6 @@ TEST_CASE("Multiple token filters + html filter") analyzer.emplace_text_filter(); REQUIRE( analyzer.analyze("

Lorem ipsum dolor sit amet and going the

")->collect() - == std::vector{"lorem", "ipsum", "dolor", "amet", "go"}); + == std::vector{"lorem", "ipsum", "dolor", "amet", "go"} + ); } diff --git a/test/test_token_filter.cpp b/test/test_token_filter.cpp index 51c5cb56..9a633a9d 100644 --- a/test/test_token_filter.cpp +++ b/test/test_token_filter.cpp @@ -8,16 +8,14 @@ using namespace pisa; -TEST_CASE("Lowercase filter") -{ +TEST_CASE("Lowercase filter") { LowercaseFilter lowercase; auto stream = lowercase.filter(std::string_view("WoRd")); REQUIRE(stream->next() == "word"); REQUIRE(stream->next() == std::nullopt); } -TEST_CASE("Stop word remover") -{ +TEST_CASE("Stop word remover") { std::unordered_set stopwords; stopwords.insert("the"); stopwords.insert("a"); @@ -28,59 +26,49 @@ TEST_CASE("Stop word remover") REQUIRE(remover.filter(std::string_view("word"))->collect() == std::vector{"word"}); } -TEST_CASE("Porter2") -{ +TEST_CASE("Porter2") { Porter2Stemmer stemmer; - SECTION("word") - { + SECTION("word") { auto stream = stemmer.filter(std::string_view("word")); REQUIRE(stream->next() == "word"); REQUIRE(stream->next() == std::nullopt); } - SECTION("playing") - { + SECTION("playing") { auto stream = stemmer.filter(std::string_view("playing")); REQUIRE(stream->next() == "play"); REQUIRE(stream->next() == std::nullopt); } - SECTION("I") - { + SECTION("I") { auto stream = stemmer.filter(std::string_view("I")); REQUIRE(stream->next() == "I"); REQUIRE(stream->next() == std::nullopt); } - SECTION("flying") - { + SECTION("flying") { auto stream = stemmer.filter(std::string_view("flying")); REQUIRE(stream->next() == "fli"); REQUIRE(stream->next() == std::nullopt); } } -TEST_CASE("Krovetz") -{ +TEST_CASE("Krovetz") { KrovetzStemmer stemmer; - SECTION("word") - { + SECTION("word") { auto stream = stemmer.filter(std::string_view("word")); REQUIRE(stream->next() == "word"); REQUIRE(stream->next() == std::nullopt); } - SECTION("playing") - { + SECTION("playing") { auto stream = stemmer.filter(std::string_view("playing")); REQUIRE(stream->next() == "play"); REQUIRE(stream->next() == std::nullopt); } // Notice the difference between Porter2 and Krovetz in the following two tests - SECTION("I") - { + SECTION("I") { auto stream = stemmer.filter(std::string_view("I")); REQUIRE(stream->next() == "i"); REQUIRE(stream->next() == std::nullopt); } - SECTION("flying") - { + SECTION("flying") { auto stream = stemmer.filter(std::string_view("flying")); REQUIRE(stream->next() == "flying"); REQUIRE(stream->next() == std::nullopt); diff --git a/test/test_token_stream.cpp b/test/test_token_stream.cpp index 5e85f6e9..7308b271 100644 --- a/test/test_token_stream.cpp +++ b/test/test_token_stream.cpp @@ -6,14 +6,12 @@ using namespace pisa; -TEST_CASE("EmptyTokenStream") -{ +TEST_CASE("EmptyTokenStream") { EmptyTokenStream empty; REQUIRE(empty.next() == std::nullopt); } -TEST_CASE("SingleTokenStream") -{ +TEST_CASE("SingleTokenStream") { SingleTokenStream single("token"); REQUIRE(single.next() == "token"); REQUIRE(single.next() == std::nullopt); diff --git a/test/test_tokenizer.cpp b/test/test_tokenizer.cpp index 96f4cffa..adda88e8 100644 --- a/test/test_tokenizer.cpp +++ b/test/test_tokenizer.cpp @@ -14,30 +14,25 @@ using namespace pisa; -TEST_CASE("WhitespaceTokenizer") -{ - WHEN("Empty input") - { +TEST_CASE("WhitespaceTokenizer") { + WHEN("Empty input") { std::string input = ""; WhitespaceTokenStream tok(input); REQUIRE(tok.next() == std::nullopt); } - WHEN("Input with only whitespaces") - { + WHEN("Input with only whitespaces") { std::string input = " \t "; WhitespaceTokenStream tok(input); REQUIRE(tok.next() == std::nullopt); } - WHEN("Input without spaces around") - { + WHEN("Input without spaces around") { std::string input = "dog cat"; WhitespaceTokenStream tok(input); REQUIRE(tok.next() == "dog"); REQUIRE(tok.next() == "cat"); REQUIRE(tok.next() == std::nullopt); } - WHEN("Input with spaces around") - { + WHEN("Input with spaces around") { std::string input = "\tbling ##ing\tsting ?*I(*&()) "; WhitespaceTokenStream tok(input); REQUIRE(tok.next() == "bling"); @@ -46,20 +41,18 @@ TEST_CASE("WhitespaceTokenizer") REQUIRE(tok.next() == "?*I(*&())"); REQUIRE(tok.next() == std::nullopt); } - SECTION("With iterators") - { + SECTION("With iterators") { std::string input = "\tbling ##ing\tsting ?*I(*&()) "; WhitespaceTokenStream tok(input); REQUIRE( std::vector(tok.begin(), tok.end()) - == std::vector{"bling", "##ing", "sting", "?*I(*&())"}); + == std::vector{"bling", "##ing", "sting", "?*I(*&())"} + ); } } -TEST_CASE("EnglishTokenizer") -{ - SECTION("With next()") - { +TEST_CASE("EnglishTokenizer") { + SECTION("With next()") { std::string str("a 1 12 w0rd, token-izer. pup's, U.S.a., us., hel.lo"); EnglishTokenStream tok(str); REQUIRE(tok.next() == "a"); @@ -75,23 +68,23 @@ TEST_CASE("EnglishTokenizer") REQUIRE(tok.next() == "lo"); REQUIRE(tok.next() == std::nullopt); } - SECTION("With iterators") - { + SECTION("With iterators") { std::string str("a 1 12 w0rd, token-izer. pup's, U.S.a., us., hel.lo"); EnglishTokenStream tokenizer(str); REQUIRE( std::vector(tokenizer.begin(), tokenizer.end()) - == std::vector{ - "a", "1", "12", "w0rd", "token", "izer", "pup", "USa", "us", "hel", "lo"}); + == std::vector< + std::string>{"a", "1", "12", "w0rd", "token", "izer", "pup", "USa", "us", "hel", "lo"} + ); } } -TEST_CASE("Parse query terms to ids") -{ +TEST_CASE("Parse query terms to ids") { pisa::TemporaryDirectory tmpdir; auto lexfile = tmpdir.path() / "lex"; encode_payload_vector( - gsl::make_span(std::vector{"lol", "obama", "term2", "tree", "usa"})) + gsl::make_span(std::vector{"lol", "obama", "term2", "tree", "usa"}) + ) .to_file(lexfile.string()); auto [query, id, parsed] = @@ -101,7 +94,8 @@ TEST_CASE("Parse query terms to ids") {"obama, family, trees", std::nullopt, {1, 3}}, {"obama + family + tree", std::nullopt, {1, 3}}, {"lol's", std::nullopt, {0}}, - {"U.S.A.!?", std::nullopt, {4}}})); + {"U.S.A.!?", std::nullopt, {4}}} + )); CAPTURE(query); TermProcessor term_processor(std::make_optional(lexfile.string()), std::nullopt, "krovetz"); EnglishTokenizer tokenizer; diff --git a/test/test_topk_queue.cpp b/test/test_topk_queue.cpp index c84d9e52..ab9b85fc 100644 --- a/test/test_topk_queue.cpp +++ b/test/test_topk_queue.cpp @@ -10,49 +10,46 @@ using namespace rc; /// Scale scores to (0, 1] to get smaller score differences. -auto scale_unit(float score) -> float -{ +auto scale_unit(float score) -> float { return std::max(score / std::numeric_limits::max(), std::numeric_limits::min()); } -auto gen_postings(int min_length, int max_length) -{ +auto gen_postings(int min_length, int max_length) { return gen::mapcat(gen::inRange(min_length, max_length), [](int length) { return rc::gen::pair( gen::container>(length, gen::map(gen::positive(), scale_unit)), - gen::unique>(length, gen::positive())); + gen::unique>(length, gen::positive()) + ); }); } -auto gen_quantized_postings(int min_length, int max_length) -{ +auto gen_quantized_postings(int min_length, int max_length) { return gen::mapcat(gen::inRange(min_length, max_length), [](int length) { return rc::gen::pair( gen::container>( length, - gen::map(gen::positive(), [](auto i) { return static_cast(i); })), - gen::unique>(length, gen::positive())); + gen::map(gen::positive(), [](auto i) { return static_cast(i); }) + ), + gen::unique>(length, gen::positive()) + ); }); } void accumulate( - pisa::topk_queue& topk, std::vector const& scores, std::vector const& docids) -{ + pisa::topk_queue& topk, std::vector const& scores, std::vector const& docids +) { for (int posting = 0; posting < docids.size(); ++posting) { topk.insert(scores[posting], docids[posting]); } } -auto kth(std::vector scores, int k) -> float -{ +auto kth(std::vector scores, int k) -> float { std::sort(scores.begin(), scores.end(), std::greater{}); return scores.at(k - 1); } -TEST_CASE("Threshold", "[topk_queue][prop]") -{ - SECTION("When initial = 0.0, the final threshold is the k-th score") - { +TEST_CASE("Threshold", "[topk_queue][prop]") { + SECTION("When initial = 0.0, the final threshold is the k-th score") { check([] { auto [scores, docids] = *gen_postings(10, 1000); @@ -66,8 +63,7 @@ TEST_CASE("Threshold", "[topk_queue][prop]") }); } - SECTION("When too few postings, then final threshold 0.0") - { + SECTION("When too few postings, then final threshold 0.0") { check([] { auto [scores, docids] = *gen_postings(1, 9); pisa::topk_queue topk(10); @@ -78,8 +74,7 @@ TEST_CASE("Threshold", "[topk_queue][prop]") }); } - SECTION("When too few postings and initial threshold, then final threshold equal to initial") - { + SECTION("When too few postings and initial threshold, then final threshold equal to initial") { check([] { auto [scores, docids] = *gen_postings(1, 9); auto initial = *gen::positive(); @@ -91,10 +86,8 @@ TEST_CASE("Threshold", "[topk_queue][prop]") }); } - SECTION("When initial is exact, final is the same") - { - SECTION("floats") - { + SECTION("When initial is exact, final is the same") { + SECTION("floats") { check([] { auto [scores, docids] = *gen_postings(10, 1000); auto initial = kth(scores, 10); @@ -105,8 +98,7 @@ TEST_CASE("Threshold", "[topk_queue][prop]") REQUIRE(topk.effective_threshold() == topk.initial_threshold()); }); } - SECTION("quantized") - { + SECTION("quantized") { check([] { auto [scores, docids] = *gen_quantized_postings(10, 1000); auto initial = kth(scores, 10); @@ -119,8 +111,7 @@ TEST_CASE("Threshold", "[topk_queue][prop]") } } - SECTION("When initial is too high, true is lower than effective") - { + SECTION("When initial is too high, true is lower than effective") { check([] { auto [scores, docids] = *gen_postings(10, 1000); auto initial = std::nextafter(kth(scores, 10), std::numeric_limits::max()); @@ -132,8 +123,7 @@ TEST_CASE("Threshold", "[topk_queue][prop]") }); } - SECTION("Threshold never decreases") - { + SECTION("Threshold never decreases") { check([] { auto [scores, docids] = *gen_postings(10, 1000); diff --git a/test/test_trec_topic_reader.cpp b/test/test_trec_topic_reader.cpp index 8b65c844..e05f2931 100644 --- a/test/test_trec_topic_reader.cpp +++ b/test/test_trec_topic_reader.cpp @@ -6,8 +6,7 @@ using namespace pisa; -TEST_CASE("Read topic", "[unit]") -{ +TEST_CASE("Read topic", "[unit]") { std::istringstream is( "\n" " Number: 301 \n" @@ -16,7 +15,8 @@ TEST_CASE("Read topic", "[unit]") "Some description here. \n" " Narrative:\n" "Some narrative content. \n" - "\n"); + "
\n" + ); pisa::trec_topic_reader reader(is); auto topic = reader.next_topic(); @@ -28,8 +28,7 @@ TEST_CASE("Read topic", "[unit]") REQUIRE(reader.next_topic() == std::nullopt); } -TEST_CASE("Read multiple topics", "[unit]") -{ +TEST_CASE("Read multiple topics", "[unit]") { std::istringstream is( "\n" " Number: 301 \n" @@ -49,7 +48,8 @@ TEST_CASE("Read multiple topics", "[unit]") "Some other description. \n" "\n" "Some other narrative\n... narrative" - "\n"); + "
\n" + ); pisa::trec_topic_reader reader(is); auto topic = reader.next_topic(); @@ -67,8 +67,7 @@ TEST_CASE("Read multiple topics", "[unit]") REQUIRE(reader.next_topic() == std::nullopt); } -TEST_CASE("Read topic with closing tags", "[unit]") -{ +TEST_CASE("Read topic with closing tags", "[unit]") { std::istringstream is( "\n" " Number: 301 \n" @@ -79,7 +78,8 @@ TEST_CASE("Read topic with closing tags", "[unit]") " Narrative:\n" "Some narrative content. \n" "" - "\n"); + "
\n" + ); pisa::trec_topic_reader reader(is); auto topic = reader.next_topic(); @@ -91,13 +91,13 @@ TEST_CASE("Read topic with closing tags", "[unit]") REQUIRE(reader.next_topic() == std::nullopt); } -TEST_CASE("Invalid topic", "[unit]") -{ +TEST_CASE("Invalid topic", "[unit]") { { std::istringstream is( "\n" "Number: 301 \n" - "\n"); + "
\n" + ); pisa::trec_topic_reader reader(is); REQUIRE_THROWS(reader.next_topic()); @@ -106,7 +106,8 @@ TEST_CASE("Invalid topic", "[unit]") std::istringstream is( "\n" "Number: 301 \n" - "\n"); + "
\n" + ); pisa::trec_topic_reader reader(is); REQUIRE_THROWS(reader.next_topic()); @@ -116,7 +117,8 @@ TEST_CASE("Invalid topic", "[unit]") "\n" "Number: 301 \n" " title here. \n" - "</top>\n"); + "</top>\n" + ); pisa::trec_topic_reader reader(is); REQUIRE_THROWS(reader.next_topic()); @@ -127,7 +129,8 @@ TEST_CASE("Invalid topic", "[unit]") "<num>Number: 301 \n" "<title> title here. \n" "<desc> description here. \n" - "</top>\n"); + "</top>\n" + ); pisa::trec_topic_reader reader(is); REQUIRE_THROWS(reader.next_topic()); @@ -138,7 +141,8 @@ TEST_CASE("Invalid topic", "[unit]") "<num>Number: 301 \n" "<title> title here. \n" "<desc> description here. \n" - "<narr> narrative here. \n"); + "<narr> narrative here. \n" + ); pisa::trec_topic_reader reader(is); REQUIRE_THROWS(reader.next_topic()); diff --git a/test/test_uniform_partitioned_sequence.cpp b/test/test_uniform_partitioned_sequence.cpp index 17580c65..8a7fce6f 100644 --- a/test/test_uniform_partitioned_sequence.cpp +++ b/test/test_uniform_partitioned_sequence.cpp @@ -8,8 +8,7 @@ #include <cstdlib> #include <vector> -TEST_CASE("uniform_partitioned_sequence") -{ +TEST_CASE("uniform_partitioned_sequence") { pisa::global_parameters params; using pisa::indexed_sequence; using pisa::strict_sequence; diff --git a/test/test_wand_data.cpp b/test/test_wand_data.cpp index 826ea8f4..195aea2b 100644 --- a/test/test_wand_data.cpp +++ b/test/test_wand_data.cpp @@ -16,8 +16,7 @@ using namespace pisa; -TEST_CASE("wand_data_range") -{ +TEST_CASE("wand_data_range") { tbb::global_control c(oneapi::tbb::global_control::max_allowed_parallelism, 2); using WandTypeRange = wand_data_range<64, 1024>; using WandType = wand_data<WandTypeRange>; @@ -34,12 +33,12 @@ TEST_CASE("wand_data_range") ScorerParams(scorer_name), BlockSize(FixedBlock(5)), std::nullopt, - dropped_term_ids); + dropped_term_ids + ); auto scorer = scorer::from_params(ScorerParams(scorer_name), wdata_range); - SECTION("Precomputed block-max scores") - { + SECTION("Precomputed block-max scores") { size_t term_id = 0; for (auto const& seq: collection) { if (seq.docs.size() >= 1024) { @@ -49,11 +48,11 @@ TEST_CASE("wand_data_range") for (auto&& [docid, freq]: ranges::views::zip(seq.docs, seq.freqs)) { float score = s(docid, freq); w.next_geq(docid); - CHECKED_ELSE(w.score() >= score) - { + CHECKED_ELSE(w.score() >= score) { FAIL( "Term: " << term_id << " docid: " << docid - << ", block docid: " << w.docid()); + << ", block docid: " << w.docid() + ); } REQUIRE(w.score() <= max); } @@ -67,13 +66,11 @@ TEST_CASE("wand_data_range") index_type::builder builder(collection.num_docs(), params); for (auto const& plist: collection) { uint64_t freqs_sum = std::accumulate(plist.freqs.begin(), plist.freqs.end(), uint64_t(0)); - builder.add_posting_list( - plist.docs.size(), plist.docs.begin(), plist.freqs.begin(), freqs_sum); + builder.add_posting_list(plist.docs.size(), plist.docs.begin(), plist.freqs.begin(), freqs_sum); } builder.build(index); - SECTION("Compute at run time") - { + SECTION("Compute at run time") { size_t term_id = 0; for (auto const& seq: collection) { auto list = index[term_id]; @@ -87,11 +84,11 @@ TEST_CASE("wand_data_range") ranges::views::zip(ranges::views::iota(0), seq.docs, seq.freqs)) { float score = s(docid, freq); we.next_geq(docid); - CHECKED_ELSE(we.score() >= score) - { + CHECKED_ELSE(we.score() >= score) { FAIL( "Term: " << term_id << " docid: " << docid << ", pos: " << pos - << ", block docid: " << we.docid()); + << ", block docid: " << we.docid() + ); } REQUIRE(we.score() <= max); } @@ -100,8 +97,7 @@ TEST_CASE("wand_data_range") } } - SECTION("Live block computation") - { + SECTION("Live block computation") { size_t i = 0; std::vector<WandTypeRange::enumerator> enums; for (auto const& seq: collection) { diff --git a/tools/app.cpp b/tools/app.cpp index c6c133f4..9fdebd32 100644 --- a/tools/app.cpp +++ b/tools/app.cpp @@ -3,27 +3,22 @@ namespace pisa::arg { -Encoding::Encoding(CLI::App* app) -{ +Encoding::Encoding(CLI::App* app) { app->add_option("-e,--encoding", m_encoding, "Index encoding")->required(); } -auto Encoding::index_encoding() const -> std::string const& -{ +auto Encoding::index_encoding() const -> std::string const& { return m_encoding; } -Index::Index(CLI::App* app) : Encoding(app) -{ +Index::Index(CLI::App* app) : Encoding(app) { app->add_option("-i,--index", m_index, "Inverted index filename")->required(); } -auto Index::index_filename() const -> std::string const& -{ +auto Index::index_filename() const -> std::string const& { return m_index; } -Analyzer::Analyzer(CLI::App* app) -{ +Analyzer::Analyzer(CLI::App* app) { app->add_option("--tokenizer", m_tokenizer, "Tokenizer") ->capture_default_str() ->check(CLI::IsMember(VALID_TOKENIZERS)); @@ -31,21 +26,18 @@ Analyzer::Analyzer(CLI::App* app) app->add_option("-F,--token-filters", m_token_filters, "Token filters") ->check(CLI::IsMember(VALID_TOKEN_FILTERS)); app->add_option( - "--stopwords", - m_stopwords_file, - "Path to file containing a list of stop words to filter out"); + "--stopwords", m_stopwords_file, "Path to file containing a list of stop words to filter out" + ); } -auto Analyzer::tokenizer() const -> std::unique_ptr<::pisa::Tokenizer> -{ +auto Analyzer::tokenizer() const -> std::unique_ptr<::pisa::Tokenizer> { if (m_tokenizer == "whitespace") { return std::make_unique<WhitespaceTokenizer>(); } return std::make_unique<EnglishTokenizer>(); } -auto Analyzer::text_analyzer() const -> TextAnalyzer -{ +auto Analyzer::text_analyzer() const -> TextAnalyzer { TextAnalyzer analyzer(tokenizer()); if (m_strip_html) { analyzer.emplace_text_filter<StripHtmlFilter>(); @@ -71,20 +63,19 @@ auto Analyzer::text_analyzer() const -> TextAnalyzer const std::set<std::string> Analyzer::VALID_TOKENIZERS = {"whitespace", "english"}; const std::set<std::string> Analyzer::VALID_TOKEN_FILTERS = {"lowercase", "porter2", "krovetz"}; -LogLevel::LogLevel(CLI::App* app) -{ +LogLevel::LogLevel(CLI::App* app) { app->add_option("-L,--log-level", m_level, "Log level") ->capture_default_str() ->check(CLI::IsMember(VALID_LEVELS)); } -auto LogLevel::log_level() const -> spdlog::level::level_enum -{ +auto LogLevel::log_level() const -> spdlog::level::level_enum { return ENUM_MAP.at(m_level); } const std::set<std::string> LogLevel::VALID_LEVELS = { - "trace", "debug", "info", "warn", "err", "critical", "off"}; + "trace", "debug", "info", "warn", "err", "critical", "off" +}; const std::map<std::string, spdlog::level::level_enum> LogLevel::ENUM_MAP = { {"trace", spdlog::level::level_enum::trace}, {"debug", spdlog::level::level_enum::debug}, @@ -92,149 +83,130 @@ const std::map<std::string, spdlog::level::level_enum> LogLevel::ENUM_MAP = { {"warn", spdlog::level::level_enum::warn}, {"err", spdlog::level::level_enum::err}, {"critical", spdlog::level::level_enum::critical}, - {"off", spdlog::level::level_enum::off}}; + {"off", spdlog::level::level_enum::off} +}; -Algorithm::Algorithm(CLI::App* app) -{ +Algorithm::Algorithm(CLI::App* app) { app->add_option("-a,--algorithm", m_algorithm, "Query processing algorithm")->required(); } -auto Algorithm::algorithm() const -> std::string const& -{ +auto Algorithm::algorithm() const -> std::string const& { return m_algorithm; } -Quantize::Quantize(CLI::App* app) : m_params("") -{ +Quantize::Quantize(CLI::App* app) : m_params("") { auto* wand = app->add_option("-w,--wand", m_wand_data_path, "WAND data filename"); auto* scorer = add_scorer_options(app, *this, ScorerMode::Optional); auto* quant = app->add_option( - "--quantize", m_quantization_bits, "Quantizes the scores using this many bits"); + "--quantize", m_quantization_bits, "Quantizes the scores using this many bits" + ); wand->needs(scorer); scorer->needs(wand); scorer->needs(quant); quant->needs(scorer); } -auto Quantize::scorer_params() const -> ScorerParams -{ +auto Quantize::scorer_params() const -> ScorerParams { return m_params; } -auto Quantize::wand_data_path() const -> std::optional<std::string> const& -{ +auto Quantize::wand_data_path() const -> std::optional<std::string> const& { return m_wand_data_path; } -auto Quantize::quantization_bits() const -> std::optional<Size> -{ +auto Quantize::quantization_bits() const -> std::optional<Size> { if (m_quantization_bits.has_value()) { return Size(*m_quantization_bits); } return std::nullopt; } -Scorer::Scorer(CLI::App* app) : m_params("") -{ +Scorer::Scorer(CLI::App* app) : m_params("") { add_scorer_options(app, *this, ScorerMode::Required); } -auto Scorer::scorer_params() const -> ScorerParams -{ +auto Scorer::scorer_params() const -> ScorerParams { return m_params; } -Thresholds::Thresholds(CLI::App* app) -{ - m_option = app->add_option( - "-T,--thresholds", m_thresholds_filename, "File containing query thresholds"); +Thresholds::Thresholds(CLI::App* app) { + m_option = + app->add_option("-T,--thresholds", m_thresholds_filename, "File containing query thresholds"); } -auto Thresholds::thresholds_file() const -> std::optional<std::string> const& -{ +auto Thresholds::thresholds_file() const -> std::optional<std::string> const& { return m_thresholds_filename; } -auto Thresholds::thresholds_option() -> CLI::Option* -{ +auto Thresholds::thresholds_option() -> CLI::Option* { return m_option; } -Verbose::Verbose(CLI::App* app) -{ +Verbose::Verbose(CLI::App* app) { app->add_flag("-v,--verbose", m_verbose, "Print additional information"); } -auto Verbose::verbose() const -> bool -{ +auto Verbose::verbose() const -> bool { return m_verbose; } -auto Verbose::print_args(std::ostream& os) const -> std::ostream& -{ +auto Verbose::print_args(std::ostream& os) const -> std::ostream& { os << fmt::format("verbose: {}\n", verbose()); return os; } -Threads::Threads(CLI::App* app) -{ +Threads::Threads(CLI::App* app) { app->add_option("-j,--threads", m_threads, "Number of threads"); } -auto Threads::threads() const -> std::size_t -{ +auto Threads::threads() const -> std::size_t { return m_threads; } -auto Threads::print_args(std::ostream& os) const -> std::ostream& -{ +auto Threads::print_args(std::ostream& os) const -> std::ostream& { os << fmt::format("threads: {}\n", threads()); return os; } -Invert::Invert(CLI::App* app) -{ +Invert::Invert(CLI::App* app) { app->add_option("-i,--input", m_input_basename, "Forward index basename")->required(); app->add_option("-o,--output", m_output_basename, "Output inverted index basename")->required(); app->add_option("--term-count", m_term_count, "Number of distinct terms in the forward index"); } -auto Invert::input_basename() const -> std::string -{ +auto Invert::input_basename() const -> std::string { return m_input_basename; } -auto Invert::output_basename() const -> std::string -{ +auto Invert::output_basename() const -> std::string { return m_output_basename; } -auto Invert::term_count() const -> std::optional<std::uint32_t> -{ +auto Invert::term_count() const -> std::optional<std::uint32_t> { return m_term_count; } /// Transform paths for `shard`. -void Invert::apply_shard(Shard_Id shard) -{ +void Invert::apply_shard(Shard_Id shard) { m_input_basename = expand_shard(m_input_basename, shard); m_output_basename = expand_shard(m_output_basename, shard); } -CreateWandData::CreateWandData(CLI::App* app) : m_params("") -{ +CreateWandData::CreateWandData(CLI::App* app) : m_params("") { app->add_option("-c,--collection", m_input_basename, "Collection basename")->required(); app->add_option("-o,--output", m_output, "Output filename")->required(); auto block_group = app->add_option_group("blocks"); auto block_size_opt = block_group->add_option( - "-b,--block-size", m_fixed_block_size, "Block size for fixed-length blocks"); + "-b,--block-size", m_fixed_block_size, "Block size for fixed-length blocks" + ); auto block_lambda_opt = block_group->add_option("-l,--lambda", m_lambda, "Lambda parameter for variable blocks") ->excludes(block_size_opt); block_group->require_option(); auto* quant = app->add_option( - "--quantize", m_quantization_bits, "Quantizes the scores using this many bits"); + "--quantize", m_quantization_bits, "Quantizes the scores using this many bits" + ); app->add_flag("--compress", m_compress, "Compress additional data")->needs(quant); add_scorer_options(app, *this, ScorerMode::Required); app->add_flag("--range", m_range, "Create docid-range based data") @@ -243,26 +215,23 @@ CreateWandData::CreateWandData(CLI::App* app) : m_params("") app->add_option( "--terms-to-drop", m_terms_to_drop_filename, - "A filename containing a list of term IDs that we want to drop"); + "A filename containing a list of term IDs that we want to drop" + ); } -auto CreateWandData::input_basename() const -> std::string -{ +auto CreateWandData::input_basename() const -> std::string { return m_input_basename; } -auto CreateWandData::output() const -> std::string -{ +auto CreateWandData::output() const -> std::string { return m_output; } -auto CreateWandData::scorer_params() const -> ScorerParams -{ +auto CreateWandData::scorer_params() const -> ScorerParams { return m_params; } -auto CreateWandData::block_size() const -> BlockSize -{ +auto CreateWandData::block_size() const -> BlockSize { if (m_lambda) { spdlog::info("Lambda {}", *m_lambda); return VariableBlock(*m_lambda); @@ -271,8 +240,7 @@ auto CreateWandData::block_size() const -> BlockSize return FixedBlock(*m_fixed_block_size); } -auto CreateWandData::dropped_term_ids() const -> std::unordered_set<size_t> -{ +auto CreateWandData::dropped_term_ids() const -> std::unordered_set<size_t> { std::unordered_set<size_t> dropped_term_ids; if (!m_terms_to_drop_filename) { return dropped_term_ids; @@ -281,27 +249,24 @@ auto CreateWandData::dropped_term_ids() const -> std::unordered_set<size_t> copy( std::istream_iterator<size_t>(dropped_terms_file), std::istream_iterator<size_t>(), - std::inserter(dropped_term_ids, dropped_term_ids.end())); + std::inserter(dropped_term_ids, dropped_term_ids.end()) + ); return dropped_term_ids; } -auto CreateWandData::lambda() const -> std::optional<float> -{ +auto CreateWandData::lambda() const -> std::optional<float> { return m_lambda; } -auto CreateWandData::compress() const -> bool -{ +auto CreateWandData::compress() const -> bool { return m_compress; } -auto CreateWandData::range() const -> bool -{ +auto CreateWandData::range() const -> bool { return m_range; } -auto CreateWandData::quantization_bits() const -> std::optional<Size> -{ +auto CreateWandData::quantization_bits() const -> std::optional<Size> { if (m_quantization_bits.has_value()) { return Size(*m_quantization_bits); } @@ -309,14 +274,12 @@ auto CreateWandData::quantization_bits() const -> std::optional<Size> } /// Transform paths for `shard`. -void CreateWandData::apply_shard(Shard_Id shard) -{ +void CreateWandData::apply_shard(Shard_Id shard) { m_input_basename = expand_shard(m_input_basename, shard); m_output = expand_shard(m_output, shard); } -ReorderDocuments::ReorderDocuments(CLI::App* app) -{ +ReorderDocuments::ReorderDocuments(CLI::App* app) { app->add_option("-c,--collection", m_input_basename, "Collection basename")->required(); auto output = app->add_option("-o,--output", m_output_basename, "Output basename"); auto docs_opt = app->add_option("--documents", m_doclex, "Document lexicon"); @@ -328,13 +291,16 @@ ReorderDocuments::ReorderDocuments(CLI::App* app) "--random", m_random, "Assign IDs randomly. You can use --seed for deterministic " - "results.") + "results." + ) ->needs(output); methods->add_option( - "--from-mapping", m_mapping, "Use the mapping defined in this new-line delimited text file"); + "--from-mapping", m_mapping, "Use the mapping defined in this new-line delimited text file" + ); methods->add_option("--by-feature", m_feature, "Order by URLs from this file"); auto bp = methods->add_flag( - "--recursive-graph-bisection,--bp", m_bp, "Use recursive graph bisection algorithm"); + "--recursive-graph-bisection,--bp", m_bp, "Use recursive graph bisection algorithm" + ); methods->require_option(1); // --random @@ -353,92 +319,74 @@ ReorderDocuments::ReorderDocuments(CLI::App* app) optconf->excludes(optdepth); } -auto ReorderDocuments::input_basename() const -> std::string -{ +auto ReorderDocuments::input_basename() const -> std::string { return m_input_basename; } -auto ReorderDocuments::output_basename() const -> std::optional<std::string> -{ +auto ReorderDocuments::output_basename() const -> std::optional<std::string> { return m_output_basename; } -auto ReorderDocuments::document_lexicon() const -> std::optional<std::string> -{ +auto ReorderDocuments::document_lexicon() const -> std::optional<std::string> { return m_doclex; } -auto ReorderDocuments::reordered_document_lexicon() const -> std::optional<std::string> -{ +auto ReorderDocuments::reordered_document_lexicon() const -> std::optional<std::string> { return m_reordered_doclex; } -auto ReorderDocuments::random() const -> bool -{ +auto ReorderDocuments::random() const -> bool { return m_random; } -auto ReorderDocuments::feature_file() const -> std::optional<std::string> -{ +auto ReorderDocuments::feature_file() const -> std::optional<std::string> { return m_feature; } -auto ReorderDocuments::bp() const -> bool -{ +auto ReorderDocuments::bp() const -> bool { return m_bp; } -auto ReorderDocuments::mapping_file() const -> std::optional<std::string> -{ +auto ReorderDocuments::mapping_file() const -> std::optional<std::string> { return m_mapping; } -auto ReorderDocuments::seed() const -> std::uint64_t -{ +auto ReorderDocuments::seed() const -> std::uint64_t { return m_seed; } -auto ReorderDocuments::input_collection() const -> binary_freq_collection -{ +auto ReorderDocuments::input_collection() const -> binary_freq_collection { return binary_freq_collection(input_basename().c_str()); } -auto ReorderDocuments::input_fwd() const -> std::optional<std::string> -{ +auto ReorderDocuments::input_fwd() const -> std::optional<std::string> { return m_input_fwd; } -auto ReorderDocuments::output_fwd() const -> std::optional<std::string> -{ +auto ReorderDocuments::output_fwd() const -> std::optional<std::string> { return m_output_fwd; } -auto ReorderDocuments::min_length() const -> std::size_t -{ +auto ReorderDocuments::min_length() const -> std::size_t { return m_min_len; } -auto ReorderDocuments::depth() const -> std::optional<std::size_t> -{ +auto ReorderDocuments::depth() const -> std::optional<std::size_t> { return m_depth; } -auto ReorderDocuments::nogb() const -> bool -{ +auto ReorderDocuments::nogb() const -> bool { return m_nogb; } -auto ReorderDocuments::print() const -> bool -{ +auto ReorderDocuments::print() const -> bool { return m_print; } -auto ReorderDocuments::node_config() const -> std::optional<std::string> -{ +auto ReorderDocuments::node_config() const -> std::optional<std::string> { return m_node_config; } -void ReorderDocuments::apply_shard(Shard_Id shard) -{ +void ReorderDocuments::apply_shard(Shard_Id shard) { m_input_basename = expand_shard(m_input_basename, shard); if (m_output_basename) { m_output_basename = expand_shard(*m_output_basename, shard); @@ -462,26 +410,23 @@ void ReorderDocuments::apply_shard(Shard_Id shard) } Separator::Separator(CLI::App* app, std::string default_separator) - : m_separator(std::move(default_separator)) -{ + : m_separator(std::move(default_separator)) { app->add_option("--sep", m_separator, "Separator string"); } -auto Separator::separator() const -> std::string const& -{ +auto Separator::separator() const -> std::string const& { return m_separator; } -PrintQueryId::PrintQueryId(CLI::App* app) -{ +PrintQueryId::PrintQueryId(CLI::App* app) { app->add_flag( "--query-id", m_print_query_id, - "Print query ID at the beginning of each line, separated by a colon"); + "Print query ID at the beginning of each line, separated by a colon" + ); } -auto PrintQueryId::print_query_id() const -> bool -{ +auto PrintQueryId::print_query_id() const -> bool { return m_print_query_id; } diff --git a/tools/app.hpp b/tools/app.hpp index 7c20f1a3..5c6342dd 100644 --- a/tools/app.hpp +++ b/tools/app.hpp @@ -40,8 +40,7 @@ namespace arg { template <WandMode Mode = WandMode::Required> struct WandData { - explicit WandData(CLI::App* app) - { + explicit WandData(CLI::App* app) { auto* wand = app->add_option("-w,--wand", m_wand_data_path, "WAND data filename"); app->add_flag("--compressed-wand", m_wand_compressed, "Compressed WAND data file") ->needs(wand); @@ -51,8 +50,7 @@ namespace arg { } } - [[nodiscard]] auto wand_data_path() const - { + [[nodiscard]] auto wand_data_path() const { if constexpr (Mode == WandMode::Required) { return *m_wand_data_path; } else { @@ -62,8 +60,7 @@ namespace arg { [[nodiscard]] auto is_wand_compressed() const -> bool { return m_wand_compressed; } /// Transform paths for `shard`. - void apply_shard(Shard_Id shard) - { + void apply_shard(Shard_Id shard) { if (m_wand_data_path) { m_wand_data_path = expand_shard(*m_wand_data_path, shard); } @@ -104,8 +101,7 @@ namespace arg { template <QueryMode Mode = QueryMode::Ranked> struct Query: public Analyzer { - explicit Query(CLI::App* app) : Analyzer(app) - { + explicit Query(CLI::App* app) : Analyzer(app) { app->add_option("-q,--queries", m_query_file, "Path to file with queries") ->capture_default_str(); m_terms_option = app->add_option("--terms", m_term_lexicon, "Term lexicon"); @@ -115,16 +111,14 @@ namespace arg { } } - [[nodiscard]] auto query_file() -> std::optional<std::reference_wrapper<std::string const>> - { + [[nodiscard]] auto query_file() -> std::optional<std::reference_wrapper<std::string const>> { if (m_query_file) { return m_query_file.value(); } return std::nullopt; } - [[nodiscard]] auto queries() const -> std::vector<::pisa::Query> - { + [[nodiscard]] auto queries() const -> std::vector<::pisa::Query> { std::vector<::pisa::Query> q; std::unique_ptr<TermMap> term_map = [this]() -> std::unique_ptr<TermMap> { if (this->m_term_lexicon) { @@ -149,8 +143,7 @@ namespace arg { protected: [[nodiscard]] auto terms_option() const -> CLI::Option* { return m_terms_option; } - void override_term_lexicon(std::string term_lexicon) - { + void override_term_lexicon(std::string term_lexicon) { m_term_lexicon = std::move(term_lexicon); } @@ -173,8 +166,7 @@ namespace arg { enum class ScorerMode : bool { Required, Optional }; template <typename T> - CLI::Option* add_scorer_options(CLI::App* app, T& args, ScorerMode scorer_mode) - { + CLI::Option* add_scorer_options(CLI::App* app, T& args, ScorerMode scorer_mode) { CLI::Option* scorer; if (scorer_mode == ScorerMode::Required) { scorer = @@ -246,8 +238,7 @@ namespace arg { template <std::size_t Default = 100'000> struct BatchSize { - explicit BatchSize(CLI::App* app) - { + explicit BatchSize(CLI::App* app) { app->add_option("--batch-size", m_batch_size, "Number of documents to process at a time") ->capture_default_str(); } @@ -274,8 +265,7 @@ namespace arg { }; struct Compress { - explicit Compress(CLI::App* app) - { + explicit Compress(CLI::App* app) { app->add_option("-c,--collection", m_input_basename, "Uncompressed index basename") ->required(); app->add_option("-o,--output", m_output, "Output inverted index")->required(); @@ -287,8 +277,7 @@ namespace arg { [[nodiscard]] auto check() const -> bool { return m_check; } /// Transform paths for `shard`. - void apply_shard(Shard_Id shard) - { + void apply_shard(Shard_Id shard) { m_input_basename = expand_shard(m_input_basename, shard); m_output = expand_shard(m_output, shard); } @@ -417,21 +406,18 @@ namespace arg { */ template <typename... Args> struct App: public CLI::App, public Args... { - explicit App(std::string const& description) : CLI::App(description), Args(this)... - { + explicit App(std::string const& description) : CLI::App(description), Args(this)... { this->set_config("--config", "", "Configuration .ini file", false); } }; template <typename... T> struct Args: public T... { - explicit Args(CLI::App* app) : T(app)... - { + explicit Args(CLI::App* app) : T(app)... { app->set_config("--config", "", "Configuration .ini file", false); } - auto print_args(std::ostream& os) const -> std::ostream& - { + auto print_args(std::ostream& os) const -> std::ostream& { (T::print_args(os), ...); return os; } @@ -445,8 +431,7 @@ using CreateWandDataArgs = pisa::Args<arg::CreateWandData, arg::LogLevel>; struct TailyStatsArgs : pisa::Args<arg::WandData<arg::WandMode::Required>, arg::Scorer, arg::LogLevel> { explicit TailyStatsArgs(CLI::App* app) - : pisa::Args<arg::WandData<arg::WandMode::Required>, arg::Scorer, arg::LogLevel>(app) - { + : pisa::Args<arg::WandData<arg::WandMode::Required>, arg::Scorer, arg::LogLevel>(app) { app->add_option("-c,--collection", m_collection_path, "Binary collection basename")->required(); app->add_option("-o,--output", m_output_path, "Output file path")->required(); app->set_config("--config", "", "Configuration .ini file", false); @@ -456,8 +441,7 @@ struct TailyStatsArgs [[nodiscard]] auto output_path() const -> std::string const& { return m_output_path; } /// Transform paths for `shard`. - void apply_shard(Shard_Id shard) - { + void apply_shard(Shard_Id shard) { arg::WandData<arg::WandMode::Required>::apply_shard(shard); m_collection_path = expand_shard(m_collection_path, shard); m_output_path = expand_shard(m_output_path, shard); @@ -469,8 +453,7 @@ struct TailyStatsArgs }; struct TailyRankArgs: pisa::Args<arg::Query<arg::QueryMode::Ranked>> { - explicit TailyRankArgs(CLI::App* app) : pisa::Args<arg::Query<arg::QueryMode::Ranked>>(app) - { + explicit TailyRankArgs(CLI::App* app) : pisa::Args<arg::Query<arg::QueryMode::Ranked>>(app) { arg::Query<arg::QueryMode::Ranked>::terms_option()->required(true); app->add_option("--global-stats", m_global_stats, "Global Taily statistics")->required(); app->add_option("--shard-stats", m_shard_stats, "Shard-level Taily statistics")->required(); @@ -481,8 +464,7 @@ struct TailyRankArgs: pisa::Args<arg::Query<arg::QueryMode::Ranked>> { [[nodiscard]] auto global_stats() const -> std::string const& { return m_global_stats; } [[nodiscard]] auto shard_stats() const -> std::string const& { return m_shard_stats; } - void apply_shard(Shard_Id shard) - { + void apply_shard(Shard_Id shard) { m_shard_term_lexicon = expand_shard(m_shard_term_lexicon, shard); override_term_lexicon(m_shard_term_lexicon); m_shard_stats = expand_shard(m_shard_stats, shard); @@ -496,8 +478,7 @@ struct TailyRankArgs: pisa::Args<arg::Query<arg::QueryMode::Ranked>> { struct TailyThresholds: pisa::Args<arg::Query<arg::QueryMode::Ranked>, arg::LogLevel> { explicit TailyThresholds(CLI::App* app) - : pisa::Args<arg::Query<arg::QueryMode::Ranked>, arg::LogLevel>(app) - { + : pisa::Args<arg::Query<arg::QueryMode::Ranked>, arg::LogLevel>(app) { app->add_option("--stats", m_stats, "Taily statistics file")->required(); app->set_config("--config", "", "Configuration .ini file", false); } diff --git a/tools/compress_inverted_index.cpp b/tools/compress_inverted_index.cpp index f7bbac7a..556abc42 100644 --- a/tools/compress_inverted_index.cpp +++ b/tools/compress_inverted_index.cpp @@ -4,8 +4,7 @@ #include "app.hpp" #include "compress.hpp" -int main(int argc, char** argv) -{ +int main(int argc, char** argv) { spdlog::drop(""); spdlog::set_default_logger(spdlog::stderr_color_mt("")); CLI::App app{"Compresses an inverted index"}; @@ -19,5 +18,6 @@ int main(int argc, char** argv) args.output(), args.scorer_params(), args.quantization_bits(), - args.check()); + args.check() + ); } diff --git a/tools/compute_intersection.cpp b/tools/compute_intersection.cpp index c3cc81c2..a995a5c7 100644 --- a/tools/compute_intersection.cpp +++ b/tools/compute_intersection.cpp @@ -25,8 +25,8 @@ void intersect( std::optional<std::string> const& wand_data_filename, QueryRange&& queries, IntersectionType intersection_type, - std::optional<std::uint8_t> max_term_count = std::nullopt) -{ + std::optional<std::uint8_t> max_term_count = std::nullopt +) { IndexType index; mio::mmap_source m(index_filename.c_str()); mapper::map(index, m); @@ -53,7 +53,8 @@ void intersect( query.id ? *query.id : std::to_string(qid), mask.to_ulong(), intersection.length, - intersection.max_score); + intersection.max_score + ); }; for (auto&& query: queries) { @@ -65,7 +66,8 @@ void intersect( "{}\t{}\t{}\n", query.id ? *query.id : std::to_string(qid), intersection.length, - intersection.max_score); + intersection.max_score + ); } qid += 1; } @@ -73,8 +75,7 @@ void intersect( using wand_raw_index = wand_data<wand_data_raw>; -int main(int argc, const char** argv) -{ +int main(int argc, const char** argv) { spdlog::drop(""); spdlog::set_default_logger(spdlog::stderr_color_mt("")); @@ -87,11 +88,13 @@ int main(int argc, const char** argv) App<arg::Index, arg::WandData<arg::WandMode::Required>, arg::Query<arg::QueryMode::Unranked>, arg::LogLevel> app{"Computes intersections of posting lists."}; auto* combinations_flag = app.add_flag( - "--combinations", combinations, "Compute intersections for combinations of terms in query"); + "--combinations", combinations, "Compute intersections for combinations of terms in query" + ); app.add_option( "--max-term-count,--mtc", max_term_count, - "Max number of terms when computing combinations") + "Max number of terms when computing combinations" + ) ->needs(combinations_flag); app.add_option("--min-query-len", min_query_len, "Minimum query length"); app.add_option("--max-query-len", max_query_len, "Maximum query length"); @@ -119,16 +122,12 @@ int main(int argc, const char** argv) /**/ if (false) { -#define LOOP_BODY(R, DATA, T) \ - } \ - else if (app.index_encoding() == BOOST_PP_STRINGIZE(T)) \ - { \ - intersect<BOOST_PP_CAT(T, _index), wand_raw_index>( \ - app.index_filename(), \ - app.wand_data_path(), \ - filtered_queries, \ - intersection_type, \ - max_term_count); \ +#define LOOP_BODY(R, DATA, T) \ + } \ + else if (app.index_encoding() == BOOST_PP_STRINGIZE(T)) { \ + intersect<BOOST_PP_CAT(T, _index), wand_raw_index>( \ + app.index_filename(), app.wand_data_path(), filtered_queries, intersection_type, max_term_count \ + ); \ /**/ BOOST_PP_SEQ_FOR_EACH(LOOP_BODY, _, PISA_INDEX_TYPES); diff --git a/tools/count_postings.cpp b/tools/count_postings.cpp index 58ff9b4f..0f234eef 100644 --- a/tools/count_postings.cpp +++ b/tools/count_postings.cpp @@ -20,16 +20,18 @@ void extract( std::vector<pisa::Query> const& queries, std::string const& separator, bool sum, - bool print_qid) -{ + bool print_qid +) { Index index(MemorySource::mapped_file(index_filename)); auto body = [&] { if (sum) { return std::function<void(Query const&)>([&](auto const& query) { auto count = std::accumulate( - query.terms.begin(), query.terms.end(), 0, [&](auto s, auto term_id) { - return s + index[term_id].size(); - }); + query.terms.begin(), + query.terms.end(), + 0, + [&](auto s, auto term_id) { return s + index[term_id].size(); } + ); std::cout << count << '\n'; }); } @@ -38,7 +40,8 @@ void extract( query.terms | boost::adaptors::transformed([&index](auto term_id) { return std::to_string(index[term_id].size()); }), - separator); + separator + ); std::cout << '\n'; }); }(); @@ -50,8 +53,7 @@ void extract( } } -int main(int argc, char** argv) -{ +int main(int argc, char** argv) { spdlog::drop(""); spdlog::set_default_logger(spdlog::stderr_color_mt("")); @@ -63,19 +65,20 @@ int main(int argc, char** argv) "--sum", sum, "Sum postings accross the query terms; by default, individual list lengths will be " - "printed, separated by the separator defined with --sep"); + "printed, separated by the separator defined with --sep" + ); CLI11_PARSE(app, argc, argv); spdlog::set_level(app.log_level()); /**/ if (false) { -#define LOOP_BODY(R, DATA, T) \ - } \ - else if (app.index_encoding() == BOOST_PP_STRINGIZE(T)) \ - { \ - extract<BOOST_PP_CAT(T, _index)>( \ - app.index_filename(), app.queries(), app.separator(), sum, app.print_query_id()); \ +#define LOOP_BODY(R, DATA, T) \ + } \ + else if (app.index_encoding() == BOOST_PP_STRINGIZE(T)) { \ + extract<BOOST_PP_CAT(T, _index)>( \ + app.index_filename(), app.queries(), app.separator(), sum, app.print_query_id() \ + ); \ /**/ BOOST_PP_SEQ_FOR_EACH(LOOP_BODY, _, PISA_INDEX_TYPES); #undef LOOP_BODY diff --git a/tools/create_wand_data.cpp b/tools/create_wand_data.cpp index eed235e7..609a4ffd 100644 --- a/tools/create_wand_data.cpp +++ b/tools/create_wand_data.cpp @@ -1,8 +1,7 @@ #include "app.hpp" #include "wand_data.hpp" -int main(int argc, const char** argv) -{ +int main(int argc, const char** argv) { CLI::App app{"Creates additional data for query processing."}; pisa::CreateWandDataArgs args(&app); CLI11_PARSE(app, argc, argv); @@ -14,5 +13,6 @@ int main(int argc, const char** argv) args.range(), args.compress(), args.quantization_bits(), - args.dropped_term_ids()); + args.dropped_term_ids() + ); } diff --git a/tools/evaluate_collection_ordering.cpp b/tools/evaluate_collection_ordering.cpp index 73e3e164..1b5b3c0a 100644 --- a/tools/evaluate_collection_ordering.cpp +++ b/tools/evaluate_collection_ordering.cpp @@ -13,8 +13,7 @@ #include "util/log.hpp" #include "util/util.hpp" -int main(int argc, const char** argv) -{ +int main(int argc, const char** argv) { using namespace pisa; if (argc != 2) { diff --git a/tools/evaluate_queries.cpp b/tools/evaluate_queries.cpp index c961b35e..02e6f575 100644 --- a/tools/evaluate_queries.cpp +++ b/tools/evaluate_queries.cpp @@ -49,8 +49,8 @@ void evaluate_queries( ScorerParams const& scorer_params, const bool weighted, std::string const& run_id, - std::string const& iteration) -{ + std::string const& iteration +) { IndexType index(MemorySource::mapped_file(index_filename)); WandType const wdata(MemorySource::mapped_file(wand_data_filename)); @@ -71,7 +71,8 @@ void evaluate_queries( block_max_wand_query block_max_wand_q(topk); block_max_wand_q( make_block_max_scored_cursors(index, wdata, *scorer, query, weighted), - index.num_docs()); + index.num_docs() + ); topk.finalize(); return topk.topk(); }; @@ -81,7 +82,8 @@ void evaluate_queries( block_max_maxscore_query block_max_maxscore_q(topk); block_max_maxscore_q( make_block_max_scored_cursors(index, wdata, *scorer, query, weighted), - index.num_docs()); + index.num_docs() + ); topk.finalize(); return topk.topk(); }; @@ -91,7 +93,8 @@ void evaluate_queries( block_max_ranked_and_query block_max_ranked_and_q(topk); block_max_ranked_and_q( make_block_max_scored_cursors(index, wdata, *scorer, query, weighted), - index.num_docs()); + index.num_docs() + ); topk.finalize(); return topk.topk(); }; @@ -116,7 +119,8 @@ void evaluate_queries( topk_queue topk(k); maxscore_query maxscore_q(topk); maxscore_q( - make_max_scored_cursors(index, wdata, *scorer, query, weighted), index.num_docs()); + make_max_scored_cursors(index, wdata, *scorer, query, weighted), index.num_docs() + ); topk.finalize(); return topk.topk(); }; @@ -125,7 +129,8 @@ void evaluate_queries( topk_queue topk(k); ranked_or_taat_query ranked_or_taat_q(topk); ranked_or_taat_q( - make_scored_cursors(index, *scorer, query, weighted), index.num_docs(), accumulator); + make_scored_cursors(index, *scorer, query, weighted), index.num_docs(), accumulator + ); topk.finalize(); return topk.topk(); }; @@ -134,7 +139,8 @@ void evaluate_queries( topk_queue topk(k); ranked_or_taat_query ranked_or_taat_q(topk); ranked_or_taat_q( - make_scored_cursors(index, *scorer, query, weighted), index.num_docs(), accumulator); + make_scored_cursors(index, *scorer, query, weighted), index.num_docs(), accumulator + ); topk.finalize(); return topk.topk(); }; @@ -163,7 +169,8 @@ void evaluate_queries( docmap[result.second], rank + 1, result.first, - run_id); + run_id + ); } } auto end_print = std::chrono::steady_clock::now(); @@ -179,8 +186,7 @@ using wand_raw_index = wand_data<wand_data_raw>; using wand_uniform_index = wand_data<wand_data_compressed<>>; using wand_uniform_index_quantized = wand_data<wand_data_compressed<PayloadType::Quantized>>; -int main(int argc, const char** argv) -{ +int main(int argc, const char** argv) { spdlog::set_default_logger(spdlog::stderr_color_mt("default")); std::string documents_file; @@ -224,25 +230,25 @@ int main(int argc, const char** argv) app.scorer_params(), app.weighted(), run_id, - iteration); + iteration + ); /**/ if (false) { // NOLINT -#define LOOP_BODY(R, DATA, T) \ - } \ - else if (app.index_encoding() == BOOST_PP_STRINGIZE(T)) \ - { \ - if (app.is_wand_compressed()) { \ - if (quantized) { \ - std::apply( \ - evaluate_queries<BOOST_PP_CAT(T, _index), wand_uniform_index_quantized>, \ - params); \ - } else { \ - std::apply(evaluate_queries<BOOST_PP_CAT(T, _index), wand_uniform_index>, params); \ - } \ - } else { \ - std::apply(evaluate_queries<BOOST_PP_CAT(T, _index), wand_raw_index>, params); \ - } \ +#define LOOP_BODY(R, DATA, T) \ + } \ + else if (app.index_encoding() == BOOST_PP_STRINGIZE(T)) { \ + if (app.is_wand_compressed()) { \ + if (quantized) { \ + std::apply( \ + evaluate_queries<BOOST_PP_CAT(T, _index), wand_uniform_index_quantized>, params \ + ); \ + } else { \ + std::apply(evaluate_queries<BOOST_PP_CAT(T, _index), wand_uniform_index>, params); \ + } \ + } else { \ + std::apply(evaluate_queries<BOOST_PP_CAT(T, _index), wand_raw_index>, params); \ + } \ /**/ BOOST_PP_SEQ_FOR_EACH(LOOP_BODY, _, PISA_INDEX_TYPES); diff --git a/tools/extract_maxscores.cpp b/tools/extract_maxscores.cpp index 86f159c6..ef71b137 100644 --- a/tools/extract_maxscores.cpp +++ b/tools/extract_maxscores.cpp @@ -21,8 +21,8 @@ void extract( std::string const& wand_data_path, std::vector<pisa::Query> const& queries, std::string const& separator, - bool print_query_id) -{ + bool print_query_id +) { Wand wdata(MemorySource::mapped_file(wand_data_path)); for (auto const& query: queries) { if (print_query_id and query.id) { @@ -32,7 +32,8 @@ void extract( query.terms | boost::adaptors::transformed([&wdata](auto term_id) { return std::to_string(wdata.max_term_weight(term_id)); }), - separator); + separator + ); std::cout << '\n'; } } @@ -41,8 +42,7 @@ using wand_raw_index = wand_data<wand_data_raw>; using wand_uniform_index = wand_data<wand_data_compressed<>>; using wand_uniform_index_quantized = wand_data<wand_data_compressed<PayloadType::Quantized>>; -int main(int argc, char** argv) -{ +int main(int argc, char** argv) { spdlog::drop(""); spdlog::set_default_logger(spdlog::stderr_color_mt("")); @@ -58,7 +58,8 @@ int main(int argc, char** argv) Extracts max-scores for query terms from an inverted index. The max-scores will be printed to the output separated by --sep, -which is a tab by default.)"}; +which is a tab by default.)" + }; app.add_flag("--quantized", quantized, "Quantized scores"); CLI11_PARSE(app, argc, argv); diff --git a/tools/extract_topics.cpp b/tools/extract_topics.cpp index 76284939..41606458 100644 --- a/tools/extract_topics.cpp +++ b/tools/extract_topics.cpp @@ -9,15 +9,15 @@ #include "CLI/CLI.hpp" -int main(int argc, char const* argv[]) -{ +int main(int argc, char const* argv[]) { std::string input_filename; std::string output_basename; std::string format; bool unique = false; pisa::App<pisa::arg::LogLevel> app{ - "A tool for converting queries from several formats to PISA queries."}; + "A tool for converting queries from several formats to PISA queries." + }; app.add_option("-i,--input", input_filename, "TREC query input file")->required(); app.add_option("-o,--output", output_basename, "Output basename")->required(); app.add_option("-f,--format", format, "Input format")->required(); diff --git a/tools/invert.cpp b/tools/invert.cpp index d4a0a5ce..2d3f5a97 100644 --- a/tools/invert.cpp +++ b/tools/invert.cpp @@ -13,8 +13,7 @@ #include "invert.hpp" #include "util/util.hpp" -int main(int argc, char** argv) -{ +int main(int argc, char** argv) { CLI::App app{"Constructs an inverted index from a forward index."}; pisa::InvertArgs args(&app); CLI11_PARSE(app, argc, argv); diff --git a/tools/kth_threshold.cpp b/tools/kth_threshold.cpp index 4d4180db..9360f81a 100644 --- a/tools/kth_threshold.cpp +++ b/tools/kth_threshold.cpp @@ -26,13 +26,13 @@ using namespace pisa; -std::set<uint32_t> parse_tuple(std::string const& line, size_t k) -{ +std::set<uint32_t> parse_tuple(std::string const& line, size_t k) { std::vector<std::string> term_ids; boost::algorithm::split(term_ids, line, boost::is_any_of(" \t")); if (term_ids.size() != k) { throw std::runtime_error(fmt::format( - "Wrong number of terms in line: {} (expected {} but found {})", line, k, term_ids.size())); + "Wrong number of terms in line: {} (expected {} but found {})", line, k, term_ids.size() + )); } std::set<uint32_t> term_ids_int; @@ -40,8 +40,8 @@ std::set<uint32_t> parse_tuple(std::string const& line, size_t k) try { term_ids_int.insert(std::stoi(term_id)); } catch (...) { - throw std::runtime_error( - fmt::format("Cannot convert {} to int in line: {}", term_id, line)); + throw std::runtime_error(fmt::format("Cannot convert {} to int in line: {}", term_id, line) + ); } } return term_ids_int; @@ -59,8 +59,8 @@ void kt_thresholds( std::optional<std::string> pairs_filename, std::optional<std::string> triples_filename, bool all_pairs, - bool all_triples) -{ + bool all_triples +) { IndexType index; mio::mmap_source m(index_filename.c_str()); mapper::map(index, m); @@ -139,7 +139,8 @@ void kt_thresholds( Query query; query.terms = {terms[i], terms[j], terms[s]}; wand_q( - make_max_scored_cursors(index, wdata, *scorer, query), index.num_docs()); + make_max_scored_cursors(index, wdata, *scorer, query), index.num_docs() + ); threshold = std::max(threshold, topk.size() == k ? topk.true_threshold() : 0.0F); topk.clear(); @@ -155,8 +156,7 @@ using wand_raw_index = wand_data<wand_data_raw>; using wand_uniform_index = wand_data<wand_data_compressed<>>; using wand_uniform_index_quantized = wand_data<wand_data_compressed<PayloadType::Quantized>>; -int main(int argc, const char** argv) -{ +int main(int argc, const char** argv) { spdlog::drop(""); spdlog::set_default_logger(spdlog::stderr_color_mt("")); @@ -175,11 +175,13 @@ int main(int argc, const char** argv) "term, pair or triple of a query. Pairs and triples are only used if provided with " "--pairs and --triples respectively."}; auto pairs = app.add_option( - "-p,--pairs", pairs_filename, "A tab separated file containing all the cached term pairs"); + "-p,--pairs", pairs_filename, "A tab separated file containing all the cached term pairs" + ); auto triples = app.add_option( "-t,--triples", triples_filename, - "A tab separated file containing all the cached term triples"); + "A tab separated file containing all the cached term triples" + ); app.add_flag("--all-pairs", all_pairs, "Consider all term pairs of a query")->excludes(pairs); app.add_flag("--all-triples", all_triples, "Consider all term triples of a query")->excludes(triples); app.add_flag("--quantized", quantized, "Quantizes the scores"); @@ -199,23 +201,24 @@ int main(int argc, const char** argv) pairs_filename, triples_filename, all_pairs, - all_triples); + all_triples + ); /**/ if (false) { -#define LOOP_BODY(R, DATA, T) \ - } \ - else if (app.index_encoding() == BOOST_PP_STRINGIZE(T)) \ - { \ - if (app.is_wand_compressed()) { \ - if (quantized) { \ - std::apply( \ - kt_thresholds<BOOST_PP_CAT(T, _index), wand_uniform_index_quantized>, params); \ - } else { \ - std::apply(kt_thresholds<BOOST_PP_CAT(T, _index), wand_uniform_index>, params); \ - } \ - } else { \ - std::apply(kt_thresholds<BOOST_PP_CAT(T, _index), wand_raw_index>, params); \ +#define LOOP_BODY(R, DATA, T) \ + } \ + else if (app.index_encoding() == BOOST_PP_STRINGIZE(T)) { \ + if (app.is_wand_compressed()) { \ + if (quantized) { \ + std::apply( \ + kt_thresholds<BOOST_PP_CAT(T, _index), wand_uniform_index_quantized>, params \ + ); \ + } else { \ + std::apply(kt_thresholds<BOOST_PP_CAT(T, _index), wand_uniform_index>, params); \ + } \ + } else { \ + std::apply(kt_thresholds<BOOST_PP_CAT(T, _index), wand_raw_index>, params); \ } /**/ BOOST_PP_SEQ_FOR_EACH(LOOP_BODY, _, PISA_INDEX_TYPES); diff --git a/tools/lexicon.cpp b/tools/lexicon.cpp index d9243403..33fb4dd5 100644 --- a/tools/lexicon.cpp +++ b/tools/lexicon.cpp @@ -8,8 +8,7 @@ using namespace pisa; -int main(int argc, char** argv) -{ +int main(int argc, char** argv) { std::string text_file; std::string lexicon_file; std::size_t idx; @@ -36,7 +35,8 @@ int main(int argc, char** argv) if (*build) { std::ifstream is(text_file); encode_payload_vector( - std::istream_iterator<io::Line>(is), std::istream_iterator<io::Line>()) + std::istream_iterator<io::Line>(is), std::istream_iterator<io::Line>() + ) .to_file(lexicon_file); return 0; } diff --git a/tools/map_queries.cpp b/tools/map_queries.cpp index d5c2bc36..80376d9a 100644 --- a/tools/map_queries.cpp +++ b/tools/map_queries.cpp @@ -9,13 +9,13 @@ using namespace pisa; -int main(int argc, const char** argv) -{ +int main(int argc, const char** argv) { spdlog::drop(""); spdlog::set_default_logger(spdlog::stderr_color_mt("")); App<arg::Query<arg::QueryMode::Unranked>, arg::Separator, arg::PrintQueryId, arg::LogLevel> app{ - "A tool for transforming textual queries to IDs."}; + "A tool for transforming textual queries to IDs." + }; CLI11_PARSE(app, argc, argv); spdlog::set_level(app.log_level()); diff --git a/tools/parse_collection.cpp b/tools/parse_collection.cpp index 51ab69b2..0dce1204 100644 --- a/tools/parse_collection.cpp +++ b/tools/parse_collection.cpp @@ -13,16 +13,14 @@ using namespace pisa; -int main(int argc, char** argv) -{ +int main(int argc, char** argv) { auto valid_basename = [](std::string const& basename) { std::filesystem::path p(basename); auto parent = p.parent_path(); if (not std::filesystem::exists(parent) or not std::filesystem::is_directory(parent)) { return fmt::format( - "Basename {} invalid: path {} is not an existing directory", - basename, - parent.string()); + "Basename {} invalid: path {} is not an existing directory", basename, parent.string() + ); } return std::string(); }; @@ -33,7 +31,8 @@ int main(int argc, char** argv) ptrdiff_t batch_size = 100'000; pisa::App<pisa::arg::LogLevel, pisa::arg::Threads, pisa::arg::Analyzer> app{ - "parse_collection - parse collection and store as forward index."}; + "parse_collection - parse collection and store as forward index." + }; app.add_option("-o,--output", output_filename, "Forward index filename") ->required() ->check(valid_basename); @@ -47,7 +46,8 @@ int main(int argc, char** argv) "Merge previously produced batch files. " "When parsing process was killed during merging, " "use this command to finish merging without " - "having to restart building batches."); + "having to restart building batches." + ); merge_cmd->add_option("--batch-count", batch_count, "Number of batches")->required(); merge_cmd->add_option("--document-count", document_count, "Number of documents")->required(); @@ -69,7 +69,8 @@ int main(int argc, char** argv) record_parser(format, std::cin), std::make_shared<TextAnalyzer>(app.text_analyzer()), batch_size, - app.threads() + 1); + app.threads() + 1 + ); } } catch (std::exception& err) { spdlog::error(err.what()); diff --git a/tools/partition_fwd_index.cpp b/tools/partition_fwd_index.cpp index 51df0ad2..723e760a 100644 --- a/tools/partition_fwd_index.cpp +++ b/tools/partition_fwd_index.cpp @@ -27,8 +27,7 @@ using ranges::views::chunk; using ranges::views::iota; using ranges::views::zip; -int main(int argc, char** argv) -{ +int main(int argc, char** argv) { std::string input_basename; std::string output_basename; std::vector<std::string> shard_files; @@ -39,8 +38,7 @@ int main(int argc, char** argv) app.add_option("-i,--input", input_basename, "Forward index filename")->required(); app.add_option("-o,--output", output_basename, "Basename of partitioned shards")->required(); app.add_option("-j,--threads", threads, "Thread count"); - auto random_option = - app.add_option("-r,--random-shards", shard_count, "Number of random shards"); + auto random_option = app.add_option("-r,--random-shards", shard_count, "Number of random shards"); auto shard_files_option = app.add_option("-s,--shard-files", shard_files, "List of files with shard titles"); random_option->excludes(shard_files_option); @@ -58,7 +56,8 @@ int main(int argc, char** argv) partition_fwd_index(input_basename, output_basename, mapping); } else if (*shard_files_option) { auto mapping = mapping_from_files( - fmt::format("{}.documents", input_basename), gsl::make_span(shard_files)); + fmt::format("{}.documents", input_basename), gsl::make_span(shard_files) + ); partition_fwd_index(input_basename, output_basename, mapping); } else { spdlog::error("You must define either --random-shards or --shard-files"); diff --git a/tools/profile_queries.cpp b/tools/profile_queries.cpp index 81a04c38..78b2c0a0 100644 --- a/tools/profile_queries.cpp +++ b/tools/profile_queries.cpp @@ -25,8 +25,7 @@ using namespace pisa; template <typename QueryOperator> -void op_profile(QueryOperator const& query_op, std::vector<Query> const& queries) -{ +void op_profile(QueryOperator const& query_op, std::vector<Query> const& queries) { using namespace pisa; size_t n_threads = std::thread::hardware_concurrency(); @@ -69,8 +68,8 @@ void profile( const std::optional<std::string>& wand_data_filename, std::vector<Query> const& queries, std::string const& type, - std::string const& query_type) -{ + std::string const& query_type +) { using namespace pisa; typename add_profiling<IndexType>::type index; @@ -101,7 +100,8 @@ void profile( and_query and_q; return and_q( make_cursors<typename add_profiling<IndexType>::type>(index, query), - index.num_docs()) + index.num_docs() + ) .size(); }; } else if (t == "ranked_and" && wand_data_filename) { @@ -109,9 +109,9 @@ void profile( topk_queue topk(10); ranked_and_query ranked_and_q(topk); ranked_and_q( - make_scored_cursors<typename add_profiling<IndexType>::type>( - index, *scorer, query), - index.num_docs()); + make_scored_cursors<typename add_profiling<IndexType>::type>(index, *scorer, query), + index.num_docs() + ); topk.finalize(); return topk.topk().size(); }; @@ -121,8 +121,10 @@ void profile( wand_query wand_q(topk); wand_q( make_max_scored_cursors<typename add_profiling<IndexType>::type, WandType>( - index, wdata, *scorer, query), - index.num_docs()); + index, wdata, *scorer, query + ), + index.num_docs() + ); topk.finalize(); return topk.topk().size(); }; @@ -132,8 +134,10 @@ void profile( maxscore_query maxscore_q(topk); maxscore_q( make_max_scored_cursors<typename add_profiling<IndexType>::type, WandType>( - index, wdata, *scorer, query), - index.num_docs()); + index, wdata, *scorer, query + ), + index.num_docs() + ); topk.finalize(); return topk.topk().size(); }; @@ -146,8 +150,7 @@ void profile( block_profiler::dump(std::cout); } -int main(int argc, const char** argv) -{ +int main(int argc, const char** argv) { using namespace pisa; std::string type = argv[1]; @@ -179,12 +182,12 @@ int main(int argc, const char** argv) } if (false) { -#define LOOP_BODY(R, DATA, T) \ - } \ - else if (type == BOOST_PP_STRINGIZE(T)) \ - { \ - profile<BOOST_PP_CAT(T, _index)>( \ - index_filename, wand_data_filename, queries, type, query_type); \ +#define LOOP_BODY(R, DATA, T) \ + } \ + else if (type == BOOST_PP_STRINGIZE(T)) { \ + profile<BOOST_PP_CAT(T, _index)>( \ + index_filename, wand_data_filename, queries, type, query_type \ + ); \ /**/ BOOST_PP_SEQ_FOR_EACH(LOOP_BODY, _, PISA_INDEX_TYPES); diff --git a/tools/queries.cpp b/tools/queries.cpp index f3e0d78a..7e780db2 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -51,15 +51,15 @@ void extract_times( std::string const& index_type, std::string const& query_type, size_t runs, - std::ostream& os) -{ + std::ostream& os +) { std::vector<std::size_t> times(runs); for (auto&& [qid, query]: enumerate(queries)) { do_not_optimize_away(fn(query, thresholds[qid])); std::generate(times.begin(), times.end(), [&fn, &q = query, &t = thresholds[qid]]() { return run_with_timer<std::chrono::microseconds>( - [&]() { do_not_optimize_away(fn(q, t)); }) - .count(); + [&]() { do_not_optimize_away(fn(q, t)); } + ).count(); }); auto mean = std::accumulate(times.begin(), times.end(), std::size_t{0}, std::plus<>()) / runs; os << fmt::format("{}\t{}\n", query.id.value_or(std::to_string(qid)), mean); @@ -75,8 +75,8 @@ void op_perftest( std::string const& query_type, size_t runs, std::uint64_t k, - bool safe) -{ + bool safe +) { std::vector<double> query_times; std::size_t num_reruns = 0; spdlog::info("Safe: {}", safe); @@ -137,8 +137,8 @@ void perftest( const ScorerParams& scorer_params, const bool weighted, bool extract, - bool safe) -{ + bool safe +) { spdlog::info("Loading index from {}", index_filename); IndexType index(MemorySource::mapped_file(index_filename)); @@ -205,8 +205,8 @@ void perftest( topk_queue topk(k, threshold); wand_query wand_q(topk); wand_q( - make_max_scored_cursors(index, wdata, *scorer, query, weighted), - index.num_docs()); + make_max_scored_cursors(index, wdata, *scorer, query, weighted), index.num_docs() + ); topk.finalize(); return topk.topk().size(); }; @@ -216,7 +216,8 @@ void perftest( block_max_wand_query block_max_wand_q(topk); block_max_wand_q( make_block_max_scored_cursors(index, wdata, *scorer, query, weighted), - index.num_docs()); + index.num_docs() + ); topk.finalize(); return topk.topk().size(); }; @@ -226,7 +227,8 @@ void perftest( block_max_maxscore_query block_max_maxscore_q(topk); block_max_maxscore_q( make_block_max_scored_cursors(index, wdata, *scorer, query, weighted), - index.num_docs()); + index.num_docs() + ); topk.finalize(); return topk.topk().size(); }; @@ -244,7 +246,8 @@ void perftest( block_max_ranked_and_query block_max_ranked_and_q(topk); block_max_ranked_and_q( make_block_max_scored_cursors(index, wdata, *scorer, query, weighted), - index.num_docs()); + index.num_docs() + ); topk.finalize(); return topk.topk().size(); }; @@ -261,8 +264,8 @@ void perftest( topk_queue topk(k, threshold); maxscore_query maxscore_q(topk); maxscore_q( - make_max_scored_cursors(index, wdata, *scorer, query, weighted), - index.num_docs()); + make_max_scored_cursors(index, wdata, *scorer, query, weighted), index.num_docs() + ); topk.finalize(); return topk.topk().size(); }; @@ -273,9 +276,8 @@ void perftest( query_fun = [&, ranked_or_taat_q, accumulator](Query query, Score threshold) mutable { topk.clear(threshold); ranked_or_taat_q( - make_scored_cursors(index, *scorer, query, weighted), - index.num_docs(), - accumulator); + make_scored_cursors(index, *scorer, query, weighted), index.num_docs(), accumulator + ); topk.finalize(); return topk.topk().size(); }; @@ -286,9 +288,8 @@ void perftest( query_fun = [&, ranked_or_taat_q, accumulator](Query query, Score threshold) mutable { topk.clear(threshold); ranked_or_taat_q( - make_scored_cursors(index, *scorer, query, weighted), - index.num_docs(), - accumulator); + make_scored_cursors(index, *scorer, query, weighted), index.num_docs(), accumulator + ); topk.finalize(); return topk.topk().size(); }; @@ -308,8 +309,7 @@ using wand_raw_index = wand_data<wand_data_raw>; using wand_uniform_index = wand_data<wand_data_compressed<>>; using wand_uniform_index_quantized = wand_data<wand_data_compressed<PayloadType::Quantized>>; -int main(int argc, const char** argv) -{ +int main(int argc, const char** argv) { bool extract = false; bool safe = false; bool quantized = false; @@ -345,13 +345,13 @@ int main(int argc, const char** argv) app.scorer_params(), app.weighted(), extract, - safe); + safe + ); /**/ if (false) { #define LOOP_BODY(R, DATA, T) \ } \ - else if (app.index_encoding() == BOOST_PP_STRINGIZE(T)) \ - { \ + else if (app.index_encoding() == BOOST_PP_STRINGIZE(T)) { \ if (app.is_wand_compressed()) { \ if (quantized) { \ std::apply(perftest<BOOST_PP_CAT(T, _index), wand_uniform_index_quantized>, params); \ diff --git a/tools/read_collection.cpp b/tools/read_collection.cpp index 3814ad89..8211344d 100644 --- a/tools/read_collection.cpp +++ b/tools/read_collection.cpp @@ -14,8 +14,7 @@ using namespace pisa; [[nodiscard]] auto print_function(std::optional<std::string> const& map_file, std::optional<std::string> const& lex_file) - -> std::function<void(std::uint32_t)> -{ + -> std::function<void(std::uint32_t)> { if (map_file) { return [loaded_map = pisa::io::read_string_vector(*map_file)](std::uint32_t term) { std::cout << loaded_map.at(term) << ' '; @@ -32,8 +31,7 @@ print_function(std::optional<std::string> const& map_file, std::optional<std::st return [](auto const& term) { std::cout << term << " "; }; } -int main(int argc, char** argv) -{ +int main(int argc, char** argv) { std::string collection_file; std::optional<std::string> map_file{}; std::optional<std::string> lex_file{}; @@ -47,13 +45,15 @@ int main(int argc, char** argv) "ID to string mapping in text file format. " "Line n is the string associated with ID n. " "E.g., if used to read a document from a forward index, this would be the `.terms` " - "file, which maps term IDs to their string reperesentations."); + "file, which maps term IDs to their string reperesentations." + ); auto maplex = app.add_option( "--maplex", lex_file, "ID to string mapping in lexicon binary file format. " "E.g., if used to read a document from a forward index, this would be the `.termlex` " - "file, which maps term IDs to their string reperesentations."); + "file, which maps term IDs to their string reperesentations." + ); maptext->excludes(maplex); maplex->excludes(maptext); auto* entry_cmd = app.add_subcommand("entry", "Reads single entry."); @@ -64,7 +64,8 @@ int main(int argc, char** argv) "last", last, "End reading at this entry. " - "If not defined, read until the end of the collection."); + "If not defined, read until the end of the collection." + ); CLI11_PARSE(app, argc, argv); spdlog::set_level(app.log_level()); @@ -85,7 +86,8 @@ int main(int argc, char** argv) } if (last < first) { throw std::invalid_argument( - "Last entry index must be greater or equal to first."); + "Last entry index must be greater or equal to first." + ); } return std::next(iter, last - first + 1); } diff --git a/tools/reorder_docids.cpp b/tools/reorder_docids.cpp index 38a1433f..75038bca 100644 --- a/tools/reorder_docids.cpp +++ b/tools/reorder_docids.cpp @@ -3,8 +3,7 @@ #include "app.hpp" #include "reorder_docids.hpp" -int main(int argc, const char** argv) -{ +int main(int argc, const char** argv) { CLI::App app{"Reassigns the document IDs."}; pisa::ReorderDocuments args(&app); CLI11_PARSE(app, argc, argv); diff --git a/tools/reorder_docids.hpp b/tools/reorder_docids.hpp index ec2dad7e..b5fc2d85 100644 --- a/tools/reorder_docids.hpp +++ b/tools/reorder_docids.hpp @@ -7,8 +7,7 @@ namespace pisa { -auto reorder_docids(ReorderDocuments args) -> int -{ +auto reorder_docids(ReorderDocuments args) -> int { tbb::global_control c(oneapi::tbb::global_control::max_allowed_parallelism, 2); try { if (args.bp()) { @@ -26,10 +25,12 @@ auto reorder_docids(ReorderDocuments args) -> int .print_args = args.print(), }); } - ReorderOptions options{.input_basename = args.input_basename(), - .output_basename = *args.output_basename(), - .document_lexicon = args.document_lexicon(), - .reordered_document_lexicon = args.reordered_document_lexicon()}; + ReorderOptions options{ + .input_basename = args.input_basename(), + .output_basename = *args.output_basename(), + .document_lexicon = args.document_lexicon(), + .reordered_document_lexicon = args.reordered_document_lexicon() + }; if (args.random()) { return reorder_random(options, args.seed()); } diff --git a/tools/sample_inverted_index.cpp b/tools/sample_inverted_index.cpp index b3cd3a24..3f701bf4 100644 --- a/tools/sample_inverted_index.cpp +++ b/tools/sample_inverted_index.cpp @@ -13,8 +13,7 @@ using namespace pisa; -int main(int argc, char** argv) -{ +int main(int argc, char** argv) { std::string input_basename; std::string output_basename; std::string type; @@ -31,7 +30,8 @@ int main(int argc, char** argv) app.add_option( "--terms-to-drop", terms_to_drop_filename, - "A filename containing a list of term IDs that we want to drop"); + "A filename containing a list of term IDs that we want to drop" + ); app.add_option("--seed", seed, "Seed state"); CLI11_PARSE(app, argc, argv); @@ -50,11 +50,8 @@ int main(int argc, char** argv) std::vector<std::uint32_t> sample; std::iota(indices.begin(), indices.end(), 0); std::sample( - indices.begin(), - indices.end(), - std::back_inserter(sample), - sample_size, - std::mt19937{seed}); + indices.begin(), indices.end(), std::back_inserter(sample), sample_size, std::mt19937{seed} + ); return sample; }; @@ -71,7 +68,8 @@ int main(int argc, char** argv) indices.end(), std::back_inserter(sampled_indices), sample_size, - std::mt19937{seed}); + std::mt19937{seed} + ); std::vector<bool> doc_ids(num_docs); for (auto&& p: sampled_indices) { doc_ids[p] = true; diff --git a/tools/selective_queries.cpp b/tools/selective_queries.cpp index 31367291..3df3ba5c 100644 --- a/tools/selective_queries.cpp +++ b/tools/selective_queries.cpp @@ -16,8 +16,8 @@ using namespace pisa; template <typename IndexType> void selective_queries( - const std::string& index_filename, std::string const& encoding, std::vector<Query> const& queries) -{ + const std::string& index_filename, std::string const& encoding, std::vector<Query> const& queries +) { IndexType index; spdlog::info("Loading index from {}", index_filename); mio::mmap_source m(index_filename.c_str()); @@ -40,21 +40,21 @@ void selective_queries( } } -int main(int argc, const char** argv) -{ +int main(int argc, const char** argv) { App<arg::Index, arg::Query<arg::QueryMode::Unranked>, arg::LogLevel> app{ - "Filters selective queries for a given index."}; + "Filters selective queries for a given index." + }; CLI11_PARSE(app, argc, argv); spdlog::set_level(app.log_level()); if (false) { -#define LOOP_BODY(R, DATA, T) \ - } \ - else if (app.index_encoding() == BOOST_PP_STRINGIZE(T)) \ - { \ - selective_queries<BOOST_PP_CAT(T, _index)>( \ - app.index_filename(), app.index_encoding(), app.queries()); +#define LOOP_BODY(R, DATA, T) \ + } \ + else if (app.index_encoding() == BOOST_PP_STRINGIZE(T)) { \ + selective_queries<BOOST_PP_CAT(T, _index)>( \ + app.index_filename(), app.index_encoding(), app.queries() \ + ); /**/ BOOST_PP_SEQ_FOR_EACH(LOOP_BODY, _, PISA_INDEX_TYPES); diff --git a/tools/shards.cpp b/tools/shards.cpp index 56b8b4d9..b81d3be3 100644 --- a/tools/shards.cpp +++ b/tools/shards.cpp @@ -31,8 +31,7 @@ using pisa::TailyStatsArgs; using pisa::TailyThresholds; using pisa::invert::InvertParams; -void print_taily_scores(std::vector<double> const& scores, std::chrono::microseconds time) -{ +void print_taily_scores(std::vector<double> const& scores, std::chrono::microseconds time) { std::cout << R"({"time":)" << time.count() << R"(,"scores":[)"; if (!scores.empty()) { std::cout << scores.front(); @@ -43,24 +42,24 @@ void print_taily_scores(std::vector<double> const& scores, std::chrono::microsec std::cout << "]}\n"; } -int main(int argc, char** argv) -{ +int main(int argc, char** argv) { spdlog::drop(""); spdlog::set_default_logger(spdlog::stderr_color_mt("")); pisa::App<pisa::arg::LogLevel> app{"Executes commands for shards."}; - auto* invert = - app.add_subcommand("invert", "Constructs an inverted index from a forward index."); + auto* invert = app.add_subcommand("invert", "Constructs an inverted index from a forward index."); auto* reorder = app.add_subcommand("reorder-docids", "Reorder document IDs."); auto* compress = app.add_subcommand("compress", "Compresses an inverted index"); auto* wand = app.add_subcommand("wand-data", "Creates additional data for query processing."); auto* taily = app.add_subcommand( - "taily-stats", "Extracts Taily statistics from the index and stores it in a file."); + "taily-stats", "Extracts Taily statistics from the index and stores it in a file." + ); auto* taily_rank = app.add_subcommand( "taily-score", "Computes Taily shard ranks for queries." " NOTE: as term IDs need to be resolved individually for each shard," - " DO NOT provide already parsed and resolved queries (with IDs instead of terms)."); + " DO NOT provide already parsed and resolved queries (with IDs instead of terms)." + ); auto* taily_thresholds = app.add_subcommand("taily-thresholds", "Computes Taily thresholds."); InvertArgs invert_args(invert); ReorderDocuments reorder_args(reorder); @@ -77,7 +76,8 @@ int main(int argc, char** argv) try { if (invert->parsed()) { tbb::global_control control( - tbb::global_control::max_allowed_parallelism, invert_args.threads() + 1); + tbb::global_control::max_allowed_parallelism, invert_args.threads() + 1 + ); spdlog::info("Number of worker threads: {}", invert_args.threads()); Shard_Id shard_id{0}; @@ -89,7 +89,8 @@ int main(int argc, char** argv) invert::invert_forward_index( format_shard(invert_args.input_basename(), shard_id), format_shard(invert_args.output_basename(), shard_id), - params); + params + ); shard_id += 1; } } @@ -118,7 +119,8 @@ int main(int argc, char** argv) shard_args.output(), shard_args.scorer_params(), shard_args.quantization_bits(), - shard_args.check()); + shard_args.check() + ); } return 0; } @@ -136,7 +138,8 @@ int main(int argc, char** argv) shard_args.range(), shard_args.compress(), shard_args.quantization_bits(), - shard_args.dropped_term_ids()); + shard_args.dropped_term_ids() + ); } } if (taily->parsed()) { @@ -165,7 +168,8 @@ int main(int argc, char** argv) taily_rank_args.queries(), shard_queries, taily_rank_args.k(), - print_taily_scores); + print_taily_scores + ); } if (taily_thresholds->parsed()) { auto shards = resolve_shards(taily_thresholds_args.stats()); diff --git a/tools/stem_queries.cpp b/tools/stem_queries.cpp index 48c022c1..d3afeced 100644 --- a/tools/stem_queries.cpp +++ b/tools/stem_queries.cpp @@ -8,8 +8,7 @@ #include "io.hpp" #include "pisa/query/query_stemmer.hpp" -int main(int argc, char const* argv[]) -{ +int main(int argc, char const* argv[]) { std::string input_filename; std::string output_filename; std::optional<std::string> stemmer; diff --git a/tools/taily_stats.cpp b/tools/taily_stats.cpp index a4244e5e..5105e05b 100644 --- a/tools/taily_stats.cpp +++ b/tools/taily_stats.cpp @@ -23,8 +23,7 @@ using pisa::wand_data; using pisa::wand_data_compressed; using pisa::wand_data_raw; -int main(int argc, const char** argv) -{ +int main(int argc, const char** argv) { spdlog::drop(""); spdlog::set_default_logger(spdlog::stderr_color_mt("")); diff --git a/tools/taily_stats.hpp b/tools/taily_stats.hpp index d40b83f6..e559e110 100644 --- a/tools/taily_stats.hpp +++ b/tools/taily_stats.hpp @@ -13,23 +13,24 @@ void extract_taily_stats( std::string const& wand_data_path, ScorerParams const& scorer_params, pisa::binary_freq_collection const& collection, - std::string const& output_path) -{ + std::string const& output_path +) { Wand wdata(pisa::MemorySource::mapped_file(wand_data_path)); auto term_stats = pisa::extract_feature_stats(collection, pisa::scorer::from_params(scorer_params, wdata)); pisa::write_feature_stats(term_stats, collection.num_docs(), output_path); } -void extract_taily_stats(TailyStatsArgs const& args) -{ +void extract_taily_stats(TailyStatsArgs const& args) { pisa::binary_freq_collection collection(args.collection_path().c_str()); if (args.is_wand_compressed()) { extract_taily_stats<wand_data<wand_data_compressed<>>>( - args.wand_data_path(), args.scorer_params(), collection, args.output_path()); + args.wand_data_path(), args.scorer_params(), collection, args.output_path() + ); } else { extract_taily_stats<wand_data<wand_data_raw>>( - args.wand_data_path(), args.scorer_params(), collection, args.output_path()); + args.wand_data_path(), args.scorer_params(), collection, args.output_path() + ); } } diff --git a/tools/taily_thresholds.cpp b/tools/taily_thresholds.cpp index 54586924..d382c794 100644 --- a/tools/taily_thresholds.cpp +++ b/tools/taily_thresholds.cpp @@ -7,8 +7,7 @@ #include "app.hpp" #include "taily_thresholds.hpp" -int main(int argc, const char** argv) -{ +int main(int argc, const char** argv) { spdlog::drop(""); spdlog::set_default_logger(spdlog::stderr_color_mt("")); diff --git a/tools/taily_thresholds.hpp b/tools/taily_thresholds.hpp index 6ff2b7b0..23414d67 100644 --- a/tools/taily_thresholds.hpp +++ b/tools/taily_thresholds.hpp @@ -6,8 +6,7 @@ namespace pisa { -void estimate_taily_thresholds(pisa::TailyThresholds const& args) -{ +void estimate_taily_thresholds(pisa::TailyThresholds const& args) { auto stats = pisa::TailyStats::from_mapped(args.stats()); for (auto const& query: args.queries()) { auto threshold = taily::estimate_cutoff(stats.query_stats(query), args.k()); diff --git a/tools/thresholds.cpp b/tools/thresholds.cpp index 84ead288..883dbccf 100644 --- a/tools/thresholds.cpp +++ b/tools/thresholds.cpp @@ -28,8 +28,8 @@ void thresholds( std::string const& type, ScorerParams const& scorer_params, uint64_t k, - bool quantized) -{ + bool quantized +) { IndexType index(MemorySource::mapped_file(index_filename)); WandType const wdata(MemorySource::mapped_file(wand_data_filename)); @@ -54,8 +54,7 @@ using wand_raw_index = wand_data<wand_data_raw>; using wand_uniform_index = wand_data<wand_data_compressed<>>; using wand_uniform_index_quantized = wand_data<wand_data_compressed<PayloadType::Quantized>>; -int main(int argc, const char** argv) -{ +int main(int argc, const char** argv) { spdlog::drop(""); spdlog::set_default_logger(spdlog::stderr_color_mt("")); @@ -78,23 +77,22 @@ int main(int argc, const char** argv) app.index_encoding(), app.scorer_params(), app.k(), - quantized); + quantized + ); /**/ if (false) { -#define LOOP_BODY(R, DATA, T) \ - } \ - else if (app.index_encoding() == BOOST_PP_STRINGIZE(T)) \ - { \ - if (app.is_wand_compressed()) { \ - if (quantized) { \ - std::apply( \ - thresholds<BOOST_PP_CAT(T, _index), wand_uniform_index_quantized>, params); \ - } else { \ - std::apply(thresholds<BOOST_PP_CAT(T, _index), wand_uniform_index>, params); \ - } \ - } else { \ - std::apply(thresholds<BOOST_PP_CAT(T, _index), wand_raw_index>, params); \ +#define LOOP_BODY(R, DATA, T) \ + } \ + else if (app.index_encoding() == BOOST_PP_STRINGIZE(T)) { \ + if (app.is_wand_compressed()) { \ + if (quantized) { \ + std::apply(thresholds<BOOST_PP_CAT(T, _index), wand_uniform_index_quantized>, params); \ + } else { \ + std::apply(thresholds<BOOST_PP_CAT(T, _index), wand_uniform_index>, params); \ + } \ + } else { \ + std::apply(thresholds<BOOST_PP_CAT(T, _index), wand_raw_index>, params); \ } /**/ BOOST_PP_SEQ_FOR_EACH(LOOP_BODY, _, PISA_INDEX_TYPES);