Skip to content

Commit

Permalink
Posting list concepts
Browse files Browse the repository at this point in the history
Introduce posting list concepts & use them in the algorithms.

Signed-off-by: Michal Siedlaczek <michal@siedlaczek.me>
  • Loading branch information
elshize committed Jan 21, 2024
1 parent 627f4fc commit dd1ce20
Show file tree
Hide file tree
Showing 23 changed files with 219 additions and 18 deletions.
1 change: 0 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ set(CMAKE_CXX_EXTENSIONS OFF)

if(NOT CMAKE_CXX_STANDARD EQUAL 17)
add_compile_definitions(PISA_ENABLE_CONCEPTS=1)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fconcepts-diagnostics-depth=2")
endif()
add_compile_definitions(BOOST_NO_CXX98_FUNCTION_BASE=1)

Expand Down
11 changes: 10 additions & 1 deletion include/pisa/block_posting_list.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#pragma once

#include "codec/block_codecs.hpp"
#include "concepts.hpp"
#include "concepts/posting_cursor.hpp"
#include "util/block_profiler.hpp"
#include "util/util.hpp"

Expand Down Expand Up @@ -86,6 +88,11 @@ struct block_posting_list {
m_block_endpoints(m_block_maxs + 4 * m_blocks),
m_blocks_data(m_block_endpoints + 4 * (m_blocks - 1)),
m_universe(universe) {
PISA_ASSERT_CONCEPT(
(concepts::FrequencyPostingCursor<document_enumerator>
&& concepts::SortedPostingCursor<document_enumerator>)
);

if (Profile) {
// std::cout << "OPEN\t" << m_term_id << "\t" << m_blocks << "\n";
m_block_profile = block_profiler::open_list(term_id, m_blocks);
Expand Down Expand Up @@ -159,9 +166,11 @@ struct block_posting_list {
return m_freqs_buf[m_pos_in_block] + 1;
}

/** Alias for `freq()`: forwards the call and returns its result unchanged. */
uint64_t PISA_ALWAYSINLINE value() { return freq(); }

/**
 * Cursor position expressed as a single index: the current block number multiplied by
 * `BlockCodec::block_size`, plus the offset inside the current block.
 */
uint64_t position() const { return m_cur_block * BlockCodec::block_size + m_pos_in_block; }

uint64_t size() const { return m_n; }
uint64_t size() const noexcept { return m_n; }

/** Returns the stored block count (`m_blocks`). */
uint64_t num_blocks() const { return m_blocks; }

Expand Down
99 changes: 99 additions & 0 deletions include/pisa/concepts/posting_cursor.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
// Copyright 2024 PISA developers
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// clang-format off

#pragma once

#ifdef PISA_ENABLE_CONCEPTS

#include <concepts>
#include <cstdint>

#include "container.hpp"
#include "type_alias.hpp"

namespace pisa::concepts {

/**
 * Minimal interface of a cursor that walks over a posting list.
 *
 * Besides modeling `SizedContainer`, a type `C` must let a caller read the current
 * document ID through a const reference and advance a mutable instance with `next()`.
 */
template <typename C>
concept PostingCursor = SizedContainer<C> && requires(C const& fixed, C advancing) {
    /** Reading the current document ID must be possible on a const cursor. */
    { fixed.docid() } -> std::convertible_to<std::uint32_t>;
    /** Stepping to the following position only needs a mutable cursor. */
    advancing.next();
};

/**
 * A posting cursor whose postings carry a frequency payload.
 *
 * In addition to the `PostingCursor` requirements, `freq()` must be callable on a
 * mutable cursor and yield a value convertible to `std::uint32_t`.
 */
template <typename C>
concept FrequencyPostingCursor = PostingCursor<C> && requires(C cursor) {
    /** Returns the frequency payload at the current position. */
    { cursor.freq() } -> std::convertible_to<std::uint32_t>;
};

/**
 * Refines `PostingCursor` with the ability to produce a score at the current position.
 */
template <typename C>
concept ScoredPostingCursor = PostingCursor<C> && requires(C scored) {
    /** `score()` is invoked on a mutable cursor; its result must convert to `Score`. */
    { scored.score() } -> std::convertible_to<Score>;
};

/**
 * Refines `PostingCursor` for posting lists that keep their postings in increasing order
 * of document IDs.
 */
template <typename C>
concept SortedPostingCursor = PostingCursor<C> && requires(C skipping, std::uint32_t target) {
    /**
     * Advances to the next position whose document ID is greater than or equal to
     * `target`. A cursor that already points at such an ID stays where it is, and the
     * cursor never moves backwards.
     */
    skipping.next_geq(target);
};

/**
 * Refines `ScoredPostingCursor` with a list-wide maximum score.
 *
 * `max_score()` must be callable through a const reference and must be `noexcept`.
 */
template <typename C>
concept MaxScorePostingCursor = ScoredPostingCursor<C> && requires(C const& bounded) {
    /** Maximum score over the entire list; the result must convert to `Score`. */
    { bounded.max_score() } noexcept -> std::convertible_to<Score>;
};

/**
 * A sorted posting cursor that exposes per-block maxima on top of the list-wide maximum
 * score required by `MaxScorePostingCursor`.
 */
template <typename C>
concept BlockMaxPostingCursor =
    MaxScorePostingCursor<C> && SortedPostingCursor<C> && requires(C blocked) {
        /** Highest document ID of the current block; must convert to `DocId`. */
        { blocked.block_max_docid() } -> std::convertible_to<DocId>;
        /** Maximum score of the current block; must convert to `Score`. */
        { blocked.block_max_score() } -> std::convertible_to<Score>;
    };

}; // namespace pisa::concepts

// clang-format on

#endif
5 changes: 4 additions & 1 deletion include/pisa/cursor/block_max_scored_cursor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
namespace pisa {

template <typename Cursor, typename Wand>
PISA_REQUIRES((concepts::FrequencyPostingCursor<Cursor> && concepts::SortedPostingCursor<Cursor>))
class BlockMaxScoredCursor: public MaxScoredCursor<Cursor> {
public:
using base_cursor_type = Cursor;
Expand All @@ -22,7 +23,9 @@ class BlockMaxScoredCursor: public MaxScoredCursor<Cursor> {
typename Wand::wand_data_enumerator wdata
)
: MaxScoredCursor<Cursor>(std::move(cursor), std::move(term_scorer), weight, max_score),
m_wdata(std::move(wdata)) {}
m_wdata(std::move(wdata)) {
PISA_ASSERT_CONCEPT((concepts::BlockMaxPostingCursor<BlockMaxScoredCursor>));
}
BlockMaxScoredCursor(BlockMaxScoredCursor const&) = delete;
BlockMaxScoredCursor(BlockMaxScoredCursor&&) = default;
BlockMaxScoredCursor& operator=(BlockMaxScoredCursor const&) = delete;
Expand Down
8 changes: 7 additions & 1 deletion include/pisa/cursor/max_scored_cursor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,19 @@
namespace pisa {

template <typename Cursor>
PISA_REQUIRES((concepts::FrequencyPostingCursor<Cursor> && concepts::SortedPostingCursor<Cursor>))
class MaxScoredCursor: public ScoredCursor<Cursor> {
public:
using base_cursor_type = Cursor;

MaxScoredCursor(Cursor cursor, TermScorer term_scorer, float weight, float max_score)
: ScoredCursor<Cursor>(std::move(cursor), std::move(term_scorer), weight),
m_max_score(max_score) {}
m_max_score(max_score) {
PISA_ASSERT_CONCEPT(
(concepts::MaxScorePostingCursor<MaxScoredCursor>
&& concepts::SortedPostingCursor<MaxScoredCursor>)
);
}
MaxScoredCursor(MaxScoredCursor const&) = delete;
MaxScoredCursor(MaxScoredCursor&&) = default;
MaxScoredCursor& operator=(MaxScoredCursor const&) = delete;
Expand Down
14 changes: 12 additions & 2 deletions include/pisa/cursor/scored_cursor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

#include <vector>

#include "concepts.hpp"
#include "concepts/posting_cursor.hpp"
#include "query.hpp"
#include "scorer/index_scorer.hpp"
#include "util/compiler_attribute.hpp"
Expand All @@ -18,14 +20,20 @@ auto resolve_term_scorer(Scorer scorer, float weight) -> TermScorer {
}

template <typename Cursor>
PISA_REQUIRES((concepts::FrequencyPostingCursor<Cursor> && concepts::SortedPostingCursor<Cursor>))
class ScoredCursor {
public:
using base_cursor_type = Cursor;

ScoredCursor(Cursor cursor, TermScorer term_scorer, float weight)
: m_base_cursor(std::move(cursor)),
m_weight(weight),
m_term_scorer(resolve_term_scorer(term_scorer, weight)) {}
m_term_scorer(resolve_term_scorer(term_scorer, weight)) {
PISA_ASSERT_CONCEPT(
(concepts::ScoredPostingCursor<ScoredCursor>
&& concepts::SortedPostingCursor<ScoredCursor>)
);
}
ScoredCursor(ScoredCursor const&) = delete;
ScoredCursor(ScoredCursor&&) = default;
ScoredCursor& operator=(ScoredCursor const&) = delete;
Expand All @@ -40,7 +48,9 @@ class ScoredCursor {
/** Scores the current posting by passing `docid()` and `freq()` to the term scorer. */
[[nodiscard]] PISA_ALWAYSINLINE auto score() -> float { return m_term_scorer(docid(), freq()); }
/** Advances the wrapped base cursor by delegating to its `next()`. */
void PISA_ALWAYSINLINE next() { m_base_cursor.next(); }
/** Delegates to the wrapped base cursor's `next_geq()` with the same `docid`. */
void PISA_ALWAYSINLINE next_geq(std::uint32_t docid) { m_base_cursor.next_geq(docid); }
[[nodiscard]] PISA_ALWAYSINLINE auto size() -> std::size_t { return m_base_cursor.size(); }
[[nodiscard]] PISA_ALWAYSINLINE auto size() const noexcept -> std::size_t {
return m_base_cursor.size();
}

private:
Cursor m_base_cursor;
Expand Down
10 changes: 9 additions & 1 deletion include/pisa/freq_index.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
#include "bitvector_collection.hpp"
#include "codec/compact_elias_fano.hpp"
#include "codec/integer_codes.hpp"
#include "concepts.hpp"
#include "concepts/posting_cursor.hpp"
#include "global_parameters.hpp"
#include "mappable/mapper.hpp"
#include "memory_source.hpp"
Expand Down Expand Up @@ -157,9 +159,11 @@ class freq_index {

/**
 * Moves the frequency enumerator to the current position (`m_cur_pos`) and returns the
 * `second` member of the result.
 */
uint64_t PISA_FLATTEN_FUNC freq() { return m_freqs_enum.move(m_cur_pos).second; }

/** Alias for `freq()`: forwards the call and returns its result unchanged. */
uint64_t PISA_FLATTEN_FUNC value() { return freq(); }

/** Returns the current position of the cursor (`m_cur_pos`). */
uint64_t position() const { return m_cur_pos; }

uint64_t size() const { return m_docs_enum.size(); }
uint64_t size() const noexcept { return m_docs_enum.size(); }

/** Read-only access to the underlying document enumerator; no copy is made. */
typename DocsSequence::enumerator const& docs_enum() const { return m_docs_enum; }

Expand All @@ -172,6 +176,10 @@ class freq_index {
typename DocsSequence::enumerator docs_enum, typename FreqsSequence::enumerator freqs_enum
)
: m_docs_enum(docs_enum), m_freqs_enum(freqs_enum) {
PISA_ASSERT_CONCEPT(
(concepts::FrequencyPostingCursor<document_enumerator>
&& concepts::SortedPostingCursor<document_enumerator>)
);
reset();
}

Expand Down
4 changes: 4 additions & 0 deletions include/pisa/query/algorithm/and_query.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
#include <cstdint>
#include <vector>

#include "concepts.hpp"
#include "concepts/posting_cursor.hpp"

namespace pisa {

/**
Expand All @@ -15,6 +18,7 @@ namespace pisa {
*/
struct and_query {
template <typename CursorRange>
PISA_REQUIRES((concepts::SortedPostingCursor<typename CursorRange::value_type>))
auto operator()(CursorRange&& cursors, uint32_t max_docid) const {
using Cursor = typename std::decay_t<CursorRange>::value_type;

Expand Down
3 changes: 3 additions & 0 deletions include/pisa/query/algorithm/block_max_maxscore_query.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

#include <vector>

#include "concepts.hpp"
#include "concepts/posting_cursor.hpp"
#include "topk_queue.hpp"

namespace pisa {
Expand All @@ -10,6 +12,7 @@ struct block_max_maxscore_query {
explicit block_max_maxscore_query(topk_queue& topk) : m_topk(topk) {}

template <typename CursorRange>
PISA_REQUIRES((concepts::BlockMaxPostingCursor<pisa::val_t<CursorRange>>))
void operator()(CursorRange&& cursors, uint64_t max_docid) {
using Cursor = typename std::decay_t<CursorRange>::value_type;
if (cursors.empty()) {
Expand Down
3 changes: 3 additions & 0 deletions include/pisa/query/algorithm/block_max_ranked_and_query.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

#include <vector>

#include "concepts.hpp"
#include "concepts/posting_cursor.hpp"
#include "topk_queue.hpp"

namespace pisa {
Expand All @@ -10,6 +12,7 @@ struct block_max_ranked_and_query {
explicit block_max_ranked_and_query(topk_queue& topk) : m_topk(topk) {}

template <typename CursorRange>
PISA_REQUIRES(concepts::BlockMaxPostingCursor<pisa::val_t<CursorRange>>)
void operator()(CursorRange&& cursors, uint64_t max_docid) {
using Cursor = typename std::decay_t<CursorRange>::value_type;

Expand Down
3 changes: 3 additions & 0 deletions include/pisa/query/algorithm/block_max_wand_query.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

#include <vector>

#include "concepts.hpp"
#include "concepts/posting_cursor.hpp"
#include "topk_queue.hpp"

namespace pisa {
Expand All @@ -10,6 +12,7 @@ struct block_max_wand_query {
explicit block_max_wand_query(topk_queue& topk) : m_topk(topk) {}

template <typename CursorRange>
PISA_REQUIRES(concepts::BlockMaxPostingCursor<pisa::val_t<CursorRange>>)
void operator()(CursorRange&& cursors, uint64_t max_docid) {
using Cursor = typename std::decay_t<CursorRange>::value_type;
if (cursors.empty()) {
Expand Down
14 changes: 12 additions & 2 deletions include/pisa/query/algorithm/maxscore_query.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
#include <utility>
#include <vector>

#include "concepts.hpp"
#include "concepts/posting_cursor.hpp"
#include "topk_queue.hpp"
#include "util/compiler_attribute.hpp"

Expand All @@ -14,21 +16,26 @@ struct maxscore_query {
explicit maxscore_query(topk_queue& topk) : m_topk(topk) {}

template <typename Cursors>
PISA_REQUIRES(
(concepts::MaxScorePostingCursor<pisa::val_t<Cursors>>
&& concepts::SortedPostingCursor<pisa::val_t<Cursors>>)
)
[[nodiscard]] PISA_ALWAYSINLINE auto sorted(Cursors&& cursors)
-> std::vector<typename std::decay_t<Cursors>::value_type> {
-> std::vector<pisa::val_t<Cursors>> {
std::vector<std::size_t> term_positions(cursors.size());
std::iota(term_positions.begin(), term_positions.end(), 0);
std::sort(term_positions.begin(), term_positions.end(), [&](auto&& lhs, auto&& rhs) {
return cursors[lhs].max_score() > cursors[rhs].max_score();
});
std::vector<typename std::decay_t<Cursors>::value_type> sorted;
std::vector<pisa::val_t<Cursors>> sorted;
for (auto pos: term_positions) {
sorted.push_back(std::move(cursors[pos]));
};
return sorted;
}

template <typename Cursors>
PISA_REQUIRES((concepts::MaxScorePostingCursor<pisa::val_t<Cursors>>))
[[nodiscard]] PISA_ALWAYSINLINE auto calc_upper_bounds(Cursors&& cursors) -> std::vector<float> {
std::vector<float> upper_bounds(cursors.size());
auto out = upper_bounds.rbegin();
Expand All @@ -41,6 +48,7 @@ struct maxscore_query {
}

template <typename Cursors>
PISA_REQUIRES((concepts::MaxScorePostingCursor<pisa::val_t<Cursors>>))
[[nodiscard]] PISA_ALWAYSINLINE auto min_docid(Cursors&& cursors) -> std::uint32_t {
return std::min_element(
cursors.begin(),
Expand All @@ -53,6 +61,7 @@ struct maxscore_query {
enum class DocumentStatus : bool { Insert, Skip };

template <typename Cursors>
PISA_REQUIRES((concepts::MaxScorePostingCursor<pisa::val_t<Cursors>>))
PISA_ALWAYSINLINE void run_sorted(Cursors&& cursors, uint64_t max_docid) {
auto upper_bounds = calc_upper_bounds(cursors);
auto above_threshold = [&](auto score) { return m_topk.would_enter(score); };
Expand Down Expand Up @@ -122,6 +131,7 @@ struct maxscore_query {
}

template <typename Cursors>
PISA_REQUIRES((concepts::MaxScorePostingCursor<pisa::val_t<Cursors>>))
void operator()(Cursors&& cursors_, uint64_t max_docid) {
if (cursors_.empty()) {
return;
Expand Down
Loading

0 comments on commit dd1ce20

Please sign in to comment.