Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rtree #1063

Open
wants to merge 28 commits into
base: master
Choose a base branch
from
Open

Rtree #1063

Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ endif()
######################################
# BOOST
######################################
find_package(Boost 1.74 COMPONENTS iostreams program_options REQUIRED)
find_package(Boost 1.76 COMPONENTS iostreams serialization program_options REQUIRED)
noahnock marked this conversation as resolved.
Show resolved Hide resolved
include_directories(${Boost_INCLUDE_DIR})


Expand Down Expand Up @@ -231,6 +231,7 @@ set(USE_OPENMP OFF CACHE BOOL "Don't use OPENMP as default" FORCE)
add_subdirectory(third_party/stxxl EXCLUDE_FROM_ALL)

# apply STXXL CXXFLAGS

noahnock marked this conversation as resolved.
Show resolved Hide resolved
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${STXXL_CXX_FLAGS}")
include_directories(SYSTEM ${STXXL_INCLUDE_DIRS})

Expand Down
5 changes: 3 additions & 2 deletions src/VocabularyMergerMain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,10 @@ int main(int argc, char** argv) {
VocabularyMerger m;

auto file = ad_utility::makeOfstream(basename + INTERNAL_VOCAB_SUFFIX);
auto internalVocabularyAction = [&file](const auto& word) {
auto internalVocabularyAction = [&file](const auto& word, [[maybe_unused]]const auto& index) {
file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n';
};
auto externalVocabularyAction = []([[maybe_unused]]const auto& word, [[maybe_unused]]const auto& index) {};
m.mergeVocabulary(basename, numFiles, TripleComponentComparator(),
internalVocabularyAction);
internalVocabularyAction, externalVocabularyAction);
}
4 changes: 2 additions & 2 deletions src/engine/sparqlExpressions/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ add_library(sparqlExpressions
SetOfIntervals.cpp
SparqlExpressionPimpl.cpp
SampleExpression.cpp
RelationalExpressions.cpp AggregateExpression.cpp RegexExpression.cpp
RelationalExpressions.cpp AggregateExpression.cpp RegexExpression.cpp ContainsExpression.cpp
LangExpression.cpp NumericUnaryExpressions.cpp NumericBinaryExpressions.cpp DateExpressions.cpp StringExpressions.cpp)

qlever_target_link_libraries(sparqlExpressions index)
qlever_target_link_libraries(sparqlExpressions index util)
146 changes: 146 additions & 0 deletions src/engine/sparqlExpressions/ContainsExpression.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
// Copyright 2023, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Noah Nock <noah.v.nock@gmail.com>

#include "./ContainsExpression.h"

#include "engine/sparqlExpressions/SparqlExpressionGenerators.h"
#include "global/ValueIdComparators.h"
#include "re2/re2.h"
#include <util/Rtree.h>

using namespace std::literals;

// Thin wrapper around a `boxGeo`. `ContainsExpression.h` only forward-declares
// this class, so the full definition lives in this translation unit.
class BoundingBoxType {
 public:
  // Wrap a copy of `boundingBox`.
  explicit BoundingBoxType(boxGeo boundingBox) : value(boundingBox) {}

  // The wrapped bounding box.
  boxGeo value{};
};

// Return the content of `input` without its enclosing double quotes. The
// correctness check fails if `input` has fewer than two characters or does not
// both start and end with `"`.
std::string removeQuotes(std::string_view input) {
  AD_CORRECTNESS_CHECK(input.size() >= 2 && input.starts_with('"') &&
                       input.ends_with('"'));
  // Skip the first character and stop before the last one.
  const std::string_view unquoted = input.substr(1, input.size() - 2);
  return std::string{unquoted};
}

namespace sparqlExpression {
// ___________________________________________________________________________
// Construct the expression from `child` and `boundingBox`.
//
// `child` must be a `VariableExpression`. `boundingBox` must be a
// `StringLiteralExpression` without language tag or datatype whose content has
// the format "minX,minY,maxX,maxY". A `std::runtime_error` is thrown if any of
// these requirements is violated or if one of the four entries is not a number.
ContainsExpression::ContainsExpression(
    SparqlExpression::Ptr child, SparqlExpression::Ptr boundingBox)
    : child_{std::move(child)} {
  if (!dynamic_cast<const VariableExpression*>(child_.get())) {
    throw std::runtime_error(
        "Contain expressions are currently supported only on variables.");
  }

  auto boundingBoxPtr =
      dynamic_cast<const StringLiteralExpression*>(boundingBox.get());
  if (!boundingBoxPtr) {
    throw std::runtime_error(
        "The second argument to the Contains function must be a "
        "string literal (which contains the bounding box of format \"minX,minY,maxX,maxY\")");
  }

  std::string originalBoundingBoxString;
  originalBoundingBoxString =
      boundingBoxPtr->value().normalizedLiteralContent().get();
  if (!boundingBoxPtr->value().datatypeOrLangtag().empty()) {
    throw std::runtime_error(
        "The second argument to the Contain function (which contains the "
        "bounding box) must not contain a language tag or a datatype");
  }
  boundingBoxAsString_ = removeQuotes(originalBoundingBoxString);

  // Split the content at the first three commas. Everything after the third
  // comma becomes the fourth entry; surplus components are rejected later when
  // that entry is parsed as a number.
  std::vector<std::string> boundingBoxEntriesAsString;
  boundingBoxEntriesAsString.reserve(4);
  std::string errorMessage;
  std::string::size_type searchFrom = 0;
  for (int i = 0; i < 3; i++) {
    const std::string::size_type end =
        boundingBoxAsString_.find(',', searchFrom);
    // A missing comma, or a comma that is the very last character, means that
    // at least one entry is missing. `npos` is tested first so that the check
    // does not depend on unsigned wrap-around for an empty string.
    if (end == std::string::npos || end + 1 >= boundingBoxAsString_.size()) {
      errorMessage = "There are not enough arguments";
      break;
    }
    boundingBoxEntriesAsString.push_back(
        boundingBoxAsString_.substr(searchFrom, end - searchFrom));
    searchFrom = end + 1;
  }
  if (errorMessage.empty()) {
    // After three successful splits `searchFrom` is a valid position, so the
    // remainder is never empty.
    boundingBoxEntriesAsString.push_back(
        boundingBoxAsString_.substr(searchFrom));
  }

  // Parse one entry as a `double`. A bare `std::stod` stops at the first
  // character it cannot use and silently ignores the rest, so "4abc" or a
  // surplus component like "4,5" would be accepted. Here everything after the
  // number must be whitespace, otherwise `std::invalid_argument` is thrown.
  auto parseCoordinate = [](const std::string& entry) {
    std::size_t numCharsParsed = 0;
    const double coordinate = std::stod(entry, &numCharsParsed);
    if (entry.find_first_not_of(" \t\n\v\f\r", numCharsParsed) !=
        std::string::npos) {
      throw std::invalid_argument(
          absl::StrCat("\"", entry, "\" is not a valid number"));
    }
    return coordinate;
  };

  if (errorMessage.empty()) {
    try {
      const double minX = parseCoordinate(boundingBoxEntriesAsString[0]);
      const double minY = parseCoordinate(boundingBoxEntriesAsString[1]);
      const double maxX = parseCoordinate(boundingBoxEntriesAsString[2]);
      const double maxY = parseCoordinate(boundingBoxEntriesAsString[3]);

      // NOTE(review): `boundingBox_` is a raw owning pointer and the class
      // declaration shows no destructor that frees it, so this allocation
      // looks like a leak. Fixing it needs a change to the member type in the
      // header (e.g. a smart pointer) — TODO confirm and follow up.
      boundingBox_ = new BoundingBoxType(
          Rtree::createBoundingBox(minX, minY, maxX, maxY));
    } catch (const std::invalid_argument& e) {
      errorMessage = e.what();
    } catch (const std::out_of_range& e) {
      errorMessage = e.what();
    }
  }

  if (!errorMessage.empty()) {
    throw std::runtime_error{absl::StrCat(
        "The bounding box ", originalBoundingBoxString,
        " is not supported by QLever (must be of format \"minX,minY,maxX,maxY\"). "
        "Error message is: ", errorMessage
    )};
  }
}


// ___________________________________________________________________________
// The cache key is built from the child's cache key and the bounding box
// string stored in `boundingBoxAsString_`.
string ContainsExpression::getCacheKey(
    const VariableToColumnMap& varColMap) const {
  const auto childKey = child_->getCacheKey(varColMap);
  return absl::StrCat("Bounding Box CONTAINS expression ", childKey, " with ",
                      boundingBoxAsString_);
}

// ___________________________________________________________________________
// Expose `child_` as a span of length one; it is the only child expression.
std::span<SparqlExpression::Ptr> ContainsExpression::childrenImpl() {
  return std::span<SparqlExpression::Ptr>{&child_, 1};
}

// ___________________________________________________________________________
// Evaluate the expression: for every row of the context, produce a boolean `Id`
// that says whether the integer value of the child variable's `Id` is among
// the ids returned by the R-tree search for the stored bounding box.
ExpressionResult ContainsExpression::evaluate(
    sparqlExpression::EvaluationContext* context) const {
  // The child has to evaluate to a plain variable.
  auto childResult = child_->evaluate(context);
  auto variablePtr = std::get_if<Variable>(&childResult);
  AD_CONTRACT_CHECK(variablePtr);

  // Collect the ids of all R-tree entries found for the bounding box. Without
  // a bounding box the set stays empty and every row evaluates to `false`.
  const auto matchingIds = [&]() {
    ad_utility::HashSet<long long> ids;
    if (!boundingBox_) {
      return ids;
    }
    Rtree rtree = context->_qec.getIndex().getRtree();
    multiBoxGeo treeResults =
        rtree.SearchTree(boundingBox_->value, "./rtree_build");
    for (const auto& item : treeResults) {
      ids.insert(static_cast<long long>(item.id));
    }
    return ids;
  }();

  auto resultSize = context->size();
  VectorWithMemoryLimit<Id> result{context->_allocator};
  result.reserve(resultSize);

  // One boolean per input row.
  for (auto id : detail::makeGenerator(*variablePtr, resultSize, context)) {
    const bool isContained = matchingIds.contains(id.getInt());
    result.push_back(Id::makeFromBool(isContained));
  }

  return result;
}

} // namespace sparqlExpression
33 changes: 33 additions & 0 deletions src/engine/sparqlExpressions/ContainsExpression.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// Copyright 2023, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Noah Nock <noah.v.nock@gmail.com>

#pragma once

#include <string>
#include "engine/sparqlExpressions/LiteralExpression.h"
#include "engine/sparqlExpressions/SparqlExpression.h"

class BoundingBoxType;

namespace sparqlExpression {
// SPARQL expression that is constructed from a variable and a bounding box
// given as a string literal.
class ContainsExpression : public SparqlExpression {
 public:
  // `child` must be a `VariableExpression` and `boundingBox` must be a
  // `LiteralExpression` that stores a string, else an exception will be thrown.
  ContainsExpression(SparqlExpression::Ptr child,
                     SparqlExpression::Ptr boundingBox);

  // _________________________________________________________________________
  ExpressionResult evaluate(EvaluationContext* context) const override;

  // _________________________________________________________________________
  std::span<SparqlExpression::Ptr> childrenImpl() override;

  // _________________________________________________________________________
  [[nodiscard]] string getCacheKey(
      const VariableToColumnMap& varColMap) const override;

 private:
  // The single child expression.
  SparqlExpression::Ptr child_;
  // The bounding box in its string form.
  std::string boundingBoxAsString_;
  // The bounding box as a `BoundingBoxType`; that type is only forward-declared
  // in this header, hence the pointer. Null by default.
  BoundingBoxType* boundingBox_{};
};
}
5 changes: 4 additions & 1 deletion src/index/Index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,10 @@ auto Index::getTextVocab() const -> const TextVocab& {
return pimpl_->getTextVocab();
}

// ____________________________________________________________________________
// ___________________________________________________
// Forward to the implementation object and return its R-tree.
auto Index::getRtree() const -> const Rtree& {
  return pimpl_->getRtree();
}

// _____________________________________________________________________________
size_t Index::getCardinality(const TripleComponent& comp,
Permutation::Enum p) const {
return pimpl_->getCardinality(comp, p);
Expand Down
3 changes: 3 additions & 0 deletions src/index/Index.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "index/StringSortComparator.h"
#include "index/Vocabulary.h"
#include "parser/TripleComponent.h"
#include <util/Rtree.h>

// Forward declarations.
class IdTable;
Expand Down Expand Up @@ -100,6 +101,8 @@ class Index {
Vocabulary<std::string, SimpleStringComparator, WordVocabIndex>;
[[nodiscard]] const TextVocab& getTextVocab() const;

const Rtree& getRtree() const;

// --------------------------------------------------------------------------
// -- RETRIEVAL ---
// --------------------------------------------------------------------------
Expand Down
42 changes: 37 additions & 5 deletions src/index/IndexImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -352,14 +352,17 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary(
auto compressionOutfile = ad_utility::makeOfstream(
onDiskBase_ + TMP_BASENAME_COMPRESSION + INTERNAL_VOCAB_SUFFIX);
auto internalVocabularyActionCompression =
[&compressionOutfile](const auto& word) {
[&compressionOutfile](const auto& word, [[maybe_unused]]const auto& index) {
compressionOutfile << RdfEscaping::escapeNewlinesAndBackslashes(word)
<< '\n';
};
auto externalVocabularyActionCompression =
[]([[maybe_unused]]const auto& word, [[maybe_unused]]const auto& index) {
};
m._noIdMapsAndIgnoreExternalVocab = true;
auto mergeResult =
m.mergeVocabulary(onDiskBase_ + TMP_BASENAME_COMPRESSION, numFiles,
std::less<>(), internalVocabularyActionCompression);
std::less<>(), internalVocabularyActionCompression, externalVocabularyActionCompression);
sizeInternalVocabulary = mergeResult.numWordsTotal_;
LOG(INFO) << "Number of words in internal vocabulary: "
<< sizeInternalVocabulary << std::endl;
Expand All @@ -385,11 +388,29 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary(
};
auto wordWriter =
vocab_.makeUncompressingWordWriter(onDiskBase_ + INTERNAL_VOCAB_SUFFIX);
auto internalVocabularyAction = [&wordWriter](const auto& word) {

std::ofstream convertOfs = std::ofstream(onDiskBase_ + ".vocabulary.boundingbox.tmp", std::ios::binary);

auto internalVocabularyAction = [&wordWriter, &convertOfs](const auto& word, const auto& index) {
wordWriter.push(word.data(), word.size());
std::optional<boxGeo> boundingBox = Rtree::ConvertWordToRtreeEntry(word);
if (boundingBox) {
Rtree::SaveEntry(boundingBox.value(), index, convertOfs);
}
};
auto externalVocabularyAction = [&convertOfs](const auto& word, const auto& index) {
std::optional<boxGeo> boundingBox = Rtree::ConvertWordToRtreeEntry(word);
if (boundingBox) {
Rtree::SaveEntry(boundingBox.value(), index, convertOfs);
}
};
return v.mergeVocabulary(onDiskBase_, numFiles, sortPred,
internalVocabularyAction);

VocabularyMerger::VocabularyMetaData result = v.mergeVocabulary(onDiskBase_, numFiles, sortPred,
internalVocabularyAction, externalVocabularyAction);

convertOfs.close();

return result;
}();
LOG(DEBUG) << "Finished merging partial vocabularies" << std::endl;
IndexBuilderDataAsStxxlVector res;
Expand All @@ -399,6 +420,15 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary(
<< res.vocabularyMetaData_.numWordsTotal_ - sizeInternalVocabulary
<< std::endl;

LOG(INFO) << "Building the Rtree..." << std::endl;
try {
Rtree rtree = Rtree(1300000000000);
rtree.BuildTree(onDiskBase_ + ".vocabulary", 16, "./rtree_build");
LOG(INFO) << "Finished building the Rtree" << std::endl;
} catch (const std::exception &e) {
LOG(INFO) << e.what() << std::endl;
}

res.idTriples = std::move(idTriples);
res.actualPartialSizes = std::move(actualPartialSizes);

Expand Down Expand Up @@ -664,6 +694,8 @@ void IndexImpl::createFromOnDiskIndex(const string& onDiskBase) {
avgNumDistinctPredicatesPerSubject_, numDistinctSubjectPredicatePairs_,
patterns_, hasPattern_);
}

// TODO: Load the Rtree.
}

// _____________________________________________________________________________
Expand Down
4 changes: 4 additions & 0 deletions src/index/IndexImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include <util/MmapVector.h>
#include <util/Timer.h>
#include <util/json.h>
#include <util/Rtree.h>

#include <array>
#include <fstream>
Expand Down Expand Up @@ -121,6 +122,7 @@ class IndexImpl {
size_t totalVocabularySize_ = 0;
bool vocabPrefixCompressed_ = true;
Index::TextVocab textVocab_;
Rtree rtree_ = Rtree(4000000000);

TextMetaData textMeta_;
DocsDB docsDB_;
Expand Down Expand Up @@ -230,6 +232,8 @@ class IndexImpl {

const auto& getTextVocab() const { return textVocab_; };

const auto& getRtree() const { return rtree_; };

// --------------------------------------------------------------------------
// -- RETRIEVAL ---
// --------------------------------------------------------------------------
Expand Down
10 changes: 6 additions & 4 deletions src/index/VocabularyGenerator.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,10 +101,11 @@ class VocabularyMerger {
// Argument comparator gives the way to order strings (case-sensitive or not)
// This automatically resets the inner members after finishing, to leave the
// external interface stateless
template <typename Comp, typename InternalVocabularyAction>
template <typename Comp, typename InternalVocabularyAction, typename ExternalVocabularyAction>
VocabularyMetaData mergeVocabulary(const std::string& basename,
size_t numFiles, Comp comparator,
InternalVocabularyAction& action);
InternalVocabularyAction& action,
ExternalVocabularyAction& externalVocabularyAction);

private:
// helper struct used in the priority queue for merging.
Expand Down Expand Up @@ -132,10 +133,11 @@ class VocabularyMerger {
// write the queue words in the buffer to their corresponding idPairVecs.
// Requires that all the QueueWords that are ever passed are ordered
// alphabetically (Also across multiple calls)
template <typename InternalVocabularyAction>
template <typename InternalVocabularyAction, typename ExternalVocabularyAction>
void writeQueueWordsToIdVec(
const std::vector<QueueWord>& buffer,
InternalVocabularyAction& internalVocabularyAction);
InternalVocabularyAction& internalVocabularyAction,
ExternalVocabularyAction& externalVocabularyAction);

// close all associated files and MmapVectors and reset all internal variables
void clear() {
Expand Down
Loading