Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rtree #1063

Open
wants to merge 28 commits into
base: master
Choose a base branch
from
Open

Rtree #1063

Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ endif()
######################################
# BOOST
######################################
find_package(Boost 1.74 COMPONENTS iostreams program_options REQUIRED)
find_package(Boost 1.74 COMPONENTS iostreams serialization program_options REQUIRED)
include_directories(${Boost_INCLUDE_DIR})


Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ RUN make test
FROM base as runtime
WORKDIR /app
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y wget python3-yaml unzip curl bzip2 pkg-config libicu-dev python3-icu libgomp1 uuid-runtime make lbzip2 libjemalloc-dev libzstd-dev libssl-dev libboost1.74-dev libboost-program-options1.74-dev libboost-iostreams1.74-dev
RUN apt-get update && apt-get install -y wget python3-yaml unzip curl bzip2 pkg-config libicu-dev python3-icu libgomp1 uuid-runtime make lbzip2 libjemalloc-dev libzstd-dev libssl-dev libboost1.74-dev libboost-program-options1.74-dev libboost-iostreams1.74-dev libboost-serialization1.74-dev

ARG UID=1000
RUN groupadd -r qlever && useradd --no-log-init -r -u $UID -g qlever qlever && chown qlever:qlever /app
Expand Down
7 changes: 5 additions & 2 deletions src/VocabularyMergerMain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,12 @@ int main(int argc, char** argv) {
VocabularyMerger m;

auto file = ad_utility::makeOfstream(basename + INTERNAL_VOCAB_SUFFIX);
auto internalVocabularyAction = [&file](const auto& word) {
auto internalVocabularyAction = [&file](const auto& word,
[[maybe_unused]] const auto& index) {
file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n';
};
auto externalVocabularyAction = []([[maybe_unused]] const auto& word,
[[maybe_unused]] const auto& index) {};
m.mergeVocabulary(basename, numFiles, TripleComponentComparator(),
internalVocabularyAction);
internalVocabularyAction, externalVocabularyAction);
}
4 changes: 2 additions & 2 deletions src/engine/sparqlExpressions/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ add_library(sparqlExpressions
SetOfIntervals.cpp
SparqlExpressionPimpl.cpp
SampleExpression.cpp
RelationalExpressions.cpp AggregateExpression.cpp RegexExpression.cpp
RelationalExpressions.cpp AggregateExpression.cpp RegexExpression.cpp ContainsExpression.cpp
LangExpression.cpp NumericUnaryExpressions.cpp NumericBinaryExpressions.cpp DateExpressions.cpp StringExpressions.cpp)

qlever_target_link_libraries(sparqlExpressions index)
qlever_target_link_libraries(sparqlExpressions index util)
160 changes: 160 additions & 0 deletions src/engine/sparqlExpressions/ContainsExpression.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
// Copyright 2023, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Noah Nock <noah.v.nock@gmail.com>

#include "./ContainsExpression.h"

#include <util/Rtree.h>

#include "engine/sparqlExpressions/SparqlExpressionGenerators.h"
#include "global/ValueIdComparators.h"
#include "re2/re2.h"

using namespace std::literals;

// Small wrapper around an `Rtree::BoundingBox`. `ContainsExpression.h` only
// forward-declares this class and stores a pointer to it; the complete type is
// defined here, next to the include of `util/Rtree.h`.
class BoundingBoxType {
 public:
  // The wrapped bounding box.
  Rtree::BoundingBox value{};

  // Store a copy of `boundingBox`.
  explicit BoundingBoxType(Rtree::BoundingBox boundingBox)
      : value(boundingBox) {}
};

// Return a copy of `input` without its first and last character. It is checked
// via `AD_CORRECTNESS_CHECK` that `input` consists of at least two characters
// and that it starts and ends with a double quote.
std::string removeQuotes(std::string_view input) {
  AD_CORRECTNESS_CHECK(input.size() >= 2 && input.starts_with('"') &&
                       input.ends_with('"'));
  // Everything between the leading and the trailing quote.
  return std::string{input.substr(1, input.size() - 2)};
}

namespace sparqlExpression {
// ___________________________________________________________________________
// Construct a CONTAINS expression from its two arguments.
//
// `child` must be a `VariableExpression`. `boundingBox` must be a
// `StringLiteralExpression` without language tag or datatype whose content has
// the format "minX,minY,maxX,maxY" (four decimal numbers separated by commas).
// A `std::runtime_error` is thrown if any of these conditions is violated.
// On success, the unquoted literal is stored in `boundingBoxAsString_` and the
// parsed box in `boundingBox_`.
ContainsExpression::ContainsExpression(SparqlExpression::Ptr child,
                                       SparqlExpression::Ptr boundingBox)
    : child_{std::move(child)} {
  if (!dynamic_cast<const VariableExpression*>(child_.get())) {
    throw std::runtime_error(
        "Contain expressions are currently supported only on variables.");
  }

  auto boundingBoxPtr =
      dynamic_cast<const StringLiteralExpression*>(boundingBox.get());
  if (!boundingBoxPtr) {
    throw std::runtime_error(
        "The second argument to the Contains function must be a "
        "string literal (which contains the bounding box of format "
        "\"minX,minY,maxX,maxY\")");
  }
  // The literal including its quotes; only used for error messages.
  std::string originalBoundingBoxString;
  originalBoundingBoxString =
      boundingBoxPtr->value().normalizedLiteralContent().get();
  if (!boundingBoxPtr->value().datatypeOrLangtag().empty()) {
    throw std::runtime_error(
        "The second argument to the Contain function (which contains the "
        "bounding box) must not contain a language tag or a datatype");
  }
  boundingBoxAsString_ = removeQuotes(originalBoundingBoxString);

  // Split the literal at the first three commas. Each of the first three
  // entries must be followed by a comma and at least one further character,
  // so the fourth entry (everything after the third comma) is never empty.
  std::vector<std::string> boundingBoxEntriesAsString(4);
  std::string errorMessage;
  std::string::size_type searchFrom = 0;
  for (std::size_t i = 0; i < 3; ++i) {
    const std::string::size_type end =
        boundingBoxAsString_.find(',', searchFrom);
    if (end == std::string::npos || end + 1 >= boundingBoxAsString_.size()) {
      errorMessage = "There are not enough arguments";
      break;
    }
    boundingBoxEntriesAsString[i] =
        boundingBoxAsString_.substr(searchFrom, end - searchFrom);
    searchFrom = end + 1;
  }
  if (errorMessage.empty()) {
    boundingBoxEntriesAsString[3] = boundingBoxAsString_.substr(searchFrom);
  }

  // Convert a single entry to a `double`. A plain `std::stod` stops at the
  // first character that is not part of the number and silently ignores the
  // rest, which would accept inputs like "1,2,3,4,5" or "1x,2,3,4". Therefore
  // require that nothing but whitespace follows the parsed number.
  auto parseEntry = [](const std::string& entry) {
    std::size_t numCharsParsed = 0;
    const double parsed = std::stod(entry, &numCharsParsed);
    if (entry.find_first_not_of(" \t\n\v\f\r", numCharsParsed) !=
        std::string::npos) {
      throw std::invalid_argument(
          absl::StrCat("\"", entry, "\" is not a valid number"));
    }
    return parsed;
  };

  if (errorMessage.empty()) {
    try {
      const double minX = parseEntry(boundingBoxEntriesAsString[0]);
      const double minY = parseEntry(boundingBoxEntriesAsString[1]);
      const double maxX = parseEntry(boundingBoxEntriesAsString[2]);
      const double maxY = parseEntry(boundingBoxEntriesAsString[3]);

      // NOTE(review): `boundingBox_` is a raw owning pointer and no matching
      // `delete` is visible in this class; switching to a smart pointer needs
      // a coordinated change of the member declaration in the header.
      boundingBox_ =
          new BoundingBoxType(Rtree::createBoundingBox(minX, minY, maxX, maxY));
    } catch (const std::invalid_argument& e) {
      errorMessage = e.what();
    } catch (const std::out_of_range& e) {
      errorMessage = e.what();
    }
  }

  if (!errorMessage.empty()) {
    throw std::runtime_error{
        absl::StrCat("The bounding box ", originalBoundingBoxString,
                     " is not supported by QLever (must be of format "
                     "\"minX,minY,maxX,maxY\"). "
                     "Error message is: ",
                     errorMessage)};
  }
}

// ___________________________________________________________________________
// The cache key consists of a fixed prefix, the cache key of the child
// expression, and the bounding box string (without the enclosing quotes).
string ContainsExpression::getCacheKey(
    const VariableToColumnMap& varColMap) const {
  const auto childKey = child_->getCacheKey(varColMap);
  return absl::StrCat("Bounding Box CONTAINS expression ", childKey, " with ",
                      boundingBoxAsString_);
}

// ___________________________________________________________________________
// The only child expression is `child_`; the bounding box is not stored as an
// expression.
std::span<SparqlExpression::Ptr> ContainsExpression::childrenImpl() {
  return std::span<SparqlExpression::Ptr>{&child_, 1};
}

// ___________________________________________________________________________
// Evaluate the expression: For each row of the input, the result contains a
// boolean `Id` that states whether the integer value of the variable's `Id`
// is among the ids returned by the Rtree search for the stored bounding box.
ExpressionResult ContainsExpression::evaluate(
    sparqlExpression::EvaluationContext* context) const {
  // The constructor only accepts a `VariableExpression` as child, so the
  // child's result is expected to hold a `Variable`.
  auto resultAsVariant = child_->evaluate(context);
  auto variablePtr = std::get_if<Variable>(&resultAsVariant);
  AD_CONTRACT_CHECK(variablePtr);

  // Collect the ids of all Rtree entries that the search reports for the
  // bounding box.
  ad_utility::HashSet<long long> matchingIds;
  if (boundingBox_) {
    // NOTE(review): `getRtree()` returns a const reference, so this line
    // copies the `Rtree` object on every evaluation; whether the copy can be
    // avoided depends on the constness of `SearchTree` -- TODO confirm.
    Rtree rtree = context->_qec.getIndex().getRtree();
    multiBoxGeo searchHits =
        rtree.SearchTree(boundingBox_->value, "./rtree_build");
    for (const RTreeValue& hit : searchHits) {
      matchingIds.insert(static_cast<long long>(hit.id));
    }
  }

  // One boolean per input row.
  auto numRows = context->size();
  VectorWithMemoryLimit<Id> result{context->_allocator};
  result.reserve(numRows);
  for (auto id : detail::makeGenerator(*variablePtr, numRows, context)) {
    const bool isMatch = matchingIds.contains(id.getInt());
    result.push_back(Id::makeFromBool(isMatch));
  }

  return result;
}

} // namespace sparqlExpression
35 changes: 35 additions & 0 deletions src/engine/sparqlExpressions/ContainsExpression.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// Copyright 2023, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Noah Nock <noah.v.nock@gmail.com>

#pragma once

#include <string>

#include "engine/sparqlExpressions/LiteralExpression.h"
#include "engine/sparqlExpressions/SparqlExpression.h"

class BoundingBoxType;

namespace sparqlExpression {
// A SPARQL expression with two arguments: a variable and a string literal
// that describes a bounding box in the format "minX,minY,maxX,maxY".
class ContainsExpression : public SparqlExpression {
 public:
  // `child` must be a `VariableExpression` and `boundingBox` must be a
  // `LiteralExpression` that stores a string, else an exception will be
  // thrown.
  ContainsExpression(SparqlExpression::Ptr child,
                     SparqlExpression::Ptr boundingBox);

  // Compute the result of the expression for the given `context`.
  ExpressionResult evaluate(EvaluationContext* context) const override;

  // Access to the child expression(s).
  std::span<SparqlExpression::Ptr> childrenImpl() override;

  // A string that identifies this expression for caching purposes.
  [[nodiscard]] string getCacheKey(
      const VariableToColumnMap& varColMap) const override;

 private:
  // The first argument of the expression.
  SparqlExpression::Ptr child_;
  // The bounding box as it was passed in (string form).
  std::string boundingBoxAsString_;
  // The parsed bounding box. `BoundingBoxType` is only forward-declared in
  // this header.
  // NOTE(review): raw pointer; no destructor that releases it is declared in
  // this class -- verify the ownership.
  BoundingBoxType* boundingBox_{};
};
} // namespace sparqlExpression
2 changes: 2 additions & 0 deletions src/global/Constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ static constexpr std::pair<std::string_view, std::string_view> GEOF_PREFIX = {
"geof:", "<http://www.opengis.net/def/function/geosparql/"};
static constexpr std::pair<std::string_view, std::string_view> MATH_PREFIX = {
"math:", "<http://www.w3.org/2005/xpath-functions/math#"};
// Pair of a short prefix (`geoRtree:`) and the beginning of an IRI, in the
// same layout as the other `*_PREFIX` constants in this file.
// NOTE(review): presumably used to name the Rtree-based geo functions --
// confirm against the parser.
static constexpr std::pair<std::string_view, std::string_view> GEO_RTREE_PREFIX = {
"geoRtree:", "<http://qlever.cs.uni-freiburg.de/"};

static const std::string INTERNAL_VARIABLE_PREFIX =
"?_QLever_internal_variable_";
Expand Down
5 changes: 4 additions & 1 deletion src/index/Index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,10 @@ auto Index::getTextVocab() const -> const TextVocab& {
return pimpl_->getTextVocab();
}

// ____________________________________________________________________________
// ___________________________________________________
auto Index::getRtree() const -> const Rtree& { return pimpl_->getRtree(); }

// _____________________________________________________________________________
size_t Index::getCardinality(const TripleComponent& comp,
Permutation::Enum p) const {
return pimpl_->getCardinality(comp, p);
Expand Down
4 changes: 4 additions & 0 deletions src/index/Index.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
// 2018- Johannes Kalmbach (kalmbach@informatik.uni-freiburg.de)
#pragma once

#include <util/Rtree.h>

#include <array>
#include <optional>
#include <string>
Expand Down Expand Up @@ -100,6 +102,8 @@ class Index {
Vocabulary<std::string, SimpleStringComparator, WordVocabIndex>;
[[nodiscard]] const TextVocab& getTextVocab() const;

const Rtree& getRtree() const;

// --------------------------------------------------------------------------
// -- RETRIEVAL ---
// --------------------------------------------------------------------------
Expand Down
50 changes: 45 additions & 5 deletions src/index/IndexImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -352,14 +352,19 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary(
auto compressionOutfile = ad_utility::makeOfstream(
onDiskBase_ + TMP_BASENAME_COMPRESSION + INTERNAL_VOCAB_SUFFIX);
auto internalVocabularyActionCompression =
[&compressionOutfile](const auto& word) {
[&compressionOutfile](const auto& word,
[[maybe_unused]] const auto& index) {
compressionOutfile << RdfEscaping::escapeNewlinesAndBackslashes(word)
<< '\n';
};
auto externalVocabularyActionCompression =
[]([[maybe_unused]] const auto& word,
[[maybe_unused]] const auto& index) {};
m._noIdMapsAndIgnoreExternalVocab = true;
auto mergeResult =
m.mergeVocabulary(onDiskBase_ + TMP_BASENAME_COMPRESSION, numFiles,
std::less<>(), internalVocabularyActionCompression);
std::less<>(), internalVocabularyActionCompression,
externalVocabularyActionCompression);
sizeInternalVocabulary = mergeResult.numWordsTotal_;
LOG(INFO) << "Number of words in internal vocabulary: "
<< sizeInternalVocabulary << std::endl;
Expand All @@ -385,11 +390,35 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary(
};
auto wordWriter =
vocab_.makeUncompressingWordWriter(onDiskBase_ + INTERNAL_VOCAB_SUFFIX);
auto internalVocabularyAction = [&wordWriter](const auto& word) {

std::ofstream convertOfs = std::ofstream(
onDiskBase_ + ".vocabulary.boundingbox.tmp", std::ios::binary);

auto internalVocabularyAction = [&wordWriter, &convertOfs](
const auto& word, const auto& index) {
wordWriter.push(word.data(), word.size());
std::optional<Rtree::BoundingBox> boundingBox =
Rtree::ConvertWordToRtreeEntry(word);
if (boundingBox) {
Rtree::SaveEntry(boundingBox.value(), index, convertOfs);
}
};
return v.mergeVocabulary(onDiskBase_, numFiles, sortPred,
internalVocabularyAction);
auto externalVocabularyAction = [&convertOfs](const auto& word,
const auto& index) {
std::optional<Rtree::BoundingBox> boundingBox =
Rtree::ConvertWordToRtreeEntry(word);
if (boundingBox) {
Rtree::SaveEntry(boundingBox.value(), index, convertOfs);
}
};

VocabularyMerger::VocabularyMetaData result =
v.mergeVocabulary(onDiskBase_, numFiles, sortPred,
internalVocabularyAction, externalVocabularyAction);

convertOfs.close();

return result;
}();
LOG(DEBUG) << "Finished merging partial vocabularies" << std::endl;
IndexBuilderDataAsStxxlVector res;
Expand All @@ -399,6 +428,15 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary(
<< res.vocabularyMetaData_.numWordsTotal_ - sizeInternalVocabulary
<< std::endl;

LOG(INFO) << "Building the Rtree..." << std::endl;
try {
Rtree rtree = Rtree(1300000000000);
rtree.BuildTree(onDiskBase_ + ".vocabulary", 16, "./rtree_build");
LOG(INFO) << "Finished building the Rtree" << std::endl;
} catch (const std::exception& e) {
LOG(INFO) << e.what() << std::endl;
}

res.idTriples = std::move(idTriples);
res.actualPartialSizes = std::move(actualPartialSizes);

Expand Down Expand Up @@ -664,6 +702,8 @@ void IndexImpl::createFromOnDiskIndex(const string& onDiskBase) {
avgNumDistinctPredicatesPerSubject_, numDistinctSubjectPredicatePairs_,
patterns_, hasPattern_);
}

// Load the Rtree TODO
}

// _____________________________________________________________________________
Expand Down
4 changes: 4 additions & 0 deletions src/index/IndexImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include <util/Forward.h>
#include <util/HashMap.h>
#include <util/MmapVector.h>
#include <util/Rtree.h>
#include <util/Timer.h>
#include <util/json.h>

Expand Down Expand Up @@ -121,6 +122,7 @@ class IndexImpl {
size_t totalVocabularySize_ = 0;
bool vocabPrefixCompressed_ = true;
Index::TextVocab textVocab_;
Rtree rtree_ = Rtree(4000000000);

TextMetaData textMeta_;
DocsDB docsDB_;
Expand Down Expand Up @@ -230,6 +232,8 @@ class IndexImpl {

const auto& getTextVocab() const { return textVocab_; };

const auto& getRtree() const { return rtree_; };

// --------------------------------------------------------------------------
// -- RETRIEVAL ---
// --------------------------------------------------------------------------
Expand Down
Loading