diff --git a/CMakeLists.txt b/CMakeLists.txt index f2eff35508..7873949f8c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -116,7 +116,7 @@ endif() ###################################### # BOOST ###################################### -find_package(Boost 1.81 COMPONENTS iostreams program_options REQUIRED) +find_package(Boost 1.81 COMPONENTS iostreams serialization program_options REQUIRED) include_directories(${Boost_INCLUDE_DIR}) diff --git a/Dockerfile b/Dockerfile index b749884e04..69c75968ab 100644 --- a/Dockerfile +++ b/Dockerfile @@ -21,7 +21,7 @@ RUN make test FROM base as runtime WORKDIR /app ENV DEBIAN_FRONTEND=noninteractive -RUN apt-get update && apt-get install -y wget python3-yaml unzip curl bzip2 pkg-config libicu-dev python3-icu libgomp1 uuid-runtime make lbzip2 libjemalloc-dev libzstd-dev libssl-dev libboost1.81-dev libboost-program-options1.81-dev libboost-iostreams1.81-dev +RUN apt-get update && apt-get install -y wget python3-yaml unzip curl bzip2 pkg-config libicu-dev python3-icu libgomp1 uuid-runtime make lbzip2 libjemalloc-dev libzstd-dev libssl-dev libboost1.81-dev libboost-program-options1.81-dev libboost-iostreams1.81-dev libboost-serialization1.81-dev ARG UID=1000 RUN groupadd -r qlever && useradd --no-log-init -r -u $UID -g qlever qlever && chown qlever:qlever /app diff --git a/src/VocabularyMergerMain.cpp b/src/VocabularyMergerMain.cpp index 279b266827..ef033193ec 100644 --- a/src/VocabularyMergerMain.cpp +++ b/src/VocabularyMergerMain.cpp @@ -21,9 +21,12 @@ int main(int argc, char** argv) { VocabularyMerger m; auto file = ad_utility::makeOfstream(basename + INTERNAL_VOCAB_SUFFIX); - auto internalVocabularyAction = [&file](const auto& word) { + auto internalVocabularyAction = [&file](const auto& word, + [[maybe_unused]] const auto& index) { file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n'; }; + auto externalVocabularyAction = []([[maybe_unused]] const auto& word, + [[maybe_unused]] const auto& index) {}; 
m.mergeVocabulary(basename, numFiles, TripleComponentComparator(), - internalVocabularyAction, 4_GB); + internalVocabularyAction, externalVocabularyAction, 4_GB); } diff --git a/src/engine/sparqlExpressions/CMakeLists.txt b/src/engine/sparqlExpressions/CMakeLists.txt index b1bd10cdde..01b926b342 100644 --- a/src/engine/sparqlExpressions/CMakeLists.txt +++ b/src/engine/sparqlExpressions/CMakeLists.txt @@ -4,8 +4,7 @@ add_library(sparqlExpressions SetOfIntervals.cpp SparqlExpressionPimpl.cpp SampleExpression.cpp - RelationalExpressions.cpp AggregateExpression.cpp RegexExpression.cpp - LangExpression.cpp NumericUnaryExpressions.cpp NumericBinaryExpressions.cpp DateExpressions.cpp StringExpressions.cpp - ConditionalExpressions.cpp) + RelationalExpressions.cpp AggregateExpression.cpp RegexExpression.cpp ContainsExpression.cpp + LangExpression.cpp NumericUnaryExpressions.cpp NumericBinaryExpressions.cpp DateExpressions.cpp StringExpressions.cpp ConditionalExpressions.cpp) -qlever_target_link_libraries(sparqlExpressions index) +qlever_target_link_libraries(sparqlExpressions index util) \ No newline at end of file diff --git a/src/engine/sparqlExpressions/ContainsExpression.cpp b/src/engine/sparqlExpressions/ContainsExpression.cpp new file mode 100644 index 0000000000..f94f0630af --- /dev/null +++ b/src/engine/sparqlExpressions/ContainsExpression.cpp @@ -0,0 +1,160 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Noah Nock + +#include "./ContainsExpression.h" + +#include + +#include "engine/sparqlExpressions/SparqlExpressionGenerators.h" +#include "global/ValueIdComparators.h" +#include "re2/re2.h" + +using namespace std::literals; + +class BoundingBoxType { + public: + BasicGeometry::BoundingBox value{}; + + explicit BoundingBoxType(BasicGeometry::BoundingBox boundingBox) { + this->value = boundingBox; + } +}; + +std::string removeQuotes(std::string_view input) { + AD_CORRECTNESS_CHECK(input.size() >= 2 && input.starts_with('"') && + input.ends_with('"')); + input.remove_prefix(1); + input.remove_suffix(1); + return std::string{input}; +} + +namespace sparqlExpression { +// ___________________________________________________________________________ +ContainsExpression::ContainsExpression(SparqlExpression::Ptr child, + SparqlExpression::Ptr boundingBox) + : child_{std::move(child)} { + if (!dynamic_cast(child_.get())) { + throw std::runtime_error( + "Contain expressions are currently supported only on variables."); + } + std::string boundingBoxString; + std::string originalBoundingBoxString; + if (auto boundingBoxPtr = + dynamic_cast(boundingBox.get())) { + originalBoundingBoxString = + boundingBoxPtr->value().normalizedLiteralContent().get(); + if (!boundingBoxPtr->value().datatypeOrLangtag().empty()) { + throw std::runtime_error( + "The second argument to the Contain function (which contains the " + "bounding box) must not contain a language tag or a datatype"); + } + boundingBoxString = removeQuotes(originalBoundingBoxString); + } else { + throw std::runtime_error( + "The second argument to the Contains function must be a " + "string literal (which contains the bounding box of format " + "\"minX,minY,maxX,maxY\")"); + } + + boundingBoxAsString_ = boundingBoxString; + + std::vector boundingBoxEntriesAsString(4); + std::string errorMessage; + std::string::size_type searchFrom = 0; + for (int i = 0; i < 4; i++) { + if (i == 3) { + if (searchFrom >= 
boundingBoxAsString_.size()) { + errorMessage = "The fourth argument was not provided"; + break; + } + boundingBoxEntriesAsString[i] = boundingBoxAsString_.substr( + searchFrom, boundingBoxAsString_.size() - searchFrom); + break; + } + std::string::size_type end = boundingBoxAsString_.find(',', searchFrom); + if (end >= boundingBoxAsString_.size() - 1 || end == std::string::npos) { + errorMessage = "There are not enough arguments"; + break; + } + boundingBoxEntriesAsString[i] = + boundingBoxAsString_.substr(searchFrom, end - searchFrom); + searchFrom = end + 1; + } + + if (errorMessage.empty()) { + double minX; + double minY; + double maxX; + double maxY; + + try { + minX = std::stod(boundingBoxEntriesAsString[0]); + minY = std::stod(boundingBoxEntriesAsString[1]); + maxX = std::stod(boundingBoxEntriesAsString[2]); + maxY = std::stod(boundingBoxEntriesAsString[3]); + + boundingBox_ = new BoundingBoxType( + BasicGeometry::CreateBoundingBox(minX, minY, maxX, maxY)); + } catch (const std::invalid_argument& e) { + errorMessage = e.what(); + } catch (const std::out_of_range& e) { + errorMessage = e.what(); + } + } + + if (!errorMessage.empty()) { + throw std::runtime_error{ + absl::StrCat("The bounding box ", originalBoundingBoxString, + " is not supported by QLever (must be of format " + "\"minX,minY,maxX,maxY\"). 
" + "Error message is: ", + errorMessage)}; + } +} + +// ___________________________________________________________________________ +string ContainsExpression::getCacheKey( + const VariableToColumnMap& varColMap) const { + return absl::StrCat("Bounding Box CONTAINS expression ", + child_->getCacheKey(varColMap), " with ", + boundingBoxAsString_); +} + +// ___________________________________________________________________________ +std::span ContainsExpression::childrenImpl() { + return {&child_, 1}; +} + +// ___________________________________________________________________________ +ExpressionResult ContainsExpression::evaluate( + sparqlExpression::EvaluationContext* context) const { + auto resultAsVariant = child_->evaluate(context); + auto variablePtr = std::get_if(&resultAsVariant); + AD_CONTRACT_CHECK(variablePtr); + + // search in the rtree + ad_utility::HashSet resultIds; + if (boundingBox_ != nullptr) { + Rtree rtree = context->_qec.getIndex().getRtree(); + multiBoxGeo treeResults = + rtree.SearchTree(boundingBox_->value, "./rtree_build"); + + for (RTreeValue item : treeResults) { + long long id = item.id; + resultIds.insert(id); + } + } + + auto resultSize = context->size(); + VectorWithMemoryLimit result{context->_allocator}; + result.reserve(resultSize); + + for (auto id : detail::makeGenerator(*variablePtr, resultSize, context)) { + result.push_back(Id::makeFromBool(resultIds.contains(id.getInt()))); + } + + return result; +} + +} // namespace sparqlExpression diff --git a/src/engine/sparqlExpressions/ContainsExpression.h b/src/engine/sparqlExpressions/ContainsExpression.h new file mode 100644 index 0000000000..90312c5e27 --- /dev/null +++ b/src/engine/sparqlExpressions/ContainsExpression.h @@ -0,0 +1,35 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Noah Nock + +#pragma once + +#include + +#include "engine/sparqlExpressions/LiteralExpression.h" +#include "engine/sparqlExpressions/SparqlExpression.h" + +class BoundingBoxType; + +namespace sparqlExpression { +class ContainsExpression : public SparqlExpression { + private: + SparqlExpression::Ptr child_; + std::string boundingBoxAsString_; + BoundingBoxType* boundingBox_{}; + + public: + // `child` must be a `VariableExpression` and `boundingBox` must be a + // `LiteralExpression` that stores a string, else an exception will be thrown. + ContainsExpression(SparqlExpression::Ptr child, + SparqlExpression::Ptr boundingBox); + + ExpressionResult evaluate(EvaluationContext* context) const override; + + std::span childrenImpl() override; + + // _________________________________________________________________________ + [[nodiscard]] string getCacheKey( + const VariableToColumnMap& varColMap) const override; +}; +} // namespace sparqlExpression diff --git a/src/global/Constants.h b/src/global/Constants.h index 6fec7fb53b..fe9e874475 100644 --- a/src/global/Constants.h +++ b/src/global/Constants.h @@ -54,6 +54,8 @@ static constexpr std::pair GEOF_PREFIX = { "geof:", " MATH_PREFIX = { "math:", " + GEO_RTREE_PREFIX = {"geoRtree:", " const TextVocab& { return pimpl_->getTextVocab(); } -// ____________________________________________________________________________ +// ___________________________________________________ +auto Index::getRtree() const -> const Rtree& { return pimpl_->getRtree(); } + +// _____________________________________________________________________________ size_t Index::getCardinality(const TripleComponent& comp, Permutation::Enum p) const { return pimpl_->getCardinality(comp, p); diff --git a/src/index/Index.h b/src/index/Index.h index 7d2ec634b4..50d49e8709 100644 --- a/src/index/Index.h +++ b/src/index/Index.h @@ -5,6 +5,8 @@ // 2018- Johannes Kalmbach (kalmbach@informatik.uni-freiburg.de) #pragma once +#include + #include #include 
#include @@ -104,6 +106,8 @@ class Index { Vocabulary; [[nodiscard]] const TextVocab& getTextVocab() const; + const Rtree& getRtree() const; + // -------------------------------------------------------------------------- // -- RETRIEVAL --- // -------------------------------------------------------------------------- diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index f44a53d999..b311f92d69 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -24,6 +24,7 @@ #include "util/CachingMemoryResource.h" #include "util/CompressionUsingZstd/ZstdWrapper.h" #include "util/HashMap.h" +#include "util/RtreeFileReader.h" #include "util/Serializer/FileSerializer.h" #include "util/TupleHelpers.h" @@ -366,14 +367,19 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( auto compressionOutfile = ad_utility::makeOfstream( onDiskBase_ + TMP_BASENAME_COMPRESSION + INTERNAL_VOCAB_SUFFIX); auto internalVocabularyActionCompression = - [&compressionOutfile](const auto& word) { + [&compressionOutfile](const auto& word, + [[maybe_unused]] const auto& index) { compressionOutfile << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n'; }; + auto externalVocabularyActionCompression = + []([[maybe_unused]] const auto& word, + [[maybe_unused]] const auto& index) {}; m._noIdMapsAndIgnoreExternalVocab = true; - auto mergeResult = m.mergeVocabulary( - onDiskBase_ + TMP_BASENAME_COMPRESSION, numFiles, std::less<>(), - internalVocabularyActionCompression, stxxlMemory()); + auto mergeResult = + m.mergeVocabulary(onDiskBase_ + TMP_BASENAME_COMPRESSION, numFiles, + std::less<>(), internalVocabularyActionCompression, + externalVocabularyActionCompression, stxxlMemory()); sizeInternalVocabulary = mergeResult.numWordsTotal_; LOG(INFO) << "Number of words in internal vocabulary: " << sizeInternalVocabulary << std::endl; @@ -399,11 +405,37 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( }; auto wordWriter = 
vocab_.makeUncompressingWordWriter(onDiskBase_ + INTERNAL_VOCAB_SUFFIX); - auto internalVocabularyAction = [&wordWriter](const auto& word) { + + std::ofstream convertOfs = std::ofstream( + onDiskBase_ + ".vocabulary.boundingbox.tmp", std::ios::binary); + + auto internalVocabularyAction = [&wordWriter, &convertOfs]( + const auto& word, const auto& index) { wordWriter.push(word.data(), word.size()); + std::optional boundingBox = + BasicGeometry::ConvertWordToRtreeEntry(word); + if (boundingBox) { + FileReaderWithoutIndex::SaveEntry(boundingBox.value(), index, + convertOfs); + } }; - return v.mergeVocabulary(onDiskBase_, numFiles, sortPred, - internalVocabularyAction, stxxlMemory()); + auto externalVocabularyAction = [&convertOfs](const auto& word, + const auto& index) { + std::optional boundingBox = + BasicGeometry::ConvertWordToRtreeEntry(word); + if (boundingBox) { + FileReaderWithoutIndex::SaveEntry(boundingBox.value(), index, + convertOfs); + } + }; + + VocabularyMerger::VocabularyMetaData result = v.mergeVocabulary( + onDiskBase_, numFiles, sortPred, internalVocabularyAction, + externalVocabularyAction, stxxlMemory()); + + convertOfs.close(); + + return result; }(); LOG(DEBUG) << "Finished merging partial vocabularies" << std::endl; IndexBuilderDataAsStxxlVector res; @@ -413,6 +445,17 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( << res.vocabularyMetaData_.numWordsTotal_ - sizeInternalVocabulary << std::endl; + res.idTriples = std::move(*idTriples.wlock()); + LOG(INFO) << "Building the Rtree..." << std::endl; + try { + Rtree rtree = Rtree(10000000000); + uint64_t treeSize = rtree.BuildTree(onDiskBase_, ".vocabulary.boundingbox", 16, + "./rtree_build"); + LOG(INFO) << "Finished building the Rtree with " << treeSize << " elements." 
<< std::endl; + } catch (const std::exception& e) { + LOG(INFO) << e.what() << std::endl; + } + res.idTriples = std::move(*idTriples.wlock()); res.actualPartialSizes = std::move(actualPartialSizes); @@ -683,6 +726,8 @@ void IndexImpl::createFromOnDiskIndex(const string& onDiskBase) { avgNumDistinctPredicatesPerSubject_, numDistinctSubjectPredicatePairs_, patterns_, hasPattern_); } + + // Load the Rtree TODO } // _____________________________________________________________________________ diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index bc8742e5f2..203f136b8a 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -124,6 +125,7 @@ class IndexImpl { size_t totalVocabularySize_ = 0; bool vocabPrefixCompressed_ = true; Index::TextVocab textVocab_; + Rtree rtree_ = Rtree(4000000000); TextMetaData textMeta_; DocsDB docsDB_; @@ -233,6 +235,8 @@ class IndexImpl { const auto& getTextVocab() const { return textVocab_; }; + const auto& getRtree() const { return rtree_; }; + // -------------------------------------------------------------------------- // -- RETRIEVAL --- // -------------------------------------------------------------------------- diff --git a/src/index/VocabularyGenerator.h b/src/index/VocabularyGenerator.h index d66f6c3368..9f22dcb6cb 100644 --- a/src/index/VocabularyGenerator.h +++ b/src/index/VocabularyGenerator.h @@ -102,11 +102,13 @@ class VocabularyMerger { // Argument comparator gives the way to order strings (case-sensitive or not) // This automatically resets the inner members after finishing, to leave the // external interface stateless - template - VocabularyMetaData mergeVocabulary(const std::string& fileIdx, - size_t numFiles, Comp comparator, - InternalVocabularyAction& action, - ad_utility::MemorySize memToUse); + template + VocabularyMetaData mergeVocabulary( + const std::string& basename, size_t numFiles, Comp comparator, + 
InternalVocabularyAction& action, + ExternalVocabularyAction& externalVocabularyAction, + ad_utility::MemorySize memToUse); private: // helper struct used in the priority queue for merging. @@ -139,10 +141,12 @@ class VocabularyMerger { // write the queu words in the buffer to their corresponding idPairVecs. // Requires that all the QueueWords that are ever passed are ordered // alphabetically (Also across multiple calls) - template + template void writeQueueWordsToIdVec( const std::vector& buffer, - InternalVocabularyAction& internalVocabularyAction, const auto& lessThan); + InternalVocabularyAction& internalVocabularyAction, + ExternalVocabularyAction& externalVocabularyAction, const auto& lessThan); // close all associated files and MmapVectors and reset all internal variables void clear() { diff --git a/src/index/VocabularyGeneratorImpl.h b/src/index/VocabularyGeneratorImpl.h index 8502755ca6..6df1992a3d 100644 --- a/src/index/VocabularyGeneratorImpl.h +++ b/src/index/VocabularyGeneratorImpl.h @@ -28,10 +28,12 @@ #include "util/Timer.h" // ___________________________________________________________________ -template +template VocabularyMerger::VocabularyMetaData VocabularyMerger::mergeVocabulary( const std::string& basename, size_t numFiles, Comparator comparator, InternalVocabularyAction& internalVocabularyAction, + ExternalVocabularyAction& externalVocabularyAction, ad_utility::MemorySize memoryToUse) { // Return true iff p1 >= p2 according to the lexicographic order of the IRI // or literal. All internal IRIs or literals come before all external ones. 
@@ -106,8 +108,10 @@ VocabularyMerger::VocabularyMetaData VocabularyMerger::mergeVocabulary( // asynchronously write the next batch of sorted // queue words auto writeTask = [this, buf = std::move(sortedBuffer), - &internalVocabularyAction, &lessThan]() { - this->writeQueueWordsToIdVec(buf, internalVocabularyAction, lessThan); + &internalVocabularyAction, &externalVocabularyAction, + &lessThan]() { + this->writeQueueWordsToIdVec(buf, internalVocabularyAction, + externalVocabularyAction, lessThan); }; sortedBuffer.clear(); sortedBuffer.reserve(_bufferSize); @@ -129,7 +133,8 @@ VocabularyMerger::VocabularyMetaData VocabularyMerger::mergeVocabulary( // Handle remaining words in the buffer if (!sortedBuffer.empty()) { - writeQueueWordsToIdVec(sortedBuffer, internalVocabularyAction, lessThan); + writeQueueWordsToIdVec(sortedBuffer, internalVocabularyAction, + externalVocabularyAction, lessThan); } auto metaData = std::move(metaData_); @@ -139,10 +144,11 @@ VocabularyMerger::VocabularyMetaData VocabularyMerger::mergeVocabulary( } // ________________________________________________________________________________ -template +template void VocabularyMerger::writeQueueWordsToIdVec( const std::vector& buffer, - InternalVocabularyAction& internalVocabularyAction, const auto& lessThan) { + InternalVocabularyAction& internalVocabularyAction, + ExternalVocabularyAction& externalVocabularyAction, const auto& lessThan) { LOG(TIMING) << "Start writing a batch of merged words\n"; // smaller grained buffer for the actual inner write @@ -171,8 +177,11 @@ void VocabularyMerger::writeQueueWordsToIdVec( // write the new word to the vocabulary if (!lastTripleComponent_.value().isExternal()) { - internalVocabularyAction(lastTripleComponent_.value().iriOrLiteral()); + internalVocabularyAction(lastTripleComponent_.value().iriOrLiteral(), + lastTripleComponent_.value().index_); } else { + externalVocabularyAction(lastTripleComponent_.value().iriOrLiteral(), + 
lastTripleComponent_.value().index_); outfileExternal_ << RdfEscaping::escapeNewlinesAndBackslashes( lastTripleComponent_.value().iriOrLiteral()) << '\n'; diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.cpp b/src/parser/sparqlParser/SparqlQleverVisitor.cpp index e3ed0f9f4a..be3659ebc5 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.cpp +++ b/src/parser/sparqlParser/SparqlQleverVisitor.cpp @@ -13,6 +13,7 @@ #include #include "absl/strings/str_join.h" +#include "engine/sparqlExpressions/ContainsExpression.h" #include "engine/sparqlExpressions/LangExpression.h" #include "engine/sparqlExpressions/RandomExpression.h" #include "engine/sparqlExpressions/RegexExpression.h" @@ -122,7 +123,14 @@ ExpressionPtr Visitor::processIriFunctionCall( checkNumArgs(1); return sparqlExpression::makeTanExpression(std::move(argList[0])); } + } else if (checkPrefix(GEO_RTREE_PREFIX)) { + if (functionName == "boundingBoxContains") { + checkNumArgs(2); + return std::make_unique(std::move(argList[0]), + std::move(argList[1])); + } } + reportNotSupported(ctx, "Function \"" + iri + "\" is"); } @@ -1021,7 +1029,7 @@ ObjectList Visitor::visit(Parser::ObjectListContext* ctx) { } // ____________________________________________________________________________________ -Node Visitor::visit(Parser::ObjectRContext* ctx) { +ad_utility::sparql_types::Node Visitor::visit(Parser::ObjectRContext* ctx) { return visit(ctx->graphNode()); } @@ -1232,13 +1240,14 @@ uint64_t Visitor::visit(Parser::IntegerContext* ctx) { } // ____________________________________________________________________________________ -Node Visitor::visit(Parser::TriplesNodeContext* ctx) { +ad_utility::sparql_types::Node Visitor::visit(Parser::TriplesNodeContext* ctx) { return visitAlternative(ctx->collection(), ctx->blankNodePropertyList()); } // ____________________________________________________________________________________ -Node Visitor::visit(Parser::BlankNodePropertyListContext* ctx) { 
+ad_utility::sparql_types::Node Visitor::visit( + Parser::BlankNodePropertyListContext* ctx) { VarOrTerm var{GraphTerm{newBlankNode()}}; Triples triples; auto propertyList = visit(ctx->propertyListNotEmpty()); @@ -1262,7 +1271,7 @@ void Visitor::visit(Parser::BlankNodePropertyListPathContext* ctx) { } // ____________________________________________________________________________________ -Node Visitor::visit(Parser::CollectionContext* ctx) { +ad_utility::sparql_types::Node Visitor::visit(Parser::CollectionContext* ctx) { Triples triples; VarOrTerm nextElement{ GraphTerm{Iri{""}}}; @@ -1294,7 +1303,7 @@ void Visitor::visit(Parser::CollectionPathContext* ctx) { } // ____________________________________________________________________________________ -Node Visitor::visit(Parser::GraphNodeContext* ctx) { +ad_utility::sparql_types::Node Visitor::visit(Parser::GraphNodeContext* ctx) { if (ctx->varOrTerm()) { return {visit(ctx->varOrTerm()), Triples{}}; } else { diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index b77011c082..383e3e1f3d 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -3,5 +3,14 @@ add_subdirectory(MemorySize) add_subdirectory(http) add_library(util GeoSparqlHelpers.cpp antlr/ANTLRErrorHandling.cpp ParseException.cpp Conversions.cpp Date.cpp antlr/GenerateAntlrExceptionMetadata.cpp - CancellationHandle.cpp) -qlever_target_link_libraries(util) + CancellationHandle.cpp + Rtree.h + Rtree.cpp + RtreeBuild.cpp + RtreeFileReader.cpp + RtreeFileReader.h + RtreeBasicGeometry.h + RtreeNode.cpp + RtreeNode.h + RtreeSorter.cpp RtreeSorter.h) +qlever_target_link_libraries(util Boost::serialization ${STXXL_LIBRARIES}) diff --git a/src/util/ComparisonWithNan.h b/src/util/ComparisonWithNan.h index 748ba429a5..a2bd34f1da 100644 --- a/src/util/ComparisonWithNan.h +++ b/src/util/ComparisonWithNan.h @@ -3,7 +3,6 @@ // Author: Johannes Kalmbach #pragma once - #include #include diff --git a/src/util/Rtree.cpp b/src/util/Rtree.cpp new file 
mode 100644 index 0000000000..fe66d9d51c --- /dev/null +++ b/src/util/Rtree.cpp @@ -0,0 +1,46 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Noah Nock + +#include "./Rtree.h" + +#include "./RtreeFileReader.h" +#include "./RtreeNode.h" + +multiBoxGeo Rtree::SearchTree(BasicGeometry::BoundingBox query, + const std::string& folder) { + std::ifstream lookupIfs = + std::ifstream(folder + "/lookup.bin", std::ios::binary); + std::ifstream nodesIfs = + std::ifstream(folder + "/nodes.bin", std::ios::binary); + + RtreeNode rootNode = FileReader::LoadNode(0, lookupIfs, nodesIfs); + multiBoxGeo results; + std::stack nodes; + nodes.push(rootNode); + + while (!nodes.empty()) { + RtreeNode currentNode = nodes.top(); + nodes.pop(); + + for (RTreeValue child : currentNode.GetChildren()) { + if (intersects(query, child.box)) { + if (currentNode.GetIsLastInnerNode()) { + results.push_back(child); + } else { + RtreeNode newNode = + FileReader::LoadNode(child.id, lookupIfs, nodesIfs); + nodes.push(newNode); + } + } + } + } + + lookupIfs.close(); + nodesIfs.close(); + return results; +} + +Rtree::Rtree(uintmax_t maxBuildingRamUsage) { + this->maxBuildingRamUsage_ = maxBuildingRamUsage; +} diff --git a/src/util/Rtree.h b/src/util/Rtree.h new file mode 100644 index 0000000000..61763d14ab --- /dev/null +++ b/src/util/Rtree.h @@ -0,0 +1,155 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Noah Nock + +#ifndef QLEVER_RTREE_H +#define QLEVER_RTREE_H + +#ifndef EOF +#define EOF std::char_traits::eof() +#endif +#include +#include +#include +#include +#include +#include + +#include "./RtreeBasicGeometry.h" + +// ___________________________________________________________________________ +// Forward declaration +struct RTreeValue; +struct RTreeValueWithOrderIndex; +using multiBoxGeo = std::vector; +using multiBoxWithOrderIndex = std::vector; +struct SplitResult; +struct SplitBuffers; + +// ___________________________________________________________________________ +// Data type to store all the information of the rectangles (in ram or on disk) +// + the small lists for one dimension +struct RectanglesForOrderedBoxes { + std::variant rectangles; + multiBoxWithOrderIndex rectanglesSmall; + + RectanglesForOrderedBoxes() { + rectangles = {}; + rectanglesSmall = multiBoxWithOrderIndex(); + } + + void Clear() { + rectanglesSmall = multiBoxWithOrderIndex(); + if (std::holds_alternative(rectangles)) { + rectangles = multiBoxWithOrderIndex(); + } + } +}; + +// ___________________________________________________________________________ +// A Rtree based on bounding boxes and ids +class Rtree { + private: + uintmax_t maxBuildingRamUsage_; + + public: + // ___________________________________________________________________________ + // Build the whole Rtree with the raw data in onDiskBase + fileSuffix + + // ".tmp", M as branching factor and folder as Rtree destination + uint64_t BuildTree(const std::string& onDiskBase, const std::string& fileSuffix, + size_t M, const std::string& folder) const; + // ___________________________________________________________________________ + // Search for an intersection of query with any elements of the Rtree + static multiBoxGeo SearchTree(BasicGeometry::BoundingBox query, + const std::string& folder); + explicit Rtree(uintmax_t maxBuildingRamUsage); +}; + +// 
___________________________________________________________________________ +// Data structure handling the datapoints of the Rtree sorted in x and y +// direction (either on ram or on disk) +class OrderedBoxes { + public: // TODO + bool workInRam_{}; + uint64_t size_{}; + BasicGeometry::BoundingBox boundingBox_{}; + RectanglesForOrderedBoxes + rectsD0_; // the rectangles (datapoints) sorted in x direction + RectanglesForOrderedBoxes rectsD1_; // the rectangles sorted in y direction + // ___________________________________________________________________________ + // Initiate the splitting of the rectangles in the best position (rectangles + // are stored in ram) + std::pair SplitAtBestInRam(size_t S, size_t M); + // ___________________________________________________________________________ + // Initiate the splitting of the rectangles in the best position (rectangles + // are stored on disk) + std::pair SplitAtBestOnDisk( + const std::string& filePath, size_t S, size_t M, + uint64_t maxBuildingRamUsage); + // ___________________________________________________________________________ + // Get the position and dimension of the best split possible to maximize the + // quality of the Rtree + SplitResult GetBestSplit(); + // ___________________________________________________________________________ + // Actually splitting the rectangles at the given split by splitResult + std::pair + PerformSplit(SplitResult splitResult, SplitBuffers& splitBuffers, size_t M, + size_t S, uint64_t maxBuildingRamUsage = 0); + + public: + [[nodiscard]] bool WorkInRam() const; + // ___________________________________________________________________________ + // Set up the OrderedBoxes with the rectangles given as vectors stored in ram + // and set workInRam to true + void SetOrderedBoxesToRam(RectanglesForOrderedBoxes rectanglesD0, + RectanglesForOrderedBoxes rectanglesD1, + BasicGeometry::BoundingBox box); + // ___________________________________________________________________________ 
+ // Set up the OrderedBoxes with the rectangles given as files stored on disk + // and set workInRam to false + void SetOrderedBoxesToDisk(RectanglesForOrderedBoxes rectanglesD0, + RectanglesForOrderedBoxes rectanglesD1, + uint64_t size, BasicGeometry::BoundingBox box); + BasicGeometry::BoundingBox GetBoundingBox(); + [[nodiscard]] uint64_t GetSize() const; + // ___________________________________________________________________________ + // Wrapper function to perform the whole process of splitting the rectangles + // for either ram or disk case + std::pair SplitAtBest( + const std::filesystem::path& filePath, size_t S, size_t M, + uint64_t maxBuildingRamUsage); + // ___________________________________________________________________________ + // return the rectangles of the x sorting for the case where they are stored + // in ram + multiBoxWithOrderIndex GetRectanglesInRam(); + // ___________________________________________________________________________ + // return the rectangles of the x sorting for the case where they are stored + // on disk + std::filesystem::path GetRectanglesOnDisk(); + + // ___________________________________________________________________________ + // Clear all content of the OrderedBoxes + void Clear(); +}; + +// ___________________________________________________________________________ +// Data type containing all the information about the best split found, which +// are needed to actually perform the split. 
+struct SplitResult { /* Best split found by OrderedBoxes::GetBestSplit(): its cost, the split dimension (0 = x, 1 = y), the index of the split element in the "small list" and the boundary elements of both halves. bestCost == -1 means that no candidate has been evaluated. */ + double bestCost = -1; + size_t bestDim = 0; + uint64_t bestIndex = 0; + RTreeValueWithOrderIndex bestLastElement; + RTreeValueWithOrderIndex bestElement; + RTreeValueWithOrderIndex bestMinElement; + RTreeValueWithOrderIndex bestMaxElement; +}; + +struct SplitBuffers { /* References to the four outputs of a split: the dimension 0 and dimension 1 orderings (D0/D1) of the first and second half (Split0/Split1). */ + RectanglesForOrderedBoxes& rectsD0Split0; + RectanglesForOrderedBoxes& rectsD1Split0; + RectanglesForOrderedBoxes& rectsD0Split1; + RectanglesForOrderedBoxes& rectsD1Split1; +}; + +#endif // QLEVER_RTREE_H diff --git a/src/util/RtreeBasicGeometry.h b/src/util/RtreeBasicGeometry.h new file mode 100644 index 0000000000..817b77aa8c --- /dev/null +++ b/src/util/RtreeBasicGeometry.h @@ -0,0 +1,196 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Noah Nock + +#ifndef QLEVER_RTREEBASICGEOMETRY_H +#define QLEVER_RTREEBASICGEOMETRY_H + +#include +#include + +#include "./Rtree.h" +#include "ctre/ctre.h" + +// NOTE(review): in this copy of the patch the two #include directives above have no header name and several template argument lists are missing (e.g. after `box`, `std::optional`, `std::numeric_limits` and `template`); presumably they were lost during extraction - confirm against the repository. +// Static helpers for the 2D points and axis-aligned bounding boxes used by the Rtree. +class BasicGeometry { + public: + typedef boost::geometry::model::point< + double, 2, + boost::geometry::cs::spherical_equatorial> + Point; + typedef boost::geometry::model::box BoundingBox; + + // Accessors for the four corner coordinates of a bounding box. + static double GetMinX(BoundingBox boundingBox) { + return boundingBox.min_corner().get<0>(); + } + static double GetMinY(BoundingBox boundingBox) { + return boundingBox.min_corner().get<1>(); + } + static double GetMaxX(BoundingBox boundingBox) { + return boundingBox.max_corner().get<0>(); + } + static double GetMaxY(BoundingBox boundingBox) { + return boundingBox.max_corner().get<1>(); + } + + // ___________________________________________________________________________ + // Create a bounding box, based on the corner coordinates + static BasicGeometry::BoundingBox CreateBoundingBox(double pointOneX, + double pointOneY, + double pointTwoX, + double pointTwoY) { + return {{pointOneX, pointOneY}, {pointTwoX, pointTwoY}}; + } + + // ___________________________________________________________________________ + // Take two bounding boxes and
combine them into one bounding box containing + // both + static BasicGeometry::BoundingBox CombineBoundingBoxes( + const BasicGeometry::BoundingBox& b1, + const BasicGeometry::BoundingBox& b2) { + auto minX = [](BasicGeometry::BoundingBox b) -> double { + return b.min_corner().get<0>(); + }; + auto minY = [](BasicGeometry::BoundingBox b) -> double { + return b.min_corner().get<1>(); + }; + auto maxX = [](BasicGeometry::BoundingBox b) -> double { + return b.max_corner().get<0>(); + }; + auto maxY = [](BasicGeometry::BoundingBox b) -> double { + return b.max_corner().get<1>(); + }; + + double globalMinX = minX(b1) < minX(b2) ? minX(b1) : minX(b2); + double globalMinY = minY(b1) < minY(b2) ? minY(b1) : minY(b2); + double globalMaxX = maxX(b1) > maxX(b2) ? maxX(b1) : maxX(b2); + double globalMaxY = maxY(b1) > maxY(b2) ? maxY(b1) : maxY(b2); + + return {{globalMinX, globalMinY}, {globalMaxX, globalMaxY}}; + } + + // ___________________________________________________________________________ + // Return true iff all four corner coordinates of b1 and b2 are equal (exact + // comparison of doubles, no tolerance) + static bool BoundingBoxesAreEqual(BasicGeometry::BoundingBox b1, + BasicGeometry::BoundingBox b2) { + if (BasicGeometry::GetMinX(b1) != BasicGeometry::GetMinX(b2)) return false; + if (BasicGeometry::GetMinY(b1) != BasicGeometry::GetMinY(b2)) return false; + if (BasicGeometry::GetMaxX(b1) != BasicGeometry::GetMaxX(b2)) return false; + if (BasicGeometry::GetMaxY(b1) != BasicGeometry::GetMaxY(b2)) return false; + return true; + } + + // ___________________________________________________________________________ + // Return true if the 0-based position `current` is either the last element + // of a chunk of `splitSize` elements (while fewer than M chunks are complete + // at that point) or the first element of any chunk except the first one. + // NOTE(review): splitSize == 0 would divide by zero; callers presumably + // never pass 0 - confirm. + static bool IsBorderOfSplitCandidate(uint64_t current, uint64_t splitSize, + uint64_t M) { + if (((current + 1) % splitSize == 0 && (current + 1) / splitSize < M) || + (current % splitSize == 0 && current / splitSize >= 1)) + return true; + return false; + } + + // ___________________________________________________________________________ + // Convert a single wkt literal to a datapoint in the format suitable for the + // Rtree + static std::optional ConvertWordToRtreeEntry( + const std::string& wkt) { + /** + * Convert a single wkt literal to a boundingbox.
* Get the bounding box of either a multipolygon, polygon or a linestring + */ + if (!wkt.starts_with("\"MULTIPOLYGON") && !wkt.starts_with("\"POLYGON") && + !wkt.starts_with("\"LINESTRING")) { + return {}; + } + + double maxDouble = std::numeric_limits::max(); + + double minX = maxDouble; + double maxX = -maxDouble; + double minY = maxDouble; + double maxY = -maxDouble; + + // Iterate over matches and capture x and y coordinates + for ( + auto match : ctre::range< + R"( *([\-|\+]?[0-9]+(?:[.][0-9]+)?) +([\-|\+]?[0-9]+(?:[.][0-9]+)?))">( + wkt)) { + double x = std::stod(std::string(match.get<1>())); + double y = std::stod(std::string(match.get<2>())); + + if (x < minX) minX = x; + if (x > maxX) maxX = x; + if (y < minY) minY = y; + if (y > maxY) maxY = y; + } + + // NOTE(review): if the pattern never matches, the box returned below still + // holds the initial sentinels (min = +max double, max = -max double) + // instead of an empty optional. + // NOTE(review): the character class [\-|\+] also accepts a literal '|'; + // a captured "|5" would make std::stod throw std::invalid_argument. + return {BasicGeometry::CreateBoundingBox(minX, minY, maxX, maxY)}; + } +}; + +// ___________________________________________________________________________ +// Data type for a value of the Rtree, which contains the id of the object and +// its bounding box. +struct RTreeValue { + BasicGeometry::BoundingBox box{}; + uint64_t id = 0; + [[nodiscard]] double MinX() const { return box.min_corner().get<0>(); } + [[nodiscard]] double MaxX() const { return box.max_corner().get<0>(); } + [[nodiscard]] double MinY() const { return box.min_corner().get<1>(); } + [[nodiscard]] double MaxY() const { return box.max_corner().get<1>(); } + + bool operator==(const RTreeValue& other) const + { + if (id != other.id) return false; + if (!BasicGeometry::BoundingBoxesAreEqual(box, other.box)) return false; + return true; + } + + // Serialization hook: passes the box and then the id to the archive `a`. + template + void serialize(Archive& a, [[maybe_unused]] const unsigned int version) { + a& box; + a& id; + } +}; + +// ___________________________________________________________________________ +// Data type for a value of the Rtree (id and boundingbox), with the additional +// information of its position in the x- and y-sorting.
This is only used to +// create the Rtree in a more efficient way +struct RTreeValueWithOrderIndex : RTreeValue { + uint64_t orderX = 0; + uint64_t orderY = 0; + + bool operator==(const RTreeValueWithOrderIndex& other) const + { + if (id != other.id) return false; + if (!BasicGeometry::BoundingBoxesAreEqual(box, other.box)) return false; + if (orderX != other.orderX) return false; + if (orderY != other.orderY) return false; + return true; + } +}; + +// ___________________________________________________________________________ +// Non-intrusive serialization of a BoundingBox: save() writes and load() reads +// the four doubles minX, minY, maxX, maxY in this order. +namespace boost::serialization { +template +void save(Archive& a, const BasicGeometry::BoundingBox& b, + [[maybe_unused]] unsigned int version) { + a << b.min_corner().get<0>(); + a << b.min_corner().get<1>(); + a << b.max_corner().get<0>(); + a << b.max_corner().get<1>(); +} +template +void load(Archive& a, BasicGeometry::BoundingBox& b, + [[maybe_unused]] unsigned int version) { + double minX = 0; + a >> minX; + double minY = 0; + a >> minY; + double maxX = 0; + a >> maxX; + double maxY = 0; + a >> maxY; + b = BasicGeometry::BoundingBox(BasicGeometry::Point(minX, minY), + BasicGeometry::Point(maxX, maxY)); +} +} // namespace boost::serialization +BOOST_SERIALIZATION_SPLIT_FREE(BasicGeometry::BoundingBox); + +#endif // QLEVER_RTREEBASICGEOMETRY_H diff --git a/src/util/RtreeBuild.cpp b/src/util/RtreeBuild.cpp new file mode 100644 index 0000000000..ebab1b1928 --- /dev/null +++ b/src/util/RtreeBuild.cpp @@ -0,0 +1,676 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Noah Nock +#include "./Rtree.h" +#include "./RtreeFileReader.h" +#include "./RtreeSorter.h" + +static double costFunctionTGS(BasicGeometry::BoundingBox& b0, + BasicGeometry::BoundingBox& b1, size_t dim) { + /** + * The cost function determines the quality of a split. The lower the cost, + * the better the split. Each split gets represented by the resulting bounding + * boxes of the split pieces.
+ */ + double cost; + + // The cost represents the overlap of the two boxes + if (dim == 0) { + cost = BasicGeometry::GetMaxX(b0) - BasicGeometry::GetMinX(b1); + cost = cost < 0 ? 0 : cost; + } else { + cost = BasicGeometry::GetMaxY(b0) - BasicGeometry::GetMinY(b1); + cost = cost < 0 ? 0 : cost; + } + + return cost; +} + +static std::vector TGSRecursive( + const std::string& filePath, OrderedBoxes* orderedInputRectangles, size_t M, + size_t S, uint64_t maxBuildingRamUsage) { + /** + * This function recursively constructs one layer of children for a certain + * root node. The input rectangles must be sorted in both x- and y-direction. + * The algorithm is based on this paper + * https://dl.acm.org/doi/pdf/10.1145/288692.288723 + */ + + uint64_t n = orderedInputRectangles->GetSize(); + + if (n <= S) { + // stop condition + return std::vector{*orderedInputRectangles}; + } + + // split the rectangles at the best split + std::pair split = + orderedInputRectangles->SplitAtBest(filePath, S, M, maxBuildingRamUsage); + + // recursion + std::vector result0 = + TGSRecursive(filePath + ".0", &split.first, M, S, maxBuildingRamUsage); + std::vector result1 = + TGSRecursive(filePath + ".1", &split.second, M, S, maxBuildingRamUsage); + + std::vector result; + result.insert(result.begin(), result0.begin(), result0.end()); + result.insert(result.end(), result1.begin(), result1.end()); + + return result; +} + +// ___________________________________________________________________________ +// Build the Rtree from the entries in the file `onDiskBase + fileSuffix + +// ".tmp"` and write it into `folder`: nodes.bin holds the serialized nodes and +// lookup.bin holds, for every node id in ascending order, the offset of that +// node in nodes.bin. Nodes with at most M entries become leaves. Returns the +// number of entries that were read. +uint64_t Rtree::BuildTree(const std::string& onDiskBase, + const std::string& fileSuffix, size_t M, + const std::string& folder) const { + const std::filesystem::path file = onDiskBase + fileSuffix + ".tmp"; + + // sort the rectangles + // NOTE(review): both operands of the division below are integers, so the + // quotient is already truncated and std::ceil has no effect. + uint64_t fileLines = + std::ceil(std::filesystem::file_size(file) / + (4 * sizeof(double) + sizeof(uint64_t) + 2 * sizeof(uint64_t))); + bool workInRam = + (std::filesystem::file_size(file) + fileLines * 2 * sizeof(uint64_t)) * + 4 < + this->maxBuildingRamUsage_; + + // NOTE(review): "on disk..." lacks the leading space that " in ram..." has, + // so the message reads "Sortingon disk...". + std::cout << "Sorting" << (workInRam ? " in ram..."
: "on disk...") << std::endl; + OrderedBoxes orderedInputRectangles = + SortInput(onDiskBase, fileSuffix, M, maxBuildingRamUsage_, workInRam); + uint64_t totalSize = orderedInputRectangles.GetSize(); + //OrderedBoxes orderedInputRectangles = InternalSort(onDiskBase, fileSuffix, M); + std::cout << "Finished initial sorting" << std::endl; + // NOTE(review): the three unlabeled size outputs below look like leftover + // debug output. + std::cout << orderedInputRectangles.GetSize() << std::endl; + std::cout << orderedInputRectangles.rectsD0_.rectanglesSmall.size() << std::endl; + std::cout << orderedInputRectangles.rectsD1_.rectanglesSmall.size() << std::endl; + + // prepare the files + std::filesystem::create_directory(folder); + std::ofstream nodesOfs = + std::ofstream(folder + "/nodes.bin", std::ios::binary); + std::map lookup; + + // build the tree in a depth first approach + std::stack layerStack; + + uint64_t newId = 1; // start from 1, because 0 is the root item + ConstructionNode rootItem = ConstructionNode(0, orderedInputRectangles); + layerStack.push(rootItem); + // layerStack.push() stored its own copy of rootItem, so clearing the two + // originals below does not affect the item on the stack. + orderedInputRectangles.Clear(); + rootItem.GetOrderedBoxes().Clear(); + size_t layer = 0; + + while (!layerStack.empty()) { + ConstructionNode currentItem = layerStack.top(); + layerStack.pop(); + + if (currentItem.GetOrderedBoxes().GetSize() <= M) { + // reached a leaf + currentItem.AddChildrenToItem(); + currentItem.SetIsLastInnerNode(true); + uint64_t nodePtr = FileReader::SaveNode(currentItem, nodesOfs); + lookup[currentItem.GetId()] = nodePtr; + } else { + size_t S = std::ceil(((double)currentItem.GetOrderedBoxes().GetSize()) / + ((double)M)); + if (currentItem.GetOrderedBoxes().GetSize() <= M * M) { + // in this case S can be just M + S = M; + } + std::vector tgsResult = TGSRecursive( + onDiskBase + fileSuffix + "."
+ std::to_string(layer), + ¤tItem.GetOrderedBoxes(), M, S, this->maxBuildingRamUsage_); /* NOTE(review): "¤tItem" is presumably a garbled "&currentItem" (character entity damage in this copy of the patch) - confirm. */ + for (OrderedBoxes& currentOrderedRectangles : tgsResult) { + ConstructionNode newItem = + ConstructionNode(newId, currentOrderedRectangles); + layerStack.push(newItem); + + currentItem.AddChild(newItem); + + newId++; + } + + uint64_t nodePtr = FileReader::SaveNode(currentItem, nodesOfs); + lookup[currentItem.GetId()] = nodePtr; + } + layer++; + } + nodesOfs.close(); + + std::ofstream lookupOfs(folder + "/lookup.bin", std::ios::binary); + for (unsigned int i = 0; i < newId; i++) { /* NOTE(review): i is unsigned int while newId is uint64_t. */ + uint64_t nodePtr = lookup[i]; + lookupOfs.write(reinterpret_cast(&nodePtr), sizeof(uint64_t)); + } + lookupOfs.close(); + + return totalSize; +} + +bool OrderedBoxes::WorkInRam() const { return this->workInRam_; } + +void OrderedBoxes::Clear() { /* Reset to an empty state: size 0, a (0, 0, 0, 0) bounding box and cleared rectangle lists for both dimensions. */ + size_ = 0; + boundingBox_ = BasicGeometry::CreateBoundingBox(0, 0, 0, 0); + rectsD0_.Clear(); + rectsD1_.Clear(); +} + +void OrderedBoxes::SetOrderedBoxesToRam(RectanglesForOrderedBoxes rectanglesD0, + RectanglesForOrderedBoxes rectanglesD1, + BasicGeometry::BoundingBox box) { /* Switch to in-memory mode and take over both lists; the size is read from the dimension 0 list. */ + this->workInRam_ = true; + this->rectsD0_ = std::move(rectanglesD0); + this->rectsD1_ = std::move(rectanglesD1); + this->size_ = + std::get(this->rectsD0_.rectangles).size(); + this->boundingBox_ = box; +} + +void OrderedBoxes::SetOrderedBoxesToDisk(RectanglesForOrderedBoxes rectanglesD0, + RectanglesForOrderedBoxes rectanglesD1, + uint64_t size, + BasicGeometry::BoundingBox box) { /* Switch to on-disk mode and take over both lists; the element count is supplied by the caller. */ + this->workInRam_ = false; + this->rectsD0_ = std::move(rectanglesD0); + this->rectsD1_ = std::move(rectanglesD1); + this->size_ = size; + this->boundingBox_ = box; +} + +BasicGeometry::BoundingBox OrderedBoxes::GetBoundingBox() { + return this->boundingBox_; +} + +uint64_t OrderedBoxes::GetSize() const { return this->size_; } + +multiBoxWithOrderIndex OrderedBoxes::GetRectanglesInRam() { + return std::get(this->rectsD0_.rectangles); +} + +std::filesystem::path OrderedBoxes::GetRectanglesOnDisk() { +
return std::get(this->rectsD0_.rectangles); +} + +SplitResult OrderedBoxes::GetBestSplit() { + /** + * Determine based on the "small-lists", which split is the best for the + * rtree. + * Layout of a small list, as read by the loop below: index 0 is the minimum + * element, index 1 is the maximum element, followed by pairs of the elements + * (S * i - 1) and (S * i). + * NOTE(review): the loop bound uses rectsD0_.rectanglesSmall.size() for both + * dimensions - presumably both small lists have the same length; confirm. + */ + struct SplitResult splitResult; + + RTreeValueWithOrderIndex minElement; + RTreeValueWithOrderIndex maxElement; + RTreeValueWithOrderIndex currentLastElement; + RTreeValueWithOrderIndex currentElement; + + // This bool is used, since we need every other element as our element "S * i" + // (described in the algorithm) To perform the split better, the element + // before it (S * i - 1) is saved as well + bool currentlyAtSTimesI = false; + + for (size_t dim = 0; dim < 2; dim++) { + for (uint64_t i = 0; i < this->rectsD0_.rectanglesSmall.size(); i++) { + currentElement = dim == 0 ? this->rectsD0_.rectanglesSmall[i] + : this->rectsD1_.rectanglesSmall[i]; + + if (i == 0) { + // this is the min element + minElement = currentElement; + continue; + } + + if (i == 1) { + // this is the max element + maxElement = currentElement; + continue; + } + + if (!currentlyAtSTimesI) { + currentLastElement = currentElement; + currentlyAtSTimesI = true; + continue; + } + + if (currentElement.id == maxElement.id) { + break; + } + + currentlyAtSTimesI = false; + + // the current element is a possible split position.
+ double minXB0 = minElement.MinX(); + double maxXB0 = currentLastElement.MaxX(); + double minXB1 = currentElement.MinX(); + double maxXB1 = maxElement.MaxX(); + + double minYB0 = minElement.MinY(); + double maxYB0 = currentLastElement.MaxY(); + double minYB1 = currentElement.MinY(); + double maxYB1 = maxElement.MaxY(); + + BasicGeometry::BoundingBox b0 = + BasicGeometry::CreateBoundingBox(minXB0, minYB0, maxXB0, maxYB0); + BasicGeometry::BoundingBox b1 = + BasicGeometry::CreateBoundingBox(minXB1, minYB1, maxXB1, maxYB1); + + double cost = costFunctionTGS(b0, b1, dim); + + if (splitResult.bestCost == -1 || cost < splitResult.bestCost) { + splitResult.bestCost = cost; + splitResult.bestDim = dim; + splitResult.bestLastElement = currentLastElement; + splitResult.bestElement = currentElement; + splitResult.bestMinElement = minElement; + splitResult.bestMaxElement = maxElement; + splitResult.bestIndex = i; + } + } + currentlyAtSTimesI = false; + } + return splitResult; +} + +std::pair OrderedBoxes::SplitAtBest( + const std::filesystem::path& filePath, size_t S, size_t M, + uint64_t maxBuildingRamUsage) { /* Dispatch to the in-memory or the on-disk implementation, depending on workInRam_. */ + if (this->workInRam_) { + return this->SplitAtBestInRam(S, M); + } else { + return this->SplitAtBestOnDisk(filePath, S, M, maxBuildingRamUsage); + } +} + +std::pair OrderedBoxes::SplitAtBestInRam(size_t S, + size_t M) { + /** + * Split the ordered boxes in ram.
First determine the best split and then + * perform it + */ + + struct SplitResult splitResult = this->GetBestSplit(); + + OrderedBoxes split0; + OrderedBoxes split1; + + RectanglesForOrderedBoxes rectsD0Split0; + RectanglesForOrderedBoxes rectsD1Split0; + RectanglesForOrderedBoxes rectsD0Split1; + RectanglesForOrderedBoxes rectsD1Split1; + + struct SplitBuffers splitBuffers = {rectsD0Split0, rectsD1Split0, + rectsD0Split1, rectsD1Split1}; + + std::pair + boundingBoxes = PerformSplit(splitResult, splitBuffers, M, S); + + split0.SetOrderedBoxesToRam(rectsD0Split0, rectsD1Split0, + boundingBoxes.first); + split1.SetOrderedBoxesToRam(rectsD0Split1, rectsD1Split1, + boundingBoxes.second); + + return std::make_pair(split0, split1); +} + +std::pair OrderedBoxes::SplitAtBestOnDisk( + const std::string& filePath, size_t S, size_t M, + uint64_t maxBuildingRamUsage) { + /** + * Split the ordered boxes on disk. First determine the best split and then + * perform it + * NOTE(review): sizeLeft/sizeRight and the "fits in ram" decision are + * computed here and once more inside PerformSplit with equivalent formulas; + * the two places must be kept in sync. + */ + + OrderedBoxes split0; + OrderedBoxes split1; + + struct SplitResult splitResult = this->GetBestSplit(); + + RectanglesForOrderedBoxes rectsD0Split0; + RectanglesForOrderedBoxes rectsD1Split0; + RectanglesForOrderedBoxes rectsD0Split1; + RectanglesForOrderedBoxes rectsD1Split1; + struct SplitBuffers splitBuffers = {rectsD0Split0, rectsD1Split0, + rectsD0Split1, rectsD1Split1}; + + // perform the split + auto sizeLeft = + (uint64_t)(std::ceil(((double)splitResult.bestIndex - 2.0) / 2.0) * + (double)S); + uint64_t sizeRight = this->size_ - sizeLeft; + uint64_t split0ByteSize = + sizeLeft * (4 * sizeof(double) + sizeof(uint64_t) + 2 * sizeof(uint64_t)); + uint64_t split1ByteSize = sizeRight * (4 * sizeof(double) + sizeof(uint64_t) + + 2 * sizeof(uint64_t)); + bool split0InRam = split0ByteSize * 4 < maxBuildingRamUsage; + bool split1InRam = split1ByteSize * 4 < maxBuildingRamUsage; + + if (!split0InRam) { + splitBuffers.rectsD0Split0.rectangles = filePath + ".0.dim0.tmp"; + splitBuffers.rectsD1Split0.rectangles
= filePath + ".0.dim1.tmp"; + } + + if (!split1InRam) { + splitBuffers.rectsD0Split1.rectangles = filePath + ".1.dim0.tmp"; + splitBuffers.rectsD1Split1.rectangles = filePath + ".1.dim1.tmp"; + } + + std::pair + boundingBoxes = + PerformSplit(splitResult, splitBuffers, M, S, maxBuildingRamUsage); + + if (!split0InRam) { + split0.SetOrderedBoxesToDisk(rectsD0Split0, rectsD1Split0, sizeLeft, + boundingBoxes.first); + } else { + split0.SetOrderedBoxesToRam(rectsD0Split0, rectsD1Split0, + boundingBoxes.first); + } + + if (!split1InRam) { + split1.SetOrderedBoxesToDisk(rectsD0Split1, rectsD1Split1, sizeRight, + boundingBoxes.second); + } else { + split1.SetOrderedBoxesToRam(rectsD0Split1, rectsD1Split1, + boundingBoxes.second); + } /* The two calls below delete the files named by this object's own dimension 0 and dimension 1 lists. */ + + std::remove( + std::get(this->rectsD0_.rectangles).c_str()); + std::remove( + std::get(this->rectsD1_.rectangles).c_str()); + + return std::make_pair(split0, split1); +} + +std::pair +OrderedBoxes::PerformSplit(SplitResult splitResult, SplitBuffers& splitBuffers, + size_t M, size_t S, uint64_t maxBuildingRamUsage) { + /** + * Perform the best split on the current ordered boxes in the disk case + * + * Every element of both orderings is appended to the matching output in + * splitBuffers. The function returns the bounding boxes of the two halves + * and clears this object at the end. + * SplitAtBestInRam also calls this function, without maxBuildingRamUsage; + * presumably the declaration defaults that parameter to 0, which makes + * split0InRam and split1InRam true below - confirm in the header. + */ + + auto sizeLeft = + (uint64_t)(std::ceil(((double)splitResult.bestIndex - 2.0) / 2.0) * + (double)S); + uint64_t sizeRight = this->size_ - sizeLeft; + size_t SSplit0 = + sizeLeft <= S ? (size_t)std::ceil((double)sizeLeft / (double)M) : S; + if (sizeLeft <= S && sizeLeft <= M * M) { + SSplit0 = M; + } + size_t SSplit1 = + sizeRight <= S ?
(size_t)std::ceil((double)sizeRight / (double)M) : S; + if (sizeRight <= S && sizeRight <= M * M) { + SSplit1 = M; + } + uint64_t split0ByteSize = + sizeLeft * (4 * sizeof(double) + sizeof(uint64_t) + 2 * sizeof(uint64_t)); + uint64_t split1ByteSize = sizeRight * (4 * sizeof(double) + sizeof(uint64_t) + + 2 * sizeof(uint64_t)); + bool split0InRam = + maxBuildingRamUsage == 0 || split0ByteSize * 4 < maxBuildingRamUsage; + bool split1InRam = + maxBuildingRamUsage == 0 || split1ByteSize * 4 < maxBuildingRamUsage; + + BasicGeometry::BoundingBox boxSplit0 = + BasicGeometry::CreateBoundingBox(0, 0, 0, 0); + BasicGeometry::BoundingBox boxSplit1 = + BasicGeometry::CreateBoundingBox(0, 0, 0, 0); + + RTreeValueWithOrderIndex minSplit0OtherDim; + RTreeValueWithOrderIndex maxSplit0OtherDim; + RTreeValueWithOrderIndex minSplit1OtherDim; + RTreeValueWithOrderIndex maxSplit1OtherDim; + + struct OtherDimension { /* The small lists of both halves for the dimension that is not the split dimension. */ + multiBoxWithOrderIndex* smallSplit0; + multiBoxWithOrderIndex* smallSplit1; + } otherDimension{}; + + auto pushSmallBoundaries = [splitResult]( + multiBoxWithOrderIndex& smallListS0, + multiBoxWithOrderIndex& smallListS1) { + smallListS0.push_back(splitResult.bestMinElement); + smallListS0.push_back(splitResult.bestLastElement); + smallListS1.push_back(splitResult.bestElement); + smallListS1.push_back(splitResult.bestMaxElement); + }; + + if (splitResult.bestDim == 0) { + pushSmallBoundaries(splitBuffers.rectsD0Split0.rectanglesSmall, + splitBuffers.rectsD0Split1.rectanglesSmall); + + // placeholder, since we need the min and max element of the split in the + otherDimension.smallSplit0 = &splitBuffers.rectsD1Split0.rectanglesSmall; + otherDimension.smallSplit1 = &splitBuffers.rectsD1Split1.rectanglesSmall; + // first two spots + otherDimension.smallSplit0->emplace_back(); + otherDimension.smallSplit0->emplace_back(); + otherDimension.smallSplit1->emplace_back(); + otherDimension.smallSplit1->emplace_back(); + } else { +
pushSmallBoundaries(splitBuffers.rectsD1Split0.rectanglesSmall, + splitBuffers.rectsD1Split1.rectanglesSmall); + + // placeholder + otherDimension.smallSplit0 = &splitBuffers.rectsD0Split0.rectanglesSmall; + otherDimension.smallSplit1 = &splitBuffers.rectsD0Split1.rectanglesSmall; + + otherDimension.smallSplit0->emplace_back(); + otherDimension.smallSplit0->emplace_back(); + otherDimension.smallSplit1->emplace_back(); + otherDimension.smallSplit1->emplace_back(); + } + + std::optional elementOpt; + // In the in-memory case both paths stay empty; the readers are then not + // read from and default constructed (end) iterators are used instead. + std::filesystem::path rectsD0Path = + !this->workInRam_ + ? std::get(this->rectsD0_.rectangles) + : ""; + std::filesystem::path rectsD1Path = + !this->workInRam_ + ? std::get(this->rectsD1_.rectangles) + : ""; + FileReader fileReaderDim0 = FileReader(rectsD0Path); + FileReader fileReaderDim1 = FileReader(rectsD1Path); + FileReader::iterator fileReaderDim0Iterator = + !rectsD0Path.empty() ? fileReaderDim0.begin() : FileReader::iterator(); + FileReader::iterator fileReaderDim1Iterator = + !rectsD1Path.empty() ?
fileReaderDim1.begin() : FileReader::iterator(); + uint64_t currentXSplit0 = 0; + uint64_t currentXSplit1 = 0; + uint64_t currentYSplit0 = 0; + uint64_t currentYSplit1 = 0; + + std::optional rectanglesOnDiskS0D0Stream = {}; + std::optional rectanglesOnDiskS0D1Stream = {}; + std::optional rectanglesOnDiskS1D0Stream = {}; + std::optional rectanglesOnDiskS1D1Stream = {}; + if (!split0InRam && !this->workInRam_) { + rectanglesOnDiskS0D0Stream = std::ofstream( + std::get(splitBuffers.rectsD0Split0.rectangles), + std::ios::binary); + rectanglesOnDiskS0D1Stream = std::ofstream( + std::get(splitBuffers.rectsD1Split0.rectangles), + std::ios::binary); + } + if (!split1InRam && !this->workInRam_) { + rectanglesOnDiskS1D0Stream = std::ofstream( + std::get(splitBuffers.rectsD0Split1.rectangles), + std::ios::binary); + rectanglesOnDiskS1D1Stream = std::ofstream( + std::get(splitBuffers.rectsD1Split1.rectangles), + std::ios::binary); + } + + auto performCertainSplit = + [M, &splitBuffers, &splitResult, &rectanglesOnDiskS0D0Stream, + &rectanglesOnDiskS0D1Stream, &rectanglesOnDiskS1D0Stream, + &rectanglesOnDiskS1D1Stream]( + size_t dim, size_t split, uint64_t& current, size_t& currentSplitSize, + RTreeValueWithOrderIndex& minElement, + RTreeValueWithOrderIndex& maxElement, bool currentSplitInRam, + bool workInRam, RTreeValueWithOrderIndex& element, + BasicGeometry::BoundingBox& box) { /* Append `element` to the output of the given (dim, split) pair, record it in the small list if it is a chunk border, update `box` and the min/max trackers, and increment `current`. */ + multiBoxWithOrderIndex* currentSmallList; + // current list is either in ram or on disk + std::variant currentList; + + if (split == 0) { + if (dim == 0) { + currentSmallList = &splitBuffers.rectsD0Split0.rectanglesSmall; + if (currentSplitInRam || workInRam) { + currentList = &std::get( + splitBuffers.rectsD0Split0.rectangles); + } else { + currentList = &rectanglesOnDiskS0D0Stream.value(); + } + } else { + currentSmallList = &splitBuffers.rectsD1Split0.rectanglesSmall; + if (currentSplitInRam || workInRam) { + currentList = &std::get( + splitBuffers.rectsD1Split0.rectangles); + } else { +
currentList = &rectanglesOnDiskS0D1Stream.value(); + } + } + } else { + if (dim == 0) { + currentSmallList = &splitBuffers.rectsD0Split1.rectanglesSmall; + if (currentSplitInRam || workInRam) { + currentList = &std::get( + splitBuffers.rectsD0Split1.rectangles); + } else { + currentList = &rectanglesOnDiskS1D0Stream.value(); + } + } else { + currentSmallList = &splitBuffers.rectsD1Split1.rectanglesSmall; + if (currentSplitInRam || workInRam) { + currentList = &std::get( + splitBuffers.rectsD1Split1.rectangles); + } else { + currentList = &rectanglesOnDiskS1D1Stream.value(); + } + } + } + + // add the element to the current split dimension 0/1 vector / file + if (currentSplitInRam || workInRam) { + std::get(currentList)->push_back(element); + } else { + FileReader::SaveEntryWithOrderIndex( + element, *(std::get(currentList))); + } + + // check if the element is at the position i * S (described in the + // algorithm) or one before it. In this case it is a future possible + // split position and needs to be saved to the "small list" + if (BasicGeometry::IsBorderOfSplitCandidate(current, currentSplitSize, + M)) { + // index i * S - 1 or i * S + currentSmallList->push_back(element); + } + + // update the boundingbox to get the whole boundingbox of the split + if (dim == 0) { + if (current == 0) { + box = element.box; + } else { + box = BasicGeometry::CombineBoundingBoxes(box, element.box); + } + } + + // keep track of the min and max element of the split, to later + // replace the placeholder in the "small lists" + if (splitResult.bestDim == 1 - dim) { + if (current == 0) { + minElement = element; + } + // max element gets updated each time, because the elements are sorted + // in an ascending way + maxElement = element; + } + + current++; + }; + + for (size_t dim = 0; dim < 2; dim++) { + // start performing the actual split + uint64_t i = 0; + + if (!this->workInRam_) { + if (dim == 0 && fileReaderDim0Iterator != FileReader::end()) + elementOpt =
*fileReaderDim0Iterator; + if (dim == 1 && fileReaderDim1Iterator != FileReader::end()) + elementOpt = *fileReaderDim1Iterator; + } + + while ((this->workInRam_ && i < this->size_) || + (!this->workInRam_ && elementOpt)) { + RTreeValueWithOrderIndex element; + + // get the current element, either from disk or from ram + if (this->workInRam_) { + element = + dim == 0 + ? std::get(this->rectsD0_.rectangles)[i] + : std::get( + this->rectsD1_.rectangles)[i]; + } else { + element = elementOpt.value(); + } + + // Elements ordered before bestElement in the split dimension go to + // split 0, all other elements go to split 1. + if ((splitResult.bestDim == 0 && + element.orderX < splitResult.bestElement.orderX) || + (splitResult.bestDim == 1 && + element.orderY < splitResult.bestElement.orderY)) { + // the element belongs to split 0 + + if (dim == 0) { + performCertainSplit(0, 0, currentXSplit0, SSplit0, minSplit0OtherDim, + maxSplit0OtherDim, split0InRam, this->workInRam_, + element, boxSplit0); + } else { + performCertainSplit(1, 0, currentYSplit0, SSplit0, minSplit0OtherDim, + maxSplit0OtherDim, split0InRam, this->workInRam_, + element, boxSplit0); + } + } else { + // the element belongs to split 1 + + if (dim == 0) { + performCertainSplit(0, 1, currentXSplit1, SSplit1, minSplit1OtherDim, + maxSplit1OtherDim, split1InRam, this->workInRam_, + element, boxSplit1); + } else { + performCertainSplit(1, 1, currentYSplit1, SSplit1, minSplit1OtherDim, + maxSplit1OtherDim, split1InRam, this->workInRam_, + element, boxSplit1); + } + } + i++; + + if (!this->workInRam_) { + if (dim == 0 && ++fileReaderDim0Iterator != FileReader::end()) { + elementOpt = *fileReaderDim0Iterator; + continue; + } + if (dim == 1 && ++fileReaderDim1Iterator != FileReader::end()) { + elementOpt = *fileReaderDim1Iterator; + continue; + } + } + elementOpt = {}; + } + } + + // replace the placeholder + (*otherDimension.smallSplit0)[0] = minSplit0OtherDim; + (*otherDimension.smallSplit0)[1] = maxSplit0OtherDim; + (*otherDimension.smallSplit1)[0] = minSplit1OtherDim; + (*otherDimension.smallSplit1)[1] = maxSplit1OtherDim; + +
this->Clear(); + + return std::make_pair(boxSplit0, boxSplit1); +} diff --git a/src/util/RtreeFileReader.cpp b/src/util/RtreeFileReader.cpp new file mode 100644 index 0000000000..e449314ba1 --- /dev/null +++ b/src/util/RtreeFileReader.cpp @@ -0,0 +1,124 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Noah Nock + +#include "./RtreeFileReader.h" + +#include +#include +#include + +#include "./Rtree.h" + +// ___________________________________________________________________________ +// Read the next entry (bounding box, id, orderX, orderY) from the stream. If +// the stream is in a failed state or any of the reads fails, the iterator is +// marked invalid, which makes it compare equal to end(). +FileReader::iterator& FileReader::iterator::operator++() { + BasicGeometry::BoundingBox box; + uint64_t id; + uint64_t orderX; + uint64_t orderY; + if (input_ && + input_.read(reinterpret_cast(&box), + sizeof(BasicGeometry::BoundingBox)) && + input_.read(reinterpret_cast(&id), sizeof(uint64_t)) && + input_.read(reinterpret_cast(&orderX), sizeof(uint64_t)) && + input_.read(reinterpret_cast(&orderY), sizeof(uint64_t))) { + currentElement_ = {{box, id}, orderX, orderY}; + valid_ = true; + } else { + valid_ = false; + } + return *this; +} + +// ___________________________________________________________________________ +// Same as above for entries without order index (bounding box and id only). +FileReaderWithoutIndex::iterator& +FileReaderWithoutIndex::iterator::operator++() { + BasicGeometry::BoundingBox box; + uint64_t id; + if (input_ && + input_.read(reinterpret_cast(&box), + sizeof(BasicGeometry::BoundingBox)) && + input_.read(reinterpret_cast(&id), sizeof(uint64_t))) { + currentElement_ = {box, id}; + valid_ = true; + } else { + valid_ = false; + } + return *this; +} + +// ___________________________________________________________________________ +// Serialize `node` with a binary archive at the current write position of +// `nodesOfs`, append a single space character and return the position at which +// the node starts. +uint64_t FileReader::SaveNode(RtreeNode& node, std::ofstream& nodesOfs) { + uint64_t pos = static_cast(nodesOfs.tellp()); + boost::archive::binary_oarchive archive(nodesOfs); + archive << node; + nodesOfs.write(" ", 1); + + return pos; +} + +// ___________________________________________________________________________ +// Read the file offset of node `id` from the lookup stream (one uint64_t per +// id) and deserialize the node found at that offset of the nodes stream. +// NOTE(review): the results of the seekg() and read() calls are not checked. +RtreeNode FileReader::LoadNode(uint64_t id, std::ifstream& lookupIfs, + std::ifstream& nodesIfs) { + RtreeNode newNode; + + uint64_t offset = id * (uint64_t)sizeof(uint64_t); + lookupIfs.seekg((long long)offset, std::ios::beg); + + uint64_t nodePtr; + lookupIfs.read(reinterpret_cast(&nodePtr), sizeof(uint64_t)); + + nodesIfs.seekg((long long)nodePtr); +
boost::archive::binary_iarchive ia(nodesIfs); + ia >> newNode; + + return newNode; +} + +void FileReaderWithoutIndex::SaveEntry(BasicGeometry::BoundingBox boundingBox, + uint64_t index, + std::ofstream& convertOfs) { + /** + * Save a single entry (which was e.g. converted by ConvertWordToRtreeEntry) + * to the disk + * Layout: the raw bytes of the bounding box followed by the uint64_t index. + */ + static_assert(std::is_trivially_copyable_v); + convertOfs.write(reinterpret_cast(&boundingBox), + sizeof(BasicGeometry::BoundingBox)); + convertOfs.write(reinterpret_cast(&index), sizeof(uint64_t)); +} + +void FileReader::SaveEntryWithOrderIndex(RTreeValueWithOrderIndex treeValue, + std::ofstream& convertOfs) { + /** + * Save a single entry, containing its position in the x- and y-sorting + * Layout: the raw bytes of the bounding box, then id, orderX and orderY as + * uint64_t each. + */ + + static_assert(std::is_trivially_copyable_v); + convertOfs.write(reinterpret_cast(&treeValue.box), + sizeof(BasicGeometry::BoundingBox)); + convertOfs.write(reinterpret_cast(&treeValue.id), + sizeof(uint64_t)); + convertOfs.write(reinterpret_cast(&treeValue.orderX), + sizeof(uint64_t)); + convertOfs.write(reinterpret_cast(&treeValue.orderY), + sizeof(uint64_t)); +} + +multiBoxGeo FileReaderWithoutIndex::LoadEntries( + const std::filesystem::path& file) { /* Read every entry of `file` into a vector, in file order. */ + multiBoxGeo boxes; + + for (const RTreeValue& element : FileReaderWithoutIndex(file)) { + boxes.push_back(element); + } + + return boxes; +} + +multiBoxWithOrderIndex FileReader::LoadEntriesWithOrderIndex( + const std::filesystem::path& file) { /* Read every entry of `file` into a vector, in file order. */ + multiBoxWithOrderIndex boxes; + + for (const RTreeValueWithOrderIndex& element : FileReader(file)) { + boxes.push_back(element); + } + + return boxes; +} diff --git a/src/util/RtreeFileReader.h b/src/util/RtreeFileReader.h new file mode 100644 index 0000000000..7dc744a45d --- /dev/null +++ b/src/util/RtreeFileReader.h @@ -0,0 +1,114 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures.
+// Author: Noah Nock + +#ifndef QLEVER_RTREEFILEREADER_H +#define QLEVER_RTREEFILEREADER_H + +#include "./Rtree.h" +#include "./RtreeNode.h" + +// Static helpers to save and load Rtree entries (with their order index) and +// Rtree nodes, plus a single pass iterator over the entries of one file. +class FileReader { + public: + // ___________________________________________________________________________ + // Save a single datapoint of the Rtree, together with its position in the x + // and y sorting to disk + static void SaveEntryWithOrderIndex(RTreeValueWithOrderIndex treeValue, + std::ofstream& convertOfs); + // ___________________________________________________________________________ + // Load all datapoints of the Rtree, together with its x and y sorting into + // ram + static multiBoxWithOrderIndex LoadEntriesWithOrderIndex( + const std::filesystem::path& file); + // ___________________________________________________________________________ + // Save the current node in the building process to disk and return the + // position of the node in the file + static uint64_t SaveNode(RtreeNode& node, std::ofstream& nodesOfs); + // ___________________________________________________________________________ + // Load a specific RtreeNode to query in its children + static RtreeNode LoadNode(uint64_t id, std::ifstream& lookupIfs, + std::ifstream& nodesIfs); + + // Open `filename` for reading; the entries are accessed via begin()/end(). + explicit FileReader(const std::filesystem::path& filename) + : file_(filename) {} + + // ___________________________________________________________________________ + // Single pass iterator over the entries of the file. The constructor that + // takes a stream reads the first element immediately. + // NOTE(review): the base class `std::iterator` is deprecated since C++17, and + // its template arguments are missing in this copy of the patch. + // NOTE(review): input_ of a default constructed (end) iterator refers to that + // iterator's own nullstream_ member, so a moved-to copy of such an iterator + // would keep referring to the member of the source object. + class iterator + : public std::iterator { + public: + explicit iterator(std::ifstream& in) : input_(in) { + ++(*this); // Read the first element + } + + iterator() : input_(nullstream_) {} // End iterator constructor + + iterator& operator++(); + + const RTreeValueWithOrderIndex& operator*() const { + return currentElement_; + } + + // Only the valid_ flags are compared: two iterators are unequal iff exactly + // one of them is valid. This is sufficient for comparisons against end(). + bool operator!=(const iterator& other) const { + return valid_ != other.valid_; + } + + private: + std::ifstream& input_; + std::ifstream nullstream_; // A dummy stream for the end iterator + RTreeValueWithOrderIndex currentElement_; + bool valid_{}; + }; + + iterator begin() { return iterator(file_); } + + static iterator end() { return {}; } + +
private: + std::ifstream file_; +}; + +class FileReaderWithoutIndex { + public: + // ___________________________________________________________________________ + // Save a single datapoint for the Rtree to disk + static void SaveEntry(BasicGeometry::BoundingBox boundingBox, uint64_t index, + std::ofstream& convertOfs); + // ___________________________________________________________________________ + // Load all datapoints of the Rtree in file into ram + static multiBoxGeo LoadEntries(const std::filesystem::path& file); + + // Open `filename` for reading; the entries are accessed via begin()/end(). + explicit FileReaderWithoutIndex(const std::filesystem::path& filename) + : file_(filename) {} + + // ___________________________________________________________________________ + // Single pass iterator over the entries of the file. It has the same + // structure as FileReader::iterator above, including the points marked + // NOTE(review) there. + class iterator + : public std::iterator { + public: + explicit iterator(std::ifstream& in) : input_(in) { + ++(*this); // Read the first element + } + + iterator() : input_(nullstream_) {} // End iterator constructor + + iterator& operator++(); + + const RTreeValue& operator*() const { return currentElement_; } + + // Only the valid_ flags are compared, see FileReader::iterator. + bool operator!=(const iterator& other) const { + return valid_ != other.valid_; + } + + private: + std::ifstream& input_; + std::ifstream nullstream_; // A dummy stream for the end iterator + RTreeValue currentElement_; + bool valid_{}; + }; + + iterator begin() { return iterator(file_); } + + static iterator end() { return {}; } + + private: + std::ifstream file_; +}; + +#endif // QLEVER_RTREEFILEREADER_H diff --git a/src/util/RtreeNode.cpp b/src/util/RtreeNode.cpp new file mode 100644 index 0000000000..2453678df1 --- /dev/null +++ b/src/util/RtreeNode.cpp @@ -0,0 +1,69 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures.
+// Author: Noah Nock + +#include "./RtreeNode.h" + +#include "./RtreeFileReader.h" + +ConstructionNode::ConstructionNode(uint64_t id, OrderedBoxes orderedBoxes) + : RtreeNode{id} { + this->orderedBoxes_ = orderedBoxes; + // calculate the boundingBoxes + this->boundingBox_ = orderedBoxes.GetBoundingBox(); +} + +void ConstructionNode::AddChildrenToItem() { + /** + * Add all children of a certain node at once. + * This is used when a leaf node is reached. + */ + if (this->GetOrderedBoxes().WorkInRam()) { + for (RTreeValueWithOrderIndex box : + this->GetOrderedBoxes().GetRectanglesInRam()) { + RtreeNode leafNode = RtreeNode(box.id, box.box); + this->AddChild(leafNode); + } + } else { + for (const RTreeValueWithOrderIndex& element : + FileReader(this->GetOrderedBoxes().GetRectanglesOnDisk())) { + RtreeNode leafNode = RtreeNode(element.id, element.box); + this->AddChild(leafNode); + } + } +} + +OrderedBoxes& ConstructionNode::GetOrderedBoxes() { + return this->orderedBoxes_; +} + +void RtreeNode::AddChild(RtreeNode& child) { + BasicGeometry::BoundingBox box = child.GetBoundingBox(); + uint64_t entryId = child.GetId(); + RTreeValue entry = {box, entryId}; + this->children_.push_back(entry); +} + +BasicGeometry::BoundingBox RtreeNode::GetBoundingBox() const { + return this->boundingBox_; +} + +void RtreeNode::SetIsLastInnerNode(bool isLast) { + this->isLastInnerNode_ = isLast; +} + +uint64_t RtreeNode::GetId() const { return this->id_; } + +RtreeNode::RtreeNode() = default; + +RtreeNode::RtreeNode(uint64_t id, BasicGeometry::BoundingBox boundingBox, + bool isLastInnerNode, multiBoxGeo children) { + this->id_ = id; + this->boundingBox_ = boundingBox; + this->children_ = std::move(children); + this->isLastInnerNode_ = isLastInnerNode; +} + +bool RtreeNode::GetIsLastInnerNode() const { return this->isLastInnerNode_; } + +multiBoxGeo RtreeNode::GetChildren() { return this->children_; } diff --git a/src/util/RtreeNode.h b/src/util/RtreeNode.h new file mode 100644 index 
0000000000..572d22e084 --- /dev/null +++ b/src/util/RtreeNode.h @@ -0,0 +1,67 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Noah Nock + +#ifndef QLEVER_NODE_H +#define QLEVER_NODE_H + +#include "./Rtree.h" + +// ___________________________________________________________________________ +// Data structure representing a single node of the Rtree containing the +// boundingbox and its children +class RtreeNode { + protected: + friend class boost::serialization::access; + uint64_t id_{}; + BasicGeometry::BoundingBox boundingBox_{}; + bool isLastInnerNode_ = + false; // when true, this means that the node is the last inner node and + // all of its children are leafs + multiBoxGeo children_; + + template + void serialize(Archive& a, [[maybe_unused]] const unsigned int version) { + a& id_; + a& isLastInnerNode_; + a& boundingBox_; + a& children_; + } + + public: + RtreeNode(); + explicit RtreeNode(uint64_t id, BasicGeometry::BoundingBox boundingBox = {}, + bool isLastInnerNode = false, multiBoxGeo children = {}); + [[nodiscard]] uint64_t GetId() const; + [[nodiscard]] BasicGeometry::BoundingBox GetBoundingBox() const; + void AddChild(RtreeNode& child); + void SetIsLastInnerNode(bool isLast); + [[nodiscard]] bool GetIsLastInnerNode() const; + multiBoxGeo GetChildren(); + + bool operator==(const RtreeNode& other) const + { + if (id_ != other.id_) return false; + if (!BasicGeometry::BoundingBoxesAreEqual(boundingBox_, other.boundingBox_)) return false; + if (isLastInnerNode_ != other.isLastInnerNode_) return false; + if (children_ != other.children_) return false; + return true; + } +}; + +BOOST_CLASS_VERSION(RtreeNode, 1) + +// ___________________________________________________________________________ +// Subclass of the RtreeNode only needed while constructing the Rtree (it keeps +// track of the remaining OrderedBoxes of the subtree) +class ConstructionNode : public RtreeNode { + private: + OrderedBoxes 
orderedBoxes_; + + public: + ConstructionNode(uint64_t id, OrderedBoxes orderedBoxes); + OrderedBoxes& GetOrderedBoxes(); + void AddChildrenToItem(); +}; + +#endif // QLEVER_NODE_H diff --git a/src/util/RtreeSorter.cpp b/src/util/RtreeSorter.cpp new file mode 100644 index 0000000000..bfcd126bd1 --- /dev/null +++ b/src/util/RtreeSorter.cpp @@ -0,0 +1,302 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Noah Nock + +#include + +#include "./RtreeFileReader.h" + +template +struct SortRuleLambda { + // comparison function + bool operator()(const RTreeValue& b1, const RTreeValue& b2) const { + double center1 = dimension == 0 ? std::midpoint(b1.MinX(), b1.MaxX()) + : std::midpoint(b1.MinY(), b1.MaxY()); + double center2 = dimension == 0 ? std::midpoint(b2.MinX(), b2.MaxX()) + : std::midpoint(b2.MinY(), b2.MaxY()); + return center1 < center2; + } + + // Value that is strictly smaller than any input element. + static RTreeValue min_value() { + return { + BasicGeometry::CreateBoundingBox(-std::numeric_limits::max(), + -std::numeric_limits::max(), + -std::numeric_limits::max(), + -std::numeric_limits::max()), + 0}; + } + + // Value that is strictly larger than any input element. + static RTreeValue max_value() { + return { + BasicGeometry::CreateBoundingBox(std::numeric_limits::max(), + std::numeric_limits::max(), + std::numeric_limits::max(), + std::numeric_limits::max()), + 0}; + } +}; + +template +struct SortRuleLambdaWithIndex { + uint64_t RTreeValueWithOrderIndex::*orderSelected = + dimension == 0 ? &RTreeValueWithOrderIndex::orderX + : &RTreeValueWithOrderIndex::orderY; + + // comparison function + bool operator()(const RTreeValueWithOrderIndex& b1, + const RTreeValueWithOrderIndex& b2) const { + double center1 = dimension == 0 ? std::midpoint(b1.MinX(), b1.MaxX()) + : std::midpoint(b1.MinY(), b1.MaxY()); + double center2 = dimension == 0 ? 
std::midpoint(b2.MinX(), b2.MaxX()) + : std::midpoint(b2.MinY(), b2.MaxY()); + + if (b1.*orderSelected == b2.*orderSelected) return center1 < center2; + return b1.*orderSelected < b2.*orderSelected; + } + + // Value that is strictly smaller than any input element. + static RTreeValueWithOrderIndex min_value() { + return { + {BasicGeometry::CreateBoundingBox(-std::numeric_limits::max(), + -std::numeric_limits::max(), + -std::numeric_limits::max(), + -std::numeric_limits::max()), + 0}, + 0, + 0}; + } + + // Value that is strictly larger than any input element. + static RTreeValueWithOrderIndex max_value() { + return { + {BasicGeometry::CreateBoundingBox(std::numeric_limits::max(), + std::numeric_limits::max(), + std::numeric_limits::max(), + std::numeric_limits::max()), + 0}, + std::numeric_limits::max(), + std::numeric_limits::max()}; + } +}; + +static void centerOrdering(multiBoxGeo& boxes, size_t dim) { + if (dim == 0) { + // order by centerX + std::sort(boxes.begin(), boxes.end(), SortRuleLambda<0>{}); + } else { + // order by centerY + std::sort(boxes.begin(), boxes.end(), SortRuleLambda<1>{}); + } +} + +static void centerOrdering(multiBoxWithOrderIndex& boxes, size_t dim) { + if (dim == 0) { + // order by centerX + std::sort(boxes.begin(), boxes.end(), SortRuleLambdaWithIndex<0>{}); + } else { + // order by centerY + std::sort(boxes.begin(), boxes.end(), SortRuleLambdaWithIndex<1>{}); + } +} + +RTreeValueWithOrderIndex handleD0ElementsWithoutIndex(RTreeValue& element, uint64_t i, BasicGeometry::BoundingBox& boundingBox) { + RTreeValueWithOrderIndex entry = {{element.box, element.id}, i, 0}; + if (i == 0) { + boundingBox = element.box; + } else { + boundingBox = + BasicGeometry::CombineBoundingBoxes(boundingBox, element.box); // TODO + } + return entry; +} + +void handleD1Elements(RTreeValueWithOrderIndex& element, uint64_t i, uint64_t S, size_t M, multiBoxWithOrderIndex& r1Small) { + element.orderY = i; + + if (BasicGeometry::IsBorderOfSplitCandidate(i, S, M)) 
{ + // index i * S - 1 or i * S + r1Small.push_back(element); + } +} + +void handleD0Elements(RTreeValueWithOrderIndex& element, uint64_t i, uint64_t S, size_t M, multiBoxWithOrderIndex& r0Small) { + if (BasicGeometry::IsBorderOfSplitCandidate(i, S, M)) { + // index i * S - 1 or i * S + r0Small.push_back(element); + } +} + +OrderedBoxes InternalSort(const std::string& onDiskBase, const std::string& fileSuffix, size_t M) { + OrderedBoxes orderedInputRectangles; + multiBoxGeo RectanglesD0 = FileReaderWithoutIndex::LoadEntries(onDiskBase + fileSuffix + ".tmp"); + centerOrdering(RectanglesD0, 0); + + size_t currentS = std::ceil(((float) RectanglesD0.size()) / ((float) M)); + if (RectanglesD0.size() <= M * M) { + // in this case S can just be M + currentS = M; + } + + multiBoxWithOrderIndex R0Small = multiBoxWithOrderIndex(); + multiBoxWithOrderIndex R1Small = multiBoxWithOrderIndex(); + + BasicGeometry::BoundingBox boundingBox = BasicGeometry::CreateBoundingBox(0, 0, 0, 0); + multiBoxWithOrderIndex RectanglesD1WithOrder = multiBoxWithOrderIndex(); + for (uint64_t i = 0; i < RectanglesD0.size(); i++) { + RTreeValueWithOrderIndex entry = handleD0ElementsWithoutIndex(RectanglesD0[i], i, boundingBox); + RectanglesD1WithOrder.push_back(entry); + } + + centerOrdering(RectanglesD1WithOrder, 1); + + R1Small.push_back((RectanglesD1WithOrder)[0]); + RTreeValueWithOrderIndex maxElementDim1 = (RectanglesD1WithOrder)[RectanglesD1WithOrder.size() - 1]; + maxElementDim1.orderY = RectanglesD1WithOrder.size() - 1; + R1Small.push_back(maxElementDim1); + for (uint64_t i = 0; i < RectanglesD1WithOrder.size(); i++) { + handleD1Elements(RectanglesD1WithOrder[i], i, currentS, M, R1Small); + } + + multiBoxWithOrderIndex RectanglesD0WithOrder = multiBoxWithOrderIndex(RectanglesD1WithOrder); + centerOrdering(RectanglesD0WithOrder, 0); + + R0Small.push_back((RectanglesD0WithOrder)[0]); + RTreeValueWithOrderIndex maxElementDim0 = (RectanglesD0WithOrder)[RectanglesD0WithOrder.size() - 1]; + 
maxElementDim0.orderY = RectanglesD0WithOrder.size() - 1; + R0Small.push_back(maxElementDim0); + for (uint64_t i = 0; i < RectanglesD0WithOrder.size(); i++) { + handleD0Elements(RectanglesD0WithOrder[i], i, currentS, M, R0Small); + } + + RectanglesForOrderedBoxes d0WithOrder; + d0WithOrder.rectangles = RectanglesD0WithOrder; + d0WithOrder.rectanglesSmall = R0Small; + RectanglesForOrderedBoxes d1WithOrder; + d1WithOrder.rectangles = RectanglesD1WithOrder; + d1WithOrder.rectanglesSmall = R1Small; + orderedInputRectangles.SetOrderedBoxesToRam(d0WithOrder, d1WithOrder, boundingBox); + return orderedInputRectangles; +} + +OrderedBoxes ExternalSort(const std::string& onDiskBase, + const std::string& fileSuffix, size_t M, + uintmax_t maxBuildingRamUsage) { + OrderedBoxes orderedInputRectangles; + std::filesystem::path file = onDiskBase + fileSuffix + ".tmp"; + + auto maxRamForSorter = std::ceil(std::min((double)maxBuildingRamUsage / 3.0, 9999999999.0 / 3.0)); + ad_utility::BackgroundStxxlSorter> sorterRectsD0Basic = + ad_utility::BackgroundStxxlSorter>((size_t)maxRamForSorter);; + + for (const RTreeValue& rectD0Element : FileReaderWithoutIndex(file)) { + sorterRectsD0Basic.push(rectD0Element); + } + + uint64_t xSize = 0; + BasicGeometry::BoundingBox boundingBox = + BasicGeometry::CreateBoundingBox(0, 0, 0, 0); + + ad_utility::BackgroundStxxlSorter> sorterRectsD1 = + ad_utility::BackgroundStxxlSorter>((size_t)maxRamForSorter); + + for (RTreeValue element : sorterRectsD0Basic.sortedView()) { + RTreeValueWithOrderIndex entry = handleD0ElementsWithoutIndex(element, xSize, boundingBox); + sorterRectsD1.push(entry); + xSize++; + } + sorterRectsD0Basic.clear(); + + size_t currentS = std::ceil(((float)xSize) / ((float)M)); + if (xSize <= M * M) { + // in this case S can just be M + currentS = M; + } + + uint64_t ySize = 0; + std::ofstream r1File = + std::ofstream(onDiskBase + fileSuffix + ".d1.tmp", std::ios::binary); + ad_utility::BackgroundStxxlSorter> sorterRectsD0 = + 
ad_utility::BackgroundStxxlSorter>((size_t)maxRamForSorter); + multiBoxWithOrderIndex r1Small = multiBoxWithOrderIndex(); + // placeholder + r1Small.emplace_back(); + r1Small.emplace_back(); + + RTreeValueWithOrderIndex minD1; + RTreeValueWithOrderIndex maxD1; + for (RTreeValueWithOrderIndex element : sorterRectsD1.sortedView()) { + handleD1Elements(element, ySize, currentS, M, r1Small); + FileReader::SaveEntryWithOrderIndex(element, r1File); + sorterRectsD0.push(element); + if (ySize == 0) { + minD1 = element; + } + + maxD1 = element; + ySize++; + } + + r1File.close(); + sorterRectsD1.clear(); + + // replace the placeholder + r1Small[0] = minD1; + r1Small[1] = maxD1; + + uint64_t currentX = 0; + std::ofstream r0File = + std::ofstream(onDiskBase + fileSuffix + ".d0.tmp", std::ios::binary); + multiBoxWithOrderIndex r0Small = multiBoxWithOrderIndex(); + // placeholder + r0Small.emplace_back(); + r0Small.emplace_back(); + + RTreeValueWithOrderIndex minD0; + RTreeValueWithOrderIndex maxD0; + for (RTreeValueWithOrderIndex element : sorterRectsD0.sortedView()) { + FileReader::SaveEntryWithOrderIndex(element, r0File); + handleD0Elements(element, currentX, currentS, M, r0Small); + if (currentX == 0) { + minD0 = element; + } + maxD0 = element; + currentX++; + } + + r0File.close(); + sorterRectsD0.clear(); + + // replace the placeholder + r0Small[0] = minD0; + r0Small[1] = maxD0; + + RectanglesForOrderedBoxes rectsD0; + RectanglesForOrderedBoxes rectsD1; + rectsD0.rectanglesSmall = std::move(r0Small); + rectsD1.rectanglesSmall = std::move(r1Small); + rectsD0.rectangles = onDiskBase + fileSuffix + ".d0.tmp"; + rectsD1.rectangles = onDiskBase + fileSuffix + ".d1.tmp"; + orderedInputRectangles.SetOrderedBoxesToDisk(rectsD0, rectsD1, xSize, + boundingBox); + return orderedInputRectangles; +} + +OrderedBoxes SortInput(const std::string& onDiskBase, + const std::string& fileSuffix, size_t M, + uintmax_t maxBuildingRamUsage, bool workInRam) { + if (workInRam) { + return 
InternalSort(onDiskBase, fileSuffix, M); + } + return ExternalSort(onDiskBase, fileSuffix, M, maxBuildingRamUsage); +} + +/*OrderedBoxes SortInput(const std::filesystem::path& onDiskBase, size_t M, +uintmax_t maxBuildingRamUsage, bool workInRam) { if (workInRam) { return +InternalSort(onDiskBase, M); } else { return ExternalSort(onDiskBase, M, + maxBuildingRamUsage); + } +}*/ diff --git a/src/util/RtreeSorter.h b/src/util/RtreeSorter.h new file mode 100644 index 0000000000..0d42db5703 --- /dev/null +++ b/src/util/RtreeSorter.h @@ -0,0 +1,14 @@ +// +// Created by nockn on 11/17/23. +// + +#ifndef QLEVER_RTREESORTER_H +#define QLEVER_RTREESORTER_H + +#include "./Rtree.h" + +OrderedBoxes SortInput(const std::string& onDiskBase, + const std::string& fileSuffix, size_t M, + uintmax_t maxBuildingRamUsage, bool workInRam); + +#endif // QLEVER_RTREESORTER_H diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index fb3b2d714e..c307ed3e59 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -39,7 +39,7 @@ endfunction() if (SINGLE_TEST_BINARY) message(STATUS "All tests are linked into a single executable `QLeverAllUnitTestsMain`") - add_executable(QLeverAllUnitTestsMain) + add_executable(QLeverAllUnitTestsMain RtreeTest.cpp) qlever_target_link_libraries(QLeverAllUnitTestsMain gtest gmock_main testUtil ${CMAKE_THREAD_LIBS_INIT}) gtest_discover_tests(QLeverAllUnitTestsMain QLeverAllUnitTestsMain PROPERTIES RUN_SERIAL TRUE) @@ -288,6 +288,8 @@ addLinkAndDiscoverTest(TimerTest) addLinkAndDiscoverTest(AlgorithmTest) +addLinkAndDiscoverTest(RtreeTest) + addLinkAndDiscoverTestSerial(CompressedRelationsTest index) addLinkAndDiscoverTest(ExceptionTest) diff --git a/test/RtreeTest.cpp b/test/RtreeTest.cpp new file mode 100644 index 0000000000..640376ab70 --- /dev/null +++ b/test/RtreeTest.cpp @@ -0,0 +1,152 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Noah Nock + +#include +#include +#include +#include +#include + +bool boundingBoxesAreEqual(BasicGeometry::BoundingBox b1, + BasicGeometry::BoundingBox b2) { + if (BasicGeometry::GetMinX(b1) != BasicGeometry::GetMinX(b2)) return false; + if (BasicGeometry::GetMinY(b1) != BasicGeometry::GetMinY(b2)) return false; + if (BasicGeometry::GetMaxX(b1) != BasicGeometry::GetMaxX(b2)) return false; + if (BasicGeometry::GetMaxY(b1) != BasicGeometry::GetMaxY(b2)) return false; + return true; +} + +bool multiBoxGeosAreEqual(multiBoxGeo& m1, multiBoxGeo& m2) { + if (m1.size() != m2.size()) return false; + for (size_t i = 0; i < m1.size(); i++) { + RTreeValue r1 = m1[i]; + RTreeValue r2 = m2[i]; + if (r1.id != r2.id) return false; + if (!boundingBoxesAreEqual(r1.box, r2.box)) return false; + } + return true; +} + +bool multiBoxGeosWithOrderIndexAreEqual(multiBoxWithOrderIndex& m1, + multiBoxWithOrderIndex& m2) { + if (m1.size() != m2.size()) return false; + for (size_t i = 0; i < m1.size(); i++) { + RTreeValueWithOrderIndex r1 = m1[i]; + RTreeValueWithOrderIndex r2 = m2[i]; + if (r1.id != r2.id) return false; + if (!boundingBoxesAreEqual(r1.box, r2.box)) return false; + if (r1.orderX != r2.orderX) return false; + if (r1.orderY != r2.orderY) return false; + } + return true; +} + +TEST(Rtree, ConvertWordToRtreeEntry) { + std::string wkt1 = + "\"POLYGON((0.0 0.0,0.0 0.0,0.0 0.0,0.0 0.0,0.0 " + "0.0))\"^^"; + std::string wkt2 = + "\"MULTIPOLYGON(((-100 -100,0 0,50 50,75 75,100 100), (10 10,20 20,30 " + "30)), ((0 0,-10.0 -10,-20 -20), (-5 -5,-7 " + "-7)))\"^^"; + std::string wkt3 = + "\"LINESTRING(-120 -110,0.0 0.0,0.0 0.0,0.0 0.0,120.0 " + "110.0)\"^^"; + std::string wkt4 = "Invalid input"; + std::string wkt5 = + "\"POLYGON((1 1,2 2,5 5), (1.1 1.1, 2 2, 3 " + "3))\"^^"; + std::string wkt6 = + "\"MULTIPOLYGON(((-100 -100,0 0,50 50,75 75,100 100), (10 10,20 20,30 " + "30)), ((-150 -140,-10.0 -10,160 170), (-5 -5,-7 " + "-7)))\"^^"; + std::optional result1 = + 
BasicGeometry::ConvertWordToRtreeEntry(wkt1); + std::optional result2 = + BasicGeometry::ConvertWordToRtreeEntry(wkt2); + std::optional result3 = + BasicGeometry::ConvertWordToRtreeEntry(wkt3); + std::optional result4 = + BasicGeometry::ConvertWordToRtreeEntry(wkt4); + std::optional result5 = + BasicGeometry::ConvertWordToRtreeEntry(wkt5); + std::optional result6 = + BasicGeometry::ConvertWordToRtreeEntry(wkt6); + ASSERT_TRUE(result1); + ASSERT_TRUE(boundingBoxesAreEqual( + result1.value(), BasicGeometry::CreateBoundingBox(0, 0, 0, 0))); + ASSERT_TRUE(result2); + ASSERT_TRUE(boundingBoxesAreEqual( + result2.value(), BasicGeometry::CreateBoundingBox(-100, -100, 100, 100))); + ASSERT_TRUE(result3); + ASSERT_TRUE(boundingBoxesAreEqual( + result3.value(), BasicGeometry::CreateBoundingBox(-120, -110, 120, 110))); + ASSERT_TRUE(!result4); + ASSERT_TRUE(result5); + ASSERT_TRUE(boundingBoxesAreEqual( + result5.value(), BasicGeometry::CreateBoundingBox(1, 1, 5, 5))); + ASSERT_TRUE(result6); + ASSERT_TRUE(boundingBoxesAreEqual( + result6.value(), BasicGeometry::CreateBoundingBox(-150, -140, 160, 170))); +} + +TEST(Rtree, IsBorderOfSplitCandidate) { + ASSERT_TRUE(BasicGeometry::IsBorderOfSplitCandidate(16, 16, 16)); + ASSERT_TRUE(BasicGeometry::IsBorderOfSplitCandidate(15, 16, 16)); + ASSERT_TRUE(!BasicGeometry::IsBorderOfSplitCandidate(17, 16, 16)); + ASSERT_TRUE(!BasicGeometry::IsBorderOfSplitCandidate(3185, 200, 16)); + ASSERT_TRUE(!BasicGeometry::IsBorderOfSplitCandidate(3184, 200, 16)); + ASSERT_TRUE(BasicGeometry::IsBorderOfSplitCandidate(3000, 200, 16)); + ASSERT_TRUE(BasicGeometry::IsBorderOfSplitCandidate(2999, 200, 16)); + ASSERT_TRUE(BasicGeometry::IsBorderOfSplitCandidate(200, 200, 16)); + ASSERT_TRUE(BasicGeometry::IsBorderOfSplitCandidate(199, 200, 16)); +} + +TEST(Rtree, CreateBoundingBox) { + BasicGeometry::Point p1 = {-1, -2}; + BasicGeometry::Point p2 = {3, 4}; + BasicGeometry::BoundingBox b = {p1, p2}; + ASSERT_TRUE( + boundingBoxesAreEqual(b, 
BasicGeometry::CreateBoundingBox(-1, -2, 3, 4))); +} + +TEST(Rtree, CombineBoundingBoxes) { + BasicGeometry::BoundingBox b1 = BasicGeometry::CreateBoundingBox(0, 0, 0, 0); + BasicGeometry::BoundingBox b2 = BasicGeometry::CreateBoundingBox(1, 2, 3, 4); + BasicGeometry::BoundingBox b3 = + BasicGeometry::CreateBoundingBox(-1, -2, -3, -4); + ASSERT_TRUE( + boundingBoxesAreEqual(BasicGeometry::CombineBoundingBoxes(b1, b2), + BasicGeometry::CreateBoundingBox(0, 0, 3, 4))); + ASSERT_TRUE( + boundingBoxesAreEqual(BasicGeometry::CombineBoundingBoxes(b2, b1), + BasicGeometry::CreateBoundingBox(0, 0, 3, 4))); + ASSERT_TRUE( + boundingBoxesAreEqual(BasicGeometry::CombineBoundingBoxes(b1, b3), + BasicGeometry::CreateBoundingBox(-1, -2, 0, 0))); + ASSERT_TRUE( + boundingBoxesAreEqual(BasicGeometry::CombineBoundingBoxes(b3, b1), + BasicGeometry::CreateBoundingBox(-1, -2, 0, 0))); + BasicGeometry::BoundingBox b4 = + BasicGeometry::CreateBoundingBox(-150.0, 30.4, -70.0, 50); + BasicGeometry::BoundingBox b5 = + BasicGeometry::CreateBoundingBox(5.0, -30.4, 10.0, 20); + ASSERT_TRUE(boundingBoxesAreEqual( + BasicGeometry::CombineBoundingBoxes(b4, b5), + BasicGeometry::CreateBoundingBox(-150, -30.4, 10, 50))); +} + +TEST(Rtree, SaveAndLoadEntry) { + multiBoxGeo boxes1 = multiBoxGeo(); + boxes1.push_back(RTreeValue(BasicGeometry::CreateBoundingBox(0, 0, 0, 0), 1)); + std::filesystem::path path1 = "RtreeTest_SaveEntry1"; + std::ofstream ofs1 = std::ofstream("RtreeTest_SaveEntry1", std::ios::binary); + for (RTreeValue element : boxes1) { + FileReaderWithoutIndex::SaveEntry(element.box, element.id, ofs1); + } + ofs1.close(); + multiBoxGeo boxes2 = + FileReaderWithoutIndex::LoadEntries(std::filesystem::absolute(path1)); + ASSERT_TRUE(multiBoxGeosAreEqual(boxes1, boxes2)); +} diff --git a/test/VocabularyGeneratorTest.cpp b/test/VocabularyGeneratorTest.cpp index 3936475b52..b72a01fd7c 100644 --- a/test/VocabularyGeneratorTest.cpp +++ b/test/VocabularyGeneratorTest.cpp @@ -169,11 +169,15 @@ 
TEST_F(MergeVocabularyTest, mergeVocabulary) { { VocabularyMerger m; auto file = ad_utility::makeOfstream(_basePath + INTERNAL_VOCAB_SUFFIX); - auto internalVocabularyAction = [&file](const auto& word) { - file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n'; - }; + auto internalVocabularyAction = + [&file](const auto& word, [[maybe_unused]] const auto& index) { + file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n'; + }; + auto externalVocabularyAction = []([[maybe_unused]] const auto& word, + [[maybe_unused]] const auto& index) {}; res = m.mergeVocabulary(_basePath, 2, TripleComponentComparator(), - internalVocabularyAction, 1_GB); + internalVocabularyAction, externalVocabularyAction, + 1_GB); } // No language tags in text file @@ -219,11 +223,16 @@ TEST(VocabularyGenerator, ReadAndWritePartial) { { VocabularyMerger m; auto file = ad_utility::makeOfstream(basename + INTERNAL_VOCAB_SUFFIX); - auto internalVocabularyAction = [&file](const auto& word) { - file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n'; - }; + auto internalVocabularyAction = + [&file](const auto& word, [[maybe_unused]] const auto& index) { + file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n'; + }; + + auto externalVocabularyAction = []([[maybe_unused]] const auto& word, + [[maybe_unused]] const auto& index) {}; m.mergeVocabulary(basename, 1, v.getCaseComparator(), - internalVocabularyAction, 1_GB); + internalVocabularyAction, externalVocabularyAction, + 1_GB); } auto idMap = IdMapFromPartialIdMapFile(basename + PARTIAL_MMAP_IDS + "0"); EXPECT_EQ(V(0), idMap[V(5)]); @@ -271,11 +280,15 @@ TEST(VocabularyGenerator, ReadAndWritePartial) { { VocabularyMerger m; auto file = ad_utility::makeOfstream(basename + INTERNAL_VOCAB_SUFFIX); - auto internalVocabularyAction = [&file](const auto& word) { - file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n'; - }; + auto internalVocabularyAction = + [&file](const auto& word, [[maybe_unused]] const auto& 
index) { + file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n'; + }; + auto externalVocabularyAction = []([[maybe_unused]] const auto& word, + [[maybe_unused]] const auto& index) {}; m.mergeVocabulary(basename, 1, v.getCaseComparator(), - internalVocabularyAction, 1_GB); + internalVocabularyAction, externalVocabularyAction, + 1_GB); } auto idMap = IdMapFromPartialIdMapFile(basename + PARTIAL_MMAP_IDS + "0"); EXPECT_EQ(V(0), idMap[V(6)]);