From 01f586db0f5cd01db04819f63aa0ac45925dea91 Mon Sep 17 00:00:00 2001 From: Noah Nock Date: Wed, 10 May 2023 14:50:31 +0200 Subject: [PATCH 01/20] init --- .gitmodules | 3 +++ CMakeLists.txt | 9 ++++++++- src/index/IndexImpl.cpp | 21 +++++++++++++++++---- src/index/VocabularyGenerator.h | 10 ++++++---- src/index/VocabularyGeneratorImpl.h | 18 ++++++++++-------- third_party/rtree | 1 + 6 files changed, 45 insertions(+), 17 deletions(-) create mode 160000 third_party/rtree diff --git a/.gitmodules b/.gitmodules index 7a2fe9cc0f..b08fcf3f78 100644 --- a/.gitmodules +++ b/.gitmodules @@ -13,3 +13,6 @@ [submodule "third_party/antlr4"] path = third_party/antlr4 url = https://github.com/antlr/antlr4.git +[submodule "third_party/rtree"] + path = third_party/rtree + url = git@github.com:noahnock/Bachelorprojekt.git diff --git a/CMakeLists.txt b/CMakeLists.txt index c25a1e2d9f..26c4866ada 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -208,6 +208,12 @@ if (ENABLE_EXPENSIVE_CHECKS) add_definitions("-DAD_ENABLE_EXPENSIVE_CHECKS") endif() +############################### +# RTREE +############################### +add_subdirectory(third_party/rtree EXCLUDE_FROM_ALL) +include_directories(SYSTEM third_party/rtree) + ################################ # STXXL ################################ @@ -218,6 +224,7 @@ set(USE_OPENMP OFF CACHE BOOL "Don't use OPENMP as default" FORCE) add_subdirectory(third_party/stxxl EXCLUDE_FROM_ALL) # apply STXXL CXXFLAGS + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${STXXL_CXX_FLAGS}") include_directories(SYSTEM ${STXXL_INCLUDE_DIRS}) @@ -288,7 +295,7 @@ add_definitions(-DLOGLEVEL=${LOG_LEVEL_${LOGLEVEL}}) ################################################## # Precompiled headers set(PRECOMPILED_HEADER_FILES_ENGINE src/util/HashMap.h src/engine/Operation.h src/engine/QueryExecutionTree.h) -set(PRECOMPILED_HEADER_FILES_PARSER src/engine/sparqlExpressions/AggregateExpression.h third_party/ctre/include/ctre/ctre.h 
third_party/antlr4/runtime/Cpp/runtime/src/antlr4-runtime.h) +set(PRECOMPILED_HEADER_FILES_PARSER src/engine/sparqlExpressions/AggregateExpression.h third_party/ctre/include/ctre/ctre.h third_party/antlr4/runtime/Cpp/runtime/src/antlr4-runtime.h third_party/rtree/Rtree.h) add_subdirectory(src/parser) target_precompile_headers(parser PRIVATE ${PRECOMPILED_HEADER_FILES_PARSER}) diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 3703b85223..f3411ffedf 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -365,14 +366,20 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( auto compressionOutfile = ad_utility::makeOfstream( _onDiskBase + TMP_BASENAME_COMPRESSION + INTERNAL_VOCAB_SUFFIX); auto internalVocabularyActionCompression = - [&compressionOutfile](const auto& word) { + [&compressionOutfile](const auto& word, const auto& index) { compressionOutfile << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n'; + + // TODO write the word with index into the rtree file }; + auto externalVocabularyActionCompression = + [](const auto& word, const auto& index) { + // TODO write the word with index into the rtree file + }; m._noIdMapsAndIgnoreExternalVocab = true; auto mergeResult = m.mergeVocabulary(_onDiskBase + TMP_BASENAME_COMPRESSION, numFiles, - std::less<>(), internalVocabularyActionCompression); + std::less<>(), internalVocabularyActionCompression, externalVocabularyActionCompression); sizeInternalVocabulary = mergeResult.numWordsTotal_; LOG(INFO) << "Number of words in internal vocabulary: " << sizeInternalVocabulary << std::endl; @@ -397,11 +404,17 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( }; auto wordWriter = _vocab.makeUncompressingWordWriter(_onDiskBase + INTERNAL_VOCAB_SUFFIX); - auto internalVocabularyAction = [&wordWriter](const auto& word) { + auto internalVocabularyAction = [&wordWriter](const auto& word, const 
auto& index) { wordWriter.push(word.data(), word.size()); + // TODO write the word with index into the rtree file }; + + auto externalVocabularyAction = [](const auto& word, const auto& index) { + // TODO write the word with index into the rtree file + }; + return v.mergeVocabulary(_onDiskBase, numFiles, sortPred, - internalVocabularyAction); + internalVocabularyAction, externalVocabularyAction); }(); LOG(DEBUG) << "Finished merging partial vocabularies" << std::endl; IndexBuilderDataAsStxxlVector res; diff --git a/src/index/VocabularyGenerator.h b/src/index/VocabularyGenerator.h index 6425af3d59..5dc596da79 100644 --- a/src/index/VocabularyGenerator.h +++ b/src/index/VocabularyGenerator.h @@ -101,10 +101,11 @@ class VocabularyMerger { // Argument comparator gives the way to order strings (case-sensitive or not) // This automatically resets the inner members after finishing, to leave the // external interface stateless - template + template VocabularyMetaData mergeVocabulary(const std::string& basename, size_t numFiles, Comp comparator, - InternalVocabularyAction& action); + InternalVocabularyAction& action, + ExternalVocabularyAction& externalVocabularyAction); private: // helper struct used in the priority queue for merging. @@ -132,10 +133,11 @@ class VocabularyMerger { // write the queu words in the buffer to their corresponding idPairVecs. 
// Requires that all the QueueWords that are ever passed are ordered // alphabetically (Also across multiple calls) - template + template void writeQueueWordsToIdVec( const std::vector& buffer, - InternalVocabularyAction& internalVocabularyAction); + InternalVocabularyAction& internalVocabularyAction, + ExternalVocabularyAction& externalVocabularyAction); // close all associated files and MmapVectors and reset all internal variables void clear() { diff --git a/src/index/VocabularyGeneratorImpl.h b/src/index/VocabularyGeneratorImpl.h index 77a8237caf..6ef6a9be0f 100644 --- a/src/index/VocabularyGeneratorImpl.h +++ b/src/index/VocabularyGeneratorImpl.h @@ -25,10 +25,10 @@ #include "./VocabularyGenerator.h" // ___________________________________________________________________ -template +template VocabularyMerger::VocabularyMetaData VocabularyMerger::mergeVocabulary( const std::string& basename, size_t numFiles, Comparator comparator, - InternalVocabularyAction& internalVocabularyAction) { + InternalVocabularyAction& internalVocabularyAction, ExternalVocabularyAction& externalVocabularyAction) { // Return true iff p1 >= p2 according to the lexicographic order of the IRI // or literal. All internal IRIs or literals come before all external ones. 
// TODO Change this as soon as we have Interleaved Ids via the @@ -102,8 +102,8 @@ VocabularyMerger::VocabularyMetaData VocabularyMerger::mergeVocabulary( // asynchronously write the next batch of sorted // queue words auto writeTask = [this, buf = std::move(sortedBuffer), - &internalVocabularyAction]() { - this->writeQueueWordsToIdVec(buf, internalVocabularyAction); + &internalVocabularyAction, &externalVocabularyAction]() { + this->writeQueueWordsToIdVec(buf, internalVocabularyAction, externalVocabularyAction); }; sortedBuffer.clear(); sortedBuffer.reserve(_bufferSize); @@ -128,7 +128,7 @@ VocabularyMerger::VocabularyMetaData VocabularyMerger::mergeVocabulary( // Handle remaining words in the buffer if (!sortedBuffer.empty()) { - writeQueueWordsToIdVec(sortedBuffer, internalVocabularyAction); + writeQueueWordsToIdVec(sortedBuffer, internalVocabularyAction, externalVocabularyAction); } auto metaData = std::move(metaData_); @@ -138,10 +138,11 @@ VocabularyMerger::VocabularyMetaData VocabularyMerger::mergeVocabulary( } // ________________________________________________________________________________ -template +template void VocabularyMerger::writeQueueWordsToIdVec( const std::vector& buffer, - InternalVocabularyAction& internalVocabularyAction) { + InternalVocabularyAction& internalVocabularyAction, + ExternalVocabularyAction& externalVocabularyAction) { LOG(TIMING) << "Start writing a batch of merged words\n"; // smaller grained buffer for the actual inner write @@ -164,8 +165,9 @@ void VocabularyMerger::writeQueueWordsToIdVec( // write the new word to the vocabulary if (!lastTripleComponent_.value().isExternal()) { - internalVocabularyAction(lastTripleComponent_.value().iriOrLiteral()); + internalVocabularyAction(lastTripleComponent_.value().iriOrLiteral(), lastTripleComponent_.value()._index); } else { + externalVocabularyAction(lastTripleComponent_.value().iriOrLiteral(), lastTripleComponent_.value()._index); outfileExternal_ << 
RdfEscaping::escapeNewlinesAndBackslashes( lastTripleComponent_.value().iriOrLiteral()) << '\n'; diff --git a/third_party/rtree b/third_party/rtree new file mode 160000 index 0000000000..94e49eb3c5 --- /dev/null +++ b/third_party/rtree @@ -0,0 +1 @@ +Subproject commit 94e49eb3c57bafbdd50baba539571dd2390149a0 From f4d58903778006613c348732d28fa95efa886b6e Mon Sep 17 00:00:00 2001 From: Noah Nock Date: Wed, 17 May 2023 15:49:31 +0200 Subject: [PATCH 02/20] removed wrong submodule --- .gitmodules | 3 --- third_party/rtree | 1 - 2 files changed, 4 deletions(-) delete mode 160000 third_party/rtree diff --git a/.gitmodules b/.gitmodules index b08fcf3f78..7a2fe9cc0f 100644 --- a/.gitmodules +++ b/.gitmodules @@ -13,6 +13,3 @@ [submodule "third_party/antlr4"] path = third_party/antlr4 url = https://github.com/antlr/antlr4.git -[submodule "third_party/rtree"] - path = third_party/rtree - url = git@github.com:noahnock/Bachelorprojekt.git diff --git a/third_party/rtree b/third_party/rtree deleted file mode 160000 index 94e49eb3c5..0000000000 --- a/third_party/rtree +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 94e49eb3c57bafbdd50baba539571dd2390149a0 From 55f091391b6dbd9501bfdc45cb3d9bc0b520b4b8 Mon Sep 17 00:00:00 2001 From: Noah Nock Date: Thu, 18 May 2023 14:11:21 +0200 Subject: [PATCH 03/20] Added the rtree --- .gitmodules | 3 +++ CMakeLists.txt | 6 ++++-- src/VocabularyMergerMain.cpp | 5 +++-- src/index/IndexImpl.cpp | 29 ++++++++++++++++++++--------- test/VocabularyGeneratorTest.cpp | 16 ++++++++++------ third_party/rtree | 1 + 6 files changed, 41 insertions(+), 19 deletions(-) create mode 160000 third_party/rtree diff --git a/.gitmodules b/.gitmodules index 7a2fe9cc0f..3174b33684 100644 --- a/.gitmodules +++ b/.gitmodules @@ -13,3 +13,6 @@ [submodule "third_party/antlr4"] path = third_party/antlr4 url = https://github.com/antlr/antlr4.git +[submodule "third_party/rtree"] + path = third_party/rtree + url = https://github.com/noahnock/Bachelorprojekt.git diff --git 
a/CMakeLists.txt b/CMakeLists.txt index 26c4866ada..5783455042 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -212,7 +212,9 @@ endif() # RTREE ############################### add_subdirectory(third_party/rtree EXCLUDE_FROM_ALL) -include_directories(SYSTEM third_party/rtree) +include_directories(SYSTEM third_party/rtree/) +add_executable(${PROJECT_NAME} third_party/rtree/Rtree/Rtree.cpp) +target_include_directories(${PROJECT_NAME} PUBLIC $) ################################ # STXXL @@ -295,7 +297,7 @@ add_definitions(-DLOGLEVEL=${LOG_LEVEL_${LOGLEVEL}}) ################################################## # Precompiled headers set(PRECOMPILED_HEADER_FILES_ENGINE src/util/HashMap.h src/engine/Operation.h src/engine/QueryExecutionTree.h) -set(PRECOMPILED_HEADER_FILES_PARSER src/engine/sparqlExpressions/AggregateExpression.h third_party/ctre/include/ctre/ctre.h third_party/antlr4/runtime/Cpp/runtime/src/antlr4-runtime.h third_party/rtree/Rtree.h) +set(PRECOMPILED_HEADER_FILES_PARSER src/engine/sparqlExpressions/AggregateExpression.h third_party/ctre/include/ctre/ctre.h third_party/antlr4/runtime/Cpp/runtime/src/antlr4-runtime.h) add_subdirectory(src/parser) target_precompile_headers(parser PRIVATE ${PRECOMPILED_HEADER_FILES_PARSER}) diff --git a/src/VocabularyMergerMain.cpp b/src/VocabularyMergerMain.cpp index c36b4c1a72..9f283ab84e 100644 --- a/src/VocabularyMergerMain.cpp +++ b/src/VocabularyMergerMain.cpp @@ -21,9 +21,10 @@ int main(int argc, char** argv) { VocabularyMerger m; auto file = ad_utility::makeOfstream(basename + INTERNAL_VOCAB_SUFFIX); - auto internalVocabularyAction = [&file](const auto& word) { + auto internalVocabularyAction = [&file](const auto& word, const auto& index) { file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n'; }; + auto externalVocabularyAction = [](const auto& word, const auto& index) {}; m.mergeVocabulary(basename, numFiles, TripleComponentComparator(), - internalVocabularyAction); + internalVocabularyAction, 
externalVocabularyAction); } diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index f3411ffedf..b2c6b8ca92 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -17,7 +17,7 @@ #include #include #include -#include +//#include #include #include @@ -369,12 +369,9 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( [&compressionOutfile](const auto& word, const auto& index) { compressionOutfile << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n'; - - // TODO write the word with index into the rtree file }; auto externalVocabularyActionCompression = [](const auto& word, const auto& index) { - // TODO write the word with index into the rtree file }; m._noIdMapsAndIgnoreExternalVocab = true; auto mergeResult = @@ -404,17 +401,27 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( }; auto wordWriter = _vocab.makeUncompressingWordWriter(_onDiskBase + INTERNAL_VOCAB_SUFFIX); - auto internalVocabularyAction = [&wordWriter](const auto& word, const auto& index) { + + //Rtree rtree = Rtree(); + std::ofstream testOfStream = std::ofstream("/local/data/nockn/forkqlever/qlever/third_party/rtree/convertedTest.txt", std::ios_base::app); + + auto internalVocabularyAction = [&wordWriter/*, &rtree*/, &testOfStream](const auto& word, const auto& index) { wordWriter.push(word.data(), word.size()); - // TODO write the word with index into the rtree file + //rtree.ConvertWordToRtreeEntry(word.data(), word.size(), index); + testOfStream << word.data() << std::endl; }; - auto externalVocabularyAction = [](const auto& word, const auto& index) { - // TODO write the word with index into the rtree file + auto externalVocabularyAction = [/*&rtree*/&testOfStream](const auto& word, const auto& index) { + //rtree.ConvertWordToRtreeEntry(word.data(), word.size(), index); + testOfStream << word.data() << std::endl; }; - return v.mergeVocabulary(_onDiskBase, numFiles, sortPred, + 
//rtree.OpenConversion("../../third_party/rtree/ConversionTest"); + VocabularyMerger::VocabularyMetaData result = v.mergeVocabulary(_onDiskBase, numFiles, sortPred, internalVocabularyAction, externalVocabularyAction); + //rtree.CloseConversion(); + testOfStream.close(); + return result; }(); LOG(DEBUG) << "Finished merging partial vocabularies" << std::endl; IndexBuilderDataAsStxxlVector res; @@ -424,6 +431,8 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( << res.vocabularyMetaData_.numWordsTotal_ - sizeInternalVocabulary << std::endl; + // TODO RTREE + res.idTriples = std::move(idTriples); res.actualPartialSizes = std::move(actualPartialSizes); @@ -701,6 +710,8 @@ void IndexImpl::createFromOnDiskIndex(const string& onDiskBase) { _avgNumDistinctPredicatesPerSubject, _numDistinctSubjectPredicatePairs, _patterns, _hasPattern); } + + // TODO Load the RTREE } // _____________________________________________________________________________ diff --git a/test/VocabularyGeneratorTest.cpp b/test/VocabularyGeneratorTest.cpp index 6e0b9fdcbc..80a3e3feff 100644 --- a/test/VocabularyGeneratorTest.cpp +++ b/test/VocabularyGeneratorTest.cpp @@ -161,11 +161,12 @@ TEST_F(MergeVocabularyTest, mergeVocabulary) { { VocabularyMerger m; auto file = ad_utility::makeOfstream(_basePath + INTERNAL_VOCAB_SUFFIX); - auto internalVocabularyAction = [&file](const auto& word) { + auto internalVocabularyAction = [&file](const auto& word, const auto& index) { file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n'; }; + auto externalVocabularyAction = [](const auto& word, const auto& index) {}; res = m.mergeVocabulary(_basePath, 2, TripleComponentComparator(), - internalVocabularyAction); + internalVocabularyAction, externalVocabularyAction); } // No language tags in text file @@ -206,11 +207,13 @@ TEST(VocabularyGenerator, ReadAndWritePartial) { { VocabularyMerger m; auto file = ad_utility::makeOfstream(basename + INTERNAL_VOCAB_SUFFIX); - auto internalVocabularyAction 
= [&file](const auto& word) { + auto internalVocabularyAction = [&file](const auto& word, const auto& index) { file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n'; }; + + auto externalVocabularyAction = [](const auto& word, const auto& index) {}; m.mergeVocabulary(basename, 1, v.getCaseComparator(), - internalVocabularyAction); + internalVocabularyAction, externalVocabularyAction); } auto idMap = IdMapFromPartialIdMapFile(basename + PARTIAL_MMAP_IDS + "0"); ASSERT_EQ(V(0), idMap[V(5)]); @@ -249,11 +252,12 @@ TEST(VocabularyGenerator, ReadAndWritePartial) { { VocabularyMerger m; auto file = ad_utility::makeOfstream(basename + INTERNAL_VOCAB_SUFFIX); - auto internalVocabularyAction = [&file](const auto& word) { + auto internalVocabularyAction = [&file](const auto& word, const auto& index) { file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n'; }; + auto externalVocabularyAction = [](const auto& word, const auto& index) {}; m.mergeVocabulary(basename, 1, v.getCaseComparator(), - internalVocabularyAction); + internalVocabularyAction, externalVocabularyAction); } auto idMap = IdMapFromPartialIdMapFile(basename + PARTIAL_MMAP_IDS + "0"); EXPECT_EQ(V(0), idMap[V(6)]); diff --git a/third_party/rtree b/third_party/rtree new file mode 160000 index 0000000000..8d2f7022a6 --- /dev/null +++ b/third_party/rtree @@ -0,0 +1 @@ +Subproject commit 8d2f7022a60ec4d237e928dd57e3b53e2f5aae84 From d621741edcb8394cde540b4869cdf4234e41333a Mon Sep 17 00:00:00 2001 From: Noah Nock Date: Tue, 23 May 2023 23:02:43 +0200 Subject: [PATCH 04/20] updated rtree submodule --- third_party/rtree | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/rtree b/third_party/rtree index 8d2f7022a6..9218270b21 160000 --- a/third_party/rtree +++ b/third_party/rtree @@ -1 +1 @@ -Subproject commit 8d2f7022a60ec4d237e928dd57e3b53e2f5aae84 +Subproject commit 9218270b2120eb6db6b758ec48061fa3ed145114 From ea806e2fdeefead2795611525872b91090488267 Mon Sep 17 00:00:00 
2001 From: Noah Nock Date: Wed, 24 May 2023 11:12:11 +0200 Subject: [PATCH 05/20] Updating the rtree (1/2) --- CMakeLists.txt | 6 ++---- src/index/IndexImpl.cpp | 20 ++++++++------------ 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8af0c6e819..d9240a4144 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -129,7 +129,7 @@ find_package(OpenSSL REQUIRED) # `target_link_libraries` that additionally links against the common # libraries. function (qlever_target_link_libraries target) - target_link_libraries(${target} ${ARGN} absl::flat_hash_map absl::flat_hash_set absl::strings) + target_link_libraries(${target} ${ARGN} absl::flat_hash_map absl::flat_hash_set absl::strings rtree) endfunction() @@ -211,10 +211,8 @@ endif() ############################### # RTREE ############################### -add_subdirectory(third_party/rtree EXCLUDE_FROM_ALL) +add_subdirectory(third_party/rtree) include_directories(SYSTEM third_party/rtree/) -add_executable(${PROJECT_NAME} third_party/rtree/Rtree/Rtree.cpp) -target_include_directories(${PROJECT_NAME} PUBLIC $) ################################ # STXXL diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 311c048330..caffb90c47 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -17,7 +17,7 @@ #include #include #include -//#include +#include #include #include @@ -401,25 +401,21 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( auto wordWriter = vocab_.makeUncompressingWordWriter(onDiskBase_ + INTERNAL_VOCAB_SUFFIX); - //Rtree rtree = Rtree(); - std::ofstream testOfStream = std::ofstream("/local/data/nockn/forkqlever/qlever/third_party/rtree/convertedTest.txt", std::ios_base::app); + Rtree rtree = Rtree(); - auto internalVocabularyAction = [&wordWriter/*, &rtree*/, &testOfStream](const auto& word, const auto& index) { + auto internalVocabularyAction = [&wordWriter, &rtree](const auto& word, const auto& index) { 
wordWriter.push(word.data(), word.size()); - //rtree.ConvertWordToRtreeEntry(word.data(), word.size(), index); - testOfStream << word.data() << std::endl; + rtree.ConvertWordToRtreeEntry(word.data(), word.size(), index); }; - auto externalVocabularyAction = [/*&rtree*/&testOfStream](const auto& word, const auto& index) { - //rtree.ConvertWordToRtreeEntry(word.data(), word.size(), index); - testOfStream << word.data() << std::endl; + auto externalVocabularyAction = [&rtree](const auto& word, const auto& index) { + rtree.ConvertWordToRtreeEntry(word.data(), word.size(), index); }; - //rtree.OpenConversion("../../third_party/rtree/ConversionTest"); + rtree.OpenConversion("../../third_party/rtree/ConversionTest"); VocabularyMerger::VocabularyMetaData result = v.mergeVocabulary(onDiskBase_, numFiles, sortPred, internalVocabularyAction, externalVocabularyAction); - //rtree.CloseConversion(); - testOfStream.close(); + rtree.CloseConversion(); return result; }(); LOG(DEBUG) << "Finished merging partial vocabularies" << std::endl; From 474a7363c972306e1617ee133cf606355605e68a Mon Sep 17 00:00:00 2001 From: Noah Nock Date: Wed, 24 May 2023 11:28:15 +0200 Subject: [PATCH 06/20] Updating the rtree (2/2) --- third_party/rtree | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/rtree b/third_party/rtree index 9218270b21..851be0c9cd 160000 --- a/third_party/rtree +++ b/third_party/rtree @@ -1 +1 @@ -Subproject commit 9218270b2120eb6db6b758ec48061fa3ed145114 +Subproject commit 851be0c9cdfa69f5725b516b69ff8c97f173a701 From 576534b02fd5d9691f2549cec7ddfef3e3f9f74e Mon Sep 17 00:00:00 2001 From: Noah Nock Date: Sat, 10 Jun 2023 00:39:20 +0200 Subject: [PATCH 07/20] Fully integrated contains search via rtree --- src/VocabularyMergerMain.cpp | 4 +- src/engine/sparqlExpressions/CMakeLists.txt | 2 +- .../sparqlExpressions/ContainsExpression.cpp | 146 ++++++++++++++++++ .../sparqlExpressions/ContainsExpression.h | 33 ++++ src/index/Index.cpp | 3 + 
src/index/Index.h | 3 + src/index/IndexImpl.cpp | 26 ++-- src/index/IndexImpl.h | 4 + .../sparqlParser/SparqlQleverVisitor.cpp | 24 ++- test/VocabularyGeneratorTest.cpp | 12 +- 10 files changed, 230 insertions(+), 27 deletions(-) create mode 100644 src/engine/sparqlExpressions/ContainsExpression.cpp create mode 100644 src/engine/sparqlExpressions/ContainsExpression.h diff --git a/src/VocabularyMergerMain.cpp b/src/VocabularyMergerMain.cpp index 9f283ab84e..822141f0a4 100644 --- a/src/VocabularyMergerMain.cpp +++ b/src/VocabularyMergerMain.cpp @@ -21,10 +21,10 @@ int main(int argc, char** argv) { VocabularyMerger m; auto file = ad_utility::makeOfstream(basename + INTERNAL_VOCAB_SUFFIX); - auto internalVocabularyAction = [&file](const auto& word, const auto& index) { + auto internalVocabularyAction = [&file](const auto& word, [[maybe_unused]]const auto& index) { file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n'; }; - auto externalVocabularyAction = [](const auto& word, const auto& index) {}; + auto externalVocabularyAction = []([[maybe_unused]]const auto& word, [[maybe_unused]]const auto& index) {}; m.mergeVocabulary(basename, numFiles, TripleComponentComparator(), internalVocabularyAction, externalVocabularyAction); } diff --git a/src/engine/sparqlExpressions/CMakeLists.txt b/src/engine/sparqlExpressions/CMakeLists.txt index 7d8d584ffb..be12b69e58 100644 --- a/src/engine/sparqlExpressions/CMakeLists.txt +++ b/src/engine/sparqlExpressions/CMakeLists.txt @@ -11,6 +11,6 @@ add_library(sparqlExpressions SparqlExpressionPimpl.h SparqlExpressionPimpl.cpp SampleExpression.h SampleExpression.cpp RelationalExpressions.cpp AggregateExpression.cpp RegexExpression.cpp - LangExpression.cpp) + ContainsExpression.cpp LangExpression.cpp) qlever_target_link_libraries(sparqlExpressions index) diff --git a/src/engine/sparqlExpressions/ContainsExpression.cpp b/src/engine/sparqlExpressions/ContainsExpression.cpp new file mode 100644 index 0000000000..8dff5371bc --- 
/dev/null +++ b/src/engine/sparqlExpressions/ContainsExpression.cpp @@ -0,0 +1,146 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Noah Nock + +#include "./ContainsExpression.h" + +#include "engine/sparqlExpressions/SparqlExpressionGenerators.h" +#include "global/ValueIdComparators.h" +#include "re2/re2.h" +#include + +using namespace std::literals; + +class BoundingBoxType { + public: + boxGeo value{}; + + explicit BoundingBoxType(boxGeo boundingBox) { + this->value = boundingBox; + } +}; + +std::string removeQuotes(std::string_view input) { + AD_CORRECTNESS_CHECK(input.size() >= 2 && input.starts_with('"') && + input.ends_with('"')); + input.remove_prefix(1); + input.remove_suffix(1); + return std::string{input}; +} + +namespace sparqlExpression { +// ___________________________________________________________________________ + ContainsExpression::ContainsExpression( + SparqlExpression::Ptr child, SparqlExpression::Ptr boundingBox) + : child_{std::move(child)} { + if (!dynamic_cast(child_.get())) { + throw std::runtime_error( + "Contain expressions are currently supported only on variables."); + } + std::string boundingBoxString; + std::string originalBoundingBoxString; + if (auto boundingBoxPtr = + dynamic_cast(boundingBox.get())) { + originalBoundingBoxString = boundingBoxPtr->value().normalizedLiteralContent().get(); + if (!boundingBoxPtr->value().datatypeOrLangtag().empty()) { + throw std::runtime_error( + "The second argument to the Contain function (which contains the " + "bounding box) must not contain a language tag or a datatype"); + } + boundingBoxString = removeQuotes(originalBoundingBoxString); + } else { + throw std::runtime_error( + "The second argument to the Contains function must be a " + "string literal (which contains the bounding box of format \"minX,minY,maxX,maxY\")"); + } + + boundingBoxAsString_ = boundingBoxString; + + std::vector boundingBoxEntriesAsString(4); + std::string 
errorMessage; + std::string::size_type searchFrom = 0; + for (int i = 0; i < 4; i++) { + if (i == 3) { + if (searchFrom >= boundingBoxAsString_.size()) { errorMessage = "The fourth argument was not provided"; break; } + boundingBoxEntriesAsString[i] = boundingBoxAsString_.substr(searchFrom, boundingBoxAsString_.size() - searchFrom); + break; + } + std::string::size_type end = boundingBoxAsString_.find(',', searchFrom); + if (end >= boundingBoxAsString_.size() - 1 || end == std::string::npos) { errorMessage = "There are not enough arguments"; break; } + boundingBoxEntriesAsString[i] = boundingBoxAsString_.substr(searchFrom, end - searchFrom); + searchFrom = end + 1; + } + + if (errorMessage.empty()) { + double minX; + double minY; + double maxX; + double maxY; + + try { + minX = std::stod(boundingBoxEntriesAsString[0]); + minY = std::stod(boundingBoxEntriesAsString[1]); + maxX = std::stod(boundingBoxEntriesAsString[2]); + maxY = std::stod(boundingBoxEntriesAsString[3]); + + boundingBox_ = new BoundingBoxType(Rtree::createBoundingBox(minX, minY, maxX, maxY)); + } catch (const std::invalid_argument& e) { + errorMessage = e.what(); + } catch (const std::out_of_range& e) { + errorMessage = e.what(); + } + } + + if (!errorMessage.empty()) { + throw std::runtime_error{absl::StrCat( + "The bounding box ", originalBoundingBoxString, + " is not supported by QLever (must be of format \"minX,minY,maxX,maxY\"). 
" + "Error message is: ", errorMessage + )}; + } + } + + +// ___________________________________________________________________________ + string ContainsExpression::getCacheKey( + const VariableToColumnMap& varColMap) const { + return absl::StrCat("Bounding Box CONTAINS expression ", child_->getCacheKey(varColMap), + " with ", boundingBoxAsString_); + } + +// ___________________________________________________________________________ + std::span ContainsExpression::children() { + return {&child_, 1}; + } + +// ___________________________________________________________________________ + ExpressionResult ContainsExpression::evaluate( + sparqlExpression::EvaluationContext* context) const { + auto resultAsVariant = child_->evaluate(context); + auto variablePtr = std::get_if(&resultAsVariant); + AD_CONTRACT_CHECK(variablePtr); + + // search in the rtree + ad_utility::HashSet resultIds; + if (boundingBox_ != nullptr) { + Rtree rtree = context->_qec.getIndex().getRtree(); + multiBoxGeo treeResults = rtree.SearchTree(boundingBox_->value, "./rtree_build"); + + for (rTreeValue item : treeResults) { + long long id = item.second; + resultIds.insert(id); + } + } + + auto resultSize = context->size(); + VectorWithMemoryLimit result{context->_allocator}; + result.reserve(resultSize); + + for (auto id : detail::makeGenerator(*variablePtr, resultSize, context)) { + result.push_back(resultIds.contains(id.getInt())); + } + + return result; + } + +} // namespace sparqlExpression diff --git a/src/engine/sparqlExpressions/ContainsExpression.h b/src/engine/sparqlExpressions/ContainsExpression.h new file mode 100644 index 0000000000..6d70a146b0 --- /dev/null +++ b/src/engine/sparqlExpressions/ContainsExpression.h @@ -0,0 +1,33 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Noah Nock + +#pragma once + +#include +#include "engine/sparqlExpressions/LiteralExpression.h" +#include "engine/sparqlExpressions/SparqlExpression.h" + +class BoundingBoxType; + +namespace sparqlExpression { + class ContainsExpression : public SparqlExpression { + private: + SparqlExpression::Ptr child_; + std::string boundingBoxAsString_; + BoundingBoxType* boundingBox_{}; + + public: + // `child` must be a `VariableExpression` and `boundingBox` must be a + // `LiteralExpression` that stores a string, else an exception will be thrown. + ContainsExpression(SparqlExpression::Ptr child, SparqlExpression::Ptr boundingBox); + + ExpressionResult evaluate(EvaluationContext *context) const override; + + std::span children() override; + + // _________________________________________________________________________ + [[nodiscard]] string getCacheKey( + const VariableToColumnMap &varColMap) const override; + }; +} \ No newline at end of file diff --git a/src/index/Index.cpp b/src/index/Index.cpp index 7d41eb8fdf..c6549a5ad1 100644 --- a/src/index/Index.cpp +++ b/src/index/Index.cpp @@ -67,6 +67,9 @@ auto Index::getTextVocab() const -> const TextVocab& { return pimpl_->getTextVocab(); } +// ___________________________________________________ +auto Index::getRtree() const -> const Rtree& { return pimpl_->getRtree(); } + // _____________________________________________________________________________ size_t Index::getCardinality(const TripleComponent& comp, Permutation::Enum p) const { diff --git a/src/index/Index.h b/src/index/Index.h index a071c7b730..3b5267e740 100644 --- a/src/index/Index.h +++ b/src/index/Index.h @@ -16,6 +16,7 @@ #include "index/StringSortComparator.h" #include "index/Vocabulary.h" #include "parser/TripleComponent.h" +#include // Forward declarations. 
class IdTable; @@ -109,6 +110,8 @@ class Index { Vocabulary; [[nodiscard]] const TextVocab& getTextVocab() const; + const Rtree& getRtree() const; + // -------------------------------------------------------------------------- // -- RETRIEVAL --- // -------------------------------------------------------------------------- diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index caffb90c47..f787aa4801 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -364,12 +363,12 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( auto compressionOutfile = ad_utility::makeOfstream( onDiskBase_ + TMP_BASENAME_COMPRESSION + INTERNAL_VOCAB_SUFFIX); auto internalVocabularyActionCompression = - [&compressionOutfile](const auto& word, const auto& index) { + [&compressionOutfile](const auto& word, [[maybe_unused]]const auto& index) { compressionOutfile << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n'; }; auto externalVocabularyActionCompression = - [](const auto& word, const auto& index) { + []([[maybe_unused]]const auto& word, [[maybe_unused]]const auto& index) { }; m._noIdMapsAndIgnoreExternalVocab = true; auto mergeResult = @@ -401,21 +400,17 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( auto wordWriter = vocab_.makeUncompressingWordWriter(onDiskBase_ + INTERNAL_VOCAB_SUFFIX); - Rtree rtree = Rtree(); - - auto internalVocabularyAction = [&wordWriter, &rtree](const auto& word, const auto& index) { + auto internalVocabularyAction = [&wordWriter](const auto& word, const auto& index) { wordWriter.push(word.data(), word.size()); - rtree.ConvertWordToRtreeEntry(word.data(), word.size(), index); + Rtree::ConvertWordToRtreeEntry(word, index, "./ConversionTest"); }; - auto externalVocabularyAction = [&rtree](const auto& word, const auto& index) { - rtree.ConvertWordToRtreeEntry(word.data(), word.size(), index); + auto externalVocabularyAction 
= [](const auto& word, const auto& index) { + Rtree::ConvertWordToRtreeEntry(word, index, "./ConversionTest"); }; - rtree.OpenConversion("../../third_party/rtree/ConversionTest"); VocabularyMerger::VocabularyMetaData result = v.mergeVocabulary(onDiskBase_, numFiles, sortPred, internalVocabularyAction, externalVocabularyAction); - rtree.CloseConversion(); return result; }(); LOG(DEBUG) << "Finished merging partial vocabularies" << std::endl; @@ -426,7 +421,11 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( << res.vocabularyMetaData_.numWordsTotal_ - sizeInternalVocabulary << std::endl; - // TODO RTREE + LOG(INFO) << "Loading the Rtree entries..." << std::endl; + Rtree rtree = Rtree(); + multiBoxGeo entries = rtree.LoadEntries("./ConversionTest"); + LOG(INFO) << "Building the Rtree..." << std::endl; + rtree.BuildTree(entries, 16, "./rtree_build"); res.idTriples = std::move(idTriples); res.actualPartialSizes = std::move(actualPartialSizes); @@ -692,7 +691,8 @@ void IndexImpl::createFromOnDiskIndex(const string& onDiskBase) { patterns_, hasPattern_); } - // TODO Load the RTREE + // Load the Rtree + rtree_ = Rtree(); } // _____________________________________________________________________________ diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index 9ac324e67f..8b9ec78a5d 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -119,6 +120,7 @@ class IndexImpl { size_t totalVocabularySize_ = 0; bool vocabPrefixCompressed_ = true; Index::TextVocab textVocab_; + Rtree rtree_; TextMetaData textMeta_; DocsDB docsDB_; @@ -227,6 +229,8 @@ class IndexImpl { const auto& getTextVocab() const { return textVocab_; }; + const auto& getRtree() const { return rtree_; }; + // -------------------------------------------------------------------------- // -- RETRIEVAL --- // -------------------------------------------------------------------------- diff --git 
a/src/parser/sparqlParser/SparqlQleverVisitor.cpp b/src/parser/sparqlParser/SparqlQleverVisitor.cpp index 193a5089df..b170ceadb5 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.cpp +++ b/src/parser/sparqlParser/SparqlQleverVisitor.cpp @@ -21,6 +21,7 @@ #include "parser/TurtleParser.h" #include "parser/data/Variable.h" #include "util/StringUtils.h" +#include "engine/sparqlExpressions/ContainsExpression.h" using namespace ad_utility::sparql_types; using namespace sparqlExpression; @@ -90,6 +91,19 @@ ExpressionPtr Visitor::processIriFunctionCall( std::move(argList[0])); } } + + constexpr static std::string_view geoPrefixRtree = + "')); + iriView.remove_suffix(1); + if (iriView == "boundingBoxContains") { + checkNumArgs("geoRtree:", iriView, 2); + return std::make_unique(std::move(argList[0]), std::move(argList[1])); + } + } + reportNotSupported(ctx, "Function \"" + iri + "\" is"); } @@ -986,7 +1000,7 @@ ObjectList Visitor::visit(Parser::ObjectListContext* ctx) { } // ____________________________________________________________________________________ -Node Visitor::visit(Parser::ObjectRContext* ctx) { +ad_utility::sparql_types::Node Visitor::visit(Parser::ObjectRContext* ctx) { return visit(ctx->graphNode()); } @@ -1183,13 +1197,13 @@ uint64_t Visitor::visit(Parser::IntegerContext* ctx) { } // ____________________________________________________________________________________ -Node Visitor::visit(Parser::TriplesNodeContext* ctx) { +ad_utility::sparql_types::Node Visitor::visit(Parser::TriplesNodeContext* ctx) { return visitAlternative(ctx->collection(), ctx->blankNodePropertyList()); } // ____________________________________________________________________________________ -Node Visitor::visit(Parser::BlankNodePropertyListContext* ctx) { +ad_utility::sparql_types::Node Visitor::visit(Parser::BlankNodePropertyListContext* ctx) { VarOrTerm var{GraphTerm{newBlankNode()}}; Triples triples; auto propertyList = visit(ctx->propertyListNotEmpty()); @@ -1213,7 +1227,7 
@@ void Visitor::visit(Parser::BlankNodePropertyListPathContext* ctx) { } // ____________________________________________________________________________________ -Node Visitor::visit(Parser::CollectionContext* ctx) { +ad_utility::sparql_types::Node Visitor::visit(Parser::CollectionContext* ctx) { Triples triples; VarOrTerm nextElement{ GraphTerm{Iri{""}}}; @@ -1245,7 +1259,7 @@ void Visitor::visit(Parser::CollectionPathContext* ctx) { } // ____________________________________________________________________________________ -Node Visitor::visit(Parser::GraphNodeContext* ctx) { +ad_utility::sparql_types::Node Visitor::visit(Parser::GraphNodeContext* ctx) { if (ctx->varOrTerm()) { return {visit(ctx->varOrTerm()), Triples{}}; } else { diff --git a/test/VocabularyGeneratorTest.cpp b/test/VocabularyGeneratorTest.cpp index 80a3e3feff..dd33caed63 100644 --- a/test/VocabularyGeneratorTest.cpp +++ b/test/VocabularyGeneratorTest.cpp @@ -161,10 +161,10 @@ TEST_F(MergeVocabularyTest, mergeVocabulary) { { VocabularyMerger m; auto file = ad_utility::makeOfstream(_basePath + INTERNAL_VOCAB_SUFFIX); - auto internalVocabularyAction = [&file](const auto& word, const auto& index) { + auto internalVocabularyAction = [&file](const auto& word, [[maybe_unused]]const auto& index) { file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n'; }; - auto externalVocabularyAction = [](const auto& word, const auto& index) {}; + auto externalVocabularyAction = []([[maybe_unused]]const auto& word, [[maybe_unused]]const auto& index) {}; res = m.mergeVocabulary(_basePath, 2, TripleComponentComparator(), internalVocabularyAction, externalVocabularyAction); } @@ -207,11 +207,11 @@ TEST(VocabularyGenerator, ReadAndWritePartial) { { VocabularyMerger m; auto file = ad_utility::makeOfstream(basename + INTERNAL_VOCAB_SUFFIX); - auto internalVocabularyAction = [&file](const auto& word, const auto& index) { + auto internalVocabularyAction = [&file](const auto& word, [[maybe_unused]]const auto& index) { 
file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n'; }; - auto externalVocabularyAction = [](const auto& word, const auto& index) {}; + auto externalVocabularyAction = []([[maybe_unused]]const auto& word, [[maybe_unused]]const auto& index) {}; m.mergeVocabulary(basename, 1, v.getCaseComparator(), internalVocabularyAction, externalVocabularyAction); } @@ -252,10 +252,10 @@ TEST(VocabularyGenerator, ReadAndWritePartial) { { VocabularyMerger m; auto file = ad_utility::makeOfstream(basename + INTERNAL_VOCAB_SUFFIX); - auto internalVocabularyAction = [&file](const auto& word, const auto& index) { + auto internalVocabularyAction = [&file](const auto& word, [[maybe_unused]]const auto& index) { file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n'; }; - auto externalVocabularyAction = [](const auto& word, const auto& index) {}; + auto externalVocabularyAction = []([[maybe_unused]]const auto& word, [[maybe_unused]]const auto& index) {}; m.mergeVocabulary(basename, 1, v.getCaseComparator(), internalVocabularyAction, externalVocabularyAction); } From f1d3eb253807c1719fbef431a0ddfbbefd4dd3b3 Mon Sep 17 00:00:00 2001 From: Noah Nock Date: Sat, 10 Jun 2023 00:54:33 +0200 Subject: [PATCH 08/20] Updated the Rtree --- third_party/rtree | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/rtree b/third_party/rtree index 851be0c9cd..6d05dc58ff 160000 --- a/third_party/rtree +++ b/third_party/rtree @@ -1 +1 @@ -Subproject commit 851be0c9cdfa69f5725b516b69ff8c97f173a701 +Subproject commit 6d05dc58ff366d9d0c0b4b759e6c0793a8dcb386 From 1afa5ea4c07e5d11264fd6c9a2721466b375f1db Mon Sep 17 00:00:00 2001 From: Noah Nock Date: Mon, 21 Aug 2023 01:50:31 +0200 Subject: [PATCH 09/20] Added external sorting --- CMakeLists.txt | 10 +---- .../sparqlExpressions/ContainsExpression.cpp | 4 +- src/index/Index.h | 2 +- src/index/IndexImpl.cpp | 41 ++++++++++++++----- src/index/IndexImpl.h | 4 +- src/util/CMakeLists.txt | 4 +- 6 files changed, 39 
insertions(+), 26 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d9240a4144..45e42b902b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -111,7 +111,7 @@ endif() ###################################### # BOOST ###################################### -find_package(Boost 1.74 COMPONENTS iostreams program_options REQUIRED) +find_package(Boost 1.76 COMPONENTS iostreams serialization program_options REQUIRED) include_directories(${Boost_INCLUDE_DIR}) @@ -129,7 +129,7 @@ find_package(OpenSSL REQUIRED) # `target_link_libraries` that additionally links against the common # libraries. function (qlever_target_link_libraries target) - target_link_libraries(${target} ${ARGN} absl::flat_hash_map absl::flat_hash_set absl::strings rtree) + target_link_libraries(${target} ${ARGN} absl::flat_hash_map absl::flat_hash_set absl::strings) endfunction() @@ -208,12 +208,6 @@ if (ENABLE_EXPENSIVE_CHECKS) add_definitions("-DAD_ENABLE_EXPENSIVE_CHECKS") endif() -############################### -# RTREE -############################### -add_subdirectory(third_party/rtree) -include_directories(SYSTEM third_party/rtree/) - ################################ # STXXL ################################ diff --git a/src/engine/sparqlExpressions/ContainsExpression.cpp b/src/engine/sparqlExpressions/ContainsExpression.cpp index 8dff5371bc..0b14606fb8 100644 --- a/src/engine/sparqlExpressions/ContainsExpression.cpp +++ b/src/engine/sparqlExpressions/ContainsExpression.cpp @@ -7,7 +7,7 @@ #include "engine/sparqlExpressions/SparqlExpressionGenerators.h" #include "global/ValueIdComparators.h" #include "re2/re2.h" -#include +#include using namespace std::literals; @@ -127,7 +127,7 @@ namespace sparqlExpression { multiBoxGeo treeResults = rtree.SearchTree(boundingBox_->value, "./rtree_build"); for (rTreeValue item : treeResults) { - long long id = item.second; + long long id = item.id; resultIds.insert(id); } } diff --git a/src/index/Index.h b/src/index/Index.h index 3b5267e740..0148323761 
100644 --- a/src/index/Index.h +++ b/src/index/Index.h @@ -16,7 +16,7 @@ #include "index/StringSortComparator.h" #include "index/Vocabulary.h" #include "parser/TripleComponent.h" -#include +#include // Forward declarations. class IdTable; diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index f787aa4801..7ad9e26085 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -239,6 +239,13 @@ template void IndexImpl::createFromFile( template IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( const string& filename, size_t linesPerPartial) { + try { + Rtree rtree = Rtree(1300000000); + rtree.BuildTree("vocab_raw", 16, "./rtree_build"); + LOG(INFO) << "Finished building the Rtree" << std::endl; + } catch (const std::exception &e) { + LOG(INFO) << e.what() << std::endl; + } LOG(INFO) << "Processing input triples from " << filename << " ..." << std::endl; auto parser = std::make_shared(filename); @@ -400,17 +407,27 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( auto wordWriter = vocab_.makeUncompressingWordWriter(onDiskBase_ + INTERNAL_VOCAB_SUFFIX); - auto internalVocabularyAction = [&wordWriter](const auto& word, const auto& index) { + std::ofstream convertOfs = std::ofstream(onDiskBase_ + ".vocabulary.boundingbox.tmp", std::ios::binary); + + auto internalVocabularyAction = [&wordWriter, &convertOfs](const auto& word, const auto& index) { wordWriter.push(word.data(), word.size()); - Rtree::ConvertWordToRtreeEntry(word, index, "./ConversionTest"); + std::optional boundingBox = Rtree::ConvertWordToRtreeEntry(word); + if (boundingBox) { + Rtree::SaveEntry(boundingBox.value(), index, convertOfs); + } }; - - auto externalVocabularyAction = [](const auto& word, const auto& index) { - Rtree::ConvertWordToRtreeEntry(word, index, "./ConversionTest"); + auto externalVocabularyAction = [&convertOfs](const auto& word, const auto& index) { + std::optional boundingBox = Rtree::ConvertWordToRtreeEntry(word); + if (boundingBox) 
{ + Rtree::SaveEntry(boundingBox.value(), index, convertOfs); + } }; VocabularyMerger::VocabularyMetaData result = v.mergeVocabulary(onDiskBase_, numFiles, sortPred, internalVocabularyAction, externalVocabularyAction); + + convertOfs.close(); + return result; }(); LOG(DEBUG) << "Finished merging partial vocabularies" << std::endl; @@ -421,11 +438,14 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( << res.vocabularyMetaData_.numWordsTotal_ - sizeInternalVocabulary << std::endl; - LOG(INFO) << "Loading the Rtree entries..." << std::endl; - Rtree rtree = Rtree(); - multiBoxGeo entries = rtree.LoadEntries("./ConversionTest"); LOG(INFO) << "Building the Rtree..." << std::endl; - rtree.BuildTree(entries, 16, "./rtree_build"); + try { + Rtree rtree = Rtree(1300000000); + rtree.BuildTree(onDiskBase_ + ".vocabulary", 16, "./rtree_build"); + LOG(INFO) << "Finished building the Rtree" << std::endl; + } catch (const std::exception &e) { + LOG(INFO) << e.what() << std::endl; + } res.idTriples = std::move(idTriples); res.actualPartialSizes = std::move(actualPartialSizes); @@ -691,8 +711,7 @@ void IndexImpl::createFromOnDiskIndex(const string& onDiskBase) { patterns_, hasPattern_); } - // Load the Rtree - rtree_ = Rtree(); + // Load the Rtree TODO } // _____________________________________________________________________________ diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index 8b9ec78a5d..4f344998b7 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include @@ -120,7 +120,7 @@ class IndexImpl { size_t totalVocabularySize_ = 0; bool vocabPrefixCompressed_ = true; Index::TextVocab textVocab_; - Rtree rtree_; + Rtree rtree_ = Rtree(4000000000); TextMetaData textMeta_; DocsDB docsDB_; diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index 6581076a62..92d2293850 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -1,4 +1,4 @@ 
add_subdirectory(http) add_library(util GeoSparqlHelpers.h GeoSparqlHelpers.cpp VisitMixin.h - antlr/ANTLRErrorHandling.cpp antlr/ANTLRErrorHandling.h Conversions.cpp ResetWhenMoved.h) -qlever_target_link_libraries(util parser) + antlr/ANTLRErrorHandling.cpp antlr/ANTLRErrorHandling.h Conversions.cpp ResetWhenMoved.h Rtree.h Rtree.cpp) +qlever_target_link_libraries(util parser Boost::serialization) From 9fcd5fd7d71ff51e8751599cb13a259c18f2ec72 Mon Sep 17 00:00:00 2001 From: Noah Nock Date: Mon, 21 Aug 2023 01:54:00 +0200 Subject: [PATCH 10/20] Removed Rtree submodule --- .gitmodules | 3 --- third_party/rtree | 1 - 2 files changed, 4 deletions(-) delete mode 160000 third_party/rtree diff --git a/.gitmodules b/.gitmodules index 3174b33684..7a2fe9cc0f 100644 --- a/.gitmodules +++ b/.gitmodules @@ -13,6 +13,3 @@ [submodule "third_party/antlr4"] path = third_party/antlr4 url = https://github.com/antlr/antlr4.git -[submodule "third_party/rtree"] - path = third_party/rtree - url = https://github.com/noahnock/Bachelorprojekt.git diff --git a/third_party/rtree b/third_party/rtree deleted file mode 160000 index 6d05dc58ff..0000000000 --- a/third_party/rtree +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 6d05dc58ff366d9d0c0b4b759e6c0793a8dcb386 From 2987b71bb36496667351d91b3fb4bdf655367d70 Mon Sep 17 00:00:00 2001 From: Noah Nock Date: Mon, 21 Aug 2023 01:54:53 +0200 Subject: [PATCH 11/20] Added Rtree to util --- src/util/Rtree.cpp | 1283 ++++++++++++++++++++++++++++++++++++++++++++ src/util/Rtree.h | 278 ++++++++++ 2 files changed, 1561 insertions(+) create mode 100644 src/util/Rtree.cpp create mode 100644 src/util/Rtree.h diff --git a/src/util/Rtree.cpp b/src/util/Rtree.cpp new file mode 100644 index 0000000000..2f7f078270 --- /dev/null +++ b/src/util/Rtree.cpp @@ -0,0 +1,1283 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Noah Nock + +#include +#include + +static void centerOrdering(multiBoxGeo& boxes, size_t dim) { + if (dim == 0) { + // order by centerX + sortRuleLambdaX comp; + + std::sort(boxes.begin(), boxes.end(), comp); + } else { + // order by centerY + auto sortRuleLambda = [](rTreeValue b1, rTreeValue b2) -> bool { + double center1 = (b1.box.min_corner().get<1>() + b1.box.max_corner().get<1>()) / 2; + double center2 = (b2.box.min_corner().get<1>() + b2.box.max_corner().get<1>()) / 2; + return center1 < center2; + }; + + std::sort(boxes.begin(), boxes.end(), sortRuleLambda); + } +} + +static void centerOrdering(multiBoxWithOrderIndex& boxes, size_t dim) { + if (dim == 0) { + // order by centerX + sortRuleLambdaXWithIndex comp; + + std::sort(boxes.begin(), boxes.end(), comp); + } else { + // order by centerY + sortRuleLambdaYWithIndex comp; + + std::sort(boxes.begin(), boxes.end(), comp); + } +} + +OrderedBoxes SortInput(const std::string& onDiskBase, size_t M, uintmax_t maxBuildingRamUsage, bool workInRam) { + OrderedBoxes orderedInputRectangles; + + ad_utility::BackgroundStxxlSorter sorterRectsD0Basic = ad_utility::BackgroundStxxlSorter(std::ceil(maxBuildingRamUsage / 3.0)); + multiBoxGeo rectsD0Basic; + + if (workInRam) { + rectsD0Basic = Rtree::LoadEntries(onDiskBase + ".boundingbox.tmp"); + centerOrdering(rectsD0Basic, 0); + } else { + FileReaderWithoutIndex fileReaderRectsD0 = FileReaderWithoutIndex(onDiskBase + ".boundingbox.tmp"); + std::optional rectD0Element = fileReaderRectsD0.GetNextElement(); + while (rectD0Element) { + sorterRectsD0Basic.push(rectD0Element.value()); + rectD0Element = fileReaderRectsD0.GetNextElement(); + } + fileReaderRectsD0.Close(); + } + + long long xSize = 0; + double globalMinX = -1; + double globalMinY = -1; + double globalMaxX = -1; + double globalMaxY = -1; + + ad_utility::BackgroundStxxlSorter sorterRectsD1 = ad_utility::BackgroundStxxlSorter(std::ceil(maxBuildingRamUsage / 3.0)); + std::shared_ptr RectanglesD1WithOrder = 
std::make_shared(); + + if (workInRam) { + for (rTreeValue element : rectsD0Basic) { + rTreeValueWithOrderIndex entry = rTreeValueWithOrderIndex(element.box, element.id, xSize, 0); + RectanglesD1WithOrder->push_back(entry); + xSize++; + + if (globalMinX == -1 || element.box.min_corner().get<0>() < globalMinX) { + globalMinX = element.box.min_corner().get<0>(); + } + if (globalMinY == -1 || element.box.min_corner().get<1>() < globalMinY) { + globalMinY = element.box.min_corner().get<1>(); + } + if (element.box.max_corner().get<0>() > globalMaxX) { + globalMaxX = element.box.max_corner().get<0>(); + } + if (element.box.max_corner().get<1>() > globalMaxY) { + globalMaxY = element.box.max_corner().get<1>(); + } + } + centerOrdering(*RectanglesD1WithOrder, 1); + } else { + for (rTreeValue element : sorterRectsD0Basic.sortedView()) { + rTreeValueWithOrderIndex entry = rTreeValueWithOrderIndex(element.box, element.id, xSize, 0); + sorterRectsD1.push(entry); + xSize++; + + if (globalMinX == -1 || element.box.min_corner().get<0>() < globalMinX) { + globalMinX = element.box.min_corner().get<0>(); + } + if (globalMinY == -1 || element.box.min_corner().get<1>() < globalMinY) { + globalMinY = element.box.min_corner().get<1>(); + } + if (element.box.max_corner().get<0>() > globalMaxX) { + globalMaxX = element.box.max_corner().get<0>(); + } + if (element.box.max_corner().get<1>() > globalMaxY) { + globalMaxY = element.box.max_corner().get<1>(); + } + } + } + sorterRectsD0Basic.clear(); + + size_t currentS = std::ceil(((float) xSize) / ((float) M)); + + long long ySize = 0; + std::ofstream r1File = std::ofstream(onDiskBase + ".boundingbox.d1.tmp", std::ios::binary); + ad_utility::BackgroundStxxlSorter sorterRectsD0 = ad_utility::BackgroundStxxlSorter(std::ceil(maxBuildingRamUsage / 3.0)); + std::shared_ptr RectanglesD0WithOrder = std::make_shared(); + std::shared_ptr r1Small = std::make_shared(); + // placeholder + r1Small->push_back(rTreeValueWithOrderIndex()); + 
r1Small->push_back(rTreeValueWithOrderIndex()); + rTreeValueWithOrderIndex minD1; + rTreeValueWithOrderIndex maxD1; + + if (workInRam) { + for (rTreeValueWithOrderIndex element : *RectanglesD1WithOrder) { + element.orderY = ySize; + RectanglesD0WithOrder->push_back(element); + + if (((ySize + 1) % currentS == 0 && (ySize + 1) / currentS >= 1 && (ySize + 1) / currentS < M) + || (ySize % currentS == 0 && ySize / currentS >= 1 && ySize / currentS < M)) { + // index i * S - 1 or i * S + r1Small->push_back(element); + } + + if (ySize == 0) { + minD1 = element; + maxD1 = element; + } + if (element.orderY > maxD1.orderY) { + maxD1 = element; + } + + ySize++; + } + centerOrdering(*RectanglesD0WithOrder, 0); + } else { + for (rTreeValueWithOrderIndex element : sorterRectsD1.sortedView()) { + element.orderY = ySize; + Rtree::SaveEntryWithOrderIndex(element, r1File); + sorterRectsD0.push(element); + + if (((ySize + 1) % currentS == 0 && (ySize + 1) / currentS >= 1 && (ySize + 1) / currentS < M) + || (ySize % currentS == 0 && ySize / currentS >= 1 && ySize / currentS < M)) { + // index i * S - 1 or i * S + r1Small->push_back(element); + } + + if (ySize == 0) { + minD1 = element; + maxD1 = element; + } + if (element.orderY > maxD1.orderY) { + maxD1 = element; + } + + ySize++; + } + } + + r1File.close(); + sorterRectsD1.clear(); + + // replace the placeholder + (*r1Small)[0] = minD1; + (*r1Small)[1] = maxD1; + + long long currentX = 0; + std::ofstream r0File = std::ofstream(onDiskBase + ".boundingbox.d0.tmp", std::ios::binary); + std::shared_ptr r0Small = std::make_shared(); + // placeholder + r0Small->push_back(rTreeValueWithOrderIndex()); + r0Small->push_back(rTreeValueWithOrderIndex()); + rTreeValueWithOrderIndex minD0; + rTreeValueWithOrderIndex maxD0; + + if (workInRam) { + for (rTreeValueWithOrderIndex element : *RectanglesD0WithOrder) { + if (((currentX + 1) % currentS == 0 && (currentX + 1) / currentS >= 1 && (currentX + 1) / currentS < M) + || (currentX % currentS == 0 
&& currentX / currentS >= 1 && currentX / currentS < M)) { + // index i * S - 1 or i * S + r0Small->push_back(element); + } + + if (currentX == 0) { + minD0 = element; + maxD0 = element; + } + if (element.orderX > maxD0.orderX) { + maxD0 = element; + } + + currentX++; + } + } else { + for (rTreeValueWithOrderIndex element : sorterRectsD0.sortedView()) { + Rtree::SaveEntryWithOrderIndex(element, r0File); + + if (((currentX + 1) % currentS == 0 && (currentX + 1) / currentS >= 1 && (currentX + 1) / currentS < M) + || (currentX % currentS == 0 && currentX / currentS >= 1 && currentX / currentS < M)) { + // index i * S - 1 or i * S + r0Small->push_back(element); + } + + if (currentX == 0) { + minD0 = element; + maxD0 = element; + } + if (element.orderX > maxD0.orderX) { + maxD0 = element; + } + + currentX++; + } + } + + r0File.close(); + sorterRectsD0.clear(); + + // replace the placeholder + (*r0Small)[0] = minD0; + (*r0Small)[1] = maxD0; + + boxGeo boundingBox = Rtree::createBoundingBox(globalMinX, globalMinY, globalMaxX, globalMaxY); + if (workInRam) { + orderedInputRectangles.CreateOrderedBoxesInRam(RectanglesD0WithOrder, RectanglesD1WithOrder, r0Small, r1Small, boundingBox); + } else { + orderedInputRectangles.CreateOrderedBoxesOnDisk(onDiskBase + ".boundingbox.d0", onDiskBase + ".boundingbox.d1", r0Small, r1Small, xSize, boundingBox); + } + return orderedInputRectangles; +} + +/*OrderedBoxes SortInput(const std::string& onDiskBase, size_t M, uintmax_t maxBuildingRamUsage, bool workInRam) { + if (workInRam) { + return InternalSort(onDiskBase, M); + } else { + return ExternalSort(onDiskBase, M, maxBuildingRamUsage); + } +}*/ + +bool intersects(const boxGeo &b1, const boxGeo &b2) { + /** + * Determine whether two bounding boxes intersect + */ + bool notIntersecting = b1.min_corner().get<0>() > b2.max_corner().get<0>() || + b2.min_corner().get<0>() > b1.max_corner().get<0>() || + b1.min_corner().get<1>() > b2.max_corner().get<1>() || + b2.min_corner().get<1>() > 
b1.max_corner().get<1>(); + + return !notIntersecting; +} + +static double costFunctionTGS(boxGeo& b0, boxGeo& b1, size_t dim) { + /** + * The cost function determines the quality of a split. The lower the cost, the better the split. + * Each split gets represented by the resulting bounding boxes of the split pieces. + */ + double cost; + + // The cost represents the overlap of the two boxes + if (dim == 0) { + cost = b0.max_corner().get<0>() - b1.min_corner().get<0>(); + cost = cost < 0 ? 0 : cost; + } else { + cost = b0.max_corner().get<1>() - b1.min_corner().get<1>(); + cost = cost < 0 ? 0 : cost; + } + + return cost; +} + +boxGeo Rtree::createBoundingBox(double pointOneX, double pointOneY, double pointTwoX, double pointTwoY) { + return make(make(pointOneX, pointOneY), make(pointTwoX, pointTwoY)); +} + +static std::vector TGSRecursive(const std::string& filePath, OrderedBoxes orderedInputRectangles, size_t M, size_t S, long long maxBuildingRamUsage) { + /** + * This function recursively constructs one layer of children for a certain root node. + * The input rectangles must be sorted in both x- and y-direction. 
+ * The algorithm is based on this paper https://dl.acm.org/doi/pdf/10.1145/288692.288723 + */ + + unsigned long long n = orderedInputRectangles.GetSize(); + + if (n <= S || n <= M) { + // stop condition + return std::vector { orderedInputRectangles }; + } + // split the rectangles at the best split + std::pair split = orderedInputRectangles.SplitAtBest(filePath, S, M, maxBuildingRamUsage); + + // recursion + std::vector result0 = TGSRecursive(filePath + ".0", split.first, M, S, maxBuildingRamUsage); + std::vector result1 = TGSRecursive(filePath + ".1", split.second, M, S, maxBuildingRamUsage); + + std::vector result; + result.insert(result.begin(), result0.begin(), result0.end()); + result.insert(result.end(), result1.begin(), result1.end()); + + return result; +} + +void Rtree::BuildTree(const std::string& onDiskBase, size_t M, const std::string& folder) const { + const std::string file = onDiskBase + ".boundingbox.tmp"; + + // prepare the files + std::filesystem::create_directory(folder); + std::ofstream nodesOfs = std::ofstream(folder + "/nodes.bin", std::ios::binary); + std::map lookup; + + // sort the rectangles + long long fileLines = std::ceil(std::filesystem::file_size(file) / (4 * sizeof(double) + sizeof(uint64_t) + 2 * sizeof(long long))); + bool workInRam = (std::filesystem::file_size(file) + fileLines * 2 * sizeof(long long)) * 4 < this->maxBuildingRamUsage; + + if (workInRam) { + std::cout << "Building in ram" << std::endl; + } else { + std::cout << "Building on disk" << std::endl; + } + OrderedBoxes orderedInputRectangles = SortInput(onDiskBase, M, maxBuildingRamUsage, workInRam); + std::cout << "Finished intital sorting" << std::endl; + + // build the tree in a depth first approach + std::stack layerStack; + + long long newId = 1; // start from 1, because 0 is the root item + ConstructionNode rootItem = ConstructionNode(0, orderedInputRectangles); + layerStack.push(rootItem); + size_t layer = 0; + + while (!layerStack.empty()) { + ConstructionNode 
currentItem = layerStack.top(); + layerStack.pop(); + + if (currentItem.GetOrderedBoxes().GetSize() <= M) { + // reached a leaf + currentItem.AddChildrenToItem(); + long long nodePtr = SaveNode(currentItem, true, nodesOfs); + lookup[currentItem.GetId()] = nodePtr; + } else { + std::vector tgsResult = TGSRecursive(onDiskBase + ".boundingbox." + std::to_string(layer), currentItem.GetOrderedBoxes(), M, std::ceil(((float) currentItem.GetOrderedBoxes().GetSize()) / ((float) M)), this->maxBuildingRamUsage); + for (OrderedBoxes& currentOrderedRectangles : tgsResult) { + ConstructionNode newItem = ConstructionNode(newId, currentOrderedRectangles); + layerStack.push(newItem); + + currentItem.AddChild(newItem); + + newId++; + } + + long long nodePtr = SaveNode(currentItem, false, nodesOfs); + lookup[currentItem.GetId()] = nodePtr; + } + layer++; + } + nodesOfs.close(); + + std::ofstream lookupOfs(folder + "/lookup.bin", std::ios::binary); + for (unsigned int i = 0; i < newId; i++) { + long long nodePtr = lookup[i]; + lookupOfs.write(reinterpret_cast(&nodePtr), sizeof(long long)); + } + lookupOfs.close(); +} + +multiBoxGeo Rtree::SearchTree(boxGeo query, const std::string &folder) { + std::ifstream lookupIfs = std::ifstream(folder + "/lookup.bin", std::ios::binary); + std::ifstream nodesIfs = std::ifstream(folder + "/nodes.bin", std::ios::binary); + + Node rootNode = LoadNode(0, lookupIfs, nodesIfs); + multiBoxGeo results; + std::stack nodes; + nodes.push(rootNode); + + while(!nodes.empty()) { + Node currentNode = nodes.top(); + nodes.pop(); + + for (rTreeValue child : currentNode.GetChildren()) { + if (intersects(query, child.box)) { + if (currentNode.GetIsLastInnerNode()) { + results.push_back(child); + } else { + Node newNode = LoadNode(child.id, lookupIfs, nodesIfs); + nodes.push(newNode); + } + } + } + } + + lookupIfs.close(); + nodesIfs.close(); + return results; +} + +ConstructionNode::ConstructionNode(long long id, OrderedBoxes orderedBoxes) + : Node{id} +{ + 
this->orderedBoxes = orderedBoxes; + + // calculate the boundingBoxes + this->boundingBox = orderedBoxes.GetBoundingBox(); +} + +void ConstructionNode::AddChildrenToItem() { + /** + * Add all children of a certain node at once. + * This is used when a leaf node is reached. + */ + if (this->GetOrderedBoxes().WorkInRam()) { + for(rTreeValueWithOrderIndex box : *this->GetOrderedBoxes().GetRectanglesInRam()) { + Node leafNode = Node(box.id, box.box); + this->AddChild(leafNode); + } + } else { + FileReader fileReader = FileReader(this->GetOrderedBoxes().GetRectanglesOnDisk()); + + std::optional element = fileReader.GetNextElement(); + while(element) { + Node leafNode = Node(element.value().id, element.value().box); + this->AddChild(leafNode); + element = fileReader.GetNextElement(); + } + + fileReader.Close(); + } +} + +long long Node::GetId() const { + return this->id; +} + +OrderedBoxes ConstructionNode::GetOrderedBoxes() { + return this->orderedBoxes; +} + +Node::Node(long long id, boxGeo boundingbox) { + this->id = id; + this->boundingBox = boundingbox; +} + +Node::Node(long long id) { + this->id = id; +} + +Node::Node() {} + +Node::Node(long long id, boxGeo boundingBox, multiBoxGeo &children, bool isLastInnerNode) { + this->id = id; + this->boundingBox = boundingBox; + this->children = children; + this->isLastInnerNode = isLastInnerNode; +} + +Node::Node(long long id, double minX, double minY, double maxX, double maxY, bool isLastInnerNode) { + this->id = id; + this->boundingBox = Rtree::createBoundingBox(minX, minY, maxX, maxY); + this->isLastInnerNode = isLastInnerNode; +} + +void Node::AddChild(Node& child) { + boxGeo box = child.GetBoundingBox(); + unsigned long long entryId = child.GetId(); + rTreeValue entry = rTreeValue(box, entryId); + this->children.push_back(entry); +} + +boxGeo Node::GetBoundingBox() const { + return this->boundingBox; +} + +void Node::SetIsLastInnerNode(bool _isLastInnerNode) { + this->isLastInnerNode = _isLastInnerNode; +} + +bool 
Node::GetIsLastInnerNode() const {
+  return this->isLastInnerNode;
+}
+
+multiBoxGeo Node::GetChildren() {
+  // Returns a copy of the child list.
+  return this->children;
+}
+
+long long Rtree::SaveNode(Node &node, bool isLastInnerNode, std::ofstream& nodesOfs) {
+  /**
+   * Serialize `node` at the current write position of `nodesOfs` and return
+   * that position, so the caller can store it in the lookup table.
+   * The node's "last inner node" flag is set to `isLastInnerNode` first.
+   */
+  node.SetIsLastInnerNode(isLastInnerNode);
+
+  const long long pos = static_cast<long long>(nodesOfs.tellp());
+  boost::archive::binary_oarchive archive(nodesOfs);
+  archive << node;
+  // One separator byte is written after every serialized node.
+  nodesOfs.write(" ", 1);
+
+  return pos;
+}
+
+Node Rtree::LoadNode(long long id, std::ifstream& lookupIfs, std::ifstream& nodesIfs) {
+  /**
+   * Load the node with the given id: entry `id` of the lookup file holds the
+   * byte offset of the serialized node inside the nodes file.
+   */
+  const long long offset = id * static_cast<long long>(sizeof(long long));
+  lookupIfs.seekg(offset, std::ios::beg);
+
+  long long nodePtr = 0;
+  lookupIfs.read(reinterpret_cast<char*>(&nodePtr), sizeof(long long));
+
+  nodesIfs.seekg(nodePtr);
+  boost::archive::binary_iarchive ia(nodesIfs);
+  Node newNode;
+  ia >> newNode;
+
+  return newNode;
+}
+
+std::optional<boxGeo> GetBoundingBoxFromWKT(const std::string& wkt) {
+  /**
+   * Parse the wkt literal in a way, that only the relevant data for the rtree gets read in.
+   * The text is scanned for numbers, which are taken alternately as x and y;
+   * the result is the box spanned by the extremes of both. Returns an empty
+   * optional when a number cannot be converted or when no complete x/y pair
+   * was found.
+   */
+  bool lookingForX = true;
+  bool haveX = false;
+  bool haveY = false;
+  std::string currentDouble;
+
+  double minX = 0;
+  double maxX = 0;
+  double minY = 0;
+  double maxY = 0;
+
+  // Convert the collected characters and fold the value into the extremes of
+  // the coordinate that is due. Explicit "have" flags replace the former -1
+  // sentinel, which gave wrong boxes for negative coordinates.
+  auto consumeNumber = [&]() -> bool {
+    double value;
+    try {
+      value = std::stod(currentDouble);
+    } catch (...) {
+      return false;
+    }
+    currentDouble.clear();
+
+    double& lowest = lookingForX ? minX : minY;
+    double& highest = lookingForX ? maxX : maxY;
+    bool& have = lookingForX ? haveX : haveY;
+    if (!have || value < lowest) {
+      lowest = value;
+    }
+    if (!have || value > highest) {
+      highest = value;
+    }
+    have = true;
+    lookingForX = !lookingForX;
+    return true;
+  };
+
+  for (char c : wkt) {
+    // isdigit must not be handed a negative char value.
+    const bool digit = isdigit(static_cast<unsigned char>(c)) != 0;
+    // An exponent marker only counts while a number is being collected, so
+    // letters of a geometry keyword are never mistaken for one.
+    const bool exponent = !currentDouble.empty() && (c == 'e' || c == 'E');
+    if (digit || c == '.' || c == '-' || c == '+' || exponent) {
+      currentDouble += c;
+      continue;
+    }
+
+    // Any other character ends the number collected so far.
+    if (!currentDouble.empty() && !consumeNumber()) {
+      return std::nullopt;
+    }
+  }
+
+  // A number that runs up to the end of the text still counts.
+  if (!currentDouble.empty() && !consumeNumber()) {
+    return std::nullopt;
+  }
+
+  if (!haveX || !haveY) {
+    return std::nullopt;
+  }
+
+  return { Rtree::createBoundingBox(minX, minY, maxX, maxY) };
+}
+
+std::optional<boxGeo> Rtree::ConvertWordToRtreeEntry(const std::string& wkt) {
+  /**
+   * Convert a single wkt literal to a boundingbox.
+   * Handles MULTIPOLYGON, POLYGON and LINESTRING (tried in this order);
+   * anything else yields an empty optional.
+   */
+  struct Shape {
+    const char* opening;
+    const char* closing;
+  };
+  const Shape shapes[] = {
+      {"MULTIPOLYGON(((", ")))"},
+      {"POLYGON((", "))"},
+      {"LINESTRING(", ")"},
+  };
+
+  for (const Shape& shape : shapes) {
+    const std::string opening(shape.opening);
+
+    // Test the result of find() before doing arithmetic on it: adding an
+    // offset to npos wraps around and made the former npos check always pass.
+    const std::size_t keywordPos = wkt.find(opening);
+    if (keywordPos == std::string::npos) {
+      continue;
+    }
+
+    // The coordinate text starts at the last '(' of the opening and runs up
+    // to and including the first character of the closing sequence.
+    const std::size_t posWKTStart = keywordPos + opening.size() - 1;
+    const std::size_t posWKTEnd = wkt.find(shape.closing, posWKTStart);
+    if (posWKTEnd == std::string::npos) {
+      continue;
+    }
+
+    return GetBoundingBoxFromWKT(wkt.substr(posWKTStart, posWKTEnd - posWKTStart + 1));
+  }
+
+  return std::nullopt;
+}
+
+void Rtree::SaveEntry(boxGeo boundingBox, uint64_t index, std::ofstream& convertOfs) {
+  /**
+   * Save a single entry (which was e.g.
converted by ConvertWordToRtreeEntry) to the disk + */ + double minX = boundingBox.min_corner().get<0>(); + double minY = boundingBox.min_corner().get<1>(); + double maxX = boundingBox.max_corner().get<0>(); + double maxY = boundingBox.max_corner().get<1>(); + + convertOfs.write(reinterpret_cast(&minX), sizeof(double)); + convertOfs.write(reinterpret_cast(&minY), sizeof(double)); + convertOfs.write(reinterpret_cast(&maxX), sizeof(double)); + convertOfs.write(reinterpret_cast(&maxY), sizeof(double)); + convertOfs.write(reinterpret_cast(&index), sizeof(uint64_t)); +} + +void Rtree::SaveEntryWithOrderIndex(rTreeValueWithOrderIndex treeValue, std::ofstream& convertOfs) { + /** + * Save a single entry, containing its postion in the x- and y-sorting + */ + double minX = treeValue.box.min_corner().get<0>(); + double minY = treeValue.box.min_corner().get<1>(); + double maxX = treeValue.box.max_corner().get<0>(); + double maxY = treeValue.box.max_corner().get<1>(); + + convertOfs.write(reinterpret_cast(&minX), sizeof(double)); + convertOfs.write(reinterpret_cast(&minY), sizeof(double)); + convertOfs.write(reinterpret_cast(&maxX), sizeof(double)); + convertOfs.write(reinterpret_cast(&maxY), sizeof(double)); + convertOfs.write(reinterpret_cast(&treeValue.id), sizeof(uint64_t)); + convertOfs.write(reinterpret_cast(&treeValue.orderX), sizeof(long long)); + convertOfs.write(reinterpret_cast(&treeValue.orderY), sizeof(long long)); +} + +multiBoxGeo Rtree::LoadEntries(const std::string& file) { + multiBoxGeo boxes; + + FileReaderWithoutIndex fileReader = FileReaderWithoutIndex(file); + + std::optional element = fileReader.GetNextElement(); + while (element) { + boxes.push_back(element.value()); + element = fileReader.GetNextElement(); + } + + fileReader.Close(); + + return boxes; +} + +multiBoxWithOrderIndex Rtree::LoadEntriesWithOrderIndex(const std::string& file) { + multiBoxWithOrderIndex boxes; + FileReader fileReader = FileReader(file); + + std::optional element = 
fileReader.GetNextElement(); + while (element) { + boxes.push_back(element.value()); + element = fileReader.GetNextElement(); + } + + fileReader.Close(); + + return boxes; +} + +Rtree::Rtree(uintmax_t maxBuildingRamUsage) { + this->maxBuildingRamUsage = maxBuildingRamUsage; +} + +bool OrderedBoxes::WorkInRam() const{ + return this->workInRam; +} + +void OrderedBoxes::CreateOrderedBoxesInRam(const std::shared_ptr& rectanglesD0, const std::shared_ptr& rectanglesD1, const std::shared_ptr& rectanglesSmallD0, const std::shared_ptr& rectanglesSmallD1, boxGeo box) { + this->workInRam = true; + this->rectanglesD0InRam = rectanglesD0; + this->rectanglesD1InRam = rectanglesD1; + this->rectanglesD0Small = rectanglesSmallD0; + this->rectanglesD1Small = rectanglesSmallD1; + this->size = (*rectanglesD0).size(); + this->boundingBox = box; +} + +void OrderedBoxes::CreateOrderedBoxesOnDisk(const std::string& rectanglesD0, const std::string& rectanglesD1, const std::shared_ptr& rectanglesSmallD0, const std::shared_ptr& rectanglesSmallD1, long long size, boxGeo box) { + this->workInRam = false; + this->rectanglesD0OnDisk = rectanglesD0 + ".tmp"; + this->rectanglesD1OnDisk = rectanglesD1 + ".tmp"; + this->rectanglesD0Small = rectanglesSmallD0; + this->rectanglesD1Small = rectanglesSmallD1; + this->size = size; + this->boundingBox = box; +} + +boxGeo OrderedBoxes::GetBoundingBox() { + return this->boundingBox; +} + +long long OrderedBoxes::GetSize() const { + return this->size; +} + +std::shared_ptr OrderedBoxes::GetRectanglesInRam() { + return this->rectanglesD0InRam; +} + +std::string OrderedBoxes::GetRectanglesOnDisk() { + return this->rectanglesD0OnDisk; +} + +SplitResult OrderedBoxes::GetBestSplit() { + /** + * Determine based on the "small-lists", which split is the best for the rtree. 
+ */ + struct SplitResult splitResult; + + rTreeValueWithOrderIndex minElement; + rTreeValueWithOrderIndex maxElement; + rTreeValueWithOrderIndex currentLastElement; + rTreeValueWithOrderIndex currentElement; + + // This bool is used, since we need every other element as our element "S * i" (described in the algorithm) + // To perform the split better, the element before it (S * i - 1) is saved as well + bool currentlyAtSTimesI = false; + + for (size_t dim = 0; dim < 2; dim++) { + for (long long i = 0; i < this->rectanglesD0Small->size(); i++) { + currentElement = dim == 0 ? (*this->rectanglesD0Small)[i] : (*this->rectanglesD1Small)[i]; + + if (i == 0) { + // this is the min element + minElement = currentElement; + continue; + } + + if (i == 1) { + // this is the max element + maxElement = currentElement; + continue; + } + + if (!currentlyAtSTimesI) { + currentLastElement = currentElement; + currentlyAtSTimesI = true; + continue; + } + + double minXB0 = 0; + double maxXB0 = 1; + double minXB1 = 0; + double maxXB1 = 1; + double minYB0 = 0; + double maxYB0 = 1; + double minYB1 = 0; + double maxYB1 = 1; + + if (currentlyAtSTimesI && currentElement.id != maxElement.id) { + // the current element is a possible split position. 
+ if (dim == 0) { + minXB0 = (minElement.box.min_corner().get<0>() + minElement.box.max_corner().get<0>()) / 2; + maxXB0 = (currentLastElement.box.min_corner().get<0>() + currentLastElement.box.max_corner().get<0>()) / 2; + + minXB1 = (currentElement.box.min_corner().get<0>() + currentElement.box.max_corner().get<0>()) / 2; + maxXB1 = (maxElement.box.min_corner().get<0>() + maxElement.box.max_corner().get<0>()) / 2; + } else { + minYB0 = (minElement.box.min_corner().get<1>() + minElement.box.max_corner().get<1>()) / 2; + maxYB0 = (currentLastElement.box.min_corner().get<1>() + currentLastElement.box.max_corner().get<1>()) / 2; + + minYB1 = (currentElement.box.min_corner().get<1>() + currentElement.box.max_corner().get<1>()) / 2; + maxYB1 = (maxElement.box.min_corner().get<1>() + maxElement.box.max_corner().get<1>()) / 2; + } + + currentlyAtSTimesI = false; + } else { + break; + } + + boxGeo b0 = Rtree::createBoundingBox(minXB0, minYB0, maxXB0, maxYB0); + boxGeo b1 = Rtree::createBoundingBox(minXB1, minYB1, maxXB1, maxYB1); + + + double cost = costFunctionTGS(b0, b1, dim); + + if (splitResult.bestCost == -1 || cost < splitResult.bestCost) { + splitResult.bestCost = cost; + splitResult.bestDim = dim; + splitResult.bestLastElement = currentLastElement; + splitResult.bestElement = currentElement; + splitResult.bestMinElement = minElement; + splitResult.bestMaxElement = maxElement; + splitResult.bestIndex = i; + } + } + currentlyAtSTimesI = false; + } + + return splitResult; +} + +std::pair OrderedBoxes::SplitAtBest(const std::string& filePath, size_t S, size_t M, long long maxBuildingRamUsage) { + if (this->workInRam) { + return this->SplitAtBestInRam(S, M); + } else { + return this->SplitAtBestOnDisk(filePath, S, M, maxBuildingRamUsage); + } +} + +std::pair OrderedBoxes::SplitAtBestInRam(size_t S, size_t M) { + /** + * Split the ordered boxes in ram. 
First determine the best split and then perform it + */ + + struct SplitResult splitResult = this->GetBestSplit(); + + OrderedBoxes split0; + OrderedBoxes split1; + + struct SplitBuffersRam splitBuffers; + + splitBuffers.s0Dim0 = std::make_shared(); + splitBuffers.s0Dim1 = std::make_shared(); + splitBuffers.s1Dim0 = std::make_shared(); + splitBuffers.s1Dim1 = std::make_shared(); + + splitBuffers.s0SmallDim0 = std::make_shared(); + splitBuffers.s0SmallDim1 = std::make_shared(); + splitBuffers.s1SmallDim0 = std::make_shared(); + splitBuffers.s1SmallDim1 = std::make_shared(); + + std::pair boundingBoxes = PerformSplit(splitResult, splitBuffers, M, S); + + split0.CreateOrderedBoxesInRam(splitBuffers.s0Dim0, splitBuffers.s0Dim1, splitBuffers.s0SmallDim0, splitBuffers.s0SmallDim1, boundingBoxes.first); + split1.CreateOrderedBoxesInRam(splitBuffers.s1Dim0, splitBuffers.s1Dim1, splitBuffers.s1SmallDim0, splitBuffers.s1SmallDim1, boundingBoxes.second); + + (*this->rectanglesD0InRam).clear(); + (*this->rectanglesD1InRam).clear(); + (*this->rectanglesD0Small).clear(); + (*this->rectanglesD1Small).clear(); + (*this->rectanglesD0InRam).shrink_to_fit(); + (*this->rectanglesD1InRam).shrink_to_fit(); + (*this->rectanglesD0Small).shrink_to_fit(); + (*this->rectanglesD1Small).shrink_to_fit(); + + return std::make_pair(split0, split1); +} + +std::pair OrderedBoxes::SplitAtBestOnDisk(const std::string& filePath, size_t S, size_t M, long long maxBuildingRamUsage) { + /** + * Split the ordered boxes on disk. 
First determine the best split and then perform it + */ + + OrderedBoxes split0; + OrderedBoxes split1; + + struct SplitResult splitResult = this->GetBestSplit(); + + struct SplitBuffersDisk splitBuffers; + struct SplitBuffersRam splitBuffersRam; + + // perfrom the split + long long sizeLeft = std::ceil((splitResult.bestIndex - 2) / 2.0) * S; + long long sizeRight = this->size - sizeLeft; + long long split0ByteSize = sizeLeft * (4 * sizeof(double) + sizeof(uint64_t) + 2 * sizeof(long long)); + long long split1ByteSize = sizeRight * (4 * sizeof(double) + sizeof(uint64_t) + 2 * sizeof(long long)); + bool split0InRam = split0ByteSize * 4 < maxBuildingRamUsage; + bool split1InRam = split1ByteSize * 4 < maxBuildingRamUsage; + + splitBuffersRam.s0SmallDim0 = std::make_shared(); + splitBuffersRam.s0SmallDim1 = std::make_shared(); + splitBuffersRam.s1SmallDim0 = std::make_shared(); + splitBuffersRam.s1SmallDim1 = std::make_shared(); + + if (!split0InRam) { + splitBuffers.split0Dim0File = { std::ofstream(filePath + ".0.dim0.tmp", std::ios::binary) }; + splitBuffers.split0Dim1File = { std::ofstream(filePath + ".0.dim1.tmp", std::ios::binary) }; + } else { + splitBuffersRam.s0Dim0 = std::make_shared(); + splitBuffersRam.s0Dim1 = std::make_shared(); + } + + if (!split1InRam) { + splitBuffers.split1Dim0File = { std::ofstream(filePath + ".1.dim0.tmp", std::ios::binary) }; + splitBuffers.split1Dim1File = { std::ofstream(filePath + ".1.dim1.tmp", std::ios::binary) }; + } else { + splitBuffersRam.s1Dim0 = std::make_shared(); + splitBuffersRam.s1Dim1 = std::make_shared(); + } + + splitBuffers.splitBuffersRam = splitBuffersRam; + + std::pair boundingBoxes = PerformSplit(splitResult, splitBuffers, M, S, maxBuildingRamUsage); + + if (!split0InRam) { + splitBuffers.split0Dim0File.value().close(); + splitBuffers.split0Dim1File.value().close(); + + split0.CreateOrderedBoxesOnDisk(filePath + ".0.dim0", filePath + ".0.dim1", splitBuffers.splitBuffersRam.s0SmallDim0, 
splitBuffers.splitBuffersRam.s0SmallDim1, sizeLeft, boundingBoxes.first); + } else { + split0.CreateOrderedBoxesInRam(splitBuffers.splitBuffersRam.s0Dim0, splitBuffers.splitBuffersRam.s0Dim1, splitBuffers.splitBuffersRam.s0SmallDim0, splitBuffers.splitBuffersRam.s0SmallDim1, boundingBoxes.first); + } + + if (!split1InRam) { + splitBuffers.split1Dim0File.value().close(); + splitBuffers.split1Dim1File.value().close(); + + split1.CreateOrderedBoxesOnDisk(filePath + ".1.dim0", filePath + ".1.dim1", splitBuffers.splitBuffersRam.s1SmallDim0, splitBuffers.splitBuffersRam.s1SmallDim1, sizeRight, boundingBoxes.second); + } else { + split1.CreateOrderedBoxesInRam(splitBuffers.splitBuffersRam.s1Dim0, splitBuffers.splitBuffersRam.s1Dim1, splitBuffers.splitBuffersRam.s1SmallDim0, splitBuffers.splitBuffersRam.s1SmallDim1, boundingBoxes.second); + } + + std::remove(this->rectanglesD0OnDisk.c_str()); + std::remove(this->rectanglesD1OnDisk.c_str()); + + return std::make_pair(split0, split1); +} + +std::pair OrderedBoxes::PerformSplit(SplitResult splitResult, SplitBuffersRam& splitBuffersRam, size_t M, size_t S) { + /** + * Perform the best split on the current ordered boxes in the ram case + */ + + struct SplitBuffersDisk splitBuffersDisk; + + splitBuffersDisk.splitBuffersRam = splitBuffersRam; + splitBuffersDisk.split0Dim0File = {}; + splitBuffersDisk.split0Dim1File = {}; + splitBuffersDisk.split1Dim0File = {}; + splitBuffersDisk.split1Dim1File = {}; + + // reuse the PerfromSplit of the Disk case. 
+ std::pair boundingBoxes = PerformSplit(splitResult, splitBuffersDisk, M, S, 0); + + splitBuffersRam = splitBuffersDisk.splitBuffersRam; + + return boundingBoxes; +} + +std::pair OrderedBoxes::PerformSplit(SplitResult splitResult, SplitBuffersDisk& splitBuffers, size_t M, size_t S, long long maxBuildingRamUsage) { + /** + * Perform the best split on the current ordered boxes in the disk case + */ + + long long sizeLeft = std::ceil((splitResult.bestIndex - 2) / 2.0) * S; + long long sizeRight = this->size - sizeLeft; + size_t SSplit0 = sizeLeft <= S ? std::ceil(sizeLeft / (double) M) : S; + size_t SSplit1 = sizeRight <= S ? std::ceil(sizeRight / (double) M) : S; + long long split0ByteSize = sizeLeft * (4 * sizeof(double) + sizeof(uint64_t) + 2 * sizeof(long long)); + long long split1ByteSize = sizeRight * (4 * sizeof(double) + sizeof(uint64_t) + 2 * sizeof(long long)); + bool split0InRam = maxBuildingRamUsage == 0 || split0ByteSize * 4 < maxBuildingRamUsage; + bool split1InRam = maxBuildingRamUsage == 0 || split1ByteSize * 4 < maxBuildingRamUsage; + + double globalMinXS0 = -1; + double globalMinYS0 = -1; + double globalMaxXS0 = -1; + double globalMaxYS0 = -1; + + double globalMinXS1 = -1; + double globalMinYS1 = -1; + double globalMaxXS1 = -1; + double globalMaxYS1 = -1; + + rTreeValueWithOrderIndex minSplit0OtherDim; + rTreeValueWithOrderIndex maxSplit0OtherDim; + rTreeValueWithOrderIndex minSplit1OtherDim; + rTreeValueWithOrderIndex maxSplit1OtherDim; + + if (splitResult.bestDim == 0) { + splitBuffers.splitBuffersRam.s0SmallDim0->push_back(splitResult.bestMinElement); + splitBuffers.splitBuffersRam.s0SmallDim0->push_back(splitResult.bestLastElement); + splitBuffers.splitBuffersRam.s1SmallDim0->push_back(splitResult.bestElement); + splitBuffers.splitBuffersRam.s1SmallDim0->push_back(splitResult.bestMaxElement); + + // placeholder, since we need the min and max element of the split in the first two spots + 
splitBuffers.splitBuffersRam.s0SmallDim1->push_back(rTreeValueWithOrderIndex()); + splitBuffers.splitBuffersRam.s0SmallDim1->push_back(rTreeValueWithOrderIndex()); + splitBuffers.splitBuffersRam.s1SmallDim1->push_back(rTreeValueWithOrderIndex()); + splitBuffers.splitBuffersRam.s1SmallDim1->push_back(rTreeValueWithOrderIndex()); + } else { + splitBuffers.splitBuffersRam.s0SmallDim1->push_back(splitResult.bestMinElement); + splitBuffers.splitBuffersRam.s0SmallDim1->push_back(splitResult.bestLastElement); + splitBuffers.splitBuffersRam.s1SmallDim1->push_back(splitResult.bestElement); + splitBuffers.splitBuffersRam.s1SmallDim1->push_back(splitResult.bestMaxElement); + + // placeholder + splitBuffers.splitBuffersRam.s0SmallDim0->push_back(rTreeValueWithOrderIndex()); + splitBuffers.splitBuffersRam.s0SmallDim0->push_back(rTreeValueWithOrderIndex()); + splitBuffers.splitBuffersRam.s1SmallDim0->push_back(rTreeValueWithOrderIndex()); + splitBuffers.splitBuffersRam.s1SmallDim0->push_back(rTreeValueWithOrderIndex()); + } + + std::optional elementOpt; + std::optional fileReaderDim0; + std::optional fileReaderDim1; + if (!this->workInRam) { + fileReaderDim0 = { FileReader(this->rectanglesD0OnDisk) }; + fileReaderDim1 = { FileReader(this->rectanglesD1OnDisk) }; + } + long long currentXSplit0 = 0; + long long currentXSplit1 = 0; + long long currentYSplit0 = 0; + long long currentYSplit1 = 0; + for (size_t dim = 0; dim < 2; dim++) { + // start performing the actual split + long long i = 0; + + if (!this->workInRam) { + if (dim == 0) + elementOpt = fileReaderDim0.value().GetNextElement(); + if (dim == 1) + elementOpt = fileReaderDim1.value().GetNextElement(); + } + + while ((this->workInRam && i < this->size) || (!this->workInRam && elementOpt)) { + rTreeValueWithOrderIndex element; + + // get the current element, either from disk or from ram + if (this->workInRam) { + element = dim == 0 ? 
(*this->rectanglesD0InRam)[i] : (*this->rectanglesD1InRam)[i]; + } else { + element = elementOpt.value(); + } + + if ((splitResult.bestDim == 0 && element.orderX < splitResult.bestElement.orderX) + || (splitResult.bestDim == 1 && element.orderY < splitResult.bestElement.orderY)) { + // the element belongs to split 0 + + if (dim == 0) { + // add the element to the split 0 dimension 0 vector / file + if (split0InRam || this->workInRam) { + splitBuffers.splitBuffersRam.s0Dim0->push_back(element); + } else { + Rtree::SaveEntryWithOrderIndex(element, splitBuffers.split0Dim0File.value()); + } + + // check if the element is at the position i * S (described in the algorithm) or one before it. + // In this case it is a future possible split position and needs to be saved to the "small list" + if (((currentXSplit0 + 1) % SSplit0 == 0 && (currentXSplit0 + 1) / SSplit0 >= 1 && (currentXSplit0 + 1) / SSplit0 < M) + || (currentXSplit0 % SSplit0 == 0 && currentXSplit0 / SSplit0 >= 1 && currentXSplit0 / SSplit0 < M)) { + // index i * S - 1 or i * S + splitBuffers.splitBuffersRam.s0SmallDim0->push_back(element); + } + + // keep track of the min and max values to construct the bounding box of the split later + if (globalMinXS0 == -1 || element.box.min_corner().get<0>() < globalMinXS0) { + globalMinXS0 = element.box.min_corner().get<0>(); + } + if (globalMinYS0 == -1 || element.box.min_corner().get<1>() < globalMinYS0) { + globalMinYS0 = element.box.min_corner().get<1>(); + } + if (element.box.max_corner().get<0>() > globalMaxXS0) { + globalMaxXS0 = element.box.max_corner().get<0>(); + } + if (element.box.max_corner().get<1>() > globalMaxYS0) { + globalMaxYS0 = element.box.max_corner().get<1>(); + } + + // keep track of the min and max element of the split, to later replace the placeholder in the "small lists" + if (splitResult.bestDim == 1) { + if (currentXSplit0 == 0) { + minSplit0OtherDim = element; + maxSplit0OtherDim = element; + } + if (element.orderX > 
maxSplit0OtherDim.orderX) { + maxSplit0OtherDim = element; + } + } + + currentXSplit0++; + } else { + if (split0InRam || this->workInRam) { + splitBuffers.splitBuffersRam.s0Dim1->push_back(element); + } else { + Rtree::SaveEntryWithOrderIndex(element, splitBuffers.split0Dim1File.value()); + } + + if (((currentYSplit0 + 1) % SSplit0 == 0 && (currentYSplit0 + 1) / SSplit0 >= 1 && (currentYSplit0 + 1) / SSplit0 < M) + || (currentYSplit0 % SSplit0 == 0 && currentYSplit0 / SSplit0 >= 1 && currentYSplit0 / SSplit0 < M)) { + // index i * S - 1 or i * S + splitBuffers.splitBuffersRam.s0SmallDim1->push_back(element); + } + + if (splitResult.bestDim == 0) { + if (currentYSplit0 == 0) { + minSplit0OtherDim = element; + maxSplit0OtherDim = element; + } + if (element.orderX > maxSplit0OtherDim.orderX) { + maxSplit0OtherDim = element; + } + } + + currentYSplit0++; + } + } else { + // the element belongs to split 1 + + if (dim == 0) { + if (split1InRam || this->workInRam) { + splitBuffers.splitBuffersRam.s1Dim0->push_back(element); + } else { + Rtree::SaveEntryWithOrderIndex(element, splitBuffers.split1Dim0File.value()); + } + if (((currentXSplit1 + 1) % SSplit1 == 0 && (currentXSplit1 + 1) / SSplit1 >= 1 && (currentXSplit1 + 1) / SSplit1 < M) + || (currentXSplit1 % SSplit1 == 0 && currentXSplit1 / SSplit1 >= 1 && currentXSplit1 / SSplit1 < M)) { + // index i * S - 1 or i * S + splitBuffers.splitBuffersRam.s1SmallDim0->push_back(element); + } + + if (globalMinXS1 == -1 || element.box.min_corner().get<0>() < globalMinXS1) { + globalMinXS1 = element.box.min_corner().get<0>(); + } + if (globalMinYS1 == -1 || element.box.min_corner().get<1>() < globalMinYS1) { + globalMinYS1 = element.box.min_corner().get<1>(); + } + if (element.box.max_corner().get<0>() > globalMaxXS1) { + globalMaxXS1 = element.box.max_corner().get<0>(); + } + if (element.box.max_corner().get<1>() > globalMaxYS1) { + globalMaxYS1 = element.box.max_corner().get<1>(); + } + + if (splitResult.bestDim == 1) { + if 
(currentXSplit1 == 0) { + minSplit1OtherDim = element; + maxSplit1OtherDim = element; + } + if (element.orderX > maxSplit1OtherDim.orderX) { + maxSplit1OtherDim = element; + } + } + + currentXSplit1++; + } else { + if (split1InRam || this->workInRam) { + splitBuffers.splitBuffersRam.s1Dim1->push_back(element); + } else { + Rtree::SaveEntryWithOrderIndex(element, splitBuffers.split1Dim1File.value()); + } + if (((currentYSplit1 + 1) % SSplit1 == 0 && (currentYSplit1 + 1) / SSplit1 >= 1 && (currentYSplit1 + 1) / SSplit1 < M) + || (currentYSplit1 % SSplit1 == 0 && currentYSplit1 / SSplit1 >= 1 && currentYSplit1 / SSplit1 < M)) { + // index i * S - 1 or i * S + splitBuffers.splitBuffersRam.s1SmallDim1->push_back(element); + } + + if (splitResult.bestDim == 0) { + if (currentYSplit1 == 0) { + minSplit1OtherDim = element; + maxSplit1OtherDim = element; + } + if (element.orderX > maxSplit1OtherDim.orderX) { + maxSplit1OtherDim = element; + } + } + + currentYSplit1++; + } + } + i++; + + if (!this->workInRam) { + if (dim == 0) + elementOpt = fileReaderDim0.value().GetNextElement(); + if (dim == 1) + elementOpt = fileReaderDim1.value().GetNextElement(); + } + } + } + + if (!this->workInRam) { + fileReaderDim0.value().Close(); + fileReaderDim1.value().Close(); + } + + // replace the placeholder + if (splitResult.bestDim == 0) { + (*splitBuffers.splitBuffersRam.s0SmallDim1)[0] = minSplit0OtherDim; + (*splitBuffers.splitBuffersRam.s0SmallDim1)[1] = maxSplit0OtherDim; + (*splitBuffers.splitBuffersRam.s1SmallDim1)[0] = minSplit1OtherDim; + (*splitBuffers.splitBuffersRam.s1SmallDim1)[1] = maxSplit1OtherDim; + } else { + (*splitBuffers.splitBuffersRam.s0SmallDim0)[0] = minSplit0OtherDim; + (*splitBuffers.splitBuffersRam.s0SmallDim0)[1] = maxSplit0OtherDim; + (*splitBuffers.splitBuffersRam.s1SmallDim0)[0] = minSplit1OtherDim; + (*splitBuffers.splitBuffersRam.s1SmallDim0)[1] = maxSplit1OtherDim; + } + + boxGeo boxSplit0 = Rtree::createBoundingBox(globalMinXS0, globalMinYS0, 
globalMaxXS0, globalMaxYS0); + boxGeo boxSplit1 = Rtree::createBoundingBox(globalMinXS1, globalMinYS1, globalMaxXS1, globalMaxYS1); + + return std::make_pair(boxSplit0, boxSplit1); +} + +FileReader::FileReader(const std::string& filePath) { + this->filePath = filePath; + + this->file = std::ifstream(this->filePath, std::ios::binary); + this->file.seekg (0, std::ifstream::end); + this->fileLength = this->file.tellg(); + this->file.seekg (0, std::ifstream::beg); +} + +std::optional FileReader::GetNextElement() { + if (this->file.tellg() < this->fileLength) { + double minX; + double minY; + double maxX; + double maxY; + uint64_t id; + long long orderX; + long long orderY; + + this->file.read(reinterpret_cast(&minX), sizeof(double)); + this->file.read(reinterpret_cast(&minY), sizeof(double)); + this->file.read(reinterpret_cast(&maxX), sizeof(double)); + this->file.read(reinterpret_cast(&maxY), sizeof(double)); + this->file.read(reinterpret_cast(&id), sizeof(uint64_t)); + this->file.read(reinterpret_cast(&orderX), sizeof(long long)); + this->file.read(reinterpret_cast(&orderY), sizeof(long long)); + + boxGeo box = Rtree::createBoundingBox(minX, minY, maxX, maxY); + rTreeValueWithOrderIndex element = rTreeValueWithOrderIndex(box, id, orderX, orderY); + + return { element }; + } else { + return {}; + } +} + +void FileReader::Close() { + this->file.close(); +} + +FileReaderWithoutIndex::FileReaderWithoutIndex(const std::string& filePath) { + this->filePath = filePath; + + this->file = std::ifstream(this->filePath, std::ios::binary); + this->file.seekg (0, std::ifstream::end); + this->fileLength = this->file.tellg(); + this->file.seekg (0, std::ifstream::beg); +} + +std::optional FileReaderWithoutIndex::GetNextElement() { + if (this->file.tellg() < this->fileLength) { + double minX; + double minY; + double maxX; + double maxY; + uint64_t id; + + this->file.read(reinterpret_cast(&minX), sizeof(double)); + this->file.read(reinterpret_cast(&minY), sizeof(double)); + 
this->file.read(reinterpret_cast(&maxX), sizeof(double)); + this->file.read(reinterpret_cast(&maxY), sizeof(double)); + this->file.read(reinterpret_cast(&id), sizeof(uint64_t)); + + boxGeo box = Rtree::createBoundingBox(minX, minY, maxX, maxY); + rTreeValue boxWithId = rTreeValue(box, id); + + return { boxWithId }; + } else { + return {}; + } +} + +void FileReaderWithoutIndex::Close() { + this->file.close(); +} diff --git a/src/util/Rtree.h b/src/util/Rtree.h new file mode 100644 index 0000000000..2b3a26cbf0 --- /dev/null +++ b/src/util/Rtree.h @@ -0,0 +1,278 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Noah Nock + +#ifndef QLEVER_RTREE_H +#define QLEVER_RTREE_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace bg = boost::geometry; + +using pointGeo = bg::model::point>; +using boxGeo = bg::model::box; + +using bg::make; + +struct rTreeValue { + boxGeo box{}; + long long id; + rTreeValue(boxGeo box_, long long id_) { + box = box_; + id = id_; + } + + rTreeValue() {} + + template + void serialize(Archive & a, [[maybe_unused]]const unsigned int version) { + a & box; + a & id; + } +}; +using multiBoxGeo = std::vector; + +struct rTreeValueWithOrderIndex { + boxGeo box{}; + long long id; + long long orderX; + long long orderY; + rTreeValueWithOrderIndex(boxGeo box_, long long id_, long long orderX_, long long orderY_) { + box = box_; + id = id_; + orderX = orderX_; + orderY = orderY_; + } + + rTreeValueWithOrderIndex() {}; +}; +using multiBoxWithOrderIndex = std::vector; + +struct SplitResult { + double bestCost = -1; + size_t bestDim = 0; + long long bestIndex = 0; + rTreeValueWithOrderIndex bestLastElement; + rTreeValueWithOrderIndex bestElement; + rTreeValueWithOrderIndex bestMinElement; + rTreeValueWithOrderIndex bestMaxElement; +}; + +struct SplitBuffersRam { + std::shared_ptr s0Dim0 = std::make_shared(); + 
std::shared_ptr s0Dim1 = std::make_shared(); + std::shared_ptr s1Dim0 = std::make_shared(); + std::shared_ptr s1Dim1 = std::make_shared(); + + std::shared_ptr s0SmallDim0 = std::make_shared(); + std::shared_ptr s0SmallDim1 = std::make_shared(); + std::shared_ptr s1SmallDim0 = std::make_shared(); + std::shared_ptr s1SmallDim1 = std::make_shared(); +}; + +struct SplitBuffersDisk { + SplitBuffersRam splitBuffersRam; + std::optional split0Dim0File; + std::optional split0Dim1File; + std::optional split1Dim0File; + std::optional split1Dim1File; +}; + +class Node { + protected: + friend class boost::serialization::access; + long long id; + boxGeo boundingBox{}; + bool isLastInnerNode = false; + multiBoxGeo children; + + template + void serialize(Archive & a, [[maybe_unused]]const unsigned int version) { + a & id; + a & isLastInnerNode; + a & boundingBox; + a & children; + } + + explicit Node(long long id); + + public: + Node(); + Node(long long id, boxGeo boundingBox); + Node(long long id, boxGeo boundingBox, multiBoxGeo &children, bool isLastInnerNode); + Node(long long id, double minX, double minY, double maxX, double maxY, bool isLastInnerNode); + [[nodiscard]] long long GetId() const; + [[nodiscard]] boxGeo GetBoundingBox() const; + void AddChild(Node& child); + void SetIsLastInnerNode(bool isLastInnerNode); + [[nodiscard]] bool GetIsLastInnerNode() const; + multiBoxGeo GetChildren(); +}; + +BOOST_CLASS_VERSION(Node, 1) + +class Rtree { + private: + static long long SaveNode(Node &node, bool isLastInnerNode, std::ofstream& nodesOfs); + static Node LoadNode(long long id, std::ifstream& lookupIfs, std::ifstream& nodesIfs); + uintmax_t maxBuildingRamUsage; + public: + void BuildTree(const std::string& onDiskBase, size_t M, const std::string& folder) const; + static multiBoxGeo SearchTree(boxGeo query, const std::string& folder); + static std::optional ConvertWordToRtreeEntry(const std::string& wkt); + static void SaveEntry(boxGeo boundingBox, uint64_t index, 
std::ofstream& convertOfs); + static void SaveEntryWithOrderIndex(rTreeValueWithOrderIndex treeValue, std::ofstream& convertOfs); + static multiBoxGeo LoadEntries(const std::string& file); + static multiBoxWithOrderIndex LoadEntriesWithOrderIndex(const std::string& file); + static boxGeo createBoundingBox(double pointOneX, double pointOneY, double pointTwoX, double pointTwoY); + explicit Rtree(uintmax_t maxBuildingRamUsage); +}; + +class OrderedBoxes { + private: + bool workInRam; + long long size; + boxGeo boundingBox; + std::shared_ptr rectanglesD0InRam; + std::shared_ptr rectanglesD1InRam; + std::string rectanglesD0OnDisk; + std::string rectanglesD1OnDisk; + std::shared_ptr rectanglesD0Small; + std::shared_ptr rectanglesD1Small; + std::pair SplitAtBestInRam(size_t S, size_t M); + std::pair SplitAtBestOnDisk(const std::string& filePath, size_t S, size_t M, long long maxBuildingRamUsage); + SplitResult GetBestSplit(); + std::pair PerformSplit(SplitResult splitResult, SplitBuffersRam& splitBuffersRam, size_t M, size_t S); + std::pair PerformSplit(SplitResult splitResult, SplitBuffersDisk& splitBuffers, size_t M, size_t S, long long maxBuildingRamUsage); + public: + [[nodiscard]] bool WorkInRam() const; + void CreateOrderedBoxesInRam(const std::shared_ptr& rectanglesD0, const std::shared_ptr& rectanglesD1, const std::shared_ptr& rectanglesSmallD0, const std::shared_ptr& rectanglesSmallD1, boxGeo box); // workInRam = true + void CreateOrderedBoxesOnDisk(const std::string& rectanglesD0, const std::string& rectanglesD1, const std::shared_ptr& rectanglesSmallD0, const std::shared_ptr& rectanglesSmallD1, long long size, boxGeo box); // workInRam = false + boxGeo GetBoundingBox(); + [[nodiscard]] long long GetSize() const; + std::pair SplitAtBest(const std::string& filePath, size_t S, size_t M, long long maxBuildingRamUsage); + std::shared_ptr GetRectanglesInRam(); + std::string GetRectanglesOnDisk(); +}; + +class ConstructionNode: public Node { + private: + OrderedBoxes 
orderedBoxes; + + public: + ConstructionNode(long long id, OrderedBoxes orderedBoxes); + OrderedBoxes GetOrderedBoxes(); + void AddChildrenToItem(); +}; + +class FileReader { + private: + std::string filePath; + std::ifstream file; + long long fileLength; + public: + explicit FileReader(const std::string& filePath); + std::optional GetNextElement(); + void Close(); +}; + +class FileReaderWithoutIndex { + private: + std::string filePath; + std::ifstream file; + long long fileLength; + public: + explicit FileReaderWithoutIndex(const std::string& filePath); + std::optional GetNextElement(); + void Close(); +}; + +namespace boost::serialization { +template +void save(Archive & a, const boxGeo & b, [[maybe_unused]]unsigned int version) +{ + a << b.min_corner().get<0>(); + a << b.min_corner().get<1>(); + a << b.max_corner().get<0>(); + a << b.max_corner().get<1>(); +} +template +void load(Archive & a, boxGeo & b, [[maybe_unused]]unsigned int version) +{ + double minX = 0; + a >> minX; + double minY = 0; + a >> minY; + double maxX = 0; + a >> maxX; + double maxY = 0; + a >> maxY; + b = make(make(minX, minY), make(maxX, maxY)); +} +} +BOOST_SERIALIZATION_SPLIT_FREE(boxGeo); + +struct sortRuleLambdaX { + // comparison function + bool operator()(const rTreeValue& b1, const rTreeValue& b2) const { + double center1 = (b1.box.min_corner().get<0>() + b1.box.max_corner().get<0>()) / 2; + double center2 = (b2.box.min_corner().get<0>() + b2.box.max_corner().get<0>()) / 2; + return center1 < center2; + } + + // Value that is strictly smaller than any input element. + static rTreeValue min_value() { return {Rtree::createBoundingBox(DBL_MIN, DBL_MIN, DBL_MIN, DBL_MIN), 0}; } + + // Value that is strictly larger than any input element. 
+ static rTreeValue max_value() { return {Rtree::createBoundingBox(DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX), 0}; } +}; + +struct sortRuleLambdaXWithIndex { + // comparison function + bool operator()(const rTreeValueWithOrderIndex& b1, const rTreeValueWithOrderIndex& b2) const { + double center1 = (b1.box.min_corner().get<0>() + b1.box.max_corner().get<0>()) / 2; + double center2 = (b2.box.min_corner().get<0>() + b2.box.max_corner().get<0>()) / 2; + + if (b1.orderX == b2.orderX) + return center1 < center2; + return b1.orderX < b2.orderX; + } + + // Value that is strictly smaller than any input element. + static rTreeValueWithOrderIndex min_value() { return {Rtree::createBoundingBox(DBL_MIN, DBL_MIN, DBL_MIN, DBL_MIN), 0, 0, 0}; } + + // Value that is strictly larger than any input element. + static rTreeValueWithOrderIndex max_value() { return {Rtree::createBoundingBox(DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX), 0, LLONG_MAX, LLONG_MAX}; } +}; + +struct sortRuleLambdaYWithIndex { + // comparison function + bool operator()(const rTreeValueWithOrderIndex& b1, const rTreeValueWithOrderIndex& b2) const { + double center1 = (b1.box.min_corner().get<1>() + b1.box.max_corner().get<1>()) / 2; + double center2 = (b2.box.min_corner().get<1>() + b2.box.max_corner().get<1>()) / 2; + + if (b1.orderY == b2.orderY) + return center1 < center2; + return b1.orderY < b2.orderY; + } + + // Value that is strictly smaller than any input element. + static rTreeValueWithOrderIndex min_value() { return {Rtree::createBoundingBox(DBL_MIN, DBL_MIN, DBL_MIN, DBL_MIN), 0, 0, 0}; } + + // Value that is strictly larger than any input element. 
+ static rTreeValueWithOrderIndex max_value() { return {Rtree::createBoundingBox(DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX), 0, LLONG_MAX, LLONG_MAX}; } +}; + +#endif //QLEVER_RTREE_H From 7e1a694aa63f77ecdac18ad8d86650ea7f3991d5 Mon Sep 17 00:00:00 2001 From: Noah Nock Date: Mon, 21 Aug 2023 02:42:41 +0200 Subject: [PATCH 12/20] Fixed some bugs and finished external sorting --- src/index/IndexImpl.cpp | 7 ------- src/util/Rtree.cpp | 12 +++--------- 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 7ad9e26085..b238ae47dd 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -239,13 +239,6 @@ template void IndexImpl::createFromFile( template IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( const string& filename, size_t linesPerPartial) { - try { - Rtree rtree = Rtree(1300000000); - rtree.BuildTree("vocab_raw", 16, "./rtree_build"); - LOG(INFO) << "Finished building the Rtree" << std::endl; - } catch (const std::exception &e) { - LOG(INFO) << e.what() << std::endl; - } LOG(INFO) << "Processing input triples from " << filename << " ..." << std::endl; auto parser = std::make_shared(filename); diff --git a/src/util/Rtree.cpp b/src/util/Rtree.cpp index 2f7f078270..ff00e409bd 100644 --- a/src/util/Rtree.cpp +++ b/src/util/Rtree.cpp @@ -40,7 +40,7 @@ static void centerOrdering(multiBoxWithOrderIndex& boxes, size_t dim) { OrderedBoxes SortInput(const std::string& onDiskBase, size_t M, uintmax_t maxBuildingRamUsage, bool workInRam) { OrderedBoxes orderedInputRectangles; - ad_utility::BackgroundStxxlSorter sorterRectsD0Basic = ad_utility::BackgroundStxxlSorter(std::ceil(maxBuildingRamUsage / 3.0)); + ad_utility::BackgroundStxxlSorter sorterRectsD0Basic = ad_utility::BackgroundStxxlSorter(std::ceil((maxBuildingRamUsage < 9999999999 ? 
maxBuildingRamUsage : 9999999999) / 3.0)); multiBoxGeo rectsD0Basic; if (workInRam) { @@ -62,7 +62,7 @@ OrderedBoxes SortInput(const std::string& onDiskBase, size_t M, uintmax_t maxBui double globalMaxX = -1; double globalMaxY = -1; - ad_utility::BackgroundStxxlSorter sorterRectsD1 = ad_utility::BackgroundStxxlSorter(std::ceil(maxBuildingRamUsage / 3.0)); + ad_utility::BackgroundStxxlSorter sorterRectsD1 = ad_utility::BackgroundStxxlSorter(std::ceil((maxBuildingRamUsage < 9999999999 ? maxBuildingRamUsage : 9999999999) / 3.0)); std::shared_ptr RectanglesD1WithOrder = std::make_shared(); if (workInRam) { @@ -111,7 +111,7 @@ OrderedBoxes SortInput(const std::string& onDiskBase, size_t M, uintmax_t maxBui long long ySize = 0; std::ofstream r1File = std::ofstream(onDiskBase + ".boundingbox.d1.tmp", std::ios::binary); - ad_utility::BackgroundStxxlSorter sorterRectsD0 = ad_utility::BackgroundStxxlSorter(std::ceil(maxBuildingRamUsage / 3.0)); + ad_utility::BackgroundStxxlSorter sorterRectsD0 = ad_utility::BackgroundStxxlSorter(std::ceil((maxBuildingRamUsage < 9999999999 ? 
maxBuildingRamUsage : 9999999999) / 3.0)); std::shared_ptr RectanglesD0WithOrder = std::make_shared(); std::shared_ptr r1Small = std::make_shared(); // placeholder @@ -320,13 +320,7 @@ void Rtree::BuildTree(const std::string& onDiskBase, size_t M, const std::string long long fileLines = std::ceil(std::filesystem::file_size(file) / (4 * sizeof(double) + sizeof(uint64_t) + 2 * sizeof(long long))); bool workInRam = (std::filesystem::file_size(file) + fileLines * 2 * sizeof(long long)) * 4 < this->maxBuildingRamUsage; - if (workInRam) { - std::cout << "Building in ram" << std::endl; - } else { - std::cout << "Building on disk" << std::endl; - } OrderedBoxes orderedInputRectangles = SortInput(onDiskBase, M, maxBuildingRamUsage, workInRam); - std::cout << "Finished intital sorting" << std::endl; // build the tree in a depth first approach std::stack layerStack; From 76590c2c3d5fe23b2416f9871c3cce614a28dccf Mon Sep 17 00:00:00 2001 From: Noah Nock Date: Mon, 21 Aug 2023 11:25:24 +0200 Subject: [PATCH 13/20] Modified CMakeLists --- src/engine/sparqlExpressions/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/engine/sparqlExpressions/CMakeLists.txt b/src/engine/sparqlExpressions/CMakeLists.txt index be12b69e58..ab98537e63 100644 --- a/src/engine/sparqlExpressions/CMakeLists.txt +++ b/src/engine/sparqlExpressions/CMakeLists.txt @@ -13,4 +13,4 @@ add_library(sparqlExpressions RelationalExpressions.cpp AggregateExpression.cpp RegexExpression.cpp ContainsExpression.cpp LangExpression.cpp) -qlever_target_link_libraries(sparqlExpressions index) +qlever_target_link_libraries(sparqlExpressions index util) From 3448cd6831908f91f85ad21e58968ddfbc89580c Mon Sep 17 00:00:00 2001 From: Noah Nock Date: Mon, 21 Aug 2023 14:54:13 +0200 Subject: [PATCH 14/20] reversed changes --- test/ConfigManagerTest.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/ConfigManagerTest.cpp b/test/ConfigManagerTest.cpp index 
d43ab85b3b..3ad400e58a 100644 --- a/test/ConfigManagerTest.cpp +++ b/test/ConfigManagerTest.cpp @@ -75,9 +75,9 @@ TEST(ConfigManagerTest, CreateConfigurationOptionExceptionTest) { configuration grammar. Ergo, you can't set values, with such paths per short hand, which we don't want. */ - /*ASSERT_THROW(config.addOption({"Shared part"s, "Sense_of_existence"s}, "", + ASSERT_THROW(config.addOption({"Shared part"s, "Sense_of_existence"s}, "", ¬Used, 42); - , ad_utility::NotValidShortHandNameException);*/ + , ad_utility::NotValidShortHandNameException); } TEST(ConfigManagerTest, ParseConfig) { @@ -89,7 +89,7 @@ TEST(ConfigManagerTest, ParseConfig) { int thirdInt; decltype(auto) optionZero = - config.addOption({"depth_0"s, "Option_0"s, "Option_0"s}, + config.addOption({"depth_0"s, "Option_0"s}, "Must be set. Has no default value.", &firstInt); decltype(auto) optionOne = config.addOption({"depth_0"s, "depth_1"s, "Option_1"s}, @@ -131,9 +131,9 @@ TEST(ConfigManagerTest, ParseConfigExceptionTest) { // Add one option with default and one without. int notUsedInt; std::vector notUsedVector; - config.addOption({"depth_0"s, "Without_default"s, ""}, + config.addOption({"depth_0"s, "Without_default"s}, "Must be set. Has no default value.", ¬UsedInt); - config.addOption({"depth_0"s, "With_default"s, ""}, + config.addOption({"depth_0"s, "With_default"s}, "Must not be set. 
Has default value.", ¬UsedVector, {40, 41}); From 3ff4a498469407face8ff094cc8d92d91eceffbf Mon Sep 17 00:00:00 2001 From: Noah Nock Date: Mon, 21 Aug 2023 15:00:30 +0200 Subject: [PATCH 15/20] Fixed contains expression --- src/engine/sparqlExpressions/ContainsExpression.cpp | 10 +++++----- src/engine/sparqlExpressions/ContainsExpression.h | 2 +- src/engine/sparqlExpressions/SparqlExpression.h | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/engine/sparqlExpressions/ContainsExpression.cpp b/src/engine/sparqlExpressions/ContainsExpression.cpp index ca3df72473..a6984b69de 100644 --- a/src/engine/sparqlExpressions/ContainsExpression.cpp +++ b/src/engine/sparqlExpressions/ContainsExpression.cpp @@ -11,7 +11,7 @@ using namespace std::literals; -/*class BoundingBoxType { +class BoundingBoxType { public: boxGeo value{}; @@ -109,7 +109,7 @@ namespace sparqlExpression { } // ___________________________________________________________________________ - std::span ContainsExpression::children() { + std::span ContainsExpression::childrenImpl() { return {&child_, 1}; } @@ -133,14 +133,14 @@ namespace sparqlExpression { } auto resultSize = context->size(); - VectorWithMemoryLimit result{context->_allocator}; + VectorWithMemoryLimit result{context->_allocator}; result.reserve(resultSize); for (auto id : detail::makeGenerator(*variablePtr, resultSize, context)) { - result.push_back(resultIds.contains(id.getInt())); + result.push_back(Id::makeFromBool(resultIds.contains(id.getInt()))); } return result; } -} // namespace sparqlExpression*/ +} // namespace sparqlExpression diff --git a/src/engine/sparqlExpressions/ContainsExpression.h b/src/engine/sparqlExpressions/ContainsExpression.h index 6d70a146b0..a735cbe0e2 100644 --- a/src/engine/sparqlExpressions/ContainsExpression.h +++ b/src/engine/sparqlExpressions/ContainsExpression.h @@ -24,7 +24,7 @@ namespace sparqlExpression { ExpressionResult evaluate(EvaluationContext *context) const override; - std::span 
children() override; + std::span childrenImpl() override; // _________________________________________________________________________ [[nodiscard]] string getCacheKey( diff --git a/src/engine/sparqlExpressions/SparqlExpression.h b/src/engine/sparqlExpressions/SparqlExpression.h index 5ab35e7b76..743f749ea3 100644 --- a/src/engine/sparqlExpressions/SparqlExpression.h +++ b/src/engine/sparqlExpressions/SparqlExpression.h @@ -159,7 +159,7 @@ class SparqlExpression { } // Get the direct child expressions. - virtual std::span children() { + virtual std::span children() final { return childrenImpl(); } virtual std::span children() const final { From 2b5e3260ce48fe94f9a7fcafda00f7ca55cae320 Mon Sep 17 00:00:00 2001 From: Noah Nock Date: Sun, 27 Aug 2023 23:29:14 +0200 Subject: [PATCH 16/20] Resolved issues from comments --- CMakeLists.txt | 3 +- .../sparqlExpressions/ContainsExpression.cpp | 6 +- src/index/IndexImpl.cpp | 4 +- .../sparqlParser/SparqlQleverVisitor.cpp | 4 +- src/util/CMakeLists.txt | 3 +- src/util/Rtree.cpp | 1226 +---------------- src/util/Rtree.h | 273 ++-- src/util/RtreeBuild.cpp | 1133 +++++++++++++++ src/util/RtreeFileReader.cpp | 82 ++ src/util/RtreeFileReader.h | 41 + 10 files changed, 1426 insertions(+), 1349 deletions(-) create mode 100644 src/util/RtreeBuild.cpp create mode 100644 src/util/RtreeFileReader.cpp create mode 100644 src/util/RtreeFileReader.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 87852e4d00..fc3aba248c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -124,7 +124,7 @@ endif() ###################################### # BOOST ###################################### -find_package(Boost 1.76 COMPONENTS iostreams serialization program_options REQUIRED) +find_package(Boost 1.74 COMPONENTS iostreams serialization program_options REQUIRED) include_directories(${Boost_INCLUDE_DIR}) @@ -231,7 +231,6 @@ set(USE_OPENMP OFF CACHE BOOL "Don't use OPENMP as default" FORCE) add_subdirectory(third_party/stxxl EXCLUDE_FROM_ALL) # apply 
STXXL CXXFLAGS - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${STXXL_CXX_FLAGS}") include_directories(SYSTEM ${STXXL_INCLUDE_DIRS}) diff --git a/src/engine/sparqlExpressions/ContainsExpression.cpp b/src/engine/sparqlExpressions/ContainsExpression.cpp index a6984b69de..25a74c339f 100644 --- a/src/engine/sparqlExpressions/ContainsExpression.cpp +++ b/src/engine/sparqlExpressions/ContainsExpression.cpp @@ -13,9 +13,9 @@ using namespace std::literals; class BoundingBoxType { public: - boxGeo value{}; + Rtree::BoundingBox value{}; - explicit BoundingBoxType(boxGeo boundingBox) { + explicit BoundingBoxType(Rtree::BoundingBox boundingBox) { this->value = boundingBox; } }; @@ -126,7 +126,7 @@ namespace sparqlExpression { Rtree rtree = context->_qec.getIndex().getRtree(); multiBoxGeo treeResults = rtree.SearchTree(boundingBox_->value, "./rtree_build"); - for (rTreeValue item : treeResults) { + for (RTreeValue item : treeResults) { long long id = item.id; resultIds.insert(id); } diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 86b6964165..e1d92b020a 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -393,13 +393,13 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( auto internalVocabularyAction = [&wordWriter, &convertOfs](const auto& word, const auto& index) { wordWriter.push(word.data(), word.size()); - std::optional boundingBox = Rtree::ConvertWordToRtreeEntry(word); + std::optional boundingBox = Rtree::ConvertWordToRtreeEntry(word); if (boundingBox) { Rtree::SaveEntry(boundingBox.value(), index, convertOfs); } }; auto externalVocabularyAction = [&convertOfs](const auto& word, const auto& index) { - std::optional boundingBox = Rtree::ConvertWordToRtreeEntry(word); + std::optional boundingBox = Rtree::ConvertWordToRtreeEntry(word); if (boundingBox) { Rtree::SaveEntry(boundingBox.value(), index, convertOfs); } diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.cpp b/src/parser/sparqlParser/SparqlQleverVisitor.cpp 
index 2f5f90fab3..31ca15cbe1 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.cpp +++ b/src/parser/sparqlParser/SparqlQleverVisitor.cpp @@ -97,10 +97,10 @@ ExpressionPtr Visitor::processIriFunctionCall( iriView.remove_prefix(geoPrefixRtree.size()); AD_CONTRACT_CHECK(iriView.ends_with('>')); iriView.remove_suffix(1); - /*if (iriView == "boundingBoxContains") { + if (iriView == "boundingBoxContains") { checkNumArgs("geoRtree:", iriView, 2); return std::make_unique(std::move(argList[0]), std::move(argList[1])); - }*/ + } } reportNotSupported(ctx, "Function \"" + iri + "\" is"); diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index 21df036ea2..e617e04866 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -2,5 +2,6 @@ add_subdirectory(ConfigManager) add_subdirectory(MemorySize) add_subdirectory(http) add_library(util GeoSparqlHelpers.cpp - antlr/ANTLRErrorHandling.cpp ParseException.cpp Conversions.cpp Date.cpp antlr/GenerateAntlrExceptionMetadata.cpp Rtree.h Rtree.cpp) + antlr/ANTLRErrorHandling.cpp ParseException.cpp Conversions.cpp Date.cpp antlr/GenerateAntlrExceptionMetadata.cpp + Rtree.h Rtree.cpp RtreeFileReader.h RtreeFileReader.cpp RtreeBuild.cpp) qlever_target_link_libraries(util Boost::serialization) diff --git a/src/util/Rtree.cpp b/src/util/Rtree.cpp index ff00e409bd..bea47bbfc5 100644 --- a/src/util/Rtree.cpp +++ b/src/util/Rtree.cpp @@ -3,250 +3,10 @@ // Author: Noah Nock #include -#include +#include +#include -static void centerOrdering(multiBoxGeo& boxes, size_t dim) { - if (dim == 0) { - // order by centerX - sortRuleLambdaX comp; - - std::sort(boxes.begin(), boxes.end(), comp); - } else { - // order by centerY - auto sortRuleLambda = [](rTreeValue b1, rTreeValue b2) -> bool { - double center1 = (b1.box.min_corner().get<1>() + b1.box.max_corner().get<1>()) / 2; - double center2 = (b2.box.min_corner().get<1>() + b2.box.max_corner().get<1>()) / 2; - return center1 < center2; - }; - - std::sort(boxes.begin(), 
boxes.end(), sortRuleLambda); - } -} - -static void centerOrdering(multiBoxWithOrderIndex& boxes, size_t dim) { - if (dim == 0) { - // order by centerX - sortRuleLambdaXWithIndex comp; - - std::sort(boxes.begin(), boxes.end(), comp); - } else { - // order by centerY - sortRuleLambdaYWithIndex comp; - - std::sort(boxes.begin(), boxes.end(), comp); - } -} - -OrderedBoxes SortInput(const std::string& onDiskBase, size_t M, uintmax_t maxBuildingRamUsage, bool workInRam) { - OrderedBoxes orderedInputRectangles; - - ad_utility::BackgroundStxxlSorter sorterRectsD0Basic = ad_utility::BackgroundStxxlSorter(std::ceil((maxBuildingRamUsage < 9999999999 ? maxBuildingRamUsage : 9999999999) / 3.0)); - multiBoxGeo rectsD0Basic; - - if (workInRam) { - rectsD0Basic = Rtree::LoadEntries(onDiskBase + ".boundingbox.tmp"); - centerOrdering(rectsD0Basic, 0); - } else { - FileReaderWithoutIndex fileReaderRectsD0 = FileReaderWithoutIndex(onDiskBase + ".boundingbox.tmp"); - std::optional rectD0Element = fileReaderRectsD0.GetNextElement(); - while (rectD0Element) { - sorterRectsD0Basic.push(rectD0Element.value()); - rectD0Element = fileReaderRectsD0.GetNextElement(); - } - fileReaderRectsD0.Close(); - } - - long long xSize = 0; - double globalMinX = -1; - double globalMinY = -1; - double globalMaxX = -1; - double globalMaxY = -1; - - ad_utility::BackgroundStxxlSorter sorterRectsD1 = ad_utility::BackgroundStxxlSorter(std::ceil((maxBuildingRamUsage < 9999999999 ? 
maxBuildingRamUsage : 9999999999) / 3.0)); - std::shared_ptr RectanglesD1WithOrder = std::make_shared(); - - if (workInRam) { - for (rTreeValue element : rectsD0Basic) { - rTreeValueWithOrderIndex entry = rTreeValueWithOrderIndex(element.box, element.id, xSize, 0); - RectanglesD1WithOrder->push_back(entry); - xSize++; - - if (globalMinX == -1 || element.box.min_corner().get<0>() < globalMinX) { - globalMinX = element.box.min_corner().get<0>(); - } - if (globalMinY == -1 || element.box.min_corner().get<1>() < globalMinY) { - globalMinY = element.box.min_corner().get<1>(); - } - if (element.box.max_corner().get<0>() > globalMaxX) { - globalMaxX = element.box.max_corner().get<0>(); - } - if (element.box.max_corner().get<1>() > globalMaxY) { - globalMaxY = element.box.max_corner().get<1>(); - } - } - centerOrdering(*RectanglesD1WithOrder, 1); - } else { - for (rTreeValue element : sorterRectsD0Basic.sortedView()) { - rTreeValueWithOrderIndex entry = rTreeValueWithOrderIndex(element.box, element.id, xSize, 0); - sorterRectsD1.push(entry); - xSize++; - - if (globalMinX == -1 || element.box.min_corner().get<0>() < globalMinX) { - globalMinX = element.box.min_corner().get<0>(); - } - if (globalMinY == -1 || element.box.min_corner().get<1>() < globalMinY) { - globalMinY = element.box.min_corner().get<1>(); - } - if (element.box.max_corner().get<0>() > globalMaxX) { - globalMaxX = element.box.max_corner().get<0>(); - } - if (element.box.max_corner().get<1>() > globalMaxY) { - globalMaxY = element.box.max_corner().get<1>(); - } - } - } - sorterRectsD0Basic.clear(); - - size_t currentS = std::ceil(((float) xSize) / ((float) M)); - - long long ySize = 0; - std::ofstream r1File = std::ofstream(onDiskBase + ".boundingbox.d1.tmp", std::ios::binary); - ad_utility::BackgroundStxxlSorter sorterRectsD0 = ad_utility::BackgroundStxxlSorter(std::ceil((maxBuildingRamUsage < 9999999999 ? 
maxBuildingRamUsage : 9999999999) / 3.0)); - std::shared_ptr RectanglesD0WithOrder = std::make_shared(); - std::shared_ptr r1Small = std::make_shared(); - // placeholder - r1Small->push_back(rTreeValueWithOrderIndex()); - r1Small->push_back(rTreeValueWithOrderIndex()); - rTreeValueWithOrderIndex minD1; - rTreeValueWithOrderIndex maxD1; - - if (workInRam) { - for (rTreeValueWithOrderIndex element : *RectanglesD1WithOrder) { - element.orderY = ySize; - RectanglesD0WithOrder->push_back(element); - - if (((ySize + 1) % currentS == 0 && (ySize + 1) / currentS >= 1 && (ySize + 1) / currentS < M) - || (ySize % currentS == 0 && ySize / currentS >= 1 && ySize / currentS < M)) { - // index i * S - 1 or i * S - r1Small->push_back(element); - } - - if (ySize == 0) { - minD1 = element; - maxD1 = element; - } - if (element.orderY > maxD1.orderY) { - maxD1 = element; - } - - ySize++; - } - centerOrdering(*RectanglesD0WithOrder, 0); - } else { - for (rTreeValueWithOrderIndex element : sorterRectsD1.sortedView()) { - element.orderY = ySize; - Rtree::SaveEntryWithOrderIndex(element, r1File); - sorterRectsD0.push(element); - - if (((ySize + 1) % currentS == 0 && (ySize + 1) / currentS >= 1 && (ySize + 1) / currentS < M) - || (ySize % currentS == 0 && ySize / currentS >= 1 && ySize / currentS < M)) { - // index i * S - 1 or i * S - r1Small->push_back(element); - } - - if (ySize == 0) { - minD1 = element; - maxD1 = element; - } - if (element.orderY > maxD1.orderY) { - maxD1 = element; - } - - ySize++; - } - } - - r1File.close(); - sorterRectsD1.clear(); - - // replace the placeholder - (*r1Small)[0] = minD1; - (*r1Small)[1] = maxD1; - - long long currentX = 0; - std::ofstream r0File = std::ofstream(onDiskBase + ".boundingbox.d0.tmp", std::ios::binary); - std::shared_ptr r0Small = std::make_shared(); - // placeholder - r0Small->push_back(rTreeValueWithOrderIndex()); - r0Small->push_back(rTreeValueWithOrderIndex()); - rTreeValueWithOrderIndex minD0; - rTreeValueWithOrderIndex maxD0; - - 
if (workInRam) { - for (rTreeValueWithOrderIndex element : *RectanglesD0WithOrder) { - if (((currentX + 1) % currentS == 0 && (currentX + 1) / currentS >= 1 && (currentX + 1) / currentS < M) - || (currentX % currentS == 0 && currentX / currentS >= 1 && currentX / currentS < M)) { - // index i * S - 1 or i * S - r0Small->push_back(element); - } - - if (currentX == 0) { - minD0 = element; - maxD0 = element; - } - if (element.orderX > maxD0.orderX) { - maxD0 = element; - } - - currentX++; - } - } else { - for (rTreeValueWithOrderIndex element : sorterRectsD0.sortedView()) { - Rtree::SaveEntryWithOrderIndex(element, r0File); - - if (((currentX + 1) % currentS == 0 && (currentX + 1) / currentS >= 1 && (currentX + 1) / currentS < M) - || (currentX % currentS == 0 && currentX / currentS >= 1 && currentX / currentS < M)) { - // index i * S - 1 or i * S - r0Small->push_back(element); - } - - if (currentX == 0) { - minD0 = element; - maxD0 = element; - } - if (element.orderX > maxD0.orderX) { - maxD0 = element; - } - - currentX++; - } - } - - r0File.close(); - sorterRectsD0.clear(); - - // replace the placeholder - (*r0Small)[0] = minD0; - (*r0Small)[1] = maxD0; - - boxGeo boundingBox = Rtree::createBoundingBox(globalMinX, globalMinY, globalMaxX, globalMaxY); - if (workInRam) { - orderedInputRectangles.CreateOrderedBoxesInRam(RectanglesD0WithOrder, RectanglesD1WithOrder, r0Small, r1Small, boundingBox); - } else { - orderedInputRectangles.CreateOrderedBoxesOnDisk(onDiskBase + ".boundingbox.d0", onDiskBase + ".boundingbox.d1", r0Small, r1Small, xSize, boundingBox); - } - return orderedInputRectangles; -} - -/*OrderedBoxes SortInput(const std::string& onDiskBase, size_t M, uintmax_t maxBuildingRamUsage, bool workInRam) { - if (workInRam) { - return InternalSort(onDiskBase, M); - } else { - return ExternalSort(onDiskBase, M, maxBuildingRamUsage); - } -}*/ - -bool intersects(const boxGeo &b1, const boxGeo &b2) { +bool intersects(const Rtree::BoundingBox &b1, const 
Rtree::BoundingBox &b2) { /** * Determine whether two bounding boxes intersect */ @@ -256,116 +16,13 @@ bool intersects(const boxGeo &b1, const boxGeo &b2) { b2.min_corner().get<1>() > b1.max_corner().get<1>(); return !notIntersecting; -} - -static double costFunctionTGS(boxGeo& b0, boxGeo& b1, size_t dim) { - /** - * The cost function determines the quality of a split. The lower the cost, the better the split. - * Each split gets represented by the resulting bounding boxes of the split pieces. - */ - double cost; - - // The cost represents the overlap of the two boxes - if (dim == 0) { - cost = b0.max_corner().get<0>() - b1.min_corner().get<0>(); - cost = cost < 0 ? 0 : cost; - } else { - cost = b0.max_corner().get<1>() - b1.min_corner().get<1>(); - cost = cost < 0 ? 0 : cost; - } - - return cost; -} +} -boxGeo Rtree::createBoundingBox(double pointOneX, double pointOneY, double pointTwoX, double pointTwoY) { - return make(make(pointOneX, pointOneY), make(pointTwoX, pointTwoY)); -} +Rtree::BoundingBox Rtree::createBoundingBox(double pointOneX, double pointOneY, double pointTwoX, double pointTwoY) { + return Rtree::BoundingBox(Rtree::Point(pointOneX, pointOneY), Rtree::Point(pointTwoX, pointTwoY)); +} -static std::vector TGSRecursive(const std::string& filePath, OrderedBoxes orderedInputRectangles, size_t M, size_t S, long long maxBuildingRamUsage) { - /** - * This function recursively constructs one layer of children for a certain root node. - * The input rectangles must be sorted in both x- and y-direction. 
- * The algorithm is based on this paper https://dl.acm.org/doi/pdf/10.1145/288692.288723 - */ - - unsigned long long n = orderedInputRectangles.GetSize(); - - if (n <= S || n <= M) { - // stop condition - return std::vector { orderedInputRectangles }; - } - // split the rectangles at the best split - std::pair split = orderedInputRectangles.SplitAtBest(filePath, S, M, maxBuildingRamUsage); - - // recursion - std::vector result0 = TGSRecursive(filePath + ".0", split.first, M, S, maxBuildingRamUsage); - std::vector result1 = TGSRecursive(filePath + ".1", split.second, M, S, maxBuildingRamUsage); - - std::vector result; - result.insert(result.begin(), result0.begin(), result0.end()); - result.insert(result.end(), result1.begin(), result1.end()); - - return result; -} - -void Rtree::BuildTree(const std::string& onDiskBase, size_t M, const std::string& folder) const { - const std::string file = onDiskBase + ".boundingbox.tmp"; - - // prepare the files - std::filesystem::create_directory(folder); - std::ofstream nodesOfs = std::ofstream(folder + "/nodes.bin", std::ios::binary); - std::map lookup; - - // sort the rectangles - long long fileLines = std::ceil(std::filesystem::file_size(file) / (4 * sizeof(double) + sizeof(uint64_t) + 2 * sizeof(long long))); - bool workInRam = (std::filesystem::file_size(file) + fileLines * 2 * sizeof(long long)) * 4 < this->maxBuildingRamUsage; - - OrderedBoxes orderedInputRectangles = SortInput(onDiskBase, M, maxBuildingRamUsage, workInRam); - - // build the tree in a depth first approach - std::stack layerStack; - - long long newId = 1; // start from 1, because 0 is the root item - ConstructionNode rootItem = ConstructionNode(0, orderedInputRectangles); - layerStack.push(rootItem); - size_t layer = 0; - - while (!layerStack.empty()) { - ConstructionNode currentItem = layerStack.top(); - layerStack.pop(); - - if (currentItem.GetOrderedBoxes().GetSize() <= M) { - // reached a leaf - currentItem.AddChildrenToItem(); - long long nodePtr = 
SaveNode(currentItem, true, nodesOfs); - lookup[currentItem.GetId()] = nodePtr; - } else { - std::vector tgsResult = TGSRecursive(onDiskBase + ".boundingbox." + std::to_string(layer), currentItem.GetOrderedBoxes(), M, std::ceil(((float) currentItem.GetOrderedBoxes().GetSize()) / ((float) M)), this->maxBuildingRamUsage); - for (OrderedBoxes& currentOrderedRectangles : tgsResult) { - ConstructionNode newItem = ConstructionNode(newId, currentOrderedRectangles); - layerStack.push(newItem); - - currentItem.AddChild(newItem); - - newId++; - } - - long long nodePtr = SaveNode(currentItem, false, nodesOfs); - lookup[currentItem.GetId()] = nodePtr; - } - layer++; - } - nodesOfs.close(); - - std::ofstream lookupOfs(folder + "/lookup.bin", std::ios::binary); - for (unsigned int i = 0; i < newId; i++) { - long long nodePtr = lookup[i]; - lookupOfs.write(reinterpret_cast(&nodePtr), sizeof(long long)); - } - lookupOfs.close(); -} - -multiBoxGeo Rtree::SearchTree(boxGeo query, const std::string &folder) { +multiBoxGeo Rtree::SearchTree(Rtree::BoundingBox query, const std::string &folder) { std::ifstream lookupIfs = std::ifstream(folder + "/lookup.bin", std::ios::binary); std::ifstream nodesIfs = std::ifstream(folder + "/nodes.bin", std::ios::binary); @@ -378,7 +35,7 @@ multiBoxGeo Rtree::SearchTree(boxGeo query, const std::string &folder) { Node currentNode = nodes.top(); nodes.pop(); - for (rTreeValue child : currentNode.GetChildren()) { + for (RTreeValue child : currentNode.GetChildren()) { if (intersects(query, child.box)) { if (currentNode.GetIsLastInnerNode()) { results.push_back(child); @@ -393,885 +50,60 @@ multiBoxGeo Rtree::SearchTree(boxGeo query, const std::string &folder) { lookupIfs.close(); nodesIfs.close(); return results; -} +} -ConstructionNode::ConstructionNode(long long id, OrderedBoxes orderedBoxes) - : Node{id} -{ - this->orderedBoxes = orderedBoxes; - - // calculate the boundingBoxes - this->boundingBox = orderedBoxes.GetBoundingBox(); -} - -void 
ConstructionNode::AddChildrenToItem() { - /** - * Add all children of a certain node at once. - * This is used when a leaf node is reached. - */ - if (this->GetOrderedBoxes().WorkInRam()) { - for(rTreeValueWithOrderIndex box : *this->GetOrderedBoxes().GetRectanglesInRam()) { - Node leafNode = Node(box.id, box.box); - this->AddChild(leafNode); - } - } else { - FileReader fileReader = FileReader(this->GetOrderedBoxes().GetRectanglesOnDisk()); - - std::optional element = fileReader.GetNextElement(); - while(element) { - Node leafNode = Node(element.value().id, element.value().box); - this->AddChild(leafNode); - element = fileReader.GetNextElement(); - } - - fileReader.Close(); - } -} - -long long Node::GetId() const { +uint64_t Node::GetId() const { return this->id; -} +} -OrderedBoxes ConstructionNode::GetOrderedBoxes() { - return this->orderedBoxes; -} - -Node::Node(long long id, boxGeo boundingbox) { +Node::Node(uint64_t id, Rtree::BoundingBox boundingbox) { this->id = id; this->boundingBox = boundingbox; -} +} -Node::Node(long long id) { +Node::Node(uint64_t id) { this->id = id; -} +} -Node::Node() {} +Node::Node() {} -Node::Node(long long id, boxGeo boundingBox, multiBoxGeo &children, bool isLastInnerNode) { +Node::Node(uint64_t id, Rtree::BoundingBox boundingBox, multiBoxGeo &children, bool isLastInnerNode) { this->id = id; this->boundingBox = boundingBox; this->children = children; this->isLastInnerNode = isLastInnerNode; -} +} -Node::Node(long long id, double minX, double minY, double maxX, double maxY, bool isLastInnerNode) { +Node::Node(uint64_t id, double minX, double minY, double maxX, double maxY, bool isLastInnerNode) { this->id = id; this->boundingBox = Rtree::createBoundingBox(minX, minY, maxX, maxY); this->isLastInnerNode = isLastInnerNode; -} - -void Node::AddChild(Node& child) { - boxGeo box = child.GetBoundingBox(); - unsigned long long entryId = child.GetId(); - rTreeValue entry = rTreeValue(box, entryId); - this->children.push_back(entry); -} - 
-boxGeo Node::GetBoundingBox() const { - return this->boundingBox; -} - -void Node::SetIsLastInnerNode(bool _isLastInnerNode) { - this->isLastInnerNode = _isLastInnerNode; -} +} bool Node::GetIsLastInnerNode() const { return this->isLastInnerNode; -} +} multiBoxGeo Node::GetChildren() { return this->children; -} +} -long long Rtree::SaveNode(Node &node, bool isLastInnerNode, std::ofstream& nodesOfs) { - node.SetIsLastInnerNode(isLastInnerNode); - - long long pos = static_cast(nodesOfs.tellp()); - boost::archive::binary_oarchive archive(nodesOfs); - archive << node; - nodesOfs.write(" ", 1); - - return pos; -} - -Node Rtree::LoadNode(long long id, std::ifstream& lookupIfs, std::ifstream& nodesIfs) { +Node Rtree::LoadNode(uint64_t id, std::ifstream& lookupIfs, std::ifstream& nodesIfs) { Node newNode; - long long offset = id * (long long)sizeof(long long); + uint64_t offset = id * (uint64_t)sizeof(uint64_t); lookupIfs.seekg(offset, std::ios::beg); - long long nodePtr; - lookupIfs.read(reinterpret_cast(&nodePtr), sizeof(long long)); + uint64_t nodePtr; + lookupIfs.read(reinterpret_cast(&nodePtr), sizeof(uint64_t)); nodesIfs.seekg(nodePtr); boost::archive::binary_iarchive ia(nodesIfs); ia >> newNode; return newNode; -} - -std::optional GetBoundingBoxFromWKT(const std::string& wkt) { - /** - * Parse the wkt literal in a way, that only the relevant data for the rtree gets read in. - */ - bool lookingForX = true; - bool readingDouble = false; - std::string currentDouble; - - double minX = -1; - double maxX = -1; - double minY = -1; - double maxY = -1; - - for (char c : wkt) { - if (isdigit(c)) { - readingDouble = true; - currentDouble += c; - } else if (c == '.') { - readingDouble = true; - currentDouble += '.'; - } else if (c == ' ') { - if (readingDouble && lookingForX) { - // x is completely read in - readingDouble = false; - lookingForX = false; - double x; - try { - x = std::stod(currentDouble); - } catch(...) 
{ - return { }; - } - currentDouble = ""; - if (x < minX || minX == -1) { - minX = x; - } - - if (x > maxX) { - maxX = x; - } - } - } else { - if (readingDouble && !lookingForX) { - // y is completely read in - readingDouble = false; - lookingForX = true; - double y; - try { - y = std::stod(currentDouble); - } catch(...) { - return { }; - } - currentDouble = ""; - if (y < minY || minY == -1) { - minY = y; - } - - if (y > maxY) { - maxY = y; - } - } - } - } - - return { Rtree::createBoundingBox(minX, minY, maxX, maxY) }; -} - -std::optional Rtree::ConvertWordToRtreeEntry(const std::string& wkt) { - /** - * Convert a single wkt literal to a boundingbox. - */ - std::optional boundingBox; - - /* Get the bounding box(es) of either a multipolygon, polygon or a linestring */ - std::size_t posWKTStart = wkt.find("MULTIPOLYGON(((") + 14; - std::size_t posWKTEnd = wkt.find(")))", posWKTStart); - if (posWKTStart != std::string::npos && posWKTEnd != std::string::npos) { - std::string newWkt = wkt.substr(posWKTStart, posWKTEnd - posWKTStart + 1); - boundingBox = GetBoundingBoxFromWKT(newWkt); - } else { - posWKTStart = wkt.find("POLYGON((") + 8; - posWKTEnd = wkt.find("))", posWKTStart); - if (posWKTStart != std::string::npos && posWKTEnd != std::string::npos) { - std::string newWkt = wkt.substr(posWKTStart, posWKTEnd - posWKTStart + 1); - boundingBox = GetBoundingBoxFromWKT(newWkt); - } else { - posWKTStart = wkt.find("LINESTRING(") + 10; - posWKTEnd = wkt.find(')', posWKTStart); - if (posWKTStart != std::string::npos && posWKTEnd != std::string::npos) { - std::string newWkt = wkt.substr(posWKTStart, posWKTEnd - posWKTStart + 1); - boundingBox = GetBoundingBoxFromWKT(newWkt); - } else { - return { }; - } - } - } - - return boundingBox; -} - -void Rtree::SaveEntry(boxGeo boundingBox, uint64_t index, std::ofstream& convertOfs) { - /** - * Save a single entry (which was e.g. 
converted by ConvertWordToRtreeEntry) to the disk - */ - double minX = boundingBox.min_corner().get<0>(); - double minY = boundingBox.min_corner().get<1>(); - double maxX = boundingBox.max_corner().get<0>(); - double maxY = boundingBox.max_corner().get<1>(); - - convertOfs.write(reinterpret_cast(&minX), sizeof(double)); - convertOfs.write(reinterpret_cast(&minY), sizeof(double)); - convertOfs.write(reinterpret_cast(&maxX), sizeof(double)); - convertOfs.write(reinterpret_cast(&maxY), sizeof(double)); - convertOfs.write(reinterpret_cast(&index), sizeof(uint64_t)); -} - -void Rtree::SaveEntryWithOrderIndex(rTreeValueWithOrderIndex treeValue, std::ofstream& convertOfs) { - /** - * Save a single entry, containing its postion in the x- and y-sorting - */ - double minX = treeValue.box.min_corner().get<0>(); - double minY = treeValue.box.min_corner().get<1>(); - double maxX = treeValue.box.max_corner().get<0>(); - double maxY = treeValue.box.max_corner().get<1>(); - - convertOfs.write(reinterpret_cast(&minX), sizeof(double)); - convertOfs.write(reinterpret_cast(&minY), sizeof(double)); - convertOfs.write(reinterpret_cast(&maxX), sizeof(double)); - convertOfs.write(reinterpret_cast(&maxY), sizeof(double)); - convertOfs.write(reinterpret_cast(&treeValue.id), sizeof(uint64_t)); - convertOfs.write(reinterpret_cast(&treeValue.orderX), sizeof(long long)); - convertOfs.write(reinterpret_cast(&treeValue.orderY), sizeof(long long)); -} - -multiBoxGeo Rtree::LoadEntries(const std::string& file) { - multiBoxGeo boxes; - - FileReaderWithoutIndex fileReader = FileReaderWithoutIndex(file); - - std::optional element = fileReader.GetNextElement(); - while (element) { - boxes.push_back(element.value()); - element = fileReader.GetNextElement(); - } - - fileReader.Close(); - - return boxes; -} - -multiBoxWithOrderIndex Rtree::LoadEntriesWithOrderIndex(const std::string& file) { - multiBoxWithOrderIndex boxes; - FileReader fileReader = FileReader(file); - - std::optional element = 
fileReader.GetNextElement(); - while (element) { - boxes.push_back(element.value()); - element = fileReader.GetNextElement(); - } - - fileReader.Close(); - - return boxes; -} +} Rtree::Rtree(uintmax_t maxBuildingRamUsage) { this->maxBuildingRamUsage = maxBuildingRamUsage; -} - -bool OrderedBoxes::WorkInRam() const{ - return this->workInRam; -} - -void OrderedBoxes::CreateOrderedBoxesInRam(const std::shared_ptr& rectanglesD0, const std::shared_ptr& rectanglesD1, const std::shared_ptr& rectanglesSmallD0, const std::shared_ptr& rectanglesSmallD1, boxGeo box) { - this->workInRam = true; - this->rectanglesD0InRam = rectanglesD0; - this->rectanglesD1InRam = rectanglesD1; - this->rectanglesD0Small = rectanglesSmallD0; - this->rectanglesD1Small = rectanglesSmallD1; - this->size = (*rectanglesD0).size(); - this->boundingBox = box; -} - -void OrderedBoxes::CreateOrderedBoxesOnDisk(const std::string& rectanglesD0, const std::string& rectanglesD1, const std::shared_ptr& rectanglesSmallD0, const std::shared_ptr& rectanglesSmallD1, long long size, boxGeo box) { - this->workInRam = false; - this->rectanglesD0OnDisk = rectanglesD0 + ".tmp"; - this->rectanglesD1OnDisk = rectanglesD1 + ".tmp"; - this->rectanglesD0Small = rectanglesSmallD0; - this->rectanglesD1Small = rectanglesSmallD1; - this->size = size; - this->boundingBox = box; -} - -boxGeo OrderedBoxes::GetBoundingBox() { - return this->boundingBox; -} - -long long OrderedBoxes::GetSize() const { - return this->size; -} - -std::shared_ptr OrderedBoxes::GetRectanglesInRam() { - return this->rectanglesD0InRam; -} - -std::string OrderedBoxes::GetRectanglesOnDisk() { - return this->rectanglesD0OnDisk; -} - -SplitResult OrderedBoxes::GetBestSplit() { - /** - * Determine based on the "small-lists", which split is the best for the rtree. 
- */ - struct SplitResult splitResult; - - rTreeValueWithOrderIndex minElement; - rTreeValueWithOrderIndex maxElement; - rTreeValueWithOrderIndex currentLastElement; - rTreeValueWithOrderIndex currentElement; - - // This bool is used, since we need every other element as our element "S * i" (described in the algorithm) - // To perform the split better, the element before it (S * i - 1) is saved as well - bool currentlyAtSTimesI = false; - - for (size_t dim = 0; dim < 2; dim++) { - for (long long i = 0; i < this->rectanglesD0Small->size(); i++) { - currentElement = dim == 0 ? (*this->rectanglesD0Small)[i] : (*this->rectanglesD1Small)[i]; - - if (i == 0) { - // this is the min element - minElement = currentElement; - continue; - } - - if (i == 1) { - // this is the max element - maxElement = currentElement; - continue; - } - - if (!currentlyAtSTimesI) { - currentLastElement = currentElement; - currentlyAtSTimesI = true; - continue; - } - - double minXB0 = 0; - double maxXB0 = 1; - double minXB1 = 0; - double maxXB1 = 1; - double minYB0 = 0; - double maxYB0 = 1; - double minYB1 = 0; - double maxYB1 = 1; - - if (currentlyAtSTimesI && currentElement.id != maxElement.id) { - // the current element is a possible split position. 
- if (dim == 0) { - minXB0 = (minElement.box.min_corner().get<0>() + minElement.box.max_corner().get<0>()) / 2; - maxXB0 = (currentLastElement.box.min_corner().get<0>() + currentLastElement.box.max_corner().get<0>()) / 2; - - minXB1 = (currentElement.box.min_corner().get<0>() + currentElement.box.max_corner().get<0>()) / 2; - maxXB1 = (maxElement.box.min_corner().get<0>() + maxElement.box.max_corner().get<0>()) / 2; - } else { - minYB0 = (minElement.box.min_corner().get<1>() + minElement.box.max_corner().get<1>()) / 2; - maxYB0 = (currentLastElement.box.min_corner().get<1>() + currentLastElement.box.max_corner().get<1>()) / 2; - - minYB1 = (currentElement.box.min_corner().get<1>() + currentElement.box.max_corner().get<1>()) / 2; - maxYB1 = (maxElement.box.min_corner().get<1>() + maxElement.box.max_corner().get<1>()) / 2; - } - - currentlyAtSTimesI = false; - } else { - break; - } - - boxGeo b0 = Rtree::createBoundingBox(minXB0, minYB0, maxXB0, maxYB0); - boxGeo b1 = Rtree::createBoundingBox(minXB1, minYB1, maxXB1, maxYB1); - - - double cost = costFunctionTGS(b0, b1, dim); - - if (splitResult.bestCost == -1 || cost < splitResult.bestCost) { - splitResult.bestCost = cost; - splitResult.bestDim = dim; - splitResult.bestLastElement = currentLastElement; - splitResult.bestElement = currentElement; - splitResult.bestMinElement = minElement; - splitResult.bestMaxElement = maxElement; - splitResult.bestIndex = i; - } - } - currentlyAtSTimesI = false; - } - - return splitResult; -} - -std::pair OrderedBoxes::SplitAtBest(const std::string& filePath, size_t S, size_t M, long long maxBuildingRamUsage) { - if (this->workInRam) { - return this->SplitAtBestInRam(S, M); - } else { - return this->SplitAtBestOnDisk(filePath, S, M, maxBuildingRamUsage); - } -} - -std::pair OrderedBoxes::SplitAtBestInRam(size_t S, size_t M) { - /** - * Split the ordered boxes in ram. 
First determine the best split and then perform it - */ - - struct SplitResult splitResult = this->GetBestSplit(); - - OrderedBoxes split0; - OrderedBoxes split1; - - struct SplitBuffersRam splitBuffers; - - splitBuffers.s0Dim0 = std::make_shared(); - splitBuffers.s0Dim1 = std::make_shared(); - splitBuffers.s1Dim0 = std::make_shared(); - splitBuffers.s1Dim1 = std::make_shared(); - - splitBuffers.s0SmallDim0 = std::make_shared(); - splitBuffers.s0SmallDim1 = std::make_shared(); - splitBuffers.s1SmallDim0 = std::make_shared(); - splitBuffers.s1SmallDim1 = std::make_shared(); - - std::pair boundingBoxes = PerformSplit(splitResult, splitBuffers, M, S); - - split0.CreateOrderedBoxesInRam(splitBuffers.s0Dim0, splitBuffers.s0Dim1, splitBuffers.s0SmallDim0, splitBuffers.s0SmallDim1, boundingBoxes.first); - split1.CreateOrderedBoxesInRam(splitBuffers.s1Dim0, splitBuffers.s1Dim1, splitBuffers.s1SmallDim0, splitBuffers.s1SmallDim1, boundingBoxes.second); - - (*this->rectanglesD0InRam).clear(); - (*this->rectanglesD1InRam).clear(); - (*this->rectanglesD0Small).clear(); - (*this->rectanglesD1Small).clear(); - (*this->rectanglesD0InRam).shrink_to_fit(); - (*this->rectanglesD1InRam).shrink_to_fit(); - (*this->rectanglesD0Small).shrink_to_fit(); - (*this->rectanglesD1Small).shrink_to_fit(); - - return std::make_pair(split0, split1); -} - -std::pair OrderedBoxes::SplitAtBestOnDisk(const std::string& filePath, size_t S, size_t M, long long maxBuildingRamUsage) { - /** - * Split the ordered boxes on disk. 
First determine the best split and then perform it - */ - - OrderedBoxes split0; - OrderedBoxes split1; - - struct SplitResult splitResult = this->GetBestSplit(); - - struct SplitBuffersDisk splitBuffers; - struct SplitBuffersRam splitBuffersRam; - - // perfrom the split - long long sizeLeft = std::ceil((splitResult.bestIndex - 2) / 2.0) * S; - long long sizeRight = this->size - sizeLeft; - long long split0ByteSize = sizeLeft * (4 * sizeof(double) + sizeof(uint64_t) + 2 * sizeof(long long)); - long long split1ByteSize = sizeRight * (4 * sizeof(double) + sizeof(uint64_t) + 2 * sizeof(long long)); - bool split0InRam = split0ByteSize * 4 < maxBuildingRamUsage; - bool split1InRam = split1ByteSize * 4 < maxBuildingRamUsage; - - splitBuffersRam.s0SmallDim0 = std::make_shared(); - splitBuffersRam.s0SmallDim1 = std::make_shared(); - splitBuffersRam.s1SmallDim0 = std::make_shared(); - splitBuffersRam.s1SmallDim1 = std::make_shared(); - - if (!split0InRam) { - splitBuffers.split0Dim0File = { std::ofstream(filePath + ".0.dim0.tmp", std::ios::binary) }; - splitBuffers.split0Dim1File = { std::ofstream(filePath + ".0.dim1.tmp", std::ios::binary) }; - } else { - splitBuffersRam.s0Dim0 = std::make_shared(); - splitBuffersRam.s0Dim1 = std::make_shared(); - } - - if (!split1InRam) { - splitBuffers.split1Dim0File = { std::ofstream(filePath + ".1.dim0.tmp", std::ios::binary) }; - splitBuffers.split1Dim1File = { std::ofstream(filePath + ".1.dim1.tmp", std::ios::binary) }; - } else { - splitBuffersRam.s1Dim0 = std::make_shared(); - splitBuffersRam.s1Dim1 = std::make_shared(); - } - - splitBuffers.splitBuffersRam = splitBuffersRam; - - std::pair boundingBoxes = PerformSplit(splitResult, splitBuffers, M, S, maxBuildingRamUsage); - - if (!split0InRam) { - splitBuffers.split0Dim0File.value().close(); - splitBuffers.split0Dim1File.value().close(); - - split0.CreateOrderedBoxesOnDisk(filePath + ".0.dim0", filePath + ".0.dim1", splitBuffers.splitBuffersRam.s0SmallDim0, 
splitBuffers.splitBuffersRam.s0SmallDim1, sizeLeft, boundingBoxes.first); - } else { - split0.CreateOrderedBoxesInRam(splitBuffers.splitBuffersRam.s0Dim0, splitBuffers.splitBuffersRam.s0Dim1, splitBuffers.splitBuffersRam.s0SmallDim0, splitBuffers.splitBuffersRam.s0SmallDim1, boundingBoxes.first); - } - - if (!split1InRam) { - splitBuffers.split1Dim0File.value().close(); - splitBuffers.split1Dim1File.value().close(); - - split1.CreateOrderedBoxesOnDisk(filePath + ".1.dim0", filePath + ".1.dim1", splitBuffers.splitBuffersRam.s1SmallDim0, splitBuffers.splitBuffersRam.s1SmallDim1, sizeRight, boundingBoxes.second); - } else { - split1.CreateOrderedBoxesInRam(splitBuffers.splitBuffersRam.s1Dim0, splitBuffers.splitBuffersRam.s1Dim1, splitBuffers.splitBuffersRam.s1SmallDim0, splitBuffers.splitBuffersRam.s1SmallDim1, boundingBoxes.second); - } - - std::remove(this->rectanglesD0OnDisk.c_str()); - std::remove(this->rectanglesD1OnDisk.c_str()); - - return std::make_pair(split0, split1); -} - -std::pair OrderedBoxes::PerformSplit(SplitResult splitResult, SplitBuffersRam& splitBuffersRam, size_t M, size_t S) { - /** - * Perform the best split on the current ordered boxes in the ram case - */ - - struct SplitBuffersDisk splitBuffersDisk; - - splitBuffersDisk.splitBuffersRam = splitBuffersRam; - splitBuffersDisk.split0Dim0File = {}; - splitBuffersDisk.split0Dim1File = {}; - splitBuffersDisk.split1Dim0File = {}; - splitBuffersDisk.split1Dim1File = {}; - - // reuse the PerfromSplit of the Disk case. 
- std::pair boundingBoxes = PerformSplit(splitResult, splitBuffersDisk, M, S, 0); - - splitBuffersRam = splitBuffersDisk.splitBuffersRam; - - return boundingBoxes; -} - -std::pair OrderedBoxes::PerformSplit(SplitResult splitResult, SplitBuffersDisk& splitBuffers, size_t M, size_t S, long long maxBuildingRamUsage) { - /** - * Perform the best split on the current ordered boxes in the disk case - */ - - long long sizeLeft = std::ceil((splitResult.bestIndex - 2) / 2.0) * S; - long long sizeRight = this->size - sizeLeft; - size_t SSplit0 = sizeLeft <= S ? std::ceil(sizeLeft / (double) M) : S; - size_t SSplit1 = sizeRight <= S ? std::ceil(sizeRight / (double) M) : S; - long long split0ByteSize = sizeLeft * (4 * sizeof(double) + sizeof(uint64_t) + 2 * sizeof(long long)); - long long split1ByteSize = sizeRight * (4 * sizeof(double) + sizeof(uint64_t) + 2 * sizeof(long long)); - bool split0InRam = maxBuildingRamUsage == 0 || split0ByteSize * 4 < maxBuildingRamUsage; - bool split1InRam = maxBuildingRamUsage == 0 || split1ByteSize * 4 < maxBuildingRamUsage; - - double globalMinXS0 = -1; - double globalMinYS0 = -1; - double globalMaxXS0 = -1; - double globalMaxYS0 = -1; - - double globalMinXS1 = -1; - double globalMinYS1 = -1; - double globalMaxXS1 = -1; - double globalMaxYS1 = -1; - - rTreeValueWithOrderIndex minSplit0OtherDim; - rTreeValueWithOrderIndex maxSplit0OtherDim; - rTreeValueWithOrderIndex minSplit1OtherDim; - rTreeValueWithOrderIndex maxSplit1OtherDim; - - if (splitResult.bestDim == 0) { - splitBuffers.splitBuffersRam.s0SmallDim0->push_back(splitResult.bestMinElement); - splitBuffers.splitBuffersRam.s0SmallDim0->push_back(splitResult.bestLastElement); - splitBuffers.splitBuffersRam.s1SmallDim0->push_back(splitResult.bestElement); - splitBuffers.splitBuffersRam.s1SmallDim0->push_back(splitResult.bestMaxElement); - - // placeholder, since we need the min and max element of the split in the first two spots - 
splitBuffers.splitBuffersRam.s0SmallDim1->push_back(rTreeValueWithOrderIndex()); - splitBuffers.splitBuffersRam.s0SmallDim1->push_back(rTreeValueWithOrderIndex()); - splitBuffers.splitBuffersRam.s1SmallDim1->push_back(rTreeValueWithOrderIndex()); - splitBuffers.splitBuffersRam.s1SmallDim1->push_back(rTreeValueWithOrderIndex()); - } else { - splitBuffers.splitBuffersRam.s0SmallDim1->push_back(splitResult.bestMinElement); - splitBuffers.splitBuffersRam.s0SmallDim1->push_back(splitResult.bestLastElement); - splitBuffers.splitBuffersRam.s1SmallDim1->push_back(splitResult.bestElement); - splitBuffers.splitBuffersRam.s1SmallDim1->push_back(splitResult.bestMaxElement); - - // placeholder - splitBuffers.splitBuffersRam.s0SmallDim0->push_back(rTreeValueWithOrderIndex()); - splitBuffers.splitBuffersRam.s0SmallDim0->push_back(rTreeValueWithOrderIndex()); - splitBuffers.splitBuffersRam.s1SmallDim0->push_back(rTreeValueWithOrderIndex()); - splitBuffers.splitBuffersRam.s1SmallDim0->push_back(rTreeValueWithOrderIndex()); - } - - std::optional elementOpt; - std::optional fileReaderDim0; - std::optional fileReaderDim1; - if (!this->workInRam) { - fileReaderDim0 = { FileReader(this->rectanglesD0OnDisk) }; - fileReaderDim1 = { FileReader(this->rectanglesD1OnDisk) }; - } - long long currentXSplit0 = 0; - long long currentXSplit1 = 0; - long long currentYSplit0 = 0; - long long currentYSplit1 = 0; - for (size_t dim = 0; dim < 2; dim++) { - // start performing the actual split - long long i = 0; - - if (!this->workInRam) { - if (dim == 0) - elementOpt = fileReaderDim0.value().GetNextElement(); - if (dim == 1) - elementOpt = fileReaderDim1.value().GetNextElement(); - } - - while ((this->workInRam && i < this->size) || (!this->workInRam && elementOpt)) { - rTreeValueWithOrderIndex element; - - // get the current element, either from disk or from ram - if (this->workInRam) { - element = dim == 0 ? 
(*this->rectanglesD0InRam)[i] : (*this->rectanglesD1InRam)[i]; - } else { - element = elementOpt.value(); - } - - if ((splitResult.bestDim == 0 && element.orderX < splitResult.bestElement.orderX) - || (splitResult.bestDim == 1 && element.orderY < splitResult.bestElement.orderY)) { - // the element belongs to split 0 - - if (dim == 0) { - // add the element to the split 0 dimension 0 vector / file - if (split0InRam || this->workInRam) { - splitBuffers.splitBuffersRam.s0Dim0->push_back(element); - } else { - Rtree::SaveEntryWithOrderIndex(element, splitBuffers.split0Dim0File.value()); - } - - // check if the element is at the position i * S (described in the algorithm) or one before it. - // In this case it is a future possible split position and needs to be saved to the "small list" - if (((currentXSplit0 + 1) % SSplit0 == 0 && (currentXSplit0 + 1) / SSplit0 >= 1 && (currentXSplit0 + 1) / SSplit0 < M) - || (currentXSplit0 % SSplit0 == 0 && currentXSplit0 / SSplit0 >= 1 && currentXSplit0 / SSplit0 < M)) { - // index i * S - 1 or i * S - splitBuffers.splitBuffersRam.s0SmallDim0->push_back(element); - } - - // keep track of the min and max values to construct the bounding box of the split later - if (globalMinXS0 == -1 || element.box.min_corner().get<0>() < globalMinXS0) { - globalMinXS0 = element.box.min_corner().get<0>(); - } - if (globalMinYS0 == -1 || element.box.min_corner().get<1>() < globalMinYS0) { - globalMinYS0 = element.box.min_corner().get<1>(); - } - if (element.box.max_corner().get<0>() > globalMaxXS0) { - globalMaxXS0 = element.box.max_corner().get<0>(); - } - if (element.box.max_corner().get<1>() > globalMaxYS0) { - globalMaxYS0 = element.box.max_corner().get<1>(); - } - - // keep track of the min and max element of the split, to later replace the placeholder in the "small lists" - if (splitResult.bestDim == 1) { - if (currentXSplit0 == 0) { - minSplit0OtherDim = element; - maxSplit0OtherDim = element; - } - if (element.orderX > 
maxSplit0OtherDim.orderX) { - maxSplit0OtherDim = element; - } - } - - currentXSplit0++; - } else { - if (split0InRam || this->workInRam) { - splitBuffers.splitBuffersRam.s0Dim1->push_back(element); - } else { - Rtree::SaveEntryWithOrderIndex(element, splitBuffers.split0Dim1File.value()); - } - - if (((currentYSplit0 + 1) % SSplit0 == 0 && (currentYSplit0 + 1) / SSplit0 >= 1 && (currentYSplit0 + 1) / SSplit0 < M) - || (currentYSplit0 % SSplit0 == 0 && currentYSplit0 / SSplit0 >= 1 && currentYSplit0 / SSplit0 < M)) { - // index i * S - 1 or i * S - splitBuffers.splitBuffersRam.s0SmallDim1->push_back(element); - } - - if (splitResult.bestDim == 0) { - if (currentYSplit0 == 0) { - minSplit0OtherDim = element; - maxSplit0OtherDim = element; - } - if (element.orderX > maxSplit0OtherDim.orderX) { - maxSplit0OtherDim = element; - } - } - - currentYSplit0++; - } - } else { - // the element belongs to split 1 - - if (dim == 0) { - if (split1InRam || this->workInRam) { - splitBuffers.splitBuffersRam.s1Dim0->push_back(element); - } else { - Rtree::SaveEntryWithOrderIndex(element, splitBuffers.split1Dim0File.value()); - } - if (((currentXSplit1 + 1) % SSplit1 == 0 && (currentXSplit1 + 1) / SSplit1 >= 1 && (currentXSplit1 + 1) / SSplit1 < M) - || (currentXSplit1 % SSplit1 == 0 && currentXSplit1 / SSplit1 >= 1 && currentXSplit1 / SSplit1 < M)) { - // index i * S - 1 or i * S - splitBuffers.splitBuffersRam.s1SmallDim0->push_back(element); - } - - if (globalMinXS1 == -1 || element.box.min_corner().get<0>() < globalMinXS1) { - globalMinXS1 = element.box.min_corner().get<0>(); - } - if (globalMinYS1 == -1 || element.box.min_corner().get<1>() < globalMinYS1) { - globalMinYS1 = element.box.min_corner().get<1>(); - } - if (element.box.max_corner().get<0>() > globalMaxXS1) { - globalMaxXS1 = element.box.max_corner().get<0>(); - } - if (element.box.max_corner().get<1>() > globalMaxYS1) { - globalMaxYS1 = element.box.max_corner().get<1>(); - } - - if (splitResult.bestDim == 1) { - if 
(currentXSplit1 == 0) { - minSplit1OtherDim = element; - maxSplit1OtherDim = element; - } - if (element.orderX > maxSplit1OtherDim.orderX) { - maxSplit1OtherDim = element; - } - } - - currentXSplit1++; - } else { - if (split1InRam || this->workInRam) { - splitBuffers.splitBuffersRam.s1Dim1->push_back(element); - } else { - Rtree::SaveEntryWithOrderIndex(element, splitBuffers.split1Dim1File.value()); - } - if (((currentYSplit1 + 1) % SSplit1 == 0 && (currentYSplit1 + 1) / SSplit1 >= 1 && (currentYSplit1 + 1) / SSplit1 < M) - || (currentYSplit1 % SSplit1 == 0 && currentYSplit1 / SSplit1 >= 1 && currentYSplit1 / SSplit1 < M)) { - // index i * S - 1 or i * S - splitBuffers.splitBuffersRam.s1SmallDim1->push_back(element); - } - - if (splitResult.bestDim == 0) { - if (currentYSplit1 == 0) { - minSplit1OtherDim = element; - maxSplit1OtherDim = element; - } - if (element.orderX > maxSplit1OtherDim.orderX) { - maxSplit1OtherDim = element; - } - } - - currentYSplit1++; - } - } - i++; - - if (!this->workInRam) { - if (dim == 0) - elementOpt = fileReaderDim0.value().GetNextElement(); - if (dim == 1) - elementOpt = fileReaderDim1.value().GetNextElement(); - } - } - } - - if (!this->workInRam) { - fileReaderDim0.value().Close(); - fileReaderDim1.value().Close(); - } - - // replace the placeholder - if (splitResult.bestDim == 0) { - (*splitBuffers.splitBuffersRam.s0SmallDim1)[0] = minSplit0OtherDim; - (*splitBuffers.splitBuffersRam.s0SmallDim1)[1] = maxSplit0OtherDim; - (*splitBuffers.splitBuffersRam.s1SmallDim1)[0] = minSplit1OtherDim; - (*splitBuffers.splitBuffersRam.s1SmallDim1)[1] = maxSplit1OtherDim; - } else { - (*splitBuffers.splitBuffersRam.s0SmallDim0)[0] = minSplit0OtherDim; - (*splitBuffers.splitBuffersRam.s0SmallDim0)[1] = maxSplit0OtherDim; - (*splitBuffers.splitBuffersRam.s1SmallDim0)[0] = minSplit1OtherDim; - (*splitBuffers.splitBuffersRam.s1SmallDim0)[1] = maxSplit1OtherDim; - } - - boxGeo boxSplit0 = Rtree::createBoundingBox(globalMinXS0, globalMinYS0, 
globalMaxXS0, globalMaxYS0); - boxGeo boxSplit1 = Rtree::createBoundingBox(globalMinXS1, globalMinYS1, globalMaxXS1, globalMaxYS1); - - return std::make_pair(boxSplit0, boxSplit1); -} - -FileReader::FileReader(const std::string& filePath) { - this->filePath = filePath; - - this->file = std::ifstream(this->filePath, std::ios::binary); - this->file.seekg (0, std::ifstream::end); - this->fileLength = this->file.tellg(); - this->file.seekg (0, std::ifstream::beg); -} - -std::optional FileReader::GetNextElement() { - if (this->file.tellg() < this->fileLength) { - double minX; - double minY; - double maxX; - double maxY; - uint64_t id; - long long orderX; - long long orderY; - - this->file.read(reinterpret_cast(&minX), sizeof(double)); - this->file.read(reinterpret_cast(&minY), sizeof(double)); - this->file.read(reinterpret_cast(&maxX), sizeof(double)); - this->file.read(reinterpret_cast(&maxY), sizeof(double)); - this->file.read(reinterpret_cast(&id), sizeof(uint64_t)); - this->file.read(reinterpret_cast(&orderX), sizeof(long long)); - this->file.read(reinterpret_cast(&orderY), sizeof(long long)); - - boxGeo box = Rtree::createBoundingBox(minX, minY, maxX, maxY); - rTreeValueWithOrderIndex element = rTreeValueWithOrderIndex(box, id, orderX, orderY); - - return { element }; - } else { - return {}; - } -} - -void FileReader::Close() { - this->file.close(); -} - -FileReaderWithoutIndex::FileReaderWithoutIndex(const std::string& filePath) { - this->filePath = filePath; - - this->file = std::ifstream(this->filePath, std::ios::binary); - this->file.seekg (0, std::ifstream::end); - this->fileLength = this->file.tellg(); - this->file.seekg (0, std::ifstream::beg); -} - -std::optional FileReaderWithoutIndex::GetNextElement() { - if (this->file.tellg() < this->fileLength) { - double minX; - double minY; - double maxX; - double maxY; - uint64_t id; - - this->file.read(reinterpret_cast(&minX), sizeof(double)); - this->file.read(reinterpret_cast(&minY), sizeof(double)); - 
this->file.read(reinterpret_cast(&maxX), sizeof(double)); - this->file.read(reinterpret_cast(&maxY), sizeof(double)); - this->file.read(reinterpret_cast(&id), sizeof(uint64_t)); - - boxGeo box = Rtree::createBoundingBox(minX, minY, maxX, maxY); - rTreeValue boxWithId = rTreeValue(box, id); - - return { boxWithId }; - } else { - return {}; - } -} - -void FileReaderWithoutIndex::Close() { - this->file.close(); -} +} diff --git a/src/util/Rtree.h b/src/util/Rtree.h index 2b3a26cbf0..f378cea81c 100644 --- a/src/util/Rtree.h +++ b/src/util/Rtree.h @@ -12,89 +12,36 @@ #include #include #include -#include -#include -#include -#include #include #include namespace bg = boost::geometry; -using pointGeo = bg::model::point>; -using boxGeo = bg::model::box; - -using bg::make; - -struct rTreeValue { - boxGeo box{}; - long long id; - rTreeValue(boxGeo box_, long long id_) { - box = box_; - id = id_; - } - - rTreeValue() {} - - template - void serialize(Archive & a, [[maybe_unused]]const unsigned int version) { - a & box; - a & id; - } -}; -using multiBoxGeo = std::vector; - -struct rTreeValueWithOrderIndex { - boxGeo box{}; - long long id; - long long orderX; - long long orderY; - rTreeValueWithOrderIndex(boxGeo box_, long long id_, long long orderX_, long long orderY_) { - box = box_; - id = id_; - orderX = orderX_; - orderY = orderY_; - } - - rTreeValueWithOrderIndex() {}; -}; -using multiBoxWithOrderIndex = std::vector; - -struct SplitResult { - double bestCost = -1; - size_t bestDim = 0; - long long bestIndex = 0; - rTreeValueWithOrderIndex bestLastElement; - rTreeValueWithOrderIndex bestElement; - rTreeValueWithOrderIndex bestMinElement; - rTreeValueWithOrderIndex bestMaxElement; -}; - -struct SplitBuffersRam { - std::shared_ptr s0Dim0 = std::make_shared(); - std::shared_ptr s0Dim1 = std::make_shared(); - std::shared_ptr s1Dim0 = std::make_shared(); - std::shared_ptr s1Dim1 = std::make_shared(); - - std::shared_ptr s0SmallDim0 = std::make_shared(); - std::shared_ptr 
s0SmallDim1 = std::make_shared(); - std::shared_ptr s1SmallDim0 = std::make_shared(); - std::shared_ptr s1SmallDim1 = std::make_shared(); -}; - -struct SplitBuffersDisk { - SplitBuffersRam splitBuffersRam; - std::optional split0Dim0File; - std::optional split0Dim1File; - std::optional split1Dim0File; - std::optional split1Dim1File; +/* Forward declaration */ +struct RTreeValue; +struct RTreeValueWithOrderIndex; +using multiBoxGeo = std::vector; +using multiBoxWithOrderIndex = std::vector; +struct SplitResult; +struct SplitBuffersRam; +struct SplitBuffersDisk; + +/** + * Data type to store all the information of the rectangles (in ram or on disk) + the small lists for one dimension + */ +struct RectanglesForOrderedBoxes { + std::shared_ptr rectanglesInRam; + std::string rectanglesOnDisk; + std::shared_ptr rectanglesSmall; }; class Node { protected: + typedef bg::model::point> Point; + typedef bg::model::box BoundingBox; friend class boost::serialization::access; - long long id; - boxGeo boundingBox{}; + uint64_t id; + BoundingBox boundingBox{}; bool isLastInnerNode = false; multiBoxGeo children; @@ -106,17 +53,17 @@ class Node { a & children; } - explicit Node(long long id); + explicit Node(uint64_t id); public: Node(); - Node(long long id, boxGeo boundingBox); - Node(long long id, boxGeo boundingBox, multiBoxGeo &children, bool isLastInnerNode); - Node(long long id, double minX, double minY, double maxX, double maxY, bool isLastInnerNode); - [[nodiscard]] long long GetId() const; - [[nodiscard]] boxGeo GetBoundingBox() const; + Node(uint64_t id, BoundingBox boundingBox); + Node(uint64_t id, BoundingBox boundingBox, multiBoxGeo &children, bool isLastInnerNode); + Node(uint64_t id, double minX, double minY, double maxX, double maxY, bool isLastInnerNode); + [[nodiscard]] uint64_t GetId() const; + [[nodiscard]] BoundingBox GetBoundingBox() const; void AddChild(Node& child); - void SetIsLastInnerNode(bool isLastInnerNode); + void SetIsLastInnerNode(bool isLast); 
[[nodiscard]] bool GetIsLastInnerNode() const; multiBoxGeo GetChildren(); }; @@ -125,44 +72,42 @@ BOOST_CLASS_VERSION(Node, 1) class Rtree { private: - static long long SaveNode(Node &node, bool isLastInnerNode, std::ofstream& nodesOfs); - static Node LoadNode(long long id, std::ifstream& lookupIfs, std::ifstream& nodesIfs); + static uint64_t SaveNode(Node &node, bool isLastInnerNode, std::ofstream& nodesOfs); + static Node LoadNode(uint64_t id, std::ifstream& lookupIfs, std::ifstream& nodesIfs); uintmax_t maxBuildingRamUsage; public: + typedef bg::model::point> Point; + typedef bg::model::box BoundingBox; void BuildTree(const std::string& onDiskBase, size_t M, const std::string& folder) const; - static multiBoxGeo SearchTree(boxGeo query, const std::string& folder); - static std::optional ConvertWordToRtreeEntry(const std::string& wkt); - static void SaveEntry(boxGeo boundingBox, uint64_t index, std::ofstream& convertOfs); - static void SaveEntryWithOrderIndex(rTreeValueWithOrderIndex treeValue, std::ofstream& convertOfs); + static multiBoxGeo SearchTree(BoundingBox query, const std::string& folder); + static std::optional ConvertWordToRtreeEntry(const std::string& wkt); + static void SaveEntry(BoundingBox boundingBox, uint64_t index, std::ofstream& convertOfs); + static void SaveEntryWithOrderIndex(RTreeValueWithOrderIndex treeValue, std::ofstream& convertOfs); static multiBoxGeo LoadEntries(const std::string& file); static multiBoxWithOrderIndex LoadEntriesWithOrderIndex(const std::string& file); - static boxGeo createBoundingBox(double pointOneX, double pointOneY, double pointTwoX, double pointTwoY); + static BoundingBox createBoundingBox(double pointOneX, double pointOneY, double pointTwoX, double pointTwoY); explicit Rtree(uintmax_t maxBuildingRamUsage); }; class OrderedBoxes { private: bool workInRam; - long long size; - boxGeo boundingBox; - std::shared_ptr rectanglesD0InRam; - std::shared_ptr rectanglesD1InRam; - std::string rectanglesD0OnDisk; - 
std::string rectanglesD1OnDisk; - std::shared_ptr rectanglesD0Small; - std::shared_ptr rectanglesD1Small; + uint64_t size; + Rtree::BoundingBox boundingBox; + RectanglesForOrderedBoxes rectsD0; + RectanglesForOrderedBoxes rectsD1; std::pair SplitAtBestInRam(size_t S, size_t M); - std::pair SplitAtBestOnDisk(const std::string& filePath, size_t S, size_t M, long long maxBuildingRamUsage); + std::pair SplitAtBestOnDisk(const std::string& filePath, size_t S, size_t M, uint64_t maxBuildingRamUsage); SplitResult GetBestSplit(); - std::pair PerformSplit(SplitResult splitResult, SplitBuffersRam& splitBuffersRam, size_t M, size_t S); - std::pair PerformSplit(SplitResult splitResult, SplitBuffersDisk& splitBuffers, size_t M, size_t S, long long maxBuildingRamUsage); + std::pair PerformSplit(SplitResult splitResult, SplitBuffersRam& splitBuffersRam, size_t M, size_t S); + std::pair PerformSplit(SplitResult splitResult, SplitBuffersDisk& splitBuffers, size_t M, size_t S, uint64_t maxBuildingRamUsage); public: [[nodiscard]] bool WorkInRam() const; - void CreateOrderedBoxesInRam(const std::shared_ptr& rectanglesD0, const std::shared_ptr& rectanglesD1, const std::shared_ptr& rectanglesSmallD0, const std::shared_ptr& rectanglesSmallD1, boxGeo box); // workInRam = true - void CreateOrderedBoxesOnDisk(const std::string& rectanglesD0, const std::string& rectanglesD1, const std::shared_ptr& rectanglesSmallD0, const std::shared_ptr& rectanglesSmallD1, long long size, boxGeo box); // workInRam = false - boxGeo GetBoundingBox(); - [[nodiscard]] long long GetSize() const; - std::pair SplitAtBest(const std::string& filePath, size_t S, size_t M, long long maxBuildingRamUsage); + void CreateOrderedBoxesInRam(RectanglesForOrderedBoxes& rectanglesD0, RectanglesForOrderedBoxes& rectanglesD1, Rtree::BoundingBox box); // workInRam = true + void CreateOrderedBoxesOnDisk(RectanglesForOrderedBoxes& rectanglesD0, RectanglesForOrderedBoxes& rectanglesD1, uint64_t size, Rtree::BoundingBox box); // 
workInRam = false + Rtree::BoundingBox GetBoundingBox(); + [[nodiscard]] uint64_t GetSize() const; + std::pair SplitAtBest(const std::string& filePath, size_t S, size_t M, uint64_t maxBuildingRamUsage); std::shared_ptr GetRectanglesInRam(); std::string GetRectanglesOnDisk(); }; @@ -172,36 +117,14 @@ class ConstructionNode: public Node { OrderedBoxes orderedBoxes; public: - ConstructionNode(long long id, OrderedBoxes orderedBoxes); + ConstructionNode(uint64_t id, OrderedBoxes orderedBoxes); OrderedBoxes GetOrderedBoxes(); void AddChildrenToItem(); }; -class FileReader { - private: - std::string filePath; - std::ifstream file; - long long fileLength; - public: - explicit FileReader(const std::string& filePath); - std::optional GetNextElement(); - void Close(); -}; - -class FileReaderWithoutIndex { - private: - std::string filePath; - std::ifstream file; - long long fileLength; - public: - explicit FileReaderWithoutIndex(const std::string& filePath); - std::optional GetNextElement(); - void Close(); -}; - namespace boost::serialization { template -void save(Archive & a, const boxGeo & b, [[maybe_unused]]unsigned int version) +void save(Archive & a, const Rtree::BoundingBox & b, [[maybe_unused]]unsigned int version) { a << b.min_corner().get<0>(); a << b.min_corner().get<1>(); @@ -209,7 +132,7 @@ void save(Archive & a, const boxGeo & b, [[maybe_unused]]unsigned int version) a << b.max_corner().get<1>(); } template -void load(Archive & a, boxGeo & b, [[maybe_unused]]unsigned int version) +void load(Archive & a, Rtree::BoundingBox & b, [[maybe_unused]]unsigned int version) { double minX = 0; a >> minX; @@ -219,29 +142,95 @@ void load(Archive & a, boxGeo & b, [[maybe_unused]]unsigned int version) a >> maxX; double maxY = 0; a >> maxY; - b = make(make(minX, minY), make(maxX, maxY)); + b = Rtree::BoundingBox(Rtree::Point(minX, minY), Rtree::Point(maxX, maxY)); } } -BOOST_SERIALIZATION_SPLIT_FREE(boxGeo); +BOOST_SERIALIZATION_SPLIT_FREE(Rtree::BoundingBox); + +/** + * Data 
type for a value of the Rtree, which contains the id of the object and its bounding box. + */ +struct RTreeValue { + Rtree::BoundingBox box{}; + uint64_t id = 0; -struct sortRuleLambdaX { + template + void serialize(Archive & a, [[maybe_unused]]const unsigned int version) { + a & box; + a & id; + } +}; + +/** + * Data type for a value of the Rtree (id and boundingbox), with the addtional information + * of its position in the x- and y-sorting. This is only used to create the Rtree in a more efficient way + */ +struct RTreeValueWithOrderIndex { + Rtree::BoundingBox box{}; + uint64_t id = 0; + uint64_t orderX = 0; + uint64_t orderY = 0; +}; + +/** + * Data type containing all the information about the best split found, which are needed + * to actually perform the split. + */ +struct SplitResult { + double bestCost = -1; + size_t bestDim = 0; + uint64_t bestIndex = 0; + RTreeValueWithOrderIndex bestLastElement; + RTreeValueWithOrderIndex bestElement; + RTreeValueWithOrderIndex bestMinElement; + RTreeValueWithOrderIndex bestMaxElement; +}; + +/** + * A Buffer data structure, containing vectors for the result of a split, while doing it in ram + */ +struct SplitBuffersRam { + std::shared_ptr s0Dim0 = std::make_shared(); + std::shared_ptr s0Dim1 = std::make_shared(); + std::shared_ptr s1Dim0 = std::make_shared(); + std::shared_ptr s1Dim1 = std::make_shared(); + + std::shared_ptr s0SmallDim0 = std::make_shared(); + std::shared_ptr s0SmallDim1 = std::make_shared(); + std::shared_ptr s1SmallDim0 = std::make_shared(); + std::shared_ptr s1SmallDim1 = std::make_shared(); +}; + +/** + * A Buffer data structure, containing the write streams for the result of a split, while doing it on disk + */ +struct SplitBuffersDisk { + SplitBuffersRam splitBuffersRam; + std::optional split0Dim0File; + std::optional split0Dim1File; + std::optional split1Dim0File; + std::optional split1Dim1File; +}; + +struct SortRuleLambdaX { // comparison function - bool operator()(const rTreeValue& b1, const 
rTreeValue& b2) const { + bool operator()(const RTreeValue& b1, const RTreeValue& b2) const { double center1 = (b1.box.min_corner().get<0>() + b1.box.max_corner().get<0>()) / 2; double center2 = (b2.box.min_corner().get<0>() + b2.box.max_corner().get<0>()) / 2; return center1 < center2; } // Value that is strictly smaller than any input element. - static rTreeValue min_value() { return {Rtree::createBoundingBox(DBL_MIN, DBL_MIN, DBL_MIN, DBL_MIN), 0}; } + static RTreeValue min_value() { return {Rtree::createBoundingBox(-DBL_MAX, -DBL_MAX, -DBL_MAX, -DBL_MAX), 0}; } // Value that is strictly larger than any input element. - static rTreeValue max_value() { return {Rtree::createBoundingBox(DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX), 0}; } + static RTreeValue max_value() { return {Rtree::createBoundingBox(DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX), 0}; } }; -struct sortRuleLambdaXWithIndex { + +struct SortRuleLambdaXWithIndex { // comparison function - bool operator()(const rTreeValueWithOrderIndex& b1, const rTreeValueWithOrderIndex& b2) const { + bool operator()(const RTreeValueWithOrderIndex& b1, const RTreeValueWithOrderIndex& b2) const { double center1 = (b1.box.min_corner().get<0>() + b1.box.max_corner().get<0>()) / 2; double center2 = (b2.box.min_corner().get<0>() + b2.box.max_corner().get<0>()) / 2; @@ -251,15 +240,15 @@ struct sortRuleLambdaXWithIndex { } // Value that is strictly smaller than any input element. - static rTreeValueWithOrderIndex min_value() { return {Rtree::createBoundingBox(DBL_MIN, DBL_MIN, DBL_MIN, DBL_MIN), 0, 0, 0}; } + static RTreeValueWithOrderIndex min_value() { return {Rtree::createBoundingBox(-DBL_MAX, -DBL_MAX, -DBL_MAX, -DBL_MAX), 0, 0, 0}; } // Value that is strictly larger than any input element. 
- static rTreeValueWithOrderIndex max_value() { return {Rtree::createBoundingBox(DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX), 0, LLONG_MAX, LLONG_MAX}; } + static RTreeValueWithOrderIndex max_value() { return {Rtree::createBoundingBox(DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX), 0, LLONG_MAX, LLONG_MAX}; } }; -struct sortRuleLambdaYWithIndex { +struct SortRuleLambdaYWithIndex { // comparison function - bool operator()(const rTreeValueWithOrderIndex& b1, const rTreeValueWithOrderIndex& b2) const { + bool operator()(const RTreeValueWithOrderIndex& b1, const RTreeValueWithOrderIndex& b2) const { double center1 = (b1.box.min_corner().get<1>() + b1.box.max_corner().get<1>()) / 2; double center2 = (b2.box.min_corner().get<1>() + b2.box.max_corner().get<1>()) / 2; @@ -269,10 +258,10 @@ struct sortRuleLambdaYWithIndex { } // Value that is strictly smaller than any input element. - static rTreeValueWithOrderIndex min_value() { return {Rtree::createBoundingBox(DBL_MIN, DBL_MIN, DBL_MIN, DBL_MIN), 0, 0, 0}; } + static RTreeValueWithOrderIndex min_value() { return {Rtree::createBoundingBox(-DBL_MAX, -DBL_MAX, -DBL_MAX, -DBL_MAX), 0, 0, 0}; } // Value that is strictly larger than any input element. - static rTreeValueWithOrderIndex max_value() { return {Rtree::createBoundingBox(DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX), 0, LLONG_MAX, LLONG_MAX}; } + static RTreeValueWithOrderIndex max_value() { return {Rtree::createBoundingBox(DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX), 0, LLONG_MAX, LLONG_MAX}; } }; #endif //QLEVER_RTREE_H diff --git a/src/util/RtreeBuild.cpp b/src/util/RtreeBuild.cpp new file mode 100644 index 0000000000..1bbc43e4d6 --- /dev/null +++ b/src/util/RtreeBuild.cpp @@ -0,0 +1,1133 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Noah Nock +#include +#include +#include +#include +#include + +static void centerOrdering(multiBoxGeo& boxes, size_t dim) { + if (dim == 0) { + // order by centerX + SortRuleLambdaX comp; + + std::sort(boxes.begin(), boxes.end(), comp); + } else { + // order by centerY + auto sortRuleLambda = [](RTreeValue b1, RTreeValue b2) -> bool { + double center1 = std::midpoint(b1.box.min_corner().get<1>(), b1.box.max_corner().get<1>()); + double center2 = std::midpoint(b2.box.min_corner().get<1>(), b2.box.max_corner().get<1>()); + return center1 < center2; + }; + + std::sort(boxes.begin(), boxes.end(), sortRuleLambda); + } +} + +static void centerOrdering(multiBoxWithOrderIndex& boxes, size_t dim) { + if (dim == 0) { + // order by centerX + SortRuleLambdaXWithIndex comp; + + std::sort(boxes.begin(), boxes.end(), comp); + } else { + // order by centerY + SortRuleLambdaYWithIndex comp; + + std::sort(boxes.begin(), boxes.end(), comp); + } +} + +OrderedBoxes SortInput(const std::string& onDiskBase, size_t M, uintmax_t maxBuildingRamUsage, bool workInRam) { + OrderedBoxes orderedInputRectangles; + + ad_utility::BackgroundStxxlSorter sorterRectsD0Basic = ad_utility::BackgroundStxxlSorter(std::ceil((maxBuildingRamUsage < 9999999999 ? 
maxBuildingRamUsage : 9999999999) / 3.0)); + multiBoxGeo rectsD0Basic; + + if (workInRam) { + rectsD0Basic = Rtree::LoadEntries(onDiskBase + ".boundingbox.tmp"); + centerOrdering(rectsD0Basic, 0); + } else { + FileReaderWithoutIndex fileReaderRectsD0 = FileReaderWithoutIndex(onDiskBase + ".boundingbox.tmp"); + std::optional rectD0Element = fileReaderRectsD0.GetNextElement(); + while (rectD0Element) { + sorterRectsD0Basic.push(rectD0Element.value()); + rectD0Element = fileReaderRectsD0.GetNextElement(); + } + fileReaderRectsD0.Close(); + } + + uint64_t xSize = 0; + double globalMinX = -1; + double globalMinY = -1; + double globalMaxX = -1; + double globalMaxY = -1; + + ad_utility::BackgroundStxxlSorter sorterRectsD1 = ad_utility::BackgroundStxxlSorter(std::ceil((maxBuildingRamUsage < 9999999999 ? maxBuildingRamUsage : 9999999999) / 3.0)); + std::shared_ptr RectanglesD1WithOrder = std::make_shared(); + + if (workInRam) { + for (RTreeValue element : rectsD0Basic) { + RTreeValueWithOrderIndex entry = {element.box, element.id, xSize, 0}; + RectanglesD1WithOrder->push_back(entry); + xSize++; + + if (globalMinX == -1 || element.box.min_corner().get<0>() < globalMinX) { + globalMinX = element.box.min_corner().get<0>(); + } + if (globalMinY == -1 || element.box.min_corner().get<1>() < globalMinY) { + globalMinY = element.box.min_corner().get<1>(); + } + if (element.box.max_corner().get<0>() > globalMaxX) { + globalMaxX = element.box.max_corner().get<0>(); + } + if (element.box.max_corner().get<1>() > globalMaxY) { + globalMaxY = element.box.max_corner().get<1>(); + } + } + centerOrdering(*RectanglesD1WithOrder, 1); + } else { + for (RTreeValue element : sorterRectsD0Basic.sortedView()) { + RTreeValueWithOrderIndex entry = {element.box, element.id, xSize, 0}; + sorterRectsD1.push(entry); + xSize++; + + if (globalMinX == -1 || element.box.min_corner().get<0>() < globalMinX) { + globalMinX = element.box.min_corner().get<0>(); + } + if (globalMinY == -1 || 
element.box.min_corner().get<1>() < globalMinY) { + globalMinY = element.box.min_corner().get<1>(); + } + if (element.box.max_corner().get<0>() > globalMaxX) { + globalMaxX = element.box.max_corner().get<0>(); + } + if (element.box.max_corner().get<1>() > globalMaxY) { + globalMaxY = element.box.max_corner().get<1>(); + } + } + } + sorterRectsD0Basic.clear(); + + size_t currentS = std::ceil(((float) xSize) / ((float) M)); + + uint64_t ySize = 0; + std::ofstream r1File = std::ofstream(onDiskBase + ".boundingbox.d1.tmp", std::ios::binary); + ad_utility::BackgroundStxxlSorter sorterRectsD0 = ad_utility::BackgroundStxxlSorter(std::ceil((maxBuildingRamUsage < 9999999999 ? maxBuildingRamUsage : 9999999999) / 3.0)); + std::shared_ptr RectanglesD0WithOrder = std::make_shared(); + std::shared_ptr r1Small = std::make_shared(); + // placeholder + r1Small->push_back(RTreeValueWithOrderIndex()); + r1Small->push_back(RTreeValueWithOrderIndex()); + RTreeValueWithOrderIndex minD1; + RTreeValueWithOrderIndex maxD1; + + if (workInRam) { + for (RTreeValueWithOrderIndex element : *RectanglesD1WithOrder) { + element.orderY = ySize; + RectanglesD0WithOrder->push_back(element); + + if (((ySize + 1) % currentS == 0 && (ySize + 1) / currentS >= 1 && (ySize + 1) / currentS < M) + || (ySize % currentS == 0 && ySize / currentS >= 1 && ySize / currentS < M)) { + // index i * S - 1 or i * S + r1Small->push_back(element); + } + + if (ySize == 0) { + minD1 = element; + maxD1 = element; + } + if (element.orderY > maxD1.orderY) { + maxD1 = element; + } + + ySize++; + } + centerOrdering(*RectanglesD0WithOrder, 0); + } else { + for (RTreeValueWithOrderIndex element : sorterRectsD1.sortedView()) { + element.orderY = ySize; + Rtree::SaveEntryWithOrderIndex(element, r1File); + sorterRectsD0.push(element); + + if (((ySize + 1) % currentS == 0 && (ySize + 1) / currentS >= 1 && (ySize + 1) / currentS < M) + || (ySize % currentS == 0 && ySize / currentS >= 1 && ySize / currentS < M)) { + // index i * S - 1 
or i * S + r1Small->push_back(element); + } + + if (ySize == 0) { + minD1 = element; + maxD1 = element; + } + if (element.orderY > maxD1.orderY) { + maxD1 = element; + } + + ySize++; + } + } + + r1File.close(); + sorterRectsD1.clear(); + + // replace the placeholder + (*r1Small)[0] = minD1; + (*r1Small)[1] = maxD1; + + uint64_t currentX = 0; + std::ofstream r0File = std::ofstream(onDiskBase + ".boundingbox.d0.tmp", std::ios::binary); + std::shared_ptr r0Small = std::make_shared(); + // placeholder + r0Small->push_back(RTreeValueWithOrderIndex()); + r0Small->push_back(RTreeValueWithOrderIndex()); + RTreeValueWithOrderIndex minD0; + RTreeValueWithOrderIndex maxD0; + + if (workInRam) { + for (RTreeValueWithOrderIndex element : *RectanglesD0WithOrder) { + if (((currentX + 1) % currentS == 0 && (currentX + 1) / currentS >= 1 && (currentX + 1) / currentS < M) + || (currentX % currentS == 0 && currentX / currentS >= 1 && currentX / currentS < M)) { + // index i * S - 1 or i * S + r0Small->push_back(element); + } + + if (currentX == 0) { + minD0 = element; + maxD0 = element; + } + if (element.orderX > maxD0.orderX) { + maxD0 = element; + } + + currentX++; + } + } else { + for (RTreeValueWithOrderIndex element : sorterRectsD0.sortedView()) { + Rtree::SaveEntryWithOrderIndex(element, r0File); + + if (((currentX + 1) % currentS == 0 && (currentX + 1) / currentS >= 1 && (currentX + 1) / currentS < M) + || (currentX % currentS == 0 && currentX / currentS >= 1 && currentX / currentS < M)) { + // index i * S - 1 or i * S + r0Small->push_back(element); + } + + if (currentX == 0) { + minD0 = element; + maxD0 = element; + } + if (element.orderX > maxD0.orderX) { + maxD0 = element; + } + + currentX++; + } + } + + r0File.close(); + sorterRectsD0.clear(); + + // replace the placeholder + (*r0Small)[0] = minD0; + (*r0Small)[1] = maxD0; + + Rtree::BoundingBox boundingBox = Rtree::createBoundingBox(globalMinX, globalMinY, globalMaxX, globalMaxY); + RectanglesForOrderedBoxes rectsD0; + 
RectanglesForOrderedBoxes rectsD1; + rectsD0.rectanglesSmall = r0Small; + rectsD1.rectanglesSmall = r1Small; + if (workInRam) { + rectsD0.rectanglesInRam = RectanglesD0WithOrder; + rectsD1.rectanglesInRam = RectanglesD1WithOrder; + orderedInputRectangles.CreateOrderedBoxesInRam(rectsD0, rectsD1, boundingBox); + } else { + rectsD0.rectanglesOnDisk = onDiskBase + ".boundingbox.d0"; + rectsD1.rectanglesOnDisk = onDiskBase + ".boundingbox.d1"; + orderedInputRectangles.CreateOrderedBoxesOnDisk(rectsD0, rectsD1, xSize, boundingBox); + } + return orderedInputRectangles; +} + +/*OrderedBoxes SortInput(const std::string& onDiskBase, size_t M, uintmax_t maxBuildingRamUsage, bool workInRam) { + if (workInRam) { + return InternalSort(onDiskBase, M); + } else { + return ExternalSort(onDiskBase, M, maxBuildingRamUsage); + } +}*/ + +static double costFunctionTGS(Rtree::BoundingBox& b0, Rtree::BoundingBox& b1, size_t dim) { + /** + * The cost function determines the quality of a split. The lower the cost, the better the split. + * Each split gets represented by the resulting bounding boxes of the split pieces. + */ + double cost; + + // The cost represents the overlap of the two boxes + if (dim == 0) { + cost = b0.max_corner().get<0>() - b1.min_corner().get<0>(); + cost = cost < 0 ? 0 : cost; + } else { + cost = b0.max_corner().get<1>() - b1.min_corner().get<1>(); + cost = cost < 0 ? 0 : cost; + } + + return cost; +} + +static std::vector TGSRecursive(const std::string& filePath, OrderedBoxes orderedInputRectangles, size_t M, size_t S, uint64_t maxBuildingRamUsage) { + /** + * This function recursively constructs one layer of children for a certain root node. + * The input rectangles must be sorted in both x- and y-direction. 
+ * The algorithm is based on this paper https://dl.acm.org/doi/pdf/10.1145/288692.288723 + */ + + uint64_t n = orderedInputRectangles.GetSize(); + + if (n <= S || n <= M) { + // stop condition + return std::vector { orderedInputRectangles }; + } + // split the rectangles at the best split + std::pair split = orderedInputRectangles.SplitAtBest(filePath, S, M, maxBuildingRamUsage); + + // recursion + std::vector result0 = TGSRecursive(filePath + ".0", split.first, M, S, maxBuildingRamUsage); + std::vector result1 = TGSRecursive(filePath + ".1", split.second, M, S, maxBuildingRamUsage); + + std::vector result; + result.insert(result.begin(), result0.begin(), result0.end()); + result.insert(result.end(), result1.begin(), result1.end()); + + return result; +} + +void Rtree::BuildTree(const std::string& onDiskBase, size_t M, const std::string& folder) const { + const std::string file = onDiskBase + ".boundingbox.tmp"; + + // prepare the files + std::filesystem::create_directory(folder); + std::ofstream nodesOfs = std::ofstream(folder + "/nodes.bin", std::ios::binary); + std::map lookup; + + // sort the rectangles + uint64_t fileLines = std::ceil(std::filesystem::file_size(file) / (4 * sizeof(double) + sizeof(uint64_t) + 2 * sizeof(uint64_t))); + bool workInRam = (std::filesystem::file_size(file) + fileLines * 2 * sizeof(uint64_t)) * 4 < this->maxBuildingRamUsage; + + OrderedBoxes orderedInputRectangles = SortInput(onDiskBase, M, maxBuildingRamUsage, workInRam); + + // build the tree in a depth first approach + std::stack layerStack; + + uint64_t newId = 1; // start from 1, because 0 is the root item + ConstructionNode rootItem = ConstructionNode(0, orderedInputRectangles); + layerStack.push(rootItem); + size_t layer = 0; + + while (!layerStack.empty()) { + ConstructionNode currentItem = layerStack.top(); + layerStack.pop(); + + if (currentItem.GetOrderedBoxes().GetSize() <= M) { + // reached a leaf + currentItem.AddChildrenToItem(); + uint64_t nodePtr = 
SaveNode(currentItem, true, nodesOfs); + lookup[currentItem.GetId()] = nodePtr; + } else { + std::vector tgsResult = TGSRecursive(onDiskBase + ".boundingbox." + std::to_string(layer), currentItem.GetOrderedBoxes(), M, std::ceil(((float) currentItem.GetOrderedBoxes().GetSize()) / ((float) M)), this->maxBuildingRamUsage); + for (OrderedBoxes& currentOrderedRectangles : tgsResult) { + ConstructionNode newItem = ConstructionNode(newId, currentOrderedRectangles); + layerStack.push(newItem); + + currentItem.AddChild(newItem); + + newId++; + } + + uint64_t nodePtr = SaveNode(currentItem, false, nodesOfs); + lookup[currentItem.GetId()] = nodePtr; + } + layer++; + } + nodesOfs.close(); + + std::ofstream lookupOfs(folder + "/lookup.bin", std::ios::binary); + for (unsigned int i = 0; i < newId; i++) { + uint64_t nodePtr = lookup[i]; + lookupOfs.write(reinterpret_cast(&nodePtr), sizeof(uint64_t)); + } + lookupOfs.close(); +} + +ConstructionNode::ConstructionNode(uint64_t id, OrderedBoxes orderedBoxes) + : Node{id} +{ + this->orderedBoxes = orderedBoxes; + + // calculate the boundingBoxes + this->boundingBox = orderedBoxes.GetBoundingBox(); +} + +void ConstructionNode::AddChildrenToItem() { + /** + * Add all children of a certain node at once. + * This is used when a leaf node is reached. 
+ */ + if (this->GetOrderedBoxes().WorkInRam()) { + for(RTreeValueWithOrderIndex box : *this->GetOrderedBoxes().GetRectanglesInRam()) { + Node leafNode = Node(box.id, box.box); + this->AddChild(leafNode); + } + } else { + FileReader fileReader = FileReader(this->GetOrderedBoxes().GetRectanglesOnDisk()); + + std::optional element = fileReader.GetNextElement(); + while(element) { + Node leafNode = Node(element.value().id, element.value().box); + this->AddChild(leafNode); + element = fileReader.GetNextElement(); + } + + fileReader.Close(); + } +} + +OrderedBoxes ConstructionNode::GetOrderedBoxes() { + return this->orderedBoxes; +} + +void Node::AddChild(Node& child) { + Rtree::BoundingBox box = child.GetBoundingBox(); + uint64_t entryId = child.GetId(); + RTreeValue entry = {box, entryId}; + this->children.push_back(entry); +} + +Rtree::BoundingBox Node::GetBoundingBox() const { + return this->boundingBox; +} + +void Node::SetIsLastInnerNode(bool isLast) { + this->isLastInnerNode = isLast; +} + +uint64_t Rtree::SaveNode(Node &node, bool isLastInnerNode, std::ofstream& nodesOfs) { + node.SetIsLastInnerNode(isLastInnerNode); + + uint64_t pos = static_cast(nodesOfs.tellp()); + boost::archive::binary_oarchive archive(nodesOfs); + archive << node; + nodesOfs.write(" ", 1); + + return pos; +} + +std::optional GetBoundingBoxFromWKT(const std::string& wkt) { + /** + * Parse the wkt literal in a way, that only the relevant data for the rtree gets read in. 
+ */ + bool lookingForX = true; + bool readingDouble = false; + std::string currentDouble; + + double minX = -1; + double maxX = -1; + double minY = -1; + double maxY = -1; + + for (char c : wkt) { + if (isdigit(c)) { + readingDouble = true; + currentDouble += c; + } else if (c == '.') { + readingDouble = true; + currentDouble += '.'; + } else if (c == ' ') { + if (readingDouble && lookingForX) { + // x is completely read in + readingDouble = false; + lookingForX = false; + double x; + try { + x = std::stod(currentDouble); + } catch(...) { + return { }; + } + currentDouble = ""; + if (x < minX || minX == -1) { + minX = x; + } + + if (x > maxX) { + maxX = x; + } + } + } else { + if (readingDouble && !lookingForX) { + // y is completely read in + readingDouble = false; + lookingForX = true; + double y; + try { + y = std::stod(currentDouble); + } catch(...) { + return { }; + } + currentDouble = ""; + if (y < minY || minY == -1) { + minY = y; + } + + if (y > maxY) { + maxY = y; + } + } + } + } + + return { Rtree::createBoundingBox(minX, minY, maxX, maxY) }; +} + +std::optional Rtree::ConvertWordToRtreeEntry(const std::string& wkt) { + /** + * Convert a single wkt literal to a boundingbox. 
+ */ + std::optional boundingBox; + + /* Get the bounding box(es) of either a multipolygon, polygon or a linestring */ + std::size_t posWKTStart = wkt.find("MULTIPOLYGON(((") + 14; + std::size_t posWKTEnd = wkt.find(")))", posWKTStart); + if (posWKTStart != std::string::npos && posWKTEnd != std::string::npos) { + std::string newWkt = wkt.substr(posWKTStart, posWKTEnd - posWKTStart + 1); + boundingBox = GetBoundingBoxFromWKT(newWkt); + } else { + posWKTStart = wkt.find("POLYGON((") + 8; + posWKTEnd = wkt.find("))", posWKTStart); + if (posWKTStart != std::string::npos && posWKTEnd != std::string::npos) { + std::string newWkt = wkt.substr(posWKTStart, posWKTEnd - posWKTStart + 1); + boundingBox = GetBoundingBoxFromWKT(newWkt); + } else { + posWKTStart = wkt.find("LINESTRING(") + 10; + posWKTEnd = wkt.find(')', posWKTStart); + if (posWKTStart != std::string::npos && posWKTEnd != std::string::npos) { + std::string newWkt = wkt.substr(posWKTStart, posWKTEnd - posWKTStart + 1); + boundingBox = GetBoundingBoxFromWKT(newWkt); + } else { + return { }; + } + } + } + + return boundingBox; +} + +void Rtree::SaveEntry(Rtree::BoundingBox boundingBox, uint64_t index, std::ofstream& convertOfs) { + /** + * Save a single entry (which was e.g. 
converted by ConvertWordToRtreeEntry) to the disk + */ + double minX = boundingBox.min_corner().get<0>(); + double minY = boundingBox.min_corner().get<1>(); + double maxX = boundingBox.max_corner().get<0>(); + double maxY = boundingBox.max_corner().get<1>(); + + convertOfs.write(reinterpret_cast(&minX), sizeof(double)); + convertOfs.write(reinterpret_cast(&minY), sizeof(double)); + convertOfs.write(reinterpret_cast(&maxX), sizeof(double)); + convertOfs.write(reinterpret_cast(&maxY), sizeof(double)); + convertOfs.write(reinterpret_cast(&index), sizeof(uint64_t)); +} + +void Rtree::SaveEntryWithOrderIndex(RTreeValueWithOrderIndex treeValue, std::ofstream& convertOfs) { + /** + * Save a single entry, containing its postion in the x- and y-sorting + */ + double minX = treeValue.box.min_corner().get<0>(); + double minY = treeValue.box.min_corner().get<1>(); + double maxX = treeValue.box.max_corner().get<0>(); + double maxY = treeValue.box.max_corner().get<1>(); + + convertOfs.write(reinterpret_cast(&minX), sizeof(double)); + convertOfs.write(reinterpret_cast(&minY), sizeof(double)); + convertOfs.write(reinterpret_cast(&maxX), sizeof(double)); + convertOfs.write(reinterpret_cast(&maxY), sizeof(double)); + convertOfs.write(reinterpret_cast(&treeValue.id), sizeof(uint64_t)); + convertOfs.write(reinterpret_cast(&treeValue.orderX), sizeof(uint64_t)); + convertOfs.write(reinterpret_cast(&treeValue.orderY), sizeof(uint64_t)); +} + +multiBoxGeo Rtree::LoadEntries(const std::string& file) { + multiBoxGeo boxes; + + FileReaderWithoutIndex fileReader = FileReaderWithoutIndex(file); + + std::optional element = fileReader.GetNextElement(); + while (element) { + boxes.push_back(element.value()); + element = fileReader.GetNextElement(); + } + + fileReader.Close(); + + return boxes; +} + +multiBoxWithOrderIndex Rtree::LoadEntriesWithOrderIndex(const std::string& file) { + multiBoxWithOrderIndex boxes; + FileReader fileReader = FileReader(file); + + std::optional element = 
fileReader.GetNextElement(); + while (element) { + boxes.push_back(element.value()); + element = fileReader.GetNextElement(); + } + + fileReader.Close(); + + return boxes; +} + +bool OrderedBoxes::WorkInRam() const{ + return this->workInRam; +} + +void OrderedBoxes::CreateOrderedBoxesInRam(RectanglesForOrderedBoxes& rectanglesD0, RectanglesForOrderedBoxes& rectanglesD1, Rtree::BoundingBox box) { + this->workInRam = true; + this->rectsD0 = rectanglesD0; + this->rectsD1 = rectanglesD1; + this->size = (*rectsD0.rectanglesInRam).size(); + this->boundingBox = box; +} + +void OrderedBoxes::CreateOrderedBoxesOnDisk(RectanglesForOrderedBoxes& rectanglesD0, RectanglesForOrderedBoxes& rectanglesD1, uint64_t size, Rtree::BoundingBox box) { + this->workInRam = false; + this->rectsD0 = rectanglesD0; + this->rectsD1 = rectanglesD1; + this->size = size; + this->boundingBox = box; +} + +Rtree::BoundingBox OrderedBoxes::GetBoundingBox() { + return this->boundingBox; +} + +uint64_t OrderedBoxes::GetSize() const { + return this->size; +} + +std::shared_ptr OrderedBoxes::GetRectanglesInRam() { + return this->rectsD0.rectanglesInRam; +} + +std::string OrderedBoxes::GetRectanglesOnDisk() { + return this->rectsD0.rectanglesOnDisk; +} + +SplitResult OrderedBoxes::GetBestSplit() { + /** + * Determine based on the "small-lists", which split is the best for the rtree. + */ + struct SplitResult splitResult; + + RTreeValueWithOrderIndex minElement; + RTreeValueWithOrderIndex maxElement; + RTreeValueWithOrderIndex currentLastElement; + RTreeValueWithOrderIndex currentElement; + + // This bool is used, since we need every other element as our element "S * i" (described in the algorithm) + // To perform the split better, the element before it (S * i - 1) is saved as well + bool currentlyAtSTimesI = false; + + for (size_t dim = 0; dim < 2; dim++) { + for (uint64_t i = 0; i < this->rectsD0.rectanglesSmall->size(); i++) { + currentElement = dim == 0 ? 
(*this->rectsD0.rectanglesSmall)[i] : (*this->rectsD1.rectanglesSmall)[i]; + + if (i == 0) { + // this is the min element + minElement = currentElement; + continue; + } + + if (i == 1) { + // this is the max element + maxElement = currentElement; + continue; + } + + if (!currentlyAtSTimesI) { + currentLastElement = currentElement; + currentlyAtSTimesI = true; + continue; + } + + double minXB0 = 0; + double maxXB0 = 1; + double minXB1 = 0; + double maxXB1 = 1; + double minYB0 = 0; + double maxYB0 = 1; + double minYB1 = 0; + double maxYB1 = 1; + + if (currentlyAtSTimesI && currentElement.id != maxElement.id) { + // the current element is a possible split position. + if (dim == 0) { + minXB0 = std::midpoint(minElement.box.min_corner().get<0>(), minElement.box.max_corner().get<0>()); + maxXB0 = std::midpoint(currentLastElement.box.min_corner().get<0>(), currentLastElement.box.max_corner().get<0>()); + + minXB1 = std::midpoint(currentElement.box.min_corner().get<0>(), currentElement.box.max_corner().get<0>()); + maxXB1 = std::midpoint(maxElement.box.min_corner().get<0>(), maxElement.box.max_corner().get<0>()); + } else { + minYB0 = std::midpoint(minElement.box.min_corner().get<1>(), minElement.box.max_corner().get<1>()); + maxYB0 = std::midpoint(currentLastElement.box.min_corner().get<1>(), currentLastElement.box.max_corner().get<1>()); + + minYB1 = std::midpoint(currentElement.box.min_corner().get<1>(), currentElement.box.max_corner().get<1>()); + maxYB1 = std::midpoint(maxElement.box.min_corner().get<1>(), maxElement.box.max_corner().get<1>()); + } + + currentlyAtSTimesI = false; + } else { + break; + } + + Rtree::BoundingBox b0 = Rtree::createBoundingBox(minXB0, minYB0, maxXB0, maxYB0); + Rtree::BoundingBox b1 = Rtree::createBoundingBox(minXB1, minYB1, maxXB1, maxYB1); + + + double cost = costFunctionTGS(b0, b1, dim); + + if (splitResult.bestCost == -1 || cost < splitResult.bestCost) { + splitResult.bestCost = cost; + splitResult.bestDim = dim; + 
splitResult.bestLastElement = currentLastElement; + splitResult.bestElement = currentElement; + splitResult.bestMinElement = minElement; + splitResult.bestMaxElement = maxElement; + splitResult.bestIndex = i; + } + } + currentlyAtSTimesI = false; + } + + return splitResult; +} + +std::pair OrderedBoxes::SplitAtBest(const std::string& filePath, size_t S, size_t M, uint64_t maxBuildingRamUsage) { + if (this->workInRam) { + return this->SplitAtBestInRam(S, M); + } else { + return this->SplitAtBestOnDisk(filePath, S, M, maxBuildingRamUsage); + } +} + +std::pair OrderedBoxes::SplitAtBestInRam(size_t S, size_t M) { + /** + * Split the ordered boxes in ram. First determine the best split and then perform it + */ + + struct SplitResult splitResult = this->GetBestSplit(); + + OrderedBoxes split0; + OrderedBoxes split1; + + struct SplitBuffersRam splitBuffers; + + splitBuffers.s0Dim0 = std::make_shared(); + splitBuffers.s0Dim1 = std::make_shared(); + splitBuffers.s1Dim0 = std::make_shared(); + splitBuffers.s1Dim1 = std::make_shared(); + + splitBuffers.s0SmallDim0 = std::make_shared(); + splitBuffers.s0SmallDim1 = std::make_shared(); + splitBuffers.s1SmallDim0 = std::make_shared(); + splitBuffers.s1SmallDim1 = std::make_shared(); + + std::pair boundingBoxes = PerformSplit(splitResult, splitBuffers, M, S); + + RectanglesForOrderedBoxes rectsD0Split0 = {splitBuffers.s0Dim0, "", splitBuffers.s0SmallDim0}; + RectanglesForOrderedBoxes rectsD1Split0 = {splitBuffers.s0Dim1, "", splitBuffers.s0SmallDim1}; + RectanglesForOrderedBoxes rectsD0Split1 = {splitBuffers.s1Dim0, "", splitBuffers.s1SmallDim0}; + RectanglesForOrderedBoxes rectsD1Split1 = {splitBuffers.s1Dim1, "", splitBuffers.s1SmallDim1}; + split0.CreateOrderedBoxesInRam(rectsD0Split0, rectsD1Split0, boundingBoxes.first); + split1.CreateOrderedBoxesInRam(rectsD0Split1, rectsD1Split1, boundingBoxes.second); + + (*this->rectsD0.rectanglesInRam).clear(); + (*this->rectsD1.rectanglesInRam).clear(); + 
(*this->rectsD0.rectanglesSmall).clear(); + (*this->rectsD1.rectanglesSmall).clear(); + (*this->rectsD0.rectanglesInRam).shrink_to_fit(); + (*this->rectsD1.rectanglesInRam).shrink_to_fit(); + (*this->rectsD0.rectanglesSmall).shrink_to_fit(); + (*this->rectsD1.rectanglesSmall).shrink_to_fit(); + + return std::make_pair(split0, split1); +} + +std::pair OrderedBoxes::SplitAtBestOnDisk(const std::string& filePath, size_t S, size_t M, uint64_t maxBuildingRamUsage) { + /** + * Split the ordered boxes on disk. First determine the best split and then perform it + */ + + OrderedBoxes split0; + OrderedBoxes split1; + + struct SplitResult splitResult = this->GetBestSplit(); + + struct SplitBuffersDisk splitBuffers; + struct SplitBuffersRam splitBuffersRam; + + // perfrom the split + uint64_t sizeLeft = std::ceil((splitResult.bestIndex - 2) / 2.0) * S; + uint64_t sizeRight = this->size - sizeLeft; + uint64_t split0ByteSize = sizeLeft * (4 * sizeof(double) + sizeof(uint64_t) + 2 * sizeof(uint64_t)); + uint64_t split1ByteSize = sizeRight * (4 * sizeof(double) + sizeof(uint64_t) + 2 * sizeof(uint64_t)); + bool split0InRam = split0ByteSize * 4 < maxBuildingRamUsage; + bool split1InRam = split1ByteSize * 4 < maxBuildingRamUsage; + + splitBuffersRam.s0SmallDim0 = std::make_shared(); + splitBuffersRam.s0SmallDim1 = std::make_shared(); + splitBuffersRam.s1SmallDim0 = std::make_shared(); + splitBuffersRam.s1SmallDim1 = std::make_shared(); + + if (!split0InRam) { + splitBuffers.split0Dim0File = { std::ofstream(filePath + ".0.dim0.tmp", std::ios::binary) }; + splitBuffers.split0Dim1File = { std::ofstream(filePath + ".0.dim1.tmp", std::ios::binary) }; + } else { + splitBuffersRam.s0Dim0 = std::make_shared(); + splitBuffersRam.s0Dim1 = std::make_shared(); + } + + if (!split1InRam) { + splitBuffers.split1Dim0File = { std::ofstream(filePath + ".1.dim0.tmp", std::ios::binary) }; + splitBuffers.split1Dim1File = { std::ofstream(filePath + ".1.dim1.tmp", std::ios::binary) }; + } else { + 
splitBuffersRam.s1Dim0 = std::make_shared(); + splitBuffersRam.s1Dim1 = std::make_shared(); + } + + splitBuffers.splitBuffersRam = splitBuffersRam; + + std::pair boundingBoxes = PerformSplit(splitResult, splitBuffers, M, S, maxBuildingRamUsage); + + RectanglesForOrderedBoxes rectsD0Split0; + RectanglesForOrderedBoxes rectsD1Split0; + RectanglesForOrderedBoxes rectsD0Split1; + RectanglesForOrderedBoxes rectsD1Split1; + rectsD0Split0.rectanglesInRam = splitBuffers.splitBuffersRam.s0SmallDim0; + rectsD1Split0.rectanglesInRam = splitBuffers.splitBuffersRam.s0SmallDim1; + rectsD0Split1.rectanglesInRam = splitBuffers.splitBuffersRam.s1SmallDim0; + rectsD1Split1.rectanglesInRam = splitBuffers.splitBuffersRam.s1SmallDim1; + + if (!split0InRam) { + splitBuffers.split0Dim0File.value().close(); + splitBuffers.split0Dim1File.value().close(); + + rectsD0Split0.rectanglesOnDisk = filePath + ".0.dim0"; + rectsD1Split0.rectanglesOnDisk = filePath + ".0.dim1"; + + split0.CreateOrderedBoxesOnDisk(rectsD0Split0, rectsD1Split0, sizeLeft, boundingBoxes.first); + } else { + rectsD0Split0.rectanglesInRam = splitBuffers.splitBuffersRam.s0Dim0; + rectsD1Split0.rectanglesInRam = splitBuffers.splitBuffersRam.s0Dim1; + split0.CreateOrderedBoxesInRam(rectsD0Split0, rectsD1Split0, boundingBoxes.first); + } + + if (!split1InRam) { + splitBuffers.split1Dim0File.value().close(); + splitBuffers.split1Dim1File.value().close(); + + rectsD0Split1.rectanglesOnDisk = filePath + ".1.dim0"; + rectsD1Split1.rectanglesOnDisk = filePath + ".1.dim1"; + + split1.CreateOrderedBoxesOnDisk(rectsD0Split1, rectsD1Split1, sizeRight, boundingBoxes.second); + } else { + rectsD0Split1.rectanglesInRam = splitBuffers.splitBuffersRam.s1Dim0; + rectsD1Split1.rectanglesInRam = splitBuffers.splitBuffersRam.s1Dim1; + split1.CreateOrderedBoxesInRam(rectsD0Split1, rectsD1Split1, boundingBoxes.second); + } + + std::remove(this->rectsD0.rectanglesOnDisk.c_str()); + std::remove(this->rectsD1.rectanglesOnDisk.c_str()); + + return 
std::make_pair(split0, split1); +} + +std::pair OrderedBoxes::PerformSplit(SplitResult splitResult, SplitBuffersRam& splitBuffersRam, size_t M, size_t S) { + /** + * Perform the best split on the current ordered boxes in the ram case + */ + + struct SplitBuffersDisk splitBuffersDisk; + + splitBuffersDisk.splitBuffersRam = splitBuffersRam; + splitBuffersDisk.split0Dim0File = {}; + splitBuffersDisk.split0Dim1File = {}; + splitBuffersDisk.split1Dim0File = {}; + splitBuffersDisk.split1Dim1File = {}; + + // reuse the PerfromSplit of the Disk case. + std::pair boundingBoxes = PerformSplit(splitResult, splitBuffersDisk, M, S, 0); + + splitBuffersRam = splitBuffersDisk.splitBuffersRam; + + return boundingBoxes; +} + +std::pair OrderedBoxes::PerformSplit(SplitResult splitResult, SplitBuffersDisk& splitBuffers, size_t M, size_t S, uint64_t maxBuildingRamUsage) { + /** + * Perform the best split on the current ordered boxes in the disk case + */ + + uint64_t sizeLeft = std::ceil((splitResult.bestIndex - 2) / 2.0) * S; + uint64_t sizeRight = this->size - sizeLeft; + size_t SSplit0 = sizeLeft <= S ? std::ceil(sizeLeft / (double) M) : S; + size_t SSplit1 = sizeRight <= S ? 
std::ceil(sizeRight / (double) M) : S; + uint64_t split0ByteSize = sizeLeft * (4 * sizeof(double) + sizeof(uint64_t) + 2 * sizeof(uint64_t)); + uint64_t split1ByteSize = sizeRight * (4 * sizeof(double) + sizeof(uint64_t) + 2 * sizeof(uint64_t)); + bool split0InRam = maxBuildingRamUsage == 0 || split0ByteSize * 4 < maxBuildingRamUsage; + bool split1InRam = maxBuildingRamUsage == 0 || split1ByteSize * 4 < maxBuildingRamUsage; + + double globalMinXS0 = -1; + double globalMinYS0 = -1; + double globalMaxXS0 = -1; + double globalMaxYS0 = -1; + + double globalMinXS1 = -1; + double globalMinYS1 = -1; + double globalMaxXS1 = -1; + double globalMaxYS1 = -1; + + RTreeValueWithOrderIndex minSplit0OtherDim; + RTreeValueWithOrderIndex maxSplit0OtherDim; + RTreeValueWithOrderIndex minSplit1OtherDim; + RTreeValueWithOrderIndex maxSplit1OtherDim; + + struct OtherDimension { + std::shared_ptr smallSplit0; + std::shared_ptr smallSplit1; + } otherDimension; + + if (splitResult.bestDim == 0) { + splitBuffers.splitBuffersRam.s0SmallDim0->push_back(splitResult.bestMinElement); + splitBuffers.splitBuffersRam.s0SmallDim0->push_back(splitResult.bestLastElement); + splitBuffers.splitBuffersRam.s1SmallDim0->push_back(splitResult.bestElement); + splitBuffers.splitBuffersRam.s1SmallDim0->push_back(splitResult.bestMaxElement); + + // placeholder, since we need the min and max element of the split in the first two spots + otherDimension.smallSplit0 = splitBuffers.splitBuffersRam.s0SmallDim1; + otherDimension.smallSplit1 = splitBuffers.splitBuffersRam.s1SmallDim1; + + otherDimension.smallSplit0->push_back(RTreeValueWithOrderIndex()); + otherDimension.smallSplit0->push_back(RTreeValueWithOrderIndex()); + otherDimension.smallSplit1->push_back(RTreeValueWithOrderIndex()); + otherDimension.smallSplit1->push_back(RTreeValueWithOrderIndex()); + } else { + splitBuffers.splitBuffersRam.s0SmallDim1->push_back(splitResult.bestMinElement); + 
splitBuffers.splitBuffersRam.s0SmallDim1->push_back(splitResult.bestLastElement); + splitBuffers.splitBuffersRam.s1SmallDim1->push_back(splitResult.bestElement); + splitBuffers.splitBuffersRam.s1SmallDim1->push_back(splitResult.bestMaxElement); + + // placeholder + otherDimension.smallSplit0 = splitBuffers.splitBuffersRam.s0SmallDim0; + otherDimension.smallSplit1 = splitBuffers.splitBuffersRam.s1SmallDim0; + + otherDimension.smallSplit0->push_back(RTreeValueWithOrderIndex()); + otherDimension.smallSplit0->push_back(RTreeValueWithOrderIndex()); + otherDimension.smallSplit1->push_back(RTreeValueWithOrderIndex()); + otherDimension.smallSplit1->push_back(RTreeValueWithOrderIndex()); + } + + std::optional elementOpt; + std::optional fileReaderDim0; + std::optional fileReaderDim1; + if (!this->workInRam) { + fileReaderDim0 = { FileReader(this->rectsD0.rectanglesOnDisk) }; + fileReaderDim1 = { FileReader(this->rectsD1.rectanglesOnDisk) }; + } + uint64_t currentXSplit0 = 0; + uint64_t currentXSplit1 = 0; + uint64_t currentYSplit0 = 0; + uint64_t currentYSplit1 = 0; + for (size_t dim = 0; dim < 2; dim++) { + // start performing the actual split + uint64_t i = 0; + + if (!this->workInRam) { + if (dim == 0) + elementOpt = fileReaderDim0.value().GetNextElement(); + if (dim == 1) + elementOpt = fileReaderDim1.value().GetNextElement(); + } + + while ((this->workInRam && i < this->size) || (!this->workInRam && elementOpt)) { + RTreeValueWithOrderIndex element; + + // get the current element, either from disk or from ram + if (this->workInRam) { + element = dim == 0 ? 
(*this->rectsD0.rectanglesInRam)[i] : (*this->rectsD1.rectanglesInRam)[i]; + } else { + element = elementOpt.value(); + } + + if ((splitResult.bestDim == 0 && element.orderX < splitResult.bestElement.orderX) + || (splitResult.bestDim == 1 && element.orderY < splitResult.bestElement.orderY)) { + // the element belongs to split 0 + + if (dim == 0) { + // add the element to the split 0 dimension 0 vector / file + if (split0InRam || this->workInRam) { + splitBuffers.splitBuffersRam.s0Dim0->push_back(element); + } else { + Rtree::SaveEntryWithOrderIndex(element, splitBuffers.split0Dim0File.value()); + } + + // check if the element is at the position i * S (described in the algorithm) or one before it. + // In this case it is a future possible split position and needs to be saved to the "small list" + if (((currentXSplit0 + 1) % SSplit0 == 0 && (currentXSplit0 + 1) / SSplit0 >= 1 && (currentXSplit0 + 1) / SSplit0 < M) + || (currentXSplit0 % SSplit0 == 0 && currentXSplit0 / SSplit0 >= 1 && currentXSplit0 / SSplit0 < M)) { + // index i * S - 1 or i * S + splitBuffers.splitBuffersRam.s0SmallDim0->push_back(element); + } + + // keep track of the min and max values to construct the bounding box of the split later + if (globalMinXS0 == -1 || element.box.min_corner().get<0>() < globalMinXS0) { + globalMinXS0 = element.box.min_corner().get<0>(); + } + if (globalMinYS0 == -1 || element.box.min_corner().get<1>() < globalMinYS0) { + globalMinYS0 = element.box.min_corner().get<1>(); + } + if (element.box.max_corner().get<0>() > globalMaxXS0) { + globalMaxXS0 = element.box.max_corner().get<0>(); + } + if (element.box.max_corner().get<1>() > globalMaxYS0) { + globalMaxYS0 = element.box.max_corner().get<1>(); + } + + // keep track of the min and max element of the split, to later replace the placeholder in the "small lists" + if (splitResult.bestDim == 1) { + if (currentXSplit0 == 0) { + minSplit0OtherDim = element; + maxSplit0OtherDim = element; + } + if (element.orderX > 
maxSplit0OtherDim.orderX) { + maxSplit0OtherDim = element; + } + } + + currentXSplit0++; + } else { + if (split0InRam || this->workInRam) { + splitBuffers.splitBuffersRam.s0Dim1->push_back(element); + } else { + Rtree::SaveEntryWithOrderIndex(element, splitBuffers.split0Dim1File.value()); + } + + if (((currentYSplit0 + 1) % SSplit0 == 0 && (currentYSplit0 + 1) / SSplit0 >= 1 && (currentYSplit0 + 1) / SSplit0 < M) + || (currentYSplit0 % SSplit0 == 0 && currentYSplit0 / SSplit0 >= 1 && currentYSplit0 / SSplit0 < M)) { + // index i * S - 1 or i * S + splitBuffers.splitBuffersRam.s0SmallDim1->push_back(element); + } + + if (splitResult.bestDim == 0) { + if (currentYSplit0 == 0) { + minSplit0OtherDim = element; + maxSplit0OtherDim = element; + } + if (element.orderX > maxSplit0OtherDim.orderX) { + maxSplit0OtherDim = element; + } + } + + currentYSplit0++; + } + } else { + // the element belongs to split 1 + + if (dim == 0) { + if (split1InRam || this->workInRam) { + splitBuffers.splitBuffersRam.s1Dim0->push_back(element); + } else { + Rtree::SaveEntryWithOrderIndex(element, splitBuffers.split1Dim0File.value()); + } + if (((currentXSplit1 + 1) % SSplit1 == 0 && (currentXSplit1 + 1) / SSplit1 >= 1 && (currentXSplit1 + 1) / SSplit1 < M) + || (currentXSplit1 % SSplit1 == 0 && currentXSplit1 / SSplit1 >= 1 && currentXSplit1 / SSplit1 < M)) { + // index i * S - 1 or i * S + splitBuffers.splitBuffersRam.s1SmallDim0->push_back(element); + } + + if (globalMinXS1 == -1 || element.box.min_corner().get<0>() < globalMinXS1) { + globalMinXS1 = element.box.min_corner().get<0>(); + } + if (globalMinYS1 == -1 || element.box.min_corner().get<1>() < globalMinYS1) { + globalMinYS1 = element.box.min_corner().get<1>(); + } + if (element.box.max_corner().get<0>() > globalMaxXS1) { + globalMaxXS1 = element.box.max_corner().get<0>(); + } + if (element.box.max_corner().get<1>() > globalMaxYS1) { + globalMaxYS1 = element.box.max_corner().get<1>(); + } + + if (splitResult.bestDim == 1) { + if 
(currentXSplit1 == 0) { + minSplit1OtherDim = element; + maxSplit1OtherDim = element; + } + if (element.orderX > maxSplit1OtherDim.orderX) { + maxSplit1OtherDim = element; + } + } + + currentXSplit1++; + } else { + if (split1InRam || this->workInRam) { + splitBuffers.splitBuffersRam.s1Dim1->push_back(element); + } else { + Rtree::SaveEntryWithOrderIndex(element, splitBuffers.split1Dim1File.value()); + } + if (((currentYSplit1 + 1) % SSplit1 == 0 && (currentYSplit1 + 1) / SSplit1 >= 1 && (currentYSplit1 + 1) / SSplit1 < M) + || (currentYSplit1 % SSplit1 == 0 && currentYSplit1 / SSplit1 >= 1 && currentYSplit1 / SSplit1 < M)) { + // index i * S - 1 or i * S + splitBuffers.splitBuffersRam.s1SmallDim1->push_back(element); + } + + if (splitResult.bestDim == 0) { + if (currentYSplit1 == 0) { + minSplit1OtherDim = element; + maxSplit1OtherDim = element; + } + if (element.orderX > maxSplit1OtherDim.orderX) { + maxSplit1OtherDim = element; + } + } + + currentYSplit1++; + } + } + i++; + + if (!this->workInRam) { + if (dim == 0) + elementOpt = fileReaderDim0.value().GetNextElement(); + if (dim == 1) + elementOpt = fileReaderDim1.value().GetNextElement(); + } + } + } + + if (!this->workInRam) { + fileReaderDim0.value().Close(); + fileReaderDim1.value().Close(); + } + + // replace the placeholder + (*otherDimension.smallSplit0)[0] = minSplit0OtherDim; + (*otherDimension.smallSplit0)[1] = maxSplit0OtherDim; + (*otherDimension.smallSplit1)[0] = minSplit1OtherDim; + (*otherDimension.smallSplit1)[1] = maxSplit1OtherDim; + + Rtree::BoundingBox boxSplit0 = Rtree::createBoundingBox(globalMinXS0, globalMinYS0, globalMaxXS0, globalMaxYS0); + Rtree::BoundingBox boxSplit1 = Rtree::createBoundingBox(globalMinXS1, globalMinYS1, globalMaxXS1, globalMaxYS1); + + return std::make_pair(boxSplit0, boxSplit1); +} + diff --git a/src/util/RtreeFileReader.cpp b/src/util/RtreeFileReader.cpp new file mode 100644 index 0000000000..1288f1342f --- /dev/null +++ b/src/util/RtreeFileReader.cpp @@ -0,0 +1,82 
@@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Noah Nock + +#include +#include + +FileReader::FileReader(const std::string& filePath) { + this->filePath = filePath; + + this->file = std::ifstream(this->filePath, std::ios::binary); + this->file.seekg (0, std::ifstream::end); + this->fileLength = this->file.tellg(); + this->file.seekg (0, std::ifstream::beg); +} + +std::optional FileReader::GetNextElement() { + if (static_cast(this->file.tellg()) < this->fileLength) { + double minX; + double minY; + double maxX; + double maxY; + uint64_t id; + uint64_t orderX; + uint64_t orderY; + + this->file.read(reinterpret_cast(&minX), sizeof(double)); + this->file.read(reinterpret_cast(&minY), sizeof(double)); + this->file.read(reinterpret_cast(&maxX), sizeof(double)); + this->file.read(reinterpret_cast(&maxY), sizeof(double)); + this->file.read(reinterpret_cast(&id), sizeof(uint64_t)); + this->file.read(reinterpret_cast(&orderX), sizeof(uint64_t)); + this->file.read(reinterpret_cast(&orderY), sizeof(uint64_t)); + + Rtree::BoundingBox box = Rtree::createBoundingBox(minX, minY, maxX, maxY); + RTreeValueWithOrderIndex element = {box, id, orderX, orderY}; + + return { element }; + } else { + return {}; + } +} + +void FileReader::Close() { + this->file.close(); +} + +FileReaderWithoutIndex::FileReaderWithoutIndex(const std::string& filePath) { + this->filePath = filePath; + + this->file = std::ifstream(this->filePath, std::ios::binary); + this->file.seekg (0, std::ifstream::end); + this->fileLength = this->file.tellg(); + this->file.seekg (0, std::ifstream::beg); +} + +std::optional FileReaderWithoutIndex::GetNextElement() { + if (static_cast(this->file.tellg()) < this->fileLength) { + double minX; + double minY; + double maxX; + double maxY; + uint64_t id; + + this->file.read(reinterpret_cast(&minX), sizeof(double)); + this->file.read(reinterpret_cast(&minY), sizeof(double)); + this->file.read(reinterpret_cast(&maxX), 
sizeof(double)); + this->file.read(reinterpret_cast(&maxY), sizeof(double)); + this->file.read(reinterpret_cast(&id), sizeof(uint64_t)); + + Rtree::BoundingBox box = Rtree::createBoundingBox(minX, minY, maxX, maxY); + RTreeValue boxWithId = {box, id}; + + return { boxWithId }; + } else { + return {}; + } +} + +void FileReaderWithoutIndex::Close() { + this->file.close(); +} diff --git a/src/util/RtreeFileReader.h b/src/util/RtreeFileReader.h new file mode 100644 index 0000000000..0a019916a8 --- /dev/null +++ b/src/util/RtreeFileReader.h @@ -0,0 +1,41 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Noah Nock + +#ifndef QLEVER_RTREEFILEREADER_H +#define QLEVER_RTREEFILEREADER_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +class FileReader { + private: + std::string filePath; + std::ifstream file; + uint64_t fileLength; + public: + explicit FileReader(const std::string& filePath); + std::optional GetNextElement(); + void Close(); +}; + +class FileReaderWithoutIndex { + private: + std::string filePath; + std::ifstream file; + uint64_t fileLength; + public: + explicit FileReaderWithoutIndex(const std::string& filePath); + std::optional GetNextElement(); + void Close(); +}; + +#endif //QLEVER_RTREEFILEREADER_H From 9f33b6a4ab4c7d45c9e8c042451ed22167760971 Mon Sep 17 00:00:00 2001 From: Noah Nock Date: Mon, 25 Sep 2023 09:45:35 +0200 Subject: [PATCH 17/20] Code refactoring --- Dockerfile | 2 +- src/VocabularyMergerMain.cpp | 6 +- .../sparqlExpressions/ContainsExpression.cpp | 222 ++--- .../sparqlExpressions/ContainsExpression.h | 42 +- src/global/Constants.h | 2 + src/index/Index.h | 3 +- src/index/IndexImpl.cpp | 32 +- src/index/IndexImpl.h | 2 +- src/index/VocabularyGenerator.h | 14 +- src/index/VocabularyGeneratorImpl.h | 21 +- .../sparqlParser/SparqlQleverVisitor.cpp | 21 +- src/util/Rtree.cpp | 100 ++- src/util/Rtree.h | 353 +++++--- 
src/util/RtreeBuild.cpp | 790 +++++++++--------- src/util/RtreeFileReader.cpp | 20 +- src/util/RtreeFileReader.h | 21 +- test/VocabularyGeneratorTest.cpp | 30 +- 17 files changed, 939 insertions(+), 742 deletions(-) diff --git a/Dockerfile b/Dockerfile index b0830f76ac..73474f1feb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -20,7 +20,7 @@ RUN make test FROM base as runtime WORKDIR /app ENV DEBIAN_FRONTEND=noninteractive -RUN apt-get update && apt-get install -y wget python3-yaml unzip curl bzip2 pkg-config libicu-dev python3-icu libgomp1 uuid-runtime make lbzip2 libjemalloc-dev libzstd-dev libssl-dev libboost1.74-dev libboost-program-options1.74-dev libboost-iostreams1.74-dev +RUN apt-get update && apt-get install -y wget python3-yaml unzip curl bzip2 pkg-config libicu-dev python3-icu libgomp1 uuid-runtime make lbzip2 libjemalloc-dev libzstd-dev libssl-dev libboost1.74-dev libboost-program-options1.74-dev libboost-iostreams1.74-dev libboost-serialization1.74-dev ARG UID=1000 RUN groupadd -r qlever && useradd --no-log-init -r -u $UID -g qlever qlever && chown qlever:qlever /app diff --git a/src/VocabularyMergerMain.cpp b/src/VocabularyMergerMain.cpp index 822141f0a4..27820c0183 100644 --- a/src/VocabularyMergerMain.cpp +++ b/src/VocabularyMergerMain.cpp @@ -21,10 +21,12 @@ int main(int argc, char** argv) { VocabularyMerger m; auto file = ad_utility::makeOfstream(basename + INTERNAL_VOCAB_SUFFIX); - auto internalVocabularyAction = [&file](const auto& word, [[maybe_unused]]const auto& index) { + auto internalVocabularyAction = [&file](const auto& word, + [[maybe_unused]] const auto& index) { file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n'; }; - auto externalVocabularyAction = []([[maybe_unused]]const auto& word, [[maybe_unused]]const auto& index) {}; + auto externalVocabularyAction = []([[maybe_unused]] const auto& word, + [[maybe_unused]] const auto& index) {}; m.mergeVocabulary(basename, numFiles, TripleComponentComparator(), 
internalVocabularyAction, externalVocabularyAction); } diff --git a/src/engine/sparqlExpressions/ContainsExpression.cpp b/src/engine/sparqlExpressions/ContainsExpression.cpp index 25a74c339f..732bfc6285 100644 --- a/src/engine/sparqlExpressions/ContainsExpression.cpp +++ b/src/engine/sparqlExpressions/ContainsExpression.cpp @@ -4,10 +4,11 @@ #include "./ContainsExpression.h" +#include + #include "engine/sparqlExpressions/SparqlExpressionGenerators.h" #include "global/ValueIdComparators.h" #include "re2/re2.h" -#include using namespace std::literals; @@ -30,117 +31,130 @@ std::string removeQuotes(std::string_view input) { namespace sparqlExpression { // ___________________________________________________________________________ - ContainsExpression::ContainsExpression( - SparqlExpression::Ptr child, SparqlExpression::Ptr boundingBox) - : child_{std::move(child)} { - if (!dynamic_cast(child_.get())) { - throw std::runtime_error( - "Contain expressions are currently supported only on variables."); - } - std::string boundingBoxString; - std::string originalBoundingBoxString; - if (auto boundingBoxPtr = - dynamic_cast(boundingBox.get())) { - originalBoundingBoxString = boundingBoxPtr->value().normalizedLiteralContent().get(); - if (!boundingBoxPtr->value().datatypeOrLangtag().empty()) { - throw std::runtime_error( - "The second argument to the Contain function (which contains the " - "bounding box) must not contain a language tag or a datatype"); - } - boundingBoxString = removeQuotes(originalBoundingBoxString); - } else { - throw std::runtime_error( - "The second argument to the Contains function must be a " - "string literal (which contains the bounding box of format \"minX,minY,maxX,maxY\")"); - } - - boundingBoxAsString_ = boundingBoxString; - - std::vector boundingBoxEntriesAsString(4); - std::string errorMessage; - std::string::size_type searchFrom = 0; - for (int i = 0; i < 4; i++) { - if (i == 3) { - if (searchFrom >= boundingBoxAsString_.size()) { errorMessage 
= "The fourth argument was not provided"; break; } - boundingBoxEntriesAsString[i] = boundingBoxAsString_.substr(searchFrom, boundingBoxAsString_.size() - searchFrom); - break; - } - std::string::size_type end = boundingBoxAsString_.find(',', searchFrom); - if (end >= boundingBoxAsString_.size() - 1 || end == std::string::npos) { errorMessage = "There are not enough arguments"; break; } - boundingBoxEntriesAsString[i] = boundingBoxAsString_.substr(searchFrom, end - searchFrom); - searchFrom = end + 1; - } - - if (errorMessage.empty()) { - double minX; - double minY; - double maxX; - double maxY; - - try { - minX = std::stod(boundingBoxEntriesAsString[0]); - minY = std::stod(boundingBoxEntriesAsString[1]); - maxX = std::stod(boundingBoxEntriesAsString[2]); - maxY = std::stod(boundingBoxEntriesAsString[3]); - - boundingBox_ = new BoundingBoxType(Rtree::createBoundingBox(minX, minY, maxX, maxY)); - } catch (const std::invalid_argument& e) { - errorMessage = e.what(); - } catch (const std::out_of_range& e) { - errorMessage = e.what(); - } - } - - if (!errorMessage.empty()) { - throw std::runtime_error{absl::StrCat( - "The bounding box ", originalBoundingBoxString, - " is not supported by QLever (must be of format \"minX,minY,maxX,maxY\"). 
" - "Error message is: ", errorMessage - )}; - } +ContainsExpression::ContainsExpression(SparqlExpression::Ptr child, + SparqlExpression::Ptr boundingBox) + : child_{std::move(child)} { + if (!dynamic_cast(child_.get())) { + throw std::runtime_error( + "Contain expressions are currently supported only on variables."); + } + std::string boundingBoxString; + std::string originalBoundingBoxString; + if (auto boundingBoxPtr = + dynamic_cast(boundingBox.get())) { + originalBoundingBoxString = + boundingBoxPtr->value().normalizedLiteralContent().get(); + if (!boundingBoxPtr->value().datatypeOrLangtag().empty()) { + throw std::runtime_error( + "The second argument to the Contain function (which contains the " + "bounding box) must not contain a language tag or a datatype"); } + boundingBoxString = removeQuotes(originalBoundingBoxString); + } else { + throw std::runtime_error( + "The second argument to the Contains function must be a " + "string literal (which contains the bounding box of format " + "\"minX,minY,maxX,maxY\")"); + } + boundingBoxAsString_ = boundingBoxString; + + std::vector boundingBoxEntriesAsString(4); + std::string errorMessage; + std::string::size_type searchFrom = 0; + for (int i = 0; i < 4; i++) { + if (i == 3) { + if (searchFrom >= boundingBoxAsString_.size()) { + errorMessage = "The fourth argument was not provided"; + break; + } + boundingBoxEntriesAsString[i] = boundingBoxAsString_.substr( + searchFrom, boundingBoxAsString_.size() - searchFrom); + break; + } + std::string::size_type end = boundingBoxAsString_.find(',', searchFrom); + if (end >= boundingBoxAsString_.size() - 1 || end == std::string::npos) { + errorMessage = "There are not enough arguments"; + break; + } + boundingBoxEntriesAsString[i] = + boundingBoxAsString_.substr(searchFrom, end - searchFrom); + searchFrom = end + 1; + } -// ___________________________________________________________________________ - string ContainsExpression::getCacheKey( - const VariableToColumnMap& 
varColMap) const { - return absl::StrCat("Bounding Box CONTAINS expression ", child_->getCacheKey(varColMap), - " with ", boundingBoxAsString_); + if (errorMessage.empty()) { + double minX; + double minY; + double maxX; + double maxY; + + try { + minX = std::stod(boundingBoxEntriesAsString[0]); + minY = std::stod(boundingBoxEntriesAsString[1]); + maxX = std::stod(boundingBoxEntriesAsString[2]); + maxY = std::stod(boundingBoxEntriesAsString[3]); + + boundingBox_ = + new BoundingBoxType(Rtree::createBoundingBox(minX, minY, maxX, maxY)); + } catch (const std::invalid_argument& e) { + errorMessage = e.what(); + } catch (const std::out_of_range& e) { + errorMessage = e.what(); } + } + + if (!errorMessage.empty()) { + throw std::runtime_error{ + absl::StrCat("The bounding box ", originalBoundingBoxString, + " is not supported by QLever (must be of format " + "\"minX,minY,maxX,maxY\"). " + "Error message is: ", + errorMessage)}; + } +} // ___________________________________________________________________________ - std::span ContainsExpression::childrenImpl() { - return {&child_, 1}; - } +string ContainsExpression::getCacheKey( + const VariableToColumnMap& varColMap) const { + return absl::StrCat("Bounding Box CONTAINS expression ", + child_->getCacheKey(varColMap), " with ", + boundingBoxAsString_); +} + +// ___________________________________________________________________________ +std::span ContainsExpression::childrenImpl() { + return {&child_, 1}; +} // ___________________________________________________________________________ - ExpressionResult ContainsExpression::evaluate( - sparqlExpression::EvaluationContext* context) const { - auto resultAsVariant = child_->evaluate(context); - auto variablePtr = std::get_if(&resultAsVariant); - AD_CONTRACT_CHECK(variablePtr); - - // search in the rtree - ad_utility::HashSet resultIds; - if (boundingBox_ != nullptr) { - Rtree rtree = context->_qec.getIndex().getRtree(); - multiBoxGeo treeResults = 
rtree.SearchTree(boundingBox_->value, "./rtree_build"); - - for (RTreeValue item : treeResults) { - long long id = item.id; - resultIds.insert(id); - } - } - - auto resultSize = context->size(); - VectorWithMemoryLimit result{context->_allocator}; - result.reserve(resultSize); - - for (auto id : detail::makeGenerator(*variablePtr, resultSize, context)) { - result.push_back(Id::makeFromBool(resultIds.contains(id.getInt()))); - } - - return result; +ExpressionResult ContainsExpression::evaluate( + sparqlExpression::EvaluationContext* context) const { + auto resultAsVariant = child_->evaluate(context); + auto variablePtr = std::get_if(&resultAsVariant); + AD_CONTRACT_CHECK(variablePtr); + + // search in the rtree + ad_utility::HashSet resultIds; + if (boundingBox_ != nullptr) { + Rtree rtree = context->_qec.getIndex().getRtree(); + multiBoxGeo treeResults = + rtree.SearchTree(boundingBox_->value, "./rtree_build"); + + for (RTreeValue item : treeResults) { + long long id = item.id; + resultIds.insert(id); } + } + + auto resultSize = context->size(); + VectorWithMemoryLimit result{context->_allocator}; + result.reserve(resultSize); + + for (auto id : detail::makeGenerator(*variablePtr, resultSize, context)) { + result.push_back(Id::makeFromBool(resultIds.contains(id.getInt()))); + } + + return result; +} } // namespace sparqlExpression diff --git a/src/engine/sparqlExpressions/ContainsExpression.h b/src/engine/sparqlExpressions/ContainsExpression.h index a735cbe0e2..90312c5e27 100644 --- a/src/engine/sparqlExpressions/ContainsExpression.h +++ b/src/engine/sparqlExpressions/ContainsExpression.h @@ -5,29 +5,31 @@ #pragma once #include + #include "engine/sparqlExpressions/LiteralExpression.h" #include "engine/sparqlExpressions/SparqlExpression.h" class BoundingBoxType; namespace sparqlExpression { - class ContainsExpression : public SparqlExpression { - private: - SparqlExpression::Ptr child_; - std::string boundingBoxAsString_; - BoundingBoxType* boundingBox_{}; - - 
public: - // `child` must be a `VariableExpression` and `boundingBox` must be a - // `LiteralExpression` that stores a string, else an exception will be thrown. - ContainsExpression(SparqlExpression::Ptr child, SparqlExpression::Ptr boundingBox); - - ExpressionResult evaluate(EvaluationContext *context) const override; - - std::span childrenImpl() override; - - // _________________________________________________________________________ - [[nodiscard]] string getCacheKey( - const VariableToColumnMap &varColMap) const override; - }; -} \ No newline at end of file +class ContainsExpression : public SparqlExpression { + private: + SparqlExpression::Ptr child_; + std::string boundingBoxAsString_; + BoundingBoxType* boundingBox_{}; + + public: + // `child` must be a `VariableExpression` and `boundingBox` must be a + // `LiteralExpression` that stores a string, else an exception will be thrown. + ContainsExpression(SparqlExpression::Ptr child, + SparqlExpression::Ptr boundingBox); + + ExpressionResult evaluate(EvaluationContext* context) const override; + + std::span childrenImpl() override; + + // _________________________________________________________________________ + [[nodiscard]] string getCacheKey( + const VariableToColumnMap& varColMap) const override; +}; +} // namespace sparqlExpression diff --git a/src/global/Constants.h b/src/global/Constants.h index 5c07733fa7..573bf5947c 100644 --- a/src/global/Constants.h +++ b/src/global/Constants.h @@ -51,6 +51,8 @@ static constexpr std::pair GEOF_PREFIX = { "geof:", " MATH_PREFIX = { "math:", " GEO_RTREE_PREFIX = { + "geoRtree:", " + #include #include #include @@ -16,7 +18,6 @@ #include "index/StringSortComparator.h" #include "index/Vocabulary.h" #include "parser/TripleComponent.h" -#include // Forward declarations. 
class IdTable; diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index e1d92b020a..a436308504 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -352,17 +352,19 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( auto compressionOutfile = ad_utility::makeOfstream( onDiskBase_ + TMP_BASENAME_COMPRESSION + INTERNAL_VOCAB_SUFFIX); auto internalVocabularyActionCompression = - [&compressionOutfile](const auto& word, [[maybe_unused]]const auto& index) { + [&compressionOutfile](const auto& word, + [[maybe_unused]] const auto& index) { compressionOutfile << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n'; }; auto externalVocabularyActionCompression = - []([[maybe_unused]]const auto& word, [[maybe_unused]]const auto& index) { - }; + []([[maybe_unused]] const auto& word, + [[maybe_unused]] const auto& index) {}; m._noIdMapsAndIgnoreExternalVocab = true; auto mergeResult = m.mergeVocabulary(onDiskBase_ + TMP_BASENAME_COMPRESSION, numFiles, - std::less<>(), internalVocabularyActionCompression, externalVocabularyActionCompression); + std::less<>(), internalVocabularyActionCompression, + externalVocabularyActionCompression); sizeInternalVocabulary = mergeResult.numWordsTotal_; LOG(INFO) << "Number of words in internal vocabulary: " << sizeInternalVocabulary << std::endl; @@ -389,24 +391,30 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( auto wordWriter = vocab_.makeUncompressingWordWriter(onDiskBase_ + INTERNAL_VOCAB_SUFFIX); - std::ofstream convertOfs = std::ofstream(onDiskBase_ + ".vocabulary.boundingbox.tmp", std::ios::binary); + std::ofstream convertOfs = std::ofstream( + onDiskBase_ + ".vocabulary.boundingbox.tmp", std::ios::binary); - auto internalVocabularyAction = [&wordWriter, &convertOfs](const auto& word, const auto& index) { + auto internalVocabularyAction = [&wordWriter, &convertOfs]( + const auto& word, const auto& index) { wordWriter.push(word.data(), word.size()); - std::optional 
boundingBox = Rtree::ConvertWordToRtreeEntry(word); + std::optional boundingBox = + Rtree::ConvertWordToRtreeEntry(word); if (boundingBox) { Rtree::SaveEntry(boundingBox.value(), index, convertOfs); } }; - auto externalVocabularyAction = [&convertOfs](const auto& word, const auto& index) { - std::optional boundingBox = Rtree::ConvertWordToRtreeEntry(word); + auto externalVocabularyAction = [&convertOfs](const auto& word, + const auto& index) { + std::optional boundingBox = + Rtree::ConvertWordToRtreeEntry(word); if (boundingBox) { Rtree::SaveEntry(boundingBox.value(), index, convertOfs); } }; - VocabularyMerger::VocabularyMetaData result = v.mergeVocabulary(onDiskBase_, numFiles, sortPred, - internalVocabularyAction, externalVocabularyAction); + VocabularyMerger::VocabularyMetaData result = + v.mergeVocabulary(onDiskBase_, numFiles, sortPred, + internalVocabularyAction, externalVocabularyAction); convertOfs.close(); @@ -425,7 +433,7 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( Rtree rtree = Rtree(1300000000000); rtree.BuildTree(onDiskBase_ + ".vocabulary", 16, "./rtree_build"); LOG(INFO) << "Finished building the Rtree" << std::endl; - } catch (const std::exception &e) { + } catch (const std::exception& e) { LOG(INFO) << e.what() << std::endl; } diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index bfbda1b330..a022e59f66 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -29,9 +29,9 @@ #include #include #include +#include #include #include -#include #include #include diff --git a/src/index/VocabularyGenerator.h b/src/index/VocabularyGenerator.h index aa52ba8a33..af21f0b72e 100644 --- a/src/index/VocabularyGenerator.h +++ b/src/index/VocabularyGenerator.h @@ -101,11 +101,12 @@ class VocabularyMerger { // Argument comparator gives the way to order strings (case-sensitive or not) // This automatically resets the inner members after finishing, to leave the // external interface stateless - template - VocabularyMetaData 
mergeVocabulary(const std::string& basename, - size_t numFiles, Comp comparator, - InternalVocabularyAction& action, - ExternalVocabularyAction& externalVocabularyAction); + template + VocabularyMetaData mergeVocabulary( + const std::string& basename, size_t numFiles, Comp comparator, + InternalVocabularyAction& action, + ExternalVocabularyAction& externalVocabularyAction); private: // helper struct used in the priority queue for merging. @@ -133,7 +134,8 @@ class VocabularyMerger { // write the queu words in the buffer to their corresponding idPairVecs. // Requires that all the QueueWords that are ever passed are ordered // alphabetically (Also across multiple calls) - template + template void writeQueueWordsToIdVec( const std::vector& buffer, InternalVocabularyAction& internalVocabularyAction, diff --git a/src/index/VocabularyGeneratorImpl.h b/src/index/VocabularyGeneratorImpl.h index 6ef6a9be0f..71158c5089 100644 --- a/src/index/VocabularyGeneratorImpl.h +++ b/src/index/VocabularyGeneratorImpl.h @@ -25,10 +25,12 @@ #include "./VocabularyGenerator.h" // ___________________________________________________________________ -template +template VocabularyMerger::VocabularyMetaData VocabularyMerger::mergeVocabulary( const std::string& basename, size_t numFiles, Comparator comparator, - InternalVocabularyAction& internalVocabularyAction, ExternalVocabularyAction& externalVocabularyAction) { + InternalVocabularyAction& internalVocabularyAction, + ExternalVocabularyAction& externalVocabularyAction) { // Return true iff p1 >= p2 according to the lexicographic order of the IRI // or literal. All internal IRIs or literals come before all external ones. 
// TODO Change this as soon as we have Interleaved Ids via the @@ -102,8 +104,10 @@ VocabularyMerger::VocabularyMetaData VocabularyMerger::mergeVocabulary( // asynchronously write the next batch of sorted // queue words auto writeTask = [this, buf = std::move(sortedBuffer), - &internalVocabularyAction, &externalVocabularyAction]() { - this->writeQueueWordsToIdVec(buf, internalVocabularyAction, externalVocabularyAction); + &internalVocabularyAction, + &externalVocabularyAction]() { + this->writeQueueWordsToIdVec(buf, internalVocabularyAction, + externalVocabularyAction); }; sortedBuffer.clear(); sortedBuffer.reserve(_bufferSize); @@ -128,7 +132,8 @@ VocabularyMerger::VocabularyMetaData VocabularyMerger::mergeVocabulary( // Handle remaining words in the buffer if (!sortedBuffer.empty()) { - writeQueueWordsToIdVec(sortedBuffer, internalVocabularyAction, externalVocabularyAction); + writeQueueWordsToIdVec(sortedBuffer, internalVocabularyAction, + externalVocabularyAction); } auto metaData = std::move(metaData_); @@ -165,9 +170,11 @@ void VocabularyMerger::writeQueueWordsToIdVec( // write the new word to the vocabulary if (!lastTripleComponent_.value().isExternal()) { - internalVocabularyAction(lastTripleComponent_.value().iriOrLiteral(), lastTripleComponent_.value()._index); + internalVocabularyAction(lastTripleComponent_.value().iriOrLiteral(), + lastTripleComponent_.value()._index); } else { - externalVocabularyAction(lastTripleComponent_.value().iriOrLiteral(), lastTripleComponent_.value()._index); + externalVocabularyAction(lastTripleComponent_.value().iriOrLiteral(), + lastTripleComponent_.value()._index); outfileExternal_ << RdfEscaping::escapeNewlinesAndBackslashes( lastTripleComponent_.value().iriOrLiteral()) << '\n'; diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.cpp b/src/parser/sparqlParser/SparqlQleverVisitor.cpp index 8ef4caa50f..aabc29e960 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.cpp +++ 
b/src/parser/sparqlParser/SparqlQleverVisitor.cpp @@ -11,6 +11,7 @@ #include #include "absl/strings/str_join.h" +#include "engine/sparqlExpressions/ContainsExpression.h" #include "engine/sparqlExpressions/LangExpression.h" #include "engine/sparqlExpressions/RandomExpression.h" #include "engine/sparqlExpressions/RegexExpression.h" @@ -22,7 +23,6 @@ #include "util/OnDestructionDontThrowDuringStackUnwinding.h" #include "util/StringUtils.h" #include "util/antlr/GenerateAntlrExceptionMetadata.h" -#include "engine/sparqlExpressions/ContainsExpression.h" using namespace ad_utility::sparql_types; using namespace sparqlExpression; @@ -121,17 +121,11 @@ ExpressionPtr Visitor::processIriFunctionCall( checkNumArgs(1); return sparqlExpression::makeTanExpression(std::move(argList[0])); } - } - - constexpr static std::string_view geoPrefixRtree = - "')); - iriView.remove_suffix(1); - if (iriView == "boundingBoxContains") { - checkNumArgs("geoRtree:", iriView, 2); - return std::make_unique(std::move(argList[0]), std::move(argList[1])); + } else if (checkPrefix(GEO_RTREE_PREFIX)) { + if (functionName == "boundingBoxContains") { + checkNumArgs(2); + return std::make_unique(std::move(argList[0]), + std::move(argList[1])); } } @@ -1235,7 +1229,8 @@ ad_utility::sparql_types::Node Visitor::visit(Parser::TriplesNodeContext* ctx) { } // ____________________________________________________________________________________ -ad_utility::sparql_types::Node Visitor::visit(Parser::BlankNodePropertyListContext* ctx) { +ad_utility::sparql_types::Node Visitor::visit( + Parser::BlankNodePropertyListContext* ctx) { VarOrTerm var{GraphTerm{newBlankNode()}}; Triples triples; auto propertyList = visit(ctx->propertyListNotEmpty()); diff --git a/src/util/Rtree.cpp b/src/util/Rtree.cpp index bea47bbfc5..01d8d00fcc 100644 --- a/src/util/Rtree.cpp +++ b/src/util/Rtree.cpp @@ -3,12 +3,13 @@ // Author: Noah Nock #include + #include #include -bool intersects(const Rtree::BoundingBox &b1, const 
Rtree::BoundingBox &b2) { +bool intersects(const Rtree::BoundingBox& b1, const Rtree::BoundingBox& b2) { /** - * Determine whether two bounding boxes intersect + * Determine whether two bounding boxes intersect */ bool notIntersecting = b1.min_corner().get<0>() > b2.max_corner().get<0>() || b2.min_corner().get<0>() > b1.max_corner().get<0>() || @@ -16,22 +17,58 @@ bool intersects(const Rtree::BoundingBox &b1, const Rtree::BoundingBox &b2) { b2.min_corner().get<1>() > b1.max_corner().get<1>(); return !notIntersecting; -} - -Rtree::BoundingBox Rtree::createBoundingBox(double pointOneX, double pointOneY, double pointTwoX, double pointTwoY) { - return Rtree::BoundingBox(Rtree::Point(pointOneX, pointOneY), Rtree::Point(pointTwoX, pointTwoY)); -} - -multiBoxGeo Rtree::SearchTree(Rtree::BoundingBox query, const std::string &folder) { - std::ifstream lookupIfs = std::ifstream(folder + "/lookup.bin", std::ios::binary); - std::ifstream nodesIfs = std::ifstream(folder + "/nodes.bin", std::ios::binary); +} + +Rtree::BoundingBox Rtree::createBoundingBox(double pointOneX, double pointOneY, + double pointTwoX, + double pointTwoY) { + return {{pointOneX, pointOneY}, {pointTwoX, pointTwoY}}; +} + +Rtree::BoundingBox Rtree::combineBoundingBoxes(Rtree::BoundingBox b1, + Rtree::BoundingBox b2) { + if (b1.min_corner().get<0>() == 0 && b1.min_corner().get<1>() == 0 && + b1.max_corner().get<0>() == 0 && b1.max_corner().get<1>() == 0) { + return b2; + } + if (b2.min_corner().get<0>() == 0 && b2.min_corner().get<1>() == 0 && + b2.max_corner().get<0>() == 0 && b2.max_corner().get<1>() == 0) { + return b1; + } + auto minX = [](Rtree::BoundingBox b) -> double { + return b.min_corner().get<0>(); + }; + auto minY = [](Rtree::BoundingBox b) -> double { + return b.min_corner().get<1>(); + }; + auto maxX = [](Rtree::BoundingBox b) -> double { + return b.max_corner().get<0>(); + }; + auto maxY = [](Rtree::BoundingBox b) -> double { + return b.max_corner().get<1>(); + }; + + double globalMinX = 
minX(b1) < minX(b2) ? minX(b1) : minX(b2); + double globalMinY = minY(b1) < minY(b2) ? minY(b1) : minY(b2); + double globalMaxX = maxX(b1) > maxX(b2) ? maxX(b1) : maxX(b2); + double globalMaxY = maxY(b1) > maxY(b2) ? maxY(b1) : maxY(b2); + + return {{globalMinX, globalMinY}, {globalMaxX, globalMaxY}}; +} + +multiBoxGeo Rtree::SearchTree(Rtree::BoundingBox query, + const std::string& folder) { + std::ifstream lookupIfs = + std::ifstream(folder + "/lookup.bin", std::ios::binary); + std::ifstream nodesIfs = + std::ifstream(folder + "/nodes.bin", std::ios::binary); Node rootNode = LoadNode(0, lookupIfs, nodesIfs); multiBoxGeo results; std::stack nodes; nodes.push(rootNode); - while(!nodes.empty()) { + while (!nodes.empty()) { Node currentNode = nodes.top(); nodes.pop(); @@ -50,45 +87,40 @@ multiBoxGeo Rtree::SearchTree(Rtree::BoundingBox query, const std::string &folde lookupIfs.close(); nodesIfs.close(); return results; -} +} -uint64_t Node::GetId() const { - return this->id; -} +uint64_t Node::GetId() const { return this->id; } Node::Node(uint64_t id, Rtree::BoundingBox boundingbox) { this->id = id; this->boundingBox = boundingbox; -} +} -Node::Node(uint64_t id) { - this->id = id; -} +Node::Node(uint64_t id) { this->id = id; } -Node::Node() {} +Node::Node() {} -Node::Node(uint64_t id, Rtree::BoundingBox boundingBox, multiBoxGeo &children, bool isLastInnerNode) { +Node::Node(uint64_t id, Rtree::BoundingBox boundingBox, multiBoxGeo& children, + bool isLastInnerNode) { this->id = id; this->boundingBox = boundingBox; this->children = children; this->isLastInnerNode = isLastInnerNode; -} +} -Node::Node(uint64_t id, double minX, double minY, double maxX, double maxY, bool isLastInnerNode) { +Node::Node(uint64_t id, double minX, double minY, double maxX, double maxY, + bool isLastInnerNode) { this->id = id; this->boundingBox = Rtree::createBoundingBox(minX, minY, maxX, maxY); this->isLastInnerNode = isLastInnerNode; -} +} -bool Node::GetIsLastInnerNode() const { - return 
this->isLastInnerNode; -} +bool Node::GetIsLastInnerNode() const { return this->isLastInnerNode; } -multiBoxGeo Node::GetChildren() { - return this->children; -} +multiBoxGeo Node::GetChildren() { return this->children; } -Node Rtree::LoadNode(uint64_t id, std::ifstream& lookupIfs, std::ifstream& nodesIfs) { +Node Rtree::LoadNode(uint64_t id, std::ifstream& lookupIfs, + std::ifstream& nodesIfs) { Node newNode; uint64_t offset = id * (uint64_t)sizeof(uint64_t); @@ -102,8 +134,8 @@ Node Rtree::LoadNode(uint64_t id, std::ifstream& lookupIfs, std::ifstream& nodes ia >> newNode; return newNode; -} +} Rtree::Rtree(uintmax_t maxBuildingRamUsage) { this->maxBuildingRamUsage = maxBuildingRamUsage; -} +} diff --git a/src/util/Rtree.h b/src/util/Rtree.h index f378cea81c..f83d1182e8 100644 --- a/src/util/Rtree.h +++ b/src/util/Rtree.h @@ -5,19 +5,19 @@ #ifndef QLEVER_RTREE_H #define QLEVER_RTREE_H -#include -#include -#include -#include -#include -#include #include -#include #include +#include +#include +#include +#include +#include +#include namespace bg = boost::geometry; -/* Forward declaration */ +// ___________________________________________________________________________ +// Forward declaration struct RTreeValue; struct RTreeValueWithOrderIndex; using multiBoxGeo = std::vector; @@ -26,31 +26,37 @@ struct SplitResult; struct SplitBuffersRam; struct SplitBuffersDisk; -/** - * Data type to store all the information of the rectangles (in ram or on disk) + the small lists for one dimension - */ +// ___________________________________________________________________________ +// Data type to store all the information of the rectangles (in ram or on disk) +// + the small lists for one dimension struct RectanglesForOrderedBoxes { std::shared_ptr rectanglesInRam; std::string rectanglesOnDisk; std::shared_ptr rectanglesSmall; }; +// ___________________________________________________________________________ +// Data structure representing a single node of the Rtree containing the 
+// boundingbox and its children class Node { protected: - typedef bg::model::point> Point; + typedef bg::model::point> + Point; typedef bg::model::box BoundingBox; friend class boost::serialization::access; uint64_t id; BoundingBox boundingBox{}; - bool isLastInnerNode = false; + bool isLastInnerNode = + false; // when true, this means that the node is the last inner node and + // all of its children are leafs multiBoxGeo children; - template - void serialize(Archive & a, [[maybe_unused]]const unsigned int version) { - a & id; - a & isLastInnerNode; - a & boundingBox; - a & children; + template + void serialize(Archive& a, [[maybe_unused]] const unsigned int version) { + a& id; + a& isLastInnerNode; + a& boundingBox; + a& children; } explicit Node(uint64_t id); @@ -58,8 +64,10 @@ class Node { public: Node(); Node(uint64_t id, BoundingBox boundingBox); - Node(uint64_t id, BoundingBox boundingBox, multiBoxGeo &children, bool isLastInnerNode); - Node(uint64_t id, double minX, double minY, double maxX, double maxY, bool isLastInnerNode); + Node(uint64_t id, BoundingBox boundingBox, multiBoxGeo& children, + bool isLastInnerNode); + Node(uint64_t id, double minX, double minY, double maxX, double maxY, + bool isLastInnerNode); [[nodiscard]] uint64_t GetId() const; [[nodiscard]] BoundingBox GetBoundingBox() const; void AddChild(Node& child); @@ -70,49 +78,140 @@ class Node { BOOST_CLASS_VERSION(Node, 1) +// ___________________________________________________________________________ +// A Rtree based on bounding boxes and ids class Rtree { private: - static uint64_t SaveNode(Node &node, bool isLastInnerNode, std::ofstream& nodesOfs); - static Node LoadNode(uint64_t id, std::ifstream& lookupIfs, std::ifstream& nodesIfs); + // ___________________________________________________________________________ + // Save the current node in the building process to disk + static uint64_t SaveNode(Node& node, bool isLastInnerNode, + std::ofstream& nodesOfs); + // 
___________________________________________________________________________ + // Load a specific Node to query in its children + static Node LoadNode(uint64_t id, std::ifstream& lookupIfs, + std::ifstream& nodesIfs); uintmax_t maxBuildingRamUsage; + public: - typedef bg::model::point> Point; + typedef bg::model::point> + Point; typedef bg::model::box BoundingBox; - void BuildTree(const std::string& onDiskBase, size_t M, const std::string& folder) const; + // ___________________________________________________________________________ + // Build the whole Rtree with the raw data in onDiskBase + ".boundingbox.tmp", + // M as branching factor and folder as Rtree destination + void BuildTree(const std::string& onDiskBase, size_t M, + const std::string& folder) const; + // ___________________________________________________________________________ + // Search for an intersection of query with any elements of the Rtree static multiBoxGeo SearchTree(BoundingBox query, const std::string& folder); - static std::optional ConvertWordToRtreeEntry(const std::string& wkt); - static void SaveEntry(BoundingBox boundingBox, uint64_t index, std::ofstream& convertOfs); - static void SaveEntryWithOrderIndex(RTreeValueWithOrderIndex treeValue, std::ofstream& convertOfs); + // ___________________________________________________________________________ + // Convert a single wkt literal to a datapoint in the format suitable for the + // Rtree + static std::optional ConvertWordToRtreeEntry( + const std::string& wkt); + // ___________________________________________________________________________ + // Save a single datapoint for the Rtree to disk + static void SaveEntry(BoundingBox boundingBox, uint64_t index, + std::ofstream& convertOfs); + // ___________________________________________________________________________ + // Save a single datapoint of the Rtree, together with its position in the x + // and y sorting to disk + static void SaveEntryWithOrderIndex(RTreeValueWithOrderIndex 
treeValue, + std::ofstream& convertOfs); + // ___________________________________________________________________________ + // Load all datapoints of the Rtree in file into ram static multiBoxGeo LoadEntries(const std::string& file); - static multiBoxWithOrderIndex LoadEntriesWithOrderIndex(const std::string& file); - static BoundingBox createBoundingBox(double pointOneX, double pointOneY, double pointTwoX, double pointTwoY); + // ___________________________________________________________________________ + // Load all datapoints of the Rtree, together with its x and y sorting into + // ram + static multiBoxWithOrderIndex LoadEntriesWithOrderIndex( + const std::string& file); + // ___________________________________________________________________________ + // Create a bounding box, based on the corner coordinates + static BoundingBox createBoundingBox(double pointOneX, double pointOneY, + double pointTwoX, double pointTwoY); + // ___________________________________________________________________________ + // Take two bounding boxes and combine them into one bounding box containing + // both + static BoundingBox combineBoundingBoxes(Rtree::BoundingBox b1, + Rtree::BoundingBox b2); explicit Rtree(uintmax_t maxBuildingRamUsage); }; +// ___________________________________________________________________________ +// Data structure handling the datapoints of the Rtree sorted in x and y +// direction (either on ram or on disk) class OrderedBoxes { private: bool workInRam; uint64_t size; Rtree::BoundingBox boundingBox; - RectanglesForOrderedBoxes rectsD0; - RectanglesForOrderedBoxes rectsD1; + RectanglesForOrderedBoxes + rectsD0; // the rectangles (datapoints) sorted in x direction + RectanglesForOrderedBoxes rectsD1; // the rectangles sorted in y direction + // ___________________________________________________________________________ + // Initiate the splitting of the rectangles in the best position (rectangles + // are stored in ram) std::pair SplitAtBestInRam(size_t 
S, size_t M); - std::pair SplitAtBestOnDisk(const std::string& filePath, size_t S, size_t M, uint64_t maxBuildingRamUsage); + // ___________________________________________________________________________ + // Initiate the splitting of the rectangles in the best position (rectangles + // are stored on disk) + std::pair SplitAtBestOnDisk( + const std::string& filePath, size_t S, size_t M, + uint64_t maxBuildingRamUsage); + // ___________________________________________________________________________ + // Get the position and dimension of the best split possible to maximize the + // quality of the Rtree SplitResult GetBestSplit(); - std::pair PerformSplit(SplitResult splitResult, SplitBuffersRam& splitBuffersRam, size_t M, size_t S); - std::pair PerformSplit(SplitResult splitResult, SplitBuffersDisk& splitBuffers, size_t M, size_t S, uint64_t maxBuildingRamUsage); + // ___________________________________________________________________________ + // Actually splitting the rectangles at the given split by splitResult + // (rectangles are stored in ram) + std::pair PerformSplit( + SplitResult splitResult, SplitBuffersRam& splitBuffersRam, size_t M, + size_t S); + // ___________________________________________________________________________ + // Actually splitting the rectangles at the given split by splitResult + // (rectangles are stored on disk) + std::pair PerformSplit( + SplitResult splitResult, SplitBuffersDisk& splitBuffers, size_t M, + size_t S, uint64_t maxBuildingRamUsage); + public: [[nodiscard]] bool WorkInRam() const; - void CreateOrderedBoxesInRam(RectanglesForOrderedBoxes& rectanglesD0, RectanglesForOrderedBoxes& rectanglesD1, Rtree::BoundingBox box); // workInRam = true - void CreateOrderedBoxesOnDisk(RectanglesForOrderedBoxes& rectanglesD0, RectanglesForOrderedBoxes& rectanglesD1, uint64_t size, Rtree::BoundingBox box); // workInRam = false + // ___________________________________________________________________________ + // Set up the OrderedBoxes 
with the rectangles given as vectors stored in ram + // and set workInRam to true + void SetOrderedBoxesToRam(RectanglesForOrderedBoxes& rectanglesD0, + RectanglesForOrderedBoxes& rectanglesD1, + Rtree::BoundingBox box); + // ___________________________________________________________________________ + // Set up the OrderedBoxes with the rectangles given as files stored on disk + // and set workInRam to false + void SetOrderedBoxesToDisk(RectanglesForOrderedBoxes& rectanglesD0, + RectanglesForOrderedBoxes& rectanglesD1, + uint64_t size, Rtree::BoundingBox box); Rtree::BoundingBox GetBoundingBox(); [[nodiscard]] uint64_t GetSize() const; - std::pair SplitAtBest(const std::string& filePath, size_t S, size_t M, uint64_t maxBuildingRamUsage); + // ___________________________________________________________________________ + // Wrapper function to perform the whole process of splitting the rectangles + // for either ram or disk case + std::pair SplitAtBest( + const std::string& filePath, size_t S, size_t M, + uint64_t maxBuildingRamUsage); + // ___________________________________________________________________________ + // return the rectangles of the x sorting for the case where they are stored + // in ram std::shared_ptr GetRectanglesInRam(); + // ___________________________________________________________________________ + // return the rectangles of the x sorting for the case where they are stored + // on disk std::string GetRectanglesOnDisk(); }; -class ConstructionNode: public Node { +// ___________________________________________________________________________ +// Subclass of the Node only needed while constructing the Rtree (it keeps track +// of the remaining OrderedBoxes of the subtree) +class ConstructionNode : public Node { private: OrderedBoxes orderedBoxes; @@ -123,17 +222,17 @@ class ConstructionNode: public Node { }; namespace boost::serialization { -template -void save(Archive & a, const Rtree::BoundingBox & b, [[maybe_unused]]unsigned int version) -{ 
+template +void save(Archive& a, const Rtree::BoundingBox& b, + [[maybe_unused]] unsigned int version) { a << b.min_corner().get<0>(); a << b.min_corner().get<1>(); a << b.max_corner().get<0>(); a << b.max_corner().get<1>(); } -template -void load(Archive & a, Rtree::BoundingBox & b, [[maybe_unused]]unsigned int version) -{ +template +void load(Archive& a, Rtree::BoundingBox& b, + [[maybe_unused]] unsigned int version) { double minX = 0; a >> minX; double minY = 0; @@ -144,38 +243,39 @@ void load(Archive & a, Rtree::BoundingBox & b, [[maybe_unused]]unsigned int vers a >> maxY; b = Rtree::BoundingBox(Rtree::Point(minX, minY), Rtree::Point(maxX, maxY)); } -} +} // namespace boost::serialization BOOST_SERIALIZATION_SPLIT_FREE(Rtree::BoundingBox); -/** - * Data type for a value of the Rtree, which contains the id of the object and its bounding box. - */ +// ___________________________________________________________________________ +// Data type for a value of the Rtree, which contains the id of the object and +// its bounding box. struct RTreeValue { Rtree::BoundingBox box{}; uint64_t id = 0; - - template - void serialize(Archive & a, [[maybe_unused]]const unsigned int version) { - a & box; - a & id; + double MinX() const { return box.min_corner().get<0>(); } + double MaxX() const { return box.max_corner().get<0>(); } + double MinY() const { return box.min_corner().get<1>(); } + double MaxY() const { return box.max_corner().get<1>(); } + + template + void serialize(Archive& a, [[maybe_unused]] const unsigned int version) { + a& box; + a& id; } }; -/** - * Data type for a value of the Rtree (id and boundingbox), with the addtional information - * of its position in the x- and y-sorting. 
This is only used to create the Rtree in a more efficient way - */ -struct RTreeValueWithOrderIndex { - Rtree::BoundingBox box{}; - uint64_t id = 0; +// ___________________________________________________________________________ +// Data type for a value of the Rtree (id and boundingbox), with the addtional +// information of its position in the x- and y-sorting. This is only used to +// create the Rtree in a more efficient way +struct RTreeValueWithOrderIndex : RTreeValue { uint64_t orderX = 0; uint64_t orderY = 0; }; -/** - * Data type containing all the information about the best split found, which are needed - * to actually perform the split. - */ +// ___________________________________________________________________________ +// Data type containing all the information about the best split found, which +// are needed to actually perform the split. struct SplitResult { double bestCost = -1; size_t bestDim = 0; @@ -186,24 +286,32 @@ struct SplitResult { RTreeValueWithOrderIndex bestMaxElement; }; -/** - * A Buffer data structure, containing vectors for the result of a split, while doing it in ram - */ +// ___________________________________________________________________________ +// A Buffer data structure, containing vectors for the result of a split, while +// doing it in ram struct SplitBuffersRam { - std::shared_ptr s0Dim0 = std::make_shared(); - std::shared_ptr s0Dim1 = std::make_shared(); - std::shared_ptr s1Dim0 = std::make_shared(); - std::shared_ptr s1Dim1 = std::make_shared(); - - std::shared_ptr s0SmallDim0 = std::make_shared(); - std::shared_ptr s0SmallDim1 = std::make_shared(); - std::shared_ptr s1SmallDim0 = std::make_shared(); - std::shared_ptr s1SmallDim1 = std::make_shared(); + std::shared_ptr s0Dim0 = + std::make_shared(); + std::shared_ptr s0Dim1 = + std::make_shared(); + std::shared_ptr s1Dim0 = + std::make_shared(); + std::shared_ptr s1Dim1 = + std::make_shared(); + + std::shared_ptr s0SmallDim0 = + std::make_shared(); + std::shared_ptr 
s0SmallDim1 = + std::make_shared(); + std::shared_ptr s1SmallDim0 = + std::make_shared(); + std::shared_ptr s1SmallDim1 = + std::make_shared(); }; -/** - * A Buffer data structure, containing the write streams for the result of a split, while doing it on disk - */ +// ___________________________________________________________________________ +// A Buffer data structure, containing the write streams for the result of a +// split, while doing it on disk struct SplitBuffersDisk { SplitBuffersRam splitBuffersRam; std::optional split0Dim0File; @@ -212,56 +320,75 @@ struct SplitBuffersDisk { std::optional split1Dim1File; }; -struct SortRuleLambdaX { +template +struct SortRuleLambda { // comparison function bool operator()(const RTreeValue& b1, const RTreeValue& b2) const { - double center1 = (b1.box.min_corner().get<0>() + b1.box.max_corner().get<0>()) / 2; - double center2 = (b2.box.min_corner().get<0>() + b2.box.max_corner().get<0>()) / 2; + double center1 = dimension == 0 ? std::midpoint(b1.MinX(), b1.MaxX()) + : std::midpoint(b1.MinY(), b1.MaxY()); + double center2 = dimension == 0 ? std::midpoint(b2.MinX(), b2.MaxX()) + : std::midpoint(b2.MinY(), b2.MaxY()); return center1 < center2; } // Value that is strictly smaller than any input element. - static RTreeValue min_value() { return {Rtree::createBoundingBox(-DBL_MAX, -DBL_MAX, -DBL_MAX, -DBL_MAX), 0}; } + static RTreeValue min_value() { + return {Rtree::createBoundingBox(-std::numeric_limits::max(), + -std::numeric_limits::max(), + -std::numeric_limits::max(), + -std::numeric_limits::max()), + 0}; + } // Value that is strictly larger than any input element. 
- static RTreeValue max_value() { return {Rtree::createBoundingBox(DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX), 0}; } + static RTreeValue max_value() { + return {Rtree::createBoundingBox(std::numeric_limits::max(), + std::numeric_limits::max(), + std::numeric_limits::max(), + std::numeric_limits::max()), + 0}; + } }; +template +struct SortRuleLambdaWithIndex { + uint64_t RTreeValueWithOrderIndex::*orderSelected = + dimension == 0 ? &RTreeValueWithOrderIndex::orderX + : &RTreeValueWithOrderIndex::orderY; -struct SortRuleLambdaXWithIndex { // comparison function - bool operator()(const RTreeValueWithOrderIndex& b1, const RTreeValueWithOrderIndex& b2) const { - double center1 = (b1.box.min_corner().get<0>() + b1.box.max_corner().get<0>()) / 2; - double center2 = (b2.box.min_corner().get<0>() + b2.box.max_corner().get<0>()) / 2; - - if (b1.orderX == b2.orderX) - return center1 < center2; - return b1.orderX < b2.orderX; + bool operator()(const RTreeValueWithOrderIndex& b1, + const RTreeValueWithOrderIndex& b2) const { + double center1 = dimension == 0 ? std::midpoint(b1.MinX(), b1.MaxX()) + : std::midpoint(b1.MinY(), b1.MaxY()); + double center2 = dimension == 0 ? std::midpoint(b2.MinX(), b2.MaxX()) + : std::midpoint(b2.MinY(), b2.MaxY()); + + if (b1.*orderSelected == b2.*orderSelected) return center1 < center2; + return b1.*orderSelected < b2.*orderSelected; } // Value that is strictly smaller than any input element. - static RTreeValueWithOrderIndex min_value() { return {Rtree::createBoundingBox(-DBL_MAX, -DBL_MAX, -DBL_MAX, -DBL_MAX), 0, 0, 0}; } - - // Value that is strictly larger than any input element. 
- static RTreeValueWithOrderIndex max_value() { return {Rtree::createBoundingBox(DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX), 0, LLONG_MAX, LLONG_MAX}; } -}; - -struct SortRuleLambdaYWithIndex { - // comparison function - bool operator()(const RTreeValueWithOrderIndex& b1, const RTreeValueWithOrderIndex& b2) const { - double center1 = (b1.box.min_corner().get<1>() + b1.box.max_corner().get<1>()) / 2; - double center2 = (b2.box.min_corner().get<1>() + b2.box.max_corner().get<1>()) / 2; - - if (b1.orderY == b2.orderY) - return center1 < center2; - return b1.orderY < b2.orderY; + static RTreeValueWithOrderIndex min_value() { + return {{Rtree::createBoundingBox(-std::numeric_limits::max(), + -std::numeric_limits::max(), + -std::numeric_limits::max(), + -std::numeric_limits::max()), + 0}, + 0, + 0}; } - // Value that is strictly smaller than any input element. - static RTreeValueWithOrderIndex min_value() { return {Rtree::createBoundingBox(-DBL_MAX, -DBL_MAX, -DBL_MAX, -DBL_MAX), 0, 0, 0}; } - // Value that is strictly larger than any input element. - static RTreeValueWithOrderIndex max_value() { return {Rtree::createBoundingBox(DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX), 0, LLONG_MAX, LLONG_MAX}; } + static RTreeValueWithOrderIndex max_value() { + return {{Rtree::createBoundingBox(std::numeric_limits::max(), + std::numeric_limits::max(), + std::numeric_limits::max(), + std::numeric_limits::max()), + 0}, + LLONG_MAX, + LLONG_MAX}; + } }; -#endif //QLEVER_RTREE_H +#endif // QLEVER_RTREE_H diff --git a/src/util/RtreeBuild.cpp b/src/util/RtreeBuild.cpp index 1bbc43e4d6..a7a2d00584 100644 --- a/src/util/RtreeBuild.cpp +++ b/src/util/RtreeBuild.cpp @@ -1,56 +1,55 @@ // Copyright 2023, University of Freiburg, // Chair of Algorithms and Data Structures. 
// Author: Noah Nock +#include #include #include -#include + #include #include static void centerOrdering(multiBoxGeo& boxes, size_t dim) { if (dim == 0) { // order by centerX - SortRuleLambdaX comp; - - std::sort(boxes.begin(), boxes.end(), comp); + std::sort(boxes.begin(), boxes.end(), SortRuleLambda<0>{}); } else { // order by centerY - auto sortRuleLambda = [](RTreeValue b1, RTreeValue b2) -> bool { - double center1 = std::midpoint(b1.box.min_corner().get<1>(), b1.box.max_corner().get<1>()); - double center2 = std::midpoint(b2.box.min_corner().get<1>(), b2.box.max_corner().get<1>()); - return center1 < center2; - }; - - std::sort(boxes.begin(), boxes.end(), sortRuleLambda); + std::sort(boxes.begin(), boxes.end(), SortRuleLambda<1>{}); } } static void centerOrdering(multiBoxWithOrderIndex& boxes, size_t dim) { if (dim == 0) { // order by centerX - SortRuleLambdaXWithIndex comp; - - std::sort(boxes.begin(), boxes.end(), comp); + std::sort(boxes.begin(), boxes.end(), SortRuleLambdaWithIndex<0>{}); } else { // order by centerY - SortRuleLambdaYWithIndex comp; - - std::sort(boxes.begin(), boxes.end(), comp); + std::sort(boxes.begin(), boxes.end(), SortRuleLambdaWithIndex<1>{}); } } -OrderedBoxes SortInput(const std::string& onDiskBase, size_t M, uintmax_t maxBuildingRamUsage, bool workInRam) { +OrderedBoxes SortInput(const std::string& onDiskBase, size_t M, + uintmax_t maxBuildingRamUsage, bool workInRam) { OrderedBoxes orderedInputRectangles; - ad_utility::BackgroundStxxlSorter sorterRectsD0Basic = ad_utility::BackgroundStxxlSorter(std::ceil((maxBuildingRamUsage < 9999999999 ? maxBuildingRamUsage : 9999999999) / 3.0)); + auto maxRamForSorter = + std::ceil((maxBuildingRamUsage < 9999999999.0 ? 
maxBuildingRamUsage + : 9999999999.0) / + 3.0); + ad_utility::BackgroundStxxlSorter> + sorterRectsD0Basic = + ad_utility::BackgroundStxxlSorter>( + maxRamForSorter); multiBoxGeo rectsD0Basic; if (workInRam) { rectsD0Basic = Rtree::LoadEntries(onDiskBase + ".boundingbox.tmp"); centerOrdering(rectsD0Basic, 0); } else { - FileReaderWithoutIndex fileReaderRectsD0 = FileReaderWithoutIndex(onDiskBase + ".boundingbox.tmp"); - std::optional rectD0Element = fileReaderRectsD0.GetNextElement(); + FileReaderWithoutIndex fileReaderRectsD0 = + FileReaderWithoutIndex(onDiskBase + ".boundingbox.tmp"); + std::optional rectD0Element = + fileReaderRectsD0.GetNextElement(); while (rectD0Element) { sorterRectsD0Basic.push(rectD0Element.value()); rectD0Element = fileReaderRectsD0.GetNextElement(); @@ -59,112 +58,94 @@ OrderedBoxes SortInput(const std::string& onDiskBase, size_t M, uintmax_t maxBui } uint64_t xSize = 0; - double globalMinX = -1; - double globalMinY = -1; - double globalMaxX = -1; - double globalMaxY = -1; + Rtree::BoundingBox boundingBox = Rtree::createBoundingBox(0, 0, 0, 0); - ad_utility::BackgroundStxxlSorter sorterRectsD1 = ad_utility::BackgroundStxxlSorter(std::ceil((maxBuildingRamUsage < 9999999999 ? 
maxBuildingRamUsage : 9999999999) / 3.0)); - std::shared_ptr RectanglesD1WithOrder = std::make_shared(); + ad_utility::BackgroundStxxlSorter> + sorterRectsD1 = + ad_utility::BackgroundStxxlSorter>( + maxRamForSorter); + std::shared_ptr RectanglesD1WithOrder = + std::make_shared(); if (workInRam) { for (RTreeValue element : rectsD0Basic) { - RTreeValueWithOrderIndex entry = {element.box, element.id, xSize, 0}; + RTreeValueWithOrderIndex entry = {{element.box, element.id}, xSize, 0}; RectanglesD1WithOrder->push_back(entry); xSize++; - if (globalMinX == -1 || element.box.min_corner().get<0>() < globalMinX) { - globalMinX = element.box.min_corner().get<0>(); - } - if (globalMinY == -1 || element.box.min_corner().get<1>() < globalMinY) { - globalMinY = element.box.min_corner().get<1>(); - } - if (element.box.max_corner().get<0>() > globalMaxX) { - globalMaxX = element.box.max_corner().get<0>(); - } - if (element.box.max_corner().get<1>() > globalMaxY) { - globalMaxY = element.box.max_corner().get<1>(); - } + boundingBox = Rtree::combineBoundingBoxes(boundingBox, element.box); } centerOrdering(*RectanglesD1WithOrder, 1); } else { for (RTreeValue element : sorterRectsD0Basic.sortedView()) { - RTreeValueWithOrderIndex entry = {element.box, element.id, xSize, 0}; + RTreeValueWithOrderIndex entry = {{element.box, element.id}, xSize, 0}; sorterRectsD1.push(entry); xSize++; - if (globalMinX == -1 || element.box.min_corner().get<0>() < globalMinX) { - globalMinX = element.box.min_corner().get<0>(); - } - if (globalMinY == -1 || element.box.min_corner().get<1>() < globalMinY) { - globalMinY = element.box.min_corner().get<1>(); - } - if (element.box.max_corner().get<0>() > globalMaxX) { - globalMaxX = element.box.max_corner().get<0>(); - } - if (element.box.max_corner().get<1>() > globalMaxY) { - globalMaxY = element.box.max_corner().get<1>(); - } + boundingBox = Rtree::combineBoundingBoxes(boundingBox, element.box); } } sorterRectsD0Basic.clear(); - size_t currentS = 
std::ceil(((float) xSize) / ((float) M)); + size_t currentS = std::ceil(((float)xSize) / ((float)M)); uint64_t ySize = 0; - std::ofstream r1File = std::ofstream(onDiskBase + ".boundingbox.d1.tmp", std::ios::binary); - ad_utility::BackgroundStxxlSorter sorterRectsD0 = ad_utility::BackgroundStxxlSorter(std::ceil((maxBuildingRamUsage < 9999999999 ? maxBuildingRamUsage : 9999999999) / 3.0)); - std::shared_ptr RectanglesD0WithOrder = std::make_shared(); - std::shared_ptr r1Small = std::make_shared(); + std::ofstream r1File = + std::ofstream(onDiskBase + ".boundingbox.d1.tmp", std::ios::binary); + ad_utility::BackgroundStxxlSorter> + sorterRectsD0 = + ad_utility::BackgroundStxxlSorter>( + maxRamForSorter); + std::shared_ptr RectanglesD0WithOrder = + std::make_shared(); + std::shared_ptr r1Small = + std::make_shared(); // placeholder r1Small->push_back(RTreeValueWithOrderIndex()); r1Small->push_back(RTreeValueWithOrderIndex()); RTreeValueWithOrderIndex minD1; RTreeValueWithOrderIndex maxD1; - if (workInRam) { - for (RTreeValueWithOrderIndex element : *RectanglesD1WithOrder) { - element.orderY = ySize; - RectanglesD0WithOrder->push_back(element); + auto processD1Element = [&ySize, currentS, M, &r1Small, &minD1, + &maxD1](RTreeValueWithOrderIndex element) { + element.orderY = ySize; - if (((ySize + 1) % currentS == 0 && (ySize + 1) / currentS >= 1 && (ySize + 1) / currentS < M) - || (ySize % currentS == 0 && ySize / currentS >= 1 && ySize / currentS < M)) { - // index i * S - 1 or i * S - r1Small->push_back(element); - } + if (((ySize + 1) % currentS == 0 && (ySize + 1) / currentS >= 1 && + (ySize + 1) / currentS < M) || + (ySize % currentS == 0 && ySize / currentS >= 1 && + ySize / currentS < M)) { + // index i * S - 1 or i * S + r1Small->push_back(element); + } - if (ySize == 0) { - minD1 = element; - maxD1 = element; - } - if (element.orderY > maxD1.orderY) { - maxD1 = element; - } + if (ySize == 0) { + minD1 = element; + maxD1 = element; + } + if (element.orderY > 
maxD1.orderY) { + maxD1 = element; + } - ySize++; + ySize++; + }; + + if (workInRam) { + for (RTreeValueWithOrderIndex element : *RectanglesD1WithOrder) { + processD1Element(element); + + RectanglesD0WithOrder->push_back(element); } centerOrdering(*RectanglesD0WithOrder, 0); } else { for (RTreeValueWithOrderIndex element : sorterRectsD1.sortedView()) { - element.orderY = ySize; + processD1Element(element); + Rtree::SaveEntryWithOrderIndex(element, r1File); sorterRectsD0.push(element); - - if (((ySize + 1) % currentS == 0 && (ySize + 1) / currentS >= 1 && (ySize + 1) / currentS < M) - || (ySize % currentS == 0 && ySize / currentS >= 1 && ySize / currentS < M)) { - // index i * S - 1 or i * S - r1Small->push_back(element); - } - - if (ySize == 0) { - minD1 = element; - maxD1 = element; - } - if (element.orderY > maxD1.orderY) { - maxD1 = element; - } - - ySize++; } } @@ -176,51 +157,46 @@ OrderedBoxes SortInput(const std::string& onDiskBase, size_t M, uintmax_t maxBui (*r1Small)[1] = maxD1; uint64_t currentX = 0; - std::ofstream r0File = std::ofstream(onDiskBase + ".boundingbox.d0.tmp", std::ios::binary); - std::shared_ptr r0Small = std::make_shared(); + std::ofstream r0File = + std::ofstream(onDiskBase + ".boundingbox.d0.tmp", std::ios::binary); + std::shared_ptr r0Small = + std::make_shared(); // placeholder r0Small->push_back(RTreeValueWithOrderIndex()); r0Small->push_back(RTreeValueWithOrderIndex()); RTreeValueWithOrderIndex minD0; RTreeValueWithOrderIndex maxD0; - if (workInRam) { - for (RTreeValueWithOrderIndex element : *RectanglesD0WithOrder) { - if (((currentX + 1) % currentS == 0 && (currentX + 1) / currentS >= 1 && (currentX + 1) / currentS < M) - || (currentX % currentS == 0 && currentX / currentS >= 1 && currentX / currentS < M)) { - // index i * S - 1 or i * S - r0Small->push_back(element); - } + auto processD0Element = [¤tX, currentS, M, &r0Small, &minD0, + &maxD0](RTreeValueWithOrderIndex element) { + if (((currentX + 1) % currentS == 0 && (currentX + 
1) / currentS >= 1 && + (currentX + 1) / currentS < M) || + (currentX % currentS == 0 && currentX / currentS >= 1 && + currentX / currentS < M)) { + // index i * S - 1 or i * S + r0Small->push_back(element); + } - if (currentX == 0) { - minD0 = element; - maxD0 = element; - } - if (element.orderX > maxD0.orderX) { - maxD0 = element; - } + if (currentX == 0) { + minD0 = element; + maxD0 = element; + } + if (element.orderX > maxD0.orderX) { + maxD0 = element; + } - currentX++; + currentX++; + }; + + if (workInRam) { + for (RTreeValueWithOrderIndex element : *RectanglesD0WithOrder) { + processD0Element(element); } } else { for (RTreeValueWithOrderIndex element : sorterRectsD0.sortedView()) { Rtree::SaveEntryWithOrderIndex(element, r0File); - if (((currentX + 1) % currentS == 0 && (currentX + 1) / currentS >= 1 && (currentX + 1) / currentS < M) - || (currentX % currentS == 0 && currentX / currentS >= 1 && currentX / currentS < M)) { - // index i * S - 1 or i * S - r0Small->push_back(element); - } - - if (currentX == 0) { - minD0 = element; - maxD0 = element; - } - if (element.orderX > maxD0.orderX) { - maxD0 = element; - } - - currentX++; + processD0Element(element); } } @@ -231,7 +207,6 @@ OrderedBoxes SortInput(const std::string& onDiskBase, size_t M, uintmax_t maxBui (*r0Small)[0] = minD0; (*r0Small)[1] = maxD0; - Rtree::BoundingBox boundingBox = Rtree::createBoundingBox(globalMinX, globalMinY, globalMaxX, globalMaxY); RectanglesForOrderedBoxes rectsD0; RectanglesForOrderedBoxes rectsD1; rectsD0.rectanglesSmall = r0Small; @@ -239,27 +214,29 @@ OrderedBoxes SortInput(const std::string& onDiskBase, size_t M, uintmax_t maxBui if (workInRam) { rectsD0.rectanglesInRam = RectanglesD0WithOrder; rectsD1.rectanglesInRam = RectanglesD1WithOrder; - orderedInputRectangles.CreateOrderedBoxesInRam(rectsD0, rectsD1, boundingBox); + orderedInputRectangles.SetOrderedBoxesToRam(rectsD0, rectsD1, boundingBox); } else { rectsD0.rectanglesOnDisk = onDiskBase + ".boundingbox.d0"; 
rectsD1.rectanglesOnDisk = onDiskBase + ".boundingbox.d1"; - orderedInputRectangles.CreateOrderedBoxesOnDisk(rectsD0, rectsD1, xSize, boundingBox); + orderedInputRectangles.SetOrderedBoxesToDisk(rectsD0, rectsD1, xSize, + boundingBox); } return orderedInputRectangles; } -/*OrderedBoxes SortInput(const std::string& onDiskBase, size_t M, uintmax_t maxBuildingRamUsage, bool workInRam) { - if (workInRam) { - return InternalSort(onDiskBase, M); - } else { - return ExternalSort(onDiskBase, M, maxBuildingRamUsage); +/*OrderedBoxes SortInput(const std::string& onDiskBase, size_t M, uintmax_t +maxBuildingRamUsage, bool workInRam) { if (workInRam) { return +InternalSort(onDiskBase, M); } else { return ExternalSort(onDiskBase, M, +maxBuildingRamUsage); } }*/ -static double costFunctionTGS(Rtree::BoundingBox& b0, Rtree::BoundingBox& b1, size_t dim) { +static double costFunctionTGS(Rtree::BoundingBox& b0, Rtree::BoundingBox& b1, + size_t dim) { /** - * The cost function determines the quality of a split. The lower the cost, the better the split. - * Each split gets represented by the resulting bounding boxes of the split pieces. + * The cost function determines the quality of a split. The lower the cost, + * the better the split. Each split gets represented by the resulting bounding + * boxes of the split pieces. */ double cost; @@ -275,25 +252,31 @@ static double costFunctionTGS(Rtree::BoundingBox& b0, Rtree::BoundingBox& b1, si return cost; } -static std::vector TGSRecursive(const std::string& filePath, OrderedBoxes orderedInputRectangles, size_t M, size_t S, uint64_t maxBuildingRamUsage) { +static std::vector TGSRecursive( + const std::string& filePath, OrderedBoxes orderedInputRectangles, size_t M, + size_t S, uint64_t maxBuildingRamUsage) { /** - * This function recursively constructs one layer of children for a certain root node. - * The input rectangles must be sorted in both x- and y-direction. 
- * The algorithm is based on this paper https://dl.acm.org/doi/pdf/10.1145/288692.288723 + * This function recursively constructs one layer of children for a certain + * root node. The input rectangles must be sorted in both x- and y-direction. + * The algorithm is based on this paper + * https://dl.acm.org/doi/pdf/10.1145/288692.288723 */ uint64_t n = orderedInputRectangles.GetSize(); if (n <= S || n <= M) { // stop condition - return std::vector { orderedInputRectangles }; + return std::vector{orderedInputRectangles}; } // split the rectangles at the best split - std::pair split = orderedInputRectangles.SplitAtBest(filePath, S, M, maxBuildingRamUsage); + std::pair split = + orderedInputRectangles.SplitAtBest(filePath, S, M, maxBuildingRamUsage); // recursion - std::vector result0 = TGSRecursive(filePath + ".0", split.first, M, S, maxBuildingRamUsage); - std::vector result1 = TGSRecursive(filePath + ".1", split.second, M, S, maxBuildingRamUsage); + std::vector result0 = + TGSRecursive(filePath + ".0", split.first, M, S, maxBuildingRamUsage); + std::vector result1 = + TGSRecursive(filePath + ".1", split.second, M, S, maxBuildingRamUsage); std::vector result; result.insert(result.begin(), result0.begin(), result0.end()); @@ -302,24 +285,32 @@ static std::vector TGSRecursive(const std::string& filePath, Order return result; } -void Rtree::BuildTree(const std::string& onDiskBase, size_t M, const std::string& folder) const { +void Rtree::BuildTree(const std::string& onDiskBase, size_t M, + const std::string& folder) const { const std::string file = onDiskBase + ".boundingbox.tmp"; // prepare the files std::filesystem::create_directory(folder); - std::ofstream nodesOfs = std::ofstream(folder + "/nodes.bin", std::ios::binary); + std::ofstream nodesOfs = + std::ofstream(folder + "/nodes.bin", std::ios::binary); std::map lookup; // sort the rectangles - uint64_t fileLines = std::ceil(std::filesystem::file_size(file) / (4 * sizeof(double) + sizeof(uint64_t) + 2 * 
sizeof(uint64_t))); - bool workInRam = (std::filesystem::file_size(file) + fileLines * 2 * sizeof(uint64_t)) * 4 < this->maxBuildingRamUsage; + uint64_t fileLines = + std::ceil(std::filesystem::file_size(file) / + (4 * sizeof(double) + sizeof(uint64_t) + 2 * sizeof(uint64_t))); + bool workInRam = + (std::filesystem::file_size(file) + fileLines * 2 * sizeof(uint64_t)) * + 4 < + this->maxBuildingRamUsage; - OrderedBoxes orderedInputRectangles = SortInput(onDiskBase, M, maxBuildingRamUsage, workInRam); + OrderedBoxes orderedInputRectangles = + SortInput(onDiskBase, M, maxBuildingRamUsage, workInRam); // build the tree in a depth first approach std::stack layerStack; - uint64_t newId = 1; // start from 1, because 0 is the root item + uint64_t newId = 1; // start from 1, because 0 is the root item ConstructionNode rootItem = ConstructionNode(0, orderedInputRectangles); layerStack.push(rootItem); size_t layer = 0; @@ -334,9 +325,15 @@ void Rtree::BuildTree(const std::string& onDiskBase, size_t M, const std::string uint64_t nodePtr = SaveNode(currentItem, true, nodesOfs); lookup[currentItem.GetId()] = nodePtr; } else { - std::vector tgsResult = TGSRecursive(onDiskBase + ".boundingbox." + std::to_string(layer), currentItem.GetOrderedBoxes(), M, std::ceil(((float) currentItem.GetOrderedBoxes().GetSize()) / ((float) M)), this->maxBuildingRamUsage); + std::vector tgsResult = TGSRecursive( + onDiskBase + ".boundingbox." 
+ std::to_string(layer), + currentItem.GetOrderedBoxes(), M, + std::ceil(((float)currentItem.GetOrderedBoxes().GetSize()) / + ((float)M)), + this->maxBuildingRamUsage); for (OrderedBoxes& currentOrderedRectangles : tgsResult) { - ConstructionNode newItem = ConstructionNode(newId, currentOrderedRectangles); + ConstructionNode newItem = + ConstructionNode(newId, currentOrderedRectangles); layerStack.push(newItem); currentItem.AddChild(newItem); @@ -354,14 +351,13 @@ void Rtree::BuildTree(const std::string& onDiskBase, size_t M, const std::string std::ofstream lookupOfs(folder + "/lookup.bin", std::ios::binary); for (unsigned int i = 0; i < newId; i++) { uint64_t nodePtr = lookup[i]; - lookupOfs.write(reinterpret_cast(&nodePtr), sizeof(uint64_t)); + lookupOfs.write(reinterpret_cast(&nodePtr), sizeof(uint64_t)); } lookupOfs.close(); } ConstructionNode::ConstructionNode(uint64_t id, OrderedBoxes orderedBoxes) - : Node{id} -{ + : Node{id} { this->orderedBoxes = orderedBoxes; // calculate the boundingBoxes @@ -370,19 +366,22 @@ ConstructionNode::ConstructionNode(uint64_t id, OrderedBoxes orderedBoxes) void ConstructionNode::AddChildrenToItem() { /** - * Add all children of a certain node at once. - * This is used when a leaf node is reached. + * Add all children of a certain node at once. + * This is used when a leaf node is reached. 
*/ if (this->GetOrderedBoxes().WorkInRam()) { - for(RTreeValueWithOrderIndex box : *this->GetOrderedBoxes().GetRectanglesInRam()) { + for (RTreeValueWithOrderIndex box : + *this->GetOrderedBoxes().GetRectanglesInRam()) { Node leafNode = Node(box.id, box.box); this->AddChild(leafNode); } } else { - FileReader fileReader = FileReader(this->GetOrderedBoxes().GetRectanglesOnDisk()); + FileReader fileReader = + FileReader(this->GetOrderedBoxes().GetRectanglesOnDisk()); - std::optional element = fileReader.GetNextElement(); - while(element) { + std::optional element = + fileReader.GetNextElement(); + while (element) { Node leafNode = Node(element.value().id, element.value().box); this->AddChild(leafNode); element = fileReader.GetNextElement(); @@ -392,9 +391,7 @@ void ConstructionNode::AddChildrenToItem() { } } -OrderedBoxes ConstructionNode::GetOrderedBoxes() { - return this->orderedBoxes; -} +OrderedBoxes ConstructionNode::GetOrderedBoxes() { return this->orderedBoxes; } void Node::AddChild(Node& child) { Rtree::BoundingBox box = child.GetBoundingBox(); @@ -403,15 +400,12 @@ void Node::AddChild(Node& child) { this->children.push_back(entry); } -Rtree::BoundingBox Node::GetBoundingBox() const { - return this->boundingBox; -} +Rtree::BoundingBox Node::GetBoundingBox() const { return this->boundingBox; } -void Node::SetIsLastInnerNode(bool isLast) { - this->isLastInnerNode = isLast; -} +void Node::SetIsLastInnerNode(bool isLast) { this->isLastInnerNode = isLast; } -uint64_t Rtree::SaveNode(Node &node, bool isLastInnerNode, std::ofstream& nodesOfs) { +uint64_t Rtree::SaveNode(Node& node, bool isLastInnerNode, + std::ofstream& nodesOfs) { node.SetIsLastInnerNode(isLastInnerNode); uint64_t pos = static_cast(nodesOfs.tellp()); @@ -422,9 +416,11 @@ uint64_t Rtree::SaveNode(Node &node, bool isLastInnerNode, std::ofstream& nodesO return pos; } -std::optional GetBoundingBoxFromWKT(const std::string& wkt) { +std::optional GetBoundingBoxFromWKT( + const std::string& wkt) { /** - * 
Parse the wkt literal in a way, that only the relevant data for the rtree gets read in. + * Parse the wkt literal in a way, that only the relevant data for the rtree + * gets read in. */ bool lookingForX = true; bool readingDouble = false; @@ -450,8 +446,8 @@ std::optional GetBoundingBoxFromWKT(const std::string& wkt) double x; try { x = std::stod(currentDouble); - } catch(...) { - return { }; + } catch (...) { + return {}; } currentDouble = ""; if (x < minX || minX == -1) { @@ -470,8 +466,8 @@ std::optional GetBoundingBoxFromWKT(const std::string& wkt) double y; try { y = std::stod(currentDouble); - } catch(...) { - return { }; + } catch (...) { + return {}; } currentDouble = ""; if (y < minY || minY == -1) { @@ -485,16 +481,18 @@ std::optional GetBoundingBoxFromWKT(const std::string& wkt) } } - return { Rtree::createBoundingBox(minX, minY, maxX, maxY) }; + return {Rtree::createBoundingBox(minX, minY, maxX, maxY)}; } -std::optional Rtree::ConvertWordToRtreeEntry(const std::string& wkt) { +std::optional Rtree::ConvertWordToRtreeEntry( + const std::string& wkt) { /** - * Convert a single wkt literal to a boundingbox. + * Convert a single wkt literal to a boundingbox. 
*/ std::optional boundingBox; - /* Get the bounding box(es) of either a multipolygon, polygon or a linestring */ + /* Get the bounding box(es) of either a multipolygon, polygon or a linestring + */ std::size_t posWKTStart = wkt.find("MULTIPOLYGON(((") + 14; std::size_t posWKTEnd = wkt.find(")))", posWKTStart); if (posWKTStart != std::string::npos && posWKTEnd != std::string::npos) { @@ -510,10 +508,11 @@ std::optional Rtree::ConvertWordToRtreeEntry(const std::stri posWKTStart = wkt.find("LINESTRING(") + 10; posWKTEnd = wkt.find(')', posWKTStart); if (posWKTStart != std::string::npos && posWKTEnd != std::string::npos) { - std::string newWkt = wkt.substr(posWKTStart, posWKTEnd - posWKTStart + 1); + std::string newWkt = + wkt.substr(posWKTStart, posWKTEnd - posWKTStart + 1); boundingBox = GetBoundingBoxFromWKT(newWkt); } else { - return { }; + return {}; } } } @@ -521,38 +520,44 @@ std::optional Rtree::ConvertWordToRtreeEntry(const std::stri return boundingBox; } -void Rtree::SaveEntry(Rtree::BoundingBox boundingBox, uint64_t index, std::ofstream& convertOfs) { +void Rtree::SaveEntry(Rtree::BoundingBox boundingBox, uint64_t index, + std::ofstream& convertOfs) { /** - * Save a single entry (which was e.g. converted by ConvertWordToRtreeEntry) to the disk + * Save a single entry (which was e.g. 
converted by ConvertWordToRtreeEntry) + * to the disk */ double minX = boundingBox.min_corner().get<0>(); double minY = boundingBox.min_corner().get<1>(); double maxX = boundingBox.max_corner().get<0>(); double maxY = boundingBox.max_corner().get<1>(); - convertOfs.write(reinterpret_cast(&minX), sizeof(double)); - convertOfs.write(reinterpret_cast(&minY), sizeof(double)); - convertOfs.write(reinterpret_cast(&maxX), sizeof(double)); - convertOfs.write(reinterpret_cast(&maxY), sizeof(double)); - convertOfs.write(reinterpret_cast(&index), sizeof(uint64_t)); + convertOfs.write(reinterpret_cast(&minX), sizeof(double)); + convertOfs.write(reinterpret_cast(&minY), sizeof(double)); + convertOfs.write(reinterpret_cast(&maxX), sizeof(double)); + convertOfs.write(reinterpret_cast(&maxY), sizeof(double)); + convertOfs.write(reinterpret_cast(&index), sizeof(uint64_t)); } -void Rtree::SaveEntryWithOrderIndex(RTreeValueWithOrderIndex treeValue, std::ofstream& convertOfs) { +void Rtree::SaveEntryWithOrderIndex(RTreeValueWithOrderIndex treeValue, + std::ofstream& convertOfs) { /** - * Save a single entry, containing its postion in the x- and y-sorting + * Save a single entry, containing its postion in the x- and y-sorting */ - double minX = treeValue.box.min_corner().get<0>(); - double minY = treeValue.box.min_corner().get<1>(); - double maxX = treeValue.box.max_corner().get<0>(); - double maxY = treeValue.box.max_corner().get<1>(); - - convertOfs.write(reinterpret_cast(&minX), sizeof(double)); - convertOfs.write(reinterpret_cast(&minY), sizeof(double)); - convertOfs.write(reinterpret_cast(&maxX), sizeof(double)); - convertOfs.write(reinterpret_cast(&maxY), sizeof(double)); - convertOfs.write(reinterpret_cast(&treeValue.id), sizeof(uint64_t)); - convertOfs.write(reinterpret_cast(&treeValue.orderX), sizeof(uint64_t)); - convertOfs.write(reinterpret_cast(&treeValue.orderY), sizeof(uint64_t)); + double minX = treeValue.MinX(); + double minY = treeValue.MinY(); + double maxX = 
treeValue.MaxX(); + double maxY = treeValue.MaxY(); + + convertOfs.write(reinterpret_cast(&minX), sizeof(double)); + convertOfs.write(reinterpret_cast(&minY), sizeof(double)); + convertOfs.write(reinterpret_cast(&maxX), sizeof(double)); + convertOfs.write(reinterpret_cast(&maxY), sizeof(double)); + convertOfs.write(reinterpret_cast(&treeValue.id), + sizeof(uint64_t)); + convertOfs.write(reinterpret_cast(&treeValue.orderX), + sizeof(uint64_t)); + convertOfs.write(reinterpret_cast(&treeValue.orderY), + sizeof(uint64_t)); } multiBoxGeo Rtree::LoadEntries(const std::string& file) { @@ -571,7 +576,8 @@ multiBoxGeo Rtree::LoadEntries(const std::string& file) { return boxes; } -multiBoxWithOrderIndex Rtree::LoadEntriesWithOrderIndex(const std::string& file) { +multiBoxWithOrderIndex Rtree::LoadEntriesWithOrderIndex( + const std::string& file) { multiBoxWithOrderIndex boxes; FileReader fileReader = FileReader(file); @@ -586,19 +592,19 @@ multiBoxWithOrderIndex Rtree::LoadEntriesWithOrderIndex(const std::string& file) return boxes; } -bool OrderedBoxes::WorkInRam() const{ - return this->workInRam; -} +bool OrderedBoxes::WorkInRam() const { return this->workInRam; } -void OrderedBoxes::CreateOrderedBoxesInRam(RectanglesForOrderedBoxes& rectanglesD0, RectanglesForOrderedBoxes& rectanglesD1, Rtree::BoundingBox box) { - this->workInRam = true; - this->rectsD0 = rectanglesD0; - this->rectsD1 = rectanglesD1; - this->size = (*rectsD0.rectanglesInRam).size(); - this->boundingBox = box; +void OrderedBoxes::SetOrderedBoxesToRam(RectanglesForOrderedBoxes& rectanglesD0, + RectanglesForOrderedBoxes& rectanglesD1, + Rtree::BoundingBox box) { + SetOrderedBoxesToDisk(rectanglesD0, rectanglesD1, + (*rectsD0.rectanglesInRam).size(), box); } -void OrderedBoxes::CreateOrderedBoxesOnDisk(RectanglesForOrderedBoxes& rectanglesD0, RectanglesForOrderedBoxes& rectanglesD1, uint64_t size, Rtree::BoundingBox box) { +void OrderedBoxes::SetOrderedBoxesToDisk( + RectanglesForOrderedBoxes& rectanglesD0, + 
RectanglesForOrderedBoxes& rectanglesD1, uint64_t size, + Rtree::BoundingBox box) { this->workInRam = false; this->rectsD0 = rectanglesD0; this->rectsD1 = rectanglesD1; @@ -606,13 +612,9 @@ void OrderedBoxes::CreateOrderedBoxesOnDisk(RectanglesForOrderedBoxes& rectangle this->boundingBox = box; } -Rtree::BoundingBox OrderedBoxes::GetBoundingBox() { - return this->boundingBox; -} +Rtree::BoundingBox OrderedBoxes::GetBoundingBox() { return this->boundingBox; } -uint64_t OrderedBoxes::GetSize() const { - return this->size; -} +uint64_t OrderedBoxes::GetSize() const { return this->size; } std::shared_ptr OrderedBoxes::GetRectanglesInRam() { return this->rectsD0.rectanglesInRam; @@ -624,7 +626,8 @@ std::string OrderedBoxes::GetRectanglesOnDisk() { SplitResult OrderedBoxes::GetBestSplit() { /** - * Determine based on the "small-lists", which split is the best for the rtree. + * Determine based on the "small-lists", which split is the best for the + * rtree. */ struct SplitResult splitResult; @@ -633,13 +636,15 @@ SplitResult OrderedBoxes::GetBestSplit() { RTreeValueWithOrderIndex currentLastElement; RTreeValueWithOrderIndex currentElement; - // This bool is used, since we need every other element as our element "S * i" (described in the algorithm) - // To perform the split better, the element before it (S * i - 1) is saved as well + // This bool is used, since we need every other element as our element "S * i" + // (described in the algorithm) To perform the split better, the element + // before it (S * i - 1) is saved as well bool currentlyAtSTimesI = false; for (size_t dim = 0; dim < 2; dim++) { for (uint64_t i = 0; i < this->rectsD0.rectanglesSmall->size(); i++) { - currentElement = dim == 0 ? (*this->rectsD0.rectanglesSmall)[i] : (*this->rectsD1.rectanglesSmall)[i]; + currentElement = dim == 0 ? 
(*this->rectsD0.rectanglesSmall)[i] + : (*this->rectsD1.rectanglesSmall)[i]; if (i == 0) { // this is the min element @@ -659,39 +664,27 @@ SplitResult OrderedBoxes::GetBestSplit() { continue; } - double minXB0 = 0; - double maxXB0 = 1; - double minXB1 = 0; - double maxXB1 = 1; - double minYB0 = 0; - double maxYB0 = 1; - double minYB1 = 0; - double maxYB1 = 1; - - if (currentlyAtSTimesI && currentElement.id != maxElement.id) { - // the current element is a possible split position. - if (dim == 0) { - minXB0 = std::midpoint(minElement.box.min_corner().get<0>(), minElement.box.max_corner().get<0>()); - maxXB0 = std::midpoint(currentLastElement.box.min_corner().get<0>(), currentLastElement.box.max_corner().get<0>()); - - minXB1 = std::midpoint(currentElement.box.min_corner().get<0>(), currentElement.box.max_corner().get<0>()); - maxXB1 = std::midpoint(maxElement.box.min_corner().get<0>(), maxElement.box.max_corner().get<0>()); - } else { - minYB0 = std::midpoint(minElement.box.min_corner().get<1>(), minElement.box.max_corner().get<1>()); - maxYB0 = std::midpoint(currentLastElement.box.min_corner().get<1>(), currentLastElement.box.max_corner().get<1>()); - - minYB1 = std::midpoint(currentElement.box.min_corner().get<1>(), currentElement.box.max_corner().get<1>()); - maxYB1 = std::midpoint(maxElement.box.min_corner().get<1>(), maxElement.box.max_corner().get<1>()); - } - - currentlyAtSTimesI = false; - } else { + if (!currentlyAtSTimesI || currentElement.id != maxElement.id) { break; } - Rtree::BoundingBox b0 = Rtree::createBoundingBox(minXB0, minYB0, maxXB0, maxYB0); - Rtree::BoundingBox b1 = Rtree::createBoundingBox(minXB1, minYB1, maxXB1, maxYB1); + currentlyAtSTimesI = false; + // the current element is a possible split position. 
+ double minXB0 = minElement.MinX(); + double maxXB0 = currentLastElement.MaxX(); + double minXB1 = currentElement.MinX(); + double maxXB1 = maxElement.MaxX(); + + double minYB0 = minElement.MinY(); + double maxYB0 = currentLastElement.MaxY(); + double minYB1 = currentElement.MinY(); + double maxYB1 = maxElement.MaxY(); + + Rtree::BoundingBox b0 = + Rtree::createBoundingBox(minXB0, minYB0, maxXB0, maxYB0); + Rtree::BoundingBox b1 = + Rtree::createBoundingBox(minXB1, minYB1, maxXB1, maxYB1); double cost = costFunctionTGS(b0, b1, dim); @@ -711,7 +704,9 @@ SplitResult OrderedBoxes::GetBestSplit() { return splitResult; } -std::pair OrderedBoxes::SplitAtBest(const std::string& filePath, size_t S, size_t M, uint64_t maxBuildingRamUsage) { +std::pair OrderedBoxes::SplitAtBest( + const std::string& filePath, size_t S, size_t M, + uint64_t maxBuildingRamUsage) { if (this->workInRam) { return this->SplitAtBestInRam(S, M); } else { @@ -719,9 +714,11 @@ std::pair OrderedBoxes::SplitAtBest(const std::strin } } -std::pair OrderedBoxes::SplitAtBestInRam(size_t S, size_t M) { +std::pair OrderedBoxes::SplitAtBestInRam(size_t S, + size_t M) { /** - * Split the ordered boxes in ram. First determine the best split and then perform it + * Split the ordered boxes in ram. 
First determine the best split and then + * perform it */ struct SplitResult splitResult = this->GetBestSplit(); @@ -731,40 +728,31 @@ std::pair OrderedBoxes::SplitAtBestInRam(size_t S, s struct SplitBuffersRam splitBuffers; - splitBuffers.s0Dim0 = std::make_shared(); - splitBuffers.s0Dim1 = std::make_shared(); - splitBuffers.s1Dim0 = std::make_shared(); - splitBuffers.s1Dim1 = std::make_shared(); - - splitBuffers.s0SmallDim0 = std::make_shared(); - splitBuffers.s0SmallDim1 = std::make_shared(); - splitBuffers.s1SmallDim0 = std::make_shared(); - splitBuffers.s1SmallDim1 = std::make_shared(); - - std::pair boundingBoxes = PerformSplit(splitResult, splitBuffers, M, S); - - RectanglesForOrderedBoxes rectsD0Split0 = {splitBuffers.s0Dim0, "", splitBuffers.s0SmallDim0}; - RectanglesForOrderedBoxes rectsD1Split0 = {splitBuffers.s0Dim1, "", splitBuffers.s0SmallDim1}; - RectanglesForOrderedBoxes rectsD0Split1 = {splitBuffers.s1Dim0, "", splitBuffers.s1SmallDim0}; - RectanglesForOrderedBoxes rectsD1Split1 = {splitBuffers.s1Dim1, "", splitBuffers.s1SmallDim1}; - split0.CreateOrderedBoxesInRam(rectsD0Split0, rectsD1Split0, boundingBoxes.first); - split1.CreateOrderedBoxesInRam(rectsD0Split1, rectsD1Split1, boundingBoxes.second); - - (*this->rectsD0.rectanglesInRam).clear(); - (*this->rectsD1.rectanglesInRam).clear(); - (*this->rectsD0.rectanglesSmall).clear(); - (*this->rectsD1.rectanglesSmall).clear(); - (*this->rectsD0.rectanglesInRam).shrink_to_fit(); - (*this->rectsD1.rectanglesInRam).shrink_to_fit(); - (*this->rectsD0.rectanglesSmall).shrink_to_fit(); - (*this->rectsD1.rectanglesSmall).shrink_to_fit(); + std::pair boundingBoxes = + PerformSplit(splitResult, splitBuffers, M, S); + + RectanglesForOrderedBoxes rectsD0Split0 = {splitBuffers.s0Dim0, "", + splitBuffers.s0SmallDim0}; + RectanglesForOrderedBoxes rectsD1Split0 = {splitBuffers.s0Dim1, "", + splitBuffers.s0SmallDim1}; + RectanglesForOrderedBoxes rectsD0Split1 = {splitBuffers.s1Dim0, "", + splitBuffers.s1SmallDim0}; 
+ RectanglesForOrderedBoxes rectsD1Split1 = {splitBuffers.s1Dim1, "", + splitBuffers.s1SmallDim1}; + split0.SetOrderedBoxesToRam(rectsD0Split0, rectsD1Split0, + boundingBoxes.first); + split1.SetOrderedBoxesToRam(rectsD0Split1, rectsD1Split1, + boundingBoxes.second); return std::make_pair(split0, split1); } -std::pair OrderedBoxes::SplitAtBestOnDisk(const std::string& filePath, size_t S, size_t M, uint64_t maxBuildingRamUsage) { +std::pair OrderedBoxes::SplitAtBestOnDisk( + const std::string& filePath, size_t S, size_t M, + uint64_t maxBuildingRamUsage) { /** - * Split the ordered boxes on disk. First determine the best split and then perform it + * Split the ordered boxes on disk. First determine the best split and then + * perform it */ OrderedBoxes split0; @@ -778,35 +766,31 @@ std::pair OrderedBoxes::SplitAtBestOnDisk(const std: // perfrom the split uint64_t sizeLeft = std::ceil((splitResult.bestIndex - 2) / 2.0) * S; uint64_t sizeRight = this->size - sizeLeft; - uint64_t split0ByteSize = sizeLeft * (4 * sizeof(double) + sizeof(uint64_t) + 2 * sizeof(uint64_t)); - uint64_t split1ByteSize = sizeRight * (4 * sizeof(double) + sizeof(uint64_t) + 2 * sizeof(uint64_t)); + uint64_t split0ByteSize = + sizeLeft * (4 * sizeof(double) + sizeof(uint64_t) + 2 * sizeof(uint64_t)); + uint64_t split1ByteSize = sizeRight * (4 * sizeof(double) + sizeof(uint64_t) + + 2 * sizeof(uint64_t)); bool split0InRam = split0ByteSize * 4 < maxBuildingRamUsage; bool split1InRam = split1ByteSize * 4 < maxBuildingRamUsage; - splitBuffersRam.s0SmallDim0 = std::make_shared(); - splitBuffersRam.s0SmallDim1 = std::make_shared(); - splitBuffersRam.s1SmallDim0 = std::make_shared(); - splitBuffersRam.s1SmallDim1 = std::make_shared(); - if (!split0InRam) { - splitBuffers.split0Dim0File = { std::ofstream(filePath + ".0.dim0.tmp", std::ios::binary) }; - splitBuffers.split0Dim1File = { std::ofstream(filePath + ".0.dim1.tmp", std::ios::binary) }; - } else { - splitBuffersRam.s0Dim0 = std::make_shared(); - 
splitBuffersRam.s0Dim1 = std::make_shared(); + splitBuffers.split0Dim0File = { + std::ofstream(filePath + ".0.dim0.tmp", std::ios::binary)}; + splitBuffers.split0Dim1File = { + std::ofstream(filePath + ".0.dim1.tmp", std::ios::binary)}; } if (!split1InRam) { - splitBuffers.split1Dim0File = { std::ofstream(filePath + ".1.dim0.tmp", std::ios::binary) }; - splitBuffers.split1Dim1File = { std::ofstream(filePath + ".1.dim1.tmp", std::ios::binary) }; - } else { - splitBuffersRam.s1Dim0 = std::make_shared(); - splitBuffersRam.s1Dim1 = std::make_shared(); + splitBuffers.split1Dim0File = { + std::ofstream(filePath + ".1.dim0.tmp", std::ios::binary)}; + splitBuffers.split1Dim1File = { + std::ofstream(filePath + ".1.dim1.tmp", std::ios::binary)}; } splitBuffers.splitBuffersRam = splitBuffersRam; - std::pair boundingBoxes = PerformSplit(splitResult, splitBuffers, M, S, maxBuildingRamUsage); + std::pair boundingBoxes = + PerformSplit(splitResult, splitBuffers, M, S, maxBuildingRamUsage); RectanglesForOrderedBoxes rectsD0Split0; RectanglesForOrderedBoxes rectsD1Split0; @@ -824,11 +808,13 @@ std::pair OrderedBoxes::SplitAtBestOnDisk(const std: rectsD0Split0.rectanglesOnDisk = filePath + ".0.dim0"; rectsD1Split0.rectanglesOnDisk = filePath + ".0.dim1"; - split0.CreateOrderedBoxesOnDisk(rectsD0Split0, rectsD1Split0, sizeLeft, boundingBoxes.first); + split0.SetOrderedBoxesToDisk(rectsD0Split0, rectsD1Split0, sizeLeft, + boundingBoxes.first); } else { rectsD0Split0.rectanglesInRam = splitBuffers.splitBuffersRam.s0Dim0; rectsD1Split0.rectanglesInRam = splitBuffers.splitBuffersRam.s0Dim1; - split0.CreateOrderedBoxesInRam(rectsD0Split0, rectsD1Split0, boundingBoxes.first); + split0.SetOrderedBoxesToRam(rectsD0Split0, rectsD1Split0, + boundingBoxes.first); } if (!split1InRam) { @@ -838,11 +824,13 @@ std::pair OrderedBoxes::SplitAtBestOnDisk(const std: rectsD0Split1.rectanglesOnDisk = filePath + ".1.dim0"; rectsD1Split1.rectanglesOnDisk = filePath + ".1.dim1"; - 
split1.CreateOrderedBoxesOnDisk(rectsD0Split1, rectsD1Split1, sizeRight, boundingBoxes.second); + split1.SetOrderedBoxesToDisk(rectsD0Split1, rectsD1Split1, sizeRight, + boundingBoxes.second); } else { rectsD0Split1.rectanglesInRam = splitBuffers.splitBuffersRam.s1Dim0; rectsD1Split1.rectanglesInRam = splitBuffers.splitBuffersRam.s1Dim1; - split1.CreateOrderedBoxesInRam(rectsD0Split1, rectsD1Split1, boundingBoxes.second); + split1.SetOrderedBoxesToRam(rectsD0Split1, rectsD1Split1, + boundingBoxes.second); } std::remove(this->rectsD0.rectanglesOnDisk.c_str()); @@ -851,9 +839,11 @@ std::pair OrderedBoxes::SplitAtBestOnDisk(const std: return std::make_pair(split0, split1); } -std::pair OrderedBoxes::PerformSplit(SplitResult splitResult, SplitBuffersRam& splitBuffersRam, size_t M, size_t S) { +std::pair OrderedBoxes::PerformSplit( + SplitResult splitResult, SplitBuffersRam& splitBuffersRam, size_t M, + size_t S) { /** - * Perform the best split on the current ordered boxes in the ram case + * Perform the best split on the current ordered boxes in the ram case */ struct SplitBuffersDisk splitBuffersDisk; @@ -865,36 +855,36 @@ std::pair OrderedBoxes::PerformSplit(Spl splitBuffersDisk.split1Dim1File = {}; // reuse the PerfromSplit of the Disk case. 
- std::pair boundingBoxes = PerformSplit(splitResult, splitBuffersDisk, M, S, 0); + std::pair boundingBoxes = + PerformSplit(splitResult, splitBuffersDisk, M, S, 0); splitBuffersRam = splitBuffersDisk.splitBuffersRam; return boundingBoxes; } -std::pair OrderedBoxes::PerformSplit(SplitResult splitResult, SplitBuffersDisk& splitBuffers, size_t M, size_t S, uint64_t maxBuildingRamUsage) { +std::pair OrderedBoxes::PerformSplit( + SplitResult splitResult, SplitBuffersDisk& splitBuffers, size_t M, size_t S, + uint64_t maxBuildingRamUsage) { /** - * Perform the best split on the current ordered boxes in the disk case + * Perform the best split on the current ordered boxes in the disk case */ uint64_t sizeLeft = std::ceil((splitResult.bestIndex - 2) / 2.0) * S; uint64_t sizeRight = this->size - sizeLeft; - size_t SSplit0 = sizeLeft <= S ? std::ceil(sizeLeft / (double) M) : S; - size_t SSplit1 = sizeRight <= S ? std::ceil(sizeRight / (double) M) : S; - uint64_t split0ByteSize = sizeLeft * (4 * sizeof(double) + sizeof(uint64_t) + 2 * sizeof(uint64_t)); - uint64_t split1ByteSize = sizeRight * (4 * sizeof(double) + sizeof(uint64_t) + 2 * sizeof(uint64_t)); - bool split0InRam = maxBuildingRamUsage == 0 || split0ByteSize * 4 < maxBuildingRamUsage; - bool split1InRam = maxBuildingRamUsage == 0 || split1ByteSize * 4 < maxBuildingRamUsage; - - double globalMinXS0 = -1; - double globalMinYS0 = -1; - double globalMaxXS0 = -1; - double globalMaxYS0 = -1; - - double globalMinXS1 = -1; - double globalMinYS1 = -1; - double globalMaxXS1 = -1; - double globalMaxYS1 = -1; + size_t SSplit0 = sizeLeft <= S ? std::ceil(sizeLeft / (double)M) : S; + size_t SSplit1 = sizeRight <= S ? 
std::ceil(sizeRight / (double)M) : S; + uint64_t split0ByteSize = + sizeLeft * (4 * sizeof(double) + sizeof(uint64_t) + 2 * sizeof(uint64_t)); + uint64_t split1ByteSize = sizeRight * (4 * sizeof(double) + sizeof(uint64_t) + + 2 * sizeof(uint64_t)); + bool split0InRam = + maxBuildingRamUsage == 0 || split0ByteSize * 4 < maxBuildingRamUsage; + bool split1InRam = + maxBuildingRamUsage == 0 || split1ByteSize * 4 < maxBuildingRamUsage; + + Rtree::BoundingBox boxSplit0 = Rtree::createBoundingBox(0, 0, 0, 0); + Rtree::BoundingBox boxSplit1 = Rtree::createBoundingBox(0, 0, 0, 0); RTreeValueWithOrderIndex minSplit0OtherDim; RTreeValueWithOrderIndex maxSplit0OtherDim; @@ -907,12 +897,17 @@ std::pair OrderedBoxes::PerformSplit(Spl } otherDimension; if (splitResult.bestDim == 0) { - splitBuffers.splitBuffersRam.s0SmallDim0->push_back(splitResult.bestMinElement); - splitBuffers.splitBuffersRam.s0SmallDim0->push_back(splitResult.bestLastElement); - splitBuffers.splitBuffersRam.s1SmallDim0->push_back(splitResult.bestElement); - splitBuffers.splitBuffersRam.s1SmallDim0->push_back(splitResult.bestMaxElement); - - // placeholder, since we need the min and max element of the split in the first two spots + splitBuffers.splitBuffersRam.s0SmallDim0->push_back( + splitResult.bestMinElement); + splitBuffers.splitBuffersRam.s0SmallDim0->push_back( + splitResult.bestLastElement); + splitBuffers.splitBuffersRam.s1SmallDim0->push_back( + splitResult.bestElement); + splitBuffers.splitBuffersRam.s1SmallDim0->push_back( + splitResult.bestMaxElement); + + // placeholder, since we need the min and max element of the split in the + // first two spots otherDimension.smallSplit0 = splitBuffers.splitBuffersRam.s0SmallDim1; otherDimension.smallSplit1 = splitBuffers.splitBuffersRam.s1SmallDim1; @@ -921,10 +916,14 @@ std::pair OrderedBoxes::PerformSplit(Spl otherDimension.smallSplit1->push_back(RTreeValueWithOrderIndex()); otherDimension.smallSplit1->push_back(RTreeValueWithOrderIndex()); } else { - 
splitBuffers.splitBuffersRam.s0SmallDim1->push_back(splitResult.bestMinElement); - splitBuffers.splitBuffersRam.s0SmallDim1->push_back(splitResult.bestLastElement); - splitBuffers.splitBuffersRam.s1SmallDim1->push_back(splitResult.bestElement); - splitBuffers.splitBuffersRam.s1SmallDim1->push_back(splitResult.bestMaxElement); + splitBuffers.splitBuffersRam.s0SmallDim1->push_back( + splitResult.bestMinElement); + splitBuffers.splitBuffersRam.s0SmallDim1->push_back( + splitResult.bestLastElement); + splitBuffers.splitBuffersRam.s1SmallDim1->push_back( + splitResult.bestElement); + splitBuffers.splitBuffersRam.s1SmallDim1->push_back( + splitResult.bestMaxElement); // placeholder otherDimension.smallSplit0 = splitBuffers.splitBuffersRam.s0SmallDim0; @@ -940,8 +939,8 @@ std::pair OrderedBoxes::PerformSplit(Spl std::optional fileReaderDim0; std::optional fileReaderDim1; if (!this->workInRam) { - fileReaderDim0 = { FileReader(this->rectsD0.rectanglesOnDisk) }; - fileReaderDim1 = { FileReader(this->rectsD1.rectanglesOnDisk) }; + fileReaderDim0 = {FileReader(this->rectsD0.rectanglesOnDisk)}; + fileReaderDim1 = {FileReader(this->rectsD1.rectanglesOnDisk)}; } uint64_t currentXSplit0 = 0; uint64_t currentXSplit1 = 0; @@ -952,24 +951,26 @@ std::pair OrderedBoxes::PerformSplit(Spl uint64_t i = 0; if (!this->workInRam) { - if (dim == 0) - elementOpt = fileReaderDim0.value().GetNextElement(); - if (dim == 1) - elementOpt = fileReaderDim1.value().GetNextElement(); + if (dim == 0) elementOpt = fileReaderDim0.value().GetNextElement(); + if (dim == 1) elementOpt = fileReaderDim1.value().GetNextElement(); } - while ((this->workInRam && i < this->size) || (!this->workInRam && elementOpt)) { + while ((this->workInRam && i < this->size) || + (!this->workInRam && elementOpt)) { RTreeValueWithOrderIndex element; // get the current element, either from disk or from ram if (this->workInRam) { - element = dim == 0 ? 
(*this->rectsD0.rectanglesInRam)[i] : (*this->rectsD1.rectanglesInRam)[i]; + element = dim == 0 ? (*this->rectsD0.rectanglesInRam)[i] + : (*this->rectsD1.rectanglesInRam)[i]; } else { element = elementOpt.value(); } - if ((splitResult.bestDim == 0 && element.orderX < splitResult.bestElement.orderX) - || (splitResult.bestDim == 1 && element.orderY < splitResult.bestElement.orderY)) { + if ((splitResult.bestDim == 0 && + element.orderX < splitResult.bestElement.orderX) || + (splitResult.bestDim == 1 && + element.orderY < splitResult.bestElement.orderY)) { // the element belongs to split 0 if (dim == 0) { @@ -977,32 +978,27 @@ std::pair OrderedBoxes::PerformSplit(Spl if (split0InRam || this->workInRam) { splitBuffers.splitBuffersRam.s0Dim0->push_back(element); } else { - Rtree::SaveEntryWithOrderIndex(element, splitBuffers.split0Dim0File.value()); + Rtree::SaveEntryWithOrderIndex(element, + splitBuffers.split0Dim0File.value()); } - // check if the element is at the position i * S (described in the algorithm) or one before it. - // In this case it is a future possible split position and needs to be saved to the "small list" - if (((currentXSplit0 + 1) % SSplit0 == 0 && (currentXSplit0 + 1) / SSplit0 >= 1 && (currentXSplit0 + 1) / SSplit0 < M) - || (currentXSplit0 % SSplit0 == 0 && currentXSplit0 / SSplit0 >= 1 && currentXSplit0 / SSplit0 < M)) { + // check if the element is at the position i * S (described in the + // algorithm) or one before it. 
In this case it is a future possible + // split position and needs to be saved to the "small list" + if (((currentXSplit0 + 1) % SSplit0 == 0 && + (currentXSplit0 + 1) / SSplit0 >= 1 && + (currentXSplit0 + 1) / SSplit0 < M) || + (currentXSplit0 % SSplit0 == 0 && currentXSplit0 / SSplit0 >= 1 && + currentXSplit0 / SSplit0 < M)) { // index i * S - 1 or i * S splitBuffers.splitBuffersRam.s0SmallDim0->push_back(element); } - // keep track of the min and max values to construct the bounding box of the split later - if (globalMinXS0 == -1 || element.box.min_corner().get<0>() < globalMinXS0) { - globalMinXS0 = element.box.min_corner().get<0>(); - } - if (globalMinYS0 == -1 || element.box.min_corner().get<1>() < globalMinYS0) { - globalMinYS0 = element.box.min_corner().get<1>(); - } - if (element.box.max_corner().get<0>() > globalMaxXS0) { - globalMaxXS0 = element.box.max_corner().get<0>(); - } - if (element.box.max_corner().get<1>() > globalMaxYS0) { - globalMaxYS0 = element.box.max_corner().get<1>(); - } + // update the boundingbox to get the whole boundingbox of the split + boxSplit0 = Rtree::combineBoundingBoxes(boxSplit0, element.box); - // keep track of the min and max element of the split, to later replace the placeholder in the "small lists" + // keep track of the min and max element of the split, to later + // replace the placeholder in the "small lists" if (splitResult.bestDim == 1) { if (currentXSplit0 == 0) { minSplit0OtherDim = element; @@ -1018,11 +1014,15 @@ std::pair OrderedBoxes::PerformSplit(Spl if (split0InRam || this->workInRam) { splitBuffers.splitBuffersRam.s0Dim1->push_back(element); } else { - Rtree::SaveEntryWithOrderIndex(element, splitBuffers.split0Dim1File.value()); + Rtree::SaveEntryWithOrderIndex(element, + splitBuffers.split0Dim1File.value()); } - if (((currentYSplit0 + 1) % SSplit0 == 0 && (currentYSplit0 + 1) / SSplit0 >= 1 && (currentYSplit0 + 1) / SSplit0 < M) - || (currentYSplit0 % SSplit0 == 0 && currentYSplit0 / SSplit0 >= 1 && 
currentYSplit0 / SSplit0 < M)) { + if (((currentYSplit0 + 1) % SSplit0 == 0 && + (currentYSplit0 + 1) / SSplit0 >= 1 && + (currentYSplit0 + 1) / SSplit0 < M) || + (currentYSplit0 % SSplit0 == 0 && currentYSplit0 / SSplit0 >= 1 && + currentYSplit0 / SSplit0 < M)) { // index i * S - 1 or i * S splitBuffers.splitBuffersRam.s0SmallDim1->push_back(element); } @@ -1046,26 +1046,19 @@ std::pair OrderedBoxes::PerformSplit(Spl if (split1InRam || this->workInRam) { splitBuffers.splitBuffersRam.s1Dim0->push_back(element); } else { - Rtree::SaveEntryWithOrderIndex(element, splitBuffers.split1Dim0File.value()); + Rtree::SaveEntryWithOrderIndex(element, + splitBuffers.split1Dim0File.value()); } - if (((currentXSplit1 + 1) % SSplit1 == 0 && (currentXSplit1 + 1) / SSplit1 >= 1 && (currentXSplit1 + 1) / SSplit1 < M) - || (currentXSplit1 % SSplit1 == 0 && currentXSplit1 / SSplit1 >= 1 && currentXSplit1 / SSplit1 < M)) { + if (((currentXSplit1 + 1) % SSplit1 == 0 && + (currentXSplit1 + 1) / SSplit1 >= 1 && + (currentXSplit1 + 1) / SSplit1 < M) || + (currentXSplit1 % SSplit1 == 0 && currentXSplit1 / SSplit1 >= 1 && + currentXSplit1 / SSplit1 < M)) { // index i * S - 1 or i * S splitBuffers.splitBuffersRam.s1SmallDim0->push_back(element); } - if (globalMinXS1 == -1 || element.box.min_corner().get<0>() < globalMinXS1) { - globalMinXS1 = element.box.min_corner().get<0>(); - } - if (globalMinYS1 == -1 || element.box.min_corner().get<1>() < globalMinYS1) { - globalMinYS1 = element.box.min_corner().get<1>(); - } - if (element.box.max_corner().get<0>() > globalMaxXS1) { - globalMaxXS1 = element.box.max_corner().get<0>(); - } - if (element.box.max_corner().get<1>() > globalMaxYS1) { - globalMaxYS1 = element.box.max_corner().get<1>(); - } + boxSplit1 = Rtree::combineBoundingBoxes(boxSplit1, element.box); if (splitResult.bestDim == 1) { if (currentXSplit1 == 0) { @@ -1082,10 +1075,14 @@ std::pair OrderedBoxes::PerformSplit(Spl if (split1InRam || this->workInRam) { 
splitBuffers.splitBuffersRam.s1Dim1->push_back(element); } else { - Rtree::SaveEntryWithOrderIndex(element, splitBuffers.split1Dim1File.value()); + Rtree::SaveEntryWithOrderIndex(element, + splitBuffers.split1Dim1File.value()); } - if (((currentYSplit1 + 1) % SSplit1 == 0 && (currentYSplit1 + 1) / SSplit1 >= 1 && (currentYSplit1 + 1) / SSplit1 < M) - || (currentYSplit1 % SSplit1 == 0 && currentYSplit1 / SSplit1 >= 1 && currentYSplit1 / SSplit1 < M)) { + if (((currentYSplit1 + 1) % SSplit1 == 0 && + (currentYSplit1 + 1) / SSplit1 >= 1 && + (currentYSplit1 + 1) / SSplit1 < M) || + (currentYSplit1 % SSplit1 == 0 && currentYSplit1 / SSplit1 >= 1 && + currentYSplit1 / SSplit1 < M)) { // index i * S - 1 or i * S splitBuffers.splitBuffersRam.s1SmallDim1->push_back(element); } @@ -1106,10 +1103,8 @@ std::pair OrderedBoxes::PerformSplit(Spl i++; if (!this->workInRam) { - if (dim == 0) - elementOpt = fileReaderDim0.value().GetNextElement(); - if (dim == 1) - elementOpt = fileReaderDim1.value().GetNextElement(); + if (dim == 0) elementOpt = fileReaderDim0.value().GetNextElement(); + if (dim == 1) elementOpt = fileReaderDim1.value().GetNextElement(); } } } @@ -1125,9 +1120,14 @@ std::pair OrderedBoxes::PerformSplit(Spl (*otherDimension.smallSplit1)[0] = minSplit1OtherDim; (*otherDimension.smallSplit1)[1] = maxSplit1OtherDim; - Rtree::BoundingBox boxSplit0 = Rtree::createBoundingBox(globalMinXS0, globalMinYS0, globalMaxXS0, globalMaxYS0); - Rtree::BoundingBox boxSplit1 = Rtree::createBoundingBox(globalMinXS1, globalMinYS1, globalMaxXS1, globalMaxYS1); + (*this->rectsD0.rectanglesInRam).clear(); + (*this->rectsD1.rectanglesInRam).clear(); + (*this->rectsD0.rectanglesSmall).clear(); + (*this->rectsD1.rectanglesSmall).clear(); + (*this->rectsD0.rectanglesInRam).shrink_to_fit(); + (*this->rectsD1.rectanglesInRam).shrink_to_fit(); + (*this->rectsD0.rectanglesSmall).shrink_to_fit(); + (*this->rectsD1.rectanglesSmall).shrink_to_fit(); return std::make_pair(boxSplit0, boxSplit1); } - 
diff --git a/src/util/RtreeFileReader.cpp b/src/util/RtreeFileReader.cpp index 1288f1342f..e0340a520a 100644 --- a/src/util/RtreeFileReader.cpp +++ b/src/util/RtreeFileReader.cpp @@ -9,9 +9,9 @@ FileReader::FileReader(const std::string& filePath) { this->filePath = filePath; this->file = std::ifstream(this->filePath, std::ios::binary); - this->file.seekg (0, std::ifstream::end); + this->file.seekg(0, std::ifstream::end); this->fileLength = this->file.tellg(); - this->file.seekg (0, std::ifstream::beg); + this->file.seekg(0, std::ifstream::beg); } std::optional FileReader::GetNextElement() { @@ -35,23 +35,21 @@ std::optional FileReader::GetNextElement() { Rtree::BoundingBox box = Rtree::createBoundingBox(minX, minY, maxX, maxY); RTreeValueWithOrderIndex element = {box, id, orderX, orderY}; - return { element }; + return {element}; } else { return {}; } } -void FileReader::Close() { - this->file.close(); -} +void FileReader::Close() { this->file.close(); } FileReaderWithoutIndex::FileReaderWithoutIndex(const std::string& filePath) { this->filePath = filePath; this->file = std::ifstream(this->filePath, std::ios::binary); - this->file.seekg (0, std::ifstream::end); + this->file.seekg(0, std::ifstream::end); this->fileLength = this->file.tellg(); - this->file.seekg (0, std::ifstream::beg); + this->file.seekg(0, std::ifstream::beg); } std::optional FileReaderWithoutIndex::GetNextElement() { @@ -71,12 +69,10 @@ std::optional FileReaderWithoutIndex::GetNextElement() { Rtree::BoundingBox box = Rtree::createBoundingBox(minX, minY, maxX, maxY); RTreeValue boxWithId = {box, id}; - return { boxWithId }; + return {boxWithId}; } else { return {}; } } -void FileReaderWithoutIndex::Close() { - this->file.close(); -} +void FileReaderWithoutIndex::Close() { this->file.close(); } diff --git a/src/util/RtreeFileReader.h b/src/util/RtreeFileReader.h index 0a019916a8..a7eef8b00a 100644 --- a/src/util/RtreeFileReader.h +++ b/src/util/RtreeFileReader.h @@ -5,22 +5,24 @@ #ifndef 
QLEVER_RTREEFILEREADER_H #define QLEVER_RTREEFILEREADER_H -#include -#include -#include -#include -#include -#include +#include + #include -#include #include -#include +#include +#include +#include +#include +#include +#include +#include class FileReader { private: std::string filePath; std::ifstream file; uint64_t fileLength; + public: explicit FileReader(const std::string& filePath); std::optional GetNextElement(); @@ -32,10 +34,11 @@ class FileReaderWithoutIndex { std::string filePath; std::ifstream file; uint64_t fileLength; + public: explicit FileReaderWithoutIndex(const std::string& filePath); std::optional GetNextElement(); void Close(); }; -#endif //QLEVER_RTREEFILEREADER_H +#endif // QLEVER_RTREEFILEREADER_H diff --git a/test/VocabularyGeneratorTest.cpp b/test/VocabularyGeneratorTest.cpp index dd33caed63..5fbf165e60 100644 --- a/test/VocabularyGeneratorTest.cpp +++ b/test/VocabularyGeneratorTest.cpp @@ -161,10 +161,12 @@ TEST_F(MergeVocabularyTest, mergeVocabulary) { { VocabularyMerger m; auto file = ad_utility::makeOfstream(_basePath + INTERNAL_VOCAB_SUFFIX); - auto internalVocabularyAction = [&file](const auto& word, [[maybe_unused]]const auto& index) { - file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n'; - }; - auto externalVocabularyAction = []([[maybe_unused]]const auto& word, [[maybe_unused]]const auto& index) {}; + auto internalVocabularyAction = + [&file](const auto& word, [[maybe_unused]] const auto& index) { + file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n'; + }; + auto externalVocabularyAction = []([[maybe_unused]] const auto& word, + [[maybe_unused]] const auto& index) {}; res = m.mergeVocabulary(_basePath, 2, TripleComponentComparator(), internalVocabularyAction, externalVocabularyAction); } @@ -207,11 +209,13 @@ TEST(VocabularyGenerator, ReadAndWritePartial) { { VocabularyMerger m; auto file = ad_utility::makeOfstream(basename + INTERNAL_VOCAB_SUFFIX); - auto internalVocabularyAction = [&file](const auto& word, 
[[maybe_unused]]const auto& index) { - file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n'; - }; + auto internalVocabularyAction = + [&file](const auto& word, [[maybe_unused]] const auto& index) { + file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n'; + }; - auto externalVocabularyAction = []([[maybe_unused]]const auto& word, [[maybe_unused]]const auto& index) {}; + auto externalVocabularyAction = []([[maybe_unused]] const auto& word, + [[maybe_unused]] const auto& index) {}; m.mergeVocabulary(basename, 1, v.getCaseComparator(), internalVocabularyAction, externalVocabularyAction); } @@ -252,10 +256,12 @@ TEST(VocabularyGenerator, ReadAndWritePartial) { { VocabularyMerger m; auto file = ad_utility::makeOfstream(basename + INTERNAL_VOCAB_SUFFIX); - auto internalVocabularyAction = [&file](const auto& word, [[maybe_unused]]const auto& index) { - file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n'; - }; - auto externalVocabularyAction = []([[maybe_unused]]const auto& word, [[maybe_unused]]const auto& index) {}; + auto internalVocabularyAction = + [&file](const auto& word, [[maybe_unused]] const auto& index) { + file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n'; + }; + auto externalVocabularyAction = []([[maybe_unused]] const auto& word, + [[maybe_unused]] const auto& index) {}; m.mergeVocabulary(basename, 1, v.getCaseComparator(), internalVocabularyAction, externalVocabularyAction); } From e01ebae7bd86a04eed8c49b0298158565700d05d Mon Sep 17 00:00:00 2001 From: Noah Nock Date: Tue, 31 Oct 2023 01:33:19 +0100 Subject: [PATCH 18/20] Fixed some issues --- src/global/Constants.h | 4 +- src/util/CMakeLists.txt | 2 +- src/util/Rtree.cpp | 9 +- src/util/Rtree.h | 7 +- src/util/RtreeBuild.cpp | 412 +++++++++++++---------------------- src/util/RtreeFileReader.cpp | 78 ------- src/util/RtreeFileReader.h | 117 ++++++++-- 7 files changed, 262 insertions(+), 367 deletions(-) delete mode 100644 src/util/RtreeFileReader.cpp diff --git 
a/src/global/Constants.h b/src/global/Constants.h index 573bf5947c..ae65f98ad3 100644 --- a/src/global/Constants.h +++ b/src/global/Constants.h @@ -51,8 +51,8 @@ static constexpr std::pair GEOF_PREFIX = { "geof:", " MATH_PREFIX = { "math:", " GEO_RTREE_PREFIX = { - "geoRtree:", " + GEO_RTREE_PREFIX = {"geoRtree:", "id = id; } -Node::Node() {} - Node::Node(uint64_t id, Rtree::BoundingBox boundingBox, multiBoxGeo& children, bool isLastInnerNode) { this->id = id; @@ -108,13 +106,14 @@ Node::Node(uint64_t id, Rtree::BoundingBox boundingBox, multiBoxGeo& children, this->isLastInnerNode = isLastInnerNode; } -Node::Node(uint64_t id, double minX, double minY, double maxX, double maxY, - bool isLastInnerNode) { +Node::Node(uint64_t id, BoundingBox boundingbox, bool isLastInnerNode) { this->id = id; - this->boundingBox = Rtree::createBoundingBox(minX, minY, maxX, maxY); + this->boundingBox = boundingbox; this->isLastInnerNode = isLastInnerNode; } +Node::Node() {} + bool Node::GetIsLastInnerNode() const { return this->isLastInnerNode; } multiBoxGeo Node::GetChildren() { return this->children; } diff --git a/src/util/Rtree.h b/src/util/Rtree.h index f83d1182e8..853ea7c8ce 100644 --- a/src/util/Rtree.h +++ b/src/util/Rtree.h @@ -66,8 +66,7 @@ class Node { Node(uint64_t id, BoundingBox boundingBox); Node(uint64_t id, BoundingBox boundingBox, multiBoxGeo& children, bool isLastInnerNode); - Node(uint64_t id, double minX, double minY, double maxX, double maxY, - bool isLastInnerNode); + Node(uint64_t id, BoundingBox boundingbox, bool isLastInnerNode); [[nodiscard]] uint64_t GetId() const; [[nodiscard]] BoundingBox GetBoundingBox() const; void AddChild(Node& child); @@ -386,8 +385,8 @@ struct SortRuleLambdaWithIndex { std::numeric_limits::max(), std::numeric_limits::max()), 0}, - LLONG_MAX, - LLONG_MAX}; + std::numeric_limits::max(), + std::numeric_limits::max()}; } }; diff --git a/src/util/RtreeBuild.cpp b/src/util/RtreeBuild.cpp index a7a2d00584..efe8ef01fe 100644 --- 
a/src/util/RtreeBuild.cpp +++ b/src/util/RtreeBuild.cpp @@ -8,6 +8,16 @@ #include #include +#include "ctre/ctre.h" + +static bool isBorderOfSplitCandidate(uint64_t current, uint64_t splitSize, + uint64_t M) { + if (((current + 1) % splitSize == 0 && (current + 1) / splitSize < M) || + (current % splitSize == 0 && current / splitSize >= 1)) + return true; + return false; +} + static void centerOrdering(multiBoxGeo& boxes, size_t dim) { if (dim == 0) { // order by centerX @@ -46,15 +56,10 @@ OrderedBoxes SortInput(const std::string& onDiskBase, size_t M, rectsD0Basic = Rtree::LoadEntries(onDiskBase + ".boundingbox.tmp"); centerOrdering(rectsD0Basic, 0); } else { - FileReaderWithoutIndex fileReaderRectsD0 = - FileReaderWithoutIndex(onDiskBase + ".boundingbox.tmp"); - std::optional rectD0Element = - fileReaderRectsD0.GetNextElement(); - while (rectD0Element) { - sorterRectsD0Basic.push(rectD0Element.value()); - rectD0Element = fileReaderRectsD0.GetNextElement(); + for (const RTreeValue& rectD0Element : + FileReaderWithoutIndex(onDiskBase + ".boundingbox.tmp")) { + sorterRectsD0Basic.push(rectD0Element); } - fileReaderRectsD0.Close(); } uint64_t xSize = 0; @@ -111,25 +116,20 @@ OrderedBoxes SortInput(const std::string& onDiskBase, size_t M, RTreeValueWithOrderIndex maxD1; auto processD1Element = [&ySize, currentS, M, &r1Small, &minD1, - &maxD1](RTreeValueWithOrderIndex element) { + &maxD1](RTreeValueWithOrderIndex& element) { element.orderY = ySize; - if (((ySize + 1) % currentS == 0 && (ySize + 1) / currentS >= 1 && - (ySize + 1) / currentS < M) || - (ySize % currentS == 0 && ySize / currentS >= 1 && - ySize / currentS < M)) { + if (isBorderOfSplitCandidate(ySize, currentS, M)) { // index i * S - 1 or i * S r1Small->push_back(element); } if (ySize == 0) { minD1 = element; - maxD1 = element; - } - if (element.orderY > maxD1.orderY) { - maxD1 = element; } + maxD1 = element; + ySize++; }; @@ -168,22 +168,16 @@ OrderedBoxes SortInput(const std::string& onDiskBase, size_t 
M, RTreeValueWithOrderIndex maxD0; auto processD0Element = [¤tX, currentS, M, &r0Small, &minD0, - &maxD0](RTreeValueWithOrderIndex element) { - if (((currentX + 1) % currentS == 0 && (currentX + 1) / currentS >= 1 && - (currentX + 1) / currentS < M) || - (currentX % currentS == 0 && currentX / currentS >= 1 && - currentX / currentS < M)) { + &maxD0](RTreeValueWithOrderIndex& element) { + if (isBorderOfSplitCandidate(currentX, currentS, M)) { // index i * S - 1 or i * S r0Small->push_back(element); } if (currentX == 0) { minD0 = element; - maxD0 = element; - } - if (element.orderX > maxD0.orderX) { - maxD0 = element; } + maxD0 = element; currentX++; }; @@ -376,18 +370,11 @@ void ConstructionNode::AddChildrenToItem() { this->AddChild(leafNode); } } else { - FileReader fileReader = - FileReader(this->GetOrderedBoxes().GetRectanglesOnDisk()); - - std::optional element = - fileReader.GetNextElement(); - while (element) { - Node leafNode = Node(element.value().id, element.value().box); + for (const RTreeValueWithOrderIndex& element : + FileReader(this->GetOrderedBoxes().GetRectanglesOnDisk())) { + Node leafNode = Node(element.id, element.box); this->AddChild(leafNode); - element = fileReader.GetNextElement(); } - - fileReader.Close(); } } @@ -422,63 +409,24 @@ std::optional GetBoundingBoxFromWKT( * Parse the wkt literal in a way, that only the relevant data for the rtree * gets read in. */ - bool lookingForX = true; - bool readingDouble = false; - std::string currentDouble; - - double minX = -1; - double maxX = -1; - double minY = -1; - double maxY = -1; - - for (char c : wkt) { - if (isdigit(c)) { - readingDouble = true; - currentDouble += c; - } else if (c == '.') { - readingDouble = true; - currentDouble += '.'; - } else if (c == ' ') { - if (readingDouble && lookingForX) { - // x is completely read in - readingDouble = false; - lookingForX = false; - double x; - try { - x = std::stod(currentDouble); - } catch (...) 
{ - return {}; - } - currentDouble = ""; - if (x < minX || minX == -1) { - minX = x; - } - - if (x > maxX) { - maxX = x; - } - } - } else { - if (readingDouble && !lookingForX) { - // y is completely read in - readingDouble = false; - lookingForX = true; - double y; - try { - y = std::stod(currentDouble); - } catch (...) { - return {}; - } - currentDouble = ""; - if (y < minY || minY == -1) { - minY = y; - } - - if (y > maxY) { - maxY = y; - } - } - } + double maxDouble = std::numeric_limits::max(); + + double minX = maxDouble; + double maxX = -maxDouble; + double minY = maxDouble; + double maxY = -maxDouble; + + // Iterate over matches and capture x and y coordinates + for (auto match : + ctre::range( + wkt)) { + double x = std::stod(std::string(match.get<1>())); + double y = std::stod(std::string(match.get<2>())); + + if (x < minX) minX = x; + if (x > maxX) maxX = x; + if (y < minY) minY = y; + if (y > maxY) maxY = y; } return {Rtree::createBoundingBox(minX, minY, maxX, maxY)}; @@ -486,38 +434,17 @@ std::optional GetBoundingBoxFromWKT( std::optional Rtree::ConvertWordToRtreeEntry( const std::string& wkt) { - /** + /* * Convert a single wkt literal to a boundingbox. 
+ * Get the bounding box(es) of either a multipolygon, polygon or a linestring */ - std::optional boundingBox; - /* Get the bounding box(es) of either a multipolygon, polygon or a linestring - */ - std::size_t posWKTStart = wkt.find("MULTIPOLYGON(((") + 14; - std::size_t posWKTEnd = wkt.find(")))", posWKTStart); - if (posWKTStart != std::string::npos && posWKTEnd != std::string::npos) { - std::string newWkt = wkt.substr(posWKTStart, posWKTEnd - posWKTStart + 1); - boundingBox = GetBoundingBoxFromWKT(newWkt); - } else { - posWKTStart = wkt.find("POLYGON((") + 8; - posWKTEnd = wkt.find("))", posWKTStart); - if (posWKTStart != std::string::npos && posWKTEnd != std::string::npos) { - std::string newWkt = wkt.substr(posWKTStart, posWKTEnd - posWKTStart + 1); - boundingBox = GetBoundingBoxFromWKT(newWkt); - } else { - posWKTStart = wkt.find("LINESTRING(") + 10; - posWKTEnd = wkt.find(')', posWKTStart); - if (posWKTStart != std::string::npos && posWKTEnd != std::string::npos) { - std::string newWkt = - wkt.substr(posWKTStart, posWKTEnd - posWKTStart + 1); - boundingBox = GetBoundingBoxFromWKT(newWkt); - } else { - return {}; - } - } + if (wkt.starts_with("\"MULTIPOLYGON") || wkt.starts_with("\"POLYGON") || + wkt.starts_with("\"LINESTRING")) { + return GetBoundingBoxFromWKT(wkt); } - return boundingBox; + return {}; } void Rtree::SaveEntry(Rtree::BoundingBox boundingBox, uint64_t index, @@ -563,32 +490,21 @@ void Rtree::SaveEntryWithOrderIndex(RTreeValueWithOrderIndex treeValue, multiBoxGeo Rtree::LoadEntries(const std::string& file) { multiBoxGeo boxes; - FileReaderWithoutIndex fileReader = FileReaderWithoutIndex(file); - - std::optional element = fileReader.GetNextElement(); - while (element) { - boxes.push_back(element.value()); - element = fileReader.GetNextElement(); + for (const RTreeValue& element : FileReaderWithoutIndex(file)) { + boxes.push_back(element); } - fileReader.Close(); - return boxes; } multiBoxWithOrderIndex Rtree::LoadEntriesWithOrderIndex( const 
std::string& file) { multiBoxWithOrderIndex boxes; - FileReader fileReader = FileReader(file); - std::optional element = fileReader.GetNextElement(); - while (element) { - boxes.push_back(element.value()); - element = fileReader.GetNextElement(); + for (const RTreeValueWithOrderIndex& element : FileReader(file)) { + boxes.push_back(element); } - fileReader.Close(); - return boxes; } @@ -942,17 +858,89 @@ std::pair OrderedBoxes::PerformSplit( fileReaderDim0 = {FileReader(this->rectsD0.rectanglesOnDisk)}; fileReaderDim1 = {FileReader(this->rectsD1.rectanglesOnDisk)}; } + FileReader::iterator fileReaderDim0Iterator = + fileReaderDim0 ? fileReaderDim0.value().begin() : FileReader::iterator(); + FileReader::iterator fileReaderDim1Iterator = + fileReaderDim1 ? fileReaderDim1.value().begin() : FileReader::iterator(); uint64_t currentXSplit0 = 0; uint64_t currentXSplit1 = 0; uint64_t currentYSplit0 = 0; uint64_t currentYSplit1 = 0; + + auto performCertainSplit = + [M, &splitBuffers, &splitResult]( + size_t dim, size_t split, uint64_t& current, + uint64_t& currentSplitSize, RTreeValueWithOrderIndex& minElement, + RTreeValueWithOrderIndex& maxElement, bool currentSplitInRam, + bool workInRam, RTreeValueWithOrderIndex& element, + Rtree::BoundingBox& box) { + std::shared_ptr currentList; + std::shared_ptr currentSmallList; + std::ofstream* currentFile; + + if (split == 0) { + if (dim == 0) { + currentList = splitBuffers.splitBuffersRam.s0Dim0; + currentSmallList = splitBuffers.splitBuffersRam.s0SmallDim0; + currentFile = &splitBuffers.split0Dim0File.value(); + } else { + currentList = splitBuffers.splitBuffersRam.s0Dim1; + currentSmallList = splitBuffers.splitBuffersRam.s0SmallDim1; + currentFile = &splitBuffers.split0Dim1File.value(); + } + } else { + if (dim == 0) { + currentList = splitBuffers.splitBuffersRam.s1Dim0; + currentSmallList = splitBuffers.splitBuffersRam.s1SmallDim0; + currentFile = &splitBuffers.split1Dim0File.value(); + } else { + currentList = 
splitBuffers.splitBuffersRam.s1Dim1; + currentSmallList = splitBuffers.splitBuffersRam.s1SmallDim1; + currentFile = &splitBuffers.split1Dim1File.value(); + } + } + + // add the element to the current split dimension 0/1 vector / file + if (currentSplitInRam || workInRam) { + currentList->push_back(element); + } else { + Rtree::SaveEntryWithOrderIndex(element, *currentFile); + } + + // check if the element is at the position i * S (described in the + // algorithm) or one before it. In this case it is a future possible + // split position and needs to be saved to the "small list" + if (isBorderOfSplitCandidate(current, currentSplitSize, M)) { + // index i * S - 1 or i * S + currentSmallList->push_back(element); + } + + // update the boundingbox to get the whole boundingbox of the split + if (dim == 0) box = Rtree::combineBoundingBoxes(box, element.box); + + // keep track of the min and max element of the split, to later + // replace the placeholder in the "small lists" + if (splitResult.bestDim == 1 - dim) { + if (current == 0) { + minElement = element; + } + // max element gets updated each time, because the elements are sorted + // in an ascending way + maxElement = element; + } + + current++; + }; + for (size_t dim = 0; dim < 2; dim++) { // start performing the actual split uint64_t i = 0; - if (!this->workInRam) { - if (dim == 0) elementOpt = fileReaderDim0.value().GetNextElement(); - if (dim == 1) elementOpt = fileReaderDim1.value().GetNextElement(); + if (!this->workInRam && + fileReaderDim0Iterator != fileReaderDim0.value().end() && + fileReaderDim1Iterator != fileReaderDim1.value().end()) { + if (dim == 0) elementOpt = *fileReaderDim0Iterator; + if (dim == 1) elementOpt = *fileReaderDim1Iterator; } while ((this->workInRam && i < this->size) || @@ -974,146 +962,44 @@ std::pair OrderedBoxes::PerformSplit( // the element belongs to split 0 if (dim == 0) { - // add the element to the split 0 dimension 0 vector / file - if (split0InRam || this->workInRam) { - 
splitBuffers.splitBuffersRam.s0Dim0->push_back(element); - } else { - Rtree::SaveEntryWithOrderIndex(element, - splitBuffers.split0Dim0File.value()); - } - - // check if the element is at the position i * S (described in the - // algorithm) or one before it. In this case it is a future possible - // split position and needs to be saved to the "small list" - if (((currentXSplit0 + 1) % SSplit0 == 0 && - (currentXSplit0 + 1) / SSplit0 >= 1 && - (currentXSplit0 + 1) / SSplit0 < M) || - (currentXSplit0 % SSplit0 == 0 && currentXSplit0 / SSplit0 >= 1 && - currentXSplit0 / SSplit0 < M)) { - // index i * S - 1 or i * S - splitBuffers.splitBuffersRam.s0SmallDim0->push_back(element); - } - - // update the boundingbox to get the whole boundingbox of the split - boxSplit0 = Rtree::combineBoundingBoxes(boxSplit0, element.box); - - // keep track of the min and max element of the split, to later - // replace the placeholder in the "small lists" - if (splitResult.bestDim == 1) { - if (currentXSplit0 == 0) { - minSplit0OtherDim = element; - maxSplit0OtherDim = element; - } - if (element.orderX > maxSplit0OtherDim.orderX) { - maxSplit0OtherDim = element; - } - } - - currentXSplit0++; + performCertainSplit(0, 0, currentXSplit0, SSplit0, minSplit0OtherDim, + maxSplit0OtherDim, split0InRam, this->workInRam, + element, boxSplit0); } else { - if (split0InRam || this->workInRam) { - splitBuffers.splitBuffersRam.s0Dim1->push_back(element); - } else { - Rtree::SaveEntryWithOrderIndex(element, - splitBuffers.split0Dim1File.value()); - } - - if (((currentYSplit0 + 1) % SSplit0 == 0 && - (currentYSplit0 + 1) / SSplit0 >= 1 && - (currentYSplit0 + 1) / SSplit0 < M) || - (currentYSplit0 % SSplit0 == 0 && currentYSplit0 / SSplit0 >= 1 && - currentYSplit0 / SSplit0 < M)) { - // index i * S - 1 or i * S - splitBuffers.splitBuffersRam.s0SmallDim1->push_back(element); - } - - if (splitResult.bestDim == 0) { - if (currentYSplit0 == 0) { - minSplit0OtherDim = element; - maxSplit0OtherDim = element; - } 
- if (element.orderX > maxSplit0OtherDim.orderX) { - maxSplit0OtherDim = element; - } - } - - currentYSplit0++; + performCertainSplit(1, 0, currentYSplit0, SSplit0, minSplit0OtherDim, + maxSplit0OtherDim, split0InRam, this->workInRam, + element, boxSplit0); } } else { // the element belongs to split 1 if (dim == 0) { - if (split1InRam || this->workInRam) { - splitBuffers.splitBuffersRam.s1Dim0->push_back(element); - } else { - Rtree::SaveEntryWithOrderIndex(element, - splitBuffers.split1Dim0File.value()); - } - if (((currentXSplit1 + 1) % SSplit1 == 0 && - (currentXSplit1 + 1) / SSplit1 >= 1 && - (currentXSplit1 + 1) / SSplit1 < M) || - (currentXSplit1 % SSplit1 == 0 && currentXSplit1 / SSplit1 >= 1 && - currentXSplit1 / SSplit1 < M)) { - // index i * S - 1 or i * S - splitBuffers.splitBuffersRam.s1SmallDim0->push_back(element); - } - - boxSplit1 = Rtree::combineBoundingBoxes(boxSplit1, element.box); - - if (splitResult.bestDim == 1) { - if (currentXSplit1 == 0) { - minSplit1OtherDim = element; - maxSplit1OtherDim = element; - } - if (element.orderX > maxSplit1OtherDim.orderX) { - maxSplit1OtherDim = element; - } - } - - currentXSplit1++; + performCertainSplit(0, 1, currentXSplit1, SSplit1, minSplit1OtherDim, + maxSplit1OtherDim, split1InRam, this->workInRam, + element, boxSplit1); } else { - if (split1InRam || this->workInRam) { - splitBuffers.splitBuffersRam.s1Dim1->push_back(element); - } else { - Rtree::SaveEntryWithOrderIndex(element, - splitBuffers.split1Dim1File.value()); - } - if (((currentYSplit1 + 1) % SSplit1 == 0 && - (currentYSplit1 + 1) / SSplit1 >= 1 && - (currentYSplit1 + 1) / SSplit1 < M) || - (currentYSplit1 % SSplit1 == 0 && currentYSplit1 / SSplit1 >= 1 && - currentYSplit1 / SSplit1 < M)) { - // index i * S - 1 or i * S - splitBuffers.splitBuffersRam.s1SmallDim1->push_back(element); - } - - if (splitResult.bestDim == 0) { - if (currentYSplit1 == 0) { - minSplit1OtherDim = element; - maxSplit1OtherDim = element; - } - if (element.orderX > 
maxSplit1OtherDim.orderX) { - maxSplit1OtherDim = element; - } - } - - currentYSplit1++; + performCertainSplit(1, 1, currentYSplit1, SSplit1, minSplit1OtherDim, + maxSplit1OtherDim, split1InRam, this->workInRam, + element, boxSplit1); } } i++; - if (!this->workInRam) { - if (dim == 0) elementOpt = fileReaderDim0.value().GetNextElement(); - if (dim == 1) elementOpt = fileReaderDim1.value().GetNextElement(); + if (!this->workInRam && + fileReaderDim0Iterator != fileReaderDim0.value().end() && + fileReaderDim1Iterator != fileReaderDim1.value().end()) { + if (dim == 0) { + ++fileReaderDim0Iterator; + elementOpt = *fileReaderDim0Iterator; + } + if (dim == 1) { + ++fileReaderDim1Iterator; + elementOpt = *fileReaderDim1Iterator; + } } } } - if (!this->workInRam) { - fileReaderDim0.value().Close(); - fileReaderDim1.value().Close(); - } - // replace the placeholder (*otherDimension.smallSplit0)[0] = minSplit0OtherDim; (*otherDimension.smallSplit0)[1] = maxSplit0OtherDim; diff --git a/src/util/RtreeFileReader.cpp b/src/util/RtreeFileReader.cpp deleted file mode 100644 index e0340a520a..0000000000 --- a/src/util/RtreeFileReader.cpp +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright 2023, University of Freiburg, -// Chair of Algorithms and Data Structures. 
-// Author: Noah Nock - -#include -#include - -FileReader::FileReader(const std::string& filePath) { - this->filePath = filePath; - - this->file = std::ifstream(this->filePath, std::ios::binary); - this->file.seekg(0, std::ifstream::end); - this->fileLength = this->file.tellg(); - this->file.seekg(0, std::ifstream::beg); -} - -std::optional FileReader::GetNextElement() { - if (static_cast(this->file.tellg()) < this->fileLength) { - double minX; - double minY; - double maxX; - double maxY; - uint64_t id; - uint64_t orderX; - uint64_t orderY; - - this->file.read(reinterpret_cast(&minX), sizeof(double)); - this->file.read(reinterpret_cast(&minY), sizeof(double)); - this->file.read(reinterpret_cast(&maxX), sizeof(double)); - this->file.read(reinterpret_cast(&maxY), sizeof(double)); - this->file.read(reinterpret_cast(&id), sizeof(uint64_t)); - this->file.read(reinterpret_cast(&orderX), sizeof(uint64_t)); - this->file.read(reinterpret_cast(&orderY), sizeof(uint64_t)); - - Rtree::BoundingBox box = Rtree::createBoundingBox(minX, minY, maxX, maxY); - RTreeValueWithOrderIndex element = {box, id, orderX, orderY}; - - return {element}; - } else { - return {}; - } -} - -void FileReader::Close() { this->file.close(); } - -FileReaderWithoutIndex::FileReaderWithoutIndex(const std::string& filePath) { - this->filePath = filePath; - - this->file = std::ifstream(this->filePath, std::ios::binary); - this->file.seekg(0, std::ifstream::end); - this->fileLength = this->file.tellg(); - this->file.seekg(0, std::ifstream::beg); -} - -std::optional FileReaderWithoutIndex::GetNextElement() { - if (static_cast(this->file.tellg()) < this->fileLength) { - double minX; - double minY; - double maxX; - double maxY; - uint64_t id; - - this->file.read(reinterpret_cast(&minX), sizeof(double)); - this->file.read(reinterpret_cast(&minY), sizeof(double)); - this->file.read(reinterpret_cast(&maxX), sizeof(double)); - this->file.read(reinterpret_cast(&maxY), sizeof(double)); - 
this->file.read(reinterpret_cast(&id), sizeof(uint64_t)); - - Rtree::BoundingBox box = Rtree::createBoundingBox(minX, minY, maxX, maxY); - RTreeValue boxWithId = {box, id}; - - return {boxWithId}; - } else { - return {}; - } -} - -void FileReaderWithoutIndex::Close() { this->file.close(); } diff --git a/src/util/RtreeFileReader.h b/src/util/RtreeFileReader.h index a7eef8b00a..913263ce63 100644 --- a/src/util/RtreeFileReader.h +++ b/src/util/RtreeFileReader.h @@ -18,27 +18,116 @@ #include class FileReader { + public: + explicit FileReader(const std::string& filename) : file(filename) {} + + class iterator : public std::iterator { + public: + explicit iterator(std::ifstream& in) : input(in) { + ++(*this); // Read the first element + } + + iterator() : input(nullstream) {} // End iterator constructor + + iterator& operator++() { + double minX; + double minY; + double maxX; + double maxY; + uint64_t id; + uint64_t orderX; + uint64_t orderY; + if (input && input.read(reinterpret_cast(&minX), sizeof(double)) && + input.read(reinterpret_cast(&minY), sizeof(double)) && + input.read(reinterpret_cast(&maxX), sizeof(double)) && + input.read(reinterpret_cast(&maxY), sizeof(double)) && + input.read(reinterpret_cast(&id), sizeof(uint64_t)) && + input.read(reinterpret_cast(&orderX), sizeof(uint64_t)) && + input.read(reinterpret_cast(&orderY), sizeof(uint64_t))) { + Rtree::BoundingBox box = + Rtree::createBoundingBox(minX, minY, maxX, maxY); + currentElement = {{box, id}, orderX, orderY}; + valid = true; + } else { + valid = false; + } + return *this; + } + + const RTreeValueWithOrderIndex& operator*() const { return currentElement; } + + bool operator!=(const iterator& other) const { + return valid != other.valid; + } + + private: + std::ifstream& input; + std::ifstream nullstream; // A dummy stream for the end iterator + RTreeValueWithOrderIndex currentElement; + bool valid{}; + }; + + iterator begin() { return iterator(file); } + + static iterator end() { return {}; } + 
private: - std::string filePath; std::ifstream file; - uint64_t fileLength; - - public: - explicit FileReader(const std::string& filePath); - std::optional GetNextElement(); - void Close(); }; class FileReaderWithoutIndex { + public: + explicit FileReaderWithoutIndex(const std::string& filename) + : file(filename) {} + + class iterator : public std::iterator { + public: + explicit iterator(std::ifstream& in) : input(in) { + ++(*this); // Read the first element + } + + iterator() : input(nullstream) {} // End iterator constructor + + iterator& operator++() { + double minX; + double minY; + double maxX; + double maxY; + uint64_t id; + if (input && input.read(reinterpret_cast(&minX), sizeof(double)) && + input.read(reinterpret_cast(&minY), sizeof(double)) && + input.read(reinterpret_cast(&maxX), sizeof(double)) && + input.read(reinterpret_cast(&maxY), sizeof(double)) && + input.read(reinterpret_cast(&id), sizeof(uint64_t))) { + Rtree::BoundingBox box = + Rtree::createBoundingBox(minX, minY, maxX, maxY); + currentElement = {box, id}; + valid = true; + } else { + valid = false; + } + return *this; + } + + const RTreeValue& operator*() const { return currentElement; } + + bool operator!=(const iterator& other) const { + return valid != other.valid; + } + + private: + std::ifstream& input; + std::ifstream nullstream; // A dummy stream for the end iterator + RTreeValue currentElement; + bool valid{}; + }; + + iterator begin() { return iterator(file); } + + static iterator end() { return {}; } + private: - std::string filePath; std::ifstream file; - uint64_t fileLength; - - public: - explicit FileReaderWithoutIndex(const std::string& filePath); - std::optional GetNextElement(); - void Close(); }; #endif // QLEVER_RTREEFILEREADER_H From 76e634e3af1e24bfd8eb33e438cfd8d37409d63b Mon Sep 17 00:00:00 2001 From: Noah Nock Date: Fri, 17 Nov 2023 18:14:33 +0100 Subject: [PATCH 19/20] Imroved code quality and added first tests --- .../sparqlExpressions/ContainsExpression.cpp | 8 +- 
src/index/IndexImpl.cpp | 17 +- src/util/CMakeLists.txt | 12 +- src/util/Rtree.cpp | 116 +-- src/util/Rtree.h | 337 +------ src/util/RtreeBasicGeometry.h | 157 ++++ src/util/RtreeBuild.cpp | 855 +++++------------- src/util/RtreeFileReader.cpp | 114 +++ src/util/RtreeFileReader.h | 144 ++- src/util/RtreeNode.cpp | 61 ++ src/util/RtreeNode.h | 58 ++ src/util/RtreeSorter.cpp | 293 ++++++ src/util/RtreeSorter.h | 13 + test/CMakeLists.txt | 4 +- test/RtreeTest.cpp | 112 +++ 15 files changed, 1180 insertions(+), 1121 deletions(-) create mode 100644 src/util/RtreeBasicGeometry.h create mode 100644 src/util/RtreeFileReader.cpp create mode 100644 src/util/RtreeNode.cpp create mode 100644 src/util/RtreeNode.h create mode 100644 src/util/RtreeSorter.cpp create mode 100644 src/util/RtreeSorter.h create mode 100644 test/RtreeTest.cpp diff --git a/src/engine/sparqlExpressions/ContainsExpression.cpp b/src/engine/sparqlExpressions/ContainsExpression.cpp index 732bfc6285..9c76676b3b 100644 --- a/src/engine/sparqlExpressions/ContainsExpression.cpp +++ b/src/engine/sparqlExpressions/ContainsExpression.cpp @@ -4,7 +4,7 @@ #include "./ContainsExpression.h" -#include +#include #include "engine/sparqlExpressions/SparqlExpressionGenerators.h" #include "global/ValueIdComparators.h" @@ -14,9 +14,9 @@ using namespace std::literals; class BoundingBoxType { public: - Rtree::BoundingBox value{}; + BasicGeometry::BoundingBox value{}; - explicit BoundingBoxType(Rtree::BoundingBox boundingBox) { + explicit BoundingBoxType(BasicGeometry::BoundingBox boundingBox) { this->value = boundingBox; } }; @@ -95,7 +95,7 @@ ContainsExpression::ContainsExpression(SparqlExpression::Ptr child, maxY = std::stod(boundingBoxEntriesAsString[3]); boundingBox_ = - new BoundingBoxType(Rtree::createBoundingBox(minX, minY, maxX, maxY)); + new BoundingBoxType(BasicGeometry::CreateBoundingBox(minX, minY, maxX, maxY)); } catch (const std::invalid_argument& e) { errorMessage = e.what(); } catch (const std::out_of_range& e) { 
diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index e391c547ae..a6b6c46c6c 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -25,6 +25,7 @@ #include "util/HashMap.h" #include "util/Serializer/FileSerializer.h" #include "util/TupleHelpers.h" +#include "util/RtreeFileReader.h" using std::array; using namespace ad_utility::memory_literals; @@ -410,18 +411,18 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( auto internalVocabularyAction = [&wordWriter, &convertOfs]( const auto& word, const auto& index) { wordWriter.push(word.data(), word.size()); - std::optional boundingBox = - Rtree::ConvertWordToRtreeEntry(word); + std::optional boundingBox = + BasicGeometry::ConvertWordToRtreeEntry(word); if (boundingBox) { - Rtree::SaveEntry(boundingBox.value(), index, convertOfs); + FileReaderWithoutIndex::SaveEntry(boundingBox.value(), index, convertOfs); } }; auto externalVocabularyAction = [&convertOfs](const auto& word, const auto& index) { - std::optional boundingBox = - Rtree::ConvertWordToRtreeEntry(word); + std::optional boundingBox = + BasicGeometry::ConvertWordToRtreeEntry(word); if (boundingBox) { - Rtree::SaveEntry(boundingBox.value(), index, convertOfs); + FileReaderWithoutIndex::SaveEntry(boundingBox.value(), index, convertOfs); } }; @@ -445,13 +446,13 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( LOG(INFO) << "Building the Rtree..." << std::endl; try { Rtree rtree = Rtree(1300000000000); - rtree.BuildTree(onDiskBase_ + ".vocabulary", 16, "./rtree_build"); + rtree.BuildTree(onDiskBase_, ".vocabulary.boundingbox", 16, "./rtree_build"); LOG(INFO) << "Finished building the Rtree" << std::endl; } catch (const std::exception& e) { LOG(INFO) << e.what() << std::endl; } - res.idTriples = std::move(idTriples); + res.idTriples = std::move(*idTriples.wlock()); res.actualPartialSizes = std::move(actualPartialSizes); LOG(INFO) << "Removing temporary files ..." 
<< std::endl; diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index 11f79a694e..182cbec718 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -3,5 +3,13 @@ add_subdirectory(MemorySize) add_subdirectory(http) add_library(util GeoSparqlHelpers.cpp antlr/ANTLRErrorHandling.cpp ParseException.cpp Conversions.cpp Date.cpp antlr/GenerateAntlrExceptionMetadata.cpp - Rtree.h Rtree.cpp RtreeFileReader.h RtreeBuild.cpp) -qlever_target_link_libraries(util Boost::serialization) + Rtree.h + Rtree.cpp + RtreeBuild.cpp + RtreeFileReader.cpp + RtreeFileReader.h + RtreeBasicGeometry.h + RtreeNode.cpp + RtreeNode.h + RtreeSorter.cpp RtreeSorter.h) +qlever_target_link_libraries(util Boost::serialization ${STXXL_LIBRARIES}) diff --git a/src/util/Rtree.cpp b/src/util/Rtree.cpp index 837c4331af..eb805ffe78 100644 --- a/src/util/Rtree.cpp +++ b/src/util/Rtree.cpp @@ -2,74 +2,24 @@ // Chair of Algorithms and Data Structures. // Author: Noah Nock -#include +#include "./Rtree.h" +#include "./RtreeNode.h" +#include "./RtreeFileReader.h" -#include -#include - -bool intersects(const Rtree::BoundingBox& b1, const Rtree::BoundingBox& b2) { - /** - * Determine whether two bounding boxes intersect - */ - bool notIntersecting = b1.min_corner().get<0>() > b2.max_corner().get<0>() || - b2.min_corner().get<0>() > b1.max_corner().get<0>() || - b1.min_corner().get<1>() > b2.max_corner().get<1>() || - b2.min_corner().get<1>() > b1.max_corner().get<1>(); - - return !notIntersecting; -} - -Rtree::BoundingBox Rtree::createBoundingBox(double pointOneX, double pointOneY, - double pointTwoX, - double pointTwoY) { - return {{pointOneX, pointOneY}, {pointTwoX, pointTwoY}}; -} - -Rtree::BoundingBox Rtree::combineBoundingBoxes(Rtree::BoundingBox b1, - Rtree::BoundingBox b2) { - if (b1.min_corner().get<0>() == 0 && b1.min_corner().get<1>() == 0 && - b1.max_corner().get<0>() == 0 && b1.max_corner().get<1>() == 0) { - return b2; - } - if (b2.min_corner().get<0>() == 0 && 
b2.min_corner().get<1>() == 0 && - b2.max_corner().get<0>() == 0 && b2.max_corner().get<1>() == 0) { - return b1; - } - auto minX = [](Rtree::BoundingBox b) -> double { - return b.min_corner().get<0>(); - }; - auto minY = [](Rtree::BoundingBox b) -> double { - return b.min_corner().get<1>(); - }; - auto maxX = [](Rtree::BoundingBox b) -> double { - return b.max_corner().get<0>(); - }; - auto maxY = [](Rtree::BoundingBox b) -> double { - return b.max_corner().get<1>(); - }; - - double globalMinX = minX(b1) < minX(b2) ? minX(b1) : minX(b2); - double globalMinY = minY(b1) < minY(b2) ? minY(b1) : minY(b2); - double globalMaxX = maxX(b1) > maxX(b2) ? maxX(b1) : maxX(b2); - double globalMaxY = maxY(b1) > maxY(b2) ? maxY(b1) : maxY(b2); - - return {{globalMinX, globalMinY}, {globalMaxX, globalMaxY}}; -} - -multiBoxGeo Rtree::SearchTree(Rtree::BoundingBox query, +multiBoxGeo Rtree::SearchTree(BasicGeometry::BoundingBox query, const std::string& folder) { std::ifstream lookupIfs = std::ifstream(folder + "/lookup.bin", std::ios::binary); std::ifstream nodesIfs = std::ifstream(folder + "/nodes.bin", std::ios::binary); - Node rootNode = LoadNode(0, lookupIfs, nodesIfs); + RtreeNode rootNode = FileReader::LoadNode(0, lookupIfs, nodesIfs); multiBoxGeo results; - std::stack nodes; + std::stack nodes; nodes.push(rootNode); while (!nodes.empty()) { - Node currentNode = nodes.top(); + RtreeNode currentNode = nodes.top(); nodes.pop(); for (RTreeValue child : currentNode.GetChildren()) { @@ -77,7 +27,7 @@ multiBoxGeo Rtree::SearchTree(Rtree::BoundingBox query, if (currentNode.GetIsLastInnerNode()) { results.push_back(child); } else { - Node newNode = LoadNode(child.id, lookupIfs, nodesIfs); + RtreeNode newNode = FileReader::LoadNode(child.id, lookupIfs, nodesIfs); nodes.push(newNode); } } @@ -89,52 +39,6 @@ multiBoxGeo Rtree::SearchTree(Rtree::BoundingBox query, return results; } -uint64_t Node::GetId() const { return this->id; } - -Node::Node(uint64_t id, Rtree::BoundingBox 
boundingbox) { - this->id = id; - this->boundingBox = boundingbox; -} - -Node::Node(uint64_t id) { this->id = id; } - -Node::Node(uint64_t id, Rtree::BoundingBox boundingBox, multiBoxGeo& children, - bool isLastInnerNode) { - this->id = id; - this->boundingBox = boundingBox; - this->children = children; - this->isLastInnerNode = isLastInnerNode; -} - -Node::Node(uint64_t id, BoundingBox boundingbox, bool isLastInnerNode) { - this->id = id; - this->boundingBox = boundingbox; - this->isLastInnerNode = isLastInnerNode; -} - -Node::Node() {} - -bool Node::GetIsLastInnerNode() const { return this->isLastInnerNode; } - -multiBoxGeo Node::GetChildren() { return this->children; } - -Node Rtree::LoadNode(uint64_t id, std::ifstream& lookupIfs, - std::ifstream& nodesIfs) { - Node newNode; - - uint64_t offset = id * (uint64_t)sizeof(uint64_t); - lookupIfs.seekg(offset, std::ios::beg); - - uint64_t nodePtr; - lookupIfs.read(reinterpret_cast(&nodePtr), sizeof(uint64_t)); - - nodesIfs.seekg(nodePtr); - boost::archive::binary_iarchive ia(nodesIfs); - ia >> newNode; - - return newNode; -} - Rtree::Rtree(uintmax_t maxBuildingRamUsage) { - this->maxBuildingRamUsage = maxBuildingRamUsage; -} + this->maxBuildingRamUsage_ = maxBuildingRamUsage; +} \ No newline at end of file diff --git a/src/util/Rtree.h b/src/util/Rtree.h index 853ea7c8ce..7dee50708a 100644 --- a/src/util/Rtree.h +++ b/src/util/Rtree.h @@ -5,16 +5,17 @@ #ifndef QLEVER_RTREE_H #define QLEVER_RTREE_H -#include -#include +#ifndef EOF +#define EOF std::char_traits::eof() +#endif #include #include #include -#include +#include #include -#include +#include -namespace bg = boost::geometry; +#include "./RtreeBasicGeometry.h" // ___________________________________________________________________________ // Forward declaration @@ -23,117 +24,43 @@ struct RTreeValueWithOrderIndex; using multiBoxGeo = std::vector; using multiBoxWithOrderIndex = std::vector; struct SplitResult; -struct SplitBuffersRam; -struct SplitBuffersDisk; 
+struct SplitBuffers; // ___________________________________________________________________________ // Data type to store all the information of the rectangles (in ram or on disk) // + the small lists for one dimension struct RectanglesForOrderedBoxes { - std::shared_ptr rectanglesInRam; - std::string rectanglesOnDisk; - std::shared_ptr rectanglesSmall; -}; - -// ___________________________________________________________________________ -// Data structure representing a single node of the Rtree containing the -// boundingbox and its children -class Node { - protected: - typedef bg::model::point> - Point; - typedef bg::model::box BoundingBox; - friend class boost::serialization::access; - uint64_t id; - BoundingBox boundingBox{}; - bool isLastInnerNode = - false; // when true, this means that the node is the last inner node and - // all of its children are leafs - multiBoxGeo children; + std::variant rectangles; + multiBoxWithOrderIndex rectanglesSmall; - template - void serialize(Archive& a, [[maybe_unused]] const unsigned int version) { - a& id; - a& isLastInnerNode; - a& boundingBox; - a& children; + RectanglesForOrderedBoxes() { + rectangles = {}; + rectanglesSmall = multiBoxWithOrderIndex(); } - explicit Node(uint64_t id); - - public: - Node(); - Node(uint64_t id, BoundingBox boundingBox); - Node(uint64_t id, BoundingBox boundingBox, multiBoxGeo& children, - bool isLastInnerNode); - Node(uint64_t id, BoundingBox boundingbox, bool isLastInnerNode); - [[nodiscard]] uint64_t GetId() const; - [[nodiscard]] BoundingBox GetBoundingBox() const; - void AddChild(Node& child); - void SetIsLastInnerNode(bool isLast); - [[nodiscard]] bool GetIsLastInnerNode() const; - multiBoxGeo GetChildren(); + void Clear() { + rectanglesSmall = multiBoxWithOrderIndex(); + if (std::holds_alternative(rectangles)) { + rectangles = multiBoxWithOrderIndex(); + } + } }; -BOOST_CLASS_VERSION(Node, 1) - // ___________________________________________________________________________ // A Rtree 
based on bounding boxes and ids class Rtree { private: - // ___________________________________________________________________________ - // Save the current node in the building process to disk - static uint64_t SaveNode(Node& node, bool isLastInnerNode, - std::ofstream& nodesOfs); - // ___________________________________________________________________________ - // Load a specific Node to query in its children - static Node LoadNode(uint64_t id, std::ifstream& lookupIfs, - std::ifstream& nodesIfs); - uintmax_t maxBuildingRamUsage; + uintmax_t maxBuildingRamUsage_; public: - typedef bg::model::point> - Point; - typedef bg::model::box BoundingBox; // ___________________________________________________________________________ - // Build the whole Rtree with the raw data in onDiskBase + ".boundingbox.tmp", + // Build the whole Rtree with the raw data in onDiskBase + fileSuffix + ".tmp", // M as branching factor and folder as Rtree destination - void BuildTree(const std::string& onDiskBase, size_t M, + void BuildTree(const std::string& onDiskBase, const std::string& fileSuffix, size_t M, const std::string& folder) const; // ___________________________________________________________________________ // Search for an intersection of query with any elements of the Rtree - static multiBoxGeo SearchTree(BoundingBox query, const std::string& folder); - // ___________________________________________________________________________ - // Convert a single wkt literal to a datapoint in the format suitable for the - // Rtree - static std::optional ConvertWordToRtreeEntry( - const std::string& wkt); - // ___________________________________________________________________________ - // Save a single datapoint for the Rtree to disk - static void SaveEntry(BoundingBox boundingBox, uint64_t index, - std::ofstream& convertOfs); - // ___________________________________________________________________________ - // Save a single datapoint of the Rtree, together with its position in the x - 
// and y sorting to disk - static void SaveEntryWithOrderIndex(RTreeValueWithOrderIndex treeValue, - std::ofstream& convertOfs); - // ___________________________________________________________________________ - // Load all datapoints of the Rtree in file into ram - static multiBoxGeo LoadEntries(const std::string& file); - // ___________________________________________________________________________ - // Load all datapoints of the Rtree, together with its x and y sorting into - // ram - static multiBoxWithOrderIndex LoadEntriesWithOrderIndex( - const std::string& file); - // ___________________________________________________________________________ - // Create a bounding box, based on the corner coordinates - static BoundingBox createBoundingBox(double pointOneX, double pointOneY, - double pointTwoX, double pointTwoY); - // ___________________________________________________________________________ - // Take two bounding boxes and combine them into one bounding box containing - // both - static BoundingBox combineBoundingBoxes(Rtree::BoundingBox b1, - Rtree::BoundingBox b2); + static multiBoxGeo SearchTree(BasicGeometry::BoundingBox query, const std::string& folder); explicit Rtree(uintmax_t maxBuildingRamUsage); }; @@ -142,12 +69,12 @@ class Rtree { // direction (either on ram or on disk) class OrderedBoxes { private: - bool workInRam; - uint64_t size; - Rtree::BoundingBox boundingBox; + bool workInRam_{}; + uint64_t size_{}; + BasicGeometry::BoundingBox boundingBox_{}; RectanglesForOrderedBoxes - rectsD0; // the rectangles (datapoints) sorted in x direction - RectanglesForOrderedBoxes rectsD1; // the rectangles sorted in y direction + rectsD0_; // the rectangles (datapoints) sorted in x direction + RectanglesForOrderedBoxes rectsD1_; // the rectangles sorted in y direction // ___________________________________________________________________________ // Initiate the splitting of the rectangles in the best position (rectangles // are stored in ram) @@ -164,112 
+91,44 @@ class OrderedBoxes { SplitResult GetBestSplit(); // ___________________________________________________________________________ // Actually splitting the rectangles at the given split by splitResult - // (rectangles are stored in ram) - std::pair PerformSplit( - SplitResult splitResult, SplitBuffersRam& splitBuffersRam, size_t M, - size_t S); - // ___________________________________________________________________________ - // Actually splitting the rectangles at the given split by splitResult - // (rectangles are stored on disk) - std::pair PerformSplit( - SplitResult splitResult, SplitBuffersDisk& splitBuffers, size_t M, - size_t S, uint64_t maxBuildingRamUsage); + std::pair PerformSplit( + SplitResult splitResult, SplitBuffers& splitBuffers, size_t M, + size_t S, uint64_t maxBuildingRamUsage = 0); public: [[nodiscard]] bool WorkInRam() const; // ___________________________________________________________________________ // Set up the OrderedBoxes with the rectangles given as vectors stored in ram // and set workInRam to true - void SetOrderedBoxesToRam(RectanglesForOrderedBoxes& rectanglesD0, - RectanglesForOrderedBoxes& rectanglesD1, - Rtree::BoundingBox box); + void SetOrderedBoxesToRam(RectanglesForOrderedBoxes rectanglesD0, + RectanglesForOrderedBoxes rectanglesD1, + BasicGeometry::BoundingBox box); // ___________________________________________________________________________ // Set up the OrderedBoxes with the rectangles given as files stored on disk // and set workInRam to false - void SetOrderedBoxesToDisk(RectanglesForOrderedBoxes& rectanglesD0, - RectanglesForOrderedBoxes& rectanglesD1, - uint64_t size, Rtree::BoundingBox box); - Rtree::BoundingBox GetBoundingBox(); + void SetOrderedBoxesToDisk(RectanglesForOrderedBoxes rectanglesD0, + RectanglesForOrderedBoxes rectanglesD1, + uint64_t size, BasicGeometry::BoundingBox box); + BasicGeometry::BoundingBox GetBoundingBox(); [[nodiscard]] uint64_t GetSize() const; // 
___________________________________________________________________________ // Wrapper function to perform the whole process of splitting the rectangles // for either ram or disk case std::pair SplitAtBest( - const std::string& filePath, size_t S, size_t M, + const std::filesystem::path& filePath, size_t S, size_t M, uint64_t maxBuildingRamUsage); // ___________________________________________________________________________ // return the rectangles of the x sorting for the case where they are stored // in ram - std::shared_ptr GetRectanglesInRam(); + multiBoxWithOrderIndex GetRectanglesInRam(); // ___________________________________________________________________________ // return the rectangles of the x sorting for the case where they are stored // on disk - std::string GetRectanglesOnDisk(); -}; - -// ___________________________________________________________________________ -// Subclass of the Node only needed while constructing the Rtree (it keeps track -// of the remaining OrderedBoxes of the subtree) -class ConstructionNode : public Node { - private: - OrderedBoxes orderedBoxes; + std::filesystem::path GetRectanglesOnDisk(); - public: - ConstructionNode(uint64_t id, OrderedBoxes orderedBoxes); - OrderedBoxes GetOrderedBoxes(); - void AddChildrenToItem(); -}; - -namespace boost::serialization { -template -void save(Archive& a, const Rtree::BoundingBox& b, - [[maybe_unused]] unsigned int version) { - a << b.min_corner().get<0>(); - a << b.min_corner().get<1>(); - a << b.max_corner().get<0>(); - a << b.max_corner().get<1>(); -} -template -void load(Archive& a, Rtree::BoundingBox& b, - [[maybe_unused]] unsigned int version) { - double minX = 0; - a >> minX; - double minY = 0; - a >> minY; - double maxX = 0; - a >> maxX; - double maxY = 0; - a >> maxY; - b = Rtree::BoundingBox(Rtree::Point(minX, minY), Rtree::Point(maxX, maxY)); -} -} // namespace boost::serialization -BOOST_SERIALIZATION_SPLIT_FREE(Rtree::BoundingBox); - -// 
___________________________________________________________________________ -// Data type for a value of the Rtree, which contains the id of the object and -// its bounding box. -struct RTreeValue { - Rtree::BoundingBox box{}; - uint64_t id = 0; - double MinX() const { return box.min_corner().get<0>(); } - double MaxX() const { return box.max_corner().get<0>(); } - double MinY() const { return box.min_corner().get<1>(); } - double MaxY() const { return box.max_corner().get<1>(); } - - template - void serialize(Archive& a, [[maybe_unused]] const unsigned int version) { - a& box; - a& id; - } -}; - -// ___________________________________________________________________________ -// Data type for a value of the Rtree (id and boundingbox), with the addtional -// information of its position in the x- and y-sorting. This is only used to -// create the Rtree in a more efficient way -struct RTreeValueWithOrderIndex : RTreeValue { - uint64_t orderX = 0; - uint64_t orderY = 0; + // ___________________________________________________________________________ + // Clear all content of the OrderedBoxes + void Clear(); }; // ___________________________________________________________________________ @@ -285,109 +144,11 @@ struct SplitResult { RTreeValueWithOrderIndex bestMaxElement; }; -// ___________________________________________________________________________ -// A Buffer data structure, containing vectors for the result of a split, while -// doing it in ram -struct SplitBuffersRam { - std::shared_ptr s0Dim0 = - std::make_shared(); - std::shared_ptr s0Dim1 = - std::make_shared(); - std::shared_ptr s1Dim0 = - std::make_shared(); - std::shared_ptr s1Dim1 = - std::make_shared(); - - std::shared_ptr s0SmallDim0 = - std::make_shared(); - std::shared_ptr s0SmallDim1 = - std::make_shared(); - std::shared_ptr s1SmallDim0 = - std::make_shared(); - std::shared_ptr s1SmallDim1 = - std::make_shared(); -}; - -// ___________________________________________________________________________ 
-// A Buffer data structure, containing the write streams for the result of a -// split, while doing it on disk -struct SplitBuffersDisk { - SplitBuffersRam splitBuffersRam; - std::optional split0Dim0File; - std::optional split0Dim1File; - std::optional split1Dim0File; - std::optional split1Dim1File; -}; - -template -struct SortRuleLambda { - // comparison function - bool operator()(const RTreeValue& b1, const RTreeValue& b2) const { - double center1 = dimension == 0 ? std::midpoint(b1.MinX(), b1.MaxX()) - : std::midpoint(b1.MinY(), b1.MaxY()); - double center2 = dimension == 0 ? std::midpoint(b2.MinX(), b2.MaxX()) - : std::midpoint(b2.MinY(), b2.MaxY()); - return center1 < center2; - } - - // Value that is strictly smaller than any input element. - static RTreeValue min_value() { - return {Rtree::createBoundingBox(-std::numeric_limits::max(), - -std::numeric_limits::max(), - -std::numeric_limits::max(), - -std::numeric_limits::max()), - 0}; - } - - // Value that is strictly larger than any input element. - static RTreeValue max_value() { - return {Rtree::createBoundingBox(std::numeric_limits::max(), - std::numeric_limits::max(), - std::numeric_limits::max(), - std::numeric_limits::max()), - 0}; - } -}; - -template -struct SortRuleLambdaWithIndex { - uint64_t RTreeValueWithOrderIndex::*orderSelected = - dimension == 0 ? &RTreeValueWithOrderIndex::orderX - : &RTreeValueWithOrderIndex::orderY; - - // comparison function - bool operator()(const RTreeValueWithOrderIndex& b1, - const RTreeValueWithOrderIndex& b2) const { - double center1 = dimension == 0 ? std::midpoint(b1.MinX(), b1.MaxX()) - : std::midpoint(b1.MinY(), b1.MaxY()); - double center2 = dimension == 0 ? std::midpoint(b2.MinX(), b2.MaxX()) - : std::midpoint(b2.MinY(), b2.MaxY()); - - if (b1.*orderSelected == b2.*orderSelected) return center1 < center2; - return b1.*orderSelected < b2.*orderSelected; - } - - // Value that is strictly smaller than any input element. 
- static RTreeValueWithOrderIndex min_value() { - return {{Rtree::createBoundingBox(-std::numeric_limits::max(), - -std::numeric_limits::max(), - -std::numeric_limits::max(), - -std::numeric_limits::max()), - 0}, - 0, - 0}; - } - - // Value that is strictly larger than any input element. - static RTreeValueWithOrderIndex max_value() { - return {{Rtree::createBoundingBox(std::numeric_limits::max(), - std::numeric_limits::max(), - std::numeric_limits::max(), - std::numeric_limits::max()), - 0}, - std::numeric_limits::max(), - std::numeric_limits::max()}; - } +struct SplitBuffers { + RectanglesForOrderedBoxes& rectsD0Split0; + RectanglesForOrderedBoxes& rectsD1Split0; + RectanglesForOrderedBoxes& rectsD0Split1; + RectanglesForOrderedBoxes& rectsD1Split1; }; -#endif // QLEVER_RTREE_H +#endif // QLEVER_RTREE_H \ No newline at end of file diff --git a/src/util/RtreeBasicGeometry.h b/src/util/RtreeBasicGeometry.h new file mode 100644 index 0000000000..5b71038bc9 --- /dev/null +++ b/src/util/RtreeBasicGeometry.h @@ -0,0 +1,157 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Noah Nock + + +#ifndef QLEVER_RTREEBASICGEOMETRY_H +#define QLEVER_RTREEBASICGEOMETRY_H + +#include "./Rtree.h" +#include +#include +#include "ctre/ctre.h" + +class BasicGeometry { + public: + typedef boost::geometry::model::point> + Point; + typedef boost::geometry::model::box BoundingBox; + + static double GetMinX(BoundingBox boundingBox) { return boundingBox.min_corner().get<0>(); } + static double GetMinY(BoundingBox boundingBox) { return boundingBox.min_corner().get<1>(); } + static double GetMaxX(BoundingBox boundingBox) { return boundingBox.max_corner().get<0>(); } + static double GetMaxY(BoundingBox boundingBox) { return boundingBox.max_corner().get<1>(); } + + // ___________________________________________________________________________ + // Create a bounding box, based on the corner coordinates + static BasicGeometry::BoundingBox CreateBoundingBox(double pointOneX, double pointOneY, + double pointTwoX, + double pointTwoY) { + return {{pointOneX, pointOneY}, {pointTwoX, pointTwoY}}; + } + + // ___________________________________________________________________________ + // Take two bounding boxes and combine them into one bounding box containing + // both + static BasicGeometry::BoundingBox CombineBoundingBoxes(const BasicGeometry::BoundingBox& b1, + const BasicGeometry::BoundingBox& b2) { + auto minX = [](BasicGeometry::BoundingBox b) -> double { + return b.min_corner().get<0>(); + }; + auto minY = [](BasicGeometry::BoundingBox b) -> double { + return b.min_corner().get<1>(); + }; + auto maxX = [](BasicGeometry::BoundingBox b) -> double { + return b.max_corner().get<0>(); + }; + auto maxY = [](BasicGeometry::BoundingBox b) -> double { + return b.max_corner().get<1>(); + }; + + double globalMinX = minX(b1) < minX(b2) ? minX(b1) : minX(b2); + double globalMinY = minY(b1) < minY(b2) ? minY(b1) : minY(b2); + double globalMaxX = maxX(b1) > maxX(b2) ? maxX(b1) : maxX(b2); + double globalMaxY = maxY(b1) > maxY(b2) ? 
maxY(b1) : maxY(b2); + + return {{globalMinX, globalMinY}, {globalMaxX, globalMaxY}}; + } + + static bool IsBorderOfSplitCandidate(uint64_t current, uint64_t splitSize, + uint64_t M) { + if (((current + 1) % splitSize == 0 && (current + 1) / splitSize < M) || + (current % splitSize == 0 && current / splitSize >= 1)) + return true; + return false; + } + + // ___________________________________________________________________________ + // Convert a single wkt literal to a datapoint in the format suitable for the + // Rtree + static std::optional ConvertWordToRtreeEntry( + const std::string& wkt) { + /** + * Convert a single wkt literal to a boundingbox. + * Get the bounding box of either a multipolygon, polygon or a linestring + */ +if (!wkt.starts_with("\"MULTIPOLYGON") && !wkt.starts_with("\"POLYGON") && + !wkt.starts_with("\"LINESTRING")) { + return {}; +} + +double maxDouble = std::numeric_limits::max(); + +double minX = maxDouble; +double maxX = -maxDouble; +double minY = maxDouble; +double maxY = -maxDouble; + +// Iterate over matches and capture x and y coordinates +for (auto match : + ctre::range( + wkt)) { + double x = std::stod(std::string(match.get<1>())); + double y = std::stod(std::string(match.get<2>())); + + if (x < minX) minX = x; + if (x > maxX) maxX = x; + if (y < minY) minY = y; + if (y > maxY) maxY = y; +} + +return {BasicGeometry::CreateBoundingBox(minX, minY, maxX, maxY)}; +} +}; + +// ___________________________________________________________________________ +// Data type for a value of the Rtree, which contains the id of the object and +// its bounding box. 
+struct RTreeValue { + BasicGeometry::BoundingBox box{}; + uint64_t id = 0; + [[nodiscard]] double MinX() const { return box.min_corner().get<0>(); } + [[nodiscard]] double MaxX() const { return box.max_corner().get<0>(); } + [[nodiscard]] double MinY() const { return box.min_corner().get<1>(); } + [[nodiscard]] double MaxY() const { return box.max_corner().get<1>(); } + + template + void serialize(Archive& a, [[maybe_unused]] const unsigned int version) { + a& box; + a& id; + } +}; + +// ___________________________________________________________________________ +// Data type for a value of the Rtree (id and boundingbox), with the additional +// information of its position in the x- and y-sorting. This is only used to +// create the Rtree in a more efficient way +struct RTreeValueWithOrderIndex : RTreeValue { + uint64_t orderX = 0; + uint64_t orderY = 0; +}; + +namespace boost::serialization { +template +void save(Archive& a, const BasicGeometry::BoundingBox& b, + [[maybe_unused]] unsigned int version) { + a << b.min_corner().get<0>(); + a << b.min_corner().get<1>(); + a << b.max_corner().get<0>(); + a << b.max_corner().get<1>(); +} +template +void load(Archive& a, BasicGeometry::BoundingBox& b, + [[maybe_unused]] unsigned int version) { + double minX = 0; + a >> minX; + double minY = 0; + a >> minY; + double maxX = 0; + a >> maxX; + double maxY = 0; + a >> maxY; + b = BasicGeometry::BoundingBox(BasicGeometry::Point(minX, minY), BasicGeometry::Point(maxX, maxY)); +} +} // namespace boost::serialization +BOOST_SERIALIZATION_SPLIT_FREE(BasicGeometry::BoundingBox); + +#endif //QLEVER_RTREEBASICGEOMETRY_H diff --git a/src/util/RtreeBuild.cpp b/src/util/RtreeBuild.cpp index efe8ef01fe..f668ac93ff 100644 --- a/src/util/RtreeBuild.cpp +++ b/src/util/RtreeBuild.cpp @@ -1,245 +1,25 @@ // Copyright 2023, University of Freiburg, // Chair of Algorithms and Data Structures.
// Author: Noah Nock -#include -#include -#include +#include "./Rtree.h" +#include "./RtreeFileReader.h" +#include "./RtreeSorter.h" -#include -#include - -#include "ctre/ctre.h" - -static bool isBorderOfSplitCandidate(uint64_t current, uint64_t splitSize, - uint64_t M) { - if (((current + 1) % splitSize == 0 && (current + 1) / splitSize < M) || - (current % splitSize == 0 && current / splitSize >= 1)) - return true; - return false; -} - -static void centerOrdering(multiBoxGeo& boxes, size_t dim) { - if (dim == 0) { - // order by centerX - std::sort(boxes.begin(), boxes.end(), SortRuleLambda<0>{}); - } else { - // order by centerY - std::sort(boxes.begin(), boxes.end(), SortRuleLambda<1>{}); - } -} - -static void centerOrdering(multiBoxWithOrderIndex& boxes, size_t dim) { - if (dim == 0) { - // order by centerX - std::sort(boxes.begin(), boxes.end(), SortRuleLambdaWithIndex<0>{}); - } else { - // order by centerY - std::sort(boxes.begin(), boxes.end(), SortRuleLambdaWithIndex<1>{}); - } -} - -OrderedBoxes SortInput(const std::string& onDiskBase, size_t M, - uintmax_t maxBuildingRamUsage, bool workInRam) { - OrderedBoxes orderedInputRectangles; - - auto maxRamForSorter = - std::ceil((maxBuildingRamUsage < 9999999999.0 ? 
maxBuildingRamUsage - : 9999999999.0) / - 3.0); - ad_utility::BackgroundStxxlSorter> - sorterRectsD0Basic = - ad_utility::BackgroundStxxlSorter>( - maxRamForSorter); - multiBoxGeo rectsD0Basic; - - if (workInRam) { - rectsD0Basic = Rtree::LoadEntries(onDiskBase + ".boundingbox.tmp"); - centerOrdering(rectsD0Basic, 0); - } else { - for (const RTreeValue& rectD0Element : - FileReaderWithoutIndex(onDiskBase + ".boundingbox.tmp")) { - sorterRectsD0Basic.push(rectD0Element); - } - } - - uint64_t xSize = 0; - Rtree::BoundingBox boundingBox = Rtree::createBoundingBox(0, 0, 0, 0); - - ad_utility::BackgroundStxxlSorter> - sorterRectsD1 = - ad_utility::BackgroundStxxlSorter>( - maxRamForSorter); - std::shared_ptr RectanglesD1WithOrder = - std::make_shared(); - - if (workInRam) { - for (RTreeValue element : rectsD0Basic) { - RTreeValueWithOrderIndex entry = {{element.box, element.id}, xSize, 0}; - RectanglesD1WithOrder->push_back(entry); - xSize++; - - boundingBox = Rtree::combineBoundingBoxes(boundingBox, element.box); - } - centerOrdering(*RectanglesD1WithOrder, 1); - } else { - for (RTreeValue element : sorterRectsD0Basic.sortedView()) { - RTreeValueWithOrderIndex entry = {{element.box, element.id}, xSize, 0}; - sorterRectsD1.push(entry); - xSize++; - - boundingBox = Rtree::combineBoundingBoxes(boundingBox, element.box); - } - } - sorterRectsD0Basic.clear(); - - size_t currentS = std::ceil(((float)xSize) / ((float)M)); - - uint64_t ySize = 0; - std::ofstream r1File = - std::ofstream(onDiskBase + ".boundingbox.d1.tmp", std::ios::binary); - ad_utility::BackgroundStxxlSorter> - sorterRectsD0 = - ad_utility::BackgroundStxxlSorter>( - maxRamForSorter); - std::shared_ptr RectanglesD0WithOrder = - std::make_shared(); - std::shared_ptr r1Small = - std::make_shared(); - // placeholder - r1Small->push_back(RTreeValueWithOrderIndex()); - r1Small->push_back(RTreeValueWithOrderIndex()); - RTreeValueWithOrderIndex minD1; - RTreeValueWithOrderIndex maxD1; - - auto processD1Element = 
[&ySize, currentS, M, &r1Small, &minD1, - &maxD1](RTreeValueWithOrderIndex& element) { - element.orderY = ySize; - - if (isBorderOfSplitCandidate(ySize, currentS, M)) { - // index i * S - 1 or i * S - r1Small->push_back(element); - } - - if (ySize == 0) { - minD1 = element; - } - - maxD1 = element; - - ySize++; - }; - - if (workInRam) { - for (RTreeValueWithOrderIndex element : *RectanglesD1WithOrder) { - processD1Element(element); - - RectanglesD0WithOrder->push_back(element); - } - centerOrdering(*RectanglesD0WithOrder, 0); - } else { - for (RTreeValueWithOrderIndex element : sorterRectsD1.sortedView()) { - processD1Element(element); - - Rtree::SaveEntryWithOrderIndex(element, r1File); - sorterRectsD0.push(element); - } - } - - r1File.close(); - sorterRectsD1.clear(); - - // replace the placeholder - (*r1Small)[0] = minD1; - (*r1Small)[1] = maxD1; - - uint64_t currentX = 0; - std::ofstream r0File = - std::ofstream(onDiskBase + ".boundingbox.d0.tmp", std::ios::binary); - std::shared_ptr r0Small = - std::make_shared(); - // placeholder - r0Small->push_back(RTreeValueWithOrderIndex()); - r0Small->push_back(RTreeValueWithOrderIndex()); - RTreeValueWithOrderIndex minD0; - RTreeValueWithOrderIndex maxD0; - - auto processD0Element = [&currentX, currentS, M, &r0Small, &minD0, - &maxD0](RTreeValueWithOrderIndex& element) { - if (isBorderOfSplitCandidate(currentX, currentS, M)) { - // index i * S - 1 or i * S - r0Small->push_back(element); - } - - if (currentX == 0) { - minD0 = element; - } - maxD0 = element; - - currentX++; - }; - - if (workInRam) { - for (RTreeValueWithOrderIndex element : *RectanglesD0WithOrder) { - processD0Element(element); - } - } else { - for (RTreeValueWithOrderIndex element : sorterRectsD0.sortedView()) { - Rtree::SaveEntryWithOrderIndex(element, r0File); - - processD0Element(element); - } - } - - r0File.close(); - sorterRectsD0.clear(); - - // replace the placeholder - (*r0Small)[0] = minD0; - (*r0Small)[1] = maxD0; - - RectanglesForOrderedBoxes
rectsD0; - RectanglesForOrderedBoxes rectsD1; - rectsD0.rectanglesSmall = r0Small; - rectsD1.rectanglesSmall = r1Small; - if (workInRam) { - rectsD0.rectanglesInRam = RectanglesD0WithOrder; - rectsD1.rectanglesInRam = RectanglesD1WithOrder; - orderedInputRectangles.SetOrderedBoxesToRam(rectsD0, rectsD1, boundingBox); - } else { - rectsD0.rectanglesOnDisk = onDiskBase + ".boundingbox.d0"; - rectsD1.rectanglesOnDisk = onDiskBase + ".boundingbox.d1"; - orderedInputRectangles.SetOrderedBoxesToDisk(rectsD0, rectsD1, xSize, - boundingBox); - } - return orderedInputRectangles; -} - -/*OrderedBoxes SortInput(const std::string& onDiskBase, size_t M, uintmax_t -maxBuildingRamUsage, bool workInRam) { if (workInRam) { return -InternalSort(onDiskBase, M); } else { return ExternalSort(onDiskBase, M, -maxBuildingRamUsage); - } -}*/ - -static double costFunctionTGS(Rtree::BoundingBox& b0, Rtree::BoundingBox& b1, +static double costFunctionTGS(BasicGeometry::BoundingBox& b0, BasicGeometry::BoundingBox& b1, size_t dim) { /** - * The cost function determines the quality of a split. The lower the cost, - * the better the split. Each split gets represented by the resulting bounding - * boxes of the split pieces. + * The cost function determines the quality of a split. The lower the cost, + * the better the split. Each split gets represented by the resulting bounding + * boxes of the split pieces. */ double cost; // The cost represents the overlap of the two boxes if (dim == 0) { - cost = b0.max_corner().get<0>() - b1.min_corner().get<0>(); + cost = BasicGeometry::GetMaxX(b0) - BasicGeometry::GetMinX(b1); cost = cost < 0 ? 0 : cost; } else { - cost = b0.max_corner().get<1>() - b1.min_corner().get<1>(); + cost = BasicGeometry::GetMaxY(b0) - BasicGeometry::GetMinY(b1); cost = cost < 0 ? 
0 : cost; } @@ -247,30 +27,31 @@ static double costFunctionTGS(Rtree::BoundingBox& b0, Rtree::BoundingBox& b1, } static std::vector TGSRecursive( - const std::string& filePath, OrderedBoxes orderedInputRectangles, size_t M, + const std::string& filePath, OrderedBoxes* orderedInputRectangles, size_t M, size_t S, uint64_t maxBuildingRamUsage) { /** - * This function recursively constructs one layer of children for a certain - * root node. The input rectangles must be sorted in both x- and y-direction. - * The algorithm is based on this paper - * https://dl.acm.org/doi/pdf/10.1145/288692.288723 + * This function recursively constructs one layer of children for a certain + * root node. The input rectangles must be sorted in both x- and y-direction. + * The algorithm is based on this paper + * https://dl.acm.org/doi/pdf/10.1145/288692.288723 */ - uint64_t n = orderedInputRectangles.GetSize(); + uint64_t n = orderedInputRectangles->GetSize(); - if (n <= S || n <= M) { + if (n <= S) { // stop condition - return std::vector{orderedInputRectangles}; + return std::vector{*orderedInputRectangles}; } + // split the rectangles at the best split std::pair split = - orderedInputRectangles.SplitAtBest(filePath, S, M, maxBuildingRamUsage); + orderedInputRectangles->SplitAtBest(filePath, S, M, maxBuildingRamUsage); // recursion std::vector result0 = - TGSRecursive(filePath + ".0", split.first, M, S, maxBuildingRamUsage); + TGSRecursive(filePath + ".0", &split.first, M, S, maxBuildingRamUsage); std::vector result1 = - TGSRecursive(filePath + ".1", split.second, M, S, maxBuildingRamUsage); + TGSRecursive(filePath + ".1", &split.second, M, S, maxBuildingRamUsage); std::vector result; result.insert(result.begin(), result0.begin(), result0.end()); @@ -279,15 +60,9 @@ static std::vector TGSRecursive( return result; } -void Rtree::BuildTree(const std::string& onDiskBase, size_t M, +void Rtree::BuildTree(const std::string& onDiskBase, const std::string& fileSuffix, size_t M, const 
std::string& folder) const { - const std::string file = onDiskBase + ".boundingbox.tmp"; - - // prepare the files - std::filesystem::create_directory(folder); - std::ofstream nodesOfs = - std::ofstream(folder + "/nodes.bin", std::ios::binary); - std::map lookup; + const std::filesystem::path file = onDiskBase + fileSuffix + ".tmp"; // sort the rectangles uint64_t fileLines = @@ -296,10 +71,17 @@ void Rtree::BuildTree(const std::string& onDiskBase, size_t M, bool workInRam = (std::filesystem::file_size(file) + fileLines * 2 * sizeof(uint64_t)) * 4 < - this->maxBuildingRamUsage; + this->maxBuildingRamUsage_; OrderedBoxes orderedInputRectangles = - SortInput(onDiskBase, M, maxBuildingRamUsage, workInRam); + SortInput(onDiskBase, fileSuffix, M, maxBuildingRamUsage_, workInRam); + std::cout << "Finished initial sorting" << std::endl; + + // prepare the files + std::filesystem::create_directory(folder); + std::ofstream nodesOfs = + std::ofstream(folder + "/nodes.bin", std::ios::binary); + std::map lookup; // build the tree in a depth first approach std::stack layerStack; @@ -307,6 +89,8 @@ void Rtree::BuildTree(const std::string& onDiskBase, size_t M, uint64_t newId = 1; // start from 1, because 0 is the root item ConstructionNode rootItem = ConstructionNode(0, orderedInputRectangles); layerStack.push(rootItem); + orderedInputRectangles.Clear(); + rootItem.GetOrderedBoxes().Clear(); size_t layer = 0; while (!layerStack.empty()) { @@ -316,15 +100,19 @@ void Rtree::BuildTree(const std::string& onDiskBase, size_t M, if (currentItem.GetOrderedBoxes().GetSize() <= M) { // reached a leaf currentItem.AddChildrenToItem(); - uint64_t nodePtr = SaveNode(currentItem, true, nodesOfs); + currentItem.SetIsLastInnerNode(true); + uint64_t nodePtr = FileReader::SaveNode(currentItem, nodesOfs); lookup[currentItem.GetId()] = nodePtr; } else { + size_t S = std::ceil(((double)currentItem.GetOrderedBoxes().GetSize()) / ((double)M)); + if (currentItem.GetOrderedBoxes().GetSize() <= M * M) { + 
// in this case S can be just M + S = M; + } std::vector tgsResult = TGSRecursive( - onDiskBase + ".boundingbox." + std::to_string(layer), - currentItem.GetOrderedBoxes(), M, - std::ceil(((float)currentItem.GetOrderedBoxes().GetSize()) / - ((float)M)), - this->maxBuildingRamUsage); + onDiskBase + fileSuffix + "." + std::to_string(layer), + &currentItem.GetOrderedBoxes(), M, S, + this->maxBuildingRamUsage_); for (OrderedBoxes& currentOrderedRectangles : tgsResult) { ConstructionNode newItem = ConstructionNode(newId, currentOrderedRectangles); @@ -335,7 +123,7 @@ void Rtree::BuildTree(const std::string& onDiskBase, size_t M, newId++; } - uint64_t nodePtr = SaveNode(currentItem, false, nodesOfs); + uint64_t nodePtr = FileReader::SaveNode(currentItem, nodesOfs); lookup[currentItem.GetId()] = nodePtr; } layer++; @@ -350,200 +138,52 @@ void Rtree::BuildTree(const std::string& onDiskBase, size_t M, lookupOfs.close(); } -ConstructionNode::ConstructionNode(uint64_t id, OrderedBoxes orderedBoxes) - : Node{id} { - this->orderedBoxes = orderedBoxes; - - // calculate the boundingBoxes - this->boundingBox = orderedBoxes.GetBoundingBox(); -} - -void ConstructionNode::AddChildrenToItem() { - /** - * Add all children of a certain node at once. - * This is used when a leaf node is reached.
- */ - if (this->GetOrderedBoxes().WorkInRam()) { - for (RTreeValueWithOrderIndex box : - *this->GetOrderedBoxes().GetRectanglesInRam()) { - Node leafNode = Node(box.id, box.box); - this->AddChild(leafNode); - } - } else { - for (const RTreeValueWithOrderIndex& element : - FileReader(this->GetOrderedBoxes().GetRectanglesOnDisk())) { - Node leafNode = Node(element.id, element.box); - this->AddChild(leafNode); - } - } -} - -OrderedBoxes ConstructionNode::GetOrderedBoxes() { return this->orderedBoxes; } - -void Node::AddChild(Node& child) { - Rtree::BoundingBox box = child.GetBoundingBox(); - uint64_t entryId = child.GetId(); - RTreeValue entry = {box, entryId}; - this->children.push_back(entry); -} - -Rtree::BoundingBox Node::GetBoundingBox() const { return this->boundingBox; } - -void Node::SetIsLastInnerNode(bool isLast) { this->isLastInnerNode = isLast; } - -uint64_t Rtree::SaveNode(Node& node, bool isLastInnerNode, - std::ofstream& nodesOfs) { - node.SetIsLastInnerNode(isLastInnerNode); - - uint64_t pos = static_cast(nodesOfs.tellp()); - boost::archive::binary_oarchive archive(nodesOfs); - archive << node; - nodesOfs.write(" ", 1); - - return pos; -} - -std::optional GetBoundingBoxFromWKT( - const std::string& wkt) { - /** - * Parse the wkt literal in a way, that only the relevant data for the rtree - * gets read in. 
- */ - double maxDouble = std::numeric_limits::max(); - - double minX = maxDouble; - double maxX = -maxDouble; - double minY = maxDouble; - double maxY = -maxDouble; - - // Iterate over matches and capture x and y coordinates - for (auto match : - ctre::range( - wkt)) { - double x = std::stod(std::string(match.get<1>())); - double y = std::stod(std::string(match.get<2>())); - - if (x < minX) minX = x; - if (x > maxX) maxX = x; - if (y < minY) minY = y; - if (y > maxY) maxY = y; - } - - return {Rtree::createBoundingBox(minX, minY, maxX, maxY)}; -} - -std::optional Rtree::ConvertWordToRtreeEntry( - const std::string& wkt) { - /* - * Convert a single wkt literal to a boundingbox. - * Get the bounding box(es) of either a multipolygon, polygon or a linestring - */ - - if (wkt.starts_with("\"MULTIPOLYGON") || wkt.starts_with("\"POLYGON") || - wkt.starts_with("\"LINESTRING")) { - return GetBoundingBoxFromWKT(wkt); - } - - return {}; -} - -void Rtree::SaveEntry(Rtree::BoundingBox boundingBox, uint64_t index, - std::ofstream& convertOfs) { - /** - * Save a single entry (which was e.g. 
converted by ConvertWordToRtreeEntry) - * to the disk - */ - double minX = boundingBox.min_corner().get<0>(); - double minY = boundingBox.min_corner().get<1>(); - double maxX = boundingBox.max_corner().get<0>(); - double maxY = boundingBox.max_corner().get<1>(); - - convertOfs.write(reinterpret_cast(&minX), sizeof(double)); - convertOfs.write(reinterpret_cast(&minY), sizeof(double)); - convertOfs.write(reinterpret_cast(&maxX), sizeof(double)); - convertOfs.write(reinterpret_cast(&maxY), sizeof(double)); - convertOfs.write(reinterpret_cast(&index), sizeof(uint64_t)); -} +bool OrderedBoxes::WorkInRam() const { return this->workInRam_; } -void Rtree::SaveEntryWithOrderIndex(RTreeValueWithOrderIndex treeValue, - std::ofstream& convertOfs) { - /** - * Save a single entry, containing its postion in the x- and y-sorting - */ - double minX = treeValue.MinX(); - double minY = treeValue.MinY(); - double maxX = treeValue.MaxX(); - double maxY = treeValue.MaxY(); - - convertOfs.write(reinterpret_cast(&minX), sizeof(double)); - convertOfs.write(reinterpret_cast(&minY), sizeof(double)); - convertOfs.write(reinterpret_cast(&maxX), sizeof(double)); - convertOfs.write(reinterpret_cast(&maxY), sizeof(double)); - convertOfs.write(reinterpret_cast(&treeValue.id), - sizeof(uint64_t)); - convertOfs.write(reinterpret_cast(&treeValue.orderX), - sizeof(uint64_t)); - convertOfs.write(reinterpret_cast(&treeValue.orderY), - sizeof(uint64_t)); +void OrderedBoxes::Clear() { + size_ = 0; + boundingBox_ = BasicGeometry::CreateBoundingBox(0, 0, 0, 0); + rectsD0_.Clear(); + rectsD1_.Clear(); } -multiBoxGeo Rtree::LoadEntries(const std::string& file) { - multiBoxGeo boxes; - - for (const RTreeValue& element : FileReaderWithoutIndex(file)) { - boxes.push_back(element); - } - - return boxes; -} - -multiBoxWithOrderIndex Rtree::LoadEntriesWithOrderIndex( - const std::string& file) { - multiBoxWithOrderIndex boxes; - - for (const RTreeValueWithOrderIndex& element : FileReader(file)) { - 
boxes.push_back(element); - } - - return boxes; -} - -bool OrderedBoxes::WorkInRam() const { return this->workInRam; } - -void OrderedBoxes::SetOrderedBoxesToRam(RectanglesForOrderedBoxes& rectanglesD0, - RectanglesForOrderedBoxes& rectanglesD1, - Rtree::BoundingBox box) { - SetOrderedBoxesToDisk(rectanglesD0, rectanglesD1, - (*rectsD0.rectanglesInRam).size(), box); +void OrderedBoxes::SetOrderedBoxesToRam(RectanglesForOrderedBoxes rectanglesD0, + RectanglesForOrderedBoxes rectanglesD1, + BasicGeometry::BoundingBox box) { + this->workInRam_ = true; + this->rectsD0_ = std::move(rectanglesD0); + this->rectsD1_ = std::move(rectanglesD1); + this->size_ = std::get(this->rectsD0_.rectangles).size(); + this->boundingBox_ = box; } void OrderedBoxes::SetOrderedBoxesToDisk( - RectanglesForOrderedBoxes& rectanglesD0, - RectanglesForOrderedBoxes& rectanglesD1, uint64_t size, - Rtree::BoundingBox box) { - this->workInRam = false; - this->rectsD0 = rectanglesD0; - this->rectsD1 = rectanglesD1; - this->size = size; - this->boundingBox = box; + RectanglesForOrderedBoxes rectanglesD0, + RectanglesForOrderedBoxes rectanglesD1, uint64_t size, + BasicGeometry::BoundingBox box) { + this->workInRam_ = false; + this->rectsD0_ = std::move(rectanglesD0); + this->rectsD1_ = std::move(rectanglesD1); + this->size_ = size; + this->boundingBox_ = box; } -Rtree::BoundingBox OrderedBoxes::GetBoundingBox() { return this->boundingBox; } +BasicGeometry::BoundingBox OrderedBoxes::GetBoundingBox() { return this->boundingBox_; } -uint64_t OrderedBoxes::GetSize() const { return this->size; } +uint64_t OrderedBoxes::GetSize() const { return this->size_; } -std::shared_ptr OrderedBoxes::GetRectanglesInRam() { - return this->rectsD0.rectanglesInRam; +multiBoxWithOrderIndex OrderedBoxes::GetRectanglesInRam() { + return std::get(this->rectsD0_.rectangles); } -std::string OrderedBoxes::GetRectanglesOnDisk() { - return this->rectsD0.rectanglesOnDisk; +std::filesystem::path OrderedBoxes::GetRectanglesOnDisk() { 
+ return std::get(this->rectsD0_.rectangles); } SplitResult OrderedBoxes::GetBestSplit() { /** - * Determine based on the "small-lists", which split is the best for the - * rtree. + * Determine based on the "small-lists", which split is the best for the + * rtree. */ struct SplitResult splitResult; @@ -558,9 +198,9 @@ SplitResult OrderedBoxes::GetBestSplit() { bool currentlyAtSTimesI = false; for (size_t dim = 0; dim < 2; dim++) { - for (uint64_t i = 0; i < this->rectsD0.rectanglesSmall->size(); i++) { - currentElement = dim == 0 ? (*this->rectsD0.rectanglesSmall)[i] - : (*this->rectsD1.rectanglesSmall)[i]; + for (uint64_t i = 0; i < this->rectsD0_.rectanglesSmall.size(); i++) { + currentElement = dim == 0 ? this->rectsD0_.rectanglesSmall[i] + : this->rectsD1_.rectanglesSmall[i]; if (i == 0) { // this is the min element @@ -580,7 +220,7 @@ SplitResult OrderedBoxes::GetBestSplit() { continue; } - if (!currentlyAtSTimesI || currentElement.id != maxElement.id) { + if (currentElement.id == maxElement.id) { break; } @@ -597,10 +237,10 @@ SplitResult OrderedBoxes::GetBestSplit() { double minYB1 = currentElement.MinY(); double maxYB1 = maxElement.MaxY(); - Rtree::BoundingBox b0 = - Rtree::createBoundingBox(minXB0, minYB0, maxXB0, maxYB0); - Rtree::BoundingBox b1 = - Rtree::createBoundingBox(minXB1, minYB1, maxXB1, maxYB1); + BasicGeometry::BoundingBox b0 = + BasicGeometry::CreateBoundingBox(minXB0, minYB0, maxXB0, maxYB0); + BasicGeometry::BoundingBox b1 = + BasicGeometry::CreateBoundingBox(minXB1, minYB1, maxXB1, maxYB1); double cost = costFunctionTGS(b0, b1, dim); @@ -616,14 +256,13 @@ SplitResult OrderedBoxes::GetBestSplit() { } currentlyAtSTimesI = false; } - return splitResult; } std::pair OrderedBoxes::SplitAtBest( - const std::string& filePath, size_t S, size_t M, + const std::filesystem::path& filePath, size_t S, size_t M, uint64_t maxBuildingRamUsage) { - if (this->workInRam) { + if (this->workInRam_) { return this->SplitAtBestInRam(S, M); } else { return 
this->SplitAtBestOnDisk(filePath, S, M, maxBuildingRamUsage); @@ -633,8 +272,8 @@ std::pair OrderedBoxes::SplitAtBest( std::pair OrderedBoxes::SplitAtBestInRam(size_t S, size_t M) { /** - * Split the ordered boxes in ram. First determine the best split and then - * perform it + * Split the ordered boxes in ram. First determine the best split and then + * perform it */ struct SplitResult splitResult = this->GetBestSplit(); @@ -642,19 +281,16 @@ std::pair OrderedBoxes::SplitAtBestInRam(size_t S, OrderedBoxes split0; OrderedBoxes split1; - struct SplitBuffersRam splitBuffers; + RectanglesForOrderedBoxes rectsD0Split0; + RectanglesForOrderedBoxes rectsD1Split0; + RectanglesForOrderedBoxes rectsD0Split1; + RectanglesForOrderedBoxes rectsD1Split1; + + struct SplitBuffers splitBuffers = {rectsD0Split0, rectsD1Split0, rectsD0Split1, rectsD1Split1}; - std::pair boundingBoxes = + std::pair boundingBoxes = PerformSplit(splitResult, splitBuffers, M, S); - RectanglesForOrderedBoxes rectsD0Split0 = {splitBuffers.s0Dim0, "", - splitBuffers.s0SmallDim0}; - RectanglesForOrderedBoxes rectsD1Split0 = {splitBuffers.s0Dim1, "", - splitBuffers.s0SmallDim1}; - RectanglesForOrderedBoxes rectsD0Split1 = {splitBuffers.s1Dim0, "", - splitBuffers.s1SmallDim0}; - RectanglesForOrderedBoxes rectsD1Split1 = {splitBuffers.s1Dim1, "", - splitBuffers.s1SmallDim1}; split0.SetOrderedBoxesToRam(rectsD0Split0, rectsD1Split0, boundingBoxes.first); split1.SetOrderedBoxesToRam(rectsD0Split1, rectsD1Split1, @@ -667,8 +303,8 @@ std::pair OrderedBoxes::SplitAtBestOnDisk( const std::string& filePath, size_t S, size_t M, uint64_t maxBuildingRamUsage) { /** - * Split the ordered boxes on disk. First determine the best split and then - * perform it + * Split the ordered boxes on disk. 
First determine the best split and then + * perform it */ OrderedBoxes split0; @@ -676,12 +312,15 @@ std::pair OrderedBoxes::SplitAtBestOnDisk( struct SplitResult splitResult = this->GetBestSplit(); - struct SplitBuffersDisk splitBuffers; - struct SplitBuffersRam splitBuffersRam; + RectanglesForOrderedBoxes rectsD0Split0; + RectanglesForOrderedBoxes rectsD1Split0; + RectanglesForOrderedBoxes rectsD0Split1; + RectanglesForOrderedBoxes rectsD1Split1; + struct SplitBuffers splitBuffers = {rectsD0Split0, rectsD1Split0, rectsD0Split1, rectsD1Split1}; // perfrom the split - uint64_t sizeLeft = std::ceil((splitResult.bestIndex - 2) / 2.0) * S; - uint64_t sizeRight = this->size - sizeLeft; + auto sizeLeft = (uint64_t)(std::ceil(((double)splitResult.bestIndex - 2.0) / 2.0) * (double)S); + uint64_t sizeRight = this->size_ - sizeLeft; uint64_t split0ByteSize = sizeLeft * (4 * sizeof(double) + sizeof(uint64_t) + 2 * sizeof(uint64_t)); uint64_t split1ByteSize = sizeRight * (4 * sizeof(double) + sizeof(uint64_t) + @@ -690,106 +329,57 @@ std::pair OrderedBoxes::SplitAtBestOnDisk( bool split1InRam = split1ByteSize * 4 < maxBuildingRamUsage; if (!split0InRam) { - splitBuffers.split0Dim0File = { - std::ofstream(filePath + ".0.dim0.tmp", std::ios::binary)}; - splitBuffers.split0Dim1File = { - std::ofstream(filePath + ".0.dim1.tmp", std::ios::binary)}; + splitBuffers.rectsD0Split0.rectangles = filePath + ".0.dim0.tmp"; + splitBuffers.rectsD1Split0.rectangles = filePath + ".0.dim1.tmp"; } if (!split1InRam) { - splitBuffers.split1Dim0File = { - std::ofstream(filePath + ".1.dim0.tmp", std::ios::binary)}; - splitBuffers.split1Dim1File = { - std::ofstream(filePath + ".1.dim1.tmp", std::ios::binary)}; + splitBuffers.rectsD0Split1.rectangles = filePath + ".1.dim0.tmp"; + splitBuffers.rectsD1Split1.rectangles = filePath + ".1.dim1.tmp"; } - splitBuffers.splitBuffersRam = splitBuffersRam; - - std::pair boundingBoxes = + std::pair boundingBoxes = PerformSplit(splitResult, splitBuffers, M, S, 
maxBuildingRamUsage); - RectanglesForOrderedBoxes rectsD0Split0; - RectanglesForOrderedBoxes rectsD1Split0; - RectanglesForOrderedBoxes rectsD0Split1; - RectanglesForOrderedBoxes rectsD1Split1; - rectsD0Split0.rectanglesInRam = splitBuffers.splitBuffersRam.s0SmallDim0; - rectsD1Split0.rectanglesInRam = splitBuffers.splitBuffersRam.s0SmallDim1; - rectsD0Split1.rectanglesInRam = splitBuffers.splitBuffersRam.s1SmallDim0; - rectsD1Split1.rectanglesInRam = splitBuffers.splitBuffersRam.s1SmallDim1; - if (!split0InRam) { - splitBuffers.split0Dim0File.value().close(); - splitBuffers.split0Dim1File.value().close(); - - rectsD0Split0.rectanglesOnDisk = filePath + ".0.dim0"; - rectsD1Split0.rectanglesOnDisk = filePath + ".0.dim1"; - split0.SetOrderedBoxesToDisk(rectsD0Split0, rectsD1Split0, sizeLeft, boundingBoxes.first); } else { - rectsD0Split0.rectanglesInRam = splitBuffers.splitBuffersRam.s0Dim0; - rectsD1Split0.rectanglesInRam = splitBuffers.splitBuffersRam.s0Dim1; split0.SetOrderedBoxesToRam(rectsD0Split0, rectsD1Split0, boundingBoxes.first); } if (!split1InRam) { - splitBuffers.split1Dim0File.value().close(); - splitBuffers.split1Dim1File.value().close(); - - rectsD0Split1.rectanglesOnDisk = filePath + ".1.dim0"; - rectsD1Split1.rectanglesOnDisk = filePath + ".1.dim1"; - split1.SetOrderedBoxesToDisk(rectsD0Split1, rectsD1Split1, sizeRight, boundingBoxes.second); } else { - rectsD0Split1.rectanglesInRam = splitBuffers.splitBuffersRam.s1Dim0; - rectsD1Split1.rectanglesInRam = splitBuffers.splitBuffersRam.s1Dim1; split1.SetOrderedBoxesToRam(rectsD0Split1, rectsD1Split1, boundingBoxes.second); } - std::remove(this->rectsD0.rectanglesOnDisk.c_str()); - std::remove(this->rectsD1.rectanglesOnDisk.c_str()); + std::remove(std::get(this->rectsD0_.rectangles).c_str()); + std::remove(std::get(this->rectsD1_.rectangles).c_str()); return std::make_pair(split0, split1); } -std::pair OrderedBoxes::PerformSplit( - SplitResult splitResult, SplitBuffersRam& splitBuffersRam, size_t M, - 
size_t S) { - /** - * Perform the best split on the current ordered boxes in the ram case - */ - - struct SplitBuffersDisk splitBuffersDisk; - - splitBuffersDisk.splitBuffersRam = splitBuffersRam; - splitBuffersDisk.split0Dim0File = {}; - splitBuffersDisk.split0Dim1File = {}; - splitBuffersDisk.split1Dim0File = {}; - splitBuffersDisk.split1Dim1File = {}; - - // reuse the PerfromSplit of the Disk case. - std::pair boundingBoxes = - PerformSplit(splitResult, splitBuffersDisk, M, S, 0); - - splitBuffersRam = splitBuffersDisk.splitBuffersRam; - - return boundingBoxes; -} - -std::pair OrderedBoxes::PerformSplit( - SplitResult splitResult, SplitBuffersDisk& splitBuffers, size_t M, size_t S, +std::pair OrderedBoxes::PerformSplit( + SplitResult splitResult, SplitBuffers& splitBuffers, size_t M, size_t S, uint64_t maxBuildingRamUsage) { /** - * Perform the best split on the current ordered boxes in the disk case + * Perform the best split on the current ordered boxes in the disk case */ - uint64_t sizeLeft = std::ceil((splitResult.bestIndex - 2) / 2.0) * S; - uint64_t sizeRight = this->size - sizeLeft; - size_t SSplit0 = sizeLeft <= S ? std::ceil(sizeLeft / (double)M) : S; - size_t SSplit1 = sizeRight <= S ? std::ceil(sizeRight / (double)M) : S; + auto sizeLeft = (uint64_t)(std::ceil(((double)splitResult.bestIndex - 2.0) / 2.0) * (double)S); + uint64_t sizeRight = this->size_ - sizeLeft; + size_t SSplit0 = sizeLeft <= S ? (size_t)std::ceil((double)sizeLeft / (double)M) : S; + if (sizeLeft <= S && sizeLeft <= M * M) { + SSplit0 = M; + } + size_t SSplit1 = sizeRight <= S ? 
(size_t)std::ceil((double)sizeRight / (double)M) : S; + if (sizeRight <= S && sizeRight <= M * M) { + SSplit1 = M; + } uint64_t split0ByteSize = sizeLeft * (4 * sizeof(double) + sizeof(uint64_t) + 2 * sizeof(uint64_t)); uint64_t split1ByteSize = sizeRight * (4 * sizeof(double) + sizeof(uint64_t) + @@ -799,8 +389,8 @@ std::pair OrderedBoxes::PerformSplit( bool split1InRam = maxBuildingRamUsage == 0 || split1ByteSize * 4 < maxBuildingRamUsage; - Rtree::BoundingBox boxSplit0 = Rtree::createBoundingBox(0, 0, 0, 0); - Rtree::BoundingBox boxSplit1 = Rtree::createBoundingBox(0, 0, 0, 0); + BasicGeometry::BoundingBox boxSplit0 = BasicGeometry::CreateBoundingBox(0, 0, 0, 0); + BasicGeometry::BoundingBox boxSplit1 = BasicGeometry::CreateBoundingBox(0, 0, 0, 0); RTreeValueWithOrderIndex minSplit0OtherDim; RTreeValueWithOrderIndex maxSplit0OtherDim; @@ -808,115 +398,136 @@ std::pair OrderedBoxes::PerformSplit( RTreeValueWithOrderIndex maxSplit1OtherDim; struct OtherDimension { - std::shared_ptr smallSplit0; - std::shared_ptr smallSplit1; - } otherDimension; + multiBoxWithOrderIndex* smallSplit0; + multiBoxWithOrderIndex* smallSplit1; + } otherDimension{}; + + auto pushSmallBoundaries = [splitResult](multiBoxWithOrderIndex& smallListS0, multiBoxWithOrderIndex& smallListS1) { + smallListS0.push_back(splitResult.bestMinElement); + smallListS0.push_back(splitResult.bestLastElement); + smallListS1.push_back(splitResult.bestElement); + smallListS1.push_back(splitResult.bestMaxElement); + }; if (splitResult.bestDim == 0) { - splitBuffers.splitBuffersRam.s0SmallDim0->push_back( - splitResult.bestMinElement); - splitBuffers.splitBuffersRam.s0SmallDim0->push_back( - splitResult.bestLastElement); - splitBuffers.splitBuffersRam.s1SmallDim0->push_back( - splitResult.bestElement); - splitBuffers.splitBuffersRam.s1SmallDim0->push_back( - splitResult.bestMaxElement); + pushSmallBoundaries(splitBuffers.rectsD0Split0.rectanglesSmall, splitBuffers.rectsD0Split1.rectanglesSmall); // placeholder, 
since we need the min and max element of the split in the + otherDimension.smallSplit0 = &splitBuffers.rectsD1Split0.rectanglesSmall; + otherDimension.smallSplit1 = &splitBuffers.rectsD1Split1.rectanglesSmall; // first two spots - otherDimension.smallSplit0 = splitBuffers.splitBuffersRam.s0SmallDim1; - otherDimension.smallSplit1 = splitBuffers.splitBuffersRam.s1SmallDim1; - - otherDimension.smallSplit0->push_back(RTreeValueWithOrderIndex()); - otherDimension.smallSplit0->push_back(RTreeValueWithOrderIndex()); - otherDimension.smallSplit1->push_back(RTreeValueWithOrderIndex()); - otherDimension.smallSplit1->push_back(RTreeValueWithOrderIndex()); + otherDimension.smallSplit0->emplace_back(); + otherDimension.smallSplit0->emplace_back(); + otherDimension.smallSplit1->emplace_back(); + otherDimension.smallSplit1->emplace_back(); } else { - splitBuffers.splitBuffersRam.s0SmallDim1->push_back( - splitResult.bestMinElement); - splitBuffers.splitBuffersRam.s0SmallDim1->push_back( - splitResult.bestLastElement); - splitBuffers.splitBuffersRam.s1SmallDim1->push_back( - splitResult.bestElement); - splitBuffers.splitBuffersRam.s1SmallDim1->push_back( - splitResult.bestMaxElement); + pushSmallBoundaries(splitBuffers.rectsD1Split0.rectanglesSmall, splitBuffers.rectsD1Split1.rectanglesSmall); // placeholder - otherDimension.smallSplit0 = splitBuffers.splitBuffersRam.s0SmallDim0; - otherDimension.smallSplit1 = splitBuffers.splitBuffersRam.s1SmallDim0; + otherDimension.smallSplit0 = &splitBuffers.rectsD0Split0.rectanglesSmall; + otherDimension.smallSplit1 = &splitBuffers.rectsD0Split1.rectanglesSmall; - otherDimension.smallSplit0->push_back(RTreeValueWithOrderIndex()); - otherDimension.smallSplit0->push_back(RTreeValueWithOrderIndex()); - otherDimension.smallSplit1->push_back(RTreeValueWithOrderIndex()); - otherDimension.smallSplit1->push_back(RTreeValueWithOrderIndex()); + otherDimension.smallSplit0->emplace_back(); + otherDimension.smallSplit0->emplace_back(); + 
otherDimension.smallSplit1->emplace_back(); + otherDimension.smallSplit1->emplace_back(); } std::optional elementOpt; - std::optional fileReaderDim0; - std::optional fileReaderDim1; - if (!this->workInRam) { - fileReaderDim0 = {FileReader(this->rectsD0.rectanglesOnDisk)}; - fileReaderDim1 = {FileReader(this->rectsD1.rectanglesOnDisk)}; - } + std::filesystem::path rectsD0Path = !this->workInRam_ ? std::get(this->rectsD0_.rectangles) : ""; + std::filesystem::path rectsD1Path = !this->workInRam_ ? std::get(this->rectsD1_.rectangles) : ""; + FileReader fileReaderDim0 = FileReader(rectsD0Path); + FileReader fileReaderDim1 = FileReader(rectsD1Path); FileReader::iterator fileReaderDim0Iterator = - fileReaderDim0 ? fileReaderDim0.value().begin() : FileReader::iterator(); + !rectsD0Path.empty() ? fileReaderDim0.begin() : FileReader::iterator(); FileReader::iterator fileReaderDim1Iterator = - fileReaderDim1 ? fileReaderDim1.value().begin() : FileReader::iterator(); + !rectsD1Path.empty() ? fileReaderDim1.begin() : FileReader::iterator(); uint64_t currentXSplit0 = 0; uint64_t currentXSplit1 = 0; uint64_t currentYSplit0 = 0; uint64_t currentYSplit1 = 0; + std::optional rectanglesOnDiskS0D0Stream = {}; + std::optional rectanglesOnDiskS0D1Stream = {}; + std::optional rectanglesOnDiskS1D0Stream = {}; + std::optional rectanglesOnDiskS1D1Stream = {}; + if (!split0InRam && !this->workInRam_) { + rectanglesOnDiskS0D0Stream = std::ofstream(std::get(splitBuffers.rectsD0Split0.rectangles), std::ios::binary); + rectanglesOnDiskS0D1Stream = std::ofstream(std::get(splitBuffers.rectsD1Split0.rectangles), std::ios::binary); + } + if (!split1InRam && !this->workInRam_) { + rectanglesOnDiskS1D0Stream = std::ofstream(std::get(splitBuffers.rectsD0Split1.rectangles), std::ios::binary); + rectanglesOnDiskS1D1Stream = std::ofstream(std::get(splitBuffers.rectsD1Split1.rectangles), std::ios::binary); + } + auto performCertainSplit = - [M, &splitBuffers, &splitResult]( + [M, &splitBuffers, 
&splitResult, &rectanglesOnDiskS0D0Stream, &rectanglesOnDiskS0D1Stream, &rectanglesOnDiskS1D0Stream, &rectanglesOnDiskS1D1Stream]( size_t dim, size_t split, uint64_t& current, - uint64_t& currentSplitSize, RTreeValueWithOrderIndex& minElement, + size_t& currentSplitSize, RTreeValueWithOrderIndex& minElement, RTreeValueWithOrderIndex& maxElement, bool currentSplitInRam, bool workInRam, RTreeValueWithOrderIndex& element, - Rtree::BoundingBox& box) { - std::shared_ptr currentList; - std::shared_ptr currentSmallList; - std::ofstream* currentFile; + BasicGeometry::BoundingBox& box) { + multiBoxWithOrderIndex* currentSmallList; + // current list is either in ram or on disk + std::variant currentList; if (split == 0) { if (dim == 0) { - currentList = splitBuffers.splitBuffersRam.s0Dim0; - currentSmallList = splitBuffers.splitBuffersRam.s0SmallDim0; - currentFile = &splitBuffers.split0Dim0File.value(); + currentSmallList = &splitBuffers.rectsD0Split0.rectanglesSmall; + if (currentSplitInRam || workInRam) { + currentList = &std::get(splitBuffers.rectsD0Split0.rectangles); + } else { + currentList = &rectanglesOnDiskS0D0Stream.value(); + } } else { - currentList = splitBuffers.splitBuffersRam.s0Dim1; - currentSmallList = splitBuffers.splitBuffersRam.s0SmallDim1; - currentFile = &splitBuffers.split0Dim1File.value(); + currentSmallList = &splitBuffers.rectsD1Split0.rectanglesSmall; + if (currentSplitInRam || workInRam) { + currentList = &std::get(splitBuffers.rectsD1Split0.rectangles); + } else { + currentList = &rectanglesOnDiskS0D1Stream.value(); + } } } else { if (dim == 0) { - currentList = splitBuffers.splitBuffersRam.s1Dim0; - currentSmallList = splitBuffers.splitBuffersRam.s1SmallDim0; - currentFile = &splitBuffers.split1Dim0File.value(); + currentSmallList = &splitBuffers.rectsD0Split1.rectanglesSmall; + if (currentSplitInRam || workInRam) { + currentList = &std::get(splitBuffers.rectsD0Split1.rectangles); + } else { + currentList = &rectanglesOnDiskS1D0Stream.value(); 
+ } } else { - currentList = splitBuffers.splitBuffersRam.s1Dim1; - currentSmallList = splitBuffers.splitBuffersRam.s1SmallDim1; - currentFile = &splitBuffers.split1Dim1File.value(); + currentSmallList = &splitBuffers.rectsD1Split1.rectanglesSmall; + if (currentSplitInRam || workInRam) { + currentList = &std::get(splitBuffers.rectsD1Split1.rectangles); + } else { + currentList = &rectanglesOnDiskS1D1Stream.value(); + } } } // add the element to the current split dimension 0/1 vector / file if (currentSplitInRam || workInRam) { - currentList->push_back(element); + std::get(currentList)->push_back(element); } else { - Rtree::SaveEntryWithOrderIndex(element, *currentFile); + FileReader::SaveEntryWithOrderIndex(element, *(std::get(currentList))); } // check if the element is at the position i * S (described in the // algorithm) or one before it. In this case it is a future possible // split position and needs to be saved to the "small list" - if (isBorderOfSplitCandidate(current, currentSplitSize, M)) { + if (BasicGeometry::IsBorderOfSplitCandidate(current, currentSplitSize, M)) { // index i * S - 1 or i * S currentSmallList->push_back(element); } // update the boundingbox to get the whole boundingbox of the split - if (dim == 0) box = Rtree::combineBoundingBoxes(box, element.box); + if (dim == 0) { + if (current == 0) { + box = element.box; + } else { + box = BasicGeometry::CombineBoundingBoxes(box, element.box); + } + } // keep track of the min and max element of the split, to later // replace the placeholder in the "small lists" @@ -936,21 +547,19 @@ std::pair OrderedBoxes::PerformSplit( // start performing the actual split uint64_t i = 0; - if (!this->workInRam && - fileReaderDim0Iterator != fileReaderDim0.value().end() && - fileReaderDim1Iterator != fileReaderDim1.value().end()) { - if (dim == 0) elementOpt = *fileReaderDim0Iterator; - if (dim == 1) elementOpt = *fileReaderDim1Iterator; + if (!this->workInRam_) { + if (dim == 0 && fileReaderDim0Iterator != 
FileReader::end()) elementOpt = *fileReaderDim0Iterator; + if (dim == 1 && fileReaderDim1Iterator != FileReader::end()) elementOpt = *fileReaderDim1Iterator; } - while ((this->workInRam && i < this->size) || - (!this->workInRam && elementOpt)) { + while ((this->workInRam_ && i < this->size_) || + (!this->workInRam_ && elementOpt)) { RTreeValueWithOrderIndex element; // get the current element, either from disk or from ram - if (this->workInRam) { - element = dim == 0 ? (*this->rectsD0.rectanglesInRam)[i] - : (*this->rectsD1.rectanglesInRam)[i]; + if (this->workInRam_) { + element = dim == 0 ? std::get(this->rectsD0_.rectangles)[i] + : std::get(this->rectsD1_.rectangles)[i]; } else { element = elementOpt.value(); } @@ -963,11 +572,11 @@ std::pair OrderedBoxes::PerformSplit( if (dim == 0) { performCertainSplit(0, 0, currentXSplit0, SSplit0, minSplit0OtherDim, - maxSplit0OtherDim, split0InRam, this->workInRam, + maxSplit0OtherDim, split0InRam, this->workInRam_, element, boxSplit0); } else { performCertainSplit(1, 0, currentYSplit0, SSplit0, minSplit0OtherDim, - maxSplit0OtherDim, split0InRam, this->workInRam, + maxSplit0OtherDim, split0InRam, this->workInRam_, element, boxSplit0); } } else { @@ -975,28 +584,27 @@ std::pair OrderedBoxes::PerformSplit( if (dim == 0) { performCertainSplit(0, 1, currentXSplit1, SSplit1, minSplit1OtherDim, - maxSplit1OtherDim, split1InRam, this->workInRam, + maxSplit1OtherDim, split1InRam, this->workInRam_, element, boxSplit1); } else { performCertainSplit(1, 1, currentYSplit1, SSplit1, minSplit1OtherDim, - maxSplit1OtherDim, split1InRam, this->workInRam, + maxSplit1OtherDim, split1InRam, this->workInRam_, element, boxSplit1); } } i++; - if (!this->workInRam && - fileReaderDim0Iterator != fileReaderDim0.value().end() && - fileReaderDim1Iterator != fileReaderDim1.value().end()) { - if (dim == 0) { - ++fileReaderDim0Iterator; + if (!this->workInRam_) { + if (dim == 0 && ++fileReaderDim0Iterator != FileReader::end()) { elementOpt = 
*fileReaderDim0Iterator; + continue; } - if (dim == 1) { - ++fileReaderDim1Iterator; + if (dim == 1 && ++fileReaderDim1Iterator != FileReader::end()) { elementOpt = *fileReaderDim1Iterator; + continue; } } + elementOpt = {}; } } @@ -1006,14 +614,7 @@ std::pair OrderedBoxes::PerformSplit( (*otherDimension.smallSplit1)[0] = minSplit1OtherDim; (*otherDimension.smallSplit1)[1] = maxSplit1OtherDim; - (*this->rectsD0.rectanglesInRam).clear(); - (*this->rectsD1.rectanglesInRam).clear(); - (*this->rectsD0.rectanglesSmall).clear(); - (*this->rectsD1.rectanglesSmall).clear(); - (*this->rectsD0.rectanglesInRam).shrink_to_fit(); - (*this->rectsD1.rectanglesInRam).shrink_to_fit(); - (*this->rectsD0.rectanglesSmall).shrink_to_fit(); - (*this->rectsD1.rectanglesSmall).shrink_to_fit(); + this->Clear(); return std::make_pair(boxSplit0, boxSplit1); -} +} \ No newline at end of file diff --git a/src/util/RtreeFileReader.cpp b/src/util/RtreeFileReader.cpp new file mode 100644 index 0000000000..9e0df5d374 --- /dev/null +++ b/src/util/RtreeFileReader.cpp @@ -0,0 +1,114 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Noah Nock + +#include "./Rtree.h" +#include "./RtreeFileReader.h" + +#include +#include +#include + +FileReader::iterator &FileReader::iterator::operator++() { + BasicGeometry::BoundingBox box; + uint64_t id; + uint64_t orderX; + uint64_t orderY; + if (input_ && input_.read(reinterpret_cast(&box), sizeof(BasicGeometry::BoundingBox)) && + input_.read(reinterpret_cast(&id), sizeof(uint64_t)) && + input_.read(reinterpret_cast(&orderX), sizeof(uint64_t)) && + input_.read(reinterpret_cast(&orderY), sizeof(uint64_t))) { + currentElement_ = {{box, id}, orderX, orderY}; + valid_ = true; + } else { + valid_ = false; + } + return *this; +} + +FileReaderWithoutIndex::iterator &FileReaderWithoutIndex::iterator::operator++() { + BasicGeometry::BoundingBox box; + uint64_t id; + if (input_ && input_.read(reinterpret_cast(&box), sizeof(BasicGeometry::BoundingBox)) && + input_.read(reinterpret_cast(&id), sizeof(uint64_t))) { + currentElement_ = {box, id}; + valid_ = true; + } else { + valid_ = false; + } + return *this; +} + +uint64_t FileReader::SaveNode(RtreeNode& node, std::ofstream& nodesOfs) { + uint64_t pos = static_cast(nodesOfs.tellp()); + boost::archive::binary_oarchive archive(nodesOfs); + archive << node; + nodesOfs.write(" ", 1); + + return pos; +} + +RtreeNode FileReader::LoadNode(uint64_t id, std::ifstream& lookupIfs, + std::ifstream& nodesIfs) { + RtreeNode newNode; + + uint64_t offset = id * (uint64_t)sizeof(uint64_t); + lookupIfs.seekg((long long)offset, std::ios::beg); + + uint64_t nodePtr; + lookupIfs.read(reinterpret_cast(&nodePtr), sizeof(uint64_t)); + + nodesIfs.seekg((long long)nodePtr); + boost::archive::binary_iarchive ia(nodesIfs); + ia >> newNode; + + return newNode; +} + +void FileReaderWithoutIndex::SaveEntry(BasicGeometry::BoundingBox boundingBox, uint64_t index, + std::ofstream& convertOfs) { + /** + * Save a single entry (which was e.g. 
converted by ConvertWordToRtreeEntry) + * to the disk + */ + static_assert(std::is_trivially_copyable_v); + convertOfs.write(reinterpret_cast(&boundingBox), sizeof(BasicGeometry::BoundingBox)); + convertOfs.write(reinterpret_cast(&index), sizeof(uint64_t)); +} + +void FileReader::SaveEntryWithOrderIndex(RTreeValueWithOrderIndex treeValue, + std::ofstream& convertOfs) { + /** + * Save a single entry, containing its postion in the x- and y-sorting + */ + + static_assert(std::is_trivially_copyable_v); + convertOfs.write(reinterpret_cast(&treeValue.box), sizeof(BasicGeometry::BoundingBox)); + convertOfs.write(reinterpret_cast(&treeValue.id), + sizeof(uint64_t)); + convertOfs.write(reinterpret_cast(&treeValue.orderX), + sizeof(uint64_t)); + convertOfs.write(reinterpret_cast(&treeValue.orderY), + sizeof(uint64_t)); +} + +multiBoxGeo FileReaderWithoutIndex::LoadEntries(const std::filesystem::path& file) { + multiBoxGeo boxes; + + for (const RTreeValue& element : FileReaderWithoutIndex(file)) { + boxes.push_back(element); + } + + return boxes; +} + +multiBoxWithOrderIndex FileReader::LoadEntriesWithOrderIndex( + const std::filesystem::path& file) { + multiBoxWithOrderIndex boxes; + + for (const RTreeValueWithOrderIndex& element : FileReader(file)) { + boxes.push_back(element); + } + + return boxes; +} diff --git a/src/util/RtreeFileReader.h b/src/util/RtreeFileReader.h index 913263ce63..3d5640e28f 100644 --- a/src/util/RtreeFileReader.h +++ b/src/util/RtreeFileReader.h @@ -5,129 +5,103 @@ #ifndef QLEVER_RTREEFILEREADER_H #define QLEVER_RTREEFILEREADER_H -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "./Rtree.h" +#include "./RtreeNode.h" class FileReader { public: - explicit FileReader(const std::string& filename) : file(filename) {} - - class iterator : public std::iterator { + // ___________________________________________________________________________ + // Save a single datapoint of the Rtree, together 
with its position in the x + // and y sorting to disk + static void SaveEntryWithOrderIndex(RTreeValueWithOrderIndex treeValue, + std::ofstream& convertOfs); + // ___________________________________________________________________________ + // Load all datapoints of the Rtree, together with its x and y sorting into + // ram + static multiBoxWithOrderIndex LoadEntriesWithOrderIndex( + const std::filesystem::path& file); + // ___________________________________________________________________________ + // Save the current node in the building process to disk and return the position of the node in the file + static uint64_t SaveNode(RtreeNode& node, std::ofstream& nodesOfs); + // ___________________________________________________________________________ + // Load a specific RtreeNode to query in its children + static RtreeNode LoadNode(uint64_t id, std::ifstream& lookupIfs, std::ifstream& nodesIfs); + + explicit FileReader(const std::filesystem::path& filename) : file_(filename) {} + + class iterator : public std::iterator { public: - explicit iterator(std::ifstream& in) : input(in) { + explicit iterator(std::ifstream& in) : input_(in) { ++(*this); // Read the first element } - iterator() : input(nullstream) {} // End iterator constructor - - iterator& operator++() { - double minX; - double minY; - double maxX; - double maxY; - uint64_t id; - uint64_t orderX; - uint64_t orderY; - if (input && input.read(reinterpret_cast(&minX), sizeof(double)) && - input.read(reinterpret_cast(&minY), sizeof(double)) && - input.read(reinterpret_cast(&maxX), sizeof(double)) && - input.read(reinterpret_cast(&maxY), sizeof(double)) && - input.read(reinterpret_cast(&id), sizeof(uint64_t)) && - input.read(reinterpret_cast(&orderX), sizeof(uint64_t)) && - input.read(reinterpret_cast(&orderY), sizeof(uint64_t))) { - Rtree::BoundingBox box = - Rtree::createBoundingBox(minX, minY, maxX, maxY); - currentElement = {{box, id}, orderX, orderY}; - valid = true; - } else { - valid = false; - } - 
return *this; - } + iterator() : input_(nullstream_) {} // End iterator constructor + + iterator& operator++(); - const RTreeValueWithOrderIndex& operator*() const { return currentElement; } + const RTreeValueWithOrderIndex& operator*() const { return currentElement_; } bool operator!=(const iterator& other) const { - return valid != other.valid; + return valid_ != other.valid_; } private: - std::ifstream& input; - std::ifstream nullstream; // A dummy stream for the end iterator - RTreeValueWithOrderIndex currentElement; - bool valid{}; + std::ifstream& input_; + std::ifstream nullstream_; // A dummy stream for the end iterator + RTreeValueWithOrderIndex currentElement_; + bool valid_{}; }; - iterator begin() { return iterator(file); } + iterator begin() { return iterator(file_); } static iterator end() { return {}; } private: - std::ifstream file; + std::ifstream file_; }; class FileReaderWithoutIndex { public: - explicit FileReaderWithoutIndex(const std::string& filename) - : file(filename) {} - - class iterator : public std::iterator { + // ___________________________________________________________________________ + // Save a single datapoint for the Rtree to disk + static void SaveEntry(BasicGeometry::BoundingBox boundingBox, uint64_t index, + std::ofstream& convertOfs); + // ___________________________________________________________________________ + // Load all datapoints of the Rtree in file into ram + static multiBoxGeo LoadEntries(const std::filesystem::path& file); + + explicit FileReaderWithoutIndex(const std::filesystem::path& filename) + : file_(filename) {} + + class iterator : public std::iterator { public: - explicit iterator(std::ifstream& in) : input(in) { + explicit iterator(std::ifstream& in) : input_(in) { ++(*this); // Read the first element } - iterator() : input(nullstream) {} // End iterator constructor - - iterator& operator++() { - double minX; - double minY; - double maxX; - double maxY; - uint64_t id; - if (input && 
input.read(reinterpret_cast(&minX), sizeof(double)) && - input.read(reinterpret_cast(&minY), sizeof(double)) && - input.read(reinterpret_cast(&maxX), sizeof(double)) && - input.read(reinterpret_cast(&maxY), sizeof(double)) && - input.read(reinterpret_cast(&id), sizeof(uint64_t))) { - Rtree::BoundingBox box = - Rtree::createBoundingBox(minX, minY, maxX, maxY); - currentElement = {box, id}; - valid = true; - } else { - valid = false; - } - return *this; - } + iterator() : input_(nullstream_) {} // End iterator constructor + + iterator& operator++(); - const RTreeValue& operator*() const { return currentElement; } + const RTreeValue& operator*() const { return currentElement_; } bool operator!=(const iterator& other) const { - return valid != other.valid; + return valid_ != other.valid_; } private: - std::ifstream& input; - std::ifstream nullstream; // A dummy stream for the end iterator - RTreeValue currentElement; - bool valid{}; + std::ifstream& input_; + std::ifstream nullstream_; // A dummy stream for the end iterator + RTreeValue currentElement_; + bool valid_{}; }; - iterator begin() { return iterator(file); } + iterator begin() { return iterator(file_); } static iterator end() { return {}; } private: - std::ifstream file; + std::ifstream file_; }; -#endif // QLEVER_RTREEFILEREADER_H +#endif // QLEVER_RTREEFILEREADER_H \ No newline at end of file diff --git a/src/util/RtreeNode.cpp b/src/util/RtreeNode.cpp new file mode 100644 index 0000000000..ef672ba591 --- /dev/null +++ b/src/util/RtreeNode.cpp @@ -0,0 +1,61 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Noah Nock + +#include "./RtreeNode.h" +#include "./RtreeFileReader.h" + +ConstructionNode::ConstructionNode(uint64_t id, OrderedBoxes orderedBoxes) + : RtreeNode{id} { + this->orderedBoxes_ = orderedBoxes; + // calculate the boundingBoxes + this->boundingBox_ = orderedBoxes.GetBoundingBox(); +} + +void ConstructionNode::AddChildrenToItem() { + /** + * Add all children of a certain node at once. + * This is used when a leaf node is reached. + */ + if (this->GetOrderedBoxes().WorkInRam()) { + for (RTreeValueWithOrderIndex box : + this->GetOrderedBoxes().GetRectanglesInRam()) { + RtreeNode leafNode = RtreeNode(box.id, box.box); + this->AddChild(leafNode); + } + } else { + for (const RTreeValueWithOrderIndex& element : + FileReader(this->GetOrderedBoxes().GetRectanglesOnDisk())) { + RtreeNode leafNode = RtreeNode(element.id, element.box); + this->AddChild(leafNode); + } + } +} + +OrderedBoxes& ConstructionNode::GetOrderedBoxes() { return this->orderedBoxes_; } + +void RtreeNode::AddChild(RtreeNode& child) { + BasicGeometry::BoundingBox box = child.GetBoundingBox(); + uint64_t entryId = child.GetId(); + RTreeValue entry = {box, entryId}; + this->children_.push_back(entry); +} + +BasicGeometry::BoundingBox RtreeNode::GetBoundingBox() const { return this->boundingBox_; } + +void RtreeNode::SetIsLastInnerNode(bool isLast) { this->isLastInnerNode_ = isLast; } + +uint64_t RtreeNode::GetId() const { return this->id_; } + +RtreeNode::RtreeNode() = default; + +RtreeNode::RtreeNode(uint64_t id, BasicGeometry::BoundingBox boundingBox, bool isLastInnerNode, multiBoxGeo children) { + this->id_ = id; + this->boundingBox_ = boundingBox; + this->children_ = std::move(children); + this->isLastInnerNode_ = isLastInnerNode; +} + +bool RtreeNode::GetIsLastInnerNode() const { return this->isLastInnerNode_; } + +multiBoxGeo RtreeNode::GetChildren() { return this->children_; } diff --git a/src/util/RtreeNode.h b/src/util/RtreeNode.h new file mode 100644 index 
0000000000..cd39f3c17f --- /dev/null +++ b/src/util/RtreeNode.h @@ -0,0 +1,58 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Noah Nock + +#ifndef QLEVER_NODE_H +#define QLEVER_NODE_H + +#include "./Rtree.h" + +// ___________________________________________________________________________ +// Data structure representing a single node of the Rtree containing the +// boundingbox and its children +class RtreeNode { + protected: + friend class boost::serialization::access; + uint64_t id_{}; + BasicGeometry::BoundingBox boundingBox_{}; + bool isLastInnerNode_ = + false; // when true, this means that the node is the last inner node and + // all of its children are leafs + multiBoxGeo children_; + + template + void serialize(Archive& a, [[maybe_unused]] const unsigned int version) { + a& id_; + a& isLastInnerNode_; + a& boundingBox_; + a& children_; + } + + public: + RtreeNode(); + explicit RtreeNode(uint64_t id, BasicGeometry::BoundingBox boundingBox = {}, bool isLastInnerNode = false, multiBoxGeo children = {}); + [[nodiscard]] uint64_t GetId() const; + [[nodiscard]] BasicGeometry::BoundingBox GetBoundingBox() const; + void AddChild(RtreeNode& child); + void SetIsLastInnerNode(bool isLast); + [[nodiscard]] bool GetIsLastInnerNode() const; + multiBoxGeo GetChildren(); +}; + +BOOST_CLASS_VERSION(RtreeNode, 1) + +// ___________________________________________________________________________ +// Subclass of the RtreeNode only needed while constructing the Rtree (it keeps track +// of the remaining OrderedBoxes of the subtree) +class ConstructionNode : public RtreeNode { + private: + OrderedBoxes orderedBoxes_; + + public: + ConstructionNode(uint64_t id, OrderedBoxes orderedBoxes); + OrderedBoxes& GetOrderedBoxes(); + void AddChildrenToItem(); +}; + + +#endif //QLEVER_NODE_H diff --git a/src/util/RtreeSorter.cpp b/src/util/RtreeSorter.cpp new file mode 100644 index 0000000000..66dca16f7c --- /dev/null +++ 
b/src/util/RtreeSorter.cpp @@ -0,0 +1,293 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Noah Nock + +#include "./RtreeFileReader.h" +#include + +template +struct SortRuleLambda { + // comparison function + bool operator()(const RTreeValue& b1, const RTreeValue& b2) const { + double center1 = dimension == 0 ? std::midpoint(b1.MinX(), b1.MaxX()) + : std::midpoint(b1.MinY(), b1.MaxY()); + double center2 = dimension == 0 ? std::midpoint(b2.MinX(), b2.MaxX()) + : std::midpoint(b2.MinY(), b2.MaxY()); + return center1 < center2; + } + + // Value that is strictly smaller than any input element. + static RTreeValue min_value() { + return {BasicGeometry::CreateBoundingBox(-std::numeric_limits::max(), + -std::numeric_limits::max(), + -std::numeric_limits::max(), + -std::numeric_limits::max()), + 0}; + } + + // Value that is strictly larger than any input element. + static RTreeValue max_value() { + return {BasicGeometry::CreateBoundingBox(std::numeric_limits::max(), + std::numeric_limits::max(), + std::numeric_limits::max(), + std::numeric_limits::max()), + 0}; + } +}; + +template +struct SortRuleLambdaWithIndex { + uint64_t RTreeValueWithOrderIndex::*orderSelected = + dimension == 0 ? &RTreeValueWithOrderIndex::orderX + : &RTreeValueWithOrderIndex::orderY; + + // comparison function + bool operator()(const RTreeValueWithOrderIndex& b1, + const RTreeValueWithOrderIndex& b2) const { + double center1 = dimension == 0 ? std::midpoint(b1.MinX(), b1.MaxX()) + : std::midpoint(b1.MinY(), b1.MaxY()); + double center2 = dimension == 0 ? std::midpoint(b2.MinX(), b2.MaxX()) + : std::midpoint(b2.MinY(), b2.MaxY()); + + if (b1.*orderSelected == b2.*orderSelected) return center1 < center2; + return b1.*orderSelected < b2.*orderSelected; + } + + // Value that is strictly smaller than any input element. 
+ static RTreeValueWithOrderIndex min_value() { + return {{BasicGeometry::CreateBoundingBox(-std::numeric_limits::max(), + -std::numeric_limits::max(), + -std::numeric_limits::max(), + -std::numeric_limits::max()), + 0}, + 0, + 0}; + } + + // Value that is strictly larger than any input element. + static RTreeValueWithOrderIndex max_value() { + return {{BasicGeometry::CreateBoundingBox(std::numeric_limits::max(), + std::numeric_limits::max(), + std::numeric_limits::max(), + std::numeric_limits::max()), + 0}, + std::numeric_limits::max(), + std::numeric_limits::max()}; + } +}; + +static void centerOrdering(multiBoxGeo& boxes, size_t dim) { + if (dim == 0) { + // order by centerX + std::sort(boxes.begin(), boxes.end(), SortRuleLambda<0>{}); + } else { + // order by centerY + std::sort(boxes.begin(), boxes.end(), SortRuleLambda<1>{}); + } +} + +static void centerOrdering(multiBoxWithOrderIndex& boxes, size_t dim) { + if (dim == 0) { + // order by centerX + std::sort(boxes.begin(), boxes.end(), SortRuleLambdaWithIndex<0>{}); + } else { + // order by centerY + std::sort(boxes.begin(), boxes.end(), SortRuleLambdaWithIndex<1>{}); + } +} + +OrderedBoxes SortInput(const std::string& onDiskBase, const std::string& fileSuffix, size_t M, + uintmax_t maxBuildingRamUsage, bool workInRam) { + OrderedBoxes orderedInputRectangles; + std::filesystem::path file = onDiskBase + fileSuffix + ".tmp"; + + auto maxRamForSorter = + std::ceil(((double) maxBuildingRamUsage < 9999999999.0 ? 
(double) maxBuildingRamUsage + : 9999999999.0) / + 3.0); + ad_utility::BackgroundStxxlSorter> + sorterRectsD0Basic = + ad_utility::BackgroundStxxlSorter>( + (size_t) maxRamForSorter); + multiBoxGeo rectsD0Basic; + + if (workInRam) { + rectsD0Basic = FileReaderWithoutIndex::LoadEntries(file); + centerOrdering(rectsD0Basic, 0); + } else { + for (const RTreeValue& rectD0Element : + FileReaderWithoutIndex(file)) { + sorterRectsD0Basic.push(rectD0Element); + } + } + + uint64_t xSize = 0; + BasicGeometry::BoundingBox boundingBox = BasicGeometry::CreateBoundingBox(0, 0, 0, 0); + + ad_utility::BackgroundStxxlSorter> + sorterRectsD1 = + ad_utility::BackgroundStxxlSorter>( + (size_t) maxRamForSorter); + multiBoxWithOrderIndex RectanglesD1WithOrder = multiBoxWithOrderIndex(); + + if (workInRam) { + for (RTreeValue element : rectsD0Basic) { + RTreeValueWithOrderIndex entry = {{element.box, element.id}, xSize, 0}; + RectanglesD1WithOrder.push_back(entry); + + if (xSize == 0) { + boundingBox = element.box; + } else { + boundingBox = BasicGeometry::CombineBoundingBoxes(boundingBox, element.box); + } + xSize++; + } + centerOrdering(RectanglesD1WithOrder, 1); + } else { + for (RTreeValue element : sorterRectsD0Basic.sortedView()) { + RTreeValueWithOrderIndex entry = {{element.box, element.id}, xSize, 0}; + sorterRectsD1.push(entry); + + if (xSize == 0) { + boundingBox = element.box; + } else { + boundingBox = BasicGeometry::CombineBoundingBoxes(boundingBox, element.box); + } + xSize++; + } + } + sorterRectsD0Basic.clear(); + + size_t currentS = std::ceil(((float)xSize) / ((float)M)); + if (xSize <= M * M) { + // in this case S can just be M + currentS = M; + } + + uint64_t ySize = 0; + std::ofstream r1File = + std::ofstream(onDiskBase + fileSuffix + ".d1.tmp", std::ios::binary); + ad_utility::BackgroundStxxlSorter> + sorterRectsD0 = + ad_utility::BackgroundStxxlSorter>( + (size_t) maxRamForSorter); + multiBoxWithOrderIndex RectanglesD0WithOrder = multiBoxWithOrderIndex(); + 
multiBoxWithOrderIndex r1Small = multiBoxWithOrderIndex(); + // placeholder + r1Small.emplace_back(); + r1Small.emplace_back(); + RTreeValueWithOrderIndex minD1; + RTreeValueWithOrderIndex maxD1; + + auto processD1Element = [&ySize, currentS, M, &r1Small, &minD1, + &maxD1](RTreeValueWithOrderIndex& element) { + element.orderY = ySize; + + if (BasicGeometry::IsBorderOfSplitCandidate(ySize, currentS, M)) { + // index i * S - 1 or i * S + r1Small.push_back(element); + } + + if (ySize == 0) { + minD1 = element; + } + + maxD1 = element; + + ySize++; + }; + + if (workInRam) { + for (RTreeValueWithOrderIndex element : RectanglesD1WithOrder) { + processD1Element(element); + + RectanglesD0WithOrder.push_back(element); + } + centerOrdering(RectanglesD0WithOrder, 0); + } else { + for (RTreeValueWithOrderIndex element : sorterRectsD1.sortedView()) { + processD1Element(element); + + FileReader::SaveEntryWithOrderIndex(element, r1File); + sorterRectsD0.push(element); + } + } + + r1File.close(); + sorterRectsD1.clear(); + + // replace the placeholder + r1Small[0] = minD1; + r1Small[1] = maxD1; + + uint64_t currentX = 0; + std::ofstream r0File = + std::ofstream(onDiskBase + fileSuffix + ".d0.tmp", std::ios::binary); + multiBoxWithOrderIndex r0Small = multiBoxWithOrderIndex(); + // placeholder + r0Small.emplace_back(); + r0Small.emplace_back(); + RTreeValueWithOrderIndex minD0; + RTreeValueWithOrderIndex maxD0; + + auto processD0Element = [¤tX, currentS, M, &r0Small, &minD0, + &maxD0](RTreeValueWithOrderIndex& element) { + if (BasicGeometry::IsBorderOfSplitCandidate(currentX, currentS, M)) { + // index i * S - 1 or i * S + r0Small.push_back(element); + } + + if (currentX == 0) { + minD0 = element; + } + maxD0 = element; + + currentX++; + }; + + if (workInRam) { + for (RTreeValueWithOrderIndex element : RectanglesD0WithOrder) { + processD0Element(element); + } + } else { + for (RTreeValueWithOrderIndex element : sorterRectsD0.sortedView()) { + 
FileReader::SaveEntryWithOrderIndex(element, r0File); + + processD0Element(element); + } + } + + r0File.close(); + sorterRectsD0.clear(); + + // replace the placeholder + r0Small[0] = minD0; + r0Small[1] = maxD0; + + RectanglesForOrderedBoxes rectsD0; + RectanglesForOrderedBoxes rectsD1; + rectsD0.rectanglesSmall = std::move(r0Small); + rectsD1.rectanglesSmall = std::move(r1Small); + if (workInRam) { + rectsD0.rectangles = std::move(RectanglesD0WithOrder); + rectsD1.rectangles = std::move(RectanglesD1WithOrder); + orderedInputRectangles.SetOrderedBoxesToRam(rectsD0, rectsD1, boundingBox); + } else { + rectsD0.rectangles = onDiskBase + fileSuffix + ".d0.tmp"; + rectsD1.rectangles = onDiskBase + fileSuffix + ".d1.tmp"; + orderedInputRectangles.SetOrderedBoxesToDisk(rectsD0, rectsD1, xSize, + boundingBox); + } + return orderedInputRectangles; +} + +/*OrderedBoxes SortInput(const std::filesystem::path& onDiskBase, size_t M, uintmax_t + maxBuildingRamUsage, bool workInRam) { if (workInRam) { + return InternalSort(onDiskBase, M); } else { return ExternalSort(onDiskBase, M, + maxBuildingRamUsage); + } +}*/ diff --git a/src/util/RtreeSorter.h b/src/util/RtreeSorter.h new file mode 100644 index 0000000000..b0ba1179f3 --- /dev/null +++ b/src/util/RtreeSorter.h @@ -0,0 +1,13 @@ +// +// Created by nockn on 11/17/23. 
+// + +#ifndef QLEVER_RTREESORTER_H +#define QLEVER_RTREESORTER_H + +#include "./Rtree.h" + +OrderedBoxes SortInput(const std::string& onDiskBase, const std::string& fileSuffix, size_t M, + uintmax_t maxBuildingRamUsage, bool workInRam); + +#endif // QLEVER_RTREESORTER_H diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 84a5d14bc3..44a1329175 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -39,7 +39,7 @@ endfunction() if (SINGLE_TEST_BINARY) message(STATUS "All tests are linked into a single executable `QLeverAllUnitTestsMain`") - add_executable(QLeverAllUnitTestsMain) + add_executable(QLeverAllUnitTestsMain RtreeTest.cpp) qlever_target_link_libraries(QLeverAllUnitTestsMain gtest gmock_main testUtil ${CMAKE_THREAD_LIBS_INIT}) gtest_discover_tests(QLeverAllUnitTestsMain QLeverAllUnitTestsMain PROPERTIES RUN_SERIAL TRUE) @@ -288,6 +288,8 @@ addLinkAndDiscoverTest(TimerTest) addLinkAndDiscoverTest(AlgorithmTest) +addLinkAndDiscoverTest(RtreeTest) + addLinkAndDiscoverTestSerial(CompressedRelationsTest index) addLinkAndDiscoverTest(ExceptionTest) diff --git a/test/RtreeTest.cpp b/test/RtreeTest.cpp new file mode 100644 index 0000000000..eea5cec305 --- /dev/null +++ b/test/RtreeTest.cpp @@ -0,0 +1,112 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Noah Nock + +#include +#include +#include +#include +#include + +bool boundingBoxesAreEqual(BasicGeometry::BoundingBox b1, BasicGeometry::BoundingBox b2) { + if (BasicGeometry::GetMinX(b1) != BasicGeometry::GetMinX(b2)) return false; + if (BasicGeometry::GetMinY(b1) != BasicGeometry::GetMinY(b2)) return false; + if (BasicGeometry::GetMaxX(b1) != BasicGeometry::GetMaxX(b2)) return false; + if (BasicGeometry::GetMaxY(b1) != BasicGeometry::GetMaxY(b2)) return false; + return true; +} + +bool multiBoxGeosAreEqual(multiBoxGeo& m1, multiBoxGeo& m2) { + if (m1.size() != m2.size()) return false; + for (size_t i = 0; i < m1.size(); i++) { + RTreeValue r1 = m1[i]; + RTreeValue r2 = m2[i]; + if (r1.id != r2.id) return false; + if (!boundingBoxesAreEqual(r1.box, r2.box)) return false; + } + return true; +} + +bool multiBoxGeosWithOrderIndexAreEqual(multiBoxWithOrderIndex& m1, multiBoxWithOrderIndex& m2) { + if (m1.size() != m2.size()) return false; + for (size_t i = 0; i < m1.size(); i++) { + RTreeValueWithOrderIndex r1 = m1[i]; + RTreeValueWithOrderIndex r2 = m2[i]; + if (r1.id != r2.id) return false; + if (!boundingBoxesAreEqual(r1.box, r2.box)) return false; + if (r1.orderX != r2.orderX) return false; + if (r1.orderY != r2.orderY) return false; + } + return true; +} + +TEST(Rtree, ConvertWordToRtreeEntry) { + std::string wkt1 = "\"POLYGON((0.0 0.0,0.0 0.0,0.0 0.0,0.0 0.0,0.0 0.0))\"^^"; + std::string wkt2 = "\"MULTIPOLYGON(((-100 -100,0 0,50 50,75 75,100 100), (10 10,20 20,30 30)), ((0 0,-10.0 -10,-20 -20), (-5 -5,-7 -7)))\"^^"; + std::string wkt3 = "\"LINESTRING(-120 -110,0.0 0.0,0.0 0.0,0.0 0.0,120.0 110.0)\"^^"; + std::string wkt4 = "Invalid input"; + std::string wkt5 = "\"POLYGON((1 1,2 2,5 5), (1.1 1.1, 2 2, 3 3))\"^^"; + std::string wkt6 = "\"MULTIPOLYGON(((-100 -100,0 0,50 50,75 75,100 100), (10 10,20 20,30 30)), ((-150 -140,-10.0 -10,160 170), (-5 -5,-7 -7)))\"^^"; + std::optional result1 = BasicGeometry::ConvertWordToRtreeEntry(wkt1); + std::optional 
result2 = BasicGeometry::ConvertWordToRtreeEntry(wkt2); + std::optional result3 = BasicGeometry::ConvertWordToRtreeEntry(wkt3); + std::optional result4 = BasicGeometry::ConvertWordToRtreeEntry(wkt4); + std::optional result5 = BasicGeometry::ConvertWordToRtreeEntry(wkt5); + std::optional result6 = BasicGeometry::ConvertWordToRtreeEntry(wkt6); + ASSERT_TRUE(result1); + ASSERT_TRUE(boundingBoxesAreEqual(result1.value(), BasicGeometry::CreateBoundingBox(0, 0, 0, 0))); + ASSERT_TRUE(result2); + ASSERT_TRUE(boundingBoxesAreEqual(result2.value(), BasicGeometry::CreateBoundingBox(-100, -100, 100, 100))); + ASSERT_TRUE(result3); + ASSERT_TRUE(boundingBoxesAreEqual(result3.value(), BasicGeometry::CreateBoundingBox(-120, -110, 120, 110))); + ASSERT_TRUE(!result4); + ASSERT_TRUE(result5); + ASSERT_TRUE(boundingBoxesAreEqual(result5.value(), BasicGeometry::CreateBoundingBox(1, 1, 5, 5))); + ASSERT_TRUE(result6); + ASSERT_TRUE(boundingBoxesAreEqual(result6.value(), BasicGeometry::CreateBoundingBox(-150, -140, 160, 170))); +} + +TEST(Rtree, IsBorderOfSplitCandidate) { + ASSERT_TRUE(BasicGeometry::IsBorderOfSplitCandidate(16, 16, 16)); + ASSERT_TRUE(BasicGeometry::IsBorderOfSplitCandidate(15, 16, 16)); + ASSERT_TRUE(!BasicGeometry::IsBorderOfSplitCandidate(17, 16, 16)); + ASSERT_TRUE(!BasicGeometry::IsBorderOfSplitCandidate(3185, 200, 16)); + ASSERT_TRUE(!BasicGeometry::IsBorderOfSplitCandidate(3184, 200, 16)); + ASSERT_TRUE(BasicGeometry::IsBorderOfSplitCandidate(3000, 200, 16)); + ASSERT_TRUE(BasicGeometry::IsBorderOfSplitCandidate(2999, 200, 16)); + ASSERT_TRUE(BasicGeometry::IsBorderOfSplitCandidate(200, 200, 16)); + ASSERT_TRUE(BasicGeometry::IsBorderOfSplitCandidate(199, 200, 16)); +} + +TEST(Rtree, CreateBoundingBox) { + BasicGeometry::Point p1 = {-1, -2}; + BasicGeometry::Point p2 = {3, 4}; + BasicGeometry::BoundingBox b = {p1, p2}; + ASSERT_TRUE(boundingBoxesAreEqual(b, BasicGeometry::CreateBoundingBox(-1, -2, 3, 4))); +} + +TEST(Rtree, CombineBoundingBoxes) { + 
BasicGeometry::BoundingBox b1 = BasicGeometry::CreateBoundingBox(0, 0, 0, 0); + BasicGeometry::BoundingBox b2 = BasicGeometry::CreateBoundingBox(1, 2, 3, 4); + BasicGeometry::BoundingBox b3 = BasicGeometry::CreateBoundingBox(-1, -2, -3, -4); + ASSERT_TRUE(boundingBoxesAreEqual(BasicGeometry::CombineBoundingBoxes(b1, b2), BasicGeometry::CreateBoundingBox(0, 0, 3, 4))); + ASSERT_TRUE(boundingBoxesAreEqual(BasicGeometry::CombineBoundingBoxes(b2, b1), BasicGeometry::CreateBoundingBox(0, 0, 3, 4))); + ASSERT_TRUE(boundingBoxesAreEqual(BasicGeometry::CombineBoundingBoxes(b1, b3), BasicGeometry::CreateBoundingBox(-1, -2, 0, 0))); + ASSERT_TRUE(boundingBoxesAreEqual(BasicGeometry::CombineBoundingBoxes(b3, b1), BasicGeometry::CreateBoundingBox(-1, -2, 0, 0))); + BasicGeometry::BoundingBox b4 = BasicGeometry::CreateBoundingBox(-150.0, 30.4, -70.0, 50); + BasicGeometry::BoundingBox b5 = BasicGeometry::CreateBoundingBox(5.0, -30.4, 10.0, 20); + ASSERT_TRUE(boundingBoxesAreEqual(BasicGeometry::CombineBoundingBoxes(b4, b5), BasicGeometry::CreateBoundingBox(-150, -30.4, 10, 50))); +} + +TEST(Rtree, SaveAndLoadEntry) { + multiBoxGeo boxes1 = multiBoxGeo(); + boxes1.push_back(RTreeValue(BasicGeometry::CreateBoundingBox(0, 0, 0, 0), 1)); + std::filesystem::path path1 = "RtreeTest_SaveEntry1"; + std::ofstream ofs1 = std::ofstream("RtreeTest_SaveEntry1", std::ios::binary); + for (RTreeValue element : boxes1) { + FileReaderWithoutIndex::SaveEntry(element.box, element.id, ofs1); + } + ofs1.close(); + multiBoxGeo boxes2 = FileReaderWithoutIndex::LoadEntries(std::filesystem::absolute(path1)); + ASSERT_TRUE(multiBoxGeosAreEqual(boxes1, boxes2)); +} From eb81237e063da918574413e6189be331163e84be Mon Sep 17 00:00:00 2001 From: Noah Nock Date: Tue, 21 Nov 2023 10:44:52 +0100 Subject: [PATCH 20/20] Fixed issue when building rtree --- src/index/IndexImpl.cpp | 6 +- src/util/ComparisonWithNan.h | 1 - src/util/Rtree.h | 4 +- src/util/RtreeBasicGeometry.h | 25 ++++ src/util/RtreeBuild.cpp | 10 +- 
src/util/RtreeNode.h | 9 ++ src/util/RtreeSorter.cpp | 231 +++++++++++++++++----------------- 7 files changed, 164 insertions(+), 122 deletions(-) diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index fa423eee73..b311f92d69 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -448,10 +448,10 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( res.idTriples = std::move(*idTriples.wlock()); LOG(INFO) << "Building the Rtree..." << std::endl; try { - Rtree rtree = Rtree(1300000000000); - rtree.BuildTree(onDiskBase_, ".vocabulary.boundingbox", 16, + Rtree rtree = Rtree(10000000000); + uint64_t treeSize = rtree.BuildTree(onDiskBase_, ".vocabulary.boundingbox", 16, "./rtree_build"); - LOG(INFO) << "Finished building the Rtree" << std::endl; + LOG(INFO) << "Finished building the Rtree with " << treeSize << " elements." << std::endl; } catch (const std::exception& e) { LOG(INFO) << e.what() << std::endl; } diff --git a/src/util/ComparisonWithNan.h b/src/util/ComparisonWithNan.h index 748ba429a5..a2bd34f1da 100644 --- a/src/util/ComparisonWithNan.h +++ b/src/util/ComparisonWithNan.h @@ -3,7 +3,6 @@ // Author: Johannes Kalmbach #pragma once - #include #include diff --git a/src/util/Rtree.h b/src/util/Rtree.h index d21d197d28..61763d14ab 100644 --- a/src/util/Rtree.h +++ b/src/util/Rtree.h @@ -56,7 +56,7 @@ class Rtree { // ___________________________________________________________________________ // Build the whole Rtree with the raw data in onDiskBase + fileSuffix + // ".tmp", M as branching factor and folder as Rtree destination - void BuildTree(const std::string& onDiskBase, const std::string& fileSuffix, + uint64_t BuildTree(const std::string& onDiskBase, const std::string& fileSuffix, size_t M, const std::string& folder) const; // ___________________________________________________________________________ // Search for an intersection of query with any elements of the Rtree @@ -69,7 +69,7 @@ class Rtree { // Data structure 
handling the datapoints of the Rtree sorted in x and y // direction (either on ram or on disk) class OrderedBoxes { - private: + public: // TODO bool workInRam_{}; uint64_t size_{}; BasicGeometry::BoundingBox boundingBox_{}; diff --git a/src/util/RtreeBasicGeometry.h b/src/util/RtreeBasicGeometry.h index 70540602cb..817b77aa8c 100644 --- a/src/util/RtreeBasicGeometry.h +++ b/src/util/RtreeBasicGeometry.h @@ -68,6 +68,15 @@ class BasicGeometry { return {{globalMinX, globalMinY}, {globalMaxX, globalMaxY}}; } + static bool BoundingBoxesAreEqual(BasicGeometry::BoundingBox b1, + BasicGeometry::BoundingBox b2) { + if (BasicGeometry::GetMinX(b1) != BasicGeometry::GetMinX(b2)) return false; + if (BasicGeometry::GetMinY(b1) != BasicGeometry::GetMinY(b2)) return false; + if (BasicGeometry::GetMaxX(b1) != BasicGeometry::GetMaxX(b2)) return false; + if (BasicGeometry::GetMaxY(b1) != BasicGeometry::GetMaxY(b2)) return false; + return true; + } + static bool IsBorderOfSplitCandidate(uint64_t current, uint64_t splitSize, uint64_t M) { if (((current + 1) % splitSize == 0 && (current + 1) / splitSize < M) || @@ -126,6 +135,13 @@ struct RTreeValue { [[nodiscard]] double MinY() const { return box.min_corner().get<1>(); } [[nodiscard]] double MaxY() const { return box.max_corner().get<1>(); } + bool operator==(const RTreeValue& other) const + { + if (id != other.id) return false; + if (!BasicGeometry::BoundingBoxesAreEqual(box, other.box)) return false; + return true; + } + template void serialize(Archive& a, [[maybe_unused]] const unsigned int version) { a& box; @@ -140,6 +156,15 @@ struct RTreeValue { struct RTreeValueWithOrderIndex : RTreeValue { uint64_t orderX = 0; uint64_t orderY = 0; + + bool operator==(const RTreeValueWithOrderIndex& other) const + { + if (id != other.id) return false; + if (!BasicGeometry::BoundingBoxesAreEqual(box, other.box)) return false; + if (orderX != other.orderX) return false; + if (orderY != other.orderY) return false; + return true; + } }; namespace 
boost::serialization { diff --git a/src/util/RtreeBuild.cpp b/src/util/RtreeBuild.cpp index 4ac3781fcd..ebab1b1928 100644 --- a/src/util/RtreeBuild.cpp +++ b/src/util/RtreeBuild.cpp @@ -60,7 +60,7 @@ static std::vector TGSRecursive( return result; } -void Rtree::BuildTree(const std::string& onDiskBase, +uint64_t Rtree::BuildTree(const std::string& onDiskBase, const std::string& fileSuffix, size_t M, const std::string& folder) const { const std::filesystem::path file = onDiskBase + fileSuffix + ".tmp"; @@ -74,9 +74,15 @@ void Rtree::BuildTree(const std::string& onDiskBase, 4 < this->maxBuildingRamUsage_; + std::cout << "Sorting" << (workInRam ? " in ram..." : "on disk...") << std::endl; OrderedBoxes orderedInputRectangles = SortInput(onDiskBase, fileSuffix, M, maxBuildingRamUsage_, workInRam); + uint64_t totalSize = orderedInputRectangles.GetSize(); + //OrderedBoxes orderedInputRectangles = InternalSort(onDiskBase, fileSuffix, M); std::cout << "Finished initial sorting" << std::endl; + std::cout << orderedInputRectangles.GetSize() << std::endl; + std::cout << orderedInputRectangles.rectsD0_.rectanglesSmall.size() << std::endl; + std::cout << orderedInputRectangles.rectsD1_.rectanglesSmall.size() << std::endl; // prepare the files std::filesystem::create_directory(folder); @@ -137,6 +143,8 @@ void Rtree::BuildTree(const std::string& onDiskBase, lookupOfs.write(reinterpret_cast(&nodePtr), sizeof(uint64_t)); } lookupOfs.close(); + + return totalSize; } bool OrderedBoxes::WorkInRam() const { return this->workInRam_; } diff --git a/src/util/RtreeNode.h b/src/util/RtreeNode.h index 7697b5b368..572d22e084 100644 --- a/src/util/RtreeNode.h +++ b/src/util/RtreeNode.h @@ -38,6 +38,15 @@ class RtreeNode { void SetIsLastInnerNode(bool isLast); [[nodiscard]] bool GetIsLastInnerNode() const; multiBoxGeo GetChildren(); + + bool operator==(const RtreeNode& other) const + { + if (id_ != other.id_) return false; + if (!BasicGeometry::BoundingBoxesAreEqual(boundingBox_, 
other.boundingBox_)) return false; + if (isLastInnerNode_ != other.isLastInnerNode_) return false; + if (children_ != other.children_) return false; + return true; + } }; BOOST_CLASS_VERSION(RtreeNode, 1) diff --git a/src/util/RtreeSorter.cpp b/src/util/RtreeSorter.cpp index ee89d42cc2..bfcd126bd1 100644 --- a/src/util/RtreeSorter.cpp +++ b/src/util/RtreeSorter.cpp @@ -101,70 +101,111 @@ static void centerOrdering(multiBoxWithOrderIndex& boxes, size_t dim) { } } -OrderedBoxes SortInput(const std::string& onDiskBase, - const std::string& fileSuffix, size_t M, - uintmax_t maxBuildingRamUsage, bool workInRam) { +RTreeValueWithOrderIndex handleD0ElementsWithoutIndex(RTreeValue& element, uint64_t i, BasicGeometry::BoundingBox& boundingBox) { + RTreeValueWithOrderIndex entry = {{element.box, element.id}, i, 0}; + if (i == 0) { + boundingBox = element.box; + } else { + boundingBox = + BasicGeometry::CombineBoundingBoxes(boundingBox, element.box); // TODO + } + return entry; +} + +void handleD1Elements(RTreeValueWithOrderIndex& element, uint64_t i, uint64_t S, size_t M, multiBoxWithOrderIndex& r1Small) { + element.orderY = i; + + if (BasicGeometry::IsBorderOfSplitCandidate(i, S, M)) { + // index i * S - 1 or i * S + r1Small.push_back(element); + } +} + +void handleD0Elements(RTreeValueWithOrderIndex& element, uint64_t i, uint64_t S, size_t M, multiBoxWithOrderIndex& r0Small) { + if (BasicGeometry::IsBorderOfSplitCandidate(i, S, M)) { + // index i * S - 1 or i * S + r0Small.push_back(element); + } +} + +OrderedBoxes InternalSort(const std::string& onDiskBase, const std::string& fileSuffix, size_t M) { + OrderedBoxes orderedInputRectangles; + multiBoxGeo RectanglesD0 = FileReaderWithoutIndex::LoadEntries(onDiskBase + fileSuffix + ".tmp"); + centerOrdering(RectanglesD0, 0); + + size_t currentS = std::ceil(((float) RectanglesD0.size()) / ((float) M)); + if (RectanglesD0.size() <= M * M) { + // in this case S can just be M + currentS = M; + } + + multiBoxWithOrderIndex R0Small 
= multiBoxWithOrderIndex(); + multiBoxWithOrderIndex R1Small = multiBoxWithOrderIndex(); + + BasicGeometry::BoundingBox boundingBox = BasicGeometry::CreateBoundingBox(0, 0, 0, 0); + multiBoxWithOrderIndex RectanglesD1WithOrder = multiBoxWithOrderIndex(); + for (uint64_t i = 0; i < RectanglesD0.size(); i++) { + RTreeValueWithOrderIndex entry = handleD0ElementsWithoutIndex(RectanglesD0[i], i, boundingBox); + RectanglesD1WithOrder.push_back(entry); + } + + centerOrdering(RectanglesD1WithOrder, 1); + + R1Small.push_back((RectanglesD1WithOrder)[0]); + RTreeValueWithOrderIndex maxElementDim1 = (RectanglesD1WithOrder)[RectanglesD1WithOrder.size() - 1]; + maxElementDim1.orderY = RectanglesD1WithOrder.size() - 1; + R1Small.push_back(maxElementDim1); + for (uint64_t i = 0; i < RectanglesD1WithOrder.size(); i++) { + handleD1Elements(RectanglesD1WithOrder[i], i, currentS, M, R1Small); + } + + multiBoxWithOrderIndex RectanglesD0WithOrder = multiBoxWithOrderIndex(RectanglesD1WithOrder); + centerOrdering(RectanglesD0WithOrder, 0); + + R0Small.push_back((RectanglesD0WithOrder)[0]); + RTreeValueWithOrderIndex maxElementDim0 = (RectanglesD0WithOrder)[RectanglesD0WithOrder.size() - 1]; + maxElementDim0.orderY = RectanglesD0WithOrder.size() - 1; + R0Small.push_back(maxElementDim0); + for (uint64_t i = 0; i < RectanglesD0WithOrder.size(); i++) { + handleD0Elements(RectanglesD0WithOrder[i], i, currentS, M, R0Small); + } + + RectanglesForOrderedBoxes d0WithOrder; + d0WithOrder.rectangles = RectanglesD0WithOrder; + d0WithOrder.rectanglesSmall = R0Small; + RectanglesForOrderedBoxes d1WithOrder; + d1WithOrder.rectangles = RectanglesD1WithOrder; + d1WithOrder.rectanglesSmall = R1Small; + orderedInputRectangles.SetOrderedBoxesToRam(d0WithOrder, d1WithOrder, boundingBox); + return orderedInputRectangles; +} + +OrderedBoxes ExternalSort(const std::string& onDiskBase, + const std::string& fileSuffix, size_t M, + uintmax_t maxBuildingRamUsage) { OrderedBoxes orderedInputRectangles; 
std::filesystem::path file = onDiskBase + fileSuffix + ".tmp"; - auto maxRamForSorter = std::ceil(((double)maxBuildingRamUsage < 9999999999.0 - ? (double)maxBuildingRamUsage - : 9999999999.0) / - 3.0); - ad_utility::BackgroundStxxlSorter> - sorterRectsD0Basic = - ad_utility::BackgroundStxxlSorter>( - (size_t)maxRamForSorter); - multiBoxGeo rectsD0Basic; + auto maxRamForSorter = std::ceil(std::min((double)maxBuildingRamUsage / 3.0, 9999999999.0 / 3.0)); + ad_utility::BackgroundStxxlSorter> sorterRectsD0Basic = + ad_utility::BackgroundStxxlSorter>((size_t)maxRamForSorter);; - if (workInRam) { - rectsD0Basic = FileReaderWithoutIndex::LoadEntries(file); - centerOrdering(rectsD0Basic, 0); - } else { - for (const RTreeValue& rectD0Element : FileReaderWithoutIndex(file)) { - sorterRectsD0Basic.push(rectD0Element); - } + for (const RTreeValue& rectD0Element : FileReaderWithoutIndex(file)) { + sorterRectsD0Basic.push(rectD0Element); } uint64_t xSize = 0; BasicGeometry::BoundingBox boundingBox = BasicGeometry::CreateBoundingBox(0, 0, 0, 0); - ad_utility::BackgroundStxxlSorter> - sorterRectsD1 = + ad_utility::BackgroundStxxlSorter> sorterRectsD1 = ad_utility::BackgroundStxxlSorter>( - (size_t)maxRamForSorter); - multiBoxWithOrderIndex RectanglesD1WithOrder = multiBoxWithOrderIndex(); + SortRuleLambdaWithIndex<1>>((size_t)maxRamForSorter); - if (workInRam) { - for (RTreeValue element : rectsD0Basic) { - RTreeValueWithOrderIndex entry = {{element.box, element.id}, xSize, 0}; - RectanglesD1WithOrder.push_back(entry); - - if (xSize == 0) { - boundingBox = element.box; - } else { - boundingBox = - BasicGeometry::CombineBoundingBoxes(boundingBox, element.box); - } - xSize++; - } - centerOrdering(RectanglesD1WithOrder, 1); - } else { - for (RTreeValue element : sorterRectsD0Basic.sortedView()) { - RTreeValueWithOrderIndex entry = {{element.box, element.id}, xSize, 0}; - sorterRectsD1.push(entry); - - if (xSize == 0) { - boundingBox = element.box; - } else { - boundingBox = - 
BasicGeometry::CombineBoundingBoxes(boundingBox, element.box); - } - xSize++; - } + for (RTreeValue element : sorterRectsD0Basic.sortedView()) { + RTreeValueWithOrderIndex entry = handleD0ElementsWithoutIndex(element, xSize, boundingBox); + sorterRectsD1.push(entry); + xSize++; } sorterRectsD0Basic.clear(); @@ -177,52 +218,26 @@ OrderedBoxes SortInput(const std::string& onDiskBase, uint64_t ySize = 0; std::ofstream r1File = std::ofstream(onDiskBase + fileSuffix + ".d1.tmp", std::ios::binary); - ad_utility::BackgroundStxxlSorter> - sorterRectsD0 = + ad_utility::BackgroundStxxlSorter> sorterRectsD0 = ad_utility::BackgroundStxxlSorter>( - (size_t)maxRamForSorter); - multiBoxWithOrderIndex RectanglesD0WithOrder = multiBoxWithOrderIndex(); + SortRuleLambdaWithIndex<0>>((size_t)maxRamForSorter); multiBoxWithOrderIndex r1Small = multiBoxWithOrderIndex(); // placeholder r1Small.emplace_back(); r1Small.emplace_back(); + RTreeValueWithOrderIndex minD1; RTreeValueWithOrderIndex maxD1; - - auto processD1Element = [&ySize, currentS, M, &r1Small, &minD1, - &maxD1](RTreeValueWithOrderIndex& element) { - element.orderY = ySize; - - if (BasicGeometry::IsBorderOfSplitCandidate(ySize, currentS, M)) { - // index i * S - 1 or i * S - r1Small.push_back(element); - } - + for (RTreeValueWithOrderIndex element : sorterRectsD1.sortedView()) { + handleD1Elements(element, ySize, currentS, M, r1Small); + FileReader::SaveEntryWithOrderIndex(element, r1File); + sorterRectsD0.push(element); if (ySize == 0) { minD1 = element; } maxD1 = element; - ySize++; - }; - - if (workInRam) { - for (RTreeValueWithOrderIndex element : RectanglesD1WithOrder) { - processD1Element(element); - - RectanglesD0WithOrder.push_back(element); - } - centerOrdering(RectanglesD0WithOrder, 0); - } else { - for (RTreeValueWithOrderIndex element : sorterRectsD1.sortedView()) { - processD1Element(element); - - FileReader::SaveEntryWithOrderIndex(element, r1File); - sorterRectsD0.push(element); - } } r1File.close(); @@ -239,34 
+254,17 @@ OrderedBoxes SortInput(const std::string& onDiskBase, // placeholder r0Small.emplace_back(); r0Small.emplace_back(); + RTreeValueWithOrderIndex minD0; RTreeValueWithOrderIndex maxD0; - - auto processD0Element = [¤tX, currentS, M, &r0Small, &minD0, - &maxD0](RTreeValueWithOrderIndex& element) { - if (BasicGeometry::IsBorderOfSplitCandidate(currentX, currentS, M)) { - // index i * S - 1 or i * S - r0Small.push_back(element); - } - + for (RTreeValueWithOrderIndex element : sorterRectsD0.sortedView()) { + FileReader::SaveEntryWithOrderIndex(element, r0File); + handleD0Elements(element, currentX, currentS, M, r0Small); if (currentX == 0) { minD0 = element; } maxD0 = element; - currentX++; - }; - - if (workInRam) { - for (RTreeValueWithOrderIndex element : RectanglesD0WithOrder) { - processD0Element(element); - } - } else { - for (RTreeValueWithOrderIndex element : sorterRectsD0.sortedView()) { - FileReader::SaveEntryWithOrderIndex(element, r0File); - - processD0Element(element); - } } r0File.close(); @@ -280,17 +278,20 @@ OrderedBoxes SortInput(const std::string& onDiskBase, RectanglesForOrderedBoxes rectsD1; rectsD0.rectanglesSmall = std::move(r0Small); rectsD1.rectanglesSmall = std::move(r1Small); + rectsD0.rectangles = onDiskBase + fileSuffix + ".d0.tmp"; + rectsD1.rectangles = onDiskBase + fileSuffix + ".d1.tmp"; + orderedInputRectangles.SetOrderedBoxesToDisk(rectsD0, rectsD1, xSize, + boundingBox); + return orderedInputRectangles; +} + +OrderedBoxes SortInput(const std::string& onDiskBase, + const std::string& fileSuffix, size_t M, + uintmax_t maxBuildingRamUsage, bool workInRam) { if (workInRam) { - rectsD0.rectangles = std::move(RectanglesD0WithOrder); - rectsD1.rectangles = std::move(RectanglesD1WithOrder); - orderedInputRectangles.SetOrderedBoxesToRam(rectsD0, rectsD1, boundingBox); - } else { - rectsD0.rectangles = onDiskBase + fileSuffix + ".d0.tmp"; - rectsD1.rectangles = onDiskBase + fileSuffix + ".d1.tmp"; - 
orderedInputRectangles.SetOrderedBoxesToDisk(rectsD0, rectsD1, xSize, - boundingBox); + return InternalSort(onDiskBase, fileSuffix, M); } - return orderedInputRectangles; + return ExternalSort(onDiskBase, fileSuffix, M, maxBuildingRamUsage); } /*OrderedBoxes SortInput(const std::filesystem::path& onDiskBase, size_t M,