From ec75e8975711e481be73160e9dca0bf4e0d5ba41 Mon Sep 17 00:00:00 2001 From: Jonathan Zeller Date: Sat, 25 Jan 2025 22:26:15 +0100 Subject: [PATCH] arbitrary geometry types and areas dont need to be approximated as points --- src/engine/SpatialJoin.h | 4 + src/engine/SpatialJoinAlgorithms.cpp | 118 +++++++++++++--------- src/engine/SpatialJoinAlgorithms.h | 19 ++++ test/engine/SpatialJoinAlgorithmsTest.cpp | 114 +++++++++++++++++++++ test/engine/SpatialJoinTestHelpers.h | 11 +- 5 files changed, 217 insertions(+), 49 deletions(-) diff --git a/src/engine/SpatialJoin.h b/src/engine/SpatialJoin.h index af76c49a81..c85a25059a 100644 --- a/src/engine/SpatialJoin.h +++ b/src/engine/SpatialJoin.h @@ -174,6 +174,10 @@ class SpatialJoin : public Operation { return childRight_; } + PreparedSpatialJoinParams onlyForTestingGetPrepareJoin() const { + return prepareJoin(); + } + private: // helper function to generate a variable to column map from `childRight_` // that only contains the columns selected by `config_.payloadVariables_` diff --git a/src/engine/SpatialJoinAlgorithms.cpp b/src/engine/SpatialJoinAlgorithms.cpp index 6ac16d848d..e434c33dcf 100644 --- a/src/engine/SpatialJoinAlgorithms.cpp +++ b/src/engine/SpatialJoinAlgorithms.cpp @@ -50,21 +50,48 @@ std::string SpatialJoinAlgorithms::betweenQuotes( } } +struct DistanceVisitor : public boost::static_visitor { + template + double operator()(const Geometry1& geom1, const Geometry2& geom2) const { + return bg::distance(geom1, geom2); + } +}; + // ____________________________________________________________________________ Id SpatialJoinAlgorithms::computeDist(const IdTable* idTableLeft, const IdTable* idTableRight, size_t rowLeft, size_t rowRight, ColumnIndex leftPointCol, ColumnIndex rightPointCol) const { + auto getAreaString = [&](const IdTable* idtable, size_t row, size_t col) { + return betweenQuotes(ExportQueryExecutionTrees::idToStringAndType( + qec_->getIndex(), idtable->at(row, col), {}) + .value() + .first); + }; + + auto getAreaOrPointGeometry = + [&](const IdTable* idtable, size_t row, size_t col, + std::optional point) -> std::optional { + AnyGeometry geometry; + try { + if (!point) { + boost::geometry::read_wkt(getAreaString(idtable, row, col), geometry); + } else { + geometry = Point(point.value().getLng(), point.value().getLat()); + } + } catch (...) { + return std::nullopt; + } + return geometry; + }; + // for now we need to get the data from the disk, but in the future, this // information will be stored in an ID, just like GeoPoint auto getAreaPoint = [&](const IdTable* idtable, size_t row, size_t col) -> std::optional { - std::string areastring = - betweenQuotes(ExportQueryExecutionTrees::idToStringAndType( - qec_->getIndex(), idtable->at(row, col), {}) - .value() - .first); + std::string areastring = getAreaString(idtable, row, col); + Box areaBox; try { areaBox = calculateBoundingBoxOfArea(areastring); @@ -73,29 +100,41 @@ Id SpatialJoinAlgorithms::computeDist(const IdTable* idTableLeft, // gets printed at another place and the point/area just gets skipped return std::nullopt; } - if (useMidpointForAreas_) { - Point p = calculateMidpointOfBox(areaBox); - return GeoPoint(p.get<1>(), p.get<0>()); - } else { - AD_FAIL(); // TODO not yet implemented - } + Point p = calculateMidpointOfBox(areaBox); + return GeoPoint(p.get<1>(), p.get<0>()); }; auto point1 = getPoint(idTableLeft, rowLeft, leftPointCol); - if (!point1) { - point1 = getAreaPoint(idTableLeft, rowLeft, leftPointCol); - } - auto point2 = getPoint(idTableRight, rowRight, rightPointCol); - if (!point2) { - point2 = getAreaPoint(idTableRight, rowRight, rightPointCol); - } + if (useMidpointForAreas_) { + if (!point1) { + point1 = getAreaPoint(idTableLeft, rowLeft, leftPointCol); + } + + if (!point2) { + point2 = getAreaPoint(idTableRight, rowRight, rightPointCol); + } - if (!point1.has_value() || !point2.has_value()) { - return Id::makeUndefined(); + if (!point1.has_value() || !point2.has_value()) { + return Id::makeUndefined(); + } + return Id::makeFromDouble( + ad_utility::detail::wktDistImpl(point1.value(), point2.value())); + } else { + auto geometry1 = + getAreaOrPointGeometry(idTableLeft, rowLeft, leftPointCol, point1); + auto geometry2 = + getAreaOrPointGeometry(idTableRight, rowRight, rightPointCol, point2); + if (!geometry1.has_value() || !geometry2.has_value()) { + return Id::makeUndefined(); + } + // convert to m and return. Note that the 78630 is an approximation, which + // does not provide accurate results for the poles. + return Id::makeFromDouble(boost::apply_visitor(DistanceVisitor(), + geometry1.value(), + geometry2.value()) * + 78630); } - return Id::makeFromDouble( - ad_utility::detail::wktDistImpl(point1.value(), point2.value())); } // ____________________________________________________________________________ @@ -489,32 +528,21 @@ std::array SpatialJoinAlgorithms::isAPoleTouched( return std::array{northPoleReached, southPoleReached}; } +struct BoundingBoxVisitor : public boost::static_visitor { + template + Box operator()(const Geometry& geometry) const { + Box box; + bg::envelope(geometry, box); + return box; + } +}; + // ____________________________________________________________________________ Box SpatialJoinAlgorithms::calculateBoundingBoxOfArea( const std::string& wktString) const { - Polygon polygon; - boost::geometry::read_wkt(wktString, polygon); - double minLng = std::numeric_limits::infinity(); - double maxLng = -std::numeric_limits::infinity(); - double minLat = std::numeric_limits::infinity(); - double maxLat = -std::numeric_limits::infinity(); - for (const auto& point : polygon.outer()) { - double lng = boost::geometry::get<0>(point); - double lat = boost::geometry::get<1>(point); - if (lng < minLng) { - minLng = lng; - } - if (lng > maxLng) { - maxLng = lng; - } - if (lat < minLat) { - minLat = lat; - } - if (lat > maxLat) { - maxLat = lat; - } - } - return Box(Point(minLng, minLat), Point(maxLng, maxLat)); + AnyGeometry geometry; + boost::geometry::read_wkt(wktString, geometry); + return boost::apply_visitor(BoundingBoxVisitor(), geometry); } // ____________________________________________________________________________ diff --git a/src/engine/SpatialJoinAlgorithms.h b/src/engine/SpatialJoinAlgorithms.h index 28bf302a6f..0fb411ffc2 100644 --- a/src/engine/SpatialJoinAlgorithms.h +++ b/src/engine/SpatialJoinAlgorithms.h @@ -23,6 +23,12 @@ using Box = bg::model::box; using Value = std::pair; using Polygon = boost::geometry::model::polygon< boost::geometry::model::d2::point_xy>; +using Linestring = bg::model::linestring; +using MultiPoint = bg::model::multi_point; +using MultiLinestring = bg::model::multi_linestring; +using MultiPolygon = bg::model::multi_polygon; +using AnyGeometry = boost::variant; } // namespace BoostGeometryNamespace class SpatialJoinAlgorithms { @@ -58,6 +64,19 @@ class SpatialJoinAlgorithms { return calculateMidpointOfBox(box); } + void setUseMidpointForAreas_(bool useMidpointForAreas) { + useMidpointForAreas_ = useMidpointForAreas; + } + + Id OnlyForTestingWrapperComputeDist(const IdTable* idTableLeft, + const IdTable* idTableRight, + size_t rowLeft, size_t rowRight, + ColumnIndex leftPointCol, + ColumnIndex rightPointCol) const { + return computeDist(idTableLeft, idTableRight, rowLeft, rowRight, + leftPointCol, rightPointCol); + } + double OnlyForTestingWrapperGetMaxDistFromMidpointToAnyPointInsideTheBox( const BoostGeometryNamespace::Box& box, std::optional midpoint = diff --git a/test/engine/SpatialJoinAlgorithmsTest.cpp b/test/engine/SpatialJoinAlgorithmsTest.cpp index 7c6b861244..69c656ed43 100644 --- a/test/engine/SpatialJoinAlgorithmsTest.cpp +++ b/test/engine/SpatialJoinAlgorithmsTest.cpp @@ -1514,6 +1514,120 @@ TEST(SpatialJoin, getMaxDistFromMidpointToAnyPointInsideTheBox) { area_statue, midpoint_statue)); } +QueryExecutionContext* getAllGeometriesQEC() { + auto addRow = [](std::string& kg, std::string nr, std::string wktStr) { + kg += absl::StrCat(" \"", wktStr, "\".\n"); + }; + // each object (except for the point) has its leftmost coordinate at an + // integer number and its rightmost coordinate 0.5 units further right. The + // y coordinate will be 0 for those case. All other points are inbetween these + // two points with different y coordinates. The reason for that is, that it + // is very easy to calculate the shortest distance between two geometries. + std::string point = "POINT(1.5 0)"; + std::string linestring = "LINESTRING(2.0 0, 2.2 1, 2.5 0)"; + std::string polygon = "POLYGON((3.0 0, 3.1 1, 3.2 2, 3.5 0))"; + std::string multiPoint = "MULTIPOINT((4.0 0), (4.2 1), (4.5 0))"; + std::string multiLinestring = + "MULTILINESTRING((5.0 0, 5.2 1, 5.5 1), (5.1 3, 5.5 0))"; + std::string multiPolygon = + "MULTIPOLYGON(((6.0 0, 6.1 1, 6.3 5, 6.4 2)), ((6.2 1, 6.3 4, 6.4 3, 6.5 " + "0)))"; + + std::string kg = ""; // tiny test knowledge graph for all geometries + addRow(kg, "1", point); + addRow(kg, "2", linestring); + addRow(kg, "3", polygon); + addRow(kg, "4", multiPoint); + addRow(kg, "5", multiLinestring); + addRow(kg, "6", multiPolygon); + + auto qec = ad_utility::testing::getQec(kg, true, true, false, 16_MB, false, + true, std::nullopt, 10_kB); + return qec; +} + +TEST(SpatialJoin, areaFormat) { + auto qec = getAllGeometriesQEC(); + auto leftChild = + buildIndexScan(qec, {"?geo1", std::string{""}, "?obj1"}); + auto rightChild = + buildIndexScan(qec, {"?geo2", std::string{""}, "?obj2"}); + + std::shared_ptr spatialJoinOperation = + ad_utility::makeExecutionTree( + qec, + SpatialJoinConfiguration{MaxDistanceConfig(100000000), + Variable{"?obj1"}, Variable{"?obj2"}}, + leftChild, rightChild); + + std::shared_ptr op = spatialJoinOperation->getRootOperation(); + SpatialJoin* spatialJoin = static_cast(op.get()); + spatialJoin->selectAlgorithm(SpatialJoinAlgorithm::BOUNDING_BOX); + auto res = spatialJoin->getResult(); + // if all rows can be parsed correctly, the result should be the cross + // product. (Lines which can't be parsed will be ignored (and a warning gets + // printed) and therefore the cross product of all parsed lines would be + // smaller then 36) + ASSERT_EQ(res->idTable().numRows(), 36); +} + +TEST(SpatialJoin, trueAreaDistance) { + auto testDist = [](QueryExecutionContext* qec, std::string nr1, + std::string nr2, double distance) { + auto makeIndexScan = [&](std::string nr) { + auto subject = absl::StrCat(""); + auto objStr = absl::StrCat("?obj", nr); + TripleComponent object{Variable{objStr}}; + return ad_utility::makeExecutionTree( + qec, Permutation::Enum::PSO, + SparqlTriple{TripleComponent::Iri::fromIriref(subject), "", + object}); + }; + auto scan1 = makeIndexScan(nr1); + auto scan2 = makeIndexScan(nr2); + auto var1 = absl::StrCat("?obj", nr1); + auto var2 = absl::StrCat("?obj", nr2); + + std::shared_ptr spatialJoinOperation = + ad_utility::makeExecutionTree( + qec, + SpatialJoinConfiguration{MaxDistanceConfig(100000000), + Variable{var1}, Variable{var2}}, + scan1, scan2); + + std::shared_ptr op = spatialJoinOperation->getRootOperation(); + SpatialJoin* spatialJoin = static_cast(op.get()); + spatialJoin->selectAlgorithm(SpatialJoinAlgorithm::BOUNDING_BOX); + PreparedSpatialJoinParams params = + spatialJoin->onlyForTestingGetPrepareJoin(); + SpatialJoinAlgorithms algorithms{ + qec, params, spatialJoin->onlyForTestingGetConfig(), std::nullopt}; + algorithms.setUseMidpointForAreas_(false); + auto distID = algorithms.OnlyForTestingWrapperComputeDist( + params.idTableLeft_, params.idTableRight_, 0, 0, params.leftJoinCol_, + params.rightJoinCol_); + ASSERT_EQ(distID.getDatatype(), Datatype::Double); + ASSERT_DOUBLE_EQ(distID.getDouble(), distance); + }; + auto qec = getAllGeometriesQEC(); + double conversionFactor = 78630; // convert to meters + testDist(qec, "1", "2", 0.5 * conversionFactor); + testDist(qec, "1", "3", 1.5 * conversionFactor); + testDist(qec, "1", "4", 2.5 * conversionFactor); + testDist(qec, "1", "5", 3.5 * conversionFactor); + testDist(qec, "1", "6", 4.5 * conversionFactor); + testDist(qec, "2", "3", 0.5 * conversionFactor); + testDist(qec, "2", "4", 1.5 * conversionFactor); + testDist(qec, "2", "5", 2.5 * conversionFactor); + testDist(qec, "2", "6", 3.5 * conversionFactor); + testDist(qec, "3", "4", 0.5 * conversionFactor); + testDist(qec, "3", "5", 1.5 * conversionFactor); + testDist(qec, "3", "6", 2.5 * conversionFactor); + testDist(qec, "4", "5", 0.5 * conversionFactor); + testDist(qec, "4", "6", 1.5 * conversionFactor); + testDist(qec, "5", "6", 0.5 * conversionFactor); +} + } // namespace boundingBox } // namespace diff --git a/test/engine/SpatialJoinTestHelpers.h b/test/engine/SpatialJoinTestHelpers.h index a46197040f..eab4082004 100644 --- a/test/engine/SpatialJoinTestHelpers.h +++ b/test/engine/SpatialJoinTestHelpers.h @@ -349,12 +349,15 @@ inline std::shared_ptr buildSmallChild( // one of the wrapper classes needs a proper maxDistance, otherwise the wrapper // can't be used to test the function inline SpatialJoinAlgorithms getDummySpatialJoinAlgsForWrapperTesting( - size_t maxDist = 1000) { - auto qec = buildTestQEC(); + size_t maxDist = 1000, + std::optional qec = std::nullopt) { + if (!qec) { + qec = buildTestQEC(); + } MaxDistanceConfig task{maxDist}; std::shared_ptr spatialJoinOperation = ad_utility::makeExecutionTree( - qec, + qec.value(), SpatialJoinConfiguration{task, Variable{"?point1"}, Variable{"?point2"}}, std::nullopt, std::nullopt); @@ -373,7 +376,7 @@ inline SpatialJoinAlgorithms getDummySpatialJoinAlgsForWrapperTesting( spatialJoin->getMaxDist(), std::nullopt}; - return {qec, params, spatialJoin->onlyForTestingGetConfig()}; + return {qec.value(), params, spatialJoin->onlyForTestingGetConfig()}; } } // namespace SpatialJoinTestHelpers