From 2d9171704497ab21fb9711821d8b3ed7809973e2 Mon Sep 17 00:00:00 2001 From: "hengjiang.ly" Date: Tue, 6 Aug 2024 14:35:18 +0800 Subject: [PATCH] opt like fix like add arch independent impl opt code remove simd_strstr to a separate PR fix format --- velox/docs/functions/presto/regexp.rst | 6 ++ velox/functions/lib/Re2Functions.cpp | 79 ++++++++++++++++--- velox/functions/lib/Re2Functions.h | 19 ++++- .../functions/lib/tests/Re2FunctionsTest.cpp | 25 ++++++ 4 files changed, 115 insertions(+), 14 deletions(-) diff --git a/velox/docs/functions/presto/regexp.rst b/velox/docs/functions/presto/regexp.rst index d7d41eb8361d..d6d646eeb0ca 100644 --- a/velox/docs/functions/presto/regexp.rst +++ b/velox/docs/functions/presto/regexp.rst @@ -32,6 +32,12 @@ limited to 20 different expressions per instance and thread of execution. SELECT like('abc', '%b%'); -- true SELECT like('a_c', '%#_%', '#'); -- true + String sequence search: There are some patterns that are equivalent to simple + string searches. For such constant patterns without custom-escape, Velox uses + substring searches instead of regex searches, for example: + like('hello velox', '%hello%velox%') + is equivalent to searching for the strings "hello" and "velox" in sequence. + .. 
function:: regexp_extract(string, pattern) -> varchar Returns the first substring matched by the regular expression ``pattern`` diff --git a/velox/functions/lib/Re2Functions.cpp b/velox/functions/lib/Re2Functions.cpp index eb5b8f9db7a6..f47ff51b8849 100644 --- a/velox/functions/lib/Re2Functions.cpp +++ b/velox/functions/lib/Re2Functions.cpp @@ -650,6 +650,22 @@ bool matchSubstringPattern( std::string::npos); } +bool matchSubstringsPattern( + const StringView& input, + const std::vector& patterns) { + const char* data = input.data(); + for (int i = 0; i < patterns.size(); i++) { + auto curPos = + std::string_view(data, input.end() - data) + .find(std::string_view(patterns[i].c_str(), patterns[i].size())); + if (curPos == std::string::npos) { + return false; + } + data = data + curPos + patterns[i].size(); + } + return true; +} + // Return true if the input VARCHAR argument is all-ASCII for the specified // rows. FOLLY_ALWAYS_INLINE static bool isAsciiArg( @@ -705,6 +721,8 @@ class OptimizedLike final : public exec::VectorFunction { .first; case PatternKind::kSubstring: return matchSubstringPattern(input, patternMetadata.fixedPattern()); + case PatternKind::kSubstrings: + return matchSubstringsPattern(input, patternMetadata.substrings()); } } else { switch (P) { @@ -739,6 +757,8 @@ class OptimizedLike final : public exec::VectorFunction { input, patternMetadata, input.size() - 1); case PatternKind::kSubstring: return matchSubstringPattern(input, patternMetadata.fixedPattern()); + case PatternKind::kSubstrings: + return matchSubstringsPattern(input, patternMetadata.substrings()); } } } @@ -1688,19 +1708,19 @@ std::vector> re2ExtractSignatures() { } PatternMetadata PatternMetadata::generic() { - return {PatternKind::kGeneric, 0, "", {}}; + return {PatternKind::kGeneric, 0, "", {}, {}}; } PatternMetadata PatternMetadata::atLeastN(size_t length) { - return {PatternKind::kAtLeastN, length, "", {}}; + return {PatternKind::kAtLeastN, length, "", {}, {}}; } PatternMetadata 
PatternMetadata::exactlyN(size_t length) { - return {PatternKind::kExactlyN, length, "", {}}; + return {PatternKind::kExactlyN, length, "", {}, {}}; } PatternMetadata PatternMetadata::fixed(const std::string& fixedPattern) { - return {PatternKind::kFixed, fixedPattern.length(), fixedPattern, {}}; + return {PatternKind::kFixed, fixedPattern.length(), fixedPattern, {}, {}}; } PatternMetadata PatternMetadata::relaxedFixed( @@ -1711,11 +1731,12 @@ PatternMetadata PatternMetadata::relaxedFixed( PatternKind::kRelaxedFixed, fixedLength, std::move(fixedPattern), - std::move(subPatterns)}; + std::move(subPatterns), + {}}; } PatternMetadata PatternMetadata::prefix(const std::string& fixedPattern) { - return {PatternKind::kPrefix, fixedPattern.length(), fixedPattern, {}}; + return {PatternKind::kPrefix, fixedPattern.length(), fixedPattern, {}, {}}; } PatternMetadata PatternMetadata::relaxedPrefix( @@ -1726,11 +1747,12 @@ PatternMetadata PatternMetadata::relaxedPrefix( PatternKind::kRelaxedPrefix, fixedLength, std::move(fixedPattern), - std::move(subPatterns)}; + std::move(subPatterns), + {}}; } PatternMetadata PatternMetadata::suffix(const std::string& fixedPattern) { - return {PatternKind::kSuffix, fixedPattern.length(), fixedPattern, {}}; + return {PatternKind::kSuffix, fixedPattern.length(), fixedPattern, {}, {}}; } PatternMetadata PatternMetadata::relaxedSuffix( @@ -1741,22 +1763,47 @@ PatternMetadata PatternMetadata::relaxedSuffix( PatternKind::kRelaxedSuffix, fixedLength, std::move(fixedPattern), - std::move(subPatterns)}; + std::move(subPatterns), + {}}; } PatternMetadata PatternMetadata::substring(const std::string& fixedPattern) { - return {PatternKind::kSubstring, fixedPattern.length(), fixedPattern, {}}; + return {PatternKind::kSubstring, fixedPattern.length(), fixedPattern, {}, {}}; +} + +PatternMetadata PatternMetadata::substrings( + std::vector substrings) { + return {PatternKind::kSubstrings, 0, "", {}, std::move(substrings)}; +} + +std::vector 
PatternMetadata::parseSubstrings( + const std::string_view& pattern) { + // For best performance, substrings-search does not support '_'. + static const re2::RE2 fullPattern(R"((%+[^%_#\\]+)+%+)"); + static const re2::RE2 subPattern(R"((?:%+)([^%_#\\]+))"); + re2::StringPiece full(pattern); + re2::StringPiece cur; + std::vector substrings; + if (RE2::FullMatch(full, fullPattern)) { + while (RE2::PartialMatch(full, subPattern, &cur)) { + substrings.push_back(cur.as_string()); + full.set(cur.end(), full.end() - cur.end()); + } + } + return substrings; } PatternMetadata::PatternMetadata( PatternKind patternKind, size_t length, std::string fixedPattern, - std::vector subPatterns) + std::vector subPatterns, + std::vector substrings) : patternKind_{patternKind}, length_{length}, fixedPattern_(std::move(fixedPattern)), - subPatterns_(std::move(subPatterns)) {} + subPatterns_(std::move(subPatterns)), + substrings_(std::move(substrings)) {} // Iterates through a pattern string. Transparently handles escape sequences. class PatternStringIterator { @@ -2154,6 +2201,14 @@ std::shared_ptr makeLike( PatternMetadata patternMetadata = PatternMetadata::generic(); try { + // Fast path for substrings search. + auto substrings = + PatternMetadata::parseSubstrings(std::string_view(pattern)); + if (substrings.size() > 0) { + patternMetadata = PatternMetadata::substrings(std::move(substrings)); + return std::make_shared>( + patternMetadata); + } patternMetadata = determinePatternKind(std::string_view(pattern), escapeChar); } catch (...) 
{ diff --git a/velox/functions/lib/Re2Functions.h b/velox/functions/lib/Re2Functions.h index 5e7267c7ea83..1ff9484ee301 100644 --- a/velox/functions/lib/Re2Functions.h +++ b/velox/functions/lib/Re2Functions.h @@ -19,7 +19,6 @@ #include #include - #include "velox/expression/VectorFunction.h" #include "velox/functions/Udf.h" #include "velox/vector/BaseVector.h" @@ -50,6 +49,10 @@ enum class PatternKind { kRelaxedSuffix, /// Patterns matching '%{c0}%', such as '%foo%%', '%%%hello%'. kSubstring, + /// Patterns matching '%{c0}%{c1}%', such as '%%foo%%bar%%', '%foo%bar%'. + /// Note: Unlike kSubstring, kSubstrings applies only to constant patterns + /// as pattern parsing is expensive. + kSubstrings, /// Patterns which do not fit any of the above types, such as 'hello_world', /// '_presto%'. kGeneric, @@ -101,6 +104,11 @@ class PatternMetadata { static PatternMetadata substring(const std::string& fixedPattern); + static PatternMetadata substrings(std::vector substrings); + + static std::vector parseSubstrings( + const std::string_view& pattern); + PatternKind patternKind() const { return patternKind_; } @@ -117,12 +125,17 @@ class PatternMetadata { return fixedPattern_; } + const std::vector& substrings() const { + return substrings_; + } + private: PatternMetadata( PatternKind patternKind, size_t length, std::string fixedPattern, - std::vector subPatterns); + std::vector subPatterns, + std::vector substrings); PatternKind patternKind_; @@ -140,6 +153,8 @@ class PatternMetadata { /// used for kRelaxedXxx patterns. e.g. If the pattern is: _pr_sto%, we will /// have four sub-patterns here: _, pr, _ and sto. 
std::vector subPatterns_; + + std::vector substrings_; }; inline const int kMaxCompiledRegexes = 20; diff --git a/velox/functions/lib/tests/Re2FunctionsTest.cpp b/velox/functions/lib/tests/Re2FunctionsTest.cpp index 94d02b27b715..2ff23abae75e 100644 --- a/velox/functions/lib/tests/Re2FunctionsTest.cpp +++ b/velox/functions/lib/tests/Re2FunctionsTest.cpp @@ -460,6 +460,9 @@ TEST_F(Re2FunctionsTest, likePattern) { false); testLike("abc", "MEDIUM POLISHED%", false); + + testLike("aabbccddeeff", "%aa%bb%", true); + testLike("aaccddeeff", "%aa%bb%", false); } TEST_F(Re2FunctionsTest, likeDeterminePatternKind) { @@ -1500,5 +1503,27 @@ TEST_F(Re2FunctionsTest, split) { assertEqualVectors(expected, result); } +TEST_F(Re2FunctionsTest, parseSubstrings) { + auto test = [&](const std::string& input, + const std::vector& expected) { + ASSERT_EQ(PatternMetadata::parseSubstrings(input), expected); + }; + // Cases that are not supported by substrings-search. + // Note: we always return LikeGeneric for escape-case, see makeLike(). + test("%%", {}); + // Does not support prefix. + test("aa%bb%%", {}); + // Does not support suffix. + test("%aa%bb", {}); + // Does not support '_'. + test("%aa_%", {}); + // Does not support '#'. + test("%aa#%", {}); + + // Cases that are supported by substrings-search. + test("%aa%", {"aa"}); + test("%aa%bb%%", {"aa", "bb"}); + test("%aa%bb%%%cc%", {"aa", "bb", "cc"}); +} } // namespace } // namespace facebook::velox::functions