Skip to content

Commit

Permalink
opt like
Browse files Browse the repository at this point in the history
fix like

add arch indepent impl

opt code

remove simd_strstr to a seperate PR

fix format
  • Loading branch information
skadilover committed Sep 13, 2024
1 parent d1ac079 commit 2d91717
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 14 deletions.
6 changes: 6 additions & 0 deletions velox/docs/functions/presto/regexp.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,12 @@ limited to 20 different expressions per instance and thread of execution.
SELECT like('abc', '%b%'); -- true
SELECT like('a_c', '%#_%', '#'); -- true

String sequence search: There are some patterns that are equivalent to simple
string searches. For such constant patterns without custom-escape, velox uses
substring searches instead of regex searches, for example:
like("hello velox", "%hello%velox%")
is equivalent to searching for the strings "hello", "velox" in sequence.

.. function:: regexp_extract(string, pattern) -> varchar

Returns the first substring matched by the regular expression ``pattern``
Expand Down
79 changes: 67 additions & 12 deletions velox/functions/lib/Re2Functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -650,6 +650,22 @@ bool matchSubstringPattern(
std::string::npos);
}

bool matchSubstringsPattern(
const StringView& input,
const std::vector<std::string>& patterns) {
const char* data = input.data();
for (int i = 0; i < patterns.size(); i++) {
auto curPos =
std::string_view(data, input.end() - data)
.find(std::string_view(patterns[i].c_str(), patterns[i].size()));
if (curPos == std::string::npos) {
return false;
}
data = data + curPos + patterns[i].size();
}
return true;
}

// Return true if the input VARCHAR argument is all-ASCII for the specified
// rows.
FOLLY_ALWAYS_INLINE static bool isAsciiArg(
Expand Down Expand Up @@ -705,6 +721,8 @@ class OptimizedLike final : public exec::VectorFunction {
.first;
case PatternKind::kSubstring:
return matchSubstringPattern(input, patternMetadata.fixedPattern());
case PatternKind::kSubstrings:
return matchSubstringsPattern(input, patternMetadata.substrings());
}
} else {
switch (P) {
Expand Down Expand Up @@ -739,6 +757,8 @@ class OptimizedLike final : public exec::VectorFunction {
input, patternMetadata, input.size() - 1);
case PatternKind::kSubstring:
return matchSubstringPattern(input, patternMetadata.fixedPattern());
case PatternKind::kSubstrings:
return matchSubstringsPattern(input, patternMetadata.substrings());
}
}
}
Expand Down Expand Up @@ -1688,19 +1708,19 @@ std::vector<std::shared_ptr<exec::FunctionSignature>> re2ExtractSignatures() {
}

PatternMetadata PatternMetadata::generic() {
return {PatternKind::kGeneric, 0, "", {}};
return {PatternKind::kGeneric, 0, "", {}, {}};
}

PatternMetadata PatternMetadata::atLeastN(size_t length) {
return {PatternKind::kAtLeastN, length, "", {}};
return {PatternKind::kAtLeastN, length, "", {}, {}};
}

PatternMetadata PatternMetadata::exactlyN(size_t length) {
return {PatternKind::kExactlyN, length, "", {}};
return {PatternKind::kExactlyN, length, "", {}, {}};
}

PatternMetadata PatternMetadata::fixed(const std::string& fixedPattern) {
return {PatternKind::kFixed, fixedPattern.length(), fixedPattern, {}};
return {PatternKind::kFixed, fixedPattern.length(), fixedPattern, {}, {}};
}

PatternMetadata PatternMetadata::relaxedFixed(
Expand All @@ -1711,11 +1731,12 @@ PatternMetadata PatternMetadata::relaxedFixed(
PatternKind::kRelaxedFixed,
fixedLength,
std::move(fixedPattern),
std::move(subPatterns)};
std::move(subPatterns),
{}};
}

PatternMetadata PatternMetadata::prefix(const std::string& fixedPattern) {
return {PatternKind::kPrefix, fixedPattern.length(), fixedPattern, {}};
return {PatternKind::kPrefix, fixedPattern.length(), fixedPattern, {}, {}};
}

PatternMetadata PatternMetadata::relaxedPrefix(
Expand All @@ -1726,11 +1747,12 @@ PatternMetadata PatternMetadata::relaxedPrefix(
PatternKind::kRelaxedPrefix,
fixedLength,
std::move(fixedPattern),
std::move(subPatterns)};
std::move(subPatterns),
{}};
}

PatternMetadata PatternMetadata::suffix(const std::string& fixedPattern) {
return {PatternKind::kSuffix, fixedPattern.length(), fixedPattern, {}};
return {PatternKind::kSuffix, fixedPattern.length(), fixedPattern, {}, {}};
}

PatternMetadata PatternMetadata::relaxedSuffix(
Expand All @@ -1741,22 +1763,47 @@ PatternMetadata PatternMetadata::relaxedSuffix(
PatternKind::kRelaxedSuffix,
fixedLength,
std::move(fixedPattern),
std::move(subPatterns)};
std::move(subPatterns),
{}};
}

PatternMetadata PatternMetadata::substring(const std::string& fixedPattern) {
return {PatternKind::kSubstring, fixedPattern.length(), fixedPattern, {}};
return {PatternKind::kSubstring, fixedPattern.length(), fixedPattern, {}, {}};
}

PatternMetadata PatternMetadata::substrings(
std::vector<std::string> substrings) {
return {PatternKind::kSubstrings, 0, "", {}, std::move(substrings)};
}

std::vector<std::string> PatternMetadata::parseSubstrings(
const std::string_view& pattern) {
// Not support substrings-search with '_' for best performance.
static const re2::RE2 fullPattern(R"((%+[^%_#\\]+)+%+)");
static const re2::RE2 subPattern(R"((?:%+)([^%_#\\]+))");
re2::StringPiece full(pattern);
re2::StringPiece cur;
std::vector<std::string> substrings;
if (RE2::FullMatch(full, fullPattern)) {
while (RE2::PartialMatch(full, subPattern, &cur)) {
substrings.push_back(cur.as_string());
full.set(cur.end(), full.end() - cur.end());
}
}
return substrings;
}

PatternMetadata::PatternMetadata(
PatternKind patternKind,
size_t length,
std::string fixedPattern,
std::vector<SubPatternMetadata> subPatterns)
std::vector<SubPatternMetadata> subPatterns,
std::vector<std::string> substrings)
: patternKind_{patternKind},
length_{length},
fixedPattern_(std::move(fixedPattern)),
subPatterns_(std::move(subPatterns)) {}
subPatterns_(std::move(subPatterns)),
substrings_(std::move(substrings)) {}

// Iterates through a pattern string. Transparently handles escape sequences.
class PatternStringIterator {
Expand Down Expand Up @@ -2154,6 +2201,14 @@ std::shared_ptr<exec::VectorFunction> makeLike(

PatternMetadata patternMetadata = PatternMetadata::generic();
try {
// Fast path for substrings search.
auto substrings =
PatternMetadata::parseSubstrings(std::string_view(pattern));
if (substrings.size() > 0) {
patternMetadata = PatternMetadata::substrings(std::move(substrings));
return std::make_shared<OptimizedLike<PatternKind::kSubstrings>>(
patternMetadata);
}
patternMetadata =
determinePatternKind(std::string_view(pattern), escapeChar);
} catch (...) {
Expand Down
19 changes: 17 additions & 2 deletions velox/functions/lib/Re2Functions.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
#include <vector>

#include <re2/re2.h>

#include "velox/expression/VectorFunction.h"
#include "velox/functions/Udf.h"
#include "velox/vector/BaseVector.h"
Expand Down Expand Up @@ -50,6 +49,10 @@ enum class PatternKind {
kRelaxedSuffix,
/// Patterns matching '%{c0}%', such as '%foo%%', '%%%hello%'.
kSubstring,
/// Patterns matching '%{c0}%{c1}%', such as '%%foo%%bar%%', '%foo%bar%'.
/// Note: Unlike kSubstring, kSubstrings applies only to constant patterns
/// as pattern parsing is expensive.
kSubstrings,
/// Patterns which do not fit any of the above types, such as 'hello_world',
/// '_presto%'.
kGeneric,
Expand Down Expand Up @@ -101,6 +104,11 @@ class PatternMetadata {

static PatternMetadata substring(const std::string& fixedPattern);

static PatternMetadata substrings(std::vector<std::string> substrings);

static std::vector<std::string> parseSubstrings(
const std::string_view& pattern);

PatternKind patternKind() const {
return patternKind_;
}
Expand All @@ -117,12 +125,17 @@ class PatternMetadata {
return fixedPattern_;
}

const std::vector<std::string>& substrings() const {
return substrings_;
}

private:
PatternMetadata(
PatternKind patternKind,
size_t length,
std::string fixedPattern,
std::vector<SubPatternMetadata> subPatterns);
std::vector<SubPatternMetadata> subPatterns,
std::vector<std::string> substrings);

PatternKind patternKind_;

Expand All @@ -140,6 +153,8 @@ class PatternMetadata {
/// used for kRelaxedXxx patterns. e.g. If the pattern is: _pr_sto%, we will
/// have four sub-patterns here: _, pr, _ and sto.
std::vector<SubPatternMetadata> subPatterns_;

std::vector<std::string> substrings_;
};

inline const int kMaxCompiledRegexes = 20;
Expand Down
25 changes: 25 additions & 0 deletions velox/functions/lib/tests/Re2FunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,9 @@ TEST_F(Re2FunctionsTest, likePattern) {
false);

testLike("abc", "MEDIUM POLISHED%", false);

testLike("aabbccddeeff", "%aa%bb%", true);
testLike("aaccddeeff", "%aa%bb%", false);
}

TEST_F(Re2FunctionsTest, likeDeterminePatternKind) {
Expand Down Expand Up @@ -1500,5 +1503,27 @@ TEST_F(Re2FunctionsTest, split) {
assertEqualVectors(expected, result);
}

TEST_F(Re2FunctionsTest, parseSubstrings) {
auto test = [&](const std::string& input,
const std::vector<std::string>& expected) {
ASSERT_EQ(PatternMetadata::parseSubstrings(input), expected);
};
// Cases that not supported by substrings-search.
// Note: we always return LikeGeneric for escape-case, see makeLike().
test("%%", {});
// Not supports prefix.
test("aa%bb%%", {});
// Not supports sufix.
test("%aa%bb", {});
// Not supports '_'.
test("%aa_%", {});
// Not supports '#'.
test("%aa#%", {});

// Cases that supported by substrings-search.
test("%aa%", {"aa"});
test("%aa%bb%%", {"aa", "bb"});
test("%aa%bb%%%cc%", {"aa", "bb", "cc"});
}
} // namespace
} // namespace facebook::velox::functions

0 comments on commit 2d91717

Please sign in to comment.