diff --git a/README.md b/README.md index ea6af5f..162d8cc 100644 --- a/README.md +++ b/README.md @@ -6,8 +6,8 @@ backtracking regular expression engines like those used in PCRE, Perl, and Python". **Current version:** 2.4.3 -**Supported Ruby versions:** 2.6, 2.7, 3.0, 3.1, 3.2 **Bundled RE2 version:** libre2.11 (2023-11-01) +**Supported Ruby versions:** 2.6, 2.7, 3.0, 3.1, 3.2 **Supported RE2 versions:** libre2.0 (< 2020-03-02), libre2.1 (2020-03-02), libre2.6 (2020-03-03), libre2.7 (2020-05-01), libre2.8 (2020-07-06), libre2.9 (2020-11-01), libre2.10 (2022-12-01), libre2.11 (2023-07-01) Installation @@ -68,9 +68,10 @@ Documentation Full documentation automatically generated from the latest version is available at <http://mudge.name/re2/>. -Note that RE2's regular expression syntax differs from PCRE and Ruby's -built-in [`Regexp`][Regexp] library, see the [official syntax page][] for more -details. +> [!IMPORTANT] +> Note that RE2's regular expression syntax differs from PCRE and Ruby's +> built-in [`Regexp`][Regexp] library, see the [official syntax page][] for +> more details. 
Usage ----- @@ -80,27 +81,19 @@ library (with [`Regexp`](http://mudge.name/re2/RE2/Regexp.html) and [`MatchData`](http://mudge.name/re2/RE2/MatchData.html)), its API is slightly different: -```console -$ irb -rubygems -> require 're2' -> r = RE2::Regexp.new('w(\d)(\d+)') -=> #<RE2::Regexp /w(\d)(\d+)/> -> m = r.match("w1234") -=> #<RE2::MatchData "w1234" "1" "234"> -> m[1] -=> "1" -> m.string -=> "w1234" -> m.begin(1) -=> 1 -> m.end(1) -=> 2 -> r =~ "w1234" -=> true -> r !~ "bob" -=> true -> r.match("bob") -=> nil +```ruby +require "re2" + +r = RE2::Regexp.new('w(\d)(\d+)') # => #<RE2::Regexp /w(\d)(\d+)/> +m = r.match("w1234") # => #<RE2::MatchData "w1234" "1" "234"> +m[1] # => "1" + +# Improve performance by requesting fewer submatches +m = r.match("w1234", 1) # => #<RE2::MatchData "w1234" "1"> + +# Or no submatches at all +r.match("w1234", 0) # => true +r =~ "w1234" # => true ``` As @@ -109,30 +102,25 @@ As defined against `Kernel` so you can use a shorter version to create regular expressions: -```console -> RE2('(\d+)') -=> #<RE2::Regexp /(\d+)/> +```ruby +RE2('(\d+)') # => #<RE2::Regexp /(\d+)/> ``` Note the use of *single quotes* as double quotes will interpret `\d` as `d` as in the following example: -```console -> RE2("(\d+)") -=> #<RE2::Regexp /(d+)/> +```ruby +RE2("(\d+)") # => #<RE2::Regexp /(d+)/> ``` As of 0.3.0, you can use named groups: -```console -> r = RE2::Regexp.new('(?P<name>\w+) (?P<age>\d+)') -=> #<RE2::Regexp /(?P<name>\w+) (?P<age>\d+)/> -> m = r.match("Bob 40") -=> #<RE2::MatchData "Bob 40" "Bob" "40"> -> m[:name] -=> "Bob" -> m["age"] -=> "40" +```ruby +r = RE2::Regexp.new('(?P<name>\w+) (?P<age>\d+)') +# => #<RE2::Regexp /(?P<name>\w+) (?P<age>\d+)/> +m = r.match("Bob 40") # => #<RE2::MatchData "Bob 40" "Bob" "40"> +m[:name] # => "Bob" +m["age"] # => "40" ``` As of 0.6.0, you can use `RE2::Regexp#scan` to incrementally scan text for @@ -197,9 +185,10 @@ end Encoding -------- -Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be -returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the -`RE2::Regexp` is set to false (any other encoding's behaviour is undefined). 
+> [!IMPORTANT] +> Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be +> returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the +> `RE2::Regexp` is set to false (any other encoding's behaviour is undefined). For backward compatibility: re2 won't automatically convert string inputs to the right encoding so this is the responsibility of the caller, e.g. @@ -220,8 +209,8 @@ Features `RE2::Regexp.compile(re)` or `RE2(re)` (including specifying options, e.g. `RE2::Regexp.new("pattern", :case_sensitive => false)` -* Extracting matches with `re2.match(text)` (and an exact number of matches - with `re2.match(text, number_of_matches)` such as `re2.match("123-234", 2)`) +* Extracting matches with `re2.match(text)` (and an exact number of submatches + with `re2.match(text, number_of_submatches)` such as `re2.match("123-234", 2)`) * Extracting matches by name (both with strings and symbols) diff --git a/ext/re2/re2.cc b/ext/re2/re2.cc index 1815bf3..c59bb54 100644 --- a/ext/re2/re2.cc +++ b/ext/re2/re2.cc @@ -1309,15 +1309,20 @@ static VALUE re2_regexp_named_capturing_groups(const VALUE self) { } /* - * Match the pattern against the given +text+ and return either - * a boolean (if no submatches are required) or a {RE2::MatchData} - * instance. + * Match the pattern against the given +text+ and return either a boolean (if + * no submatches are required) or a {RE2::MatchData} instance with the + * specified number of submatches (defaults to the total number of capturing + * groups). + * + * The number of submatches has a significant impact on performance: requesting + * one submatch is much faster than requesting more than one and requesting + * zero submatches is faster still. 
* * @return [Boolean, RE2::MatchData] * * @overload match(text) * Returns an {RE2::MatchData} containing the matching pattern and all - * subpatterns resulting from looking for the regexp in +text+ if the pattern + * submatches resulting from looking for the regexp in +text+ if the pattern * contains capturing groups. * * Returns either true or false indicating whether a successful match was @@ -1326,7 +1331,7 @@ static VALUE re2_regexp_named_capturing_groups(const VALUE self) { * @param [String] text the text to search * @return [RE2::MatchData] if the pattern contains capturing groups * @return [Boolean] if the pattern does not contain capturing groups - * @raise [NoMemoryError] if there was not enough memory to allocate the matches + * @raise [NoMemoryError] if there was not enough memory to allocate the submatches * @example Matching with capturing groups * r = RE2::Regexp.new('w(o)(o)') * r.match('woo') #=> #<RE2::MatchData "woo" "o" "o"> @@ -1340,20 +1345,20 @@ static VALUE re2_regexp_named_capturing_groups(const VALUE self) { * * @param [String] text the text to search * @return [Boolean] whether the match was successful - * @raise [NoMemoryError] if there was not enough memory to allocate the matches + * @raise [NoMemoryError] if there was not enough memory to allocate the submatches * @example * r = RE2::Regexp.new('w(o)(o)') * r.match('woo', 0) #=> true * r.match('bob', 0) #=> false * - * @overload match(text, number_of_matches) + * @overload match(text, number_of_submatches) * See +match(text)+ but with a specific number of - * matches returned (padded with nils if necessary). + * submatches returned (padded with nils if necessary). 
* * @param [String] text the text to search - * @param [Integer] number_of_matches the number of matches to return - * @return [RE2::MatchData] the matches - * @raise [ArgumentError] if given a negative number of matches + * @param [Integer] number_of_submatches the number of submatches to return + * @return [RE2::MatchData] the submatches + * @raise [ArgumentError] if given a negative number of submatches - * @raise [NoMemoryError] if there was not enough memory to allocate the matches + * @raise [NoMemoryError] if there was not enough memory to allocate the submatches * @example * r = RE2::Regexp.new('w(o)(o)') @@ -1363,9 +1368,9 @@ static VALUE re2_regexp_named_capturing_groups(const VALUE self) { static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) { re2_pattern *p; re2_matchdata *m; - VALUE text, number_of_matches; + VALUE text, number_of_submatches; - rb_scan_args(argc, argv, "11", &text, &number_of_matches); + rb_scan_args(argc, argv, "11", &text, &number_of_submatches); /* Ensure text is a string. */ StringValue(text); @@ -1374,8 +1379,8 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) { int n; - if (RTEST(number_of_matches)) { - n = NUM2INT(number_of_matches); + if (RTEST(number_of_submatches)) { + n = NUM2INT(number_of_submatches); if (n < 0) { rb_raise(rb_eArgError, "number of matches should be >= 0");