[Groonga-commit] groonga/groonga at 54f4aca [master] Accept (?-mix:XXX) for index searchable regular expression

Back to archive index
Kouhei Sutou null+****@clear*****
Wed Oct 17 01:36:07 JST 2018


Kouhei Sutou	2018-10-17 01:36:07 +0900 (Wed, 17 Oct 2018)

  Revision: 54f4aca44570836f3d2dc6217e695fbd55009e40
  https://github.com/groonga/groonga/commit/54f4aca44570836f3d2dc6217e695fbd55009e40

  Message:
    Accept (?-mix:XXX) for index searchable regular expression
    
    [groonga-dev,04683]
    
    Reported by Masatoshi SEKI. Thanks!!!

  Added files:
    test/command/suite/select/filter/index/regexp/options/off_all/begin_in.expected
    test/command/suite/select/filter/index/regexp/options/off_all/begin_in.test
    test/command/suite/select/filter/index/regexp/options/off_all/begin_out.expected
    test/command/suite/select/filter/index/regexp/options/off_all/begin_out.test
    test/command/suite/select/filter/index/regexp/options/off_all/end_in.expected
    test/command/suite/select/filter/index/regexp/options/off_all/end_in.test
    test/command/suite/select/filter/index/regexp/options/off_all/end_out.expected
    test/command/suite/select/filter/index/regexp/options/off_all/end_out.test
  Modified files:
    lib/expr.c
    lib/ii.c
    lib/mrb/scripts/scan_info_data.rb
    test/command/suite/select/function/highlight_html/regexp.expected
    test/command/suite/select/function/highlight_html/regexp.test

  Modified: lib/expr.c (+39 -9)
===================================================================
--- lib/expr.c    2018-10-15 14:19:02 +0900 (f97d31224)
+++ lib/expr.c    2018-10-17 01:36:07 +0900 (a65c12d14)
@@ -2324,9 +2324,12 @@ scan_info_build_match_expr(grn_ctx *ctx,
 static grn_bool
 is_index_searchable_regexp(grn_ctx *ctx, grn_obj *regexp)
 {
+  const char *all_off_options = "?-mix:";
+  size_t all_off_options_length = strlen(all_off_options);
   const char *regexp_raw;
   const char *regexp_raw_end;
   grn_bool escaping = GRN_FALSE;
+  grn_bool in_paren = GRN_FALSE;
   grn_bool dot = GRN_FALSE;
 
   if (!(regexp->header.domain == GRN_DB_SHORT_TEXT ||
@@ -2380,15 +2383,35 @@ is_index_searchable_regexp(grn_ctx *ctx, grn_obj *regexp)
         }
       } else {
         switch (regexp_raw[0]) {
+        case '(' :
+          if (in_paren) {
+            return GRN_FALSE;
+          } else {
+            const char *options = regexp_raw + 1;
+            if (regexp_raw_end - options >= all_off_options_length &&
+                memcmp(options, all_off_options, all_off_options_length) == 0) {
+              in_paren = GRN_TRUE;
+              regexp_raw += all_off_options_length;
+              continue;
+            } else {
+              return GRN_FALSE;
+            }
+          }
+          break;
+        case ')' :
+          if (in_paren) {
+            in_paren = GRN_FALSE;
+          } else {
+            return GRN_FALSE;
+          }
+          break;
         case '.' :
-          escaping = GRN_FALSE;
           if (dot) {
             return GRN_FALSE;
           }
           dot = GRN_TRUE;
           break;
         case '*' :
-          escaping = GRN_FALSE;
           if (!dot) {
             return GRN_FALSE;
           }
@@ -2406,9 +2429,6 @@ is_index_searchable_regexp(grn_ctx *ctx, grn_obj *regexp)
         case '}' :
         case '^' :
         case '$' :
-        case '(' :
-        case ')' :
-          escaping = GRN_FALSE;
           return GRN_FALSE;
         case '\\' :
           if (dot) {
@@ -2420,7 +2440,6 @@ is_index_searchable_regexp(grn_ctx *ctx, grn_obj *regexp)
           if (dot) {
             return GRN_FALSE;
           }
-          escaping = GRN_FALSE;
           break;
         }
       }
@@ -2431,6 +2450,13 @@ is_index_searchable_regexp(grn_ctx *ctx, grn_obj *regexp)
     regexp_raw += char_len;
   }
 
+  if (dot) {
+    return GRN_FALSE;
+  }
+  if (in_paren) {
+    return GRN_FALSE;
+  }
+
   return GRN_TRUE;
 }
 
@@ -7086,6 +7112,8 @@ grn_expr_syntax_expand_query_by_table(grn_ctx *ctx,
 static void
 grn_expr_get_keywords_regexp(grn_ctx *ctx, grn_obj *keywords, grn_obj *regexp)
 {
+  const char *all_off_options = "?-mix:";
+  size_t all_off_options_length = strlen(all_off_options);
   const char *regexp_raw;
   const char *regexp_raw_end;
   grn_bool escaping = GRN_FALSE;
@@ -7122,11 +7150,14 @@ grn_expr_get_keywords_regexp(grn_ctx *ctx, grn_obj *keywords, grn_obj *regexp)
         }
       } else {
         switch (regexp_raw[0]) {
+        case '(' :
+          regexp_raw += all_off_options_length;
+          break;
+        case ')' :
+          break;
         case '.' :
-          escaping = GRN_FALSE;
           break;
         case '*' :
-          escaping = GRN_FALSE;
           if (GRN_TEXT_LEN(&keyword) > 0) {
             grn_vector_add_element(ctx,
                                    keywords,
@@ -7141,7 +7172,6 @@ grn_expr_get_keywords_regexp(grn_ctx *ctx, grn_obj *keywords, grn_obj *regexp)
           escaping = GRN_TRUE;
           break;
         default :
-          escaping = GRN_FALSE;
           GRN_TEXT_PUTC(ctx, &keyword, regexp_raw[0]);
           break;
         }

  Modified: lib/ii.c (+17 -1)
===================================================================
--- lib/ii.c    2018-10-15 14:19:02 +0900 (7d819d8e1)
+++ lib/ii.c    2018-10-17 01:36:07 +0900 (ea89f9e82)
@@ -8448,6 +8448,9 @@ grn_ii_parse_regexp_query(grn_ctx *ctx,
                           grn_obj *parsed_strings)
 {
   grn_bool escaping = GRN_FALSE;
+  grn_bool in_paren = GRN_FALSE;
+  const char *all_off_options = "?-mix:";
+  size_t all_off_options_len = strlen(all_off_options);
   int nth_char = 0;
   const char *current = string;
   const char *string_end = string + string_len;
@@ -8482,7 +8485,11 @@ grn_ii_parse_regexp_query(grn_ctx *ctx,
           }
           break;
         case 'z' :
-          if (current == string_end) {
+          if (current == string_end ||
+              (in_paren &&
+               grn_charlen(ctx, current, string_end) == 1 &&
+               *current == ')' &&
+               (current + 1) == string_end)) {
             target = GRN_TOKENIZER_END_MARK_UTF8;
             char_len = GRN_TOKENIZER_END_MARK_UTF8_LEN;
           }
@@ -8496,6 +8503,15 @@ grn_ii_parse_regexp_query(grn_ctx *ctx,
         if (*target == '\\') {
           escaping = GRN_TRUE;
           continue;
+        } else if (*target == '(' &&
+                   (string_end - current) >= all_off_options_len &&
+                   memcmp(current, all_off_options, all_off_options_len) == 0) {
+          current += all_off_options_len;
+          in_paren = GRN_TRUE;
+          continue;
+        } else if (*target == ')') {
+          in_paren = GRN_FALSE;
+          continue;
         } else if (*target == '.' &&
                    grn_charlen(ctx, current, string_end) == 1 &&
                    *current == '*') {

  Modified: lib/mrb/scripts/scan_info_data.rb (+42 -3)
===================================================================
--- lib/mrb/scripts/scan_info_data.rb    2018-10-15 14:19:02 +0900 (904fe748b)
+++ lib/mrb/scripts/scan_info_data.rb    2018-10-17 01:36:07 +0900 (70dee2e6d)
@@ -148,6 +148,8 @@ module Groonga
     def index_searchable_regexp?(pattern)
       return false if pattern.nil?
 
+      paren = :outside
+      dot = false
       previous_char = nil
       pattern.value.each_char do |char|
         if previous_char == "\\"
@@ -167,13 +169,50 @@ module Groonga
             next
           end
         else
-          case char
-          when ".", "[", "]", "|", "?", "+", "*", "{", "}", "^", "$", "(", ")"
-            return false
+          case paren
+          when :starting
+            case char
+            when "?"
+              return false if previous_char != "("
+            when "-"
+              return false if previous_char != "?"
+            when "m"
+              return false if previous_char != "-"
+            when "i"
+              return false if previous_char != "m"
+            when "x"
+              return false if previous_char != "i"
+            when ":"
+              return false if previous_char != "x"
+              paren = :inside
+            else
+              return false
+            end
+          else
+            case char
+            when "("
+              return false unless paren == :outside
+              paren = :starting
+            when ")"
+              return false unless paren == :inside
+              paren = :outside
+            when "."
+              return false if dot
+              dot = true
+            when "*"
+              return false unless dot
+              dot = false
+            when "[", "]", "|", "?", "+", "{", "}", "^", "$"
+              return false
+            else
+              return false if dot
+            end
           end
         end
         previous_char = char
       end
+      return false if dot
+      return false unless paren == :outside
       true
     end
 

  Added: test/command/suite/select/filter/index/regexp/options/off_all/begin_in.expected (+26 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/filter/index/regexp/options/off_all/begin_in.expected    2018-10-17 01:36:07 +0900 (10967255d)
@@ -0,0 +1,26 @@
+table_create Memos TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Memos content COLUMN_SCALAR Text
+[[0,0.0,0.0],true]
+table_create RegexpTokens TABLE_PAT_KEY ShortText   --default_tokenizer TokenRegexp
+[[0,0.0,0.0],true]
+column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION   Memos content
+[[0,0.0,0.0],true]
+load --table Memos
+[
+{"content": "Groonga"},
+{"content": "Rroonga"},
+{"content": "Ruby and Rroonga"}
+]
+[[0,0.0,0.0],3]
+log_level --level info
+[[0,0.0,0.0],true]
+select Memos --filter 'content @~ "(?-mix:\\\\ARro)"'
+[[0,0.0,0.0],[[[1],[["_id","UInt32"],["content","Text"]],[2,"Rroonga"]]]]
+#|i| [object][search][index][key][regexp] <RegexpTokens.memos_content>
+#|i| grn_ii_sel > ((?-mix:\ARro))
+#|i| n=3 (￯Rro)
+#|i| exact: 1
+#|i| hits=1
+log_level --level notice
+[[0,0.0,0.0],true]

  Added: test/command/suite/select/filter/index/regexp/options/off_all/begin_in.test (+20 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/filter/index/regexp/options/off_all/begin_in.test    2018-10-17 01:36:07 +0900 (bc37d0499)
@@ -0,0 +1,20 @@
+table_create Memos TABLE_NO_KEY
+column_create Memos content COLUMN_SCALAR Text
+
+table_create RegexpTokens TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenRegexp
+column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION \
+  Memos content
+
+load --table Memos
+[
+{"content": "Groonga"},
+{"content": "Rroonga"},
+{"content": "Ruby and Rroonga"}
+]
+
+log_level --level info
+#@add-important-log-levels info
+select Memos --filter 'content @~ "(?-mix:\\\\ARro)"'
+#@remove-important-log-levels info
+log_level --level notice

  Added: test/command/suite/select/filter/index/regexp/options/off_all/begin_out.expected (+26 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/filter/index/regexp/options/off_all/begin_out.expected    2018-10-17 01:36:07 +0900 (883ee9f77)
@@ -0,0 +1,26 @@
+table_create Memos TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Memos content COLUMN_SCALAR Text
+[[0,0.0,0.0],true]
+table_create RegexpTokens TABLE_PAT_KEY ShortText   --default_tokenizer TokenRegexp
+[[0,0.0,0.0],true]
+column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION   Memos content
+[[0,0.0,0.0],true]
+load --table Memos
+[
+{"content": "Groonga"},
+{"content": "Rroonga"},
+{"content": "Ruby and Rroonga"}
+]
+[[0,0.0,0.0],3]
+log_level --level info
+[[0,0.0,0.0],true]
+select Memos --filter 'content @~ "\\\\A(?-mix:Rro)"'
+[[0,0.0,0.0],[[[1],[["_id","UInt32"],["content","Text"]],[2,"Rroonga"]]]]
+#|i| [object][search][index][key][regexp] <RegexpTokens.memos_content>
+#|i| grn_ii_sel > (\A(?-mix:Rro))
+#|i| n=3 (￯Rro)
+#|i| exact: 1
+#|i| hits=1
+log_level --level notice
+[[0,0.0,0.0],true]

  Added: test/command/suite/select/filter/index/regexp/options/off_all/begin_out.test (+20 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/filter/index/regexp/options/off_all/begin_out.test    2018-10-17 01:36:07 +0900 (a2d37debe)
@@ -0,0 +1,20 @@
+table_create Memos TABLE_NO_KEY
+column_create Memos content COLUMN_SCALAR Text
+
+table_create RegexpTokens TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenRegexp
+column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION \
+  Memos content
+
+load --table Memos
+[
+{"content": "Groonga"},
+{"content": "Rroonga"},
+{"content": "Ruby and Rroonga"}
+]
+
+log_level --level info
+#@add-important-log-levels info
+select Memos --filter 'content @~ "\\\\A(?-mix:Rro)"'
+#@remove-important-log-levels info
+log_level --level notice

  Added: test/command/suite/select/filter/index/regexp/options/off_all/end_in.expected (+57 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/filter/index/regexp/options/off_all/end_in.expected    2018-10-17 01:36:07 +0900 (2104877a8)
@@ -0,0 +1,57 @@
+table_create Memos TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Memos content COLUMN_SCALAR Text
+[[0,0.0,0.0],true]
+table_create RegexpTokens TABLE_PAT_KEY ShortText   --default_tokenizer TokenRegexp
+[[0,0.0,0.0],true]
+column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION   Memos content
+[[0,0.0,0.0],true]
+load --table Memos
+[
+{"content": "Groonga"},
+{"content": "Rroonga"},
+{"content": "Rroonga and Ruby"}
+]
+[[0,0.0,0.0],3]
+log_level --level info
+[[0,0.0,0.0],true]
+select Memos --filter 'content @~ "(?-mix:onga\\\\z)"'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        2
+      ],
+      [
+        [
+          "_id",
+          "UInt32"
+        ],
+        [
+          "content",
+          "Text"
+        ]
+      ],
+      [
+        1,
+        "Groonga"
+      ],
+      [
+        2,
+        "Rroonga"
+      ]
+    ]
+  ]
+]
+#|i| [object][search][index][key][regexp] <RegexpTokens.memos_content>
+#|i| grn_ii_sel > ((?-mix:onga\z))
+#|i| n=3 (onga￰)
+#|i| exact: 2
+#|i| hits=2
+log_level --level notice
+[[0,0.0,0.0],true]

  Added: test/command/suite/select/filter/index/regexp/options/off_all/end_in.test (+20 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/filter/index/regexp/options/off_all/end_in.test    2018-10-17 01:36:07 +0900 (03cd01bf2)
@@ -0,0 +1,20 @@
+table_create Memos TABLE_NO_KEY
+column_create Memos content COLUMN_SCALAR Text
+
+table_create RegexpTokens TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenRegexp
+column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION \
+  Memos content
+
+load --table Memos
+[
+{"content": "Groonga"},
+{"content": "Rroonga"},
+{"content": "Rroonga and Ruby"}
+]
+
+log_level --level info
+#@add-important-log-levels info
+select Memos --filter 'content @~ "(?-mix:onga\\\\z)"'
+#@remove-important-log-levels info
+log_level --level notice

  Added: test/command/suite/select/filter/index/regexp/options/off_all/end_out.expected (+57 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/filter/index/regexp/options/off_all/end_out.expected    2018-10-17 01:36:07 +0900 (b261eda5e)
@@ -0,0 +1,57 @@
+table_create Memos TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Memos content COLUMN_SCALAR Text
+[[0,0.0,0.0],true]
+table_create RegexpTokens TABLE_PAT_KEY ShortText   --default_tokenizer TokenRegexp
+[[0,0.0,0.0],true]
+column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION   Memos content
+[[0,0.0,0.0],true]
+load --table Memos
+[
+{"content": "Groonga"},
+{"content": "Rroonga"},
+{"content": "Rroonga and Ruby"}
+]
+[[0,0.0,0.0],3]
+log_level --level info
+[[0,0.0,0.0],true]
+select Memos --filter 'content @~ "(?-mix:onga)\\\\z"'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        2
+      ],
+      [
+        [
+          "_id",
+          "UInt32"
+        ],
+        [
+          "content",
+          "Text"
+        ]
+      ],
+      [
+        1,
+        "Groonga"
+      ],
+      [
+        2,
+        "Rroonga"
+      ]
+    ]
+  ]
+]
+#|i| [object][search][index][key][regexp] <RegexpTokens.memos_content>
+#|i| grn_ii_sel > ((?-mix:onga)\z)
+#|i| n=3 (onga￰)
+#|i| exact: 2
+#|i| hits=2
+log_level --level notice
+[[0,0.0,0.0],true]

  Added: test/command/suite/select/filter/index/regexp/options/off_all/end_out.test (+20 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/filter/index/regexp/options/off_all/end_out.test    2018-10-17 01:36:07 +0900 (5702f286d)
@@ -0,0 +1,20 @@
+table_create Memos TABLE_NO_KEY
+column_create Memos content COLUMN_SCALAR Text
+
+table_create RegexpTokens TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenRegexp
+column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION \
+  Memos content
+
+load --table Memos
+[
+{"content": "Groonga"},
+{"content": "Rroonga"},
+{"content": "Rroonga and Ruby"}
+]
+
+log_level --level info
+#@add-important-log-levels info
+select Memos --filter 'content @~ "(?-mix:onga)\\\\z"'
+#@remove-important-log-levels info
+log_level --level notice

  Modified: test/command/suite/select/function/highlight_html/regexp.expected (+1 -1)
===================================================================
--- test/command/suite/select/function/highlight_html/regexp.expected    2018-10-15 14:19:02 +0900 (265cb9d70)
+++ test/command/suite/select/function/highlight_html/regexp.expected    2018-10-17 01:36:07 +0900 (61c9384f1)
@@ -11,7 +11,7 @@ load --table Entries
 {"body": "Mroonga is a MySQL storage engine based on Groonga. <b>Rroonga</b> is a Ruby binding of Groonga."}
 ]
 [[0,0.0,0.0],1]
-select Entries   --filter "body @~ 'ro.*ga'"   --output_columns "highlight_html(body)"
+select Entries   --filter "body @~ '(?-mix:ro.*ga)'"   --output_columns "highlight_html(body)"
 [
   [
     0,

  Modified: test/command/suite/select/function/highlight_html/regexp.test (+1 -1)
===================================================================
--- test/command/suite/select/function/highlight_html/regexp.test    2018-10-15 14:19:02 +0900 (77eb47652)
+++ test/command/suite/select/function/highlight_html/regexp.test    2018-10-17 01:36:07 +0900 (2c12a7ddd)
@@ -10,5 +10,5 @@ load --table Entries
 ]
 
 select Entries \
-  --filter "body @~ 'ro.*ga'" \
+  --filter "body @~ '(?-mix:ro.*ga)'" \
   --output_columns "highlight_html(body)"
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20181017/6148d1f9/attachment-0001.html>


More information about the Groonga-commit mailing list
Back to archive index