Kouhei Sutou 2018-10-17 01:36:07 +0900 (Wed, 17 Oct 2018) Revision: 54f4aca44570836f3d2dc6217e695fbd55009e40 https://github.com/groonga/groonga/commit/54f4aca44570836f3d2dc6217e695fbd55009e40 Message: Accept (?-mix:XXX) for index searchable regular expression [groonga-dev,04683] Reported by Masatoshi SEKI. Thanks!!! Added files: test/command/suite/select/filter/index/regexp/options/off_all/begin_in.expected test/command/suite/select/filter/index/regexp/options/off_all/begin_in.test test/command/suite/select/filter/index/regexp/options/off_all/begin_out.expected test/command/suite/select/filter/index/regexp/options/off_all/begin_out.test test/command/suite/select/filter/index/regexp/options/off_all/end_in.expected test/command/suite/select/filter/index/regexp/options/off_all/end_in.test test/command/suite/select/filter/index/regexp/options/off_all/end_out.expected test/command/suite/select/filter/index/regexp/options/off_all/end_out.test Modified files: lib/expr.c lib/ii.c lib/mrb/scripts/scan_info_data.rb test/command/suite/select/function/highlight_html/regexp.expected test/command/suite/select/function/highlight_html/regexp.test Modified: lib/expr.c (+39 -9) =================================================================== --- lib/expr.c 2018-10-15 14:19:02 +0900 (f97d31224) +++ lib/expr.c 2018-10-17 01:36:07 +0900 (a65c12d14) @@ -2324,9 +2324,12 @@ scan_info_build_match_expr(grn_ctx *ctx, static grn_bool is_index_searchable_regexp(grn_ctx *ctx, grn_obj *regexp) { + const char *all_off_options = "?-mix:"; + size_t all_off_options_length = strlen(all_off_options); const char *regexp_raw; const char *regexp_raw_end; grn_bool escaping = GRN_FALSE; + grn_bool in_paren = GRN_FALSE; grn_bool dot = GRN_FALSE; if (!(regexp->header.domain == GRN_DB_SHORT_TEXT || @@ -2380,15 +2383,35 @@ is_index_searchable_regexp(grn_ctx *ctx, grn_obj *regexp) } } else { switch (regexp_raw[0]) { + case '(' : + if (in_paren) { + return GRN_FALSE; + } else { + const char *options = regexp_raw + 1; + if (regexp_raw_end - options >= all_off_options_length && + memcmp(options, all_off_options, all_off_options_length) == 0) { + in_paren = GRN_TRUE; + regexp_raw += all_off_options_length; + continue; + } else { + return GRN_FALSE; + } + } + break; + case ')' : + if (in_paren) { + in_paren = GRN_FALSE; + } else { + return GRN_FALSE; + } + break; case '.' : - escaping = GRN_FALSE; if (dot) { return GRN_FALSE; } dot = GRN_TRUE; break; case '*' : - escaping = GRN_FALSE; if (!dot) { return GRN_FALSE; } @@ -2406,9 +2429,6 @@ is_index_searchable_regexp(grn_ctx *ctx, grn_obj *regexp) case '}' : case '^' : case '$' : - case '(' : - case ')' : - escaping = GRN_FALSE; return GRN_FALSE; case '\\' : if (dot) { @@ -2420,7 +2440,6 @@ is_index_searchable_regexp(grn_ctx *ctx, grn_obj *regexp) if (dot) { return GRN_FALSE; } - escaping = GRN_FALSE; break; } } @@ -2431,6 +2450,13 @@ is_index_searchable_regexp(grn_ctx *ctx, grn_obj *regexp) regexp_raw += char_len; } + if (dot) { + return GRN_FALSE; + } + if (in_paren) { + return GRN_FALSE; + } + return GRN_TRUE; } @@ -7086,6 +7112,8 @@ grn_expr_syntax_expand_query_by_table(grn_ctx *ctx, static void grn_expr_get_keywords_regexp(grn_ctx *ctx, grn_obj *keywords, grn_obj *regexp) { + const char *all_off_options = "?-mix:"; + size_t all_off_options_length = strlen(all_off_options); const char *regexp_raw; const char *regexp_raw_end; grn_bool escaping = GRN_FALSE; @@ -7122,11 +7150,14 @@ grn_expr_get_keywords_regexp(grn_ctx *ctx, grn_obj *keywords, grn_obj *regexp) } } else { switch (regexp_raw[0]) { + case '(' : + regexp_raw += all_off_options_length; + break; + case ')' : + break; case '.' : - escaping = GRN_FALSE; break; case '*' : - escaping = GRN_FALSE; if (GRN_TEXT_LEN(&keyword) > 0) { grn_vector_add_element(ctx, keywords, @@ -7141,7 +7172,6 @@ grn_expr_get_keywords_regexp(grn_ctx *ctx, grn_obj *keywords, grn_obj *regexp) escaping = GRN_TRUE; break; default : - escaping = GRN_FALSE; GRN_TEXT_PUTC(ctx, &keyword, regexp_raw[0]); break; } Modified: lib/ii.c (+17 -1) =================================================================== --- lib/ii.c 2018-10-15 14:19:02 +0900 (7d819d8e1) +++ lib/ii.c 2018-10-17 01:36:07 +0900 (ea89f9e82) @@ -8448,6 +8448,9 @@ grn_ii_parse_regexp_query(grn_ctx *ctx, grn_obj *parsed_strings) { grn_bool escaping = GRN_FALSE; + grn_bool in_paren = GRN_FALSE; + const char *all_off_options = "?-mix:"; + size_t all_off_options_len = strlen(all_off_options); int nth_char = 0; const char *current = string; const char *string_end = string + string_len; @@ -8482,7 +8485,11 @@ grn_ii_parse_regexp_query(grn_ctx *ctx, } break; case 'z' : - if (current == string_end) { + if (current == string_end || + (in_paren && + grn_charlen(ctx, current, string_end) == 1 && + *current == ')' && + (current + 1) == string_end)) { target = GRN_TOKENIZER_END_MARK_UTF8; char_len = GRN_TOKENIZER_END_MARK_UTF8_LEN; } @@ -8496,6 +8503,15 @@ grn_ii_parse_regexp_query(grn_ctx *ctx, if (*target == '\\') { escaping = GRN_TRUE; continue; + } else if (*target == '(' && + (string_end - current) >= all_off_options_len && + memcmp(current, all_off_options, all_off_options_len) == 0) { + current += all_off_options_len; + in_paren = GRN_TRUE; + continue; + } else if (*target == ')') { + in_paren = GRN_FALSE; + continue; } else if (*target == '.' && grn_charlen(ctx, current, string_end) == 1 && *current == '*') { Modified: lib/mrb/scripts/scan_info_data.rb (+42 -3) =================================================================== --- lib/mrb/scripts/scan_info_data.rb 2018-10-15 14:19:02 +0900 (904fe748b) +++ lib/mrb/scripts/scan_info_data.rb 2018-10-17 01:36:07 +0900 (70dee2e6d) @@ -148,6 +148,8 @@ module Groonga def index_searchable_regexp?(pattern) return false if pattern.nil? + paren = :outside + dot = false previous_char = nil pattern.value.each_char do |char| if previous_char == "\\" @@ -167,13 +169,50 @@ module Groonga next end else - case char - when ".", "[", "]", "|", "?", "+", "*", "{", "}", "^", "$", "(", ")" - return false + case paren + when :starting + case char + when "?" + return false if previous_char != "(" + when "-" + return false if previous_char != "?" + when "m" + return false if previous_char != "-" + when "i" + return false if previous_char != "m" + when "x" + return false if previous_char != "i" + when ":" + return false if previous_char != "x" + paren = :inside + else + return false + end + else + case char + when "(" + return false unless paren == :outside + paren = :starting + when ")" + return false unless paren == :inside + paren = :outside + when "." + return false if dot + dot = true + when "*" + return false unless dot + dot = false + when "[", "]", "|", "?", "+", "{", "}", "^", "$" + return false + else + return false if dot + end end end previous_char = char end + return false if dot + return false unless paren == :outside true end Added: test/command/suite/select/filter/index/regexp/options/off_all/begin_in.expected (+26 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/filter/index/regexp/options/off_all/begin_in.expected 2018-10-17 01:36:07 +0900 (10967255d) @@ -0,0 +1,26 @@ +table_create Memos TABLE_NO_KEY +[[0,0.0,0.0],true] +column_create Memos content COLUMN_SCALAR Text +[[0,0.0,0.0],true] +table_create RegexpTokens TABLE_PAT_KEY ShortText --default_tokenizer TokenRegexp +[[0,0.0,0.0],true] +column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION Memos content +[[0,0.0,0.0],true] +load --table Memos +[ +{"content": "Groonga"}, +{"content": "Rroonga"}, +{"content": "Ruby and Rroonga"} +] +[[0,0.0,0.0],3] +log_level --level info +[[0,0.0,0.0],true] +select Memos --filter 'content @~ "(?-mix:\\\\ARro)"' +[[0,0.0,0.0],[[[1],[["_id","UInt32"],["content","Text"]],[2,"Rroonga"]]]] +#|i| [object][search][index][key][regexp] <RegexpTokens.memos_content> +#|i| grn_ii_sel > ((?-mix:\ARro)) +#|i| n=3 (Rro) +#|i| exact: 1 +#|i| hits=1 +log_level --level notice +[[0,0.0,0.0],true] Added: test/command/suite/select/filter/index/regexp/options/off_all/begin_in.test (+20 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/filter/index/regexp/options/off_all/begin_in.test 2018-10-17 01:36:07 +0900 (bc37d0499) @@ -0,0 +1,20 @@ +table_create Memos TABLE_NO_KEY +column_create Memos content COLUMN_SCALAR Text + +table_create RegexpTokens TABLE_PAT_KEY ShortText \ + --default_tokenizer TokenRegexp +column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION \ + Memos content + +load --table Memos +[ +{"content": "Groonga"}, +{"content": "Rroonga"}, +{"content": "Ruby and Rroonga"} +] + +log_level --level info +#@add-important-log-levels info +select Memos --filter 'content @~ "(?-mix:\\\\ARro)"' +#@remove-important-log-levels info +log_level --level notice Added: test/command/suite/select/filter/index/regexp/options/off_all/begin_out.expected (+26 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/filter/index/regexp/options/off_all/begin_out.expected 2018-10-17 01:36:07 +0900 (883ee9f77) @@ -0,0 +1,26 @@ +table_create Memos TABLE_NO_KEY +[[0,0.0,0.0],true] +column_create Memos content COLUMN_SCALAR Text +[[0,0.0,0.0],true] +table_create RegexpTokens TABLE_PAT_KEY ShortText --default_tokenizer TokenRegexp +[[0,0.0,0.0],true] +column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION Memos content +[[0,0.0,0.0],true] +load --table Memos +[ +{"content": "Groonga"}, +{"content": "Rroonga"}, +{"content": "Ruby and Rroonga"} +] +[[0,0.0,0.0],3] +log_level --level info +[[0,0.0,0.0],true] +select Memos --filter 'content @~ "\\\\A(?-mix:Rro)"' +[[0,0.0,0.0],[[[1],[["_id","UInt32"],["content","Text"]],[2,"Rroonga"]]]] +#|i| [object][search][index][key][regexp] <RegexpTokens.memos_content> +#|i| grn_ii_sel > (\A(?-mix:Rro)) +#|i| n=3 (Rro) +#|i| exact: 1 +#|i| hits=1 +log_level --level notice +[[0,0.0,0.0],true] Added: test/command/suite/select/filter/index/regexp/options/off_all/begin_out.test (+20 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/filter/index/regexp/options/off_all/begin_out.test 2018-10-17 01:36:07 +0900 (a2d37debe) @@ -0,0 +1,20 @@ +table_create Memos TABLE_NO_KEY +column_create Memos content COLUMN_SCALAR Text + +table_create RegexpTokens TABLE_PAT_KEY ShortText \ + --default_tokenizer TokenRegexp +column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION \ + Memos content + +load --table Memos +[ +{"content": "Groonga"}, +{"content": "Rroonga"}, +{"content": "Ruby and Rroonga"} +] + +log_level --level info +#@add-important-log-levels info +select Memos --filter 'content @~ "\\\\A(?-mix:Rro)"' +#@remove-important-log-levels info +log_level --level notice Added: test/command/suite/select/filter/index/regexp/options/off_all/end_in.expected (+57 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/filter/index/regexp/options/off_all/end_in.expected 2018-10-17 01:36:07 +0900 (2104877a8) @@ -0,0 +1,57 @@ +table_create Memos TABLE_NO_KEY +[[0,0.0,0.0],true] +column_create Memos content COLUMN_SCALAR Text +[[0,0.0,0.0],true] +table_create RegexpTokens TABLE_PAT_KEY ShortText --default_tokenizer TokenRegexp +[[0,0.0,0.0],true] +column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION Memos content +[[0,0.0,0.0],true] +load --table Memos +[ +{"content": "Groonga"}, +{"content": "Rroonga"}, +{"content": "Rroonga and Ruby"} +] +[[0,0.0,0.0],3] +log_level --level info +[[0,0.0,0.0],true] +select Memos --filter 'content @~ "(?-mix:onga\\\\z)"' +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 2 + ], + [ + [ + "_id", + "UInt32" + ], + [ + "content", + "Text" + ] + ], + [ + 1, + "Groonga" + ], + [ + 2, + "Rroonga" + ] + ] + ] +] +#|i| [object][search][index][key][regexp] <RegexpTokens.memos_content> +#|i| grn_ii_sel > ((?-mix:onga\z)) +#|i| n=3 (onga) +#|i| exact: 2 +#|i| hits=2 +log_level --level notice +[[0,0.0,0.0],true] Added: test/command/suite/select/filter/index/regexp/options/off_all/end_in.test (+20 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/filter/index/regexp/options/off_all/end_in.test 2018-10-17 01:36:07 +0900 (03cd01bf2) @@ -0,0 +1,20 @@ +table_create Memos TABLE_NO_KEY +column_create Memos content COLUMN_SCALAR Text + +table_create RegexpTokens TABLE_PAT_KEY ShortText \ + --default_tokenizer TokenRegexp +column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION \ + Memos content + +load --table Memos +[ +{"content": "Groonga"}, +{"content": "Rroonga"}, +{"content": "Rroonga and Ruby"} +] + +log_level --level info +#@add-important-log-levels info +select Memos --filter 'content @~ "(?-mix:onga\\\\z)"' +#@remove-important-log-levels info +log_level --level notice Added: test/command/suite/select/filter/index/regexp/options/off_all/end_out.expected (+57 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/filter/index/regexp/options/off_all/end_out.expected 2018-10-17 01:36:07 +0900 (b261eda5e) @@ -0,0 +1,57 @@ +table_create Memos TABLE_NO_KEY +[[0,0.0,0.0],true] +column_create Memos content COLUMN_SCALAR Text +[[0,0.0,0.0],true] +table_create RegexpTokens TABLE_PAT_KEY ShortText --default_tokenizer TokenRegexp +[[0,0.0,0.0],true] +column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION Memos content +[[0,0.0,0.0],true] +load --table Memos +[ +{"content": "Groonga"}, +{"content": "Rroonga"}, +{"content": "Rroonga and Ruby"} +] +[[0,0.0,0.0],3] +log_level --level info +[[0,0.0,0.0],true] +select Memos --filter 'content @~ "(?-mix:onga)\\\\z"' +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 2 + ], + [ + [ + "_id", + "UInt32" + ], + [ + "content", + "Text" + ] + ], + [ + 1, + "Groonga" + ], + [ + 2, + "Rroonga" + ] + ] + ] +] +#|i| [object][search][index][key][regexp] <RegexpTokens.memos_content> +#|i| grn_ii_sel > ((?-mix:onga)\z) +#|i| n=3 (onga) +#|i| exact: 2 +#|i| hits=2 +log_level --level notice +[[0,0.0,0.0],true] Added: test/command/suite/select/filter/index/regexp/options/off_all/end_out.test (+20 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/filter/index/regexp/options/off_all/end_out.test 2018-10-17 01:36:07 +0900 (5702f286d) @@ -0,0 +1,20 @@ +table_create Memos TABLE_NO_KEY +column_create Memos content COLUMN_SCALAR Text + +table_create RegexpTokens TABLE_PAT_KEY ShortText \ + --default_tokenizer TokenRegexp +column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION \ + Memos content + +load --table Memos +[ +{"content": "Groonga"}, +{"content": "Rroonga"}, +{"content": "Rroonga and Ruby"} +] + +log_level --level info +#@add-important-log-levels info +select Memos --filter 'content @~ "(?-mix:onga)\\\\z"' +#@remove-important-log-levels info +log_level --level notice Modified: test/command/suite/select/function/highlight_html/regexp.expected (+1 -1) =================================================================== --- test/command/suite/select/function/highlight_html/regexp.expected 2018-10-15 14:19:02 +0900 (265cb9d70) +++ test/command/suite/select/function/highlight_html/regexp.expected 2018-10-17 01:36:07 +0900 (61c9384f1) @@ -11,7 +11,7 @@ load --table Entries {"body": "Mroonga is a MySQL storage engine based on Groonga. <b>Rroonga</b> is a Ruby binding of Groonga."} ] [[0,0.0,0.0],1] -select Entries --filter "body @~ 'ro.*ga'" --output_columns "highlight_html(body)" +select Entries --filter "body @~ '(?-mix:ro.*ga)'" --output_columns "highlight_html(body)" [ [ 0, Modified: test/command/suite/select/function/highlight_html/regexp.test (+1 -1) =================================================================== --- test/command/suite/select/function/highlight_html/regexp.test 2018-10-15 14:19:02 +0900 (77eb47652) +++ test/command/suite/select/function/highlight_html/regexp.test 2018-10-17 01:36:07 +0900 (2c12a7ddd) @@ -10,5 +10,5 @@ load --table Entries ] select Entries \ - --filter "body @~ 'ro.*ga'" \ + --filter "body @~ '(?-mix:ro.*ga)'" \ --output_columns "highlight_html(body)" -------------- next part -------------- An HTML attachment was scrubbed... URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20181017/6148d1f9/attachment-0001.html>