Kouhei Sutou 2019-02-07 15:11:00 +0900 (Thu, 07 Feb 2019) Revision: ef6e895e3d52972267801f21667debe017b31c65 https://github.com/groonga/groonga/commit/ef6e895e3d52972267801f21667debe017b31c65 Message: TokenPattern: stop returning needless the last empty token Added files: test/command/suite/select/query/match/with_index/token_pattern/match.expected test/command/suite/select/query/match/with_index/token_pattern/match.test Modified files: lib/tokenizers.c Modified: lib/tokenizers.c (+47 -22) =================================================================== --- lib/tokenizers.c 2019-02-07 14:48:48 +0900 (41f8ee843) +++ lib/tokenizers.c 2019-02-07 15:11:00 +0900 (4abd5450d) @@ -1561,6 +1561,8 @@ typedef struct { const unsigned char *start; const unsigned char *next; const unsigned char *end; + const unsigned char *current; + size_t current_length; } grn_pattern_tokenizer; static void @@ -1698,11 +1700,42 @@ pattern_init(grn_ctx *ctx, grn_tokenizer_query *query) tokenizer->start = (const unsigned char *)normalized; tokenizer->next = tokenizer->start; tokenizer->end = tokenizer->start + normalized_length_in_bytes; + tokenizer->current = NULL; + tokenizer->current_length = 0; } return tokenizer; } +#ifdef GRN_SUPPORT_REGEXP +static void +pattern_search(grn_ctx *ctx, + grn_pattern_tokenizer *tokenizer) +{ + OnigPosition position; + OnigRegion region; + + onig_region_init(&region); + position = onig_search(tokenizer->options->regex, + tokenizer->start, + tokenizer->end, + tokenizer->next, + tokenizer->end, + &region, + ONIG_OPTION_NONE); + if (position == ONIG_MISMATCH) { + tokenizer->current = NULL; + tokenizer->current_length = 0; + tokenizer->next = tokenizer->end; + } else { + tokenizer->current = tokenizer->start + region.beg[0]; + tokenizer->current_length = region.end[0] - region.beg[0]; + tokenizer->next = tokenizer->start + region.end[0]; + } + onig_region_free(&region, 0); +} +#endif + static void pattern_next(grn_ctx *ctx, grn_tokenizer_query *query, @@ -1723,29 
+1756,21 @@ pattern_next(grn_ctx *ctx, tokenizer->encoding); #ifdef GRN_SUPPORT_REGEXP } else if (tokenizer->options->regex) { - OnigPosition position; - OnigRegion region; - - onig_region_init(&region); - position = onig_search(tokenizer->options->regex, - tokenizer->start, - tokenizer->end, - tokenizer->next, - tokenizer->end, - &region, - ONIG_OPTION_NONE); - if (position == ONIG_MISMATCH) { - grn_token_set_data(ctx, token, NULL, 0); - grn_token_set_status(ctx, token, GRN_TOKEN_LAST); - } else { - grn_token_set_data(ctx, - token, - tokenizer->start + region.beg[0], - region.end[0] - region.beg[0]); - grn_token_set_status(ctx, token, GRN_TOKEN_CONTINUE); - tokenizer->next = tokenizer->start + region.end[0]; - onig_region_free(&region, 0); + grn_token_status status = GRN_TOKEN_CONTINUE; + if (tokenizer->next == tokenizer->start) { + pattern_search(ctx, tokenizer); } + grn_token_set_data(ctx, + token, + tokenizer->current, + tokenizer->current_length); + if (tokenizer->next != tokenizer->end) { + pattern_search(ctx, tokenizer); + } + if (tokenizer->next == tokenizer->end) { + status = GRN_TOKEN_LAST; + } + grn_token_set_status(ctx, token, status); #endif /* GRN_SUPPORT_REGEXP */ } else { grn_token_set_data(ctx, token, NULL, 0); Added: test/command/suite/select/query/match/with_index/token_pattern/match.expected (+16 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/query/match/with_index/token_pattern/match.expected 2019-02-07 15:11:00 +0900 (fb2c6767a) @@ -0,0 +1,16 @@ +table_create Menus TABLE_NO_KEY +[[0,0.0,0.0],true] +column_create Menus name COLUMN_SCALAR Text +[[0,0.0,0.0],true] +table_create Keywords TABLE_PAT_KEY ShortText --normalize NormalizerNFKC100 --default_tokenizer 'TokenPattern("pattern", "焼き?肉")' +[[0,0.0,0.0],true] +column_create Keywords index COLUMN_INDEX Menus name +[[0,0.0,0.0],true] +load --table Menus +[ +{"name": "焼肉定食"}, +{"name": "焼き肉定食"} +] +[[0,0.0,0.0],2] +select Menus 
--match_columns name --query "焼き肉弁当" +[[0,0.0,0.0],[[[1],[["_id","UInt32"],["name","Text"]],[2,"焼き肉定食"]]]] Added: test/command/suite/select/query/match/with_index/token_pattern/match.test (+15 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/query/match/with_index/token_pattern/match.test 2019-02-07 15:11:00 +0900 (f6e1d3dd1) @@ -0,0 +1,15 @@ +table_create Menus TABLE_NO_KEY +column_create Menus name COLUMN_SCALAR Text + +table_create Keywords TABLE_PAT_KEY ShortText \ + --normalize NormalizerNFKC100 \ + --default_tokenizer 'TokenPattern("pattern", "焼き?肉")' +column_create Keywords index COLUMN_INDEX Menus name + +load --table Menus +[ +{"name": "焼肉定食"}, +{"name": "焼き肉定食"} +] + +select Menus --match_columns name --query "焼き肉弁当" -------------- next part -------------- An HTML attachment was scrubbed... URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190207/d9b1f6f0/attachment-0001.html>