Kouhei Sutou
null+****@clear*****
Sun May 10 00:02:05 JST 2015
Kouhei Sutou	2015-05-10 00:02:05 +0900 (Sun, 10 May 2015)

  New Revision: 161d31f5fe46031facf4c511176d9f3d299a097d
  https://github.com/groonga/groonga/commit/161d31f5fe46031facf4c511176d9f3d299a097d

  Message:
    TokenRegexp: don't ignore blank on GET mode

  Added files:
    test/command/suite/tokenizers/regexp/get/normalizer/blank/escape.expected
    test/command/suite/tokenizers/regexp/get/normalizer/blank/escape.test
    test/command/suite/tokenizers/regexp/get/normalizer/blank/less_after.expected
    test/command/suite/tokenizers/regexp/get/normalizer/blank/less_after.test
    test/command/suite/tokenizers/regexp/get/normalizer/blank/less_before.expected
    test/command/suite/tokenizers/regexp/get/normalizer/blank/less_before.test
  Modified files:
    lib/tokenizers.c

  Modified: lib/tokenizers.c (+15 -8)
===================================================================
--- lib/tokenizers.c    2015-05-09 23:14:59 +0900 (f4bb58a)
+++ lib/tokenizers.c    2015-05-10 00:02:05 +0900 (e983ef1)
@@ -585,10 +585,12 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   const uint_least8_t *char_types = tokenizer->char_types + tokenizer->nth_char;
   grn_tokenize_mode mode = tokenizer->query->tokenize_mode;
+  grn_bool is_first_token = tokenizer->is_first_token;
   grn_bool escaping = GRN_FALSE;
   grn_bool break_by_blank = GRN_FALSE;
 
   GRN_BULK_REWIND(buffer);
+  tokenizer->is_first_token = GRN_FALSE;
 
   if (mode == GRN_TOKEN_GET) {
     if (tokenizer->get.have_begin) {
@@ -650,20 +652,23 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
     n_characters++;
     GRN_TEXT_PUT(ctx, buffer, current, char_len);
     current += char_len;
-    escaping = GRN_FALSE;
     if (n_characters == 1) {
       tokenizer->next = current;
       tokenizer->nth_char++;
+      if (escaping) {
+        tokenizer->nth_char++;
+      }
     }
-    if (n_characters == ngram_unit) {
-      break;
-    }
+    escaping = GRN_FALSE;
 
     char_type = char_types[0];
     char_types++;
     if (GRN_STR_ISBLANK(char_type)) {
       break_by_blank = GRN_TRUE;
       break;
     }
+    if (n_characters == ngram_unit) {
+      break;
+    }
 
     char_len = grn_charlen_(ctx, (const char *)current, (const char *)end,
@@ -676,7 +681,7 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   if (tokenizer->is_overlapping) {
     status |= GRN_TOKEN_OVERLAP;
   }
-  if (n_characters < ngram_unit && !break_by_blank) {
+  if (n_characters < ngram_unit) {
     status |= GRN_TOKEN_UNMATURED;
   }
   tokenizer->is_overlapping = (n_characters > 1);
@@ -688,7 +693,7 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
       tokenizer->is_end = GRN_TRUE;
     }
     if (status & GRN_TOKEN_UNMATURED) {
-      if (tokenizer->is_first_token) {
+      if (is_first_token) {
         status |= GRN_TOKEN_FORCE_PREFIX;
       } else {
         status |= GRN_TOKEN_SKIP;
@@ -702,7 +707,10 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
         }
       }
     } else {
-      if (tokenizer->get.n_skip_tokens > 0) {
+      if (break_by_blank) {
+        tokenizer->get.n_skip_tokens = 0;
+        tokenizer->is_first_token = GRN_TRUE;
+      } else if (tokenizer->get.n_skip_tokens > 0) {
         tokenizer->get.n_skip_tokens--;
         status |= GRN_TOKEN_SKIP;
       } else {
@@ -720,7 +728,6 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
                            GRN_TEXT_VALUE(buffer),
                            GRN_TEXT_LEN(buffer),
                            status);
-  tokenizer->is_first_token = GRN_FALSE;
   return NULL;
 }

  Added: test/command/suite/tokenizers/regexp/get/normalizer/blank/escape.expected (+70 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/normalizer/blank/escape.expected    2015-05-10 00:02:05 +0900 (989ce04)
@@ -0,0 +1,70 @@
+table_create Lexicon TABLE_PAT_KEY ShortText --default_tokenizer TokenRegexp --normalizer NormalizerAuto
+[[0,0.0,0.0],true]
+table_tokenize Lexicon "abc\ndef" --mode ADD
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "",
+      "position": 0
+    },
+    {
+      "value": "ab",
+      "position": 1
+    },
+    {
+      "value": "bc",
+      "position": 2
+    },
+    {
+      "value": "c",
+      "position": 3
+    },
+    {
+      "value": "de",
+      "position": 4
+    },
+    {
+      "value": "ef",
+      "position": 5
+    },
+    {
+      "value": "f",
+      "position": 6
+    },
+    {
+      "value": "",
+      "position": 7
+    }
+  ]
+]
+table_tokenize Lexicon "a\\bc\ndef" --mode GET
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "ab",
+      "position": 0
+    },
+    {
+      "value": "bc",
+      "position": 1
+    },
+    {
+      "value": "de",
+      "position": 3
+    },
+    {
+      "value": "ef",
+      "position": 4
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/regexp/get/normalizer/blank/escape.test (+6 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/normalizer/blank/escape.test    2015-05-10 00:02:05 +0900 (e4772fc)
@@ -0,0 +1,6 @@
+table_create Lexicon TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenRegexp \
+  --normalizer NormalizerAuto
+table_tokenize Lexicon "abc\ndef" --mode ADD
+
+table_tokenize Lexicon "a\\bc\ndef" --mode GET

  Added: test/command/suite/tokenizers/regexp/get/normalizer/blank/less_after.expected (+58 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/normalizer/blank/less_after.expected    2015-05-10 00:02:05 +0900 (c05a87c)
@@ -0,0 +1,58 @@
+table_create Lexicon TABLE_PAT_KEY ShortText --default_tokenizer TokenRegexp --normalizer NormalizerAuto
+[[0,0.0,0.0],true]
+table_tokenize Lexicon "abc\nd" --mode ADD
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "",
+      "position": 0
+    },
+    {
+      "value": "ab",
+      "position": 1
+    },
+    {
+      "value": "bc",
+      "position": 2
+    },
+    {
+      "value": "c",
+      "position": 3
+    },
+    {
+      "value": "d",
+      "position": 4
+    },
+    {
+      "value": "",
+      "position": 5
+    }
+  ]
+]
+table_tokenize Lexicon "abc\nd" --mode GET
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "ab",
+      "position": 0
+    },
+    {
+      "value": "bc",
+      "position": 1
+    },
+    {
+      "value": "d",
+      "position": 3
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/regexp/get/normalizer/blank/less_after.test (+6 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/normalizer/blank/less_after.test    2015-05-10 00:02:05 +0900 (6d79cc7)
@@ -0,0 +1,6 @@
+table_create Lexicon TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenRegexp \
+  --normalizer NormalizerAuto
+table_tokenize Lexicon "abc\nd" --mode ADD
+
+table_tokenize Lexicon "abc\nd" --mode GET

  Added: test/command/suite/tokenizers/regexp/get/normalizer/blank/less_before.expected (+58 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/normalizer/blank/less_before.expected    2015-05-10 00:02:05 +0900 (1607760)
@@ -0,0 +1,58 @@
+table_create Lexicon TABLE_PAT_KEY ShortText --default_tokenizer TokenRegexp --normalizer NormalizerAuto
+[[0,0.0,0.0],true]
+table_tokenize Lexicon "a\ndef" --mode ADD
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "",
+      "position": 0
+    },
+    {
+      "value": "a",
+      "position": 1
+    },
+    {
"de", + "position": 2 + }, + { + "value": "ef", + "position": 3 + }, + { + "value": "f", + "position": 4 + }, + { + "value": "", + "position": 5 + } + ] +] +table_tokenize Lexicon "a\ndef" --mode GET +[ + [ + 0, + 0.0, + 0.0 + ], + [ + { + "value": "a", + "position": 0 + }, + { + "value": "de", + "position": 1 + }, + { + "value": "ef", + "position": 2 + } + ] +] Added: test/command/suite/tokenizers/regexp/get/normalizer/blank/less_before.test (+6 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/regexp/get/normalizer/blank/less_before.test 2015-05-10 00:02:05 +0900 (b753510) @@ -0,0 +1,6 @@ +table_create Lexicon TABLE_PAT_KEY ShortText \ + --default_tokenizer TokenRegexp \ + --normalizer NormalizerAuto +table_tokenize Lexicon "a\ndef" --mode ADD + +table_tokenize Lexicon "a\ndef" --mode GET -------------- next part -------------- HTML����������������������������...Download