Kouhei Sutou
null+****@clear*****
Sun May 10 17:14:35 JST 2015
Kouhei Sutou	2015-05-10 17:14:35 +0900 (Sun, 10 May 2015)

  New Revision: 3cfdcca9663f31d613e331a087c758fa0d791b60
  https://github.com/groonga/groonga/commit/3cfdcca9663f31d613e331a087c758fa0d791b60

  Message:
    TokenRegexp: move regular expression parser to ii.c from tokenizers

    Now, we can use TokenRegexp for match operation (@ in script syntax)
    without escaping.

  Added files:
    test/command/suite/select/filter/index/match/token_regexp.expected
    test/command/suite/select/filter/index/match/token_regexp.test
  Removed files:
    test/command/suite/tokenizers/regexp/get/escape/one.expected
    test/command/suite/tokenizers/regexp/get/escape/one.test
    test/command/suite/tokenizers/regexp/get/escape/two.expected
    test/command/suite/tokenizers/regexp/get/escape/two.test
    test/command/suite/tokenizers/regexp/get/normalizer/blank/escape.expected
    test/command/suite/tokenizers/regexp/get/normalizer/blank/escape.test
  Modified files:
    lib/ii.c
    lib/tokenizers.c
    test/command/suite/tokenizers/regexp/get/begin/one.expected
    test/command/suite/tokenizers/regexp/get/begin/one.test
    test/command/suite/tokenizers/regexp/get/begin/three.expected
    test/command/suite/tokenizers/regexp/get/begin/three.test
    test/command/suite/tokenizers/regexp/get/begin/two.expected
    test/command/suite/tokenizers/regexp/get/begin/two.test
    test/command/suite/tokenizers/regexp/get/end/four.expected
    test/command/suite/tokenizers/regexp/get/end/four.test
    test/command/suite/tokenizers/regexp/get/end/one.expected
    test/command/suite/tokenizers/regexp/get/end/one.test
    test/command/suite/tokenizers/regexp/get/end/three.expected
    test/command/suite/tokenizers/regexp/get/end/three.test
    test/command/suite/tokenizers/regexp/get/end/two.expected
    test/command/suite/tokenizers/regexp/get/end/two.test

  Modified: lib/ii.c (+74 -0)
===================================================================
--- lib/ii.c    2015-05-10 15:29:38 +0900 (2d32df2)
+++ lib/ii.c    2015-05-10 17:14:35 +0900 (6f6a0ca)
@@ -6019,6 +6019,77 @@ grn_ii_term_extract(grn_ctx *ctx, grn_ii *ii, const char *string,
   return rc;
 }
 
+static grn_rc
+grn_ii_select_regexp(grn_ctx *ctx, grn_ii *ii,
+                     const char *string, unsigned int string_len,
+                     grn_hash *s, grn_operator op, grn_select_optarg *optarg)
+{
+  grn_rc rc;
+  grn_obj parsed_string;
+  grn_bool escaping = GRN_FALSE;
+  int nth_char = 0;
+  const char *current = string;
+  const char *string_end = string + string_len;
+
+  GRN_TEXT_INIT(&parsed_string, 0);
+  while (current < string_end) {
+    const char *target;
+    int char_len;
+
+    char_len = grn_charlen(ctx, current, string_end);
+    if (char_len == 0) {
+      ERR(GRN_INVALID_ARGUMENT,
+          "[ii][select][regexp] invalid encoding character: <%.*s|%#x|>",
+          (int)(current - string), string,
+          *current);
+      return ctx->rc;
+    }
+    target = current;
+    current += char_len;
+
+    if (escaping) {
+      escaping = GRN_FALSE;
+      if (char_len == 1) {
+        switch (*target) {
+        case 'A' :
+          if (nth_char == 0) {
+            target = GRN_TOKENIZER_BEGIN_MARK_UTF8;
+            char_len = GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN;
+          }
+          break;
+        case 'z' :
+          if (current == string_end) {
+            target = GRN_TOKENIZER_END_MARK_UTF8;
+            char_len = GRN_TOKENIZER_END_MARK_UTF8_LEN;
+          }
+          break;
+        default :
+          break;
+        }
+      }
+    } else {
+      if (char_len == 1 && *target == '\\') {
+        escaping = GRN_TRUE;
+        continue;
+      }
+    }
+
+    GRN_TEXT_PUT(ctx, &parsed_string, target, char_len);
+    nth_char++;
+  }
+
+  if (optarg) {
+    optarg->mode = GRN_OP_MATCH;
+  }
+
+  rc = grn_ii_select(ctx, ii,
+                     GRN_TEXT_VALUE(&parsed_string),
+                     GRN_TEXT_LEN(&parsed_string),
+                     s, op, optarg);
+
+  return rc;
+}
+
 #ifdef GRN_II_SELECT_ENABLE_SEQUENTIAL_SEARCH
 static grn_bool
 grn_ii_select_sequential_search_should_use(grn_ctx *ctx,
@@ -6259,6 +6330,9 @@ grn_ii_select(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_
   if (mode == GRN_OP_TERM_EXTRACT) {
     return grn_ii_term_extract(ctx, ii, string, string_len, s, op, optarg);
   }
+  if (mode == GRN_OP_REGEXP) {
+    return grn_ii_select_regexp(ctx, ii, string, string_len, s, op, optarg);
+  }
   /* todo : support subrec
   rep = (s->record_unit == grn_rec_position || s->subrec_unit == grn_rec_position);
   orp = (s->record_unit == grn_rec_position || op == GRN_OP_OR);

  Modified: lib/tokenizers.c (+72 -106)
===================================================================
--- lib/tokenizers.c    2015-05-10 15:29:38 +0900 (dfa9cc5)
+++ lib/tokenizers.c    2015-05-10 17:14:35 +0900 (0dd0f1e)
@@ -473,8 +473,6 @@ typedef struct {
   grn_tokenizer_token token;
   grn_tokenizer_query *query;
   struct {
-    grn_bool have_begin;
-    grn_bool have_end;
     int32_t n_skip_tokens;
   } get;
   grn_bool is_begin;
@@ -514,8 +512,6 @@ regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   grn_tokenizer_token_init(ctx, &(tokenizer->token));
   tokenizer->query = query;
 
-  tokenizer->get.have_begin = GRN_FALSE;
-  tokenizer->get.have_end = GRN_FALSE;
   tokenizer->get.n_skip_tokens = 0;
 
   tokenizer->is_begin = GRN_TRUE;
@@ -532,40 +528,6 @@ regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   tokenizer->char_types =
     grn_string_get_types(ctx, tokenizer->query->normalized_query);
 
-  if (tokenizer->query->tokenize_mode == GRN_TOKEN_GET) {
-    unsigned int query_length = tokenizer->query->length;
-    if (query_length >= 2) {
-      const char *query_string = tokenizer->query->ptr;
-      grn_encoding encoding = tokenizer->query->encoding;
-      if (query_string[0] == '\\' && query_string[1] == 'A') {
-        tokenizer->get.have_begin = GRN_TRUE;
-        /* TODO: It assumes that both "\\" and "A" are normalized to 1
-           characters. Normalizer may omit character or expand to
-           multiple characters. */
-        tokenizer->next += grn_charlen_(ctx, tokenizer->next, tokenizer->end,
-                                        encoding);
-        tokenizer->next += grn_charlen_(ctx, tokenizer->next, tokenizer->end,
-                                        encoding);
-        tokenizer->nth_char = 2;
-      }
-      if (query_string[query_length - 2] == '\\' &&
-          query_string[query_length - 1] == 'z') {
-        tokenizer->get.have_end = GRN_TRUE;
-        /* TODO: It assumes that both "\\" and "z" are normalized to 1
-           byte characters. Normalizer may omit character or expand to
-           multiple characters. */
-        tokenizer->end -= grn_charlen_(ctx,
-                                       tokenizer->end - 1,
-                                       tokenizer->end,
-                                       encoding);
-        tokenizer->end -= grn_charlen_(ctx,
-                                       tokenizer->end - 1,
-                                       tokenizer->end,
-                                       encoding);
-      }
-    }
-  }
-
   GRN_TEXT_INIT(&(tokenizer->buffer), 0);
 
   return NULL;
@@ -584,45 +546,26 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   const char *end = tokenizer->end;
   const uint_least8_t *char_types = tokenizer->char_types;
   grn_tokenize_mode mode = tokenizer->query->tokenize_mode;
+  grn_bool is_begin = tokenizer->is_begin;
   grn_bool is_start_token = tokenizer->is_start_token;
-  grn_bool escaping = GRN_FALSE;
   grn_bool break_by_blank = GRN_FALSE;
+  grn_bool break_by_end_mark = GRN_FALSE;
 
   GRN_BULK_REWIND(buffer);
+  tokenizer->is_begin = GRN_FALSE;
   tokenizer->is_start_token = GRN_FALSE;
 
   if (char_types) {
     char_types += tokenizer->nth_char;
   }
 
-  if (mode == GRN_TOKEN_GET) {
-    if (tokenizer->get.have_begin) {
-      grn_tokenizer_token_push(ctx,
-                               &(tokenizer->token),
-                               GRN_TOKENIZER_BEGIN_MARK_UTF8,
-                               GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN,
-                               status);
-      tokenizer->get.have_begin = GRN_FALSE;
-      return NULL;
-    }
-
-    if (tokenizer->is_end && tokenizer->get.have_end) {
-      status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
-      grn_tokenizer_token_push(ctx,
-                               &(tokenizer->token),
-                               GRN_TOKENIZER_END_MARK_UTF8,
-                               GRN_TOKENIZER_END_MARK_UTF8_LEN,
-                               status);
-      return NULL;
-    }
-  } else {
-    if (tokenizer->is_begin) {
+  if (mode != GRN_TOKEN_GET) {
+    if (is_begin) {
       grn_tokenizer_token_push(ctx,
                                &(tokenizer->token),
                                GRN_TOKENIZER_BEGIN_MARK_UTF8,
                                GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN,
                                status);
-      tokenizer->is_begin = GRN_FALSE;
      return NULL;
    }

@@ -651,37 +594,54 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
     return NULL;
   }
 
-  while (GRN_TRUE) {
-    if (!escaping && mode == GRN_TOKEN_GET &&
-        char_len == 1 && current[0] == '\\') {
-      current += char_len;
-      escaping = GRN_TRUE;
-      if (char_types) {
-        char_types++;
-      }
-    } else {
+  if (mode == GRN_TOKEN_GET) {
+    if (is_begin &&
+        char_len == GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN &&
+        memcmp(current, GRN_TOKENIZER_BEGIN_MARK_UTF8, char_len) == 0) {
       n_characters++;
       GRN_TEXT_PUT(ctx, buffer, current, char_len);
       current += char_len;
-      if (n_characters == 1) {
-        tokenizer->next = current;
-        tokenizer->nth_char++;
-        if (escaping) {
-          tokenizer->nth_char++;
-        }
-      }
-      escaping = GRN_FALSE;
-      if (char_types) {
-        uint_least8_t char_type;
-        char_type = char_types[0];
-        char_types++;
-        if (GRN_STR_ISBLANK(char_type)) {
-          break_by_blank = GRN_TRUE;
-          break;
-        }
+      tokenizer->next = current;
+      tokenizer->nth_char++;
+      if (current == end) {
+        status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
       }
-      if (n_characters == ngram_unit) {
-        break;
+      grn_tokenizer_token_push(ctx,
+                               &(tokenizer->token),
+                               GRN_TOKENIZER_BEGIN_MARK_UTF8,
+                               GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN,
+                               status);
+      return NULL;
+    }
+
+    if (current + char_len == end &&
+        char_len == GRN_TOKENIZER_END_MARK_UTF8_LEN &&
+        memcmp(current, GRN_TOKENIZER_END_MARK_UTF8, char_len) == 0) {
+      status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
+      grn_tokenizer_token_push(ctx,
+                               &(tokenizer->token),
+                               GRN_TOKENIZER_END_MARK_UTF8,
+                               GRN_TOKENIZER_END_MARK_UTF8_LEN,
+                               status);
+      return NULL;
+    }
+  }
+
+  while (GRN_TRUE) {
+    n_characters++;
+    GRN_TEXT_PUT(ctx, buffer, current, char_len);
+    current += char_len;
+    if (n_characters == 1) {
+      tokenizer->next = current;
+      tokenizer->nth_char++;
+    }
+
+    if (char_types) {
+      uint_least8_t char_type;
+      char_type = char_types[0];
+      char_types++;
+      if (GRN_STR_ISBLANK(char_type)) {
+        break_by_blank = GRN_TRUE;
+      }
     }
@@ -690,6 +650,21 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
     if (char_len == 0) {
       break;
     }
+
+    if (mode == GRN_TOKEN_GET &&
+        current + char_len == end &&
+        char_len == GRN_TOKENIZER_END_MARK_UTF8_LEN &&
+        memcmp(current, GRN_TOKENIZER_END_MARK_UTF8, char_len) == 0) {
+      break_by_end_mark = GRN_TRUE;
+    }
+
+    if (break_by_blank || break_by_end_mark) {
+      break;
+    }
+
+    if (n_characters == ngram_unit) {
+      break;
+    }
   }
 
   if (tokenizer->is_overlapping) {
@@ -702,28 +677,19 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 
   if (mode == GRN_TOKEN_GET) {
     if (current == end) {
-      if (tokenizer->get.have_end) {
-        if (tokenizer->next == end) {
-          tokenizer->is_end = GRN_TRUE;
-        }
-        if (status & GRN_TOKEN_UNMATURED) {
-          if (is_start_token) {
-            status |= GRN_TOKEN_FORCE_PREFIX;
-          } else {
-            status |= GRN_TOKEN_SKIP;
-          }
-        }
-      } else {
-        tokenizer->is_end = GRN_TRUE;
-        status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
-        if (status & GRN_TOKEN_UNMATURED) {
-          status |= GRN_TOKEN_FORCE_PREFIX;
-        }
+      tokenizer->is_end = GRN_TRUE;
+      status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
+      if (status & GRN_TOKEN_UNMATURED) {
+        status |= GRN_TOKEN_FORCE_PREFIX;
      }
    } else {
      if (break_by_blank) {
        tokenizer->get.n_skip_tokens = 0;
        tokenizer->is_start_token = GRN_TRUE;
+      } else if (break_by_end_mark) {
+        if (!is_start_token && (status & GRN_TOKEN_UNMATURED)) {
+          status |= GRN_TOKEN_SKIP;
+        }
      } else if (tokenizer->get.n_skip_tokens > 0) {
        tokenizer->get.n_skip_tokens--;
        status |= GRN_TOKEN_SKIP;

  Added: test/command/suite/select/filter/index/match/token_regexp.expected (+42 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/filter/index/match/token_regexp.expected    2015-05-10 17:14:35 +0900 (76ff708)
@@ -0,0 +1,42 @@
+table_create Paths TABLE_PAT_KEY ShortText
+[[0,0.0,0.0],true]
+table_create RegexpTokens TABLE_PAT_KEY ShortText --default_tokenizer TokenRegexp --normalizer NormalizerAuto
+[[0,0.0,0.0],true]
+column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION Paths _key
+[[0,0.0,0.0],true]
+load --table Paths
+[
+{"_key": "c:\\Users\\alice"},
+{"_key": "c:\\Users\\alice\\Downloads"},
+{"_key": "c:\\Users\\bob\\Downloads"}
+]
+[[0,0.0,0.0],3]
+select Paths --filter '_key @ "\\\\Users\\\\alice\\\\"'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        1
+      ],
+      [
+        [
+          "_id",
+          "UInt32"
+        ],
+        [
+          "_key",
+          "ShortText"
+        ]
+      ],
+      [
+        2,
+        "c:\\Users\\alice\\Downloads"
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/select/filter/index/match/token_regexp.test (+16 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/filter/index/match/token_regexp.test    2015-05-10 17:14:35 +0900 (4d27bb0)
@@ -0,0 +1,16 @@
+table_create Paths TABLE_PAT_KEY ShortText
+
+table_create RegexpTokens TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenRegexp \
+  --normalizer NormalizerAuto
+column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION \
+  Paths _key
+
+load --table Paths
+[
+{"_key": "c:\\Users\\alice"},
+{"_key": "c:\\Users\\alice\\Downloads"},
+{"_key": "c:\\Users\\bob\\Downloads"}
+]
+
+select Paths --filter '_key @ "\\\\Users\\\\alice\\\\"'

  Modified: test/command/suite/tokenizers/regexp/get/begin/one.expected (+1 -1)
===================================================================
--- test/command/suite/tokenizers/regexp/get/begin/one.expected    2015-05-10 15:29:38 +0900 (8c9747a)
+++ test/command/suite/tokenizers/regexp/get/begin/one.expected    2015-05-10 17:14:35 +0900 (81aeaab)
@@ -22,5 +22,5 @@ table_tokenize Lexicon "x" --mode ADD
     }
   ]
 ]
-table_tokenize Lexicon "\\Ax" --mode GET
+table_tokenize Lexicon "x" --mode GET
 [[0,0.0,0.0],[{"value":"","position":0},{"value":"x","position":1}]]

  Modified: test/command/suite/tokenizers/regexp/get/begin/one.test (+1 -1)
===================================================================
--- test/command/suite/tokenizers/regexp/get/begin/one.test    2015-05-10 15:29:38 +0900 (0a4f35b)
+++ test/command/suite/tokenizers/regexp/get/begin/one.test    2015-05-10 17:14:35 +0900 (fb1cb1c)
@@ -2,4 +2,4 @@ table_create Lexicon TABLE_PAT_KEY ShortText \
   --default_tokenizer TokenRegexp
 table_tokenize Lexicon "x" --mode ADD
 
-table_tokenize Lexicon "\\Ax" --mode GET
+table_tokenize Lexicon "x" --mode GET

  Modified: test/command/suite/tokenizers/regexp/get/begin/three.expected (+1 -1)
===================================================================
--- test/command/suite/tokenizers/regexp/get/begin/three.expected    2015-05-10 15:29:38 +0900 (878c093)
+++ test/command/suite/tokenizers/regexp/get/begin/three.expected    2015-05-10 17:14:35 +0900 (7047d1e)
@@ -30,7 +30,7 @@ table_tokenize Lexicon "xyz" --mode ADD
     }
   ]
 ]
-table_tokenize Lexicon "\\Axyz" --mode GET
+table_tokenize Lexicon "xyz" --mode GET
 [
   [
     0,

  Modified: test/command/suite/tokenizers/regexp/get/begin/three.test (+1 -1)
===================================================================
--- test/command/suite/tokenizers/regexp/get/begin/three.test    2015-05-10 15:29:38 +0900 (82d674f)
+++ test/command/suite/tokenizers/regexp/get/begin/three.test    2015-05-10 17:14:35 +0900 (c4efefd)
@@ -2,4 +2,4 @@ table_create Lexicon TABLE_PAT_KEY ShortText \
   --default_tokenizer TokenRegexp
 table_tokenize Lexicon "xyz" --mode ADD
 
-table_tokenize Lexicon "\\Axyz" --mode GET
+table_tokenize Lexicon "xyz" --mode GET

  Modified: test/command/suite/tokenizers/regexp/get/begin/two.expected (+1 -1)
===================================================================
--- test/command/suite/tokenizers/regexp/get/begin/two.expected    2015-05-10 15:29:38 +0900 (8e82fb0)
+++ test/command/suite/tokenizers/regexp/get/begin/two.expected    2015-05-10 17:14:35 +0900 (588669b)
@@ -26,5 +26,5 @@ table_tokenize Lexicon "xy" --mode ADD
     }
   ]
 ]
-table_tokenize Lexicon "\\Axy" --mode GET
+table_tokenize Lexicon "xy" --mode GET
 [[0,0.0,0.0],[{"value":"","position":0},{"value":"xy","position":1}]]

  Modified: test/command/suite/tokenizers/regexp/get/begin/two.test (+1 -1)
===================================================================
--- test/command/suite/tokenizers/regexp/get/begin/two.test    2015-05-10 15:29:38 +0900 (9be9343)
+++ test/command/suite/tokenizers/regexp/get/begin/two.test    2015-05-10 17:14:35 +0900 (1a0dc73)
@@ -2,4 +2,4 @@ table_create Lexicon TABLE_PAT_KEY ShortText \
   --default_tokenizer TokenRegexp
 table_tokenize Lexicon "xy" --mode ADD
 
-table_tokenize Lexicon "\\Axy" --mode GET
+table_tokenize Lexicon "xy" --mode GET

  Modified: test/command/suite/tokenizers/regexp/get/end/four.expected (+1 -1)
===================================================================
--- test/command/suite/tokenizers/regexp/get/end/four.expected    2015-05-10 15:29:38 +0900 (ad58a34)
+++ test/command/suite/tokenizers/regexp/get/end/four.expected    2015-05-10 17:14:35 +0900 (6ce64ce)
@@ -34,7 +34,7 @@ table_tokenize Lexicon "abcd" --mode ADD
     }
   ]
 ]
-table_tokenize Lexicon "abcd\\z" --mode GET
+table_tokenize Lexicon "abcd" --mode GET
 [
   [
     0,

  Modified: test/command/suite/tokenizers/regexp/get/end/four.test (+1 -1)
===================================================================
--- test/command/suite/tokenizers/regexp/get/end/four.test    2015-05-10 15:29:38 +0900 (a4b1c2d)
+++ test/command/suite/tokenizers/regexp/get/end/four.test    2015-05-10 17:14:35 +0900 (0c18d81)
@@ -2,4 +2,4 @@ table_create Lexicon TABLE_PAT_KEY ShortText \
   --default_tokenizer TokenRegexp
 table_tokenize Lexicon "abcd" --mode ADD
 
-table_tokenize Lexicon "abcd\\z" --mode GET
+table_tokenize Lexicon "abcd" --mode GET

  Modified: test/command/suite/tokenizers/regexp/get/end/one.expected (+1 -1)
===================================================================
--- test/command/suite/tokenizers/regexp/get/end/one.expected    2015-05-10 15:29:38 +0900 (acaf793)
+++ test/command/suite/tokenizers/regexp/get/end/one.expected    2015-05-10 17:14:35 +0900 (9ba69f8)
@@ -22,5 +22,5 @@ table_tokenize Lexicon "x" --mode ADD
     }
   ]
 ]
-table_tokenize Lexicon "x\\z" --mode GET
+table_tokenize Lexicon "x" --mode GET
 [[0,0.0,0.0],[{"value":"x","position":0},{"value":"","position":1}]]

  Modified: test/command/suite/tokenizers/regexp/get/end/one.test (+1 -1)
===================================================================
--- test/command/suite/tokenizers/regexp/get/end/one.test    2015-05-10 15:29:38 +0900 (3314d6f)
+++ test/command/suite/tokenizers/regexp/get/end/one.test    2015-05-10 17:14:35 +0900 (b54d648)
@@ -2,4 +2,4 @@ table_create Lexicon TABLE_PAT_KEY ShortText \
   --default_tokenizer TokenRegexp
 table_tokenize Lexicon "x" --mode ADD
 
-table_tokenize Lexicon "x\\z" --mode GET
+table_tokenize Lexicon "x" --mode GET

  Modified: test/command/suite/tokenizers/regexp/get/end/three.expected (+1 -1)
===================================================================
--- test/command/suite/tokenizers/regexp/get/end/three.expected    2015-05-10 15:29:38 +0900 (d082e85)
+++ test/command/suite/tokenizers/regexp/get/end/three.expected    2015-05-10 17:14:35 +0900 (aaba665)
@@ -30,7 +30,7 @@ table_tokenize Lexicon "xyz" --mode ADD
     }
   ]
 ]
-table_tokenize Lexicon "xyz\\z" --mode GET
+table_tokenize Lexicon "xyz" --mode GET
 [
   [
     0,

  Modified: test/command/suite/tokenizers/regexp/get/end/three.test (+1 -1)
===================================================================
--- test/command/suite/tokenizers/regexp/get/end/three.test    2015-05-10 15:29:38 +0900 (510d69c)
+++ test/command/suite/tokenizers/regexp/get/end/three.test    2015-05-10 17:14:35 +0900 (8e225df)
@@ -2,4 +2,4 @@ table_create Lexicon TABLE_PAT_KEY ShortText \
   --default_tokenizer TokenRegexp
 table_tokenize Lexicon "xyz" --mode ADD
 
-table_tokenize Lexicon "xyz\\z" --mode GET
+table_tokenize Lexicon "xyz" --mode GET

  Modified: test/command/suite/tokenizers/regexp/get/end/two.expected (+1 -1)
===================================================================
--- test/command/suite/tokenizers/regexp/get/end/two.expected    2015-05-10 15:29:38 +0900 (40ed20b)
+++ test/command/suite/tokenizers/regexp/get/end/two.expected    2015-05-10 17:14:35 +0900 (3b94144)
@@ -26,5 +26,5 @@ table_tokenize Lexicon "xy" --mode ADD
     }
   ]
 ]
-table_tokenize Lexicon "xy\\z" --mode GET
+table_tokenize Lexicon "xy" --mode GET
 [[0,0.0,0.0],[{"value":"xy","position":0},{"value":"","position":2}]]

  Modified: test/command/suite/tokenizers/regexp/get/end/two.test (+1 -1)
===================================================================
--- test/command/suite/tokenizers/regexp/get/end/two.test    2015-05-10 15:29:38 +0900 (58b3e77)
+++ test/command/suite/tokenizers/regexp/get/end/two.test    2015-05-10 17:14:35 +0900 (3c9cc9b)
@@ -2,4 +2,4 @@ table_create Lexicon TABLE_PAT_KEY ShortText \
   --default_tokenizer TokenRegexp
 table_tokenize Lexicon "xy" --mode ADD
 
-table_tokenize Lexicon "xy\\z" --mode GET
+table_tokenize Lexicon "xy" --mode GET

  Deleted: test/command/suite/tokenizers/regexp/get/escape/one.expected (+0 -30) 100644
===================================================================
--- test/command/suite/tokenizers/regexp/get/escape/one.expected    2015-05-10 15:29:38 +0900 (f79eadc)
+++ /dev/null
@@ -1,30 +0,0 @@
-table_create Lexicon TABLE_PAT_KEY ShortText --default_tokenizer TokenRegexp
-[[0,0.0,0.0],true]
-table_tokenize Lexicon "[e" --mode ADD
-[
-  [
-    0,
-    0.0,
-    0.0
-  ],
-  [
-    {
-      "value": "",
-      "position": 0
-    },
-    {
-      "value": "[e",
-      "position": 1
-    },
-    {
-      "value": "e",
-      "position": 2
-    },
-    {
-      "value": "",
-      "position": 3
-    }
-  ]
-]
-table_tokenize Lexicon "\\[e" --mode GET
-[[0,0.0,0.0],[{"value":"[e","position":0}]]

  Deleted: test/command/suite/tokenizers/regexp/get/escape/one.test (+0 -5) 100644
===================================================================
--- test/command/suite/tokenizers/regexp/get/escape/one.test    2015-05-10 15:29:38 +0900 (d2e7562)
+++ /dev/null
@@ -1,5 +0,0 @@
-table_create Lexicon TABLE_PAT_KEY ShortText \
-  --default_tokenizer TokenRegexp
-table_tokenize Lexicon "[e" --mode ADD
-
-table_tokenize Lexicon "\\[e" --mode GET

  Deleted: test/command/suite/tokenizers/regexp/get/escape/two.expected (+0 -86) 100644
===================================================================
--- test/command/suite/tokenizers/regexp/get/escape/two.expected    2015-05-10 15:29:38 +0900 (2de6d20)
+++ /dev/null
@@ -1,86 +0,0 @@
-table_create Lexicon TABLE_PAT_KEY ShortText --default_tokenizer TokenRegexp
-[[0,0.0,0.0],true]
-table_tokenize Lexicon "c:\\server" --mode ADD
-[
-  [
-    0,
-    0.0,
-    0.0
-  ],
-  [
-    {
-      "value": "",
-      "position": 0
-    },
-    {
-      "value": "c:",
-      "position": 1
-    },
-    {
-      "value": ":\\",
-      "position": 2
-    },
-    {
-      "value": "\\s",
-      "position": 3
-    },
-    {
-      "value": "se",
-      "position": 4
-    },
-    {
-      "value": "er",
-      "position": 5
-    },
-    {
-      "value": "rv",
-      "position": 6
-    },
-    {
-      "value": "ve",
-      "position": 7
-    },
-    {
-      "value": "er",
-      "position": 8
-    },
-    {
-      "value": "r",
-      "position": 9
-    },
-    {
-      "value": "",
-      "position": 10
-    }
-  ]
-]
-table_tokenize Lexicon "c:\\\\server" --mode GET
-[
-  [
-    0,
-    0.0,
-    0.0
-  ],
-  [
-    {
-      "value": "c:",
-      "position": 0
-    },
-    {
-      "value": "\\s",
-      "position": 2
-    },
-    {
-      "value": "er",
-      "position": 4
-    },
-    {
-      "value": "ve",
-      "position": 6
-    },
-    {
-      "value": "er",
-      "position": 7
-    }
-  ]
-]

  Deleted: test/command/suite/tokenizers/regexp/get/escape/two.test (+0 -5) 100644
===================================================================
--- test/command/suite/tokenizers/regexp/get/escape/two.test    2015-05-10 15:29:38 +0900 (a2e47e7)
+++ /dev/null
@@ -1,5 +0,0 @@
-table_create Lexicon TABLE_PAT_KEY ShortText \
-  --default_tokenizer TokenRegexp
-table_tokenize Lexicon "c:\\server" --mode ADD
-
-table_tokenize Lexicon "c:\\\\server" --mode GET

  Deleted: test/command/suite/tokenizers/regexp/get/normalizer/blank/escape.expected (+0 -70) 100644
===================================================================
--- test/command/suite/tokenizers/regexp/get/normalizer/blank/escape.expected    2015-05-10 15:29:38 +0900 (791177e)
+++ /dev/null
@@ -1,70 +0,0 @@
-table_create Lexicon TABLE_PAT_KEY ShortText --default_tokenizer TokenRegexp --normalizer NormalizerAuto
-[[0,0.0,0.0],true]
-table_tokenize Lexicon "abc\ndef" --mode ADD
-[
-  [
-    0,
-    0.0,
-    0.0
-  ],
-  [
-    {
-      "value": "",
-      "position": 0
-    },
-    {
-      "value": "ab",
-      "position": 1
-    },
-    {
-      "value": "bc",
-      "position": 2
-    },
-    {
-      "value": "c",
-      "position": 3
-    },
-    {
-      "value": "de",
-      "position": 5
-    },
-    {
-      "value": "ef",
-      "position": 6
-    },
-    {
-      "value": "f",
-      "position": 7
-    },
-    {
-      "value": "",
-      "position": 8
-    }
-  ]
-]
-table_tokenize Lexicon "a\\bc\ndef" --mode GET
-[
-  [
-    0,
-    0.0,
-    0.0
-  ],
-  [
-    {
-      "value": "ab",
-      "position": 0
-    },
-    {
-      "value": "bc",
-      "position": 1
-    },
-    {
-      "value": "de",
-      "position": 3
-    },
-    {
-      "value": "ef",
-      "position": 4
-    }
-  ]
-]

  Deleted: test/command/suite/tokenizers/regexp/get/normalizer/blank/escape.test (+0 -6) 100644
===================================================================
--- test/command/suite/tokenizers/regexp/get/normalizer/blank/escape.test    2015-05-10 15:29:38 +0900 (e4772fc)
+++ /dev/null
@@ -1,6 +0,0 @@
-table_create Lexicon TABLE_PAT_KEY ShortText \
-  --default_tokenizer TokenRegexp \
-  --normalizer NormalizerAuto
-table_tokenize Lexicon "abc\ndef" --mode ADD
-
-table_tokenize Lexicon "a\\bc\ndef" --mode GET
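
For reference, a minimal sketch (not part of this commit) of how the new GRN_OP_REGEXP path in grn_ii_select() can be driven from C. It only shows the optarg wiring visible in the diff above; the ctx, ii, and result-hash variables are assumed to be set up elsewhere, so this is an illustrative fragment rather than a verbatim groonga caller:

  /* Select records whose indexed text matches "Hello" anchored at the
     beginning of the value.  With optarg->mode set to GRN_OP_REGEXP,
     grn_ii_select() dispatches to grn_ii_select_regexp(), which rewrites
     "\A"/"\z" into the tokenizer's begin/end marks and then runs a plain
     GRN_OP_MATCH select. */
  grn_select_optarg optarg;
  memset(&optarg, 0, sizeof(grn_select_optarg));
  optarg.mode = GRN_OP_REGEXP;
  grn_ii_select(ctx, ii,
                "\\AHello", strlen("\\AHello"),
                result, GRN_OP_OR, &optarg);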