Kouhei Sutou
null+****@clear*****
Tue Mar 17 17:48:42 JST 2015
Kouhei Sutou 2015-03-17 17:48:42 +0900 (Tue, 17 Mar 2015) New Revision: 3036b42552215de4f613404cf055c9c5282a9b6d https://github.com/groonga/groonga/commit/3036b42552215de4f613404cf055c9c5282a9b6d Message: TokenRegexp: support escape Added files: test/command/suite/select/filter/index/regexp/escape.expected test/command/suite/select/filter/index/regexp/escape.test test/command/suite/tokenizers/regexp/get/escape/one.expected test/command/suite/tokenizers/regexp/get/escape/one.test test/command/suite/tokenizers/regexp/get/escape/two.expected test/command/suite/tokenizers/regexp/get/escape/two.test Modified files: lib/tokenizers.c Modified: lib/tokenizers.c (+29 -10) =================================================================== --- lib/tokenizers.c 2015-03-17 15:46:14 +0900 (ef1a0c5) +++ lib/tokenizers.c 2015-03-17 17:48:42 +0900 (1c07c52) @@ -481,6 +481,7 @@ typedef struct { grn_bool is_overlapping; const char *next; const char *end; + grn_obj buffer; } grn_regexp_tokenizer; static grn_obj * @@ -555,6 +556,8 @@ regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) } } + GRN_TEXT_INIT(&(tokenizer->buffer), 0); + return NULL; } @@ -566,10 +569,13 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) grn_regexp_tokenizer *tokenizer = user_data->ptr; unsigned int n_characters = 0; int ngram_unit = 2; - const char *start = tokenizer->next; - const char *current = start; + grn_obj *buffer = &(tokenizer->buffer); + const char *current = tokenizer->next; const char *end = tokenizer->end; grn_tokenize_mode mode = tokenizer->query->tokenize_mode; + grn_bool escaping = GRN_FALSE; + + GRN_BULK_REWIND(buffer); if (mode == GRN_TOKEN_GET) { if (tokenizer->get.have_begin) { @@ -620,17 +626,29 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) return NULL; } - n_characters++; - current += char_len; - tokenizer->next = current; - while (n_characters < ngram_unit) { + while (GRN_TRUE) { + if 
(!escaping && mode == GRN_TOKEN_GET && + char_len == 1 && current[0] == '\\') { + current += char_len; + escaping = GRN_TRUE; + } else { + n_characters++; + GRN_TEXT_PUT(ctx, buffer, current, char_len); + current += char_len; + escaping = GRN_FALSE; + if (n_characters == 1) { + tokenizer->next = current; + } + if (n_characters == ngram_unit) { + break; + } + } + char_len = grn_charlen_(ctx, (const char *)current, (const char *)end, tokenizer->query->encoding); if (char_len == 0) { break; } - n_characters++; - current += char_len; } if (tokenizer->is_overlapping) { @@ -654,8 +672,8 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) grn_tokenizer_token_push(ctx, &(tokenizer->token), - (const char *)start, - current - start, + GRN_TEXT_VALUE(buffer), + GRN_TEXT_LEN(buffer), status); return NULL; } @@ -669,6 +687,7 @@ regexp_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) } grn_tokenizer_token_fin(ctx, &(tokenizer->token)); grn_tokenizer_query_close(ctx, tokenizer->query); + GRN_OBJ_FIN(ctx, &(tokenizer->buffer)); GRN_FREE(tokenizer); return NULL; } Added: test/command/suite/select/filter/index/regexp/escape.expected (+54 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/filter/index/regexp/escape.expected 2015-03-17 17:48:42 +0900 (dd195d8) @@ -0,0 +1,54 @@ +table_create Logs TABLE_NO_KEY +[[0,0.0,0.0],true] +column_create Logs message COLUMN_SCALAR Text +[[0,0.0,0.0],true] +table_create RegexpLexicon TABLE_PAT_KEY ShortText --default_tokenizer TokenRegexp --normalizer NormalizerAuto +[[0,0.0,0.0],true] +column_create RegexpLexicon logs_message_index COLUMN_INDEX|WITH_POSITION Logs message +[[0,0.0,0.0],true] +load --table Logs +[ +{"message": "host1:[error]: No memory"}, +{"message": "host1:[warning]: Remained disk space is less than 30%"}, +{"message": "host1:[error]: Disk full"}, +{"message": "host2:[error]: No memory"}, +{"message": 
"host2:[info]: Shutdown"} +] +[[0,0.0,0.0],5] +select Logs --filter 'message @~ "\\\\[error\\\\]"' +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 3 + ], + [ + [ + "_id", + "UInt32" + ], + [ + "message", + "Text" + ] + ], + [ + 1, + "host1:[error]: No memory" + ], + [ + 3, + "host1:[error]: Disk full" + ], + [ + 4, + "host2:[error]: No memory" + ] + ] + ] +] Added: test/command/suite/select/filter/index/regexp/escape.test (+19 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/filter/index/regexp/escape.test 2015-03-17 17:48:42 +0900 (d50b732) @@ -0,0 +1,19 @@ +table_create Logs TABLE_NO_KEY +column_create Logs message COLUMN_SCALAR Text + +table_create RegexpLexicon TABLE_PAT_KEY ShortText \ + --default_tokenizer TokenRegexp \ + --normalizer NormalizerAuto +column_create RegexpLexicon logs_message_index \ + COLUMN_INDEX|WITH_POSITION Logs message + +load --table Logs +[ +{"message": "host1:[error]: No memory"}, +{"message": "host1:[warning]: Remained disk space is less than 30%"}, +{"message": "host1:[error]: Disk full"}, +{"message": "host2:[error]: No memory"}, +{"message": "host2:[info]: Shutdown"} +] + +select Logs --filter 'message @~ "\\\\[error\\\\]"' Added: test/command/suite/tokenizers/regexp/get/escape/one.expected (+30 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/regexp/get/escape/one.expected 2015-03-17 17:48:42 +0900 (f79eadc) @@ -0,0 +1,30 @@ +table_create Lexicon TABLE_PAT_KEY ShortText --default_tokenizer TokenRegexp +[[0,0.0,0.0],true] +table_tokenize Lexicon "[e" --mode ADD +[ + [ + 0, + 0.0, + 0.0 + ], + [ + { + "value": "", + "position": 0 + }, + { + "value": "[e", + "position": 1 + }, + { + "value": "e", + "position": 2 + }, + { + "value": "", + "position": 3 + } + ] +] +table_tokenize Lexicon "\\[e" --mode GET +[[0,0.0,0.0],[{"value":"[e","position":0}]] Added: 
test/command/suite/tokenizers/regexp/get/escape/one.test (+5 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/regexp/get/escape/one.test 2015-03-17 17:48:42 +0900 (d2e7562) @@ -0,0 +1,5 @@ +table_create Lexicon TABLE_PAT_KEY ShortText \ + --default_tokenizer TokenRegexp +table_tokenize Lexicon "[e" --mode ADD + +table_tokenize Lexicon "\\[e" --mode GET Added: test/command/suite/tokenizers/regexp/get/escape/two.expected (+98 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/regexp/get/escape/two.expected 2015-03-17 17:48:42 +0900 (47b7da8) @@ -0,0 +1,98 @@ +table_create Lexicon TABLE_PAT_KEY ShortText --default_tokenizer TokenRegexp +[[0,0.0,0.0],true] +table_tokenize Lexicon "c:\\server" --mode ADD +[ + [ + 0, + 0.0, + 0.0 + ], + [ + { + "value": "", + "position": 0 + }, + { + "value": "c:", + "position": 1 + }, + { + "value": ":\\", + "position": 2 + }, + { + "value": "\\s", + "position": 3 + }, + { + "value": "se", + "position": 4 + }, + { + "value": "er", + "position": 5 + }, + { + "value": "rv", + "position": 6 + }, + { + "value": "ve", + "position": 7 + }, + { + "value": "er", + "position": 8 + }, + { + "value": "r", + "position": 9 + }, + { + "value": "", + "position": 10 + } + ] +] +table_tokenize Lexicon "c:\\\\server" --mode GET +[ + [ + 0, + 0.0, + 0.0 + ], + [ + { + "value": "c:", + "position": 0 + }, + { + "value": ":\\", + "position": 1 + }, + { + "value": "\\s", + "position": 2 + }, + { + "value": "se", + "position": 3 + }, + { + "value": "er", + "position": 4 + }, + { + "value": "rv", + "position": 5 + }, + { + "value": "ve", + "position": 6 + }, + { + "value": "er", + "position": 7 + } + ] +] Added: test/command/suite/tokenizers/regexp/get/escape/two.test (+5 -0) 100644 =================================================================== --- /dev/null +++ 
test/command/suite/tokenizers/regexp/get/escape/two.test 2015-03-17 17:48:42 +0900 (a2e47e7) @@ -0,0 +1,5 @@ +table_create Lexicon TABLE_PAT_KEY ShortText \ + --default_tokenizer TokenRegexp +table_tokenize Lexicon "c:\\server" --mode ADD + +table_tokenize Lexicon "c:\\\\server" --mode GET -------------- next part -------------- An HTML attachment was scrubbed... Download