naoa
null+****@clear*****
Sun Oct 26 20:00:49 JST 2014
naoa	2014-10-26 20:00:49 +0900 (Sun, 26 Oct 2014)

  New Revision: 0b54c748fd84f59c7cee3f8443b63d7c2ee608b5
  https://github.com/groonga/groonga/commit/0b54c748fd84f59c7cee3f8443b63d7c2ee608b5

  Merged ff11b0c: Merge pull request #232 from naoa/tokenize-support-token_filters

  Message:
    Add table_tokenize command

  Added files:
    test/command/suite/table_tokenize/add_mode.expected
    test/command/suite/table_tokenize/add_mode.test
    test/command/suite/table_tokenize/flags.expected
    test/command/suite/table_tokenize/flags.test
    test/command/suite/table_tokenize/get_mode.expected
    test/command/suite/table_tokenize/get_mode.test
    test/command/suite/table_tokenize/with_normalizer.expected
    test/command/suite/table_tokenize/with_normalizer.test
    test/command/suite/table_tokenize/with_token_filters.expected
    test/command/suite/table_tokenize/with_token_filters.test
  Modified files:
    lib/proc.c

  Modified: lib/proc.c (+102 -26)
===================================================================
--- lib/proc.c    2014-10-26 18:27:34 +0900 (74b85d6)
+++ lib/proc.c    2014-10-26 20:00:49 +0900 (cab6995)
@@ -3536,6 +3536,42 @@ tokenize(grn_ctx *ctx, grn_obj *lexicon, grn_obj *string, grn_token_mode mode,
   grn_token_cursor_close(ctx, token_cursor);
 }
 
+static void
+add_tokenize(grn_ctx *ctx, grn_obj *lexicon, grn_obj *string, unsigned int flags)
+{
+  grn_obj tokens;
+  GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
+  tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
+  output_tokens(ctx, &tokens, lexicon);
+  GRN_OBJ_FIN(ctx, &tokens);
+}
+
+static void
+get_tokenize(grn_ctx *ctx, grn_obj *lexicon, grn_obj *string, unsigned int flags)
+{
+  {
+    grn_token_cursor *token_cursor;
+    token_cursor =
+      grn_token_cursor_open(ctx, lexicon,
+                            GRN_TEXT_VALUE(string), GRN_TEXT_LEN(string),
+                            GRN_TOKEN_ADD, flags);
+    if (token_cursor) {
+      while (token_cursor->status == GRN_TOKEN_DOING) {
+        grn_token_cursor_next(ctx, token_cursor);
+      }
+      grn_token_cursor_close(ctx, token_cursor);
+    }
+  }
+
+  {
+    grn_obj tokens;
+    GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
+    tokenize(ctx, lexicon, string, GRN_TOKEN_GET, flags, &tokens);
+    output_tokens(ctx, &tokens, lexicon);
+    GRN_OBJ_FIN(ctx, &tokens);
+  }
+}
+
 static grn_obj *
 proc_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 {
@@ -3585,33 +3621,9 @@ proc_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   memcmp(GRN_TEXT_VALUE(mode_name), name, strlen(name)) == 0)
 
     if (GRN_TEXT_LEN(mode_name) == 0 || MODE_NAME_EQUAL("ADD")) {
-      grn_obj tokens;
-      GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
-      tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
-      output_tokens(ctx, &tokens, lexicon);
-      GRN_OBJ_FIN(ctx, &tokens);
+      add_tokenize(ctx, lexicon, string, flags);
     } else if (MODE_NAME_EQUAL("GET")) {
-      {
-        grn_token_cursor *token_cursor;
-        token_cursor =
-          grn_token_cursor_open(ctx, lexicon,
-                                GRN_TEXT_VALUE(string), GRN_TEXT_LEN(string),
-                                GRN_TOKEN_ADD, flags);
-        if (token_cursor) {
-          while (token_cursor->status == GRN_TOKEN_DOING) {
-            grn_token_cursor_next(ctx, token_cursor);
-          }
-          grn_token_cursor_close(ctx, token_cursor);
-        }
-      }
-
-      {
-        grn_obj tokens;
-        GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
-        tokenize(ctx, lexicon, string, GRN_TOKEN_GET, flags, &tokens);
-        output_tokens(ctx, &tokens, lexicon);
-        GRN_OBJ_FIN(ctx, &tokens);
-      }
+      get_tokenize(ctx, lexicon, string, flags);
     } else {
       ERR(GRN_INVALID_ARGUMENT, "[tokenize] invalid mode: <%.*s>",
           (int)GRN_TEXT_LEN(mode_name),
          GRN_TEXT_VALUE(mode_name));
@@ -3624,6 +3636,64 @@ proc_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   return NULL;
 }
 
+static grn_obj *
+proc_table_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
+{
+  grn_obj *table_name;
+  grn_obj *string;
+  grn_obj *flag_names;
+  grn_obj *mode_name;
+
+  table_name = VAR(0);
+  string = VAR(1);
+  flag_names = VAR(2);
+  mode_name = VAR(3);
+
+  if (GRN_TEXT_LEN(table_name) == 0) {
+    ERR(GRN_INVALID_ARGUMENT, "[table_tokenize] table name is missing");
+    return NULL;
+  }
+
+  if (GRN_TEXT_LEN(string) == 0) {
+    ERR(GRN_INVALID_ARGUMENT, "[table_tokenize] string is missing");
+    return NULL;
+  }
+
+  {
+    unsigned int flags;
+    grn_obj *lexicon;
+
+    flags = parse_tokenize_flags(ctx, flag_names);
+    if (ctx->rc != GRN_SUCCESS) {
+      return NULL;
+    }
+
+    lexicon = grn_ctx_get(ctx, GRN_TEXT_VALUE(table_name), GRN_TEXT_LEN(table_name));
+
+    if (!lexicon) {
+      return NULL;
+    }
+
+#define MODE_NAME_EQUAL(name)\
+  (GRN_TEXT_LEN(mode_name) == strlen(name) &&\
+   memcmp(GRN_TEXT_VALUE(mode_name), name, strlen(name)) == 0)
+
+    if (GRN_TEXT_LEN(mode_name) == 0 || MODE_NAME_EQUAL("ADD")) {
+      add_tokenize(ctx, lexicon, string, flags);
+    } else if (MODE_NAME_EQUAL("GET")) {
+      get_tokenize(ctx, lexicon, string, flags);
+    } else {
+      ERR(GRN_INVALID_ARGUMENT, "[table_tokenize] invalid mode: <%.*s>",
+          (int)GRN_TEXT_LEN(mode_name), GRN_TEXT_VALUE(mode_name));
+    }
+#undef MODE_NAME_EQUAL
+
+    grn_obj_unlink(ctx, lexicon);
+  }
+
+  return NULL;
+}
+
 static void
 list_proc(grn_ctx *ctx, grn_proc_type target_proc_type,
           const char *name, const char *plural_name)
@@ -5495,6 +5565,12 @@ grn_db_init_builtin_query(grn_ctx *ctx)
   DEF_VAR(vars[5], "token_filters");
   DEF_COMMAND("tokenize", proc_tokenize, 6, vars);
 
+  DEF_VAR(vars[0], "table");
+  DEF_VAR(vars[1], "string");
+  DEF_VAR(vars[2], "flags");
+  DEF_VAR(vars[3], "mode");
+  DEF_COMMAND("table_tokenize", proc_table_tokenize, 4, vars);
+
   DEF_COMMAND("tokenizer_list", proc_tokenizer_list, 0, vars);
 
   DEF_COMMAND("normalizer_list", proc_normalizer_list, 0, vars);

  Added: test/command/suite/table_tokenize/add_mode.expected (+32 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_tokenize/add_mode.expected    2014-10-26 20:00:49 +0900 (001b9b2)
@@ -0,0 +1,32 @@
+table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer TokenBigram --normalizer NormalizerAuto
+[[0,0.0,0.0],true]
+table_tokenize Terms "あいabアイ" --mode ADD
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "あい",
+      "position": 0
+    },
+    {
+      "value": "い",
+      "position": 1
+    },
+    {
+      "value": "ab",
+      "position": 2
+    },
+    {
+      "value": "アイ",
+      "position": 3
+    },
+    {
+      "value": "イ",
+      "position": 4
+    }
+  ]
+]

  Added: test/command/suite/table_tokenize/add_mode.test (+5 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_tokenize/add_mode.test    2014-10-26 20:00:49 +0900 (8de0873)
@@ -0,0 +1,5 @@
+table_create Terms TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenBigram \
+  --normalizer NormalizerAuto
+
+table_tokenize Terms "あいabアイ" --mode ADD

  Added: test/command/suite/table_tokenize/flags.expected (+24 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_tokenize/flags.expected    2014-10-26 20:00:49 +0900 (20bb1c6)
@@ -0,0 +1,24 @@
+table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer TokenDelimit --normalizer NormalizerAuto
+[[0,0.0,0.0],true]
+table_tokenize Terms "aBcDe 123" ENABLE_TOKENIZED_DELIMITER
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "ab",
+      "position": 0
+    },
+    {
+      "value": "cde 1",
+      "position": 1
+    },
+    {
+      "value": "23",
+      "position": 2
+    }
+  ]
+]

  Added: test/command/suite/table_tokenize/flags.test (+5 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_tokenize/flags.test    2014-10-26 20:00:49 +0900 (e5da57a)
@@ -0,0 +1,5 @@
+table_create Terms TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenDelimit \
+  --normalizer NormalizerAuto
+
+table_tokenize Terms "aBcDe 123" ENABLE_TOKENIZED_DELIMITER

  Added: test/command/suite/table_tokenize/get_mode.expected (+24 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_tokenize/get_mode.expected    2014-10-26 20:00:49 +0900 (774c69d)
@@ -0,0 +1,24 @@
+table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer TokenBigram --normalizer NormalizerAuto
+[[0,0.0,0.0],true]
+table_tokenize Terms "あいabアイ" --mode GET
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "あい",
+      "position": 0
+    },
+    {
+      "value": "ab",
+      "position": 2
+    },
+    {
+      "value": "アイ",
+      "position": 3
+    }
+  ]
+]

  Added: test/command/suite/table_tokenize/get_mode.test (+5 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_tokenize/get_mode.test    2014-10-26 20:00:49 +0900 (529c395)
@@ -0,0 +1,5 @@
+table_create Terms TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenBigram \
+  --normalizer NormalizerAuto
+
+table_tokenize Terms "あいabアイ" --mode GET

  Added: test/command/suite/table_tokenize/with_normalizer.expected (+4 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_tokenize/with_normalizer.expected    2014-10-26 20:00:49 +0900 (ac20b02)
@@ -0,0 +1,4 @@
+table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer TokenBigram --normalizer NormalizerAuto
+[[0,0.0,0.0],true]
+table_tokenize Terms "aBcDe 123"
+[[0,0.0,0.0],[{"value":"abcde","position":0},{"value":"123","position":1}]]

  Added: test/command/suite/table_tokenize/with_normalizer.test (+5 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_tokenize/with_normalizer.test    2014-10-26 20:00:49 +0900 (3cda398)
@@ -0,0 +1,5 @@
+table_create Terms TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenBigram \
+  --normalizer NormalizerAuto
+
+table_tokenize Terms "aBcDe 123"

  Added: test/command/suite/table_tokenize/with_token_filters.expected (+37 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_tokenize/with_token_filters.expected    2014-10-26 20:00:49 +0900 (7cfcda0)
@@ -0,0 +1,37 @@
+register token_filters/stop_word
+[[0,0.0,0.0],true]
+table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer TokenBigram --normalizer NormalizerAuto --token_filters TokenFilterStopWord
+[[0,0.0,0.0],true]
+column_create Terms is_stop_word COLUMN_SCALAR Bool
+[[0,0.0,0.0],true]
+load --table Terms
+[
+{"_key": "and", "is_stop_word": true}
+]
+[[0,0.0,0.0],1]
+table_tokenize Terms "Hello and Good-bye" --mode GET
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "hello",
+      "position": 0
+    },
+    {
+      "value": "good",
+      "position": 2
+    },
+    {
+      "value": "-",
+      "position": 3
+    },
+    {
+      "value": "bye",
+      "position": 4
+    }
+  ]
+]
  Added: test/command/suite/table_tokenize/with_token_filters.test (+14 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_tokenize/with_token_filters.test    2014-10-26 20:00:49 +0900 (80c5adb)
@@ -0,0 +1,14 @@
+register token_filters/stop_word
+
+table_create Terms TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenBigram \
+  --normalizer NormalizerAuto \
+  --token_filters TokenFilterStopWord
+column_create Terms is_stop_word COLUMN_SCALAR Bool
+
+load --table Terms
+[
+{"_key": "and", "is_stop_word": true}
+]
+
+table_tokenize Terms "Hello and Good-bye" --mode GET
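
For readers skimming the diff: table_tokenize is a sibling of the existing tokenize
command, but instead of a tokenizer name it takes an existing lexicon table, so the
table's default tokenizer, normalizer, and token filters are applied as configured on
that table. A minimal session, mirroring the with_normalizer test above (nothing here
goes beyond what the added tests already exercise), would look roughly like this:

  table_create Terms TABLE_PAT_KEY ShortText \
    --default_tokenizer TokenBigram \
    --normalizer NormalizerAuto
  table_tokenize Terms "aBcDe 123"

The expected response is
[[0,0.0,0.0],[{"value":"abcde","position":0},{"value":"123","position":1}]].
When --mode is omitted it defaults to ADD, which registers the produced tokens into the
table; --mode GET only resolves tokens already present in the lexicon, which appears to
be why get_tokenize() in the patch first runs an ADD pass over the string before
tokenizing in GET mode.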