naoa
null+****@clear*****
Sun Oct 26 17:24:38 JST 2014
naoa	2014-10-26 17:24:38 +0900 (Sun, 26 Oct 2014)

  New Revision: 8d7f275527588aeaa932666ef6df3cd18d82f41c
  https://github.com/groonga/groonga/commit/8d7f275527588aeaa932666ef6df3cd18d82f41c

  Merged ff11b0c: Merge pull request #232 from naoa/tokenize-support-token_filters

  Message:
    tokenize: support token filters

  Added files:
    test/command/suite/tokenize/with_token_filters.expected
    test/command/suite/tokenize/with_token_filters.test
  Modified files:
    lib/proc.c

  Modified: lib/proc.c (+35 -17)
===================================================================
--- lib/proc.c    2014-10-26 15:05:34 +0900 (b0b54fe)
+++ lib/proc.c    2014-10-26 17:24:38 +0900 (15948b6)
@@ -3400,7 +3400,7 @@ typedef struct {
 } tokenize_token;
 
 static void
-output_tokens(grn_ctx *ctx, grn_obj *tokens, grn_hash *lexicon)
+output_tokens(grn_ctx *ctx, grn_obj *tokens, grn_obj *lexicon)
 {
   int i, n_tokens;
 
@@ -3416,8 +3416,8 @@ output_tokens(grn_ctx *ctx, grn_obj *tokens, grn_hash *lexicon)
     GRN_OUTPUT_MAP_OPEN("TOKEN", 2);
 
     GRN_OUTPUT_CSTR("value");
-    value_size = grn_hash_get_key(ctx, lexicon, token->id,
-                                  value, GRN_TABLE_MAX_KEY_SIZE);
+    value_size = grn_table_get_key(ctx, lexicon, token->id,
+                                   value, GRN_TABLE_MAX_KEY_SIZE);
     GRN_OUTPUT_STR(value, value_size);
 
     GRN_OUTPUT_CSTR("position");
@@ -3428,12 +3428,13 @@ output_tokens(grn_ctx *ctx, grn_obj *tokens, grn_hash *lexicon)
   GRN_OUTPUT_ARRAY_CLOSE();
 }
 
-static grn_hash *
+static grn_obj *
 create_lexicon_for_tokenize(grn_ctx *ctx,
                             grn_obj *tokenizer_name,
-                            grn_obj *normalizer_name)
+                            grn_obj *normalizer_name,
+                            grn_obj *token_filter_names)
 {
-  grn_hash *lexicon;
+  grn_obj *lexicon;
   grn_obj *tokenizer;
   grn_obj *normalizer = NULL;
 
@@ -3489,28 +3490,32 @@ create_lexicon_for_tokenize(grn_ctx *ctx,
     }
   }
 
-  lexicon = grn_hash_create(ctx, NULL, GRN_TABLE_MAX_KEY_SIZE, 0,
-                            GRN_OBJ_TABLE_HASH_KEY | GRN_OBJ_KEY_VAR_SIZE);
-  grn_obj_set_info(ctx, (grn_obj *)lexicon,
+  lexicon = grn_table_create(ctx, NULL, 0,
+                             NULL,
+                             GRN_OBJ_TABLE_HASH_KEY,
+                             grn_ctx_at(ctx, GRN_DB_SHORT_TEXT),
+                             NULL);
+  grn_obj_set_info(ctx, lexicon,
                    GRN_INFO_DEFAULT_TOKENIZER, tokenizer);
   grn_obj_unlink(ctx, tokenizer);
   if (normalizer) {
-    grn_obj_set_info(ctx, (grn_obj *)lexicon,
+    grn_obj_set_info(ctx, lexicon,
                      GRN_INFO_NORMALIZER, normalizer);
     grn_obj_unlink(ctx, normalizer);
   }
+  proc_table_create_set_token_filters(ctx, lexicon, token_filter_names);
 
   return lexicon;
 }
 
 static void
-tokenize(grn_ctx *ctx, grn_hash *lexicon, grn_obj *string, grn_token_mode mode,
+tokenize(grn_ctx *ctx, grn_obj *lexicon, grn_obj *string, grn_token_mode mode,
          unsigned int flags, grn_obj *tokens)
 {
   grn_token_cursor *token_cursor;
 
   token_cursor =
-    grn_token_cursor_open(ctx, (grn_obj *)lexicon,
+    grn_token_cursor_open(ctx, lexicon,
                           GRN_TEXT_VALUE(string), GRN_TEXT_LEN(string),
                           mode, flags);
   if (!token_cursor) {
@@ -3539,12 +3544,16 @@ proc_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   grn_obj *normalizer_name;
   grn_obj *flag_names;
   grn_obj *mode_name;
+  grn_obj *token_filter_names;
+  grn_obj *table_name;
 
   tokenizer_name = VAR(0);
   string = VAR(1);
   normalizer_name = VAR(2);
   flag_names = VAR(3);
   mode_name = VAR(4);
+  token_filter_names = VAR(5);
+  table_name = VAR(6);
 
   if (GRN_TEXT_LEN(tokenizer_name) == 0) {
     ERR(GRN_INVALID_ARGUMENT, "[tokenize] tokenizer name is missing");
@@ -3558,14 +3567,21 @@ proc_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 
   {
     unsigned int flags;
-    grn_hash *lexicon;
+    grn_obj *lexicon;
 
     flags = parse_tokenize_flags(ctx, flag_names);
     if (ctx->rc != GRN_SUCCESS) {
      return NULL;
    }

-    lexicon = create_lexicon_for_tokenize(ctx, tokenizer_name, normalizer_name);
+    if (GRN_TEXT_LEN(table_name)) {
+      lexicon = grn_ctx_get(ctx, GRN_TEXT_VALUE(table_name), GRN_TEXT_LEN(table_name));
+    } else {
+      lexicon = create_lexicon_for_tokenize(ctx,
+                                            tokenizer_name,
+                                            normalizer_name,
+                                            token_filter_names);
+    }
     if (!lexicon) {
       return NULL;
     }
@@ -3584,7 +3600,7 @@ proc_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
     {
       grn_token_cursor *token_cursor;
       token_cursor =
-        grn_token_cursor_open(ctx, (grn_obj *)lexicon,
+        grn_token_cursor_open(ctx, lexicon,
                               GRN_TEXT_VALUE(string), GRN_TEXT_LEN(string),
                               GRN_TOKEN_ADD, flags);
       if (token_cursor) {
@@ -3608,7 +3624,7 @@ proc_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
     }
 #undef MODE_NAME_EQUAL
 
-    grn_hash_close(ctx, lexicon);
+    grn_obj_unlink(ctx, lexicon);
   }
 
   return NULL;
@@ -5482,7 +5498,9 @@ grn_db_init_builtin_query(grn_ctx *ctx)
   DEF_VAR(vars[2], "normalizer");
   DEF_VAR(vars[3], "flags");
   DEF_VAR(vars[4], "mode");
-  DEF_COMMAND("tokenize", proc_tokenize, 5, vars);
+  DEF_VAR(vars[5], "token_filters");
+  DEF_VAR(vars[6], "table");
+  DEF_COMMAND("tokenize", proc_tokenize, 7, vars);
 
   DEF_COMMAND("tokenizer_list", proc_tokenizer_list, 0, vars);
 

  Added: test/command/suite/tokenize/with_token_filters.expected (+37 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenize/with_token_filters.expected    2014-10-26 17:24:38 +0900 (87c7367)
@@ -0,0 +1,37 @@
+register token_filters/stop_word
+[[0,0.0,0.0],true]
+table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer TokenBigram --normalizer NormalizerAuto --token_filters TokenFilterStopWord
+[[0,0.0,0.0],true]
+column_create Terms is_stop_word COLUMN_SCALAR Bool
+[[0,0.0,0.0],true]
+load --table Terms
+[
+{"_key": "and", "is_stop_word": true}
+]
+[[0,0.0,0.0],1]
+tokenize TokenBigram "Hello and Good-bye" --mode GET --table Terms
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "hello",
+      "position": 0
+    },
+    {
+      "value": "good",
+      "position": 2
+    },
+    {
+      "value": "-",
+      "position": 3
+    },
+    {
+      "value": "bye",
+      "position": 4
+    }
+  ]
+]

  Added: test/command/suite/tokenize/with_token_filters.test (+14 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenize/with_token_filters.test    2014-10-26 17:24:38 +0900 (19a85a5)
@@ -0,0 +1,14 @@
+register token_filters/stop_word
+
+table_create Terms TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenBigram \
+  --normalizer NormalizerAuto \
+  --token_filters TokenFilterStopWord
+column_create Terms is_stop_word COLUMN_SCALAR Bool
+
+load --table Terms
+[
+{"_key": "and", "is_stop_word": true}
+]
+
+tokenize TokenBigram "Hello and Good-bye" --mode GET --table Terms
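Usage note (not part of the commit): this change adds two optional parameters to the tokenize command, token_filters (vars[5]) and table (vars[6]). With --table, tokenize reuses an existing lexicon table, including its registered token filters, instead of building a temporary one; with --token_filters, the named filters are attached to the temporary lexicon created by create_lexicon_for_tokenize. As a sketch, after the setup in with_token_filters.test above, any other string can be tokenized through the Terms lexicon and its stop-word filter in the same way:

  tokenize TokenBigram "Groonga and Mroonga" --mode GET --table Terms

As in the expected output above, TokenFilterStopWord should drop the token "and" in GET mode because Terms marks it with is_stop_word true. Filters that read data from the lexicon, like this one, are mainly useful together with --table: the temporary lexicon built for the plain --token_filters form has no columns such as is_stop_word.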