Kouhei Sutou
null+****@clear*****
Wed Sep 19 11:14:29 JST 2018
Kouhei Sutou 2018-09-19 11:14:29 +0900 (Wed, 19 Sep 2018) Revision: 1a9ef9e1bbdba660f712f3037b6c7b42708ef7b1 https://github.com/groonga/groonga/commit/1a9ef9e1bbdba660f712f3037b6c7b42708ef7b1 Message: TokenMecab: loose_reading -> use_reading Because we don't need original surfaces to search by reading. Copied files: test/command/suite/tokenizers/mecab/options/use_reading_add.expected (from test/command/suite/tokenizers/mecab/options/loose_reading_get.expected) Removed files: test/command/suite/tokenizers/mecab/options/loose_reading_add.expected Modified files: plugins/tokenizers/mecab.c Renamed files: test/command/suite/select/query/match/with_index/token_mecab/use_reading.expected (from test/command/suite/select/query/match/with_index/token_mecab/loose_reading.expected) test/command/suite/select/query/match/with_index/token_mecab/use_reading.test (from test/command/suite/select/query/match/with_index/token_mecab/loose_reading.test) test/command/suite/tokenizers/mecab/options/use_reading_add.test (from test/command/suite/tokenizers/mecab/options/loose_reading_add.test) test/command/suite/tokenizers/mecab/options/use_reading_get.expected (from test/command/suite/tokenizers/mecab/options/loose_reading_get.expected) test/command/suite/tokenizers/mecab/options/use_reading_get.test (from test/command/suite/tokenizers/mecab/options/loose_reading_get.test) Modified: plugins/tokenizers/mecab.c (+29 -94) =================================================================== --- plugins/tokenizers/mecab.c 2018-09-13 22:26:19 +0900 (487358adb) +++ plugins/tokenizers/mecab.c 2018-09-19 11:14:29 +0900 (7d022f05a) @@ -46,13 +46,15 @@ static grn_mecab mecab_wakati; static grn_bool grn_mecab_chunked_tokenize_enabled = GRN_FALSE; static int32_t grn_mecab_chunk_size_threshold = 8192; +static const size_t GRN_MECAB_FEATURE_LOCATION_READING = 7; + typedef struct { grn_bool chunked_tokenize; int32_t chunk_size_threshold; grn_bool include_class; grn_bool include_reading; grn_bool 
include_form; - grn_bool loose_reading; + grn_bool use_reading; } grn_mecab_tokenizer_options; typedef struct { @@ -63,13 +65,6 @@ typedef struct { const char *end; grn_tokenizer_query *query; grn_obj feature_locations; - struct { - grn_bool ing; - grn_bool need; - grn_bool need_end_mark; - grn_obj readings; - size_t offset; - } loose; } grn_mecab_tokenizer; static const char * @@ -155,7 +150,7 @@ mecab_tokenizer_options_init(grn_mecab_tokenizer_options *options) options->include_class = GRN_FALSE; options->include_reading = GRN_FALSE; options->include_form = GRN_FALSE; - options->loose_reading = GRN_FALSE; + options->use_reading = GRN_FALSE; } static grn_bool @@ -177,7 +172,7 @@ mecab_tokenizer_options_need_default_output(grn_mecab_tokenizer_options *options return GRN_TRUE; } - if (options->loose_reading) { + if (options->use_reading) { return GRN_TRUE; } @@ -238,12 +233,12 @@ mecab_tokenizer_options_open(grn_ctx *ctx, raw_options, i, options->include_form); - } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "loose_reading")) { - options->loose_reading = + } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "use_reading")) { + options->use_reading = grn_vector_get_element_bool(ctx, raw_options, i, - options->loose_reading); + options->use_reading); } } GRN_OPTION_VALUES_EACH_END(); @@ -722,29 +717,6 @@ mecab_next_default_format_consume_token(grn_ctx *ctx, tokenizer->next = current; mecab_next_default_format_skip_eos(ctx, tokenizer); - if (tokenizer->options->loose_reading) { - const char *reading = NULL; - size_t reading_length; - reading_length = mecab_get_feature(ctx, feature_locations, 7, &reading); - if (reading_length > 0) { - tokenizer->loose.need = GRN_TRUE; - tokenizer->loose.need_end_mark = GRN_TRUE; - grn_vector_add_element(ctx, - &(tokenizer->loose.readings), - reading, - reading_length, - 0, - GRN_DB_TEXT); - } else { - grn_vector_add_element(ctx, - &(tokenizer->loose.readings), - surface, - surface_length, - 0, - GRN_DB_TEXT); - } - } - if 
(surface_output) { *surface_output = surface; } @@ -759,54 +731,32 @@ mecab_next_default_format(grn_ctx *ctx, const char *surface; size_t surface_length = 0; - if (tokenizer->loose.ing && tokenizer->loose.need_end_mark) { - grn_tokenizer_status status = GRN_TOKEN_CONTINUE; - grn_token_set_data(ctx, - token, - GRN_TOKENIZER_END_MARK_UTF8, - GRN_TOKENIZER_END_MARK_UTF8_LEN); - grn_token_set_status(ctx, token, status); - tokenizer->loose.need_end_mark = GRN_FALSE; - return; - } - - if (tokenizer->loose.ing) { - grn_tokenizer_status status = GRN_TOKEN_CONTINUE; - const char *reading = NULL; - unsigned int reading_length; - - if (tokenizer->loose.offset + 1 == - grn_vector_size(ctx, &(tokenizer->loose.readings))) { - status = GRN_TOKEN_LAST; - } - reading_length = grn_vector_get_element(ctx, - &(tokenizer->loose.readings), - tokenizer->loose.offset, - &reading, - NULL, - NULL); - grn_token_set_data(ctx, token, reading, reading_length); - grn_token_set_status(ctx, token, status); - tokenizer->loose.offset++; - return; - } - surface_length = mecab_next_default_format_consume_token(ctx, tokenizer, &surface); - grn_token_set_data(ctx, token, surface, surface_length); + if (tokenizer->options->use_reading) { + grn_obj *feature_locations = &(tokenizer->feature_locations); + const char *reading = NULL; + size_t reading_length; + reading_length = mecab_get_feature(ctx, + feature_locations, + GRN_MECAB_FEATURE_LOCATION_READING, + &reading); + if (reading_length > 0) { + grn_token_set_data(ctx, token, reading, reading_length); + } else { + grn_token_set_data(ctx, token, surface, surface_length); + } + } else { + grn_token_set_data(ctx, token, surface, surface_length); + } { grn_tokenizer_status status; if (surface_length == 0) { /* Error */ status = GRN_TOKEN_LAST; } else if (tokenizer->next == tokenizer->end) { - if (tokenizer->loose.need) { - tokenizer->loose.ing = GRN_TRUE; - status = GRN_TOKEN_CONTINUE; - } else { - status = GRN_TOKEN_LAST; - } + status = GRN_TOKEN_LAST; } 
else { status = GRN_TOKEN_CONTINUE; } @@ -829,7 +779,10 @@ mecab_next_default_format(grn_ctx *ctx, data.feature_locations = &(tokenizer->feature_locations); data.ignore_empty_value = GRN_TRUE; data.ignore_asterisk_value = GRN_FALSE; - mecab_next_default_format_add_feature(ctx, &data, "reading", 7); + mecab_next_default_format_add_feature(ctx, + &data, + "reading", + GRN_MECAB_FEATURE_LOCATION_READING); } if (tokenizer->options->include_form) { add_feature_data data; @@ -1012,23 +965,6 @@ mecab_init(grn_ctx *ctx, grn_tokenizer_query *query) GRN_UINT64_INIT(&(tokenizer->feature_locations), GRN_OBJ_VECTOR); - tokenizer->loose.ing = GRN_FALSE; - tokenizer->loose.need = GRN_FALSE; - tokenizer->loose.need_end_mark = GRN_FALSE; - GRN_TEXT_INIT(&(tokenizer->loose.readings), GRN_OBJ_VECTOR); - tokenizer->loose.offset = 0; - - if (tokenizer->options->loose_reading && - grn_tokenizer_query_get_mode(ctx, tokenizer->query) == GRN_TOKEN_GET) { - while (tokenizer->next < tokenizer->end && - mecab_next_default_format_consume_token(ctx, tokenizer, NULL) > 0) { - /* Do nothing */ - } - tokenizer->loose.ing = GRN_TRUE; - tokenizer->loose.need = GRN_TRUE; - tokenizer->loose.need_end_mark = GRN_FALSE; - } - return tokenizer; } @@ -1079,7 +1015,6 @@ mecab_fin(grn_ctx *ctx, void *user_data) if (!tokenizer) { return; } - GRN_OBJ_FIN(ctx, &(tokenizer->loose.readings)); GRN_OBJ_FIN(ctx, &(tokenizer->feature_locations)); GRN_OBJ_FIN(ctx, &(tokenizer->buf)); GRN_PLUGIN_FREE(ctx, tokenizer); Renamed: test/command/suite/select/query/match/with_index/token_mecab/use_reading.expected (+1 -1) 87% =================================================================== --- test/command/suite/select/query/match/with_index/token_mecab/loose_reading.expected 2018-09-13 22:26:19 +0900 (3c8d1de6d) +++ test/command/suite/select/query/match/with_index/token_mecab/use_reading.expected 2018-09-19 11:14:29 +0900 (f43c868da) @@ -2,7 +2,7 @@ table_create Menus TABLE_NO_KEY [[0,0.0,0.0],true] column_create Menus 
name COLUMN_SCALAR Text [[0,0.0,0.0],true] -table_create Terms TABLE_PAT_KEY ShortText --normalize NormalizerNFKC100 --default_tokenizer 'TokenMecab("loose_reading", true)' +table_create Terms TABLE_PAT_KEY ShortText --normalize NormalizerNFKC100 --default_tokenizer 'TokenMecab("use_reading", true)' [[0,0.0,0.0],true] column_create Terms index COLUMN_INDEX|WITH_POSITION Menus name [[0,0.0,0.0],true] Renamed: test/command/suite/select/query/match/with_index/token_mecab/use_reading.test (+1 -1) 84% =================================================================== --- test/command/suite/select/query/match/with_index/token_mecab/loose_reading.test 2018-09-13 22:26:19 +0900 (40d5857d0) +++ test/command/suite/select/query/match/with_index/token_mecab/use_reading.test 2018-09-19 11:14:29 +0900 (4dfe3e38d) @@ -3,7 +3,7 @@ column_create Menus name COLUMN_SCALAR Text table_create Terms TABLE_PAT_KEY ShortText \ --normalize NormalizerNFKC100 \ - --default_tokenizer 'TokenMecab("loose_reading", true)' + --default_tokenizer 'TokenMecab("use_reading", true)' column_create Terms index COLUMN_INDEX|WITH_POSITION Menus name load --table Menus Deleted: test/command/suite/tokenizers/mecab/options/loose_reading_add.expected (+0 -65) 100644 =================================================================== --- test/command/suite/tokenizers/mecab/options/loose_reading_add.expected 2018-09-13 22:26:19 +0900 (ea667f08c) +++ /dev/null @@ -1,65 +0,0 @@ -tokenize 'TokenMecab("loose_reading", true)' '焼き肉と焼肉とyakiniku' -[ - [ - 0, - 0.0, - 0.0 - ], - [ - { - "value": "焼き肉", - "position": 0, - "force_prefix": false - }, - { - "value": "と", - "position": 1, - "force_prefix": false - }, - { - "value": "焼肉", - "position": 2, - "force_prefix": false - }, - { - "value": "と", - "position": 3, - "force_prefix": false - }, - { - "value": "yakiniku", - "position": 4, - "force_prefix": false - }, - { - "value": "", - "position": 5, - "force_prefix": false - }, - { - "value": "ヤキニク", - "position": 6, - 
"force_prefix": false - }, - { - "value": "ト", - "position": 7, - "force_prefix": false - }, - { - "value": "ヤキニク", - "position": 8, - "force_prefix": false - }, - { - "value": "ト", - "position": 9, - "force_prefix": false - }, - { - "value": "yakiniku", - "position": 10, - "force_prefix": false - } - ] -] Copied: test/command/suite/tokenizers/mecab/options/use_reading_add.expected (+1 -1) 83% =================================================================== --- test/command/suite/tokenizers/mecab/options/loose_reading_get.expected 2018-09-13 22:26:19 +0900 (40824322b) +++ test/command/suite/tokenizers/mecab/options/use_reading_add.expected 2018-09-19 11:14:29 +0900 (00f119724) @@ -1,4 +1,4 @@ -tokenize 'TokenMecab("loose_reading", true)' '焼き肉と焼肉とyakiniku' --mode GET +tokenize 'TokenMecab("use_reading", true)' '焼き肉と焼肉とyakiniku' [ [ 0, Renamed: test/command/suite/tokenizers/mecab/options/use_reading_add.test (+1 -1) 66% =================================================================== --- test/command/suite/tokenizers/mecab/options/loose_reading_add.test 2018-09-13 22:26:19 +0900 (194e23893) +++ test/command/suite/tokenizers/mecab/options/use_reading_add.test 2018-09-19 11:14:29 +0900 (fb27c64de) @@ -1,5 +1,5 @@ #@on-error omit tokenize \ - 'TokenMecab("loose_reading", true)' \ + 'TokenMecab("use_reading", true)' \ '焼き肉と焼肉とyakiniku' #@on-error default Renamed: test/command/suite/tokenizers/mecab/options/use_reading_get.expected (+1 -1) 83% =================================================================== --- test/command/suite/tokenizers/mecab/options/loose_reading_get.expected 2018-09-13 22:26:19 +0900 (40824322b) +++ test/command/suite/tokenizers/mecab/options/use_reading_get.expected 2018-09-19 11:14:29 +0900 (d4ba3d0fa) @@ -1,4 +1,4 @@ -tokenize 'TokenMecab("loose_reading", true)' '焼き肉と焼肉とyakiniku' --mode GET +tokenize 'TokenMecab("use_reading", true)' '焼き肉と焼肉とyakiniku' --mode GET [ [ 0, Renamed: 
test/command/suite/tokenizers/mecab/options/use_reading_get.test (+1 -1) 70% =================================================================== --- test/command/suite/tokenizers/mecab/options/loose_reading_get.test 2018-09-13 22:26:19 +0900 (3efcbdcfc) +++ test/command/suite/tokenizers/mecab/options/use_reading_get.test 2018-09-19 11:14:29 +0900 (e4d453e49) @@ -1,6 +1,6 @@ #@on-error omit tokenize \ - 'TokenMecab("loose_reading", true)' \ + 'TokenMecab("use_reading", true)' \ '焼き肉と焼肉とyakiniku' \ --mode GET #@on-error default -------------- next part -------------- HTMLの添付ファイルを保管しました... URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180919/91925a0e/attachment-0001.htm