Kouhei Sutou
null+****@clear*****
Mon Sep 10 17:55:28 JST 2018
Kouhei Sutou 2018-09-10 17:55:28 +0900 (Mon, 10 Sep 2018) Revision: 94e1c7685a5b0d1e6e4377d2f5f60b3fcce80d55 https://github.com/groonga/groonga/commit/94e1c7685a5b0d1e6e4377d2f5f60b3fcce80d55 Message: TokenMecab: add loose_reading option TODO: GET mode support Added files: test/command/suite/tokenizers/mecab/options/loose_reading_add.expected test/command/suite/tokenizers/mecab/options/loose_reading_add.test Modified files: plugins/tokenizers/mecab.c Modified: plugins/tokenizers/mecab.c (+112 -15) =================================================================== --- plugins/tokenizers/mecab.c 2018-09-10 17:55:01 +0900 (42e0c5177) +++ plugins/tokenizers/mecab.c 2018-09-10 17:55:28 +0900 (6ab3a1d3a) @@ -52,6 +52,7 @@ typedef struct { grn_bool include_class; grn_bool include_reading; grn_bool include_form; + grn_bool loose_reading; } grn_mecab_tokenizer_options; typedef struct { @@ -61,6 +62,13 @@ typedef struct { const char *next; const char *end; grn_tokenizer_query *query; + struct { + grn_bool ing; + grn_bool need; + grn_bool need_end_mark; + grn_obj readings; + size_t offset; + } loose; } grn_mecab_tokenizer; static const char * @@ -146,6 +154,7 @@ mecab_tokenizer_options_init(grn_mecab_tokenizer_options *options) options->include_class = GRN_FALSE; options->include_reading = GRN_FALSE; options->include_form = GRN_FALSE; + options->loose_reading = GRN_FALSE; } static grn_bool @@ -167,6 +176,10 @@ mecab_tokenizer_options_need_default_output(grn_mecab_tokenizer_options *options return GRN_TRUE; } + if (options->loose_reading) { + return GRN_TRUE; + } + return GRN_FALSE; } @@ -224,6 +237,12 @@ mecab_tokenizer_options_open(grn_ctx *ctx, raw_options, i, options->include_form); + } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "loose_reading")) { + options->loose_reading = + grn_vector_get_element_bool(ctx, + raw_options, + i, + options->loose_reading); } } GRN_OPTION_VALUES_EACH_END(); @@ -673,6 +692,12 @@ mecab_init(grn_ctx *ctx, grn_tokenizer_query *query) } } + 
tokenizer->loose.ing = GRN_FALSE; + tokenizer->loose.need = GRN_FALSE; + tokenizer->loose.need_end_mark = GRN_FALSE; + GRN_TEXT_INIT(&(tokenizer->loose.readings), GRN_OBJ_VECTOR); + tokenizer->loose.offset = 0; + return tokenizer; } @@ -703,6 +728,27 @@ typedef struct { grn_bool ignore_asterisk_value; } add_feature_data; +static size_t +mecab_get_feature(grn_ctx *ctx, + grn_obj *features, + size_t i, + const char **value) +{ + size_t n_locations = GRN_BULK_VSIZE(features) / sizeof(uint64_t); + const char *start; + const char *end; + + if (i + 2 > n_locations) { + *value = NULL; + return 0; + } + + start = (const char *)(GRN_UINT64_VALUE_AT(features, i)); + end = ((const char *)(GRN_UINT64_VALUE_AT(features, i + 1))) - 1; + *value = start; + return end - start; +} + static void mecab_next_default_format_add_feature(grn_ctx *ctx, add_feature_data *data, @@ -711,31 +757,22 @@ mecab_next_default_format_add_feature(grn_ctx *ctx, { grn_token *token = data->token; grn_obj *features = data->features; - size_t n_locations = GRN_BULK_VSIZE(features) / sizeof(uint64_t); - const char *feature_start; - const char *feature_end; + const char *feature = NULL; size_t feature_length; grn_obj value; - if (i + 2 > n_locations) { - return; - } - - feature_start = (const char *)(GRN_UINT64_VALUE_AT(features, i)); - feature_end = ((const char *)(GRN_UINT64_VALUE_AT(features, i + 1))) - 1; - feature_length = feature_end - feature_start; - + feature_length = mecab_get_feature(ctx, features, i, &feature); if (data->ignore_empty_value && feature_length == 0) { return; } if (data->ignore_asterisk_value && feature_length == 1 && - feature_start[0] == '*') { + feature[0] == '*') { return; } GRN_TEXT_INIT(&value, GRN_OBJ_DO_SHALLOW_COPY); - GRN_TEXT_SET(ctx, &value, feature_start, feature_length); + GRN_TEXT_SET(ctx, &value, feature, feature_length); grn_token_metadata_add(ctx, grn_token_get_metadata(ctx, token), name, @@ -758,6 +795,38 @@ mecab_next_default_format(grn_ctx *ctx, size_t 
surface_length = 0; grn_obj features; + if (tokenizer->loose.ing && tokenizer->loose.need_end_mark) { + grn_tokenizer_status status = GRN_TOKEN_CONTINUE; + grn_token_set_data(ctx, + token, + GRN_TOKENIZER_END_MARK_UTF8, + GRN_TOKENIZER_END_MARK_UTF8_LEN); + grn_token_set_status(ctx, token, status); + tokenizer->loose.need_end_mark = GRN_FALSE; + return; + } + + if (tokenizer->loose.ing) { + grn_tokenizer_status status = GRN_TOKEN_CONTINUE; + const char *reading = NULL; + unsigned int reading_length; + + if (tokenizer->loose.offset + 1 == + grn_vector_size(ctx, &(tokenizer->loose.readings))) { + status = GRN_TOKEN_LAST; + } + reading_length = grn_vector_get_element(ctx, + &(tokenizer->loose.readings), + tokenizer->loose.offset, + &reading, + NULL, + NULL); + grn_token_set_data(ctx, token, reading, reading_length); + grn_token_set_status(ctx, token, status); + tokenizer->loose.offset++; + return; + } + mecab_next_default_format_skip_eos(ctx, tokenizer); start = surface = tokenizer->next; GRN_UINT64_INIT(&features, GRN_OBJ_VECTOR); @@ -812,7 +881,12 @@ mecab_next_default_format(grn_ctx *ctx, { grn_tokenizer_status status; if (current == end || tokenizer->next == end) { - status = GRN_TOKEN_LAST; + if (tokenizer->loose.need) { + tokenizer->loose.ing = GRN_TRUE; + status = GRN_TOKEN_CONTINUE; + } else { + status = GRN_TOKEN_LAST; + } } else { status = GRN_TOKEN_CONTINUE; } @@ -847,6 +921,28 @@ mecab_next_default_format(grn_ctx *ctx, mecab_next_default_format_add_feature(ctx, &data, "inflected_form", 5); mecab_next_default_format_add_feature(ctx, &data, "base_form", 6); } + if (tokenizer->options->loose_reading) { + const char *reading = NULL; + size_t reading_length; + reading_length = mecab_get_feature(ctx, &features, 7, &reading); + if (reading_length > 0) { + tokenizer->loose.need = GRN_TRUE; + tokenizer->loose.need_end_mark = GRN_TRUE; + grn_vector_add_element(ctx, + &(tokenizer->loose.readings), + reading, + reading_length, + 0, + GRN_DB_TEXT); + } else { + 
grn_vector_add_element(ctx, + &(tokenizer->loose.readings), + surface, + surface_length, + 0, + GRN_DB_TEXT); + } + } GRN_OBJ_FIN(ctx, &features); } @@ -943,7 +1039,8 @@ mecab_fin(grn_ctx *ctx, void *user_data) if (!tokenizer) { return; } - grn_obj_unlink(ctx, &(tokenizer->buf)); + GRN_OBJ_FIN(ctx, &(tokenizer->loose.readings)); + GRN_OBJ_FIN(ctx, &(tokenizer->buf)); GRN_PLUGIN_FREE(ctx, tokenizer); } Added: test/command/suite/tokenizers/mecab/options/loose_reading_add.expected (+65 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/mecab/options/loose_reading_add.expected 2018-09-10 17:55:28 +0900 (7a0bba241) @@ -0,0 +1,65 @@ +tokenize 'TokenMecab("loose_reading", true)' '焼き肉と焼きにくとyakiniku' +[ + [ + 0, + 0.0, + 0.0 + ], + [ + { + "value": "焼き肉", + "position": 0, + "force_prefix": false + }, + { + "value": "と", + "position": 1, + "force_prefix": false + }, + { + "value": "焼きにく", + "position": 2, + "force_prefix": false + }, + { + "value": "と", + "position": 3, + "force_prefix": false + }, + { + "value": "yakiniku", + "position": 4, + "force_prefix": false + }, + { + "value": "", + "position": 5, + "force_prefix": false + }, + { + "value": "ヤキニク", + "position": 6, + "force_prefix": false + }, + { + "value": "ト", + "position": 7, + "force_prefix": false + }, + { + "value": "ヤキニク", + "position": 8, + "force_prefix": false + }, + { + "value": "ト", + "position": 9, + "force_prefix": false + }, + { + "value": "yakiniku", + "position": 10, + "force_prefix": false + } + ] +] Added: test/command/suite/tokenizers/mecab/options/loose_reading_add.test (+5 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/mecab/options/loose_reading_add.test 2018-09-10 17:55:28 +0900 (9250edb4d) @@ -0,0 +1,5 @@ +#@on-error omit +tokenize \ + 'TokenMecab("loose_reading", true)' \ + '焼き肉と焼きにくとyakiniku' +#@on-error default -------------- next part 
-------------- HTMLの添付ファイルを保管しました... URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180910/572a3ad1/attachment-0001.htm