Kouhei Sutou
null+****@clear*****
Tue Sep 11 11:06:49 JST 2018
Kouhei Sutou 2018-09-11 11:06:49 +0900 (Tue, 11 Sep 2018) Revision: 29e90749d9ecc4726db4246194d8dc5ff47510aa https://github.com/groonga/groonga/commit/29e90749d9ecc4726db4246194d8dc5ff47510aa Message: TokenMecab: add support for GET mode in loose_reading Added files: test/command/suite/tokenizers/mecab/options/loose_reading_get.expected test/command/suite/tokenizers/mecab/options/loose_reading_get.test Modified files: plugins/tokenizers/mecab.c Modified: plugins/tokenizers/mecab.c (+244 -207) =================================================================== --- plugins/tokenizers/mecab.c 2018-09-11 10:04:33 +0900 (6ab3a1d3a) +++ plugins/tokenizers/mecab.c 2018-09-11 11:06:49 +0900 (ba67d6ece) @@ -62,6 +62,7 @@ typedef struct { const char *next; const char *end; grn_tokenizer_query *query; + grn_obj feature_locations; struct { grn_bool ing; grn_bool need; @@ -570,137 +571,6 @@ mecab_init_mecab(grn_ctx *ctx, grn_mecab_tokenizer *tokenizer) } } -/* - This function is called for a full text search query or a document to be - indexed. This means that both short/long strings are given. - The return value of this function is ignored. When an error occurs in this - function, `ctx->rc' is overwritten with an error code (not GRN_SUCCESS). - */ -static void * -mecab_init(grn_ctx *ctx, grn_tokenizer_query *query) -{ - grn_obj *lexicon; - grn_mecab_tokenizer *tokenizer; - - lexicon = grn_tokenizer_query_get_lexicon(ctx, query); - - if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) { - GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE, - "[tokenizer][mecab] " - "memory allocation to grn_mecab_tokenizer failed"); - return NULL; - } - - tokenizer->options = - grn_table_cache_default_tokenizer_options(ctx, - lexicon, - mecab_tokenizer_options_open, - mecab_tokenizer_options_close, - NULL); - if (ctx->rc != GRN_SUCCESS) { - GRN_PLUGIN_FREE(ctx, tokenizer); - return NULL; - } - - mecab_init_mecab(ctx, tokenizer); - if (!tokenizer->mecab->mecab) { - GRN_PLUGIN_FREE(ctx, tokenizer); - return NULL; - } - - { - grn_encoding encoding = grn_tokenizer_query_get_encoding(ctx, query); - if (encoding != tokenizer->mecab->encoding) { - GRN_PLUGIN_FREE(ctx, tokenizer); - GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, - "[tokenizer][mecab] " - "MeCab dictionary charset (%s) does not match " - "the table encoding: <%s>", - grn_encoding_to_string(tokenizer->mecab->encoding), - grn_encoding_to_string(encoding)); - return NULL; - } - } - - tokenizer->query = query; - - { - grn_obj *string; - const char *normalized_string; - unsigned int normalized_string_length; - - string = grn_tokenizer_query_get_normalized_string(ctx, query); - grn_string_get_normalized(ctx, - string, - &normalized_string, - &normalized_string_length, - NULL); - GRN_TEXT_INIT(&(tokenizer->buf), 0); - if (grn_tokenizer_query_have_tokenized_delimiter(ctx, query)) { - tokenizer->next = normalized_string; - tokenizer->end = tokenizer->next + normalized_string_length; - } else if (normalized_string_length == 0) { - tokenizer->next = ""; - tokenizer->end = tokenizer->next; - } else { - grn_bool succeeded; - grn_plugin_mutex_lock(ctx, tokenizer->mecab->mutex); - if (tokenizer->options->chunked_tokenize && - ctx->encoding == GRN_ENC_UTF8) { - succeeded = chunked_tokenize_utf8(ctx, - tokenizer, - normalized_string, - normalized_string_length); - } else { - const char *s; - s = mecab_sparse_tostr2(tokenizer->mecab->mecab, - normalized_string, - normalized_string_length); - if (!s) { - succeeded = GRN_FALSE; - GRN_PLUGIN_ERROR(ctx, 
GRN_TOKENIZER_ERROR, - "[tokenizer][mecab] " - "mecab_sparse_tostr() failed len=%d err=%s", - normalized_string_length, - mecab_strerror(tokenizer->mecab->mecab)); - } else { - succeeded = GRN_TRUE; - GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s); - } - } - grn_plugin_mutex_unlock(ctx, tokenizer->mecab->mutex); - if (!succeeded) { - GRN_PLUGIN_FREE(ctx, tokenizer); - return NULL; - } - if (mecab_tokenizer_options_need_default_output(tokenizer->options)) { - tokenizer->next = GRN_TEXT_VALUE(&(tokenizer->buf)); - tokenizer->end = tokenizer->next + GRN_TEXT_LEN(&(tokenizer->buf)); - } else { - char *buf, *p; - unsigned int bufsize; - - buf = GRN_TEXT_VALUE(&(tokenizer->buf)); - bufsize = GRN_TEXT_LEN(&(tokenizer->buf)); - /* A certain version of mecab returns trailing lf or spaces. */ - for (p = buf + bufsize - 2; - buf <= p && isspace(*(unsigned char *)p); - p--) { *p = '\0'; } - tokenizer->next = buf; - tokenizer->end = p + 1; - } - } - } - - tokenizer->loose.ing = GRN_FALSE; - tokenizer->loose.need = GRN_FALSE; - tokenizer->loose.need_end_mark = GRN_FALSE; - GRN_TEXT_INIT(&(tokenizer->loose.readings), GRN_OBJ_VECTOR); - tokenizer->loose.offset = 0; - - return tokenizer; -} - static void mecab_next_default_format_skip_eos(grn_ctx *ctx, grn_mecab_tokenizer *tokenizer) @@ -723,18 +593,18 @@ mecab_next_default_format_skip_eos(grn_ctx *ctx, typedef struct { grn_token *token; - grn_obj *features; + grn_obj *feature_locations; grn_bool ignore_empty_value; grn_bool ignore_asterisk_value; } add_feature_data; static size_t mecab_get_feature(grn_ctx *ctx, - grn_obj *features, + grn_obj *feature_locations, size_t i, const char **value) { - size_t n_locations = GRN_BULK_VSIZE(features) / sizeof(uint64_t); + size_t n_locations = GRN_BULK_VSIZE(feature_locations) / sizeof(uint64_t); const char *start; const char *end; @@ -743,8 +613,8 @@ mecab_get_feature(grn_ctx *ctx, return 0; } - start = (const char *)(GRN_UINT64_VALUE_AT(features, i)); - end = ((const char *)(GRN_UINT64_VALUE_AT(features, i + 1))) - 1; + start = (const char *)(GRN_UINT64_VALUE_AT(feature_locations, i)); + end = ((const char *)(GRN_UINT64_VALUE_AT(feature_locations, i + 1))) - 1; *value = start; return end - start; } @@ -756,12 +626,12 @@ mecab_next_default_format_add_feature(grn_ctx *ctx, size_t i) { grn_token *token = data->token; - grn_obj *features = data->features; + grn_obj *feature_locations = data->feature_locations; const char *feature = NULL; size_t feature_length; grn_obj value; - feature_length = mecab_get_feature(ctx, features, i, &feature); + feature_length = mecab_get_feature(ctx, feature_locations, i, &feature); if (data->ignore_empty_value && feature_length == 0) { return; } @@ -781,59 +651,34 @@ mecab_next_default_format_add_feature(grn_ctx *ctx, GRN_OBJ_FIN(ctx, &value); } -static void -mecab_next_default_format(grn_ctx *ctx, - grn_mecab_tokenizer *tokenizer, - grn_token *token) +static size_t +mecab_next_default_format_consume_token(grn_ctx *ctx, + grn_mecab_tokenizer *tokenizer, + const char **surface_output) { grn_encoding encoding = tokenizer->query->encoding; + grn_obj *feature_locations = &(tokenizer->feature_locations); const char *start; const char *current; const char *end = tokenizer->end; + const char *surface = NULL; int length = 0; - const char *surface; size_t surface_length = 0; - grn_obj features; - if (tokenizer->loose.ing && tokenizer->loose.need_end_mark) { - grn_tokenizer_status status = GRN_TOKEN_CONTINUE; - grn_token_set_data(ctx, - token, - GRN_TOKENIZER_END_MARK_UTF8, - 
GRN_TOKENIZER_END_MARK_UTF8_LEN); - grn_token_set_status(ctx, token, status); - tokenizer->loose.need_end_mark = GRN_FALSE; - return; - } - - if (tokenizer->loose.ing) { - grn_tokenizer_status status = GRN_TOKEN_CONTINUE; - const char *reading = NULL; - unsigned int reading_length; - - if (tokenizer->loose.offset + 1 == - grn_vector_size(ctx, &(tokenizer->loose.readings))) { - status = GRN_TOKEN_LAST; - } - reading_length = grn_vector_get_element(ctx, - &(tokenizer->loose.readings), - tokenizer->loose.offset, - &reading, - NULL, - NULL); - grn_token_set_data(ctx, token, reading, reading_length); - grn_token_set_status(ctx, token, status); - tokenizer->loose.offset++; - return; + if (surface_output) { + *surface_output = NULL; } + GRN_BULK_REWIND(feature_locations); mecab_next_default_format_skip_eos(ctx, tokenizer); start = surface = tokenizer->next; - GRN_UINT64_INIT(&features, GRN_OBJ_VECTOR); for (current = start; current < end; current += length) { length = grn_charlen_(ctx, current, end, encoding); if (length == 0) { - break; + if (surface_output) { + *surface_output = NULL; + } + return 0; } if (length == 1) { @@ -841,7 +686,7 @@ mecab_next_default_format(grn_ctx *ctx, if (surface_length == 0) { surface_length = current - surface; } else { - GRN_UINT64_PUT(ctx, &features, current); + GRN_UINT64_PUT(ctx, feature_locations, current); } current++; if (current < end && @@ -854,7 +699,7 @@ mecab_next_default_format(grn_ctx *ctx, if (surface_length == 0) { surface_length = current - surface; } else { - GRN_UINT64_PUT(ctx, &features, current); + GRN_UINT64_PUT(ctx, feature_locations, current); } current++; break; @@ -865,22 +710,94 @@ mecab_next_default_format(grn_ctx *ctx, if (length == 1 && current[0] == '\t') { surface_length = current - surface; if (current + 1 < end) { - GRN_UINT64_PUT(ctx, &features, current + 1); + GRN_UINT64_PUT(ctx, feature_locations, current + 1); } } } else { if (length == 1 && current[0] == ',' && current + 1 < end) { - GRN_UINT64_PUT(ctx, &features, current + 1); + GRN_UINT64_PUT(ctx, feature_locations, current + 1); } } } tokenizer->next = current; mecab_next_default_format_skip_eos(ctx, tokenizer); + if (tokenizer->options->loose_reading) { + const char *reading = NULL; + size_t reading_length; + reading_length = mecab_get_feature(ctx, feature_locations, 7, &reading); + if (reading_length > 0) { + tokenizer->loose.need = GRN_TRUE; + tokenizer->loose.need_end_mark = GRN_TRUE; + grn_vector_add_element(ctx, + &(tokenizer->loose.readings), + reading, + reading_length, + 0, + GRN_DB_TEXT); + } else { + grn_vector_add_element(ctx, + &(tokenizer->loose.readings), + surface, + surface_length, + 0, + GRN_DB_TEXT); + } + } + + if (surface_output) { + *surface_output = surface; + } + return surface_length; +} + +static void +mecab_next_default_format(grn_ctx *ctx, + grn_mecab_tokenizer *tokenizer, + grn_token *token) +{ + const char *surface; + size_t surface_length = 0; + + if (tokenizer->loose.ing && tokenizer->loose.need_end_mark) { + grn_tokenizer_status status = GRN_TOKEN_CONTINUE; + grn_token_set_data(ctx, + token, + GRN_TOKENIZER_END_MARK_UTF8, + GRN_TOKENIZER_END_MARK_UTF8_LEN); + grn_token_set_status(ctx, token, status); + tokenizer->loose.need_end_mark = GRN_FALSE; + return; + } + + if (tokenizer->loose.ing) { + grn_tokenizer_status status = GRN_TOKEN_CONTINUE; + const char *reading = NULL; + unsigned int reading_length; + + if (tokenizer->loose.offset + 1 == + grn_vector_size(ctx, &(tokenizer->loose.readings))) { + status = GRN_TOKEN_LAST; + } + 
reading_length = grn_vector_get_element(ctx, + &(tokenizer->loose.readings), + tokenizer->loose.offset, + &reading, + NULL, + NULL); + grn_token_set_data(ctx, token, reading, reading_length); + grn_token_set_status(ctx, token, status); + tokenizer->loose.offset++; + return; + } + + surface_length = mecab_next_default_format_consume_token(ctx, + tokenizer, + &surface); grn_token_set_data(ctx, token, surface, surface_length); { grn_tokenizer_status status; - if (current == end || tokenizer->next == end) { + if (tokenizer->next == tokenizer->end) { if (tokenizer->loose.need) { tokenizer->loose.ing = GRN_TRUE; status = GRN_TOKEN_CONTINUE; @@ -895,7 +812,7 @@ mecab_next_default_format(grn_ctx *ctx, if (tokenizer->options->include_class) { add_feature_data data; data.token = token; - data.features = &features; + data.feature_locations = &(tokenizer->feature_locations); data.ignore_empty_value = GRN_TRUE; data.ignore_asterisk_value = GRN_TRUE; mecab_next_default_format_add_feature(ctx, &data, "class", 0); @@ -906,7 +823,7 @@ mecab_next_default_format(grn_ctx *ctx, if (tokenizer->options->include_reading) { add_feature_data data; data.token = token; - data.features = &features; + data.feature_locations = &(tokenizer->feature_locations); data.ignore_empty_value = GRN_TRUE; data.ignore_asterisk_value = GRN_FALSE; mecab_next_default_format_add_feature(ctx, &data, "reading", 7); @@ -914,36 +831,13 @@ mecab_next_default_format(grn_ctx *ctx, if (tokenizer->options->include_form) { add_feature_data data; data.token = token; - data.features = &features; + data.feature_locations = &(tokenizer->feature_locations); data.ignore_empty_value = GRN_TRUE; data.ignore_asterisk_value = GRN_TRUE; mecab_next_default_format_add_feature(ctx, &data, "inflected_type", 4); mecab_next_default_format_add_feature(ctx, &data, "inflected_form", 5); mecab_next_default_format_add_feature(ctx, &data, "base_form", 6); } - if (tokenizer->options->loose_reading) { - const char *reading = NULL; - size_t reading_length; - reading_length = mecab_get_feature(ctx, &features, 7, &reading); - if (reading_length > 0) { - tokenizer->loose.need = GRN_TRUE; - tokenizer->loose.need_end_mark = GRN_TRUE; - grn_vector_add_element(ctx, - &(tokenizer->loose.readings), - reading, - reading_length, - 0, - GRN_DB_TEXT); - } else { - grn_vector_add_element(ctx, - &(tokenizer->loose.readings), - surface, - surface_length, - 0, - GRN_DB_TEXT); - } - } - GRN_OBJ_FIN(ctx, &features); } static void @@ -991,6 +885,148 @@ mecab_next_wakati_format(grn_ctx *ctx, grn_token_set_status(ctx, token, status); } +/* + This function is called for a full text search query or a document to be + indexed. This means that both short/long strings are given. + The return value of this function is ignored. When an error occurs in this + function, `ctx->rc' is overwritten with an error code (not GRN_SUCCESS). 
+ */ +static void * +mecab_init(grn_ctx *ctx, grn_tokenizer_query *query) +{ + grn_obj *lexicon; + grn_mecab_tokenizer *tokenizer; + + lexicon = grn_tokenizer_query_get_lexicon(ctx, query); + + if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) { + GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE, + "[tokenizer][mecab] " + "memory allocation to grn_mecab_tokenizer failed"); + return NULL; + } + + tokenizer->options = + grn_table_cache_default_tokenizer_options(ctx, + lexicon, + mecab_tokenizer_options_open, + mecab_tokenizer_options_close, + NULL); + if (ctx->rc != GRN_SUCCESS) { + GRN_PLUGIN_FREE(ctx, tokenizer); + return NULL; + } + + mecab_init_mecab(ctx, tokenizer); + if (!tokenizer->mecab->mecab) { + GRN_PLUGIN_FREE(ctx, tokenizer); + return NULL; + } + + { + grn_encoding encoding = grn_tokenizer_query_get_encoding(ctx, query); + if (encoding != tokenizer->mecab->encoding) { + GRN_PLUGIN_FREE(ctx, tokenizer); + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][mecab] " + "MeCab dictionary charset (%s) does not match " + "the table encoding: <%s>", + grn_encoding_to_string(tokenizer->mecab->encoding), + grn_encoding_to_string(encoding)); + return NULL; + } + } + + tokenizer->query = query; + + { + grn_obj *string; + const char *normalized_string; + unsigned int normalized_string_length; + + string = grn_tokenizer_query_get_normalized_string(ctx, query); + grn_string_get_normalized(ctx, + string, + &normalized_string, + &normalized_string_length, + NULL); + GRN_TEXT_INIT(&(tokenizer->buf), 0); + if (grn_tokenizer_query_have_tokenized_delimiter(ctx, query)) { + tokenizer->next = normalized_string; + tokenizer->end = tokenizer->next + normalized_string_length; + } else if (normalized_string_length == 0) { + tokenizer->next = ""; + tokenizer->end = tokenizer->next; + } else { + grn_bool succeeded; + grn_plugin_mutex_lock(ctx, tokenizer->mecab->mutex); + if (tokenizer->options->chunked_tokenize && + ctx->encoding == GRN_ENC_UTF8) { + succeeded = chunked_tokenize_utf8(ctx, + tokenizer, + normalized_string, + normalized_string_length); + } else { + const char *s; + s = mecab_sparse_tostr2(tokenizer->mecab->mecab, + normalized_string, + normalized_string_length); + if (!s) { + succeeded = GRN_FALSE; + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][mecab] " + "mecab_sparse_tostr() failed len=%d err=%s", + normalized_string_length, + mecab_strerror(tokenizer->mecab->mecab)); + } else { + succeeded = GRN_TRUE; + GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s); + } + } + grn_plugin_mutex_unlock(ctx, tokenizer->mecab->mutex); + if (!succeeded) { + GRN_PLUGIN_FREE(ctx, tokenizer); + return NULL; + } + if (mecab_tokenizer_options_need_default_output(tokenizer->options)) { + tokenizer->next = GRN_TEXT_VALUE(&(tokenizer->buf)); + tokenizer->end = tokenizer->next + GRN_TEXT_LEN(&(tokenizer->buf)); + } else { + char *buf, *p; + unsigned int bufsize; + + buf = GRN_TEXT_VALUE(&(tokenizer->buf)); + bufsize = GRN_TEXT_LEN(&(tokenizer->buf)); + /* A certain version of mecab returns trailing lf or spaces. 
 */
+        for (p = buf + bufsize - 2;
+             buf <= p && isspace(*(unsigned char *)p);
+             p--) { *p = '\0'; }
+        tokenizer->next = buf;
+        tokenizer->end = p + 1;
+      }
+    }
+  }
+
+  GRN_UINT64_INIT(&(tokenizer->feature_locations), GRN_OBJ_VECTOR);
+
+  tokenizer->loose.ing = GRN_FALSE;
+  tokenizer->loose.need = GRN_FALSE;
+  tokenizer->loose.need_end_mark = GRN_FALSE;
+  GRN_TEXT_INIT(&(tokenizer->loose.readings), GRN_OBJ_VECTOR);
+  tokenizer->loose.offset = 0;
+
+  if (tokenizer->options->loose_reading &&
+      grn_tokenizer_query_get_mode(ctx, tokenizer->query) == GRN_TOKEN_GET) {
+    while (mecab_next_default_format_consume_token(ctx, tokenizer, NULL) > 0) {
+      /* Do nothing */
+    }
+    tokenizer->loose.ing = GRN_TRUE;
+    tokenizer->loose.need = GRN_TRUE;
+    tokenizer->loose.need_end_mark = GRN_FALSE;
+  }
+
+  return tokenizer;
+}
 /*
   This function returns tokens one by one.
@@ -1040,6 +1076,7 @@ mecab_fin(grn_ctx *ctx, void *user_data)
     return;
   }
   GRN_OBJ_FIN(ctx, &(tokenizer->loose.readings));
+  GRN_OBJ_FIN(ctx, &(tokenizer->feature_locations));
   GRN_OBJ_FIN(ctx, &(tokenizer->buf));
   GRN_PLUGIN_FREE(ctx, tokenizer);
 }

Added: test/command/suite/tokenizers/mecab/options/loose_reading_get.expected (+35 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/options/loose_reading_get.expected    2018-09-11 11:06:49 +0900 (40824322b)
@@ -0,0 +1,35 @@
+tokenize 'TokenMecab("loose_reading", true)' '焼き肉と焼肉とyakiniku' --mode GET
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "ヤキニク",
+      "position": 0,
+      "force_prefix": false
+    },
+    {
+      "value": "ト",
+      "position": 1,
+      "force_prefix": false
+    },
+    {
+      "value": "ヤキニク",
+      "position": 2,
+      "force_prefix": false
+    },
+    {
+      "value": "ト",
+      "position": 3,
+      "force_prefix": false
+    },
+    {
+      "value": "yakiniku",
+      "position": 4,
+      "force_prefix": false
+    }
+  ]
+]

Added: test/command/suite/tokenizers/mecab/options/loose_reading_get.test (+6 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/options/loose_reading_get.test    2018-09-11 11:06:49 +0900 (3efcbdcfc)
@@ -0,0 +1,6 @@
+#@on-error omit
+tokenize \
+  'TokenMecab("loose_reading", true)' \
+  '焼き肉と焼肉とyakiniku' \
+  --mode GET
+#@on-error default
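
Illustration (not part of the commit): with this change, when the loose_reading option is enabled and the tokenizer runs in GET mode (query tokenization), mecab_init() drains every token up front through mecab_next_default_format_consume_token(), collecting each token's reading, or its surface when MeCab reports no reading, and then starts directly in the loose-reading emission phase without the end-mark token used in ADD mode. A query is therefore tokenized into readings only, as the new loose_reading_get test expects. The standalone C sketch below mimics that reading-or-surface fallback for the test input; the (surface, reading) pairs are hypothetical stand-ins for MeCab output, and nothing here uses the Groonga API.

/*
 * Standalone sketch (assumption: the (surface, reading) pairs below are
 * hypothetical stand-ins for MeCab's output; this is NOT the Groonga API).
 * It mimics the reading-or-surface fallback that
 * mecab_next_default_format_consume_token() applies when loose_reading
 * is enabled, and prints the readings-only token stream that GET mode
 * now produces.
 */
#include <stdio.h>

typedef struct {
  const char *surface;
  const char *reading;  /* "" when no reading is available */
} token_info;

int
main(void)
{
  /* Input from the new test: '焼き肉と焼肉とyakiniku' */
  token_info tokens[] = {
    {"焼き肉",   "ヤキニク"},
    {"と",       "ト"},
    {"焼肉",     "ヤキニク"},
    {"と",       "ト"},
    {"yakiniku", ""},  /* unknown word: no reading, fall back to the surface */
  };
  size_t n = sizeof(tokens) / sizeof(tokens[0]);
  size_t i;

  /* GET mode + loose_reading: all tokens are consumed up front and only
     the collected readings (or surfaces, for tokens without a reading)
     are emitted as query tokens. */
  for (i = 0; i < n; i++) {
    const char *value =
      tokens[i].reading[0] ? tokens[i].reading : tokens[i].surface;
    printf("position %u: %s\n", (unsigned int)i, value);
  }

  return 0;
}

Because both 焼き肉 and 焼肉 reduce to the reading ヤキニク, a GET-mode query tokenized this way can match either spelling indexed with loose_reading, which is the purpose of the option.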