Kouhei Sutou
null+****@clear*****
Tue May 8 12:49:05 JST 2018
Kouhei Sutou	2018-05-08 12:49:05 +0900 (Tue, 08 May 2018)

  New Revision: 5d7ed50b50fe1ae1e5af0d912da74926c8fa684a
  https://github.com/groonga/groonga/commit/5d7ed50b50fe1ae1e5af0d912da74926c8fa684a

  Message:
    Hide grn_tokenizer_query internal

    Direct grn_tokenizer_query field access is deprecated. Use
    grn_tokenizer_query_* instead.

  Modified files:
    include/groonga/tokenizer.h
    lib/db.c
    lib/expr.c
    lib/grn_token_cursor.h
    lib/tokenizer.c
    lib/tokenizers.c
    plugins/suggest/suggest.c
    plugins/tokenizers/kytea.cpp
    plugins/tokenizers/mecab.c

  Modified: include/groonga/tokenizer.h (+24 -20)
===================================================================
--- include/groonga/tokenizer.h    2018-05-07 17:34:13 +0900 (bdb1c41aa)
+++ include/groonga/tokenizer.h    2018-05-08 12:49:05 +0900 (d8da40134)
@@ -1,6 +1,6 @@
 /* -*- c-basic-offset: 2 -*- */
 /*
-  Copyright(C) 2012-2016 Brazil
+  Copyright(C) 2012-2018 Brazil
 
   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -20,6 +20,7 @@
 
 #include <groonga/plugin.h>
 #include <groonga/token.h>
+#include <groonga/tokenizer_query_deprecated.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -70,25 +71,6 @@ GRN_PLUGIN_EXPORT grn_bool grn_tokenizer_have_tokenized_delimiter(grn_ctx *ctx,
                                                                   grn_encoding encoding);
 
 /*
-  grn_tokenizer_query is a structure for storing a query. See the following
-  functions.
- */
-typedef struct _grn_tokenizer_query grn_tokenizer_query;
-
-struct _grn_tokenizer_query {
-  grn_obj *normalized_query;
-  char *query_buf;
-  const char *ptr;
-  unsigned int length;
-  grn_encoding encoding;
-  unsigned int flags;
-  grn_bool have_tokenized_delimiter;
-  /* Deprecated since 4.0.8. Use tokenize_mode instead. */
-  grn_token_mode token_mode;
-  grn_tokenize_mode tokenize_mode;
-};
-
-/*
   grn_tokenizer_query_open() parses `args' and returns a new object of
   grn_tokenizer_query. The new object stores information of the query.
   grn_tokenizer_query_open() normalizes the query if the target table
@@ -122,6 +104,28 @@ GRN_PLUGIN_EXPORT void grn_tokenizer_query_close(grn_ctx *ctx, grn_tokenizer_que
  */
 void grn_tokenizer_query_destroy(grn_ctx *ctx, grn_tokenizer_query *query);
 
+GRN_PLUGIN_EXPORT grn_obj *
+grn_tokenizer_query_get_normalized_string(grn_ctx *ctx,
+                                          grn_tokenizer_query *query);
+
+GRN_PLUGIN_EXPORT const char *
+grn_tokenizer_query_get_raw_string(grn_ctx *ctx,
+                                   grn_tokenizer_query *query,
+                                   size_t *length);
+
+GRN_PLUGIN_EXPORT grn_encoding
+grn_tokenizer_query_get_encoding(grn_ctx *ctx, grn_tokenizer_query *query);
+
+GRN_PLUGIN_EXPORT unsigned int
+grn_tokenizer_query_get_flags(grn_ctx *ctx, grn_tokenizer_query *query);
+
+GRN_PLUGIN_EXPORT grn_bool
+grn_tokenizer_query_have_tokenized_delimiter(grn_ctx *ctx,
+                                             grn_tokenizer_query *query);
+
+GRN_PLUGIN_EXPORT grn_tokenize_mode
+grn_tokenizer_query_get_mode(grn_ctx *ctx, grn_tokenizer_query *query);
+
 /*
   grn_tokenizer_token is needed to return tokens. A grn_tokenizer_token object
   stores a token to be returned and it must be maintained until a request for
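For third-party tokenizer plugins, the migration is mostly mechanical. The
following sketch is not part of the commit; it shows a hypothetical plugin
init callback (my_tokenizer_init and the my_tokenizer struct are assumed
names) rewritten against the new accessors, with the old field accesses
noted in comments:

#include <groonga/tokenizer.h>

/* Hypothetical plugin state; only the fields used below. */
typedef struct {
  grn_tokenizer_query *query;
  grn_tokenizer_token token;
  const char *next;
  const char *end;
} my_tokenizer;

static grn_obj *
my_tokenizer_init(grn_ctx *ctx, int nargs, grn_obj **args,
                  grn_user_data *user_data)
{
  unsigned int normalize_flags = 0;
  grn_tokenizer_query *query;
  my_tokenizer *tokenizer;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
  if (!query) {
    return NULL;
  }

  tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(my_tokenizer));
  if (!tokenizer) {
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }
  tokenizer->query = query;

  {
    grn_obj *string;
    const char *normalized;
    unsigned int normalized_length_in_bytes;

    /* Before this commit: string = query->normalized_query; */
    string = grn_tokenizer_query_get_normalized_string(ctx, query);
    grn_string_get_normalized(ctx,
                              string,
                              &normalized,
                              &normalized_length_in_bytes,
                              NULL);
    tokenizer->next = normalized;
    tokenizer->end = normalized + normalized_length_in_bytes;
  }

  grn_tokenizer_token_init(ctx, &(tokenizer->token));
  user_data->ptr = tokenizer;
  return NULL;
}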
  Modified: lib/db.c (+3 -0)
===================================================================
--- lib/db.c    2018-05-07 17:34:13 +0900 (19d46d3eb)
+++ lib/db.c    2018-05-08 12:49:05 +0900 (3cfa9535a)
@@ -15,7 +15,10 @@
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
+
 #include "grn.h"
+#include "grn_tokenizer.h"
+
 #include "grn_config.h"
 #include "grn_db.h"
 #include "grn_obj.h"

  Modified: lib/expr.c (+3 -0)
===================================================================
--- lib/expr.c    2018-05-07 17:34:13 +0900 (f7ab03969)
+++ lib/expr.c    2018-05-08 12:49:05 +0900 (31aa512de)
@@ -15,7 +15,10 @@
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
+
 #include "grn.h"
+#include "grn_tokenizer.h"
+
 #include "grn_db.h"
 #include "grn_ctx_impl.h"
 #include "grn_ctx_impl_mrb.h"

  Modified: lib/grn_token_cursor.h (+1 -2)
===================================================================
--- lib/grn_token_cursor.h    2018-05-07 17:34:13 +0900 (17858f236)
+++ lib/grn_token_cursor.h    2018-05-08 12:49:05 +0900 (1b4ad11a3)
@@ -19,10 +19,9 @@
 #pragma once
 
 #include "grn_ctx.h"
+#include "grn_tokenizer.h"
 #include "grn_db.h"
 
-#include <groonga/tokenizer.h>
-
 #ifdef __cplusplus
 extern "C" {
 #endif

  Modified: lib/tokenizer.c (+46 -2)
===================================================================
--- lib/tokenizer.c    2018-05-07 17:34:13 +0900 (0c162580f)
+++ lib/tokenizer.c    2018-05-08 12:49:05 +0900 (c74614324)
@@ -1,6 +1,6 @@
 /* -*- c-basic-offset: 2 -*- */
 /*
-  Copyright(C) 2012-2014 Brazil
+  Copyright(C) 2012-2018 Brazil
 
   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -15,8 +15,9 @@
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
+
 #include "grn.h"
-#include <groonga/tokenizer.h>
+#include "grn_tokenizer.h"
 
 #include <string.h>
 
@@ -218,6 +219,49 @@ grn_tokenizer_query_destroy(grn_ctx *ctx, grn_tokenizer_query *query)
   grn_tokenizer_query_close(ctx, query);
 }
 
+grn_obj *
+grn_tokenizer_query_get_normalized_string(grn_ctx *ctx,
+                                          grn_tokenizer_query *query)
+{
+  return query->normalized_query;
+}
+
+const char *
+grn_tokenizer_query_get_raw_string(grn_ctx *ctx,
+                                   grn_tokenizer_query *query,
+                                   size_t *length)
+{
+  if (length) {
+    *length = query->length;
+  }
+  return query->ptr;
+}
+
+grn_encoding
+grn_tokenizer_query_get_encoding(grn_ctx *ctx, grn_tokenizer_query *query)
+{
+  return query->encoding;
+}
+
+unsigned int
+grn_tokenizer_query_get_flags(grn_ctx *ctx, grn_tokenizer_query *query)
+{
+  return query->flags;
+}
+
+grn_bool
+grn_tokenizer_query_have_tokenized_delimiter(grn_ctx *ctx,
+                                             grn_tokenizer_query *query)
+{
+  return query->have_tokenized_delimiter;
+}
+
+grn_tokenize_mode
+grn_tokenizer_query_get_mode(grn_ctx *ctx, grn_tokenizer_query *query)
+{
+  return query->tokenize_mode;
+}
+
 void
 grn_tokenizer_token_init(grn_ctx *ctx, grn_tokenizer_token *token)
 {
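A usage note (not from the commit): grn_tokenizer_query_get_raw_string()
reports the length through a size_t out-parameter instead of the old
unsigned int `length` field, and NULL may be passed when only the pointer is
needed. A minimal sketch of the replacement pattern, mirroring what
delimited_init() below does (the helper name is hypothetical):

#include <groonga/tokenizer.h>

/* Before this commit the same check read query->ptr, query->length and
   query->encoding directly. */
static grn_bool
query_has_tokenized_delimiter(grn_ctx *ctx, grn_tokenizer_query *query)
{
  const char *raw_string;
  size_t raw_string_length;
  grn_encoding encoding;

  raw_string = grn_tokenizer_query_get_raw_string(ctx,
                                                  query,
                                                  &raw_string_length);
  encoding = grn_tokenizer_query_get_encoding(ctx, query);
  return grn_tokenizer_have_tokenized_delimiter(ctx,
                                                raw_string,
                                                raw_string_length,
                                                encoding);
}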
"grn_plugin.h" #include "grn_raw_string.h" -#include <groonga/tokenizer.h> grn_obj *grn_tokenizer_uvector = NULL; @@ -107,6 +106,7 @@ typedef struct { grn_tokenizer_token token; grn_tokenizer_query *query; grn_bool have_tokenized_delimiter; + grn_encoding encoding; } grn_delimited_tokenizer; static grn_obj * @@ -115,8 +115,6 @@ delimited_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data { grn_tokenizer_query *query; unsigned int normalize_flags = 0; - const char *normalized; - unsigned int normalized_length_in_bytes; grn_delimited_tokenizer *tokenizer; query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags); @@ -135,18 +133,37 @@ delimited_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data tokenizer->query = query; - tokenizer->have_tokenized_delimiter = - grn_tokenizer_have_tokenized_delimiter(ctx, - tokenizer->query->ptr, - tokenizer->query->length, - tokenizer->query->encoding); + { + const char *raw_string; + size_t raw_string_length; + grn_encoding encoding; + + raw_string = grn_tokenizer_query_get_raw_string(ctx, + tokenizer->query, + &raw_string_length); + encoding = grn_tokenizer_query_get_encoding(ctx, tokenizer->query); + tokenizer->have_tokenized_delimiter = + grn_tokenizer_have_tokenized_delimiter(ctx, + raw_string, + raw_string_length, + encoding); + tokenizer->encoding = encoding; + } tokenizer->delimiter = delimiter; tokenizer->delimiter_len = delimiter_len; - grn_string_get_normalized(ctx, tokenizer->query->normalized_query, - &normalized, &normalized_length_in_bytes, - NULL); - tokenizer->next = (const unsigned char *)normalized; - tokenizer->end = tokenizer->next + normalized_length_in_bytes; + { + grn_obj *string; + const char *normalized; + unsigned int normalized_length_in_bytes; + + string = grn_tokenizer_query_get_normalized_string(ctx, tokenizer->query); + grn_string_get_normalized(ctx, + string, + &normalized, &normalized_length_in_bytes, + NULL); + tokenizer->next = (const unsigned char *)normalized; + tokenizer->end = tokenizer->next + normalized_length_in_bytes; + } grn_tokenizer_token_init(ctx, &(tokenizer->token)); @@ -167,15 +184,14 @@ delimited_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data &(tokenizer->token), (const char *)tokenizer->next, rest_length, - tokenizer->query->encoding); + tokenizer->encoding); } else { size_t cl; const unsigned char *p = tokenizer->next, *r; const unsigned char *e = tokenizer->end; grn_token_status status; for (r = p; r < e; r += cl) { - if (!(cl = grn_charlen_(ctx, (char *)r, (char *)e, - tokenizer->query->encoding))) { + if (!(cl = grn_charlen_(ctx, (char *)r, (char *)e, tokenizer->encoding))) { tokenizer->next = (unsigned char *)e; break; } @@ -303,6 +319,8 @@ ngram_switch_to_loose_mode(grn_ctx *ctx, normalized_end = normalized + normalized_length_in_bytes; if (types) { + grn_encoding encoding = + grn_tokenizer_query_get_encoding(ctx, tokenizer->query); uint_least8_t *loose_types; tokenizer->loose.ctypes = @@ -319,7 +337,7 @@ ngram_switch_to_loose_mode(grn_ctx *ctx, length = grn_charlen_(ctx, (char *)normalized, (char *)normalized_end, - tokenizer->query->encoding); + encoding); if (length == 0) { break; } @@ -365,8 +383,6 @@ ngram_init_raw(grn_ctx *ctx, GRN_STRING_WITH_TYPES | GRN_STRING_REMOVE_TOKENIZED_DELIMITER; grn_tokenizer_query *query; - const char *normalized; - unsigned int normalized_length_in_bytes; grn_ngram_tokenizer *tokenizer; if (!options->remove_blank) { @@ -399,15 +415,22 @@ ngram_init_raw(grn_ctx *ctx, tokenizer->pos = 0; 
@@ -303,6 +319,8 @@ ngram_switch_to_loose_mode(grn_ctx *ctx,
   normalized_end = normalized + normalized_length_in_bytes;
 
   if (types) {
+    grn_encoding encoding =
+      grn_tokenizer_query_get_encoding(ctx, tokenizer->query);
     uint_least8_t *loose_types;
 
     tokenizer->loose.ctypes =
@@ -319,7 +337,7 @@ ngram_switch_to_loose_mode(grn_ctx *ctx,
       length = grn_charlen_(ctx,
                             (char *)normalized,
                             (char *)normalized_end,
-                            tokenizer->query->encoding);
+                            encoding);
       if (length == 0) {
         break;
       }
@@ -365,8 +383,6 @@ ngram_init_raw(grn_ctx *ctx,
     GRN_STRING_WITH_TYPES |
     GRN_STRING_REMOVE_TOKENIZED_DELIMITER;
   grn_tokenizer_query *query;
-  const char *normalized;
-  unsigned int normalized_length_in_bytes;
   grn_ngram_tokenizer *tokenizer;
 
   if (!options->remove_blank) {
@@ -399,15 +415,22 @@ ngram_init_raw(grn_ctx *ctx,
   tokenizer->pos = 0;
   tokenizer->skip = 0;
 
-  grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
-                            &normalized, &normalized_length_in_bytes,
-                            &(tokenizer->len));
-  tokenizer->next = (const unsigned char *)normalized;
-  tokenizer->end = tokenizer->next + normalized_length_in_bytes;
-  tokenizer->ctypes =
-    grn_string_get_types(ctx, tokenizer->query->normalized_query);
+  {
+    grn_obj *string;
+    const char *normalized_raw;
+    unsigned int normalized_length_in_bytes;
+
+    string = grn_tokenizer_query_get_normalized_string(ctx, tokenizer->query);
+    grn_string_get_normalized(ctx,
+                              string,
+                              &normalized_raw, &normalized_length_in_bytes,
+                              &(tokenizer->len));
+    tokenizer->next = (const unsigned char *)normalized_raw;
+    tokenizer->end = tokenizer->next + normalized_length_in_bytes;
+    tokenizer->ctypes = grn_string_get_types(ctx, string);
+  }
 
-  if (tokenizer->query->tokenize_mode == GRN_TOKEN_GET) {
+  if (grn_tokenizer_query_get_mode(ctx, tokenizer->query) == GRN_TOKEN_GET) {
     ngram_switch_to_loose_mode(ctx, tokenizer);
   }
 
@@ -593,6 +616,8 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   int32_t len = 0, pos = tokenizer->pos + tokenizer->skip;
   grn_token_status status = 0;
   const uint_least8_t *cp = tokenizer->ctypes ? tokenizer->ctypes + pos : NULL;
+  grn_encoding encoding =
+    grn_tokenizer_query_get_encoding(ctx, tokenizer->query);
 
   if (tokenizer->loose.ing && tokenizer->loose.need_end_mark) {
     grn_tokenizer_token_push(ctx,
@@ -620,8 +645,7 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 
   if (cp &&
       tokenizer->options.uni_alpha &&
       GRN_STR_CTYPE(*cp) == GRN_CHAR_ALPHA) {
-    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
-                              tokenizer->query->encoding))) {
+    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e, encoding))) {
       len++;
       r += cl;
       LOOSE_NEED_CHECK(cp, tokenizer);
@@ -633,8 +657,7 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   } else if (cp &&
              tokenizer->options.uni_digit &&
             GRN_STR_CTYPE(*cp) == GRN_CHAR_DIGIT) {
-    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
-                              tokenizer->query->encoding))) {
+    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e, encoding))) {
       len++;
       r += cl;
       LOOSE_NEED_CHECK(cp, tokenizer);
@@ -646,8 +669,7 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   } else if (cp &&
              tokenizer->options.uni_symbol &&
             GRN_STR_CTYPE(*cp) == GRN_CHAR_SYMBOL) {
-    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
-                              tokenizer->query->encoding))) {
+    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e, encoding))) {
       len++;
       r += cl;
       LOOSE_NEED_CHECK(cp, tokenizer);
@@ -665,9 +687,9 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
         tokenizer->status = GRN_TOKEN_CURSOR_NOT_FOUND;
         return NULL;
       }
-      len = grn_str_len(key, tokenizer->query->encoding, NULL);
+      len = grn_str_len(key, encoding, NULL);
     }
-    r = p + grn_charlen_(ctx, p, e, tokenizer->query->encoding);
+    r = p + grn_charlen_(ctx, p, e, encoding);
     if (tid && (len > 1 || r == p)) {
       if (r != p && pos + len - 1 <= tokenizer->tail) { continue; }
       p += strlen(key);
@@ -676,14 +698,12 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
     }
   }
 #endif /* PRE_DEFINED_UNSPLIT_WORDS */
-  if ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
-                         tokenizer->query->encoding))) {
+  if ((cl = grn_charlen_(ctx, (char *)r, (char *)e, encoding))) {
     len++;
     r += cl;
     tokenizer->next = r;
     while (len < tokenizer->options.unit &&
-           (cl = grn_charlen_(ctx, (char *)r, (char *)e,
-                              tokenizer->query->encoding))) {
+           (cl = grn_charlen_(ctx, (char *)r, (char *)e, encoding))) {
       if (cp) {
         LOOSE_NEED_CHECK(cp, tokenizer);
         if (!tokenizer->options.ignore_blank && GRN_STR_ISBLANK(*cp)) { break; }
@@ -778,8 +798,6 @@ regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 {
   unsigned int normalize_flags = GRN_STRING_WITH_TYPES;
   grn_tokenizer_query *query;
-  const char *normalized;
-  unsigned int normalized_length_in_bytes;
   grn_regexp_tokenizer *tokenizer;
 
   query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
@@ -806,14 +824,21 @@ regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   tokenizer->is_start_token = GRN_TRUE;
   tokenizer->is_overlapping = GRN_FALSE;
 
-  grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
-                            &normalized, &normalized_length_in_bytes,
-                            NULL);
-  tokenizer->next = normalized;
-  tokenizer->end = tokenizer->next + normalized_length_in_bytes;
-  tokenizer->nth_char = 0;
-  tokenizer->char_types =
-    grn_string_get_types(ctx, tokenizer->query->normalized_query);
+  {
+    grn_obj *string;
+    const char *normalized;
+    unsigned int normalized_length_in_bytes;
+
+    string = grn_tokenizer_query_get_normalized_string(ctx, tokenizer->query);
+    grn_string_get_normalized(ctx,
+                              string,
+                              &normalized, &normalized_length_in_bytes,
+                              NULL);
+    tokenizer->next = normalized;
+    tokenizer->end = tokenizer->next + normalized_length_in_bytes;
+    tokenizer->nth_char = 0;
+    tokenizer->char_types = grn_string_get_types(ctx, string);
+  }
 
   GRN_TEXT_INIT(&(tokenizer->buffer), 0);
 
@@ -832,7 +857,10 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   const char *current = tokenizer->next;
   const char *end = tokenizer->end;
   const uint_least8_t *char_types = tokenizer->char_types;
-  grn_tokenize_mode mode = tokenizer->query->tokenize_mode;
+  const grn_tokenize_mode mode =
+    grn_tokenizer_query_get_mode(ctx, tokenizer->query);
+  grn_encoding encoding =
+    grn_tokenizer_query_get_encoding(ctx, tokenizer->query);
   grn_bool is_begin = tokenizer->is_begin;
   grn_bool is_start_token = tokenizer->is_start_token;
   grn_bool break_by_blank = GRN_FALSE;
@@ -874,7 +902,7 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
     }
   }
 
-  char_len = grn_charlen_(ctx, current, end, tokenizer->query->encoding);
+  char_len = grn_charlen_(ctx, current, end, encoding);
   if (char_len == 0) {
     status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
     grn_tokenizer_token_push(ctx, &(tokenizer->token), "", 0, status);
@@ -933,8 +961,10 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
       }
     }
 
-    char_len = grn_charlen_(ctx, (const char *)current, (const char *)end,
-                            tokenizer->query->encoding);
+    char_len = grn_charlen_(ctx,
+                            (const char *)current,
+                            (const char *)end,
+                            encoding);
     if (char_len == 0) {
       break;
     }

  Modified: plugins/suggest/suggest.c (+2 -1)
===================================================================
--- plugins/suggest/suggest.c    2018-05-07 17:34:13 +0900 (66e8cf111)
+++ plugins/suggest/suggest.c    2018-05-08 12:49:05 +0900 (8bf222a76)
@@ -22,9 +22,10 @@
 #include <string.h>
 
 #include "grn_ctx.h"
+#include "grn_token_cursor.h"
+
 #include "grn_db.h"
 #include "grn_ii.h"
-#include "grn_token_cursor.h"
 #include "grn_output.h"
 
 #include <groonga/plugin.h>
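The pattern in regexp_next() above, fetching the mode and encoding once per
call into locals instead of dereferencing query fields inside loops,
generalizes to any next callback. A minimal sketch under assumed names
(grn_my_tokenizer, my_tokenizer_next, and the one-character-per-token logic
are all hypothetical, not part of the commit):

#include <groonga/tokenizer.h>

typedef struct {
  grn_tokenizer_query *query;
  grn_tokenizer_token token;
  const char *next;
  const char *end;
} grn_my_tokenizer;

static grn_obj *
my_tokenizer_next(grn_ctx *ctx, int nargs, grn_obj **args,
                  grn_user_data *user_data)
{
  grn_my_tokenizer *tokenizer = user_data->ptr;
  /* Fetch once per call instead of reaching into query fields in loops. */
  grn_encoding encoding =
    grn_tokenizer_query_get_encoding(ctx, tokenizer->query);
  const char *start = tokenizer->next;
  const char *current = start;
  int char_len;

  /* Emit one character per token; a real tokenizer does more here. */
  char_len = grn_tokenizer_charlen(ctx,
                                   current,
                                   (unsigned int)(tokenizer->end - current),
                                   encoding);
  if (char_len == 0) {
    /* Empty or broken input: finish with an empty token. */
    grn_tokenizer_token_push(ctx, &(tokenizer->token), "", 0, GRN_TOKEN_LAST);
    return NULL;
  }
  current += char_len;
  tokenizer->next = current;
  grn_tokenizer_token_push(ctx,
                           &(tokenizer->token),
                           start,
                           (unsigned int)(current - start),
                           current == tokenizer->end ? GRN_TOKEN_LAST
                                                     : GRN_TOKEN_CONTINUE);
  return NULL;
}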
  Modified: plugins/tokenizers/kytea.cpp (+11 -7)
===================================================================
--- plugins/tokenizers/kytea.cpp    2018-05-07 17:34:13 +0900 (62ef0bb58)
+++ plugins/tokenizers/kytea.cpp    2018-05-08 12:49:05 +0900 (880742801)
@@ -1,5 +1,6 @@
 /* -*- c-basic-offset: 2 -*- */
-/* Copyright(C) 2012 Brazil
+/*
+  Copyright(C) 2012-2018 Brazil
 
   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -217,7 +218,7 @@ grn_obj *grn_kytea_init(grn_ctx *ctx, int num_args, grn_obj **args,
 
   tokenizer->query = query;
 
-  grn_obj *normalized_query = query->normalized_query;
+  grn_obj *string = grn_tokenizer_query_get_normalized_string(ctx, query);
   const char *normalized_string;
   unsigned int normalized_string_length;
   grn_string_get_normalized(ctx,
@@ -225,7 +226,7 @@ grn_obj *grn_kytea_init(grn_ctx *ctx, int num_args, grn_obj **args,
                             &normalized_string,
                             &normalized_string_length,
                             NULL);
-  if (tokenizer->query->have_tokenized_delimiter) {
+  if (grn_tokenizer_query_have_tokenized_delimiter(ctx, tokenizer->query)) {
     tokenizer->rest_query_string = normalized_string;
     tokenizer->rest_query_string_length = normalized_string_length;
   } else {
@@ -246,6 +247,7 @@ grn_obj *grn_kytea_init(grn_ctx *ctx, int num_args, grn_obj **args,
     grn_plugin_mutex_unlock(ctx, kytea_mutex);
 
     try {
+      grn_encoding encoding = grn_tokenizer_query_get_encoding(ctx, query);
       for (std::size_t i = 0; i < tokenizer->sentence.words.size(); ++i) {
         const std::string &token =
             kytea_util->showString(tokenizer->sentence.words[i].surface);
@@ -253,9 +255,9 @@ grn_obj *grn_kytea_init(grn_ctx *ctx, int num_args, grn_obj **args,
         unsigned int left = static_cast<unsigned int>(token.length());
         while (left > 0) {
           const int char_length =
-              grn_tokenizer_charlen(ctx, ptr, left, query->encoding);
+              grn_tokenizer_charlen(ctx, ptr, left, encoding);
           if ((char_length == 0) ||
-              (grn_tokenizer_isspace(ctx, ptr, left, query->encoding) != 0)) {
+              (grn_tokenizer_isspace(ctx, ptr, left, encoding) != 0)) {
             break;
           }
           ptr += char_length;
@@ -282,15 +284,17 @@ grn_obj *grn_kytea_next(grn_ctx *ctx, int num_args, grn_obj **args,
   grn_tokenizer_kytea * const tokenizer =
       static_cast<grn_tokenizer_kytea *>(user_data->ptr);
 
-  if (tokenizer->query->have_tokenized_delimiter) {
+  if (grn_tokenizer_query_have_tokenized_delimiter(ctx, tokenizer->query)) {
     unsigned int rest_query_string_length =
         tokenizer->rest_query_string_length;
+    grn_encoding encoding =
+        grn_tokenizer_query_get_encoding(ctx, tokenizer->query);
     const char *rest_query_string =
         grn_tokenizer_tokenized_delimiter_next(ctx,
                                                &(tokenizer->token),
                                                tokenizer->rest_query_string,
                                                rest_query_string_length,
-                                               tokenizer->query->encoding);
+                                               encoding);
     if (rest_query_string) {
       tokenizer->rest_query_string_length -=
           rest_query_string - tokenizer->rest_query_string;
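The tokenized-delimiter branch in grn_kytea_next() above is a common fast
path: when the query already contains token delimiters, a tokenizer can hand
splitting over to grn_tokenizer_tokenized_delimiter_next(). A sketch under
assumed names (my_delimited_state and my_next are hypothetical); note that
the encoding comes from grn_tokenizer_query_get_encoding(), not from the
have_tokenized_delimiter predicate:

#include <groonga/tokenizer.h>

/* Hypothetical tokenizer state; mirrors the fields kytea.cpp keeps. */
typedef struct {
  grn_tokenizer_query *query;
  grn_tokenizer_token token;
  const char *rest_query_string;
  unsigned int rest_query_string_length;
} my_delimited_state;

static grn_obj *
my_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  my_delimited_state *state = user_data->ptr;

  if (grn_tokenizer_query_have_tokenized_delimiter(ctx, state->query)) {
    grn_encoding encoding =
      grn_tokenizer_query_get_encoding(ctx, state->query);
    const char *rest =
      grn_tokenizer_tokenized_delimiter_next(ctx,
                                             &(state->token),
                                             state->rest_query_string,
                                             state->rest_query_string_length,
                                             encoding);
    if (rest) {
      /* Advance past the token that was just pushed. */
      state->rest_query_string_length -=
        (unsigned int)(rest - state->rest_query_string);
      state->rest_query_string = rest;
    }
    return NULL;
  }

  /* ... normal tokenization path elided ... */
  return NULL;
}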
  Modified: plugins/tokenizers/mecab.c (+74 -68)
===================================================================
--- plugins/tokenizers/mecab.c    2018-05-07 17:34:13 +0900 (297592ee7)
+++ plugins/tokenizers/mecab.c    2018-05-08 12:49:05 +0900 (7a1e80eaa)
@@ -1,6 +1,6 @@
 /* -*- c-basic-offset: 2 -*- */
 /*
-  Copyright(C) 2009-2016 Brazil
+  Copyright(C) 2009-2018 Brazil
 
   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -191,7 +191,8 @@ chunked_tokenize_utf8(grn_ctx *ctx,
   const char *current;
   const char *last_delimiter;
   const char *string_end = string + string_bytes;
-  grn_encoding encoding = tokenizer->query->encoding;
+  grn_encoding encoding =
+    grn_tokenizer_query_get_encoding(ctx, tokenizer->query);
 
   if (string_bytes < grn_mecab_chunk_size_threshold) {
     return chunked_tokenize_utf8_chunk(ctx,
@@ -343,9 +344,6 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   grn_mecab_tokenizer *tokenizer;
   unsigned int normalizer_flags = 0;
   grn_tokenizer_query *query;
-  grn_obj *normalized_query;
-  const char *normalized_string;
-  unsigned int normalized_string_length;
 
   query = grn_tokenizer_query_open(ctx, nargs, args, normalizer_flags);
   if (!query) {
@@ -366,15 +364,18 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
     return NULL;
   }
 
-  if (query->encoding != sole_mecab_encoding) {
-    grn_tokenizer_query_close(ctx, query);
-    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
-                     "[tokenizer][mecab] "
-                     "MeCab dictionary charset (%s) does not match "
-                     "the table encoding: <%s>",
-                     grn_encoding_to_string(sole_mecab_encoding),
-                     grn_encoding_to_string(query->encoding));
-    return NULL;
+  {
+    grn_encoding encoding = grn_tokenizer_query_get_encoding(ctx, query);
+    if (encoding != sole_mecab_encoding) {
+      grn_tokenizer_query_close(ctx, query);
+      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                       "[tokenizer][mecab] "
+                       "MeCab dictionary charset (%s) does not match "
+                       "the table encoding: <%s>",
+                       grn_encoding_to_string(sole_mecab_encoding),
+                       grn_encoding_to_string(encoding));
+      return NULL;
+    }
   }
 
   if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) {
@@ -387,63 +388,68 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   tokenizer->mecab = sole_mecab;
   tokenizer->query = query;
 
-  normalized_query = query->normalized_query;
-  grn_string_get_normalized(ctx,
-                            normalized_query,
-                            &normalized_string,
-                            &normalized_string_length,
-                            NULL);
-  GRN_TEXT_INIT(&(tokenizer->buf), 0);
-  if (query->have_tokenized_delimiter) {
-    tokenizer->next = normalized_string;
-    tokenizer->end = tokenizer->next + normalized_string_length;
-  } else if (normalized_string_length == 0) {
-    tokenizer->next = "";
-    tokenizer->end = tokenizer->next;
-  } else {
-    grn_bool succeeded;
-    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
-    if (grn_mecab_chunked_tokenize_enabled &&
-        ctx->encoding == GRN_ENC_UTF8) {
-      succeeded = chunked_tokenize_utf8(ctx,
-                                        tokenizer,
-                                        normalized_string,
-                                        normalized_string_length);
+  {
+    grn_obj *string;
+    const char *normalized_string;
+    unsigned int normalized_string_length;
+
+    string = grn_tokenizer_query_get_normalized_string(ctx, query);
+    grn_string_get_normalized(ctx,
+                              string,
+                              &normalized_string,
+                              &normalized_string_length,
+                              NULL);
+    GRN_TEXT_INIT(&(tokenizer->buf), 0);
+    if (query->have_tokenized_delimiter) {
+      tokenizer->next = normalized_string;
+      tokenizer->end = tokenizer->next + normalized_string_length;
+    } else if (normalized_string_length == 0) {
+      tokenizer->next = "";
+      tokenizer->end = tokenizer->next;
     } else {
-      const char *s;
-      s = mecab_sparse_tostr2(tokenizer->mecab,
-                              normalized_string,
-                              normalized_string_length);
-      if (!s) {
-        succeeded = GRN_FALSE;
-        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
-                         "[tokenizer][mecab] "
-                         "mecab_sparse_tostr() failed len=%d err=%s",
-                         normalized_string_length,
-                         mecab_strerror(tokenizer->mecab));
+      grn_bool succeeded;
+      grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
+      if (grn_mecab_chunked_tokenize_enabled && ctx->encoding == GRN_ENC_UTF8) {
+        succeeded = chunked_tokenize_utf8(ctx,
+                                          tokenizer,
+                                          normalized_string,
+                                          normalized_string_length);
       } else {
-        succeeded = GRN_TRUE;
-        GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s);
+        const char *s;
+        s = mecab_sparse_tostr2(tokenizer->mecab,
+                                normalized_string,
+                                normalized_string_length);
+        if (!s) {
+          succeeded = GRN_FALSE;
+          GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                           "[tokenizer][mecab] "
+                           "mecab_sparse_tostr() failed len=%d err=%s",
+                           normalized_string_length,
+                           mecab_strerror(tokenizer->mecab));
+        } else {
+          succeeded = GRN_TRUE;
+          GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s);
+        }
+      }
+      grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
+      if (!succeeded) {
+        grn_tokenizer_query_close(ctx, tokenizer->query);
+        GRN_PLUGIN_FREE(ctx, tokenizer);
+        return NULL;
+      }
+      {
+        char *buf, *p;
+        unsigned int bufsize;
+
+        buf = GRN_TEXT_VALUE(&(tokenizer->buf));
+        bufsize = GRN_TEXT_LEN(&(tokenizer->buf));
+        /* A certain version of mecab returns trailing lf or spaces. */
+        for (p = buf + bufsize - 2;
+             buf <= p && isspace(*(unsigned char *)p);
+             p--) { *p = '\0'; }
+        tokenizer->next = buf;
+        tokenizer->end = p + 1;
       }
-    }
-    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
-    if (!succeeded) {
-      grn_tokenizer_query_close(ctx, tokenizer->query);
-      GRN_PLUGIN_FREE(ctx, tokenizer);
-      return NULL;
-    }
-    {
-      char *buf, *p;
-      unsigned int bufsize;
-
-      buf = GRN_TEXT_VALUE(&(tokenizer->buf));
-      bufsize = GRN_TEXT_LEN(&(tokenizer->buf));
-      /* A certain version of mecab returns trailing lf or spaces. */
-      for (p = buf + bufsize - 2;
-           buf <= p && isspace(*(unsigned char *)p);
-           p--) { *p = '\0'; }
-      tokenizer->next = buf;
-      tokenizer->end = p + 1;
     }
   }
 
   user_data->ptr = tokenizer;
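Finally, a sketch (not from the commit) of the dictionary-encoding check
that mecab_init() performs above, expressed as a reusable helper;
check_dictionary_encoding and dictionary_encoding are hypothetical names
standing in for whatever encoding a backing dictionary was built with:

#include <groonga/tokenizer.h>

/* Hypothetical helper: fail when the table encoding does not match the
   encoding the backing dictionary uses. */
static grn_bool
check_dictionary_encoding(grn_ctx *ctx,
                          grn_tokenizer_query *query,
                          grn_encoding dictionary_encoding)
{
  grn_encoding encoding = grn_tokenizer_query_get_encoding(ctx, query);
  if (encoding != dictionary_encoding) {
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][example] "
                     "dictionary charset (%s) does not match "
                     "the table encoding: <%s>",
                     grn_encoding_to_string(dictionary_encoding),
                     grn_encoding_to_string(encoding));
    return GRN_FALSE;
  }
  return GRN_TRUE;
}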