[Groonga-commit] groonga/groonga at 5d7ed50 [master] Hide grn_tokenizer_query internal


Kouhei Sutou null+****@clear*****
Tue May 8 12:49:05 JST 2018


Kouhei Sutou	2018-05-08 12:49:05 +0900 (Tue, 08 May 2018)

  New Revision: 5d7ed50b50fe1ae1e5af0d912da74926c8fa684a
  https://github.com/groonga/groonga/commit/5d7ed50b50fe1ae1e5af0d912da74926c8fa684a

  Message:
    Hide grn_tokenizer_query internal
    
    Direct grn_tokenizer_query field access is deprecated. Use
    grn_tokenizer_query_* instead.
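    
    For example, a tokenizer plugin that read fields directly can be
    migrated like this (a minimal sketch; `tokenizer->query` stands for
    any grn_tokenizer_query * a plugin keeps in its user data):
    
      /* Before: deprecated direct field access. */
      grn_encoding encoding = tokenizer->query->encoding;
      grn_tokenize_mode mode = tokenizer->query->tokenize_mode;
    
      /* After: accessor functions added by this commit. */
      grn_encoding encoding =
        grn_tokenizer_query_get_encoding(ctx, tokenizer->query);
      grn_tokenize_mode mode =
        grn_tokenizer_query_get_mode(ctx, tokenizer->query);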

  Modified files:
    include/groonga/tokenizer.h
    lib/db.c
    lib/expr.c
    lib/grn_token_cursor.h
    lib/tokenizer.c
    lib/tokenizers.c
    plugins/suggest/suggest.c
    plugins/tokenizers/kytea.cpp
    plugins/tokenizers/mecab.c

  Modified: include/groonga/tokenizer.h (+24 -20)
===================================================================
--- include/groonga/tokenizer.h    2018-05-07 17:34:13 +0900 (bdb1c41aa)
+++ include/groonga/tokenizer.h    2018-05-08 12:49:05 +0900 (d8da40134)
@@ -1,6 +1,6 @@
 /* -*- c-basic-offset: 2 -*- */
 /*
-  Copyright(C) 2012-2016 Brazil
+  Copyright(C) 2012-2018 Brazil
 
   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -20,6 +20,7 @@
 
 #include <groonga/plugin.h>
 #include <groonga/token.h>
+#include <groonga/tokenizer_query_deprecated.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -70,25 +71,6 @@ GRN_PLUGIN_EXPORT grn_bool grn_tokenizer_have_tokenized_delimiter(grn_ctx *ctx,
                                                                   grn_encoding encoding);
 
 /*
-  grn_tokenizer_query is a structure for storing a query. See the following
-  functions.
- */
-typedef struct _grn_tokenizer_query grn_tokenizer_query;
-
-struct _grn_tokenizer_query {
-  grn_obj *normalized_query;
-  char *query_buf;
-  const char *ptr;
-  unsigned int length;
-  grn_encoding encoding;
-  unsigned int flags;
-  grn_bool have_tokenized_delimiter;
-  /* Deprecated since 4.0.8. Use tokenize_mode instead. */
-  grn_token_mode token_mode;
-  grn_tokenize_mode tokenize_mode;
-};
-
-/*
   grn_tokenizer_query_open() parses `args' and returns a new object of
   grn_tokenizer_query. The new object stores information of the query.
   grn_tokenizer_query_open() normalizes the query if the target table
@@ -122,6 +104,28 @@ GRN_PLUGIN_EXPORT void grn_tokenizer_query_close(grn_ctx *ctx, grn_tokenizer_que
  */
 void grn_tokenizer_query_destroy(grn_ctx *ctx, grn_tokenizer_query *query);
 
+GRN_PLUGIN_EXPORT grn_obj *
+grn_tokenizer_query_get_normalized_string(grn_ctx *ctx,
+                                          grn_tokenizer_query *query);
+
+GRN_PLUGIN_EXPORT const char *
+grn_tokenizer_query_get_raw_string(grn_ctx *ctx,
+                                   grn_tokenizer_query *query,
+                                   size_t *length);
+
+GRN_PLUGIN_EXPORT grn_encoding
+grn_tokenizer_query_get_encoding(grn_ctx *ctx, grn_tokenizer_query *query);
+
+GRN_PLUGIN_EXPORT unsigned int
+grn_tokenizer_query_get_flags(grn_ctx *ctx, grn_tokenizer_query *query);
+
+GRN_PLUGIN_EXPORT grn_bool
+grn_tokenizer_query_have_tokenized_delimiter(grn_ctx *ctx,
+                                             grn_tokenizer_query *query);
+
+GRN_PLUGIN_EXPORT grn_tokenize_mode
+grn_tokenizer_query_get_mode(grn_ctx *ctx, grn_tokenizer_query *query);
+
 /*
   grn_tokenizer_token is needed to return tokens. A grn_tokenizer_token object
   stores a token to be returned and it must be maintained until a request for
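
The getters above replace every field a tokenizer used to read from the
struct. A minimal sketch of an init callback written against the new API
(example_init is a hypothetical name; the tokenizers.c changes below
follow the same pattern):

  static grn_obj *
  example_init(grn_ctx *ctx, int nargs, grn_obj **args,
               grn_user_data *user_data)
  {
    grn_tokenizer_query *query;
    const char *raw_string;
    size_t raw_string_length;
    grn_encoding encoding;

    query = grn_tokenizer_query_open(ctx, nargs, args, 0);
    if (!query) {
      return NULL;
    }
    /* Query attributes are now read through accessors only. */
    raw_string = grn_tokenizer_query_get_raw_string(ctx,
                                                    query,
                                                    &raw_string_length);
    encoding = grn_tokenizer_query_get_encoding(ctx, query);
    if (grn_tokenizer_have_tokenized_delimiter(ctx,
                                               raw_string,
                                               raw_string_length,
                                               encoding)) {
      /* Handle pre-tokenized input here. */
    }
    /* Keep `query` for the next/fin callbacks; release it there with
       grn_tokenizer_query_close(). */
    user_data->ptr = query;
    return NULL;
  }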

  Modified: lib/db.c (+3 -0)
===================================================================
--- lib/db.c    2018-05-07 17:34:13 +0900 (19d46d3eb)
+++ lib/db.c    2018-05-08 12:49:05 +0900 (3cfa9535a)
@@ -15,7 +15,10 @@
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */
+
 #include "grn.h"
+#include "grn_tokenizer.h"
+
 #include "grn_config.h"
 #include "grn_db.h"
 #include "grn_obj.h"

  Modified: lib/expr.c (+3 -0)
===================================================================
--- lib/expr.c    2018-05-07 17:34:13 +0900 (f7ab03969)
+++ lib/expr.c    2018-05-08 12:49:05 +0900 (31aa512de)
@@ -15,7 +15,10 @@
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */
+
 #include "grn.h"
+#include "grn_tokenizer.h"
+
 #include "grn_db.h"
 #include "grn_ctx_impl.h"
 #include "grn_ctx_impl_mrb.h"

  Modified: lib/grn_token_cursor.h (+1 -2)
===================================================================
--- lib/grn_token_cursor.h    2018-05-07 17:34:13 +0900 (17858f236)
+++ lib/grn_token_cursor.h    2018-05-08 12:49:05 +0900 (1b4ad11a3)
@@ -19,10 +19,9 @@
 #pragma once
 
 #include "grn_ctx.h"
+#include "grn_tokenizer.h"
 #include "grn_db.h"
 
-#include <groonga/tokenizer.h>
-
 #ifdef __cplusplus
 extern "C" {
 #endif

  Modified: lib/tokenizer.c (+46 -2)
===================================================================
--- lib/tokenizer.c    2018-05-07 17:34:13 +0900 (0c162580f)
+++ lib/tokenizer.c    2018-05-08 12:49:05 +0900 (c74614324)
@@ -1,6 +1,6 @@
 /* -*- c-basic-offset: 2 -*- */
 /*
-  Copyright(C) 2012-2014 Brazil
+  Copyright(C) 2012-2018 Brazil
 
   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -15,8 +15,9 @@
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */
+
 #include "grn.h"
-#include <groonga/tokenizer.h>
+#include "grn_tokenizer.h"
 
 #include <string.h>
 
@@ -218,6 +219,49 @@ grn_tokenizer_query_destroy(grn_ctx *ctx, grn_tokenizer_query *query)
   grn_tokenizer_query_close(ctx, query);
 }
 
+grn_obj *
+grn_tokenizer_query_get_normalized_string(grn_ctx *ctx,
+                                          grn_tokenizer_query *query)
+{
+  return query->normalized_query;
+}
+
+const char *
+grn_tokenizer_query_get_raw_string(grn_ctx *ctx,
+                                   grn_tokenizer_query *query,
+                                   size_t *length)
+{
+  if (length) {
+    *length = query->length;
+  }
+  return query->ptr;
+}
+
+grn_encoding
+grn_tokenizer_query_get_encoding(grn_ctx *ctx, grn_tokenizer_query *query)
+{
+  return query->encoding;
+}
+
+unsigned int
+grn_tokenizer_query_get_flags(grn_ctx *ctx, grn_tokenizer_query *query)
+{
+  return query->flags;
+}
+
+grn_bool
+grn_tokenizer_query_have_tokenized_delimiter(grn_ctx *ctx,
+                                             grn_tokenizer_query *query)
+{
+  return query->have_tokenized_delimiter;
+}
+
+grn_tokenize_mode
+grn_tokenizer_query_get_mode(grn_ctx *ctx, grn_tokenizer_query *query)
+{
+  return query->tokenize_mode;
+}
+
 void
 grn_tokenizer_token_init(grn_ctx *ctx, grn_tokenizer_token *token)
 {
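
One detail from the getters above: grn_tokenizer_query_get_raw_string()
guards its length output, so a caller that only needs the pointer may
pass NULL (a minimal usage sketch):

  const char *raw;
  /* NULL is allowed because the implementation checks `length`
     before writing through it. */
  raw = grn_tokenizer_query_get_raw_string(ctx, query, NULL);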

  Modified: lib/tokenizers.c (+83 -53)
===================================================================
--- lib/tokenizers.c    2018-05-07 17:34:13 +0900 (b9f264739)
+++ lib/tokenizers.c    2018-05-08 12:49:05 +0900 (d7ddb9ec6)
@@ -20,7 +20,6 @@
 #include "grn_string.h"
 #include "grn_plugin.h"
 #include "grn_raw_string.h"
-#include <groonga/tokenizer.h>
 
 grn_obj *grn_tokenizer_uvector = NULL;
 
@@ -107,6 +106,7 @@ typedef struct {
   grn_tokenizer_token token;
   grn_tokenizer_query *query;
   grn_bool have_tokenized_delimiter;
+  grn_encoding encoding;
 } grn_delimited_tokenizer;
 
 static grn_obj *
@@ -115,8 +115,6 @@ delimited_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data
 {
   grn_tokenizer_query *query;
   unsigned int normalize_flags = 0;
-  const char *normalized;
-  unsigned int normalized_length_in_bytes;
   grn_delimited_tokenizer *tokenizer;
 
   query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
@@ -135,18 +133,37 @@ delimited_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data
 
   tokenizer->query = query;
 
-  tokenizer->have_tokenized_delimiter =
-    grn_tokenizer_have_tokenized_delimiter(ctx,
-                                           tokenizer->query->ptr,
-                                           tokenizer->query->length,
-                                           tokenizer->query->encoding);
+  {
+    const char *raw_string;
+    size_t raw_string_length;
+    grn_encoding encoding;
+
+    raw_string = grn_tokenizer_query_get_raw_string(ctx,
+                                                    tokenizer->query,
+                                                    &raw_string_length);
+    encoding = grn_tokenizer_query_get_encoding(ctx, tokenizer->query);
+    tokenizer->have_tokenized_delimiter =
+      grn_tokenizer_have_tokenized_delimiter(ctx,
+                                             raw_string,
+                                             raw_string_length,
+                                             encoding);
+    tokenizer->encoding = encoding;
+  }
   tokenizer->delimiter = delimiter;
   tokenizer->delimiter_len = delimiter_len;
-  grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
-                            &normalized, &normalized_length_in_bytes,
-                            NULL);
-  tokenizer->next = (const unsigned char *)normalized;
-  tokenizer->end = tokenizer->next + normalized_length_in_bytes;
+  {
+    grn_obj *string;
+    const char *normalized;
+    unsigned int normalized_length_in_bytes;
+
+    string = grn_tokenizer_query_get_normalized_string(ctx, tokenizer->query);
+    grn_string_get_normalized(ctx,
+                              string,
+                              &normalized, &normalized_length_in_bytes,
+                              NULL);
+    tokenizer->next = (const unsigned char *)normalized;
+    tokenizer->end = tokenizer->next + normalized_length_in_bytes;
+  }
 
   grn_tokenizer_token_init(ctx, &(tokenizer->token));
 
@@ -167,15 +184,14 @@ delimited_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data
         &(tokenizer->token),
         (const char *)tokenizer->next,
         rest_length,
-        tokenizer->query->encoding);
+        tokenizer->encoding);
   } else {
     size_t cl;
     const unsigned char *p = tokenizer->next, *r;
     const unsigned char *e = tokenizer->end;
     grn_token_status status;
     for (r = p; r < e; r += cl) {
-      if (!(cl = grn_charlen_(ctx, (char *)r, (char *)e,
-                              tokenizer->query->encoding))) {
+      if (!(cl = grn_charlen_(ctx, (char *)r, (char *)e, tokenizer->encoding))) {
         tokenizer->next = (unsigned char *)e;
         break;
       }
@@ -303,6 +319,8 @@ ngram_switch_to_loose_mode(grn_ctx *ctx,
   normalized_end = normalized + normalized_length_in_bytes;
 
   if (types) {
+    grn_encoding encoding =
+      grn_tokenizer_query_get_encoding(ctx, tokenizer->query);
     uint_least8_t *loose_types;
 
     tokenizer->loose.ctypes =
@@ -319,7 +337,7 @@ ngram_switch_to_loose_mode(grn_ctx *ctx,
       length = grn_charlen_(ctx,
                             (char *)normalized,
                             (char *)normalized_end,
-                            tokenizer->query->encoding);
+                            encoding);
       if (length == 0) {
         break;
       }
@@ -365,8 +383,6 @@ ngram_init_raw(grn_ctx *ctx,
     GRN_STRING_WITH_TYPES |
     GRN_STRING_REMOVE_TOKENIZED_DELIMITER;
   grn_tokenizer_query *query;
-  const char *normalized;
-  unsigned int normalized_length_in_bytes;
   grn_ngram_tokenizer *tokenizer;
 
   if (!options->remove_blank) {
@@ -399,15 +415,22 @@ ngram_init_raw(grn_ctx *ctx,
   tokenizer->pos = 0;
   tokenizer->skip = 0;
 
-  grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
-                            &normalized, &normalized_length_in_bytes,
-                            &(tokenizer->len));
-  tokenizer->next = (const unsigned char *)normalized;
-  tokenizer->end = tokenizer->next + normalized_length_in_bytes;
-  tokenizer->ctypes =
-    grn_string_get_types(ctx, tokenizer->query->normalized_query);
+  {
+    grn_obj *string;
+    const char *normalized_raw;
+    unsigned int normalized_length_in_bytes;
+
+    string = grn_tokenizer_query_get_normalized_string(ctx, tokenizer->query);
+    grn_string_get_normalized(ctx,
+                              string,
+                              &normalized_raw, &normalized_length_in_bytes,
+                              &(tokenizer->len));
+    tokenizer->next = (const unsigned char *)normalized_raw;
+    tokenizer->end = tokenizer->next + normalized_length_in_bytes;
+    tokenizer->ctypes = grn_string_get_types(ctx, string);
+  }
 
-  if (tokenizer->query->tokenize_mode == GRN_TOKEN_GET) {
+  if (grn_tokenizer_query_get_mode(ctx, tokenizer->query) == GRN_TOKEN_GET) {
     ngram_switch_to_loose_mode(ctx, tokenizer);
   }
 
@@ -593,6 +616,8 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   int32_t len = 0, pos = tokenizer->pos + tokenizer->skip;
   grn_token_status status = 0;
   const uint_least8_t *cp = tokenizer->ctypes ? tokenizer->ctypes + pos : NULL;
+  grn_encoding encoding =
+    grn_tokenizer_query_get_encoding(ctx, tokenizer->query);
 
   if (tokenizer->loose.ing && tokenizer->loose.need_end_mark) {
     grn_tokenizer_token_push(ctx,
@@ -620,8 +645,7 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 
   if (cp && tokenizer->options.uni_alpha &&
       GRN_STR_CTYPE(*cp) == GRN_CHAR_ALPHA) {
-    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
-                              tokenizer->query->encoding))) {
+    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e, encoding))) {
       len++;
       r += cl;
       LOOSE_NEED_CHECK(cp, tokenizer);
@@ -633,8 +657,7 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   } else if (cp &&
              tokenizer->options.uni_digit &&
              GRN_STR_CTYPE(*cp) == GRN_CHAR_DIGIT) {
-    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
-                              tokenizer->query->encoding))) {
+    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e, encoding))) {
       len++;
       r += cl;
       LOOSE_NEED_CHECK(cp, tokenizer);
@@ -646,8 +669,7 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   } else if (cp &&
              tokenizer->options.uni_symbol &&
              GRN_STR_CTYPE(*cp) == GRN_CHAR_SYMBOL) {
-    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
-                              tokenizer->query->encoding))) {
+    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e, encoding))) {
       len++;
       r += cl;
       LOOSE_NEED_CHECK(cp, tokenizer);
@@ -665,9 +687,9 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
         tokenizer->status = GRN_TOKEN_CURSOR_NOT_FOUND;
         return NULL;
       }
-      len = grn_str_len(key, tokenizer->query->encoding, NULL);
+      len = grn_str_len(key, encoding, NULL);
     }
-    r = p + grn_charlen_(ctx, p, e, tokenizer->query->encoding);
+    r = p + grn_charlen_(ctx, p, e, encoding);
     if (tid && (len > 1 || r == p)) {
       if (r != p && pos + len - 1 <= tokenizer->tail) { continue; }
       p += strlen(key);
@@ -676,14 +698,12 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
       }
     }
 #endif /* PRE_DEFINED_UNSPLIT_WORDS */
-    if ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
-                           tokenizer->query->encoding))) {
+    if ((cl = grn_charlen_(ctx, (char *)r, (char *)e, encoding))) {
       len++;
       r += cl;
       tokenizer->next = r;
       while (len < tokenizer->options.unit &&
-             (cl = grn_charlen_(ctx, (char *)r, (char *)e,
-                                tokenizer->query->encoding))) {
+             (cl = grn_charlen_(ctx, (char *)r, (char *)e, encoding))) {
         if (cp) {
           LOOSE_NEED_CHECK(cp, tokenizer);
           if (!tokenizer->options.ignore_blank && GRN_STR_ISBLANK(*cp)) { break; }
@@ -778,8 +798,6 @@ regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 {
   unsigned int normalize_flags = GRN_STRING_WITH_TYPES;
   grn_tokenizer_query *query;
-  const char *normalized;
-  unsigned int normalized_length_in_bytes;
   grn_regexp_tokenizer *tokenizer;
 
   query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
@@ -806,14 +824,21 @@ regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   tokenizer->is_start_token = GRN_TRUE;
   tokenizer->is_overlapping = GRN_FALSE;
 
-  grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
-                            &normalized, &normalized_length_in_bytes,
-                            NULL);
-  tokenizer->next = normalized;
-  tokenizer->end = tokenizer->next + normalized_length_in_bytes;
-  tokenizer->nth_char = 0;
-  tokenizer->char_types =
-    grn_string_get_types(ctx, tokenizer->query->normalized_query);
+  {
+    grn_obj *string;
+    const char *normalized;
+    unsigned int normalized_length_in_bytes;
+
+    string = grn_tokenizer_query_get_normalized_string(ctx, tokenizer->query);
+    grn_string_get_normalized(ctx,
+                              string,
+                              &normalized, &normalized_length_in_bytes,
+                              NULL);
+    tokenizer->next = normalized;
+    tokenizer->end = tokenizer->next + normalized_length_in_bytes;
+    tokenizer->nth_char = 0;
+    tokenizer->char_types = grn_string_get_types(ctx, string);
+  }
 
   GRN_TEXT_INIT(&(tokenizer->buffer), 0);
 
@@ -832,7 +857,10 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   const char *current = tokenizer->next;
   const char *end = tokenizer->end;
   const uint_least8_t *char_types = tokenizer->char_types;
-  grn_tokenize_mode mode = tokenizer->query->tokenize_mode;
+  const grn_tokenize_mode mode =
+    grn_tokenizer_query_get_mode(ctx, tokenizer->query);
+  grn_encoding encoding =
+    grn_tokenizer_query_get_encoding(ctx, tokenizer->query);
   grn_bool is_begin = tokenizer->is_begin;
   grn_bool is_start_token = tokenizer->is_start_token;
   grn_bool break_by_blank = GRN_FALSE;
@@ -874,7 +902,7 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
     }
   }
 
-  char_len = grn_charlen_(ctx, current, end, tokenizer->query->encoding);
+  char_len = grn_charlen_(ctx, current, end, encoding);
   if (char_len == 0) {
     status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
     grn_tokenizer_token_push(ctx, &(tokenizer->token), "", 0, status);
@@ -933,8 +961,10 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
       }
     }
 
-    char_len = grn_charlen_(ctx, (const char *)current, (const char *)end,
-                            tokenizer->query->encoding);
+    char_len = grn_charlen_(ctx,
+                            (const char *)current,
+                            (const char *)end,
+                            encoding);
     if (char_len == 0) {
       break;
     }

  Modified: plugins/suggest/suggest.c (+2 -1)
===================================================================
--- plugins/suggest/suggest.c    2018-05-07 17:34:13 +0900 (66e8cf111)
+++ plugins/suggest/suggest.c    2018-05-08 12:49:05 +0900 (8bf222a76)
@@ -22,9 +22,10 @@
 #include <string.h>
 
 #include "grn_ctx.h"
+#include "grn_token_cursor.h"
+
 #include "grn_db.h"
 #include "grn_ii.h"
-#include "grn_token_cursor.h"
 #include "grn_output.h"
 #include <groonga/plugin.h>
 

  Modified: plugins/tokenizers/kytea.cpp (+11 -7)
===================================================================
--- plugins/tokenizers/kytea.cpp    2018-05-07 17:34:13 +0900 (62ef0bb58)
+++ plugins/tokenizers/kytea.cpp    2018-05-08 12:49:05 +0900 (880742801)
@@ -1,5 +1,6 @@
 /* -*- c-basic-offset: 2 -*- */
-/* Copyright(C) 2012 Brazil
+/*
+  Copyright(C) 2012-2018 Brazil
 
   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -217,7 +218,7 @@ grn_obj *grn_kytea_init(grn_ctx *ctx, int num_args, grn_obj **args,
 
   tokenizer->query = query;
 
-  grn_obj *normalized_query = query->normalized_query;
+  grn_obj *string = grn_tokenizer_query_get_normalized_string(ctx, query);
   const char *normalized_string;
   unsigned int normalized_string_length;
   grn_string_get_normalized(ctx,
@@ -225,7 +226,7 @@ grn_obj *grn_kytea_init(grn_ctx *ctx, int num_args, grn_obj **args,
                             &normalized_string,
                             &normalized_string_length,
                             NULL);
-  if (tokenizer->query->have_tokenized_delimiter) {
+  if (grn_tokenizer_query_have_tokenized_delimiter(ctx, tokenizer->query)) {
     tokenizer->rest_query_string = normalized_string;
     tokenizer->rest_query_string_length = normalized_string_length;
   } else {
@@ -246,6 +247,7 @@ grn_obj *grn_kytea_init(grn_ctx *ctx, int num_args, grn_obj **args,
     grn_plugin_mutex_unlock(ctx, kytea_mutex);
 
     try {
+      grn_encoding encoding = grn_tokenizer_query_get_encoding(ctx, query);
       for (std::size_t i = 0; i < tokenizer->sentence.words.size(); ++i) {
         const std::string &token =
             kytea_util->showString(tokenizer->sentence.words[i].surface);
@@ -253,9 +255,9 @@ grn_obj *grn_kytea_init(grn_ctx *ctx, int num_args, grn_obj **args,
         unsigned int left = static_cast<unsigned int>(token.length());
         while (left > 0) {
           const int char_length =
-              grn_tokenizer_charlen(ctx, ptr, left, query->encoding);
+              grn_tokenizer_charlen(ctx, ptr, left, encoding);
           if ((char_length == 0) ||
-              (grn_tokenizer_isspace(ctx, ptr, left, query->encoding) != 0)) {
+              (grn_tokenizer_isspace(ctx, ptr, left, encoding) != 0)) {
             break;
           }
           ptr += char_length;
@@ -282,15 +284,17 @@ grn_obj *grn_kytea_next(grn_ctx *ctx, int num_args, grn_obj **args,
   grn_tokenizer_kytea * const tokenizer =
       static_cast<grn_tokenizer_kytea *>(user_data->ptr);
 
-  if (tokenizer->query->have_tokenized_delimiter) {
+  if (grn_tokenizer_query_have_tokenized_delimiter(ctx, tokenizer->query)) {
     unsigned int rest_query_string_length =
       tokenizer->rest_query_string_length;
+    grn_encoding encoding =
+      grn_tokenizer_query_get_encoding(ctx, tokenizer->query);
     const char *rest_query_string =
       grn_tokenizer_tokenized_delimiter_next(ctx,
                                              &(tokenizer->token),
                                              tokenizer->rest_query_string,
                                              rest_query_string_length,
-                                             tokenizer->query->encoding);
+                                             encoding);
     if (rest_query_string) {
       tokenizer->rest_query_string_length -=
         rest_query_string - tokenizer->rest_query_string;

  Modified: plugins/tokenizers/mecab.c (+74 -68)
===================================================================
--- plugins/tokenizers/mecab.c    2018-05-07 17:34:13 +0900 (297592ee7)
+++ plugins/tokenizers/mecab.c    2018-05-08 12:49:05 +0900 (7a1e80eaa)
@@ -1,6 +1,6 @@
 /* -*- c-basic-offset: 2 -*- */
 /*
-  Copyright(C) 2009-2016 Brazil
+  Copyright(C) 2009-2018 Brazil
 
   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -191,7 +191,8 @@ chunked_tokenize_utf8(grn_ctx *ctx,
   const char *current;
   const char *last_delimiter;
   const char *string_end = string + string_bytes;
-  grn_encoding encoding = tokenizer->query->encoding;
+  grn_encoding encoding =
+    grn_tokenizer_query_get_encoding(ctx, tokenizer->query);
 
   if (string_bytes < grn_mecab_chunk_size_threshold) {
     return chunked_tokenize_utf8_chunk(ctx,
@@ -343,9 +344,6 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   grn_mecab_tokenizer *tokenizer;
   unsigned int normalizer_flags = 0;
   grn_tokenizer_query *query;
-  grn_obj *normalized_query;
-  const char *normalized_string;
-  unsigned int normalized_string_length;
 
   query = grn_tokenizer_query_open(ctx, nargs, args, normalizer_flags);
   if (!query) {
@@ -366,15 +364,18 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
     return NULL;
   }
 
-  if (query->encoding != sole_mecab_encoding) {
-    grn_tokenizer_query_close(ctx, query);
-    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
-                     "[tokenizer][mecab] "
-                     "MeCab dictionary charset (%s) does not match "
-                     "the table encoding: <%s>",
-                     grn_encoding_to_string(sole_mecab_encoding),
-                     grn_encoding_to_string(query->encoding));
-    return NULL;
+  {
+    grn_encoding encoding = grn_tokenizer_query_get_encoding(ctx, query);
+    if (encoding != sole_mecab_encoding) {
+      grn_tokenizer_query_close(ctx, query);
+      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                       "[tokenizer][mecab] "
+                       "MeCab dictionary charset (%s) does not match "
+                       "the table encoding: <%s>",
+                       grn_encoding_to_string(sole_mecab_encoding),
+                       grn_encoding_to_string(encoding));
+      return NULL;
+    }
   }
 
   if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) {
@@ -387,63 +388,68 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   tokenizer->mecab = sole_mecab;
   tokenizer->query = query;
 
-  normalized_query = query->normalized_query;
-  grn_string_get_normalized(ctx,
-                            normalized_query,
-                            &normalized_string,
-                            &normalized_string_length,
-                            NULL);
-  GRN_TEXT_INIT(&(tokenizer->buf), 0);
-  if (query->have_tokenized_delimiter) {
-    tokenizer->next = normalized_string;
-    tokenizer->end = tokenizer->next + normalized_string_length;
-  } else if (normalized_string_length == 0) {
-    tokenizer->next = "";
-    tokenizer->end = tokenizer->next;
-  } else {
-    grn_bool succeeded;
-    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
-    if (grn_mecab_chunked_tokenize_enabled &&
-        ctx->encoding == GRN_ENC_UTF8) {
-      succeeded = chunked_tokenize_utf8(ctx,
-                                        tokenizer,
-                                        normalized_string,
-                                        normalized_string_length);
+  {
+    grn_obj *string;
+    const char *normalized_string;
+    unsigned int normalized_string_length;
+
+    string = grn_tokenizer_query_get_normalized_string(ctx, query);
+    grn_string_get_normalized(ctx,
+                              string,
+                              &normalized_string,
+                              &normalized_string_length,
+                              NULL);
+    GRN_TEXT_INIT(&(tokenizer->buf), 0);
+    if (grn_tokenizer_query_have_tokenized_delimiter(ctx, query)) {
+      tokenizer->next = normalized_string;
+      tokenizer->end = tokenizer->next + normalized_string_length;
+    } else if (normalized_string_length == 0) {
+      tokenizer->next = "";
+      tokenizer->end = tokenizer->next;
     } else {
-      const char *s;
-      s = mecab_sparse_tostr2(tokenizer->mecab,
-                              normalized_string,
-                              normalized_string_length);
-      if (!s) {
-        succeeded = GRN_FALSE;
-        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
-                         "[tokenizer][mecab] "
-                         "mecab_sparse_tostr() failed len=%d err=%s",
-                         normalized_string_length,
-                         mecab_strerror(tokenizer->mecab));
+      grn_bool succeeded;
+      grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
+      if (grn_mecab_chunked_tokenize_enabled && ctx->encoding == GRN_ENC_UTF8) {
+        succeeded = chunked_tokenize_utf8(ctx,
+                                          tokenizer,
+                                          normalized_string,
+                                          normalized_string_length);
       } else {
-        succeeded = GRN_TRUE;
-        GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s);
+        const char *s;
+        s = mecab_sparse_tostr2(tokenizer->mecab,
+                                normalized_string,
+                                normalized_string_length);
+        if (!s) {
+          succeeded = GRN_FALSE;
+          GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                           "[tokenizer][mecab] "
+                           "mecab_sparse_tostr() failed len=%d err=%s",
+                           normalized_string_length,
+                           mecab_strerror(tokenizer->mecab));
+        } else {
+          succeeded = GRN_TRUE;
+          GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s);
+        }
+      }
+      grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
+      if (!succeeded) {
+        grn_tokenizer_query_close(ctx, tokenizer->query);
+        GRN_PLUGIN_FREE(ctx, tokenizer);
+        return NULL;
+      }
+      {
+        char *buf, *p;
+        unsigned int bufsize;
+
+        buf = GRN_TEXT_VALUE(&(tokenizer->buf));
+        bufsize = GRN_TEXT_LEN(&(tokenizer->buf));
+        /* A certain version of mecab returns trailing lf or spaces. */
+        for (p = buf + bufsize - 2;
+             buf <= p && isspace(*(unsigned char *)p);
+             p--) { *p = '\0'; }
+        tokenizer->next = buf;
+        tokenizer->end = p + 1;
       }
-    }
-    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
-    if (!succeeded) {
-      grn_tokenizer_query_close(ctx, tokenizer->query);
-      GRN_PLUGIN_FREE(ctx, tokenizer);
-      return NULL;
-    }
-    {
-      char *buf, *p;
-      unsigned int bufsize;
-
-      buf = GRN_TEXT_VALUE(&(tokenizer->buf));
-      bufsize = GRN_TEXT_LEN(&(tokenizer->buf));
-      /* A certain version of mecab returns trailing lf or spaces. */
-      for (p = buf + bufsize - 2;
-           buf <= p && isspace(*(unsigned char *)p);
-           p--) { *p = '\0'; }
-      tokenizer->next = buf;
-      tokenizer->end = p + 1;
     }
   }
   user_data->ptr = tokenizer;