[Groonga-commit] groonga/groonga at b798908 [master] TokenMecab: add "chunked_tokenize" and "chunk_size_threshold" options


Kouhei Sutou null+****@clear*****
Thu Aug 9 18:17:47 JST 2018


Kouhei Sutou	2018-08-09 18:17:47 +0900 (Thu, 09 Aug 2018)

  New Revision: b798908e73a34f5beaa6be9354e56cadcf0292e6
  https://github.com/groonga/groonga/commit/b798908e73a34f5beaa6be9354e56cadcf0292e6

  Message:
    TokenMecab: add "chunked_tokenize" and "chunk_size_threshold" options

  Added files:
    test/command/suite/tokenizers/mecab/options/chunk_size_threshold.expected
    test/command/suite/tokenizers/mecab/options/chunk_size_threshold.reject
    test/command/suite/tokenizers/mecab/options/chunk_size_threshold.test
  Modified files:
    plugins/tokenizers/mecab.c
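
  With this change, both options can be given per tokenizer instance
  instead of relying only on the process-wide defaults
  (grn_mecab_chunked_tokenize_enabled and the 8192-byte
  grn_mecab_chunk_size_threshold). A minimal usage sketch, taken from
  the test added in this commit (the 30-byte threshold is just a small
  value chosen to force chunking):

    tokenize \
      'TokenMecab("chunked_tokenize", true, "chunk_size_threshold", 30)' \
      '日本のエンジンとエンジン'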

  Modified: plugins/tokenizers/mecab.c (+127 -49)
===================================================================
--- plugins/tokenizers/mecab.c    2018-08-08 18:25:13 +0900 (d392facce)
+++ plugins/tokenizers/mecab.c    2018-08-09 18:17:47 +0900 (a384bf257)
@@ -1,6 +1,7 @@
 /* -*- c-basic-offset: 2 -*- */
 /*
   Copyright(C) 2009-2018 Brazil
+  Copyright(C) 2018 Kouhei Sutou <kou@clear-code.com>
 
   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -38,15 +39,20 @@ static grn_plugin_mutex *sole_mecab_mutex = NULL;
 static grn_encoding sole_mecab_encoding = GRN_ENC_NONE;
 
 static grn_bool grn_mecab_chunked_tokenize_enabled = GRN_FALSE;
-static int grn_mecab_chunk_size_threshold = 8192;
+static int32_t grn_mecab_chunk_size_threshold = 8192;
 
 typedef struct {
+  grn_bool chunked_tokenize;
+  int32_t chunk_size_threshold;
+} grn_mecab_tokenizer_options;
+
+typedef struct {
+  grn_mecab_tokenizer_options *options;
   mecab_t *mecab;
   grn_obj buf;
   const char *next;
   const char *end;
   grn_tokenizer_query *query;
-  grn_tokenizer_token token;
 } grn_mecab_tokenizer;
 
 static const char *
@@ -93,6 +99,62 @@ get_mecab_encoding(mecab_t *mecab)
   return encoding;
 }
 
+static void
+mecab_tokenizer_options_init(grn_mecab_tokenizer_options *options)
+{
+  options->chunked_tokenize = grn_mecab_chunked_tokenize_enabled;
+  options->chunk_size_threshold = grn_mecab_chunk_size_threshold;
+}
+
+static void *
+mecab_tokenizer_options_open(grn_ctx *ctx,
+                             grn_obj *lexicon,
+                             grn_obj *raw_options,
+                             void *user_data)
+{
+  grn_mecab_tokenizer_options *options;
+
+  options = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer_options));
+  if (!options) {
+    GRN_PLUGIN_ERROR(ctx,
+                     GRN_NO_MEMORY_AVAILABLE,
+                     "[tokenizer][mecab] "
+                     "failed to allocate memory for options");
+    return NULL;
+  }
+
+  mecab_tokenizer_options_init(options);
+
+  GRN_OPTION_VALUES_EACH_BEGIN(ctx, raw_options, i, name, name_length) {
+    grn_raw_string name_raw;
+    name_raw.value = name;
+    name_raw.length = name_length;
+
+    if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "chunked_tokenize")) {
+      options->chunked_tokenize =
+        grn_vector_get_element_bool(ctx,
+                                    raw_options,
+                                    i,
+                                    options->chunked_tokenize);
+    } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "chunk_size_threshold")) {
+      options->chunk_size_threshold =
+        grn_vector_get_element_int32(ctx,
+                                     raw_options,
+                                     i,
+                                     options->chunk_size_threshold);
+    }
+  } GRN_OPTION_VALUES_EACH_END();
+
+  return options;
+}
+
+static void
+mecab_tokenizer_options_close(grn_ctx *ctx, void *data)
+{
+  grn_mecab_tokenizer_options *options = data;
+  GRN_PLUGIN_FREE(ctx, options);
+}
+
 static grn_inline grn_bool
 is_delimiter_character(grn_ctx *ctx, const char *character, int character_bytes)
 {
@@ -196,7 +258,7 @@ chunked_tokenize_utf8(grn_ctx *ctx,
   grn_encoding encoding =
     grn_tokenizer_query_get_encoding(ctx, tokenizer->query);
 
-  if (string_bytes < grn_mecab_chunk_size_threshold) {
+  if (string_bytes < tokenizer->options->chunk_size_threshold) {
     return chunked_tokenize_utf8_chunk(ctx,
                                        tokenizer,
                                        string,
@@ -243,7 +305,7 @@ chunked_tokenize_utf8(grn_ctx *ctx,
       last_delimiter = current;
     }
 
-    if ((current - chunk_start) >= grn_mecab_chunk_size_threshold) {
+    if ((current - chunk_start) >= tokenizer->options->chunk_size_threshold) {
       grn_bool succeeded;
       if (last_delimiter) {
         succeeded = chunked_tokenize_utf8_chunk(ctx,
@@ -353,17 +415,32 @@ mecab_create(grn_ctx *ctx)
   The return value of this function is ignored. When an error occurs in this
   function, `ctx->rc' is overwritten with an error code (not GRN_SUCCESS).
  */
-static grn_obj *
-mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
+static void *
+mecab_init(grn_ctx *ctx, grn_tokenizer_query *query)
 {
+  grn_obj *lexicon;
   grn_mecab_tokenizer *tokenizer;
-  unsigned int normalizer_flags = 0;
-  grn_tokenizer_query *query;
 
-  query = grn_tokenizer_query_open(ctx, nargs, args, normalizer_flags);
-  if (!query) {
+  lexicon = grn_tokenizer_query_get_lexicon(ctx, query);
+
+  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) {
+    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+                     "[tokenizer][mecab] "
+                     "memory allocation to grn_mecab_tokenizer failed");
     return NULL;
   }
+
+  tokenizer->options =
+    grn_table_cache_default_tokenizer_options(ctx,
+                                              lexicon,
+                                              mecab_tokenizer_options_open,
+                                              mecab_tokenizer_options_close,
+                                              NULL);
+  if (ctx->rc != GRN_SUCCESS) {
+    GRN_PLUGIN_FREE(ctx, tokenizer);
+    return NULL;
+  }
+
   if (!sole_mecab) {
     grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
     if (!sole_mecab) {
@@ -375,14 +452,14 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
     grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
   }
   if (!sole_mecab) {
-    grn_tokenizer_query_close(ctx, query);
+    GRN_PLUGIN_FREE(ctx, tokenizer);
     return NULL;
   }
 
   {
     grn_encoding encoding = grn_tokenizer_query_get_encoding(ctx, query);
     if (encoding != sole_mecab_encoding) {
-      grn_tokenizer_query_close(ctx, query);
+      GRN_PLUGIN_FREE(ctx, tokenizer);
       GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                        "[tokenizer][mecab] "
                        "MeCab dictionary charset (%s) does not match "
@@ -393,13 +470,6 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
     }
   }
 
-  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) {
-    grn_tokenizer_query_close(ctx, query);
-    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
-                     "[tokenizer][mecab] "
-                     "memory allocation to grn_mecab_tokenizer failed");
-    return NULL;
-  }
   tokenizer->mecab = sole_mecab;
   tokenizer->query = query;
 
@@ -424,7 +494,8 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
     } else {
       grn_bool succeeded;
       grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
-      if (grn_mecab_chunked_tokenize_enabled && ctx->encoding == GRN_ENC_UTF8) {
+      if (tokenizer->options->chunked_tokenize &&
+          ctx->encoding == GRN_ENC_UTF8) {
         succeeded = chunked_tokenize_utf8(ctx,
                                           tokenizer,
                                           normalized_string,
@@ -448,7 +519,6 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
       }
       grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
       if (!succeeded) {
-        grn_tokenizer_query_close(ctx, tokenizer->query);
         GRN_PLUGIN_FREE(ctx, tokenizer);
         return NULL;
       }
@@ -467,30 +537,40 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
       }
     }
   }
-  user_data->ptr = tokenizer;
-
-  grn_tokenizer_token_init(ctx, &(tokenizer->token));
 
-  return NULL;
+  return tokenizer;
 }
 
 /*
   This function returns tokens one by one.
  */
-static grn_obj *
-mecab_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
+static void
+mecab_next(grn_ctx *ctx,
+           grn_tokenizer_query *query,
+           grn_token *token,
+           void *user_data)
 {
-  /* grn_obj *table = args[0]; */
-  grn_mecab_tokenizer *tokenizer = user_data->ptr;
+  grn_mecab_tokenizer *tokenizer = user_data;
   grn_encoding encoding = tokenizer->query->encoding;
 
   if (tokenizer->query->have_tokenized_delimiter) {
+    grn_tokenizer_token tokenizer_token;
+    grn_tokenizer_token_init(ctx, &tokenizer_token);
+    /* TODO: Need grn_token version. */
     tokenizer->next =
       grn_tokenizer_tokenized_delimiter_next(ctx,
-                                             &(tokenizer->token),
+                                             &tokenizer_token,
                                              tokenizer->next,
                                              tokenizer->end - tokenizer->next,
                                              encoding);
+    grn_token_set_data(ctx,
+                       token,
+                       GRN_TEXT_VALUE(&(tokenizer_token.str)),
+                       GRN_TEXT_LEN(&(tokenizer_token.str)));
+    grn_token_set_status(ctx,
+                         token,
+                         GRN_UINT32_VALUE(&(tokenizer_token.status)));
+    grn_tokenizer_token_fin(ctx, &tokenizer_token);
   } else {
     size_t cl;
     const char *p = tokenizer->next, *r;
@@ -523,31 +603,27 @@ mecab_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
     }
 
     if (r == e || tokenizer->next == e) {
-      status = GRN_TOKENIZER_LAST;
+      status = GRN_TOKEN_LAST;
     } else {
-      status = GRN_TOKENIZER_CONTINUE;
+      status = GRN_TOKEN_CONTINUE;
     }
-    grn_tokenizer_token_push(ctx, &(tokenizer->token), p, r - p, status);
+    grn_token_set_data(ctx, token, p, r - p);
+    grn_token_set_status(ctx, token, status);
   }
-
-  return NULL;
 }
 
 /*
   This function finalizes a tokenization.
  */
-static grn_obj *
-mecab_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
+static void
+mecab_fin(grn_ctx *ctx, void *user_data)
 {
-  grn_mecab_tokenizer *tokenizer = user_data->ptr;
+  grn_mecab_tokenizer *tokenizer = user_data;
   if (!tokenizer) {
-    return NULL;
+    return;
   }
-  grn_tokenizer_token_fin(ctx, &(tokenizer->token));
-  grn_tokenizer_query_close(ctx, tokenizer->query);
   grn_obj_unlink(ctx, &(tokenizer->buf));
   GRN_PLUGIN_FREE(ctx, tokenizer);
-  return NULL;
 }
 
 static void
@@ -635,15 +711,17 @@ GRN_PLUGIN_INIT(grn_ctx *ctx)
 grn_rc
 GRN_PLUGIN_REGISTER(grn_ctx *ctx)
 {
-  grn_rc rc;
+  grn_rc rc = GRN_SUCCESS;
+  grn_obj *tokenizer;
+
+  tokenizer = grn_tokenizer_create(ctx, "TokenMecab", -1);
+  if (tokenizer) {
+    grn_tokenizer_set_init_func(ctx, tokenizer, mecab_init);
+    grn_tokenizer_set_next_func(ctx, tokenizer, mecab_next);
+    grn_tokenizer_set_fin_func(ctx, tokenizer, mecab_fin);
 
-  rc = grn_tokenizer_register(ctx, "TokenMecab", 10,
-                              mecab_init, mecab_next, mecab_fin);
-  if (rc == GRN_SUCCESS) {
-    grn_obj *token_mecab;
-    token_mecab = grn_ctx_get(ctx, "TokenMecab", 10);
     /* Just for backward compatibility. TokenMecab was built-in not plugin. */
-    if (token_mecab && grn_obj_id(ctx, token_mecab) != GRN_DB_MECAB) {
+    if (grn_obj_id(ctx, tokenizer) != GRN_DB_MECAB) {
       rc = GRN_FILE_CORRUPT;
     }
   }
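
  Because the parsed options are cached per lexicon via
  grn_table_cache_default_tokenizer_options(), they should also apply when
  the tokenizer is set as a table's default tokenizer. A hypothetical
  sketch (the table name and normalizer are made up for illustration):

    table_create Terms TABLE_PAT_KEY ShortText \
      --default_tokenizer 'TokenMecab("chunked_tokenize", true, "chunk_size_threshold", 30)' \
      --normalizer NormalizerAuto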

  Added: test/command/suite/tokenizers/mecab/options/chunk_size_threshold.expected (+40 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/options/chunk_size_threshold.expected    2018-08-09 18:17:47 +0900 (03d7061e9)
@@ -0,0 +1,40 @@
+tokenize   'TokenMecab("chunked_tokenize", true, "chunk_size_threshold", 30)'   '日本のエンジンとエンジン'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "日本",
+      "position": 0,
+      "force_prefix": false
+    },
+    {
+      "value": "の",
+      "position": 1,
+      "force_prefix": false
+    },
+    {
+      "value": "エンジン",
+      "position": 2,
+      "force_prefix": false
+    },
+    {
+      "value": "と",
+      "position": 3,
+      "force_prefix": false
+    },
+    {
+      "value": "エン",
+      "position": 4,
+      "force_prefix": false
+    },
+    {
+      "value": "ジン",
+      "position": 5,
+      "force_prefix": false
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/mecab/options/chunk_size_threshold.reject (+35 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/options/chunk_size_threshold.reject    2018-08-09 18:17:47 +0900 (aa70e92e7)
@@ -0,0 +1,35 @@
+tokenize   'TokenMecab("chunked_tokenize", false, "chunk_size_threshold", 30)'   '日本のエンジンとエンジン'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "日本",
+      "position": 0,
+      "force_prefix": false
+    },
+    {
+      "value": "の",
+      "position": 1,
+      "force_prefix": false
+    },
+    {
+      "value": "エンジン",
+      "position": 2,
+      "force_prefix": false
+    },
+    {
+      "value": "と",
+      "position": 3,
+      "force_prefix": false
+    },
+    {
+      "value": "エンジン",
+      "position": 4,
+      "force_prefix": false
+    }
+  ]
+]
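
  For reference: the input 日本のエンジンとエンジン is 12 characters × 3 bytes
  = 36 bytes in UTF-8, which exceeds the 30-byte threshold, so chunked
  tokenization splits the text after 10 characters (30 bytes) and cuts
  through the trailing エンジン; the .expected output therefore has it as two
  tokens, エン and ジン. The .reject output above, produced with
  "chunked_tokenize", false, keeps エンジン as a single token.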

  Added: test/command/suite/tokenizers/mecab/options/chunk_size_threshold.test (+5 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/options/chunk_size_threshold.test    2018-08-09 18:17:47 +0900 (595f1ca86)
@@ -0,0 +1,5 @@
+#@on-error omit
+tokenize \
+  'TokenMecab("chunked_tokenize", true, "chunk_size_threshold", 30)' \
+  '日本のエンジンとエンジン'
+#@on-error default