[Groonga-commit] groonga/groonga at 29e9074 [master] TokenMecab: add support for GET mode in loose_reading

Back to archive index

Kouhei Sutou null+****@clear*****
Tue Sep 11 11:06:49 JST 2018

Kouhei Sutou	2018-09-11 11:06:49 +0900 (Tue, 11 Sep 2018)

  Revision: 29e90749d9ecc4726db4246194d8dc5ff47510aa

    TokenMecab: add support for GET mode in loose_reading

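  Added files:
    test/command/suite/tokenizers/mecab/options/loose_reading_get.expected
    test/command/suite/tokenizers/mecab/options/loose_reading_get.test
  Modified files:
    plugins/tokenizers/mecab.c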

  Modified: plugins/tokenizers/mecab.c (+244 -207)
--- plugins/tokenizers/mecab.c    2018-09-11 10:04:33 +0900 (6ab3a1d3a)
+++ plugins/tokenizers/mecab.c    2018-09-11 11:06:49 +0900 (ba67d6ece)
@@ -62,6 +62,7 @@ typedef struct {
   const char *next;
   const char *end;
   grn_tokenizer_query *query;
+  grn_obj feature_locations;
   struct {
     grn_bool ing;
     grn_bool need;
@@ -570,137 +571,6 @@ mecab_init_mecab(grn_ctx *ctx, grn_mecab_tokenizer *tokenizer)
-/*
-  This function is called for a full text search query or a document to be
-  indexed. This means that both short/long strings are given.
-  The return value of this function is ignored. When an error occurs in this
-  function, `ctx->rc' is overwritten with an error code (not GRN_SUCCESS).
- */
-static void *
-mecab_init(grn_ctx *ctx, grn_tokenizer_query *query)
-  grn_obj *lexicon;
-  grn_mecab_tokenizer *tokenizer;
-  lexicon = grn_tokenizer_query_get_lexicon(ctx, query);
-  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) {
-                     "[tokenizer][mecab] "
-                     "memory allocation to grn_mecab_tokenizer failed");
-    return NULL;
-  }
-  tokenizer->options =
-    grn_table_cache_default_tokenizer_options(ctx,
-                                              lexicon,
-                                              mecab_tokenizer_options_open,
-                                              mecab_tokenizer_options_close,
-                                              NULL);
-  if (ctx->rc != GRN_SUCCESS) {
-    GRN_PLUGIN_FREE(ctx, tokenizer);
-    return NULL;
-  }
-  mecab_init_mecab(ctx, tokenizer);
-  if (!tokenizer->mecab->mecab) {
-    GRN_PLUGIN_FREE(ctx, tokenizer);
-    return NULL;
-  }
-  {
-    grn_encoding encoding = grn_tokenizer_query_get_encoding(ctx, query);
-    if (encoding != tokenizer->mecab->encoding) {
-      GRN_PLUGIN_FREE(ctx, tokenizer);
-                       "[tokenizer][mecab] "
-                       "MeCab dictionary charset (%s) does not match "
-                       "the table encoding: <%s>",
-                       grn_encoding_to_string(tokenizer->mecab->encoding),
-                       grn_encoding_to_string(encoding));
-      return NULL;
-    }
-  }
-  tokenizer->query = query;
-  {
-    grn_obj *string;
-    const char *normalized_string;
-    unsigned int normalized_string_length;
-    string = grn_tokenizer_query_get_normalized_string(ctx, query);
-    grn_string_get_normalized(ctx,
-                              string,
-                              &normalized_string,
-                              &normalized_string_length,
-                              NULL);
-    GRN_TEXT_INIT(&(tokenizer->buf), 0);
-    if (grn_tokenizer_query_have_tokenized_delimiter(ctx, query)) {
-      tokenizer->next = normalized_string;
-      tokenizer->end = tokenizer->next + normalized_string_length;
-    } else if (normalized_string_length == 0) {
-      tokenizer->next = "";
-      tokenizer->end = tokenizer->next;
-    } else {
-      grn_bool succeeded;
-      grn_plugin_mutex_lock(ctx, tokenizer->mecab->mutex);
-      if (tokenizer->options->chunked_tokenize &&
-          ctx->encoding == GRN_ENC_UTF8) {
-        succeeded = chunked_tokenize_utf8(ctx,
-                                          tokenizer,
-                                          normalized_string,
-                                          normalized_string_length);
-      } else {
-        const char *s;
-        s = mecab_sparse_tostr2(tokenizer->mecab->mecab,
-                                normalized_string,
-                                normalized_string_length);
-        if (!s) {
-          succeeded = GRN_FALSE;
-                           "[tokenizer][mecab] "
-                           "mecab_sparse_tostr() failed len=%d err=%s",
-                           normalized_string_length,
-                           mecab_strerror(tokenizer->mecab->mecab));
-        } else {
-          succeeded = GRN_TRUE;
-          GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s);
-        }
-      }
-      grn_plugin_mutex_unlock(ctx, tokenizer->mecab->mutex);
-      if (!succeeded) {
-        GRN_PLUGIN_FREE(ctx, tokenizer);
-        return NULL;
-      }
-      if (mecab_tokenizer_options_need_default_output(tokenizer->options)) {
-        tokenizer->next = GRN_TEXT_VALUE(&(tokenizer->buf));
-        tokenizer->end = tokenizer->next + GRN_TEXT_LEN(&(tokenizer->buf));
-      } else {
-        char *buf, *p;
-        unsigned int bufsize;
-        buf = GRN_TEXT_VALUE(&(tokenizer->buf));
-        bufsize = GRN_TEXT_LEN(&(tokenizer->buf));
-        /* A certain version of mecab returns trailing lf or spaces. */
-        for (p = buf + bufsize - 2;
-             buf <= p && isspace(*(unsigned char *)p);
-             p--) { *p = '\0'; }
-        tokenizer->next = buf;
-        tokenizer->end = p + 1;
-      }
-    }
-  }
-  tokenizer->loose.ing = GRN_FALSE;
-  tokenizer->loose.need = GRN_FALSE;
-  tokenizer->loose.need_end_mark = GRN_FALSE;
-  GRN_TEXT_INIT(&(tokenizer->loose.readings), GRN_OBJ_VECTOR);
-  tokenizer->loose.offset = 0;
-  return tokenizer;
 static void
 mecab_next_default_format_skip_eos(grn_ctx *ctx,
                                    grn_mecab_tokenizer *tokenizer)
@@ -723,18 +593,18 @@ mecab_next_default_format_skip_eos(grn_ctx *ctx,
 typedef struct {
   grn_token *token;
-  grn_obj *features;
+  grn_obj *feature_locations;
   grn_bool ignore_empty_value;
   grn_bool ignore_asterisk_value;
 } add_feature_data;
 static size_t
 mecab_get_feature(grn_ctx *ctx,
-                  grn_obj *features,
+                  grn_obj *feature_locations,
                   size_t i,
                   const char **value)
-  size_t n_locations = GRN_BULK_VSIZE(features) / sizeof(uint64_t);
+  size_t n_locations = GRN_BULK_VSIZE(feature_locations) / sizeof(uint64_t);
   const char *start;
   const char *end;
@@ -743,8 +613,8 @@ mecab_get_feature(grn_ctx *ctx,
     return 0;
-  start = (const char *)(GRN_UINT64_VALUE_AT(features, i));
-  end = ((const char *)(GRN_UINT64_VALUE_AT(features, i + 1))) - 1;
+  start = (const char *)(GRN_UINT64_VALUE_AT(feature_locations, i));
+  end = ((const char *)(GRN_UINT64_VALUE_AT(feature_locations, i + 1))) - 1;
   *value = start;
   return end - start;
@@ -756,12 +626,12 @@ mecab_next_default_format_add_feature(grn_ctx *ctx,
                                       size_t i)
   grn_token *token = data->token;
-  grn_obj *features = data->features;
+  grn_obj *feature_locations = data->feature_locations;
   const char *feature = NULL;
   size_t feature_length;
   grn_obj value;
-  feature_length = mecab_get_feature(ctx, features, i, &feature);
+  feature_length = mecab_get_feature(ctx, feature_locations, i, &feature);
   if (data->ignore_empty_value && feature_length == 0) {
@@ -781,59 +651,34 @@ mecab_next_default_format_add_feature(grn_ctx *ctx,
   GRN_OBJ_FIN(ctx, &value);
-static void
-mecab_next_default_format(grn_ctx *ctx,
-                          grn_mecab_tokenizer *tokenizer,
-                          grn_token *token)
+static size_t
+mecab_next_default_format_consume_token(grn_ctx *ctx,
+                                        grn_mecab_tokenizer *tokenizer,
+                                        const char **surface_output)
   grn_encoding encoding = tokenizer->query->encoding;
+  grn_obj *feature_locations = &(tokenizer->feature_locations);
   const char *start;
   const char *current;
   const char *end = tokenizer->end;
+  const char *surface = NULL;
   int length = 0;
-  const char *surface;
   size_t surface_length = 0;
-  grn_obj features;
-  if (tokenizer->loose.ing && tokenizer->loose.need_end_mark) {
-    grn_tokenizer_status status = GRN_TOKEN_CONTINUE;
-    grn_token_set_data(ctx,
-                       token,
-                       GRN_TOKENIZER_END_MARK_UTF8,
-                       GRN_TOKENIZER_END_MARK_UTF8_LEN);
-    grn_token_set_status(ctx, token, status);
-    tokenizer->loose.need_end_mark = GRN_FALSE;
-    return;
-  }
-  if (tokenizer->loose.ing) {
-    grn_tokenizer_status status = GRN_TOKEN_CONTINUE;
-    const char *reading = NULL;
-    unsigned int reading_length;
-    if (tokenizer->loose.offset + 1 ==
-        grn_vector_size(ctx, &(tokenizer->loose.readings))) {
-      status = GRN_TOKEN_LAST;
-    }
-    reading_length = grn_vector_get_element(ctx,
-                                            &(tokenizer->loose.readings),
-                                            tokenizer->loose.offset,
-                                            &reading,
-                                            NULL,
-                                            NULL);
-    grn_token_set_data(ctx, token, reading, reading_length);
-    grn_token_set_status(ctx, token, status);
-    tokenizer->loose.offset++;
-    return;
+  if (surface_output) {
+    *surface_output = NULL;
+  GRN_BULK_REWIND(feature_locations);
   mecab_next_default_format_skip_eos(ctx, tokenizer);
   start = surface = tokenizer->next;
-  GRN_UINT64_INIT(&features, GRN_OBJ_VECTOR);
   for (current = start; current < end; current += length) {
     length = grn_charlen_(ctx, current, end, encoding);
     if (length == 0) {
-      break;
+      if (surface_output) {
+        *surface_output = NULL;
+      }
+      return 0;
     if (length == 1) {
@@ -841,7 +686,7 @@ mecab_next_default_format(grn_ctx *ctx,
         if (surface_length == 0) {
           surface_length = current - surface;
         } else {
-          GRN_UINT64_PUT(ctx, &features, current);
+          GRN_UINT64_PUT(ctx, feature_locations, current);
         if (current < end &&
@@ -854,7 +699,7 @@ mecab_next_default_format(grn_ctx *ctx,
         if (surface_length == 0) {
           surface_length = current - surface;
         } else {
-          GRN_UINT64_PUT(ctx, &features, current);
+          GRN_UINT64_PUT(ctx, feature_locations, current);
@@ -865,22 +710,94 @@ mecab_next_default_format(grn_ctx *ctx,
       if (length == 1 && current[0] == '\t') {
         surface_length = current - surface;
         if (current + 1 < end) {
-          GRN_UINT64_PUT(ctx, &features, current + 1);
+          GRN_UINT64_PUT(ctx, feature_locations, current + 1);
     } else {
       if (length == 1 && current[0] == ',' && current + 1 < end) {
-        GRN_UINT64_PUT(ctx, &features, current + 1);
+        GRN_UINT64_PUT(ctx, feature_locations, current + 1);
   tokenizer->next = current;
   mecab_next_default_format_skip_eos(ctx, tokenizer);
+  if (tokenizer->options->loose_reading) {
+    const char *reading = NULL;
+    size_t reading_length;
+    reading_length = mecab_get_feature(ctx, feature_locations, 7, &reading);
+    if (reading_length > 0) {
+      tokenizer->loose.need = GRN_TRUE;
+      tokenizer->loose.need_end_mark = GRN_TRUE;
+      grn_vector_add_element(ctx,
+                             &(tokenizer->loose.readings),
+                             reading,
+                             reading_length,
+                             0,
+                             GRN_DB_TEXT);
+    } else {
+      grn_vector_add_element(ctx,
+                             &(tokenizer->loose.readings),
+                             surface,
+                             surface_length,
+                             0,
+                             GRN_DB_TEXT);
+    }
+  }
+  if (surface_output) {
+    *surface_output = surface;
+  }
+  return surface_length;
+static void
+mecab_next_default_format(grn_ctx *ctx,
+                          grn_mecab_tokenizer *tokenizer,
+                          grn_token *token)
+  const char *surface;
+  size_t surface_length = 0;
+  if (tokenizer->loose.ing && tokenizer->loose.need_end_mark) {
+    grn_tokenizer_status status = GRN_TOKEN_CONTINUE;
+    grn_token_set_data(ctx,
+                       token,
+                       GRN_TOKENIZER_END_MARK_UTF8,
+                       GRN_TOKENIZER_END_MARK_UTF8_LEN);
+    grn_token_set_status(ctx, token, status);
+    tokenizer->loose.need_end_mark = GRN_FALSE;
+    return;
+  }
+  if (tokenizer->loose.ing) {
+    grn_tokenizer_status status = GRN_TOKEN_CONTINUE;
+    const char *reading = NULL;
+    unsigned int reading_length;
+    if (tokenizer->loose.offset + 1 ==
+        grn_vector_size(ctx, &(tokenizer->loose.readings))) {
+      status = GRN_TOKEN_LAST;
+    }
+    reading_length = grn_vector_get_element(ctx,
+                                            &(tokenizer->loose.readings),
+                                            tokenizer->loose.offset,
+                                            &reading,
+                                            NULL,
+                                            NULL);
+    grn_token_set_data(ctx, token, reading, reading_length);
+    grn_token_set_status(ctx, token, status);
+    tokenizer->loose.offset++;
+    return;
+  }
+  surface_length = mecab_next_default_format_consume_token(ctx,
+                                                           tokenizer,
+                                                           &surface);
   grn_token_set_data(ctx, token, surface, surface_length);
     grn_tokenizer_status status;
-    if (current == end || tokenizer->next == end) {
+    if (tokenizer->next == tokenizer->end) {
       if (tokenizer->loose.need) {
         tokenizer->loose.ing = GRN_TRUE;
         status = GRN_TOKEN_CONTINUE;
@@ -895,7 +812,7 @@ mecab_next_default_format(grn_ctx *ctx,
   if (tokenizer->options->include_class) {
     add_feature_data data;
     data.token = token;
-    data.features = &features;
+    data.feature_locations = &(tokenizer->feature_locations);
     data.ignore_empty_value = GRN_TRUE;
     data.ignore_asterisk_value = GRN_TRUE;
     mecab_next_default_format_add_feature(ctx, &data, "class", 0);
@@ -906,7 +823,7 @@ mecab_next_default_format(grn_ctx *ctx,
   if (tokenizer->options->include_reading) {
     add_feature_data data;
     data.token = token;
-    data.features = &features;
+    data.feature_locations = &(tokenizer->feature_locations);
     data.ignore_empty_value = GRN_TRUE;
     data.ignore_asterisk_value = GRN_FALSE;
     mecab_next_default_format_add_feature(ctx, &data, "reading", 7);
@@ -914,36 +831,13 @@ mecab_next_default_format(grn_ctx *ctx,
   if (tokenizer->options->include_form) {
     add_feature_data data;
     data.token = token;
-    data.features = &features;
+    data.feature_locations = &(tokenizer->feature_locations);
     data.ignore_empty_value = GRN_TRUE;
     data.ignore_asterisk_value = GRN_TRUE;
     mecab_next_default_format_add_feature(ctx, &data, "inflected_type", 4);
     mecab_next_default_format_add_feature(ctx, &data, "inflected_form", 5);
     mecab_next_default_format_add_feature(ctx, &data, "base_form", 6);
-  if (tokenizer->options->loose_reading) {
-    const char *reading = NULL;
-    size_t reading_length;
-    reading_length = mecab_get_feature(ctx, &features, 7, &reading);
-    if (reading_length > 0) {
-      tokenizer->loose.need = GRN_TRUE;
-      tokenizer->loose.need_end_mark = GRN_TRUE;
-      grn_vector_add_element(ctx,
-                             &(tokenizer->loose.readings),
-                             reading,
-                             reading_length,
-                             0,
-                             GRN_DB_TEXT);
-    } else {
-      grn_vector_add_element(ctx,
-                             &(tokenizer->loose.readings),
-                             surface,
-                             surface_length,
-                             0,
-                             GRN_DB_TEXT);
-    }
-  }
-  GRN_OBJ_FIN(ctx, &features);
 static void
@@ -991,6 +885,148 @@ mecab_next_wakati_format(grn_ctx *ctx,
   grn_token_set_status(ctx, token, status);
+/*
+  This function is called for a full text search query or a document to be
+  indexed. This means that both short/long strings are given.
+  The return value of this function is ignored. When an error occurs in this
+  function, `ctx->rc' is overwritten with an error code (not GRN_SUCCESS).
+ */
+static void *
+mecab_init(grn_ctx *ctx, grn_tokenizer_query *query)
+  grn_obj *lexicon;
+  grn_mecab_tokenizer *tokenizer;
+  lexicon = grn_tokenizer_query_get_lexicon(ctx, query);
+  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) {
+                     "[tokenizer][mecab] "
+                     "memory allocation to grn_mecab_tokenizer failed");
+    return NULL;
+  }
+  tokenizer->options =
+    grn_table_cache_default_tokenizer_options(ctx,
+                                              lexicon,
+                                              mecab_tokenizer_options_open,
+                                              mecab_tokenizer_options_close,
+                                              NULL);
+  if (ctx->rc != GRN_SUCCESS) {
+    GRN_PLUGIN_FREE(ctx, tokenizer);
+    return NULL;
+  }
+  mecab_init_mecab(ctx, tokenizer);
+  if (!tokenizer->mecab->mecab) {
+    GRN_PLUGIN_FREE(ctx, tokenizer);
+    return NULL;
+  }
+  {
+    grn_encoding encoding = grn_tokenizer_query_get_encoding(ctx, query);
+    if (encoding != tokenizer->mecab->encoding) {
+      GRN_PLUGIN_FREE(ctx, tokenizer);
+                       "[tokenizer][mecab] "
+                       "MeCab dictionary charset (%s) does not match "
+                       "the table encoding: <%s>",
+                       grn_encoding_to_string(tokenizer->mecab->encoding),
+                       grn_encoding_to_string(encoding));
+      return NULL;
+    }
+  }
+  tokenizer->query = query;
+  {
+    grn_obj *string;
+    const char *normalized_string;
+    unsigned int normalized_string_length;
+    string = grn_tokenizer_query_get_normalized_string(ctx, query);
+    grn_string_get_normalized(ctx,
+                              string,
+                              &normalized_string,
+                              &normalized_string_length,
+                              NULL);
+    GRN_TEXT_INIT(&(tokenizer->buf), 0);
+    if (grn_tokenizer_query_have_tokenized_delimiter(ctx, query)) {
+      tokenizer->next = normalized_string;
+      tokenizer->end = tokenizer->next + normalized_string_length;
+    } else if (normalized_string_length == 0) {
+      tokenizer->next = "";
+      tokenizer->end = tokenizer->next;
+    } else {
+      grn_bool succeeded;
+      grn_plugin_mutex_lock(ctx, tokenizer->mecab->mutex);
+      if (tokenizer->options->chunked_tokenize &&
+          ctx->encoding == GRN_ENC_UTF8) {
+        succeeded = chunked_tokenize_utf8(ctx,
+                                          tokenizer,
+                                          normalized_string,
+                                          normalized_string_length);
+      } else {
+        const char *s;
+        s = mecab_sparse_tostr2(tokenizer->mecab->mecab,
+                                normalized_string,
+                                normalized_string_length);
+        if (!s) {
+          succeeded = GRN_FALSE;
+                           "[tokenizer][mecab] "
+                           "mecab_sparse_tostr() failed len=%d err=%s",
+                           normalized_string_length,
+                           mecab_strerror(tokenizer->mecab->mecab));
+        } else {
+          succeeded = GRN_TRUE;
+          GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s);
+        }
+      }
+      grn_plugin_mutex_unlock(ctx, tokenizer->mecab->mutex);
+      if (!succeeded) {
+        GRN_PLUGIN_FREE(ctx, tokenizer);
+        return NULL;
+      }
+      if (mecab_tokenizer_options_need_default_output(tokenizer->options)) {
+        tokenizer->next = GRN_TEXT_VALUE(&(tokenizer->buf));
+        tokenizer->end = tokenizer->next + GRN_TEXT_LEN(&(tokenizer->buf));
+      } else {
+        char *buf, *p;
+        unsigned int bufsize;
+        buf = GRN_TEXT_VALUE(&(tokenizer->buf));
+        bufsize = GRN_TEXT_LEN(&(tokenizer->buf));
+        /* A certain version of mecab returns trailing lf or spaces. */
+        for (p = buf + bufsize - 2;
+             buf <= p && isspace(*(unsigned char *)p);
+             p--) { *p = '\0'; }
+        tokenizer->next = buf;
+        tokenizer->end = p + 1;
+      }
+    }
+  }
+  GRN_UINT64_INIT(&(tokenizer->feature_locations), GRN_OBJ_VECTOR);
+  tokenizer->loose.ing = GRN_FALSE;
+  tokenizer->loose.need = GRN_FALSE;
+  tokenizer->loose.need_end_mark = GRN_FALSE;
+  GRN_TEXT_INIT(&(tokenizer->loose.readings), GRN_OBJ_VECTOR);
+  tokenizer->loose.offset = 0;
+  if (tokenizer->options->loose_reading &&
+      grn_tokenizer_query_get_mode(ctx, tokenizer->query) == GRN_TOKEN_GET) {
+    while (mecab_next_default_format_consume_token(ctx, tokenizer, NULL) > 0) {
+      /* Do nothing */
+    }
+    tokenizer->loose.ing = GRN_TRUE;
+    tokenizer->loose.need = GRN_TRUE;
+    tokenizer->loose.need_end_mark = GRN_FALSE;
+  }
+  return tokenizer;
   This function returns tokens one by one.
@@ -1040,6 +1076,7 @@ mecab_fin(grn_ctx *ctx, void *user_data)
   GRN_OBJ_FIN(ctx, &(tokenizer->loose.readings));
+  GRN_OBJ_FIN(ctx, &(tokenizer->feature_locations));
   GRN_OBJ_FIN(ctx, &(tokenizer->buf));
   GRN_PLUGIN_FREE(ctx, tokenizer);

  Added: test/command/suite/tokenizers/mecab/options/loose_reading_get.expected (+35 -0) 100644
--- /dev/null
+++ test/command/suite/tokenizers/mecab/options/loose_reading_get.expected    2018-09-11 11:06:49 +0900 (40824322b)
@@ -0,0 +1,35 @@
+tokenize   'TokenMecab("loose_reading", true)'   '焼き肉と焼肉とyakiniku'   --mode GET
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "ヤキニク",
+      "position": 0,
+      "force_prefix": false
+    },
+    {
+      "value": "ト",
+      "position": 1,
+      "force_prefix": false
+    },
+    {
+      "value": "ヤキニク",
+      "position": 2,
+      "force_prefix": false
+    },
+    {
+      "value": "ト",
+      "position": 3,
+      "force_prefix": false
+    },
+    {
+      "value": "yakiniku",
+      "position": 4,
+      "force_prefix": false
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/mecab/options/loose_reading_get.test (+6 -0) 100644
--- /dev/null
+++ test/command/suite/tokenizers/mecab/options/loose_reading_get.test    2018-09-11 11:06:49 +0900 (3efcbdcfc)
@@ -0,0 +1,6 @@
+#@on-error omit
+tokenize \
+  'TokenMecab("loose_reading", true)' \
+  '焼き肉と焼肉とyakiniku' \
+  --mode GET
+#@on-error default
-------------- next part --------------
URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180911/716b05cd/attachment-0001.htm 

More information about the Groonga-commit mailing list
Back to archive index