groonga/groonga at a10b493 [master] TokenMecab: add target_class option (Groonga-commit) - Groonga - fulltext search engine.

Kouhei Sutou	2018-10-18 17:30:23 +0900 (Thu, 18 Oct 2018)

  Revision: a10b493b0318af2c5cca519272f8266df86ab88e
  https://github.com/groonga/groonga/commit/a10b493b0318af2c5cca519272f8266df86ab88e

  Message:
    TokenMecab: add target_class option
    
    TODO: Test the case where multiple target_class options are specified.

  Added files:
    test/command/suite/tokenizers/mecab/options/target_class/one.expected
    test/command/suite/tokenizers/mecab/options/target_class/one.test
  Modified files:
    plugins/tokenizers/mecab.c

  Modified: plugins/tokenizers/mecab.c (+121 -21)
===================================================================

--- plugins/tokenizers/mecab.c    2018-10-17 15:41:40 +0900 (7d022f05a)
+++ plugins/tokenizers/mecab.c    2018-10-18 17:30:23 +0900 (536d6da0c)
@@ -46,6 +46,10 @@ static grn_mecab mecab_wakati;
 static grn_bool grn_mecab_chunked_tokenize_enabled = GRN_FALSE;
 static int32_t grn_mecab_chunk_size_threshold = 8192;
 
+static const size_t GRN_MECAB_FEATURE_LOCATION_CLASS = 0;
+static const size_t GRN_MECAB_FEATURE_LOCATION_SUBCLASS0 = 1;
+static const size_t GRN_MECAB_FEATURE_LOCATION_SUBCLASS1 = 2;
+static const size_t GRN_MECAB_FEATURE_LOCATION_SUBCLASS2 = 3;
 static const size_t GRN_MECAB_FEATURE_LOCATION_READING = 7;
 
 typedef struct {
@@ -55,6 +59,7 @@ typedef struct {
   grn_bool include_reading;
   grn_bool include_form;
   grn_bool use_reading;
+  grn_obj target_classes;
 } grn_mecab_tokenizer_options;
 
 typedef struct {
@@ -151,10 +156,12 @@ mecab_tokenizer_options_init(grn_mecab_tokenizer_options *options)
   options->include_reading = GRN_FALSE;
   options->include_form = GRN_FALSE;
   options->use_reading = GRN_FALSE;
+  GRN_TEXT_INIT(&(options->target_classes), GRN_OBJ_VECTOR);
 }
 
 static grn_bool
-mecab_tokenizer_options_need_default_output(grn_mecab_tokenizer_options *options)
+mecab_tokenizer_options_need_default_output(grn_ctx *ctx,
+                                            grn_mecab_tokenizer_options *options)
 {
   if (!options) {
     return GRN_FALSE;
@@ -176,6 +183,10 @@ mecab_tokenizer_options_need_default_output(grn_mecab_tokenizer_options *options
     return GRN_TRUE;
   }
 
+  if (grn_vector_size(ctx, &(options->target_classes)) > 0) {
+    return GRN_TRUE;
+  }
+
   return GRN_FALSE;
 }
 
@@ -239,6 +250,25 @@ mecab_tokenizer_options_open(grn_ctx *ctx,
                                     raw_options,
                                     i,
                                     options->use_reading);
+    } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "target_class")) {
+      const char *target_class = NULL;
+      unsigned int target_class_length;
+      grn_id domain;
+
+      target_class_length = grn_vector_get_element(ctx,
+                                                   raw_options,
+                                                   i,
+                                                   &target_class,
+                                                   NULL,
+                                                   &domain);
+      if (grn_type_id_is_text_family(ctx, domain) && target_class_length > 0) {
+        grn_vector_add_element(ctx,
+                               &(options->target_classes),
+                               target_class,
+                               target_class_length,
+                               0,
+                               GRN_DB_TEXT);
+      }
     }
   } GRN_OPTION_VALUES_EACH_END();
 
@@ -249,6 +279,7 @@ static void
 mecab_tokenizer_options_close(grn_ctx *ctx, void *data)
 {
   grn_mecab_tokenizer_options *options = data;
+  GRN_OBJ_FIN(ctx, &(options->target_classes));
   GRN_PLUGIN_FREE(ctx, options);
 }
 
@@ -446,7 +477,8 @@ mecab_create(grn_ctx *ctx,
   const char *tag;
   grn_bool need_default_output = GRN_FALSE;
 
-  need_default_output = mecab_tokenizer_options_need_default_output(options);
+  need_default_output =
+    mecab_tokenizer_options_need_default_output(ctx, options);
 
   if (need_default_output) {
     tag = "[default]";
@@ -548,7 +580,7 @@ mecab_create(grn_ctx *ctx,
 static void
 mecab_init_mecab(grn_ctx *ctx, grn_mecab_tokenizer *tokenizer)
 {
-  if (mecab_tokenizer_options_need_default_output(tokenizer->options)) {
+  if (mecab_tokenizer_options_need_default_output(ctx, tokenizer->options)) {
     tokenizer->mecab = &mecab_default;
   } else {
     tokenizer->mecab = &mecab_wakati;
@@ -724,6 +756,65 @@ mecab_next_default_format_consume_token(grn_ctx *ctx,
 }
 
 static void
+mecab_next_default_format_consume_needless_tokens(grn_ctx *ctx,
+                                                  grn_mecab_tokenizer *tokenizer)
+{
+  grn_obj *target_classes = &(tokenizer->options->target_classes);
+  unsigned int n_target_classes;
+  const char *last_next = tokenizer->next;
+  grn_bool is_target = GRN_FALSE;
+
+  n_target_classes = grn_vector_size(ctx, target_classes);
+
+  if (n_target_classes == 0) {
+    return;
+  }
+
+  while (tokenizer->next != tokenizer->end && !is_target) {
+    const char *surface = NULL;
+    size_t surface_length = 0;
+    unsigned int i;
+    grn_obj *feature_locations;
+    const char *class = NULL;
+    size_t class_length;
+
+    last_next = tokenizer->next;
+    surface_length = mecab_next_default_format_consume_token(ctx,
+                                                             tokenizer,
+                                                             &surface);
+
+    if (surface_length == 0) {
+      break;
+    }
+
+    feature_locations = &(tokenizer->feature_locations);
+    class_length = mecab_get_feature(ctx,
+                                     feature_locations,
+                                     GRN_MECAB_FEATURE_LOCATION_CLASS,
+                                     &class);
+    for (i = 0; i < n_target_classes; i++) {
+      const char *target_class;
+      unsigned int target_class_length;
+      target_class_length = grn_vector_get_element(ctx,
+                                                   target_classes,
+                                                   i,
+                                                   &target_class,
+                                                   NULL,
+                                                   NULL);
+      if (target_class_length == class_length &&
+          memcmp(target_class, class, target_class_length) == 0) {
+        is_target = GRN_TRUE;
+        break;
+      }
+    }
+  }
+
+  if (is_target) {
+    tokenizer->next = last_next;
+  }
+}
+
+static void
 mecab_next_default_format(grn_ctx *ctx,
                           grn_mecab_tokenizer *tokenizer,
                           grn_token *token)
@@ -750,28 +841,20 @@ mecab_next_default_format(grn_ctx *ctx,
   } else {
     grn_token_set_data(ctx, token, surface, surface_length);
   }
-  {
-    grn_tokenizer_status status;
-    if (surface_length == 0) {
-      /* Error */
-      status = GRN_TOKEN_LAST;
-    } else if (tokenizer->next == tokenizer->end) {
-      status = GRN_TOKEN_LAST;
-    } else {
-      status = GRN_TOKEN_CONTINUE;
-    }
-    grn_token_set_status(ctx, token, status);
-  }
   if (tokenizer->options->include_class) {
     add_feature_data data;
     data.token = token;
     data.feature_locations = &(tokenizer->feature_locations);
     data.ignore_empty_value = GRN_TRUE;
     data.ignore_asterisk_value = GRN_TRUE;
-    mecab_next_default_format_add_feature(ctx, &data, "class", 0);
-    mecab_next_default_format_add_feature(ctx, &data, "subclass0", 1);
-    mecab_next_default_format_add_feature(ctx, &data, "subclass1", 2);
-    mecab_next_default_format_add_feature(ctx, &data, "subclass2", 3);
+    mecab_next_default_format_add_feature(ctx, &data, "class",
+                                          GRN_MECAB_FEATURE_LOCATION_CLASS);
+    mecab_next_default_format_add_feature(ctx, &data, "subclass0",
+                                          GRN_MECAB_FEATURE_LOCATION_SUBCLASS0);
+    mecab_next_default_format_add_feature(ctx, &data, "subclass1",
+                                          GRN_MECAB_FEATURE_LOCATION_SUBCLASS1);
+    mecab_next_default_format_add_feature(ctx, &data, "subclass2",
+                                          GRN_MECAB_FEATURE_LOCATION_SUBCLASS2);
   }
   if (tokenizer->options->include_reading) {
     add_feature_data data;
@@ -794,6 +877,21 @@ mecab_next_default_format(grn_ctx *ctx,
     mecab_next_default_format_add_feature(ctx, &data, "inflected_form", 5);
     mecab_next_default_format_add_feature(ctx, &data, "base_form", 6);
   }
+  {
+    grn_tokenizer_status status;
+    if (surface_length == 0) {
+      /* Error */
+      status = GRN_TOKEN_LAST;
+    } else {
+      mecab_next_default_format_consume_needless_tokens(ctx, tokenizer);
+      if (tokenizer->next == tokenizer->end) {
+        status = GRN_TOKEN_LAST;
+      } else {
+        status = GRN_TOKEN_CONTINUE;
+      }
+    }
+    grn_token_set_status(ctx, token, status);
+  }
 }
 
 static void
@@ -944,7 +1042,7 @@ mecab_init(grn_ctx *ctx, grn_tokenizer_query *query)
         GRN_PLUGIN_FREE(ctx, tokenizer);
         return NULL;
       }
-      if (mecab_tokenizer_options_need_default_output(tokenizer->options)) {
+      if (mecab_tokenizer_options_need_default_output(ctx, tokenizer->options)) {
         tokenizer->next = GRN_TEXT_VALUE(&(tokenizer->buf));
         tokenizer->end = tokenizer->next + GRN_TEXT_LEN(&(tokenizer->buf));
       } else {
@@ -965,6 +1063,8 @@ mecab_init(grn_ctx *ctx, grn_tokenizer_query *query)
 
   GRN_UINT64_INIT(&(tokenizer->feature_locations), GRN_OBJ_VECTOR);
 
+  mecab_next_default_format_consume_needless_tokens(ctx, tokenizer);
+
   return tokenizer;
 }
 
@@ -998,7 +1098,7 @@ mecab_next(grn_ctx *ctx,
                          token,
                          GRN_UINT32_VALUE(&(tokenizer_token.status)));
     grn_tokenizer_token_fin(ctx, &tokenizer_token);
-  } else if (mecab_tokenizer_options_need_default_output(tokenizer->options)) {
+  } else if (mecab_tokenizer_options_need_default_output(ctx, tokenizer->options)) {
     mecab_next_default_format(ctx, tokenizer, token);
   } else {
     mecab_next_wakati_format(ctx, tokenizer, token);

  Added: test/command/suite/tokenizers/mecab/options/target_class/one.expected (+75 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/options/target_class/one.expected    2018-10-18 17:30:23 +0900 (4b5bd07e2)
@@ -0,0 +1,75 @@
+tokenize   'TokenMecab("include_class", true)'   '私の名前は中野です。'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "私",
+      "position": 0,
+      "force_prefix": false,
+      "metadata": {
+        "class": "名詞",
+        "subclass0": "代名詞",
+        "subclass1": "一般"
+      }
+    },
+    {
+      "value": "の",
+      "position": 1,
+      "force_prefix": false,
+      "metadata": {
+        "class": "助詞",
+        "subclass0": "連体化"
+      }
+    },
+    {
+      "value": "名前",
+      "position": 2,
+      "force_prefix": false,
+      "metadata": {
+        "class": "名詞",
+        "subclass0": "一般"
+      }
+    },
+    {
+      "value": "は",
+      "position": 3,
+      "force_prefix": false,
+      "metadata": {
+        "class": "助詞",
+        "subclass0": "係助詞"
+      }
+    },
+    {
+      "value": "中野",
+      "position": 4,
+      "force_prefix": false,
+      "metadata": {
+        "class": "名詞",
+        "subclass0": "固有名詞",
+        "subclass1": "人名",
+        "subclass2": "姓"
+      }
+    },
+    {
+      "value": "です",
+      "position": 5,
+      "force_prefix": false,
+      "metadata": {
+        "class": "助動詞"
+      }
+    },
+    {
+      "value": "。",
+      "position": 6,
+      "force_prefix": false,
+      "metadata": {
+        "class": "記号",
+        "subclass0": "句点"
+      }
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/mecab/options/target_class/one.test (+6 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/options/target_class/one.test    2018-10-18 17:30:23 +0900 (c6d1a0c9f)
@@ -0,0 +1,6 @@
+#@on-error omit
+tokenize \
+  'TokenMecab("include_class", true, \
+              "target_class", "名詞")' \
+  '私の名前は中野です。'
+#@on-error default
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20181018/c589f7fc/attachment-0001.html>


Groonga - fulltext search engine.

[Groonga-commit] groonga/groonga at a10b493 [master] TokenMecab: add target_class option