[Groonga-commit] groonga/groonga at 94e1c76 [master] TokenMecab: add loose_reading option

Back to archive index

Kouhei Sutou null+****@clear*****
Mon Sep 10 17:55:28 JST 2018


Kouhei Sutou	2018-09-10 17:55:28 +0900 (Mon, 10 Sep 2018)

  Revision: 94e1c7685a5b0d1e6e4377d2f5f60b3fcce80d55
  https://github.com/groonga/groonga/commit/94e1c7685a5b0d1e6e4377d2f5f60b3fcce80d55

  Message:
    TokenMecab: add loose_reading option
    
    TODO: GET mode support

  Added files:
    test/command/suite/tokenizers/mecab/options/loose_reading_add.expected
    test/command/suite/tokenizers/mecab/options/loose_reading_add.test
  Modified files:
    plugins/tokenizers/mecab.c

  Modified: plugins/tokenizers/mecab.c (+112 -15)
===================================================================
--- plugins/tokenizers/mecab.c    2018-09-10 17:55:01 +0900 (42e0c5177)
+++ plugins/tokenizers/mecab.c    2018-09-10 17:55:28 +0900 (6ab3a1d3a)
@@ -52,6 +52,7 @@ typedef struct {
   grn_bool include_class;
   grn_bool include_reading;
   grn_bool include_form;
+  grn_bool loose_reading;
 } grn_mecab_tokenizer_options;
 
 typedef struct {
@@ -61,6 +62,13 @@ typedef struct {
   const char *next;
   const char *end;
   grn_tokenizer_query *query;
+  struct {
+    grn_bool ing;
+    grn_bool need;
+    grn_bool need_end_mark;
+    grn_obj readings;
+    size_t offset;
+  } loose;
 } grn_mecab_tokenizer;
 
 static const char *
@@ -146,6 +154,7 @@ mecab_tokenizer_options_init(grn_mecab_tokenizer_options *options)
   options->include_class = GRN_FALSE;
   options->include_reading = GRN_FALSE;
   options->include_form = GRN_FALSE;
+  options->loose_reading = GRN_FALSE;
 }
 
 static grn_bool
@@ -167,6 +176,10 @@ mecab_tokenizer_options_need_default_output(grn_mecab_tokenizer_options *options
     return GRN_TRUE;
   }
 
+  if (options->loose_reading) {
+    return GRN_TRUE;
+  }
+
   return GRN_FALSE;
 }
 
@@ -224,6 +237,12 @@ mecab_tokenizer_options_open(grn_ctx *ctx,
                                     raw_options,
                                     i,
                                     options->include_form);
+    } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "loose_reading")) {
+      options->loose_reading =
+        grn_vector_get_element_bool(ctx,
+                                    raw_options,
+                                    i,
+                                    options->loose_reading);
     }
   } GRN_OPTION_VALUES_EACH_END();
 
@@ -673,6 +692,12 @@ mecab_init(grn_ctx *ctx, grn_tokenizer_query *query)
     }
   }
 
+  tokenizer->loose.ing = GRN_FALSE;
+  tokenizer->loose.need = GRN_FALSE;
+  tokenizer->loose.need_end_mark = GRN_FALSE;
+  GRN_TEXT_INIT(&(tokenizer->loose.readings), GRN_OBJ_VECTOR);
+  tokenizer->loose.offset = 0;
+
   return tokenizer;
 }
 
@@ -703,6 +728,27 @@ typedef struct {
   grn_bool ignore_asterisk_value;
 } add_feature_data;
 
+static size_t
+mecab_get_feature(grn_ctx *ctx,
+                  grn_obj *features,
+                  size_t i,
+                  const char **value)
+{
+  size_t n_locations = GRN_BULK_VSIZE(features) / sizeof(uint64_t);
+  const char *start;
+  const char *end;
+
+  if (i + 2 > n_locations) {
+    *value = NULL;
+    return 0;
+  }
+
+  start = (const char *)(GRN_UINT64_VALUE_AT(features, i));
+  end = ((const char *)(GRN_UINT64_VALUE_AT(features, i + 1))) - 1;
+  *value = start;
+  return end - start;
+}
+
 static void
 mecab_next_default_format_add_feature(grn_ctx *ctx,
                                       add_feature_data *data,
@@ -711,31 +757,22 @@ mecab_next_default_format_add_feature(grn_ctx *ctx,
 {
   grn_token *token = data->token;
   grn_obj *features = data->features;
-  size_t n_locations = GRN_BULK_VSIZE(features) / sizeof(uint64_t);
-  const char *feature_start;
-  const char *feature_end;
+  const char *feature = NULL;
   size_t feature_length;
   grn_obj value;
 
-  if (i + 2 > n_locations) {
-    return;
-  }
-
-  feature_start = (const char *)(GRN_UINT64_VALUE_AT(features, i));
-  feature_end = ((const char *)(GRN_UINT64_VALUE_AT(features, i + 1))) - 1;
-  feature_length = feature_end - feature_start;
-
+  feature_length = mecab_get_feature(ctx, features, i, &feature);
   if (data->ignore_empty_value && feature_length == 0) {
     return;
   }
   if (data->ignore_asterisk_value &&
       feature_length == 1 &&
-      feature_start[0] == '*') {
+      feature[0] == '*') {
     return;
   }
 
   GRN_TEXT_INIT(&value, GRN_OBJ_DO_SHALLOW_COPY);
-  GRN_TEXT_SET(ctx, &value, feature_start, feature_length);
+  GRN_TEXT_SET(ctx, &value, feature, feature_length);
   grn_token_metadata_add(ctx,
                          grn_token_get_metadata(ctx, token),
                          name,
@@ -758,6 +795,38 @@ mecab_next_default_format(grn_ctx *ctx,
   size_t surface_length = 0;
   grn_obj features;
 
+  if (tokenizer->loose.ing && tokenizer->loose.need_end_mark) {
+    grn_tokenizer_status status = GRN_TOKEN_CONTINUE;
+    grn_token_set_data(ctx,
+                       token,
+                       GRN_TOKENIZER_END_MARK_UTF8,
+                       GRN_TOKENIZER_END_MARK_UTF8_LEN);
+    grn_token_set_status(ctx, token, status);
+    tokenizer->loose.need_end_mark = GRN_FALSE;
+    return;
+  }
+
+  if (tokenizer->loose.ing) {
+    grn_tokenizer_status status = GRN_TOKEN_CONTINUE;
+    const char *reading = NULL;
+    unsigned int reading_length;
+
+    if (tokenizer->loose.offset + 1 ==
+        grn_vector_size(ctx, &(tokenizer->loose.readings))) {
+      status = GRN_TOKEN_LAST;
+    }
+    reading_length = grn_vector_get_element(ctx,
+                                            &(tokenizer->loose.readings),
+                                            tokenizer->loose.offset,
+                                            &reading,
+                                            NULL,
+                                            NULL);
+    grn_token_set_data(ctx, token, reading, reading_length);
+    grn_token_set_status(ctx, token, status);
+    tokenizer->loose.offset++;
+    return;
+  }
+
   mecab_next_default_format_skip_eos(ctx, tokenizer);
   start = surface = tokenizer->next;
   GRN_UINT64_INIT(&features, GRN_OBJ_VECTOR);
@@ -812,7 +881,12 @@ mecab_next_default_format(grn_ctx *ctx,
   {
     grn_tokenizer_status status;
     if (current == end || tokenizer->next == end) {
-      status = GRN_TOKEN_LAST;
+      if (tokenizer->loose.need) {
+        tokenizer->loose.ing = GRN_TRUE;
+        status = GRN_TOKEN_CONTINUE;
+      } else {
+        status = GRN_TOKEN_LAST;
+      }
     } else {
       status = GRN_TOKEN_CONTINUE;
     }
@@ -847,6 +921,28 @@ mecab_next_default_format(grn_ctx *ctx,
     mecab_next_default_format_add_feature(ctx, &data, "inflected_form", 5);
     mecab_next_default_format_add_feature(ctx, &data, "base_form", 6);
   }
+  if (tokenizer->options->loose_reading) {
+    const char *reading = NULL;
+    size_t reading_length;
+    reading_length = mecab_get_feature(ctx, &features, 7, &reading);
+    if (reading_length > 0) {
+      tokenizer->loose.need = GRN_TRUE;
+      tokenizer->loose.need_end_mark = GRN_TRUE;
+      grn_vector_add_element(ctx,
+                             &(tokenizer->loose.readings),
+                             reading,
+                             reading_length,
+                             0,
+                             GRN_DB_TEXT);
+    } else {
+      grn_vector_add_element(ctx,
+                             &(tokenizer->loose.readings),
+                             surface,
+                             surface_length,
+                             0,
+                             GRN_DB_TEXT);
+    }
+  }
   GRN_OBJ_FIN(ctx, &features);
 }
 
@@ -943,7 +1039,8 @@ mecab_fin(grn_ctx *ctx, void *user_data)
   if (!tokenizer) {
     return;
   }
-  grn_obj_unlink(ctx, &(tokenizer->buf));
+  GRN_OBJ_FIN(ctx, &(tokenizer->loose.readings));
+  GRN_OBJ_FIN(ctx, &(tokenizer->buf));
   GRN_PLUGIN_FREE(ctx, tokenizer);
 }
 

  Added: test/command/suite/tokenizers/mecab/options/loose_reading_add.expected (+65 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/options/loose_reading_add.expected    2018-09-10 17:55:28 +0900 (7a0bba241)
@@ -0,0 +1,65 @@
+tokenize   'TokenMecab("loose_reading", true)'   '焼き肉と焼きにくとyakiniku'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "焼き肉",
+      "position": 0,
+      "force_prefix": false
+    },
+    {
+      "value": "と",
+      "position": 1,
+      "force_prefix": false
+    },
+    {
+      "value": "焼きにく",
+      "position": 2,
+      "force_prefix": false
+    },
+    {
+      "value": "と",
+      "position": 3,
+      "force_prefix": false
+    },
+    {
+      "value": "yakiniku",
+      "position": 4,
+      "force_prefix": false
+    },
+    {
+      "value": "￰",
+      "position": 5,
+      "force_prefix": false
+    },
+    {
+      "value": "ヤキニク",
+      "position": 6,
+      "force_prefix": false
+    },
+    {
+      "value": "ト",
+      "position": 7,
+      "force_prefix": false
+    },
+    {
+      "value": "ヤキニク",
+      "position": 8,
+      "force_prefix": false
+    },
+    {
+      "value": "ト",
+      "position": 9,
+      "force_prefix": false
+    },
+    {
+      "value": "yakiniku",
+      "position": 10,
+      "force_prefix": false
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/mecab/options/loose_reading_add.test (+5 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/mecab/options/loose_reading_add.test    2018-09-10 17:55:28 +0900 (9250edb4d)
@@ -0,0 +1,5 @@
+#@on-error omit
+tokenize \
+  'TokenMecab("loose_reading", true)' \
+  '焼き肉と焼きにくとyakiniku'
+#@on-error default
-------------- next part --------------
An HTML attachment was scrubbed...
URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180910/572a3ad1/attachment-0001.htm 



More information about the Groonga-commit mailing list
Back to archive index