[Groonga-commit] groonga/groonga at 1a9ef9e [master] TokenMecab: loose_reading -> use_reading

Kouhei Sutou null+****@clear*****
Wed Sep 19 11:14:29 JST 2018


Kouhei Sutou	2018-09-19 11:14:29 +0900 (Wed, 19 Sep 2018)

  Revision: 1a9ef9e1bbdba660f712f3037b6c7b42708ef7b1
  https://github.com/groonga/groonga/commit/1a9ef9e1bbdba660f712f3037b6c7b42708ef7b1

  Message:
    TokenMecab: loose_reading -> use_reading
    
    We don't need the original surfaces to search by reading.
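
  Usage example:
    A short illustration of the renamed option, copied from the test files
    changed by this commit (use_reading_add.test and use_reading.test below);
    the expected output files are not reproduced here:

      tokenize \
        'TokenMecab("use_reading", true)' \
        '焼き肉と焼肉とyakiniku'

      table_create Terms TABLE_PAT_KEY ShortText \
        --normalize NormalizerNFKC100 \
        --default_tokenizer 'TokenMecab("use_reading", true)'

    With "use_reading" enabled, each token's data is the MeCab reading when
    one is available and falls back to the surface otherwise (see the
    mecab_next_default_format hunk in plugins/tokenizers/mecab.c below).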

  Copied files:
    test/command/suite/tokenizers/mecab/options/use_reading_add.expected
      (from test/command/suite/tokenizers/mecab/options/loose_reading_get.expected)
  Removed files:
    test/command/suite/tokenizers/mecab/options/loose_reading_add.expected
  Modified files:
    plugins/tokenizers/mecab.c
  Renamed files:
    test/command/suite/select/query/match/with_index/token_mecab/use_reading.expected
      (from test/command/suite/select/query/match/with_index/token_mecab/loose_reading.expected)
    test/command/suite/select/query/match/with_index/token_mecab/use_reading.test
      (from test/command/suite/select/query/match/with_index/token_mecab/loose_reading.test)
    test/command/suite/tokenizers/mecab/options/use_reading_add.test
      (from test/command/suite/tokenizers/mecab/options/loose_reading_add.test)
    test/command/suite/tokenizers/mecab/options/use_reading_get.expected
      (from test/command/suite/tokenizers/mecab/options/loose_reading_get.expected)
    test/command/suite/tokenizers/mecab/options/use_reading_get.test
      (from test/command/suite/tokenizers/mecab/options/loose_reading_get.test)

  Modified: plugins/tokenizers/mecab.c (+29 -94)
===================================================================
--- plugins/tokenizers/mecab.c    2018-09-13 22:26:19 +0900 (487358adb)
+++ plugins/tokenizers/mecab.c    2018-09-19 11:14:29 +0900 (7d022f05a)
@@ -46,13 +46,15 @@ static grn_mecab mecab_wakati;
 static grn_bool grn_mecab_chunked_tokenize_enabled = GRN_FALSE;
 static int32_t grn_mecab_chunk_size_threshold = 8192;
 
+static const size_t GRN_MECAB_FEATURE_LOCATION_READING = 7;
+
 typedef struct {
   grn_bool chunked_tokenize;
   int32_t chunk_size_threshold;
   grn_bool include_class;
   grn_bool include_reading;
   grn_bool include_form;
-  grn_bool loose_reading;
+  grn_bool use_reading;
 } grn_mecab_tokenizer_options;
 
 typedef struct {
@@ -63,13 +65,6 @@ typedef struct {
   const char *end;
   grn_tokenizer_query *query;
   grn_obj feature_locations;
-  struct {
-    grn_bool ing;
-    grn_bool need;
-    grn_bool need_end_mark;
-    grn_obj readings;
-    size_t offset;
-  } loose;
 } grn_mecab_tokenizer;
 
 static const char *
@@ -155,7 +150,7 @@ mecab_tokenizer_options_init(grn_mecab_tokenizer_options *options)
   options->include_class = GRN_FALSE;
   options->include_reading = GRN_FALSE;
   options->include_form = GRN_FALSE;
-  options->loose_reading = GRN_FALSE;
+  options->use_reading = GRN_FALSE;
 }
 
 static grn_bool
@@ -177,7 +172,7 @@ mecab_tokenizer_options_need_default_output(grn_mecab_tokenizer_options *options
     return GRN_TRUE;
   }
 
-  if (options->loose_reading) {
+  if (options->use_reading) {
     return GRN_TRUE;
   }
 
@@ -238,12 +233,12 @@ mecab_tokenizer_options_open(grn_ctx *ctx,
                                     raw_options,
                                     i,
                                     options->include_form);
-    } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "loose_reading")) {
-      options->loose_reading =
+    } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "use_reading")) {
+      options->use_reading =
         grn_vector_get_element_bool(ctx,
                                     raw_options,
                                     i,
-                                    options->loose_reading);
+                                    options->use_reading);
     }
   } GRN_OPTION_VALUES_EACH_END();
 
@@ -722,29 +717,6 @@ mecab_next_default_format_consume_token(grn_ctx *ctx,
   tokenizer->next = current;
   mecab_next_default_format_skip_eos(ctx, tokenizer);
 
-  if (tokenizer->options->loose_reading) {
-    const char *reading = NULL;
-    size_t reading_length;
-    reading_length = mecab_get_feature(ctx, feature_locations, 7, &reading);
-    if (reading_length > 0) {
-      tokenizer->loose.need = GRN_TRUE;
-      tokenizer->loose.need_end_mark = GRN_TRUE;
-      grn_vector_add_element(ctx,
-                             &(tokenizer->loose.readings),
-                             reading,
-                             reading_length,
-                             0,
-                             GRN_DB_TEXT);
-    } else {
-      grn_vector_add_element(ctx,
-                             &(tokenizer->loose.readings),
-                             surface,
-                             surface_length,
-                             0,
-                             GRN_DB_TEXT);
-    }
-  }
-
   if (surface_output) {
     *surface_output = surface;
   }
@@ -759,54 +731,32 @@ mecab_next_default_format(grn_ctx *ctx,
   const char *surface;
   size_t surface_length = 0;
 
-  if (tokenizer->loose.ing && tokenizer->loose.need_end_mark) {
-    grn_tokenizer_status status = GRN_TOKEN_CONTINUE;
-    grn_token_set_data(ctx,
-                       token,
-                       GRN_TOKENIZER_END_MARK_UTF8,
-                       GRN_TOKENIZER_END_MARK_UTF8_LEN);
-    grn_token_set_status(ctx, token, status);
-    tokenizer->loose.need_end_mark = GRN_FALSE;
-    return;
-  }
-
-  if (tokenizer->loose.ing) {
-    grn_tokenizer_status status = GRN_TOKEN_CONTINUE;
-    const char *reading = NULL;
-    unsigned int reading_length;
-
-    if (tokenizer->loose.offset + 1 ==
-        grn_vector_size(ctx, &(tokenizer->loose.readings))) {
-      status = GRN_TOKEN_LAST;
-    }
-    reading_length = grn_vector_get_element(ctx,
-                                            &(tokenizer->loose.readings),
-                                            tokenizer->loose.offset,
-                                            &reading,
-                                            NULL,
-                                            NULL);
-    grn_token_set_data(ctx, token, reading, reading_length);
-    grn_token_set_status(ctx, token, status);
-    tokenizer->loose.offset++;
-    return;
-  }
-
   surface_length = mecab_next_default_format_consume_token(ctx,
                                                            tokenizer,
                                                            &surface);
-  grn_token_set_data(ctx, token, surface, surface_length);
+  if (tokenizer->options->use_reading) {
+    grn_obj *feature_locations = &(tokenizer->feature_locations);
+    const char *reading = NULL;
+    size_t reading_length;
+    reading_length = mecab_get_feature(ctx,
+                                       feature_locations,
+                                       GRN_MECAB_FEATURE_LOCATION_READING,
+                                       &reading);
+    if (reading_length > 0) {
+      grn_token_set_data(ctx, token, reading, reading_length);
+    } else {
+      grn_token_set_data(ctx, token, surface, surface_length);
+    }
+  } else {
+    grn_token_set_data(ctx, token, surface, surface_length);
+  }
   {
     grn_tokenizer_status status;
     if (surface_length == 0) {
       /* Error */
       status = GRN_TOKEN_LAST;
     } else if (tokenizer->next == tokenizer->end) {
-      if (tokenizer->loose.need) {
-        tokenizer->loose.ing = GRN_TRUE;
-        status = GRN_TOKEN_CONTINUE;
-      } else {
-        status = GRN_TOKEN_LAST;
-      }
+      status = GRN_TOKEN_LAST;
     } else {
       status = GRN_TOKEN_CONTINUE;
     }
@@ -829,7 +779,10 @@ mecab_next_default_format(grn_ctx *ctx,
     data.feature_locations = &(tokenizer->feature_locations);
     data.ignore_empty_value = GRN_TRUE;
     data.ignore_asterisk_value = GRN_FALSE;
-    mecab_next_default_format_add_feature(ctx, &data, "reading", 7);
+    mecab_next_default_format_add_feature(ctx,
+                                          &data,
+                                          "reading",
+                                          GRN_MECAB_FEATURE_LOCATION_READING);
   }
   if (tokenizer->options->include_form) {
     add_feature_data data;
@@ -1012,23 +965,6 @@ mecab_init(grn_ctx *ctx, grn_tokenizer_query *query)
 
   GRN_UINT64_INIT(&(tokenizer->feature_locations), GRN_OBJ_VECTOR);
 
-  tokenizer->loose.ing = GRN_FALSE;
-  tokenizer->loose.need = GRN_FALSE;
-  tokenizer->loose.need_end_mark = GRN_FALSE;
-  GRN_TEXT_INIT(&(tokenizer->loose.readings), GRN_OBJ_VECTOR);
-  tokenizer->loose.offset = 0;
-
-  if (tokenizer->options->loose_reading &&
-      grn_tokenizer_query_get_mode(ctx, tokenizer->query) == GRN_TOKEN_GET) {
-    while (tokenizer->next < tokenizer->end &&
-           mecab_next_default_format_consume_token(ctx, tokenizer, NULL) > 0) {
-      /* Do nothing */
-    }
-    tokenizer->loose.ing = GRN_TRUE;
-    tokenizer->loose.need = GRN_TRUE;
-    tokenizer->loose.need_end_mark = GRN_FALSE;
-  }
-
   return tokenizer;
 }
 
@@ -1079,7 +1015,6 @@ mecab_fin(grn_ctx *ctx, void *user_data)
   if (!tokenizer) {
     return;
   }
-  GRN_OBJ_FIN(ctx, &(tokenizer->loose.readings));
   GRN_OBJ_FIN(ctx, &(tokenizer->feature_locations));
   GRN_OBJ_FIN(ctx, &(tokenizer->buf));
   GRN_PLUGIN_FREE(ctx, tokenizer);

  Renamed: test/command/suite/select/query/match/with_index/token_mecab/use_reading.expected (+1 -1) 87%
===================================================================
--- test/command/suite/select/query/match/with_index/token_mecab/loose_reading.expected    2018-09-13 22:26:19 +0900 (3c8d1de6d)
+++ test/command/suite/select/query/match/with_index/token_mecab/use_reading.expected    2018-09-19 11:14:29 +0900 (f43c868da)
@@ -2,7 +2,7 @@ table_create Menus TABLE_NO_KEY
 [[0,0.0,0.0],true]
 column_create Menus name COLUMN_SCALAR Text
 [[0,0.0,0.0],true]
-table_create Terms TABLE_PAT_KEY ShortText   --normalize NormalizerNFKC100   --default_tokenizer 'TokenMecab("loose_reading", true)'
+table_create Terms TABLE_PAT_KEY ShortText   --normalize NormalizerNFKC100   --default_tokenizer 'TokenMecab("use_reading", true)'
 [[0,0.0,0.0],true]
 column_create Terms index COLUMN_INDEX|WITH_POSITION Menus name
 [[0,0.0,0.0],true]

  Renamed: test/command/suite/select/query/match/with_index/token_mecab/use_reading.test (+1 -1) 84%
===================================================================
--- test/command/suite/select/query/match/with_index/token_mecab/loose_reading.test    2018-09-13 22:26:19 +0900 (40d5857d0)
+++ test/command/suite/select/query/match/with_index/token_mecab/use_reading.test    2018-09-19 11:14:29 +0900 (4dfe3e38d)
@@ -3,7 +3,7 @@ column_create Menus name COLUMN_SCALAR Text
 
 table_create Terms TABLE_PAT_KEY ShortText \
   --normalize NormalizerNFKC100 \
-  --default_tokenizer 'TokenMecab("loose_reading", true)'
+  --default_tokenizer 'TokenMecab("use_reading", true)'
 column_create Terms index COLUMN_INDEX|WITH_POSITION Menus name
 
 load --table Menus

  Deleted: test/command/suite/tokenizers/mecab/options/loose_reading_add.expected (+0 -65) 100644
===================================================================
--- test/command/suite/tokenizers/mecab/options/loose_reading_add.expected    2018-09-13 22:26:19 +0900 (ea667f08c)
+++ /dev/null
@@ -1,65 +0,0 @@
-tokenize   'TokenMecab("loose_reading", true)'   '焼き肉と焼肉とyakiniku'
-[
-  [
-    0,
-    0.0,
-    0.0
-  ],
-  [
-    {
-      "value": "焼き肉",
-      "position": 0,
-      "force_prefix": false
-    },
-    {
-      "value": "と",
-      "position": 1,
-      "force_prefix": false
-    },
-    {
-      "value": "焼肉",
-      "position": 2,
-      "force_prefix": false
-    },
-    {
-      "value": "と",
-      "position": 3,
-      "force_prefix": false
-    },
-    {
-      "value": "yakiniku",
-      "position": 4,
-      "force_prefix": false
-    },
-    {
-      "value": "￰",
-      "position": 5,
-      "force_prefix": false
-    },
-    {
-      "value": "ヤキニク",
-      "position": 6,
-      "force_prefix": false
-    },
-    {
-      "value": "ト",
-      "position": 7,
-      "force_prefix": false
-    },
-    {
-      "value": "ヤキニク",
-      "position": 8,
-      "force_prefix": false
-    },
-    {
-      "value": "ト",
-      "position": 9,
-      "force_prefix": false
-    },
-    {
-      "value": "yakiniku",
-      "position": 10,
-      "force_prefix": false
-    }
-  ]
-]

  Copied: test/command/suite/tokenizers/mecab/options/use_reading_add.expected (+1 -1) 83%
===================================================================
--- test/command/suite/tokenizers/mecab/options/loose_reading_get.expected    2018-09-13 22:26:19 +0900 (40824322b)
+++ test/command/suite/tokenizers/mecab/options/use_reading_add.expected    2018-09-19 11:14:29 +0900 (00f119724)
@@ -1,4 +1,4 @@
-tokenize   'TokenMecab("loose_reading", true)'   '焼き肉と焼肉とyakiniku'   --mode GET
+tokenize   'TokenMecab("use_reading", true)'   '焼き肉と焼肉とyakiniku'
 [
   [
     0,

  Renamed: test/command/suite/tokenizers/mecab/options/use_reading_add.test (+1 -1) 66%
===================================================================
--- test/command/suite/tokenizers/mecab/options/loose_reading_add.test    2018-09-13 22:26:19 +0900 (194e23893)
+++ test/command/suite/tokenizers/mecab/options/use_reading_add.test    2018-09-19 11:14:29 +0900 (fb27c64de)
@@ -1,5 +1,5 @@
 #@on-error omit
 tokenize \
-  'TokenMecab("loose_reading", true)' \
+  'TokenMecab("use_reading", true)' \
   '焼き肉と焼肉とyakiniku'
 #@on-error default

  Renamed: test/command/suite/tokenizers/mecab/options/use_reading_get.expected (+1 -1) 83%
===================================================================
--- test/command/suite/tokenizers/mecab/options/loose_reading_get.expected    2018-09-13 22:26:19 +0900 (40824322b)
+++ test/command/suite/tokenizers/mecab/options/use_reading_get.expected    2018-09-19 11:14:29 +0900 (d4ba3d0fa)
@@ -1,4 +1,4 @@
-tokenize   'TokenMecab("loose_reading", true)'   '焼き肉と焼肉とyakiniku'   --mode GET
+tokenize   'TokenMecab("use_reading", true)'   '焼き肉と焼肉とyakiniku'   --mode GET
 [
   [
     0,

  Renamed: test/command/suite/tokenizers/mecab/options/use_reading_get.test (+1 -1) 70%
===================================================================
--- test/command/suite/tokenizers/mecab/options/loose_reading_get.test    2018-09-13 22:26:19 +0900 (3efcbdcfc)
+++ test/command/suite/tokenizers/mecab/options/use_reading_get.test    2018-09-19 11:14:29 +0900 (e4d453e49)
@@ -1,6 +1,6 @@
 #@on-error omit
 tokenize \
-  'TokenMecab("loose_reading", true)' \
+  'TokenMecab("use_reading", true)' \
   '焼き肉と焼肉とyakiniku' \
   --mode GET
 #@on-error default