[Groonga-commit] groonga/groonga at d90f50e [master] TokenNgram: use offsets information from normalizer only when it's available


Kouhei Sutou <null+****@clear*****>
Mon May 28 14:01:54 JST 2018


Kouhei Sutou	2018-05-28 14:01:54 +0900 (Mon, 28 May 2018)

  New Revision: d90f50ee1ee02cb7c7833c0dfc1b89569a781685
  https://github.com/groonga/groonga/commit/d90f50ee1ee02cb7c7833c0dfc1b89569a781685

  Message:
    TokenNgram: use offsets information from normalizer only when it's available

  Added files:
    test/command/suite/tokenizers/ngram/report_source_location/loose_and_unify.test
  Copied files:
    test/command/suite/tokenizers/ngram/report_source_location/loose_and_unify.expected
      (from test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_non_number.expected)
  Modified files:
    lib/tokenizers.c
    test/command/suite/tokenizers/ngram/report_source_location/expand_katakana.expected
    test/command/suite/tokenizers/ngram/report_source_location/expand_katakana.test
    test/command/suite/tokenizers/ngram/report_source_location/expand_no_overlap.expected
    test/command/suite/tokenizers/ngram/report_source_location/expand_no_overlap.test
    test/command/suite/tokenizers/ngram/report_source_location/hiragana.expected
    test/command/suite/tokenizers/ngram/report_source_location/hiragana.test
    test/command/suite/tokenizers/ngram/report_source_location/loose_symbol.expected
    test/command/suite/tokenizers/ngram/report_source_location/loose_symbol.test
    test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_non_number.expected
    test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_non_number.test

  Modified: lib/tokenizers.c (+127 -39)
===================================================================
--- lib/tokenizers.c    2018-05-28 12:16:40 +0900 (e59fcd6ca)
+++ lib/tokenizers.c    2018-05-28 14:01:54 +0900 (8fd624ce0)
@@ -282,14 +282,17 @@ typedef struct {
     grn_obj text;
     uint_least8_t *ctypes;
     int16_t *checks;
+    uint64_t *offsets;
   } loose;
   int32_t pos;
   uint32_t skip;
+  unsigned int n_chars;
   const unsigned char *start;
   const unsigned char *next;
   const unsigned char *end;
   const uint_least8_t *ctypes;
   const int16_t *checks;
+  const uint64_t *offsets;
   uint32_t tail;
   uint64_t source_offset;
 } grn_ngram_tokenizer;
@@ -319,6 +322,7 @@ ngram_switch_to_loose_mode(grn_ctx *ctx,
   const char *normalized_end;
   const uint_least8_t *types = tokenizer->ctypes;
   const int16_t *checks = tokenizer->checks;
+  const uint64_t *offsets = tokenizer->offsets;
 
   string = grn_tokenizer_query_get_normalized_string(ctx, tokenizer->query);
   grn_string_get_normalized(ctx,
@@ -333,7 +337,10 @@ ngram_switch_to_loose_mode(grn_ctx *ctx,
       grn_tokenizer_query_get_encoding(ctx, tokenizer->query);
     uint_least8_t *loose_types;
     int16_t *loose_checks = NULL;
+    uint64_t *loose_offsets = NULL;
     const int16_t *removed_checks = NULL;
+    uint64_t last_offset = 0;
+    unsigned int n_chars = 0;
 
     tokenizer->loose.ctypes =
       GRN_MALLOC(sizeof(uint_least8_t) * normalized_length_in_chars);
@@ -350,12 +357,23 @@ ngram_switch_to_loose_mode(grn_ctx *ctx,
       if (!tokenizer->loose.checks) {
         ERR(GRN_NO_MEMORY_AVAILABLE,
             "[tokenizer][ngram][loose] "
+            "failed to allocate memory for character lengths");
+        return;
+      }
+    }
+    if (offsets) {
+      tokenizer->loose.offsets =
+        GRN_CALLOC(sizeof(uint64_t) * normalized_length_in_chars);
+      if (!tokenizer->loose.offsets) {
+        ERR(GRN_NO_MEMORY_AVAILABLE,
+            "[tokenizer][ngram][loose] "
             "failed to allocate memory for character offsets");
         return;
       }
     }
     loose_types = tokenizer->loose.ctypes;
     loose_checks = tokenizer->loose.checks;
+    loose_offsets = tokenizer->loose.offsets;
     while (normalized < normalized_end) {
       size_t length;
       length = grn_charlen_(ctx,
@@ -373,6 +391,9 @@ ngram_switch_to_loose_mode(grn_ctx *ctx,
         if (!removed_checks) {
           removed_checks = checks;
         }
+        if (offsets && last_offset == 0) {
+          last_offset = *offsets;
+        }
       } else {
         GRN_TEXT_PUT(ctx, &(tokenizer->loose.text), normalized, length);
         *loose_types = *types;
@@ -393,12 +414,29 @@ ngram_switch_to_loose_mode(grn_ctx *ctx,
           }
           loose_checks += length;
         }
+        if (loose_offsets) {
+          *loose_offsets = *offsets;
+          loose_offsets++;
+          last_offset = 0;
+        }
+        n_chars++;
       }
       normalized += length;
       types++;
       if (checks) {
         checks += length;
       }
+      if (offsets) {
+        offsets++;
+      }
+    }
+    *loose_checks = *checks;
+    if (offsets) {
+      if (last_offset) {
+        *loose_offsets = last_offset;
+      } else {
+        *loose_offsets = *offsets;
+      }
     }
     tokenizer->start =
       (const unsigned char *)GRN_TEXT_VALUE(&(tokenizer->loose.text));
@@ -406,6 +444,8 @@ ngram_switch_to_loose_mode(grn_ctx *ctx,
     tokenizer->end = tokenizer->start + GRN_TEXT_LEN(&(tokenizer->loose.text));
     tokenizer->ctypes = tokenizer->loose.ctypes;
     tokenizer->checks = tokenizer->loose.checks;
+    tokenizer->offsets = tokenizer->loose.offsets;
+    tokenizer->n_chars = n_chars;
   } else {
     tokenizer->start = normalized;
     tokenizer->next = tokenizer->start;
@@ -457,6 +497,7 @@ ngram_init_raw(grn_ctx *ctx,
   GRN_TEXT_INIT(&(tokenizer->loose.text), 0);
   tokenizer->loose.ctypes = NULL;
   tokenizer->loose.checks = NULL;
+  tokenizer->loose.offsets = NULL;
   tokenizer->pos = 0;
   tokenizer->skip = 0;
   tokenizer->source_offset = 0;
@@ -465,17 +506,21 @@ ngram_init_raw(grn_ctx *ctx,
     grn_obj *string;
     const char *normalized_raw;
     unsigned int normalized_length_in_bytes;
+    unsigned int normalized_length_in_chars;
 
     string = grn_tokenizer_query_get_normalized_string(ctx, tokenizer->query);
     grn_string_get_normalized(ctx,
                               string,
-                              &normalized_raw, &normalized_length_in_bytes,
-                              NULL);
+                              &normalized_raw,
+                              &normalized_length_in_bytes,
+                              &normalized_length_in_chars);
     tokenizer->start = (const unsigned char *)normalized_raw;
     tokenizer->next = tokenizer->start;
     tokenizer->end = tokenizer->start + normalized_length_in_bytes;
+    tokenizer->n_chars = normalized_length_in_chars;
     tokenizer->ctypes = grn_string_get_types(ctx, string);
     tokenizer->checks = grn_string_get_checks(ctx, string);
+    tokenizer->offsets = grn_string_get_offsets(ctx, string);
   }
 
   if (grn_tokenizer_query_get_mode(ctx, tokenizer->query) == GRN_TOKEN_GET) {
@@ -700,6 +745,7 @@ ngram_next(grn_ctx *ctx,
   grn_token_status status = 0;
   const uint_least8_t *cp = tokenizer->ctypes ? tokenizer->ctypes + pos : NULL;
   const int16_t *checks = NULL;
+  const uint64_t *offsets = tokenizer->offsets ? tokenizer->offsets + pos : NULL;
   grn_encoding encoding = grn_tokenizer_query_get_encoding(ctx, query);
 
   if (tokenizer->checks) {
@@ -712,8 +758,14 @@ ngram_next(grn_ctx *ctx,
                        GRN_TOKENIZER_END_MARK_UTF8,
                        GRN_TOKENIZER_END_MARK_UTF8_LEN);
     grn_token_set_status(ctx, token, status);
-    if (checks) {
-      grn_token_set_source_offset(ctx, token, tokenizer->source_offset);
+    if (offsets) {
+      grn_token_set_source_offset(ctx,
+                                  token,
+                                  tokenizer->offsets[tokenizer->n_chars]);
+    } else if (checks) {
+      grn_token_set_source_offset(ctx,
+                                  token,
+                                  tokenizer->source_offset);
     }
     ngram_switch_to_loose_mode(ctx, tokenizer);
     tokenizer->loose.need_end_mark = GRN_FALSE;
@@ -840,51 +892,84 @@ ngram_next(grn_ctx *ctx,
     grn_token_set_data(ctx, token, p, data_size);
     grn_token_set_status(ctx, token, status);
     grn_token_set_overlap(ctx, token, tokenizer->overlap);
-    if (checks) {
-      size_t i;
-      uint32_t source_length = 0;
-      uint32_t source_first_character_length = 0;
-      uint64_t next_offset = tokenizer->source_offset;
-      grn_token_set_source_offset(ctx, token, tokenizer->source_offset);
-      if (checks[0] == -1) {
-        size_t n_leading_bytes = p - tokenizer->start;
-        for (i = 1; i <= n_leading_bytes; i++) {
-          if (checks[-i] > 0) {
-            source_length = source_first_character_length = checks[-i];
-            if (!tokenizer->overlap) {
-              next_offset += checks[-i];
+    /* TODO: Clean and complete... */
+    if (offsets) {
+      grn_token_set_source_offset(ctx, token, offsets[0]);
+      if (checks) {
+        size_t i;
+        uint32_t source_first_character_length = 0;
+        if (checks[0] == -1) {
+          size_t n_leading_bytes = p - tokenizer->start;
+          for (i = 1; i <= n_leading_bytes; i++) {
+            if (checks[-i] > 0) {
+              source_first_character_length = checks[-i];
+              break;
             }
-            break;
           }
         }
-      }
-      {
-        uint64_t first_offset = 0;
-        for (i = 0; i < data_size; i++) {
-          if (checks[i] > 0) {
-            if ((tokenizer->overlap && first_offset == 0) ||
-                !tokenizer->overlap) {
-              if (first_offset == 0) {
-                first_offset = checks[i];
+        {
+          for (i = 0; i < data_size; i++) {
+            if (checks[i] > 0) {
+              if (source_first_character_length == 0) {
+                source_first_character_length = checks[i];
               }
-              next_offset += checks[i];
             }
-            if (source_first_character_length == 0) {
-              source_first_character_length = checks[i];
+          }
+        }
+        grn_token_set_source_length(ctx,
+                                    token,
+                                    offsets[n_characters] - offsets[0]);
+        grn_token_set_source_first_character_length(ctx,
+                                                    token,
+                                                    source_first_character_length);
+      }
+    } else {
+      if (checks) {
+        size_t i;
+        uint32_t source_length = 0;
+        uint32_t source_first_character_length = 0;
+        uint64_t next_offset = tokenizer->source_offset;
+        grn_token_set_source_offset(ctx, token, tokenizer->source_offset);
+        if (checks[0] == -1) {
+          size_t n_leading_bytes = p - tokenizer->start;
+          for (i = 1; i <= n_leading_bytes; i++) {
+            if (checks[-i] > 0) {
+              source_length = source_first_character_length = checks[-i];
+              if (!tokenizer->overlap) {
+                next_offset += checks[-i];
+              }
+              break;
             }
-            source_length += checks[i];
-          } else if (checks[i] < 0) {
-            if (tokenizer->overlap) {
-              next_offset -= first_offset;
+          }
+        }
+        {
+          uint64_t first_offset = 0;
+          for (i = 0; i < data_size; i++) {
+            if (checks[i] > 0) {
+              if ((tokenizer->overlap && first_offset == 0) ||
+                  !tokenizer->overlap) {
+                if (first_offset == 0) {
+                  first_offset = checks[i];
+                }
+                next_offset += checks[i];
+              }
+              if (source_first_character_length == 0) {
+                source_first_character_length = checks[i];
+              }
+              source_length += checks[i];
+            } else if (checks[i] < 0) {
+              if (tokenizer->overlap) {
+                next_offset -= first_offset;
+              }
             }
           }
         }
+        grn_token_set_source_length(ctx, token, source_length);
+        grn_token_set_source_first_character_length(ctx,
+                                                    token,
+                                                    source_first_character_length);
+        tokenizer->source_offset = next_offset;
       }
-      grn_token_set_source_length(ctx, token, source_length);
-      grn_token_set_source_first_character_length(ctx,
-                                                  token,
-                                                  source_first_character_length);
-      tokenizer->source_offset = next_offset;
     }
   }
 }
@@ -925,6 +1010,9 @@ ngram_fin(grn_ctx *ctx, void *user_data)
   if (tokenizer->loose.checks) {
     GRN_FREE(tokenizer->loose.checks);
   }
+  if (tokenizer->loose.offsets) {
+    GRN_FREE(tokenizer->loose.offsets);
+  }
   GRN_OBJ_FIN(ctx, &(tokenizer->loose.text));
   grn_tokenizer_token_fin(ctx, &(tokenizer->token));
   GRN_FREE(tokenizer);
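
The core of the lib/tokenizers.c change is the new offsets path in ngram_next(): when the normalizer was run with "report_source_offset" it supplies per-character source offsets, and the tokenizer reads a token's source offset and length directly from that array instead of re-deriving them from the per-byte checks. Below is a minimal standalone sketch of that selection logic; it is not the Groonga API, and token_location, compute_location and the example arrays are made up for illustration. The example data mirrors the hiragana test: for "あいうえお" the bigram "いう" should report source offset 3 and source length 6 through either path.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical result type for illustration only. */
typedef struct {
  uint64_t source_offset;
  uint32_t source_length;
} token_location;

static token_location
compute_location(const uint64_t *offsets, /* offsets at the token's first character
                                             (n_token_chars + 1 usable entries), or NULL */
                 const int16_t *checks,   /* checks at the token's first byte, or NULL */
                 size_t n_token_chars,    /* token length in characters */
                 size_t n_token_bytes)    /* token length in normalized bytes */
{
  token_location loc = {0, 0};
  if (offsets) {
    /* Preferred path added by this commit: the normalizer already knows where
       each character came from, so both values fall out of array lookups. */
    loc.source_offset = offsets[0];
    loc.source_length = (uint32_t)(offsets[n_token_chars] - offsets[0]);
  } else if (checks) {
    /* Previous behaviour, kept as the fallback: sum the positive per-byte
       source lengths; the running source offset is tracked by the caller. */
    size_t i;
    for (i = 0; i < n_token_bytes; i++) {
      if (checks[i] > 0) {
        loc.source_length += (uint32_t)checks[i];
      }
    }
  }
  return loc;
}

int
main(void)
{
  /* "あいうえお" normalized 1:1: every character keeps its 3 source bytes, so
     the per-character offsets are 0, 3, 6, 9, 12 with a trailing end of 15. */
  const uint64_t offsets[] = {0, 3, 6, 9, 12, 15};
  const int16_t checks[] = {3, 0, 0, 3, 0, 0, 3, 0, 0, 3, 0, 0, 3, 0, 0};
  /* The bigram "いう" starts at character 1 (byte 3) and spans 2 characters
     (6 normalized bytes). */
  token_location via_offsets = compute_location(offsets + 1, NULL, 2, 6);
  token_location via_checks  = compute_location(NULL, checks + 3, 2, 6);

  printf("via offsets: offset=%llu length=%u\n",
         (unsigned long long)via_offsets.source_offset,
         via_offsets.source_length);
  printf("via checks:  length=%u\n", via_checks.source_length);
  return 0;
}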

  Modified: test/command/suite/tokenizers/ngram/report_source_location/expand_katakana.expected (+1 -1)
===================================================================
--- test/command/suite/tokenizers/ngram/report_source_location/expand_katakana.expected    2018-05-28 12:16:40 +0900 (390cd501f)
+++ test/command/suite/tokenizers/ngram/report_source_location/expand_katakana.expected    2018-05-28 14:01:54 +0900 (0674f5ce8)
@@ -1,4 +1,4 @@
-tokenize   'TokenNgram("report_source_location", true)'   "アイ㌕エオ"   NormalizerAuto
+tokenize   'TokenNgram("report_source_location", true)'   "アイ㌕エオ"   'NormalizerNFKC100'
 [
   [
     0,

  Modified: test/command/suite/tokenizers/ngram/report_source_location/expand_katakana.test (+1 -1)
===================================================================
--- test/command/suite/tokenizers/ngram/report_source_location/expand_katakana.test    2018-05-28 12:16:40 +0900 (5c4795fd8)
+++ test/command/suite/tokenizers/ngram/report_source_location/expand_katakana.test    2018-05-28 14:01:54 +0900 (fdf9fb6d4)
@@ -1,4 +1,4 @@
 tokenize \
   'TokenNgram("report_source_location", true)' \
   "アイ㌕エオ" \
-  NormalizerAuto
+  'NormalizerNFKC100'

  Modified: test/command/suite/tokenizers/ngram/report_source_location/expand_no_overlap.expected (+1 -1)
===================================================================
--- test/command/suite/tokenizers/ngram/report_source_location/expand_no_overlap.expected    2018-05-28 12:16:40 +0900 (eacdd61bb)
+++ test/command/suite/tokenizers/ngram/report_source_location/expand_no_overlap.expected    2018-05-28 14:01:54 +0900 (02dd5427b)
@@ -1,4 +1,4 @@
-tokenize   'TokenNgram("report_source_location", true)'   "A㌔Z"   NormalizerAuto
+tokenize   'TokenNgram("report_source_location", true)'   "A㌔Z"   'NormalizerNFKC100("report_source_offset", true)'
 [
   [
     0,

  Modified: test/command/suite/tokenizers/ngram/report_source_location/expand_no_overlap.test (+1 -1)
===================================================================
--- test/command/suite/tokenizers/ngram/report_source_location/expand_no_overlap.test    2018-05-28 12:16:40 +0900 (6e26f39a6)
+++ test/command/suite/tokenizers/ngram/report_source_location/expand_no_overlap.test    2018-05-28 14:01:54 +0900 (38b01dced)
@@ -1,4 +1,4 @@
 tokenize \
   'TokenNgram("report_source_location", true)' \
   "A㌔Z" \
-  NormalizerAuto
+  'NormalizerNFKC100("report_source_offset", true)'

  Modified: test/command/suite/tokenizers/ngram/report_source_location/hiragana.expected (+1 -1)
===================================================================
--- test/command/suite/tokenizers/ngram/report_source_location/hiragana.expected    2018-05-28 12:16:40 +0900 (d1d4a8ad9)
+++ test/command/suite/tokenizers/ngram/report_source_location/hiragana.expected    2018-05-28 14:01:54 +0900 (33074e0de)
@@ -1,4 +1,4 @@
-tokenize   'TokenNgram("report_source_location", true)'   "あいうえお"   NormalizerAuto
+tokenize   'TokenNgram("report_source_location", true)'   "あいうえお"   'NormalizerNFKC100("report_source_offset", true)'
 [
   [
     0,

  Modified: test/command/suite/tokenizers/ngram/report_source_location/hiragana.test (+1 -1)
===================================================================
--- test/command/suite/tokenizers/ngram/report_source_location/hiragana.test    2018-05-28 12:16:40 +0900 (d1ac43c8d)
+++ test/command/suite/tokenizers/ngram/report_source_location/hiragana.test    2018-05-28 14:01:54 +0900 (d5ff21622)
@@ -1,4 +1,4 @@
 tokenize \
   'TokenNgram("report_source_location", true)' \
   "あいうえお" \
-  NormalizerAuto
+  'NormalizerNFKC100("report_source_offset", true)'

  Copied: test/command/suite/tokenizers/ngram/report_source_location/loose_and_unify.expected (+35 -27) 71%
===================================================================
--- test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_non_number.expected    2018-05-28 12:16:40 +0900 (e0ccd2903)
+++ test/command/suite/tokenizers/ngram/report_source_location/loose_and_unify.expected    2018-05-28 14:01:54 +0900 (63a079929)
@@ -1,4 +1,4 @@
-tokenize   'TokenNgram("loose_symbol", true,               "report_source_location", true)'   "(あいうえお)"   'NormalizerNFKC100'
+tokenize   'TokenNgram("loose_symbol", true,               "report_source_location", true)'   "[クリアコード]"   'NormalizerNFKC100("unify_hyphen_and_prolonged_sound_mark", true,                      "report_source_offset", true)'
 [
   [
     0,
@@ -7,7 +7,7 @@ tokenize   'TokenNgram("loose_symbol", true,               "report_source_locati
   ],
   [
     {
-      "value": "(",
+      "value": "[",
       "position": 0,
       "force_prefix": false,
       "source_offset": 0,
@@ -15,7 +15,7 @@ tokenize   'TokenNgram("loose_symbol", true,               "report_source_locati
       "source_first_character_length": 1
     },
     {
-      "value": "あい",
+      "value": "クリ",
       "position": 1,
       "force_prefix": false,
       "source_offset": 1,
@@ -23,7 +23,7 @@ tokenize   'TokenNgram("loose_symbol", true,               "report_source_locati
       "source_first_character_length": 3
     },
     {
-      "value": "いう",
+      "value": "リア",
       "position": 2,
       "force_prefix": false,
       "source_offset": 4,
@@ -31,7 +31,7 @@ tokenize   'TokenNgram("loose_symbol", true,               "report_source_locati
       "source_first_character_length": 3
     },
     {
-      "value": "うえ",
+      "value": "アコ",
       "position": 3,
       "force_prefix": false,
       "source_offset": 7,
@@ -39,15 +39,15 @@ tokenize   'TokenNgram("loose_symbol", true,               "report_source_locati
       "source_first_character_length": 3
     },
     {
-      "value": "えお",
+      "value": "コ",
       "position": 4,
       "force_prefix": false,
       "source_offset": 10,
-      "source_length": 6,
+      "source_length": 3,
       "source_first_character_length": 3
     },
     {
-      "value": "お",
+      "value": "-",
       "position": 5,
       "force_prefix": false,
       "source_offset": 13,
@@ -55,60 +55,68 @@ tokenize   'TokenNgram("loose_symbol", true,               "report_source_locati
       "source_first_character_length": 3
     },
     {
-      "value": ")",
+      "value": "ド",
       "position": 6,
       "force_prefix": false,
       "source_offset": 16,
+      "source_length": 3,
+      "source_first_character_length": 3
+    },
+    {
+      "value": "]",
+      "position": 7,
+      "force_prefix": false,
+      "source_offset": 19,
       "source_length": 1,
       "source_first_character_length": 1
     },
     {
       "value": "￰",
-      "position": 7,
+      "position": 8,
       "force_prefix": false,
-      "source_offset": 17,
+      "source_offset": 20,
       "source_length": 0,
       "source_first_character_length": 0
     },
     {
-      "value": "あい",
-      "position": 8,
+      "value": "クリ",
+      "position": 9,
       "force_prefix": false,
-      "source_offset": 0,
-      "source_length": 7,
+      "source_offset": 1,
+      "source_length": 6,
       "source_first_character_length": 4
     },
     {
-      "value": "いう",
-      "position": 9,
+      "value": "リア",
+      "position": 10,
       "force_prefix": false,
       "source_offset": 4,
       "source_length": 6,
       "source_first_character_length": 3
     },
     {
-      "value": "うえ",
-      "position": 10,
+      "value": "アコ",
+      "position": 11,
       "force_prefix": false,
       "source_offset": 7,
-      "source_length": 6,
+      "source_length": 9,
       "source_first_character_length": 3
     },
     {
-      "value": "えお",
-      "position": 11,
+      "value": "コド",
+      "position": 12,
       "force_prefix": false,
       "source_offset": 10,
-      "source_length": 6,
+      "source_length": 9,
       "source_first_character_length": 3
     },
     {
-      "value": "お",
-      "position": 12,
+      "value": "ド",
+      "position": 13,
       "force_prefix": false,
-      "source_offset": 13,
+      "source_offset": 16,
       "source_length": 3,
-      "source_first_character_length": 3
+      "source_first_character_length": 6
     }
   ]
 ]

  Added: test/command/suite/tokenizers/ngram/report_source_location/loose_and_unify.test (+6 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/ngram/report_source_location/loose_and_unify.test    2018-05-28 14:01:54 +0900 (f1e51e3b9)
@@ -0,0 +1,6 @@
+tokenize \
+  'TokenNgram("loose_symbol", true, \
+              "report_source_location", true)' \
+  "[クリアコード]" \
+  'NormalizerNFKC100("unify_hyphen_and_prolonged_sound_mark", true, \
+                     "report_source_offset", true)'
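
The new loose_and_unify test (and the updated loose_symbol_non_number expectation) exercises the other half of the change: in loose mode the tokenizer drops symbol characters, so the per-character offsets have to be remapped so that each kept character still points back into the original text. The following is a minimal sketch of that remapping, modeled on the loose-offset handling in ngram_switch_to_loose_mode() but using made-up names (build_loose_offsets, a removed-flag array); it assumes the offsets array carries one trailing end-of-text entry, as the diff's use of offsets[n_chars] suggests.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static size_t
build_loose_offsets(const uint64_t *offsets,      /* n_chars + 1 entries */
                    const unsigned char *removed, /* 1 if character i is dropped */
                    size_t n_chars,
                    uint64_t *loose_offsets)      /* out: n_kept + 1 entries */
{
  size_t i;
  size_t n_kept = 0;
  uint64_t last_removed_offset = 0;

  for (i = 0; i < n_chars; i++) {
    if (removed[i]) {
      /* Remember where the first removed character since the last kept one
         started, mirroring last_offset in the commit. */
      if (last_removed_offset == 0) {
        last_removed_offset = offsets[i];
      }
    } else {
      /* A kept character inherits its original source offset. */
      loose_offsets[n_kept++] = offsets[i];
      last_removed_offset = 0;
    }
  }
  /* Trailing entry: the end of the source text, or the point where a trailing
     run of removed characters started. */
  loose_offsets[n_kept] = last_removed_offset ? last_removed_offset
                                              : offsets[n_chars];
  return n_kept;
}

int
main(void)
{
  /* Example from the loose_symbol_non_number test: "(あいうえお)" with the
     ASCII parentheses dropped in loose mode.  Offsets are byte offsets into
     the original UTF-8 source text. */
  const uint64_t offsets[] = {0, 1, 4, 7, 10, 13, 16, 17};
  const unsigned char removed[] = {1, 0, 0, 0, 0, 0, 1};
  uint64_t loose_offsets[8];
  size_t n_kept = build_loose_offsets(offsets, removed, 7, loose_offsets);
  size_t i;

  for (i = 0; i <= n_kept; i++) {
    printf("%zu: %llu\n", i, (unsigned long long)loose_offsets[i]);
  }
  /* Prints 1, 4, 7, 10, 13 and a trailing 16: the loose "あい" token therefore
     reports source_offset 1 and source_length 6, matching the updated
     expectation above. */
  return 0;
}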

  Modified: test/command/suite/tokenizers/ngram/report_source_location/loose_symbol.expected (+1 -1)
===================================================================
--- test/command/suite/tokenizers/ngram/report_source_location/loose_symbol.expected    2018-05-28 12:16:40 +0900 (83927a3eb)
+++ test/command/suite/tokenizers/ngram/report_source_location/loose_symbol.expected    2018-05-28 14:01:54 +0900 (d7ecd9fa7)
@@ -1,4 +1,4 @@
-tokenize   'TokenNgram("report_source_location", true, "loose_symbol", true)'   "090(1234)56−78"   NormalizerAuto
+tokenize   'TokenNgram("report_source_location", true, "loose_symbol", true)'   "090(1234)56−78"   'NormalizerNFKC100("report_source_offset", true)'
 [
   [
     0,

  Modified: test/command/suite/tokenizers/ngram/report_source_location/loose_symbol.test (+1 -1)
===================================================================
--- test/command/suite/tokenizers/ngram/report_source_location/loose_symbol.test    2018-05-28 12:16:40 +0900 (135a9c270)
+++ test/command/suite/tokenizers/ngram/report_source_location/loose_symbol.test    2018-05-28 14:01:54 +0900 (c10ee0528)
@@ -1,4 +1,4 @@
 tokenize \
   'TokenNgram("report_source_location", true, "loose_symbol", true)' \
   "090(1234)56−78" \
-  NormalizerAuto
+  'NormalizerNFKC100("report_source_offset", true)'

  Modified: test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_non_number.expected (+3 -3)
===================================================================
--- test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_non_number.expected    2018-05-28 12:16:40 +0900 (e0ccd2903)
+++ test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_non_number.expected    2018-05-28 14:01:54 +0900 (231e19e69)
@@ -1,4 +1,4 @@
-tokenize   'TokenNgram("loose_symbol", true,               "report_source_location", true)'   "(あいうえお)"   'NormalizerNFKC100'
+tokenize   'TokenNgram("loose_symbol", true,               "report_source_location", true)'   "(あいうえお)"   'NormalizerNFKC100("report_source_offset", true)'
 [
   [
     0,
@@ -74,8 +74,8 @@ tokenize   'TokenNgram("loose_symbol", true,               "report_source_locati
       "value": "あい",
       "position": 8,
       "force_prefix": false,
-      "source_offset": 0,
-      "source_length": 7,
+      "source_offset": 1,
+      "source_length": 6,
       "source_first_character_length": 4
     },
     {

  Modified: test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_non_number.test (+1 -1)
===================================================================
--- test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_non_number.test    2018-05-28 12:16:40 +0900 (8b9a1545a)
+++ test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_non_number.test    2018-05-28 14:01:54 +0900 (d25604773)
@@ -2,4 +2,4 @@ tokenize \
   'TokenNgram("loose_symbol", true, \
               "report_source_location", true)' \
   "(あいうえお)" \
-  'NormalizerNFKC100'
+  'NormalizerNFKC100("report_source_offset", true)'