[Groonga-commit] groonga/groonga at b91e966 [master] TokenNgram report_source_location: fix wrong report

Back to archive index

Kouhei Sutou null+****@clear*****
Fri May 11 14:40:37 JST 2018


Kouhei Sutou	2018-05-11 14:40:37 +0900 (Fri, 11 May 2018)

  New Revision: b91e966ff9b8ffa564c245e2db2b8e06021e110e
  https://github.com/groonga/groonga/commit/b91e966ff9b8ffa564c245e2db2b8e06021e110e

  Message:
    TokenNgram report_source_location: fix wrong report

  Added files:
    test/command/suite/tokenizers/ngram/report_source_location/expand_no_overlap.expected
    test/command/suite/tokenizers/ngram/report_source_location/hiragana.expected
  Copied files:
    test/command/suite/tokenizers/ngram/report_source_location/expand_katakana.test
      (from test/command/suite/tokenizers/ngram/report_source_location/expand.test)
    test/command/suite/tokenizers/ngram/report_source_location/expand_no_overlap.test
      (from test/command/suite/tokenizers/ngram/report_source_location/expand.test)
  Modified files:
    lib/tokenizers.c
  Renamed files:
    test/command/suite/tokenizers/ngram/report_source_location/expand_katakana.expected
      (from test/command/suite/tokenizers/ngram/report_source_location/expand.expected)
    test/command/suite/tokenizers/ngram/report_source_location/hiragana.test
      (from test/command/suite/tokenizers/ngram/report_source_location/expand.test)

  Modified: lib/tokenizers.c (+13 -17)
===================================================================
--- lib/tokenizers.c    2018-05-11 12:26:32 +0900 (be7d9bca5)
+++ lib/tokenizers.c    2018-05-11 14:40:37 +0900 (3e40e8f02)
@@ -842,39 +842,35 @@ ngram_next(grn_ctx *ctx,
     grn_token_set_overlap(ctx, token, tokenizer->overlap);
     if (checks) {
       size_t i;
-      uint32_t uncount_offset = 0;
       uint32_t source_length = 0;
+      uint64_t next_offset = tokenizer->source_offset;
+      grn_bool first_character = GRN_TRUE;
       grn_token_set_source_offset(ctx, token, tokenizer->source_offset);
       if (checks[0] == -1) {
         size_t n_leading_bytes = p - tokenizer->start;
         for (i = 1; i <= n_leading_bytes; i++) {
           if (checks[-i] > 0) {
-            uncount_offset = source_length = checks[-i];
+            source_length = checks[-i];
+            if (!tokenizer->overlap) {
+              next_offset += checks[-i];
+            }
+            first_character = GRN_FALSE;
             break;
           }
         }
       }
       for (i = 0; i < data_size; i++) {
         if (checks[i] > 0) {
-          source_length += checks[i];
-        }
-      }
-      if (r < e) {
-        if (checks[i] > 0) {
-          if (!tokenizer->overlap) {
-            uncount_offset = 0;
-          }
-        } else if (checks[i] == -1) {
-          for (; i > 0; i--) {
-            if (checks[i - 1] > 0) {
-              uncount_offset += checks[i - 1];
-              break;
-            }
+          if ((tokenizer->overlap && !first_character) ||
+              !tokenizer->overlap) {
+            next_offset += checks[i];
           }
+          source_length += checks[i];
+          first_character = GRN_FALSE;
         }
       }
       grn_token_set_source_length(ctx, token, source_length);
-      tokenizer->source_offset += source_length - uncount_offset;
+      tokenizer->source_offset = next_offset;
     }
   }
 }

  Renamed: test/command/suite/tokenizers/ngram/report_source_location/expand_katakana.expected (+28 -14) 58%
===================================================================
--- test/command/suite/tokenizers/ngram/report_source_location/expand.expected    2018-05-11 12:26:32 +0900 (56c2b8d84)
+++ test/command/suite/tokenizers/ngram/report_source_location/expand_katakana.expected    2018-05-11 14:40:37 +0900 (7f8fdc0a2)
@@ -1,4 +1,4 @@
-tokenize   'TokenNgram("report_source_location", true)'   "ア㌕Az"   NormalizerAuto
+tokenize   'TokenNgram("report_source_location", true)'   "アイ㌕エオ"   NormalizerAuto
 [
   [
     0,
@@ -7,53 +7,67 @@ tokenize   'TokenNgram("report_source_location", true)'   "ア㌕Az"   Normali
   ],
   [
     {
-      "value": "アキ",
+      "value": "アイ",
       "position": 0,
       "force_prefix": false,
       "source_offset": 0,
       "source_length": 6
     },
     {
-      "value": "キロ",
+      "value": "イキ",
       "position": 1,
       "force_prefix": false,
       "source_offset": 3,
-      "source_length": 3
+      "source_length": 6
     },
     {
-      "value": "ログ",
+      "value": "キロ",
       "position": 2,
       "force_prefix": false,
-      "source_offset": 3,
+      "source_offset": 6,
       "source_length": 3
     },
     {
-      "value": "グラ",
+      "value": "ログ",
       "position": 3,
       "force_prefix": false,
-      "source_offset": 3,
+      "source_offset": 6,
       "source_length": 3
     },
     {
-      "value": "ラム",
+      "value": "グラ",
       "position": 4,
       "force_prefix": false,
-      "source_offset": 3,
+      "source_offset": 6,
       "source_length": 3
     },
     {
-      "value": "ム",
+      "value": "ラム",
       "position": 5,
       "force_prefix": false,
-      "source_offset": 3,
+      "source_offset": 6,
       "source_length": 3
     },
     {
-      "value": "az",
+      "value": "ムエ",
       "position": 6,
       "force_prefix": false,
       "source_offset": 6,
-      "source_length": 4
+      "source_length": 6
+    },
+    {
+      "value": "エオ",
+      "position": 7,
+      "force_prefix": false,
+      "source_offset": 9,
+      "source_length": 6
+    },
+    {
+      "value": "オ",
+      "position": 8,
+      "force_prefix": false,
+      "source_offset": 12,
+      "source_length": 3
     }
   ]
 ]

  Copied: test/command/suite/tokenizers/ngram/report_source_location/expand_katakana.test (+1 -1) 77%
===================================================================
--- test/command/suite/tokenizers/ngram/report_source_location/expand.test    2018-05-11 12:26:32 +0900 (f45dd3257)
+++ test/command/suite/tokenizers/ngram/report_source_location/expand_katakana.test    2018-05-11 14:40:37 +0900 (5c4795fd8)
@@ -1,4 +1,4 @@
 tokenize \
   'TokenNgram("report_source_location", true)' \
-  "ア㌕Az" \
+  "アイ㌕エオ" \
   NormalizerAuto

  Added: test/command/suite/tokenizers/ngram/report_source_location/expand_no_overlap.expected (+38 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/ngram/report_source_location/expand_no_overlap.expected    2018-05-11 14:40:37 +0900 (1e4bb5327)
@@ -0,0 +1,38 @@
+tokenize   'TokenNgram("report_source_location", true)'   "A㌔Z"   NormalizerAuto
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "a",
+      "position": 0,
+      "force_prefix": false,
+      "source_offset": 0,
+      "source_length": 3
+    },
+    {
+      "value": "キロ",
+      "position": 1,
+      "force_prefix": false,
+      "source_offset": 3,
+      "source_length": 3
+    },
+    {
+      "value": "ロ",
+      "position": 2,
+      "force_prefix": false,
+      "source_offset": 3,
+      "source_length": 3
+    },
+    {
+      "value": "z",
+      "position": 3,
+      "force_prefix": false,
+      "source_offset": 6,
+      "source_length": 3
+    }
+  ]
+]

  Copied: test/command/suite/tokenizers/ngram/report_source_location/expand_no_overlap.test (+1 -1) 81%
===================================================================
--- test/command/suite/tokenizers/ngram/report_source_location/expand.test    2018-05-11 12:26:32 +0900 (f45dd3257)
+++ test/command/suite/tokenizers/ngram/report_source_location/expand_no_overlap.test    2018-05-11 14:40:37 +0900 (6e26f39a6)
@@ -1,4 +1,4 @@
 tokenize \
   'TokenNgram("report_source_location", true)' \
-  "ア㌕Az" \
+  "A㌔Z" \
   NormalizerAuto

  Added: test/command/suite/tokenizers/ngram/report_source_location/hiragana.expected (+45 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/ngram/report_source_location/hiragana.expected    2018-05-11 14:40:37 +0900 (f092fa91a)
@@ -0,0 +1,45 @@
+tokenize   'TokenNgram("report_source_location", true)'   "あいうえお"   NormalizerAuto
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "あい",
+      "position": 0,
+      "force_prefix": false,
+      "source_offset": 0,
+      "source_length": 6
+    },
+    {
+      "value": "いう",
+      "position": 1,
+      "force_prefix": false,
+      "source_offset": 3,
+      "source_length": 6
+    },
+    {
+      "value": "うえ",
+      "position": 2,
+      "force_prefix": false,
+      "source_offset": 6,
+      "source_length": 6
+    },
+    {
+      "value": "えお",
+      "position": 3,
+      "force_prefix": false,
+      "source_offset": 9,
+      "source_length": 6
+    },
+    {
+      "value": "お",
+      "position": 4,
+      "force_prefix": false,
+      "source_offset": 12,
+      "source_length": 3
+    }
+  ]
+]

  Renamed: test/command/suite/tokenizers/ngram/report_source_location/hiragana.test (+1 -1) 77%
===================================================================
--- test/command/suite/tokenizers/ngram/report_source_location/expand.test    2018-05-11 12:26:32 +0900 (f45dd3257)
+++ test/command/suite/tokenizers/ngram/report_source_location/hiragana.test    2018-05-11 14:40:37 +0900 (d1ac43c8d)
@@ -1,4 +1,4 @@
 tokenize \
   'TokenNgram("report_source_location", true)' \
-  "ア㌕Az" \
+  "あいうえお" \
   NormalizerAuto
-------------- next part --------------
HTMLの添付ファイルを保管しました...
URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180511/8a3547d2/attachment-0001.htm 



More information about the Groonga-commit mailing list
Back to archive index