[Groonga-commit] groonga/groonga at 40e3790 [master] NormalizerNFKC100: fix a bug that stateful normalization breaks stateless normalization

Back to archive index
Kouhei Sutou null+****@clear*****
Fri Jan 25 11:21:50 JST 2019


Kouhei Sutou	2019-01-25 11:21:50 +0900 (Fri, 25 Jan 2019)

  Revision: 40e3790667a58175177c5fdae69e97fca839d8e0
  https://github.com/groonga/groonga/commit/40e3790667a58175177c5fdae69e97fca839d8e0

  Message:
    NormalizerNFKC100: fix a bug that stateful normalization breaks stateless normalization
    
    For example, combining "unify_kana" (stateless) and
    "unify_katakana_v_sounds" (stateful) returns wrong normalized text.

  Added files:
    test/command/suite/normalizers/nfkc100/mix/unify_katakana_v_sounds_and_kana.expected
    test/command/suite/normalizers/nfkc100/mix/unify_katakana_v_sounds_and_kana.test
  Modified files:
    lib/normalizer.c

  Modified: lib/normalizer.c (+23 -2)
===================================================================
--- lib/normalizer.c    2019-01-24 16:22:59 +0900 (5a0dc7f23)
+++ lib/normalizer.c    2019-01-25 11:21:50 +0900 (024766a8d)
@@ -1,7 +1,7 @@
 /* -*- c-basic-offset: 2 -*- */
 /*
   Copyright(C) 2012-2018 Brazil
-  Copyright(C) 2018 Kouhei Sutou <kou****@clear*****>
+  Copyright(C) 2018-2019 Kouhei Sutou <kou****@clear*****>
 
   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -663,6 +663,18 @@ grn_nfkc_normalize_context_swap(grn_ctx *ctx,
 }
 
 grn_inline static void
+grn_nfkc_normalize_context_rewind(grn_ctx *ctx,
+                                  grn_nfkc_normalize_context *context)
+{
+  context->d = context->dest;
+  context->d_ = NULL;
+  context->n_characters = 0;
+  context->c = context->checks;
+  context->t = context->types;
+  context->o = context->offsets;
+}
+
+grn_inline static void
 grn_nfkc_normalize_data_init(grn_ctx *ctx,
                              grn_nfkc_normalize_data *data,
                              grn_obj *string,
@@ -1337,6 +1349,9 @@ grn_nfkc_normalize_unify_katakana_v_sounds(grn_ctx *ctx,
     return unified_buffer;
   }
 
+  *n_unified_bytes = *n_used_bytes;
+  *n_unified_characters = *n_used_characters;
+
   return current;
 }
 
@@ -1390,6 +1405,9 @@ grn_nfkc_normalize_unify_katakana_bu_sound(grn_ctx *ctx,
     return unified_buffer;
   }
 
+  *n_unified_bytes = *n_used_bytes;
+  *n_unified_characters = *n_used_characters;
+
   return current;
 }
 
@@ -1436,7 +1454,7 @@ grn_nfkc_normalize_unify_stateful(grn_ctx *ctx,
       }
       if (unify->c) {
         size_t i;
-        *(unify->c++) += data->context.checks[i_byte];
+        *(unify->c++) = data->context.checks[i_byte];
         for (i = 1; i < n_unified_bytes; i++) {
           *(unify->c++) = 0;
         }
@@ -1547,6 +1565,7 @@ grn_nfkc_normalize_unify(grn_ctx *ctx,
   if (data->options->unify_katakana_v_sounds) {
     if (need_swap) {
       grn_nfkc_normalize_context_swap(ctx, &(data->context), &unify);
+      grn_nfkc_normalize_context_rewind(ctx, &unify);
     }
     grn_nfkc_normalize_unify_stateful(ctx,
                                       data,
@@ -1562,6 +1581,7 @@ grn_nfkc_normalize_unify(grn_ctx *ctx,
   if (data->options->unify_katakana_bu_sound) {
     if (need_swap) {
       grn_nfkc_normalize_context_swap(ctx, &(data->context), &unify);
+      grn_nfkc_normalize_context_rewind(ctx, &unify);
     }
     grn_nfkc_normalize_unify_stateful(ctx,
                                       data,
@@ -1577,6 +1597,7 @@ grn_nfkc_normalize_unify(grn_ctx *ctx,
   if (data->options->unify_to_romaji) {
     if (need_swap) {
       grn_nfkc_normalize_context_swap(ctx, &(data->context), &unify);
+      grn_nfkc_normalize_context_rewind(ctx, &unify);
     }
     grn_nfkc_normalize_unify_stateful(ctx,
                                       data,

  Added: test/command/suite/normalizers/nfkc100/mix/unify_katakana_v_sounds_and_kana.expected (+72 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/normalizers/nfkc100/mix/unify_katakana_v_sounds_and_kana.expected    2019-01-25 11:21:50 +0900 (7049c7dce)
@@ -0,0 +1,72 @@
+normalize   'NormalizerNFKC100("unify_kana", true,                      "unify_katakana_v_sounds", true,                      "report_source_offset", true)'   "うヴァヴィヴヴェヴォヴ"   WITH_CHECKS|WITH_TYPES
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  {
+    "normalized": "うゔぁゔぃゔゔぇゔぉゔ",
+    "types": [
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana"
+    ],
+    "checks": [
+      3,
+      0,
+      0,
+      3,
+      0,
+      0,
+      3,
+      0,
+      0,
+      3,
+      0,
+      0,
+      3,
+      0,
+      0,
+      3,
+      0,
+      0,
+      3,
+      0,
+      0,
+      3,
+      0,
+      0,
+      3,
+      0,
+      0,
+      3,
+      0,
+      0,
+      3,
+      0,
+      0
+    ],
+    "offsets": [
+      0,
+      3,
+      6,
+      9,
+      12,
+      15,
+      18,
+      21,
+      24,
+      27,
+      30
+    ]
+  }
+]

  Added: test/command/suite/normalizers/nfkc100/mix/unify_katakana_v_sounds_and_kana.test (+6 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/normalizers/nfkc100/mix/unify_katakana_v_sounds_and_kana.test    2019-01-25 11:21:50 +0900 (99ba70623)
@@ -0,0 +1,6 @@
+normalize \
+  'NormalizerNFKC100("unify_kana", true, \
+                     "unify_katakana_v_sounds", true, \
+                     "report_source_offset", true)' \
+  "うヴァヴィヴヴェヴォヴ" \
+  WITH_CHECKS|WITH_TYPES
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190125/b5897eeb/attachment-0001.html>


More information about the Groonga-commit mailing list
Back to archive index