[Groonga-commit] groonga/groonga at c5cb663 [master] NormalizerNFKC100: add "unify_kana_voiced_sound_mark" option

Back to archive index

Kouhei Sutou null+****@clear*****
Wed Apr 11 18:27:04 JST 2018


Kouhei Sutou	2018-04-11 18:27:04 +0900 (Wed, 11 Apr 2018)

  New Revision: c5cb663b84bfd630c04c0a1a0463c521ef7fe222
  https://github.com/groonga/groonga/commit/c5cb663b84bfd630c04c0a1a0463c521ef7fe222

  Message:
    NormalizerNFKC100: add "unify_kana_voiced_sound_mark" option

  Added files:
    test/command/suite/normalizers/nfkc100/unify_kana_voiced_sound_mark_hiragana.expected
    test/command/suite/normalizers/nfkc100/unify_kana_voiced_sound_mark_hiragana.test
    test/command/suite/normalizers/nfkc100/unify_kana_voiced_sound_mark_katakana.expected
    test/command/suite/normalizers/nfkc100/unify_kana_voiced_sound_mark_katakana.test
  Modified files:
    lib/normalizer.c

  Modified: lib/normalizer.c (+119 -0)
===================================================================
--- lib/normalizer.c    2018-04-11 17:56:40 +0900 (4a84ff589)
+++ lib/normalizer.c    2018-04-11 18:27:04 +0900 (18c442939)
@@ -622,6 +622,7 @@ typedef struct {
   grn_nfkc_compose_func compose_func;
   grn_bool unify_kana;
   grn_bool unify_kana_case;
+  grn_bool unify_kana_voiced_sound_mark;
 } grn_utf8_normalize_options;
 
 static void
@@ -635,6 +636,7 @@ utf8_normalize_options_init(grn_utf8_normalize_options *options,
   options->compose_func = compose_func;
   options->unify_kana = GRN_FALSE;
   options->unify_kana_case = GRN_FALSE;
+  options->unify_kana_voiced_sound_mark = GRN_FALSE;
 }
 
 grn_inline static const unsigned char *
@@ -754,6 +756,96 @@ utf8_normalize_unify_katakana_case(const unsigned char *utf8_char,
   return utf8_char;
 }
 
+grn_inline static const unsigned char *
+utf8_normalize_unify_hiragana_voiced_sound_mark(const unsigned char *utf8_char,
+                                                unsigned char *unified)
+{
+  if (utf8_char[0] == 0xe3) {
+    if ((utf8_char[1] == 0x81 && (0x8c <= utf8_char[2] &&
+                                  utf8_char[2] <= 0xa2))) {
+      /* U+304C HIRAGANA LETTER GA ..
+       * U+3062 HIRAGANA LETTER DI */
+      if (!(utf8_char[2] & 0x1)) {
+        unified[0] = utf8_char[0];
+        unified[1] = utf8_char[1];
+        unified[2] = utf8_char[2] - 1;
+        return unified;
+      }
+    } else if ((utf8_char[1] == 0x81 && (0xa5 <= utf8_char[2] &&
+                                         utf8_char[2] <= 0xa9))) {
+      /* U+3065 HIRAGANA LETTER DU ..
+       * U+3069 HIRAGANA LETTER DO */
+      if (utf8_char[2] & 0x1) {
+        unified[0] = utf8_char[0];
+        unified[1] = utf8_char[1];
+        unified[2] = utf8_char[2] - 1;
+        return unified;
+      }
+    } else if ((utf8_char[1] == 0x81 && (0xb0 <= utf8_char[2] &&
+                                         utf8_char[2] <= 0xbd))) {
+      /* U+3070 HIRAGANA LETTER BA ..
+       * U+307D HIRAGANA LETTER PO */
+      unsigned char mod3 = (utf8_char[2] - 1) % 3;
+      if (mod3 != 0) {
+        unified[0] = utf8_char[0];
+        unified[1] = utf8_char[1];
+        unified[2] = utf8_char[2] - mod3;
+        return unified;
+      }
+    }
+  }
+
+  return utf8_char;
+}
+
+grn_inline static const unsigned char *
+utf8_normalize_unify_katakana_voiced_sound_mark(const unsigned char *utf8_char,
+                                                unsigned char *unified)
+{
+  if (utf8_char[0] == 0xe3) {
+    if (utf8_char[1] == 0x83 && utf8_char[2] == 0x80) {
+      /* U+30C0 KATAKANA LETTER DA */
+      unified[0] = utf8_char[0];
+      unified[1] = 0x82;
+      unified[2] = 0xbf;
+      return unified;
+    } else if ((utf8_char[1] == 0x82 && 0xac <= utf8_char[2]) ||
+               (utf8_char[1] == 0x83 && utf8_char[2] <= 0x82)) {
+      /* U+30AC KATAKANA LETTER GA ..
+       * U+30C2 KATAKANA LETTER DI */
+      if (!(utf8_char[2] & 0x1)) {
+        unified[0] = utf8_char[0];
+        unified[1] = utf8_char[1];
+        unified[2] = utf8_char[2] - 1;
+        return unified;
+      }
+    } else if ((utf8_char[1] == 0x83 && (0x85 <= utf8_char[2] &&
+                                         utf8_char[2] <= 0x89))) {
+      /* U+30C5 KATAKANA LETTER DU ..
+       * U+30C9 KATAKANA LETTER DO */
+      if (utf8_char[2] & 0x1) {
+        unified[0] = utf8_char[0];
+        unified[1] = utf8_char[1];
+        unified[2] = utf8_char[2] - 1;
+        return unified;
+      }
+    } else if ((utf8_char[1] == 0x83 && (0x90 <= utf8_char[2] &&
+                                         utf8_char[2] <= 0x9d))) {
+      /* U+30D0 KATAKANA LETTER BA ..
+       * U+30DD KATAKANA LETTER PO */
+      unsigned char mod3 = (utf8_char[2] - 2) % 3;
+      if (mod3 != 0) {
+        unified[0] = utf8_char[0];
+        unified[1] = utf8_char[1];
+        unified[2] = utf8_char[2] - mod3;
+        return unified;
+      }
+    }
+  }
+
+  return utf8_char;
+}
+
 grn_inline static grn_obj *
 utf8_normalize(grn_ctx *ctx,
                grn_string *nstr,
@@ -879,6 +971,7 @@ utf8_normalize(grn_ctx *ctx,
         {
           unsigned char unified_kana[3];
           unsigned char unified_kana_case[3];
+          unsigned char unified_kana_voiced_sound_mark[3];
 
           if (options->unify_kana &&
               char_type == GRN_CHAR_KATAKANA &&
@@ -906,6 +999,25 @@ utf8_normalize(grn_ctx *ctx,
             }
           }
 
+          if (options->unify_kana_voiced_sound_mark) {
+            switch (char_type) {
+            case GRN_CHAR_HIRAGANA :
+              if (lp == 3) {
+                p = utf8_normalize_unify_hiragana_voiced_sound_mark(
+                  p, unified_kana_voiced_sound_mark);
+              }
+              break;
+            case GRN_CHAR_KATAKANA :
+              if (lp == 3) {
+                p = utf8_normalize_unify_katakana_voiced_sound_mark(
+                  p, unified_kana_voiced_sound_mark);
+              }
+              break;
+            default :
+              break;
+            }
+          }
+
           grn_memcpy(d, p, lp);
         }
         d_ = d;
@@ -1385,6 +1497,13 @@ nfkc100_open_options(grn_ctx *ctx,
                                     raw_options,
                                     i,
                                     options->unify_kana_case);
+    } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw,
+                                            "unify_kana_voiced_sound_mark")) {
+      options->unify_kana_voiced_sound_mark =
+        grn_vector_get_element_bool(ctx,
+                                    raw_options,
+                                    i,
+                                    options->unify_kana_voiced_sound_mark);
     }
   } GRN_OPTION_VALUES_EACH_END();
 

  Added: test/command/suite/normalizers/nfkc100/unify_kana_voiced_sound_mark_hiragana.expected (+61 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/normalizers/nfkc100/unify_kana_voiced_sound_mark_hiragana.expected    2018-04-11 18:27:04 +0900 (08f44f3a5)
@@ -0,0 +1,61 @@
+normalize   'NormalizerNFKC100("unify_kana_voiced_sound_mark", true)'   "かがきぎくぐけげこごさざしじすずせぜそぞただちぢつづてでとどはばぱひびぴふぶぷへべぺほぼぽ"   WITH_TYPES
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  {
+    "normalized": "かかききくくけけここささししすすせせそそたたちちつつててととはははひひひふふふへへへほほほ",
+    "types": [
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana"
+    ],
+    "checks": [
+
+    ]
+  }
+]

  Added: test/command/suite/normalizers/nfkc100/unify_kana_voiced_sound_mark_hiragana.test (+4 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/normalizers/nfkc100/unify_kana_voiced_sound_mark_hiragana.test    2018-04-11 18:27:04 +0900 (ad3a6db2c)
@@ -0,0 +1,4 @@
+normalize \
+  'NormalizerNFKC100("unify_kana_voiced_sound_mark", true)' \
+  "かがきぎくぐけげこごさざしじすずせぜそぞただちぢつづてでとどはばぱひびぴふぶぷへべぺほぼぽ" \
+  WITH_TYPES

  Added: test/command/suite/normalizers/nfkc100/unify_kana_voiced_sound_mark_katakana.expected (+61 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/normalizers/nfkc100/unify_kana_voiced_sound_mark_katakana.expected    2018-04-11 18:27:04 +0900 (4c68cbb52)
@@ -0,0 +1,61 @@
+normalize   'NormalizerNFKC100("unify_kana_voiced_sound_mark", true)'   "カガキギクグケゲコゴサザシジスズセゼソゾタダチヂツヅテデトドハバパヒビピフブプヘベペホボポ"   WITH_TYPES
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  {
+    "normalized": "カカキキククケケココササシシススセセソソタタチチツツテテトトハハハヒヒヒフフフヘヘヘホホホ",
+    "types": [
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana"
+    ],
+    "checks": [
+
+    ]
+  }
+]

  Added: test/command/suite/normalizers/nfkc100/unify_kana_voiced_sound_mark_katakana.test (+4 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/normalizers/nfkc100/unify_kana_voiced_sound_mark_katakana.test    2018-04-11 18:27:04 +0900 (6b94b4b0e)
@@ -0,0 +1,4 @@
+normalize \
+  'NormalizerNFKC100("unify_kana_voiced_sound_mark", true)' \
+  "カガキギクグケゲコゴサザシジスズセゼソゾタダチヂツヅテデトドハバパヒビピフブプヘベペホボポ" \
+  WITH_TYPES
-------------- next part --------------
HTMLの添付ファイルを保管しました...
URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180411/cd5d4bae/attachment-0001.htm 



More information about the Groonga-commit mailing list
Back to archive index