[Groonga-commit] groonga/groonga at 75620f1 [master] NormalizerNFKC100: support xtsu + n + [pbm] case

Back to archive index
Kouhei Sutou null+****@clear*****
Wed Nov 7 16:21:37 JST 2018


Kouhei Sutou	2018-11-07 16:21:37 +0900 (Wed, 07 Nov 2018)

  Revision: 75620f1e2a827d5c2fe9bd1543f41556d1df4bcb
  https://github.com/groonga/groonga/commit/75620f1e2a827d5c2fe9bd1543f41556d1df4bcb

  Message:
    NormalizerNFKC100: support xtsu + n + [pbm] case

  Added files:
    test/command/suite/normalizers/nfkc100/unify_to_romaji/hiragana/xtsu.expected
    test/command/suite/normalizers/nfkc100/unify_to_romaji/hiragana/xtsu.test
    test/command/suite/normalizers/nfkc100/unify_to_romaji/katakana/xtsu.expected
    test/command/suite/normalizers/nfkc100/unify_to_romaji/katakana/xtsu.test
  Modified files:
    lib/grn_romaji.h
    lib/normalizer.c
    lib/romaji.c

  Modified: lib/grn_romaji.h (+1 -1)
===================================================================
--- lib/grn_romaji.h    2018-11-07 15:59:23 +0900 (f056db4db)
+++ lib/grn_romaji.h    2018-11-07 16:21:37 +0900 (318afe336)
@@ -25,7 +25,7 @@ extern "C" {
 #endif
 
 const unsigned char *
-grn_romaji_convert_hepburn(grn_ctx *ctx,
+grn_romaji_hepburn_convert(grn_ctx *ctx,
                            const unsigned char *current,
                            const unsigned char *end,
                            size_t *n_used_bytes,

  Modified: lib/normalizer.c (+1 -1)
===================================================================
--- lib/normalizer.c    2018-11-07 15:59:23 +0900 (c32ef6719)
+++ lib/normalizer.c    2018-11-07 16:21:37 +0900 (ab5dc1c4b)
@@ -1580,7 +1580,7 @@ grn_nfkc_normalize_unify(grn_ctx *ctx,
     grn_nfkc_normalize_unify_stateful(ctx,
                                       data,
                                       &unify,
-                                      grn_romaji_convert_hepburn,
+                                      grn_romaji_hepburn_convert,
                                       "[unify][romaji]");
     if (ctx->rc != GRN_SUCCESS) {
       goto exit;

  Modified: lib/romaji.c (+77 -37)
===================================================================
--- lib/romaji.c    2018-11-07 15:59:23 +0900 (5270bf725)
+++ lib/romaji.c    2018-11-07 16:21:37 +0900 (bdbb63edf)
@@ -19,8 +19,63 @@
 #include "grn_romaji.h"
 #include "grn_str.h"
 
+static grn_inline grn_bool
+grn_romaji_hepburn_is_pbm(const unsigned char *utf8,
+                          size_t length)
+{
+  if (length != 3) {
+    return GRN_FALSE;
+  }
+
+  switch (utf8[0]) {
+  case 0xe3 :
+    switch (utf8[1]) {
+    case 0x81 :
+      switch (utf8[2]) {
+      case 0xb0 : /* U+3070 HIRAGANA LETTER BA */
+      case 0xb1 : /* U+3071 HIRAGANA LETTER PA */
+      case 0xb3 : /* U+3073 HIRAGANA LETTER BI */
+      case 0xb4 : /* U+3074 HIRAGANA LETTER PI */
+      case 0xb6 : /* U+3076 HIRAGANA LETTER BU */
+      case 0xb7 : /* U+3077 HIRAGANA LETTER PU */
+      case 0xb9 : /* U+3079 HIRAGANA LETTER BE */
+      case 0xba : /* U+307A HIRAGANA LETTER PE */
+        return GRN_TRUE;
+      default :
+        /* U+307C HIRAGANA LETTER BO ..
+         * U+307F HIRAGANA LETTER MI */
+        return utf8[2] >= 0xbc;
+      }
+    case 0x82 :
+      /* U+3080 HIRAGANA LETTER MU ..
+       * U+3082 HIRAGANA LETTER MO */
+      return (0x80 <= utf8[2] && utf8[2] <= 0x82);
+    case 0x83 :
+      switch (utf8[2]) {
+      case 0x90 : /* U+30D0 KATAKANA LETTER BA */
+      case 0x91 : /* U+30D1 KATAKANA LETTER PA */
+      case 0x93 : /* U+30D3 KATAKANA LETTER BI */
+      case 0x94 : /* U+30D4 KATAKANA LETTER PI */
+      case 0x96 : /* U+30D6 KATAKANA LETTER BU */
+      case 0x97 : /* U+30D7 KATAKANA LETTER PU */
+      case 0x99 : /* U+30D9 KATAKANA LETTER BE */
+      case 0x9a : /* U+30DA KATAKANA LETTER PE */
+        return GRN_TRUE;
+      default :
+        /* U+30DC KATAKANA LETTER BO ..
+         * U+30E2 KATAKANA LETTER MO */
+        return (0x9c <= utf8[2] && utf8[2] <= 0xa2);
+      }
+    default :
+      return GRN_FALSE;
+    }
+  default :
+    return GRN_FALSE;
+  }
+}
+
 const unsigned char *
-grn_romaji_convert_hepburn(grn_ctx *ctx,
+grn_romaji_hepburn_convert(grn_ctx *ctx,
                            const unsigned char *current,
                            const unsigned char *end,
                            size_t *n_used_bytes,
@@ -48,6 +103,7 @@ grn_romaji_convert_hepburn(grn_ctx *ctx,
   if (char_length == 3) {
     next = current + char_length;
     next_char_length = grn_charlen_(ctx, next, end, GRN_ENC_UTF8);
+    next_pbm = grn_romaji_hepburn_is_pbm(next, next_char_length);
     if (next_char_length == 3) {
       if (next[0] == 0xe3 &&
           next[1] == 0x82 &&
@@ -64,38 +120,6 @@ grn_romaji_convert_hepburn(grn_ctx *ctx,
         next_small_y = GRN_TRUE;
         next_small_yayuyo = aiueo[(next[2] - 3) % 5];
       } else if (next[0] == 0xe3 &&
-                 ((next[1] == 0x81 &&
-                   (next[2] == 0xb0 || /* U+3070 HIRAGANA LETTER BA */
-                    next[2] == 0xb1 || /* U+3071 HIRAGANA LETTER PA */
-                    next[2] == 0xb3 || /* U+3073 HIRAGANA LETTER BI */
-                    next[2] == 0xb4 || /* U+3074 HIRAGANA LETTER PI */
-                    next[2] == 0xb6 || /* U+3076 HIRAGANA LETTER BU */
-                    next[2] == 0xb7 || /* U+3077 HIRAGANA LETTER PU */
-                    next[2] == 0xb9 || /* U+3079 HIRAGANA LETTER BE */
-                    next[2] == 0xba || /* U+307A HIRAGANA LETTER PE */
-                    /* U+3079 HIRAGANA LETTER BO ..
-                     * U+307F HIRAGANA LETTER MI */
-                    0xbc <= next[2])) ||
-                  (next[1] == 0x82 &&
-                  /* U+3080 HIRAGANA LETTER MU ..
-                   * U+3082 HIRAGANA LETTER MO */
-                   (0x80 <= next[2] && next[2] <= 0x82)))) {
-        next_pbm = GRN_TRUE;
-      } else if (next[0] == 0xe3 &&
-                 next[1] == 0x83 &&
-                 (next[2] == 0x90 || /* U+30D0 KATAKANA LETTER BA */
-                   next[2] == 0x91 || /* U+30D1 KATAKANA LETTER PA */
-                   next[2] == 0x93 || /* U+30D3 KATAKANA LETTER BI */
-                   next[2] == 0x94 || /* U+30D4 KATAKANA LETTER PI */
-                   next[2] == 0x96 || /* U+30D6 KATAKANA LETTER BU */
-                   next[2] == 0x97 || /* U+30D7 KATAKANA LETTER PU */
-                   next[2] == 0x99 || /* U+30D9 KATAKANA LETTER BE */
-                   next[2] == 0x9a || /* U+30DA KATAKANA LETTER PE */
-                  /* U+30DC KATAKANA LETTER BO ..
-                   * U+30E2 KATAKANA LETTER MO */
-                  (0x9c <= next[2] && next[2] <= 0xa2))) {
-        next_pbm = GRN_TRUE;
-      } else if (next[0] == 0xe3 &&
                  next[1] == 0x81 &&
                  (next[2] == 0x82 || /* U+3042 HIRAGANA LETTER A */
                   next[2] == 0x84 || /* U+3044 HIRAGANA LETTER I */
@@ -200,8 +224,16 @@ grn_romaji_convert_hepburn(grn_ctx *ctx,
             }
           } else if (next[2] == 0x93) {
             /* U+3093 HIRAGANA LETTER N */
-            /* TODO: Maybe 'm' */
-            next_consonant = 'n';
+            const unsigned char *next_next = next + next_char_length;
+            size_t next_next_char_length = grn_charlen_(ctx,
+                                                        next_next,
+                                                        end,
+                                                        GRN_ENC_UTF8);
+            if (grn_romaji_hepburn_is_pbm(next_next, next_next_char_length)) {
+              next_consonant = 'm';
+            } else {
+              next_consonant = 'n';
+            }
           } else if (next[2] == 0x94) {
             /* U+3094 HIRAGANA LETTER VU */
             next_consonant = 'v';
@@ -281,8 +313,16 @@ grn_romaji_convert_hepburn(grn_ctx *ctx,
             }
           } else if (next[2] == 0xb3) {
             /* U+30F3 KATAKANA LETTER N */
-            /* TODO: Maybe 'm' */
-            next_consonant = 'n';
+            const unsigned char *next_next = next + next_char_length;
+            size_t next_next_char_length = grn_charlen_(ctx,
+                                                        next_next,
+                                                        end,
+                                                        GRN_ENC_UTF8);
+            if (grn_romaji_hepburn_is_pbm(next_next, next_next_char_length)) {
+              next_consonant = 'm';
+            } else {
+              next_consonant = 'n';
+            }
           } else if (next[2] == 0xb4) {
             /* U+30F4 KATAKANA LETTER VU */
             next_consonant = 'v';

  Added: test/command/suite/normalizers/nfkc100/unify_to_romaji/hiragana/xtsu.expected (+29 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/normalizers/nfkc100/unify_to_romaji/hiragana/xtsu.expected    2018-11-07 16:21:37 +0900 (aeff6876a)
@@ -0,0 +1,29 @@
+normalize   'NormalizerNFKC100("unify_to_romaji", true,                      "report_source_offset", true)'   "っんぱ"   WITH_CHECKS|WITH_TYPES
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  {
+    "normalized": "mmpa",
+    "types": [
+      "alpha",
+      "alpha",
+      "alpha",
+      "alpha"
+    ],
+    "checks": [
+      3,
+      3,
+      3,
+      -1
+    ],
+    "offsets": [
+      0,
+      3,
+      6,
+      6
+    ]
+  }
+]

  Added: test/command/suite/normalizers/nfkc100/unify_to_romaji/hiragana/xtsu.test (+5 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/normalizers/nfkc100/unify_to_romaji/hiragana/xtsu.test    2018-11-07 16:21:37 +0900 (653329fff)
@@ -0,0 +1,5 @@
+normalize \
+  'NormalizerNFKC100("unify_to_romaji", true, \
+                     "report_source_offset", true)' \
+  "っんぱ" \
+  WITH_CHECKS|WITH_TYPES

  Added: test/command/suite/normalizers/nfkc100/unify_to_romaji/katakana/xtsu.expected (+29 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/normalizers/nfkc100/unify_to_romaji/katakana/xtsu.expected    2018-11-07 16:21:37 +0900 (7a8d7e4f2)
@@ -0,0 +1,29 @@
+normalize   'NormalizerNFKC100("unify_to_romaji", true,                      "report_source_offset", true)'   "ッンパ"   WITH_CHECKS|WITH_TYPES
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  {
+    "normalized": "mmpa",
+    "types": [
+      "alpha",
+      "alpha",
+      "alpha",
+      "alpha"
+    ],
+    "checks": [
+      3,
+      3,
+      3,
+      -1
+    ],
+    "offsets": [
+      0,
+      3,
+      6,
+      6
+    ]
+  }
+]

  Added: test/command/suite/normalizers/nfkc100/unify_to_romaji/katakana/xtsu.test (+5 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/normalizers/nfkc100/unify_to_romaji/katakana/xtsu.test    2018-11-07 16:21:37 +0900 (f25888e50)
@@ -0,0 +1,5 @@
+normalize \
+  'NormalizerNFKC100("unify_to_romaji", true, \
+                     "report_source_offset", true)' \
+  "ッンパ" \
+  WITH_CHECKS|WITH_TYPES
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20181107/b3a6aa8b/attachment-0001.html>


More information about the Groonga-commit mailing list
Back to archive index