Kouhei Sutou 2018-11-07 16:21:37 +0900 (Wed, 07 Nov 2018) Revision: 75620f1e2a827d5c2fe9bd1543f41556d1df4bcb https://github.com/groonga/groonga/commit/75620f1e2a827d5c2fe9bd1543f41556d1df4bcb Message: NormalizeNFKC100: support xtsu + n + [pbm] case Added files: test/command/suite/normalizers/nfkc100/unify_to_romaji/hiragana/xtsu.expected test/command/suite/normalizers/nfkc100/unify_to_romaji/hiragana/xtsu.test test/command/suite/normalizers/nfkc100/unify_to_romaji/katakana/xtsu.expected test/command/suite/normalizers/nfkc100/unify_to_romaji/katakana/xtsu.test Modified files: lib/grn_romaji.h lib/normalizer.c lib/romaji.c Modified: lib/grn_romaji.h (+1 -1) =================================================================== --- lib/grn_romaji.h 2018-11-07 15:59:23 +0900 (f056db4db) +++ lib/grn_romaji.h 2018-11-07 16:21:37 +0900 (318afe336) @@ -25,7 +25,7 @@ extern "C" { #endif const unsigned char * -grn_romaji_convert_hepburn(grn_ctx *ctx, +grn_romaji_hepburn_convert(grn_ctx *ctx, const unsigned char *current, const unsigned char *end, size_t *n_used_bytes, Modified: lib/normalizer.c (+1 -1) =================================================================== --- lib/normalizer.c 2018-11-07 15:59:23 +0900 (c32ef6719) +++ lib/normalizer.c 2018-11-07 16:21:37 +0900 (ab5dc1c4b) @@ -1580,7 +1580,7 @@ grn_nfkc_normalize_unify(grn_ctx *ctx, grn_nfkc_normalize_unify_stateful(ctx, data, &unify, - grn_romaji_convert_hepburn, + grn_romaji_hepburn_convert, "[unify][romaji]"); if (ctx->rc != GRN_SUCCESS) { goto exit; Modified: lib/romaji.c (+77 -37) =================================================================== --- lib/romaji.c 2018-11-07 15:59:23 +0900 (5270bf725) +++ lib/romaji.c 2018-11-07 16:21:37 +0900 (bdbb63edf) @@ -19,8 +19,63 @@ #include "grn_romaji.h" #include "grn_str.h" +static grn_inline grn_bool +grn_romaji_hepburn_is_pbm(const unsigned char *utf8, + size_t length) +{ + if (length != 3) { + return GRN_FALSE; + } + + switch (utf8[0]) { + case 0xe3 : + switch 
(utf8[1]) { + case 0x81 : + switch (utf8[2]) { + case 0xb0 : /* U+3070 HIRAGANA LETTER BA */ + case 0xb1 : /* U+3071 HIRAGANA LETTER PA */ + case 0xb3 : /* U+3073 HIRAGANA LETTER BI */ + case 0xb4 : /* U+3074 HIRAGANA LETTER PI */ + case 0xb6 : /* U+3076 HIRAGANA LETTER BU */ + case 0xb7 : /* U+3077 HIRAGANA LETTER PU */ + case 0xb9 : /* U+3079 HIRAGANA LETTER BE */ + case 0xba : /* U+307A HIRAGANA LETTER PE */ + return GRN_TRUE; + default : + /* U+307C HIRAGANA LETTER BO .. + * U+307F HIRAGANA LETTER MI */ + return utf8[2] >= 0xbc; + } + case 0x82 : + /* U+3080 HIRAGANA LETTER MU .. + * U+3082 HIRAGANA LETTER MO */ + return (0x80 <= utf8[2] && utf8[2] <= 0x82); + case 0x83 : + switch (utf8[2]) { + case 0x90 : /* U+30D0 KATAKANA LETTER BA */ + case 0x91 : /* U+30D1 KATAKANA LETTER PA */ + case 0x93 : /* U+30D3 KATAKANA LETTER BI */ + case 0x94 : /* U+30D4 KATAKANA LETTER PI */ + case 0x96 : /* U+30D6 KATAKANA LETTER BU */ + case 0x97 : /* U+30D7 KATAKANA LETTER PU */ + case 0x99 : /* U+30D9 KATAKANA LETTER BE */ + case 0x9a : /* U+30DA KATAKANA LETTER PE */ + return GRN_TRUE; + default : + /* U+30DC KATAKANA LETTER BO .. 
+ * U+30E2 KATAKANA LETTER MO */ + return (0x9c <= utf8[2] && utf8[2] <= 0xa2); + } + default : + return GRN_FALSE; + } + default : + return GRN_FALSE; + } +} + const unsigned char * -grn_romaji_convert_hepburn(grn_ctx *ctx, +grn_romaji_hepburn_convert(grn_ctx *ctx, const unsigned char *current, const unsigned char *end, size_t *n_used_bytes, @@ -48,6 +103,7 @@ grn_romaji_convert_hepburn(grn_ctx *ctx, if (char_length == 3) { next = current + char_length; next_char_length = grn_charlen_(ctx, next, end, GRN_ENC_UTF8); + next_pbm = grn_romaji_hepburn_is_pbm(next, next_char_length); if (next_char_length == 3) { if (next[0] == 0xe3 && next[1] == 0x82 && @@ -64,38 +120,6 @@ grn_romaji_convert_hepburn(grn_ctx *ctx, next_small_y = GRN_TRUE; next_small_yayuyo = aiueo[(next[2] - 3) % 5]; } else if (next[0] == 0xe3 && - ((next[1] == 0x81 && - (next[2] == 0xb0 || /* U+3070 HIRAGANA LETTER BA */ - next[2] == 0xb1 || /* U+3071 HIRAGANA LETTER PA */ - next[2] == 0xb3 || /* U+3073 HIRAGANA LETTER BI */ - next[2] == 0xb4 || /* U+3074 HIRAGANA LETTER PI */ - next[2] == 0xb6 || /* U+3076 HIRAGANA LETTER BU */ - next[2] == 0xb7 || /* U+3077 HIRAGANA LETTER PU */ - next[2] == 0xb9 || /* U+3079 HIRAGANA LETTER BE */ - next[2] == 0xba || /* U+307A HIRAGANA LETTER PE */ - /* U+3079 HIRAGANA LETTER BO .. - * U+307F HIRAGANA LETTER MI */ - 0xbc <= next[2])) || - (next[1] == 0x82 && - /* U+3080 HIRAGANA LETTER MU .. 
- * U+3082 HIRAGANA LETTER MO */ - (0x80 <= next[2] && next[2] <= 0x82)))) { - next_pbm = GRN_TRUE; - } else if (next[0] == 0xe3 && - next[1] == 0x83 && - (next[2] == 0x90 || /* U+30D0 KATAKANA LETTER BA */ - next[2] == 0x91 || /* U+30D1 KATAKANA LETTER PA */ - next[2] == 0x93 || /* U+30D3 KATAKANA LETTER BI */ - next[2] == 0x94 || /* U+30D4 KATAKANA LETTER PI */ - next[2] == 0x96 || /* U+30D6 KATAKANA LETTER BU */ - next[2] == 0x97 || /* U+30D7 KATAKANA LETTER PU */ - next[2] == 0x99 || /* U+30D9 KATAKANA LETTER BE */ - next[2] == 0x9a || /* U+30DA KATAKANA LETTER PE */ - /* U+30DC KATAKANA LETTER BO .. - * U+30E2 KATAKANA LETTER MO */ - (0x9c <= next[2] && next[2] <= 0xa2))) { - next_pbm = GRN_TRUE; - } else if (next[0] == 0xe3 && next[1] == 0x81 && (next[2] == 0x82 || /* U+3042 HIRAGANA LETTER A */ next[2] == 0x84 || /* U+3044 HIRAGANA LETTER I */ @@ -200,8 +224,16 @@ grn_romaji_convert_hepburn(grn_ctx *ctx, } } else if (next[2] == 0x93) { /* U+3093 HIRAGANA LETTER N */ - /* TODO: Maybe 'm' */ - next_consonant = 'n'; + const unsigned char *next_next = next + next_char_length; + size_t next_next_char_length = grn_charlen_(ctx, + next_next, + end, + GRN_ENC_UTF8); + if (grn_romaji_hepburn_is_pbm(next_next, next_next_char_length)) { + next_consonant = 'm'; + } else { + next_consonant = 'n'; + } } else if (next[2] == 0x94) { /* U+3094 HIRAGANA LETTER VU */ next_consonant = 'v'; @@ -281,8 +313,16 @@ grn_romaji_convert_hepburn(grn_ctx *ctx, } } else if (next[2] == 0xb3) { /* U+30F3 KATAKANA LETTER N */ - /* TODO: Maybe 'm' */ - next_consonant = 'n'; + const unsigned char *next_next = next + next_char_length; + size_t next_next_char_length = grn_charlen_(ctx, + next_next, + end, + GRN_ENC_UTF8); + if (grn_romaji_hepburn_is_pbm(next_next, next_next_char_length)) { + next_consonant = 'm'; + } else { + next_consonant = 'n'; + } } else if (next[2] == 0xb4) { /* U+30F4 KATAKANA LETTER VU */ next_consonant = 'v'; Added: 
test/command/suite/normalizers/nfkc100/unify_to_romaji/hiragana/xtsu.expected (+29 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/normalizers/nfkc100/unify_to_romaji/hiragana/xtsu.expected 2018-11-07 16:21:37 +0900 (aeff6876a) @@ -0,0 +1,29 @@ +normalize 'NormalizerNFKC100("unify_to_romaji", true, "report_source_offset", true)' "っんぱ" WITH_CHECKS|WITH_TYPES +[ + [ + 0, + 0.0, + 0.0 + ], + { + "normalized": "mmpa", + "types": [ + "alpha", + "alpha", + "alpha", + "alpha" + ], + "checks": [ + 3, + 3, + 3, + -1 + ], + "offsets": [ + 0, + 3, + 6, + 6 + ] + } +] Added: test/command/suite/normalizers/nfkc100/unify_to_romaji/hiragana/xtsu.test (+5 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/normalizers/nfkc100/unify_to_romaji/hiragana/xtsu.test 2018-11-07 16:21:37 +0900 (653329fff) @@ -0,0 +1,5 @@ +normalize \ + 'NormalizerNFKC100("unify_to_romaji", true, \ + "report_source_offset", true)' \ + "っんぱ" \ + WITH_CHECKS|WITH_TYPES Added: test/command/suite/normalizers/nfkc100/unify_to_romaji/katakana/xtsu.expected (+29 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/normalizers/nfkc100/unify_to_romaji/katakana/xtsu.expected 2018-11-07 16:21:37 +0900 (7a8d7e4f2) @@ -0,0 +1,29 @@ +normalize 'NormalizerNFKC100("unify_to_romaji", true, "report_source_offset", true)' "ッンパ" WITH_CHECKS|WITH_TYPES +[ + [ + 0, + 0.0, + 0.0 + ], + { + "normalized": "mmpa", + "types": [ + "alpha", + "alpha", + "alpha", + "alpha" + ], + "checks": [ + 3, + 3, + 3, + -1 + ], + "offsets": [ + 0, + 3, + 6, + 6 + ] + } +] Added: test/command/suite/normalizers/nfkc100/unify_to_romaji/katakana/xtsu.test (+5 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/normalizers/nfkc100/unify_to_romaji/katakana/xtsu.test 2018-11-07 16:21:37 +0900 
(f25888e50) @@ -0,0 +1,5 @@ +normalize \ + 'NormalizerNFKC100("unify_to_romaji", true, \ + "report_source_offset", true)' \ + "ッンパ" \ + WITH_CHECKS|WITH_TYPES -------------- next part -------------- An HTML attachment was scrubbed... URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20181107/b3a6aa8b/attachment-0001.html>