Kouhei Sutou
null+****@clear*****
Wed Apr 11 17:40:22 JST 2018
Kouhei Sutou 2018-04-11 17:40:22 +0900 (Wed, 11 Apr 2018) New Revision: a9cf34f9e31a918c3009ab821f05c670303feb4b https://github.com/groonga/groonga/commit/a9cf34f9e31a918c3009ab821f05c670303feb4b Message: NormalizerNFKC100: add "unify_kana_case" option Added files: test/command/suite/normalizers/nfkc100/unify_kana_case_hiragana.expected test/command/suite/normalizers/nfkc100/unify_kana_case_hiragana.test test/command/suite/normalizers/nfkc100/unify_kana_case_katakana.expected test/command/suite/normalizers/nfkc100/unify_kana_case_katakana.test Modified files: lib/normalizer.c Modified: lib/normalizer.c (+150 -19) =================================================================== --- lib/normalizer.c 2018-04-11 16:41:39 +0900 (cf2db157f) +++ lib/normalizer.c 2018-04-11 17:40:22 +0900 (11780f52b) @@ -621,6 +621,7 @@ typedef struct { grn_nfkc_decompose_func decompose_func; grn_nfkc_compose_func compose_func; grn_bool unify_kana; + grn_bool unify_kana_case; } grn_utf8_normalize_options; static void @@ -633,6 +634,118 @@ utf8_normalize_options_init(grn_utf8_normalize_options *options, options->decompose_func = decompose_func; options->compose_func = compose_func; options->unify_kana = GRN_FALSE; + options->unify_kana_case = GRN_FALSE; +} + +grn_inline static const unsigned char * +utf8_normalize_unify_kana(const unsigned char *utf8_char, + unsigned char *unified_kana) +{ + if (utf8_char[0] == 0xe3 && + /* U+30A1 KATAKANA LETTER SMALL A .. + * U+30F6 KATAKANA LETTER SMALL KE + * + * U+30FD KATAKANA ITERATION MARK .. 
+ * U+30FE KATAKANA VOICED ITERATION MARK */ + ((utf8_char[1] == 0x82 && 0xa1 <= utf8_char[2]) || + (utf8_char[1] == 0x83 && utf8_char[2] <= 0xb6) || + (utf8_char[1] == 0x83 && (0xbd <= utf8_char[2] && + utf8_char[2] <= 0xbe)))) { + unified_kana[0] = utf8_char[0]; + unified_kana[1] = utf8_char[1] - 1; + unified_kana[2] = utf8_char[2] ^ 0x20; + return unified_kana; + } + + return utf8_char; +} + +grn_inline static const unsigned char * +utf8_normalize_unify_hiragana_case(const unsigned char *utf8_char, + unsigned char *unified_kana_case) +{ + if (utf8_char[0] == 0xe3) { + if ((utf8_char[1] == 0x81 && (0x81 <= utf8_char[2] && + utf8_char[2] <= 0x89)) || + (utf8_char[1] == 0x82 && (0x83 <= utf8_char[2] && + utf8_char[2] <= 0x87))) { + /* U+3041 HIRAGANA LETTER SMALL A .. + * U+3049 HIRAGANA LETTER SMALL O + * + * U+3083 HIRAGANA LETTER SMALL YA .. + * U+3087 HIRAGANA LETTER SMALL YO */ + if (utf8_char[2] & 0x1) { + unified_kana_case[0] = utf8_char[0]; + unified_kana_case[1] = utf8_char[1]; + unified_kana_case[2] = utf8_char[2] + 1; + return unified_kana_case; + } + } else if (utf8_char[1] == 0x82 && utf8_char[2] == 0x8e) { + /* U+308E HIRAGANA LETTER SMALL WA */ + unified_kana_case[0] = utf8_char[0]; + unified_kana_case[1] = utf8_char[1]; + unified_kana_case[2] = utf8_char[2] + 1; + return unified_kana_case; + } else if (utf8_char[1] == 0x82 && utf8_char[2] == 0x95) { + /* U+3095 HIRAGANA LETTER SMALL KA */ + unified_kana_case[0] = utf8_char[0]; + unified_kana_case[1] = 0x81; + unified_kana_case[2] = 0x8b; + return unified_kana_case; + } else if (utf8_char[1] == 0x82 && utf8_char[2] == 0x96) { + /* U+3096 HIRAGANA LETTER SMALL KE */ + unified_kana_case[0] = utf8_char[0]; + unified_kana_case[1] = 0x81; + unified_kana_case[2] = 0x91; + return unified_kana_case; + } + } + + return utf8_char; +} + +grn_inline static const unsigned char * +utf8_normalize_unify_katakana_case(const unsigned char *utf8_char, + unsigned char *unified_kana_case) +{ + if (utf8_char[0] == 0xe3) { + if 
((utf8_char[1] == 0x82 && (0xa1 <= utf8_char[2] && + utf8_char[2] <= 0xa9)) || + (utf8_char[1] == 0x83 && (0xa3 <= utf8_char[2] && + utf8_char[2] <= 0xa7))) { + /* U+30A1 KATAKANA LETTER SMALL A .. + * U+30A9 KATAKANA LETTER SMALL O + * + * U+30E3 KATAKANA LETTER SMALL YA .. + * U+30E7 KATAKANA LETTER SMALL YO */ + if (utf8_char[2] & 0x1) { + unified_kana_case[0] = utf8_char[0]; + unified_kana_case[1] = utf8_char[1]; + unified_kana_case[2] = utf8_char[2] + 1; + return unified_kana_case; + } + } else if (utf8_char[1] == 0x83 && utf8_char[2] == 0xae) { + /* U+30EE KATAKANA LETTER SMALL WA */ + unified_kana_case[0] = utf8_char[0]; + unified_kana_case[1] = utf8_char[1]; + unified_kana_case[2] = utf8_char[2] + 1; + return unified_kana_case; + } else if (utf8_char[1] == 0x83 && utf8_char[2] == 0xb5) { + /* U+30F5 KATAKANA LETTER SMALL KA */ + unified_kana_case[0] = utf8_char[0]; + unified_kana_case[1] = 0x82; + unified_kana_case[2] = 0xab; + return unified_kana_case; + } else if (utf8_char[1] == 0x83 && utf8_char[2] == 0xb6) { + /* U+30F6 KATAKANA LETTER SMALL KE */ + unified_kana_case[0] = utf8_char[0]; + unified_kana_case[1] = 0x82; + unified_kana_case[2] = 0xb1; + return unified_kana_case; + } + } + + return utf8_char; } grn_inline static grn_obj * @@ -713,6 +826,7 @@ utf8_normalize(grn_ctx *ctx, if (cp > nstr->ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; } } else { grn_char_type char_type; + char_type = options->char_type_func(p); if (de <= d + lp) { unsigned char *normalized; @@ -755,26 +869,37 @@ utf8_normalize(grn_ctx *ctx, nstr->ctypes = ctypes; } } - char_type = options->char_type_func(p); - if (options->unify_kana && char_type == GRN_CHAR_KATAKANA) { - if (lp == 3 && - p[0] == 0xe3 && - /* U+30A1 KATAKANA LETTER SMALL A .. - * U+30F6 KATAKANA LETTER SMALL KE - * - * U+30FD KATAKANA ITERATION MARK .. 
- * U+30F6 KATAKANA LETTER SMALL KE */ - ((p[1] == 0x82 && 0xa1 <= p[2]) || - (p[1] == 0x83 && p[2] <= 0xb6) || - (p[1] == 0x83 && (0xbd <= p[2] && p[2] <= 0xbe)))) { - d[0] = p[0]; - d[1] = p[1] - 1; - d[2] = p[2] ^ 0x20; - char_type = GRN_CHAR_HIRAGANA; - } else { - grn_memcpy(d, p, lp); + + { + unsigned char unified_kana[3]; + unsigned char unified_kana_case[3]; + + if (options->unify_kana && + char_type == GRN_CHAR_KATAKANA && + lp == 3) { + p = utf8_normalize_unify_kana(p, unified_kana); + if (p == unified_kana) { + char_type = GRN_CHAR_HIRAGANA; + } } - } else { + + if (options->unify_kana_case) { + switch (char_type) { + case GRN_CHAR_HIRAGANA : + if (lp == 3) { + p = utf8_normalize_unify_hiragana_case(p, unified_kana_case); + } + break; + case GRN_CHAR_KATAKANA : + if (lp == 3) { + p = utf8_normalize_unify_katakana_case(p, unified_kana_case); + } + break; + default : + break; + } + } + grn_memcpy(d, p, lp); } d_ = d; @@ -1248,6 +1373,12 @@ nfkc100_open_options(grn_ctx *ctx, raw_options, i, options->unify_kana); + } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "unify_kana_case")) { + options->unify_kana_case = + grn_vector_get_element_bool(ctx, + raw_options, + i, + options->unify_kana_case); } } GRN_OPTION_VALUES_EACH_END(); Added: test/command/suite/normalizers/nfkc100/unify_kana_case_hiragana.expected (+38 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/normalizers/nfkc100/unify_kana_case_hiragana.expected 2018-04-11 17:40:22 +0900 (d646bb8d8) @@ -0,0 +1,38 @@ +normalize 'NormalizerNFKC100("unify_kana_case", true)' "ぁあぃいぅうぇえぉおゃやゅゆょよゎわゕかゖけ" WITH_TYPES +[ + [ + 0, + 0.0, + 0.0 + ], + { + "normalized": "ああいいううええおおややゆゆよよわわかかけけ", + "types": [ + "hiragana", + "hiragana", + "hiragana", + "hiragana", + "hiragana", + "hiragana", + "hiragana", + "hiragana", + "hiragana", + "hiragana", + "hiragana", + "hiragana", + "hiragana", + "hiragana", + "hiragana", + "hiragana", + "hiragana", + "hiragana", 
+ "hiragana", + "hiragana", + "hiragana", + "hiragana" + ], + "checks": [ + + ] + } +] Added: test/command/suite/normalizers/nfkc100/unify_kana_case_hiragana.test (+4 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/normalizers/nfkc100/unify_kana_case_hiragana.test 2018-04-11 17:40:22 +0900 (93006d064) @@ -0,0 +1,4 @@ +normalize \ + 'NormalizerNFKC100("unify_kana_case", true)' \ + "ぁあぃいぅうぇえぉおゃやゅゆょよゎわゕかゖけ" \ + WITH_TYPES Added: test/command/suite/normalizers/nfkc100/unify_kana_case_katakana.expected (+38 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/normalizers/nfkc100/unify_kana_case_katakana.expected 2018-04-11 17:40:22 +0900 (9ac121551) @@ -0,0 +1,38 @@ +normalize 'NormalizerNFKC100("unify_kana_case", true)' "ァアィイゥウェエォオャヤュユョヨヮワヵカヶケ" WITH_TYPES +[ + [ + 0, + 0.0, + 0.0 + ], + { + "normalized": "アアイイウウエエオオヤヤユユヨヨワワカカケケ", + "types": [ + "katakana", + "katakana", + "katakana", + "katakana", + "katakana", + "katakana", + "katakana", + "katakana", + "katakana", + "katakana", + "katakana", + "katakana", + "katakana", + "katakana", + "katakana", + "katakana", + "katakana", + "katakana", + "katakana", + "katakana", + "katakana", + "katakana" + ], + "checks": [ + + ] + } +] Added: test/command/suite/normalizers/nfkc100/unify_kana_case_katakana.test (+4 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/normalizers/nfkc100/unify_kana_case_katakana.test 2018-04-11 17:40:22 +0900 (683a3ddca) @@ -0,0 +1,4 @@ +normalize \ + 'NormalizerNFKC100("unify_kana_case", true)' \ + "ァアィイゥウェエォオャヤュユョヨヮワヵカヶケ" \ + WITH_TYPES -------------- next part -------------- HTMLの添付ファイルを保管しました... URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180411/7152521e/attachment-0001.htm