Kouhei Sutou
null+****@clear*****
Mon May 14 16:42:51 JST 2018
Kouhei Sutou 2018-05-14 16:42:51 +0900 (Mon, 14 May 2018) New Revision: ede396a503a1563a1adb09bf13e0eded37fd2e1b https://github.com/groonga/groonga/commit/ede396a503a1563a1adb09bf13e0eded37fd2e1b Message: NormalizerNFKC100: Add unify_katakana_v_sounds option Added files: test/command/suite/normalizers/nfkc100/unify_katakana_v_sounds.expected test/command/suite/normalizers/nfkc100/unify_katakana_v_sounds.test Modified files: lib/normalizer.c Modified: lib/normalizer.c (+80 -12) =================================================================== --- lib/normalizer.c 2018-05-14 14:36:37 +0900 (0e131300e) +++ lib/normalizer.c 2018-05-14 16:42:51 +0900 (625c3608a) @@ -627,6 +627,7 @@ typedef struct { grn_bool unify_prolonged_sound_mark; grn_bool unify_hyphen_and_prolonged_sound_mark; grn_bool unify_middle_dot; + grn_bool unify_katakana_v_sounds; } grn_utf8_normalize_options; static void @@ -645,6 +646,7 @@ utf8_normalize_options_init(grn_utf8_normalize_options *options, options->unify_prolonged_sound_mark = GRN_FALSE; options->unify_hyphen_and_prolonged_sound_mark = GRN_FALSE; options->unify_middle_dot = GRN_FALSE; + options->unify_katakana_v_sounds = GRN_FALSE; } grn_inline static const unsigned char * @@ -983,6 +985,55 @@ utf8_normalize_is_middle_dot_family(const unsigned char *utf8_char, return GRN_FALSE; } +grn_inline static grn_bool +utf8_normalize_unify_katakana_v_sounds(const unsigned char *utf8_char, + size_t length, + unsigned char *previous_normalized, + unsigned char *normalized) +{ + if (!previous_normalized) { + return GRN_FALSE; + } + + { + size_t previous_length = normalized - previous_normalized; + + /* U+30F4 KATAKANA LETTER VU */ + if (previous_length == 3 && + previous_normalized[0] == 0xe3 && + previous_normalized[1] == 0x83 && + previous_normalized[2] == 0xb4) { + if (length == 3 && utf8_char[0] == 0xe3 && utf8_char[1] == 0x82) { + if (utf8_char[2] == 0xa1) { /* U+30A1 KATAKANA LETTER SMALL A */ + /* U+30D0 KATAKANA LETTER BA */ + 
previous_normalized[2] = 0x90; + return GRN_TRUE; + } else if (utf8_char[2] == 0xa3) { /* U+30A3 KATAKANA LETTER SMALL I */ + /* U+30D3 KATAKANA LETTER BI */ + previous_normalized[2] = 0x93; + return GRN_TRUE; + } else if (utf8_char[2] == 0xa5) { /* U+30A5 KATAKANA LETTER SMALL U */ + /* U+30D6 KATAKANA LETTER BU */ + previous_normalized[2] = 0x96; + return GRN_TRUE; + } else if (utf8_char[2] == 0xa7) { /* U+30A7 KATAKANA LETTER SMALL E */ + /* U+30D9 KATAKANA LETTER BE */ + previous_normalized[2] = 0x99; + return GRN_TRUE; + } else if (utf8_char[2] == 0xa9) { /* U+30A9 KATAKANA LETTER SMALL O */ + /* U+30DC KATAKANA LETTER BO */ + previous_normalized[2] = 0x9c; + return GRN_TRUE; + } + } + /* U+30D6 KATAKANA LETTER BU */ + previous_normalized[2] = 0x96; + } + } + + return GRN_FALSE; +} + grn_inline static grn_obj * utf8_normalize(grn_ctx *ctx, grn_string *nstr, @@ -1196,29 +1247,40 @@ utf8_normalize(grn_ctx *ctx, } } + if (options->unify_katakana_v_sounds) { + if (utf8_normalize_unify_katakana_v_sounds(p, lp, d_, d)) { + lp = 0; + } + } + grn_memcpy(d, p, lp); p = p_original; } d_ = d; - d += lp; - length++; - if (cp) { *cp++ = char_type; } - if (ch) { - size_t i; - if (s_ == s + ls) { - *ch++ = -1; - } else { - *ch++ = (int16_t)(s + ls - s_); - s__ = s_; - s_ = s + ls; + if (lp > 0) { + d += lp; + length++; + if (cp) { *cp++ = char_type; } + if (ch) { + size_t i; + if (s_ == s + ls) { + *ch++ = -1; + } else { + *ch++ = (int16_t)(s + ls - s_); + s__ = s_; + s_ = s + ls; + } + for (i = lp; i > 1; i--) { *ch++ = 0; } } - for (i = lp; i > 1; i--) { *ch++ = 0; } } lp = lp_original; } } } if (cp) { *cp = GRN_CHAR_NULL; } + if (options->unify_katakana_v_sounds) { + utf8_normalize_unify_katakana_v_sounds(NULL, 0, d_, d); + } *d = '\0'; nstr->n_characters = length; nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized); @@ -1709,6 +1771,12 @@ nfkc100_open_options(grn_ctx *ctx, raw_options, i, options->unify_middle_dot); + } else if
(GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "unify_katakana_v_sounds")) { + options->unify_katakana_v_sounds = + grn_vector_get_element_bool(ctx, + raw_options, + i, + options->unify_katakana_v_sounds); } } GRN_OPTION_VALUES_EACH_END(); Added: test/command/suite/normalizers/nfkc100/unify_katakana_v_sounds.expected (+22 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/normalizers/nfkc100/unify_katakana_v_sounds.expected 2018-05-14 16:42:51 +0900 (adc07d828) @@ -0,0 +1,22 @@ +normalize 'NormalizerNFKC100("unify_katakana_v_sounds", true)' "ヴァヴィヴヴェヴォヴ" WITH_TYPES +[ + [ + 0, + 0.0, + 0.0 + ], + { + "normalized": "バビブベボブ", + "types": [ + "katakana", + "katakana", + "katakana", + "katakana", + "katakana", + "katakana" + ], + "checks": [ + + ] + } +] Added: test/command/suite/normalizers/nfkc100/unify_katakana_v_sounds.test (+4 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/normalizers/nfkc100/unify_katakana_v_sounds.test 2018-05-14 16:42:51 +0900 (8198e2932) @@ -0,0 +1,4 @@ +normalize \ + 'NormalizerNFKC100("unify_katakana_v_sounds", true)' \ + "ヴァヴィヴヴェヴォヴ" \ + WITH_TYPES -------------- next part -------------- HTMLの添付ファイルを保管しました... URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180514/102b8c01/attachment-0001.htm