Kouhei Sutou
null+****@clear*****
Wed Apr 18 10:52:30 JST 2018
Kouhei Sutou 2018-04-18 10:52:30 +0900 (Wed, 18 Apr 2018) New Revision: 0d17b2e296e764bb43661a6fdf6a446667ebbe7a https://github.com/groonga/groonga/commit/0d17b2e296e764bb43661a6fdf6a446667ebbe7a Message: NormalizerNFKC100: add unify_hyphen_and_prolonged_sound_mark option Added files: test/command/suite/normalizers/nfkc100/unify_hyphen_and_prolonged_sound_mark.expected test/command/suite/normalizers/nfkc100/unify_hyphen_and_prolonged_sound_mark.test Modified files: lib/normalizer.c Modified: lib/normalizer.c (+29 -1) =================================================================== --- lib/normalizer.c 2018-04-18 10:39:39 +0900 (fd5e3842a) +++ lib/normalizer.c 2018-04-18 10:52:30 +0900 (6ea189872) @@ -625,6 +625,7 @@ typedef struct { grn_bool unify_kana_voiced_sound_mark; grn_bool unify_hyphen; grn_bool unify_prolonged_sound_mark; + grn_bool unify_hyphen_and_prolonged_sound_mark; } grn_utf8_normalize_options; static void @@ -641,6 +642,7 @@ utf8_normalize_options_init(grn_utf8_normalize_options *options, options->unify_kana_voiced_sound_mark = GRN_FALSE; options->unify_hyphen = GRN_FALSE; options->unify_prolonged_sound_mark = GRN_FALSE; + options->unify_hyphen_and_prolonged_sound_mark = GRN_FALSE; } grn_inline static const unsigned char * @@ -854,7 +856,12 @@ grn_inline static const grn_bool utf8_normalize_is_hyphen_famity(const unsigned char *utf8_char, size_t length) { - if (length == 2) { + if (length == 1) { + if (utf8_char[0] == '-') { + /* U+002D HYPHEN-MINUS */ + return GRN_TRUE; + } + } else if (length == 2) { switch (utf8_char[0]) { case 0xcb : if (utf8_char[1] == 0x97) { @@ -914,6 +921,11 @@ utf8_normalize_is_prolonged_sound_mark_famity(const unsigned char *utf8_char, * U+2501 BOX DRAWINGS HEAVY HORIZONTAL */ return GRN_TRUE; } + } else if (utf8_char[0] == 0xe3) { + if (utf8_char[1] == 0x83 && utf8_char[2] == 0xbc) { + /* U+30FC KATAKANA-HIRAGANA PROLONGED SOUND MARK */ + return GRN_TRUE; + } } else if (utf8_char[0] == 0xef) { if (utf8_char[1] == 0xbd 
&& utf8_char[2] == 0xb0) { /* U+FF70 HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */ @@ -1119,6 +1131,15 @@ utf8_normalize(grn_ctx *ctx, } } + if (options->unify_hyphen_and_prolonged_sound_mark) { + if (utf8_normalize_is_hyphen_famity(p, lp) || + utf8_normalize_is_prolonged_sound_mark_famity(p, lp)) { + p = unified_hyphen; + lp = sizeof(unified_hyphen); + char_type = GRN_CHAR_SYMBOL; + } + } + grn_memcpy(d, p, lp); p = p_original; } @@ -1619,6 +1640,13 @@ nfkc100_open_options(grn_ctx *ctx, raw_options, i, options->unify_prolonged_sound_mark); + } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, + "unify_hyphen_and_prolonged_sound_mark")) { + options->unify_hyphen_and_prolonged_sound_mark = + grn_vector_get_element_bool(ctx, + raw_options, + i, + options->unify_hyphen_and_prolonged_sound_mark); } } GRN_OPTION_VALUES_EACH_END(); Added: test/command/suite/normalizers/nfkc100/unify_hyphen_and_prolonged_sound_mark.expected (+37 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/normalizers/nfkc100/unify_hyphen_and_prolonged_sound_mark.expected 2018-04-18 10:52:30 +0900 (1f8c926e2) @@ -0,0 +1,37 @@ +normalize 'NormalizerNFKC100("unify_hyphen_and_prolonged_sound_mark", true)' "-˗֊‐‑‒–⁃⁻₋− ﹣- ー—―─━ー" WITH_TYPES +[ + [ + 0, + 0.0, + 0.0 + ], + { + "normalized": "----------- -- ------", + "types": [ + "symbol", + "symbol", + "symbol", + "symbol", + "symbol", + "symbol", + "symbol", + "symbol", + "symbol", + "symbol", + "symbol", + "others", + "symbol", + "symbol", + "others", + "symbol", + "symbol", + "symbol", + "symbol", + "symbol", + "symbol" + ], + "checks": [ + + ] + } +] Added: test/command/suite/normalizers/nfkc100/unify_hyphen_and_prolonged_sound_mark.test (+4 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/normalizers/nfkc100/unify_hyphen_and_prolonged_sound_mark.test 2018-04-18 10:52:30 +0900 (fc9898b11) @@ -0,0 +1,4 @@ +normalize 
\ + 'NormalizerNFKC100("unify_hyphen_and_prolonged_sound_mark", true)' \ + "-˗֊‐‑‒–⁃⁻₋− ﹣- ー—―─━ー" \ + WITH_TYPES -------------- next part -------------- HTMLの添付ファイルを保管しました... URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180418/0b315666/attachment-0001.htm