Kouhei Sutou
null+****@clear*****
Wed Apr 18 10:39:39 JST 2018
Kouhei Sutou 2018-04-18 10:39:39 +0900 (Wed, 18 Apr 2018) New Revision: fa48d25ff64e72303f263c75e317391a1a3e9e18 https://github.com/groonga/groonga/commit/fa48d25ff64e72303f263c75e317391a1a3e9e18 Message: NormalizerNFKC100: add unify_prolonged_sound_mark option NEologd unifies "U+FE63 SMALL HYPHEN-MINUS" and "U+FF0D FULLWIDTH HYPHEN-MINUS" to "U+30FC KATAKANA-HIRAGANA PROLONGED SOUND MARK" but we unify them to "U+002D HYPHEN-MINUS". Added files: test/command/suite/normalizers/nfkc100/unify_prolonged_sound_mark.expected test/command/suite/normalizers/nfkc100/unify_prolonged_sound_mark.test Modified files: lib/normalizer.c Modified: lib/normalizer.c (+49 -1) =================================================================== --- lib/normalizer.c 2018-04-18 10:10:47 +0900 (af75019a2) +++ lib/normalizer.c 2018-04-18 10:39:39 +0900 (fd5e3842a) @@ -624,6 +624,7 @@ typedef struct { grn_bool unify_kana_case; grn_bool unify_kana_voiced_sound_mark; grn_bool unify_hyphen; + grn_bool unify_prolonged_sound_mark; } grn_utf8_normalize_options; static void @@ -639,6 +640,7 @@ utf8_normalize_options_init(grn_utf8_normalize_options *options, options->unify_kana_case = GRN_FALSE; options->unify_kana_voiced_sound_mark = GRN_FALSE; options->unify_hyphen = GRN_FALSE; + options->unify_prolonged_sound_mark = GRN_FALSE; } grn_inline static const unsigned char * @@ -895,6 +897,34 @@ utf8_normalize_is_hyphen_famity(const unsigned char *utf8_char, return GRN_FALSE; } +grn_inline static const grn_bool +utf8_normalize_is_prolonged_sound_mark_famity(const unsigned char *utf8_char, + size_t length) +{ + if (length == 3) { + if (utf8_char[0] == 0xe2) { + if (utf8_char[1] == 0x80 && + (0x94 <= utf8_char[2] && utf8_char[2] <= 0x95)) { + /* U+2014 EM DASH .. + * U+2015 HORIZONTAL BAR */ + return GRN_TRUE; + } else if (utf8_char[1] == 0x94 && + (0x80 <= utf8_char[2] && utf8_char[2] <= 0x81)) { + /* U+2500 BOX DRAWINGS LIGHT HORIZONTAL .. 
+ * U+2501 BOX DRAWINGS HEAVY HORIZONTAL */ + return GRN_TRUE; + } + } else if (utf8_char[0] == 0xef) { + if (utf8_char[1] == 0xbd && utf8_char[2] == 0xb0) { + /* U+FF70 HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */ + return GRN_TRUE; + } + } + } + + return GRN_FALSE; +} + grn_inline static grn_obj * utf8_normalize(grn_ctx *ctx, grn_string *nstr, @@ -1024,6 +1054,9 @@ utf8_normalize(grn_ctx *ctx, unsigned char unified_kana_case[3]; unsigned char unified_kana_voiced_sound_mark[3]; const unsigned char unified_hyphen[] = {'-'}; + /* U+30FC KATAKANA-HIRAGANA PROLONGED SOUND MARK */ + const unsigned char unified_prolonged_sound_mark[] = + {0xe3, 0x83, 0xbc}; if (options->unify_kana && char_type == GRN_CHAR_KATAKANA && @@ -1073,11 +1106,19 @@ utf8_normalize(grn_ctx *ctx, if (options->unify_hyphen) { if (utf8_normalize_is_hyphen_famity(p, lp)) { p = unified_hyphen; - lp = 1; + lp = sizeof(unified_hyphen); char_type = GRN_CHAR_SYMBOL; } } + if (options->unify_prolonged_sound_mark) { + if (utf8_normalize_is_prolonged_sound_mark_famity(p, lp)) { + p = unified_prolonged_sound_mark; + lp = sizeof(unified_prolonged_sound_mark); + char_type = GRN_CHAR_KATAKANA; + } + } + grn_memcpy(d, p, lp); p = p_original; } @@ -1571,6 +1612,13 @@ nfkc100_open_options(grn_ctx *ctx, raw_options, i, options->unify_hyphen); + } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, + "unify_prolonged_sound_mark")) { + options->unify_prolonged_sound_mark = + grn_vector_get_element_bool(ctx, + raw_options, + i, + options->unify_prolonged_sound_mark); } } GRN_OPTION_VALUES_EACH_END(); Added: test/command/suite/normalizers/nfkc100/unify_prolonged_sound_mark.expected (+22 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/normalizers/nfkc100/unify_prolonged_sound_mark.expected 2018-04-18 10:39:39 +0900 (f00f2de1b) @@ -0,0 +1,22 @@ +normalize 'NormalizerNFKC100("unify_prolonged_sound_mark", true)' "ー—―─━ー" WITH_TYPES +[ + [ + 0, + 0.0, + 
 0.0 + ], + { + "normalized": "ーーーーーー", + "types": [ + "katakana", + "katakana", + "katakana", + "katakana", + "katakana", + "katakana" + ], + "checks": [ + + ] + } +] Added: test/command/suite/normalizers/nfkc100/unify_prolonged_sound_mark.test (+4 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/normalizers/nfkc100/unify_prolonged_sound_mark.test 2018-04-18 10:39:39 +0900 (ae8270f44) @@ -0,0 +1,4 @@ +normalize \ + 'NormalizerNFKC100("unify_prolonged_sound_mark", true)' \ + "ー—―─━ー" \ + WITH_TYPES -------------- next part -------------- HTMLの添付ファイルを保管しました... URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180418/0c41099d/attachment-0001.htm