Kouhei Sutou
null+****@clear*****
Mon May 20 23:19:49 JST 2013
Kouhei Sutou 2013-05-20 23:19:49 +0900 (Mon, 20 May 2013) New Revision: 2d20449318887de2321dffb6569b0ebc47ae7244 https://github.com/groonga/groonga-normalizer-mysql/commit/2d20449318887de2321dffb6569b0ebc47ae7244 Message: Support halfwidth katakana + voiced sound mark. This change is acceptable because NormalizerMySQLUnicodeCIExceptKanaCIKanaWithVoicedSoundMark is not a MySQL-compatible normalizer. MySQL-compatible normalizers should behave the same as MySQL, but this normalizer is a groonga-original normalizer, so we can change its behavior if there is a reasonable reason. https://gist.github.com/y-ken/eb49eaad879f47b27cec#comment-832107 Suggested by Y.Kentaro. Thanks!!! Added files: test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/halfwidth_katakana_with_voiced_sound_mark.expected test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/halfwidth_katakana_with_voiced_sound_mark.test Copied files: test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/halfwidth_katakana_ha_line_with_semi_voiced_sound_mark.expected (from test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/halfwidth_katakana_voiced_sound_mark.expected) test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/halfwidth_katakana_ha_line_with_semi_voiced_sound_mark.test (from test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/halfwidth_katakana_voiced_sound_mark.test) Modified files: normalizers/mysql.c Renamed files: test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/halfwidth_katakana_ha_line_with_voiced_sound_mark.expected (from test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/halfwidth_katakana_voiced_sound_mark.expected) test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/halfwidth_katakana_ha_line_with_voiced_sound_mark.test (from test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/halfwidth_katakana_voiced_sound_mark.test) Modified: normalizers/mysql.c (+217 -8) 
=================================================================== --- normalizers/mysql.c 2013-05-20 21:44:46 +0900 (60e65b7) +++ normalizers/mysql.c 2013-05-20 23:19:49 +0900 (f22609e) @@ -35,6 +35,15 @@ # define inline _inline #endif +typedef grn_bool (*normalizer_func)(grn_ctx *ctx, + const char *utf8, + int *character_length, + int rest_length, + uint32_t **normalize_table, + char *normalized, + unsigned int *normalized_length_in_bytes, + unsigned int *normalized_n_characters); + static inline unsigned int unichar_to_utf8(uint32_t unichar, char *output) { @@ -78,6 +87,57 @@ unichar_to_utf8(uint32_t unichar, char *output) return n_bytes; } +static inline uint32_t +utf8_to_unichar(const char *utf8, int byte_size) +{ + uint32_t unichar; + const unsigned char *bytes = (const unsigned char *)utf8; + + switch (byte_size) { + case 1 : + unichar = bytes[0] & 0x7f; + break; + case 2 : + unichar = ((bytes[0] & 0x1f) << 6) + (bytes[1] & 0x3f); + break; + case 3 : + unichar = + ((bytes[0] & 0x0f) << 12) + + ((bytes[1] & 0x3f) << 6) + + ((bytes[2] & 0x3f)); + break; + case 4 : + unichar = + ((bytes[0] & 0x07) << 18) + + ((bytes[1] & 0x3f) << 12) + + ((bytes[2] & 0x3f) << 6) + + ((bytes[3] & 0x3f)); + break; + case 5 : + unichar = + ((bytes[0] & 0x03) << 24) + + ((bytes[1] & 0x3f) << 18) + + ((bytes[2] & 0x3f) << 12) + + ((bytes[3] & 0x3f) << 6) + + ((bytes[4] & 0x3f)); + break; + case 6 : + unichar = + ((bytes[0] & 0x01) << 30) + + ((bytes[1] & 0x3f) << 24) + + ((bytes[2] & 0x3f) << 18) + + ((bytes[3] & 0x3f) << 12) + + ((bytes[4] & 0x3f) << 6) + + ((bytes[5] & 0x3f)); + break; + default : + unichar = 0; + break; + } + + return unichar; +} + static inline void decompose_character(const char *rest, int character_length, int *page, uint32_t *low_code) @@ -139,7 +199,8 @@ normalize_character(const char *utf8, int character_length, } static void -normalize(grn_ctx *ctx, grn_obj *string, uint32_t **normalize_table) +normalize(grn_ctx *ctx, grn_obj *string, uint32_t 
**normalize_table, + normalizer_func custom_normalizer) { const char *original, *rest; unsigned int original_length_in_bytes, rest_length; @@ -180,10 +241,23 @@ normalize(grn_ctx *ctx, grn_obj *string, uint32_t **normalize_table) current_type[-1] |= GRN_CHAR_BLANK; } } else { - normalize_character(rest, character_length, normalize_table, - normalized, - &normalized_length_in_bytes, - &normalized_n_characters); + grn_bool custom_normalized = GRN_FALSE; + if (custom_normalizer) { + custom_normalized = custom_normalizer(ctx, + rest, + &character_length, + rest_length - character_length, + normalize_table, + normalized, + &normalized_length_in_bytes, + &normalized_n_characters); + } + if (!custom_normalized) { + normalize_character(rest, character_length, normalize_table, + normalized, + &normalized_length_in_bytes, + &normalized_n_characters); + } if (current_type) { char *current_normalized; current_normalized = @@ -229,7 +303,7 @@ mysql_general_ci_next(GNUC_UNUSED grn_ctx *ctx, grn_encoding_to_string(encoding)); return NULL; } - normalize(ctx, string, general_ci_table); + normalize(ctx, string, general_ci_table, NULL); return NULL; } @@ -251,10 +325,144 @@ mysql_unicode_ci_next(GNUC_UNUSED grn_ctx *ctx, grn_encoding_to_string(encoding)); return NULL; } - normalize(ctx, string, unicode_ci_table); + normalize(ctx, string, unicode_ci_table, NULL); return NULL; } +#define HALFWIDTH_KATAKANA_LETTER_KA 0xff76 +#define HALFWIDTH_KATAKANA_LETTER_KI 0xff77 +#define HALFWIDTH_KATAKANA_LETTER_KU 0xff78 +#define HALFWIDTH_KATAKANA_LETTER_KE 0xff79 +#define HALFWIDTH_KATAKANA_LETTER_KO 0xff7a + +#define HALFWIDTH_KATAKANA_LETTER_SA 0xff7b +#define HALFWIDTH_KATAKANA_LETTER_SI 0xff7c +#define HALFWIDTH_KATAKANA_LETTER_SU 0xff7d +#define HALFWIDTH_KATAKANA_LETTER_SE 0xff7e +#define HALFWIDTH_KATAKANA_LETTER_SO 0xff7f + +#define HALFWIDTH_KATAKANA_LETTER_TA 0xff80 +#define HALFWIDTH_KATAKANA_LETTER_TI 0xff81 +#define HALFWIDTH_KATAKANA_LETTER_TU 0xff82 +#define 
HALFWIDTH_KATAKANA_LETTER_TE 0xff83 +#define HALFWIDTH_KATAKANA_LETTER_TO 0xff84 + +#define HALFWIDTH_KATAKANA_LETTER_HA 0xff8a +#define HALFWIDTH_KATAKANA_LETTER_HI 0xff8b +#define HALFWIDTH_KATAKANA_LETTER_HU 0xff8c +#define HALFWIDTH_KATAKANA_LETTER_HE 0xff8d +#define HALFWIDTH_KATAKANA_LETTER_HO 0xff8e + +#define HALFWIDTH_KATAKANA_VOICED_SOUND_MARK 0xff9e +#define HALFWIDTH_KATAKANA_SEMI_VOICED_SOUND_MARK 0xff9f + +#define HIRAGANA_LETTER_KA 0x304b +#define HIRAGANA_VOICED_SOUND_MARK_OFFSET 1 +#define HIRAGANA_VOICED_SOUND_MARK_GAP 2 + +#define HIRAGANA_LETTER_HA 0x306f +#define HIRAGANA_HA_LINE_BA_OFFSET 1 +#define HIRAGANA_HA_LINE_PA_OFFSET 2 +#define HIRAGANA_HA_LINE_GAP 3 + +static grn_bool +normalize_halfwidth_katakana_with_voiced_sound_mark( + grn_ctx *ctx, + const char *utf8, + int *character_length, + int rest_length, + GNUC_UNUSED uint32_t **normalize_table, + char *normalized, + unsigned int *normalized_length_in_bytes, + unsigned int *normalized_n_characters) +{ + grn_bool custom_normalized = GRN_FALSE; + grn_bool is_voiced_sound_markable_halfwidth_katakana = GRN_FALSE; + grn_bool is_semi_voiced_sound_markable_halfwidth_katakana = GRN_FALSE; + grn_bool is_ha_line = GRN_FALSE; + uint32_t unichar; + + if (*character_length != 3) { + return GRN_FALSE; + } + if (rest_length < 3) { + return GRN_FALSE; + } + + unichar = utf8_to_unichar(utf8, *character_length); + if (HALFWIDTH_KATAKANA_LETTER_KA <= unichar && + unichar <= HALFWIDTH_KATAKANA_LETTER_TO) { + is_voiced_sound_markable_halfwidth_katakana = GRN_TRUE; + } else if (HALFWIDTH_KATAKANA_LETTER_HA <= unichar && + unichar <= HALFWIDTH_KATAKANA_LETTER_HO) { + is_voiced_sound_markable_halfwidth_katakana = GRN_TRUE; + is_semi_voiced_sound_markable_halfwidth_katakana = GRN_TRUE; + is_ha_line = GRN_TRUE; + } + + if (!is_voiced_sound_markable_halfwidth_katakana && + !is_semi_voiced_sound_markable_halfwidth_katakana) { + return GRN_FALSE; + } + + { + int next_character_length; + uint32_t next_unichar; + 
next_character_length = grn_plugin_charlen(ctx, + utf8 + *character_length, + rest_length, + GRN_ENC_UTF8); + if (next_character_length != 3) { + return GRN_FALSE; + } + next_unichar = utf8_to_unichar(utf8 + *character_length, + next_character_length); + if (next_unichar == HALFWIDTH_KATAKANA_VOICED_SOUND_MARK) { + if (is_voiced_sound_markable_halfwidth_katakana) { + unsigned int n_bytes; + if (is_ha_line) { + n_bytes = unichar_to_utf8(HIRAGANA_LETTER_HA + + HIRAGANA_HA_LINE_BA_OFFSET + + ((unichar - HALFWIDTH_KATAKANA_LETTER_HA) * + HIRAGANA_HA_LINE_GAP), + normalized + *normalized_length_in_bytes); + } else { + int small_tu_offset = 0; + if (HALFWIDTH_KATAKANA_LETTER_TU <= unichar && + unichar <= HALFWIDTH_KATAKANA_LETTER_TO) { + small_tu_offset = 1; + } + n_bytes = unichar_to_utf8(HIRAGANA_LETTER_KA + + HIRAGANA_VOICED_SOUND_MARK_OFFSET + + small_tu_offset + + ((unichar - HALFWIDTH_KATAKANA_LETTER_KA) * + HIRAGANA_VOICED_SOUND_MARK_GAP), + normalized + *normalized_length_in_bytes); + } + *character_length += next_character_length; + *normalized_length_in_bytes += n_bytes; + (*normalized_n_characters)++; + custom_normalized = GRN_TRUE; + } + } else if (next_unichar == HALFWIDTH_KATAKANA_SEMI_VOICED_SOUND_MARK) { + if (is_semi_voiced_sound_markable_halfwidth_katakana) { + unsigned int n_bytes; + n_bytes = unichar_to_utf8(HIRAGANA_LETTER_HA + + HIRAGANA_HA_LINE_PA_OFFSET + + ((unichar - HALFWIDTH_KATAKANA_LETTER_HA) * + HIRAGANA_HA_LINE_GAP), + normalized + *normalized_length_in_bytes); + *character_length += next_character_length; + *normalized_length_in_bytes += n_bytes; + (*normalized_n_characters)++; + custom_normalized = GRN_TRUE; + } + } + } + + return custom_normalized; +} + static grn_obj * mysql_unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_next( GNUC_UNUSED grn_ctx *ctx, @@ -276,7 +484,8 @@ mysql_unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_next( return NULL; } normalize(ctx, string, - 
unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_table); + unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_table, + normalize_halfwidth_katakana_with_voiced_sound_mark); return NULL; } Copied: test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/halfwidth_katakana_ha_line_with_semi_voiced_sound_mark.expected (+2 -2) 52% =================================================================== --- test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/halfwidth_katakana_voiced_sound_mark.expected 2013-05-20 21:44:46 +0900 (3ff6818) +++ test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/halfwidth_katakana_ha_line_with_semi_voiced_sound_mark.expected 2013-05-20 23:19:49 +0900 (7cc7bbf) @@ -1,4 +1,4 @@ register normalizers/mysql [[0,0.0,0.0],true] -normalize NormalizerMySQLUnicodeCIExceptKanaCIKanaWithVoicedSoundMark "beforeゲafter" -[[0,0.0,0.0],{"normalized":"BEFOREけAFTER","types":[]}] +normalize NormalizerMySQLUnicodeCIExceptKanaCIKanaWithVoicedSoundMark "パピプペポ" +[[0,0.0,0.0],{"normalized":"ぱぴぷぺぽ","types":[]}] Copied: test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/halfwidth_katakana_ha_line_with_semi_voiced_sound_mark.test (+1 -1) 74% =================================================================== --- test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/halfwidth_katakana_voiced_sound_mark.test 2013-05-20 21:44:46 +0900 (c6ac08f) +++ test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/halfwidth_katakana_ha_line_with_semi_voiced_sound_mark.test 2013-05-20 23:19:49 +0900 (7615ce9) @@ -1,4 +1,4 @@ register normalizers/mysql normalize NormalizerMySQLUnicodeCIExceptKanaCIKanaWithVoicedSoundMark \ - "beforeゲafter" + "パピプペポ" Renamed: test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/halfwidth_katakana_ha_line_with_voiced_sound_mark.expected (+2 -2) 52% =================================================================== --- 
test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/halfwidth_katakana_voiced_sound_mark.expected 2013-05-20 21:44:46 +0900 (3ff6818) +++ test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/halfwidth_katakana_ha_line_with_voiced_sound_mark.expected 2013-05-20 23:19:49 +0900 (c6f90d0) @@ -1,4 +1,4 @@ register normalizers/mysql [[0,0.0,0.0],true] -normalize NormalizerMySQLUnicodeCIExceptKanaCIKanaWithVoicedSoundMark "beforeゲafter" -[[0,0.0,0.0],{"normalized":"BEFOREけAFTER","types":[]}] +normalize NormalizerMySQLUnicodeCIExceptKanaCIKanaWithVoicedSoundMark "バビブベボ" +[[0,0.0,0.0],{"normalized":"ばびぶべぼ","types":[]}] Renamed: test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/halfwidth_katakana_ha_line_with_voiced_sound_mark.test (+1 -1) 74% =================================================================== --- test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/halfwidth_katakana_voiced_sound_mark.test 2013-05-20 21:44:46 +0900 (c6ac08f) +++ test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/halfwidth_katakana_ha_line_with_voiced_sound_mark.test 2013-05-20 23:19:49 +0900 (df58ba0) @@ -1,4 +1,4 @@ register normalizers/mysql normalize NormalizerMySQLUnicodeCIExceptKanaCIKanaWithVoicedSoundMark \ - "beforeゲafter" + "バビブベボ" Added: test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/halfwidth_katakana_with_voiced_sound_mark.expected (+16 -0) 100644 =================================================================== --- /dev/null +++ test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/halfwidth_katakana_with_voiced_sound_mark.expected 2013-05-20 23:19:49 +0900 (98960b5) @@ -0,0 +1,16 @@ +register normalizers/mysql +[[0,0.0,0.0],true] +normalize NormalizerMySQLUnicodeCIExceptKanaCIKanaWithVoicedSoundMark "ガギグゲゴザジズゼゾダヂヅデド" +[ + [ + 0, + 0.0, + 0.0 + ], + { + "normalized": "がぎぐげござじずぜぞだぢづでど", + "types": [ + + ] + } +] Added: 
test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/halfwidth_katakana_with_voiced_sound_mark.test (+4 -0) 100644 =================================================================== --- /dev/null +++ test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/halfwidth_katakana_with_voiced_sound_mark.test 2013-05-20 23:19:49 +0900 (12b27b5) @@ -0,0 +1,4 @@ +register normalizers/mysql + +normalize NormalizerMySQLUnicodeCIExceptKanaCIKanaWithVoicedSoundMark \ + "ガギグゲゴザジズゼゾダヂヅデド" -------------- next part -------------- HTMLの添付ファイルを保管しました...Download