Kouhei Sutou 2018-11-02 18:41:26 +0900 (Fri, 02 Nov 2018) Revision: b8f7950446676aa4989d7aa697d916945b1122af https://github.com/groonga/groonga/commit/b8f7950446676aa4989d7aa697d916945b1122af Message: Extract code not for unification from unify function Modified files: lib/normalizer.c Modified: lib/normalizer.c (+36 -28) =================================================================== --- lib/normalizer.c 2018-11-02 18:14:53 +0900 (0bb240515) +++ lib/normalizer.c 2018-11-02 18:41:26 +0900 (d6b2b5624) @@ -582,6 +582,13 @@ typedef struct { size_t ds; grn_bool remove_blank_p; grn_bool remove_tokenized_delimiter_p; + + unsigned char unified_kana[3]; + unsigned char unified_kana_case[3]; + unsigned char unified_kana_voiced_sound_mark[3]; + unsigned char unified_hyphen[1]; + unsigned char unified_prolonged_sound_mark[3]; + unsigned char unified_middle_dot[2]; } grn_nfkc_normalize_data; grn_inline static void @@ -632,6 +639,15 @@ grn_nfkc_normalize_data_init(grn_ctx *ctx, data->de = data->d + data->ds; data->d_ = NULL; data->e = (unsigned char *)(data->string->original) + data->size; + + data->unified_hyphen[0] = '-'; + /* U+30FC KATAKANA-HIRAGANA PROLONGED SOUND MARK */ + data->unified_prolonged_sound_mark[0] = 0xe3; + data->unified_prolonged_sound_mark[1] = 0x83; + data->unified_prolonged_sound_mark[2] = 0xbc; + /* U+00B7 MIDDLE DOT */ + data->unified_middle_dot[0] = 0xc2; + data->unified_middle_dot[1] = 0xb7; } grn_inline static void @@ -1115,21 +1131,11 @@ grn_nfkc_normalize_unify(grn_ctx *ctx, grn_nfkc_normalize_data *data, grn_char_type char_type) { - const unsigned char *p_original = data->p; - unsigned char unified_kana[3]; - unsigned char unified_kana_case[3]; - unsigned char unified_kana_voiced_sound_mark[3]; - const unsigned char unified_hyphen[] = {'-'}; - /* U+30FC KATAKANA-HIRAGANA PROLONGED SOUND MARK */ - const unsigned char unified_prolonged_sound_mark[] = {0xe3, 0x83, 0xbc}; - /* U+00B7 MIDDLE DOT */ - const unsigned char unified_middle_dot[] = {0xc2, 0xb7}; - if (data->options->unify_kana && char_type == GRN_CHAR_KATAKANA && data->lp == 3) { - data->p = grn_nfkc_normalize_unify_kana(data->p, unified_kana); - if (data->p == unified_kana) { + data->p = grn_nfkc_normalize_unify_kana(data->p, data->unified_kana); + if (data->p == data->unified_kana) { char_type = GRN_CHAR_HIRAGANA; } } @@ -1139,13 +1145,13 @@ grn_nfkc_normalize_unify(grn_ctx *ctx, case GRN_CHAR_HIRAGANA : if (data->lp == 3) { data->p = grn_nfkc_normalize_unify_hiragana_case( - data->p, unified_kana_case); + data->p, data->unified_kana_case); } break; case GRN_CHAR_KATAKANA : if (data->lp == 3) { data->p = grn_nfkc_normalize_unify_katakana_case( - data->p, unified_kana_case); + data->p, data->unified_kana_case); } break; default : @@ -1158,13 +1164,13 @@ grn_nfkc_normalize_unify(grn_ctx *ctx, case GRN_CHAR_HIRAGANA : if (data->lp == 3) { data->p = grn_nfkc_normalize_unify_hiragana_voiced_sound_mark( - data->p, unified_kana_voiced_sound_mark); + data->p, data->unified_kana_voiced_sound_mark); } break; case GRN_CHAR_KATAKANA : if (data->lp == 3) { data->p = grn_nfkc_normalize_unify_katakana_voiced_sound_mark( - data->p, unified_kana_voiced_sound_mark); + data->p, data->unified_kana_voiced_sound_mark); } break; default : @@ -1174,16 +1180,16 @@ grn_nfkc_normalize_unify(grn_ctx *ctx, if (data->options->unify_hyphen) { if (grn_nfkc_normalize_is_hyphen_famity(data->p, data->lp)) { - data->p = unified_hyphen; - data->lp = sizeof(unified_hyphen); + data->p = data->unified_hyphen; + data->lp = sizeof(data->unified_hyphen); char_type = GRN_CHAR_SYMBOL; } } if (data->options->unify_prolonged_sound_mark) { if (grn_nfkc_normalize_is_prolonged_sound_mark_famity(data->p, data->lp)) { - data->p = unified_prolonged_sound_mark; - data->lp = sizeof(unified_prolonged_sound_mark); + data->p = data->unified_prolonged_sound_mark; + data->lp = sizeof(data->unified_prolonged_sound_mark); char_type = GRN_CHAR_KATAKANA; } } @@ -1191,16 +1197,16 @@ grn_nfkc_normalize_unify(grn_ctx *ctx, if (data->options->unify_hyphen_and_prolonged_sound_mark) { if (grn_nfkc_normalize_is_hyphen_famity(data->p, data->lp) || grn_nfkc_normalize_is_prolonged_sound_mark_famity(data->p, data->lp)) { - data->p = unified_hyphen; - data->lp = sizeof(unified_hyphen); + data->p = data->unified_hyphen; + data->lp = sizeof(data->unified_hyphen); char_type = GRN_CHAR_SYMBOL; } } if (data->options->unify_middle_dot) { if (grn_nfkc_normalize_is_middle_dot_family(data->p, data->lp)) { - data->p = unified_middle_dot; - data->lp = sizeof(unified_middle_dot); + data->p = data->unified_middle_dot; + data->lp = sizeof(data->unified_middle_dot); char_type = GRN_CHAR_SYMBOL; } } @@ -1217,9 +1223,6 @@ grn_nfkc_normalize_unify(grn_ctx *ctx, } } - grn_memcpy(data->d, data->p, data->lp); - data->p = p_original; - return char_type; } @@ -1293,7 +1296,12 @@ grn_nfkc_normalize(grn_ctx *ctx, } } - char_type = grn_nfkc_normalize_unify(ctx, &data, char_type); + { + const unsigned char *p = data.p; + char_type = grn_nfkc_normalize_unify(ctx, &data, char_type); + grn_memcpy(data.d, data.p, data.lp); + data.p = p; + } data.d_ = data.d; if (data.lp > 0) { -------------- next part -------------- An HTML attachment was scrubbed... URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20181102/dbe3e654/attachment-0001.html>