Kouhei Sutou 2019-01-25 11:21:50 +0900 (Fri, 25 Jan 2019) Revision: 40e3790667a58175177c5fdae69e97fca839d8e0 https://github.com/groonga/groonga/commit/40e3790667a58175177c5fdae69e97fca839d8e0 Message: NormalizerNFKC100: fix a bug where stateful normalization breaks stateless normalization. For example, using "unify_kana" (stateless) together with "unify_katakana_v_sounds" (stateful) returns wrong normalized text. Added files: test/command/suite/normalizers/nfkc100/mix/unify_katakana_v_sounds_and_kana.expected test/command/suite/normalizers/nfkc100/mix/unify_katakana_v_sounds_and_kana.test Modified files: lib/normalizer.c Modified: lib/normalizer.c (+23 -2) =================================================================== --- lib/normalizer.c 2019-01-24 16:22:59 +0900 (5a0dc7f23) +++ lib/normalizer.c 2019-01-25 11:21:50 +0900 (024766a8d) @@ -1,7 +1,7 @@ /* -*- c-basic-offset: 2 -*- */ /* Copyright(C) 2012-2018 Brazil - Copyright(C) 2018 Kouhei Sutou <kou****@clear*****> + Copyright(C) 2018-2019 Kouhei Sutou <kou****@clear*****> This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -663,6 +663,18 @@ grn_nfkc_normalize_context_swap(grn_ctx *ctx, } grn_inline static void +grn_nfkc_normalize_context_rewind(grn_ctx *ctx, + grn_nfkc_normalize_context *context) +{ + context->d = context->dest; + context->d_ = NULL; + context->n_characters = 0; + context->c = context->checks; + context->t = context->types; + context->o = context->offsets; +} + +grn_inline static void grn_nfkc_normalize_data_init(grn_ctx *ctx, grn_nfkc_normalize_data *data, grn_obj *string, @@ -1337,6 +1349,9 @@ grn_nfkc_normalize_unify_katakana_v_sounds(grn_ctx *ctx, return unified_buffer; } + *n_unified_bytes = *n_used_bytes; + *n_unified_characters = *n_used_characters; + return current; } @@ -1390,6 +1405,9 @@ grn_nfkc_normalize_unify_katakana_bu_sound(grn_ctx *ctx, return unified_buffer; } + *n_unified_bytes = *n_used_bytes; + *n_unified_characters = 
*n_used_characters; + return current; } @@ -1436,7 +1454,7 @@ grn_nfkc_normalize_unify_stateful(grn_ctx *ctx, } if (unify->c) { size_t i; - *(unify->c++) += data->context.checks[i_byte]; + *(unify->c++) = data->context.checks[i_byte]; for (i = 1; i < n_unified_bytes; i++) { *(unify->c++) = 0; } @@ -1547,6 +1565,7 @@ grn_nfkc_normalize_unify(grn_ctx *ctx, if (data->options->unify_katakana_v_sounds) { if (need_swap) { grn_nfkc_normalize_context_swap(ctx, &(data->context), &unify); + grn_nfkc_normalize_context_rewind(ctx, &unify); } grn_nfkc_normalize_unify_stateful(ctx, data, @@ -1562,6 +1581,7 @@ grn_nfkc_normalize_unify(grn_ctx *ctx, if (data->options->unify_katakana_bu_sound) { if (need_swap) { grn_nfkc_normalize_context_swap(ctx, &(data->context), &unify); + grn_nfkc_normalize_context_rewind(ctx, &unify); } grn_nfkc_normalize_unify_stateful(ctx, data, @@ -1577,6 +1597,7 @@ grn_nfkc_normalize_unify(grn_ctx *ctx, if (data->options->unify_to_romaji) { if (need_swap) { grn_nfkc_normalize_context_swap(ctx, &(data->context), &unify); + grn_nfkc_normalize_context_rewind(ctx, &unify); } grn_nfkc_normalize_unify_stateful(ctx, data, Added: test/command/suite/normalizers/nfkc100/mix/unify_katakana_v_sounds_and_kana.expected (+72 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/normalizers/nfkc100/mix/unify_katakana_v_sounds_and_kana.expected 2019-01-25 11:21:50 +0900 (7049c7dce) @@ -0,0 +1,72 @@ +normalize 'NormalizerNFKC100("unify_kana", true, "unify_katakana_v_sounds", true, "report_source_offset", true)' "うヴァヴィヴヴェヴォヴ" WITH_CHECKS|WITH_TYPES +[ + [ + 0, + 0.0, + 0.0 + ], + { + "normalized": "うゔぁゔぃゔゔぇゔぉゔ", + "types": [ + "hiragana", + "hiragana", + "hiragana", + "hiragana", + "hiragana", + "hiragana", + "hiragana", + "hiragana", + "hiragana", + "hiragana", + "hiragana" + ], + "checks": [ + 3, + 0, + 0, + 3, + 0, + 0, + 3, + 0, + 0, + 3, + 0, + 0, + 3, + 0, + 0, + 3, + 0, + 0, + 3, + 0, + 0, + 3, + 0, + 0, + 
3, + 0, + 0, + 3, + 0, + 0, + 3, + 0, + 0 + ], + "offsets": [ + 0, + 3, + 6, + 9, + 12, + 15, + 18, + 21, + 24, + 27, + 30 + ] + } +] Added: test/command/suite/normalizers/nfkc100/mix/unify_katakana_v_sounds_and_kana.test (+6 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/normalizers/nfkc100/mix/unify_katakana_v_sounds_and_kana.test 2019-01-25 11:21:50 +0900 (99ba70623) @@ -0,0 +1,6 @@ +normalize \ + 'NormalizerNFKC100("unify_kana", true, \ + "unify_katakana_v_sounds", true, \ + "report_source_offset", true)' \ + "うヴァヴィヴヴェヴォヴ" \ + WITH_CHECKS|WITH_TYPES -------------- next part -------------- An HTML attachment was scrubbed... URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190125/b5897eeb/attachment-0001.html>