Kouhei Sutou
null+****@clear*****
Mon Apr 9 16:58:07 JST 2018
Kouhei Sutou 2018-04-09 16:58:07 +0900 (Mon, 09 Apr 2018) New Revision: 10b3941ab176de098852ae150350481c3f86988c https://github.com/groonga/groonga/commit/10b3941ab176de098852ae150350481c3f86988c Message: Add NormalizerNFKC100 that is Unicode 10.0 based normalizer Added files: test/command/suite/normalizers/nfkc100/emoji.expected test/command/suite/normalizers/nfkc100/emoji.test Modified files: lib/normalizer.c lib/proc.c lib/string.c test/command/suite/normalizer_list/default.expected test/command/suite/schema/plugins.expected test/command/suite/schema/tables/columns/compress/lz4.expected test/command/suite/schema/tables/columns/compress/zlib.expected test/command/suite/schema/tables/columns/compress/zstd.expected test/command/suite/schema/tables/columns/type/index_medium.expected test/command/suite/schema/tables/columns/type/index_small.expected test/command/suite/schema/tables/columns/type/scalar.expected test/command/suite/schema/tables/columns/type/vector.expected test/command/suite/schema/tables/normalizer.expected test/command/suite/schema/tables/token_filters.expected test/command/suite/schema/tables/tokenizer.expected test/command/suite/schema/tables/tokenizer_with_options.expected test/command/suite/schema/tables/type/array.expected test/command/suite/schema/tables/type/hash_table.expected test/command/suite/schema/tables/value_type/reference.expected test/command/suite/schema/tables/value_type/type.expected Modified: lib/normalizer.c (+39 -7) =================================================================== --- lib/normalizer.c 2018-04-09 16:48:42 +0900 (23d94505f) +++ lib/normalizer.c 2018-04-09 16:58:07 +0900 (8ca5aa18e) @@ -1,6 +1,6 @@ /* -*- c-basic-offset: 2 -*- */ /* - Copyright(C) 2012 Brazil + Copyright(C) 2012-2018 Brazil This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -610,8 +610,17 @@ grn_str_charlen_utf8(grn_ctx *ctx, const unsigned char *str, const unsigned char 
return 0; } +typedef grn_char_type (*grn_nfkc_char_type_func)(const unsigned char *utf8); +typedef const char *(*grn_nfkc_decompose_func)(const unsigned char *utf8); +typedef const char *(*grn_nfkc_compose_func)(const unsigned char *prefix_utf8, + const unsigned char *suffix_utf8); + grn_inline static grn_obj * -utf8_normalize(grn_ctx *ctx, grn_string *nstr) +utf8_normalize(grn_ctx *ctx, + grn_string *nstr, + grn_nfkc_char_type_func char_type_func, + grn_nfkc_decompose_func decompose_func, + grn_nfkc_compose_func compose_func) { int16_t *ch; const unsigned char *s, *s_, *s__ = NULL, *p, *p2, *pe, *e; @@ -659,13 +668,13 @@ utf8_normalize(grn_ctx *ctx, grn_string *nstr) GRN_ENC_UTF8)) { continue; } - if ((p = (unsigned char *)grn_nfkc_decompose(s))) { + if ((p = (unsigned char *)decompose_func(s))) { pe = p + strlen((char *)p); } else { p = s; pe = p + ls; } - if (d_ && (p2 = (unsigned char *)grn_nfkc_compose(d_, p))) { + if (d_ && (p2 = (unsigned char *)compose_func(d_, p))) { p = p2; pe = p + strlen((char *)p); if (cp) { cp--; } @@ -730,7 +739,7 @@ utf8_normalize(grn_ctx *ctx, grn_string *nstr) d_ = d; d += lp; length++; - if (cp) { *cp++ = grn_nfkc_char_type(p); } + if (cp) { *cp++ = char_type_func(p); } if (ch) { size_t i; if (s_ == s + ls) { @@ -1124,7 +1133,11 @@ auto_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) break; case GRN_ENC_UTF8 : #ifdef GRN_WITH_NFKC - utf8_normalize(ctx, string); + utf8_normalize(ctx, + string, + grn_nfkc_char_type, + grn_nfkc_decompose, + grn_nfkc_compose); #else /* GRN_WITH_NFKC */ ascii_normalize(ctx, string); #endif /* GRN_WITH_NFKC */ @@ -1150,7 +1163,23 @@ static grn_obj * nfkc51_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) { grn_string *string = (grn_string *)(args[0]); - utf8_normalize(ctx, string); + utf8_normalize(ctx, + string, + grn_nfkc50_char_type, + grn_nfkc50_decompose, + grn_nfkc50_compose); + return NULL; +} + +static grn_obj * +nfkc100_next(grn_ctx *ctx, int 
nargs, grn_obj **args, grn_user_data *user_data) +{ + grn_string *string = (grn_string *)(args[0]); + utf8_normalize(ctx, + string, + grn_nfkc100_char_type, + grn_nfkc100_decompose, + grn_nfkc100_compose); return NULL; } #endif /* GRN_WITH_NFKC */ @@ -1173,6 +1202,7 @@ grn_rc grn_db_init_builtin_normalizers(grn_ctx *ctx) { const char *normalizer_nfkc51_name = "NormalizerNFKC51"; + const char *normalizer_nfkc100_name = "NormalizerNFKC100"; grn_normalizer_register(ctx, GRN_NORMALIZER_AUTO_NAME, -1, NULL, auto_next, NULL); @@ -1180,6 +1210,8 @@ grn_db_init_builtin_normalizers(grn_ctx *ctx) #ifdef GRN_WITH_NFKC grn_normalizer_register(ctx, normalizer_nfkc51_name, -1, NULL, nfkc51_next, NULL); + grn_normalizer_register(ctx, normalizer_nfkc100_name, -1, + NULL, nfkc100_next, NULL); #else /* GRN_WITH_NFKC */ grn_normalizer_register(ctx, normalizer_nfkc51_name, -1, NULL, NULL, NULL); Modified: lib/proc.c (+1 -49) =================================================================== --- lib/proc.c 2018-04-09 16:48:42 +0900 (b860cfb74) +++ lib/proc.c 2018-04-09 16:58:07 +0900 (f89c05426) @@ -1130,54 +1130,6 @@ is_normalizer(grn_ctx *ctx, grn_obj *object) return GRN_TRUE; } -static const char * -char_type_name(grn_char_type type) -{ - const char *name = "unknown"; - -#define CHAR_TYPE_NAME_WITH_BLANK(type_name) do { \ - if (GRN_CHAR_IS_BLANK(type)) { \ - name = type_name "|blank"; \ - } else { \ - name = type_name; \ - } \ - } while (GRN_FALSE) - - switch (GRN_CHAR_TYPE(type)) { - case GRN_CHAR_NULL : - CHAR_TYPE_NAME_WITH_BLANK("null"); - break; - case GRN_CHAR_ALPHA : - CHAR_TYPE_NAME_WITH_BLANK("alpha"); - break; - case GRN_CHAR_DIGIT : - CHAR_TYPE_NAME_WITH_BLANK("digit"); - break; - case GRN_CHAR_SYMBOL : - CHAR_TYPE_NAME_WITH_BLANK("symbol"); - break; - case GRN_CHAR_HIRAGANA : - CHAR_TYPE_NAME_WITH_BLANK("hiragana"); - break; - case GRN_CHAR_KATAKANA : - CHAR_TYPE_NAME_WITH_BLANK("katakana"); - break; - case GRN_CHAR_KANJI : - CHAR_TYPE_NAME_WITH_BLANK("kanji"); - break; 
- case GRN_CHAR_OTHERS : - CHAR_TYPE_NAME_WITH_BLANK("others"); - break; - default : - CHAR_TYPE_NAME_WITH_BLANK("unknown"); - break; - } - -#undef CHAR_TYPE_NAME_WITH_BLANK - - return name; -} - static grn_obj * proc_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) { @@ -1250,7 +1202,7 @@ proc_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data unsigned int i; GRN_OUTPUT_ARRAY_OPEN("types", normalized_n_characters); for (i = 0; i < normalized_n_characters; i++) { - GRN_OUTPUT_CSTR(char_type_name(types[i])); + GRN_OUTPUT_CSTR(grn_char_type_to_string(types[i])); } GRN_OUTPUT_ARRAY_CLOSE(); } else { Modified: lib/string.c (+44 -20) =================================================================== --- lib/string.c 2018-04-09 16:48:42 +0900 (e281791da) +++ lib/string.c 2018-04-09 16:58:07 +0900 (cbe42ac24) @@ -28,28 +28,52 @@ const char * grn_char_type_to_string(grn_char_type type) { - switch (type) { - case GRN_CHAR_NULL: - return "null"; - case GRN_CHAR_ALPHA: - return "alpha"; - case GRN_CHAR_DIGIT: - return "digit"; - case GRN_CHAR_SYMBOL: - return "symbol"; - case GRN_CHAR_HIRAGANA: - return "hiragana"; - case GRN_CHAR_KATAKANA: - return "katakana"; - case GRN_CHAR_KANJI: - return "kanji"; - case GRN_CHAR_OTHERS: - return "others"; - case GRN_CHAR_EMOJI: - return "emoji"; + const char *string = "unknown"; + +#define CHAR_TYPE_STRING_WITH_BLANK(type_string) do { \ + if (GRN_CHAR_IS_BLANK(type)) { \ + string = type_string "|blank"; \ + } else { \ + string = type_string; \ + } \ + } while (GRN_FALSE) + + switch (GRN_CHAR_TYPE(type)) { + case GRN_CHAR_NULL : + CHAR_TYPE_STRING_WITH_BLANK("null"); + break; + case GRN_CHAR_ALPHA : + CHAR_TYPE_STRING_WITH_BLANK("alpha"); + break; + case GRN_CHAR_DIGIT : + CHAR_TYPE_STRING_WITH_BLANK("digit"); + break; + case GRN_CHAR_SYMBOL : + CHAR_TYPE_STRING_WITH_BLANK("symbol"); + break; + case GRN_CHAR_HIRAGANA : + CHAR_TYPE_STRING_WITH_BLANK("hiragana"); + break; + case 
GRN_CHAR_KATAKANA : + CHAR_TYPE_STRING_WITH_BLANK("katakana"); + break; + case GRN_CHAR_KANJI : + CHAR_TYPE_STRING_WITH_BLANK("kanji"); + break; + case GRN_CHAR_OTHERS : + CHAR_TYPE_STRING_WITH_BLANK("others"); + break; + case GRN_CHAR_EMOJI : + CHAR_TYPE_STRING_WITH_BLANK("emoji"); + break; + default : + CHAR_TYPE_STRING_WITH_BLANK("unknown"); + break; } - return "unknown"; +#undef CHAR_TYPE_STRING_WITH_BLANK + + return string; } static grn_string * Modified: test/command/suite/normalizer_list/default.expected (+18 -1) =================================================================== --- test/command/suite/normalizer_list/default.expected 2018-04-09 16:48:42 +0900 (81a6924c2) +++ test/command/suite/normalizer_list/default.expected 2018-04-09 16:58:07 +0900 (832c59d08) @@ -1,2 +1,19 @@ normalizer_list -[[0,0.0,0.0],[{"name":"NormalizerAuto"},{"name":"NormalizerNFKC51"}]] +[ + [ + 0, + 0.0, + 0.0 + ], + [ + { + "name": "NormalizerAuto" + }, + { + "name": "NormalizerNFKC51" + }, + { + "name": "NormalizerNFKC100" + } + ] +] Added: test/command/suite/normalizers/nfkc100/emoji.expected (+2 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/normalizers/nfkc100/emoji.expected 2018-04-09 16:58:07 +0900 (aad3d4d2f) @@ -0,0 +1,2 @@ +normalize NormalizerNFKC100 "©" WITH_TYPES +[[0,0.0,0.0],{"normalized":"©","types":["emoji"],"checks":[]}] Added: test/command/suite/normalizers/nfkc100/emoji.test (+1 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/normalizers/nfkc100/emoji.test 2018-04-09 16:58:07 +0900 (71cb0fcc2) @@ -0,0 +1 @@ +normalize NormalizerNFKC100 "©" WITH_TYPES Modified: test/command/suite/schema/plugins.expected (+4 -0) =================================================================== --- test/command/suite/schema/plugins.expected 2018-04-09 16:48:42 +0900 (e3bbcbcc7) +++ test/command/suite/schema/plugins.expected 2018-04-09 
16:58:07 +0900 (1ddc055f1) @@ -201,6 +201,10 @@ schema "id": 79, "name": "NormalizerAuto" }, + "NormalizerNFKC100": { + "id": 81, + "name": "NormalizerNFKC100" + }, "NormalizerNFKC51": { "id": 80, "name": "NormalizerNFKC51" Modified: test/command/suite/schema/tables/columns/compress/lz4.expected (+4 -0) =================================================================== --- test/command/suite/schema/tables/columns/compress/lz4.expected 2018-04-09 16:48:42 +0900 (bfab62c18) +++ test/command/suite/schema/tables/columns/compress/lz4.expected 2018-04-09 16:58:07 +0900 (ac1adadd1) @@ -200,6 +200,10 @@ schema "id": 79, "name": "NormalizerAuto" }, + "NormalizerNFKC100": { + "id": 81, + "name": "NormalizerNFKC100" + }, "NormalizerNFKC51": { "id": 80, "name": "NormalizerNFKC51" Modified: test/command/suite/schema/tables/columns/compress/zlib.expected (+4 -0) =================================================================== --- test/command/suite/schema/tables/columns/compress/zlib.expected 2018-04-09 16:48:42 +0900 (6eba84ce0) +++ test/command/suite/schema/tables/columns/compress/zlib.expected 2018-04-09 16:58:07 +0900 (28ebf9471) @@ -200,6 +200,10 @@ schema "id": 79, "name": "NormalizerAuto" }, + "NormalizerNFKC100": { + "id": 81, + "name": "NormalizerNFKC100" + }, "NormalizerNFKC51": { "id": 80, "name": "NormalizerNFKC51" Modified: test/command/suite/schema/tables/columns/compress/zstd.expected (+4 -0) =================================================================== --- test/command/suite/schema/tables/columns/compress/zstd.expected 2018-04-09 16:48:42 +0900 (6bfeb981e) +++ test/command/suite/schema/tables/columns/compress/zstd.expected 2018-04-09 16:58:07 +0900 (470518c79) @@ -200,6 +200,10 @@ schema "id": 79, "name": "NormalizerAuto" }, + "NormalizerNFKC100": { + "id": 81, + "name": "NormalizerNFKC100" + }, "NormalizerNFKC51": { "id": 80, "name": "NormalizerNFKC51" Modified: test/command/suite/schema/tables/columns/type/index_medium.expected (+4 -0) 
=================================================================== --- test/command/suite/schema/tables/columns/type/index_medium.expected 2018-04-09 16:48:42 +0900 (f028fb7cb) +++ test/command/suite/schema/tables/columns/type/index_medium.expected 2018-04-09 16:58:07 +0900 (33089f2d1) @@ -206,6 +206,10 @@ schema "id": 79, "name": "NormalizerAuto" }, + "NormalizerNFKC100": { + "id": 81, + "name": "NormalizerNFKC100" + }, "NormalizerNFKC51": { "id": 80, "name": "NormalizerNFKC51" Modified: test/command/suite/schema/tables/columns/type/index_small.expected (+4 -0) =================================================================== --- test/command/suite/schema/tables/columns/type/index_small.expected 2018-04-09 16:48:42 +0900 (0326d2757) +++ test/command/suite/schema/tables/columns/type/index_small.expected 2018-04-09 16:58:07 +0900 (ca070aea2) @@ -206,6 +206,10 @@ schema "id": 79, "name": "NormalizerAuto" }, + "NormalizerNFKC100": { + "id": 81, + "name": "NormalizerNFKC100" + }, "NormalizerNFKC51": { "id": 80, "name": "NormalizerNFKC51" Modified: test/command/suite/schema/tables/columns/type/scalar.expected (+4 -0) =================================================================== --- test/command/suite/schema/tables/columns/type/scalar.expected 2018-04-09 16:48:42 +0900 (1b5397061) +++ test/command/suite/schema/tables/columns/type/scalar.expected 2018-04-09 16:58:07 +0900 (91c546c87) @@ -200,6 +200,10 @@ schema "id": 79, "name": "NormalizerAuto" }, + "NormalizerNFKC100": { + "id": 81, + "name": "NormalizerNFKC100" + }, "NormalizerNFKC51": { "id": 80, "name": "NormalizerNFKC51" Modified: test/command/suite/schema/tables/columns/type/vector.expected (+4 -0) =================================================================== --- test/command/suite/schema/tables/columns/type/vector.expected 2018-04-09 16:48:42 +0900 (ff6017c2f) +++ test/command/suite/schema/tables/columns/type/vector.expected 2018-04-09 16:58:07 +0900 (f04e88e2b) @@ -202,6 +202,10 @@ schema "id": 79, 
"name": "NormalizerAuto" }, + "NormalizerNFKC100": { + "id": 81, + "name": "NormalizerNFKC100" + }, "NormalizerNFKC51": { "id": 80, "name": "NormalizerNFKC51" Modified: test/command/suite/schema/tables/normalizer.expected (+4 -0) =================================================================== --- test/command/suite/schema/tables/normalizer.expected 2018-04-09 16:48:42 +0900 (62b64bdd2) +++ test/command/suite/schema/tables/normalizer.expected 2018-04-09 16:58:07 +0900 (70ce81083) @@ -198,6 +198,10 @@ schema "id": 79, "name": "NormalizerAuto" }, + "NormalizerNFKC100": { + "id": 81, + "name": "NormalizerNFKC100" + }, "NormalizerNFKC51": { "id": 80, "name": "NormalizerNFKC51" Modified: test/command/suite/schema/tables/token_filters.expected (+4 -0) =================================================================== --- test/command/suite/schema/tables/token_filters.expected 2018-04-09 16:48:42 +0900 (620433ea3) +++ test/command/suite/schema/tables/token_filters.expected 2018-04-09 16:58:07 +0900 (f63a2b7b0) @@ -203,6 +203,10 @@ schema "id": 79, "name": "NormalizerAuto" }, + "NormalizerNFKC100": { + "id": 81, + "name": "NormalizerNFKC100" + }, "NormalizerNFKC51": { "id": 80, "name": "NormalizerNFKC51" Modified: test/command/suite/schema/tables/tokenizer.expected (+4 -0) =================================================================== --- test/command/suite/schema/tables/tokenizer.expected 2018-04-09 16:48:42 +0900 (b7cfcd2d2) +++ test/command/suite/schema/tables/tokenizer.expected 2018-04-09 16:58:07 +0900 (026790528) @@ -198,6 +198,10 @@ schema "id": 79, "name": "NormalizerAuto" }, + "NormalizerNFKC100": { + "id": 81, + "name": "NormalizerNFKC100" + }, "NormalizerNFKC51": { "id": 80, "name": "NormalizerNFKC51" Modified: test/command/suite/schema/tables/tokenizer_with_options.expected (+4 -0) =================================================================== --- test/command/suite/schema/tables/tokenizer_with_options.expected 2018-04-09 16:48:42 +0900 
(41ed03ce6) +++ test/command/suite/schema/tables/tokenizer_with_options.expected 2018-04-09 16:58:07 +0900 (c7ed034f7) @@ -198,6 +198,10 @@ schema "id": 79, "name": "NormalizerAuto" }, + "NormalizerNFKC100": { + "id": 81, + "name": "NormalizerNFKC100" + }, "NormalizerNFKC51": { "id": 80, "name": "NormalizerNFKC51" Modified: test/command/suite/schema/tables/type/array.expected (+4 -0) =================================================================== --- test/command/suite/schema/tables/type/array.expected 2018-04-09 16:48:42 +0900 (67566e936) +++ test/command/suite/schema/tables/type/array.expected 2018-04-09 16:58:07 +0900 (94661bca3) @@ -198,6 +198,10 @@ schema "id": 79, "name": "NormalizerAuto" }, + "NormalizerNFKC100": { + "id": 81, + "name": "NormalizerNFKC100" + }, "NormalizerNFKC51": { "id": 80, "name": "NormalizerNFKC51" Modified: test/command/suite/schema/tables/type/hash_table.expected (+4 -0) =================================================================== --- test/command/suite/schema/tables/type/hash_table.expected 2018-04-09 16:48:42 +0900 (3dda6cf5b) +++ test/command/suite/schema/tables/type/hash_table.expected 2018-04-09 16:58:07 +0900 (a540989b7) @@ -198,6 +198,10 @@ schema "id": 79, "name": "NormalizerAuto" }, + "NormalizerNFKC100": { + "id": 81, + "name": "NormalizerNFKC100" + }, "NormalizerNFKC51": { "id": 80, "name": "NormalizerNFKC51" Modified: test/command/suite/schema/tables/value_type/reference.expected (+4 -0) =================================================================== --- test/command/suite/schema/tables/value_type/reference.expected 2018-04-09 16:48:42 +0900 (7f201461f) +++ test/command/suite/schema/tables/value_type/reference.expected 2018-04-09 16:58:07 +0900 (a566e438f) @@ -200,6 +200,10 @@ schema "id": 79, "name": "NormalizerAuto" }, + "NormalizerNFKC100": { + "id": 81, + "name": "NormalizerNFKC100" + }, "NormalizerNFKC51": { "id": 80, "name": "NormalizerNFKC51" Modified: 
test/command/suite/schema/tables/value_type/type.expected (+4 -0) =================================================================== --- test/command/suite/schema/tables/value_type/type.expected 2018-04-09 16:48:42 +0900 (964056fa7) +++ test/command/suite/schema/tables/value_type/type.expected 2018-04-09 16:58:07 +0900 (b3a0afd59) @@ -198,6 +198,10 @@ schema "id": 79, "name": "NormalizerAuto" }, + "NormalizerNFKC100": { + "id": 81, + "name": "NormalizerNFKC100" + }, "NormalizerNFKC51": { "id": 80, "name": "NormalizerNFKC51" -------------- next part -------------- HTMLの添付ファイルを保管しました... URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180409/a4bc9294/attachment-0001.htm