null+****@clear*****
null+****@clear*****
2012年 2月 13日 (月) 18:44:06 JST
Kouhei Sutou 2012-02-13 18:44:06 +0900 (Mon, 13 Feb 2012)
New Revision: 730d953118097be6b05bffc286bce19a2df50685
Log:
[normalizer][nfkc] Rename NormalizerUTF8NFKC to NormalizerNFKC51.
The new name reflects that the normalizer is based on Unicode 5.1.
Modified files:
include/groonga.h
lib/normalizer.c
plugins/normalizers/Makefile.am
plugins/normalizers/nfkc.c
test/benchmark/bench-normalize.c
test/unit/core/test-command-dump.c
Renamed files:
plugins/normalizers/nfkc-unicode-5.1.c
(from plugins/normalizers/nfkc-core.c)
plugins/normalizers/nfkc-unicode-5.1.h
(from plugins/normalizers/nfkc.h)
Modified: include/groonga.h (+1 -1)
===================================================================
--- include/groonga.h 2012-02-13 18:02:01 +0900 (17a0b28)
+++ include/groonga.h 2012-02-13 18:44:06 +0900 (750e37e)
@@ -593,11 +593,11 @@ typedef enum {
typedef enum {
GRN_DB_NORMALIZER_ASCII = 96,
- GRN_DB_NORMALIZER_UTF8_NFKC, /* Normalization Form KC */
GRN_DB_NORMALIZER_EUC_JP,
GRN_DB_NORMALIZER_SJIS,
GRN_DB_NORMALIZER_LATIN1,
GRN_DB_NORMALIZER_KOI8R,
+ GRN_DB_NORMALIZER_NFKC51, /* Normalization Form KC for Unicode 5.1 */
GRN_DB_NORMALIZER_UTF8_UCA /* Unicode Collation Algorithm */
} grn_builtin_normalizer;
Modified: lib/normalizer.c (+9 -9)
===================================================================
--- lib/normalizer.c 2012-02-13 18:02:01 +0900 (3462498)
+++ lib/normalizer.c 2012-02-13 18:44:06 +0900 (dbd140a)
@@ -32,7 +32,7 @@ grn_normalizer_find(grn_ctx *ctx, grn_encoding encoding)
break;
case GRN_ENC_UTF8 :
#ifdef WITH_NFKC
- normalizer_id = GRN_DB_NORMALIZER_UTF8_NFKC;
+ normalizer_id = GRN_DB_NORMALIZER_NFKC51;
#else /* WITH_NFKC */
normalizer_id = GRN_DB_NORMALIZER_ASCII;
#endif /* WITH_NFKC */
@@ -1123,14 +1123,6 @@ grn_db_init_builtin_normalizers(grn_ctx *ctx)
if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_NORMALIZER_ASCII) {
return GRN_FILE_CORRUPT;
}
-#ifdef WITH_NFKC
- if (grn_plugin_register(ctx, "normalizers/nfkc")) {
- ERRCLR(ctx);
-#endif
- grn_obj_register(ctx, grn_ctx_db(ctx), "NormalizerUTF8NFKC", 18);
-#ifdef WITH_NFKC
- }
-#endif
obj = DEF_NORMALIZERIZER("NormalizerEUCJP", eucjp_normalize);
if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_NORMALIZER_EUC_JP) {
return GRN_FILE_CORRUPT;
@@ -1147,6 +1139,14 @@ grn_db_init_builtin_normalizers(grn_ctx *ctx)
if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_NORMALIZER_KOI8R) {
return GRN_FILE_CORRUPT;
}
+#ifdef WITH_NFKC
+ if (grn_plugin_register(ctx, "normalizers/nfkc")) {
+ ERRCLR(ctx);
+#endif
+ grn_obj_register(ctx, grn_ctx_db(ctx), "NormalizerNFKC51", 16);
+#ifdef WITH_NFKC
+ }
+#endif
/* obj = DEF_NORMALIZERIZER("NormalizerUTF8UCA", utf8_uca_normalize); */
/* if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_NORMALIZER_UTF8_UCA) { */
/* return GRN_FILE_CORRUPT; */
Modified: plugins/normalizers/Makefile.am (+5 -2)
===================================================================
--- plugins/normalizers/Makefile.am 2012-02-13 18:02:01 +0900 (7652acc)
+++ plugins/normalizers/Makefile.am 2012-02-13 18:44:06 +0900 (9113882)
@@ -20,5 +20,8 @@ if WITH_NFKC
normalizer_plugins_LTLIBRARIES += nfkc.la
endif
-nfkc_la_SOURCES = nfkc.c nfkc-core.c
-noinst_HEADERS = nfkc.h
+nfkc_la_SOURCES = \
+ nfkc.c \
+ nfkc-unicode-5.1.c
+noinst_HEADERS = \
+ nfkc-unicode-5.1.h
Renamed: plugins/normalizers/nfkc-unicode-5.1.c (+5 -5) 99%
===================================================================
--- plugins/normalizers/nfkc-core.c 2012-02-13 18:02:01 +0900 (18092d9)
+++ plugins/normalizers/nfkc-unicode-5.1.c 2012-02-13 18:44:06 +0900 (3d08aae)
@@ -1,5 +1,5 @@
/* -*- c-basic-offset: 2 -*- */
-/* Copyright(C) 2010 Brazil
+/* Copyright(C) 2010-2012 Brazil
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -17,10 +17,10 @@
don't edit this file by hand. it generated automatically by nfkc.rb
*/
-#include "nfkc.h"
+#include "nfkc-unicode-5.1.h"
unsigned char
-grn_nfkc_ctype(const unsigned char *str)
+grn_nfkc_unicode_51_ctype(const unsigned char *str)
{
switch (str[0]) {
case 0x01 :
@@ -9687,7 +9687,7 @@ default :
}
const char *
-grn_nfkc_map1(const unsigned char *str)
+grn_nfkc_unicode_51_map1(const unsigned char *str)
{
switch (str[0]) {
case 0x41 :
@@ -24472,7 +24472,7 @@ case 0xF0 :
}
const char *
-grn_nfkc_map2(const unsigned char *prefix, const unsigned char *suffix)
+grn_nfkc_unicode_51_map2(const unsigned char *prefix, const unsigned char *suffix)
{
switch (suffix[0]) {
case 0xCC :
Renamed: plugins/normalizers/nfkc-unicode-5.1.h (+5 -4) 74%
===================================================================
--- plugins/normalizers/nfkc.h 2012-02-13 18:02:01 +0900 (077391d)
+++ plugins/normalizers/nfkc-unicode-5.1.h 2012-02-13 18:44:06 +0900 (8eb6bd0)
@@ -1,5 +1,5 @@
/* -*- c-basic-offset: 2 -*- */
-/* Copyright(C) 2009 Brazil
+/* Copyright(C) 2009-2012 Brazil
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -23,9 +23,10 @@
extern "C" {
#endif
-unsigned char grn_nfkc_ctype(const unsigned char *str);
-const char *grn_nfkc_map1(const unsigned char *str);
-const char *grn_nfkc_map2(const unsigned char *prefix, const unsigned char *suffix);
+unsigned char grn_nfkc_unicode_51_ctype(const unsigned char *str);
+const char *grn_nfkc_unicode_51_map1(const unsigned char *str);
+const char *grn_nfkc_unicode_51_map2(const unsigned char *prefix,
+ const unsigned char *suffix);
#ifdef __cplusplus
}
Modified: plugins/normalizers/nfkc.c (+11 -11)
===================================================================
--- plugins/normalizers/nfkc.c 2012-02-13 18:02:01 +0900 (cc6ff51)
+++ plugins/normalizers/nfkc.c 2012-02-13 18:44:06 +0900 (5dc5136)
@@ -20,7 +20,7 @@
#include <string.h>
#include <groonga/normalizer.h>
-#include "nfkc.h"
+#include "nfkc-unicode-5.1.h"
static grn_obj *
utf8_nfkc_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
@@ -45,7 +45,7 @@ utf8_nfkc_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
if (!(norm = GRN_PLUGIN_MALLOC(ctx, ds + 1))) {
GRN_PLUGIN_ERROR(ctx,
GRN_NO_MEMORY_AVAILABLE,
- "[normalizer][utf8][nfkc] "
+ "[normalizer][nfkc][unicode5.1] "
"failed to allocate normalized text space");
return NULL;
}
@@ -54,7 +54,7 @@ utf8_nfkc_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
GRN_PLUGIN_FREE(ctx, norm);
GRN_PLUGIN_ERROR(ctx,
GRN_NO_MEMORY_AVAILABLE,
- "[normalizer][utf8][nfkc] "
+ "[normalizer][nfkc][unicode5.1] "
"failed to allocate checks space");
return NULL;
}
@@ -66,7 +66,7 @@ utf8_nfkc_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
GRN_PLUGIN_FREE(ctx, norm);
GRN_PLUGIN_ERROR(ctx,
GRN_NO_MEMORY_AVAILABLE,
- "[normalizer][utf8][nfkc] "
+ "[normalizer][nfkc][unicode5.1] "
"failed to allocate character types space");
return NULL;
}
@@ -80,13 +80,13 @@ utf8_nfkc_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
if (!(ls = grn_charlen_utf8(ctx, s, e))) {
break;
}
- if ((p = (unsigned char *)grn_nfkc_map1(s))) {
+ if ((p = (unsigned char *)grn_nfkc_unicode_51_map1(s))) {
pe = p + strlen((char *)p);
} else {
p = s;
pe = p + ls;
}
- if (d_ && (p2 = (unsigned char *)grn_nfkc_map2(d_, p))) {
+ if (d_ && (p2 = (unsigned char *)grn_nfkc_unicode_51_map2(d_, p))) {
p = p2;
pe = p + strlen((char *)p);
if (cp) { cp--; }
@@ -113,7 +113,7 @@ utf8_nfkc_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
GRN_PLUGIN_FREE(ctx, norm);
GRN_PLUGIN_ERROR(ctx,
GRN_NO_MEMORY_AVAILABLE,
- "[normalizer][utf8][nfkc] "
+ "[normalizer][nfkc][unicode5.1] "
"failed to reallocate normalized text space");
return NULL;
}
@@ -129,7 +129,7 @@ utf8_nfkc_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
GRN_PLUGIN_FREE(ctx, norm);
GRN_PLUGIN_ERROR(ctx,
GRN_NO_MEMORY_AVAILABLE,
- "[normalizer][utf8][nfkc] "
+ "[normalizer][nfkc][unicode5.1] "
"failed to reallocate checks space");
return NULL;
}
@@ -144,7 +144,7 @@ utf8_nfkc_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
GRN_PLUGIN_FREE(ctx, norm);
GRN_PLUGIN_ERROR(ctx,
GRN_NO_MEMORY_AVAILABLE,
- "[normalizer][utf8][nfkc] "
+ "[normalizer][nfkc][unicode5.1] "
"failed to reallocate character types space");
return NULL;
}
@@ -156,7 +156,7 @@ utf8_nfkc_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
d_ = d;
d += lp;
length++;
- if (cp) { *cp++ = grn_nfkc_ctype(p); }
+ if (cp) { *cp++ = grn_nfkc_unicode_51_ctype(p); }
if (ch) {
size_t i;
if (s_ == s + ls) {
@@ -192,7 +192,7 @@ GRN_PLUGIN_REGISTER(grn_ctx *ctx)
{
grn_obj *normalizer;
- normalizer = GRN_NORMALIZER_REGISTER(ctx, "NormalizerUTF8NFKC",
+ normalizer = GRN_NORMALIZER_REGISTER(ctx, "NormalizerNFKC51",
NULL, utf8_nfkc_normalize, NULL);
if (normalizer) {
return GRN_SUCCESS;
Modified: test/benchmark/bench-normalize.c (+16 -16)
===================================================================
--- test/benchmark/bench-normalize.c 2012-02-13 18:02:01 +0900 (5b989d8)
+++ test/benchmark/bench-normalize.c 2012-02-13 18:44:06 +0900 (97e0fbd)
@@ -44,11 +44,11 @@
#include <groonga.h>
#include <groonga_in.h>
-#define grn_nfkc_ctype bundle_grn_nfkc_ctype
-#define grn_nfkc_map1 bundle_grn_nfkc_map1
-#define grn_nfkc_map2 bundle_grn_nfkc_map2
+#define grn_nfkc_unicode_51_ctype bundle_grn_nfkc_ctype
+#define grn_nfkc_unicode_51_map1 bundle_grn_nfkc_map1
+#define grn_nfkc_unicode_51_map2 bundle_grn_nfkc_map2
-#include "plugins/normalizers/nfkc-core.c"
+#include "plugins/normalizers/nfkc-unicode-5.1.c"
#define GRN_STR_REMOVEBLANK (0x01<<0)
#define GRN_STR_WITH_TYPES (0x01<<1)
@@ -110,13 +110,13 @@ utf8_nfkc_normalize_original(grn_ctx *ctx, grn_str *nstr)
if (!(ls = grn_charlen_utf8(ctx, s, e))) {
break;
}
- if ((p = (unsigned char *)grn_nfkc_map1(s))) {
+ if ((p = (unsigned char *)grn_nfkc_unicode_51_map1(s))) {
pe = p + strlen((char *)p);
} else {
p = s;
pe = p + ls;
}
- if (d_ && (p2 = (unsigned char *)grn_nfkc_map2(d_, p))) {
+ if (d_ && (p2 = (unsigned char *)grn_nfkc_unicode_51_map2(d_, p))) {
p = p2;
pe = p + strlen((char *)p);
if (cp) { cp--; }
@@ -182,7 +182,7 @@ utf8_nfkc_normalize_original(grn_ctx *ctx, grn_str *nstr)
d_ = d;
d += lp;
length++;
- if (cp) { *cp++ = grn_nfkc_ctype(p); }
+ if (cp) { *cp++ = grn_nfkc_unicode_51_ctype(p); }
if (ch) {
size_t i;
if (s_ == s + ls) {
@@ -247,13 +247,13 @@ utf8_nfkc_normalize_short(grn_ctx *ctx, grn_str *nstr)
if (!(ls = grn_charlen_utf8(ctx, s, e))) {
break;
}
- if ((p = (unsigned char *)grn_nfkc_map1(s))) {
+ if ((p = (unsigned char *)grn_nfkc_unicode_51_map1(s))) {
pe = p + strlen((char *)p);
} else {
p = s;
pe = p + ls;
}
- if (d_ && (p2 = (unsigned char *)grn_nfkc_map2(d_, p))) {
+ if (d_ && (p2 = (unsigned char *)grn_nfkc_unicode_51_map2(d_, p))) {
p = p2;
pe = p + strlen((char *)p);
if (cp) { cp--; }
@@ -319,7 +319,7 @@ utf8_nfkc_normalize_short(grn_ctx *ctx, grn_str *nstr)
d_ = d;
d += lp;
length++;
- if (cp) { *cp++ = grn_nfkc_ctype(p); }
+ if (cp) { *cp++ = grn_nfkc_unicode_51_ctype(p); }
if (ch) {
size_t i;
if (s_ == s + ls) {
@@ -384,13 +384,13 @@ utf8_nfkc_normalize_unsigned_char(grn_ctx *ctx, grn_str *nstr)
if (!(ls = grn_charlen_utf8(ctx, s, e))) {
break;
}
- if ((p = (unsigned char *)grn_nfkc_map1(s))) {
+ if ((p = (unsigned char *)grn_nfkc_unicode_51_map1(s))) {
pe = p + strlen((char *)p);
} else {
p = s;
pe = p + ls;
}
- if (d_ && (p2 = (unsigned char *)grn_nfkc_map2(d_, p))) {
+ if (d_ && (p2 = (unsigned char *)grn_nfkc_unicode_51_map2(d_, p))) {
p = p2;
pe = p + strlen((char *)p);
if (cp) { cp--; }
@@ -456,7 +456,7 @@ utf8_nfkc_normalize_unsigned_char(grn_ctx *ctx, grn_str *nstr)
d_ = d;
d += lp;
length++;
- if (cp) { *cp++ = grn_nfkc_ctype(p); }
+ if (cp) { *cp++ = grn_nfkc_unicode_51_ctype(p); }
if (ch) {
size_t i;
if (s_ == s + ls) {
@@ -521,13 +521,13 @@ utf8_nfkc_normalize_local(grn_ctx *ctx, grn_str *nstr)
if (!(ls = grn_charlen_utf8(ctx, s, e))) {
break;
}
- if ((p = (unsigned char *)grn_nfkc_map1(s))) {
+ if ((p = (unsigned char *)grn_nfkc_unicode_51_map1(s))) {
pe = p + strlen((char *)p);
} else {
p = s;
pe = p + ls;
}
- if (d_ && (p2 = (unsigned char *)grn_nfkc_map2(d_, p))) {
+ if (d_ && (p2 = (unsigned char *)grn_nfkc_unicode_51_map2(d_, p))) {
p = p2;
pe = p + strlen((char *)p);
if (cp) { cp--; }
@@ -593,7 +593,7 @@ utf8_nfkc_normalize_local(grn_ctx *ctx, grn_str *nstr)
d_ = d;
d += lp;
length++;
- if (cp) { *cp++ = grn_nfkc_ctype(p); }
+ if (cp) { *cp++ = grn_nfkc_unicode_51_ctype(p); }
if (ch) {
size_t i;
if (s_ == s + ls) {
Modified: test/unit/core/test-command-dump.c (+6 -6)
===================================================================
--- test/unit/core/test-command-dump.c 2012-02-13 18:02:01 +0900 (9d2a2b4)
+++ test/unit/core/test-command-dump.c 2012-02-13 18:44:06 +0900 (04bb916)
@@ -160,14 +160,14 @@ data_hash_table_create(void)
NULL);
ADD_DATA("hash - key normalize",
"table_create Blog TABLE_HASH_KEY ShortText "
- "--normalizer NormalizerUTF8NFKC",
+ "--normalizer NormalizerNFKC51",
"Blog",
GRN_OBJ_TABLE_HASH_KEY | GRN_OBJ_KEY_NORMALIZE,
"ShortText",
NULL);
ADD_DATA("hash - key normalize - value",
"table_create Blog TABLE_HASH_KEY ShortText Int32 "
- "--normalizer NormalizerUTF8NFKC",
+ "--normalizer NormalizerNFKC51",
"Blog",
GRN_OBJ_TABLE_HASH_KEY | GRN_OBJ_KEY_NORMALIZE,
"ShortText",
@@ -191,14 +191,14 @@ data_patricia_trie_create(void)
NULL);
ADD_DATA("patricia trie - key normalize",
"table_create Blog TABLE_PAT_KEY ShortText "
- "--normalizer NormalizerUTF8NFKC",
+ "--normalizer NormalizerNFKC51",
"Blog",
GRN_OBJ_TABLE_PAT_KEY | GRN_OBJ_KEY_NORMALIZE,
"ShortText",
NULL);
ADD_DATA("patricia trie - key normalize - value",
"table_create Blog TABLE_PAT_KEY ShortText Int32 "
- "--normalizer NormalizerUTF8NFKC",
+ "--normalizer NormalizerNFKC51",
"Blog",
GRN_OBJ_TABLE_PAT_KEY | GRN_OBJ_KEY_NORMALIZE,
"ShortText",
@@ -222,14 +222,14 @@ data_double_array_trie_create(void)
NULL);
ADD_DATA("double-array trie - key normalize",
"table_create Blog TABLE_DAT_KEY ShortText "
- "--normalizer NormalizerUTF8NFKC",
+ "--normalizer NormalizerNFKC51",
"Blog",
GRN_OBJ_TABLE_DAT_KEY | GRN_OBJ_KEY_NORMALIZE,
"ShortText",
NULL);
ADD_DATA("double-array trie - key normalize - value",
"table_create Blog TABLE_DAT_KEY ShortText Int32 "
- "--normalizer NormalizerUTF8NFKC",
+ "--normalizer NormalizerNFKC51",
"Blog",
GRN_OBJ_TABLE_DAT_KEY | GRN_OBJ_KEY_NORMALIZE,
"ShortText",