[Groonga-commit] groonga/groonga [master] Make normalizer grn_obj

Back to archive index

Kouhei Sutou null+****@clear*****
Fri Dec 14 12:40:02 JST 2012


Kouhei Sutou	2012-12-14 12:40:02 +0900 (Fri, 14 Dec 2012)

  New Revision: 05f50761b76802859207d885804ab546b0e0b1b1
  https://github.com/groonga/groonga/commit/05f50761b76802859207d885804ab546b0e0b1b1

  Log:
    Make normalizer grn_obj
    
    We can create a normalizer as a plugin. Yay!
    
    TODO:
    
      * Add --normalizer option to table_create command
      * Add Unicode Collation Algorithm (UCA) based normalizer as a plugin
    
    We don't implement the NFKC normalizer as a plugin because it is
    used by NormalizerAuto internally.

  Added files:
    include/groonga/normalizer.h
    lib/normalizer_in.h
  Copied files:
    lib/normalizer.c
      (from lib/string.c)
  Modified files:
    include/groonga.h
    include/groonga/Makefile.am
    lib/ctx.c
    lib/dat.cpp
    lib/dat.h
    lib/db.c
    lib/db.h
    lib/expr.c
    lib/hash.c
    lib/hash.h
    lib/ii.c
    lib/pat.c
    lib/pat.h
    lib/sources.am
    lib/string.c
    lib/token.c
    lib/tokenizer.c
    lib/util.c
    test/unit/core/dat/test-dat.cpp
    test/unit/util/test-snip.c
    test/unit/util/test-string.c

  Modified: include/groonga.h (+12 -3)
===================================================================
--- include/groonga.h    2012-12-14 12:20:38 +0900 (1a9dda6)
+++ include/groonga.h    2012-12-14 12:40:02 +0900 (77280a8)
@@ -112,7 +112,8 @@ typedef enum {
   GRN_TOO_LARGE_OFFSET = -68,
   GRN_TOO_SMALL_LIMIT = -69,
   GRN_CAS_ERROR = -70,
-  GRN_UNSUPPORTED_COMMAND_VERSION = -71
+  GRN_UNSUPPORTED_COMMAND_VERSION = -71,
+  GRN_NORMALIZER_ERROR = -72,
 } grn_rc;
 
 GRN_API grn_rc grn_init(void);
@@ -609,6 +610,12 @@ typedef enum {
   GRN_DB_TRIGRAM
 } grn_builtin_tokenizer;
 
+typedef enum {
+  GRN_DB_NORMALIZER_AUTO = 96,
+  GRN_DB_NORMALIZER_NFKC51,          /* Normalization Form KC for Unicode 5.1 */
+  GRN_DB_NORMALIZER_UCA              /* Unicode Collation Algorithm */
+} grn_builtin_normalizer;
+
 GRN_API grn_obj *grn_ctx_at(grn_ctx *ctx, grn_id id);
 
 /**
@@ -657,7 +664,8 @@ typedef enum {
   GRN_PROC_TOKENIZER = 1,
   GRN_PROC_COMMAND,
   GRN_PROC_FUNCTION,
-  GRN_PROC_HOOK
+  GRN_PROC_HOOK,
+  GRN_PROC_NORMALIZER
 } grn_proc_type;
 
 GRN_API grn_obj *grn_proc_create(grn_ctx *ctx,
@@ -1374,7 +1382,8 @@ typedef enum {
   GRN_INFO_PARTIAL_MATCH_THRESHOLD,
   GRN_INFO_II_SPLIT_THRESHOLD,
   GRN_INFO_SUPPORT_ZLIB,
-  GRN_INFO_SUPPORT_LZO
+  GRN_INFO_SUPPORT_LZO,
+  GRN_INFO_NORMALIZER
 } grn_info_type;
 
 /**

  Modified: include/groonga/Makefile.am (+2 -1)
===================================================================
--- include/groonga/Makefile.am    2012-12-14 12:20:38 +0900 (e6f37e7)
+++ include/groonga/Makefile.am    2012-12-14 12:40:02 +0900 (f7151d7)
@@ -1,4 +1,5 @@
 groonga_includedir = $(pkgincludedir)/groonga
 groonga_include_HEADERS =			\
 	plugin.h				\
-	tokenizer.h
+	tokenizer.h				\
+	normalizer.h

  Added: include/groonga/normalizer.h (+55 -0) 100644
===================================================================
--- /dev/null
+++ include/groonga/normalizer.h    2012-12-14 12:40:02 +0900 (3ec843c)
@@ -0,0 +1,55 @@
+/* -*- c-basic-offset: 2 -*- */
+/*
+  Copyright(C) 2012 Brazil
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License version 2.1 as published by the Free Software Foundation.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+#ifndef GROONGA_NORMALIER_H
+#define GROONGA_NORMALIER_H
+
+#include <stddef.h>
+
+#include <groonga/plugin.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif  /* __cplusplus */
+
+/*
+  grn_normalizer_register() registers a normalizer to the database
+  which is associated with `ctx'. `name_ptr' and `name_length' specify
+  the normalizer name. `name_length' can be `-1'. `-1' means that
+  `name_ptr' is NUL-terminated. Alphabetic letters ('A'-'Z' and
+  'a'-'z'), digits ('0'-'9') and an underscore ('_') are allowed
+  characters. `init', `next' and `fin' specify the normalizer
+  functions. `init' is called for initializing a tokenizer for a
+  document or query. `next' is called for extracting tokens one by
+  one. `fin' is called for finalizing a tokenizer.
+  grn_normalizer_register() returns GRN_SUCCESS on success,
+  an error code on failure. See "groonga.h" for more details of
+  grn_proc_func and grn_user_data, which is used as an argument of
+  grn_proc_func.
+ */
+GRN_PLUGIN_EXPORT grn_rc grn_normalizer_register(grn_ctx *ctx,
+                                                 const char *name_ptr,
+                                                 int name_length,
+                                                 grn_proc_func *init,
+                                                 grn_proc_func *next,
+                                                 grn_proc_func *fin);
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif  /* __cplusplus */
+
+#endif  /* GROONGA_NORMALIER_H */

  Modified: lib/ctx.c (+6 -0)
===================================================================
--- lib/ctx.c    2012-12-14 12:20:38 +0900 (a358592)
+++ lib/ctx.c    2012-12-14 12:40:02 +0900 (f80d23f)
@@ -24,6 +24,7 @@
 #include "plugin_in.h"
 #include "snip.h"
 #include "output.h"
+#include "normalizer_in.h"
 #include <stdio.h>
 #include <stdarg.h>
 #include <time.h>
@@ -944,6 +945,10 @@ grn_init(void)
     GRN_LOG(ctx, GRN_LOG_ALERT, "plugins initialize failed (%d)", rc);
     return rc;
   }
+  if ((rc = grn_normalizer_init())) {
+    GRN_LOG(ctx, GRN_LOG_ALERT, "grn_normalizer_init failed (%d)", rc);
+    return rc;
+  }
   if ((rc = grn_token_init())) {
     GRN_LOG(ctx, GRN_LOG_ALERT, "grn_token_init failed (%d)", rc);
     return rc;
@@ -1029,6 +1034,7 @@ grn_fin(void)
   grn_query_logger_fin(ctx);
   grn_cache_fin();
   grn_token_fin();
+  grn_normalizer_fin();
   grn_plugins_fin();
   grn_io_fin();
   grn_ctx_fin(ctx);

  Modified: lib/dat.cpp (+15 -3)
===================================================================
--- lib/dat.cpp    2012-12-14 12:20:38 +0900 (e83e60c)
+++ lib/dat.cpp    2012-12-14 12:40:02 +0900 (ca208f0)
@@ -302,6 +302,14 @@ grn_dat_create(grn_ctx *ctx, const char *path, uint32_t,
   dat->header->encoding = encoding;
   dat->header->tokenizer = GRN_ID_NIL;
   dat->header->file_id = 0;
+  if (dat->header->flags & GRN_OBJ_KEY_NORMALIZE) {
+    dat->header->flags &= ~GRN_OBJ_KEY_NORMALIZE;
+    dat->header->normalizer = GRN_DB_NORMALIZER_AUTO;
+    dat->normalizer = grn_ctx_at(ctx, dat->header->normalizer);
+  } else {
+    dat->header->normalizer = GRN_ID_NIL;
+    dat->normalizer = NULL;
+  }
   dat->encoding = encoding;
   dat->tokenizer = NULL;
   return dat;
@@ -337,6 +345,11 @@ grn_dat_open(grn_ctx *ctx, const char *path)
   dat->encoding = dat->header->encoding;
   dat->obj.header.flags = dat->header->flags;
   dat->tokenizer = grn_ctx_at(ctx, dat->header->tokenizer);
+  if (dat->header->flags & GRN_OBJ_KEY_NORMALIZE) {
+    dat->header->flags &= ~GRN_OBJ_KEY_NORMALIZE;
+    dat->header->normalizer = GRN_DB_NORMALIZER_AUTO;
+  }
+  dat->normalizer = grn_ctx_at(ctx, dat->header->normalizer);
   return dat;
 }
 
@@ -672,11 +685,10 @@ grn_dat_scan(grn_ctx *ctx, grn_dat *dat, const char *str,
 
   int num_scan_hits = 0;
   try {
-    if (dat->obj.header.flags & GRN_OBJ_KEY_NORMALIZE) {
-      grn_obj *normalizer = GRN_NORMALIZER_AUTO;
+    if (dat->normalizer) {
       int flags = GRN_STRING_WITH_CHECKS;
       grn_obj * const normalized_string = grn_string_open(ctx, str, str_size,
-                                                          normalizer,
+                                                          dat->normalizer,
                                                           flags);
       if (!normalized_string) {
         fprintf(stderr, "error: grn_string_open() failed!\n");

  Modified: lib/dat.h (+3 -0)
===================================================================
--- lib/dat.h    2012-12-14 12:20:38 +0900 (9ae15f8)
+++ lib/dat.h    2012-12-14 12:40:02 +0900 (00c71df)
@@ -36,6 +36,7 @@ struct _grn_dat {
   void *trie;
   void *old_trie;
   grn_obj *tokenizer;
+  grn_obj *normalizer;
   grn_critical_section lock;
 };
 
@@ -44,6 +45,8 @@ struct grn_dat_header {
   grn_encoding encoding;
   grn_id tokenizer;
   uint32_t file_id;
+  grn_id normalizer;
+  uint32_t reserved[235];
 };
 
 struct _grn_dat_cursor {

  Modified: lib/db.c (+55 -8)
===================================================================
--- lib/db.c    2012-12-14 12:20:38 +0900 (c1d6887)
+++ lib/db.c    2012-12-14 12:40:02 +0900 (fc625ba)
@@ -27,6 +27,7 @@
 #include "geo.h"
 #include "snip.h"
 #include "string_in.h"
+#include "normalizer_in.h"
 #include "util.h"
 #include <string.h>
 #include <float.h>
@@ -34,10 +35,10 @@
 #define NEXT_ADDR(p) (((byte *)(p)) + sizeof *(p))
 
 #define WITH_NORMALIZE(table,key,key_size,block) do {\
-  if ((table)->obj.header.flags & GRN_OBJ_KEY_NORMALIZE) {\
+  if ((table)->normalizer) {\
     grn_obj *nstr;\
     if ((nstr = grn_string_open(ctx, key, key_size,\
-                                GRN_NORMALIZER_AUTO, 0))) {\
+                                (table)->normalizer, 0))) {\
       const char *key;\
       unsigned int key_size;\
       grn_string_get_normalized(ctx, nstr, &key, &key_size, NULL);\
@@ -253,6 +254,7 @@ grn_db_open(grn_ctx *ctx, const char *path)
           }
 #endif
           grn_db_init_builtin_tokenizers(ctx);
+          grn_db_init_builtin_normalizers(ctx);
           grn_db_init_builtin_query(ctx);
           GRN_API_RETURN((grn_obj *)s);
         }
@@ -1812,7 +1814,7 @@ grn_table_truncate(grn_ctx *ctx, grn_obj *table)
       }
       grn_hash_close(ctx, cols);
     }
-    grn_table_get_info(ctx, table, NULL, NULL, &tokenizer);
+    grn_table_get_info(ctx, table, NULL, NULL, &tokenizer, NULL);
     switch (table->header.type) {
     case GRN_TABLE_PAT_KEY :
       for (hooks = DB_OBJ(table)->hooks[GRN_HOOK_INSERT]; hooks; hooks = hooks->next) {
@@ -1854,7 +1856,8 @@ exit :
 
 grn_rc
 grn_table_get_info(grn_ctx *ctx, grn_obj *table, grn_obj_flags *flags,
-                   grn_encoding *encoding, grn_obj **tokenizer)
+                   grn_encoding *encoding, grn_obj **tokenizer,
+                   grn_obj **normalizer)
 {
   grn_rc rc = GRN_INVALID_ARGUMENT;
   GRN_API_ENTER;
@@ -1864,24 +1867,28 @@ grn_table_get_info(grn_ctx *ctx, grn_obj *table, grn_obj_flags *flags,
       if (flags) { *flags = ((grn_pat *)table)->obj.header.flags; }
       if (encoding) { *encoding = ((grn_pat *)table)->encoding; }
       if (tokenizer) { *tokenizer = ((grn_pat *)table)->tokenizer; }
+      if (normalizer) { *normalizer = ((grn_pat *)table)->normalizer; }
       rc = GRN_SUCCESS;
       break;
     case GRN_TABLE_DAT_KEY :
       if (flags) { *flags = ((grn_dat *)table)->obj.header.flags; }
       if (encoding) { *encoding = ((grn_dat *)table)->encoding; }
       if (tokenizer) { *tokenizer = ((grn_dat *)table)->tokenizer; }
+      if (normalizer) { *normalizer = ((grn_dat *)table)->normalizer; }
       rc = GRN_SUCCESS;
       break;
     case GRN_TABLE_HASH_KEY :
       if (flags) { *flags = ((grn_hash *)table)->obj.header.flags; }
       if (encoding) { *encoding = ((grn_hash *)table)->encoding; }
       if (tokenizer) { *tokenizer = ((grn_hash *)table)->tokenizer; }
+      if (normalizer) { *normalizer = ((grn_hash *)table)->normalizer; }
       rc = GRN_SUCCESS;
       break;
     case GRN_TABLE_NO_KEY :
       if (flags) { *flags = 0; }
       if (encoding) { *encoding = GRN_ENC_NONE; }
       if (tokenizer) { *tokenizer = grn_uvector_tokenizer; }
+      if (normalizer) { *normalizer = NULL; }
       rc = GRN_SUCCESS;
       break;
     }
@@ -6095,6 +6102,19 @@ grn_obj_get_info(grn_ctx *ctx, grn_obj *obj, grn_info_type type, grn_obj *valueb
         break;
       }
       break;
+    case GRN_INFO_NORMALIZER :
+      switch (DB_OBJ(obj)->header.type) {
+      case GRN_TABLE_HASH_KEY :
+        valuebuf = ((grn_hash *)obj)->normalizer;
+        break;
+      case GRN_TABLE_PAT_KEY :
+        valuebuf = ((grn_pat *)obj)->normalizer;
+        break;
+      case GRN_TABLE_DAT_KEY :
+        valuebuf = ((grn_dat *)obj)->normalizer;
+        break;
+      }
+      break;
     default :
       /* todo */
       break;
@@ -6117,7 +6137,7 @@ build_index(grn_ctx *ctx, grn_obj *obj)
       grn_obj_flags flags;
       grn_ii *ii = (grn_ii *)obj;
       grn_bool use_grn_ii_build;
-      grn_table_get_info(ctx, ii->lexicon, &flags, NULL, NULL);
+      grn_table_get_info(ctx, ii->lexicon, &flags, NULL, NULL, NULL);
       switch (flags & GRN_OBJ_TABLE_TYPE_MASK) {
       case GRN_OBJ_TABLE_PAT_KEY :
       case GRN_OBJ_TABLE_DAT_KEY :
@@ -6434,6 +6454,28 @@ grn_obj_set_info(grn_ctx *ctx, grn_obj *obj, grn_info_type type, grn_obj *value)
         break;
       }
     }
+    break;
+  case GRN_INFO_NORMALIZER :
+    if (!value || DB_OBJ(value)->header.type == GRN_PROC) {
+      switch (DB_OBJ(obj)->header.type) {
+      case GRN_TABLE_HASH_KEY :
+        ((grn_hash *)obj)->normalizer = value;
+        ((grn_hash *)obj)->header->normalizer = grn_obj_id(ctx, value);
+        rc = GRN_SUCCESS;
+        break;
+      case GRN_TABLE_PAT_KEY :
+        ((grn_pat *)obj)->normalizer = value;
+        ((grn_pat *)obj)->header->normalizer = grn_obj_id(ctx, value);
+        rc = GRN_SUCCESS;
+        break;
+      case GRN_TABLE_DAT_KEY :
+        ((grn_dat *)obj)->normalizer = value;
+        ((grn_dat *)obj)->header->normalizer = grn_obj_id(ctx, value);
+        rc = GRN_SUCCESS;
+        break;
+      }
+    }
+    break;
   default :
     /* todo */
     break;
@@ -8446,6 +8488,11 @@ grn_db_init_builtin_types(grn_ctx *ctx)
   }
 #endif
   grn_db_init_builtin_tokenizers(ctx);
+  for (id = grn_db_curr_id(ctx, db) + 1; id < GRN_DB_NORMALIZER_AUTO; id++) {
+    grn_itoh(id, buf + 3, 2);
+    grn_obj_register(ctx, db, buf, 5);
+  }
+  grn_db_init_builtin_normalizers(ctx);
   for (id = grn_db_curr_id(ctx, db) + 1; id < 128; id++) {
     grn_itoh(id, buf + 3, 2);
     grn_obj_register(ctx, db, buf, 5);
@@ -8479,7 +8526,7 @@ grn_column_index(grn_ctx *ctx, grn_obj *obj, grn_operator op,
         if (obj->header.type != GRN_COLUMN_FIX_SIZE) {
           grn_obj *tokenizer, *lexicon = grn_ctx_at(ctx, target->header.domain);
           if (!lexicon) { continue; }
-          grn_table_get_info(ctx, lexicon, NULL, NULL, &tokenizer);
+          grn_table_get_info(ctx, lexicon, NULL, NULL, &tokenizer, NULL);
           if (tokenizer) { continue; }
         }
         if (n < buf_size) {
@@ -8520,7 +8567,7 @@ grn_column_index(grn_ctx *ctx, grn_obj *obj, grn_operator op,
           if (!lexicon) { continue; }
           if (lexicon->header.type != GRN_TABLE_PAT_KEY) { continue; }
           /* FIXME: GRN_TABLE_DAT_KEY should be supported */
-          grn_table_get_info(ctx, lexicon, NULL, NULL, &tokenizer);
+          grn_table_get_info(ctx, lexicon, NULL, NULL, &tokenizer, NULL);
           if (tokenizer) { continue; }
         }
         if (n < buf_size) {
@@ -8626,7 +8673,7 @@ grn_column_index(grn_ctx *ctx, grn_obj *obj, grn_operator op,
               if (!lexicon) { continue; }
               if (lexicon->header.type != GRN_TABLE_PAT_KEY) { continue; }
               /* FIXME: GRN_TABLE_DAT_KEY should be supported */
-              grn_table_get_info(ctx, lexicon, NULL, NULL, &tokenizer);
+              grn_table_get_info(ctx, lexicon, NULL, NULL, &tokenizer, NULL);
               if (tokenizer) { continue; }
             }
             if (n < buf_size) {

  Modified: lib/db.h (+2 -1)
===================================================================
--- lib/db.h    2012-12-14 12:20:38 +0900 (961c968)
+++ lib/db.h    2012-12-14 12:40:02 +0900 (0db78e5)
@@ -92,7 +92,8 @@ grn_id grn_table_get_v(grn_ctx *ctx, grn_obj *table, const void *key, int key_si
 grn_id grn_table_add_v(grn_ctx *ctx, grn_obj *table, const void *key, int key_size,
                        void **value, int *added);
 GRN_API grn_rc grn_table_get_info(grn_ctx *ctx, grn_obj *table, grn_obj_flags *flags,
-                          grn_encoding *encoding, grn_obj **tokenizer);
+                                  grn_encoding *encoding, grn_obj **tokenizer,
+                                  grn_obj **normalizer);
 const char *_grn_table_key(grn_ctx *ctx, grn_obj *table, grn_id id, uint32_t *key_size);
 
 grn_rc grn_table_search(grn_ctx *ctx, grn_obj *table,

  Modified: lib/expr.c (+9 -3)
===================================================================
--- lib/expr.c    2012-12-14 12:20:38 +0900 (8915385)
+++ lib/expr.c    2012-12-14 12:40:02 +0900 (496c2b8)
@@ -2298,14 +2298,16 @@ grn_proc_call(grn_ctx *ctx, grn_obj *proc, int nargs, grn_obj *caller)
 void
 pseudo_query_scan(grn_ctx *ctx, grn_obj *x, grn_obj *y, grn_obj *res)
 {
+  grn_obj *normalizer;
   grn_obj *a = NULL, *b = NULL;
 
+  normalizer = grn_ctx_at(ctx, GRN_DB_NORMALIZER_AUTO);
   switch (x->header.domain) {
   case GRN_DB_SHORT_TEXT:
   case GRN_DB_TEXT:
   case GRN_DB_LONG_TEXT:
     a = grn_string_open(ctx, GRN_TEXT_VALUE(x), GRN_TEXT_LEN(x),
-                        GRN_NORMALIZER_AUTO, 0);
+                        normalizer, 0);
     break;
   default:
     break;
@@ -2316,7 +2318,7 @@ pseudo_query_scan(grn_ctx *ctx, grn_obj *x, grn_obj *y, grn_obj *res)
   case GRN_DB_TEXT:
   case GRN_DB_LONG_TEXT:
     b = grn_string_open(ctx, GRN_TEXT_VALUE(y), GRN_TEXT_LEN(y),
-                        GRN_NORMALIZER_AUTO, 0);
+                        normalizer, 0);
     break;
   default:
     break;
@@ -2336,6 +2338,8 @@ pseudo_query_scan(grn_ctx *ctx, grn_obj *x, grn_obj *y, grn_obj *res)
 
   if (a) { grn_obj_close(ctx, a); }
   if (b) { grn_obj_close(ctx, b); }
+
+  if (normalizer) { grn_obj_unlink(ctx, normalizer); }
 }
 
 grn_obj *
@@ -2926,7 +2930,9 @@ grn_expr_exec(grn_ctx *ctx, grn_obj *expr, int nargs)
         {
           grn_obj *x, *y;
           POP2ALLOC1(x, y, res);
-          pseudo_query_scan(ctx, x, y, res);
+          WITH_SPSAVE({
+            pseudo_query_scan(ctx, x, y, res);
+          });
         }
         code++;
         break;

  Modified: lib/hash.c (+18 -2)
===================================================================
--- lib/hash.c    2012-12-14 12:20:38 +0900 (e511517)
+++ lib/hash.c    2012-12-14 12:40:02 +0900 (3267179)
@@ -1482,9 +1482,17 @@ grn_io_hash_init(grn_ctx *ctx, grn_hash *hash, const char *path,
   header->n_entries = 0;
   header->n_garbages = 0;
   header->tokenizer = GRN_ID_NIL;
+  if (header->flags & GRN_OBJ_KEY_NORMALIZE) {
+    header->flags &= ~GRN_OBJ_KEY_NORMALIZE;
+    header->normalizer = GRN_DB_NORMALIZER_AUTO;
+    hash->normalizer = grn_ctx_at(ctx, header->normalizer);
+  } else {
+    header->normalizer = GRN_ID_NIL;
+    hash->normalizer = NULL;
+  }
   grn_table_queue_init(ctx, &header->queue);
 
-  hash->obj.header.flags = flags;
+  hash->obj.header.flags = header->flags;
   hash->ctx = ctx;
   hash->key_size = key_size;
   hash->encoding = encoding;
@@ -1555,6 +1563,7 @@ grn_tiny_hash_init(grn_ctx *ctx, grn_hash *hash, const char *path,
   hash->n_entries_ = 0;
   hash->garbages = GRN_ID_NIL;
   hash->tokenizer = NULL;
+  hash->normalizer = NULL;
   grn_tiny_array_init(ctx, &hash->a, entry_size, GRN_TINY_ARRAY_CLEAR);
   grn_tiny_bitmap_init(ctx, &hash->bitmap);
   return GRN_SUCCESS;
@@ -1621,6 +1630,11 @@ grn_hash_open(grn_ctx *ctx, const char *path)
             hash->header = header;
             hash->lock = &header->lock;
             hash->tokenizer = grn_ctx_at(ctx, header->tokenizer);
+            if (header->flags & GRN_OBJ_KEY_NORMALIZE) {
+              header->flags &= ~GRN_OBJ_KEY_NORMALIZE;
+              header->normalizer = GRN_DB_NORMALIZER_AUTO;
+            }
+            hash->normalizer = grn_ctx_at(ctx, header->normalizer);
             return hash;
           } else {
             GRN_LOG(ctx, GRN_LOG_NOTICE,
@@ -2922,7 +2936,7 @@ grn_hash_check(grn_ctx *ctx, grn_hash *hash)
   char buf[8];
   struct grn_hash_header *h = hash->header;
   GRN_OUTPUT_ARRAY_OPEN("RESULT", 1);
-  GRN_OUTPUT_MAP_OPEN("SUMMARY", 24);
+  GRN_OUTPUT_MAP_OPEN("SUMMARY", 25);
   GRN_OUTPUT_CSTR("flags");
   grn_itoh(h->flags, buf, 8);
   GRN_OUTPUT_STR(buf, 8);
@@ -2932,6 +2946,8 @@ grn_hash_check(grn_ctx *ctx, grn_hash *hash)
   GRN_OUTPUT_INT64(hash->value_size);
   GRN_OUTPUT_CSTR("tokenizer");
   GRN_OUTPUT_INT64(h->tokenizer);
+  GRN_OUTPUT_CSTR("normalizer");
+  GRN_OUTPUT_INT64(h->normalizer);
   GRN_OUTPUT_CSTR("curr_rec");
   GRN_OUTPUT_INT64(h->curr_rec);
   GRN_OUTPUT_CSTR("curr_key");

  Modified: lib/hash.h (+3 -1)
===================================================================
--- lib/hash.h    2012-12-14 12:20:38 +0900 (626ad01)
+++ lib/hash.h    2012-12-14 12:40:02 +0900 (6792e5b)
@@ -199,6 +199,7 @@ struct _grn_hash {
   uint32_t *n_entries;
   uint32_t *max_offset;
   grn_obj *tokenizer;
+  grn_obj *normalizer;
 
   /* For grn_io_hash. */
   grn_io *io;
@@ -242,7 +243,8 @@ struct grn_hash_header {
   uint32_t n_entries;
   uint32_t n_garbages;
   uint32_t lock;
-  uint32_t reserved[16];
+  grn_id normalizer;
+  uint32_t reserved[15];
   grn_id garbages[GRN_HASH_MAX_KEY_SIZE];
   grn_table_queue queue;
 };

  Modified: lib/ii.c (+13 -5)
===================================================================
--- lib/ii.c    2012-12-14 12:20:38 +0900 (eb80a0c)
+++ lib/ii.c    2012-12-14 12:40:02 +0900 (d7dda44)
@@ -3458,7 +3458,9 @@ _grn_ii_create(grn_ctx *ctx, grn_ii *ii, const char *path, grn_obj *lexicon, uin
     free_histogram[i] = 0;
   }
   */
-  if (grn_table_get_info(ctx, lexicon, &lflags, &encoding, &tokenizer)) { return NULL; }
+  if (grn_table_get_info(ctx, lexicon, &lflags, &encoding, &tokenizer, NULL)) {
+    return NULL;
+  }
   if (path && strlen(path) + 6 >= PATH_MAX) { return NULL; }
   seg = grn_io_create(ctx, path, sizeof(struct grn_ii_header),
                       S_SEGMENT, GRN_II_MAX_LSEG, grn_io_auto, GRN_IO_EXPIRE_SEGMENT);
@@ -3578,7 +3580,9 @@ grn_ii_open(grn_ctx *ctx, const char *path, grn_obj *lexicon)
   grn_obj_flags lflags;
   grn_encoding encoding;
   grn_obj *tokenizer;
-  if (grn_table_get_info(ctx, lexicon, &lflags, &encoding, &tokenizer)) { return NULL; }
+  if (grn_table_get_info(ctx, lexicon, &lflags, &encoding, &tokenizer, NULL)) {
+    return NULL;
+  }
   if (strlen(path) + 6 >= PATH_MAX) { return NULL; }
   strcpy(path2, path);
   strcat(path2, ".c");
@@ -6745,14 +6749,18 @@ get_tmp_lexicon(grn_ctx *ctx, grn_ii_buffer *ii_buffer)
     grn_obj *domain = grn_ctx_at(ctx, ii_buffer->lexicon->header.domain);
     grn_obj *range = grn_ctx_at(ctx, DB_OBJ(ii_buffer->lexicon)->range);
     grn_obj *tokenizer;
+    grn_obj *normalizer;
     grn_obj_flags flags;
-    grn_table_get_info(ctx, ii_buffer->lexicon, &flags, NULL, &tokenizer);
+    grn_table_get_info(ctx, ii_buffer->lexicon, &flags, NULL,
+                       &tokenizer, &normalizer);
     flags &= ~GRN_OBJ_PERSISTENT;
     tmp_lexicon = grn_table_create(ctx, NULL, 0, NULL, flags, domain, range);
     if (tmp_lexicon) {
       ii_buffer->tmp_lexicon = tmp_lexicon;
       grn_obj_set_info(ctx, tmp_lexicon,
                        GRN_INFO_DEFAULT_TOKENIZER, tokenizer);
+      grn_obj_set_info(ctx, tmp_lexicon,
+                       GRN_INFO_NORMALIZER, normalizer);
       if ((flags & GRN_OBJ_TABLE_TYPE_MASK) == GRN_OBJ_TABLE_PAT_KEY) {
         grn_pat_cache_enable(ctx, (grn_pat *)tmp_lexicon, PAT_CACHE_SIZE);
       }
@@ -7193,7 +7201,7 @@ grn_ii_buffer_open(grn_ctx *ctx, grn_ii *ii,
                                           S_IRUSR|S_IWUSR);
           if (ii_buffer->tmpfd != -1) {
             grn_obj_flags flags;
-            grn_table_get_info(ctx, ii->lexicon, &flags, NULL, NULL);
+            grn_table_get_info(ctx, ii->lexicon, &flags, NULL, NULL, NULL);
             if ((flags & GRN_OBJ_TABLE_TYPE_MASK) == GRN_OBJ_TABLE_PAT_KEY) {
               grn_pat_cache_enable(ctx, (grn_pat *)ii->lexicon,
                                    PAT_CACHE_SIZE);
@@ -7316,7 +7324,7 @@ grn_ii_buffer_close(grn_ctx *ctx, grn_ii_buffer *ii_buffer)
 {
   uint32_t i;
   grn_obj_flags flags;
-  grn_table_get_info(ctx, ii_buffer->ii->lexicon, &flags, NULL, NULL);
+  grn_table_get_info(ctx, ii_buffer->ii->lexicon, &flags, NULL, NULL, NULL);
   if ((flags & GRN_OBJ_TABLE_TYPE_MASK) == GRN_OBJ_TABLE_PAT_KEY) {
     grn_pat_cache_disable(ctx, (grn_pat *)ii_buffer->ii->lexicon);
   }

  Copied: lib/normalizer.c (+104 -375) 75%
===================================================================
--- lib/string.c    2012-12-14 12:20:38 +0900 (5253a35)
+++ lib/normalizer.c    2012-12-14 12:40:02 +0900 (8eff99e)
@@ -1,6 +1,6 @@
 /* -*- c-basic-offset: 2 -*- */
 /*
-  Copyright(C) 2009-2012 Brazil
+  Copyright(C) 2012 Brazil
 
   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -16,13 +16,59 @@
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */
 
-#include "groonga_in.h"
 #include <string.h>
-#include "string_in.h"
-#include "str.h"
 
+#include "normalizer_in.h"
+#include "string_in.h"
+#include <groonga/normalizer.h>
 #include <groonga/tokenizer.h>
 
+/* Register a normalizer proc; a negative name_length means name_ptr is
+   NUL-terminated. Returns GRN_SUCCESS, or ctx->rc if creation fails. */
+grn_rc
+grn_normalizer_register(grn_ctx *ctx,
+                        const char *name_ptr, int name_length,
+                        grn_proc_func *init, grn_proc_func *next,
+                        grn_proc_func *fin)
+{
+  grn_expr_var vars[] = {
+    { NULL, 0 }
+  };
+  GRN_PTR_INIT(&vars[0].value, 0, GRN_ID_NIL);
+
+  if (name_length < 0) {
+    name_length = strlen(name_ptr);
+  }
+  /* Pass the element count of vars: whole array size / element size. */
+  {
+    grn_obj * const normalizer = grn_proc_create(ctx,
+                                                 name_ptr, name_length,
+                                                 GRN_PROC_NORMALIZER,
+                                                 init, next, fin,
+                                                 sizeof(vars) / sizeof(*vars),
+                                                 vars);
+    if (!normalizer) {
+      GRN_PLUGIN_ERROR(ctx, GRN_NORMALIZER_ERROR,
+                       "[normalizer] failed to register normalizer: <%.*s>",
+                       name_length, name_ptr);
+      return ctx->rc;
+    }
+  }
+  return GRN_SUCCESS;
+}
+
+grn_rc
+grn_normalizer_init(void)
+{
+  return GRN_SUCCESS; /* no-op: performs no work and always succeeds */
+}
+
+grn_rc
+grn_normalizer_fin(void)
+{
+  return GRN_SUCCESS; /* no-op: performs no work and always succeeds */
+}
+
 static unsigned char symbol[] = {
   ',', '.', 0, ':', ';', '?', '!', 0, 0, 0, '`', 0, '^', '~', '_', 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, '-', '-', '/', '\\', 0, 0, '|', 0, 0, 0, '\'', 0,
@@ -32,8 +78,7 @@ static unsigned char symbol[] = {
 };
 
 inline static grn_obj *
-eucjp_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
-                grn_user_data *user_data)
+eucjp_normalize(grn_ctx *ctx, grn_string *nstr)
 {
   static uint16_t hankana[] = {
     0xa1a1, 0xa1a3, 0xa1d6, 0xa1d7, 0xa1a2, 0xa1a6, 0xa5f2, 0xa5a1, 0xa5a3,
@@ -54,7 +99,6 @@ eucjp_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
   static unsigned char handaku[] = {
     0xd1, 0, 0, 0xd4, 0, 0, 0xd7, 0, 0, 0xda, 0, 0, 0xdd
   };
-  grn_string *nstr = (grn_string *)args[0];
   int16_t *ch;
   const unsigned char *s, *s_, *e;
   unsigned char *d, *d0, *d_, b;
@@ -279,8 +323,7 @@ eucjp_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
 }
 
 inline static grn_obj *
-sjis_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
-               grn_user_data *user_data)
+sjis_normalize(grn_ctx *ctx, grn_string *nstr)
 {
   static uint16_t hankana[] = {
     0x8140, 0x8142, 0x8175, 0x8176, 0x8141, 0x8145, 0x8392, 0x8340, 0x8342,
@@ -301,7 +344,6 @@ sjis_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
   static unsigned char handaku[] = {
     0x70, 0, 0, 0x73, 0, 0, 0x76, 0, 0, 0x79, 0, 0, 0x7c
   };
-  grn_string *nstr = (grn_string *)args[0];
   int16_t *ch;
   const unsigned char *s, *s_;
   unsigned char *d, *d0, *d_, b, *e;
@@ -572,13 +614,12 @@ grn_str_charlen_utf8(grn_ctx *ctx, const unsigned char *str, const unsigned char
 }
 
 inline static grn_obj *
-utf8_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
+utf8_normalize(grn_ctx *ctx, grn_string *nstr)
 {
   int16_t *ch;
   const unsigned char *s, *s_, *s__ = NULL, *p, *p2, *pe, *e;
   unsigned char *d, *d_, *de;
   uint_least8_t *cp;
-  grn_string *nstr = (grn_string *)args[0];
   size_t length = 0, ls, lp, size = nstr->original_length_in_bytes, ds = size * 3;
   int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
   grn_bool remove_tokenized_delimiter_p =
@@ -715,9 +756,8 @@ utf8_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data
 #endif /* WITH_NFKC */
 
 inline static grn_obj *
-ascii_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
+ascii_normalize(grn_ctx *ctx, grn_string *nstr)
 {
-  grn_string *nstr = (grn_string *)args[0];
   int16_t *ch;
   const unsigned char *s, *s_, *e;
   unsigned char *d, *d0, *d_;
@@ -818,10 +858,8 @@ ascii_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_dat
 
 /* use cp1252 as latin1 */
 inline static grn_obj *
-latin1_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
-                 grn_user_data *user_data)
+latin1_normalize(grn_ctx *ctx, grn_string *nstr)
 {
-  grn_string *nstr = (grn_string *)args[0];
   int16_t *ch;
   const unsigned char *s, *s_, *e;
   unsigned char *d, *d0, *d_;
@@ -955,10 +993,8 @@ latin1_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
 }
 
 inline static grn_obj *
-koi8r_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
-                grn_user_data *user_data)
+koi8r_normalize(grn_ctx *ctx, grn_string *nstr)
 {
-  grn_string *nstr = (grn_string *)args[0];
   int16_t *ch;
   const unsigned char *s, *s_, *e;
   unsigned char *d, *d0, *d_;
@@ -1080,396 +1116,89 @@ koi8r_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
   return NULL;
 }
 
-static grn_string *
-grn_fake_string_open(grn_ctx *ctx, grn_string *string)
-{
-  /* TODO: support GRN_STRING_REMOVE_BLANK flag and ctypes */
-  grn_string *nstr = string;
-  const char *str;
-  unsigned int str_len;
-
-  str = nstr->original;
-  str_len = nstr->original_length_in_bytes;
-
-  if (!(nstr->normalized = GRN_MALLOC(str_len + 1))) {
-    ERR(GRN_NO_MEMORY_AVAILABLE,
-        "[strinig][fake] failed to allocate normalized text space");
-    grn_string_close(ctx, (grn_obj *)nstr);
-    return NULL;
-  }
-
-  if (nstr->flags & GRN_STRING_REMOVE_TOKENIZED_DELIMITER &&
-      ctx->encoding == GRN_ENC_UTF8) {
-    int char_length;
-    const char *source_current = str;
-    const char *source_end = str + str_len;
-    char *destination = nstr->normalized;
-    unsigned int destination_length = 0;
-    while ((char_length = grn_charlen(ctx, source_current, source_end)) > 0) {
-      if (!grn_tokenizer_is_tokenized_delimiter(ctx,
-                                                source_current, char_length,
-                                                ctx->encoding)) {
-        memcpy(destination, source_current, char_length);
-        destination += char_length;
-        destination_length += char_length;
-      }
-      source_current += char_length;
-    }
-    nstr->normalized[destination_length] = '\0';
-    nstr->normalized_length_in_bytes = destination_length;
-  } else {
-    memcpy(nstr->normalized, str, str_len);
-    nstr->normalized[str_len] = '\0';
-    nstr->normalized_length_in_bytes = str_len;
-  }
-
-  if (nstr->flags & GRN_STRING_WITH_CHECKS) {
-    int16_t f = 0;
-    unsigned char c;
-    size_t i;
-    if (!(nstr->checks = (int16_t *) GRN_MALLOC(sizeof(int16_t) * str_len))) {
-      grn_string_close(ctx, (grn_obj *)nstr);
-      ERR(GRN_NO_MEMORY_AVAILABLE,
-          "[strinig][fake] failed to allocate checks space");
-      return NULL;
-    }
-    switch (nstr->encoding) {
-    case GRN_ENC_EUC_JP:
-      for (i = 0; i < str_len; i++) {
-        if (!f) {
-          c = (unsigned char) str[i];
-          f = ((c >= 0xa1U && c <= 0xfeU) || c == 0x8eU ? 2 : (c == 0x8fU ? 3 : 1)
-            );
-          nstr->checks[i] = f;
-        } else {
-          nstr->checks[i] = 0;
-        }
-        f--;
-      }
-      break;
-    case GRN_ENC_SJIS:
-      for (i = 0; i < str_len; i++) {
-        if (!f) {
-          c = (unsigned char) str[i];
-          f = (c >= 0x81U && ((c <= 0x9fU) || (c >= 0xe0U && c <= 0xfcU)) ? 2 : 1);
-          nstr->checks[i] = f;
-        } else {
-          nstr->checks[i] = 0;
-        }
-        f--;
-      }
-      break;
-    case GRN_ENC_UTF8:
-      for (i = 0; i < str_len; i++) {
-        if (!f) {
-          c = (unsigned char) str[i];
-          f = (c & 0x80U ? (c & 0x20U ? (c & 0x10U ? 4 : 3)
-                           : 2)
-               : 1);
-          nstr->checks[i] = f;
-        } else {
-          nstr->checks[i] = 0;
-        }
-        f--;
-      }
-      break;
-    default:
-      for (i = 0; i < str_len; i++) {
-        nstr->checks[i] = 1;
-      }
-      break;
-    }
-  }
-  return nstr;
-}
-
-grn_obj *
-grn_string_open_(grn_ctx *ctx, const char *str, unsigned int str_len,
-                 grn_obj *normalizer, int flags, grn_encoding encoding)
+static grn_obj *
+auto_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 {
-  grn_string *string;
-  grn_obj *obj;
-  grn_obj *args[1];
-
-  if (!str || !str_len) {
-    return NULL;
-  }
-
-  string = GRN_MALLOCN(grn_string, 1);
-  if (!string) {
-    GRN_LOG(ctx, GRN_LOG_ALERT,
-            "[string][open] failed to allocate memory");
-    return NULL;
-  }
-
-  obj = (grn_obj *)string;
-  GRN_OBJ_INIT(obj, GRN_STRING, GRN_OBJ_ALLOCATED, GRN_ID_NIL);
-  string->original = str;
-  string->original_length_in_bytes = str_len;
-  string->normalized = NULL;
-  string->normalized_length_in_bytes = 0;
-  string->n_characters = 0;
-  string->checks = NULL;
-  string->ctypes = NULL;
-  string->encoding = encoding;
-  string->flags = flags;
-
-  if (!normalizer) {
-    return (grn_obj *)grn_fake_string_open(ctx, string);
-  }
-
-  args[0] = obj;
-  switch (encoding) {
+  grn_string *string = (grn_string *)(args[0]);
+  switch (string->encoding) {
   case GRN_ENC_EUC_JP :
-    eucjp_normalize(ctx, 1, args, NULL);
+    eucjp_normalize(ctx, string);
     break;
   case GRN_ENC_UTF8 :
 #ifdef WITH_NFKC
-    utf8_normalize(ctx, 1, args, NULL);
+    utf8_normalize(ctx, string);
 #else /* WITH_NFKC */
-    ascii_normalize(ctx, 1, args, NULL);
+    ascii_normalize(ctx, string);
 #endif /* WITH_NFKC */
     break;
   case GRN_ENC_SJIS :
-    sjis_normalize(ctx, 1, args, NULL);
+    sjis_normalize(ctx, string);
     break;
   case GRN_ENC_LATIN1 :
-    latin1_normalize(ctx, 1, args, NULL);
+    latin1_normalize(ctx, string);
     break;
   case GRN_ENC_KOI8R :
-    koi8r_normalize(ctx, 1, args, NULL);
+    koi8r_normalize(ctx, string);
     break;
   default :
-    ascii_normalize(ctx, 1, args, NULL);
+    ascii_normalize(ctx, string);
     break;
   }
-  if (ctx->rc) {
-    grn_obj_close(ctx, obj);
-    obj = NULL;
-  }
-
-  return obj;
-}
-
-grn_obj *
-grn_string_open(grn_ctx *ctx, const char *str, unsigned int str_len,
-                grn_obj *normalizer, int flags)
-{
-  return grn_string_open_(ctx, str, str_len, normalizer, flags, ctx->encoding);
-}
-
-grn_rc
-grn_string_get_original(grn_ctx *ctx, grn_obj *string,
-                        const char **original,
-                        unsigned int *length_in_bytes)
-{
-  grn_rc rc;
-  grn_string *string_ = (grn_string *)string;
-  GRN_API_ENTER;
-  if (string_) {
-    if (original) { *original = string_->original; }
-    if (length_in_bytes) {
-      *length_in_bytes = string_->original_length_in_bytes;
-    }
-    rc = GRN_SUCCESS;
-  } else {
-    rc = GRN_INVALID_ARGUMENT;
-  }
-  GRN_API_RETURN(rc);
-}
-
-int
-grn_string_get_flags(grn_ctx *ctx, grn_obj *string)
-{
-  int flags = 0;
-  grn_string *string_ = (grn_string *)string;
-  GRN_API_ENTER;
-  if (string_) {
-    flags = string_->flags;
-  }
-  GRN_API_RETURN(flags);
+  return NULL;
 }
 
-grn_rc
-grn_string_get_normalized(grn_ctx *ctx, grn_obj *string,
-                          const char **normalized,
-                          unsigned int *length_in_bytes,
-                          unsigned int *n_characters)
+#ifdef WITH_NFKC
+static grn_obj *
+nfkc51_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 {
-  grn_rc rc;
-  grn_string *string_ = (grn_string *)string;
-  GRN_API_ENTER;
-  if (string_) {
-    if (normalized) { *normalized = string_->normalized; }
-    if (length_in_bytes) {
-      *length_in_bytes = string_->normalized_length_in_bytes;
-    }
-    if (n_characters) { *n_characters = string_->n_characters; }
-    rc = GRN_SUCCESS;
-  } else {
-    rc = GRN_INVALID_ARGUMENT;
-  }
-  GRN_API_RETURN(rc);
+  grn_string *string = (grn_string *)(args[0]);
+  utf8_normalize(ctx, string);
+  return NULL;
 }
+#endif /* WITH_NFKC */
 
 grn_rc
-grn_string_set_normalized(grn_ctx *ctx, grn_obj *string,
-                          char *normalized, unsigned int length_in_bytes,
-                          unsigned int n_characters)
+grn_normalizer_normalize(grn_ctx *ctx, grn_obj *normalizer, grn_obj *string)
 {
   grn_rc rc;
-  grn_string *string_ = (grn_string *)string;
-  GRN_API_ENTER;
-  if (string_) {
-    if (string_->normalized) { GRN_FREE(string_->normalized); }
-    string_->normalized = normalized;
-    string_->normalized_length_in_bytes = length_in_bytes;
-    string_->n_characters = n_characters;
-    rc = GRN_SUCCESS;
-  } else {
-    rc = GRN_INVALID_ARGUMENT;
-  }
-  GRN_API_RETURN(rc);
-}
-
-const short *
-grn_string_get_checks(grn_ctx *ctx, grn_obj *string)
-{
-  int16_t *checks = NULL;
-  grn_string *string_ = (grn_string *)string;
-  GRN_API_ENTER;
-  if (string_) {
-    checks = string_->checks;
-  } else {
-    checks = NULL;
-  }
-  GRN_API_RETURN(checks);
-}
+  int nargs = 0;
 
-grn_rc
-grn_string_set_checks(grn_ctx *ctx, grn_obj *string, short *checks)
-{
-  grn_rc rc;
-  grn_string *string_ = (grn_string *)string;
-  GRN_API_ENTER;
-  if (string_) {
-    if (string_->checks) { GRN_FREE(string_->checks); }
-    string_->checks = checks;
-    rc = GRN_SUCCESS;
-  } else {
-    rc = GRN_INVALID_ARGUMENT;
-  }
-  GRN_API_RETURN(rc);
-}
+  grn_ctx_push(ctx, string);
+  nargs++;
+  rc = grn_proc_call(ctx, normalizer, nargs, NULL);
+  grn_ctx_pop(ctx);
 
-const unsigned char *
-grn_string_get_types(grn_ctx *ctx, grn_obj *string)
-{
-  unsigned char *types = NULL;
-  grn_string *string_ = (grn_string *)string;
-  GRN_API_ENTER;
-  if (string_) {
-    types = string_->ctypes;
-  } else {
-    types = NULL;
-  }
-  GRN_API_RETURN(types);
+  return rc;
 }
 
 grn_rc
-grn_string_set_types(grn_ctx *ctx, grn_obj *string, unsigned char *types)
+grn_db_init_builtin_normalizers(grn_ctx *ctx)
 {
   grn_rc rc;
-  grn_string *string_ = (grn_string *)string;
-  GRN_API_ENTER;
-  if (string_) {
-    if (string_->ctypes) { GRN_FREE(string_->ctypes); }
-    string_->ctypes = types;
-    rc = GRN_SUCCESS;
-  } else {
-    rc = GRN_INVALID_ARGUMENT;
-  }
-  GRN_API_RETURN(rc);
-}
-
-grn_encoding
-grn_string_get_encoding(grn_ctx *ctx, grn_obj *string)
-{
-  grn_encoding encoding = GRN_ENC_NONE;
-  grn_string *string_ = (grn_string *)string;
-  GRN_API_ENTER;
-  if (string_) {
-    encoding = string_->encoding;
-  }
-  GRN_API_RETURN(encoding);
-}
-
-grn_rc
-grn_string_inspect(grn_ctx *ctx, grn_obj *buffer, grn_obj *string)
-{
-  grn_string *string_ = (grn_string *)string;
-
-  GRN_TEXT_PUTS(ctx, buffer, "#<string:");
-
-  GRN_TEXT_PUTS(ctx, buffer, " original:<");
-  GRN_TEXT_PUT(ctx, buffer,
-               string_->original,
-               string_->original_length_in_bytes);
-  GRN_TEXT_PUTS(ctx, buffer, ">");
-  GRN_TEXT_PUTS(ctx, buffer, "(");
-  grn_text_itoa(ctx, buffer, string_->original_length_in_bytes);
-  GRN_TEXT_PUTS(ctx, buffer, ")");
-
-  GRN_TEXT_PUTS(ctx, buffer, " normalized:<");
-  GRN_TEXT_PUT(ctx, buffer,
-               string_->normalized,
-               string_->normalized_length_in_bytes);
-  GRN_TEXT_PUTS(ctx, buffer, ">");
-  GRN_TEXT_PUTS(ctx, buffer, "(");
-  grn_text_itoa(ctx, buffer, string_->normalized_length_in_bytes);
-  GRN_TEXT_PUTS(ctx, buffer, ")");
-
-  GRN_TEXT_PUTS(ctx, buffer, " n_characters:");
-  grn_text_itoa(ctx, buffer, string_->n_characters);
-
-  GRN_TEXT_PUTS(ctx, buffer, " encoding:");
-  grn_inspect_encoding(ctx, buffer, string_->encoding);
+  const char *normalizer_auto_name = "NormalizerAuto";
+  const char *normalizer_nfkc51_name = "NormalizerNFKC51";
 
-  GRN_TEXT_PUTS(ctx, buffer, " flags:");
-  if (string_->flags & GRN_STRING_REMOVE_BLANK) {
-  GRN_TEXT_PUTS(ctx, buffer, "REMOVE_BLANK|");
-  }
-  if (string_->flags & GRN_STRING_WITH_TYPES) {
-    GRN_TEXT_PUTS(ctx, buffer, "WITH_TYPES|");
-  }
-  if (string_->flags & GRN_STRING_WITH_CHECKS) {
-    GRN_TEXT_PUTS(ctx, buffer, "WITH_CHECKS|");
-  }
-  if (string_->flags & GRN_STRING_REMOVE_TOKENIZED_DELIMITER) {
-    GRN_TEXT_PUTS(ctx, buffer, "REMOVE_TOKENIZED_DELIMITER|");
-  }
-  if (GRN_TEXT_VALUE(buffer)[GRN_TEXT_LEN(buffer) - 1] == '|') {
-    grn_bulk_truncate(ctx, buffer, GRN_TEXT_LEN(buffer) - 1);
+  rc = grn_normalizer_register(ctx, normalizer_auto_name, -1,
+                                NULL, auto_next, NULL);
+  if (rc == GRN_SUCCESS) {
+    grn_obj *obj;
+    obj = grn_ctx_get(ctx, normalizer_auto_name, -1);
+    if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_NORMALIZER_AUTO) {
+      return GRN_FILE_CORRUPT;
+    }
   }
 
-  GRN_TEXT_PUTS(ctx, buffer, ">");
+#ifdef WITH_NFKC
+  grn_normalizer_register(ctx, normalizer_nfkc51_name, -1,
+                          NULL, nfkc51_next, NULL);
+#else /* WITH_NFKC */
+  grn_normalizer_register(ctx, normalizer_nfkc51_name, -1,
+                          NULL, NULL, NULL);
+#endif /* WITH_NFKC */
+/*
+  grn_normalizer_register(ctx, "NormalizerUCA", -1,
+                          NULL, uca_next, NULL);
+*/
 
   return GRN_SUCCESS;
 }
-
-grn_rc
-grn_string_close(grn_ctx *ctx, grn_obj *string)
-{
-  grn_rc rc;
-  grn_string *string_ = (grn_string *)string;
-  if (string_) {
-    if (string_->normalized) { GRN_FREE(string_->normalized); }
-    if (string_->ctypes) { GRN_FREE(string_->ctypes); }
-    if (string_->checks) { GRN_FREE(string_->checks); }
-    GRN_FREE(string);
-    rc = GRN_SUCCESS;
-  } else {
-    rc = GRN_INVALID_ARGUMENT;
-  }
-  return rc;
-}

  Added: lib/normalizer_in.h (+50 -0) 100644
===================================================================
--- /dev/null
+++ lib/normalizer_in.h    2012-12-14 12:40:02 +0900 (e3411f3)
@@ -0,0 +1,50 @@
+/* -*- c-basic-offset: 2 -*- */
+/*
+  Copyright(C) 2012 Brazil
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License version 2.1 as published by the Free Software Foundation.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+#ifndef GRN_NORMALIZER_IN_H
+#define GRN_NORMALIZER_IN_H
+
+#ifndef GROONGA_IN_H
+#include "groonga_in.h"
+#endif /* GROONGA_IN_H */
+
+#ifndef GRN_CTX_H
+#include "ctx.h"
+#endif /* GRN_CTX_H */
+
+#ifndef GRN_DB_H
+#include "db.h"
+#endif /* GRN_DB_H */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+grn_rc grn_normalizer_init(void);
+grn_rc grn_normalizer_fin(void);
+
+grn_rc grn_normalizer_normalize(grn_ctx *ctx,
+                                grn_obj *normalizer,
+                                grn_obj *string);
+
+grn_rc grn_db_init_builtin_normalizers(grn_ctx *ctx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* GRN_NORMALIZER_IN_H */

  Modified: lib/pat.c (+19 -4)
===================================================================
--- lib/pat.c    2012-12-14 12:20:38 +0900 (e3f4710)
+++ lib/pat.c    2012-12-14 12:40:02 +0900 (8ab4e22)
@@ -423,13 +423,21 @@ _grn_pat_create(grn_ctx *ctx, grn_pat *pat,
   header->curr_del3 = 0;
   header->n_garbages = 0;
   header->tokenizer = GRN_ID_NIL;
+  if (header->flags & GRN_OBJ_KEY_NORMALIZE) {
+    header->flags &= ~GRN_OBJ_KEY_NORMALIZE;
+    header->normalizer = GRN_DB_NORMALIZER_AUTO;
+    pat->normalizer = grn_ctx_at(ctx, header->normalizer);
+  } else {
+    header->normalizer = GRN_ID_NIL;
+    pat->normalizer = NULL;
+  }
   pat->io = io;
   pat->header = header;
   pat->key_size = key_size;
   pat->value_size = value_size;
   pat->tokenizer = NULL;
   pat->encoding = encoding;
-  pat->obj.header.flags = flags;
+  pat->obj.header.flags = header->flags;
   if (!(node0 = pat_get(ctx, pat, 0))) {
     grn_io_close(ctx, io);
     return NULL;
@@ -518,6 +526,11 @@ grn_pat_open(grn_ctx *ctx, const char *path)
   pat->encoding = header->encoding;
   pat->obj.header.flags = header->flags;
   pat->tokenizer = grn_ctx_at(ctx, header->tokenizer);
+  if (header->flags & GRN_OBJ_KEY_NORMALIZE) {
+    header->flags &= ~GRN_OBJ_KEY_NORMALIZE;
+    header->normalizer = GRN_DB_NORMALIZER_AUTO;
+  }
+  pat->normalizer = grn_ctx_at(ctx, header->normalizer);
   PAT_AT(pat, 0, node0);
   if (!node0) {
     grn_io_close(ctx, io);
@@ -1528,9 +1541,9 @@ grn_pat_scan(grn_ctx *ctx, grn_pat *pat, const char *str, unsigned int str_len,
 {
   int n = 0;
   grn_id tid;
-  if (pat->obj.header.flags & GRN_OBJ_KEY_NORMALIZE) {
+  if (pat->normalizer) {
     grn_obj *nstr = grn_string_open(ctx, str, str_len,
-                                    GRN_NORMALIZER_AUTO, GRN_STRING_WITH_CHECKS);
+                                    pat->normalizer, GRN_STRING_WITH_CHECKS);
     if (nstr) {
       const short *cp = grn_string_get_checks(ctx, nstr);
       unsigned int offset = 0, offset0 = 0;
@@ -2281,7 +2294,7 @@ grn_pat_check(grn_ctx *ctx, grn_pat *pat)
   char buf[8];
   struct grn_pat_header *h = pat->header;
   GRN_OUTPUT_ARRAY_OPEN("RESULT", 1);
-  GRN_OUTPUT_MAP_OPEN("SUMMARY", 22);
+  GRN_OUTPUT_MAP_OPEN("SUMMARY", 23);
   GRN_OUTPUT_CSTR("flags");
   grn_itoh(h->flags, buf, 8);
   GRN_OUTPUT_STR(buf, 8);
@@ -2291,6 +2304,8 @@ grn_pat_check(grn_ctx *ctx, grn_pat *pat)
   GRN_OUTPUT_INT64(h->value_size);
   GRN_OUTPUT_CSTR("tokenizer");
   GRN_OUTPUT_INT64(h->tokenizer);
+  GRN_OUTPUT_CSTR("normalizer");
+  GRN_OUTPUT_INT64(h->normalizer);
   GRN_OUTPUT_CSTR("n_entries");
   GRN_OUTPUT_INT64(h->n_entries);
   GRN_OUTPUT_CSTR("curr_rec");

  Modified: lib/pat.h (+3 -1)
===================================================================
--- lib/pat.h    2012-12-14 12:20:38 +0900 (3430f9d)
+++ lib/pat.h    2012-12-14 12:40:02 +0900 (09e1fa8)
@@ -38,6 +38,7 @@ struct _grn_pat {
   uint32_t key_size;
   uint32_t value_size;
   grn_obj *tokenizer;
+  grn_obj *normalizer;
   grn_id *cache;
   uint32_t cache_size;
 };
@@ -64,7 +65,8 @@ struct grn_pat_header {
   int32_t curr_del2;
   int32_t curr_del3;
   uint32_t n_garbages;
-  uint32_t reserved[1005];
+  grn_id normalizer;
+  uint32_t reserved[1004];
   grn_pat_delinfo delinfos[GRN_PAT_NDELINFOS];
   grn_id garbages[GRN_PAT_MAX_KEY_SIZE + 1];
 };

  Modified: lib/sources.am (+2 -0)
===================================================================
--- lib/sources.am    2012-12-14 12:20:38 +0900 (61e888f)
+++ lib/sources.am    2012-12-14 12:40:02 +0900 (f1f704e)
@@ -20,6 +20,8 @@ libgroonga_la_SOURCES =				\
 	io.h					\
 	nfkc.c					\
 	nfkc.h					\
+	normalizer.c				\
+	normalizer.h				\
 	output.c				\
 	output.h				\
 	pat.c					\

  Modified: lib/string.c (+13 -1082)
===================================================================
--- lib/string.c    2012-12-14 12:20:38 +0900 (5253a35)
+++ lib/string.c    2012-12-14 12:40:02 +0900 (2119a70)
@@ -19,1067 +19,12 @@
 #include "groonga_in.h"
 #include <string.h>
 #include "string_in.h"
+#include "normalizer_in.h"
 #include "str.h"
+#include "util.h"
 
 #include <groonga/tokenizer.h>
 
-static unsigned char symbol[] = {
-  ',', '.', 0, ':', ';', '?', '!', 0, 0, 0, '`', 0, '^', '~', '_', 0, 0, 0,
-  0, 0, 0, 0, 0, 0, 0, '-', '-', '/', '\\', 0, 0, '|', 0, 0, 0, '\'', 0,
-  '"', '(', ')', 0, 0, '[', ']', '{', '}', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  '+', '-', 0, 0, 0, '=', 0, '<', '>', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  '$', 0, 0, '%', '#', '&', '*', '@', 0, 0, 0, 0, 0, 0, 0, 0
-};
-
-inline static grn_obj *
-eucjp_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
-                grn_user_data *user_data)
-{
-  static uint16_t hankana[] = {
-    0xa1a1, 0xa1a3, 0xa1d6, 0xa1d7, 0xa1a2, 0xa1a6, 0xa5f2, 0xa5a1, 0xa5a3,
-    0xa5a5, 0xa5a7, 0xa5a9, 0xa5e3, 0xa5e5, 0xa5e7, 0xa5c3, 0xa1bc, 0xa5a2,
-    0xa5a4, 0xa5a6, 0xa5a8, 0xa5aa, 0xa5ab, 0xa5ad, 0xa5af, 0xa5b1, 0xa5b3,
-    0xa5b5, 0xa5b7, 0xa5b9, 0xa5bb, 0xa5bd, 0xa5bf, 0xa5c1, 0xa5c4, 0xa5c6,
-    0xa5c8, 0xa5ca, 0xa5cb, 0xa5cc, 0xa5cd, 0xa5ce, 0xa5cf, 0xa5d2, 0xa5d5,
-    0xa5d8, 0xa5db, 0xa5de, 0xa5df, 0xa5e0, 0xa5e1, 0xa5e2, 0xa5e4, 0xa5e6,
-    0xa5e8, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5ef, 0xa5f3, 0xa1ab,
-    0xa1eb
-  };
-  static unsigned char dakuten[] = {
-    0xf4, 0, 0, 0, 0, 0xac, 0, 0xae, 0, 0xb0, 0, 0xb2, 0, 0xb4, 0, 0xb6, 0,
-    0xb8, 0, 0xba, 0, 0xbc, 0, 0xbe, 0, 0xc0, 0, 0xc2, 0, 0, 0xc5, 0, 0xc7,
-    0, 0xc9, 0, 0, 0, 0, 0, 0, 0xd0, 0, 0, 0xd3, 0, 0, 0xd6, 0, 0, 0xd9, 0,
-    0, 0xdc
-  };
-  static unsigned char handaku[] = {
-    0xd1, 0, 0, 0xd4, 0, 0, 0xd7, 0, 0, 0xda, 0, 0, 0xdd
-  };
-  grn_string *nstr = (grn_string *)args[0];
-  int16_t *ch;
-  const unsigned char *s, *s_, *e;
-  unsigned char *d, *d0, *d_, b;
-  uint_least8_t *cp, *ctypes, ctype;
-  size_t size = nstr->original_length_in_bytes, length = 0;
-  int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
-  if (!(nstr->normalized = GRN_MALLOC(size * 2 + 1))) {
-    ERR(GRN_NO_MEMORY_AVAILABLE,
-        "[strinig][eucjp] failed to allocate normalized text space");
-    return NULL;
-  }
-  d0 = (unsigned char *) nstr->normalized;
-  if (nstr->flags & GRN_STRING_WITH_CHECKS) {
-    if (!(nstr->checks = GRN_MALLOC(size * 2 * sizeof(int16_t) + 1))) {
-      GRN_FREE(nstr->normalized);
-      nstr->normalized = NULL;
-      ERR(GRN_NO_MEMORY_AVAILABLE,
-          "[strinig][eucjp] failed to allocate checks space");
-      return NULL;
-    }
-  }
-  ch = nstr->checks;
-  if (nstr->flags & GRN_STRING_WITH_TYPES) {
-    if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
-      GRN_FREE(nstr->checks);
-      GRN_FREE(nstr->normalized);
-      nstr->checks = NULL;
-      nstr->normalized = NULL;
-      ERR(GRN_NO_MEMORY_AVAILABLE,
-          "[strinig][eucjp] failed to allocate character types space");
-      return NULL;
-    }
-  }
-  cp = ctypes = nstr->ctypes;
-  e = (unsigned char *)nstr->original + size;
-  for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) {
-    if ((*s & 0x80)) {
-      if (((s + 1) < e) && (*(s + 1) & 0x80)) {
-        unsigned char c1 = *s++, c2 = *s, c3 = 0;
-        switch (c1 >> 4) {
-        case 0x08 :
-          if (c1 == 0x8e && 0xa0 <= c2 && c2 <= 0xdf) {
-            uint16_t c = hankana[c2 - 0xa0];
-            switch (c) {
-            case 0xa1ab :
-              if (d > d0 + 1 && d[-2] == 0xa5
-                  && 0xa6 <= d[-1] && d[-1] <= 0xdb && (b = dakuten[d[-1] - 0xa6])) {
-                *(d - 1) = b;
-                if (ch) { ch[-1] += 2; s_ += 2; }
-                continue;
-              } else {
-                *d++ = c >> 8; *d = c & 0xff;
-              }
-              break;
-            case 0xa1eb :
-              if (d > d0 + 1 && d[-2] == 0xa5
-                  && 0xcf <= d[-1] && d[-1] <= 0xdb && (b = handaku[d[-1] - 0xcf])) {
-                *(d - 1) = b;
-                if (ch) { ch[-1] += 2; s_ += 2; }
-                continue;
-              } else {
-                *d++ = c >> 8; *d = c & 0xff;
-              }
-              break;
-            default :
-              *d++ = c >> 8; *d = c & 0xff;
-              break;
-            }
-            ctype = grn_char_katakana;
-          } else {
-            *d++ = c1; *d = c2;
-            ctype = grn_char_others;
-          }
-          break;
-        case 0x09 :
-          *d++ = c1; *d = c2;
-          ctype = grn_char_others;
-          break;
-        case 0x0a :
-          switch (c1 & 0x0f) {
-          case 1 :
-            switch (c2) {
-            case 0xbc :
-              *d++ = c1; *d = c2;
-              ctype = grn_char_katakana;
-              break;
-            case 0xb9 :
-              *d++ = c1; *d = c2;
-              ctype = grn_char_kanji;
-              break;
-            case 0xa1 :
-              if (removeblankp) {
-                if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
-                continue;
-              } else {
-                *d = ' ';
-                ctype = GRN_CHAR_BLANK|grn_char_symbol;
-              }
-              break;
-            default :
-              if (c2 >= 0xa4 && (c3 = symbol[c2 - 0xa4])) {
-                *d = c3;
-                ctype = grn_char_symbol;
-              } else {
-                *d++ = c1; *d = c2;
-                ctype = grn_char_others;
-              }
-              break;
-            }
-            break;
-          case 2 :
-            *d++ = c1; *d = c2;
-            ctype = grn_char_symbol;
-            break;
-          case 3 :
-            c3 = c2 - 0x80;
-            if ('a' <= c3 && c3 <= 'z') {
-              ctype = grn_char_alpha;
-              *d = c3;
-            } else if ('A' <= c3 && c3 <= 'Z') {
-              ctype = grn_char_alpha;
-              *d = c3 + 0x20;
-            } else if ('0' <= c3 && c3 <= '9') {
-              ctype = grn_char_digit;
-              *d = c3;
-            } else {
-              ctype = grn_char_others;
-              *d++ = c1; *d = c2;
-            }
-            break;
-          case 4 :
-            *d++ = c1; *d = c2;
-            ctype = grn_char_hiragana;
-            break;
-          case 5 :
-            *d++ = c1; *d = c2;
-            ctype = grn_char_katakana;
-            break;
-          case 6 :
-          case 7 :
-          case 8 :
-            *d++ = c1; *d = c2;
-            ctype = grn_char_symbol;
-            break;
-          default :
-            *d++ = c1; *d = c2;
-            ctype = grn_char_others;
-            break;
-          }
-          break;
-        default :
-          *d++ = c1; *d = c2;
-          ctype = grn_char_kanji;
-          break;
-        }
-      } else {
-        /* skip invalid character */
-        continue;
-      }
-    } else {
-      unsigned char c = *s;
-      switch (c >> 4) {
-      case 0 :
-      case 1 :
-        /* skip unprintable ascii */
-        if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
-        continue;
-      case 2 :
-        if (c == 0x20) {
-          if (removeblankp) {
-            if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
-            continue;
-          } else {
-            *d = ' ';
-            ctype = GRN_CHAR_BLANK|grn_char_symbol;
-          }
-        } else {
-          *d = c;
-          ctype = grn_char_symbol;
-        }
-        break;
-      case 3 :
-        *d = c;
-        ctype = (c <= 0x39) ? grn_char_digit : grn_char_symbol;
-        break;
-      case 4 :
-        *d = ('A' <= c) ? c + 0x20 : c;
-        ctype = (c == 0x40) ? grn_char_symbol : grn_char_alpha;
-        break;
-      case 5 :
-        *d = (c <= 'Z') ? c + 0x20 : c;
-        ctype = (c <= 0x5a) ? grn_char_alpha : grn_char_symbol;
-        break;
-      case 6 :
-        *d = c;
-        ctype = (c == 0x60) ? grn_char_symbol : grn_char_alpha;
-        break;
-      case 7 :
-        *d = c;
-        ctype = (c <= 0x7a) ? grn_char_alpha : (c == 0x7f ? grn_char_others : grn_char_symbol);
-        break;
-      default :
-        *d = c;
-        ctype = grn_char_others;
-        break;
-      }
-    }
-    d++;
-    length++;
-    if (cp) { *cp++ = ctype; }
-    if (ch) {
-      *ch++ = (int16_t)(s + 1 - s_);
-      s_ = s + 1;
-      while (++d_ < d) { *ch++ = 0; }
-    }
-  }
-  if (cp) { *cp = grn_char_null; }
-  *d = '\0';
-  nstr->n_characters = length;
-  nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
-  return NULL;
-}
-
-inline static grn_obj *
-sjis_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
-               grn_user_data *user_data)
-{
-  static uint16_t hankana[] = {
-    0x8140, 0x8142, 0x8175, 0x8176, 0x8141, 0x8145, 0x8392, 0x8340, 0x8342,
-    0x8344, 0x8346, 0x8348, 0x8383, 0x8385, 0x8387, 0x8362, 0x815b, 0x8341,
-    0x8343, 0x8345, 0x8347, 0x8349, 0x834a, 0x834c, 0x834e, 0x8350, 0x8352,
-    0x8354, 0x8356, 0x8358, 0x835a, 0x835c, 0x835e, 0x8360, 0x8363, 0x8365,
-    0x8367, 0x8369, 0x836a, 0x836b, 0x836c, 0x836d, 0x836e, 0x8371, 0x8374,
-    0x8377, 0x837a, 0x837d, 0x837e, 0x8380, 0x8381, 0x8382, 0x8384, 0x8386,
-    0x8388, 0x8389, 0x838a, 0x838b, 0x838c, 0x838d, 0x838f, 0x8393, 0x814a,
-    0x814b
-  };
-  static unsigned char dakuten[] = {
-    0x94, 0, 0, 0, 0, 0x4b, 0, 0x4d, 0, 0x4f, 0, 0x51, 0, 0x53, 0, 0x55, 0,
-    0x57, 0, 0x59, 0, 0x5b, 0, 0x5d, 0, 0x5f, 0, 0x61, 0, 0, 0x64, 0, 0x66,
-    0, 0x68, 0, 0, 0, 0, 0, 0, 0x6f, 0, 0, 0x72, 0, 0, 0x75, 0, 0, 0x78, 0,
-    0, 0x7b
-  };
-  static unsigned char handaku[] = {
-    0x70, 0, 0, 0x73, 0, 0, 0x76, 0, 0, 0x79, 0, 0, 0x7c
-  };
-  grn_string *nstr = (grn_string *)args[0];
-  int16_t *ch;
-  const unsigned char *s, *s_;
-  unsigned char *d, *d0, *d_, b, *e;
-  uint_least8_t *cp, *ctypes, ctype;
-  size_t size = nstr->original_length_in_bytes, length = 0;
-  int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
-  if (!(nstr->normalized = GRN_MALLOC(size * 2 + 1))) {
-    ERR(GRN_NO_MEMORY_AVAILABLE,
-        "[strinig][sjis] failed to allocate normalized text space");
-    return NULL;
-  }
-  d0 = (unsigned char *) nstr->normalized;
-  if (nstr->flags & GRN_STRING_WITH_CHECKS) {
-    if (!(nstr->checks = GRN_MALLOC(size * 2 * sizeof(int16_t) + 1))) {
-      GRN_FREE(nstr->normalized);
-      nstr->normalized = NULL;
-      ERR(GRN_NO_MEMORY_AVAILABLE,
-          "[strinig][sjis] failed to allocate checks space");
-      return NULL;
-    }
-  }
-  ch = nstr->checks;
-  if (nstr->flags & GRN_STRING_WITH_TYPES) {
-    if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
-      GRN_FREE(nstr->checks);
-      GRN_FREE(nstr->normalized);
-      nstr->checks = NULL;
-      nstr->normalized = NULL;
-      ERR(GRN_NO_MEMORY_AVAILABLE,
-          "[strinig][sjis] failed to allocate character types space");
-      return NULL;
-    }
-  }
-  cp = ctypes = nstr->ctypes;
-  e = (unsigned char *)nstr->original + size;
-  for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) {
-    if ((*s & 0x80)) {
-      if (0xa0 <= *s && *s <= 0xdf) {
-        uint16_t c = hankana[*s - 0xa0];
-        switch (c) {
-        case 0x814a :
-          if (d > d0 + 1 && d[-2] == 0x83
-              && 0x45 <= d[-1] && d[-1] <= 0x7a && (b = dakuten[d[-1] - 0x45])) {
-            *(d - 1) = b;
-            if (ch) { ch[-1]++; s_++; }
-            continue;
-          } else {
-            *d++ = c >> 8; *d = c & 0xff;
-          }
-          break;
-        case 0x814b :
-          if (d > d0 + 1 && d[-2] == 0x83
-              && 0x6e <= d[-1] && d[-1] <= 0x7a && (b = handaku[d[-1] - 0x6e])) {
-            *(d - 1) = b;
-            if (ch) { ch[-1]++; s_++; }
-            continue;
-          } else {
-            *d++ = c >> 8; *d = c & 0xff;
-          }
-          break;
-        default :
-          *d++ = c >> 8; *d = c & 0xff;
-          break;
-        }
-        ctype = grn_char_katakana;
-      } else {
-        if ((s + 1) < e && 0x40 <= *(s + 1) && *(s + 1) <= 0xfc) {
-          unsigned char c1 = *s++, c2 = *s, c3 = 0;
-          if (0x81 <= c1 && c1 <= 0x87) {
-            switch (c1 & 0x0f) {
-            case 1 :
-              switch (c2) {
-              case 0x5b :
-                *d++ = c1; *d = c2;
-                ctype = grn_char_katakana;
-                break;
-              case 0x58 :
-                *d++ = c1; *d = c2;
-                ctype = grn_char_kanji;
-                break;
-              case 0x40 :
-                if (removeblankp) {
-                  if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
-                  continue;
-                } else {
-                  *d = ' ';
-                  ctype = GRN_CHAR_BLANK|grn_char_symbol;
-                }
-                break;
-              default :
-                if (0x43 <= c2 && c2 <= 0x7e && (c3 = symbol[c2 - 0x43])) {
-                  *d = c3;
-                  ctype = grn_char_symbol;
-                } else if (0x7f <= c2 && c2 <= 0x97 && (c3 = symbol[c2 - 0x44])) {
-                  *d = c3;
-                  ctype = grn_char_symbol;
-                } else {
-                  *d++ = c1; *d = c2;
-                  ctype = grn_char_others;
-                }
-                break;
-              }
-              break;
-            case 2 :
-              c3 = c2 - 0x1f;
-              if (0x4f <= c2 && c2 <= 0x58) {
-                ctype = grn_char_digit;
-                *d = c2 - 0x1f;
-              } else if (0x60 <= c2 && c2 <= 0x79) {
-                ctype = grn_char_alpha;
-                *d = c2 + 0x01;
-              } else if (0x81 <= c2 && c2 <= 0x9a) {
-                ctype = grn_char_alpha;
-                *d = c2 - 0x20;
-              } else if (0x9f <= c2 && c2 <= 0xf1) {
-                *d++ = c1; *d = c2;
-                ctype = grn_char_hiragana;
-              } else {
-                *d++ = c1; *d = c2;
-                ctype = grn_char_others;
-              }
-              break;
-            case 3 :
-              if (0x40 <= c2 && c2 <= 0x96) {
-                *d++ = c1; *d = c2;
-                ctype = grn_char_katakana;
-              } else {
-                *d++ = c1; *d = c2;
-                ctype = grn_char_symbol;
-              }
-              break;
-            case 4 :
-            case 7 :
-              *d++ = c1; *d = c2;
-              ctype = grn_char_symbol;
-              break;
-            default :
-              *d++ = c1; *d = c2;
-              ctype = grn_char_others;
-              break;
-            }
-          } else {
-            *d++ = c1; *d = c2;
-            ctype = grn_char_kanji;
-          }
-        } else {
-          /* skip invalid character */
-          continue;
-        }
-      }
-    } else {
-      unsigned char c = *s;
-      switch (c >> 4) {
-      case 0 :
-      case 1 :
-        /* skip unprintable ascii */
-        if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
-        continue;
-      case 2 :
-        if (c == 0x20) {
-          if (removeblankp) {
-            if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
-            continue;
-          } else {
-            *d = ' ';
-            ctype = GRN_CHAR_BLANK|grn_char_symbol;
-          }
-        } else {
-          *d = c;
-          ctype = grn_char_symbol;
-        }
-        break;
-      case 3 :
-        *d = c;
-        ctype = (c <= 0x39) ? grn_char_digit : grn_char_symbol;
-        break;
-      case 4 :
-        *d = ('A' <= c) ? c + 0x20 : c;
-        ctype = (c == 0x40) ? grn_char_symbol : grn_char_alpha;
-        break;
-      case 5 :
-        *d = (c <= 'Z') ? c + 0x20 : c;
-        ctype = (c <= 0x5a) ? grn_char_alpha : grn_char_symbol;
-        break;
-      case 6 :
-        *d = c;
-        ctype = (c == 0x60) ? grn_char_symbol : grn_char_alpha;
-        break;
-      case 7 :
-        *d = c;
-        ctype = (c <= 0x7a) ? grn_char_alpha : (c == 0x7f ? grn_char_others : grn_char_symbol);
-        break;
-      default :
-        *d = c;
-        ctype = grn_char_others;
-        break;
-      }
-    }
-    d++;
-    length++;
-    if (cp) { *cp++ = ctype; }
-    if (ch) {
-      *ch++ = (int16_t)(s + 1 - s_);
-      s_ = s + 1;
-      while (++d_ < d) { *ch++ = 0; }
-    }
-  }
-  if (cp) { *cp = grn_char_null; }
-  *d = '\0';
-  nstr->n_characters = length;
-  nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
-  return NULL;
-}
-
-#ifdef WITH_NFKC
-uint_least8_t grn_nfkc_ctype(const unsigned char *str);
-const char *grn_nfkc_map1(const unsigned char *str);
-const char *grn_nfkc_map2(const unsigned char *prefix, const unsigned char *suffix);
-
-static inline int
-grn_str_charlen_utf8(grn_ctx *ctx, const unsigned char *str, const unsigned char *end)
-{
-  /* MEMO: This function allows non-null-terminated string as str. */
-  /*       But requires the end of string. */
-  const unsigned char *p = str;
-  if (end <= p || !*p) { return 0; }
-  if (*p & 0x80) {
-    int b, w;
-    int size;
-    int i;
-    for (b = 0x40, w = 0; b && (*p & b); b >>= 1, w++);
-    if (!w) {
-      GRN_LOG(ctx, GRN_LOG_WARNING,
-              "invalid utf8 string: the first bit is 0x80: <%.*s>: <%.*s>",
-              (int)(end - p), p,
-              (int)(end - str), str);
-      return 0;
-    }
-    size = w + 1;
-    for (i = 1; i < size; i++) {
-      if (++p >= end) {
-        GRN_LOG(ctx, GRN_LOG_WARNING,
-                "invalid utf8 string: too short: "
-                "%d byte is required but %d byte is given: <%.*s>",
-                size, i,
-                (int)(end - str), str);
-        return 0;
-      }
-      if (!*p) {
-        GRN_LOG(ctx, GRN_LOG_WARNING,
-                "invalid utf8 string: NULL character is found: <%.*s>",
-                (int)(end - str), str);
-        return 0;
-      }
-      if ((*p & 0xc0) != 0x80) {
-        GRN_LOG(ctx, GRN_LOG_WARNING,
-                "invalid utf8 string: 0x80 is not allowed: <%.*s>: <%.*s>",
-                (int)(end - p), p,
-                (int)(end - str), str);
-        return 0;
-      }
-    }
-    return size;
-  } else {
-    return 1;
-  }
-  return 0;
-}
-
-inline static grn_obj *
-utf8_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{
-  int16_t *ch;
-  const unsigned char *s, *s_, *s__ = NULL, *p, *p2, *pe, *e;
-  unsigned char *d, *d_, *de;
-  uint_least8_t *cp;
-  grn_string *nstr = (grn_string *)args[0];
-  size_t length = 0, ls, lp, size = nstr->original_length_in_bytes, ds = size * 3;
-  int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
-  grn_bool remove_tokenized_delimiter_p =
-    nstr->flags & GRN_STRING_REMOVE_TOKENIZED_DELIMITER;
-  if (!(nstr->normalized = GRN_MALLOC(ds + 1))) {
-    ERR(GRN_NO_MEMORY_AVAILABLE,
-        "[strinig][utf8] failed to allocate normalized text space");
-    return NULL;
-  }
-  if (nstr->flags & GRN_STRING_WITH_CHECKS) {
-    if (!(nstr->checks = GRN_MALLOC(ds * sizeof(int16_t) + 1))) {
-      GRN_FREE(nstr->normalized);
-      nstr->normalized = NULL;
-      ERR(GRN_NO_MEMORY_AVAILABLE,
-          "[strinig][utf8] failed to allocate checks space");
-      return NULL;
-    }
-  }
-  ch = nstr->checks;
-  if (nstr->flags & GRN_STRING_WITH_TYPES) {
-    if (!(nstr->ctypes = GRN_MALLOC(ds + 1))) {
-      if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
-      GRN_FREE(nstr->normalized); nstr->normalized = NULL;
-      ERR(GRN_NO_MEMORY_AVAILABLE,
-          "[strinig][utf8] failed to allocate character types space");
-      return NULL;
-    }
-  }
-  cp = nstr->ctypes;
-  d = (unsigned char *)nstr->normalized;
-  de = d + ds;
-  d_ = NULL;
-  e = (unsigned char *)nstr->original + size;
-  for (s = s_ = (unsigned char *)nstr->original; ; s += ls) {
-    if (!(ls = grn_str_charlen_utf8(ctx, s, e))) {
-      break;
-    }
-    if (remove_tokenized_delimiter_p &&
-        grn_tokenizer_is_tokenized_delimiter(ctx, s, ls, GRN_ENC_UTF8)) {
-      continue;
-    }
-    if ((p = (unsigned char *)grn_nfkc_map1(s))) {
-      pe = p + strlen((char *)p);
-    } else {
-      p = s;
-      pe = p + ls;
-    }
-    if (d_ && (p2 = (unsigned char *)grn_nfkc_map2(d_, p))) {
-      p = p2;
-      pe = p + strlen((char *)p);
-      if (cp) { cp--; }
-      if (ch) {
-        ch -= (d - d_);
-        if (ch[0] >= 0) {
-          s_ = s__;
-        }
-      }
-      d = d_;
-      length--;
-    }
-    for (; ; p += lp) {
-      if (!(lp = grn_str_charlen_utf8(ctx, p, pe))) {
-        break;
-      }
-      if ((*p == ' ' && removeblankp) || *p < 0x20  /* skip unprintable ascii */ ) {
-        if (cp > nstr->ctypes) { *(cp - 1) |= GRN_STR_BLANK; }
-      } else {
-        if (de <= d + lp) {
-          unsigned char *normalized;
-          ds += (ds >> 1) + lp;
-          if (!(normalized = GRN_REALLOC(nstr->normalized, ds + 1))) {
-            if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; }
-            if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
-            GRN_FREE(nstr->normalized); nstr->normalized = NULL;
-            ERR(GRN_NO_MEMORY_AVAILABLE,
-                "[strinig][utf8] failed to expand normalized text space");
-            return NULL;
-          }
-          de = normalized + ds;
-          d = normalized + (d - (unsigned char *)nstr->normalized);
-          nstr->normalized = normalized;
-          if (ch) {
-            int16_t *checks;
-            if (!(checks = GRN_REALLOC(nstr->checks, ds * sizeof(int16_t)+ 1))) {
-              if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; }
-              GRN_FREE(nstr->checks); nstr->checks = NULL;
-              GRN_FREE(nstr->normalized); nstr->normalized = NULL;
-              ERR(GRN_NO_MEMORY_AVAILABLE,
-                  "[strinig][utf8] failed to expand checks space");
-              return NULL;
-            }
-            ch = checks + (ch - nstr->checks);
-            nstr->checks = checks;
-          }
-          if (cp) {
-            uint_least8_t *ctypes;
-            if (!(ctypes = GRN_REALLOC(nstr->ctypes, ds + 1))) {
-              GRN_FREE(nstr->ctypes); nstr->ctypes = NULL;
-              if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
-              GRN_FREE(nstr->normalized); nstr->normalized = NULL;
-              ERR(GRN_NO_MEMORY_AVAILABLE,
-                  "[strinig][utf8] failed to expand character types space");
-              return NULL;
-            }
-            cp = ctypes + (cp - nstr->ctypes);
-            nstr->ctypes = ctypes;
-          }
-        }
-        memcpy(d, p, lp);
-        d_ = d;
-        d += lp;
-        length++;
-        if (cp) { *cp++ = grn_nfkc_ctype(p); }
-        if (ch) {
-          size_t i;
-          if (s_ == s + ls) {
-            *ch++ = -1;
-          } else {
-            *ch++ = (int16_t)(s + ls - s_);
-            s__ = s_;
-            s_ = s + ls;
-          }
-          for (i = lp; i > 1; i--) { *ch++ = 0; }
-        }
-      }
-    }
-  }
-  if (cp) { *cp = grn_str_null; }
-  *d = '\0';
-  nstr->n_characters = length;
-  nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
-  return NULL;
-}
-#endif /* WITH_NFKC */
-
-inline static grn_obj *
-ascii_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{
-  grn_string *nstr = (grn_string *)args[0];
-  int16_t *ch;
-  const unsigned char *s, *s_, *e;
-  unsigned char *d, *d0, *d_;
-  uint_least8_t *cp, *ctypes, ctype;
-  size_t size = nstr->original_length_in_bytes, length = 0;
-  int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
-  if (!(nstr->normalized = GRN_MALLOC(size + 1))) {
-    ERR(GRN_NO_MEMORY_AVAILABLE,
-        "[strinig][ascii] failed to allocate normalized text space");
-    return NULL;
-  }
-  d0 = (unsigned char *) nstr->normalized;
-  if (nstr->flags & GRN_STRING_WITH_CHECKS) {
-    if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) {
-      GRN_FREE(nstr->normalized);
-      nstr->normalized = NULL;
-      ERR(GRN_NO_MEMORY_AVAILABLE,
-          "[strinig][ascii] failed to allocate checks space");
-      return NULL;
-    }
-  }
-  ch = nstr->checks;
-  if (nstr->flags & GRN_STRING_WITH_TYPES) {
-    if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
-      GRN_FREE(nstr->checks);
-      GRN_FREE(nstr->normalized);
-      nstr->checks = NULL;
-      nstr->normalized = NULL;
-      ERR(GRN_NO_MEMORY_AVAILABLE,
-          "[strinig][ascii] failed to allocate character types space");
-      return NULL;
-    }
-  }
-  cp = ctypes = nstr->ctypes;
-  e = (unsigned char *)nstr->original + size;
-  for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) {
-    unsigned char c = *s;
-    switch (c >> 4) {
-    case 0 :
-    case 1 :
-      /* skip unprintable ascii */
-      if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
-      continue;
-    case 2 :
-      if (c == 0x20) {
-        if (removeblankp) {
-          if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
-          continue;
-        } else {
-          *d = ' ';
-          ctype = GRN_CHAR_BLANK|grn_char_symbol;
-        }
-      } else {
-        *d = c;
-        ctype = grn_char_symbol;
-      }
-      break;
-    case 3 :
-      *d = c;
-      ctype = (c <= 0x39) ? grn_char_digit : grn_char_symbol;
-      break;
-    case 4 :
-      *d = ('A' <= c) ? c + 0x20 : c;
-      ctype = (c == 0x40) ? grn_char_symbol : grn_char_alpha;
-      break;
-    case 5 :
-      *d = (c <= 'Z') ? c + 0x20 : c;
-      ctype = (c <= 0x5a) ? grn_char_alpha : grn_char_symbol;
-      break;
-    case 6 :
-      *d = c;
-      ctype = (c == 0x60) ? grn_char_symbol : grn_char_alpha;
-      break;
-    case 7 :
-      *d = c;
-      ctype = (c <= 0x7a) ? grn_char_alpha : (c == 0x7f ? grn_char_others : grn_char_symbol);
-      break;
-    default :
-      *d = c;
-      ctype = grn_char_others;
-      break;
-    }
-    d++;
-    length++;
-    if (cp) { *cp++ = ctype; }
-    if (ch) {
-      *ch++ = (int16_t)(s + 1 - s_);
-      s_ = s + 1;
-      while (++d_ < d) { *ch++ = 0; }
-    }
-  }
-  if (cp) { *cp = grn_char_null; }
-  *d = '\0';
-  nstr->n_characters = length;
-  nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
-  return NULL;
-}
-
-/* use cp1252 as latin1 */
-inline static grn_obj *
-latin1_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
-                 grn_user_data *user_data)
-{
-  grn_string *nstr = (grn_string *)args[0];
-  int16_t *ch;
-  const unsigned char *s, *s_, *e;
-  unsigned char *d, *d0, *d_;
-  uint_least8_t *cp, *ctypes, ctype;
-  size_t size = nstr->original_length_in_bytes, length = 0;
-  int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
-  if (!(nstr->normalized = GRN_MALLOC(size + 1))) {
-    ERR(GRN_NO_MEMORY_AVAILABLE,
-        "[strinig][latin1] failed to allocate normalized text space");
-    return NULL;
-  }
-  d0 = (unsigned char *) nstr->normalized;
-  if (nstr->flags & GRN_STRING_WITH_CHECKS) {
-    if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) {
-      GRN_FREE(nstr->normalized);
-      nstr->normalized = NULL;
-      ERR(GRN_NO_MEMORY_AVAILABLE,
-          "[strinig][latin1] failed to allocate checks space");
-      return NULL;
-    }
-  }
-  ch = nstr->checks;
-  if (nstr->flags & GRN_STRING_WITH_TYPES) {
-    if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
-      GRN_FREE(nstr->checks);
-      GRN_FREE(nstr->normalized);
-      nstr->checks = NULL;
-      nstr->normalized = NULL;
-      ERR(GRN_NO_MEMORY_AVAILABLE,
-          "[normalizer][latin1] failed to allocate character types space");
-      return NULL;
-    }
-  }
-  cp = ctypes = nstr->ctypes;
-  e = (unsigned char *)nstr->original + size;
-  for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) {
-    unsigned char c = *s;
-    switch (c >> 4) {
-    case 0 :
-    case 1 :
-      /* skip unprintable ascii */
-      if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
-      continue;
-    case 2 :
-      if (c == 0x20) {
-        if (removeblankp) {
-          if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
-          continue;
-        } else {
-          *d = ' ';
-          ctype = GRN_CHAR_BLANK|grn_char_symbol;
-        }
-      } else {
-        *d = c;
-        ctype = grn_char_symbol;
-      }
-      break;
-    case 3 :
-      *d = c;
-      ctype = (c <= 0x39) ? grn_char_digit : grn_char_symbol;
-      break;
-    case 4 :
-      *d = ('A' <= c) ? c + 0x20 : c;
-      ctype = (c == 0x40) ? grn_char_symbol : grn_char_alpha;
-      break;
-    case 5 :
-      *d = (c <= 'Z') ? c + 0x20 : c;
-      ctype = (c <= 0x5a) ? grn_char_alpha : grn_char_symbol;
-      break;
-    case 6 :
-      *d = c;
-      ctype = (c == 0x60) ? grn_char_symbol : grn_char_alpha;
-      break;
-    case 7 :
-      *d = c;
-      ctype = (c <= 0x7a) ? grn_char_alpha : (c == 0x7f ? grn_char_others : grn_char_symbol);
-      break;
-    case 8 :
-      if (c == 0x8a || c == 0x8c || c == 0x8e) {
-        *d = c + 0x10;
-        ctype = grn_char_alpha;
-      } else {
-        *d = c;
-        ctype = grn_char_symbol;
-      }
-      break;
-    case 9 :
-      if (c == 0x9a || c == 0x9c || c == 0x9e || c == 0x9f) {
-        *d = (c == 0x9f) ? c + 0x60 : c;
-        ctype = grn_char_alpha;
-      } else {
-        *d = c;
-        ctype = grn_char_symbol;
-      }
-      break;
-    case 0x0c :
-      *d = c + 0x20;
-      ctype = grn_char_alpha;
-      break;
-    case 0x0d :
-      *d = (c == 0xd7 || c == 0xdf) ? c : c + 0x20;
-      ctype = (c == 0xd7) ? grn_char_symbol : grn_char_alpha;
-      break;
-    case 0x0e :
-      *d = c;
-      ctype = grn_char_alpha;
-      break;
-    case 0x0f :
-      *d = c;
-      ctype = (c == 0xf7) ? grn_char_symbol : grn_char_alpha;
-      break;
-    default :
-      *d = c;
-      ctype = grn_char_others;
-      break;
-    }
-    d++;
-    length++;
-    if (cp) { *cp++ = ctype; }
-    if (ch) {
-      *ch++ = (int16_t)(s + 1 - s_);
-      s_ = s + 1;
-      while (++d_ < d) { *ch++ = 0; }
-    }
-  }
-  if (cp) { *cp = grn_char_null; }
-  *d = '\0';
-  nstr->n_characters = length;
-  nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
-  return NULL;
-}
-
-inline static grn_obj *
-koi8r_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
-                grn_user_data *user_data)
-{
-  grn_string *nstr = (grn_string *)args[0];
-  int16_t *ch;
-  const unsigned char *s, *s_, *e;
-  unsigned char *d, *d0, *d_;
-  uint_least8_t *cp, *ctypes, ctype;
-  size_t size = nstr->original_length_in_bytes, length = 0;
-  int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
-  if (!(nstr->normalized = GRN_MALLOC(size + 1))) {
-    ERR(GRN_NO_MEMORY_AVAILABLE,
-        "[strinig][koi8r] failed to allocate normalized text space");
-    return NULL;
-  }
-  d0 = (unsigned char *) nstr->normalized;
-  if (nstr->flags & GRN_STRING_WITH_CHECKS) {
-    if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) {
-      GRN_FREE(nstr->normalized);
-      nstr->normalized = NULL;
-      ERR(GRN_NO_MEMORY_AVAILABLE,
-          "[strinig][koi8r] failed to allocate checks space");
-      return NULL;
-    }
-  }
-  ch = nstr->checks;
-  if (nstr->flags & GRN_STRING_WITH_TYPES) {
-    if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
-      GRN_FREE(nstr->checks);
-      GRN_FREE(nstr->normalized);
-      nstr->checks = NULL;
-      nstr->normalized = NULL;
-      ERR(GRN_NO_MEMORY_AVAILABLE,
-          "[strinig][koi8r] failed to allocate character types space");
-      return NULL;
-    }
-  }
-  cp = ctypes = nstr->ctypes;
-  e = (unsigned char *)nstr->original + size;
-  for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) {
-    unsigned char c = *s;
-    switch (c >> 4) {
-    case 0 :
-    case 1 :
-      /* skip unprintable ascii */
-      if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
-      continue;
-    case 2 :
-      if (c == 0x20) {
-        if (removeblankp) {
-          if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
-          continue;
-        } else {
-          *d = ' ';
-          ctype = GRN_CHAR_BLANK|grn_char_symbol;
-        }
-      } else {
-        *d = c;
-        ctype = grn_char_symbol;
-      }
-      break;
-    case 3 :
-      *d = c;
-      ctype = (c <= 0x39) ? grn_char_digit : grn_char_symbol;
-      break;
-    case 4 :
-      *d = ('A' <= c) ? c + 0x20 : c;
-      ctype = (c == 0x40) ? grn_char_symbol : grn_char_alpha;
-      break;
-    case 5 :
-      *d = (c <= 'Z') ? c + 0x20 : c;
-      ctype = (c <= 0x5a) ? grn_char_alpha : grn_char_symbol;
-      break;
-    case 6 :
-      *d = c;
-      ctype = (c == 0x60) ? grn_char_symbol : grn_char_alpha;
-      break;
-    case 7 :
-      *d = c;
-      ctype = (c <= 0x7a) ? grn_char_alpha : (c == 0x7f ? grn_char_others : grn_char_symbol);
-      break;
-    case 0x0a :
-      *d = c;
-      ctype = (c == 0xa3) ? grn_char_alpha : grn_char_others;
-      break;
-    case 0x0b :
-      if (c == 0xb3) {
-        *d = c - 0x10;
-        ctype = grn_char_alpha;
-      } else {
-        *d = c;
-        ctype = grn_char_others;
-      }
-      break;
-    case 0x0c :
-    case 0x0d :
-      *d = c;
-      ctype = grn_char_alpha;
-      break;
-    case 0x0e :
-    case 0x0f :
-      *d = c - 0x20;
-      ctype = grn_char_alpha;
-      break;
-    default :
-      *d = c;
-      ctype = grn_char_others;
-      break;
-    }
-    d++;
-    length++;
-    if (cp) { *cp++ = ctype; }
-    if (ch) {
-      *ch++ = (int16_t)(s + 1 - s_);
-      s_ = s + 1;
-      while (++d_ < d) { *ch++ = 0; }
-    }
-  }
-  if (cp) { *cp = grn_char_null; }
-  *d = '\0';
-  nstr->n_characters = length;
-  nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
-  return NULL;
-}
-
 static grn_string *
 grn_fake_string_open(grn_ctx *ctx, grn_string *string)
 {
@@ -1189,7 +134,7 @@ grn_string_open_(grn_ctx *ctx, const char *str, unsigned int str_len,
 {
   grn_string *string;
   grn_obj *obj;
-  grn_obj *args[1];
+  grn_bool is_normalizer_auto;
 
   if (!str || !str_len) {
     return NULL;
@@ -1218,36 +163,22 @@ grn_string_open_(grn_ctx *ctx, const char *str, unsigned int str_len,
     return (grn_obj *)grn_fake_string_open(ctx, string);
   }
 
-  args[0] = obj;
-  switch (encoding) {
-  case GRN_ENC_EUC_JP :
-    eucjp_normalize(ctx, 1, args, NULL);
-    break;
-  case GRN_ENC_UTF8 :
-#ifdef WITH_NFKC
-    utf8_normalize(ctx, 1, args, NULL);
-#else /* WITH_NFKC */
-    ascii_normalize(ctx, 1, args, NULL);
-#endif /* WITH_NFKC */
-    break;
-  case GRN_ENC_SJIS :
-    sjis_normalize(ctx, 1, args, NULL);
-    break;
-  case GRN_ENC_LATIN1 :
-    latin1_normalize(ctx, 1, args, NULL);
-    break;
-  case GRN_ENC_KOI8R :
-    koi8r_normalize(ctx, 1, args, NULL);
-    break;
-  default :
-    ascii_normalize(ctx, 1, args, NULL);
-    break;
+  is_normalizer_auto = (normalizer == GRN_NORMALIZER_AUTO);
+  if (is_normalizer_auto) {
+    normalizer = grn_ctx_at(ctx, GRN_DB_NORMALIZER_AUTO);
   }
+
+  /* TODO: check rc */
+  grn_normalizer_normalize(ctx, normalizer, (grn_obj *)string);
   if (ctx->rc) {
     grn_obj_close(ctx, obj);
     obj = NULL;
   }
 
+  if (is_normalizer_auto) {
+    grn_obj_unlink(ctx, normalizer);
+  }
+
   return obj;
 }
 

  Modified: lib/token.c (+7 -14)
===================================================================
--- lib/token.c    2012-12-14 12:20:38 +0900 (bdcbafe)
+++ lib/token.c    2012-12-14 12:40:02 +0900 (3c730a3)
@@ -111,7 +111,8 @@ delimited_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data,
   }
   user_data->ptr = tokenizer;
 
-  grn_table_get_info(ctx, table, &table_flags, &tokenizer->encoding, NULL);
+  grn_table_get_info(ctx, table, &table_flags, &tokenizer->encoding, NULL,
+                     &normalizer);
 
   tokenizer->have_tokenized_delimiter =
     grn_tokenizer_have_tokenized_delimiter(ctx,
@@ -120,10 +121,6 @@ delimited_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data,
                                            tokenizer->encoding);
   tokenizer->delimiter = delimiter;
   tokenizer->delimiter_len = delimiter_len;
-
-  if (table_flags & GRN_OBJ_KEY_NORMALIZE) {
-    normalizer = GRN_NORMALIZER_AUTO;
-  }
   tokenizer->nstr = grn_string_open_(ctx,
                                      GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
                                      normalizer, nflags, tokenizer->encoding);
@@ -260,10 +257,8 @@ ngram_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data, uint8_t ngram
   token->overlap = 0;
   token->pos = 0;
   token->skip = 0;
-  grn_table_get_info(ctx, table, &table_flags, &token->encoding, NULL);
-  if (table_flags & GRN_OBJ_KEY_NORMALIZE) {
-    normalizer = GRN_NORMALIZER_AUTO;
-  }
+  grn_table_get_info(ctx, table, &table_flags, &token->encoding, NULL,
+                     &normalizer);
   if (!(token->nstr = grn_string_open_(ctx,
                                        GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
                                        normalizer, nflags, token->encoding))) {
@@ -452,8 +447,10 @@ grn_token_open(grn_ctx *ctx, grn_obj *table, const char *str, size_t str_len,
   grn_token *token;
   grn_encoding encoding;
   grn_obj *tokenizer;
+  grn_obj *normalizer;
   grn_obj_flags table_flags;
-  if (grn_table_get_info(ctx, table, &table_flags, &encoding, &tokenizer)) {
+  if (grn_table_get_info(ctx, table, &table_flags, &encoding, &tokenizer,
+                         &normalizer)) {
     return NULL;
   }
   if (!(token = GRN_MALLOC(sizeof(grn_token)))) { return NULL; }
@@ -483,11 +480,7 @@ grn_token_open(grn_ctx *ctx, grn_obj *table, const char *str, size_t str_len,
     ((grn_proc *)tokenizer)->funcs[PROC_INIT](ctx, 1, &table, &token->pctx.user_data);
     grn_obj_close(ctx, &str_);
   } else {
-    grn_obj *normalizer = NULL;
     int nflags = 0;
-    if (table_flags & GRN_OBJ_KEY_NORMALIZE) {
-      normalizer = GRN_NORMALIZER_AUTO;
-    }
     token->nstr = grn_string_open_(ctx, str, str_len,
                                    normalizer, nflags, token->encoding);
     if (token->nstr) {

  Modified: lib/tokenizer.c (+4 -2)
===================================================================
--- lib/tokenizer.c    2012-12-14 12:20:38 +0900 (6a377fb)
+++ lib/tokenizer.c    2012-12-14 12:40:02 +0900 (c5a58bc)
@@ -165,15 +165,17 @@ grn_tokenizer_query_open(grn_ctx *ctx, int num_args, grn_obj **args)
       grn_encoding table_encoding;
       unsigned int query_length = GRN_TEXT_LEN(query_str);
       char *query_buf = (char *)GRN_PLUGIN_MALLOC(ctx, query_length + 1);
+      grn_obj *normalizer = NULL;
+
       if (query_buf == NULL) {
         GRN_PLUGIN_FREE(ctx, query);
         GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                          "[tokenizer] failed to duplicate query");
         return NULL;
       }
-      grn_table_get_info(ctx, table, &table_flags, &table_encoding, NULL);
+      grn_table_get_info(ctx, table, &table_flags, &table_encoding, NULL,
+                         &normalizer);
       {
-        grn_obj *normalizer = NULL;
         int flags = 0;
         grn_obj *normalized_string;
         if (table_flags & GRN_OBJ_KEY_NORMALIZE) {

  Modified: lib/util.c (+3 -0)
===================================================================
--- lib/util.c    2012-12-14 12:20:38 +0900 (46a5511)
+++ lib/util.c    2012-12-14 12:40:02 +0900 (3664a94)
@@ -252,6 +252,9 @@ grn_proc_inspect(grn_ctx *ctx, grn_obj *buf, grn_obj *obj)
   case GRN_PROC_HOOK :
     GRN_TEXT_PUTS(ctx, buf, "hook");
     break;
+  case GRN_PROC_NORMALIZER :
+    GRN_TEXT_PUTS(ctx, buf, "normalizer");
+    break;
   }
   GRN_TEXT_PUTS(ctx, buf, " ");
 

  Modified: test/unit/core/dat/test-dat.cpp (+3 -0)
===================================================================
--- test/unit/core/dat/test-dat.cpp    2012-12-14 12:20:38 +0900 (904316f)
+++ test/unit/core/dat/test-dat.cpp    2012-12-14 12:40:02 +0900 (681b70a)
@@ -71,6 +71,7 @@ namespace test_dat
 {
   const char *base_dir;
   grn_ctx ctx;
+  grn_obj *database;
 
   void cut_setup(void)
   {
@@ -81,12 +82,14 @@ namespace test_dat
     g_mkdir_with_parents(base_dir, 0755);
 
     grn_ctx_init(&ctx, 0);
+    database = grn_db_create(&ctx, NULL, NULL);
     enter_api(&ctx);
   }
 
   void cut_teardown(void)
   {
     leave_api(&ctx);
+    grn_obj_close(&ctx, database);
     grn_ctx_fin(&ctx);
 
     if (base_dir) {

  Modified: test/unit/util/test-snip.c (+4 -1)
===================================================================
--- test/unit/util/test-snip.c    2012-12-14 12:20:38 +0900 (fdd97b5)
+++ test/unit/util/test-snip.c    2012-12-14 12:40:02 +0900 (60f30a0)
@@ -46,6 +46,7 @@ void test_add_cond_with_too_large_keyword(void);
 void test_add_cond_with_copy_tag_flag(void);
 
 static grn_ctx context;
+static grn_obj *database;
 static grn_snip *snip;
 static gchar *keyword;
 static gchar *result;
@@ -197,7 +198,8 @@ cut_shutdown(void)
 void
 cut_setup(void)
 {
-  grn_ctx_init(&context, GRN_CTX_USE_QL);
+  grn_ctx_init(&context, 0);
+  database = grn_db_create(&context, NULL, NULL);
 
   snip = NULL;
   keyword = NULL;
@@ -234,6 +236,7 @@ cut_teardown(void)
     g_free(default_close_tag);
   }
 
+  grn_obj_close(&context, database);
   grn_ctx_fin(&context);
 }
 

  Modified: test/unit/util/test-string.c (+4 -1)
===================================================================
--- test/unit/util/test-string.c    2012-12-14 12:20:38 +0900 (b97118f)
+++ test/unit/util/test-string.c    2012-12-14 12:40:02 +0900 (b1b617b)
@@ -55,6 +55,7 @@ void data_itoh(void);
 void test_itoh(gconstpointer data);
 
 static grn_ctx context;
+static grn_obj *database;
 static grn_obj buffer;
 
 static const gchar text_ja_utf8[] =
@@ -76,7 +77,8 @@ static const gchar normalized_text_ja_utf8[] =
 void
 setup (void)
 {
-  grn_ctx_init(&context, GRN_CTX_USE_QL);
+  grn_ctx_init(&context, 0);
+  database = grn_db_create(&context, NULL, NULL);
   GRN_VOID_INIT(&buffer);
 }
 
@@ -84,6 +86,7 @@ void
 teardown (void)
 {
   GRN_OBJ_FIN(&context, &buffer);
+  grn_obj_close(&context, database);
   grn_ctx_fin(&context);
 }
 




More information about the Groonga-commit mailing list
Back to archive index