[Groonga-commit] groonga/groonga [master] Create grn_string that supports custom normalization

Back to archive index

null+****@clear***** null+****@clear*****
2012年 6月 8日 (金) 16:39:06 JST


Kouhei Sutou	2012-06-08 16:39:06 +0900 (Fri, 08 Jun 2012)

  New Revision: db4408268daa4e055d095d799662b33da1b58adf

  Log:
    Create grn_string that supports custom normalization
    
    grn_str is deprecated. Use grn_string instead.
    
    refs #1164

  Added files:
    lib/string.c
    lib/string_in.h
  Modified files:
    include/groonga.h
    lib/dat.cpp
    lib/db.c
    lib/expr.c
    lib/ii.c
    lib/pat.c
    lib/snip.c
    lib/snip.h
    lib/sources.am
    lib/token.c
    lib/token.h
    lib/tokenizer.c

  Modified: include/groonga.h (+52 -1)
===================================================================
--- include/groonga.h    2012-06-08 17:38:04 +0900 (76ca58e)
+++ include/groonga.h    2012-06-08 16:39:06 +0900 (5c0d626)
@@ -414,6 +414,7 @@ typedef unsigned short int grn_obj_flags;
 #define GRN_ACCESSOR_VIEW              (0x0a)
 #define GRN_SNIP                       (0x0b)
 #define GRN_PATSNIP                    (0x0c)
+#define GRN_STRING                     (0x0d)
 #define GRN_CURSOR_TABLE_HASH_KEY      (0x10)
 #define GRN_CURSOR_TABLE_PAT_KEY       (0x11)
 #define GRN_CURSOR_TABLE_DAT_KEY       (0x12)
@@ -2435,7 +2436,7 @@ GRN_API void grn_time_now(grn_ctx *ctx, grn_obj *obj);
   grn_bulk_write((ctx), (obj), (char *)&_val, sizeof(grn_obj *));\
 } while (0)
 
-/* grn_str */
+/* grn_str: deprecated. use grn_string instead. */
 
 typedef struct {
   const char *orig;
@@ -2458,6 +2459,56 @@ GRN_API grn_str *grn_str_open(grn_ctx *ctx, const char *str, unsigned int str_le
                               int flags);
 GRN_API grn_rc grn_str_close(grn_ctx *ctx, grn_str *nstr);
 
+/* grn_string */
+
+#define GRN_STRING_REMOVE_BLANK (0x01<<0)
+#define GRN_STRING_WITH_TYPES   (0x01<<1)
+#define GRN_STRING_WITH_CHECKS  (0x01<<2)
+
+#define GRN_NORMALIZER_AUTO ((grn_obj *)1)
+
+#define GRN_CHAR_BLANK 0x80
+#define GRN_CHAR_IS_BLANK(c) ((c) & (GRN_CHAR_BLANK))
+#define GRN_CHAR_TYPE(c) ((c) & 0x7f)
+
+typedef enum {
+  grn_char_null = 0,
+  grn_char_alpha,
+  grn_char_digit,
+  grn_char_symbol,
+  grn_char_hiragana,
+  grn_char_katakana,
+  grn_char_kanji,
+  grn_char_others
+} grn_char_type;
+
+GRN_API grn_obj *grn_string_open(grn_ctx *ctx,
+                                 const char *string,
+                                 unsigned int length_in_bytes,
+                                 grn_obj *normalizer, int flags);
+GRN_API grn_rc grn_string_get_original(grn_ctx *ctx, grn_obj *string,
+                                       const char **original,
+                                       unsigned int *length_in_bytes);
+GRN_API int grn_string_get_flags(grn_ctx *ctx, grn_obj *string);
+GRN_API grn_rc grn_string_get_normalized(grn_ctx *ctx, grn_obj *string,
+                                         const char **normalized,
+                                         unsigned int *length_in_bytes,
+                                         unsigned int *n_characters);
+GRN_API grn_rc grn_string_set_normalized(grn_ctx *ctx, grn_obj *string,
+                                         char *original,
+                                         unsigned int length_in_bytes,
+                                         unsigned int n_characters);
+GRN_API const short *grn_string_get_checks(grn_ctx *ctx, grn_obj *string);
+GRN_API grn_rc grn_string_set_checks(grn_ctx *ctx,
+                                     grn_obj *string,
+                                     short *checks);
+GRN_API const unsigned char *grn_string_get_types(grn_ctx *ctx, grn_obj *string);
+GRN_API grn_rc grn_string_set_types(grn_ctx *ctx,
+                                    grn_obj *string,
+                                    unsigned char *types);
+GRN_API grn_encoding grn_string_get_encoding(grn_ctx *ctx, grn_obj *string);
+
+
 GRN_API int grn_charlen(grn_ctx *ctx, const char *str, const char *end);
 
 /* expr */

  Modified: lib/dat.cpp (+12 -9)
===================================================================
--- lib/dat.cpp    2012-06-08 17:38:04 +0900 (a20f398)
+++ lib/dat.cpp    2012-06-08 16:39:06 +0900 (552da69)
@@ -672,15 +672,17 @@ grn_dat_scan(grn_ctx *ctx, grn_dat *dat, const char *str,
   int num_scan_hits = 0;
   try {
     if (dat->obj.header.flags & GRN_OBJ_KEY_NORMALIZE) {
-      grn_str * const normalized_str = grn_str_open(
-          ctx, str, str_size, GRN_STR_NORMALIZE | GRN_STR_WITH_CHECKS);
-      if (!normalized_str) {
-        fprintf(stderr, "error: grn_str_open() failed!\n");
+      grn_obj *normalizer = GRN_NORMALIZER_AUTO;
+      int flags = GRN_STRING_WITH_CHECKS;
+      grn_obj * const normalized_string = grn_string_open(ctx, str, str_size,
+                                                          normalizer,
+                                                          flags);
+      if (!normalized_string) {
+        fprintf(stderr, "error: grn_string_open() failed!\n");
         return -1;
       }
-      str = normalized_str->norm;
-      str_size = normalized_str->norm_blen;
-      const short *checks = normalized_str->checks;
+      grn_string_get_normalized(ctx, normalized_string, &str, &str_size, NULL);
+      const short *checks = grn_string_get_checks(ctx, normalized_string);
       unsigned int offset = 0;
       while (str_size) {
         if (*checks) {
@@ -717,9 +719,10 @@ grn_dat_scan(grn_ctx *ctx, grn_dat *dat, const char *str,
         ++checks;
       }
       if (str_rest) {
-        *str_rest = normalized_str->orig + offset;
+        grn_string_get_original(ctx, normalized_string, str_rest, NULL);
+        *str_rest += offset;
       }
-      grn_str_close(ctx, normalized_str);
+      grn_obj_close(ctx, normalized_string);
     } else {
       const char * const begin = str;
       while (str_size) {

  Modified: lib/db.c (+11 -5)
===================================================================
--- lib/db.c    2012-06-08 17:38:04 +0900 (91d4ca4)
+++ lib/db.c    2012-06-08 16:39:06 +0900 (dcbde8e)
@@ -26,6 +26,7 @@
 #include "plugin_in.h"
 #include "geo.h"
 #include "snip.h"
+#include "string_in.h"
 #include "util.h"
 #include <string.h>
 #include <float.h>
@@ -34,12 +35,14 @@
 
 #define WITH_NORMALIZE(table,key,key_size,block) do {\
   if ((table)->obj.header.flags & GRN_OBJ_KEY_NORMALIZE) {\
-    grn_str *nstr;\
-    if ((nstr = grn_str_open(ctx, key, key_size, GRN_STR_NORMALIZE))) { \
-      char *key = nstr->norm;\
-      unsigned int key_size = nstr->norm_blen;\
+    grn_obj *nstr;\
+    if ((nstr = grn_string_open(ctx, key, key_size,\
+                                GRN_NORMALIZER_AUTO, 0))) {\
+      const char *key;\
+      unsigned int key_size;\
+      grn_string_get_normalized(ctx, nstr, &key, &key_size, NULL);\
       block\
-      grn_str_close(ctx, nstr);\
+      grn_obj_close(ctx, nstr);\
     }\
   } else {\
     block\
@@ -6926,6 +6929,9 @@ grn_obj_close(grn_ctx *ctx, grn_obj *obj)
     case GRN_SNIP :
       rc = grn_snip_close_real(ctx, (grn_snip *)obj);
       break;
+    case GRN_STRING :
+      rc = grn_string_close(ctx, obj);
+      break;
     case GRN_CURSOR_TABLE_PAT_KEY :
       grn_pat_cursor_close(ctx, (grn_pat_cursor *)obj);
       break;

  Modified: lib/expr.c (+12 -7)
===================================================================
--- lib/expr.c    2012-06-08 17:38:04 +0900 (ee97a01)
+++ lib/expr.c    2012-06-08 16:39:06 +0900 (cd77050)
@@ -2240,13 +2240,14 @@ grn_proc_call(grn_ctx *ctx, grn_obj *proc, int nargs, grn_obj *caller)
 void
 pseudo_query_scan(grn_ctx *ctx, grn_obj *x, grn_obj *y, grn_obj *res)
 {
-  grn_str *a = NULL, *b = NULL;
+  grn_obj *a = NULL, *b = NULL;
 
   switch (x->header.domain) {
   case GRN_DB_SHORT_TEXT:
   case GRN_DB_TEXT:
   case GRN_DB_LONG_TEXT:
-    a = grn_str_open(ctx, GRN_TEXT_VALUE(x), GRN_TEXT_LEN(x), GRN_STR_NORMALIZE);
+    a = grn_string_open(ctx, GRN_TEXT_VALUE(x), GRN_TEXT_LEN(x),
+                        GRN_NORMALIZER_AUTO, 0);
     break;
   default:
     break;
@@ -2256,23 +2257,27 @@ pseudo_query_scan(grn_ctx *ctx, grn_obj *x, grn_obj *y, grn_obj *res)
   case GRN_DB_SHORT_TEXT:
   case GRN_DB_TEXT:
   case GRN_DB_LONG_TEXT:
-    b = grn_str_open(ctx, GRN_TEXT_VALUE(y), GRN_TEXT_LEN(y), GRN_STR_NORMALIZE);
+    b = grn_string_open(ctx, GRN_TEXT_VALUE(y), GRN_TEXT_LEN(y),
+                        GRN_NORMALIZER_AUTO, 0);
     break;
   default:
     break;
   }
 
   /* normalized str doesn't contain '\0'. */
-  if (a && b && strstr(a->norm, b->norm)) {
-    GRN_INT32_SET(ctx, res, 1);
+  if (a && b) {
+    const char *a_norm, *b_norm;
+    grn_string_get_normalized(ctx, a, &a_norm, NULL, NULL);
+    grn_string_get_normalized(ctx, b, &b_norm, NULL, NULL);
+    GRN_INT32_SET(ctx, res, strstr(a_norm, b_norm) != NULL);
   } else {
     GRN_INT32_SET(ctx, res, 0);
   }
   res->header.type = GRN_BULK;
   res->header.domain = GRN_DB_INT32;
 
-  if (a) { grn_str_close(ctx, a); }
-  if (b) { grn_str_close(ctx, b); }
+  if (a) { grn_obj_close(ctx, a); }
+  if (b) { grn_obj_close(ctx, b); }
 }
 
 grn_obj *

  Modified: lib/ii.c (+9 -5)
===================================================================
--- lib/ii.c    2012-06-08 17:38:04 +0900 (c55b1ef)
+++ lib/ii.c    2012-06-08 16:39:06 +0900 (3e27e97)
@@ -5776,7 +5776,9 @@ grn_ii_term_extract(grn_ctx *ctx, grn_ii *ii, const char *string,
   grn_rset_posinfo pi;
   grn_id tid;
   const char *p, *pe;
-  grn_str *nstr;
+  grn_obj *nstr;
+  const char *normalized;
+  unsigned int normalized_length_in_bytes;
   grn_ii_cursor *c;
   grn_ii_posting *pos;
   int skip, rep, policy;
@@ -5785,7 +5787,7 @@ grn_ii_term_extract(grn_ctx *ctx, grn_ii *ii, const char *string,
   if (!ii || !string || !string_len || !s || !optarg) {
     return GRN_INVALID_ARGUMENT;
   }
-  if (!(nstr = grn_str_open(ctx, string, string_len, 0))) {
+  if (!(nstr = grn_string_open(ctx, string, string_len, NULL, 0))) {
     return GRN_INVALID_ARGUMENT;
   }
   policy = optarg->max_interval;
@@ -5801,7 +5803,9 @@ grn_ii_term_extract(grn_ctx *ctx, grn_ii *ii, const char *string,
   rep = (s->record_unit == grn_rec_position || s->subrec_unit == grn_rec_position);
   */
   rep = 0;
-  for (p = nstr->norm, pe = p + nstr->norm_blen; p < pe; p += skip) {
+  grn_string_get_normalized(ctx, nstr, &normalized, &normalized_length_in_bytes,
+                            NULL);
+  for (p = normalized, pe = p + normalized_length_in_bytes; p < pe; p += skip) {
     if ((tid = grn_table_lcp_search(ctx, ii->lexicon, p, pe - p))) {
       if (policy == TERM_EXTRACT_EACH_POST) {
         if (!(skip = grn_table_get_key(ctx, ii->lexicon, tid, NULL, 0))) { break; }
@@ -5827,7 +5831,7 @@ grn_ii_term_extract(grn_ctx *ctx, grn_ii *ii, const char *string,
         while (grn_ii_cursor_next(ctx, c)) {
           if (policy == TERM_EXTRACT_EACH_POST) {
             pi.rid = c->post->rid;
-            pi.sid = p - nstr->norm;
+            pi.sid = p - normalized;
             res_add(ctx, s, &pi, pi.sid + 1, op);
           } else {
             pos = c->post;
@@ -5843,7 +5847,7 @@ grn_ii_term_extract(grn_ctx *ctx, grn_ii *ii, const char *string,
       }
     }
   }
-  grn_str_close(ctx, nstr);
+  grn_obj_close(ctx, nstr);
   return rc;
 }
 

  Modified: lib/pat.c (+13 -5)
===================================================================
--- lib/pat.c    2012-06-08 17:38:04 +0900 (6e0c377)
+++ lib/pat.c    2012-06-08 16:39:06 +0900 (6722ab9)
@@ -1527,11 +1527,16 @@ grn_pat_scan(grn_ctx *ctx, grn_pat *pat, const char *str, unsigned int str_len,
   int n = 0;
   grn_id tid;
   if (pat->obj.header.flags & GRN_OBJ_KEY_NORMALIZE) {
-    grn_str *nstr = grn_str_open(ctx, str, str_len, GRN_STR_NORMALIZE|GRN_STR_WITH_CHECKS);
+    grn_obj *nstr = grn_string_open(ctx, str, str_len,
+                                    GRN_NORMALIZER_AUTO, GRN_STRING_WITH_CHECKS);
     if (nstr) {
-      int16_t *cp = nstr->checks;
+      const short *cp = grn_string_get_checks(ctx, nstr);
       unsigned int offset = 0, offset0 = 0;
-      const char *sp = nstr->norm, *se = nstr->norm + nstr->norm_blen;
+      unsigned int normalized_length_in_bytes;
+      const char *sp, *se;
+      grn_string_get_normalized(ctx, nstr, &sp, &normalized_length_in_bytes,
+                                NULL);
+      se = sp + normalized_length_in_bytes;
       while (n < sh_size) {
         if ((tid = grn_pat_lcp_search(ctx, pat, sp, se - sp))) {
           uint32_t len;
@@ -1552,8 +1557,11 @@ grn_pat_scan(grn_ctx *ctx, grn_pat *pat, const char *str, unsigned int str_len,
         }
         if (se <= sp) { offset = str_len; break; }
       }
-      if (rest) { *rest = nstr->orig + offset; }
-      grn_str_close(ctx, nstr);
+      if (rest) {
+        grn_string_get_original(ctx, nstr, rest, NULL);
+        *rest += offset;
+      }
+      grn_obj_close(ctx, nstr);
     } else {
       n = -1;
       if (rest) { *rest = str; }

  Modified: lib/snip.c (+55 -36)
===================================================================
--- lib/snip.c    2012-06-08 17:38:04 +0900 (02a5c10)
+++ lib/snip.c    2012-06-08 16:39:06 +0900 (ffadc90)
@@ -83,31 +83,31 @@ grn_bm_preBmBc(const unsigned char *x, size_t m, size_t *bmBc)
 }
 
 #define GRN_BM_COMPARE do { \
-  if (object->checks[found]) { \
+  if (string_checks[found]) { \
     size_t offset = cond->last_offset, found_alpha_head = cond->found_alpha_head; \
     /* calc real offset */\
     for (i = cond->last_found; i < found; i++) { \
-      if (object->checks[i] > 0) { \
+      if (string_checks[i] > 0) { \
         found_alpha_head = i; \
-        offset += object->checks[i]; \
+        offset += string_checks[i]; \
       } \
     } \
     /* if real offset is in a character, move it the head of the character */ \
-    if (object->checks[found] < 0) { \
-      offset -= object->checks[found_alpha_head]; \
+    if (string_checks[found] < 0) { \
+      offset -= string_checks[found_alpha_head]; \
       cond->last_found = found_alpha_head; \
     } else { \
       cond->last_found = found; \
     } \
     cond->start_offset = cond->last_offset = offset; \
     if (flags & GRN_SNIP_SKIP_LEADING_SPACES) { \
-      while (cond->start_offset < object->orig_blen && \
-             (i = grn_isspace(object->orig + cond->start_offset, \
-                              object->encoding))) { cond->start_offset += i; } \
+      while (cond->start_offset < string_original_length_in_bytes && \
+             (i = grn_isspace(string_original + cond->start_offset, \
+                              string_encoding))) { cond->start_offset += i; } \
     } \
     for (i = cond->last_found; i < found + m; i++) { \
-      if (object->checks[i] > 0) { \
-        offset += object->checks[i]; \
+      if (string_checks[i] > 0) { \
+        offset += string_checks[i]; \
       } \
     } \
     cond->end_offset = offset; \
@@ -130,7 +130,7 @@ grn_bm_preBmBc(const unsigned char *x, size_t m, size_t *bmBc)
 } while (0)
 
 void
-grn_bm_tunedbm(snip_cond *cond, grn_str *object, int flags)
+grn_bm_tunedbm(grn_ctx *ctx, snip_cond *cond, grn_obj *string, int flags)
 {
   register unsigned char *limit, ck;
   register const unsigned char *p, *cp;
@@ -140,13 +140,25 @@ grn_bm_tunedbm(snip_cond *cond, grn_str *object, int flags)
   unsigned char *y;
   size_t shift, found;
 
-  const size_t n = object->norm_blen, m = cond->keyword->norm_blen;
-
-  y = (unsigned char *) object->norm;
+  const char *string_original;
+  unsigned int string_original_length_in_bytes;
+  const short *string_checks;
+  grn_encoding string_encoding;
+  const char *string_norm, *keyword_norm;
+  unsigned int n, m;
+
+  grn_string_get_original(ctx, string,
+                          &string_original, &string_original_length_in_bytes);
+  string_checks = grn_string_get_checks(ctx, string);
+  string_encoding = grn_string_get_encoding(ctx, string);
+  grn_string_get_normalized(ctx, string, &string_norm, &n, NULL);
+  grn_string_get_normalized(ctx, cond->keyword, &keyword_norm, &m, NULL);
+
+  y = (unsigned char *)string_norm;
   if (m == 1) {
     if (n > cond->found) {
       shift = 1;
-      p = memchr(y + cond->found, cond->keyword->norm[0], n - cond->found);
+      p = memchr(y + cond->found, keyword_norm[0], n - cond->found);
       if (p != NULL) {
         found = p - y;
         GRN_BM_COMPARE;
@@ -156,7 +168,7 @@ grn_bm_tunedbm(snip_cond *cond, grn_str *object, int flags)
     return;
   }
 
-  x = (unsigned char *) cond->keyword->norm;
+  x = (unsigned char *)keyword_norm;
   bmBc = cond->bmBc;
   shift = cond->shift;
 
@@ -240,7 +252,7 @@ grn_snip_cond_close(grn_ctx *ctx, snip_cond *cond)
     return GRN_INVALID_ARGUMENT;
   }
   if (cond->keyword) {
-    grn_str_close(ctx, cond->keyword);
+    grn_obj_close(ctx, cond->keyword);
   }
   return GRN_SUCCESS;
 }
@@ -249,23 +261,27 @@ grn_rc
 grn_snip_cond_init(grn_ctx *ctx, snip_cond *sc, const char *keyword, unsigned int keyword_len,
                 grn_encoding enc, int flags)
 {
-  size_t norm_blen;
+  const char *norm;
+  unsigned int norm_blen;
+  grn_obj *normalizer = NULL;
   int f = GRN_STR_REMOVEBLANK;
   memset(sc, 0, sizeof(snip_cond));
-  if (flags & GRN_SNIP_NORMALIZE) { f |= GRN_STR_NORMALIZE; }
-  if (!(sc->keyword = grn_str_open(ctx, keyword, keyword_len, f))) {
-    GRN_LOG(ctx, GRN_LOG_ALERT, "grn_str_open on snip_cond_init failed !");
+  if (flags & GRN_SNIP_NORMALIZE) { normalizer = GRN_NORMALIZER_AUTO; }
+  if (!(sc->keyword = grn_string_open(ctx, keyword, keyword_len,
+                                      normalizer, f))) {
+    GRN_LOG(ctx, GRN_LOG_ALERT,
+            "grn_string_open on snip_cond_init failed!");
     return GRN_NO_MEMORY_AVAILABLE;
   }
-  norm_blen = sc->keyword->norm_blen; /* byte length, not cond->keyword->length */
+  grn_string_get_normalized(ctx, sc->keyword, &norm, &norm_blen, NULL);
   if (!norm_blen) {
     grn_snip_cond_close(ctx, sc);
     return GRN_INVALID_ARGUMENT;
   }
   if (norm_blen != 1) {
-    grn_bm_preBmBc((unsigned char *)sc->keyword->norm, norm_blen, sc->bmBc);
-    sc->shift = sc->bmBc[(unsigned char)sc->keyword->norm[norm_blen - 1]];
-    sc->bmBc[(unsigned char)sc->keyword->norm[norm_blen - 1]] = 0;
+    grn_bm_preBmBc((unsigned char *)norm, norm_blen, sc->bmBc);
+    sc->shift = sc->bmBc[(unsigned char)norm[norm_blen - 1]];
+    sc->bmBc[(unsigned char)norm[norm_blen - 1]] = 0;
   }
   return GRN_SUCCESS;
 }
@@ -332,6 +348,7 @@ grn_snip_add_cond(grn_ctx *ctx, grn_snip *snip,
   grn_rc rc;
   int copy_tag;
   snip_cond *cond;
+  unsigned int norm_blen;
 
   if (!snip || !keyword || !keyword_len || snip->cond_len >= MAX_SNIP_COND_COUNT) {
     return GRN_INVALID_ARGUMENT;
@@ -341,7 +358,8 @@ grn_snip_add_cond(grn_ctx *ctx, grn_snip *snip,
                                snip->encoding, snip->flags))) {
     return rc;
   }
-  if (cond->keyword->norm_blen > snip->width) {
+  grn_string_get_normalized(ctx, cond->keyword, NULL, &norm_blen, NULL);
+  if (norm_blen > snip->width) {
     grn_snip_cond_close(ctx, cond);
     return GRN_INVALID_ARGUMENT;
   }
@@ -490,7 +508,7 @@ exec_clean(grn_ctx *ctx, grn_snip *snip)
 {
   snip_cond *cond, *cond_end;
   if (snip->nstr) {
-    grn_str_close(ctx, snip->nstr);
+    grn_obj_close(ctx, snip->nstr);
     snip->nstr = NULL;
   }
   snip->tag_count = 0;
@@ -522,7 +540,7 @@ grn_snip_close_real(grn_ctx *ctx, grn_snip *snip)
     if (dct) { GRN_FREE((void *)dct); }
   }
   if (snip->nstr) {
-    grn_str_close(ctx, snip->nstr);
+    grn_obj_close(ctx, snip->nstr);
   }
   for (cond = snip->cond, cond_end = cond + snip->cond_len;
        cond < cond_end; cond++) {
@@ -547,6 +565,7 @@ grn_snip_exec(grn_ctx *ctx, grn_snip *snip, const char *string, unsigned int str
               unsigned int *nresults, unsigned int *max_tagged_len)
 {
   size_t i;
+  grn_obj *normalizer = NULL;
   int f = GRN_STR_WITH_CHECKS|GRN_STR_REMOVEBLANK;
   if (!snip || !string || !nresults || !max_tagged_len) {
     return GRN_INVALID_ARGUMENT;
@@ -554,15 +573,15 @@ grn_snip_exec(grn_ctx *ctx, grn_snip *snip, const char *string, unsigned int str
   GRN_API_ENTER;
   exec_clean(ctx, snip);
   *nresults = 0;
-  if (snip->flags & GRN_SNIP_NORMALIZE) { f |= GRN_STR_NORMALIZE; }
-  snip->nstr = grn_str_open(ctx, string, string_len, f);
+  if (snip->flags & GRN_SNIP_NORMALIZE) { normalizer = GRN_NORMALIZER_AUTO; }
+  snip->nstr = grn_string_open(ctx, string, string_len, normalizer, f);
   if (!snip->nstr) {
     exec_clean(ctx, snip);
-    GRN_LOG(ctx, GRN_LOG_ALERT, "grn_str_open on grn_snip_exec failed !");
+    GRN_LOG(ctx, GRN_LOG_ALERT, "grn_string_open on grn_snip_exec failed !");
     GRN_API_RETURN(ctx->rc);
   }
   for (i = 0; i < snip->cond_len; i++) {
-    grn_bm_tunedbm(snip->cond + i, snip->nstr, snip->flags);
+    grn_bm_tunedbm(ctx, snip->cond + i, snip->nstr, snip->flags);
   }
 
   {
@@ -610,7 +629,7 @@ grn_snip_exec(grn_ctx *ctx, grn_snip *snip, const char *string, unsigned int str
               }
             }
             if (exclude_other_cond) {
-              grn_bm_tunedbm(cond, snip->nstr, snip->flags);
+              grn_bm_tunedbm(ctx, cond, snip->nstr, snip->flags);
               continue;
             }
           }
@@ -623,7 +642,7 @@ grn_snip_exec(grn_ctx *ctx, grn_snip *snip, const char *string, unsigned int str
           /* check nesting to make valid HTML */
           /* ToDo: allow <test><te>te</te><st>st</st></test> */
           if (cond->start_offset < last_tag_end) {
-            grn_bm_tunedbm(cond, snip->nstr, snip->flags);
+            grn_bm_tunedbm(ctx, cond, snip->nstr, snip->flags);
             continue;
           }
         }
@@ -631,7 +650,7 @@ grn_snip_exec(grn_ctx *ctx, grn_snip *snip, const char *string, unsigned int str
           /* If a keyword gets across a snippet, */
           /* it was skipped and never to be tagged. */
           cond->stopflag = SNIPCOND_ACROSS;
-          grn_bm_tunedbm(cond, snip->nstr, snip->flags);
+          grn_bm_tunedbm(ctx, cond, snip->nstr, snip->flags);
         } else {
           found_cond = 1;
           if (cond->count == 0) {
@@ -650,7 +669,7 @@ grn_snip_exec(grn_ctx *ctx, grn_snip *snip, const char *string, unsigned int str
           if (++snip->tag_count >= MAX_SNIP_TAG_COUNT) {
             break;
           }
-          grn_bm_tunedbm(cond, snip->nstr, snip->flags);
+          grn_bm_tunedbm(ctx, cond, snip->nstr, snip->flags);
         }
       }
       if (!found_cond) {

  Modified: lib/snip.h (+4 -4)
===================================================================
--- lib/snip.h    2012-06-08 17:38:04 +0900 (f328af4)
+++ lib/snip.h    2012-06-08 16:39:06 +0900 (12d87c3)
@@ -1,5 +1,5 @@
 /* -*- c-basic-offset: 2 -*- */
-/* Copyright(C) 2009 Brazil
+/* Copyright(C) 2009-2012 Brazil
 
   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -50,7 +50,7 @@ typedef struct _snip_cond
   const char *closetag;
   size_t opentag_len;
   size_t closetag_len;
-  grn_str *keyword;
+  grn_obj *keyword;
 
   /* Tuned BM pre */
   size_t bmBc[ASIZE];
@@ -108,7 +108,7 @@ struct _grn_snip
   unsigned int snip_count;
 
   const char *string;
-  grn_str *nstr;
+  grn_obj *nstr;
 
   _snip_result snip_result[MAX_SNIP_RESULT_COUNT];
   _snip_tag_result tag_result[MAX_SNIP_TAG_COUNT];
@@ -121,7 +121,7 @@ grn_rc grn_snip_cond_init(grn_ctx *ctx, snip_cond *sc, const char *keyword, unsi
                           grn_encoding enc, int flags);
 void grn_snip_cond_reinit(snip_cond *cond);
 grn_rc grn_snip_cond_close(grn_ctx *ctx, snip_cond *cond);
-void grn_bm_tunedbm(snip_cond *cond, grn_str *object, int flags);
+void grn_bm_tunedbm(grn_ctx *ctx, snip_cond *cond, grn_obj *string, int flags);
 
 #ifdef __cplusplus
 }

  Modified: lib/sources.am (+2 -0)
===================================================================
--- lib/sources.am    2012-06-08 17:38:04 +0900 (b0966a3)
+++ lib/sources.am    2012-06-08 16:39:06 +0900 (61e888f)
@@ -34,6 +34,8 @@ libgroonga_la_SOURCES =				\
 	store.h					\
 	str.c					\
 	str.h					\
+	string.c				\
+	string_in.h				\
 	token.c					\
 	token.h					\
 	tokenizer.c				\

  Added: lib/string.c (+1369 -0) 100644
===================================================================
--- /dev/null
+++ lib/string.c    2012-06-08 16:39:06 +0900 (6b0cfec)
@@ -0,0 +1,1369 @@
+/* -*- c-basic-offset: 2 -*- */
+/*
+  Copyright(C) 2009-2012 Brazil
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License version 2.1 as published by the Free Software Foundation.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+#include "groonga_in.h"
+#include <string.h>
+#include "string_in.h"
+#include "str.h"
+
+static unsigned char symbol[] = {
+  ',', '.', 0, ':', ';', '?', '!', 0, 0, 0, '`', 0, '^', '~', '_', 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, '-', '-', '/', '\\', 0, 0, '|', 0, 0, 0, '\'', 0,
+  '"', '(', ')', 0, 0, '[', ']', '{', '}', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  '+', '-', 0, 0, 0, '=', 0, '<', '>', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  '$', 0, 0, '%', '#', '&', '*', '@', 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+inline static grn_obj *
+eucjp_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
+                grn_user_data *user_data)
+{
+  static uint16_t hankana[] = {
+    0xa1a1, 0xa1a3, 0xa1d6, 0xa1d7, 0xa1a2, 0xa1a6, 0xa5f2, 0xa5a1, 0xa5a3,
+    0xa5a5, 0xa5a7, 0xa5a9, 0xa5e3, 0xa5e5, 0xa5e7, 0xa5c3, 0xa1bc, 0xa5a2,
+    0xa5a4, 0xa5a6, 0xa5a8, 0xa5aa, 0xa5ab, 0xa5ad, 0xa5af, 0xa5b1, 0xa5b3,
+    0xa5b5, 0xa5b7, 0xa5b9, 0xa5bb, 0xa5bd, 0xa5bf, 0xa5c1, 0xa5c4, 0xa5c6,
+    0xa5c8, 0xa5ca, 0xa5cb, 0xa5cc, 0xa5cd, 0xa5ce, 0xa5cf, 0xa5d2, 0xa5d5,
+    0xa5d8, 0xa5db, 0xa5de, 0xa5df, 0xa5e0, 0xa5e1, 0xa5e2, 0xa5e4, 0xa5e6,
+    0xa5e8, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5ef, 0xa5f3, 0xa1ab,
+    0xa1eb
+  };
+  static unsigned char dakuten[] = {
+    0xf4, 0, 0, 0, 0, 0xac, 0, 0xae, 0, 0xb0, 0, 0xb2, 0, 0xb4, 0, 0xb6, 0,
+    0xb8, 0, 0xba, 0, 0xbc, 0, 0xbe, 0, 0xc0, 0, 0xc2, 0, 0, 0xc5, 0, 0xc7,
+    0, 0xc9, 0, 0, 0, 0, 0, 0, 0xd0, 0, 0, 0xd3, 0, 0, 0xd6, 0, 0, 0xd9, 0,
+    0, 0xdc
+  };
+  static unsigned char handaku[] = {
+    0xd1, 0, 0, 0xd4, 0, 0, 0xd7, 0, 0, 0xda, 0, 0, 0xdd
+  };
+  grn_string *nstr = (grn_string *)args[0];
+  int16_t *ch;
+  const unsigned char *s, *s_, *e;
+  unsigned char *d, *d0, *d_, b;
+  uint_least8_t *cp, *ctypes, ctype;
+  size_t size = nstr->original_length_in_bytes, length = 0;
+  int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
+  if (!(nstr->normalized = GRN_MALLOC(size * 2 + 1))) {
+    ERR(GRN_NO_MEMORY_AVAILABLE,
+        "[strinig][eucjp] failed to allocate normalized text space");
+    return NULL;
+  }
+  d0 = (unsigned char *) nstr->normalized;
+  if (nstr->flags & GRN_STRING_WITH_CHECKS) {
+    if (!(nstr->checks = GRN_MALLOC(size * 2 * sizeof(int16_t) + 1))) {
+      GRN_FREE(nstr->normalized);
+      nstr->normalized = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[strinig][eucjp] failed to allocate checks space");
+      return NULL;
+    }
+  }
+  ch = nstr->checks;
+  if (nstr->flags & GRN_STRING_WITH_TYPES) {
+    if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
+      GRN_FREE(nstr->checks);
+      GRN_FREE(nstr->normalized);
+      nstr->checks = NULL;
+      nstr->normalized = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[strinig][eucjp] failed to allocate character types space");
+      return NULL;
+    }
+  }
+  cp = ctypes = nstr->ctypes;
+  e = (unsigned char *)nstr->original + size;
+  for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) {
+    if ((*s & 0x80)) {
+      if (((s + 1) < e) && (*(s + 1) & 0x80)) {
+        unsigned char c1 = *s++, c2 = *s, c3 = 0;
+        switch (c1 >> 4) {
+        case 0x08 :
+          if (c1 == 0x8e && 0xa0 <= c2 && c2 <= 0xdf) {
+            uint16_t c = hankana[c2 - 0xa0];
+            switch (c) {
+            case 0xa1ab :
+              if (d > d0 + 1 && d[-2] == 0xa5
+                  && 0xa6 <= d[-1] && d[-1] <= 0xdb && (b = dakuten[d[-1] - 0xa6])) {
+                *(d - 1) = b;
+                if (ch) { ch[-1] += 2; s_ += 2; }
+                continue;
+              } else {
+                *d++ = c >> 8; *d = c & 0xff;
+              }
+              break;
+            case 0xa1eb :
+              if (d > d0 + 1 && d[-2] == 0xa5
+                  && 0xcf <= d[-1] && d[-1] <= 0xdb && (b = handaku[d[-1] - 0xcf])) {
+                *(d - 1) = b;
+                if (ch) { ch[-1] += 2; s_ += 2; }
+                continue;
+              } else {
+                *d++ = c >> 8; *d = c & 0xff;
+              }
+              break;
+            default :
+              *d++ = c >> 8; *d = c & 0xff;
+              break;
+            }
+            ctype = grn_char_katakana;
+          } else {
+            *d++ = c1; *d = c2;
+            ctype = grn_char_others;
+          }
+          break;
+        case 0x09 :
+          *d++ = c1; *d = c2;
+          ctype = grn_char_others;
+          break;
+        case 0x0a :
+          switch (c1 & 0x0f) {
+          case 1 :
+            switch (c2) {
+            case 0xbc :
+              *d++ = c1; *d = c2;
+              ctype = grn_char_katakana;
+              break;
+            case 0xb9 :
+              *d++ = c1; *d = c2;
+              ctype = grn_char_kanji;
+              break;
+            case 0xa1 :
+              if (removeblankp) {
+                if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
+                continue;
+              } else {
+                *d = ' ';
+                ctype = GRN_CHAR_BLANK|grn_char_symbol;
+              }
+              break;
+            default :
+              if (c2 >= 0xa4 && (c3 = symbol[c2 - 0xa4])) {
+                *d = c3;
+                ctype = grn_char_symbol;
+              } else {
+                *d++ = c1; *d = c2;
+                ctype = grn_char_others;
+              }
+              break;
+            }
+            break;
+          case 2 :
+            *d++ = c1; *d = c2;
+            ctype = grn_char_symbol;
+            break;
+          case 3 :
+            c3 = c2 - 0x80;
+            if ('a' <= c3 && c3 <= 'z') {
+              ctype = grn_char_alpha;
+              *d = c3;
+            } else if ('A' <= c3 && c3 <= 'Z') {
+              ctype = grn_char_alpha;
+              *d = c3 + 0x20;
+            } else if ('0' <= c3 && c3 <= '9') {
+              ctype = grn_char_digit;
+              *d = c3;
+            } else {
+              ctype = grn_char_others;
+              *d++ = c1; *d = c2;
+            }
+            break;
+          case 4 :
+            *d++ = c1; *d = c2;
+            ctype = grn_char_hiragana;
+            break;
+          case 5 :
+            *d++ = c1; *d = c2;
+            ctype = grn_char_katakana;
+            break;
+          case 6 :
+          case 7 :
+          case 8 :
+            *d++ = c1; *d = c2;
+            ctype = grn_char_symbol;
+            break;
+          default :
+            *d++ = c1; *d = c2;
+            ctype = grn_char_others;
+            break;
+          }
+          break;
+        default :
+          *d++ = c1; *d = c2;
+          ctype = grn_char_kanji;
+          break;
+        }
+      } else {
+        /* skip invalid character */
+        continue;
+      }
+    } else {
+      unsigned char c = *s;
+      switch (c >> 4) {
+      case 0 :
+      case 1 :
+        /* skip unprintable ascii */
+        if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
+        continue;
+      case 2 :
+        if (c == 0x20) {
+          if (removeblankp) {
+            if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
+            continue;
+          } else {
+            *d = ' ';
+            ctype = GRN_CHAR_BLANK|grn_char_symbol;
+          }
+        } else {
+          *d = c;
+          ctype = grn_char_symbol;
+        }
+        break;
+      case 3 :
+        *d = c;
+        ctype = (c <= 0x39) ? grn_char_digit : grn_char_symbol;
+        break;
+      case 4 :
+        *d = ('A' <= c) ? c + 0x20 : c;
+        ctype = (c == 0x40) ? grn_char_symbol : grn_char_alpha;
+        break;
+      case 5 :
+        *d = (c <= 'Z') ? c + 0x20 : c;
+        ctype = (c <= 0x5a) ? grn_char_alpha : grn_char_symbol;
+        break;
+      case 6 :
+        *d = c;
+        ctype = (c == 0x60) ? grn_char_symbol : grn_char_alpha;
+        break;
+      case 7 :
+        *d = c;
+        ctype = (c <= 0x7a) ? grn_char_alpha : (c == 0x7f ? grn_char_others : grn_char_symbol);
+        break;
+      default :
+        *d = c;
+        ctype = grn_char_others;
+        break;
+      }
+    }
+    d++;
+    length++;
+    if (cp) { *cp++ = ctype; }
+    if (ch) {
+      *ch++ = (int16_t)(s + 1 - s_);
+      s_ = s + 1;
+      while (++d_ < d) { *ch++ = 0; }
+    }
+  }
+  if (cp) { *cp = grn_char_null; }
+  *d = '\0';
+  nstr->n_characters = length;
+  nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
+  return NULL;
+}
+
+/*
+ * Normalizes the Shift_JIS text of the grn_string passed as args[0].
+ *
+ * - Half-width katakana (0xa0..0xdf) are widened through hankana[]; a
+ *   following half-width voiced / semi-voiced mark is folded into the
+ *   previously emitted katakana via dakuten[] / handaku[].
+ * - Two-byte characters with lead byte 0x81/0x82 that have an ASCII
+ *   counterpart (symbols, digits, letters) are folded to that single byte,
+ *   letters being lower-cased.
+ * - ASCII letters are lower-cased, bytes below 0x20 are dropped and blanks
+ *   are dropped when GRN_STRING_REMOVE_BLANK is set.
+ *
+ * Fills nstr->normalized, nstr->normalized_length_in_bytes and
+ * nstr->n_characters, plus nstr->checks / nstr->ctypes when the matching
+ * GRN_STRING_WITH_* flag is set.  nargs and user_data are unused.
+ *
+ * Always returns NULL; allocation failures are reported through ERR().
+ */
+inline static grn_obj *
+sjis_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
+               grn_user_data *user_data)
+{
+  /* Two-byte replacement for each half-width byte 0xa0..0xdf. */
+  static uint16_t hankana[] = {
+    0x8140, 0x8142, 0x8175, 0x8176, 0x8141, 0x8145, 0x8392, 0x8340, 0x8342,
+    0x8344, 0x8346, 0x8348, 0x8383, 0x8385, 0x8387, 0x8362, 0x815b, 0x8341,
+    0x8343, 0x8345, 0x8347, 0x8349, 0x834a, 0x834c, 0x834e, 0x8350, 0x8352,
+    0x8354, 0x8356, 0x8358, 0x835a, 0x835c, 0x835e, 0x8360, 0x8363, 0x8365,
+    0x8367, 0x8369, 0x836a, 0x836b, 0x836c, 0x836d, 0x836e, 0x8371, 0x8374,
+    0x8377, 0x837a, 0x837d, 0x837e, 0x8380, 0x8381, 0x8382, 0x8384, 0x8386,
+    0x8388, 0x8389, 0x838a, 0x838b, 0x838c, 0x838d, 0x838f, 0x8393, 0x814a,
+    0x814b
+  };
+  /* Indexed by (trail byte - 0x45): trail byte of the voiced form, 0 if none. */
+  static unsigned char dakuten[] = {
+    0x94, 0, 0, 0, 0, 0x4b, 0, 0x4d, 0, 0x4f, 0, 0x51, 0, 0x53, 0, 0x55, 0,
+    0x57, 0, 0x59, 0, 0x5b, 0, 0x5d, 0, 0x5f, 0, 0x61, 0, 0, 0x64, 0, 0x66,
+    0, 0x68, 0, 0, 0, 0, 0, 0, 0x6f, 0, 0, 0x72, 0, 0, 0x75, 0, 0, 0x78, 0,
+    0, 0x7b
+  };
+  /* Indexed by (trail byte - 0x6e): trail byte of the semi-voiced form, 0 if none. */
+  static unsigned char handaku[] = {
+    0x70, 0, 0, 0x73, 0, 0, 0x76, 0, 0, 0x79, 0, 0, 0x7c
+  };
+  grn_string *nstr = (grn_string *)args[0];
+  int16_t *ch;
+  const unsigned char *s, *s_;
+  unsigned char *d, *d0, *d_, b, *e;
+  uint_least8_t *cp, *ctypes, ctype;
+  size_t size = nstr->original_length_in_bytes, length = 0;
+  int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
+  /* A half-width katakana byte becomes two bytes, hence size * 2. */
+  if (!(nstr->normalized = GRN_MALLOC(size * 2 + 1))) {
+    ERR(GRN_NO_MEMORY_AVAILABLE,
+        "[string][sjis] failed to allocate normalized text space");
+    return NULL;
+  }
+  d0 = (unsigned char *) nstr->normalized;
+  if (nstr->flags & GRN_STRING_WITH_CHECKS) {
+    if (!(nstr->checks = GRN_MALLOC(size * 2 * sizeof(int16_t) + 1))) {
+      GRN_FREE(nstr->normalized);
+      nstr->normalized = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[string][sjis] failed to allocate checks space");
+      return NULL;
+    }
+  }
+  ch = nstr->checks;
+  if (nstr->flags & GRN_STRING_WITH_TYPES) {
+    if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
+      /* checks is only allocated when GRN_STRING_WITH_CHECKS was requested,
+         so release it only when it really exists. */
+      if (nstr->checks) {
+        GRN_FREE(nstr->checks);
+        nstr->checks = NULL;
+      }
+      GRN_FREE(nstr->normalized);
+      nstr->normalized = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[string][sjis] failed to allocate character types space");
+      return NULL;
+    }
+  }
+  cp = ctypes = nstr->ctypes;
+  e = (unsigned char *)nstr->original + size;
+  /* s: current source byte, s_: source start of the not yet recorded span,
+     d: next output byte, d_: output position already covered by checks. */
+  for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) {
+    if ((*s & 0x80)) {
+      if (0xa0 <= *s && *s <= 0xdf) {
+        /* half-width katakana / punctuation */
+        uint16_t c = hankana[*s - 0xa0];
+        switch (c) {
+        case 0x814a :
+          /* voiced mark: merge into the previous 0x83xx katakana if possible */
+          if (d > d0 + 1 && d[-2] == 0x83
+              && 0x45 <= d[-1] && d[-1] <= 0x7a && (b = dakuten[d[-1] - 0x45])) {
+            *(d - 1) = b;
+            /* NOTE(review): ch[-1] is the slot of the previous character's
+               second output byte, not its first one - confirm intended. */
+            if (ch) { ch[-1]++; s_++; }
+            continue;
+          } else {
+            *d++ = c >> 8; *d = c & 0xff;
+          }
+          break;
+        case 0x814b :
+          /* semi-voiced mark: same merge using handaku[] */
+          if (d > d0 + 1 && d[-2] == 0x83
+              && 0x6e <= d[-1] && d[-1] <= 0x7a && (b = handaku[d[-1] - 0x6e])) {
+            *(d - 1) = b;
+            if (ch) { ch[-1]++; s_++; }
+            continue;
+          } else {
+            *d++ = c >> 8; *d = c & 0xff;
+          }
+          break;
+        default :
+          *d++ = c >> 8; *d = c & 0xff;
+          break;
+        }
+        ctype = grn_char_katakana;
+      } else {
+        if ((s + 1) < e && 0x40 <= *(s + 1) && *(s + 1) <= 0xfc) {
+          /* two-byte character: c1 = lead byte, c2 = trail byte */
+          unsigned char c1 = *s++, c2 = *s, c3 = 0;
+          if (0x81 <= c1 && c1 <= 0x87) {
+            switch (c1 & 0x0f) {
+            case 1 :
+              switch (c2) {
+              case 0x5b :
+                *d++ = c1; *d = c2;
+                ctype = grn_char_katakana;
+                break;
+              case 0x58 :
+                *d++ = c1; *d = c2;
+                ctype = grn_char_kanji;
+                break;
+              case 0x40 :
+                /* full-width space */
+                if (removeblankp) {
+                  if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
+                  continue;
+                } else {
+                  *d = ' ';
+                  ctype = GRN_CHAR_BLANK|grn_char_symbol;
+                }
+                break;
+              default :
+                /* symbol[] yields the single-byte replacement, 0 when there is none */
+                if (0x43 <= c2 && c2 <= 0x7e && (c3 = symbol[c2 - 0x43])) {
+                  *d = c3;
+                  ctype = grn_char_symbol;
+                } else if (0x7f <= c2 && c2 <= 0x97 && (c3 = symbol[c2 - 0x44])) {
+                  *d = c3;
+                  ctype = grn_char_symbol;
+                } else {
+                  *d++ = c1; *d = c2;
+                  ctype = grn_char_others;
+                }
+                break;
+              }
+              break;
+            case 2 :
+              if (0x4f <= c2 && c2 <= 0x58) {
+                /* full-width digit -> '0'..'9' */
+                ctype = grn_char_digit;
+                *d = c2 - 0x1f;
+              } else if (0x60 <= c2 && c2 <= 0x79) {
+                /* maps onto 'a'..'z' */
+                ctype = grn_char_alpha;
+                *d = c2 + 0x01;
+              } else if (0x81 <= c2 && c2 <= 0x9a) {
+                /* maps onto 'a'..'z' */
+                ctype = grn_char_alpha;
+                *d = c2 - 0x20;
+              } else if (0x9f <= c2 && c2 <= 0xf1) {
+                *d++ = c1; *d = c2;
+                ctype = grn_char_hiragana;
+              } else {
+                *d++ = c1; *d = c2;
+                ctype = grn_char_others;
+              }
+              break;
+            case 3 :
+              if (0x40 <= c2 && c2 <= 0x96) {
+                *d++ = c1; *d = c2;
+                ctype = grn_char_katakana;
+              } else {
+                *d++ = c1; *d = c2;
+                ctype = grn_char_symbol;
+              }
+              break;
+            case 4 :
+            case 7 :
+              *d++ = c1; *d = c2;
+              ctype = grn_char_symbol;
+              break;
+            default :
+              *d++ = c1; *d = c2;
+              ctype = grn_char_others;
+              break;
+            }
+          } else {
+            *d++ = c1; *d = c2;
+            ctype = grn_char_kanji;
+          }
+        } else {
+          /* skip invalid character */
+          continue;
+        }
+      }
+    } else {
+      unsigned char c = *s;
+      switch (c >> 4) {
+      case 0 :
+      case 1 :
+        /* skip unprintable ascii */
+        if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
+        continue;
+      case 2 :
+        if (c == 0x20) {
+          if (removeblankp) {
+            if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
+            continue;
+          } else {
+            *d = ' ';
+            ctype = GRN_CHAR_BLANK|grn_char_symbol;
+          }
+        } else {
+          *d = c;
+          ctype = grn_char_symbol;
+        }
+        break;
+      case 3 :
+        *d = c;
+        ctype = (c <= 0x39) ? grn_char_digit : grn_char_symbol;
+        break;
+      case 4 :
+        *d = ('A' <= c) ? c + 0x20 : c;
+        ctype = (c == 0x40) ? grn_char_symbol : grn_char_alpha;
+        break;
+      case 5 :
+        *d = (c <= 'Z') ? c + 0x20 : c;
+        ctype = (c <= 0x5a) ? grn_char_alpha : grn_char_symbol;
+        break;
+      case 6 :
+        *d = c;
+        ctype = (c == 0x60) ? grn_char_symbol : grn_char_alpha;
+        break;
+      case 7 :
+        *d = c;
+        ctype = (c <= 0x7a) ? grn_char_alpha : (c == 0x7f ? grn_char_others : grn_char_symbol);
+        break;
+      default :
+        *d = c;
+        ctype = grn_char_others;
+        break;
+      }
+    }
+    /* one character has been emitted; d still points at its last byte */
+    d++;
+    length++;
+    if (cp) { *cp++ = ctype; }
+    if (ch) {
+      /* first output byte gets the consumed source length, the rest get 0 */
+      *ch++ = (int16_t)(s + 1 - s_);
+      s_ = s + 1;
+      while (++d_ < d) { *ch++ = 0; }
+    }
+  }
+  if (cp) { *cp = grn_char_null; }
+  *d = '\0';
+  nstr->n_characters = length;
+  nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
+  return NULL;
+}
+
+#ifdef WITH_NFKC
+/* NFKC helpers used by utf8_normalize(); only declared here, their
+   definitions are not part of this file section. */
+
+/* Character type of the UTF-8 character at str; utf8_normalize() stores the
+   returned value as-is into the ctypes array. */
+uint_least8_t grn_nfkc_ctype(const unsigned char *str);
+/* Replacement for the single character at str, or NULL when the character
+   is to be kept unchanged.  utf8_normalize() takes strlen() of the result,
+   so it is treated as NUL-terminated. */
+const char *grn_nfkc_map1(const unsigned char *str);
+/* Replacement for the pair (prefix, suffix), or NULL when the two do not
+   combine.  utf8_normalize() passes the previously emitted character as
+   prefix and the current one as suffix, and treats the result as
+   NUL-terminated. */
+const char *grn_nfkc_map2(const unsigned char *prefix, const unsigned char *suffix);
+
+/*
+ * Returns the byte length of the UTF-8 character starting at str, or 0 when
+ * str is at/after end, points at a NUL byte, or the sequence is malformed
+ * (a warning is logged for malformed sequences).
+ *
+ * str does not have to be NUL-terminated, but end must mark the end of the
+ * readable range.
+ */
+static inline int
+grn_str_charlen_utf8(grn_ctx *ctx, const unsigned char *str, const unsigned char *end)
+{
+  const unsigned char *cursor = str;
+  int mask;
+  int n_followers = 0;
+  int i;
+
+  if (end <= cursor || !*cursor) {
+    return 0;
+  }
+  if (!(*cursor & 0x80)) {
+    /* plain 7-bit character */
+    return 1;
+  }
+
+  /* Every set bit directly below the top bit announces one more byte. */
+  for (mask = 0x40; mask && (*cursor & mask); mask >>= 1) {
+    n_followers++;
+  }
+  if (n_followers == 0) {
+    /* 10xxxxxx cannot start a character */
+    GRN_LOG(ctx, GRN_LOG_WARNING, "invalid utf8 string(1) on grn_str_charlen_utf8");
+    return 0;
+  }
+
+  /* Each announced byte must exist, be non-NUL and look like 10xxxxxx. */
+  for (i = 0; i < n_followers; i++) {
+    cursor++;
+    if (cursor >= end || !*cursor || (*cursor & 0xc0) != 0x80) {
+      GRN_LOG(ctx, GRN_LOG_WARNING, "invalid utf8 string(2) on grn_str_charlen_utf8");
+      return 0;
+    }
+  }
+  return n_followers + 1;
+}
+
+/*
+ * Normalizes the UTF-8 text of the grn_string passed as args[0] using the
+ * grn_nfkc_map1() / grn_nfkc_map2() / grn_nfkc_ctype() helpers.
+ *
+ * For every source character the map1 replacement (or the character itself)
+ * is taken; when the previously emitted character and the current one have
+ * a map2 replacement, the previous output is rewound and replaced by it.
+ * Bytes below 0x20 are dropped, and blanks too when GRN_STRING_REMOVE_BLANK
+ * is set.
+ *
+ * Fills nstr->normalized, nstr->normalized_length_in_bytes and
+ * nstr->n_characters, plus nstr->checks / nstr->ctypes when the matching
+ * GRN_STRING_WITH_* flag is set.  The buffers start at three times the
+ * source size and grow on demand.  nargs and user_data are unused.
+ *
+ * Always returns NULL; allocation failures release every buffer and are
+ * reported through ERR().
+ */
+inline static grn_obj *
+utf8_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
+{
+  int16_t *ch;
+  /* s: current source char, s_: source start of the span not yet recorded in
+     checks, s__: previous value of s_ (restored when output is rewound). */
+  const unsigned char *s, *s_, *s__ = NULL, *p, *p2, *pe, *e;
+  /* d: next output byte, d_: start of the last emitted char, de: capacity end */
+  unsigned char *d, *d_, *de;
+  uint_least8_t *cp;
+  grn_string *nstr = (grn_string *)args[0];
+  size_t length = 0, ls, lp, size = nstr->original_length_in_bytes, ds = size * 3;
+  int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
+  if (!(nstr->normalized = GRN_MALLOC(ds + 1))) {
+    ERR(GRN_NO_MEMORY_AVAILABLE,
+        "[string][utf8] failed to allocate normalized text space");
+    return NULL;
+  }
+  if (nstr->flags & GRN_STRING_WITH_CHECKS) {
+    if (!(nstr->checks = GRN_MALLOC(ds * sizeof(int16_t) + 1))) {
+      GRN_FREE(nstr->normalized);
+      nstr->normalized = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[string][utf8] failed to allocate checks space");
+      return NULL;
+    }
+  }
+  ch = nstr->checks;
+  if (nstr->flags & GRN_STRING_WITH_TYPES) {
+    if (!(nstr->ctypes = GRN_MALLOC(ds + 1))) {
+      if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
+      GRN_FREE(nstr->normalized); nstr->normalized = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[string][utf8] failed to allocate character types space");
+      return NULL;
+    }
+  }
+  cp = nstr->ctypes;
+  d = (unsigned char *)nstr->normalized;
+  de = d + ds;
+  d_ = NULL;
+  e = (unsigned char *)nstr->original + size;
+  for (s = s_ = (unsigned char *)nstr->original; ; s += ls) {
+    /* stops at the end of input, at a NUL byte or at a malformed sequence */
+    if (!(ls = grn_str_charlen_utf8(ctx, s, e))) {
+      break;
+    }
+    if ((p = (unsigned char *)grn_nfkc_map1(s))) {
+      pe = p + strlen((char *)p);
+    } else {
+      p = s;
+      pe = p + ls;
+    }
+    if (d_ && (p2 = (unsigned char *)grn_nfkc_map2(d_, p))) {
+      /* previous output char + this one combine: rewind the previous char
+         and emit the combined replacement instead */
+      p = p2;
+      pe = p + strlen((char *)p);
+      if (cp) { cp--; }
+      if (ch) {
+        ch -= (d - d_);
+        s_ = s__;
+      }
+      d = d_;
+      length--;
+    }
+    /* emit every character of the replacement p..pe */
+    for (; ; p += lp) {
+      if (!(lp = grn_str_charlen_utf8(ctx, p, pe))) {
+        break;
+      }
+      if ((*p == ' ' && removeblankp) || *p < 0x20  /* skip unprintable ascii */ ) {
+        /* NOTE(review): the other normalizers in this file use
+           GRN_CHAR_BLANK / grn_char_null; confirm the legacy
+           GRN_STR_BLANK / grn_str_null names used here are equivalent. */
+        if (cp > nstr->ctypes) { *(cp - 1) |= GRN_STR_BLANK; }
+      } else {
+        if (de <= d + lp) {
+          /* out of room: grow all buffers to 1.5x + lp and re-base pointers */
+          unsigned char *normalized;
+          ds += (ds >> 1) + lp;
+          if (!(normalized = GRN_REALLOC(nstr->normalized, ds + 1))) {
+            if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; }
+            if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
+            GRN_FREE(nstr->normalized); nstr->normalized = NULL;
+            ERR(GRN_NO_MEMORY_AVAILABLE,
+                "[string][utf8] failed to expand normalized text space");
+            return NULL;
+          }
+          de = normalized + ds;
+          d = normalized + (d - (unsigned char *)nstr->normalized);
+          nstr->normalized = normalized;
+          if (ch) {
+            int16_t *checks;
+            if (!(checks = GRN_REALLOC(nstr->checks, ds * sizeof(int16_t)+ 1))) {
+              if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; }
+              GRN_FREE(nstr->checks); nstr->checks = NULL;
+              GRN_FREE(nstr->normalized); nstr->normalized = NULL;
+              ERR(GRN_NO_MEMORY_AVAILABLE,
+                  "[string][utf8] failed to expand checks space");
+              return NULL;
+            }
+            ch = checks + (ch - nstr->checks);
+            nstr->checks = checks;
+          }
+          if (cp) {
+            uint_least8_t *ctypes;
+            if (!(ctypes = GRN_REALLOC(nstr->ctypes, ds + 1))) {
+              GRN_FREE(nstr->ctypes); nstr->ctypes = NULL;
+              if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
+              GRN_FREE(nstr->normalized); nstr->normalized = NULL;
+              ERR(GRN_NO_MEMORY_AVAILABLE,
+                  "[string][utf8] failed to expand character types space");
+              return NULL;
+            }
+            cp = ctypes + (cp - nstr->ctypes);
+            nstr->ctypes = ctypes;
+          }
+        }
+        memcpy(d, p, lp);
+        d_ = d;
+        d += lp;
+        length++;
+        if (cp) { *cp++ = grn_nfkc_ctype(p); }
+        if (ch) {
+          size_t i;
+          if (s_ == s + ls) {
+            /* this source char was already accounted for by an earlier
+               output char of the same replacement */
+            *ch++ = -1;
+          } else {
+            *ch++ = (int16_t)(s + ls - s_);
+            s__ = s_;
+            s_ = s + ls;
+          }
+          /* remaining bytes of this output char */
+          for (i = lp; i > 1; i--) { *ch++ = 0; }
+        }
+      }
+    }
+  }
+  if (cp) { *cp = grn_str_null; }
+  *d = '\0';
+  nstr->n_characters = length;
+  nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
+  return NULL;
+}
+#endif /* WITH_NFKC */
+
+/*
+ * Byte-wise normalizer for the grn_string passed as args[0]; used for
+ * encodings without a dedicated normalizer.
+ *
+ * ASCII upper-case letters are lower-cased, bytes below 0x20 are dropped,
+ * blanks are dropped when GRN_STRING_REMOVE_BLANK is set and every other
+ * byte is copied unchanged (bytes >= 0x80 are typed grn_char_others).
+ *
+ * Fills nstr->normalized, nstr->normalized_length_in_bytes and
+ * nstr->n_characters, plus nstr->checks / nstr->ctypes when the matching
+ * GRN_STRING_WITH_* flag is set.  nargs and user_data are unused.
+ *
+ * Always returns NULL; allocation failures are reported through ERR().
+ */
+inline static grn_obj *
+ascii_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
+{
+  grn_string *nstr = (grn_string *)args[0];
+  int16_t *ch;
+  const unsigned char *s, *s_, *e;
+  unsigned char *d, *d0, *d_;
+  uint_least8_t *cp, *ctypes, ctype;
+  size_t size = nstr->original_length_in_bytes, length = 0;
+  int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
+  if (!(nstr->normalized = GRN_MALLOC(size + 1))) {
+    ERR(GRN_NO_MEMORY_AVAILABLE,
+        "[string][ascii] failed to allocate normalized text space");
+    return NULL;
+  }
+  d0 = (unsigned char *) nstr->normalized;
+  if (nstr->flags & GRN_STRING_WITH_CHECKS) {
+    if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) {
+      GRN_FREE(nstr->normalized);
+      nstr->normalized = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[string][ascii] failed to allocate checks space");
+      return NULL;
+    }
+  }
+  ch = nstr->checks;
+  if (nstr->flags & GRN_STRING_WITH_TYPES) {
+    if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
+      /* checks is only allocated when GRN_STRING_WITH_CHECKS was requested,
+         so release it only when it really exists. */
+      if (nstr->checks) {
+        GRN_FREE(nstr->checks);
+        nstr->checks = NULL;
+      }
+      GRN_FREE(nstr->normalized);
+      nstr->normalized = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[string][ascii] failed to allocate character types space");
+      return NULL;
+    }
+  }
+  cp = ctypes = nstr->ctypes;
+  e = (unsigned char *)nstr->original + size;
+  /* s: current source byte, s_: source start of the not yet recorded span,
+     d: next output byte, d_: output position already covered by checks. */
+  for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) {
+    unsigned char c = *s;
+    /* dispatch on the high nibble */
+    switch (c >> 4) {
+    case 0 :
+    case 1 :
+      /* skip unprintable ascii */
+      if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
+      continue;
+    case 2 :
+      if (c == 0x20) {
+        if (removeblankp) {
+          /* remember the removed blank on the preceding character */
+          if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
+          continue;
+        } else {
+          *d = ' ';
+          ctype = GRN_CHAR_BLANK|grn_char_symbol;
+        }
+      } else {
+        *d = c;
+        ctype = grn_char_symbol;
+      }
+      break;
+    case 3 :
+      *d = c;
+      ctype = (c <= 0x39) ? grn_char_digit : grn_char_symbol;
+      break;
+    case 4 :
+      /* '@' stays, 'A'..'O' are lower-cased */
+      *d = ('A' <= c) ? c + 0x20 : c;
+      ctype = (c == 0x40) ? grn_char_symbol : grn_char_alpha;
+      break;
+    case 5 :
+      /* 'P'..'Z' are lower-cased, the symbols after 'Z' stay */
+      *d = (c <= 'Z') ? c + 0x20 : c;
+      ctype = (c <= 0x5a) ? grn_char_alpha : grn_char_symbol;
+      break;
+    case 6 :
+      *d = c;
+      ctype = (c == 0x60) ? grn_char_symbol : grn_char_alpha;
+      break;
+    case 7 :
+      *d = c;
+      ctype = (c <= 0x7a) ? grn_char_alpha : (c == 0x7f ? grn_char_others : grn_char_symbol);
+      break;
+    default :
+      *d = c;
+      ctype = grn_char_others;
+      break;
+    }
+    d++;
+    length++;
+    if (cp) { *cp++ = ctype; }
+    if (ch) {
+      /* consumed source length, including skipped bytes before this one */
+      *ch++ = (int16_t)(s + 1 - s_);
+      s_ = s + 1;
+      while (++d_ < d) { *ch++ = 0; }
+    }
+  }
+  if (cp) { *cp = grn_char_null; }
+  *d = '\0';
+  nstr->n_characters = length;
+  nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
+  return NULL;
+}
+
+/* use cp1252 as latin1 */
+/*
+ * Normalizes the Latin-1 text (handled as cp1252) of the grn_string passed
+ * as args[0].
+ *
+ * ASCII and accented upper-case letters are mapped to their lower-case
+ * byte, bytes below 0x20 are dropped, blanks are dropped when
+ * GRN_STRING_REMOVE_BLANK is set and everything else is copied unchanged.
+ *
+ * The input length is taken from nstr->original_length_in_bytes (the
+ * original text is not required to be NUL-terminated).
+ *
+ * Fills nstr->normalized, nstr->normalized_length_in_bytes and
+ * nstr->n_characters, plus nstr->checks / nstr->ctypes when the matching
+ * GRN_STRING_WITH_* flag is set.  nargs and user_data are unused.
+ *
+ * Always returns NULL; allocation failures are reported through ERR().
+ */
+inline static grn_obj *
+latin1_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
+                 grn_user_data *user_data)
+{
+  grn_string *nstr = (grn_string *)args[0];
+  int16_t *ch;
+  const unsigned char *s, *s_, *e;
+  unsigned char *d, *d0, *d_;
+  uint_least8_t *cp, *ctypes, ctype;
+  /* Use the caller supplied length: strlen() would read past a buffer that
+     is not NUL-terminated and would stop early at an embedded NUL. */
+  size_t size = nstr->original_length_in_bytes, length = 0;
+  int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
+  if (!(nstr->normalized = GRN_MALLOC(size + 1))) {
+    ERR(GRN_NO_MEMORY_AVAILABLE,
+        "[string][latin1] failed to allocate normalized text space");
+    return NULL;
+  }
+  d0 = (unsigned char *) nstr->normalized;
+  if (nstr->flags & GRN_STRING_WITH_CHECKS) {
+    if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) {
+      GRN_FREE(nstr->normalized);
+      nstr->normalized = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[string][latin1] failed to allocate checks space");
+      return NULL;
+    }
+  }
+  ch = nstr->checks;
+  if (nstr->flags & GRN_STRING_WITH_TYPES) {
+    if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
+      /* checks is only allocated when GRN_STRING_WITH_CHECKS was requested,
+         so release it only when it really exists. */
+      if (nstr->checks) {
+        GRN_FREE(nstr->checks);
+        nstr->checks = NULL;
+      }
+      GRN_FREE(nstr->normalized);
+      nstr->normalized = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[string][latin1] failed to allocate character types space");
+      return NULL;
+    }
+  }
+  cp = ctypes = nstr->ctypes;
+  e = (unsigned char *)nstr->original + size;
+  /* s: current source byte, s_: source start of the not yet recorded span,
+     d: next output byte, d_: output position already covered by checks. */
+  for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) {
+    unsigned char c = *s;
+    /* dispatch on the high nibble */
+    switch (c >> 4) {
+    case 0 :
+    case 1 :
+      /* skip unprintable ascii */
+      if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
+      continue;
+    case 2 :
+      if (c == 0x20) {
+        if (removeblankp) {
+          if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
+          continue;
+        } else {
+          *d = ' ';
+          ctype = GRN_CHAR_BLANK|grn_char_symbol;
+        }
+      } else {
+        *d = c;
+        ctype = grn_char_symbol;
+      }
+      break;
+    case 3 :
+      *d = c;
+      ctype = (c <= 0x39) ? grn_char_digit : grn_char_symbol;
+      break;
+    case 4 :
+      *d = ('A' <= c) ? c + 0x20 : c;
+      ctype = (c == 0x40) ? grn_char_symbol : grn_char_alpha;
+      break;
+    case 5 :
+      *d = (c <= 'Z') ? c + 0x20 : c;
+      ctype = (c <= 0x5a) ? grn_char_alpha : grn_char_symbol;
+      break;
+    case 6 :
+      *d = c;
+      ctype = (c == 0x60) ? grn_char_symbol : grn_char_alpha;
+      break;
+    case 7 :
+      *d = c;
+      ctype = (c <= 0x7a) ? grn_char_alpha : (c == 0x7f ? grn_char_others : grn_char_symbol);
+      break;
+    case 8 :
+      /* cp1252 upper-case letters whose lower-case form is 0x10 above */
+      if (c == 0x8a || c == 0x8c || c == 0x8e) {
+        *d = c + 0x10;
+        ctype = grn_char_alpha;
+      } else {
+        *d = c;
+        ctype = grn_char_symbol;
+      }
+      break;
+    case 9 :
+      /* 0x9a/0x9c/0x9e are kept as letters; 0x9f is folded onto 0xff */
+      if (c == 0x9a || c == 0x9c || c == 0x9e || c == 0x9f) {
+        *d = (c == 0x9f) ? c + 0x60 : c;
+        ctype = grn_char_alpha;
+      } else {
+        *d = c;
+        ctype = grn_char_symbol;
+      }
+      break;
+    case 0x0c :
+      /* 0xc0..0xcf: folded onto 0xe0..0xef */
+      *d = c + 0x20;
+      ctype = grn_char_alpha;
+      break;
+    case 0x0d :
+      /* 0xd7 (a symbol) and 0xdf are kept, the rest is folded by +0x20 */
+      *d = (c == 0xd7 || c == 0xdf) ? c : c + 0x20;
+      ctype = (c == 0xd7) ? grn_char_symbol : grn_char_alpha;
+      break;
+    case 0x0e :
+      *d = c;
+      ctype = grn_char_alpha;
+      break;
+    case 0x0f :
+      /* 0xf7 is the only symbol in this row */
+      *d = c;
+      ctype = (c == 0xf7) ? grn_char_symbol : grn_char_alpha;
+      break;
+    default :
+      *d = c;
+      ctype = grn_char_others;
+      break;
+    }
+    d++;
+    length++;
+    if (cp) { *cp++ = ctype; }
+    if (ch) {
+      /* consumed source length, including skipped bytes before this one */
+      *ch++ = (int16_t)(s + 1 - s_);
+      s_ = s + 1;
+      while (++d_ < d) { *ch++ = 0; }
+    }
+  }
+  if (cp) { *cp = grn_char_null; }
+  *d = '\0';
+  nstr->n_characters = length;
+  nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
+  return NULL;
+}
+
+/*
+ * Normalizes the KOI8-R text of the grn_string passed as args[0].
+ *
+ * ASCII upper-case letters are lower-cased, bytes 0xe0..0xff are folded onto
+ * 0xc0..0xdf and 0xb3 onto 0xa3; bytes below 0x20 are dropped, blanks are
+ * dropped when GRN_STRING_REMOVE_BLANK is set and everything else is copied
+ * unchanged.
+ *
+ * The input length is taken from nstr->original_length_in_bytes (the
+ * original text is not required to be NUL-terminated).
+ *
+ * Fills nstr->normalized, nstr->normalized_length_in_bytes and
+ * nstr->n_characters, plus nstr->checks / nstr->ctypes when the matching
+ * GRN_STRING_WITH_* flag is set.  nargs and user_data are unused.
+ *
+ * Always returns NULL; allocation failures are reported through ERR().
+ */
+inline static grn_obj *
+koi8r_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
+                grn_user_data *user_data)
+{
+  grn_string *nstr = (grn_string *)args[0];
+  int16_t *ch;
+  const unsigned char *s, *s_, *e;
+  unsigned char *d, *d0, *d_;
+  uint_least8_t *cp, *ctypes, ctype;
+  /* Use the caller supplied length: strlen() would read past a buffer that
+     is not NUL-terminated and would stop early at an embedded NUL. */
+  size_t size = nstr->original_length_in_bytes, length = 0;
+  int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
+  if (!(nstr->normalized = GRN_MALLOC(size + 1))) {
+    ERR(GRN_NO_MEMORY_AVAILABLE,
+        "[string][koi8r] failed to allocate normalized text space");
+    return NULL;
+  }
+  d0 = (unsigned char *) nstr->normalized;
+  if (nstr->flags & GRN_STRING_WITH_CHECKS) {
+    if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) {
+      GRN_FREE(nstr->normalized);
+      nstr->normalized = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[string][koi8r] failed to allocate checks space");
+      return NULL;
+    }
+  }
+  ch = nstr->checks;
+  if (nstr->flags & GRN_STRING_WITH_TYPES) {
+    if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
+      /* checks is only allocated when GRN_STRING_WITH_CHECKS was requested,
+         so release it only when it really exists. */
+      if (nstr->checks) {
+        GRN_FREE(nstr->checks);
+        nstr->checks = NULL;
+      }
+      GRN_FREE(nstr->normalized);
+      nstr->normalized = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[string][koi8r] failed to allocate character types space");
+      return NULL;
+    }
+  }
+  cp = ctypes = nstr->ctypes;
+  e = (unsigned char *)nstr->original + size;
+  /* s: current source byte, s_: source start of the not yet recorded span,
+     d: next output byte, d_: output position already covered by checks. */
+  for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) {
+    unsigned char c = *s;
+    /* dispatch on the high nibble */
+    switch (c >> 4) {
+    case 0 :
+    case 1 :
+      /* skip unprintable ascii */
+      if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
+      continue;
+    case 2 :
+      if (c == 0x20) {
+        if (removeblankp) {
+          if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
+          continue;
+        } else {
+          *d = ' ';
+          ctype = GRN_CHAR_BLANK|grn_char_symbol;
+        }
+      } else {
+        *d = c;
+        ctype = grn_char_symbol;
+      }
+      break;
+    case 3 :
+      *d = c;
+      ctype = (c <= 0x39) ? grn_char_digit : grn_char_symbol;
+      break;
+    case 4 :
+      *d = ('A' <= c) ? c + 0x20 : c;
+      ctype = (c == 0x40) ? grn_char_symbol : grn_char_alpha;
+      break;
+    case 5 :
+      *d = (c <= 'Z') ? c + 0x20 : c;
+      ctype = (c <= 0x5a) ? grn_char_alpha : grn_char_symbol;
+      break;
+    case 6 :
+      *d = c;
+      ctype = (c == 0x60) ? grn_char_symbol : grn_char_alpha;
+      break;
+    case 7 :
+      *d = c;
+      ctype = (c <= 0x7a) ? grn_char_alpha : (c == 0x7f ? grn_char_others : grn_char_symbol);
+      break;
+    case 0x0a :
+      /* 0xa3 is the only letter in this row */
+      *d = c;
+      ctype = (c == 0xa3) ? grn_char_alpha : grn_char_others;
+      break;
+    case 0x0b :
+      /* 0xb3 is folded onto its counterpart 0xa3 */
+      if (c == 0xb3) {
+        *d = c - 0x10;
+        ctype = grn_char_alpha;
+      } else {
+        *d = c;
+        ctype = grn_char_others;
+      }
+      break;
+    case 0x0c :
+    case 0x0d :
+      /* 0xc0..0xdf are kept as they are */
+      *d = c;
+      ctype = grn_char_alpha;
+      break;
+    case 0x0e :
+    case 0x0f :
+      /* 0xe0..0xff are folded onto 0xc0..0xdf */
+      *d = c - 0x20;
+      ctype = grn_char_alpha;
+      break;
+    default :
+      *d = c;
+      ctype = grn_char_others;
+      break;
+    }
+    d++;
+    length++;
+    if (cp) { *cp++ = ctype; }
+    if (ch) {
+      /* consumed source length, including skipped bytes before this one */
+      *ch++ = (int16_t)(s + 1 - s_);
+      s_ = s + 1;
+      while (++d_ < d) { *ch++ = 0; }
+    }
+  }
+  if (cp) { *cp = grn_char_null; }
+  *d = '\0';
+  nstr->n_characters = length;
+  nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
+  return NULL;
+}
+
+/*
+ * Fills `string` without applying any normalization: the normalized text is
+ * a NUL-terminated copy of the original bytes.
+ *
+ * When GRN_STRING_WITH_CHECKS is set, checks[] gets the byte length of each
+ * character at its first byte and 0 at its following bytes; the length is
+ * derived from the lead byte according to string->encoding (encodings other
+ * than EUC-JP, Shift_JIS and UTF-8 are handled as one byte per character).
+ * ctypes and n_characters are left untouched.
+ *
+ * Returns `string` on success.  On allocation failure the error is reported
+ * through ERR(), `string` is closed (and therefore must not be used by the
+ * caller any more) and NULL is returned.
+ */
+static grn_string *
+grn_fake_string_open(grn_ctx *ctx, grn_string *string)
+{
+  /* TODO: support GRN_STRING_REMOVE_BLANK flag and ctypes */
+  grn_string *nstr = string;
+  const char *str;
+  unsigned int str_len;
+
+  str = nstr->original;
+  str_len = nstr->original_length_in_bytes;
+
+  if (!(nstr->normalized = GRN_MALLOC(str_len + 1))) {
+    ERR(GRN_NO_MEMORY_AVAILABLE,
+        "[string][fake] failed to allocate normalized text space");
+    grn_string_close(ctx, (grn_obj *)nstr);
+    return NULL;
+  }
+
+  memcpy(nstr->normalized, str, str_len);
+  nstr->normalized[str_len] = '\0';
+  nstr->normalized_length_in_bytes = str_len;
+
+  if (nstr->flags & GRN_STRING_WITH_CHECKS) {
+    /* f: bytes still belonging to the character being walked over */
+    int16_t f = 0;
+    unsigned char c;
+    size_t i;
+    if (!(nstr->checks = (int16_t *) GRN_MALLOC(sizeof(int16_t) * str_len))) {
+      grn_string_close(ctx, (grn_obj *)nstr);
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[string][fake] failed to allocate checks space");
+      return NULL;
+    }
+    switch (nstr->encoding) {
+    case GRN_ENC_EUC_JP:
+      for (i = 0; i < str_len; i++) {
+        if (!f) {
+          /* lead byte: 0xa1..0xfe or 0x8e -> 2 bytes, 0x8f -> 3 bytes */
+          c = (unsigned char) str[i];
+          f = ((c >= 0xa1U && c <= 0xfeU) || c == 0x8eU ? 2 : (c == 0x8fU ? 3 : 1)
+            );
+          nstr->checks[i] = f;
+        } else {
+          nstr->checks[i] = 0;
+        }
+        f--;
+      }
+      break;
+    case GRN_ENC_SJIS:
+      for (i = 0; i < str_len; i++) {
+        if (!f) {
+          /* lead byte: 0x81..0x9f or 0xe0..0xfc -> 2 bytes */
+          c = (unsigned char) str[i];
+          f = (c >= 0x81U && ((c <= 0x9fU) || (c >= 0xe0U && c <= 0xfcU)) ? 2 : 1);
+          nstr->checks[i] = f;
+        } else {
+          nstr->checks[i] = 0;
+        }
+        f--;
+      }
+      break;
+    case GRN_ENC_UTF8:
+      for (i = 0; i < str_len; i++) {
+        if (!f) {
+          /* length is read off bits 0x80/0x20/0x10 of the lead byte only;
+             the following bytes are not validated */
+          c = (unsigned char) str[i];
+          f = (c & 0x80U ? (c & 0x20U ? (c & 0x10U ? 4 : 3)
+                           : 2)
+               : 1);
+          nstr->checks[i] = f;
+        } else {
+          nstr->checks[i] = 0;
+        }
+        f--;
+      }
+      break;
+    default:
+      for (i = 0; i < str_len; i++) {
+        nstr->checks[i] = 1;
+      }
+      break;
+    }
+  }
+  return nstr;
+}
+
+/*
+ * Creates a grn_string object for str (str_len bytes) in the given encoding.
+ *
+ * The string keeps a pointer to str, it does not copy it.  With a NULL
+ * normalizer the text is taken over as-is by grn_fake_string_open();
+ * otherwise the built-in normalizer for `encoding` is run.
+ *
+ * Returns NULL when str is NULL, str_len is 0, memory could not be
+ * allocated, or ctx->rc is set after normalization.
+ */
+grn_obj *
+grn_string_open_(grn_ctx *ctx, const char *str, unsigned int str_len,
+                 grn_obj *normalizer, int flags, grn_encoding encoding)
+{
+  grn_string *new_string;
+  grn_obj *result;
+  grn_obj *normalizer_args[1];
+
+  if (str == NULL || str_len == 0) {
+    return NULL;
+  }
+
+  if (!(new_string = GRN_MALLOCN(grn_string, 1))) {
+    GRN_LOG(ctx, GRN_LOG_ALERT,
+            "[string][open] failed to allocate memory");
+    return NULL;
+  }
+
+  result = (grn_obj *)new_string;
+  GRN_OBJ_INIT(result, GRN_STRING, GRN_OBJ_ALLOCATED, GRN_ID_NIL);
+
+  /* input side */
+  new_string->original = str;
+  new_string->original_length_in_bytes = str_len;
+  new_string->encoding = encoding;
+  new_string->flags = flags;
+
+  /* output side, filled in below */
+  new_string->normalized = NULL;
+  new_string->normalized_length_in_bytes = 0;
+  new_string->n_characters = 0;
+  new_string->checks = NULL;
+  new_string->ctypes = NULL;
+
+  if (normalizer == NULL) {
+    return (grn_obj *)grn_fake_string_open(ctx, new_string);
+  }
+
+  normalizer_args[0] = result;
+  switch (encoding) {
+  case GRN_ENC_EUC_JP :
+    eucjp_normalize(ctx, 1, normalizer_args, NULL);
+    break;
+  case GRN_ENC_SJIS :
+    sjis_normalize(ctx, 1, normalizer_args, NULL);
+    break;
+  case GRN_ENC_LATIN1 :
+    latin1_normalize(ctx, 1, normalizer_args, NULL);
+    break;
+  case GRN_ENC_KOI8R :
+    koi8r_normalize(ctx, 1, normalizer_args, NULL);
+    break;
+  case GRN_ENC_UTF8 :
+    /* without NFKC support UTF-8 is only normalized byte-wise */
+#ifdef WITH_NFKC
+    utf8_normalize(ctx, 1, normalizer_args, NULL);
+#else /* WITH_NFKC */
+    ascii_normalize(ctx, 1, normalizer_args, NULL);
+#endif /* WITH_NFKC */
+    break;
+  default :
+    ascii_normalize(ctx, 1, normalizer_args, NULL);
+    break;
+  }
+
+  if (!ctx->rc) {
+    return result;
+  }
+
+  /* the normalizers report failure through ctx->rc only */
+  grn_obj_close(ctx, result);
+  return NULL;
+}
+
+/*
+ * Same as grn_string_open_() with the encoding taken from ctx->encoding.
+ */
+grn_obj *
+grn_string_open(grn_ctx *ctx, const char *str, unsigned int str_len,
+                grn_obj *normalizer, int flags)
+{
+  grn_encoding encoding = ctx->encoding;
+
+  return grn_string_open_(ctx, str, str_len, normalizer, flags, encoding);
+}
+
+/*
+ * Stores the original text pointer and its byte length of `string` into the
+ * non-NULL output parameters.
+ *
+ * Returns GRN_SUCCESS, or GRN_INVALID_ARGUMENT when string is NULL.
+ */
+grn_rc
+grn_string_get_original(grn_ctx *ctx, grn_obj *string,
+                        const char **original,
+                        unsigned int *length_in_bytes)
+{
+  grn_rc rc = GRN_INVALID_ARGUMENT;
+  grn_string *target = (grn_string *)string;
+  GRN_API_ENTER;
+  if (target) {
+    if (original) {
+      *original = target->original;
+    }
+    if (length_in_bytes) {
+      *length_in_bytes = target->original_length_in_bytes;
+    }
+    rc = GRN_SUCCESS;
+  }
+  GRN_API_RETURN(rc);
+}
+
+/*
+ * Returns the flags `string` was opened with, or 0 when string is NULL.
+ */
+int
+grn_string_get_flags(grn_ctx *ctx, grn_obj *string)
+{
+  grn_string *target = (grn_string *)string;
+  int flags;
+  GRN_API_ENTER;
+  flags = target ? target->flags : 0;
+  GRN_API_RETURN(flags);
+}
+
+/*
+ * Stores the normalized text pointer, its byte length and its character
+ * count of `string` into the non-NULL output parameters.
+ *
+ * Returns GRN_SUCCESS, or GRN_INVALID_ARGUMENT when string is NULL.
+ */
+grn_rc
+grn_string_get_normalized(grn_ctx *ctx, grn_obj *string,
+                          const char **normalized,
+                          unsigned int *length_in_bytes,
+                          unsigned int *n_characters)
+{
+  grn_rc rc = GRN_INVALID_ARGUMENT;
+  grn_string *target = (grn_string *)string;
+  GRN_API_ENTER;
+  if (target) {
+    if (normalized) {
+      *normalized = target->normalized;
+    }
+    if (length_in_bytes) {
+      *length_in_bytes = target->normalized_length_in_bytes;
+    }
+    if (n_characters) {
+      *n_characters = target->n_characters;
+    }
+    rc = GRN_SUCCESS;
+  }
+  GRN_API_RETURN(rc);
+}
+
+/*
+ * Replaces the normalized text of `string`.
+ *
+ * The string stores the `normalized` pointer itself (no copy is made) and
+ * releases it with GRN_FREE() in grn_string_close().  A previously stored
+ * buffer is freed, unless it is the very buffer being passed in again
+ * (e.g. after it was modified in place).
+ *
+ * Returns GRN_SUCCESS, or GRN_INVALID_ARGUMENT when string is NULL.
+ */
+grn_rc
+grn_string_set_normalized(grn_ctx *ctx, grn_obj *string,
+                          char *normalized, unsigned int length_in_bytes,
+                          unsigned int n_characters)
+{
+  grn_rc rc;
+  grn_string *string_ = (grn_string *)string;
+  GRN_API_ENTER;
+  if (string_) {
+    /* Freeing the buffer we are about to store would leave a dangling
+       pointer behind. */
+    if (string_->normalized && string_->normalized != normalized) {
+      GRN_FREE(string_->normalized);
+    }
+    string_->normalized = normalized;
+    string_->normalized_length_in_bytes = length_in_bytes;
+    string_->n_characters = n_characters;
+    rc = GRN_SUCCESS;
+  } else {
+    rc = GRN_INVALID_ARGUMENT;
+  }
+  GRN_API_RETURN(rc);
+}
+
+/*
+ * Returns the checks array of `string`; NULL when string is NULL or no
+ * checks array is set.
+ */
+const short *
+grn_string_get_checks(grn_ctx *ctx, grn_obj *string)
+{
+  grn_string *target = (grn_string *)string;
+  int16_t *checks;
+  GRN_API_ENTER;
+  checks = target ? target->checks : NULL;
+  GRN_API_RETURN(checks);
+}
+
+/*
+ * Replaces the checks array of `string`.
+ *
+ * The string stores the `checks` pointer itself (no copy is made) and
+ * releases it with GRN_FREE() in grn_string_close().  A previously stored
+ * array is freed, unless it is the very array being passed in again.
+ *
+ * Returns GRN_SUCCESS, or GRN_INVALID_ARGUMENT when string is NULL.
+ */
+grn_rc
+grn_string_set_checks(grn_ctx *ctx, grn_obj *string, short *checks)
+{
+  grn_rc rc;
+  grn_string *string_ = (grn_string *)string;
+  GRN_API_ENTER;
+  if (string_) {
+    /* Freeing the array we are about to store would leave a dangling
+       pointer behind. */
+    if (string_->checks && string_->checks != checks) {
+      GRN_FREE(string_->checks);
+    }
+    string_->checks = checks;
+    rc = GRN_SUCCESS;
+  } else {
+    rc = GRN_INVALID_ARGUMENT;
+  }
+  GRN_API_RETURN(rc);
+}
+
+/*
+ * Returns the character types array of `string`; NULL when string is NULL
+ * or no types array is set.
+ */
+const unsigned char *
+grn_string_get_types(grn_ctx *ctx, grn_obj *string)
+{
+  grn_string *target = (grn_string *)string;
+  unsigned char *types;
+  GRN_API_ENTER;
+  types = target ? target->ctypes : NULL;
+  GRN_API_RETURN(types);
+}
+
+/*
+ * Replaces the character types array of `string`.
+ *
+ * The string stores the `types` pointer itself (no copy is made) and
+ * releases it with GRN_FREE() in grn_string_close().  A previously stored
+ * array is freed, unless it is the very array being passed in again.
+ *
+ * Returns GRN_SUCCESS, or GRN_INVALID_ARGUMENT when string is NULL.
+ */
+grn_rc
+grn_string_set_types(grn_ctx *ctx, grn_obj *string, unsigned char *types)
+{
+  grn_rc rc;
+  grn_string *string_ = (grn_string *)string;
+  GRN_API_ENTER;
+  if (string_) {
+    /* Freeing the array we are about to store would leave a dangling
+       pointer behind. */
+    if (string_->ctypes && string_->ctypes != types) {
+      GRN_FREE(string_->ctypes);
+    }
+    string_->ctypes = types;
+    rc = GRN_SUCCESS;
+  } else {
+    rc = GRN_INVALID_ARGUMENT;
+  }
+  GRN_API_RETURN(rc);
+}
+
+/*
+ * Returns the encoding of `string`, or GRN_ENC_NONE when string is NULL.
+ */
+grn_encoding
+grn_string_get_encoding(grn_ctx *ctx, grn_obj *string)
+{
+  grn_string *target = (grn_string *)string;
+  grn_encoding encoding;
+  GRN_API_ENTER;
+  encoding = target ? target->encoding : GRN_ENC_NONE;
+  GRN_API_RETURN(encoding);
+}
+
+/*
+ * Releases `string` together with its normalized text, character types and
+ * checks buffers.  The original text is not freed.
+ *
+ * Returns GRN_SUCCESS, or GRN_INVALID_ARGUMENT when string is NULL.
+ */
+grn_rc
+grn_string_close(grn_ctx *ctx, grn_obj *string)
+{
+  grn_string *target = (grn_string *)string;
+
+  if (!target) {
+    return GRN_INVALID_ARGUMENT;
+  }
+
+  if (target->normalized) {
+    GRN_FREE(target->normalized);
+  }
+  if (target->ctypes) {
+    GRN_FREE(target->ctypes);
+  }
+  if (target->checks) {
+    GRN_FREE(target->checks);
+  }
+  GRN_FREE(string);
+  return GRN_SUCCESS;
+}

  Added: lib/string_in.h (+64 -0) 100644
===================================================================
--- /dev/null
+++ lib/string_in.h    2012-06-08 16:39:06 +0900 (a6cc1c9)
@@ -0,0 +1,64 @@
+/* -*- c-basic-offset: 2 -*- */
+/*
+  Copyright(C) 2012 Brazil
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+#ifndef GRN_STRING_H
+#define GRN_STRING_H
+
+#ifndef GROONGA_IN_H
+# include "groonga_in.h"
+#endif /* GROONGA_IN_H */
+
+#ifndef GRN_CTX_H
+# include "ctx.h"
+#endif /* GRN_CTX_H */
+
+#ifndef GRN_DB_H
+# include "db.h"
+#endif /* GRN_DB_H */
+
+#ifndef GRN_STR_H
+# include "str.h"
+#endif /* GRN_STR_H */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct { /* state behind the grn_obj * handled by grn_string_*() */
+  grn_obj_header header; /* common object header */
+  const char *original; /* not freed by grn_string_close() */
+  unsigned int original_length_in_bytes; /* byte length of `original` */
+  char *normalized; /* freed by grn_string_close() */
+  unsigned int normalized_length_in_bytes; /* byte length of `normalized` */
+  unsigned int n_characters; /* presumably the character count -- TODO confirm */
+  short *checks; /* replaced by grn_string_set_checks(); freed on close */
+  unsigned char *ctypes; /* returned by grn_string_get_types(); freed on close */
+  grn_encoding encoding; /* returned by grn_string_get_encoding() */
+  int flags; /* presumably the GRN_STRING_* bits given at open -- TODO confirm */
+} grn_string;
+
+grn_obj *grn_string_open_(grn_ctx *ctx, const char *str, unsigned int str_len,
+                          grn_obj *normalizer, int flags, grn_encoding encoding);
+grn_rc grn_string_close(grn_ctx *ctx, grn_obj *string);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* GRN_STRING_H */

  Modified: lib/token.c (+52 -28)
===================================================================
--- lib/token.c    2012-06-08 17:38:04 +0900 (deb8bc2)
+++ lib/token.c    2012-06-08 16:39:06 +0900 (3cb8279)
@@ -23,6 +23,7 @@
 #include "pat.h"
 #include "dat.h"
 #include "hash.h"
+#include "string_in.h"
 
 grn_obj *grn_uvector_tokenizer = NULL;
 
@@ -79,7 +80,7 @@ uvector_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 }
 
 typedef struct {
-  grn_str *nstr;
+  grn_obj *nstr;
   const uint8_t *delimiter;
   uint32_t delimiter_len;
   int32_t pos;
@@ -97,7 +98,10 @@ delimited_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data,
                const uint8_t *delimiter, uint32_t delimiter_len)
 {
   grn_obj *str;
+  grn_obj *normalizer = NULL;
   int nflags = 0;
+  const char *normalized;
+  unsigned int normalized_length_in_bytes;
   grn_delimited_tokenizer *token;
   grn_obj_flags table_flags;
   if (!(str = grn_ctx_pop(ctx))) {
@@ -110,16 +114,21 @@ delimited_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data,
   token->delimiter_len = delimiter_len;
   token->pos = 0;
   grn_table_get_info(ctx, table, &table_flags, &token->encoding, NULL);
-  nflags |= (table_flags & GRN_OBJ_KEY_NORMALIZE);
-  if (!(token->nstr = grn_str_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
-                                    nflags, token->encoding))) {
+  if (table_flags & GRN_OBJ_KEY_NORMALIZE) {
+    normalizer = GRN_NORMALIZER_AUTO;
+  }
+  if (!(token->nstr = grn_string_open_(ctx,
+                                       GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
+                                       normalizer, nflags, token->encoding))) {
     GRN_FREE(token);
-    ERR(GRN_TOKENIZER_ERROR, "grn_str_open failed at grn_token_open");
+    ERR(GRN_TOKENIZER_ERROR, "grn_string_open failed at grn_token_open");
     return NULL;
   }
-  token->next = (unsigned char *)token->nstr->norm;
-  token->end = token->next + token->nstr->norm_blen;
-  token->len = token->nstr->length;
+  grn_string_get_normalized(ctx, token->nstr,
+                            &normalized, &normalized_length_in_bytes,
+                            &(token->len));
+  token->next = (const unsigned char *)normalized;
+  token->end = token->next + normalized_length_in_bytes;
   GRN_TEXT_INIT(&token->curr_, GRN_OBJ_DO_SHALLOW_COPY);
   GRN_UINT32_INIT(&token->stat_, 0);
   return NULL;
@@ -154,7 +163,7 @@ static grn_obj *
 delimited_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 {
   grn_delimited_tokenizer *token = user_data->ptr;
-  grn_str_close(ctx, token->nstr);
+  grn_obj_close(ctx, token->nstr);
   GRN_FREE(token);
   return NULL;
 }
@@ -178,7 +187,7 @@ delimit_null_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_d
 /* ngram tokenizer */
 
 typedef struct {
-  grn_str *nstr;
+  grn_obj *nstr;
   uint8_t uni_alpha;
   uint8_t uni_digit;
   uint8_t uni_symbol;
@@ -190,7 +199,7 @@ typedef struct {
   grn_encoding encoding;
   const unsigned char *next;
   const unsigned char *end;
-  uint_least8_t *ctypes;
+  const uint_least8_t *ctypes;
   int32_t len;
   uint32_t tail;
   grn_obj curr_;
@@ -202,7 +211,10 @@ ngram_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data, uint8_t ngram
            uint8_t uni_alpha, uint8_t uni_digit, uint8_t uni_symbol, uint8_t ignore_blank)
 {
   grn_obj *str;
-  int nflags = GRN_STR_REMOVEBLANK|GRN_STR_WITH_CTYPES;
+  grn_obj *normalizer = NULL;
+  int nflags = GRN_STRING_REMOVE_BLANK|GRN_STRING_WITH_TYPES;
+  const char *normalized;
+  unsigned int normalized_length_in_bytes;
   grn_ngram_tokenizer *token;
   grn_obj_flags table_flags;
   if (!(str = grn_ctx_pop(ctx))) {
@@ -220,17 +232,22 @@ ngram_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data, uint8_t ngram
   token->pos = 0;
   token->skip = 0;
   grn_table_get_info(ctx, table, &table_flags, &token->encoding, NULL);
-  nflags |= (table_flags & GRN_OBJ_KEY_NORMALIZE);
-  if (!(token->nstr = grn_str_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
-                                    nflags, token->encoding))) {
+  if (table_flags & GRN_OBJ_KEY_NORMALIZE) {
+    normalizer = GRN_NORMALIZER_AUTO;
+  }
+  if (!(token->nstr = grn_string_open_(ctx,
+                                       GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
+                                       normalizer, nflags, token->encoding))) {
     GRN_FREE(token);
-    ERR(GRN_TOKENIZER_ERROR, "grn_str_open failed at grn_token_open");
+    ERR(GRN_TOKENIZER_ERROR, "grn_string_open failed at grn_token_open");
     return NULL;
   }
-  token->next = (unsigned char *)token->nstr->norm;
-  token->end = token->next + token->nstr->norm_blen;
-  token->ctypes = token->nstr->ctypes;
-  token->len = token->nstr->length;
+  grn_string_get_normalized(ctx, token->nstr,
+                            &normalized, &normalized_length_in_bytes,
+                            &(token->len));
+  token->next = (const unsigned char *)normalized;
+  token->end = token->next + normalized_length_in_bytes;
+  token->ctypes = grn_string_get_types(ctx, token->nstr);
   GRN_TEXT_INIT(&token->curr_, GRN_OBJ_DO_SHALLOW_COPY);
   GRN_UINT32_INIT(&token->stat_, 0);
   return NULL;
@@ -283,7 +300,7 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   grn_ngram_tokenizer *token = user_data->ptr;
   const unsigned char *p = token->next, *r = p, *e = token->end;
   int32_t len = 0, pos = token->pos + token->skip, status = 0;
-  uint_least8_t *cp = token->ctypes ? token->ctypes + pos : NULL;
+  const uint_least8_t *cp = token->ctypes ? token->ctypes + pos : NULL;
   if (cp && token->uni_alpha && GRN_STR_CTYPE(*cp) == grn_str_alpha) {
     while ((cl = grn_charlen_(ctx, (char *)r, (char *)e, token->encoding))) {
       len++;
@@ -371,7 +388,7 @@ static grn_obj *
 ngram_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 {
   grn_ngram_tokenizer *token = user_data->ptr;
-  grn_str_close(ctx, token->nstr);
+  grn_obj_close(ctx, token->nstr);
   GRN_FREE(token);
   return NULL;
 }
@@ -437,13 +454,20 @@ grn_token_open(grn_ctx *ctx, grn_obj *table, const char *str, size_t str_len,
     ((grn_proc *)tokenizer)->funcs[PROC_INIT](ctx, 1, &table, &token->pctx.user_data);
     grn_obj_close(ctx, &str_);
   } else {
-    int nflags = table_flags & GRN_OBJ_KEY_NORMALIZE;
-    token->nstr = grn_str_open_(ctx, str, str_len, nflags, token->encoding);
+    grn_obj *normalizer = NULL;
+    int nflags = 0;
+    if (table_flags & GRN_OBJ_KEY_NORMALIZE) {
+      normalizer = GRN_NORMALIZER_AUTO;
+    }
+    token->nstr = grn_string_open_(ctx, str, str_len,
+                                   normalizer, nflags, token->encoding);
     if (token->nstr) {
-      token->curr = (unsigned char *)token->nstr->norm;
-      token->curr_size = token->nstr->norm_blen;
+      const char *normalized;
+      grn_string_get_normalized(ctx, token->nstr,
+                                &normalized, &(token->curr_size), NULL);
+      token->curr = (const unsigned char *)normalized;
     } else {
-      ERR(GRN_TOKENIZER_ERROR, "grn_str_open failed at grn_token_open");
+      ERR(GRN_TOKENIZER_ERROR, "grn_string_open failed at grn_token_open");
     }
   }
   if (ctx->rc) {
@@ -561,7 +585,7 @@ grn_token_close(grn_ctx *ctx, grn_token *token)
     return GRN_SUCCESS;
   } else {
     if (token->nstr) {
-      grn_str_close(ctx, token->nstr);
+      grn_obj_close(ctx, token->nstr);
     }
     return GRN_INVALID_ARGUMENT;
   }

  Modified: lib/token.h (+1 -1)
===================================================================
--- lib/token.h    2012-06-08 17:38:04 +0900 (fb9a5b6)
+++ lib/token.h    2012-06-08 16:39:06 +0900 (785c52c)
@@ -58,7 +58,7 @@ typedef struct {
   grn_obj *tokenizer;
   grn_proc_ctx pctx;
   uint32_t variant;
-  grn_str *nstr;
+  grn_obj *nstr;
 } grn_token;
 
 enum {

  Modified: lib/tokenizer.c (+15 -5)
===================================================================
--- lib/tokenizer.c    2012-06-08 17:38:04 +0900 (0d767bf)
+++ lib/tokenizer.c    2012-06-08 16:39:06 +0900 (addded9)
@@ -25,6 +25,7 @@
 #include "ctx.h"
 #include "db.h"
 #include "str.h"
+#include "string_in.h"
 #include "token.h"
 
 /*
@@ -116,14 +117,23 @@ grn_tokenizer_query *grn_tokenizer_query_create(grn_ctx *ctx,
       }
       grn_table_get_info(ctx, table, &table_flags, &table_encoding, NULL);
       {
-        grn_str * const str = grn_str_open_(ctx, GRN_TEXT_VALUE(query_str),
-                                            GRN_TEXT_LEN(query_str),
-                                            table_flags & GRN_OBJ_KEY_NORMALIZE,
-                                            table_encoding);
-        if (str == NULL) {
+        grn_obj *normalizer = NULL;
+        int flags = 0;
+        grn_obj *normalized_string;
+        if (table_flags & GRN_OBJ_KEY_NORMALIZE) {
+          normalizer = GRN_NORMALIZER_AUTO;
+        }
+        normalized_string = grn_string_open_(ctx,
+                                             GRN_TEXT_VALUE(query_str),
+                                             GRN_TEXT_LEN(query_str),
+                                             normalizer,
+                                             flags,
+                                             table_encoding);
+        if (!normalized_string) {
           GRN_PLUGIN_FREE(ctx, query);
           return NULL;
         }
+        query->normalized_query = normalized_string;
         memcpy(query_buf, GRN_TEXT_VALUE(query_str), query_length);
         query_buf[query_length] = '\0';
         query->query_buf = query_buf;




Groonga-commit メーリングリストの案内
Back to archive index