null+****@clear*****
null+****@clear*****
2012年 2月 8日 (水) 18:18:36 JST
Kouhei Sutou 2012-02-08 18:18:36 +0900 (Wed, 08 Feb 2012)
New Revision: 67def859ac4bdcfef7345b5d654e0741d34a8710
Log:
[normalizer] implemented normalizer as grn_proc. refs #1164
Added files:
lib/normalizer.c
lib/normalizer.h
Modified files:
include/groonga.h
lib/Makefile.am
lib/dat.cpp
lib/dat.h
lib/db.c
lib/db.h
lib/expr.c
lib/hash.c
lib/hash.h
lib/pat.c
lib/pat.h
lib/snip.c
lib/str.c
lib/str.h
lib/util.c
test/unit/core/dat/test-dat.cpp
test/unit/util/test-snip.c
test/unit/util/test-string.c
Modified: include/groonga.h (+29 -2)
===================================================================
--- include/groonga.h 2012-02-09 09:53:43 +0900 (8a078c2)
+++ include/groonga.h 2012-02-08 18:18:36 +0900 (728df55)
@@ -414,6 +414,7 @@ typedef unsigned short int grn_obj_flags;
#define GRN_ACCESSOR_VIEW (0x0a)
#define GRN_SNIP (0x0b)
#define GRN_PATSNIP (0x0c)
+#define GRN_NORMALIZED_TEXT (0x0d)
#define GRN_CURSOR_TABLE_HASH_KEY (0x10)
#define GRN_CURSOR_TABLE_PAT_KEY (0x11)
#define GRN_CURSOR_TABLE_DAT_KEY (0x12)
@@ -590,6 +591,16 @@ typedef enum {
GRN_DB_TRIGRAM,
} grn_builtin_tokenizer;
+/*
+ * Object IDs of the built-in normalizers. The first ID is fixed at 96 and
+ * the remaining ones follow consecutively in declaration order.
+ * grn_normalizer_find() in lib/normalizer.c returns one of these values for
+ * a given grn_encoding.
+ */
+typedef enum {
+ GRN_DB_NORMALIZER_ASCII = 96,
+ GRN_DB_NORMALIZER_UTF8_NFKC, /* Normalization Form KC */
+ GRN_DB_NORMALIZER_EUC_JP,
+ GRN_DB_NORMALIZER_SJIS,
+ GRN_DB_NORMALIZER_LATIN1,
+ GRN_DB_NORMALIZER_KOI8R,
+ GRN_DB_NORMALIZER_UTF8_UCA /* Unicode Collation Algorithm */
+} grn_builtin_normalizer;
+
GRN_API grn_obj *grn_ctx_at(grn_ctx *ctx, grn_id id);
/**
@@ -636,7 +647,8 @@ typedef enum {
GRN_PROC_TOKENIZER = 1,
GRN_PROC_COMMAND,
GRN_PROC_FUNCTION,
- GRN_PROC_HOOK
+ GRN_PROC_HOOK,
+ GRN_PROC_NORMALIZER
} grn_proc_type;
GRN_API grn_obj *grn_proc_create(grn_ctx *ctx,
@@ -2422,7 +2434,7 @@ GRN_API void grn_time_now(grn_ctx *ctx, grn_obj *obj);
grn_bulk_write((ctx), (obj), (char *)&_val, sizeof(grn_obj *));\
} while (0)
-/* grn_str */
+/* grn_str: deprecated */
typedef struct {
const char *orig;
@@ -2445,6 +2457,21 @@ GRN_API grn_str *grn_str_open(grn_ctx *ctx, const char *str, unsigned int str_le
int flags);
GRN_API grn_rc grn_str_close(grn_ctx *ctx, grn_str *nstr);
+/* grn_normalized_text */
+
+/*
+ * Bits for the `flags` argument of grn_normalized_text_open().
+ * NOTE(review): the built-in normalizers in lib/normalizer.c test the
+ * GRN_STR_REMOVEBLANK / GRN_STR_WITH_CTYPES / GRN_STR_WITH_CHECKS bits;
+ * these are presumably the same bit values -- confirm.
+ */
+#define GRN_NORMALIZED_TEXT_REMOVE_BLANK (0x01<<0)
+#define GRN_NORMALIZED_TEXT_WITH_CTYPES (0x01<<1)
+#define GRN_NORMALIZED_TEXT_WITH_CHECKS (0x01<<2)
+
+/*
+ * Creates a GRN_NORMALIZED_TEXT object for the str_len bytes at `str` and
+ * runs `normalizer` on it.
+ *
+ * The `str` pointer is stored in the object without copying the bytes.
+ * Returns NULL when `normalizer` is NULL or the object itself cannot be
+ * allocated. A returned object is released with grn_obj_close().
+ */
+GRN_API grn_obj *grn_normalized_text_open(grn_ctx *ctx, grn_obj *normalizer,
+ const char *str, unsigned int str_len,
+ grn_encoding encoding, int flags);
+/*
+ * Reads the result held by `normalized_text`.
+ *
+ * Each of `value`, `length` and `binary_length` may be NULL, in which case
+ * that item is not reported. `*value` receives the normalized buffer owned
+ * by the object (it is freed when the object is closed), `*length` the
+ * number of normalized characters and `*binary_length` the size of the
+ * normalized buffer in bytes.
+ *
+ * Returns GRN_SUCCESS, or GRN_INVALID_ARGUMENT when `normalized_text` is
+ * NULL.
+ */
+GRN_API grn_rc grn_normalized_text_get_value(grn_ctx *ctx,
+ grn_obj *normalized_text,
+ const char **value,
+ unsigned int *length,
+ unsigned int *binary_length);
+
GRN_API int grn_charlen(grn_ctx *ctx, const char *str, const char *end);
/* expr */
Modified: lib/Makefile.am (+2 -0)
===================================================================
--- lib/Makefile.am 2012-02-09 09:53:43 +0900 (639d6e1)
+++ lib/Makefile.am 2012-02-08 18:18:36 +0900 (c9c99da)
@@ -15,6 +15,7 @@ libgroonga_la_SOURCES = \
io.c \
str.c \
nfkc.c \
+ normalizer.c \
snip.c \
store.c \
com.c \
@@ -53,6 +54,7 @@ noinst_HEADERS = \
ii.h \
io.h \
nfkc.h \
+ normalizer.h \
output.h \
pat.h \
plugin_in.h \
Modified: lib/dat.cpp (+15 -0)
===================================================================
--- lib/dat.cpp 2012-02-09 09:53:43 +0900 (897186a)
+++ lib/dat.cpp 2012-02-08 18:18:36 +0900 (eddb5be)
@@ -22,6 +22,7 @@
#include "str.h"
#include "io.h"
#include "dat.h"
+#include "normalizer.h"
#include "util.h"
/*
@@ -312,6 +313,14 @@ grn_dat_create(grn_ctx *ctx, const char *path, uint32_t,
dat->header->encoding = encoding;
dat->header->tokenizer = GRN_ID_NIL;
dat->header->file_id = 0;
+ if (dat->header->flags & GRN_OBJ_KEY_NORMALIZE) {
+ dat->header->flags &= ~GRN_OBJ_KEY_NORMALIZE;
+ dat->header->normalizer = grn_normalizer_find(ctx, ctx->encoding);
+ dat->normalizer = grn_ctx_at(ctx, dat->header->normalizer);
+ } else {
+ dat->header->normalizer = GRN_ID_NIL;
+ dat->normalizer = NULL;
+ }
dat->encoding = encoding;
dat->tokenizer = NULL;
return dat;
@@ -347,6 +356,12 @@ grn_dat_open(grn_ctx *ctx, const char *path)
dat->encoding = dat->header->encoding;
dat->obj.header.flags = dat->header->flags;
dat->tokenizer = grn_ctx_at(ctx, dat->header->tokenizer);
+ if (dat->header->flags & GRN_OBJ_KEY_NORMALIZE) {
+ dat->header->flags &= ~GRN_OBJ_KEY_NORMALIZE;
+ dat->header->normalizer = grn_normalizer_find(ctx, ctx->encoding);
+
+ }
+ dat->normalizer = grn_ctx_at(ctx, dat->header->normalizer);
return dat;
}
Modified: lib/dat.h (+3 -0)
===================================================================
--- lib/dat.h 2012-02-09 09:53:43 +0900 (a92b0f7)
+++ lib/dat.h 2012-02-08 18:18:36 +0900 (4409467)
@@ -36,6 +36,7 @@ struct _grn_dat {
void *trie;
void *old_trie;
grn_obj *tokenizer;
+ grn_obj *normalizer;
grn_critical_section lock;
};
@@ -44,6 +45,8 @@ struct grn_dat_header {
grn_encoding encoding;
grn_id tokenizer;
uint32_t file_id;
+ grn_id normalizer;
+ uint32_t reserved[235];
};
struct _grn_dat_cursor {
Modified: lib/db.c (+21 -6)
===================================================================
--- lib/db.c 2012-02-09 09:53:43 +0900 (9b64a10)
+++ lib/db.c 2012-02-08 18:18:36 +0900 (99099a7)
@@ -22,6 +22,7 @@
#include "ii.h"
#include "ctx_impl.h"
#include "token.h"
+#include "normalizer.h"
#include "proc.h"
#include "plugin_in.h"
#include "geo.h"
@@ -32,13 +33,16 @@
#define NEXT_ADDR(p) (((byte *)(p)) + sizeof *(p))
#define WITH_NORMALIZE(table,key,key_size,block) {\
- if ((table)->obj.header.flags & GRN_OBJ_KEY_NORMALIZE) {\
- grn_str *nstr;\
- if ((nstr = grn_str_open(ctx, key, key_size, GRN_STR_NORMALIZE))) { \
- char *key = nstr->norm;\
- unsigned int key_size = nstr->norm_blen;\
+ if ((table)->normalizer) {\
+ grn_obj *nstr;\
+ if ((nstr = grn_normalized_text_open(ctx, (table)->normalizer,\
+ key, key_size,\
+ (table)->encoding, 0))) {\
+ const char *key;\
+ unsigned int key_size;\
+ grn_normalized_text_get_value(ctx, nstr, &key, NULL, &key_size);\
block\
- grn_str_close(ctx, nstr);\
+ grn_obj_close(ctx, nstr);\
}\
} else {\
block\
@@ -139,6 +143,7 @@ grn_db_create(grn_ctx *ctx, const char *path, grn_db_create_optarg *optarg)
if ((s->specs = grn_ja_create(ctx, buffer, 65536, 0))) {
grn_ctx_use(ctx, (grn_obj *)s);
grn_db_init_builtin_types(ctx);
+ grn_db_init_builtin_normalizers(ctx);
GRN_API_RETURN((grn_obj *)s);
} else {
ERR(GRN_NO_MEMORY_AVAILABLE, "ja create failed");
@@ -147,6 +152,7 @@ grn_db_create(grn_ctx *ctx, const char *path, grn_db_create_optarg *optarg)
s->specs = NULL;
grn_ctx_use(ctx, (grn_obj *)s);
grn_db_init_builtin_types(ctx);
+ grn_db_init_builtin_normalizers(ctx);
GRN_API_RETURN((grn_obj *)s);
}
if (use_pat_as_db_keys) {
@@ -208,6 +214,7 @@ grn_db_open(grn_ctx *ctx, const char *path)
}
#endif
grn_db_init_builtin_tokenizers(ctx);
+ grn_db_init_builtin_normalizers(ctx);
grn_db_init_builtin_query(ctx);
GRN_API_RETURN((grn_obj *)s);
}
@@ -6880,6 +6887,9 @@ grn_obj_close(grn_ctx *ctx, grn_obj *obj)
case GRN_ACCESSOR_VIEW :
rc = grn_accessor_view_close(ctx, obj);
break;
+ case GRN_NORMALIZED_TEXT :
+ rc = grn_normalized_text_close(ctx, obj);
+ break;
case GRN_CURSOR_TABLE_PAT_KEY :
grn_pat_cursor_close(ctx, (grn_pat_cursor *)obj);
break;
@@ -7992,6 +8002,11 @@ grn_db_init_builtin_types(grn_ctx *ctx)
}
#endif
grn_db_init_builtin_tokenizers(ctx);
+ for (id = grn_db_curr_id(ctx, db) + 1; id < GRN_DB_NORMALIZER_ASCII; id++) {
+ grn_itoh(id, buf + 3, 2);
+ grn_obj_register(ctx, db, buf, 5);
+ }
+ grn_db_init_builtin_normalizers(ctx);
for (id = grn_db_curr_id(ctx, db) + 1; id < 128; id++) {
grn_itoh(id, buf + 3, 2);
grn_obj_register(ctx, db, buf, 5);
Modified: lib/db.h (+1 -1)
===================================================================
--- lib/db.h 2012-02-09 09:53:43 +0900 (2f7271b)
+++ lib/db.h 2012-02-08 18:18:36 +0900 (4f76d43)
@@ -92,7 +92,7 @@ grn_id grn_table_get_v(grn_ctx *ctx, grn_obj *table, const void *key, int key_si
grn_id grn_table_add_v(grn_ctx *ctx, grn_obj *table, const void *key, int key_size,
void **value, int *added);
GRN_API grn_rc grn_table_get_info(grn_ctx *ctx, grn_obj *table, grn_obj_flags *flags,
- grn_encoding *encoding, grn_obj **tokenizer);
+ grn_encoding *encoding, grn_obj **tokenizer);
const char *_grn_table_key(grn_ctx *ctx, grn_obj *table, grn_id id, uint32_t *key_size);
grn_rc grn_table_search(grn_ctx *ctx, grn_obj *table,
Modified: lib/expr.c (+24 -7)
===================================================================
--- lib/expr.c 2012-02-09 09:53:43 +0900 (3c79202)
+++ lib/expr.c 2012-02-08 18:18:36 +0900 (795192d)
@@ -22,6 +22,7 @@
#include <float.h>
#include "ii.h"
#include "geo.h"
+#include "normalizer.h"
#include "util.h"
static inline int
@@ -2241,13 +2242,19 @@ grn_proc_call(grn_ctx *ctx, grn_obj *proc, int nargs, grn_obj *caller)
void
pseudo_query_scan(grn_ctx *ctx, grn_obj *x, grn_obj *y, grn_obj *res)
{
- grn_str *a = NULL, *b = NULL;
+ grn_id normalizer_id;
+ grn_obj *normalizer;
+ grn_obj *a = NULL, *b = NULL;
+ normalizer_id = grn_normalizer_find(ctx, ctx->encoding);
+ normalizer = grn_ctx_at(ctx, normalizer_id);
switch (x->header.domain) {
case GRN_DB_SHORT_TEXT:
case GRN_DB_TEXT:
case GRN_DB_LONG_TEXT:
- a = grn_str_open(ctx, GRN_TEXT_VALUE(x), GRN_TEXT_LEN(x), GRN_STR_NORMALIZE);
+ a = grn_normalized_text_open(ctx, normalizer,
+ GRN_TEXT_VALUE(x), GRN_TEXT_LEN(x),
+ ctx->encoding, 0);
break;
default:
break;
@@ -2257,23 +2264,33 @@ pseudo_query_scan(grn_ctx *ctx, grn_obj *x, grn_obj *y, grn_obj *res)
case GRN_DB_SHORT_TEXT:
case GRN_DB_TEXT:
case GRN_DB_LONG_TEXT:
- b = grn_str_open(ctx, GRN_TEXT_VALUE(y), GRN_TEXT_LEN(y), GRN_STR_NORMALIZE);
+ b = grn_normalized_text_open(ctx, normalizer,
+ GRN_TEXT_VALUE(y), GRN_TEXT_LEN(y),
+ ctx->encoding, 0);
break;
default:
break;
}
/* normalized str doesn't contain '\0'. */
- if (a && b && strstr(a->norm, b->norm)) {
- GRN_INT32_SET(ctx, res, 1);
+ if (a && b) {
+ const char *normalized_a, *normalized_b;
+ grn_normalized_text_get_value(ctx, a, &normalized_a, NULL, NULL);
+ grn_normalized_text_get_value(ctx, b, &normalized_b, NULL, NULL);
+ if (strstr(normalized_a, normalized_b)) {
+ GRN_INT32_SET(ctx, res, 1);
+ } else {
+ GRN_INT32_SET(ctx, res, 0);
+ }
} else {
GRN_INT32_SET(ctx, res, 0);
}
res->header.type = GRN_BULK;
res->header.domain = GRN_DB_INT32;
- if (a) { grn_str_close(ctx, a); }
- if (b) { grn_str_close(ctx, b); }
+ if (a) { grn_obj_close(ctx, a); }
+ if (b) { grn_obj_close(ctx, b); }
+ if (normalizer) { grn_obj_unlink(ctx, normalizer); }
}
grn_obj *
Modified: lib/hash.c (+18 -1)
===================================================================
--- lib/hash.c 2012-02-09 09:53:43 +0900 (e30f1f0)
+++ lib/hash.c 2012-02-08 18:18:36 +0900 (9a5455a)
@@ -18,6 +18,7 @@
#include "hash.h"
#include "pat.h"
#include "output.h"
+#include "normalizer.h"
#include <string.h>
#include <limits.h>
@@ -868,6 +869,14 @@ io_hash_init(grn_hash *ih, grn_ctx *ctx, const char *path, uint32_t key_size,
header->n_entries = 0;
header->n_garbages = 0;
header->tokenizer = GRN_ID_NIL;
+ if (header->flags & GRN_OBJ_KEY_NORMALIZE) {
+ header->flags &= ~GRN_OBJ_KEY_NORMALIZE;
+ header->normalizer = grn_normalizer_find(ctx, ctx->encoding);
+ ih->normalizer = grn_ctx_at(ctx, header->normalizer);
+ } else {
+ header->normalizer = GRN_ID_NIL;
+ ih->normalizer = NULL;
+ }
ih->obj.header.flags = flags;
ih->ctx = ctx;
ih->key_size = key_size;
@@ -922,6 +931,7 @@ tiny_hash_init(grn_hash *ah, grn_ctx *ctx, const char *path, uint32_t key_size,
ah->n_entries_ = 0;
ah->garbages = GRN_ID_NIL;
ah->tokenizer = NULL;
+ ah->normalizer = NULL;
grn_tiny_array_init(ctx, &ah->a, entry_size, GRN_TINY_ARRAY_CLEAR);
grn_tiny_array_init(ctx, &ah->bitmap, 1, GRN_TINY_ARRAY_CLEAR);
return GRN_SUCCESS;
@@ -981,6 +991,11 @@ grn_hash_open(grn_ctx *ctx, const char *path)
hash->header = header;
hash->lock = &header->lock;
hash->tokenizer = grn_ctx_at(ctx, header->tokenizer);
+ if (header->flags & GRN_OBJ_KEY_NORMALIZE) {
+ header->flags &= ~GRN_OBJ_KEY_NORMALIZE;
+ header->normalizer = grn_normalizer_find(ctx, ctx->encoding);
+ }
+ hash->normalizer = grn_ctx_at(ctx, header->normalizer);
return (grn_hash *)hash;
} else {
GRN_LOG(ctx, GRN_LOG_NOTICE, "invalid hash flag. (%x)", header->flags);
@@ -2144,7 +2159,7 @@ grn_hash_check(grn_ctx *ctx, grn_hash *hash)
char buf[8];
struct grn_hash_header *h = hash->header;
GRN_OUTPUT_ARRAY_OPEN("RESULT", 1);
- GRN_OUTPUT_MAP_OPEN("SUMMARY", 24);
+ GRN_OUTPUT_MAP_OPEN("SUMMARY", 25);
GRN_OUTPUT_CSTR("flags");
grn_itoh(h->flags, buf, 8);
GRN_OUTPUT_STR(buf, 8);
@@ -2154,6 +2169,8 @@ grn_hash_check(grn_ctx *ctx, grn_hash *hash)
GRN_OUTPUT_INT64(hash->value_size);
GRN_OUTPUT_CSTR("tokenizer");
GRN_OUTPUT_INT64(h->tokenizer);
+ GRN_OUTPUT_CSTR("normalizer");
+ GRN_OUTPUT_INT64(h->normalizer);
GRN_OUTPUT_CSTR("curr_rec");
GRN_OUTPUT_INT64(h->curr_rec);
GRN_OUTPUT_CSTR("curr_key");
Modified: lib/hash.h (+3 -1)
===================================================================
--- lib/hash.h 2012-02-09 09:53:43 +0900 (efe364f)
+++ lib/hash.h 2012-02-08 18:18:36 +0900 (541835e)
@@ -185,6 +185,7 @@ struct _grn_hash {
uint32_t *n_entries;
uint32_t *max_offset;
grn_obj *tokenizer;
+ grn_obj *normalizer;
/* portions for io_hash */
grn_io *io;
struct grn_hash_header *header;
@@ -225,7 +226,8 @@ struct grn_hash_header {
uint32_t n_entries;
uint32_t n_garbages;
uint32_t lock;
- uint32_t reserved[16];
+ grn_id normalizer;
+ uint32_t reserved[15];
grn_id garbages[GRN_HASH_MAX_KEY_SIZE];
};
Added: lib/normalizer.c (+1183 -0) 100644
===================================================================
--- /dev/null
+++ lib/normalizer.c 2012-02-08 18:18:36 +0900 (a21de48)
@@ -0,0 +1,1183 @@
+/* -*- c-basic-offset: 2 -*- */
+/*
+ Copyright(C) 2012 Brazil
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License version 2.1 as published by the Free Software Foundation.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#include "groonga_in.h"
+#include <string.h>
+#include "normalizer.h"
+#include "str.h"
+
+/*
+ * Maps an encoding to the ID of the built-in normalizer that handles it.
+ *
+ * UTF-8 maps to the NFKC normalizer unless the library is built with
+ * NO_NFKC, in which case it maps to the ASCII normalizer. Every encoding
+ * without a dedicated case also maps to the ASCII normalizer.
+ * `ctx` is not used.
+ */
+grn_id
+grn_normalizer_find(grn_ctx *ctx, grn_encoding encoding)
+{
+  switch (encoding) {
+  case GRN_ENC_EUC_JP :
+    return GRN_DB_NORMALIZER_EUC_JP;
+  case GRN_ENC_UTF8 :
+#ifdef NO_NFKC
+    return GRN_DB_NORMALIZER_ASCII;
+#else /* NO_NFKC */
+    return GRN_DB_NORMALIZER_UTF8_NFKC;
+#endif /* NO_NFKC */
+  case GRN_ENC_SJIS :
+    return GRN_DB_NORMALIZER_SJIS;
+  case GRN_ENC_LATIN1 :
+    return GRN_DB_NORMALIZER_LATIN1;
+  case GRN_ENC_KOI8R :
+    return GRN_DB_NORMALIZER_KOI8R;
+  default :
+    return GRN_DB_NORMALIZER_ASCII;
+  }
+}
+
+/*
+ * Module initialization hook for the normalizer subsystem.
+ * Currently has nothing to set up and always returns GRN_SUCCESS.
+ */
+grn_rc
+grn_normalizer_init(void)
+{
+ return GRN_SUCCESS;
+}
+
+/*
+ * Module finalization hook for the normalizer subsystem.
+ * Currently has nothing to tear down and always returns GRN_SUCCESS.
+ */
+grn_rc
+grn_normalizer_fin(void)
+{
+ return GRN_SUCCESS;
+}
+
+/*
+ * Creates a GRN_NORMALIZED_TEXT object for `str` and runs `normalizer`
+ * (a grn_proc) on it.
+ *
+ * ctx:        groonga context.
+ * normalizer: normalizer proc; its PROC_NEXT function is called with the
+ *             new object as its single argument.
+ * str:        source bytes; the pointer is stored, not copied.
+ * str_len:    size of `str` in bytes.
+ * encoding:   encoding recorded in the object.
+ * flags:      flags recorded in the object and read by the normalizer.
+ *
+ * Returns the new object, or NULL when `normalizer` is NULL, when the
+ * object cannot be allocated, or when the normalizer produced no output
+ * buffer. The caller releases the object with grn_obj_close().
+ */
+grn_obj *
+grn_normalized_text_open(grn_ctx *ctx, grn_obj *normalizer,
+                         const char *str, unsigned int str_len,
+                         grn_encoding encoding, int flags)
+{
+  grn_normalized_text *normalized_text;
+  grn_obj *obj;
+
+  if (!normalizer) {
+    return NULL;
+  }
+
+  normalized_text = GRN_MALLOCN(grn_normalized_text, 1);
+  if (!normalized_text) {
+    return NULL;
+  }
+
+  GRN_API_ENTER;
+  obj = (grn_obj *)normalized_text;
+  GRN_OBJ_INIT(obj, GRN_NORMALIZED_TEXT, GRN_OBJ_ALLOCATED, GRN_ID_NIL);
+  normalized_text->orig = str;
+  normalized_text->orig_blen = str_len;
+  normalized_text->norm = NULL;
+  normalized_text->norm_blen = 0;
+  normalized_text->length = 0;
+  normalized_text->checks = NULL;
+  normalized_text->ctypes = NULL;
+  normalized_text->encoding = encoding;
+  normalized_text->flags = flags;
+
+  ((grn_proc *)normalizer)->funcs[PROC_NEXT](ctx, 1, &obj, NULL);
+
+  /*
+   * The normalizers in this file leave norm NULL only when they failed to
+   * allocate their buffers. Do not hand such an object to callers, which
+   * use the normalized value as a C string.
+   */
+  if (!normalized_text->norm) {
+    grn_normalized_text_close(ctx, obj);
+    obj = NULL;
+  }
+
+  GRN_API_RETURN(obj);
+}
+
+/*
+ * Reports the normalization result stored in `normalized_text`.
+ *
+ * Each output pointer is optional; a NULL pointer skips that item.
+ *   value:         receives the normalized buffer (owned by the object).
+ *   length:        receives the number of normalized characters.
+ *   binary_length: receives the size of the normalized buffer in bytes.
+ *
+ * Returns GRN_SUCCESS, or GRN_INVALID_ARGUMENT when `normalized_text` is
+ * NULL.
+ */
+grn_rc
+grn_normalized_text_get_value(grn_ctx *ctx, grn_obj *normalized_text,
+                              const char **value, unsigned int *length,
+                              unsigned int *binary_length)
+{
+  grn_normalized_text *ntext = (grn_normalized_text *)normalized_text;
+  grn_rc rc = GRN_INVALID_ARGUMENT;
+
+  GRN_API_ENTER;
+  if (ntext) {
+    if (value) {
+      *value = ntext->norm;
+    }
+    if (length) {
+      *length = ntext->length;
+    }
+    if (binary_length) {
+      *binary_length = ntext->norm_blen;
+    }
+    rc = GRN_SUCCESS;
+  }
+  GRN_API_RETURN(rc);
+}
+
+/*
+ * Releases a normalized text object together with whichever of its
+ * norm / ctypes / checks buffers were allocated.
+ *
+ * Returns GRN_SUCCESS, or GRN_INVALID_ARGUMENT when `normalized_text` is
+ * NULL.
+ */
+grn_rc
+grn_normalized_text_close(grn_ctx *ctx, grn_obj *normalized_text)
+{
+  grn_normalized_text *ntext = (grn_normalized_text *)normalized_text;
+
+  if (!ntext) {
+    return GRN_INVALID_ARGUMENT;
+  }
+
+  if (ntext->norm) {
+    GRN_FREE(ntext->norm);
+  }
+  if (ntext->ctypes) {
+    GRN_FREE(ntext->ctypes);
+  }
+  if (ntext->checks) {
+    GRN_FREE(ntext->checks);
+  }
+  GRN_FREE(ntext);
+  return GRN_SUCCESS;
+}
+
+/*
+ * ASCII replacements for double-byte symbol characters, shared by the
+ * EUC-JP and Shift_JIS normalizers. The index is derived from the second
+ * byte of the character (c2 - 0xa4 in eucjp_normalize; c2 - 0x43 or
+ * c2 - 0x44 in sjis_normalize). An entry of 0 means the character has no
+ * ASCII replacement and is copied through unchanged.
+ */
+static unsigned char symbol[] = {
+ ',', '.', 0, ':', ';', '?', '!', 0, 0, 0, '`', 0, '^', '~', '_', 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, '-', '-', '/', '\\', 0, 0, '|', 0, 0, 0, '\'', 0,
+ '"', '(', ')', 0, 0, '[', ']', '{', '}', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ '+', '-', 0, 0, 0, '=', 0, '<', '>', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ '$', 0, 0, '%', '#', '&', '*', '@', 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/*
+ * Normalizer proc for EUC-JP text.
+ *
+ * args[0] is the grn_normalized_text to fill in. The function allocates
+ * nstr->norm, plus nstr->checks and nstr->ctypes when the corresponding
+ * GRN_STR_WITH_CHECKS / GRN_STR_WITH_CTYPES flag is set, then walks the
+ * source bytes and:
+ *   - replaces 0x8e-prefixed characters (typed as katakana) through the
+ *     hankana table, folding a following voiced / semi-voiced mark into
+ *     the previously written character when a combined form exists;
+ *   - replaces double-byte alphanumerics and symbols that have an ASCII
+ *     equivalent with that ASCII character, lower-casing letters;
+ *   - lower-cases ASCII letters;
+ *   - drops bytes below 0x20, stray high bytes, and (with
+ *     GRN_STR_REMOVEBLANK) blanks.
+ *
+ * Results are stored in nstr->norm (NUL-terminated), nstr->norm_blen and
+ * nstr->length. On allocation failure ERR() is raised and nstr->norm is
+ * left NULL. The function always returns NULL.
+ */
+inline static grn_obj *
+eucjp_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
+ grn_user_data *user_data)
+{
+ /* replacement for 0x8e-prefixed characters, indexed by (c2 - 0xa0) */
+ static uint16_t hankana[] = {
+ 0xa1a1, 0xa1a3, 0xa1d6, 0xa1d7, 0xa1a2, 0xa1a6, 0xa5f2, 0xa5a1, 0xa5a3,
+ 0xa5a5, 0xa5a7, 0xa5a9, 0xa5e3, 0xa5e5, 0xa5e7, 0xa5c3, 0xa1bc, 0xa5a2,
+ 0xa5a4, 0xa5a6, 0xa5a8, 0xa5aa, 0xa5ab, 0xa5ad, 0xa5af, 0xa5b1, 0xa5b3,
+ 0xa5b5, 0xa5b7, 0xa5b9, 0xa5bb, 0xa5bd, 0xa5bf, 0xa5c1, 0xa5c4, 0xa5c6,
+ 0xa5c8, 0xa5ca, 0xa5cb, 0xa5cc, 0xa5cd, 0xa5ce, 0xa5cf, 0xa5d2, 0xa5d5,
+ 0xa5d8, 0xa5db, 0xa5de, 0xa5df, 0xa5e0, 0xa5e1, 0xa5e2, 0xa5e4, 0xa5e6,
+ 0xa5e8, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5ef, 0xa5f3, 0xa1ab,
+ 0xa1eb
+ };
+ /*
+ * second byte of the form combined with 0xa1ab, indexed by
+ * (previous second byte - 0xa6); 0 means there is no combined form
+ */
+ static unsigned char dakuten[] = {
+ 0xf4, 0, 0, 0, 0, 0xac, 0, 0xae, 0, 0xb0, 0, 0xb2, 0, 0xb4, 0, 0xb6, 0,
+ 0xb8, 0, 0xba, 0, 0xbc, 0, 0xbe, 0, 0xc0, 0, 0xc2, 0, 0, 0xc5, 0, 0xc7,
+ 0, 0xc9, 0, 0, 0, 0, 0, 0, 0xd0, 0, 0, 0xd3, 0, 0, 0xd6, 0, 0, 0xd9, 0,
+ 0, 0xdc
+ };
+ /*
+ * second byte of the form combined with 0xa1eb, indexed by
+ * (previous second byte - 0xcf); 0 means there is no combined form
+ */
+ static unsigned char handaku[] = {
+ 0xd1, 0, 0, 0xd4, 0, 0, 0xd7, 0, 0, 0xda, 0, 0, 0xdd
+ };
+ grn_normalized_text *nstr = (grn_normalized_text *)args[0];
+ int16_t *ch;
+ const unsigned char *s, *s_, *e;
+ unsigned char *d, *d0, *d_, b;
+ uint_least8_t *cp, *ctypes, ctype;
+ size_t size = nstr->orig_blen, length = 0;
+ int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK;
+ if (!(nstr->norm = GRN_MALLOC(size * 2 + 1))) {
+ ERR(GRN_NO_MEMORY_AVAILABLE,
+ "[normalizer][eucjp] failed to allocate normalized text space");
+ return NULL;
+ }
+ d0 = (unsigned char *) nstr->norm;
+ if (nstr->flags & GRN_STR_WITH_CHECKS) {
+ if (!(nstr->checks = GRN_MALLOC(size * 2 * sizeof(int16_t) + 1))) {
+ GRN_FREE(nstr->norm);
+ nstr->norm = NULL;
+ ERR(GRN_NO_MEMORY_AVAILABLE,
+ "[normalizer][eucjp] failed to allocate checks space");
+ return NULL;
+ }
+ }
+ ch = nstr->checks;
+ if (nstr->flags & GRN_STR_WITH_CTYPES) {
+ if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
+ /* checks is only allocated when GRN_STR_WITH_CHECKS was requested */
+ if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
+ GRN_FREE(nstr->norm);
+ nstr->norm = NULL;
+ ERR(GRN_NO_MEMORY_AVAILABLE,
+ "[normalizer][eucjp] failed to allocate character types space");
+ return NULL;
+ }
+ }
+ cp = ctypes = nstr->ctypes;
+ e = (unsigned char *)nstr->orig + size;
+ /*
+ * s walks the source and d the output. s_ and d_ mark where the source
+ * and output stood after the previous character; they are used to fill
+ * in checks at the bottom of the loop.
+ */
+ for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) {
+ if ((*s & 0x80)) {
+ if (((s + 1) < e) && (*(s + 1) & 0x80)) {
+ unsigned char c1 = *s++, c2 = *s, c3 = 0;
+ switch (c1 >> 4) {
+ case 0x08 :
+ if (c1 == 0x8e && 0xa0 <= c2 && c2 <= 0xdf) {
+ uint16_t c = hankana[c2 - 0xa0];
+ switch (c) {
+ case 0xa1ab :
+ /* fold the mark into the previous 0xa5-row character if possible */
+ if (d > d0 + 1 && d[-2] == 0xa5
+ && 0xa6 <= d[-1] && d[-1] <= 0xdb && (b = dakuten[d[-1] - 0xa6])) {
+ *(d - 1) = b;
+ if (ch) { ch[-1] += 2; s_ += 2; }
+ continue;
+ } else {
+ *d++ = c >> 8; *d = c & 0xff;
+ }
+ break;
+ case 0xa1eb :
+ /* fold the mark into the previous 0xa5-row character if possible */
+ if (d > d0 + 1 && d[-2] == 0xa5
+ && 0xcf <= d[-1] && d[-1] <= 0xdb && (b = handaku[d[-1] - 0xcf])) {
+ *(d - 1) = b;
+ if (ch) { ch[-1] += 2; s_ += 2; }
+ continue;
+ } else {
+ *d++ = c >> 8; *d = c & 0xff;
+ }
+ break;
+ default :
+ *d++ = c >> 8; *d = c & 0xff;
+ break;
+ }
+ ctype = grn_str_katakana;
+ } else {
+ *d++ = c1; *d = c2;
+ ctype = grn_str_others;
+ }
+ break;
+ case 0x09 :
+ *d++ = c1; *d = c2;
+ ctype = grn_str_others;
+ break;
+ case 0x0a :
+ switch (c1 & 0x0f) {
+ case 1 :
+ switch (c2) {
+ case 0xbc :
+ *d++ = c1; *d = c2;
+ ctype = grn_str_katakana;
+ break;
+ case 0xb9 :
+ *d++ = c1; *d = c2;
+ ctype = grn_str_kanji;
+ break;
+ case 0xa1 :
+ /* double-byte blank: dropped or replaced by an ASCII space */
+ if (removeblankp) {
+ if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; }
+ continue;
+ } else {
+ *d = ' ';
+ ctype = GRN_STR_BLANK|grn_str_symbol;
+ }
+ break;
+ default :
+ if (c2 >= 0xa4 && (c3 = symbol[c2 - 0xa4])) {
+ *d = c3;
+ ctype = grn_str_symbol;
+ } else {
+ *d++ = c1; *d = c2;
+ ctype = grn_str_others;
+ }
+ break;
+ }
+ break;
+ case 2 :
+ *d++ = c1; *d = c2;
+ ctype = grn_str_symbol;
+ break;
+ case 3 :
+ /* double-byte alphanumerics become lower-case ASCII */
+ c3 = c2 - 0x80;
+ if ('a' <= c3 && c3 <= 'z') {
+ ctype = grn_str_alpha;
+ *d = c3;
+ } else if ('A' <= c3 && c3 <= 'Z') {
+ ctype = grn_str_alpha;
+ *d = c3 + 0x20;
+ } else if ('0' <= c3 && c3 <= '9') {
+ ctype = grn_str_digit;
+ *d = c3;
+ } else {
+ ctype = grn_str_others;
+ *d++ = c1; *d = c2;
+ }
+ break;
+ case 4 :
+ *d++ = c1; *d = c2;
+ ctype = grn_str_hiragana;
+ break;
+ case 5 :
+ *d++ = c1; *d = c2;
+ ctype = grn_str_katakana;
+ break;
+ case 6 :
+ case 7 :
+ case 8 :
+ *d++ = c1; *d = c2;
+ ctype = grn_str_symbol;
+ break;
+ default :
+ *d++ = c1; *d = c2;
+ ctype = grn_str_others;
+ break;
+ }
+ break;
+ default :
+ *d++ = c1; *d = c2;
+ ctype = grn_str_kanji;
+ break;
+ }
+ } else {
+ /* skip invalid character */
+ continue;
+ }
+ } else {
+ unsigned char c = *s;
+ switch (c >> 4) {
+ case 0 :
+ case 1 :
+ /* skip unprintable ascii */
+ if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; }
+ continue;
+ case 2 :
+ if (c == 0x20) {
+ if (removeblankp) {
+ if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; }
+ continue;
+ } else {
+ *d = ' ';
+ ctype = GRN_STR_BLANK|grn_str_symbol;
+ }
+ } else {
+ *d = c;
+ ctype = grn_str_symbol;
+ }
+ break;
+ case 3 :
+ *d = c;
+ ctype = (c <= 0x39) ? grn_str_digit : grn_str_symbol;
+ break;
+ case 4 :
+ *d = ('A' <= c) ? c + 0x20 : c;
+ ctype = (c == 0x40) ? grn_str_symbol : grn_str_alpha;
+ break;
+ case 5 :
+ *d = (c <= 'Z') ? c + 0x20 : c;
+ ctype = (c <= 0x5a) ? grn_str_alpha : grn_str_symbol;
+ break;
+ case 6 :
+ *d = c;
+ ctype = (c == 0x60) ? grn_str_symbol : grn_str_alpha;
+ break;
+ case 7 :
+ *d = c;
+ ctype = (c <= 0x7a) ? grn_str_alpha : (c == 0x7f ? grn_str_others : grn_str_symbol);
+ break;
+ default :
+ *d = c;
+ ctype = grn_str_others;
+ break;
+ }
+ }
+ d++;
+ length++;
+ if (cp) { *cp++ = ctype; }
+ if (ch) {
+ /*
+ * first entry of the character: source bytes consumed since the
+ * previous character; remaining output bytes of the character get 0
+ */
+ *ch++ = (int16_t)(s + 1 - s_);
+ s_ = s + 1;
+ while (++d_ < d) { *ch++ = 0; }
+ }
+ }
+ if (cp) { *cp = grn_str_null; }
+ *d = '\0';
+ nstr->length = length;
+ nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
+ return NULL;
+}
+
+#ifndef NO_NFKC
+uint_least8_t grn_nfkc_ctype(const unsigned char *str);
+const char *grn_nfkc_map1(const unsigned char *str);
+const char *grn_nfkc_map2(const unsigned char *prefix, const unsigned char *suffix);
+
+/*
+ * Normalizer proc for UTF-8 text driven by the grn_nfkc_map1() /
+ * grn_nfkc_map2() / grn_nfkc_ctype() tables.
+ *
+ * args[0] is the grn_normalized_text to fill in. For every source
+ * character the function looks up a replacement string with
+ * grn_nfkc_map1(), then tries to combine the previously written character
+ * with the first replacement character through grn_nfkc_map2(). Bytes
+ * below 0x20 and (with GRN_STR_REMOVEBLANK) spaces are dropped.
+ *
+ * The output buffers start at three times the source size and are grown
+ * with GRN_REALLOC when a character would not fit.
+ *
+ * Results are stored in nstr->norm (NUL-terminated), nstr->norm_blen and
+ * nstr->length, plus nstr->checks / nstr->ctypes when requested by flags.
+ * On allocation failure ERR() is raised, every buffer is released and
+ * nstr->norm is left NULL. The function always returns NULL.
+ */
+inline static grn_obj *
+utf8_nfkc_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
+ grn_user_data *user_data)
+{
+ grn_normalized_text *nstr = (grn_normalized_text *)args[0];
+ int16_t *ch;
+ const unsigned char *s, *s_, *s__ = NULL, *p, *p2, *pe, *e;
+ unsigned char *d, *d_, *de;
+ uint_least8_t *cp;
+ size_t length = 0, ls, lp, size = nstr->orig_blen, ds = size * 3;
+ int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK;
+ if (!(nstr->norm = GRN_MALLOC(ds + 1))) {
+ ERR(GRN_NO_MEMORY_AVAILABLE,
+ "[normalizer][utf8][nfkc] failed to allocate normalized text space");
+ return NULL;
+ }
+ if (nstr->flags & GRN_STR_WITH_CHECKS) {
+ if (!(nstr->checks = GRN_MALLOC(ds * sizeof(int16_t) + 1))) {
+ GRN_FREE(nstr->norm);
+ nstr->norm = NULL;
+ ERR(GRN_NO_MEMORY_AVAILABLE,
+ "[normalizer][utf8][nfkc] failed to allocate checks space");
+ return NULL;
+ }
+ }
+ ch = nstr->checks;
+ if (nstr->flags & GRN_STR_WITH_CTYPES) {
+ if (!(nstr->ctypes = GRN_MALLOC(ds + 1))) {
+ if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
+ GRN_FREE(nstr->norm);
+ nstr->norm = NULL;
+ ERR(GRN_NO_MEMORY_AVAILABLE,
+ "[normalizer][utf8][nfkc] failed to allocate character types space");
+ return NULL;
+ }
+ }
+ cp = nstr->ctypes;
+ d = (unsigned char *)nstr->norm;
+ de = d + ds;
+ /* d_ is the start of the most recently written output character */
+ d_ = NULL;
+ e = (unsigned char *)nstr->orig + size;
+ /* outer loop: one source character (ls bytes) per iteration */
+ for (s = s_ = (unsigned char *)nstr->orig; ; s += ls) {
+ if (!(ls = grn_str_charlen_utf8(ctx, s, e))) {
+ break;
+ }
+ /* [p, pe) is the replacement text, or the source character itself */
+ if ((p = (unsigned char *)grn_nfkc_map1(s))) {
+ pe = p + strlen((char *)p);
+ } else {
+ p = s;
+ pe = p + ls;
+ }
+ /*
+ * when the previous output character combines with this one, rewind
+ * the output (and checks / ctypes / length) to overwrite it
+ */
+ if (d_ && (p2 = (unsigned char *)grn_nfkc_map2(d_, p))) {
+ p = p2;
+ pe = p + strlen((char *)p);
+ if (cp) { cp--; }
+ if (ch) {
+ ch -= (d - d_);
+ s_ = s__;
+ }
+ d = d_;
+ length--;
+ }
+ /* inner loop: one replacement character (lp bytes) per iteration */
+ for (; ; p += lp) {
+ if (!(lp = grn_str_charlen_utf8(ctx, p, pe))) {
+ break;
+ }
+ if ((*p == ' ' && removeblankp) || *p < 0x20 /* skip unprintable ascii */ ) {
+ if (cp > nstr->ctypes) { *(cp - 1) |= GRN_STR_BLANK; }
+ } else {
+ /* grow all buffers together when the character would not fit */
+ if (de <= d + lp) {
+ unsigned char *norm;
+ ds += (ds >> 1) + lp;
+ if (!(norm = GRN_REALLOC(nstr->norm, ds + 1))) {
+ if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; }
+ if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
+ GRN_FREE(nstr->norm); nstr->norm = NULL;
+ ERR(GRN_NO_MEMORY_AVAILABLE,
+ "[normalizer][utf8][nfkc] "
+ "failed to reallocate normalized text space");
+ return NULL;
+ }
+ de = norm + ds;
+ d = norm + (d - (unsigned char *)nstr->norm);
+ nstr->norm = norm;
+ if (ch) {
+ int16_t *checks;
+ if (!(checks = GRN_REALLOC(nstr->checks, ds * sizeof(int16_t)+ 1))) {
+ if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; }
+ GRN_FREE(nstr->checks); nstr->checks = NULL;
+ GRN_FREE(nstr->norm); nstr->norm = NULL;
+ ERR(GRN_NO_MEMORY_AVAILABLE,
+ "[normalizer][utf8][nfkc] "
+ "failed to reallocate checks space");
+ return NULL;
+ }
+ ch = checks + (ch - nstr->checks);
+ nstr->checks = checks;
+ }
+ if (cp) {
+ uint_least8_t *ctypes;
+ if (!(ctypes = GRN_REALLOC(nstr->ctypes, ds + 1))) {
+ GRN_FREE(nstr->ctypes); nstr->ctypes = NULL;
+ if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
+ GRN_FREE(nstr->norm); nstr->norm = NULL;
+ ERR(GRN_NO_MEMORY_AVAILABLE,
+ "[normalizer][utf8][nfkc] "
+ "failed to reallocate character types space");
+ return NULL;
+ }
+ cp = ctypes + (cp - nstr->ctypes);
+ nstr->ctypes = ctypes;
+ }
+ }
+ memcpy(d, p, lp);
+ d_ = d;
+ d += lp;
+ length++;
+ if (cp) { *cp++ = grn_nfkc_ctype(p); }
+ if (ch) {
+ size_t i;
+ /*
+ * the first output character of a source character records the
+ * number of source bytes consumed; further output characters
+ * produced from the same source character record -1
+ */
+ if (s_ == s + ls) {
+ *ch++ = -1;
+ } else {
+ *ch++ = (int16_t)(s + ls - s_);
+ s__ = s_;
+ s_ = s + ls;
+ }
+ /* continuation bytes of the output character get 0 */
+ for (i = lp; i > 1; i--) { *ch++ = 0; }
+ }
+ }
+ }
+ }
+ if (cp) { *cp = grn_str_null; }
+ *d = '\0';
+ nstr->length = length;
+ nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
+ return NULL;
+}
+#endif /* NO_NFKC */
+
+/*
+ * Normalizer proc for Shift_JIS text.
+ *
+ * args[0] is the grn_normalized_text to fill in. The function allocates
+ * nstr->norm, plus nstr->checks and nstr->ctypes when the corresponding
+ * GRN_STR_WITH_CHECKS / GRN_STR_WITH_CTYPES flag is set, then walks the
+ * source bytes and:
+ *   - replaces single bytes 0xa0..0xdf (typed as katakana) through the
+ *     hankana table, folding a following voiced / semi-voiced mark into
+ *     the previously written character when a combined form exists;
+ *   - replaces double-byte alphanumerics and symbols that have an ASCII
+ *     equivalent with that ASCII character, lower-casing letters;
+ *   - lower-cases ASCII letters;
+ *   - drops bytes below 0x20, invalid lead bytes, and (with
+ *     GRN_STR_REMOVEBLANK) blanks.
+ *
+ * Results are stored in nstr->norm (NUL-terminated), nstr->norm_blen and
+ * nstr->length. On allocation failure ERR() is raised and nstr->norm is
+ * left NULL. The function always returns NULL.
+ */
+inline static grn_obj *
+sjis_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
+ grn_user_data *user_data)
+{
+ /* replacement for single bytes 0xa0..0xdf, indexed by (*s - 0xa0) */
+ static uint16_t hankana[] = {
+ 0x8140, 0x8142, 0x8175, 0x8176, 0x8141, 0x8145, 0x8392, 0x8340, 0x8342,
+ 0x8344, 0x8346, 0x8348, 0x8383, 0x8385, 0x8387, 0x8362, 0x815b, 0x8341,
+ 0x8343, 0x8345, 0x8347, 0x8349, 0x834a, 0x834c, 0x834e, 0x8350, 0x8352,
+ 0x8354, 0x8356, 0x8358, 0x835a, 0x835c, 0x835e, 0x8360, 0x8363, 0x8365,
+ 0x8367, 0x8369, 0x836a, 0x836b, 0x836c, 0x836d, 0x836e, 0x8371, 0x8374,
+ 0x8377, 0x837a, 0x837d, 0x837e, 0x8380, 0x8381, 0x8382, 0x8384, 0x8386,
+ 0x8388, 0x8389, 0x838a, 0x838b, 0x838c, 0x838d, 0x838f, 0x8393, 0x814a,
+ 0x814b
+ };
+ /*
+ * second byte of the form combined with 0x814a, indexed by
+ * (previous second byte - 0x45); 0 means there is no combined form
+ */
+ static unsigned char dakuten[] = {
+ 0x94, 0, 0, 0, 0, 0x4b, 0, 0x4d, 0, 0x4f, 0, 0x51, 0, 0x53, 0, 0x55, 0,
+ 0x57, 0, 0x59, 0, 0x5b, 0, 0x5d, 0, 0x5f, 0, 0x61, 0, 0, 0x64, 0, 0x66,
+ 0, 0x68, 0, 0, 0, 0, 0, 0, 0x6f, 0, 0, 0x72, 0, 0, 0x75, 0, 0, 0x78, 0,
+ 0, 0x7b
+ };
+ /*
+ * second byte of the form combined with 0x814b, indexed by
+ * (previous second byte - 0x6e); 0 means there is no combined form
+ */
+ static unsigned char handaku[] = {
+ 0x70, 0, 0, 0x73, 0, 0, 0x76, 0, 0, 0x79, 0, 0, 0x7c
+ };
+ grn_normalized_text *nstr = (grn_normalized_text *)args[0];
+ int16_t *ch;
+ const unsigned char *s, *s_;
+ unsigned char *d, *d0, *d_, b, *e;
+ uint_least8_t *cp, *ctypes, ctype;
+ size_t size = nstr->orig_blen, length = 0;
+ int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK;
+ if (!(nstr->norm = GRN_MALLOC(size * 2 + 1))) {
+ ERR(GRN_NO_MEMORY_AVAILABLE,
+ "[normalizer][sjis] failed to allocate normalized text space");
+ return NULL;
+ }
+ d0 = (unsigned char *) nstr->norm;
+ if (nstr->flags & GRN_STR_WITH_CHECKS) {
+ if (!(nstr->checks = GRN_MALLOC(size * 2 * sizeof(int16_t) + 1))) {
+ GRN_FREE(nstr->norm);
+ nstr->norm = NULL;
+ ERR(GRN_NO_MEMORY_AVAILABLE,
+ "[normalizer][sjis] failed to allocate checks space");
+ return NULL;
+ }
+ }
+ ch = nstr->checks;
+ if (nstr->flags & GRN_STR_WITH_CTYPES) {
+ if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
+ /* checks is only allocated when GRN_STR_WITH_CHECKS was requested */
+ if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
+ GRN_FREE(nstr->norm);
+ nstr->norm = NULL;
+ ERR(GRN_NO_MEMORY_AVAILABLE,
+ "[normalizer][sjis] failed to allocate character types space");
+ return NULL;
+ }
+ }
+ cp = ctypes = nstr->ctypes;
+ e = (unsigned char *)nstr->orig + size;
+ /*
+ * s walks the source and d the output. s_ and d_ mark where the source
+ * and output stood after the previous character; they are used to fill
+ * in checks at the bottom of the loop.
+ */
+ for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) {
+ if ((*s & 0x80)) {
+ if (0xa0 <= *s && *s <= 0xdf) {
+ uint16_t c = hankana[*s - 0xa0];
+ switch (c) {
+ case 0x814a :
+ /* fold the mark into the previous 0x83-row character if possible */
+ if (d > d0 + 1 && d[-2] == 0x83
+ && 0x45 <= d[-1] && d[-1] <= 0x7a && (b = dakuten[d[-1] - 0x45])) {
+ *(d - 1) = b;
+ if (ch) { ch[-1]++; s_++; }
+ continue;
+ } else {
+ *d++ = c >> 8; *d = c & 0xff;
+ }
+ break;
+ case 0x814b :
+ /* fold the mark into the previous 0x83-row character if possible */
+ if (d > d0 + 1 && d[-2] == 0x83
+ && 0x6e <= d[-1] && d[-1] <= 0x7a && (b = handaku[d[-1] - 0x6e])) {
+ *(d - 1) = b;
+ if (ch) { ch[-1]++; s_++; }
+ continue;
+ } else {
+ *d++ = c >> 8; *d = c & 0xff;
+ }
+ break;
+ default :
+ *d++ = c >> 8; *d = c & 0xff;
+ break;
+ }
+ ctype = grn_str_katakana;
+ } else {
+ if ((s + 1) < e && 0x40 <= *(s + 1) && *(s + 1) <= 0xfc) {
+ unsigned char c1 = *s++, c2 = *s, c3 = 0;
+ if (0x81 <= c1 && c1 <= 0x87) {
+ switch (c1 & 0x0f) {
+ case 1 :
+ switch (c2) {
+ case 0x5b :
+ *d++ = c1; *d = c2;
+ ctype = grn_str_katakana;
+ break;
+ case 0x58 :
+ *d++ = c1; *d = c2;
+ ctype = grn_str_kanji;
+ break;
+ case 0x40 :
+ /* double-byte blank: dropped or replaced by an ASCII space */
+ if (removeblankp) {
+ if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; }
+ continue;
+ } else {
+ *d = ' ';
+ ctype = GRN_STR_BLANK|grn_str_symbol;
+ }
+ break;
+ default :
+ if (0x43 <= c2 && c2 <= 0x7e && (c3 = symbol[c2 - 0x43])) {
+ *d = c3;
+ ctype = grn_str_symbol;
+ } else if (0x7f <= c2 && c2 <= 0x97 && (c3 = symbol[c2 - 0x44])) {
+ *d = c3;
+ ctype = grn_str_symbol;
+ } else {
+ *d++ = c1; *d = c2;
+ ctype = grn_str_others;
+ }
+ break;
+ }
+ break;
+ case 2 :
+ /* double-byte alphanumerics become lower-case ASCII */
+ c3 = c2 - 0x1f;
+ if (0x4f <= c2 && c2 <= 0x58) {
+ ctype = grn_str_digit;
+ *d = c2 - 0x1f;
+ } else if (0x60 <= c2 && c2 <= 0x79) {
+ ctype = grn_str_alpha;
+ *d = c2 + 0x01;
+ } else if (0x81 <= c2 && c2 <= 0x9a) {
+ ctype = grn_str_alpha;
+ *d = c2 - 0x20;
+ } else if (0x9f <= c2 && c2 <= 0xf1) {
+ *d++ = c1; *d = c2;
+ ctype = grn_str_hiragana;
+ } else {
+ *d++ = c1; *d = c2;
+ ctype = grn_str_others;
+ }
+ break;
+ case 3 :
+ if (0x40 <= c2 && c2 <= 0x96) {
+ *d++ = c1; *d = c2;
+ ctype = grn_str_katakana;
+ } else {
+ *d++ = c1; *d = c2;
+ ctype = grn_str_symbol;
+ }
+ break;
+ case 4 :
+ case 7 :
+ *d++ = c1; *d = c2;
+ ctype = grn_str_symbol;
+ break;
+ default :
+ *d++ = c1; *d = c2;
+ ctype = grn_str_others;
+ break;
+ }
+ } else {
+ *d++ = c1; *d = c2;
+ ctype = grn_str_kanji;
+ }
+ } else {
+ /* skip invalid character */
+ continue;
+ }
+ }
+ } else {
+ unsigned char c = *s;
+ switch (c >> 4) {
+ case 0 :
+ case 1 :
+ /* skip unprintable ascii */
+ if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; }
+ continue;
+ case 2 :
+ if (c == 0x20) {
+ if (removeblankp) {
+ if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; }
+ continue;
+ } else {
+ *d = ' ';
+ ctype = GRN_STR_BLANK|grn_str_symbol;
+ }
+ } else {
+ *d = c;
+ ctype = grn_str_symbol;
+ }
+ break;
+ case 3 :
+ *d = c;
+ ctype = (c <= 0x39) ? grn_str_digit : grn_str_symbol;
+ break;
+ case 4 :
+ *d = ('A' <= c) ? c + 0x20 : c;
+ ctype = (c == 0x40) ? grn_str_symbol : grn_str_alpha;
+ break;
+ case 5 :
+ *d = (c <= 'Z') ? c + 0x20 : c;
+ ctype = (c <= 0x5a) ? grn_str_alpha : grn_str_symbol;
+ break;
+ case 6 :
+ *d = c;
+ ctype = (c == 0x60) ? grn_str_symbol : grn_str_alpha;
+ break;
+ case 7 :
+ *d = c;
+ ctype = (c <= 0x7a) ? grn_str_alpha : (c == 0x7f ? grn_str_others : grn_str_symbol);
+ break;
+ default :
+ *d = c;
+ ctype = grn_str_others;
+ break;
+ }
+ }
+ d++;
+ length++;
+ if (cp) { *cp++ = ctype; }
+ if (ch) {
+ /*
+ * first entry of the character: source bytes consumed since the
+ * previous character; remaining output bytes of the character get 0
+ */
+ *ch++ = (int16_t)(s + 1 - s_);
+ s_ = s + 1;
+ while (++d_ < d) { *ch++ = 0; }
+ }
+ }
+ if (cp) { *cp = grn_str_null; }
+ *d = '\0';
+ nstr->length = length;
+ nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
+ return NULL;
+}
+
+/*
+ * Normalizer proc for ASCII text.
+ *
+ * args[0] is used as a grn_normalized_text. Its orig, orig_blen and flags
+ * members are read; norm, norm_blen and length are always filled in, and
+ * checks / ctypes are filled in when GRN_STR_WITH_CHECKS /
+ * GRN_STR_WITH_CTYPES are set in flags.
+ *
+ * Per input byte:
+ *  - 0x00-0x1f is dropped,
+ *  - ' ' is dropped when GRN_STR_REMOVEBLANK is set,
+ *  - 'A'-'Z' is lowercased,
+ *  - anything else is copied unchanged.
+ * A dropped byte marks the previously emitted character's ctype with
+ * GRN_STR_BLANK.
+ *
+ * Always returns NULL. An allocation failure is reported through ERR() and
+ * leaves nstr->norm set to NULL.
+ */
+inline static grn_obj *
+ascii_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
+{
+  grn_normalized_text *nstr = (grn_normalized_text *)args[0];
+  int16_t *ch;
+  const unsigned char *s, *s_, *e;
+  unsigned char *d, *d0, *d_;
+  uint_least8_t *cp, *ctypes, ctype;
+  size_t size = nstr->orig_blen, length = 0;
+  int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK;
+  /* The output never gets longer than the input: one byte in, at most one out. */
+  if (!(nstr->norm = GRN_MALLOC(size + 1))) {
+    ERR(GRN_NO_MEMORY_AVAILABLE,
+        "[normalizer][ascii] failed to allocate normalized text space");
+    return NULL;
+  }
+  d0 = (unsigned char *) nstr->norm;
+  if (nstr->flags & GRN_STR_WITH_CHECKS) {
+    if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) {
+      GRN_FREE(nstr->norm);
+      nstr->norm = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[normalizer][ascii] failed to allocate checks space");
+      return NULL;
+    }
+  }
+  ch = nstr->checks;
+  if (nstr->flags & GRN_STR_WITH_CTYPES) {
+    if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
+      /* checks is only allocated on request; do not hand NULL to GRN_FREE. */
+      if (nstr->checks) {
+        GRN_FREE(nstr->checks);
+        nstr->checks = NULL;
+      }
+      GRN_FREE(nstr->norm);
+      nstr->norm = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[normalizer][ascii] failed to allocate character types space");
+      return NULL;
+    }
+  }
+  cp = ctypes = nstr->ctypes;
+  e = (unsigned char *)nstr->orig + size;
+  /* s_: first source byte not yet accounted for in checks.
+     d_: first output byte that has no checks entry yet. */
+  for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) {
+    unsigned char c = *s;
+    /* Dispatch on the high nibble; `continue` means "emit nothing". */
+    switch (c >> 4) {
+    case 0 :
+    case 1 :
+      /* skip unprintable ascii */
+      if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; }
+      continue;
+    case 2 :
+      if (c == 0x20) {
+        if (removeblankp) {
+          if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; }
+          continue;
+        } else {
+          *d = ' ';
+          ctype = GRN_STR_BLANK|grn_str_symbol;
+        }
+      } else {
+        *d = c;
+        ctype = grn_str_symbol;
+      }
+      break;
+    case 3 :
+      /* '0'-'9' are digits, ':' .. '?' are symbols */
+      *d = c;
+      ctype = (c <= 0x39) ? grn_str_digit : grn_str_symbol;
+      break;
+    case 4 :
+      /* '@' stays as a symbol, 'A'-'O' are lowercased */
+      *d = ('A' <= c) ? c + 0x20 : c;
+      ctype = (c == 0x40) ? grn_str_symbol : grn_str_alpha;
+      break;
+    case 5 :
+      /* 'P'-'Z' are lowercased, '[' .. '_' are symbols */
+      *d = (c <= 'Z') ? c + 0x20 : c;
+      ctype = (c <= 0x5a) ? grn_str_alpha : grn_str_symbol;
+      break;
+    case 6 :
+      *d = c;
+      ctype = (c == 0x60) ? grn_str_symbol : grn_str_alpha;
+      break;
+    case 7 :
+      *d = c;
+      ctype = (c <= 0x7a) ? grn_str_alpha : (c == 0x7f ? grn_str_others : grn_str_symbol);
+      break;
+    default :
+      /* bytes >= 0x80 are passed through untouched */
+      *d = c;
+      ctype = grn_str_others;
+      break;
+    }
+    d++;
+    length++;
+    if (cp) { *cp++ = ctype; }
+    if (ch) {
+      /* Source bytes consumed for this character, skipped bytes included. */
+      *ch++ = (int16_t)(s + 1 - s_);
+      s_ = s + 1;
+      /* Any further output bytes of the same character get 0. */
+      while (++d_ < d) { *ch++ = 0; }
+    }
+  }
+  if (cp) { *cp = grn_str_null; }
+  *d = '\0';
+  nstr->length = length;
+  nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
+  return NULL;
+}
+
+/*
+ * Normalizer proc for Latin-1 text.
+ * use cp1252 as latin1
+ *
+ * args[0] is used as a grn_normalized_text. Its orig, orig_blen and flags
+ * members are read; norm, norm_blen and length are always filled in, and
+ * checks / ctypes are filled in when GRN_STR_WITH_CHECKS /
+ * GRN_STR_WITH_CTYPES are set in flags.
+ *
+ * The ASCII range is handled like ascii_normalize(): 0x00-0x1f is dropped,
+ * ' ' is dropped when GRN_STR_REMOVEBLANK is set and 'A'-'Z' is lowercased.
+ * For the upper half:
+ *  - 0x8a, 0x8c, 0x8e become c + 0x10,
+ *  - 0x9f becomes 0xff,
+ *  - 0xc0-0xde except 0xd7 become c + 0x20,
+ *  - everything else is copied unchanged.
+ *
+ * Always returns NULL. An allocation failure is reported through ERR() and
+ * leaves nstr->norm set to NULL.
+ */
+inline static grn_obj *
+latin1_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
+                 grn_user_data *user_data)
+{
+  grn_normalized_text *nstr = (grn_normalized_text *)args[0];
+  int16_t *ch;
+  const unsigned char *s, *s_, *e;
+  unsigned char *d, *d0, *d_;
+  uint_least8_t *cp, *ctypes, ctype;
+  /* Use the recorded byte length: orig is not required to be NUL-terminated
+     and may contain NUL bytes, so strlen() must not be used here. */
+  size_t size = nstr->orig_blen, length = 0;
+  int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK;
+  /* The output never gets longer than the input: one byte in, at most one out. */
+  if (!(nstr->norm = GRN_MALLOC(size + 1))) {
+    ERR(GRN_NO_MEMORY_AVAILABLE,
+        "[normalizer][latin1] failed to allocate normalized text space");
+    return NULL;
+  }
+  d0 = (unsigned char *) nstr->norm;
+  if (nstr->flags & GRN_STR_WITH_CHECKS) {
+    if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) {
+      GRN_FREE(nstr->norm);
+      nstr->norm = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[normalizer][latin1] failed to allocate checks space");
+      return NULL;
+    }
+  }
+  ch = nstr->checks;
+  if (nstr->flags & GRN_STR_WITH_CTYPES) {
+    if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
+      /* checks is only allocated on request; do not hand NULL to GRN_FREE. */
+      if (nstr->checks) {
+        GRN_FREE(nstr->checks);
+        nstr->checks = NULL;
+      }
+      GRN_FREE(nstr->norm);
+      nstr->norm = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[normalizer][latin1] failed to allocate character types space");
+      return NULL;
+    }
+  }
+  cp = ctypes = nstr->ctypes;
+  e = (unsigned char *)nstr->orig + size;
+  /* s_: first source byte not yet accounted for in checks.
+     d_: first output byte that has no checks entry yet. */
+  for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) {
+    unsigned char c = *s;
+    /* Dispatch on the high nibble; `continue` means "emit nothing". */
+    switch (c >> 4) {
+    case 0 :
+    case 1 :
+      /* skip unprintable ascii */
+      if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; }
+      continue;
+    case 2 :
+      if (c == 0x20) {
+        if (removeblankp) {
+          if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; }
+          continue;
+        } else {
+          *d = ' ';
+          ctype = GRN_STR_BLANK|grn_str_symbol;
+        }
+      } else {
+        *d = c;
+        ctype = grn_str_symbol;
+      }
+      break;
+    case 3 :
+      /* '0'-'9' are digits, ':' .. '?' are symbols */
+      *d = c;
+      ctype = (c <= 0x39) ? grn_str_digit : grn_str_symbol;
+      break;
+    case 4 :
+      /* '@' stays as a symbol, 'A'-'O' are lowercased */
+      *d = ('A' <= c) ? c + 0x20 : c;
+      ctype = (c == 0x40) ? grn_str_symbol : grn_str_alpha;
+      break;
+    case 5 :
+      /* 'P'-'Z' are lowercased, '[' .. '_' are symbols */
+      *d = (c <= 'Z') ? c + 0x20 : c;
+      ctype = (c <= 0x5a) ? grn_str_alpha : grn_str_symbol;
+      break;
+    case 6 :
+      *d = c;
+      ctype = (c == 0x60) ? grn_str_symbol : grn_str_alpha;
+      break;
+    case 7 :
+      *d = c;
+      ctype = (c <= 0x7a) ? grn_str_alpha : (c == 0x7f ? grn_str_others : grn_str_symbol);
+      break;
+    case 8 :
+      /* 0x8a, 0x8c, 0x8e are letters whose counterpart sits 0x10 higher */
+      if (c == 0x8a || c == 0x8c || c == 0x8e) {
+        *d = c + 0x10;
+        ctype = grn_str_alpha;
+      } else {
+        *d = c;
+        ctype = grn_str_symbol;
+      }
+      break;
+    case 9 :
+      /* 0x9a, 0x9c, 0x9e are kept; 0x9f is mapped to 0xff */
+      if (c == 0x9a || c == 0x9c || c == 0x9e || c == 0x9f) {
+        *d = (c == 0x9f) ? c + 0x60 : c;
+        ctype = grn_str_alpha;
+      } else {
+        *d = c;
+        ctype = grn_str_symbol;
+      }
+      break;
+    case 0x0c :
+      *d = c + 0x20;
+      ctype = grn_str_alpha;
+      break;
+    case 0x0d :
+      /* 0xd7 is a symbol and 0xdf has no counterpart at c + 0x20 */
+      *d = (c == 0xd7 || c == 0xdf) ? c : c + 0x20;
+      ctype = (c == 0xd7) ? grn_str_symbol : grn_str_alpha;
+      break;
+    case 0x0e :
+      *d = c;
+      ctype = grn_str_alpha;
+      break;
+    case 0x0f :
+      /* 0xf7 is a symbol, the rest of the row are letters */
+      *d = c;
+      ctype = (c == 0xf7) ? grn_str_symbol : grn_str_alpha;
+      break;
+    default :
+      /* 0xa0-0xbf */
+      *d = c;
+      ctype = grn_str_others;
+      break;
+    }
+    d++;
+    length++;
+    if (cp) { *cp++ = ctype; }
+    if (ch) {
+      /* Source bytes consumed for this character, skipped bytes included. */
+      *ch++ = (int16_t)(s + 1 - s_);
+      s_ = s + 1;
+      /* Any further output bytes of the same character get 0. */
+      while (++d_ < d) { *ch++ = 0; }
+    }
+  }
+  if (cp) { *cp = grn_str_null; }
+  *d = '\0';
+  nstr->length = length;
+  nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
+  return NULL;
+}
+
+/*
+ * Normalizer proc for KOI8-R text.
+ *
+ * args[0] is used as a grn_normalized_text. Its orig, orig_blen and flags
+ * members are read; norm, norm_blen and length are always filled in, and
+ * checks / ctypes are filled in when GRN_STR_WITH_CHECKS /
+ * GRN_STR_WITH_CTYPES are set in flags.
+ *
+ * The ASCII range is handled like ascii_normalize(): 0x00-0x1f is dropped,
+ * ' ' is dropped when GRN_STR_REMOVEBLANK is set and 'A'-'Z' is lowercased.
+ * For the upper half:
+ *  - 0xb3 becomes 0xa3,
+ *  - 0xe0-0xff become c - 0x20,
+ *  - everything else is copied unchanged.
+ *
+ * Always returns NULL. An allocation failure is reported through ERR() and
+ * leaves nstr->norm set to NULL.
+ */
+inline static grn_obj *
+koi8r_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
+                grn_user_data *user_data)
+{
+  grn_normalized_text *nstr = (grn_normalized_text *)args[0];
+  int16_t *ch;
+  const unsigned char *s, *s_, *e;
+  unsigned char *d, *d0, *d_;
+  uint_least8_t *cp, *ctypes, ctype;
+  /* Use the recorded byte length: orig is not required to be NUL-terminated
+     and may contain NUL bytes, so strlen() must not be used here. */
+  size_t size = nstr->orig_blen, length = 0;
+  int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK;
+  /* The output never gets longer than the input: one byte in, at most one out. */
+  if (!(nstr->norm = GRN_MALLOC(size + 1))) {
+    ERR(GRN_NO_MEMORY_AVAILABLE,
+        "[normalizer][koi8r] failed to allocate normalized text space");
+    return NULL;
+  }
+  d0 = (unsigned char *) nstr->norm;
+  if (nstr->flags & GRN_STR_WITH_CHECKS) {
+    if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) {
+      GRN_FREE(nstr->norm);
+      nstr->norm = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[normalizer][koi8r] failed to allocate checks space");
+      return NULL;
+    }
+  }
+  ch = nstr->checks;
+  if (nstr->flags & GRN_STR_WITH_CTYPES) {
+    if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
+      /* checks is only allocated on request; do not hand NULL to GRN_FREE. */
+      if (nstr->checks) {
+        GRN_FREE(nstr->checks);
+        nstr->checks = NULL;
+      }
+      GRN_FREE(nstr->norm);
+      nstr->norm = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[normalizer][koi8r] failed to allocate character types space");
+      return NULL;
+    }
+  }
+  cp = ctypes = nstr->ctypes;
+  e = (unsigned char *)nstr->orig + size;
+  /* s_: first source byte not yet accounted for in checks.
+     d_: first output byte that has no checks entry yet. */
+  for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) {
+    unsigned char c = *s;
+    /* Dispatch on the high nibble; `continue` means "emit nothing". */
+    switch (c >> 4) {
+    case 0 :
+    case 1 :
+      /* skip unprintable ascii */
+      if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; }
+      continue;
+    case 2 :
+      if (c == 0x20) {
+        if (removeblankp) {
+          if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; }
+          continue;
+        } else {
+          *d = ' ';
+          ctype = GRN_STR_BLANK|grn_str_symbol;
+        }
+      } else {
+        *d = c;
+        ctype = grn_str_symbol;
+      }
+      break;
+    case 3 :
+      /* '0'-'9' are digits, ':' .. '?' are symbols */
+      *d = c;
+      ctype = (c <= 0x39) ? grn_str_digit : grn_str_symbol;
+      break;
+    case 4 :
+      /* '@' stays as a symbol, 'A'-'O' are lowercased */
+      *d = ('A' <= c) ? c + 0x20 : c;
+      ctype = (c == 0x40) ? grn_str_symbol : grn_str_alpha;
+      break;
+    case 5 :
+      /* 'P'-'Z' are lowercased, '[' .. '_' are symbols */
+      *d = (c <= 'Z') ? c + 0x20 : c;
+      ctype = (c <= 0x5a) ? grn_str_alpha : grn_str_symbol;
+      break;
+    case 6 :
+      *d = c;
+      ctype = (c == 0x60) ? grn_str_symbol : grn_str_alpha;
+      break;
+    case 7 :
+      *d = c;
+      ctype = (c <= 0x7a) ? grn_str_alpha : (c == 0x7f ? grn_str_others : grn_str_symbol);
+      break;
+    case 0x0a :
+      /* 0xa3 is the only letter in this row */
+      *d = c;
+      ctype = (c == 0xa3) ? grn_str_alpha : grn_str_others;
+      break;
+    case 0x0b :
+      /* 0xb3 is folded onto 0xa3 */
+      if (c == 0xb3) {
+        *d = c - 0x10;
+        ctype = grn_str_alpha;
+      } else {
+        *d = c;
+        ctype = grn_str_others;
+      }
+      break;
+    case 0x0c :
+    case 0x0d :
+      *d = c;
+      ctype = grn_str_alpha;
+      break;
+    case 0x0e :
+    case 0x0f :
+      /* 0xe0-0xff are folded onto 0xc0-0xdf */
+      *d = c - 0x20;
+      ctype = grn_str_alpha;
+      break;
+    default :
+      /* 0x80-0x9f */
+      *d = c;
+      ctype = grn_str_others;
+      break;
+    }
+    d++;
+    length++;
+    if (cp) { *cp++ = ctype; }
+    if (ch) {
+      /* Source bytes consumed for this character, skipped bytes included. */
+      *ch++ = (int16_t)(s + 1 - s_);
+      s_ = s + 1;
+      /* Any further output bytes of the same character get 0. */
+      while (++d_ < d) { *ch++ = 0; }
+    }
+  }
+  if (cp) { *cp = grn_str_null; }
+  *d = '\0';
+  nstr->length = length;
+  nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
+  return NULL;
+}
+
+/* One built-in normalizer: proc name, the id it must receive, and its body. */
+typedef struct {
+  const char *name;
+  unsigned int name_size;
+  grn_id id;
+  grn_obj *(*normalize)(grn_ctx *ctx, int nargs, grn_obj **args,
+                        grn_user_data *user_data);
+} builtin_normalizer_entry;
+
+/* Builds a table entry; `name` must be a string literal so that sizeof()
+   yields its length. */
+#define BUILTIN_NORMALIZER_ENTRY(name, id, normalize)\
+  {(name), (sizeof(name) - 1), (id), (normalize)}
+
+/*
+ * Registers every built-in normalizer as a GRN_PROC_NORMALIZER proc, in
+ * table order.
+ *
+ * Returns GRN_FILE_CORRUPT as soon as a proc cannot be created or is created
+ * with an id other than the one listed for it, GRN_SUCCESS otherwise.
+ */
+grn_rc
+grn_db_init_builtin_normalizers(grn_ctx *ctx)
+{
+  static const builtin_normalizer_entry entries[] = {
+    BUILTIN_NORMALIZER_ENTRY("NormalizerASCII",
+                             GRN_DB_NORMALIZER_ASCII, ascii_normalize),
+    BUILTIN_NORMALIZER_ENTRY("NormalizerUTF8NFKC",
+                             GRN_DB_NORMALIZER_UTF8_NFKC, utf8_nfkc_normalize),
+    BUILTIN_NORMALIZER_ENTRY("NormalizerEUCJP",
+                             GRN_DB_NORMALIZER_EUC_JP, eucjp_normalize),
+    BUILTIN_NORMALIZER_ENTRY("NormalizerSJIS",
+                             GRN_DB_NORMALIZER_SJIS, sjis_normalize),
+    BUILTIN_NORMALIZER_ENTRY("NormalizerLATIN1",
+                             GRN_DB_NORMALIZER_LATIN1, latin1_normalize),
+    BUILTIN_NORMALIZER_ENTRY("NormalizerKOI8R",
+                             GRN_DB_NORMALIZER_KOI8R, koi8r_normalize)
+    /* Not registered yet:
+       BUILTIN_NORMALIZER_ENTRY("NormalizerUTF8UCA",
+                                GRN_DB_NORMALIZER_UTF8_UCA, utf8_uca_normalize) */
+  };
+  size_t i;
+
+  for (i = 0; i < sizeof(entries) / sizeof(entries[0]); i++) {
+    const builtin_normalizer_entry *entry = &entries[i];
+    grn_obj *obj = grn_proc_create(ctx, entry->name, entry->name_size,
+                                   GRN_PROC_NORMALIZER,
+                                   NULL, entry->normalize, NULL, 0, NULL);
+    if (!obj || ((grn_db_obj *)obj)->id != entry->id) {
+      return GRN_FILE_CORRUPT;
+    }
+  }
+
+  return GRN_SUCCESS;
+}
Added: lib/normalizer.h (+67 -0) 100644
===================================================================
--- /dev/null
+++ lib/normalizer.h 2012-02-08 18:18:36 +0900 (39c1e36)
@@ -0,0 +1,67 @@
+/* -*- c-basic-offset: 2 -*- */
+/*
+ Copyright(C) 2012 Brazil
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License version 2.1 as published by the Free Software Foundation.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+#ifndef GRN_NORMALIZER_H
+#define GRN_NORMALIZER_H
+
+#ifndef GROONGA_IN_H
+#include "groonga_in.h"
+#endif /* GROONGA_IN_H */
+
+#ifndef GRN_CTX_H
+#include "ctx.h"
+#endif /* GRN_CTX_H */
+
+#ifndef GRN_DB_H
+#include "db.h"
+#endif /* GRN_DB_H */
+
+#ifndef GRN_STR_H
+#include "str.h"
+#endif /* GRN_STR_H */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct { /* input and output of one normalizer run */
+ grn_obj_header header; /* NOTE(review): presumably typed GRN_NORMALIZED_TEXT - confirm in normalizer.c */
+ const char *orig; /* source text as given by the caller */
+ unsigned int orig_blen; /* length of orig in bytes */
+ char *norm; /* normalized text, allocated by the normalizer and NUL-terminated */
+ unsigned int norm_blen; /* length of norm in bytes, terminator excluded */
+ unsigned int length; /* number of characters emitted into norm */
+ short *checks; /* per norm byte: source bytes consumed for a character's first byte, 0 for its other bytes; only allocated with GRN_STR_WITH_CHECKS */
+ unsigned char *ctypes; /* per norm character: character type, list ends with grn_str_null; only allocated with GRN_STR_WITH_CTYPES */
+ grn_encoding encoding; /* NOTE(review): presumably the encoding of orig - confirm against the code that fills it */
+ int flags; /* GRN_STR_* bits (GRN_STR_REMOVEBLANK, GRN_STR_WITH_CHECKS, GRN_STR_WITH_CTYPES) */
+} grn_normalized_text;
+
+grn_rc grn_normalizer_init(void); /* NOTE(review): presumably module-wide setup - implementation not visible here */
+grn_rc grn_normalizer_fin(void); /* NOTE(review): presumably the matching teardown - implementation not visible here */
+
+grn_rc grn_normalized_text_close(grn_ctx *ctx, grn_obj *normalized_text); /* NOTE(review): presumably releases a grn_normalized_text and its buffers - confirm */
+
+grn_id grn_normalizer_find(grn_ctx *ctx, grn_encoding encoding); /* id of the normalizer proc to use for `encoding`; callers resolve it with grn_ctx_at() */
+
+grn_rc grn_db_init_builtin_normalizers(grn_ctx *ctx); /* registers the built-in normalizer procs; GRN_FILE_CORRUPT if one does not get its reserved id */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* GRN_NORMALIZER_H */
Modified: lib/pat.c (+17 -1)
===================================================================
--- lib/pat.c 2012-02-09 09:53:43 +0900 (525eba0)
+++ lib/pat.c 2012-02-08 18:18:36 +0900 (e575a3e)
@@ -19,6 +19,7 @@
#include <limits.h>
#include "pat.h"
#include "output.h"
+#include "normalizer.h"
#include "util.h"
#define GRN_PAT_DELETED (GRN_ID_MAX + 1)
@@ -422,6 +423,14 @@ _grn_pat_create(grn_ctx *ctx, grn_pat *pat,
header->curr_del3 = 0;
header->n_garbages = 0;
header->tokenizer = GRN_ID_NIL;
+ if (header->flags & GRN_OBJ_KEY_NORMALIZE) {
+ header->flags &= ~GRN_OBJ_KEY_NORMALIZE;
+ header->normalizer = grn_normalizer_find(ctx, ctx->encoding);
+ pat->normalizer = grn_ctx_at(ctx, header->normalizer);
+ } else {
+ header->normalizer = GRN_ID_NIL;
+ pat->normalizer = NULL;
+ }
pat->io = io;
pat->header = header;
pat->key_size = key_size;
@@ -518,6 +527,11 @@ grn_pat_open(grn_ctx *ctx, const char *path)
pat->encoding = header->encoding;
pat->obj.header.flags = header->flags;
pat->tokenizer = grn_ctx_at(ctx, header->tokenizer);
+ if (header->flags & GRN_OBJ_KEY_NORMALIZE) {
+ header->flags &= ~GRN_OBJ_KEY_NORMALIZE;
+ header->normalizer = grn_normalizer_find(ctx, ctx->encoding);
+ }
+ pat->normalizer = grn_ctx_at(ctx, header->normalizer);
PAT_AT(pat, 0, node0);
if (!node0) {
grn_io_close(ctx, io);
@@ -2271,7 +2285,7 @@ grn_pat_check(grn_ctx *ctx, grn_pat *pat)
char buf[8];
struct grn_pat_header *h = pat->header;
GRN_OUTPUT_ARRAY_OPEN("RESULT", 1);
- GRN_OUTPUT_MAP_OPEN("SUMMARY", 22);
+ GRN_OUTPUT_MAP_OPEN("SUMMARY", 23);
GRN_OUTPUT_CSTR("flags");
grn_itoh(h->flags, buf, 8);
GRN_OUTPUT_STR(buf, 8);
@@ -2281,6 +2295,8 @@ grn_pat_check(grn_ctx *ctx, grn_pat *pat)
GRN_OUTPUT_INT64(h->value_size);
GRN_OUTPUT_CSTR("tokenizer");
GRN_OUTPUT_INT64(h->tokenizer);
+ GRN_OUTPUT_CSTR("normalizer");
+ GRN_OUTPUT_INT64(h->normalizer);
GRN_OUTPUT_CSTR("n_entries");
GRN_OUTPUT_INT64(h->n_entries);
GRN_OUTPUT_CSTR("curr_rec");
Modified: lib/pat.h (+3 -1)
===================================================================
--- lib/pat.h 2012-02-09 09:53:43 +0900 (30e484a)
+++ lib/pat.h 2012-02-08 18:18:36 +0900 (32acdea)
@@ -38,6 +38,7 @@ struct _grn_pat {
uint32_t key_size;
uint32_t value_size;
grn_obj *tokenizer;
+ grn_obj *normalizer;
grn_id *cache;
uint32_t cache_size;
};
@@ -64,7 +65,8 @@ struct grn_pat_header {
int32_t curr_del2;
int32_t curr_del3;
uint32_t n_garbages;
- uint32_t reserved[1005];
+ grn_id normalizer;
+ uint32_t reserved[1004];
grn_pat_delinfo delinfos[GRN_PAT_NDELINFOS];
grn_id garbages[GRN_PAT_MAX_KEY_SIZE + 1];
};
Modified: lib/snip.c (+1 -1)
===================================================================
--- lib/snip.c 2012-02-09 09:53:43 +0900 (cfe958a)
+++ lib/snip.c 2012-02-08 18:18:36 +0900 (0f0f58d)
@@ -247,7 +247,7 @@ grn_snip_cond_close(grn_ctx *ctx, snip_cond *cond)
grn_rc
grn_snip_cond_init(grn_ctx *ctx, snip_cond *sc, const char *keyword, unsigned int keyword_len,
- grn_encoding enc, int flags)
+ grn_encoding enc, int flags)
{
size_t norm_blen;
int f = GRN_STR_REMOVEBLANK;
Modified: lib/str.c (+32 -988)
===================================================================
--- lib/str.c 2012-02-09 09:53:43 +0900 (f6f518c)
+++ lib/str.c 2012-02-08 18:18:36 +0900 (2865a7b)
@@ -20,13 +20,14 @@
#include <string.h>
#include "db.h"
#include "str.h"
+#include "normalizer.h"
#ifndef _ISOC99_SOURCE
#define _ISOC99_SOURCE
#endif /* _ISOC99_SOURCE */
#include <math.h>
-inline static int
+int
grn_str_charlen_utf8(grn_ctx *ctx, const unsigned char *str, const unsigned char *end)
{
/* MEMO: This function allows non-null-terminated string as str. */
@@ -170,952 +171,6 @@ grn_charlen(grn_ctx *ctx, const char *str, const char *end)
return grn_charlen_(ctx, str, end, ctx->encoding);
}
-static unsigned char symbol[] = {
- ',', '.', 0, ':', ';', '?', '!', 0, 0, 0, '`', 0, '^', '~', '_', 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, '-', '-', '/', '\\', 0, 0, '|', 0, 0, 0, '\'', 0,
- '"', '(', ')', 0, 0, '[', ']', '{', '}', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- '+', '-', 0, 0, 0, '=', 0, '<', '>', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- '$', 0, 0, '%', '#', '&', '*', '@', 0, 0, 0, 0, 0, 0, 0, 0
-};
-
-inline static grn_rc
-normalize_euc(grn_ctx *ctx, grn_str *nstr)
-{
- static uint16_t hankana[] = {
- 0xa1a1, 0xa1a3, 0xa1d6, 0xa1d7, 0xa1a2, 0xa1a6, 0xa5f2, 0xa5a1, 0xa5a3,
- 0xa5a5, 0xa5a7, 0xa5a9, 0xa5e3, 0xa5e5, 0xa5e7, 0xa5c3, 0xa1bc, 0xa5a2,
- 0xa5a4, 0xa5a6, 0xa5a8, 0xa5aa, 0xa5ab, 0xa5ad, 0xa5af, 0xa5b1, 0xa5b3,
- 0xa5b5, 0xa5b7, 0xa5b9, 0xa5bb, 0xa5bd, 0xa5bf, 0xa5c1, 0xa5c4, 0xa5c6,
- 0xa5c8, 0xa5ca, 0xa5cb, 0xa5cc, 0xa5cd, 0xa5ce, 0xa5cf, 0xa5d2, 0xa5d5,
- 0xa5d8, 0xa5db, 0xa5de, 0xa5df, 0xa5e0, 0xa5e1, 0xa5e2, 0xa5e4, 0xa5e6,
- 0xa5e8, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5ef, 0xa5f3, 0xa1ab,
- 0xa1eb
- };
- static unsigned char dakuten[] = {
- 0xf4, 0, 0, 0, 0, 0xac, 0, 0xae, 0, 0xb0, 0, 0xb2, 0, 0xb4, 0, 0xb6, 0,
- 0xb8, 0, 0xba, 0, 0xbc, 0, 0xbe, 0, 0xc0, 0, 0xc2, 0, 0, 0xc5, 0, 0xc7,
- 0, 0xc9, 0, 0, 0, 0, 0, 0, 0xd0, 0, 0, 0xd3, 0, 0, 0xd6, 0, 0, 0xd9, 0,
- 0, 0xdc
- };
- static unsigned char handaku[] = {
- 0xd1, 0, 0, 0xd4, 0, 0, 0xd7, 0, 0, 0xda, 0, 0, 0xdd
- };
- int16_t *ch;
- const unsigned char *s, *s_, *e;
- unsigned char *d, *d0, *d_, b;
- uint_least8_t *cp, *ctypes, ctype;
- size_t size = nstr->orig_blen, length = 0;
- int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK;
- if (!(nstr->norm = GRN_MALLOC(size * 2 + 1))) {
- return GRN_NO_MEMORY_AVAILABLE;
- }
- d0 = (unsigned char *) nstr->norm;
- if (nstr->flags & GRN_STR_WITH_CHECKS) {
- if (!(nstr->checks = GRN_MALLOC(size * 2 * sizeof(int16_t) + 1))) {
- GRN_FREE(nstr->norm);
- nstr->norm = NULL;
- return GRN_NO_MEMORY_AVAILABLE;
- }
- }
- ch = nstr->checks;
- if (nstr->flags & GRN_STR_WITH_CTYPES) {
- if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
- GRN_FREE(nstr->checks);
- GRN_FREE(nstr->norm);
- nstr->checks = NULL;
- nstr->norm = NULL;
- return GRN_NO_MEMORY_AVAILABLE;
- }
- }
- cp = ctypes = nstr->ctypes;
- e = (unsigned char *)nstr->orig + size;
- for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) {
- if ((*s & 0x80)) {
- if (((s + 1) < e) && (*(s + 1) & 0x80)) {
- unsigned char c1 = *s++, c2 = *s, c3 = 0;
- switch (c1 >> 4) {
- case 0x08 :
- if (c1 == 0x8e && 0xa0 <= c2 && c2 <= 0xdf) {
- uint16_t c = hankana[c2 - 0xa0];
- switch (c) {
- case 0xa1ab :
- if (d > d0 + 1 && d[-2] == 0xa5
- && 0xa6 <= d[-1] && d[-1] <= 0xdb && (b = dakuten[d[-1] - 0xa6])) {
- *(d - 1) = b;
- if (ch) { ch[-1] += 2; s_ += 2; }
- continue;
- } else {
- *d++ = c >> 8; *d = c & 0xff;
- }
- break;
- case 0xa1eb :
- if (d > d0 + 1 && d[-2] == 0xa5
- && 0xcf <= d[-1] && d[-1] <= 0xdb && (b = handaku[d[-1] - 0xcf])) {
- *(d - 1) = b;
- if (ch) { ch[-1] += 2; s_ += 2; }
- continue;
- } else {
- *d++ = c >> 8; *d = c & 0xff;
- }
- break;
- default :
- *d++ = c >> 8; *d = c & 0xff;
- break;
- }
- ctype = grn_str_katakana;
- } else {
- *d++ = c1; *d = c2;
- ctype = grn_str_others;
- }
- break;
- case 0x09 :
- *d++ = c1; *d = c2;
- ctype = grn_str_others;
- break;
- case 0x0a :
- switch (c1 & 0x0f) {
- case 1 :
- switch (c2) {
- case 0xbc :
- *d++ = c1; *d = c2;
- ctype = grn_str_katakana;
- break;
- case 0xb9 :
- *d++ = c1; *d = c2;
- ctype = grn_str_kanji;
- break;
- case 0xa1 :
- if (removeblankp) {
- if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; }
- continue;
- } else {
- *d = ' ';
- ctype = GRN_STR_BLANK|grn_str_symbol;
- }
- break;
- default :
- if (c2 >= 0xa4 && (c3 = symbol[c2 - 0xa4])) {
- *d = c3;
- ctype = grn_str_symbol;
- } else {
- *d++ = c1; *d = c2;
- ctype = grn_str_others;
- }
- break;
- }
- break;
- case 2 :
- *d++ = c1; *d = c2;
- ctype = grn_str_symbol;
- break;
- case 3 :
- c3 = c2 - 0x80;
- if ('a' <= c3 && c3 <= 'z') {
- ctype = grn_str_alpha;
- *d = c3;
- } else if ('A' <= c3 && c3 <= 'Z') {
- ctype = grn_str_alpha;
- *d = c3 + 0x20;
- } else if ('0' <= c3 && c3 <= '9') {
- ctype = grn_str_digit;
- *d = c3;
- } else {
- ctype = grn_str_others;
- *d++ = c1; *d = c2;
- }
- break;
- case 4 :
- *d++ = c1; *d = c2;
- ctype = grn_str_hiragana;
- break;
- case 5 :
- *d++ = c1; *d = c2;
- ctype = grn_str_katakana;
- break;
- case 6 :
- case 7 :
- case 8 :
- *d++ = c1; *d = c2;
- ctype = grn_str_symbol;
- break;
- default :
- *d++ = c1; *d = c2;
- ctype = grn_str_others;
- break;
- }
- break;
- default :
- *d++ = c1; *d = c2;
- ctype = grn_str_kanji;
- break;
- }
- } else {
- /* skip invalid character */
- continue;
- }
- } else {
- unsigned char c = *s;
- switch (c >> 4) {
- case 0 :
- case 1 :
- /* skip unprintable ascii */
- if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; }
- continue;
- case 2 :
- if (c == 0x20) {
- if (removeblankp) {
- if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; }
- continue;
- } else {
- *d = ' ';
- ctype = GRN_STR_BLANK|grn_str_symbol;
- }
- } else {
- *d = c;
- ctype = grn_str_symbol;
- }
- break;
- case 3 :
- *d = c;
- ctype = (c <= 0x39) ? grn_str_digit : grn_str_symbol;
- break;
- case 4 :
- *d = ('A' <= c) ? c + 0x20 : c;
- ctype = (c == 0x40) ? grn_str_symbol : grn_str_alpha;
- break;
- case 5 :
- *d = (c <= 'Z') ? c + 0x20 : c;
- ctype = (c <= 0x5a) ? grn_str_alpha : grn_str_symbol;
- break;
- case 6 :
- *d = c;
- ctype = (c == 0x60) ? grn_str_symbol : grn_str_alpha;
- break;
- case 7 :
- *d = c;
- ctype = (c <= 0x7a) ? grn_str_alpha : (c == 0x7f ? grn_str_others : grn_str_symbol);
- break;
- default :
- *d = c;
- ctype = grn_str_others;
- break;
- }
- }
- d++;
- length++;
- if (cp) { *cp++ = ctype; }
- if (ch) {
- *ch++ = (int16_t)(s + 1 - s_);
- s_ = s + 1;
- while (++d_ < d) { *ch++ = 0; }
- }
- }
- if (cp) { *cp = grn_str_null; }
- *d = '\0';
- nstr->length = length;
- nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
- return GRN_SUCCESS;
-}
-
-#ifndef NO_NFKC
-uint_least8_t grn_nfkc_ctype(const unsigned char *str);
-const char *grn_nfkc_map1(const unsigned char *str);
-const char *grn_nfkc_map2(const unsigned char *prefix, const unsigned char *suffix);
-
-inline static grn_rc
-normalize_utf8(grn_ctx *ctx, grn_str *nstr)
-{
- int16_t *ch;
- const unsigned char *s, *s_, *s__ = NULL, *p, *p2, *pe, *e;
- unsigned char *d, *d_, *de;
- uint_least8_t *cp;
- size_t length = 0, ls, lp, size = nstr->orig_blen, ds = size * 3;
- int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK;
- if (!(nstr->norm = GRN_MALLOC(ds + 1))) {
- return GRN_NO_MEMORY_AVAILABLE;
- }
- if (nstr->flags & GRN_STR_WITH_CHECKS) {
- if (!(nstr->checks = GRN_MALLOC(ds * sizeof(int16_t) + 1))) {
- GRN_FREE(nstr->norm); nstr->norm = NULL;
- return GRN_NO_MEMORY_AVAILABLE;
- }
- }
- ch = nstr->checks;
- if (nstr->flags & GRN_STR_WITH_CTYPES) {
- if (!(nstr->ctypes = GRN_MALLOC(ds + 1))) {
- if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
- GRN_FREE(nstr->norm); nstr->norm = NULL;
- return GRN_NO_MEMORY_AVAILABLE;
- }
- }
- cp = nstr->ctypes;
- d = (unsigned char *)nstr->norm;
- de = d + ds;
- d_ = NULL;
- e = (unsigned char *)nstr->orig + size;
- for (s = s_ = (unsigned char *)nstr->orig; ; s += ls) {
- if (!(ls = grn_str_charlen_utf8(ctx, s, e))) {
- break;
- }
- if ((p = (unsigned char *)grn_nfkc_map1(s))) {
- pe = p + strlen((char *)p);
- } else {
- p = s;
- pe = p + ls;
- }
- if (d_ && (p2 = (unsigned char *)grn_nfkc_map2(d_, p))) {
- p = p2;
- pe = p + strlen((char *)p);
- if (cp) { cp--; }
- if (ch) {
- ch -= (d - d_);
- s_ = s__;
- }
- d = d_;
- length--;
- }
- for (; ; p += lp) {
- if (!(lp = grn_str_charlen_utf8(ctx, p, pe))) {
- break;
- }
- if ((*p == ' ' && removeblankp) || *p < 0x20 /* skip unprintable ascii */ ) {
- if (cp > nstr->ctypes) { *(cp - 1) |= GRN_STR_BLANK; }
- } else {
- if (de <= d + lp) {
- unsigned char *norm;
- ds += (ds >> 1) + lp;
- if (!(norm = GRN_REALLOC(nstr->norm, ds + 1))) {
- if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; }
- if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
- GRN_FREE(nstr->norm); nstr->norm = NULL;
- return GRN_NO_MEMORY_AVAILABLE;
- }
- de = norm + ds;
- d = norm + (d - (unsigned char *)nstr->norm);
- nstr->norm = norm;
- if (ch) {
- int16_t *checks;
- if (!(checks = GRN_REALLOC(nstr->checks, ds * sizeof(int16_t)+ 1))) {
- if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; }
- GRN_FREE(nstr->checks); nstr->checks = NULL;
- GRN_FREE(nstr->norm); nstr->norm = NULL;
- return GRN_NO_MEMORY_AVAILABLE;
- }
- ch = checks + (ch - nstr->checks);
- nstr->checks = checks;
- }
- if (cp) {
- uint_least8_t *ctypes;
- if (!(ctypes = GRN_REALLOC(nstr->ctypes, ds + 1))) {
- GRN_FREE(nstr->ctypes); nstr->ctypes = NULL;
- if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
- GRN_FREE(nstr->norm); nstr->norm = NULL;
- return GRN_NO_MEMORY_AVAILABLE;
- }
- cp = ctypes + (cp - nstr->ctypes);
- nstr->ctypes = ctypes;
- }
- }
- memcpy(d, p, lp);
- d_ = d;
- d += lp;
- length++;
- if (cp) { *cp++ = grn_nfkc_ctype(p); }
- if (ch) {
- size_t i;
- if (s_ == s + ls) {
- *ch++ = -1;
- } else {
- *ch++ = (int16_t)(s + ls - s_);
- s__ = s_;
- s_ = s + ls;
- }
- for (i = lp; i > 1; i--) { *ch++ = 0; }
- }
- }
- }
- }
- if (cp) { *cp = grn_str_null; }
- *d = '\0';
- nstr->length = length;
- nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
- return GRN_SUCCESS;
-}
-#endif /* NO_NFKC */
-
-inline static grn_rc
-normalize_sjis(grn_ctx *ctx, grn_str *nstr)
-{
- static uint16_t hankana[] = {
- 0x8140, 0x8142, 0x8175, 0x8176, 0x8141, 0x8145, 0x8392, 0x8340, 0x8342,
- 0x8344, 0x8346, 0x8348, 0x8383, 0x8385, 0x8387, 0x8362, 0x815b, 0x8341,
- 0x8343, 0x8345, 0x8347, 0x8349, 0x834a, 0x834c, 0x834e, 0x8350, 0x8352,
- 0x8354, 0x8356, 0x8358, 0x835a, 0x835c, 0x835e, 0x8360, 0x8363, 0x8365,
- 0x8367, 0x8369, 0x836a, 0x836b, 0x836c, 0x836d, 0x836e, 0x8371, 0x8374,
- 0x8377, 0x837a, 0x837d, 0x837e, 0x8380, 0x8381, 0x8382, 0x8384, 0x8386,
- 0x8388, 0x8389, 0x838a, 0x838b, 0x838c, 0x838d, 0x838f, 0x8393, 0x814a,
- 0x814b
- };
- static unsigned char dakuten[] = {
- 0x94, 0, 0, 0, 0, 0x4b, 0, 0x4d, 0, 0x4f, 0, 0x51, 0, 0x53, 0, 0x55, 0,
- 0x57, 0, 0x59, 0, 0x5b, 0, 0x5d, 0, 0x5f, 0, 0x61, 0, 0, 0x64, 0, 0x66,
- 0, 0x68, 0, 0, 0, 0, 0, 0, 0x6f, 0, 0, 0x72, 0, 0, 0x75, 0, 0, 0x78, 0,
- 0, 0x7b
- };
- static unsigned char handaku[] = {
- 0x70, 0, 0, 0x73, 0, 0, 0x76, 0, 0, 0x79, 0, 0, 0x7c
- };
- int16_t *ch;
- const unsigned char *s, *s_;
- unsigned char *d, *d0, *d_, b, *e;
- uint_least8_t *cp, *ctypes, ctype;
- size_t size = nstr->orig_blen, length = 0;
- int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK;
- if (!(nstr->norm = GRN_MALLOC(size * 2 + 1))) {
- return GRN_NO_MEMORY_AVAILABLE;
- }
- d0 = (unsigned char *) nstr->norm;
- if (nstr->flags & GRN_STR_WITH_CHECKS) {
- if (!(nstr->checks = GRN_MALLOC(size * 2 * sizeof(int16_t) + 1))) {
- GRN_FREE(nstr->norm);
- nstr->norm = NULL;
- return GRN_NO_MEMORY_AVAILABLE;
- }
- }
- ch = nstr->checks;
- if (nstr->flags & GRN_STR_WITH_CTYPES) {
- if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
- GRN_FREE(nstr->checks);
- GRN_FREE(nstr->norm);
- nstr->checks = NULL;
- nstr->norm = NULL;
- return GRN_NO_MEMORY_AVAILABLE;
- }
- }
- cp = ctypes = nstr->ctypes;
- e = (unsigned char *)nstr->orig + size;
- for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) {
- if ((*s & 0x80)) {
- if (0xa0 <= *s && *s <= 0xdf) {
- uint16_t c = hankana[*s - 0xa0];
- switch (c) {
- case 0x814a :
- if (d > d0 + 1 && d[-2] == 0x83
- && 0x45 <= d[-1] && d[-1] <= 0x7a && (b = dakuten[d[-1] - 0x45])) {
- *(d - 1) = b;
- if (ch) { ch[-1]++; s_++; }
- continue;
- } else {
- *d++ = c >> 8; *d = c & 0xff;
- }
- break;
- case 0x814b :
- if (d > d0 + 1 && d[-2] == 0x83
- && 0x6e <= d[-1] && d[-1] <= 0x7a && (b = handaku[d[-1] - 0x6e])) {
- *(d - 1) = b;
- if (ch) { ch[-1]++; s_++; }
- continue;
- } else {
- *d++ = c >> 8; *d = c & 0xff;
- }
- break;
- default :
- *d++ = c >> 8; *d = c & 0xff;
- break;
- }
- ctype = grn_str_katakana;
- } else {
- if ((s + 1) < e && 0x40 <= *(s + 1) && *(s + 1) <= 0xfc) {
- unsigned char c1 = *s++, c2 = *s, c3 = 0;
- if (0x81 <= c1 && c1 <= 0x87) {
- switch (c1 & 0x0f) {
- case 1 :
- switch (c2) {
- case 0x5b :
- *d++ = c1; *d = c2;
- ctype = grn_str_katakana;
- break;
- case 0x58 :
- *d++ = c1; *d = c2;
- ctype = grn_str_kanji;
- break;
- case 0x40 :
- if (removeblankp) {
- if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; }
- continue;
- } else {
- *d = ' ';
- ctype = GRN_STR_BLANK|grn_str_symbol;
- }
- break;
- default :
- if (0x43 <= c2 && c2 <= 0x7e && (c3 = symbol[c2 - 0x43])) {
- *d = c3;
- ctype = grn_str_symbol;
- } else if (0x7f <= c2 && c2 <= 0x97 && (c3 = symbol[c2 - 0x44])) {
- *d = c3;
- ctype = grn_str_symbol;
- } else {
- *d++ = c1; *d = c2;
- ctype = grn_str_others;
- }
- break;
- }
- break;
- case 2 :
- c3 = c2 - 0x1f;
- if (0x4f <= c2 && c2 <= 0x58) {
- ctype = grn_str_digit;
- *d = c2 - 0x1f;
- } else if (0x60 <= c2 && c2 <= 0x79) {
- ctype = grn_str_alpha;
- *d = c2 + 0x01;
- } else if (0x81 <= c2 && c2 <= 0x9a) {
- ctype = grn_str_alpha;
- *d = c2 - 0x20;
- } else if (0x9f <= c2 && c2 <= 0xf1) {
- *d++ = c1; *d = c2;
- ctype = grn_str_hiragana;
- } else {
- *d++ = c1; *d = c2;
- ctype = grn_str_others;
- }
- break;
- case 3 :
- if (0x40 <= c2 && c2 <= 0x96) {
- *d++ = c1; *d = c2;
- ctype = grn_str_katakana;
- } else {
- *d++ = c1; *d = c2;
- ctype = grn_str_symbol;
- }
- break;
- case 4 :
- case 7 :
- *d++ = c1; *d = c2;
- ctype = grn_str_symbol;
- break;
- default :
- *d++ = c1; *d = c2;
- ctype = grn_str_others;
- break;
- }
- } else {
- *d++ = c1; *d = c2;
- ctype = grn_str_kanji;
- }
- } else {
- /* skip invalid character */
- continue;
- }
- }
- } else {
- unsigned char c = *s;
- switch (c >> 4) {
- case 0 :
- case 1 :
- /* skip unprintable ascii */
- if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; }
- continue;
- case 2 :
- if (c == 0x20) {
- if (removeblankp) {
- if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; }
- continue;
- } else {
- *d = ' ';
- ctype = GRN_STR_BLANK|grn_str_symbol;
- }
- } else {
- *d = c;
- ctype = grn_str_symbol;
- }
- break;
- case 3 :
- *d = c;
- ctype = (c <= 0x39) ? grn_str_digit : grn_str_symbol;
- break;
- case 4 :
- *d = ('A' <= c) ? c + 0x20 : c;
- ctype = (c == 0x40) ? grn_str_symbol : grn_str_alpha;
- break;
- case 5 :
- *d = (c <= 'Z') ? c + 0x20 : c;
- ctype = (c <= 0x5a) ? grn_str_alpha : grn_str_symbol;
- break;
- case 6 :
- *d = c;
- ctype = (c == 0x60) ? grn_str_symbol : grn_str_alpha;
- break;
- case 7 :
- *d = c;
- ctype = (c <= 0x7a) ? grn_str_alpha : (c == 0x7f ? grn_str_others : grn_str_symbol);
- break;
- default :
- *d = c;
- ctype = grn_str_others;
- break;
- }
- }
- d++;
- length++;
- if (cp) { *cp++ = ctype; }
- if (ch) {
- *ch++ = (int16_t)(s + 1 - s_);
- s_ = s + 1;
- while (++d_ < d) { *ch++ = 0; }
- }
- }
- if (cp) { *cp = grn_str_null; }
- *d = '\0';
- nstr->length = length;
- nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
- return GRN_SUCCESS;
-}
-
-inline static grn_rc
-normalize_none(grn_ctx *ctx, grn_str *nstr)
-{
- int16_t *ch;
- const unsigned char *s, *s_, *e;
- unsigned char *d, *d0, *d_;
- uint_least8_t *cp, *ctypes, ctype;
- size_t size = nstr->orig_blen, length = 0;
- int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK;
- if (!(nstr->norm = GRN_MALLOC(size + 1))) {
- return GRN_NO_MEMORY_AVAILABLE;
- }
- d0 = (unsigned char *) nstr->norm;
- if (nstr->flags & GRN_STR_WITH_CHECKS) {
- if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) {
- GRN_FREE(nstr->norm);
- nstr->norm = NULL;
- return GRN_NO_MEMORY_AVAILABLE;
- }
- }
- ch = nstr->checks;
- if (nstr->flags & GRN_STR_WITH_CTYPES) {
- if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
- GRN_FREE(nstr->checks);
- GRN_FREE(nstr->norm);
- nstr->checks = NULL;
- nstr->norm = NULL;
- return GRN_NO_MEMORY_AVAILABLE;
- }
- }
- cp = ctypes = nstr->ctypes;
- e = (unsigned char *)nstr->orig + size;
- for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) {
- unsigned char c = *s;
- switch (c >> 4) {
- case 0 :
- case 1 :
- /* skip unprintable ascii */
- if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; }
- continue;
- case 2 :
- if (c == 0x20) {
- if (removeblankp) {
- if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; }
- continue;
- } else {
- *d = ' ';
- ctype = GRN_STR_BLANK|grn_str_symbol;
- }
- } else {
- *d = c;
- ctype = grn_str_symbol;
- }
- break;
- case 3 :
- *d = c;
- ctype = (c <= 0x39) ? grn_str_digit : grn_str_symbol;
- break;
- case 4 :
- *d = ('A' <= c) ? c + 0x20 : c;
- ctype = (c == 0x40) ? grn_str_symbol : grn_str_alpha;
- break;
- case 5 :
- *d = (c <= 'Z') ? c + 0x20 : c;
- ctype = (c <= 0x5a) ? grn_str_alpha : grn_str_symbol;
- break;
- case 6 :
- *d = c;
- ctype = (c == 0x60) ? grn_str_symbol : grn_str_alpha;
- break;
- case 7 :
- *d = c;
- ctype = (c <= 0x7a) ? grn_str_alpha : (c == 0x7f ? grn_str_others : grn_str_symbol);
- break;
- default :
- *d = c;
- ctype = grn_str_others;
- break;
- }
- d++;
- length++;
- if (cp) { *cp++ = ctype; }
- if (ch) {
- *ch++ = (int16_t)(s + 1 - s_);
- s_ = s + 1;
- while (++d_ < d) { *ch++ = 0; }
- }
- }
- if (cp) { *cp = grn_str_null; }
- *d = '\0';
- nstr->length = length;
- nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
- return GRN_SUCCESS;
-}
-
-/* use cp1252 as latin1 */
-inline static grn_rc
-normalize_latin1(grn_ctx *ctx, grn_str *nstr)
-{
- int16_t *ch;
- const unsigned char *s, *s_, *e;
- unsigned char *d, *d0, *d_;
- uint_least8_t *cp, *ctypes, ctype;
- size_t size = strlen(nstr->orig), length = 0;
- int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK;
- if (!(nstr->norm = GRN_MALLOC(size + 1))) {
- return GRN_NO_MEMORY_AVAILABLE;
- }
- d0 = (unsigned char *) nstr->norm;
- if (nstr->flags & GRN_STR_WITH_CHECKS) {
- if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) {
- GRN_FREE(nstr->norm);
- nstr->norm = NULL;
- return GRN_NO_MEMORY_AVAILABLE;
- }
- }
- ch = nstr->checks;
- if (nstr->flags & GRN_STR_WITH_CTYPES) {
- if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
- GRN_FREE(nstr->checks);
- GRN_FREE(nstr->norm);
- nstr->checks = NULL;
- nstr->norm = NULL;
- return GRN_NO_MEMORY_AVAILABLE;
- }
- }
- cp = ctypes = nstr->ctypes;
- e = (unsigned char *)nstr->orig + size;
- for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) {
- unsigned char c = *s;
- switch (c >> 4) {
- case 0 :
- case 1 :
- /* skip unprintable ascii */
- if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; }
- continue;
- case 2 :
- if (c == 0x20) {
- if (removeblankp) {
- if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; }
- continue;
- } else {
- *d = ' ';
- ctype = GRN_STR_BLANK|grn_str_symbol;
- }
- } else {
- *d = c;
- ctype = grn_str_symbol;
- }
- break;
- case 3 :
- *d = c;
- ctype = (c <= 0x39) ? grn_str_digit : grn_str_symbol;
- break;
- case 4 :
- *d = ('A' <= c) ? c + 0x20 : c;
- ctype = (c == 0x40) ? grn_str_symbol : grn_str_alpha;
- break;
- case 5 :
- *d = (c <= 'Z') ? c + 0x20 : c;
- ctype = (c <= 0x5a) ? grn_str_alpha : grn_str_symbol;
- break;
- case 6 :
- *d = c;
- ctype = (c == 0x60) ? grn_str_symbol : grn_str_alpha;
- break;
- case 7 :
- *d = c;
- ctype = (c <= 0x7a) ? grn_str_alpha : (c == 0x7f ? grn_str_others : grn_str_symbol);
- break;
- case 8 :
- if (c == 0x8a || c == 0x8c || c == 0x8e) {
- *d = c + 0x10;
- ctype = grn_str_alpha;
- } else {
- *d = c;
- ctype = grn_str_symbol;
- }
- break;
- case 9 :
- if (c == 0x9a || c == 0x9c || c == 0x9e || c == 0x9f) {
- *d = (c == 0x9f) ? c + 0x60 : c;
- ctype = grn_str_alpha;
- } else {
- *d = c;
- ctype = grn_str_symbol;
- }
- break;
- case 0x0c :
- *d = c + 0x20;
- ctype = grn_str_alpha;
- break;
- case 0x0d :
- *d = (c == 0xd7 || c == 0xdf) ? c : c + 0x20;
- ctype = (c == 0xd7) ? grn_str_symbol : grn_str_alpha;
- break;
- case 0x0e :
- *d = c;
- ctype = grn_str_alpha;
- break;
- case 0x0f :
- *d = c;
- ctype = (c == 0xf7) ? grn_str_symbol : grn_str_alpha;
- break;
- default :
- *d = c;
- ctype = grn_str_others;
- break;
- }
- d++;
- length++;
- if (cp) { *cp++ = ctype; }
- if (ch) {
- *ch++ = (int16_t)(s + 1 - s_);
- s_ = s + 1;
- while (++d_ < d) { *ch++ = 0; }
- }
- }
- if (cp) { *cp = grn_str_null; }
- *d = '\0';
- nstr->length = length;
- nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
- return GRN_SUCCESS;
-}
-
-inline static grn_rc
-normalize_koi8r(grn_ctx *ctx, grn_str *nstr)
-{
- int16_t *ch;
- const unsigned char *s, *s_, *e;
- unsigned char *d, *d0, *d_;
- uint_least8_t *cp, *ctypes, ctype;
- size_t size = strlen(nstr->orig), length = 0;
- int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK;
- if (!(nstr->norm = GRN_MALLOC(size + 1))) {
- return GRN_NO_MEMORY_AVAILABLE;
- }
- d0 = (unsigned char *) nstr->norm;
- if (nstr->flags & GRN_STR_WITH_CHECKS) {
- if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) {
- GRN_FREE(nstr->norm);
- nstr->norm = NULL;
- return GRN_NO_MEMORY_AVAILABLE;
- }
- }
- ch = nstr->checks;
- if (nstr->flags & GRN_STR_WITH_CTYPES) {
- if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
- GRN_FREE(nstr->checks);
- GRN_FREE(nstr->norm);
- nstr->checks = NULL;
- nstr->norm = NULL;
- return GRN_NO_MEMORY_AVAILABLE;
- }
- }
- cp = ctypes = nstr->ctypes;
- e = (unsigned char *)nstr->orig + size;
- for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) {
- unsigned char c = *s;
- switch (c >> 4) {
- case 0 :
- case 1 :
- /* skip unprintable ascii */
- if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; }
- continue;
- case 2 :
- if (c == 0x20) {
- if (removeblankp) {
- if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; }
- continue;
- } else {
- *d = ' ';
- ctype = GRN_STR_BLANK|grn_str_symbol;
- }
- } else {
- *d = c;
- ctype = grn_str_symbol;
- }
- break;
- case 3 :
- *d = c;
- ctype = (c <= 0x39) ? grn_str_digit : grn_str_symbol;
- break;
- case 4 :
- *d = ('A' <= c) ? c + 0x20 : c;
- ctype = (c == 0x40) ? grn_str_symbol : grn_str_alpha;
- break;
- case 5 :
- *d = (c <= 'Z') ? c + 0x20 : c;
- ctype = (c <= 0x5a) ? grn_str_alpha : grn_str_symbol;
- break;
- case 6 :
- *d = c;
- ctype = (c == 0x60) ? grn_str_symbol : grn_str_alpha;
- break;
- case 7 :
- *d = c;
- ctype = (c <= 0x7a) ? grn_str_alpha : (c == 0x7f ? grn_str_others : grn_str_symbol);
- break;
- case 0x0a :
- *d = c;
- ctype = (c == 0xa3) ? grn_str_alpha : grn_str_others;
- break;
- case 0x0b :
- if (c == 0xb3) {
- *d = c - 0x10;
- ctype = grn_str_alpha;
- } else {
- *d = c;
- ctype = grn_str_others;
- }
- break;
- case 0x0c :
- case 0x0d :
- *d = c;
- ctype = grn_str_alpha;
- break;
- case 0x0e :
- case 0x0f :
- *d = c - 0x20;
- ctype = grn_str_alpha;
- break;
- default :
- *d = c;
- ctype = grn_str_others;
- break;
- }
- d++;
- length++;
- if (cp) { *cp++ = ctype; }
- if (ch) {
- *ch++ = (int16_t)(s + 1 - s_);
- s_ = s + 1;
- while (++d_ < d) { *ch++ = 0; }
- }
- }
- if (cp) { *cp = grn_str_null; }
- *d = '\0';
- nstr->length = length;
- nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
- return GRN_SUCCESS;
-}
-
static grn_str *
grn_fakenstr_open(grn_ctx *ctx, const char *str, size_t str_len, grn_encoding encoding, int flags)
{
@@ -1202,53 +257,42 @@ grn_fakenstr_open(grn_ctx *ctx, const char *str, size_t str_len, grn_encoding en
grn_str *
grn_str_open_(grn_ctx *ctx, const char *str, unsigned int str_len, int flags, grn_encoding encoding)
{
- grn_rc rc;
- grn_str *nstr;
+ grn_str *nstr = NULL;
+ grn_id normalizer_id;
+ grn_obj *normalizer;
+ grn_obj *normalized_text_obj;
if (!str || !str_len) { return NULL; }
if (!(flags & GRN_STR_NORMALIZE)) {
return grn_fakenstr_open(ctx, str, str_len, encoding, flags);
}
- if (!(nstr = GRN_MALLOC(sizeof(grn_str)))) {
- GRN_LOG(ctx, GRN_LOG_ALERT, "memory allocation on grn_str_open failed !");
- return NULL;
- }
- nstr->orig = str;
- nstr->orig_blen = str_len;
- nstr->norm = NULL;
- nstr->norm_blen = 0;
- nstr->checks = NULL;
- nstr->ctypes = NULL;
- nstr->encoding = encoding;
- nstr->flags = flags;
- switch (encoding) {
- case GRN_ENC_EUC_JP :
- rc = normalize_euc(ctx, nstr);
- break;
- case GRN_ENC_UTF8 :
-#ifdef NO_NFKC
- rc = normalize_none(ctx, nstr);
-#else /* NO_NFKC */
- rc = normalize_utf8(ctx, nstr);
-#endif /* NO_NFKC */
- break;
- case GRN_ENC_SJIS :
- rc = normalize_sjis(ctx, nstr);
- break;
- case GRN_ENC_LATIN1 :
- rc = normalize_latin1(ctx, nstr);
- break;
- case GRN_ENC_KOI8R :
- rc = normalize_koi8r(ctx, nstr);
- break;
- default :
- rc = normalize_none(ctx, nstr);
- break;
- }
- if (rc) {
- grn_str_close(ctx, nstr);
- return NULL;
+ normalizer_id = grn_normalizer_find(ctx, encoding);
+ normalizer = grn_ctx_at(ctx, normalizer_id);
+ normalized_text_obj = grn_normalized_text_open(ctx, normalizer, str, str_len,
+ encoding, flags);
+ if (normalized_text_obj) {
+ grn_normalized_text *normalized_text;
+ if (!(nstr = GRN_MALLOC(sizeof(grn_str)))) {
+ GRN_LOG(ctx, GRN_LOG_ALERT, "memory allocation on grn_str_open failed !");
+ grn_obj_close(ctx, normalized_text_obj);
+ return NULL;
+ }
+ normalized_text = (grn_normalized_text *)normalized_text_obj;
+ nstr->orig = normalized_text->orig;
+ nstr->orig_blen = normalized_text->orig_blen;
+ nstr->norm = normalized_text->norm;
+ normalized_text->norm = NULL;
+ nstr->norm_blen = normalized_text->norm_blen;
+ normalized_text->norm_blen = 0;
+ nstr->length = normalized_text->length;
+ nstr->checks = normalized_text->checks;
+ normalized_text->checks = NULL;
+ nstr->ctypes = normalized_text->ctypes;
+ normalized_text->ctypes = NULL;
+ nstr->encoding = encoding;
+ nstr->flags = flags;
+ grn_obj_close(ctx, normalized_text_obj);
}
return nstr;
}
Modified: lib/str.h (+1 -0)
===================================================================
--- lib/str.h 2012-02-09 09:53:43 +0900 (6bf0ce0)
+++ lib/str.h 2012-02-08 18:18:36 +0900 (bf98e59)
@@ -80,6 +80,7 @@ grn_rc grn_substring(grn_ctx *ctx, char **str, char **str_end, int start, int en
void grn_logger_fin(void);
GRN_API int grn_charlen_(grn_ctx *ctx, const char *str, const char *end, grn_encoding encoding);
+GRN_API int grn_str_charlen_utf8(grn_ctx *ctx, const unsigned char *str, const unsigned char *end);
GRN_API grn_str *grn_str_open_(grn_ctx *ctx, const char *str, unsigned int str_len, int flags, grn_encoding encoding);
#define GRN_BULK_INCR_LEN(buf,len) {\
Modified: lib/util.c (+3 -0)
===================================================================
--- lib/util.c 2012-02-09 09:53:43 +0900 (f8afe19)
+++ lib/util.c 2012-02-08 18:18:36 +0900 (af9838f)
@@ -100,6 +100,9 @@ grn_proc_inspect(grn_ctx *ctx, grn_obj *buf, grn_obj *obj)
case GRN_PROC_HOOK :
GRN_TEXT_PUTS(ctx, buf, "hook");
break;
+ case GRN_PROC_NORMALIZER :
+ GRN_TEXT_PUTS(ctx, buf, "normalizer");
+ break;
}
GRN_TEXT_PUTS(ctx, buf, " ");
Modified: test/unit/core/dat/test-dat.cpp (+5 -0)
===================================================================
--- test/unit/core/dat/test-dat.cpp 2012-02-09 09:53:43 +0900 (1dce81d)
+++ test/unit/core/dat/test-dat.cpp 2012-02-08 18:18:36 +0900 (9841566)
@@ -72,6 +72,7 @@ namespace test_dat
{
const char *base_dir;
grn_ctx ctx;
+ grn_obj *db;
void cut_setup(void)
{
@@ -82,12 +83,16 @@ namespace test_dat
g_mkdir_with_parents(base_dir, 0755);
grn_ctx_init(&ctx, 0);
+ db = grn_db_create(&ctx, NULL, NULL);
enter_api(&ctx);
}
void cut_teardown(void)
{
leave_api(&ctx);
+ if (db) {
+ grn_obj_unlink(&ctx, db);
+ }
grn_ctx_fin(&ctx);
if (base_dir) {
Modified: test/unit/util/test-snip.c (+7 -1)
===================================================================
--- test/unit/util/test-snip.c 2012-02-09 09:53:43 +0900 (925431e)
+++ test/unit/util/test-snip.c 2012-02-08 18:18:36 +0900 (8573e1f)
@@ -1,6 +1,6 @@
/* -*- c-basic-offset: 2; coding: utf-8 -*- */
/*
- Copyright (C) 2008-2009 Kouhei Sutou <kou****@cozmi*****>
+ Copyright (C) 2008-2012 Kouhei Sutou <kou****@clear*****>
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -45,6 +45,7 @@ void test_add_cond_with_too_large_keyword(void);
void test_add_cond_with_copy_tag_flag(void);
static grn_ctx context;
+static grn_obj *db;
static grn_snip *snip;
static gchar *keyword;
static gchar *result;
@@ -197,6 +198,7 @@ void
cut_setup(void)
{
grn_ctx_init(&context, GRN_CTX_USE_QL);
+ db = grn_db_create(&context, NULL, NULL);
snip = NULL;
keyword = NULL;
@@ -233,6 +235,10 @@ cut_teardown(void)
g_free(default_close_tag);
}
+ if (db) {
+ grn_obj_close(&context, db);
+ }
+
grn_ctx_fin(&context);
}
Modified: test/unit/util/test-string.c (+4 -1)
===================================================================
--- test/unit/util/test-string.c 2012-02-09 09:53:43 +0900 (2417060)
+++ test/unit/util/test-string.c 2012-02-08 18:18:36 +0900 (3e97014)
@@ -1,6 +1,6 @@
/* -*- c-basic-offset: 2; coding: utf-8 -*- */
/*
- Copyright (C) 2008-2011 Kouhei Sutou <kou****@clear*****>
+ Copyright (C) 2008-2012 Kouhei Sutou <kou****@clear*****>
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -53,6 +53,7 @@ void data_itoh(void);
void test_itoh(gconstpointer data);
static grn_ctx context;
+static grn_obj *db;
static grn_obj buffer;
static const gchar text_ja_utf8[] =
@@ -75,6 +76,7 @@ void
setup (void)
{
grn_ctx_init(&context, GRN_CTX_USE_QL);
+ db = grn_db_create(&context, NULL, NULL);
GRN_VOID_INIT(&buffer);
}
@@ -82,6 +84,7 @@ void
teardown (void)
{
GRN_OBJ_FIN(&context, &buffer);
+ grn_obj_unlink(&context, db);
grn_ctx_fin(&context);
}