Kouhei Sutou
null+****@clear*****
Fri Dec 14 12:40:02 JST 2012
Kouhei Sutou 2012-12-14 12:40:02 +0900 (Fri, 14 Dec 2012) New Revision: 05f50761b76802859207d885804ab546b0e0b1b1 https://github.com/groonga/groonga/commit/05f50761b76802859207d885804ab546b0e0b1b1 Log: Make normalizer grn_obj We can create a normalizer as a plugin. Yay! TODO: * Add --normalizer option to table_create command * Add Unicode Collation Algorithm (UCA) based normalizer as a plugin We don't implement the NFKC normalizer as a plugin because it is used by NormalizerAuto internally. Added files: include/groonga/normalizer.h lib/normalizer_in.h Copied files: lib/normalizer.c (from lib/string.c) Modified files: include/groonga.h include/groonga/Makefile.am lib/ctx.c lib/dat.cpp lib/dat.h lib/db.c lib/db.h lib/expr.c lib/hash.c lib/hash.h lib/ii.c lib/pat.c lib/pat.h lib/sources.am lib/string.c lib/token.c lib/tokenizer.c lib/util.c test/unit/core/dat/test-dat.cpp test/unit/util/test-snip.c test/unit/util/test-string.c Modified: include/groonga.h (+12 -3) =================================================================== --- include/groonga.h 2012-12-14 12:20:38 +0900 (1a9dda6) +++ include/groonga.h 2012-12-14 12:40:02 +0900 (77280a8) @@ -112,7 +112,8 @@ typedef enum { GRN_TOO_LARGE_OFFSET = -68, GRN_TOO_SMALL_LIMIT = -69, GRN_CAS_ERROR = -70, - GRN_UNSUPPORTED_COMMAND_VERSION = -71 + GRN_UNSUPPORTED_COMMAND_VERSION = -71, + GRN_NORMALIZER_ERROR = -72, } grn_rc; GRN_API grn_rc grn_init(void); @@ -609,6 +610,12 @@ typedef enum { GRN_DB_TRIGRAM } grn_builtin_tokenizer; +typedef enum { + GRN_DB_NORMALIZER_AUTO = 96, + GRN_DB_NORMALIZER_NFKC51, /* Normalization Form KC for Unicode 5.1 */ + GRN_DB_NORMALIZER_UCA /* Unicode Collation Algorithm */ +} grn_builtin_normalizer; + GRN_API grn_obj *grn_ctx_at(grn_ctx *ctx, grn_id id); /** @@ -657,7 +664,8 @@ typedef enum { GRN_PROC_TOKENIZER = 1, GRN_PROC_COMMAND, GRN_PROC_FUNCTION, - GRN_PROC_HOOK + GRN_PROC_HOOK, + GRN_PROC_NORMALIZER } grn_proc_type; GRN_API grn_obj *grn_proc_create(grn_ctx *ctx, @@ -1374,7 +1382,8 @@ typedef 
enum { GRN_INFO_PARTIAL_MATCH_THRESHOLD, GRN_INFO_II_SPLIT_THRESHOLD, GRN_INFO_SUPPORT_ZLIB, - GRN_INFO_SUPPORT_LZO + GRN_INFO_SUPPORT_LZO, + GRN_INFO_NORMALIZER } grn_info_type; /** Modified: include/groonga/Makefile.am (+2 -1) =================================================================== --- include/groonga/Makefile.am 2012-12-14 12:20:38 +0900 (e6f37e7) +++ include/groonga/Makefile.am 2012-12-14 12:40:02 +0900 (f7151d7) @@ -1,4 +1,5 @@ groonga_includedir = $(pkgincludedir)/groonga groonga_include_HEADERS = \ plugin.h \ - tokenizer.h + tokenizer.h \ + normalizer.h Added: include/groonga/normalizer.h (+55 -0) 100644 =================================================================== --- /dev/null +++ include/groonga/normalizer.h 2012-12-14 12:40:02 +0900 (3ec843c) @@ -0,0 +1,55 @@ +/* -*- c-basic-offset: 2 -*- */ +/* + Copyright(C) 2012 Brazil + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ +#ifndef GROONGA_NORMALIER_H +#define GROONGA_NORMALIER_H + +#include <stddef.h> + +#include <groonga/plugin.h> + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/* + grn_normalizer_register() registers a normalizer to the database + which is associated with `ctx'. `name_ptr' and `name_length' specify + the normalizer name. `name_length' can be `-1'. `-1' means that + `name_ptr` is NULL-terminated. 
Alphabetic letters ('A'-'Z' and + 'a'-'z'), digits ('0'-'9') and an underscore ('_') are capable + characters. `init', `next' and `fin' specify the normalizer + functions. `init' is called for initializing a normalizer for a + string. `next' is called for normalizing the string. + `fin' is called for finalizing a + normalizer. grn_normalizer_register() returns GRN_SUCCESS on success, + an error code on failure. See "groonga.h" for more details of + grn_proc_func and grn_user_data, that is used as an argument of + grn_proc_func. + */ +GRN_PLUGIN_EXPORT grn_rc grn_normalizer_register(grn_ctx *ctx, + const char *name_ptr, + int name_length, + grn_proc_func *init, + grn_proc_func *next, + grn_proc_func *fin); + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + +#endif /* GROONGA_NORMALIER_H */ Modified: lib/ctx.c (+6 -0) =================================================================== --- lib/ctx.c 2012-12-14 12:20:38 +0900 (a358592) +++ lib/ctx.c 2012-12-14 12:40:02 +0900 (f80d23f) @@ -24,6 +24,7 @@ #include "plugin_in.h" #include "snip.h" #include "output.h" +#include "normalizer_in.h" #include <stdio.h> #include <stdarg.h> #include <time.h> @@ -944,6 +945,10 @@ grn_init(void) GRN_LOG(ctx, GRN_LOG_ALERT, "plugins initialize failed (%d)", rc); return rc; } + if ((rc = grn_normalizer_init())) { + GRN_LOG(ctx, GRN_LOG_ALERT, "grn_normalizer_init failed (%d)", rc); + return rc; + } if ((rc = grn_token_init())) { GRN_LOG(ctx, GRN_LOG_ALERT, "grn_token_init failed (%d)", rc); return rc; @@ -1029,6 +1034,7 @@ grn_fin(void) grn_query_logger_fin(ctx); grn_cache_fin(); grn_token_fin(); + grn_normalizer_fin(); grn_plugins_fin(); grn_io_fin(); grn_ctx_fin(ctx); Modified: lib/dat.cpp (+15 -3) =================================================================== --- lib/dat.cpp 2012-12-14 12:20:38 +0900 (e83e60c) +++ lib/dat.cpp 2012-12-14 12:40:02 +0900 (ca208f0) @@ -302,6 +302,14 @@ grn_dat_create(grn_ctx *ctx, const char *path, uint32_t, 
dat->header->encoding = encoding; dat->header->tokenizer = GRN_ID_NIL; dat->header->file_id = 0; + if (dat->header->flags & GRN_OBJ_KEY_NORMALIZE) { + dat->header->flags &= ~GRN_OBJ_KEY_NORMALIZE; + dat->header->normalizer = GRN_DB_NORMALIZER_AUTO; + dat->normalizer = grn_ctx_at(ctx, dat->header->normalizer); + } else { + dat->header->normalizer = GRN_ID_NIL; + dat->normalizer = NULL; + } dat->encoding = encoding; dat->tokenizer = NULL; return dat; @@ -337,6 +345,11 @@ grn_dat_open(grn_ctx *ctx, const char *path) dat->encoding = dat->header->encoding; dat->obj.header.flags = dat->header->flags; dat->tokenizer = grn_ctx_at(ctx, dat->header->tokenizer); + if (dat->header->flags & GRN_OBJ_KEY_NORMALIZE) { + dat->header->flags &= ~GRN_OBJ_KEY_NORMALIZE; + dat->header->normalizer = GRN_DB_NORMALIZER_AUTO; + } + dat->normalizer = grn_ctx_at(ctx, dat->header->normalizer); return dat; } @@ -672,11 +685,10 @@ grn_dat_scan(grn_ctx *ctx, grn_dat *dat, const char *str, int num_scan_hits = 0; try { - if (dat->obj.header.flags & GRN_OBJ_KEY_NORMALIZE) { - grn_obj *normalizer = GRN_NORMALIZER_AUTO; + if (dat->normalizer) { int flags = GRN_STRING_WITH_CHECKS; grn_obj * const normalized_string = grn_string_open(ctx, str, str_size, - normalizer, + dat->normalizer, flags); if (!normalized_string) { fprintf(stderr, "error: grn_string_open() failed!\n"); Modified: lib/dat.h (+3 -0) =================================================================== --- lib/dat.h 2012-12-14 12:20:38 +0900 (9ae15f8) +++ lib/dat.h 2012-12-14 12:40:02 +0900 (00c71df) @@ -36,6 +36,7 @@ struct _grn_dat { void *trie; void *old_trie; grn_obj *tokenizer; + grn_obj *normalizer; grn_critical_section lock; }; @@ -44,6 +45,8 @@ struct grn_dat_header { grn_encoding encoding; grn_id tokenizer; uint32_t file_id; + grn_id normalizer; + uint32_t reserved[235]; }; struct _grn_dat_cursor { Modified: lib/db.c (+55 -8) =================================================================== --- lib/db.c 2012-12-14 12:20:38 +0900 
(c1d6887) +++ lib/db.c 2012-12-14 12:40:02 +0900 (fc625ba) @@ -27,6 +27,7 @@ #include "geo.h" #include "snip.h" #include "string_in.h" +#include "normalizer_in.h" #include "util.h" #include <string.h> #include <float.h> @@ -34,10 +35,10 @@ #define NEXT_ADDR(p) (((byte *)(p)) + sizeof *(p)) #define WITH_NORMALIZE(table,key,key_size,block) do {\ - if ((table)->obj.header.flags & GRN_OBJ_KEY_NORMALIZE) {\ + if ((table)->normalizer) {\ grn_obj *nstr;\ if ((nstr = grn_string_open(ctx, key, key_size,\ - GRN_NORMALIZER_AUTO, 0))) {\ + (table)->normalizer, 0))) {\ const char *key;\ unsigned int key_size;\ grn_string_get_normalized(ctx, nstr, &key, &key_size, NULL);\ @@ -253,6 +254,7 @@ grn_db_open(grn_ctx *ctx, const char *path) } #endif grn_db_init_builtin_tokenizers(ctx); + grn_db_init_builtin_normalizers(ctx); grn_db_init_builtin_query(ctx); GRN_API_RETURN((grn_obj *)s); } @@ -1812,7 +1814,7 @@ grn_table_truncate(grn_ctx *ctx, grn_obj *table) } grn_hash_close(ctx, cols); } - grn_table_get_info(ctx, table, NULL, NULL, &tokenizer); + grn_table_get_info(ctx, table, NULL, NULL, &tokenizer, NULL); switch (table->header.type) { case GRN_TABLE_PAT_KEY : for (hooks = DB_OBJ(table)->hooks[GRN_HOOK_INSERT]; hooks; hooks = hooks->next) { @@ -1854,7 +1856,8 @@ exit : grn_rc grn_table_get_info(grn_ctx *ctx, grn_obj *table, grn_obj_flags *flags, - grn_encoding *encoding, grn_obj **tokenizer) + grn_encoding *encoding, grn_obj **tokenizer, + grn_obj **normalizer) { grn_rc rc = GRN_INVALID_ARGUMENT; GRN_API_ENTER; @@ -1864,24 +1867,28 @@ grn_table_get_info(grn_ctx *ctx, grn_obj *table, grn_obj_flags *flags, if (flags) { *flags = ((grn_pat *)table)->obj.header.flags; } if (encoding) { *encoding = ((grn_pat *)table)->encoding; } if (tokenizer) { *tokenizer = ((grn_pat *)table)->tokenizer; } + if (normalizer) { *normalizer = ((grn_pat *)table)->normalizer; } rc = GRN_SUCCESS; break; case GRN_TABLE_DAT_KEY : if (flags) { *flags = ((grn_dat *)table)->obj.header.flags; } if (encoding) { 
*encoding = ((grn_dat *)table)->encoding; } if (tokenizer) { *tokenizer = ((grn_dat *)table)->tokenizer; } + if (normalizer) { *normalizer = ((grn_dat *)table)->normalizer; } rc = GRN_SUCCESS; break; case GRN_TABLE_HASH_KEY : if (flags) { *flags = ((grn_hash *)table)->obj.header.flags; } if (encoding) { *encoding = ((grn_hash *)table)->encoding; } if (tokenizer) { *tokenizer = ((grn_hash *)table)->tokenizer; } + if (normalizer) { *normalizer = ((grn_hash *)table)->normalizer; } rc = GRN_SUCCESS; break; case GRN_TABLE_NO_KEY : if (flags) { *flags = 0; } if (encoding) { *encoding = GRN_ENC_NONE; } if (tokenizer) { *tokenizer = grn_uvector_tokenizer; } + if (normalizer) { *normalizer = NULL; } rc = GRN_SUCCESS; break; } @@ -6095,6 +6102,19 @@ grn_obj_get_info(grn_ctx *ctx, grn_obj *obj, grn_info_type type, grn_obj *valueb break; } break; + case GRN_INFO_NORMALIZER : + switch (DB_OBJ(obj)->header.type) { + case GRN_TABLE_HASH_KEY : + valuebuf = ((grn_hash *)obj)->normalizer; + break; + case GRN_TABLE_PAT_KEY : + valuebuf = ((grn_pat *)obj)->normalizer; + break; + case GRN_TABLE_DAT_KEY : + valuebuf = ((grn_dat *)obj)->normalizer; + break; + } + break; default : /* todo */ break; @@ -6117,7 +6137,7 @@ build_index(grn_ctx *ctx, grn_obj *obj) grn_obj_flags flags; grn_ii *ii = (grn_ii *)obj; grn_bool use_grn_ii_build; - grn_table_get_info(ctx, ii->lexicon, &flags, NULL, NULL); + grn_table_get_info(ctx, ii->lexicon, &flags, NULL, NULL, NULL); switch (flags & GRN_OBJ_TABLE_TYPE_MASK) { case GRN_OBJ_TABLE_PAT_KEY : case GRN_OBJ_TABLE_DAT_KEY : @@ -6434,6 +6454,28 @@ grn_obj_set_info(grn_ctx *ctx, grn_obj *obj, grn_info_type type, grn_obj *value) break; } } + break; + case GRN_INFO_NORMALIZER : + if (!value || DB_OBJ(value)->header.type == GRN_PROC) { + switch (DB_OBJ(obj)->header.type) { + case GRN_TABLE_HASH_KEY : + ((grn_hash *)obj)->normalizer = value; + ((grn_hash *)obj)->header->normalizer = grn_obj_id(ctx, value); + rc = GRN_SUCCESS; + break; + case GRN_TABLE_PAT_KEY : 
+ ((grn_pat *)obj)->normalizer = value; + ((grn_pat *)obj)->header->normalizer = grn_obj_id(ctx, value); + rc = GRN_SUCCESS; + break; + case GRN_TABLE_DAT_KEY : + ((grn_dat *)obj)->normalizer = value; + ((grn_dat *)obj)->header->normalizer = grn_obj_id(ctx, value); + rc = GRN_SUCCESS; + break; + } + } + break; default : /* todo */ break; @@ -8446,6 +8488,11 @@ grn_db_init_builtin_types(grn_ctx *ctx) } #endif grn_db_init_builtin_tokenizers(ctx); + for (id = grn_db_curr_id(ctx, db) + 1; id < GRN_DB_NORMALIZER_AUTO; id++) { + grn_itoh(id, buf + 3, 2); + grn_obj_register(ctx, db, buf, 5); + } + grn_db_init_builtin_normalizers(ctx); for (id = grn_db_curr_id(ctx, db) + 1; id < 128; id++) { grn_itoh(id, buf + 3, 2); grn_obj_register(ctx, db, buf, 5); @@ -8479,7 +8526,7 @@ grn_column_index(grn_ctx *ctx, grn_obj *obj, grn_operator op, if (obj->header.type != GRN_COLUMN_FIX_SIZE) { grn_obj *tokenizer, *lexicon = grn_ctx_at(ctx, target->header.domain); if (!lexicon) { continue; } - grn_table_get_info(ctx, lexicon, NULL, NULL, &tokenizer); + grn_table_get_info(ctx, lexicon, NULL, NULL, &tokenizer, NULL); if (tokenizer) { continue; } } if (n < buf_size) { @@ -8520,7 +8567,7 @@ grn_column_index(grn_ctx *ctx, grn_obj *obj, grn_operator op, if (!lexicon) { continue; } if (lexicon->header.type != GRN_TABLE_PAT_KEY) { continue; } /* FIXME: GRN_TABLE_DAT_KEY should be supported */ - grn_table_get_info(ctx, lexicon, NULL, NULL, &tokenizer); + grn_table_get_info(ctx, lexicon, NULL, NULL, &tokenizer, NULL); if (tokenizer) { continue; } } if (n < buf_size) { @@ -8626,7 +8673,7 @@ grn_column_index(grn_ctx *ctx, grn_obj *obj, grn_operator op, if (!lexicon) { continue; } if (lexicon->header.type != GRN_TABLE_PAT_KEY) { continue; } /* FIXME: GRN_TABLE_DAT_KEY should be supported */ - grn_table_get_info(ctx, lexicon, NULL, NULL, &tokenizer); + grn_table_get_info(ctx, lexicon, NULL, NULL, &tokenizer, NULL); if (tokenizer) { continue; } } if (n < buf_size) { Modified: lib/db.h (+2 -1) 
=================================================================== --- lib/db.h 2012-12-14 12:20:38 +0900 (961c968) +++ lib/db.h 2012-12-14 12:40:02 +0900 (0db78e5) @@ -92,7 +92,8 @@ grn_id grn_table_get_v(grn_ctx *ctx, grn_obj *table, const void *key, int key_si grn_id grn_table_add_v(grn_ctx *ctx, grn_obj *table, const void *key, int key_size, void **value, int *added); GRN_API grn_rc grn_table_get_info(grn_ctx *ctx, grn_obj *table, grn_obj_flags *flags, - grn_encoding *encoding, grn_obj **tokenizer); + grn_encoding *encoding, grn_obj **tokenizer, + grn_obj **normalizer); const char *_grn_table_key(grn_ctx *ctx, grn_obj *table, grn_id id, uint32_t *key_size); grn_rc grn_table_search(grn_ctx *ctx, grn_obj *table, Modified: lib/expr.c (+9 -3) =================================================================== --- lib/expr.c 2012-12-14 12:20:38 +0900 (8915385) +++ lib/expr.c 2012-12-14 12:40:02 +0900 (496c2b8) @@ -2298,14 +2298,16 @@ grn_proc_call(grn_ctx *ctx, grn_obj *proc, int nargs, grn_obj *caller) void pseudo_query_scan(grn_ctx *ctx, grn_obj *x, grn_obj *y, grn_obj *res) { + grn_obj *normalizer; grn_obj *a = NULL, *b = NULL; + normalizer = grn_ctx_at(ctx, GRN_DB_NORMALIZER_AUTO); switch (x->header.domain) { case GRN_DB_SHORT_TEXT: case GRN_DB_TEXT: case GRN_DB_LONG_TEXT: a = grn_string_open(ctx, GRN_TEXT_VALUE(x), GRN_TEXT_LEN(x), - GRN_NORMALIZER_AUTO, 0); + normalizer, 0); break; default: break; @@ -2316,7 +2318,7 @@ pseudo_query_scan(grn_ctx *ctx, grn_obj *x, grn_obj *y, grn_obj *res) case GRN_DB_TEXT: case GRN_DB_LONG_TEXT: b = grn_string_open(ctx, GRN_TEXT_VALUE(y), GRN_TEXT_LEN(y), - GRN_NORMALIZER_AUTO, 0); + normalizer, 0); break; default: break; @@ -2336,6 +2338,8 @@ pseudo_query_scan(grn_ctx *ctx, grn_obj *x, grn_obj *y, grn_obj *res) if (a) { grn_obj_close(ctx, a); } if (b) { grn_obj_close(ctx, b); } + + if (normalizer) { grn_obj_unlink(ctx, normalizer); } } grn_obj * @@ -2926,7 +2930,9 @@ grn_expr_exec(grn_ctx *ctx, grn_obj *expr, int nargs) { 
grn_obj *x, *y; POP2ALLOC1(x, y, res); - pseudo_query_scan(ctx, x, y, res); + WITH_SPSAVE({ + pseudo_query_scan(ctx, x, y, res); + }); } code++; break; Modified: lib/hash.c (+18 -2) =================================================================== --- lib/hash.c 2012-12-14 12:20:38 +0900 (e511517) +++ lib/hash.c 2012-12-14 12:40:02 +0900 (3267179) @@ -1482,9 +1482,17 @@ grn_io_hash_init(grn_ctx *ctx, grn_hash *hash, const char *path, header->n_entries = 0; header->n_garbages = 0; header->tokenizer = GRN_ID_NIL; + if (header->flags & GRN_OBJ_KEY_NORMALIZE) { + header->flags &= ~GRN_OBJ_KEY_NORMALIZE; + header->normalizer = GRN_DB_NORMALIZER_AUTO; + hash->normalizer = grn_ctx_at(ctx, header->normalizer); + } else { + header->normalizer = GRN_ID_NIL; + hash->normalizer = NULL; + } grn_table_queue_init(ctx, &header->queue); - hash->obj.header.flags = flags; + hash->obj.header.flags = header->flags; hash->ctx = ctx; hash->key_size = key_size; hash->encoding = encoding; @@ -1555,6 +1563,7 @@ grn_tiny_hash_init(grn_ctx *ctx, grn_hash *hash, const char *path, hash->n_entries_ = 0; hash->garbages = GRN_ID_NIL; hash->tokenizer = NULL; + hash->normalizer = NULL; grn_tiny_array_init(ctx, &hash->a, entry_size, GRN_TINY_ARRAY_CLEAR); grn_tiny_bitmap_init(ctx, &hash->bitmap); return GRN_SUCCESS; @@ -1621,6 +1630,11 @@ grn_hash_open(grn_ctx *ctx, const char *path) hash->header = header; hash->lock = &header->lock; hash->tokenizer = grn_ctx_at(ctx, header->tokenizer); + if (header->flags & GRN_OBJ_KEY_NORMALIZE) { + header->flags &= ~GRN_OBJ_KEY_NORMALIZE; + header->normalizer = GRN_DB_NORMALIZER_AUTO; + } + hash->normalizer = grn_ctx_at(ctx, header->normalizer); return hash; } else { GRN_LOG(ctx, GRN_LOG_NOTICE, @@ -2922,7 +2936,7 @@ grn_hash_check(grn_ctx *ctx, grn_hash *hash) char buf[8]; struct grn_hash_header *h = hash->header; GRN_OUTPUT_ARRAY_OPEN("RESULT", 1); - GRN_OUTPUT_MAP_OPEN("SUMMARY", 24); + GRN_OUTPUT_MAP_OPEN("SUMMARY", 25); GRN_OUTPUT_CSTR("flags"); 
grn_itoh(h->flags, buf, 8); GRN_OUTPUT_STR(buf, 8); @@ -2932,6 +2946,8 @@ grn_hash_check(grn_ctx *ctx, grn_hash *hash) GRN_OUTPUT_INT64(hash->value_size); GRN_OUTPUT_CSTR("tokenizer"); GRN_OUTPUT_INT64(h->tokenizer); + GRN_OUTPUT_CSTR("normalizer"); + GRN_OUTPUT_INT64(h->normalizer); GRN_OUTPUT_CSTR("curr_rec"); GRN_OUTPUT_INT64(h->curr_rec); GRN_OUTPUT_CSTR("curr_key"); Modified: lib/hash.h (+3 -1) =================================================================== --- lib/hash.h 2012-12-14 12:20:38 +0900 (626ad01) +++ lib/hash.h 2012-12-14 12:40:02 +0900 (6792e5b) @@ -199,6 +199,7 @@ struct _grn_hash { uint32_t *n_entries; uint32_t *max_offset; grn_obj *tokenizer; + grn_obj *normalizer; /* For grn_io_hash. */ grn_io *io; @@ -242,7 +243,8 @@ struct grn_hash_header { uint32_t n_entries; uint32_t n_garbages; uint32_t lock; - uint32_t reserved[16]; + grn_id normalizer; + uint32_t reserved[15]; grn_id garbages[GRN_HASH_MAX_KEY_SIZE]; grn_table_queue queue; }; Modified: lib/ii.c (+13 -5) =================================================================== --- lib/ii.c 2012-12-14 12:20:38 +0900 (eb80a0c) +++ lib/ii.c 2012-12-14 12:40:02 +0900 (d7dda44) @@ -3458,7 +3458,9 @@ _grn_ii_create(grn_ctx *ctx, grn_ii *ii, const char *path, grn_obj *lexicon, uin free_histogram[i] = 0; } */ - if (grn_table_get_info(ctx, lexicon, &lflags, &encoding, &tokenizer)) { return NULL; } + if (grn_table_get_info(ctx, lexicon, &lflags, &encoding, &tokenizer, NULL)) { + return NULL; + } if (path && strlen(path) + 6 >= PATH_MAX) { return NULL; } seg = grn_io_create(ctx, path, sizeof(struct grn_ii_header), S_SEGMENT, GRN_II_MAX_LSEG, grn_io_auto, GRN_IO_EXPIRE_SEGMENT); @@ -3578,7 +3580,9 @@ grn_ii_open(grn_ctx *ctx, const char *path, grn_obj *lexicon) grn_obj_flags lflags; grn_encoding encoding; grn_obj *tokenizer; - if (grn_table_get_info(ctx, lexicon, &lflags, &encoding, &tokenizer)) { return NULL; } + if (grn_table_get_info(ctx, lexicon, &lflags, &encoding, &tokenizer, NULL)) { + return 
NULL; + } if (strlen(path) + 6 >= PATH_MAX) { return NULL; } strcpy(path2, path); strcat(path2, ".c"); @@ -6745,14 +6749,18 @@ get_tmp_lexicon(grn_ctx *ctx, grn_ii_buffer *ii_buffer) grn_obj *domain = grn_ctx_at(ctx, ii_buffer->lexicon->header.domain); grn_obj *range = grn_ctx_at(ctx, DB_OBJ(ii_buffer->lexicon)->range); grn_obj *tokenizer; + grn_obj *normalizer; grn_obj_flags flags; - grn_table_get_info(ctx, ii_buffer->lexicon, &flags, NULL, &tokenizer); + grn_table_get_info(ctx, ii_buffer->lexicon, &flags, NULL, + &tokenizer, &normalizer); flags &= ~GRN_OBJ_PERSISTENT; tmp_lexicon = grn_table_create(ctx, NULL, 0, NULL, flags, domain, range); if (tmp_lexicon) { ii_buffer->tmp_lexicon = tmp_lexicon; grn_obj_set_info(ctx, tmp_lexicon, GRN_INFO_DEFAULT_TOKENIZER, tokenizer); + grn_obj_set_info(ctx, tmp_lexicon, + GRN_INFO_NORMALIZER, normalizer); if ((flags & GRN_OBJ_TABLE_TYPE_MASK) == GRN_OBJ_TABLE_PAT_KEY) { grn_pat_cache_enable(ctx, (grn_pat *)tmp_lexicon, PAT_CACHE_SIZE); } @@ -7193,7 +7201,7 @@ grn_ii_buffer_open(grn_ctx *ctx, grn_ii *ii, S_IRUSR|S_IWUSR); if (ii_buffer->tmpfd != -1) { grn_obj_flags flags; - grn_table_get_info(ctx, ii->lexicon, &flags, NULL, NULL); + grn_table_get_info(ctx, ii->lexicon, &flags, NULL, NULL, NULL); if ((flags & GRN_OBJ_TABLE_TYPE_MASK) == GRN_OBJ_TABLE_PAT_KEY) { grn_pat_cache_enable(ctx, (grn_pat *)ii->lexicon, PAT_CACHE_SIZE); @@ -7316,7 +7324,7 @@ grn_ii_buffer_close(grn_ctx *ctx, grn_ii_buffer *ii_buffer) { uint32_t i; grn_obj_flags flags; - grn_table_get_info(ctx, ii_buffer->ii->lexicon, &flags, NULL, NULL); + grn_table_get_info(ctx, ii_buffer->ii->lexicon, &flags, NULL, NULL, NULL); if ((flags & GRN_OBJ_TABLE_TYPE_MASK) == GRN_OBJ_TABLE_PAT_KEY) { grn_pat_cache_disable(ctx, (grn_pat *)ii_buffer->ii->lexicon); } Copied: lib/normalizer.c (+104 -375) 75% =================================================================== --- lib/string.c 2012-12-14 12:20:38 +0900 (5253a35) +++ lib/normalizer.c 2012-12-14 12:40:02 +0900 
(8eff99e) @@ -1,6 +1,6 @@ /* -*- c-basic-offset: 2 -*- */ /* - Copyright(C) 2009-2012 Brazil + Copyright(C) 2012 Brazil This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -16,13 +16,59 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#include "groonga_in.h" #include <string.h> -#include "string_in.h" -#include "str.h" +#include "normalizer_in.h" +#include "string_in.h" +#include <groonga/normalizer.h> #include <groonga/tokenizer.h> +grn_rc +grn_normalizer_register(grn_ctx *ctx, + const char *name_ptr, + int name_length, + grn_proc_func *init, + grn_proc_func *next, + grn_proc_func *fin) +{ + grn_expr_var vars[] = { + { NULL, 0 } + }; + GRN_PTR_INIT(&vars[0].value, 0, GRN_ID_NIL); + + if (name_length < 0) { + name_length = strlen(name_ptr); + } + + { + grn_obj * const normalizer = grn_proc_create(ctx, + name_ptr, name_length, + GRN_PROC_NORMALIZER, + init, next, fin, + sizeof(*vars) / sizeof(vars), + vars); + if (!normalizer) { + GRN_PLUGIN_ERROR(ctx, GRN_NORMALIZER_ERROR, + "[normalizer] failed to register normalizer: <%.*s>", + name_length, name_ptr); + return ctx->rc; + } + } + return GRN_SUCCESS; +} + +grn_rc +grn_normalizer_init(void) +{ + return GRN_SUCCESS; +} + +grn_rc +grn_normalizer_fin(void) +{ + return GRN_SUCCESS; +} + static unsigned char symbol[] = { ',', '.', 0, ':', ';', '?', '!', 0, 0, 0, '`', 0, '^', '~', '_', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '-', '-', '/', '\\', 0, 0, '|', 0, 0, 0, '\'', 0, @@ -32,8 +78,7 @@ static unsigned char symbol[] = { }; inline static grn_obj * -eucjp_normalize(grn_ctx *ctx, int nargs, grn_obj **args, - grn_user_data *user_data) +eucjp_normalize(grn_ctx *ctx, grn_string *nstr) { static uint16_t hankana[] = { 0xa1a1, 0xa1a3, 0xa1d6, 0xa1d7, 0xa1a2, 0xa1a6, 0xa5f2, 0xa5a1, 0xa5a3, @@ -54,7 +99,6 @@ eucjp_normalize(grn_ctx *ctx, int nargs, grn_obj **args, static unsigned char handaku[] = { 0xd1, 0, 0, 0xd4, 0, 0, 0xd7, 0, 
0, 0xda, 0, 0, 0xdd }; - grn_string *nstr = (grn_string *)args[0]; int16_t *ch; const unsigned char *s, *s_, *e; unsigned char *d, *d0, *d_, b; @@ -279,8 +323,7 @@ eucjp_normalize(grn_ctx *ctx, int nargs, grn_obj **args, } inline static grn_obj * -sjis_normalize(grn_ctx *ctx, int nargs, grn_obj **args, - grn_user_data *user_data) +sjis_normalize(grn_ctx *ctx, grn_string *nstr) { static uint16_t hankana[] = { 0x8140, 0x8142, 0x8175, 0x8176, 0x8141, 0x8145, 0x8392, 0x8340, 0x8342, @@ -301,7 +344,6 @@ sjis_normalize(grn_ctx *ctx, int nargs, grn_obj **args, static unsigned char handaku[] = { 0x70, 0, 0, 0x73, 0, 0, 0x76, 0, 0, 0x79, 0, 0, 0x7c }; - grn_string *nstr = (grn_string *)args[0]; int16_t *ch; const unsigned char *s, *s_; unsigned char *d, *d0, *d_, b, *e; @@ -572,13 +614,12 @@ grn_str_charlen_utf8(grn_ctx *ctx, const unsigned char *str, const unsigned char } inline static grn_obj * -utf8_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) +utf8_normalize(grn_ctx *ctx, grn_string *nstr) { int16_t *ch; const unsigned char *s, *s_, *s__ = NULL, *p, *p2, *pe, *e; unsigned char *d, *d_, *de; uint_least8_t *cp; - grn_string *nstr = (grn_string *)args[0]; size_t length = 0, ls, lp, size = nstr->original_length_in_bytes, ds = size * 3; int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK; grn_bool remove_tokenized_delimiter_p = @@ -715,9 +756,8 @@ utf8_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data #endif /* WITH_NFKC */ inline static grn_obj * -ascii_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) +ascii_normalize(grn_ctx *ctx, grn_string *nstr) { - grn_string *nstr = (grn_string *)args[0]; int16_t *ch; const unsigned char *s, *s_, *e; unsigned char *d, *d0, *d_; @@ -818,10 +858,8 @@ ascii_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_dat /* use cp1252 as latin1 */ inline static grn_obj * -latin1_normalize(grn_ctx *ctx, int nargs, grn_obj **args, - 
grn_user_data *user_data) +latin1_normalize(grn_ctx *ctx, grn_string *nstr) { - grn_string *nstr = (grn_string *)args[0]; int16_t *ch; const unsigned char *s, *s_, *e; unsigned char *d, *d0, *d_; @@ -955,10 +993,8 @@ latin1_normalize(grn_ctx *ctx, int nargs, grn_obj **args, } inline static grn_obj * -koi8r_normalize(grn_ctx *ctx, int nargs, grn_obj **args, - grn_user_data *user_data) +koi8r_normalize(grn_ctx *ctx, grn_string *nstr) { - grn_string *nstr = (grn_string *)args[0]; int16_t *ch; const unsigned char *s, *s_, *e; unsigned char *d, *d0, *d_; @@ -1080,396 +1116,89 @@ koi8r_normalize(grn_ctx *ctx, int nargs, grn_obj **args, return NULL; } -static grn_string * -grn_fake_string_open(grn_ctx *ctx, grn_string *string) -{ - /* TODO: support GRN_STRING_REMOVE_BLANK flag and ctypes */ - grn_string *nstr = string; - const char *str; - unsigned int str_len; - - str = nstr->original; - str_len = nstr->original_length_in_bytes; - - if (!(nstr->normalized = GRN_MALLOC(str_len + 1))) { - ERR(GRN_NO_MEMORY_AVAILABLE, - "[strinig][fake] failed to allocate normalized text space"); - grn_string_close(ctx, (grn_obj *)nstr); - return NULL; - } - - if (nstr->flags & GRN_STRING_REMOVE_TOKENIZED_DELIMITER && - ctx->encoding == GRN_ENC_UTF8) { - int char_length; - const char *source_current = str; - const char *source_end = str + str_len; - char *destination = nstr->normalized; - unsigned int destination_length = 0; - while ((char_length = grn_charlen(ctx, source_current, source_end)) > 0) { - if (!grn_tokenizer_is_tokenized_delimiter(ctx, - source_current, char_length, - ctx->encoding)) { - memcpy(destination, source_current, char_length); - destination += char_length; - destination_length += char_length; - } - source_current += char_length; - } - nstr->normalized[destination_length] = '\0'; - nstr->normalized_length_in_bytes = destination_length; - } else { - memcpy(nstr->normalized, str, str_len); - nstr->normalized[str_len] = '\0'; - nstr->normalized_length_in_bytes = str_len; 
- } - - if (nstr->flags & GRN_STRING_WITH_CHECKS) { - int16_t f = 0; - unsigned char c; - size_t i; - if (!(nstr->checks = (int16_t *) GRN_MALLOC(sizeof(int16_t) * str_len))) { - grn_string_close(ctx, (grn_obj *)nstr); - ERR(GRN_NO_MEMORY_AVAILABLE, - "[strinig][fake] failed to allocate checks space"); - return NULL; - } - switch (nstr->encoding) { - case GRN_ENC_EUC_JP: - for (i = 0; i < str_len; i++) { - if (!f) { - c = (unsigned char) str[i]; - f = ((c >= 0xa1U && c <= 0xfeU) || c == 0x8eU ? 2 : (c == 0x8fU ? 3 : 1) - ); - nstr->checks[i] = f; - } else { - nstr->checks[i] = 0; - } - f--; - } - break; - case GRN_ENC_SJIS: - for (i = 0; i < str_len; i++) { - if (!f) { - c = (unsigned char) str[i]; - f = (c >= 0x81U && ((c <= 0x9fU) || (c >= 0xe0U && c <= 0xfcU)) ? 2 : 1); - nstr->checks[i] = f; - } else { - nstr->checks[i] = 0; - } - f--; - } - break; - case GRN_ENC_UTF8: - for (i = 0; i < str_len; i++) { - if (!f) { - c = (unsigned char) str[i]; - f = (c & 0x80U ? (c & 0x20U ? (c & 0x10U ? 
4 : 3) - : 2) - : 1); - nstr->checks[i] = f; - } else { - nstr->checks[i] = 0; - } - f--; - } - break; - default: - for (i = 0; i < str_len; i++) { - nstr->checks[i] = 1; - } - break; - } - } - return nstr; -} - -grn_obj * -grn_string_open_(grn_ctx *ctx, const char *str, unsigned int str_len, - grn_obj *normalizer, int flags, grn_encoding encoding) +static grn_obj * +auto_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) { - grn_string *string; - grn_obj *obj; - grn_obj *args[1]; - - if (!str || !str_len) { - return NULL; - } - - string = GRN_MALLOCN(grn_string, 1); - if (!string) { - GRN_LOG(ctx, GRN_LOG_ALERT, - "[string][open] failed to allocate memory"); - return NULL; - } - - obj = (grn_obj *)string; - GRN_OBJ_INIT(obj, GRN_STRING, GRN_OBJ_ALLOCATED, GRN_ID_NIL); - string->original = str; - string->original_length_in_bytes = str_len; - string->normalized = NULL; - string->normalized_length_in_bytes = 0; - string->n_characters = 0; - string->checks = NULL; - string->ctypes = NULL; - string->encoding = encoding; - string->flags = flags; - - if (!normalizer) { - return (grn_obj *)grn_fake_string_open(ctx, string); - } - - args[0] = obj; - switch (encoding) { + grn_string *string = (grn_string *)(args[0]); + switch (string->encoding) { case GRN_ENC_EUC_JP : - eucjp_normalize(ctx, 1, args, NULL); + eucjp_normalize(ctx, string); break; case GRN_ENC_UTF8 : #ifdef WITH_NFKC - utf8_normalize(ctx, 1, args, NULL); + utf8_normalize(ctx, string); #else /* WITH_NFKC */ - ascii_normalize(ctx, 1, args, NULL); + ascii_normalize(ctx, string); #endif /* WITH_NFKC */ break; case GRN_ENC_SJIS : - sjis_normalize(ctx, 1, args, NULL); + sjis_normalize(ctx, string); break; case GRN_ENC_LATIN1 : - latin1_normalize(ctx, 1, args, NULL); + latin1_normalize(ctx, string); break; case GRN_ENC_KOI8R : - koi8r_normalize(ctx, 1, args, NULL); + koi8r_normalize(ctx, string); break; default : - ascii_normalize(ctx, 1, args, NULL); + ascii_normalize(ctx, string); break; } - if 
(ctx->rc) { - grn_obj_close(ctx, obj); - obj = NULL; - } - - return obj; -} - -grn_obj * -grn_string_open(grn_ctx *ctx, const char *str, unsigned int str_len, - grn_obj *normalizer, int flags) -{ - return grn_string_open_(ctx, str, str_len, normalizer, flags, ctx->encoding); -} - -grn_rc -grn_string_get_original(grn_ctx *ctx, grn_obj *string, - const char **original, - unsigned int *length_in_bytes) -{ - grn_rc rc; - grn_string *string_ = (grn_string *)string; - GRN_API_ENTER; - if (string_) { - if (original) { *original = string_->original; } - if (length_in_bytes) { - *length_in_bytes = string_->original_length_in_bytes; - } - rc = GRN_SUCCESS; - } else { - rc = GRN_INVALID_ARGUMENT; - } - GRN_API_RETURN(rc); -} - -int -grn_string_get_flags(grn_ctx *ctx, grn_obj *string) -{ - int flags = 0; - grn_string *string_ = (grn_string *)string; - GRN_API_ENTER; - if (string_) { - flags = string_->flags; - } - GRN_API_RETURN(flags); + return NULL; } -grn_rc -grn_string_get_normalized(grn_ctx *ctx, grn_obj *string, - const char **normalized, - unsigned int *length_in_bytes, - unsigned int *n_characters) +#ifdef WITH_NFKC +static grn_obj * +nfkc51_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) { - grn_rc rc; - grn_string *string_ = (grn_string *)string; - GRN_API_ENTER; - if (string_) { - if (normalized) { *normalized = string_->normalized; } - if (length_in_bytes) { - *length_in_bytes = string_->normalized_length_in_bytes; - } - if (n_characters) { *n_characters = string_->n_characters; } - rc = GRN_SUCCESS; - } else { - rc = GRN_INVALID_ARGUMENT; - } - GRN_API_RETURN(rc); + grn_string *string = (grn_string *)(args[0]); + utf8_normalize(ctx, string); + return NULL; } +#endif /* WITH_NFKC */ grn_rc -grn_string_set_normalized(grn_ctx *ctx, grn_obj *string, - char *normalized, unsigned int length_in_bytes, - unsigned int n_characters) +grn_normalizer_normalize(grn_ctx *ctx, grn_obj *normalizer, grn_obj *string) { grn_rc rc; - grn_string *string_ = 
(grn_string *)string; - GRN_API_ENTER; - if (string_) { - if (string_->normalized) { GRN_FREE(string_->normalized); } - string_->normalized = normalized; - string_->normalized_length_in_bytes = length_in_bytes; - string_->n_characters = n_characters; - rc = GRN_SUCCESS; - } else { - rc = GRN_INVALID_ARGUMENT; - } - GRN_API_RETURN(rc); -} - -const short * -grn_string_get_checks(grn_ctx *ctx, grn_obj *string) -{ - int16_t *checks = NULL; - grn_string *string_ = (grn_string *)string; - GRN_API_ENTER; - if (string_) { - checks = string_->checks; - } else { - checks = NULL; - } - GRN_API_RETURN(checks); -} + int nargs = 0; -grn_rc -grn_string_set_checks(grn_ctx *ctx, grn_obj *string, short *checks) -{ - grn_rc rc; - grn_string *string_ = (grn_string *)string; - GRN_API_ENTER; - if (string_) { - if (string_->checks) { GRN_FREE(string_->checks); } - string_->checks = checks; - rc = GRN_SUCCESS; - } else { - rc = GRN_INVALID_ARGUMENT; - } - GRN_API_RETURN(rc); -} + grn_ctx_push(ctx, string); + nargs++; + rc = grn_proc_call(ctx, normalizer, nargs, NULL); + grn_ctx_pop(ctx); -const unsigned char * -grn_string_get_types(grn_ctx *ctx, grn_obj *string) -{ - unsigned char *types = NULL; - grn_string *string_ = (grn_string *)string; - GRN_API_ENTER; - if (string_) { - types = string_->ctypes; - } else { - types = NULL; - } - GRN_API_RETURN(types); + return rc; } grn_rc -grn_string_set_types(grn_ctx *ctx, grn_obj *string, unsigned char *types) +grn_db_init_builtin_normalizers(grn_ctx *ctx) { grn_rc rc; - grn_string *string_ = (grn_string *)string; - GRN_API_ENTER; - if (string_) { - if (string_->ctypes) { GRN_FREE(string_->ctypes); } - string_->ctypes = types; - rc = GRN_SUCCESS; - } else { - rc = GRN_INVALID_ARGUMENT; - } - GRN_API_RETURN(rc); -} - -grn_encoding -grn_string_get_encoding(grn_ctx *ctx, grn_obj *string) -{ - grn_encoding encoding = GRN_ENC_NONE; - grn_string *string_ = (grn_string *)string; - GRN_API_ENTER; - if (string_) { - encoding = string_->encoding; - } - 
GRN_API_RETURN(encoding); -} - -grn_rc -grn_string_inspect(grn_ctx *ctx, grn_obj *buffer, grn_obj *string) -{ - grn_string *string_ = (grn_string *)string; - - GRN_TEXT_PUTS(ctx, buffer, "#<string:"); - - GRN_TEXT_PUTS(ctx, buffer, " original:<"); - GRN_TEXT_PUT(ctx, buffer, - string_->original, - string_->original_length_in_bytes); - GRN_TEXT_PUTS(ctx, buffer, ">"); - GRN_TEXT_PUTS(ctx, buffer, "("); - grn_text_itoa(ctx, buffer, string_->original_length_in_bytes); - GRN_TEXT_PUTS(ctx, buffer, ")"); - - GRN_TEXT_PUTS(ctx, buffer, " normalized:<"); - GRN_TEXT_PUT(ctx, buffer, - string_->normalized, - string_->normalized_length_in_bytes); - GRN_TEXT_PUTS(ctx, buffer, ">"); - GRN_TEXT_PUTS(ctx, buffer, "("); - grn_text_itoa(ctx, buffer, string_->normalized_length_in_bytes); - GRN_TEXT_PUTS(ctx, buffer, ")"); - - GRN_TEXT_PUTS(ctx, buffer, " n_characters:"); - grn_text_itoa(ctx, buffer, string_->n_characters); - - GRN_TEXT_PUTS(ctx, buffer, " encoding:"); - grn_inspect_encoding(ctx, buffer, string_->encoding); + const char *normalizer_auto_name = "NormalizerAuto"; + const char *normalizer_nfkc51_name = "NormalizerNFKC51"; - GRN_TEXT_PUTS(ctx, buffer, " flags:"); - if (string_->flags & GRN_STRING_REMOVE_BLANK) { - GRN_TEXT_PUTS(ctx, buffer, "REMOVE_BLANK|"); - } - if (string_->flags & GRN_STRING_WITH_TYPES) { - GRN_TEXT_PUTS(ctx, buffer, "WITH_TYPES|"); - } - if (string_->flags & GRN_STRING_WITH_CHECKS) { - GRN_TEXT_PUTS(ctx, buffer, "WITH_CHECKS|"); - } - if (string_->flags & GRN_STRING_REMOVE_TOKENIZED_DELIMITER) { - GRN_TEXT_PUTS(ctx, buffer, "REMOVE_TOKENIZED_DELIMITER|"); - } - if (GRN_TEXT_VALUE(buffer)[GRN_TEXT_LEN(buffer) - 1] == '|') { - grn_bulk_truncate(ctx, buffer, GRN_TEXT_LEN(buffer) - 1); + rc = grn_normalizer_register(ctx, normalizer_auto_name, -1, + NULL, auto_next, NULL); + if (rc == GRN_SUCCESS) { + grn_obj *obj; + obj = grn_ctx_get(ctx, normalizer_auto_name, -1); + if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_NORMALIZER_AUTO) { + return 
GRN_FILE_CORRUPT; + } } - GRN_TEXT_PUTS(ctx, buffer, ">"); +#ifdef WITH_NFKC + grn_normalizer_register(ctx, normalizer_nfkc51_name, -1, + NULL, nfkc51_next, NULL); +#else /* WITH_NFKC */ + grn_normalizer_register(ctx, normalizer_nfkc51_name, -1, + NULL, NULL, NULL); +#endif /* WITH_NFKC */ +/* + grn_normalizer_register(ctx, "NormalizerUCA", -1, + NULL, uca_next, NULL); +*/ return GRN_SUCCESS; } - -grn_rc -grn_string_close(grn_ctx *ctx, grn_obj *string) -{ - grn_rc rc; - grn_string *string_ = (grn_string *)string; - if (string_) { - if (string_->normalized) { GRN_FREE(string_->normalized); } - if (string_->ctypes) { GRN_FREE(string_->ctypes); } - if (string_->checks) { GRN_FREE(string_->checks); } - GRN_FREE(string); - rc = GRN_SUCCESS; - } else { - rc = GRN_INVALID_ARGUMENT; - } - return rc; -} Added: lib/normalizer_in.h (+50 -0) 100644 =================================================================== --- /dev/null +++ lib/normalizer_in.h 2012-12-14 12:40:02 +0900 (e3411f3) @@ -0,0 +1,50 @@ +/* -*- c-basic-offset: 2 -*- */ +/* + Copyright(C) 2012 Brazil + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ +#ifndef GRN_NORMALIZER_IN_H +#define GRN_NORMALIZER_IN_H + +#ifndef GROONGA_IN_H +#include "groonga_in.h" +#endif /* GROONGA_IN_H */ + +#ifndef GRN_CTX_H +#include "ctx.h" +#endif /* GRN_CTX_H */ + +#ifndef GRN_DB_H +#include "db.h" +#endif /* GRN_DB_H */ + +#ifdef __cplusplus +extern "C" { +#endif + +grn_rc grn_normalizer_init(void); +grn_rc grn_normalizer_fin(void); + +grn_rc grn_normalizer_normalize(grn_ctx *ctx, + grn_obj *normalizer, + grn_obj *string); + +grn_rc grn_db_init_builtin_normalizers(grn_ctx *ctx); + +#ifdef __cplusplus +} +#endif + +#endif /* GRN_NORMALIZER_IN_H */ Modified: lib/pat.c (+19 -4) =================================================================== --- lib/pat.c 2012-12-14 12:20:38 +0900 (e3f4710) +++ lib/pat.c 2012-12-14 12:40:02 +0900 (8ab4e22) @@ -423,13 +423,21 @@ _grn_pat_create(grn_ctx *ctx, grn_pat *pat, header->curr_del3 = 0; header->n_garbages = 0; header->tokenizer = GRN_ID_NIL; + if (header->flags & GRN_OBJ_KEY_NORMALIZE) { + header->flags &= ~GRN_OBJ_KEY_NORMALIZE; + header->normalizer = GRN_DB_NORMALIZER_AUTO; + pat->normalizer = grn_ctx_at(ctx, header->normalizer); + } else { + header->normalizer = GRN_ID_NIL; + pat->normalizer = NULL; + } pat->io = io; pat->header = header; pat->key_size = key_size; pat->value_size = value_size; pat->tokenizer = NULL; pat->encoding = encoding; - pat->obj.header.flags = flags; + pat->obj.header.flags = header->flags; if (!(node0 = pat_get(ctx, pat, 0))) { grn_io_close(ctx, io); return NULL; @@ -518,6 +526,11 @@ grn_pat_open(grn_ctx *ctx, const char *path) pat->encoding = header->encoding; pat->obj.header.flags = header->flags; pat->tokenizer = grn_ctx_at(ctx, header->tokenizer); + if (header->flags & GRN_OBJ_KEY_NORMALIZE) { + header->flags &= ~GRN_OBJ_KEY_NORMALIZE; + 
header->normalizer = GRN_DB_NORMALIZER_AUTO; + } + pat->normalizer = grn_ctx_at(ctx, header->normalizer); PAT_AT(pat, 0, node0); if (!node0) { grn_io_close(ctx, io); @@ -1528,9 +1541,9 @@ grn_pat_scan(grn_ctx *ctx, grn_pat *pat, const char *str, unsigned int str_len, { int n = 0; grn_id tid; - if (pat->obj.header.flags & GRN_OBJ_KEY_NORMALIZE) { + if (pat->normalizer) { grn_obj *nstr = grn_string_open(ctx, str, str_len, - GRN_NORMALIZER_AUTO, GRN_STRING_WITH_CHECKS); + pat->normalizer, GRN_STRING_WITH_CHECKS); if (nstr) { const short *cp = grn_string_get_checks(ctx, nstr); unsigned int offset = 0, offset0 = 0; @@ -2281,7 +2294,7 @@ grn_pat_check(grn_ctx *ctx, grn_pat *pat) char buf[8]; struct grn_pat_header *h = pat->header; GRN_OUTPUT_ARRAY_OPEN("RESULT", 1); - GRN_OUTPUT_MAP_OPEN("SUMMARY", 22); + GRN_OUTPUT_MAP_OPEN("SUMMARY", 23); GRN_OUTPUT_CSTR("flags"); grn_itoh(h->flags, buf, 8); GRN_OUTPUT_STR(buf, 8); @@ -2291,6 +2304,8 @@ grn_pat_check(grn_ctx *ctx, grn_pat *pat) GRN_OUTPUT_INT64(h->value_size); GRN_OUTPUT_CSTR("tokenizer"); GRN_OUTPUT_INT64(h->tokenizer); + GRN_OUTPUT_CSTR("normalizer"); + GRN_OUTPUT_INT64(h->normalizer); GRN_OUTPUT_CSTR("n_entries"); GRN_OUTPUT_INT64(h->n_entries); GRN_OUTPUT_CSTR("curr_rec"); Modified: lib/pat.h (+3 -1) =================================================================== --- lib/pat.h 2012-12-14 12:20:38 +0900 (3430f9d) +++ lib/pat.h 2012-12-14 12:40:02 +0900 (09e1fa8) @@ -38,6 +38,7 @@ struct _grn_pat { uint32_t key_size; uint32_t value_size; grn_obj *tokenizer; + grn_obj *normalizer; grn_id *cache; uint32_t cache_size; }; @@ -64,7 +65,8 @@ struct grn_pat_header { int32_t curr_del2; int32_t curr_del3; uint32_t n_garbages; - uint32_t reserved[1005]; + grn_id normalizer; + uint32_t reserved[1004]; grn_pat_delinfo delinfos[GRN_PAT_NDELINFOS]; grn_id garbages[GRN_PAT_MAX_KEY_SIZE + 1]; }; Modified: lib/sources.am (+2 -0) =================================================================== --- lib/sources.am 2012-12-14 
12:20:38 +0900 (61e888f) +++ lib/sources.am 2012-12-14 12:40:02 +0900 (f1f704e) @@ -20,6 +20,8 @@ libgroonga_la_SOURCES = \ io.h \ nfkc.c \ nfkc.h \ + normalizer.c \ + normalizer.h \ output.c \ output.h \ pat.c \ Modified: lib/string.c (+13 -1082) =================================================================== --- lib/string.c 2012-12-14 12:20:38 +0900 (5253a35) +++ lib/string.c 2012-12-14 12:40:02 +0900 (2119a70) @@ -19,1067 +19,12 @@ #include "groonga_in.h" #include <string.h> #include "string_in.h" +#include "normalizer_in.h" #include "str.h" +#include "util.h" #include <groonga/tokenizer.h> -static unsigned char symbol[] = { - ',', '.', 0, ':', ';', '?', '!', 0, 0, 0, '`', 0, '^', '~', '_', 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, '-', '-', '/', '\\', 0, 0, '|', 0, 0, 0, '\'', 0, - '"', '(', ')', 0, 0, '[', ']', '{', '}', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - '+', '-', 0, 0, 0, '=', 0, '<', '>', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - '$', 0, 0, '%', '#', '&', '*', '@', 0, 0, 0, 0, 0, 0, 0, 0 -}; - -inline static grn_obj * -eucjp_normalize(grn_ctx *ctx, int nargs, grn_obj **args, - grn_user_data *user_data) -{ - static uint16_t hankana[] = { - 0xa1a1, 0xa1a3, 0xa1d6, 0xa1d7, 0xa1a2, 0xa1a6, 0xa5f2, 0xa5a1, 0xa5a3, - 0xa5a5, 0xa5a7, 0xa5a9, 0xa5e3, 0xa5e5, 0xa5e7, 0xa5c3, 0xa1bc, 0xa5a2, - 0xa5a4, 0xa5a6, 0xa5a8, 0xa5aa, 0xa5ab, 0xa5ad, 0xa5af, 0xa5b1, 0xa5b3, - 0xa5b5, 0xa5b7, 0xa5b9, 0xa5bb, 0xa5bd, 0xa5bf, 0xa5c1, 0xa5c4, 0xa5c6, - 0xa5c8, 0xa5ca, 0xa5cb, 0xa5cc, 0xa5cd, 0xa5ce, 0xa5cf, 0xa5d2, 0xa5d5, - 0xa5d8, 0xa5db, 0xa5de, 0xa5df, 0xa5e0, 0xa5e1, 0xa5e2, 0xa5e4, 0xa5e6, - 0xa5e8, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5ef, 0xa5f3, 0xa1ab, - 0xa1eb - }; - static unsigned char dakuten[] = { - 0xf4, 0, 0, 0, 0, 0xac, 0, 0xae, 0, 0xb0, 0, 0xb2, 0, 0xb4, 0, 0xb6, 0, - 0xb8, 0, 0xba, 0, 0xbc, 0, 0xbe, 0, 0xc0, 0, 0xc2, 0, 0, 0xc5, 0, 0xc7, - 0, 0xc9, 0, 0, 0, 0, 0, 0, 0xd0, 0, 0, 0xd3, 0, 0, 0xd6, 0, 0, 0xd9, 0, - 0, 0xdc - }; - static unsigned char handaku[] = { - 0xd1, 
0, 0, 0xd4, 0, 0, 0xd7, 0, 0, 0xda, 0, 0, 0xdd - }; - grn_string *nstr = (grn_string *)args[0]; - int16_t *ch; - const unsigned char *s, *s_, *e; - unsigned char *d, *d0, *d_, b; - uint_least8_t *cp, *ctypes, ctype; - size_t size = nstr->original_length_in_bytes, length = 0; - int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK; - if (!(nstr->normalized = GRN_MALLOC(size * 2 + 1))) { - ERR(GRN_NO_MEMORY_AVAILABLE, - "[strinig][eucjp] failed to allocate normalized text space"); - return NULL; - } - d0 = (unsigned char *) nstr->normalized; - if (nstr->flags & GRN_STRING_WITH_CHECKS) { - if (!(nstr->checks = GRN_MALLOC(size * 2 * sizeof(int16_t) + 1))) { - GRN_FREE(nstr->normalized); - nstr->normalized = NULL; - ERR(GRN_NO_MEMORY_AVAILABLE, - "[strinig][eucjp] failed to allocate checks space"); - return NULL; - } - } - ch = nstr->checks; - if (nstr->flags & GRN_STRING_WITH_TYPES) { - if (!(nstr->ctypes = GRN_MALLOC(size + 1))) { - GRN_FREE(nstr->checks); - GRN_FREE(nstr->normalized); - nstr->checks = NULL; - nstr->normalized = NULL; - ERR(GRN_NO_MEMORY_AVAILABLE, - "[strinig][eucjp] failed to allocate character types space"); - return NULL; - } - } - cp = ctypes = nstr->ctypes; - e = (unsigned char *)nstr->original + size; - for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) { - if ((*s & 0x80)) { - if (((s + 1) < e) && (*(s + 1) & 0x80)) { - unsigned char c1 = *s++, c2 = *s, c3 = 0; - switch (c1 >> 4) { - case 0x08 : - if (c1 == 0x8e && 0xa0 <= c2 && c2 <= 0xdf) { - uint16_t c = hankana[c2 - 0xa0]; - switch (c) { - case 0xa1ab : - if (d > d0 + 1 && d[-2] == 0xa5 - && 0xa6 <= d[-1] && d[-1] <= 0xdb && (b = dakuten[d[-1] - 0xa6])) { - *(d - 1) = b; - if (ch) { ch[-1] += 2; s_ += 2; } - continue; - } else { - *d++ = c >> 8; *d = c & 0xff; - } - break; - case 0xa1eb : - if (d > d0 + 1 && d[-2] == 0xa5 - && 0xcf <= d[-1] && d[-1] <= 0xdb && (b = handaku[d[-1] - 0xcf])) { - *(d - 1) = b; - if (ch) { ch[-1] += 2; s_ += 2; } - continue; - } else { 
- *d++ = c >> 8; *d = c & 0xff; - } - break; - default : - *d++ = c >> 8; *d = c & 0xff; - break; - } - ctype = grn_char_katakana; - } else { - *d++ = c1; *d = c2; - ctype = grn_char_others; - } - break; - case 0x09 : - *d++ = c1; *d = c2; - ctype = grn_char_others; - break; - case 0x0a : - switch (c1 & 0x0f) { - case 1 : - switch (c2) { - case 0xbc : - *d++ = c1; *d = c2; - ctype = grn_char_katakana; - break; - case 0xb9 : - *d++ = c1; *d = c2; - ctype = grn_char_kanji; - break; - case 0xa1 : - if (removeblankp) { - if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; } - continue; - } else { - *d = ' '; - ctype = GRN_CHAR_BLANK|grn_char_symbol; - } - break; - default : - if (c2 >= 0xa4 && (c3 = symbol[c2 - 0xa4])) { - *d = c3; - ctype = grn_char_symbol; - } else { - *d++ = c1; *d = c2; - ctype = grn_char_others; - } - break; - } - break; - case 2 : - *d++ = c1; *d = c2; - ctype = grn_char_symbol; - break; - case 3 : - c3 = c2 - 0x80; - if ('a' <= c3 && c3 <= 'z') { - ctype = grn_char_alpha; - *d = c3; - } else if ('A' <= c3 && c3 <= 'Z') { - ctype = grn_char_alpha; - *d = c3 + 0x20; - } else if ('0' <= c3 && c3 <= '9') { - ctype = grn_char_digit; - *d = c3; - } else { - ctype = grn_char_others; - *d++ = c1; *d = c2; - } - break; - case 4 : - *d++ = c1; *d = c2; - ctype = grn_char_hiragana; - break; - case 5 : - *d++ = c1; *d = c2; - ctype = grn_char_katakana; - break; - case 6 : - case 7 : - case 8 : - *d++ = c1; *d = c2; - ctype = grn_char_symbol; - break; - default : - *d++ = c1; *d = c2; - ctype = grn_char_others; - break; - } - break; - default : - *d++ = c1; *d = c2; - ctype = grn_char_kanji; - break; - } - } else { - /* skip invalid character */ - continue; - } - } else { - unsigned char c = *s; - switch (c >> 4) { - case 0 : - case 1 : - /* skip unprintable ascii */ - if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; } - continue; - case 2 : - if (c == 0x20) { - if (removeblankp) { - if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; } - continue; - } else { - *d = 
' '; - ctype = GRN_CHAR_BLANK|grn_char_symbol; - } - } else { - *d = c; - ctype = grn_char_symbol; - } - break; - case 3 : - *d = c; - ctype = (c <= 0x39) ? grn_char_digit : grn_char_symbol; - break; - case 4 : - *d = ('A' <= c) ? c + 0x20 : c; - ctype = (c == 0x40) ? grn_char_symbol : grn_char_alpha; - break; - case 5 : - *d = (c <= 'Z') ? c + 0x20 : c; - ctype = (c <= 0x5a) ? grn_char_alpha : grn_char_symbol; - break; - case 6 : - *d = c; - ctype = (c == 0x60) ? grn_char_symbol : grn_char_alpha; - break; - case 7 : - *d = c; - ctype = (c <= 0x7a) ? grn_char_alpha : (c == 0x7f ? grn_char_others : grn_char_symbol); - break; - default : - *d = c; - ctype = grn_char_others; - break; - } - } - d++; - length++; - if (cp) { *cp++ = ctype; } - if (ch) { - *ch++ = (int16_t)(s + 1 - s_); - s_ = s + 1; - while (++d_ < d) { *ch++ = 0; } - } - } - if (cp) { *cp = grn_char_null; } - *d = '\0'; - nstr->n_characters = length; - nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized); - return NULL; -} - -inline static grn_obj * -sjis_normalize(grn_ctx *ctx, int nargs, grn_obj **args, - grn_user_data *user_data) -{ - static uint16_t hankana[] = { - 0x8140, 0x8142, 0x8175, 0x8176, 0x8141, 0x8145, 0x8392, 0x8340, 0x8342, - 0x8344, 0x8346, 0x8348, 0x8383, 0x8385, 0x8387, 0x8362, 0x815b, 0x8341, - 0x8343, 0x8345, 0x8347, 0x8349, 0x834a, 0x834c, 0x834e, 0x8350, 0x8352, - 0x8354, 0x8356, 0x8358, 0x835a, 0x835c, 0x835e, 0x8360, 0x8363, 0x8365, - 0x8367, 0x8369, 0x836a, 0x836b, 0x836c, 0x836d, 0x836e, 0x8371, 0x8374, - 0x8377, 0x837a, 0x837d, 0x837e, 0x8380, 0x8381, 0x8382, 0x8384, 0x8386, - 0x8388, 0x8389, 0x838a, 0x838b, 0x838c, 0x838d, 0x838f, 0x8393, 0x814a, - 0x814b - }; - static unsigned char dakuten[] = { - 0x94, 0, 0, 0, 0, 0x4b, 0, 0x4d, 0, 0x4f, 0, 0x51, 0, 0x53, 0, 0x55, 0, - 0x57, 0, 0x59, 0, 0x5b, 0, 0x5d, 0, 0x5f, 0, 0x61, 0, 0, 0x64, 0, 0x66, - 0, 0x68, 0, 0, 0, 0, 0, 0, 0x6f, 0, 0, 0x72, 0, 0, 0x75, 0, 0, 0x78, 0, - 0, 0x7b - }; - static 
unsigned char handaku[] = { - 0x70, 0, 0, 0x73, 0, 0, 0x76, 0, 0, 0x79, 0, 0, 0x7c - }; - grn_string *nstr = (grn_string *)args[0]; - int16_t *ch; - const unsigned char *s, *s_; - unsigned char *d, *d0, *d_, b, *e; - uint_least8_t *cp, *ctypes, ctype; - size_t size = nstr->original_length_in_bytes, length = 0; - int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK; - if (!(nstr->normalized = GRN_MALLOC(size * 2 + 1))) { - ERR(GRN_NO_MEMORY_AVAILABLE, - "[strinig][sjis] failed to allocate normalized text space"); - return NULL; - } - d0 = (unsigned char *) nstr->normalized; - if (nstr->flags & GRN_STRING_WITH_CHECKS) { - if (!(nstr->checks = GRN_MALLOC(size * 2 * sizeof(int16_t) + 1))) { - GRN_FREE(nstr->normalized); - nstr->normalized = NULL; - ERR(GRN_NO_MEMORY_AVAILABLE, - "[strinig][sjis] failed to allocate checks space"); - return NULL; - } - } - ch = nstr->checks; - if (nstr->flags & GRN_STRING_WITH_TYPES) { - if (!(nstr->ctypes = GRN_MALLOC(size + 1))) { - GRN_FREE(nstr->checks); - GRN_FREE(nstr->normalized); - nstr->checks = NULL; - nstr->normalized = NULL; - ERR(GRN_NO_MEMORY_AVAILABLE, - "[strinig][sjis] failed to allocate character types space"); - return NULL; - } - } - cp = ctypes = nstr->ctypes; - e = (unsigned char *)nstr->original + size; - for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) { - if ((*s & 0x80)) { - if (0xa0 <= *s && *s <= 0xdf) { - uint16_t c = hankana[*s - 0xa0]; - switch (c) { - case 0x814a : - if (d > d0 + 1 && d[-2] == 0x83 - && 0x45 <= d[-1] && d[-1] <= 0x7a && (b = dakuten[d[-1] - 0x45])) { - *(d - 1) = b; - if (ch) { ch[-1]++; s_++; } - continue; - } else { - *d++ = c >> 8; *d = c & 0xff; - } - break; - case 0x814b : - if (d > d0 + 1 && d[-2] == 0x83 - && 0x6e <= d[-1] && d[-1] <= 0x7a && (b = handaku[d[-1] - 0x6e])) { - *(d - 1) = b; - if (ch) { ch[-1]++; s_++; } - continue; - } else { - *d++ = c >> 8; *d = c & 0xff; - } - break; - default : - *d++ = c >> 8; *d = c & 0xff; - break; - } - ctype = 
grn_char_katakana; - } else { - if ((s + 1) < e && 0x40 <= *(s + 1) && *(s + 1) <= 0xfc) { - unsigned char c1 = *s++, c2 = *s, c3 = 0; - if (0x81 <= c1 && c1 <= 0x87) { - switch (c1 & 0x0f) { - case 1 : - switch (c2) { - case 0x5b : - *d++ = c1; *d = c2; - ctype = grn_char_katakana; - break; - case 0x58 : - *d++ = c1; *d = c2; - ctype = grn_char_kanji; - break; - case 0x40 : - if (removeblankp) { - if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; } - continue; - } else { - *d = ' '; - ctype = GRN_CHAR_BLANK|grn_char_symbol; - } - break; - default : - if (0x43 <= c2 && c2 <= 0x7e && (c3 = symbol[c2 - 0x43])) { - *d = c3; - ctype = grn_char_symbol; - } else if (0x7f <= c2 && c2 <= 0x97 && (c3 = symbol[c2 - 0x44])) { - *d = c3; - ctype = grn_char_symbol; - } else { - *d++ = c1; *d = c2; - ctype = grn_char_others; - } - break; - } - break; - case 2 : - c3 = c2 - 0x1f; - if (0x4f <= c2 && c2 <= 0x58) { - ctype = grn_char_digit; - *d = c2 - 0x1f; - } else if (0x60 <= c2 && c2 <= 0x79) { - ctype = grn_char_alpha; - *d = c2 + 0x01; - } else if (0x81 <= c2 && c2 <= 0x9a) { - ctype = grn_char_alpha; - *d = c2 - 0x20; - } else if (0x9f <= c2 && c2 <= 0xf1) { - *d++ = c1; *d = c2; - ctype = grn_char_hiragana; - } else { - *d++ = c1; *d = c2; - ctype = grn_char_others; - } - break; - case 3 : - if (0x40 <= c2 && c2 <= 0x96) { - *d++ = c1; *d = c2; - ctype = grn_char_katakana; - } else { - *d++ = c1; *d = c2; - ctype = grn_char_symbol; - } - break; - case 4 : - case 7 : - *d++ = c1; *d = c2; - ctype = grn_char_symbol; - break; - default : - *d++ = c1; *d = c2; - ctype = grn_char_others; - break; - } - } else { - *d++ = c1; *d = c2; - ctype = grn_char_kanji; - } - } else { - /* skip invalid character */ - continue; - } - } - } else { - unsigned char c = *s; - switch (c >> 4) { - case 0 : - case 1 : - /* skip unprintable ascii */ - if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; } - continue; - case 2 : - if (c == 0x20) { - if (removeblankp) { - if (cp > ctypes) { *(cp - 1) |= 
GRN_CHAR_BLANK; } - continue; - } else { - *d = ' '; - ctype = GRN_CHAR_BLANK|grn_char_symbol; - } - } else { - *d = c; - ctype = grn_char_symbol; - } - break; - case 3 : - *d = c; - ctype = (c <= 0x39) ? grn_char_digit : grn_char_symbol; - break; - case 4 : - *d = ('A' <= c) ? c + 0x20 : c; - ctype = (c == 0x40) ? grn_char_symbol : grn_char_alpha; - break; - case 5 : - *d = (c <= 'Z') ? c + 0x20 : c; - ctype = (c <= 0x5a) ? grn_char_alpha : grn_char_symbol; - break; - case 6 : - *d = c; - ctype = (c == 0x60) ? grn_char_symbol : grn_char_alpha; - break; - case 7 : - *d = c; - ctype = (c <= 0x7a) ? grn_char_alpha : (c == 0x7f ? grn_char_others : grn_char_symbol); - break; - default : - *d = c; - ctype = grn_char_others; - break; - } - } - d++; - length++; - if (cp) { *cp++ = ctype; } - if (ch) { - *ch++ = (int16_t)(s + 1 - s_); - s_ = s + 1; - while (++d_ < d) { *ch++ = 0; } - } - } - if (cp) { *cp = grn_char_null; } - *d = '\0'; - nstr->n_characters = length; - nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized); - return NULL; -} - -#ifdef WITH_NFKC -uint_least8_t grn_nfkc_ctype(const unsigned char *str); -const char *grn_nfkc_map1(const unsigned char *str); -const char *grn_nfkc_map2(const unsigned char *prefix, const unsigned char *suffix); - -static inline int -grn_str_charlen_utf8(grn_ctx *ctx, const unsigned char *str, const unsigned char *end) -{ - /* MEMO: This function allows non-null-terminated string as str. */ - /* But requires the end of string. 
*/ - const unsigned char *p = str; - if (end <= p || !*p) { return 0; } - if (*p & 0x80) { - int b, w; - int size; - int i; - for (b = 0x40, w = 0; b && (*p & b); b >>= 1, w++); - if (!w) { - GRN_LOG(ctx, GRN_LOG_WARNING, - "invalid utf8 string: the first bit is 0x80: <%.*s>: <%.*s>", - (int)(end - p), p, - (int)(end - str), str); - return 0; - } - size = w + 1; - for (i = 1; i < size; i++) { - if (++p >= end) { - GRN_LOG(ctx, GRN_LOG_WARNING, - "invalid utf8 string: too short: " - "%d byte is required but %d byte is given: <%.*s>", - size, i, - (int)(end - str), str); - return 0; - } - if (!*p) { - GRN_LOG(ctx, GRN_LOG_WARNING, - "invalid utf8 string: NULL character is found: <%.*s>", - (int)(end - str), str); - return 0; - } - if ((*p & 0xc0) != 0x80) { - GRN_LOG(ctx, GRN_LOG_WARNING, - "invalid utf8 string: 0x80 is not allowed: <%.*s>: <%.*s>", - (int)(end - p), p, - (int)(end - str), str); - return 0; - } - } - return size; - } else { - return 1; - } - return 0; -} - -inline static grn_obj * -utf8_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) -{ - int16_t *ch; - const unsigned char *s, *s_, *s__ = NULL, *p, *p2, *pe, *e; - unsigned char *d, *d_, *de; - uint_least8_t *cp; - grn_string *nstr = (grn_string *)args[0]; - size_t length = 0, ls, lp, size = nstr->original_length_in_bytes, ds = size * 3; - int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK; - grn_bool remove_tokenized_delimiter_p = - nstr->flags & GRN_STRING_REMOVE_TOKENIZED_DELIMITER; - if (!(nstr->normalized = GRN_MALLOC(ds + 1))) { - ERR(GRN_NO_MEMORY_AVAILABLE, - "[strinig][utf8] failed to allocate normalized text space"); - return NULL; - } - if (nstr->flags & GRN_STRING_WITH_CHECKS) { - if (!(nstr->checks = GRN_MALLOC(ds * sizeof(int16_t) + 1))) { - GRN_FREE(nstr->normalized); - nstr->normalized = NULL; - ERR(GRN_NO_MEMORY_AVAILABLE, - "[strinig][utf8] failed to allocate checks space"); - return NULL; - } - } - ch = nstr->checks; - if (nstr->flags & 
GRN_STRING_WITH_TYPES) { - if (!(nstr->ctypes = GRN_MALLOC(ds + 1))) { - if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; } - GRN_FREE(nstr->normalized); nstr->normalized = NULL; - ERR(GRN_NO_MEMORY_AVAILABLE, - "[strinig][utf8] failed to allocate character types space"); - return NULL; - } - } - cp = nstr->ctypes; - d = (unsigned char *)nstr->normalized; - de = d + ds; - d_ = NULL; - e = (unsigned char *)nstr->original + size; - for (s = s_ = (unsigned char *)nstr->original; ; s += ls) { - if (!(ls = grn_str_charlen_utf8(ctx, s, e))) { - break; - } - if (remove_tokenized_delimiter_p && - grn_tokenizer_is_tokenized_delimiter(ctx, s, ls, GRN_ENC_UTF8)) { - continue; - } - if ((p = (unsigned char *)grn_nfkc_map1(s))) { - pe = p + strlen((char *)p); - } else { - p = s; - pe = p + ls; - } - if (d_ && (p2 = (unsigned char *)grn_nfkc_map2(d_, p))) { - p = p2; - pe = p + strlen((char *)p); - if (cp) { cp--; } - if (ch) { - ch -= (d - d_); - if (ch[0] >= 0) { - s_ = s__; - } - } - d = d_; - length--; - } - for (; ; p += lp) { - if (!(lp = grn_str_charlen_utf8(ctx, p, pe))) { - break; - } - if ((*p == ' ' && removeblankp) || *p < 0x20 /* skip unprintable ascii */ ) { - if (cp > nstr->ctypes) { *(cp - 1) |= GRN_STR_BLANK; } - } else { - if (de <= d + lp) { - unsigned char *normalized; - ds += (ds >> 1) + lp; - if (!(normalized = GRN_REALLOC(nstr->normalized, ds + 1))) { - if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; } - if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; } - GRN_FREE(nstr->normalized); nstr->normalized = NULL; - ERR(GRN_NO_MEMORY_AVAILABLE, - "[strinig][utf8] failed to expand normalized text space"); - return NULL; - } - de = normalized + ds; - d = normalized + (d - (unsigned char *)nstr->normalized); - nstr->normalized = normalized; - if (ch) { - int16_t *checks; - if (!(checks = GRN_REALLOC(nstr->checks, ds * sizeof(int16_t)+ 1))) { - if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; } - 
GRN_FREE(nstr->checks); nstr->checks = NULL; - GRN_FREE(nstr->normalized); nstr->normalized = NULL; - ERR(GRN_NO_MEMORY_AVAILABLE, - "[strinig][utf8] failed to expand checks space"); - return NULL; - } - ch = checks + (ch - nstr->checks); - nstr->checks = checks; - } - if (cp) { - uint_least8_t *ctypes; - if (!(ctypes = GRN_REALLOC(nstr->ctypes, ds + 1))) { - GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; - if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; } - GRN_FREE(nstr->normalized); nstr->normalized = NULL; - ERR(GRN_NO_MEMORY_AVAILABLE, - "[strinig][utf8] failed to expand character types space"); - return NULL; - } - cp = ctypes + (cp - nstr->ctypes); - nstr->ctypes = ctypes; - } - } - memcpy(d, p, lp); - d_ = d; - d += lp; - length++; - if (cp) { *cp++ = grn_nfkc_ctype(p); } - if (ch) { - size_t i; - if (s_ == s + ls) { - *ch++ = -1; - } else { - *ch++ = (int16_t)(s + ls - s_); - s__ = s_; - s_ = s + ls; - } - for (i = lp; i > 1; i--) { *ch++ = 0; } - } - } - } - } - if (cp) { *cp = grn_str_null; } - *d = '\0'; - nstr->n_characters = length; - nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized); - return NULL; -} -#endif /* WITH_NFKC */ - -inline static grn_obj * -ascii_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) -{ - grn_string *nstr = (grn_string *)args[0]; - int16_t *ch; - const unsigned char *s, *s_, *e; - unsigned char *d, *d0, *d_; - uint_least8_t *cp, *ctypes, ctype; - size_t size = nstr->original_length_in_bytes, length = 0; - int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK; - if (!(nstr->normalized = GRN_MALLOC(size + 1))) { - ERR(GRN_NO_MEMORY_AVAILABLE, - "[strinig][ascii] failed to allocate normalized text space"); - return NULL; - } - d0 = (unsigned char *) nstr->normalized; - if (nstr->flags & GRN_STRING_WITH_CHECKS) { - if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) { - GRN_FREE(nstr->normalized); - nstr->normalized = NULL; - 
ERR(GRN_NO_MEMORY_AVAILABLE, - "[strinig][ascii] failed to allocate checks space"); - return NULL; - } - } - ch = nstr->checks; - if (nstr->flags & GRN_STRING_WITH_TYPES) { - if (!(nstr->ctypes = GRN_MALLOC(size + 1))) { - GRN_FREE(nstr->checks); - GRN_FREE(nstr->normalized); - nstr->checks = NULL; - nstr->normalized = NULL; - ERR(GRN_NO_MEMORY_AVAILABLE, - "[strinig][ascii] failed to allocate character types space"); - return NULL; - } - } - cp = ctypes = nstr->ctypes; - e = (unsigned char *)nstr->original + size; - for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) { - unsigned char c = *s; - switch (c >> 4) { - case 0 : - case 1 : - /* skip unprintable ascii */ - if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; } - continue; - case 2 : - if (c == 0x20) { - if (removeblankp) { - if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; } - continue; - } else { - *d = ' '; - ctype = GRN_CHAR_BLANK|grn_char_symbol; - } - } else { - *d = c; - ctype = grn_char_symbol; - } - break; - case 3 : - *d = c; - ctype = (c <= 0x39) ? grn_char_digit : grn_char_symbol; - break; - case 4 : - *d = ('A' <= c) ? c + 0x20 : c; - ctype = (c == 0x40) ? grn_char_symbol : grn_char_alpha; - break; - case 5 : - *d = (c <= 'Z') ? c + 0x20 : c; - ctype = (c <= 0x5a) ? grn_char_alpha : grn_char_symbol; - break; - case 6 : - *d = c; - ctype = (c == 0x60) ? grn_char_symbol : grn_char_alpha; - break; - case 7 : - *d = c; - ctype = (c <= 0x7a) ? grn_char_alpha : (c == 0x7f ? 
grn_char_others : grn_char_symbol); - break; - default : - *d = c; - ctype = grn_char_others; - break; - } - d++; - length++; - if (cp) { *cp++ = ctype; } - if (ch) { - *ch++ = (int16_t)(s + 1 - s_); - s_ = s + 1; - while (++d_ < d) { *ch++ = 0; } - } - } - if (cp) { *cp = grn_char_null; } - *d = '\0'; - nstr->n_characters = length; - nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized); - return NULL; -} - -/* use cp1252 as latin1 */ -inline static grn_obj * -latin1_normalize(grn_ctx *ctx, int nargs, grn_obj **args, - grn_user_data *user_data) -{ - grn_string *nstr = (grn_string *)args[0]; - int16_t *ch; - const unsigned char *s, *s_, *e; - unsigned char *d, *d0, *d_; - uint_least8_t *cp, *ctypes, ctype; - size_t size = nstr->original_length_in_bytes, length = 0; - int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK; - if (!(nstr->normalized = GRN_MALLOC(size + 1))) { - ERR(GRN_NO_MEMORY_AVAILABLE, - "[strinig][latin1] failed to allocate normalized text space"); - return NULL; - } - d0 = (unsigned char *) nstr->normalized; - if (nstr->flags & GRN_STRING_WITH_CHECKS) { - if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) { - GRN_FREE(nstr->normalized); - nstr->normalized = NULL; - ERR(GRN_NO_MEMORY_AVAILABLE, - "[strinig][latin1] failed to allocate checks space"); - return NULL; - } - } - ch = nstr->checks; - if (nstr->flags & GRN_STRING_WITH_TYPES) { - if (!(nstr->ctypes = GRN_MALLOC(size + 1))) { - GRN_FREE(nstr->checks); - GRN_FREE(nstr->normalized); - nstr->checks = NULL; - nstr->normalized = NULL; - ERR(GRN_NO_MEMORY_AVAILABLE, - "[normalizer][latin1] failed to allocate character types space"); - return NULL; - } - } - cp = ctypes = nstr->ctypes; - e = (unsigned char *)nstr->original + size; - for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) { - unsigned char c = *s; - switch (c >> 4) { - case 0 : - case 1 : - /* skip unprintable ascii */ - if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; } - 
continue; - case 2 : - if (c == 0x20) { - if (removeblankp) { - if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; } - continue; - } else { - *d = ' '; - ctype = GRN_CHAR_BLANK|grn_char_symbol; - } - } else { - *d = c; - ctype = grn_char_symbol; - } - break; - case 3 : - *d = c; - ctype = (c <= 0x39) ? grn_char_digit : grn_char_symbol; - break; - case 4 : - *d = ('A' <= c) ? c + 0x20 : c; - ctype = (c == 0x40) ? grn_char_symbol : grn_char_alpha; - break; - case 5 : - *d = (c <= 'Z') ? c + 0x20 : c; - ctype = (c <= 0x5a) ? grn_char_alpha : grn_char_symbol; - break; - case 6 : - *d = c; - ctype = (c == 0x60) ? grn_char_symbol : grn_char_alpha; - break; - case 7 : - *d = c; - ctype = (c <= 0x7a) ? grn_char_alpha : (c == 0x7f ? grn_char_others : grn_char_symbol); - break; - case 8 : - if (c == 0x8a || c == 0x8c || c == 0x8e) { - *d = c + 0x10; - ctype = grn_char_alpha; - } else { - *d = c; - ctype = grn_char_symbol; - } - break; - case 9 : - if (c == 0x9a || c == 0x9c || c == 0x9e || c == 0x9f) { - *d = (c == 0x9f) ? c + 0x60 : c; - ctype = grn_char_alpha; - } else { - *d = c; - ctype = grn_char_symbol; - } - break; - case 0x0c : - *d = c + 0x20; - ctype = grn_char_alpha; - break; - case 0x0d : - *d = (c == 0xd7 || c == 0xdf) ? c : c + 0x20; - ctype = (c == 0xd7) ? grn_char_symbol : grn_char_alpha; - break; - case 0x0e : - *d = c; - ctype = grn_char_alpha; - break; - case 0x0f : - *d = c; - ctype = (c == 0xf7) ? 
grn_char_symbol : grn_char_alpha; - break; - default : - *d = c; - ctype = grn_char_others; - break; - } - d++; - length++; - if (cp) { *cp++ = ctype; } - if (ch) { - *ch++ = (int16_t)(s + 1 - s_); - s_ = s + 1; - while (++d_ < d) { *ch++ = 0; } - } - } - if (cp) { *cp = grn_char_null; } - *d = '\0'; - nstr->n_characters = length; - nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized); - return NULL; -} - -inline static grn_obj * -koi8r_normalize(grn_ctx *ctx, int nargs, grn_obj **args, - grn_user_data *user_data) -{ - grn_string *nstr = (grn_string *)args[0]; - int16_t *ch; - const unsigned char *s, *s_, *e; - unsigned char *d, *d0, *d_; - uint_least8_t *cp, *ctypes, ctype; - size_t size = nstr->original_length_in_bytes, length = 0; - int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK; - if (!(nstr->normalized = GRN_MALLOC(size + 1))) { - ERR(GRN_NO_MEMORY_AVAILABLE, - "[strinig][koi8r] failed to allocate normalized text space"); - return NULL; - } - d0 = (unsigned char *) nstr->normalized; - if (nstr->flags & GRN_STRING_WITH_CHECKS) { - if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) { - GRN_FREE(nstr->normalized); - nstr->normalized = NULL; - ERR(GRN_NO_MEMORY_AVAILABLE, - "[strinig][koi8r] failed to allocate checks space"); - return NULL; - } - } - ch = nstr->checks; - if (nstr->flags & GRN_STRING_WITH_TYPES) { - if (!(nstr->ctypes = GRN_MALLOC(size + 1))) { - GRN_FREE(nstr->checks); - GRN_FREE(nstr->normalized); - nstr->checks = NULL; - nstr->normalized = NULL; - ERR(GRN_NO_MEMORY_AVAILABLE, - "[strinig][koi8r] failed to allocate character types space"); - return NULL; - } - } - cp = ctypes = nstr->ctypes; - e = (unsigned char *)nstr->original + size; - for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) { - unsigned char c = *s; - switch (c >> 4) { - case 0 : - case 1 : - /* skip unprintable ascii */ - if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; } - continue; - case 2 : - if (c == 
0x20) { - if (removeblankp) { - if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; } - continue; - } else { - *d = ' '; - ctype = GRN_CHAR_BLANK|grn_char_symbol; - } - } else { - *d = c; - ctype = grn_char_symbol; - } - break; - case 3 : - *d = c; - ctype = (c <= 0x39) ? grn_char_digit : grn_char_symbol; - break; - case 4 : - *d = ('A' <= c) ? c + 0x20 : c; - ctype = (c == 0x40) ? grn_char_symbol : grn_char_alpha; - break; - case 5 : - *d = (c <= 'Z') ? c + 0x20 : c; - ctype = (c <= 0x5a) ? grn_char_alpha : grn_char_symbol; - break; - case 6 : - *d = c; - ctype = (c == 0x60) ? grn_char_symbol : grn_char_alpha; - break; - case 7 : - *d = c; - ctype = (c <= 0x7a) ? grn_char_alpha : (c == 0x7f ? grn_char_others : grn_char_symbol); - break; - case 0x0a : - *d = c; - ctype = (c == 0xa3) ? grn_char_alpha : grn_char_others; - break; - case 0x0b : - if (c == 0xb3) { - *d = c - 0x10; - ctype = grn_char_alpha; - } else { - *d = c; - ctype = grn_char_others; - } - break; - case 0x0c : - case 0x0d : - *d = c; - ctype = grn_char_alpha; - break; - case 0x0e : - case 0x0f : - *d = c - 0x20; - ctype = grn_char_alpha; - break; - default : - *d = c; - ctype = grn_char_others; - break; - } - d++; - length++; - if (cp) { *cp++ = ctype; } - if (ch) { - *ch++ = (int16_t)(s + 1 - s_); - s_ = s + 1; - while (++d_ < d) { *ch++ = 0; } - } - } - if (cp) { *cp = grn_char_null; } - *d = '\0'; - nstr->n_characters = length; - nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized); - return NULL; -} - static grn_string * grn_fake_string_open(grn_ctx *ctx, grn_string *string) { @@ -1189,7 +134,7 @@ grn_string_open_(grn_ctx *ctx, const char *str, unsigned int str_len, { grn_string *string; grn_obj *obj; - grn_obj *args[1]; + grn_bool is_normalizer_auto; if (!str || !str_len) { return NULL; @@ -1218,36 +163,22 @@ grn_string_open_(grn_ctx *ctx, const char *str, unsigned int str_len, return (grn_obj *)grn_fake_string_open(ctx, string); } - args[0] = obj; - switch (encoding) { 
- case GRN_ENC_EUC_JP : - eucjp_normalize(ctx, 1, args, NULL); - break; - case GRN_ENC_UTF8 : -#ifdef WITH_NFKC - utf8_normalize(ctx, 1, args, NULL); -#else /* WITH_NFKC */ - ascii_normalize(ctx, 1, args, NULL); -#endif /* WITH_NFKC */ - break; - case GRN_ENC_SJIS : - sjis_normalize(ctx, 1, args, NULL); - break; - case GRN_ENC_LATIN1 : - latin1_normalize(ctx, 1, args, NULL); - break; - case GRN_ENC_KOI8R : - koi8r_normalize(ctx, 1, args, NULL); - break; - default : - ascii_normalize(ctx, 1, args, NULL); - break; + is_normalizer_auto = (normalizer == GRN_NORMALIZER_AUTO); + if (is_normalizer_auto) { + normalizer = grn_ctx_at(ctx, GRN_DB_NORMALIZER_AUTO); } + + /* TODO: check rc */ + grn_normalizer_normalize(ctx, normalizer, (grn_obj *)string); if (ctx->rc) { grn_obj_close(ctx, obj); obj = NULL; } + if (is_normalizer_auto) { + grn_obj_unlink(ctx, normalizer); + } + return obj; } Modified: lib/token.c (+7 -14) =================================================================== --- lib/token.c 2012-12-14 12:20:38 +0900 (bdcbafe) +++ lib/token.c 2012-12-14 12:40:02 +0900 (3c730a3) @@ -111,7 +111,8 @@ delimited_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data, } user_data->ptr = tokenizer; - grn_table_get_info(ctx, table, &table_flags, &tokenizer->encoding, NULL); + grn_table_get_info(ctx, table, &table_flags, &tokenizer->encoding, NULL, + &normalizer); tokenizer->have_tokenized_delimiter = grn_tokenizer_have_tokenized_delimiter(ctx, @@ -120,10 +121,6 @@ delimited_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data, tokenizer->encoding); tokenizer->delimiter = delimiter; tokenizer->delimiter_len = delimiter_len; - - if (table_flags & GRN_OBJ_KEY_NORMALIZE) { - normalizer = GRN_NORMALIZER_AUTO; - } tokenizer->nstr = grn_string_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str), normalizer, nflags, tokenizer->encoding); @@ -260,10 +257,8 @@ ngram_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data, uint8_t ngram token->overlap = 0; token->pos = 0; 
token->skip = 0; - grn_table_get_info(ctx, table, &table_flags, &token->encoding, NULL); - if (table_flags & GRN_OBJ_KEY_NORMALIZE) { - normalizer = GRN_NORMALIZER_AUTO; - } + grn_table_get_info(ctx, table, &table_flags, &token->encoding, NULL, + &normalizer); if (!(token->nstr = grn_string_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str), normalizer, nflags, token->encoding))) { @@ -452,8 +447,10 @@ grn_token_open(grn_ctx *ctx, grn_obj *table, const char *str, size_t str_len, grn_token *token; grn_encoding encoding; grn_obj *tokenizer; + grn_obj *normalizer; grn_obj_flags table_flags; - if (grn_table_get_info(ctx, table, &table_flags, &encoding, &tokenizer)) { + if (grn_table_get_info(ctx, table, &table_flags, &encoding, &tokenizer, + &normalizer)) { return NULL; } if (!(token = GRN_MALLOC(sizeof(grn_token)))) { return NULL; } @@ -483,11 +480,7 @@ grn_token_open(grn_ctx *ctx, grn_obj *table, const char *str, size_t str_len, ((grn_proc *)tokenizer)->funcs[PROC_INIT](ctx, 1, &table, &token->pctx.user_data); grn_obj_close(ctx, &str_); } else { - grn_obj *normalizer = NULL; int nflags = 0; - if (table_flags & GRN_OBJ_KEY_NORMALIZE) { - normalizer = GRN_NORMALIZER_AUTO; - } token->nstr = grn_string_open_(ctx, str, str_len, normalizer, nflags, token->encoding); if (token->nstr) { Modified: lib/tokenizer.c (+4 -2) =================================================================== --- lib/tokenizer.c 2012-12-14 12:20:38 +0900 (6a377fb) +++ lib/tokenizer.c 2012-12-14 12:40:02 +0900 (c5a58bc) @@ -165,15 +165,17 @@ grn_tokenizer_query_open(grn_ctx *ctx, int num_args, grn_obj **args) grn_encoding table_encoding; unsigned int query_length = GRN_TEXT_LEN(query_str); char *query_buf = (char *)GRN_PLUGIN_MALLOC(ctx, query_length + 1); + grn_obj *normalizer = NULL; + if (query_buf == NULL) { GRN_PLUGIN_FREE(ctx, query); GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, "[tokenizer] failed to duplicate query"); return NULL; } - grn_table_get_info(ctx, table, &table_flags, 
&table_encoding, NULL); + grn_table_get_info(ctx, table, &table_flags, &table_encoding, NULL, + &normalizer); { - grn_obj *normalizer = NULL; int flags = 0; grn_obj *normalized_string; if (table_flags & GRN_OBJ_KEY_NORMALIZE) { Modified: lib/util.c (+3 -0) =================================================================== --- lib/util.c 2012-12-14 12:20:38 +0900 (46a5511) +++ lib/util.c 2012-12-14 12:40:02 +0900 (3664a94) @@ -252,6 +252,9 @@ grn_proc_inspect(grn_ctx *ctx, grn_obj *buf, grn_obj *obj) case GRN_PROC_HOOK : GRN_TEXT_PUTS(ctx, buf, "hook"); break; + case GRN_PROC_NORMALIZER : + GRN_TEXT_PUTS(ctx, buf, "normalizer"); + break; } GRN_TEXT_PUTS(ctx, buf, " "); Modified: test/unit/core/dat/test-dat.cpp (+3 -0) =================================================================== --- test/unit/core/dat/test-dat.cpp 2012-12-14 12:20:38 +0900 (904316f) +++ test/unit/core/dat/test-dat.cpp 2012-12-14 12:40:02 +0900 (681b70a) @@ -71,6 +71,7 @@ namespace test_dat { const char *base_dir; grn_ctx ctx; + grn_obj *database; void cut_setup(void) { @@ -81,12 +82,14 @@ namespace test_dat g_mkdir_with_parents(base_dir, 0755); grn_ctx_init(&ctx, 0); + database = grn_db_create(&ctx, NULL, NULL); enter_api(&ctx); } void cut_teardown(void) { leave_api(&ctx); + grn_obj_close(&ctx, database); grn_ctx_fin(&ctx); if (base_dir) { Modified: test/unit/util/test-snip.c (+4 -1) =================================================================== --- test/unit/util/test-snip.c 2012-12-14 12:20:38 +0900 (fdd97b5) +++ test/unit/util/test-snip.c 2012-12-14 12:40:02 +0900 (60f30a0) @@ -46,6 +46,7 @@ void test_add_cond_with_too_large_keyword(void); void test_add_cond_with_copy_tag_flag(void); static grn_ctx context; +static grn_obj *database; static grn_snip *snip; static gchar *keyword; static gchar *result; @@ -197,7 +198,8 @@ cut_shutdown(void) void cut_setup(void) { - grn_ctx_init(&context, GRN_CTX_USE_QL); + grn_ctx_init(&context, 0); + database = grn_db_create(&context, NULL, NULL); snip 
= NULL; keyword = NULL; @@ -234,6 +236,7 @@ cut_teardown(void) g_free(default_close_tag); } + grn_obj_close(&context, database); grn_ctx_fin(&context); } Modified: test/unit/util/test-string.c (+4 -1) =================================================================== --- test/unit/util/test-string.c 2012-12-14 12:20:38 +0900 (b97118f) +++ test/unit/util/test-string.c 2012-12-14 12:40:02 +0900 (b1b617b) @@ -55,6 +55,7 @@ void data_itoh(void); void test_itoh(gconstpointer data); static grn_ctx context; +static grn_obj *database; static grn_obj buffer; static const gchar text_ja_utf8[] = @@ -76,7 +77,8 @@ static const gchar normalized_text_ja_utf8[] = void setup (void) { - grn_ctx_init(&context, GRN_CTX_USE_QL); + grn_ctx_init(&context, 0); + database = grn_db_create(&context, NULL, NULL); GRN_VOID_INIT(&buffer); } @@ -84,6 +86,7 @@ void teardown (void) { GRN_OBJ_FIN(&context, &buffer); + grn_obj_close(&context, database); grn_ctx_fin(&context); }