null+****@clear*****
null+****@clear*****
2012年 2月 10日 (金) 19:16:20 JST
Kouhei Sutou 2012-02-10 19:16:20 +0900 (Fri, 10 Feb 2012)
New Revision: 27aac3047ea6f54bd610e0917bbca0953a23a665
Log:
[normalizer] changed to use grn_normalized_text instead of grn_str.
Modified files:
lib/db.c
lib/db.h
lib/ii.c
lib/token.c
lib/tokenizer.c
plugins/tokenizers/mecab.c
Modified: lib/db.c (+9 -4)
===================================================================
--- lib/db.c 2012-02-10 19:13:32 +0900 (d9eb95c)
+++ lib/db.c 2012-02-10 19:16:20 +0900 (e15e2b8)
@@ -1805,7 +1805,8 @@ exit :
grn_rc
grn_table_get_info(grn_ctx *ctx, grn_obj *table, grn_obj_flags *flags,
- grn_encoding *encoding, grn_obj **tokenizer)
+ grn_encoding *encoding, grn_obj **tokenizer,
+ grn_obj **normalizer)
{
grn_rc rc = GRN_INVALID_ARGUMENT;
GRN_API_ENTER;
@@ -1815,24 +1816,28 @@ grn_table_get_info(grn_ctx *ctx, grn_obj *table, grn_obj_flags *flags,
if (flags) { *flags = ((grn_pat *)table)->obj.header.flags; }
if (encoding) { *encoding = ((grn_pat *)table)->encoding; }
if (tokenizer) { *tokenizer = ((grn_pat *)table)->tokenizer; }
+ if (normalizer) { *normalizer = ((grn_pat *)table)->normalizer; }
rc = GRN_SUCCESS;
break;
case GRN_TABLE_DAT_KEY :
if (flags) { *flags = ((grn_dat *)table)->obj.header.flags; }
if (encoding) { *encoding = ((grn_dat *)table)->encoding; }
if (tokenizer) { *tokenizer = ((grn_dat *)table)->tokenizer; }
+ if (normalizer) { *normalizer = ((grn_dat *)table)->normalizer; }
rc = GRN_SUCCESS;
break;
case GRN_TABLE_HASH_KEY :
if (flags) { *flags = ((grn_hash *)table)->obj.header.flags; }
if (encoding) { *encoding = ((grn_hash *)table)->encoding; }
if (tokenizer) { *tokenizer = ((grn_hash *)table)->tokenizer; }
+ if (normalizer) { *normalizer = ((grn_hash *)table)->normalizer; }
rc = GRN_SUCCESS;
break;
case GRN_TABLE_NO_KEY :
if (flags) { *flags = 0; }
if (encoding) { *encoding = GRN_ENC_NONE; }
if (tokenizer) { *tokenizer = grn_uvector_tokenizer; }
+ if (normalizer) { *normalizer = NULL; }
rc = GRN_SUCCESS;
break;
}
@@ -8074,7 +8079,7 @@ grn_column_index(grn_ctx *ctx, grn_obj *obj, grn_operator op,
if (obj->header.type != GRN_COLUMN_FIX_SIZE) {
grn_obj *tokenizer, *lexicon = grn_ctx_at(ctx, target->header.domain);
if (!lexicon) { continue; }
- grn_table_get_info(ctx, lexicon, NULL, NULL, &tokenizer);
+ grn_table_get_info(ctx, lexicon, NULL, NULL, &tokenizer, NULL);
if (tokenizer) { continue; }
}
if (n < buf_size) {
@@ -8112,7 +8117,7 @@ grn_column_index(grn_ctx *ctx, grn_obj *obj, grn_operator op,
if (!lexicon) { continue; }
if (lexicon->header.type != GRN_TABLE_PAT_KEY) { continue; }
/* FIXME: GRN_TABLE_DAT_KEY should be supported */
- grn_table_get_info(ctx, lexicon, NULL, NULL, &tokenizer);
+ grn_table_get_info(ctx, lexicon, NULL, NULL, &tokenizer, NULL);
if (tokenizer) { continue; }
}
if (n < buf_size) {
@@ -8192,7 +8197,7 @@ grn_column_index(grn_ctx *ctx, grn_obj *obj, grn_operator op,
if (!lexicon) { continue; }
if (lexicon->header.type != GRN_TABLE_PAT_KEY) { continue; }
/* FIXME: GRN_TABLE_DAT_KEY should be supported */
- grn_table_get_info(ctx, lexicon, NULL, NULL, &tokenizer);
+ grn_table_get_info(ctx, lexicon, NULL, NULL, &tokenizer, NULL);
if (tokenizer) { continue; }
}
if (n < buf_size) {
Modified: lib/db.h (+2 -1)
===================================================================
--- lib/db.h 2012-02-10 19:13:32 +0900 (4f76d43)
+++ lib/db.h 2012-02-10 19:16:20 +0900 (537b32f)
@@ -92,7 +92,8 @@ grn_id grn_table_get_v(grn_ctx *ctx, grn_obj *table, const void *key, int key_si
grn_id grn_table_add_v(grn_ctx *ctx, grn_obj *table, const void *key, int key_size,
void **value, int *added);
GRN_API grn_rc grn_table_get_info(grn_ctx *ctx, grn_obj *table, grn_obj_flags *flags,
- grn_encoding *encoding, grn_obj **tokenizer);
+ grn_encoding *encoding, grn_obj **tokenizer,
+ grn_obj **normalizer);
const char *_grn_table_key(grn_ctx *ctx, grn_obj *table, grn_id id, uint32_t *key_size);
grn_rc grn_table_search(grn_ctx *ctx, grn_obj *table,
Modified: lib/ii.c (+8 -4)
===================================================================
--- lib/ii.c 2012-02-10 19:13:32 +0900 (ac340da)
+++ lib/ii.c 2012-02-10 19:16:20 +0900 (09192e5)
@@ -3405,7 +3405,9 @@ _grn_ii_create(grn_ctx *ctx, grn_ii *ii, const char *path, grn_obj *lexicon, uin
free_histogram[i] = 0;
}
*/
- if (grn_table_get_info(ctx, lexicon, &lflags, &encoding, &tokenizer)) { return NULL; }
+ if (grn_table_get_info(ctx, lexicon, &lflags, &encoding, &tokenizer, NULL)) {
+ return NULL;
+ }
if (path && strlen(path) + 6 >= PATH_MAX) { return NULL; }
seg = grn_io_create(ctx, path, sizeof(struct grn_ii_header),
S_SEGMENT, GRN_II_MAX_LSEG, grn_io_auto, GRN_IO_EXPIRE_SEGMENT);
@@ -3524,7 +3526,9 @@ grn_ii_open(grn_ctx *ctx, const char *path, grn_obj *lexicon)
grn_obj_flags lflags;
grn_encoding encoding;
grn_obj *tokenizer;
- if (grn_table_get_info(ctx, lexicon, &lflags, &encoding, &tokenizer)) { return NULL; }
+ if (grn_table_get_info(ctx, lexicon, &lflags, &encoding, &tokenizer, NULL)) {
+ return NULL;
+ }
if (strlen(path) + 6 >= PATH_MAX) { return NULL; }
strcpy(path2, path);
strcat(path2, ".c");
@@ -6549,7 +6553,7 @@ grn_ii_builder_tokenize(grn_ctx *ctx, grn_ii_builder *builder, grn_id rid, grn_o
grn_obj *range = grn_ctx_at(ctx, DB_OBJ(builder->lexicon)->range);
grn_obj *tokenizer;
grn_obj_flags flags;
- grn_table_get_info(ctx, builder->lexicon, &flags, NULL, &tokenizer);
+ grn_table_get_info(ctx, builder->lexicon, &flags, NULL, &tokenizer, NULL);
flags &= ~GRN_OBJ_PERSISTENT;
builder->tmp_lexicon = grn_table_create(ctx, NULL, 0, NULL, flags, domain, range);
grn_obj_set_info(ctx, builder->tmp_lexicon, GRN_INFO_DEFAULT_TOKENIZER, tokenizer);
@@ -6879,7 +6883,7 @@ grn_ii_build(grn_ctx *ctx, grn_ii *ii)
builder.tmp_lexicon = NULL;
{
grn_obj_flags flags;
- grn_table_get_info(ctx, builder.lexicon, &flags, NULL, NULL);
+ grn_table_get_info(ctx, builder.lexicon, &flags, NULL, NULL, NULL);
if (flags & GRN_OBJ_TABLE_PAT_KEY) {
grn_pat_cache_enable(ctx, (grn_pat *)builder.lexicon, PAT_CACHE_SIZE);
}
Modified: lib/token.c (+84 -28)
===================================================================
--- lib/token.c 2012-02-10 19:13:32 +0900 (3ef64d8)
+++ lib/token.c 2012-02-10 19:16:20 +0900 (887491d)
@@ -79,7 +79,8 @@ uvector_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
}
typedef struct {
- grn_str *nstr;
+ grn_obj *normalized_text;
+ char *target_text;
const uint8_t *delimiter;
uint32_t delimiter_len;
int32_t pos;
@@ -97,29 +98,47 @@ delimited_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data,
const uint8_t *delimiter, uint32_t delimiter_len)
{
grn_obj *str;
- int nflags = 0;
grn_delimited_tokenizer *token;
grn_obj_flags table_flags;
+ grn_obj *normalizer;
if (!(str = grn_ctx_pop(ctx))) {
ERR(GRN_INVALID_ARGUMENT, "missing argument");
return NULL;
}
if (!(token = GRN_MALLOC(sizeof(grn_delimited_tokenizer)))) { return NULL; }
user_data->ptr = token;
+ token->normalized_text = NULL;
+ token->target_text = NULL;
token->delimiter = delimiter;
token->delimiter_len = delimiter_len;
token->pos = 0;
- grn_table_get_info(ctx, table, &table_flags, &token->encoding, NULL);
- nflags |= (table_flags & GRN_OBJ_KEY_NORMALIZE);
- if (!(token->nstr = grn_str_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
- nflags, token->encoding))) {
- GRN_FREE(token);
- ERR(GRN_TOKENIZER_ERROR, "grn_str_open failed at grn_token_open");
- return NULL;
+ grn_table_get_info(ctx, table, &table_flags,
+ &token->encoding, NULL, &normalizer);
+ if (normalizer) {
+ unsigned int length_in_bytes;
+ if (!(token->normalized_text = grn_normalized_text_open(ctx,
+ normalizer,
+ GRN_TEXT_VALUE(str),
+ GRN_TEXT_LEN(str),
+ token->encoding,
+ 0))) {
+ GRN_FREE(token);
+ ERR(GRN_TOKENIZER_ERROR, "[tokenizer][delimit] failed to open normalized text");
+ return NULL;
+ }
+ grn_normalized_text_get_value(ctx, token->normalized_text,
+ (const char **)(&(token->next)),
+ &(token->len),
+ &length_in_bytes);
+ token->end = token->next + length_in_bytes;
+ } else {
+ token->len = GRN_TEXT_LEN(str);
+ token->target_text = GRN_MALLOC(token->len + 1);
+ memcpy(token->target_text, GRN_TEXT_VALUE(str), token->len);
+ token->target_text[token->len] = '\0';
+ token->next = (unsigned char *)token->target_text;
+ token->end = token->next + token->len;
}
- token->next = (unsigned char *)token->nstr->norm;
- token->end = token->next + token->nstr->norm_blen;
- token->len = token->nstr->length;
GRN_TEXT_INIT(&token->curr_, GRN_OBJ_DO_SHALLOW_COPY);
GRN_UINT32_INIT(&token->stat_, 0);
return NULL;
@@ -154,7 +173,12 @@ static grn_obj *
delimited_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
grn_delimited_tokenizer *token = user_data->ptr;
- grn_str_close(ctx, token->nstr);
+ if (token->normalized_text) {
+ grn_obj_unlink(ctx, token->normalized_text);
+ }
+ if (token->target_text) {
+ GRN_FREE(token->target_text);
+ }
GRN_FREE(token);
return NULL;
}
@@ -178,6 +202,7 @@ delimit_null_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_d
/* ngram tokenizer */
typedef struct {
+ grn_obj *normalized_text;
grn_str *nstr;
uint8_t uni_alpha;
uint8_t uni_digit;
@@ -190,7 +215,7 @@ typedef struct {
grn_encoding encoding;
const unsigned char *next;
const unsigned char *end;
- uint_least8_t *ctypes;
+ const uint_least8_t *ctypes;
int32_t len;
uint32_t tail;
grn_obj curr_;
@@ -202,6 +227,7 @@ ngram_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data, uint8_t ngram
uint8_t uni_alpha, uint8_t uni_digit, uint8_t uni_symbol, uint8_t ignore_blank)
{
grn_obj *str;
+ grn_obj *normalizer;
int nflags = GRN_NORMALIZE_REMOVE_BLANK|GRN_NORMALIZE_WITH_TYPES;
grn_ngram_tokenizer *token;
grn_obj_flags table_flags;
@@ -211,6 +237,8 @@ ngram_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data, uint8_t ngram
}
if (!(token = GRN_MALLOC(sizeof(grn_ngram_tokenizer)))) { return NULL; }
user_data->ptr = token;
+ token->normalized_text = NULL;
+ token->nstr = NULL;
token->uni_alpha = uni_alpha;
token->uni_digit = uni_digit;
token->uni_symbol = uni_symbol;
@@ -219,18 +247,39 @@ ngram_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data, uint8_t ngram
token->overlap = 0;
token->pos = 0;
token->skip = 0;
- grn_table_get_info(ctx, table, &table_flags, &token->encoding, NULL);
- nflags |= (table_flags & GRN_OBJ_KEY_NORMALIZE);
- if (!(token->nstr = grn_str_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
- nflags, token->encoding))) {
- GRN_FREE(token);
- ERR(GRN_TOKENIZER_ERROR, "grn_str_open failed at grn_token_open");
- return NULL;
+ grn_table_get_info(ctx, table, &table_flags, &token->encoding, NULL,
+ &normalizer);
+ if (normalizer) {
+ unsigned int length_in_bytes;
+ if (!(token->normalized_text = grn_normalized_text_open(ctx,
+ normalizer,
+ GRN_TEXT_VALUE(str),
+ GRN_TEXT_LEN(str),
+ token->encoding,
+ nflags))) {
+ GRN_FREE(token);
+ ERR(GRN_TOKENIZER_ERROR,
+ "[tokenizer][ngram][init] failed to open normalized text");
+ return NULL;
+ }
+ grn_normalized_text_get_value(ctx, token->normalized_text,
+ (const char **)(&(token->next)),
+ &(token->len),
+ &length_in_bytes);
+ token->end = token->next + length_in_bytes;
+ token->ctypes = grn_normalized_text_get_types(ctx, token->normalized_text);
+ } else {
+ if (!(token->nstr = grn_str_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
+ nflags, token->encoding))) {
+ GRN_FREE(token);
+ ERR(GRN_TOKENIZER_ERROR, "grn_str_open failed at grn_token_open");
+ return NULL;
+ }
+ token->next = (unsigned char *)token->nstr->norm;
+ token->end = token->next + token->nstr->norm_blen;
+ token->ctypes = token->nstr->ctypes;
+ token->len = token->nstr->length;
}
- token->next = (unsigned char *)token->nstr->norm;
- token->end = token->next + token->nstr->norm_blen;
- token->ctypes = token->nstr->ctypes;
- token->len = token->nstr->length;
GRN_TEXT_INIT(&token->curr_, GRN_OBJ_DO_SHALLOW_COPY);
GRN_UINT32_INIT(&token->stat_, 0);
return NULL;
@@ -283,7 +332,7 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
grn_ngram_tokenizer *token = user_data->ptr;
const unsigned char *p = token->next, *r = p, *e = token->end;
int32_t len = 0, pos = token->pos + token->skip, status = 0;
- uint_least8_t *cp = token->ctypes ? token->ctypes + pos : NULL;
+ const uint_least8_t *cp = token->ctypes ? token->ctypes + pos : NULL;
if (cp && token->uni_alpha && GRN_CHAR_TYPE(*cp) == grn_char_alpha) {
while ((cl = grn_charlen_(ctx, (char *)r, (char *)e, token->encoding))) {
len++;
@@ -371,7 +420,12 @@ static grn_obj *
ngram_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
grn_ngram_tokenizer *token = user_data->ptr;
- grn_str_close(ctx, token->nstr);
+ if (token->normalized_text) {
+ grn_obj_unlink(ctx, token->normalized_text);
+ }
+ if (token->nstr) {
+ grn_str_close(ctx, token->nstr);
+ }
GRN_FREE(token);
return NULL;
}
@@ -406,7 +460,9 @@ grn_token_open(grn_ctx *ctx, grn_obj *table, const char *str, size_t str_len,
grn_token *token;
grn_encoding encoding;
grn_obj *tokenizer;
- if (grn_table_get_info(ctx, table, NULL, &encoding, &tokenizer)) { return NULL; }
+ if (grn_table_get_info(ctx, table, NULL, &encoding, &tokenizer, NULL)) {
+ return NULL;
+ }
if (!(token = GRN_MALLOC(sizeof(grn_token)))) { return NULL; }
token->table = table;
token->mode = mode;
Modified: lib/tokenizer.c (+1 -1)
===================================================================
--- lib/tokenizer.c 2012-02-10 19:13:32 +0900 (ee87dbd)
+++ lib/tokenizer.c 2012-02-10 19:16:20 +0900 (7d4415f)
@@ -186,7 +186,7 @@ grn_tokenizer_query *grn_tokenizer_query_create(grn_ctx *ctx,
grn_obj * const table = args[0];
grn_encoding table_encoding;
int flags = 0;
- grn_table_get_info(ctx, table, NULL, &table_encoding, NULL);
+ grn_table_get_info(ctx, table, NULL, &table_encoding, NULL, NULL);
{
grn_str * const str = grn_str_open_(ctx, GRN_TEXT_VALUE(query_str),
GRN_TEXT_LEN(query_str),
Modified: plugins/tokenizers/mecab.c (+30 -13)
===================================================================
--- plugins/tokenizers/mecab.c 2012-02-10 19:13:32 +0900 (f944656)
+++ plugins/tokenizers/mecab.c 2012-02-10 19:16:20 +0900 (0c4b48c)
@@ -34,7 +34,6 @@ static grn_critical_section sole_mecab_lock;
static grn_encoding sole_mecab_encoding = GRN_ENC_NONE;
typedef struct {
- grn_str *nstr;
mecab_t *mecab;
char *buf;
char *next;
@@ -84,9 +83,12 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
grn_obj *str;
int nflags = 0;
+ const char *target_text;
char *buf, *s, *p;
char mecab_err[256];
grn_obj *table = args[0];
+ grn_obj *normalizer;
+ grn_obj *normalized_text;
grn_obj_flags table_flags;
grn_encoding table_encoding;
grn_mecab_tokenizer *token;
@@ -113,7 +115,8 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
"mecab_new2 failed on grn_mecab_init: %s", mecab_err);
return NULL;
}
- grn_table_get_info(ctx, table, &table_flags, &table_encoding, NULL);
+ grn_table_get_info(ctx, table, &table_flags, &table_encoding,
+ NULL, &normalizer);
if (table_encoding != sole_mecab_encoding) {
ERR(GRN_TOKENIZER_ERROR,
"MeCab dictionary charset (%s) does not match the context encoding: <%s>",
@@ -123,23 +126,36 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
if (!(token = GRN_MALLOC(sizeof(grn_mecab_tokenizer)))) { return NULL; }
token->mecab = sole_mecab;
token->encoding = table_encoding;
- nflags |= (table_flags & GRN_OBJ_KEY_NORMALIZE);
- if (!(token->nstr = grn_str_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
- nflags, token->encoding))) {
- GRN_FREE(token);
- ERR(GRN_TOKENIZER_ERROR, "grn_str_open failed at grn_token_open");
- return NULL;
+ normalized_text = NULL;
+ if (normalizer) {
+ if (!(normalized_text = grn_normalized_text_open(ctx,
+ normalizer,
+ GRN_TEXT_VALUE(str),
+ GRN_TEXT_LEN(str),
+ token->encoding,
+ nflags))) {
+ GRN_FREE(token);
+ ERR(GRN_TOKENIZER_ERROR,
+ "[tokenizer][mecab] failed to open normalized text");
+ return NULL;
+ }
+ grn_normalized_text_get_value(ctx, normalized_text,
+ &target_text, NULL, &len);
+ } else {
+ target_text = GRN_TEXT_VALUE(str);
+ len = GRN_TEXT_LEN(str);
}
- len = token->nstr->norm_blen;
for (bufsize = len * 2 + 1; maxtrial; bufsize *= 2, maxtrial--) {
if (!(buf = GRN_MALLOC(bufsize + 1))) {
GRN_LOG(ctx, GRN_LOG_ALERT, "buffer allocation on mecab_init failed !");
- grn_str_close(ctx, token->nstr);
+ if (normalized_text) {
+ grn_obj_unlink(ctx, normalized_text);
+ }
GRN_FREE(token);
return NULL;
}
CRITICAL_SECTION_ENTER(sole_mecab_lock);
- s = mecab_sparse_tostr3(token->mecab, token->nstr->norm, len, buf, bufsize);
+ s = mecab_sparse_tostr3(token->mecab, target_text, len, buf, bufsize);
if (!s) {
strncpy(mecab_err, mecab_strerror(token->mecab), sizeof(mecab_err) - 1);
}
@@ -148,10 +164,12 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
GRN_FREE(buf);
if (strstr(mecab_err, "output buffer overflow") == NULL) { break; }
}
+ if (normalized_text) {
+ grn_obj_unlink(ctx, normalized_text);
+ }
if (!s) {
ERR(GRN_TOKENIZER_ERROR, "mecab_sparse_tostr failed len=%d bufsize=%d err=%s",
len, bufsize, mecab_err);
- grn_str_close(ctx, token->nstr);
GRN_FREE(token);
return NULL;
}
@@ -206,7 +224,6 @@ static grn_obj *
mecab_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
grn_mecab_tokenizer *token = user_data->ptr;
- grn_str_close(ctx, token->nstr);
GRN_FREE(token->buf);
GRN_FREE(token);
return NULL;