null+****@clear*****
null+****@clear*****
2012年 2月 14日 (火) 13:58:03 JST
Kouhei Sutou 2012-02-14 13:58:03 +0900 (Tue, 14 Feb 2012)
New Revision: 17870f594194e90133bbccd15aa99b536444152a
Log:
Revert "[normalizer] changed to use grn_normalized_text instead of grn_str."
This reverts commit 27aac3047ea6f54bd610e0917bbca0953a23a665.
Conflicts:
lib/ii.c
lib/tokenizer.c
plugins/tokenizers/mecab.c
Modified files:
lib/db.c
lib/db.h
lib/ii.c
lib/token.c
lib/tokenizer.c
plugins/tokenizers/mecab.c
Modified: lib/db.c (+4 -9)
===================================================================
--- lib/db.c 2012-02-14 13:48:09 +0900 (e15e2b8)
+++ lib/db.c 2012-02-14 13:58:03 +0900 (d9eb95c)
@@ -1805,8 +1805,7 @@ exit :
grn_rc
grn_table_get_info(grn_ctx *ctx, grn_obj *table, grn_obj_flags *flags,
- grn_encoding *encoding, grn_obj **tokenizer,
- grn_obj **normalizer)
+ grn_encoding *encoding, grn_obj **tokenizer)
{
grn_rc rc = GRN_INVALID_ARGUMENT;
GRN_API_ENTER;
@@ -1816,28 +1815,24 @@ grn_table_get_info(grn_ctx *ctx, grn_obj *table, grn_obj_flags *flags,
if (flags) { *flags = ((grn_pat *)table)->obj.header.flags; }
if (encoding) { *encoding = ((grn_pat *)table)->encoding; }
if (tokenizer) { *tokenizer = ((grn_pat *)table)->tokenizer; }
- if (normalizer) { *normalizer = ((grn_pat *)table)->normalizer; }
rc = GRN_SUCCESS;
break;
case GRN_TABLE_DAT_KEY :
if (flags) { *flags = ((grn_dat *)table)->obj.header.flags; }
if (encoding) { *encoding = ((grn_dat *)table)->encoding; }
if (tokenizer) { *tokenizer = ((grn_dat *)table)->tokenizer; }
- if (normalizer) { *normalizer = ((grn_dat *)table)->normalizer; }
rc = GRN_SUCCESS;
break;
case GRN_TABLE_HASH_KEY :
if (flags) { *flags = ((grn_hash *)table)->obj.header.flags; }
if (encoding) { *encoding = ((grn_hash *)table)->encoding; }
if (tokenizer) { *tokenizer = ((grn_hash *)table)->tokenizer; }
- if (normalizer) { *normalizer = ((grn_hash *)table)->normalizer; }
rc = GRN_SUCCESS;
break;
case GRN_TABLE_NO_KEY :
if (flags) { *flags = 0; }
if (encoding) { *encoding = GRN_ENC_NONE; }
if (tokenizer) { *tokenizer = grn_uvector_tokenizer; }
- if (normalizer) { *normalizer = NULL; }
rc = GRN_SUCCESS;
break;
}
@@ -8079,7 +8074,7 @@ grn_column_index(grn_ctx *ctx, grn_obj *obj, grn_operator op,
if (obj->header.type != GRN_COLUMN_FIX_SIZE) {
grn_obj *tokenizer, *lexicon = grn_ctx_at(ctx, target->header.domain);
if (!lexicon) { continue; }
- grn_table_get_info(ctx, lexicon, NULL, NULL, &tokenizer, NULL);
+ grn_table_get_info(ctx, lexicon, NULL, NULL, &tokenizer);
if (tokenizer) { continue; }
}
if (n < buf_size) {
@@ -8117,7 +8112,7 @@ grn_column_index(grn_ctx *ctx, grn_obj *obj, grn_operator op,
if (!lexicon) { continue; }
if (lexicon->header.type != GRN_TABLE_PAT_KEY) { continue; }
/* FIXME: GRN_TABLE_DAT_KEY should be supported */
- grn_table_get_info(ctx, lexicon, NULL, NULL, &tokenizer, NULL);
+ grn_table_get_info(ctx, lexicon, NULL, NULL, &tokenizer);
if (tokenizer) { continue; }
}
if (n < buf_size) {
@@ -8197,7 +8192,7 @@ grn_column_index(grn_ctx *ctx, grn_obj *obj, grn_operator op,
if (!lexicon) { continue; }
if (lexicon->header.type != GRN_TABLE_PAT_KEY) { continue; }
/* FIXME: GRN_TABLE_DAT_KEY should be supported */
- grn_table_get_info(ctx, lexicon, NULL, NULL, &tokenizer, NULL);
+ grn_table_get_info(ctx, lexicon, NULL, NULL, &tokenizer);
if (tokenizer) { continue; }
}
if (n < buf_size) {
Modified: lib/db.h (+1 -2)
===================================================================
--- lib/db.h 2012-02-14 13:48:09 +0900 (537b32f)
+++ lib/db.h 2012-02-14 13:58:03 +0900 (4f76d43)
@@ -92,8 +92,7 @@ grn_id grn_table_get_v(grn_ctx *ctx, grn_obj *table, const void *key, int key_si
grn_id grn_table_add_v(grn_ctx *ctx, grn_obj *table, const void *key, int key_size,
void **value, int *added);
GRN_API grn_rc grn_table_get_info(grn_ctx *ctx, grn_obj *table, grn_obj_flags *flags,
- grn_encoding *encoding, grn_obj **tokenizer,
- grn_obj **normalizer);
+ grn_encoding *encoding, grn_obj **tokenizer);
const char *_grn_table_key(grn_ctx *ctx, grn_obj *table, grn_id id, uint32_t *key_size);
grn_rc grn_table_search(grn_ctx *ctx, grn_obj *table,
Modified: lib/ii.c (+3 -7)
===================================================================
--- lib/ii.c 2012-02-14 13:48:09 +0900 (96f959b)
+++ lib/ii.c 2012-02-14 13:58:03 +0900 (66dbbe3)
@@ -3405,9 +3405,7 @@ _grn_ii_create(grn_ctx *ctx, grn_ii *ii, const char *path, grn_obj *lexicon, uin
free_histogram[i] = 0;
}
*/
- if (grn_table_get_info(ctx, lexicon, &lflags, &encoding, &tokenizer, NULL)) {
- return NULL;
- }
+ if (grn_table_get_info(ctx, lexicon, &lflags, &encoding, &tokenizer)) { return NULL; }
if (path && strlen(path) + 6 >= PATH_MAX) { return NULL; }
seg = grn_io_create(ctx, path, sizeof(struct grn_ii_header),
S_SEGMENT, GRN_II_MAX_LSEG, grn_io_auto, GRN_IO_EXPIRE_SEGMENT);
@@ -3526,9 +3524,7 @@ grn_ii_open(grn_ctx *ctx, const char *path, grn_obj *lexicon)
grn_obj_flags lflags;
grn_encoding encoding;
grn_obj *tokenizer;
- if (grn_table_get_info(ctx, lexicon, &lflags, &encoding, &tokenizer, NULL)) {
- return NULL;
- }
+ if (grn_table_get_info(ctx, lexicon, &lflags, &encoding, &tokenizer)) { return NULL; }
if (strlen(path) + 6 >= PATH_MAX) { return NULL; }
strcpy(path2, path);
strcat(path2, ".c");
@@ -6552,7 +6548,7 @@ grn_ii_buffer_tokenize(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
grn_obj *range = grn_ctx_at(ctx, DB_OBJ(ii_buffer->lexicon)->range);
grn_obj *tokenizer;
grn_obj_flags flags;
- grn_table_get_info(ctx, ii_buffer->lexicon, &flags, NULL, &tokenizer, NULL);
+ grn_table_get_info(ctx, ii_buffer->lexicon, &flags, NULL, &tokenizer);
flags &= ~GRN_OBJ_PERSISTENT;
ii_buffer->tmp_lexicon = grn_table_create(ctx, NULL, 0, NULL, flags, domain, range);
grn_obj_set_info(ctx, ii_buffer->tmp_lexicon, GRN_INFO_DEFAULT_TOKENIZER, tokenizer);
Modified: lib/token.c (+28 -84)
===================================================================
--- lib/token.c 2012-02-14 13:48:09 +0900 (887491d)
+++ lib/token.c 2012-02-14 13:58:03 +0900 (3ef64d8)
@@ -79,8 +79,7 @@ uvector_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
}
typedef struct {
- grn_obj *normalized_text;
- char *target_text;
+ grn_str *nstr;
const uint8_t *delimiter;
uint32_t delimiter_len;
int32_t pos;
@@ -98,47 +97,29 @@ delimited_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data,
const uint8_t *delimiter, uint32_t delimiter_len)
{
grn_obj *str;
+ int nflags = 0;
grn_delimited_tokenizer *token;
grn_obj_flags table_flags;
- grn_obj *normalizer;
if (!(str = grn_ctx_pop(ctx))) {
ERR(GRN_INVALID_ARGUMENT, "missing argument");
return NULL;
}
if (!(token = GRN_MALLOC(sizeof(grn_delimited_tokenizer)))) { return NULL; }
user_data->ptr = token;
- token->normalized_text = NULL;
- token->target_text = NULL;
token->delimiter = delimiter;
token->delimiter_len = delimiter_len;
token->pos = 0;
- grn_table_get_info(ctx, table, &table_flags,
- &token->encoding, NULL, &normalizer);
- if (normalizer) {
- unsigned int length_in_bytes;
- if (!(token->normalized_text = grn_normalized_text_open(ctx,
- normalizer,
- GRN_TEXT_VALUE(str),
- GRN_TEXT_LEN(str),
- token->encoding,
- 0))) {
- GRN_FREE(token);
- ERR(GRN_TOKENIZER_ERROR, "grn_str_open failed at grn_token_open");
- return NULL;
- }
- grn_normalized_text_get_value(ctx, token->normalized_text,
- (const char **)(&(token->next)),
- &(token->len),
- &length_in_bytes);
- token->end = token->next + length_in_bytes;
- } else {
- token->len = GRN_TEXT_LEN(str);
- token->target_text = GRN_MALLOC(token->len + 1);
- memcpy(token->target_text, GRN_TEXT_VALUE(str), token->len);
- token->target_text[token->len] = '\0';
- token->next = (unsigned char *)token->target_text;
- token->end = token->next + token->len;
+ grn_table_get_info(ctx, table, &table_flags, &token->encoding, NULL);
+ nflags |= (table_flags & GRN_OBJ_KEY_NORMALIZE);
+ if (!(token->nstr = grn_str_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
+ nflags, token->encoding))) {
+ GRN_FREE(token);
+ ERR(GRN_TOKENIZER_ERROR, "grn_str_open failed at grn_token_open");
+ return NULL;
}
+ token->next = (unsigned char *)token->nstr->norm;
+ token->end = token->next + token->nstr->norm_blen;
+ token->len = token->nstr->length;
GRN_TEXT_INIT(&token->curr_, GRN_OBJ_DO_SHALLOW_COPY);
GRN_UINT32_INIT(&token->stat_, 0);
return NULL;
@@ -173,12 +154,7 @@ static grn_obj *
delimited_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
grn_delimited_tokenizer *token = user_data->ptr;
- if (token->normalized_text) {
- grn_obj_unlink(ctx, token->normalized_text);
- }
- if (token->target_text) {
- GRN_FREE(token->target_text);
- }
+ grn_str_close(ctx, token->nstr);
GRN_FREE(token);
return NULL;
}
@@ -202,7 +178,6 @@ delimit_null_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_d
/* ngram tokenizer */
typedef struct {
- grn_obj *normalized_text;
grn_str *nstr;
uint8_t uni_alpha;
uint8_t uni_digit;
@@ -215,7 +190,7 @@ typedef struct {
grn_encoding encoding;
const unsigned char *next;
const unsigned char *end;
- const uint_least8_t *ctypes;
+ uint_least8_t *ctypes;
int32_t len;
uint32_t tail;
grn_obj curr_;
@@ -227,7 +202,6 @@ ngram_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data, uint8_t ngram
uint8_t uni_alpha, uint8_t uni_digit, uint8_t uni_symbol, uint8_t ignore_blank)
{
grn_obj *str;
- grn_obj *normalizer;
int nflags = GRN_NORMALIZE_REMOVE_BLANK|GRN_NORMALIZE_WITH_TYPES;
grn_ngram_tokenizer *token;
grn_obj_flags table_flags;
@@ -237,8 +211,6 @@ ngram_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data, uint8_t ngram
}
if (!(token = GRN_MALLOC(sizeof(grn_ngram_tokenizer)))) { return NULL; }
user_data->ptr = token;
- token->normalized_text = NULL;
- token->nstr = NULL;
token->uni_alpha = uni_alpha;
token->uni_digit = uni_digit;
token->uni_symbol = uni_symbol;
@@ -247,39 +219,18 @@ ngram_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data, uint8_t ngram
token->overlap = 0;
token->pos = 0;
token->skip = 0;
- grn_table_get_info(ctx, table, &table_flags, &token->encoding, NULL,
- &normalizer);
- if (normalizer) {
- unsigned int length_in_bytes;
- if (!(token->normalized_text = grn_normalized_text_open(ctx,
- normalizer,
- GRN_TEXT_VALUE(str),
- GRN_TEXT_LEN(str),
- token->encoding,
- nflags))) {
- GRN_FREE(token);
- ERR(GRN_TOKENIZER_ERROR,
- "[tokenizer][ngram][init] failed to open normalized text");
- return NULL;
- }
- grn_normalized_text_get_value(ctx, token->normalized_text,
- (const char **)(&(token->next)),
- &(token->len),
- &length_in_bytes);
- token->end = token->next + length_in_bytes;
- token->ctypes = grn_normalized_text_get_types(ctx, token->normalized_text);
- } else {
- if (!(token->nstr = grn_str_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
- nflags, token->encoding))) {
- GRN_FREE(token);
- ERR(GRN_TOKENIZER_ERROR, "grn_str_open failed at grn_token_open");
- return NULL;
- }
- token->next = (unsigned char *)token->nstr->norm;
- token->end = token->next + token->nstr->norm_blen;
- token->ctypes = token->nstr->ctypes;
- token->len = token->nstr->length;
+ grn_table_get_info(ctx, table, &table_flags, &token->encoding, NULL);
+ nflags |= (table_flags & GRN_OBJ_KEY_NORMALIZE);
+ if (!(token->nstr = grn_str_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
+ nflags, token->encoding))) {
+ GRN_FREE(token);
+ ERR(GRN_TOKENIZER_ERROR, "grn_str_open failed at grn_token_open");
+ return NULL;
}
+ token->next = (unsigned char *)token->nstr->norm;
+ token->end = token->next + token->nstr->norm_blen;
+ token->ctypes = token->nstr->ctypes;
+ token->len = token->nstr->length;
GRN_TEXT_INIT(&token->curr_, GRN_OBJ_DO_SHALLOW_COPY);
GRN_UINT32_INIT(&token->stat_, 0);
return NULL;
@@ -332,7 +283,7 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
grn_ngram_tokenizer *token = user_data->ptr;
const unsigned char *p = token->next, *r = p, *e = token->end;
int32_t len = 0, pos = token->pos + token->skip, status = 0;
- const uint_least8_t *cp = token->ctypes ? token->ctypes + pos : NULL;
+ uint_least8_t *cp = token->ctypes ? token->ctypes + pos : NULL;
if (cp && token->uni_alpha && GRN_CHAR_TYPE(*cp) == grn_char_alpha) {
while ((cl = grn_charlen_(ctx, (char *)r, (char *)e, token->encoding))) {
len++;
@@ -420,12 +371,7 @@ static grn_obj *
ngram_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
grn_ngram_tokenizer *token = user_data->ptr;
- if (token->normalized_text) {
- grn_obj_unlink(ctx, token->normalized_text);
- }
- if (token->nstr) {
- grn_str_close(ctx, token->nstr);
- }
+ grn_str_close(ctx, token->nstr);
GRN_FREE(token);
return NULL;
}
@@ -460,9 +406,7 @@ grn_token_open(grn_ctx *ctx, grn_obj *table, const char *str, size_t str_len,
grn_token *token;
grn_encoding encoding;
grn_obj *tokenizer;
- if (grn_table_get_info(ctx, table, NULL, &encoding, &tokenizer, NULL)) {
- return NULL;
- }
+ if (grn_table_get_info(ctx, table, NULL, &encoding, &tokenizer)) { return NULL; }
if (!(token = GRN_MALLOC(sizeof(grn_token)))) { return NULL; }
token->table = table;
token->mode = mode;
Modified: lib/tokenizer.c (+10 -23)
===================================================================
--- lib/tokenizer.c 2012-02-14 13:48:09 +0900 (b47f1db)
+++ lib/tokenizer.c 2012-02-14 13:58:03 +0900 (b33fed0)
@@ -103,29 +103,16 @@ grn_tokenizer_query *grn_tokenizer_query_create(grn_ctx *ctx,
{
grn_obj * const table = args[0];
- grn_encoding table_encoding = GRN_ENC_NONE;
- grn_obj *normalizer = NULL;
- grn_table_get_info(ctx, table, NULL, &table_encoding, NULL, &normalizer);
- if (normalizer != NULL) {
- grn_obj * const normalized_query = grn_normalized_text_open(
- ctx, normalizer, GRN_TEXT_VALUE(query_str),
- GRN_TEXT_LEN(query_str), table_encoding, 0);
- if (query->normalized_query == NULL) {
- GRN_PLUGIN_FREE(ctx, query);
- GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
- "[tokenizer] failed to open normalized text");
- return NULL;
- }
- query->normalized_query = normalized_query;
- grn_normalized_text_get_value(ctx, query->normalized_query,
- &query->ptr, NULL, &query->length);
- } else {
- unsigned int query_length = GRN_TEXT_LEN(query_str);
- char *query_buf = (char *)GRN_PLUGIN_MALLOC(ctx, query_length + 1);
- if (query_buf == NULL) {
- GRN_PLUGIN_FREE(ctx, query);
- GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
- "[tokenizer] failed to duplicate query");
+ grn_encoding table_encoding;
+ int flags = 0;
+ grn_table_get_info(ctx, table, NULL, &table_encoding, NULL);
+ {
+ grn_str * const str = grn_str_open_(ctx, GRN_TEXT_VALUE(query_str),
+ GRN_TEXT_LEN(query_str),
+ flags | GRN_OBJ_KEY_NORMALIZE,
+ table_encoding);
+ if (str == NULL) {
+ GRN_PLUGIN_FREE(ctx, query);
return NULL;
}
memcpy(query_buf, GRN_TEXT_VALUE(query_str), query_length);
Modified: plugins/tokenizers/mecab.c (+15 -29)
===================================================================
--- plugins/tokenizers/mecab.c 2012-02-14 13:48:09 +0900 (dc9886d)
+++ plugins/tokenizers/mecab.c 2012-02-14 13:58:03 +0900 (f944656)
@@ -34,6 +34,7 @@ static grn_critical_section sole_mecab_lock;
static grn_encoding sole_mecab_encoding = GRN_ENC_NONE;
typedef struct {
+ grn_str *nstr;
mecab_t *mecab;
char *buf;
char *next;
@@ -82,12 +83,11 @@ static grn_obj *
mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
grn_obj *str;
- const char *target_text;
+ int nflags = 0;
char *buf, *s, *p;
char mecab_err[256];
grn_obj *table = args[0];
- grn_obj *normalizer;
- grn_obj *normalized_text;
+ grn_obj_flags table_flags;
grn_encoding table_encoding;
grn_mecab_tokenizer *token;
unsigned int bufsize, maxtrial = 10, len;
@@ -113,7 +113,7 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
"mecab_new2 failed on grn_mecab_init: %s", mecab_err);
return NULL;
}
- grn_table_get_info(ctx, table, NULL, &table_encoding, NULL, &normalizer);
+ grn_table_get_info(ctx, table, &table_flags, &table_encoding, NULL);
if (table_encoding != sole_mecab_encoding) {
ERR(GRN_TOKENIZER_ERROR,
"MeCab dictionary charset (%s) does not match the context encoding: <%s>",
@@ -123,36 +123,23 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
if (!(token = GRN_MALLOC(sizeof(grn_mecab_tokenizer)))) { return NULL; }
token->mecab = sole_mecab;
token->encoding = table_encoding;
- normalized_text = NULL;
- if (normalizer) {
- if (!(normalized_text = grn_normalized_text_open(ctx,
- normalizer,
- GRN_TEXT_VALUE(str),
- GRN_TEXT_LEN(str),
- token->encoding,
- 0))) {
- GRN_FREE(token);
- ERR(GRN_TOKENIZER_ERROR,
- "[tokenizer][mecab] failed to open normalized text");
- return NULL;
- }
- grn_normalized_text_get_value(ctx, normalized_text,
- &target_text, NULL, &len);
- } else {
- target_text = GRN_TEXT_VALUE(str);
- len = GRN_TEXT_LEN(str);
+ nflags |= (table_flags & GRN_OBJ_KEY_NORMALIZE);
+ if (!(token->nstr = grn_str_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
+ nflags, token->encoding))) {
+ GRN_FREE(token);
+ ERR(GRN_TOKENIZER_ERROR, "grn_str_open failed at grn_token_open");
+ return NULL;
}
+ len = token->nstr->norm_blen;
for (bufsize = len * 2 + 1; maxtrial; bufsize *= 2, maxtrial--) {
if (!(buf = GRN_MALLOC(bufsize + 1))) {
GRN_LOG(ctx, GRN_LOG_ALERT, "buffer allocation on mecab_init failed !");
- if (normalized_text) {
- grn_obj_unlink(ctx, normalized_text);
- }
+ grn_str_close(ctx, token->nstr);
GRN_FREE(token);
return NULL;
}
CRITICAL_SECTION_ENTER(sole_mecab_lock);
- s = mecab_sparse_tostr3(token->mecab, target_text, len, buf, bufsize);
+ s = mecab_sparse_tostr3(token->mecab, token->nstr->norm, len, buf, bufsize);
if (!s) {
strncpy(mecab_err, mecab_strerror(token->mecab), sizeof(mecab_err) - 1);
}
@@ -161,12 +148,10 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
GRN_FREE(buf);
if (strstr(mecab_err, "output buffer overflow") == NULL) { break; }
}
- if (normalized_text) {
- grn_obj_unlink(ctx, normalized_text);
- }
if (!s) {
ERR(GRN_TOKENIZER_ERROR, "mecab_sparse_tostr failed len=%d bufsize=%d err=%s",
len, bufsize, mecab_err);
+ grn_str_close(ctx, token->nstr);
GRN_FREE(token);
return NULL;
}
@@ -221,6 +206,7 @@ static grn_obj *
mecab_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
grn_mecab_tokenizer *token = user_data->ptr;
+ grn_str_close(ctx, token->nstr);
GRN_FREE(token->buf);
GRN_FREE(token);
return NULL;