null+****@clear*****
null+****@clear*****
2012年 1月 24日 (火) 07:40:29 JST
Daijiro MORI 2012-01-24 07:40:29 +0900 (Tue, 24 Jan 2012)
New Revision: 27054d32ebc0333383ab1866c4377c8febc0d82d
Log:
Adopted temporary lexicons in the offline index builder
Modified files:
lib/ii.c
Modified: lib/ii.c (+58 -41)
===================================================================
--- lib/ii.c 2012-01-23 22:07:20 +0900 (27b257f)
+++ lib/ii.c 2012-01-24 07:40:29 +0900 (20a5919)
@@ -6419,12 +6419,15 @@ grn_ii_builder_flush(grn_ctx *ctx, grn_ii_builder *builder)
grn_id tid;
grn_table_cursor *tc;
uint32_t *pnext = &block->nextsize;
- tc = grn_table_cursor_open(ctx, builder->lexicon, NULL, 0, NULL, 0, 0, -1, BUILD_ORDER);
+ tc = grn_table_cursor_open(ctx, builder->tmp_lexicon, NULL, 0, NULL, 0, 0, -1, BUILD_ORDER);
while ((tid = grn_table_cursor_next(ctx, tc)) != GRN_ID_NIL) {
+ unsigned int key_size;
+ const char *key = _grn_table_key(ctx, builder->tmp_lexicon, tid, &key_size);
+ grn_id gtid = grn_table_add(ctx, builder->lexicon, key, key_size, NULL);
builder_counter *counter = &builder->counters[tid - 1];
if (counter->nrecs) {
uint32_t nposts = counter->nposts;
- outbuf[pos++] = tid;
+ outbuf[pos++] = gtid;
outbuf[pos++] = counter->nrecs;
outbuf[pos++] = counter->nposts;
counter->offset = pos;
@@ -6479,7 +6482,7 @@ grn_ii_builder_flush(grn_ctx *ctx, grn_ii_builder *builder)
GRN_FREE(outbuf);
{
builder_counter *counter;
- grn_id tid, tid_max = grn_table_size(ctx, builder->lexicon);
+ grn_id tid, tid_max = grn_table_size(ctx, builder->tmp_lexicon);
for (counter = builder->counters, tid = 1; tid <= tid_max; counter++, tid++) {
counter->nrecs = 0;
counter->nposts = 0;
@@ -6488,60 +6491,73 @@ grn_ii_builder_flush(grn_ctx *ctx, grn_ii_builder *builder)
}
}
builder->blockpos = 0;
+ grn_obj_close(ctx, builder->tmp_lexicon);
+ builder->tmp_lexicon = NULL;
}
static void
grn_ii_builder_tokenize(grn_ctx *ctx, grn_ii_builder *builder, grn_id rid, grn_obj *value)
{
- uint32_t pos;
- grn_token *token;
- grn_id *buffer = builder->blockbuf;
- if (BUILD_BLOCK_SIZE <= builder->blockpos + GRN_TEXT_LEN(value) * 2) {
- grn_ii_builder_flush(ctx, builder);
- }
- pos = builder->blockpos;
- buffer[pos++] = rid + BUILD_RID_FLAG;
- if (GRN_TEXT_LEN(value) &&
- (token = grn_token_open(ctx, builder->lexicon,
- GRN_TEXT_VALUE(value), GRN_TEXT_LEN(value), grn_token_add))) {
- uint32_t pos_ = pos;
- while (!token->status) {
- grn_id tid;
- if ((tid = grn_token_next(ctx, token))) {
- if (tid > builder->ncounters) {
- uint32_t ncounters = grn_table_size(ctx, builder->lexicon) + BUILD_NCOUNTERS_MARGIN;
- builder_counter *counters = GRN_REALLOC(builder->counters,
- ncounters * sizeof(builder_counter));
- if (!counters) { return; }
- memset(&counters[builder->ncounters], 0,
- (ncounters - builder->ncounters) * sizeof(builder_counter));
- builder->ncounters = ncounters;
- builder->counters = counters;
- }
- {
- builder_counter *counter = &builder->counters[tid - 1];
- buffer[pos++] = tid;
- if (counter->lastrec != rid) {
- counter->lastrec = rid;
- counter->nrecs++;
+ if (GRN_TEXT_LEN(value)) {
+ uint32_t pos;
+ grn_token *token;
+ grn_id *buffer = builder->blockbuf;
+ if (BUILD_BLOCK_SIZE <= builder->blockpos + GRN_TEXT_LEN(value) * 2) {
+ grn_ii_builder_flush(ctx, builder);
+ }
+ if (!builder->tmp_lexicon) {
+ grn_obj *domain = grn_ctx_at(ctx, builder->lexicon->header.domain);
+ grn_obj *range = grn_ctx_at(ctx, DB_OBJ(builder->lexicon)->range);
+ grn_obj *tokenizer;
+ grn_obj_flags flags;
+ grn_table_get_info(ctx, builder->lexicon, &flags, NULL, &tokenizer);
+ flags &= ~GRN_OBJ_PERSISTENT;
+ builder->tmp_lexicon = grn_table_create(ctx, NULL, 0, NULL, flags, domain, range);
+ grn_obj_set_info(ctx, builder->tmp_lexicon, GRN_INFO_DEFAULT_TOKENIZER, tokenizer);
+ }
+ pos = builder->blockpos;
+ buffer[pos++] = rid + BUILD_RID_FLAG;
+ if ((token = grn_token_open(ctx, builder->tmp_lexicon,
+ GRN_TEXT_VALUE(value), GRN_TEXT_LEN(value), grn_token_add))) {
+ uint32_t pos_ = pos;
+ while (!token->status) {
+ grn_id tid;
+ if ((tid = grn_token_next(ctx, token))) {
+ if (tid > builder->ncounters) {
+ uint32_t ncounters = grn_table_size(ctx, builder->tmp_lexicon) + BUILD_NCOUNTERS_MARGIN;
+ builder_counter *counters = GRN_REALLOC(builder->counters,
+ ncounters * sizeof(builder_counter));
+ if (!counters) { return; }
+ memset(&counters[builder->ncounters], 0,
+ (ncounters - builder->ncounters) * sizeof(builder_counter));
+ builder->ncounters = ncounters;
+ builder->counters = counters;
+ }
+ {
+ builder_counter *counter = &builder->counters[tid - 1];
+ buffer[pos++] = tid;
+ if (counter->lastrec != rid) {
+ counter->lastrec = rid;
+ counter->nrecs++;
+ }
+ counter->nposts++;
}
- counter->nposts++;
}
}
+ grn_token_close(ctx, token);
+ if (pos - pos_ > GRN_TEXT_LEN(value)) {
+ GRN_LOG(ctx, GRN_LOG_WARNING, "%d > %d", pos - pos_, GRN_TEXT_LEN(value));
+ }
}
- grn_token_close(ctx, token);
- if (pos - pos_ > GRN_TEXT_LEN(value)) {
- GRN_LOG(ctx, GRN_LOG_WARNING, "%d > %d", pos - pos_, GRN_TEXT_LEN(value));
- }
+ builder->blockpos = pos;
}
- builder->blockpos = pos;
}
static void
grn_ii_builder_parse(grn_ctx *ctx, grn_ii_builder *builder)
{
grn_table_cursor *tc;
- builder->ncounters = grn_table_size(ctx, builder->lexicon) + BUILD_NCOUNTERS_MARGIN;
+ builder->ncounters = BUILD_NCOUNTERS_MARGIN;
builder->counters = GRN_CALLOC(builder->ncounters * sizeof(builder_counter));
builder->blockbuf = (grn_id *)GRN_MALLOC(BUILD_BLOCK_SIZE * sizeof(grn_id));
builder->blockpos = 0;
@@ -6816,6 +6832,7 @@ grn_ii_build(grn_ctx *ctx, grn_ii *ii)
builder.source = src;
builder.target = target;
builder.lexicon = ii->lexicon;
+ builder.tmp_lexicon = NULL;
builder.nblocks = 0;
builder.blocks = NULL;