null+****@clear*****
null+****@clear*****
2012年 2月 13日 (月) 20:24:59 JST
Daijiro MORI 2012-02-13 20:24:59 +0900 (Mon, 13 Feb 2012) New Revision: f15d654592ca6faf4cf540f792b0b83b7320332f Log: Renamed grn_ii_builder to grn_ii_buffer Modified files: lib/ii.c lib/ii.h Modified: lib/ii.c (+286 -219) =================================================================== --- lib/ii.c 2012-02-13 18:44:06 +0900 (09192e5) +++ lib/ii.c 2012-02-13 20:24:59 +0900 (8e537a8) @@ -6339,20 +6339,20 @@ grn_ii_inspect_elements(grn_ctx *ctx, grn_ii *ii, grn_obj *buf) } #ifndef WIN32 -/********************** offline index builder ***********************/ - -const grn_id BUILD_RID_FLAG = 0x80000000; -#ifdef BUILD_ORDER_BY_ID -const int BUILD_ORDER = GRN_CURSOR_BY_ID; -#else /* BUILD_ORDER_BY_ID */ -const int BUILD_ORDER = GRN_CURSOR_BY_KEY; -#endif /* BUILD_ORDER_BY_ID */ -const uint16_t BUILD_NTERMS_PER_BUFFER = 16300; -const uint32_t BUILD_PACKED_BUFFER_SIZE = 0x4000000; -const char *TMPFILE_PATH = "grn_ii_builder_tmp"; -const uint32_t BUILD_NCOUNTERS_MARGIN = 0x100000; -const size_t BUILD_BLOCK_SIZE = 0x100000; -const uint32_t BUILD_BLOCK_READ_UNIT_SIZE = 0x200000; +/********************** buffered index builder ***********************/ + +const grn_id II_BUFFER_RID_FLAG = 0x80000000; +#ifdef II_BUFFER_ORDER_BY_ID +const int II_BUFFER_ORDER = GRN_CURSOR_BY_ID; +#else /* II_BUFFER_ORDER_BY_ID */ +const int II_BUFFER_ORDER = GRN_CURSOR_BY_KEY; +#endif /* II_BUFFER_ORDER_BY_ID */ +const uint16_t II_BUFFER_NTERMS_PER_BUFFER = 16300; +const uint32_t II_BUFFER_PACKED_BUFFER_SIZE = 0x4000000; +const char *TMPFILE_PATH = "grn_ii_buffer_tmp"; +const uint32_t II_BUFFER_NCOUNTERS_MARGIN = 0x100000; +const size_t II_BUFFER_BLOCK_SIZE = 0x100000; +const uint32_t II_BUFFER_BLOCK_READ_UNIT_SIZE = 0x200000; typedef struct { uint32_t nrecs; @@ -6366,7 +6366,7 @@ typedef struct { uint32_t offset_weight; uint32_t offset_tf; uint32_t offset_pos; -} builder_counter; +} ii_buffer_counter; typedef struct { off_t head; @@ -6382,21 +6382,21 @@ typedef struct { grn_id *recs; uint32_t *tfs; uint32_t *posts; -} builder_block; +} ii_buffer_block; -typedef struct { +struct _grn_ii_buffer { grn_obj *target; grn_obj *lexicon; grn_obj *source; grn_obj *tmp_lexicon; - builder_block *blocks; + ii_buffer_block *blocks; uint32_t nblocks; int tmpfd; // stuff for parsing off_t filepos; grn_id *blockbuf; uint32_t blockpos; - builder_counter *counters; + ii_buffer_counter *counters; uint32_t ncounters; // stuff for merging grn_ii *ii; @@ -6407,31 +6407,31 @@ typedef struct { uint8_t *packed; uint32_t packed_len; uint64_t total_chunk_size; -} grn_ii_builder; +}; static void -grn_ii_builder_flush(grn_ctx *ctx, grn_ii_builder *builder) +grn_ii_buffer_flush(grn_ctx *ctx, grn_ii_buffer *ii_buffer) { uint8_t *outbuf, *outbufp, *outbufp_; - builder_block *block; - outbuf = (uint8_t *)GRN_MALLOC(builder->blockpos * 7 * sizeof(uint32_t)); + ii_buffer_block *block; + outbuf = (uint8_t *)GRN_MALLOC(ii_buffer->blockpos * 7 * sizeof(uint32_t)); /* if (!outbuf) { err } */ outbufp_ = outbufp = outbuf; - if (!(builder->nblocks & 0x3ff)) { - builder_block *blocks = GRN_REALLOC(builder->blocks, - (builder->nblocks + 0x400) * sizeof(builder_block)); + if (!(ii_buffer->nblocks & 0x3ff)) { + ii_buffer_block *blocks = GRN_REALLOC(ii_buffer->blocks, + (ii_buffer->nblocks + 0x400) * sizeof(ii_buffer_block)); if (!blocks) { /* err */ } - builder->blocks = blocks; + ii_buffer->blocks = blocks; } - block = &builder->blocks[builder->nblocks]; - block->head = builder->filepos; + block = &ii_buffer->blocks[ii_buffer->nblocks]; + block->head = ii_buffer->filepos; block->rest = 0; block->buffer = NULL; block->buffersize = 0; { - builder_counter *counter; - grn_id tid, tid_max = grn_table_size(ctx, builder->tmp_lexicon); - for (counter = builder->counters, tid = 1; tid <= tid_max; counter++, tid++) { + ii_buffer_counter *counter; + grn_id tid, tid_max = grn_table_size(ctx, ii_buffer->tmp_lexicon); + for (counter = ii_buffer->counters, tid = 1; tid <= tid_max; counter++, tid++) { counter->offset_tf += GRN_B_ENC_SIZE(counter->last_tf - 1); counter->last_rid = 0; counter->last_tf = 0; @@ -6441,12 +6441,12 @@ grn_ii_builder_flush(grn_ctx *ctx, grn_ii_builder *builder) grn_id tid; grn_table_cursor *tc; uint8_t *pnext = (uint8_t *)&block->nextsize; - tc = grn_table_cursor_open(ctx, builder->tmp_lexicon, NULL, 0, NULL, 0, 0, -1, BUILD_ORDER); + tc = grn_table_cursor_open(ctx, ii_buffer->tmp_lexicon, NULL, 0, NULL, 0, 0, -1, II_BUFFER_ORDER); while ((tid = grn_table_cursor_next(ctx, tc)) != GRN_ID_NIL) { unsigned int key_size; - const char *key = _grn_table_key(ctx, builder->tmp_lexicon, tid, &key_size); - grn_id gtid = grn_table_add(ctx, builder->lexicon, key, key_size, NULL); - builder_counter *counter = &builder->counters[tid - 1]; + const char *key = _grn_table_key(ctx, ii_buffer->tmp_lexicon, tid, &key_size); + grn_id gtid = grn_table_add(ctx, ii_buffer->lexicon, key, key_size, NULL); + ii_buffer_counter *counter = &ii_buffer->counters[tid - 1]; if (counter->nrecs) { uint32_t offset_rid = counter->offset_rid; uint32_t offset_tf = counter->offset_tf; @@ -6461,7 +6461,7 @@ grn_ii_builder_flush(grn_ctx *ctx, grn_ii_builder *builder) counter->offset_pos = outbufp - outbuf; outbufp += offset_pos; } - if (outbufp_ + BUILD_BLOCK_READ_UNIT_SIZE < outbufp) { + if (outbufp_ + II_BUFFER_BLOCK_READ_UNIT_SIZE < outbufp) { uint32_t size = outbufp - outbufp_ + sizeof(uint32_t); memcpy(pnext, &size, sizeof(uint32_t)); pnext = outbufp; @@ -6480,13 +6480,13 @@ grn_ii_builder_flush(grn_ctx *ctx, grn_ii_builder *builder) uint32_t pos = 0; uint32_t rest; grn_id *bp; - for (bp = builder->blockbuf, rest = builder->blockpos; rest; bp++, rest--) { + for (bp = ii_buffer->blockbuf, rest = ii_buffer->blockpos; rest; bp++, rest--) { grn_id id = *bp; - if (id & BUILD_RID_FLAG) { - rid = id - BUILD_RID_FLAG; + if (id & II_BUFFER_RID_FLAG) { + rid = id - II_BUFFER_RID_FLAG; pos = 0; } else { - builder_counter *counter = &builder->counters[id - 1]; + ii_buffer_counter *counter = &ii_buffer->counters[id - 1]; if (counter->last_rid == rid) { counter->last_tf++; } else { @@ -6516,71 +6516,73 @@ grn_ii_builder_flush(grn_ctx *ctx, grn_ii_builder *builder) } } { - builder_counter *counter; - grn_id tid, tid_max = grn_table_size(ctx, builder->tmp_lexicon); - for (counter = builder->counters, tid = 1; tid <= tid_max; counter++, tid++) { + ii_buffer_counter *counter; + grn_id tid, tid_max = grn_table_size(ctx, ii_buffer->tmp_lexicon); + for (counter = ii_buffer->counters, tid = 1; tid <= tid_max; counter++, tid++) { uint8_t *p = outbuf + counter->offset_tf; GRN_B_ENC(counter->last_tf - 1, p); } - memset(builder->counters, 0, tid_max * sizeof(builder_counter)); + memset(ii_buffer->counters, 0, tid_max * sizeof(ii_buffer_counter)); } { - ssize_t r = write(builder->tmpfd, outbuf, outbufp - outbuf); - if (r > 0) { builder->filepos += r; } - block->tail = builder->filepos; + ssize_t r = write(ii_buffer->tmpfd, outbuf, outbufp - outbuf); + if (r > 0) { ii_buffer->filepos += r; } + block->tail = ii_buffer->filepos; } - builder->nblocks++; + ii_buffer->nblocks++; GRN_FREE(outbuf); - builder->blockpos = 0; - grn_obj_close(ctx, builder->tmp_lexicon); - builder->tmp_lexicon = NULL; + ii_buffer->blockpos = 0; + grn_obj_close(ctx, ii_buffer->tmp_lexicon); + ii_buffer->tmp_lexicon = NULL; } const uint32_t PAT_CACHE_SIZE = 1<<20; static void -grn_ii_builder_tokenize(grn_ctx *ctx, grn_ii_builder *builder, grn_id rid, grn_obj *value) +grn_ii_buffer_tokenize(grn_ctx *ctx, grn_ii_buffer *ii_buffer, + grn_id rid, unsigned int section, grn_obj *value) { if (GRN_TEXT_LEN(value)) { uint32_t blockpos; grn_token *token; - grn_id *buffer = builder->blockbuf; - if (BUILD_BLOCK_SIZE <= builder->blockpos + GRN_TEXT_LEN(value) * 2) { - grn_ii_builder_flush(ctx, builder); + grn_id *buffer = ii_buffer->blockbuf; + if (II_BUFFER_BLOCK_SIZE <= ii_buffer->blockpos + GRN_TEXT_LEN(value) * 2) { + grn_ii_buffer_flush(ctx, ii_buffer); } - if (!builder->tmp_lexicon) { - grn_obj *domain = grn_ctx_at(ctx, builder->lexicon->header.domain); - grn_obj *range = grn_ctx_at(ctx, DB_OBJ(builder->lexicon)->range); + if (!ii_buffer->tmp_lexicon) { + grn_obj *domain = grn_ctx_at(ctx, ii_buffer->lexicon->header.domain); + grn_obj *range = grn_ctx_at(ctx, DB_OBJ(ii_buffer->lexicon)->range); grn_obj *tokenizer; grn_obj_flags flags; - grn_table_get_info(ctx, builder->lexicon, &flags, NULL, &tokenizer, NULL); + grn_table_get_info(ctx, ii_buffer->lexicon, &flags, NULL, &tokenizer, NULL); flags &= ~GRN_OBJ_PERSISTENT; - builder->tmp_lexicon = grn_table_create(ctx, NULL, 0, NULL, flags, domain, range); - grn_obj_set_info(ctx, builder->tmp_lexicon, GRN_INFO_DEFAULT_TOKENIZER, tokenizer); + ii_buffer->tmp_lexicon = grn_table_create(ctx, NULL, 0, NULL, flags, domain, range); + grn_obj_set_info(ctx, ii_buffer->tmp_lexicon, GRN_INFO_DEFAULT_TOKENIZER, tokenizer); if (flags & GRN_OBJ_TABLE_PAT_KEY) { - grn_pat_cache_enable(ctx, (grn_pat *)builder->tmp_lexicon, PAT_CACHE_SIZE); + grn_pat_cache_enable(ctx, (grn_pat *)ii_buffer->tmp_lexicon, PAT_CACHE_SIZE); } } - blockpos = builder->blockpos; - buffer[blockpos++] = rid + BUILD_RID_FLAG; - if ((token = grn_token_open(ctx, builder->tmp_lexicon, + blockpos = ii_buffer->blockpos; + buffer[blockpos++] = rid + II_BUFFER_RID_FLAG; + if ((token = grn_token_open(ctx, ii_buffer->tmp_lexicon, GRN_TEXT_VALUE(value), GRN_TEXT_LEN(value), grn_token_add))) { uint32_t pos, blockpos_ = blockpos; for (pos = 0; !token->status; pos++) { grn_id tid; if ((tid = grn_token_next(ctx, token))) { - if (tid > builder->ncounters) { - uint32_t ncounters = grn_table_size(ctx, builder->tmp_lexicon) + BUILD_NCOUNTERS_MARGIN; - builder_counter *counters = GRN_REALLOC(builder->counters, - ncounters * sizeof(builder_counter)); + if (tid > ii_buffer->ncounters) { + uint32_t ncounters = grn_table_size(ctx, ii_buffer->tmp_lexicon) + + II_BUFFER_NCOUNTERS_MARGIN; + ii_buffer_counter *counters = GRN_REALLOC(ii_buffer->counters, + ncounters * sizeof(ii_buffer_counter)); if (!counters) { return; } - memset(&counters[builder->ncounters], 0, - (ncounters - builder->ncounters) * sizeof(builder_counter)); - builder->ncounters = ncounters; - builder->counters = counters; + memset(&counters[ii_buffer->ncounters], 0, + (ncounters - ii_buffer->ncounters) * sizeof(ii_buffer_counter)); + ii_buffer->ncounters = ncounters; + ii_buffer->counters = counters; } { - builder_counter *counter = &builder->counters[tid - 1]; + ii_buffer_counter *counter = &ii_buffer->counters[tid - 1]; buffer[blockpos++] = tid; if (counter->last_rid != rid) { counter->offset_rid += GRN_B_ENC_SIZE(rid - counter->last_rid); @@ -6606,44 +6608,12 @@ grn_ii_builder_tokenize(grn_ctx *ctx, grn_ii_builder *builder, grn_id rid, grn_o GRN_LOG(ctx, GRN_LOG_WARNING, "%d > %d", blockpos - blockpos_, GRN_TEXT_LEN(value)); } } - builder->blockpos = blockpos; + ii_buffer->blockpos = blockpos; } } static void -grn_ii_builder_parse(grn_ctx *ctx, grn_ii_builder *builder) -{ - grn_table_cursor *tc; - builder->ncounters = BUILD_NCOUNTERS_MARGIN; - builder->counters = GRN_CALLOC(builder->ncounters * sizeof(builder_counter)); - builder->blockbuf = (grn_id *)GRN_MALLOC(BUILD_BLOCK_SIZE * sizeof(grn_id)); - builder->blockpos = 0; - builder->tmpfd = open(TMPFILE_PATH, O_WRONLY|O_CREAT|O_TRUNC|O_NONBLOCK, 0666); - builder->filepos = 0; - if ((tc = grn_table_cursor_open(ctx, builder->target, - NULL, 0, NULL, 0, 0, -1, GRN_CURSOR_BY_ID))) { - grn_id id; - grn_obj rv; - GRN_TEXT_INIT(&rv, 0); - while ((id = grn_table_cursor_next(ctx, tc)) != GRN_ID_NIL) { - GRN_BULK_REWIND(&rv); - grn_obj_get_value(ctx, builder->source, id, &rv); - grn_ii_builder_tokenize(ctx, builder, id, &rv); - } - GRN_OBJ_FIN(ctx, &rv); - if (builder->blockpos) { - grn_ii_builder_flush(ctx, builder); - } - grn_table_cursor_close(ctx, tc); - } - close(builder->tmpfd); - GRN_FREE(builder->blockbuf); - GRN_FREE(builder->counters); - GRN_LOG(ctx, GRN_LOG_NOTICE, "nblocks: %d", builder->nblocks); -} - -static void -grn_ii_builder_fetch(grn_ctx *ctx, grn_ii_builder *builder, builder_block *block) +grn_ii_buffer_fetch(grn_ctx *ctx, grn_ii_buffer *ii_buffer, ii_buffer_block *block) { if (!block->rest) { if (block->head < block->tail) { @@ -6658,7 +6628,7 @@ grn_ii_builder_fetch(grn_ctx *ctx, grn_ii_builder *builder, builder_block *block return; } } - pread(builder->tmpfd, block->buffer, bytesize, block->head); + pread(ii_buffer->tmpfd, block->buffer, bytesize, block->head); block->head += bytesize; block->bufcur = block->buffer; if (block->head >= block->tail) { @@ -6686,37 +6656,37 @@ grn_ii_builder_fetch(grn_ctx *ctx, grn_ii_builder *builder, builder_block *block } static void -grn_ii_builder_chunk_flush(grn_ctx *ctx, grn_ii_builder *builder) +grn_ii_buffer_chunk_flush(grn_ctx *ctx, grn_ii_buffer *ii_buffer) { grn_io_win io_win; uint32_t chunk_number; - chunk_new(ctx, builder->ii, &chunk_number, builder->packed_len); + chunk_new(ctx, ii_buffer->ii, &chunk_number, ii_buffer->packed_len); GRN_LOG(ctx, GRN_LOG_INFO, "chunk:%d, packed_len:%d", - chunk_number, builder->packed_len); - fake_map2(ctx, builder->ii->chunk, &io_win, builder->packed, - chunk_number, builder->packed_len); + chunk_number, ii_buffer->packed_len); + fake_map2(ctx, ii_buffer->ii->chunk, &io_win, ii_buffer->packed, + chunk_number, ii_buffer->packed_len); grn_io_win_unmap2(&io_win); - builder->term_buffer->header.chunk = chunk_number; - builder->term_buffer->header.chunk_size = builder->packed_len; - builder->term_buffer->header.buffer_free = + ii_buffer->term_buffer->header.chunk = chunk_number; + ii_buffer->term_buffer->header.chunk_size = ii_buffer->packed_len; + ii_buffer->term_buffer->header.buffer_free = S_SEGMENT - sizeof(buffer_header) - - builder->term_buffer->header.nterms * sizeof(buffer_term); - builder->term_buffer->header.nterms_void = 0; - buffer_segment_update(builder->ii, builder->lseg, builder->dseg); - builder->ii->header->total_chunk_size += builder->packed_len; - builder->term_buffer = NULL; - builder->total_chunk_size += builder->packed_len; - builder->packed = NULL; - builder->packed_len = 0; + ii_buffer->term_buffer->header.nterms * sizeof(buffer_term); + ii_buffer->term_buffer->header.nterms_void = 0; + buffer_segment_update(ii_buffer->ii, ii_buffer->lseg, ii_buffer->dseg); + ii_buffer->ii->header->total_chunk_size += ii_buffer->packed_len; + ii_buffer->term_buffer = NULL; + ii_buffer->total_chunk_size += ii_buffer->packed_len; + ii_buffer->packed = NULL; + ii_buffer->packed_len = 0; } static void -grn_ii_builder_merge_one(grn_ctx *ctx, grn_ii_builder *builder, - grn_id tid, builder_block *hits[], int nhits) +grn_ii_buffer_merge(grn_ctx *ctx, grn_ii_buffer *ii_buffer, + grn_id tid, ii_buffer_block *hits[], int nhits) { - uint32_t *a = array_get(ctx, builder->ii, tid); + uint32_t *a = array_get(ctx, ii_buffer->ii, tid); if (nhits == 1 && hits[0]->nrecs == 1 && hits[0]->nposts == 1) { - builder_block *block = hits[0]; + ii_buffer_block *block = hits[0]; uint8_t *p = block->bufcur; grn_id rid; uint32_t tf, pos; @@ -6730,44 +6700,44 @@ grn_ii_builder_merge_one(grn_ctx *ctx, grn_ii_builder *builder, } a[0] = (rid << 1) + 1; a[1] = pos; - grn_ii_builder_fetch(ctx, builder, block); + grn_ii_buffer_fetch(ctx, ii_buffer, block); } else { uint64_t spos = 0; uint32_t nrecs = 0; uint32_t nposts = 0; uint16_t nterm; buffer_term *bt; - if (!builder->term_buffer) { + if (!ii_buffer->term_buffer) { uint32_t lseg; void *term_buffer; for (lseg = 0; lseg < GRN_II_MAX_LSEG; lseg++) { - if (builder->ii->header->binfo[lseg] == NOT_ASSIGNED) { break; } + if (ii_buffer->ii->header->binfo[lseg] == NOT_ASSIGNED) { break; } } - builder->lseg = lseg; - builder->dseg = segment_get(ctx, builder->ii); - GRN_IO_SEG_REF(builder->ii->seg, builder->dseg, term_buffer); - builder->term_buffer = (buffer *)term_buffer; - } - nterm = builder->term_buffer->header.nterms++; - bt = &builder->term_buffer->terms[nterm]; - a[0] = SEG2POS(builder->lseg, (sizeof(buffer_header) + sizeof(buffer_term) * nterm)); + ii_buffer->lseg = lseg; + ii_buffer->dseg = segment_get(ctx, ii_buffer->ii); + GRN_IO_SEG_REF(ii_buffer->ii->seg, ii_buffer->dseg, term_buffer); + ii_buffer->term_buffer = (buffer *)term_buffer; + } + nterm = ii_buffer->term_buffer->header.nterms++; + bt = &ii_buffer->term_buffer->terms[nterm]; + a[0] = SEG2POS(ii_buffer->lseg, (sizeof(buffer_header) + sizeof(buffer_term) * nterm)); { int i; for (i = 0; i < nhits; i++) { - builder_block *block = hits[i]; + ii_buffer_block *block = hits[i]; nrecs += block->nrecs; nposts += block->nposts; } } - datavec_reset(ctx, builder->data_vectors, builder->ii->n_elements, nrecs, nrecs * 2 + nposts); + datavec_reset(ctx, ii_buffer->data_vectors, ii_buffer->ii->n_elements, nrecs, nrecs * 2 + nposts); { - uint32_t *ridp = builder->data_vectors[0].data; - uint32_t *tfp = builder->data_vectors[1].data; - uint32_t *posp = builder->data_vectors[2].data; + uint32_t *ridp = ii_buffer->data_vectors[0].data; + uint32_t *tfp = ii_buffer->data_vectors[1].data; + uint32_t *posp = ii_buffer->data_vectors[2].data; uint32_t lr = 0; int i; for (i = 0; i < nhits; i++) { - builder_block *block = hits[i]; + ii_buffer_block *block = hits[i]; uint8_t *p = block->bufcur; uint32_t n = block->nrecs; if (n) { @@ -6788,121 +6758,218 @@ grn_ii_builder_merge_one(grn_ctx *ctx, grn_ii_builder *builder, } block->rest -= (p - block->bufcur); block->bufcur = p; - grn_ii_builder_fetch(ctx, builder, block); + grn_ii_buffer_fetch(ctx, ii_buffer, block); } - builder->data_vectors[0].data_size = nrecs; - builder->data_vectors[1].data_size = nrecs; - builder->data_vectors[2].data_size = nposts; + ii_buffer->data_vectors[0].data_size = nrecs; + ii_buffer->data_vectors[1].data_size = nrecs; + ii_buffer->data_vectors[2].data_size = nposts; - builder->data_vectors[0].flags = ((nrecs < 16) || (nrecs <= (lr >> 8))) ? 0 : USE_P_ENC; - builder->data_vectors[1].flags = (nrecs < 3) ? 0 : USE_P_ENC; - builder->data_vectors[2].flags = + ii_buffer->data_vectors[0].flags = ((nrecs < 16) || (nrecs <= (lr >> 8))) ? 0 : USE_P_ENC; + ii_buffer->data_vectors[1].flags = (nrecs < 3) ? 0 : USE_P_ENC; + ii_buffer->data_vectors[2].flags = (((nposts < 32) || (nposts <= (spos >> 13))) ? 0 : USE_P_ENC)|ODD; } - if (!builder->packed) { builder->packed = GRN_MALLOC(BUILD_PACKED_BUFFER_SIZE * 2); } + if (!ii_buffer->packed) { ii_buffer->packed = GRN_MALLOC(II_BUFFER_PACKED_BUFFER_SIZE * 2); } { - int packed_len = grn_p_encv(ctx, builder->data_vectors, builder->ii->n_elements, - builder->packed + builder->packed_len); + int packed_len = grn_p_encv(ctx, ii_buffer->data_vectors, ii_buffer->ii->n_elements, + ii_buffer->packed + ii_buffer->packed_len); bt->tid = tid; bt->size_in_buffer = 0; bt->pos_in_buffer = 0; bt->size_in_chunk = packed_len; - bt->pos_in_chunk = builder->packed_len; - builder->packed_len += packed_len; - } - if (nterm == BUILD_NTERMS_PER_BUFFER || builder->packed_len > BUILD_PACKED_BUFFER_SIZE) { - grn_ii_builder_chunk_flush(ctx, builder); + bt->pos_in_chunk = ii_buffer->packed_len; + ii_buffer->packed_len += packed_len; + } + if (nterm == II_BUFFER_NTERMS_PER_BUFFER || ii_buffer->packed_len > II_BUFFER_PACKED_BUFFER_SIZE) { + grn_ii_buffer_chunk_flush(ctx, ii_buffer); + } + } +} + +grn_ii_buffer * +grn_ii_buffer_open(grn_ctx *ctx, grn_ii *ii) +{ + if (ii && ii->lexicon) { + grn_ii_buffer *ii_buffer = GRN_MALLOCN(grn_ii_buffer, 1); + if (ii_buffer) { + ii_buffer->ii = ii; + ii_buffer->lexicon = ii->lexicon; + ii_buffer->tmp_lexicon = NULL; + ii_buffer->nblocks = 0; + ii_buffer->blocks = NULL; + ii_buffer->ncounters = II_BUFFER_NCOUNTERS_MARGIN; + ii_buffer->blockpos = 0; + ii_buffer->filepos = 0; + ii_buffer->counters = GRN_CALLOC(ii_buffer->ncounters * + sizeof(ii_buffer_counter)); + if (ii_buffer->counters) { + ii_buffer->blockbuf = (grn_id *)GRN_MALLOC(II_BUFFER_BLOCK_SIZE * + sizeof(grn_id)); + if (ii_buffer->blockbuf) { + ii_buffer->tmpfd = open(TMPFILE_PATH, + O_WRONLY|O_CREAT|O_TRUNC|O_NONBLOCK, 0666); + if (ii_buffer->tmpfd) { + grn_obj_flags flags; + grn_table_get_info(ctx, ii->lexicon, + &flags, NULL, NULL, NULL); + if (flags & GRN_OBJ_TABLE_PAT_KEY) { + grn_pat_cache_enable(ctx, (grn_pat *)ii->lexicon, + PAT_CACHE_SIZE); + } + return ii_buffer; + } else { + ERR(GRN_INVALID_ARGUMENT, "temporary file open failed"); + } + GRN_FREE(ii_buffer->blockbuf); + } + GRN_FREE(ii_buffer->counters); + } + GRN_FREE(ii_buffer); } + } else { + ERR(GRN_INVALID_ARGUMENT, "ii or ii->lexicon is NULL"); } + return NULL; } -static void -grn_ii_builder_merge(grn_ctx *ctx, grn_ii_builder *builder) -{ - builder->term_buffer = NULL; - builder->packed = NULL; - builder->packed_len = 0; - builder->total_chunk_size = 0; - builder->tmpfd = open(TMPFILE_PATH, O_RDONLY); - datavec_init(ctx, builder->data_vectors, builder->ii->n_elements, 0, 0); +grn_rc +grn_ii_buffer_append(grn_ctx *ctx, grn_ii_buffer *ii_buffer, + grn_id rid, unsigned int section, grn_obj *value) +{ + grn_ii_buffer_tokenize(ctx, ii_buffer, rid, section, value); + return ctx->rc; +} + +grn_rc +grn_ii_buffer_commit(grn_ctx *ctx, grn_ii_buffer *ii_buffer) +{ + if (ii_buffer->blockpos) { + grn_ii_buffer_flush(ctx, ii_buffer); + } + + close(ii_buffer->tmpfd); + GRN_FREE(ii_buffer->blockbuf); + GRN_FREE(ii_buffer->counters); + GRN_LOG(ctx, GRN_LOG_NOTICE, "nblocks: %d", ii_buffer->nblocks); + + + ii_buffer->term_buffer = NULL; + ii_buffer->packed = NULL; + ii_buffer->packed_len = 0; + ii_buffer->total_chunk_size = 0; + ii_buffer->tmpfd = open(TMPFILE_PATH, O_RDONLY); + datavec_init(ctx, ii_buffer->data_vectors, ii_buffer->ii->n_elements, 0, 0); { uint32_t i; - for (i = 0; i < builder->nblocks; i++) { - grn_ii_builder_fetch(ctx, builder, &builder->blocks[i]); + for (i = 0; i < ii_buffer->nblocks; i++) { + grn_ii_buffer_fetch(ctx, ii_buffer, &ii_buffer->blocks[i]); } } { - builder_block *hits[builder->nblocks]; + ii_buffer_block *hits[ii_buffer->nblocks]; grn_id tid; grn_table_cursor *tc; - tc = grn_table_cursor_open(ctx, builder->lexicon, NULL, 0, NULL, 0, 0, -1, BUILD_ORDER); + tc = grn_table_cursor_open(ctx, ii_buffer->lexicon, + NULL, 0, NULL, 0, 0, -1, II_BUFFER_ORDER); while ((tid = grn_table_cursor_next(ctx, tc)) != GRN_ID_NIL) { int nrests = 0; int nhits = 0; uint32_t i; - for (i = 0; i < builder->nblocks; i++) { - if (builder->blocks[i].tid == tid) { - hits[nhits++] = &builder->blocks[i]; + for (i = 0; i < ii_buffer->nblocks; i++) { + if (ii_buffer->blocks[i].tid == tid) { + hits[nhits++] = &ii_buffer->blocks[i]; } - if (builder->blocks[i].tid) { nrests++; } + if (ii_buffer->blocks[i].tid) { nrests++; } } - if (nhits) { grn_ii_builder_merge_one(ctx, builder, tid, hits, nhits); } + if (nhits) { grn_ii_buffer_merge(ctx, ii_buffer, tid, hits, nhits); } if (!nrests) { break; } } - if (builder->packed_len) { - grn_ii_builder_chunk_flush(ctx, builder); + if (ii_buffer->packed_len) { + grn_ii_buffer_chunk_flush(ctx, ii_buffer); } grn_table_cursor_close(ctx, tc); } - datavec_fin(ctx, builder->data_vectors); + datavec_fin(ctx, ii_buffer->data_vectors); GRN_LOG(ctx, GRN_LOG_NOTICE, "tmpfile_size:%jd > total_chunk_size:%zu", - builder->filepos, builder->total_chunk_size); - close(builder->tmpfd); + ii_buffer->filepos, ii_buffer->total_chunk_size); + close(ii_buffer->tmpfd); unlink(TMPFILE_PATH); + return ctx->rc; } grn_rc -grn_ii_build(grn_ctx *ctx, grn_ii *ii) +grn_ii_buffer_close(grn_ctx *ctx, grn_ii_buffer *ii_buffer) { - grn_rc rc = GRN_INVALID_ARGUMENT; - grn_ii_builder builder; - grn_id *s = ii->obj.source; - grn_obj *src, *target; - if (!(ii->obj.source_size) || !s) { goto exit; } - if (!(src = grn_ctx_at(ctx, *s))) { - goto exit; - } - if (!(target = GRN_OBJ_TABLEP(src) ? src : grn_ctx_at(ctx, src->header.domain))) { - goto exit; + uint32_t i; + for (i = 0; i < ii_buffer->nblocks; i++) { + if (ii_buffer->blocks[i].buffer) { + GRN_FREE(ii_buffer->blocks[i].buffer); + } } + GRN_FREE(ii_buffer->blocks); + GRN_FREE(ii_buffer); + return ctx->rc; +} - builder.ii = ii; - builder.source = src; - builder.target = target; - builder.lexicon = ii->lexicon; - builder.tmp_lexicon = NULL; - { - grn_obj_flags flags; - grn_table_get_info(ctx, builder.lexicon, &flags, NULL, NULL, NULL); - if (flags & GRN_OBJ_TABLE_PAT_KEY) { - grn_pat_cache_enable(ctx, (grn_pat *)builder.lexicon, PAT_CACHE_SIZE); +static void +grn_ii_buffer_parse(grn_ctx *ctx, grn_ii_buffer *ii_buffer, + grn_obj *target, int ncols, grn_obj **cols) +{ + grn_table_cursor *tc; + if ((tc = grn_table_cursor_open(ctx, target, + NULL, 0, NULL, 0, 0, -1, + GRN_CURSOR_BY_ID))) { + grn_id id; + grn_obj rv; + GRN_TEXT_INIT(&rv, 0); + while ((id = grn_table_cursor_next(ctx, tc)) != GRN_ID_NIL) { + int i; + for (i = 0; i < ncols; i++) { + GRN_BULK_REWIND(&rv); + grn_obj_get_value(ctx, cols[i], id, &rv); + grn_ii_buffer_tokenize(ctx, ii_buffer, id, i + 1, &rv); + } } + GRN_OBJ_FIN(ctx, &rv); + grn_table_cursor_close(ctx, tc); } - builder.nblocks = 0; - builder.blocks = NULL; - - grn_ii_builder_parse(ctx, &builder); - grn_ii_builder_merge(ctx, &builder); +} - { - uint32_t i; - for (i = 0; i < builder.nblocks; i++) { - if (builder.blocks[i].buffer) { GRN_FREE(builder.blocks[i].buffer); } +grn_rc +grn_ii_build(grn_ctx *ctx, grn_ii *ii) +{ + grn_ii_buffer *ii_buffer = grn_ii_buffer_open(ctx, ii); + if (ii_buffer) { + grn_id *s = ii->obj.source; + if ((ii->obj.source_size) && s) { + int ncols = ii->obj.source_size / sizeof(grn_id); + grn_obj **cols = GRN_MALLOCN(grn_obj *, ncols); + if (cols) { + int i; + for (i = 0; i < ncols; i++) { + if (!(cols[i] = grn_ctx_at(ctx, s[i]))) { break; } + } + if (i == ncols) { + grn_obj *target = cols[0]; + if (!GRN_OBJ_TABLEP(target)) { + target = grn_ctx_at(ctx, target->header.domain); + } + if (target) { + grn_ii_buffer_parse(ctx, ii_buffer, target, ncols, cols); + grn_ii_buffer_commit(ctx, ii_buffer); + } else { + ERR(GRN_INVALID_ARGUMENT, "failed to resolve the target"); + } + } else { + ERR(GRN_INVALID_ARGUMENT, "failed to resolve a column (%d)", i); + } + GRN_FREE(cols); + } + } else { + ERR(GRN_INVALID_ARGUMENT, "ii->obj.source is void"); } + grn_ii_buffer_close(ctx, ii_buffer); } - GRN_FREE(builder.blocks); - rc = GRN_SUCCESS; /* FIXME */ -exit : - return rc; + return ctx->rc; } #endif /* WIN32 */ Modified: lib/ii.h (+6 -0) =================================================================== --- lib/ii.h 2012-02-13 18:44:06 +0900 (af289d3) +++ lib/ii.h 2012-02-13 20:24:59 +0900 (4d9708d) @@ -40,6 +40,7 @@ extern "C" { #endif typedef struct _grn_ii grn_ii; +typedef struct _grn_ii_buffer grn_ii_buffer; struct _grn_ii { grn_db_obj obj; @@ -188,6 +189,11 @@ void grn_ii_inspect_elements(grn_ctx *ctx, grn_ii *ii, grn_obj *buf); void grn_ii_cursor_inspect(grn_ctx *ctx, grn_ii_cursor *c, grn_obj *buf); grn_rc grn_ii_build(grn_ctx *ctx, grn_ii *ii); +grn_ii_buffer *grn_ii_buffer_open(grn_ctx *ctx, grn_ii *ii); +grn_rc grn_ii_buffer_append(grn_ctx *ctx, grn_ii_buffer *ii_buffer, + grn_id rid, unsigned int section, grn_obj *value); +grn_rc grn_ii_buffer_commit(grn_ctx *ctx, grn_ii_buffer *ii_buffer); +grn_rc grn_ii_buffer_close(grn_ctx *ctx, grn_ii_buffer *ii_buffer); #ifdef __cplusplus }