null+****@clear*****
null+****@clear*****
2012年 2月 15日 (水) 11:06:17 JST
Daijiro MORI 2012-02-15 11:06:17 +0900 (Wed, 15 Feb 2012) New Revision: 57ffad03a73c3731ea18401b6695e986daf2b5a3 Log: grn_ii_buffer: calculate the outbuf size accurately. Modified files: lib/ii.c Modified: lib/ii.c (+153 -102) =================================================================== --- lib/ii.c 2012-02-14 20:03:27 +0900 (23f5e4a) +++ lib/ii.c 2012-02-15 11:06:17 +0900 (090563e) @@ -6405,20 +6405,17 @@ struct _grn_ii_buffer { uint64_t total_chunk_size; }; -static void -grn_ii_buffer_flush(grn_ctx *ctx, grn_ii_buffer *ii_buffer) +static ii_buffer_block * +block_new(grn_ctx *ctx, grn_ii_buffer *ii_buffer) { - uint8_t *outbuf, *outbufp, *outbufp_; ii_buffer_block *block; - GRN_LOG(ctx, GRN_LOG_NOTICE, "flushing:%d npostings:%u", - ii_buffer->nblocks, ii_buffer->blockpos); - outbuf = (uint8_t *)GRN_MALLOC(ii_buffer->blockpos * 7 * sizeof(uint32_t)); - /* if (!outbuf) { err } */ - outbufp_ = outbufp = outbuf; if (!(ii_buffer->nblocks & 0x3ff)) { - ii_buffer_block *blocks = GRN_REALLOC(ii_buffer->blocks, - (ii_buffer->nblocks + 0x400) * sizeof(ii_buffer_block)); - if (!blocks) { /* err */ } + ii_buffer_block *blocks; + if (!(blocks = GRN_REALLOC(ii_buffer->blocks, + (ii_buffer->nblocks + 0x400) * + sizeof(ii_buffer_block)))) { + return NULL; + } ii_buffer->blocks = blocks; } block = &ii_buffer->blocks[ii_buffer->nblocks]; @@ -6426,114 +6423,168 @@ grn_ii_buffer_flush(grn_ctx *ctx, grn_ii_buffer *ii_buffer) block->rest = 0; block->buffer = NULL; block->buffersize = 0; - { - ii_buffer_counter *counter; - grn_id tid, tid_max = grn_table_size(ctx, ii_buffer->tmp_lexicon); - for (counter = ii_buffer->counters, tid = 1; tid <= tid_max; counter++, tid++) { - counter->offset_tf += GRN_B_ENC_SIZE(counter->last_tf - 1); - counter->last_rid = 0; - counter->last_tf = 0; - } - } - { - grn_id tid; - grn_table_cursor *tc; - uint8_t *pnext = (uint8_t *)&block->nextsize; - tc = grn_table_cursor_open(ctx, ii_buffer->tmp_lexicon, NULL, 0, NULL, 0, 0, -1, II_BUFFER_ORDER); - while ((tid = grn_table_cursor_next(ctx, tc)) != GRN_ID_NIL) { - unsigned int key_size; - const char *key = _grn_table_key(ctx, ii_buffer->tmp_lexicon, tid, &key_size); - grn_id gtid = grn_table_add(ctx, ii_buffer->lexicon, key, key_size, NULL); - ii_buffer_counter *counter = &ii_buffer->counters[tid - 1]; - if (counter->nrecs) { - uint32_t offset_rid = counter->offset_rid; - uint32_t offset_tf = counter->offset_tf; - uint32_t offset_pos = counter->offset_pos; - GRN_B_ENC(gtid, outbufp); - GRN_B_ENC(counter->nrecs, outbufp); - GRN_B_ENC(counter->nposts, outbufp); - counter->offset_rid = outbufp - outbuf; - outbufp += offset_rid; - counter->offset_tf = outbufp - outbuf; - outbufp += offset_tf; - counter->offset_pos = outbufp - outbuf; - outbufp += offset_pos; - } - if (outbufp_ + II_BUFFER_BLOCK_READ_UNIT_SIZE < outbufp) { - uint32_t size = outbufp - outbufp_ + sizeof(uint32_t); - memcpy(pnext, &size, sizeof(uint32_t)); - pnext = outbufp; - outbufp += sizeof(uint32_t); - outbufp_ = outbufp; - } - } - grn_table_cursor_close(ctx, tc); - if (outbufp_ < outbufp) { - uint32_t size = outbufp - outbufp_; + return block; +} + +static uint8_t * +allocate_outbuf(grn_ctx *ctx, grn_ii_buffer *ii_buffer) +{ + size_t bufsize = 0, bufsize_ = 0; + ii_buffer_counter *counter = ii_buffer->counters; + grn_id tid, tid_max = grn_table_size(ctx, ii_buffer->tmp_lexicon); + for (tid = 1; tid <= tid_max; counter++, tid++) { + counter->offset_tf += GRN_B_ENC_SIZE(counter->last_tf - 1); + counter->last_rid = 0; + counter->last_tf = 0; + bufsize += 5; + bufsize += GRN_B_ENC_SIZE(counter->nrecs); + bufsize += GRN_B_ENC_SIZE(counter->nposts); + bufsize += counter->offset_rid; + bufsize += counter->offset_tf; + bufsize += counter->offset_pos; + if (bufsize_ + II_BUFFER_BLOCK_READ_UNIT_SIZE < bufsize) { + bufsize += sizeof(uint32_t); + bufsize_ = bufsize; + } + } + GRN_LOG(ctx, GRN_LOG_INFO, "flushing:%d bufsize:%zu", + ii_buffer->nblocks, bufsize); + return (uint8_t *)GRN_MALLOC(bufsize); +} + +static size_t +encode_terms(grn_ctx *ctx, grn_ii_buffer *ii_buffer, + uint8_t *outbuf, ii_buffer_block *block) +{ + grn_id tid; + uint8_t *outbufp = outbuf; + uint8_t *outbufp_ = outbuf; + grn_table_cursor *tc; + uint8_t *pnext = (uint8_t *)&block->nextsize; + tc = grn_table_cursor_open(ctx, ii_buffer->tmp_lexicon, + NULL, 0, NULL, 0, 0, -1, II_BUFFER_ORDER); + while ((tid = grn_table_cursor_next(ctx, tc)) != GRN_ID_NIL) { + unsigned int key_size; + const char *key = _grn_table_key(ctx, ii_buffer->tmp_lexicon, + tid, &key_size); + grn_id gtid = grn_table_add(ctx, ii_buffer->lexicon, key, key_size, NULL); + ii_buffer_counter *counter = &ii_buffer->counters[tid - 1]; + if (counter->nrecs) { + uint32_t offset_rid = counter->offset_rid; + uint32_t offset_tf = counter->offset_tf; + uint32_t offset_pos = counter->offset_pos; + GRN_B_ENC(gtid, outbufp); + GRN_B_ENC(counter->nrecs, outbufp); + GRN_B_ENC(counter->nposts, outbufp); + counter->offset_rid = outbufp - outbuf; + outbufp += offset_rid; + counter->offset_tf = outbufp - outbuf; + outbufp += offset_tf; + counter->offset_pos = outbufp - outbuf; + outbufp += offset_pos; + } + if (outbufp_ + II_BUFFER_BLOCK_READ_UNIT_SIZE < outbufp) { + uint32_t size = outbufp - outbufp_ + sizeof(uint32_t); memcpy(pnext, &size, sizeof(uint32_t)); + pnext = outbufp; + outbufp += sizeof(uint32_t); + outbufp_ = outbufp; } } - { - grn_id rid = 0; - uint32_t pos = 0; - uint32_t rest; - grn_id *bp; - for (bp = ii_buffer->blockbuf, rest = ii_buffer->blockpos; rest; bp++, rest--) { - grn_id id = *bp; - if (id & II_BUFFER_RID_FLAG) { - rid = id - II_BUFFER_RID_FLAG; - pos = 0; + grn_table_cursor_close(ctx, tc); + if (outbufp_ < outbufp) { + uint32_t size = outbufp - outbufp_; + memcpy(pnext, &size, sizeof(uint32_t)); + } + return outbufp - outbuf; +} + +static void +encode_postings(grn_ctx *ctx, grn_ii_buffer *ii_buffer, uint8_t *outbuf) +{ + grn_id rid = 0; + uint32_t pos = 0; + uint32_t rest; + grn_id *bp = ii_buffer->blockbuf; + for (rest = ii_buffer->blockpos; rest; bp++, rest--) { + grn_id id = *bp; + if (id & II_BUFFER_RID_FLAG) { + rid = id - II_BUFFER_RID_FLAG; + pos = 0; + } else { + ii_buffer_counter *counter = &ii_buffer->counters[id - 1]; + if (counter->last_rid == rid) { + counter->last_tf++; } else { - ii_buffer_counter *counter = &ii_buffer->counters[id - 1]; - if (counter->last_rid == rid) { - counter->last_tf++; - } else { - if (counter->last_tf) { - uint8_t *p = outbuf + counter->offset_tf; - GRN_B_ENC(counter->last_tf - 1, p); - counter->offset_tf = p - outbuf; - } - { - uint8_t *p = outbuf + counter->offset_rid; - GRN_B_ENC(rid - counter->last_rid, p); - counter->offset_rid = p - outbuf; - } - counter->last_rid = rid; - counter->last_sid = 0; - counter->last_tf = 1; - counter->last_pos = 0; + if (counter->last_tf) { + uint8_t *p = outbuf + counter->offset_tf; + GRN_B_ENC(counter->last_tf - 1, p); + counter->offset_tf = p - outbuf; } { - uint8_t *p = outbuf + counter->offset_pos; - GRN_B_ENC(pos - counter->last_pos, p); - counter->offset_pos = p - outbuf; + uint8_t *p = outbuf + counter->offset_rid; + GRN_B_ENC(rid - counter->last_rid, p); + counter->offset_rid = p - outbuf; } - counter->last_pos = pos; - pos++; + counter->last_rid = rid; + counter->last_sid = 0; + counter->last_tf = 1; + counter->last_pos = 0; } + { + uint8_t *p = outbuf + counter->offset_pos; + GRN_B_ENC(pos - counter->last_pos, p); + counter->offset_pos = p - outbuf; + } + counter->last_pos = pos; + pos++; } } - { - ii_buffer_counter *counter; - grn_id tid, tid_max = grn_table_size(ctx, ii_buffer->tmp_lexicon); - for (counter = ii_buffer->counters, tid = 1; tid <= tid_max; counter++, tid++) { - uint8_t *p = outbuf + counter->offset_tf; - GRN_B_ENC(counter->last_tf - 1, p); - } - memset(ii_buffer->counters, 0, tid_max * sizeof(ii_buffer_counter)); +} + +static void +encode_last_tf(grn_ctx *ctx, grn_ii_buffer *ii_buffer, uint8_t *outbuf) +{ + ii_buffer_counter *counter = ii_buffer->counters; + grn_id tid, tid_max = grn_table_size(ctx, ii_buffer->tmp_lexicon); + for (tid = 1; tid <= tid_max; counter++, tid++) { + uint8_t *p = outbuf + counter->offset_tf; + GRN_B_ENC(counter->last_tf - 1, p); } +} + +static void +grn_ii_buffer_flush(grn_ctx *ctx, grn_ii_buffer *ii_buffer) +{ + size_t encsize; + uint8_t *outbuf; + ii_buffer_block *block; + GRN_LOG(ctx, GRN_LOG_NOTICE, "flushing:%d npostings:%u", + ii_buffer->nblocks, ii_buffer->blockpos); + if (!(block = block_new(ctx, ii_buffer))) { return; } + if (!(outbuf = allocate_outbuf(ctx, ii_buffer))) { return; } + encsize = encode_terms(ctx, ii_buffer, outbuf, block); + encode_postings(ctx, ii_buffer, outbuf); + encode_last_tf(ctx, ii_buffer, outbuf); { - ssize_t r = write(ii_buffer->tmpfd, outbuf, outbufp - outbuf); - if (r > 0) { ii_buffer->filepos += r; } + ssize_t r = write(ii_buffer->tmpfd, outbuf, encsize); + if (r != encsize) { + ERR(GRN_INPUT_OUTPUT_ERROR, "write returned %d != %d", r, encsize); + return; + } + ii_buffer->filepos += r; block->tail = ii_buffer->filepos; - GRN_LOG(ctx, GRN_LOG_NOTICE, "flushed: %d encoded_size:%jdKB", - ii_buffer->nblocks, r >> 10); } - ii_buffer->nblocks++; GRN_FREE(outbuf); - ii_buffer->blockpos = 0; + memset(ii_buffer->counters, 0, + grn_table_size(ctx, ii_buffer->tmp_lexicon) * + sizeof(ii_buffer_counter)); grn_obj_close(ctx, ii_buffer->tmp_lexicon); + GRN_LOG(ctx, GRN_LOG_NOTICE, "flushed: %d encsize:%zu", + ii_buffer->nblocks, encsize); ii_buffer->tmp_lexicon = NULL; + ii_buffer->nblocks++; + ii_buffer->blockpos = 0; } const uint32_t PAT_CACHE_SIZE = 1<<20;