Susumu Yata
null+****@clear*****
Wed Mar 23 19:46:05 JST 2016
Susumu Yata 2016-03-23 19:46:05 +0900 (Wed, 23 Mar 2016) New Revision: 468023ca14429561014a63fa987a4f16f9977291 https://github.com/groonga/groonga/commit/468023ca14429561014a63fa987a4f16f9977291 Message: ii: enable to split chunks in static indexing It is enabled if an environment variable GRN_INDEX_CHUNK_SPLIT_ENABLE is defined as follows: GRN_INDEX_CHUNK_SPLIT_ENABLE=yes groonga ... Note that the implementation still needs improvements. GitHub: #479 Modified files: lib/grn_ii.h lib/ii.c lib/index_column.c Modified: lib/grn_ii.h (+5 -0) =================================================================== --- lib/grn_ii.h 2016-03-23 14:54:50 +0900 (cbff79d) +++ lib/grn_ii.h 2016-03-23 19:46:05 +0900 (aeea6a1) @@ -171,6 +171,11 @@ void grn_ii_cursor_inspect(grn_ctx *ctx, grn_ii_cursor *c, grn_obj *buf); grn_rc grn_ii_truncate(grn_ctx *ctx, grn_ii *ii); grn_rc grn_ii_build(grn_ctx *ctx, grn_ii *ii, uint64_t sparsity); +typedef struct grn_ii_builder_options grn_ii_builder_options; + +grn_rc grn_ii_build2(grn_ctx *ctx, grn_ii *ii, + const grn_ii_builder_options *options); + #ifdef __cplusplus } #endif Modified: lib/ii.c (+2147 -0) =================================================================== --- lib/ii.c 2016-03-23 14:54:50 +0900 (8d4a9e5) +++ lib/ii.c 2016-03-23 19:46:05 +0900 (e4dc6e3) @@ -8848,3 +8848,2150 @@ grn_ii_build(grn_ctx *ctx, grn_ii *ii, uint64_t sparsity) } return ctx->rc; } + +/* + * ========================================================================== + * The following part provides constants, structures and functions for static + * indexing. + * ========================================================================== + */ + +#define GRN_II_BUILDER_BUFFER_CHUNK_SIZE (S_CHUNK >> 2) + +#define GRN_II_BUILDER_MAX_LEXICON_CACHE_SIZE (1 << 24) + +#define GRN_II_BUILDER_MIN_BASE_TERMS_SIZE 1 +#define GRN_II_BUILDER_MAX_BASE_TERMS_SIZE (1 << 20) + +#define GRN_II_BUILDER_MIN_BLOCK_THRESHOLD 1 +#define GRN_II_BUILDER_MAX_BLOCK_THRESHOLD (1 << 28) + +#define GRN_II_BUILDER_MIN_FILE_BUF_SIZE (1 << 12) +#define GRN_II_BUILDER_MAX_FILE_BUF_SIZE (1 << 30) + +#define GRN_II_BUILDER_MIN_BLOCK_BUF_SIZE (1 << 12) +#define GRN_II_BUILDER_MAX_BLOCK_BUF_SIZE (1 << 30) + +#define GRN_II_BUILDER_MIN_CHUNK_THRESHOLD 1 +#define GRN_II_BUILDER_MAX_CHUNK_THRESHOLD (1 << 28) + +#define GRN_II_BUILDER_MIN_BUFFER_MAX_N_TERMS 1 +#define GRN_II_BUILDER_MAX_BUFFER_MAX_N_TERMS \ + ((S_SEGMENT - sizeof(buffer_header)) / sizeof(buffer_term)) + +struct grn_ii_builder_options { + uint32_t lexicon_cache_size; /* Cache size of temporary lexicon */ + uint32_t base_terms_size; /* Base size of buffer for terms */ + /* A block is flushed if builder->n reaches this value. */ + uint32_t block_threshold; + uint32_t file_buf_size; /* Buffer size for buffered output */ + uint32_t block_buf_size; /* Buffer size for buffered input */ + /* A chunk is flushed if chunk->n reaches this value. */ + uint32_t chunk_threshold; + uint32_t buffer_max_n_terms; /* Maximum number of terms in each buffer */ +}; + +static const grn_ii_builder_options grn_ii_builder_default_options = { + 0x80000, /* lexicon_cache_size */ + 0x100, /* base_terms_size */ + 0x4000000, /* block_threshold */ + 0x10000, /* file_buf_size */ + 0x10000, /* block_buf_size */ + 0x1000, /* chunk_threshold */ + 0x3000, /* buffer_max_n_terms */ +}; + +/* grn_ii_builder_options_init fills options with the default options. */ +void +grn_ii_builder_options_init(grn_ii_builder_options *options) +{ + *options = grn_ii_builder_default_options; +} + +/* grn_ii_builder_options_fix fixes out-of-range options. */ +static void +grn_ii_builder_options_fix(grn_ii_builder_options *options) +{ + if (options->lexicon_cache_size > GRN_II_BUILDER_MAX_LEXICON_CACHE_SIZE) { + options->lexicon_cache_size = GRN_II_BUILDER_MAX_LEXICON_CACHE_SIZE; + } + + if (options->base_terms_size < GRN_II_BUILDER_MIN_BASE_TERMS_SIZE) { + options->base_terms_size = GRN_II_BUILDER_MIN_BASE_TERMS_SIZE; + } + if (options->base_terms_size > GRN_II_BUILDER_MAX_BASE_TERMS_SIZE) { + options->base_terms_size = GRN_II_BUILDER_MAX_BASE_TERMS_SIZE; + } + + if (options->block_threshold < GRN_II_BUILDER_MIN_BLOCK_THRESHOLD) { + options->block_threshold = GRN_II_BUILDER_MIN_BLOCK_THRESHOLD; + } + if (options->block_threshold > GRN_II_BUILDER_MAX_BLOCK_THRESHOLD) { + options->block_threshold = GRN_II_BUILDER_MAX_BLOCK_THRESHOLD; + } + + if (options->file_buf_size < GRN_II_BUILDER_MIN_FILE_BUF_SIZE) { + options->file_buf_size = GRN_II_BUILDER_MIN_FILE_BUF_SIZE; + } + if (options->file_buf_size > GRN_II_BUILDER_MAX_FILE_BUF_SIZE) { + options->file_buf_size = GRN_II_BUILDER_MAX_FILE_BUF_SIZE; + } + + if (options->block_buf_size < GRN_II_BUILDER_MIN_BLOCK_BUF_SIZE) { + options->block_buf_size = GRN_II_BUILDER_MIN_BLOCK_BUF_SIZE; + } + if (options->block_buf_size > GRN_II_BUILDER_MAX_BLOCK_BUF_SIZE) { + options->block_buf_size = GRN_II_BUILDER_MAX_BLOCK_BUF_SIZE; + } + + if (options->chunk_threshold < GRN_II_BUILDER_MIN_CHUNK_THRESHOLD) { + options->chunk_threshold = GRN_II_BUILDER_MIN_CHUNK_THRESHOLD; + } + if (options->chunk_threshold > GRN_II_BUILDER_MAX_CHUNK_THRESHOLD) { + options->chunk_threshold = GRN_II_BUILDER_MAX_CHUNK_THRESHOLD; + } + + if (options->buffer_max_n_terms < GRN_II_BUILDER_MIN_BUFFER_MAX_N_TERMS) { + options->buffer_max_n_terms = GRN_II_BUILDER_MIN_BUFFER_MAX_N_TERMS; + } + if (options->buffer_max_n_terms > GRN_II_BUILDER_MAX_BUFFER_MAX_N_TERMS) { + options->buffer_max_n_terms = GRN_II_BUILDER_MAX_BUFFER_MAX_N_TERMS; + } +} + +#define GRN_II_BUILDER_TERM_INPLACE_SIZE\ + (sizeof(grn_ii_builder_term) - (uintptr_t)&((grn_ii_builder_term *)0)->dummy) + +typedef struct { + grn_id rid; /* Last record ID */ + uint32_t sid; /* Last section ID */ + union { + uint32_t pos; /* Last position (GRN_OBJ_WITH_POSITION) */ + uint32_t freq; /* Frequency */ + }; + uint32_t offset; /* Buffer write offset */ + uint32_t size; /* Buffer size */ + uint32_t dummy; /* Padding */ + uint8_t *buf; /* Buffer (to be freed) */ +} grn_ii_builder_term; + +/* grn_ii_builder_term_is_inplace returns whether a term buffer is inplace. */ +inline static grn_bool +grn_ii_builder_term_is_inplace(grn_ii_builder_term *term) +{ + return term->size == GRN_II_BUILDER_TERM_INPLACE_SIZE; +} + +/* grn_ii_builder_term_get_buf returns a term buffer. */ +inline static uint8_t * +grn_ii_builder_term_get_buf(grn_ii_builder_term *term) +{ + if (grn_ii_builder_term_is_inplace(term)) { + return (uint8_t *)&term->dummy; + } else { + return term->buf; + } +} + +/* + * grn_ii_builder_term_init intiializes a term. Note that an initialized term + * must be finalized by grn_ii_builder_term_fin. + */ +static void +grn_ii_builder_term_init(grn_ctx *ctx, grn_ii_builder_term *term) +{ + term->rid = GRN_ID_NIL; + term->sid = 0; + term->pos = 0; + term->offset = 0; + term->size = GRN_II_BUILDER_TERM_INPLACE_SIZE; +} + +/* grn_ii_builder_term_fin finalizes a term. */ +static void +grn_ii_builder_term_fin(grn_ctx *ctx, grn_ii_builder_term *term) +{ + if (!grn_ii_builder_term_is_inplace(term)) { + GRN_FREE(term->buf); + } +} + +/* grn_ii_builder_term_reinit reinitializes a term. */ +static void +grn_ii_builder_term_reinit(grn_ctx *ctx, grn_ii_builder_term *term) +{ + grn_ii_builder_term_fin(ctx, term); + grn_ii_builder_term_init(ctx, term); +} + +/* grn_ii_builder_term_extend extends a term buffer. */ +static grn_rc +grn_ii_builder_term_extend(grn_ctx *ctx, grn_ii_builder_term *term) +{ + uint8_t *buf; + uint32_t size = term->size * 2; + if (grn_ii_builder_term_is_inplace(term)) { + buf = (uint8_t *)GRN_MALLOC(size); + if (!buf) { + ERR(GRN_NO_MEMORY_AVAILABLE, + "failed to allocate memory for term buffer: size = %u", size); + return ctx->rc; + } + memcpy(buf, &term->dummy, term->offset); + } else { + buf = (uint8_t *)GRN_REALLOC(term->buf, size); + if (!buf) { + ERR(GRN_NO_MEMORY_AVAILABLE, + "failed to reallocate memory for term buffer: size = %u", size); + return ctx->rc; + } + } + term->buf = buf; + term->size = size; + return GRN_SUCCESS; +} + +/* grn_ii_builder_term_append appends an integer to a term buffer. */ +inline static grn_rc +grn_ii_builder_term_append(grn_ctx *ctx, grn_ii_builder_term *term, + uint64_t value) +{ + uint8_t *p; + if (value < (uint64_t)1 << 5) { + if (term->offset + 1 > term->size) { + grn_rc rc = grn_ii_builder_term_extend(ctx, term); + if (rc != GRN_SUCCESS) { + return rc; + } + } + p = grn_ii_builder_term_get_buf(term) + term->offset; + p[0] = (uint8_t)value; + term->offset++; + return GRN_SUCCESS; + } else if (value < (uint64_t)1 << 13) { + if (term->offset + 2 > term->size) { + grn_rc rc = grn_ii_builder_term_extend(ctx, term); + if (rc != GRN_SUCCESS) { + return rc; + } + } + p = grn_ii_builder_term_get_buf(term) + term->offset; + p[0] = (uint8_t)((value & 0x1f) | (1 << 5)); + p[1] = (uint8_t)(value >> 5); + term->offset += 2; + return GRN_SUCCESS; + } else { + uint8_t i, n; + if (value < (uint64_t)1 << 21) { + n = 3; + } else if (value < (uint64_t)1 << 29) { + n = 4; + } else if (value < (uint64_t)1 << 37) { + n = 5; + } else if (value < (uint64_t)1 << 45) { + n = 6; + } else if (value < (uint64_t)1 << 53) { + n = 7; + } else { + n = 8; + } + if (term->offset + n > term->size) { + grn_rc rc = grn_ii_builder_term_extend(ctx, term); + if (rc != GRN_SUCCESS) { + return rc; + } + } + p = grn_ii_builder_term_get_buf(term) + term->offset; + p[0] = (uint8_t)(value & 0x1f) | ((n - 1) << 5); + value >>= 5; + for (i = 1; i < n; i++) { + p[i] = (uint8_t)value; + value >>= 8; + } + term->offset += n; + return GRN_SUCCESS; + } +} + +typedef struct { + uint64_t offset; /* File offset */ + uint32_t rest; /* Remaining size */ + uint8_t *buf; /* Buffer (to be freed) */ + uint8_t *cur; /* Current pointer */ + uint8_t *end; /* End pointer */ + uint32_t tid; /* Term ID */ +} grn_ii_builder_block; + +/* + * grn_ii_builder_block_init initializes a block. Note that an initialized + * block must be finalized by grn_ii_builder_block_fin. + */ +static void +grn_ii_builder_block_init(grn_ctx *ctx, grn_ii_builder_block *block) +{ + block->offset = 0; + block->rest = 0; + block->buf = NULL; + block->cur = NULL; + block->end = NULL; + block->tid = GRN_ID_NIL; +} + +/* grn_ii_builder_block_fin finalizes a block. */ +static void +grn_ii_builder_block_fin(grn_ctx *ctx, grn_ii_builder_block *block) +{ + if (block->buf) { + GRN_FREE(block->buf); + } +} + +/* + * grn_ii_builder_block_next reads the next integer. Note that this function + * returns GRN_END_OF_DATA if it reaches the end of a block. + */ +inline static grn_rc +grn_ii_builder_block_next(grn_ctx *ctx, grn_ii_builder_block *block, + uint64_t *value) +{ + uint8_t n; + if (block->cur == block->end) { + return GRN_END_OF_DATA; + } + n = (*block->cur >> 5) + 1; + if (n > block->end - block->cur) { + return GRN_END_OF_DATA; + } + *value = 0; + switch (n) { + case 8 : + *value |= (uint64_t)block->cur[7] << 53; + case 7 : + *value |= (uint64_t)block->cur[6] << 45; + case 6 : + *value |= (uint64_t)block->cur[5] << 37; + case 5 : + *value |= (uint64_t)block->cur[4] << 29; + case 4 : + *value |= (uint64_t)block->cur[3] << 21; + case 3 : + *value |= (uint64_t)block->cur[2] << 13; + case 2 : + *value |= (uint64_t)block->cur[1] << 5; + case 1 : + *value |= block->cur[0] & 0x1f; + break; + } + block->cur += n; + return GRN_SUCCESS; +} + +typedef struct { + grn_ii *ii; /* Inverted index */ + uint32_t buf_id; /* Buffer ID */ + uint32_t buf_seg_id; /* Buffer segment ID */ + buffer *buf; /* Buffer (to be unreferenced) */ + uint32_t chunk_id; /* Chunk ID */ + uint32_t chunk_seg_id; /* Chunk segment ID */ + uint8_t *chunk; /* Chunk (to be unreferenced) */ + uint32_t chunk_offset; /* Chunk write position */ +} grn_ii_builder_buffer; + +/* + * grn_ii_builder_buffer_init initializes a buffer. Note that a bufer must be + * finalized by grn_ii_builder_buffer_fin. + */ +static void +grn_ii_builder_buffer_init(grn_ctx *ctx, grn_ii_builder_buffer *buf, + grn_ii *ii) +{ + buf->ii = ii; + buf->buf_id = 0; + buf->buf_seg_id = 0; + buf->buf = NULL; + buf->chunk_id = 0; + buf->chunk_seg_id = 0; + buf->chunk = NULL; + buf->chunk_offset = 0; +} + +/* grn_ii_builder_buffer_fin finalizes a buffer. */ +static void +grn_ii_builder_buffer_fin(grn_ctx *ctx, grn_ii_builder_buffer *buf) +{ + if (buf->buf) { + GRN_IO_SEG_UNREF(buf->ii->seg, buf->buf_seg_id); + } + if (buf->chunk) { + GRN_IO_SEG_UNREF(buf->ii->chunk, buf->chunk_seg_id); + } +} + +/* grn_ii_builder_buffer_is_assigned returns whether a buffer is assigned. */ +static grn_bool +grn_ii_builder_buffer_is_assigned(grn_ctx *ctx, grn_ii_builder_buffer *buf) +{ + return buf->buf != NULL; +} + +/* grn_ii_builder_buffer_assign assigns a buffer. */ +static grn_rc +grn_ii_builder_buffer_assign(grn_ctx *ctx, grn_ii_builder_buffer *buf) +{ + void *seg; + grn_rc rc; + + /* Create a buffer. */ + buf->buf_id = NOT_ASSIGNED; + rc = buffer_segment_new(ctx, buf->ii, &buf->buf_id); + if (rc != GRN_SUCCESS) { + if (ctx->rc != GRN_SUCCESS) { + ERR(rc, "failed to allocate segment for buffer"); + } + return rc; + } + buf->buf_seg_id = buf->ii->header->binfo[buf->buf_id]; + GRN_IO_SEG_REF(buf->ii->seg, buf->buf_seg_id, seg); + if (!seg) { + if (ctx->rc == GRN_SUCCESS) { + ERR(GRN_UNKNOWN_ERROR, + "failed access buffer segment: buf_id = %u, seg_id = %u", + buf->buf_id, buf->buf_seg_id); + } + return ctx->rc; + } + buf->buf = (buffer *)seg; + + /* Create a chunk. */ + rc = chunk_new(ctx, buf->ii, &buf->chunk_id, + GRN_II_BUILDER_BUFFER_CHUNK_SIZE); + if (rc != GRN_SUCCESS) { + return rc; + } + buf->chunk_seg_id = buf->chunk_id >> GRN_II_N_CHUNK_VARIATION; + GRN_IO_SEG_REF(buf->ii->chunk, buf->chunk_seg_id, seg); + if (!seg) { + if (ctx->rc == GRN_SUCCESS) { + ERR(GRN_UNKNOWN_ERROR, + "failed access chunk segment: chunk_id = %u, seg_id = %u", + buf->chunk_id, buf->chunk_seg_id); + } + return ctx->rc; + } + buf->chunk = (uint8_t *)seg; + buf->chunk += (buf->chunk_id & ((1 << GRN_II_N_CHUNK_VARIATION) - 1)) << + GRN_II_W_LEAST_CHUNK; + buf->chunk_offset = 0; + + buf->buf->header.chunk = buf->chunk_id; + buf->buf->header.chunk_size = GRN_II_BUILDER_BUFFER_CHUNK_SIZE; + buf->buf->header.buffer_free = S_SEGMENT - sizeof(buffer_header); + buf->buf->header.nterms = 0; + buf->buf->header.nterms_void = 0; + buf->ii->header->total_chunk_size += GRN_II_BUILDER_BUFFER_CHUNK_SIZE; + return GRN_SUCCESS; +} + +/* grn_ii_builder_buffer_flush flushes a buffer. */ +static grn_rc +grn_ii_builder_buffer_flush(grn_ctx *ctx, grn_ii_builder_buffer *buf) +{ + grn_ii *ii; + + buf->buf->header.buffer_free = S_SEGMENT - sizeof(buffer_header) - + buf->buf->header.nterms * sizeof(buffer_term); + GRN_LOG(ctx, GRN_LOG_DEBUG, + "n_terms = %u, chunk_offset = %u, chunk_size = %u, total = %" + GRN_FMT_INT64U "KB", + buf->buf->header.nterms, + buf->chunk_offset, + buf->buf->header.chunk_size, + buf->ii->header->total_chunk_size >> 10); + + ii = buf->ii; + grn_ii_builder_buffer_fin(ctx, buf); + grn_ii_builder_buffer_init(ctx, buf, ii); + return GRN_SUCCESS; +} + +typedef struct { + grn_id tid; /* Term ID */ + uint32_t n; /* Number of integers in buffers */ + grn_id rid; /* Record ID */ + uint32_t rid_gap; /* Record ID gap */ + uint64_t pos_sum; /* Sum of position gaps */ + + uint32_t offset; /* Write offset */ + uint32_t size; /* Buffer size */ + grn_id *rid_buf; /* Buffer for record IDs (to be freed) */ + uint32_t *sid_buf; /* Buffer for section IDs (to be freed) */ + uint32_t *freq_buf; /* Buffer for frequencies (to be freed) */ + uint32_t *weight_buf; /* Buffer for weights (to be freed) */ + + uint32_t pos_offset; /* Write offset of pos_buf */ + uint32_t pos_size; /* Buffer size of pos_buf */ + uint32_t *pos_buf; /* Buffer for positions (to be freed) */ + + size_t enc_offset; /* Write offset of enc_buf */ + size_t enc_size; /* Buffer size of enc_buf */ + uint8_t *enc_buf; /* Buffer for encoded data (to be freed) */ +} grn_ii_builder_chunk; + +/* + * grn_ii_builder_chunk_init initializes a chunk. Note that an initialized + * chunk must be finalized by grn_ii_builder_chunk_fin. + */ +static void +grn_ii_builder_chunk_init(grn_ctx *ctx, grn_ii_builder_chunk *chunk) +{ + chunk->tid = GRN_ID_NIL; + chunk->n = 0; + chunk->rid = GRN_ID_NIL; + chunk->rid_gap = 0; + chunk->pos_sum = 0; + + chunk->offset = 0; + chunk->size = 0; + chunk->rid_buf = NULL; + chunk->sid_buf = NULL; + chunk->freq_buf = NULL; + chunk->weight_buf = NULL; + + chunk->pos_offset = 0; + chunk->pos_size = 0; + chunk->pos_buf = NULL; + + chunk->enc_offset = 0; + chunk->enc_size = 0; + chunk->enc_buf = NULL; +} + +/* grn_ii_builder_chunk_fin finalizes a chunk. */ +static void +grn_ii_builder_chunk_fin(grn_ctx *ctx, grn_ii_builder_chunk *chunk) +{ + if (chunk->enc_buf) { + GRN_FREE(chunk->enc_buf); + } + if (chunk->pos_buf) { + GRN_FREE(chunk->pos_buf); + } + if (chunk->weight_buf) { + GRN_FREE(chunk->weight_buf); + } + if (chunk->freq_buf) { + GRN_FREE(chunk->freq_buf); + } + if (chunk->sid_buf) { + GRN_FREE(chunk->sid_buf); + } + if (chunk->rid_buf) { + GRN_FREE(chunk->rid_buf); + } +} + +/* + * grn_ii_builder_chunk_clear clears stats except rid and buffers except + * enc_buf. + */ +static void +grn_ii_builder_chunk_clear(grn_ctx *ctx, grn_ii_builder_chunk *chunk) +{ + chunk->n = 0; + chunk->rid_gap = 0; + chunk->pos_sum = 0; + chunk->offset = 0; + chunk->pos_offset = 0; +} + +/* + * grn_ii_builder_chunk_extend_bufs extends buffers except pos_buf and enc_buf. + */ +static grn_rc +grn_ii_builder_chunk_extend_bufs(grn_ctx *ctx, grn_ii_builder_chunk *chunk, + uint32_t ii_flags) +{ + uint32_t *buf, size = chunk->size ? chunk->size * 2 : 1; + size_t n_bytes = size * sizeof(uint32_t); + + buf = (uint32_t *)GRN_REALLOC(chunk->rid_buf, n_bytes); + if (!buf) { + ERR(GRN_NO_MEMORY_AVAILABLE, + "failed to allocate memory for record IDs: n_bytes = %" GRN_FMT_SIZE, + n_bytes); + return ctx->rc; + } + chunk->rid_buf = buf; + + if (ii_flags & GRN_OBJ_WITH_SECTION) { + buf = (uint32_t *)GRN_REALLOC(chunk->sid_buf, n_bytes); + if (!buf) { + ERR(GRN_NO_MEMORY_AVAILABLE, + "failed to allocate memory for section IDs:" + " n_bytes = %" GRN_FMT_SIZE, + n_bytes); + return ctx->rc; + } + chunk->sid_buf = buf; + } + + buf = (uint32_t *)GRN_REALLOC(chunk->freq_buf, n_bytes); + if (!buf) { + ERR(GRN_NO_MEMORY_AVAILABLE, + "failed to allocate memory for frequencies: n_bytes = %" GRN_FMT_SIZE, + n_bytes); + return ctx->rc; + } + chunk->freq_buf = buf; + + if (ii_flags & GRN_OBJ_WITH_WEIGHT) { + buf = (uint32_t *)GRN_REALLOC(chunk->weight_buf, n_bytes); + if (!buf) { + ERR(GRN_NO_MEMORY_AVAILABLE, + "failed to allocate memory for weights: n_bytes = %" GRN_FMT_SIZE, + n_bytes); + return ctx->rc; + } + chunk->weight_buf = buf; + } + + chunk->size = size; + return GRN_SUCCESS; +} + +/* grn_ii_builder_chunk_extend_pos_buf extends pos_buf. */ +static grn_rc +grn_ii_builder_chunk_extend_pos_buf(grn_ctx *ctx, grn_ii_builder_chunk *chunk) +{ + uint32_t *buf, size = chunk->pos_size ? chunk->pos_size * 2 : 1; + size_t n_bytes = size * sizeof(uint32_t); + buf = (uint32_t *)GRN_REALLOC(chunk->pos_buf, n_bytes); + if (!buf) { + ERR(GRN_NO_MEMORY_AVAILABLE, + "failed to allocate memory for positions: n_bytes = %" GRN_FMT_SIZE, + n_bytes); + return ctx->rc; + } + chunk->pos_buf = buf; + chunk->pos_size = size; + return GRN_SUCCESS; +} + +/* + * grn_ii_builder_chunk_reserve_enc_buf estimates a size that is enough to + * store encoded data and allocates memory to enc_buf. + */ +static grn_rc +grn_ii_builder_chunk_reserve_enc_buf(grn_ctx *ctx, grn_ii_builder_chunk *chunk, + uint32_t n_cinfos) +{ + size_t rich_size = (chunk->n + 4) * sizeof(uint32_t) + + n_cinfos * sizeof(chunk_info); + if (chunk->enc_size < rich_size) { + size_t size = chunk->enc_size ? chunk->enc_size * 2 : 1; + uint8_t *buf; + while (size < rich_size) { + size *= 2; + } + buf = GRN_REALLOC(chunk->enc_buf, size); + if (!buf) { + ERR(GRN_NO_MEMORY_AVAILABLE, + "failed to allocate memory for encoding: size = %" GRN_FMT_SIZE, + size); + return ctx->rc; + } + chunk->enc_buf = buf; + chunk->enc_size = size; + } + chunk->enc_offset = 0; + return GRN_SUCCESS; +} + +/* grn_ii_builder_chunk_encode encodes a chunk buffer. */ +static void +grn_ii_builder_chunk_encode_buf(grn_ctx *ctx, grn_ii_builder_chunk *chunk, + uint32_t *values, uint32_t n_values, + grn_bool use_p_enc) +{ + uint8_t *p = chunk->enc_buf + chunk->enc_offset; + uint32_t i; + if (use_p_enc) { + uint8_t freq[33]; + uint32_t buf[UNIT_SIZE]; + while (n_values >= UNIT_SIZE) { + memset(freq, 0, 33); + for (i = 0; i < UNIT_SIZE; i++) { + buf[i] = values[i]; + if (buf[i]) { + uint32_t w; + GRN_BIT_SCAN_REV(buf[i], w); + freq[w + 1]++; + } else { + freq[0]++; + } + } + p = pack(buf, UNIT_SIZE, freq, p); + values += UNIT_SIZE; + n_values -= UNIT_SIZE; + } + if (n_values) { + memset(freq, 0, 33); + for (i = 0; i < n_values; i++) { + buf[i] = values[i]; + if (buf[i]) { + uint32_t w; + GRN_BIT_SCAN_REV(buf[i], w); + freq[w + 1]++; + } else { + freq[0]++; + } + } + p = pack(buf, n_values, freq, p); + } + } else { + for (i = 0; i < n_values; i++) { + GRN_B_ENC(values[i], p); + } + } + chunk->enc_offset = p - chunk->enc_buf; +} + +/* grn_ii_builder_chunk_encode encodes a chunk. */ +static grn_rc +grn_ii_builder_chunk_encode(grn_ctx *ctx, grn_ii_builder_chunk *chunk, + chunk_info *cinfos, uint32_t n_cinfos) +{ + grn_rc rc; + uint8_t *p; + uint8_t shift = 0, use_p_enc_flags = 0; + uint8_t rid_use_p_enc, rest_use_p_enc, pos_use_p_enc = 0; + + /* Choose an encoding. */ + rid_use_p_enc = chunk->offset >= 16 && chunk->offset > (chunk->rid >> 8); + use_p_enc_flags |= rid_use_p_enc << shift++; + rest_use_p_enc = chunk->offset >= 3; + if (chunk->sid_buf) { + use_p_enc_flags |= rest_use_p_enc << shift++; + } + use_p_enc_flags |= rest_use_p_enc << shift++; + if (chunk->weight_buf) { + use_p_enc_flags |= rest_use_p_enc << shift++; + } + if (chunk->pos_buf) { + pos_use_p_enc = chunk->pos_offset >= 32 && + chunk->pos_offset > (chunk->pos_sum >> 13); + use_p_enc_flags |= pos_use_p_enc << shift++; + } + + rc = grn_ii_builder_chunk_reserve_enc_buf(ctx, chunk, n_cinfos); + if (rc != GRN_SUCCESS) { + return rc; + } + + /* Encode a header. */ + p = chunk->enc_buf; + if (n_cinfos) { + uint32_t i; + GRN_B_ENC(n_cinfos, p); + for (i = 0; i < n_cinfos; i++) { + GRN_B_ENC(cinfos[i].segno, p); + GRN_B_ENC(cinfos[i].size, p); + GRN_B_ENC(cinfos[i].dgap, p); + } + } + if (use_p_enc_flags) { + GRN_B_ENC(use_p_enc_flags << 1, p); + GRN_B_ENC(chunk->offset, p); + if (chunk->pos_buf) { + GRN_B_ENC(chunk->pos_offset - chunk->offset, p); + } + } else { + GRN_B_ENC((chunk->offset << 1) | 1, p); + } + chunk->enc_offset = p - chunk->enc_buf; + + /* Encode a body. */ + grn_ii_builder_chunk_encode_buf(ctx, chunk, chunk->rid_buf, chunk->offset, + rid_use_p_enc); + if (chunk->sid_buf) { + grn_ii_builder_chunk_encode_buf(ctx, chunk, chunk->sid_buf, chunk->offset, + rest_use_p_enc); + } + grn_ii_builder_chunk_encode_buf(ctx, chunk, chunk->freq_buf, chunk->offset, + rest_use_p_enc); + if (chunk->weight_buf) { + grn_ii_builder_chunk_encode_buf(ctx, chunk, chunk->weight_buf, + chunk->offset, rest_use_p_enc); + } + if (chunk->pos_buf) { + grn_ii_builder_chunk_encode_buf(ctx, chunk, chunk->pos_buf, + chunk->pos_offset, pos_use_p_enc); + } + + return GRN_SUCCESS; +} + +typedef struct { + grn_ii *ii; /* Building inverted index */ + grn_ii_builder_options options; /* Options */ + + grn_obj *src_table; /* Source table */ + grn_obj **srcs; /* Source columns (to be freed) */ + uint32_t n_srcs; /* Number of source columns */ + uint8_t sid_bits; /* Number of bits for section ID */ + uint64_t sid_mask; /* Mask bits for section ID */ + + grn_obj *lexicon; /* Block lexicon (to be closed) */ + + uint32_t n; /* Number of integers appended to the current block */ + grn_id rid; /* Record ID */ + uint32_t sid; /* Section ID */ + uint32_t pos; /* Position */ + + grn_ii_builder_term *terms; /* Terms (to be freed) */ + uint32_t n_terms; /* Number of distinct terms */ + uint32_t max_n_terms; /* Maximum number of distinct terms */ + uint32_t terms_size; /* Buffer size of terms */ + + /* A temporary file to save blocks. */ + char path[PATH_MAX]; /* File path */ + int fd; /* File descriptor (to be closed) */ + uint8_t *file_buf; /* File buffer for buffered output (to be freed) */ + uint32_t file_buf_offset; /* File buffer write offset */ + + grn_ii_builder_block *blocks; /* Blocks (to be freed) */ + uint32_t n_blocks; /* Number of blocks */ + uint32_t blocks_size; /* Buffer size of blocks */ + + grn_ii_builder_buffer buf; /* Buffer (to be finalized) */ + grn_ii_builder_chunk chunk; /* Chunk (to be finalized) */ + + uint32_t df; /* Document frequency (number of sections) */ + chunk_info *cinfos; /* Chunk headers (to be freed) */ + uint32_t n_cinfos; /* Number of chunks */ + uint32_t cinfos_size; /* Size of cinfos */ +} grn_ii_builder; + +/* + * grn_ii_builder_init initializes a builder. Note that an initialized builder + * must be finalized by grn_ii_builder_fin. + */ +static grn_rc +grn_ii_builder_init(grn_ctx *ctx, grn_ii_builder *builder, + grn_ii *ii, const grn_ii_builder_options *options) +{ + builder->ii = ii; + builder->options = *options; + grn_ii_builder_options_fix(&builder->options); + + builder->src_table = NULL; + builder->srcs = NULL; + builder->n_srcs = 0; + builder->sid_bits = 0; + builder->sid_mask = 0; + + builder->lexicon = NULL; + + builder->n = 0; + builder->rid = GRN_ID_NIL; + builder->sid = 0; + builder->pos = 0; + + builder->terms = NULL; + builder->n_terms = 0; + builder->max_n_terms = 0; + builder->terms_size = 0; + + builder->path[0] = '\0'; + builder->fd = -1; + builder->file_buf = NULL; + builder->file_buf_offset = 0; + + builder->blocks = NULL; + builder->n_blocks = 0; + builder->blocks_size = 0; + + grn_ii_builder_buffer_init(ctx, &builder->buf, ii); + grn_ii_builder_chunk_init(ctx, &builder->chunk); + + builder->df = 0; + builder->cinfos = NULL; + builder->n_cinfos = 0; + builder->cinfos_size = 0; + + return GRN_SUCCESS; +} + +/* grn_ii_builder_fin_terms finalizes terms. */ +static void +grn_ii_builder_fin_terms(grn_ctx *ctx, grn_ii_builder *builder) +{ + if (builder->terms) { + uint32_t i; + for (i = 0; i < builder->max_n_terms; i++) { + grn_ii_builder_term_fin(ctx, &builder->terms[i]); + } + GRN_FREE(builder->terms); + + /* To avoid double finalization. */ + builder->terms = NULL; + } +} + +/* grn_ii_builder_fin finalizes a builder. */ +static grn_rc +grn_ii_builder_fin(grn_ctx *ctx, grn_ii_builder *builder) +{ + if (builder->cinfos) { + GRN_FREE(builder->cinfos); + } + grn_ii_builder_chunk_fin(ctx, &builder->chunk); + grn_ii_builder_buffer_fin(ctx, &builder->buf); + if (builder->blocks) { + uint32_t i; + for (i = 0; i < builder->n_blocks; i++) { + grn_ii_builder_block_fin(ctx, &builder->blocks[i]); + } + GRN_FREE(builder->blocks); + } + if (builder->file_buf) { + GRN_FREE(builder->file_buf); + } + if (builder->fd != -1) { + grn_close(builder->fd); + grn_unlink(builder->path); + } + grn_ii_builder_fin_terms(ctx, builder); + if (builder->lexicon) { + grn_obj_close(ctx, builder->lexicon); + } + if (builder->srcs) { + GRN_FREE(builder->srcs); + } +{ + grn_obj_flags flags; + grn_table_get_info(ctx, builder->ii->lexicon, &flags, NULL, + NULL, NULL, NULL); + grn_pat_cache_disable(ctx, (grn_pat *)builder->ii->lexicon); +} + return GRN_SUCCESS; +} + +/* + * grn_ii_builder_open creates a builder. Note that a builder must be closed by + * grn_ii_builder_close. + */ +static grn_rc +grn_ii_builder_open(grn_ctx *ctx, grn_ii *ii, + const grn_ii_builder_options *options, + grn_ii_builder **builder) +{ + grn_rc rc; + grn_ii_builder *new_builder = GRN_MALLOCN(grn_ii_builder, 1); + if (!new_builder) { + return GRN_NO_MEMORY_AVAILABLE; + } + if (!options) { + options = &grn_ii_builder_default_options; + } + rc = grn_ii_builder_init(ctx, new_builder, ii, options); + if (rc != GRN_SUCCESS) { + GRN_FREE(new_builder); + return rc; + } + *builder = new_builder; + return GRN_SUCCESS; +} + +/* grn_ii_builder_close closes a builder. */ +static grn_rc +grn_ii_builder_close(grn_ctx *ctx, grn_ii_builder *builder) +{ + grn_rc rc; + if (!builder) { + ERR(GRN_INVALID_ARGUMENT, "builder is null"); + return ctx->rc; + } + rc = grn_ii_builder_fin(ctx, builder); + GRN_FREE(builder); + return rc; +} + +/* grn_ii_builder_create_lexicon creates a block lexicon. */ +static grn_rc +grn_ii_builder_create_lexicon(grn_ctx *ctx, grn_ii_builder *builder) +{ + grn_obj_flags flags; + grn_obj *domain = grn_ctx_at(ctx, builder->ii->lexicon->header.domain); + grn_obj *range = grn_ctx_at(ctx, DB_OBJ(builder->ii->lexicon)->range); + grn_obj *tokenizer, *normalizer, *token_filters; + grn_rc rc = grn_table_get_info(ctx, builder->ii->lexicon, &flags, NULL, + &tokenizer, &normalizer, &token_filters); + if (rc != GRN_SUCCESS) { + return rc; + } + flags &= ~GRN_OBJ_PERSISTENT; + builder->lexicon = grn_table_create(ctx, NULL, 0, NULL, + flags, domain, range); + if (!builder->lexicon) { + if (ctx->rc == GRN_SUCCESS) { + ERR(GRN_UNKNOWN_ERROR, "[index] failed to create a block lexicon"); + } + return ctx->rc; + } + rc = grn_obj_set_info(ctx, builder->lexicon, + GRN_INFO_DEFAULT_TOKENIZER, tokenizer); + if (rc == GRN_SUCCESS) { + rc = grn_obj_set_info(ctx, builder->lexicon, + GRN_INFO_NORMALIZER, normalizer); + if (rc == GRN_SUCCESS) { + rc = grn_obj_set_info(ctx, builder->lexicon, + GRN_INFO_TOKEN_FILTERS, token_filters); + } + } + if (rc != GRN_SUCCESS) { + return rc; + } + if ((flags & GRN_OBJ_TABLE_TYPE_MASK) == GRN_OBJ_TABLE_PAT_KEY) { + if (builder->options.lexicon_cache_size) { + rc = grn_pat_cache_enable(ctx, (grn_pat *)builder->lexicon, + builder->options.lexicon_cache_size); + if (rc != GRN_SUCCESS) { + return rc; + } + rc = grn_pat_cache_enable(ctx, (grn_pat *)builder->ii->lexicon, + builder->options.lexicon_cache_size); + if (rc != GRN_SUCCESS) { + return rc; + } + } + } + return GRN_SUCCESS; +} + +/* + * grn_ii_builder_extend_terms extends a buffer for terms in order to make + * terms[n_terms - 1] available. + */ +static grn_rc +grn_ii_builder_extend_terms(grn_ctx *ctx, grn_ii_builder *builder, + uint32_t n_terms) +{ + if (n_terms <= builder->n_terms) { + return GRN_SUCCESS; + } + + if (n_terms > builder->max_n_terms) { + uint32_t i; + if (n_terms > builder->terms_size) { + /* Resize builder->terms for new terms. */ + size_t n_bytes; + uint32_t terms_size = builder->options.base_terms_size; + grn_ii_builder_term *terms; + while (terms_size < n_terms) { + terms_size *= 2; + } + n_bytes = terms_size * sizeof(grn_ii_builder_term); + terms = (grn_ii_builder_term *)GRN_REALLOC(builder->terms, n_bytes); + if (!terms) { + ERR(GRN_NO_MEMORY_AVAILABLE, + "failed to allocate memory for terms: n_bytes = %" GRN_FMT_SIZE, + n_bytes); + return ctx->rc; + } + builder->terms = terms; + builder->terms_size = terms_size; + } + /* Initialize new terms. */ + for (i = builder->max_n_terms; i < n_terms; i++) { + grn_ii_builder_term_init(ctx, &builder->terms[i]); + } + builder->max_n_terms = n_terms; + } + + builder->n = n_terms - builder->n_terms; + builder->n_terms = n_terms; + return GRN_SUCCESS; +} + +/* grn_ii_builder_get_term gets a term associated with tid. */ +inline static grn_rc +grn_ii_builder_get_term(grn_ctx *ctx, grn_ii_builder *builder, grn_id tid, + grn_ii_builder_term **term) +{ + uint32_t n_terms = tid; + if (n_terms > builder->n_terms) { + grn_rc rc = grn_ii_builder_extend_terms(ctx, builder, n_terms); + if (rc != GRN_SUCCESS) { + return rc; + } + } + *term = &builder->terms[tid - 1]; + return GRN_SUCCESS; +} + +/* grn_ii_builder_flush_file_buf flushes buffered data as a block. */ +static grn_rc +grn_ii_builder_flush_file_buf(grn_ctx *ctx, grn_ii_builder *builder) +{ + if (builder->file_buf_offset) { + ssize_t size = grn_write(builder->fd, builder->file_buf, + builder->file_buf_offset); + if ((uint64_t)size != builder->file_buf_offset) { + SERR("failed to write data: expected = %u, actual = %" GRN_FMT_INT64D, + builder->file_buf_offset, (int64_t)size); + } + builder->file_buf_offset = 0; + } + return GRN_SUCCESS; +} + +/* grn_ii_builder_flush_term flushes a term and clears it */ +static grn_rc +grn_ii_builder_flush_term(grn_ctx *ctx, grn_ii_builder *builder, + grn_ii_builder_term *term) +{ + grn_rc rc; + uint8_t *term_buf; + + /* Append sentinels. */ + if (term->rid != GRN_ID_NIL) { + if (builder->ii->header->flags & GRN_OBJ_WITH_POSITION) { + rc = grn_ii_builder_term_append(ctx, term, 0); + } else { + rc = grn_ii_builder_term_append(ctx, term, term->freq); + } + if (rc != GRN_SUCCESS) { + return rc; + } + } + rc = grn_ii_builder_term_append(ctx, term, 0); + if (rc != GRN_SUCCESS) { + return rc; + } + + { + /* Put the global term ID. */ + int key_size; + char key[GRN_TABLE_MAX_KEY_SIZE]; + uint8_t *p; + uint32_t rest, value; + grn_rc rc; + grn_id local_tid = term - builder->terms + 1, global_tid; + key_size = grn_table_get_key(ctx, builder->lexicon, local_tid, + key, GRN_TABLE_MAX_KEY_SIZE); + if (!key_size) { + if (ctx->rc == GRN_SUCCESS) { + ERR(GRN_UNKNOWN_ERROR, "failed to get key: tid = %u", local_tid); + } + return ctx->rc; + } + global_tid = grn_table_add(ctx, builder->ii->lexicon, key, key_size, NULL); + if (global_tid == GRN_ID_NIL) { + if (ctx->rc == GRN_SUCCESS) { + ERR(GRN_UNKNOWN_ERROR, + "failed to get global term ID: tid = %u, key = \"%.*s\"", + local_tid, key_size, key); + } + return ctx->rc; + } + + rest = builder->options.file_buf_size - builder->file_buf_offset; + if (rest < 10) { + rc = grn_ii_builder_flush_file_buf(ctx, builder); + if (rc != GRN_SUCCESS) { + return rc; + } + } + value = global_tid; + p = builder->file_buf + builder->file_buf_offset; + if (value < 1U << 5) { + p[0] = (uint8_t)value; + builder->file_buf_offset++; + } else if (value < 1U << 13) { + p[0] = (uint8_t)((value & 0x1f) | (1 << 5)); + p[1] = (uint8_t)(value >> 5); + builder->file_buf_offset += 2; + } else { + uint8_t i, n; + if (value < 1U << 21) { + n = 3; + } else if (value < 1U << 29) { + n = 4; + } else { + n = 5; + } + p[0] = (uint8_t)(value & 0x1f) | ((n - 1) << 5); + value >>= 5; + for (i = 1; i < n; i++) { + p[i] = (uint8_t)value; + value >>= 8; + } + builder->file_buf_offset += n; + } + } + + /* Flush a term buffer. */ + term_buf = grn_ii_builder_term_get_buf(term); + if (term->offset > builder->options.file_buf_size) { + ssize_t size; + rc = grn_ii_builder_flush_file_buf(ctx, builder); + if (rc != GRN_SUCCESS) { + return rc; + } + size = grn_write(builder->fd, term_buf, term->offset); + if ((uint64_t)size != term->offset) { + SERR("failed to write data: expected = %u, actual = %" GRN_FMT_INT64D, + term->offset, (int64_t)size); + } + } else { + uint32_t rest = builder->options.file_buf_size - builder->file_buf_offset; + if (term->offset <= rest) { + grn_memcpy(builder->file_buf + builder->file_buf_offset, + term_buf, term->offset); + builder->file_buf_offset += term->offset; + } else { + grn_memcpy(builder->file_buf + builder->file_buf_offset, + term_buf, rest); + builder->file_buf_offset += rest; + rc = grn_ii_builder_flush_file_buf(ctx, builder); + if (rc != GRN_SUCCESS) { + return rc; + } + builder->file_buf_offset = term->offset - rest; + grn_memcpy(builder->file_buf, term_buf + rest, builder->file_buf_offset); + } + } + grn_ii_builder_term_reinit(ctx, term); + return GRN_SUCCESS; +} + +/* + * grn_ii_builder_create_file creates a temporary file and allocates memory for + * buffered output. + */ +static grn_rc +grn_ii_builder_create_file(grn_ctx *ctx, grn_ii_builder *builder) +{ + grn_snprintf(builder->path, PATH_MAX, PATH_MAX, + "%sXXXXXX", grn_io_path(builder->ii->seg)); + builder->fd = grn_mkstemp(builder->path); + if (builder->fd == -1) { + SERR("failed to create a temporary file: path = \"%s\"", + builder->path); + return ctx->rc; + } + builder->file_buf = (uint8_t *)GRN_MALLOC(builder->options.file_buf_size); + if (!builder->file_buf) { + ERR(GRN_NO_MEMORY_AVAILABLE, + "failed to allocate memory for buffered output: size = %u", + builder->options.file_buf_size); + return ctx->rc; + } + return GRN_SUCCESS; +} + +/* grn_ii_builder_register_block registers a block. */ +static grn_rc +grn_ii_builder_register_block(grn_ctx *ctx, grn_ii_builder *builder) +{ + grn_ii_builder_block *block; + uint64_t file_offset = grn_lseek(builder->fd, 0, SEEK_CUR); + if (file_offset == (uint64_t)-1) { + SERR("failed to get file offset"); + return ctx->rc; + } + if (builder->n_blocks >= builder->blocks_size) { + size_t n_bytes; + uint32_t blocks_size = 1; + grn_ii_builder_block *blocks; + while (blocks_size <= builder->n_blocks) { + blocks_size *= 2; + } + n_bytes = blocks_size * sizeof(grn_ii_builder_block); + blocks = (grn_ii_builder_block *)GRN_REALLOC(builder->blocks, n_bytes); + if (!blocks) { + ERR(GRN_NO_MEMORY_AVAILABLE, + "failed to allocate memory for block: n_bytes = %" GRN_FMT_SIZE, + n_bytes); + return ctx->rc; + } + builder->blocks = blocks; + builder->blocks_size = blocks_size; + } + block = &builder->blocks[builder->n_blocks]; + grn_ii_builder_block_init(ctx, block); + if (!builder->n_blocks) { + block->offset = 0; + } else { + grn_ii_builder_block *prev_block = &builder->blocks[builder->n_blocks - 1]; + block->offset = prev_block->offset + prev_block->rest; + } + block->rest = (uint32_t)(file_offset - block->offset); + builder->n_blocks++; + return GRN_SUCCESS; +} + +/* grn_ii_builder_flush_block flushes a block to a temporary file. */ +static grn_rc +grn_ii_builder_flush_block(grn_ctx *ctx, grn_ii_builder *builder) +{ + grn_rc rc; + grn_table_cursor *cursor; + + if (!builder->n) { + /* Do nothing if there are no output data. */ + return GRN_SUCCESS; + } + if (builder->fd == -1) { + rc = grn_ii_builder_create_file(ctx, builder); + if (rc != GRN_SUCCESS) { + return rc; + } + } + + /* Flush terms into a temporary file. */ + cursor = grn_table_cursor_open(ctx, builder->lexicon, + NULL, 0, NULL, 0, 0, -1, GRN_CURSOR_BY_KEY); + for (;;) { + grn_id tid = grn_table_cursor_next(ctx, cursor); + if (tid == GRN_ID_NIL) { + break; + } + rc = grn_ii_builder_flush_term(ctx, builder, &builder->terms[tid - 1]); + if (rc != GRN_SUCCESS) { + return rc; + } + } + rc = grn_ii_builder_flush_file_buf(ctx, builder); + if (rc != GRN_SUCCESS) { + return rc; + } + + /* Register a block and clear the current data. */ + rc = grn_ii_builder_register_block(ctx, builder); + if (rc != GRN_SUCCESS) { + return rc; + } + rc = grn_table_truncate(ctx, builder->lexicon); + if (rc != GRN_SUCCESS) { + return rc; + } + builder->rid = GRN_ID_NIL; + builder->n_terms = 0; + builder->n = 0; + return GRN_SUCCESS; +} + +/* grn_ii_builder_append_token appends a token. */ +static grn_rc +grn_ii_builder_append_token(grn_ctx *ctx, grn_ii_builder *builder, + grn_id rid, uint32_t sid, uint32_t weight, + grn_id tid, uint32_t pos) +{ + grn_rc rc; + uint32_t ii_flags = builder->ii->header->flags; + grn_ii_builder_term *term; + rc = grn_ii_builder_get_term(ctx, builder, tid, &term); + if (rc != GRN_SUCCESS) { + return rc; + } + if (rid != term->rid || sid != term->sid) { + uint64_t rsid; + if (term->rid != GRN_ID_NIL) { + if (ii_flags & GRN_OBJ_WITH_POSITION) { + /* Append the end of positions. */ + rc = grn_ii_builder_term_append(ctx, term, 0); + if (rc != GRN_SUCCESS) { + return rc; + } + builder->n++; + } else { + /* Append a frequency if positions are not available. */ + rc = grn_ii_builder_term_append(ctx, term, term->freq); + if (rc != GRN_SUCCESS) { + return rc; + } + builder->n++; + } + } + rsid = ((uint64_t)(rid - term->rid) << builder->sid_bits) | (sid - 1); + rc = grn_ii_builder_term_append(ctx, term, rsid); + if (rc != GRN_SUCCESS) { + return rc; + } + builder->n++; + if (ii_flags & GRN_OBJ_WITH_WEIGHT) { + rc = grn_ii_builder_term_append(ctx, term, weight); + if (rc != GRN_SUCCESS) { + return rc; + } + builder->n++; + } + term->rid = rid; + term->sid = sid; + term->pos = 0; + } + if (ii_flags & GRN_OBJ_WITH_POSITION) { + rc = grn_ii_builder_term_append(ctx, term, pos - term->pos); + if (rc != GRN_SUCCESS) { + return rc; + } + builder->n++; + term->pos = pos; + } else { + term->freq++; + } + return GRN_SUCCESS; +} + +/* + * grn_ii_builder_append_value appends a value. Note that values must be + * appended in ascending rid and sid order. + */ +static grn_rc +grn_ii_builder_append_value(grn_ctx *ctx, grn_ii_builder *builder, + grn_id rid, uint32_t sid, uint32_t weight, + const char *value, uint32_t value_size) +{ + uint32_t pos = 0; + grn_token_cursor *cursor; + if (rid != builder->rid) { + builder->rid = rid; + builder->sid = sid; + builder->pos = 1; + } else if (sid != builder->sid) { + builder->sid = sid; + builder->pos = 1; + } else { + /* Insert a space between values. */ + builder->pos++; + } + cursor = grn_token_cursor_open(ctx, builder->lexicon, value, value_size, + GRN_TOKEN_ADD, 0); + if (!cursor) { + if (ctx->rc == GRN_SUCCESS) { + ERR(GRN_UNKNOWN_ERROR, + "grn_token_cursor_open failed: value = <%.*s>", value_size, value); + } + return ctx->rc; + } + while (cursor->status == GRN_TOKEN_CURSOR_DOING) { + grn_id tid = grn_token_cursor_next(ctx, cursor); + if (tid != GRN_ID_NIL) { + pos = builder->pos + cursor->pos; + grn_rc rc = grn_ii_builder_append_token(ctx, builder, rid, sid, weight, + tid, pos); + if (rc != GRN_SUCCESS) { + break; + } + } + } + grn_token_cursor_close(ctx, cursor); + builder->pos = pos + 1; + return ctx->rc; +} + +/* grn_ii_builder_append_obj appends a BULK, UVECTOR or VECTOR object. */ +static grn_rc +grn_ii_builder_append_obj(grn_ctx *ctx, grn_ii_builder *builder, + grn_id rid, uint32_t sid, grn_obj *obj) +{ + switch (obj->header.type) { + case GRN_BULK : + return grn_ii_builder_append_value(ctx, builder, rid, sid, 0, + GRN_TEXT_VALUE(obj), GRN_TEXT_LEN(obj)); + case GRN_UVECTOR : + { + const char *p = GRN_BULK_HEAD(obj); + uint32_t i, n_values = grn_uvector_size(ctx, obj); + uint32_t value_size = grn_uvector_element_size(ctx, obj); + for (i = 0; i < n_values; i++) { + grn_rc rc = grn_ii_builder_append_value(ctx, builder, rid, sid, 0, + p, value_size); + if (rc != GRN_SUCCESS) { + return rc; + } + p += value_size; + } + } + return GRN_SUCCESS; + case GRN_VECTOR : + if (obj->u.v.body) { + /* + * Note that the following sections and n_sections don't correspond to + * source columns. + */ + int i, n_secs = obj->u.v.n_sections; + grn_section *secs = obj->u.v.sections; + const char *head = GRN_BULK_HEAD(obj->u.v.body); + for (i = 0; i < n_secs; i++) { + grn_rc rc; + grn_section *sec = &secs[i]; + if (sec->length == 0) { + continue; + } + rc = grn_ii_builder_append_value(ctx, builder, rid, sid, sec->weight, + head + sec->offset, sec->length); + if (rc != GRN_SUCCESS) { + return rc; + } + } + } + return GRN_SUCCESS; + default : + ERR(GRN_INVALID_ARGUMENT, "[index] invalid object assigned as value"); + return ctx->rc; + } +} + +/* + * grn_ii_builder_append_srcs reads values from source columns and appends the + * values. + */ +static grn_rc +grn_ii_builder_append_srcs(grn_ctx *ctx, grn_ii_builder *builder) +{ + size_t i; + grn_rc rc = GRN_SUCCESS; + grn_obj *objs; + grn_table_cursor *cursor; + + /* Allocate memory for objects to store source values. */ + objs = GRN_MALLOCN(grn_obj, builder->n_srcs); + if (!objs) { + ERR(GRN_NO_MEMORY_AVAILABLE, + "failed to allocate memory for objs: n_srcs = %u", builder->n_srcs); + return ctx->rc; + } + + /* Create a block lexicon. */ + rc = grn_ii_builder_create_lexicon(ctx, builder); + if (rc != GRN_SUCCESS) { + GRN_FREE(objs); + return rc; + } + + /* Create a cursor to get records in the ID order. */ + cursor = grn_table_cursor_open(ctx, builder->src_table, NULL, 0, NULL, 0, + 0, -1, GRN_CURSOR_BY_ID); + if (!cursor) { + if (ctx->rc == GRN_SUCCESS) { + ERR(GRN_OBJECT_CORRUPT, "[index] failed to open table cursor"); + } + GRN_FREE(objs); + return ctx->rc; + } + + /* Read source values and append it. */ + for (i = 0; i < builder->n_srcs; i++) { + GRN_TEXT_INIT(&objs[i], 0); + } + while (rc == GRN_SUCCESS) { + grn_id rid = grn_table_cursor_next(ctx, cursor); + if (rid == GRN_ID_NIL) { + break; + } + for (i = 0; i < builder->n_srcs; i++) { + grn_obj *obj = &objs[i]; + grn_obj *src = builder->srcs[i]; + rc = grn_obj_reinit_for(ctx, obj, src); + if (rc == GRN_SUCCESS) { + if (GRN_OBJ_TABLEP(src)) { + int len = grn_table_get_key2(ctx, src, rid, obj); + if (len <= 0) { + if (ctx->rc == GRN_SUCCESS) { + ERR(GRN_UNKNOWN_ERROR, "failed to get key: rid = %u, len = %d", + rid, len); + } + rc = ctx->rc; + } + } else { + if (!grn_obj_get_value(ctx, src, rid, obj)) { + if (ctx->rc == GRN_SUCCESS) { + ERR(GRN_UNKNOWN_ERROR, "failed to get value: rid = %u", rid); + } + rc = ctx->rc; + } + } + if (rc == GRN_SUCCESS) { + uint32_t sid = (uint32_t)(i + 1); + rc = grn_ii_builder_append_obj(ctx, builder, rid, sid, obj); + if (rc == GRN_SUCCESS && + builder->n >= builder->options.block_threshold) { + rc = grn_ii_builder_flush_block(ctx, builder); + } + } + } + } + } + if (rc == GRN_SUCCESS) { + rc = grn_ii_builder_flush_block(ctx, builder); + } + for (i = 0; i < builder->n_srcs; i++) { + GRN_OBJ_FIN(ctx, &objs[i]); + } + grn_table_cursor_close(ctx, cursor); + GRN_FREE(objs); + return rc; +} + +/* grn_ii_builder_set_src_table sets a source table. */ +static grn_rc +grn_ii_builder_set_src_table(grn_ctx *ctx, grn_ii_builder *builder) +{ + builder->src_table = grn_ctx_at(ctx, DB_OBJ(builder->ii)->range); + if (!builder->src_table) { + if (ctx->rc == GRN_SUCCESS) { + ERR(GRN_INVALID_ARGUMENT, "source table is null: range = %d", + DB_OBJ(builder->ii)->range); + } + return ctx->rc; + } + return GRN_SUCCESS; +} + +/* grn_ii_builder_set_srcs sets source columns. */ +static grn_rc +grn_ii_builder_set_srcs(grn_ctx *ctx, grn_ii_builder *builder) +{ + size_t i; + grn_id *source; + builder->n_srcs = builder->ii->obj.source_size / sizeof(grn_id); + source = (grn_id *)builder->ii->obj.source; + if (!source || !builder->n_srcs) { + ERR(GRN_INVALID_ARGUMENT, + "source is not available: source = %p, source_size = %u", + builder->ii->obj.source, builder->ii->obj.source_size); + return ctx->rc; + } + builder->srcs = GRN_MALLOCN(grn_obj *, builder->n_srcs); + if (!builder->srcs) { + return GRN_NO_MEMORY_AVAILABLE; + } + for (i = 0; i < builder->n_srcs; i++) { + builder->srcs[i] = grn_ctx_at(ctx, source[i]); + if (!builder->srcs[i]) { + if (ctx->rc == GRN_SUCCESS) { + ERR(GRN_OBJECT_CORRUPT, "source not found: id = %d", source[i]); + } + return ctx->rc; + } + } + /* Calculate the number of bits required to represent a section ID. */ + while (((uint32_t)1 << builder->sid_bits) < builder->n_srcs) { + builder->sid_bits++; + } + builder->sid_mask = ((uint64_t)1 << builder->sid_bits) - 1; + return GRN_SUCCESS; +} + +/* grn_ii_builder_append_source appends values in source columns. */ +static grn_rc +grn_ii_builder_append_source(grn_ctx *ctx, grn_ii_builder *builder) +{ + grn_rc rc = grn_ii_builder_set_src_table(ctx, builder); + if (rc != GRN_SUCCESS) { + return rc; + } + if (grn_table_size(ctx, builder->src_table) == 0) { + /* Nothing to do because there are no values. */ + return ctx->rc; + } + rc = grn_ii_builder_set_srcs(ctx, builder); + if (rc != GRN_SUCCESS) { + return rc; + } + rc = grn_ii_builder_append_srcs(ctx, builder); + if (rc != GRN_SUCCESS) { + return rc; + } + grn_ii_builder_fin_terms(ctx, builder); + return GRN_SUCCESS; +} + +/* + * grn_ii_builder_fill_block reads the next data from a temporary file and fill + * a block buffer. + */ +static grn_rc +grn_ii_builder_fill_block(grn_ctx *ctx, grn_ii_builder *builder, + uint32_t block_id) +{ + ssize_t size; + uint32_t buf_rest; + uint64_t file_offset; + grn_ii_builder_block *block = &builder->blocks[block_id]; + if (!block->rest) { + return GRN_END_OF_DATA; + } + if (!block->buf) { + block->buf = (uint8_t *)GRN_MALLOC(builder->options.block_buf_size); + if (!block->buf) { + ERR(GRN_NO_MEMORY_AVAILABLE, + "failed to allocate memory for buffered input: size = %u", + builder->options.block_buf_size); + return ctx->rc; + } + } + + /* Move the remaining data to the head. */ + buf_rest = block->end - block->cur; + if (buf_rest) { + grn_memmove(block->buf, block->cur, buf_rest); + } + block->cur = block->buf; + block->end = block->buf + buf_rest; + + /* Read the next data. */ + file_offset = grn_lseek(builder->fd, block->offset, SEEK_SET); + if (file_offset != block->offset) { + SERR("failed to seek file: expected = %" GRN_FMT_INT64U + ", actual = %" GRN_FMT_INT64D, + block->offset, file_offset); + return ctx->rc; + } + buf_rest = builder->options.block_buf_size - buf_rest; + if (block->rest < buf_rest) { + buf_rest = block->rest; + } + size = grn_read(builder->fd, block->end, buf_rest); + if (size <= 0) { + SERR("failed to read data: expected = %u, actual = %" GRN_FMT_INT64D, + buf_rest, (int64_t)size); + return ctx->rc; + } + block->offset += size; + block->rest -= size; + block->end += size; + return GRN_SUCCESS; +} + +/* grn_ii_builder_read_from_block reads the next value from a block. */ +static grn_rc +grn_ii_builder_read_from_block(grn_ctx *ctx, grn_ii_builder *builder, + uint32_t block_id, uint64_t *value) +{ + grn_ii_builder_block *block = &builder->blocks[block_id]; + grn_rc rc = grn_ii_builder_block_next(ctx, block, value); + if (rc == GRN_SUCCESS) { + return GRN_SUCCESS; + } else if (rc == GRN_END_OF_DATA) { + rc = grn_ii_builder_fill_block(ctx, builder, block_id); + if (rc != GRN_SUCCESS) { + return rc; + } + return grn_ii_builder_block_next(ctx, block, value); + } + return rc; +} + +/* grn_ii_builder_pack_chunk tries to pack a chunk. */ +static grn_rc +grn_ii_builder_pack_chunk(grn_ctx *ctx, grn_ii_builder *builder, + grn_bool *packed) +{ + grn_id rid; + uint32_t sid, pos, *a; + grn_ii_builder_chunk *chunk = &builder->chunk; + *packed = GRN_FALSE; + if (chunk->offset != 1) { /* df != 1 */ + return GRN_SUCCESS; + } + if (chunk->weight_buf && chunk->weight_buf[0]) { /* weight != 0 */ + return GRN_SUCCESS; + } + if (chunk->freq_buf[0] != 0) { /* freq != 1 */ + return GRN_SUCCESS; + } + rid = chunk->rid_buf[0]; + if (chunk->sid_buf) { + if (rid >= 0x100000) { + return GRN_SUCCESS; + } + sid = chunk->sid_buf[0] + 1; + if (sid >= 0x800) { + return GRN_SUCCESS; + } + a = array_get(ctx, builder->ii, chunk->tid); + a[0] = ((rid << 12) + (sid << 1)) | 1; + } else { + a = array_get(ctx, builder->ii, chunk->tid); + a[0] = (rid << 1) | 1; + } + pos = 0; + if (chunk->pos_buf) { + pos = chunk->pos_buf[0]; + } + a[1] = pos; + array_unref(builder->ii, chunk->tid); + *packed = GRN_TRUE; + + grn_ii_builder_chunk_clear(ctx, chunk); + return GRN_SUCCESS; +} + +/* grn_ii_builder_get_cinfo returns a new cinfo. */ +static grn_rc +grn_ii_builder_get_cinfo(grn_ctx *ctx, grn_ii_builder *builder, + chunk_info **cinfo) +{ + if (builder->n_cinfos == builder->cinfos_size) { + uint32_t size = builder->cinfos_size ? (builder->cinfos_size * 2) : 1; + size_t n_bytes = size * sizeof(chunk_info); + chunk_info *cinfos = (chunk_info *)GRN_REALLOC(builder->cinfos, n_bytes); + if (!cinfos) { + ERR(GRN_NO_MEMORY_AVAILABLE, + "failed to allocate memory for cinfos: n_bytes = %" GRN_FMT_SIZE, + n_bytes); + return ctx->rc; + } + builder->cinfos = cinfos; + builder->cinfos_size = size; + } + *cinfo = &builder->cinfos[builder->n_cinfos++]; + return GRN_SUCCESS; +} + +/* grn_ii_builder_flush_chunk flushes a chunk. */ +static grn_rc +grn_ii_builder_flush_chunk(grn_ctx *ctx, grn_ii_builder *builder) +{ + grn_rc rc; + chunk_info *cinfo = NULL; + grn_ii_builder_chunk *chunk = &builder->chunk; + void *seg; + uint8_t *in; + uint32_t in_size, chunk_id, seg_id, seg_offset, seg_rest; + + rc = grn_ii_builder_chunk_encode(ctx, chunk, NULL, 0); + if (rc != GRN_SUCCESS) { + return rc; + } + in = chunk->enc_buf; + in_size = chunk->enc_offset; + + rc = chunk_new(ctx, builder->ii, &chunk_id, chunk->enc_offset); + if (rc != GRN_SUCCESS) { + return rc; + } + + /* Copy to the first segment. */ + seg_id = chunk_id >> GRN_II_N_CHUNK_VARIATION; + seg_offset = (chunk_id & ((1 << GRN_II_N_CHUNK_VARIATION) - 1)) << + GRN_II_W_LEAST_CHUNK; + GRN_IO_SEG_REF(builder->ii->chunk, seg_id, seg); + if (!seg) { + if (ctx->rc == GRN_SUCCESS) { + ERR(GRN_UNKNOWN_ERROR, + "failed access chunk segment: chunk_id = %u, seg_id = %u", + chunk_id, seg_id); + } + return ctx->rc; + } + seg_rest = S_CHUNK - seg_offset; + if (in_size <= seg_rest) { + grn_memcpy((uint8_t *)seg + seg_offset, in, in_size); + in_size = 0; + } else { + grn_memcpy((uint8_t *)seg + seg_offset, in, seg_rest); + in += seg_rest; + in_size -= seg_rest; + } + GRN_IO_SEG_UNREF(builder->ii->chunk, seg_id); + + /* Copy to the next segments. */ + while (in_size) { + seg_id++; + GRN_IO_SEG_REF(builder->ii->chunk, seg_id, seg); + if (!seg) { + if (ctx->rc == GRN_SUCCESS) { + ERR(GRN_UNKNOWN_ERROR, + "failed access chunk segment: chunk_id = %u, seg_id = %u", + chunk_id, seg_id); + } + return ctx->rc; + } + if (in_size <= S_CHUNK) { + grn_memcpy(seg, in, in_size); + in_size = 0; + } else { + grn_memcpy(seg, in, S_CHUNK); + in += S_CHUNK; + in_size -= S_CHUNK; + } + GRN_IO_SEG_UNREF(builder->ii->chunk, seg_id); + } + + /* Append a cinfo. */ + rc = grn_ii_builder_get_cinfo(ctx, builder, &cinfo); + if (rc != GRN_SUCCESS) { + return rc; + } + cinfo->segno = chunk_id; + cinfo->size = chunk->enc_offset; + cinfo->dgap = chunk->rid_gap; + + builder->buf.ii->header->total_chunk_size += chunk->enc_offset; + grn_ii_builder_chunk_clear(ctx, chunk); + return GRN_SUCCESS; +} + +/* grn_ii_builder_read_to_chunk read values from a block to a chunk. */ +static grn_rc +grn_ii_builder_read_to_chunk(grn_ctx *ctx, grn_ii_builder *builder, + uint32_t block_id) +{ + grn_rc rc; + uint64_t value; + uint32_t rid = GRN_ID_NIL, last_sid = 0; + uint32_t ii_flags = builder->ii->header->flags; + grn_ii_builder_chunk *chunk = &builder->chunk; + for (;;) { + uint32_t gap, freq; + uint64_t value; + rc = grn_ii_builder_read_from_block(ctx, builder, block_id, &value); + if (rc != GRN_SUCCESS) { + return rc; + } + if (!value) { + break; + } + if (builder->chunk.offset == builder->chunk.size) { + rc = grn_ii_builder_chunk_extend_bufs(ctx, chunk, ii_flags); + if (rc != GRN_SUCCESS) { + return rc; + } + } + + /* Read record ID. */ + gap = value >> builder->sid_bits; /* In-block gap */ + if (gap) { + if (chunk->n >= builder->options.chunk_threshold) { + rc = grn_ii_builder_flush_chunk(ctx, builder); + if (rc != GRN_SUCCESS) { + return rc; + } + } + last_sid = 0; + } + rid += gap; + gap = rid - chunk->rid; /* Global gap */ + chunk->rid_buf[chunk->offset] = chunk->offset ? gap : rid; + chunk->n++; + chunk->rid = rid; + chunk->rid_gap += gap; + builder->df++; + + /* Read section ID. */ + if (ii_flags & GRN_OBJ_WITH_SECTION) { + uint32_t sid = (value & builder->sid_mask) + 1; + chunk->sid_buf[chunk->offset] = sid - last_sid - 1; + chunk->n++; + last_sid = sid; + } + + /* Read weight. */ + if (ii_flags & GRN_OBJ_WITH_WEIGHT) { + uint32_t weight; + rc = grn_ii_builder_read_from_block(ctx, builder, block_id, &value); + if (rc != GRN_SUCCESS) { + return rc; + } + weight = value; + chunk->weight_buf[chunk->offset] = weight; + chunk->n++; + } + + /* Read positions or a frequency. */ + if (ii_flags & GRN_OBJ_WITH_POSITION) { + uint32_t pos = -1; + freq = 0; + for (;;) { + rc = grn_ii_builder_read_from_block(ctx, builder, block_id, &value); + if (rc != GRN_SUCCESS) { + return rc; + } + if (!value) { + break; + } + if (builder->chunk.pos_offset == builder->chunk.pos_size) { + rc = grn_ii_builder_chunk_extend_pos_buf(ctx, chunk); + if (rc != GRN_SUCCESS) { + return rc; + } + } + if (pos == -1) { + chunk->pos_buf[chunk->pos_offset] = value - 1; + chunk->pos_sum += value - 1; + } else { + chunk->pos_buf[chunk->pos_offset] = value; + chunk->pos_sum += value; + } + chunk->n++; + pos += value; + chunk->pos_offset++; + freq++; + } + } else { + rc = grn_ii_builder_read_from_block(ctx, builder, block_id, &value); + if (rc != GRN_SUCCESS) { + return rc; + } + freq = value; + } + chunk->freq_buf[chunk->offset] = freq - 1; + chunk->n++; + chunk->offset++; + } + rc = grn_ii_builder_read_from_block(ctx, builder, block_id, &value); + if (rc == GRN_SUCCESS) { + builder->blocks[block_id].tid = value; + } else if (rc == GRN_END_OF_DATA) { + builder->blocks[block_id].tid = GRN_ID_NIL; + } else { + return rc; + } + return GRN_SUCCESS; +} + +/* grn_ii_builder_register_chunks registers chunks. */ +static grn_rc +grn_ii_builder_register_chunks(grn_ctx *ctx, grn_ii_builder *builder) +{ + grn_rc rc; + uint32_t buf_tid, *a; + buffer_term *buf_term; + + rc = grn_ii_builder_chunk_encode(ctx, &builder->chunk, builder->cinfos, + builder->n_cinfos); + if (rc != GRN_SUCCESS) { + return rc; + } + + if (!grn_ii_builder_buffer_is_assigned(ctx, &builder->buf)) { + rc = grn_ii_builder_buffer_assign(ctx, &builder->buf); + if (rc != GRN_SUCCESS) { + return rc; + } + } + buf_tid = builder->buf.buf->header.nterms; + if (buf_tid >= builder->options.buffer_max_n_terms || + GRN_II_BUILDER_BUFFER_CHUNK_SIZE - builder->buf.chunk_offset < + builder->chunk.enc_offset) { + rc = grn_ii_builder_buffer_flush(ctx, &builder->buf); + if (rc != GRN_SUCCESS) { + return rc; + } + rc = grn_ii_builder_buffer_assign(ctx, &builder->buf); + if (rc != GRN_SUCCESS) { + return rc; + } + buf_tid = 0; + } + buf_term = &builder->buf.buf->terms[buf_tid]; + buf_term->tid = builder->chunk.tid; + if (builder->n_cinfos) { + buf_term->tid |= CHUNK_SPLIT; + } + buf_term->size_in_buffer = 0; + buf_term->pos_in_buffer = 0; + buf_term->size_in_chunk = builder->chunk.enc_offset; + buf_term->pos_in_chunk = builder->buf.chunk_offset; + + grn_memcpy(builder->buf.chunk + builder->buf.chunk_offset, + builder->chunk.enc_buf, builder->chunk.enc_offset); + builder->buf.chunk_offset += builder->chunk.enc_offset; + + a = array_get(ctx, builder->ii, builder->chunk.tid); + a[0] = SEG2POS(builder->buf.buf_id, + sizeof(buffer_header) + buf_tid * sizeof(buffer_term)); + a[1] = builder->df; + array_unref(builder->ii, builder->chunk.tid); + + builder->buf.buf->header.nterms++; + builder->n_cinfos = 0; + grn_ii_builder_chunk_clear(ctx, &builder->chunk); + return GRN_SUCCESS; +} + +static grn_rc +grn_ii_builder_commit(grn_ctx *ctx, grn_ii_builder *builder) +{ + uint32_t i; + grn_rc rc; + grn_table_cursor *cursor; + + for (i = 0; i < builder->n_blocks; i++) { + uint64_t value; + rc = grn_ii_builder_read_from_block(ctx, builder, i, &value); + if (rc != GRN_SUCCESS) { + return rc; + } + builder->blocks[i].tid = value; + } + + cursor = grn_table_cursor_open(ctx, builder->ii->lexicon, + NULL, 0, NULL, 0, 0, -1, GRN_CURSOR_BY_KEY); + for (;;) { + grn_id tid = grn_table_cursor_next(ctx, cursor); + if (tid == GRN_ID_NIL) { + break; + } + builder->chunk.tid = tid; + builder->chunk.rid = GRN_ID_NIL; + builder->df = 0; + for (i = 0; i < builder->n_blocks; i++) { + if (tid == builder->blocks[i].tid) { + rc = grn_ii_builder_read_to_chunk(ctx, builder, i); + if (rc != GRN_SUCCESS) { + return rc; + } + } + } + if (!builder->chunk.n) { + /* This term does not appear. */ + continue; + } + if (!builder->n_cinfos) { + grn_bool packed; + rc = grn_ii_builder_pack_chunk(ctx, builder, &packed); + if (rc != GRN_SUCCESS) { + return rc; + } + if (packed) { + continue; + } + } + rc = grn_ii_builder_register_chunks(ctx, builder); + if (rc != GRN_SUCCESS) { + return rc; + } + } + if (grn_ii_builder_buffer_is_assigned(ctx, &builder->buf)) { + rc = grn_ii_builder_buffer_flush(ctx, &builder->buf); + if (rc != GRN_SUCCESS) { + return rc; + } + } + return GRN_SUCCESS; +} + +grn_rc +grn_ii_build2(grn_ctx *ctx, grn_ii *ii, const grn_ii_builder_options *options) +{ + grn_rc rc, rc_close; + grn_ii_builder *builder; + rc = grn_ii_builder_open(ctx, ii, options, &builder); + if (rc == GRN_SUCCESS) { + rc = grn_ii_builder_append_source(ctx, builder); + if (rc == GRN_SUCCESS) { + rc = grn_ii_builder_commit(ctx, builder); + } + rc_close = grn_ii_builder_close(ctx, builder); + if (rc == GRN_SUCCESS) { + rc = rc_close; + } + } + return rc; +} Modified: lib/index_column.c (+30 -11) =================================================================== --- lib/index_column.c 2016-03-23 14:54:50 +0900 (32004e4) +++ lib/index_column.c 2016-03-23 19:46:05 +0900 (958e000) @@ -23,20 +23,35 @@ #include <string.h> static uint64_t grn_index_sparsity = 10; +static grn_bool grn_index_chunk_split_enable = GRN_FALSE; void grn_index_column_init_from_env(void) { - char grn_index_sparsity_env[GRN_ENV_BUFFER_SIZE]; - grn_getenv("GRN_INDEX_SPARSITY", - grn_index_sparsity_env, - GRN_ENV_BUFFER_SIZE); - if (grn_index_sparsity_env[0]) { - uint64_t sparsity; - errno = 0; - sparsity = strtoull(grn_index_sparsity_env, NULL, 0); - if (errno == 0) { - grn_index_sparsity = sparsity; + { + char grn_index_sparsity_env[GRN_ENV_BUFFER_SIZE]; + grn_getenv("GRN_INDEX_SPARSITY", + grn_index_sparsity_env, + GRN_ENV_BUFFER_SIZE); + if (grn_index_sparsity_env[0]) { + uint64_t sparsity; + errno = 0; + sparsity = strtoull(grn_index_sparsity_env, NULL, 0); + if (errno == 0) { + grn_index_sparsity = sparsity; + } + } + } + + { + char grn_index_chunk_split_enable_env[GRN_ENV_BUFFER_SIZE]; + grn_getenv("GRN_INDEX_CHUNK_SPLIT_ENABLE", + grn_index_chunk_split_enable_env, + GRN_ENV_BUFFER_SIZE); + if (grn_index_chunk_split_enable_env[0]) { + grn_index_chunk_split_enable = GRN_TRUE; + } else { + grn_index_chunk_split_enable = GRN_FALSE; } } } @@ -118,7 +133,11 @@ grn_index_column_build(grn_ctx *ctx, grn_obj *index_column) } } if (use_grn_ii_build) { - grn_ii_build(ctx, ii, grn_index_sparsity); + if (grn_index_chunk_split_enable) { + grn_ii_build2(ctx, ii, NULL); + } else { + grn_ii_build(ctx, ii, grn_index_sparsity); + } } else { grn_table_cursor *tc; if ((tc = grn_table_cursor_open(ctx, target, NULL, 0, NULL, 0,