null+****@clear*****
null+****@clear*****
2012年 2月 20日 (月) 20:53:10 JST
Daijiro MORI 2012-02-20 20:53:10 +0900 (Mon, 20 Feb 2012)
New Revision: 2c3ecbcf7eae08a84be48f35e1431ab0b0686e57
Log:
grn_ii_buffer: support GRN_OBJ_WITH_SECTION.
Modified files:
lib/ii.c
Modified: lib/ii.c (+189 -51)
===================================================================
--- lib/ii.c 2012-02-17 17:05:22 +0900 (bc88833)
+++ lib/ii.c 2012-02-20 20:53:10 +0900 (7ecadd3)
@@ -6338,6 +6338,7 @@ grn_ii_inspect_elements(grn_ctx *ctx, grn_ii *ii, grn_obj *buf)
/********************** buffered index builder ***********************/
const grn_id II_BUFFER_RID_FLAG = 0x80000000;
+const grn_id II_BUFFER_WEIGHT_FLAG = 0x40000000;
#ifdef II_BUFFER_ORDER_BY_ID
const int II_BUFFER_ORDER = GRN_CURSOR_BY_ID;
#else /* II_BUFFER_ORDER_BY_ID */
@@ -6356,11 +6357,12 @@ typedef struct {
grn_id last_rid;
uint32_t last_sid;
uint32_t last_tf;
+ uint32_t last_weight;
uint32_t last_pos;
uint32_t offset_rid;
uint32_t offset_sid;
- uint32_t offset_weight;
uint32_t offset_tf;
+ uint32_t offset_weight;
uint32_t offset_pos;
} ii_buffer_counter;
@@ -6412,7 +6414,7 @@ block_new(grn_ctx *ctx, grn_ii_buffer *ii_buffer)
if (!(ii_buffer->nblocks & 0x3ff)) {
ii_buffer_block *blocks;
if (!(blocks = GRN_REALLOC(ii_buffer->blocks,
- (ii_buffer->nblocks + 0x400) *
+ (ii_buffer->nblocks + 0x400) *
sizeof(ii_buffer_block)))) {
return NULL;
}
@@ -6430,6 +6432,7 @@ static uint8_t *
allocate_outbuf(grn_ctx *ctx, grn_ii_buffer *ii_buffer)
{
size_t bufsize = 0, bufsize_ = 0;
+ uint32_t flags = ii_buffer->ii->header->flags;
ii_buffer_counter *counter = ii_buffer->counters;
grn_id tid, tid_max = grn_table_size(ctx, ii_buffer->tmp_lexicon);
for (tid = 1; tid <= tid_max; counter++, tid++) {
@@ -6440,8 +6443,16 @@ allocate_outbuf(grn_ctx *ctx, grn_ii_buffer *ii_buffer)
bufsize += GRN_B_ENC_SIZE(counter->nrecs);
bufsize += GRN_B_ENC_SIZE(counter->nposts);
bufsize += counter->offset_rid;
+ if ((flags & GRN_OBJ_WITH_SECTION)) {
+ bufsize += counter->offset_sid;
+ }
bufsize += counter->offset_tf;
- bufsize += counter->offset_pos;
+ if ((flags & GRN_OBJ_WITH_WEIGHT)) {
+ bufsize += counter->offset_weight;
+ }
+ if ((flags & GRN_OBJ_WITH_POSITION)) {
+ bufsize += counter->offset_pos;
+ }
if (bufsize_ + II_BUFFER_BLOCK_READ_UNIT_SIZE < bufsize) {
bufsize += sizeof(uint32_t);
bufsize_ = bufsize;
@@ -6461,6 +6472,7 @@ encode_terms(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
uint8_t *outbufp_ = outbuf;
grn_table_cursor *tc;
uint8_t *pnext = (uint8_t *)&block->nextsize;
+ uint32_t flags = ii_buffer->ii->header->flags;
tc = grn_table_cursor_open(ctx, ii_buffer->tmp_lexicon,
NULL, 0, NULL, 0, 0, -1, II_BUFFER_ORDER);
while ((tid = grn_table_cursor_next(ctx, tc)) != GRN_ID_NIL) {
@@ -6471,17 +6483,29 @@ encode_terms(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
ii_buffer_counter *counter = &ii_buffer->counters[tid - 1];
if (counter->nrecs) {
uint32_t offset_rid = counter->offset_rid;
+ uint32_t offset_sid = counter->offset_sid;
uint32_t offset_tf = counter->offset_tf;
+ uint32_t offset_weight = counter->offset_weight;
uint32_t offset_pos = counter->offset_pos;
GRN_B_ENC(gtid, outbufp);
GRN_B_ENC(counter->nrecs, outbufp);
GRN_B_ENC(counter->nposts, outbufp);
counter->offset_rid = outbufp - outbuf;
outbufp += offset_rid;
+ if ((flags & GRN_OBJ_WITH_SECTION)) {
+ counter->offset_sid = outbufp - outbuf;
+ outbufp += offset_sid;
+ }
counter->offset_tf = outbufp - outbuf;
outbufp += offset_tf;
- counter->offset_pos = outbufp - outbuf;
- outbufp += offset_pos;
+ if ((flags & GRN_OBJ_WITH_WEIGHT)) {
+ counter->offset_weight = outbufp - outbuf;
+ outbufp += offset_weight;
+ }
+ if ((flags & GRN_OBJ_WITH_POSITION)) {
+ counter->offset_pos = outbufp - outbuf;
+ outbufp += offset_pos;
+ }
}
if (outbufp_ + II_BUFFER_BLOCK_READ_UNIT_SIZE < outbufp) {
uint32_t size = outbufp - outbufp_ + sizeof(uint32_t);
@@ -6503,40 +6527,64 @@ static void
encode_postings(grn_ctx *ctx, grn_ii_buffer *ii_buffer, uint8_t *outbuf)
{
grn_id rid = 0;
+ unsigned int sid = 1;
+ unsigned int weight = 0;
uint32_t pos = 0;
uint32_t rest;
grn_id *bp = ii_buffer->block_buf;
+ uint32_t flags = ii_buffer->ii->header->flags;
for (rest = ii_buffer->block_pos; rest; bp++, rest--) {
grn_id id = *bp;
if (id & II_BUFFER_RID_FLAG) {
rid = id - II_BUFFER_RID_FLAG;
+ if ((flags & GRN_OBJ_WITH_SECTION)) {
+ sid = *++bp;
+ }
+ weight = 0;
pos = 0;
+ } else if (id & II_BUFFER_WEIGHT_FLAG) {
+ weight = id - II_BUFFER_WEIGHT_FLAG;
} else {
ii_buffer_counter *counter = &ii_buffer->counters[id - 1];
- if (counter->last_rid == rid) {
+ if (counter->last_rid == rid && counter->last_sid == sid) {
counter->last_tf++;
+ counter->last_weight += weight;
} else {
if (counter->last_tf) {
uint8_t *p = outbuf + counter->offset_tf;
GRN_B_ENC(counter->last_tf - 1, p);
counter->offset_tf = p - outbuf;
+ if (flags & GRN_OBJ_WITH_WEIGHT) {
+ p = outbuf + counter->offset_weight;
+ GRN_B_ENC(counter->last_weight, p);
+ counter->offset_weight = p - outbuf;
+ }
}
{
uint8_t *p = outbuf + counter->offset_rid;
GRN_B_ENC(rid - counter->last_rid, p);
counter->offset_rid = p - outbuf;
}
+ if (flags & GRN_OBJ_WITH_SECTION) {
+ uint8_t *p = outbuf + counter->offset_sid;
+ if (counter->last_rid != rid) {
+ GRN_B_ENC(sid - 1, p);
+ } else {
+ GRN_B_ENC(sid - counter->last_sid - 1, p);
+ }
+ counter->offset_sid = p - outbuf;
+ }
counter->last_rid = rid;
- counter->last_sid = 0;
+ counter->last_sid = sid;
counter->last_tf = 1;
counter->last_pos = 0;
}
- {
+ if (flags & GRN_OBJ_WITH_POSITION) {
uint8_t *p = outbuf + counter->offset_pos;
GRN_B_ENC(pos - counter->last_pos, p);
counter->offset_pos = p - outbuf;
+ counter->last_pos = pos;
}
- counter->last_pos = pos;
pos++;
}
}
@@ -6551,6 +6599,12 @@ encode_last_tf(grn_ctx *ctx, grn_ii_buffer *ii_buffer, uint8_t *outbuf)
uint8_t *p = outbuf + counter->offset_tf;
GRN_B_ENC(counter->last_tf - 1, p);
}
+ if ((ii_buffer->ii->header->flags & GRN_OBJ_WITH_WEIGHT)) {
+ for (tid = 1; tid <= tid_max; counter++, tid++) {
+ uint8_t *p = outbuf + counter->offset_weight;
+ GRN_B_ENC(counter->last_weight, p);
+ }
+ }
}
static void
@@ -6633,27 +6687,34 @@ get_buffer_counter(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
}
static void
-grn_ii_buffer_tokenize(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
- grn_id rid, unsigned int section, grn_obj *value)
+grn_ii_buffer_tokenize(grn_ctx *ctx, grn_ii_buffer *ii_buffer, grn_id rid,
+ unsigned int sid, unsigned int weight, grn_obj *value)
{
uint32_t value_len = GRN_TEXT_LEN(value);
if (value_len) {
grn_obj *tmp_lexicon;
- if (ii_buffer->block_buf_size < ii_buffer->block_pos + value_len) {
+ uint32_t est_len = value_len + 2;
+ if (ii_buffer->block_buf_size < ii_buffer->block_pos + est_len) {
grn_ii_buffer_flush(ctx, ii_buffer);
}
- if (ii_buffer->block_buf_size < value_len) {
+ if (ii_buffer->block_buf_size < est_len) {
grn_id *block_buf = (grn_id *)GRN_REALLOC(ii_buffer->block_buf,
- value_len * sizeof(grn_id));
+ est_len * sizeof(grn_id));
if (!block_buf) { return; }
ii_buffer->block_buf = block_buf;
- ii_buffer->block_buf_size = value_len;
+ ii_buffer->block_buf_size = est_len;
}
if ((tmp_lexicon = get_tmp_lexicon(ctx, ii_buffer))) {
grn_token *token;
grn_id *buffer = ii_buffer->block_buf;
uint32_t block_pos = ii_buffer->block_pos;
buffer[block_pos++] = rid + II_BUFFER_RID_FLAG;
+ if ((ii_buffer->ii->header->flags & GRN_OBJ_WITH_SECTION)) {
+ buffer[block_pos++] = sid;
+ }
+ if (weight) {
+ buffer[block_pos++] = weight + II_BUFFER_WEIGHT_FLAG;
+ }
if ((token = grn_token_open(ctx, tmp_lexicon, GRN_TEXT_VALUE(value),
value_len, grn_token_add))) {
uint32_t pos;
@@ -6667,9 +6728,26 @@ grn_ii_buffer_tokenize(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
if (counter->last_rid != rid) {
counter->offset_rid += GRN_B_ENC_SIZE(rid - counter->last_rid);
counter->last_rid = rid;
+ counter->offset_sid += GRN_B_ENC_SIZE(sid - 1);
+ counter->last_sid = sid;
if (counter->last_tf) {
counter->offset_tf += GRN_B_ENC_SIZE(counter->last_tf - 1);
counter->last_tf = 0;
+ counter->offset_weight += GRN_B_ENC_SIZE(counter->last_weight);
+ counter->last_weight = 0;
+ }
+ counter->last_pos = 0;
+ counter->nrecs++;
+ } else if (counter->last_sid != sid) {
+ counter->offset_rid += GRN_B_ENC_SIZE(0);
+ counter->offset_sid +=
+ GRN_B_ENC_SIZE(sid - counter->last_sid - 1);
+ counter->last_sid = sid;
+ if (counter->last_tf) {
+ counter->offset_tf += GRN_B_ENC_SIZE(counter->last_tf - 1);
+ counter->last_tf = 0;
+ counter->offset_weight += GRN_B_ENC_SIZE(counter->last_weight);
+ counter->last_weight = 0;
}
counter->last_pos = 0;
counter->nrecs++;
@@ -6677,6 +6755,7 @@ grn_ii_buffer_tokenize(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
counter->offset_pos += GRN_B_ENC_SIZE(pos - counter->last_pos);
counter->last_pos = pos;
counter->last_tf++;
+ counter->last_weight += weight;
counter->nposts++;
}
}
@@ -6776,16 +6855,29 @@ merge_hit_blocks(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
nrecs += block->nrecs;
nposts += block->nposts;
}
- max_size = nrecs * 2 + nposts;
+ max_size = nrecs * (ii_buffer->ii->n_elements - 1) + nposts;
datavec_reset(ctx, ii_buffer->data_vectors,
ii_buffer->ii->n_elements, nrecs, max_size);
{
- uint32_t *ridp = ii_buffer->data_vectors[0].data;
- uint32_t *tfp = ii_buffer->data_vectors[1].data;
- uint32_t *posp = ii_buffer->data_vectors[2].data;
+ int i;
uint32_t lr = 0;
uint64_t spos = 0;
- int i;
+ uint32_t flags = ii_buffer->ii->header->flags;
+ uint32_t *ridp, *sidp = NULL, *tfp, *weightp = NULL, *posp = NULL;
+ {
+ int j = 0;
+ ridp = ii_buffer->data_vectors[j++].data;
+ if (flags & GRN_OBJ_WITH_SECTION) {
+ sidp = ii_buffer->data_vectors[j++].data;
+ }
+ tfp = ii_buffer->data_vectors[j++].data;
+ if (flags & GRN_OBJ_WITH_WEIGHT) {
+ weightp = ii_buffer->data_vectors[j++].data;
+ }
+ if (flags & GRN_OBJ_WITH_POSITION) {
+ posp = ii_buffer->data_vectors[j++].data;
+ }
+ }
for (i = 0; i < nhits; i++) {
ii_buffer_block *block = hits[i];
uint8_t *p = block->bufcur;
@@ -6799,27 +6891,52 @@ merge_hit_blocks(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
lr += *ridp++;
}
}
+ if ((flags & GRN_OBJ_WITH_SECTION)) {
+ for (n = block->nrecs; n; n--) {
+ GRN_B_DEC(*sidp++, p);
+ }
+ }
for (n = block->nrecs; n; n--) {
GRN_B_DEC(*tfp++, p);
}
- for (n = block->nposts; n; n--) {
- GRN_B_DEC(*posp, p);
- spos += *posp++;
+ if ((flags & GRN_OBJ_WITH_WEIGHT)) {
+ for (n = block->nrecs; n; n--) {
+ GRN_B_DEC(*weightp++, p);
+ }
+ }
+ if ((flags & GRN_OBJ_WITH_POSITION)) {
+ for (n = block->nposts; n; n--) {
+ GRN_B_DEC(*posp, p);
+ spos += *posp++;
+ }
}
block->rest -= (p - block->bufcur);
block->bufcur = p;
grn_ii_buffer_fetch(ctx, ii_buffer, block);
}
- ii_buffer->data_vectors[0].data_size = nrecs;
- ii_buffer->data_vectors[1].data_size = nrecs;
- ii_buffer->data_vectors[2].data_size = nposts;
-
- ii_buffer->data_vectors[0].flags =
- ((nrecs < 16) || (nrecs <= (lr >> 8))) ? 0 : USE_P_ENC;
- ii_buffer->data_vectors[1].flags =
- (nrecs < 3) ? 0 : USE_P_ENC;
- ii_buffer->data_vectors[2].flags =
- (((nposts < 32) || (nposts <= (spos >> 13))) ? 0 : USE_P_ENC)|ODD;
+ {
+ int j = 0;
+ uint32_t f_s = (nrecs < 3) ? 0 : USE_P_ENC;
+ uint32_t f_d = ((nrecs < 16) || (nrecs <= (lr >> 8))) ? 0 : USE_P_ENC;
+ ii_buffer->data_vectors[j].data_size = nrecs;
+ ii_buffer->data_vectors[j++].flags = f_d;
+ if ((flags & GRN_OBJ_WITH_SECTION)) {
+ ii_buffer->data_vectors[j].data_size = nrecs;
+ ii_buffer->data_vectors[j++].flags = f_s;
+ }
+ ii_buffer->data_vectors[j].data_size = nrecs;
+ ii_buffer->data_vectors[j++].flags = f_s;
+ if ((flags & GRN_OBJ_WITH_WEIGHT)) {
+ ii_buffer->data_vectors[j].data_size = nrecs;
+ ii_buffer->data_vectors[j++].flags = f_s;
+ }
+ if ((flags & GRN_OBJ_WITH_POSITION)) {
+ uint32_t f_p = (((nposts < 32) ||
+ (nposts <= (spos >> 13))) ? 0 : USE_P_ENC);
+ ii_buffer->data_vectors[j].data_size = nposts;
+ ii_buffer->data_vectors[j++].flags = f_p|ODD;
+ }
+ }
}
return max_size;
}
@@ -6851,22 +6968,38 @@ grn_ii_buffer_merge(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
{
uint32_t *a = array_get(ctx, ii_buffer->ii, tid);
if (nhits == 1 && hits[0]->nrecs == 1 && hits[0]->nposts == 1) {
+ grn_id rid;
+ uint32_t sid = 1, tf, pos, weight = 0;
ii_buffer_block *block = hits[0];
uint8_t *p = block->bufcur;
- grn_id rid;
- uint32_t tf, pos;
+ uint32_t flags = ii_buffer->ii->header->flags;
GRN_B_DEC(rid, p);
+ if (flags & GRN_OBJ_WITH_SECTION) { GRN_B_DEC(sid, p); }
GRN_B_DEC(tf, p);
+ if (tf != 0) { GRN_LOG(ctx, GRN_LOG_WARNING, "tf=%d", tf); }
+ if (flags & GRN_OBJ_WITH_WEIGHT) { GRN_B_DEC(weight, p); }
GRN_B_DEC(pos, p);
- block->rest -= (p - block->bufcur);
- block->bufcur = p;
- if (tf != 0) {
- GRN_LOG(ctx, GRN_LOG_WARNING, "tf=%d", tf);
+ if (!weight) {
+ if (flags & GRN_OBJ_WITH_SECTION) {
+ if (rid < 0x100000 && sid < 0x800) {
+ a[0] = (rid << 12) + (sid << 1) + 1;
+ a[1] = (flags & GRN_OBJ_WITH_POSITION) ? pos : 0;
+ block->rest -= (p - block->bufcur);
+ block->bufcur = p;
+ grn_ii_buffer_fetch(ctx, ii_buffer, block);
+ return;
+ }
+ } else {
+ a[0] = (rid << 1) + 1;
+ a[1] = (flags & GRN_OBJ_WITH_POSITION) ? pos : 0;
+ block->rest -= (p - block->bufcur);
+ block->bufcur = p;
+ grn_ii_buffer_fetch(ctx, ii_buffer, block);
+ return;
+ }
}
- a[0] = (rid << 1) + 1;
- a[1] = pos;
- grn_ii_buffer_fetch(ctx, ii_buffer, block);
- } else {
+ }
+ {
uint32_t max_size = merge_hit_blocks(ctx, ii_buffer, hits, nhits);
if (ii_buffer->packed_buf &&
ii_buffer->packed_buf_size <
@@ -6954,9 +7087,9 @@ grn_ii_buffer_open(grn_ctx *ctx, grn_ii *ii)
grn_rc
grn_ii_buffer_append(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
- grn_id rid, unsigned int section, grn_obj *value)
+ grn_id rid, unsigned int sid, grn_obj *value)
{
- grn_ii_buffer_tokenize(ctx, ii_buffer, rid, section, value);
+ grn_ii_buffer_tokenize(ctx, ii_buffer, rid, sid, 0, value);
return ctx->rc;
}
@@ -7047,15 +7180,20 @@ grn_ii_buffer_parse(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
if ((tc = grn_table_cursor_open(ctx, target,
NULL, 0, NULL, 0, 0, -1,
GRN_CURSOR_BY_ID))) {
- grn_id id;
+ grn_id rid;
grn_obj rv;
GRN_TEXT_INIT(&rv, 0);
- while ((id = grn_table_cursor_next(ctx, tc)) != GRN_ID_NIL) {
- int i;
- for (i = 0; i < ncols; i++) {
+ while ((rid = grn_table_cursor_next(ctx, tc)) != GRN_ID_NIL) {
+ int sid;
+ grn_obj **col;
+ for (sid = 1, col = cols; sid <= ncols; sid++, col++) {
GRN_BULK_REWIND(&rv);
- grn_obj_get_value(ctx, cols[i], id, &rv);
- grn_ii_buffer_tokenize(ctx, ii_buffer, id, i + 1, &rv);
+ if (GRN_OBJ_TABLEP(*col)) {
+ grn_table_get_key2(ctx, *col, rid, &rv);
+ } else {
+ grn_obj_get_value(ctx, *col, rid, &rv);
+ }
+ grn_ii_buffer_tokenize(ctx, ii_buffer, rid, sid, 0, &rv);
}
}
GRN_OBJ_FIN(ctx, &rv);