Kouhei Sutou
null+****@clear*****
Mon Oct 29 14:56:09 JST 2012
Kouhei Sutou 2012-10-29 14:56:09 +0900 (Mon, 29 Oct 2012) New Revision: e79a06dd6b6579abf0ac2366b708edf0ec635145 https://github.com/groonga/groonga/commit/e79a06dd6b6579abf0ac2366b708edf0ec635145 Merged 0b337f3: Merge pull request #39 from groonga/offline-index-costruction-support-vector Log: Support vector in offline index construction The change in db.c is for reusing text vector object. Added files: test/command/suite/load/index/offline/vector.expected test/command/suite/load/index/offline/vector.test Modified files: lib/db.c lib/ii.c Modified: lib/db.c (+4 -1) =================================================================== --- lib/db.c 2012-10-29 13:31:00 +0900 (d931e1d) +++ lib/db.c 2012-10-29 14:56:09 +0900 (8295425) @@ -7322,7 +7322,10 @@ grn_obj_reinit(grn_ctx *ctx, grn_obj *obj, grn_id domain, unsigned char flags) if (flags & GRN_OBJ_VECTOR) { if (obj->header.type != GRN_VECTOR) { grn_bulk_fin(ctx, obj); } obj->header.type = GRN_VECTOR; - obj->u.v.sections = NULL; + if (obj->u.v.body) { + grn_obj_reinit(ctx, obj->u.v.body, domain, 0); + } + obj->u.v.n_sections = 0; } else { if (obj->header.type == GRN_VECTOR) { VECTOR_CLEAR(ctx,obj); } obj->header.type = GRN_BULK; Modified: lib/ii.c (+32 -6) =================================================================== --- lib/ii.c 2012-10-29 13:31:00 +0900 (04a7e90) +++ lib/ii.c 2012-10-29 14:56:09 +0900 (3d49396) @@ -6743,9 +6743,9 @@ get_buffer_counter(grn_ctx *ctx, grn_ii_buffer *ii_buffer, static void grn_ii_buffer_tokenize(grn_ctx *ctx, grn_ii_buffer *ii_buffer, grn_id rid, - unsigned int sid, unsigned int weight, grn_obj *value) + unsigned int sid, unsigned int weight, + const char *value, uint32_t value_len) { - uint32_t value_len = GRN_TEXT_LEN(value); if (value_len) { grn_obj *tmp_lexicon; uint32_t est_len = value_len + 2; @@ -6770,7 +6770,7 @@ grn_ii_buffer_tokenize(grn_ctx *ctx, grn_ii_buffer *ii_buffer, grn_id rid, if (weight) { buffer[block_pos++] = weight + II_BUFFER_WEIGHT_FLAG; } - if ((token = 
grn_token_open(ctx, tmp_lexicon, GRN_TEXT_VALUE(value), + if ((token = grn_token_open(ctx, tmp_lexicon, value, value_len, grn_token_add))) { uint32_t pos; for (pos = 0; !token->status; pos++) { @@ -7179,7 +7179,8 @@ grn_rc grn_ii_buffer_append(grn_ctx *ctx, grn_ii_buffer *ii_buffer, grn_id rid, unsigned int sid, grn_obj *value) { - grn_ii_buffer_tokenize(ctx, ii_buffer, rid, sid, 0, value); + grn_ii_buffer_tokenize(ctx, ii_buffer, rid, sid, 0, + GRN_TEXT_VALUE(value), GRN_TEXT_LEN(value)); return ctx->rc; } @@ -7318,13 +7319,38 @@ grn_ii_buffer_parse(grn_ctx *ctx, grn_ii_buffer *ii_buffer, int sid; grn_obj **col; for (sid = 1, col = cols; sid <= ncols; sid++, col++) { - GRN_BULK_REWIND(&rv); + grn_obj_reinit_for(ctx, &rv, *col); if (GRN_OBJ_TABLEP(*col)) { grn_table_get_key2(ctx, *col, rid, &rv); } else { grn_obj_get_value(ctx, *col, rid, &rv); } - grn_ii_buffer_tokenize(ctx, ii_buffer, rid, sid, 0, &rv); + switch (rv.header.type) { + case GRN_BULK : + grn_ii_buffer_tokenize(ctx, ii_buffer, rid, sid, 0, + GRN_TEXT_VALUE(&rv), GRN_TEXT_LEN(&rv)); + break; + case GRN_VECTOR : + if (rv.u.v.body) { + int i; + int n_sections = rv.u.v.n_sections; + grn_section *sections = rv.u.v.sections; + const char *head = GRN_BULK_HEAD(rv.u.v.body); + for (i = 0; i < n_sections; i++) { + grn_section *section = sections + i; + if (section->length == 0) { + continue; + } + grn_ii_buffer_tokenize(ctx, ii_buffer, rid, + sid, section->weight, + head + section->offset, section->length); + } + } + break; + default : + ERR(GRN_INVALID_ARGUMENT, "[index] invalid object assigned as value"); + break; + } } } GRN_OBJ_FIN(ctx, &rv); Added: test/command/suite/load/index/offline/vector.expected (+60 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/load/index/offline/vector.expected 2012-10-29 14:56:09 +0900 (feef26b) @@ -0,0 +1,60 @@ +table_create Users TABLE_NO_KEY +[[0,0.0,0.0],true] +column_create Users name COLUMN_VECTOR 
ShortText +[[0,0.0,0.0],true] +load --table Users +[ +["name"], +["Alice"], +["Bob"] +] +[[0,0.0,0.0],2] +table_create Words TABLE_PAT_KEY --key_type ShortText --default_tokenizer TokenBigramSplitSymbolAlpha +[[0,0.0,0.0],true] +column_create Words users_name COLUMN_INDEX Users name +[[0,0.0,0.0],true] +select Words --output_columns _key +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 8 + ], + [ + [ + "_key", + "ShortText" + ] + ], + [ + "Al" + ], + [ + "Bo" + ], + [ + "b" + ], + [ + "ce" + ], + [ + "e" + ], + [ + "ic" + ], + [ + "li" + ], + [ + "ob" + ] + ] + ] +] Added: test/command/suite/load/index/offline/vector.test (+15 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/load/index/offline/vector.test 2012-10-29 14:56:09 +0900 (54a24e5) @@ -0,0 +1,15 @@ +table_create Users TABLE_NO_KEY +column_create Users name COLUMN_VECTOR ShortText + +load --table Users +[ +["name"], +["Alice"], +["Bob"] +] + +table_create Words TABLE_PAT_KEY --key_type ShortText \ + --default_tokenizer TokenBigramSplitSymbolAlpha +column_create Words users_name COLUMN_INDEX Users name + +select Words --output_columns _key -------------- next part -------------- HTMLの添付ファイルを保管しました... Download