[Groonga-commit] groonga/groonga [master] Support vector in offline index construction

Back to archive index

Kouhei Sutou null+****@clear*****
Mon Oct 29 14:56:09 JST 2012


Kouhei Sutou	2012-10-29 14:56:09 +0900 (Mon, 29 Oct 2012)

  New Revision: e79a06dd6b6579abf0ac2366b708edf0ec635145
  https://github.com/groonga/groonga/commit/e79a06dd6b6579abf0ac2366b708edf0ec635145

  Merged 0b337f3: Merge pull request #39 from groonga/offline-index-costruction-support-vector

  Log:
    Support vector in offline index construction
    
    The change in db.c is for reusing the text vector object.

  Added files:
    test/command/suite/load/index/offline/vector.expected
    test/command/suite/load/index/offline/vector.test
  Modified files:
    lib/db.c
    lib/ii.c

  Modified: lib/db.c (+4 -1)
===================================================================
--- lib/db.c    2012-10-29 13:31:00 +0900 (d931e1d)
+++ lib/db.c    2012-10-29 14:56:09 +0900 (8295425)
@@ -7322,7 +7322,10 @@ grn_obj_reinit(grn_ctx *ctx, grn_obj *obj, grn_id domain, unsigned char flags)
       if (flags & GRN_OBJ_VECTOR) {
         if (obj->header.type != GRN_VECTOR) { grn_bulk_fin(ctx, obj); }
         obj->header.type = GRN_VECTOR;
-        obj->u.v.sections = NULL;
+        if (obj->u.v.body) {
+          grn_obj_reinit(ctx, obj->u.v.body, domain, 0);
+        }
+        obj->u.v.n_sections = 0;
       } else {
         if (obj->header.type == GRN_VECTOR) { VECTOR_CLEAR(ctx,obj); }
         obj->header.type = GRN_BULK;

  Modified: lib/ii.c (+32 -6)
===================================================================
--- lib/ii.c    2012-10-29 13:31:00 +0900 (04a7e90)
+++ lib/ii.c    2012-10-29 14:56:09 +0900 (3d49396)
@@ -6743,9 +6743,9 @@ get_buffer_counter(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
 
 static void
 grn_ii_buffer_tokenize(grn_ctx *ctx, grn_ii_buffer *ii_buffer, grn_id rid,
-                       unsigned int sid, unsigned int weight, grn_obj *value)
+                       unsigned int sid, unsigned int weight,
+                       const char *value, uint32_t value_len)
 {
-  uint32_t value_len = GRN_TEXT_LEN(value);
   if (value_len) {
     grn_obj *tmp_lexicon;
     uint32_t est_len = value_len + 2;
@@ -6770,7 +6770,7 @@ grn_ii_buffer_tokenize(grn_ctx *ctx, grn_ii_buffer *ii_buffer, grn_id rid,
       if (weight) {
         buffer[block_pos++] = weight + II_BUFFER_WEIGHT_FLAG;
       }
-      if ((token = grn_token_open(ctx, tmp_lexicon, GRN_TEXT_VALUE(value),
+      if ((token = grn_token_open(ctx, tmp_lexicon, value,
                                   value_len, grn_token_add))) {
         uint32_t pos;
         for (pos = 0; !token->status; pos++) {
@@ -7179,7 +7179,8 @@ grn_rc
 grn_ii_buffer_append(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
                      grn_id rid, unsigned int sid, grn_obj *value)
 {
-  grn_ii_buffer_tokenize(ctx, ii_buffer, rid, sid, 0, value);
+  grn_ii_buffer_tokenize(ctx, ii_buffer, rid, sid, 0,
+                         GRN_TEXT_VALUE(value), GRN_TEXT_LEN(value));
   return ctx->rc;
 }
 
@@ -7318,13 +7319,38 @@ grn_ii_buffer_parse(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
       int sid;
       grn_obj **col;
       for (sid = 1, col = cols; sid <= ncols; sid++, col++) {
-        GRN_BULK_REWIND(&rv);
+        grn_obj_reinit_for(ctx, &rv, *col);
         if (GRN_OBJ_TABLEP(*col)) {
           grn_table_get_key2(ctx, *col, rid, &rv);
         } else {
           grn_obj_get_value(ctx, *col, rid, &rv);
         }
-        grn_ii_buffer_tokenize(ctx, ii_buffer, rid, sid, 0, &rv);
+        switch (rv.header.type) {
+        case GRN_BULK :
+          grn_ii_buffer_tokenize(ctx, ii_buffer, rid, sid, 0,
+                                 GRN_TEXT_VALUE(&rv), GRN_TEXT_LEN(&rv));
+          break;
+        case GRN_VECTOR :
+          if (rv.u.v.body) {
+            int i;
+            int n_sections = rv.u.v.n_sections;
+            grn_section *sections = rv.u.v.sections;
+            const char *head = GRN_BULK_HEAD(rv.u.v.body);
+            for (i = 0; i < n_sections; i++) {
+              grn_section *section = sections + i;
+              if (section->length == 0) {
+                continue;
+              }
+              grn_ii_buffer_tokenize(ctx, ii_buffer, rid,
+                                     sid, section->weight,
+                                     head + section->offset, section->length);
+            }
+          }
+          break;
+        default :
+          ERR(GRN_INVALID_ARGUMENT, "[index] invalid object assigned as value");
+          break;
+        }
       }
     }
     GRN_OBJ_FIN(ctx, &rv);

  Added: test/command/suite/load/index/offline/vector.expected (+60 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/load/index/offline/vector.expected    2012-10-29 14:56:09 +0900 (feef26b)
@@ -0,0 +1,60 @@
+table_create Users TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Users name COLUMN_VECTOR ShortText
+[[0,0.0,0.0],true]
+load --table Users
+[
+["name"],
+["Alice"],
+["Bob"]
+]
+[[0,0.0,0.0],2]
+table_create Words TABLE_PAT_KEY --key_type ShortText   --default_tokenizer TokenBigramSplitSymbolAlpha
+[[0,0.0,0.0],true]
+column_create Words users_name COLUMN_INDEX Users name
+[[0,0.0,0.0],true]
+select Words --output_columns _key
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        8
+      ],
+      [
+        [
+          "_key",
+          "ShortText"
+        ]
+      ],
+      [
+        "Al"
+      ],
+      [
+        "Bo"
+      ],
+      [
+        "b"
+      ],
+      [
+        "ce"
+      ],
+      [
+        "e"
+      ],
+      [
+        "ic"
+      ],
+      [
+        "li"
+      ],
+      [
+        "ob"
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/load/index/offline/vector.test (+15 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/load/index/offline/vector.test    2012-10-29 14:56:09 +0900 (54a24e5)
@@ -0,0 +1,15 @@
+table_create Users TABLE_NO_KEY
+column_create Users name COLUMN_VECTOR ShortText
+
+load --table Users
+[
+["name"],
+["Alice"],
+["Bob"]
+]
+
+table_create Words TABLE_PAT_KEY --key_type ShortText \
+  --default_tokenizer TokenBigramSplitSymbolAlpha
+column_create Words users_name COLUMN_INDEX Users name
+
+select Words --output_columns _key
-------------- next part --------------
HTMLの添付ファイルを保管しました...
Download 



More information about the Groonga-commit mailing list
Back to archive index