[Groonga-commit] groonga/groonga [master] Adopted temporary lexicons to offline index builder

Back to archive index

null+****@clear***** null+****@clear*****
2012年 1月 24日 (火) 07:40:29 JST


Daijiro MORI	2012-01-24 07:40:29 +0900 (Tue, 24 Jan 2012)

  New Revision: 27054d32ebc0333383ab1866c4377c8febc0d82d

  Log:
    Adopted temporary lexicons to offline index builder

  Modified files:
    lib/ii.c

  Modified: lib/ii.c (+58 -41)
===================================================================
--- lib/ii.c    2012-01-23 22:07:20 +0900 (27b257f)
+++ lib/ii.c    2012-01-24 07:40:29 +0900 (20a5919)
@@ -6419,12 +6419,15 @@ grn_ii_builder_flush(grn_ctx *ctx, grn_ii_builder *builder)
     grn_id tid;
     grn_table_cursor  *tc;
     uint32_t *pnext = &block->nextsize;
-    tc = grn_table_cursor_open(ctx, builder->lexicon, NULL, 0, NULL, 0, 0, -1, BUILD_ORDER);
+    tc = grn_table_cursor_open(ctx, builder->tmp_lexicon, NULL, 0, NULL, 0, 0, -1, BUILD_ORDER);
     while ((tid = grn_table_cursor_next(ctx, tc)) != GRN_ID_NIL) {
+      unsigned int key_size;
+      const char *key = _grn_table_key(ctx, builder->tmp_lexicon, tid, &key_size);
+      grn_id gtid = grn_table_add(ctx, builder->lexicon, key, key_size, NULL);
       builder_counter *counter = &builder->counters[tid - 1];
       if (counter->nrecs) {
         uint32_t nposts = counter->nposts;
-        outbuf[pos++] = tid;
+        outbuf[pos++] = gtid;
         outbuf[pos++] = counter->nrecs;
         outbuf[pos++] = counter->nposts;
         counter->offset = pos;
@@ -6479,7 +6482,7 @@ grn_ii_builder_flush(grn_ctx *ctx, grn_ii_builder *builder)
   GRN_FREE(outbuf);
   {
     builder_counter *counter;
-    grn_id tid, tid_max = grn_table_size(ctx, builder->lexicon);
+    grn_id tid, tid_max = grn_table_size(ctx, builder->tmp_lexicon);
     for (counter = builder->counters, tid = 1; tid <= tid_max; counter++, tid++) {
       counter->nrecs = 0;
       counter->nposts = 0;
@@ -6488,60 +6491,73 @@ grn_ii_builder_flush(grn_ctx *ctx, grn_ii_builder *builder)
     }
   }
   builder->blockpos = 0;
+  grn_obj_close(ctx, builder->tmp_lexicon);
+  builder->tmp_lexicon = NULL;
 }
 
 static void
 grn_ii_builder_tokenize(grn_ctx *ctx, grn_ii_builder *builder, grn_id rid, grn_obj *value)
 {
-  uint32_t pos;
-  grn_token *token;
-  grn_id *buffer = builder->blockbuf;
-  if (BUILD_BLOCK_SIZE <= builder->blockpos + GRN_TEXT_LEN(value) * 2) {
-    grn_ii_builder_flush(ctx, builder);
-  }
-  pos = builder->blockpos;
-  buffer[pos++] = rid + BUILD_RID_FLAG;
-  if (GRN_TEXT_LEN(value) &&
-      (token = grn_token_open(ctx, builder->lexicon,
-                              GRN_TEXT_VALUE(value), GRN_TEXT_LEN(value), grn_token_add))) {
-    uint32_t pos_ = pos;
-    while (!token->status) {
-      grn_id tid;
-      if ((tid = grn_token_next(ctx, token))) {
-        if (tid > builder->ncounters) {
-          uint32_t ncounters = grn_table_size(ctx, builder->lexicon) + BUILD_NCOUNTERS_MARGIN;
-          builder_counter *counters = GRN_REALLOC(builder->counters,
-                                                  ncounters * sizeof(builder_counter));
-          if (!counters) { return; }
-          memset(&counters[builder->ncounters], 0,
-                 (ncounters - builder->ncounters) * sizeof(builder_counter));
-          builder->ncounters = ncounters;
-          builder->counters = counters;
-        }
-        {
-          builder_counter *counter = &builder->counters[tid - 1];
-          buffer[pos++] = tid;
-          if (counter->lastrec != rid) {
-            counter->lastrec = rid;
-            counter->nrecs++;
+  if (GRN_TEXT_LEN(value)) {
+    uint32_t pos;
+    grn_token *token;
+    grn_id *buffer = builder->blockbuf;
+    if (BUILD_BLOCK_SIZE <= builder->blockpos + GRN_TEXT_LEN(value) * 2) {
+      grn_ii_builder_flush(ctx, builder);
+    }
+    if (!builder->tmp_lexicon) {
+      grn_obj *domain = grn_ctx_at(ctx, builder->lexicon->header.domain);
+      grn_obj *range = grn_ctx_at(ctx, DB_OBJ(builder->lexicon)->range);
+      grn_obj *tokenizer;
+      grn_obj_flags flags;
+      grn_table_get_info(ctx, builder->lexicon, &flags, NULL, &tokenizer);
+      flags &= ~GRN_OBJ_PERSISTENT;
+      builder->tmp_lexicon = grn_table_create(ctx, NULL, 0, NULL, flags, domain, range);
+      grn_obj_set_info(ctx, builder->tmp_lexicon, GRN_INFO_DEFAULT_TOKENIZER, tokenizer);
+    }
+    pos = builder->blockpos;
+    buffer[pos++] = rid + BUILD_RID_FLAG;
+    if ((token = grn_token_open(ctx, builder->tmp_lexicon,
+                                GRN_TEXT_VALUE(value), GRN_TEXT_LEN(value), grn_token_add))) {
+      uint32_t pos_ = pos;
+      while (!token->status) {
+        grn_id tid;
+        if ((tid = grn_token_next(ctx, token))) {
+          if (tid > builder->ncounters) {
+            uint32_t ncounters = grn_table_size(ctx, builder->tmp_lexicon) + BUILD_NCOUNTERS_MARGIN;
+            builder_counter *counters = GRN_REALLOC(builder->counters,
+                                                    ncounters * sizeof(builder_counter));
+            if (!counters) { return; }
+            memset(&counters[builder->ncounters], 0,
+                   (ncounters - builder->ncounters) * sizeof(builder_counter));
+            builder->ncounters = ncounters;
+            builder->counters = counters;
+          }
+          {
+            builder_counter *counter = &builder->counters[tid - 1];
+            buffer[pos++] = tid;
+            if (counter->lastrec != rid) {
+              counter->lastrec = rid;
+              counter->nrecs++;
+            }
+            counter->nposts++;
           }
-          counter->nposts++;
         }
       }
+      grn_token_close(ctx, token);
+      if (pos - pos_ > GRN_TEXT_LEN(value)) {
+        GRN_LOG(ctx, GRN_LOG_WARNING, "%d > %d", pos - pos_, GRN_TEXT_LEN(value));
+      }
     }
-    grn_token_close(ctx, token);
-    if (pos - pos_ > GRN_TEXT_LEN(value)) {
-      GRN_LOG(ctx, GRN_LOG_WARNING, "%d > %d", pos - pos_, GRN_TEXT_LEN(value));
-    }
+    builder->blockpos = pos;
   }
-  builder->blockpos = pos;
 }
 
 static void
 grn_ii_builder_parse(grn_ctx *ctx, grn_ii_builder *builder)
 {
   grn_table_cursor  *tc;
-  builder->ncounters = grn_table_size(ctx, builder->lexicon) + BUILD_NCOUNTERS_MARGIN;
+  builder->ncounters = BUILD_NCOUNTERS_MARGIN;
   builder->counters = GRN_CALLOC(builder->ncounters * sizeof(builder_counter));
   builder->blockbuf = (grn_id *)GRN_MALLOC(BUILD_BLOCK_SIZE * sizeof(grn_id));
   builder->blockpos = 0;
@@ -6816,6 +6832,7 @@ grn_ii_build(grn_ctx *ctx, grn_ii *ii)
   builder.source = src;
   builder.target = target;
   builder.lexicon = ii->lexicon;
+  builder.tmp_lexicon = NULL;
 
   builder.nblocks = 0;
   builder.blocks = NULL;




Groonga-commit メーリングリストの案内
Back to archive index