[Groonga-commit] groonga/groonga at b2b6be4 [master] ii: add comments

Back to archive index

Susumu Yata null+****@clear*****
Mon Feb 22 19:13:38 JST 2016


Susumu Yata	2016-02-22 19:13:38 +0900 (Mon, 22 Feb 2016)

  New Revision: b2b6be4fcec969884f50cb2038da1736a4f66a06
  https://github.com/groonga/groonga/commit/b2b6be4fcec969884f50cb2038da1736a4f66a06

  Message:
    ii: add comments

  Modified files:
    lib/ii.c

  Modified: lib/ii.c (+71 -29)
===================================================================
--- lib/ii.c    2016-02-22 15:50:41 +0900 (6094edc)
+++ lib/ii.c    2016-02-22 19:13:38 +0900 (403ca72)
@@ -7563,19 +7563,26 @@ typedef struct {
   uint32_t cap;        /* Buffer size */
 } ii_buffer_value;
 
+/* ii_buffer_counter is associated with a block. */
 typedef struct {
-  uint32_t nrecs;
-  uint32_t nposts;
-  grn_id last_rid;
-  uint32_t last_sid;
-  uint32_t last_tf;
-  uint32_t last_weight;
-  uint32_t last_pos;
-  uint32_t offset_rid;
-  uint32_t offset_sid;
-  uint32_t offset_tf;
-  uint32_t offset_weight;
-  uint32_t offset_pos;
+  uint32_t nrecs;  /* Number of records or sections */
+  uint32_t nposts; /* Number of values */
+
+  /* Information of the last value */
+  grn_id last_rid;      /* Record ID */
+  uint32_t last_sid;    /* Section ID */
+  uint32_t last_tf;     /* Term frequency */
+  uint32_t last_weight; /* Total weight */
+  uint32_t last_pos;    /* Token position */
+
+  /* Meaning of offset_* is different before/after encoding. */
+  /* Before encoding: Size in encoded sequence */
+  /* After encoding: Offset in encoded sequence */
+  uint32_t offset_rid;    /* Record ID */
+  uint32_t offset_sid;    /* Section ID */
+  uint32_t offset_tf;     /* Term frequency */
+  uint32_t offset_weight; /* Weight */
+  uint32_t offset_pos;    /* Token position */
 } ii_buffer_counter;
 
 typedef struct {
@@ -7595,26 +7602,28 @@ typedef struct {
 } ii_buffer_block;
 
 struct _grn_ii_buffer {
-  grn_obj *lexicon;
-  grn_obj *tmp_lexicon;
-  ii_buffer_block *blocks;
-  uint32_t nblocks;
-  int tmpfd;
-  char tmpfpath[PATH_MAX];
+  grn_obj *lexicon;            /* Global lexicon */
+  grn_obj *tmp_lexicon;        /* Temporary lexicon for each block */
+  ii_buffer_block *blocks;     /* Blocks */
+  uint32_t nblocks;            /* Number of blocks */
+  int tmpfd;                   /* Descriptor of temporary file */
+  char tmpfpath[PATH_MAX];     /* Path of temporary file */
   uint64_t update_buffer_size;
+
   // stuff for parsing
-  off64_t filepos;
-  grn_id *block_buf;
-  size_t block_buf_size;
-  size_t block_pos;
-  ii_buffer_counter *counters;
-  uint32_t ncounters;
+  off64_t filepos;             /* Write position of temporary file */
+  grn_id *block_buf;           /* Buffer for the current block */
+  size_t block_buf_size;       /* Size of block_buf */
+  size_t block_pos;            /* Write position of block_buf */
+  ii_buffer_counter *counters; /* Status of terms */
+  uint32_t ncounters;          /* Number of counters */
   size_t total_size;
   size_t curr_size;
-  ii_buffer_value *values;
-  unsigned int nvalues;
-  unsigned int max_nvalues;
+  ii_buffer_value *values;     /* Values in block */
+  unsigned int nvalues;        /* Number of values in block */
+  unsigned int max_nvalues;    /* Size of values */
   grn_id last_rid;
+
   // stuff for merging
   grn_ii *ii;
   uint32_t lseg;
@@ -7685,6 +7694,27 @@ allocate_outbuf(grn_ctx *ctx, grn_ii_buffer *ii_buffer)
   return (uint8_t *)GRN_MALLOC(bufsize);
 }
 
+/*
+ * The temporary file format is roughly as follows:
+ *
+ * File  = Block...
+ * Block = Unit...
+ * Unit  = Chunk (key order)
+ *         NextUnitSize (The first unit size is kept on memory)
+ * Chunk = Term...
+ * Term  = ID (gtid)
+ *         NumSections (nrecs), NumValues (nposts)
+ *         RecordID... (rid, diff)
+ *         [SectionID... (sid, diff)]
+ *         TermFrequency... (tf, diff)
+ *         [Weight... (weight, diff)]
+ *         [Position... (pos, diff)]
+ */
+
+/*
+ * encode_terms encodes terms in ii_buffer->tmp_lexicon and returns the
+ * expected temporary file size.
+ */
 static size_t
 encode_terms(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
              uint8_t *outbuf, ii_buffer_block *block)
@@ -7693,6 +7723,7 @@ encode_terms(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
   uint8_t *outbufp = outbuf;
   uint8_t *outbufp_ = outbuf;
   grn_table_cursor  *tc;
+  /* The first size is written into block->nextsize. */
   uint8_t *pnext = (uint8_t *)&block->nextsize;
   uint32_t flags = ii_buffer->ii->header->flags;
   tc = grn_table_cursor_open(ctx, ii_buffer->tmp_lexicon,
@@ -7701,6 +7732,7 @@ encode_terms(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
     char key[GRN_TABLE_MAX_KEY_SIZE];
     int key_size = grn_table_get_key(ctx, ii_buffer->tmp_lexicon, tid,
                                      key, GRN_TABLE_MAX_KEY_SIZE);
+    /* gtid is a global term ID, not in a temporary lexicon. */
     grn_id gtid = grn_table_add(ctx, ii_buffer->lexicon, key, key_size, NULL);
     ii_buffer_counter *counter = &ii_buffer->counters[tid - 1];
     if (counter->nrecs) {
@@ -7746,6 +7778,7 @@ encode_terms(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
   return outbufp - outbuf;
 }
 
+/* encode_postings encodes data in ii_buffer->block_buf. */
 static void
 encode_postings(grn_ctx *ctx, grn_ii_buffer *ii_buffer, uint8_t *outbuf)
 {
@@ -7822,6 +7855,7 @@ encode_postings(grn_ctx *ctx, grn_ii_buffer *ii_buffer, uint8_t *outbuf)
   }
 }
 
+/* encode_last_tf encodes last_tf and last_weight in counters. */
 static void
 encode_last_tf(grn_ctx *ctx, grn_ii_buffer *ii_buffer, uint8_t *outbuf)
 {
@@ -7840,8 +7874,8 @@ encode_last_tf(grn_ctx *ctx, grn_ii_buffer *ii_buffer, uint8_t *outbuf)
 }
 
 /*
- * grn_ii_buffer_flush flushes the current block (ii_buffer->buffer, counters
- * and tmp_lexicon) to a temporary file (ii_buffer->tmpfd).
+ * grn_ii_buffer_flush flushes the current block (ii_buffer->block_buf,
+ * counters and tmp_lexicon) to a temporary file (ii_buffer->tmpfd).
  * Also, block information is stored into ii_buffer->blocks.
  */
 static void
@@ -8059,11 +8093,13 @@ grn_ii_buffer_tokenize(grn_ctx *ctx, grn_ii_buffer *ii_buffer, grn_id rid)
   ii_buffer->nvalues = 0;
 }
 
+/* grn_ii_buffer_fetch fetches the next term. */
 static void
 grn_ii_buffer_fetch(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
                     ii_buffer_block *block)
 {
   if (!block->rest) {
+    /* Read the next unit. */
     if (block->head < block->tail) {
       size_t bytesize = block->nextsize;
       if (block->buffersize < block->nextsize) {
@@ -8517,9 +8553,14 @@ grn_ii_buffer_append(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
   return ctx->rc;
 }
 
+/*
+ * grn_ii_buffer_commit completes tokenization and builds an inverted index
+ * from data in a temporary file.
+ */
 grn_rc
 grn_ii_buffer_commit(grn_ctx *ctx, grn_ii_buffer *ii_buffer)
 {
+  /* Tokenize the remaining values and free resources. */
   if (ii_buffer->last_rid && ii_buffer->nvalues) {
     grn_ii_buffer_tokenize(ctx, ii_buffer, ii_buffer->last_rid);
   }
@@ -8562,6 +8603,7 @@ grn_ii_buffer_commit(grn_ctx *ctx, grn_ii_buffer *ii_buffer)
     return ctx->rc;
   }
   {
+    /* Fetch the first term of each block. */
     uint32_t i;
     for (i = 0; i < ii_buffer->nblocks; i++) {
       grn_ii_buffer_fetch(ctx, ii_buffer, &ii_buffer->blocks[i]);
-------------- next part --------------
HTMLの添付ファイルを保管しました...
Download 



More information about the Groonga-commit mailing list
Back to archive index