[Groonga-commit] groonga/groonga at 3561e0d [master] highlighter: reuse created token id chunks for performance


Kouhei Sutou null+****@clear*****
Tue May 15 11:07:46 JST 2018


Kouhei Sutou	2018-05-15 11:07:46 +0900 (Tue, 15 May 2018)

  New Revision: 3561e0d8b11ace0df649e6e7610b274b015e95dd
  https://github.com/groonga/groonga/commit/3561e0d8b11ace0df649e6e7610b274b015e95dd

  Message:
    highlighter: reuse created token id chunks for performance
    
    It may be better to remove all existing records when there are many
    keywords.

  Modified files:
    lib/highlighter.c
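
In one sentence, the change: instead of dropping and recreating the
highlighter's internal patricia trie on every prepare, keep the table,
remember the record IDs that grn_table_add() returns for the current
keywords, and then delete every record whose ID was not remembered.
The standalone program below is a minimal sketch of that
reuse-and-sweep pattern, not code from the commit: it tracks a single
kept_id where the real code collects a vector of IDs
(token_id_chunk_ids), and the temporary database setup is illustrative;
only grn_table_add() and the grn_table_cursor_*() calls mirror the ones
used in the diff.

#include <groonga.h>

#include <stdio.h>
#include <string.h>

int
main(void)
{
  grn_ctx ctx;
  grn_obj *db;
  grn_obj *keywords;
  grn_id kept_id;

  /* Error handling is omitted for brevity. */
  grn_init();
  grn_ctx_init(&ctx, 0);
  db = grn_db_create(&ctx, NULL, NULL); /* temporary in-memory database */

  /* The same kind of table the highlighter keeps internally. */
  keywords = grn_table_create(&ctx,
                              NULL, 0,
                              NULL,
                              GRN_OBJ_TABLE_PAT_KEY,
                              grn_ctx_at(&ctx, GRN_DB_SHORT_TEXT),
                              NULL);

  /* Round 1: two keywords. */
  grn_table_add(&ctx, keywords, "alpha", strlen("alpha"), NULL);
  grn_table_add(&ctx, keywords, "beta", strlen("beta"), NULL);

  /* Round 2: only "beta" remains. Re-adding an existing key is cheap
   * (it just returns the existing record ID), so keep the table and
   * remember the ID added in this round... */
  kept_id = grn_table_add(&ctx, keywords, "beta", strlen("beta"), NULL);

  /* ...then sweep away every record whose ID was not just added. */
  {
    grn_table_cursor *cursor;
    grn_id id;

    cursor = grn_table_cursor_open(&ctx, keywords,
                                   NULL, 0,
                                   NULL, 0,
                                   0, -1, 0);
    while ((id = grn_table_cursor_next(&ctx, cursor)) != GRN_ID_NIL) {
      if (id != kept_id) {
        grn_table_cursor_delete(&ctx, cursor);
      }
    }
    grn_table_cursor_close(&ctx, cursor);
  }

  printf("records after sweep: %u\n", grn_table_size(&ctx, keywords));
  /* => records after sweep: 1 */

  grn_obj_close(&ctx, keywords);
  grn_obj_close(&ctx, db);
  grn_ctx_fin(&ctx);
  grn_fin();
  return 0;
}

With Groonga installed, this should build with something like
cc sweep.c $(pkg-config --cflags --libs groonga).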

  Modified: lib/highlighter.c (+115 -75)
===================================================================
--- lib/highlighter.c    2018-05-15 10:54:20 +0900 (50557b1c1)
+++ lib/highlighter.c    2018-05-15 11:07:46 +0900 (127718fcf)
@@ -66,6 +66,7 @@ struct _grn_highlighter {
     grn_obj *object;
     grn_encoding encoding;
     grn_obj *token_id_chunks;
+    grn_obj token_id_chunk_ids;
     grn_obj token_id_chunk;
     grn_obj token_ids;
     grn_obj token_locations;
@@ -111,6 +112,9 @@ grn_highlighter_open(grn_ctx *ctx)
   highlighter->lexicon.object = NULL;
   highlighter->lexicon.encoding = GRN_ENC_NONE;
   highlighter->lexicon.token_id_chunks = NULL;
+  GRN_RECORD_INIT(&(highlighter->lexicon.token_id_chunk_ids),
+                  GRN_OBJ_VECTOR,
+                  GRN_ID_NIL);
   GRN_TEXT_INIT(&(highlighter->lexicon.token_id_chunk), 0);
   GRN_RECORD_INIT(&(highlighter->lexicon.token_ids), GRN_OBJ_VECTOR, GRN_ID_NIL);
   GRN_TEXT_INIT(&(highlighter->lexicon.token_locations), 0);
@@ -140,6 +144,7 @@ grn_highlighter_close(grn_ctx *ctx,
   if (highlighter->lexicon.token_id_chunks) {
     grn_obj_close(ctx, highlighter->lexicon.token_id_chunks);
   }
+  GRN_OBJ_FIN(ctx, &(highlighter->lexicon.token_id_chunk_ids));
   GRN_OBJ_FIN(ctx, &(highlighter->lexicon.candidates));
   GRN_OBJ_FIN(ctx, &(highlighter->lexicon.token_locations));
   GRN_OBJ_FIN(ctx, &(highlighter->lexicon.token_ids));
@@ -151,9 +156,60 @@ grn_highlighter_close(grn_ctx *ctx,
 }
 
 static void
+grn_highlighter_remove_unused_ids(grn_ctx *ctx,
+                                  grn_obj *table,
+                                  grn_obj *added_ids,
+                                  const char *tag)
+{
+  grn_table_cursor *cursor;
+  size_t n;
+  grn_id id;
+
+  cursor = grn_table_cursor_open(ctx,
+                                 table,
+                                 NULL, 0,
+                                 NULL, 0,
+                                 0, -1, 0);
+  if (!cursor) {
+    grn_rc rc = ctx->rc;
+    if (rc == GRN_SUCCESS) {
+      rc = GRN_UNKNOWN_ERROR;
+    }
+    ERR(rc,
+        "[highlighter][prepare]%s "
+        "failed to create a cursor for internal patricia trie: %s",
+        tag,
+        ctx->errbuf);
+    return;
+  }
+
+  n = GRN_BULK_VSIZE(added_ids) / sizeof(grn_id);
+  while ((id = grn_table_cursor_next(ctx, cursor)) != GRN_ID_NIL) {
+    size_t i;
+    grn_bool using = GRN_FALSE;
+
+    for (i = 0; i < n; i++) {
+      if (id == GRN_RECORD_VALUE_AT(added_ids, i)) {
+        using = GRN_TRUE;
+        break;
+      }
+    }
+
+    if (using) {
+      continue;
+    }
+
+    grn_table_cursor_delete(ctx, cursor);
+  }
+  grn_table_cursor_close(ctx, cursor);
+}
+
+static void
 grn_highlighter_prepare_lexicon(grn_ctx *ctx,
                                 grn_highlighter *highlighter)
 {
+  grn_bool have_token_id_chunks = GRN_FALSE;
+  grn_obj *token_id_chunk_ids = &(highlighter->lexicon.token_id_chunk_ids);
   size_t i, n_keywords;
   grn_obj *token_id_chunk = &(highlighter->lexicon.token_id_chunk);
 
@@ -169,27 +225,35 @@ grn_highlighter_prepare_lexicon(grn_ctx *ctx,
                      NULL);
 
   if (highlighter->lexicon.token_id_chunks) {
-    grn_obj_close(ctx, highlighter->lexicon.token_id_chunks);
-  }
-  highlighter->lexicon.token_id_chunks =
-    grn_table_create(ctx,
-                     NULL, 0,
-                     NULL,
-                     GRN_OBJ_TABLE_PAT_KEY,
-                     grn_ctx_at(ctx, GRN_DB_SHORT_TEXT),
-                     NULL);
-  if (!highlighter->lexicon.token_id_chunks) {
-    grn_rc rc = ctx->rc;
-    if (rc == GRN_SUCCESS) {
-      rc = GRN_UNKNOWN_ERROR;
+    /* TODO: It may be better that we remove all existing records here
+     * for many keywords case. */
+    have_token_id_chunks =
+      grn_table_size(ctx, highlighter->lexicon.token_id_chunks) > 0;
+  } else {
+    highlighter->lexicon.token_id_chunks =
+      grn_table_create(ctx,
+                       NULL, 0,
+                       NULL,
+                       GRN_OBJ_TABLE_PAT_KEY,
+                       grn_ctx_at(ctx, GRN_DB_SHORT_TEXT),
+                       NULL);
+    if (!highlighter->lexicon.token_id_chunks) {
+      grn_rc rc = ctx->rc;
+      if (rc == GRN_SUCCESS) {
+        rc = GRN_UNKNOWN_ERROR;
+      }
+      ERR(rc,
+          "[highlighter][prepare][lexicon] "
+          "failed to create an internal patricia trie: %s",
+          ctx->errbuf);
+      return;
     }
-    ERR(rc,
-        "[highlighter][prepare][lexicon] "
-        "failed to create an internal patricia trie: %s",
-        ctx->errbuf);
-    return;
+    token_id_chunk_ids->header.domain =
+      grn_obj_id(ctx, highlighter->lexicon.token_id_chunks);
   }
 
+  GRN_BULK_REWIND(token_id_chunk_ids);
+
   n_keywords = grn_vector_size(ctx, &(highlighter->raw_keywords));
   for (i = 0; i < n_keywords; i++) {
     const char *keyword;
@@ -230,18 +294,37 @@ grn_highlighter_prepare_lexicon(grn_ctx *ctx,
       GRN_TEXT_PUT(ctx, token_id_chunk, &token_id, sizeof(grn_id));
     }
     grn_token_cursor_close(ctx, cursor);
+
     {
-      grn_encoding encoding = ctx->encoding;
-      /* token_id_chunk is a binary data */
-      ctx->encoding = GRN_ENC_NONE;
-      grn_table_add(ctx,
-                    highlighter->lexicon.token_id_chunks,
-                    GRN_TEXT_VALUE(token_id_chunk),
-                    GRN_TEXT_LEN(token_id_chunk),
-                    NULL);
-      ctx->encoding = encoding;
+      grn_id token_id_chunk_id;
+
+      {
+        grn_encoding encoding = ctx->encoding;
+        /* token_id_chunk is a binary data */
+        ctx->encoding = GRN_ENC_NONE;
+        token_id_chunk_id = grn_table_add(ctx,
+                                          highlighter->lexicon.token_id_chunks,
+                                          GRN_TEXT_VALUE(token_id_chunk),
+                                          GRN_TEXT_LEN(token_id_chunk),
+                                          NULL);
+        ctx->encoding = encoding;
+      }
+      if (!have_token_id_chunks) {
+        continue;
+      }
+      if (token_id_chunk_id == GRN_ID_NIL) {
+        continue;
+      }
+      GRN_RECORD_PUT(ctx, token_id_chunk_ids, token_id_chunk_id);
     }
   }
+
+  if (have_token_id_chunks) {
+    grn_highlighter_remove_unused_ids(ctx,
+                                      highlighter->lexicon.token_id_chunks,
+                                      token_id_chunk_ids,
+                                      "[prepare][lexicon]");
+  }
 }
 
 static void
@@ -314,54 +397,11 @@ grn_highlighter_prepare_patricia_trie(grn_ctx *ctx,
     }
   }
 
-  {
-    size_t i, n;
-    grn_table_cursor *cursor;
-
-    n = GRN_BULK_VSIZE(keyword_ids) / sizeof(grn_id);
-    if (n == 0) {
-      return;
-    }
-
-    cursor = grn_table_cursor_open(ctx,
-                                   highlighter->pat.keywords,
-                                   NULL, 0,
-                                   NULL, 0,
-                                   0, -1, 0);
-    if (!cursor) {
-      grn_rc rc = ctx->rc;
-      if (rc == GRN_SUCCESS) {
-        rc = GRN_UNKNOWN_ERROR;
-      }
-      ERR(rc,
-          "[highlighter][prepare][no-lexicon] "
-          "failed to create a cursor for internal patricia trie: %s",
-          ctx->errbuf);
-      return;
-    }
-
-    for (i = 0; i < n; i++) {
-      grn_id id;
-
-      while ((id = grn_table_cursor_next(ctx, cursor)) != GRN_ID_NIL) {
-        size_t i;
-        grn_bool specified = GRN_FALSE;
-
-        for (i = 0; i < n; i++) {
-          if (id == GRN_RECORD_VALUE_AT(keyword_ids, i)) {
-            specified = GRN_TRUE;
-            break;
-          }
-        }
-
-        if (specified) {
-          continue;
-        }
-
-        grn_table_cursor_delete(ctx, cursor);
-      }
-      grn_table_cursor_close(ctx, cursor);
-    }
+  if (have_keywords) {
+    grn_highlighter_remove_unused_ids(ctx,
+                                      highlighter->pat.keywords,
+                                      keyword_ids,
+                                      "[prepare][no-lexicon]");
   }
 }
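
A note on the cost of the sweep: grn_highlighter_remove_unused_ids()
walks every record in the table with a cursor and, for each record,
linearly scans the vector of IDs added in the current round, so one
sweep is on the order of (number of records) x (number of current
keywords) comparisons. That is cheap when the keyword set is small and
mostly stable between calls, which is the case this commit targets; as
the TODO in the diff notes, with many keywords it may be cheaper to
remove all existing records and rebuild the table. The refactoring also
tidies the old no-lexicon path, whose cursor walk was wrapped in a
redundant outer for (i = 0; i < n; i++) loop that shadowed its own
index variable and closed the cursor inside the loop body.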
 