[Groonga-commit] groonga/groonga at 5d2675f [master] highligher: implement lexicon mode

Back to archive index

Kouhei Sutou null+****@clear*****
Wed May 9 18:13:15 JST 2018


Kouhei Sutou	2018-05-09 18:13:15 +0900 (Wed, 09 May 2018)

  New Revision: 5d2675f7be5e6bb8e7954079283eb5d52bf67264
  https://github.com/groonga/groonga/commit/5d2675f7be5e6bb8e7954079283eb5d52bf67264

  Message:
    highligher: implement lexicon mode
    
    TODO:
    
      * Performance tuning

  Modified files:
    lib/highlighter.c

  Modified: lib/highlighter.c (+262 -27)
===================================================================
--- lib/highlighter.c    2018-05-09 17:53:04 +0900 (f87d4e7e5)
+++ lib/highlighter.c    2018-05-09 18:13:15 +0900 (ef8e2d6b6)
@@ -17,7 +17,10 @@
 */
 
 #include "grn.h"
+#include "grn_token_cursor.h"
+
 #include "grn_pat.h"
+#include "grn_str.h"
 
 /*
  * TODO:
@@ -39,11 +42,20 @@ struct _grn_highlighter {
   } tag;
 
   /* For lexicon mode */
-  grn_obj *lexicon;
-  grn_obj keyword_token_ids;
+  struct {
+    grn_obj *object;
+    grn_encoding encoding;
+    grn_obj *token_id_chunks;
+    grn_obj token_id_chunk;
+    grn_obj token_ids;
+    grn_obj offsets;
+    grn_obj lengths;
+  } lexicon;
 
   /* For patricia trie mode */
-  grn_obj *keywords;
+  struct {
+    grn_obj *keywords;
+  } pat;
 };
 
 grn_highlighter *
@@ -75,10 +87,15 @@ grn_highlighter_open(grn_ctx *ctx)
   highlighter->tag.close = "</span>";
   highlighter->tag.close_length = strlen(highlighter->tag.close);
 
-  highlighter->lexicon = NULL;
-  GRN_RECORD_INIT(&(highlighter->keyword_token_ids), GRN_OBJ_VECTOR, GRN_ID_NIL);
+  highlighter->lexicon.object = NULL;
+  highlighter->lexicon.encoding = GRN_ENC_NONE;
+  highlighter->lexicon.token_id_chunks = NULL;
+  GRN_TEXT_INIT(&(highlighter->lexicon.token_id_chunk), 0);
+  GRN_RECORD_INIT(&(highlighter->lexicon.token_ids), GRN_OBJ_VECTOR, GRN_ID_NIL);
+  GRN_UINT64_INIT(&(highlighter->lexicon.offsets), GRN_OBJ_VECTOR);
+  GRN_UINT32_INIT(&(highlighter->lexicon.lengths), GRN_OBJ_VECTOR);
 
-  highlighter->keywords = NULL;
+  highlighter->pat.keywords = NULL;
 
   GRN_API_RETURN(highlighter);
 }
@@ -93,11 +110,16 @@ grn_highlighter_close(grn_ctx *ctx,
     GRN_API_RETURN(ctx->rc);
   }
 
-  if (highlighter->keywords) {
-    grn_obj_close(ctx, highlighter->keywords);
+  if (highlighter->pat.keywords) {
+    grn_obj_close(ctx, highlighter->pat.keywords);
   }
 
-  GRN_OBJ_FIN(ctx, &(highlighter->keyword_token_ids));
+  if (highlighter->lexicon.token_id_chunks) {
+    grn_obj_close(ctx, highlighter->lexicon.token_id_chunks);
+  }
+  GRN_OBJ_FIN(ctx, &(highlighter->lexicon.token_ids));
+  GRN_OBJ_FIN(ctx, &(highlighter->lexicon.offsets));
+  GRN_OBJ_FIN(ctx, &(highlighter->lexicon.lengths));
 
   GRN_OBJ_FIN(ctx, &(highlighter->raw_keywords));
   GRN_FREE(highlighter);
@@ -109,24 +131,106 @@ static void
 grn_highlighter_prepare_lexicon(grn_ctx *ctx,
                                 grn_highlighter *highlighter)
 {
-  /* TODO */
+  size_t i, n_keywords;
+  grn_obj *token_id_chunk = &(highlighter->lexicon.token_id_chunk);
+
+  highlighter->lexicon.token_ids.header.domain =
+    grn_obj_id(ctx, highlighter->lexicon.object);
+
+  grn_table_get_info(ctx,
+                     highlighter->lexicon.object,
+                     NULL,
+                     &(highlighter->lexicon.encoding),
+                     NULL,
+                     NULL,
+                     NULL);
+
+  if (highlighter->lexicon.token_id_chunks) {
+    grn_obj_close(ctx, highlighter->lexicon.token_id_chunks);
+  }
+  highlighter->lexicon.token_id_chunks =
+    grn_table_create(ctx,
+                     NULL, 0,
+                     NULL,
+                     GRN_OBJ_TABLE_PAT_KEY,
+                     grn_ctx_at(ctx, GRN_DB_SHORT_TEXT),
+                     NULL);
+  if (!highlighter->lexicon.token_id_chunks) {
+    grn_rc rc = ctx->rc;
+    if (rc == GRN_SUCCESS) {
+      rc = GRN_UNKNOWN_ERROR;
+    }
+    ERR(rc,
+        "[highlighter][prepare][lexicon] "
+        "failed to create an internal patricia trie: %s",
+        ctx->errbuf);
+    return;
+  }
+
+  n_keywords = grn_vector_size(ctx, &(highlighter->raw_keywords));
+  for (i = 0; i < n_keywords; i++) {
+    const char *keyword;
+    unsigned int keyword_length;
+    grn_token_cursor *cursor;
+    grn_id token_id;
+
+    keyword_length = grn_vector_get_element(ctx,
+                                            &(highlighter->raw_keywords),
+                                            i,
+                                            &keyword,
+                                            NULL,
+                                            NULL);
+    cursor = grn_token_cursor_open(ctx,
+                                   highlighter->lexicon.object,
+                                   keyword,
+                                   keyword_length,
+                                   GRN_TOKENIZE_ADD,
+                                   0);
+    if (!cursor) {
+      continue;
+    }
+    while (grn_token_cursor_next(ctx, cursor) != GRN_ID_NIL) {
+    }
+    grn_token_cursor_close(ctx, cursor);
+
+    cursor = grn_token_cursor_open(ctx,
+                                   highlighter->lexicon.object,
+                                   keyword,
+                                   keyword_length,
+                                   GRN_TOKENIZE_GET,
+                                   0);
+    if (!cursor) {
+      continue;
+    }
+    GRN_BULK_REWIND(token_id_chunk);
+    while ((token_id = grn_token_cursor_next(ctx, cursor)) != GRN_ID_NIL) {
+      GRN_TEXT_PUT(ctx, token_id_chunk, &token_id, sizeof(grn_id));
+    }
+    grn_token_cursor_close(ctx, cursor);
+    grn_table_add(ctx,
+                  highlighter->lexicon.token_id_chunks,
+                  GRN_TEXT_VALUE(token_id_chunk),
+                  GRN_TEXT_LEN(token_id_chunk),
+                  NULL);
+  }
 }
 
 static void
 grn_highlighter_prepare_patricia_trie(grn_ctx *ctx,
                                       grn_highlighter *highlighter)
 {
-  if (highlighter->keywords) {
-    grn_obj_close(ctx, highlighter->keywords);
+  if (highlighter->pat.keywords) {
+    grn_obj_close(ctx, highlighter->pat.keywords);
   }
 
-  highlighter->keywords = grn_table_create(ctx,
-                                           NULL, 0,
-                                           NULL,
-                                           GRN_OBJ_TABLE_PAT_KEY,
-                                           grn_ctx_at(ctx, GRN_DB_SHORT_TEXT),
-                                           NULL);
-  if (!highlighter->keywords) {
+  highlighter->pat.keywords =
+    grn_table_create(ctx,
+                     NULL, 0,
+                     NULL,
+                     GRN_OBJ_TABLE_PAT_KEY,
+                     grn_ctx_at(ctx, GRN_DB_SHORT_TEXT),
+                     NULL);
+  if (!highlighter->pat.keywords) {
     grn_rc rc = ctx->rc;
     if (rc == GRN_SUCCESS) {
       rc = GRN_UNKNOWN_ERROR;
@@ -139,7 +243,7 @@ grn_highlighter_prepare_patricia_trie(grn_ctx *ctx,
   }
 
   grn_obj_set_info(ctx,
-                   highlighter->keywords,
+                   highlighter->pat.keywords,
                    GRN_INFO_NORMALIZER,
                    grn_ctx_get(ctx, "NormalizerAuto", -1));
 
@@ -158,7 +262,7 @@ grn_highlighter_prepare_patricia_trie(grn_ctx *ctx,
                                             NULL,
                                             NULL);
       grn_table_add(ctx,
-                    highlighter->keywords,
+                    highlighter->pat.keywords,
                     keyword,
                     keyword_size,
                     NULL);
@@ -170,11 +274,12 @@ static void
 grn_highlighter_prepare(grn_ctx *ctx,
                         grn_highlighter *highlighter)
 {
-  if (highlighter->lexicon) {
+  if (highlighter->lexicon.object) {
     grn_highlighter_prepare_lexicon(ctx, highlighter);
   } else {
     grn_highlighter_prepare_patricia_trie(ctx, highlighter);
   }
+  highlighter->need_prepared = GRN_FALSE;
 }
 
 static void
@@ -184,7 +289,134 @@ grn_highlighter_highlight_lexicon(grn_ctx *ctx,
                                   size_t text_length,
                                   grn_obj *output)
 {
-  /* TODO */
+  grn_token_cursor *cursor;
+  grn_encoding encoding = highlighter->lexicon.encoding;
+  grn_obj *token_ids = &(highlighter->lexicon.token_ids);
+  grn_obj *offsets = &(highlighter->lexicon.offsets);
+  grn_obj *lengths = &(highlighter->lexicon.lengths);
+
+  GRN_BULK_REWIND(token_ids);
+  GRN_BULK_REWIND(offsets);
+  GRN_BULK_REWIND(lengths);
+  cursor = grn_token_cursor_open(ctx,
+                                 highlighter->lexicon.object,
+                                 text,
+                                 text_length,
+                                 GRN_TOKENIZE_ADD,
+                                 0);
+  if (!cursor) {
+    ERR(ctx->rc,
+        "[highlighter][highlight][lexicon] failed to start tokenizing: %s",
+        ctx->errbuf);
+    return;
+  }
+
+  while (cursor->status == GRN_TOKEN_CURSOR_DOING) {
+    grn_id token_id = grn_token_cursor_next(ctx, cursor);
+    grn_token *token;
+    uint32_t length;
+
+    if (token_id == GRN_ID_NIL) {
+      continue;
+    }
+    token = grn_token_cursor_get_token(ctx, cursor);
+    GRN_RECORD_PUT(ctx, token_ids, token_id);
+    GRN_UINT64_PUT(ctx, offsets, grn_token_get_source_offset(ctx, token));
+    length = grn_token_get_source_length(ctx, token);
+    if (grn_token_is_overlap(ctx, token)) {
+      const char *data;
+      size_t data_length;
+      int first_character_length;
+
+      data = grn_token_get_data_raw(ctx, token, &data_length);
+      first_character_length =
+        grn_charlen_(ctx, data, data + data_length, encoding);
+      GRN_UINT32_PUT(ctx, lengths, first_character_length);
+    } else {
+      GRN_UINT32_PUT(ctx, lengths, length);
+    }
+  }
+  grn_token_cursor_close(ctx, cursor);
+
+  {
+    grn_pat *chunks = (grn_pat *)(highlighter->lexicon.token_id_chunks);
+    size_t i, n_token_ids = GRN_BULK_VSIZE(token_ids) / sizeof(grn_id);
+    grn_id *raw_token_ids = (grn_id *)GRN_BULK_HEAD(token_ids);
+    uint64_t max_offset = 0;
+    uint32_t last_length = 0;
+    grn_bool have_highlight = GRN_FALSE;
+
+    for (i = 0; i < n_token_ids; i++) {
+      grn_id chunk_id;
+      uint64_t offset;
+      uint32_t length;
+
+      chunk_id = grn_pat_lcp_search(ctx,
+                                    chunks,
+                                    raw_token_ids + i,
+                                    (n_token_ids - i) * sizeof(grn_id));
+      offset = GRN_UINT64_VALUE_AT(offsets, i);
+      if (offset == 0 && GRN_TEXT_LEN(output) > 0) {
+        if (have_highlight) {
+          break;
+        }
+        max_offset = 0;
+        last_length = 0;
+        GRN_BULK_REWIND(output);
+      }
+      if (chunk_id == GRN_ID_NIL) {
+        if (offset < max_offset) {
+          continue;
+        }
+        length = GRN_UINT32_VALUE_AT(lengths, i);
+        grn_text_escape_xml(ctx,
+                            output,
+                            text + offset,
+                            length);
+        max_offset = offset + 1;
+        last_length = length;
+      } else {
+        uint32_t key_size;
+        size_t j;
+        size_t n_ids;
+
+        have_highlight = GRN_TRUE;
+
+        if (last_length > 0 && offset + 1 == max_offset) {
+          grn_bulk_truncate(ctx, output, GRN_TEXT_LEN(output) - last_length);
+          max_offset -= last_length;
+          last_length = 0;
+        } else if (offset < max_offset) {
+          continue;
+        }
+
+        _grn_pat_key(ctx, chunks, chunk_id, &key_size);
+        n_ids = key_size / sizeof(grn_id);
+        GRN_TEXT_PUT(ctx,
+                     output,
+                     highlighter->tag.open,
+                     highlighter->tag.open_length);
+        for (j = 0; j < n_ids; j++) {
+          offset = GRN_UINT64_VALUE_AT(offsets, i + j);
+          if (offset < max_offset) {
+            continue;
+          }
+          length = GRN_UINT32_VALUE_AT(lengths, i + j);
+          grn_text_escape_xml(ctx,
+                              output,
+                              text + offset,
+                              length);
+          max_offset = offset + 1;
+          last_length = length;
+        }
+        GRN_TEXT_PUT(ctx,
+                     output,
+                     highlighter->tag.close,
+                     highlighter->tag.close_length);
+        i += n_ids - 1;
+      }
+    }
+  }
 }
 
 static void
@@ -206,7 +438,7 @@ grn_highlighter_highlight_patricia_trie(grn_ctx *ctx,
     size_t chunk_length;
 
     n_hits = grn_pat_scan(ctx,
-                          (grn_pat *)(highlighter->keywords),
+                          (grn_pat *)(highlighter->pat.keywords),
                           current, current_length,
                           hits, MAX_N_HITS,
                           &rest);
@@ -277,7 +509,7 @@ grn_highlighter_highlight(grn_ctx *ctx,
     }
   }
 
-  if (highlighter->lexicon) {
+  if (highlighter->lexicon.object) {
     grn_highlighter_highlight_lexicon(ctx,
                                       highlighter,
                                       text,
@@ -302,7 +534,10 @@ grn_highlighter_set_lexicon(grn_ctx *ctx,
 {
   GRN_API_ENTER;
 
-  highlighter->lexicon = lexicon;
+  if (highlighter->lexicon.object != lexicon) {
+    highlighter->need_prepared = GRN_TRUE;
+    highlighter->lexicon.object = lexicon;
+  }
 
   GRN_API_RETURN(ctx->rc);
 }
@@ -313,7 +548,7 @@ grn_highlighter_get_lexicon(grn_ctx *ctx,
 {
   GRN_API_ENTER;
 
-  GRN_API_RETURN(highlighter->lexicon);
+  GRN_API_RETURN(highlighter->lexicon.object);
 }
 
 grn_rc
-------------- next part --------------
HTML����������������������������...
URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180509/c8075fe9/attachment-0001.htm 



More information about the Groonga-commit mailing list
Back to archive index