[Groonga-commit] groonga/groonga at dca7ca6 [highlight-support-similar-search] highlight_html: don't use too much appeared tokens

Back to archive index

Kouhei Sutou null+****@clear*****
Fri Mar 17 15:01:19 JST 2017


Kouhei Sutou	2017-03-17 15:01:19 +0900 (Fri, 17 Mar 2017)

  New Revision: dca7ca6017d119cc8d5541e5624b27286eda2c06
  https://github.com/groonga/groonga/commit/dca7ca6017d119cc8d5541e5624b27286eda2c06

  Message:
    highlight_html: don't use too much appeared tokens

  Modified files:
    lib/expr.c
    test/command/suite/select/function/highlight_html/similar_search.expected
    test/command/suite/select/function/highlight_html/similar_search.test

  Modified: lib/expr.c (+10 -0)
===================================================================
--- lib/expr.c    2017-03-17 14:00:24 +0900 (ee60806)
+++ lib/expr.c    2017-03-17 15:01:19 +0900 (a7236c1)
@@ -8754,12 +8754,22 @@ grn_expr_get_keywords(grn_ctx *ctx, grn_obj *expr, grn_obj *keywords)
                                                    GRN_TOKENIZE_GET,
                                                    token_flags);
               if (token_cursor) {
+                grn_obj *source_table;
+                uint32_t n_records_threshold;
+                source_table = grn_ctx_at(ctx, grn_obj_get_range(ctx, index));
+                n_records_threshold = grn_table_size(ctx, source_table) / 2;
                 while (token_cursor->status != GRN_TOKEN_CURSOR_DONE) {
                   grn_id token_id;
+                  uint32_t n_estimated_records;
                   token_id = grn_token_cursor_next(ctx, token_cursor);
                   if (token_id == GRN_ID_NIL) {
                     continue;
                   }
+                  n_estimated_records =
+                    grn_ii_estimate_size(ctx, (grn_ii *)index, token_id);
+                  if (n_estimated_records >= n_records_threshold) {
+                    continue;
+                  }
                   grn_vector_add_element(ctx,
                                          keywords,
                                          token_cursor->curr,

  Modified: test/command/suite/select/function/highlight_html/similar_search.expected (+7 -12)
===================================================================
--- test/command/suite/select/function/highlight_html/similar_search.expected    2017-03-17 14:00:24 +0900 (7633b78)
+++ test/command/suite/select/function/highlight_html/similar_search.expected    2017-03-17 15:01:19 +0900 (7a53203)
@@ -1,26 +1,21 @@
-plugin_register token_filters/stop_word
-[[0,0.0,0.0],true]
 table_create Entries TABLE_NO_KEY
 [[0,0.0,0.0],true]
 column_create Entries body COLUMN_SCALAR ShortText
 [[0,0.0,0.0],true]
-table_create Terms TABLE_PAT_KEY ShortText   --default_tokenizer TokenBigram   --normalizer NormalizerAuto   --token_filters TokenFilterStopWord
+table_create Terms TABLE_PAT_KEY ShortText   --default_tokenizer TokenBigram   --normalizer NormalizerAuto
 [[0,0.0,0.0],true]
 column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body
 [[0,0.0,0.0],true]
 column_create Terms is_stop_word COLUMN_SCALAR Bool
 [[0,0.0,0.0],true]
-load --table Terms
-[
-{"_key": "is", "is_stop_word": true},
-{"_key": ".",  "is_stop_word": true}
-]
-[[0,0.0,0.0],2]
 load --table Entries
 [
-{"body": "Mroonga is a MySQL storage engine based on Groonga. <b>Rroonga</b> is a Ruby binding of Groonga."}
+{"body": "Mroonga is a MySQL storage engine based on Groonga. <b>Rroonga</b> is a Ruby binding of Groonga."},
+{"body": "It is a pen."},
+{"body": "I am a boy."},
+{"body": "This is good."}
 ]
-[[0,0.0,0.0],1]
+[[0,0.0,0.0],4]
 select Entries   --filter 'body *S "Groonga is fast full text search engine. There are SQL interfaces by Mroonga and PGroonga and Ruby interface by Rroonga."'   --output_columns 'highlight_html(body)'
 [
   [
@@ -40,7 +35,7 @@ select Entries   --filter 'body *S "Groonga is fast full text search engine. The
         ]
       ],
       [
-        "<span class=\"keyword\">Mroonga</span> is a MySQL storage <span class=\"keyword\">engine</span> based on <span class=\"keyword\">Groonga</span>. &lt;b&gt;<span class=\"keyword\">Rroonga</span>&lt;/b&gt; is a <span class=\"keyword\">Ruby</span> binding of <span class=\"keyword\">Groonga</span>."
+        "<span class=\"keyword\">Mroonga</span> is a MySQL storage <span class=\"keyword\">engine</span> based on Groonga. &lt;b&gt;<span class=\"keyword\">Rroonga</span>&lt;/b&gt; is a <span class=\"keyword\">Ruby</span> binding of Groonga."
       ]
     ]
   ]

  Modified: test/command/suite/select/function/highlight_html/similar_search.test (+5 -11)
===================================================================
--- test/command/suite/select/function/highlight_html/similar_search.test    2017-03-17 14:00:24 +0900 (68e7563)
+++ test/command/suite/select/function/highlight_html/similar_search.test    2017-03-17 15:01:19 +0900 (253037c)
@@ -1,24 +1,18 @@
-plugin_register token_filters/stop_word
-
 table_create Entries TABLE_NO_KEY
 column_create Entries body COLUMN_SCALAR ShortText
 
 table_create Terms TABLE_PAT_KEY ShortText \
   --default_tokenizer TokenBigram \
-  --normalizer NormalizerAuto \
-  --token_filters TokenFilterStopWord
+  --normalizer NormalizerAuto
 column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body
 column_create Terms is_stop_word COLUMN_SCALAR Bool
 
-load --table Terms
-[
-{"_key": "is", "is_stop_word": true},
-{"_key": ".",  "is_stop_word": true}
-]
-
 load --table Entries
 [
-{"body": "Mroonga is a MySQL storage engine based on Groonga. <b>Rroonga</b> is a Ruby binding of Groonga."}
+{"body": "Mroonga is a MySQL storage engine based on Groonga. <b>Rroonga</b> is a Ruby binding of Groonga."},
+{"body": "It is a pen."},
+{"body": "I am a boy."},
+{"body": "This is good."}
 ]
 
 select Entries \
-------------- next part --------------
HTMLの添付ファイルを保管しました...
Download 



More information about the Groonga-commit mailing list
Back to archive index