[Groonga-commit] groonga/groonga at 57fe8e4 [master] Fix a stop word handling bug

Back to archive index
Kouhei Sutou null+****@clear*****
Mon Mar 4 16:23:13 JST 2019


Kouhei Sutou	2019-03-04 16:23:13 +0900 (Mon, 04 Mar 2019)

  Revision: 57fe8e4eb6d151a90fb8c2b9a8653663fd998376
  https://github.com/groonga/groonga/commit/57fe8e4eb6d151a90fb8c2b9a8653663fd998376

  Message:
    Fix a stop word handling bug
    
    If the first token was skipped as a stop word, the subsequent search was buggy.

  Modified files:
    lib/expr.c
    lib/ii.c
    test/command/suite/token_filters/stop_word/column.expected
    test/command/suite/token_filters/stop_word/column.test
    test/command/suite/token_filters/stop_word/offline_index_construction.expected
    test/command/suite/token_filters/stop_word/offline_index_construction.test
    test/command/suite/token_filters/stop_word/skip.expected
    test/command/suite/token_filters/stop_word/skip.test

  Modified: lib/expr.c (+17 -1)
===================================================================
--- lib/expr.c    2019-03-04 16:13:40 +0900 (40ae8d247)
+++ lib/expr.c    2019-03-04 16:23:13 +0900 (b2cc25ad9)
@@ -3474,6 +3474,8 @@ typedef struct {
   scan_info *scan_info;
   grn_obj *res;
   grn_id min_id;
+  grn_bool is_skipped;
+  grn_bool is_first_unskipped_scan_info;
 } grn_table_select_data;
 
 static void
@@ -4125,7 +4127,9 @@ grn_table_select_index_match(grn_ctx *ctx,
     }
   }
   ctx->flags &= ~GRN_CTX_TEMPORARY_DISABLE_II_RESOLVE_SEL_AND;
-  if (!(optarg.match_info.flags & GRN_MATCH_INFO_ONLY_SKIP_TOKEN)) {
+  if (optarg.match_info.flags & GRN_MATCH_INFO_ONLY_SKIP_TOKEN) {
+    data->is_skipped = GRN_TRUE;
+  } else {
     grn_ii_resolve_sel_and(ctx, (grn_hash *)res, si->logical_op);
   }
   if ((si->logical_op == GRN_OP_AND) ||
@@ -4905,6 +4909,8 @@ grn_table_select(grn_ctx *ctx, grn_obj *table, grn_obj *expr,
       data.scanner = scanner;
       data.res = res;
       data.min_id = GRN_ID_NIL;
+      data.is_skipped = GRN_FALSE;
+      data.is_first_unskipped_scan_info = GRN_TRUE;
       if (res_size > 0 && op == GRN_OP_AND) {
         grn_bool have_push = GRN_FALSE;
         for (i = 0; i < scanner->n_sis; i++) {
@@ -4941,6 +4947,16 @@ grn_table_select(grn_ctx *ctx, grn_obj *table, grn_obj *expr,
         scan_info *si = scanner->sis[i];
         data.nth_scan_info = i;
         data.scan_info = si;
+        if (i > 0 && data.is_first_unskipped_scan_info) {
+          if (data.is_skipped) {
+            if (si->logical_op == GRN_OP_AND) {
+              si->logical_op = GRN_OP_OR;
+            }
+          } else {
+            data.is_first_unskipped_scan_info = GRN_FALSE;
+          }
+        }
+        data.is_skipped = GRN_FALSE;
         if (si->flags & SCAN_POP) {
           grn_obj *res_;
           GRN_PTR_POP(&res_stack, res_);

  Modified: lib/ii.c (+2 -1)
===================================================================
--- lib/ii.c    2019-03-04 16:13:40 +0900 (5a723bad4)
+++ lib/ii.c    2019-03-04 16:23:13 +0900 (019c72a3c)
@@ -9043,7 +9043,8 @@ grn_ii_select_data_fin(grn_ctx *ctx,
     GRN_OBJ_FIN(ctx, &(data->record.term_weights));
   }
 
-  if (data->set_min_enable_for_and_query) {
+  if (data->set_min_enable_for_and_query &&
+      !data->only_skip_token) {
     if (data->current_min > data->previous_min) {
       data->optarg->match_info->min = data->current_min;
     }

  Modified: test/command/suite/token_filters/stop_word/column.expected (+1 -1)
===================================================================
--- test/command/suite/token_filters/stop_word/column.expected    2019-03-04 16:13:40 +0900 (3c563762c)
+++ test/command/suite/token_filters/stop_word/column.expected    2019-03-04 16:23:13 +0900 (4b27062ff)
@@ -22,7 +22,7 @@ load --table Memos
 {"content": "Good-bye"}
 ]
 [[0,0.0,0.0],3]
-select Memos --match_columns content --query "Hello and"
+select Memos   --match_columns content   --query "Hello and"   --match_escalation_threshold -1
 [
   [
     0,

  Modified: test/command/suite/token_filters/stop_word/column.test (+4 -1)
===================================================================
--- test/command/suite/token_filters/stop_word/column.test    2019-03-04 16:13:40 +0900 (fc4ebd34a)
+++ test/command/suite/token_filters/stop_word/column.test    2019-03-04 16:23:13 +0900 (7c33b9ddf)
@@ -19,4 +19,7 @@ load --table Memos
 {"content": "Good-bye"}
 ]
 
-select Memos --match_columns content --query "Hello and"
+select Memos \
+  --match_columns content \
+  --query "Hello and" \
+  --match_escalation_threshold -1

  Modified: test/command/suite/token_filters/stop_word/offline_index_construction.expected (+1 -1)
===================================================================
--- test/command/suite/token_filters/stop_word/offline_index_construction.expected    2019-03-04 16:13:40 +0900 (3f8a18044)
+++ test/command/suite/token_filters/stop_word/offline_index_construction.expected    2019-03-04 16:23:13 +0900 (cc858813e)
@@ -22,7 +22,7 @@ load --table Terms
 {"_key": "and", "is_stop_word": true}
 ]
 [[0,0.0,0.0],1]
-select Memos --match_columns content --query "Hello and"
+select Memos   --match_columns content   --query "Hello and"   --match_escalation_threshold -1   --sort_keys -_score
 [
   [
     0,

  Modified: test/command/suite/token_filters/stop_word/offline_index_construction.test (+5 -1)
===================================================================
--- test/command/suite/token_filters/stop_word/offline_index_construction.test    2019-03-04 16:13:40 +0900 (75c516e47)
+++ test/command/suite/token_filters/stop_word/offline_index_construction.test    2019-03-04 16:23:13 +0900 (dfc5c07e4)
@@ -22,4 +22,8 @@ load --table Terms
 {"_key": "and", "is_stop_word": true}
 ]
 
-select Memos --match_columns content --query "Hello and"
+select Memos \
+  --match_columns content \
+  --query "Hello and" \
+  --match_escalation_threshold -1 \
+  --sort_keys -_score

  Modified: test/command/suite/token_filters/stop_word/skip.expected (+1 -1)
===================================================================
--- test/command/suite/token_filters/stop_word/skip.expected    2019-03-04 16:13:40 +0900 (d2f6583b0)
+++ test/command/suite/token_filters/stop_word/skip.expected    2019-03-04 16:23:13 +0900 (ea73614d9)
@@ -22,7 +22,7 @@ load --table Memos
 {"content": "Good-bye"}
 ]
 [[0,0.0,0.0],3]
-select Memos --match_columns content --query "Hello and"
+select Memos   --match_columns content   --query "Hello and"   --match_escalation_threshold -1   --sort_keys -_score
 [
   [
     0,

  Modified: test/command/suite/token_filters/stop_word/skip.test (+5 -1)
===================================================================
--- test/command/suite/token_filters/stop_word/skip.test    2019-03-04 16:13:40 +0900 (364e09026)
+++ test/command/suite/token_filters/stop_word/skip.test    2019-03-04 16:23:13 +0900 (c7acc5fef)
@@ -22,4 +22,8 @@ load --table Memos
 {"content": "Good-bye"}
 ]
 
-select Memos --match_columns content --query "Hello and"
+select Memos \
+  --match_columns content \
+  --query "Hello and" \
+  --match_escalation_threshold -1 \
+  --sort_keys -_score
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190304/6df20cbe/attachment-0001.html>


More information about the Groonga-commit mailing list
Back to archive index