[Groonga-commit] groonga/groonga at ef6e895 [master] TokenPattern: stop returning the needless last empty token

Back to archive index
Kouhei Sutou null+****@clear*****
Thu Feb 7 15:11:00 JST 2019


Kouhei Sutou	2019-02-07 15:11:00 +0900 (Thu, 07 Feb 2019)

  Revision: ef6e895e3d52972267801f21667debe017b31c65
  https://github.com/groonga/groonga/commit/ef6e895e3d52972267801f21667debe017b31c65

  Message:
    TokenPattern: stop returning the needless last empty token

  Added files:
    test/command/suite/select/query/match/with_index/token_pattern/match.expected
    test/command/suite/select/query/match/with_index/token_pattern/match.test
  Modified files:
    lib/tokenizers.c

  Modified: lib/tokenizers.c (+47 -22)
===================================================================
--- lib/tokenizers.c    2019-02-07 14:48:48 +0900 (41f8ee843)
+++ lib/tokenizers.c    2019-02-07 15:11:00 +0900 (4abd5450d)
@@ -1561,6 +1561,8 @@ typedef struct {
   const unsigned char *start;
   const unsigned char *next;
   const unsigned char *end;
+  const unsigned char *current;
+  size_t current_length;
 } grn_pattern_tokenizer;
 
 static void
@@ -1698,11 +1700,42 @@ pattern_init(grn_ctx *ctx, grn_tokenizer_query *query)
     tokenizer->start = (const unsigned char *)normalized;
     tokenizer->next = tokenizer->start;
     tokenizer->end = tokenizer->start + normalized_length_in_bytes;
+    tokenizer->current = NULL;
+    tokenizer->current_length = 0;
   }
 
   return tokenizer;
 }
 
+#ifdef GRN_SUPPORT_REGEXP
+static void
+pattern_search(grn_ctx *ctx,
+               grn_pattern_tokenizer *tokenizer)
+{
+  OnigPosition position;
+  OnigRegion region;
+
+  onig_region_init(&region);
+  position = onig_search(tokenizer->options->regex,
+                         tokenizer->start,
+                         tokenizer->end,
+                         tokenizer->next,
+                         tokenizer->end,
+                         &region,
+                         ONIG_OPTION_NONE);
+  if (position == ONIG_MISMATCH) {
+    tokenizer->current = NULL;
+    tokenizer->current_length = 0;
+    tokenizer->next = tokenizer->end;
+  } else {
+    tokenizer->current = tokenizer->start + region.beg[0];
+    tokenizer->current_length = region.end[0] - region.beg[0];
+    tokenizer->next = tokenizer->start + region.end[0];
+  }
+  onig_region_free(&region, 0);
+}
+#endif
+
 static void
 pattern_next(grn_ctx *ctx,
              grn_tokenizer_query *query,
@@ -1723,29 +1756,21 @@ pattern_next(grn_ctx *ctx,
         tokenizer->encoding);
 #ifdef GRN_SUPPORT_REGEXP
   } else if (tokenizer->options->regex) {
-    OnigPosition position;
-    OnigRegion region;
-
-    onig_region_init(&region);
-    position = onig_search(tokenizer->options->regex,
-                           tokenizer->start,
-                           tokenizer->end,
-                           tokenizer->next,
-                           tokenizer->end,
-                           &region,
-                           ONIG_OPTION_NONE);
-    if (position == ONIG_MISMATCH) {
-      grn_token_set_data(ctx, token, NULL, 0);
-      grn_token_set_status(ctx, token, GRN_TOKEN_LAST);
-    } else {
-      grn_token_set_data(ctx,
-                         token,
-                         tokenizer->start + region.beg[0],
-                         region.end[0] - region.beg[0]);
-      grn_token_set_status(ctx, token, GRN_TOKEN_CONTINUE);
-      tokenizer->next = tokenizer->start + region.end[0];
-      onig_region_free(&region, 0);
+    grn_token_status status = GRN_TOKEN_CONTINUE;
+    if (tokenizer->next == tokenizer->start) {
+      pattern_search(ctx, tokenizer);
     }
+    grn_token_set_data(ctx,
+                       token,
+                       tokenizer->current,
+                       tokenizer->current_length);
+    if (tokenizer->next != tokenizer->end) {
+      pattern_search(ctx, tokenizer);
+    }
+    if (tokenizer->next == tokenizer->end) {
+      status = GRN_TOKEN_LAST;
+    }
+    grn_token_set_status(ctx, token, status);
 #endif /* GRN_SUPPORT_REGEXP */
   } else {
     grn_token_set_data(ctx, token, NULL, 0);

  Added: test/command/suite/select/query/match/with_index/token_pattern/match.expected (+16 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/query/match/with_index/token_pattern/match.expected    2019-02-07 15:11:00 +0900 (fb2c6767a)
@@ -0,0 +1,16 @@
+table_create Menus TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Menus name COLUMN_SCALAR Text
+[[0,0.0,0.0],true]
+table_create Keywords TABLE_PAT_KEY ShortText   --normalize NormalizerNFKC100   --default_tokenizer 'TokenPattern("pattern", "焼き?肉")'
+[[0,0.0,0.0],true]
+column_create Keywords index COLUMN_INDEX Menus name
+[[0,0.0,0.0],true]
+load --table Menus
+[
+{"name": "焼肉定食"},
+{"name": "焼き肉定食"}
+]
+[[0,0.0,0.0],2]
+select Menus --match_columns name --query "焼き肉弁当"
+[[0,0.0,0.0],[[[1],[["_id","UInt32"],["name","Text"]],[2,"焼き肉定食"]]]]

  Added: test/command/suite/select/query/match/with_index/token_pattern/match.test (+15 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/query/match/with_index/token_pattern/match.test    2019-02-07 15:11:00 +0900 (f6e1d3dd1)
@@ -0,0 +1,15 @@
+table_create Menus TABLE_NO_KEY
+column_create Menus name COLUMN_SCALAR Text
+
+table_create Keywords TABLE_PAT_KEY ShortText \
+  --normalize NormalizerNFKC100 \
+  --default_tokenizer 'TokenPattern("pattern", "焼き?肉")'
+column_create Keywords index COLUMN_INDEX Menus name
+
+load --table Menus
+[
+{"name": "焼肉定食"},
+{"name": "焼き肉定食"}
+]
+
+select Menus --match_columns name --query "焼き肉弁当"
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190207/d9b1f6f0/attachment-0001.html>


More information about the Groonga-commit mailing list
Back to archive index