[Groonga-commit] groonga/groonga at b9d6001 [master] TokenRegexp: fix a bug that "\Ax\z" returns all one-character data


Kouhei Sutou null+****@clear*****
Fri Jan 6 16:57:37 JST 2017


Kouhei Sutou	2017-01-06 16:57:37 +0900 (Fri, 06 Jan 2017)

  New Revision: b9d600189174ae9706924f4805d8dd3ae54b8eb8
  https://github.com/groonga/groonga/commit/b9d600189174ae9706924f4805d8dd3ae54b8eb8

  Message:
    TokenRegexp: fix a bug that "\Ax\z" returns all one-character data

  Added files:
    test/command/suite/tokenizers/regexp/get/begin_end/one.expected
    test/command/suite/tokenizers/regexp/get/begin_end/one.test
  Modified files:
    lib/tokenizers.c

  Modified: lib/tokenizers.c (+2 -1)
===================================================================
--- lib/tokenizers.c    2017-01-05 12:49:57 +0900 (87938e6)
+++ lib/tokenizers.c    2017-01-06 16:57:37 +0900 (6bd0a1b)
@@ -1,6 +1,6 @@
 /* -*- c-basic-offset: 2 -*- */
 /*
-  Copyright(C) 2009-2015 Brazil
+  Copyright(C) 2009-2017 Brazil
 
   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -603,6 +603,7 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
     if (is_begin &&
         char_len == GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN &&
         memcmp(current, GRN_TOKENIZER_BEGIN_MARK_UTF8, char_len) == 0) {
+      tokenizer->is_start_token = GRN_TRUE;
       n_characters++;
       GRN_TEXT_PUT(ctx, buffer, current, char_len);
       current += char_len;
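
  The fix is the single added line: when regexp_next() consumes the begin
  mark, it now records that in tokenizer->is_start_token, presumably marking
  the token that starts the key. Per the commit message, without this flag an
  anchored pattern like "\Ax\z" returned every one-character key instead of
  just "x". Below is a minimal standalone sketch of the begin-mark check,
  not the actual Groonga source: the mark value U+FFEF and the simplified
  names are assumptions, inferred from the "￯" token in the expected test
  output further down (the real code uses GRN_TOKENIZER_BEGIN_MARK_UTF8 and
  GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN).

    /* Standalone sketch of the begin-mark detection; not the actual
     * Groonga source. Assumes the begin mark is U+FFEF, encoded in
     * UTF-8 as the 3-byte sequence EF BF AF. */
    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    #define BEGIN_MARK_UTF8     "\xEF\xBF\xAF"  /* U+FFEF, assumed */
    #define BEGIN_MARK_UTF8_LEN 3

    int
    main(void)
    {
      const char *current = "\xEF\xBF\xAF" "x";  /* begin mark, then "x" */
      size_t char_len = 3;        /* byte length of the current character */
      bool is_start_token = false;

      if (char_len == BEGIN_MARK_UTF8_LEN &&
          memcmp(current, BEGIN_MARK_UTF8, char_len) == 0) {
        /* The added line in the patch: record that we are at the start
         * of the key, so that "\Ax\z" no longer matches every
         * one-character key. */
        is_start_token = true;
      }
      printf("is_start_token = %s\n", is_start_token ? "true" : "false");
      return 0;
    }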

  Added: test/command/suite/tokenizers/regexp/get/begin_end/one.expected (+52 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/begin_end/one.expected    2017-01-06 16:57:37 +0900 (6fa5b6e)
@@ -0,0 +1,52 @@
+table_create Lexicon TABLE_PAT_KEY ShortText   --default_tokenizer TokenRegexp
+[[0,0.0,0.0],true]
+table_tokenize Lexicon "x" --mode ADD
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "￯",
+      "position": 0,
+      "force_prefix": false
+    },
+    {
+      "value": "x",
+      "position": 1,
+      "force_prefix": false
+    },
+    {
+      "value": "￰",
+      "position": 2,
+      "force_prefix": false
+    }
+  ]
+]
+table_tokenize Lexicon "￯x￰" --mode GET
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "￯",
+      "position": 0,
+      "force_prefix": false
+    },
+    {
+      "value": "x",
+      "position": 1,
+      "force_prefix": false
+    },
+    {
+      "value": "￰",
+      "position": 2,
+      "force_prefix": false
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/regexp/get/begin_end/one.test (+5 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/begin_end/one.test    2017-01-06 16:57:37 +0900 (28e1e0a)
@@ -0,0 +1,5 @@
+table_create Lexicon TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenRegexp
+table_tokenize Lexicon "x" --mode ADD
+
+table_tokenize Lexicon "￯x￰" --mode GET
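
  The test mirrors how an anchored query reaches the tokenizer: "x" is
  tokenized in ADD mode as it would be at index time, and "￯x￰" in GET mode
  as a search for "\Ax\z" would be. Judging from the expected output above,
  "￯" (U+FFEF) and "￰" (U+FFF0) are the begin and end marks, so the three
  tokens at positions 0, 1, and 2 anchor "x" to both the start and the end
  of the key.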