[Groonga-commit] groonga/groonga at 3cfdcca [master] TokenRegexp: move regular expression parser to ii.c from tokenizers


Kouhei Sutou null+****@clear*****
Sun May 10 17:14:35 JST 2015


Kouhei Sutou	2015-05-10 17:14:35 +0900 (Sun, 10 May 2015)

  New Revision: 3cfdcca9663f31d613e331a087c758fa0d791b60
  https://github.com/groonga/groonga/commit/3cfdcca9663f31d613e331a087c758fa0d791b60

  Message:
    TokenRegexp: move regular expression parser to ii.c from tokenizers
    
    Now we can use TokenRegexp for the match operation (@ in script syntax)
    without escaping.

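  Editorial note: the following is a rough, self-contained sketch of what the
  new grn_ii_select_regexp() parser added to lib/ii.c does with a match query
  before handing it to grn_ii_select(): a leading \A and a trailing \z are
  rewritten into the begin/end mark strings that TokenRegexp indexes, and a
  backslash escapes the following character. The parse_regexp_query() helper
  and the BEGIN_MARK/END_MARK byte values are placeholders assumed for
  illustration only; the real code uses GRN_TOKENIZER_BEGIN_MARK_UTF8 /
  GRN_TOKENIZER_END_MARK_UTF8 and grn_charlen() for encoding-aware scanning.

    #include <stdio.h>
    #include <string.h>

    /* Placeholder mark strings, assumed for illustration; Groonga uses the
       UTF-8 forms of GRN_TOKENIZER_BEGIN_MARK_UTF8 / _END_MARK_UTF8. */
    #define BEGIN_MARK "\xef\xbf\xaf"   /* U+FFEF */
    #define END_MARK   "\xef\xbf\xb0"   /* U+FFF0 */

    /* Hypothetical helper: copy query into out, turning a leading "\A" into
       BEGIN_MARK, a trailing "\z" into END_MARK, and "\x" into "x" for any
       other escaped byte.  Returns the number of bytes written. */
    static size_t
    parse_regexp_query(const char *query, size_t query_len,
                       char *out, size_t out_size)
    {
      size_t i = 0, o = 0, n_output_chars = 0;
      int escaping = 0;

      while (i < query_len && o + 4 < out_size) {
        char c = query[i++];
        if (escaping) {
          escaping = 0;
          if (c == 'A' && n_output_chars == 0) {   /* \A only at the start */
            memcpy(out + o, BEGIN_MARK, strlen(BEGIN_MARK));
            o += strlen(BEGIN_MARK);
            n_output_chars++;
            continue;
          }
          if (c == 'z' && i == query_len) {        /* \z only at the end */
            memcpy(out + o, END_MARK, strlen(END_MARK));
            o += strlen(END_MARK);
            n_output_chars++;
            continue;
          }
          /* any other escaped byte is emitted verbatim, minus the backslash */
        } else if (c == '\\') {
          escaping = 1;
          continue;
        }
        out[o++] = c;
        n_output_chars++;
      }
      out[o] = '\0';
      return o;
    }

    int
    main(void)
    {
      char parsed[256];

      /* "\Afoo\z" becomes "<BEGIN_MARK>foo<END_MARK>", so a plain match
         search against a TokenRegexp index can honor the anchors. */
      parse_regexp_query("\\Afoo\\z", strlen("\\Afoo\\z"),
                         parsed, sizeof(parsed));
      printf("%s\n", parsed);

      /* "c:\\Users" (escaped backslash) becomes "c:\Users"; this is why the
         match operator (@) no longer needs an extra escaping layer. */
      parse_regexp_query("c:\\\\Users", strlen("c:\\\\Users"),
                         parsed, sizeof(parsed));
      printf("%s\n", parsed);
      return 0;
    }
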
  Added files:
    test/command/suite/select/filter/index/match/token_regexp.expected
    test/command/suite/select/filter/index/match/token_regexp.test
  Removed files:
    test/command/suite/tokenizers/regexp/get/escape/one.expected
    test/command/suite/tokenizers/regexp/get/escape/one.test
    test/command/suite/tokenizers/regexp/get/escape/two.expected
    test/command/suite/tokenizers/regexp/get/escape/two.test
    test/command/suite/tokenizers/regexp/get/normalizer/blank/escape.expected
    test/command/suite/tokenizers/regexp/get/normalizer/blank/escape.test
  Modified files:
    lib/ii.c
    lib/tokenizers.c
    test/command/suite/tokenizers/regexp/get/begin/one.expected
    test/command/suite/tokenizers/regexp/get/begin/one.test
    test/command/suite/tokenizers/regexp/get/begin/three.expected
    test/command/suite/tokenizers/regexp/get/begin/three.test
    test/command/suite/tokenizers/regexp/get/begin/two.expected
    test/command/suite/tokenizers/regexp/get/begin/two.test
    test/command/suite/tokenizers/regexp/get/end/four.expected
    test/command/suite/tokenizers/regexp/get/end/four.test
    test/command/suite/tokenizers/regexp/get/end/one.expected
    test/command/suite/tokenizers/regexp/get/end/one.test
    test/command/suite/tokenizers/regexp/get/end/three.expected
    test/command/suite/tokenizers/regexp/get/end/three.test
    test/command/suite/tokenizers/regexp/get/end/two.expected
    test/command/suite/tokenizers/regexp/get/end/two.test

  Modified: lib/ii.c (+74 -0)
===================================================================
--- lib/ii.c    2015-05-10 15:29:38 +0900 (2d32df2)
+++ lib/ii.c    2015-05-10 17:14:35 +0900 (6f6a0ca)
@@ -6019,6 +6019,77 @@ grn_ii_term_extract(grn_ctx *ctx, grn_ii *ii, const char *string,
   return rc;
 }
 
+static grn_rc
+grn_ii_select_regexp(grn_ctx *ctx, grn_ii *ii,
+                     const char *string, unsigned int string_len,
+                     grn_hash *s, grn_operator op, grn_select_optarg *optarg)
+{
+  grn_rc rc;
+  grn_obj parsed_string;
+  grn_bool escaping = GRN_FALSE;
+  int nth_char = 0;
+  const char *current = string;
+  const char *string_end = string + string_len;
+
+  GRN_TEXT_INIT(&parsed_string, 0);
+  while (current < string_end) {
+    const char *target;
+    int char_len;
+
+    char_len = grn_charlen(ctx, current, string_end);
+    if (char_len == 0) {
+      ERR(GRN_INVALID_ARGUMENT,
+          "[ii][select][regexp] invalid encoding character: <%.*s|%#x|>",
+          (int)(current - string), string,
+          *current);
+      return ctx->rc;
+    }
+    target = current;
+    current += char_len;
+
+    if (escaping) {
+      escaping = GRN_FALSE;
+      if (char_len == 1) {
+        switch (*target) {
+        case 'A' :
+          if (nth_char == 0) {
+            target = GRN_TOKENIZER_BEGIN_MARK_UTF8;
+            char_len = GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN;
+          }
+          break;
+        case 'z' :
+          if (current == string_end) {
+            target = GRN_TOKENIZER_END_MARK_UTF8;
+            char_len = GRN_TOKENIZER_END_MARK_UTF8_LEN;
+          }
+          break;
+        default :
+          break;
+        }
+      }
+    } else {
+      if (char_len == 1 && *target == '\\') {
+        escaping = GRN_TRUE;
+        continue;
+      }
+    }
+
+    GRN_TEXT_PUT(ctx, &parsed_string, target, char_len);
+    nth_char++;
+  }
+
+  if (optarg) {
+    optarg->mode = GRN_OP_MATCH;
+  }
+
+  rc = grn_ii_select(ctx, ii,
+                     GRN_TEXT_VALUE(&parsed_string),
+                     GRN_TEXT_LEN(&parsed_string),
+                     s, op, optarg);
+
+  return rc;
+}
+
 #ifdef GRN_II_SELECT_ENABLE_SEQUENTIAL_SEARCH
 static grn_bool
 grn_ii_select_sequential_search_should_use(grn_ctx *ctx,
@@ -6259,6 +6330,9 @@ grn_ii_select(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_
   if (mode == GRN_OP_TERM_EXTRACT) {
     return grn_ii_term_extract(ctx, ii, string, string_len, s, op, optarg);
   }
+  if (mode == GRN_OP_REGEXP) {
+    return grn_ii_select_regexp(ctx, ii, string, string_len, s, op, optarg);
+  }
   /* todo : support subrec
   rep = (s->record_unit == grn_rec_position || s->subrec_unit == grn_rec_position);
   orp = (s->record_unit == grn_rec_position || op == GRN_OP_OR);

  Modified: lib/tokenizers.c (+72 -106)
===================================================================
--- lib/tokenizers.c    2015-05-10 15:29:38 +0900 (dfa9cc5)
+++ lib/tokenizers.c    2015-05-10 17:14:35 +0900 (0dd0f1e)
@@ -473,8 +473,6 @@ typedef struct {
   grn_tokenizer_token token;
   grn_tokenizer_query *query;
   struct {
-    grn_bool have_begin;
-    grn_bool have_end;
     int32_t n_skip_tokens;
   } get;
   grn_bool is_begin;
@@ -514,8 +512,6 @@ regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   grn_tokenizer_token_init(ctx, &(tokenizer->token));
   tokenizer->query = query;
 
-  tokenizer->get.have_begin = GRN_FALSE;
-  tokenizer->get.have_end   = GRN_FALSE;
   tokenizer->get.n_skip_tokens = 0;
 
   tokenizer->is_begin = GRN_TRUE;
@@ -532,40 +528,6 @@ regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   tokenizer->char_types =
     grn_string_get_types(ctx, tokenizer->query->normalized_query);
 
-  if (tokenizer->query->tokenize_mode == GRN_TOKEN_GET) {
-    unsigned int query_length = tokenizer->query->length;
-    if (query_length >= 2) {
-      const char *query_string = tokenizer->query->ptr;
-      grn_encoding encoding = tokenizer->query->encoding;
-      if (query_string[0] == '\\' && query_string[1] == 'A') {
-        tokenizer->get.have_begin = GRN_TRUE;
-        /* TODO: It assumes that both "\\" and "A" are normalized to 1
-           characters. Normalizer may omit character or expand to
-           multiple characters. */
-        tokenizer->next += grn_charlen_(ctx, tokenizer->next, tokenizer->end,
-                                        encoding);
-        tokenizer->next += grn_charlen_(ctx, tokenizer->next, tokenizer->end,
-                                        encoding);
-        tokenizer->nth_char = 2;
-      }
-      if (query_string[query_length - 2] == '\\' &&
-          query_string[query_length - 1] == 'z') {
-        tokenizer->get.have_end = GRN_TRUE;
-        /* TODO: It assumes that both "\\" and "z" are normalized to 1
-           byte characters. Normalizer may omit character or expand to
-           multiple characters. */
-        tokenizer->end -= grn_charlen_(ctx,
-                                       tokenizer->end - 1,
-                                       tokenizer->end,
-                                       encoding);
-        tokenizer->end -= grn_charlen_(ctx,
-                                       tokenizer->end - 1,
-                                       tokenizer->end,
-                                       encoding);
-      }
-    }
-  }
-
   GRN_TEXT_INIT(&(tokenizer->buffer), 0);
 
   return NULL;
@@ -584,45 +546,26 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   const char *end = tokenizer->end;
  const uint_least8_t *char_types = tokenizer->char_types;
   grn_tokenize_mode mode = tokenizer->query->tokenize_mode;
+  grn_bool is_begin = tokenizer->is_begin;
   grn_bool is_start_token = tokenizer->is_start_token;
-  grn_bool escaping = GRN_FALSE;
   grn_bool break_by_blank = GRN_FALSE;
+  grn_bool break_by_end_mark = GRN_FALSE;
 
   GRN_BULK_REWIND(buffer);
+  tokenizer->is_begin = GRN_FALSE;
   tokenizer->is_start_token = GRN_FALSE;
 
   if (char_types) {
     char_types += tokenizer->nth_char;
   }
 
-  if (mode == GRN_TOKEN_GET) {
-    if (tokenizer->get.have_begin) {
-      grn_tokenizer_token_push(ctx,
-                               &(tokenizer->token),
-                               GRN_TOKENIZER_BEGIN_MARK_UTF8,
-                               GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN,
-                               status);
-      tokenizer->get.have_begin = GRN_FALSE;
-      return NULL;
-    }
-
-    if (tokenizer->is_end && tokenizer->get.have_end) {
-      status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
-      grn_tokenizer_token_push(ctx,
-                               &(tokenizer->token),
-                               GRN_TOKENIZER_END_MARK_UTF8,
-                               GRN_TOKENIZER_END_MARK_UTF8_LEN,
-                               status);
-      return NULL;
-    }
-  } else {
-    if (tokenizer->is_begin) {
+  if (mode != GRN_TOKEN_GET) {
+    if (is_begin) {
       grn_tokenizer_token_push(ctx,
                                &(tokenizer->token),
                                GRN_TOKENIZER_BEGIN_MARK_UTF8,
                                GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN,
                                status);
-      tokenizer->is_begin = GRN_FALSE;
       return NULL;
     }
 
@@ -651,37 +594,54 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
     return NULL;
   }
 
-  while (GRN_TRUE) {
-    if (!escaping && mode == GRN_TOKEN_GET &&
-        char_len == 1 && current[0] == '\\') {
-      current += char_len;
-      escaping = GRN_TRUE;
-      if (char_types) {
-        char_types++;
-      }
-    } else {
+  if (mode == GRN_TOKEN_GET) {
+    if (is_begin &&
+        char_len == GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN &&
+        memcmp(current, GRN_TOKENIZER_BEGIN_MARK_UTF8, char_len) == 0) {
       n_characters++;
       GRN_TEXT_PUT(ctx, buffer, current, char_len);
       current += char_len;
-      if (n_characters == 1) {
-        tokenizer->next = current;
-        tokenizer->nth_char++;
-        if (escaping) {
-          tokenizer->nth_char++;
-        }
-      }
-      escaping = GRN_FALSE;
-      if (char_types) {
-        uint_least8_t char_type;
-        char_type = char_types[0];
-        char_types++;
-        if (GRN_STR_ISBLANK(char_type)) {
-          break_by_blank = GRN_TRUE;
-          break;
-        }
+      tokenizer->next = current;
+      tokenizer->nth_char++;
+      if (current == end) {
+        status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
       }
-      if (n_characters == ngram_unit) {
-        break;
+      grn_tokenizer_token_push(ctx,
+                               &(tokenizer->token),
+                               GRN_TOKENIZER_BEGIN_MARK_UTF8,
+                               GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN,
+                               status);
+      return NULL;
+    }
+
+    if (current + char_len == end &&
+        char_len == GRN_TOKENIZER_END_MARK_UTF8_LEN &&
+        memcmp(current, GRN_TOKENIZER_END_MARK_UTF8, char_len) == 0) {
+      status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
+      grn_tokenizer_token_push(ctx,
+                               &(tokenizer->token),
+                               GRN_TOKENIZER_END_MARK_UTF8,
+                               GRN_TOKENIZER_END_MARK_UTF8_LEN,
+                               status);
+      return NULL;
+    }
+  }
+
+  while (GRN_TRUE) {
+    n_characters++;
+    GRN_TEXT_PUT(ctx, buffer, current, char_len);
+    current += char_len;
+    if (n_characters == 1) {
+      tokenizer->next = current;
+      tokenizer->nth_char++;
+    }
+
+    if (char_types) {
+      uint_least8_t char_type;
+      char_type = char_types[0];
+      char_types++;
+      if (GRN_STR_ISBLANK(char_type)) {
+        break_by_blank = GRN_TRUE;
       }
     }
 
@@ -690,6 +650,21 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
     if (char_len == 0) {
       break;
     }
+
+    if (mode == GRN_TOKEN_GET &&
+        current + char_len == end &&
+        char_len == GRN_TOKENIZER_END_MARK_UTF8_LEN &&
+        memcmp(current, GRN_TOKENIZER_END_MARK_UTF8, char_len) == 0) {
+      break_by_end_mark = GRN_TRUE;
+    }
+
+    if (break_by_blank || break_by_end_mark) {
+      break;
+    }
+
+    if (n_characters == ngram_unit) {
+      break;
+    }
   }
 
   if (tokenizer->is_overlapping) {
@@ -702,28 +677,19 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 
   if (mode == GRN_TOKEN_GET) {
     if (current == end) {
-      if (tokenizer->get.have_end) {
-        if (tokenizer->next == end) {
-          tokenizer->is_end = GRN_TRUE;
-        }
-        if (status & GRN_TOKEN_UNMATURED) {
-          if (is_start_token) {
-            status |= GRN_TOKEN_FORCE_PREFIX;
-          } else {
-            status |= GRN_TOKEN_SKIP;
-          }
-        }
-      } else {
-        tokenizer->is_end = GRN_TRUE;
-        status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
-        if (status & GRN_TOKEN_UNMATURED) {
-          status |= GRN_TOKEN_FORCE_PREFIX;
-        }
+      tokenizer->is_end = GRN_TRUE;
+      status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
+      if (status & GRN_TOKEN_UNMATURED) {
+        status |= GRN_TOKEN_FORCE_PREFIX;
       }
     } else {
       if (break_by_blank) {
         tokenizer->get.n_skip_tokens = 0;
         tokenizer->is_start_token = GRN_TRUE;
+      } else if (break_by_end_mark) {
+        if (!is_start_token && (status & GRN_TOKEN_UNMATURED)) {
+          status |= GRN_TOKEN_SKIP;
+        }
       } else if (tokenizer->get.n_skip_tokens > 0) {
         tokenizer->get.n_skip_tokens--;
         status |= GRN_TOKEN_SKIP;

  Added: test/command/suite/select/filter/index/match/token_regexp.expected (+42 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/filter/index/match/token_regexp.expected    2015-05-10 17:14:35 +0900 (76ff708)
@@ -0,0 +1,42 @@
+table_create Paths TABLE_PAT_KEY ShortText
+[[0,0.0,0.0],true]
+table_create RegexpTokens TABLE_PAT_KEY ShortText   --default_tokenizer TokenRegexp   --normalizer NormalizerAuto
+[[0,0.0,0.0],true]
+column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION   Paths _key
+[[0,0.0,0.0],true]
+load --table Paths
+[
+{"_key": "c:\\Users\\alice"},
+{"_key": "c:\\Users\\alice\\Downloads"},
+{"_key": "c:\\Users\\bob\\Downloads"}
+]
+[[0,0.0,0.0],3]
+select Paths --filter '_key @ "\\\\Users\\\\alice\\\\"'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        1
+      ],
+      [
+        [
+          "_id",
+          "UInt32"
+        ],
+        [
+          "_key",
+          "ShortText"
+        ]
+      ],
+      [
+        2,
+        "c:\\Users\\alice\\Downloads"
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/select/filter/index/match/token_regexp.test (+16 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/filter/index/match/token_regexp.test    2015-05-10 17:14:35 +0900 (4d27bb0)
@@ -0,0 +1,16 @@
+table_create Paths TABLE_PAT_KEY ShortText
+
+table_create RegexpTokens TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenRegexp \
+  --normalizer NormalizerAuto
+column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION \
+  Paths _key
+
+load --table Paths
+[
+{"_key": "c:\\Users\\alice"},
+{"_key": "c:\\Users\\alice\\Downloads"},
+{"_key": "c:\\Users\\bob\\Downloads"}
+]
+
+select Paths --filter '_key @ "\\\\Users\\\\alice\\\\"'

  Modified: test/command/suite/tokenizers/regexp/get/begin/one.expected (+1 -1)
===================================================================
--- test/command/suite/tokenizers/regexp/get/begin/one.expected    2015-05-10 15:29:38 +0900 (8c9747a)
+++ test/command/suite/tokenizers/regexp/get/begin/one.expected    2015-05-10 17:14:35 +0900 (81aeaab)
@@ -22,5 +22,5 @@ table_tokenize Lexicon "x" --mode ADD
     }
   ]
 ]
-table_tokenize Lexicon "\\Ax" --mode GET
+table_tokenize Lexicon "￯x" --mode GET
 [[0,0.0,0.0],[{"value":"￯","position":0},{"value":"x","position":1}]]

  Modified: test/command/suite/tokenizers/regexp/get/begin/one.test (+1 -1)
===================================================================
--- test/command/suite/tokenizers/regexp/get/begin/one.test    2015-05-10 15:29:38 +0900 (0a4f35b)
+++ test/command/suite/tokenizers/regexp/get/begin/one.test    2015-05-10 17:14:35 +0900 (fb1cb1c)
@@ -2,4 +2,4 @@ table_create Lexicon TABLE_PAT_KEY ShortText \
   --default_tokenizer TokenRegexp
 table_tokenize Lexicon "x" --mode ADD
 
-table_tokenize Lexicon "\\Ax" --mode GET
+table_tokenize Lexicon "￯x" --mode GET

  Modified: test/command/suite/tokenizers/regexp/get/begin/three.expected (+1 -1)
===================================================================
--- test/command/suite/tokenizers/regexp/get/begin/three.expected    2015-05-10 15:29:38 +0900 (878c093)
+++ test/command/suite/tokenizers/regexp/get/begin/three.expected    2015-05-10 17:14:35 +0900 (7047d1e)
@@ -30,7 +30,7 @@ table_tokenize Lexicon "xyz" --mode ADD
     }
   ]
 ]
-table_tokenize Lexicon "\\Axyz" --mode GET
+table_tokenize Lexicon "￯xyz" --mode GET
 [
   [
     0,

  Modified: test/command/suite/tokenizers/regexp/get/begin/three.test (+1 -1)
===================================================================
--- test/command/suite/tokenizers/regexp/get/begin/three.test    2015-05-10 15:29:38 +0900 (82d674f)
+++ test/command/suite/tokenizers/regexp/get/begin/three.test    2015-05-10 17:14:35 +0900 (c4efefd)
@@ -2,4 +2,4 @@ table_create Lexicon TABLE_PAT_KEY ShortText \
   --default_tokenizer TokenRegexp
 table_tokenize Lexicon "xyz" --mode ADD
 
-table_tokenize Lexicon "\\Axyz" --mode GET
+table_tokenize Lexicon "￯xyz" --mode GET

  Modified: test/command/suite/tokenizers/regexp/get/begin/two.expected (+1 -1)
===================================================================
--- test/command/suite/tokenizers/regexp/get/begin/two.expected    2015-05-10 15:29:38 +0900 (8e82fb0)
+++ test/command/suite/tokenizers/regexp/get/begin/two.expected    2015-05-10 17:14:35 +0900 (588669b)
@@ -26,5 +26,5 @@ table_tokenize Lexicon "xy" --mode ADD
     }
   ]
 ]
-table_tokenize Lexicon "\\Axy" --mode GET
+table_tokenize Lexicon "￯xy" --mode GET
 [[0,0.0,0.0],[{"value":"￯","position":0},{"value":"xy","position":1}]]

  Modified: test/command/suite/tokenizers/regexp/get/begin/two.test (+1 -1)
===================================================================
--- test/command/suite/tokenizers/regexp/get/begin/two.test    2015-05-10 15:29:38 +0900 (9be9343)
+++ test/command/suite/tokenizers/regexp/get/begin/two.test    2015-05-10 17:14:35 +0900 (1a0dc73)
@@ -2,4 +2,4 @@ table_create Lexicon TABLE_PAT_KEY ShortText \
   --default_tokenizer TokenRegexp
 table_tokenize Lexicon "xy" --mode ADD
 
-table_tokenize Lexicon "\\Axy" --mode GET
+table_tokenize Lexicon "￯xy" --mode GET

  Modified: test/command/suite/tokenizers/regexp/get/end/four.expected (+1 -1)
===================================================================
--- test/command/suite/tokenizers/regexp/get/end/four.expected    2015-05-10 15:29:38 +0900 (ad58a34)
+++ test/command/suite/tokenizers/regexp/get/end/four.expected    2015-05-10 17:14:35 +0900 (6ce64ce)
@@ -34,7 +34,7 @@ table_tokenize Lexicon "abcd" --mode ADD
     }
   ]
 ]
-table_tokenize Lexicon "abcd\\z" --mode GET
+table_tokenize Lexicon "abcd￰" --mode GET
 [
   [
     0,

  Modified: test/command/suite/tokenizers/regexp/get/end/four.test (+1 -1)
===================================================================
--- test/command/suite/tokenizers/regexp/get/end/four.test    2015-05-10 15:29:38 +0900 (a4b1c2d)
+++ test/command/suite/tokenizers/regexp/get/end/four.test    2015-05-10 17:14:35 +0900 (0c18d81)
@@ -2,4 +2,4 @@ table_create Lexicon TABLE_PAT_KEY ShortText \
   --default_tokenizer TokenRegexp
 table_tokenize Lexicon "abcd" --mode ADD
 
-table_tokenize Lexicon "abcd\\z" --mode GET
+table_tokenize Lexicon "abcd￰" --mode GET

  Modified: test/command/suite/tokenizers/regexp/get/end/one.expected (+1 -1)
===================================================================
--- test/command/suite/tokenizers/regexp/get/end/one.expected    2015-05-10 15:29:38 +0900 (acaf793)
+++ test/command/suite/tokenizers/regexp/get/end/one.expected    2015-05-10 17:14:35 +0900 (9ba69f8)
@@ -22,5 +22,5 @@ table_tokenize Lexicon "x" --mode ADD
     }
   ]
 ]
-table_tokenize Lexicon "x\\z" --mode GET
+table_tokenize Lexicon "x￰" --mode GET
 [[0,0.0,0.0],[{"value":"x","position":0},{"value":"￰","position":1}]]

  Modified: test/command/suite/tokenizers/regexp/get/end/one.test (+1 -1)
===================================================================
--- test/command/suite/tokenizers/regexp/get/end/one.test    2015-05-10 15:29:38 +0900 (3314d6f)
+++ test/command/suite/tokenizers/regexp/get/end/one.test    2015-05-10 17:14:35 +0900 (b54d648)
@@ -2,4 +2,4 @@ table_create Lexicon TABLE_PAT_KEY ShortText \
   --default_tokenizer TokenRegexp
 table_tokenize Lexicon "x" --mode ADD
 
-table_tokenize Lexicon "x\\z" --mode GET
+table_tokenize Lexicon "x￰" --mode GET

  Modified: test/command/suite/tokenizers/regexp/get/end/three.expected (+1 -1)
===================================================================
--- test/command/suite/tokenizers/regexp/get/end/three.expected    2015-05-10 15:29:38 +0900 (d082e85)
+++ test/command/suite/tokenizers/regexp/get/end/three.expected    2015-05-10 17:14:35 +0900 (aaba665)
@@ -30,7 +30,7 @@ table_tokenize Lexicon "xyz" --mode ADD
     }
   ]
 ]
-table_tokenize Lexicon "xyz\\z" --mode GET
+table_tokenize Lexicon "xyz￰" --mode GET
 [
   [
     0,

  Modified: test/command/suite/tokenizers/regexp/get/end/three.test (+1 -1)
===================================================================
--- test/command/suite/tokenizers/regexp/get/end/three.test    2015-05-10 15:29:38 +0900 (510d69c)
+++ test/command/suite/tokenizers/regexp/get/end/three.test    2015-05-10 17:14:35 +0900 (8e225df)
@@ -2,4 +2,4 @@ table_create Lexicon TABLE_PAT_KEY ShortText \
   --default_tokenizer TokenRegexp
 table_tokenize Lexicon "xyz" --mode ADD
 
-table_tokenize Lexicon "xyz\\z" --mode GET
+table_tokenize Lexicon "xyz￰" --mode GET

  Modified: test/command/suite/tokenizers/regexp/get/end/two.expected (+1 -1)
===================================================================
--- test/command/suite/tokenizers/regexp/get/end/two.expected    2015-05-10 15:29:38 +0900 (40ed20b)
+++ test/command/suite/tokenizers/regexp/get/end/two.expected    2015-05-10 17:14:35 +0900 (3b94144)
@@ -26,5 +26,5 @@ table_tokenize Lexicon "xy" --mode ADD
     }
   ]
 ]
-table_tokenize Lexicon "xy\\z" --mode GET
+table_tokenize Lexicon "xy￰" --mode GET
 [[0,0.0,0.0],[{"value":"xy","position":0},{"value":"￰","position":2}]]

  Modified: test/command/suite/tokenizers/regexp/get/end/two.test (+1 -1)
===================================================================
--- test/command/suite/tokenizers/regexp/get/end/two.test    2015-05-10 15:29:38 +0900 (58b3e77)
+++ test/command/suite/tokenizers/regexp/get/end/two.test    2015-05-10 17:14:35 +0900 (3c9cc9b)
@@ -2,4 +2,4 @@ table_create Lexicon TABLE_PAT_KEY ShortText \
   --default_tokenizer TokenRegexp
 table_tokenize Lexicon "xy" --mode ADD
 
-table_tokenize Lexicon "xy\\z" --mode GET
+table_tokenize Lexicon "xy￰" --mode GET

  Deleted: test/command/suite/tokenizers/regexp/get/escape/one.expected (+0 -30) 100644
===================================================================
--- test/command/suite/tokenizers/regexp/get/escape/one.expected    2015-05-10 15:29:38 +0900 (f79eadc)
+++ /dev/null
@@ -1,30 +0,0 @@
-table_create Lexicon TABLE_PAT_KEY ShortText   --default_tokenizer TokenRegexp
-[[0,0.0,0.0],true]
-table_tokenize Lexicon "[e" --mode ADD
-[
-  [
-    0,
-    0.0,
-    0.0
-  ],
-  [
-    {
-      "value": "￯",
-      "position": 0
-    },
-    {
-      "value": "[e",
-      "position": 1
-    },
-    {
-      "value": "e",
-      "position": 2
-    },
-    {
-      "value": "￰",
-      "position": 3
-    }
-  ]
-]
-table_tokenize Lexicon "\\[e" --mode GET
-[[0,0.0,0.0],[{"value":"[e","position":0}]]

  Deleted: test/command/suite/tokenizers/regexp/get/escape/one.test (+0 -5) 100644
===================================================================
--- test/command/suite/tokenizers/regexp/get/escape/one.test    2015-05-10 15:29:38 +0900 (d2e7562)
+++ /dev/null
@@ -1,5 +0,0 @@
-table_create Lexicon TABLE_PAT_KEY ShortText \
-  --default_tokenizer TokenRegexp
-table_tokenize Lexicon "[e" --mode ADD
-
-table_tokenize Lexicon "\\[e" --mode GET

  Deleted: test/command/suite/tokenizers/regexp/get/escape/two.expected (+0 -86) 100644
===================================================================
--- test/command/suite/tokenizers/regexp/get/escape/two.expected    2015-05-10 15:29:38 +0900 (2de6d20)
+++ /dev/null
@@ -1,86 +0,0 @@
-table_create Lexicon TABLE_PAT_KEY ShortText   --default_tokenizer TokenRegexp
-[[0,0.0,0.0],true]
-table_tokenize Lexicon "c:\\server" --mode ADD
-[
-  [
-    0,
-    0.0,
-    0.0
-  ],
-  [
-    {
-      "value": "￯",
-      "position": 0
-    },
-    {
-      "value": "c:",
-      "position": 1
-    },
-    {
-      "value": ":\\",
-      "position": 2
-    },
-    {
-      "value": "\\s",
-      "position": 3
-    },
-    {
-      "value": "se",
-      "position": 4
-    },
-    {
-      "value": "er",
-      "position": 5
-    },
-    {
-      "value": "rv",
-      "position": 6
-    },
-    {
-      "value": "ve",
-      "position": 7
-    },
-    {
-      "value": "er",
-      "position": 8
-    },
-    {
-      "value": "r",
-      "position": 9
-    },
-    {
-      "value": "￰",
-      "position": 10
-    }
-  ]
-]
-table_tokenize Lexicon "c:\\\\server" --mode GET
-[
-  [
-    0,
-    0.0,
-    0.0
-  ],
-  [
-    {
-      "value": "c:",
-      "position": 0
-    },
-    {
-      "value": "\\s",
-      "position": 2
-    },
-    {
-      "value": "er",
-      "position": 4
-    },
-    {
-      "value": "ve",
-      "position": 6
-    },
-    {
-      "value": "er",
-      "position": 7
-    }
-  ]
-]

  Deleted: test/command/suite/tokenizers/regexp/get/escape/two.test (+0 -5) 100644
===================================================================
--- test/command/suite/tokenizers/regexp/get/escape/two.test    2015-05-10 15:29:38 +0900 (a2e47e7)
+++ /dev/null
@@ -1,5 +0,0 @@
-table_create Lexicon TABLE_PAT_KEY ShortText \
-  --default_tokenizer TokenRegexp
-table_tokenize Lexicon "c:\\server" --mode ADD
-
-table_tokenize Lexicon "c:\\\\server" --mode GET

  Deleted: test/command/suite/tokenizers/regexp/get/normalizer/blank/escape.expected (+0 -70) 100644
===================================================================
--- test/command/suite/tokenizers/regexp/get/normalizer/blank/escape.expected    2015-05-10 15:29:38 +0900 (791177e)
+++ /dev/null
@@ -1,70 +0,0 @@
-table_create Lexicon TABLE_PAT_KEY ShortText   --default_tokenizer TokenRegexp   --normalizer NormalizerAuto
-[[0,0.0,0.0],true]
-table_tokenize Lexicon "abc\ndef" --mode ADD
-[
-  [
-    0,
-    0.0,
-    0.0
-  ],
-  [
-    {
-      "value": "￯",
-      "position": 0
-    },
-    {
-      "value": "ab",
-      "position": 1
-    },
-    {
-      "value": "bc",
-      "position": 2
-    },
-    {
-      "value": "c",
-      "position": 3
-    },
-    {
-      "value": "de",
-      "position": 5
-    },
-    {
-      "value": "ef",
-      "position": 6
-    },
-    {
-      "value": "f",
-      "position": 7
-    },
-    {
-      "value": "￰",
-      "position": 8
-    }
-  ]
-]
-table_tokenize Lexicon "a\\bc\ndef" --mode GET
-[
-  [
-    0,
-    0.0,
-    0.0
-  ],
-  [
-    {
-      "value": "ab",
-      "position": 0
-    },
-    {
-      "value": "bc",
-      "position": 1
-    },
-    {
-      "value": "de",
-      "position": 3
-    },
-    {
-      "value": "ef",
-      "position": 4
-    }
-  ]
-]

  Deleted: test/command/suite/tokenizers/regexp/get/normalizer/blank/escape.test (+0 -6) 100644
===================================================================
--- test/command/suite/tokenizers/regexp/get/normalizer/blank/escape.test    2015-05-10 15:29:38 +0900 (e4772fc)
+++ /dev/null
@@ -1,6 +0,0 @@
-table_create Lexicon TABLE_PAT_KEY ShortText \
-  --default_tokenizer TokenRegexp \
-  --normalizer NormalizerAuto
-table_tokenize Lexicon "abc\ndef" --mode ADD
-
-table_tokenize Lexicon "a\\bc\ndef" --mode GET