[Groonga-commit] groonga/groonga at 3036b42 [master] TokenRegexp: support escape

Back to archive index

Kouhei Sutou null+****@clear*****
Tue Mar 17 17:48:42 JST 2015


Kouhei Sutou	2015-03-17 17:48:42 +0900 (Tue, 17 Mar 2015)

  New Revision: 3036b42552215de4f613404cf055c9c5282a9b6d
  https://github.com/groonga/groonga/commit/3036b42552215de4f613404cf055c9c5282a9b6d

  Message:
    TokenRegexp: support escape

  Added files:
    test/command/suite/select/filter/index/regexp/escape.expected
    test/command/suite/select/filter/index/regexp/escape.test
    test/command/suite/tokenizers/regexp/get/escape/one.expected
    test/command/suite/tokenizers/regexp/get/escape/one.test
    test/command/suite/tokenizers/regexp/get/escape/two.expected
    test/command/suite/tokenizers/regexp/get/escape/two.test
  Modified files:
    lib/tokenizers.c

  Modified: lib/tokenizers.c (+29 -10)
===================================================================
--- lib/tokenizers.c    2015-03-17 15:46:14 +0900 (ef1a0c5)
+++ lib/tokenizers.c    2015-03-17 17:48:42 +0900 (1c07c52)
@@ -481,6 +481,7 @@ typedef struct {
   grn_bool is_overlapping;
   const char *next;
   const char *end;
+  grn_obj buffer;
 } grn_regexp_tokenizer;
 
 static grn_obj *
@@ -555,6 +556,8 @@ regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
     }
   }
 
+  GRN_TEXT_INIT(&(tokenizer->buffer), 0);
+
   return NULL;
 }
 
@@ -566,10 +569,13 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   grn_regexp_tokenizer *tokenizer = user_data->ptr;
   unsigned int n_characters = 0;
   int ngram_unit = 2;
-  const char *start = tokenizer->next;
-  const char *current = start;
+  grn_obj *buffer = &(tokenizer->buffer);
+  const char *current = tokenizer->next;
   const char *end = tokenizer->end;
   grn_tokenize_mode mode = tokenizer->query->tokenize_mode;
+  grn_bool escaping = GRN_FALSE;
+
+  GRN_BULK_REWIND(buffer);
 
   if (mode == GRN_TOKEN_GET) {
     if (tokenizer->get.have_begin) {
@@ -620,17 +626,29 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
     return NULL;
   }
 
-  n_characters++;
-  current += char_len;
-  tokenizer->next = current;
-  while (n_characters < ngram_unit) {
+  while (GRN_TRUE) {
+    if (!escaping && mode == GRN_TOKEN_GET &&
+        char_len == 1 && current[0] == '\\') {
+      current += char_len;
+      escaping = GRN_TRUE;
+    } else {
+      n_characters++;
+      GRN_TEXT_PUT(ctx, buffer, current, char_len);
+      current += char_len;
+      escaping = GRN_FALSE;
+      if (n_characters == 1) {
+        tokenizer->next = current;
+      }
+      if (n_characters == ngram_unit) {
+        break;
+      }
+    }
+
     char_len = grn_charlen_(ctx, (const char *)current, (const char *)end,
                             tokenizer->query->encoding);
     if (char_len == 0) {
       break;
     }
-    n_characters++;
-    current += char_len;
   }
 
   if (tokenizer->is_overlapping) {
@@ -654,8 +672,8 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 
   grn_tokenizer_token_push(ctx,
                            &(tokenizer->token),
-                           (const char *)start,
-                           current - start,
+                           GRN_TEXT_VALUE(buffer),
+                           GRN_TEXT_LEN(buffer),
                            status);
   return NULL;
 }
@@ -669,6 +687,7 @@ regexp_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   }
   grn_tokenizer_token_fin(ctx, &(tokenizer->token));
   grn_tokenizer_query_close(ctx, tokenizer->query);
+  GRN_OBJ_FIN(ctx, &(tokenizer->buffer));
   GRN_FREE(tokenizer);
   return NULL;
 }

  Added: test/command/suite/select/filter/index/regexp/escape.expected (+54 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/filter/index/regexp/escape.expected    2015-03-17 17:48:42 +0900 (dd195d8)
@@ -0,0 +1,54 @@
+table_create Logs TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Logs message COLUMN_SCALAR Text
+[[0,0.0,0.0],true]
+table_create RegexpLexicon TABLE_PAT_KEY ShortText   --default_tokenizer TokenRegexp   --normalizer NormalizerAuto
+[[0,0.0,0.0],true]
+column_create RegexpLexicon logs_message_index   COLUMN_INDEX|WITH_POSITION Logs message
+[[0,0.0,0.0],true]
+load --table Logs
+[
+{"message": "host1:[error]: No memory"},
+{"message": "host1:[warning]: Remained disk space is less than 30%"},
+{"message": "host1:[error]: Disk full"},
+{"message": "host2:[error]: No memory"},
+{"message": "host2:[info]: Shutdown"}
+]
+[[0,0.0,0.0],5]
+select Logs --filter 'message @~ "\\\\[error\\\\]"'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        3
+      ],
+      [
+        [
+          "_id",
+          "UInt32"
+        ],
+        [
+          "message",
+          "Text"
+        ]
+      ],
+      [
+        1,
+        "host1:[error]: No memory"
+      ],
+      [
+        3,
+        "host1:[error]: Disk full"
+      ],
+      [
+        4,
+        "host2:[error]: No memory"
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/select/filter/index/regexp/escape.test (+19 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/filter/index/regexp/escape.test    2015-03-17 17:48:42 +0900 (d50b732)
@@ -0,0 +1,19 @@
+table_create Logs TABLE_NO_KEY
+column_create Logs message COLUMN_SCALAR Text
+
+table_create RegexpLexicon TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenRegexp \
+  --normalizer NormalizerAuto
+column_create RegexpLexicon logs_message_index \
+  COLUMN_INDEX|WITH_POSITION Logs message
+
+load --table Logs
+[
+{"message": "host1:[error]: No memory"},
+{"message": "host1:[warning]: Remained disk space is less than 30%"},
+{"message": "host1:[error]: Disk full"},
+{"message": "host2:[error]: No memory"},
+{"message": "host2:[info]: Shutdown"}
+]
+
+select Logs --filter 'message @~ "\\\\[error\\\\]"'

  Added: test/command/suite/tokenizers/regexp/get/escape/one.expected (+30 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/escape/one.expected    2015-03-17 17:48:42 +0900 (f79eadc)
@@ -0,0 +1,30 @@
+table_create Lexicon TABLE_PAT_KEY ShortText   --default_tokenizer TokenRegexp
+[[0,0.0,0.0],true]
+table_tokenize Lexicon "[e" --mode ADD
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "￯",
+      "position": 0
+    },
+    {
+      "value": "[e",
+      "position": 1
+    },
+    {
+      "value": "e",
+      "position": 2
+    },
+    {
+      "value": "￰",
+      "position": 3
+    }
+  ]
+]
+table_tokenize Lexicon "\\[e" --mode GET
+[[0,0.0,0.0],[{"value":"[e","position":0}]]

  Added: test/command/suite/tokenizers/regexp/get/escape/one.test (+5 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/escape/one.test    2015-03-17 17:48:42 +0900 (d2e7562)
@@ -0,0 +1,5 @@
+table_create Lexicon TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenRegexp
+table_tokenize Lexicon "[e" --mode ADD
+
+table_tokenize Lexicon "\\[e" --mode GET

  Added: test/command/suite/tokenizers/regexp/get/escape/two.expected (+98 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/escape/two.expected    2015-03-17 17:48:42 +0900 (47b7da8)
@@ -0,0 +1,98 @@
+table_create Lexicon TABLE_PAT_KEY ShortText   --default_tokenizer TokenRegexp
+[[0,0.0,0.0],true]
+table_tokenize Lexicon "c:\\server" --mode ADD
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "￯",
+      "position": 0
+    },
+    {
+      "value": "c:",
+      "position": 1
+    },
+    {
+      "value": ":\\",
+      "position": 2
+    },
+    {
+      "value": "\\s",
+      "position": 3
+    },
+    {
+      "value": "se",
+      "position": 4
+    },
+    {
+      "value": "er",
+      "position": 5
+    },
+    {
+      "value": "rv",
+      "position": 6
+    },
+    {
+      "value": "ve",
+      "position": 7
+    },
+    {
+      "value": "er",
+      "position": 8
+    },
+    {
+      "value": "r",
+      "position": 9
+    },
+    {
+      "value": "￰",
+      "position": 10
+    }
+  ]
+]
+table_tokenize Lexicon "c:\\\\server" --mode GET
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "c:",
+      "position": 0
+    },
+    {
+      "value": ":\\",
+      "position": 1
+    },
+    {
+      "value": "\\s",
+      "position": 2
+    },
+    {
+      "value": "se",
+      "position": 3
+    },
+    {
+      "value": "er",
+      "position": 4
+    },
+    {
+      "value": "rv",
+      "position": 5
+    },
+    {
+      "value": "ve",
+      "position": 6
+    },
+    {
+      "value": "er",
+      "position": 7
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/regexp/get/escape/two.test (+5 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/escape/two.test    2015-03-17 17:48:42 +0900 (a2e47e7)
@@ -0,0 +1,5 @@
+table_create Lexicon TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenRegexp
+table_tokenize Lexicon "c:\\server" --mode ADD
+
+table_tokenize Lexicon "c:\\\\server" --mode GET
-------------- next part --------------
An HTML attachment was scrubbed.
Download 



More information about the Groonga-commit mailing list
Back to archive index