[Groonga-commit] groonga/groonga at 161d31f [master] TokenRegexp: don't ignore blank on GET mode

Kouhei Sutou <null+****@clear*****>
Sun May 10 00:02:05 JST 2015


Kouhei Sutou	2015-05-10 00:02:05 +0900 (Sun, 10 May 2015)

  New Revision: 161d31f5fe46031facf4c511176d9f3d299a097d
  https://github.com/groonga/groonga/commit/161d31f5fe46031facf4c511176d9f3d299a097d

  Message:
    TokenRegexp: don't ignore blank on GET mode
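
    Note (an interpretation of this change, not part of the original
    message): in GET mode, a character that the normalizer flags as
    blank now terminates the current n-gram, resets get.n_skip_tokens,
    and re-arms is_first_token, so the short token that follows a blank
    is emitted with GRN_TOKEN_FORCE_PREFIX instead of being skipped.
    The first hunk also advances nth_char one extra step when a token
    starts with an escaped character. For example, with the Lexicon
    built in the tests below:

      table_tokenize Lexicon "abc\nd" --mode GET
      # => "ab" (position 0), "bc" (position 1), "d" (position 3)
      # "d" is shorter than the n-gram unit, but the preceding blank
      # re-armed is_first_token, so it is returned as a prefix token
      # (see less_after.expected)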

  Added files:
    test/command/suite/tokenizers/regexp/get/normalizer/blank/escape.expected
    test/command/suite/tokenizers/regexp/get/normalizer/blank/escape.test
    test/command/suite/tokenizers/regexp/get/normalizer/blank/less_after.expected
    test/command/suite/tokenizers/regexp/get/normalizer/blank/less_after.test
    test/command/suite/tokenizers/regexp/get/normalizer/blank/less_before.expected
    test/command/suite/tokenizers/regexp/get/normalizer/blank/less_before.test
  Modified files:
    lib/tokenizers.c

  Modified: lib/tokenizers.c (+15 -8)
===================================================================
--- lib/tokenizers.c    2015-05-09 23:14:59 +0900 (f4bb58a)
+++ lib/tokenizers.c    2015-05-10 00:02:05 +0900 (e983ef1)
@@ -585,10 +585,12 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
  const uint_least8_t *char_types =
     tokenizer->char_types + tokenizer->nth_char;
   grn_tokenize_mode mode = tokenizer->query->tokenize_mode;
+  grn_bool is_first_token = tokenizer->is_first_token;
   grn_bool escaping = GRN_FALSE;
   grn_bool break_by_blank = GRN_FALSE;
 
   GRN_BULK_REWIND(buffer);
+  tokenizer->is_first_token = GRN_FALSE;
 
   if (mode == GRN_TOKEN_GET) {
     if (tokenizer->get.have_begin) {
@@ -650,20 +652,23 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
       n_characters++;
       GRN_TEXT_PUT(ctx, buffer, current, char_len);
       current += char_len;
-      escaping = GRN_FALSE;
       if (n_characters == 1) {
         tokenizer->next = current;
         tokenizer->nth_char++;
+        if (escaping) {
+          tokenizer->nth_char++;
+        }
       }
-      if (n_characters == ngram_unit) {
-        break;
-      }
+      escaping = GRN_FALSE;
       char_type = char_types[0];
       char_types++;
       if (GRN_STR_ISBLANK(char_type)) {
         break_by_blank = GRN_TRUE;
         break;
       }
+      if (n_characters == ngram_unit) {
+        break;
+      }
     }
 
     char_len = grn_charlen_(ctx, (const char *)current, (const char *)end,
@@ -676,7 +681,7 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   if (tokenizer->is_overlapping) {
     status |= GRN_TOKEN_OVERLAP;
   }
-  if (n_characters < ngram_unit && !break_by_blank) {
+  if (n_characters < ngram_unit) {
     status |= GRN_TOKEN_UNMATURED;
   }
   tokenizer->is_overlapping = (n_characters > 1);
@@ -688,7 +693,7 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
           tokenizer->is_end = GRN_TRUE;
         }
         if (status & GRN_TOKEN_UNMATURED) {
-          if (tokenizer->is_first_token) {
+          if (is_first_token) {
             status |= GRN_TOKEN_FORCE_PREFIX;
           } else {
             status |= GRN_TOKEN_SKIP;
@@ -702,7 +707,10 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
         }
       }
     } else {
-      if (tokenizer->get.n_skip_tokens > 0) {
+      if (break_by_blank) {
+        tokenizer->get.n_skip_tokens = 0;
+        tokenizer->is_first_token = GRN_TRUE;
+      } else if (tokenizer->get.n_skip_tokens > 0) {
         tokenizer->get.n_skip_tokens--;
         status |= GRN_TOKEN_SKIP;
       } else {
@@ -720,7 +728,6 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
                            GRN_TEXT_VALUE(buffer),
                            GRN_TEXT_LEN(buffer),
                            status);
-  tokenizer->is_first_token = GRN_FALSE;
 
   return NULL;
 }
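
  The toy program below condenses the GET-mode behavior that the hunks
  above establish, as exercised by the new tests that follow. It is an
  illustrative sketch only, not Groonga code: '\n' stands in for any
  character NormalizerAuto marks as blank, the n-gram unit is hard-coded
  to 2 to match the bigrams in the tests, and tokenize_get is a made-up
  name.

    #include <stdio.h>

    /* Within each blank-separated run, emit every full bigram; emit a
     * trailing short token only when it is the first token of its run
     * (the FORCE_PREFIX case); otherwise skip it, but let it consume a
     * position (hence the gap before "d" in less_after.expected). */
    static void
    tokenize_get(const char *input)
    {
      int position = 0;
      const char *p = input;
      while (*p) {
        const char *run = p;
        while (*p && *p != '\n') { p++; }   /* scan the non-blank run */
        int run_len = (int)(p - run);
        if (*p == '\n') { p++; }            /* blanks consume no position */
        if (run_len == 0) { continue; }
        if (run_len == 1) {
          printf("\"%c\" -> position %d\n", run[0], position++);
        } else {
          int i;
          for (i = 0; i + 2 <= run_len; i++) {
            printf("\"%.2s\" -> position %d\n", run + i, position++);
          }
          position++;                       /* skipped trailing character */
        }
      }
    }

    int
    main(void)
    {
      tokenize_get("abc\nd");  /* "ab":0, "bc":1, "d":3 (less_after)  */
      tokenize_get("a\ndef");  /* "a":0, "de":1, "ef":2 (less_before) */
      return 0;
    }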

  Added: test/command/suite/tokenizers/regexp/get/normalizer/blank/escape.expected (+70 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/normalizer/blank/escape.expected    2015-05-10 00:02:05 +0900 (989ce04)
@@ -0,0 +1,70 @@
+table_create Lexicon TABLE_PAT_KEY ShortText   --default_tokenizer TokenRegexp   --normalizer NormalizerAuto
+[[0,0.0,0.0],true]
+table_tokenize Lexicon "abc\ndef" --mode ADD
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "￯",
+      "position": 0
+    },
+    {
+      "value": "ab",
+      "position": 1
+    },
+    {
+      "value": "bc",
+      "position": 2
+    },
+    {
+      "value": "c",
+      "position": 3
+    },
+    {
+      "value": "de",
+      "position": 4
+    },
+    {
+      "value": "ef",
+      "position": 5
+    },
+    {
+      "value": "f",
+      "position": 6
+    },
+    {
+      "value": "￰",
+      "position": 7
+    }
+  ]
+]
+table_tokenize Lexicon "a\\bc\ndef" --mode GET
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "ab",
+      "position": 0
+    },
+    {
+      "value": "bc",
+      "position": 1
+    },
+    {
+      "value": "de",
+      "position": 3
+    },
+    {
+      "value": "ef",
+      "position": 4
+    }
+  ]
+]
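
  Note on the expected output above (an interpretation, not part of the
  commit): in GET mode the escaped query "a\\bc\ndef" yields the same
  bigrams and positions as the indexed text "abc\ndef", because "\\b" is
  consumed as the literal character "b" and the added nth_char adjustment
  steps past the backslash:

    table_tokenize Lexicon "a\\bc\ndef" --mode GET
    # => "ab" (0), "bc" (1), "de" (3), "ef" (4)
    # position 2 is consumed by the skipped short token "c"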

  Added: test/command/suite/tokenizers/regexp/get/normalizer/blank/escape.test (+6 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/normalizer/blank/escape.test    2015-05-10 00:02:05 +0900 (e4772fc)
@@ -0,0 +1,6 @@
+table_create Lexicon TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenRegexp \
+  --normalizer NormalizerAuto
+table_tokenize Lexicon "abc\ndef" --mode ADD
+
+table_tokenize Lexicon "a\\bc\ndef" --mode GET

  Added: test/command/suite/tokenizers/regexp/get/normalizer/blank/less_after.expected (+58 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/normalizer/blank/less_after.expected    2015-05-10 00:02:05 +0900 (c05a87c)
@@ -0,0 +1,58 @@
+table_create Lexicon TABLE_PAT_KEY ShortText   --default_tokenizer TokenRegexp   --normalizer NormalizerAuto
+[[0,0.0,0.0],true]
+table_tokenize Lexicon "abc\nd" --mode ADD
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "￯",
+      "position": 0
+    },
+    {
+      "value": "ab",
+      "position": 1
+    },
+    {
+      "value": "bc",
+      "position": 2
+    },
+    {
+      "value": "c",
+      "position": 3
+    },
+    {
+      "value": "d",
+      "position": 4
+    },
+    {
+      "value": "￰",
+      "position": 5
+    }
+  ]
+]
+table_tokenize Lexicon "abc\nd" --mode GET
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "ab",
+      "position": 0
+    },
+    {
+      "value": "bc",
+      "position": 1
+    },
+    {
+      "value": "d",
+      "position": 3
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/regexp/get/normalizer/blank/less_after.test (+6 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/normalizer/blank/less_after.test    2015-05-10 00:02:05 +0900 (6d79cc7)
@@ -0,0 +1,6 @@
+table_create Lexicon TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenRegexp \
+  --normalizer NormalizerAuto
+table_tokenize Lexicon "abc\nd" --mode ADD
+
+table_tokenize Lexicon "abc\nd" --mode GET

  Added: test/command/suite/tokenizers/regexp/get/normalizer/blank/less_before.expected (+58 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/normalizer/blank/less_before.expected    2015-05-10 00:02:05 +0900 (1607760)
@@ -0,0 +1,58 @@
+table_create Lexicon TABLE_PAT_KEY ShortText   --default_tokenizer TokenRegexp   --normalizer NormalizerAuto
+[[0,0.0,0.0],true]
+table_tokenize Lexicon "a\ndef" --mode ADD
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "￯",
+      "position": 0
+    },
+    {
+      "value": "a",
+      "position": 1
+    },
+    {
+      "value": "de",
+      "position": 2
+    },
+    {
+      "value": "ef",
+      "position": 3
+    },
+    {
+      "value": "f",
+      "position": 4
+    },
+    {
+      "value": "￰",
+      "position": 5
+    }
+  ]
+]
+table_tokenize Lexicon "a\ndef" --mode GET
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "a",
+      "position": 0
+    },
+    {
+      "value": "de",
+      "position": 1
+    },
+    {
+      "value": "ef",
+      "position": 2
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/regexp/get/normalizer/blank/less_before.test (+6 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/normalizer/blank/less_before.test    2015-05-10 00:02:05 +0900 (b753510)
@@ -0,0 +1,6 @@
+table_create Lexicon TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenRegexp \
+  --normalizer NormalizerAuto
+table_tokenize Lexicon "a\ndef" --mode ADD
+
+table_tokenize Lexicon "a\ndef" --mode GET