[Groonga-commit] groonga/groonga at 1fdc0fd [master] TokenRegexp: don't ignore blank


Kouhei Sutou null+****@clear*****
Sat May 9 23:14:59 JST 2015


Kouhei Sutou	2015-05-09 23:14:59 +0900 (Sat, 09 May 2015)

  New Revision: 1fdc0fd578ecbe4cb8928e9069291a3d729fbbd7
  https://github.com/groonga/groonga/commit/1fdc0fd578ecbe4cb8928e9069291a3d729fbbd7

  Message:
    TokenRegexp: don't ignore blank

  Added files:
    test/command/suite/tokenizers/regexp/add/normalizer/blank.expected
    test/command/suite/tokenizers/regexp/add/normalizer/blank.test
  Modified files:
    lib/tokenizers.c
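
The gist of the change: regexp_init() now passes GRN_STRING_WITH_TYPES to the normalizer so that it reports a type flag for every normalized character, and regexp_next() stops extending an n-gram as soon as the character it just consumed carries the blank flag, so bigrams no longer span a removed "\n". The following standalone sketch reproduces just that loop; it is an illustration, not the Groonga sources: the 0x80 flag value mirrors the GRN_STR_ISBLANK() convention in grn_str.h, and plain single-byte characters stand in for the real multibyte handling done via grn_charlen_().

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Assumed flag layout, following grn_str.h's convention. */
#define GRN_STR_BLANK 0x80
#define GRN_STR_ISBLANK(c) ((c) & 0x80)

int main(void)
{
  /* NormalizerAuto turns "abcd\nefgh" into "abcdefgh" and flags the
     character before the removed "\n" ('d') as blank. */
  const char *text = "abcdefgh";
  const uint_least8_t char_types[] = {0, 0, 0, GRN_STR_BLANK, 0, 0, 0, 0};
  const size_t ngram_unit = 2;
  const size_t len = strlen(text);
  size_t start;

  for (start = 0; start < len; start++) {
    size_t i = start;
    size_t n_characters = 0;
    while (i < len && n_characters < ngram_unit) {
      n_characters++;
      i++;
      if (GRN_STR_ISBLANK(char_types[i - 1])) {
        break; /* an n-gram must not cross a blank */
      }
    }
    printf("%.*s\n", (int)(i - start), text + start);
  }
  return 0;
}

This prints ab, bc, cd, d, ef, fg, gh, h: the same token values as the
test case added by this commit, minus the begin/end marks.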

  Modified: lib/tokenizers.c (+20 -2)
===================================================================
--- lib/tokenizers.c    2015-05-09 21:13:11 +0900 (5578b53)
+++ lib/tokenizers.c    2015-05-09 23:14:59 +0900 (f4bb58a)
@@ -483,13 +483,15 @@ typedef struct {
   grn_bool is_overlapping;
   const char *next;
   const char *end;
+  unsigned int nth_char;
+  const uint_least8_t *char_types;
   grn_obj buffer;
 } grn_regexp_tokenizer;
 
 static grn_obj *
 regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 {
-  unsigned int normalize_flags = 0;
+  unsigned int normalize_flags = GRN_STRING_WITH_TYPES;
   grn_tokenizer_query *query;
   const char *normalized;
   unsigned int normalized_length_in_bytes;
@@ -526,6 +528,9 @@ regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
                             NULL);
   tokenizer->next = normalized;
   tokenizer->end = tokenizer->next + normalized_length_in_bytes;
+  tokenizer->nth_char = 0;
+  tokenizer->char_types =
+    grn_string_get_types(ctx, tokenizer->query->normalized_query);
 
   if (tokenizer->query->tokenize_mode == GRN_TOKEN_GET) {
     unsigned int query_length = tokenizer->query->length;
@@ -541,6 +546,7 @@ regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
                                         encoding);
         tokenizer->next += grn_charlen_(ctx, tokenizer->next, tokenizer->end,
                                         encoding);
+        tokenizer->nth_char = 2;
       }
       if (query_string[query_length - 2] == '\\' &&
           query_string[query_length - 1] == 'z') {
@@ -576,8 +582,11 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   grn_obj *buffer = &(tokenizer->buffer);
   const char *current = tokenizer->next;
   const char *end = tokenizer->end;
+  const uint_least8_t *char_types =
+    tokenizer->char_types + tokenizer->nth_char;
   grn_tokenize_mode mode = tokenizer->query->tokenize_mode;
   grn_bool escaping = GRN_FALSE;
+  grn_bool break_by_blank = GRN_FALSE;
 
   GRN_BULK_REWIND(buffer);
 
@@ -635,17 +644,26 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
         char_len == 1 && current[0] == '\\') {
       current += char_len;
       escaping = GRN_TRUE;
+      char_types++;
     } else {
+      uint_least8_t char_type;
       n_characters++;
       GRN_TEXT_PUT(ctx, buffer, current, char_len);
       current += char_len;
       escaping = GRN_FALSE;
       if (n_characters == 1) {
         tokenizer->next = current;
+        tokenizer->nth_char++;
       }
       if (n_characters == ngram_unit) {
         break;
       }
+      char_type = char_types[0];
+      char_types++;
+      if (GRN_STR_ISBLANK(char_type)) {
+        break_by_blank = GRN_TRUE;
+        break;
+      }
     }
 
     char_len = grn_charlen_(ctx, (const char *)current, (const char *)end,
@@ -658,7 +676,7 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   if (tokenizer->is_overlapping) {
     status |= GRN_TOKEN_OVERLAP;
   }
-  if (n_characters < ngram_unit) {
+  if (n_characters < ngram_unit && !break_by_blank) {
     status |= GRN_TOKEN_UNMATURED;
   }
   tokenizer->is_overlapping = (n_characters > 1);
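
Two details in the diff above are easy to miss. char_types is indexed per normalized character while tokenizer->next walks bytes, which appears to be why the new nth_char counter exists: it keeps the two cursors aligned, and is set to 2 when GET mode strips a leading "\A" (two characters). And a token cut short by a blank ("d" in the new test) is complete at its position rather than a truncated n-gram, which is why the last hunk stops flagging it GRN_TOKEN_UNMATURED.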

  Added: test/command/suite/tokenizers/regexp/add/normalizer/blank.expected (+52 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/add/normalizer/blank.expected    2015-05-09 23:14:59 +0900 (bd9b53e)
@@ -0,0 +1,52 @@
+table_create Lexicon TABLE_PAT_KEY ShortText   --default_tokenizer TokenRegexp   --normalizer NormalizerAuto
+[[0,0.0,0.0],true]
+table_tokenize Lexicon "abcd\nefgh" --mode ADD
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "￯",
+      "position": 0
+    },
+    {
+      "value": "ab",
+      "position": 1
+    },
+    {
+      "value": "bc",
+      "position": 2
+    },
+    {
+      "value": "cd",
+      "position": 3
+    },
+    {
+      "value": "d",
+      "position": 4
+    },
+    {
+      "value": "ef",
+      "position": 5
+    },
+    {
+      "value": "fg",
+      "position": 6
+    },
+    {
+      "value": "gh",
+      "position": 7
+    },
+    {
+      "value": "h",
+      "position": 8
+    },
+    {
+      "value": "￰",
+      "position": 9
+    }
+  ]
+]
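
In the expected output above, "￯" (U+FFEF) and "￰" (U+FFF0) are TokenRegexp's begin-of-text and end-of-text marks. The tokens to watch are "d" (position 4) and "h" (position 8): NormalizerAuto removes the "\n" and flags the preceding "d" as blank, so the bigram run breaks there instead of emitting a "de" bigram that would cross the original line break.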

  Added: test/command/suite/tokenizers/regexp/add/normalizer/blank.test (+4 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/add/normalizer/blank.test    2015-05-09 23:14:59 +0900 (194183f)
@@ -0,0 +1,4 @@
+table_create Lexicon TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenRegexp \
+  --normalizer NormalizerAuto
+table_tokenize Lexicon "abcd\nefgh" --mode ADD
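
The .test file drives the scenario (the trailing backslashes are line continuations, joined when the command runs), while the .expected file records the output that the command-level test harness compares against.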