[Groonga-commit] groonga/groonga at 4227a6c [master] TokenNgram: add "loose_blank" option

Back to archive index

Kouhei Sutou null+****@clear*****
Fri Apr 6 15:55:32 JST 2018


Kouhei Sutou	2018-04-06 15:55:32 +0900 (Fri, 06 Apr 2018)

  New Revision: 4227a6cde35090d8e2f8766dd2e26b82196f1b1d
  https://github.com/groonga/groonga/commit/4227a6cde35090d8e2f8766dd2e26b82196f1b1d

  Message:
    TokenNgram: add "loose_blank" option

  Added files:
    test/command/suite/table_create/default_tokenizer/ngram/options/loose.expected
    test/command/suite/table_create/default_tokenizer/ngram/options/loose.test
    test/command/suite/tokenizers/ngram/loose_blank/add.expected
    test/command/suite/tokenizers/ngram/loose_blank/add.test
    test/command/suite/tokenizers/ngram/loose_blank/get.expected
    test/command/suite/tokenizers/ngram/loose_blank/get.test
  Modified files:
    lib/tokenizers.c

  Modified: lib/tokenizers.c (+31 -9)
===================================================================
--- lib/tokenizers.c    2018-04-06 15:55:03 +0900 (405ffba5a)
+++ lib/tokenizers.c    2018-04-06 15:55:32 +0900 (b9f264739)
@@ -248,6 +248,7 @@ typedef struct {
   grn_bool ignore_blank;
   grn_bool remove_blank;
   grn_bool loose_symbol;
+  grn_bool loose_blank;
 } grn_ngram_options;
 
 typedef struct {
@@ -281,6 +282,7 @@ ngram_options_init(grn_ngram_options *options, uint8_t unit)
   options->ignore_blank = GRN_FALSE;
   options->remove_blank = grn_ngram_tokenizer_remove_blank_enable;
   options->loose_symbol = GRN_FALSE;
+  options->loose_blank = GRN_FALSE;
 }
 
 static void
@@ -321,10 +323,16 @@ ngram_switch_to_loose_mode(grn_ctx *ctx,
       if (length == 0) {
         break;
       }
-      if (!(tokenizer->options.loose_symbol &&
-            GRN_STR_CTYPE(*types) == GRN_CHAR_SYMBOL)) {
+      if (!((tokenizer->options.loose_symbol &&
+             GRN_STR_CTYPE(*types) == GRN_CHAR_SYMBOL) ||
+            (!tokenizer->options.remove_blank &&
+             tokenizer->options.loose_blank &&
+             GRN_STR_ISBLANK(*types)))) {
         GRN_TEXT_PUT(ctx, &(tokenizer->loose.text), normalized, length);
         *loose_types = *types;
+        if (tokenizer->options.loose_blank && GRN_STR_ISBLANK(*types)) {
+          *loose_types &= ~GRN_STR_BLANK;
+        }
         loose_types++;
       }
       normalized += length;
@@ -540,6 +548,11 @@ ngram_open_options(grn_ctx *ctx,
                                                           raw_options,
                                                           i,
                                                           options->loose_symbol);
+    } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "loose_blank")) {
+      options->loose_blank = grn_vector_get_element_bool(ctx,
+                                                          raw_options,
+                                                          i,
+                                                          options->loose_blank);
     }
   } GRN_OPTION_VALUES_EACH_END();
 
@@ -592,13 +605,18 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
     return NULL;
   }
 
-  if (cp &&
-      !tokenizer->loose.ing &&
-      !tokenizer->loose.need &&
-      tokenizer->options.loose_symbol &&
-      GRN_STR_CTYPE(*cp) == GRN_CHAR_SYMBOL) {
-    tokenizer->loose.need = GRN_TRUE;
-  }
+#define LOOSE_NEED_CHECK(cp, tokenizer) do {                            \
+    if (cp &&                                                           \
+        !tokenizer->loose.ing &&                                        \
+        !tokenizer->loose.need &&                                       \
+        ((tokenizer->options.loose_symbol &&                            \
+          GRN_STR_CTYPE(*cp) == GRN_CHAR_SYMBOL) ||                     \
+         (tokenizer->options.loose_blank && GRN_STR_ISBLANK(*cp)))) {   \
+      tokenizer->loose.need = GRN_TRUE;                                 \
+    }                                                                   \
+  } while (GRN_FALSE)
+
+  LOOSE_NEED_CHECK(cp, tokenizer);
 
   if (cp && tokenizer->options.uni_alpha &&
       GRN_STR_CTYPE(*cp) == GRN_CHAR_ALPHA) {
@@ -606,6 +624,7 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
                               tokenizer->query->encoding))) {
       len++;
       r += cl;
+      LOOSE_NEED_CHECK(cp, tokenizer);
       if (/* !tokenizer->options.ignore_blank && */ GRN_STR_ISBLANK(*cp)) { break; }
       if (GRN_STR_CTYPE(*++cp) != GRN_CHAR_ALPHA) { break; }
     }
@@ -618,6 +637,7 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
                               tokenizer->query->encoding))) {
       len++;
       r += cl;
+      LOOSE_NEED_CHECK(cp, tokenizer);
       if (/* !tokenizer->options.ignore_blank && */ GRN_STR_ISBLANK(*cp)) { break; }
       if (GRN_STR_CTYPE(*++cp) != GRN_CHAR_DIGIT) { break; }
     }
@@ -630,6 +650,7 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
                               tokenizer->query->encoding))) {
       len++;
       r += cl;
+      LOOSE_NEED_CHECK(cp, tokenizer);
       if (!tokenizer->options.ignore_blank && GRN_STR_ISBLANK(*cp)) { break; }
       if (GRN_STR_CTYPE(*++cp) != GRN_CHAR_SYMBOL) { break; }
     }
@@ -664,6 +685,7 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
              (cl = grn_charlen_(ctx, (char *)r, (char *)e,
                                 tokenizer->query->encoding))) {
         if (cp) {
+          LOOSE_NEED_CHECK(cp, tokenizer);
           if (!tokenizer->options.ignore_blank && GRN_STR_ISBLANK(*cp)) { break; }
           cp++;
           if ((tokenizer->options.uni_alpha &&

  Added: test/command/suite/table_create/default_tokenizer/ngram/options/loose.expected (+317 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_create/default_tokenizer/ngram/options/loose.expected    2018-04-06 15:55:32 +0900 (fc395e84e)
@@ -0,0 +1,317 @@
+table_create Memos TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Memos tel COLUMN_SCALAR ShortText
+[[0,0.0,0.0],true]
+table_create Terms TABLE_PAT_KEY ShortText   --default_tokenizer 'TokenNgram("loose_symbol", true, "loose_blank", true)'   --normalizer NormalizerAuto
+[[0,0.0,0.0],true]
+column_create Terms memos_tel COLUMN_INDEX Memos tel
+[[0,0.0,0.0],true]
+load --table Memos
+[
+{"tel": "03-5632-7432"},
+{"tel": "03 5632 7432"},
+{"tel": "(03)5632-7432"},
+{"tel": "0356327432"},
+{"tel": "03-7432-5632"}
+]
+[[0,0.0,0.0],5]
+select Terms --output_columns _key --limit -1
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        9
+      ],
+      [
+        [
+          "_key",
+          "ShortText"
+        ]
+      ],
+      [
+        "("
+      ],
+      [
+        ")"
+      ],
+      [
+        "-"
+      ],
+      [
+        "03"
+      ],
+      [
+        "0356327432"
+      ],
+      [
+        "0374325632"
+      ],
+      [
+        "5632"
+      ],
+      [
+        "7432"
+      ],
+      [
+        "￰"
+      ]
+    ]
+  ]
+]
+select Memos --match_columns tel --query '0356327432'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        4
+      ],
+      [
+        [
+          "_id",
+          "UInt32"
+        ],
+        [
+          "tel",
+          "ShortText"
+        ]
+      ],
+      [
+        1,
+        "03-5632-7432"
+      ],
+      [
+        2,
+        "03 5632 7432"
+      ],
+      [
+        3,
+        "(03)5632-7432"
+      ],
+      [
+        4,
+        "0356327432"
+      ]
+    ]
+  ]
+]
+select Memos --match_columns tel --query '"03-5632-7432"'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        4
+      ],
+      [
+        [
+          "_id",
+          "UInt32"
+        ],
+        [
+          "tel",
+          "ShortText"
+        ]
+      ],
+      [
+        1,
+        "03-5632-7432"
+      ],
+      [
+        2,
+        "03 5632 7432"
+      ],
+      [
+        3,
+        "(03)5632-7432"
+      ],
+      [
+        4,
+        "0356327432"
+      ]
+    ]
+  ]
+]
+select Memos --match_columns tel --query '"03 5632 7432"'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        4
+      ],
+      [
+        [
+          "_id",
+          "UInt32"
+        ],
+        [
+          "tel",
+          "ShortText"
+        ]
+      ],
+      [
+        1,
+        "03-5632-7432"
+      ],
+      [
+        2,
+        "03 5632 7432"
+      ],
+      [
+        3,
+        "(03)5632-7432"
+      ],
+      [
+        4,
+        "0356327432"
+      ]
+    ]
+  ]
+]
+select Memos --match_columns tel --query '5632'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        4
+      ],
+      [
+        [
+          "_id",
+          "UInt32"
+        ],
+        [
+          "tel",
+          "ShortText"
+        ]
+      ],
+      [
+        1,
+        "03-5632-7432"
+      ],
+      [
+        2,
+        "03 5632 7432"
+      ],
+      [
+        3,
+        "(03)5632-7432"
+      ],
+      [
+        5,
+        "03-7432-5632"
+      ]
+    ]
+  ]
+]
+select Memos --match_columns tel --query '32'
+[[0,0.0,0.0],[[[0],[["_id","UInt32"],["tel","ShortText"]]]]]
+select Memos --match_columns tel --query '0'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        5
+      ],
+      [
+        [
+          "_id",
+          "UInt32"
+        ],
+        [
+          "tel",
+          "ShortText"
+        ]
+      ],
+      [
+        1,
+        "03-5632-7432"
+      ],
+      [
+        2,
+        "03 5632 7432"
+      ],
+      [
+        3,
+        "(03)5632-7432"
+      ],
+      [
+        4,
+        "0356327432"
+      ],
+      [
+        5,
+        "03-7432-5632"
+      ]
+    ]
+  ]
+]
+select Memos --match_columns tel --query '03'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        4
+      ],
+      [
+        [
+          "_id",
+          "UInt32"
+        ],
+        [
+          "tel",
+          "ShortText"
+        ]
+      ],
+      [
+        1,
+        "03-5632-7432"
+      ],
+      [
+        2,
+        "03 5632 7432"
+      ],
+      [
+        3,
+        "(03)5632-7432"
+      ],
+      [
+        5,
+        "03-7432-5632"
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/table_create/default_tokenizer/ngram/options/loose.test (+26 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_create/default_tokenizer/ngram/options/loose.test    2018-04-06 15:55:32 +0900 (266ae166a)
@@ -0,0 +1,26 @@
+table_create Memos TABLE_NO_KEY
+column_create Memos tel COLUMN_SCALAR ShortText
+
+table_create Terms TABLE_PAT_KEY ShortText \
+  --default_tokenizer 'TokenNgram("loose_symbol", true, "loose_blank", true)' \
+  --normalizer NormalizerAuto
+column_create Terms memos_tel COLUMN_INDEX Memos tel
+
+load --table Memos
+[
+{"tel": "03-5632-7432"},
+{"tel": "03 5632 7432"},
+{"tel": "(03)5632-7432"},
+{"tel": "0356327432"},
+{"tel": "03-7432-5632"}
+]
+
+select Terms --output_columns _key --limit -1
+
+select Memos --match_columns tel --query '0356327432'
+select Memos --match_columns tel --query '"03-5632-7432"'
+select Memos --match_columns tel --query '"03 5632 7432"'
+select Memos --match_columns tel --query '5632'
+select Memos --match_columns tel --query '32'
+select Memos --match_columns tel --query '0'
+select Memos --match_columns tel --query '03'

  Added: test/command/suite/tokenizers/ngram/loose_blank/add.expected (+35 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/ngram/loose_blank/add.expected    2018-04-06 15:55:32 +0900 (e3e668ae7)
@@ -0,0 +1,35 @@
+tokenize   'TokenNgram("loose_blank", true)'   "090 1234 5678"   NormalizerAuto   --mode ADD
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "090",
+      "position": 0,
+      "force_prefix": false
+    },
+    {
+      "value": "1234",
+      "position": 1,
+      "force_prefix": false
+    },
+    {
+      "value": "5678",
+      "position": 2,
+      "force_prefix": false
+    },
+    {
+      "value": "￰",
+      "position": 3,
+      "force_prefix": false
+    },
+    {
+      "value": "09012345678",
+      "position": 4,
+      "force_prefix": false
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/ngram/loose_blank/add.test (+5 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/ngram/loose_blank/add.test    2018-04-06 15:55:32 +0900 (57310f54b)
@@ -0,0 +1,5 @@
+tokenize \
+  'TokenNgram("loose_blank", true)' \
+  "090 1234 5678" \
+  NormalizerAuto \
+  --mode ADD

  Added: test/command/suite/tokenizers/ngram/loose_blank/get.expected (+2 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/ngram/loose_blank/get.expected    2018-04-06 15:55:32 +0900 (069d4e464)
@@ -0,0 +1,2 @@
+tokenize   'TokenNgram("loose_blank", true)'   "090 1234 5678"   NormalizerAuto   --mode GET
+[[0,0.0,0.0],[{"value":"09012345678","position":0,"force_prefix":false}]]

  Added: test/command/suite/tokenizers/ngram/loose_blank/get.test (+5 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/ngram/loose_blank/get.test    2018-04-06 15:55:32 +0900 (2c0fee207)
@@ -0,0 +1,5 @@
+tokenize \
+  'TokenNgram("loose_blank", true)' \
+  "090 1234 5678" \
+  NormalizerAuto \
+  --mode GET
-------------- next part --------------
HTMLの添付ファイルを保管しました...
URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180406/1b5528a7/attachment-0001.htm 



More information about the Groonga-commit mailing list
Back to archive index