[Groonga-commit] groonga/groonga at 32b2e82 [master] TokenNgram: add unify_alphabet option

Back to archive index

Kouhei Sutou null+****@clear*****
Thu Sep 20 17:11:09 JST 2018


Kouhei Sutou	2018-09-20 17:11:09 +0900 (Thu, 20 Sep 2018)

  Revision: 32b2e82cd29e89c0e26e6226d9baa8b2d8725a54
  https://github.com/groonga/groonga/commit/32b2e82cd29e89c0e26e6226d9baa8b2d8725a54

  Message:
    TokenNgram: add unify_alphabet option
    
    TokenNgram("unify_alphabet", false) behaves like a hypothetical
    TokenBigramSplitAlpha tokenizer (which does not exist ;p)

  Added files:
    test/command/suite/tokenizers/ngram/unify_alphabet.expected
    test/command/suite/tokenizers/ngram/unify_alphabet.test
  Modified files:
    lib/tokenizers.c

  Modified: lib/tokenizers.c (+30 -24)
===================================================================
--- lib/tokenizers.c    2018-09-20 14:11:38 +0900 (657b8fa71)
+++ lib/tokenizers.c    2018-09-20 17:11:09 +0900 (ca3037e5c)
@@ -260,9 +260,9 @@ static grn_bool grn_ngram_tokenizer_remove_blank_enable = GRN_TRUE;
 
 typedef struct {
   uint8_t unit;
-  grn_bool uni_alpha;
-  grn_bool uni_digit;
-  grn_bool uni_symbol;
+  grn_bool unify_alphabet;
+  grn_bool unify_digit;
+  grn_bool unify_symbol;
   grn_bool ignore_blank;
   grn_bool remove_blank;
   grn_bool loose_symbol;
@@ -302,9 +302,9 @@ static void
 ngram_options_init(grn_ngram_options *options, uint8_t unit)
 {
   options->unit = unit;
-  options->uni_alpha = GRN_TRUE;
-  options->uni_digit = GRN_TRUE;
-  options->uni_symbol = GRN_TRUE;
+  options->unify_alphabet = GRN_TRUE;
+  options->unify_digit = GRN_TRUE;
+  options->unify_symbol = GRN_TRUE;
   options->ignore_blank = GRN_FALSE;
   options->remove_blank = grn_ngram_tokenizer_remove_blank_enable;
   options->loose_symbol = GRN_FALSE;
@@ -607,7 +607,7 @@ bigrams_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 {
   grn_ngram_options options;
   ngram_options_init(&options, 2);
-  options.uni_symbol = GRN_FALSE;
+  options.unify_symbol = GRN_FALSE;
   return ngram_init_deprecated(ctx, nargs, args, user_data, &options);
 }
 
@@ -616,8 +616,8 @@ bigramsa_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 {
   grn_ngram_options options;
   ngram_options_init(&options, 2);
-  options.uni_symbol = GRN_FALSE;
-  options.uni_alpha = GRN_FALSE;
+  options.unify_symbol = GRN_FALSE;
+  options.unify_alphabet = GRN_FALSE;
   return ngram_init_deprecated(ctx, nargs, args, user_data, &options);
 }
 
@@ -626,9 +626,9 @@ bigramsad_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data
 {
   grn_ngram_options options;
   ngram_options_init(&options, 2);
-  options.uni_symbol = GRN_FALSE;
-  options.uni_alpha = GRN_FALSE;
-  options.uni_digit = GRN_FALSE;
+  options.unify_symbol = GRN_FALSE;
+  options.unify_alphabet = GRN_FALSE;
+  options.unify_digit = GRN_FALSE;
   return ngram_init_deprecated(ctx, nargs, args, user_data, &options);
 }
 
@@ -647,7 +647,7 @@ bigramis_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   grn_ngram_options options;
   ngram_options_init(&options, 2);
   options.ignore_blank = GRN_TRUE;
-  options.uni_symbol = GRN_FALSE;
+  options.unify_symbol = GRN_FALSE;
   return ngram_init_deprecated(ctx, nargs, args, user_data, &options);
 }
 
@@ -657,8 +657,8 @@ bigramisa_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data
   grn_ngram_options options;
   ngram_options_init(&options, 2);
   options.ignore_blank = GRN_TRUE;
-  options.uni_symbol = GRN_FALSE;
-  options.uni_alpha = GRN_FALSE;
+  options.unify_symbol = GRN_FALSE;
+  options.unify_alphabet = GRN_FALSE;
   return ngram_init_deprecated(ctx, nargs, args, user_data, &options);
 }
 
@@ -668,9 +668,9 @@ bigramisad_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_dat
   grn_ngram_options options;
   ngram_options_init(&options, 2);
   options.ignore_blank = GRN_TRUE;
-  options.uni_symbol = GRN_FALSE;
-  options.uni_alpha = GRN_FALSE;
-  options.uni_digit = GRN_FALSE;
+  options.unify_symbol = GRN_FALSE;
+  options.unify_alphabet = GRN_FALSE;
+  options.unify_digit = GRN_FALSE;
   return ngram_init_deprecated(ctx, nargs, args, user_data, &options);
 }
 
@@ -730,6 +730,12 @@ ngram_open_options(grn_ctx *ctx,
                                     raw_options,
                                     i,
                                     options->include_removed_source_location);
+    } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "unify_alphabet")) {
+      options->unify_alphabet =
+        grn_vector_get_element_bool(ctx,
+                                    raw_options,
+                                    i,
+                                    options->unify_alphabet);
     }
   } GRN_OPTION_VALUES_EACH_END();
 
@@ -815,7 +821,7 @@ ngram_next(grn_ctx *ctx,
 
   LOOSE_NEED_CHECK(cp, tokenizer);
 
-  if (cp && tokenizer->options.uni_alpha &&
+  if (cp && tokenizer->options.unify_alphabet &&
       GRN_STR_CTYPE(*cp) == GRN_CHAR_ALPHA) {
     while ((cl = grn_charlen_(ctx, (char *)r, (char *)e, encoding))) {
       n_characters++;
@@ -827,7 +833,7 @@ ngram_next(grn_ctx *ctx,
     tokenizer->next = r;
     tokenizer->overlap = GRN_FALSE;
   } else if (cp &&
-             tokenizer->options.uni_digit &&
+             tokenizer->options.unify_digit &&
              GRN_STR_CTYPE(*cp) == GRN_CHAR_DIGIT) {
     while ((cl = grn_charlen_(ctx, (char *)r, (char *)e, encoding))) {
       n_characters++;
@@ -839,7 +845,7 @@ ngram_next(grn_ctx *ctx,
     tokenizer->next = r;
     tokenizer->overlap = GRN_FALSE;
   } else if (cp &&
-             tokenizer->options.uni_symbol &&
+             tokenizer->options.unify_symbol &&
              GRN_STR_CTYPE(*cp) == GRN_CHAR_SYMBOL) {
     while ((cl = grn_charlen_(ctx, (char *)r, (char *)e, encoding))) {
       n_characters++;
@@ -880,11 +886,11 @@ ngram_next(grn_ctx *ctx,
           LOOSE_NEED_CHECK(cp, tokenizer);
           if (!tokenizer->options.ignore_blank && GRN_STR_ISBLANK(*cp)) { break; }
           cp++;
-          if ((tokenizer->options.uni_alpha &&
+          if ((tokenizer->options.unify_alphabet &&
                GRN_STR_CTYPE(*cp) == GRN_CHAR_ALPHA) ||
-              (tokenizer->options.uni_digit &&
+              (tokenizer->options.unify_digit &&
                GRN_STR_CTYPE(*cp) == GRN_CHAR_DIGIT) ||
-              (tokenizer->options.uni_symbol &&
+              (tokenizer->options.unify_symbol &&
                GRN_STR_CTYPE(*cp) == GRN_CHAR_SYMBOL)) {
             break;
           }

  Added: test/command/suite/tokenizers/ngram/unify_alphabet.expected (+35 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/ngram/unify_alphabet.expected    2018-09-20 17:11:09 +0900 (1e6d748eb)
@@ -0,0 +1,35 @@
+tokenize   'TokenNgram("unify_alphabet", false)'   "abcde"   NormalizerAuto
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "ab",
+      "position": 0,
+      "force_prefix": false
+    },
+    {
+      "value": "bc",
+      "position": 1,
+      "force_prefix": false
+    },
+    {
+      "value": "cd",
+      "position": 2,
+      "force_prefix": false
+    },
+    {
+      "value": "de",
+      "position": 3,
+      "force_prefix": false
+    },
+    {
+      "value": "e",
+      "position": 4,
+      "force_prefix": false
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/ngram/unify_alphabet.test (+4 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/ngram/unify_alphabet.test    2018-09-20 17:11:09 +0900 (b37dfdca6)
@@ -0,0 +1,4 @@
+tokenize \
+  'TokenNgram("unify_alphabet", false)' \
+  "abcde" \
+  NormalizerAuto
-------------- next part --------------
An HTML attachment was scrubbed...
URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180920/48aa8d43/attachment-0001.htm 



More information about the Groonga-commit mailing list
Back to archive index