[Groonga-commit] groonga/groonga at 4a6f981 [master] tokenize: support mode

Back to archive index

naoa null+****@clear*****
Thu Aug 14 16:35:36 JST 2014


naoa	2014-08-14 16:35:36 +0900 (Thu, 14 Aug 2014)

  New Revision: 4a6f981ceb908a571c07ff4d9c7f85208774f8f9
  https://github.com/groonga/groonga/commit/4a6f981ceb908a571c07ff4d9c7f85208774f8f9

  Merged 46e2243: Merge pull request #191 from naoa/tokenize-support-mode

  Message:
    tokenize: support mode

  Added files:
    test/command/suite/tokenize/add_mode.expected
    test/command/suite/tokenize/add_mode.test
    test/command/suite/tokenize/get_mode.expected
    test/command/suite/tokenize/get_mode.test
    test/command/suite/tokenize/invalid/mode/unknown_mode.expected
    test/command/suite/tokenize/invalid/mode/unknown_mode.test
  Modified files:
    lib/proc.c

  Modified: lib/proc.c (+29 -9)
===================================================================
--- lib/proc.c    2014-08-13 12:17:09 +0900 (ad5b20b)
+++ lib/proc.c    2014-08-14 16:35:36 +0900 (d7d97fe)
@@ -3274,14 +3274,14 @@ create_lexicon_for_tokenize(grn_ctx *ctx,
 }
 
 static void
-tokenize(grn_ctx *ctx, grn_hash *lexicon, grn_obj *string, unsigned int flags,
-         grn_obj *tokens)
+tokenize(grn_ctx *ctx, grn_hash *lexicon, grn_obj *string, grn_token_mode mode,
+         unsigned int flags, grn_obj *tokens)
 {
   grn_token *token;
 
   token = grn_token_open(ctx, (grn_obj *)lexicon,
                          GRN_TEXT_VALUE(string), GRN_TEXT_LEN(string),
-                         GRN_TOKEN_ADD, flags);
+                         mode, flags);
   if (!token) {
     return;
   }
@@ -3307,11 +3307,13 @@ proc_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   grn_obj *string;
   grn_obj *normalizer_name;
   grn_obj *flag_names;
+  grn_obj *mode_name;
 
   tokenizer_name = VAR(0);
   string = VAR(1);
   normalizer_name = VAR(2);
   flag_names = VAR(3);
+  mode_name = VAR(4);
 
   if (GRN_TEXT_LEN(tokenizer_name) == 0) {
     ERR(GRN_INVALID_ARGUMENT, "[tokenize] tokenizer name is missing");
@@ -3328,7 +3330,6 @@ proc_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   {
     unsigned int flags;
     grn_hash *lexicon;
-    grn_obj tokens;
 
     flags = parse_tokenize_flags(ctx, flag_names);
     if (ctx->rc != GRN_SUCCESS) {
@@ -3342,10 +3343,28 @@ proc_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
       return NULL;
     }
 
-    GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
-    tokenize(ctx, lexicon, string, flags, &tokens);
-    output_tokens(ctx, &tokens, lexicon);
-    GRN_OBJ_FIN(ctx, &tokens);
+    if (GRN_TEXT_LEN(mode_name) == 0 ||
+        !memcmp(GRN_TEXT_VALUE(mode_name), "ADD", 3)) {
+      grn_obj add_tokens;
+      GRN_VALUE_FIX_SIZE_INIT(&add_tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
+      tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &add_tokens);
+      output_tokens(ctx, &add_tokens, lexicon);
+      GRN_OBJ_FIN(ctx, &add_tokens);
+    } else if (!memcmp(GRN_TEXT_VALUE(mode_name), "GET", 3)) {
+      grn_obj add_tokens;
+      grn_obj get_tokens;
+      GRN_VALUE_FIX_SIZE_INIT(&add_tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
+      GRN_VALUE_FIX_SIZE_INIT(&get_tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
+      tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &add_tokens);
+      tokenize(ctx, lexicon, string, GRN_TOKEN_GET, flags, &get_tokens);
+      output_tokens(ctx, &get_tokens, lexicon);
+      GRN_OBJ_FIN(ctx, &add_tokens);
+      GRN_OBJ_FIN(ctx, &get_tokens);
+    } else {
+      ERR(GRN_INVALID_ARGUMENT, "[tokenize] invalid mode: <%.*s>",
+          (int)GRN_TEXT_LEN(mode_name), GRN_TEXT_VALUE(mode_name));
+      output_tokens(ctx, NULL, NULL);
+    }
 
     grn_hash_close(ctx, lexicon);
   }
@@ -5128,7 +5147,8 @@ grn_db_init_builtin_query(grn_ctx *ctx)
   DEF_VAR(vars[1], "string");
   DEF_VAR(vars[2], "normalizer");
   DEF_VAR(vars[3], "flags");
-  DEF_COMMAND("tokenize", proc_tokenize, 4, vars);
+  DEF_VAR(vars[4], "mode");
+  DEF_COMMAND("tokenize", proc_tokenize, 5, vars);
 
   DEF_COMMAND("tokenizer_list", proc_tokenizer_list, 0, vars);
 

  Added: test/command/suite/tokenize/add_mode.expected (+30 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenize/add_mode.expected    2014-08-14 16:35:36 +0900 (3fdbed8)
@@ -0,0 +1,30 @@
+tokenize TokenBigram "あいabアイ" NormalizerAuto NONE ADD
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "あい",
+      "position": 0
+    },
+    {
+      "value": "い",
+      "position": 1
+    },
+    {
+      "value": "ab",
+      "position": 2
+    },
+    {
+      "value": "アイ",
+      "position": 3
+    },
+    {
+      "value": "イ",
+      "position": 4
+    }
+  ]
+]

  Added: test/command/suite/tokenize/add_mode.test (+1 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenize/add_mode.test    2014-08-14 16:35:36 +0900 (3e5089b)
@@ -0,0 +1 @@
+tokenize TokenBigram "あいabアイ" NormalizerAuto NONE ADD

  Added: test/command/suite/tokenize/get_mode.expected (+22 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenize/get_mode.expected    2014-08-14 16:35:36 +0900 (bc06ba9)
@@ -0,0 +1,22 @@
+tokenize TokenBigram "あいabアイ" NormalizerAuto NONE GET
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "あい",
+      "position": 0
+    },
+    {
+      "value": "ab",
+      "position": 2
+    },
+    {
+      "value": "アイ",
+      "position": 3
+    }
+  ]
+]

  Added: test/command/suite/tokenize/get_mode.test (+1 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenize/get_mode.test    2014-08-14 16:35:36 +0900 (2b08416)
@@ -0,0 +1 @@
+tokenize TokenBigram "あいabアイ" NormalizerAuto NONE GET

  Added: test/command/suite/tokenize/invalid/mode/unknown_mode.expected (+3 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenize/invalid/mode/unknown_mode.expected    2014-08-14 16:35:36 +0900 (37f8645)
@@ -0,0 +1,3 @@
+tokenize TokenBigram "あいabアイ" NormalizerAuto NONE UNKNOWN
+[[[-22,0.0,0.0],"[tokenize] invalid mode: <UNKNOWN>"],[]]
+#|e| [tokenize] invalid mode: <UNKNOWN>

  Added: test/command/suite/tokenize/invalid/mode/unknown_mode.test (+1 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenize/invalid/mode/unknown_mode.test    2014-08-14 16:35:36 +0900 (b4418af)
@@ -0,0 +1 @@
+tokenize TokenBigram "あいabアイ" NormalizerAuto NONE UNKNOWN
-------------- next part --------------
An HTML attachment was scrubbed...
Download 



More information about the Groonga-commit mailing list
Back to archive index