[Groonga-commit] groonga/groonga at 0b54c74 [master] Add table_tokenize command


naoa null+****@clear*****
Sun Oct 26 20:00:49 JST 2014


naoa	2014-10-26 20:00:49 +0900 (Sun, 26 Oct 2014)

  New Revision: 0b54c748fd84f59c7cee3f8443b63d7c2ee608b5
  https://github.com/groonga/groonga/commit/0b54c748fd84f59c7cee3f8443b63d7c2ee608b5

  Merged ff11b0c: Merge pull request #232 from naoa/tokenize-support-token_filters

  Message:
    Add table_tokenize command
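
    As the new tests below suggest, table_tokenize tokenizes a string using
    the tokenizer, normalizer, and token filters bound to the given table
    (its parameters are table, string, flags, and mode, per the DEF_VAR
    calls in this diff). A minimal usage sketch, taken from the add_mode
    test case added in this commit:

      table_create Terms TABLE_PAT_KEY ShortText \
        --default_tokenizer TokenBigram \
        --normalizer NormalizerAuto

      table_tokenize Terms "あいabアイ" --mode ADD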

  Added files:
    test/command/suite/table_tokenize/add_mode.expected
    test/command/suite/table_tokenize/add_mode.test
    test/command/suite/table_tokenize/flags.expected
    test/command/suite/table_tokenize/flags.test
    test/command/suite/table_tokenize/get_mode.expected
    test/command/suite/table_tokenize/get_mode.test
    test/command/suite/table_tokenize/with_normalizer.expected
    test/command/suite/table_tokenize/with_normalizer.test
    test/command/suite/table_tokenize/with_token_filters.expected
    test/command/suite/table_tokenize/with_token_filters.test
  Modified files:
    lib/proc.c

  Modified: lib/proc.c (+102 -26)
===================================================================
--- lib/proc.c    2014-10-26 18:27:34 +0900 (74b85d6)
+++ lib/proc.c    2014-10-26 20:00:49 +0900 (cab6995)
@@ -3536,6 +3536,42 @@ tokenize(grn_ctx *ctx, grn_obj *lexicon, grn_obj *string, grn_token_mode mode,
   grn_token_cursor_close(ctx, token_cursor);
 }
 
+static void
+add_tokenize(grn_ctx *ctx, grn_obj *lexicon, grn_obj *string, unsigned int flags)
+{
+  grn_obj tokens;
+  GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
+  tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
+  output_tokens(ctx, &tokens, lexicon);
+  GRN_OBJ_FIN(ctx, &tokens);
+}
+
+static void
+get_tokenize(grn_ctx *ctx, grn_obj *lexicon, grn_obj *string, unsigned int flags)
+{
+  {
+    grn_token_cursor *token_cursor;
+    token_cursor =
+      grn_token_cursor_open(ctx, lexicon,
+                            GRN_TEXT_VALUE(string), GRN_TEXT_LEN(string),
+                            GRN_TOKEN_ADD, flags);
+    if (token_cursor) {
+      while (token_cursor->status == GRN_TOKEN_DOING) {
+        grn_token_cursor_next(ctx, token_cursor);
+      }
+      grn_token_cursor_close(ctx, token_cursor);
+    }
+  }
+
+  {
+    grn_obj tokens;
+    GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
+    tokenize(ctx, lexicon, string, GRN_TOKEN_GET, flags, &tokens);
+    output_tokens(ctx, &tokens, lexicon);
+    GRN_OBJ_FIN(ctx, &tokens);
+  }
+}
+
 static grn_obj *
 proc_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 {
@@ -3585,33 +3621,9 @@ proc_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
      memcmp(GRN_TEXT_VALUE(mode_name), name, strlen(name)) == 0)
 
     if (GRN_TEXT_LEN(mode_name) == 0 || MODE_NAME_EQUAL("ADD")) {
-      grn_obj tokens;
-      GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
-      tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
-      output_tokens(ctx, &tokens, lexicon);
-      GRN_OBJ_FIN(ctx, &tokens);
+      add_tokenize(ctx, lexicon, string, flags);
     } else if (MODE_NAME_EQUAL("GET")) {
-      {
-        grn_token_cursor *token_cursor;
-        token_cursor =
-          grn_token_cursor_open(ctx, lexicon,
-                                GRN_TEXT_VALUE(string), GRN_TEXT_LEN(string),
-                                GRN_TOKEN_ADD, flags);
-        if (token_cursor) {
-          while (token_cursor->status == GRN_TOKEN_DOING) {
-            grn_token_cursor_next(ctx, token_cursor);
-          }
-          grn_token_cursor_close(ctx, token_cursor);
-        }
-      }
-
-      {
-        grn_obj tokens;
-        GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
-        tokenize(ctx, lexicon, string, GRN_TOKEN_GET, flags, &tokens);
-        output_tokens(ctx, &tokens, lexicon);
-        GRN_OBJ_FIN(ctx, &tokens);
-      }
+      get_tokenize(ctx, lexicon, string, flags);
     } else {
       ERR(GRN_INVALID_ARGUMENT, "[tokenize] invalid mode: <%.*s>",
           (int)GRN_TEXT_LEN(mode_name), GRN_TEXT_VALUE(mode_name));
@@ -3624,6 +3636,64 @@ proc_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   return NULL;
 }
 
+static grn_obj *
+proc_table_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
+{
+  grn_obj *table_name;
+  grn_obj *string;
+  grn_obj *flag_names;
+  grn_obj *mode_name;
+
+  table_name = VAR(0);
+  string = VAR(1);
+  flag_names = VAR(2);
+  mode_name = VAR(3);
+
+  if (GRN_TEXT_LEN(table_name) == 0) {
+    ERR(GRN_INVALID_ARGUMENT, "[table_tokenize] table name is missing");
+    return NULL;
+  }
+
+  if (GRN_TEXT_LEN(string) == 0) {
+    ERR(GRN_INVALID_ARGUMENT, "[table_tokenize] string is missing");
+    return NULL;
+  }
+
+  {
+    unsigned int flags;
+    grn_obj *lexicon;
+
+    flags = parse_tokenize_flags(ctx, flag_names);
+    if (ctx->rc != GRN_SUCCESS) {
+      return NULL;
+    }
+
+    lexicon = grn_ctx_get(ctx, GRN_TEXT_VALUE(table_name), GRN_TEXT_LEN(table_name));
+
+    if (!lexicon) {
+      return NULL;
+    }
+
+#define MODE_NAME_EQUAL(name)\
+    (GRN_TEXT_LEN(mode_name) == strlen(name) &&\
+     memcmp(GRN_TEXT_VALUE(mode_name), name, strlen(name)) == 0)
+
+    if (GRN_TEXT_LEN(mode_name) == 0 || MODE_NAME_EQUAL("ADD")) {
+      add_tokenize(ctx, lexicon, string, flags);
+    } else if (MODE_NAME_EQUAL("GET")) {
+      get_tokenize(ctx, lexicon, string, flags);
+    } else {
+      ERR(GRN_INVALID_ARGUMENT, "[table_tokenize] invalid mode: <%.*s>",
+          (int)GRN_TEXT_LEN(mode_name), GRN_TEXT_VALUE(mode_name));
+    }
+#undef MODE_NAME_EQUAL
+
+    grn_obj_unlink(ctx, lexicon);
+  }
+
+  return NULL;
+}
+
 static void
 list_proc(grn_ctx *ctx, grn_proc_type target_proc_type,
           const char *name, const char *plural_name)
@@ -5495,6 +5565,12 @@ grn_db_init_builtin_query(grn_ctx *ctx)
   DEF_VAR(vars[5], "token_filters");
   DEF_COMMAND("tokenize", proc_tokenize, 6, vars);
 
+  DEF_VAR(vars[0], "table");
+  DEF_VAR(vars[1], "string");
+  DEF_VAR(vars[2], "flags");
+  DEF_VAR(vars[3], "mode");
+  DEF_COMMAND("table_tokenize", proc_table_tokenize, 4, vars);
+
   DEF_COMMAND("tokenizer_list", proc_tokenizer_list, 0, vars);
 
   DEF_COMMAND("normalizer_list", proc_normalizer_list, 0, vars);

  Added: test/command/suite/table_tokenize/add_mode.expected (+32 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_tokenize/add_mode.expected    2014-10-26 20:00:49 +0900 (001b9b2)
@@ -0,0 +1,32 @@
+table_create Terms TABLE_PAT_KEY ShortText   --default_tokenizer TokenBigram   --normalizer NormalizerAuto
+[[0,0.0,0.0],true]
+table_tokenize Terms "あいabアイ" --mode ADD
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "あい",
+      "position": 0
+    },
+    {
+      "value": "い",
+      "position": 1
+    },
+    {
+      "value": "ab",
+      "position": 2
+    },
+    {
+      "value": "アイ",
+      "position": 3
+    },
+    {
+      "value": "イ",
+      "position": 4
+    }
+  ]
+]

  Added: test/command/suite/table_tokenize/add_mode.test (+5 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_tokenize/add_mode.test    2014-10-26 20:00:49 +0900 (8de0873)
@@ -0,0 +1,5 @@
+table_create Terms TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenBigram \
+  --normalizer NormalizerAuto
+
+table_tokenize Terms "あいabアイ" --mode ADD

  Added: test/command/suite/table_tokenize/flags.expected (+24 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_tokenize/flags.expected    2014-10-26 20:00:49 +0900 (20bb1c6)
@@ -0,0 +1,24 @@
+table_create Terms TABLE_PAT_KEY ShortText   --default_tokenizer TokenDelimit   --normalizer NormalizerAuto
+[[0,0.0,0.0],true]
+table_tokenize Terms "aB￾cDe 1￾23" ENABLE_TOKENIZED_DELIMITER
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "ab",
+      "position": 0
+    },
+    {
+      "value": "cde 1",
+      "position": 1
+    },
+    {
+      "value": "23",
+      "position": 2
+    }
+  ]
+]

  Added: test/command/suite/table_tokenize/flags.test (+5 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_tokenize/flags.test    2014-10-26 20:00:49 +0900 (e5da57a)
@@ -0,0 +1,5 @@
+table_create Terms TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenDelimit \
+  --normalizer NormalizerAuto
+
+table_tokenize Terms "aB￾cDe 1￾23" ENABLE_TOKENIZED_DELIMITER

  Added: test/command/suite/table_tokenize/get_mode.expected (+24 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_tokenize/get_mode.expected    2014-10-26 20:00:49 +0900 (774c69d)
@@ -0,0 +1,24 @@
+table_create Terms TABLE_PAT_KEY ShortText   --default_tokenizer TokenBigram   --normalizer NormalizerAuto
+[[0,0.0,0.0],true]
+table_tokenize Terms "あいabアイ" --mode GET
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "あい",
+      "position": 0
+    },
+    {
+      "value": "ab",
+      "position": 2
+    },
+    {
+      "value": "アイ",
+      "position": 3
+    }
+  ]
+]

  Added: test/command/suite/table_tokenize/get_mode.test (+5 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_tokenize/get_mode.test    2014-10-26 20:00:49 +0900 (529c395)
@@ -0,0 +1,5 @@
+table_create Terms TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenBigram \
+  --normalizer NormalizerAuto
+
+table_tokenize Terms "あいabアイ" --mode GET

  Added: test/command/suite/table_tokenize/with_normalizer.expected (+4 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_tokenize/with_normalizer.expected    2014-10-26 20:00:49 +0900 (ac20b02)
@@ -0,0 +1,4 @@
+table_create Terms TABLE_PAT_KEY ShortText   --default_tokenizer TokenBigram   --normalizer NormalizerAuto
+[[0,0.0,0.0],true]
+table_tokenize Terms "aBcDe 123"
+[[0,0.0,0.0],[{"value":"abcde","position":0},{"value":"123","position":1}]]

  Added: test/command/suite/table_tokenize/with_normalizer.test (+5 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_tokenize/with_normalizer.test    2014-10-26 20:00:49 +0900 (3cda398)
@@ -0,0 +1,5 @@
+table_create Terms TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenBigram \
+  --normalizer NormalizerAuto
+
+table_tokenize Terms "aBcDe 123"

  Added: test/command/suite/table_tokenize/with_token_filters.expected (+37 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_tokenize/with_token_filters.expected    2014-10-26 20:00:49 +0900 (7cfcda0)
@@ -0,0 +1,37 @@
+register token_filters/stop_word
+[[0,0.0,0.0],true]
+table_create Terms TABLE_PAT_KEY ShortText   --default_tokenizer TokenBigram   --normalizer NormalizerAuto   --token_filters TokenFilterStopWord
+[[0,0.0,0.0],true]
+column_create Terms is_stop_word COLUMN_SCALAR Bool
+[[0,0.0,0.0],true]
+load --table Terms
+[
+{"_key": "and", "is_stop_word": true}
+]
+[[0,0.0,0.0],1]
+table_tokenize Terms "Hello and Good-bye" --mode GET
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "hello",
+      "position": 0
+    },
+    {
+      "value": "good",
+      "position": 2
+    },
+    {
+      "value": "-",
+      "position": 3
+    },
+    {
+      "value": "bye",
+      "position": 4
+    }
+  ]
+]

  Added: test/command/suite/table_tokenize/with_token_filters.test (+14 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_tokenize/with_token_filters.test    2014-10-26 20:00:49 +0900 (80c5adb)
@@ -0,0 +1,14 @@
+register token_filters/stop_word
+
+table_create Terms TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenBigram \
+  --normalizer NormalizerAuto \
+  --token_filters TokenFilterStopWord
+column_create Terms is_stop_word COLUMN_SCALAR Bool
+
+load --table Terms
+[
+{"_key": "and", "is_stop_word": true}
+]
+
+table_tokenize Terms "Hello and Good-bye" --mode GET