[Groonga-commit] groonga/groonga at 8d7f275 [master] tokenize: support token filters


naoa null+****@clear*****
Sun Oct 26 17:24:38 JST 2014


naoa	2014-10-26 17:24:38 +0900 (Sun, 26 Oct 2014)

  New Revision: 8d7f275527588aeaa932666ef6df3cd18d82f41c
  https://github.com/groonga/groonga/commit/8d7f275527588aeaa932666ef6df3cd18d82f41c

  Merged ff11b0c: Merge pull request #232 from naoa/tokenize-support-token_filters

  Message:
    tokenize: support token filters
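
    For context, this commit lets the tokenize command take token filters into
    account in two ways: the new token_filters parameter attaches filters to the
    temporary lexicon that tokenize creates, and the new table parameter reuses
    an existing lexicon table such as Terms in the test added below, so that
    table's token filters apply. A minimal sketch of the new invocations,
    assuming token_filters/stop_word is registered and Terms is set up as in
    that test; illustrative only, not part of the commit itself:

      # Reuse an existing lexicon table; TokenFilterStopWord attached to
      # Terms is applied (this is the case the new test covers).
      tokenize TokenBigram "Hello and Good-bye" --mode GET --table Terms

      # Attach token filters to the temporary lexicon created by tokenize.
      # TokenFilterStopWord normally relies on an is_stop_word column on the
      # lexicon, so this form mainly shows the parameter syntax.
      tokenize TokenBigram "Hello and Good-bye" NormalizerAuto \
        --token_filters TokenFilterStopWord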

  Added files:
    test/command/suite/tokenize/with_token_filters.expected
    test/command/suite/tokenize/with_token_filters.test
  Modified files:
    lib/proc.c

  Modified: lib/proc.c (+35 -17)
===================================================================
--- lib/proc.c    2014-10-26 15:05:34 +0900 (b0b54fe)
+++ lib/proc.c    2014-10-26 17:24:38 +0900 (15948b6)
@@ -3400,7 +3400,7 @@ typedef struct {
 } tokenize_token;
 
 static void
-output_tokens(grn_ctx *ctx, grn_obj *tokens, grn_hash *lexicon)
+output_tokens(grn_ctx *ctx, grn_obj *tokens, grn_obj *lexicon)
 {
   int i, n_tokens;
 
@@ -3416,8 +3416,8 @@ output_tokens(grn_ctx *ctx, grn_obj *tokens, grn_hash *lexicon)
     GRN_OUTPUT_MAP_OPEN("TOKEN", 2);
 
     GRN_OUTPUT_CSTR("value");
-    value_size = grn_hash_get_key(ctx, lexicon, token->id,
-                                  value, GRN_TABLE_MAX_KEY_SIZE);
+    value_size = grn_table_get_key(ctx, lexicon, token->id,
+                                   value, GRN_TABLE_MAX_KEY_SIZE);
     GRN_OUTPUT_STR(value, value_size);
 
     GRN_OUTPUT_CSTR("position");
@@ -3428,12 +3428,13 @@ output_tokens(grn_ctx *ctx, grn_obj *tokens, grn_hash *lexicon)
   GRN_OUTPUT_ARRAY_CLOSE();
 }
 
-static grn_hash *
+static grn_obj *
 create_lexicon_for_tokenize(grn_ctx *ctx,
                             grn_obj *tokenizer_name,
-                            grn_obj *normalizer_name)
+                            grn_obj *normalizer_name,
+                            grn_obj *token_filter_names)
 {
-  grn_hash *lexicon;
+  grn_obj *lexicon;
   grn_obj *tokenizer;
   grn_obj *normalizer = NULL;
 
@@ -3489,28 +3490,32 @@ create_lexicon_for_tokenize(grn_ctx *ctx,
     }
   }
 
-  lexicon = grn_hash_create(ctx, NULL, GRN_TABLE_MAX_KEY_SIZE, 0,
-                            GRN_OBJ_TABLE_HASH_KEY | GRN_OBJ_KEY_VAR_SIZE);
-  grn_obj_set_info(ctx, (grn_obj *)lexicon,
+  lexicon = grn_table_create(ctx, NULL, 0,
+                             NULL,
+                             GRN_OBJ_TABLE_HASH_KEY,
+                             grn_ctx_at(ctx, GRN_DB_SHORT_TEXT),
+                             NULL);
+  grn_obj_set_info(ctx, lexicon,
                    GRN_INFO_DEFAULT_TOKENIZER, tokenizer);
   grn_obj_unlink(ctx, tokenizer);
   if (normalizer) {
-    grn_obj_set_info(ctx, (grn_obj *)lexicon,
+    grn_obj_set_info(ctx, lexicon,
                      GRN_INFO_NORMALIZER, normalizer);
     grn_obj_unlink(ctx, normalizer);
   }
+  proc_table_create_set_token_filters(ctx, lexicon, token_filter_names);
 
   return lexicon;
 }
 
 static void
-tokenize(grn_ctx *ctx, grn_hash *lexicon, grn_obj *string, grn_token_mode mode,
+tokenize(grn_ctx *ctx, grn_obj *lexicon, grn_obj *string, grn_token_mode mode,
          unsigned int flags, grn_obj *tokens)
 {
   grn_token_cursor *token_cursor;
 
   token_cursor =
-    grn_token_cursor_open(ctx, (grn_obj *)lexicon,
+    grn_token_cursor_open(ctx, lexicon,
                           GRN_TEXT_VALUE(string), GRN_TEXT_LEN(string),
                           mode, flags);
   if (!token_cursor) {
@@ -3539,12 +3544,16 @@ proc_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   grn_obj *normalizer_name;
   grn_obj *flag_names;
   grn_obj *mode_name;
+  grn_obj *token_filter_names;
+  grn_obj *table_name;
 
   tokenizer_name = VAR(0);
   string = VAR(1);
   normalizer_name = VAR(2);
   flag_names = VAR(3);
   mode_name = VAR(4);
+  token_filter_names = VAR(5);
+  table_name = VAR(6);
 
   if (GRN_TEXT_LEN(tokenizer_name) == 0) {
     ERR(GRN_INVALID_ARGUMENT, "[tokenize] tokenizer name is missing");
@@ -3558,14 +3567,21 @@ proc_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 
   {
     unsigned int flags;
-    grn_hash *lexicon;
+    grn_obj *lexicon;
 
     flags = parse_tokenize_flags(ctx, flag_names);
     if (ctx->rc != GRN_SUCCESS) {
       return NULL;
     }
 
-    lexicon = create_lexicon_for_tokenize(ctx, tokenizer_name, normalizer_name);
+    if (GRN_TEXT_LEN(table_name)) {
+      lexicon = grn_ctx_get(ctx, GRN_TEXT_VALUE(table_name), GRN_TEXT_LEN(table_name));
+    } else {
+      lexicon = create_lexicon_for_tokenize(ctx,
+                                            tokenizer_name,
+                                            normalizer_name,
+                                            token_filter_names);
+    }
     if (!lexicon) {
       return NULL;
     }
@@ -3584,7 +3600,7 @@ proc_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
       {
         grn_token_cursor *token_cursor;
         token_cursor =
-          grn_token_cursor_open(ctx, (grn_obj *)lexicon,
+          grn_token_cursor_open(ctx, lexicon,
                                 GRN_TEXT_VALUE(string), GRN_TEXT_LEN(string),
                                 GRN_TOKEN_ADD, flags);
         if (token_cursor) {
@@ -3608,7 +3624,7 @@ proc_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
     }
 #undef MODE_NAME_EQUAL
 
-    grn_hash_close(ctx, lexicon);
+    grn_obj_unlink(ctx, lexicon);
   }
 
   return NULL;
@@ -5482,7 +5498,9 @@ grn_db_init_builtin_query(grn_ctx *ctx)
   DEF_VAR(vars[2], "normalizer");
   DEF_VAR(vars[3], "flags");
   DEF_VAR(vars[4], "mode");
-  DEF_COMMAND("tokenize", proc_tokenize, 5, vars);
+  DEF_VAR(vars[5], "token_filters");
+  DEF_VAR(vars[6], "table");
+  DEF_COMMAND("tokenize", proc_tokenize, 7, vars);
 
   DEF_COMMAND("tokenizer_list", proc_tokenizer_list, 0, vars);
 

  Added: test/command/suite/tokenize/with_token_filters.expected (+37 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenize/with_token_filters.expected    2014-10-26 17:24:38 +0900 (87c7367)
@@ -0,0 +1,37 @@
+register token_filters/stop_word
+[[0,0.0,0.0],true]
+table_create Terms TABLE_PAT_KEY ShortText   --default_tokenizer TokenBigram   --normalizer NormalizerAuto   --token_filters TokenFilterStopWord
+[[0,0.0,0.0],true]
+column_create Terms is_stop_word COLUMN_SCALAR Bool
+[[0,0.0,0.0],true]
+load --table Terms
+[
+{"_key": "and", "is_stop_word": true}
+]
+[[0,0.0,0.0],1]
+tokenize TokenBigram "Hello and Good-bye" --mode GET --table Terms
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "hello",
+      "position": 0
+    },
+    {
+      "value": "good",
+      "position": 2
+    },
+    {
+      "value": "-",
+      "position": 3
+    },
+    {
+      "value": "bye",
+      "position": 4
+    }
+  ]
+]

  Added: test/command/suite/tokenize/with_token_filters.test (+14 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenize/with_token_filters.test    2014-10-26 17:24:38 +0900 (19a85a5)
@@ -0,0 +1,14 @@
+register token_filters/stop_word
+
+table_create Terms TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenBigram \
+  --normalizer NormalizerAuto \
+  --token_filters TokenFilterStopWord
+column_create Terms is_stop_word COLUMN_SCALAR Bool
+
+load --table Terms
+[
+{"_key": "and", "is_stop_word": true}
+]
+
+tokenize TokenBigram "Hello and Good-bye" --mode GET --table Terms
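
For reference, the expected output above omits the "and" token (position 1)
because it is loaded into Terms with is_stop_word true and TokenFilterStopWord
is attached to that table; the remaining tokens keep their original positions
(0, 2, 3, 4). For contrast, a sketch of the same call without --table, which is
not covered by this commit's tests, so whether "and" shows up at position 1 is
presumed rather than verified here:

  tokenize TokenBigram "Hello and Good-bye" NormalizerAuto --mode GET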