[Groonga-commit] groonga/groonga at 9e75d2d [master] tokenize: Add force_prefix

Back to archive index

naoa null+****@clear*****
Thu May 21 21:04:13 JST 2015


naoa	2015-05-21 21:04:13 +0900 (Thu, 21 May 2015)

  New Revision: 9e75d2d48c30c9e906f66c5e3762bd4a84559707
  https://github.com/groonga/groonga/commit/9e75d2d48c30c9e906f66c5e3762bd4a84559707

  Merged 6c4e87d: Merge pull request #339 from naoa/add-force_prefix-to-tokenize

  Message:
    tokenize: Add force_prefix

  Added files:
    test/command/suite/tokenizers/bigram/force_prefix/multiple_tokens/matured.expected
    test/command/suite/tokenizers/bigram/force_prefix/multiple_tokens/matured.test
    test/command/suite/tokenizers/bigram/force_prefix/multiple_tokens/unmatured.expected
    test/command/suite/tokenizers/bigram/force_prefix/multiple_tokens/unmatured.test
    test/command/suite/tokenizers/bigram/force_prefix/single_token/matured.expected
    test/command/suite/tokenizers/bigram/force_prefix/single_token/matured.test
    test/command/suite/tokenizers/bigram/force_prefix/single_token/unmatured.expected
    test/command/suite/tokenizers/bigram/force_prefix/single_token/unmatured.test
  Modified files:
    lib/proc.c

  Modified: lib/proc.c (+6 -1)
===================================================================
--- lib/proc.c    2015-05-22 10:02:12 +0900 (00be429)
+++ lib/proc.c    2015-05-21 21:04:13 +0900 (f4a58dc)
@@ -3983,6 +3983,7 @@ parse_tokenize_flags(grn_ctx *ctx, grn_obj *flag_names)
 typedef struct {
   grn_id id;
   int32_t position;
+  grn_bool force_prefix;
 } tokenize_token;
 
 static void
@@ -3999,7 +4000,7 @@ output_tokens(grn_ctx *ctx, grn_obj *tokens, grn_obj *lexicon)
 
     token = ((tokenize_token *)(GRN_BULK_HEAD(tokens))) + i;
 
-    GRN_OUTPUT_MAP_OPEN("TOKEN", 2);
+    GRN_OUTPUT_MAP_OPEN("TOKEN", 3);
 
     GRN_OUTPUT_CSTR("value");
     value_size = grn_table_get_key(ctx, lexicon, token->id,
@@ -4009,6 +4010,9 @@ output_tokens(grn_ctx *ctx, grn_obj *tokens, grn_obj *lexicon)
     GRN_OUTPUT_CSTR("position");
     GRN_OUTPUT_INT32(token->position);
 
+    GRN_OUTPUT_CSTR("force_prefix");
+    GRN_OUTPUT_BOOL(token->force_prefix);
+
     GRN_OUTPUT_MAP_CLOSE();
   }
   GRN_OUTPUT_ARRAY_CLOSE();
@@ -4118,6 +4122,7 @@ tokenize(grn_ctx *ctx, grn_obj *lexicon, grn_obj *string, grn_tokenize_mode mode
     current_token = ((tokenize_token *)(GRN_BULK_CURR(tokens))) - 1;
     current_token->id = token_id;
     current_token->position = token_cursor->pos;
+    current_token->force_prefix = token_cursor->force_prefix;
   }
   grn_token_cursor_close(ctx, token_cursor);
 }

  Added: test/command/suite/tokenizers/bigram/force_prefix/multiple_tokens/matured.expected (+20 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/bigram/force_prefix/multiple_tokens/matured.expected    2015-05-21 21:04:13 +0900 (aa9a223)
@@ -0,0 +1,20 @@
+tokenize TokenBigram "ABCだよ" NormalizerAuto --mode GET
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "abc",
+      "position": 0,
+      "force_prefix": false
+    },
+    {
+      "value": "だよ",
+      "position": 1,
+      "force_prefix": false
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/bigram/force_prefix/multiple_tokens/matured.test (+1 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/bigram/force_prefix/multiple_tokens/matured.test    2015-05-21 21:04:13 +0900 (01f8ec1)
@@ -0,0 +1 @@
+tokenize TokenBigram "ABCだよ" NormalizerAuto --mode GET

  Added: test/command/suite/tokenizers/bigram/force_prefix/multiple_tokens/unmatured.expected (+20 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/bigram/force_prefix/multiple_tokens/unmatured.expected    2015-05-21 21:04:13 +0900 (23f8a25)
@@ -0,0 +1,20 @@
+tokenize TokenBigram "ABCだ" NormalizerAuto --mode GET
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "abc",
+      "position": 0,
+      "force_prefix": false
+    },
+    {
+      "value": "だ",
+      "position": 1,
+      "force_prefix": true
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/bigram/force_prefix/multiple_tokens/unmatured.test (+1 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/bigram/force_prefix/multiple_tokens/unmatured.test    2015-05-21 21:04:13 +0900 (a924758)
@@ -0,0 +1 @@
+tokenize TokenBigram "ABCだ" NormalizerAuto --mode GET

  Added: test/command/suite/tokenizers/bigram/force_prefix/single_token/matured.expected (+2 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/bigram/force_prefix/single_token/matured.expected    2015-05-21 21:04:13 +0900 (a311466)
@@ -0,0 +1,2 @@
+tokenize TokenBigram "だよ" NormalizerAuto --mode GET
+[[0,0.0,0.0],[{"value":"だよ","position":0,"force_prefix":false}]]

  Added: test/command/suite/tokenizers/bigram/force_prefix/single_token/matured.test (+1 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/bigram/force_prefix/single_token/matured.test    2015-05-21 21:04:13 +0900 (2e4f195)
@@ -0,0 +1 @@
+tokenize TokenBigram "だよ" NormalizerAuto --mode GET

  Added: test/command/suite/tokenizers/bigram/force_prefix/single_token/unmatured.expected (+2 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/bigram/force_prefix/single_token/unmatured.expected    2015-05-21 21:04:13 +0900 (4867dda)
@@ -0,0 +1,2 @@
+tokenize TokenBigram "だ" NormalizerAuto --mode GET
+[[0,0.0,0.0],[{"value":"だ","position":0,"force_prefix":true}]]

  Added: test/command/suite/tokenizers/bigram/force_prefix/single_token/unmatured.test (+1 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/bigram/force_prefix/single_token/unmatured.test    2015-05-21 21:04:13 +0900 (2f82f49)
@@ -0,0 +1 @@
+tokenize TokenBigram "だ" NormalizerAuto --mode GET
-------------- next part --------------
An HTML attachment was scrubbed from this message and is available for download.
Download 



More information about the Groonga-commit mailing list
Back to archive index