[Groonga-commit] groonga/groonga [master] TokenDelimit family: support tokenized delimiter

Back to archive index

Kouhei Sutou null+****@clear*****
Fri Nov 9 16:40:05 JST 2012


Kouhei Sutou	2012-11-09 16:40:05 +0900 (Fri, 09 Nov 2012)

  New Revision: 1817432d96d5e9a64865b7c21c0a1e9077d4dc0b
  https://github.com/groonga/groonga/commit/1817432d96d5e9a64865b7c21c0a1e9077d4dc0b

  Log:
    TokenDelimit family: support tokenized delimiter

  Added files:
    test/command/suite/table_create/default_tokenizer/delimit/tokenized_delimiter/default.expected
    test/command/suite/table_create/default_tokenizer/delimit/tokenized_delimiter/default.test
    test/command/suite/table_create/default_tokenizer/delimit/tokenized_delimiter/normalize.expected
    test/command/suite/table_create/default_tokenizer/delimit/tokenized_delimiter/normalize.test
  Modified files:
    lib/token.c

  Modified: lib/token.c (+15 -2)
===================================================================
--- lib/token.c    2012-11-09 16:39:26 +0900 (58f039a)
+++ lib/token.c    2012-11-09 16:40:05 +0900 (bd54953)
@@ -88,6 +88,7 @@ typedef struct {
   const unsigned char *next;
   const unsigned char *end;
   grn_tokenizer_token token;
+  grn_bool have_tokenized_delimiter;
 } grn_delimited_tokenizer;
 
 static grn_obj *
@@ -109,9 +110,21 @@ delimited_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data,
     return NULL;
   }
   user_data->ptr = tokenizer;
-  tokenizer->delimiter = delimiter;
-  tokenizer->delimiter_len = delimiter_len;
+
   grn_table_get_info(ctx, table, &table_flags, &tokenizer->encoding, NULL);
+
+  tokenizer->have_tokenized_delimiter =
+    grn_tokenizer_have_delimiter(ctx,
+                                 GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
+                                 tokenizer->encoding);
+  if (tokenizer->have_tokenized_delimiter) {
+    tokenizer->delimiter = GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8;
+    tokenizer->delimiter_len = strlen(tokenizer->delimiter);
+  } else {
+    tokenizer->delimiter = delimiter;
+    tokenizer->delimiter_len = delimiter_len;
+  }
+
   if (table_flags & GRN_OBJ_KEY_NORMALIZE) {
     normalizer = GRN_NORMALIZER_AUTO;
   }

  Added: test/command/suite/table_create/default_tokenizer/delimit/tokenized_delimiter/default.expected (+56 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_create/default_tokenizer/delimit/tokenized_delimiter/default.expected    2012-11-09 16:40:05 +0900 (3fd3567)
@@ -0,0 +1,56 @@
+table_create Tags TABLE_PAT_KEY ShortText   --default_tokenizer TokenDelimit
+[[0,0.0,0.0],true]
+table_create Movies TABLE_HASH_KEY ShortText
+[[0,0.0,0.0],true]
+column_create Movies tags COLUMN_VECTOR Tags
+[[0,0.0,0.0],true]
+column_create Tags movies_tags COLUMN_INDEX Movies tags
+[[0,0.0,0.0],true]
+load --table Movies
+[
+{"_key": "Seven Samurai", tags: "Samurai\uFFFEJapanese\uFFFEJapan\uFFFEKurosawa Akira"},
+{"_key": "The Last Samurai", tags: "Samurai\uFFFEEnglish\uFFFEJapanese\uFFFEUS\uFFFEJapan\uFFFETom Cruise"}
+]
+[[0,0.0,0.0],2]
+select Tags --output_columns _key --limit -1
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        7
+      ],
+      [
+        [
+          "_key",
+          "ShortText"
+        ]
+      ],
+      [
+        "English"
+      ],
+      [
+        "Japan"
+      ],
+      [
+        "Japanese"
+      ],
+      [
+        "Kurosawa Akira"
+      ],
+      [
+        "Samurai"
+      ],
+      [
+        "Tom Cruise"
+      ],
+      [
+        "US"
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/table_create/default_tokenizer/delimit/tokenized_delimiter/default.test (+15 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_create/default_tokenizer/delimit/tokenized_delimiter/default.test    2012-11-09 16:40:05 +0900 (63d061f)
@@ -0,0 +1,15 @@
+table_create Tags TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenDelimit
+
+table_create Movies TABLE_HASH_KEY ShortText
+column_create Movies tags COLUMN_VECTOR Tags
+
+column_create Tags movies_tags COLUMN_INDEX Movies tags
+
+load --table Movies
+[
+{"_key": "Seven Samurai", tags: "Samurai\uFFFEJapanese\uFFFEJapan\uFFFEKurosawa Akira"},
+{"_key": "The Last Samurai", tags: "Samurai\uFFFEEnglish\uFFFEJapanese\uFFFEUS\uFFFEJapan\uFFFETom Cruise"}
+]
+
+select Tags --output_columns _key --limit -1

  Added: test/command/suite/table_create/default_tokenizer/delimit/tokenized_delimiter/normalize.expected (+56 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_create/default_tokenizer/delimit/tokenized_delimiter/normalize.expected    2012-11-09 16:40:05 +0900 (28bb4d9)
@@ -0,0 +1,56 @@
+table_create Tags TABLE_PAT_KEY|KEY_NORMALIZE ShortText   --default_tokenizer TokenDelimit
+[[0,0.0,0.0],true]
+table_create Movies TABLE_HASH_KEY ShortText
+[[0,0.0,0.0],true]
+column_create Movies tags COLUMN_VECTOR Tags
+[[0,0.0,0.0],true]
+column_create Tags movies_tags COLUMN_INDEX Movies tags
+[[0,0.0,0.0],true]
+load --table Movies
+[
+{"_key": "Seven Samurai", tags: "Samurai\uFFFEJapanese\uFFFEJapan\uFFFEKurosawa Akira"},
+{"_key": "The Last Samurai", tags: "Samurai\uFFFEEnglish\uFFFEJapanese\uFFFEUS\uFFFEJapan\uFFFETom Cruise"}
+]
+[[0,0.0,0.0],2]
+select Tags --output_columns _key --limit -1
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        7
+      ],
+      [
+        [
+          "_key",
+          "ShortText"
+        ]
+      ],
+      [
+        "english"
+      ],
+      [
+        "japan"
+      ],
+      [
+        "japanese"
+      ],
+      [
+        "kurosawa akira"
+      ],
+      [
+        "samurai"
+      ],
+      [
+        "tom cruise"
+      ],
+      [
+        "us"
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/table_create/default_tokenizer/delimit/tokenized_delimiter/normalize.test (+15 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_create/default_tokenizer/delimit/tokenized_delimiter/normalize.test    2012-11-09 16:40:05 +0900 (d4ebc41)
@@ -0,0 +1,15 @@
+table_create Tags TABLE_PAT_KEY|KEY_NORMALIZE ShortText \
+  --default_tokenizer TokenDelimit
+
+table_create Movies TABLE_HASH_KEY ShortText
+column_create Movies tags COLUMN_VECTOR Tags
+
+column_create Tags movies_tags COLUMN_INDEX Movies tags
+
+load --table Movies
+[
+{"_key": "Seven Samurai", tags: "Samurai\uFFFEJapanese\uFFFEJapan\uFFFEKurosawa Akira"},
+{"_key": "The Last Samurai", tags: "Samurai\uFFFEEnglish\uFFFEJapanese\uFFFEUS\uFFFEJapan\uFFFETom Cruise"}
+]
+
+select Tags --output_columns _key --limit -1
-------------- next part --------------
An HTML attachment was scrubbed...
Download



More information about the Groonga-commit mailing list
Back to archive index