Kouhei Sutou
null+****@clear*****
Fri Nov 9 16:40:05 JST 2012
Kouhei Sutou 2012-11-09 16:40:05 +0900 (Fri, 09 Nov 2012) New Revision: 1817432d96d5e9a64865b7c21c0a1e9077d4dc0b https://github.com/groonga/groonga/commit/1817432d96d5e9a64865b7c21c0a1e9077d4dc0b Log: TokenDelimit family: support tokenized delimiter Added files: test/command/suite/table_create/default_tokenizer/delimit/tokenized_delimiter/default.expected test/command/suite/table_create/default_tokenizer/delimit/tokenized_delimiter/default.test test/command/suite/table_create/default_tokenizer/delimit/tokenized_delimiter/normalize.expected test/command/suite/table_create/default_tokenizer/delimit/tokenized_delimiter/normalize.test Modified files: lib/token.c Modified: lib/token.c (+15 -2) =================================================================== --- lib/token.c 2012-11-09 16:39:26 +0900 (58f039a) +++ lib/token.c 2012-11-09 16:40:05 +0900 (bd54953) @@ -88,6 +88,7 @@ typedef struct { const unsigned char *next; const unsigned char *end; grn_tokenizer_token token; + grn_bool have_tokenized_delimiter; } grn_delimited_tokenizer; static grn_obj * @@ -109,9 +110,21 @@ delimited_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data, return NULL; } user_data->ptr = tokenizer; - tokenizer->delimiter = delimiter; - tokenizer->delimiter_len = delimiter_len; + grn_table_get_info(ctx, table, &table_flags, &tokenizer->encoding, NULL); + + tokenizer->have_tokenized_delimiter = + grn_tokenizer_have_delimiter(ctx, + GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str), + tokenizer->encoding); + if (tokenizer->have_tokenized_delimiter) { + tokenizer->delimiter = GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8; + tokenizer->delimiter_len = strlen(tokenizer->delimiter); + } else { + tokenizer->delimiter = delimiter; + tokenizer->delimiter_len = delimiter_len; + } + if (table_flags & GRN_OBJ_KEY_NORMALIZE) { normalizer = GRN_NORMALIZER_AUTO; } Added: test/command/suite/table_create/default_tokenizer/delimit/tokenized_delimiter/default.expected (+56 -0) 100644 
=================================================================== --- /dev/null +++ test/command/suite/table_create/default_tokenizer/delimit/tokenized_delimiter/default.expected 2012-11-09 16:40:05 +0900 (3fd3567) @@ -0,0 +1,56 @@ +table_create Tags TABLE_PAT_KEY ShortText --default_tokenizer TokenDelimit +[[0,0.0,0.0],true] +table_create Movies TABLE_HASH_KEY ShortText +[[0,0.0,0.0],true] +column_create Movies tags COLUMN_VECTOR Tags +[[0,0.0,0.0],true] +column_create Tags movies_tags COLUMN_INDEX Movies tags +[[0,0.0,0.0],true] +load --table Movies +[ +{"_key": "Seven Samurai", tags: "Samurai\uFFFEJapanese\uFFFEJapan\uFFFEKurosawa Akira"}, +{"_key": "The Last Samurai", tags: "Samurai\uFFFEEnglish\uFFFEJapanese\uFFFEUS\uFFFEJapan\uFFFETom Cruise"} +] +[[0,0.0,0.0],2] +select Tags --output_columns _key --limit -1 +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 7 + ], + [ + [ + "_key", + "ShortText" + ] + ], + [ + "English" + ], + [ + "Japan" + ], + [ + "Japanese" + ], + [ + "Kurosawa Akira" + ], + [ + "Samurai" + ], + [ + "Tom Cruise" + ], + [ + "US" + ] + ] + ] +] Added: test/command/suite/table_create/default_tokenizer/delimit/tokenized_delimiter/default.test (+15 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/table_create/default_tokenizer/delimit/tokenized_delimiter/default.test 2012-11-09 16:40:05 +0900 (63d061f) @@ -0,0 +1,15 @@ +table_create Tags TABLE_PAT_KEY ShortText \ + --default_tokenizer TokenDelimit + +table_create Movies TABLE_HASH_KEY ShortText +column_create Movies tags COLUMN_VECTOR Tags + +column_create Tags movies_tags COLUMN_INDEX Movies tags + +load --table Movies +[ +{"_key": "Seven Samurai", tags: "Samurai\uFFFEJapanese\uFFFEJapan\uFFFEKurosawa Akira"}, +{"_key": "The Last Samurai", tags: "Samurai\uFFFEEnglish\uFFFEJapanese\uFFFEUS\uFFFEJapan\uFFFETom Cruise"} +] + +select Tags --output_columns _key --limit -1 Added: 
test/command/suite/table_create/default_tokenizer/delimit/tokenized_delimiter/normalize.expected (+56 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/table_create/default_tokenizer/delimit/tokenized_delimiter/normalize.expected 2012-11-09 16:40:05 +0900 (28bb4d9) @@ -0,0 +1,56 @@ +table_create Tags TABLE_PAT_KEY|KEY_NORMALIZE ShortText --default_tokenizer TokenDelimit +[[0,0.0,0.0],true] +table_create Movies TABLE_HASH_KEY ShortText +[[0,0.0,0.0],true] +column_create Movies tags COLUMN_VECTOR Tags +[[0,0.0,0.0],true] +column_create Tags movies_tags COLUMN_INDEX Movies tags +[[0,0.0,0.0],true] +load --table Movies +[ +{"_key": "Seven Samurai", tags: "Samurai\uFFFEJapanese\uFFFEJapan\uFFFEKurosawa Akira"}, +{"_key": "The Last Samurai", tags: "Samurai\uFFFEEnglish\uFFFEJapanese\uFFFEUS\uFFFEJapan\uFFFETom Cruise"} +] +[[0,0.0,0.0],2] +select Tags --output_columns _key --limit -1 +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 7 + ], + [ + [ + "_key", + "ShortText" + ] + ], + [ + "english" + ], + [ + "japan" + ], + [ + "japanese" + ], + [ + "kurosawa akira" + ], + [ + "samurai" + ], + [ + "tom cruise" + ], + [ + "us" + ] + ] + ] +] Added: test/command/suite/table_create/default_tokenizer/delimit/tokenized_delimiter/normalize.test (+15 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/table_create/default_tokenizer/delimit/tokenized_delimiter/normalize.test 2012-11-09 16:40:05 +0900 (d4ebc41) @@ -0,0 +1,15 @@ +table_create Tags TABLE_PAT_KEY|KEY_NORMALIZE ShortText \ + --default_tokenizer TokenDelimit + +table_create Movies TABLE_HASH_KEY ShortText +column_create Movies tags COLUMN_VECTOR Tags + +column_create Tags movies_tags COLUMN_INDEX Movies tags + +load --table Movies +[ +{"_key": "Seven Samurai", tags: "Samurai\uFFFEJapanese\uFFFEJapan\uFFFEKurosawa Akira"}, +{"_key": "The Last Samurai", tags: 
"Samurai\uFFFEEnglish\uFFFEJapanese\uFFFEUS\uFFFEJapan\uFFFETom Cruise"} +] + +select Tags --output_columns _key --limit -1 -------------- next part -------------- An HTML attachment was scrubbed... Download