Kouhei Sutou
null+****@clear*****
Sun Mar 1 18:52:20 JST 2015
Kouhei Sutou 2015-03-01 18:52:20 +0900 (Sun, 01 Mar 2015) New Revision: a556a40f295a512a565de35c5046222ac68dc718 https://github.com/groonga/groonga/commit/a556a40f295a512a565de35c5046222ac68dc718 Message: mecab: use the last found delimiter instead of first found delimiter Added files: test/command/suite/tokenizers/mecab/chunk/consecutive_delimiters.expected test/command/suite/tokenizers/mecab/chunk/consecutive_delimiters.test Modified files: plugins/tokenizers/mecab.c Modified: plugins/tokenizers/mecab.c (+21 -7) =================================================================== --- plugins/tokenizers/mecab.c 2015-03-01 00:41:03 +0900 (5f7c1a2) +++ plugins/tokenizers/mecab.c 2015-03-01 18:52:20 +0900 (a9e6b65) @@ -174,6 +174,7 @@ chunked_tokenize_utf8(grn_ctx *ctx, { const char *chunk_start; const char *current; + const char *last_delimiter; const char *string_end = string + string_bytes; grn_encoding encoding = tokenizer->query->encoding; @@ -185,6 +186,7 @@ chunked_tokenize_utf8(grn_ctx *ctx, } chunk_start = current = string; + last_delimiter = NULL; while (current < string_end) { int space_bytes; int character_bytes; @@ -204,6 +206,7 @@ chunked_tokenize_utf8(grn_ctx *ctx, } current += space_bytes; chunk_start = current; + last_delimiter = NULL; continue; } @@ -218,18 +221,29 @@ chunked_tokenize_utf8(grn_ctx *ctx, current_character = current; current += character_bytes; + if (is_delimiter_character(ctx, current_character, character_bytes)) { + last_delimiter = current; + } - if (is_delimiter_character(ctx, current_character, character_bytes) || - (current - chunk_start) >= grn_mecab_chunk_size_threshold) { + if ((current - chunk_start) >= grn_mecab_chunk_size_threshold) { grn_bool succeeded; - succeeded = chunked_tokenize_utf8_chunk(ctx, - tokenizer, - chunk_start, - current - chunk_start); + if (last_delimiter) { + succeeded = chunked_tokenize_utf8_chunk(ctx, + tokenizer, + chunk_start, + last_delimiter - chunk_start); + chunk_start = last_delimiter; + } else 
{ + succeeded = chunked_tokenize_utf8_chunk(ctx, + tokenizer, + chunk_start, + current - chunk_start); + chunk_start = current; + } if (!succeeded) { return succeeded; } - chunk_start = current; + last_delimiter = NULL; } } Added: test/command/suite/tokenizers/mecab/chunk/consecutive_delimiters.expected (+30 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/mecab/chunk/consecutive_delimiters.expected 2015-03-01 18:52:20 +0900 (945dd0b) @@ -0,0 +1,30 @@ +tokenize TokenMecab '日本。エンジン。エンジン' +[ + [ + 0, + 0.0, + 0.0 + ], + [ + { + "value": "日本", + "position": 0 + }, + { + "value": "。", + "position": 1 + }, + { + "value": "エンジン", + "position": 2 + }, + { + "value": "。", + "position": 3 + }, + { + "value": "エンジン", + "position": 4 + } + ] +] Added: test/command/suite/tokenizers/mecab/chunk/consecutive_delimiters.test (+3 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/mecab/chunk/consecutive_delimiters.test 2015-03-01 18:52:20 +0900 (34f5fe0) @@ -0,0 +1,3 @@ +#@on-error omit +tokenize TokenMecab '日本。エンジン。エンジン' +#@on-error default -------------- next part -------------- An HTML attachment was scrubbed... Download