Kouhei Sutou
null+****@clear*****
Fri May 11 14:40:37 JST 2018
Kouhei Sutou 2018-05-11 14:40:37 +0900 (Fri, 11 May 2018) New Revision: b91e966ff9b8ffa564c245e2db2b8e06021e110e https://github.com/groonga/groonga/commit/b91e966ff9b8ffa564c245e2db2b8e06021e110e Message: TokenNgram report_source_location: fix wrong report Added files: test/command/suite/tokenizers/ngram/report_source_location/expand_no_overlap.expected test/command/suite/tokenizers/ngram/report_source_location/hiragana.expected Copied files: test/command/suite/tokenizers/ngram/report_source_location/expand_katakana.test (from test/command/suite/tokenizers/ngram/report_source_location/expand.test) test/command/suite/tokenizers/ngram/report_source_location/expand_no_overlap.test (from test/command/suite/tokenizers/ngram/report_source_location/expand.test) Modified files: lib/tokenizers.c Renamed files: test/command/suite/tokenizers/ngram/report_source_location/expand_katakana.expected (from test/command/suite/tokenizers/ngram/report_source_location/expand.expected) test/command/suite/tokenizers/ngram/report_source_location/hiragana.test (from test/command/suite/tokenizers/ngram/report_source_location/expand.test) Modified: lib/tokenizers.c (+13 -17) =================================================================== --- lib/tokenizers.c 2018-05-11 12:26:32 +0900 (be7d9bca5) +++ lib/tokenizers.c 2018-05-11 14:40:37 +0900 (3e40e8f02) @@ -842,39 +842,35 @@ ngram_next(grn_ctx *ctx, grn_token_set_overlap(ctx, token, tokenizer->overlap); if (checks) { size_t i; - uint32_t uncount_offset = 0; uint32_t source_length = 0; + uint64_t next_offset = tokenizer->source_offset; + grn_bool first_character = GRN_TRUE; grn_token_set_source_offset(ctx, token, tokenizer->source_offset); if (checks[0] == -1) { size_t n_leading_bytes = p - tokenizer->start; for (i = 1; i <= n_leading_bytes; i++) { if (checks[-i] > 0) { - uncount_offset = source_length = checks[-i]; + source_length = checks[-i]; + if (!tokenizer->overlap) { + next_offset += checks[-i]; + } + first_character = GRN_FALSE; 
break; } } } for (i = 0; i < data_size; i++) { if (checks[i] > 0) { - source_length += checks[i]; - } - } - if (r < e) { - if (checks[i] > 0) { - if (!tokenizer->overlap) { - uncount_offset = 0; - } - } else if (checks[i] == -1) { - for (; i > 0; i--) { - if (checks[i - 1] > 0) { - uncount_offset += checks[i - 1]; - break; - } + if ((tokenizer->overlap && !first_character) || + !tokenizer->overlap) { + next_offset += checks[i]; } + source_length += checks[i]; + first_character = GRN_FALSE; } } grn_token_set_source_length(ctx, token, source_length); - tokenizer->source_offset += source_length - uncount_offset; + tokenizer->source_offset = next_offset; } } } Renamed: test/command/suite/tokenizers/ngram/report_source_location/expand_katakana.expected (+28 -14) 58% =================================================================== --- test/command/suite/tokenizers/ngram/report_source_location/expand.expected 2018-05-11 12:26:32 +0900 (56c2b8d84) +++ test/command/suite/tokenizers/ngram/report_source_location/expand_katakana.expected 2018-05-11 14:40:37 +0900 (7f8fdc0a2) @@ -1,4 +1,4 @@ -tokenize 'TokenNgram("report_source_location", true)' "ア㌕Az" NormalizerAuto +tokenize 'TokenNgram("report_source_location", true)' "アイ㌕エオ" NormalizerAuto [ [ 0, @@ -7,53 +7,67 @@ tokenize 'TokenNgram("report_source_location", true)' "ア㌕Az" Normali ], [ { - "value": "アキ", + "value": "アイ", "position": 0, "force_prefix": false, "source_offset": 0, "source_length": 6 }, { - "value": "キロ", + "value": "イキ", "position": 1, "force_prefix": false, "source_offset": 3, - "source_length": 3 + "source_length": 6 }, { - "value": "ログ", + "value": "キロ", "position": 2, "force_prefix": false, - "source_offset": 3, + "source_offset": 6, "source_length": 3 }, { - "value": "グラ", + "value": "ログ", "position": 3, "force_prefix": false, - "source_offset": 3, + "source_offset": 6, "source_length": 3 }, { - "value": "ラム", + "value": "グラ", "position": 4, "force_prefix": false, - "source_offset": 3, + 
"source_offset": 6, "source_length": 3 }, { - "value": "ム", + "value": "ラム", "position": 5, "force_prefix": false, - "source_offset": 3, + "source_offset": 6, "source_length": 3 }, { - "value": "az", + "value": "ムエ", "position": 6, "force_prefix": false, "source_offset": 6, - "source_length": 4 + "source_length": 6 + }, + { + "value": "エオ", + "position": 7, + "force_prefix": false, + "source_offset": 9, + "source_length": 6 + }, + { + "value": "オ", + "position": 8, + "force_prefix": false, + "source_offset": 12, + "source_length": 3 } ] ] Copied: test/command/suite/tokenizers/ngram/report_source_location/expand_katakana.test (+1 -1) 77% =================================================================== --- test/command/suite/tokenizers/ngram/report_source_location/expand.test 2018-05-11 12:26:32 +0900 (f45dd3257) +++ test/command/suite/tokenizers/ngram/report_source_location/expand_katakana.test 2018-05-11 14:40:37 +0900 (5c4795fd8) @@ -1,4 +1,4 @@ tokenize \ 'TokenNgram("report_source_location", true)' \ - "ア㌕Az" \ + "アイ㌕エオ" \ NormalizerAuto Added: test/command/suite/tokenizers/ngram/report_source_location/expand_no_overlap.expected (+38 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/ngram/report_source_location/expand_no_overlap.expected 2018-05-11 14:40:37 +0900 (1e4bb5327) @@ -0,0 +1,38 @@ +tokenize 'TokenNgram("report_source_location", true)' "A㌔Z" NormalizerAuto +[ + [ + 0, + 0.0, + 0.0 + ], + [ + { + "value": "a", + "position": 0, + "force_prefix": false, + "source_offset": 0, + "source_length": 3 + }, + { + "value": "キロ", + "position": 1, + "force_prefix": false, + "source_offset": 3, + "source_length": 3 + }, + { + "value": "ロ", + "position": 2, + "force_prefix": false, + "source_offset": 3, + "source_length": 3 + }, + { + "value": "z", + "position": 3, + "force_prefix": false, + "source_offset": 6, + "source_length": 3 + } + ] +] Copied: 
test/command/suite/tokenizers/ngram/report_source_location/expand_no_overlap.test (+1 -1) 81% =================================================================== --- test/command/suite/tokenizers/ngram/report_source_location/expand.test 2018-05-11 12:26:32 +0900 (f45dd3257) +++ test/command/suite/tokenizers/ngram/report_source_location/expand_no_overlap.test 2018-05-11 14:40:37 +0900 (6e26f39a6) @@ -1,4 +1,4 @@ tokenize \ 'TokenNgram("report_source_location", true)' \ - "ア㌕Az" \ + "A㌔Z" \ NormalizerAuto Added: test/command/suite/tokenizers/ngram/report_source_location/hiragana.expected (+45 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/ngram/report_source_location/hiragana.expected 2018-05-11 14:40:37 +0900 (f092fa91a) @@ -0,0 +1,45 @@ +tokenize 'TokenNgram("report_source_location", true)' "あいうえお" NormalizerAuto +[ + [ + 0, + 0.0, + 0.0 + ], + [ + { + "value": "あい", + "position": 0, + "force_prefix": false, + "source_offset": 0, + "source_length": 6 + }, + { + "value": "いう", + "position": 1, + "force_prefix": false, + "source_offset": 3, + "source_length": 6 + }, + { + "value": "うえ", + "position": 2, + "force_prefix": false, + "source_offset": 6, + "source_length": 6 + }, + { + "value": "えお", + "position": 3, + "force_prefix": false, + "source_offset": 9, + "source_length": 6 + }, + { + "value": "お", + "position": 4, + "force_prefix": false, + "source_offset": 12, + "source_length": 3 + } + ] +] Renamed: test/command/suite/tokenizers/ngram/report_source_location/hiragana.test (+1 -1) 77% =================================================================== --- test/command/suite/tokenizers/ngram/report_source_location/expand.test 2018-05-11 12:26:32 +0900 (f45dd3257) +++ test/command/suite/tokenizers/ngram/report_source_location/hiragana.test 2018-05-11 14:40:37 +0900 (d1ac43c8d) @@ -1,4 +1,4 @@ tokenize \ 'TokenNgram("report_source_location", true)' \ - "ア㌕Az" \ + "あいうえお" \ 
NormalizerAuto -------------- next part -------------- HTMLの添付ファイルを保管しました... URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180511/8a3547d2/attachment-0001.htm