Kouhei Sutou
null+****@clear*****
Mon May 28 14:01:54 JST 2018
Kouhei Sutou	2018-05-28 14:01:54 +0900 (Mon, 28 May 2018)

  New Revision: d90f50ee1ee02cb7c7833c0dfc1b89569a781685
  https://github.com/groonga/groonga/commit/d90f50ee1ee02cb7c7833c0dfc1b89569a781685

  Message:
    TokenNgram: use offsets information from normalizer only when it's available

  Added files:
    test/command/suite/tokenizers/ngram/report_source_location/loose_and_unify.test
  Copied files:
    test/command/suite/tokenizers/ngram/report_source_location/loose_and_unify.expected
      (from test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_non_number.expected)
  Modified files:
    lib/tokenizers.c
    test/command/suite/tokenizers/ngram/report_source_location/expand_katakana.expected
    test/command/suite/tokenizers/ngram/report_source_location/expand_katakana.test
    test/command/suite/tokenizers/ngram/report_source_location/expand_no_overlap.expected
    test/command/suite/tokenizers/ngram/report_source_location/expand_no_overlap.test
    test/command/suite/tokenizers/ngram/report_source_location/hiragana.expected
    test/command/suite/tokenizers/ngram/report_source_location/hiragana.test
    test/command/suite/tokenizers/ngram/report_source_location/loose_symbol.expected
    test/command/suite/tokenizers/ngram/report_source_location/loose_symbol.test
    test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_non_number.expected
    test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_non_number.test
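What the change does, in short: a normalizer such as NormalizerNFKC100 can report the source offset of every normalized character when it is created with "report_source_offset", true. TokenNgram used to reconstruct token source locations only from the per-character byte lengths ("checks"); with this commit it prefers the normalizer-reported offsets whenever they are present and keeps the checks-based accumulation as the fallback. A minimal sketch of that selection logic, with illustrative names (token_source_offset is not a Groonga function):

  #include <stddef.h>
  #include <stdint.h>

  /* Sketch only: prefer exact offsets reported by the normalizer and
   * fall back to an offset accumulated from "checks" (byte lengths). */
  static uint64_t
  token_source_offset(const uint64_t *offsets, /* from the normalizer, or NULL */
                      uint64_t accumulated,    /* summed from checks so far */
                      size_t first_char)       /* index of the token's first character */
  {
    if (offsets) {
      return offsets[first_char]; /* exact, even when characters were removed */
    }
    return accumulated;           /* best effort without offset information */
  }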
  Modified: lib/tokenizers.c (+127 -39)
===================================================================
--- lib/tokenizers.c    2018-05-28 12:16:40 +0900 (e59fcd6ca)
+++ lib/tokenizers.c    2018-05-28 14:01:54 +0900 (8fd624ce0)
@@ -282,14 +282,17 @@ typedef struct {
     grn_obj text;
     uint_least8_t *ctypes;
     int16_t *checks;
+    uint64_t *offsets;
   } loose;
   int32_t pos;
   uint32_t skip;
+  unsigned int n_chars;
   const unsigned char *start;
   const unsigned char *next;
   const unsigned char *end;
   const uint_least8_t *ctypes;
   const int16_t *checks;
+  const uint64_t *offsets;
   uint32_t tail;
   uint64_t source_offset;
 } grn_ngram_tokenizer;
@@ -319,6 +322,7 @@ ngram_switch_to_loose_mode(grn_ctx *ctx,
   const char *normalized_end;
   const uint_least8_t *types = tokenizer->ctypes;
   const int16_t *checks = tokenizer->checks;
+  const uint64_t *offsets = tokenizer->offsets;

   string = grn_tokenizer_query_get_normalized_string(ctx, tokenizer->query);
   grn_string_get_normalized(ctx,
@@ -333,7 +337,10 @@ ngram_switch_to_loose_mode(grn_ctx *ctx,
       grn_tokenizer_query_get_encoding(ctx, tokenizer->query);
     uint_least8_t *loose_types;
     int16_t *loose_checks = NULL;
+    uint64_t *loose_offsets = NULL;
     const int16_t *removed_checks = NULL;
+    uint64_t last_offset = 0;
+    unsigned int n_chars = 0;

     tokenizer->loose.ctypes =
       GRN_MALLOC(sizeof(uint_least8_t) * normalized_length_in_chars);
@@ -350,12 +357,23 @@ ngram_switch_to_loose_mode(grn_ctx *ctx,
       if (!tokenizer->loose.checks) {
         ERR(GRN_NO_MEMORY_AVAILABLE,
             "[tokenizer][ngram][loose] "
+            "failed to allocate memory for character lengths");
+        return;
+      }
+    }
+    if (offsets) {
+      tokenizer->loose.offsets =
+        GRN_CALLOC(sizeof(uint64_t) * normalized_length_in_chars);
+      if (!tokenizer->loose.offsets) {
+        ERR(GRN_NO_MEMORY_AVAILABLE,
+            "[tokenizer][ngram][loose] "
             "failed to allocate memory for character offsets");
         return;
       }
     }
     loose_types = tokenizer->loose.ctypes;
     loose_checks = tokenizer->loose.checks;
+    loose_offsets = tokenizer->loose.offsets;
     while (normalized < normalized_end) {
       size_t length;
       length = grn_charlen_(ctx,
@@ -373,6 +391,9 @@ ngram_switch_to_loose_mode(grn_ctx *ctx,
         if (!removed_checks) {
           removed_checks = checks;
         }
+        if (offsets && last_offset == 0) {
+          last_offset = *offsets;
+        }
       } else {
         GRN_TEXT_PUT(ctx, &(tokenizer->loose.text), normalized, length);
         *loose_types = *types;
@@ -393,12 +414,29 @@ ngram_switch_to_loose_mode(grn_ctx *ctx,
           }
           loose_checks += length;
         }
+        if (loose_offsets) {
+          *loose_offsets = *offsets;
+          loose_offsets++;
+          last_offset = 0;
+        }
+        n_chars++;
       }
       normalized += length;
       types++;
       if (checks) {
         checks += length;
       }
+      if (offsets) {
+        offsets++;
+      }
+    }
+    *loose_checks = *checks;
+    if (offsets) {
+      if (last_offset) {
+        *loose_offsets = last_offset;
+      } else {
+        *loose_offsets = *offsets;
+      }
     }
     tokenizer->start =
       (const unsigned char *)GRN_TEXT_VALUE(&(tokenizer->loose.text));
@@ -406,6 +444,8 @@ ngram_switch_to_loose_mode(grn_ctx *ctx,
     tokenizer->end = tokenizer->start + GRN_TEXT_LEN(&(tokenizer->loose.text));
     tokenizer->ctypes = tokenizer->loose.ctypes;
     tokenizer->checks = tokenizer->loose.checks;
+    tokenizer->offsets = tokenizer->loose.offsets;
+    tokenizer->n_chars = n_chars;
   } else {
     tokenizer->start = normalized;
     tokenizer->next = tokenizer->start;
@@ -457,6 +497,7 @@ ngram_init_raw(grn_ctx *ctx,
   GRN_TEXT_INIT(&(tokenizer->loose.text), 0);
   tokenizer->loose.ctypes = NULL;
   tokenizer->loose.checks = NULL;
+  tokenizer->loose.offsets = NULL;
   tokenizer->pos = 0;
   tokenizer->skip = 0;
   tokenizer->source_offset = 0;
@@ -465,17 +506,21 @@ ngram_init_raw(grn_ctx *ctx,
     grn_obj *string;
     const char *normalized_raw;
     unsigned int normalized_length_in_bytes;
+    unsigned int normalized_length_in_chars;

     string = grn_tokenizer_query_get_normalized_string(ctx, tokenizer->query);
     grn_string_get_normalized(ctx,
                               string,
-                              &normalized_raw, &normalized_length_in_bytes,
-                              NULL);
+                              &normalized_raw,
+                              &normalized_length_in_bytes,
+                              &normalized_length_in_chars);
     tokenizer->start = (const unsigned char *)normalized_raw;
     tokenizer->next = tokenizer->start;
     tokenizer->end = tokenizer->start + normalized_length_in_bytes;
+    tokenizer->n_chars = normalized_length_in_chars;
     tokenizer->ctypes = grn_string_get_types(ctx, string);
     tokenizer->checks = grn_string_get_checks(ctx, string);
+    tokenizer->offsets = grn_string_get_offsets(ctx, string);
   }

   if (grn_tokenizer_query_get_mode(ctx, tokenizer->query) == GRN_TOKEN_GET) {
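In loose mode the tokenizer re-tokenizes a copy of the text with symbols and blanks removed, so the hunks above also build a compacted offsets array for that shorter text; last_offset remembers the offset of the first character of a removed run, so that a run at the very end of the text can still terminate the array. A self-contained sketch of the same compaction, using the patch's 0-means-unset convention (compact_offsets and kept are illustrative names, not Groonga API):

  #include <stddef.h>
  #include <stdint.h>

  /* Sketch only: offsets has n_chars + 1 entries (one per character
   * plus the end entry); kept[i] is nonzero when character i survives
   * loose mode. Returns the number of kept characters. */
  static size_t
  compact_offsets(const uint64_t *offsets, const int *kept, size_t n_chars,
                  uint64_t *loose_offsets)
  {
    size_t n_loose = 0;
    uint64_t last_offset = 0; /* first offset of the current removed run */
    for (size_t i = 0; i < n_chars; i++) {
      if (kept[i]) {
        loose_offsets[n_loose++] = offsets[i];
        last_offset = 0;
      } else if (last_offset == 0) {
        last_offset = offsets[i];
      }
    }
    /* End entry: a removed run at the end stops the last token early. */
    loose_offsets[n_loose] = last_offset ? last_offset : offsets[n_chars];
    return n_loose;
  }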
@@ -700,6 +745,7 @@ ngram_next(grn_ctx *ctx,
   grn_token_status status = 0;
   const uint_least8_t *cp = tokenizer->ctypes ? tokenizer->ctypes + pos : NULL;
   const int16_t *checks = NULL;
+  const uint64_t *offsets = tokenizer->offsets ? tokenizer->offsets + pos : NULL;
   grn_encoding encoding = grn_tokenizer_query_get_encoding(ctx, query);

   if (tokenizer->checks) {
@@ -712,8 +758,14 @@
                          GRN_TOKENIZER_END_MARK_UTF8,
                          GRN_TOKENIZER_END_MARK_UTF8_LEN);
       grn_token_set_status(ctx, token, status);
-      if (checks) {
-        grn_token_set_source_offset(ctx, token, tokenizer->source_offset);
+      if (offsets) {
+        grn_token_set_source_offset(ctx,
+                                    token,
+                                    tokenizer->offsets[tokenizer->n_chars]);
+      } else if (checks) {
+        grn_token_set_source_offset(ctx,
+                                    token,
+                                    tokenizer->source_offset);
       }
       ngram_switch_to_loose_mode(ctx, tokenizer);
       tokenizer->loose.need_end_mark = GRN_FALSE;
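The end-mark hunk above needs no accumulation: with n_chars normalized characters, the offsets array as used here carries one final extra entry, and offsets[n_chars] already points one past the last character's source bytes, which is exactly where the empty end-mark token belongs. A tiny demonstration using the values from the loose_symbol_non_number test below, "(あいうえお)", where "(" and ")" are 1 byte and each hiragana is 3 bytes:

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
    /* Source offsets for "(あいうえお)": 7 characters plus the end entry. */
    const uint64_t offsets[] = {0, 1, 4, 7, 10, 13, 16, 17};
    const unsigned int n_chars = 7;
    /* The empty end-mark token reports the end of the source text. */
    printf("end mark source_offset: %llu\n",
           (unsigned long long)offsets[n_chars]); /* prints 17 */
    return 0;
  }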
@@ -840,51 +892,84 @@ ngram_next(grn_ctx *ctx,
     grn_token_set_data(ctx, token, p, data_size);
     grn_token_set_status(ctx, token, status);
     grn_token_set_overlap(ctx, token, tokenizer->overlap);
-    if (checks) {
-      size_t i;
-      uint32_t source_length = 0;
-      uint32_t source_first_character_length = 0;
-      uint64_t next_offset = tokenizer->source_offset;
-      grn_token_set_source_offset(ctx, token, tokenizer->source_offset);
-      if (checks[0] == -1) {
-        size_t n_leading_bytes = p - tokenizer->start;
-        for (i = 1; i <= n_leading_bytes; i++) {
-          if (checks[-i] > 0) {
-            source_length = source_first_character_length = checks[-i];
-            if (!tokenizer->overlap) {
-              next_offset += checks[-i];
-            }
-            break;
-          }
-        }
-      }
-      {
-        uint64_t first_offset = 0;
-        for (i = 0; i < data_size; i++) {
-          if (checks[i] > 0) {
-            if ((tokenizer->overlap && first_offset == 0) ||
-                !tokenizer->overlap) {
-              if (first_offset == 0) {
-                first_offset = checks[i];
-              }
-              next_offset += checks[i];
-            }
-            if (source_first_character_length == 0) {
-              source_first_character_length = checks[i];
-            }
-            source_length += checks[i];
-          } else if (checks[i] < 0) {
-            if (tokenizer->overlap) {
-              next_offset -= first_offset;
-            }
-          }
-        }
-      }
-      grn_token_set_source_length(ctx, token, source_length);
-      grn_token_set_source_first_character_length(ctx,
-                                                  token,
-                                                  source_first_character_length);
-      tokenizer->source_offset = next_offset;
-    }
+    /* TODO: Clean and complete... */
+    if (offsets) {
+      grn_token_set_source_offset(ctx, token, offsets[0]);
+      if (checks) {
+        size_t i;
+        uint32_t source_first_character_length = 0;
+        if (checks[0] == -1) {
+          size_t n_leading_bytes = p - tokenizer->start;
+          for (i = 1; i <= n_leading_bytes; i++) {
+            if (checks[-i] > 0) {
+              source_first_character_length = checks[-i];
+              break;
+            }
+          }
+        }
+        {
+          for (i = 0; i < data_size; i++) {
+            if (checks[i] > 0) {
+              if (source_first_character_length == 0) {
+                source_first_character_length = checks[i];
+              }
+            }
+          }
+        }
+        grn_token_set_source_length(ctx,
+                                    token,
+                                    offsets[n_characters] - offsets[0]);
+        grn_token_set_source_first_character_length(ctx,
+                                                    token,
+                                                    source_first_character_length);
+      }
+    } else {
+      if (checks) {
+        size_t i;
+        uint32_t source_length = 0;
+        uint32_t source_first_character_length = 0;
+        uint64_t next_offset = tokenizer->source_offset;
+        grn_token_set_source_offset(ctx, token, tokenizer->source_offset);
+        if (checks[0] == -1) {
+          size_t n_leading_bytes = p - tokenizer->start;
+          for (i = 1; i <= n_leading_bytes; i++) {
+            if (checks[-i] > 0) {
+              source_length = source_first_character_length = checks[-i];
+              if (!tokenizer->overlap) {
+                next_offset += checks[-i];
+              }
+              break;
+            }
+          }
+        }
+        {
+          uint64_t first_offset = 0;
+          for (i = 0; i < data_size; i++) {
+            if (checks[i] > 0) {
+              if ((tokenizer->overlap && first_offset == 0) ||
+                  !tokenizer->overlap) {
+                if (first_offset == 0) {
+                  first_offset = checks[i];
+                }
+                next_offset += checks[i];
+              }
+              if (source_first_character_length == 0) {
+                source_first_character_length = checks[i];
+              }
+              source_length += checks[i];
+            } else if (checks[i] < 0) {
+              if (tokenizer->overlap) {
+                next_offset -= first_offset;
+              }
+            }
+          }
+        }
+        grn_token_set_source_length(ctx, token, source_length);
+        grn_token_set_source_first_character_length(ctx,
+                                                    token,
+                                                    source_first_character_length);
+        tokenizer->source_offset = next_offset;
+      }
+    }
     }
   }
 }
@@ -925,6 +1010,9 @@ ngram_fin(grn_ctx *ctx, void *user_data)
   if (tokenizer->loose.checks) {
     GRN_FREE(tokenizer->loose.checks);
   }
+  if (tokenizer->loose.offsets) {
+    GRN_FREE(tokenizer->loose.offsets);
+  }
   GRN_OBJ_FIN(ctx, &(tokenizer->loose.text));
   grn_tokenizer_token_fin(ctx, &(tokenizer->token));
   GRN_FREE(tokenizer);
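With offsets available, a token's source length is just the difference between the offset one past its last character and its first character's offset, the offsets[n_characters] - offsets[0] expression in the hunk above. That is what the test updates below correct: in loose_symbol_non_number.expected, the loose-mode token "あい" used to be reported at source_offset 0 with source_length 7 because the checks-based fallback folded the removed leading "(" into it, while the normalizer offsets yield 1 and 6 ("あ" starts at byte 1 and "う" at byte 7). The same computation as a sketch (token_source_length is an illustrative name; the offsets are relative to the token's first character, as in the patch):

  #include <stdint.h>

  static uint64_t
  token_source_length(const uint64_t *offsets, unsigned int n_characters)
  {
    /* offsets[n_characters] is one past the token's last character. */
    return offsets[n_characters] - offsets[0];
  }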
  Modified: test/command/suite/tokenizers/ngram/report_source_location/expand_katakana.expected (+1 -1)
===================================================================
--- test/command/suite/tokenizers/ngram/report_source_location/expand_katakana.expected    2018-05-28 12:16:40 +0900 (390cd501f)
+++ test/command/suite/tokenizers/ngram/report_source_location/expand_katakana.expected    2018-05-28 14:01:54 +0900 (0674f5ce8)
@@ -1,4 +1,4 @@
-tokenize 'TokenNgram("report_source_location", true)' "アイ㌕エオ" NormalizerAuto
+tokenize 'TokenNgram("report_source_location", true)' "アイ㌕エオ" 'NormalizerNFKC100'
 [
   [
     0,

  Modified: test/command/suite/tokenizers/ngram/report_source_location/expand_katakana.test (+1 -1)
===================================================================
--- test/command/suite/tokenizers/ngram/report_source_location/expand_katakana.test    2018-05-28 12:16:40 +0900 (5c4795fd8)
+++ test/command/suite/tokenizers/ngram/report_source_location/expand_katakana.test    2018-05-28 14:01:54 +0900 (fdf9fb6d4)
@@ -1,4 +1,4 @@
 tokenize \
   'TokenNgram("report_source_location", true)' \
   "アイ㌕エオ" \
-  NormalizerAuto
+  'NormalizerNFKC100'

  Modified: test/command/suite/tokenizers/ngram/report_source_location/expand_no_overlap.expected (+1 -1)
===================================================================
--- test/command/suite/tokenizers/ngram/report_source_location/expand_no_overlap.expected    2018-05-28 12:16:40 +0900 (eacdd61bb)
+++ test/command/suite/tokenizers/ngram/report_source_location/expand_no_overlap.expected    2018-05-28 14:01:54 +0900 (02dd5427b)
@@ -1,4 +1,4 @@
-tokenize 'TokenNgram("report_source_location", true)' "A㌔Z" NormalizerAuto
+tokenize 'TokenNgram("report_source_location", true)' "A㌔Z" 'NormalizerNFKC100("report_source_offset", true)'
 [
   [
     0,

  Modified: test/command/suite/tokenizers/ngram/report_source_location/expand_no_overlap.test (+1 -1)
===================================================================
--- test/command/suite/tokenizers/ngram/report_source_location/expand_no_overlap.test    2018-05-28 12:16:40 +0900 (6e26f39a6)
+++ test/command/suite/tokenizers/ngram/report_source_location/expand_no_overlap.test    2018-05-28 14:01:54 +0900 (38b01dced)
@@ -1,4 +1,4 @@
 tokenize \
   'TokenNgram("report_source_location", true)' \
   "A㌔Z" \
-  NormalizerAuto
+  'NormalizerNFKC100("report_source_offset", true)'

  Modified: test/command/suite/tokenizers/ngram/report_source_location/hiragana.expected (+1 -1)
===================================================================
--- test/command/suite/tokenizers/ngram/report_source_location/hiragana.expected    2018-05-28 12:16:40 +0900 (d1d4a8ad9)
+++ test/command/suite/tokenizers/ngram/report_source_location/hiragana.expected    2018-05-28 14:01:54 +0900 (33074e0de)
@@ -1,4 +1,4 @@
-tokenize 'TokenNgram("report_source_location", true)' "あいうえお" NormalizerAuto
+tokenize 'TokenNgram("report_source_location", true)' "あいうえお" 'NormalizerNFKC100("report_source_offset", true)'
 [
   [
     0,

  Modified: test/command/suite/tokenizers/ngram/report_source_location/hiragana.test (+1 -1)
===================================================================
--- test/command/suite/tokenizers/ngram/report_source_location/hiragana.test    2018-05-28 12:16:40 +0900 (d1ac43c8d)
+++ test/command/suite/tokenizers/ngram/report_source_location/hiragana.test    2018-05-28 14:01:54 +0900 (d5ff21622)
@@ -1,4 +1,4 @@
 tokenize \
   'TokenNgram("report_source_location", true)' \
   "あいうえお" \
-  NormalizerAuto
+  'NormalizerNFKC100("report_source_offset", true)'
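The tests above switch from NormalizerAuto to NormalizerNFKC100, presumably because only the latter accepts the "report_source_offset" option; note that expand_katakana keeps a plain 'NormalizerNFKC100' without the option, so it still exercises the checks-based fallback. The new loose_and_unify test below combines loose_symbol with unify_hyphen_and_prolonged_sound_mark to cover loose mode on top of normalizer-provided offsets.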
4, "source_length": 6, "source_first_character_length": 3 }, { - "value": "うえ", - "position": 10, + "value": "アコ", + "position": 11, "force_prefix": false, "source_offset": 7, - "source_length": 6, + "source_length": 9, "source_first_character_length": 3 }, { - "value": "えお", - "position": 11, + "value": "コド", + "position": 12, "force_prefix": false, "source_offset": 10, - "source_length": 6, + "source_length": 9, "source_first_character_length": 3 }, { - "value": "お", - "position": 12, + "value": "ド", + "position": 13, "force_prefix": false, - "source_offset": 13, + "source_offset": 16, "source_length": 3, - "source_first_character_length": 3 + "source_first_character_length": 6 } ] ] Added: test/command/suite/tokenizers/ngram/report_source_location/loose_and_unify.test (+6 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/ngram/report_source_location/loose_and_unify.test 2018-05-28 14:01:54 +0900 (f1e51e3b9) @@ -0,0 +1,6 @@ +tokenize \ + 'TokenNgram("loose_symbol", true, \ + "report_source_location", true)' \ + "[クリアコード]" \ + 'NormalizerNFKC100("unify_hyphen_and_prolonged_sound_mark", true, \ + "report_source_offset", true)' Modified: test/command/suite/tokenizers/ngram/report_source_location/loose_symbol.expected (+1 -1) =================================================================== --- test/command/suite/tokenizers/ngram/report_source_location/loose_symbol.expected 2018-05-28 12:16:40 +0900 (83927a3eb) +++ test/command/suite/tokenizers/ngram/report_source_location/loose_symbol.expected 2018-05-28 14:01:54 +0900 (d7ecd9fa7) @@ -1,4 +1,4 @@ -tokenize 'TokenNgram("report_source_location", true, "loose_symbol", true)' "090(1234)56−78" NormalizerAuto +tokenize 'TokenNgram("report_source_location", true, "loose_symbol", true)' "090(1234)56−78" 'NormalizerNFKC100("report_source_offset", true)' [ [ 0, Modified: test/command/suite/tokenizers/ngram/report_source_location/loose_symbol.test (+1 -1) =================================================================== --- test/command/suite/tokenizers/ngram/report_source_location/loose_symbol.test 2018-05-28 12:16:40 +0900 (135a9c270) +++ test/command/suite/tokenizers/ngram/report_source_location/loose_symbol.test 2018-05-28 14:01:54 +0900 (c10ee0528) @@ -1,4 +1,4 @@ tokenize \ 'TokenNgram("report_source_location", true, "loose_symbol", true)' \ "090(1234)56−78" \ - NormalizerAuto + 'NormalizerNFKC100("report_source_offset", true)' Modified: test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_non_number.expected (+3 -3) =================================================================== --- test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_non_number.expected 2018-05-28 12:16:40 +0900 (e0ccd2903) +++ test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_non_number.expected 2018-05-28 14:01:54 +0900 (231e19e69) @@ -1,4 +1,4 @@ -tokenize 'TokenNgram("loose_symbol", true, "report_source_location", true)' "(あいうえお)" 'NormalizerNFKC100' +tokenize 'TokenNgram("loose_symbol", true, "report_source_location", true)' "(あいうえお)" 'NormalizerNFKC100("report_source_offset", true)' [ [ 0, @@ -74,8 +74,8 @@ tokenize 'TokenNgram("loose_symbol", true, "report_source_locati "value": "あい", "position": 8, "force_prefix": false, - "source_offset": 0, - "source_length": 7, + "source_offset": 1, + "source_length": 6, "source_first_character_length": 4 }, { Modified: 
  Modified: test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_non_number.test (+1 -1)
===================================================================
--- test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_non_number.test    2018-05-28 12:16:40 +0900 (8b9a1545a)
+++ test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_non_number.test    2018-05-28 14:01:54 +0900 (d25604773)
@@ -2,4 +2,4 @@ tokenize \
   'TokenNgram("loose_symbol", true, \
               "report_source_location", true)' \
   "(あいうえお)" \
-  'NormalizerNFKC100'
+  'NormalizerNFKC100("report_source_offset", true)'

-------------- next part --------------
An HTML attachment was scrubbed...
URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180528/51f9f6be/attachment-0001.htm