Kouhei Sutou
null+****@clear*****
Tue Jun 26 15:09:40 JST 2018
Kouhei Sutou 2018-06-26 15:09:40 +0900 (Tue, 26 Jun 2018) New Revision: 49cae7d24ca79512505abe0fc1a9781ae64b1bf1 https://github.com/groonga/groonga/commit/49cae7d24ca79512505abe0fc1a9781ae64b1bf1 Message: TokenNgram: fix wrong first character length It's caused for U+3231 PARENTHESIZED IDEOGRAPH characters such as U+3231 PARENTHESIZED IDEOGRAPH STOCK. Added files: test/command/suite/select/function/highlight_html/lexicon/loose_symbol_kabu.expected test/command/suite/select/function/highlight_html/lexicon/loose_symbol_kabu.test test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_kabu.expected test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_kabu.test Modified files: lib/tokenizers.c Modified: lib/tokenizers.c (+3 -1) =================================================================== --- lib/tokenizers.c 2018-06-26 12:59:24 +0900 (854d5133d) +++ lib/tokenizers.c 2018-06-26 15:09:40 +0900 (bfd88d7b8) @@ -416,7 +416,9 @@ ngram_switch_to_loose_mode(grn_ctx *ctx, removed_checks = NULL; } for (i = 0; i < length; i++) { - loose_checks[i] += checks[i]; + if (checks[i] != -1) { + loose_checks[i] += checks[i]; + } } loose_checks += length; } Added: test/command/suite/select/function/highlight_html/lexicon/loose_symbol_kabu.expected (+37 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/function/highlight_html/lexicon/loose_symbol_kabu.expected 2018-06-26 15:09:40 +0900 (715103f97) @@ -0,0 +1,37 @@ +table_create Entries TABLE_NO_KEY +[[0,0.0,0.0],true] +column_create Entries body COLUMN_SCALAR ShortText +[[0,0.0,0.0],true] +table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer 'TokenNgram("loose_symbol", true, "report_source_location", true)' --normalizer 'NormalizerNFKC100' +[[0,0.0,0.0],true] +column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body +[[0,0.0,0.0],true] +load --table Entries +[ +{"body": "ここは㈱グルンガ"} +] 
+[[0,0.0,0.0],1] +select Entries --match_columns body --query '株グル' --output_columns 'highlight_html(body, Terms)' +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 1 + ], + [ + [ + "highlight_html", + null + ] + ], + [ + "ここは<span class=\"keyword\">㈱グル</span>ンガ" + ] + ] + ] +] Added: test/command/suite/select/function/highlight_html/lexicon/loose_symbol_kabu.test (+19 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/function/highlight_html/lexicon/loose_symbol_kabu.test 2018-06-26 15:09:40 +0900 (d8e77567a) @@ -0,0 +1,19 @@ +table_create Entries TABLE_NO_KEY +column_create Entries body COLUMN_SCALAR ShortText + +table_create Terms TABLE_PAT_KEY ShortText \ + --default_tokenizer 'TokenNgram("loose_symbol", true, \ + "report_source_location", true)' \ + --normalizer 'NormalizerNFKC100' +column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body + +load --table Entries +[ +{"body": "ここは㈱グルンガ"} +] + +select Entries \ + --match_columns body \ + --query '株グル' \ + --output_columns 'highlight_html(body, Terms)' + Added: test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_kabu.expected (+154 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_kabu.expected 2018-06-26 15:09:40 +0900 (d8add60e0) @@ -0,0 +1,154 @@ +tokenize 'TokenNgram("loose_symbol", true, "report_source_location", true)' "ここは㈱グルンガ" 'NormalizerNFKC100("report_source_offset", true)' +[ + [ + 0, + 0.0, + 0.0 + ], + [ + { + "value": "ここ", + "position": 0, + "force_prefix": false, + "source_offset": 0, + "source_length": 6, + "source_first_character_length": 3 + }, + { + "value": "こは", + "position": 1, + "force_prefix": false, + "source_offset": 3, + "source_length": 6, + "source_first_character_length": 3 + }, + { + "value": "は", + "position": 2, + "force_prefix": false, + "source_offset": 6, + 
"source_length": 3, + "source_first_character_length": 3 + }, + { + "value": "(", + "position": 3, + "force_prefix": false, + "source_offset": 9, + "source_length": 0, + "source_first_character_length": 3 + }, + { + "value": "株", + "position": 4, + "force_prefix": false, + "source_offset": 9, + "source_length": 0, + "source_first_character_length": 3 + }, + { + "value": ")", + "position": 5, + "force_prefix": false, + "source_offset": 9, + "source_length": 3, + "source_first_character_length": 3 + }, + { + "value": "グル", + "position": 6, + "force_prefix": false, + "source_offset": 12, + "source_length": 6, + "source_first_character_length": 3 + }, + { + "value": "ルン", + "position": 7, + "force_prefix": false, + "source_offset": 15, + "source_length": 6, + "source_first_character_length": 3 + }, + { + "value": "ンガ", + "position": 8, + "force_prefix": false, + "source_offset": 18, + "source_length": 6, + "source_first_character_length": 3 + }, + { + "value": "", + "position": 9, + "force_prefix": false, + "source_offset": 24, + "source_length": 0, + "source_first_character_length": 0 + }, + { + "value": "ここ", + "position": 10, + "force_prefix": false, + "source_offset": 0, + "source_length": 6, + "source_first_character_length": 3 + }, + { + "value": "こは", + "position": 11, + "force_prefix": false, + "source_offset": 3, + "source_length": 6, + "source_first_character_length": 3 + }, + { + "value": "は株", + "position": 12, + "force_prefix": false, + "source_offset": 6, + "source_length": 6, + "source_first_character_length": 3 + }, + { + "value": "株グ", + "position": 13, + "force_prefix": false, + "source_offset": 9, + "source_length": 6, + "source_first_character_length": 3 + }, + { + "value": "グル", + "position": 14, + "force_prefix": false, + "source_offset": 12, + "source_length": 6, + "source_first_character_length": 3 + }, + { + "value": "ルン", + "position": 15, + "force_prefix": false, + "source_offset": 15, + "source_length": 6, + "source_first_character_length": 
3 + }, + { + "value": "ンガ", + "position": 16, + "force_prefix": false, + "source_offset": 18, + "source_length": 6, + "source_first_character_length": 3 + }, + { + "value": "ガ", + "position": 17, + "force_prefix": false, + "source_offset": 21, + "source_length": 3, + "source_first_character_length": 3 + } + ] +] Added: test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_kabu.test (+5 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_kabu.test 2018-06-26 15:09:40 +0900 (ac0999366) @@ -0,0 +1,5 @@ +tokenize \ + 'TokenNgram("loose_symbol", true, \ + "report_source_location", true)' \ + "ここは㈱グルンガ" \ + 'NormalizerNFKC100("report_source_offset", true)' -------------- next part -------------- HTMLの添付ファイルを保管しました... URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180626/4df8ecf1/attachment-0001.htm