Kouhei Sutou
null+****@clear*****
Mon May 28 14:05:24 JST 2018
Kouhei Sutou 2018-05-28 14:05:24 +0900 (Mon, 28 May 2018) New Revision: ba79b939a7861e265539f6aa5e333ee03ad902aa https://github.com/groonga/groonga/commit/ba79b939a7861e265539f6aa5e333ee03ad902aa Message: TokenNgram: add a missing NULL check Added files: test/command/suite/tokenizers/ngram/report_source_location/include_removed_source_location.expected test/command/suite/tokenizers/ngram/report_source_location/include_removed_source_location.test Modified files: lib/tokenizers.c test/command/suite/tokenizers/ngram/report_source_location/loose_and_unify.expected test/command/suite/tokenizers/ngram/report_source_location/loose_and_unify.test Modified: lib/tokenizers.c (+22 -7) =================================================================== --- lib/tokenizers.c 2018-05-28 14:01:54 +0900 (8fd624ce0) +++ lib/tokenizers.c 2018-05-28 14:05:24 +0900 (6b3667d21) @@ -268,6 +268,7 @@ typedef struct { grn_bool loose_symbol; grn_bool loose_blank; grn_bool report_source_location; + grn_bool include_removed_source_location; } grn_ngram_options; typedef struct { @@ -309,6 +310,7 @@ ngram_options_init(grn_ngram_options *options, uint8_t unit) options->loose_symbol = GRN_FALSE; options->loose_blank = GRN_FALSE; options->report_source_location = GRN_FALSE; + options->include_removed_source_location = GRN_TRUE; } static void @@ -388,8 +390,10 @@ ngram_switch_to_loose_mode(grn_ctx *ctx, (!tokenizer->options.remove_blank && tokenizer->options.loose_blank && GRN_STR_ISBLANK(*types))) { - if (!removed_checks) { - removed_checks = checks; + if (tokenizer->options.include_removed_source_location) { + if (!removed_checks) { + removed_checks = checks; + } } if (offsets && last_offset == 0) { last_offset = *offsets; @@ -403,12 +407,14 @@ ngram_switch_to_loose_mode(grn_ctx *ctx, loose_types++; if (loose_checks) { size_t i; - for (; removed_checks && removed_checks < checks; removed_checks++) { - if (*removed_checks > 0) { - *loose_checks += *removed_checks; + if 
(tokenizer->options.include_removed_source_location) { + for (; removed_checks && removed_checks < checks; removed_checks++) { + if (*removed_checks > 0) { + *loose_checks += *removed_checks; + } } + removed_checks = NULL; } - removed_checks = NULL; for (i = 0; i < length; i++) { loose_checks[i] += checks[i]; } @@ -430,7 +436,9 @@ ngram_switch_to_loose_mode(grn_ctx *ctx, offsets++; } } - *loose_checks = *checks; + if (checks) { + *loose_checks = *checks; + } if (offsets) { if (last_offset) { *loose_offsets = last_offset; @@ -700,6 +708,13 @@ ngram_open_options(grn_ctx *ctx, raw_options, i, options->report_source_location); + } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, + "include_removed_source_location")) { + options->include_removed_source_location = + grn_vector_get_element_bool(ctx, + raw_options, + i, + options->include_removed_source_location); } } GRN_OPTION_VALUES_EACH_END(); Added: test/command/suite/tokenizers/ngram/report_source_location/include_removed_source_location.expected (+82 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/ngram/report_source_location/include_removed_source_location.expected 2018-05-28 14:05:24 +0900 (ad2d85f6d) @@ -0,0 +1,82 @@ +tokenize 'TokenNgram("report_source_location", true, "include_removed_source_location", false, "loose_symbol", true)' "090(1234)56−78" 'NormalizerNFKC100("include_removed_source_location", false, "report_source_offset", true)' +[ + [ + 0, + 0.0, + 0.0 + ], + [ + { + "value": "090", + "position": 0, + "force_prefix": false, + "source_offset": 0, + "source_length": 9, + "source_first_character_length": 3 + }, + { + "value": "(", + "position": 1, + "force_prefix": false, + "source_offset": 9, + "source_length": 3, + "source_first_character_length": 3 + }, + { + "value": "1234", + "position": 2, + "force_prefix": false, + "source_offset": 12, + "source_length": 8, + "source_first_character_length": 3 + }, + { + "value": ")", + 
"position": 3, + "force_prefix": false, + "source_offset": 20, + "source_length": 3, + "source_first_character_length": 3 + }, + { + "value": "56", + "position": 4, + "force_prefix": false, + "source_offset": 23, + "source_length": 4, + "source_first_character_length": 3 + }, + { + "value": "−", + "position": 5, + "force_prefix": false, + "source_offset": 27, + "source_length": 3, + "source_first_character_length": 3 + }, + { + "value": "78", + "position": 6, + "force_prefix": false, + "source_offset": 30, + "source_length": 6, + "source_first_character_length": 3 + }, + { + "value": "", + "position": 7, + "force_prefix": false, + "source_offset": 36, + "source_length": 0, + "source_first_character_length": 0 + }, + { + "value": "09012345678", + "position": 8, + "force_prefix": false, + "source_offset": 0, + "source_length": 36, + "source_first_character_length": 3 + } + ] +] Added: test/command/suite/tokenizers/ngram/report_source_location/include_removed_source_location.test (+7 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/ngram/report_source_location/include_removed_source_location.test 2018-05-28 14:05:24 +0900 (aafbe5592) @@ -0,0 +1,7 @@ +tokenize \ + 'TokenNgram("report_source_location", true, \ + "include_removed_source_location", false, \ + "loose_symbol", true)' \ + "090(1234)56−78" \ + 'NormalizerNFKC100("include_removed_source_location", false, \ + "report_source_offset", true)' Modified: test/command/suite/tokenizers/ngram/report_source_location/loose_and_unify.expected (+3 -3) =================================================================== --- test/command/suite/tokenizers/ngram/report_source_location/loose_and_unify.expected 2018-05-28 14:01:54 +0900 (63a079929) +++ test/command/suite/tokenizers/ngram/report_source_location/loose_and_unify.expected 2018-05-28 14:05:24 +0900 (496eefd29) @@ -1,4 +1,4 @@ -tokenize 'TokenNgram("loose_symbol", true, 
"report_source_location", true)' "[クリアコード]" 'NormalizerNFKC100("unify_hyphen_and_prolonged_sound_mark", true, "report_source_offset", true)' +tokenize 'TokenNgram("loose_symbol", true, "include_removed_source_location", false, "report_source_location", true)' "[クリアコード]" 'NormalizerNFKC100("unify_hyphen_and_prolonged_sound_mark", true, "include_removed_source_location", false, "report_source_offset", true)' [ [ 0, @@ -84,7 +84,7 @@ tokenize 'TokenNgram("loose_symbol", true, "report_source_locati "force_prefix": false, "source_offset": 1, "source_length": 6, - "source_first_character_length": 4 + "source_first_character_length": 3 }, { "value": "リア", @@ -116,7 +116,7 @@ tokenize 'TokenNgram("loose_symbol", true, "report_source_locati "force_prefix": false, "source_offset": 16, "source_length": 3, - "source_first_character_length": 6 + "source_first_character_length": 3 } ] ] Modified: test/command/suite/tokenizers/ngram/report_source_location/loose_and_unify.test (+2 -0) =================================================================== --- test/command/suite/tokenizers/ngram/report_source_location/loose_and_unify.test 2018-05-28 14:01:54 +0900 (f1e51e3b9) +++ test/command/suite/tokenizers/ngram/report_source_location/loose_and_unify.test 2018-05-28 14:05:24 +0900 (8338e1ecb) @@ -1,6 +1,8 @@ tokenize \ 'TokenNgram("loose_symbol", true, \ + "include_removed_source_location", false, \ "report_source_location", true)' \ "[クリアコード]" \ 'NormalizerNFKC100("unify_hyphen_and_prolonged_sound_mark", true, \ + "include_removed_source_location", false, \ "report_source_offset", true)' -------------- next part -------------- HTMLの添付ファイルを保管しました... URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180528/98c10bb4/attachment-0001.htm