Kouhei Sutou
null+****@clear*****
Fri Apr 6 15:55:32 JST 2018
Kouhei Sutou 2018-04-06 15:55:32 +0900 (Fri, 06 Apr 2018) New Revision: 4227a6cde35090d8e2f8766dd2e26b82196f1b1d https://github.com/groonga/groonga/commit/4227a6cde35090d8e2f8766dd2e26b82196f1b1d Message: TokenNgram: add "loose_blank" option Added files: test/command/suite/table_create/default_tokenizer/ngram/options/loose.expected test/command/suite/table_create/default_tokenizer/ngram/options/loose.test test/command/suite/tokenizers/ngram/loose_blank/add.expected test/command/suite/tokenizers/ngram/loose_blank/add.test test/command/suite/tokenizers/ngram/loose_blank/get.expected test/command/suite/tokenizers/ngram/loose_blank/get.test Modified files: lib/tokenizers.c Modified: lib/tokenizers.c (+31 -9) =================================================================== --- lib/tokenizers.c 2018-04-06 15:55:03 +0900 (405ffba5a) +++ lib/tokenizers.c 2018-04-06 15:55:32 +0900 (b9f264739) @@ -248,6 +248,7 @@ typedef struct { grn_bool ignore_blank; grn_bool remove_blank; grn_bool loose_symbol; + grn_bool loose_blank; } grn_ngram_options; typedef struct { @@ -281,6 +282,7 @@ ngram_options_init(grn_ngram_options *options, uint8_t unit) options->ignore_blank = GRN_FALSE; options->remove_blank = grn_ngram_tokenizer_remove_blank_enable; options->loose_symbol = GRN_FALSE; + options->loose_blank = GRN_FALSE; } static void @@ -321,10 +323,16 @@ ngram_switch_to_loose_mode(grn_ctx *ctx, if (length == 0) { break; } - if (!(tokenizer->options.loose_symbol && - GRN_STR_CTYPE(*types) == GRN_CHAR_SYMBOL)) { + if (!((tokenizer->options.loose_symbol && + GRN_STR_CTYPE(*types) == GRN_CHAR_SYMBOL) || + (!tokenizer->options.remove_blank && + tokenizer->options.loose_blank && + GRN_STR_ISBLANK(*types)))) { GRN_TEXT_PUT(ctx, &(tokenizer->loose.text), normalized, length); *loose_types = *types; + if (tokenizer->options.loose_blank && GRN_STR_ISBLANK(*types)) { + *loose_types &= ~GRN_STR_BLANK; + } loose_types++; } normalized += length; @@ -540,6 +548,11 @@ ngram_open_options(grn_ctx *ctx, 
raw_options, i, options->loose_symbol); + } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "loose_blank")) { + options->loose_blank = grn_vector_get_element_bool(ctx, + raw_options, + i, + options->loose_blank); } } GRN_OPTION_VALUES_EACH_END(); @@ -592,13 +605,18 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) return NULL; } - if (cp && - !tokenizer->loose.ing && - !tokenizer->loose.need && - tokenizer->options.loose_symbol && - GRN_STR_CTYPE(*cp) == GRN_CHAR_SYMBOL) { - tokenizer->loose.need = GRN_TRUE; - } +#define LOOSE_NEED_CHECK(cp, tokenizer) do { \ + if (cp && \ + !tokenizer->loose.ing && \ + !tokenizer->loose.need && \ + ((tokenizer->options.loose_symbol && \ + GRN_STR_CTYPE(*cp) == GRN_CHAR_SYMBOL) || \ + (tokenizer->options.loose_blank && GRN_STR_ISBLANK(*cp)))) { \ + tokenizer->loose.need = GRN_TRUE; \ + } \ + } while (GRN_FALSE) + + LOOSE_NEED_CHECK(cp, tokenizer); if (cp && tokenizer->options.uni_alpha && GRN_STR_CTYPE(*cp) == GRN_CHAR_ALPHA) { @@ -606,6 +624,7 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) tokenizer->query->encoding))) { len++; r += cl; + LOOSE_NEED_CHECK(cp, tokenizer); if (/* !tokenizer->options.ignore_blank && */ GRN_STR_ISBLANK(*cp)) { break; } if (GRN_STR_CTYPE(*++cp) != GRN_CHAR_ALPHA) { break; } } @@ -618,6 +637,7 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) tokenizer->query->encoding))) { len++; r += cl; + LOOSE_NEED_CHECK(cp, tokenizer); if (/* !tokenizer->options.ignore_blank && */ GRN_STR_ISBLANK(*cp)) { break; } if (GRN_STR_CTYPE(*++cp) != GRN_CHAR_DIGIT) { break; } } @@ -630,6 +650,7 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) tokenizer->query->encoding))) { len++; r += cl; + LOOSE_NEED_CHECK(cp, tokenizer); if (!tokenizer->options.ignore_blank && GRN_STR_ISBLANK(*cp)) { break; } if (GRN_STR_CTYPE(*++cp) != GRN_CHAR_SYMBOL) { break; } } @@ -664,6 +685,7 @@ ngram_next(grn_ctx *ctx, 
int nargs, grn_obj **args, grn_user_data *user_data) (cl = grn_charlen_(ctx, (char *)r, (char *)e, tokenizer->query->encoding))) { if (cp) { + LOOSE_NEED_CHECK(cp, tokenizer); if (!tokenizer->options.ignore_blank && GRN_STR_ISBLANK(*cp)) { break; } cp++; if ((tokenizer->options.uni_alpha && Added: test/command/suite/table_create/default_tokenizer/ngram/options/loose.expected (+317 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/table_create/default_tokenizer/ngram/options/loose.expected 2018-04-06 15:55:32 +0900 (fc395e84e) @@ -0,0 +1,317 @@ +table_create Memos TABLE_NO_KEY +[[0,0.0,0.0],true] +column_create Memos tel COLUMN_SCALAR ShortText +[[0,0.0,0.0],true] +table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer 'TokenNgram("loose_symbol", true, "loose_blank", true)' --normalizer NormalizerAuto +[[0,0.0,0.0],true] +column_create Terms memos_tel COLUMN_INDEX Memos tel +[[0,0.0,0.0],true] +load --table Memos +[ +{"tel": "03-5632-7432"}, +{"tel": "03 5632 7432"}, +{"tel": "(03)5632-7432"}, +{"tel": "0356327432"}, +{"tel": "03-7432-5632"} +] +[[0,0.0,0.0],5] +select Terms --output_columns _key --limit -1 +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 9 + ], + [ + [ + "_key", + "ShortText" + ] + ], + [ + "(" + ], + [ + ")" + ], + [ + "-" + ], + [ + "03" + ], + [ + "0356327432" + ], + [ + "0374325632" + ], + [ + "5632" + ], + [ + "7432" + ], + [ + "" + ] + ] + ] +] +select Memos --match_columns tel --query '0356327432' +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 4 + ], + [ + [ + "_id", + "UInt32" + ], + [ + "tel", + "ShortText" + ] + ], + [ + 1, + "03-5632-7432" + ], + [ + 2, + "03 5632 7432" + ], + [ + 3, + "(03)5632-7432" + ], + [ + 4, + "0356327432" + ] + ] + ] +] +select Memos --match_columns tel --query '"03-5632-7432"' +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 4 + ], + [ + [ + "_id", + "UInt32" + ], + [ + "tel", + "ShortText" + ] + ], + [ + 1, + "03-5632-7432" + ], + [ + 2, + "03 5632 
7432" + ], + [ + 3, + "(03)5632-7432" + ], + [ + 4, + "0356327432" + ] + ] + ] +] +select Memos --match_columns tel --query '"03 5632 7432"' +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 4 + ], + [ + [ + "_id", + "UInt32" + ], + [ + "tel", + "ShortText" + ] + ], + [ + 1, + "03-5632-7432" + ], + [ + 2, + "03 5632 7432" + ], + [ + 3, + "(03)5632-7432" + ], + [ + 4, + "0356327432" + ] + ] + ] +] +select Memos --match_columns tel --query '5632' +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 4 + ], + [ + [ + "_id", + "UInt32" + ], + [ + "tel", + "ShortText" + ] + ], + [ + 1, + "03-5632-7432" + ], + [ + 2, + "03 5632 7432" + ], + [ + 3, + "(03)5632-7432" + ], + [ + 5, + "03-7432-5632" + ] + ] + ] +] +select Memos --match_columns tel --query '32' +[[0,0.0,0.0],[[[0],[["_id","UInt32"],["tel","ShortText"]]]]] +select Memos --match_columns tel --query '0' +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 5 + ], + [ + [ + "_id", + "UInt32" + ], + [ + "tel", + "ShortText" + ] + ], + [ + 1, + "03-5632-7432" + ], + [ + 2, + "03 5632 7432" + ], + [ + 3, + "(03)5632-7432" + ], + [ + 4, + "0356327432" + ], + [ + 5, + "03-7432-5632" + ] + ] + ] +] +select Memos --match_columns tel --query '03' +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 4 + ], + [ + [ + "_id", + "UInt32" + ], + [ + "tel", + "ShortText" + ] + ], + [ + 1, + "03-5632-7432" + ], + [ + 2, + "03 5632 7432" + ], + [ + 3, + "(03)5632-7432" + ], + [ + 5, + "03-7432-5632" + ] + ] + ] +] Added: test/command/suite/table_create/default_tokenizer/ngram/options/loose.test (+26 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/table_create/default_tokenizer/ngram/options/loose.test 2018-04-06 15:55:32 +0900 (266ae166a) @@ -0,0 +1,26 @@ +table_create Memos TABLE_NO_KEY +column_create Memos tel COLUMN_SCALAR ShortText + +table_create Terms TABLE_PAT_KEY ShortText \ + --default_tokenizer 'TokenNgram("loose_symbol", true, "loose_blank", true)' \ + --normalizer NormalizerAuto 
+column_create Terms memos_tel COLUMN_INDEX Memos tel + +load --table Memos +[ +{"tel": "03-5632-7432"}, +{"tel": "03 5632 7432"}, +{"tel": "(03)5632-7432"}, +{"tel": "0356327432"}, +{"tel": "03-7432-5632"} +] + +select Terms --output_columns _key --limit -1 + +select Memos --match_columns tel --query '0356327432' +select Memos --match_columns tel --query '"03-5632-7432"' +select Memos --match_columns tel --query '"03 5632 7432"' +select Memos --match_columns tel --query '5632' +select Memos --match_columns tel --query '32' +select Memos --match_columns tel --query '0' +select Memos --match_columns tel --query '03' Added: test/command/suite/tokenizers/ngram/loose_blank/add.expected (+35 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/ngram/loose_blank/add.expected 2018-04-06 15:55:32 +0900 (e3e668ae7) @@ -0,0 +1,35 @@ +tokenize 'TokenNgram("loose_blank", true)' "090 1234 5678" NormalizerAuto --mode ADD +[ + [ + 0, + 0.0, + 0.0 + ], + [ + { + "value": "090", + "position": 0, + "force_prefix": false + }, + { + "value": "1234", + "position": 1, + "force_prefix": false + }, + { + "value": "5678", + "position": 2, + "force_prefix": false + }, + { + "value": "", + "position": 3, + "force_prefix": false + }, + { + "value": "09012345678", + "position": 4, + "force_prefix": false + } + ] +] Added: test/command/suite/tokenizers/ngram/loose_blank/add.test (+5 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/ngram/loose_blank/add.test 2018-04-06 15:55:32 +0900 (57310f54b) @@ -0,0 +1,5 @@ +tokenize \ + 'TokenNgram("loose_blank", true)' \ + "090 1234 5678" \ + NormalizerAuto \ + --mode ADD Added: test/command/suite/tokenizers/ngram/loose_blank/get.expected (+2 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/ngram/loose_blank/get.expected 
2018-04-06 15:55:32 +0900 (069d4e464) @@ -0,0 +1,2 @@ +tokenize 'TokenNgram("loose_blank", true)' "090 1234 5678" NormalizerAuto --mode GET +[[0,0.0,0.0],[{"value":"09012345678","position":0,"force_prefix":false}]] Added: test/command/suite/tokenizers/ngram/loose_blank/get.test (+5 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/ngram/loose_blank/get.test 2018-04-06 15:55:32 +0900 (2c0fee207) @@ -0,0 +1,5 @@ +tokenize \ + 'TokenNgram("loose_blank", true)' \ + "090 1234 5678" \ + NormalizerAuto \ + --mode GET -------------- next part -------------- HTMLの添付ファイルを保管しました... URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180406/1b5528a7/attachment-0001.htm