Kouhei Sutou
null+****@clear*****
Sat May 9 23:14:59 JST 2015
Kouhei Sutou 2015-05-09 23:14:59 +0900 (Sat, 09 May 2015) New Revision: 1fdc0fd578ecbe4cb8928e9069291a3d729fbbd7 https://github.com/groonga/groonga/commit/1fdc0fd578ecbe4cb8928e9069291a3d729fbbd7 Message: TokenRegexp: don't ignore blank Added files: test/command/suite/tokenizers/regexp/add/normalizer/blank.expected test/command/suite/tokenizers/regexp/add/normalizer/blank.test Modified files: lib/tokenizers.c Modified: lib/tokenizers.c (+20 -2) =================================================================== --- lib/tokenizers.c 2015-05-09 21:13:11 +0900 (5578b53) +++ lib/tokenizers.c 2015-05-09 23:14:59 +0900 (f4bb58a) @@ -483,13 +483,15 @@ typedef struct { grn_bool is_overlapping; const char *next; const char *end; + unsigned int nth_char; + const uint_least8_t *char_types; grn_obj buffer; } grn_regexp_tokenizer; static grn_obj * regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) { - unsigned int normalize_flags = 0; + unsigned int normalize_flags = GRN_STRING_WITH_TYPES; grn_tokenizer_query *query; const char *normalized; unsigned int normalized_length_in_bytes; @@ -526,6 +528,9 @@ regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) NULL); tokenizer->next = normalized; tokenizer->end = tokenizer->next + normalized_length_in_bytes; + tokenizer->nth_char = 0; + tokenizer->char_types = + grn_string_get_types(ctx, tokenizer->query->normalized_query); if (tokenizer->query->tokenize_mode == GRN_TOKEN_GET) { unsigned int query_length = tokenizer->query->length; @@ -541,6 +546,7 @@ regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) encoding); tokenizer->next += grn_charlen_(ctx, tokenizer->next, tokenizer->end, encoding); + tokenizer->nth_char = 2; } if (query_string[query_length - 2] == '\\' && query_string[query_length - 1] == 'z') { @@ -576,8 +582,11 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) grn_obj *buffer = &(tokenizer->buffer); const char 
*current = tokenizer->next; const char *end = tokenizer->end; + const uint_least8_t *char_types = + tokenizer->char_types + tokenizer->nth_char; grn_tokenize_mode mode = tokenizer->query->tokenize_mode; grn_bool escaping = GRN_FALSE; + grn_bool break_by_blank = GRN_FALSE; GRN_BULK_REWIND(buffer); @@ -635,17 +644,26 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) char_len == 1 && current[0] == '\\') { current += char_len; escaping = GRN_TRUE; + char_types++; } else { + uint_least8_t char_type; n_characters++; GRN_TEXT_PUT(ctx, buffer, current, char_len); current += char_len; escaping = GRN_FALSE; if (n_characters == 1) { tokenizer->next = current; + tokenizer->nth_char++; } if (n_characters == ngram_unit) { break; } + char_type = char_types[0]; + char_types++; + if (GRN_STR_ISBLANK(char_type)) { + break_by_blank = GRN_TRUE; + break; + } } char_len = grn_charlen_(ctx, (const char *)current, (const char *)end, @@ -658,7 +676,7 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) if (tokenizer->is_overlapping) { status |= GRN_TOKEN_OVERLAP; } - if (n_characters < ngram_unit) { + if (n_characters < ngram_unit && !break_by_blank) { status |= GRN_TOKEN_UNMATURED; } tokenizer->is_overlapping = (n_characters > 1); Added: test/command/suite/tokenizers/regexp/add/normalizer/blank.expected (+52 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/regexp/add/normalizer/blank.expected 2015-05-09 23:14:59 +0900 (bd9b53e) @@ -0,0 +1,52 @@ +table_create Lexicon TABLE_PAT_KEY ShortText --default_tokenizer TokenRegexp --normalizer NormalizerAuto +[[0,0.0,0.0],true] +table_tokenize Lexicon "abcd\nefgh" --mode ADD +[ + [ + 0, + 0.0, + 0.0 + ], + [ + { + "value": "", + "position": 0 + }, + { + "value": "ab", + "position": 1 + }, + { + "value": "bc", + "position": 2 + }, + { + "value": "cd", + "position": 3 + }, + { + "value": "d", + "position": 4 
+ }, + { + "value": "ef", + "position": 5 + }, + { + "value": "fg", + "position": 6 + }, + { + "value": "gh", + "position": 7 + }, + { + "value": "h", + "position": 8 + }, + { + "value": "", + "position": 9 + } + ] +] Added: test/command/suite/tokenizers/regexp/add/normalizer/blank.test (+4 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/regexp/add/normalizer/blank.test 2015-05-09 23:14:59 +0900 (194183f) @@ -0,0 +1,4 @@ +table_create Lexicon TABLE_PAT_KEY ShortText \ + --default_tokenizer TokenRegexp \ + --normalizer NormalizerAuto +table_tokenize Lexicon "abcd\nefgh" --mode ADD -------------- next part -------------- An HTML attachment was scrubbed... Download