Kouhei Sutou
null+****@clear*****
Sun May 10 00:05:08 JST 2015
Kouhei Sutou 2015-05-10 00:05:08 +0900 (Sun, 10 May 2015) New Revision: 8f4852d9ae082eb595f17d52e3348b0262099577 https://github.com/groonga/groonga/commit/8f4852d9ae082eb595f17d52e3348b0262099577 Message: TokenRegexp: don't require character types to normalizer Modified files: lib/tokenizers.c Modified: lib/tokenizers.c (+16 -9) =================================================================== --- lib/tokenizers.c 2015-05-10 00:02:05 +0900 (e983ef1) +++ lib/tokenizers.c 2015-05-10 00:05:08 +0900 (8e83da2) @@ -582,8 +582,7 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) grn_obj *buffer = &(tokenizer->buffer); const char *current = tokenizer->next; const char *end = tokenizer->end; - const const uint_least8_t *char_types = - tokenizer->char_types + tokenizer->nth_char; + const const uint_least8_t *char_types = tokenizer->char_types; grn_tokenize_mode mode = tokenizer->query->tokenize_mode; grn_bool is_first_token = tokenizer->is_first_token; grn_bool escaping = GRN_FALSE; @@ -592,6 +591,10 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) GRN_BULK_REWIND(buffer); tokenizer->is_first_token = GRN_FALSE; + if (char_types) { + char_types += tokenizer->nth_char; + } + if (mode == GRN_TOKEN_GET) { if (tokenizer->get.have_begin) { grn_tokenizer_token_push(ctx, @@ -646,9 +649,10 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) char_len == 1 && current[0] == '\\') { current += char_len; escaping = GRN_TRUE; - char_types++; + if (char_types) { + char_types++; + } } else { - uint_least8_t char_type; n_characters++; GRN_TEXT_PUT(ctx, buffer, current, char_len); current += char_len; @@ -660,11 +664,14 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) } } escaping = GRN_FALSE; - char_type = char_types[0]; - char_types++; - if (GRN_STR_ISBLANK(char_type)) { - break_by_blank = GRN_TRUE; - break; + if (char_types) { + uint_least8_t char_type; + 
char_type = char_types[0]; + char_types++; + if (GRN_STR_ISBLANK(char_type)) { + break_by_blank = GRN_TRUE; + break; + } } if (n_characters == ngram_unit) { break; -------------- next part -------------- HTMLの添付ファイルを保管しました...Download