Kouhei Sutou
null+****@clear*****
Fri Nov 9 13:38:10 JST 2012
Kouhei Sutou 2012-11-09 13:38:10 +0900 (Fri, 09 Nov 2012) New Revision: d217a84b1540ab826c232bd81aa99e5b0b32d4ce https://github.com/groonga/groonga/commit/d217a84b1540ab826c232bd81aa99e5b0b32d4ce Log: Add GRN_STRING_REMOVE_TOKENIZER_DELIMITER flag It is a flag for removing tokenizer delimiter (U+FFFE) from string by normalization. Modified files: include/groonga.h lib/string.c test/unit/util/test-string.c Modified: include/groonga.h (+4 -3) =================================================================== --- include/groonga.h 2012-11-09 13:07:31 +0900 (d83e1db) +++ include/groonga.h 2012-11-09 13:38:10 +0900 (74704b8) @@ -2547,9 +2547,10 @@ GRN_API grn_rc grn_str_close(grn_ctx *ctx, grn_str *nstr); /* grn_string */ -#define GRN_STRING_REMOVE_BLANK (0x01<<0) -#define GRN_STRING_WITH_TYPES (0x01<<1) -#define GRN_STRING_WITH_CHECKS (0x01<<2) +#define GRN_STRING_REMOVE_BLANK (0x01<<0) +#define GRN_STRING_WITH_TYPES (0x01<<1) +#define GRN_STRING_WITH_CHECKS (0x01<<2) +#define GRN_STRING_REMOVE_TOKENIZER_DELIMITER (0x01<<3) #define GRN_NORMALIZER_AUTO ((grn_obj *)1) Modified: lib/string.c (+32 -3) =================================================================== --- lib/string.c 2012-11-09 13:07:31 +0900 (0f203bb) +++ lib/string.c 2012-11-09 13:38:10 +0900 (0f90636) @@ -21,6 +21,8 @@ #include "string_in.h" #include "str.h" +#include <groonga/tokenizer.h> + static unsigned char symbol[] = { ',', '.', 0, ':', ';', '?', '!', 0, 0, 0, '`', 0, '^', '~', '_', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '-', '-', '/', '\\', 0, 0, '|', 0, 0, 0, '\'', 0, @@ -557,6 +559,8 @@ utf8_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data grn_string *nstr = (grn_string *)args[0]; size_t length = 0, ls, lp, size = nstr->original_length_in_bytes, ds = size * 3; int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK; + grn_bool remove_tokenizer_delimiter_p = + nstr->flags & GRN_STRING_REMOVE_TOKENIZER_DELIMITER; if (!(nstr->normalized = GRN_MALLOC(ds + 1))) { 
ERR(GRN_NO_MEMORY_AVAILABLE, "[strinig][utf8] failed to allocate normalized text space"); @@ -590,6 +594,10 @@ utf8_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data if (!(ls = grn_str_charlen_utf8(ctx, s, e))) { break; } + if (remove_tokenizer_delimiter_p && + grn_tokenizer_is_delimiter(ctx, s, ls, GRN_ENC_UTF8)) { + continue; + } if ((p = (unsigned char *)grn_nfkc_map1(s))) { pe = p + strlen((char *)p); } else { @@ -1068,9 +1076,30 @@ grn_fake_string_open(grn_ctx *ctx, grn_string *string) return NULL; } - memcpy(nstr->normalized, str, str_len); - nstr->normalized[str_len] = '\0'; - nstr->normalized_length_in_bytes = str_len; + if (nstr->flags & GRN_STRING_REMOVE_TOKENIZER_DELIMITER && + ctx->encoding == GRN_ENC_UTF8) { + int char_length; + const char *source_current = str; + const char *source_end = str + str_len; + char *destination = nstr->normalized; + unsigned int destination_length = 0; + while ((char_length = grn_charlen(ctx, source_current, source_end)) > 0) { + if (!grn_tokenizer_is_delimiter(ctx, + source_current, char_length, + ctx->encoding)) { + memcpy(destination, source_current, char_length); + destination += char_length; + destination_length += char_length; + } + source_current += char_length; + } + destination[destination_length] = '\0'; + nstr->normalized_length_in_bytes = destination_length; + } else { + memcpy(nstr->normalized, str, str_len); + nstr->normalized[str_len] = '\0'; + nstr->normalized_length_in_bytes = str_len; + } if (nstr->flags & GRN_STRING_WITH_CHECKS) { int16_t f = 0; Modified: test/unit/util/test-string.c (+65 -0) =================================================================== --- test/unit/util/test-string.c 2012-11-09 13:07:31 +0900 (2c87da4) +++ test/unit/util/test-string.c 2012-11-09 13:38:10 +0900 (81d0a27) @@ -31,6 +31,8 @@ void data_normalize(void); void test_normalize(gconstpointer data); void data_normalize_broken(void); void test_normalize_broken(gconstpointer data); +void 
data_remove_tokenizer_delimiter(void); +void test_remove_tokenizer_delimiter(gconstpointer data); void data_charlen_broken(void); void test_charlen_broken(gconstpointer data); void data_urlenc(void); @@ -261,6 +263,69 @@ test_normalize_broken(gconstpointer data) } void +data_remove_tokenizer_delimiter(void) +{ +#define ADD_DATUM(label, expected, input, flags) \ + gcut_add_datum(label, \ + "expected", G_TYPE_STRING, expected, \ + "input", G_TYPE_STRING, input, \ + "flags", G_TYPE_INT, flags, \ + NULL) + +#define UFFFE_IN_UTF8 "\xef\xbf\xbe" + + ADD_DATUM("normalize", + "abあい", + UFFFE_IN_UTF8 "A" + UFFFE_IN_UTF8 "B" + UFFFE_IN_UTF8 "あ" + UFFFE_IN_UTF8 "い" + UFFFE_IN_UTF8, + GRN_OBJ_KEY_NORMALIZE); + ADD_DATUM("not normalize", + "ABあい", + UFFFE_IN_UTF8 "A" + UFFFE_IN_UTF8 "B" + UFFFE_IN_UTF8 "あ" + UFFFE_IN_UTF8 "い" + UFFFE_IN_UTF8, + 0); + +#undef UFFFE_IN_UTF8 + +#undef ADD_DATUM +} + +void +test_remove_tokenizer_delimiter(gconstpointer data) +{ + grn_obj *string; + grn_obj *normalizer = NULL; + const gchar *expected; + const gchar *input; + const gchar *normalized; + unsigned int length_in_bytes; + int flags = GRN_STRING_REMOVE_TOKENIZER_DELIMITER; + + GRN_CTX_SET_ENCODING(&context, GRN_ENC_UTF8); + + input = gcut_data_get_string(data, "input"); + flags |= gcut_data_get_int(data, "flags"); + if (flags & GRN_OBJ_KEY_NORMALIZE) { + normalizer = GRN_NORMALIZER_AUTO; + } + + string = grn_string_open(&context, input, strlen(input), normalizer, flags); + grn_string_get_normalized(&context, string, + &normalized, &length_in_bytes, NULL); + normalized = cut_take_strndup(normalized, length_in_bytes); + grn_obj_unlink(&context, string); + + expected = gcut_data_get_string(data, "expected"); + cut_assert_equal_string(expected, normalized); +} + +void data_charlen_broken(void) { #define ADD_DATUM_WITH_ENCODING(label, input, input_length, encoding) \ -------------- next part -------------- HTMLの添付ファイルを保管しました...Download