Kouhei Sutou
null+****@clear*****
Mon May 28 11:41:05 JST 2018
Kouhei Sutou 2018-05-28 11:41:05 +0900 (Mon, 28 May 2018) New Revision: e325b7571faaec9d5608774e8d9957a79dd9b706 https://github.com/groonga/groonga/commit/e325b7571faaec9d5608774e8d9957a79dd9b706 Message: NormalizerNFKC100: add report_source_offset option normalize command also supports "offsets" output. Added files: test/command/suite/normalizers/nfkc100/report_source_offset.expected test/command/suite/normalizers/nfkc100/report_source_offset.test Modified files: lib/normalizer.c lib/proc/proc_normalize.c Modified: lib/normalizer.c (+44 -0) =================================================================== --- lib/normalizer.c 2018-05-28 11:40:43 +0900 (0be649903) +++ lib/normalizer.c 2018-05-28 11:41:05 +0900 (fffbc90ad) @@ -621,6 +621,7 @@ typedef struct { grn_nfkc_decompose_func decompose_func; grn_nfkc_compose_func compose_func; grn_bool include_removed_source_location; + grn_bool report_source_offset; grn_bool unify_kana; grn_bool unify_kana_case; grn_bool unify_kana_voiced_sound_mark; @@ -642,6 +643,7 @@ utf8_normalize_options_init(grn_utf8_normalize_options *options, options->decompose_func = decompose_func; options->compose_func = compose_func; options->include_removed_source_location = GRN_TRUE; + options->report_source_offset = GRN_FALSE; options->unify_kana = GRN_FALSE; options->unify_kana_case = GRN_FALSE; options->unify_kana_voiced_sound_mark = GRN_FALSE; @@ -1088,6 +1090,7 @@ utf8_normalize(grn_ctx *ctx, const unsigned char *s, *s_, *s__ = NULL, *p, *p2, *pe, *e; unsigned char *d, *d_, *de; uint_least8_t *cp; + uint64_t *offsets; size_t length = 0, ls, lp, size = nstr->original_length_in_bytes, ds = size * 3; int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK; grn_bool remove_tokenized_delimiter_p = @@ -1117,6 +1120,17 @@ utf8_normalize(grn_ctx *ctx, } } cp = nstr->ctypes; + if (options->report_source_offset) { + if (!(nstr->offsets = GRN_MALLOC(ds + 1))) { + if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; } + if 
(nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; } + GRN_FREE(nstr->normalized); nstr->normalized = NULL; + ERR(GRN_NO_MEMORY_AVAILABLE, + "[string][utf8] failed to allocate offsets space"); + return NULL; + } + } + offsets = nstr->offsets; d = (unsigned char *)nstr->normalized; de = d + ds; d_ = NULL; @@ -1146,6 +1160,9 @@ utf8_normalize(grn_ctx *ctx, s_ = s__; } } + if (offsets) { + offsets--; + } d = d_; length--; } @@ -1169,6 +1186,7 @@ utf8_normalize(grn_ctx *ctx, if (!(normalized = GRN_REALLOC(nstr->normalized, ds + 1))) { if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; } if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; } + if (nstr->offsets) { GRN_FREE(nstr->offsets); nstr->offsets = NULL; } GRN_FREE(nstr->normalized); nstr->normalized = NULL; ERR(GRN_NO_MEMORY_AVAILABLE, "[string][utf8] failed to expand normalized text space"); @@ -1181,6 +1199,7 @@ utf8_normalize(grn_ctx *ctx, int16_t *checks; if (!(checks = GRN_REALLOC(nstr->checks, ds * sizeof(int16_t) + 1))) { if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; } + if (nstr->offsets) { GRN_FREE(nstr->offsets); nstr->offsets = NULL; } GRN_FREE(nstr->checks); nstr->checks = NULL; GRN_FREE(nstr->normalized); nstr->normalized = NULL; ERR(GRN_NO_MEMORY_AVAILABLE, @@ -1195,6 +1214,7 @@ utf8_normalize(grn_ctx *ctx, if (!(ctypes = GRN_REALLOC(nstr->ctypes, ds + 1))) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; } + if (nstr->offsets) { GRN_FREE(nstr->offsets); nstr->offsets = NULL; } GRN_FREE(nstr->normalized); nstr->normalized = NULL; ERR(GRN_NO_MEMORY_AVAILABLE, "[string][utf8] failed to expand character types space"); @@ -1203,6 +1223,20 @@ utf8_normalize(grn_ctx *ctx, cp = ctypes + (cp - nstr->ctypes); nstr->ctypes = ctypes; } + if (offsets) { + uint64_t *new_offsets; + if (!(new_offsets = GRN_REALLOC(nstr->offsets, ds + 1))) { + GRN_FREE(nstr->offsets); nstr->offsets = NULL; + if 
(nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; } + if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; } + GRN_FREE(nstr->normalized); nstr->normalized = NULL; + ERR(GRN_NO_MEMORY_AVAILABLE, + "[string][utf8] failed to expand offsets space"); + return NULL; + } + offsets = new_offsets + (offsets - nstr->offsets); + nstr->offsets = new_offsets; + } } { @@ -1326,12 +1360,16 @@ utf8_normalize(grn_ctx *ctx, } for (i = lp; i > 1; i--) { *ch++ = 0; } } + if (offsets) { + *offsets++ = (uint64_t)(s - (const unsigned char *)nstr->original); + } } lp = lp_original; } } } if (cp) { *cp = GRN_CHAR_NULL; } + if (offsets) { *offsets = nstr->original_length_in_bytes; } if (options->unify_katakana_v_sounds) { utf8_normalize_unify_katakana_v_sounds(NULL, 0, d_, d); } @@ -1792,6 +1830,12 @@ nfkc100_open_options(grn_ctx *ctx, raw_options, i, options->include_removed_source_location); + } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "report_source_offset")) { + options->report_source_offset = + grn_vector_get_element_bool(ctx, + raw_options, + i, + options->report_source_offset); } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "unify_kana")) { options->unify_kana = grn_vector_get_element_bool(ctx, raw_options, Modified: lib/proc/proc_normalize.c (+19 -1) =================================================================== --- lib/proc/proc_normalize.c 2018-05-28 11:40:43 +0900 (63f061e49) +++ lib/proc/proc_normalize.c 2018-05-28 11:41:05 +0900 (9f764ba88) @@ -93,10 +93,12 @@ command_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_d { int flags; + int n_elements = 3; grn_obj *lexicon; grn_obj *grn_string; unsigned int normalized_length_in_bytes; unsigned int normalized_n_characters; + const uint64_t *offsets; flags = parse_normalize_flags(ctx, &flags_raw); if (ctx->rc != GRN_SUCCESS) { @@ -118,7 +120,12 @@ command_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_d lexicon, flags); - 
grn_ctx_output_map_open(ctx, "RESULT", 3); + offsets = grn_string_get_offsets(ctx, grn_string); + if (offsets) { + n_elements++; + } + + grn_ctx_output_map_open(ctx, "RESULT", n_elements); { const char *normalized; @@ -164,6 +171,17 @@ command_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_d grn_ctx_output_array_close(ctx); } } + if (offsets) { + unsigned int i; + + grn_ctx_output_cstr(ctx, "offsets"); + grn_ctx_output_array_open(ctx, "offsets", normalized_n_characters); + for (i = 0; i < normalized_n_characters; i++) { + grn_ctx_output_uint64(ctx, offsets[i]); + } + grn_ctx_output_array_close(ctx); + } + grn_ctx_output_map_close(ctx); grn_obj_unlink(ctx, grn_string); Added: test/command/suite/normalizers/nfkc100/report_source_offset.expected (+26 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/normalizers/nfkc100/report_source_offset.expected 2018-05-28 11:41:05 +0900 (0ec059aab) @@ -0,0 +1,26 @@ +normalize 'NormalizerNFKC100("report_source_offset", true)' "( あ いうえお )" REMOVE_BLANK +[ + [ + 0, + 0.0, + 0.0 + ], + { + "normalized": "(あいうえお)", + "types": [ + + ], + "checks": [ + + ], + "offsets": [ + 0, + 2, + 7, + 10, + 13, + 16, + 20 + ] + } +] Added: test/command/suite/normalizers/nfkc100/report_source_offset.test (+4 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/normalizers/nfkc100/report_source_offset.test 2018-05-28 11:41:05 +0900 (d4938b98e) @@ -0,0 +1,4 @@ +normalize \ + 'NormalizerNFKC100("report_source_offset", true)' \ + "( あ いうえお )" \ + REMOVE_BLANK -------------- next part -------------- An HTML attachment was scrubbed... URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180528/b1d764b4/attachment-0001.htm