Kouhei Sutou
null+****@clear*****
Fri Mar 10 15:00:36 JST 2017
Kouhei Sutou 2017-03-10 15:00:36 +0900 (Fri, 10 Mar 2017) New Revision: 074fcc08392d1dff4d10b2c69883137432ce6071 https://github.com/groonga/groonga/commit/074fcc08392d1dff4d10b2c69883137432ce6071 Message: grn_pat_scan: ignore blanks in target string. With this change, highlight family functions support highlighting text that has some blanks. See the test in this change for example. Added files: test/command/suite/select/function/highlight_html/space_in_target.expected test/command/suite/select/function/highlight_html/space_in_target.test Modified files: lib/pat.c Modified: lib/pat.c (+47 -8) =================================================================== --- lib/pat.c 2017-03-01 17:27:05 +0900 (0fa4da9) +++ lib/pat.c 2017-03-10 15:00:36 +0900 (5b168ff) @@ -2161,10 +2161,15 @@ grn_pat_scan(grn_ctx *ctx, grn_pat *pat, const char *str, unsigned int str_len, return 0; } if (pat->normalizer) { + int flags = + GRN_STRING_REMOVE_BLANK | + GRN_STRING_WITH_TYPES | + GRN_STRING_WITH_CHECKS; grn_obj *nstr = grn_string_open(ctx, str, str_len, - pat->normalizer, GRN_STRING_WITH_CHECKS); + pat->normalizer, flags); if (nstr) { const short *cp = grn_string_get_checks(ctx, nstr); + const unsigned char *tp = grn_string_get_types(ctx, nstr); unsigned int offset = 0, offset0 = 0; unsigned int normalized_length_in_bytes; const char *sp, *se; @@ -2173,18 +2178,52 @@ grn_pat_scan(grn_ctx *ctx, grn_pat *pat, const char *str, unsigned int str_len, se = sp + normalized_length_in_bytes; while (n < sh_size) { if ((tid = grn_pat_lcp_search(ctx, pat, sp, se - sp))) { + const char *key; uint32_t len; - _grn_pat_key(ctx, pat, tid, &len); + key = _grn_pat_key(ctx, pat, tid, &len); sh[n].id = tid; sh[n].offset = (*cp > 0) ? offset : offset0; - while (len--) { - if (*cp > 0) { offset0 = offset; offset += *cp; } - sp++; cp++; + if (sh[n].offset > 0 && + GRN_CHAR_IS_BLANK(tp[-1]) && + grn_charlen(ctx, key, key + len) == 1 && + key[0] != ' ') { + /* Remove leading spaces. 
*/ + const char *original_str = str + sh[n].offset; + while (grn_charlen(ctx, original_str, str + str_len) == 1 && + original_str[0] == ' ') { + original_str++; + sh[n].offset++; + } + } + { + grn_bool blank_in_alnum = GRN_FALSE; + const unsigned char *start_tp = tp; + const unsigned char *blank_in_alnum_check_tp; + while (len--) { + if (*cp > 0) { offset0 = offset; offset += *cp; tp++; } + sp++; cp++; + } + sh[n].length = offset - sh[n].offset; + for (blank_in_alnum_check_tp = start_tp + 1; + blank_in_alnum_check_tp < tp; + blank_in_alnum_check_tp++) { +#define GRN_CHAR_IS_ALNUM(char_type) \ + (GRN_CHAR_TYPE(char_type) == GRN_CHAR_ALPHA || \ + GRN_CHAR_TYPE(char_type) == GRN_CHAR_DIGIT) + if (GRN_CHAR_IS_BLANK(blank_in_alnum_check_tp[0]) && + GRN_CHAR_IS_ALNUM(blank_in_alnum_check_tp[-1]) && + (blank_in_alnum_check_tp + 1) < tp && + GRN_CHAR_IS_ALNUM(blank_in_alnum_check_tp[1])) { + blank_in_alnum = GRN_TRUE; + } +#undef GRN_CHAR_IS_ALNUM + } + if (!blank_in_alnum) { + n++; + } } - sh[n].length = offset - sh[n].offset; - n++; } else { - if (*cp > 0) { offset0 = offset; offset += *cp; } + if (*cp > 0) { offset0 = offset; offset += *cp; tp++; } do { sp++; cp++; } while (sp < se && !*cp); Added: test/command/suite/select/function/highlight_html/space_in_target.expected (+37 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/function/highlight_html/space_in_target.expected 2017-03-10 15:00:36 +0900 (30e3e0d) @@ -0,0 +1,37 @@ +table_create Entries TABLE_NO_KEY +[[0,0.0,0.0],true] +column_create Entries body COLUMN_SCALAR ShortText +[[0,0.0,0.0],true] +table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer TokenBigram --normalizer NormalizerAuto +[[0,0.0,0.0],true] +column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body +[[0,0.0,0.0],true] +load --table Entries +[ +{"body": "高速な Mroonga ストレージエンジン。 Mr oongaストレージ"} +] +[[0,0.0,0.0],1] +select Entries --output_columns 
--match_columns body --query 'Mroongaストレージ' --output_columns 'highlight_html(body)' +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 1 + ], + [ + [ + "highlight_html", + null + ] + ], + [ + "高速な <span class=\"keyword\">Mroonga ストレージ</span>エンジン。 Mr oongaストレージ" + ] + ] + ] +] Added: test/command/suite/select/function/highlight_html/space_in_target.test (+14 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/function/highlight_html/space_in_target.test 2017-03-10 15:00:36 +0900 (4a8cd04) @@ -0,0 +1,14 @@ +table_create Entries TABLE_NO_KEY +column_create Entries body COLUMN_SCALAR ShortText + +table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer TokenBigram --normalizer NormalizerAuto +column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body + +load --table Entries +[ +{"body": "高速な Mroonga ストレージエンジン。 Mr oongaストレージ"} +] + +select Entries --output_columns \ + --match_columns body --query 'Mroongaストレージ' \ + --output_columns 'highlight_html(body)' -------------- next part -------------- HTMLの添付ファイルを保管しました...Download