Kouhei Sutou
null+****@clear*****
Sat Dec 9 17:00:27 JST 2017
Kouhei Sutou 2017-12-09 17:00:27 +0900 (Sat, 09 Dec 2017) New Revision: b7b522a1703688ceb5ca448f814bed04c4f3f8f6 https://github.com/groonga/groonga/commit/b7b522a1703688ceb5ca448f814bed04c4f3f8f6 Message: Partially support keyword extraction from regexp search GitHub: fix #787 It enables highlight_html and snippet_html for regexp search. Reported by takagi01. Thanks!!! Added files: test/command/suite/select/function/highlight_html/regexp.expected test/command/suite/select/function/highlight_html/regexp.test Modified files: lib/expr.c Modified: lib/expr.c (+98 -0) =================================================================== --- lib/expr.c 2017-12-08 19:16:48 +0900 (0c38e0ae2) +++ lib/expr.c 2017-12-09 17:00:27 +0900 (738948c51) @@ -9076,6 +9076,94 @@ grn_expr_syntax_expand_query_by_table(grn_ctx *ctx, GRN_API_RETURN(ctx->rc); } +/* + * TODO: It's very loose implementations. It just splits regexp by + * meta characters to extract keywords. For example, "r.*nga" has "r" + * and "nga" keywords. + */ +static void +grn_expr_get_keywords_regexp(grn_ctx *ctx, grn_obj *keywords, grn_obj *regexp) +{ + const char *regexp_raw; + const char *regexp_raw_end; + grn_bool escaping = GRN_FALSE; + grn_obj keyword; + + regexp_raw = GRN_TEXT_VALUE(regexp); + regexp_raw_end = regexp_raw + GRN_TEXT_LEN(regexp); + + GRN_TEXT_INIT(&keyword, 0); + while (regexp_raw < regexp_raw_end) { + unsigned int char_len; + + char_len = grn_charlen(ctx, regexp_raw, regexp_raw_end); + + if (char_len == 1) { + if (escaping) { + escaping = GRN_FALSE; + switch (regexp_raw[0]) { + case 'A' : + case 'z' : + if (GRN_TEXT_LEN(&keyword) > 0) { + grn_vector_add_element(ctx, + keywords, + GRN_TEXT_VALUE(&keyword), + GRN_TEXT_LEN(&keyword), + 0, + GRN_DB_TEXT); + GRN_BULK_REWIND(&keyword); + } + break; + default : + GRN_TEXT_PUTC(ctx, &keyword, regexp_raw[0]); + break; + } + } else { + switch (regexp_raw[0]) { + case '.' 
: + escaping = GRN_FALSE; + break; + case '*' : + escaping = GRN_FALSE; + if (GRN_TEXT_LEN(&keyword) > 0) { + grn_vector_add_element(ctx, + keywords, + GRN_TEXT_VALUE(&keyword), + GRN_TEXT_LEN(&keyword), + 0, + GRN_DB_TEXT); + GRN_BULK_REWIND(&keyword); + } + break; + case '\\' : + escaping = GRN_TRUE; + break; + default : + escaping = GRN_FALSE; + GRN_TEXT_PUTC(ctx, &keyword, regexp_raw[0]); + break; + } + } + } else { + escaping = GRN_FALSE; + GRN_TEXT_PUT(ctx, &keyword, regexp_raw, char_len); + } + + regexp_raw += char_len; + } + + if (GRN_TEXT_LEN(&keyword) > 0) { + grn_vector_add_element(ctx, + keywords, + GRN_TEXT_VALUE(&keyword), + GRN_TEXT_LEN(&keyword), + 0, + GRN_DB_TEXT); + } + + GRN_OBJ_FIN(ctx, &keyword); +} + grn_rc grn_expr_get_keywords(grn_ctx *ctx, grn_obj *expr, grn_obj *keywords) { @@ -9111,6 +9199,16 @@ grn_expr_get_keywords(grn_ctx *ctx, grn_obj *expr, grn_obj *keywords) GRN_DB_TEXT); } break; + case GRN_OP_REGEXP : + /* TODO: It should be refined. */ + if (is_index_searchable_regexp(ctx, si->query)) { + if (keywords->header.type == GRN_PVECTOR) { + GRN_PTR_PUT(ctx, keywords, si->query); + } else { + grn_expr_get_keywords_regexp(ctx, keywords, si->query); + } + } + break; case GRN_OP_SIMILAR : if (keywords->header.type == GRN_VECTOR && GRN_BULK_VSIZE(&(si->index)) > 0) { Added: test/command/suite/select/function/highlight_html/regexp.expected (+37 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/function/highlight_html/regexp.expected 2017-12-09 17:00:27 +0900 (265cb9d70) @@ -0,0 +1,37 @@ +table_create Entries TABLE_NO_KEY +[[0,0.0,0.0],true] +column_create Entries body COLUMN_SCALAR ShortText +[[0,0.0,0.0],true] +table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer TokenRegexp --normalizer NormalizerAuto +[[0,0.0,0.0],true] +column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body +[[0,0.0,0.0],true] +load --table Entries +[ +{"body": "Mroonga 
is a MySQL storage engine based on Groonga. <b>Rroonga</b> is a Ruby binding of Groonga."} +] +[[0,0.0,0.0],1] +select Entries --filter "body @~ 'ro.*ga'" --output_columns "highlight_html(body)" +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 1 + ], + [ + [ + "highlight_html", + null + ] + ], + [ + "M<span class=\"keyword\">ro</span>on<span class=\"keyword\">ga</span> is a MySQL storage engine based on G<span class=\"keyword\">ro</span>on<span class=\"keyword\">ga</span>. <b>R<span class=\"keyword\">ro</span>on<span class=\"keyword\">ga</span></b> is a Ruby binding of G<span class=\"keyword\">ro</span>on<span class=\"keyword\">ga</span>." + ] + ] + ] +] Added: test/command/suite/select/function/highlight_html/regexp.test (+14 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/function/highlight_html/regexp.test 2017-12-09 17:00:27 +0900 (77eb47652) @@ -0,0 +1,14 @@ +table_create Entries TABLE_NO_KEY +column_create Entries body COLUMN_SCALAR ShortText + +table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer TokenRegexp --normalizer NormalizerAuto +column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body + +load --table Entries +[ +{"body": "Mroonga is a MySQL storage engine based on Groonga. <b>Rroonga</b> is a Ruby binding of Groonga."} +] + +select Entries \ + --filter "body @~ 'ro.*ga'" \ + --output_columns "highlight_html(body)" -------------- next part -------------- HTMLの添付ファイルを保管しました... URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20171209/ec455164/attachment-0001.htm