Kouhei Sutou
null+****@clear*****
Fri Mar 17 15:01:19 JST 2017
Kouhei Sutou 2017-03-17 15:01:19 +0900 (Fri, 17 Mar 2017) New Revision: c138b3844ef504e133ba64cb95aac311c235d446 https://github.com/groonga/groonga/commit/c138b3844ef504e133ba64cb95aac311c235d446 Message: highlight_html: don't use too much appeared tokens Modified files: lib/expr.c test/command/suite/select/function/highlight_html/similar_search.expected test/command/suite/select/function/highlight_html/similar_search.test Modified: lib/expr.c (+10 -0) =================================================================== --- lib/expr.c 2017-03-17 14:00:24 +0900 (ee60806) +++ lib/expr.c 2017-03-17 15:01:19 +0900 (a7236c1) @@ -8754,12 +8754,22 @@ grn_expr_get_keywords(grn_ctx *ctx, grn_obj *expr, grn_obj *keywords) GRN_TOKENIZE_GET, token_flags); if (token_cursor) { + grn_obj *source_table; + uint32_t n_records_threshold; + source_table = grn_ctx_at(ctx, grn_obj_get_range(ctx, index)); + n_records_threshold = grn_table_size(ctx, source_table) / 2; while (token_cursor->status != GRN_TOKEN_CURSOR_DONE) { grn_id token_id; + uint32_t n_estimated_records; token_id = grn_token_cursor_next(ctx, token_cursor); if (token_id == GRN_ID_NIL) { continue; } + n_estimated_records = + grn_ii_estimate_size(ctx, (grn_ii *)index, token_id); + if (n_estimated_records >= n_records_threshold) { + continue; + } grn_vector_add_element(ctx, keywords, token_cursor->curr, Modified: test/command/suite/select/function/highlight_html/similar_search.expected (+7 -12) =================================================================== --- test/command/suite/select/function/highlight_html/similar_search.expected 2017-03-17 14:00:24 +0900 (7633b78) +++ test/command/suite/select/function/highlight_html/similar_search.expected 2017-03-17 15:01:19 +0900 (7a53203) @@ -1,26 +1,21 @@ -plugin_register token_filters/stop_word -[[0,0.0,0.0],true] table_create Entries TABLE_NO_KEY [[0,0.0,0.0],true] column_create Entries body COLUMN_SCALAR ShortText [[0,0.0,0.0],true] -table_create Terms TABLE_PAT_KEY ShortText 
--default_tokenizer TokenBigram --normalizer NormalizerAuto --token_filters TokenFilterStopWord +table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer TokenBigram --normalizer NormalizerAuto [[0,0.0,0.0],true] column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body [[0,0.0,0.0],true] column_create Terms is_stop_word COLUMN_SCALAR Bool [[0,0.0,0.0],true] -load --table Terms -[ -{"_key": "is", "is_stop_word": true}, -{"_key": ".", "is_stop_word": true} -] -[[0,0.0,0.0],2] load --table Entries [ -{"body": "Mroonga is a MySQL storage engine based on Groonga. <b>Rroonga</b> is a Ruby binding of Groonga."} +{"body": "Mroonga is a MySQL storage engine based on Groonga. <b>Rroonga</b> is a Ruby binding of Groonga."}, +{"body": "It is a pen."}, +{"body": "I am a boy."}, +{"body": "This is good."} ] -[[0,0.0,0.0],1] +[[0,0.0,0.0],4] select Entries --filter 'body *S "Groonga is fast full text search engine. There are SQL interfaces by Mroonga and PGroonga and Ruby interface by Rroonga."' --output_columns 'highlight_html(body)' [ [ @@ -40,7 +35,7 @@ select Entries --filter 'body *S "Groonga is fast full text search engine. The ] ], [ - "<span class=\"keyword\">Mroonga</span> is a MySQL storage <span class=\"keyword\">engine</span> based on <span class=\"keyword\">Groonga</span>. <b><span class=\"keyword\">Rroonga</span></b> is a <span class=\"keyword\">Ruby</span> binding of <span class=\"keyword\">Groonga</span>." + "<span class=\"keyword\">Mroonga</span> is a MySQL storage <span class=\"keyword\">engine</span> based on Groonga. <b><span class=\"keyword\">Rroonga</span></b> is a <span class=\"keyword\">Ruby</span> binding of Groonga." 
] ] ] Modified: test/command/suite/select/function/highlight_html/similar_search.test (+5 -11) =================================================================== --- test/command/suite/select/function/highlight_html/similar_search.test 2017-03-17 14:00:24 +0900 (68e7563) +++ test/command/suite/select/function/highlight_html/similar_search.test 2017-03-17 15:01:19 +0900 (253037c) @@ -1,24 +1,18 @@ -plugin_register token_filters/stop_word - table_create Entries TABLE_NO_KEY column_create Entries body COLUMN_SCALAR ShortText table_create Terms TABLE_PAT_KEY ShortText \ --default_tokenizer TokenBigram \ - --normalizer NormalizerAuto \ - --token_filters TokenFilterStopWord + --normalizer NormalizerAuto column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body column_create Terms is_stop_word COLUMN_SCALAR Bool -load --table Terms -[ -{"_key": "is", "is_stop_word": true}, -{"_key": ".", "is_stop_word": true} -] - load --table Entries [ -{"body": "Mroonga is a MySQL storage engine based on Groonga. <b>Rroonga</b> is a Ruby binding of Groonga."} +{"body": "Mroonga is a MySQL storage engine based on Groonga. <b>Rroonga</b> is a Ruby binding of Groonga."}, +{"body": "It is a pen."}, +{"body": "I am a boy."}, +{"body": "This is good."} ] select Entries \ -------------- next part -------------- HTMLの添付ファイルを保管しました...Download