Kouhei Sutou
null+****@clear*****
Wed Feb 18 22:17:18 JST 2015
Kouhei Sutou 2015-02-18 22:17:18 +0900 (Wed, 18 Feb 2015) New Revision: 116d2ef1ac27a32f14e740f26e3b91be75e56963 https://github.com/groonga/groonga/commit/116d2ef1ac27a32f14e740f26e3b91be75e56963 Message: scorer: support 1 argument scorer_tf_at_most() scorer is also added. It uses 1 argument. Added files: test/command/suite/select/match_columns/scorer/tf_at_most.expected test/command/suite/select/match_columns/scorer/tf_at_most.test Modified files: include/groonga/groonga.h include/groonga/scorer.h lib/expr.c lib/grn_expr.h lib/grn_ii.h lib/grn_scorer.h lib/ii.c lib/mrb/mrb_expr.c lib/mrb/scripts/scan_info.rb lib/mrb/scripts/scan_info_data.rb lib/scorer.c lib/scorers.c Modified: include/groonga/groonga.h (+2 -0) =================================================================== --- include/groonga/groonga.h 2015-02-18 18:42:09 +0900 (16bc424) +++ include/groonga/groonga.h 2015-02-18 22:17:18 +0900 (48fb97e) @@ -912,6 +912,8 @@ struct _grn_search_optarg { grn_obj *proc; int max_size; grn_obj *scorer; + grn_obj *scorer_args_expr; + unsigned int scorer_args_expr_offset; }; GRN_API grn_rc grn_obj_search(grn_ctx *ctx, grn_obj *obj, grn_obj *query, Modified: include/groonga/scorer.h (+7 -0) =================================================================== --- include/groonga/scorer.h 2015-02-18 18:42:09 +0900 (a50d0d8) +++ include/groonga/scorer.h 2015-02-18 22:17:18 +0900 (6f9c589) @@ -59,6 +59,13 @@ GRN_API unsigned int GRN_API int grn_scorer_matched_record_get_weight(grn_ctx *ctx, grn_scorer_matched_record *record); +GRN_API grn_obj * + grn_scorer_matched_record_get_arg(grn_ctx *ctx, + grn_scorer_matched_record *record, + unsigned int i); +GRN_API unsigned int + grn_scorer_matched_record_get_n_args(grn_ctx *ctx, + grn_scorer_matched_record *record); Modified: lib/expr.c (+34 -9) =================================================================== --- lib/expr.c 2015-02-18 18:42:09 +0900 (f417e26) +++ lib/expr.c 2015-02-18 22:17:18 +0900 (8f5b688) @@ -3754,6 +3754,8 
@@ struct _grn_scan_info { int max_interval; int similarity_threshold; grn_obj *scorer; + grn_obj *scorer_args_expr; + uint32_t scorer_args_expr_offset; }; #define SI_FREE(si) do {\ @@ -3778,6 +3780,8 @@ struct _grn_scan_info { (si)->similarity_threshold = DEFAULT_SIMILARITY_THRESHOLD;\ (si)->start = (st);\ (si)->scorer = NULL;\ + (si)->scorer_args_expr = NULL;\ + (si)->scorer_args_expr_offset = 0;\ } while (0) static scan_info ** @@ -4068,6 +4072,8 @@ grn_scan_info_open(grn_ctx *ctx, int start) si->similarity_threshold = DEFAULT_SIMILARITY_THRESHOLD; si->start = start; si->scorer = NULL; + si->scorer_args_expr = NULL; + si->scorer_args_expr_offset = 0; return si; } @@ -4181,6 +4187,30 @@ grn_scan_info_set_scorer(scan_info *si, grn_obj *scorer) si->scorer = scorer; } +grn_obj * +grn_scan_info_get_scorer_args_expr(scan_info *si) +{ + return si->scorer_args_expr; +} + +void +grn_scan_info_set_scorer_args_expr(scan_info *si, grn_obj *expr) +{ + si->scorer_args_expr = expr; +} + +uint32_t +grn_scan_info_get_scorer_args_expr_offset(scan_info *si) +{ + return si->scorer_args_expr_offset; +} + +void +grn_scan_info_set_scorer_args_expr_offset(scan_info *si, uint32_t offset) +{ + si->scorer_args_expr_offset = offset; +} + grn_bool grn_scan_info_push_arg(scan_info *si, grn_obj *arg) { @@ -4288,15 +4318,8 @@ scan_info_build_match_expr_codes(grn_ctx *ctx, scan_info *si, si->scorer = ec->value; i = scan_info_build_match_expr_codes(ctx, si, expr, i + 1); if (expr->codes[i].op != GRN_OP_CALL) { - grn_obj inspected; - GRN_TEXT_INIT(&inspected, 0); - grn_inspect(ctx, &inspected, si->scorer); - ERR(GRN_INVALID_ARGUMENT, - "scorer must have only one argument: <%.*s>", - (int)GRN_TEXT_LEN(&inspected), - GRN_TEXT_VALUE(&inspected)); - GRN_OBJ_FIN(ctx, &inspected); - return expr->codes_curr; + si->scorer_args_expr = (grn_obj *)expr; + si->scorer_args_expr_offset = i; } break; case GRN_TABLE_NO_KEY : @@ -5218,6 +5241,8 @@ grn_table_select_index(grn_ctx *ctx, grn_obj *table, scan_info 
*si, optarg.proc = NULL; optarg.max_size = 0; optarg.scorer = si->scorer; + optarg.scorer_args_expr = si->scorer_args_expr; + optarg.scorer_args_expr_offset = si->scorer_args_expr_offset; ctx->flags |= GRN_CTX_TEMPORARY_DISABLE_II_RESOLVE_SEL_AND; for (; j--; ip++, wp += 2) { uint32_t sid = (uint32_t) wp[0]; Modified: lib/grn_expr.h (+6 -0) =================================================================== --- lib/grn_expr.h 2015-02-18 18:42:09 +0900 (0a21f72) +++ lib/grn_expr.h 2015-02-18 22:17:18 +0900 (7bb134b) @@ -61,6 +61,12 @@ int grn_scan_info_get_similarity_threshold(scan_info *si); void grn_scan_info_set_similarity_threshold(scan_info *si, int similarity_threshold); grn_obj *grn_scan_info_get_scorer(scan_info *si); void grn_scan_info_set_scorer(scan_info *si, grn_obj *scorer); +grn_obj *grn_scan_info_get_scorer_args_expr(scan_info *si); +void grn_scan_info_set_scorer_args_expr(scan_info *si, + grn_obj *scorer_args_expr); +uint32_t grn_scan_info_get_scorer_args_expr_offset(scan_info *si); +void grn_scan_info_set_scorer_args_expr_offset(scan_info *si, + uint32_t offset); grn_bool grn_scan_info_push_arg(scan_info *si, grn_obj *arg); grn_obj *grn_scan_info_get_arg(grn_ctx *ctx, scan_info *si, int i); Modified: lib/grn_ii.h (+2 -0) =================================================================== --- lib/grn_ii.h 2015-02-18 18:42:09 +0900 (0d44079) +++ lib/grn_ii.h 2015-02-18 22:17:18 +0900 (fe9b1ec) @@ -152,6 +152,8 @@ struct _grn_select_optarg { void *func_arg; int max_size; grn_obj *scorer; + grn_obj *scorer_args_expr; + unsigned int scorer_args_expr_offset; }; GRN_API grn_rc grn_ii_column_update(grn_ctx *ctx, grn_ii *ii, grn_id id, Modified: lib/grn_scorer.h (+2 -0) =================================================================== --- lib/grn_scorer.h 2015-02-18 18:42:09 +0900 (898a833) +++ lib/grn_scorer.h 2015-02-18 22:17:18 +0900 (05f9821) @@ -39,6 +39,8 @@ struct _grn_scorer_matched_record { uint64_t n_candidates; uint32_t n_tokens; int weight; + 
grn_obj *args_expr; + unsigned int args_expr_offset; }; Modified: lib/ii.c (+4 -2) =================================================================== --- lib/ii.c 2015-02-18 18:42:09 +0900 (3b5478f) +++ lib/ii.c 2015-02-18 22:17:18 +0900 (7e38bb8) @@ -6161,7 +6161,6 @@ grn_ii_select(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_ grn_wv_mode wvm = grn_wv_none; grn_obj *lexicon = ii->lexicon; grn_scorer_score_func *score_func = NULL; - void *score_func_user_data = NULL; grn_scorer_matched_record record; if (!lexicon || !ii || !s) { return GRN_INVALID_ARGUMENT; } @@ -6240,7 +6239,6 @@ grn_ii_select(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_ if (optarg && optarg->scorer) { grn_proc *scorer = (grn_proc *)(optarg->scorer); score_func = scorer->callbacks.scorer.score; - score_func_user_data = scorer->user_data; record.table = grn_ctx_at(ctx, s->obj.header.domain); record.lexicon = lexicon; record.id = GRN_ID_NIL; @@ -6252,6 +6250,8 @@ grn_ii_select(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_ record.n_candidates = 0; record.n_tokens = 0; record.weight = 0; + record.args_expr = optarg->scorer_args_expr; + record.args_expr_offset = optarg->scorer_args_expr_offset; } for (;;) { @@ -6425,6 +6425,8 @@ grn_ii_sel(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_len arg.vector_size = optarg->vector_size; } arg.scorer = optarg->scorer; + arg.scorer_args_expr = optarg->scorer_args_expr; + arg.scorer_args_expr_offset = optarg->scorer_args_expr_offset; } /* todo : support subrec grn_rset_init(ctx, s, grn_rec_document, 0, grn_rec_none, 0, 0); Modified: lib/mrb/mrb_expr.c (+60 -0) =================================================================== --- lib/mrb/mrb_expr.c 2015-02-18 18:42:09 +0900 (cbe5ebb) +++ lib/mrb/mrb_expr.c 2015-02-18 22:17:18 +0900 (f726075) @@ -284,6 +284,56 @@ mrb_grn_scan_info_get_scorer(mrb_state *mrb, mrb_value self) } static mrb_value 
+mrb_grn_scan_info_get_scorer_args_expr(mrb_state *mrb, mrb_value self) +{ + scan_info *si; + grn_obj *scorer_args_expr; + + si = DATA_PTR(self); + scorer_args_expr = grn_scan_info_get_scorer_args_expr(si); + return grn_mrb_value_from_grn_obj(mrb, scorer_args_expr); +} + +static mrb_value +mrb_grn_scan_info_set_scorer_args_expr(mrb_state *mrb, mrb_value self) +{ + scan_info *si; + mrb_value mrb_scorer_args_expr; + + mrb_get_args(mrb, "o", &mrb_scorer_args_expr); + si = DATA_PTR(self); + if (mrb_nil_p(mrb_scorer_args_expr)) { + grn_scan_info_set_scorer_args_expr(si, NULL); + } else { + grn_scan_info_set_scorer_args_expr(si, DATA_PTR(mrb_scorer_args_expr)); + } + return self; +} + +static mrb_value +mrb_grn_scan_info_get_scorer_args_expr_offset(mrb_state *mrb, mrb_value self) +{ + scan_info *si; + uint32_t offset; + + si = DATA_PTR(self); + offset = grn_scan_info_get_scorer_args_expr_offset(si); + return mrb_fixnum_value(offset); +} + +static mrb_value +mrb_grn_scan_info_set_scorer_args_expr_offset(mrb_state *mrb, mrb_value self) +{ + scan_info *si; + mrb_int offset; + + mrb_get_args(mrb, "i", &offset); + si = DATA_PTR(self); + grn_scan_info_set_scorer_args_expr_offset(si, offset); + return self; +} + +static mrb_value mrb_grn_scan_info_get_arg(mrb_state *mrb, mrb_value self) { grn_ctx *ctx = (grn_ctx *)mrb->ud; @@ -678,6 +728,16 @@ grn_mrb_expr_init(grn_ctx *ctx) mrb_grn_scan_info_get_scorer, MRB_ARGS_NONE()); mrb_define_method(mrb, klass, "scorer=", mrb_grn_scan_info_set_scorer, MRB_ARGS_REQ(1)); + mrb_define_method(mrb, klass, "scorer_args_expr", + mrb_grn_scan_info_get_scorer_args_expr, MRB_ARGS_NONE()); + mrb_define_method(mrb, klass, "scorer_args_expr=", + mrb_grn_scan_info_set_scorer_args_expr, MRB_ARGS_REQ(1)); + mrb_define_method(mrb, klass, "scorer_args_expr_offset", + mrb_grn_scan_info_get_scorer_args_expr_offset, + MRB_ARGS_NONE()); + mrb_define_method(mrb, klass, "scorer_args_expr_offset=", + mrb_grn_scan_info_set_scorer_args_expr_offset, + 
MRB_ARGS_REQ(1)); mrb_define_method(mrb, klass, "get_arg", mrb_grn_scan_info_get_arg, MRB_ARGS_REQ(1)); mrb_define_method(mrb, klass, "push_arg", Modified: lib/mrb/scripts/scan_info.rb (+4 -0) =================================================================== --- lib/mrb/scripts/scan_info.rb 2015-02-18 18:42:09 +0900 (64262f8) +++ lib/mrb/scripts/scan_info.rb 2015-02-18 22:17:18 +0900 (7332cac) @@ -21,6 +21,10 @@ module Groonga end if data.scorer self.scorer = data.scorer + if data.scorer_args_expr + self.scorer_args_expr = data.scorer_args_expr + self.scorer_args_expr_offset = data.scorer_args_expr_offset + end end data.args.each do |arg| push_arg(arg) Modified: lib/mrb/scripts/scan_info_data.rb (+15 -3) =================================================================== --- lib/mrb/scripts/scan_info_data.rb 2015-02-18 18:42:09 +0900 (b0ebfea) +++ lib/mrb/scripts/scan_info_data.rb 2015-02-18 22:17:18 +0900 (f6cd317) @@ -11,6 +11,8 @@ module Groonga attr_accessor :max_interval attr_accessor :similarity_threshold attr_accessor :scorer + attr_accessor :scorer_args_expr + attr_accessor :scorer_args_expr_offset def initialize(start) @start = start @end = 0 @@ -23,6 +25,8 @@ module Groonga @max_interval = nil @similarity_threshold = nil @scorer = nil + @scorer_args_expr = nil + @scorer_args_expr_offset = nil end def match_resolve_index @@ -109,11 +113,11 @@ module Groonga n_codes = codes.size i = 0 while i < n_codes - i = match_resolve_index_expression_codes(codes, i, n_codes) + i = match_resolve_index_expression_codes(expression, codes, i, n_codes) end end - def match_resolve_index_expression_codes(codes, i, n_codes) + def match_resolve_index_expression_codes(expression, codes, i, n_codes) code = codes[i] value = code.value case value @@ -145,7 +149,15 @@ module Groonga message = "match target is required as an argument: <#{scorer.name}>" raise ErrorMessage, message end - i = match_resolve_index_expression_codes(codes, i + 1, n_codes) + i = 
match_resolve_index_expression_codes(expression, codes, i + 1, + n_codes) + unless codes[i].op == Operator::CALL + @scorer_args_expr = expression + @scorer_args_expr_offset = i + until codes[i].op == Operator::CALL + i += 1 + end + end when Table raise ErrorMessage, "invalid match target: <#{value.name}>" end Modified: lib/scorer.c (+55 -0) =================================================================== --- lib/scorer.c 2015-02-18 18:42:09 +0900 (f5bc043) +++ lib/scorer.c 2015-02-18 22:17:18 +0900 (2670bb3) @@ -100,6 +100,61 @@ grn_scorer_matched_record_get_weight(grn_ctx *ctx, return record->weight; } +grn_obj * +grn_scorer_matched_record_get_arg(grn_ctx *ctx, + grn_scorer_matched_record *record, + unsigned int i) +{ + grn_expr *expr; + grn_expr_code *codes_original; + uint32_t codes_curr_original; + grn_obj *arg; + + if (!record->args_expr) { + return NULL; + } + + expr = (grn_expr *)(record->args_expr); + /* TODO: support getting column value */ + codes_original = expr->codes; + codes_curr_original = expr->codes_curr; + expr->codes += record->args_expr_offset; + expr->codes_curr = 1; /* TODO: support 1 or more codes */ + arg = grn_expr_exec(ctx, (grn_obj *)expr, 0); + expr->codes_curr = codes_curr_original; + expr->codes = codes_original; + + return arg; +} + +unsigned int +grn_scorer_matched_record_get_n_args(grn_ctx *ctx, + grn_scorer_matched_record *record) +{ + grn_expr *expr; + grn_expr_code *codes; + unsigned int n_args = 0; + + if (!record->args_expr) { + return 0; + } + + expr = (grn_expr *)(record->args_expr); + codes = expr->codes + record->args_expr_offset; + if (codes[0].op == GRN_OP_CALL) { + return 0; + } + + n_args++; + for (; codes[0].op != GRN_OP_CALL; codes++) { + if (codes[0].op == GRN_OP_COMMA) { + n_args++; + } + } + + return n_args; +} + grn_rc grn_scorer_register(grn_ctx *ctx, const char *plugin_name_ptr, Modified: lib/scorers.c (+39 -0) =================================================================== --- lib/scorers.c 2015-02-18 
18:42:09 +0900 (b039e01) +++ lib/scorers.c 2015-02-18 22:17:18 +0900 (a80abcb) @@ -18,6 +18,8 @@ #include <groonga/scorer.h> +#include "grn_db.h" + #include <math.h> static double @@ -49,9 +51,46 @@ scorer_tf_idf(grn_ctx *ctx, grn_scorer_matched_record *record) } } +static double +scorer_tf_at_most(grn_ctx *ctx, grn_scorer_matched_record *record) +{ + double tf; + double max; + grn_obj *max_raw; + + tf = grn_scorer_matched_record_get_n_occurrences(ctx, record) + + grn_scorer_matched_record_get_total_term_weights(ctx, record); + max_raw = grn_scorer_matched_record_get_arg(ctx, record, 0); + + if (!max_raw) { + return tf; + } + + if (max_raw->header.type != GRN_BULK) { + return tf; + } + + if (max_raw->header.domain == GRN_DB_FLOAT) { + max = GRN_FLOAT_VALUE(max_raw); + } else { + grn_obj casted_max_raw; + GRN_FLOAT_INIT(&casted_max_raw, 0); + if (grn_obj_cast(ctx, &casted_max_raw, max_raw, GRN_FALSE) != GRN_SUCCESS) { + GRN_OBJ_FIN(ctx, &casted_max_raw); + return tf; + } else { + max = GRN_FLOAT_VALUE(&casted_max_raw); + } + GRN_OBJ_FIN(ctx, &casted_max_raw); + } + + return fmin(tf, max); +} + grn_rc grn_db_init_builtin_scorers(grn_ctx *ctx) { grn_scorer_register(ctx, "scorer_tf_idf", -1, scorer_tf_idf); + grn_scorer_register(ctx, "scorer_tf_at_most", -1, scorer_tf_at_most); return GRN_SUCCESS; } Added: test/command/suite/select/match_columns/scorer/tf_at_most.expected (+58 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/match_columns/scorer/tf_at_most.expected 2015-02-18 22:17:18 +0900 (d3c7195) @@ -0,0 +1,58 @@ +table_create Logs TABLE_NO_KEY +[[0,0.0,0.0],true] +column_create Logs message COLUMN_SCALAR Text +[[0,0.0,0.0],true] +table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer TokenBigram --normalizer NormalizerAuto +[[0,0.0,0.0],true] +column_create Terms index COLUMN_INDEX|WITH_POSITION Logs message +[[0,0.0,0.0],true] +load --table Logs +[ +["message"], +["Error"], +["Error 
Error"], +["Error Error Error"], +["Error Error Error Error"] +] +[[0,0.0,0.0],4] +select Logs --match_columns 'scorer_tf_at_most(message, 3.0)' --query 'error' --output_columns '_score, message' --sortby -_score +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 4 + ], + [ + [ + "_score", + "Int32" + ], + [ + "message", + "Text" + ] + ], + [ + 3, + "Error Error Error Error" + ], + [ + 3, + "Error Error Error" + ], + [ + 2, + "Error Error" + ], + [ + 1, + "Error" + ] + ] + ] +] Added: test/command/suite/select/match_columns/scorer/tf_at_most.test (+22 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/match_columns/scorer/tf_at_most.test 2015-02-18 22:17:18 +0900 (f9288b8) @@ -0,0 +1,22 @@ +table_create Logs TABLE_NO_KEY +column_create Logs message COLUMN_SCALAR Text + +table_create Terms TABLE_PAT_KEY ShortText \ + --default_tokenizer TokenBigram \ + --normalizer NormalizerAuto +column_create Terms index COLUMN_INDEX|WITH_POSITION Logs message + +load --table Logs +[ +["message"], +["Error"], +["Error Error"], +["Error Error Error"], +["Error Error Error Error"] +] + +select Logs \ + --match_columns 'scorer_tf_at_most(message, 3.0)' \ + --query 'error' \ + --output_columns '_score, message' \ + --sortby -_score -------------- next part -------------- HTMLの添付ファイルを保管しました...Download