[Groonga-commit] groonga/groonga at 116d2ef [master] scorer: support 1 argument

Back to archive index

Kouhei Sutou null+****@clear*****
Wed Feb 18 22:17:18 JST 2015


Kouhei Sutou	2015-02-18 22:17:18 +0900 (Wed, 18 Feb 2015)

  New Revision: 116d2ef1ac27a32f14e740f26e3b91be75e56963
  https://github.com/groonga/groonga/commit/116d2ef1ac27a32f14e740f26e3b91be75e56963

  Message:
    scorer: support 1 argument
    
    scorer_tf_at_most() scorer is also added. It uses 1 argument.

  Added files:
    test/command/suite/select/match_columns/scorer/tf_at_most.expected
    test/command/suite/select/match_columns/scorer/tf_at_most.test
  Modified files:
    include/groonga/groonga.h
    include/groonga/scorer.h
    lib/expr.c
    lib/grn_expr.h
    lib/grn_ii.h
    lib/grn_scorer.h
    lib/ii.c
    lib/mrb/mrb_expr.c
    lib/mrb/scripts/scan_info.rb
    lib/mrb/scripts/scan_info_data.rb
    lib/scorer.c
    lib/scorers.c

  Modified: include/groonga/groonga.h (+2 -0)
===================================================================
--- include/groonga/groonga.h    2015-02-18 18:42:09 +0900 (16bc424)
+++ include/groonga/groonga.h    2015-02-18 22:17:18 +0900 (48fb97e)
@@ -912,6 +912,8 @@ struct _grn_search_optarg {
   grn_obj *proc;
   int max_size;
   grn_obj *scorer;
+  grn_obj *scorer_args_expr;
+  unsigned int scorer_args_expr_offset;
 };
 
 GRN_API grn_rc grn_obj_search(grn_ctx *ctx, grn_obj *obj, grn_obj *query,

  Modified: include/groonga/scorer.h (+7 -0)
===================================================================
--- include/groonga/scorer.h    2015-02-18 18:42:09 +0900 (a50d0d8)
+++ include/groonga/scorer.h    2015-02-18 22:17:18 +0900 (6f9c589)
@@ -59,6 +59,13 @@ GRN_API unsigned int
 GRN_API int
   grn_scorer_matched_record_get_weight(grn_ctx *ctx,
                                        grn_scorer_matched_record *record);
+GRN_API grn_obj *
+  grn_scorer_matched_record_get_arg(grn_ctx *ctx,
+                                    grn_scorer_matched_record *record,
+                                    unsigned int i);
+GRN_API unsigned int
+  grn_scorer_matched_record_get_n_args(grn_ctx *ctx,
+                                       grn_scorer_matched_record *record);
 
 
 

  Modified: lib/expr.c (+34 -9)
===================================================================
--- lib/expr.c    2015-02-18 18:42:09 +0900 (f417e26)
+++ lib/expr.c    2015-02-18 22:17:18 +0900 (8f5b688)
@@ -3754,6 +3754,8 @@ struct _grn_scan_info {
   int max_interval;
   int similarity_threshold;
   grn_obj *scorer;
+  grn_obj *scorer_args_expr;
+  uint32_t scorer_args_expr_offset;
 };
 
 #define SI_FREE(si) do {\
@@ -3778,6 +3780,8 @@ struct _grn_scan_info {
   (si)->similarity_threshold = DEFAULT_SIMILARITY_THRESHOLD;\
   (si)->start = (st);\
   (si)->scorer = NULL;\
+  (si)->scorer_args_expr = NULL;\
+  (si)->scorer_args_expr_offset = 0;\
 } while (0)
 
 static scan_info **
@@ -4068,6 +4072,8 @@ grn_scan_info_open(grn_ctx *ctx, int start)
   si->similarity_threshold = DEFAULT_SIMILARITY_THRESHOLD;
   si->start = start;
   si->scorer = NULL;
+  si->scorer_args_expr = NULL;
+  si->scorer_args_expr_offset = 0;
 
   return si;
 }
@@ -4181,6 +4187,30 @@ grn_scan_info_set_scorer(scan_info *si, grn_obj *scorer)
   si->scorer = scorer;
 }
 
+grn_obj *
+grn_scan_info_get_scorer_args_expr(scan_info *si)
+{
+  return si->scorer_args_expr;
+}
+
+void
+grn_scan_info_set_scorer_args_expr(scan_info *si, grn_obj *expr)
+{
+  si->scorer_args_expr = expr;
+}
+
+uint32_t
+grn_scan_info_get_scorer_args_expr_offset(scan_info *si)
+{
+  return si->scorer_args_expr_offset;
+}
+
+void
+grn_scan_info_set_scorer_args_expr_offset(scan_info *si, uint32_t offset)
+{
+  si->scorer_args_expr_offset = offset;
+}
+
 grn_bool
 grn_scan_info_push_arg(scan_info *si, grn_obj *arg)
 {
@@ -4288,15 +4318,8 @@ scan_info_build_match_expr_codes(grn_ctx *ctx, scan_info *si,
     si->scorer = ec->value;
     i = scan_info_build_match_expr_codes(ctx, si, expr, i + 1);
     if (expr->codes[i].op != GRN_OP_CALL) {
-      grn_obj inspected;
-      GRN_TEXT_INIT(&inspected, 0);
-      grn_inspect(ctx, &inspected, si->scorer);
-      ERR(GRN_INVALID_ARGUMENT,
-          "scorer must have only one argument: <%.*s>",
-          (int)GRN_TEXT_LEN(&inspected),
-          GRN_TEXT_VALUE(&inspected));
-      GRN_OBJ_FIN(ctx, &inspected);
-      return expr->codes_curr;
+      si->scorer_args_expr = (grn_obj *)expr;
+      si->scorer_args_expr_offset = i;
     }
     break;
   case GRN_TABLE_NO_KEY :
@@ -5218,6 +5241,8 @@ grn_table_select_index(grn_ctx *ctx, grn_obj *table, scan_info *si,
         optarg.proc = NULL;
         optarg.max_size = 0;
         optarg.scorer = si->scorer;
+        optarg.scorer_args_expr = si->scorer_args_expr;
+        optarg.scorer_args_expr_offset = si->scorer_args_expr_offset;
         ctx->flags |= GRN_CTX_TEMPORARY_DISABLE_II_RESOLVE_SEL_AND;
         for (; j--; ip++, wp += 2) {
           uint32_t sid = (uint32_t) wp[0];

  Modified: lib/grn_expr.h (+6 -0)
===================================================================
--- lib/grn_expr.h    2015-02-18 18:42:09 +0900 (0a21f72)
+++ lib/grn_expr.h    2015-02-18 22:17:18 +0900 (7bb134b)
@@ -61,6 +61,12 @@ int grn_scan_info_get_similarity_threshold(scan_info *si);
 void grn_scan_info_set_similarity_threshold(scan_info *si, int similarity_threshold);
 grn_obj *grn_scan_info_get_scorer(scan_info *si);
 void grn_scan_info_set_scorer(scan_info *si, grn_obj *scorer);
+grn_obj *grn_scan_info_get_scorer_args_expr(scan_info *si);
+void grn_scan_info_set_scorer_args_expr(scan_info *si,
+                                        grn_obj *scorer_args_expr);
+uint32_t grn_scan_info_get_scorer_args_expr_offset(scan_info *si);
+void grn_scan_info_set_scorer_args_expr_offset(scan_info *si,
+                                               uint32_t offset);
 grn_bool grn_scan_info_push_arg(scan_info *si, grn_obj *arg);
 grn_obj *grn_scan_info_get_arg(grn_ctx *ctx, scan_info *si, int i);
 

  Modified: lib/grn_ii.h (+2 -0)
===================================================================
--- lib/grn_ii.h    2015-02-18 18:42:09 +0900 (0d44079)
+++ lib/grn_ii.h    2015-02-18 22:17:18 +0900 (fe9b1ec)
@@ -152,6 +152,8 @@ struct _grn_select_optarg {
   void *func_arg;
   int max_size;
   grn_obj *scorer;
+  grn_obj *scorer_args_expr;
+  unsigned int scorer_args_expr_offset;
 };
 
 GRN_API grn_rc grn_ii_column_update(grn_ctx *ctx, grn_ii *ii, grn_id id,

  Modified: lib/grn_scorer.h (+2 -0)
===================================================================
--- lib/grn_scorer.h    2015-02-18 18:42:09 +0900 (898a833)
+++ lib/grn_scorer.h    2015-02-18 22:17:18 +0900 (05f9821)
@@ -39,6 +39,8 @@ struct _grn_scorer_matched_record {
   uint64_t n_candidates;
   uint32_t n_tokens;
   int weight;
+  grn_obj *args_expr;
+  unsigned int args_expr_offset;
 };
 
 

  Modified: lib/ii.c (+4 -2)
===================================================================
--- lib/ii.c    2015-02-18 18:42:09 +0900 (3b5478f)
+++ lib/ii.c    2015-02-18 22:17:18 +0900 (7e38bb8)
@@ -6161,7 +6161,6 @@ grn_ii_select(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_
   grn_wv_mode wvm = grn_wv_none;
   grn_obj *lexicon = ii->lexicon;
   grn_scorer_score_func *score_func = NULL;
-  void *score_func_user_data = NULL;
   grn_scorer_matched_record record;
 
   if (!lexicon || !ii || !s) { return GRN_INVALID_ARGUMENT; }
@@ -6240,7 +6239,6 @@ grn_ii_select(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_
   if (optarg && optarg->scorer) {
     grn_proc *scorer = (grn_proc *)(optarg->scorer);
     score_func = scorer->callbacks.scorer.score;
-    score_func_user_data = scorer->user_data;
     record.table = grn_ctx_at(ctx, s->obj.header.domain);
     record.lexicon = lexicon;
     record.id = GRN_ID_NIL;
@@ -6252,6 +6250,8 @@ grn_ii_select(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_
     record.n_candidates = 0;
     record.n_tokens = 0;
     record.weight = 0;
+    record.args_expr = optarg->scorer_args_expr;
+    record.args_expr_offset = optarg->scorer_args_expr_offset;
   }
 
   for (;;) {
@@ -6425,6 +6425,8 @@ grn_ii_sel(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_len
         arg.vector_size = optarg->vector_size;
       }
       arg.scorer = optarg->scorer;
+      arg.scorer_args_expr = optarg->scorer_args_expr;
+      arg.scorer_args_expr_offset = optarg->scorer_args_expr_offset;
     }
     /* todo : support subrec
     grn_rset_init(ctx, s, grn_rec_document, 0, grn_rec_none, 0, 0);

  Modified: lib/mrb/mrb_expr.c (+60 -0)
===================================================================
--- lib/mrb/mrb_expr.c    2015-02-18 18:42:09 +0900 (cbe5ebb)
+++ lib/mrb/mrb_expr.c    2015-02-18 22:17:18 +0900 (f726075)
@@ -284,6 +284,56 @@ mrb_grn_scan_info_get_scorer(mrb_state *mrb, mrb_value self)
 }
 
 static mrb_value
+mrb_grn_scan_info_get_scorer_args_expr(mrb_state *mrb, mrb_value self)
+{
+  scan_info *si;
+  grn_obj *scorer_args_expr;
+
+  si = DATA_PTR(self);
+  scorer_args_expr = grn_scan_info_get_scorer_args_expr(si);
+  return grn_mrb_value_from_grn_obj(mrb, scorer_args_expr);
+}
+
+static mrb_value
+mrb_grn_scan_info_set_scorer_args_expr(mrb_state *mrb, mrb_value self)
+{
+  scan_info *si;
+  mrb_value mrb_scorer_args_expr;
+
+  mrb_get_args(mrb, "o", &mrb_scorer_args_expr);
+  si = DATA_PTR(self);
+  if (mrb_nil_p(mrb_scorer_args_expr)) {
+    grn_scan_info_set_scorer_args_expr(si, NULL);
+  } else {
+    grn_scan_info_set_scorer_args_expr(si, DATA_PTR(mrb_scorer_args_expr));
+  }
+  return self;
+}
+
+static mrb_value
+mrb_grn_scan_info_get_scorer_args_expr_offset(mrb_state *mrb, mrb_value self)
+{
+  scan_info *si;
+  uint32_t offset;
+
+  si = DATA_PTR(self);
+  offset = grn_scan_info_get_scorer_args_expr_offset(si);
+  return mrb_fixnum_value(offset);
+}
+
+static mrb_value
+mrb_grn_scan_info_set_scorer_args_expr_offset(mrb_state *mrb, mrb_value self)
+{
+  scan_info *si;
+  mrb_int offset;
+
+  mrb_get_args(mrb, "i", &offset);
+  si = DATA_PTR(self);
+  grn_scan_info_set_scorer_args_expr_offset(si, offset);
+  return self;
+}
+
+static mrb_value
 mrb_grn_scan_info_get_arg(mrb_state *mrb, mrb_value self)
 {
   grn_ctx *ctx = (grn_ctx *)mrb->ud;
@@ -678,6 +728,16 @@ grn_mrb_expr_init(grn_ctx *ctx)
                     mrb_grn_scan_info_get_scorer, MRB_ARGS_NONE());
   mrb_define_method(mrb, klass, "scorer=",
                     mrb_grn_scan_info_set_scorer, MRB_ARGS_REQ(1));
+  mrb_define_method(mrb, klass, "scorer_args_expr",
+                    mrb_grn_scan_info_get_scorer_args_expr, MRB_ARGS_NONE());
+  mrb_define_method(mrb, klass, "scorer_args_expr=",
+                    mrb_grn_scan_info_set_scorer_args_expr, MRB_ARGS_REQ(1));
+  mrb_define_method(mrb, klass, "scorer_args_expr_offset",
+                    mrb_grn_scan_info_get_scorer_args_expr_offset,
+                    MRB_ARGS_NONE());
+  mrb_define_method(mrb, klass, "scorer_args_expr_offset=",
+                    mrb_grn_scan_info_set_scorer_args_expr_offset,
+                    MRB_ARGS_REQ(1));
   mrb_define_method(mrb, klass, "get_arg",
                     mrb_grn_scan_info_get_arg, MRB_ARGS_REQ(1));
   mrb_define_method(mrb, klass, "push_arg",

  Modified: lib/mrb/scripts/scan_info.rb (+4 -0)
===================================================================
--- lib/mrb/scripts/scan_info.rb    2015-02-18 18:42:09 +0900 (64262f8)
+++ lib/mrb/scripts/scan_info.rb    2015-02-18 22:17:18 +0900 (7332cac)
@@ -21,6 +21,10 @@ module Groonga
       end
       if data.scorer
         self.scorer = data.scorer
+        if data.scorer_args_expr
+          self.scorer_args_expr = data.scorer_args_expr
+          self.scorer_args_expr_offset = data.scorer_args_expr_offset
+        end
       end
       data.args.each do |arg|
         push_arg(arg)

  Modified: lib/mrb/scripts/scan_info_data.rb (+15 -3)
===================================================================
--- lib/mrb/scripts/scan_info_data.rb    2015-02-18 18:42:09 +0900 (b0ebfea)
+++ lib/mrb/scripts/scan_info_data.rb    2015-02-18 22:17:18 +0900 (f6cd317)
@@ -11,6 +11,8 @@ module Groonga
     attr_accessor :max_interval
     attr_accessor :similarity_threshold
     attr_accessor :scorer
+    attr_accessor :scorer_args_expr
+    attr_accessor :scorer_args_expr_offset
     def initialize(start)
       @start = start
       @end = 0
@@ -23,6 +25,8 @@ module Groonga
       @max_interval = nil
       @similarity_threshold = nil
       @scorer = nil
+      @scorer_args_expr = nil
+      @scorer_args_expr_offset = nil
     end
 
     def match_resolve_index
@@ -109,11 +113,11 @@ module Groonga
       n_codes = codes.size
       i = 0
       while i < n_codes
-        i = match_resolve_index_expression_codes(codes, i, n_codes)
+        i = match_resolve_index_expression_codes(expression, codes, i, n_codes)
       end
     end
 
-    def match_resolve_index_expression_codes(codes, i, n_codes)
+    def match_resolve_index_expression_codes(expression, codes, i, n_codes)
       code = codes[i]
       value = code.value
       case value
@@ -145,7 +149,15 @@ module Groonga
           message = "match target is required as an argument: <#{scorer.name}>"
           raise ErrorMessage, message
         end
-        i = match_resolve_index_expression_codes(codes, i + 1, n_codes)
+        i = match_resolve_index_expression_codes(expression, codes, i + 1,
+                                                 n_codes)
+        unless codes[i].op == Operator::CALL
+          @scorer_args_expr = expression
+          @scorer_args_expr_offset = i
+          until codes[i].op == Operator::CALL
+            i += 1
+          end
+        end
       when Table
         raise ErrorMessage, "invalid match target: <#{value.name}>"
       end

  Modified: lib/scorer.c (+55 -0)
===================================================================
--- lib/scorer.c    2015-02-18 18:42:09 +0900 (f5bc043)
+++ lib/scorer.c    2015-02-18 22:17:18 +0900 (2670bb3)
@@ -100,6 +100,61 @@ grn_scorer_matched_record_get_weight(grn_ctx *ctx,
   return record->weight;
 }
 
+grn_obj *
+grn_scorer_matched_record_get_arg(grn_ctx *ctx,
+                                  grn_scorer_matched_record *record,
+                                  unsigned int i)
+{
+  grn_expr *expr;
+  grn_expr_code *codes_original;
+  uint32_t codes_curr_original;
+  grn_obj *arg;
+
+  if (!record->args_expr) {
+    return NULL;
+  }
+
+  expr = (grn_expr *)(record->args_expr);
+  /* TODO: support getting column value */
+  codes_original = expr->codes;
+  codes_curr_original = expr->codes_curr;
+  expr->codes += record->args_expr_offset;
+  expr->codes_curr = 1; /* TODO: support 1 or more codes */
+  arg = grn_expr_exec(ctx, (grn_obj *)expr, 0);
+  expr->codes_curr = codes_curr_original;
+  expr->codes = codes_original;
+
+  return arg;
+}
+
+unsigned int
+grn_scorer_matched_record_get_n_args(grn_ctx *ctx,
+                                     grn_scorer_matched_record *record)
+{
+  grn_expr *expr;
+  grn_expr_code *codes;
+  unsigned int n_args = 0;
+
+  if (!record->args_expr) {
+    return 0;
+  }
+
+  expr = (grn_expr *)(record->args_expr);
+  codes = expr->codes + record->args_expr_offset;
+  if (codes[0].op == GRN_OP_CALL) {
+    return 0;
+  }
+
+  n_args++;
+  for (; codes[0].op != GRN_OP_CALL; codes++) {
+    if (codes[0].op == GRN_OP_COMMA) {
+      n_args++;
+    }
+  }
+
+  return n_args;
+}
+
 grn_rc
 grn_scorer_register(grn_ctx *ctx,
                     const char *plugin_name_ptr,

  Modified: lib/scorers.c (+39 -0)
===================================================================
--- lib/scorers.c    2015-02-18 18:42:09 +0900 (b039e01)
+++ lib/scorers.c    2015-02-18 22:17:18 +0900 (a80abcb)
@@ -18,6 +18,8 @@
 
 #include <groonga/scorer.h>
 
+#include "grn_db.h"
+
 #include <math.h>
 
 static double
@@ -49,9 +51,46 @@ scorer_tf_idf(grn_ctx *ctx, grn_scorer_matched_record *record)
   }
 }
 
+static double
+scorer_tf_at_most(grn_ctx *ctx, grn_scorer_matched_record *record)
+{
+  double tf;
+  double max;
+  grn_obj *max_raw;
+
+  tf = grn_scorer_matched_record_get_n_occurrences(ctx, record) +
+    grn_scorer_matched_record_get_total_term_weights(ctx, record);
+  max_raw = grn_scorer_matched_record_get_arg(ctx, record, 0);
+
+  if (!max_raw) {
+    return tf;
+  }
+
+  if (max_raw->header.type != GRN_BULK) {
+    return tf;
+  }
+
+  if (max_raw->header.domain == GRN_DB_FLOAT) {
+    max = GRN_FLOAT_VALUE(max_raw);
+  } else {
+    grn_obj casted_max_raw;
+    GRN_FLOAT_INIT(&casted_max_raw, 0);
+    if (grn_obj_cast(ctx, &casted_max_raw, max_raw, GRN_FALSE) != GRN_SUCCESS) {
+      GRN_OBJ_FIN(ctx, &casted_max_raw);
+      return tf;
+    } else {
+      max = GRN_FLOAT_VALUE(&casted_max_raw);
+    }
+    GRN_OBJ_FIN(ctx, &casted_max_raw);
+  }
+
+  return fmin(tf, max);
+}
+
 grn_rc
 grn_db_init_builtin_scorers(grn_ctx *ctx)
 {
   grn_scorer_register(ctx, "scorer_tf_idf", -1, scorer_tf_idf);
+  grn_scorer_register(ctx, "scorer_tf_at_most", -1, scorer_tf_at_most);
   return GRN_SUCCESS;
 }

  Added: test/command/suite/select/match_columns/scorer/tf_at_most.expected (+58 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/match_columns/scorer/tf_at_most.expected    2015-02-18 22:17:18 +0900 (d3c7195)
@@ -0,0 +1,58 @@
+table_create Logs TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Logs message COLUMN_SCALAR Text
+[[0,0.0,0.0],true]
+table_create Terms TABLE_PAT_KEY ShortText   --default_tokenizer TokenBigram   --normalizer NormalizerAuto
+[[0,0.0,0.0],true]
+column_create Terms index COLUMN_INDEX|WITH_POSITION Logs message
+[[0,0.0,0.0],true]
+load --table Logs
+[
+["message"],
+["Error"],
+["Error Error"],
+["Error Error Error"],
+["Error Error Error Error"]
+]
+[[0,0.0,0.0],4]
+select Logs   --match_columns 'scorer_tf_at_most(message, 3.0)'   --query 'error'   --output_columns '_score, message'   --sortby -_score
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        4
+      ],
+      [
+        [
+          "_score",
+          "Int32"
+        ],
+        [
+          "message",
+          "Text"
+        ]
+      ],
+      [
+        3,
+        "Error Error Error Error"
+      ],
+      [
+        3,
+        "Error Error Error"
+      ],
+      [
+        2,
+        "Error Error"
+      ],
+      [
+        1,
+        "Error"
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/select/match_columns/scorer/tf_at_most.test (+22 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/match_columns/scorer/tf_at_most.test    2015-02-18 22:17:18 +0900 (f9288b8)
@@ -0,0 +1,22 @@
+table_create Logs TABLE_NO_KEY
+column_create Logs message COLUMN_SCALAR Text
+
+table_create Terms TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenBigram \
+  --normalizer NormalizerAuto
+column_create Terms index COLUMN_INDEX|WITH_POSITION Logs message
+
+load --table Logs
+[
+["message"],
+["Error"],
+["Error Error"],
+["Error Error Error"],
+["Error Error Error Error"]
+]
+
+select Logs \
+  --match_columns 'scorer_tf_at_most(message, 3.0)' \
+  --query 'error' \
+  --output_columns '_score, message' \
+  --sortby -_score
-------------- next part --------------
HTMLの添付ファイルを保管しました...
Download 



More information about the Groonga-commit mailing list
Back to archive index