[Groonga-commit] groonga/groonga at 0892bf6 [master] Add custom score function mechanism

Back to archive index

Kouhei Sutou null+****@clear*****
Mon Feb 16 16:58:51 JST 2015


Kouhei Sutou	2015-02-16 16:58:51 +0900 (Mon, 16 Feb 2015)

  New Revision: 0892bf630af04cc9f76d2a6576690adbacd6875b
  https://github.com/groonga/groonga/commit/0892bf630af04cc9f76d2a6576690adbacd6875b

  Message:
    Add custom score function mechanism
    
    TODO:
    
      * Support `score_func(target) * weight` syntax in `match_columns`

  Added files:
    include/groonga/scorer.h
    lib/grn_scorer.h
    lib/grn_scorers.h
    lib/scorer.c
    lib/scorers.c
  Modified files:
    include/groonga/groonga.h
    lib/db.c
    lib/expr.c
    lib/grn_db.h
    lib/grn_ii.h
    lib/ii.c
    lib/sources.am
    lib/util.c

  Modified: include/groonga/groonga.h (+4 -1)
===================================================================
--- include/groonga/groonga.h    2015-02-16 14:24:55 +0900 (6a03747)
+++ include/groonga/groonga.h    2015-02-16 16:58:51 +0900 (0107185)
@@ -121,6 +121,7 @@ typedef enum {
   GRN_TOKEN_FILTER_ERROR = -73,
   GRN_COMMAND_ERROR = -74,
   GRN_PLUGIN_ERROR = -75,
+  GRN_SCORER_ERROR = -76
 } grn_rc;
 
 GRN_API grn_rc grn_init(void);
@@ -488,7 +489,8 @@ typedef enum {
   GRN_PROC_FUNCTION,
   GRN_PROC_HOOK,
   GRN_PROC_NORMALIZER,
-  GRN_PROC_TOKEN_FILTER
+  GRN_PROC_TOKEN_FILTER,
+  GRN_PROC_SCORER
 } grn_proc_type;
 
 GRN_API grn_obj *grn_proc_create(grn_ctx *ctx,
@@ -911,6 +913,7 @@ struct _grn_search_optarg {
   int vector_size;
   grn_obj *proc;
   int max_size;
+  grn_obj *scorer;
 };
 
 GRN_API grn_rc grn_obj_search(grn_ctx *ctx, grn_obj *obj, grn_obj *query,

  Added: include/groonga/scorer.h (+88 -0) 100644
===================================================================
--- /dev/null
+++ include/groonga/scorer.h    2015-02-16 16:58:51 +0900 (a50d0d8)
@@ -0,0 +1,88 @@
+/* -*- c-basic-offset: 2 -*- */
+/*
+  Copyright(C) 2015 Brazil
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License version 2.1 as published by the Free Software Foundation.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+#ifndef GROONGA_SCORER_H
+#define GROONGA_SCORER_H
+
+#include <groonga/plugin.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif  /* __cplusplus */
+
+typedef struct _grn_scorer_matched_record grn_scorer_matched_record;
+
+GRN_API grn_obj *
+  grn_scorer_matched_record_get_table(grn_ctx *ctx,
+                                      grn_scorer_matched_record *record);
+GRN_API grn_obj *
+  grn_scorer_matched_record_get_lexicon(grn_ctx *ctx,
+                                        grn_scorer_matched_record *record);
+GRN_API grn_id
+  grn_scorer_matched_record_get_id(grn_ctx *ctx,
+                                   grn_scorer_matched_record *record);
+GRN_API grn_obj *
+  grn_scorer_matched_record_get_terms(grn_ctx *ctx,
+                                      grn_scorer_matched_record *record);
+GRN_API grn_obj *
+  grn_scorer_matched_record_get_term_weights(grn_ctx *ctx,
+                                             grn_scorer_matched_record *record);
+GRN_API unsigned int
+  grn_scorer_matched_record_get_total_term_weights(grn_ctx *ctx,
+                                                   grn_scorer_matched_record *record);
+GRN_API long long unsigned int
+  grn_scorer_matched_record_get_n_documents(grn_ctx *ctx,
+                                            grn_scorer_matched_record *record);
+GRN_API unsigned int
+  grn_scorer_matched_record_get_n_occurrences(grn_ctx *ctx,
+                                              grn_scorer_matched_record *record);
+GRN_API long long unsigned int
+  grn_scorer_matched_record_get_n_candidates(grn_ctx *ctx,
+                                             grn_scorer_matched_record *record);
+GRN_API unsigned int
+  grn_scorer_matched_record_get_n_tokens(grn_ctx *ctx,
+                                         grn_scorer_matched_record *record);
+GRN_API int
+  grn_scorer_matched_record_get_weight(grn_ctx *ctx,
+                                       grn_scorer_matched_record *record);
+
+
+
+typedef double grn_scorer_score_func(grn_ctx *ctx,
+                                     grn_scorer_matched_record *record);
+
+/*
+  grn_scorer_register() registers a scorer to the database which is
+  associated with `ctx'. `plugin_name_ptr' and `plugin_name_length' specify the
+  scorer name. Alphabetic letters ('A'-'Z' and 'a'-'z'), digits ('0'-'9') and
+  an underscore ('_') are the allowed characters.
+
+  `score' is called once for each matched record to compute its score.
+
+  grn_scorer_register() returns GRN_SUCCESS on success, or an error
+  code on failure.
+ */
+GRN_PLUGIN_EXPORT grn_rc grn_scorer_register(grn_ctx *ctx,
+                                             const char *plugin_name_ptr,
+                                             int plugin_name_length,
+                                             grn_scorer_score_func *score);
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif  /* __cplusplus */
+
+#endif  /* GROONGA_SCORER_H */

  Modified: lib/db.c (+3 -0)
===================================================================
--- lib/db.c    2015-02-16 14:24:55 +0900 (625ab41)
+++ lib/db.c    2015-02-16 16:58:51 +0900 (5395c24)
@@ -26,6 +26,7 @@
 #include "grn_proc.h"
 #include "grn_plugin.h"
 #include "grn_geo.h"
+#include "grn_scorers.h"
 #include "grn_snip.h"
 #include "grn_string.h"
 #include "grn_normalizer.h"
@@ -260,6 +261,7 @@ grn_db_open(grn_ctx *ctx, const char *path)
 #endif
           grn_db_init_builtin_tokenizers(ctx);
           grn_db_init_builtin_normalizers(ctx);
+          grn_db_init_builtin_scorers(ctx);
           grn_db_init_builtin_query(ctx);
           GRN_API_RETURN((grn_obj *)s);
         }
@@ -10447,6 +10449,7 @@ grn_db_init_builtin_types(grn_ctx *ctx)
 #endif
   grn_db_init_builtin_tokenizers(ctx);
   grn_db_init_builtin_normalizers(ctx);
+  grn_db_init_builtin_scorers(ctx);
   for (id = grn_db_curr_id(ctx, db) + 1; id < 128; id++) {
     grn_itoh(id, buf + 3, 2);
     grn_obj_register(ctx, db, buf, 5);

  Modified: lib/expr.c (+16 -0)
===================================================================
--- lib/expr.c    2015-02-16 14:24:55 +0900 (12c24f9)
+++ lib/expr.c    2015-02-16 16:58:51 +0900 (4809653)
@@ -3766,6 +3766,7 @@ struct _grn_scan_info {
   grn_obj *args[GRN_SCAN_INFO_MAX_N_ARGS];
   int max_interval;
   int similarity_threshold;
+  grn_obj *scorer;
 };
 
 #define SI_FREE(si) do {\
@@ -3789,6 +3790,7 @@ struct _grn_scan_info {
   (si)->max_interval = DEFAULT_MAX_INTERVAL;\
   (si)->similarity_threshold = DEFAULT_SIMILARITY_THRESHOLD;\
   (si)->start = (st);\
+  (si)->scorer = NULL;\
 } while (0)
 
 static scan_info **
@@ -4078,6 +4080,7 @@ grn_scan_info_open(grn_ctx *ctx, int start)
   si->max_interval = DEFAULT_MAX_INTERVAL;
   si->similarity_threshold = DEFAULT_SIMILARITY_THRESHOLD;
   si->start = start;
+  si->scorer = NULL;
 
   return si;
 }
@@ -4199,6 +4202,18 @@ grn_scan_info_get_arg(grn_ctx *ctx, scan_info *si, int i)
   return si->args[i];
 }
 
+grn_obj *
+grn_scan_info_get_scorer(grn_ctx *ctx, scan_info *si)
+{
+  return si->scorer; /* NULL unless grn_scan_info_set_scorer() stored one */
+}
+
+void
+grn_scan_info_set_scorer(grn_ctx *ctx, scan_info *si, grn_obj *scorer)
+{
+  si->scorer = scorer; /* pointer is stored as is; read back as optarg.scorer */
+}
+
 static uint32_t
 scan_info_build_find_index_column_index(grn_ctx *ctx,
                                         scan_info *si,
@@ -5168,6 +5183,7 @@ grn_table_select_index(grn_ctx *ctx, grn_obj *table, scan_info *si,
         optarg.vector_size = 1;
         optarg.proc = NULL;
         optarg.max_size = 0;
+        optarg.scorer = si->scorer;
         ctx->flags |= GRN_CTX_TEMPORARY_DISABLE_II_RESOLVE_SEL_AND;
         for (; j--; ip++, wp += 2) {
           uint32_t sid = (uint32_t) wp[0];

  Modified: lib/grn_db.h (+4 -0)
===================================================================
--- lib/grn_db.h    2015-02-16 14:24:55 +0900 (8213c99)
+++ lib/grn_db.h    2015-02-16 16:58:51 +0900 (0eeab14)
@@ -24,6 +24,7 @@
 
 #include <groonga/command.h>
 #include <groonga/token_filter.h>
+#include <groonga/scorer.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -166,6 +167,9 @@ struct _grn_proc {
       grn_token_filter_filter_func *filter;
       grn_token_filter_fin_func    *fin;
     } token_filter;
+    struct {
+      grn_scorer_score_func *score;
+    } scorer;
   } callbacks;
 
   void *user_data;

  Modified: lib/grn_ii.h (+1 -0)
===================================================================
--- lib/grn_ii.h    2015-02-16 14:24:55 +0900 (416d69f)
+++ lib/grn_ii.h    2015-02-16 16:58:51 +0900 (0d44079)
@@ -151,6 +151,7 @@ struct _grn_select_optarg {
   int (*func)(grn_ctx *, grn_hash *, const void *, int, void *);
   void *func_arg;
   int max_size;
+  grn_obj *scorer;
 };
 
 GRN_API grn_rc grn_ii_column_update(grn_ctx *ctx, grn_ii *ii, grn_id id,

  Added: lib/grn_scorer.h (+49 -0) 100644
===================================================================
--- /dev/null
+++ lib/grn_scorer.h    2015-02-16 16:58:51 +0900 (898a833)
@@ -0,0 +1,49 @@
+/* -*- c-basic-offset: 2 -*- */
+/*
+  Copyright(C) 2015 Brazil
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License version 2.1 as published by the Free Software Foundation.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+#ifndef GRN_SCORER_H
+#define GRN_SCORER_H
+
+#include "grn_ctx.h"
+#include "grn_db.h"
+
+#include <groonga/scorer.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct _grn_scorer_matched_record { /* input of a grn_scorer_score_func */
+  grn_obj *table;              /* grn_ctx_at() of the result set's domain */
+  grn_obj *lexicon;            /* lexicon of the searched index */
+  grn_id id;                   /* record being scored; GRN_ID_NIL before */
+  grn_obj terms;               /* record vector, refilled per candidate */
+  grn_obj term_weights;        /* UInt32 vector, filled alongside `terms' */
+  uint32_t total_term_weights; /* `tscore' computed by grn_ii_select() */
+  uint64_t n_documents;        /* grn_table_size() of `table' */
+  uint32_t n_occurrences;      /* `noccur' computed by grn_ii_select() */
+  uint64_t n_candidates;       /* token `size' values, summed over tokens */
+  uint32_t n_tokens;           /* token `ntoken' values, summed over tokens */
+  int weight;                  /* factor applied to the returned score */
+};
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* GRN_SCORER_H */

  Added: lib/grn_scorers.h (+33 -0) 100644
===================================================================
--- /dev/null
+++ lib/grn_scorers.h    2015-02-16 16:58:51 +0900 (ed6c18c)
@@ -0,0 +1,33 @@
+/* -*- c-basic-offset: 2 -*- */
+/*
+  Copyright(C) 2015 Brazil
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License version 2.1 as published by the Free Software Foundation.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+#ifndef GRN_SCORERS_H
+#define GRN_SCORERS_H
+
+#include "grn_ctx.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+grn_rc grn_db_init_builtin_scorers(grn_ctx *ctx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* GRN_SCORERS_H */

  Modified: lib/ii.c (+68 -2)
===================================================================
--- lib/ii.c    2015-02-16 14:24:55 +0900 (427523d)
+++ lib/ii.c    2015-02-16 16:58:51 +0900 (3b5478f)
@@ -26,6 +26,7 @@
 #include "grn_pat.h"
 #include "grn_db.h"
 #include "grn_output.h"
+#include "grn_scorer.h"
 #include "grn_util.h"
 
 #ifdef GRN_WITH_ONIGMO
@@ -6159,6 +6160,10 @@ grn_ii_select(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_
   grn_operator mode = GRN_OP_EXACT;
   grn_wv_mode wvm = grn_wv_none;
   grn_obj *lexicon = ii->lexicon;
+  grn_scorer_score_func *score_func = NULL;
+  void *score_func_user_data = NULL;
+  grn_scorer_matched_record record;
+
   if (!lexicon || !ii || !s) { return GRN_INVALID_ARGUMENT; }
   if (optarg) {
     mode = optarg->mode;
@@ -6231,6 +6236,24 @@ grn_ii_select(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_
     goto exit;
   }
 #endif
+
+  if (optarg && optarg->scorer) {
+    grn_proc *scorer = (grn_proc *)(optarg->scorer);
+    score_func = scorer->callbacks.scorer.score;
+    score_func_user_data = scorer->user_data;
+    record.table = grn_ctx_at(ctx, s->obj.header.domain);
+    record.lexicon = lexicon;
+    record.id = GRN_ID_NIL;
+    GRN_RECORD_INIT(&(record.terms), GRN_OBJ_VECTOR, lexicon->header.domain);
+    GRN_UINT32_INIT(&(record.term_weights), GRN_OBJ_VECTOR);
+    record.total_term_weights = 0;
+    record.n_documents = grn_table_size(ctx, record.table);
+    record.n_occurrences = 0;
+    record.n_candidates = 0;
+    record.n_tokens = 0;
+    record.weight = 0;
+  }
+
   for (;;) {
     rid = (*tis)->p->rid;
     sid = (*tis)->p->sid;
@@ -6249,6 +6272,13 @@ grn_ii_select(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_
       if (orp || grn_hash_get(ctx, s, &pi, s->key_size, NULL)) {
         int count = 0, noccur = 0, pos = 0, score = 0, tscore = 0, min, max;
 
+        if (score_func) {
+          GRN_BULK_REWIND(&(record.terms));
+          GRN_BULK_REWIND(&(record.term_weights));
+          record.n_candidates = 0;
+          record.n_tokens = 0;
+        }
+
 #define SKIP_OR_BREAK(pos) {\
   if (token_info_skip_pos(ctx, ti, rid, sid, pos)) { break; }    \
   if (ti->p->rid != rid || ti->p->sid != sid) { \
@@ -6260,6 +6290,13 @@ grn_ii_select(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_
         if (n == 1 && !rep) {
           noccur = (*tis)->p->tf;
           tscore = (*tis)->p->weight;
+          if (score_func) {
+            GRN_RECORD_PUT(ctx, &(record.terms), (*tis)->cursors->bins[0]->id);
+            GRN_UINT32_PUT(ctx, &(record.term_weights), tscore);
+            record.n_occurrences = noccur;
+            record.n_candidates = (*tis)->size;
+            record.n_tokens = (*tis)->ntoken;
+          }
         } else if (mode == GRN_OP_NEAR) {
           bt_zap(bt);
           for (tip = tis; tip < tie; tip++) {
@@ -6296,6 +6333,18 @@ grn_ii_select(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_
               score += ti->p->weight; count++;
             } else {
               score = ti->p->weight; count = 1; pos = ti->pos;
+              if (noccur == 0 && score_func) {
+                GRN_BULK_REWIND(&(record.terms));
+                GRN_BULK_REWIND(&(record.term_weights));
+                record.n_candidates = 0;
+                record.n_tokens = 0;
+              }
+            }
+            if (noccur == 0 && score_func) {
+              GRN_RECORD_PUT(ctx, &(record.terms), ti->cursors->bins[0]->id);
+              GRN_UINT32_PUT(ctx, &(record.term_weights), ti->p->weight);
+              record.n_candidates += ti->size;
+              record.n_tokens += ti->ntoken;
             }
             if (count == n) {
               if (rep) { pi.pos = pos; res_add(ctx, s, &pi, (score + 1) * weight, op); }
@@ -6305,13 +6354,29 @@ grn_ii_select(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_
             }
           }
         }
-        if (noccur && !rep) { res_add(ctx, s, &pi, (noccur + tscore) * weight, op); }
+        if (noccur && !rep) {
+          double record_score;
+          if (score_func) {
+            record.id = rid;
+            record.weight = weight;
+            record.n_occurrences = noccur;
+            record.total_term_weights = tscore;
+            record_score = score_func(ctx, &record) * weight;
+          } else {
+            record_score = (noccur + tscore) * weight;
+          }
+          res_add(ctx, s, &pi, record_score, op);
+        }
 #undef SKIP_OR_BREAK
       }
     }
     if (token_info_skip(ctx, *tis, nrid, nsid)) { goto exit; }
   }
 exit :
+  if (score_func) {
+    GRN_OBJ_FIN(ctx, &(record.terms));
+    GRN_OBJ_FIN(ctx, &(record.term_weights));
+  }
   for (tip = tis; tip < tis + n; tip++) {
     if (*tip) { token_info_close(ctx, *tip); }
   }
@@ -6339,7 +6404,7 @@ grn_ii_sel(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_len
   ERRCLR(ctx);
   GRN_LOG(ctx, GRN_LOG_INFO, "grn_ii_sel > (%.*s)", string_len, string);
   {
-    grn_select_optarg arg = {GRN_OP_EXACT, 0, 0, NULL, 0, NULL, NULL, 0};
+    grn_select_optarg arg = {GRN_OP_EXACT, 0, 0, NULL, 0, NULL, NULL, 0, NULL};
     if (!s) { return GRN_INVALID_ARGUMENT; }
     if (optarg) {
       switch (optarg->mode) {
@@ -6359,6 +6424,7 @@ grn_ii_sel(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_len
         arg.weight_vector = optarg->weight_vector;
         arg.vector_size = optarg->vector_size;
       }
+      arg.scorer = optarg->scorer;
     }
     /* todo : support subrec
     grn_rset_init(ctx, s, grn_rec_document, 0, grn_rec_none, 0, 0);

  Added: lib/scorer.c (+134 -0) 100644
===================================================================
--- /dev/null
+++ lib/scorer.c    2015-02-16 16:58:51 +0900 (1b4159a)
@@ -0,0 +1,134 @@
+/* -*- c-basic-offset: 2 -*- */
+/*
+  Copyright(C) 2015 Brazil
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License version 2.1 as published by the Free Software Foundation.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+#include <string.h>
+
+#include "grn.h"
+#include "grn_db.h"
+#include "grn_scorer.h"
+#include <groonga/scorer.h>
+
+grn_obj *
+grn_scorer_matched_record_get_table(grn_ctx *ctx,
+                                    grn_scorer_matched_record *record)
+{
+  return record->table; /* grn_ii_select(): object of the result set's domain */
+}
+
+grn_obj *
+grn_scorer_matched_record_get_lexicon(grn_ctx *ctx,
+                                      grn_scorer_matched_record *record)
+{
+  return record->lexicon; /* grn_ii_select(): the index's lexicon */
+}
+
+grn_id
+grn_scorer_matched_record_get_id(grn_ctx *ctx,
+                                 grn_scorer_matched_record *record)
+{
+  return record->id; /* record id under scoring; initialized to GRN_ID_NIL */
+}
+
+grn_obj *
+grn_scorer_matched_record_get_terms(grn_ctx *ctx,
+                                    grn_scorer_matched_record *record)
+{
+  return &(record->terms); /* points into `record'; a record vector */
+}
+
+grn_obj *
+grn_scorer_matched_record_get_term_weights(grn_ctx *ctx,
+                                           grn_scorer_matched_record *record)
+{
+  return &(record->term_weights); /* points into `record'; a UInt32 vector */
+}
+
+unsigned int
+grn_scorer_matched_record_get_total_term_weights(grn_ctx *ctx,
+                                                 grn_scorer_matched_record *record)
+{
+  return record->total_term_weights; /* `tscore' of grn_ii_select() */
+}
+
+long long unsigned int
+grn_scorer_matched_record_get_n_documents(grn_ctx *ctx,
+                                          grn_scorer_matched_record *record)
+{
+  return record->n_documents; /* grn_table_size() of the record's table */
+}
+
+unsigned int
+grn_scorer_matched_record_get_n_occurrences(grn_ctx *ctx,
+                                            grn_scorer_matched_record *record)
+{
+  return record->n_occurrences; /* `noccur' of grn_ii_select() */
+}
+
+long long unsigned int
+grn_scorer_matched_record_get_n_candidates(grn_ctx *ctx,
+                                           grn_scorer_matched_record *record)
+{
+  return record->n_candidates; /* token `size' values, summed over tokens */
+}
+
+unsigned int
+grn_scorer_matched_record_get_n_tokens(grn_ctx *ctx,
+                                       grn_scorer_matched_record *record)
+{
+  return record->n_tokens; /* token `ntoken' values, summed over tokens */
+}
+
+int
+ grn_scorer_matched_record_get_weight(grn_ctx *ctx,
+                                      grn_scorer_matched_record *record)
+{
+  return record->weight; /* grn_ii_select() multiplies the score by it */
+}
+
+/* Creates a GRN_PROC_SCORER proc named `plugin_name_ptr' whose callback is
+ * `score'. A negative `plugin_name_length' means the name is NUL-terminated.
+ * Returns GRN_SUCCESS, or the error code recorded in `ctx' on failure. */
+grn_rc
+grn_scorer_register(grn_ctx *ctx,
+                    const char *plugin_name_ptr,
+                    int plugin_name_length,
+                    grn_scorer_score_func *score)
+{
+  grn_obj *scorer_object;
+
+  if (plugin_name_ptr == NULL) {
+    /* strlen() and "%.*s" must never see a NULL name. */
+    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "[scorer] name is missing");
+    return ctx->rc;
+  }
+  if (plugin_name_length < 0) {
+    plugin_name_length = (int)strlen(plugin_name_ptr);
+  }
+
+  scorer_object = grn_proc_create(ctx, plugin_name_ptr, plugin_name_length,
+                                  GRN_PROC_SCORER, NULL, NULL, NULL, 0, NULL);
+  if (scorer_object == NULL) {
+    GRN_PLUGIN_ERROR(ctx, GRN_SCORER_ERROR,
+                     "[scorer][%.*s] failed to grn_proc_create()",
+                     plugin_name_length, plugin_name_ptr);
+    return ctx->rc;
+  }
+  ((grn_proc *)scorer_object)->callbacks.scorer.score = score;
+  return GRN_SUCCESS;
+}
+

  Added: lib/scorers.c (+57 -0) 100644
===================================================================
--- /dev/null
+++ lib/scorers.c    2015-02-16 16:58:51 +0900 (b039e01)
@@ -0,0 +1,57 @@
+/* -*- c-basic-offset: 2 -*- */
+/*
+  Copyright(C) 2015 Brazil
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License version 2.1 as published by the Free Software Foundation.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+#include <groonga/scorer.h>
+
+#include <math.h>
+
+/* Built-in TF-IDF scorer; the result is never less than 1.0. */
+static double
+scorer_tf_idf(grn_ctx *ctx, grn_scorer_matched_record *record)
+{
+  const double min_score = 1.0;
+  double tf;
+  double idf;
+  double n_all_documents;
+  double n_candidates;
+  double n_tokens;
+  double n_estimated_match_documents;
+
+  tf = grn_scorer_matched_record_get_n_occurrences(ctx, record) +
+    grn_scorer_matched_record_get_total_term_weights(ctx, record);
+  n_all_documents = grn_scorer_matched_record_get_n_documents(ctx, record);
+  n_candidates = grn_scorer_matched_record_get_n_candidates(ctx, record);
+  n_tokens = grn_scorer_matched_record_get_n_tokens(ctx, record);
+  /* Avoid 0/0 (NaN) and log(x/0) (infinite idf) when a counter is zero. */
+  if (n_candidates <= 0.0 || n_tokens <= 0.0) {
+    return min_score;
+  }
+  n_estimated_match_documents = n_candidates / n_tokens;
+  if (n_estimated_match_documents >= n_all_documents) {
+    return min_score;
+  }
+  idf = log(n_all_documents / n_estimated_match_documents);
+  return fmax(tf * idf, min_score);
+}
+
+/* Registers the built-in scorers and reports the registration result. */
+grn_rc
+grn_db_init_builtin_scorers(grn_ctx *ctx)
+{
+  return grn_scorer_register(ctx, "scorer_tf_idf", -1, scorer_tf_idf);
+}

  Modified: lib/sources.am (+4 -0)
===================================================================
--- lib/sources.am    2015-02-16 14:24:55 +0900 (185b64b)
+++ lib/sources.am    2015-02-16 16:58:51 +0900 (6dd1127)
@@ -42,6 +42,10 @@ libgroonga_la_SOURCES =				\
 	grn_request_canceler.h			\
 	rset.c					\
 	grn_rset.h				\
+	scorer.c				\
+	grn_scorer.h				\
+	scorers.c				\
+	grn_scorers.h				\
 	snip.c					\
 	grn_snip.h				\
 	store.c					\

  Modified: lib/util.c (+4 -1)
===================================================================
--- lib/util.c    2015-02-16 14:24:55 +0900 (6606476)
+++ lib/util.c    2015-02-16 16:58:51 +0900 (dd703b5)
@@ -1,5 +1,5 @@
 /* -*- c-basic-offset: 2 -*- */
-/* Copyright(C) 2010-2014 Brazil
+/* Copyright(C) 2010-2015 Brazil
 
   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -254,6 +254,9 @@ grn_proc_inspect(grn_ctx *ctx, grn_obj *buf, grn_obj *obj)
   case GRN_PROC_TOKEN_FILTER :
     GRN_TEXT_PUTS(ctx, buf, "token-filter");
     break;
+  case GRN_PROC_SCORER :
+    GRN_TEXT_PUTS(ctx, buf, "scorer");
+    break;
   }
   GRN_TEXT_PUTS(ctx, buf, " ");
 
-------------- next part --------------
HTMLの添付ファイルを保管しました...
Download 



More information about the Groonga-commit mailing list
Back to archive index