[Groonga-commit] groonga/groonga [master] [suggest] split --threshold.

Back to archive index

null+****@clear***** null+****@clear*****
2011年 8月 18日 (木) 17:14:04 JST


Kouhei Sutou	2011-08-18 08:14:04 +0000 (Thu, 18 Aug 2011)

  New Revision: b76928425ed3342104097c9a72e6419fbdf85bf1

  Log:
    [suggest] split --threshold.
    
    New two --frequency_threshold and --conditional_probability_threshold
    paramters are added.

  Modified files:
    plugins/suggest/suggest.c
    test/unit/story/test-rurema.c

  Modified: plugins/suggest/suggest.c (+85 -40)
===================================================================
--- plugins/suggest/suggest.c    2011-08-17 07:54:12 +0000 (c9946bc)
+++ plugins/suggest/suggest.c    2011-08-18 08:14:04 +0000 (b7da465)
@@ -65,7 +65,8 @@ grn_parse_suggest_types(const char *nptr, const char *end)
 
 static int32_t
 cooccurrence_search(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost, grn_id id,
-                    grn_obj *res, int query_type, int threshold)
+                    grn_obj *res, int query_type, int frequency_threshold,
+                    double conditional_probability_threshold)
 {
   int32_t max_score = 0;
   if (id) {
@@ -73,6 +74,7 @@ cooccurrence_search(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost, grn_id i
     grn_obj *co = grn_obj_column(ctx, items, CONST_STR_LEN("co"));
     grn_obj *pairs = grn_ctx_at(ctx, grn_obj_get_range(ctx, co));
     grn_obj *items_freq = grn_obj_column(ctx, items, CONST_STR_LEN("freq"));
+    grn_obj *items_freq2 = grn_obj_column(ctx, items, CONST_STR_LEN("freq2"));
     grn_obj *pairs_freq, *pairs_post = grn_obj_column(ctx, pairs, CONST_STR_LEN("post"));
     switch (query_type) {
     case COMPLETE :
@@ -90,42 +92,59 @@ cooccurrence_search(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost, grn_id i
     if ((c = grn_ii_cursor_open(ctx, (grn_ii *)co, id, GRN_ID_NIL, GRN_ID_MAX,
                                 ((grn_ii *)co)->n_elements - 1, 0))) {
       grn_ii_posting *p;
-      grn_obj post, pair_freq, item_freq, item_boost;
+      grn_obj post, pair_freq, item_freq, item_freq2, item_boost;
       GRN_RECORD_INIT(&post, 0, grn_obj_id(ctx, items));
       GRN_INT32_INIT(&pair_freq, 0);
       GRN_INT32_INIT(&item_freq, 0);
+      GRN_INT32_INIT(&item_freq2, 0);
       GRN_INT32_INIT(&item_boost, 0);
       while ((p = grn_ii_cursor_next(ctx, c))) {
         grn_id post_id;
-        int pfreq, ifreq, boost;
+        int pfreq, ifreq, ifreq2, boost;
+        double conditional_probability;
         GRN_BULK_REWIND(&post);
         GRN_BULK_REWIND(&pair_freq);
         GRN_BULK_REWIND(&item_freq);
+        GRN_BULK_REWIND(&item_freq2);
         GRN_BULK_REWIND(&item_boost);
         grn_obj_get_value(ctx, pairs_post, p->rid, &post);
         grn_obj_get_value(ctx, pairs_freq, p->rid, &pair_freq);
         post_id = GRN_RECORD_VALUE(&post);
         grn_obj_get_value(ctx, items_freq, post_id, &item_freq);
+        grn_obj_get_value(ctx, items_freq2, post_id, &item_freq2);
         grn_obj_get_value(ctx, items_boost, post_id, &item_boost);
         pfreq = GRN_INT32_VALUE(&pair_freq);
         ifreq = GRN_INT32_VALUE(&item_freq);
+        ifreq2 = GRN_INT32_VALUE(&item_freq2);
+        if (ifreq2 > 0) {
+          conditional_probability = (double)pfreq / (double)ifreq2;
+        } else {
+          conditional_probability = 0.0;
+        }
         boost = GRN_INT32_VALUE(&item_boost);
-        if (pfreq >= threshold && ifreq >= threshold && boost >= 0) {
+        if (pfreq >= frequency_threshold && ifreq >= frequency_threshold &&
+            conditional_probability >= conditional_probability_threshold &&
+            boost >= 0) {
           grn_rset_recinfo *ri;
           void *value;
           int32_t score = pfreq;
-          if (max_score < score) { max_score = score; }
+          int added;
+          if (max_score < score + boost) { max_score = score + boost; }
           /* put any formula if desired */
           if (grn_hash_add(ctx, (grn_hash *)res,
-                           &post_id, sizeof(grn_id), &value, NULL)) {
+                           &post_id, sizeof(grn_id), &value, &added)) {
             ri = value;
             ri->score += score;
+            if (added) {
+              ri->score += boost;
+            }
           }
         }
       }
       GRN_OBJ_FIN(ctx, &post);
       GRN_OBJ_FIN(ctx, &pair_freq);
       GRN_OBJ_FIN(ctx, &item_freq);
+      GRN_OBJ_FIN(ctx, &item_freq2);
       GRN_OBJ_FIN(ctx, &item_boost);
       grn_ii_cursor_close(ctx, c);
     }
@@ -136,7 +155,8 @@ cooccurrence_search(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost, grn_id i
 #define DEFAULT_LIMIT           10
 #define DEFAULT_SORTBY          "-_score"
 #define DEFAULT_OUTPUT_COLUMNS  "_key,_score"
-#define DEFAULT_THRESHOLD       100
+#define DEFAULT_FREQUENCY_THRESHOLD 100
+#define DEFAULT_CONDITIONAL_PROBABILITY_THRESHOLD 0.5
 
 static void
 output(grn_ctx *ctx, grn_obj *table, grn_obj *res, grn_id tid,
@@ -178,7 +198,7 @@ output(grn_ctx *ctx, grn_obj *table, grn_obj *res, grn_id tid,
 }
 
 static inline void
-complete_add_item(grn_ctx *ctx, grn_id id, grn_obj *res, int threshold,
+complete_add_item(grn_ctx *ctx, grn_id id, grn_obj *res, int frequency_threshold,
                   grn_obj *items_freq, grn_obj *items_boost,
                   grn_obj *item_freq, grn_obj *item_boost)
 {
@@ -191,7 +211,7 @@ complete_add_item(grn_ctx *ctx, grn_id id, grn_obj *res, int threshold,
     score = 1 +
             GRN_INT32_VALUE(item_freq) +
             GRN_INT32_VALUE(item_boost);
-    if (score >= threshold) {
+    if (score >= frequency_threshold) {
       void *value;
       if (grn_hash_add(ctx, (grn_hash *)res, &id, sizeof(grn_id),
                        &value, NULL)) {
@@ -207,7 +227,7 @@ static void
 complete(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost, grn_obj *col,
          grn_obj *query, grn_obj *sortby,
          grn_obj *output_columns, int offset, int limit,
-         int threshold,
+         int frequency_threshold, double conditional_probability_threshold,
          grn_suggest_prefix_search_mode prefix_search_mode)
 {
   grn_obj *res;
@@ -236,7 +256,7 @@ complete(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost, grn_obj *col,
                                              GRN_ID_NIL, GRN_ID_MAX, 1, 0))) {
                 grn_ii_posting *p;
                 while ((p = grn_ii_cursor_next(ctx, icur))) {
-                  complete_add_item(ctx, p->rid, res, threshold,
+                  complete_add_item(ctx, p->rid, res, frequency_threshold,
                                     items_freq, items_boost,
                                     &item_freq, &item_boost);
                 }
@@ -252,7 +272,9 @@ complete(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost, grn_obj *col,
         }
         grn_str_close(ctx, norm);
       }
-      cooccurrence_search(ctx, items, items_boost, tid, res, COMPLETE, threshold);
+      cooccurrence_search(ctx, items, items_boost, tid, res, COMPLETE,
+                          frequency_threshold,
+                          conditional_probability_threshold);
       if (((prefix_search_mode == GRN_SUGGEST_PREFIX_SEARCH_YES) ||
            (prefix_search_mode == GRN_SUGGEST_PREFIX_SEARCH_AUTO &&
             !grn_table_size(ctx, res))) &&
@@ -260,7 +282,7 @@ complete(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost, grn_obj *col,
                                        NULL, 0, 0, -1, GRN_CURSOR_PREFIX))) {
         grn_id id;
         while ((id = grn_table_cursor_next(ctx, cur))) {
-          complete_add_item(ctx, id, res, threshold,
+          complete_add_item(ctx, id, res, frequency_threshold,
                             items_freq, items_boost, &item_freq, &item_boost);
         }
         grn_table_cursor_close(ctx, cur);
@@ -279,7 +301,7 @@ static void
 correct(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost,
         grn_obj *query, grn_obj *sortby,
         grn_obj *output_columns, int offset, int limit,
-        int threshold)
+        int frequency_threshold, double conditional_probability_threshold)
 {
   grn_obj *res;
   grn_obj *items_freq2 = grn_obj_column(ctx, items, CONST_STR_LEN("freq2"));
@@ -289,9 +311,12 @@ correct(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost,
   if ((res = grn_table_create(ctx, NULL, 0, NULL,
                               GRN_TABLE_HASH_KEY|GRN_OBJ_WITH_SUBREC, items, NULL))) {
     grn_id tid = grn_table_get(ctx, items, TEXT_VALUE_LEN(query));
-    int32_t max_score = cooccurrence_search(ctx, items, items_boost, tid, res, CORRECT, threshold);
+    int32_t max_score;
+    max_score = cooccurrence_search(ctx, items, items_boost, tid, res, CORRECT,
+                                    frequency_threshold,
+                                    conditional_probability_threshold);
     LAP(":", "cooccur(%d)", max_score);
-    if (GRN_TEXT_LEN(query) && max_score < threshold) {
+    if (GRN_TEXT_LEN(query) && max_score < frequency_threshold) {
       grn_obj *key, *index;
       if ((key = grn_obj_column(ctx, items, CONST_STR_LEN("_key")))) {
         if (grn_column_index(ctx, key, GRN_OP_MATCH, &index, 1, NULL)) {
@@ -325,9 +350,9 @@ correct(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost,
                             GRN_INT32_VALUE(&item_boost);
                     ri = value;
                     ri->score += score;
-                    if (score >= threshold) { continue; }
+                    if (score >= frequency_threshold) { continue; }
                   }
-                  /* score < threshold || item_boost < 0 */
+                  /* score < frequency_threshold || item_boost < 0 */
                   grn_hash_cursor_delete(ctx, hc, NULL);
                 }
               }
@@ -366,7 +391,7 @@ correct(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost,
                   GRN_BULK_REWIND(&score_value);
                   grn_obj_get_value(ctx, score, GRN_RECORD_VALUE(var),
                                     &score_value);
-                  if (GRN_INT32_VALUE(&score_value) < threshold) {
+                  if (GRN_INT32_VALUE(&score_value) < frequency_threshold) {
                     grn_table_cursor_delete(ctx, tc);
                   }
                 }
@@ -398,13 +423,14 @@ static void
 suggest(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost,
         grn_obj *query, grn_obj *sortby,
         grn_obj *output_columns, int offset, int limit,
-        int threshold)
+        int frequency_threshold, double conditional_probability_threshold)
 {
   grn_obj *res;
   if ((res = grn_table_create(ctx, NULL, 0, NULL,
                               GRN_TABLE_HASH_KEY|GRN_OBJ_WITH_SUBREC, items, NULL))) {
     grn_id tid = grn_table_get(ctx, items, TEXT_VALUE_LEN(query));
-    cooccurrence_search(ctx, items, items_boost, tid, res, SUGGEST, threshold);
+    cooccurrence_search(ctx, items, items_boost, tid, res, SUGGEST,
+                        frequency_threshold, conditional_probability_threshold);
     output(ctx, items, res, tid, sortby, output_columns, offset, limit);
     grn_obj_close(ctx, res);
   } else {
@@ -416,27 +442,41 @@ static grn_obj *
 command_suggest(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 {
   grn_obj *items, *col, *items_boost;
-  int types = grn_parse_suggest_types(GRN_TEXT_VALUE(VAR(0)), GRN_BULK_CURR(VAR(0)));
-  int offset = GRN_TEXT_LEN(VAR(6))
-    ? grn_atoi(GRN_TEXT_VALUE(VAR(6)), GRN_BULK_CURR(VAR(6)), NULL)
-    : 0;
-  int limit = GRN_TEXT_LEN(VAR(7))
-    ? grn_atoi(GRN_TEXT_VALUE(VAR(7)), GRN_BULK_CURR(VAR(7)), NULL)
-    : DEFAULT_LIMIT;
-  int threshold = GRN_TEXT_LEN(VAR(8))
-    ? grn_atoi(GRN_TEXT_VALUE(VAR(8)), GRN_BULK_CURR(VAR(8)), NULL)
-    : DEFAULT_THRESHOLD;
-  int prefix_search_len = GRN_TEXT_LEN(VAR(9));
+  int types;
+  int offset = 0;
+  int limit = DEFAULT_LIMIT;
+  int frequency_threshold = DEFAULT_FREQUENCY_THRESHOLD;
+  int prefix_search_len;
+  double conditional_probability_threshold =
+    DEFAULT_CONDITIONAL_PROBABILITY_THRESHOLD;
   grn_suggest_prefix_search_mode prefix_search_mode;
+
+  types = grn_parse_suggest_types(GRN_TEXT_VALUE(VAR(0)), GRN_BULK_CURR(VAR(0)));
+  if (GRN_TEXT_LEN(VAR(6)) > 0) {
+    offset = grn_atoi(GRN_TEXT_VALUE(VAR(6)), GRN_BULK_CURR(VAR(6)), NULL);
+  }
+  if (GRN_TEXT_LEN(VAR(7)) > 0) {
+    limit = grn_atoi(GRN_TEXT_VALUE(VAR(7)), GRN_BULK_CURR(VAR(7)), NULL);
+  }
+  if (GRN_TEXT_LEN(VAR(8)) > 0) {
+    frequency_threshold = grn_atoi(GRN_TEXT_VALUE(VAR(8)), GRN_BULK_CURR(VAR(8)), NULL);
+  }
+  if (GRN_TEXT_LEN(VAR(9)) > 0) {
+    GRN_TEXT_PUTC(ctx, VAR(9), '\0');
+    conditional_probability_threshold = strtod(GRN_TEXT_VALUE(VAR(9)), NULL);
+  }
+
+  prefix_search_len = GRN_TEXT_LEN(VAR(10));
   if (prefix_search_len == 3 &&
-      strncasecmp("yes", GRN_TEXT_VALUE(VAR(9)), 3) == 0) {
+      strncasecmp("yes", GRN_TEXT_VALUE(VAR(10)), 3) == 0) {
     prefix_search_mode = GRN_SUGGEST_PREFIX_SEARCH_YES;
   } else if (prefix_search_len == 2 &&
-             strncasecmp("no", GRN_TEXT_VALUE(VAR(9)), 2) == 0) {
+             strncasecmp("no", GRN_TEXT_VALUE(VAR(10)), 2) == 0) {
     prefix_search_mode = GRN_SUGGEST_PREFIX_SEARCH_NO;
   } else {
     prefix_search_mode = GRN_SUGGEST_PREFIX_SEARCH_AUTO;
   }
+
   if ((items = grn_ctx_get(ctx, TEXT_VALUE_LEN(VAR(1))))) {
     if ((items_boost = grn_obj_column(ctx, items, CONST_STR_LEN("boost")))) {
       GRN_OUTPUT_MAP_OPEN("RESULT_SET", -1);
@@ -444,7 +484,9 @@ command_suggest(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_dat
         if ((col = grn_obj_column(ctx, items, TEXT_VALUE_LEN(VAR(2))))) {
           GRN_OUTPUT_CSTR("complete");
           complete(ctx, items, items_boost, col, VAR(3), VAR(4),
-                   VAR(5), offset, limit, threshold, prefix_search_mode);
+                   VAR(5), offset, limit,
+                   frequency_threshold, conditional_probability_threshold,
+                   prefix_search_mode);
         } else {
           ERR(GRN_INVALID_ARGUMENT, "invalid column.");
         }
@@ -452,12 +494,14 @@ command_suggest(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_dat
       if (types & CORRECT) {
         GRN_OUTPUT_CSTR("correct");
         correct(ctx, items, items_boost, VAR(3), VAR(4),
-                VAR(5), offset, limit, threshold);
+                VAR(5), offset, limit,
+                frequency_threshold, conditional_probability_threshold);
       }
       if (types & SUGGEST) {
         GRN_OUTPUT_CSTR("suggest");
         suggest(ctx, items, items_boost, VAR(3), VAR(4),
-                VAR(5), offset, limit, threshold);
+                VAR(5), offset, limit,
+                frequency_threshold, conditional_probability_threshold);
       }
       GRN_OUTPUT_MAP_CLOSE();
     } else {
@@ -585,7 +629,6 @@ GRN_PLUGIN_INIT(grn_ctx *ctx)
 grn_rc
 GRN_PLUGIN_REGISTER(grn_ctx *ctx)
 {
-  /* TODO: offset/limit */
   grn_expr_var vars[] = {
     {CONST_STR_LEN("types")},
     {CONST_STR_LEN("table")},
@@ -595,7 +638,8 @@ GRN_PLUGIN_REGISTER(grn_ctx *ctx)
     {CONST_STR_LEN("output_columns")},
     {CONST_STR_LEN("offset")},
     {CONST_STR_LEN("limit")},
-    {CONST_STR_LEN("threshold")},
+    {CONST_STR_LEN("frequency_threshold")},
+    {CONST_STR_LEN("conditional_probability_threshold")},
     {CONST_STR_LEN("prefix_search")}
   };
   GRN_TEXT_INIT(&vars[0].value, 0);
@@ -608,8 +652,9 @@ GRN_PLUGIN_REGISTER(grn_ctx *ctx)
   GRN_TEXT_INIT(&vars[7].value, 0);
   GRN_TEXT_INIT(&vars[8].value, 0);
   GRN_TEXT_INIT(&vars[9].value, 0);
+  GRN_TEXT_INIT(&vars[10].value, 0);
   grn_proc_create(ctx, CONST_STR_LEN("suggest"), GRN_PROC_COMMAND,
-                  command_suggest, NULL, NULL, 10, vars);
+                  command_suggest, NULL, NULL, 11, vars);
 
   grn_proc_create(ctx, CONST_STR_LEN("suggest_preparer"), GRN_PROC_FUNCTION,
                   func_suggest_preparer, NULL, NULL, 0, NULL);

  Modified: test/unit/story/test-rurema.c (+12 -11)
===================================================================
--- test/unit/story/test-rurema.c    2011-08-17 07:54:12 +0000 (2b472fa)
+++ test/unit/story/test-rurema.c    2011-08-18 08:14:04 +0000 (d5c9a18)
@@ -134,7 +134,7 @@ test_complete_prefix_rk_search(gconstpointer data)
         "--table item_rurema "
         "--column kana "
         "--types complete "
-        "--threshold 1 "
+        "--frequency_threshold 1 "
         "--query '%s'",
         gcut_data_get_string(data, "query"))));
 }
@@ -153,7 +153,7 @@ test_complete_prefix_rk_search_threshold_found(void)
       "--table item_rurema "
       "--column kana "
       "--types complete "
-      "--threshold 101 "
+      "--frequency_threshold 101 "
       "--query 'hen'"));
 }
 
@@ -170,7 +170,7 @@ test_complete_prefix_rk_search_threshold_not_fuond(void)
       "--table item_rurema "
       "--column kana "
       "--types complete "
-      "--threshold 102 "
+      "--frequency_threshold 102 "
       "--query 'hen'"));
 }
 
@@ -188,7 +188,8 @@ test_complete_coocurrence(void)
         "--table item_rurema "
         "--column kana "
         "--types complete "
-        "--threshold 1 "
+        "--frequency_threshold 1 "
+        "--conditional_probability_threshold 0.1 "
         "--query 'stりん'"));
 }
 
@@ -207,7 +208,7 @@ test_complete_prefix_search_force(void)
       "--column kana "
       "--types complete "
       "--prefix_search yes "
-      "--threshold 1 "
+      "--frequency_threshold 1 "
       "--query '置'"));
 }
 
@@ -225,7 +226,7 @@ test_complete_prefix_search_disable(void)
       "--column kana "
       "--types complete "
       "--prefix_search no "
-      "--threshold 1 "
+      "--frequency_threshold 1 "
       "--query '置'"));
 }
 
@@ -243,7 +244,7 @@ test_complete_prefix_search_threshold_found(void)
       "--table item_rurema "
       "--column kana "
       "--types complete "
-      "--threshold 101 "
+      "--frequency_threshold 101 "
       "--query '変'"));
 }
 
@@ -260,7 +261,7 @@ test_complete_prefix_search_threshold_not_fuond(void)
       "--table item_rurema "
       "--column kana "
       "--types complete "
-      "--threshold 102 "
+      "--frequency_threshold 102 "
       "--query '変'"));
 }
 
@@ -278,7 +279,7 @@ test_correct_coocurrence(void)
         "--table item_rurema "
         "--column kana "
         "--types correct "
-        "--threshold 1 "
+        "--frequency_threshold 1 "
         "--query 'avg'"));
 }
 
@@ -296,7 +297,7 @@ test_correct_similar(void)
         "--table item_rurema "
         "--column kana "
         "--types correct "
-        "--threshold 1 "
+        "--frequency_threshold 1 "
         "--query 'kernel'"));
 }
 
@@ -314,6 +315,6 @@ test_suggest_coocurrence(void)
         "--table item_rurema "
         "--column kana "
         "--types suggest "
-        "--threshold 1 "
+        "--frequency_threshold 1 "
         "--query 'CSV'"));
 }




Groonga-commit メーリングリストの案内
Back to archive index