[Groonga-commit] groonga/groonga at 750f449 [master] grn_obj_search: support GRN_OP_FUZZY for index_column

Back to archive index

naoa null+****@clear*****
Tue Feb 2 14:23:44 JST 2016


naoa	2016-02-02 14:23:44 +0900 (Tue, 02 Feb 2016)

  New Revision: 750f449f4aba6249c07da776c08a3690e2c82252
  https://github.com/groonga/groonga/commit/750f449f4aba6249c07da776c08a3690e2c82252

  Merged e4e194e: Merge pull request #463 from naoa/op-fuzzy

  Message:
    grn_obj_search: support GRN_OP_FUZZY for index_column

  Modified files:
    lib/db.c
    lib/grn_ii.h
    lib/ii.c

  Modified: lib/db.c (+3 -0)
===================================================================
--- lib/db.c    2016-02-03 06:30:45 +0900 (4c145be)
+++ lib/db.c    2016-02-02 14:23:44 +0900 (9ea11db)
@@ -3365,6 +3365,9 @@ grn_obj_search_column_index_by_key(grn_ctx *ctx, grn_obj *obj,
         case GRN_OP_REGEXP :
           tag = "[key][regexp]";
           break;
+        case GRN_OP_FUZZY :
+          tag = "[key][fuzzy]";
+          break;
         default :
           tag = "[key][unknown]";
           break;

  Modified: lib/grn_ii.h (+3 -0)
===================================================================
--- lib/grn_ii.h    2016-02-03 06:30:45 +0900 (21e3fc7)
+++ lib/grn_ii.h    2016-02-02 14:23:44 +0900 (5ca7bab)
@@ -143,6 +143,9 @@ struct _grn_select_optarg {
   grn_obj *scorer;
   grn_obj *scorer_args_expr;
   unsigned int scorer_args_expr_offset;
+  unsigned int fuzzy_prefix_match_size;
+  unsigned int fuzzy_max_distance;
+  int fuzzy_flags;
 };
 
 GRN_API grn_rc grn_ii_column_update(grn_ctx *ctx, grn_ii *ii, grn_id id,

  Modified: lib/ii.c (+129 -14)
===================================================================
--- lib/ii.c    2016-02-03 06:30:45 +0900 (79d849b)
+++ lib/ii.c    2016-02-02 14:23:44 +0900 (9ea12e2)
@@ -5460,10 +5460,17 @@ typedef struct {
   grn_posting *p;
 } token_info;
 
+typedef struct {
+  unsigned int fuzzy_prefix_match_size;
+  unsigned int fuzzy_max_distance;
+  int fuzzy_flags;
+} token_info_optarg;
+
 #define EX_NONE   0
 #define EX_PREFIX 1
 #define EX_SUFFIX 2
 #define EX_BOTH   3
+#define EX_FUZZY  4
 
 inline static void
 token_info_expand_both(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii,
@@ -5526,7 +5533,8 @@ token_info_close(grn_ctx *ctx, token_info *ti)
 
 inline static token_info *
 token_info_open(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii,
-                const char *key, unsigned int key_size, uint32_t offset, int mode)
+                const char *key, unsigned int key_size, uint32_t offset, int mode,
+                token_info_optarg *arg)
 {
   int s = 0;
   grn_hash *h;
@@ -5589,6 +5597,26 @@ token_info_open(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii,
       grn_hash_close(ctx, h);
     }
     break;
+  case EX_FUZZY :
+    if ((h = grn_hash_create(ctx, NULL, sizeof(grn_id), 0, 0))) {
+      grn_table_fuzzy_search(ctx, lexicon, key, key_size,
+                             arg->fuzzy_prefix_match_size,
+                             arg->fuzzy_max_distance,
+                             arg->fuzzy_flags, (grn_obj *)h);
+      if (GRN_HASH_SIZE(h)) {
+        if ((ti->cursors = cursor_heap_open(ctx, GRN_HASH_SIZE(h)))) {
+          GRN_HASH_EACH(ctx, h, id, &tp, NULL, NULL, {
+            if ((s = grn_ii_estimate_size(ctx, ii, *tp))) {
+              cursor_heap_push(ctx, ti->cursors, ii, *tp, 0);
+              ti->ntoken++;
+              ti->size += s;
+            }
+          });
+        }
+      }
+      grn_hash_close(ctx, h);
+    }
+    break;
   }
   if (cursor_heap_push2(ti->cursors)) {
     token_info_close(ctx, ti);
@@ -5651,7 +5679,7 @@ token_compare(const void *a, const void *b)
 inline static grn_rc
 token_info_build(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, const char *string, unsigned int string_len,
                  token_info **tis, uint32_t *n, grn_bool *only_skip_token,
-                 grn_operator mode)
+                 grn_operator mode, token_info_optarg *arg)
 {
   token_info *ti;
   const char *key;
@@ -5665,7 +5693,7 @@ token_info_build(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, const char *string,
   *only_skip_token = GRN_FALSE;
   if (!token_cursor) { return GRN_NO_MEMORY_AVAILABLE; }
   if (mode == GRN_OP_UNSPLIT) {
-    if ((ti = token_info_open(ctx, lexicon, ii, (char *)token_cursor->orig, token_cursor->orig_blen, 0, EX_BOTH))) {
+    if ((ti = token_info_open(ctx, lexicon, ii, (char *)token_cursor->orig, token_cursor->orig_blen, 0, EX_BOTH, arg))) {
       tis[(*n)++] = ti;
       rc = GRN_SUCCESS;
     }
@@ -5691,21 +5719,21 @@ token_info_build(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, const char *string,
     switch (token_cursor->status) {
     case GRN_TOKEN_CURSOR_DOING :
       key = _grn_table_key(ctx, lexicon, tid, &size);
-      ti = token_info_open(ctx, lexicon, ii, key, size, token_cursor->pos, ef & EX_SUFFIX);
+      ti = token_info_open(ctx, lexicon, ii, key, size, token_cursor->pos, ef & EX_SUFFIX, arg);
       break;
     case GRN_TOKEN_CURSOR_DONE :
       ti = token_info_open(ctx, lexicon, ii, (const char *)token_cursor->curr,
-                           token_cursor->curr_size, 0, ef);
+                           token_cursor->curr_size, 0, ef, arg);
       /*
       key = _grn_table_key(ctx, lexicon, tid, &size);
-      ti = token_info_open(ctx, lexicon, ii, token_cursor->curr, token_cursor->curr_size, token_cursor->pos, ef);
+      ti = token_info_open(ctx, lexicon, ii, token_cursor->curr, token_cursor->curr_size, token_cursor->pos, ef, arg);
       ti = token_info_open(ctx, lexicon, ii, (char *)token_cursor->orig,
-                           token_cursor->orig_blen, token_cursor->pos, ef);
+                           token_cursor->orig_blen, token_cursor->pos, ef, arg);
       */
       break;
     case GRN_TOKEN_CURSOR_NOT_FOUND :
       ti = token_info_open(ctx, lexicon, ii, (char *)token_cursor->orig,
-                           token_cursor->orig_blen, 0, ef);
+                           token_cursor->orig_blen, 0, ef, arg);
       break;
     case GRN_TOKEN_CURSOR_DONE_SKIP :
       *only_skip_token = GRN_TRUE;
@@ -5723,17 +5751,17 @@ token_info_build(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, const char *string,
         continue;
       case GRN_TOKEN_CURSOR_DOING :
         key = _grn_table_key(ctx, lexicon, tid, &size);
-        ti = token_info_open(ctx, lexicon, ii, key, size, token_cursor->pos, EX_NONE);
+        ti = token_info_open(ctx, lexicon, ii, key, size, token_cursor->pos, EX_NONE, arg);
         break;
       case GRN_TOKEN_CURSOR_DONE :
         if (tid) {
           key = _grn_table_key(ctx, lexicon, tid, &size);
-          ti = token_info_open(ctx, lexicon, ii, key, size, token_cursor->pos, ef & EX_PREFIX);
+          ti = token_info_open(ctx, lexicon, ii, key, size, token_cursor->pos, ef & EX_PREFIX, arg);
           break;
         } /* else fallthru */
       default :
         ti = token_info_open(ctx, lexicon, ii, (char *)token_cursor->curr,
-                             token_cursor->curr_size, token_cursor->pos, ef & EX_PREFIX);
+                             token_cursor->curr_size, token_cursor->pos, ef & EX_PREFIX, arg);
         break;
       }
       if (!ti) {
@@ -5748,6 +5776,62 @@ exit :
   return rc;
 }
 
+inline static grn_rc
+token_info_build_fuzzy(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii,
+                       const char *string, unsigned int string_len,
+                       token_info **tis, uint32_t *n, grn_bool *only_skip_token,
+                       grn_operator mode, token_info_optarg *arg)
+{
+  token_info *ti;
+  grn_rc rc = GRN_END_OF_DATA;
+  unsigned int token_flags = GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER;
+  grn_token_cursor *token_cursor = grn_token_cursor_open(ctx, lexicon,
+                                                         string, string_len,
+                                                         GRN_TOKENIZE_ONLY,
+                                                         token_flags);
+  *only_skip_token = GRN_FALSE;
+  if (!token_cursor) { return GRN_NO_MEMORY_AVAILABLE; }
+  grn_token_cursor_next(ctx, token_cursor);
+  switch (token_cursor->status) {
+  case GRN_TOKEN_CURSOR_DONE_SKIP :
+    *only_skip_token = GRN_TRUE;
+    goto exit;
+  case GRN_TOKEN_CURSOR_DOING :
+  case GRN_TOKEN_CURSOR_DONE :
+    ti = token_info_open(ctx, lexicon, ii, (const char *)token_cursor->curr,
+                         token_cursor->curr_size, token_cursor->pos, EX_FUZZY, arg);
+    break;
+  default :
+    break;
+  }
+  if (!ti) {
+    goto exit ;
+  }
+  tis[(*n)++] = ti;
+  while (token_cursor->status == GRN_TOKEN_CURSOR_DOING) {
+    grn_token_cursor_next(ctx, token_cursor);
+    switch (token_cursor->status) {
+    case GRN_TOKEN_CURSOR_DONE_SKIP :
+      continue;
+    case GRN_TOKEN_CURSOR_DOING :
+    case GRN_TOKEN_CURSOR_DONE :
+      ti = token_info_open(ctx, lexicon, ii, (const char *)token_cursor->curr,
+                           token_cursor->curr_size, token_cursor->pos, EX_FUZZY, arg);
+      break;
+    default :
+      break;
+    }
+    if (!ti) {
+      goto exit;
+    }
+    tis[(*n)++] = ti;
+  }
+  rc = GRN_SUCCESS;
+exit :
+  grn_token_cursor_close(ctx, token_cursor);
+  return rc;
+}
+
 static void
 token_info_clear_offset(token_info **tis, uint32_t n)
 {
@@ -6465,6 +6549,7 @@ grn_ii_select(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_
   grn_obj *lexicon = ii->lexicon;
   grn_scorer_score_func *score_func = NULL;
   grn_scorer_matched_record record;
+  token_info_optarg token_info_arg = {0, 0, 0};
 
   if (!lexicon || !ii || !s) { return GRN_INVALID_ARGUMENT; }
   if (optarg) {
@@ -6474,6 +6559,11 @@ grn_ii_select(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_
     } else if (optarg->vector_size) {
       wvm = optarg->weight_vector ? grn_wv_static : grn_wv_constant;
     }
+    if (mode == GRN_OP_FUZZY) {
+      token_info_arg.fuzzy_prefix_match_size = optarg->fuzzy_prefix_match_size;
+      token_info_arg.fuzzy_max_distance = optarg->fuzzy_max_distance;
+      token_info_arg.fuzzy_flags = optarg->fuzzy_flags;
+    }
   }
   if (mode == GRN_OP_SIMILAR) {
     return grn_ii_similar_search(ctx, ii, string, string_len, s, op, optarg);
@@ -6494,7 +6584,11 @@ grn_ii_select(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_
   if (!(tis = GRN_MALLOC(sizeof(token_info *) * string_len * 2))) {
     return GRN_NO_MEMORY_AVAILABLE;
   }
-  if (token_info_build(ctx, lexicon, ii, string, string_len, tis, &n, &only_skip_token, mode) || !n) { goto exit; }
+  if (mode == GRN_OP_FUZZY) {
+    if (token_info_build_fuzzy(ctx, lexicon, ii, string, string_len, tis, &n, &only_skip_token, mode, &token_info_arg) || !n) { goto exit; }
+  } else {
+    if (token_info_build(ctx, lexicon, ii, string, string_len, tis, &n, &only_skip_token, mode, &token_info_arg) || !n) { goto exit; }
+  }
   switch (mode) {
   case GRN_OP_NEAR2 :
     token_info_clear_offset(tis, n);
@@ -6765,6 +6859,7 @@ grn_ii_estimate_size_for_query(grn_ctx *ctx, grn_ii *ii,
   grn_operator mode = GRN_OP_EXACT;
   double estimated_size = 0;
   double normalized_ratio = 1.0;
+  token_info_optarg token_info_arg = {0, 0, 0};
 
   if (query_len == 0) {
     return 0;
@@ -6782,6 +6877,11 @@ grn_ii_estimate_size_for_query(grn_ctx *ctx, grn_ii *ii,
     case GRN_OP_REGEXP :
       mode = optarg->mode;
       break;
+    case GRN_OP_FUZZY :
+      mode = optarg->mode;
+      token_info_arg.fuzzy_prefix_match_size = optarg->fuzzy_prefix_match_size;
+      token_info_arg.fuzzy_max_distance = optarg->fuzzy_max_distance;
+      token_info_arg.fuzzy_flags = optarg->fuzzy_flags;
     default :
       break;
     }
@@ -6797,8 +6897,17 @@ grn_ii_estimate_size_for_query(grn_ctx *ctx, grn_ii *ii,
     return 0;
   }
 
-  rc = token_info_build(ctx, lexicon, ii, query, query_len,
-                        tis, &n_tis, &only_skip_token, mode);
+  switch (mode) {
+  case GRN_OP_FUZZY :
+    rc = token_info_build_fuzzy(ctx, lexicon, ii, query, query_len,
+                                tis, &n_tis, &only_skip_token, mode, &token_info_arg);
+    break;
+  default :
+    rc = token_info_build(ctx, lexicon, ii, query, query_len,
+                          tis, &n_tis, &only_skip_token, mode, &token_info_arg);
+    break;
+  }
+
   if (rc != GRN_SUCCESS) {
     goto exit;
   }
@@ -6875,6 +6984,12 @@ grn_ii_sel(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_len
       case GRN_OP_REGEXP :
         arg.mode = optarg->mode;
         break;
+      case GRN_OP_FUZZY :
+        arg.mode = optarg->mode;
+        arg.fuzzy_prefix_match_size = optarg->fuzzy_prefix_match_size;
+        arg.fuzzy_max_distance = optarg->fuzzy_max_distance;
+        arg.fuzzy_flags = optarg->fuzzy_flags;
+        break;
       default :
         break;
       }
-------------- next part --------------
HTMLの添付ファイルを保管しました...
Download 



More information about the Groonga-commit mailing list
Back to archive index