[Groonga-commit] groonga/groonga at 86682c2 [master] pat: add max_expansion option to grn_pat_fuzzy_seach()

Back to archive index

naoa null+****@clear*****
Fri Feb 5 07:32:15 JST 2016


naoa	2016-02-05 07:32:15 +0900 (Fri, 05 Feb 2016)

  New Revision: 86682c2fa2132df67ea721406a41a8c43f9d5081
  https://github.com/groonga/groonga/commit/86682c2fa2132df67ea721406a41a8c43f9d5081

  Merged a300aa1: Merge pull request #464 from naoa/master

  Message:
    pat: add max_expansion option to grn_pat_fuzzy_seach()

  Modified files:
    include/groonga/groonga.h
    lib/pat.c
    test/unit/core/test-patricia-trie-search.c

  Modified: include/groonga/groonga.h (+1 -0)
===================================================================
--- include/groonga/groonga.h    2016-02-05 07:24:25 +0900 (247137a)
+++ include/groonga/groonga.h    2016-02-05 07:32:15 +0900 (e629ad9)
@@ -939,6 +939,7 @@ typedef struct _grn_fuzzy_optarg grn_fuzzy_optarg;
 
 struct _grn_fuzzy_optarg {
   unsigned int max_distance;
+  unsigned int max_expansion;
   unsigned int prefix_match_size;
   int flags;
 };

  Modified: lib/pat.c (+91 -13)
===================================================================
--- lib/pat.c    2016-02-05 07:24:25 +0900 (7d3d9bd)
+++ lib/pat.c    2016-02-05 07:32:15 +0900 (566d05d)
@@ -1222,6 +1222,67 @@ grn_pat_lcp_search(grn_ctx *ctx, grn_pat *pat, const void *key, uint32_t key_siz
   return r2;
 }
 
+typedef struct {
+  grn_id id;
+  uint16_t distance;
+} fuzzy_heap_node;
+
+typedef struct {
+  int n_entries;
+  int limit;
+  fuzzy_heap_node *nodes;
+} fuzzy_heap;
+
+static inline fuzzy_heap *
+fuzzy_heap_open(grn_ctx *ctx, int max)
+{
+  fuzzy_heap *h = GRN_MALLOC(sizeof(fuzzy_heap));
+  if (!h) { return NULL; }
+  h->nodes = GRN_MALLOC(sizeof(fuzzy_heap_node) * max);
+  if (!h->nodes) {
+    GRN_FREE(h);
+    return NULL;
+  }
+  h->n_entries = 0;
+  h->limit = max;
+  return h;
+}
+
+static inline grn_bool
+fuzzy_heap_push(grn_ctx *ctx, fuzzy_heap *h, grn_id id, uint16_t distance)
+{
+  int n, n2;
+  fuzzy_heap_node node = {id, distance};
+  fuzzy_heap_node node2;
+  if (h->n_entries >= h->limit) {
+    int max = h->limit * 2;
+    fuzzy_heap_node *nodes = GRN_REALLOC(h->nodes, sizeof(fuzzy_heap) * max);
+    if (!h) {
+      return GRN_FALSE;
+    }
+    h->limit = max;
+    h->nodes = nodes;
+  }
+  h->nodes[h->n_entries] = node;
+  n = h->n_entries++;
+  while (n) {
+    n2 = (n - 1) >> 1;
+    if (h->nodes[n2].distance <= h->nodes[n].distance) { break; }
+    node2 = h->nodes[n];
+    h->nodes[n] = h->nodes[n2];
+    h->nodes[n2] = node2;
+    n = n2;
+  }
+  return GRN_TRUE;
+}
+
+static inline void
+fuzzy_heap_close(grn_ctx *ctx, fuzzy_heap *h)
+{
+  GRN_FREE(h->nodes);
+  GRN_FREE(h);
+}
+
 #define DIST(ox,oy) (dists[((lx + 1) * (oy)) + (ox)])
 
 inline static uint16_t
@@ -1295,7 +1356,7 @@ _grn_pat_fuzzy_search(grn_ctx *ctx, grn_pat *pat, grn_id id,
                       const char *key, uint32_t key_size,
                       uint16_t *dists, uint32_t lx,
                       int last_check, fuzzy_node *last_node,
-                      uint32_t max_distance, int flags, grn_hash *h)
+                      uint32_t max_distance, int flags, fuzzy_heap *heap)
 {
   pat_node *node = NULL;
   int check, len;
@@ -1320,12 +1381,12 @@ _grn_pat_fuzzy_search(grn_ctx *ctx, grn_pat *pat, grn_id id,
     _grn_pat_fuzzy_search(ctx, pat, node->lr[0],
                           key, key_size, dists, lx,
                           check, last_node,
-                          max_distance, flags, h);
+                          max_distance, flags, heap);
 
     _grn_pat_fuzzy_search(ctx, pat, node->lr[1],
                           key, key_size, dists, lx,
                           check, last_node,
-                          max_distance, flags, h);
+                          max_distance, flags, heap);
   } else {
     if (id) {
       /* Set already calculated common prefix length */
@@ -1364,14 +1425,7 @@ _grn_pat_fuzzy_search(grn_ctx *ctx, grn_pat *pat, grn_id id,
                                        offset, max_distance,
                                        &(last_node->can_transition), flags);
         if (distance <= max_distance) {
-          if (DB_OBJ(h)->header.flags & GRN_OBJ_WITH_SUBREC) {
-            grn_rset_recinfo *ri;
-            if (grn_hash_add(ctx, h, &id, sizeof(grn_id), (void **)&ri, NULL)) {
-              ri->score = distance;
-            }
-          } else {
-            grn_hash_add(ctx, h, &id, sizeof(grn_id), NULL, NULL);
-          }
+          fuzzy_heap_push(ctx, heap, id, distance);
         }
       }
       last_node->key = k;
@@ -1381,6 +1435,8 @@ _grn_pat_fuzzy_search(grn_ctx *ctx, grn_pat *pat, grn_id id,
   return;
 }
 
+#define HEAP_SIZE 256
+
 grn_rc
 grn_pat_fuzzy_search(grn_ctx *ctx, grn_pat *pat,
                      const void *key, uint32_t key_size,
@@ -1389,11 +1445,13 @@ grn_pat_fuzzy_search(grn_ctx *ctx, grn_pat *pat,
   pat_node *node;
   grn_id id;
   uint16_t *dists;
-  uint32_t lx, len, x, y;
+  uint32_t lx, len, x, y, i;
   const char *s = key;
   const char *e = (const char *)key + key_size;
   fuzzy_node last_node;
+  fuzzy_heap *heap;
   uint32_t max_distance = 1;
+  uint32_t max_expansion = 0;
   uint32_t prefix_match_size = 0;
   int flags = 0;
   grn_rc rc = grn_pat_error_if_truncated(ctx, pat);
@@ -1402,6 +1460,7 @@ grn_pat_fuzzy_search(grn_ctx *ctx, grn_pat *pat,
   }
   if (args) {
     max_distance = args->max_distance;
+    max_expansion = args->max_expansion;
     prefix_match_size = args->prefix_match_size;
     flags = args->flags;
   }
@@ -1411,6 +1470,11 @@ grn_pat_fuzzy_search(grn_ctx *ctx, grn_pat *pat,
     return GRN_INVALID_ARGUMENT;
   }
 
+  heap = fuzzy_heap_open(ctx, HEAP_SIZE);
+  if (!heap) {
+    return GRN_NO_MEMORY_AVAILABLE;
+  }
+
   PAT_AT(pat, GRN_ID_NIL, node);
   id = node->lr[1];
 
@@ -1444,8 +1508,22 @@ grn_pat_fuzzy_search(grn_ctx *ctx, grn_pat *pat,
   last_node.can_transition = GRN_TRUE;
   _grn_pat_fuzzy_search(ctx, pat, id,
                         key, key_size, dists, lx,
-                        -1, &last_node, max_distance, flags, h);
+                        -1, &last_node, max_distance, flags, heap);
   GRN_FREE(dists);
+  for (i = 0; i < heap->n_entries; i++) {
+    if (max_expansion > 0 && i >= max_expansion) {
+      break;
+    }
+    if (DB_OBJ(h)->header.flags & GRN_OBJ_WITH_SUBREC) {
+      grn_rset_recinfo *ri;
+      if (grn_hash_add(ctx, h, &(heap->nodes[i].id), sizeof(grn_id), (void **)&ri, NULL)) {
+        ri->score = heap->nodes[i].distance;
+      }
+    } else {
+      grn_hash_add(ctx, h, &(heap->nodes[i].id), sizeof(grn_id), NULL, NULL);
+    }
+  }
+  fuzzy_heap_close(ctx, heap);
   if (grn_hash_size(ctx, h)) {
     return GRN_SUCCESS;
   } else {

  Modified: test/unit/core/test-patricia-trie-search.c (+1 -0)
===================================================================
--- test/unit/core/test-patricia-trie-search.c    2016-02-05 07:24:25 +0900 (34c3a6f)
+++ test/unit/core/test-patricia-trie-search.c    2016-02-05 07:32:15 +0900 (4dff853)
@@ -429,6 +429,7 @@ test_fuzzy_search(gconstpointer data)
   grn_fuzzy_optarg args;
   args.prefix_match_size = 0;
   args.max_distance = 1;
+  args.max_expansion = 0;
   args.flags = GRN_TABLE_FUZZY_WITH_TRANSPOSITION;
 
   trie_test_data_set_parameters(test_data);
-------------- next part --------------
HTML����������������������������...
Download 



More information about the Groonga-commit mailing list
Back to archive index