[Groonga-commit] groonga/groonga at 3511a23 [master] pat: Add grn_pat_fussy_search()

Back to archive index

naoa null+****@clear*****
Mon Feb 1 01:03:12 JST 2016


naoa	2016-02-01 01:03:12 +0900 (Mon, 01 Feb 2016)

  New Revision: 3511a23248290fb96eb312cfafed068b217ef1e5
  https://github.com/groonga/groonga/commit/3511a23248290fb96eb312cfafed068b217ef1e5

  Merged 2d929d3: Merge pull request #460 from naoa/pat-fussy-search

  Message:
    pat: Add grn_pat_fussy_search()

  Modified files:
    lib/grn_pat.h
    lib/pat.c
    test/unit/core/test-patricia-trie-search.c

  Modified: lib/grn_pat.h (+6 -0)
===================================================================
--- lib/grn_pat.h    2016-01-31 16:36:16 +0900 (6a2a433)
+++ lib/grn_pat.h    2016-02-01 01:03:12 +0900 (db642be)
@@ -107,6 +107,12 @@ void grn_pat_cursor_inspect(grn_ctx *ctx, grn_pat_cursor *c, grn_obj *buf);
 grn_rc grn_pat_cache_enable(grn_ctx *ctx, grn_pat *pat, uint32_t cache_size);
 void grn_pat_cache_disable(grn_ctx *ctx, grn_pat *pat);
 
+#define WITH_TRANSPOSITION                  (0x01)
+
+GRN_API grn_rc grn_pat_fussy_search(grn_ctx *ctx, grn_pat *pat,
+                                    const void *key, unsigned int key_size, unsigned int max_size,
+                                    unsigned int max_distance, int flags, grn_hash *h);
+
 #ifdef __cplusplus
 }
 #endif

  Modified: lib/pat.c (+216 -0)
===================================================================
--- lib/pat.c    2016-01-31 16:36:16 +0900 (0d94a71)
+++ lib/pat.c    2016-02-01 01:03:12 +0900 (f185f51)
@@ -1224,6 +1224,222 @@ grn_pat_lcp_search(grn_ctx *ctx, grn_pat *pat, const void *key, uint32_t key_siz
   return r2;
 }
 
+#define DIST(ox,oy) (dists[((lx + 1) * (oy)) + (ox)])
+
+inline static uint16_t
+calc_edit_distance_by_offset(grn_ctx *ctx,
+                             const char *sx, const char *ex,
+                             const char *sy, const char *ey,
+                             uint16_t *dists, uint32_t lx,
+                             uint32_t offset, uint32_t max_distance,
+                             grn_bool *can_transition, int flags)
+{
+  uint32_t cx, cy, x = 1, y = 1;
+  const char *px, *py;
+
+  /* Skip already calculated rows */
+  for (py = sy, y = 1; py < ey && (cy = grn_charlen(ctx, py, ey)); py += cy, y++) {
+    if (py - sy >= offset) {
+      break;
+    }
+  }
+  for (; py < ey && (cy = grn_charlen(ctx, py, ey)); py += cy, y++) {
+    /* children nodes will be no longer smaller than max distance
+     * with only insertion costs.
+     * This is end of row on allocated memory. */
+    if (y > lx + max_distance) {
+      *can_transition = GRN_FALSE;
+      return max_distance + 1;
+    }
+
+    for (px = sx, x = 1; px < ex && (cx = grn_charlen(ctx, px, ex)); px += cx, x++) {
+      if (cx == cy && !memcmp(px, py, cx)) {
+        DIST(x, y) = DIST(x - 1, y - 1);
+      } else {
+        uint32_t a, b, c;
+        a = DIST(x - 1, y) + 1;
+        b = DIST(x, y - 1) + 1;
+        c = DIST(x - 1, y - 1) + 1;
+        DIST(x, y) = ((a < b) ? ((a < c) ? a : c) : ((b < c) ? b : c));
+        if (flags == WITH_TRANSPOSITION
+            && x > 1 && y > 1
+            && cx == cy
+            && memcmp(px, py - cy, cx) == 0
+            && memcmp(px - cx, py, cx) == 0) {
+          uint32_t t = DIST(x - 2, y - 2) + 1;
+          DIST(x, y) = ((DIST(x, y) < t) ? DIST(x, y) : t);
+        }
+      }
+    }
+  }
+  if (lx) {
+    /* If there is no cell which is smaller than equal to max distance on end of row,
+     * children nodes will be no longer smaller than max distance */
+    *can_transition = GRN_FALSE;
+    for (x = 1; x <= lx; x++) {
+      if (DIST(x, y - 1) <= max_distance) {
+        *can_transition = GRN_TRUE;
+        break;
+      }
+    }
+  }
+  return DIST(lx, y - 1);
+}
+
+typedef struct {
+  const char *key;
+  int key_length;
+  grn_bool can_transition;
+} fussy_node;
+
+inline static void
+_grn_pat_fussy_search(grn_ctx *ctx, grn_pat *pat, grn_id id,
+                      const char *key, uint32_t key_size,
+                      uint16_t *dists, uint32_t lx,
+                      int last_check, fussy_node *last_node,
+                      uint32_t max_distance, int flags, grn_hash *h)
+{
+  pat_node *node = NULL;
+  int check, len;
+  const char *k;
+  uint32_t offset = 0;
+
+  PAT_AT(pat, id, node);
+  if (!node) {
+    return;
+  }
+  check = PAT_CHK(node);
+  len = PAT_LEN(node);
+  k = pat_node_get_key(ctx, pat, node);
+
+  if (check > last_check) {
+    if (len >= last_node->key_length &&
+        !memcmp(k, last_node->key, last_node->key_length)) {
+      if (last_node->can_transition == GRN_FALSE) {
+        return;
+      }
+    }
+    _grn_pat_fussy_search(ctx, pat, node->lr[0],
+                          key, key_size, dists, lx,
+                          check, last_node,
+                          max_distance, flags, h);
+
+    _grn_pat_fussy_search(ctx, pat, node->lr[1],
+                          key, key_size, dists, lx,
+                          check, last_node,
+                          max_distance, flags, h);
+  } else {
+    if (id) {
+      /* Set already calculated common prefix length */
+      if (len >= last_node->key_length &&
+          !memcmp(k, last_node->key, last_node->key_length)) {
+        if (last_node->can_transition == GRN_FALSE) {
+          return;
+        }
+        offset = last_node->key_length;
+      } else {
+        if (last_node->can_transition == GRN_FALSE) {
+          last_node->can_transition = GRN_TRUE;
+        }
+        if (last_node->key_length) {
+          const char *kp = k;
+          const char *ke = k + len;
+          const char *p = last_node->key;
+          const char *e = last_node->key + last_node->key_length;
+          int lp;
+          for (;p < e && kp < ke && (lp = grn_charlen(ctx, p, e));
+               p += lp, kp += lp) {
+            if (p + lp <= e && kp + lp <= ke && memcmp(p, kp, lp)) {
+              break;
+            }
+          }
+          offset = kp - k;
+        }
+      }
+      if (len - offset) {
+        uint16_t distance = 0;
+        distance =
+          calc_edit_distance_by_offset(ctx,
+                                       key, key + key_size,
+                                       k, k + len,
+                                       dists, lx,
+                                       offset, max_distance,
+                                       &(last_node->can_transition), flags);
+        if (distance <= max_distance) {
+          if (DB_OBJ(h)->header.flags & GRN_OBJ_WITH_SUBREC) {
+            grn_rset_recinfo *ri;
+            if (grn_hash_add(ctx, h, &id, sizeof(grn_id), (void **)&ri, NULL)) {
+              ri->score = distance;
+            }
+          } else {
+            grn_hash_add(ctx, h, &id, sizeof(grn_id), NULL, NULL);
+          }
+        }
+      }
+      last_node->key = k;
+      last_node->key_length = len;
+    }
+  }
+  return;
+}
+
+grn_rc
+grn_pat_fussy_search(grn_ctx *ctx, grn_pat *pat,
+                     const void *key, uint32_t key_size, uint32_t max_size,
+                     uint32_t max_distance, int flags, grn_hash *h)
+{
+  pat_node *node;
+  grn_id id;
+  uint16_t *dists;
+  uint32_t lx, len, x, y;
+  const char *s = key;
+  const char *e = (const char *)key + key_size;
+  fussy_node last_node;
+  PAT_AT(pat, GRN_ID_NIL, node);
+  id = node->lr[1];
+
+  if (key_size > GRN_TABLE_MAX_KEY_SIZE ||
+      max_distance > GRN_TABLE_MAX_KEY_SIZE ||
+      max_size > key_size) {
+    return GRN_INVALID_ARGUMENT;
+  }
+  if (grn_pat_error_if_truncated(ctx, pat) != GRN_SUCCESS) {
+    return GRN_ID_NIL;
+  }
+  if (max_size) {
+    grn_pat_cursor *cur;
+    if ((cur = grn_pat_cursor_open(ctx, pat, key, max_size,
+                                   NULL, 0, 0, -1, GRN_CURSOR_PREFIX))) {
+      grn_id tid;
+      tid = grn_pat_cursor_next(ctx, cur);
+      grn_pat_cursor_close(ctx, cur);
+      if (tid) {
+        id = tid;
+      } else {
+        return GRN_END_OF_DATA;
+      }
+    }
+  }
+  for (lx = 0; s < e && (len = grn_charlen(ctx, s, e)); s += len, lx++);
+  dists = GRN_MALLOC((lx + 1) * (lx + max_distance + 1) * sizeof(uint16_t));
+
+  for (x = 0; x <= lx; x++) { DIST(x, 0) = x; }
+  for (y = 0; y <= lx + max_distance ; y++) { DIST(0, y) = y; }
+
+  last_node.key = NULL;
+  last_node.key_length = 0;
+  last_node.can_transition = GRN_TRUE;
+  _grn_pat_fussy_search(ctx, pat, id,
+                        key, key_size, dists, lx,
+                        -1, &last_node, max_distance, flags, h);
+  GRN_FREE(dists);
+  if (grn_hash_size(ctx, h)) {
+    return GRN_SUCCESS;
+  } else {
+    return GRN_END_OF_DATA;
+  }
+}
+
 inline static grn_rc
 _grn_pat_del(grn_ctx *ctx, grn_pat *pat, const char *key, uint32_t key_size, int shared,
              grn_table_delete_optarg *optarg)

  Modified: test/unit/core/test-patricia-trie-search.c (+63 -0)
===================================================================
--- test/unit/core/test-patricia-trie-search.c    2016-01-31 16:36:16 +0900 (2f948d7)
+++ test/unit/core/test-patricia-trie-search.c    2016-02-01 01:03:12 +0900 (894f31e)
@@ -24,6 +24,8 @@ void data_prefix_search(void);
 void test_prefix_search(gconstpointer data);
 void data_suffix_search(void);
 void test_suffix_search(gconstpointer data);
+void data_fussy_search(void);
+void test_fussy_search(gconstpointer data);
 
 static GList *keys;
 
@@ -385,3 +387,64 @@ test_suffix_search(gconstpointer data)
   gcut_assert_equal_list_string(test_data->expected_strings,
                                 retrieve_all_keys());
 }
+
+void
+data_fussy_search(void)
+{
+  cut_add_data("nonexistence",
+               xfix_test_data_new(GRN_END_OF_DATA, NULL, "cccc", NULL, NULL),
+               xfix_test_data_free,
+               "insertion",
+               xfix_test_data_new(GRN_SUCCESS,
+                                  gcut_list_string_new("ああ", "あ", NULL),
+               "あ", NULL, NULL),
+               xfix_test_data_free,
+               "deletion",
+               xfix_test_data_new(GRN_SUCCESS,
+                                  gcut_list_string_new("bbbbb", NULL),
+               "bbbbbb", NULL, NULL),
+               xfix_test_data_free,
+               "substitution",
+               xfix_test_data_new(GRN_SUCCESS,
+                                  gcut_list_string_new("cdefg", NULL),
+               "cdefh", NULL, NULL),
+               xfix_test_data_free,
+               "transposition",
+               xfix_test_data_new(GRN_SUCCESS,
+                                  gcut_list_string_new("cdefg", NULL),
+               "cdegf", NULL, NULL),
+               xfix_test_data_free);
+}
+
+void
+test_fussy_search(gconstpointer data)
+{
+  const grn_trie_test_data *test_data = data;
+  const gchar key1[]  = "あ";
+  const gchar key2[]  = "ああ";
+  const gchar key3[]  = "あああ";
+  const gchar key4[]  = "bbbb";
+  const gchar key5[]  = "bbbbb";
+  const gchar key6[]  = "cdefg";
+
+  trie_test_data_set_parameters(test_data);
+
+  cut_assert_create_trie();
+
+  cut_assert_lookup_add(key1);
+  cut_assert_lookup_add(key2);
+  cut_assert_lookup_add(key3);
+  cut_assert_lookup_add(key4);
+  cut_assert_lookup_add(key5);
+  cut_assert_lookup_add(key6);
+
+  cut_assert_create_hash();
+  grn_test_assert_equal_rc(test_data->expected_rc,
+                           grn_pat_fussy_search(context, trie,
+                                                 test_data->search_key,
+                                                 strlen(test_data->search_key),
+                                                 0, 1, 1,
+                                                 hash));
+  gcut_assert_equal_list_string(test_data->expected_strings,
+                                retrieve_all_keys());
+}
-------------- next part --------------
HTML����������������������������...
Download 



More information about the Groonga-commit mailing list
Back to archive index