[Groonga-commit] groonga/groonga at 740e845 [master] Extract codes for edit_distance and fuzzy_search

Back to archive index

Kouhei Sutou null+****@clear*****
Fri Feb 5 18:40:56 JST 2016


Kouhei Sutou	2016-02-05 18:40:56 +0900 (Fri, 05 Feb 2016)

  New Revision: 740e845df22d351cc1f416da838d76f9dd6d9da4
  https://github.com/groonga/groonga/commit/740e845df22d351cc1f416da838d76f9dd6d9da4

  Message:
    Extract codes for edit_distance and fuzzy_search

  Added files:
    lib/proc/proc_fuzzy_search.c
  Modified files:
    lib/grn_proc.h
    lib/proc.c
    lib/proc/sources.am

  Modified: lib/grn_proc.h (+2 -0)
===================================================================
--- lib/grn_proc.h    2016-02-05 18:40:17 +0900 (a5b03c5)
+++ lib/grn_proc.h    2016-02-05 18:40:56 +0900 (e1896cf)
@@ -33,6 +33,8 @@ void grn_proc_init_clearlock(grn_ctx *ctx);
 void grn_proc_init_config_get(grn_ctx *ctx);
 void grn_proc_init_config_set(grn_ctx *ctx);
 void grn_proc_init_config_delete(grn_ctx *ctx);
+void grn_proc_init_edit_distance(grn_ctx *ctx);
+void grn_proc_init_fuzzy_search(grn_ctx *ctx);
 void grn_proc_init_inspect(grn_ctx *ctx);
 void grn_proc_init_lock_acquire(grn_ctx *ctx);
 void grn_proc_init_lock_clear(grn_ctx *ctx);

  Modified: lib/proc.c (+3 -406)
===================================================================
--- lib/proc.c    2016-02-05 18:40:17 +0900 (9aa93fe)
+++ lib/proc.c    2016-02-05 18:40:56 +0900 (b0df3e6)
@@ -2433,27 +2433,6 @@ dump_column(grn_ctx *ctx, grn_obj *outbuf , grn_obj *table, grn_obj *column)
   grn_obj_unlink(ctx, type);
 }
 
-static int
-reference_column_p(grn_ctx *ctx, grn_obj *column)
-{
-  grn_obj *range;
-
-  range = grn_ctx_at(ctx, grn_obj_get_range(ctx, column));
-  if (!range) {
-    return GRN_FALSE;
-  }
-
-  switch (range->header.type) {
-  case GRN_TABLE_HASH_KEY:
-  case GRN_TABLE_PAT_KEY:
-  case GRN_TABLE_DAT_KEY:
-  case GRN_TABLE_NO_KEY:
-    return GRN_TRUE;
-  default:
-    return GRN_FALSE;
-  }
-}
-
 static void
 dump_columns(grn_ctx *ctx, grn_obj *outbuf, grn_obj *table,
              grn_obj *pending_reference_columns)
@@ -2474,7 +2453,7 @@ dump_columns(grn_ctx *ctx, grn_obj *outbuf, grn_obj *table,
       if ((column = grn_ctx_at(ctx, *key))) {
         if (GRN_OBJ_INDEX_COLUMNP(column)) {
           /* do nothing */
-        } else if (reference_column_p(ctx, column)) {
+        } else if (grn_obj_is_reference_column(ctx, column)) {
           GRN_PTR_PUT(ctx, pending_reference_columns, column);
         } else {
           dump_column(ctx, outbuf, table, column);
@@ -4316,70 +4295,6 @@ func_geo_distance3(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_
   return obj;
 }
 
-#define DIST(ox,oy) (dists[((lx + 1) * (oy)) + (ox)])
-
-static uint32_t
-calc_edit_distance(grn_ctx *ctx, char *sx, char *ex, char *sy, char *ey, int flags)
-{
-  int d = 0;
-  uint32_t cx, lx, cy, ly, *dists;
-  char *px, *py;
-  for (px = sx, lx = 0; px < ex && (cx = grn_charlen(ctx, px, ex)); px += cx, lx++);
-  for (py = sy, ly = 0; py < ey && (cy = grn_charlen(ctx, py, ey)); py += cy, ly++);
-  if ((dists = GRN_MALLOC((lx + 1) * (ly + 1) * sizeof(uint32_t)))) {
-    uint32_t x, y;
-    for (x = 0; x <= lx; x++) { DIST(x, 0) = x; }
-    for (y = 0; y <= ly; y++) { DIST(0, y) = y; }
-    for (x = 1, px = sx; x <= lx; x++, px += cx) {
-      cx = grn_charlen(ctx, px, ex);
-      for (y = 1, py = sy; y <= ly; y++, py += cy) {
-        cy = grn_charlen(ctx, py, ey);
-        if (cx == cy && !memcmp(px, py, cx)) {
-          DIST(x, y) = DIST(x - 1, y - 1);
-        } else {
-          uint32_t a = DIST(x - 1, y) + 1;
-          uint32_t b = DIST(x, y - 1) + 1;
-          uint32_t c = DIST(x - 1, y - 1) + 1;
-          DIST(x, y) = ((a < b) ? ((a < c) ? a : c) : ((b < c) ? b : c));
-          if (flags & GRN_TABLE_FUZZY_SEARCH_WITH_TRANSPOSITION &&
-              x > 1 && y > 1 && cx == cy &&
-              memcmp(px, py - cy, cx) == 0 &&
-              memcmp(px - cx, py, cx) == 0) {
-            uint32_t t = DIST(x - 2, y - 2) + 1;
-            DIST(x, y) = ((DIST(x, y) < t) ? DIST(x, y) : t);
-          }
-        }
-      }
-    }
-    d = DIST(lx, ly);
-    GRN_FREE(dists);
-  }
-  return d;
-}
-
-static grn_obj *
-func_edit_distance(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{
-#define N_REQUIRED_ARGS 2
-#define MAX_ARGS 3
-  int d = 0;
-  int flags = 0;
-  grn_obj *obj;
-  if (nargs >= N_REQUIRED_ARGS && nargs <= MAX_ARGS) {
-    if (nargs == MAX_ARGS && GRN_BOOL_VALUE(args[2])) {
-      flags |= GRN_TABLE_FUZZY_SEARCH_WITH_TRANSPOSITION;
-    }
-    d = calc_edit_distance(ctx, GRN_TEXT_VALUE(args[0]), GRN_BULK_CURR(args[0]),
-                           GRN_TEXT_VALUE(args[1]), GRN_BULK_CURR(args[1]), flags);
-  }
-  if ((obj = GRN_PROC_ALLOC(GRN_DB_UINT32, 0))) {
-    GRN_UINT32_SET(ctx, obj, d);
-  }
-  return obj;
-#undef N_REQUIRED_ARGS
-#undef MAX_ARGS
-}
-
 static grn_obj *
 func_all_records(grn_ctx *ctx, int nargs, grn_obj **args,
                  grn_user_data *user_data)
@@ -6829,316 +6744,6 @@ exit :
   return rc;
 }
 
-#define SCORE_HEAP_SIZE 256
-
-typedef struct {
-  grn_id id;
-  uint32_t score;
-} score_heap_node;
-
-typedef struct {
-  int n_entries;
-  int limit;
-  score_heap_node *nodes;
-} score_heap;
-
-static inline score_heap *
-score_heap_open(grn_ctx *ctx, int max)
-{
-  score_heap *h = GRN_MALLOC(sizeof(score_heap));
-  if (!h) { return NULL; }
-  h->nodes = GRN_MALLOC(sizeof(score_heap_node) * max);
-  if (!h->nodes) {
-    GRN_FREE(h);
-    return NULL;
-  }
-  h->n_entries = 0;
-  h->limit = max;
-  return h;
-}
-
-static inline grn_bool
-score_heap_push(grn_ctx *ctx, score_heap *h, grn_id id, uint32_t score)
-{
-  int n, n2;
-  score_heap_node node = {id, score};
-  score_heap_node node2;
-  if (h->n_entries >= h->limit) {
-    int max = h->limit * 2;
-    score_heap_node *nodes = GRN_REALLOC(h->nodes, sizeof(score_heap) * max);
-    if (!h) {
-      return GRN_FALSE;
-    }
-    h->limit = max;
-    h->nodes = nodes;
-  }
-  h->nodes[h->n_entries] = node;
-  n = h->n_entries++;
-  while (n) {
-    n2 = (n - 1) >> 1;
-    if (h->nodes[n2].score <= h->nodes[n].score) { break; }
-    node2 = h->nodes[n];
-    h->nodes[n] = h->nodes[n2];
-    h->nodes[n2] = node2;
-    n = n2;
-  }
-  return GRN_TRUE;
-}
-
-static inline void
-score_heap_close(grn_ctx *ctx, score_heap *h)
-{
-  GRN_FREE(h->nodes);
-  GRN_FREE(h);
-}
-
-static grn_rc
-sequential_fuzzy_search(grn_ctx *ctx, grn_obj *table, grn_obj *column, grn_obj *query,
-                        uint32_t max_distance, uint32_t prefix_match_size,
-                        uint32_t max_expansion, int flags, grn_obj *hash)
-{
-  grn_table_cursor *tc;
-  char *sx = GRN_TEXT_VALUE(query);
-  char *ex = GRN_BULK_CURR(query);
-
-  if ((tc = grn_table_cursor_open(ctx, table, NULL, 0, NULL, 0, 0, -1, GRN_CURSOR_BY_ID))) {
-    grn_id id;
-    grn_obj value;
-    score_heap *heap;
-    int i, n;
-    GRN_TEXT_INIT(&value, 0);
-
-    heap = score_heap_open(ctx, SCORE_HEAP_SIZE);
-    if (!heap) {
-      return GRN_NO_MEMORY_AVAILABLE;
-    }
-
-    while ((id = grn_table_cursor_next(ctx, tc))) {
-      unsigned int distance = 0;
-      grn_obj *domain;
-      GRN_BULK_REWIND(&value);
-      grn_obj_get_value(ctx, column, id, &value);
-      domain = grn_ctx_at(ctx, ((&value))->header.domain);
-      if ((&(value))->header.type == GRN_VECTOR) {
-        n = grn_vector_size(ctx, &value);
-        for (i = 0; i < n; i++) {
-          unsigned int length;
-          const char *vector_value = NULL;
-          length = grn_vector_get_element(ctx, &value, i, &vector_value, NULL, NULL);
-
-          if (!prefix_match_size ||
-              (prefix_match_size > 0 && length >= prefix_match_size &&
-               !memcmp(sx, vector_value, prefix_match_size))) {
-            distance = calc_edit_distance(ctx, sx, ex,
-                                          (char *)vector_value,
-                                          (char *)vector_value + length, flags);
-            if (distance <= max_distance) {
-              score_heap_push(ctx, heap, id, distance);
-              break;
-            }
-          }
-        }
-      } else if ((&(value))->header.type == GRN_UVECTOR &&
-                  grn_obj_is_table(ctx, domain)) {
-        n = grn_vector_size(ctx, &value);
-        for (i = 0; i < n; i++) {
-          grn_id rid;
-          char key_name[GRN_TABLE_MAX_KEY_SIZE];
-          int key_length;
-          rid = grn_uvector_get_element(ctx, &value, i, NULL);
-          key_length = grn_table_get_key(ctx, domain, rid, key_name, GRN_TABLE_MAX_KEY_SIZE);
-
-          if (!prefix_match_size ||
-              (prefix_match_size > 0 && key_length >= prefix_match_size &&
-               !memcmp(sx, key_name, prefix_match_size))) {
-            distance = calc_edit_distance(ctx, sx, ex,
-                                          key_name, key_name + key_length, flags);
-            if (distance <= max_distance) {
-              score_heap_push(ctx, heap, id, distance);
-              break;
-            }
-          }
-        }
-      } else {
-        if (reference_column_p(ctx, column)) {
-          grn_id rid;
-          char key_name[GRN_TABLE_MAX_KEY_SIZE];
-          int key_length;
-          rid = GRN_RECORD_VALUE(&value);
-          key_length = grn_table_get_key(ctx, domain, rid, key_name, GRN_TABLE_MAX_KEY_SIZE);
-          if (!prefix_match_size ||
-              (prefix_match_size > 0 && key_length >= prefix_match_size &&
-               !memcmp(sx, key_name, prefix_match_size))) {
-            distance = calc_edit_distance(ctx, sx, ex,
-                                          key_name, key_name + key_length, flags);
-            if (distance <= max_distance) {
-              score_heap_push(ctx, heap, id, distance);
-            }
-          }
-        } else {
-          if (!prefix_match_size ||
-              (prefix_match_size > 0 && GRN_TEXT_LEN(&value) >= prefix_match_size &&
-               !memcmp(sx, GRN_TEXT_VALUE(&value), prefix_match_size))) {
-            distance = calc_edit_distance(ctx, sx, ex,
-                                          GRN_TEXT_VALUE(&value),
-                                          GRN_BULK_CURR(&value), flags);
-            if (distance <= max_distance) {
-              score_heap_push(ctx, heap, id, distance);
-            }
-          }
-        }
-      }
-      grn_obj_unlink(ctx, domain);
-    }
-    grn_table_cursor_close(ctx, tc);
-    grn_obj_unlink(ctx, &value);
-
-    for (i = 0; i < heap->n_entries; i++) {
-      if (max_expansion > 0 && i >= max_expansion) {
-        break;
-      }
-      grn_rset_recinfo *ri;
-      if (grn_hash_add(ctx, (grn_hash *)hash, &(heap->nodes[i].id), sizeof(grn_id), (void **)&ri, NULL)) {
-        ri->score = heap->nodes[i].score;
-      }
-    }
-    score_heap_close(ctx, heap);
-  }
-
-  return GRN_SUCCESS;
-}
-
-static grn_rc
-selector_fuzzy_search(grn_ctx *ctx, grn_obj *table, grn_obj *index,
-                      int nargs, grn_obj **args,
-                      grn_obj *res, grn_operator op)
-{
-  grn_rc rc = GRN_SUCCESS;
-  grn_obj *target = NULL;
-  grn_obj *obj;
-  grn_obj *query;
-  uint32_t max_distance = 1;
-  uint32_t prefix_length = 0;
-  uint32_t prefix_match_size = 0;
-  uint32_t max_expansion = 0;
-  int flags = 0;
-  grn_bool use_sequential_search = GRN_FALSE;
-
-  if ((nargs - 1) < 2) {
-    ERR(GRN_INVALID_ARGUMENT,
-        "fuzzy_serach(): wrong number of arguments (%d ...)", nargs - 1);
-    rc = ctx->rc;
-    goto exit;
-  }
-  obj = args[1];
-  query = args[2];
-
-  if (nargs >= 4) {
-    max_distance = GRN_UINT32_VALUE(args[3]);
-  }
-  if (nargs >= 5) {
-    prefix_length = GRN_UINT32_VALUE(args[4]);
-  }
-  if (nargs >= 6) {
-    max_expansion = GRN_UINT32_VALUE(args[5]);
-  }
-  if (nargs == 7) {
-    if (GRN_BOOL_VALUE(args[6])) {
-      flags |= GRN_TABLE_FUZZY_SEARCH_WITH_TRANSPOSITION;
-    }
-  }
-
-  if (index) {
-    target = index;
-  } else {
-    if (obj->header.type == GRN_COLUMN_INDEX) {
-      target = obj;
-    } else {
-      grn_column_index(ctx, obj, GRN_OP_FUZZY, &target, 1, NULL);
-    }
-  }
-
-  if (target) {
-    grn_obj *lexicon;
-    lexicon = grn_ctx_at(ctx, target->header.domain);
-    if (lexicon->header.type != GRN_TABLE_PAT_KEY) {
-      use_sequential_search = GRN_TRUE;
-    }
-  } else {
-    if (grn_obj_is_key_accessor(ctx, obj) &&
-        table->header.type == GRN_TABLE_PAT_KEY) {
-       target = table;
-    } else {
-      use_sequential_search = GRN_TRUE;
-    }
-  }
-
-  if (prefix_length) {
-    const char *s = GRN_TEXT_VALUE(query);
-    const char *e = GRN_BULK_CURR(query);
-    const char *p;
-    unsigned int cl = 0;
-    unsigned int length = 0;
-    for (p = s; p < e && (cl = grn_charlen(ctx, p, e)); p += cl) {
-      length++;
-      if (length > prefix_length) {
-        break;
-      }
-    }
-    prefix_match_size = p - s;
-  }
-
-  if (use_sequential_search) {
-    grn_obj *hash;
-    hash = grn_table_create(ctx, NULL, 0, NULL,
-                            GRN_OBJ_TABLE_HASH_KEY|GRN_OBJ_WITH_SUBREC,
-                            table, NULL);
-    if (!hash) {
-      return GRN_NO_MEMORY_AVAILABLE;
-    }
-
-    if (op == GRN_OP_AND) {
-      rc = sequential_fuzzy_search(ctx, res, obj, query,
-                                   max_distance, prefix_match_size,
-                                   max_expansion, flags, hash);
-    } else {
-      rc = sequential_fuzzy_search(ctx, table, obj, query,
-                                   max_distance, prefix_match_size,
-                                   max_expansion, flags, hash);
-    }
-
-    if (rc == GRN_SUCCESS) {
-      rc = grn_table_setoperation(ctx, res, hash, res, op);
-    }
-    grn_obj_unlink(ctx, hash);
-    goto exit;
-  }
-
-  if (!target) {
-    grn_obj inspected;
-    GRN_TEXT_INIT(&inspected, 0);
-    grn_inspect(ctx, &inspected, target);
-    ERR(GRN_INVALID_ARGUMENT,
-        "fuzzy_search(): column mast be COLUMN_INDEX or TABLE_PAT_KEY: <%.*s>",
-        (int)GRN_TEXT_LEN(&inspected),
-        GRN_TEXT_VALUE(&inspected));
-    rc = ctx->rc;
-    GRN_OBJ_FIN(ctx, &inspected);
-    goto exit;
-  } else {
-    grn_search_optarg options = {0};
-    options.mode = GRN_OP_FUZZY;
-    options.fuzzy.prefix_match_size = prefix_match_size;
-    options.fuzzy.max_distance = max_distance;
-    options.fuzzy.max_expansion = max_expansion;
-    options.fuzzy.flags = flags;
-    grn_obj_search(ctx, target, query, res, op, &options);
-  }
-
-exit :
-  return rc;
-}
-
 #define DEF_VAR(v,name_str) do {\
   (v).name = (name_str);\
   (v).name_size = GRN_STRLEN(name_str);\
@@ -7328,8 +6933,7 @@ grn_db_init_builtin_query(grn_ctx *ctx)
   grn_proc_create(ctx, "geo_distance3", -1, GRN_PROC_FUNCTION,
                   func_geo_distance3, NULL, NULL, 0, NULL);
 
-  grn_proc_create(ctx, "edit_distance", -1, GRN_PROC_FUNCTION,
-                  func_edit_distance, NULL, NULL, 0, NULL);
+  grn_proc_init_edit_distance(ctx);
 
   {
     grn_obj *selector_proc;
@@ -7446,12 +7050,5 @@ grn_db_init_builtin_query(grn_ctx *ctx)
 
   grn_proc_init_inspect(ctx);
 
-  {
-    grn_obj *selector_proc;
-
-    selector_proc = grn_proc_create(ctx, "fuzzy_search", -1,
-                                    GRN_PROC_FUNCTION,
-                                    NULL, NULL, NULL, 0, NULL);
-    grn_proc_set_selector(ctx, selector_proc, selector_fuzzy_search);
-  }
+  grn_proc_init_fuzzy_search(ctx);
 }

  Added: lib/proc/proc_fuzzy_search.c (+419 -0) 100644
===================================================================
--- /dev/null
+++ lib/proc/proc_fuzzy_search.c    2016-02-05 18:40:56 +0900 (be500fe)
@@ -0,0 +1,419 @@
+/* -*- c-basic-offset: 2 -*- */
+/*
+  Copyright(C) 2009-2016 Brazil
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License version 2.1 as published by the Free Software Foundation.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+#include "../grn_proc.h"
+#include "../grn_rset.h"
+
+#include <groonga/plugin.h>
+
+#include <string.h>
+
+#define DIST(ox,oy) (dists[((lx + 1) * (oy)) + (ox)])
+
+static uint32_t
+calc_edit_distance(grn_ctx *ctx, char *sx, char *ex, char *sy, char *ey, int flags)
+{
+  int d = 0;
+  uint32_t cx, lx, cy, ly, *dists;
+  char *px, *py;
+  for (px = sx, lx = 0; px < ex && (cx = grn_charlen(ctx, px, ex)); px += cx, lx++);
+  for (py = sy, ly = 0; py < ey && (cy = grn_charlen(ctx, py, ey)); py += cy, ly++);
+  if ((dists = GRN_PLUGIN_MALLOC(ctx, (lx + 1) * (ly + 1) * sizeof(uint32_t)))) {
+    uint32_t x, y;
+    for (x = 0; x <= lx; x++) { DIST(x, 0) = x; }
+    for (y = 0; y <= ly; y++) { DIST(0, y) = y; }
+    for (x = 1, px = sx; x <= lx; x++, px += cx) {
+      cx = grn_charlen(ctx, px, ex);
+      for (y = 1, py = sy; y <= ly; y++, py += cy) {
+        cy = grn_charlen(ctx, py, ey);
+        if (cx == cy && !memcmp(px, py, cx)) {
+          DIST(x, y) = DIST(x - 1, y - 1);
+        } else {
+          uint32_t a = DIST(x - 1, y) + 1;
+          uint32_t b = DIST(x, y - 1) + 1;
+          uint32_t c = DIST(x - 1, y - 1) + 1;
+          DIST(x, y) = ((a < b) ? ((a < c) ? a : c) : ((b < c) ? b : c));
+          if (flags & GRN_TABLE_FUZZY_SEARCH_WITH_TRANSPOSITION &&
+              x > 1 && y > 1 && cx == cy &&
+              memcmp(px, py - cy, cx) == 0 &&
+              memcmp(px - cx, py, cx) == 0) {
+            uint32_t t = DIST(x - 2, y - 2) + 1;
+            DIST(x, y) = ((DIST(x, y) < t) ? DIST(x, y) : t);
+          }
+        }
+      }
+    }
+    d = DIST(lx, ly);
+    GRN_PLUGIN_FREE(ctx, dists);
+  }
+  return d;
+}
+
+static grn_obj *
+func_edit_distance(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
+{
+#define N_REQUIRED_ARGS 2
+#define MAX_ARGS 3
+  int d = 0;
+  int flags = 0;
+  grn_obj *obj;
+  if (nargs >= N_REQUIRED_ARGS && nargs <= MAX_ARGS) {
+    if (nargs == MAX_ARGS && GRN_BOOL_VALUE(args[2])) {
+      flags |= GRN_TABLE_FUZZY_SEARCH_WITH_TRANSPOSITION;
+    }
+    d = calc_edit_distance(ctx, GRN_TEXT_VALUE(args[0]), GRN_BULK_CURR(args[0]),
+                           GRN_TEXT_VALUE(args[1]), GRN_BULK_CURR(args[1]), flags);
+  }
+  if ((obj = grn_plugin_proc_alloc(ctx, user_data, GRN_DB_UINT32, 0))) {
+    GRN_UINT32_SET(ctx, obj, d);
+  }
+  return obj;
+#undef N_REQUIRED_ARGS
+#undef MAX_ARGS
+}
+
+void
+grn_proc_init_edit_distance(grn_ctx *ctx)
+{
+  grn_proc_create(ctx, "edit_distance", -1, GRN_PROC_FUNCTION,
+                  func_edit_distance, NULL, NULL, 0, NULL);
+}
+
+#define SCORE_HEAP_SIZE 256
+
+typedef struct {
+  grn_id id;
+  uint32_t score;
+} score_heap_node;
+
+typedef struct {
+  int n_entries;
+  int limit;
+  score_heap_node *nodes;
+} score_heap;
+
+static inline score_heap *
+score_heap_open(grn_ctx *ctx, int max)
+{
+  score_heap *h = GRN_PLUGIN_MALLOC(ctx, sizeof(score_heap));
+  if (!h) { return NULL; }
+  h->nodes = GRN_PLUGIN_MALLOC(ctx, sizeof(score_heap_node) * max);
+  if (!h->nodes) {
+    GRN_PLUGIN_FREE(ctx, h);
+    return NULL;
+  }
+  h->n_entries = 0;
+  h->limit = max;
+  return h;
+}
+
+static inline grn_bool
+score_heap_push(grn_ctx *ctx, score_heap *h, grn_id id, uint32_t score)
+{
+  int n, n2;
+  score_heap_node node = {id, score};
+  score_heap_node node2;
+  if (h->n_entries >= h->limit) {
+    int max = h->limit * 2;
+    score_heap_node *nodes;
+    nodes = GRN_PLUGIN_REALLOC(ctx, h->nodes, sizeof(score_heap) * max);
+    if (!nodes) {
+      return GRN_FALSE;
+    }
+    h->limit = max;
+    h->nodes = nodes;
+  }
+  h->nodes[h->n_entries] = node;
+  n = h->n_entries++;
+  while (n) {
+    n2 = (n - 1) >> 1;
+    if (h->nodes[n2].score <= h->nodes[n].score) { break; }
+    node2 = h->nodes[n];
+    h->nodes[n] = h->nodes[n2];
+    h->nodes[n2] = node2;
+    n = n2;
+  }
+  return GRN_TRUE;
+}
+
+static inline void
+score_heap_close(grn_ctx *ctx, score_heap *h)
+{
+  GRN_PLUGIN_FREE(ctx, h->nodes);
+  GRN_PLUGIN_FREE(ctx, h);
+}
+
+static grn_rc
+sequential_fuzzy_search(grn_ctx *ctx, grn_obj *table, grn_obj *column, grn_obj *query,
+                        uint32_t max_distance, uint32_t prefix_match_size,
+                        uint32_t max_expansion, int flags, grn_obj *hash)
+{
+  grn_table_cursor *tc;
+  char *sx = GRN_TEXT_VALUE(query);
+  char *ex = GRN_BULK_CURR(query);
+
+  if ((tc = grn_table_cursor_open(ctx, table, NULL, 0, NULL, 0, 0, -1, GRN_CURSOR_BY_ID))) {
+    grn_id id;
+    grn_obj value;
+    score_heap *heap;
+    int i, n;
+    GRN_TEXT_INIT(&value, 0);
+
+    heap = score_heap_open(ctx, SCORE_HEAP_SIZE);
+    if (!heap) {
+      return GRN_NO_MEMORY_AVAILABLE;
+    }
+
+    while ((id = grn_table_cursor_next(ctx, tc))) {
+      unsigned int distance = 0;
+      grn_obj *domain;
+      GRN_BULK_REWIND(&value);
+      grn_obj_get_value(ctx, column, id, &value);
+      domain = grn_ctx_at(ctx, ((&value))->header.domain);
+      if ((&(value))->header.type == GRN_VECTOR) {
+        n = grn_vector_size(ctx, &value);
+        for (i = 0; i < n; i++) {
+          unsigned int length;
+          const char *vector_value = NULL;
+          length = grn_vector_get_element(ctx, &value, i, &vector_value, NULL, NULL);
+
+          if (!prefix_match_size ||
+              (prefix_match_size > 0 && length >= prefix_match_size &&
+               !memcmp(sx, vector_value, prefix_match_size))) {
+            distance = calc_edit_distance(ctx, sx, ex,
+                                          (char *)vector_value,
+                                          (char *)vector_value + length, flags);
+            if (distance <= max_distance) {
+              score_heap_push(ctx, heap, id, distance);
+              break;
+            }
+          }
+        }
+      } else if ((&(value))->header.type == GRN_UVECTOR &&
+                  grn_obj_is_table(ctx, domain)) {
+        n = grn_vector_size(ctx, &value);
+        for (i = 0; i < n; i++) {
+          grn_id rid;
+          char key_name[GRN_TABLE_MAX_KEY_SIZE];
+          int key_length;
+          rid = grn_uvector_get_element(ctx, &value, i, NULL);
+          key_length = grn_table_get_key(ctx, domain, rid, key_name, GRN_TABLE_MAX_KEY_SIZE);
+
+          if (!prefix_match_size ||
+              (prefix_match_size > 0 && key_length >= prefix_match_size &&
+               !memcmp(sx, key_name, prefix_match_size))) {
+            distance = calc_edit_distance(ctx, sx, ex,
+                                          key_name, key_name + key_length, flags);
+            if (distance <= max_distance) {
+              score_heap_push(ctx, heap, id, distance);
+              break;
+            }
+          }
+        }
+      } else {
+        if (grn_obj_is_reference_column(ctx, column)) {
+          grn_id rid;
+          char key_name[GRN_TABLE_MAX_KEY_SIZE];
+          int key_length;
+          rid = GRN_RECORD_VALUE(&value);
+          key_length = grn_table_get_key(ctx, domain, rid, key_name, GRN_TABLE_MAX_KEY_SIZE);
+          if (!prefix_match_size ||
+              (prefix_match_size > 0 && key_length >= prefix_match_size &&
+               !memcmp(sx, key_name, prefix_match_size))) {
+            distance = calc_edit_distance(ctx, sx, ex,
+                                          key_name, key_name + key_length, flags);
+            if (distance <= max_distance) {
+              score_heap_push(ctx, heap, id, distance);
+            }
+          }
+        } else {
+          if (!prefix_match_size ||
+              (prefix_match_size > 0 && GRN_TEXT_LEN(&value) >= prefix_match_size &&
+               !memcmp(sx, GRN_TEXT_VALUE(&value), prefix_match_size))) {
+            distance = calc_edit_distance(ctx, sx, ex,
+                                          GRN_TEXT_VALUE(&value),
+                                          GRN_BULK_CURR(&value), flags);
+            if (distance <= max_distance) {
+              score_heap_push(ctx, heap, id, distance);
+            }
+          }
+        }
+      }
+      grn_obj_unlink(ctx, domain);
+    }
+    grn_table_cursor_close(ctx, tc);
+    grn_obj_unlink(ctx, &value);
+
+    for (i = 0; i < heap->n_entries; i++) {
+      if (max_expansion > 0 && i >= max_expansion) {
+        break;
+      }
+      grn_rset_recinfo *ri;
+      if (grn_hash_add(ctx, (grn_hash *)hash, &(heap->nodes[i].id), sizeof(grn_id), (void **)&ri, NULL)) {
+        ri->score = heap->nodes[i].score;
+      }
+    }
+    score_heap_close(ctx, heap);
+  }
+
+  return GRN_SUCCESS;
+}
+
+static grn_rc
+selector_fuzzy_search(grn_ctx *ctx, grn_obj *table, grn_obj *index,
+                      int nargs, grn_obj **args,
+                      grn_obj *res, grn_operator op)
+{
+  grn_rc rc = GRN_SUCCESS;
+  grn_obj *target = NULL;
+  grn_obj *obj;
+  grn_obj *query;
+  uint32_t max_distance = 1;
+  uint32_t prefix_length = 0;
+  uint32_t prefix_match_size = 0;
+  uint32_t max_expansion = 0;
+  int flags = 0;
+  grn_bool use_sequential_search = GRN_FALSE;
+
+  if ((nargs - 1) < 2) {
+    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
+                     "fuzzy_serach(): wrong number of arguments (%d ...)",
+                     nargs - 1);
+    rc = ctx->rc;
+    goto exit;
+  }
+  obj = args[1];
+  query = args[2];
+
+  if (nargs >= 4) {
+    max_distance = GRN_UINT32_VALUE(args[3]);
+  }
+  if (nargs >= 5) {
+    prefix_length = GRN_UINT32_VALUE(args[4]);
+  }
+  if (nargs >= 6) {
+    max_expansion = GRN_UINT32_VALUE(args[5]);
+  }
+  if (nargs == 7) {
+    if (GRN_BOOL_VALUE(args[6])) {
+      flags |= GRN_TABLE_FUZZY_SEARCH_WITH_TRANSPOSITION;
+    }
+  }
+
+  if (index) {
+    target = index;
+  } else {
+    if (obj->header.type == GRN_COLUMN_INDEX) {
+      target = obj;
+    } else {
+      grn_column_index(ctx, obj, GRN_OP_FUZZY, &target, 1, NULL);
+    }
+  }
+
+  if (target) {
+    grn_obj *lexicon;
+    lexicon = grn_ctx_at(ctx, target->header.domain);
+    if (lexicon->header.type != GRN_TABLE_PAT_KEY) {
+      use_sequential_search = GRN_TRUE;
+    }
+  } else {
+    if (grn_obj_is_key_accessor(ctx, obj) &&
+        table->header.type == GRN_TABLE_PAT_KEY) {
+       target = table;
+    } else {
+      use_sequential_search = GRN_TRUE;
+    }
+  }
+
+  if (prefix_length) {
+    const char *s = GRN_TEXT_VALUE(query);
+    const char *e = GRN_BULK_CURR(query);
+    const char *p;
+    unsigned int cl = 0;
+    unsigned int length = 0;
+    for (p = s; p < e && (cl = grn_charlen(ctx, p, e)); p += cl) {
+      length++;
+      if (length > prefix_length) {
+        break;
+      }
+    }
+    prefix_match_size = p - s;
+  }
+
+  if (use_sequential_search) {
+    grn_obj *hash;
+    hash = grn_table_create(ctx, NULL, 0, NULL,
+                            GRN_OBJ_TABLE_HASH_KEY|GRN_OBJ_WITH_SUBREC,
+                            table, NULL);
+    if (!hash) {
+      return GRN_NO_MEMORY_AVAILABLE;
+    }
+
+    if (op == GRN_OP_AND) {
+      rc = sequential_fuzzy_search(ctx, res, obj, query,
+                                   max_distance, prefix_match_size,
+                                   max_expansion, flags, hash);
+    } else {
+      rc = sequential_fuzzy_search(ctx, table, obj, query,
+                                   max_distance, prefix_match_size,
+                                   max_expansion, flags, hash);
+    }
+
+    if (rc == GRN_SUCCESS) {
+      rc = grn_table_setoperation(ctx, res, hash, res, op);
+    }
+    grn_obj_unlink(ctx, hash);
+    goto exit;
+  }
+
+  if (!target) {
+    grn_obj inspected;
+    GRN_TEXT_INIT(&inspected, 0);
+    grn_inspect(ctx, &inspected, target);
+    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
+                     "fuzzy_search(): "
+                     "column mast be COLUMN_INDEX or TABLE_PAT_KEY: <%.*s>",
+                     (int)GRN_TEXT_LEN(&inspected),
+                     GRN_TEXT_VALUE(&inspected));
+    rc = ctx->rc;
+    GRN_OBJ_FIN(ctx, &inspected);
+    goto exit;
+  } else {
+    grn_search_optarg options = {0};
+    options.mode = GRN_OP_FUZZY;
+    options.fuzzy.prefix_match_size = prefix_match_size;
+    options.fuzzy.max_distance = max_distance;
+    options.fuzzy.max_expansion = max_expansion;
+    options.fuzzy.flags = flags;
+    grn_obj_search(ctx, target, query, res, op, &options);
+  }
+
+exit :
+  return rc;
+}
+
+void
+grn_proc_init_fuzzy_search(grn_ctx *ctx)
+{
+  grn_obj *selector_proc;
+
+  selector_proc = grn_proc_create(ctx, "fuzzy_search", -1,
+                                  GRN_PROC_FUNCTION,
+                                  NULL, NULL, NULL, 0, NULL);
+  grn_proc_set_selector(ctx, selector_proc, selector_fuzzy_search);
+}

  Modified: lib/proc/sources.am (+1 -0)
===================================================================
--- lib/proc/sources.am    2016-02-05 18:40:17 +0900 (2c564e9)
+++ lib/proc/sources.am    2016-02-05 18:40:56 +0900 (98b27a9)
@@ -1,5 +1,6 @@
 libgrnproc_la_SOURCES =				\
 	proc_config.c				\
+	proc_fuzzy_search.c			\
 	proc_inspect.c				\
 	proc_lock.c				\
 	proc_schema.c				\
-------------- next part --------------
HTML����������������������������...
Download 



More information about the Groonga-commit mailing list
Back to archive index