naoa
null+****@clear*****
Mon Feb 1 01:03:12 JST 2016
naoa 2016-02-01 01:03:12 +0900 (Mon, 01 Feb 2016) New Revision: 3511a23248290fb96eb312cfafed068b217ef1e5 https://github.com/groonga/groonga/commit/3511a23248290fb96eb312cfafed068b217ef1e5 Merged 2d929d3: Merge pull request #460 from naoa/pat-fussy-search Message: pat: Add grn_pat_fussy_search() Modified files: lib/grn_pat.h lib/pat.c test/unit/core/test-patricia-trie-search.c Modified: lib/grn_pat.h (+6 -0) =================================================================== --- lib/grn_pat.h 2016-01-31 16:36:16 +0900 (6a2a433) +++ lib/grn_pat.h 2016-02-01 01:03:12 +0900 (db642be) @@ -107,6 +107,12 @@ void grn_pat_cursor_inspect(grn_ctx *ctx, grn_pat_cursor *c, grn_obj *buf); grn_rc grn_pat_cache_enable(grn_ctx *ctx, grn_pat *pat, uint32_t cache_size); void grn_pat_cache_disable(grn_ctx *ctx, grn_pat *pat); +#define WITH_TRANSPOSITION (0x01) + +GRN_API grn_rc grn_pat_fussy_search(grn_ctx *ctx, grn_pat *pat, + const void *key, unsigned int key_size, unsigned int max_size, + unsigned int max_distance, int flags, grn_hash *h); + #ifdef __cplusplus } #endif Modified: lib/pat.c (+216 -0) =================================================================== --- lib/pat.c 2016-01-31 16:36:16 +0900 (0d94a71) +++ lib/pat.c 2016-02-01 01:03:12 +0900 (f185f51) @@ -1224,6 +1224,222 @@ grn_pat_lcp_search(grn_ctx *ctx, grn_pat *pat, const void *key, uint32_t key_siz return r2; } +#define DIST(ox,oy) (dists[((lx + 1) * (oy)) + (ox)]) + +inline static uint16_t +calc_edit_distance_by_offset(grn_ctx *ctx, + const char *sx, const char *ex, + const char *sy, const char *ey, + uint16_t *dists, uint32_t lx, + uint32_t offset, uint32_t max_distance, + grn_bool *can_transition, int flags) +{ + uint32_t cx, cy, x = 1, y = 1; + const char *px, *py; + + /* Skip already calculated rows */ + for (py = sy, y = 1; py < ey && (cy = grn_charlen(ctx, py, ey)); py += cy, y++) { + if (py - sy >= offset) { + break; + } + } + for (; py < ey && (cy = grn_charlen(ctx, py, ey)); py += cy, y++) { + /* children nodes will be no longer smaller than max distance + * with only insertion costs. + * This is end of row on allocated memory. */ + if (y > lx + max_distance) { + *can_transition = GRN_FALSE; + return max_distance + 1; + } + + for (px = sx, x = 1; px < ex && (cx = grn_charlen(ctx, px, ex)); px += cx, x++) { + if (cx == cy && !memcmp(px, py, cx)) { + DIST(x, y) = DIST(x - 1, y - 1); + } else { + uint32_t a, b, c; + a = DIST(x - 1, y) + 1; + b = DIST(x, y - 1) + 1; + c = DIST(x - 1, y - 1) + 1; + DIST(x, y) = ((a < b) ? ((a < c) ? a : c) : ((b < c) ? b : c)); + if (flags == WITH_TRANSPOSITION + && x > 1 && y > 1 + && cx == cy + && memcmp(px, py - cy, cx) == 0 + && memcmp(px - cx, py, cx) == 0) { + uint32_t t = DIST(x - 2, y - 2) + 1; + DIST(x, y) = ((DIST(x, y) < t) ? DIST(x, y) : t); + } + } + } + } + if (lx) { + /* If there is no cell which is smaller than equal to max distance on end of row, + * children nodes will be no longer smaller than max distance */ + *can_transition = GRN_FALSE; + for (x = 1; x <= lx; x++) { + if (DIST(x, y - 1) <= max_distance) { + *can_transition = GRN_TRUE; + break; + } + } + } + return DIST(lx, y - 1); +} + +typedef struct { + const char *key; + int key_length; + grn_bool can_transition; +} fussy_node; + +inline static void +_grn_pat_fussy_search(grn_ctx *ctx, grn_pat *pat, grn_id id, + const char *key, uint32_t key_size, + uint16_t *dists, uint32_t lx, + int last_check, fussy_node *last_node, + uint32_t max_distance, int flags, grn_hash *h) +{ + pat_node *node = NULL; + int check, len; + const char *k; + uint32_t offset = 0; + + PAT_AT(pat, id, node); + if (!node) { + return; + } + check = PAT_CHK(node); + len = PAT_LEN(node); + k = pat_node_get_key(ctx, pat, node); + + if (check > last_check) { + if (len >= last_node->key_length && + !memcmp(k, last_node->key, last_node->key_length)) { + if (last_node->can_transition == GRN_FALSE) { + return; + } + } + _grn_pat_fussy_search(ctx, pat, node->lr[0], + key, key_size, dists, lx, + check, last_node, + max_distance, flags, h); + + _grn_pat_fussy_search(ctx, pat, node->lr[1], + key, key_size, dists, lx, + check, last_node, + max_distance, flags, h); + } else { + if (id) { + /* Set already calculated common prefix length */ + if (len >= last_node->key_length && + !memcmp(k, last_node->key, last_node->key_length)) { + if (last_node->can_transition == GRN_FALSE) { + return; + } + offset = last_node->key_length; + } else { + if (last_node->can_transition == GRN_FALSE) { + last_node->can_transition = GRN_TRUE; + } + if (last_node->key_length) { + const char *kp = k; + const char *ke = k + len; + const char *p = last_node->key; + const char *e = last_node->key + last_node->key_length; + int lp; + for (;p < e && kp < ke && (lp = grn_charlen(ctx, p, e)); + p += lp, kp += lp) { + if (p + lp <= e && kp + lp <= ke && memcmp(p, kp, lp)) { + break; + } + } + offset = kp - k; + } + } + if (len - offset) { + uint16_t distance = 0; + distance = + calc_edit_distance_by_offset(ctx, + key, key + key_size, + k, k + len, + dists, lx, + offset, max_distance, + &(last_node->can_transition), flags); + if (distance <= max_distance) { + if (DB_OBJ(h)->header.flags & GRN_OBJ_WITH_SUBREC) { + grn_rset_recinfo *ri; + if (grn_hash_add(ctx, h, &id, sizeof(grn_id), (void **)&ri, NULL)) { + ri->score = distance; + } + } else { + grn_hash_add(ctx, h, &id, sizeof(grn_id), NULL, NULL); + } + } + } + last_node->key = k; + last_node->key_length = len; + } + } + return; +} + +grn_rc +grn_pat_fussy_search(grn_ctx *ctx, grn_pat *pat, + const void *key, uint32_t key_size, uint32_t max_size, + uint32_t max_distance, int flags, grn_hash *h) +{ + pat_node *node; + grn_id id; + uint16_t *dists; + uint32_t lx, len, x, y; + const char *s = key; + const char *e = (const char *)key + key_size; + fussy_node last_node; + PAT_AT(pat, GRN_ID_NIL, node); + id = node->lr[1]; + + if (key_size > GRN_TABLE_MAX_KEY_SIZE || + max_distance > GRN_TABLE_MAX_KEY_SIZE || + max_size > key_size) { + return GRN_INVALID_ARGUMENT; + } + if (grn_pat_error_if_truncated(ctx, pat) != GRN_SUCCESS) { + return GRN_ID_NIL; + } + if (max_size) { + grn_pat_cursor *cur; + if ((cur = grn_pat_cursor_open(ctx, pat, key, max_size, + NULL, 0, 0, -1, GRN_CURSOR_PREFIX))) { + grn_id tid; + tid = grn_pat_cursor_next(ctx, cur); + grn_pat_cursor_close(ctx, cur); + if (tid) { + id = tid; + } else { + return GRN_END_OF_DATA; + } + } + } + for (lx = 0; s < e && (len = grn_charlen(ctx, s, e)); s += len, lx++); + dists = GRN_MALLOC((lx + 1) * (lx + max_distance + 1) * sizeof(uint16_t)); + + for (x = 0; x <= lx; x++) { DIST(x, 0) = x; } + for (y = 0; y <= lx + max_distance ; y++) { DIST(0, y) = y; } + + last_node.key = NULL; + last_node.key_length = 0; + last_node.can_transition = GRN_TRUE; + _grn_pat_fussy_search(ctx, pat, id, + key, key_size, dists, lx, + -1, &last_node, max_distance, flags, h); + GRN_FREE(dists); + if (grn_hash_size(ctx, h)) { + return GRN_SUCCESS; + } else { + return GRN_END_OF_DATA; + } +} + inline static grn_rc _grn_pat_del(grn_ctx *ctx, grn_pat *pat, const char *key, uint32_t key_size, int shared, grn_table_delete_optarg *optarg) Modified: test/unit/core/test-patricia-trie-search.c (+63 -0) =================================================================== --- test/unit/core/test-patricia-trie-search.c 2016-01-31 16:36:16 +0900 (2f948d7) +++ test/unit/core/test-patricia-trie-search.c 2016-02-01 01:03:12 +0900 (894f31e) @@ -24,6 +24,8 @@ void data_prefix_search(void); void test_prefix_search(gconstpointer data); void data_suffix_search(void); void test_suffix_search(gconstpointer data); +void data_fussy_search(void); +void test_fussy_search(gconstpointer data); static GList *keys; @@ -385,3 +387,64 @@ test_suffix_search(gconstpointer data) gcut_assert_equal_list_string(test_data->expected_strings, retrieve_all_keys()); } + +void +data_fussy_search(void) +{ + cut_add_data("nonexistence", + xfix_test_data_new(GRN_END_OF_DATA, NULL, "cccc", NULL, NULL), + xfix_test_data_free, + "insertion", + xfix_test_data_new(GRN_SUCCESS, + gcut_list_string_new("ああ", "あ", NULL), + "あ", NULL, NULL), + xfix_test_data_free, + "deletion", + xfix_test_data_new(GRN_SUCCESS, + gcut_list_string_new("bbbbb", NULL), + "bbbbbb", NULL, NULL), + xfix_test_data_free, + "substitution", + xfix_test_data_new(GRN_SUCCESS, + gcut_list_string_new("cdefg", NULL), + "cdefh", NULL, NULL), + xfix_test_data_free, + "transposition", + xfix_test_data_new(GRN_SUCCESS, + gcut_list_string_new("cdefg", NULL), + "cdegf", NULL, NULL), + xfix_test_data_free); +} + +void +test_fussy_search(gconstpointer data) +{ + const grn_trie_test_data *test_data = data; + const gchar key1[] = "あ"; + const gchar key2[] = "ああ"; + const gchar key3[] = "あああ"; + const gchar key4[] = "bbbb"; + const gchar key5[] = "bbbbb"; + const gchar key6[] = "cdefg"; + + trie_test_data_set_parameters(test_data); + + cut_assert_create_trie(); + + cut_assert_lookup_add(key1); + cut_assert_lookup_add(key2); + cut_assert_lookup_add(key3); + cut_assert_lookup_add(key4); + cut_assert_lookup_add(key5); + cut_assert_lookup_add(key6); + + cut_assert_create_hash(); + grn_test_assert_equal_rc(test_data->expected_rc, + grn_pat_fussy_search(context, trie, + test_data->search_key, + strlen(test_data->search_key), + 0, 1, 1, + hash)); + gcut_assert_equal_list_string(test_data->expected_strings, + retrieve_all_keys()); +} -------------- next part -------------- HTML����������������������������...Download