null+****@clear*****
null+****@clear*****
2012年 1月 19日 (木) 12:20:08 JST
Susumu Yata 2012-01-19 12:20:08 +0900 (Thu, 19 Jan 2012)
New Revision: 76543ff617ce97ebe0fb1c7280ac422c635fe49f
Log:
implemented grn_dat_repair().
Modified files:
lib/dat.cpp
lib/dat.h
lib/dat/dat.hpp
lib/dat/entry.hpp
lib/dat/trie.cpp
lib/dat/trie.hpp
Modified: lib/dat.cpp (+27 -0)
===================================================================
--- lib/dat.cpp 2012-01-17 10:49:32 +0900 (0f99f68)
+++ lib/dat.cpp 2012-01-19 12:20:08 +0900 (7ffc0a9)
@@ -1015,4 +1015,31 @@ grn_dat_clear_status_flags(grn_ctx *ctx, grn_dat *dat)
return GRN_SUCCESS;
}
+grn_rc
+grn_dat_repair(grn_ctx *ctx, grn_dat *dat)
+{
+ if (!grn_dat_open_trie_if_needed(ctx, dat)) {
+ return ctx->rc;
+ }
+ const grn::dat::Trie * const trie = static_cast<grn::dat::Trie *>(dat->trie);
+ if (!trie) {
+ return GRN_INVALID_ARGUMENT;
+ }
+
+ char trie_path[PATH_MAX];
+ grn_dat_generate_trie_path(grn_io_path(dat->io), trie_path, dat->header->file_id + 1);
+ try {
+ grn::dat::Trie().repair(*trie, trie_path);
+ } catch (const grn::dat::Exception &ex) {
+ const grn_rc error_code = grn_dat_translate_error_code(ex.code());
+ ERR(error_code, "grn::dat::Trie::create failed");
+ return error_code;
+ }
+ ++dat->header->file_id;
+ if (!grn_dat_open_trie_if_needed(ctx, dat)) {
+ return ctx->rc;
+ }
+ return GRN_SUCCESS;
+}
+
} // extern "C"
Modified: lib/dat.h (+10 -0)
===================================================================
--- lib/dat.h 2012-01-17 10:49:32 +0900 (df9477d)
+++ lib/dat.h 2012-01-19 12:20:08 +0900 (b747b76)
@@ -70,6 +70,10 @@ grn_id grn_dat_lcp_search(grn_ctx *ctx, grn_dat *dat,
grn_id grn_dat_curr_id(grn_ctx *ctx, grn_dat *dat);
+/*
+ Currently, grn_dat_truncate() is available if the grn_dat object is
+ associated with a file.
+ */
grn_rc grn_dat_truncate(grn_ctx *ctx, grn_dat *dat);
const char *_grn_dat_key(grn_ctx *ctx, grn_dat *dat, grn_id id, uint32_t *key_size);
@@ -78,6 +82,12 @@ grn_id grn_dat_at(grn_ctx *ctx, grn_dat *dat, grn_id id);
grn_rc grn_dat_clear_status_flags(grn_ctx *ctx, grn_dat *dat);
+/*
+ Currently, grn_dat_repair() is available if the grn_dat object is associated
+ with a file.
+ */
+grn_rc grn_dat_repair(grn_ctx *ctx, grn_dat *dat);
+
#ifdef __cplusplus
}
#endif
Modified: lib/dat/dat.hpp (+2 -0)
===================================================================
--- lib/dat/dat.hpp 2012-01-17 10:49:32 +0900 (1dace8d)
+++ lib/dat/dat.hpp 2012-01-19 12:20:08 +0900 (29fcd0e)
@@ -142,6 +142,8 @@ const UInt32 INSERTING_FLAG = 1U << 1;
const UInt32 UPDATING_FLAG = 1U << 2;
const UInt32 CHANGING_MASK = REMOVING_FLAG | INSERTING_FLAG | UPDATING_FLAG;
+const UInt32 MKQ_SORT_THRESHOLD = 10;
+
enum ErrorCode {
PARAM_ERROR = -1,
IO_ERROR = -2,
Modified: lib/dat/entry.hpp (+0 -2)
===================================================================
--- lib/dat/entry.hpp 2012-01-17 10:49:32 +0900 (60be482)
+++ lib/dat/entry.hpp 2012-01-19 12:20:08 +0900 (80b4155)
@@ -43,11 +43,9 @@ class Entry {
}
void set_key_pos(UInt32 x) {
- GRN_DAT_DEBUG_THROW_IF((x & IS_VALID_FLAG) != 0);
value_ = IS_VALID_FLAG | x;
}
void set_next(UInt32 x) {
- GRN_DAT_DEBUG_THROW_IF((x & IS_VALID_FLAG) != 0);
value_ = x;
}
Modified: lib/dat/trie.cpp (+239 -0)
===================================================================
--- lib/dat/trie.cpp 2012-01-17 10:49:32 +0900 (ebb6b00)
+++ lib/dat/trie.cpp 2012-01-19 12:20:08 +0900 (3be2845)
@@ -20,6 +20,8 @@
#include <algorithm>
#include <cstring>
+#include "vector.hpp"
+
namespace grn {
namespace dat {
namespace {
@@ -137,6 +139,14 @@ void Trie::create(const Trie &trie,
new_trie.swap(this);
}
+void Trie::repair(const Trie &trie, const char *file_name) {
+ Trie new_trie;
+ new_trie.create_file(file_name, trie.file_size(), trie.max_num_keys(),
+ trie.max_num_blocks(), trie.key_buf_size());
+ new_trie.repair_trie(trie);
+ new_trie.swap(this);
+}
+
void Trie::open(const char *file_name) {
GRN_DAT_THROW_IF(PARAM_ERROR, file_name == NULL);
@@ -339,6 +349,235 @@ void Trie::build_from_trie(const Trie &trie, UInt32 src, UInt32 dest) {
}
}
+void Trie::repair_trie(const Trie &trie) {
+ Vector<UInt32> valid_ids;
+ header_->set_max_key_id(trie.max_key_id());
+ header_->set_next_key_id(trie.max_key_id());
+ UInt32 prev_invalid_key_id = INVALID_KEY_ID;
+ for (UInt32 i = min_key_id(); i <= max_key_id(); ++i) {
+ const Entry &entry = trie.ith_entry(i);
+ if (entry.is_valid()) {
+ valid_ids.push_back(i);
+ ith_entry(i) = entry;
+ const Key &key = trie.get_key(entry.key_pos());
+ Key::create(key_buf_.ptr() + next_key_pos(),
+ key.id(), key.ptr(), key.length());
+ ith_entry(i).set_key_pos(next_key_pos());
+ header_->set_next_key_pos(
+ next_key_pos() + Key::estimate_size(key.length()));
+ header_->set_total_key_length(
+ total_key_length() + key.length());
+ header_->set_num_keys(num_keys() + 1);
+ } else {
+ if (prev_invalid_key_id == INVALID_KEY_ID) {
+ header_->set_next_key_id(i);
+ } else {
+ ith_entry(prev_invalid_key_id).set_next(i);
+ }
+ prev_invalid_key_id = i;
+ }
+ }
+ if (prev_invalid_key_id != INVALID_KEY_ID) {
+ ith_entry(prev_invalid_key_id).set_next(max_key_id() + 1);
+ }
+ mkq_sort(valid_ids.begin(), valid_ids.end(), 0);
+ build_from_keys(valid_ids.begin(), valid_ids.end(), 0, ROOT_NODE_ID);
+}
+
+void Trie::build_from_keys(const UInt32 *begin, const UInt32 *end,
+ UInt32 depth, UInt32 node_id) {
+ if ((end - begin) == 1) {
+ ith_node(node_id).set_key_pos(ith_entry(*begin).key_pos());
+ return;
+ }
+
+ UInt32 offset;
+ {
+ UInt16 labels[MAX_LABEL + 1];
+ UInt32 num_labels = 0;
+
+ const UInt32 *it = begin;
+ if (ith_key(*it).length() == depth) {
+ labels[num_labels++] = TERMINAL_LABEL;
+ ++it;
+ }
+
+ labels[num_labels++] = (UInt8)ith_key(*it)[depth];
+ for (++it; it < end; ++it) {
+ const Key &key = ith_key(*it);
+ if ((UInt8)key[depth] != labels[num_labels - 1]) {
+ labels[num_labels++] = (UInt8)key[depth];
+ }
+ }
+
+ offset = find_offset(labels, num_labels);
+ for (UInt32 i = 0; i < num_labels; ++i) {
+ const UInt32 next = offset ^ labels[i];
+ reserve_node(next);
+ ith_node(next).set_label(labels[i]);
+ }
+
+ if (offset >= num_nodes()) {
+ reserve_block(num_blocks());
+ }
+ ith_node(offset).set_is_offset(true);
+ ith_node(node_id).set_offset(offset);
+ }
+
+ if (ith_key(*begin).length() == depth) {
+ build_from_keys(begin, begin + 1, depth + 1, offset ^ TERMINAL_LABEL);
+ ++begin;
+ }
+
+ UInt16 label = ith_key(*begin)[depth];
+ for (const UInt32 *it = begin + 1; it < end; ++it) {
+ const Key &key = ith_key(*it);
+ if ((UInt8)key[depth] != label) {
+ build_from_keys(begin, it, depth + 1, offset ^ label);
+ label = (UInt8)key[depth];
+ begin = it;
+ }
+ }
+ build_from_keys(begin, end, depth + 1, offset ^ label);
+}
+
+void Trie::mkq_sort(UInt32 *l, UInt32 *r, UInt32 depth) {
+ while ((r - l) >= MKQ_SORT_THRESHOLD) {
+ UInt32 *pl = l;
+ UInt32 *pr = r;
+ UInt32 *pivot_l = l;
+ UInt32 *pivot_r = r;
+
+ const int pivot = get_median(*l, *(l + (r - l) / 2), *(r - 1), depth);
+ for ( ; ; ) {
+ while (pl < pr) {
+ const int label = get_label(*pl, depth);
+ if (label > pivot) {
+ break;
+ } else if (label == pivot) {
+ swap_ids(pl, pivot_l);
+ ++pivot_l;
+ }
+ ++pl;
+ }
+ while (pl < pr) {
+ const int label = get_label(*--pr, depth);
+ if (label < pivot) {
+ break;
+ } else if (label == pivot) {
+ swap_ids(pr, --pivot_r);
+ }
+ }
+ if (pl >= pr) {
+ break;
+ }
+ swap_ids(pl, pr);
+ ++pl;
+ }
+ while (pivot_l > l) {
+ swap_ids(--pivot_l, --pl);
+ }
+ while (pivot_r < r) {
+ swap_ids(pivot_r, pr);
+ ++pivot_r;
+ ++pr;
+ }
+
+ if (((pl - l) > (pr - pl)) || ((r - pr) > (pr - pl))) {
+ if ((pr - pl) > 1) {
+ mkq_sort(pl, pr, depth + 1);
+ }
+
+ if ((pl - l) < (r - pr)) {
+ if ((pl - l) > 1) {
+ mkq_sort(l, pl, depth);
+ }
+ l = pr;
+ } else {
+ if ((r - pr) > 1) {
+ mkq_sort(pr, r, depth);
+ }
+ r = pl;
+ }
+ } else {
+ if ((pl - l) > 1) {
+ mkq_sort(l, pl, depth);
+ }
+
+ if ((r - pr) > 1) {
+ mkq_sort(pr, r, depth);
+ }
+
+ l = pl, r = pr;
+ if ((pr - pl) > 1) {
+ ++depth;
+ }
+ }
+ }
+
+ if ((r - l) > 1) {
+ insertion_sort(l, r, depth);
+ }
+}
+
+void Trie::insertion_sort(UInt32 *l, UInt32 *r, UInt32 depth) {
+ for (UInt32 *i = l + 1; i < r; ++i) {
+ for (UInt32 *j = i; j > l; --j) {
+ if (less_than(*(j - 1), *j, depth)) {
+ break;
+ }
+ swap_ids(j - 1, j);
+ }
+ }
+}
+
+int Trie::get_median(UInt32 a, UInt32 b, UInt32 c, UInt32 depth) const {
+ const int x = get_label(a, depth);
+ const int y = get_label(b, depth);
+ const int z = get_label(c, depth);
+ if (x < y) {
+ if (y < z) {
+ return y;
+ } else if (x < z) {
+ return z;
+ }
+ return x;
+ } else if (x < z) {
+ return x;
+ } else if (y < z) {
+ return z;
+ }
+ return y;
+}
+
+int Trie::get_label(UInt32 key_id, UInt32 depth) const {
+ const Key &key = ith_key(key_id);
+ if (depth == key.length()) {
+ return -1;
+ } else {
+ return (UInt8)key[depth];
+ }
+}
+
+bool Trie::less_than(UInt32 lhs, UInt32 rhs, UInt32 depth) const {
+ const Key &lhs_key = ith_key(lhs);
+ const Key &rhs_key = ith_key(rhs);
+ const UInt32 length = (lhs_key.length() < rhs_key.length()) ?
+ lhs_key.length() : rhs_key.length();
+ for (UInt32 i = depth; i < length; ++i) {
+ if (lhs_key[i] != rhs_key[i]) {
+ return (UInt8)lhs_key[i] < (UInt8)rhs_key[i];
+ }
+ }
+ return lhs_key.length() < rhs_key.length();
+}
+
+void Trie::swap_ids(UInt32 *lhs, UInt32 *rhs) {
+ UInt32 temp = *lhs;
+ *lhs = *rhs;
+ *rhs = temp;
+}
+
bool Trie::search_key(const UInt8 *ptr, UInt32 length, UInt32 *key_pos) const {
GRN_DAT_DEBUG_THROW_IF((ptr == NULL) && (length != 0));
Modified: lib/dat/trie.hpp (+15 -1)
===================================================================
--- lib/dat/trie.hpp 2012-01-17 10:49:32 +0900 (199dac3)
+++ lib/dat/trie.hpp 2012-01-19 12:20:08 +0900 (048846f)
@@ -40,13 +40,15 @@ class Trie {
double num_nodes_per_key = 0.0,
double average_key_length = 0.0);
- void create(const Trie &src_trie,
+ void create(const Trie &trie,
const char *file_name = NULL,
UInt64 file_size = 0,
UInt32 max_num_keys = 0,
double num_nodes_per_key = 0.0,
double average_key_length = 0.0);
+ void repair(const Trie &trie, const char *file_name = NULL);
+
void open(const char *file_name);
void close();
@@ -213,6 +215,18 @@ class Trie {
void build_from_trie(const Trie &trie);
void build_from_trie(const Trie &trie, UInt32 src, UInt32 dest);
+ void repair_trie(const Trie &trie);
+ void build_from_keys(const UInt32 *begin, const UInt32 *end,
+ UInt32 depth, UInt32 node_id);
+
+ void mkq_sort(UInt32 *l, UInt32 *r, UInt32 depth);
+ void insertion_sort(UInt32 *l, UInt32 *r, UInt32 depth);
+
+ inline int get_label(UInt32 key_id, UInt32 depth) const;
+ inline int get_median(UInt32 a, UInt32 b, UInt32 c, UInt32 depth) const;
+ inline bool less_than(UInt32 lhs, UInt32 rhs, UInt32 depth) const;
+ inline static void swap_ids(UInt32 *lhs, UInt32 *rhs);
+
bool search_key(const UInt8 *ptr, UInt32 length, UInt32 *key_pos) const;
bool search_linker(const UInt8 *ptr, UInt32 length,
UInt32 &node_id, UInt32 &query_pos) const;