[Groonga-commit] groonga/grnxx [master] Implement defragmentation of grnxx::map::da::basic::Trie.

Back to archive index

susumu.yata null+****@clear*****
Thu Feb 14 15:34:45 JST 2013


susumu.yata	2013-02-14 15:34:45 +0900 (Thu, 14 Feb 2013)

  New Revision: a3bd274673871f771b7f08f7bb2a2dd0a931407e
  https://github.com/groonga/grnxx/commit/a3bd274673871f771b7f08f7bb2a2dd0a931407e

  Log:
    Implement defragmentation of grnxx::map::da::basic::Trie.

  Modified files:
    lib/map/da/basic_trie.cpp
    lib/map/da/basic_trie.hpp
    lib/map/da/trie.cpp
    lib/map/da/trie.hpp

  Modified: lib/map/da/basic_trie.cpp (+179 -30)
===================================================================
--- lib/map/da/basic_trie.cpp    2013-02-14 12:56:55 +0900 (f5a8522)
+++ lib/map/da/basic_trie.cpp    2013-02-14 15:34:45 +0900 (06ab83d)
@@ -55,14 +55,13 @@ Trie::~Trie() {
   }
 }
 
-// TODO: Currently, "options" is not used.
-Trie *Trie::create(const TrieOptions &, io::Pool pool) {
+Trie *Trie::create(const TrieOptions &options, io::Pool pool) {
   std::unique_ptr<Trie> trie(new (std::nothrow) Trie);
   if (!trie) {
     GRNXX_ERROR() << "new grnxx::map::Trie failed";
     GRNXX_THROW();
   }
-  trie->create_double_array(pool);
+  trie->create_trie(options, pool);
   return trie.release();
 }
 
@@ -72,13 +71,19 @@ Trie *Trie::open(io::Pool pool, uint32_t block_id) {
     GRNXX_ERROR() << "new grnxx::map::Trie failed";
     GRNXX_THROW();
   }
-  trie->open_double_array(pool, block_id);
+  trie->open_trie(pool, block_id);
   return trie.release();
 }
 
-// TODO
-//da::Trie *Trie::defrag() {
-//}
+Trie *Trie::defrag(const TrieOptions &options) {
+  std::unique_ptr<Trie> trie(new (std::nothrow) Trie);
+  if (!trie) {
+    GRNXX_ERROR() << "new grnxx::map::Trie failed";
+    GRNXX_THROW();
+  }
+  trie->defrag_trie(options, *this, pool_);
+  return trie.release();
+}
 
 uint32_t Trie::block_id() const {
   return block_info_->id();
@@ -253,7 +258,7 @@ Trie::Trie()
     keys_(nullptr),
     initialized_(false) {}
 
-void Trie::create_double_array(io::Pool pool) {
+void Trie::create_trie(const TrieOptions &options, io::Pool pool) {
   pool_ = pool;
 
   block_info_ = pool_.create_block(sizeof(Header));
@@ -262,27 +267,22 @@ void Trie::create_double_array(io::Pool pool) {
   header_ = static_cast<Header *>(block_address);
   *header_ = Header();
 
-  const io::BlockInfo *block_info;
-
-  block_info = pool_.create_block(sizeof(Node) * INITIAL_NODES_SIZE);
-  header_->nodes_block_id = block_info->id();
-  header_->nodes_size = INITIAL_NODES_SIZE;
-  nodes_ = static_cast<Node *>(pool_.get_block_address(*block_info));
-
-  block_info = pool_.create_block(sizeof(Chunk) * INITIAL_CHUNKS_SIZE);
-  header_->chunks_block_id = block_info->id();
-  header_->chunks_size = INITIAL_CHUNKS_SIZE;
-  chunks_ = static_cast<Chunk *>(pool_.get_block_address(*block_info));
-
-  block_info = pool_.create_block(sizeof(Entry) * INITIAL_ENTRIES_SIZE);
-  header_->entries_block_id = block_info->id();
-  header_->entries_size = INITIAL_ENTRIES_SIZE;
-  entries_ = static_cast<Entry *>(pool_.get_block_address(*block_info));
+  header_->nodes_size = static_cast<uint32_t>(options.nodes_size);
+  header_->nodes_size &= ~CHUNK_MASK;
+  if (header_->nodes_size == 0) {
+    header_->nodes_size = INITIAL_NODES_SIZE;
+  }
+  header_->chunks_size = header_->nodes_size / CHUNK_SIZE;
+  header_->entries_size = static_cast<uint32_t>(options.entries_size);
+  if (header_->entries_size == 0) {
+    header_->entries_size = INITIAL_ENTRIES_SIZE;
+  }
+  header_->keys_size = static_cast<uint32_t>(options.keys_size);
+  if (header_->keys_size == 0) {
+    header_->keys_size = INITIAL_KEYS_SIZE;
+  }
 
-  block_info = pool_.create_block(sizeof(uint32_t) * INITIAL_KEYS_SIZE);
-  header_->keys_block_id = block_info->id();
-  header_->keys_size = INITIAL_KEYS_SIZE;
-  keys_ = static_cast<uint32_t *>(pool_.get_block_address(*block_info));
+  create_arrays();
 
   reserve_node(ROOT_NODE_ID);
   nodes_[INVALID_OFFSET].set_is_origin(true);
@@ -290,7 +290,7 @@ void Trie::create_double_array(io::Pool pool) {
   initialized_ = true;
 }
 
-void Trie::open_double_array(io::Pool pool, uint32_t block_id) {
+void Trie::open_trie(io::Pool pool, uint32_t block_id) {
   pool_ = pool;
   initialized_ = true;
 
@@ -311,6 +311,155 @@ void Trie::open_double_array(io::Pool pool, uint32_t block_id) {
       pool_.get_block_address(header_->keys_block_id));
 }
 
+void Trie::defrag_trie(const TrieOptions &options, const Trie &trie,
+                       io::Pool pool) {
+  uint64_t nodes_size = options.nodes_size;
+  if (nodes_size == 0) {
+    nodes_size = trie.header_->num_chunks * CHUNK_SIZE;
+    nodes_size *= 2;
+  }
+  uint64_t entries_size = options.entries_size;
+  if (entries_size == 0) {
+    entries_size = trie.header_->max_key_id;
+    entries_size *= 2;
+  }
+  uint64_t keys_size = options.keys_size;
+  if (keys_size == 0) {
+    keys_size = trie.header_->next_key_pos;
+    keys_size *= 2;
+  }
+
+  if (nodes_size > MAX_NODES_SIZE) {
+    GRNXX_ERROR() << "too large request: nodes_size = " << nodes_size
+                  << ", MAX_NODES_SIZE = " << MAX_NODES_SIZE;
+    GRNXX_THROW();
+  }
+  if (entries_size > MAX_ENTRIES_SIZE) {
+    GRNXX_ERROR() << "too large request: entries_size = " << entries_size
+                  << ", MAX_ENTRIES_SIZE = " << MAX_ENTRIES_SIZE;
+    GRNXX_THROW();
+  }
+  if (keys_size > MAX_KEYS_SIZE) {
+    GRNXX_ERROR() << "too large request: keys_size = " << keys_size
+                  << ", MAX_KEYS_SIZE = " << MAX_KEYS_SIZE;
+    GRNXX_THROW();
+  }
+
+  pool_ = pool;
+
+  block_info_ = pool_.create_block(sizeof(Header));
+
+  void * const block_address = pool_.get_block_address(*block_info_);
+  header_ = static_cast<Header *>(block_address);
+  *header_ = Header();
+
+  header_->nodes_size = static_cast<uint32_t>(nodes_size);
+  header_->nodes_size &= ~CHUNK_MASK;
+  if (header_->nodes_size == 0) {
+    header_->nodes_size = INITIAL_NODES_SIZE;
+  }
+  header_->chunks_size = header_->nodes_size / CHUNK_SIZE;
+  header_->entries_size = static_cast<uint32_t>(entries_size);
+  if (header_->entries_size == 0) {
+    header_->entries_size = INITIAL_ENTRIES_SIZE;
+  }
+  header_->keys_size = static_cast<uint32_t>(keys_size);
+  if (header_->keys_size == 0) {
+    header_->keys_size = INITIAL_KEYS_SIZE;
+  }
+
+  create_arrays();
+
+  reserve_node(ROOT_NODE_ID);
+  nodes_[INVALID_OFFSET].set_is_origin(true);
+
+  header_->total_key_length = trie.header_->total_key_length;
+  header_->num_keys = trie.header_->num_keys;
+  header_->max_key_id = trie.header_->max_key_id;
+  header_->next_key_id = trie.header_->next_key_id;
+  for (int32_t key_id = MIN_KEY_ID; key_id <= header_->max_key_id; ++key_id) {
+    entries_[key_id] = trie.entries_[key_id];
+  }
+
+  defrag_trie(trie, ROOT_NODE_ID, ROOT_NODE_ID);
+
+  initialized_ = true;
+}
+
+void Trie::defrag_trie(const Trie &trie, uint32_t src, uint32_t dest) {
+  if (trie.nodes_[src].is_leaf()) {
+    const Key &key = trie.get_key(trie.nodes_[src].key_pos());
+    const uint32_t key_pos = header_->next_key_pos;
+    new (&keys_[key_pos]) Key(key.id(), key.slice());
+    nodes_[dest].set_key_pos(key_pos);
+    entries_[key.id()] = Entry::valid_entry(key_pos);
+    header_->next_key_pos += Key::estimate_size(key.size());
+    return;
+  }
+
+  const uint32_t src_offset = trie.nodes_[src].offset();
+  uint32_t dest_offset;
+  {
+    uint16_t labels[MAX_LABEL + 1];
+    uint16_t num_labels = 0;
+
+    uint16_t label = trie.nodes_[src].child();
+    while (label != INVALID_LABEL) {
+      GRNXX_DEBUG_THROW_IF(label > MAX_LABEL);
+      const uint32_t child = src_offset ^ label;
+      if (trie.nodes_[child].is_leaf() ||
+          (trie.nodes_[child].child() != INVALID_LABEL)) {
+        labels[num_labels++] = label;
+      }
+      label = trie.nodes_[child].sibling();
+    }
+    if (num_labels == 0) {
+      return;
+    }
+
+    dest_offset = find_offset(labels, num_labels);
+    for (uint16_t i = 0; i < num_labels; ++i) {
+      const uint32_t child = dest_offset ^ labels[i];
+      reserve_node(child);
+      nodes_[child].set_label(labels[i]);
+      if ((i + 1) < num_labels) {
+        nodes_[child].set_sibling(labels[i + 1]);
+      }
+    }
+
+    GRNXX_DEBUG_THROW_IF(nodes_[dest_offset].is_origin());
+    nodes_[dest_offset].set_is_origin(true);
+    nodes_[dest].set_offset(dest_offset);
+    nodes_[dest].set_child(labels[0]);
+  }
+
+  uint16_t label = nodes_[dest].child();
+  while (label != INVALID_LABEL) {
+    defrag_trie(trie, src_offset ^ label, dest_offset ^ label);
+    label = nodes_[dest_offset ^ label].sibling();
+  }
+}
+
+void Trie::create_arrays() {
+  const io::BlockInfo *block_info;
+
+  block_info = pool_.create_block(sizeof(Node) * header_->nodes_size);
+  header_->nodes_block_id = block_info->id();
+  nodes_ = static_cast<Node *>(pool_.get_block_address(*block_info));
+
+  block_info = pool_.create_block(sizeof(Chunk) * header_->chunks_size);
+  header_->chunks_block_id = block_info->id();
+  chunks_ = static_cast<Chunk *>(pool_.get_block_address(*block_info));
+
+  block_info = pool_.create_block(sizeof(Entry) * header_->entries_size);
+  header_->entries_block_id = block_info->id();
+  entries_ = static_cast<Entry *>(pool_.get_block_address(*block_info));
+
+  block_info = pool_.create_block(sizeof(uint32_t) * header_->keys_size);
+  header_->keys_block_id = block_info->id();
+  keys_ = static_cast<uint32_t *>(pool_.get_block_address(*block_info));
+}
+
 bool Trie::remove_key(const Slice &key) {
   uint32_t node_id = ROOT_NODE_ID;
   size_t query_pos = 0;
@@ -501,7 +650,7 @@ uint32_t Trie::append_key(const Slice &key, int32_t key_id) {
   }
 
   const uint32_t key_pos = header_->next_key_pos;
-  const uint32_t key_size = Key::estimate_size(key);
+  const uint32_t key_size = Key::estimate_size(key.size());
 
   if (key_size > (header_->keys_size - key_pos)) {
     GRNXX_ERROR() << "too many keys: key_size = " << key_size

  Modified: lib/map/da/basic_trie.hpp (+19 -10)
===================================================================
--- lib/map/da/basic_trie.hpp    2013-02-14 12:56:55 +0900 (3921ef2)
+++ lib/map/da/basic_trie.hpp    2013-02-14 15:34:45 +0900 (f22e2cb)
@@ -20,7 +20,7 @@
 
 #include "trie.hpp"
 
-#if 0
+#if 1
 # define GRNXX_DEBUG_THROW(msg)\
    ({ GRNXX_ERROR() << msg; GRNXX_THROW(); })
 # define GRNXX_DEBUG_THROW_IF(cond)\
@@ -36,7 +36,7 @@ namespace da {
 namespace basic {
 
 constexpr  int32_t MIN_KEY_ID     = 0;
-constexpr  int32_t MAX_KEY_ID     = 0x7FFFFFFF;
+constexpr  int32_t MAX_KEY_ID     = 0x7FFFFFFE;
 
 constexpr   size_t MIN_KEY_SIZE   = 1;
 constexpr   size_t MAX_KEY_SIZE   = 4095;
@@ -55,10 +55,14 @@ constexpr uint32_t CHUNK_MASK     = 0x1FF;
 // Assume that #nodes per key is 4 and #uint32_ts per key is 8.
 // Note that an entries is associated with a key.
 constexpr uint32_t INITIAL_NODES_SIZE   = 1 << 16;
-constexpr uint32_t INITIAL_CHUNKS_SIZE  = INITIAL_NODES_SIZE / CHUNK_SIZE;
 constexpr uint32_t INITIAL_ENTRIES_SIZE = 1 << 14;
 constexpr uint32_t INITIAL_KEYS_SIZE    = 1 << 17;
 
+constexpr uint32_t MAX_NODES_SIZE    =
+    std::numeric_limits<uint32_t>::max() & ~CHUNK_MASK;
+constexpr uint32_t MAX_ENTRIES_SIZE  = uint32_t(MAX_KEY_ID) + 1;
+constexpr uint32_t MAX_KEYS_SIZE     = uint32_t(1) << 31;
+
 // Chunks are grouped by the level which indicates how easily update operations
 // can find a good offset in that chunk. The chunk level rises when
 // find_offset() fails in that chunk many times. MAX_FAILURE_COUNT
@@ -395,8 +399,8 @@ class Key {
     return true;
   }
 
-  static uint32_t estimate_size(const Slice &key) {
-    return (9 + key.size()) / sizeof(uint32_t);
+  static uint32_t estimate_size(const size_t key_size) {
+    return (9 + key_size) / sizeof(uint32_t);
   }
 
  private:
@@ -412,8 +416,7 @@ class Trie : public da::Trie {
   static Trie *create(const TrieOptions &options, io::Pool pool);
   static Trie *open(io::Pool pool, uint32_t block_id);
 
-  // TODO
-//  da::Trie *defrag();
+  Trie *defrag(const TrieOptions &options);
 
   uint32_t block_id() const;
 
@@ -441,10 +444,16 @@ class Trie : public da::Trie {
 
   Trie();
 
-  void create_double_array(io::Pool pool);
-  void open_double_array(io::Pool pool, uint32_t block_id);
+  void create_trie(const TrieOptions &options, io::Pool pool);
+  void open_trie(io::Pool pool, uint32_t block_id);
+
+  void defrag_trie(const TrieOptions &options, const Trie &trie,
+                   io::Pool pool);
+  void defrag_trie(const Trie &trie, uint32_t src, uint32_t dest);
+
+  void create_arrays();
 
-  const Key &get_key(uint32_t key_pos) {
+  const Key &get_key(uint32_t key_pos) const {
     return *reinterpret_cast<const Key *>(&keys_[key_pos]);
   }
 

  Modified: lib/map/da/trie.cpp (+4 -1)
===================================================================
--- lib/map/da/trie.cpp    2013-02-14 12:56:55 +0900 (a7cd495)
+++ lib/map/da/trie.cpp    2013-02-14 15:34:45 +0900 (99da9cc)
@@ -5,7 +5,10 @@ namespace grnxx {
 namespace map {
 namespace da {
 
-TrieOptions::TrieOptions() {}
+TrieOptions::TrieOptions()
+  : nodes_size(0),
+    entries_size(0),
+    keys_size(0) {}
 
 Trie::Trie() {}
 Trie::~Trie() {}

  Modified: lib/map/da/trie.hpp (+5 -2)
===================================================================
--- lib/map/da/trie.hpp    2013-02-14 12:56:55 +0900 (6376a79)
+++ lib/map/da/trie.hpp    2013-02-14 15:34:45 +0900 (993a862)
@@ -25,6 +25,10 @@ namespace map {
 namespace da {
 
 struct TrieOptions {
+  uint64_t nodes_size;
+  uint64_t entries_size;
+  uint64_t keys_size;
+
   TrieOptions();
 };
 
@@ -36,8 +40,7 @@ class Trie {
   static Trie *create(const TrieOptions &options, io::Pool pool);
   static Trie *open(io::Pool pool, uint32_t block_id);
 
-  // TODO
-//  virtual Trie *defrag() = 0;
+  virtual Trie *defrag(const TrieOptions &options) = 0;
 
   virtual uint32_t block_id() const = 0;
 
-------------- next part --------------
HTML����������������������������...
Download 



More information about the Groonga-commit mailing list
Back to archive index