// Copyright (c) 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef TOOLS_GN_HASH_TABLE_BASE_H_
#define TOOLS_GN_HASH_TABLE_BASE_H_

#include "base/compiler_specific.h"

#include <stdlib.h>
#include <string.h>

#include <new>
#include <type_traits>
#include <utility>

// IMPORTANT DISCLAIMER:
//
// THIS IS *NOT* A GENERAL PURPOSE HASH TABLE TEMPLATE. INSTEAD, IT CAN
// BE USED AS THE BASIS FOR VERY HIGH SPEED AND COMPACT HASH TABLES
// THAT OBEY VERY STRICT CONDITIONS DESCRIBED BELOW.
//
// DO NOT USE THIS UNLESS YOU HAVE A GOOD REASON, I.E. THAT PROFILING
// SHOWS THERE *IS* AN OVERALL BENEFIT TO DO SO. FOR MOST CASES,
// std::set<>, std::unordered_set<> and base::flat_set<> ARE PERFECTLY
// FINE AND SHOULD BE PREFERRED.
//
//
// That being said, this implementation uses a completely typical
// open-addressing scheme with a buckets array size which is always
// a power of 2, instead of a prime number. Experience shows this is
// not detrimental to performance as long as you have a sufficiently
// good hash function (which is the case of all C++ standard libraries
// these days for strings and pointers).
//
// The reason it is so fast is due to its compactness and the very
// simple but tight code for a typical lookup / insert / deletion
// operation.
//
// The |buckets_| field holds a pointer to an array of NODE_TYPE
// instances, called nodes. Each node represents one of the following:
// a free entry in the table, an inserted item, or a tombstone marking
// the location of a previously deleted item. Tombstones are only
// used if the hash table instantiation requires deletion support
// (see the is_tombstone() description below).
//
// The template requires that Node be a type with the following traits:
//
//  - It *must* be a trivial type, that is zero-initialized.
//
//  - It provides an is_null() const method, which should return true
//    if the corresponding node matches a 'free' entry in the hash
//    table, i.e. one that has not been assigned to an item, or
//    to a deletion tombstone (see below).
//
//    Of course, a default (zeroed) value should always return true.
//
//  - It provides an is_tombstone() const method, which should return
//    true if the corresponding node indicates a previously deleted
//    item.
//
//    Note that if your hash table does not need deletion support,
//    it is highly recommended to make this a static constexpr method
//    that always returns false. Doing so will optimize the lookup
//    loop automatically! (A minimal sketch appears after this list.)
//
//  - It must provide an is_valid() method, that simply returns
//    (!is_null() && !is_tombstone()). This is a convenience for this
//    template, but also for the derived class that will extend it
//    (more on this below, in the usage description).
//
//  - It must provide a hash_value() const method returning the item's
//    hash value. This is used when the table grows and its nodes need
//    to be rehashed (see GrowBuckets() below).
//
// Note that because Node instances are trivial, std::unique_ptr<>
// cannot be used in them. Item lifecycle must thus be managed
// explicitly by a derived class of the template instantiation
// instead.
//
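// As an illustration of the deletion-free case mentioned above, here is
// one possible minimal node type (hypothetical, not part of this header)
// whose is_tombstone() is a static constexpr method. With such a node,
// the tombstone branch in the NodeLookup() probing loop below is a
// compile-time constant and gets optimized away entirely:
//
//   struct MyPointerNode {
//     size_t hash;
//     void*  ptr;
//
//     bool is_null() const { return !ptr; }
//     static constexpr bool is_tombstone() { return false; }
//     bool is_valid() const { return !is_null() && !is_tombstone(); }
//     size_t hash_value() const { return hash; }
//   };
//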
// Lookup, insertion and deletion are performed in ways that
// are *very* different from standard containers, and the reason
// is, unsurprisingly, performance.
//
// Use NodeLookup() to look for an existing item in the hash table.
// This takes the item's hash value, and a templated callable to
// compare a node against the search key. This scheme allows
// heterogeneous lookups (a sketch follows the MyFooSet example
// below), and keeps the node type details out of this header.
// Any recent C++ optimizer will generate very tight machine code
// for this call.
//
// The NodeLookup() function always returns a non-null and
// mutable |node| pointer. If |node->is_valid()| is true,
// then the item was found, and |node| points to it.
//
// Otherwise, |node| corresponds to a location that may be
// used for insertion. To do so, the caller should update the
// content of |node| appropriately (e.g. writing a pointer and/or
// hash value to it), then call UpdateAfterInsert(), which
// may eventually grow the table and rehash nodes in it.
//
// To delete an item, call NodeLookup() first to verify that
// the item is present, then write a tombstone value to |node|,
// then call UpdateAfterRemoval().
//
// Note that what the tombstone value is doesn't matter to this
// header, as long as |node->is_tombstone()| returns true for
// it.
//
// Here's an example of a concrete implementation that stores
// a hash value and an owning pointer in each node:
//
//   struct MyFooNode {
//     size_t hash;
//     Foo*   foo;
//
//     bool is_null() const { return !foo; }
//     bool is_tombstone() const { return foo == &kTombstone; }
//     bool is_valid() const { return !is_null() && !is_tombstone(); }
//     size_t hash_value() const { return hash; }
//
//     static Foo kTombstone;
//   };
//
//   class MyFooSet : public HashTableBase<MyFooNode> {
//   public:
//     using BaseType = HashTableBase<MyFooNode>;
//     using Node = BaseType::Node;
//
//     ~MyFooSet() {
//       // Destroy all items in the set.
//       // Note that the iterator only iterates over valid items.
//       for (Node& node : *this) {
//         delete node.foo;
//       }
//     }
//
//     // Returns true if this set contains |key|.
//     bool contains(const Foo& key) const {
//       Node* node = BaseType::NodeLookup(
//           MakeHash(key),
//           [&](const Node* node) { return *node->foo == key; });
//
//       return node->is_valid();
//     }
//
//     // Try to add |key| to the set. Return true if the insertion
//     // was successful, or false if the item was already in the set.
//     bool add(const Foo& key) {
//       size_t hash = MakeHash(key);
//       Node* node = NodeLookup(
//           hash,
//           [&](const Node* node) { return *node->foo == key; });
//
//       if (node->is_valid()) {
//         // Already in the set.
//         return false;
//       }
//
//       // Add it.
//       node->hash = hash;
//       node->foo  = new Foo(key);
//       UpdateAfterInsert();
//       return true;
//     }
//
//     // Try to remove |key| from the set. Return true if the
//     // item was already in it, false otherwise.
//     bool erase(const Foo& key) {
//       Node* node = BaseType::NodeLookup(
//           MakeHash(key),
//           [&](const Node* node) { return *node->foo == key; });
//
//       if (!node->is_valid()) {
//         // Not in the set.
//         return false;
//       }
//
//       delete node->foo;
//       node->foo = &Node::kTombstone;
//       UpdateAfterRemoval();
//       return true;
//     }
//
//     static size_t MakeHash(const Foo& foo) {
//       return std::hash<Foo>()(foo);
//     }
//   };
//
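// Because NodeLookup() takes a hash value plus a comparison callable,
// rather than a key, lookups can be heterogeneous. The sketch below is
// purely illustrative (MyStringNode and MyStringSet are hypothetical, and
// C++17 <string> / <string_view> are assumed): a set of heap-allocated
// std::string values queried with a std::string_view, without creating a
// temporary std::string. Insertions would have to hash the stored strings
// the same way, e.g. with std::hash<std::string_view>, whose result the
// standard guarantees matches std::hash<std::string> for equal content.
//
//   struct MyStringNode {
//     size_t       hash;
//     std::string* str;
//
//     bool is_null() const { return !str; }
//     static constexpr bool is_tombstone() { return false; }
//     bool is_valid() const { return !is_null() && !is_tombstone(); }
//     size_t hash_value() const { return hash; }
//   };
//
//   class MyStringSet : public HashTableBase<MyStringNode> {
//   public:
//     bool contains(std::string_view key) const {
//       Node* node = NodeLookup(
//           std::hash<std::string_view>()(key),
//           [&](const Node* node) { return *node->str == key; });
//       return node->is_valid();
//     }
//     ...
//   };
//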
// For more concrete examples, see the implementation of
// StringAtom or UniqueVector<>.
//
template <typename NODE_TYPE>
class HashTableBase {
 public:
  using Node = NODE_TYPE;

  static_assert(std::is_trivial<NODE_TYPE>::value,
                "NODE_TYPE should be a trivial type!");

  static_assert(std::is_standard_layout<NODE_TYPE>::value,
                "NODE_TYPE should be a standard layout type!");

  // Default constructor.
  HashTableBase() = default;

  // Destructor. This only deals with the |buckets_| array itself.
  ~HashTableBase() {
    if (buckets_ != buckets0_)
      free(buckets_);
  }

  // Copy and move operations. These only operate on the |buckets_| array
  // so any owned pointer inside nodes should be handled by custom
  // constructors and operators in the derived class, if needed (see the
  // sketch after these operators).
  HashTableBase(const HashTableBase& other)
      : count_(other.count_), size_(other.size_) {
    if (other.buckets_ != other.buckets0_) {
      // NOTE: Using malloc() here to clarify that no object construction
      // should occur here.
      buckets_ = reinterpret_cast<Node*>(malloc(other.size_ * sizeof(Node)));
    }
    memcpy(buckets_, other.buckets_, other.size_ * sizeof(Node));
  }

  HashTableBase& operator=(const HashTableBase& other) {
    if (this != &other) {
      this->~HashTableBase();
      new (this) HashTableBase(other);
    }
    return *this;
  }

  HashTableBase(HashTableBase&& other) noexcept
      : count_(other.count_), size_(other.size_), buckets_(other.buckets_) {
    if (buckets_ == other.buckets0_) {
      buckets0_[0] = other.buckets0_[0];
      buckets_ = buckets0_;
    } else {
      other.buckets_ = other.buckets0_;
    }
    other.NodeClear();
  }

  HashTableBase& operator=(HashTableBase&& other) noexcept {
    if (this != &other) {
      this->~HashTableBase();
      new (this) HashTableBase(std::move(other));
    }
    return *this;
  }

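  // For example, a derived class whose nodes own heap-allocated items
  // (like the MyFooSet sketch in the header comment above) must either
  // deep-copy those items or disable copying with '= delete', since the
  // base operations above only duplicate the raw node array. A minimal,
  // purely illustrative sketch of the deep-copy option:
  //
  //   MyFooSet::MyFooSet(const MyFooSet& other) : BaseType(other) {
  //     // The base copy duplicated the nodes, so each |foo| pointer is
  //     // now shared with |other|. Clone the items to take ownership.
  //     for (Node& node : *this)
  //       node.foo = new Foo(*node.foo);
  //   }
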
  // Return true if the table is empty.
  bool empty() const { return count_ == 0; }

  // Return the number of keys in the set.
  size_t size() const { return count_; }

 protected:
  // The following should only be called by derived classes that
  // extend this template class, and are not available to their
  // clients. This forces the derived class to implement lookup,
  // insertion and deletion with sane APIs instead.

  // Iteration support note:
  //
  // Derived classes, if they wish to support iteration, should provide their
  // own iterator/const_iterator/begin()/end() types and methods, possibly as
  // simple wrappers around the NodeIterator/NodeBegin/NodeEnd below.
  //
  // Defining a custom iterator is as easy as deriving from NodeIterator
  // and overloading operator*() and operator->(), as in:
  //
  //   struct FooNode {
  //     size_t hash;
  //     Foo*   foo_ptr;
  //     ...
  //   };
  //
  //   class FooTable : public HashTableBase<FooNode> {
  //   public:
  //     ...
  //
  //     // Iterators point to Foo instances, not table nodes.
  //     struct ConstIterator : NodeIterator {
  //       const Foo* operator->() { return node_->foo_ptr; }
  //       const Foo& operator*() { return *(node_->foo_ptr); }
  //     };
  //
  //     ConstIterator begin() const { return { NodeBegin() }; }
  //     ConstIterator end() const { return { NodeEnd() }; }
  //   };
  //
  // The NodeIterator type has a valid() method that can be used to perform
  // faster iteration, though at the cost of using a slightly more annoying
  // syntax:
  //
  //   for (auto it = my_table.begin(); it.valid(); ++it) {
  //     const Foo& foo = *it;
  //     ...
  //   }
  //
  // Instead of:
  //
  //   for (const Foo& foo : my_table) {
  //     ...
  //   }
  //
  // The ValidNodesRange() method also returns an object that has begin() and
  // end() methods to be used in for-range loops over Node values, as in:
  //
  //   for (Node& node : my_table.ValidNodesRange()) { ... }
  //
  struct NodeIterator {
    Node& operator*() { return *node_; }
    const Node& operator*() const { return *node_; }

    Node* operator->() { return node_; }
    const Node* operator->() const { return node_; }

    bool operator==(const NodeIterator& other) const {
      return node_ == other.node_;
    }

    bool operator!=(const NodeIterator& other) const {
      return node_ != other.node_;
    }

    // Pre-increment.
    NodeIterator& operator++() {
      node_++;
      while (node_ < node_limit_ && !node_->is_valid())
        node_++;

      return *this;
    }

    // Post-increment.
    NodeIterator operator++(int) {
      NodeIterator result = *this;
      ++(*this);
      return result;
    }

    // Returns true if the iterator points to a valid node.
    bool valid() const { return node_ != node_limit_; }

    Node* node_ = nullptr;
    Node* node_limit_ = nullptr;
  };

  // NOTE: These are intentionally not public, to avoid exposing
  // them in derived classes by mistake. If a derived class
  // wants to support iteration, it should provide its own begin() and
  // end() methods, possibly using NodeBegin() and NodeEnd() below to
  // implement them.
  NodeIterator begin() { return NodeBegin(); }
  NodeIterator end() { return NodeEnd(); }

  // Providing methods named NodeBegin() and NodeEnd(), instead of
  // just using begin() and end(), is a convenience to derived classes
  // that need to provide their own begin() and end() methods, e.g.
  // consider this class:
  //
  //   struct FooNode {
  //     size_t hash;
  //     Foo*   foo_ptr;
  //     ...
  //   };
  //
  //   class FooTable : public HashTableBase<FooNode> {
  //   public:
  //     ...
  //
  //     // Iterators point to Foo instances, not table nodes.
  //     struct ConstIterator : NodeIterator {
  //       const Foo* operator->() { return node_->foo_ptr; }
  //       const Foo& operator*() { return *(node_->foo_ptr); }
  //     };
  //
  // and compare two ways to implement its begin() method:
  //
  //   FooTable::ConstIterator FooTable::begin() const {
  //     return {
  //       reinterpret_cast<const HashTableBase<FooNode>*>(this)->begin()
  //     };
  //   }
  //
  // with:
  //
  //   FooTable::ConstIterator FooTable::begin() const {
  //     return { NodeBegin() };
  //   }
  //
  NodeIterator NodeBegin() const {
    Node* node = buckets_;
    Node* limit = node + size_;
    while (node < limit && !node->is_valid())
      node++;

    return {node, limit};
  }

  NodeIterator NodeEnd() const {
    Node* limit = buckets_ + size_;
    return {limit, limit};
  }

  // ValidNodesRange() allows a derived class to use range-based loops
  // over valid nodes, even if it has defined its own begin() and
  // end() methods, e.g. following the same class design as in the
  // above comment:
  //
  //   FooTable::~FooTable() {
  //     for (const FooNode& node : ValidNodesRange()) {
  //       delete node.foo_ptr;
  //     }
  //   }
  //
  // which is a little bit clearer than the (valid) alternative:
  //
  //   FooTable::~FooTable() {
  //     for (const Foo& foo : *this) {
  //       delete &foo;  // WHAT!?
  //     }
  //   }
  //
  struct NodeIteratorPair {
    NodeIterator begin() { return begin_; }
    NodeIterator end() { return end_; }

    NodeIterator begin_;
    NodeIterator end_;
  };

  NodeIteratorPair ValidNodesRange() const { return {NodeBegin(), NodeEnd()}; }

  // Clear the nodes table completely.
  void NodeClear() {
    if (buckets_ != buckets0_)
      free(buckets_);

    count_ = 0;
    size_ = 1;
    buckets_ = buckets0_;
    buckets0_[0] = Node{};
  }

  // Look up a node in the hash table that matches the |node_equal|
  // predicate, which takes a const Node* pointer argument, and returns
  // true if this corresponds to a lookup match.
  //
  // IMPORTANT: |node_equal| may or may not check the node hash value,
  // the choice is left to the implementation.
  //
  // Returns a non-null *mutable* node pointer. |node->is_valid()| will
  // be true for matches, and false for misses.
  //
  // NOTE: In case of a miss, this will return the location of the first
  // tombstone encountered during probing, if any, or the first free entry
  // otherwise. This keeps the table consistent in case the node is
  // overwritten by the caller in a following insert operation.
  template <typename NODE_EQUAL>
  Node* NodeLookup(size_t hash, NODE_EQUAL node_equal) const {
    size_t mask = size_ - 1;
    size_t index = hash & mask;
    Node* tombstone = nullptr;  // First tombstone node found, if any.
    for (;;) {
      Node* node = buckets_ + index;
      if (node->is_null()) {
        return tombstone ? tombstone : node;
      }
      if (node->is_tombstone()) {
        if (!tombstone)
          tombstone = node;
      } else if (node_equal(node)) {
        return node;
      }
      index = (index + 1) & mask;
    }
  }

  // Call this method after updating the content of the |node| pointer
  // returned by an unsuccessful NodeLookup(). Returns true to indicate that
  // the table size changed, and that existing iterators were invalidated.
  bool UpdateAfterInsert() {
    count_ += 1;
    // Grow the table once the load factor reaches 75%, i.e. when
    // count_ / size_ >= 3 / 4, to keep probe sequences short.
    if (UNLIKELY(count_ * 4 >= size_ * 3)) {
      GrowBuckets();
      return true;
    }
    return false;
  }

  // Call this method after writing a tombstone value to the |node| pointer
  // returned by a successful NodeLookup(). Returns true to indicate a table
  // size change, i.e. that existing iterators were invalidated.
  bool UpdateAfterRemoval() {
    count_ -= 1;
    // For now don't support shrinking since this is not useful for GN.
    return false;
  }

 private:
#if defined(__GNUC__) || defined(__clang__)
  [[gnu::noinline]]
#endif
  void GrowBuckets() {
    size_t size = size_;
    size_t new_size = (size == 1) ? 8 : size * 2;
    size_t new_mask = new_size - 1;

    // NOTE: Using calloc() since no object construction can or should take
    // place here.
    Node* new_buckets =
        reinterpret_cast<Node*>(calloc(new_size, sizeof(Node)));

    for (size_t src_index = 0; src_index < size; ++src_index) {
      const Node* node = &buckets_[src_index];
      if (!node->is_valid())
        continue;
      size_t dst_index = node->hash_value() & new_mask;
      for (;;) {
        Node* node2 = new_buckets + dst_index;
        if (node2->is_null()) {
          *node2 = *node;
          break;
        }
        dst_index = (dst_index + 1) & new_mask;
      }
    }

    if (buckets_ != buckets0_)
      free(buckets_);

    buckets_ = new_buckets;
    buckets0_[0] = Node{};
    size_ = new_size;
  }

  // NOTE: The reason for default-initializing |buckets_| to a storage slot
  // inside the object is to ensure the value is never null. This removes one
  // nullptr check from each NodeLookup() instantiation.
  size_t count_ = 0;
  size_t size_ = 1;
  Node* buckets_ = buckets0_;
  Node buckets0_[1] = {{}};
};

#endif  // TOOLS_GN_HASH_TABLE_BASE_H_