• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2020 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef TOOLS_GN_HASH_TABLE_BASE_H_
6 #define TOOLS_GN_HASH_TABLE_BASE_H_
7 
8 #include "base/compiler_specific.h"
9 
10 #include <stdlib.h>
11 #include <type_traits>
12 #include <utility>
13 
14 // IMPORTANT DISCLAIMER:
15 //
16 // THIS IS *NOT* A GENERAL PURPOSE HASH TABLE TEMPLATE. INSTEAD, IT CAN
17 // CAN BE USED AS THE BASIS FOR VERY HIGH SPEED AND COMPACT HASH TABLES
18 // THAT OBEY VERY STRICT CONDITIONS DESCRIBED BELOW.
19 //
20 // DO NOT USE THIS UNLESS YOU HAVE A GOOD REASON, I.E. THAT PROFILING
21 // SHOWS THERE *IS* AN OVERALL BENEFIT TO DO SO. FOR MOST CASES,
22 // std::set<>, std::unordered_set<> and base::flat_set<> ARE PERFECTLY
23 // FINE AND SHOULD BE PREFERRED.
24 //
25 //
26 // That being said, this implementation uses a completely typical
27 // open-addressing scheme with a buckets array size which is always
28 // a power of 2, instead of a prime number. Experience shows this is
29 // not detrimental to performance as long as you have a sufficiently
30 // good hash function (which is the case of all C++ standard libraries
31 // these days for strings and pointers).
32 //
33 // The reason it is so fast is due to its compactness and the very
34 // simple but tight code for a typical lookup / insert / deletion
35 // operation.
36 //
37 // The |buckets_| field holds a pointer to an array of NODE_TYPE
38 // instances, called nodes. Each node represents one of the following:
39 // a free entry in the table, an inserted item, or a tombstone marking
40 // the location of a previously deleted item. Tombstones are only
41 // used if the hash table instantiation requires deletion support
42 // (see the is_tombstone() description below).
43 //
44 // The template requires that Node be a type with the following traits:
45 //
46 //  - It *must* be a trivial type, that is zero-initialized.
47 //
48 //  - It provides an is_null() const method, which should return true
49 //    if the corresponding node matches a 'free' entry in the hash
50 //    table, i.e. one that has not been assigned to an item, or
51 //    to a deletion tombstone (see below).
52 //
53 //    Of course, a default (zeroed) value, should always return true.
54 //
55 //  - It provides an is_tombstone() const method, which should return
56 //    return true if the corresponding node indicates a previously
57 //    deleted item.
58 //
59 //    Note that if your hash table does not need deletion support,
60 //    it is highly recommended to make this a static constexpr method
61 //    that always return false. Doing so will optimize the lookup loop
62 //    automatically!
63 //
64 //  - It must provide an is_valid() method, that simply returns
65 //    (!is_null() && !is_tombstone()). This is a convenience for this
66 //    template, but also for the derived class that will extend it
67 //    (more on this below, in the usage description).
68 //
69 // Note that because Node instances are trivial, std::unique_ptr<>
70 // cannot be used in them. Item lifecycle must this be managed
71 // explicitly by a derived class of the template instantiation
72 // instead.
73 //
74 // Lookup, insertion and deletion are performed in ways that
75 // are *very* different from standard containers, and the reason
76 // is, unsurprisingly, performance.
77 //
78 // Use NodeLookup() to look for an existing item in the hash table.
79 // This takes the item's hash value, and a templated callable to
80 // compare a node against the search key. This scheme allows
81 // heterogeneous lookups, and keeps the node type details
82 // out of this header. Any recent C++ optimizer will generate
83 // very tight machine code for this call.
84 //
85 // The NodeLookup() function always returns a non-null and
86 // mutable |node| pointer. If |node->is_valid()| is true,
87 // then the item was found, and |node| points to it.
88 //
89 // Otherwise, |node| corresponds to a location that may be
90 // used for insertion. To do so, the caller should update the
91 // content of |node| appropriately (e.g. writing a pointer and/or
92 // hash value to it), then call UpdateAfterInsertion(), which
93 // may eventually grow the table and rehash nodes in it.
94 //
95 // To delete an item, call NodeLookup() first to verify that
96 // the item is present, then write a tombstone value to |node|,
97 // then call UpdateAfterDeletion().
98 //
99 // Note that what the tombstone value is doesn't matter to this
100 // header, as long as |node->is_tombstone()| returns true for
101 // it.
102 //
103 // Here's an example of a concrete implementation that stores
104 // a hash value and an owning pointer in each node:
105 //
106 //     struct MyFooNode {
107 //       size_t hash;
108 //       Foo*   foo;
109 //
110 //       bool is_null() const { return !foo; }
111 //       bool is_tombstone() const { return foo == &kTombstone; }
112 //       bool is_valid() const { return !is_null() && !is_tombstone(); }
113 //
114 //       static const Foo kTombstone;
115 //     };
116 //
117 //    class MyFooSet : public HashTableBase<MyFoodNode> {
118 //    public:
119 //      using BaseType = HashTableBase<MyFooNode>;
120 //      using Node = BaseType::Node;
121 //
122 //      ~MyFooSet() {
123 //        // Destroy all items in the set.
124 //        // Note that the iterator only parses over valid items.
125 //        for (Node* node : *this) {
126 //          delete node->foo;
127 //        }
128 //      }
129 //
130 //      // Returns true if this set contains |key|.
131 //      bool contains(const Foo& key) const {
132 //        Node* node = BaseType::NodeLookup(
133 //            MakeHash(key),
134 //            [&](const Node* node) { return node->foo == key; });
135 //
136 //        return node->is_valid();
137 //      }
138 //
139 //      // Try to add |key| to the set. Return true if the insertion
140 //      // was successful, or false if the item was already in the set.
141 //      bool add(const Foo& key) {
142 //        size_t hash = MakeHash(key);
143 //        Node* node = NodeLookup(
144 //            hash,
145 //            [&](const Node* node) { return node->foo == key; });
146 //
147 //        if (node->is_valid()) {
148 //          // Already in the set.
149 //          return false;
150 //        }
151 //
152 //        // Add it.
153 //        node->hash = hash;
154 //        node->foo  = new Foo(key);
155 //        UpdateAfterInsert();
156 //        return true;
157 //      }
158 //
159 //      // Try to remove |key| from the set. Return true if the
160 //      // item was already in it, false otherwise.
161 //      bool erase(const Foo& key) {
162 //        Node* node = BaseType::NodeLookup(
163 //            MakeHash(key),
164 //            [&](const Node* node) { return node->foo == key; });
165 //
166 //        if (!node->is_valid()) {
167 //          // Not in the set.
168 //          return false;
169 //        }
170 //
171 //        delete node->foo;
172 //        node->foo = Node::kTombstone;
173 //        UpdateAfterDeletion().
174 //        return true;
175 //      }
176 //
177 //      static size_t MakeHash(const Foo& foo) {
178 //        return std::hash<Foo>()();
179 //      }
180 //    };
181 //
182 // For more concrete examples, see the implementation of
183 // StringAtom or UniqueVector<>
184 //
185 template <typename NODE_TYPE>
186 class HashTableBase {
187  public:
188   using Node = NODE_TYPE;
189 
190   static_assert(std::is_trivial<NODE_TYPE>::value,
191                 "KEY_TYPE should be a trivial type!");
192 
193   static_assert(std::is_standard_layout<NODE_TYPE>::value,
194                 "KEY_TYPE should be a standard layout type!");
195 
196   // Default constructor.
197   HashTableBase() = default;
198 
199   // Destructor. This only deals with the |buckets_| array itself.
~HashTableBase()200   ~HashTableBase() {
201     if (buckets_ != buckets0_)
202       free(buckets_);
203   }
204 
205   // Copy and move operations. These only operate on the |buckets_| array
206   // so any owned pointer inside nodes should be handled by custom
207   // constructors and operators in the derived class, if needed.
HashTableBase(const HashTableBase & other)208   HashTableBase(const HashTableBase& other)
209       : count_(other.count_), size_(other.size_) {
210     if (other.buckets_ != other.buckets0_) {
211       // NOTE: using malloc() here to clarify that no object construction
212       // should occur here.
213       buckets_ = reinterpret_cast<Node*>(malloc(other.size_ * sizeof(Node)));
214     }
215     memcpy(buckets_, other.buckets_, other.size_ * sizeof(Node));
216   }
217 
218   HashTableBase& operator=(const HashTableBase& other) {
219     if (this != &other) {
220       this->~HashTableBase();
221       new (this) HashTableBase(other);
222     }
223     return *this;
224   }
225 
HashTableBase(HashTableBase && other)226   HashTableBase(HashTableBase&& other) noexcept
227       : count_(other.count_), size_(other.size_), buckets_(other.buckets_) {
228     if (buckets_ == other.buckets0_) {
229       buckets0_[0] = other.buckets0_[0];
230       buckets_ = buckets0_;
231     } else {
232       other.buckets_ = other.buckets0_;
233     }
234     other.NodeClear();
235   }
236 
237   HashTableBase& operator=(HashTableBase&& other) noexcept {
238     if (this != &other) {
239       this->~HashTableBase();
240       new (this) HashTableBase(std::move(other));
241     }
242     return *this;
243   }
244 
245   // Return true if the table is empty.
empty()246   bool empty() const { return count_ == 0; }
247 
248   // Return the number of keys in the set.
size()249   size_t size() const { return count_; }
250 
251  protected:
252   // The following should only be called by derived classes that
253   // extend this template class, and are not available to their
254   // clients. This forces the derived class to implement lookup
255   // insertion and deletion with sane APIs instead.
256 
257   // Iteration support note:
258   //
259   // Derived classes, if they wish to support iteration, should provide their
260   // own iterator/const_iterator/begin()/end() types and methods, possibly as
261   // simple wrappers around the NodeIterator/NodeBegin/NodeEnd below.
262   //
263   // Defining a custom iterator is as easy as deriving from NodeIterator
264   // and overloading operator*() and operator->(), as in:
265   //
266   //  struct FooNode {
267   //     size_t hash;
268   //     Foo*   foo_ptr;
269   //     ...
270   //  };
271   //
272   //  class FooTable : public HashTableBase<FooNode> {
273   //  public:
274   //     ...
275   //
276   //     // Iterators point to Foo instances, not table nodes.
277   //     struct ConstIterator : NodeIterator {
278   //       const Foo* operator->() { return node_->foo_ptr; }
279   //       const Foo& operator*)) { return *(node_->foo_ptr); }
280   //     };
281   //
282   //     ConstIterator begin() const { return { NodeBegin() }; }
283   //     ConstIterator end() const { return { NodeEnd() }; }
284   //
285   // The NodeIterator type has a valid() method that can be used to perform
286   // faster iteration though at the cost of using a slightly more annoying
287   // syntax:
288   //
289   //    for (auto it = my_table.begin(); it.valid(); ++it) {
290   //      const Foo& foo = *it;
291   //      ...
292   //    }
293   //
294   // Instead of:
295   //
296   //    for (const Foo& foo : my_table) {
297   //      ...
298   //    }
299   //
300   // The ValidNodesRange() method also returns a object that has begin() and
301   // end() methods to be used in for-range loops over Node values as in:
302   //
303   //    for (Node& node : my_table.ValidNodesRange()) { ... }
304   //
305   struct NodeIterator {
306     Node& operator*() { return *node_; }
307     const Node& operator*() const { return *node_; }
308 
309     Node* operator->() { return node_; }
310     const Node* operator->() const { return node_; }
311 
312     bool operator==(const NodeIterator& other) const {
313       return node_ == other.node_;
314     }
315 
316     bool operator!=(const NodeIterator& other) const {
317       return node_ != other.node_;
318     }
319 
320     // pre-increment
321     NodeIterator& operator++() {
322       node_++;
323       while (node_ < node_limit_ && !node_->is_valid())
324         node_++;
325 
326       return *this;
327     }
328 
329     // post-increment
330     NodeIterator operator++(int) {
331       NodeIterator result = *this;
332       ++(*this);
333       return result;
334     }
335 
336     // Returns true if the iterator points to a valid node.
validNodeIterator337     bool valid() const { return node_ != node_limit_; }
338 
339     Node* node_ = nullptr;
340     Node* node_limit_ = nullptr;
341   };
342 
343   // NOTE: This is intentionally not public to avoid exposing
344   // them in derived classes by mistake. If a derived class
345   // wants to support iteration, it provide its own begin() and end()
346   // methods, possibly using NodeBegin() and NodeEnd() below to
347   // implement them.
begin()348   NodeIterator begin() { return NodeBegin(); }
end()349   NodeIterator end() { return NodeEnd(); }
350 
351   // Providing methods named NodeBegin() and NodeEnd(), instead of
352   // just using begin() and end() is a convenience to derived classes
353   // that need to provide their own begin() and end() method, e.g.
354   // consider this class:
355   //
356   //  struct FooNode {
357   //     size_t hash;
358   //     Foo*   foo_ptr;
359   //     ...
360   //  };
361   //
362   //  class FooTable : public HashTableBase<FooNode> {
363   //  public:
364   //     ...
365   //
366   //     // Iterators point to Foo instances, not table nodes.
367   //     struct ConstIterator : NodeIterator {
368   //       const Foo* operator->() { return node_->foo_ptr; }
369   //       const Foo& operator*)) { return *(node_->foo_ptr); }
370   //     };
371   //
372   // and compare two ways to implement its begin() method:
373   //
374   //    Foo::ConstIterator Foo::begin() const {
375   //      return {
376   //        reinterpret_cast<const HashTableBase<FooNode> *>(this)->begin()
377   //      };
378   //    };
379   //
380   // with:
381   //
382   //    Foo::ConstIterator Foo::begin() const {
383   //      return { NodeBegin(); }
384   //    }
385   //
NodeBegin()386   NodeIterator NodeBegin() const {
387     Node* node = buckets_;
388     Node* limit = node + size_;
389     while (node < limit && !node->is_valid())
390       node++;
391 
392     return {node, limit};
393   }
394 
NodeEnd()395   NodeIterator NodeEnd() const {
396     Node* limit = buckets_ + size_;
397     return {limit, limit};
398   }
399 
400   // ValidNodeRange() allows a derived-class to use range-based  loops
401   // over valid nodes, even if they have defined their own begin() and
402   // end() methods, e.g. following the same class design as in the
403   // above comment:
404   //
405   //   FooTable::~FooTable() {
406   //     for (const FooNode& node : ValidNodesRange()) {
407   //       delete node->foo_ptr;
408   //     }
409   //   }
410   //
411   // which is a little bit clearer than the (valid) alternative:
412   //
413   //   FooTable::~FooTable() {
414   //     for (Foo& foo : *this) {
415   //       delete &foo;  // WHAT!?
416   //     }
417   //   }
418   //
419   struct NodeIteratorPair {
beginNodeIteratorPair420     NodeIterator begin() { return begin_; }
endNodeIteratorPair421     NodeIterator end() { return end_; }
422 
423     NodeIterator begin_;
424     NodeIterator end_;
425   };
426 
ValidNodesRange()427   NodeIteratorPair ValidNodesRange() const { return {NodeBegin(), NodeEnd()}; }
428 
429   // Clear the nodes table completely.
NodeClear()430   void NodeClear() {
431     if (buckets_ != buckets0_)
432       free(buckets_);
433 
434     count_ = 0;
435     size_ = 1;
436     buckets_ = buckets0_;
437     buckets0_[0] = Node{};
438   }
439 
440   // Lookup for a node in the hash table that matches the |node_equal|
441   // predicate, which takes a const Node* pointer argument, and returns
442   // true if this corresponds to a lookup match.
443   //
444   // IMPORTANT: |node_equal| may or may not check the node hash value,
445   // the choice is left to the implementation.
446   //
447   // Returns a non-null *mutable* node pointer. |node->is_valid()| will
448   // be true for matches, and false for misses.
449   //
450   // NOTE: In case of a miss, this will return the location of the first
451   // tombstone encountered during probing, if any, or the first free entry
452   // otherwise. This keeps the table consistent in case the node is overwritten
453   // by the caller in a following insert operation.
454   template <typename NODE_EQUAL>
NodeLookup(size_t hash,NODE_EQUAL node_equal)455   Node* NodeLookup(size_t hash, NODE_EQUAL node_equal) const {
456     size_t mask = size_ - 1;
457     size_t index = hash & mask;
458     Node* tombstone = nullptr;  // First tombstone node found, if any.
459     for (;;) {
460       Node* node = buckets_ + index;
461       if (node->is_null()) {
462         return tombstone ? tombstone : node;
463       }
464       if (node->is_tombstone()) {
465         if (!tombstone)
466           tombstone = node;
467       } else if (node_equal(node)) {
468         return node;
469       }
470       index = (index + 1) & mask;
471     }
472   }
473 
474   // Call this method after updating the content of the |node| pointer
475   // returned by an unsuccessful NodeLookup(). Return true to indicate that
476   // the table size changed, and that existing iterators were invalidated.
UpdateAfterInsert()477   bool UpdateAfterInsert() {
478     count_ += 1;
479     if (UNLIKELY(count_ * 4 >= size_ * 3)) {
480       GrowBuckets();
481       return true;
482     }
483     return false;
484   }
485 
486   // Call this method after updating the content of the |node| value
487   // returned a by successful NodeLookup, to the tombstone value, if any.
488   // Return true to indicate a table size change, ie. that existing
489   // iterators where invalidated.
UpdateAfterRemoval()490   bool UpdateAfterRemoval() {
491     count_ -= 1;
492     // For now don't support shrinking since this is not useful for GN.
493     return false;
494   }
495 
496  private:
497 #if defined(__GNUC__) || defined(__clang__)
498   [[gnu::noinline]]
499 #endif
500   void
GrowBuckets()501   GrowBuckets() {
502     size_t size = size_;
503     size_t new_size = (size == 1) ? 8 : size * 2;
504     size_t new_mask = new_size - 1;
505 
506     // NOTE: Using calloc() since no object constructiopn can or should take
507     // place here.
508     Node* new_buckets = reinterpret_cast<Node*>(calloc(new_size, sizeof(Node)));
509 
510     for (size_t src_index = 0; src_index < size; ++src_index) {
511       const Node* node = &buckets_[src_index];
512       if (!node->is_valid())
513         continue;
514       size_t dst_index = node->hash_value() & new_mask;
515       for (;;) {
516         Node* node2 = new_buckets + dst_index;
517         if (node2->is_null()) {
518           *node2 = *node;
519           break;
520         }
521         dst_index = (dst_index + 1) & new_mask;
522       }
523     }
524 
525     if (buckets_ != buckets0_)
526       free(buckets_);
527 
528     buckets_ = new_buckets;
529     buckets0_[0] = Node{};
530     size_ = new_size;
531   }
532 
533   // NOTE: The reason for default-initializing |buckets_| to a storage slot
534   // inside the object is to ensure the value is never null. This removes one
535   // nullptr check from each NodeLookup() instantiation.
536   size_t count_ = 0;
537   size_t size_ = 1;
538   Node* buckets_ = buckets0_;
539   Node buckets0_[1] = {{}};
540 };
541 
542 #endif  // TOOLS_GN_HASH_TABLE_BASE_H_
543