1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #ifndef TENSORFLOW_UTIL_PRESIZED_CUCKOO_MAP_H_
17 #define TENSORFLOW_UTIL_PRESIZED_CUCKOO_MAP_H_
18
19 #include <algorithm>
20 #include <vector>
21 #include "tensorflow/core/framework/types.h"
22 #include "tensorflow/core/platform/macros.h"
23 #include "tensorflow/core/platform/prefetch.h"
24
25 namespace tensorflow {
26
27 // Class for efficiently storing key->value mappings when the size is
28 // known in advance and the keys are pre-hashed into uint64s.
29 // Keys should have "good enough" randomness (be spread across the
30 // entire 64 bit space).
31 //
32 // Important: Clients wishing to use deterministic keys must
33 // ensure that their keys fall in the range 0 .. (uint64max-1);
34 // the table uses 2^64-1 as the "not occupied" flag.
35 //
36 // Inserted keys must be unique, and there are no update
37 // or delete functions (until some subsequent use of this table
38 // requires them).
39 //
40 // Threads must synchronize their access to a PresizedCuckooMap.
41 //
42 // The cuckoo hash table is 4-way associative (each "bucket" has 4
43 // "slots" for key/value entries). Uses breadth-first-search to find
44 // a good cuckoo path with less data movement (see
45 // http://www.cs.cmu.edu/~dga/papers/cuckoo-eurosys14.pdf )
46
47 namespace presized_cuckoo_map {
48 // Utility function to compute (x * y) >> 64, or "multiply high".
49 // On x86-64, this is a single instruction, but not all platforms
50 // support the __uint128_t type, so we provide a generic
51 // implementation as well.
multiply_high_u64(uint64 x,uint64 y)52 inline uint64 multiply_high_u64(uint64 x, uint64 y) {
53 #if defined(__SIZEOF_INT128__)
54 return (uint64)(((__uint128_t)x * (__uint128_t)y) >> 64);
55 #else
56 // For platforms without int128 support, do it the long way.
57 uint64 x_lo = x & 0xffffffff;
58 uint64 x_hi = x >> 32;
59 uint64 buckets_lo = y & 0xffffffff;
60 uint64 buckets_hi = y >> 32;
61 uint64 prod_hi = x_hi * buckets_hi;
62 uint64 prod_lo = x_lo * buckets_lo;
63 uint64 prod_mid1 = x_hi * buckets_lo;
64 uint64 prod_mid2 = x_lo * buckets_hi;
65 uint64 carry =
66 ((prod_mid1 & 0xffffffff) + (prod_mid2 & 0xffffffff) + (prod_lo >> 32)) >>
67 32;
68 return prod_hi + (prod_mid1 >> 32) + (prod_mid2 >> 32) + carry;
69 #endif
70 }
71 } // namespace presized_cuckoo_map
72
73 template <class value>
74 class PresizedCuckooMap {
75 public:
76 // The key type is fixed as a pre-hashed key for this specialized use.
77 typedef uint64 key_type;
78
PresizedCuckooMap(uint64 num_entries)79 explicit PresizedCuckooMap(uint64 num_entries) { Clear(num_entries); }
80
Clear(uint64 num_entries)81 void Clear(uint64 num_entries) {
82 cpq_.reset(new CuckooPathQueue());
83 double n(num_entries);
84 n /= kLoadFactor;
85 num_buckets_ = (static_cast<uint64>(n) / kSlotsPerBucket);
86 // Very small cuckoo tables don't work, because the probability
87 // of having same-bucket hashes is large. We compromise for those
88 // uses by having a larger static starting size.
89 num_buckets_ += 32;
90 Bucket empty_bucket;
91 for (int i = 0; i < kSlotsPerBucket; i++) {
92 empty_bucket.keys[i] = kUnusedSlot;
93 }
94 buckets_.clear();
95 buckets_.resize(num_buckets_, empty_bucket);
96 }
97
98 // Returns false if k is already in table or if the table
99 // is full; true otherwise.
InsertUnique(const key_type k,const value & v)100 bool InsertUnique(const key_type k, const value& v) {
101 uint64 tk = key_transform(k);
102 uint64 b1 = fast_map_to_buckets(tk);
103 uint64 b2 = fast_map_to_buckets(h2(tk));
104
105 // Merged find and duplicate checking.
106 uint64 target_bucket = 0;
107 int target_slot = kNoSpace;
108
109 for (auto bucket : {b1, b2}) {
110 Bucket* bptr = &buckets_[bucket];
111 for (int slot = 0; slot < kSlotsPerBucket; slot++) {
112 if (bptr->keys[slot] == k) { // Duplicates are not allowed.
113 return false;
114 } else if (target_slot == kNoSpace && bptr->keys[slot] == kUnusedSlot) {
115 target_bucket = bucket;
116 target_slot = slot;
117 }
118 }
119 }
120
121 if (target_slot != kNoSpace) {
122 InsertInternal(tk, v, target_bucket, target_slot);
123 return true;
124 }
125
126 return CuckooInsert(tk, v, b1, b2);
127 }
128
129 // Returns true if found. Sets *out = value.
Find(const key_type k,value * out)130 bool Find(const key_type k, value* out) const {
131 uint64 tk = key_transform(k);
132 return FindInBucket(k, fast_map_to_buckets(tk), out) ||
133 FindInBucket(k, fast_map_to_buckets(h2(tk)), out);
134 }
135
136 // Prefetch memory associated with the key k into cache levels specified by
137 // hint.
138 template <port::PrefetchHint hint = port::PREFETCH_HINT_T0>
PrefetchKey(const key_type k)139 void PrefetchKey(const key_type k) const {
140 const uint64 tk = key_transform(k);
141 port::prefetch<hint>(&buckets_[fast_map_to_buckets(tk)].keys);
142 port::prefetch<hint>(&buckets_[fast_map_to_buckets(h2(tk))].keys);
143 }
144
MemoryUsed()145 int64_t MemoryUsed() const {
146 return sizeof(PresizedCuckooMap<value>) + sizeof(CuckooPathQueue);
147 }
148
149 private:
150 static constexpr int kSlotsPerBucket = 4;
151
152 // The load factor is chosen slightly conservatively for speed and
153 // to avoid the need for a table rebuild on insertion failure.
154 // 0.94 is achievable, but 0.85 is faster and keeps the code simple
155 // at the cost of a small amount of memory.
156 // NOTE: 0 < kLoadFactor <= 1.0
157 static constexpr double kLoadFactor = 0.85;
158
159 // Cuckoo insert: The maximum number of entries to scan should be ~400
160 // (Source: Personal communication with Michael Mitzenmacher; empirical
161 // experiments validate.). After trying 400 candidate locations, declare
162 // the table full - it's probably full of unresolvable cycles. Less than
163 // 400 reduces max occupancy; much more results in very poor performance
164 // around the full point. For (2,4) a max BFS path len of 5 results in ~682
165 // nodes to visit, calculated below, and is a good value.
166
167 static constexpr uint8 kMaxBFSPathLen = 5;
168
169 // Constants for BFS cuckoo path search:
170 // The visited list must be maintained for all but the last level of search
171 // in order to trace back the path. The BFS search has two roots
172 // and each can go to a total depth (including the root) of 5.
173 // The queue must be sized for 2 * \sum_{k=0...4}{kSlotsPerBucket^k} = 682.
174 // The visited queue, however, does not need to hold the deepest level,
175 // and so it is sized 2 * \sum{k=0...3}{kSlotsPerBucket^k} = 170
176 static constexpr int kMaxQueueSize = 682;
177 static constexpr int kVisitedListSize = 170;
178
179 static constexpr int kNoSpace = -1; // SpaceAvailable return
180 static constexpr uint64 kUnusedSlot = ~(0ULL);
181
182 // Buckets are organized with key_types clustered for access speed
183 // and for compactness while remaining aligned.
184 struct Bucket {
185 key_type keys[kSlotsPerBucket];
186 value values[kSlotsPerBucket];
187 };
188
189 // Insert uses the BFS optimization (search before moving) to reduce
190 // the number of cache lines dirtied during search.
191
192 struct CuckooPathEntry {
193 uint64 bucket;
194 int depth;
195 int parent; // To index in the visited array.
196 int parent_slot; // Which slot in our parent did we come from? -1 == root.
197 };
198
199 // CuckooPathQueue is a trivial circular queue for path entries.
200 // The caller is responsible for not inserting more than kMaxQueueSize
201 // entries. Each PresizedCuckooMap has one (heap-allocated) CuckooPathQueue
202 // that it reuses across inserts.
203 class CuckooPathQueue {
204 public:
CuckooPathQueue()205 CuckooPathQueue() : head_(0), tail_(0) {}
206
push_back(CuckooPathEntry e)207 void push_back(CuckooPathEntry e) {
208 queue_[tail_] = e;
209 tail_ = (tail_ + 1) % kMaxQueueSize;
210 }
211
pop_front()212 CuckooPathEntry pop_front() {
213 CuckooPathEntry& e = queue_[head_];
214 head_ = (head_ + 1) % kMaxQueueSize;
215 return e;
216 }
217
empty()218 bool empty() const { return head_ == tail_; }
219
full()220 bool full() const { return ((tail_ + 1) % kMaxQueueSize) == head_; }
221
reset()222 void reset() { head_ = tail_ = 0; }
223
224 private:
225 CuckooPathEntry queue_[kMaxQueueSize];
226 int head_;
227 int tail_;
228 };
229
230 typedef std::array<CuckooPathEntry, kMaxBFSPathLen> CuckooPath;
231
232 // Callers are expected to have pre-hashed the keys into a uint64
233 // and are expected to be able to handle (a very low rate) of
234 // collisions, OR must ensure that their keys are always in
235 // the range 0 - (uint64max - 1). This transforms 'not found flag'
236 // keys into something else.
key_transform(const key_type k)237 inline uint64 key_transform(const key_type k) const {
238 return k + (k == kUnusedSlot);
239 }
240
241 // h2 performs a very quick mix of h to generate the second bucket hash.
242 // Assumes there is plenty of remaining entropy in the initial h.
h2(uint64 h)243 inline uint64 h2(uint64 h) const {
244 const uint64 m = 0xc6a4a7935bd1e995;
245 return m * ((h >> 32) | (h << 32));
246 }
247
248 // alt_bucket identifies the "other" bucket for key k, where
249 // other is "the one that isn't bucket b"
alt_bucket(key_type k,uint64 b)250 inline uint64 alt_bucket(key_type k, uint64 b) const {
251 if (fast_map_to_buckets(k) != b) {
252 return fast_map_to_buckets(k);
253 }
254 return fast_map_to_buckets(h2(k));
255 }
256
InsertInternal(key_type k,const value & v,uint64 b,int slot)257 inline void InsertInternal(key_type k, const value& v, uint64 b, int slot) {
258 Bucket* bptr = &buckets_[b];
259 bptr->keys[slot] = k;
260 bptr->values[slot] = v;
261 }
262
263 // For the associative cuckoo table, check all of the slots in
264 // the bucket to see if the key is present.
FindInBucket(key_type k,uint64 b,value * out)265 bool FindInBucket(key_type k, uint64 b, value* out) const {
266 const Bucket& bref = buckets_[b];
267 for (int i = 0; i < kSlotsPerBucket; i++) {
268 if (bref.keys[i] == k) {
269 *out = bref.values[i];
270 return true;
271 }
272 }
273 return false;
274 }
275
276 // returns either kNoSpace or the index of an
277 // available slot (0 <= slot < kSlotsPerBucket)
SpaceAvailable(uint64 bucket)278 inline int SpaceAvailable(uint64 bucket) const {
279 const Bucket& bref = buckets_[bucket];
280 for (int i = 0; i < kSlotsPerBucket; i++) {
281 if (bref.keys[i] == kUnusedSlot) {
282 return i;
283 }
284 }
285 return kNoSpace;
286 }
287
CopyItem(uint64 src_bucket,int src_slot,uint64 dst_bucket,int dst_slot)288 inline void CopyItem(uint64 src_bucket, int src_slot, uint64 dst_bucket,
289 int dst_slot) {
290 Bucket& src_ref = buckets_[src_bucket];
291 Bucket& dst_ref = buckets_[dst_bucket];
292 dst_ref.keys[dst_slot] = src_ref.keys[src_slot];
293 dst_ref.values[dst_slot] = src_ref.values[src_slot];
294 }
295
CuckooInsert(key_type k,const value & v,uint64 b1,uint64 b2)296 bool CuckooInsert(key_type k, const value& v, uint64 b1, uint64 b2) {
297 int visited_end = 0;
298 cpq_->reset();
299
300 cpq_->push_back({b1, 1, 0, 0}); // Note depth starts at 1.
301 cpq_->push_back({b2, 1, 0, 0});
302
303 while (!cpq_->empty()) {
304 CuckooPathEntry e = cpq_->pop_front();
305 int free_slot;
306 free_slot = SpaceAvailable(e.bucket);
307 if (free_slot != kNoSpace) {
308 while (e.depth > 1) {
309 // "copy" instead of "swap" because one entry is always zero.
310 // After, write target key/value over top of last copied entry.
311 CuckooPathEntry parent = visited_[e.parent];
312 CopyItem(parent.bucket, e.parent_slot, e.bucket, free_slot);
313 free_slot = e.parent_slot;
314 e = parent;
315 }
316 InsertInternal(k, v, e.bucket, free_slot);
317 return true;
318 } else {
319 if (e.depth < (kMaxBFSPathLen)) {
320 auto parent_index = visited_end;
321 visited_[visited_end] = e;
322 visited_end++;
323 // Don't always start with the same slot, to even out the path depth.
324 int start_slot = (k + e.bucket) % kSlotsPerBucket;
325 const Bucket& bref = buckets_[e.bucket];
326 for (int i = 0; i < kSlotsPerBucket; i++) {
327 int slot = (start_slot + i) % kSlotsPerBucket;
328 uint64 next_bucket = alt_bucket(bref.keys[slot], e.bucket);
329 // Optimization: Avoid single-step cycles (from e, don't
330 // add a child node that is actually e's parent).
331 uint64 e_parent_bucket = visited_[e.parent].bucket;
332 if (next_bucket != e_parent_bucket) {
333 cpq_->push_back({next_bucket, e.depth + 1, parent_index, slot});
334 }
335 }
336 }
337 }
338 }
339
340 LOG(WARNING) << "Cuckoo path finding failed: Table too small?";
341 return false;
342 }
343
fast_map_to_buckets(uint64 x)344 inline uint64 fast_map_to_buckets(uint64 x) const {
345 // Map x (uniform in 2^64) to the range [0, num_buckets_ -1]
346 // using Lemire's alternative to modulo reduction:
347 // http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
348 // Instead of x % N, use (x * N) >> 64.
349 return presized_cuckoo_map::multiply_high_u64(x, num_buckets_);
350 }
351
352 // Set upon initialization: num_entries / kLoadFactor / kSlotsPerBucket.
353 uint64 num_buckets_;
354 std::vector<Bucket> buckets_;
355
356 std::unique_ptr<CuckooPathQueue> cpq_;
357 CuckooPathEntry visited_[kVisitedListSize];
358
359 TF_DISALLOW_COPY_AND_ASSIGN(PresizedCuckooMap);
360 };
361
362 } // namespace tensorflow
363
364 #endif // TENSORFLOW_UTIL_PRESIZED_CUCKOO_MAP_H_
365