1 //===--- OnDiskHashTable.h - On-Disk Hash Table Implementation --*- C++ -*-===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file defines facilities for reading and writing on-disk hash
11 // tables.
12 //
13 //===----------------------------------------------------------------------===//
14 #ifndef LLVM_CLANG_BASIC_ON_DISK_HASH_TABLE_H
15 #define LLVM_CLANG_BASIC_ON_DISK_HASH_TABLE_H
16
17 #include "llvm/Support/Allocator.h"
18 #include "llvm/Support/DataTypes.h"
19 #include "llvm/Support/MathExtras.h"
20 #include "llvm/Support/raw_ostream.h"
21 #include "llvm/Support/Host.h"
22 #include <cassert>
23 #include <cstdlib>
24
25 namespace clang {
26
27 namespace io {
28
29 typedef uint32_t Offset;
30
Emit8(llvm::raw_ostream & Out,uint32_t V)31 inline void Emit8(llvm::raw_ostream& Out, uint32_t V) {
32 Out << (unsigned char)(V);
33 }
34
Emit16(llvm::raw_ostream & Out,uint32_t V)35 inline void Emit16(llvm::raw_ostream& Out, uint32_t V) {
36 Out << (unsigned char)(V);
37 Out << (unsigned char)(V >> 8);
38 assert((V >> 16) == 0);
39 }
40
Emit24(llvm::raw_ostream & Out,uint32_t V)41 inline void Emit24(llvm::raw_ostream& Out, uint32_t V) {
42 Out << (unsigned char)(V);
43 Out << (unsigned char)(V >> 8);
44 Out << (unsigned char)(V >> 16);
45 assert((V >> 24) == 0);
46 }
47
Emit32(llvm::raw_ostream & Out,uint32_t V)48 inline void Emit32(llvm::raw_ostream& Out, uint32_t V) {
49 Out << (unsigned char)(V);
50 Out << (unsigned char)(V >> 8);
51 Out << (unsigned char)(V >> 16);
52 Out << (unsigned char)(V >> 24);
53 }
54
Emit64(llvm::raw_ostream & Out,uint64_t V)55 inline void Emit64(llvm::raw_ostream& Out, uint64_t V) {
56 Out << (unsigned char)(V);
57 Out << (unsigned char)(V >> 8);
58 Out << (unsigned char)(V >> 16);
59 Out << (unsigned char)(V >> 24);
60 Out << (unsigned char)(V >> 32);
61 Out << (unsigned char)(V >> 40);
62 Out << (unsigned char)(V >> 48);
63 Out << (unsigned char)(V >> 56);
64 }
65
Pad(llvm::raw_ostream & Out,unsigned A)66 inline void Pad(llvm::raw_ostream& Out, unsigned A) {
67 Offset off = (Offset) Out.tell();
68 uint32_t n = ((uintptr_t)(off+A-1) & ~(uintptr_t)(A-1)) - off;
69 for (; n ; --n)
70 Emit8(Out, 0);
71 }
72
ReadUnalignedLE16(const unsigned char * & Data)73 inline uint16_t ReadUnalignedLE16(const unsigned char *&Data) {
74 uint16_t V = ((uint16_t)Data[0]) |
75 ((uint16_t)Data[1] << 8);
76 Data += 2;
77 return V;
78 }
79
ReadUnalignedLE32(const unsigned char * & Data)80 inline uint32_t ReadUnalignedLE32(const unsigned char *&Data) {
81 uint32_t V = ((uint32_t)Data[0]) |
82 ((uint32_t)Data[1] << 8) |
83 ((uint32_t)Data[2] << 16) |
84 ((uint32_t)Data[3] << 24);
85 Data += 4;
86 return V;
87 }
88
ReadUnalignedLE64(const unsigned char * & Data)89 inline uint64_t ReadUnalignedLE64(const unsigned char *&Data) {
90 uint64_t V = ((uint64_t)Data[0]) |
91 ((uint64_t)Data[1] << 8) |
92 ((uint64_t)Data[2] << 16) |
93 ((uint64_t)Data[3] << 24) |
94 ((uint64_t)Data[4] << 32) |
95 ((uint64_t)Data[5] << 40) |
96 ((uint64_t)Data[6] << 48) |
97 ((uint64_t)Data[7] << 56);
98 Data += 8;
99 return V;
100 }
101
ReadLE32(const unsigned char * & Data)102 inline uint32_t ReadLE32(const unsigned char *&Data) {
103 // Hosts that directly support little-endian 32-bit loads can just
104 // use them. Big-endian hosts need a bswap.
105 uint32_t V = *((uint32_t*)Data);
106 if (llvm::sys::isBigEndianHost())
107 V = llvm::ByteSwap_32(V);
108 Data += 4;
109 return V;
110 }
111
112 } // end namespace io
113
114 template<typename Info>
115 class OnDiskChainedHashTableGenerator {
116 unsigned NumBuckets;
117 unsigned NumEntries;
118 llvm::BumpPtrAllocator BA;
119
120 class Item {
121 public:
122 typename Info::key_type key;
123 typename Info::data_type data;
124 Item *next;
125 const uint32_t hash;
126
Item(typename Info::key_type_ref k,typename Info::data_type_ref d,Info & InfoObj)127 Item(typename Info::key_type_ref k, typename Info::data_type_ref d,
128 Info &InfoObj)
129 : key(k), data(d), next(0), hash(InfoObj.ComputeHash(k)) {}
130 };
131
132 class Bucket {
133 public:
134 io::Offset off;
135 Item* head;
136 unsigned length;
137
Bucket()138 Bucket() {}
139 };
140
141 Bucket* Buckets;
142
143 private:
insert(Bucket * b,size_t size,Item * E)144 void insert(Bucket* b, size_t size, Item* E) {
145 unsigned idx = E->hash & (size - 1);
146 Bucket& B = b[idx];
147 E->next = B.head;
148 ++B.length;
149 B.head = E;
150 }
151
resize(size_t newsize)152 void resize(size_t newsize) {
153 Bucket* newBuckets = (Bucket*) std::calloc(newsize, sizeof(Bucket));
154 // Populate newBuckets with the old entries.
155 for (unsigned i = 0; i < NumBuckets; ++i)
156 for (Item* E = Buckets[i].head; E ; ) {
157 Item* N = E->next;
158 E->next = 0;
159 insert(newBuckets, newsize, E);
160 E = N;
161 }
162
163 free(Buckets);
164 NumBuckets = newsize;
165 Buckets = newBuckets;
166 }
167
168 public:
169
insert(typename Info::key_type_ref key,typename Info::data_type_ref data)170 void insert(typename Info::key_type_ref key,
171 typename Info::data_type_ref data) {
172 Info InfoObj;
173 insert(key, data, InfoObj);
174 }
175
insert(typename Info::key_type_ref key,typename Info::data_type_ref data,Info & InfoObj)176 void insert(typename Info::key_type_ref key,
177 typename Info::data_type_ref data, Info &InfoObj) {
178
179 ++NumEntries;
180 if (4*NumEntries >= 3*NumBuckets) resize(NumBuckets*2);
181 insert(Buckets, NumBuckets, new (BA.Allocate<Item>()) Item(key, data,
182 InfoObj));
183 }
184
Emit(llvm::raw_ostream & out)185 io::Offset Emit(llvm::raw_ostream &out) {
186 Info InfoObj;
187 return Emit(out, InfoObj);
188 }
189
Emit(llvm::raw_ostream & out,Info & InfoObj)190 io::Offset Emit(llvm::raw_ostream &out, Info &InfoObj) {
191 using namespace clang::io;
192
193 // Emit the payload of the table.
194 for (unsigned i = 0; i < NumBuckets; ++i) {
195 Bucket& B = Buckets[i];
196 if (!B.head) continue;
197
198 // Store the offset for the data of this bucket.
199 B.off = out.tell();
200 assert(B.off && "Cannot write a bucket at offset 0. Please add padding.");
201
202 // Write out the number of items in the bucket.
203 Emit16(out, B.length);
204
205 // Write out the entries in the bucket.
206 for (Item *I = B.head; I ; I = I->next) {
207 Emit32(out, I->hash);
208 const std::pair<unsigned, unsigned>& Len =
209 InfoObj.EmitKeyDataLength(out, I->key, I->data);
210 InfoObj.EmitKey(out, I->key, Len.first);
211 InfoObj.EmitData(out, I->key, I->data, Len.second);
212 }
213 }
214
215 // Emit the hashtable itself.
216 Pad(out, 4);
217 io::Offset TableOff = out.tell();
218 Emit32(out, NumBuckets);
219 Emit32(out, NumEntries);
220 for (unsigned i = 0; i < NumBuckets; ++i) Emit32(out, Buckets[i].off);
221
222 return TableOff;
223 }
224
OnDiskChainedHashTableGenerator()225 OnDiskChainedHashTableGenerator() {
226 NumEntries = 0;
227 NumBuckets = 64;
228 // Note that we do not need to run the constructors of the individual
229 // Bucket objects since 'calloc' returns bytes that are all 0.
230 Buckets = (Bucket*) std::calloc(NumBuckets, sizeof(Bucket));
231 }
232
~OnDiskChainedHashTableGenerator()233 ~OnDiskChainedHashTableGenerator() {
234 std::free(Buckets);
235 }
236 };
237
238 template<typename Info>
239 class OnDiskChainedHashTable {
240 const unsigned NumBuckets;
241 const unsigned NumEntries;
242 const unsigned char* const Buckets;
243 const unsigned char* const Base;
244 Info InfoObj;
245
246 public:
247 typedef typename Info::internal_key_type internal_key_type;
248 typedef typename Info::external_key_type external_key_type;
249 typedef typename Info::data_type data_type;
250
251 OnDiskChainedHashTable(unsigned numBuckets, unsigned numEntries,
252 const unsigned char* buckets,
253 const unsigned char* base,
254 const Info &InfoObj = Info())
NumBuckets(numBuckets)255 : NumBuckets(numBuckets), NumEntries(numEntries),
256 Buckets(buckets), Base(base), InfoObj(InfoObj) {
257 assert((reinterpret_cast<uintptr_t>(buckets) & 0x3) == 0 &&
258 "'buckets' must have a 4-byte alignment");
259 }
260
getNumBuckets()261 unsigned getNumBuckets() const { return NumBuckets; }
getNumEntries()262 unsigned getNumEntries() const { return NumEntries; }
getBase()263 const unsigned char* getBase() const { return Base; }
getBuckets()264 const unsigned char* getBuckets() const { return Buckets; }
265
isEmpty()266 bool isEmpty() const { return NumEntries == 0; }
267
268 class iterator {
269 internal_key_type key;
270 const unsigned char* const data;
271 const unsigned len;
272 Info *InfoObj;
273 public:
iterator()274 iterator() : data(0), len(0) {}
iterator(const internal_key_type k,const unsigned char * d,unsigned l,Info * InfoObj)275 iterator(const internal_key_type k, const unsigned char* d, unsigned l,
276 Info *InfoObj)
277 : key(k), data(d), len(l), InfoObj(InfoObj) {}
278
279 data_type operator*() const { return InfoObj->ReadData(key, data, len); }
280 bool operator==(const iterator& X) const { return X.data == data; }
281 bool operator!=(const iterator& X) const { return X.data != data; }
282 };
283
284 iterator find(const external_key_type& eKey, Info *InfoPtr = 0) {
285 if (!InfoPtr)
286 InfoPtr = &InfoObj;
287
288 using namespace io;
289 const internal_key_type& iKey = InfoObj.GetInternalKey(eKey);
290 unsigned key_hash = InfoObj.ComputeHash(iKey);
291
292 // Each bucket is just a 32-bit offset into the hash table file.
293 unsigned idx = key_hash & (NumBuckets - 1);
294 const unsigned char* Bucket = Buckets + sizeof(uint32_t)*idx;
295
296 unsigned offset = ReadLE32(Bucket);
297 if (offset == 0) return iterator(); // Empty bucket.
298 const unsigned char* Items = Base + offset;
299
300 // 'Items' starts with a 16-bit unsigned integer representing the
301 // number of items in this bucket.
302 unsigned len = ReadUnalignedLE16(Items);
303
304 for (unsigned i = 0; i < len; ++i) {
305 // Read the hash.
306 uint32_t item_hash = ReadUnalignedLE32(Items);
307
308 // Determine the length of the key and the data.
309 const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Items);
310 unsigned item_len = L.first + L.second;
311
312 // Compare the hashes. If they are not the same, skip the entry entirely.
313 if (item_hash != key_hash) {
314 Items += item_len;
315 continue;
316 }
317
318 // Read the key.
319 const internal_key_type& X =
320 InfoPtr->ReadKey((const unsigned char* const) Items, L.first);
321
322 // If the key doesn't match just skip reading the value.
323 if (!InfoPtr->EqualKey(X, iKey)) {
324 Items += item_len;
325 continue;
326 }
327
328 // The key matches!
329 return iterator(X, Items + L.first, L.second, InfoPtr);
330 }
331
332 return iterator();
333 }
334
end()335 iterator end() const { return iterator(); }
336
337 /// \brief Iterates over all of the keys in the table.
338 class key_iterator {
339 const unsigned char* Ptr;
340 unsigned NumItemsInBucketLeft;
341 unsigned NumEntriesLeft;
342 Info *InfoObj;
343 public:
344 typedef external_key_type value_type;
345
key_iterator(const unsigned char * const Ptr,unsigned NumEntries,Info * InfoObj)346 key_iterator(const unsigned char* const Ptr, unsigned NumEntries,
347 Info *InfoObj)
348 : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries),
349 InfoObj(InfoObj) { }
key_iterator()350 key_iterator()
351 : Ptr(0), NumItemsInBucketLeft(0), NumEntriesLeft(0), InfoObj(0) { }
352
353 friend bool operator==(const key_iterator &X, const key_iterator &Y) {
354 return X.NumEntriesLeft == Y.NumEntriesLeft;
355 }
356 friend bool operator!=(const key_iterator& X, const key_iterator &Y) {
357 return X.NumEntriesLeft != Y.NumEntriesLeft;
358 }
359
360 key_iterator& operator++() { // Preincrement
361 if (!NumItemsInBucketLeft) {
362 // 'Items' starts with a 16-bit unsigned integer representing the
363 // number of items in this bucket.
364 NumItemsInBucketLeft = io::ReadUnalignedLE16(Ptr);
365 }
366 Ptr += 4; // Skip the hash.
367 // Determine the length of the key and the data.
368 const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Ptr);
369 Ptr += L.first + L.second;
370 assert(NumItemsInBucketLeft);
371 --NumItemsInBucketLeft;
372 assert(NumEntriesLeft);
373 --NumEntriesLeft;
374 return *this;
375 }
376 key_iterator operator++(int) { // Postincrement
377 key_iterator tmp = *this; ++*this; return tmp;
378 }
379
380 value_type operator*() const {
381 const unsigned char* LocalPtr = Ptr;
382 if (!NumItemsInBucketLeft)
383 LocalPtr += 2; // number of items in bucket
384 LocalPtr += 4; // Skip the hash.
385
386 // Determine the length of the key and the data.
387 const std::pair<unsigned, unsigned>& L
388 = Info::ReadKeyDataLength(LocalPtr);
389
390 // Read the key.
391 const internal_key_type& Key = InfoObj->ReadKey(LocalPtr, L.first);
392 return InfoObj->GetExternalKey(Key);
393 }
394 };
395
key_begin()396 key_iterator key_begin() {
397 return key_iterator(Base + 4, getNumEntries(), &InfoObj);
398 }
key_end()399 key_iterator key_end() { return key_iterator(); }
400
401 /// \brief Iterates over all the entries in the table, returning
402 /// a key/data pair.
403 class item_iterator {
404 const unsigned char* Ptr;
405 unsigned NumItemsInBucketLeft;
406 unsigned NumEntriesLeft;
407 Info *InfoObj;
408 public:
409 typedef std::pair<external_key_type, data_type> value_type;
410
item_iterator(const unsigned char * const Ptr,unsigned NumEntries,Info * InfoObj)411 item_iterator(const unsigned char* const Ptr, unsigned NumEntries,
412 Info *InfoObj)
413 : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries),
414 InfoObj(InfoObj) { }
item_iterator()415 item_iterator()
416 : Ptr(0), NumItemsInBucketLeft(0), NumEntriesLeft(0), InfoObj(0) { }
417
418 bool operator==(const item_iterator& X) const {
419 return X.NumEntriesLeft == NumEntriesLeft;
420 }
421 bool operator!=(const item_iterator& X) const {
422 return X.NumEntriesLeft != NumEntriesLeft;
423 }
424
425 item_iterator& operator++() { // Preincrement
426 if (!NumItemsInBucketLeft) {
427 // 'Items' starts with a 16-bit unsigned integer representing the
428 // number of items in this bucket.
429 NumItemsInBucketLeft = io::ReadUnalignedLE16(Ptr);
430 }
431 Ptr += 4; // Skip the hash.
432 // Determine the length of the key and the data.
433 const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Ptr);
434 Ptr += L.first + L.second;
435 assert(NumItemsInBucketLeft);
436 --NumItemsInBucketLeft;
437 assert(NumEntriesLeft);
438 --NumEntriesLeft;
439 return *this;
440 }
441 item_iterator operator++(int) { // Postincrement
442 item_iterator tmp = *this; ++*this; return tmp;
443 }
444
445 value_type operator*() const {
446 const unsigned char* LocalPtr = Ptr;
447 if (!NumItemsInBucketLeft)
448 LocalPtr += 2; // number of items in bucket
449 LocalPtr += 4; // Skip the hash.
450
451 // Determine the length of the key and the data.
452 const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(LocalPtr);
453
454 // Read the key.
455 const internal_key_type& Key =
456 InfoObj->ReadKey(LocalPtr, L.first);
457 return std::make_pair(InfoObj->GetExternalKey(Key),
458 InfoObj->ReadData(Key, LocalPtr + L.first, L.second));
459 }
460 };
461
item_begin()462 item_iterator item_begin() {
463 return item_iterator(Base + 4, getNumEntries(), &InfoObj);
464 }
item_end()465 item_iterator item_end() { return item_iterator(); }
466
467 static OnDiskChainedHashTable* Create(const unsigned char* buckets,
468 const unsigned char* const base,
469 const Info &InfoObj = Info()) {
470 using namespace io;
471 assert(buckets > base);
472 assert((reinterpret_cast<uintptr_t>(buckets) & 0x3) == 0 &&
473 "buckets should be 4-byte aligned.");
474
475 unsigned numBuckets = ReadLE32(buckets);
476 unsigned numEntries = ReadLE32(buckets);
477 return new OnDiskChainedHashTable<Info>(numBuckets, numEntries, buckets,
478 base, InfoObj);
479 }
480 };
481
482 } // end namespace clang
483
484 #endif
485