1 // Copyright (c) 2020 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "gn/string_atom.h"
6
7 #include <array>
8 #include <memory>
9 #include <mutex>
10 #include <set>
11 #include <string>
12 #include <unordered_set>
13 #include <vector>
14
15 #include "base/containers/flat_set.h"
16
17 namespace {
18
19 // Implementation note:
20 //
21 // StringAtomSet implements the global shared state, which is:
22 //
23 // - a group of std::string instances with a persistent address, allocated
24 // through a fast slab allocator.
25 //
26 // - a set of string pointers, corresponding to the known strings in the
27 // group.
28 //
29 // - a mutex to ensure correct thread-safety.
30 //
31 // - a find() method that takes an std::string_view argument, and uses it
32 // to find a matching entry in the string tree. If none is available,
33 // a new std::string is allocated and its address inserted into the tree
34 // before being returned.
35 //
36 // Because the mutex is a large bottleneck, each thread implements
37 // its own local string pointer cache, and will only call StringAtomSet::find()
38 // in case of a lookup miss. This is critical for good performance.
39 //
40
41 static const std::string kEmptyString;
42
43 using KeyType = const std::string*;
44
45 // This is a trivial hash table of string pointers, using open addressing.
46 // It is faster in practice than using a standard container or even a
47 // base::flat_set<>.
48 //
49 // Usage is the following:
50 //
51 // 1) Compute string hash value.
52 //
53 // 2) Call Lookup() with the hash value and the string_view key,
54 // this always returns a mutable Node* pointer, say |node|.
55 //
56 // 3) If |node->key| is not nullptr, this is the key to use.
57 // Otherwise, allocate a new string with persistent address,
58 // and call Insert(), passing the |node|, |hash| and new string
59 // address as arguments.
60 //
61 struct KeySet {
62 struct Node {
63 size_t hash = 0;
64 KeyType key = nullptr;
65 };
66
67 // Compute hash for |str|. Replace with faster hash function if available.
Hash__anon30f874f30111::KeySet68 static size_t Hash(std::string_view str) {
69 return std::hash<std::string_view>()(str);
70 }
71
72 // Lookup for |str| with specific |hash| value.
73 // Return a Node pointer. If the key was found, |node.key| is its value.
74 // Otherwise, the caller should create a new key value, then call Insert()
75 // below.
76 //
77 // NOTE: Even though this method is const, because it doesn't modify the
78 // state of the KeySet, it returns a *mutable* node pointer, to be
79 // passed to Insert() in case of a miss.
80 //
Lookup__anon30f874f30111::KeySet81 Node* Lookup(size_t hash, std::string_view str) const {
82 size_t index = hash & (size_ - 1);
83 const Node* nodes = &buckets_[0];
84 const Node* nodes_limit = nodes + size_;
85 const Node* node = nodes + index;
86 for (;;) {
87 if (!node->key || (node->hash == hash && *node->key == str))
88 return const_cast<Node*>(node);
89 if (++node == nodes_limit)
90 node = nodes;
91 }
92 }
93
94 // Insert a new key in this set. |node| must be a value returned by
95 // a previous Lookup() call. |hash| is the hash value for |key|.
Insert__anon30f874f30111::KeySet96 void Insert(Node* node, size_t hash, KeyType key) {
97 node->hash = hash;
98 node->key = key;
99 count_ += 1;
100 if (count_ * 4 >= size_ * 3) // 75% max load factor
101 GrowBuckets();
102 }
103
GrowBuckets__anon30f874f30111::KeySet104 void GrowBuckets() {
105 size_t size = buckets_.size();
106 size_t new_size = size * 2;
107 size_t new_mask = new_size - 1;
108
109 std::vector<Node> new_buckets;
110 new_buckets.resize(new_size);
111 for (const Node& node : buckets_) {
112 size_t index = node.hash & new_mask;
113 for (;;) {
114 Node& node2 = new_buckets[index];
115 if (!node2.key) {
116 node2 = node;
117 break;
118 }
119 index = (index + 1) & new_mask;
120 }
121 }
122 buckets_ = std::move(new_buckets);
123 size_ = new_size;
124 }
125
126 size_t size_ = 2;
127 size_t count_ = 0;
128 std::vector<Node> buckets_ = {Node{}, Node{}};
129 };
130
131 class StringAtomSet {
132 public:
StringAtomSet()133 StringAtomSet() {
134 // Ensure kEmptyString is in our set while not being allocated
135 // from a slab. The end result is that find("") should always
136 // return this address.
137 //
138 // This allows the StringAtom() default initializer to use the same
139 // address directly, avoiding a table lookup.
140 //
141 size_t hash = set_.Hash("");
142 auto* node = set_.Lookup(hash, "");
143 set_.Insert(node, hash, &kEmptyString);
144 }
145
146 // Find the unique constant string pointer for |key|.
find(std::string_view key)147 const std::string* find(std::string_view key) {
148 std::lock_guard<std::mutex> lock(mutex_);
149 size_t hash = set_.Hash(key);
150 auto* node = set_.Lookup(hash, key);
151 if (node->key)
152 return node->key;
153
154 // Allocate new string, insert its address in the set.
155 if (slab_index_ >= kStringsPerSlab) {
156 slabs_.push_back(new Slab());
157 slab_index_ = 0;
158 }
159 std::string* result = slabs_.back()->init(slab_index_++, key);
160 set_.Insert(node, hash, result);
161 return result;
162 }
163
164 private:
165 static constexpr unsigned int kStringsPerSlab = 128;
166
167 // Each slab is allocated independently, has a fixed address and stores
168 // kStringsPerSlab items of type StringStorage. The latter has the same
169 // size and alignment as std::string, but doesn't need default-initialization.
170 // This is used to slightly speed up Slab allocation and string
171 // initialization. The drawback is that on process exit, allocated strings
172 // are leaked (but GN already leaks several hundred MiBs of memory anyway).
173
174 // A C++ union that can store an std::string but without default
175 // initialization and destruction.
176 union StringStorage {
StringStorage()177 StringStorage() {}
~StringStorage()178 ~StringStorage() {}
179 char dummy;
180 std::string str;
181 };
182
183 // A fixed array of StringStorage items. Can be allocated cheaply.
184 class Slab {
185 public:
186 // Init the n-th string in the slab with |str|.
187 // Return its location as well.
init(size_t index,const std::string_view & str)188 std::string* init(size_t index, const std::string_view& str) {
189 std::string* result = &items_[index].str;
190 new (result) std::string(str);
191 return result;
192 }
193
194 private:
195 StringStorage items_[kStringsPerSlab];
196 };
197
198 std::mutex mutex_;
199 KeySet set_;
200 std::vector<Slab*> slabs_;
201 unsigned int slab_index_ = kStringsPerSlab;
202 };
203
GetStringAtomSet()204 StringAtomSet& GetStringAtomSet() {
205 static StringAtomSet s_string_atom_set;
206 return s_string_atom_set;
207 }
208
209 // Each thread maintains its own ThreadLocalCache to perform fast lookups
210 // without taking any mutex in most cases.
211 class ThreadLocalCache {
212 public:
213 // Find the unique constant string pointer for |key| in this cache,
214 // and fallback to the global one in case of a miss.
find(std::string_view key)215 KeyType find(std::string_view key) {
216 size_t hash = local_set_.Hash(key);
217 auto* node = local_set_.Lookup(hash, key);
218 if (node->key)
219 return node->key;
220
221 KeyType result = GetStringAtomSet().find(key);
222 local_set_.Insert(node, hash, result);
223 return result;
224 }
225
226 private:
227 KeySet local_set_;
228 };
229
230 thread_local ThreadLocalCache s_local_cache;
231
232 } // namespace
233
StringAtom()234 StringAtom::StringAtom() : value_(kEmptyString) {}
235
StringAtom(std::string_view str)236 StringAtom::StringAtom(std::string_view str) noexcept
237 : value_(*s_local_cache.find(str)) {}
238