1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/browser/safe_browsing/prefix_set.h"
6
7 #include <algorithm>
8 #include <math.h>
9
10 #include "base/file_util.h"
11 #include "base/logging.h"
12 #include "base/md5.h"
13 #include "base/metrics/histogram.h"
14
15 namespace {
16
17 // |kMagic| should be reasonably unique, and not match itself across
18 // endianness changes. I generated this value with:
19 // md5 -qs chrome/browser/safe_browsing/prefix_set.cc | colrm 9
20 static uint32 kMagic = 0x864088dd;
21
22 // Current version the code writes out.
23 static uint32 kVersion = 0x1;
24
25 typedef struct {
26 uint32 magic;
27 uint32 version;
28 uint32 index_size;
29 uint32 deltas_size;
30 } FileHeader;
31
32 // For |std::upper_bound()| to find a prefix w/in a vector of pairs.
PrefixLess(const std::pair<SBPrefix,size_t> & a,const std::pair<SBPrefix,size_t> & b)33 bool PrefixLess(const std::pair<SBPrefix,size_t>& a,
34 const std::pair<SBPrefix,size_t>& b) {
35 return a.first < b.first;
36 }
37
38 } // namespace
39
40 namespace safe_browsing {
41
PrefixSet(const std::vector<SBPrefix> & sorted_prefixes)42 PrefixSet::PrefixSet(const std::vector<SBPrefix>& sorted_prefixes) {
43 if (sorted_prefixes.size()) {
44 // Lead with the first prefix.
45 SBPrefix prev_prefix = sorted_prefixes[0];
46 size_t run_length = 0;
47 index_.push_back(std::make_pair(prev_prefix, deltas_.size()));
48
49 // Used to build a checksum from the data used to construct the
50 // structures. Since the data is a bunch of uniform hashes, it
51 // seems reasonable to just xor most of it in, rather than trying
52 // to use a more complicated algorithm.
53 uint32 checksum = static_cast<uint32>(sorted_prefixes[0]);
54 checksum ^= static_cast<uint32>(deltas_.size());
55
56 for (size_t i = 1; i < sorted_prefixes.size(); ++i) {
57 // Skip duplicates.
58 if (sorted_prefixes[i] == prev_prefix)
59 continue;
60
61 // Calculate the delta. |unsigned| is mandatory, because the
62 // sorted_prefixes could be more than INT_MAX apart.
63 DCHECK_GT(sorted_prefixes[i], prev_prefix);
64 const unsigned delta = sorted_prefixes[i] - prev_prefix;
65 const uint16 delta16 = static_cast<uint16>(delta);
66
67 // New index ref if the delta doesn't fit, or if too many
68 // consecutive deltas have been encoded.
69 if (delta != static_cast<unsigned>(delta16) || run_length >= kMaxRun) {
70 checksum ^= static_cast<uint32>(sorted_prefixes[i]);
71 checksum ^= static_cast<uint32>(deltas_.size());
72 index_.push_back(std::make_pair(sorted_prefixes[i], deltas_.size()));
73 run_length = 0;
74 } else {
75 checksum ^= static_cast<uint32>(delta16);
76 // Continue the run of deltas.
77 deltas_.push_back(delta16);
78 DCHECK_EQ(static_cast<unsigned>(deltas_.back()), delta);
79 ++run_length;
80 }
81
82 prev_prefix = sorted_prefixes[i];
83 }
84 checksum_ = checksum;
85 DCHECK(CheckChecksum());
86
87 // Send up some memory-usage stats. Bits because fractional bytes
88 // are weird.
89 const size_t bits_used = index_.size() * sizeof(index_[0]) * CHAR_BIT +
90 deltas_.size() * sizeof(deltas_[0]) * CHAR_BIT;
91 const size_t unique_prefixes = index_.size() + deltas_.size();
92 static const size_t kMaxBitsPerPrefix = sizeof(SBPrefix) * CHAR_BIT;
93 UMA_HISTOGRAM_ENUMERATION("SB2.PrefixSetBitsPerPrefix",
94 bits_used / unique_prefixes,
95 kMaxBitsPerPrefix);
96 }
97 }
98
PrefixSet(std::vector<std::pair<SBPrefix,size_t>> * index,std::vector<uint16> * deltas)99 PrefixSet::PrefixSet(std::vector<std::pair<SBPrefix,size_t> > *index,
100 std::vector<uint16> *deltas) {
101 DCHECK(index && deltas);
102 index_.swap(*index);
103 deltas_.swap(*deltas);
104 }
105
~PrefixSet()106 PrefixSet::~PrefixSet() {}
107
Exists(SBPrefix prefix) const108 bool PrefixSet::Exists(SBPrefix prefix) const {
109 if (index_.empty())
110 return false;
111
112 // Find the first position after |prefix| in |index_|.
113 std::vector<std::pair<SBPrefix,size_t> >::const_iterator
114 iter = std::upper_bound(index_.begin(), index_.end(),
115 std::pair<SBPrefix,size_t>(prefix, 0),
116 PrefixLess);
117
118 // |prefix| comes before anything that's in the set.
119 if (iter == index_.begin())
120 return false;
121
122 // Capture the upper bound of our target entry's deltas.
123 const size_t bound = (iter == index_.end() ? deltas_.size() : iter->second);
124
125 // Back up to the entry our target is in.
126 --iter;
127
128 // All prefixes in |index_| are in the set.
129 SBPrefix current = iter->first;
130 if (current == prefix)
131 return true;
132
133 // Scan forward accumulating deltas while a match is possible.
134 for (size_t di = iter->second; di < bound && current < prefix; ++di) {
135 current += deltas_[di];
136 }
137
138 return current == prefix;
139 }
140
GetPrefixes(std::vector<SBPrefix> * prefixes) const141 void PrefixSet::GetPrefixes(std::vector<SBPrefix>* prefixes) const {
142 for (size_t ii = 0; ii < index_.size(); ++ii) {
143 // The deltas for this |index_| entry run to the next index entry,
144 // or the end of the deltas.
145 const size_t deltas_end =
146 (ii + 1 < index_.size()) ? index_[ii + 1].second : deltas_.size();
147
148 SBPrefix current = index_[ii].first;
149 prefixes->push_back(current);
150 for (size_t di = index_[ii].second; di < deltas_end; ++di) {
151 current += deltas_[di];
152 prefixes->push_back(current);
153 }
154 }
155 }
156
157 // static
LoadFile(const FilePath & filter_name)158 PrefixSet* PrefixSet::LoadFile(const FilePath& filter_name) {
159 int64 size_64;
160 if (!file_util::GetFileSize(filter_name, &size_64))
161 return NULL;
162 if (size_64 < static_cast<int64>(sizeof(FileHeader) + sizeof(MD5Digest)))
163 return NULL;
164
165 file_util::ScopedFILE file(file_util::OpenFile(filter_name, "rb"));
166 if (!file.get())
167 return NULL;
168
169 FileHeader header;
170 size_t read = fread(&header, sizeof(header), 1, file.get());
171 if (read != 1)
172 return NULL;
173
174 if (header.magic != kMagic || header.version != kVersion)
175 return NULL;
176
177 std::vector<std::pair<SBPrefix,size_t> > index;
178 const size_t index_bytes = sizeof(index[0]) * header.index_size;
179
180 std::vector<uint16> deltas;
181 const size_t deltas_bytes = sizeof(deltas[0]) * header.deltas_size;
182
183 // Check for bogus sizes before allocating any space.
184 const size_t expected_bytes =
185 sizeof(header) + index_bytes + deltas_bytes + sizeof(MD5Digest);
186 if (static_cast<int64>(expected_bytes) != size_64)
187 return NULL;
188
189 // The file looks valid, start building the digest.
190 MD5Context context;
191 MD5Init(&context);
192 MD5Update(&context, &header, sizeof(header));
193
194 // Read the index vector. Herb Sutter indicates that vectors are
195 // guaranteed to be contiuguous, so reading to where element 0 lives
196 // is valid.
197 index.resize(header.index_size);
198 read = fread(&(index[0]), sizeof(index[0]), index.size(), file.get());
199 if (read != index.size())
200 return NULL;
201 MD5Update(&context, &(index[0]), index_bytes);
202
203 // Read vector of deltas.
204 deltas.resize(header.deltas_size);
205 read = fread(&(deltas[0]), sizeof(deltas[0]), deltas.size(), file.get());
206 if (read != deltas.size())
207 return NULL;
208 MD5Update(&context, &(deltas[0]), deltas_bytes);
209
210 MD5Digest calculated_digest;
211 MD5Final(&calculated_digest, &context);
212
213 MD5Digest file_digest;
214 read = fread(&file_digest, sizeof(file_digest), 1, file.get());
215 if (read != 1)
216 return NULL;
217
218 if (0 != memcmp(&file_digest, &calculated_digest, sizeof(file_digest)))
219 return NULL;
220
221 // Steals contents of |index| and |deltas| via swap().
222 return new PrefixSet(&index, &deltas);
223 }
224
WriteFile(const FilePath & filter_name) const225 bool PrefixSet::WriteFile(const FilePath& filter_name) const {
226 FileHeader header;
227 header.magic = kMagic;
228 header.version = kVersion;
229 header.index_size = static_cast<uint32>(index_.size());
230 header.deltas_size = static_cast<uint32>(deltas_.size());
231
232 // Sanity check that the 32-bit values never mess things up.
233 if (static_cast<size_t>(header.index_size) != index_.size() ||
234 static_cast<size_t>(header.deltas_size) != deltas_.size()) {
235 NOTREACHED();
236 return false;
237 }
238
239 file_util::ScopedFILE file(file_util::OpenFile(filter_name, "wb"));
240 if (!file.get())
241 return false;
242
243 MD5Context context;
244 MD5Init(&context);
245
246 // TODO(shess): The I/O code in safe_browsing_store_file.cc would
247 // sure be useful about now.
248 size_t written = fwrite(&header, sizeof(header), 1, file.get());
249 if (written != 1)
250 return false;
251 MD5Update(&context, &header, sizeof(header));
252
253 // As for reads, the standard guarantees the ability to access the
254 // contents of the vector by a pointer to an element.
255 const size_t index_bytes = sizeof(index_[0]) * index_.size();
256 written = fwrite(&(index_[0]), sizeof(index_[0]), index_.size(), file.get());
257 if (written != index_.size())
258 return false;
259 MD5Update(&context, &(index_[0]), index_bytes);
260
261 const size_t deltas_bytes = sizeof(deltas_[0]) * deltas_.size();
262 written = fwrite(&(deltas_[0]), sizeof(deltas_[0]), deltas_.size(),
263 file.get());
264 if (written != deltas_.size())
265 return false;
266 MD5Update(&context, &(deltas_[0]), deltas_bytes);
267
268 MD5Digest digest;
269 MD5Final(&digest, &context);
270 written = fwrite(&digest, sizeof(digest), 1, file.get());
271 if (written != 1)
272 return false;
273
274 // TODO(shess): Can this code check that the close was successful?
275 file.reset();
276
277 return true;
278 }
279
IndexBinFor(size_t target_index) const280 size_t PrefixSet::IndexBinFor(size_t target_index) const {
281 // The items in |index_| have the logical index of each previous
282 // item in |index_| plus the count of deltas between the items.
283 // Since the indices into |deltas_| are absolute, the logical index
284 // is then the sum of the two indices.
285 size_t lo = 0;
286 size_t hi = index_.size();
287
288 // Binary search because linear search was too slow (really, the
289 // unit test sucked). Inline because the elements can't be compared
290 // in isolation (their position matters).
291 while (hi - lo > 1) {
292 const size_t i = (lo + hi) / 2;
293
294 if (target_index < i + index_[i].second) {
295 DCHECK_LT(i, hi); // Always making progress.
296 hi = i;
297 } else {
298 DCHECK_GT(i, lo); // Always making progress.
299 lo = i;
300 }
301 }
302 return lo;
303 }
304
GetSize() const305 size_t PrefixSet::GetSize() const {
306 return index_.size() + deltas_.size();
307 }
308
IsDeltaAt(size_t target_index) const309 bool PrefixSet::IsDeltaAt(size_t target_index) const {
310 CHECK_LT(target_index, GetSize());
311
312 const size_t i = IndexBinFor(target_index);
313 return target_index > i + index_[i].second;
314 }
315
DeltaAt(size_t target_index) const316 uint16 PrefixSet::DeltaAt(size_t target_index) const {
317 CHECK_LT(target_index, GetSize());
318
319 // Find the |index_| entry which contains |target_index|.
320 const size_t i = IndexBinFor(target_index);
321
322 // Exactly on the |index_| entry means no delta.
323 CHECK_GT(target_index, i + index_[i].second);
324
325 // -i backs out the |index_| entries, -1 gets the delta that lead to
326 // the value at |target_index|.
327 CHECK_LT(target_index - i - 1, deltas_.size());
328 return deltas_[target_index - i - 1];
329 }
330
CheckChecksum() const331 bool PrefixSet::CheckChecksum() const {
332 uint32 checksum = 0;
333
334 for (size_t ii = 0; ii < index_.size(); ++ii) {
335 checksum ^= static_cast<uint32>(index_[ii].first);
336 checksum ^= static_cast<uint32>(index_[ii].second);
337 }
338
339 for (size_t di = 0; di < deltas_.size(); ++di) {
340 checksum ^= static_cast<uint32>(deltas_[di]);
341 }
342
343 return checksum == checksum_;
344 }
345
346 } // namespace safe_browsing
347