1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/browser/safe_browsing/safe_browsing_store_file.h"
6
7 #include "base/files/file_util.h"
8 #include "base/files/scoped_file.h"
9 #include "base/md5.h"
10 #include "base/metrics/histogram.h"
11 #include "base/metrics/sparse_histogram.h"
12
13 namespace {
14
15 // NOTE(shess): kFileMagic should not be a byte-wise palindrome, so
16 // that byte-order changes force corruption.
17 const int32 kFileMagic = 0x600D71FE;
18
19 // Version history:
20 // Version 6: aad08754/r2814 by erikkay@google.com on 2008-10-02 (sqlite)
21 // Version 7: 6afe28a5/r37435 by shess@chromium.org on 2010-01-28
22 // Version 8: d3dd0715/r259791 by shess@chromium.org on 2014-03-27
23 const int32 kFileVersion = 8;
24
25 // ReadAndVerifyHeader() returns this in case of error.
26 const int32 kInvalidVersion = -1;
27
28 // Starting with version 8, the storage is sorted and can be sharded to allow
29 // updates to be done with lower memory requirements. Newly written files will
30 // be sharded to need less than this amount of memory during update. Larger
31 // values are preferred to minimize looping overhead during processing.
32 const int64 kUpdateStorageBytes = 100 * 1024;
33
34 // Prevent excessive sharding by setting a lower limit on the shard stride.
35 // Smaller values should work fine, but very small values will probably lead to
36 // poor performance. Shard stride is indirectly related to
37 // |kUpdateStorageBytes|, setting that very small will bump against this.
38 const uint32 kMinShardStride = 1 << 24;
39
40 // Strides over the entire SBPrefix space.
41 const uint64 kMaxShardStride = 1ULL << 32;
42
43 // Maximum SBPrefix value.
44 const SBPrefix kMaxSBPrefix = 0xFFFFFFFF;
45
46 // Header at the front of the main database file.
47 struct FileHeader {
48 int32 magic, version;
49 uint32 add_chunk_count, sub_chunk_count;
50 uint32 shard_stride;
51 // TODO(shess): Is this where 64-bit will bite me? Perhaps write a
52 // specialized read/write?
53 };
54
55 // Header for each chunk in the chunk-accumulation file.
56 struct ChunkHeader {
57 uint32 add_prefix_count, sub_prefix_count;
58 uint32 add_hash_count, sub_hash_count;
59 };
60
61 // Header for each shard of data in the main database file.
62 struct ShardHeader {
63 uint32 add_prefix_count, sub_prefix_count;
64 uint32 add_hash_count, sub_hash_count;
65 };
66
67 // Enumerate different format-change events for histogramming
68 // purposes. DO NOT CHANGE THE ORDERING OF THESE VALUES.
69 enum FormatEventType {
70 // Corruption detected, broken down by file format.
71 FORMAT_EVENT_FILE_CORRUPT,
72 FORMAT_EVENT_SQLITE_CORRUPT, // Obsolete
73
74 // The type of format found in the file. The expected case (new
75 // file format) is intentionally not covered.
76 FORMAT_EVENT_FOUND_SQLITE, // Obsolete
77 FORMAT_EVENT_FOUND_UNKNOWN, // magic does not match.
78
79 // The number of SQLite-format files deleted should be the same as
80 // FORMAT_EVENT_FOUND_SQLITE. It can differ if the delete fails,
81 // or if a failure prevents the update from succeeding.
82 FORMAT_EVENT_SQLITE_DELETED, // Obsolete
83 FORMAT_EVENT_SQLITE_DELETE_FAILED, // Obsolete
84
85 // Found and deleted (or failed to delete) the ancient "Safe
86 // Browsing" file.
87 FORMAT_EVENT_DELETED_ORIGINAL, // Obsolete
88 FORMAT_EVENT_DELETED_ORIGINAL_FAILED, // Obsolete
89
90 // The checksum did not check out in CheckValidity() or in
91 // FinishUpdate(). This most likely indicates that the machine
92 // crashed before the file was fully sync'ed to disk.
93 FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE,
94 FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE,
95
96 // The header checksum was incorrect in ReadAndVerifyHeader(). Likely
97 // indicates that the system crashed while writing an update.
98 FORMAT_EVENT_HEADER_CHECKSUM_FAILURE,
99
100 FORMAT_EVENT_FOUND_DEPRECATED, // version too old.
101
102 // Memory space for histograms is determined by the max. ALWAYS
103 // ADD NEW VALUES BEFORE THIS ONE.
104 FORMAT_EVENT_MAX
105 };
106
RecordFormatEvent(FormatEventType event_type)107 void RecordFormatEvent(FormatEventType event_type) {
108 UMA_HISTOGRAM_ENUMERATION("SB2.FormatEvent", event_type, FORMAT_EVENT_MAX);
109 }
110
111 // Rewind the file. Using fseek(2) because rewind(3) errors are
112 // weird.
FileRewind(FILE * fp)113 bool FileRewind(FILE* fp) {
114 int rv = fseek(fp, 0, SEEK_SET);
115 DCHECK_EQ(rv, 0);
116 return rv == 0;
117 }
118
119 // Read from |fp| into |item|, and fold the input data into the
120 // checksum in |context|, if non-NULL. Return true on success.
121 template <class T>
ReadItem(T * item,FILE * fp,base::MD5Context * context)122 bool ReadItem(T* item, FILE* fp, base::MD5Context* context) {
123 const size_t ret = fread(item, sizeof(T), 1, fp);
124 if (ret != 1)
125 return false;
126
127 if (context) {
128 base::MD5Update(context,
129 base::StringPiece(reinterpret_cast<char*>(item),
130 sizeof(T)));
131 }
132 return true;
133 }
134
135 // Write |item| to |fp|, and fold the output data into the checksum in
136 // |context|, if non-NULL. Return true on success.
137 template <class T>
WriteItem(const T & item,FILE * fp,base::MD5Context * context)138 bool WriteItem(const T& item, FILE* fp, base::MD5Context* context) {
139 const size_t ret = fwrite(&item, sizeof(T), 1, fp);
140 if (ret != 1)
141 return false;
142
143 if (context) {
144 base::MD5Update(context,
145 base::StringPiece(reinterpret_cast<const char*>(&item),
146 sizeof(T)));
147 }
148
149 return true;
150 }
151
152 // Read |count| items into |values| from |fp|, and fold them into the
153 // checksum in |context|. Returns true on success.
154 template <typename CT>
ReadToContainer(CT * values,size_t count,FILE * fp,base::MD5Context * context)155 bool ReadToContainer(CT* values, size_t count, FILE* fp,
156 base::MD5Context* context) {
157 if (!count)
158 return true;
159
160 for (size_t i = 0; i < count; ++i) {
161 typename CT::value_type value;
162 if (!ReadItem(&value, fp, context))
163 return false;
164
165 // push_back() is more obvious, but coded this way std::set can
166 // also be read.
167 values->insert(values->end(), value);
168 }
169
170 return true;
171 }
172
173 // Write values between |beg| and |end| to |fp|, and fold the data into the
174 // checksum in |context|, if non-NULL. Returns true if all items successful.
175 template <typename CTI>
WriteRange(const CTI & beg,const CTI & end,FILE * fp,base::MD5Context * context)176 bool WriteRange(const CTI& beg, const CTI& end,
177 FILE* fp, base::MD5Context* context) {
178 for (CTI iter = beg; iter != end; ++iter) {
179 if (!WriteItem(*iter, fp, context))
180 return false;
181 }
182 return true;
183 }
184
185 // Write all of |values| to |fp|, and fold the data into the checksum
186 // in |context|, if non-NULL. Returns true if all items successful.
187 template <typename CT>
WriteContainer(const CT & values,FILE * fp,base::MD5Context * context)188 bool WriteContainer(const CT& values, FILE* fp,
189 base::MD5Context* context) {
190 return WriteRange(values.begin(), values.end(), fp, context);
191 }
192
193 // Delete the chunks in |deleted| from |chunks|.
DeleteChunksFromSet(const base::hash_set<int32> & deleted,std::set<int32> * chunks)194 void DeleteChunksFromSet(const base::hash_set<int32>& deleted,
195 std::set<int32>* chunks) {
196 for (std::set<int32>::iterator iter = chunks->begin();
197 iter != chunks->end();) {
198 std::set<int32>::iterator prev = iter++;
199 if (deleted.count(*prev) > 0)
200 chunks->erase(prev);
201 }
202 }
203
ReadAndVerifyChecksum(FILE * fp,base::MD5Context * context)204 bool ReadAndVerifyChecksum(FILE* fp, base::MD5Context* context) {
205 base::MD5Digest calculated_digest;
206 base::MD5IntermediateFinal(&calculated_digest, context);
207
208 base::MD5Digest file_digest;
209 if (!ReadItem(&file_digest, fp, context))
210 return false;
211
212 return memcmp(&file_digest, &calculated_digest, sizeof(file_digest)) == 0;
213 }
214
215 // Helper function to read the file header and chunk TOC. Rewinds |fp| and
216 // initializes |context|. The header is left in |header|, with the version
217 // returned. kInvalidVersion is returned for sanity check or checksum failure.
ReadAndVerifyHeader(const base::FilePath & filename,FileHeader * header,std::set<int32> * add_chunks,std::set<int32> * sub_chunks,FILE * fp,base::MD5Context * context)218 int ReadAndVerifyHeader(const base::FilePath& filename,
219 FileHeader* header,
220 std::set<int32>* add_chunks,
221 std::set<int32>* sub_chunks,
222 FILE* fp,
223 base::MD5Context* context) {
224 DCHECK(header);
225 DCHECK(add_chunks);
226 DCHECK(sub_chunks);
227 DCHECK(fp);
228 DCHECK(context);
229
230 base::MD5Init(context);
231 if (!FileRewind(fp))
232 return kInvalidVersion;
233 if (!ReadItem(header, fp, context))
234 return kInvalidVersion;
235 if (header->magic != kFileMagic)
236 return kInvalidVersion;
237
238 // Track version read to inform removal of support for older versions.
239 UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.StoreVersionRead", header->version);
240
241 if (header->version != kFileVersion)
242 return kInvalidVersion;
243
244 if (!ReadToContainer(add_chunks, header->add_chunk_count, fp, context) ||
245 !ReadToContainer(sub_chunks, header->sub_chunk_count, fp, context)) {
246 return kInvalidVersion;
247 }
248
249 // Verify that the data read thus far is valid.
250 if (!ReadAndVerifyChecksum(fp, context)) {
251 RecordFormatEvent(FORMAT_EVENT_HEADER_CHECKSUM_FAILURE);
252 return kInvalidVersion;
253 }
254
255 return kFileVersion;
256 }
257
258 // Helper function to write out the initial header and chunks-contained data.
259 // Rewinds |fp|, initializes |context|, then writes a file header and
260 // |add_chunks| and |sub_chunks|.
WriteHeader(uint32 out_stride,const std::set<int32> & add_chunks,const std::set<int32> & sub_chunks,FILE * fp,base::MD5Context * context)261 bool WriteHeader(uint32 out_stride,
262 const std::set<int32>& add_chunks,
263 const std::set<int32>& sub_chunks,
264 FILE* fp,
265 base::MD5Context* context) {
266 if (!FileRewind(fp))
267 return false;
268
269 base::MD5Init(context);
270 FileHeader header;
271 header.magic = kFileMagic;
272 header.version = kFileVersion;
273 header.add_chunk_count = add_chunks.size();
274 header.sub_chunk_count = sub_chunks.size();
275 header.shard_stride = out_stride;
276 if (!WriteItem(header, fp, context))
277 return false;
278
279 if (!WriteContainer(add_chunks, fp, context) ||
280 !WriteContainer(sub_chunks, fp, context))
281 return false;
282
283 // Write out the header digest.
284 base::MD5Digest header_digest;
285 base::MD5IntermediateFinal(&header_digest, context);
286 if (!WriteItem(header_digest, fp, context))
287 return false;
288
289 return true;
290 }
291
292 // Return |true| if the range is sorted by the given comparator.
293 template <typename CTI, typename LESS>
sorted(CTI beg,CTI end,LESS less)294 bool sorted(CTI beg, CTI end, LESS less) {
295 while ((end - beg) > 2) {
296 CTI n = beg++;
297 DCHECK(!less(*beg, *n));
298 if (less(*beg, *n))
299 return false;
300 }
301 return true;
302 }
303
304 // Merge |beg|..|end| into |container|. Both should be sorted by the given
305 // comparator, and the range iterators should not be derived from |container|.
306 // Differs from std::inplace_merge() in that additional memory is not required
307 // for linear performance.
308 template <typename CT, typename CTI, typename COMP>
container_merge(CT * container,CTI beg,CTI end,const COMP & less)309 void container_merge(CT* container, CTI beg, CTI end, const COMP& less) {
310 DCHECK(sorted(container->begin(), container->end(), less));
311 DCHECK(sorted(beg, end, less));
312
313 // Size the container to fit the results.
314 const size_t c_size = container->size();
315 container->resize(c_size + (end - beg));
316
317 // |c_end| points to the original endpoint, while |c_out| points to the
318 // endpoint that will scan from end to beginning while merging.
319 typename CT::iterator c_end = container->begin() + c_size;
320 typename CT::iterator c_out = container->end();
321
322 // While both inputs have data, move the greater to |c_out|.
323 while (c_end != container->begin() && end != beg) {
324 if (less(*(c_end - 1), *(end - 1))) {
325 *(--c_out) = *(--end);
326 } else {
327 *(--c_out) = *(--c_end);
328 }
329 }
330
331 // Copy any data remaining in the new range.
332 if (end != beg) {
333 // The original container data has been fully shifted.
334 DCHECK(c_end == container->begin());
335
336 // There is exactly the correct amount of space left.
337 DCHECK_EQ(c_out - c_end, end - beg);
338
339 std::copy(beg, end, container->begin());
340 }
341
342 DCHECK(sorted(container->begin(), container->end(), less));
343 }
344
345 // Collection of iterators used while stepping through StateInternal (see
346 // below).
347 class StateInternalPos {
348 public:
StateInternalPos(SBAddPrefixes::iterator add_prefixes_iter,SBSubPrefixes::iterator sub_prefixes_iter,std::vector<SBAddFullHash>::iterator add_hashes_iter,std::vector<SBSubFullHash>::iterator sub_hashes_iter)349 StateInternalPos(SBAddPrefixes::iterator add_prefixes_iter,
350 SBSubPrefixes::iterator sub_prefixes_iter,
351 std::vector<SBAddFullHash>::iterator add_hashes_iter,
352 std::vector<SBSubFullHash>::iterator sub_hashes_iter)
353 : add_prefixes_iter_(add_prefixes_iter),
354 sub_prefixes_iter_(sub_prefixes_iter),
355 add_hashes_iter_(add_hashes_iter),
356 sub_hashes_iter_(sub_hashes_iter) {
357 }
358
359 SBAddPrefixes::iterator add_prefixes_iter_;
360 SBSubPrefixes::iterator sub_prefixes_iter_;
361 std::vector<SBAddFullHash>::iterator add_hashes_iter_;
362 std::vector<SBSubFullHash>::iterator sub_hashes_iter_;
363 };
364
365 // Helper to find the next shard boundary.
366 template <class T>
prefix_bounder(SBPrefix val,const T & elt)367 bool prefix_bounder(SBPrefix val, const T& elt) {
368 return val < elt.GetAddPrefix();
369 }
370
371 // Container for partial database state. Includes add/sub prefixes/hashes, plus
372 // aggregate operations on same.
373 class StateInternal {
374 public:
375 // Append indicated amount of data from |fp|.
AppendData(size_t add_prefix_count,size_t sub_prefix_count,size_t add_hash_count,size_t sub_hash_count,FILE * fp,base::MD5Context * context)376 bool AppendData(size_t add_prefix_count, size_t sub_prefix_count,
377 size_t add_hash_count, size_t sub_hash_count,
378 FILE* fp, base::MD5Context* context) {
379 return
380 ReadToContainer(&add_prefixes_, add_prefix_count, fp, context) &&
381 ReadToContainer(&sub_prefixes_, sub_prefix_count, fp, context) &&
382 ReadToContainer(&add_full_hashes_, add_hash_count, fp, context) &&
383 ReadToContainer(&sub_full_hashes_, sub_hash_count, fp, context);
384 }
385
ClearData()386 void ClearData() {
387 add_prefixes_.clear();
388 sub_prefixes_.clear();
389 add_full_hashes_.clear();
390 sub_full_hashes_.clear();
391 }
392
393 // Merge data from |beg|..|end| into receiver's state, then process the state.
394 // The current state and the range given should corrospond to the same sorted
395 // shard of data from different sources. |add_del_cache| and |sub_del_cache|
396 // indicate the chunk ids which should be deleted during processing (see
397 // SBProcessSubs).
MergeDataAndProcess(const StateInternalPos & beg,const StateInternalPos & end,const base::hash_set<int32> & add_del_cache,const base::hash_set<int32> & sub_del_cache)398 void MergeDataAndProcess(const StateInternalPos& beg,
399 const StateInternalPos& end,
400 const base::hash_set<int32>& add_del_cache,
401 const base::hash_set<int32>& sub_del_cache) {
402 container_merge(&add_prefixes_,
403 beg.add_prefixes_iter_,
404 end.add_prefixes_iter_,
405 SBAddPrefixLess<SBAddPrefix,SBAddPrefix>);
406
407 container_merge(&sub_prefixes_,
408 beg.sub_prefixes_iter_,
409 end.sub_prefixes_iter_,
410 SBAddPrefixLess<SBSubPrefix,SBSubPrefix>);
411
412 container_merge(&add_full_hashes_,
413 beg.add_hashes_iter_,
414 end.add_hashes_iter_,
415 SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>);
416
417 container_merge(&sub_full_hashes_,
418 beg.sub_hashes_iter_,
419 end.sub_hashes_iter_,
420 SBAddPrefixHashLess<SBSubFullHash, SBSubFullHash>);
421
422 SBProcessSubs(&add_prefixes_, &sub_prefixes_,
423 &add_full_hashes_, &sub_full_hashes_,
424 add_del_cache, sub_del_cache);
425 }
426
427 // Sort the data appropriately for the sharding, merging, and processing
428 // operations.
SortData()429 void SortData() {
430 std::sort(add_prefixes_.begin(), add_prefixes_.end(),
431 SBAddPrefixLess<SBAddPrefix,SBAddPrefix>);
432 std::sort(sub_prefixes_.begin(), sub_prefixes_.end(),
433 SBAddPrefixLess<SBSubPrefix,SBSubPrefix>);
434 std::sort(add_full_hashes_.begin(), add_full_hashes_.end(),
435 SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>);
436 std::sort(sub_full_hashes_.begin(), sub_full_hashes_.end(),
437 SBAddPrefixHashLess<SBSubFullHash,SBSubFullHash>);
438 }
439
440 // Iterator from the beginning of the state's data.
StateBegin()441 StateInternalPos StateBegin() {
442 return StateInternalPos(add_prefixes_.begin(),
443 sub_prefixes_.begin(),
444 add_full_hashes_.begin(),
445 sub_full_hashes_.begin());
446 }
447
448 // An iterator pointing just after the last possible element of the shard
449 // indicated by |shard_max|. Used to step through the state by shard.
450 // TODO(shess): Verify whether binary search really improves over linear.
451 // Merging or writing will immediately touch all of these elements.
ShardEnd(const StateInternalPos & beg,SBPrefix shard_max)452 StateInternalPos ShardEnd(const StateInternalPos& beg, SBPrefix shard_max) {
453 return StateInternalPos(
454 std::upper_bound(beg.add_prefixes_iter_, add_prefixes_.end(),
455 shard_max, prefix_bounder<SBAddPrefix>),
456 std::upper_bound(beg.sub_prefixes_iter_, sub_prefixes_.end(),
457 shard_max, prefix_bounder<SBSubPrefix>),
458 std::upper_bound(beg.add_hashes_iter_, add_full_hashes_.end(),
459 shard_max, prefix_bounder<SBAddFullHash>),
460 std::upper_bound(beg.sub_hashes_iter_, sub_full_hashes_.end(),
461 shard_max, prefix_bounder<SBSubFullHash>));
462 }
463
464 // Write a shard header and data for the shard starting at |beg| and ending at
465 // the element before |end|.
WriteShard(const StateInternalPos & beg,const StateInternalPos & end,FILE * fp,base::MD5Context * context)466 bool WriteShard(const StateInternalPos& beg, const StateInternalPos& end,
467 FILE* fp, base::MD5Context* context) {
468 ShardHeader shard_header;
469 shard_header.add_prefix_count =
470 end.add_prefixes_iter_ - beg.add_prefixes_iter_;
471 shard_header.sub_prefix_count =
472 end.sub_prefixes_iter_ - beg.sub_prefixes_iter_;
473 shard_header.add_hash_count =
474 end.add_hashes_iter_ - beg.add_hashes_iter_;
475 shard_header.sub_hash_count =
476 end.sub_hashes_iter_ - beg.sub_hashes_iter_;
477
478 return
479 WriteItem(shard_header, fp, context) &&
480 WriteRange(beg.add_prefixes_iter_, end.add_prefixes_iter_,
481 fp, context) &&
482 WriteRange(beg.sub_prefixes_iter_, end.sub_prefixes_iter_,
483 fp, context) &&
484 WriteRange(beg.add_hashes_iter_, end.add_hashes_iter_,
485 fp, context) &&
486 WriteRange(beg.sub_hashes_iter_, end.sub_hashes_iter_,
487 fp, context);
488 }
489
490 SBAddPrefixes add_prefixes_;
491 SBSubPrefixes sub_prefixes_;
492 std::vector<SBAddFullHash> add_full_hashes_;
493 std::vector<SBSubFullHash> sub_full_hashes_;
494 };
495
496 // True if |val| is an even power of two.
497 template <typename T>
IsPowerOfTwo(const T & val)498 bool IsPowerOfTwo(const T& val) {
499 return val && (val & (val - 1)) == 0;
500 }
501
502 // Helper to read the entire database state, used by GetAddPrefixes() and
503 // GetAddFullHashes(). Those functions are generally used only for smaller
504 // files. Returns false in case of errors reading the data.
ReadDbStateHelper(const base::FilePath & filename,StateInternal * db_state)505 bool ReadDbStateHelper(const base::FilePath& filename,
506 StateInternal* db_state) {
507 base::ScopedFILE file(base::OpenFile(filename, "rb"));
508 if (file.get() == NULL)
509 return false;
510
511 std::set<int32> add_chunks;
512 std::set<int32> sub_chunks;
513
514 base::MD5Context context;
515 FileHeader header;
516 const int version =
517 ReadAndVerifyHeader(filename, &header, &add_chunks, &sub_chunks,
518 file.get(), &context);
519 if (version == kInvalidVersion)
520 return false;
521
522 uint64 in_min = 0;
523 uint64 in_stride = header.shard_stride;
524 if (!in_stride)
525 in_stride = kMaxShardStride;
526 if (!IsPowerOfTwo(in_stride))
527 return false;
528
529 do {
530 ShardHeader shard_header;
531 if (!ReadItem(&shard_header, file.get(), &context))
532 return false;
533
534 if (!db_state->AppendData(shard_header.add_prefix_count,
535 shard_header.sub_prefix_count,
536 shard_header.add_hash_count,
537 shard_header.sub_hash_count,
538 file.get(), &context)) {
539 return false;
540 }
541
542 in_min += in_stride;
543 } while (in_min <= kMaxSBPrefix);
544
545 if (!ReadAndVerifyChecksum(file.get(), &context))
546 return false;
547
548 int64 size = 0;
549 if (!base::GetFileSize(filename, &size))
550 return false;
551
552 return static_cast<int64>(ftell(file.get())) == size;
553 }
554
555 } // namespace
556
SafeBrowsingStoreFile()557 SafeBrowsingStoreFile::SafeBrowsingStoreFile()
558 : chunks_written_(0), empty_(false), corruption_seen_(false) {}
559
~SafeBrowsingStoreFile()560 SafeBrowsingStoreFile::~SafeBrowsingStoreFile() {
561 Close();
562 }
563
Delete()564 bool SafeBrowsingStoreFile::Delete() {
565 // The database should not be open at this point. But, just in
566 // case, close everything before deleting.
567 if (!Close()) {
568 NOTREACHED();
569 return false;
570 }
571
572 return DeleteStore(filename_);
573 }
574
CheckValidity()575 bool SafeBrowsingStoreFile::CheckValidity() {
576 // The file was either empty or never opened. The empty case is
577 // presumed not to be invalid. The never-opened case can happen if
578 // BeginUpdate() fails for any databases, and should already have
579 // caused the corruption callback to fire.
580 if (!file_.get())
581 return true;
582
583 if (!FileRewind(file_.get()))
584 return OnCorruptDatabase();
585
586 int64 size = 0;
587 if (!base::GetFileSize(filename_, &size))
588 return OnCorruptDatabase();
589
590 base::MD5Context context;
591 base::MD5Init(&context);
592
593 // Read everything except the final digest.
594 size_t bytes_left = static_cast<size_t>(size);
595 CHECK(size == static_cast<int64>(bytes_left));
596 if (bytes_left < sizeof(base::MD5Digest))
597 return OnCorruptDatabase();
598 bytes_left -= sizeof(base::MD5Digest);
599
600 // Fold the contents of the file into the checksum.
601 while (bytes_left > 0) {
602 char buf[4096];
603 const size_t c = std::min(sizeof(buf), bytes_left);
604 const size_t ret = fread(buf, 1, c, file_.get());
605
606 // The file's size changed while reading, give up.
607 if (ret != c)
608 return OnCorruptDatabase();
609 base::MD5Update(&context, base::StringPiece(buf, c));
610 bytes_left -= c;
611 }
612
613 if (!ReadAndVerifyChecksum(file_.get(), &context)) {
614 RecordFormatEvent(FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE);
615 return OnCorruptDatabase();
616 }
617
618 return true;
619 }
620
Init(const base::FilePath & filename,const base::Closure & corruption_callback)621 void SafeBrowsingStoreFile::Init(
622 const base::FilePath& filename,
623 const base::Closure& corruption_callback
624 ) {
625 filename_ = filename;
626 corruption_callback_ = corruption_callback;
627 }
628
BeginChunk()629 bool SafeBrowsingStoreFile::BeginChunk() {
630 return ClearChunkBuffers();
631 }
632
WriteAddPrefix(int32 chunk_id,SBPrefix prefix)633 bool SafeBrowsingStoreFile::WriteAddPrefix(int32 chunk_id, SBPrefix prefix) {
634 add_prefixes_.push_back(SBAddPrefix(chunk_id, prefix));
635 return true;
636 }
637
GetAddPrefixes(SBAddPrefixes * add_prefixes)638 bool SafeBrowsingStoreFile::GetAddPrefixes(SBAddPrefixes* add_prefixes) {
639 add_prefixes->clear();
640 if (!base::PathExists(filename_))
641 return true;
642
643 StateInternal db_state;
644 if (!ReadDbStateHelper(filename_, &db_state))
645 return OnCorruptDatabase();
646
647 add_prefixes->swap(db_state.add_prefixes_);
648 return true;
649 }
650
GetAddFullHashes(std::vector<SBAddFullHash> * add_full_hashes)651 bool SafeBrowsingStoreFile::GetAddFullHashes(
652 std::vector<SBAddFullHash>* add_full_hashes) {
653 add_full_hashes->clear();
654 if (!base::PathExists(filename_))
655 return true;
656
657 StateInternal db_state;
658 if (!ReadDbStateHelper(filename_, &db_state))
659 return OnCorruptDatabase();
660
661 add_full_hashes->swap(db_state.add_full_hashes_);
662 return true;
663 }
664
WriteAddHash(int32 chunk_id,const SBFullHash & full_hash)665 bool SafeBrowsingStoreFile::WriteAddHash(int32 chunk_id,
666 const SBFullHash& full_hash) {
667 add_hashes_.push_back(SBAddFullHash(chunk_id, full_hash));
668 return true;
669 }
670
WriteSubPrefix(int32 chunk_id,int32 add_chunk_id,SBPrefix prefix)671 bool SafeBrowsingStoreFile::WriteSubPrefix(int32 chunk_id,
672 int32 add_chunk_id,
673 SBPrefix prefix) {
674 sub_prefixes_.push_back(SBSubPrefix(chunk_id, add_chunk_id, prefix));
675 return true;
676 }
677
WriteSubHash(int32 chunk_id,int32 add_chunk_id,const SBFullHash & full_hash)678 bool SafeBrowsingStoreFile::WriteSubHash(int32 chunk_id, int32 add_chunk_id,
679 const SBFullHash& full_hash) {
680 sub_hashes_.push_back(SBSubFullHash(chunk_id, add_chunk_id, full_hash));
681 return true;
682 }
683
OnCorruptDatabase()684 bool SafeBrowsingStoreFile::OnCorruptDatabase() {
685 if (!corruption_seen_)
686 RecordFormatEvent(FORMAT_EVENT_FILE_CORRUPT);
687 corruption_seen_ = true;
688
689 corruption_callback_.Run();
690
691 // Return false as a convenience to callers.
692 return false;
693 }
694
Close()695 bool SafeBrowsingStoreFile::Close() {
696 ClearUpdateBuffers();
697
698 // Make sure the files are closed.
699 file_.reset();
700 new_file_.reset();
701 return true;
702 }
703
BeginUpdate()704 bool SafeBrowsingStoreFile::BeginUpdate() {
705 DCHECK(!file_.get() && !new_file_.get());
706
707 // Structures should all be clear unless something bad happened.
708 DCHECK(add_chunks_cache_.empty());
709 DCHECK(sub_chunks_cache_.empty());
710 DCHECK(add_del_cache_.empty());
711 DCHECK(sub_del_cache_.empty());
712 DCHECK(add_prefixes_.empty());
713 DCHECK(sub_prefixes_.empty());
714 DCHECK(add_hashes_.empty());
715 DCHECK(sub_hashes_.empty());
716 DCHECK_EQ(chunks_written_, 0);
717
718 corruption_seen_ = false;
719
720 const base::FilePath new_filename = TemporaryFileForFilename(filename_);
721 base::ScopedFILE new_file(base::OpenFile(new_filename, "wb+"));
722 if (new_file.get() == NULL)
723 return false;
724
725 base::ScopedFILE file(base::OpenFile(filename_, "rb"));
726 empty_ = (file.get() == NULL);
727 if (empty_) {
728 // If the file exists but cannot be opened, try to delete it (not
729 // deleting directly, the bloom filter needs to be deleted, too).
730 if (base::PathExists(filename_))
731 return OnCorruptDatabase();
732
733 new_file_.swap(new_file);
734 return true;
735 }
736
737 base::MD5Context context;
738 FileHeader header;
739 const int version =
740 ReadAndVerifyHeader(filename_, &header,
741 &add_chunks_cache_, &sub_chunks_cache_,
742 file.get(), &context);
743 if (version == kInvalidVersion) {
744 FileHeader retry_header;
745 if (FileRewind(file.get()) && ReadItem(&retry_header, file.get(), NULL)) {
746 if (retry_header.magic == kFileMagic &&
747 retry_header.version < kFileVersion) {
748 RecordFormatEvent(FORMAT_EVENT_FOUND_DEPRECATED);
749 } else {
750 RecordFormatEvent(FORMAT_EVENT_FOUND_UNKNOWN);
751 }
752 }
753
754 // Close the file so that it can be deleted.
755 file.reset();
756
757 return OnCorruptDatabase();
758 }
759
760 file_.swap(file);
761 new_file_.swap(new_file);
762 return true;
763 }
764
FinishChunk()765 bool SafeBrowsingStoreFile::FinishChunk() {
766 if (!add_prefixes_.size() && !sub_prefixes_.size() &&
767 !add_hashes_.size() && !sub_hashes_.size())
768 return true;
769
770 ChunkHeader header;
771 header.add_prefix_count = add_prefixes_.size();
772 header.sub_prefix_count = sub_prefixes_.size();
773 header.add_hash_count = add_hashes_.size();
774 header.sub_hash_count = sub_hashes_.size();
775 if (!WriteItem(header, new_file_.get(), NULL))
776 return false;
777
778 if (!WriteContainer(add_prefixes_, new_file_.get(), NULL) ||
779 !WriteContainer(sub_prefixes_, new_file_.get(), NULL) ||
780 !WriteContainer(add_hashes_, new_file_.get(), NULL) ||
781 !WriteContainer(sub_hashes_, new_file_.get(), NULL))
782 return false;
783
784 ++chunks_written_;
785
786 // Clear everything to save memory.
787 return ClearChunkBuffers();
788 }
789
DoUpdate(safe_browsing::PrefixSetBuilder * builder,std::vector<SBAddFullHash> * add_full_hashes_result)790 bool SafeBrowsingStoreFile::DoUpdate(
791 safe_browsing::PrefixSetBuilder* builder,
792 std::vector<SBAddFullHash>* add_full_hashes_result) {
793 DCHECK(file_.get() || empty_);
794 DCHECK(new_file_.get());
795 CHECK(builder);
796 CHECK(add_full_hashes_result);
797
798 // Rewind the temporary storage.
799 if (!FileRewind(new_file_.get()))
800 return false;
801
802 // Get chunk file's size for validating counts.
803 int64 update_size = 0;
804 if (!base::GetFileSize(TemporaryFileForFilename(filename_), &update_size))
805 return OnCorruptDatabase();
806
807 // Track update size to answer questions at http://crbug.com/72216 .
808 // Log small updates as 1k so that the 0 (underflow) bucket can be
809 // used for "empty" in SafeBrowsingDatabase.
810 UMA_HISTOGRAM_COUNTS("SB2.DatabaseUpdateKilobytes",
811 std::max(static_cast<int>(update_size / 1024), 1));
812
813 // Chunk updates to integrate.
814 StateInternal new_state;
815
816 // Read update chunks.
817 for (int i = 0; i < chunks_written_; ++i) {
818 ChunkHeader header;
819
820 int64 ofs = ftell(new_file_.get());
821 if (ofs == -1)
822 return false;
823
824 if (!ReadItem(&header, new_file_.get(), NULL))
825 return false;
826
827 // As a safety measure, make sure that the header describes a sane
828 // chunk, given the remaining file size.
829 int64 expected_size = ofs + sizeof(ChunkHeader);
830 expected_size += header.add_prefix_count * sizeof(SBAddPrefix);
831 expected_size += header.sub_prefix_count * sizeof(SBSubPrefix);
832 expected_size += header.add_hash_count * sizeof(SBAddFullHash);
833 expected_size += header.sub_hash_count * sizeof(SBSubFullHash);
834 if (expected_size > update_size)
835 return false;
836
837 if (!new_state.AppendData(header.add_prefix_count, header.sub_prefix_count,
838 header.add_hash_count, header.sub_hash_count,
839 new_file_.get(), NULL)) {
840 return false;
841 }
842 }
843
844 // The state was accumulated by chunk, sort by prefix.
845 new_state.SortData();
846
847 // These strides control how much data is loaded into memory per pass.
848 // Strides must be an even power of two. |in_stride| will be derived from the
849 // input file. |out_stride| will be derived from an estimate of the resulting
850 // file's size. |process_stride| will be the max of both.
851 uint64 in_stride = kMaxShardStride;
852 uint64 out_stride = kMaxShardStride;
853 uint64 process_stride = 0;
854
855 // Used to verify the input's checksum if |!empty_|.
856 base::MD5Context in_context;
857
858 if (!empty_) {
859 DCHECK(file_.get());
860
861 FileHeader header = {0};
862 int version = ReadAndVerifyHeader(filename_, &header,
863 &add_chunks_cache_, &sub_chunks_cache_,
864 file_.get(), &in_context);
865 if (version == kInvalidVersion)
866 return OnCorruptDatabase();
867
868 if (header.shard_stride)
869 in_stride = header.shard_stride;
870
871 // The header checksum should have prevented this case, but the code will be
872 // broken if this is not correct.
873 if (!IsPowerOfTwo(in_stride))
874 return OnCorruptDatabase();
875 }
876
877 // We no longer need to track deleted chunks.
878 DeleteChunksFromSet(add_del_cache_, &add_chunks_cache_);
879 DeleteChunksFromSet(sub_del_cache_, &sub_chunks_cache_);
880
881 // Calculate |out_stride| to break the file down into reasonable shards.
882 {
883 int64 original_size = 0;
884 if (!empty_ && !base::GetFileSize(filename_, &original_size))
885 return OnCorruptDatabase();
886
887 // Approximate the final size as everything. Subs and deletes will reduce
888 // the size, but modest over-sharding won't hurt much.
889 int64 shard_size = original_size + update_size;
890
891 // Keep splitting until a single stride of data fits the target.
892 size_t shifts = 0;
893 while (out_stride > kMinShardStride && shard_size > kUpdateStorageBytes) {
894 out_stride >>= 1;
895 shard_size >>= 1;
896 ++shifts;
897 }
898 UMA_HISTOGRAM_COUNTS("SB2.OutShardShifts", shifts);
899
900 DCHECK(IsPowerOfTwo(out_stride));
901 }
902
903 // Outer loop strides by the max of the input stride (to read integral shards)
904 // and the output stride (to write integral shards).
905 process_stride = std::max(in_stride, out_stride);
906 DCHECK(IsPowerOfTwo(process_stride));
907 DCHECK_EQ(0u, process_stride % in_stride);
908 DCHECK_EQ(0u, process_stride % out_stride);
909
910 // Start writing the new data to |new_file_|.
911 base::MD5Context out_context;
912 if (!WriteHeader(out_stride, add_chunks_cache_, sub_chunks_cache_,
913 new_file_.get(), &out_context)) {
914 return false;
915 }
916
917 // Start at the beginning of the SBPrefix space.
918 uint64 in_min = 0;
919 uint64 out_min = 0;
920 uint64 process_min = 0;
921
922 // Start at the beginning of the updates.
923 StateInternalPos new_pos = new_state.StateBegin();
924
925 // Re-usable container for shard processing.
926 StateInternal db_state;
927
928 // Track aggregate counts for histograms.
929 size_t add_prefix_count = 0;
930 size_t sub_prefix_count = 0;
931
932 do {
933 // Maximum element in the current shard.
934 SBPrefix process_max =
935 static_cast<SBPrefix>(process_min + process_stride - 1);
936 DCHECK_GT(process_max, process_min);
937
938 // Drop the data from previous pass.
939 db_state.ClearData();
940
941 // Fill the processing shard with one or more input shards.
942 if (!empty_) {
943 do {
944 ShardHeader shard_header;
945 if (!ReadItem(&shard_header, file_.get(), &in_context))
946 return OnCorruptDatabase();
947
948 if (!db_state.AppendData(shard_header.add_prefix_count,
949 shard_header.sub_prefix_count,
950 shard_header.add_hash_count,
951 shard_header.sub_hash_count,
952 file_.get(), &in_context))
953 return OnCorruptDatabase();
954
955 in_min += in_stride;
956 } while (in_min <= kMaxSBPrefix && in_min < process_max);
957 }
958
959 // Shard the update data to match the database data, then merge the update
960 // data and process the results.
961 {
962 StateInternalPos new_end = new_state.ShardEnd(new_pos, process_max);
963 db_state.MergeDataAndProcess(new_pos, new_end,
964 add_del_cache_, sub_del_cache_);
965 new_pos = new_end;
966 }
967
968 // Collect the processed data for return to caller.
969 for (size_t i = 0; i < db_state.add_prefixes_.size(); ++i) {
970 builder->AddPrefix(db_state.add_prefixes_[i].prefix);
971 }
972 add_full_hashes_result->insert(add_full_hashes_result->end(),
973 db_state.add_full_hashes_.begin(),
974 db_state.add_full_hashes_.end());
975 add_prefix_count += db_state.add_prefixes_.size();
976 sub_prefix_count += db_state.sub_prefixes_.size();
977
978 // Write one or more shards of processed output.
979 StateInternalPos out_pos = db_state.StateBegin();
980 do {
981 SBPrefix out_max = static_cast<SBPrefix>(out_min + out_stride - 1);
982 DCHECK_GT(out_max, out_min);
983
984 StateInternalPos out_end = db_state.ShardEnd(out_pos, out_max);
985 if (!db_state.WriteShard(out_pos, out_end, new_file_.get(), &out_context))
986 return false;
987 out_pos = out_end;
988
989 out_min += out_stride;
990 } while (out_min == static_cast<SBPrefix>(out_min) &&
991 out_min < process_max);
992
993 process_min += process_stride;
994 } while (process_min <= kMaxSBPrefix);
995
996 // Verify the overall checksum.
997 if (!empty_) {
998 if (!ReadAndVerifyChecksum(file_.get(), &in_context)) {
999 RecordFormatEvent(FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE);
1000 return OnCorruptDatabase();
1001 }
1002
1003 // TODO(shess): Verify EOF?
1004
1005 // Close the input file so the new file can be renamed over it.
1006 file_.reset();
1007 }
1008 DCHECK(!file_.get());
1009
1010 // Write the overall checksum.
1011 base::MD5Digest out_digest;
1012 base::MD5Final(&out_digest, &out_context);
1013 if (!WriteItem(out_digest, new_file_.get(), NULL))
1014 return false;
1015
1016 // Trim any excess left over from the temporary chunk data.
1017 if (!base::TruncateFile(new_file_.get()))
1018 return false;
1019
1020 // Close the file handle and swizzle the file into place.
1021 new_file_.reset();
1022 if (!base::DeleteFile(filename_, false) &&
1023 base::PathExists(filename_))
1024 return false;
1025
1026 const base::FilePath new_filename = TemporaryFileForFilename(filename_);
1027 if (!base::Move(new_filename, filename_))
1028 return false;
1029
1030 // Record counts before swapping to caller.
1031 UMA_HISTOGRAM_COUNTS("SB2.AddPrefixes", add_prefix_count);
1032 UMA_HISTOGRAM_COUNTS("SB2.SubPrefixes", sub_prefix_count);
1033
1034 return true;
1035 }
1036
FinishUpdate(safe_browsing::PrefixSetBuilder * builder,std::vector<SBAddFullHash> * add_full_hashes_result)1037 bool SafeBrowsingStoreFile::FinishUpdate(
1038 safe_browsing::PrefixSetBuilder* builder,
1039 std::vector<SBAddFullHash>* add_full_hashes_result) {
1040 DCHECK(builder);
1041 DCHECK(add_full_hashes_result);
1042
1043 if (!DoUpdate(builder, add_full_hashes_result)) {
1044 CancelUpdate();
1045 return false;
1046 }
1047
1048 DCHECK(!new_file_.get());
1049 DCHECK(!file_.get());
1050
1051 return Close();
1052 }
1053
CancelUpdate()1054 bool SafeBrowsingStoreFile::CancelUpdate() {
1055 bool ret = Close();
1056
1057 // Delete stale staging file.
1058 const base::FilePath new_filename = TemporaryFileForFilename(filename_);
1059 base::DeleteFile(new_filename, false);
1060
1061 return ret;
1062 }
1063
SetAddChunk(int32 chunk_id)1064 void SafeBrowsingStoreFile::SetAddChunk(int32 chunk_id) {
1065 add_chunks_cache_.insert(chunk_id);
1066 }
1067
CheckAddChunk(int32 chunk_id)1068 bool SafeBrowsingStoreFile::CheckAddChunk(int32 chunk_id) {
1069 return add_chunks_cache_.count(chunk_id) > 0;
1070 }
1071
GetAddChunks(std::vector<int32> * out)1072 void SafeBrowsingStoreFile::GetAddChunks(std::vector<int32>* out) {
1073 out->clear();
1074 out->insert(out->end(), add_chunks_cache_.begin(), add_chunks_cache_.end());
1075 }
1076
SetSubChunk(int32 chunk_id)1077 void SafeBrowsingStoreFile::SetSubChunk(int32 chunk_id) {
1078 sub_chunks_cache_.insert(chunk_id);
1079 }
1080
CheckSubChunk(int32 chunk_id)1081 bool SafeBrowsingStoreFile::CheckSubChunk(int32 chunk_id) {
1082 return sub_chunks_cache_.count(chunk_id) > 0;
1083 }
1084
GetSubChunks(std::vector<int32> * out)1085 void SafeBrowsingStoreFile::GetSubChunks(std::vector<int32>* out) {
1086 out->clear();
1087 out->insert(out->end(), sub_chunks_cache_.begin(), sub_chunks_cache_.end());
1088 }
1089
DeleteAddChunk(int32 chunk_id)1090 void SafeBrowsingStoreFile::DeleteAddChunk(int32 chunk_id) {
1091 add_del_cache_.insert(chunk_id);
1092 }
1093
DeleteSubChunk(int32 chunk_id)1094 void SafeBrowsingStoreFile::DeleteSubChunk(int32 chunk_id) {
1095 sub_del_cache_.insert(chunk_id);
1096 }
1097
1098 // static
DeleteStore(const base::FilePath & basename)1099 bool SafeBrowsingStoreFile::DeleteStore(const base::FilePath& basename) {
1100 if (!base::DeleteFile(basename, false) &&
1101 base::PathExists(basename)) {
1102 NOTREACHED();
1103 return false;
1104 }
1105
1106 const base::FilePath new_filename = TemporaryFileForFilename(basename);
1107 if (!base::DeleteFile(new_filename, false) &&
1108 base::PathExists(new_filename)) {
1109 NOTREACHED();
1110 return false;
1111 }
1112
1113 // With SQLite support gone, one way to get to this code is if the
1114 // existing file is a SQLite file. Make sure the journal file is
1115 // also removed.
1116 const base::FilePath journal_filename(
1117 basename.value() + FILE_PATH_LITERAL("-journal"));
1118 if (base::PathExists(journal_filename))
1119 base::DeleteFile(journal_filename, false);
1120
1121 return true;
1122 }
1123