• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "chrome/browser/safe_browsing/safe_browsing_store_file.h"
6 
7 #include "base/files/file_util.h"
8 #include "base/files/scoped_file.h"
9 #include "base/md5.h"
10 #include "base/metrics/histogram.h"
11 #include "base/metrics/sparse_histogram.h"
12 
13 namespace {
14 
15 // NOTE(shess): kFileMagic should not be a byte-wise palindrome, so
16 // that byte-order changes force corruption.
17 const int32 kFileMagic = 0x600D71FE;
18 
19 // Version history:
20 // Version 6: aad08754/r2814 by erikkay@google.com on 2008-10-02 (sqlite)
21 // Version 7: 6afe28a5/r37435 by shess@chromium.org on 2010-01-28
22 // Version 8: d3dd0715/r259791 by shess@chromium.org on 2014-03-27
23 const int32 kFileVersion = 8;
24 
25 // ReadAndVerifyHeader() returns this in case of error.
26 const int32 kInvalidVersion = -1;
27 
28 // Starting with version 8, the storage is sorted and can be sharded to allow
29 // updates to be done with lower memory requirements.  Newly written files will
30 // be sharded to need less than this amount of memory during update.  Larger
31 // values are preferred to minimize looping overhead during processing.
32 const int64 kUpdateStorageBytes = 100 * 1024;
33 
34 // Prevent excessive sharding by setting a lower limit on the shard stride.
35 // Smaller values should work fine, but very small values will probably lead to
36 // poor performance.  Shard stride is indirectly related to
37 // |kUpdateStorageBytes|, setting that very small will bump against this.
38 const uint32 kMinShardStride = 1 << 24;
39 
40 // Strides over the entire SBPrefix space.
41 const uint64 kMaxShardStride = 1ULL << 32;
42 
43 // Maximum SBPrefix value.
44 const SBPrefix kMaxSBPrefix = 0xFFFFFFFF;
45 
46 // Header at the front of the main database file.
47 struct FileHeader {
48   int32 magic, version;
49   uint32 add_chunk_count, sub_chunk_count;
50   uint32 shard_stride;
51   // TODO(shess): Is this where 64-bit will bite me?  Perhaps write a
52   // specialized read/write?
53 };
54 
55 // Header for each chunk in the chunk-accumulation file.
56 struct ChunkHeader {
57   uint32 add_prefix_count, sub_prefix_count;
58   uint32 add_hash_count, sub_hash_count;
59 };
60 
61 // Header for each shard of data in the main database file.
62 struct ShardHeader {
63   uint32 add_prefix_count, sub_prefix_count;
64   uint32 add_hash_count, sub_hash_count;
65 };
66 
67 // Enumerate different format-change events for histogramming
68 // purposes.  DO NOT CHANGE THE ORDERING OF THESE VALUES.
69 enum FormatEventType {
70   // Corruption detected, broken down by file format.
71   FORMAT_EVENT_FILE_CORRUPT,
72   FORMAT_EVENT_SQLITE_CORRUPT,  // Obsolete
73 
74   // The type of format found in the file.  The expected case (new
75   // file format) is intentionally not covered.
76   FORMAT_EVENT_FOUND_SQLITE,  // Obsolete
77   FORMAT_EVENT_FOUND_UNKNOWN,  // magic does not match.
78 
79   // The number of SQLite-format files deleted should be the same as
80   // FORMAT_EVENT_FOUND_SQLITE.  It can differ if the delete fails,
81   // or if a failure prevents the update from succeeding.
82   FORMAT_EVENT_SQLITE_DELETED,  // Obsolete
83   FORMAT_EVENT_SQLITE_DELETE_FAILED,  // Obsolete
84 
85   // Found and deleted (or failed to delete) the ancient "Safe
86   // Browsing" file.
87   FORMAT_EVENT_DELETED_ORIGINAL,  // Obsolete
88   FORMAT_EVENT_DELETED_ORIGINAL_FAILED,  // Obsolete
89 
90   // The checksum did not check out in CheckValidity() or in
91   // FinishUpdate().  This most likely indicates that the machine
92   // crashed before the file was fully sync'ed to disk.
93   FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE,
94   FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE,
95 
96   // The header checksum was incorrect in ReadAndVerifyHeader().  Likely
97   // indicates that the system crashed while writing an update.
98   FORMAT_EVENT_HEADER_CHECKSUM_FAILURE,
99 
100   FORMAT_EVENT_FOUND_DEPRECATED,  // version too old.
101 
102   // Memory space for histograms is determined by the max.  ALWAYS
103   // ADD NEW VALUES BEFORE THIS ONE.
104   FORMAT_EVENT_MAX
105 };
106 
RecordFormatEvent(FormatEventType event_type)107 void RecordFormatEvent(FormatEventType event_type) {
108   UMA_HISTOGRAM_ENUMERATION("SB2.FormatEvent", event_type, FORMAT_EVENT_MAX);
109 }
110 
111 // Rewind the file.  Using fseek(2) because rewind(3) errors are
112 // weird.
FileRewind(FILE * fp)113 bool FileRewind(FILE* fp) {
114   int rv = fseek(fp, 0, SEEK_SET);
115   DCHECK_EQ(rv, 0);
116   return rv == 0;
117 }
118 
119 // Read from |fp| into |item|, and fold the input data into the
120 // checksum in |context|, if non-NULL.  Return true on success.
121 template <class T>
ReadItem(T * item,FILE * fp,base::MD5Context * context)122 bool ReadItem(T* item, FILE* fp, base::MD5Context* context) {
123   const size_t ret = fread(item, sizeof(T), 1, fp);
124   if (ret != 1)
125     return false;
126 
127   if (context) {
128     base::MD5Update(context,
129                     base::StringPiece(reinterpret_cast<char*>(item),
130                                       sizeof(T)));
131   }
132   return true;
133 }
134 
135 // Write |item| to |fp|, and fold the output data into the checksum in
136 // |context|, if non-NULL.  Return true on success.
137 template <class T>
WriteItem(const T & item,FILE * fp,base::MD5Context * context)138 bool WriteItem(const T& item, FILE* fp, base::MD5Context* context) {
139   const size_t ret = fwrite(&item, sizeof(T), 1, fp);
140   if (ret != 1)
141     return false;
142 
143   if (context) {
144     base::MD5Update(context,
145                     base::StringPiece(reinterpret_cast<const char*>(&item),
146                                       sizeof(T)));
147   }
148 
149   return true;
150 }
151 
152 // Read |count| items into |values| from |fp|, and fold them into the
153 // checksum in |context|.  Returns true on success.
154 template <typename CT>
ReadToContainer(CT * values,size_t count,FILE * fp,base::MD5Context * context)155 bool ReadToContainer(CT* values, size_t count, FILE* fp,
156                      base::MD5Context* context) {
157   if (!count)
158     return true;
159 
160   for (size_t i = 0; i < count; ++i) {
161     typename CT::value_type value;
162     if (!ReadItem(&value, fp, context))
163       return false;
164 
165     // push_back() is more obvious, but coded this way std::set can
166     // also be read.
167     values->insert(values->end(), value);
168   }
169 
170   return true;
171 }
172 
173 // Write values between |beg| and |end| to |fp|, and fold the data into the
174 // checksum in |context|, if non-NULL.  Returns true if all items successful.
175 template <typename CTI>
WriteRange(const CTI & beg,const CTI & end,FILE * fp,base::MD5Context * context)176 bool WriteRange(const CTI& beg, const CTI& end,
177                 FILE* fp, base::MD5Context* context) {
178   for (CTI iter = beg; iter != end; ++iter) {
179     if (!WriteItem(*iter, fp, context))
180       return false;
181   }
182   return true;
183 }
184 
185 // Write all of |values| to |fp|, and fold the data into the checksum
186 // in |context|, if non-NULL.  Returns true if all items successful.
187 template <typename CT>
WriteContainer(const CT & values,FILE * fp,base::MD5Context * context)188 bool WriteContainer(const CT& values, FILE* fp,
189                     base::MD5Context* context) {
190   return WriteRange(values.begin(), values.end(), fp, context);
191 }
192 
193 // Delete the chunks in |deleted| from |chunks|.
DeleteChunksFromSet(const base::hash_set<int32> & deleted,std::set<int32> * chunks)194 void DeleteChunksFromSet(const base::hash_set<int32>& deleted,
195                          std::set<int32>* chunks) {
196   for (std::set<int32>::iterator iter = chunks->begin();
197        iter != chunks->end();) {
198     std::set<int32>::iterator prev = iter++;
199     if (deleted.count(*prev) > 0)
200       chunks->erase(prev);
201   }
202 }
203 
ReadAndVerifyChecksum(FILE * fp,base::MD5Context * context)204 bool ReadAndVerifyChecksum(FILE* fp, base::MD5Context* context) {
205   base::MD5Digest calculated_digest;
206   base::MD5IntermediateFinal(&calculated_digest, context);
207 
208   base::MD5Digest file_digest;
209   if (!ReadItem(&file_digest, fp, context))
210     return false;
211 
212   return memcmp(&file_digest, &calculated_digest, sizeof(file_digest)) == 0;
213 }
214 
215 // Helper function to read the file header and chunk TOC.  Rewinds |fp| and
216 // initializes |context|.  The header is left in |header|, with the version
217 // returned.  kInvalidVersion is returned for sanity check or checksum failure.
ReadAndVerifyHeader(const base::FilePath & filename,FileHeader * header,std::set<int32> * add_chunks,std::set<int32> * sub_chunks,FILE * fp,base::MD5Context * context)218 int ReadAndVerifyHeader(const base::FilePath& filename,
219                         FileHeader* header,
220                         std::set<int32>* add_chunks,
221                         std::set<int32>* sub_chunks,
222                         FILE* fp,
223                         base::MD5Context* context) {
224   DCHECK(header);
225   DCHECK(add_chunks);
226   DCHECK(sub_chunks);
227   DCHECK(fp);
228   DCHECK(context);
229 
230   base::MD5Init(context);
231   if (!FileRewind(fp))
232     return kInvalidVersion;
233   if (!ReadItem(header, fp, context))
234     return kInvalidVersion;
235   if (header->magic != kFileMagic)
236     return kInvalidVersion;
237 
238   // Track version read to inform removal of support for older versions.
239   UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.StoreVersionRead", header->version);
240 
241   if (header->version != kFileVersion)
242     return kInvalidVersion;
243 
244   if (!ReadToContainer(add_chunks, header->add_chunk_count, fp, context) ||
245       !ReadToContainer(sub_chunks, header->sub_chunk_count, fp, context)) {
246     return kInvalidVersion;
247   }
248 
249   // Verify that the data read thus far is valid.
250   if (!ReadAndVerifyChecksum(fp, context)) {
251     RecordFormatEvent(FORMAT_EVENT_HEADER_CHECKSUM_FAILURE);
252     return kInvalidVersion;
253   }
254 
255   return kFileVersion;
256 }
257 
258 // Helper function to write out the initial header and chunks-contained data.
259 // Rewinds |fp|, initializes |context|, then writes a file header and
260 // |add_chunks| and |sub_chunks|.
WriteHeader(uint32 out_stride,const std::set<int32> & add_chunks,const std::set<int32> & sub_chunks,FILE * fp,base::MD5Context * context)261 bool WriteHeader(uint32 out_stride,
262                  const std::set<int32>& add_chunks,
263                  const std::set<int32>& sub_chunks,
264                  FILE* fp,
265                  base::MD5Context* context) {
266   if (!FileRewind(fp))
267     return false;
268 
269   base::MD5Init(context);
270   FileHeader header;
271   header.magic = kFileMagic;
272   header.version = kFileVersion;
273   header.add_chunk_count = add_chunks.size();
274   header.sub_chunk_count = sub_chunks.size();
275   header.shard_stride = out_stride;
276   if (!WriteItem(header, fp, context))
277     return false;
278 
279   if (!WriteContainer(add_chunks, fp, context) ||
280       !WriteContainer(sub_chunks, fp, context))
281     return false;
282 
283   // Write out the header digest.
284   base::MD5Digest header_digest;
285   base::MD5IntermediateFinal(&header_digest, context);
286   if (!WriteItem(header_digest, fp, context))
287     return false;
288 
289   return true;
290 }
291 
292 // Return |true| if the range is sorted by the given comparator.
293 template <typename CTI, typename LESS>
sorted(CTI beg,CTI end,LESS less)294 bool sorted(CTI beg, CTI end, LESS less) {
295   while ((end - beg) > 2) {
296     CTI n = beg++;
297     DCHECK(!less(*beg, *n));
298     if (less(*beg, *n))
299       return false;
300   }
301   return true;
302 }
303 
304 // Merge |beg|..|end| into |container|.  Both should be sorted by the given
305 // comparator, and the range iterators should not be derived from |container|.
306 // Differs from std::inplace_merge() in that additional memory is not required
307 // for linear performance.
308 template <typename CT, typename CTI, typename COMP>
container_merge(CT * container,CTI beg,CTI end,const COMP & less)309 void container_merge(CT* container, CTI beg, CTI end, const COMP& less) {
310   DCHECK(sorted(container->begin(), container->end(), less));
311   DCHECK(sorted(beg, end, less));
312 
313   // Size the container to fit the results.
314   const size_t c_size = container->size();
315   container->resize(c_size + (end - beg));
316 
317   // |c_end| points to the original endpoint, while |c_out| points to the
318   // endpoint that will scan from end to beginning while merging.
319   typename CT::iterator c_end = container->begin() + c_size;
320   typename CT::iterator c_out = container->end();
321 
322   // While both inputs have data, move the greater to |c_out|.
323   while (c_end != container->begin() && end != beg) {
324     if (less(*(c_end - 1), *(end - 1))) {
325       *(--c_out) = *(--end);
326     } else {
327       *(--c_out) = *(--c_end);
328     }
329   }
330 
331   // Copy any data remaining in the new range.
332   if (end != beg) {
333     // The original container data has been fully shifted.
334     DCHECK(c_end == container->begin());
335 
336     // There is exactly the correct amount of space left.
337     DCHECK_EQ(c_out - c_end, end - beg);
338 
339     std::copy(beg, end, container->begin());
340   }
341 
342   DCHECK(sorted(container->begin(), container->end(), less));
343 }
344 
345 // Collection of iterators used while stepping through StateInternal (see
346 // below).
347 class StateInternalPos {
348  public:
StateInternalPos(SBAddPrefixes::iterator add_prefixes_iter,SBSubPrefixes::iterator sub_prefixes_iter,std::vector<SBAddFullHash>::iterator add_hashes_iter,std::vector<SBSubFullHash>::iterator sub_hashes_iter)349   StateInternalPos(SBAddPrefixes::iterator add_prefixes_iter,
350                    SBSubPrefixes::iterator sub_prefixes_iter,
351                    std::vector<SBAddFullHash>::iterator add_hashes_iter,
352                    std::vector<SBSubFullHash>::iterator sub_hashes_iter)
353       : add_prefixes_iter_(add_prefixes_iter),
354         sub_prefixes_iter_(sub_prefixes_iter),
355         add_hashes_iter_(add_hashes_iter),
356         sub_hashes_iter_(sub_hashes_iter) {
357   }
358 
359   SBAddPrefixes::iterator add_prefixes_iter_;
360   SBSubPrefixes::iterator sub_prefixes_iter_;
361   std::vector<SBAddFullHash>::iterator add_hashes_iter_;
362   std::vector<SBSubFullHash>::iterator sub_hashes_iter_;
363 };
364 
365 // Helper to find the next shard boundary.
366 template <class T>
prefix_bounder(SBPrefix val,const T & elt)367 bool prefix_bounder(SBPrefix val, const T& elt) {
368   return val < elt.GetAddPrefix();
369 }
370 
371 // Container for partial database state.  Includes add/sub prefixes/hashes, plus
372 // aggregate operations on same.
373 class StateInternal {
374  public:
375   // Append indicated amount of data from |fp|.
AppendData(size_t add_prefix_count,size_t sub_prefix_count,size_t add_hash_count,size_t sub_hash_count,FILE * fp,base::MD5Context * context)376   bool AppendData(size_t add_prefix_count, size_t sub_prefix_count,
377                   size_t add_hash_count, size_t sub_hash_count,
378                   FILE* fp, base::MD5Context* context) {
379     return
380         ReadToContainer(&add_prefixes_, add_prefix_count, fp, context) &&
381         ReadToContainer(&sub_prefixes_, sub_prefix_count, fp, context) &&
382         ReadToContainer(&add_full_hashes_, add_hash_count, fp, context) &&
383         ReadToContainer(&sub_full_hashes_, sub_hash_count, fp, context);
384   }
385 
ClearData()386   void ClearData() {
387     add_prefixes_.clear();
388     sub_prefixes_.clear();
389     add_full_hashes_.clear();
390     sub_full_hashes_.clear();
391   }
392 
393   // Merge data from |beg|..|end| into receiver's state, then process the state.
394   // The current state and the range given should corrospond to the same sorted
395   // shard of data from different sources.  |add_del_cache| and |sub_del_cache|
396   // indicate the chunk ids which should be deleted during processing (see
397   // SBProcessSubs).
MergeDataAndProcess(const StateInternalPos & beg,const StateInternalPos & end,const base::hash_set<int32> & add_del_cache,const base::hash_set<int32> & sub_del_cache)398   void MergeDataAndProcess(const StateInternalPos& beg,
399                            const StateInternalPos& end,
400                            const base::hash_set<int32>& add_del_cache,
401                            const base::hash_set<int32>& sub_del_cache) {
402     container_merge(&add_prefixes_,
403                     beg.add_prefixes_iter_,
404                     end.add_prefixes_iter_,
405                     SBAddPrefixLess<SBAddPrefix,SBAddPrefix>);
406 
407     container_merge(&sub_prefixes_,
408                     beg.sub_prefixes_iter_,
409                     end.sub_prefixes_iter_,
410                     SBAddPrefixLess<SBSubPrefix,SBSubPrefix>);
411 
412     container_merge(&add_full_hashes_,
413                     beg.add_hashes_iter_,
414                     end.add_hashes_iter_,
415                     SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>);
416 
417     container_merge(&sub_full_hashes_,
418                     beg.sub_hashes_iter_,
419                     end.sub_hashes_iter_,
420                     SBAddPrefixHashLess<SBSubFullHash, SBSubFullHash>);
421 
422     SBProcessSubs(&add_prefixes_, &sub_prefixes_,
423                   &add_full_hashes_, &sub_full_hashes_,
424                   add_del_cache, sub_del_cache);
425   }
426 
427   // Sort the data appropriately for the sharding, merging, and processing
428   // operations.
SortData()429   void SortData() {
430     std::sort(add_prefixes_.begin(), add_prefixes_.end(),
431               SBAddPrefixLess<SBAddPrefix,SBAddPrefix>);
432     std::sort(sub_prefixes_.begin(), sub_prefixes_.end(),
433               SBAddPrefixLess<SBSubPrefix,SBSubPrefix>);
434     std::sort(add_full_hashes_.begin(), add_full_hashes_.end(),
435               SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>);
436     std::sort(sub_full_hashes_.begin(), sub_full_hashes_.end(),
437               SBAddPrefixHashLess<SBSubFullHash,SBSubFullHash>);
438   }
439 
440   // Iterator from the beginning of the state's data.
StateBegin()441   StateInternalPos StateBegin() {
442     return StateInternalPos(add_prefixes_.begin(),
443                             sub_prefixes_.begin(),
444                             add_full_hashes_.begin(),
445                             sub_full_hashes_.begin());
446   }
447 
448   // An iterator pointing just after the last possible element of the shard
449   // indicated by |shard_max|.  Used to step through the state by shard.
450   // TODO(shess): Verify whether binary search really improves over linear.
451   // Merging or writing will immediately touch all of these elements.
ShardEnd(const StateInternalPos & beg,SBPrefix shard_max)452   StateInternalPos ShardEnd(const StateInternalPos& beg, SBPrefix shard_max) {
453     return StateInternalPos(
454         std::upper_bound(beg.add_prefixes_iter_, add_prefixes_.end(),
455                          shard_max, prefix_bounder<SBAddPrefix>),
456         std::upper_bound(beg.sub_prefixes_iter_, sub_prefixes_.end(),
457                          shard_max, prefix_bounder<SBSubPrefix>),
458         std::upper_bound(beg.add_hashes_iter_, add_full_hashes_.end(),
459                          shard_max, prefix_bounder<SBAddFullHash>),
460         std::upper_bound(beg.sub_hashes_iter_, sub_full_hashes_.end(),
461                          shard_max, prefix_bounder<SBSubFullHash>));
462   }
463 
464   // Write a shard header and data for the shard starting at |beg| and ending at
465   // the element before |end|.
WriteShard(const StateInternalPos & beg,const StateInternalPos & end,FILE * fp,base::MD5Context * context)466   bool WriteShard(const StateInternalPos& beg, const StateInternalPos& end,
467                   FILE* fp, base::MD5Context* context) {
468     ShardHeader shard_header;
469     shard_header.add_prefix_count =
470         end.add_prefixes_iter_ - beg.add_prefixes_iter_;
471     shard_header.sub_prefix_count =
472         end.sub_prefixes_iter_ - beg.sub_prefixes_iter_;
473     shard_header.add_hash_count =
474         end.add_hashes_iter_ - beg.add_hashes_iter_;
475     shard_header.sub_hash_count =
476         end.sub_hashes_iter_ - beg.sub_hashes_iter_;
477 
478     return
479         WriteItem(shard_header, fp, context) &&
480         WriteRange(beg.add_prefixes_iter_, end.add_prefixes_iter_,
481                    fp, context) &&
482         WriteRange(beg.sub_prefixes_iter_, end.sub_prefixes_iter_,
483                    fp, context) &&
484         WriteRange(beg.add_hashes_iter_, end.add_hashes_iter_,
485                    fp, context) &&
486         WriteRange(beg.sub_hashes_iter_, end.sub_hashes_iter_,
487                    fp, context);
488   }
489 
490   SBAddPrefixes add_prefixes_;
491   SBSubPrefixes sub_prefixes_;
492   std::vector<SBAddFullHash> add_full_hashes_;
493   std::vector<SBSubFullHash> sub_full_hashes_;
494 };
495 
496 // True if |val| is an even power of two.
497 template <typename T>
IsPowerOfTwo(const T & val)498 bool IsPowerOfTwo(const T& val) {
499   return val && (val & (val - 1)) == 0;
500 }
501 
502 // Helper to read the entire database state, used by GetAddPrefixes() and
503 // GetAddFullHashes().  Those functions are generally used only for smaller
504 // files.  Returns false in case of errors reading the data.
ReadDbStateHelper(const base::FilePath & filename,StateInternal * db_state)505 bool ReadDbStateHelper(const base::FilePath& filename,
506                        StateInternal* db_state) {
507   base::ScopedFILE file(base::OpenFile(filename, "rb"));
508   if (file.get() == NULL)
509     return false;
510 
511   std::set<int32> add_chunks;
512   std::set<int32> sub_chunks;
513 
514   base::MD5Context context;
515   FileHeader header;
516   const int version =
517       ReadAndVerifyHeader(filename, &header, &add_chunks, &sub_chunks,
518                           file.get(), &context);
519   if (version == kInvalidVersion)
520     return false;
521 
522   uint64 in_min = 0;
523   uint64 in_stride = header.shard_stride;
524   if (!in_stride)
525     in_stride = kMaxShardStride;
526   if (!IsPowerOfTwo(in_stride))
527     return false;
528 
529   do {
530     ShardHeader shard_header;
531     if (!ReadItem(&shard_header, file.get(), &context))
532       return false;
533 
534     if (!db_state->AppendData(shard_header.add_prefix_count,
535                               shard_header.sub_prefix_count,
536                               shard_header.add_hash_count,
537                               shard_header.sub_hash_count,
538                               file.get(), &context)) {
539       return false;
540     }
541 
542     in_min += in_stride;
543   } while (in_min <= kMaxSBPrefix);
544 
545   if (!ReadAndVerifyChecksum(file.get(), &context))
546     return false;
547 
548   int64 size = 0;
549   if (!base::GetFileSize(filename, &size))
550     return false;
551 
552   return static_cast<int64>(ftell(file.get())) == size;
553 }
554 
555 }  // namespace
556 
SafeBrowsingStoreFile()557 SafeBrowsingStoreFile::SafeBrowsingStoreFile()
558     : chunks_written_(0), empty_(false), corruption_seen_(false) {}
559 
~SafeBrowsingStoreFile()560 SafeBrowsingStoreFile::~SafeBrowsingStoreFile() {
561   Close();
562 }
563 
Delete()564 bool SafeBrowsingStoreFile::Delete() {
565   // The database should not be open at this point.  But, just in
566   // case, close everything before deleting.
567   if (!Close()) {
568     NOTREACHED();
569     return false;
570   }
571 
572   return DeleteStore(filename_);
573 }
574 
CheckValidity()575 bool SafeBrowsingStoreFile::CheckValidity() {
576   // The file was either empty or never opened.  The empty case is
577   // presumed not to be invalid.  The never-opened case can happen if
578   // BeginUpdate() fails for any databases, and should already have
579   // caused the corruption callback to fire.
580   if (!file_.get())
581     return true;
582 
583   if (!FileRewind(file_.get()))
584     return OnCorruptDatabase();
585 
586   int64 size = 0;
587   if (!base::GetFileSize(filename_, &size))
588     return OnCorruptDatabase();
589 
590   base::MD5Context context;
591   base::MD5Init(&context);
592 
593   // Read everything except the final digest.
594   size_t bytes_left = static_cast<size_t>(size);
595   CHECK(size == static_cast<int64>(bytes_left));
596   if (bytes_left < sizeof(base::MD5Digest))
597     return OnCorruptDatabase();
598   bytes_left -= sizeof(base::MD5Digest);
599 
600   // Fold the contents of the file into the checksum.
601   while (bytes_left > 0) {
602     char buf[4096];
603     const size_t c = std::min(sizeof(buf), bytes_left);
604     const size_t ret = fread(buf, 1, c, file_.get());
605 
606     // The file's size changed while reading, give up.
607     if (ret != c)
608       return OnCorruptDatabase();
609     base::MD5Update(&context, base::StringPiece(buf, c));
610     bytes_left -= c;
611   }
612 
613   if (!ReadAndVerifyChecksum(file_.get(), &context)) {
614     RecordFormatEvent(FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE);
615     return OnCorruptDatabase();
616   }
617 
618   return true;
619 }
620 
Init(const base::FilePath & filename,const base::Closure & corruption_callback)621 void SafeBrowsingStoreFile::Init(
622     const base::FilePath& filename,
623     const base::Closure& corruption_callback
624 ) {
625   filename_ = filename;
626   corruption_callback_ = corruption_callback;
627 }
628 
BeginChunk()629 bool SafeBrowsingStoreFile::BeginChunk() {
630   return ClearChunkBuffers();
631 }
632 
WriteAddPrefix(int32 chunk_id,SBPrefix prefix)633 bool SafeBrowsingStoreFile::WriteAddPrefix(int32 chunk_id, SBPrefix prefix) {
634   add_prefixes_.push_back(SBAddPrefix(chunk_id, prefix));
635   return true;
636 }
637 
GetAddPrefixes(SBAddPrefixes * add_prefixes)638 bool SafeBrowsingStoreFile::GetAddPrefixes(SBAddPrefixes* add_prefixes) {
639   add_prefixes->clear();
640   if (!base::PathExists(filename_))
641     return true;
642 
643   StateInternal db_state;
644   if (!ReadDbStateHelper(filename_, &db_state))
645     return OnCorruptDatabase();
646 
647   add_prefixes->swap(db_state.add_prefixes_);
648   return true;
649 }
650 
GetAddFullHashes(std::vector<SBAddFullHash> * add_full_hashes)651 bool SafeBrowsingStoreFile::GetAddFullHashes(
652     std::vector<SBAddFullHash>* add_full_hashes) {
653   add_full_hashes->clear();
654   if (!base::PathExists(filename_))
655     return true;
656 
657   StateInternal db_state;
658   if (!ReadDbStateHelper(filename_, &db_state))
659     return OnCorruptDatabase();
660 
661   add_full_hashes->swap(db_state.add_full_hashes_);
662   return true;
663 }
664 
WriteAddHash(int32 chunk_id,const SBFullHash & full_hash)665 bool SafeBrowsingStoreFile::WriteAddHash(int32 chunk_id,
666                                          const SBFullHash& full_hash) {
667   add_hashes_.push_back(SBAddFullHash(chunk_id, full_hash));
668   return true;
669 }
670 
WriteSubPrefix(int32 chunk_id,int32 add_chunk_id,SBPrefix prefix)671 bool SafeBrowsingStoreFile::WriteSubPrefix(int32 chunk_id,
672                                            int32 add_chunk_id,
673                                            SBPrefix prefix) {
674   sub_prefixes_.push_back(SBSubPrefix(chunk_id, add_chunk_id, prefix));
675   return true;
676 }
677 
WriteSubHash(int32 chunk_id,int32 add_chunk_id,const SBFullHash & full_hash)678 bool SafeBrowsingStoreFile::WriteSubHash(int32 chunk_id, int32 add_chunk_id,
679                                          const SBFullHash& full_hash) {
680   sub_hashes_.push_back(SBSubFullHash(chunk_id, add_chunk_id, full_hash));
681   return true;
682 }
683 
OnCorruptDatabase()684 bool SafeBrowsingStoreFile::OnCorruptDatabase() {
685   if (!corruption_seen_)
686     RecordFormatEvent(FORMAT_EVENT_FILE_CORRUPT);
687   corruption_seen_ = true;
688 
689   corruption_callback_.Run();
690 
691   // Return false as a convenience to callers.
692   return false;
693 }
694 
Close()695 bool SafeBrowsingStoreFile::Close() {
696   ClearUpdateBuffers();
697 
698   // Make sure the files are closed.
699   file_.reset();
700   new_file_.reset();
701   return true;
702 }
703 
BeginUpdate()704 bool SafeBrowsingStoreFile::BeginUpdate() {
705   DCHECK(!file_.get() && !new_file_.get());
706 
707   // Structures should all be clear unless something bad happened.
708   DCHECK(add_chunks_cache_.empty());
709   DCHECK(sub_chunks_cache_.empty());
710   DCHECK(add_del_cache_.empty());
711   DCHECK(sub_del_cache_.empty());
712   DCHECK(add_prefixes_.empty());
713   DCHECK(sub_prefixes_.empty());
714   DCHECK(add_hashes_.empty());
715   DCHECK(sub_hashes_.empty());
716   DCHECK_EQ(chunks_written_, 0);
717 
718   corruption_seen_ = false;
719 
720   const base::FilePath new_filename = TemporaryFileForFilename(filename_);
721   base::ScopedFILE new_file(base::OpenFile(new_filename, "wb+"));
722   if (new_file.get() == NULL)
723     return false;
724 
725   base::ScopedFILE file(base::OpenFile(filename_, "rb"));
726   empty_ = (file.get() == NULL);
727   if (empty_) {
728     // If the file exists but cannot be opened, try to delete it (not
729     // deleting directly, the bloom filter needs to be deleted, too).
730     if (base::PathExists(filename_))
731       return OnCorruptDatabase();
732 
733     new_file_.swap(new_file);
734     return true;
735   }
736 
737   base::MD5Context context;
738   FileHeader header;
739   const int version =
740       ReadAndVerifyHeader(filename_, &header,
741                           &add_chunks_cache_, &sub_chunks_cache_,
742                           file.get(), &context);
743   if (version == kInvalidVersion) {
744     FileHeader retry_header;
745     if (FileRewind(file.get()) && ReadItem(&retry_header, file.get(), NULL)) {
746       if (retry_header.magic == kFileMagic &&
747           retry_header.version < kFileVersion) {
748         RecordFormatEvent(FORMAT_EVENT_FOUND_DEPRECATED);
749       } else {
750         RecordFormatEvent(FORMAT_EVENT_FOUND_UNKNOWN);
751       }
752     }
753 
754     // Close the file so that it can be deleted.
755     file.reset();
756 
757     return OnCorruptDatabase();
758   }
759 
760   file_.swap(file);
761   new_file_.swap(new_file);
762   return true;
763 }
764 
FinishChunk()765 bool SafeBrowsingStoreFile::FinishChunk() {
766   if (!add_prefixes_.size() && !sub_prefixes_.size() &&
767       !add_hashes_.size() && !sub_hashes_.size())
768     return true;
769 
770   ChunkHeader header;
771   header.add_prefix_count = add_prefixes_.size();
772   header.sub_prefix_count = sub_prefixes_.size();
773   header.add_hash_count = add_hashes_.size();
774   header.sub_hash_count = sub_hashes_.size();
775   if (!WriteItem(header, new_file_.get(), NULL))
776     return false;
777 
778   if (!WriteContainer(add_prefixes_, new_file_.get(), NULL) ||
779       !WriteContainer(sub_prefixes_, new_file_.get(), NULL) ||
780       !WriteContainer(add_hashes_, new_file_.get(), NULL) ||
781       !WriteContainer(sub_hashes_, new_file_.get(), NULL))
782     return false;
783 
784   ++chunks_written_;
785 
786   // Clear everything to save memory.
787   return ClearChunkBuffers();
788 }
789 
DoUpdate(safe_browsing::PrefixSetBuilder * builder,std::vector<SBAddFullHash> * add_full_hashes_result)790 bool SafeBrowsingStoreFile::DoUpdate(
791     safe_browsing::PrefixSetBuilder* builder,
792     std::vector<SBAddFullHash>* add_full_hashes_result) {
793   DCHECK(file_.get() || empty_);
794   DCHECK(new_file_.get());
795   CHECK(builder);
796   CHECK(add_full_hashes_result);
797 
798   // Rewind the temporary storage.
799   if (!FileRewind(new_file_.get()))
800     return false;
801 
802   // Get chunk file's size for validating counts.
803   int64 update_size = 0;
804   if (!base::GetFileSize(TemporaryFileForFilename(filename_), &update_size))
805     return OnCorruptDatabase();
806 
807   // Track update size to answer questions at http://crbug.com/72216 .
808   // Log small updates as 1k so that the 0 (underflow) bucket can be
809   // used for "empty" in SafeBrowsingDatabase.
810   UMA_HISTOGRAM_COUNTS("SB2.DatabaseUpdateKilobytes",
811                        std::max(static_cast<int>(update_size / 1024), 1));
812 
813   // Chunk updates to integrate.
814   StateInternal new_state;
815 
816   // Read update chunks.
817   for (int i = 0; i < chunks_written_; ++i) {
818     ChunkHeader header;
819 
820     int64 ofs = ftell(new_file_.get());
821     if (ofs == -1)
822       return false;
823 
824     if (!ReadItem(&header, new_file_.get(), NULL))
825       return false;
826 
827     // As a safety measure, make sure that the header describes a sane
828     // chunk, given the remaining file size.
829     int64 expected_size = ofs + sizeof(ChunkHeader);
830     expected_size += header.add_prefix_count * sizeof(SBAddPrefix);
831     expected_size += header.sub_prefix_count * sizeof(SBSubPrefix);
832     expected_size += header.add_hash_count * sizeof(SBAddFullHash);
833     expected_size += header.sub_hash_count * sizeof(SBSubFullHash);
834     if (expected_size > update_size)
835       return false;
836 
837     if (!new_state.AppendData(header.add_prefix_count, header.sub_prefix_count,
838                               header.add_hash_count, header.sub_hash_count,
839                               new_file_.get(), NULL)) {
840       return false;
841     }
842   }
843 
844   // The state was accumulated by chunk, sort by prefix.
845   new_state.SortData();
846 
847   // These strides control how much data is loaded into memory per pass.
848   // Strides must be an even power of two.  |in_stride| will be derived from the
849   // input file.  |out_stride| will be derived from an estimate of the resulting
850   // file's size.  |process_stride| will be the max of both.
851   uint64 in_stride = kMaxShardStride;
852   uint64 out_stride = kMaxShardStride;
853   uint64 process_stride = 0;
854 
855   // Used to verify the input's checksum if |!empty_|.
856   base::MD5Context in_context;
857 
858   if (!empty_) {
859     DCHECK(file_.get());
860 
861     FileHeader header = {0};
862     int version = ReadAndVerifyHeader(filename_, &header,
863                                       &add_chunks_cache_, &sub_chunks_cache_,
864                                       file_.get(), &in_context);
865     if (version == kInvalidVersion)
866       return OnCorruptDatabase();
867 
868     if (header.shard_stride)
869       in_stride = header.shard_stride;
870 
871     // The header checksum should have prevented this case, but the code will be
872     // broken if this is not correct.
873     if (!IsPowerOfTwo(in_stride))
874       return OnCorruptDatabase();
875   }
876 
877   // We no longer need to track deleted chunks.
878   DeleteChunksFromSet(add_del_cache_, &add_chunks_cache_);
879   DeleteChunksFromSet(sub_del_cache_, &sub_chunks_cache_);
880 
881   // Calculate |out_stride| to break the file down into reasonable shards.
882   {
883     int64 original_size = 0;
884     if (!empty_ && !base::GetFileSize(filename_, &original_size))
885       return OnCorruptDatabase();
886 
887     // Approximate the final size as everything.  Subs and deletes will reduce
888     // the size, but modest over-sharding won't hurt much.
889     int64 shard_size = original_size + update_size;
890 
891     // Keep splitting until a single stride of data fits the target.
892     size_t shifts = 0;
893     while (out_stride > kMinShardStride && shard_size > kUpdateStorageBytes) {
894       out_stride >>= 1;
895       shard_size >>= 1;
896       ++shifts;
897     }
898     UMA_HISTOGRAM_COUNTS("SB2.OutShardShifts", shifts);
899 
900     DCHECK(IsPowerOfTwo(out_stride));
901   }
902 
903   // Outer loop strides by the max of the input stride (to read integral shards)
904   // and the output stride (to write integral shards).
905   process_stride = std::max(in_stride, out_stride);
906   DCHECK(IsPowerOfTwo(process_stride));
907   DCHECK_EQ(0u, process_stride % in_stride);
908   DCHECK_EQ(0u, process_stride % out_stride);
909 
910   // Start writing the new data to |new_file_|.
911   base::MD5Context out_context;
912   if (!WriteHeader(out_stride, add_chunks_cache_, sub_chunks_cache_,
913                    new_file_.get(), &out_context)) {
914     return false;
915   }
916 
917   // Start at the beginning of the SBPrefix space.
918   uint64 in_min = 0;
919   uint64 out_min = 0;
920   uint64 process_min = 0;
921 
922   // Start at the beginning of the updates.
923   StateInternalPos new_pos = new_state.StateBegin();
924 
925   // Re-usable container for shard processing.
926   StateInternal db_state;
927 
928   // Track aggregate counts for histograms.
929   size_t add_prefix_count = 0;
930   size_t sub_prefix_count = 0;
931 
932   do {
933     // Maximum element in the current shard.
934     SBPrefix process_max =
935         static_cast<SBPrefix>(process_min + process_stride - 1);
936     DCHECK_GT(process_max, process_min);
937 
938     // Drop the data from previous pass.
939     db_state.ClearData();
940 
941     // Fill the processing shard with one or more input shards.
942     if (!empty_) {
943       do {
944         ShardHeader shard_header;
945         if (!ReadItem(&shard_header, file_.get(), &in_context))
946           return OnCorruptDatabase();
947 
948         if (!db_state.AppendData(shard_header.add_prefix_count,
949                                  shard_header.sub_prefix_count,
950                                  shard_header.add_hash_count,
951                                  shard_header.sub_hash_count,
952                                  file_.get(), &in_context))
953           return OnCorruptDatabase();
954 
955         in_min += in_stride;
956       } while (in_min <= kMaxSBPrefix && in_min < process_max);
957     }
958 
959     // Shard the update data to match the database data, then merge the update
960     // data and process the results.
961     {
962       StateInternalPos new_end = new_state.ShardEnd(new_pos, process_max);
963       db_state.MergeDataAndProcess(new_pos, new_end,
964                                    add_del_cache_, sub_del_cache_);
965       new_pos = new_end;
966     }
967 
968     // Collect the processed data for return to caller.
969     for (size_t i = 0; i < db_state.add_prefixes_.size(); ++i) {
970       builder->AddPrefix(db_state.add_prefixes_[i].prefix);
971     }
972     add_full_hashes_result->insert(add_full_hashes_result->end(),
973                                    db_state.add_full_hashes_.begin(),
974                                    db_state.add_full_hashes_.end());
975     add_prefix_count += db_state.add_prefixes_.size();
976     sub_prefix_count += db_state.sub_prefixes_.size();
977 
978     // Write one or more shards of processed output.
979     StateInternalPos out_pos = db_state.StateBegin();
980     do {
981       SBPrefix out_max = static_cast<SBPrefix>(out_min + out_stride - 1);
982       DCHECK_GT(out_max, out_min);
983 
984       StateInternalPos out_end = db_state.ShardEnd(out_pos, out_max);
985       if (!db_state.WriteShard(out_pos, out_end, new_file_.get(), &out_context))
986         return false;
987       out_pos = out_end;
988 
989       out_min += out_stride;
990     } while (out_min == static_cast<SBPrefix>(out_min) &&
991              out_min < process_max);
992 
993     process_min += process_stride;
994   } while (process_min <= kMaxSBPrefix);
995 
996   // Verify the overall checksum.
997   if (!empty_) {
998     if (!ReadAndVerifyChecksum(file_.get(), &in_context)) {
999       RecordFormatEvent(FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE);
1000       return OnCorruptDatabase();
1001     }
1002 
1003     // TODO(shess): Verify EOF?
1004 
1005     // Close the input file so the new file can be renamed over it.
1006     file_.reset();
1007   }
1008   DCHECK(!file_.get());
1009 
1010   // Write the overall checksum.
1011   base::MD5Digest out_digest;
1012   base::MD5Final(&out_digest, &out_context);
1013   if (!WriteItem(out_digest, new_file_.get(), NULL))
1014     return false;
1015 
1016   // Trim any excess left over from the temporary chunk data.
1017   if (!base::TruncateFile(new_file_.get()))
1018     return false;
1019 
1020   // Close the file handle and swizzle the file into place.
1021   new_file_.reset();
1022   if (!base::DeleteFile(filename_, false) &&
1023       base::PathExists(filename_))
1024     return false;
1025 
1026   const base::FilePath new_filename = TemporaryFileForFilename(filename_);
1027   if (!base::Move(new_filename, filename_))
1028     return false;
1029 
1030   // Record counts before swapping to caller.
1031   UMA_HISTOGRAM_COUNTS("SB2.AddPrefixes", add_prefix_count);
1032   UMA_HISTOGRAM_COUNTS("SB2.SubPrefixes", sub_prefix_count);
1033 
1034   return true;
1035 }
1036 
FinishUpdate(safe_browsing::PrefixSetBuilder * builder,std::vector<SBAddFullHash> * add_full_hashes_result)1037 bool SafeBrowsingStoreFile::FinishUpdate(
1038     safe_browsing::PrefixSetBuilder* builder,
1039     std::vector<SBAddFullHash>* add_full_hashes_result) {
1040   DCHECK(builder);
1041   DCHECK(add_full_hashes_result);
1042 
1043   if (!DoUpdate(builder, add_full_hashes_result)) {
1044     CancelUpdate();
1045     return false;
1046   }
1047 
1048   DCHECK(!new_file_.get());
1049   DCHECK(!file_.get());
1050 
1051   return Close();
1052 }
1053 
CancelUpdate()1054 bool SafeBrowsingStoreFile::CancelUpdate() {
1055   bool ret = Close();
1056 
1057   // Delete stale staging file.
1058   const base::FilePath new_filename = TemporaryFileForFilename(filename_);
1059   base::DeleteFile(new_filename, false);
1060 
1061   return ret;
1062 }
1063 
SetAddChunk(int32 chunk_id)1064 void SafeBrowsingStoreFile::SetAddChunk(int32 chunk_id) {
1065   add_chunks_cache_.insert(chunk_id);
1066 }
1067 
CheckAddChunk(int32 chunk_id)1068 bool SafeBrowsingStoreFile::CheckAddChunk(int32 chunk_id) {
1069   return add_chunks_cache_.count(chunk_id) > 0;
1070 }
1071 
GetAddChunks(std::vector<int32> * out)1072 void SafeBrowsingStoreFile::GetAddChunks(std::vector<int32>* out) {
1073   out->clear();
1074   out->insert(out->end(), add_chunks_cache_.begin(), add_chunks_cache_.end());
1075 }
1076 
SetSubChunk(int32 chunk_id)1077 void SafeBrowsingStoreFile::SetSubChunk(int32 chunk_id) {
1078   sub_chunks_cache_.insert(chunk_id);
1079 }
1080 
CheckSubChunk(int32 chunk_id)1081 bool SafeBrowsingStoreFile::CheckSubChunk(int32 chunk_id) {
1082   return sub_chunks_cache_.count(chunk_id) > 0;
1083 }
1084 
GetSubChunks(std::vector<int32> * out)1085 void SafeBrowsingStoreFile::GetSubChunks(std::vector<int32>* out) {
1086   out->clear();
1087   out->insert(out->end(), sub_chunks_cache_.begin(), sub_chunks_cache_.end());
1088 }
1089 
DeleteAddChunk(int32 chunk_id)1090 void SafeBrowsingStoreFile::DeleteAddChunk(int32 chunk_id) {
1091   add_del_cache_.insert(chunk_id);
1092 }
1093 
DeleteSubChunk(int32 chunk_id)1094 void SafeBrowsingStoreFile::DeleteSubChunk(int32 chunk_id) {
1095   sub_del_cache_.insert(chunk_id);
1096 }
1097 
1098 // static
DeleteStore(const base::FilePath & basename)1099 bool SafeBrowsingStoreFile::DeleteStore(const base::FilePath& basename) {
1100   if (!base::DeleteFile(basename, false) &&
1101       base::PathExists(basename)) {
1102     NOTREACHED();
1103     return false;
1104   }
1105 
1106   const base::FilePath new_filename = TemporaryFileForFilename(basename);
1107   if (!base::DeleteFile(new_filename, false) &&
1108       base::PathExists(new_filename)) {
1109     NOTREACHED();
1110     return false;
1111   }
1112 
1113   // With SQLite support gone, one way to get to this code is if the
1114   // existing file is a SQLite file.  Make sure the journal file is
1115   // also removed.
1116   const base::FilePath journal_filename(
1117       basename.value() + FILE_PATH_LITERAL("-journal"));
1118   if (base::PathExists(journal_filename))
1119     base::DeleteFile(journal_filename, false);
1120 
1121   return true;
1122 }
1123