• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_
6 #define CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_
7 
8 #include <set>
9 #include <vector>
10 
11 #include "chrome/browser/safe_browsing/safe_browsing_store.h"
12 
13 #include "base/callback.h"
14 #include "base/files/file_path.h"
15 #include "base/files/scoped_file.h"
16 
17 // Implement SafeBrowsingStore in terms of a flat file.  The file
18 // format is pretty literal:
19 //
20 // int32 magic;             // magic number "validating" file
21 // int32 version;           // format version
22 //
23 // // Counts for the various data which follows the header.
24 // uint32 add_chunk_count;  // Chunks seen, including empties.
25 // uint32 sub_chunk_count;  // Ditto.
26 // uint32 shard_stride;     // SBPrefix space covered per shard.
27 //                          // 0==entire space in one shard.
28 // // Sorted by chunk_id.
29 // array[add_chunk_count] {
30 //   int32 chunk_id;
31 // }
32 // // Sorted by chunk_id.
33 // array[sub_chunk_count] {
34 //   int32 chunk_id;
35 // }
36 // MD5Digest header_checksum;  // Checksum over preceding data.
37 //
38 // // Sorted by prefix, then add chunk_id, then hash, both within shards and
39 // // overall.
40 // array[from 0 to wraparound to 0 by shard_stride] {
41 //   uint32 add_prefix_count;
42 //   uint32 sub_prefix_count;
43 //   uint32 add_hash_count;
44 //   uint32 sub_hash_count;
45 //   array[add_prefix_count] {
46 //     int32 chunk_id;
47 //     uint32 prefix;
48 //   }
49 //   array[sub_prefix_count] {
50 //     int32 chunk_id;
51 //     int32 add_chunk_id;
52 //     uint32 add_prefix;
53 //   }
54 //   array[add_hash_count] {
55 //     int32 chunk_id;
56 //     int32 received_time;     // From base::Time::ToTimeT().
57 //     char[32] full_hash;
58 //   }
59 //   array[sub_hash_count] {
60 //     int32 chunk_id;
61 //     int32 add_chunk_id;
62 //     char[32] add_full_hash;
63 //   }
64 // }
65 // MD5Digest checksum;      // Checksum over entire file.
66 //
67 // The checksums are used to allow writing the file without doing an expensive
68 // fsync().  Since the data can be re-fetched, failing the checksum is not
69 // catastrophic.  Histograms indicate that file corruption here is pretty
70 // uncommon.
71 //
72 // The |header_checksum| is present to guarantee valid header and chunk data for
73 // updates.  Only that part of the file needs to be read to post the update.
74 //
75 // |shard_stride| breaks the file into approximately-equal portions, allowing
76 // updates to stream from one file to another with modest memory usage.  It is
77 // dynamic to adjust to different file sizes without adding excessive overhead.
78 //
79 // During the course of an update, uncommitted data is stored in a
80 // temporary file (which is later re-used to commit).  This is an
81 // array of chunks, with the count kept in memory until the end of the
82 // transaction.  The format of this file is like the main file, with
83 // the list of chunks seen omitted, as that data is tracked in-memory:
84 //
85 // array[] {
86 //   uint32 add_prefix_count;
87 //   uint32 sub_prefix_count;
88 //   uint32 add_hash_count;
89 //   uint32 sub_hash_count;
90 //   array[add_prefix_count] {
91 //     int32 chunk_id;
92 //     uint32 prefix;
93 //   }
94 //   array[sub_prefix_count] {
95 //     int32 chunk_id;
96 //     int32 add_chunk_id;
97 //     uint32 add_prefix;
98 //   }
99 //   array[add_hash_count] {
100 //     int32 chunk_id;
101 //     int32 received_time;     // From base::Time::ToTimeT().
102 //     char[32] full_hash;
103 //   }
104 //   array[sub_hash_count] {
105 //     int32 chunk_id;
106 //     int32 add_chunk_id;
107 //     char[32] add_full_hash;
108 //   }
109 // }
110 //
111 // The overall transaction works like this:
112 // - Open the original file to get the chunks-seen data.
113 // - Open a temp file for storing new chunk info.
114 // - Write new chunks to the temp file.
115 // - When the transaction is finished:
116 //   - Read the update data from the temp file into memory.
117 //   - Overwrite the temp file with new header data.
118 //   - Until done:
119 //     - Read shards of the original file's data into memory.
120 //     - Merge from the update data.
121 //     - Write shards to the temp file.
122 //   - Delete original file.
123 //   - Rename temp file to original filename.
124 
125 class SafeBrowsingStoreFile : public SafeBrowsingStore {
126  public:
127   SafeBrowsingStoreFile();
128   virtual ~SafeBrowsingStoreFile();
129 
130   virtual void Init(const base::FilePath& filename,
131                     const base::Closure& corruption_callback) OVERRIDE;
132 
133   // Delete any on-disk files, including the permanent storage.
134   virtual bool Delete() OVERRIDE;
135 
136   // Get all add hash prefixes and full-length hashes, respectively, from
137   // the store.
138   virtual bool GetAddPrefixes(SBAddPrefixes* add_prefixes) OVERRIDE;
139   virtual bool GetAddFullHashes(
140       std::vector<SBAddFullHash>* add_full_hashes) OVERRIDE;
141 
142   virtual bool BeginChunk() OVERRIDE;
143 
144   virtual bool WriteAddPrefix(int32 chunk_id, SBPrefix prefix) OVERRIDE;
145   virtual bool WriteAddHash(int32 chunk_id,
146                             const SBFullHash& full_hash) OVERRIDE;
147   virtual bool WriteSubPrefix(int32 chunk_id,
148                               int32 add_chunk_id, SBPrefix prefix) OVERRIDE;
149   virtual bool WriteSubHash(int32 chunk_id, int32 add_chunk_id,
150                             const SBFullHash& full_hash) OVERRIDE;
151   virtual bool FinishChunk() OVERRIDE;
152 
153   virtual bool BeginUpdate() OVERRIDE;
154   virtual bool FinishUpdate(
155       safe_browsing::PrefixSetBuilder* builder,
156       std::vector<SBAddFullHash>* add_full_hashes_result) OVERRIDE;
157   virtual bool CancelUpdate() OVERRIDE;
158 
159   virtual void SetAddChunk(int32 chunk_id) OVERRIDE;
160   virtual bool CheckAddChunk(int32 chunk_id) OVERRIDE;
161   virtual void GetAddChunks(std::vector<int32>* out) OVERRIDE;
162   virtual void SetSubChunk(int32 chunk_id) OVERRIDE;
163   virtual bool CheckSubChunk(int32 chunk_id) OVERRIDE;
164   virtual void GetSubChunks(std::vector<int32>* out) OVERRIDE;
165 
166   virtual void DeleteAddChunk(int32 chunk_id) OVERRIDE;
167   virtual void DeleteSubChunk(int32 chunk_id) OVERRIDE;
168 
169   // Verify |file_|'s checksum, calling the corruption callback if it
170   // does not check out.  Empty input is considered valid.
171   virtual bool CheckValidity() OVERRIDE;
172 
173   // Returns the name of the temporary file used to buffer data for
174   // |filename|.  Exported for unit tests.
TemporaryFileForFilename(const base::FilePath & filename)175   static const base::FilePath TemporaryFileForFilename(
176       const base::FilePath& filename) {
177     return base::FilePath(filename.value() + FILE_PATH_LITERAL("_new"));
178   }
179 
180   // Delete any on-disk files, including the permanent storage.
181   static bool DeleteStore(const base::FilePath& basename);
182 
183  private:
184   // Does the actual update for FinishUpdate(), so that FinishUpdate() can clean
185   // up correctly in case of error.
186   virtual bool DoUpdate(safe_browsing::PrefixSetBuilder* builder,
187                         std::vector<SBAddFullHash>* add_full_hashes_result);
188 
189   // Some very lucky users have an original-format file still in their
190   // profile.  Check for it and delete, recording a histogram for the
191   // result (no histogram for not-found).  Logically this
192   // would make more sense at the SafeBrowsingDatabase level, but
193   // practically speaking that code doesn't touch files directly.
194   static void CheckForOriginalAndDelete(const base::FilePath& filename);
195 
196   // Close all files and clear all buffers.
197   bool Close();
198 
199   // Calls |corruption_callback_| if non-NULL, always returns false as
200   // a convenience to the caller.
201   bool OnCorruptDatabase();
202 
203   // Helper for creating a corruption callback for |old_store_|.
204   // TODO(shess): Remove after migration.
205   void HandleCorruptDatabase();
206 
207   // Clear temporary buffers used to accumulate chunk data.
ClearChunkBuffers()208   bool ClearChunkBuffers() {
209     // NOTE: .clear() doesn't release memory.
210     // TODO(shess): Figure out if this is overkill.  Some amount of
211     // pre-reserved space is probably reasonable between each chunk
212     // collected.
213     SBAddPrefixes().swap(add_prefixes_);
214     SBSubPrefixes().swap(sub_prefixes_);
215     std::vector<SBAddFullHash>().swap(add_hashes_);
216     std::vector<SBSubFullHash>().swap(sub_hashes_);
217     return true;
218   }
219 
220   // Clear all buffers used during update.
ClearUpdateBuffers()221   void ClearUpdateBuffers() {
222     ClearChunkBuffers();
223     chunks_written_ = 0;
224     std::set<int32>().swap(add_chunks_cache_);
225     std::set<int32>().swap(sub_chunks_cache_);
226     base::hash_set<int32>().swap(add_del_cache_);
227     base::hash_set<int32>().swap(sub_del_cache_);
228   }
229 
230   // Buffers for collecting data between BeginChunk() and
231   // FinishChunk().
232   SBAddPrefixes add_prefixes_;
233   SBSubPrefixes sub_prefixes_;
234   std::vector<SBAddFullHash> add_hashes_;
235   std::vector<SBSubFullHash> sub_hashes_;
236 
237   // Count of chunks collected in |new_file_|.
238   int chunks_written_;
239 
240   // Name of the main database file.
241   base::FilePath filename_;
242 
243   // Handles to the main and scratch files.  |empty_| is true if the
244   // main file didn't exist when the update was started.
245   base::ScopedFILE file_;
246   base::ScopedFILE new_file_;
247   bool empty_;
248 
249   // Cache of chunks which have been seen.  Loaded from the database
250   // on BeginUpdate() so that it can be queried during the
251   // transaction.
252   std::set<int32> add_chunks_cache_;
253   std::set<int32> sub_chunks_cache_;
254 
255   // Cache the set of deleted chunks during a transaction, applied on
256   // FinishUpdate().
257   // TODO(shess): If the set is small enough, hash_set<> might be
258   // slower than plain set<>.
259   base::hash_set<int32> add_del_cache_;
260   base::hash_set<int32> sub_del_cache_;
261 
262   base::Closure corruption_callback_;
263 
264   // Tracks whether corruption has already been seen in the current
265   // update, so that only one instance is recorded in the stats.
266   // TODO(shess): Remove with format-migration support.
267   bool corruption_seen_;
268 
269   DISALLOW_COPY_AND_ASSIGN(SafeBrowsingStoreFile);
270 };
271 
272 #endif  // CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_
273