// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4 
5 #ifndef CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_
6 #define CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_
7 #pragma once
8 
9 #include <set>
10 #include <vector>
11 
12 #include "chrome/browser/safe_browsing/safe_browsing_store.h"
13 
14 #include "base/callback.h"
15 #include "base/file_util.h"
16 
17 // Implement SafeBrowsingStore in terms of a flat file.  The file
18 // format is pretty literal:
19 //
20 // int32 magic;             // magic number "validating" file
21 // int32 version;           // format version
22 //
23 // // Counts for the various data which follows the header.
24 // uint32 add_chunk_count;   // Chunks seen, including empties.
25 // uint32 sub_chunk_count;   // Ditto.
26 // uint32 add_prefix_count;
27 // uint32 sub_prefix_count;
28 // uint32 add_hash_count;
29 // uint32 sub_hash_count;
30 //
31 // array[add_chunk_count] {
32 //   int32 chunk_id;
33 // }
34 // array[sub_chunk_count] {
35 //   int32 chunk_id;
36 // }
37 // array[add_prefix_count] {
38 //   int32 chunk_id;
39 //   int32 prefix;
40 // }
41 // array[sub_prefix_count] {
42 //   int32 chunk_id;
43 //   int32 add_chunk_id;
44 //   int32 add_prefix;
45 // }
// array[add_hash_count] {
//   int32 chunk_id;
//   int32 received_time;     // From base::Time::ToTimeT().
//   char[32] full_hash;
// }
// array[sub_hash_count] {
//   int32 chunk_id;
//   int32 add_chunk_id;
//   char[32] add_full_hash;
// }
// MD5Digest checksum;      // Checksum over preceding data.
56 //
57 // During the course of an update, uncommitted data is stored in a
58 // temporary file (which is later re-used to commit).  This is an
59 // array of chunks, with the count kept in memory until the end of the
60 // transaction.  The format of this file is like the main file, with
61 // the list of chunks seen omitted, as that data is tracked in-memory:
62 //
63 // array[] {
64 //   uint32 add_prefix_count;
65 //   uint32 sub_prefix_count;
66 //   uint32 add_hash_count;
67 //   uint32 sub_hash_count;
68 //   array[add_prefix_count] {
69 //     int32 chunk_id;
70 //     int32 prefix;
71 //   }
72 //   array[sub_prefix_count] {
73 //     int32 chunk_id;
74 //     int32 add_chunk_id;
75 //     int32 add_prefix;
76 //   }
77 //   array[add_hash_count] {
78 //     int32 chunk_id;
79 //     int32 received_time;     // From base::Time::ToTimeT().
80 //     char[32] full_hash;
81 //   }
82 //   array[sub_hash_count] {
83 //     int32 chunk_id;
84 //     int32 add_chunk_id;
85 //     char[32] add_full_hash;
86 //   }
87 // }
88 //
89 // The overall transaction works like this:
90 // - Open the original file to get the chunks-seen data.
91 // - Open a temp file for storing new chunk info.
92 // - Write new chunks to the temp file.
93 // - When the transaction is finished:
94 //   - Read the rest of the original file's data into buffers.
95 //   - Rewind the temp file and merge the new data into buffers.
96 //   - Process buffers for deletions and apply subs.
97 //   - Rewind and write the buffers out to temp file.
98 //   - Delete original file.
99 //   - Rename temp file to original filename.
100 
101 // TODO(shess): By using a checksum, this code can avoid doing an
102 // fsync(), at the possible cost of more frequently retrieving the
103 // full dataset.  Measure how often this occurs, and if it occurs too
104 // often, consider retaining the last known-good file for recovery
105 // purposes, rather than deleting it.
106 
107 class SafeBrowsingStoreFile : public SafeBrowsingStore {
108  public:
109   SafeBrowsingStoreFile();
110   virtual ~SafeBrowsingStoreFile();
111 
112   virtual void Init(const FilePath& filename,
113                     Callback0::Type* corruption_callback);
114 
115   // Delete any on-disk files, including the permanent storage.
116   virtual bool Delete();
117 
118   // Get all add hash prefixes and full-length hashes, respectively, from
119   // the store.
120   virtual bool GetAddPrefixes(std::vector<SBAddPrefix>* add_prefixes);
121   virtual bool GetAddFullHashes(std::vector<SBAddFullHash>* add_full_hashes);
122 
123   virtual bool BeginChunk();
124 
125   virtual bool WriteAddPrefix(int32 chunk_id, SBPrefix prefix);
126   virtual bool WriteAddHash(int32 chunk_id,
127                             base::Time receive_time,
128                             const SBFullHash& full_hash);
129   virtual bool WriteSubPrefix(int32 chunk_id,
130                               int32 add_chunk_id, SBPrefix prefix);
131   virtual bool WriteSubHash(int32 chunk_id, int32 add_chunk_id,
132                             const SBFullHash& full_hash);
133   virtual bool FinishChunk();
134 
135   virtual bool BeginUpdate();
136   // Store updates with pending add full hashes in file store and
137   // return |add_prefixes_result| and |add_full_hashes_result|.
138   virtual bool FinishUpdate(const std::vector<SBAddFullHash>& pending_adds,
139                             const std::set<SBPrefix>& prefix_misses,
140                             std::vector<SBAddPrefix>* add_prefixes_result,
141                             std::vector<SBAddFullHash>* add_full_hashes_result);
142   virtual bool CancelUpdate();
143 
144   virtual void SetAddChunk(int32 chunk_id);
145   virtual bool CheckAddChunk(int32 chunk_id);
146   virtual void GetAddChunks(std::vector<int32>* out);
147   virtual void SetSubChunk(int32 chunk_id);
148   virtual bool CheckSubChunk(int32 chunk_id);
149   virtual void GetSubChunks(std::vector<int32>* out);
150 
151   virtual void DeleteAddChunk(int32 chunk_id);
152   virtual void DeleteSubChunk(int32 chunk_id);
153 
154   // Returns the name of the temporary file used to buffer data for
155   // |filename|.  Exported for unit tests.
TemporaryFileForFilename(const FilePath & filename)156   static const FilePath TemporaryFileForFilename(const FilePath& filename) {
157     return FilePath(filename.value() + FILE_PATH_LITERAL("_new"));
158   }
159 
160  private:
161   // Update store file with pending full hashes.
162   virtual bool DoUpdate(const std::vector<SBAddFullHash>& pending_adds,
163                         const std::set<SBPrefix>& prefix_misses,
164                         std::vector<SBAddPrefix>* add_prefixes_result,
165                         std::vector<SBAddFullHash>* add_full_hashes_result);
166 
167   // Enumerate different format-change events for histogramming
168   // purposes.  DO NOT CHANGE THE ORDERING OF THESE VALUES.
169   // TODO(shess): Remove this once the format change is complete.
170   enum FormatEventType {
171     // Corruption detected, broken down by file format.
172     FORMAT_EVENT_FILE_CORRUPT,
173     FORMAT_EVENT_SQLITE_CORRUPT,  // Obsolete
174 
175     // The type of format found in the file.  The expected case (new
176     // file format) is intentionally not covered.
177     FORMAT_EVENT_FOUND_SQLITE,
178     FORMAT_EVENT_FOUND_UNKNOWN,
179 
180     // The number of SQLite-format files deleted should be the same as
181     // FORMAT_EVENT_FOUND_SQLITE.  It can differ if the delete fails,
182     // or if a failure prevents the update from succeeding.
183     FORMAT_EVENT_SQLITE_DELETED,  // Obsolete
184     FORMAT_EVENT_SQLITE_DELETE_FAILED,  // Obsolete
185 
186     // Found and deleted (or failed to delete) the ancient "Safe
187     // Browsing" file.
188     FORMAT_EVENT_DELETED_ORIGINAL,
189     FORMAT_EVENT_DELETED_ORIGINAL_FAILED,
190 
191     // Memory space for histograms is determined by the max.  ALWAYS
192     // ADD NEW VALUES BEFORE THIS ONE.
193     FORMAT_EVENT_MAX
194   };
195 
196   // Helper to record an event related to format conversion from
197   // SQLite to file.
198   static void RecordFormatEvent(FormatEventType event_type);
199 
200   // Some very lucky users have an original-format file still in their
201   // profile.  Check for it and delete, recording a histogram for the
202   // result (no histogram for not-found).  Logically this
203   // would make more sense at the SafeBrowsingDatabase level, but
204   // practically speaking that code doesn't touch files directly.
205   static void CheckForOriginalAndDelete(const FilePath& filename);
206 
207   // Close all files and clear all buffers.
208   bool Close();
209 
210   // Calls |corruption_callback_| if non-NULL, always returns false as
211   // a convenience to the caller.
212   bool OnCorruptDatabase();
213 
214   // Helper for creating a corruption callback for |old_store_|.
215   // TODO(shess): Remove after migration.
216   void HandleCorruptDatabase();
217 
218   // Clear temporary buffers used to accumulate chunk data.
ClearChunkBuffers()219   bool ClearChunkBuffers() {
220     // NOTE: .clear() doesn't release memory.
221     // TODO(shess): Figure out if this is overkill.  Some amount of
222     // pre-reserved space is probably reasonable between each chunk
223     // collected.
224     std::vector<SBAddPrefix>().swap(add_prefixes_);
225     std::vector<SBSubPrefix>().swap(sub_prefixes_);
226     std::vector<SBAddFullHash>().swap(add_hashes_);
227     std::vector<SBSubFullHash>().swap(sub_hashes_);
228     return true;
229   }
230 
231   // Clear all buffers used during update.
ClearUpdateBuffers()232   void ClearUpdateBuffers() {
233     ClearChunkBuffers();
234     chunks_written_ = 0;
235     std::set<int32>().swap(add_chunks_cache_);
236     std::set<int32>().swap(sub_chunks_cache_);
237     base::hash_set<int32>().swap(add_del_cache_);
238     base::hash_set<int32>().swap(sub_del_cache_);
239   }
240 
241   // Buffers for collecting data between BeginChunk() and
242   // FinishChunk().
243   std::vector<SBAddPrefix> add_prefixes_;
244   std::vector<SBSubPrefix> sub_prefixes_;
245   std::vector<SBAddFullHash> add_hashes_;
246   std::vector<SBSubFullHash> sub_hashes_;
247 
248   // Count of chunks collected in |new_file_|.
249   int chunks_written_;
250 
251   // Name of the main database file.
252   FilePath filename_;
253 
254   // Handles to the main and scratch files.  |empty_| is true if the
255   // main file didn't exist when the update was started.
256   file_util::ScopedFILE file_;
257   file_util::ScopedFILE new_file_;
258   bool empty_;
259 
260   // Cache of chunks which have been seen.  Loaded from the database
261   // on BeginUpdate() so that it can be queried during the
262   // transaction.
263   std::set<int32> add_chunks_cache_;
264   std::set<int32> sub_chunks_cache_;
265 
266   // Cache the set of deleted chunks during a transaction, applied on
267   // FinishUpdate().
268   // TODO(shess): If the set is small enough, hash_set<> might be
269   // slower than plain set<>.
270   base::hash_set<int32> add_del_cache_;
271   base::hash_set<int32> sub_del_cache_;
272 
273   scoped_ptr<Callback0::Type> corruption_callback_;
274 
275   // Tracks whether corruption has already been seen in the current
276   // update, so that only one instance is recorded in the stats.
277   // TODO(shess): Remove with format-migration support.
278   bool corruption_seen_;
279 
280   DISALLOW_COPY_AND_ASSIGN(SafeBrowsingStoreFile);
281 };
282 
283 #endif  // CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_
284