// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_
#define CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_
#pragma once

#include <set>
#include <vector>

#include "chrome/browser/safe_browsing/safe_browsing_store.h"

#include "base/callback.h"
#include "base/file_util.h"

// Implement SafeBrowsingStore in terms of a flat file.  The file
// format is pretty literal:
//
// int32 magic;             // magic number "validating" file
// int32 version;           // format version
//
// // Counts for the various data which follows the header.
// uint32 add_chunk_count;  // Chunks seen, including empties.
// uint32 sub_chunk_count;  // Ditto.
// uint32 add_prefix_count;
// uint32 sub_prefix_count;
// uint32 add_hash_count;
// uint32 sub_hash_count;
//
// array[add_chunk_count] {
//   int32 chunk_id;
// }
// array[sub_chunk_count] {
//   int32 chunk_id;
// }
// array[add_prefix_count] {
//   int32 chunk_id;
//   int32 prefix;
// }
// array[sub_prefix_count] {
//   int32 chunk_id;
//   int32 add_chunk_id;
//   int32 add_prefix;
// }
// array[add_hash_count] {
//   int32 chunk_id;
//   int32 received_time;     // From base::Time::ToTimeT().
//   char[32] full_hash;
// }
// array[sub_hash_count] {
//   int32 chunk_id;
//   int32 add_chunk_id;
//   char[32] add_full_hash;
// }
// MD5Digest checksum;      // Checksum over preceding data.
//
// During the course of an update, uncommitted data is stored in a
// temporary file (which is later re-used to commit).  This is an
// array of chunks, with the count kept in memory until the end of the
// transaction.
The format of this file is like the main file, with 61 // the list of chunks seen omitted, as that data is tracked in-memory: 62 // 63 // array[] { 64 // uint32 add_prefix_count; 65 // uint32 sub_prefix_count; 66 // uint32 add_hash_count; 67 // uint32 sub_hash_count; 68 // array[add_prefix_count] { 69 // int32 chunk_id; 70 // int32 prefix; 71 // } 72 // array[sub_prefix_count] { 73 // int32 chunk_id; 74 // int32 add_chunk_id; 75 // int32 add_prefix; 76 // } 77 // array[add_hash_count] { 78 // int32 chunk_id; 79 // int32 received_time; // From base::Time::ToTimeT(). 80 // char[32] full_hash; 81 // } 82 // array[sub_hash_count] { 83 // int32 chunk_id; 84 // int32 add_chunk_id; 85 // char[32] add_full_hash; 86 // } 87 // } 88 // 89 // The overall transaction works like this: 90 // - Open the original file to get the chunks-seen data. 91 // - Open a temp file for storing new chunk info. 92 // - Write new chunks to the temp file. 93 // - When the transaction is finished: 94 // - Read the rest of the original file's data into buffers. 95 // - Rewind the temp file and merge the new data into buffers. 96 // - Process buffers for deletions and apply subs. 97 // - Rewind and write the buffers out to temp file. 98 // - Delete original file. 99 // - Rename temp file to original filename. 100 101 // TODO(shess): By using a checksum, this code can avoid doing an 102 // fsync(), at the possible cost of more frequently retrieving the 103 // full dataset. Measure how often this occurs, and if it occurs too 104 // often, consider retaining the last known-good file for recovery 105 // purposes, rather than deleting it. 106 107 class SafeBrowsingStoreFile : public SafeBrowsingStore { 108 public: 109 SafeBrowsingStoreFile(); 110 virtual ~SafeBrowsingStoreFile(); 111 112 virtual void Init(const FilePath& filename, 113 Callback0::Type* corruption_callback); 114 115 // Delete any on-disk files, including the permanent storage. 
116 virtual bool Delete(); 117 118 // Get all add hash prefixes and full-length hashes, respectively, from 119 // the store. 120 virtual bool GetAddPrefixes(std::vector<SBAddPrefix>* add_prefixes); 121 virtual bool GetAddFullHashes(std::vector<SBAddFullHash>* add_full_hashes); 122 123 virtual bool BeginChunk(); 124 125 virtual bool WriteAddPrefix(int32 chunk_id, SBPrefix prefix); 126 virtual bool WriteAddHash(int32 chunk_id, 127 base::Time receive_time, 128 const SBFullHash& full_hash); 129 virtual bool WriteSubPrefix(int32 chunk_id, 130 int32 add_chunk_id, SBPrefix prefix); 131 virtual bool WriteSubHash(int32 chunk_id, int32 add_chunk_id, 132 const SBFullHash& full_hash); 133 virtual bool FinishChunk(); 134 135 virtual bool BeginUpdate(); 136 // Store updates with pending add full hashes in file store and 137 // return |add_prefixes_result| and |add_full_hashes_result|. 138 virtual bool FinishUpdate(const std::vector<SBAddFullHash>& pending_adds, 139 const std::set<SBPrefix>& prefix_misses, 140 std::vector<SBAddPrefix>* add_prefixes_result, 141 std::vector<SBAddFullHash>* add_full_hashes_result); 142 virtual bool CancelUpdate(); 143 144 virtual void SetAddChunk(int32 chunk_id); 145 virtual bool CheckAddChunk(int32 chunk_id); 146 virtual void GetAddChunks(std::vector<int32>* out); 147 virtual void SetSubChunk(int32 chunk_id); 148 virtual bool CheckSubChunk(int32 chunk_id); 149 virtual void GetSubChunks(std::vector<int32>* out); 150 151 virtual void DeleteAddChunk(int32 chunk_id); 152 virtual void DeleteSubChunk(int32 chunk_id); 153 154 // Returns the name of the temporary file used to buffer data for 155 // |filename|. Exported for unit tests. TemporaryFileForFilename(const FilePath & filename)156 static const FilePath TemporaryFileForFilename(const FilePath& filename) { 157 return FilePath(filename.value() + FILE_PATH_LITERAL("_new")); 158 } 159 160 private: 161 // Update store file with pending full hashes. 
162 virtual bool DoUpdate(const std::vector<SBAddFullHash>& pending_adds, 163 const std::set<SBPrefix>& prefix_misses, 164 std::vector<SBAddPrefix>* add_prefixes_result, 165 std::vector<SBAddFullHash>* add_full_hashes_result); 166 167 // Enumerate different format-change events for histogramming 168 // purposes. DO NOT CHANGE THE ORDERING OF THESE VALUES. 169 // TODO(shess): Remove this once the format change is complete. 170 enum FormatEventType { 171 // Corruption detected, broken down by file format. 172 FORMAT_EVENT_FILE_CORRUPT, 173 FORMAT_EVENT_SQLITE_CORRUPT, // Obsolete 174 175 // The type of format found in the file. The expected case (new 176 // file format) is intentionally not covered. 177 FORMAT_EVENT_FOUND_SQLITE, 178 FORMAT_EVENT_FOUND_UNKNOWN, 179 180 // The number of SQLite-format files deleted should be the same as 181 // FORMAT_EVENT_FOUND_SQLITE. It can differ if the delete fails, 182 // or if a failure prevents the update from succeeding. 183 FORMAT_EVENT_SQLITE_DELETED, // Obsolete 184 FORMAT_EVENT_SQLITE_DELETE_FAILED, // Obsolete 185 186 // Found and deleted (or failed to delete) the ancient "Safe 187 // Browsing" file. 188 FORMAT_EVENT_DELETED_ORIGINAL, 189 FORMAT_EVENT_DELETED_ORIGINAL_FAILED, 190 191 // Memory space for histograms is determined by the max. ALWAYS 192 // ADD NEW VALUES BEFORE THIS ONE. 193 FORMAT_EVENT_MAX 194 }; 195 196 // Helper to record an event related to format conversion from 197 // SQLite to file. 198 static void RecordFormatEvent(FormatEventType event_type); 199 200 // Some very lucky users have an original-format file still in their 201 // profile. Check for it and delete, recording a histogram for the 202 // result (no histogram for not-found). Logically this 203 // would make more sense at the SafeBrowsingDatabase level, but 204 // practically speaking that code doesn't touch files directly. 205 static void CheckForOriginalAndDelete(const FilePath& filename); 206 207 // Close all files and clear all buffers. 
208 bool Close(); 209 210 // Calls |corruption_callback_| if non-NULL, always returns false as 211 // a convenience to the caller. 212 bool OnCorruptDatabase(); 213 214 // Helper for creating a corruption callback for |old_store_|. 215 // TODO(shess): Remove after migration. 216 void HandleCorruptDatabase(); 217 218 // Clear temporary buffers used to accumulate chunk data. ClearChunkBuffers()219 bool ClearChunkBuffers() { 220 // NOTE: .clear() doesn't release memory. 221 // TODO(shess): Figure out if this is overkill. Some amount of 222 // pre-reserved space is probably reasonable between each chunk 223 // collected. 224 std::vector<SBAddPrefix>().swap(add_prefixes_); 225 std::vector<SBSubPrefix>().swap(sub_prefixes_); 226 std::vector<SBAddFullHash>().swap(add_hashes_); 227 std::vector<SBSubFullHash>().swap(sub_hashes_); 228 return true; 229 } 230 231 // Clear all buffers used during update. ClearUpdateBuffers()232 void ClearUpdateBuffers() { 233 ClearChunkBuffers(); 234 chunks_written_ = 0; 235 std::set<int32>().swap(add_chunks_cache_); 236 std::set<int32>().swap(sub_chunks_cache_); 237 base::hash_set<int32>().swap(add_del_cache_); 238 base::hash_set<int32>().swap(sub_del_cache_); 239 } 240 241 // Buffers for collecting data between BeginChunk() and 242 // FinishChunk(). 243 std::vector<SBAddPrefix> add_prefixes_; 244 std::vector<SBSubPrefix> sub_prefixes_; 245 std::vector<SBAddFullHash> add_hashes_; 246 std::vector<SBSubFullHash> sub_hashes_; 247 248 // Count of chunks collected in |new_file_|. 249 int chunks_written_; 250 251 // Name of the main database file. 252 FilePath filename_; 253 254 // Handles to the main and scratch files. |empty_| is true if the 255 // main file didn't exist when the update was started. 256 file_util::ScopedFILE file_; 257 file_util::ScopedFILE new_file_; 258 bool empty_; 259 260 // Cache of chunks which have been seen. Loaded from the database 261 // on BeginUpdate() so that it can be queried during the 262 // transaction. 
263 std::set<int32> add_chunks_cache_; 264 std::set<int32> sub_chunks_cache_; 265 266 // Cache the set of deleted chunks during a transaction, applied on 267 // FinishUpdate(). 268 // TODO(shess): If the set is small enough, hash_set<> might be 269 // slower than plain set<>. 270 base::hash_set<int32> add_del_cache_; 271 base::hash_set<int32> sub_del_cache_; 272 273 scoped_ptr<Callback0::Type> corruption_callback_; 274 275 // Tracks whether corruption has already been seen in the current 276 // update, so that only one instance is recorded in the stats. 277 // TODO(shess): Remove with format-migration support. 278 bool corruption_seen_; 279 280 DISALLOW_COPY_AND_ASSIGN(SafeBrowsingStoreFile); 281 }; 282 283 #endif // CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_ 284