1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/browser/safe_browsing/safe_browsing_database.h"
6
7 #include <algorithm>
8 #include <iterator>
9
10 #include "base/file_util.h"
11 #include "base/metrics/histogram.h"
12 #include "base/metrics/stats_counters.h"
13 #include "base/time.h"
14 #include "base/message_loop.h"
15 #include "base/process_util.h"
16 #include "crypto/sha2.h"
17 #include "chrome/browser/safe_browsing/bloom_filter.h"
18 #include "chrome/browser/safe_browsing/prefix_set.h"
19 #include "chrome/browser/safe_browsing/safe_browsing_store_file.h"
20 #include "content/browser/browser_thread.h"
21 #include "googleurl/src/gurl.h"
22
23 namespace {
24
25 // Filename suffix for the bloom filter.
26 const FilePath::CharType kBloomFilterFile[] = FILE_PATH_LITERAL(" Filter 2");
27 // Filename suffix for download store.
28 const FilePath::CharType kDownloadDBFile[] = FILE_PATH_LITERAL(" Download");
29 // Filename suffix for client-side phishing detection whitelist store.
30 const FilePath::CharType kCsdWhitelistDBFile[] =
31 FILE_PATH_LITERAL(" Csd Whitelist");
32 // Filename suffix for browse store.
33 // TODO(lzheng): change to a better name when we change the file format.
34 const FilePath::CharType kBrowseDBFile[] = FILE_PATH_LITERAL(" Bloom");
35
// A cached entry older than this many minutes is considered stale.
const int kMaxStalenessMinutes = 45;

// Upper bound on the number of entries accepted in the client-side
// phishing detection whitelist.  If the whitelist on disk contains more
// entries than this, ContainsCsdWhitelistedUrl will always return true.
const size_t kMaxCsdWhitelistSize = 5000;

// Kill switch: if the hash of this exact expression is on the csd
// whitelist then ContainsCsdWhitelistedUrl will always return true.
const char kCsdKillSwitchUrl[] =
    "sb-ssl.google.com/safebrowsing/csd/killswitch";
48
49 // To save space, the incoming |chunk_id| and |list_id| are combined
50 // into an |encoded_chunk_id| for storage by shifting the |list_id|
51 // into the low-order bits. These functions decode that information.
52 // TODO(lzheng): It was reasonable when database is saved in sqlite, but
53 // there should be better ways to save chunk_id and list_id after we use
54 // SafeBrowsingStoreFile.
// Returns the lowest bit of |encoded_chunk_id|, which is where
// EncodeChunkId() stores the list id (modulo 2).
int GetListIdBit(const int encoded_chunk_id) {
  const int kListIdMask = 1;
  return encoded_chunk_id & kListIdMask;
}
// Returns the chunk number stored in |encoded_chunk_id| by shifting out
// the low-order list-id bit.
int DecodeChunkId(int encoded_chunk_id) {
  const int chunk_id = encoded_chunk_id >> 1;
  return chunk_id;
}
EncodeChunkId(const int chunk,const int list_id)61 int EncodeChunkId(const int chunk, const int list_id) {
62 DCHECK_NE(list_id, safe_browsing_util::INVALID);
63 return chunk << 1 | list_id % 2;
64 }
65
66 // Generate the set of full hashes to check for |url|. If
67 // |include_whitelist_hashes| is true we will generate additional path-prefixes
68 // to match against the csd whitelist. E.g., if the path-prefix /foo is on the
69 // whitelist it should also match /foo/bar which is not the case for all the
70 // other lists.
71 // TODO(shess): This function is almost the same as
72 // |CompareFullHashes()| in safe_browsing_util.cc, except that code
73 // does an early exit on match. Since match should be the infrequent
74 // case (phishing or malware found), consider combining this function
75 // with that one.
BrowseFullHashesToCheck(const GURL & url,bool include_whitelist_hashes,std::vector<SBFullHash> * full_hashes)76 void BrowseFullHashesToCheck(const GURL& url,
77 bool include_whitelist_hashes,
78 std::vector<SBFullHash>* full_hashes) {
79 std::vector<std::string> hosts;
80 if (url.HostIsIPAddress()) {
81 hosts.push_back(url.host());
82 } else {
83 safe_browsing_util::GenerateHostsToCheck(url, &hosts);
84 }
85
86 std::vector<std::string> paths;
87 safe_browsing_util::GeneratePathsToCheck(url, &paths);
88
89 for (size_t i = 0; i < hosts.size(); ++i) {
90 for (size_t j = 0; j < paths.size(); ++j) {
91 const std::string& path = paths[j];
92 SBFullHash full_hash;
93 crypto::SHA256HashString(hosts[i] + path, &full_hash,
94 sizeof(full_hash));
95 full_hashes->push_back(full_hash);
96
97 // We may have /foo as path-prefix in the whitelist which should
98 // also match with /foo/bar and /foo?bar. Hence, for every path
99 // that ends in '/' we also add the path without the slash.
100 if (include_whitelist_hashes &&
101 path.size() > 1 &&
102 path[path.size() - 1] == '/') {
103 crypto::SHA256HashString(hosts[i] + path.substr(0, path.size() - 1),
104 &full_hash, sizeof(full_hash));
105 full_hashes->push_back(full_hash);
106 }
107 }
108 }
109 }
110
111 // Get the prefixes matching the download |urls|.
GetDownloadUrlPrefixes(const std::vector<GURL> & urls,std::vector<SBPrefix> * prefixes)112 void GetDownloadUrlPrefixes(const std::vector<GURL>& urls,
113 std::vector<SBPrefix>* prefixes) {
114 std::vector<SBFullHash> full_hashes;
115 for (size_t i = 0; i < urls.size(); ++i)
116 BrowseFullHashesToCheck(urls[i], false, &full_hashes);
117
118 for (size_t i = 0; i < full_hashes.size(); ++i)
119 prefixes->push_back(full_hashes[i].prefix);
120 }
121
122 // Find the entries in |full_hashes| with prefix in |prefix_hits|, and
123 // add them to |full_hits| if not expired. "Not expired" is when
124 // either |last_update| was recent enough, or the item has been
125 // received recently enough. Expired items are not deleted because a
126 // future update may make them acceptable again.
127 //
128 // For efficiency reasons the code walks |prefix_hits| and
129 // |full_hashes| in parallel, so they must be sorted by prefix.
GetCachedFullHashesForBrowse(const std::vector<SBPrefix> & prefix_hits,const std::vector<SBAddFullHash> & full_hashes,std::vector<SBFullHashResult> * full_hits,base::Time last_update)130 void GetCachedFullHashesForBrowse(const std::vector<SBPrefix>& prefix_hits,
131 const std::vector<SBAddFullHash>& full_hashes,
132 std::vector<SBFullHashResult>* full_hits,
133 base::Time last_update) {
134 const base::Time expire_time =
135 base::Time::Now() - base::TimeDelta::FromMinutes(kMaxStalenessMinutes);
136
137 std::vector<SBPrefix>::const_iterator piter = prefix_hits.begin();
138 std::vector<SBAddFullHash>::const_iterator hiter = full_hashes.begin();
139
140 while (piter != prefix_hits.end() && hiter != full_hashes.end()) {
141 if (*piter < hiter->full_hash.prefix) {
142 ++piter;
143 } else if (hiter->full_hash.prefix < *piter) {
144 ++hiter;
145 } else {
146 if (expire_time < last_update ||
147 expire_time.ToTimeT() < hiter->received) {
148 SBFullHashResult result;
149 const int list_bit = GetListIdBit(hiter->chunk_id);
150 DCHECK(list_bit == safe_browsing_util::MALWARE ||
151 list_bit == safe_browsing_util::PHISH);
152 if (!safe_browsing_util::GetListName(list_bit, &result.list_name))
153 continue;
154 result.add_chunk_id = DecodeChunkId(hiter->chunk_id);
155 result.hash = hiter->full_hash;
156 full_hits->push_back(result);
157 }
158
159 // Only increment |hiter|, |piter| might have multiple hits.
160 ++hiter;
161 }
162 }
163 }
164
165 // This function generates a chunk range string for |chunks|. It
166 // outputs one chunk range string per list and writes it to the
167 // |list_ranges| vector. We expect |list_ranges| to already be of the
168 // right size. E.g., if |chunks| contains chunks with two different
169 // list ids then |list_ranges| must contain two elements.
GetChunkRanges(const std::vector<int> & chunks,std::vector<std::string> * list_ranges)170 void GetChunkRanges(const std::vector<int>& chunks,
171 std::vector<std::string>* list_ranges) {
172 DCHECK_GT(list_ranges->size(), 0U);
173 DCHECK_LE(list_ranges->size(), 2U);
174 std::vector<std::vector<int> > decoded_chunks(list_ranges->size());
175 for (std::vector<int>::const_iterator iter = chunks.begin();
176 iter != chunks.end(); ++iter) {
177 int mod_list_id = GetListIdBit(*iter);
178 DCHECK_GE(mod_list_id, 0);
179 DCHECK_LT(static_cast<size_t>(mod_list_id), decoded_chunks.size());
180 decoded_chunks[mod_list_id].push_back(DecodeChunkId(*iter));
181 }
182 for (size_t i = 0; i < decoded_chunks.size(); ++i) {
183 ChunksToRangeString(decoded_chunks[i], &((*list_ranges)[i]));
184 }
185 }
186
187 // Helper function to create chunk range lists for Browse related
188 // lists.
UpdateChunkRanges(SafeBrowsingStore * store,const std::vector<std::string> & listnames,std::vector<SBListChunkRanges> * lists)189 void UpdateChunkRanges(SafeBrowsingStore* store,
190 const std::vector<std::string>& listnames,
191 std::vector<SBListChunkRanges>* lists) {
192 DCHECK_GT(listnames.size(), 0U);
193 DCHECK_LE(listnames.size(), 2U);
194 std::vector<int> add_chunks;
195 std::vector<int> sub_chunks;
196 store->GetAddChunks(&add_chunks);
197 store->GetSubChunks(&sub_chunks);
198
199 std::vector<std::string> adds(listnames.size());
200 std::vector<std::string> subs(listnames.size());
201 GetChunkRanges(add_chunks, &adds);
202 GetChunkRanges(sub_chunks, &subs);
203
204 for (size_t i = 0; i < listnames.size(); ++i) {
205 const std::string& listname = listnames[i];
206 DCHECK_EQ(safe_browsing_util::GetListId(listname) % 2,
207 static_cast<int>(i % 2));
208 DCHECK_NE(safe_browsing_util::GetListId(listname),
209 safe_browsing_util::INVALID);
210 lists->push_back(SBListChunkRanges(listname));
211 lists->back().adds.swap(adds[i]);
212 lists->back().subs.swap(subs[i]);
213 }
214 }
215
216 // Order |SBAddFullHash| on the prefix part. |SBAddPrefixLess()| from
217 // safe_browsing_store.h orders on both chunk-id and prefix.
SBAddFullHashPrefixLess(const SBAddFullHash & a,const SBAddFullHash & b)218 bool SBAddFullHashPrefixLess(const SBAddFullHash& a, const SBAddFullHash& b) {
219 return a.full_hash.prefix < b.full_hash.prefix;
220 }
221
// Events logged to the "SB2.PrefixSetEvent" histogram to verify how the
// PrefixSet behaves relative to the bloom filter.  The expectations are:
//  - Any bloom filter miss should be a prefix set miss.
//  - Any prefix set hit should be a bloom filter hit.
//  - Bloom filter false positives are prefix set misses.
// Values are spelled out explicitly; they match the implicit numbering.
enum PrefixSetEvent {
  PREFIX_SET_EVENT_HIT = 0,
  PREFIX_SET_EVENT_BLOOM_HIT = 1,
  PREFIX_SET_EVENT_BLOOM_MISS_PREFIX_HIT = 2,
  PREFIX_SET_EVENT_BLOOM_MISS_PREFIX_HIT_INVALID = 3,
  PREFIX_SET_GETPREFIXES_BROKEN = 4,
  PREFIX_SET_GETPREFIXES_BROKEN_SIZE = 5,
  PREFIX_SET_GETPREFIXES_FIRST_BROKEN = 6,
  PREFIX_SET_SBPREFIX_WAS_BROKEN = 7,
  PREFIX_SET_GETPREFIXES_BROKEN_SORTING = 8,
  PREFIX_SET_GETPREFIXES_BROKEN_DUPLICATION = 9,
  PREFIX_SET_GETPREFIX_UNSORTED_IS_DELTA = 10,
  PREFIX_SET_GETPREFIX_UNSORTED_IS_INDEX = 11,
  PREFIX_SET_GETPREFIX_CHECKSUM_MISMATCH = 12,

  // Memory space for histograms is determined by the max.  ALWAYS ADD
  // NEW VALUES BEFORE THIS ONE.  Left implicit so it tracks the last
  // value above.
  PREFIX_SET_EVENT_MAX
};
247
RecordPrefixSetInfo(PrefixSetEvent event_type)248 void RecordPrefixSetInfo(PrefixSetEvent event_type) {
249 UMA_HISTOGRAM_ENUMERATION("SB2.PrefixSetEvent", event_type,
250 PREFIX_SET_EVENT_MAX);
251 }
252
253 // Generate a |PrefixSet| instance from the contents of
254 // |add_prefixes|. Additionally performs various checks to make sure
255 // that the resulting prefix set is valid, so that the
256 // PREFIX_SET_EVENT_BLOOM_MISS_PREFIX_HIT_INVALID histogram in
257 // ContainsBrowseUrl() can be trustworthy.
PrefixSetFromAddPrefixes(const std::vector<SBAddPrefix> & add_prefixes)258 safe_browsing::PrefixSet* PrefixSetFromAddPrefixes(
259 const std::vector<SBAddPrefix>& add_prefixes) {
260 // TODO(shess): If |add_prefixes| were sorted by the prefix, it
261 // could be passed directly to |PrefixSet()|, removing the need for
262 // |prefixes|. For now, |prefixes| is useful while debugging
263 // things.
264 std::vector<SBPrefix> prefixes;
265 for (size_t i = 0; i < add_prefixes.size(); ++i) {
266 prefixes.push_back(add_prefixes[i].prefix);
267 }
268
269 std::sort(prefixes.begin(), prefixes.end());
270 prefixes.erase(std::unique(prefixes.begin(), prefixes.end()),
271 prefixes.end());
272
273 scoped_ptr<safe_browsing::PrefixSet>
274 prefix_set(new safe_browsing::PrefixSet(prefixes));
275
276 std::vector<SBPrefix> restored;
277 prefix_set->GetPrefixes(&restored);
278
279 // Expect them to be equal.
280 if (restored.size() == prefixes.size() &&
281 std::equal(prefixes.begin(), prefixes.end(), restored.begin()))
282 return prefix_set.release();
283
284 // Log BROKEN for continuity with previous release, and SIZE to
285 // distinguish which test failed.
286 NOTREACHED();
287 RecordPrefixSetInfo(PREFIX_SET_GETPREFIXES_BROKEN);
288 if (restored.size() != prefixes.size())
289 RecordPrefixSetInfo(PREFIX_SET_GETPREFIXES_BROKEN_SIZE);
290
291 // Try to distinguish between updates from one broken user and a
292 // distributed problem.
293 static bool logged_broken = false;
294 if (!logged_broken) {
295 RecordPrefixSetInfo(PREFIX_SET_GETPREFIXES_FIRST_BROKEN);
296 logged_broken = true;
297 }
298
299 // This seems so very very unlikely. But if it ever were true, then
300 // it could explain why GetPrefixes() seemed broken.
301 if (sizeof(int) != sizeof(int32))
302 RecordPrefixSetInfo(PREFIX_SET_SBPREFIX_WAS_BROKEN);
303
304 // Check if memory was corrupted during construction.
305 if (!prefix_set->CheckChecksum())
306 RecordPrefixSetInfo(PREFIX_SET_GETPREFIX_CHECKSUM_MISMATCH);
307
308 // Check whether |restored| is unsorted, or has duplication.
309 if (restored.size()) {
310 size_t unsorted_count = 0;
311 bool duplicates = false;
312 SBPrefix prev = restored[0];
313 for (size_t i = 0; i < restored.size(); prev = restored[i], ++i) {
314 if (prev > restored[i]) {
315 unsorted_count++;
316 UMA_HISTOGRAM_COUNTS("SB2.PrefixSetUnsortedDifference",
317 prev - restored[i]);
318
319 // When unsorted, how big is the set, and how far are we into
320 // it. If the set is very small or large, that might inform
321 // pursuit of a degenerate case. If the percentage is close
322 // to 0%, 100%, or 50%, then there might be an interesting
323 // degenerate case to explore.
324 UMA_HISTOGRAM_COUNTS("SB2.PrefixSetUnsortedSize", restored.size());
325 UMA_HISTOGRAM_PERCENTAGE("SB2.PrefixSetUnsortedPercent",
326 i * 100 / restored.size());
327
328 if (prefix_set->IsDeltaAt(i)) {
329 RecordPrefixSetInfo(PREFIX_SET_GETPREFIX_UNSORTED_IS_DELTA);
330
331 // Histograms require memory on the order of the number of
332 // buckets, making high-precision logging expensive. For
333 // now aim for a sense of the range of the problem.
334 UMA_HISTOGRAM_CUSTOM_COUNTS("SB2.PrefixSetUnsortedDelta",
335 prefix_set->DeltaAt(i), 1, 0xFFFF, 50);
336 } else {
337 RecordPrefixSetInfo(PREFIX_SET_GETPREFIX_UNSORTED_IS_INDEX);
338 }
339 }
340 if (prev == restored[i])
341 duplicates = true;
342 }
343
344 // Record findings.
345 if (unsorted_count) {
346 RecordPrefixSetInfo(PREFIX_SET_GETPREFIXES_BROKEN_SORTING);
347 UMA_HISTOGRAM_COUNTS_100("SB2.PrefixSetUnsorted", unsorted_count);
348 }
349 if (duplicates)
350 RecordPrefixSetInfo(PREFIX_SET_GETPREFIXES_BROKEN_DUPLICATION);
351
352 // Fix the problems noted. If |restored| was unsorted, then
353 // |duplicates| may give a false negative.
354 if (unsorted_count)
355 std::sort(restored.begin(), restored.end());
356 if (unsorted_count || duplicates)
357 restored.erase(std::unique(restored.begin(), restored.end()),
358 restored.end());
359 }
360
361 // NOTE(shess): The following could be done using a single
362 // uber-loop, but it's complicated by needing multiple parallel
363 // iterators. Didn't seem worthwhile for something that will only
364 // live for a short period and only fires for one in a million
365 // updates.
366
367 // Find elements in |restored| which are not in |prefixes|.
368 std::vector<SBPrefix> difference;
369 std::set_difference(restored.begin(), restored.end(),
370 prefixes.begin(), prefixes.end(),
371 std::back_inserter(difference));
372 if (difference.size())
373 UMA_HISTOGRAM_COUNTS_100("SB2.PrefixSetRestoredExcess", difference.size());
374
375 // Find elements in |prefixes| which are not in |restored|.
376 difference.clear();
377 std::set_difference(prefixes.begin(), prefixes.end(),
378 restored.begin(), restored.end(),
379 std::back_inserter(difference));
380 if (difference.size())
381 UMA_HISTOGRAM_COUNTS_100("SB2.PrefixSetRestoredShortfall",
382 difference.size());
383
384 return prefix_set.release();
385 }
386
387 } // namespace
388
389 // The default SafeBrowsingDatabaseFactory.
390 class SafeBrowsingDatabaseFactoryImpl : public SafeBrowsingDatabaseFactory {
391 public:
CreateSafeBrowsingDatabase(bool enable_download_protection,bool enable_client_side_whitelist)392 virtual SafeBrowsingDatabase* CreateSafeBrowsingDatabase(
393 bool enable_download_protection,
394 bool enable_client_side_whitelist) {
395 return new SafeBrowsingDatabaseNew(
396 new SafeBrowsingStoreFile,
397 enable_download_protection ? new SafeBrowsingStoreFile : NULL,
398 enable_client_side_whitelist ? new SafeBrowsingStoreFile : NULL);
399 }
400
SafeBrowsingDatabaseFactoryImpl()401 SafeBrowsingDatabaseFactoryImpl() { }
402
403 private:
404 DISALLOW_COPY_AND_ASSIGN(SafeBrowsingDatabaseFactoryImpl);
405 };
406
407 // static
408 SafeBrowsingDatabaseFactory* SafeBrowsingDatabase::factory_ = NULL;
409
410 // Factory method, non-thread safe. Caller has to make sure this s called
411 // on SafeBrowsing Thread.
412 // TODO(shess): There's no need for a factory any longer. Convert
413 // SafeBrowsingDatabaseNew to SafeBrowsingDatabase, and have Create()
414 // callers just construct things directly.
Create(bool enable_download_protection,bool enable_client_side_whitelist)415 SafeBrowsingDatabase* SafeBrowsingDatabase::Create(
416 bool enable_download_protection,
417 bool enable_client_side_whitelist) {
418 if (!factory_)
419 factory_ = new SafeBrowsingDatabaseFactoryImpl();
420 return factory_->CreateSafeBrowsingDatabase(enable_download_protection,
421 enable_client_side_whitelist);
422 }
423
~SafeBrowsingDatabase()424 SafeBrowsingDatabase::~SafeBrowsingDatabase() {
425 }
426
427 // static
BrowseDBFilename(const FilePath & db_base_filename)428 FilePath SafeBrowsingDatabase::BrowseDBFilename(
429 const FilePath& db_base_filename) {
430 return FilePath(db_base_filename.value() + kBrowseDBFile);
431 }
432
433 // static
DownloadDBFilename(const FilePath & db_base_filename)434 FilePath SafeBrowsingDatabase::DownloadDBFilename(
435 const FilePath& db_base_filename) {
436 return FilePath(db_base_filename.value() + kDownloadDBFile);
437 }
438
439 // static
BloomFilterForFilename(const FilePath & db_filename)440 FilePath SafeBrowsingDatabase::BloomFilterForFilename(
441 const FilePath& db_filename) {
442 return FilePath(db_filename.value() + kBloomFilterFile);
443 }
444
445 // static
CsdWhitelistDBFilename(const FilePath & db_filename)446 FilePath SafeBrowsingDatabase::CsdWhitelistDBFilename(
447 const FilePath& db_filename) {
448 return FilePath(db_filename.value() + kCsdWhitelistDBFile);
449 }
450
GetStore(const int list_id)451 SafeBrowsingStore* SafeBrowsingDatabaseNew::GetStore(const int list_id) {
452 DVLOG(3) << "Get store for list: " << list_id;
453 if (list_id == safe_browsing_util::PHISH ||
454 list_id == safe_browsing_util::MALWARE) {
455 return browse_store_.get();
456 } else if (list_id == safe_browsing_util::BINURL ||
457 list_id == safe_browsing_util::BINHASH) {
458 return download_store_.get();
459 } else if (list_id == safe_browsing_util::CSDWHITELIST) {
460 return csd_whitelist_store_.get();
461 }
462 return NULL;
463 }
464
465 // static
RecordFailure(FailureType failure_type)466 void SafeBrowsingDatabase::RecordFailure(FailureType failure_type) {
467 UMA_HISTOGRAM_ENUMERATION("SB2.DatabaseFailure", failure_type,
468 FAILURE_DATABASE_MAX);
469 }
470
SafeBrowsingDatabaseNew()471 SafeBrowsingDatabaseNew::SafeBrowsingDatabaseNew()
472 : creation_loop_(MessageLoop::current()),
473 browse_store_(new SafeBrowsingStoreFile),
474 download_store_(NULL),
475 csd_whitelist_store_(NULL),
476 ALLOW_THIS_IN_INITIALIZER_LIST(reset_factory_(this)) {
477 DCHECK(browse_store_.get());
478 DCHECK(!download_store_.get());
479 DCHECK(!csd_whitelist_store_.get());
480 }
481
SafeBrowsingDatabaseNew(SafeBrowsingStore * browse_store,SafeBrowsingStore * download_store,SafeBrowsingStore * csd_whitelist_store)482 SafeBrowsingDatabaseNew::SafeBrowsingDatabaseNew(
483 SafeBrowsingStore* browse_store,
484 SafeBrowsingStore* download_store,
485 SafeBrowsingStore* csd_whitelist_store)
486 : creation_loop_(MessageLoop::current()),
487 browse_store_(browse_store),
488 download_store_(download_store),
489 csd_whitelist_store_(csd_whitelist_store),
490 ALLOW_THIS_IN_INITIALIZER_LIST(reset_factory_(this)),
491 corruption_detected_(false) {
492 DCHECK(browse_store_.get());
493 }
494
~SafeBrowsingDatabaseNew()495 SafeBrowsingDatabaseNew::~SafeBrowsingDatabaseNew() {
496 DCHECK_EQ(creation_loop_, MessageLoop::current());
497 }
498
Init(const FilePath & filename_base)499 void SafeBrowsingDatabaseNew::Init(const FilePath& filename_base) {
500 DCHECK_EQ(creation_loop_, MessageLoop::current());
501 // Ensure we haven't been run before.
502 DCHECK(browse_filename_.empty());
503 DCHECK(download_filename_.empty());
504 DCHECK(csd_whitelist_filename_.empty());
505
506 browse_filename_ = BrowseDBFilename(filename_base);
507 bloom_filter_filename_ = BloomFilterForFilename(browse_filename_);
508
509 browse_store_->Init(
510 browse_filename_,
511 NewCallback(this, &SafeBrowsingDatabaseNew::HandleCorruptDatabase));
512 DVLOG(1) << "Init browse store: " << browse_filename_.value();
513
514 {
515 // NOTE: There is no need to grab the lock in this function, since
516 // until it returns, there are no pointers to this class on other
517 // threads. Then again, that means there is no possibility of
518 // contention on the lock...
519 base::AutoLock locked(lookup_lock_);
520 full_browse_hashes_.clear();
521 pending_browse_hashes_.clear();
522 LoadBloomFilter();
523 }
524
525 if (download_store_.get()) {
526 download_filename_ = DownloadDBFilename(filename_base);
527 download_store_->Init(
528 download_filename_,
529 NewCallback(this, &SafeBrowsingDatabaseNew::HandleCorruptDatabase));
530 DVLOG(1) << "Init download store: " << download_filename_.value();
531 }
532
533 if (csd_whitelist_store_.get()) {
534 csd_whitelist_filename_ = CsdWhitelistDBFilename(filename_base);
535 csd_whitelist_store_->Init(
536 csd_whitelist_filename_,
537 NewCallback(this, &SafeBrowsingDatabaseNew::HandleCorruptDatabase));
538 DVLOG(1) << "Init csd whitelist store: " << csd_whitelist_filename_.value();
539 std::vector<SBAddFullHash> full_hashes;
540 if (csd_whitelist_store_->GetAddFullHashes(&full_hashes)) {
541 LoadCsdWhitelist(full_hashes);
542 } else {
543 CsdWhitelistAllUrls();
544 }
545 } else {
546 CsdWhitelistAllUrls(); // Just to be safe.
547 }
548 }
549
ResetDatabase()550 bool SafeBrowsingDatabaseNew::ResetDatabase() {
551 DCHECK_EQ(creation_loop_, MessageLoop::current());
552
553 // Delete files on disk.
554 // TODO(shess): Hard to see where one might want to delete without a
555 // reset. Perhaps inline |Delete()|?
556 if (!Delete())
557 return false;
558
559 // Reset objects in memory.
560 {
561 base::AutoLock locked(lookup_lock_);
562 full_browse_hashes_.clear();
563 pending_browse_hashes_.clear();
564 prefix_miss_cache_.clear();
565 // TODO(shess): This could probably be |bloom_filter_.reset()|.
566 browse_bloom_filter_ = new BloomFilter(BloomFilter::kBloomFilterMinSize *
567 BloomFilter::kBloomFilterSizeRatio);
568 // TODO(shess): It is simpler for the code to assume that presence
569 // of a bloom filter always implies presence of a prefix set.
570 prefix_set_.reset(new safe_browsing::PrefixSet(std::vector<SBPrefix>()));
571 }
572 // Wants to acquire the lock itself.
573 CsdWhitelistAllUrls();
574
575 return true;
576 }
577
578 // TODO(lzheng): Remove matching_list, it is not used anywhere.
ContainsBrowseUrl(const GURL & url,std::string * matching_list,std::vector<SBPrefix> * prefix_hits,std::vector<SBFullHashResult> * full_hits,base::Time last_update)579 bool SafeBrowsingDatabaseNew::ContainsBrowseUrl(
580 const GURL& url,
581 std::string* matching_list,
582 std::vector<SBPrefix>* prefix_hits,
583 std::vector<SBFullHashResult>* full_hits,
584 base::Time last_update) {
585 // Clear the results first.
586 matching_list->clear();
587 prefix_hits->clear();
588 full_hits->clear();
589
590 std::vector<SBFullHash> full_hashes;
591 BrowseFullHashesToCheck(url, false, &full_hashes);
592 if (full_hashes.empty())
593 return false;
594
595 // This function is called on the I/O thread, prevent changes to
596 // bloom filter and caches.
597 base::AutoLock locked(lookup_lock_);
598
599 if (!browse_bloom_filter_.get())
600 return false;
601 DCHECK(prefix_set_.get());
602
603 // Used to double-check in case of a hit mis-match.
604 std::vector<SBPrefix> restored;
605
606 size_t miss_count = 0;
607 for (size_t i = 0; i < full_hashes.size(); ++i) {
608 bool found = prefix_set_->Exists(full_hashes[i].prefix);
609
610 if (browse_bloom_filter_->Exists(full_hashes[i].prefix)) {
611 RecordPrefixSetInfo(PREFIX_SET_EVENT_BLOOM_HIT);
612 if (found)
613 RecordPrefixSetInfo(PREFIX_SET_EVENT_HIT);
614 prefix_hits->push_back(full_hashes[i].prefix);
615 if (prefix_miss_cache_.count(full_hashes[i].prefix) > 0)
616 ++miss_count;
617 } else {
618 // Bloom filter misses should never be in prefix set. Re-create
619 // the original prefixes and manually search for it, to check if
620 // there's a bug with how |Exists()| is implemented.
621 // |UpdateBrowseStore()| previously verified that
622 // |GetPrefixes()| returns the same prefixes as were passed to
623 // the constructor.
624 DCHECK(!found);
625 if (found) {
626 if (restored.empty())
627 prefix_set_->GetPrefixes(&restored);
628
629 // If the item is not in the re-created list, then there is an
630 // error in |PrefixSet::Exists()|. If the item is in the
631 // re-created list, then the bloom filter was wrong.
632 if (std::binary_search(restored.begin(), restored.end(),
633 full_hashes[i].prefix)) {
634 RecordPrefixSetInfo(PREFIX_SET_EVENT_BLOOM_MISS_PREFIX_HIT);
635 } else {
636 RecordPrefixSetInfo(PREFIX_SET_EVENT_BLOOM_MISS_PREFIX_HIT_INVALID);
637 }
638 }
639 }
640 }
641
642 // If all the prefixes are cached as 'misses', don't issue a GetHash.
643 if (miss_count == prefix_hits->size())
644 return false;
645
646 // Find the matching full-hash results. |full_browse_hashes_| are from the
647 // database, |pending_browse_hashes_| are from GetHash requests between
648 // updates.
649 std::sort(prefix_hits->begin(), prefix_hits->end());
650
651 GetCachedFullHashesForBrowse(*prefix_hits, full_browse_hashes_,
652 full_hits, last_update);
653 GetCachedFullHashesForBrowse(*prefix_hits, pending_browse_hashes_,
654 full_hits, last_update);
655 return true;
656 }
657
MatchDownloadAddPrefixes(int list_bit,const std::vector<SBPrefix> & prefixes,std::vector<SBPrefix> * prefix_hits)658 bool SafeBrowsingDatabaseNew::MatchDownloadAddPrefixes(
659 int list_bit,
660 const std::vector<SBPrefix>& prefixes,
661 std::vector<SBPrefix>* prefix_hits) {
662 prefix_hits->clear();
663
664 std::vector<SBAddPrefix> add_prefixes;
665 download_store_->GetAddPrefixes(&add_prefixes);
666 for (size_t i = 0; i < add_prefixes.size(); ++i) {
667 for (size_t j = 0; j < prefixes.size(); ++j) {
668 const SBPrefix& prefix = prefixes[j];
669 if (prefix == add_prefixes[i].prefix &&
670 GetListIdBit(add_prefixes[i].chunk_id) == list_bit) {
671 prefix_hits->push_back(prefix);
672 }
673 }
674 }
675 return !prefix_hits->empty();
676 }
677
ContainsDownloadUrl(const std::vector<GURL> & urls,std::vector<SBPrefix> * prefix_hits)678 bool SafeBrowsingDatabaseNew::ContainsDownloadUrl(
679 const std::vector<GURL>& urls,
680 std::vector<SBPrefix>* prefix_hits) {
681 DCHECK_EQ(creation_loop_, MessageLoop::current());
682
683 // Ignore this check when download checking is not enabled.
684 if (!download_store_.get())
685 return false;
686
687 std::vector<SBPrefix> prefixes;
688 GetDownloadUrlPrefixes(urls, &prefixes);
689 return MatchDownloadAddPrefixes(safe_browsing_util::BINURL % 2,
690 prefixes,
691 prefix_hits);
692 }
693
ContainsDownloadHashPrefix(const SBPrefix & prefix)694 bool SafeBrowsingDatabaseNew::ContainsDownloadHashPrefix(
695 const SBPrefix& prefix) {
696 DCHECK_EQ(creation_loop_, MessageLoop::current());
697
698 // Ignore this check when download store is not available.
699 if (!download_store_.get())
700 return false;
701
702 std::vector<SBPrefix> prefixes(1, prefix);
703 std::vector<SBPrefix> prefix_hits;
704 return MatchDownloadAddPrefixes(safe_browsing_util::BINHASH % 2,
705 prefixes,
706 &prefix_hits);
707 }
708
ContainsCsdWhitelistedUrl(const GURL & url)709 bool SafeBrowsingDatabaseNew::ContainsCsdWhitelistedUrl(const GURL& url) {
710 // This method is theoretically thread-safe but we expect all calls to
711 // originate from the IO thread.
712 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
713 base::AutoLock l(lookup_lock_);
714 if (csd_whitelist_all_urls_)
715 return true;
716
717 std::vector<SBFullHash> full_hashes;
718 BrowseFullHashesToCheck(url, true, &full_hashes);
719 for (std::vector<SBFullHash>::const_iterator it = full_hashes.begin();
720 it != full_hashes.end(); ++it) {
721 if (std::binary_search(csd_whitelist_.begin(), csd_whitelist_.end(), *it))
722 return true;
723 }
724 return false;
725 }
726
727 // Helper to insert entries for all of the prefixes or full hashes in
728 // |entry| into the store.
InsertAdd(int chunk_id,SBPrefix host,const SBEntry * entry,int list_id)729 void SafeBrowsingDatabaseNew::InsertAdd(int chunk_id, SBPrefix host,
730 const SBEntry* entry, int list_id) {
731 DCHECK_EQ(creation_loop_, MessageLoop::current());
732
733 SafeBrowsingStore* store = GetStore(list_id);
734 if (!store) return;
735
736 STATS_COUNTER("SB.HostInsert", 1);
737 const int encoded_chunk_id = EncodeChunkId(chunk_id, list_id);
738 const int count = entry->prefix_count();
739
740 DCHECK(!entry->IsSub());
741 if (!count) {
742 // No prefixes, use host instead.
743 STATS_COUNTER("SB.PrefixAdd", 1);
744 store->WriteAddPrefix(encoded_chunk_id, host);
745 } else if (entry->IsPrefix()) {
746 // Prefixes only.
747 for (int i = 0; i < count; i++) {
748 const SBPrefix prefix = entry->PrefixAt(i);
749 STATS_COUNTER("SB.PrefixAdd", 1);
750 store->WriteAddPrefix(encoded_chunk_id, prefix);
751 }
752 } else {
753 // Prefixes and hashes.
754 const base::Time receive_time = base::Time::Now();
755 for (int i = 0; i < count; ++i) {
756 const SBFullHash full_hash = entry->FullHashAt(i);
757 const SBPrefix prefix = full_hash.prefix;
758
759 STATS_COUNTER("SB.PrefixAdd", 1);
760 store->WriteAddPrefix(encoded_chunk_id, prefix);
761
762 STATS_COUNTER("SB.PrefixAddFull", 1);
763 store->WriteAddHash(encoded_chunk_id, receive_time, full_hash);
764 }
765 }
766 }
767
768 // Helper to iterate over all the entries in the hosts in |chunks| and
769 // add them to the store.
InsertAddChunks(const int list_id,const SBChunkList & chunks)770 void SafeBrowsingDatabaseNew::InsertAddChunks(const int list_id,
771 const SBChunkList& chunks) {
772 DCHECK_EQ(creation_loop_, MessageLoop::current());
773
774 SafeBrowsingStore* store = GetStore(list_id);
775 if (!store) return;
776
777 for (SBChunkList::const_iterator citer = chunks.begin();
778 citer != chunks.end(); ++citer) {
779 const int chunk_id = citer->chunk_number;
780
781 // The server can give us a chunk that we already have because
782 // it's part of a range. Don't add it again.
783 const int encoded_chunk_id = EncodeChunkId(chunk_id, list_id);
784 if (store->CheckAddChunk(encoded_chunk_id))
785 continue;
786
787 store->SetAddChunk(encoded_chunk_id);
788 for (std::deque<SBChunkHost>::const_iterator hiter = citer->hosts.begin();
789 hiter != citer->hosts.end(); ++hiter) {
790 // NOTE: Could pass |encoded_chunk_id|, but then inserting add
791 // chunks would look different from inserting sub chunks.
792 InsertAdd(chunk_id, hiter->host, hiter->entry, list_id);
793 }
794 }
795 }
796
797 // Helper to insert entries for all of the prefixes or full hashes in
798 // |entry| into the store.
InsertSub(int chunk_id,SBPrefix host,const SBEntry * entry,int list_id)799 void SafeBrowsingDatabaseNew::InsertSub(int chunk_id, SBPrefix host,
800 const SBEntry* entry, int list_id) {
801 DCHECK_EQ(creation_loop_, MessageLoop::current());
802
803 SafeBrowsingStore* store = GetStore(list_id);
804 if (!store) return;
805
806 STATS_COUNTER("SB.HostDelete", 1);
807 const int encoded_chunk_id = EncodeChunkId(chunk_id, list_id);
808 const int count = entry->prefix_count();
809
810 DCHECK(entry->IsSub());
811 if (!count) {
812 // No prefixes, use host instead.
813 STATS_COUNTER("SB.PrefixSub", 1);
814 const int add_chunk_id = EncodeChunkId(entry->chunk_id(), list_id);
815 store->WriteSubPrefix(encoded_chunk_id, add_chunk_id, host);
816 } else if (entry->IsPrefix()) {
817 // Prefixes only.
818 for (int i = 0; i < count; i++) {
819 const SBPrefix prefix = entry->PrefixAt(i);
820 const int add_chunk_id =
821 EncodeChunkId(entry->ChunkIdAtPrefix(i), list_id);
822
823 STATS_COUNTER("SB.PrefixSub", 1);
824 store->WriteSubPrefix(encoded_chunk_id, add_chunk_id, prefix);
825 }
826 } else {
827 // Prefixes and hashes.
828 for (int i = 0; i < count; ++i) {
829 const SBFullHash full_hash = entry->FullHashAt(i);
830 const int add_chunk_id =
831 EncodeChunkId(entry->ChunkIdAtPrefix(i), list_id);
832
833 STATS_COUNTER("SB.PrefixSub", 1);
834 store->WriteSubPrefix(encoded_chunk_id, add_chunk_id, full_hash.prefix);
835
836 STATS_COUNTER("SB.PrefixSubFull", 1);
837 store->WriteSubHash(encoded_chunk_id, add_chunk_id, full_hash);
838 }
839 }
840 }
841
842 // Helper to iterate over all the entries in the hosts in |chunks| and
843 // add them to the store.
InsertSubChunks(int list_id,const SBChunkList & chunks)844 void SafeBrowsingDatabaseNew::InsertSubChunks(int list_id,
845 const SBChunkList& chunks) {
846 DCHECK_EQ(creation_loop_, MessageLoop::current());
847
848 SafeBrowsingStore* store = GetStore(list_id);
849 if (!store) return;
850
851 for (SBChunkList::const_iterator citer = chunks.begin();
852 citer != chunks.end(); ++citer) {
853 const int chunk_id = citer->chunk_number;
854
855 // The server can give us a chunk that we already have because
856 // it's part of a range. Don't add it again.
857 const int encoded_chunk_id = EncodeChunkId(chunk_id, list_id);
858 if (store->CheckSubChunk(encoded_chunk_id))
859 continue;
860
861 store->SetSubChunk(encoded_chunk_id);
862 for (std::deque<SBChunkHost>::const_iterator hiter = citer->hosts.begin();
863 hiter != citer->hosts.end(); ++hiter) {
864 InsertSub(chunk_id, hiter->host, hiter->entry, list_id);
865 }
866 }
867 }
868
InsertChunks(const std::string & list_name,const SBChunkList & chunks)869 void SafeBrowsingDatabaseNew::InsertChunks(const std::string& list_name,
870 const SBChunkList& chunks) {
871 DCHECK_EQ(creation_loop_, MessageLoop::current());
872
873 if (corruption_detected_ || chunks.empty())
874 return;
875
876 const base::Time insert_start = base::Time::Now();
877
878 const int list_id = safe_browsing_util::GetListId(list_name);
879 DVLOG(2) << list_name << ": " << list_id;
880
881 SafeBrowsingStore* store = GetStore(list_id);
882 if (!store) return;
883
884 change_detected_ = true;
885
886 store->BeginChunk();
887 if (chunks.front().is_add) {
888 InsertAddChunks(list_id, chunks);
889 } else {
890 InsertSubChunks(list_id, chunks);
891 }
892 store->FinishChunk();
893
894 UMA_HISTOGRAM_TIMES("SB2.ChunkInsert", base::Time::Now() - insert_start);
895 }
896
DeleteChunks(const std::vector<SBChunkDelete> & chunk_deletes)897 void SafeBrowsingDatabaseNew::DeleteChunks(
898 const std::vector<SBChunkDelete>& chunk_deletes) {
899 DCHECK_EQ(creation_loop_, MessageLoop::current());
900
901 if (corruption_detected_ || chunk_deletes.empty())
902 return;
903
904 const std::string& list_name = chunk_deletes.front().list_name;
905 const int list_id = safe_browsing_util::GetListId(list_name);
906
907 SafeBrowsingStore* store = GetStore(list_id);
908 if (!store) return;
909
910 change_detected_ = true;
911
912 for (size_t i = 0; i < chunk_deletes.size(); ++i) {
913 std::vector<int> chunk_numbers;
914 RangesToChunks(chunk_deletes[i].chunk_del, &chunk_numbers);
915 for (size_t j = 0; j < chunk_numbers.size(); ++j) {
916 const int encoded_chunk_id = EncodeChunkId(chunk_numbers[j], list_id);
917 if (chunk_deletes[i].is_sub_del)
918 store->DeleteSubChunk(encoded_chunk_id);
919 else
920 store->DeleteAddChunk(encoded_chunk_id);
921 }
922 }
923 }
924
CacheHashResults(const std::vector<SBPrefix> & prefixes,const std::vector<SBFullHashResult> & full_hits)925 void SafeBrowsingDatabaseNew::CacheHashResults(
926 const std::vector<SBPrefix>& prefixes,
927 const std::vector<SBFullHashResult>& full_hits) {
928 // This is called on the I/O thread, lock against updates.
929 base::AutoLock locked(lookup_lock_);
930
931 if (full_hits.empty()) {
932 prefix_miss_cache_.insert(prefixes.begin(), prefixes.end());
933 return;
934 }
935
936 // TODO(shess): SBFullHashResult and SBAddFullHash are very similar.
937 // Refactor to make them identical.
938 const base::Time now = base::Time::Now();
939 const size_t orig_size = pending_browse_hashes_.size();
940 for (std::vector<SBFullHashResult>::const_iterator iter = full_hits.begin();
941 iter != full_hits.end(); ++iter) {
942 const int list_id = safe_browsing_util::GetListId(iter->list_name);
943 if (list_id == safe_browsing_util::MALWARE ||
944 list_id == safe_browsing_util::PHISH) {
945 int encoded_chunk_id = EncodeChunkId(iter->add_chunk_id, list_id);
946 SBAddFullHash add_full_hash(encoded_chunk_id, now, iter->hash);
947 pending_browse_hashes_.push_back(add_full_hash);
948 }
949 }
950
951 // Sort new entries then merge with the previously-sorted entries.
952 std::vector<SBAddFullHash>::iterator
953 orig_end = pending_browse_hashes_.begin() + orig_size;
954 std::sort(orig_end, pending_browse_hashes_.end(), SBAddFullHashPrefixLess);
955 std::inplace_merge(pending_browse_hashes_.begin(),
956 orig_end, pending_browse_hashes_.end(),
957 SBAddFullHashPrefixLess);
958 }
959
UpdateStarted(std::vector<SBListChunkRanges> * lists)960 bool SafeBrowsingDatabaseNew::UpdateStarted(
961 std::vector<SBListChunkRanges>* lists) {
962 DCHECK_EQ(creation_loop_, MessageLoop::current());
963 DCHECK(lists);
964
965 // If |BeginUpdate()| fails, reset the database.
966 if (!browse_store_->BeginUpdate()) {
967 RecordFailure(FAILURE_BROWSE_DATABASE_UPDATE_BEGIN);
968 HandleCorruptDatabase();
969 return false;
970 }
971
972 if (download_store_.get() && !download_store_->BeginUpdate()) {
973 RecordFailure(FAILURE_DOWNLOAD_DATABASE_UPDATE_BEGIN);
974 HandleCorruptDatabase();
975 return false;
976 }
977
978 if (csd_whitelist_store_.get() && !csd_whitelist_store_->BeginUpdate()) {
979 RecordFailure(FAILURE_CSD_WHITELIST_DATABASE_UPDATE_BEGIN);
980 HandleCorruptDatabase();
981 return false;
982 }
983
984 std::vector<std::string> browse_listnames;
985 browse_listnames.push_back(safe_browsing_util::kMalwareList);
986 browse_listnames.push_back(safe_browsing_util::kPhishingList);
987 UpdateChunkRanges(browse_store_.get(), browse_listnames, lists);
988
989 if (download_store_.get()) {
990 std::vector<std::string> download_listnames;
991 download_listnames.push_back(safe_browsing_util::kBinUrlList);
992 download_listnames.push_back(safe_browsing_util::kBinHashList);
993 UpdateChunkRanges(download_store_.get(), download_listnames, lists);
994 }
995
996 if (csd_whitelist_store_.get()) {
997 std::vector<std::string> csd_whitelist_listnames;
998 csd_whitelist_listnames.push_back(safe_browsing_util::kCsdWhiteList);
999 UpdateChunkRanges(csd_whitelist_store_.get(),
1000 csd_whitelist_listnames, lists);
1001 }
1002
1003 corruption_detected_ = false;
1004 change_detected_ = false;
1005 return true;
1006 }
1007
UpdateFinished(bool update_succeeded)1008 void SafeBrowsingDatabaseNew::UpdateFinished(bool update_succeeded) {
1009 DCHECK_EQ(creation_loop_, MessageLoop::current());
1010 if (corruption_detected_)
1011 return;
1012
1013 // Unroll the transaction if there was a protocol error or if the
1014 // transaction was empty. This will leave the bloom filter, the
1015 // pending hashes, and the prefix miss cache in place.
1016 if (!update_succeeded || !change_detected_) {
1017 // Track empty updates to answer questions at http://crbug.com/72216 .
1018 if (update_succeeded && !change_detected_)
1019 UMA_HISTOGRAM_COUNTS("SB2.DatabaseUpdateKilobytes", 0);
1020 browse_store_->CancelUpdate();
1021 if (download_store_.get())
1022 download_store_->CancelUpdate();
1023 if (csd_whitelist_store_.get())
1024 csd_whitelist_store_->CancelUpdate();
1025 return;
1026 }
1027
1028 // for download
1029 UpdateDownloadStore();
1030 // for browsing
1031 UpdateBrowseStore();
1032 // for csd whitelist
1033 UpdateCsdWhitelistStore();
1034 }
1035
UpdateCsdWhitelistStore()1036 void SafeBrowsingDatabaseNew::UpdateCsdWhitelistStore() {
1037 if (!csd_whitelist_store_.get())
1038 return;
1039
1040 // For the csd whitelist, we don't cache and save full hashes since all
1041 // hashes are already full.
1042 std::vector<SBAddFullHash> empty_add_hashes;
1043
1044 // Not needed for the csd whitelist.
1045 std::set<SBPrefix> empty_miss_cache;
1046
1047 // Note: prefixes will not be empty. The current data store implementation
1048 // stores all full-length hashes as both full and prefix hashes.
1049 std::vector<SBAddPrefix> prefixes;
1050 std::vector<SBAddFullHash> full_hashes;
1051 if (!csd_whitelist_store_->FinishUpdate(empty_add_hashes,
1052 empty_miss_cache,
1053 &prefixes,
1054 &full_hashes)) {
1055 RecordFailure(FAILURE_CSD_WHITELIST_DATABASE_UPDATE_FINISH);
1056 CsdWhitelistAllUrls();
1057 return;
1058 }
1059 LoadCsdWhitelist(full_hashes);
1060 }
1061
UpdateDownloadStore()1062 void SafeBrowsingDatabaseNew::UpdateDownloadStore() {
1063 if (!download_store_.get())
1064 return;
1065
1066 // For download, we don't cache and save full hashes.
1067 std::vector<SBAddFullHash> empty_add_hashes;
1068
1069 // For download, backend lookup happens only if a prefix is in add list.
1070 // No need to pass in miss cache when call FinishUpdate to caculate
1071 // bloomfilter false positives.
1072 std::set<SBPrefix> empty_miss_cache;
1073
1074 // These results are not used after this call. Simply ignore the
1075 // returned value after FinishUpdate(...).
1076 std::vector<SBAddPrefix> add_prefixes_result;
1077 std::vector<SBAddFullHash> add_full_hashes_result;
1078
1079 if (!download_store_->FinishUpdate(empty_add_hashes,
1080 empty_miss_cache,
1081 &add_prefixes_result,
1082 &add_full_hashes_result))
1083 RecordFailure(FAILURE_DOWNLOAD_DATABASE_UPDATE_FINISH);
1084 return;
1085 }
1086
UpdateBrowseStore()1087 void SafeBrowsingDatabaseNew::UpdateBrowseStore() {
1088 // Copy out the pending add hashes. Copy rather than swapping in
1089 // case |ContainsBrowseURL()| is called before the new filter is complete.
1090 std::vector<SBAddFullHash> pending_add_hashes;
1091 {
1092 base::AutoLock locked(lookup_lock_);
1093 pending_add_hashes.insert(pending_add_hashes.end(),
1094 pending_browse_hashes_.begin(),
1095 pending_browse_hashes_.end());
1096 }
1097
1098 // Measure the amount of IO during the bloom filter build.
1099 base::IoCounters io_before, io_after;
1100 base::ProcessHandle handle = base::Process::Current().handle();
1101 scoped_ptr<base::ProcessMetrics> metric(
1102 #if !defined(OS_MACOSX)
1103 base::ProcessMetrics::CreateProcessMetrics(handle)
1104 #else
1105 // Getting stats only for the current process is enough, so NULL is fine.
1106 base::ProcessMetrics::CreateProcessMetrics(handle, NULL)
1107 #endif
1108 );
1109
1110 // IoCounters are currently not supported on Mac, and may not be
1111 // available for Linux, so we check the result and only show IO
1112 // stats if they are available.
1113 const bool got_counters = metric->GetIOCounters(&io_before);
1114
1115 const base::Time before = base::Time::Now();
1116
1117 std::vector<SBAddPrefix> add_prefixes;
1118 std::vector<SBAddFullHash> add_full_hashes;
1119 if (!browse_store_->FinishUpdate(pending_add_hashes, prefix_miss_cache_,
1120 &add_prefixes, &add_full_hashes)) {
1121 RecordFailure(FAILURE_BROWSE_DATABASE_UPDATE_FINISH);
1122 return;
1123 }
1124
1125 // Create and populate |filter| from |add_prefixes|.
1126 // TODO(shess): The bloom filter doesn't need to be a
1127 // scoped_refptr<> for this code. Refactor that away.
1128 const int filter_size =
1129 BloomFilter::FilterSizeForKeyCount(add_prefixes.size());
1130 scoped_refptr<BloomFilter> filter(new BloomFilter(filter_size));
1131 for (size_t i = 0; i < add_prefixes.size(); ++i) {
1132 filter->Insert(add_prefixes[i].prefix);
1133 }
1134
1135 scoped_ptr<safe_browsing::PrefixSet>
1136 prefix_set(PrefixSetFromAddPrefixes(add_prefixes));
1137
1138 // This needs to be in sorted order by prefix for efficient access.
1139 std::sort(add_full_hashes.begin(), add_full_hashes.end(),
1140 SBAddFullHashPrefixLess);
1141
1142 // Swap in the newly built filter and cache.
1143 {
1144 base::AutoLock locked(lookup_lock_);
1145 full_browse_hashes_.swap(add_full_hashes);
1146
1147 // TODO(shess): If |CacheHashResults()| is posted between the
1148 // earlier lock and this clear, those pending hashes will be lost.
1149 // It could be fixed by only removing hashes which were collected
1150 // at the earlier point. I believe that is fail-safe as-is (the
1151 // hash will be fetched again).
1152 pending_browse_hashes_.clear();
1153 prefix_miss_cache_.clear();
1154 browse_bloom_filter_.swap(filter);
1155 prefix_set_.swap(prefix_set);
1156 }
1157
1158 const base::TimeDelta bloom_gen = base::Time::Now() - before;
1159
1160 // Persist the bloom filter to disk. Since only this thread changes
1161 // |browse_bloom_filter_|, there is no need to lock.
1162 WriteBloomFilter();
1163
1164 // Gather statistics.
1165 if (got_counters && metric->GetIOCounters(&io_after)) {
1166 UMA_HISTOGRAM_COUNTS("SB2.BuildReadKilobytes",
1167 static_cast<int>(io_after.ReadTransferCount -
1168 io_before.ReadTransferCount) / 1024);
1169 UMA_HISTOGRAM_COUNTS("SB2.BuildWriteKilobytes",
1170 static_cast<int>(io_after.WriteTransferCount -
1171 io_before.WriteTransferCount) / 1024);
1172 UMA_HISTOGRAM_COUNTS("SB2.BuildReadOperations",
1173 static_cast<int>(io_after.ReadOperationCount -
1174 io_before.ReadOperationCount));
1175 UMA_HISTOGRAM_COUNTS("SB2.BuildWriteOperations",
1176 static_cast<int>(io_after.WriteOperationCount -
1177 io_before.WriteOperationCount));
1178 }
1179 DVLOG(1) << "SafeBrowsingDatabaseImpl built bloom filter in "
1180 << bloom_gen.InMilliseconds() << " ms total. prefix count: "
1181 << add_prefixes.size();
1182 UMA_HISTOGRAM_LONG_TIMES("SB2.BuildFilter", bloom_gen);
1183 UMA_HISTOGRAM_COUNTS("SB2.FilterKilobytes",
1184 browse_bloom_filter_->size() / 1024);
1185 int64 size_64;
1186 if (file_util::GetFileSize(browse_filename_, &size_64))
1187 UMA_HISTOGRAM_COUNTS("SB2.BrowseDatabaseKilobytes",
1188 static_cast<int>(size_64 / 1024));
1189 if (file_util::GetFileSize(download_filename_, &size_64))
1190 UMA_HISTOGRAM_COUNTS("SB2.DownloadDatabaseKilobytes",
1191 static_cast<int>(size_64 / 1024));
1192 }
1193
HandleCorruptDatabase()1194 void SafeBrowsingDatabaseNew::HandleCorruptDatabase() {
1195 // Reset the database after the current task has unwound (but only
1196 // reset once within the scope of a given task).
1197 if (reset_factory_.empty()) {
1198 RecordFailure(FAILURE_DATABASE_CORRUPT);
1199 MessageLoop::current()->PostTask(FROM_HERE,
1200 reset_factory_.NewRunnableMethod(
1201 &SafeBrowsingDatabaseNew::OnHandleCorruptDatabase));
1202 }
1203 }
1204
OnHandleCorruptDatabase()1205 void SafeBrowsingDatabaseNew::OnHandleCorruptDatabase() {
1206 RecordFailure(FAILURE_DATABASE_CORRUPT_HANDLER);
1207 corruption_detected_ = true; // Stop updating the database.
1208 ResetDatabase();
1209 DCHECK(false) << "SafeBrowsing database was corrupt and reset";
1210 }
1211
1212 // TODO(shess): I'm not clear why this code doesn't have any
1213 // real error-handling.
LoadBloomFilter()1214 void SafeBrowsingDatabaseNew::LoadBloomFilter() {
1215 DCHECK_EQ(creation_loop_, MessageLoop::current());
1216 DCHECK(!bloom_filter_filename_.empty());
1217
1218 // If we're missing either of the database or filter files, we wait until the
1219 // next update to generate a new filter.
1220 // TODO(paulg): Investigate how often the filter file is missing and how
1221 // expensive it would be to regenerate it.
1222 int64 size_64;
1223 if (!file_util::GetFileSize(browse_filename_, &size_64) || size_64 == 0)
1224 return;
1225
1226 if (!file_util::GetFileSize(bloom_filter_filename_, &size_64) ||
1227 size_64 == 0) {
1228 RecordFailure(FAILURE_DATABASE_FILTER_MISSING);
1229 return;
1230 }
1231
1232 const base::TimeTicks before = base::TimeTicks::Now();
1233 browse_bloom_filter_ = BloomFilter::LoadFile(bloom_filter_filename_);
1234 DVLOG(1) << "SafeBrowsingDatabaseNew read bloom filter in "
1235 << (base::TimeTicks::Now() - before).InMilliseconds() << " ms";
1236
1237 if (!browse_bloom_filter_.get())
1238 RecordFailure(FAILURE_DATABASE_FILTER_READ);
1239
1240 // Manually re-generate the prefix set from the main database.
1241 // TODO(shess): Write/read for prefix set.
1242 std::vector<SBAddPrefix> add_prefixes;
1243 browse_store_->GetAddPrefixes(&add_prefixes);
1244 prefix_set_.reset(PrefixSetFromAddPrefixes(add_prefixes));
1245 }
1246
Delete()1247 bool SafeBrowsingDatabaseNew::Delete() {
1248 DCHECK_EQ(creation_loop_, MessageLoop::current());
1249
1250 const bool r1 = browse_store_->Delete();
1251 if (!r1)
1252 RecordFailure(FAILURE_DATABASE_STORE_DELETE);
1253
1254 const bool r2 = download_store_.get() ? download_store_->Delete() : true;
1255 if (!r2)
1256 RecordFailure(FAILURE_DATABASE_STORE_DELETE);
1257
1258 const bool r3 = csd_whitelist_store_.get() ?
1259 csd_whitelist_store_->Delete() : true;
1260 if (!r3)
1261 RecordFailure(FAILURE_DATABASE_STORE_DELETE);
1262
1263 const bool r4 = file_util::Delete(bloom_filter_filename_, false);
1264 if (!r4)
1265 RecordFailure(FAILURE_DATABASE_FILTER_DELETE);
1266 return r1 && r2 && r3 && r4;
1267 }
1268
WriteBloomFilter()1269 void SafeBrowsingDatabaseNew::WriteBloomFilter() {
1270 DCHECK_EQ(creation_loop_, MessageLoop::current());
1271
1272 if (!browse_bloom_filter_.get())
1273 return;
1274
1275 const base::TimeTicks before = base::TimeTicks::Now();
1276 const bool write_ok = browse_bloom_filter_->WriteFile(bloom_filter_filename_);
1277 DVLOG(1) << "SafeBrowsingDatabaseNew wrote bloom filter in "
1278 << (base::TimeTicks::Now() - before).InMilliseconds() << " ms";
1279
1280 if (!write_ok)
1281 RecordFailure(FAILURE_DATABASE_FILTER_WRITE);
1282 }
1283
CsdWhitelistAllUrls()1284 void SafeBrowsingDatabaseNew::CsdWhitelistAllUrls() {
1285 base::AutoLock locked(lookup_lock_);
1286 csd_whitelist_all_urls_ = true;
1287 csd_whitelist_.clear();
1288 }
1289
LoadCsdWhitelist(const std::vector<SBAddFullHash> & full_hashes)1290 void SafeBrowsingDatabaseNew::LoadCsdWhitelist(
1291 const std::vector<SBAddFullHash>& full_hashes) {
1292 DCHECK_EQ(creation_loop_, MessageLoop::current());
1293 if (full_hashes.size() > kMaxCsdWhitelistSize) {
1294 CsdWhitelistAllUrls();
1295 return;
1296 }
1297
1298 std::vector<SBFullHash> new_csd_whitelist;
1299 for (std::vector<SBAddFullHash>::const_iterator it = full_hashes.begin();
1300 it != full_hashes.end(); ++it) {
1301 new_csd_whitelist.push_back(it->full_hash);
1302 }
1303 std::sort(new_csd_whitelist.begin(), new_csd_whitelist.end());
1304
1305 SBFullHash kill_switch;
1306 crypto::SHA256HashString(kCsdKillSwitchUrl, &kill_switch,
1307 sizeof(kill_switch));
1308 if (std::binary_search(new_csd_whitelist.begin(), new_csd_whitelist.end(),
1309 kill_switch)) {
1310 // The kill switch is whitelisted hence we whitelist all URLs.
1311 CsdWhitelistAllUrls();
1312 } else {
1313 base::AutoLock locked(lookup_lock_);
1314 csd_whitelist_all_urls_ = false;
1315 csd_whitelist_.swap(new_csd_whitelist);
1316 }
1317 }
1318