1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/browser/safe_browsing/safe_browsing_store.h"
6
7 #include <algorithm>
8
9 #include "base/metrics/histogram.h"
10
11 namespace {
12
13 // Find items matching between |subs| and |adds|, and remove them,
14 // recording the item from |adds| in |adds_removed|. To minimize
15 // copies, the inputs are processing in parallel, so |subs| and |adds|
16 // should be compatibly ordered (either by SBAddPrefixLess or
17 // SBAddPrefixHashLess).
18 //
19 // |predAS| provides add < sub, |predSA| provides sub < add, for the
20 // tightest compare appropriate (see calls in SBProcessSubs).
21 template <class S, class A, typename PredAS, typename PredSA>
KnockoutSubs(std::vector<S> * subs,std::vector<A> * adds,PredAS predAS,PredSA predSA,std::vector<A> * adds_removed)22 void KnockoutSubs(std::vector<S>* subs,
23 std::vector<A>* adds,
24 PredAS predAS, PredSA predSA,
25 std::vector<A>* adds_removed) {
26 // Keep a pair of output iterators for writing kept items. Due to
27 // deletions, these may lag the main iterators. Using erase() on
28 // individual items would result in O(N^2) copies. Using std::list
29 // would work around that, at double or triple the memory cost.
30 typename std::vector<A>::iterator add_out = adds->begin();
31 typename std::vector<S>::iterator sub_out = subs->begin();
32
33 // Current location in vectors.
34 // TODO(shess): I want these to be const_iterator, but then
35 // std::copy() gets confused. Could snag a const_iterator add_end,
36 // or write an inline std::copy(), but it seems like I'm doing
37 // something wrong.
38 typename std::vector<A>::iterator add_iter = adds->begin();
39 typename std::vector<S>::iterator sub_iter = subs->begin();
40
41 while (add_iter != adds->end() && sub_iter != subs->end()) {
42 // If |*sub_iter| < |*add_iter|, retain the sub.
43 if (predSA(*sub_iter, *add_iter)) {
44 *sub_out = *sub_iter;
45 ++sub_out;
46 ++sub_iter;
47
48 // If |*add_iter| < |*sub_iter|, retain the add.
49 } else if (predAS(*add_iter, *sub_iter)) {
50 *add_out = *add_iter;
51 ++add_out;
52 ++add_iter;
53
54 // Record equal items and drop them.
55 } else {
56 adds_removed->push_back(*add_iter);
57 ++add_iter;
58 ++sub_iter;
59 }
60 }
61
62 // Erase any leftover gap.
63 adds->erase(add_out, add_iter);
64 subs->erase(sub_out, sub_iter);
65 }
66
67 // Remove items in |removes| from |full_hashes|. |full_hashes| and
68 // |removes| should be ordered by SBAddPrefix component.
69 template <class T>
RemoveMatchingPrefixes(const std::vector<SBAddPrefix> & removes,std::vector<T> * full_hashes)70 void RemoveMatchingPrefixes(const std::vector<SBAddPrefix>& removes,
71 std::vector<T>* full_hashes) {
72 // This is basically an inline of std::set_difference().
73 // Unfortunately, that algorithm requires that the two iterator
74 // pairs use the same value types.
75
76 // Where to store kept items.
77 typename std::vector<T>::iterator out = full_hashes->begin();
78
79 typename std::vector<T>::iterator hash_iter = full_hashes->begin();
80 std::vector<SBAddPrefix>::const_iterator remove_iter = removes.begin();
81
82 while (hash_iter != full_hashes->end() && remove_iter != removes.end()) {
83 // Keep items less than |*remove_iter|.
84 if (SBAddPrefixLess(*hash_iter, *remove_iter)) {
85 *out = *hash_iter;
86 ++out;
87 ++hash_iter;
88
89 // No hit for |*remove_iter|, bump it forward.
90 } else if (SBAddPrefixLess(*remove_iter, *hash_iter)) {
91 ++remove_iter;
92
93 // Drop equal items, there may be multiple hits.
94 } else {
95 do {
96 ++hash_iter;
97 } while (hash_iter != full_hashes->end() &&
98 !SBAddPrefixLess(*remove_iter, *hash_iter));
99 ++remove_iter;
100 }
101 }
102
103 // Erase any leftover gap.
104 full_hashes->erase(out, hash_iter);
105 }
106
107 // Remove deleted items (|chunk_id| in |del_set|) from the vector.
108 template <class T>
RemoveDeleted(std::vector<T> * vec,const base::hash_set<int32> & del_set)109 void RemoveDeleted(std::vector<T>* vec, const base::hash_set<int32>& del_set) {
110 DCHECK(vec);
111
112 // Scan through the items read, dropping the items in |del_set|.
113 typename std::vector<T>::iterator add_iter = vec->begin();
114 for (typename std::vector<T>::iterator iter = add_iter;
115 iter != vec->end(); ++iter) {
116 if (del_set.count(iter->chunk_id) == 0) {
117 *add_iter = *iter;
118 ++add_iter;
119 }
120 }
121 vec->erase(add_iter, vec->end());
122 }
123
124 enum MissTypes {
125 MISS_TYPE_ALL,
126 MISS_TYPE_FALSE,
127
128 // Always at the end.
129 MISS_TYPE_MAX
130 };
131
132 } // namespace
133
SBCheckPrefixMisses(const std::vector<SBAddPrefix> & add_prefixes,const std::set<SBPrefix> & prefix_misses)134 void SBCheckPrefixMisses(const std::vector<SBAddPrefix>& add_prefixes,
135 const std::set<SBPrefix>& prefix_misses) {
136 if (prefix_misses.empty())
137 return;
138
139 // Record a hit for all prefixes which missed when sent to the
140 // server.
141 for (size_t i = 0; i < prefix_misses.size(); ++i) {
142 UMA_HISTOGRAM_ENUMERATION("SB2.BloomFilterFalsePositives",
143 MISS_TYPE_ALL, MISS_TYPE_MAX);
144 }
145
146 // Collect the misses which are not present in |add_prefixes|.
147 // Since |add_prefixes| can contain multiple copies of the same
148 // prefix, it is not sufficient to count the number of elements
149 // present in both collections.
150 std::set<SBPrefix> false_misses(prefix_misses.begin(), prefix_misses.end());
151 for (size_t i = 0; i < add_prefixes.size(); ++i) {
152 // |erase()| on an absent element should cost like |find()|.
153 false_misses.erase(add_prefixes[i].prefix);
154 }
155
156 // Record a hit for prefixes which we shouldn't have sent in the
157 // first place.
158 for (size_t i = 0; i < false_misses.size(); ++i) {
159 UMA_HISTOGRAM_ENUMERATION("SB2.BloomFilterFalsePositives",
160 MISS_TYPE_FALSE, MISS_TYPE_MAX);
161 }
162
163 // Divide |MISS_TYPE_FALSE| by |MISS_TYPE_ALL| to get the
164 // bloom-filter false-positive rate.
165 }
166
SBProcessSubs(std::vector<SBAddPrefix> * add_prefixes,std::vector<SBSubPrefix> * sub_prefixes,std::vector<SBAddFullHash> * add_full_hashes,std::vector<SBSubFullHash> * sub_full_hashes,const base::hash_set<int32> & add_chunks_deleted,const base::hash_set<int32> & sub_chunks_deleted)167 void SBProcessSubs(std::vector<SBAddPrefix>* add_prefixes,
168 std::vector<SBSubPrefix>* sub_prefixes,
169 std::vector<SBAddFullHash>* add_full_hashes,
170 std::vector<SBSubFullHash>* sub_full_hashes,
171 const base::hash_set<int32>& add_chunks_deleted,
172 const base::hash_set<int32>& sub_chunks_deleted) {
173 // It is possible to structure templates and template
174 // specializations such that the following calls work without having
175 // to qualify things. It becomes very arbitrary, though, and less
176 // clear how things are working.
177
178 // Sort the inputs by the SBAddPrefix bits.
179 std::sort(add_prefixes->begin(), add_prefixes->end(),
180 SBAddPrefixLess<SBAddPrefix,SBAddPrefix>);
181 std::sort(sub_prefixes->begin(), sub_prefixes->end(),
182 SBAddPrefixLess<SBSubPrefix,SBSubPrefix>);
183 std::sort(add_full_hashes->begin(), add_full_hashes->end(),
184 SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>);
185 std::sort(sub_full_hashes->begin(), sub_full_hashes->end(),
186 SBAddPrefixHashLess<SBSubFullHash,SBSubFullHash>);
187
188 // Factor out the prefix subs.
189 std::vector<SBAddPrefix> removed_adds;
190 KnockoutSubs(sub_prefixes, add_prefixes,
191 SBAddPrefixLess<SBAddPrefix,SBSubPrefix>,
192 SBAddPrefixLess<SBSubPrefix,SBAddPrefix>,
193 &removed_adds);
194
195 // Remove the full-hashes corrosponding to the adds which
196 // KnockoutSubs() removed. Processing these w/in KnockoutSubs()
197 // would make the code more complicated, and they are very small
198 // relative to the prefix lists so the gain would be modest.
199 RemoveMatchingPrefixes(removed_adds, add_full_hashes);
200 RemoveMatchingPrefixes(removed_adds, sub_full_hashes);
201
202 // http://crbug.com/52385
203 // TODO(shess): AFAICT this pass is not done on the trunk. I
204 // believe that's a bug, but it may not matter because full-hash
205 // subs almost never happen (I think you'd need multiple collisions
206 // where one of the sites stopped being flagged?). Enable this once
207 // everything is in. [if(0) instead of #ifdef 0 to make sure it
208 // compiles.]
209 if (0) {
210 // Factor out the full-hash subs.
211 std::vector<SBAddFullHash> removed_full_adds;
212 KnockoutSubs(sub_full_hashes, add_full_hashes,
213 SBAddPrefixHashLess<SBAddFullHash,SBSubFullHash>,
214 SBAddPrefixHashLess<SBSubFullHash,SBAddFullHash>,
215 &removed_full_adds);
216 }
217
218 // Remove items from the deleted chunks. This is done after other
219 // processing to allow subs to knock out adds (and be removed) even
220 // if the add's chunk is deleted.
221 RemoveDeleted(add_prefixes, add_chunks_deleted);
222 RemoveDeleted(sub_prefixes, sub_chunks_deleted);
223 RemoveDeleted(add_full_hashes, add_chunks_deleted);
224 RemoveDeleted(sub_full_hashes, sub_chunks_deleted);
225 }
226