1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/browser/managed_mode/managed_mode_url_filter.h"
6
7 #include "base/containers/hash_tables.h"
8 #include "base/files/file_path.h"
9 #include "base/json/json_file_value_serializer.h"
10 #include "base/metrics/histogram.h"
11 #include "base/sha1.h"
12 #include "base/strings/string_number_conversions.h"
13 #include "base/strings/string_util.h"
14 #include "base/task_runner_util.h"
15 #include "base/threading/sequenced_worker_pool.h"
16 #include "chrome/browser/policy/url_blacklist_manager.h"
17 #include "components/url_matcher/url_matcher.h"
18 #include "content/public/browser/browser_thread.h"
19 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
20 #include "url/gurl.h"
21
22 using content::BrowserThread;
23 using net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES;
24 using net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES;
25 using net::registry_controlled_domains::GetRegistryLength;
26 using url_matcher::URLMatcher;
27 using url_matcher::URLMatcherConditionSet;
28
29 struct ManagedModeURLFilter::Contents {
30 URLMatcher url_matcher;
31 std::map<URLMatcherConditionSet::ID, int> matcher_site_map;
32 base::hash_multimap<std::string, int> hash_site_map;
33 std::vector<ManagedModeSiteList::Site> sites;
34 };
35
36 namespace {
37
// URL schemes that are subject to filtering. URLs with a scheme that is not in
// this list (e.g., file:// and chrome://) will always be allowed.
// Both the strings and the array slots are const, so the table cannot be
// modified after initialization.
const char* const kFilteredSchemes[] = {
  "http",
  "https",
  "ftp",
  "gopher",
  "ws",
  "wss"
};
48
49
50 // This class encapsulates all the state that is required during construction of
51 // a new ManagedModeURLFilter::Contents.
52 class FilterBuilder {
53 public:
54 FilterBuilder();
55 ~FilterBuilder();
56
57 // Adds a single URL pattern for the site identified by |site_id|.
58 bool AddPattern(const std::string& pattern, int site_id);
59
60 // Adds a single hostname SHA1 hash for the site identified by |site_id|.
61 void AddHostnameHash(const std::string& hash, int site_id);
62
63 // Adds all the sites in |site_list|, with URL patterns and hostname hashes.
64 void AddSiteList(ManagedModeSiteList* site_list);
65
66 // Finalizes construction of the ManagedModeURLFilter::Contents and returns
67 // them. This method should be called before this object is destroyed.
68 scoped_ptr<ManagedModeURLFilter::Contents> Build();
69
70 private:
71 scoped_ptr<ManagedModeURLFilter::Contents> contents_;
72 URLMatcherConditionSet::Vector all_conditions_;
73 URLMatcherConditionSet::ID matcher_id_;
74 };
75
FilterBuilder()76 FilterBuilder::FilterBuilder()
77 : contents_(new ManagedModeURLFilter::Contents()),
78 matcher_id_(0) {}
79
~FilterBuilder()80 FilterBuilder::~FilterBuilder() {
81 DCHECK(!contents_.get());
82 }
83
AddPattern(const std::string & pattern,int site_id)84 bool FilterBuilder::AddPattern(const std::string& pattern, int site_id) {
85 DCHECK(BrowserThread::GetBlockingPool()->RunsTasksOnCurrentThread());
86 std::string scheme;
87 std::string host;
88 uint16 port;
89 std::string path;
90 bool match_subdomains = true;
91 if (!policy::URLBlacklist::FilterToComponents(
92 pattern, &scheme, &host, &match_subdomains, &port, &path)) {
93 LOG(ERROR) << "Invalid pattern " << pattern;
94 return false;
95 }
96
97 scoped_refptr<URLMatcherConditionSet> condition_set =
98 policy::URLBlacklist::CreateConditionSet(
99 &contents_->url_matcher, ++matcher_id_,
100 scheme, host, match_subdomains, port, path);
101 all_conditions_.push_back(condition_set);
102 contents_->matcher_site_map[matcher_id_] = site_id;
103 return true;
104 }
105
AddHostnameHash(const std::string & hash,int site_id)106 void FilterBuilder::AddHostnameHash(const std::string& hash, int site_id) {
107 contents_->hash_site_map.insert(std::make_pair(StringToUpperASCII(hash),
108 site_id));
109 }
110
AddSiteList(ManagedModeSiteList * site_list)111 void FilterBuilder::AddSiteList(ManagedModeSiteList* site_list) {
112 std::vector<ManagedModeSiteList::Site> sites;
113 site_list->GetSites(&sites);
114 int site_id = contents_->sites.size();
115 for (std::vector<ManagedModeSiteList::Site>::const_iterator it =
116 sites.begin(); it != sites.end(); ++it) {
117 const ManagedModeSiteList::Site& site = *it;
118 contents_->sites.push_back(site);
119
120 for (std::vector<std::string>::const_iterator pattern_it =
121 site.patterns.begin();
122 pattern_it != site.patterns.end(); ++pattern_it) {
123 AddPattern(*pattern_it, site_id);
124 }
125
126 for (std::vector<std::string>::const_iterator hash_it =
127 site.hostname_hashes.begin();
128 hash_it != site.hostname_hashes.end(); ++hash_it) {
129 AddHostnameHash(*hash_it, site_id);
130 }
131
132 site_id++;
133 }
134 }
135
Build()136 scoped_ptr<ManagedModeURLFilter::Contents> FilterBuilder::Build() {
137 DCHECK(BrowserThread::GetBlockingPool()->RunsTasksOnCurrentThread());
138 contents_->url_matcher.AddConditionSets(all_conditions_);
139 return contents_.Pass();
140 }
141
CreateWhitelistFromPatterns(const std::vector<std::string> & patterns)142 scoped_ptr<ManagedModeURLFilter::Contents> CreateWhitelistFromPatterns(
143 const std::vector<std::string>& patterns) {
144 DCHECK(BrowserThread::GetBlockingPool()->RunsTasksOnCurrentThread());
145
146 FilterBuilder builder;
147 for (std::vector<std::string>::const_iterator it = patterns.begin();
148 it != patterns.end(); ++it) {
149 // TODO(bauerb): We should create a fake site for the whitelist.
150 builder.AddPattern(*it, -1);
151 }
152
153 return builder.Build();
154 }
155
LoadWhitelistsOnBlockingPoolThread(ScopedVector<ManagedModeSiteList> site_lists)156 scoped_ptr<ManagedModeURLFilter::Contents> LoadWhitelistsOnBlockingPoolThread(
157 ScopedVector<ManagedModeSiteList> site_lists) {
158 DCHECK(BrowserThread::GetBlockingPool()->RunsTasksOnCurrentThread());
159
160 FilterBuilder builder;
161 for (ScopedVector<ManagedModeSiteList>::iterator it = site_lists.begin();
162 it != site_lists.end(); ++it) {
163 builder.AddSiteList(*it);
164 }
165
166 return builder.Build();
167 }
168
169 } // namespace
170
ManagedModeURLFilter()171 ManagedModeURLFilter::ManagedModeURLFilter()
172 : default_behavior_(ALLOW),
173 contents_(new Contents()) {
174 // Detach from the current thread so we can be constructed on a different
175 // thread than the one where we're used.
176 DetachFromThread();
177 }
178
~ManagedModeURLFilter()179 ManagedModeURLFilter::~ManagedModeURLFilter() {
180 DCHECK(CalledOnValidThread());
181 }
182
183 // static
184 ManagedModeURLFilter::FilteringBehavior
BehaviorFromInt(int behavior_value)185 ManagedModeURLFilter::BehaviorFromInt(int behavior_value) {
186 DCHECK_GE(behavior_value, ALLOW);
187 DCHECK_LE(behavior_value, BLOCK);
188 return static_cast<FilteringBehavior>(behavior_value);
189 }
190
191 // static
Normalize(const GURL & url)192 GURL ManagedModeURLFilter::Normalize(const GURL& url) {
193 GURL normalized_url = url;
194 GURL::Replacements replacements;
195 // Strip username, password, query, and ref.
196 replacements.ClearUsername();
197 replacements.ClearPassword();
198 replacements.ClearQuery();
199 replacements.ClearRef();
200 return url.ReplaceComponents(replacements);
201 }
202
203 // static
HasFilteredScheme(const GURL & url)204 bool ManagedModeURLFilter::HasFilteredScheme(const GURL& url) {
205 for (size_t i = 0; i < arraysize(kFilteredSchemes); ++i) {
206 if (url.scheme() == kFilteredSchemes[i])
207 return true;
208 }
209 return false;
210 }
211
GetHostnameHash(const GURL & url)212 std::string GetHostnameHash(const GURL& url) {
213 std::string hash = base::SHA1HashString(url.host());
214 return base::HexEncode(hash.data(), hash.length());
215 }
216
217 // static
HostMatchesPattern(const std::string & host,const std::string & pattern)218 bool ManagedModeURLFilter::HostMatchesPattern(const std::string& host,
219 const std::string& pattern) {
220 std::string trimmed_pattern = pattern;
221 std::string trimmed_host = host;
222 if (EndsWith(pattern, ".*", true)) {
223 size_t registry_length = GetRegistryLength(
224 trimmed_host, EXCLUDE_UNKNOWN_REGISTRIES, EXCLUDE_PRIVATE_REGISTRIES);
225 // A host without a known registry part does not match.
226 if (registry_length == 0)
227 return false;
228
229 trimmed_pattern.erase(trimmed_pattern.length() - 2);
230 trimmed_host.erase(trimmed_host.length() - (registry_length + 1));
231 }
232
233 if (StartsWithASCII(trimmed_pattern, "*.", true)) {
234 trimmed_pattern.erase(0, 2);
235
236 // The remaining pattern should be non-empty, and it should not contain
237 // further stars. Also the trimmed host needs to end with the trimmed
238 // pattern.
239 if (trimmed_pattern.empty() ||
240 trimmed_pattern.find('*') != std::string::npos ||
241 !EndsWith(trimmed_host, trimmed_pattern, true)) {
242 return false;
243 }
244
245 // The trimmed host needs to have a dot separating the subdomain from the
246 // matched pattern piece, unless there is no subdomain.
247 int pos = trimmed_host.length() - trimmed_pattern.length();
248 DCHECK_GE(pos, 0);
249 return (pos == 0) || (trimmed_host[pos - 1] == '.');
250 }
251
252 return trimmed_host == trimmed_pattern;
253 }
254
255 ManagedModeURLFilter::FilteringBehavior
GetFilteringBehaviorForURL(const GURL & url) const256 ManagedModeURLFilter::GetFilteringBehaviorForURL(const GURL& url) const {
257 DCHECK(CalledOnValidThread());
258
259 // URLs with a non-standard scheme (e.g. chrome://) are always allowed.
260 if (!HasFilteredScheme(url))
261 return ALLOW;
262
263 // Check manual overrides for the exact URL.
264 std::map<GURL, bool>::const_iterator url_it = url_map_.find(Normalize(url));
265 if (url_it != url_map_.end())
266 return url_it->second ? ALLOW : BLOCK;
267
268 // Check manual overrides for the hostname.
269 std::string host = url.host();
270 std::map<std::string, bool>::const_iterator host_it = host_map_.find(host);
271 if (host_it != host_map_.end())
272 return host_it->second ? ALLOW : BLOCK;
273
274 // Look for patterns matching the hostname, with a value that is different
275 // from the default (a value of true in the map meaning allowed).
276 for (std::map<std::string, bool>::const_iterator host_it =
277 host_map_.begin(); host_it != host_map_.end(); ++host_it) {
278 if ((host_it->second == (default_behavior_ == BLOCK)) &&
279 HostMatchesPattern(host, host_it->first)) {
280 return host_it->second ? ALLOW : BLOCK;
281 }
282 }
283
284 // If the default behavior is to allow, we don't need to check anything else.
285 if (default_behavior_ == ALLOW)
286 return ALLOW;
287
288 // Check the list of URL patterns.
289 std::set<URLMatcherConditionSet::ID> matching_ids =
290 contents_->url_matcher.MatchURL(url);
291 if (!matching_ids.empty())
292 return ALLOW;
293
294 // Check the list of hostname hashes.
295 if (contents_->hash_site_map.count(GetHostnameHash(url)))
296 return ALLOW;
297
298 // Fall back to the default behavior.
299 return default_behavior_;
300 }
301
GetSites(const GURL & url,std::vector<ManagedModeSiteList::Site * > * sites) const302 void ManagedModeURLFilter::GetSites(
303 const GURL& url,
304 std::vector<ManagedModeSiteList::Site*>* sites) const {
305 std::set<URLMatcherConditionSet::ID> matching_ids =
306 contents_->url_matcher.MatchURL(url);
307 for (std::set<URLMatcherConditionSet::ID>::const_iterator it =
308 matching_ids.begin(); it != matching_ids.end(); ++it) {
309 std::map<URLMatcherConditionSet::ID, int>::const_iterator entry =
310 contents_->matcher_site_map.find(*it);
311 if (entry == contents_->matcher_site_map.end()) {
312 NOTREACHED();
313 continue;
314 }
315 sites->push_back(&contents_->sites[entry->second]);
316 }
317
318 typedef base::hash_multimap<std::string, int>::const_iterator
319 hash_site_map_iterator;
320 std::pair<hash_site_map_iterator, hash_site_map_iterator> bounds =
321 contents_->hash_site_map.equal_range(GetHostnameHash(url));
322 for (hash_site_map_iterator hash_it = bounds.first;
323 hash_it != bounds.second; hash_it++) {
324 sites->push_back(&contents_->sites[hash_it->second]);
325 }
326 }
327
SetDefaultFilteringBehavior(FilteringBehavior behavior)328 void ManagedModeURLFilter::SetDefaultFilteringBehavior(
329 FilteringBehavior behavior) {
330 DCHECK(CalledOnValidThread());
331 default_behavior_ = behavior;
332 }
333
LoadWhitelists(ScopedVector<ManagedModeSiteList> site_lists)334 void ManagedModeURLFilter::LoadWhitelists(
335 ScopedVector<ManagedModeSiteList> site_lists) {
336 DCHECK(CalledOnValidThread());
337
338 base::PostTaskAndReplyWithResult(
339 BrowserThread::GetBlockingPool(),
340 FROM_HERE,
341 base::Bind(&LoadWhitelistsOnBlockingPoolThread,
342 base::Passed(&site_lists)),
343 base::Bind(&ManagedModeURLFilter::SetContents, this));
344 }
345
SetFromPatterns(const std::vector<std::string> & patterns)346 void ManagedModeURLFilter::SetFromPatterns(
347 const std::vector<std::string>& patterns) {
348 DCHECK(CalledOnValidThread());
349
350 base::PostTaskAndReplyWithResult(
351 BrowserThread::GetBlockingPool(),
352 FROM_HERE,
353 base::Bind(&CreateWhitelistFromPatterns, patterns),
354 base::Bind(&ManagedModeURLFilter::SetContents, this));
355 }
356
SetManualHosts(const std::map<std::string,bool> * host_map)357 void ManagedModeURLFilter::SetManualHosts(
358 const std::map<std::string, bool>* host_map) {
359 DCHECK(CalledOnValidThread());
360 host_map_ = *host_map;
361 UMA_HISTOGRAM_CUSTOM_COUNTS("ManagedMode.ManualHostsEntries",
362 host_map->size(), 1, 1000, 50);
363 }
364
SetManualURLs(const std::map<GURL,bool> * url_map)365 void ManagedModeURLFilter::SetManualURLs(
366 const std::map<GURL, bool>* url_map) {
367 DCHECK(CalledOnValidThread());
368 url_map_ = *url_map;
369 UMA_HISTOGRAM_CUSTOM_COUNTS("ManagedMode.ManualURLsEntries",
370 url_map->size(), 1, 1000, 50);
371 }
372
AddObserver(Observer * observer)373 void ManagedModeURLFilter::AddObserver(Observer* observer) {
374 observers_.AddObserver(observer);
375 }
376
RemoveObserver(Observer * observer)377 void ManagedModeURLFilter::RemoveObserver(Observer* observer) {
378 observers_.RemoveObserver(observer);
379 }
380
SetContents(scoped_ptr<Contents> contents)381 void ManagedModeURLFilter::SetContents(scoped_ptr<Contents> contents) {
382 DCHECK(CalledOnValidThread());
383 contents_ = contents.Pass();
384 FOR_EACH_OBSERVER(Observer, observers_, OnSiteListUpdated());
385 }
386