1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "components/url_matcher/url_matcher.h"
6
7 #include <algorithm>
8 #include <iterator>
9
10 #include "base/logging.h"
11 #include "url/gurl.h"
12 #include "url/url_canon.h"
13
14 namespace url_matcher {
15
16 // This set of classes implement a mapping of URL Component Patterns, such as
17 // host_prefix, host_suffix, host_equals, ..., etc., to StringPatterns
18 // for use in substring comparisons.
19 //
20 // The idea of this mapping is to reduce the problem of comparing many
21 // URL Component Patterns against one URL to the problem of searching many
22 // substrings in one string:
23 //
24 // ---------------------- -----------------
25 // | URL Query operator | ----translate----> | StringPattern |
26 // ---------------------- -----------------
27 // ^
28 // |
29 // compare
30 // |
31 // v
32 // ---------------------- -----------------
33 // | URL to compare | | |
34 // | to all URL Query | ----translate----> | String |
35 // | operators | | |
36 // ---------------------- -----------------
37 //
38 // The reason for this problem reduction is that there are efficient algorithms
39 // for searching many substrings in one string (see Aho-Corasick algorithm).
40 //
41 // Additionally, some of the same pieces are reused to implement regular
42 // expression comparisons. The FilteredRE2 implementation for matching many
43 // regular expressions against one string uses prefiltering, in which a set
44 // of substrings (derived from the regexes) are first searched for, to reduce
45 // the number of regular expressions to test; the prefiltering step also
46 // uses Aho-Corasick.
47 //
48 // Case 1: {host,path,query}_{prefix,suffix,equals} searches.
49 // ==========================================================
50 //
51 // For searches in this class, we normalize URLs as follows:
52 //
53 // Step 1:
54 // Remove scheme, port and segment from URL:
55 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes
56 // www.example.com/index.html?search=foo
57 //
58 // We remove the scheme and port number because they can be checked later
59 // in a secondary filter step. We remove the segment (the #... part) because
60 // this is not guaranteed to be ASCII-7 encoded.
61 //
62 // Step 2:
63 // Translate URL to String and add the following position markers:
64 // - BU = Beginning of URL
65 // - ED = End of Domain
66 // - EP = End of Path
67 // - EU = End of URL
68 // Furthermore, the hostname is canonicalized to start with a ".".
69 //
70 // Position markers are represented as characters >127, which are therefore
71 // guaranteed not to be part of the ASCII-7 encoded URL character set.
72 //
73 // -> www.example.com/index.html?search=foo becomes
74 // BU .www.example.com ED /index.html EP ?search=foo EU
75 //
76 // -> www.example.com/index.html becomes
77 // BU .www.example.com ED /index.html EP EU
78 //
79 // Step 3:
80 // Translate URL Component Patterns as follows:
81 //
82 // host_prefix(prefix) = BU add_missing_dot_prefix(prefix)
83 // -> host_prefix("www.example") = BU .www.example
84 //
85 // host_suffix(suffix) = suffix ED
86 // -> host_suffix("example.com") = example.com ED
87 // -> host_suffix(".example.com") = .example.com ED
88 //
89 // host_equals(domain) = BU add_missing_dot_prefix(domain) ED
90 // -> host_equals("www.example.com") = BU .www.example.com ED
91 //
92 // Similarly for path query parameters ({path, query}_{prefix, suffix, equals}).
93 //
94 // With this, we can search the StringPatterns in the normalized URL.
95 //
96 //
97 // Case 2: url_{prefix,suffix,equals,contains} searches.
98 // =====================================================
99 //
100 // Step 1: as above, except that
101 // - the scheme is not removed
102 // - the port is not removed if it is specified and does not match the default
103 // port for the given scheme.
104 //
105 // Step 2:
106 // Translate URL to String and add the following position markers:
107 // - BU = Beginning of URL
108 // - EU = End of URL
109 //
110 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes
111 // BU http://www.example.com:8080/index.html?search=foo EU
112 // -> http://www.example.com:80/index.html?search=foo#first_match becomes
113 // BU http://www.example.com/index.html?search=foo EU
114 //
115 // url_prefix(prefix) = BU prefix
116 // -> url_prefix("http://www.example") = BU http://www.example
117 //
118 // url_contains(substring) = substring
119 // -> url_contains("index") = index
120 //
121 //
122 // Case 3: {host,path,query}_contains searches.
123 // ============================================
124 //
125 // These kinds of searches are not supported directly but can be derived
126 // by a combination of a url_contains() query followed by an explicit test:
127 //
128 // host_contains(str) = url_contains(str) followed by test whether str occurs
129 // in host component of original URL.
130 // -> host_contains("example.co") = example.co
131 // followed by gurl.host().find("example.co");
132 //
133 // [similarly for path_contains and query_contains].
134 //
135 //
136 // Regular expression matching (url_matches searches)
137 // ==================================================
138 //
139 // This class also supports matching regular expressions (RE2 syntax)
140 // against full URLs, which are transformed as in case 2.
141
142 namespace {
143
IsRegexCriterion(URLMatcherCondition::Criterion criterion)144 bool IsRegexCriterion(URLMatcherCondition::Criterion criterion) {
145 return criterion == URLMatcherCondition::URL_MATCHES;
146 }
147
IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion)148 bool IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion) {
149 return criterion == URLMatcherCondition::ORIGIN_AND_PATH_MATCHES;
150 }
151
152 } // namespace
153
154 //
155 // URLMatcherCondition
156 //
157
URLMatcherCondition()158 URLMatcherCondition::URLMatcherCondition()
159 : criterion_(HOST_PREFIX),
160 string_pattern_(NULL) {}
161
~URLMatcherCondition()162 URLMatcherCondition::~URLMatcherCondition() {}
163
URLMatcherCondition(Criterion criterion,const StringPattern * string_pattern)164 URLMatcherCondition::URLMatcherCondition(
165 Criterion criterion,
166 const StringPattern* string_pattern)
167 : criterion_(criterion),
168 string_pattern_(string_pattern) {}
169
URLMatcherCondition(const URLMatcherCondition & rhs)170 URLMatcherCondition::URLMatcherCondition(const URLMatcherCondition& rhs)
171 : criterion_(rhs.criterion_),
172 string_pattern_(rhs.string_pattern_) {}
173
operator =(const URLMatcherCondition & rhs)174 URLMatcherCondition& URLMatcherCondition::operator=(
175 const URLMatcherCondition& rhs) {
176 criterion_ = rhs.criterion_;
177 string_pattern_ = rhs.string_pattern_;
178 return *this;
179 }
180
operator <(const URLMatcherCondition & rhs) const181 bool URLMatcherCondition::operator<(const URLMatcherCondition& rhs) const {
182 if (criterion_ < rhs.criterion_) return true;
183 if (criterion_ > rhs.criterion_) return false;
184 if (string_pattern_ != NULL && rhs.string_pattern_ != NULL)
185 return *string_pattern_ < *rhs.string_pattern_;
186 if (string_pattern_ == NULL && rhs.string_pattern_ != NULL) return true;
187 // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL,
188 // or both are NULL.
189 return false;
190 }
191
IsFullURLCondition() const192 bool URLMatcherCondition::IsFullURLCondition() const {
193 // For these criteria the SubstringMatcher needs to be executed on the
194 // GURL that is canonicalized with
195 // URLMatcherConditionFactory::CanonicalizeURLForFullSearches.
196 switch (criterion_) {
197 case HOST_CONTAINS:
198 case PATH_CONTAINS:
199 case QUERY_CONTAINS:
200 case URL_PREFIX:
201 case URL_SUFFIX:
202 case URL_CONTAINS:
203 case URL_EQUALS:
204 return true;
205 default:
206 break;
207 }
208 return false;
209 }
210
IsRegexCondition() const211 bool URLMatcherCondition::IsRegexCondition() const {
212 return IsRegexCriterion(criterion_);
213 }
214
IsOriginAndPathRegexCondition() const215 bool URLMatcherCondition::IsOriginAndPathRegexCondition() const {
216 return IsOriginAndPathRegexCriterion(criterion_);
217 }
218
IsMatch(const std::set<StringPattern::ID> & matching_patterns,const GURL & url) const219 bool URLMatcherCondition::IsMatch(
220 const std::set<StringPattern::ID>& matching_patterns,
221 const GURL& url) const {
222 DCHECK(string_pattern_);
223 if (!ContainsKey(matching_patterns, string_pattern_->id()))
224 return false;
225 // The criteria HOST_CONTAINS, PATH_CONTAINS, QUERY_CONTAINS are based on
226 // a substring match on the raw URL. In case of a match, we need to verify
227 // that the match was found in the correct component of the URL.
228 switch (criterion_) {
229 case HOST_CONTAINS:
230 return url.host().find(string_pattern_->pattern()) !=
231 std::string::npos;
232 case PATH_CONTAINS:
233 return url.path().find(string_pattern_->pattern()) !=
234 std::string::npos;
235 case QUERY_CONTAINS:
236 return url.query().find(string_pattern_->pattern()) !=
237 std::string::npos;
238 default:
239 break;
240 }
241 return true;
242 }
243
244 //
245 // URLMatcherConditionFactory
246 //
247
248 namespace {
249 // These are symbols that are not contained in 7-bit ASCII used in GURLs.
250 const char kBeginningOfURL[] = {static_cast<char>(-1), 0};
251 const char kEndOfDomain[] = {static_cast<char>(-2), 0};
252 const char kEndOfPath[] = {static_cast<char>(-3), 0};
253 const char kEndOfURL[] = {static_cast<char>(-4), 0};
254 } // namespace
255
URLMatcherConditionFactory()256 URLMatcherConditionFactory::URLMatcherConditionFactory() : id_counter_(0) {}
257
~URLMatcherConditionFactory()258 URLMatcherConditionFactory::~URLMatcherConditionFactory() {
259 STLDeleteElements(&substring_pattern_singletons_);
260 STLDeleteElements(®ex_pattern_singletons_);
261 STLDeleteElements(&origin_and_path_regex_pattern_singletons_);
262 }
263
CanonicalizeURLForComponentSearches(const GURL & url) const264 std::string URLMatcherConditionFactory::CanonicalizeURLForComponentSearches(
265 const GURL& url) const {
266 return kBeginningOfURL + CanonicalizeHostname(url.host()) + kEndOfDomain +
267 url.path() + kEndOfPath +
268 (url.has_query() ? "?" + url.query() : std::string()) + kEndOfURL;
269 }
270
CreateHostPrefixCondition(const std::string & prefix)271 URLMatcherCondition URLMatcherConditionFactory::CreateHostPrefixCondition(
272 const std::string& prefix) {
273 return CreateCondition(URLMatcherCondition::HOST_PREFIX,
274 kBeginningOfURL + CanonicalizeHostname(prefix));
275 }
276
CreateHostSuffixCondition(const std::string & suffix)277 URLMatcherCondition URLMatcherConditionFactory::CreateHostSuffixCondition(
278 const std::string& suffix) {
279 return CreateCondition(URLMatcherCondition::HOST_SUFFIX,
280 suffix + kEndOfDomain);
281 }
282
CreateHostContainsCondition(const std::string & str)283 URLMatcherCondition URLMatcherConditionFactory::CreateHostContainsCondition(
284 const std::string& str) {
285 return CreateCondition(URLMatcherCondition::HOST_CONTAINS, str);
286 }
287
CreateHostEqualsCondition(const std::string & str)288 URLMatcherCondition URLMatcherConditionFactory::CreateHostEqualsCondition(
289 const std::string& str) {
290 return CreateCondition(URLMatcherCondition::HOST_EQUALS,
291 kBeginningOfURL + CanonicalizeHostname(str) + kEndOfDomain);
292 }
293
CreatePathPrefixCondition(const std::string & prefix)294 URLMatcherCondition URLMatcherConditionFactory::CreatePathPrefixCondition(
295 const std::string& prefix) {
296 return CreateCondition(URLMatcherCondition::PATH_PREFIX,
297 kEndOfDomain + prefix);
298 }
299
CreatePathSuffixCondition(const std::string & suffix)300 URLMatcherCondition URLMatcherConditionFactory::CreatePathSuffixCondition(
301 const std::string& suffix) {
302 return CreateCondition(URLMatcherCondition::PATH_SUFFIX, suffix + kEndOfPath);
303 }
304
CreatePathContainsCondition(const std::string & str)305 URLMatcherCondition URLMatcherConditionFactory::CreatePathContainsCondition(
306 const std::string& str) {
307 return CreateCondition(URLMatcherCondition::PATH_CONTAINS, str);
308 }
309
CreatePathEqualsCondition(const std::string & str)310 URLMatcherCondition URLMatcherConditionFactory::CreatePathEqualsCondition(
311 const std::string& str) {
312 return CreateCondition(URLMatcherCondition::PATH_EQUALS,
313 kEndOfDomain + str + kEndOfPath);
314 }
315
CreateQueryPrefixCondition(const std::string & prefix)316 URLMatcherCondition URLMatcherConditionFactory::CreateQueryPrefixCondition(
317 const std::string& prefix) {
318 std::string pattern;
319 if (!prefix.empty() && prefix[0] == '?')
320 pattern = kEndOfPath + prefix;
321 else
322 pattern = kEndOfPath + ('?' + prefix);
323
324 return CreateCondition(URLMatcherCondition::QUERY_PREFIX, pattern);
325 }
326
CreateQuerySuffixCondition(const std::string & suffix)327 URLMatcherCondition URLMatcherConditionFactory::CreateQuerySuffixCondition(
328 const std::string& suffix) {
329 if (!suffix.empty() && suffix[0] == '?') {
330 return CreateQueryEqualsCondition(suffix);
331 } else {
332 return CreateCondition(URLMatcherCondition::QUERY_SUFFIX,
333 suffix + kEndOfURL);
334 }
335 }
336
CreateQueryContainsCondition(const std::string & str)337 URLMatcherCondition URLMatcherConditionFactory::CreateQueryContainsCondition(
338 const std::string& str) {
339 if (!str.empty() && str[0] == '?')
340 return CreateQueryPrefixCondition(str);
341 else
342 return CreateCondition(URLMatcherCondition::QUERY_CONTAINS, str);
343 }
344
CreateQueryEqualsCondition(const std::string & str)345 URLMatcherCondition URLMatcherConditionFactory::CreateQueryEqualsCondition(
346 const std::string& str) {
347 std::string pattern;
348 if (!str.empty() && str[0] == '?')
349 pattern = kEndOfPath + str + kEndOfURL;
350 else
351 pattern = kEndOfPath + ('?' + str) + kEndOfURL;
352
353 return CreateCondition(URLMatcherCondition::QUERY_EQUALS, pattern);
354 }
355
356 URLMatcherCondition
CreateHostSuffixPathPrefixCondition(const std::string & host_suffix,const std::string & path_prefix)357 URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition(
358 const std::string& host_suffix,
359 const std::string& path_prefix) {
360 return CreateCondition(URLMatcherCondition::HOST_SUFFIX_PATH_PREFIX,
361 host_suffix + kEndOfDomain + path_prefix);
362 }
363
364 URLMatcherCondition
CreateHostEqualsPathPrefixCondition(const std::string & host,const std::string & path_prefix)365 URLMatcherConditionFactory::CreateHostEqualsPathPrefixCondition(
366 const std::string& host,
367 const std::string& path_prefix) {
368 return CreateCondition(URLMatcherCondition::HOST_EQUALS_PATH_PREFIX,
369 kBeginningOfURL + CanonicalizeHostname(host) + kEndOfDomain +
370 path_prefix);
371 }
372
CanonicalizeURLForFullSearches(const GURL & url) const373 std::string URLMatcherConditionFactory::CanonicalizeURLForFullSearches(
374 const GURL& url) const {
375 GURL::Replacements replacements;
376 replacements.ClearPassword();
377 replacements.ClearUsername();
378 replacements.ClearRef();
379 // Clear port if it is implicit from scheme.
380 if (url.has_port()) {
381 const std::string& port = url.scheme();
382 if (url_canon::DefaultPortForScheme(port.c_str(), port.size()) ==
383 url.EffectiveIntPort()) {
384 replacements.ClearPort();
385 }
386 }
387 return kBeginningOfURL + url.ReplaceComponents(replacements).spec() +
388 kEndOfURL;
389 }
390
CanonicalizeURLForRegexSearchesHelper(const GURL & url,bool clear_query)391 static std::string CanonicalizeURLForRegexSearchesHelper(
392 const GURL& url,
393 bool clear_query) {
394 GURL::Replacements replacements;
395 replacements.ClearPassword();
396 replacements.ClearUsername();
397 replacements.ClearRef();
398 if (clear_query)
399 replacements.ClearQuery();
400 // Clear port if it is implicit from scheme.
401 if (url.has_port()) {
402 const std::string& port = url.scheme();
403 if (url_canon::DefaultPortForScheme(port.c_str(), port.size()) ==
404 url.EffectiveIntPort()) {
405 replacements.ClearPort();
406 }
407 }
408 return url.ReplaceComponents(replacements).spec();
409 }
410
CanonicalizeURLForRegexSearches(const GURL & url) const411 std::string URLMatcherConditionFactory::CanonicalizeURLForRegexSearches(
412 const GURL& url) const {
413 return CanonicalizeURLForRegexSearchesHelper(url, false);
414 }
415
416 std::string
CanonicalizeURLForOriginAndPathRegexSearches(const GURL & url) const417 URLMatcherConditionFactory::CanonicalizeURLForOriginAndPathRegexSearches(
418 const GURL& url) const {
419 return CanonicalizeURLForRegexSearchesHelper(url, true);
420 }
421
CreateURLPrefixCondition(const std::string & prefix)422 URLMatcherCondition URLMatcherConditionFactory::CreateURLPrefixCondition(
423 const std::string& prefix) {
424 return CreateCondition(URLMatcherCondition::URL_PREFIX,
425 kBeginningOfURL + prefix);
426 }
427
CreateURLSuffixCondition(const std::string & suffix)428 URLMatcherCondition URLMatcherConditionFactory::CreateURLSuffixCondition(
429 const std::string& suffix) {
430 return CreateCondition(URLMatcherCondition::URL_SUFFIX, suffix + kEndOfURL);
431 }
432
CreateURLContainsCondition(const std::string & str)433 URLMatcherCondition URLMatcherConditionFactory::CreateURLContainsCondition(
434 const std::string& str) {
435 return CreateCondition(URLMatcherCondition::URL_CONTAINS, str);
436 }
437
CreateURLEqualsCondition(const std::string & str)438 URLMatcherCondition URLMatcherConditionFactory::CreateURLEqualsCondition(
439 const std::string& str) {
440 return CreateCondition(URLMatcherCondition::URL_EQUALS,
441 kBeginningOfURL + str + kEndOfURL);
442 }
443
CreateURLMatchesCondition(const std::string & regex)444 URLMatcherCondition URLMatcherConditionFactory::CreateURLMatchesCondition(
445 const std::string& regex) {
446 return CreateCondition(URLMatcherCondition::URL_MATCHES, regex);
447 }
448
449 URLMatcherCondition
CreateOriginAndPathMatchesCondition(const std::string & regex)450 URLMatcherConditionFactory::CreateOriginAndPathMatchesCondition(
451 const std::string& regex) {
452 return CreateCondition(URLMatcherCondition::ORIGIN_AND_PATH_MATCHES, regex);
453 }
454
ForgetUnusedPatterns(const std::set<StringPattern::ID> & used_patterns)455 void URLMatcherConditionFactory::ForgetUnusedPatterns(
456 const std::set<StringPattern::ID>& used_patterns) {
457 PatternSingletons::iterator i = substring_pattern_singletons_.begin();
458 while (i != substring_pattern_singletons_.end()) {
459 if (ContainsKey(used_patterns, (*i)->id())) {
460 ++i;
461 } else {
462 delete *i;
463 substring_pattern_singletons_.erase(i++);
464 }
465 }
466 i = regex_pattern_singletons_.begin();
467 while (i != regex_pattern_singletons_.end()) {
468 if (ContainsKey(used_patterns, (*i)->id())) {
469 ++i;
470 } else {
471 delete *i;
472 regex_pattern_singletons_.erase(i++);
473 }
474 }
475 i = origin_and_path_regex_pattern_singletons_.begin();
476 while (i != origin_and_path_regex_pattern_singletons_.end()) {
477 if (ContainsKey(used_patterns, (*i)->id())) {
478 ++i;
479 } else {
480 delete *i;
481 origin_and_path_regex_pattern_singletons_.erase(i++);
482 }
483 }
484 }
485
IsEmpty() const486 bool URLMatcherConditionFactory::IsEmpty() const {
487 return substring_pattern_singletons_.empty() &&
488 regex_pattern_singletons_.empty() &&
489 origin_and_path_regex_pattern_singletons_.empty();
490 }
491
CreateCondition(URLMatcherCondition::Criterion criterion,const std::string & pattern)492 URLMatcherCondition URLMatcherConditionFactory::CreateCondition(
493 URLMatcherCondition::Criterion criterion,
494 const std::string& pattern) {
495 StringPattern search_pattern(pattern, 0);
496 PatternSingletons* pattern_singletons = NULL;
497 if (IsRegexCriterion(criterion))
498 pattern_singletons = ®ex_pattern_singletons_;
499 else if (IsOriginAndPathRegexCriterion(criterion))
500 pattern_singletons = &origin_and_path_regex_pattern_singletons_;
501 else
502 pattern_singletons = &substring_pattern_singletons_;
503
504 PatternSingletons::const_iterator iter =
505 pattern_singletons->find(&search_pattern);
506
507 if (iter != pattern_singletons->end()) {
508 return URLMatcherCondition(criterion, *iter);
509 } else {
510 StringPattern* new_pattern =
511 new StringPattern(pattern, id_counter_++);
512 pattern_singletons->insert(new_pattern);
513 return URLMatcherCondition(criterion, new_pattern);
514 }
515 }
516
CanonicalizeHostname(const std::string & hostname) const517 std::string URLMatcherConditionFactory::CanonicalizeHostname(
518 const std::string& hostname) const {
519 if (!hostname.empty() && hostname[0] == '.')
520 return hostname;
521 else
522 return "." + hostname;
523 }
524
operator ()(StringPattern * lhs,StringPattern * rhs) const525 bool URLMatcherConditionFactory::StringPatternPointerCompare::operator()(
526 StringPattern* lhs,
527 StringPattern* rhs) const {
528 if (lhs == NULL && rhs != NULL) return true;
529 if (lhs != NULL && rhs != NULL)
530 return lhs->pattern() < rhs->pattern();
531 // Either both are NULL or only rhs is NULL.
532 return false;
533 }
534
535 //
536 // URLMatcherSchemeFilter
537 //
538
URLMatcherSchemeFilter(const std::string & filter)539 URLMatcherSchemeFilter::URLMatcherSchemeFilter(const std::string& filter)
540 : filters_(1) {
541 filters_.push_back(filter);
542 }
543
URLMatcherSchemeFilter(const std::vector<std::string> & filters)544 URLMatcherSchemeFilter::URLMatcherSchemeFilter(
545 const std::vector<std::string>& filters)
546 : filters_(filters) {}
547
~URLMatcherSchemeFilter()548 URLMatcherSchemeFilter::~URLMatcherSchemeFilter() {}
549
IsMatch(const GURL & url) const550 bool URLMatcherSchemeFilter::IsMatch(const GURL& url) const {
551 return std::find(filters_.begin(), filters_.end(), url.scheme()) !=
552 filters_.end();
553 }
554
555 //
556 // URLMatcherPortFilter
557 //
558
URLMatcherPortFilter(const std::vector<URLMatcherPortFilter::Range> & ranges)559 URLMatcherPortFilter::URLMatcherPortFilter(
560 const std::vector<URLMatcherPortFilter::Range>& ranges)
561 : ranges_(ranges) {}
562
~URLMatcherPortFilter()563 URLMatcherPortFilter::~URLMatcherPortFilter() {}
564
IsMatch(const GURL & url) const565 bool URLMatcherPortFilter::IsMatch(const GURL& url) const {
566 int port = url.EffectiveIntPort();
567 for (std::vector<Range>::const_iterator i = ranges_.begin();
568 i != ranges_.end(); ++i) {
569 if (i->first <= port && port <= i->second)
570 return true;
571 }
572 return false;
573 }
574
575 // static
CreateRange(int from,int to)576 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int from,
577 int to) {
578 return Range(from, to);
579 }
580
581 // static
CreateRange(int port)582 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int port) {
583 return Range(port, port);
584 }
585
586 //
587 // URLMatcherConditionSet
588 //
589
~URLMatcherConditionSet()590 URLMatcherConditionSet::~URLMatcherConditionSet() {}
591
URLMatcherConditionSet(ID id,const Conditions & conditions)592 URLMatcherConditionSet::URLMatcherConditionSet(
593 ID id,
594 const Conditions& conditions)
595 : id_(id),
596 conditions_(conditions) {}
597
URLMatcherConditionSet(ID id,const Conditions & conditions,scoped_ptr<URLMatcherSchemeFilter> scheme_filter,scoped_ptr<URLMatcherPortFilter> port_filter)598 URLMatcherConditionSet::URLMatcherConditionSet(
599 ID id,
600 const Conditions& conditions,
601 scoped_ptr<URLMatcherSchemeFilter> scheme_filter,
602 scoped_ptr<URLMatcherPortFilter> port_filter)
603 : id_(id),
604 conditions_(conditions),
605 scheme_filter_(scheme_filter.Pass()),
606 port_filter_(port_filter.Pass()) {}
607
IsMatch(const std::set<StringPattern::ID> & matching_patterns,const GURL & url) const608 bool URLMatcherConditionSet::IsMatch(
609 const std::set<StringPattern::ID>& matching_patterns,
610 const GURL& url) const {
611 for (Conditions::const_iterator i = conditions_.begin();
612 i != conditions_.end(); ++i) {
613 if (!i->IsMatch(matching_patterns, url))
614 return false;
615 }
616 if (scheme_filter_.get() && !scheme_filter_->IsMatch(url))
617 return false;
618 if (port_filter_.get() && !port_filter_->IsMatch(url))
619 return false;
620 return true;
621 }
622
623 //
624 // URLMatcher
625 //
626
URLMatcher()627 URLMatcher::URLMatcher() {}
628
~URLMatcher()629 URLMatcher::~URLMatcher() {}
630
AddConditionSets(const URLMatcherConditionSet::Vector & condition_sets)631 void URLMatcher::AddConditionSets(
632 const URLMatcherConditionSet::Vector& condition_sets) {
633 for (URLMatcherConditionSet::Vector::const_iterator i =
634 condition_sets.begin(); i != condition_sets.end(); ++i) {
635 DCHECK(url_matcher_condition_sets_.find((*i)->id()) ==
636 url_matcher_condition_sets_.end());
637 url_matcher_condition_sets_[(*i)->id()] = *i;
638 }
639 UpdateInternalDatastructures();
640 }
641
RemoveConditionSets(const std::vector<URLMatcherConditionSet::ID> & condition_set_ids)642 void URLMatcher::RemoveConditionSets(
643 const std::vector<URLMatcherConditionSet::ID>& condition_set_ids) {
644 for (std::vector<URLMatcherConditionSet::ID>::const_iterator i =
645 condition_set_ids.begin(); i != condition_set_ids.end(); ++i) {
646 DCHECK(url_matcher_condition_sets_.find(*i) !=
647 url_matcher_condition_sets_.end());
648 url_matcher_condition_sets_.erase(*i);
649 }
650 UpdateInternalDatastructures();
651 }
652
ClearUnusedConditionSets()653 void URLMatcher::ClearUnusedConditionSets() {
654 UpdateConditionFactory();
655 }
656
MatchURL(const GURL & url) const657 std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL(
658 const GURL& url) const {
659 // Find all IDs of StringPatterns that match |url|.
660 // See URLMatcherConditionFactory for the canonicalization of URLs and the
661 // distinction between full url searches and url component searches.
662 std::set<StringPattern::ID> matches;
663 if (!full_url_matcher_.IsEmpty()) {
664 full_url_matcher_.Match(
665 condition_factory_.CanonicalizeURLForFullSearches(url), &matches);
666 }
667 if (!url_component_matcher_.IsEmpty()) {
668 url_component_matcher_.Match(
669 condition_factory_.CanonicalizeURLForComponentSearches(url), &matches);
670 }
671 if (!regex_set_matcher_.IsEmpty()) {
672 regex_set_matcher_.Match(
673 condition_factory_.CanonicalizeURLForRegexSearches(url), &matches);
674 }
675 if (!origin_and_path_regex_set_matcher_.IsEmpty()) {
676 origin_and_path_regex_set_matcher_.Match(
677 condition_factory_.CanonicalizeURLForOriginAndPathRegexSearches(url),
678 &matches);
679 }
680
681 // Calculate all URLMatcherConditionSets for which all URLMatcherConditions
682 // were fulfilled.
683 std::set<URLMatcherConditionSet::ID> result;
684 for (std::set<StringPattern::ID>::const_iterator i = matches.begin();
685 i != matches.end(); ++i) {
686 // For each URLMatcherConditionSet there is exactly one condition
687 // registered in substring_match_triggers_. This means that the following
688 // logic tests each URLMatcherConditionSet exactly once if it can be
689 // completely fulfilled.
690 StringPatternTriggers::const_iterator triggered_condition_sets_iter =
691 substring_match_triggers_.find(*i);
692 if (triggered_condition_sets_iter == substring_match_triggers_.end())
693 continue; // Not all substring matches are triggers for a condition set.
694 const std::set<URLMatcherConditionSet::ID>& condition_sets =
695 triggered_condition_sets_iter->second;
696 for (std::set<URLMatcherConditionSet::ID>::const_iterator j =
697 condition_sets.begin(); j != condition_sets.end(); ++j) {
698 URLMatcherConditionSets::const_iterator condition_set_iter =
699 url_matcher_condition_sets_.find(*j);
700 DCHECK(condition_set_iter != url_matcher_condition_sets_.end());
701 if (condition_set_iter->second->IsMatch(matches, url))
702 result.insert(*j);
703 }
704 }
705
706 return result;
707 }
708
IsEmpty() const709 bool URLMatcher::IsEmpty() const {
710 return condition_factory_.IsEmpty() &&
711 url_matcher_condition_sets_.empty() &&
712 substring_match_triggers_.empty() &&
713 full_url_matcher_.IsEmpty() &&
714 url_component_matcher_.IsEmpty() &&
715 regex_set_matcher_.IsEmpty() &&
716 origin_and_path_regex_set_matcher_.IsEmpty() &&
717 registered_full_url_patterns_.empty() &&
718 registered_url_component_patterns_.empty();
719 }
720
UpdateSubstringSetMatcher(bool full_url_conditions)721 void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) {
722 // The purpose of |full_url_conditions| is just that we need to execute
723 // the same logic once for Full URL searches and once for URL Component
724 // searches (see URLMatcherConditionFactory).
725
726 // Determine which patterns need to be registered when this function
727 // terminates.
728 std::set<const StringPattern*> new_patterns;
729 for (URLMatcherConditionSets::const_iterator condition_set_iter =
730 url_matcher_condition_sets_.begin();
731 condition_set_iter != url_matcher_condition_sets_.end();
732 ++condition_set_iter) {
733 const URLMatcherConditionSet::Conditions& conditions =
734 condition_set_iter->second->conditions();
735 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
736 conditions.begin(); condition_iter != conditions.end();
737 ++condition_iter) {
738 // If we are called to process Full URL searches, ignore others, and
739 // vice versa. (Regex conditions are updated in UpdateRegexSetMatcher.)
740 if (!condition_iter->IsRegexCondition() &&
741 !condition_iter->IsOriginAndPathRegexCondition() &&
742 full_url_conditions == condition_iter->IsFullURLCondition())
743 new_patterns.insert(condition_iter->string_pattern());
744 }
745 }
746
747 // This is the set of patterns that were registered before this function
748 // is called.
749 std::set<const StringPattern*>& registered_patterns =
750 full_url_conditions ? registered_full_url_patterns_
751 : registered_url_component_patterns_;
752
753 // Add all patterns that are in new_patterns but not in registered_patterns.
754 std::vector<const StringPattern*> patterns_to_register =
755 base::STLSetDifference<std::vector<const StringPattern*> >(
756 new_patterns, registered_patterns);
757
758 // Remove all patterns that are in registered_patterns but not in
759 // new_patterns.
760 std::vector<const StringPattern*> patterns_to_unregister =
761 base::STLSetDifference<std::vector<const StringPattern*> >(
762 registered_patterns, new_patterns);
763
764 // Update the SubstringSetMatcher.
765 SubstringSetMatcher& url_matcher =
766 full_url_conditions ? full_url_matcher_ : url_component_matcher_;
767 url_matcher.RegisterAndUnregisterPatterns(patterns_to_register,
768 patterns_to_unregister);
769
770 // Update the set of registered_patterns for the next time this function
771 // is being called.
772 registered_patterns.swap(new_patterns);
773 }
774
UpdateRegexSetMatcher()775 void URLMatcher::UpdateRegexSetMatcher() {
776 std::vector<const StringPattern*> new_patterns;
777 std::vector<const StringPattern*> new_origin_and_path_patterns;
778
779 for (URLMatcherConditionSets::const_iterator condition_set_iter =
780 url_matcher_condition_sets_.begin();
781 condition_set_iter != url_matcher_condition_sets_.end();
782 ++condition_set_iter) {
783 const URLMatcherConditionSet::Conditions& conditions =
784 condition_set_iter->second->conditions();
785 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
786 conditions.begin(); condition_iter != conditions.end();
787 ++condition_iter) {
788 if (condition_iter->IsRegexCondition()) {
789 new_patterns.push_back(condition_iter->string_pattern());
790 } else if (condition_iter->IsOriginAndPathRegexCondition()) {
791 new_origin_and_path_patterns.push_back(
792 condition_iter->string_pattern());
793 }
794 }
795 }
796
797 // Start over from scratch. We can't really do better than this, since the
798 // FilteredRE2 backend doesn't support incremental updates.
799 regex_set_matcher_.ClearPatterns();
800 regex_set_matcher_.AddPatterns(new_patterns);
801 origin_and_path_regex_set_matcher_.ClearPatterns();
802 origin_and_path_regex_set_matcher_.AddPatterns(new_origin_and_path_patterns);
803 }
804
UpdateTriggers()805 void URLMatcher::UpdateTriggers() {
806 // Count substring pattern frequencies.
807 std::map<StringPattern::ID, size_t> substring_pattern_frequencies;
808 for (URLMatcherConditionSets::const_iterator condition_set_iter =
809 url_matcher_condition_sets_.begin();
810 condition_set_iter != url_matcher_condition_sets_.end();
811 ++condition_set_iter) {
812 const URLMatcherConditionSet::Conditions& conditions =
813 condition_set_iter->second->conditions();
814 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
815 conditions.begin(); condition_iter != conditions.end();
816 ++condition_iter) {
817 const StringPattern* pattern = condition_iter->string_pattern();
818 substring_pattern_frequencies[pattern->id()]++;
819 }
820 }
821
822 // Update trigger conditions: Determine for each URLMatcherConditionSet which
823 // URLMatcherCondition contains a StringPattern that occurs least
824 // frequently in this URLMatcher. We assume that this condition is very
825 // specific and occurs rarely in URLs. If a match occurs for this
826 // URLMatcherCondition, we want to test all other URLMatcherCondition in the
827 // respective URLMatcherConditionSet as well to see whether the entire
828 // URLMatcherConditionSet is considered matching.
829 substring_match_triggers_.clear();
830 for (URLMatcherConditionSets::const_iterator condition_set_iter =
831 url_matcher_condition_sets_.begin();
832 condition_set_iter != url_matcher_condition_sets_.end();
833 ++condition_set_iter) {
834 const URLMatcherConditionSet::Conditions& conditions =
835 condition_set_iter->second->conditions();
836 if (conditions.empty())
837 continue;
838 URLMatcherConditionSet::Conditions::const_iterator condition_iter =
839 conditions.begin();
840 StringPattern::ID trigger = condition_iter->string_pattern()->id();
841 // We skip the first element in the following loop.
842 ++condition_iter;
843 for (; condition_iter != conditions.end(); ++condition_iter) {
844 StringPattern::ID current_id =
845 condition_iter->string_pattern()->id();
846 if (substring_pattern_frequencies[trigger] >
847 substring_pattern_frequencies[current_id]) {
848 trigger = current_id;
849 }
850 }
851 substring_match_triggers_[trigger].insert(condition_set_iter->second->id());
852 }
853 }
854
UpdateConditionFactory()855 void URLMatcher::UpdateConditionFactory() {
856 std::set<StringPattern::ID> used_patterns;
857 for (URLMatcherConditionSets::const_iterator condition_set_iter =
858 url_matcher_condition_sets_.begin();
859 condition_set_iter != url_matcher_condition_sets_.end();
860 ++condition_set_iter) {
861 const URLMatcherConditionSet::Conditions& conditions =
862 condition_set_iter->second->conditions();
863 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
864 conditions.begin(); condition_iter != conditions.end();
865 ++condition_iter) {
866 used_patterns.insert(condition_iter->string_pattern()->id());
867 }
868 }
869 condition_factory_.ForgetUnusedPatterns(used_patterns);
870 }
871
UpdateInternalDatastructures()872 void URLMatcher::UpdateInternalDatastructures() {
873 UpdateSubstringSetMatcher(false);
874 UpdateSubstringSetMatcher(true);
875 UpdateRegexSetMatcher();
876 UpdateTriggers();
877 UpdateConditionFactory();
878 }
879
880 } // namespace url_matcher
881