• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "components/url_matcher/url_matcher.h"
6 
7 #include <algorithm>
8 #include <iterator>
9 
10 #include "base/logging.h"
11 #include "url/gurl.h"
12 #include "url/url_canon.h"
13 
14 namespace url_matcher {
15 
16 // This set of classes implement a mapping of URL Component Patterns, such as
17 // host_prefix, host_suffix, host_equals, ..., etc., to StringPatterns
18 // for use in substring comparisons.
19 //
20 // The idea of this mapping is to reduce the problem of comparing many
21 // URL Component Patterns against one URL to the problem of searching many
22 // substrings in one string:
23 //
24 // ----------------------                    -----------------
25 // | URL Query operator | ----translate----> | StringPattern |
26 // ----------------------                    -----------------
27 //                                                   ^
28 //                                                   |
29 //                                                compare
30 //                                                   |
31 //                                                   v
32 // ----------------------                    -----------------
33 // | URL to compare     |                    |               |
34 // | to all URL Query   | ----translate----> | String        |
35 // | operators          |                    |               |
36 // ----------------------                    -----------------
37 //
38 // The reason for this problem reduction is that there are efficient algorithms
39 // for searching many substrings in one string (see Aho-Corasick algorithm).
40 //
41 // Additionally, some of the same pieces are reused to implement regular
42 // expression comparisons. The FilteredRE2 implementation for matching many
43 // regular expressions against one string uses prefiltering, in which a set
44 // of substrings (derived from the regexes) are first searched for, to reduce
45 // the number of regular expressions to test; the prefiltering step also
46 // uses Aho-Corasick.
47 //
48 // Case 1: {host,path,query}_{prefix,suffix,equals} searches.
49 // ==========================================================
50 //
51 // For searches in this class, we normalize URLs as follows:
52 //
53 // Step 1:
54 // Remove scheme, port and segment from URL:
55 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes
56 //    www.example.com/index.html?search=foo
57 //
58 // We remove the scheme and port number because they can be checked later
59 // in a secondary filter step. We remove the segment (the #... part) because
60 // this is not guaranteed to be ASCII-7 encoded.
61 //
62 // Step 2:
63 // Translate URL to String and add the following position markers:
64 // - BU = Beginning of URL
65 // - ED = End of Domain
66 // - EP = End of Path
67 // - EU = End of URL
68 // Furthermore, the hostname is canonicalized to start with a ".".
69 //
70 // Position markers are represented as characters >127, which are therefore
71 // guaranteed not to be part of the ASCII-7 encoded URL character set.
72 //
73 // -> www.example.com/index.html?search=foo becomes
74 // BU .www.example.com ED /index.html EP ?search=foo EU
75 //
76 // -> www.example.com/index.html becomes
77 // BU .www.example.com ED /index.html EP EU
78 //
79 // Step 3:
80 // Translate URL Component Patterns as follows:
81 //
82 // host_prefix(prefix) = BU add_missing_dot_prefix(prefix)
83 // -> host_prefix("www.example") = BU .www.example
84 //
85 // host_suffix(suffix) = suffix ED
86 // -> host_suffix("example.com") = example.com ED
87 // -> host_suffix(".example.com") = .example.com ED
88 //
89 // host_equals(domain) = BU add_missing_dot_prefix(domain) ED
90 // -> host_equals("www.example.com") = BU .www.example.com ED
91 //
92 // Similarly for path query parameters ({path, query}_{prefix, suffix, equals}).
93 //
94 // With this, we can search the StringPatterns in the normalized URL.
95 //
96 //
97 // Case 2: url_{prefix,suffix,equals,contains} searches.
98 // =====================================================
99 //
100 // Step 1: as above, except that
101 // - the scheme is not removed
102 // - the port is not removed if it is specified and does not match the default
103 //   port for the given scheme.
104 //
105 // Step 2:
106 // Translate URL to String and add the following position markers:
107 // - BU = Beginning of URL
108 // - EU = End of URL
109 //
110 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes
111 // BU http://www.example.com:8080/index.html?search=foo EU
112 // -> http://www.example.com:80/index.html?search=foo#first_match becomes
113 // BU http://www.example.com/index.html?search=foo EU
114 //
115 // url_prefix(prefix) = BU prefix
116 // -> url_prefix("http://www.example") = BU http://www.example
117 //
118 // url_contains(substring) = substring
119 // -> url_contains("index") = index
120 //
121 //
122 // Case 3: {host,path,query}_contains searches.
123 // ============================================
124 //
125 // These kinds of searches are not supported directly but can be derived
126 // by a combination of a url_contains() query followed by an explicit test:
127 //
128 // host_contains(str) = url_contains(str) followed by test whether str occurs
129 //   in host component of original URL.
130 // -> host_contains("example.co") = example.co
131 //    followed by gurl.host().find("example.co");
132 //
133 // [similarly for path_contains and query_contains].
134 //
135 //
136 // Regular expression matching (url_matches searches)
137 // ==================================================
138 //
139 // This class also supports matching regular expressions (RE2 syntax)
140 // against full URLs, which are transformed as in case 2.
141 
142 namespace {
143 
IsRegexCriterion(URLMatcherCondition::Criterion criterion)144 bool IsRegexCriterion(URLMatcherCondition::Criterion criterion) {
145   return criterion == URLMatcherCondition::URL_MATCHES;
146 }
147 
IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion)148 bool IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion) {
149   return criterion == URLMatcherCondition::ORIGIN_AND_PATH_MATCHES;
150 }
151 
152 }  // namespace
153 
154 //
155 // URLMatcherCondition
156 //
157 
URLMatcherCondition()158 URLMatcherCondition::URLMatcherCondition()
159     : criterion_(HOST_PREFIX),
160       string_pattern_(NULL) {}
161 
~URLMatcherCondition()162 URLMatcherCondition::~URLMatcherCondition() {}
163 
URLMatcherCondition(Criterion criterion,const StringPattern * string_pattern)164 URLMatcherCondition::URLMatcherCondition(
165     Criterion criterion,
166     const StringPattern* string_pattern)
167     : criterion_(criterion),
168       string_pattern_(string_pattern) {}
169 
URLMatcherCondition(const URLMatcherCondition & rhs)170 URLMatcherCondition::URLMatcherCondition(const URLMatcherCondition& rhs)
171     : criterion_(rhs.criterion_),
172       string_pattern_(rhs.string_pattern_) {}
173 
operator =(const URLMatcherCondition & rhs)174 URLMatcherCondition& URLMatcherCondition::operator=(
175     const URLMatcherCondition& rhs) {
176   criterion_ = rhs.criterion_;
177   string_pattern_ = rhs.string_pattern_;
178   return *this;
179 }
180 
operator <(const URLMatcherCondition & rhs) const181 bool URLMatcherCondition::operator<(const URLMatcherCondition& rhs) const {
182   if (criterion_ < rhs.criterion_) return true;
183   if (criterion_ > rhs.criterion_) return false;
184   if (string_pattern_ != NULL && rhs.string_pattern_ != NULL)
185     return *string_pattern_ < *rhs.string_pattern_;
186   if (string_pattern_ == NULL && rhs.string_pattern_ != NULL) return true;
187   // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL,
188   // or both are NULL.
189   return false;
190 }
191 
IsFullURLCondition() const192 bool URLMatcherCondition::IsFullURLCondition() const {
193   // For these criteria the SubstringMatcher needs to be executed on the
194   // GURL that is canonicalized with
195   // URLMatcherConditionFactory::CanonicalizeURLForFullSearches.
196   switch (criterion_) {
197     case HOST_CONTAINS:
198     case PATH_CONTAINS:
199     case QUERY_CONTAINS:
200     case URL_PREFIX:
201     case URL_SUFFIX:
202     case URL_CONTAINS:
203     case URL_EQUALS:
204       return true;
205     default:
206       break;
207   }
208   return false;
209 }
210 
IsRegexCondition() const211 bool URLMatcherCondition::IsRegexCondition() const {
212   return IsRegexCriterion(criterion_);
213 }
214 
IsOriginAndPathRegexCondition() const215 bool URLMatcherCondition::IsOriginAndPathRegexCondition() const {
216   return IsOriginAndPathRegexCriterion(criterion_);
217 }
218 
IsMatch(const std::set<StringPattern::ID> & matching_patterns,const GURL & url) const219 bool URLMatcherCondition::IsMatch(
220     const std::set<StringPattern::ID>& matching_patterns,
221     const GURL& url) const {
222   DCHECK(string_pattern_);
223   if (!ContainsKey(matching_patterns, string_pattern_->id()))
224     return false;
225   // The criteria HOST_CONTAINS, PATH_CONTAINS, QUERY_CONTAINS are based on
226   // a substring match on the raw URL. In case of a match, we need to verify
227   // that the match was found in the correct component of the URL.
228   switch (criterion_) {
229     case HOST_CONTAINS:
230       return url.host().find(string_pattern_->pattern()) !=
231           std::string::npos;
232     case PATH_CONTAINS:
233       return url.path().find(string_pattern_->pattern()) !=
234           std::string::npos;
235     case QUERY_CONTAINS:
236       return url.query().find(string_pattern_->pattern()) !=
237           std::string::npos;
238     default:
239       break;
240   }
241   return true;
242 }
243 
244 //
245 // URLMatcherConditionFactory
246 //
247 
248 namespace {
249 // These are symbols that are not contained in 7-bit ASCII used in GURLs.
250 const char kBeginningOfURL[] = {static_cast<char>(-1), 0};
251 const char kEndOfDomain[] = {static_cast<char>(-2), 0};
252 const char kEndOfPath[] = {static_cast<char>(-3), 0};
253 const char kEndOfURL[] = {static_cast<char>(-4), 0};
254 }  // namespace
255 
URLMatcherConditionFactory()256 URLMatcherConditionFactory::URLMatcherConditionFactory() : id_counter_(0) {}
257 
~URLMatcherConditionFactory()258 URLMatcherConditionFactory::~URLMatcherConditionFactory() {
259   STLDeleteElements(&substring_pattern_singletons_);
260   STLDeleteElements(&regex_pattern_singletons_);
261   STLDeleteElements(&origin_and_path_regex_pattern_singletons_);
262 }
263 
CanonicalizeURLForComponentSearches(const GURL & url) const264 std::string URLMatcherConditionFactory::CanonicalizeURLForComponentSearches(
265     const GURL& url) const {
266   return kBeginningOfURL + CanonicalizeHostname(url.host()) + kEndOfDomain +
267          url.path() + kEndOfPath +
268          (url.has_query() ? "?" + url.query() : std::string()) + kEndOfURL;
269 }
270 
CreateHostPrefixCondition(const std::string & prefix)271 URLMatcherCondition URLMatcherConditionFactory::CreateHostPrefixCondition(
272     const std::string& prefix) {
273   return CreateCondition(URLMatcherCondition::HOST_PREFIX,
274       kBeginningOfURL + CanonicalizeHostname(prefix));
275 }
276 
CreateHostSuffixCondition(const std::string & suffix)277 URLMatcherCondition URLMatcherConditionFactory::CreateHostSuffixCondition(
278     const std::string& suffix) {
279   return CreateCondition(URLMatcherCondition::HOST_SUFFIX,
280       suffix + kEndOfDomain);
281 }
282 
CreateHostContainsCondition(const std::string & str)283 URLMatcherCondition URLMatcherConditionFactory::CreateHostContainsCondition(
284     const std::string& str) {
285   return CreateCondition(URLMatcherCondition::HOST_CONTAINS, str);
286 }
287 
CreateHostEqualsCondition(const std::string & str)288 URLMatcherCondition URLMatcherConditionFactory::CreateHostEqualsCondition(
289     const std::string& str) {
290   return CreateCondition(URLMatcherCondition::HOST_EQUALS,
291       kBeginningOfURL + CanonicalizeHostname(str) + kEndOfDomain);
292 }
293 
CreatePathPrefixCondition(const std::string & prefix)294 URLMatcherCondition URLMatcherConditionFactory::CreatePathPrefixCondition(
295     const std::string& prefix) {
296   return CreateCondition(URLMatcherCondition::PATH_PREFIX,
297       kEndOfDomain + prefix);
298 }
299 
CreatePathSuffixCondition(const std::string & suffix)300 URLMatcherCondition URLMatcherConditionFactory::CreatePathSuffixCondition(
301     const std::string& suffix) {
302   return CreateCondition(URLMatcherCondition::PATH_SUFFIX, suffix + kEndOfPath);
303 }
304 
CreatePathContainsCondition(const std::string & str)305 URLMatcherCondition URLMatcherConditionFactory::CreatePathContainsCondition(
306     const std::string& str) {
307   return CreateCondition(URLMatcherCondition::PATH_CONTAINS, str);
308 }
309 
CreatePathEqualsCondition(const std::string & str)310 URLMatcherCondition URLMatcherConditionFactory::CreatePathEqualsCondition(
311     const std::string& str) {
312   return CreateCondition(URLMatcherCondition::PATH_EQUALS,
313       kEndOfDomain + str + kEndOfPath);
314 }
315 
CreateQueryPrefixCondition(const std::string & prefix)316 URLMatcherCondition URLMatcherConditionFactory::CreateQueryPrefixCondition(
317     const std::string& prefix) {
318   std::string pattern;
319   if (!prefix.empty() && prefix[0] == '?')
320     pattern = kEndOfPath + prefix;
321   else
322     pattern = kEndOfPath + ('?' + prefix);
323 
324   return CreateCondition(URLMatcherCondition::QUERY_PREFIX, pattern);
325 }
326 
CreateQuerySuffixCondition(const std::string & suffix)327 URLMatcherCondition URLMatcherConditionFactory::CreateQuerySuffixCondition(
328     const std::string& suffix) {
329   if (!suffix.empty() && suffix[0] == '?') {
330     return CreateQueryEqualsCondition(suffix);
331   } else {
332     return CreateCondition(URLMatcherCondition::QUERY_SUFFIX,
333                            suffix + kEndOfURL);
334   }
335 }
336 
CreateQueryContainsCondition(const std::string & str)337 URLMatcherCondition URLMatcherConditionFactory::CreateQueryContainsCondition(
338     const std::string& str) {
339   if (!str.empty() && str[0] == '?')
340     return CreateQueryPrefixCondition(str);
341   else
342     return CreateCondition(URLMatcherCondition::QUERY_CONTAINS, str);
343 }
344 
CreateQueryEqualsCondition(const std::string & str)345 URLMatcherCondition URLMatcherConditionFactory::CreateQueryEqualsCondition(
346     const std::string& str) {
347   std::string pattern;
348   if (!str.empty() && str[0] == '?')
349     pattern = kEndOfPath + str + kEndOfURL;
350   else
351     pattern = kEndOfPath + ('?' + str) + kEndOfURL;
352 
353   return CreateCondition(URLMatcherCondition::QUERY_EQUALS, pattern);
354 }
355 
356 URLMatcherCondition
CreateHostSuffixPathPrefixCondition(const std::string & host_suffix,const std::string & path_prefix)357     URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition(
358     const std::string& host_suffix,
359     const std::string& path_prefix) {
360   return CreateCondition(URLMatcherCondition::HOST_SUFFIX_PATH_PREFIX,
361       host_suffix + kEndOfDomain + path_prefix);
362 }
363 
364 URLMatcherCondition
CreateHostEqualsPathPrefixCondition(const std::string & host,const std::string & path_prefix)365 URLMatcherConditionFactory::CreateHostEqualsPathPrefixCondition(
366     const std::string& host,
367     const std::string& path_prefix) {
368   return CreateCondition(URLMatcherCondition::HOST_EQUALS_PATH_PREFIX,
369       kBeginningOfURL + CanonicalizeHostname(host) + kEndOfDomain +
370       path_prefix);
371 }
372 
CanonicalizeURLForFullSearches(const GURL & url) const373 std::string URLMatcherConditionFactory::CanonicalizeURLForFullSearches(
374     const GURL& url) const {
375   GURL::Replacements replacements;
376   replacements.ClearPassword();
377   replacements.ClearUsername();
378   replacements.ClearRef();
379   // Clear port if it is implicit from scheme.
380   if (url.has_port()) {
381     const std::string& port = url.scheme();
382     if (url_canon::DefaultPortForScheme(port.c_str(), port.size()) ==
383             url.EffectiveIntPort()) {
384       replacements.ClearPort();
385     }
386   }
387   return kBeginningOfURL + url.ReplaceComponents(replacements).spec() +
388       kEndOfURL;
389 }
390 
CanonicalizeURLForRegexSearchesHelper(const GURL & url,bool clear_query)391 static std::string CanonicalizeURLForRegexSearchesHelper(
392     const GURL& url,
393     bool clear_query) {
394   GURL::Replacements replacements;
395   replacements.ClearPassword();
396   replacements.ClearUsername();
397   replacements.ClearRef();
398   if (clear_query)
399     replacements.ClearQuery();
400   // Clear port if it is implicit from scheme.
401   if (url.has_port()) {
402     const std::string& port = url.scheme();
403     if (url_canon::DefaultPortForScheme(port.c_str(), port.size()) ==
404             url.EffectiveIntPort()) {
405       replacements.ClearPort();
406     }
407   }
408   return url.ReplaceComponents(replacements).spec();
409 }
410 
CanonicalizeURLForRegexSearches(const GURL & url) const411 std::string URLMatcherConditionFactory::CanonicalizeURLForRegexSearches(
412     const GURL& url) const {
413   return CanonicalizeURLForRegexSearchesHelper(url, false);
414 }
415 
416 std::string
CanonicalizeURLForOriginAndPathRegexSearches(const GURL & url) const417 URLMatcherConditionFactory::CanonicalizeURLForOriginAndPathRegexSearches(
418     const GURL& url) const {
419   return CanonicalizeURLForRegexSearchesHelper(url, true);
420 }
421 
CreateURLPrefixCondition(const std::string & prefix)422 URLMatcherCondition URLMatcherConditionFactory::CreateURLPrefixCondition(
423     const std::string& prefix) {
424   return CreateCondition(URLMatcherCondition::URL_PREFIX,
425       kBeginningOfURL + prefix);
426 }
427 
CreateURLSuffixCondition(const std::string & suffix)428 URLMatcherCondition URLMatcherConditionFactory::CreateURLSuffixCondition(
429     const std::string& suffix) {
430   return CreateCondition(URLMatcherCondition::URL_SUFFIX, suffix + kEndOfURL);
431 }
432 
CreateURLContainsCondition(const std::string & str)433 URLMatcherCondition URLMatcherConditionFactory::CreateURLContainsCondition(
434     const std::string& str) {
435   return CreateCondition(URLMatcherCondition::URL_CONTAINS, str);
436 }
437 
CreateURLEqualsCondition(const std::string & str)438 URLMatcherCondition URLMatcherConditionFactory::CreateURLEqualsCondition(
439     const std::string& str) {
440   return CreateCondition(URLMatcherCondition::URL_EQUALS,
441       kBeginningOfURL + str + kEndOfURL);
442 }
443 
CreateURLMatchesCondition(const std::string & regex)444 URLMatcherCondition URLMatcherConditionFactory::CreateURLMatchesCondition(
445     const std::string& regex) {
446   return CreateCondition(URLMatcherCondition::URL_MATCHES, regex);
447 }
448 
449 URLMatcherCondition
CreateOriginAndPathMatchesCondition(const std::string & regex)450 URLMatcherConditionFactory::CreateOriginAndPathMatchesCondition(
451     const std::string& regex) {
452   return CreateCondition(URLMatcherCondition::ORIGIN_AND_PATH_MATCHES, regex);
453 }
454 
ForgetUnusedPatterns(const std::set<StringPattern::ID> & used_patterns)455 void URLMatcherConditionFactory::ForgetUnusedPatterns(
456       const std::set<StringPattern::ID>& used_patterns) {
457   PatternSingletons::iterator i = substring_pattern_singletons_.begin();
458   while (i != substring_pattern_singletons_.end()) {
459     if (ContainsKey(used_patterns, (*i)->id())) {
460       ++i;
461     } else {
462       delete *i;
463       substring_pattern_singletons_.erase(i++);
464     }
465   }
466   i = regex_pattern_singletons_.begin();
467   while (i != regex_pattern_singletons_.end()) {
468     if (ContainsKey(used_patterns, (*i)->id())) {
469       ++i;
470     } else {
471       delete *i;
472       regex_pattern_singletons_.erase(i++);
473     }
474   }
475   i = origin_and_path_regex_pattern_singletons_.begin();
476   while (i != origin_and_path_regex_pattern_singletons_.end()) {
477     if (ContainsKey(used_patterns, (*i)->id())) {
478       ++i;
479     } else {
480       delete *i;
481       origin_and_path_regex_pattern_singletons_.erase(i++);
482     }
483   }
484 }
485 
IsEmpty() const486 bool URLMatcherConditionFactory::IsEmpty() const {
487   return substring_pattern_singletons_.empty() &&
488       regex_pattern_singletons_.empty() &&
489       origin_and_path_regex_pattern_singletons_.empty();
490 }
491 
CreateCondition(URLMatcherCondition::Criterion criterion,const std::string & pattern)492 URLMatcherCondition URLMatcherConditionFactory::CreateCondition(
493     URLMatcherCondition::Criterion criterion,
494     const std::string& pattern) {
495   StringPattern search_pattern(pattern, 0);
496   PatternSingletons* pattern_singletons = NULL;
497   if (IsRegexCriterion(criterion))
498     pattern_singletons = &regex_pattern_singletons_;
499   else if (IsOriginAndPathRegexCriterion(criterion))
500     pattern_singletons = &origin_and_path_regex_pattern_singletons_;
501   else
502     pattern_singletons = &substring_pattern_singletons_;
503 
504   PatternSingletons::const_iterator iter =
505       pattern_singletons->find(&search_pattern);
506 
507   if (iter != pattern_singletons->end()) {
508     return URLMatcherCondition(criterion, *iter);
509   } else {
510     StringPattern* new_pattern =
511         new StringPattern(pattern, id_counter_++);
512     pattern_singletons->insert(new_pattern);
513     return URLMatcherCondition(criterion, new_pattern);
514   }
515 }
516 
CanonicalizeHostname(const std::string & hostname) const517 std::string URLMatcherConditionFactory::CanonicalizeHostname(
518     const std::string& hostname) const {
519   if (!hostname.empty() && hostname[0] == '.')
520     return hostname;
521   else
522     return "." + hostname;
523 }
524 
operator ()(StringPattern * lhs,StringPattern * rhs) const525 bool URLMatcherConditionFactory::StringPatternPointerCompare::operator()(
526     StringPattern* lhs,
527     StringPattern* rhs) const {
528   if (lhs == NULL && rhs != NULL) return true;
529   if (lhs != NULL && rhs != NULL)
530     return lhs->pattern() < rhs->pattern();
531   // Either both are NULL or only rhs is NULL.
532   return false;
533 }
534 
535 //
536 // URLMatcherSchemeFilter
537 //
538 
URLMatcherSchemeFilter(const std::string & filter)539 URLMatcherSchemeFilter::URLMatcherSchemeFilter(const std::string& filter)
540     : filters_(1) {
541   filters_.push_back(filter);
542 }
543 
URLMatcherSchemeFilter(const std::vector<std::string> & filters)544 URLMatcherSchemeFilter::URLMatcherSchemeFilter(
545     const std::vector<std::string>& filters)
546     : filters_(filters) {}
547 
~URLMatcherSchemeFilter()548 URLMatcherSchemeFilter::~URLMatcherSchemeFilter() {}
549 
IsMatch(const GURL & url) const550 bool URLMatcherSchemeFilter::IsMatch(const GURL& url) const {
551   return std::find(filters_.begin(), filters_.end(), url.scheme()) !=
552       filters_.end();
553 }
554 
555 //
556 // URLMatcherPortFilter
557 //
558 
URLMatcherPortFilter(const std::vector<URLMatcherPortFilter::Range> & ranges)559 URLMatcherPortFilter::URLMatcherPortFilter(
560     const std::vector<URLMatcherPortFilter::Range>& ranges)
561     : ranges_(ranges) {}
562 
~URLMatcherPortFilter()563 URLMatcherPortFilter::~URLMatcherPortFilter() {}
564 
IsMatch(const GURL & url) const565 bool URLMatcherPortFilter::IsMatch(const GURL& url) const {
566   int port = url.EffectiveIntPort();
567   for (std::vector<Range>::const_iterator i = ranges_.begin();
568        i != ranges_.end(); ++i) {
569     if (i->first <= port && port <= i->second)
570       return true;
571   }
572   return false;
573 }
574 
575 // static
CreateRange(int from,int to)576 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int from,
577                                                               int to) {
578   return Range(from, to);
579 }
580 
581 // static
CreateRange(int port)582 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int port) {
583   return Range(port, port);
584 }
585 
586 //
587 // URLMatcherConditionSet
588 //
589 
~URLMatcherConditionSet()590 URLMatcherConditionSet::~URLMatcherConditionSet() {}
591 
URLMatcherConditionSet(ID id,const Conditions & conditions)592 URLMatcherConditionSet::URLMatcherConditionSet(
593     ID id,
594     const Conditions& conditions)
595     : id_(id),
596       conditions_(conditions) {}
597 
URLMatcherConditionSet(ID id,const Conditions & conditions,scoped_ptr<URLMatcherSchemeFilter> scheme_filter,scoped_ptr<URLMatcherPortFilter> port_filter)598 URLMatcherConditionSet::URLMatcherConditionSet(
599     ID id,
600     const Conditions& conditions,
601     scoped_ptr<URLMatcherSchemeFilter> scheme_filter,
602     scoped_ptr<URLMatcherPortFilter> port_filter)
603     : id_(id),
604       conditions_(conditions),
605       scheme_filter_(scheme_filter.Pass()),
606       port_filter_(port_filter.Pass()) {}
607 
IsMatch(const std::set<StringPattern::ID> & matching_patterns,const GURL & url) const608 bool URLMatcherConditionSet::IsMatch(
609     const std::set<StringPattern::ID>& matching_patterns,
610     const GURL& url) const {
611   for (Conditions::const_iterator i = conditions_.begin();
612        i != conditions_.end(); ++i) {
613     if (!i->IsMatch(matching_patterns, url))
614       return false;
615   }
616   if (scheme_filter_.get() && !scheme_filter_->IsMatch(url))
617     return false;
618   if (port_filter_.get() && !port_filter_->IsMatch(url))
619     return false;
620   return true;
621 }
622 
623 //
624 // URLMatcher
625 //
626 
URLMatcher()627 URLMatcher::URLMatcher() {}
628 
~URLMatcher()629 URLMatcher::~URLMatcher() {}
630 
AddConditionSets(const URLMatcherConditionSet::Vector & condition_sets)631 void URLMatcher::AddConditionSets(
632     const URLMatcherConditionSet::Vector& condition_sets) {
633   for (URLMatcherConditionSet::Vector::const_iterator i =
634        condition_sets.begin(); i != condition_sets.end(); ++i) {
635     DCHECK(url_matcher_condition_sets_.find((*i)->id()) ==
636         url_matcher_condition_sets_.end());
637     url_matcher_condition_sets_[(*i)->id()] = *i;
638   }
639   UpdateInternalDatastructures();
640 }
641 
RemoveConditionSets(const std::vector<URLMatcherConditionSet::ID> & condition_set_ids)642 void URLMatcher::RemoveConditionSets(
643     const std::vector<URLMatcherConditionSet::ID>& condition_set_ids) {
644   for (std::vector<URLMatcherConditionSet::ID>::const_iterator i =
645        condition_set_ids.begin(); i != condition_set_ids.end(); ++i) {
646     DCHECK(url_matcher_condition_sets_.find(*i) !=
647         url_matcher_condition_sets_.end());
648     url_matcher_condition_sets_.erase(*i);
649   }
650   UpdateInternalDatastructures();
651 }
652 
ClearUnusedConditionSets()653 void URLMatcher::ClearUnusedConditionSets() {
654   UpdateConditionFactory();
655 }
656 
MatchURL(const GURL & url) const657 std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL(
658     const GURL& url) const {
659   // Find all IDs of StringPatterns that match |url|.
660   // See URLMatcherConditionFactory for the canonicalization of URLs and the
661   // distinction between full url searches and url component searches.
662   std::set<StringPattern::ID> matches;
663   if (!full_url_matcher_.IsEmpty()) {
664     full_url_matcher_.Match(
665         condition_factory_.CanonicalizeURLForFullSearches(url), &matches);
666   }
667   if (!url_component_matcher_.IsEmpty()) {
668     url_component_matcher_.Match(
669         condition_factory_.CanonicalizeURLForComponentSearches(url), &matches);
670   }
671   if (!regex_set_matcher_.IsEmpty()) {
672     regex_set_matcher_.Match(
673         condition_factory_.CanonicalizeURLForRegexSearches(url), &matches);
674   }
675   if (!origin_and_path_regex_set_matcher_.IsEmpty()) {
676     origin_and_path_regex_set_matcher_.Match(
677         condition_factory_.CanonicalizeURLForOriginAndPathRegexSearches(url),
678         &matches);
679   }
680 
681   // Calculate all URLMatcherConditionSets for which all URLMatcherConditions
682   // were fulfilled.
683   std::set<URLMatcherConditionSet::ID> result;
684   for (std::set<StringPattern::ID>::const_iterator i = matches.begin();
685        i != matches.end(); ++i) {
686     // For each URLMatcherConditionSet there is exactly one condition
687     // registered in substring_match_triggers_. This means that the following
688     // logic tests each URLMatcherConditionSet exactly once if it can be
689     // completely fulfilled.
690     StringPatternTriggers::const_iterator triggered_condition_sets_iter =
691         substring_match_triggers_.find(*i);
692     if (triggered_condition_sets_iter == substring_match_triggers_.end())
693       continue;  // Not all substring matches are triggers for a condition set.
694     const std::set<URLMatcherConditionSet::ID>& condition_sets =
695         triggered_condition_sets_iter->second;
696     for (std::set<URLMatcherConditionSet::ID>::const_iterator j =
697          condition_sets.begin(); j != condition_sets.end(); ++j) {
698       URLMatcherConditionSets::const_iterator condition_set_iter =
699           url_matcher_condition_sets_.find(*j);
700       DCHECK(condition_set_iter != url_matcher_condition_sets_.end());
701       if (condition_set_iter->second->IsMatch(matches, url))
702         result.insert(*j);
703     }
704   }
705 
706   return result;
707 }
708 
IsEmpty() const709 bool URLMatcher::IsEmpty() const {
710   return condition_factory_.IsEmpty() &&
711       url_matcher_condition_sets_.empty() &&
712       substring_match_triggers_.empty() &&
713       full_url_matcher_.IsEmpty() &&
714       url_component_matcher_.IsEmpty() &&
715       regex_set_matcher_.IsEmpty() &&
716       origin_and_path_regex_set_matcher_.IsEmpty() &&
717       registered_full_url_patterns_.empty() &&
718       registered_url_component_patterns_.empty();
719 }
720 
UpdateSubstringSetMatcher(bool full_url_conditions)721 void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) {
722   // The purpose of |full_url_conditions| is just that we need to execute
723   // the same logic once for Full URL searches and once for URL Component
724   // searches (see URLMatcherConditionFactory).
725 
726   // Determine which patterns need to be registered when this function
727   // terminates.
728   std::set<const StringPattern*> new_patterns;
729   for (URLMatcherConditionSets::const_iterator condition_set_iter =
730       url_matcher_condition_sets_.begin();
731       condition_set_iter != url_matcher_condition_sets_.end();
732       ++condition_set_iter) {
733     const URLMatcherConditionSet::Conditions& conditions =
734         condition_set_iter->second->conditions();
735     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
736          conditions.begin(); condition_iter != conditions.end();
737          ++condition_iter) {
738       // If we are called to process Full URL searches, ignore others, and
739       // vice versa. (Regex conditions are updated in UpdateRegexSetMatcher.)
740       if (!condition_iter->IsRegexCondition() &&
741           !condition_iter->IsOriginAndPathRegexCondition() &&
742           full_url_conditions == condition_iter->IsFullURLCondition())
743         new_patterns.insert(condition_iter->string_pattern());
744     }
745   }
746 
747   // This is the set of patterns that were registered before this function
748   // is called.
749   std::set<const StringPattern*>& registered_patterns =
750       full_url_conditions ? registered_full_url_patterns_
751                           : registered_url_component_patterns_;
752 
753   // Add all patterns that are in new_patterns but not in registered_patterns.
754   std::vector<const StringPattern*> patterns_to_register =
755       base::STLSetDifference<std::vector<const StringPattern*> >(
756           new_patterns, registered_patterns);
757 
758   // Remove all patterns that are in registered_patterns but not in
759   // new_patterns.
760   std::vector<const StringPattern*> patterns_to_unregister =
761       base::STLSetDifference<std::vector<const StringPattern*> >(
762            registered_patterns, new_patterns);
763 
764   // Update the SubstringSetMatcher.
765   SubstringSetMatcher& url_matcher =
766       full_url_conditions ? full_url_matcher_ : url_component_matcher_;
767   url_matcher.RegisterAndUnregisterPatterns(patterns_to_register,
768                                             patterns_to_unregister);
769 
770   // Update the set of registered_patterns for the next time this function
771   // is being called.
772   registered_patterns.swap(new_patterns);
773 }
774 
UpdateRegexSetMatcher()775 void URLMatcher::UpdateRegexSetMatcher() {
776   std::vector<const StringPattern*> new_patterns;
777   std::vector<const StringPattern*> new_origin_and_path_patterns;
778 
779   for (URLMatcherConditionSets::const_iterator condition_set_iter =
780       url_matcher_condition_sets_.begin();
781       condition_set_iter != url_matcher_condition_sets_.end();
782       ++condition_set_iter) {
783     const URLMatcherConditionSet::Conditions& conditions =
784         condition_set_iter->second->conditions();
785     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
786          conditions.begin(); condition_iter != conditions.end();
787          ++condition_iter) {
788       if (condition_iter->IsRegexCondition()) {
789         new_patterns.push_back(condition_iter->string_pattern());
790       } else if (condition_iter->IsOriginAndPathRegexCondition()) {
791         new_origin_and_path_patterns.push_back(
792             condition_iter->string_pattern());
793       }
794     }
795   }
796 
797   // Start over from scratch. We can't really do better than this, since the
798   // FilteredRE2 backend doesn't support incremental updates.
799   regex_set_matcher_.ClearPatterns();
800   regex_set_matcher_.AddPatterns(new_patterns);
801   origin_and_path_regex_set_matcher_.ClearPatterns();
802   origin_and_path_regex_set_matcher_.AddPatterns(new_origin_and_path_patterns);
803 }
804 
UpdateTriggers()805 void URLMatcher::UpdateTriggers() {
806   // Count substring pattern frequencies.
807   std::map<StringPattern::ID, size_t> substring_pattern_frequencies;
808   for (URLMatcherConditionSets::const_iterator condition_set_iter =
809       url_matcher_condition_sets_.begin();
810       condition_set_iter != url_matcher_condition_sets_.end();
811       ++condition_set_iter) {
812     const URLMatcherConditionSet::Conditions& conditions =
813         condition_set_iter->second->conditions();
814     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
815          conditions.begin(); condition_iter != conditions.end();
816          ++condition_iter) {
817       const StringPattern* pattern = condition_iter->string_pattern();
818       substring_pattern_frequencies[pattern->id()]++;
819     }
820   }
821 
822   // Update trigger conditions: Determine for each URLMatcherConditionSet which
823   // URLMatcherCondition contains a StringPattern that occurs least
824   // frequently in this URLMatcher. We assume that this condition is very
825   // specific and occurs rarely in URLs. If a match occurs for this
826   // URLMatcherCondition, we want to test all other URLMatcherCondition in the
827   // respective URLMatcherConditionSet as well to see whether the entire
828   // URLMatcherConditionSet is considered matching.
829   substring_match_triggers_.clear();
830   for (URLMatcherConditionSets::const_iterator condition_set_iter =
831       url_matcher_condition_sets_.begin();
832       condition_set_iter != url_matcher_condition_sets_.end();
833       ++condition_set_iter) {
834     const URLMatcherConditionSet::Conditions& conditions =
835         condition_set_iter->second->conditions();
836     if (conditions.empty())
837       continue;
838     URLMatcherConditionSet::Conditions::const_iterator condition_iter =
839         conditions.begin();
840     StringPattern::ID trigger = condition_iter->string_pattern()->id();
841     // We skip the first element in the following loop.
842     ++condition_iter;
843     for (; condition_iter != conditions.end(); ++condition_iter) {
844       StringPattern::ID current_id =
845           condition_iter->string_pattern()->id();
846       if (substring_pattern_frequencies[trigger] >
847           substring_pattern_frequencies[current_id]) {
848         trigger = current_id;
849       }
850     }
851     substring_match_triggers_[trigger].insert(condition_set_iter->second->id());
852   }
853 }
854 
UpdateConditionFactory()855 void URLMatcher::UpdateConditionFactory() {
856   std::set<StringPattern::ID> used_patterns;
857   for (URLMatcherConditionSets::const_iterator condition_set_iter =
858       url_matcher_condition_sets_.begin();
859       condition_set_iter != url_matcher_condition_sets_.end();
860       ++condition_set_iter) {
861     const URLMatcherConditionSet::Conditions& conditions =
862         condition_set_iter->second->conditions();
863     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
864          conditions.begin(); condition_iter != conditions.end();
865          ++condition_iter) {
866       used_patterns.insert(condition_iter->string_pattern()->id());
867     }
868   }
869   condition_factory_.ForgetUnusedPatterns(used_patterns);
870 }
871 
UpdateInternalDatastructures()872 void URLMatcher::UpdateInternalDatastructures() {
873   UpdateSubstringSetMatcher(false);
874   UpdateSubstringSetMatcher(true);
875   UpdateRegexSetMatcher();
876   UpdateTriggers();
877   UpdateConditionFactory();
878 }
879 
880 }  // namespace url_matcher
881