// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4 
5 #include "extensions/common/url_pattern.h"
6 
7 #include <ostream>
8 
9 #include "base/strings/string_number_conversions.h"
10 #include "base/strings/string_piece.h"
11 #include "base/strings/string_split.h"
12 #include "base/strings/string_util.h"
13 #include "base/strings/stringprintf.h"
14 #include "content/public/common/url_constants.h"
15 #include "extensions/common/constants.h"
16 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
17 #include "url/gurl.h"
18 #include "url/url_util.h"
19 
20 const char URLPattern::kAllUrlsPattern[] = "<all_urls>";
21 
22 namespace {
23 
24 // TODO(aa): What about more obscure schemes like data: and javascript: ?
25 // Note: keep this array in sync with kValidSchemeMasks.
26 const char* kValidSchemes[] = {
27     url::kHttpScheme,
28     url::kHttpsScheme,
29     url::kFileScheme,
30     url::kFtpScheme,
31     content::kChromeUIScheme,
32     extensions::kExtensionScheme,
33     url::kFileSystemScheme,
34 };
35 
36 const int kValidSchemeMasks[] = {
37   URLPattern::SCHEME_HTTP,
38   URLPattern::SCHEME_HTTPS,
39   URLPattern::SCHEME_FILE,
40   URLPattern::SCHEME_FTP,
41   URLPattern::SCHEME_CHROMEUI,
42   URLPattern::SCHEME_EXTENSION,
43   URLPattern::SCHEME_FILESYSTEM,
44 };
45 
46 COMPILE_ASSERT(arraysize(kValidSchemes) == arraysize(kValidSchemeMasks),
47                must_keep_these_arrays_in_sync);
48 
49 const char kParseSuccess[] = "Success.";
50 const char kParseErrorMissingSchemeSeparator[] = "Missing scheme separator.";
51 const char kParseErrorInvalidScheme[] = "Invalid scheme.";
52 const char kParseErrorWrongSchemeType[] = "Wrong scheme type.";
53 const char kParseErrorEmptyHost[] = "Host can not be empty.";
54 const char kParseErrorInvalidHostWildcard[] = "Invalid host wildcard.";
55 const char kParseErrorEmptyPath[] = "Empty path.";
56 const char kParseErrorInvalidPort[] = "Invalid port.";
57 const char kParseErrorInvalidHost[] = "Invalid host.";
58 
59 // Message explaining each URLPattern::ParseResult.
60 const char* const kParseResultMessages[] = {
61   kParseSuccess,
62   kParseErrorMissingSchemeSeparator,
63   kParseErrorInvalidScheme,
64   kParseErrorWrongSchemeType,
65   kParseErrorEmptyHost,
66   kParseErrorInvalidHostWildcard,
67   kParseErrorEmptyPath,
68   kParseErrorInvalidPort,
69   kParseErrorInvalidHost,
70 };
71 
72 COMPILE_ASSERT(URLPattern::NUM_PARSE_RESULTS == arraysize(kParseResultMessages),
73                must_add_message_for_each_parse_result);
74 
75 const char kPathSeparator[] = "/";
76 
IsStandardScheme(const std::string & scheme)77 bool IsStandardScheme(const std::string& scheme) {
78   // "*" gets the same treatment as a standard scheme.
79   if (scheme == "*")
80     return true;
81 
82   return url::IsStandard(scheme.c_str(),
83                          url::Component(0, static_cast<int>(scheme.length())));
84 }
85 
IsValidPortForScheme(const std::string & scheme,const std::string & port)86 bool IsValidPortForScheme(const std::string& scheme, const std::string& port) {
87   if (port == "*")
88     return true;
89 
90   // Only accept non-wildcard ports if the scheme uses ports.
91   if (url::DefaultPortForScheme(scheme.c_str(), scheme.length()) ==
92       url::PORT_UNSPECIFIED) {
93     return false;
94   }
95 
96   int parsed_port = url::PORT_UNSPECIFIED;
97   if (!base::StringToInt(port, &parsed_port))
98     return false;
99   return (parsed_port >= 0) && (parsed_port < 65536);
100 }
101 
102 // Returns |path| with the trailing wildcard stripped if one existed.
103 //
104 // The functions that rely on this (OverlapsWith and Contains) are only
105 // called for the patterns inside URLPatternSet. In those cases, we know that
106 // the path will have only a single wildcard at the end. This makes figuring
107 // out overlap much easier. It seems like there is probably a computer-sciency
108 // way to solve the general case, but we don't need that yet.
StripTrailingWildcard(const std::string & path)109 std::string StripTrailingWildcard(const std::string& path) {
110   size_t wildcard_index = path.find('*');
111   size_t path_last = path.size() - 1;
112   DCHECK(wildcard_index == std::string::npos || wildcard_index == path_last);
113   return wildcard_index == path_last ? path.substr(0, path_last) : path;
114 }
115 
116 }  // namespace
117 
118 // static
IsValidSchemeForExtensions(const std::string & scheme)119 bool URLPattern::IsValidSchemeForExtensions(const std::string& scheme) {
120   for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
121     if (scheme == kValidSchemes[i])
122       return true;
123   }
124   return false;
125 }
126 
URLPattern()127 URLPattern::URLPattern()
128     : valid_schemes_(SCHEME_NONE),
129       match_all_urls_(false),
130       match_subdomains_(false),
131       port_("*") {}
132 
URLPattern(int valid_schemes)133 URLPattern::URLPattern(int valid_schemes)
134     : valid_schemes_(valid_schemes),
135       match_all_urls_(false),
136       match_subdomains_(false),
137       port_("*") {}
138 
URLPattern(int valid_schemes,const std::string & pattern)139 URLPattern::URLPattern(int valid_schemes, const std::string& pattern)
140     // Strict error checking is used, because this constructor is only
141     // appropriate when we know |pattern| is valid.
142     : valid_schemes_(valid_schemes),
143       match_all_urls_(false),
144       match_subdomains_(false),
145       port_("*") {
146   ParseResult result = Parse(pattern);
147   if (PARSE_SUCCESS != result)
148     NOTREACHED() << "URLPattern invalid: " << pattern << " result " << result;
149 }
150 
~URLPattern()151 URLPattern::~URLPattern() {
152 }
153 
operator <(const URLPattern & other) const154 bool URLPattern::operator<(const URLPattern& other) const {
155   return GetAsString() < other.GetAsString();
156 }
157 
operator >(const URLPattern & other) const158 bool URLPattern::operator>(const URLPattern& other) const {
159   return GetAsString() > other.GetAsString();
160 }
161 
operator ==(const URLPattern & other) const162 bool URLPattern::operator==(const URLPattern& other) const {
163   return GetAsString() == other.GetAsString();
164 }
165 
operator <<(std::ostream & out,const URLPattern & url_pattern)166 std::ostream& operator<<(std::ostream& out, const URLPattern& url_pattern) {
167   return out << '"' << url_pattern.GetAsString() << '"';
168 }
169 
Parse(const std::string & pattern)170 URLPattern::ParseResult URLPattern::Parse(const std::string& pattern) {
171   spec_.clear();
172   SetMatchAllURLs(false);
173   SetMatchSubdomains(false);
174   SetPort("*");
175 
176   // Special case pattern to match every valid URL.
177   if (pattern == kAllUrlsPattern) {
178     SetMatchAllURLs(true);
179     return PARSE_SUCCESS;
180   }
181 
182   // Parse out the scheme.
183   size_t scheme_end_pos = pattern.find(url::kStandardSchemeSeparator);
184   bool has_standard_scheme_separator = true;
185 
186   // Some urls also use ':' alone as the scheme separator.
187   if (scheme_end_pos == std::string::npos) {
188     scheme_end_pos = pattern.find(':');
189     has_standard_scheme_separator = false;
190   }
191 
192   if (scheme_end_pos == std::string::npos)
193     return PARSE_ERROR_MISSING_SCHEME_SEPARATOR;
194 
195   if (!SetScheme(pattern.substr(0, scheme_end_pos)))
196     return PARSE_ERROR_INVALID_SCHEME;
197 
198   bool standard_scheme = IsStandardScheme(scheme_);
199   if (standard_scheme != has_standard_scheme_separator)
200     return PARSE_ERROR_WRONG_SCHEME_SEPARATOR;
201 
202   // Advance past the scheme separator.
203   scheme_end_pos +=
204       (standard_scheme ? strlen(url::kStandardSchemeSeparator) : 1);
205   if (scheme_end_pos >= pattern.size())
206     return PARSE_ERROR_EMPTY_HOST;
207 
208   // Parse out the host and path.
209   size_t host_start_pos = scheme_end_pos;
210   size_t path_start_pos = 0;
211 
212   if (!standard_scheme) {
213     path_start_pos = host_start_pos;
214   } else if (scheme_ == url::kFileScheme) {
215     size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);
216     if (host_end_pos == std::string::npos) {
217       // Allow hostname omission.
218       // e.g. file://* is interpreted as file:///*,
219       // file://foo* is interpreted as file:///foo*.
220       path_start_pos = host_start_pos - 1;
221     } else {
222       // Ignore hostname if scheme is file://.
223       // e.g. file://localhost/foo is equal to file:///foo.
224       path_start_pos = host_end_pos;
225     }
226   } else {
227     size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);
228 
229     // Host is required.
230     if (host_start_pos == host_end_pos)
231       return PARSE_ERROR_EMPTY_HOST;
232 
233     if (host_end_pos == std::string::npos)
234       return PARSE_ERROR_EMPTY_PATH;
235 
236     host_ = pattern.substr(host_start_pos, host_end_pos - host_start_pos);
237 
238     // The first component can optionally be '*' to match all subdomains.
239     std::vector<std::string> host_components;
240     base::SplitString(host_, '.', &host_components);
241 
242     // Could be empty if the host only consists of whitespace characters.
243     if (host_components.empty())
244       return PARSE_ERROR_EMPTY_HOST;
245 
246     if (host_components[0] == "*") {
247       match_subdomains_ = true;
248       host_components.erase(host_components.begin(),
249                             host_components.begin() + 1);
250     }
251     host_ = JoinString(host_components, '.');
252 
253     path_start_pos = host_end_pos;
254   }
255 
256   SetPath(pattern.substr(path_start_pos));
257 
258   size_t port_pos = host_.find(':');
259   if (port_pos != std::string::npos) {
260     if (!SetPort(host_.substr(port_pos + 1)))
261       return PARSE_ERROR_INVALID_PORT;
262     host_ = host_.substr(0, port_pos);
263   }
264 
265   // No other '*' can occur in the host, though. This isn't necessary, but is
266   // done as a convenience to developers who might otherwise be confused and
267   // think '*' works as a glob in the host.
268   if (host_.find('*') != std::string::npos)
269     return PARSE_ERROR_INVALID_HOST_WILDCARD;
270 
271   // Null characters are not allowed in hosts.
272   if (host_.find('\0') != std::string::npos)
273     return PARSE_ERROR_INVALID_HOST;
274 
275   return PARSE_SUCCESS;
276 }
277 
SetValidSchemes(int valid_schemes)278 void URLPattern::SetValidSchemes(int valid_schemes) {
279   spec_.clear();
280   valid_schemes_ = valid_schemes;
281 }
282 
SetHost(const std::string & host)283 void URLPattern::SetHost(const std::string& host) {
284   spec_.clear();
285   host_ = host;
286 }
287 
SetMatchAllURLs(bool val)288 void URLPattern::SetMatchAllURLs(bool val) {
289   spec_.clear();
290   match_all_urls_ = val;
291 
292   if (val) {
293     match_subdomains_ = true;
294     scheme_ = "*";
295     host_.clear();
296     SetPath("/*");
297   }
298 }
299 
SetMatchSubdomains(bool val)300 void URLPattern::SetMatchSubdomains(bool val) {
301   spec_.clear();
302   match_subdomains_ = val;
303 }
304 
SetScheme(const std::string & scheme)305 bool URLPattern::SetScheme(const std::string& scheme) {
306   spec_.clear();
307   scheme_ = scheme;
308   if (scheme_ == "*") {
309     valid_schemes_ &= (SCHEME_HTTP | SCHEME_HTTPS);
310   } else if (!IsValidScheme(scheme_)) {
311     return false;
312   }
313   return true;
314 }
315 
IsValidScheme(const std::string & scheme) const316 bool URLPattern::IsValidScheme(const std::string& scheme) const {
317   if (valid_schemes_ == SCHEME_ALL)
318     return true;
319 
320   for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
321     if (scheme == kValidSchemes[i] && (valid_schemes_ & kValidSchemeMasks[i]))
322       return true;
323   }
324 
325   return false;
326 }
327 
SetPath(const std::string & path)328 void URLPattern::SetPath(const std::string& path) {
329   spec_.clear();
330   path_ = path;
331   path_escaped_ = path_;
332   ReplaceSubstringsAfterOffset(&path_escaped_, 0, "\\", "\\\\");
333   ReplaceSubstringsAfterOffset(&path_escaped_, 0, "?", "\\?");
334 }
335 
SetPort(const std::string & port)336 bool URLPattern::SetPort(const std::string& port) {
337   spec_.clear();
338   if (IsValidPortForScheme(scheme_, port)) {
339     port_ = port;
340     return true;
341   }
342   return false;
343 }
344 
MatchesURL(const GURL & test) const345 bool URLPattern::MatchesURL(const GURL& test) const {
346   const GURL* test_url = &test;
347   bool has_inner_url = test.inner_url() != NULL;
348 
349   if (has_inner_url) {
350     if (!test.SchemeIsFileSystem())
351       return false;  // The only nested URLs we handle are filesystem URLs.
352     test_url = test.inner_url();
353   }
354 
355   if (!MatchesScheme(test_url->scheme()))
356     return false;
357 
358   if (match_all_urls_)
359     return true;
360 
361   std::string path_for_request = test.PathForRequest();
362   if (has_inner_url)
363     path_for_request = test_url->path() + path_for_request;
364 
365   return MatchesSecurityOriginHelper(*test_url) &&
366          MatchesPath(path_for_request);
367 }
368 
MatchesSecurityOrigin(const GURL & test) const369 bool URLPattern::MatchesSecurityOrigin(const GURL& test) const {
370   const GURL* test_url = &test;
371   bool has_inner_url = test.inner_url() != NULL;
372 
373   if (has_inner_url) {
374     if (!test.SchemeIsFileSystem())
375       return false;  // The only nested URLs we handle are filesystem URLs.
376     test_url = test.inner_url();
377   }
378 
379   if (!MatchesScheme(test_url->scheme()))
380     return false;
381 
382   if (match_all_urls_)
383     return true;
384 
385   return MatchesSecurityOriginHelper(*test_url);
386 }
387 
MatchesScheme(const std::string & test) const388 bool URLPattern::MatchesScheme(const std::string& test) const {
389   if (!IsValidScheme(test))
390     return false;
391 
392   return scheme_ == "*" || test == scheme_;
393 }
394 
MatchesHost(const std::string & host) const395 bool URLPattern::MatchesHost(const std::string& host) const {
396   std::string test(url::kHttpScheme);
397   test += url::kStandardSchemeSeparator;
398   test += host;
399   test += "/";
400   return MatchesHost(GURL(test));
401 }
402 
MatchesHost(const GURL & test) const403 bool URLPattern::MatchesHost(const GURL& test) const {
404   // If the hosts are exactly equal, we have a match.
405   if (test.host() == host_)
406     return true;
407 
408   // If we're matching subdomains, and we have no host in the match pattern,
409   // that means that we're matching all hosts, which means we have a match no
410   // matter what the test host is.
411   if (match_subdomains_ && host_.empty())
412     return true;
413 
414   // Otherwise, we can only match if our match pattern matches subdomains.
415   if (!match_subdomains_)
416     return false;
417 
418   // We don't do subdomain matching against IP addresses, so we can give up now
419   // if the test host is an IP address.
420   if (test.HostIsIPAddress())
421     return false;
422 
423   // Check if the test host is a subdomain of our host.
424   if (test.host().length() <= (host_.length() + 1))
425     return false;
426 
427   if (test.host().compare(test.host().length() - host_.length(),
428                           host_.length(), host_) != 0)
429     return false;
430 
431   return test.host()[test.host().length() - host_.length() - 1] == '.';
432 }
433 
ImpliesAllHosts() const434 bool URLPattern::ImpliesAllHosts() const {
435   // Check if it matches all urls or is a pattern like http://*/*.
436   if (match_all_urls_ ||
437       (match_subdomains_ && host_.empty() && port_ == "*" && path_ == "/*")) {
438     return true;
439   }
440 
441   // If this doesn't even match subdomains, it can't possibly imply all hosts.
442   if (!match_subdomains_)
443     return false;
444 
445   // If |host_| is a recognized TLD, this will be 0. We don't include private
446   // TLDs, so that, e.g., *.appspot.com does not imply all hosts.
447   size_t registry_length = net::registry_controlled_domains::GetRegistryLength(
448       host_,
449       net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
450       net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
451   // If there was more than just a TLD in the host (e.g., *.foobar.com), it
452   // doesn't imply all hosts.
453   if (registry_length > 0)
454     return false;
455 
456   // At this point the host could either be just a TLD ("com") or some unknown
457   // TLD-like string ("notatld"). To disambiguate between them construct a
458   // fake URL, and check the registry. This returns 0 if the TLD is
459   // unrecognized, or the length of the recognized TLD.
460   registry_length = net::registry_controlled_domains::GetRegistryLength(
461       base::StringPrintf("foo.%s", host_.c_str()),
462       net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
463       net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
464   // If we recognized this TLD, then this is a pattern like *.com, and it
465   // should imply all hosts. Otherwise, this doesn't imply all hosts.
466   return registry_length > 0;
467 }
468 
MatchesSingleOrigin() const469 bool URLPattern::MatchesSingleOrigin() const {
470   // Strictly speaking, the port is part of the origin, but in URLPattern it
471   // defaults to *. It's not very interesting anyway, so leave it out.
472   return !ImpliesAllHosts() && scheme_ != "*" && !match_subdomains_;
473 }
474 
MatchesPath(const std::string & test) const475 bool URLPattern::MatchesPath(const std::string& test) const {
476   // Make the behaviour of OverlapsWith consistent with MatchesURL, which is
477   // need to match hosted apps on e.g. 'google.com' also run on 'google.com/'.
478   if (test + "/*" == path_escaped_)
479     return true;
480 
481   return MatchPattern(test, path_escaped_);
482 }
483 
GetAsString() const484 const std::string& URLPattern::GetAsString() const {
485   if (!spec_.empty())
486     return spec_;
487 
488   if (match_all_urls_) {
489     spec_ = kAllUrlsPattern;
490     return spec_;
491   }
492 
493   bool standard_scheme = IsStandardScheme(scheme_);
494 
495   std::string spec = scheme_ +
496       (standard_scheme ? url::kStandardSchemeSeparator : ":");
497 
498   if (scheme_ != url::kFileScheme && standard_scheme) {
499     if (match_subdomains_) {
500       spec += "*";
501       if (!host_.empty())
502         spec += ".";
503     }
504 
505     if (!host_.empty())
506       spec += host_;
507 
508     if (port_ != "*") {
509       spec += ":";
510       spec += port_;
511     }
512   }
513 
514   if (!path_.empty())
515     spec += path_;
516 
517   spec_ = spec;
518   return spec_;
519 }
520 
OverlapsWith(const URLPattern & other) const521 bool URLPattern::OverlapsWith(const URLPattern& other) const {
522   if (match_all_urls() || other.match_all_urls())
523     return true;
524   return (MatchesAnyScheme(other.GetExplicitSchemes()) ||
525           other.MatchesAnyScheme(GetExplicitSchemes()))
526       && (MatchesHost(other.host()) || other.MatchesHost(host()))
527       && (MatchesPortPattern(other.port()) || other.MatchesPortPattern(port()))
528       && (MatchesPath(StripTrailingWildcard(other.path())) ||
529           other.MatchesPath(StripTrailingWildcard(path())));
530 }
531 
Contains(const URLPattern & other) const532 bool URLPattern::Contains(const URLPattern& other) const {
533   if (match_all_urls())
534     return true;
535   return MatchesAllSchemes(other.GetExplicitSchemes())
536       && MatchesHost(other.host())
537       && MatchesPortPattern(other.port())
538       && MatchesPath(StripTrailingWildcard(other.path()));
539 }
540 
MatchesAnyScheme(const std::vector<std::string> & schemes) const541 bool URLPattern::MatchesAnyScheme(
542     const std::vector<std::string>& schemes) const {
543   for (std::vector<std::string>::const_iterator i = schemes.begin();
544        i != schemes.end(); ++i) {
545     if (MatchesScheme(*i))
546       return true;
547   }
548 
549   return false;
550 }
551 
MatchesAllSchemes(const std::vector<std::string> & schemes) const552 bool URLPattern::MatchesAllSchemes(
553     const std::vector<std::string>& schemes) const {
554   for (std::vector<std::string>::const_iterator i = schemes.begin();
555        i != schemes.end(); ++i) {
556     if (!MatchesScheme(*i))
557       return false;
558   }
559 
560   return true;
561 }
562 
MatchesSecurityOriginHelper(const GURL & test) const563 bool URLPattern::MatchesSecurityOriginHelper(const GURL& test) const {
564   // Ignore hostname if scheme is file://.
565   if (scheme_ != url::kFileScheme && !MatchesHost(test))
566     return false;
567 
568   if (!MatchesPortPattern(base::IntToString(test.EffectiveIntPort())))
569     return false;
570 
571   return true;
572 }
573 
MatchesPortPattern(const std::string & port) const574 bool URLPattern::MatchesPortPattern(const std::string& port) const {
575   return port_ == "*" || port_ == port;
576 }
577 
GetExplicitSchemes() const578 std::vector<std::string> URLPattern::GetExplicitSchemes() const {
579   std::vector<std::string> result;
580 
581   if (scheme_ != "*" && !match_all_urls_ && IsValidScheme(scheme_)) {
582     result.push_back(scheme_);
583     return result;
584   }
585 
586   for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
587     if (MatchesScheme(kValidSchemes[i])) {
588       result.push_back(kValidSchemes[i]);
589     }
590   }
591 
592   return result;
593 }
594 
ConvertToExplicitSchemes() const595 std::vector<URLPattern> URLPattern::ConvertToExplicitSchemes() const {
596   std::vector<std::string> explicit_schemes = GetExplicitSchemes();
597   std::vector<URLPattern> result;
598 
599   for (std::vector<std::string>::const_iterator i = explicit_schemes.begin();
600        i != explicit_schemes.end(); ++i) {
601     URLPattern temp = *this;
602     temp.SetScheme(*i);
603     temp.SetMatchAllURLs(false);
604     result.push_back(temp);
605   }
606 
607   return result;
608 }
609 
610 // static
GetParseResultString(URLPattern::ParseResult parse_result)611 const char* URLPattern::GetParseResultString(
612     URLPattern::ParseResult parse_result) {
613   return kParseResultMessages[parse_result];
614 }
615