• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 #ifndef CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_
5 #define CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_
6 #pragma once
7 
8 #include <functional>
9 #include <string>
10 #include <vector>
11 
12 class GURL;
13 
14 // A pattern that can be used to match URLs. A URLPattern is a very restricted
15 // subset of URL syntax:
16 //
17 // <url-pattern> := <scheme>://<host><path> | '<all_urls>'
18 // <scheme> := '*' | 'http' | 'https' | 'file' | 'ftp' | 'chrome'
19 // <host> := '*' | '*.' <anychar except '/' and '*'>+
20 // <path> := '/' <any chars>
21 //
22 // * Host is not used when the scheme is 'file'.
23 // * The path can have embedded '*' characters which act as glob wildcards.
24 // * '<all_urls>' is a special pattern that matches any URL that contains a
25 //   valid scheme (as specified by valid_schemes_).
26 // * The '*' scheme pattern excludes file URLs.
27 //
28 // Examples of valid patterns:
29 // - http://*/*
30 // - http://*/foo*
31 // - https://*.google.com/foo*bar
32 // - file://monkey*
33 // - http://127.0.0.1/*
34 //
35 // Examples of invalid patterns:
36 // - http://* -- path not specified
37 // - http://*foo/bar -- * not allowed as substring of host component
38 // - http://foo.*.bar/baz -- * must be first component
39 // - http:/bar -- scheme separator not found
40 // - foo://* -- invalid scheme
41 // - chrome:// -- we don't support chrome internal URLs
42 //
43 // Design rationale:
// * We need to be able to tell users what 'sites' a given URLPattern will
//   affect. For example: "This extension will interact with the site
//   'www.google.com'."
47 // * We'd like to be able to convert as many existing Greasemonkey @include
48 //   patterns to URLPatterns as possible. Greasemonkey @include patterns are
49 //   simple globs, so this won't be perfect.
50 // * Although we would like to support any scheme, it isn't clear what to tell
51 //   users about URLPatterns that affect data or javascript URLs, so those are
52 //   left out for now.
53 //
54 // From a 2008-ish crawl of userscripts.org, the following patterns were found
55 // in @include lines:
56 // - total lines                    : 24471
57 // - @include *                     :   919
58 // - @include http://[^\*]+?/       : 11128 (no star in host)
59 // - @include http://\*\.[^\*]+?/   :  2325 (host prefixed by *.)
60 // - @include http://\*[^\.][^\*]+?/:  1524 (host prefixed by *, no dot -- many
61 //                                           appear to only need subdomain
62 //                                           matching, not real prefix matching)
63 // - @include http://[^\*/]+\*/     :   320 (host suffixed by *)
64 // - @include contains .tld         :   297 (host suffixed by .tld -- a special
65 //                                           Greasemonkey domain component that
66 //                                           tries to match all valid registry-
67 //                                           controlled suffixes)
68 // - @include http://\*/            :   228 (host is * exactly, but there is
69 //                                           more to the pattern)
70 //
71 // So, we can support at least half of current @include lines without supporting
72 // subdomain matching. We can pick up at least another 10% by supporting
73 // subdomain matching. It is probably possible to coerce more of the existing
74 // patterns to URLPattern, but the resulting pattern will be more restrictive
75 // than the original glob, which is probably better than nothing.
76 class URLPattern {
77  public:
78   // A collection of scheme bitmasks for use with valid_schemes.
79   enum SchemeMasks {
80     SCHEME_NONE       = 0,
81     SCHEME_HTTP       = 1 << 0,
82     SCHEME_HTTPS      = 1 << 1,
83     SCHEME_FILE       = 1 << 2,
84     SCHEME_FTP        = 1 << 3,
85     SCHEME_CHROMEUI   = 1 << 4,
86     SCHEME_FILESYSTEM = 1 << 5,
87     // SCHEME_ALL will match every scheme, including chrome://, chrome-
88     // extension://, about:, etc. Because this has lots of security
89     // implications, third-party extensions should never be able to get access
90     // to URL patterns initialized this way. It should only be used for internal
91     // Chrome code.
92     SCHEME_ALL      = -1,
93   };
94 
95   // Options for URLPattern::Parse().
96   enum ParseOption {
97     PARSE_LENIENT,
98     PARSE_STRICT
99   };
100 
101   // Error codes returned from Parse().
102   enum ParseResult {
103     PARSE_SUCCESS = 0,
104     PARSE_ERROR_MISSING_SCHEME_SEPARATOR,
105     PARSE_ERROR_INVALID_SCHEME,
106     PARSE_ERROR_WRONG_SCHEME_SEPARATOR,
107     PARSE_ERROR_EMPTY_HOST,
108     PARSE_ERROR_INVALID_HOST_WILDCARD,
109     PARSE_ERROR_EMPTY_PATH,
110     PARSE_ERROR_HAS_COLON,  // Only checked when strict checks are enabled.
111     NUM_PARSE_RESULTS
112   };
113 
114   // The <all_urls> string pattern.
115   static const char kAllUrlsPattern[];
116 
117   // Construct an URLPattern with the given set of allowable schemes. See
118   // valid_schemes_ for more info.
119   explicit URLPattern(int valid_schemes);
120 
121   // Convenience to construct a URLPattern from a string. The string is expected
122   // to be a valid pattern. If the string is not known ahead of time, use
123   // Parse() instead, which returns success or failure.
124   URLPattern(int valid_schemes, const std::string& pattern);
125 
126 #if defined(_MSC_VER) && _MSC_VER >= 1600
127   // Note: don't use this directly. This exists so URLPattern can be used
128   // with STL containers.  Starting with Visual Studio 2010, we can't have this
129   // method private and use "friend class std::vector<URLPattern>;" as we used
130   // to do.
131   URLPattern();
132 #endif
133 
134   ~URLPattern();
135 
136   // Gets the bitmask of valid schemes.
valid_schemes()137   int valid_schemes() const { return valid_schemes_; }
set_valid_schemes(int valid_schemes)138   void set_valid_schemes(int valid_schemes) { valid_schemes_ = valid_schemes; }
139 
140   // Gets the host the pattern matches. This can be an empty string if the
141   // pattern matches all hosts (the input was <scheme>://*/<whatever>).
host()142   const std::string& host() const { return host_; }
set_host(const std::string & host)143   void set_host(const std::string& host) { host_ = host; }
144 
145   // Gets whether to match subdomains of host().
match_subdomains()146   bool match_subdomains() const { return match_subdomains_; }
set_match_subdomains(bool val)147   void set_match_subdomains(bool val) { match_subdomains_ = val; }
148 
149   // Gets the path the pattern matches with the leading slash. This can have
150   // embedded asterisks which are interpreted using glob rules.
path()151   const std::string& path() const { return path_; }
152   void SetPath(const std::string& path);
153 
154   // Returns true if this pattern matches all urls.
match_all_urls()155   bool match_all_urls() const { return match_all_urls_; }
set_match_all_urls(bool val)156   void set_match_all_urls(bool val) { match_all_urls_ = val; }
157 
158   // Initializes this instance by parsing the provided string. Returns
159   // URLPattern::PARSE_SUCCESS on success, or an error code otherwise. On
160   // failure, this instance will have some intermediate values and is in an
161   // invalid state.  Adding error checks to URLPattern::Parse() can cause
162   // patterns in installed extensions to fail.  If an installed extension
163   // uses a pattern that was valid but fails a new error check, the
164   // extension will fail to load when chrome is auto-updated.  To avoid
165   // this, new parse checks are enabled only when |strictness| is
166   // OPTION_STRICT.  OPTION_STRICT should be used when loading in developer
167   // mode, or when an extension's patterns are controlled by chrome (such
168   // as component extensions).
169   ParseResult Parse(const std::string& pattern_str,
170                     ParseOption strictness);
171 
172   // Sets the scheme for pattern matches. This can be a single '*' if the
173   // pattern matches all valid schemes (as defined by the valid_schemes_
174   // property). Returns false on failure (if the scheme is not valid).
175   bool SetScheme(const std::string& scheme);
176   // Note: You should use MatchesScheme() instead of this getter unless you
177   // absolutely need the exact scheme. This is exposed for testing.
scheme()178   const std::string& scheme() const { return scheme_; }
179 
180   // Returns true if the specified scheme can be used in this URL pattern, and
181   // false otherwise. Uses valid_schemes_ to determine validity.
182   bool IsValidScheme(const std::string& scheme) const;
183 
184   // Returns true if this instance matches the specified URL.
185   bool MatchesUrl(const GURL& url) const;
186 
187   // Returns true if |test| matches our scheme.
188   bool MatchesScheme(const std::string& test) const;
189 
190   // Returns true if |test| matches our host.
191   bool MatchesHost(const std::string& test) const;
192   bool MatchesHost(const GURL& test) const;
193 
194   // Returns true if |test| matches our path.
195   bool MatchesPath(const std::string& test) const;
196 
197   // Returns a string representing this instance.
198   std::string GetAsString() const;
199 
200   // Determine whether there is a URL that would match this instance and another
201   // instance. This method is symmetrical: Calling other.OverlapsWith(this)
202   // would result in the same answer.
203   bool OverlapsWith(const URLPattern& other) const;
204 
205   // Convert this URLPattern into an equivalent set of URLPatterns that don't
206   // use a wildcard in the scheme component. If this URLPattern doesn't use a
207   // wildcard scheme, then the returned set will contain one element that is
208   // equivalent to this instance.
209   std::vector<URLPattern> ConvertToExplicitSchemes() const;
210 
EffectiveHostCompare(const URLPattern & a,const URLPattern & b)211   static bool EffectiveHostCompare(const URLPattern& a, const URLPattern& b) {
212     if (a.match_all_urls_ && b.match_all_urls_)
213       return false;
214     return a.host_.compare(b.host_) < 0;
215   };
216 
217   // Used for origin comparisons in a std::set.
218   class EffectiveHostCompareFunctor {
219    public:
operator()220     bool operator()(const URLPattern& a, const URLPattern& b) const {
221       return EffectiveHostCompare(a, b);
222     };
223   };
224 
225   // Get an error string for a ParseResult.
226   static const char* GetParseResultString(URLPattern::ParseResult parse_result);
227 
228  private:
229 #if !(defined(_MSC_VER) && _MSC_VER >= 1600)
230   friend class std::vector<URLPattern>;
231 
232   // Note: don't use this directly. This exists so URLPattern can be used
233   // with STL containers.
234   URLPattern();
235 #endif
236 
237   // A bitmask containing the schemes which are considered valid for this
238   // pattern. Parse() uses this to decide whether a pattern contains a valid
239   // scheme. MatchesScheme uses this to decide whether a wildcard scheme_
240   // matches a given test scheme.
241   int valid_schemes_;
242 
243   // True if this is a special-case "<all_urls>" pattern.
244   bool match_all_urls_;
245 
246   // The scheme for the pattern.
247   std::string scheme_;
248 
249   // The host without any leading "*" components.
250   std::string host_;
251 
252   // Whether we should match subdomains of the host. This is true if the first
253   // component of the pattern's host was "*".
254   bool match_subdomains_;
255 
256   // The path to match. This is everything after the host of the URL, or
257   // everything after the scheme in the case of file:// URLs.
258   std::string path_;
259 
260   // The path with "?" and "\" characters escaped for use with the
261   // MatchPattern() function.
262   std::string path_escaped_;
263 };
264 
265 typedef std::vector<URLPattern> URLPatternList;
266 
267 #endif  // CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_
268