// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef COMPONENTS_URL_MATCHER_URL_MATCHER_H_
#define COMPONENTS_URL_MATCHER_URL_MATCHER_H_

#include <map>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "base/memory/ref_counted.h"
#include "base/memory/scoped_ptr.h"
#include "base/memory/scoped_vector.h"
#include "components/url_matcher/regex_set_matcher.h"
#include "components/url_matcher/substring_set_matcher.h"
#include "components/url_matcher/url_matcher_export.h"

// Forward declarations, so that this header does not need the full
// definitions of these types.
class GURL;

namespace base {
class DictionaryValue;
}  // namespace base

namespace url_matcher {

26 // This class represents a single URL matching condition, e.g. a match on the
27 // host suffix or the containment of a string in the query component of a GURL.
28 //
29 // The difference from a simple StringPattern is that this also supports
30 // checking whether the {Host, Path, Query} of a URL contains a string. The
31 // reduction of URL matching conditions to StringPatterns conducted by
32 // URLMatcherConditionFactory is not capable of expressing that alone.
33 //
34 // Also supported is matching regular expressions against the URL (URL_MATCHES).
35 class URL_MATCHER_EXPORT URLMatcherCondition {
36  public:
37   enum Criterion {
38     HOST_PREFIX,
39     HOST_SUFFIX,
40     HOST_CONTAINS,
41     HOST_EQUALS,
42     PATH_PREFIX,
43     PATH_SUFFIX,
44     PATH_CONTAINS,
45     PATH_EQUALS,
46     QUERY_PREFIX,
47     QUERY_SUFFIX,
48     QUERY_CONTAINS,
49     QUERY_EQUALS,
50     HOST_SUFFIX_PATH_PREFIX,
51     HOST_EQUALS_PATH_PREFIX,
52     URL_PREFIX,
53     URL_SUFFIX,
54     URL_CONTAINS,
55     URL_EQUALS,
56     URL_MATCHES,
57     ORIGIN_AND_PATH_MATCHES,  // Matches the URL minus its query string.
58   };
59 
60   URLMatcherCondition();
61   ~URLMatcherCondition();
62   URLMatcherCondition(Criterion criterion,
63                       const StringPattern* substring_pattern);
64   URLMatcherCondition(const URLMatcherCondition& rhs);
65   URLMatcherCondition& operator=(const URLMatcherCondition& rhs);
66   bool operator<(const URLMatcherCondition& rhs) const;
67 
criterion()68   Criterion criterion() const { return criterion_; }
string_pattern()69   const StringPattern* string_pattern() const {
70     return string_pattern_;
71   }
72 
73   // Returns whether this URLMatcherCondition needs to be executed on a
74   // full URL rather than the individual components (see
75   // URLMatcherConditionFactory).
76   bool IsFullURLCondition() const;
77 
78   // Returns whether this URLMatcherCondition is a regular expression to be
79   // handled by a regex matcher instead of a substring matcher.
80   bool IsRegexCondition() const;
81 
82   // Returns whether this URLMatcherCondition is a regular expression that shall
83   // be evaluated on the URL without the query parameter.
84   bool IsOriginAndPathRegexCondition() const;
85 
86   // Returns whether this condition is fulfilled according to
87   // |matching_patterns| and |url|.
88   bool IsMatch(const std::set<StringPattern::ID>& matching_patterns,
89                const GURL& url) const;
90 
91  private:
92   // |criterion_| and |string_pattern_| describe together what property a URL
93   // needs to fulfill to be considered a match.
94   Criterion criterion_;
95 
96   // This is the StringPattern that is used in a SubstringSetMatcher.
97   const StringPattern* string_pattern_;
98 };
99 
100 // Class to map the problem of finding {host, path, query} {prefixes, suffixes,
101 // containments, and equality} in GURLs to the substring matching problem.
102 //
103 // Say, you want to check whether the path of a URL starts with "/index.html".
104 // This class preprocesses a URL like "www.google.com/index.html" into something
105 // like "www.google.com|/index.html". After preprocessing, you can search for
106 // "|/index.html" in the string and see that this candidate URL actually has
107 // a path that starts with "/index.html". On the contrary,
108 // "www.google.com/images/index.html" would be normalized to
109 // "www.google.com|/images/index.html". It is easy to see that it contains
110 // "/index.html" but the path of the URL does not start with "/index.html".
111 //
112 // This preprocessing is important if you want to match a URL against many
113 // patterns because it reduces the matching to a "discover all substrings
114 // of a dictionary in a text" problem, which can be solved very efficiently
115 // by the Aho-Corasick algorithm.
116 //
117 // IMPORTANT: The URLMatcherConditionFactory owns the StringPattern
118 // referenced by created URLMatcherConditions. Therefore, it must outlive
119 // all created URLMatcherCondition and the SubstringSetMatcher.
120 class URL_MATCHER_EXPORT URLMatcherConditionFactory {
121  public:
122   URLMatcherConditionFactory();
123   ~URLMatcherConditionFactory();
124 
125   // Canonicalizes a URL for "Create{Host,Path,Query}*Condition" searches.
126   std::string CanonicalizeURLForComponentSearches(const GURL& url) const;
127 
128   // Factory methods for various condition types.
129   //
130   // Note that these methods fill the pattern_singletons_. If you create
131   // conditions and don't register them to a URLMatcher, they will continue to
132   // consume memory. You need to call ForgetUnusedPatterns() or
133   // URLMatcher::ClearUnusedConditionSets() in this case.
134   URLMatcherCondition CreateHostPrefixCondition(const std::string& prefix);
135   URLMatcherCondition CreateHostSuffixCondition(const std::string& suffix);
136   URLMatcherCondition CreateHostContainsCondition(const std::string& str);
137   URLMatcherCondition CreateHostEqualsCondition(const std::string& str);
138 
139   URLMatcherCondition CreatePathPrefixCondition(const std::string& prefix);
140   URLMatcherCondition CreatePathSuffixCondition(const std::string& suffix);
141   URLMatcherCondition CreatePathContainsCondition(const std::string& str);
142   URLMatcherCondition CreatePathEqualsCondition(const std::string& str);
143 
144   URLMatcherCondition CreateQueryPrefixCondition(const std::string& prefix);
145   URLMatcherCondition CreateQuerySuffixCondition(const std::string& suffix);
146   URLMatcherCondition CreateQueryContainsCondition(const std::string& str);
147   URLMatcherCondition CreateQueryEqualsCondition(const std::string& str);
148 
149   // This covers the common case, where you don't care whether a domain
150   // "foobar.com" is expressed as "foobar.com" or "www.foobar.com", and it
151   // should be followed by a given |path_prefix|.
152   URLMatcherCondition CreateHostSuffixPathPrefixCondition(
153       const std::string& host_suffix,
154       const std::string& path_prefix);
155   URLMatcherCondition CreateHostEqualsPathPrefixCondition(
156       const std::string& host,
157       const std::string& path_prefix);
158 
159   // Canonicalizes a URL for "CreateURL*Condition" searches.
160   std::string CanonicalizeURLForFullSearches(const GURL& url) const;
161 
162   // Canonicalizes a URL for "CreateURLMatchesCondition" searches.
163   std::string CanonicalizeURLForRegexSearches(const GURL& url) const;
164   // Canonicalizes a URL for "CreateOriginAndPathMatchesCondition" searches.
165   std::string CanonicalizeURLForOriginAndPathRegexSearches(
166       const GURL& url) const;
167 
168   URLMatcherCondition CreateURLPrefixCondition(const std::string& prefix);
169   URLMatcherCondition CreateURLSuffixCondition(const std::string& suffix);
170   URLMatcherCondition CreateURLContainsCondition(const std::string& str);
171   URLMatcherCondition CreateURLEqualsCondition(const std::string& str);
172 
173   URLMatcherCondition CreateURLMatchesCondition(const std::string& regex);
174   URLMatcherCondition CreateOriginAndPathMatchesCondition(
175       const std::string& regex);
176 
177   // Removes all patterns from |pattern_singletons_| that are not listed in
178   // |used_patterns|. These patterns are not referenced any more and get
179   // freed.
180   void ForgetUnusedPatterns(
181       const std::set<StringPattern::ID>& used_patterns);
182 
183   // Returns true if this object retains no allocated data. Only for debugging.
184   bool IsEmpty() const;
185 
186  private:
187   // Creates a URLMatcherCondition according to the parameters passed.
188   // The URLMatcherCondition will refer to a StringPattern that is
189   // owned by |pattern_singletons_|.
190   URLMatcherCondition CreateCondition(URLMatcherCondition::Criterion criterion,
191                                       const std::string& pattern);
192 
193   // Prepends a "." to the hostname if it does not start with one.
194   std::string CanonicalizeHostname(const std::string& hostname) const;
195 
196   // Counter that ensures that all created StringPatterns have unique IDs.
197   // Note that substring patterns and regex patterns will use different IDs.
198   int id_counter_;
199 
200   // This comparison considers only the pattern() value of the
201   // StringPatterns.
202   struct StringPatternPointerCompare {
203     bool operator()(StringPattern* lhs, StringPattern* rhs) const;
204   };
205   // Set to ensure that we generate only one StringPattern for each content
206   // of StringPattern::pattern().
207   typedef std::set<StringPattern*, StringPatternPointerCompare>
208       PatternSingletons;
209   PatternSingletons substring_pattern_singletons_;
210   PatternSingletons regex_pattern_singletons_;
211   PatternSingletons origin_and_path_regex_pattern_singletons_;
212 
213   DISALLOW_COPY_AND_ASSIGN(URLMatcherConditionFactory);
214 };
215 
216 // This class represents a filter for the URL scheme to be hooked up into a
217 // URLMatcherConditionSet.
218 class URL_MATCHER_EXPORT URLMatcherSchemeFilter {
219  public:
220   explicit URLMatcherSchemeFilter(const std::string& filter);
221   explicit URLMatcherSchemeFilter(const std::vector<std::string>& filters);
222   ~URLMatcherSchemeFilter();
223   bool IsMatch(const GURL& url) const;
224 
225  private:
226   std::vector<std::string> filters_;
227 
228   DISALLOW_COPY_AND_ASSIGN(URLMatcherSchemeFilter);
229 };
230 
231 // This class represents a filter for port numbers to be hooked up into a
232 // URLMatcherConditionSet.
233 class URL_MATCHER_EXPORT URLMatcherPortFilter {
234  public:
235   // Boundaries of a port range (both ends are included).
236   typedef std::pair<int, int> Range;
237   explicit URLMatcherPortFilter(const std::vector<Range>& ranges);
238   ~URLMatcherPortFilter();
239   bool IsMatch(const GURL& url) const;
240 
241   // Creates a port range [from, to]; both ends are included.
242   static Range CreateRange(int from, int to);
243   // Creates a port range containing a single port.
244   static Range CreateRange(int port);
245 
246  private:
247   std::vector<Range> ranges_;
248 
249   DISALLOW_COPY_AND_ASSIGN(URLMatcherPortFilter);
250 };
251 
252 // This class represents a set of conditions that all need to match on a
253 // given URL in order to be considered a match.
254 class URL_MATCHER_EXPORT URLMatcherConditionSet
255     : public base::RefCounted<URLMatcherConditionSet> {
256  public:
257   typedef int ID;
258   typedef std::set<URLMatcherCondition> Conditions;
259   typedef std::vector<scoped_refptr<URLMatcherConditionSet> > Vector;
260 
261   // Matches if all conditions in |conditions| are fulfilled.
262   URLMatcherConditionSet(ID id, const Conditions& conditions);
263 
264   // Matches if all conditions in |conditions|, |scheme_filter| and
265   // |port_filter| are fulfilled. |scheme_filter| and |port_filter| may be NULL,
266   // in which case, no restrictions are imposed on the scheme/port of a URL.
267   URLMatcherConditionSet(ID id, const Conditions& conditions,
268                          scoped_ptr<URLMatcherSchemeFilter> scheme_filter,
269                          scoped_ptr<URLMatcherPortFilter> port_filter);
270 
id()271   ID id() const { return id_; }
conditions()272   const Conditions& conditions() const { return conditions_; }
273 
274   bool IsMatch(const std::set<StringPattern::ID>& matching_patterns,
275                const GURL& url) const;
276 
277  private:
278   friend class base::RefCounted<URLMatcherConditionSet>;
279   ~URLMatcherConditionSet();
280   ID id_;
281   Conditions conditions_;
282   scoped_ptr<URLMatcherSchemeFilter> scheme_filter_;
283   scoped_ptr<URLMatcherPortFilter> port_filter_;
284 
285   DISALLOW_COPY_AND_ASSIGN(URLMatcherConditionSet);
286 };
287 
288 // This class allows matching one URL against a large set of
289 // URLMatcherConditionSets at the same time.
290 class URL_MATCHER_EXPORT URLMatcher {
291  public:
292   URLMatcher();
293   ~URLMatcher();
294 
295   // Adds new URLMatcherConditionSet to this URL Matcher. Each condition set
296   // must have a unique ID.
297   // This is an expensive operation as it triggers pre-calculations on the
298   // currently registered condition sets. Do not call this operation many
299   // times with a single condition set in each call.
300   void AddConditionSets(const URLMatcherConditionSet::Vector& condition_sets);
301 
302   // Removes the listed condition sets. All |condition_set_ids| must be
303   // currently registered. This function should be called with large batches
304   // of |condition_set_ids| at a time to improve performance.
305   void RemoveConditionSets(
306       const std::vector<URLMatcherConditionSet::ID>& condition_set_ids);
307 
308   // Removes all unused condition sets from the ConditionFactory.
309   void ClearUnusedConditionSets();
310 
311   // Returns the IDs of all URLMatcherConditionSet that match to this |url|.
312   std::set<URLMatcherConditionSet::ID> MatchURL(const GURL& url) const;
313 
314   // Returns the URLMatcherConditionFactory that must be used to create
315   // URLMatcherConditionSets for this URLMatcher.
condition_factory()316   URLMatcherConditionFactory* condition_factory() {
317     return &condition_factory_;
318   }
319 
320   // Returns true if this object retains no allocated data. Only for debugging.
321   bool IsEmpty() const;
322 
323  private:
324   void UpdateSubstringSetMatcher(bool full_url_conditions);
325   void UpdateRegexSetMatcher();
326   void UpdateTriggers();
327   void UpdateConditionFactory();
328   void UpdateInternalDatastructures();
329 
330   URLMatcherConditionFactory condition_factory_;
331 
332   // Maps the ID of a URLMatcherConditionSet to the respective
333   // URLMatcherConditionSet.
334   typedef std::map<URLMatcherConditionSet::ID,
335                    scoped_refptr<URLMatcherConditionSet> >
336       URLMatcherConditionSets;
337   URLMatcherConditionSets url_matcher_condition_sets_;
338 
339   // Maps a StringPattern ID to the URLMatcherConditions that need to
340   // be triggered in case of a StringPattern match.
341   typedef std::map<StringPattern::ID, std::set<URLMatcherConditionSet::ID> >
342       StringPatternTriggers;
343   StringPatternTriggers substring_match_triggers_;
344 
345   SubstringSetMatcher full_url_matcher_;
346   SubstringSetMatcher url_component_matcher_;
347   RegexSetMatcher regex_set_matcher_;
348   RegexSetMatcher origin_and_path_regex_set_matcher_;
349   std::set<const StringPattern*> registered_full_url_patterns_;
350   std::set<const StringPattern*> registered_url_component_patterns_;
351 
352   DISALLOW_COPY_AND_ASSIGN(URLMatcher);
353 };
354 
}  // namespace url_matcher

#endif  // COMPONENTS_URL_MATCHER_URL_MATCHER_H_