1 // Copyright 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef COMPONENTS_URL_MATCHER_SUBSTRING_SET_MATCHER_H_ 6 #define COMPONENTS_URL_MATCHER_SUBSTRING_SET_MATCHER_H_ 7 8 #include <limits> 9 #include <map> 10 #include <set> 11 #include <string> 12 #include <vector> 13 14 #include "base/basictypes.h" 15 #include "components/url_matcher/string_pattern.h" 16 #include "components/url_matcher/url_matcher_export.h" 17 18 namespace url_matcher { 19 20 // Class that store a set of string patterns and can find for a string S, 21 // which string patterns occur in S. 22 class URL_MATCHER_EXPORT SubstringSetMatcher { 23 public: 24 SubstringSetMatcher(); 25 ~SubstringSetMatcher(); 26 27 // Registers all |patterns|. The ownership remains with the caller. 28 // The same pattern cannot be registered twice and each pattern needs to have 29 // a unique ID. 30 // Ownership of the patterns remains with the caller. 31 void RegisterPatterns(const std::vector<const StringPattern*>& patterns); 32 33 // Unregisters the passed |patterns|. 34 void UnregisterPatterns(const std::vector<const StringPattern*>& patterns); 35 36 // Analogous to RegisterPatterns and UnregisterPatterns but executes both 37 // operations in one step, which is cheaper in the execution. 38 void RegisterAndUnregisterPatterns( 39 const std::vector<const StringPattern*>& to_register, 40 const std::vector<const StringPattern*>& to_unregister); 41 42 // Matches |text| against all registered StringPatterns. Stores the IDs 43 // of matching patterns in |matches|. |matches| is not cleared before adding 44 // to it. 45 bool Match(const std::string& text, 46 std::set<StringPattern::ID>* matches) const; 47 48 // Returns true if this object retains no allocated data. Only for debugging. 49 bool IsEmpty() const; 50 51 private: 52 // A node of an Aho Corasick Tree. This is implemented according to 53 // http://www.cs.uku.fi/~kilpelai/BSA05/lectures/slides04.pdf 54 // 55 // The algorithm is based on the idea of building a trie of all registered 56 // patterns. Each node of the tree is annotated with a set of pattern 57 // IDs that are used to report matches. 58 // 59 // The root of the trie represents an empty match. If we were looking whether 60 // any registered pattern matches a text at the beginning of the text (i.e. 61 // whether any pattern is a prefix of the text), we could just follow 62 // nodes in the trie according to the matching characters in the text. 63 // E.g., if text == "foobar", we would follow the trie from the root node 64 // to its child labeled 'f', from there to child 'o', etc. In this process we 65 // would report all pattern IDs associated with the trie nodes as matches. 66 // 67 // As we are not looking for all prefix matches but all substring matches, 68 // this algorithm would need to compare text.substr(0), text.substr(1), ... 69 // against the trie, which is in O(|text|^2). 70 // 71 // The Aho Corasick algorithm improves this runtime by using failure edges. 72 // In case we have found a partial match of length k in the text 73 // (text[i, ..., i + k - 1]) in the trie starting at the root and ending at 74 // a node at depth k, but cannot find a match in the trie for character 75 // text[i + k] at depth k + 1, we follow a failure edge. This edge 76 // corresponds to the longest proper suffix of text[i, ..., i + k - 1] that 77 // is a prefix of any registered pattern. 78 // 79 // If your brain thinks "Forget it, let's go shopping.", don't worry. 80 // Take a nap and read an introductory text on the Aho Corasick algorithm. 81 // It will make sense. Eventually. 82 class AhoCorasickNode { 83 public: 84 // Key: label of the edge, value: node index in |tree_| of parent class. 85 typedef std::map<char, uint32> Edges; 86 typedef std::set<StringPattern::ID> Matches; 87 88 static const uint32 kNoSuchEdge; // Represents an invalid node index. 89 90 AhoCorasickNode(); 91 ~AhoCorasickNode(); 92 AhoCorasickNode(const AhoCorasickNode& other); 93 AhoCorasickNode& operator=(const AhoCorasickNode& other); 94 95 uint32 GetEdge(char c) const; 96 void SetEdge(char c, uint32 node); edges()97 const Edges& edges() const { return edges_; } 98 failure()99 uint32 failure() const { return failure_; } set_failure(uint32 failure)100 void set_failure(uint32 failure) { failure_ = failure; } 101 102 void AddMatch(StringPattern::ID id); 103 void AddMatches(const Matches& matches); matches()104 const Matches& matches() const { return matches_; } 105 106 private: 107 // Outgoing edges of current node. 108 Edges edges_; 109 110 // Node index that failure edge leads to. 111 uint32 failure_; 112 113 // Identifiers of matches. 114 Matches matches_; 115 }; 116 117 typedef std::map<StringPattern::ID, const StringPattern*> SubstringPatternMap; 118 typedef std::vector<const StringPattern*> SubstringPatternVector; 119 120 // |sorted_patterns| is a copy of |patterns_| sorted by the pattern string. 121 void RebuildAhoCorasickTree(const SubstringPatternVector& sorted_patterns); 122 123 // Inserts a path for |pattern->pattern()| into the tree and adds 124 // |pattern->id()| to the set of matches. Ownership of |pattern| remains with 125 // the caller. 126 void InsertPatternIntoAhoCorasickTree(const StringPattern* pattern); 127 void CreateFailureEdges(); 128 129 // Set of all registered StringPatterns. Used to regenerate the 130 // Aho-Corasick tree in case patterns are registered or unregistered. 131 SubstringPatternMap patterns_; 132 133 // The nodes of a Aho-Corasick tree. 134 std::vector<AhoCorasickNode> tree_; 135 136 DISALLOW_COPY_AND_ASSIGN(SubstringSetMatcher); 137 }; 138 139 } // namespace url_matcher 140 141 #endif // COMPONENTS_URL_MATCHER_SUBSTRING_SET_MATCHER_H_ 142