• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "components/url_matcher/substring_set_matcher.h"
6 
7 #include <algorithm>
8 #include <queue>
9 
10 #include "base/logging.h"
11 #include "base/stl_util.h"
12 
13 namespace url_matcher {
14 
15 namespace {
16 
17 // Compare StringPattern instances based on their string patterns.
ComparePatterns(const StringPattern * a,const StringPattern * b)18 bool ComparePatterns(const StringPattern* a, const StringPattern* b) {
19   return a->pattern() < b->pattern();
20 }
21 
22 // Given the set of patterns, compute how many nodes will the corresponding
23 // Aho-Corasick tree have. Note that |patterns| need to be sorted.
TreeSize(const std::vector<const StringPattern * > & patterns)24 uint32 TreeSize(const std::vector<const StringPattern*>& patterns) {
25   uint32 result = 1u;  // 1 for the root node.
26   if (patterns.empty())
27     return result;
28 
29   std::vector<const StringPattern*>::const_iterator last = patterns.begin();
30   std::vector<const StringPattern*>::const_iterator current = last + 1;
31   // For the first pattern, each letter is a label of an edge to a new node.
32   result += (*last)->pattern().size();
33 
34   // For the subsequent patterns, only count the edges which were not counted
35   // yet. For this it suffices to test against the previous pattern, because the
36   // patterns are sorted.
37   for (; current != patterns.end(); ++last, ++current) {
38     const std::string& last_pattern = (*last)->pattern();
39     const std::string& current_pattern = (*current)->pattern();
40     const uint32 prefix_bound =
41         std::min(last_pattern.size(), current_pattern.size());
42 
43     uint32 common_prefix = 0;
44     while (common_prefix < prefix_bound &&
45            last_pattern[common_prefix] == current_pattern[common_prefix])
46       ++common_prefix;
47     result += current_pattern.size() - common_prefix;
48   }
49   return result;
50 }
51 
52 }  // namespace
53 
54 //
55 // SubstringSetMatcher
56 //
57 
SubstringSetMatcher()58 SubstringSetMatcher::SubstringSetMatcher() {
59   RebuildAhoCorasickTree(SubstringPatternVector());
60 }
61 
~SubstringSetMatcher()62 SubstringSetMatcher::~SubstringSetMatcher() {}
63 
RegisterPatterns(const std::vector<const StringPattern * > & patterns)64 void SubstringSetMatcher::RegisterPatterns(
65     const std::vector<const StringPattern*>& patterns) {
66   RegisterAndUnregisterPatterns(patterns,
67                                 std::vector<const StringPattern*>());
68 }
69 
UnregisterPatterns(const std::vector<const StringPattern * > & patterns)70 void SubstringSetMatcher::UnregisterPatterns(
71     const std::vector<const StringPattern*>& patterns) {
72   RegisterAndUnregisterPatterns(std::vector<const StringPattern*>(),
73                                 patterns);
74 }
75 
RegisterAndUnregisterPatterns(const std::vector<const StringPattern * > & to_register,const std::vector<const StringPattern * > & to_unregister)76 void SubstringSetMatcher::RegisterAndUnregisterPatterns(
77       const std::vector<const StringPattern*>& to_register,
78       const std::vector<const StringPattern*>& to_unregister) {
79   // Register patterns.
80   for (std::vector<const StringPattern*>::const_iterator i =
81       to_register.begin(); i != to_register.end(); ++i) {
82     DCHECK(patterns_.find((*i)->id()) == patterns_.end());
83     patterns_[(*i)->id()] = *i;
84   }
85 
86   // Unregister patterns
87   for (std::vector<const StringPattern*>::const_iterator i =
88       to_unregister.begin(); i != to_unregister.end(); ++i) {
89     patterns_.erase((*i)->id());
90   }
91 
92   // Now we compute the total number of tree nodes needed.
93   SubstringPatternVector sorted_patterns;
94   sorted_patterns.resize(patterns_.size());
95 
96   size_t next = 0;
97   for (SubstringPatternMap::const_iterator i = patterns_.begin();
98        i != patterns_.end();
99        ++i, ++next) {
100     sorted_patterns[next] = i->second;
101   }
102 
103   std::sort(sorted_patterns.begin(), sorted_patterns.end(), ComparePatterns);
104   tree_.reserve(TreeSize(sorted_patterns));
105 
106   RebuildAhoCorasickTree(sorted_patterns);
107 }
108 
Match(const std::string & text,std::set<StringPattern::ID> * matches) const109 bool SubstringSetMatcher::Match(const std::string& text,
110                                 std::set<StringPattern::ID>* matches) const {
111   const size_t old_number_of_matches = matches->size();
112 
113   // Handle patterns matching the empty string.
114   matches->insert(tree_[0].matches().begin(), tree_[0].matches().end());
115 
116   uint32 current_node = 0;
117   for (std::string::const_iterator i = text.begin(); i != text.end(); ++i) {
118     uint32 edge_from_current = tree_[current_node].GetEdge(*i);
119     while (edge_from_current == AhoCorasickNode::kNoSuchEdge &&
120            current_node != 0) {
121       current_node = tree_[current_node].failure();
122       edge_from_current = tree_[current_node].GetEdge(*i);
123     }
124     if (edge_from_current != AhoCorasickNode::kNoSuchEdge) {
125       current_node = edge_from_current;
126       matches->insert(tree_[current_node].matches().begin(),
127                       tree_[current_node].matches().end());
128     } else {
129       DCHECK_EQ(0u, current_node);
130     }
131   }
132 
133   return old_number_of_matches != matches->size();
134 }
135 
IsEmpty() const136 bool SubstringSetMatcher::IsEmpty() const {
137   // An empty tree consists of only the root node.
138   return patterns_.empty() && tree_.size() == 1u;
139 }
140 
RebuildAhoCorasickTree(const SubstringPatternVector & sorted_patterns)141 void SubstringSetMatcher::RebuildAhoCorasickTree(
142     const SubstringPatternVector& sorted_patterns) {
143   tree_.clear();
144 
145   // Initialize root note of tree.
146   AhoCorasickNode root;
147   root.set_failure(0);
148   tree_.push_back(root);
149 
150   // Insert all patterns.
151   for (SubstringPatternVector::const_iterator i = sorted_patterns.begin();
152        i != sorted_patterns.end();
153        ++i) {
154     InsertPatternIntoAhoCorasickTree(*i);
155   }
156 
157   CreateFailureEdges();
158 }
159 
InsertPatternIntoAhoCorasickTree(const StringPattern * pattern)160 void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree(
161     const StringPattern* pattern) {
162   const std::string& text = pattern->pattern();
163   const std::string::const_iterator text_end = text.end();
164 
165   // Iterators on the tree and the text.
166   uint32 current_node = 0;
167   std::string::const_iterator i = text.begin();
168 
169   // Follow existing paths for as long as possible.
170   while (i != text_end) {
171     uint32 edge_from_current = tree_[current_node].GetEdge(*i);
172     if (edge_from_current == AhoCorasickNode::kNoSuchEdge)
173       break;
174     current_node = edge_from_current;
175     ++i;
176   }
177 
178   // Create new nodes if necessary.
179   while (i != text_end) {
180     tree_.push_back(AhoCorasickNode());
181     tree_[current_node].SetEdge(*i, tree_.size() - 1);
182     current_node = tree_.size() - 1;
183     ++i;
184   }
185 
186   // Register match.
187   tree_[current_node].AddMatch(pattern->id());
188 }
189 
CreateFailureEdges()190 void SubstringSetMatcher::CreateFailureEdges() {
191   typedef AhoCorasickNode::Edges Edges;
192 
193   std::queue<uint32> queue;
194 
195   AhoCorasickNode& root = tree_[0];
196   root.set_failure(0);
197   const Edges& root_edges = root.edges();
198   for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end();
199        ++e) {
200     const uint32& leads_to = e->second;
201     tree_[leads_to].set_failure(0);
202     queue.push(leads_to);
203   }
204 
205   while (!queue.empty()) {
206     AhoCorasickNode& current_node = tree_[queue.front()];
207     queue.pop();
208     for (Edges::const_iterator e = current_node.edges().begin();
209          e != current_node.edges().end(); ++e) {
210       const char& edge_label = e->first;
211       const uint32& leads_to = e->second;
212       queue.push(leads_to);
213 
214       uint32 failure = current_node.failure();
215       uint32 edge_from_failure = tree_[failure].GetEdge(edge_label);
216       while (edge_from_failure == AhoCorasickNode::kNoSuchEdge &&
217              failure != 0) {
218         failure = tree_[failure].failure();
219         edge_from_failure = tree_[failure].GetEdge(edge_label);
220       }
221 
222       const uint32 follow_in_case_of_failure =
223           edge_from_failure != AhoCorasickNode::kNoSuchEdge
224               ? edge_from_failure
225               : 0;
226       tree_[leads_to].set_failure(follow_in_case_of_failure);
227       tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches());
228     }
229   }
230 }
231 
232 const uint32 SubstringSetMatcher::AhoCorasickNode::kNoSuchEdge = ~0;
233 
AhoCorasickNode()234 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode()
235     : failure_(kNoSuchEdge) {}
236 
~AhoCorasickNode()237 SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {}
238 
AhoCorasickNode(const SubstringSetMatcher::AhoCorasickNode & other)239 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode(
240     const SubstringSetMatcher::AhoCorasickNode& other)
241     : edges_(other.edges_),
242       failure_(other.failure_),
243       matches_(other.matches_) {}
244 
245 SubstringSetMatcher::AhoCorasickNode&
operator =(const SubstringSetMatcher::AhoCorasickNode & other)246 SubstringSetMatcher::AhoCorasickNode::operator=(
247     const SubstringSetMatcher::AhoCorasickNode& other) {
248   edges_ = other.edges_;
249   failure_ = other.failure_;
250   matches_ = other.matches_;
251   return *this;
252 }
253 
GetEdge(char c) const254 uint32 SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const {
255   Edges::const_iterator i = edges_.find(c);
256   return i == edges_.end() ? kNoSuchEdge : i->second;
257 }
258 
SetEdge(char c,uint32 node)259 void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, uint32 node) {
260   edges_[c] = node;
261 }
262 
AddMatch(StringPattern::ID id)263 void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) {
264   matches_.insert(id);
265 }
266 
AddMatches(const SubstringSetMatcher::AhoCorasickNode::Matches & matches)267 void SubstringSetMatcher::AhoCorasickNode::AddMatches(
268     const SubstringSetMatcher::AhoCorasickNode::Matches& matches) {
269   matches_.insert(matches.begin(), matches.end());
270 }
271 
272 }  // namespace url_matcher
273