• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2013 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifdef UNSAFE_BUFFERS_BUILD
6 // TODO(crbug.com/40284755): Remove this and spanify to fix the errors.
7 #pragma allow_unsafe_buffers
8 #endif
9 
10 #include "base/substring_set_matcher/substring_set_matcher.h"
11 
12 #include <stddef.h>
13 
14 #include <algorithm>
15 #include <queue>
16 
17 #ifdef __SSE2__
18 #include <immintrin.h>
19 #include "base/bits.h"
20 #endif
21 
22 #include "base/check_op.h"
23 #include "base/containers/contains.h"
24 #include "base/containers/queue.h"
25 #include "base/numerics/checked_math.h"
26 #include "base/trace_event/memory_usage_estimator.h"  // no-presubmit-check
27 
28 namespace base {
29 
30 namespace {
31 
32 // Compare MatcherStringPattern instances based on their string patterns.
ComparePatterns(const MatcherStringPattern * a,const MatcherStringPattern * b)33 bool ComparePatterns(const MatcherStringPattern* a,
34                      const MatcherStringPattern* b) {
35   return a->pattern() < b->pattern();
36 }
37 
GetVectorOfPointers(const std::vector<MatcherStringPattern> & patterns)38 std::vector<const MatcherStringPattern*> GetVectorOfPointers(
39     const std::vector<MatcherStringPattern>& patterns) {
40   std::vector<const MatcherStringPattern*> pattern_pointers;
41   pattern_pointers.reserve(patterns.size());
42 
43   for (const MatcherStringPattern& pattern : patterns)
44     pattern_pointers.push_back(&pattern);
45 
46   return pattern_pointers;
47 }
48 
49 }  // namespace
50 
Build(const std::vector<MatcherStringPattern> & patterns)51 bool SubstringSetMatcher::Build(
52     const std::vector<MatcherStringPattern>& patterns) {
53   return Build(GetVectorOfPointers(patterns));
54 }
55 
Build(std::vector<const MatcherStringPattern * > patterns)56 bool SubstringSetMatcher::Build(
57     std::vector<const MatcherStringPattern*> patterns) {
58   // Ensure there are no duplicate IDs and all pattern strings are distinct.
59 #if DCHECK_IS_ON()
60   {
61     std::set<MatcherStringPattern::ID> ids;
62     std::set<std::string> pattern_strings;
63     for (const MatcherStringPattern* pattern : patterns) {
64       CHECK(!base::Contains(ids, pattern->id()));
65       CHECK(!base::Contains(pattern_strings, pattern->pattern()));
66       ids.insert(pattern->id());
67       pattern_strings.insert(pattern->pattern());
68     }
69   }
70 #endif
71 
72   // Check that all the match labels fit into an edge.
73   for (const MatcherStringPattern* pattern : patterns) {
74     if (pattern->id() >= kInvalidNodeID) {
75       return false;
76     }
77   }
78 
79   // Compute the total number of tree nodes needed.
80   std::sort(patterns.begin(), patterns.end(), ComparePatterns);
81   NodeID tree_size = GetTreeSize(patterns);
82   if (tree_size >= kInvalidNodeID) {
83     return false;
84   }
85   tree_.reserve(GetTreeSize(patterns));
86   BuildAhoCorasickTree(patterns);
87 
88   // Sanity check that no new allocations happened in the tree and our computed
89   // size was correct.
90   DCHECK_EQ(tree_.size(), static_cast<size_t>(GetTreeSize(patterns)));
91 
92   is_empty_ = patterns.empty() && tree_.size() == 1u;
93   return true;
94 }
95 
96 SubstringSetMatcher::SubstringSetMatcher() = default;
97 SubstringSetMatcher::~SubstringSetMatcher() = default;
98 
Match(const std::string & text,std::set<MatcherStringPattern::ID> * matches) const99 bool SubstringSetMatcher::Match(
100     const std::string& text,
101     std::set<MatcherStringPattern::ID>* matches) const {
102   const size_t old_number_of_matches = matches->size();
103 
104   // Handle patterns matching the empty string.
105   const AhoCorasickNode* const root = &tree_[kRootID];
106   AccumulateMatchesForNode(root, matches);
107 
108   const AhoCorasickNode* current_node = root;
109   for (const char c : text) {
110     NodeID child = current_node->GetEdge(static_cast<unsigned char>(c));
111 
112     // If the child not can't be found, progressively iterate over the longest
113     // proper suffix of the string represented by the current node. In a sense
114     // we are pruning prefixes from the text.
115     while (child == kInvalidNodeID && current_node != root) {
116       current_node = &tree_[current_node->failure()];
117       child = current_node->GetEdge(static_cast<unsigned char>(c));
118     }
119 
120     if (child != kInvalidNodeID) {
121       // The string represented by |child| is the longest possible suffix of the
122       // current position of |text| in the trie.
123       current_node = &tree_[child];
124       AccumulateMatchesForNode(current_node, matches);
125     } else {
126       // The empty string is the longest possible suffix of the current position
127       // of |text| in the trie.
128       DCHECK_EQ(root, current_node);
129     }
130   }
131 
132   return old_number_of_matches != matches->size();
133 }
134 
AnyMatch(const std::string & text) const135 bool SubstringSetMatcher::AnyMatch(const std::string& text) const {
136   // Handle patterns matching the empty string.
137   const AhoCorasickNode* const root = &tree_[kRootID];
138   if (root->has_outputs()) {
139     return true;
140   }
141 
142   const AhoCorasickNode* current_node = root;
143   for (const char c : text) {
144     NodeID child = current_node->GetEdge(static_cast<unsigned char>(c));
145 
146     // If the child not can't be found, progressively iterate over the longest
147     // proper suffix of the string represented by the current node. In a sense
148     // we are pruning prefixes from the text.
149     while (child == kInvalidNodeID && current_node != root) {
150       current_node = &tree_[current_node->failure()];
151       child = current_node->GetEdge(static_cast<unsigned char>(c));
152     }
153 
154     if (child != kInvalidNodeID) {
155       // The string represented by |child| is the longest possible suffix of the
156       // current position of |text| in the trie.
157       current_node = &tree_[child];
158       if (current_node->has_outputs()) {
159         return true;
160       }
161     } else {
162       // The empty string is the longest possible suffix of the current position
163       // of |text| in the trie.
164       DCHECK_EQ(root, current_node);
165     }
166   }
167 
168   return false;
169 }
170 
EstimateMemoryUsage() const171 size_t SubstringSetMatcher::EstimateMemoryUsage() const {
172   return base::trace_event::EstimateMemoryUsage(tree_);
173 }
174 
175 // static
176 constexpr SubstringSetMatcher::NodeID SubstringSetMatcher::kInvalidNodeID;
177 constexpr SubstringSetMatcher::NodeID SubstringSetMatcher::kRootID;
178 
GetTreeSize(const std::vector<const MatcherStringPattern * > & patterns) const179 SubstringSetMatcher::NodeID SubstringSetMatcher::GetTreeSize(
180     const std::vector<const MatcherStringPattern*>& patterns) const {
181   DCHECK(std::is_sorted(patterns.begin(), patterns.end(), ComparePatterns));
182 
183   base::CheckedNumeric<NodeID> result = 1u;  // 1 for the root node.
184   if (patterns.empty())
185     return result.ValueOrDie();
186 
187   auto last = patterns.begin();
188   auto current = last + 1;
189   // For the first pattern, each letter is a label of an edge to a new node.
190   result += (*last)->pattern().size();
191 
192   // For the subsequent patterns, only count the edges which were not counted
193   // yet. For this it suffices to test against the previous pattern, because the
194   // patterns are sorted.
195   for (; current != patterns.end(); ++last, ++current) {
196     const std::string& last_pattern = (*last)->pattern();
197     const std::string& current_pattern = (*current)->pattern();
198     size_t prefix_bound = std::min(last_pattern.size(), current_pattern.size());
199 
200     size_t common_prefix = 0;
201     while (common_prefix < prefix_bound &&
202            last_pattern[common_prefix] == current_pattern[common_prefix]) {
203       ++common_prefix;
204     }
205 
206     result -= common_prefix;
207     result += current_pattern.size();
208   }
209 
210   return result.ValueOrDie();
211 }
212 
BuildAhoCorasickTree(const SubstringPatternVector & patterns)213 void SubstringSetMatcher::BuildAhoCorasickTree(
214     const SubstringPatternVector& patterns) {
215   DCHECK(tree_.empty());
216 
217   // Initialize root node of tree.
218   tree_.emplace_back();
219 
220   // Build the initial trie for all the patterns.
221   for (const MatcherStringPattern* pattern : patterns)
222     InsertPatternIntoAhoCorasickTree(pattern);
223 
224   CreateFailureAndOutputEdges();
225 }
226 
InsertPatternIntoAhoCorasickTree(const MatcherStringPattern * pattern)227 void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree(
228     const MatcherStringPattern* pattern) {
229   const std::string& text = pattern->pattern();
230   const std::string::const_iterator text_end = text.end();
231 
232   // Iterators on the tree and the text.
233   AhoCorasickNode* current_node = &tree_[kRootID];
234   std::string::const_iterator i = text.begin();
235 
236   // Follow existing paths for as long as possible.
237   while (i != text_end) {
238     NodeID child = current_node->GetEdge(static_cast<unsigned char>(*i));
239     if (child == kInvalidNodeID)
240       break;
241     current_node = &tree_[child];
242     ++i;
243   }
244 
245   // Create new nodes if necessary.
246   while (i != text_end) {
247     tree_.emplace_back();
248     current_node->SetEdge(static_cast<unsigned char>(*i),
249                           static_cast<NodeID>(tree_.size() - 1));
250     current_node = &tree_.back();
251     ++i;
252   }
253 
254   // Register match.
255   current_node->SetMatchID(pattern->id());
256 }
257 
CreateFailureAndOutputEdges()258 void SubstringSetMatcher::CreateFailureAndOutputEdges() {
259   base::queue<AhoCorasickNode*> queue;
260 
261   // Initialize the failure edges for |root| and its children.
262   AhoCorasickNode* const root = &tree_[0];
263 
264   root->SetOutputLink(kInvalidNodeID);
265 
266   NodeID root_output_link = root->IsEndOfPattern() ? kRootID : kInvalidNodeID;
267 
268   for (unsigned edge_idx = 0; edge_idx < root->num_edges(); ++edge_idx) {
269     const AhoCorasickEdge& edge = root->edges()[edge_idx];
270     if (edge.label >= kFirstSpecialLabel) {
271       continue;
272     }
273     AhoCorasickNode* child = &tree_[edge.node_id];
274     // Failure node is kept as the root.
275     child->SetOutputLink(root_output_link);
276     queue.push(child);
277   }
278 
279   // Do a breadth first search over the trie to create failure edges. We
280   // maintain the invariant that any node in |queue| has had its |failure_| and
281   // |output_link_| edge already initialized.
282   while (!queue.empty()) {
283     AhoCorasickNode* current_node = queue.front();
284     queue.pop();
285 
286     // Compute the failure and output edges of children using the failure edges
287     // of the current node.
288     for (unsigned edge_idx = 0; edge_idx < current_node->num_edges();
289          ++edge_idx) {
290       const AhoCorasickEdge& edge = current_node->edges()[edge_idx];
291       if (edge.label >= kFirstSpecialLabel) {
292         continue;
293       }
294       AhoCorasickNode* child = &tree_[edge.node_id];
295 
296       const AhoCorasickNode* failure_candidate_parent =
297           &tree_[current_node->failure()];
298       NodeID failure_candidate_id =
299           failure_candidate_parent->GetEdge(edge.label);
300       while (failure_candidate_id == kInvalidNodeID &&
301              failure_candidate_parent != root) {
302         failure_candidate_parent = &tree_[failure_candidate_parent->failure()];
303         failure_candidate_id = failure_candidate_parent->GetEdge(edge.label);
304       }
305 
306       if (failure_candidate_id == kInvalidNodeID) {
307         DCHECK_EQ(root, failure_candidate_parent);
308         // |failure_candidate| is invalid and we can't proceed further since we
309         // have reached the root. Hence the longest proper suffix of this string
310         // represented by this node is the empty string (represented by root).
311         failure_candidate_id = kRootID;
312       } else {
313         child->SetFailure(failure_candidate_id);
314       }
315 
316       const AhoCorasickNode* failure_candidate = &tree_[failure_candidate_id];
317       // Now |failure_candidate| is |child|'s longest possible proper suffix in
318       // the trie. We also know that since we are doing a breadth first search,
319       // we would have established |failure_candidate|'s output link by now.
320       // Hence we can define |child|'s output link as follows:
321       child->SetOutputLink(failure_candidate->IsEndOfPattern()
322                                ? failure_candidate_id
323                                : failure_candidate->output_link());
324 
325       queue.push(child);
326     }
327   }
328 }
329 
AccumulateMatchesForNode(const AhoCorasickNode * node,std::set<MatcherStringPattern::ID> * matches) const330 void SubstringSetMatcher::AccumulateMatchesForNode(
331     const AhoCorasickNode* node,
332     std::set<MatcherStringPattern::ID>* matches) const {
333   DCHECK(matches);
334 
335   if (!node->has_outputs()) {
336     // Fast reject.
337     return;
338   }
339   if (node->IsEndOfPattern())
340     matches->insert(node->GetMatchID());
341 
342   NodeID node_id = node->output_link();
343   while (node_id != kInvalidNodeID) {
344     node = &tree_[node_id];
345     matches->insert(node->GetMatchID());
346     node_id = node->output_link();
347   }
348 }
349 
AhoCorasickNode()350 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() {
351   static_assert(kNumInlineEdges == 2, "Code below needs updating");
352   edges_.inline_edges[0].label = kEmptyLabel;
353   edges_.inline_edges[1].label = kEmptyLabel;
354 }
355 
~AhoCorasickNode()356 SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {
357   if (edges_capacity_ != 0) {
358     delete[] edges_.edges;
359   }
360 }
361 
AhoCorasickNode(AhoCorasickNode && other)362 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode(AhoCorasickNode&& other) {
363   *this = std::move(other);
364 }
365 
366 SubstringSetMatcher::AhoCorasickNode&
operator =(AhoCorasickNode && other)367 SubstringSetMatcher::AhoCorasickNode::operator=(AhoCorasickNode&& other) {
368   if (edges_capacity_ != 0) {
369     // Delete the old heap allocation if needed.
370     delete[] edges_.edges;
371   }
372   if (other.edges_capacity_ == 0) {
373     static_assert(kNumInlineEdges == 2, "Code below needs updating");
374     edges_.inline_edges[0] = other.edges_.inline_edges[0];
375     edges_.inline_edges[1] = other.edges_.inline_edges[1];
376   } else {
377     // Move over the heap allocation.
378     edges_.edges = other.edges_.edges;
379     other.edges_.edges = nullptr;
380   }
381   num_free_edges_ = other.num_free_edges_;
382   edges_capacity_ = other.edges_capacity_;
383   return *this;
384 }
385 
386 SubstringSetMatcher::NodeID
GetEdgeNoInline(uint32_t label) const387 SubstringSetMatcher::AhoCorasickNode::GetEdgeNoInline(uint32_t label) const {
388   DCHECK(edges_capacity_ != 0);
389 #ifdef __SSE2__
390   const __m128i lbl = _mm_set1_epi32(static_cast<int>(label));
391   const __m128i mask = _mm_set1_epi32(0x1ff);
392   for (unsigned edge_idx = 0; edge_idx < num_edges(); edge_idx += 4) {
393     const __m128i four = _mm_loadu_si128(
394         reinterpret_cast<const __m128i*>(&edges_.edges[edge_idx]));
395     const __m128i match = _mm_cmpeq_epi32(_mm_and_si128(four, mask), lbl);
396     const uint32_t match_mask = static_cast<uint32_t>(_mm_movemask_epi8(match));
397     if (match_mask != 0) {
398       if (match_mask & 0x1u) {
399         return edges_.edges[edge_idx].node_id;
400       }
401       if (match_mask & 0x10u) {
402         return edges_.edges[edge_idx + 1].node_id;
403       }
404       if (match_mask & 0x100u) {
405         return edges_.edges[edge_idx + 2].node_id;
406       }
407       DCHECK(match_mask & 0x1000u);
408       return edges_.edges[edge_idx + 3].node_id;
409     }
410   }
411 #else
412   for (unsigned edge_idx = 0; edge_idx < num_edges(); ++edge_idx) {
413     const AhoCorasickEdge& edge = edges_.edges[edge_idx];
414     if (edge.label == label)
415       return edge.node_id;
416   }
417 #endif
418   return kInvalidNodeID;
419 }
420 
SetEdge(uint32_t label,NodeID node)421 void SubstringSetMatcher::AhoCorasickNode::SetEdge(uint32_t label,
422                                                    NodeID node) {
423   DCHECK_LT(node, kInvalidNodeID);
424 
425 #if DCHECK_IS_ON()
426   // We don't support overwriting existing edges.
427   for (unsigned edge_idx = 0; edge_idx < num_edges(); ++edge_idx) {
428     DCHECK_NE(label, edges()[edge_idx].label);
429   }
430 #endif
431 
432   if (edges_capacity_ == 0 && num_free_edges_ > 0) {
433     // Still space in the inline storage, so use that.
434     edges_.inline_edges[num_edges()] = AhoCorasickEdge{label, node};
435     if (label == kFailureNodeLabel) {
436       // Make sure that kFailureNodeLabel is first.
437       // NOTE: We don't use std::swap here, because the compiler doesn't
438       // understand that inline_edges[] is 4-aligned and can give
439       // a warning or error.
440       AhoCorasickEdge temp = edges_.inline_edges[0];
441       edges_.inline_edges[0] = edges_.inline_edges[num_edges()];
442       edges_.inline_edges[num_edges()] = temp;
443     }
444     --num_free_edges_;
445     return;
446   }
447 
448   if (num_free_edges_ == 0) {
449     // We are out of space, so double our capacity (unless that would cause
450     // num_free_edges_ to overflow). This can either be because we are
451     // converting from inline to heap storage, or because we are increasing the
452     // size of our heap storage.
453     unsigned old_capacity =
454         edges_capacity_ == 0 ? kNumInlineEdges : edges_capacity_;
455     unsigned new_capacity = std::min(old_capacity * 2, kEmptyLabel + 1);
456     DCHECK_EQ(0u, new_capacity % 4);
457     AhoCorasickEdge* new_edges = new AhoCorasickEdge[new_capacity];
458     memcpy(new_edges, edges(), sizeof(AhoCorasickEdge) * old_capacity);
459     for (unsigned edge_idx = old_capacity; edge_idx < new_capacity;
460          ++edge_idx) {
461       new_edges[edge_idx].label = kEmptyLabel;
462     }
463     if (edges_capacity_ != 0) {
464       delete[] edges_.edges;
465     }
466     edges_.edges = new_edges;
467     // These casts are safe due to the DCHECK above.
468     edges_capacity_ = static_cast<uint16_t>(new_capacity);
469     num_free_edges_ = static_cast<uint8_t>(new_capacity - old_capacity);
470   }
471 
472   // Insert the new edge at the end of our heap storage.
473   edges_.edges[num_edges()] = AhoCorasickEdge{label, node};
474   if (label == kFailureNodeLabel) {
475     // Make sure that kFailureNodeLabel is first.
476     std::swap(edges_.edges[0], edges_.edges[num_edges()]);
477   }
478   --num_free_edges_;
479 }
480 
SetFailure(NodeID node)481 void SubstringSetMatcher::AhoCorasickNode::SetFailure(NodeID node) {
482   DCHECK_NE(kInvalidNodeID, node);
483   if (node != kRootID) {
484     SetEdge(kFailureNodeLabel, node);
485   }
486 }
487 
EstimateMemoryUsage() const488 size_t SubstringSetMatcher::AhoCorasickNode::EstimateMemoryUsage() const {
489   if (edges_capacity_ == 0) {
490     return 0;
491   } else {
492     return base::trace_event::EstimateMemoryUsage(
493         base::span<const AhoCorasickEdge>(edges_.edges, edges_capacity_));
494   }
495 }
496 
497 }  // namespace base
498