1 // Copyright 2013 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifdef UNSAFE_BUFFERS_BUILD
6 // TODO(crbug.com/40284755): Remove this and spanify to fix the errors.
7 #pragma allow_unsafe_buffers
8 #endif
9
10 #include "base/substring_set_matcher/substring_set_matcher.h"
11
12 #include <stddef.h>
13
14 #include <algorithm>
15 #include <queue>
16
17 #ifdef __SSE2__
18 #include <immintrin.h>
19 #include "base/bits.h"
20 #endif
21
22 #include "base/check_op.h"
23 #include "base/containers/contains.h"
24 #include "base/containers/queue.h"
25 #include "base/numerics/checked_math.h"
26 #include "base/trace_event/memory_usage_estimator.h" // no-presubmit-check
27
28 namespace base {
29
30 namespace {
31
32 // Compare MatcherStringPattern instances based on their string patterns.
ComparePatterns(const MatcherStringPattern * a,const MatcherStringPattern * b)33 bool ComparePatterns(const MatcherStringPattern* a,
34 const MatcherStringPattern* b) {
35 return a->pattern() < b->pattern();
36 }
37
GetVectorOfPointers(const std::vector<MatcherStringPattern> & patterns)38 std::vector<const MatcherStringPattern*> GetVectorOfPointers(
39 const std::vector<MatcherStringPattern>& patterns) {
40 std::vector<const MatcherStringPattern*> pattern_pointers;
41 pattern_pointers.reserve(patterns.size());
42
43 for (const MatcherStringPattern& pattern : patterns)
44 pattern_pointers.push_back(&pattern);
45
46 return pattern_pointers;
47 }
48
49 } // namespace
50
Build(const std::vector<MatcherStringPattern> & patterns)51 bool SubstringSetMatcher::Build(
52 const std::vector<MatcherStringPattern>& patterns) {
53 return Build(GetVectorOfPointers(patterns));
54 }
55
Build(std::vector<const MatcherStringPattern * > patterns)56 bool SubstringSetMatcher::Build(
57 std::vector<const MatcherStringPattern*> patterns) {
58 // Ensure there are no duplicate IDs and all pattern strings are distinct.
59 #if DCHECK_IS_ON()
60 {
61 std::set<MatcherStringPattern::ID> ids;
62 std::set<std::string> pattern_strings;
63 for (const MatcherStringPattern* pattern : patterns) {
64 CHECK(!base::Contains(ids, pattern->id()));
65 CHECK(!base::Contains(pattern_strings, pattern->pattern()));
66 ids.insert(pattern->id());
67 pattern_strings.insert(pattern->pattern());
68 }
69 }
70 #endif
71
72 // Check that all the match labels fit into an edge.
73 for (const MatcherStringPattern* pattern : patterns) {
74 if (pattern->id() >= kInvalidNodeID) {
75 return false;
76 }
77 }
78
79 // Compute the total number of tree nodes needed.
80 std::sort(patterns.begin(), patterns.end(), ComparePatterns);
81 NodeID tree_size = GetTreeSize(patterns);
82 if (tree_size >= kInvalidNodeID) {
83 return false;
84 }
85 tree_.reserve(GetTreeSize(patterns));
86 BuildAhoCorasickTree(patterns);
87
88 // Sanity check that no new allocations happened in the tree and our computed
89 // size was correct.
90 DCHECK_EQ(tree_.size(), static_cast<size_t>(GetTreeSize(patterns)));
91
92 is_empty_ = patterns.empty() && tree_.size() == 1u;
93 return true;
94 }
95
96 SubstringSetMatcher::SubstringSetMatcher() = default;
97 SubstringSetMatcher::~SubstringSetMatcher() = default;
98
Match(const std::string & text,std::set<MatcherStringPattern::ID> * matches) const99 bool SubstringSetMatcher::Match(
100 const std::string& text,
101 std::set<MatcherStringPattern::ID>* matches) const {
102 const size_t old_number_of_matches = matches->size();
103
104 // Handle patterns matching the empty string.
105 const AhoCorasickNode* const root = &tree_[kRootID];
106 AccumulateMatchesForNode(root, matches);
107
108 const AhoCorasickNode* current_node = root;
109 for (const char c : text) {
110 NodeID child = current_node->GetEdge(static_cast<unsigned char>(c));
111
112 // If the child not can't be found, progressively iterate over the longest
113 // proper suffix of the string represented by the current node. In a sense
114 // we are pruning prefixes from the text.
115 while (child == kInvalidNodeID && current_node != root) {
116 current_node = &tree_[current_node->failure()];
117 child = current_node->GetEdge(static_cast<unsigned char>(c));
118 }
119
120 if (child != kInvalidNodeID) {
121 // The string represented by |child| is the longest possible suffix of the
122 // current position of |text| in the trie.
123 current_node = &tree_[child];
124 AccumulateMatchesForNode(current_node, matches);
125 } else {
126 // The empty string is the longest possible suffix of the current position
127 // of |text| in the trie.
128 DCHECK_EQ(root, current_node);
129 }
130 }
131
132 return old_number_of_matches != matches->size();
133 }
134
AnyMatch(const std::string & text) const135 bool SubstringSetMatcher::AnyMatch(const std::string& text) const {
136 // Handle patterns matching the empty string.
137 const AhoCorasickNode* const root = &tree_[kRootID];
138 if (root->has_outputs()) {
139 return true;
140 }
141
142 const AhoCorasickNode* current_node = root;
143 for (const char c : text) {
144 NodeID child = current_node->GetEdge(static_cast<unsigned char>(c));
145
146 // If the child not can't be found, progressively iterate over the longest
147 // proper suffix of the string represented by the current node. In a sense
148 // we are pruning prefixes from the text.
149 while (child == kInvalidNodeID && current_node != root) {
150 current_node = &tree_[current_node->failure()];
151 child = current_node->GetEdge(static_cast<unsigned char>(c));
152 }
153
154 if (child != kInvalidNodeID) {
155 // The string represented by |child| is the longest possible suffix of the
156 // current position of |text| in the trie.
157 current_node = &tree_[child];
158 if (current_node->has_outputs()) {
159 return true;
160 }
161 } else {
162 // The empty string is the longest possible suffix of the current position
163 // of |text| in the trie.
164 DCHECK_EQ(root, current_node);
165 }
166 }
167
168 return false;
169 }
170
EstimateMemoryUsage() const171 size_t SubstringSetMatcher::EstimateMemoryUsage() const {
172 return base::trace_event::EstimateMemoryUsage(tree_);
173 }
174
175 // static
176 constexpr SubstringSetMatcher::NodeID SubstringSetMatcher::kInvalidNodeID;
177 constexpr SubstringSetMatcher::NodeID SubstringSetMatcher::kRootID;
178
GetTreeSize(const std::vector<const MatcherStringPattern * > & patterns) const179 SubstringSetMatcher::NodeID SubstringSetMatcher::GetTreeSize(
180 const std::vector<const MatcherStringPattern*>& patterns) const {
181 DCHECK(std::is_sorted(patterns.begin(), patterns.end(), ComparePatterns));
182
183 base::CheckedNumeric<NodeID> result = 1u; // 1 for the root node.
184 if (patterns.empty())
185 return result.ValueOrDie();
186
187 auto last = patterns.begin();
188 auto current = last + 1;
189 // For the first pattern, each letter is a label of an edge to a new node.
190 result += (*last)->pattern().size();
191
192 // For the subsequent patterns, only count the edges which were not counted
193 // yet. For this it suffices to test against the previous pattern, because the
194 // patterns are sorted.
195 for (; current != patterns.end(); ++last, ++current) {
196 const std::string& last_pattern = (*last)->pattern();
197 const std::string& current_pattern = (*current)->pattern();
198 size_t prefix_bound = std::min(last_pattern.size(), current_pattern.size());
199
200 size_t common_prefix = 0;
201 while (common_prefix < prefix_bound &&
202 last_pattern[common_prefix] == current_pattern[common_prefix]) {
203 ++common_prefix;
204 }
205
206 result -= common_prefix;
207 result += current_pattern.size();
208 }
209
210 return result.ValueOrDie();
211 }
212
BuildAhoCorasickTree(const SubstringPatternVector & patterns)213 void SubstringSetMatcher::BuildAhoCorasickTree(
214 const SubstringPatternVector& patterns) {
215 DCHECK(tree_.empty());
216
217 // Initialize root node of tree.
218 tree_.emplace_back();
219
220 // Build the initial trie for all the patterns.
221 for (const MatcherStringPattern* pattern : patterns)
222 InsertPatternIntoAhoCorasickTree(pattern);
223
224 CreateFailureAndOutputEdges();
225 }
226
InsertPatternIntoAhoCorasickTree(const MatcherStringPattern * pattern)227 void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree(
228 const MatcherStringPattern* pattern) {
229 const std::string& text = pattern->pattern();
230 const std::string::const_iterator text_end = text.end();
231
232 // Iterators on the tree and the text.
233 AhoCorasickNode* current_node = &tree_[kRootID];
234 std::string::const_iterator i = text.begin();
235
236 // Follow existing paths for as long as possible.
237 while (i != text_end) {
238 NodeID child = current_node->GetEdge(static_cast<unsigned char>(*i));
239 if (child == kInvalidNodeID)
240 break;
241 current_node = &tree_[child];
242 ++i;
243 }
244
245 // Create new nodes if necessary.
246 while (i != text_end) {
247 tree_.emplace_back();
248 current_node->SetEdge(static_cast<unsigned char>(*i),
249 static_cast<NodeID>(tree_.size() - 1));
250 current_node = &tree_.back();
251 ++i;
252 }
253
254 // Register match.
255 current_node->SetMatchID(pattern->id());
256 }
257
CreateFailureAndOutputEdges()258 void SubstringSetMatcher::CreateFailureAndOutputEdges() {
259 base::queue<AhoCorasickNode*> queue;
260
261 // Initialize the failure edges for |root| and its children.
262 AhoCorasickNode* const root = &tree_[0];
263
264 root->SetOutputLink(kInvalidNodeID);
265
266 NodeID root_output_link = root->IsEndOfPattern() ? kRootID : kInvalidNodeID;
267
268 for (unsigned edge_idx = 0; edge_idx < root->num_edges(); ++edge_idx) {
269 const AhoCorasickEdge& edge = root->edges()[edge_idx];
270 if (edge.label >= kFirstSpecialLabel) {
271 continue;
272 }
273 AhoCorasickNode* child = &tree_[edge.node_id];
274 // Failure node is kept as the root.
275 child->SetOutputLink(root_output_link);
276 queue.push(child);
277 }
278
279 // Do a breadth first search over the trie to create failure edges. We
280 // maintain the invariant that any node in |queue| has had its |failure_| and
281 // |output_link_| edge already initialized.
282 while (!queue.empty()) {
283 AhoCorasickNode* current_node = queue.front();
284 queue.pop();
285
286 // Compute the failure and output edges of children using the failure edges
287 // of the current node.
288 for (unsigned edge_idx = 0; edge_idx < current_node->num_edges();
289 ++edge_idx) {
290 const AhoCorasickEdge& edge = current_node->edges()[edge_idx];
291 if (edge.label >= kFirstSpecialLabel) {
292 continue;
293 }
294 AhoCorasickNode* child = &tree_[edge.node_id];
295
296 const AhoCorasickNode* failure_candidate_parent =
297 &tree_[current_node->failure()];
298 NodeID failure_candidate_id =
299 failure_candidate_parent->GetEdge(edge.label);
300 while (failure_candidate_id == kInvalidNodeID &&
301 failure_candidate_parent != root) {
302 failure_candidate_parent = &tree_[failure_candidate_parent->failure()];
303 failure_candidate_id = failure_candidate_parent->GetEdge(edge.label);
304 }
305
306 if (failure_candidate_id == kInvalidNodeID) {
307 DCHECK_EQ(root, failure_candidate_parent);
308 // |failure_candidate| is invalid and we can't proceed further since we
309 // have reached the root. Hence the longest proper suffix of this string
310 // represented by this node is the empty string (represented by root).
311 failure_candidate_id = kRootID;
312 } else {
313 child->SetFailure(failure_candidate_id);
314 }
315
316 const AhoCorasickNode* failure_candidate = &tree_[failure_candidate_id];
317 // Now |failure_candidate| is |child|'s longest possible proper suffix in
318 // the trie. We also know that since we are doing a breadth first search,
319 // we would have established |failure_candidate|'s output link by now.
320 // Hence we can define |child|'s output link as follows:
321 child->SetOutputLink(failure_candidate->IsEndOfPattern()
322 ? failure_candidate_id
323 : failure_candidate->output_link());
324
325 queue.push(child);
326 }
327 }
328 }
329
AccumulateMatchesForNode(const AhoCorasickNode * node,std::set<MatcherStringPattern::ID> * matches) const330 void SubstringSetMatcher::AccumulateMatchesForNode(
331 const AhoCorasickNode* node,
332 std::set<MatcherStringPattern::ID>* matches) const {
333 DCHECK(matches);
334
335 if (!node->has_outputs()) {
336 // Fast reject.
337 return;
338 }
339 if (node->IsEndOfPattern())
340 matches->insert(node->GetMatchID());
341
342 NodeID node_id = node->output_link();
343 while (node_id != kInvalidNodeID) {
344 node = &tree_[node_id];
345 matches->insert(node->GetMatchID());
346 node_id = node->output_link();
347 }
348 }
349
AhoCorasickNode()350 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() {
351 static_assert(kNumInlineEdges == 2, "Code below needs updating");
352 edges_.inline_edges[0].label = kEmptyLabel;
353 edges_.inline_edges[1].label = kEmptyLabel;
354 }
355
~AhoCorasickNode()356 SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {
357 if (edges_capacity_ != 0) {
358 delete[] edges_.edges;
359 }
360 }
361
AhoCorasickNode(AhoCorasickNode && other)362 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode(AhoCorasickNode&& other) {
363 *this = std::move(other);
364 }
365
366 SubstringSetMatcher::AhoCorasickNode&
operator =(AhoCorasickNode && other)367 SubstringSetMatcher::AhoCorasickNode::operator=(AhoCorasickNode&& other) {
368 if (edges_capacity_ != 0) {
369 // Delete the old heap allocation if needed.
370 delete[] edges_.edges;
371 }
372 if (other.edges_capacity_ == 0) {
373 static_assert(kNumInlineEdges == 2, "Code below needs updating");
374 edges_.inline_edges[0] = other.edges_.inline_edges[0];
375 edges_.inline_edges[1] = other.edges_.inline_edges[1];
376 } else {
377 // Move over the heap allocation.
378 edges_.edges = other.edges_.edges;
379 other.edges_.edges = nullptr;
380 }
381 num_free_edges_ = other.num_free_edges_;
382 edges_capacity_ = other.edges_capacity_;
383 return *this;
384 }
385
386 SubstringSetMatcher::NodeID
GetEdgeNoInline(uint32_t label) const387 SubstringSetMatcher::AhoCorasickNode::GetEdgeNoInline(uint32_t label) const {
388 DCHECK(edges_capacity_ != 0);
389 #ifdef __SSE2__
390 const __m128i lbl = _mm_set1_epi32(static_cast<int>(label));
391 const __m128i mask = _mm_set1_epi32(0x1ff);
392 for (unsigned edge_idx = 0; edge_idx < num_edges(); edge_idx += 4) {
393 const __m128i four = _mm_loadu_si128(
394 reinterpret_cast<const __m128i*>(&edges_.edges[edge_idx]));
395 const __m128i match = _mm_cmpeq_epi32(_mm_and_si128(four, mask), lbl);
396 const uint32_t match_mask = static_cast<uint32_t>(_mm_movemask_epi8(match));
397 if (match_mask != 0) {
398 if (match_mask & 0x1u) {
399 return edges_.edges[edge_idx].node_id;
400 }
401 if (match_mask & 0x10u) {
402 return edges_.edges[edge_idx + 1].node_id;
403 }
404 if (match_mask & 0x100u) {
405 return edges_.edges[edge_idx + 2].node_id;
406 }
407 DCHECK(match_mask & 0x1000u);
408 return edges_.edges[edge_idx + 3].node_id;
409 }
410 }
411 #else
412 for (unsigned edge_idx = 0; edge_idx < num_edges(); ++edge_idx) {
413 const AhoCorasickEdge& edge = edges_.edges[edge_idx];
414 if (edge.label == label)
415 return edge.node_id;
416 }
417 #endif
418 return kInvalidNodeID;
419 }
420
SetEdge(uint32_t label,NodeID node)421 void SubstringSetMatcher::AhoCorasickNode::SetEdge(uint32_t label,
422 NodeID node) {
423 DCHECK_LT(node, kInvalidNodeID);
424
425 #if DCHECK_IS_ON()
426 // We don't support overwriting existing edges.
427 for (unsigned edge_idx = 0; edge_idx < num_edges(); ++edge_idx) {
428 DCHECK_NE(label, edges()[edge_idx].label);
429 }
430 #endif
431
432 if (edges_capacity_ == 0 && num_free_edges_ > 0) {
433 // Still space in the inline storage, so use that.
434 edges_.inline_edges[num_edges()] = AhoCorasickEdge{label, node};
435 if (label == kFailureNodeLabel) {
436 // Make sure that kFailureNodeLabel is first.
437 // NOTE: We don't use std::swap here, because the compiler doesn't
438 // understand that inline_edges[] is 4-aligned and can give
439 // a warning or error.
440 AhoCorasickEdge temp = edges_.inline_edges[0];
441 edges_.inline_edges[0] = edges_.inline_edges[num_edges()];
442 edges_.inline_edges[num_edges()] = temp;
443 }
444 --num_free_edges_;
445 return;
446 }
447
448 if (num_free_edges_ == 0) {
449 // We are out of space, so double our capacity (unless that would cause
450 // num_free_edges_ to overflow). This can either be because we are
451 // converting from inline to heap storage, or because we are increasing the
452 // size of our heap storage.
453 unsigned old_capacity =
454 edges_capacity_ == 0 ? kNumInlineEdges : edges_capacity_;
455 unsigned new_capacity = std::min(old_capacity * 2, kEmptyLabel + 1);
456 DCHECK_EQ(0u, new_capacity % 4);
457 AhoCorasickEdge* new_edges = new AhoCorasickEdge[new_capacity];
458 memcpy(new_edges, edges(), sizeof(AhoCorasickEdge) * old_capacity);
459 for (unsigned edge_idx = old_capacity; edge_idx < new_capacity;
460 ++edge_idx) {
461 new_edges[edge_idx].label = kEmptyLabel;
462 }
463 if (edges_capacity_ != 0) {
464 delete[] edges_.edges;
465 }
466 edges_.edges = new_edges;
467 // These casts are safe due to the DCHECK above.
468 edges_capacity_ = static_cast<uint16_t>(new_capacity);
469 num_free_edges_ = static_cast<uint8_t>(new_capacity - old_capacity);
470 }
471
472 // Insert the new edge at the end of our heap storage.
473 edges_.edges[num_edges()] = AhoCorasickEdge{label, node};
474 if (label == kFailureNodeLabel) {
475 // Make sure that kFailureNodeLabel is first.
476 std::swap(edges_.edges[0], edges_.edges[num_edges()]);
477 }
478 --num_free_edges_;
479 }
480
SetFailure(NodeID node)481 void SubstringSetMatcher::AhoCorasickNode::SetFailure(NodeID node) {
482 DCHECK_NE(kInvalidNodeID, node);
483 if (node != kRootID) {
484 SetEdge(kFailureNodeLabel, node);
485 }
486 }
487
EstimateMemoryUsage() const488 size_t SubstringSetMatcher::AhoCorasickNode::EstimateMemoryUsage() const {
489 if (edges_capacity_ == 0) {
490 return 0;
491 } else {
492 return base::trace_event::EstimateMemoryUsage(
493 base::span<const AhoCorasickEdge>(edges_.edges, edges_capacity_));
494 }
495 }
496
497 } // namespace base
498