• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2013 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifdef UNSAFE_BUFFERS_BUILD
6 // TODO(crbug.com/40284755): Remove this and spanify to fix the errors.
7 #pragma allow_unsafe_buffers
8 #endif
9 
10 #ifndef BASE_SUBSTRING_SET_MATCHER_SUBSTRING_SET_MATCHER_H_
11 #define BASE_SUBSTRING_SET_MATCHER_SUBSTRING_SET_MATCHER_H_
12 
13 #include <stdint.h>
14 
15 #include <limits>
16 #include <set>
17 #include <string>
18 #include <vector>
19 
20 #include "base/base_export.h"
21 #include "base/check_op.h"
22 #include "base/memory/raw_ptr_exclusion.h"
23 #include "base/substring_set_matcher/matcher_string_pattern.h"
24 
25 namespace base {
26 
27 // Class that store a set of string patterns and can find for a string S,
28 // which string patterns occur in S.
29 class BASE_EXPORT SubstringSetMatcher {
30  public:
31   SubstringSetMatcher();
32   SubstringSetMatcher(const SubstringSetMatcher&) = delete;
33   SubstringSetMatcher& operator=(const SubstringSetMatcher&) = delete;
34 
35   ~SubstringSetMatcher();
36 
37   // Registers all |patterns|. Each pattern needs to have a unique ID and all
38   // pattern strings must be unique. Build() should be called exactly once
39   // (before it is called, the tree is empty).
40   //
41   // Complexity:
42   //    Let n = number of patterns.
43   //    Let S = sum of pattern lengths.
44   //    Let k = range of char. Generally 256.
45   // Complexity = O(nlogn + S * logk)
46   // nlogn comes from sorting the patterns.
47   // log(k) comes from our usage of std::map to store edges.
48   //
49   // Returns true on success (may fail if e.g. if the tree gets too many nodes).
50   bool Build(const std::vector<MatcherStringPattern>& patterns);
51   bool Build(std::vector<const MatcherStringPattern*> patterns);
52 
53   // Matches |text| against all registered MatcherStringPatterns. Stores the IDs
54   // of matching patterns in |matches|. |matches| is not cleared before adding
55   // to it.
56   // Complexity:
57   //    Let t = length of |text|.
58   //    Let k = range of char. Generally 256.
59   //    Let z = number of matches returned.
60   // Complexity = O(t * logk + zlogz)
61   bool Match(const std::string& text,
62              std::set<MatcherStringPattern::ID>* matches) const;
63 
64   // As Match(), except it returns immediately on the first match.
65   // This allows true/false matching to be done without any dynamic
66   // memory allocation.
67   // Complexity = O(t * logk)
68   bool AnyMatch(const std::string& text) const;
69 
70   // Returns true if this object retains no allocated data.
IsEmpty()71   bool IsEmpty() const { return is_empty_; }
72 
73   // Returns the dynamically allocated memory usage in bytes. See
74   // base/trace_event/memory_usage_estimator.h for details.
75   size_t EstimateMemoryUsage() const;
76 
77  private:
78   // Represents the index of the node within |tree_|. It is specifically
79   // uint32_t so that we can be sure it takes up 4 bytes when stored together
80   // with the 9-bit label (so 23 bits are allocated to the NodeID, even though
81   // it is exposed as uint32_t). If the computed size of |tree_| is
82   // larger than what can be stored within 23 bits, Build() will fail.
83   using NodeID = uint32_t;
84 
85   // This is the maximum possible size of |tree_| and hence can't be a valid ID.
86   static constexpr NodeID kInvalidNodeID = (1u << 23) - 1;
87 
88   static constexpr NodeID kRootID = 0;
89 
90   // A node of an Aho Corasick Tree. See
91   // http://web.stanford.edu/class/archive/cs/cs166/cs166.1166/lectures/02/Small02.pdf
92   // to understand the algorithm.
93   //
94   // The algorithm is based on the idea of building a trie of all registered
95   // patterns. Each node of the tree is annotated with a set of pattern
96   // IDs that are used to report matches.
97   //
98   // The root of the trie represents an empty match. If we were looking whether
99   // any registered pattern matches a text at the beginning of the text (i.e.
100   // whether any pattern is a prefix of the text), we could just follow
101   // nodes in the trie according to the matching characters in the text.
102   // E.g., if text == "foobar", we would follow the trie from the root node
103   // to its child labeled 'f', from there to child 'o', etc. In this process we
104   // would report all pattern IDs associated with the trie nodes as matches.
105   //
106   // As we are not looking for all prefix matches but all substring matches,
107   // this algorithm would need to compare text.substr(0), text.substr(1), ...
108   // against the trie, which is in O(|text|^2).
109   //
110   // The Aho Corasick algorithm improves this runtime by using failure edges.
111   // In case we have found a partial match of length k in the text
112   // (text[i, ..., i + k - 1]) in the trie starting at the root and ending at
113   // a node at depth k, but cannot find a match in the trie for character
114   // text[i + k] at depth k + 1, we follow a failure edge. This edge
115   // corresponds to the longest proper suffix of text[i, ..., i + k - 1] that
116   // is a prefix of any registered pattern.
117   //
118   // If your brain thinks "Forget it, let's go shopping.", don't worry.
119   // Take a nap and read an introductory text on the Aho Corasick algorithm.
120   // It will make sense. Eventually.
121 
122   // An edge internal to the tree. We pack the label (character we are
123   // matching on) and the destination node ID into 32 bits, to save memory.
124   // We also use these edges as a sort of generic key/value store for
125   // some special values that not all nodes will have; this also saves on
126   // memory over the otherwise obvious choice of having them as struct fields,
127   // as it means we do not to store them when they are not present.
128   struct AhoCorasickEdge {
129     // char (unsigned, so [0..255]), or a special label below.
130     uint32_t label : 9;
131     NodeID node_id : 23;
132   };
133 
134   // Node index that failure edge leads to. The failure node corresponds to
135   // the node which represents the longest proper suffix (include empty
136   // string) of the string represented by this node. Not stored if it is
137   // equal to kRootID (since that is the most common value).
138   //
139   // NOTE: Assigning |root| as the failure edge for itself doesn't strictly
140   // abide by the definition of "proper" suffix. The proper suffix of an empty
141   // string should probably be defined as null, but we assign it to the |root|
142   // to simplify the code and have the invariant that the failure edge is always
143   // defined.
144   static constexpr uint32_t kFailureNodeLabel = 0x100;
145   static constexpr uint32_t kFirstSpecialLabel = kFailureNodeLabel;
146 
147   // Node index that corresponds to the longest proper suffix (including empty
148   // suffix) of this node and which also represents the end of a pattern.
149   // Does not have to exist.
150   static constexpr uint32_t kOutputLinkLabel = 0x101;
151 
152   // If present, this node represents the end of a pattern. It stores the ID of
153   // the corresponding pattern (ie., it is not really a NodeID, but a
154   // MatcherStringPattern::ID).
155   static constexpr uint32_t kMatchIDLabel = 0x102;
156 
157   // Used for uninitialized label slots; used so that we do not have to test for
158   // them in other ways, since we know the data will be initialized and never
159   // match any other labels.
160   static constexpr uint32_t kEmptyLabel = 0x103;
161 
162   // A node in the trie, packed tightly together so that it occupies 12 bytes
163   // (both on 32- and 64-bit platforms), but aligned to at least 4 (see the
164   // comment on edges_).
165   class alignas(AhoCorasickEdge) AhoCorasickNode {
166    public:
167     AhoCorasickNode();
168     ~AhoCorasickNode();
169     AhoCorasickNode(AhoCorasickNode&& other);
170     AhoCorasickNode& operator=(AhoCorasickNode&& other);
171 
GetEdge(uint32_t label)172     NodeID GetEdge(uint32_t label) const {
173       if (edges_capacity_ != 0) {
174         return GetEdgeNoInline(label);
175       }
176       static_assert(kNumInlineEdges == 2, "Code below needs updating");
177       if (edges_.inline_edges[0].label == label) {
178         return edges_.inline_edges[0].node_id;
179       }
180       if (edges_.inline_edges[1].label == label) {
181         return edges_.inline_edges[1].node_id;
182       }
183       return kInvalidNodeID;
184     }
185     NodeID GetEdgeNoInline(uint32_t label) const;
186     void SetEdge(uint32_t label, NodeID node);
edges()187     const AhoCorasickEdge* edges() const {
188       // NOTE: Returning edges_.inline_edges here is fine, because it's
189       // the first thing in the struct (see the comment on edges_).
190       DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(edges_.inline_edges) %
191                         alignof(AhoCorasickEdge));
192       return edges_capacity_ == 0 ? edges_.inline_edges : edges_.edges;
193     }
194 
failure()195     NodeID failure() const {
196       // NOTE: Even if num_edges_ == 0, we are not doing anything
197       // undefined, as we will have room for at least two edges
198       // and empty edges are set to kEmptyLabel.
199       const AhoCorasickEdge& first_edge = *edges();
200       if (first_edge.label == kFailureNodeLabel) {
201         return first_edge.node_id;
202       } else {
203         return kRootID;
204       }
205     }
206     void SetFailure(NodeID failure);
207 
SetMatchID(MatcherStringPattern::ID id)208     void SetMatchID(MatcherStringPattern::ID id) {
209       DCHECK(!IsEndOfPattern());
210       DCHECK(id < kInvalidNodeID);  // This is enforced by Build().
211       SetEdge(kMatchIDLabel, static_cast<NodeID>(id));
212       has_outputs_ = true;
213     }
214 
215     // Returns true if this node corresponds to a pattern.
IsEndOfPattern()216     bool IsEndOfPattern() const {
217       if (!has_outputs_) {
218         // Fast reject.
219         return false;
220       }
221       return GetEdge(kMatchIDLabel) != kInvalidNodeID;
222     }
223 
224     // Must only be called if |IsEndOfPattern| returns true for this node.
GetMatchID()225     MatcherStringPattern::ID GetMatchID() const {
226       DCHECK(IsEndOfPattern());
227       return GetEdge(kMatchIDLabel);
228     }
229 
SetOutputLink(NodeID node)230     void SetOutputLink(NodeID node) {
231       if (node != kInvalidNodeID) {
232         SetEdge(kOutputLinkLabel, node);
233         has_outputs_ = true;
234       }
235     }
output_link()236     NodeID output_link() const { return GetEdge(kOutputLinkLabel); }
237 
238     size_t EstimateMemoryUsage() const;
num_edges()239     size_t num_edges() const {
240       if (edges_capacity_ == 0) {
241         return kNumInlineEdges - num_free_edges_;
242       } else {
243         return edges_capacity_ - num_free_edges_;
244       }
245     }
246 
has_outputs()247     bool has_outputs() const { return has_outputs_; }
248 
249    private:
250     // Outgoing edges of current node, including failure edge and output links.
251     // Most nodes have only one or two (or even zero) edges, not the last
252     // because many of them are leaves. Thus, we make an optimization for this
253     // common case; instead of a pointer to an edge array on the heap, we can
254     // pack two edges inline where the pointer would otherwise be. This reduces
255     // memory usage dramatically, as well as saving us a cache-line fetch.
256     //
257     // Note that even though most nodes have fewer outgoing edges, most nodes
258     // that we actually traverse will have any of them. This apparent
259     // contradiction is because we tend to spend more of our time near the root
260     // of the trie, where it is wide. This means that another layout would be
261     // possible: If we wanted to, non-inline nodes could simply store an array
262     // of 259 (256 possible characters plus the three special label types)
263     // edges, indexed directly by label type. This would use 20–50% more RAM,
264     // but also increases the speed of lookups due to removing the search loop.
265     //
266     // The nodes are generally unordered; since we typically index text, even
267     // the root will rarely be more than 20–30 wide, and at that point, it's
268     // better to just do a linear search than a binary one (which fares poorly
269     // on branch predictors). However, a special case, we put kFailureNodeLabel
270     // in the first slot if it exists (ie., is not equal to kRootID), since we
271     // need to access that label during every single node we look at during
272     // traversal.
273     //
274     // NOTE: Keep this the first member in the struct, so that inline_edges gets
275     // 4-aligned (since the class is marked as such, despite being packed.
276     // Otherwise, edges() can return an unaligned pointer marked as aligned
277     // (the unalignedness gets lost).
278     static constexpr int kNumInlineEdges = 2;
279     union {
280       // Out-of-line edge storage, having room for edges_capacity_ elements.
281       // Note that due to __attribute__((packed)) below, this pointer may be
282       // unaligned on 64-bit platforms, causing slightly less efficient
283       // access to it in some cases.
284       // This field is not a raw_ptr<> because it was filtered by the rewriter
285       // for: #union
286       RAW_PTR_EXCLUSION AhoCorasickEdge* edges;
287 
288       // Inline edge storage, used if edges_capacity_ == 0.
289       AhoCorasickEdge inline_edges[kNumInlineEdges];
290     } edges_;
291 
292     // Whether we have an edge for kMatchIDLabel or kOutputLinkLabel,
293     // ie., hitting this node during traversal will create one or more
294     // matches. This is redundant, but since every single lookup during
295     // traversal needs this, it saves a few searches for us.
296     bool has_outputs_ = false;
297 
298     // Number of unused left in edges_. Edges are always allocated from the
299     // beginning and never deleted; those after num_edges_ will be marked with
300     // kEmptyLabel (and have an undefined node_id). We store the number of
301     // free edges instead of the more common number of _used_ edges, to be
302     // sure that we are able to fit it in an uint8_t. num_edges() provides
303     // a useful abstraction over this.
304     uint8_t num_free_edges_ = kNumInlineEdges;
305 
306     // How many edges we have allocated room for (can never be more than
307     // kEmptyLabel + 1). If equal to zero, we are not using heap storage,
308     // but instead are using inline_edges.
309     //
310     // If not equal to zero, will be a multiple of 4, so that we can use
311     // SIMD to accelerate looking for edges.
312     uint16_t edges_capacity_ = 0;
313   } __attribute__((packed));
314 
315   using SubstringPatternVector = std::vector<const MatcherStringPattern*>;
316 
317   // Given the set of patterns, compute how many nodes will the corresponding
318   // Aho-Corasick tree have. Note that |patterns| need to be sorted.
319   NodeID GetTreeSize(
320       const std::vector<const MatcherStringPattern*>& patterns) const;
321 
322   void BuildAhoCorasickTree(const SubstringPatternVector& patterns);
323 
324   // Inserts a path for |pattern->pattern()| into the tree and adds
325   // |pattern->id()| to the set of matches.
326   void InsertPatternIntoAhoCorasickTree(const MatcherStringPattern* pattern);
327 
328   void CreateFailureAndOutputEdges();
329 
330   // Adds all pattern IDs to |matches| which are a suffix of the string
331   // represented by |node|.
332   void AccumulateMatchesForNode(
333       const AhoCorasickNode* node,
334       std::set<MatcherStringPattern::ID>* matches) const;
335 
336   // The nodes of a Aho-Corasick tree.
337   std::vector<AhoCorasickNode> tree_;
338 
339   bool is_empty_ = true;
340 };
341 
342 }  // namespace base
343 
344 #endif  // BASE_SUBSTRING_SET_MATCHER_SUBSTRING_SET_MATCHER_H_
345