1 // Copyright 2013 The Chromium Authors 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifdef UNSAFE_BUFFERS_BUILD 6 // TODO(crbug.com/40284755): Remove this and spanify to fix the errors. 7 #pragma allow_unsafe_buffers 8 #endif 9 10 #ifndef BASE_SUBSTRING_SET_MATCHER_SUBSTRING_SET_MATCHER_H_ 11 #define BASE_SUBSTRING_SET_MATCHER_SUBSTRING_SET_MATCHER_H_ 12 13 #include <stdint.h> 14 15 #include <limits> 16 #include <set> 17 #include <string> 18 #include <vector> 19 20 #include "base/base_export.h" 21 #include "base/check_op.h" 22 #include "base/memory/raw_ptr_exclusion.h" 23 #include "base/substring_set_matcher/matcher_string_pattern.h" 24 25 namespace base { 26 27 // Class that store a set of string patterns and can find for a string S, 28 // which string patterns occur in S. 29 class BASE_EXPORT SubstringSetMatcher { 30 public: 31 SubstringSetMatcher(); 32 SubstringSetMatcher(const SubstringSetMatcher&) = delete; 33 SubstringSetMatcher& operator=(const SubstringSetMatcher&) = delete; 34 35 ~SubstringSetMatcher(); 36 37 // Registers all |patterns|. Each pattern needs to have a unique ID and all 38 // pattern strings must be unique. Build() should be called exactly once 39 // (before it is called, the tree is empty). 40 // 41 // Complexity: 42 // Let n = number of patterns. 43 // Let S = sum of pattern lengths. 44 // Let k = range of char. Generally 256. 45 // Complexity = O(nlogn + S * logk) 46 // nlogn comes from sorting the patterns. 47 // log(k) comes from our usage of std::map to store edges. 48 // 49 // Returns true on success (may fail if e.g. if the tree gets too many nodes). 50 bool Build(const std::vector<MatcherStringPattern>& patterns); 51 bool Build(std::vector<const MatcherStringPattern*> patterns); 52 53 // Matches |text| against all registered MatcherStringPatterns. Stores the IDs 54 // of matching patterns in |matches|. |matches| is not cleared before adding 55 // to it. 56 // Complexity: 57 // Let t = length of |text|. 58 // Let k = range of char. Generally 256. 59 // Let z = number of matches returned. 60 // Complexity = O(t * logk + zlogz) 61 bool Match(const std::string& text, 62 std::set<MatcherStringPattern::ID>* matches) const; 63 64 // As Match(), except it returns immediately on the first match. 65 // This allows true/false matching to be done without any dynamic 66 // memory allocation. 67 // Complexity = O(t * logk) 68 bool AnyMatch(const std::string& text) const; 69 70 // Returns true if this object retains no allocated data. IsEmpty()71 bool IsEmpty() const { return is_empty_; } 72 73 // Returns the dynamically allocated memory usage in bytes. See 74 // base/trace_event/memory_usage_estimator.h for details. 75 size_t EstimateMemoryUsage() const; 76 77 private: 78 // Represents the index of the node within |tree_|. It is specifically 79 // uint32_t so that we can be sure it takes up 4 bytes when stored together 80 // with the 9-bit label (so 23 bits are allocated to the NodeID, even though 81 // it is exposed as uint32_t). If the computed size of |tree_| is 82 // larger than what can be stored within 23 bits, Build() will fail. 83 using NodeID = uint32_t; 84 85 // This is the maximum possible size of |tree_| and hence can't be a valid ID. 86 static constexpr NodeID kInvalidNodeID = (1u << 23) - 1; 87 88 static constexpr NodeID kRootID = 0; 89 90 // A node of an Aho Corasick Tree. See 91 // http://web.stanford.edu/class/archive/cs/cs166/cs166.1166/lectures/02/Small02.pdf 92 // to understand the algorithm. 93 // 94 // The algorithm is based on the idea of building a trie of all registered 95 // patterns. Each node of the tree is annotated with a set of pattern 96 // IDs that are used to report matches. 97 // 98 // The root of the trie represents an empty match. If we were looking whether 99 // any registered pattern matches a text at the beginning of the text (i.e. 100 // whether any pattern is a prefix of the text), we could just follow 101 // nodes in the trie according to the matching characters in the text. 102 // E.g., if text == "foobar", we would follow the trie from the root node 103 // to its child labeled 'f', from there to child 'o', etc. In this process we 104 // would report all pattern IDs associated with the trie nodes as matches. 105 // 106 // As we are not looking for all prefix matches but all substring matches, 107 // this algorithm would need to compare text.substr(0), text.substr(1), ... 108 // against the trie, which is in O(|text|^2). 109 // 110 // The Aho Corasick algorithm improves this runtime by using failure edges. 111 // In case we have found a partial match of length k in the text 112 // (text[i, ..., i + k - 1]) in the trie starting at the root and ending at 113 // a node at depth k, but cannot find a match in the trie for character 114 // text[i + k] at depth k + 1, we follow a failure edge. This edge 115 // corresponds to the longest proper suffix of text[i, ..., i + k - 1] that 116 // is a prefix of any registered pattern. 117 // 118 // If your brain thinks "Forget it, let's go shopping.", don't worry. 119 // Take a nap and read an introductory text on the Aho Corasick algorithm. 120 // It will make sense. Eventually. 121 122 // An edge internal to the tree. We pack the label (character we are 123 // matching on) and the destination node ID into 32 bits, to save memory. 124 // We also use these edges as a sort of generic key/value store for 125 // some special values that not all nodes will have; this also saves on 126 // memory over the otherwise obvious choice of having them as struct fields, 127 // as it means we do not to store them when they are not present. 128 struct AhoCorasickEdge { 129 // char (unsigned, so [0..255]), or a special label below. 130 uint32_t label : 9; 131 NodeID node_id : 23; 132 }; 133 134 // Node index that failure edge leads to. The failure node corresponds to 135 // the node which represents the longest proper suffix (include empty 136 // string) of the string represented by this node. Not stored if it is 137 // equal to kRootID (since that is the most common value). 138 // 139 // NOTE: Assigning |root| as the failure edge for itself doesn't strictly 140 // abide by the definition of "proper" suffix. The proper suffix of an empty 141 // string should probably be defined as null, but we assign it to the |root| 142 // to simplify the code and have the invariant that the failure edge is always 143 // defined. 144 static constexpr uint32_t kFailureNodeLabel = 0x100; 145 static constexpr uint32_t kFirstSpecialLabel = kFailureNodeLabel; 146 147 // Node index that corresponds to the longest proper suffix (including empty 148 // suffix) of this node and which also represents the end of a pattern. 149 // Does not have to exist. 150 static constexpr uint32_t kOutputLinkLabel = 0x101; 151 152 // If present, this node represents the end of a pattern. It stores the ID of 153 // the corresponding pattern (ie., it is not really a NodeID, but a 154 // MatcherStringPattern::ID). 155 static constexpr uint32_t kMatchIDLabel = 0x102; 156 157 // Used for uninitialized label slots; used so that we do not have to test for 158 // them in other ways, since we know the data will be initialized and never 159 // match any other labels. 160 static constexpr uint32_t kEmptyLabel = 0x103; 161 162 // A node in the trie, packed tightly together so that it occupies 12 bytes 163 // (both on 32- and 64-bit platforms), but aligned to at least 4 (see the 164 // comment on edges_). 165 class alignas(AhoCorasickEdge) AhoCorasickNode { 166 public: 167 AhoCorasickNode(); 168 ~AhoCorasickNode(); 169 AhoCorasickNode(AhoCorasickNode&& other); 170 AhoCorasickNode& operator=(AhoCorasickNode&& other); 171 GetEdge(uint32_t label)172 NodeID GetEdge(uint32_t label) const { 173 if (edges_capacity_ != 0) { 174 return GetEdgeNoInline(label); 175 } 176 static_assert(kNumInlineEdges == 2, "Code below needs updating"); 177 if (edges_.inline_edges[0].label == label) { 178 return edges_.inline_edges[0].node_id; 179 } 180 if (edges_.inline_edges[1].label == label) { 181 return edges_.inline_edges[1].node_id; 182 } 183 return kInvalidNodeID; 184 } 185 NodeID GetEdgeNoInline(uint32_t label) const; 186 void SetEdge(uint32_t label, NodeID node); edges()187 const AhoCorasickEdge* edges() const { 188 // NOTE: Returning edges_.inline_edges here is fine, because it's 189 // the first thing in the struct (see the comment on edges_). 190 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(edges_.inline_edges) % 191 alignof(AhoCorasickEdge)); 192 return edges_capacity_ == 0 ? edges_.inline_edges : edges_.edges; 193 } 194 failure()195 NodeID failure() const { 196 // NOTE: Even if num_edges_ == 0, we are not doing anything 197 // undefined, as we will have room for at least two edges 198 // and empty edges are set to kEmptyLabel. 199 const AhoCorasickEdge& first_edge = *edges(); 200 if (first_edge.label == kFailureNodeLabel) { 201 return first_edge.node_id; 202 } else { 203 return kRootID; 204 } 205 } 206 void SetFailure(NodeID failure); 207 SetMatchID(MatcherStringPattern::ID id)208 void SetMatchID(MatcherStringPattern::ID id) { 209 DCHECK(!IsEndOfPattern()); 210 DCHECK(id < kInvalidNodeID); // This is enforced by Build(). 211 SetEdge(kMatchIDLabel, static_cast<NodeID>(id)); 212 has_outputs_ = true; 213 } 214 215 // Returns true if this node corresponds to a pattern. IsEndOfPattern()216 bool IsEndOfPattern() const { 217 if (!has_outputs_) { 218 // Fast reject. 219 return false; 220 } 221 return GetEdge(kMatchIDLabel) != kInvalidNodeID; 222 } 223 224 // Must only be called if |IsEndOfPattern| returns true for this node. GetMatchID()225 MatcherStringPattern::ID GetMatchID() const { 226 DCHECK(IsEndOfPattern()); 227 return GetEdge(kMatchIDLabel); 228 } 229 SetOutputLink(NodeID node)230 void SetOutputLink(NodeID node) { 231 if (node != kInvalidNodeID) { 232 SetEdge(kOutputLinkLabel, node); 233 has_outputs_ = true; 234 } 235 } output_link()236 NodeID output_link() const { return GetEdge(kOutputLinkLabel); } 237 238 size_t EstimateMemoryUsage() const; num_edges()239 size_t num_edges() const { 240 if (edges_capacity_ == 0) { 241 return kNumInlineEdges - num_free_edges_; 242 } else { 243 return edges_capacity_ - num_free_edges_; 244 } 245 } 246 has_outputs()247 bool has_outputs() const { return has_outputs_; } 248 249 private: 250 // Outgoing edges of current node, including failure edge and output links. 251 // Most nodes have only one or two (or even zero) edges, not the last 252 // because many of them are leaves. Thus, we make an optimization for this 253 // common case; instead of a pointer to an edge array on the heap, we can 254 // pack two edges inline where the pointer would otherwise be. This reduces 255 // memory usage dramatically, as well as saving us a cache-line fetch. 256 // 257 // Note that even though most nodes have fewer outgoing edges, most nodes 258 // that we actually traverse will have any of them. This apparent 259 // contradiction is because we tend to spend more of our time near the root 260 // of the trie, where it is wide. This means that another layout would be 261 // possible: If we wanted to, non-inline nodes could simply store an array 262 // of 259 (256 possible characters plus the three special label types) 263 // edges, indexed directly by label type. This would use 20–50% more RAM, 264 // but also increases the speed of lookups due to removing the search loop. 265 // 266 // The nodes are generally unordered; since we typically index text, even 267 // the root will rarely be more than 20–30 wide, and at that point, it's 268 // better to just do a linear search than a binary one (which fares poorly 269 // on branch predictors). However, a special case, we put kFailureNodeLabel 270 // in the first slot if it exists (ie., is not equal to kRootID), since we 271 // need to access that label during every single node we look at during 272 // traversal. 273 // 274 // NOTE: Keep this the first member in the struct, so that inline_edges gets 275 // 4-aligned (since the class is marked as such, despite being packed. 276 // Otherwise, edges() can return an unaligned pointer marked as aligned 277 // (the unalignedness gets lost). 278 static constexpr int kNumInlineEdges = 2; 279 union { 280 // Out-of-line edge storage, having room for edges_capacity_ elements. 281 // Note that due to __attribute__((packed)) below, this pointer may be 282 // unaligned on 64-bit platforms, causing slightly less efficient 283 // access to it in some cases. 284 // This field is not a raw_ptr<> because it was filtered by the rewriter 285 // for: #union 286 RAW_PTR_EXCLUSION AhoCorasickEdge* edges; 287 288 // Inline edge storage, used if edges_capacity_ == 0. 289 AhoCorasickEdge inline_edges[kNumInlineEdges]; 290 } edges_; 291 292 // Whether we have an edge for kMatchIDLabel or kOutputLinkLabel, 293 // ie., hitting this node during traversal will create one or more 294 // matches. This is redundant, but since every single lookup during 295 // traversal needs this, it saves a few searches for us. 296 bool has_outputs_ = false; 297 298 // Number of unused left in edges_. Edges are always allocated from the 299 // beginning and never deleted; those after num_edges_ will be marked with 300 // kEmptyLabel (and have an undefined node_id). We store the number of 301 // free edges instead of the more common number of _used_ edges, to be 302 // sure that we are able to fit it in an uint8_t. num_edges() provides 303 // a useful abstraction over this. 304 uint8_t num_free_edges_ = kNumInlineEdges; 305 306 // How many edges we have allocated room for (can never be more than 307 // kEmptyLabel + 1). If equal to zero, we are not using heap storage, 308 // but instead are using inline_edges. 309 // 310 // If not equal to zero, will be a multiple of 4, so that we can use 311 // SIMD to accelerate looking for edges. 312 uint16_t edges_capacity_ = 0; 313 } __attribute__((packed)); 314 315 using SubstringPatternVector = std::vector<const MatcherStringPattern*>; 316 317 // Given the set of patterns, compute how many nodes will the corresponding 318 // Aho-Corasick tree have. Note that |patterns| need to be sorted. 319 NodeID GetTreeSize( 320 const std::vector<const MatcherStringPattern*>& patterns) const; 321 322 void BuildAhoCorasickTree(const SubstringPatternVector& patterns); 323 324 // Inserts a path for |pattern->pattern()| into the tree and adds 325 // |pattern->id()| to the set of matches. 326 void InsertPatternIntoAhoCorasickTree(const MatcherStringPattern* pattern); 327 328 void CreateFailureAndOutputEdges(); 329 330 // Adds all pattern IDs to |matches| which are a suffix of the string 331 // represented by |node|. 332 void AccumulateMatchesForNode( 333 const AhoCorasickNode* node, 334 std::set<MatcherStringPattern::ID>* matches) const; 335 336 // The nodes of a Aho-Corasick tree. 337 std::vector<AhoCorasickNode> tree_; 338 339 bool is_empty_ = true; 340 }; 341 342 } // namespace base 343 344 #endif // BASE_SUBSTRING_SET_MATCHER_SUBSTRING_SET_MATCHER_H_ 345