1 // Copyright 2009 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 #ifndef RE2_PREFILTER_TREE_H_ 6 #define RE2_PREFILTER_TREE_H_ 7 8 // The PrefilterTree class is used to form an AND-OR tree of strings 9 // that would trigger each regexp. The 'prefilter' of each regexp is 10 // added to PrefilterTree, and then PrefilterTree is used to find all 11 // the unique strings across the prefilters. During search, by using 12 // matches from a string matching engine, PrefilterTree deduces the 13 // set of regexps that are to be triggered. The 'string matching 14 // engine' itself is outside of this class, and the caller can use any 15 // favorite engine. PrefilterTree provides a set of strings (called 16 // atoms) that the user of this class should use to do the string 17 // matching. 18 19 #include <string> 20 #include <vector> 21 22 #include "absl/container/flat_hash_set.h" 23 #include "re2/prefilter.h" 24 #include "re2/sparse_array.h" 25 #include "util/logging.h" 26 27 namespace re2 { 28 29 class PrefilterTree { 30 public: 31 PrefilterTree(); 32 explicit PrefilterTree(int min_atom_len); 33 ~PrefilterTree(); 34 35 // Adds the prefilter for the next regexp. Note that we assume that 36 // Add called sequentially for all regexps. All Add calls 37 // must precede Compile. 38 void Add(Prefilter* prefilter); 39 40 // The Compile returns a vector of string in atom_vec. 41 // Call this after all the prefilters are added through Add. 42 // No calls to Add after Compile are allowed. 43 // The caller should use the returned set of strings to do string matching. 44 // Each time a string matches, the corresponding index then has to be 45 // and passed to RegexpsGivenStrings below. 46 void Compile(std::vector<std::string>* atom_vec); 47 48 // Given the indices of the atoms that matched, returns the indexes 49 // of regexps that should be searched. The matched_atoms should 50 // contain all the ids of string atoms that were found to match the 51 // content. The caller can use any string match engine to perform 52 // this function. This function is thread safe. 53 void RegexpsGivenStrings(const std::vector<int>& matched_atoms, 54 std::vector<int>* regexps) const; 55 56 // Print debug prefilter. Also prints unique ids associated with 57 // nodes of the prefilter of the regexp. 58 void PrintPrefilter(int regexpid); 59 60 private: 61 using IntMap = SparseArray<int>; 62 63 struct PrefilterHash { operatorPrefilterHash64 size_t operator()(const Prefilter* a) const { 65 DCHECK(a != NULL); 66 return absl::Hash<Prefilter>()(*a); 67 } 68 }; 69 70 struct PrefilterEqual { operatorPrefilterEqual71 bool operator()(const Prefilter* a, const Prefilter* b) const { 72 DCHECK(a != NULL); 73 DCHECK(b != NULL); 74 return *a == *b; 75 } 76 }; 77 78 using NodeSet = 79 absl::flat_hash_set<Prefilter*, PrefilterHash, PrefilterEqual>; 80 81 // Each unique node has a corresponding Entry that helps in 82 // passing the matching trigger information along the tree. 83 struct Entry { 84 public: 85 // How many children should match before this node triggers the 86 // parent. For an atom and an OR node, this is 1 and for an AND 87 // node, it is the number of unique children. 88 int propagate_up_at_count; 89 90 // When this node is ready to trigger the parent, what are the indices 91 // of the parent nodes to trigger. The reason there may be more than 92 // one is because of sharing. For example (abc | def) and (xyz | def) 93 // are two different nodes, but they share the atom 'def'. So when 94 // 'def' matches, it triggers two parents, corresponding to the two 95 // different OR nodes. 96 std::vector<int> parents; 97 98 // When this node is ready to trigger the parent, what are the 99 // regexps that are triggered. 100 std::vector<int> regexps; 101 }; 102 103 // Returns true if the prefilter node should be kept. 104 bool KeepNode(Prefilter* node) const; 105 106 // This function assigns unique ids to various parts of the 107 // prefilter, by looking at if these nodes are already in the 108 // PrefilterTree. 109 void AssignUniqueIds(NodeSet* nodes, std::vector<std::string>* atom_vec); 110 111 // Given the matching atoms, find the regexps to be triggered. 112 void PropagateMatch(const std::vector<int>& atom_ids, 113 IntMap* regexps) const; 114 115 // Returns the prefilter node that has the same atom/subs as this 116 // node. For the canonical node, returns node. Assumes that the 117 // children of node have already been assigned unique ids. 118 Prefilter* CanonicalNode(NodeSet* nodes, Prefilter* node); 119 120 // Recursively constructs a readable prefilter string. 121 std::string DebugNodeString(Prefilter* node) const; 122 123 // Used for debugging. 124 void PrintDebugInfo(NodeSet* nodes); 125 126 // These are all the nodes formed by Compile. Essentially, there is 127 // one node for each unique atom and each unique AND/OR node. 128 std::vector<Entry> entries_; 129 130 // indices of regexps that always pass through the filter (since we 131 // found no required literals in these regexps). 132 std::vector<int> unfiltered_; 133 134 // vector of Prefilter for all regexps. 135 std::vector<Prefilter*> prefilter_vec_; 136 137 // Atom index in returned strings to entry id mapping. 138 std::vector<int> atom_index_to_id_; 139 140 // Has the prefilter tree been compiled. 141 bool compiled_; 142 143 // Strings less than this length are not stored as atoms. 144 const int min_atom_len_; 145 146 PrefilterTree(const PrefilterTree&) = delete; 147 PrefilterTree& operator=(const PrefilterTree&) = delete; 148 }; 149 150 } // namespace 151 152 #endif // RE2_PREFILTER_TREE_H_ 153