1 // Copyright 2009 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 #ifndef RE2_PREFILTER_TREE_H_ 6 #define RE2_PREFILTER_TREE_H_ 7 8 // The PrefilterTree class is used to form an AND-OR tree of strings 9 // that would trigger each regexp. The 'prefilter' of each regexp is 10 // added to PrefilterTree, and then PrefilterTree is used to find all 11 // the unique strings across the prefilters. During search, by using 12 // matches from a string matching engine, PrefilterTree deduces the 13 // set of regexps that are to be triggered. The 'string matching 14 // engine' itself is outside of this class, and the caller can use any 15 // favorite engine. PrefilterTree provides a set of strings (called 16 // atoms) that the user of this class should use to do the string 17 // matching. 18 19 #include <map> 20 #include <string> 21 #include <vector> 22 23 #include "util/util.h" 24 #include "re2/prefilter.h" 25 #include "re2/sparse_array.h" 26 27 namespace re2 { 28 29 class PrefilterTree { 30 public: 31 PrefilterTree(); 32 explicit PrefilterTree(int min_atom_len); 33 ~PrefilterTree(); 34 35 // Adds the prefilter for the next regexp. Note that we assume that 36 // Add called sequentially for all regexps. All Add calls 37 // must precede Compile. 38 void Add(Prefilter* prefilter); 39 40 // The Compile returns a vector of string in atom_vec. 41 // Call this after all the prefilters are added through Add. 42 // No calls to Add after Compile are allowed. 43 // The caller should use the returned set of strings to do string matching. 44 // Each time a string matches, the corresponding index then has to be 45 // and passed to RegexpsGivenStrings below. 46 void Compile(std::vector<std::string>* atom_vec); 47 48 // Given the indices of the atoms that matched, returns the indexes 49 // of regexps that should be searched. The matched_atoms should 50 // contain all the ids of string atoms that were found to match the 51 // content. The caller can use any string match engine to perform 52 // this function. This function is thread safe. 53 void RegexpsGivenStrings(const std::vector<int>& matched_atoms, 54 std::vector<int>* regexps) const; 55 56 // Print debug prefilter. Also prints unique ids associated with 57 // nodes of the prefilter of the regexp. 58 void PrintPrefilter(int regexpid); 59 60 private: 61 typedef SparseArray<int> IntMap; 62 typedef std::map<int, int> StdIntMap; 63 typedef std::map<std::string, Prefilter*> NodeMap; 64 65 // Each unique node has a corresponding Entry that helps in 66 // passing the matching trigger information along the tree. 67 struct Entry { 68 public: 69 // How many children should match before this node triggers the 70 // parent. For an atom and an OR node, this is 1 and for an AND 71 // node, it is the number of unique children. 72 int propagate_up_at_count; 73 74 // When this node is ready to trigger the parent, what are the indices 75 // of the parent nodes to trigger. The reason there may be more than 76 // one is because of sharing. For example (abc | def) and (xyz | def) 77 // are two different nodes, but they share the atom 'def'. So when 78 // 'def' matches, it triggers two parents, corresponding to the two 79 // different OR nodes. 80 StdIntMap* parents; 81 82 // When this node is ready to trigger the parent, what are the 83 // regexps that are triggered. 84 std::vector<int> regexps; 85 }; 86 87 // Returns true if the prefilter node should be kept. 88 bool KeepNode(Prefilter* node) const; 89 90 // This function assigns unique ids to various parts of the 91 // prefilter, by looking at if these nodes are already in the 92 // PrefilterTree. 93 void AssignUniqueIds(NodeMap* nodes, std::vector<std::string>* atom_vec); 94 95 // Given the matching atoms, find the regexps to be triggered. 96 void PropagateMatch(const std::vector<int>& atom_ids, 97 IntMap* regexps) const; 98 99 // Returns the prefilter node that has the same NodeString as this 100 // node. For the canonical node, returns node. 101 Prefilter* CanonicalNode(NodeMap* nodes, Prefilter* node); 102 103 // A string that uniquely identifies the node. Assumes that the 104 // children of node has already been assigned unique ids. 105 std::string NodeString(Prefilter* node) const; 106 107 // Recursively constructs a readable prefilter string. 108 std::string DebugNodeString(Prefilter* node) const; 109 110 // Used for debugging. 111 void PrintDebugInfo(NodeMap* nodes); 112 113 // These are all the nodes formed by Compile. Essentially, there is 114 // one node for each unique atom and each unique AND/OR node. 115 std::vector<Entry> entries_; 116 117 // indices of regexps that always pass through the filter (since we 118 // found no required literals in these regexps). 119 std::vector<int> unfiltered_; 120 121 // vector of Prefilter for all regexps. 122 std::vector<Prefilter*> prefilter_vec_; 123 124 // Atom index in returned strings to entry id mapping. 125 std::vector<int> atom_index_to_id_; 126 127 // Has the prefilter tree been compiled. 128 bool compiled_; 129 130 // Strings less than this length are not stored as atoms. 131 const int min_atom_len_; 132 133 PrefilterTree(const PrefilterTree&) = delete; 134 PrefilterTree& operator=(const PrefilterTree&) = delete; 135 }; 136 137 } // namespace 138 139 #endif // RE2_PREFILTER_TREE_H_ 140