• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2009 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #ifndef RE2_PREFILTER_TREE_H_
6 #define RE2_PREFILTER_TREE_H_
7 
8 // The PrefilterTree class is used to form an AND-OR tree of strings
9 // that would trigger each regexp. The 'prefilter' of each regexp is
10 // added to PrefilterTree, and then PrefilterTree is used to find all
11 // the unique strings across the prefilters. During search, by using
12 // matches from a string matching engine, PrefilterTree deduces the
13 // set of regexps that are to be triggered. The 'string matching
14 // engine' itself is outside of this class, and the caller can use any
15 // favorite engine. PrefilterTree provides a set of strings (called
16 // atoms) that the user of this class should use to do the string
17 // matching.
18 
19 #include <string>
20 #include <vector>
21 
22 #include "absl/container/flat_hash_set.h"
23 #include "re2/prefilter.h"
24 #include "re2/sparse_array.h"
25 #include "util/logging.h"
26 
27 namespace re2 {
28 
29 class PrefilterTree {
30  public:
31   PrefilterTree();
32   explicit PrefilterTree(int min_atom_len);
33   ~PrefilterTree();
34 
35   // Adds the prefilter for the next regexp. Note that we assume that
36   // Add called sequentially for all regexps. All Add calls
37   // must precede Compile.
38   void Add(Prefilter* prefilter);
39 
40   // The Compile returns a vector of string in atom_vec.
41   // Call this after all the prefilters are added through Add.
42   // No calls to Add after Compile are allowed.
43   // The caller should use the returned set of strings to do string matching.
44   // Each time a string matches, the corresponding index then has to be
45   // and passed to RegexpsGivenStrings below.
46   void Compile(std::vector<std::string>* atom_vec);
47 
48   // Given the indices of the atoms that matched, returns the indexes
49   // of regexps that should be searched.  The matched_atoms should
50   // contain all the ids of string atoms that were found to match the
51   // content. The caller can use any string match engine to perform
52   // this function. This function is thread safe.
53   void RegexpsGivenStrings(const std::vector<int>& matched_atoms,
54                            std::vector<int>* regexps) const;
55 
56   // Print debug prefilter. Also prints unique ids associated with
57   // nodes of the prefilter of the regexp.
58   void PrintPrefilter(int regexpid);
59 
60  private:
61   using IntMap = SparseArray<int>;
62 
63   struct PrefilterHash {
operatorPrefilterHash64     size_t operator()(const Prefilter* a) const {
65       DCHECK(a != NULL);
66       return absl::Hash<Prefilter>()(*a);
67     }
68   };
69 
70   struct PrefilterEqual {
operatorPrefilterEqual71     bool operator()(const Prefilter* a, const Prefilter* b) const {
72       DCHECK(a != NULL);
73       DCHECK(b != NULL);
74       return *a == *b;
75     }
76   };
77 
78   using NodeSet =
79       absl::flat_hash_set<Prefilter*, PrefilterHash, PrefilterEqual>;
80 
81   // Each unique node has a corresponding Entry that helps in
82   // passing the matching trigger information along the tree.
83   struct Entry {
84    public:
85     // How many children should match before this node triggers the
86     // parent. For an atom and an OR node, this is 1 and for an AND
87     // node, it is the number of unique children.
88     int propagate_up_at_count;
89 
90     // When this node is ready to trigger the parent, what are the indices
91     // of the parent nodes to trigger. The reason there may be more than
92     // one is because of sharing. For example (abc | def) and (xyz | def)
93     // are two different nodes, but they share the atom 'def'. So when
94     // 'def' matches, it triggers two parents, corresponding to the two
95     // different OR nodes.
96     std::vector<int> parents;
97 
98     // When this node is ready to trigger the parent, what are the
99     // regexps that are triggered.
100     std::vector<int> regexps;
101   };
102 
103   // Returns true if the prefilter node should be kept.
104   bool KeepNode(Prefilter* node) const;
105 
106   // This function assigns unique ids to various parts of the
107   // prefilter, by looking at if these nodes are already in the
108   // PrefilterTree.
109   void AssignUniqueIds(NodeSet* nodes, std::vector<std::string>* atom_vec);
110 
111   // Given the matching atoms, find the regexps to be triggered.
112   void PropagateMatch(const std::vector<int>& atom_ids,
113                       IntMap* regexps) const;
114 
115   // Returns the prefilter node that has the same atom/subs as this
116   // node. For the canonical node, returns node. Assumes that the
117   // children of node have already been assigned unique ids.
118   Prefilter* CanonicalNode(NodeSet* nodes, Prefilter* node);
119 
120   // Recursively constructs a readable prefilter string.
121   std::string DebugNodeString(Prefilter* node) const;
122 
123   // Used for debugging.
124   void PrintDebugInfo(NodeSet* nodes);
125 
126   // These are all the nodes formed by Compile. Essentially, there is
127   // one node for each unique atom and each unique AND/OR node.
128   std::vector<Entry> entries_;
129 
130   // indices of regexps that always pass through the filter (since we
131   // found no required literals in these regexps).
132   std::vector<int> unfiltered_;
133 
134   // vector of Prefilter for all regexps.
135   std::vector<Prefilter*> prefilter_vec_;
136 
137   // Atom index in returned strings to entry id mapping.
138   std::vector<int> atom_index_to_id_;
139 
140   // Has the prefilter tree been compiled.
141   bool compiled_;
142 
143   // Strings less than this length are not stored as atoms.
144   const int min_atom_len_;
145 
146   PrefilterTree(const PrefilterTree&) = delete;
147   PrefilterTree& operator=(const PrefilterTree&) = delete;
148 };
149 
150 }  // namespace
151 
152 #endif  // RE2_PREFILTER_TREE_H_
153