1 // Copyright 2009 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "re2/prefilter_tree.h"
6
7 #include <stddef.h>
8 #include <algorithm>
9 #include <cmath>
10 #include <memory>
11 #include <string>
12 #include <utility>
13 #include <vector>
14
15 #include "absl/strings/str_format.h"
16 #include "util/logging.h"
17 #include "re2/prefilter.h"
18 #include "re2/re2.h"
19
20 namespace re2 {
21
22 static const bool ExtraDebug = false;
23
PrefilterTree()24 PrefilterTree::PrefilterTree()
25 : compiled_(false),
26 min_atom_len_(3) {
27 }
28
PrefilterTree(int min_atom_len)29 PrefilterTree::PrefilterTree(int min_atom_len)
30 : compiled_(false),
31 min_atom_len_(min_atom_len) {
32 }
33
~PrefilterTree()34 PrefilterTree::~PrefilterTree() {
35 for (size_t i = 0; i < prefilter_vec_.size(); i++)
36 delete prefilter_vec_[i];
37 }
38
Add(Prefilter * prefilter)39 void PrefilterTree::Add(Prefilter* prefilter) {
40 if (compiled_) {
41 LOG(DFATAL) << "Add called after Compile.";
42 return;
43 }
44 if (prefilter != NULL && !KeepNode(prefilter)) {
45 delete prefilter;
46 prefilter = NULL;
47 }
48
49 prefilter_vec_.push_back(prefilter);
50 }
51
Compile(std::vector<std::string> * atom_vec)52 void PrefilterTree::Compile(std::vector<std::string>* atom_vec) {
53 if (compiled_) {
54 LOG(DFATAL) << "Compile called already.";
55 return;
56 }
57
58 // Some legacy users of PrefilterTree call Compile() before
59 // adding any regexps and expect Compile() to have no effect.
60 if (prefilter_vec_.empty())
61 return;
62
63 compiled_ = true;
64
65 NodeSet nodes;
66 AssignUniqueIds(&nodes, atom_vec);
67 if (ExtraDebug)
68 PrintDebugInfo(&nodes);
69 }
70
CanonicalNode(NodeSet * nodes,Prefilter * node)71 Prefilter* PrefilterTree::CanonicalNode(NodeSet* nodes, Prefilter* node) {
72 NodeSet::const_iterator iter = nodes->find(node);
73 if (iter != nodes->end()) {
74 return *iter;
75 }
76 return NULL;
77 }
78
KeepNode(Prefilter * node) const79 bool PrefilterTree::KeepNode(Prefilter* node) const {
80 if (node == NULL)
81 return false;
82
83 switch (node->op()) {
84 default:
85 LOG(DFATAL) << "Unexpected op in KeepNode: " << node->op();
86 return false;
87
88 case Prefilter::ALL:
89 case Prefilter::NONE:
90 return false;
91
92 case Prefilter::ATOM:
93 return node->atom().size() >= static_cast<size_t>(min_atom_len_);
94
95 case Prefilter::AND: {
96 int j = 0;
97 std::vector<Prefilter*>* subs = node->subs();
98 for (size_t i = 0; i < subs->size(); i++)
99 if (KeepNode((*subs)[i]))
100 (*subs)[j++] = (*subs)[i];
101 else
102 delete (*subs)[i];
103
104 subs->resize(j);
105 return j > 0;
106 }
107
108 case Prefilter::OR:
109 for (size_t i = 0; i < node->subs()->size(); i++)
110 if (!KeepNode((*node->subs())[i]))
111 return false;
112 return true;
113 }
114 }
115
AssignUniqueIds(NodeSet * nodes,std::vector<std::string> * atom_vec)116 void PrefilterTree::AssignUniqueIds(NodeSet* nodes,
117 std::vector<std::string>* atom_vec) {
118 atom_vec->clear();
119
120 // Build vector of all filter nodes, sorted topologically
121 // from top to bottom in v.
122 std::vector<Prefilter*> v;
123
124 // Add the top level nodes of each regexp prefilter.
125 for (size_t i = 0; i < prefilter_vec_.size(); i++) {
126 Prefilter* f = prefilter_vec_[i];
127 if (f == NULL)
128 unfiltered_.push_back(static_cast<int>(i));
129
130 // We push NULL also on to v, so that we maintain the
131 // mapping of index==regexpid for level=0 prefilter nodes.
132 v.push_back(f);
133 }
134
135 // Now add all the descendant nodes.
136 for (size_t i = 0; i < v.size(); i++) {
137 Prefilter* f = v[i];
138 if (f == NULL)
139 continue;
140 if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
141 const std::vector<Prefilter*>& subs = *f->subs();
142 for (size_t j = 0; j < subs.size(); j++)
143 v.push_back(subs[j]);
144 }
145 }
146
147 // Identify unique nodes.
148 int unique_id = 0;
149 for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
150 Prefilter *node = v[i];
151 if (node == NULL)
152 continue;
153 node->set_unique_id(-1);
154 Prefilter* canonical = CanonicalNode(nodes, node);
155 if (canonical == NULL) {
156 // Any further nodes that have the same atom/subs
157 // will find this node as the canonical node.
158 nodes->emplace(node);
159 if (node->op() == Prefilter::ATOM) {
160 atom_vec->push_back(node->atom());
161 atom_index_to_id_.push_back(unique_id);
162 }
163 node->set_unique_id(unique_id++);
164 } else {
165 node->set_unique_id(canonical->unique_id());
166 }
167 }
168 entries_.resize(unique_id);
169
170 // Fill the entries.
171 for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
172 Prefilter* prefilter = v[i];
173 if (prefilter == NULL)
174 continue;
175 if (CanonicalNode(nodes, prefilter) != prefilter)
176 continue;
177 int id = prefilter->unique_id();
178 switch (prefilter->op()) {
179 default:
180 LOG(DFATAL) << "Unexpected op: " << prefilter->op();
181 return;
182
183 case Prefilter::ATOM:
184 entries_[id].propagate_up_at_count = 1;
185 break;
186
187 case Prefilter::OR:
188 case Prefilter::AND: {
189 // For each child, we append our id to the child's list of
190 // parent ids... unless we happen to have done so already.
191 // The number of appends is the number of unique children,
192 // which allows correct upward propagation from AND nodes.
193 int up_count = 0;
194 for (size_t j = 0; j < prefilter->subs()->size(); j++) {
195 int child_id = (*prefilter->subs())[j]->unique_id();
196 std::vector<int>& parents = entries_[child_id].parents;
197 if (parents.empty() || parents.back() != id) {
198 parents.push_back(id);
199 up_count++;
200 }
201 }
202 entries_[id].propagate_up_at_count =
203 prefilter->op() == Prefilter::AND ? up_count : 1;
204 break;
205 }
206 }
207 }
208
209 // For top level nodes, populate regexp id.
210 for (size_t i = 0; i < prefilter_vec_.size(); i++) {
211 if (prefilter_vec_[i] == NULL)
212 continue;
213 int id = CanonicalNode(nodes, prefilter_vec_[i])->unique_id();
214 DCHECK_LE(0, id);
215 Entry* entry = &entries_[id];
216 entry->regexps.push_back(static_cast<int>(i));
217 }
218
219 // Lastly, using probability-based heuristics, we identify nodes
220 // that trigger too many parents and then we try to prune edges.
221 // We use logarithms below to avoid the likelihood of underflow.
222 double log_num_regexps = std::log(prefilter_vec_.size() - unfiltered_.size());
223 // Hoisted this above the loop so that we don't thrash the heap.
224 std::vector<std::pair<size_t, int>> entries_by_num_edges;
225 for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
226 Prefilter* prefilter = v[i];
227 // Pruning applies only to AND nodes because it "just" reduces
228 // precision; applied to OR nodes, it would break correctness.
229 if (prefilter == NULL || prefilter->op() != Prefilter::AND)
230 continue;
231 if (CanonicalNode(nodes, prefilter) != prefilter)
232 continue;
233 int id = prefilter->unique_id();
234
235 // Sort the current node's children by the numbers of parents.
236 entries_by_num_edges.clear();
237 for (size_t j = 0; j < prefilter->subs()->size(); j++) {
238 int child_id = (*prefilter->subs())[j]->unique_id();
239 const std::vector<int>& parents = entries_[child_id].parents;
240 entries_by_num_edges.emplace_back(parents.size(), child_id);
241 }
242 std::stable_sort(entries_by_num_edges.begin(), entries_by_num_edges.end());
243
244 // A running estimate of how many regexps will be triggered by
245 // pruning the remaining children's edges to the current node.
246 // Our nominal target is one, so the threshold is log(1) == 0;
247 // pruning occurs iff the child has more than nine edges left.
248 double log_num_triggered = log_num_regexps;
249 for (const auto& pair : entries_by_num_edges) {
250 int child_id = pair.second;
251 std::vector<int>& parents = entries_[child_id].parents;
252 if (log_num_triggered > 0.) {
253 log_num_triggered += std::log(parents.size());
254 log_num_triggered -= log_num_regexps;
255 } else if (parents.size() > 9) {
256 auto it = std::find(parents.begin(), parents.end(), id);
257 if (it != parents.end()) {
258 parents.erase(it);
259 entries_[id].propagate_up_at_count--;
260 }
261 }
262 }
263 }
264 }
265
266 // Functions for triggering during search.
RegexpsGivenStrings(const std::vector<int> & matched_atoms,std::vector<int> * regexps) const267 void PrefilterTree::RegexpsGivenStrings(
268 const std::vector<int>& matched_atoms,
269 std::vector<int>* regexps) const {
270 regexps->clear();
271 if (!compiled_) {
272 // Some legacy users of PrefilterTree call Compile() before
273 // adding any regexps and expect Compile() to have no effect.
274 // This kludge is a counterpart to that kludge.
275 if (prefilter_vec_.empty())
276 return;
277
278 LOG(ERROR) << "RegexpsGivenStrings called before Compile.";
279 for (size_t i = 0; i < prefilter_vec_.size(); i++)
280 regexps->push_back(static_cast<int>(i));
281 } else {
282 IntMap regexps_map(static_cast<int>(prefilter_vec_.size()));
283 std::vector<int> matched_atom_ids;
284 for (size_t j = 0; j < matched_atoms.size(); j++)
285 matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
286 PropagateMatch(matched_atom_ids, ®exps_map);
287 for (IntMap::const_iterator it = regexps_map.begin();
288 it != regexps_map.end();
289 ++it)
290 regexps->push_back(it->index());
291
292 regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
293 }
294 std::sort(regexps->begin(), regexps->end());
295 }
296
PropagateMatch(const std::vector<int> & atom_ids,IntMap * regexps) const297 void PrefilterTree::PropagateMatch(const std::vector<int>& atom_ids,
298 IntMap* regexps) const {
299 IntMap count(static_cast<int>(entries_.size()));
300 IntMap work(static_cast<int>(entries_.size()));
301 for (size_t i = 0; i < atom_ids.size(); i++)
302 work.set(atom_ids[i], 1);
303 for (IntMap::const_iterator it = work.begin(); it != work.end(); ++it) {
304 const Entry& entry = entries_[it->index()];
305 // Record regexps triggered.
306 for (size_t i = 0; i < entry.regexps.size(); i++)
307 regexps->set(entry.regexps[i], 1);
308 int c;
309 // Pass trigger up to parents.
310 for (int j : entry.parents) {
311 const Entry& parent = entries_[j];
312 // Delay until all the children have succeeded.
313 if (parent.propagate_up_at_count > 1) {
314 if (count.has_index(j)) {
315 c = count.get_existing(j) + 1;
316 count.set_existing(j, c);
317 } else {
318 c = 1;
319 count.set_new(j, c);
320 }
321 if (c < parent.propagate_up_at_count)
322 continue;
323 }
324 // Trigger the parent.
325 work.set(j, 1);
326 }
327 }
328 }
329
330 // Debugging help.
PrintPrefilter(int regexpid)331 void PrefilterTree::PrintPrefilter(int regexpid) {
332 LOG(ERROR) << DebugNodeString(prefilter_vec_[regexpid]);
333 }
334
PrintDebugInfo(NodeSet * nodes)335 void PrefilterTree::PrintDebugInfo(NodeSet* nodes) {
336 LOG(ERROR) << "#Unique Atoms: " << atom_index_to_id_.size();
337 LOG(ERROR) << "#Unique Nodes: " << entries_.size();
338
339 for (size_t i = 0; i < entries_.size(); i++) {
340 const std::vector<int>& parents = entries_[i].parents;
341 const std::vector<int>& regexps = entries_[i].regexps;
342 LOG(ERROR) << "EntryId: " << i
343 << " N: " << parents.size() << " R: " << regexps.size();
344 for (int parent : parents)
345 LOG(ERROR) << parent;
346 }
347 LOG(ERROR) << "Set:";
348 for (NodeSet::const_iterator iter = nodes->begin();
349 iter != nodes->end(); ++iter)
350 LOG(ERROR) << "NodeId: " << (*iter)->unique_id();
351 }
352
DebugNodeString(Prefilter * node) const353 std::string PrefilterTree::DebugNodeString(Prefilter* node) const {
354 std::string node_string = "";
355 if (node->op() == Prefilter::ATOM) {
356 DCHECK(!node->atom().empty());
357 node_string += node->atom();
358 } else {
359 // Adding the operation disambiguates AND and OR nodes.
360 node_string += node->op() == Prefilter::AND ? "AND" : "OR";
361 node_string += "(";
362 for (size_t i = 0; i < node->subs()->size(); i++) {
363 if (i > 0)
364 node_string += ',';
365 node_string += absl::StrFormat("%d", (*node->subs())[i]->unique_id());
366 node_string += ":";
367 node_string += DebugNodeString((*node->subs())[i]);
368 }
369 node_string += ")";
370 }
371 return node_string;
372 }
373
374 } // namespace re2
375