1 // Copyright 2009 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "re2/prefilter_tree.h"
6
7 #include <stddef.h>
8
9 #include <algorithm>
10 #include <cmath>
11 #include <string>
12 #include <utility>
13 #include <vector>
14
15 #include "absl/log/absl_check.h"
16 #include "absl/log/absl_log.h"
17 #include "absl/strings/str_format.h"
18 #include "re2/prefilter.h"
19
20 namespace re2 {
21
22 static const bool ExtraDebug = false;
23
PrefilterTree()24 PrefilterTree::PrefilterTree()
25 : compiled_(false),
26 min_atom_len_(3) {
27 }
28
PrefilterTree(int min_atom_len)29 PrefilterTree::PrefilterTree(int min_atom_len)
30 : compiled_(false),
31 min_atom_len_(min_atom_len) {
32 }
33
~PrefilterTree()34 PrefilterTree::~PrefilterTree() {
35 for (size_t i = 0; i < prefilter_vec_.size(); i++)
36 delete prefilter_vec_[i];
37 }
38
Add(Prefilter * prefilter)39 void PrefilterTree::Add(Prefilter* prefilter) {
40 if (compiled_) {
41 ABSL_LOG(DFATAL) << "Add called after Compile.";
42 return;
43 }
44 if (prefilter != NULL && !KeepNode(prefilter)) {
45 delete prefilter;
46 prefilter = NULL;
47 }
48
49 prefilter_vec_.push_back(prefilter);
50 }
51
Compile(std::vector<std::string> * atom_vec)52 void PrefilterTree::Compile(std::vector<std::string>* atom_vec) {
53 if (compiled_) {
54 ABSL_LOG(DFATAL) << "Compile called already.";
55 return;
56 }
57
58 // Some legacy users of PrefilterTree call Compile() before
59 // adding any regexps and expect Compile() to have no effect.
60 if (prefilter_vec_.empty()) {
61 return;
62 }
63
64 compiled_ = true;
65
66 NodeSet nodes;
67 AssignUniqueIds(&nodes, atom_vec);
68 if (ExtraDebug)
69 PrintDebugInfo(&nodes);
70 }
71
CanonicalNode(NodeSet * nodes,Prefilter * node)72 Prefilter* PrefilterTree::CanonicalNode(NodeSet* nodes, Prefilter* node) {
73 NodeSet::const_iterator iter = nodes->find(node);
74 if (iter != nodes->end()) {
75 return *iter;
76 }
77 return NULL;
78 }
79
KeepNode(Prefilter * node) const80 bool PrefilterTree::KeepNode(Prefilter* node) const {
81 if (node == NULL)
82 return false;
83
84 switch (node->op()) {
85 default:
86 ABSL_LOG(DFATAL) << "Unexpected op in KeepNode: " << node->op();
87 return false;
88
89 case Prefilter::ALL:
90 case Prefilter::NONE:
91 return false;
92
93 case Prefilter::ATOM:
94 return node->atom().size() >= static_cast<size_t>(min_atom_len_);
95
96 case Prefilter::AND: {
97 int j = 0;
98 std::vector<Prefilter*>* subs = node->subs();
99 for (size_t i = 0; i < subs->size(); i++)
100 if (KeepNode((*subs)[i]))
101 (*subs)[j++] = (*subs)[i];
102 else
103 delete (*subs)[i];
104
105 subs->resize(j);
106 return j > 0;
107 }
108
109 case Prefilter::OR:
110 for (size_t i = 0; i < node->subs()->size(); i++)
111 if (!KeepNode((*node->subs())[i]))
112 return false;
113 return true;
114 }
115 }
116
AssignUniqueIds(NodeSet * nodes,std::vector<std::string> * atom_vec)117 void PrefilterTree::AssignUniqueIds(NodeSet* nodes,
118 std::vector<std::string>* atom_vec) {
119 atom_vec->clear();
120
121 // Build vector of all filter nodes, sorted topologically
122 // from top to bottom in v.
123 std::vector<Prefilter*> v;
124
125 // Add the top level nodes of each regexp prefilter.
126 for (size_t i = 0; i < prefilter_vec_.size(); i++) {
127 Prefilter* f = prefilter_vec_[i];
128 if (f == NULL)
129 unfiltered_.push_back(static_cast<int>(i));
130
131 // We push NULL also on to v, so that we maintain the
132 // mapping of index==regexpid for level=0 prefilter nodes.
133 v.push_back(f);
134 }
135
136 // Now add all the descendant nodes.
137 for (size_t i = 0; i < v.size(); i++) {
138 Prefilter* f = v[i];
139 if (f == NULL)
140 continue;
141 if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
142 const std::vector<Prefilter*>& subs = *f->subs();
143 for (size_t j = 0; j < subs.size(); j++)
144 v.push_back(subs[j]);
145 }
146 }
147
148 // Identify unique nodes.
149 int unique_id = 0;
150 for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
151 Prefilter *node = v[i];
152 if (node == NULL)
153 continue;
154 node->set_unique_id(-1);
155 Prefilter* canonical = CanonicalNode(nodes, node);
156 if (canonical == NULL) {
157 // Any further nodes that have the same atom/subs
158 // will find this node as the canonical node.
159 nodes->emplace(node);
160 if (node->op() == Prefilter::ATOM) {
161 atom_vec->push_back(node->atom());
162 atom_index_to_id_.push_back(unique_id);
163 }
164 node->set_unique_id(unique_id++);
165 } else {
166 node->set_unique_id(canonical->unique_id());
167 }
168 }
169 entries_.resize(unique_id);
170
171 // Fill the entries.
172 for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
173 Prefilter* prefilter = v[i];
174 if (prefilter == NULL)
175 continue;
176 if (CanonicalNode(nodes, prefilter) != prefilter)
177 continue;
178 int id = prefilter->unique_id();
179 switch (prefilter->op()) {
180 default:
181 ABSL_LOG(DFATAL) << "Unexpected op: " << prefilter->op();
182 return;
183
184 case Prefilter::ATOM:
185 entries_[id].propagate_up_at_count = 1;
186 break;
187
188 case Prefilter::OR:
189 case Prefilter::AND: {
190 // For each child, we append our id to the child's list of
191 // parent ids... unless we happen to have done so already.
192 // The number of appends is the number of unique children,
193 // which allows correct upward propagation from AND nodes.
194 int up_count = 0;
195 for (size_t j = 0; j < prefilter->subs()->size(); j++) {
196 int child_id = (*prefilter->subs())[j]->unique_id();
197 std::vector<int>& parents = entries_[child_id].parents;
198 if (parents.empty() || parents.back() != id) {
199 parents.push_back(id);
200 up_count++;
201 }
202 }
203 entries_[id].propagate_up_at_count =
204 prefilter->op() == Prefilter::AND ? up_count : 1;
205 break;
206 }
207 }
208 }
209
210 // For top level nodes, populate regexp id.
211 for (size_t i = 0; i < prefilter_vec_.size(); i++) {
212 if (prefilter_vec_[i] == NULL)
213 continue;
214 int id = CanonicalNode(nodes, prefilter_vec_[i])->unique_id();
215 ABSL_DCHECK_LE(0, id);
216 Entry* entry = &entries_[id];
217 entry->regexps.push_back(static_cast<int>(i));
218 }
219
220 // Lastly, using probability-based heuristics, we identify nodes
221 // that trigger too many parents and then we try to prune edges.
222 // We use logarithms below to avoid the likelihood of underflow.
223 double log_num_regexps = std::log(prefilter_vec_.size() - unfiltered_.size());
224 // Hoisted this above the loop so that we don't thrash the heap.
225 std::vector<std::pair<size_t, int>> entries_by_num_edges;
226 for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
227 Prefilter* prefilter = v[i];
228 // Pruning applies only to AND nodes because it "just" reduces
229 // precision; applied to OR nodes, it would break correctness.
230 if (prefilter == NULL || prefilter->op() != Prefilter::AND)
231 continue;
232 if (CanonicalNode(nodes, prefilter) != prefilter)
233 continue;
234 int id = prefilter->unique_id();
235
236 // Sort the current node's children by the numbers of parents.
237 entries_by_num_edges.clear();
238 for (size_t j = 0; j < prefilter->subs()->size(); j++) {
239 int child_id = (*prefilter->subs())[j]->unique_id();
240 const std::vector<int>& parents = entries_[child_id].parents;
241 entries_by_num_edges.emplace_back(parents.size(), child_id);
242 }
243 std::stable_sort(entries_by_num_edges.begin(), entries_by_num_edges.end());
244
245 // A running estimate of how many regexps will be triggered by
246 // pruning the remaining children's edges to the current node.
247 // Our nominal target is one, so the threshold is log(1) == 0;
248 // pruning occurs iff the child has more than nine edges left.
249 double log_num_triggered = log_num_regexps;
250 for (const auto& pair : entries_by_num_edges) {
251 int child_id = pair.second;
252 std::vector<int>& parents = entries_[child_id].parents;
253 if (log_num_triggered > 0.) {
254 log_num_triggered += std::log(parents.size());
255 log_num_triggered -= log_num_regexps;
256 } else if (parents.size() > 9) {
257 auto it = std::find(parents.begin(), parents.end(), id);
258 if (it != parents.end()) {
259 parents.erase(it);
260 entries_[id].propagate_up_at_count--;
261 }
262 }
263 }
264 }
265 }
266
267 // Functions for triggering during search.
RegexpsGivenStrings(const std::vector<int> & matched_atoms,std::vector<int> * regexps) const268 void PrefilterTree::RegexpsGivenStrings(
269 const std::vector<int>& matched_atoms,
270 std::vector<int>* regexps) const {
271 regexps->clear();
272 if (!compiled_) {
273 // Some legacy users of PrefilterTree call Compile() before
274 // adding any regexps and expect Compile() to have no effect.
275 // This kludge is a counterpart to that kludge.
276 if (prefilter_vec_.empty()) {
277 return;
278 }
279
280 ABSL_LOG(ERROR) << "RegexpsGivenStrings called before Compile.";
281 for (size_t i = 0; i < prefilter_vec_.size(); i++)
282 regexps->push_back(static_cast<int>(i));
283 } else {
284 IntMap regexps_map(static_cast<int>(prefilter_vec_.size()));
285 std::vector<int> matched_atom_ids;
286 for (size_t j = 0; j < matched_atoms.size(); j++)
287 matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
288 PropagateMatch(matched_atom_ids, ®exps_map);
289 for (IntMap::const_iterator it = regexps_map.begin();
290 it != regexps_map.end();
291 ++it)
292 regexps->push_back(it->index());
293
294 regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
295 }
296 std::sort(regexps->begin(), regexps->end());
297 }
298
PropagateMatch(const std::vector<int> & atom_ids,IntMap * regexps) const299 void PrefilterTree::PropagateMatch(const std::vector<int>& atom_ids,
300 IntMap* regexps) const {
301 IntMap count(static_cast<int>(entries_.size()));
302 IntMap work(static_cast<int>(entries_.size()));
303 for (size_t i = 0; i < atom_ids.size(); i++)
304 work.set(atom_ids[i], 1);
305 for (IntMap::const_iterator it = work.begin(); it != work.end(); ++it) {
306 const Entry& entry = entries_[it->index()];
307 // Record regexps triggered.
308 for (size_t i = 0; i < entry.regexps.size(); i++)
309 regexps->set(entry.regexps[i], 1);
310 int c;
311 // Pass trigger up to parents.
312 for (int j : entry.parents) {
313 const Entry& parent = entries_[j];
314 // Delay until all the children have succeeded.
315 if (parent.propagate_up_at_count > 1) {
316 if (count.has_index(j)) {
317 c = count.get_existing(j) + 1;
318 count.set_existing(j, c);
319 } else {
320 c = 1;
321 count.set_new(j, c);
322 }
323 if (c < parent.propagate_up_at_count)
324 continue;
325 }
326 // Trigger the parent.
327 work.set(j, 1);
328 }
329 }
330 }
331
332 // Debugging help.
PrintPrefilter(int regexpid)333 void PrefilterTree::PrintPrefilter(int regexpid) {
334 ABSL_LOG(ERROR) << DebugNodeString(prefilter_vec_[regexpid]);
335 }
336
PrintDebugInfo(NodeSet * nodes)337 void PrefilterTree::PrintDebugInfo(NodeSet* nodes) {
338 ABSL_LOG(ERROR) << "#Unique Atoms: " << atom_index_to_id_.size();
339 ABSL_LOG(ERROR) << "#Unique Nodes: " << entries_.size();
340
341 for (size_t i = 0; i < entries_.size(); i++) {
342 const std::vector<int>& parents = entries_[i].parents;
343 const std::vector<int>& regexps = entries_[i].regexps;
344 ABSL_LOG(ERROR) << "EntryId: " << i
345 << " N: " << parents.size() << " R: " << regexps.size();
346 for (int parent : parents)
347 ABSL_LOG(ERROR) << parent;
348 }
349 ABSL_LOG(ERROR) << "Set:";
350 for (NodeSet::const_iterator iter = nodes->begin();
351 iter != nodes->end(); ++iter)
352 ABSL_LOG(ERROR) << "NodeId: " << (*iter)->unique_id();
353 }
354
DebugNodeString(Prefilter * node) const355 std::string PrefilterTree::DebugNodeString(Prefilter* node) const {
356 std::string node_string = "";
357 if (node->op() == Prefilter::ATOM) {
358 ABSL_DCHECK(!node->atom().empty());
359 node_string += node->atom();
360 } else {
361 // Adding the operation disambiguates AND and OR nodes.
362 node_string += node->op() == Prefilter::AND ? "AND" : "OR";
363 node_string += "(";
364 for (size_t i = 0; i < node->subs()->size(); i++) {
365 if (i > 0)
366 node_string += ',';
367 node_string += absl::StrFormat("%d", (*node->subs())[i]->unique_id());
368 node_string += ":";
369 node_string += DebugNodeString((*node->subs())[i]);
370 }
371 node_string += ")";
372 }
373 return node_string;
374 }
375
376 } // namespace re2
377