1 // Copyright 2009 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "re2/prefilter_tree.h"
6
7 #include <stddef.h>
8 #include <algorithm>
9 #include <map>
10 #include <memory>
11 #include <set>
12 #include <string>
13 #include <utility>
14 #include <vector>
15
16 #include "util/util.h"
17 #include "util/logging.h"
18 #include "util/strutil.h"
19 #include "re2/prefilter.h"
20 #include "re2/re2.h"
21
22 namespace re2 {
23
24 static const bool ExtraDebug = false;
25
PrefilterTree()26 PrefilterTree::PrefilterTree()
27 : compiled_(false),
28 min_atom_len_(3) {
29 }
30
PrefilterTree(int min_atom_len)31 PrefilterTree::PrefilterTree(int min_atom_len)
32 : compiled_(false),
33 min_atom_len_(min_atom_len) {
34 }
35
~PrefilterTree()36 PrefilterTree::~PrefilterTree() {
37 for (size_t i = 0; i < prefilter_vec_.size(); i++)
38 delete prefilter_vec_[i];
39
40 for (size_t i = 0; i < entries_.size(); i++)
41 delete entries_[i].parents;
42 }
43
Add(Prefilter * prefilter)44 void PrefilterTree::Add(Prefilter* prefilter) {
45 if (compiled_) {
46 LOG(DFATAL) << "Add called after Compile.";
47 return;
48 }
49 if (prefilter != NULL && !KeepNode(prefilter)) {
50 delete prefilter;
51 prefilter = NULL;
52 }
53
54 prefilter_vec_.push_back(prefilter);
55 }
56
Compile(std::vector<std::string> * atom_vec)57 void PrefilterTree::Compile(std::vector<std::string>* atom_vec) {
58 if (compiled_) {
59 LOG(DFATAL) << "Compile called already.";
60 return;
61 }
62
63 // Some legacy users of PrefilterTree call Compile() before
64 // adding any regexps and expect Compile() to have no effect.
65 if (prefilter_vec_.empty())
66 return;
67
68 compiled_ = true;
69
70 // TODO(junyer): Use std::unordered_set<Prefilter*> instead?
71 NodeMap nodes;
72 AssignUniqueIds(&nodes, atom_vec);
73
74 // Identify nodes that are too common among prefilters and are
75 // triggering too many parents. Then get rid of them if possible.
76 // Note that getting rid of a prefilter node simply means they are
77 // no longer necessary for their parent to trigger; that is, we do
78 // not miss out on any regexps triggering by getting rid of a
79 // prefilter node.
80 for (size_t i = 0; i < entries_.size(); i++) {
81 StdIntMap* parents = entries_[i].parents;
82 if (parents->size() > 8) {
83 // This one triggers too many things. If all the parents are AND
84 // nodes and have other things guarding them, then get rid of
85 // this trigger. TODO(vsri): Adjust the threshold appropriately,
86 // make it a function of total number of nodes?
87 bool have_other_guard = true;
88 for (StdIntMap::iterator it = parents->begin();
89 it != parents->end(); ++it) {
90 have_other_guard = have_other_guard &&
91 (entries_[it->first].propagate_up_at_count > 1);
92 }
93
94 if (have_other_guard) {
95 for (StdIntMap::iterator it = parents->begin();
96 it != parents->end(); ++it)
97 entries_[it->first].propagate_up_at_count -= 1;
98
99 parents->clear(); // Forget the parents
100 }
101 }
102 }
103
104 if (ExtraDebug)
105 PrintDebugInfo(&nodes);
106 }
107
CanonicalNode(NodeMap * nodes,Prefilter * node)108 Prefilter* PrefilterTree::CanonicalNode(NodeMap* nodes, Prefilter* node) {
109 std::string node_string = NodeString(node);
110 NodeMap::iterator iter = nodes->find(node_string);
111 if (iter == nodes->end())
112 return NULL;
113 return (*iter).second;
114 }
115
NodeString(Prefilter * node) const116 std::string PrefilterTree::NodeString(Prefilter* node) const {
117 // Adding the operation disambiguates AND/OR/atom nodes.
118 std::string s = StringPrintf("%d", node->op()) + ":";
119 if (node->op() == Prefilter::ATOM) {
120 s += node->atom();
121 } else {
122 for (size_t i = 0; i < node->subs()->size(); i++) {
123 if (i > 0)
124 s += ',';
125 s += StringPrintf("%d", (*node->subs())[i]->unique_id());
126 }
127 }
128 return s;
129 }
130
KeepNode(Prefilter * node) const131 bool PrefilterTree::KeepNode(Prefilter* node) const {
132 if (node == NULL)
133 return false;
134
135 switch (node->op()) {
136 default:
137 LOG(DFATAL) << "Unexpected op in KeepNode: " << node->op();
138 return false;
139
140 case Prefilter::ALL:
141 case Prefilter::NONE:
142 return false;
143
144 case Prefilter::ATOM:
145 return node->atom().size() >= static_cast<size_t>(min_atom_len_);
146
147 case Prefilter::AND: {
148 int j = 0;
149 std::vector<Prefilter*>* subs = node->subs();
150 for (size_t i = 0; i < subs->size(); i++)
151 if (KeepNode((*subs)[i]))
152 (*subs)[j++] = (*subs)[i];
153 else
154 delete (*subs)[i];
155
156 subs->resize(j);
157 return j > 0;
158 }
159
160 case Prefilter::OR:
161 for (size_t i = 0; i < node->subs()->size(); i++)
162 if (!KeepNode((*node->subs())[i]))
163 return false;
164 return true;
165 }
166 }
167
AssignUniqueIds(NodeMap * nodes,std::vector<std::string> * atom_vec)168 void PrefilterTree::AssignUniqueIds(NodeMap* nodes,
169 std::vector<std::string>* atom_vec) {
170 atom_vec->clear();
171
172 // Build vector of all filter nodes, sorted topologically
173 // from top to bottom in v.
174 std::vector<Prefilter*> v;
175
176 // Add the top level nodes of each regexp prefilter.
177 for (size_t i = 0; i < prefilter_vec_.size(); i++) {
178 Prefilter* f = prefilter_vec_[i];
179 if (f == NULL)
180 unfiltered_.push_back(static_cast<int>(i));
181
182 // We push NULL also on to v, so that we maintain the
183 // mapping of index==regexpid for level=0 prefilter nodes.
184 v.push_back(f);
185 }
186
187 // Now add all the descendant nodes.
188 for (size_t i = 0; i < v.size(); i++) {
189 Prefilter* f = v[i];
190 if (f == NULL)
191 continue;
192 if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
193 const std::vector<Prefilter*>& subs = *f->subs();
194 for (size_t j = 0; j < subs.size(); j++)
195 v.push_back(subs[j]);
196 }
197 }
198
199 // Identify unique nodes.
200 int unique_id = 0;
201 for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
202 Prefilter *node = v[i];
203 if (node == NULL)
204 continue;
205 node->set_unique_id(-1);
206 Prefilter* canonical = CanonicalNode(nodes, node);
207 if (canonical == NULL) {
208 // Any further nodes that have the same node string
209 // will find this node as the canonical node.
210 nodes->emplace(NodeString(node), node);
211 if (node->op() == Prefilter::ATOM) {
212 atom_vec->push_back(node->atom());
213 atom_index_to_id_.push_back(unique_id);
214 }
215 node->set_unique_id(unique_id++);
216 } else {
217 node->set_unique_id(canonical->unique_id());
218 }
219 }
220 entries_.resize(nodes->size());
221
222 // Create parent StdIntMap for the entries.
223 for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
224 Prefilter* prefilter = v[i];
225 if (prefilter == NULL)
226 continue;
227
228 if (CanonicalNode(nodes, prefilter) != prefilter)
229 continue;
230
231 Entry* entry = &entries_[prefilter->unique_id()];
232 entry->parents = new StdIntMap();
233 }
234
235 // Fill the entries.
236 for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
237 Prefilter* prefilter = v[i];
238 if (prefilter == NULL)
239 continue;
240
241 if (CanonicalNode(nodes, prefilter) != prefilter)
242 continue;
243
244 Entry* entry = &entries_[prefilter->unique_id()];
245
246 switch (prefilter->op()) {
247 default:
248 case Prefilter::ALL:
249 LOG(DFATAL) << "Unexpected op: " << prefilter->op();
250 return;
251
252 case Prefilter::ATOM:
253 entry->propagate_up_at_count = 1;
254 break;
255
256 case Prefilter::OR:
257 case Prefilter::AND: {
258 std::set<int> uniq_child;
259 for (size_t j = 0; j < prefilter->subs()->size(); j++) {
260 Prefilter* child = (*prefilter->subs())[j];
261 Prefilter* canonical = CanonicalNode(nodes, child);
262 if (canonical == NULL) {
263 LOG(DFATAL) << "Null canonical node";
264 return;
265 }
266 int child_id = canonical->unique_id();
267 uniq_child.insert(child_id);
268 // To the child, we want to add to parent indices.
269 Entry* child_entry = &entries_[child_id];
270 if (child_entry->parents->find(prefilter->unique_id()) ==
271 child_entry->parents->end()) {
272 (*child_entry->parents)[prefilter->unique_id()] = 1;
273 }
274 }
275 entry->propagate_up_at_count = prefilter->op() == Prefilter::AND
276 ? static_cast<int>(uniq_child.size())
277 : 1;
278
279 break;
280 }
281 }
282 }
283
284 // For top level nodes, populate regexp id.
285 for (size_t i = 0; i < prefilter_vec_.size(); i++) {
286 if (prefilter_vec_[i] == NULL)
287 continue;
288 int id = CanonicalNode(nodes, prefilter_vec_[i])->unique_id();
289 DCHECK_LE(0, id);
290 Entry* entry = &entries_[id];
291 entry->regexps.push_back(static_cast<int>(i));
292 }
293 }
294
295 // Functions for triggering during search.
RegexpsGivenStrings(const std::vector<int> & matched_atoms,std::vector<int> * regexps) const296 void PrefilterTree::RegexpsGivenStrings(
297 const std::vector<int>& matched_atoms,
298 std::vector<int>* regexps) const {
299 regexps->clear();
300 if (!compiled_) {
301 // Some legacy users of PrefilterTree call Compile() before
302 // adding any regexps and expect Compile() to have no effect.
303 // This kludge is a counterpart to that kludge.
304 if (prefilter_vec_.empty())
305 return;
306
307 LOG(ERROR) << "RegexpsGivenStrings called before Compile.";
308 for (size_t i = 0; i < prefilter_vec_.size(); i++)
309 regexps->push_back(static_cast<int>(i));
310 } else {
311 IntMap regexps_map(static_cast<int>(prefilter_vec_.size()));
312 std::vector<int> matched_atom_ids;
313 for (size_t j = 0; j < matched_atoms.size(); j++)
314 matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
315 PropagateMatch(matched_atom_ids, ®exps_map);
316 for (IntMap::iterator it = regexps_map.begin();
317 it != regexps_map.end();
318 ++it)
319 regexps->push_back(it->index());
320
321 regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
322 }
323 std::sort(regexps->begin(), regexps->end());
324 }
325
PropagateMatch(const std::vector<int> & atom_ids,IntMap * regexps) const326 void PrefilterTree::PropagateMatch(const std::vector<int>& atom_ids,
327 IntMap* regexps) const {
328 IntMap count(static_cast<int>(entries_.size()));
329 IntMap work(static_cast<int>(entries_.size()));
330 for (size_t i = 0; i < atom_ids.size(); i++)
331 work.set(atom_ids[i], 1);
332 for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
333 const Entry& entry = entries_[it->index()];
334 // Record regexps triggered.
335 for (size_t i = 0; i < entry.regexps.size(); i++)
336 regexps->set(entry.regexps[i], 1);
337 int c;
338 // Pass trigger up to parents.
339 for (StdIntMap::iterator it = entry.parents->begin();
340 it != entry.parents->end();
341 ++it) {
342 int j = it->first;
343 const Entry& parent = entries_[j];
344 // Delay until all the children have succeeded.
345 if (parent.propagate_up_at_count > 1) {
346 if (count.has_index(j)) {
347 c = count.get_existing(j) + 1;
348 count.set_existing(j, c);
349 } else {
350 c = 1;
351 count.set_new(j, c);
352 }
353 if (c < parent.propagate_up_at_count)
354 continue;
355 }
356 // Trigger the parent.
357 work.set(j, 1);
358 }
359 }
360 }
361
362 // Debugging help.
PrintPrefilter(int regexpid)363 void PrefilterTree::PrintPrefilter(int regexpid) {
364 LOG(ERROR) << DebugNodeString(prefilter_vec_[regexpid]);
365 }
366
PrintDebugInfo(NodeMap * nodes)367 void PrefilterTree::PrintDebugInfo(NodeMap* nodes) {
368 LOG(ERROR) << "#Unique Atoms: " << atom_index_to_id_.size();
369 LOG(ERROR) << "#Unique Nodes: " << entries_.size();
370
371 for (size_t i = 0; i < entries_.size(); i++) {
372 StdIntMap* parents = entries_[i].parents;
373 const std::vector<int>& regexps = entries_[i].regexps;
374 LOG(ERROR) << "EntryId: " << i
375 << " N: " << parents->size() << " R: " << regexps.size();
376 for (StdIntMap::iterator it = parents->begin(); it != parents->end(); ++it)
377 LOG(ERROR) << it->first;
378 }
379 LOG(ERROR) << "Map:";
380 for (NodeMap::const_iterator iter = nodes->begin();
381 iter != nodes->end(); ++iter)
382 LOG(ERROR) << "NodeId: " << (*iter).second->unique_id()
383 << " Str: " << (*iter).first;
384 }
385
DebugNodeString(Prefilter * node) const386 std::string PrefilterTree::DebugNodeString(Prefilter* node) const {
387 std::string node_string = "";
388 if (node->op() == Prefilter::ATOM) {
389 DCHECK(!node->atom().empty());
390 node_string += node->atom();
391 } else {
392 // Adding the operation disambiguates AND and OR nodes.
393 node_string += node->op() == Prefilter::AND ? "AND" : "OR";
394 node_string += "(";
395 for (size_t i = 0; i < node->subs()->size(); i++) {
396 if (i > 0)
397 node_string += ',';
398 node_string += StringPrintf("%d", (*node->subs())[i]->unique_id());
399 node_string += ":";
400 node_string += DebugNodeString((*node->subs())[i]);
401 }
402 node_string += ")";
403 }
404 return node_string;
405 }
406
407 } // namespace re2
408