• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2009 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #include "re2/prefilter_tree.h"
6 
7 #include <stddef.h>
8 
9 #include <algorithm>
10 #include <cmath>
11 #include <string>
12 #include <utility>
13 #include <vector>
14 
15 #include "absl/log/absl_check.h"
16 #include "absl/log/absl_log.h"
17 #include "absl/strings/str_format.h"
18 #include "re2/prefilter.h"
19 
20 namespace re2 {
21 
22 static const bool ExtraDebug = false;
23 
PrefilterTree()24 PrefilterTree::PrefilterTree()
25     : compiled_(false),
26       min_atom_len_(3) {
27 }
28 
PrefilterTree(int min_atom_len)29 PrefilterTree::PrefilterTree(int min_atom_len)
30     : compiled_(false),
31       min_atom_len_(min_atom_len) {
32 }
33 
~PrefilterTree()34 PrefilterTree::~PrefilterTree() {
35   for (size_t i = 0; i < prefilter_vec_.size(); i++)
36     delete prefilter_vec_[i];
37 }
38 
Add(Prefilter * prefilter)39 void PrefilterTree::Add(Prefilter* prefilter) {
40   if (compiled_) {
41     ABSL_LOG(DFATAL) << "Add called after Compile.";
42     return;
43   }
44   if (prefilter != NULL && !KeepNode(prefilter)) {
45     delete prefilter;
46     prefilter = NULL;
47   }
48 
49   prefilter_vec_.push_back(prefilter);
50 }
51 
Compile(std::vector<std::string> * atom_vec)52 void PrefilterTree::Compile(std::vector<std::string>* atom_vec) {
53   if (compiled_) {
54     ABSL_LOG(DFATAL) << "Compile called already.";
55     return;
56   }
57 
58   // Some legacy users of PrefilterTree call Compile() before
59   // adding any regexps and expect Compile() to have no effect.
60   if (prefilter_vec_.empty()) {
61     return;
62   }
63 
64   compiled_ = true;
65 
66   NodeSet nodes;
67   AssignUniqueIds(&nodes, atom_vec);
68   if (ExtraDebug)
69     PrintDebugInfo(&nodes);
70 }
71 
CanonicalNode(NodeSet * nodes,Prefilter * node)72 Prefilter* PrefilterTree::CanonicalNode(NodeSet* nodes, Prefilter* node) {
73   NodeSet::const_iterator iter = nodes->find(node);
74   if (iter != nodes->end()) {
75     return *iter;
76   }
77   return NULL;
78 }
79 
KeepNode(Prefilter * node) const80 bool PrefilterTree::KeepNode(Prefilter* node) const {
81   if (node == NULL)
82     return false;
83 
84   switch (node->op()) {
85     default:
86       ABSL_LOG(DFATAL) << "Unexpected op in KeepNode: " << node->op();
87       return false;
88 
89     case Prefilter::ALL:
90     case Prefilter::NONE:
91       return false;
92 
93     case Prefilter::ATOM:
94       return node->atom().size() >= static_cast<size_t>(min_atom_len_);
95 
96     case Prefilter::AND: {
97       int j = 0;
98       std::vector<Prefilter*>* subs = node->subs();
99       for (size_t i = 0; i < subs->size(); i++)
100         if (KeepNode((*subs)[i]))
101           (*subs)[j++] = (*subs)[i];
102         else
103           delete (*subs)[i];
104 
105       subs->resize(j);
106       return j > 0;
107     }
108 
109     case Prefilter::OR:
110       for (size_t i = 0; i < node->subs()->size(); i++)
111         if (!KeepNode((*node->subs())[i]))
112           return false;
113       return true;
114   }
115 }
116 
AssignUniqueIds(NodeSet * nodes,std::vector<std::string> * atom_vec)117 void PrefilterTree::AssignUniqueIds(NodeSet* nodes,
118                                     std::vector<std::string>* atom_vec) {
119   atom_vec->clear();
120 
121   // Build vector of all filter nodes, sorted topologically
122   // from top to bottom in v.
123   std::vector<Prefilter*> v;
124 
125   // Add the top level nodes of each regexp prefilter.
126   for (size_t i = 0; i < prefilter_vec_.size(); i++) {
127     Prefilter* f = prefilter_vec_[i];
128     if (f == NULL)
129       unfiltered_.push_back(static_cast<int>(i));
130 
131     // We push NULL also on to v, so that we maintain the
132     // mapping of index==regexpid for level=0 prefilter nodes.
133     v.push_back(f);
134   }
135 
136   // Now add all the descendant nodes.
137   for (size_t i = 0; i < v.size(); i++) {
138     Prefilter* f = v[i];
139     if (f == NULL)
140       continue;
141     if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
142       const std::vector<Prefilter*>& subs = *f->subs();
143       for (size_t j = 0; j < subs.size(); j++)
144         v.push_back(subs[j]);
145     }
146   }
147 
148   // Identify unique nodes.
149   int unique_id = 0;
150   for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
151     Prefilter *node = v[i];
152     if (node == NULL)
153       continue;
154     node->set_unique_id(-1);
155     Prefilter* canonical = CanonicalNode(nodes, node);
156     if (canonical == NULL) {
157       // Any further nodes that have the same atom/subs
158       // will find this node as the canonical node.
159       nodes->emplace(node);
160       if (node->op() == Prefilter::ATOM) {
161         atom_vec->push_back(node->atom());
162         atom_index_to_id_.push_back(unique_id);
163       }
164       node->set_unique_id(unique_id++);
165     } else {
166       node->set_unique_id(canonical->unique_id());
167     }
168   }
169   entries_.resize(unique_id);
170 
171   // Fill the entries.
172   for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
173     Prefilter* prefilter = v[i];
174     if (prefilter == NULL)
175       continue;
176     if (CanonicalNode(nodes, prefilter) != prefilter)
177       continue;
178     int id = prefilter->unique_id();
179     switch (prefilter->op()) {
180       default:
181         ABSL_LOG(DFATAL) << "Unexpected op: " << prefilter->op();
182         return;
183 
184       case Prefilter::ATOM:
185         entries_[id].propagate_up_at_count = 1;
186         break;
187 
188       case Prefilter::OR:
189       case Prefilter::AND: {
190         // For each child, we append our id to the child's list of
191         // parent ids... unless we happen to have done so already.
192         // The number of appends is the number of unique children,
193         // which allows correct upward propagation from AND nodes.
194         int up_count = 0;
195         for (size_t j = 0; j < prefilter->subs()->size(); j++) {
196           int child_id = (*prefilter->subs())[j]->unique_id();
197           std::vector<int>& parents = entries_[child_id].parents;
198           if (parents.empty() || parents.back() != id) {
199             parents.push_back(id);
200             up_count++;
201           }
202         }
203         entries_[id].propagate_up_at_count =
204             prefilter->op() == Prefilter::AND ? up_count : 1;
205         break;
206       }
207     }
208   }
209 
210   // For top level nodes, populate regexp id.
211   for (size_t i = 0; i < prefilter_vec_.size(); i++) {
212     if (prefilter_vec_[i] == NULL)
213       continue;
214     int id = CanonicalNode(nodes, prefilter_vec_[i])->unique_id();
215     ABSL_DCHECK_LE(0, id);
216     Entry* entry = &entries_[id];
217     entry->regexps.push_back(static_cast<int>(i));
218   }
219 
220   // Lastly, using probability-based heuristics, we identify nodes
221   // that trigger too many parents and then we try to prune edges.
222   // We use logarithms below to avoid the likelihood of underflow.
223   double log_num_regexps = std::log(prefilter_vec_.size() - unfiltered_.size());
224   // Hoisted this above the loop so that we don't thrash the heap.
225   std::vector<std::pair<size_t, int>> entries_by_num_edges;
226   for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
227     Prefilter* prefilter = v[i];
228     // Pruning applies only to AND nodes because it "just" reduces
229     // precision; applied to OR nodes, it would break correctness.
230     if (prefilter == NULL || prefilter->op() != Prefilter::AND)
231       continue;
232     if (CanonicalNode(nodes, prefilter) != prefilter)
233       continue;
234     int id = prefilter->unique_id();
235 
236     // Sort the current node's children by the numbers of parents.
237     entries_by_num_edges.clear();
238     for (size_t j = 0; j < prefilter->subs()->size(); j++) {
239       int child_id = (*prefilter->subs())[j]->unique_id();
240       const std::vector<int>& parents = entries_[child_id].parents;
241       entries_by_num_edges.emplace_back(parents.size(), child_id);
242     }
243     std::stable_sort(entries_by_num_edges.begin(), entries_by_num_edges.end());
244 
245     // A running estimate of how many regexps will be triggered by
246     // pruning the remaining children's edges to the current node.
247     // Our nominal target is one, so the threshold is log(1) == 0;
248     // pruning occurs iff the child has more than nine edges left.
249     double log_num_triggered = log_num_regexps;
250     for (const auto& pair : entries_by_num_edges) {
251       int child_id = pair.second;
252       std::vector<int>& parents = entries_[child_id].parents;
253       if (log_num_triggered > 0.) {
254         log_num_triggered += std::log(parents.size());
255         log_num_triggered -= log_num_regexps;
256       } else if (parents.size() > 9) {
257         auto it = std::find(parents.begin(), parents.end(), id);
258         if (it != parents.end()) {
259           parents.erase(it);
260           entries_[id].propagate_up_at_count--;
261         }
262       }
263     }
264   }
265 }
266 
267 // Functions for triggering during search.
RegexpsGivenStrings(const std::vector<int> & matched_atoms,std::vector<int> * regexps) const268 void PrefilterTree::RegexpsGivenStrings(
269     const std::vector<int>& matched_atoms,
270     std::vector<int>* regexps) const {
271   regexps->clear();
272   if (!compiled_) {
273     // Some legacy users of PrefilterTree call Compile() before
274     // adding any regexps and expect Compile() to have no effect.
275     // This kludge is a counterpart to that kludge.
276     if (prefilter_vec_.empty()) {
277       return;
278     }
279 
280     ABSL_LOG(ERROR) << "RegexpsGivenStrings called before Compile.";
281     for (size_t i = 0; i < prefilter_vec_.size(); i++)
282       regexps->push_back(static_cast<int>(i));
283   } else {
284     IntMap regexps_map(static_cast<int>(prefilter_vec_.size()));
285     std::vector<int> matched_atom_ids;
286     for (size_t j = 0; j < matched_atoms.size(); j++)
287       matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
288     PropagateMatch(matched_atom_ids, &regexps_map);
289     for (IntMap::const_iterator it = regexps_map.begin();
290          it != regexps_map.end();
291          ++it)
292       regexps->push_back(it->index());
293 
294     regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
295   }
296   std::sort(regexps->begin(), regexps->end());
297 }
298 
PropagateMatch(const std::vector<int> & atom_ids,IntMap * regexps) const299 void PrefilterTree::PropagateMatch(const std::vector<int>& atom_ids,
300                                    IntMap* regexps) const {
301   IntMap count(static_cast<int>(entries_.size()));
302   IntMap work(static_cast<int>(entries_.size()));
303   for (size_t i = 0; i < atom_ids.size(); i++)
304     work.set(atom_ids[i], 1);
305   for (IntMap::const_iterator it = work.begin(); it != work.end(); ++it) {
306     const Entry& entry = entries_[it->index()];
307     // Record regexps triggered.
308     for (size_t i = 0; i < entry.regexps.size(); i++)
309       regexps->set(entry.regexps[i], 1);
310     int c;
311     // Pass trigger up to parents.
312     for (int j : entry.parents) {
313       const Entry& parent = entries_[j];
314       // Delay until all the children have succeeded.
315       if (parent.propagate_up_at_count > 1) {
316         if (count.has_index(j)) {
317           c = count.get_existing(j) + 1;
318           count.set_existing(j, c);
319         } else {
320           c = 1;
321           count.set_new(j, c);
322         }
323         if (c < parent.propagate_up_at_count)
324           continue;
325       }
326       // Trigger the parent.
327       work.set(j, 1);
328     }
329   }
330 }
331 
332 // Debugging help.
PrintPrefilter(int regexpid)333 void PrefilterTree::PrintPrefilter(int regexpid) {
334   ABSL_LOG(ERROR) << DebugNodeString(prefilter_vec_[regexpid]);
335 }
336 
PrintDebugInfo(NodeSet * nodes)337 void PrefilterTree::PrintDebugInfo(NodeSet* nodes) {
338   ABSL_LOG(ERROR) << "#Unique Atoms: " << atom_index_to_id_.size();
339   ABSL_LOG(ERROR) << "#Unique Nodes: " << entries_.size();
340 
341   for (size_t i = 0; i < entries_.size(); i++) {
342     const std::vector<int>& parents = entries_[i].parents;
343     const std::vector<int>& regexps = entries_[i].regexps;
344     ABSL_LOG(ERROR) << "EntryId: " << i
345                     << " N: " << parents.size() << " R: " << regexps.size();
346     for (int parent : parents)
347       ABSL_LOG(ERROR) << parent;
348   }
349   ABSL_LOG(ERROR) << "Set:";
350   for (NodeSet::const_iterator iter = nodes->begin();
351        iter != nodes->end(); ++iter)
352     ABSL_LOG(ERROR) << "NodeId: " << (*iter)->unique_id();
353 }
354 
DebugNodeString(Prefilter * node) const355 std::string PrefilterTree::DebugNodeString(Prefilter* node) const {
356   std::string node_string = "";
357   if (node->op() == Prefilter::ATOM) {
358     ABSL_DCHECK(!node->atom().empty());
359     node_string += node->atom();
360   } else {
361     // Adding the operation disambiguates AND and OR nodes.
362     node_string +=  node->op() == Prefilter::AND ? "AND" : "OR";
363     node_string += "(";
364     for (size_t i = 0; i < node->subs()->size(); i++) {
365       if (i > 0)
366         node_string += ',';
367       node_string += absl::StrFormat("%d", (*node->subs())[i]->unique_id());
368       node_string += ":";
369       node_string += DebugNodeString((*node->subs())[i]);
370     }
371     node_string += ")";
372   }
373   return node_string;
374 }
375 
376 }  // namespace re2
377