• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2009 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #include "re2/prefilter_tree.h"
6 
7 #include <stddef.h>
8 #include <algorithm>
9 #include <cmath>
10 #include <memory>
11 #include <string>
12 #include <utility>
13 #include <vector>
14 
15 #include "absl/strings/str_format.h"
16 #include "util/logging.h"
17 #include "re2/prefilter.h"
18 #include "re2/re2.h"
19 
20 namespace re2 {
21 
22 static const bool ExtraDebug = false;
23 
PrefilterTree()24 PrefilterTree::PrefilterTree()
25     : compiled_(false),
26       min_atom_len_(3) {
27 }
28 
PrefilterTree(int min_atom_len)29 PrefilterTree::PrefilterTree(int min_atom_len)
30     : compiled_(false),
31       min_atom_len_(min_atom_len) {
32 }
33 
~PrefilterTree()34 PrefilterTree::~PrefilterTree() {
35   for (size_t i = 0; i < prefilter_vec_.size(); i++)
36     delete prefilter_vec_[i];
37 }
38 
Add(Prefilter * prefilter)39 void PrefilterTree::Add(Prefilter* prefilter) {
40   if (compiled_) {
41     LOG(DFATAL) << "Add called after Compile.";
42     return;
43   }
44   if (prefilter != NULL && !KeepNode(prefilter)) {
45     delete prefilter;
46     prefilter = NULL;
47   }
48 
49   prefilter_vec_.push_back(prefilter);
50 }
51 
Compile(std::vector<std::string> * atom_vec)52 void PrefilterTree::Compile(std::vector<std::string>* atom_vec) {
53   if (compiled_) {
54     LOG(DFATAL) << "Compile called already.";
55     return;
56   }
57 
58   // Some legacy users of PrefilterTree call Compile() before
59   // adding any regexps and expect Compile() to have no effect.
60   if (prefilter_vec_.empty())
61     return;
62 
63   compiled_ = true;
64 
65   NodeSet nodes;
66   AssignUniqueIds(&nodes, atom_vec);
67   if (ExtraDebug)
68     PrintDebugInfo(&nodes);
69 }
70 
CanonicalNode(NodeSet * nodes,Prefilter * node)71 Prefilter* PrefilterTree::CanonicalNode(NodeSet* nodes, Prefilter* node) {
72   NodeSet::const_iterator iter = nodes->find(node);
73   if (iter != nodes->end()) {
74     return *iter;
75   }
76   return NULL;
77 }
78 
KeepNode(Prefilter * node) const79 bool PrefilterTree::KeepNode(Prefilter* node) const {
80   if (node == NULL)
81     return false;
82 
83   switch (node->op()) {
84     default:
85       LOG(DFATAL) << "Unexpected op in KeepNode: " << node->op();
86       return false;
87 
88     case Prefilter::ALL:
89     case Prefilter::NONE:
90       return false;
91 
92     case Prefilter::ATOM:
93       return node->atom().size() >= static_cast<size_t>(min_atom_len_);
94 
95     case Prefilter::AND: {
96       int j = 0;
97       std::vector<Prefilter*>* subs = node->subs();
98       for (size_t i = 0; i < subs->size(); i++)
99         if (KeepNode((*subs)[i]))
100           (*subs)[j++] = (*subs)[i];
101         else
102           delete (*subs)[i];
103 
104       subs->resize(j);
105       return j > 0;
106     }
107 
108     case Prefilter::OR:
109       for (size_t i = 0; i < node->subs()->size(); i++)
110         if (!KeepNode((*node->subs())[i]))
111           return false;
112       return true;
113   }
114 }
115 
AssignUniqueIds(NodeSet * nodes,std::vector<std::string> * atom_vec)116 void PrefilterTree::AssignUniqueIds(NodeSet* nodes,
117                                     std::vector<std::string>* atom_vec) {
118   atom_vec->clear();
119 
120   // Build vector of all filter nodes, sorted topologically
121   // from top to bottom in v.
122   std::vector<Prefilter*> v;
123 
124   // Add the top level nodes of each regexp prefilter.
125   for (size_t i = 0; i < prefilter_vec_.size(); i++) {
126     Prefilter* f = prefilter_vec_[i];
127     if (f == NULL)
128       unfiltered_.push_back(static_cast<int>(i));
129 
130     // We push NULL also on to v, so that we maintain the
131     // mapping of index==regexpid for level=0 prefilter nodes.
132     v.push_back(f);
133   }
134 
135   // Now add all the descendant nodes.
136   for (size_t i = 0; i < v.size(); i++) {
137     Prefilter* f = v[i];
138     if (f == NULL)
139       continue;
140     if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
141       const std::vector<Prefilter*>& subs = *f->subs();
142       for (size_t j = 0; j < subs.size(); j++)
143         v.push_back(subs[j]);
144     }
145   }
146 
147   // Identify unique nodes.
148   int unique_id = 0;
149   for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
150     Prefilter *node = v[i];
151     if (node == NULL)
152       continue;
153     node->set_unique_id(-1);
154     Prefilter* canonical = CanonicalNode(nodes, node);
155     if (canonical == NULL) {
156       // Any further nodes that have the same atom/subs
157       // will find this node as the canonical node.
158       nodes->emplace(node);
159       if (node->op() == Prefilter::ATOM) {
160         atom_vec->push_back(node->atom());
161         atom_index_to_id_.push_back(unique_id);
162       }
163       node->set_unique_id(unique_id++);
164     } else {
165       node->set_unique_id(canonical->unique_id());
166     }
167   }
168   entries_.resize(unique_id);
169 
170   // Fill the entries.
171   for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
172     Prefilter* prefilter = v[i];
173     if (prefilter == NULL)
174       continue;
175     if (CanonicalNode(nodes, prefilter) != prefilter)
176       continue;
177     int id = prefilter->unique_id();
178     switch (prefilter->op()) {
179       default:
180         LOG(DFATAL) << "Unexpected op: " << prefilter->op();
181         return;
182 
183       case Prefilter::ATOM:
184         entries_[id].propagate_up_at_count = 1;
185         break;
186 
187       case Prefilter::OR:
188       case Prefilter::AND: {
189         // For each child, we append our id to the child's list of
190         // parent ids... unless we happen to have done so already.
191         // The number of appends is the number of unique children,
192         // which allows correct upward propagation from AND nodes.
193         int up_count = 0;
194         for (size_t j = 0; j < prefilter->subs()->size(); j++) {
195           int child_id = (*prefilter->subs())[j]->unique_id();
196           std::vector<int>& parents = entries_[child_id].parents;
197           if (parents.empty() || parents.back() != id) {
198             parents.push_back(id);
199             up_count++;
200           }
201         }
202         entries_[id].propagate_up_at_count =
203             prefilter->op() == Prefilter::AND ? up_count : 1;
204         break;
205       }
206     }
207   }
208 
209   // For top level nodes, populate regexp id.
210   for (size_t i = 0; i < prefilter_vec_.size(); i++) {
211     if (prefilter_vec_[i] == NULL)
212       continue;
213     int id = CanonicalNode(nodes, prefilter_vec_[i])->unique_id();
214     DCHECK_LE(0, id);
215     Entry* entry = &entries_[id];
216     entry->regexps.push_back(static_cast<int>(i));
217   }
218 
219   // Lastly, using probability-based heuristics, we identify nodes
220   // that trigger too many parents and then we try to prune edges.
221   // We use logarithms below to avoid the likelihood of underflow.
222   double log_num_regexps = std::log(prefilter_vec_.size() - unfiltered_.size());
223   // Hoisted this above the loop so that we don't thrash the heap.
224   std::vector<std::pair<size_t, int>> entries_by_num_edges;
225   for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
226     Prefilter* prefilter = v[i];
227     // Pruning applies only to AND nodes because it "just" reduces
228     // precision; applied to OR nodes, it would break correctness.
229     if (prefilter == NULL || prefilter->op() != Prefilter::AND)
230       continue;
231     if (CanonicalNode(nodes, prefilter) != prefilter)
232       continue;
233     int id = prefilter->unique_id();
234 
235     // Sort the current node's children by the numbers of parents.
236     entries_by_num_edges.clear();
237     for (size_t j = 0; j < prefilter->subs()->size(); j++) {
238       int child_id = (*prefilter->subs())[j]->unique_id();
239       const std::vector<int>& parents = entries_[child_id].parents;
240       entries_by_num_edges.emplace_back(parents.size(), child_id);
241     }
242     std::stable_sort(entries_by_num_edges.begin(), entries_by_num_edges.end());
243 
244     // A running estimate of how many regexps will be triggered by
245     // pruning the remaining children's edges to the current node.
246     // Our nominal target is one, so the threshold is log(1) == 0;
247     // pruning occurs iff the child has more than nine edges left.
248     double log_num_triggered = log_num_regexps;
249     for (const auto& pair : entries_by_num_edges) {
250       int child_id = pair.second;
251       std::vector<int>& parents = entries_[child_id].parents;
252       if (log_num_triggered > 0.) {
253         log_num_triggered += std::log(parents.size());
254         log_num_triggered -= log_num_regexps;
255       } else if (parents.size() > 9) {
256         auto it = std::find(parents.begin(), parents.end(), id);
257         if (it != parents.end()) {
258           parents.erase(it);
259           entries_[id].propagate_up_at_count--;
260         }
261       }
262     }
263   }
264 }
265 
266 // Functions for triggering during search.
RegexpsGivenStrings(const std::vector<int> & matched_atoms,std::vector<int> * regexps) const267 void PrefilterTree::RegexpsGivenStrings(
268     const std::vector<int>& matched_atoms,
269     std::vector<int>* regexps) const {
270   regexps->clear();
271   if (!compiled_) {
272     // Some legacy users of PrefilterTree call Compile() before
273     // adding any regexps and expect Compile() to have no effect.
274     // This kludge is a counterpart to that kludge.
275     if (prefilter_vec_.empty())
276       return;
277 
278     LOG(ERROR) << "RegexpsGivenStrings called before Compile.";
279     for (size_t i = 0; i < prefilter_vec_.size(); i++)
280       regexps->push_back(static_cast<int>(i));
281   } else {
282     IntMap regexps_map(static_cast<int>(prefilter_vec_.size()));
283     std::vector<int> matched_atom_ids;
284     for (size_t j = 0; j < matched_atoms.size(); j++)
285       matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
286     PropagateMatch(matched_atom_ids, &regexps_map);
287     for (IntMap::const_iterator it = regexps_map.begin();
288          it != regexps_map.end();
289          ++it)
290       regexps->push_back(it->index());
291 
292     regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
293   }
294   std::sort(regexps->begin(), regexps->end());
295 }
296 
PropagateMatch(const std::vector<int> & atom_ids,IntMap * regexps) const297 void PrefilterTree::PropagateMatch(const std::vector<int>& atom_ids,
298                                    IntMap* regexps) const {
299   IntMap count(static_cast<int>(entries_.size()));
300   IntMap work(static_cast<int>(entries_.size()));
301   for (size_t i = 0; i < atom_ids.size(); i++)
302     work.set(atom_ids[i], 1);
303   for (IntMap::const_iterator it = work.begin(); it != work.end(); ++it) {
304     const Entry& entry = entries_[it->index()];
305     // Record regexps triggered.
306     for (size_t i = 0; i < entry.regexps.size(); i++)
307       regexps->set(entry.regexps[i], 1);
308     int c;
309     // Pass trigger up to parents.
310     for (int j : entry.parents) {
311       const Entry& parent = entries_[j];
312       // Delay until all the children have succeeded.
313       if (parent.propagate_up_at_count > 1) {
314         if (count.has_index(j)) {
315           c = count.get_existing(j) + 1;
316           count.set_existing(j, c);
317         } else {
318           c = 1;
319           count.set_new(j, c);
320         }
321         if (c < parent.propagate_up_at_count)
322           continue;
323       }
324       // Trigger the parent.
325       work.set(j, 1);
326     }
327   }
328 }
329 
330 // Debugging help.
PrintPrefilter(int regexpid)331 void PrefilterTree::PrintPrefilter(int regexpid) {
332   LOG(ERROR) << DebugNodeString(prefilter_vec_[regexpid]);
333 }
334 
PrintDebugInfo(NodeSet * nodes)335 void PrefilterTree::PrintDebugInfo(NodeSet* nodes) {
336   LOG(ERROR) << "#Unique Atoms: " << atom_index_to_id_.size();
337   LOG(ERROR) << "#Unique Nodes: " << entries_.size();
338 
339   for (size_t i = 0; i < entries_.size(); i++) {
340     const std::vector<int>& parents = entries_[i].parents;
341     const std::vector<int>& regexps = entries_[i].regexps;
342     LOG(ERROR) << "EntryId: " << i
343                << " N: " << parents.size() << " R: " << regexps.size();
344     for (int parent : parents)
345       LOG(ERROR) << parent;
346   }
347   LOG(ERROR) << "Set:";
348   for (NodeSet::const_iterator iter = nodes->begin();
349        iter != nodes->end(); ++iter)
350     LOG(ERROR) << "NodeId: " << (*iter)->unique_id();
351 }
352 
DebugNodeString(Prefilter * node) const353 std::string PrefilterTree::DebugNodeString(Prefilter* node) const {
354   std::string node_string = "";
355   if (node->op() == Prefilter::ATOM) {
356     DCHECK(!node->atom().empty());
357     node_string += node->atom();
358   } else {
359     // Adding the operation disambiguates AND and OR nodes.
360     node_string +=  node->op() == Prefilter::AND ? "AND" : "OR";
361     node_string += "(";
362     for (size_t i = 0; i < node->subs()->size(); i++) {
363       if (i > 0)
364         node_string += ',';
365       node_string += absl::StrFormat("%d", (*node->subs())[i]->unique_id());
366       node_string += ":";
367       node_string += DebugNodeString((*node->subs())[i]);
368     }
369     node_string += ")";
370   }
371   return node_string;
372 }
373 
374 }  // namespace re2
375