1 // Copyright 2009 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "util/util.h"
6 #include "util/flags.h"
7 #include "re2/prefilter.h"
8 #include "re2/prefilter_tree.h"
9 #include "re2/re2.h"
10
11 DEFINE_int32(filtered_re2_min_atom_len,
12 3,
13 "Strings less than this length are not stored as atoms");
14
15 namespace re2 {
16
PrefilterTree()17 PrefilterTree::PrefilterTree()
18 : compiled_(false) {
19 }
20
~PrefilterTree()21 PrefilterTree::~PrefilterTree() {
22 for (int i = 0; i < prefilter_vec_.size(); i++)
23 delete prefilter_vec_[i];
24
25 for (int i = 0; i < entries_.size(); i++)
26 delete entries_[i].parents;
27 }
28
29 // Functions used for adding and Compiling prefilters to the
30 // PrefilterTree.
KeepPart(Prefilter * prefilter,int level)31 static bool KeepPart(Prefilter* prefilter, int level) {
32 if (prefilter == NULL)
33 return false;
34
35 switch (prefilter->op()) {
36 default:
37 LOG(DFATAL) << "Unexpected op in KeepPart: "
38 << prefilter->op();
39 return false;
40
41 case Prefilter::ALL:
42 return false;
43
44 case Prefilter::ATOM:
45 return prefilter->atom().size() >=
46 FLAGS_filtered_re2_min_atom_len;
47
48 case Prefilter::AND: {
49 int j = 0;
50 vector<Prefilter*>* subs = prefilter->subs();
51 for (int i = 0; i < subs->size(); i++)
52 if (KeepPart((*subs)[i], level + 1))
53 (*subs)[j++] = (*subs)[i];
54 else
55 delete (*subs)[i];
56
57 subs->resize(j);
58 return j > 0;
59 }
60
61 case Prefilter::OR:
62 for (int i = 0; i < prefilter->subs()->size(); i++)
63 if (!KeepPart((*prefilter->subs())[i], level + 1))
64 return false;
65 return true;
66 }
67 }
68
Add(Prefilter * f)69 void PrefilterTree::Add(Prefilter *f) {
70 if (compiled_) {
71 LOG(DFATAL) << "Add after Compile.";
72 return;
73 }
74 if (f != NULL && !KeepPart(f, 0)) {
75 delete f;
76 f = NULL;
77 }
78
79 prefilter_vec_.push_back(f);
80 }
81
Compile(vector<string> * atom_vec)82 void PrefilterTree::Compile(vector<string>* atom_vec) {
83 if (compiled_) {
84 LOG(DFATAL) << "Compile after Compile.";
85 return;
86 }
87
88 // We do this check to support some legacy uses of
89 // PrefilterTree that call Compile before adding any regexps,
90 // and expect Compile not to have effect.
91 if (prefilter_vec_.empty())
92 return;
93
94 compiled_ = true;
95
96 AssignUniqueIds(atom_vec);
97
98 // Identify nodes that are too common among prefilters and are
99 // triggering too many parents. Then get rid of them if possible.
100 // Note that getting rid of a prefilter node simply means they are
101 // no longer necessary for their parent to trigger; that is, we do
102 // not miss out on any regexps triggering by getting rid of a
103 // prefilter node.
104 for (int i = 0; i < entries_.size(); i++) {
105 IntMap* parents = entries_[i].parents;
106 if (parents->size() > 8) {
107 // This one triggers too many things. If all the parents are AND
108 // nodes and have other things guarding them, then get rid of
109 // this trigger. TODO(vsri): Adjust the threshold appropriately,
110 // make it a function of total number of nodes?
111 bool have_other_guard = true;
112 for (IntMap::iterator it = parents->begin(); it != parents->end(); ++it)
113 have_other_guard = have_other_guard &&
114 (entries_[it->index()].propagate_up_at_count > 1);
115
116 if (have_other_guard) {
117 for (IntMap::iterator it = parents->begin();
118 it != parents->end(); ++it)
119 entries_[it->index()].propagate_up_at_count -= 1;
120
121 parents->clear(); // Forget the parents
122 }
123 }
124 }
125
126 PrintDebugInfo();
127 }
128
CanonicalNode(Prefilter * node)129 Prefilter* PrefilterTree::CanonicalNode(Prefilter* node) {
130 string node_string = NodeString(node);
131 map<string, Prefilter*>::iterator iter = node_map_.find(node_string);
132 if (iter == node_map_.end())
133 return NULL;
134 return (*iter).second;
135 }
136
Itoa(int n)137 static string Itoa(int n) {
138 char buf[100];
139 snprintf(buf, sizeof buf, "%d", n);
140 return string(buf);
141 }
142
NodeString(Prefilter * node) const143 string PrefilterTree::NodeString(Prefilter* node) const {
144 // Adding the operation disambiguates AND/OR/atom nodes.
145 string s = Itoa(node->op()) + ":";
146 if (node->op() == Prefilter::ATOM) {
147 s += node->atom();
148 } else {
149 for (int i = 0; i < node->subs()->size() ; i++) {
150 if (i > 0)
151 s += ',';
152 s += Itoa((*node->subs())[i]->unique_id());
153 }
154 }
155 return s;
156 }
157
AssignUniqueIds(vector<string> * atom_vec)158 void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
159 atom_vec->clear();
160
161 // Build vector of all filter nodes, sorted topologically
162 // from top to bottom in v.
163 vector<Prefilter*> v;
164
165 // Add the top level nodes of each regexp prefilter.
166 for (int i = 0; i < prefilter_vec_.size(); i++) {
167 Prefilter* f = prefilter_vec_[i];
168 if (f == NULL)
169 unfiltered_.push_back(i);
170
171 // We push NULL also on to v, so that we maintain the
172 // mapping of index==regexpid for level=0 prefilter nodes.
173 v.push_back(f);
174 }
175
176 // Now add all the descendant nodes.
177 for (int i = 0; i < v.size(); i++) {
178 Prefilter* f = v[i];
179 if (f == NULL)
180 continue;
181 if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
182 const vector<Prefilter*>& subs = *f->subs();
183 for (int j = 0; j < subs.size(); j++)
184 v.push_back(subs[j]);
185 }
186 }
187
188 // Identify unique nodes.
189 int unique_id = 0;
190 for (int i = v.size() - 1; i >= 0; i--) {
191 Prefilter *node = v[i];
192 if (node == NULL)
193 continue;
194 node->set_unique_id(-1);
195 Prefilter* canonical = CanonicalNode(node);
196 if (canonical == NULL) {
197 // Any further nodes that have the same node string
198 // will find this node as the canonical node.
199 node_map_[NodeString(node)] = node;
200 if (node->op() == Prefilter::ATOM) {
201 atom_vec->push_back(node->atom());
202 atom_index_to_id_.push_back(unique_id);
203 }
204 node->set_unique_id(unique_id++);
205 } else {
206 node->set_unique_id(canonical->unique_id());
207 }
208 }
209 entries_.resize(node_map_.size());
210
211 // Create parent IntMap for the entries.
212 for (int i = v.size() - 1; i >= 0; i--) {
213 Prefilter* prefilter = v[i];
214 if (prefilter == NULL)
215 continue;
216
217 if (CanonicalNode(prefilter) != prefilter)
218 continue;
219
220 Entry* entry = &entries_[prefilter->unique_id()];
221 entry->parents = new IntMap(node_map_.size());
222 }
223
224 // Fill the entries.
225 for (int i = v.size() - 1; i >= 0; i--) {
226 Prefilter* prefilter = v[i];
227 if (prefilter == NULL)
228 continue;
229
230 if (CanonicalNode(prefilter) != prefilter)
231 continue;
232
233 Entry* entry = &entries_[prefilter->unique_id()];
234
235 switch (prefilter->op()) {
236 default:
237 case Prefilter::ALL:
238 LOG(DFATAL) << "Unexpected op: " << prefilter->op();
239 return;
240
241 case Prefilter::ATOM:
242 entry->propagate_up_at_count = 1;
243 break;
244
245 case Prefilter::OR:
246 case Prefilter::AND: {
247 IntMap uniq_child(node_map_.size());
248 for (int j = 0; j < prefilter->subs()->size() ; j++) {
249 Prefilter* child = (*prefilter->subs())[j];
250 Prefilter* canonical = CanonicalNode(child);
251 if (canonical == NULL) {
252 LOG(DFATAL) << "Null canonical node";
253 return;
254 }
255 int child_id = canonical->unique_id();
256 if (!uniq_child.has_index(child_id))
257 uniq_child.set_new(child_id, 1);
258 // To the child, we want to add to parent indices.
259 Entry* child_entry = &entries_[child_id];
260 if (!child_entry->parents->has_index(prefilter->unique_id()))
261 child_entry->parents->set_new(prefilter->unique_id(), 1);
262 }
263 entry->propagate_up_at_count =
264 prefilter->op() == Prefilter::AND ? uniq_child.size() : 1;
265
266 break;
267 }
268 }
269 }
270
271 // For top level nodes, populate regexp id.
272 for (int i = 0; i < prefilter_vec_.size(); i++) {
273 if (prefilter_vec_[i] == NULL)
274 continue;
275 int id = CanonicalNode(prefilter_vec_[i])->unique_id();
276 DCHECK_LE(0, id);
277 Entry* entry = &entries_[id];
278 entry->regexps.push_back(i);
279 }
280 }
281
282 // Functions for triggering during search.
RegexpsGivenStrings(const vector<int> & matched_atoms,vector<int> * regexps) const283 void PrefilterTree::RegexpsGivenStrings(
284 const vector<int>& matched_atoms,
285 vector<int>* regexps) const {
286 regexps->clear();
287 if (!compiled_) {
288 LOG(WARNING) << "Compile() not called";
289 for (int i = 0; i < prefilter_vec_.size(); ++i)
290 regexps->push_back(i);
291 } else {
292 if (!prefilter_vec_.empty()) {
293 IntMap regexps_map(prefilter_vec_.size());
294 vector<int> matched_atom_ids;
295 for (int j = 0; j < matched_atoms.size(); j++) {
296 matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
297 VLOG(10) << "Atom id:" << atom_index_to_id_[matched_atoms[j]];
298 }
299 PropagateMatch(matched_atom_ids, ®exps_map);
300 for (IntMap::iterator it = regexps_map.begin();
301 it != regexps_map.end();
302 ++it)
303 regexps->push_back(it->index());
304
305 regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
306 }
307 }
308 sort(regexps->begin(), regexps->end());
309 }
310
PropagateMatch(const vector<int> & atom_ids,IntMap * regexps) const311 void PrefilterTree::PropagateMatch(const vector<int>& atom_ids,
312 IntMap* regexps) const {
313 IntMap count(entries_.size());
314 IntMap work(entries_.size());
315 for (int i = 0; i < atom_ids.size(); i++)
316 work.set(atom_ids[i], 1);
317 for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
318 const Entry& entry = entries_[it->index()];
319 VLOG(10) << "Processing: " << it->index();
320 // Record regexps triggered.
321 for (int i = 0; i < entry.regexps.size(); i++) {
322 VLOG(10) << "Regexp triggered: " << entry.regexps[i];
323 regexps->set(entry.regexps[i], 1);
324 }
325 int c;
326 // Pass trigger up to parents.
327 for (IntMap::iterator it = entry.parents->begin();
328 it != entry.parents->end();
329 ++it) {
330 int j = it->index();
331 const Entry& parent = entries_[j];
332 VLOG(10) << " parent= " << j << " trig= " << parent.propagate_up_at_count;
333 // Delay until all the children have succeeded.
334 if (parent.propagate_up_at_count > 1) {
335 if (count.has_index(j)) {
336 c = count.get_existing(j) + 1;
337 count.set_existing(j, c);
338 } else {
339 c = 1;
340 count.set_new(j, c);
341 }
342 if (c < parent.propagate_up_at_count)
343 continue;
344 }
345 VLOG(10) << "Triggering: " << j;
346 // Trigger the parent.
347 work.set(j, 1);
348 }
349 }
350 }
351
352 // Debugging help.
PrintPrefilter(int regexpid)353 void PrefilterTree::PrintPrefilter(int regexpid) {
354 LOG(INFO) << DebugNodeString(prefilter_vec_[regexpid]);
355 }
356
PrintDebugInfo()357 void PrefilterTree::PrintDebugInfo() {
358 VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size();
359 VLOG(10) << "#Unique Nodes: " << entries_.size();
360
361 for (int i = 0; i < entries_.size(); ++i) {
362 IntMap* parents = entries_[i].parents;
363 const vector<int>& regexps = entries_[i].regexps;
364 VLOG(10) << "EntryId: " << i
365 << " N: " << parents->size() << " R: " << regexps.size();
366 for (IntMap::iterator it = parents->begin(); it != parents->end(); ++it)
367 VLOG(10) << it->index();
368 }
369 VLOG(10) << "Map:";
370 for (map<string, Prefilter*>::const_iterator iter = node_map_.begin();
371 iter != node_map_.end(); ++iter)
372 VLOG(10) << "NodeId: " << (*iter).second->unique_id()
373 << " Str: " << (*iter).first;
374 }
375
DebugNodeString(Prefilter * node) const376 string PrefilterTree::DebugNodeString(Prefilter* node) const {
377 string node_string = "";
378
379 if (node->op() == Prefilter::ATOM) {
380 DCHECK(!node->atom().empty());
381 node_string += node->atom();
382 } else {
383 // Adding the operation disambiguates AND and OR nodes.
384 node_string += node->op() == Prefilter::AND ? "AND" : "OR";
385 node_string += "(";
386 for (int i = 0; i < node->subs()->size() ; i++) {
387 if (i > 0)
388 node_string += ',';
389 node_string += Itoa((*node->subs())[i]->unique_id());
390 node_string += ":";
391 node_string += DebugNodeString((*node->subs())[i]);
392 }
393 node_string += ")";
394 }
395 return node_string;
396 }
397
398 } // namespace re2
399