1 // Copyright 2009 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "util/util.h"
6 #include "re2/prefilter.h"
7 #include "re2/re2.h"
8 #include "re2/unicode_casefold.h"
9 #include "re2/walker-inl.h"
10
11 namespace re2 {
12
13 static const int Trace = false;
14
15 typedef set<string>::iterator SSIter;
16 typedef set<string>::const_iterator ConstSSIter;
17
18 static int alloc_id = 100000; // Used for debugging.
19 // Initializes a Prefilter, allocating subs_ as necessary.
Prefilter(Op op)20 Prefilter::Prefilter(Op op) {
21 op_ = op;
22 subs_ = NULL;
23 if (op_ == AND || op_ == OR)
24 subs_ = new vector<Prefilter*>;
25
26 alloc_id_ = alloc_id++;
27 VLOG(10) << "alloc_id: " << alloc_id_;
28 }
29
30 // Destroys a Prefilter.
~Prefilter()31 Prefilter::~Prefilter() {
32 VLOG(10) << "Deleted: " << alloc_id_;
33 if (subs_) {
34 for (int i = 0; i < subs_->size(); i++)
35 delete (*subs_)[i];
36 delete subs_;
37 subs_ = NULL;
38 }
39 }
40
41 // Simplify if the node is an empty Or or And.
Simplify()42 Prefilter* Prefilter::Simplify() {
43 if (op_ != AND && op_ != OR) {
44 return this;
45 }
46
47 // Nothing left in the AND/OR.
48 if (subs_->size() == 0) {
49 if (op_ == AND)
50 op_ = ALL; // AND of nothing is true
51 else
52 op_ = NONE; // OR of nothing is false
53
54 return this;
55 }
56
57 // Just one subnode: throw away wrapper.
58 if (subs_->size() == 1) {
59 Prefilter* a = (*subs_)[0];
60 subs_->clear();
61 delete this;
62 return a->Simplify();
63 }
64
65 return this;
66 }
67
68 // Combines two Prefilters together to create an "op" (AND or OR).
69 // The passed Prefilters will be part of the returned Prefilter or deleted.
70 // Does lots of work to avoid creating unnecessarily complicated structures.
AndOr(Op op,Prefilter * a,Prefilter * b)71 Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
72 // If a, b can be rewritten as op, do so.
73 a = a->Simplify();
74 b = b->Simplify();
75
76 // Canonicalize: a->op <= b->op.
77 if (a->op() > b->op()) {
78 Prefilter* t = a;
79 a = b;
80 b = t;
81 }
82
83 // Trivial cases.
84 // ALL AND b = b
85 // NONE OR b = b
86 // ALL OR b = ALL
87 // NONE AND b = NONE
88 // Don't need to look at b, because of canonicalization above.
89 // ALL and NONE are smallest opcodes.
90 if (a->op() == ALL || a->op() == NONE) {
91 if ((a->op() == ALL && op == AND) ||
92 (a->op() == NONE && op == OR)) {
93 delete a;
94 return b;
95 } else {
96 delete b;
97 return a;
98 }
99 }
100
101 // If a and b match op, merge their contents.
102 if (a->op() == op && b->op() == op) {
103 for (int i = 0; i < b->subs()->size(); i++) {
104 Prefilter* bb = (*b->subs())[i];
105 a->subs()->push_back(bb);
106 }
107 b->subs()->clear();
108 delete b;
109 return a;
110 }
111
112 // If a already has the same op as the op that is under construction
113 // add in b (similarly if b already has the same op, add in a).
114 if (b->op() == op) {
115 Prefilter* t = a;
116 a = b;
117 b = t;
118 }
119 if (a->op() == op) {
120 a->subs()->push_back(b);
121 return a;
122 }
123
124 // Otherwise just return the op.
125 Prefilter* c = new Prefilter(op);
126 c->subs()->push_back(a);
127 c->subs()->push_back(b);
128 return c;
129 }
130
And(Prefilter * a,Prefilter * b)131 Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
132 return AndOr(AND, a, b);
133 }
134
Or(Prefilter * a,Prefilter * b)135 Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
136 return AndOr(OR, a, b);
137 }
138
SimplifyStringSet(set<string> * ss)139 static void SimplifyStringSet(set<string> *ss) {
140 // Now make sure that the strings aren't redundant. For example, if
141 // we know "ab" is a required string, then it doesn't help at all to
142 // know that "abc" is also a required string, so delete "abc". This
143 // is because, when we are performing a string search to filter
144 // regexps, matching ab will already allow this regexp to be a
145 // candidate for match, so further matching abc is redundant.
146
147 for (SSIter i = ss->begin(); i != ss->end(); ++i) {
148 SSIter j = i;
149 ++j;
150 while (j != ss->end()) {
151 // Increment j early so that we can erase the element it points to.
152 SSIter old_j = j;
153 ++j;
154 if (old_j->find(*i) != string::npos)
155 ss->erase(old_j);
156 }
157 }
158 }
159
OrStrings(set<string> * ss)160 Prefilter* Prefilter::OrStrings(set<string>* ss) {
161 SimplifyStringSet(ss);
162 Prefilter* or_prefilter = NULL;
163 if (!ss->empty()) {
164 or_prefilter = new Prefilter(NONE);
165 for (SSIter i = ss->begin(); i != ss->end(); ++i)
166 or_prefilter = Or(or_prefilter, FromString(*i));
167 }
168 return or_prefilter;
169 }
170
ToLowerRune(Rune r)171 static Rune ToLowerRune(Rune r) {
172 if (r < Runeself) {
173 if ('A' <= r && r <= 'Z')
174 r += 'a' - 'A';
175 return r;
176 }
177
178 CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
179 if (f == NULL || r < f->lo)
180 return r;
181 return ApplyFold(f, r);
182 }
183
FromString(const string & str)184 Prefilter* Prefilter::FromString(const string& str) {
185 Prefilter* m = new Prefilter(Prefilter::ATOM);
186 m->atom_ = str;
187 return m;
188 }
189
190 // Information about a regexp used during computation of Prefilter.
191 // Can be thought of as information about the set of strings matching
192 // the given regular expression.
193 class Prefilter::Info {
194 public:
195 Info();
196 ~Info();
197
198 // More constructors. They delete their Info* arguments.
199 static Info* Alt(Info* a, Info* b);
200 static Info* Concat(Info* a, Info* b);
201 static Info* And(Info* a, Info* b);
202 static Info* Star(Info* a);
203 static Info* Plus(Info* a);
204 static Info* Quest(Info* a);
205 static Info* EmptyString();
206 static Info* NoMatch();
207 static Info* AnyChar();
208 static Info* CClass(CharClass* cc);
209 static Info* Literal(Rune r);
210 static Info* AnyMatch();
211
212 // Format Info as a string.
213 string ToString();
214
215 // Caller takes ownership of the Prefilter.
216 Prefilter* TakeMatch();
217
exact()218 set<string>& exact() { return exact_; }
219
is_exact() const220 bool is_exact() const { return is_exact_; }
221
222 class Walker;
223
224 private:
225 set<string> exact_;
226
227 // When is_exact_ is true, the strings that match
228 // are placed in exact_. When it is no longer an exact
229 // set of strings that match this RE, then is_exact_
230 // is false and the match_ contains the required match
231 // criteria.
232 bool is_exact_;
233
234 // Accumulated Prefilter query that any
235 // match for this regexp is guaranteed to match.
236 Prefilter* match_;
237 };
238
239
Info()240 Prefilter::Info::Info()
241 : is_exact_(false),
242 match_(NULL) {
243 }
244
~Info()245 Prefilter::Info::~Info() {
246 delete match_;
247 }
248
TakeMatch()249 Prefilter* Prefilter::Info::TakeMatch() {
250 if (is_exact_) {
251 match_ = Prefilter::OrStrings(&exact_);
252 is_exact_ = false;
253 }
254 Prefilter* m = match_;
255 match_ = NULL;
256 return m;
257 }
258
259 // Format a Info in string form.
ToString()260 string Prefilter::Info::ToString() {
261 if (this == NULL) {
262 // Sometimes when iterating on children of a node,
263 // some children might have NULL Info. Adding
264 // the check here for NULL to take care of cases where
265 // the caller is not checking.
266 return "";
267 }
268
269 if (is_exact_) {
270 int n = 0;
271 string s;
272 for (set<string>::iterator i = exact_.begin(); i != exact_.end(); ++i) {
273 if (n++ > 0)
274 s += ",";
275 s += *i;
276 }
277 return s;
278 }
279
280 if (match_)
281 return match_->DebugString();
282
283 return "";
284 }
285
286 // Add the strings from src to dst.
CopyIn(const set<string> & src,set<string> * dst)287 static void CopyIn(const set<string>& src, set<string>* dst) {
288 for (ConstSSIter i = src.begin(); i != src.end(); ++i)
289 dst->insert(*i);
290 }
291
292 // Add the cross-product of a and b to dst.
293 // (For each string i in a and j in b, add i+j.)
CrossProduct(const set<string> & a,const set<string> & b,set<string> * dst)294 static void CrossProduct(const set<string>& a,
295 const set<string>& b,
296 set<string>* dst) {
297 for (ConstSSIter i = a.begin(); i != a.end(); ++i)
298 for (ConstSSIter j = b.begin(); j != b.end(); ++j)
299 dst->insert(*i + *j);
300 }
301
302 // Concats a and b. Requires that both are exact sets.
303 // Forms an exact set that is a crossproduct of a and b.
Concat(Info * a,Info * b)304 Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
305 if (a == NULL)
306 return b;
307 DCHECK(a->is_exact_);
308 DCHECK(b && b->is_exact_);
309 Info *ab = new Info();
310
311 CrossProduct(a->exact_, b->exact_, &ab->exact_);
312 ab->is_exact_ = true;
313
314 delete a;
315 delete b;
316 return ab;
317 }
318
319 // Constructs an inexact Info for ab given a and b.
320 // Used only when a or b is not exact or when the
321 // exact cross product is likely to be too big.
And(Info * a,Info * b)322 Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
323 if (a == NULL)
324 return b;
325 if (b == NULL)
326 return a;
327
328 Info *ab = new Info();
329
330 ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
331 ab->is_exact_ = false;
332 delete a;
333 delete b;
334 return ab;
335 }
336
337 // Constructs Info for a|b given a and b.
Alt(Info * a,Info * b)338 Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
339 Info *ab = new Info();
340
341 if (a->is_exact_ && b->is_exact_) {
342 CopyIn(a->exact_, &ab->exact_);
343 CopyIn(b->exact_, &ab->exact_);
344 ab->is_exact_ = true;
345 } else {
346 // Either a or b has is_exact_ = false. If the other
347 // one has is_exact_ = true, we move it to match_ and
348 // then create a OR of a,b. The resulting Info has
349 // is_exact_ = false.
350 ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
351 ab->is_exact_ = false;
352 }
353
354 delete a;
355 delete b;
356 return ab;
357 }
358
359 // Constructs Info for a? given a.
Quest(Info * a)360 Prefilter::Info* Prefilter::Info::Quest(Info *a) {
361 Info *ab = new Info();
362
363 ab->is_exact_ = false;
364 ab->match_ = new Prefilter(ALL);
365 delete a;
366 return ab;
367 }
368
369 // Constructs Info for a* given a.
370 // Same as a? -- not much to do.
Star(Info * a)371 Prefilter::Info* Prefilter::Info::Star(Info *a) {
372 return Quest(a);
373 }
374
375 // Constructs Info for a+ given a. If a was exact set, it isn't
376 // anymore.
Plus(Info * a)377 Prefilter::Info* Prefilter::Info::Plus(Info *a) {
378 Info *ab = new Info();
379
380 ab->match_ = a->TakeMatch();
381 ab->is_exact_ = false;
382
383 delete a;
384 return ab;
385 }
386
RuneToString(Rune r)387 static string RuneToString(Rune r) {
388 char buf[UTFmax];
389 int n = runetochar(buf, &r);
390 return string(buf, n);
391 }
392
393 // Constructs Info for literal rune.
Literal(Rune r)394 Prefilter::Info* Prefilter::Info::Literal(Rune r) {
395 Info* info = new Info();
396 info->exact_.insert(RuneToString(ToLowerRune(r)));
397 info->is_exact_ = true;
398 return info;
399 }
400
401 // Constructs Info for dot (any character).
AnyChar()402 Prefilter::Info* Prefilter::Info::AnyChar() {
403 Prefilter::Info* info = new Prefilter::Info();
404 info->match_ = new Prefilter(ALL);
405 return info;
406 }
407
408 // Constructs Prefilter::Info for no possible match.
NoMatch()409 Prefilter::Info* Prefilter::Info::NoMatch() {
410 Prefilter::Info* info = new Prefilter::Info();
411 info->match_ = new Prefilter(NONE);
412 return info;
413 }
414
415 // Constructs Prefilter::Info for any possible match.
416 // This Prefilter::Info is valid for any regular expression,
417 // since it makes no assertions whatsoever about the
418 // strings being matched.
AnyMatch()419 Prefilter::Info* Prefilter::Info::AnyMatch() {
420 Prefilter::Info *info = new Prefilter::Info();
421 info->match_ = new Prefilter(ALL);
422 return info;
423 }
424
425 // Constructs Prefilter::Info for just the empty string.
EmptyString()426 Prefilter::Info* Prefilter::Info::EmptyString() {
427 Prefilter::Info* info = new Prefilter::Info();
428 info->is_exact_ = true;
429 info->exact_.insert("");
430 return info;
431 }
432
433 // Constructs Prefilter::Info for a character class.
434 typedef CharClass::iterator CCIter;
CClass(CharClass * cc)435 Prefilter::Info* Prefilter::Info::CClass(CharClass *cc) {
436 if (Trace) {
437 VLOG(0) << "CharClassInfo:";
438 for (CCIter i = cc->begin(); i != cc->end(); ++i)
439 VLOG(0) << " " << i->lo << "-" << i->hi;
440 }
441
442 // If the class is too large, it's okay to overestimate.
443 if (cc->size() > 10)
444 return AnyChar();
445
446 Prefilter::Info *a = new Prefilter::Info();
447 for (CCIter i = cc->begin(); i != cc->end(); ++i)
448 for (Rune r = i->lo; r <= i->hi; r++)
449 a->exact_.insert(RuneToString(ToLowerRune(r)));
450
451 a->is_exact_ = true;
452
453 if (Trace) {
454 VLOG(0) << " = " << a->ToString();
455 }
456
457 return a;
458 }
459
460 class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
461 public:
Walker()462 Walker() {}
463
464 virtual Info* PostVisit(
465 Regexp* re, Info* parent_arg,
466 Info* pre_arg,
467 Info** child_args, int nchild_args);
468
469 virtual Info* ShortVisit(
470 Regexp* re,
471 Info* parent_arg);
472
473 private:
474 DISALLOW_EVIL_CONSTRUCTORS(Walker);
475 };
476
BuildInfo(Regexp * re)477 Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
478 if (Trace) {
479 LOG(INFO) << "BuildPrefilter::Info: " << re->ToString();
480 }
481 Prefilter::Info::Walker w;
482 Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
483
484 if (w.stopped_early()) {
485 delete info;
486 return NULL;
487 }
488
489 return info;
490 }
491
ShortVisit(Regexp * re,Prefilter::Info * parent_arg)492 Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
493 Regexp* re, Prefilter::Info* parent_arg) {
494 return AnyMatch();
495 }
496
497 // Constructs the Prefilter::Info for the given regular expression.
498 // Assumes re is simplified.
PostVisit(Regexp * re,Prefilter::Info * parent_arg,Prefilter::Info * pre_arg,Prefilter::Info ** child_args,int nchild_args)499 Prefilter::Info* Prefilter::Info::Walker::PostVisit(
500 Regexp* re, Prefilter::Info* parent_arg,
501 Prefilter::Info* pre_arg, Prefilter::Info** child_args,
502 int nchild_args) {
503 Prefilter::Info *info;
504 switch (re->op()) {
505 default:
506 case kRegexpRepeat:
507 LOG(DFATAL) << "Bad regexp op " << re->op();
508 info = EmptyString();
509 break;
510
511 case kRegexpNoMatch:
512 info = NoMatch();
513 break;
514
515 // These ops match the empty string:
516 case kRegexpEmptyMatch: // anywhere
517 case kRegexpBeginLine: // at beginning of line
518 case kRegexpEndLine: // at end of line
519 case kRegexpBeginText: // at beginning of text
520 case kRegexpEndText: // at end of text
521 case kRegexpWordBoundary: // at word boundary
522 case kRegexpNoWordBoundary: // not at word boundary
523 info = EmptyString();
524 break;
525
526 case kRegexpLiteral:
527 info = Literal(re->rune());
528 break;
529
530 case kRegexpLiteralString:
531 if (re->nrunes() == 0) {
532 info = NoMatch();
533 break;
534 }
535 info = Literal(re->runes()[0]);
536 for (int i = 1; i < re->nrunes(); i++)
537 info = Concat(info, Literal(re->runes()[i]));
538 break;
539
540 case kRegexpConcat: {
541 // Accumulate in info.
542 // Exact is concat of recent contiguous exact nodes.
543 info = NULL;
544 Info* exact = NULL;
545 for (int i = 0; i < nchild_args; i++) {
546 Info* ci = child_args[i]; // child info
547 if (!ci->is_exact() ||
548 (exact && ci->exact().size() * exact->exact().size() > 16)) {
549 // Exact run is over.
550 info = And(info, exact);
551 exact = NULL;
552 // Add this child's info.
553 info = And(info, ci);
554 } else {
555 // Append to exact run.
556 exact = Concat(exact, ci);
557 }
558 }
559 info = And(info, exact);
560 }
561 break;
562
563 case kRegexpAlternate:
564 info = child_args[0];
565 for (int i = 1; i < nchild_args; i++)
566 info = Alt(info, child_args[i]);
567 VLOG(10) << "Alt: " << info->ToString();
568 break;
569
570 case kRegexpStar:
571 info = Star(child_args[0]);
572 break;
573
574 case kRegexpQuest:
575 info = Quest(child_args[0]);
576 break;
577
578 case kRegexpPlus:
579 info = Plus(child_args[0]);
580 break;
581
582 case kRegexpAnyChar:
583 // Claim nothing, except that it's not empty.
584 info = AnyChar();
585 break;
586
587 case kRegexpCharClass:
588 info = CClass(re->cc());
589 break;
590
591 case kRegexpCapture:
592 // These don't affect the set of matching strings.
593 info = child_args[0];
594 break;
595 }
596
597 if (Trace) {
598 VLOG(0) << "BuildInfo " << re->ToString()
599 << ": " << info->ToString();
600 }
601
602 return info;
603 }
604
605
FromRegexp(Regexp * re)606 Prefilter* Prefilter::FromRegexp(Regexp* re) {
607 if (re == NULL)
608 return NULL;
609
610 Regexp* simple = re->Simplify();
611 Prefilter::Info *info = BuildInfo(simple);
612
613 simple->Decref();
614 if (info == NULL)
615 return NULL;
616
617 Prefilter* m = info->TakeMatch();
618
619 delete info;
620 return m;
621 }
622
DebugString() const623 string Prefilter::DebugString() const {
624 if (this == NULL)
625 return "<nil>";
626
627 switch (op_) {
628 default:
629 LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
630 return StringPrintf("op%d", op_);
631 case NONE:
632 return "*no-matches*";
633 case ATOM:
634 return atom_;
635 case ALL:
636 return "";
637 case AND: {
638 string s = "";
639 for (int i = 0; i < subs_->size(); i++) {
640 if (i > 0)
641 s += " ";
642 s += (*subs_)[i]->DebugString();
643 }
644 return s;
645 }
646 case OR: {
647 string s = "(";
648 for (int i = 0; i < subs_->size(); i++) {
649 if (i > 0)
650 s += "|";
651 s += (*subs_)[i]->DebugString();
652 }
653 s += ")";
654 return s;
655 }
656 }
657 }
658
FromRE2(const RE2 * re2)659 Prefilter* Prefilter::FromRE2(const RE2* re2) {
660 if (re2 == NULL)
661 return NULL;
662
663 Regexp* regexp = re2->Regexp();
664 if (regexp == NULL)
665 return NULL;
666
667 return FromRegexp(regexp);
668 }
669
670
671 } // namespace re2
672