1 // Copyright 2009 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "re2/prefilter.h"
6
7 #include <stddef.h>
8 #include <stdint.h>
9 #include <string>
10 #include <vector>
11
12 #include "util/util.h"
13 #include "util/logging.h"
14 #include "util/strutil.h"
15 #include "util/utf.h"
16 #include "re2/re2.h"
17 #include "re2/unicode_casefold.h"
18 #include "re2/walker-inl.h"
19
20 namespace re2 {
21
22 static const bool ExtraDebug = false;
23
24 typedef std::set<string>::iterator SSIter;
25 typedef std::set<string>::const_iterator ConstSSIter;
26
27 // Initializes a Prefilter, allocating subs_ as necessary.
Prefilter(Op op)28 Prefilter::Prefilter(Op op) {
29 op_ = op;
30 subs_ = NULL;
31 if (op_ == AND || op_ == OR)
32 subs_ = new std::vector<Prefilter*>;
33 }
34
35 // Destroys a Prefilter.
~Prefilter()36 Prefilter::~Prefilter() {
37 if (subs_) {
38 for (size_t i = 0; i < subs_->size(); i++)
39 delete (*subs_)[i];
40 delete subs_;
41 subs_ = NULL;
42 }
43 }
44
45 // Simplify if the node is an empty Or or And.
Simplify()46 Prefilter* Prefilter::Simplify() {
47 if (op_ != AND && op_ != OR) {
48 return this;
49 }
50
51 // Nothing left in the AND/OR.
52 if (subs_->empty()) {
53 if (op_ == AND)
54 op_ = ALL; // AND of nothing is true
55 else
56 op_ = NONE; // OR of nothing is false
57
58 return this;
59 }
60
61 // Just one subnode: throw away wrapper.
62 if (subs_->size() == 1) {
63 Prefilter* a = (*subs_)[0];
64 subs_->clear();
65 delete this;
66 return a->Simplify();
67 }
68
69 return this;
70 }
71
72 // Combines two Prefilters together to create an "op" (AND or OR).
73 // The passed Prefilters will be part of the returned Prefilter or deleted.
74 // Does lots of work to avoid creating unnecessarily complicated structures.
AndOr(Op op,Prefilter * a,Prefilter * b)75 Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
76 // If a, b can be rewritten as op, do so.
77 a = a->Simplify();
78 b = b->Simplify();
79
80 // Canonicalize: a->op <= b->op.
81 if (a->op() > b->op()) {
82 Prefilter* t = a;
83 a = b;
84 b = t;
85 }
86
87 // Trivial cases.
88 // ALL AND b = b
89 // NONE OR b = b
90 // ALL OR b = ALL
91 // NONE AND b = NONE
92 // Don't need to look at b, because of canonicalization above.
93 // ALL and NONE are smallest opcodes.
94 if (a->op() == ALL || a->op() == NONE) {
95 if ((a->op() == ALL && op == AND) ||
96 (a->op() == NONE && op == OR)) {
97 delete a;
98 return b;
99 } else {
100 delete b;
101 return a;
102 }
103 }
104
105 // If a and b match op, merge their contents.
106 if (a->op() == op && b->op() == op) {
107 for (size_t i = 0; i < b->subs()->size(); i++) {
108 Prefilter* bb = (*b->subs())[i];
109 a->subs()->push_back(bb);
110 }
111 b->subs()->clear();
112 delete b;
113 return a;
114 }
115
116 // If a already has the same op as the op that is under construction
117 // add in b (similarly if b already has the same op, add in a).
118 if (b->op() == op) {
119 Prefilter* t = a;
120 a = b;
121 b = t;
122 }
123 if (a->op() == op) {
124 a->subs()->push_back(b);
125 return a;
126 }
127
128 // Otherwise just return the op.
129 Prefilter* c = new Prefilter(op);
130 c->subs()->push_back(a);
131 c->subs()->push_back(b);
132 return c;
133 }
134
And(Prefilter * a,Prefilter * b)135 Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
136 return AndOr(AND, a, b);
137 }
138
Or(Prefilter * a,Prefilter * b)139 Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
140 return AndOr(OR, a, b);
141 }
142
SimplifyStringSet(std::set<string> * ss)143 static void SimplifyStringSet(std::set<string> *ss) {
144 // Now make sure that the strings aren't redundant. For example, if
145 // we know "ab" is a required string, then it doesn't help at all to
146 // know that "abc" is also a required string, so delete "abc". This
147 // is because, when we are performing a string search to filter
148 // regexps, matching ab will already allow this regexp to be a
149 // candidate for match, so further matching abc is redundant.
150
151 for (SSIter i = ss->begin(); i != ss->end(); ++i) {
152 SSIter j = i;
153 ++j;
154 while (j != ss->end()) {
155 // Increment j early so that we can erase the element it points to.
156 SSIter old_j = j;
157 ++j;
158 if (old_j->find(*i) != string::npos)
159 ss->erase(old_j);
160 }
161 }
162 }
163
OrStrings(std::set<string> * ss)164 Prefilter* Prefilter::OrStrings(std::set<string>* ss) {
165 SimplifyStringSet(ss);
166 Prefilter* or_prefilter = NULL;
167 if (!ss->empty()) {
168 or_prefilter = new Prefilter(NONE);
169 for (SSIter i = ss->begin(); i != ss->end(); ++i)
170 or_prefilter = Or(or_prefilter, FromString(*i));
171 }
172 return or_prefilter;
173 }
174
ToLowerRune(Rune r)175 static Rune ToLowerRune(Rune r) {
176 if (r < Runeself) {
177 if ('A' <= r && r <= 'Z')
178 r += 'a' - 'A';
179 return r;
180 }
181
182 const CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
183 if (f == NULL || r < f->lo)
184 return r;
185 return ApplyFold(f, r);
186 }
187
ToLowerRuneLatin1(Rune r)188 static Rune ToLowerRuneLatin1(Rune r) {
189 if ('A' <= r && r <= 'Z')
190 r += 'a' - 'A';
191 return r;
192 }
193
FromString(const string & str)194 Prefilter* Prefilter::FromString(const string& str) {
195 Prefilter* m = new Prefilter(Prefilter::ATOM);
196 m->atom_ = str;
197 return m;
198 }
199
200 // Information about a regexp used during computation of Prefilter.
201 // Can be thought of as information about the set of strings matching
202 // the given regular expression.
203 class Prefilter::Info {
204 public:
205 Info();
206 ~Info();
207
208 // More constructors. They delete their Info* arguments.
209 static Info* Alt(Info* a, Info* b);
210 static Info* Concat(Info* a, Info* b);
211 static Info* And(Info* a, Info* b);
212 static Info* Star(Info* a);
213 static Info* Plus(Info* a);
214 static Info* Quest(Info* a);
215 static Info* EmptyString();
216 static Info* NoMatch();
217 static Info* AnyCharOrAnyByte();
218 static Info* CClass(CharClass* cc, bool latin1);
219 static Info* Literal(Rune r);
220 static Info* LiteralLatin1(Rune r);
221 static Info* AnyMatch();
222
223 // Format Info as a string.
224 string ToString();
225
226 // Caller takes ownership of the Prefilter.
227 Prefilter* TakeMatch();
228
exact()229 std::set<string>& exact() { return exact_; }
230
is_exact() const231 bool is_exact() const { return is_exact_; }
232
233 class Walker;
234
235 private:
236 std::set<string> exact_;
237
238 // When is_exact_ is true, the strings that match
239 // are placed in exact_. When it is no longer an exact
240 // set of strings that match this RE, then is_exact_
241 // is false and the match_ contains the required match
242 // criteria.
243 bool is_exact_;
244
245 // Accumulated Prefilter query that any
246 // match for this regexp is guaranteed to match.
247 Prefilter* match_;
248 };
249
250
Info()251 Prefilter::Info::Info()
252 : is_exact_(false),
253 match_(NULL) {
254 }
255
~Info()256 Prefilter::Info::~Info() {
257 delete match_;
258 }
259
TakeMatch()260 Prefilter* Prefilter::Info::TakeMatch() {
261 if (is_exact_) {
262 match_ = Prefilter::OrStrings(&exact_);
263 is_exact_ = false;
264 }
265 Prefilter* m = match_;
266 match_ = NULL;
267 return m;
268 }
269
270 // Format a Info in string form.
ToString()271 string Prefilter::Info::ToString() {
272 if (is_exact_) {
273 int n = 0;
274 string s;
275 for (std::set<string>::iterator i = exact_.begin();
276 i != exact_.end();
277 ++i) {
278 if (n++ > 0)
279 s += ",";
280 s += *i;
281 }
282 return s;
283 }
284
285 if (match_)
286 return match_->DebugString();
287
288 return "";
289 }
290
291 // Add the strings from src to dst.
CopyIn(const std::set<string> & src,std::set<string> * dst)292 static void CopyIn(const std::set<string>& src,
293 std::set<string>* dst) {
294 for (ConstSSIter i = src.begin(); i != src.end(); ++i)
295 dst->insert(*i);
296 }
297
298 // Add the cross-product of a and b to dst.
299 // (For each string i in a and j in b, add i+j.)
CrossProduct(const std::set<string> & a,const std::set<string> & b,std::set<string> * dst)300 static void CrossProduct(const std::set<string>& a,
301 const std::set<string>& b,
302 std::set<string>* dst) {
303 for (ConstSSIter i = a.begin(); i != a.end(); ++i)
304 for (ConstSSIter j = b.begin(); j != b.end(); ++j)
305 dst->insert(*i + *j);
306 }
307
308 // Concats a and b. Requires that both are exact sets.
309 // Forms an exact set that is a crossproduct of a and b.
Concat(Info * a,Info * b)310 Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
311 if (a == NULL)
312 return b;
313 DCHECK(a->is_exact_);
314 DCHECK(b && b->is_exact_);
315 Info *ab = new Info();
316
317 CrossProduct(a->exact_, b->exact_, &ab->exact_);
318 ab->is_exact_ = true;
319
320 delete a;
321 delete b;
322 return ab;
323 }
324
325 // Constructs an inexact Info for ab given a and b.
326 // Used only when a or b is not exact or when the
327 // exact cross product is likely to be too big.
And(Info * a,Info * b)328 Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
329 if (a == NULL)
330 return b;
331 if (b == NULL)
332 return a;
333
334 Info *ab = new Info();
335
336 ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
337 ab->is_exact_ = false;
338 delete a;
339 delete b;
340 return ab;
341 }
342
343 // Constructs Info for a|b given a and b.
Alt(Info * a,Info * b)344 Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
345 Info *ab = new Info();
346
347 if (a->is_exact_ && b->is_exact_) {
348 CopyIn(a->exact_, &ab->exact_);
349 CopyIn(b->exact_, &ab->exact_);
350 ab->is_exact_ = true;
351 } else {
352 // Either a or b has is_exact_ = false. If the other
353 // one has is_exact_ = true, we move it to match_ and
354 // then create a OR of a,b. The resulting Info has
355 // is_exact_ = false.
356 ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
357 ab->is_exact_ = false;
358 }
359
360 delete a;
361 delete b;
362 return ab;
363 }
364
365 // Constructs Info for a? given a.
Quest(Info * a)366 Prefilter::Info* Prefilter::Info::Quest(Info *a) {
367 Info *ab = new Info();
368
369 ab->is_exact_ = false;
370 ab->match_ = new Prefilter(ALL);
371 delete a;
372 return ab;
373 }
374
375 // Constructs Info for a* given a.
376 // Same as a? -- not much to do.
Star(Info * a)377 Prefilter::Info* Prefilter::Info::Star(Info *a) {
378 return Quest(a);
379 }
380
381 // Constructs Info for a+ given a. If a was exact set, it isn't
382 // anymore.
Plus(Info * a)383 Prefilter::Info* Prefilter::Info::Plus(Info *a) {
384 Info *ab = new Info();
385
386 ab->match_ = a->TakeMatch();
387 ab->is_exact_ = false;
388
389 delete a;
390 return ab;
391 }
392
RuneToString(Rune r)393 static string RuneToString(Rune r) {
394 char buf[UTFmax];
395 int n = runetochar(buf, &r);
396 return string(buf, n);
397 }
398
RuneToStringLatin1(Rune r)399 static string RuneToStringLatin1(Rune r) {
400 char c = r & 0xff;
401 return string(&c, 1);
402 }
403
404 // Constructs Info for literal rune.
Literal(Rune r)405 Prefilter::Info* Prefilter::Info::Literal(Rune r) {
406 Info* info = new Info();
407 info->exact_.insert(RuneToString(ToLowerRune(r)));
408 info->is_exact_ = true;
409 return info;
410 }
411
412 // Constructs Info for literal rune for Latin1 encoded string.
LiteralLatin1(Rune r)413 Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
414 Info* info = new Info();
415 info->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
416 info->is_exact_ = true;
417 return info;
418 }
419
420 // Constructs Info for dot (any character) or \C (any byte).
AnyCharOrAnyByte()421 Prefilter::Info* Prefilter::Info::AnyCharOrAnyByte() {
422 Prefilter::Info* info = new Prefilter::Info();
423 info->match_ = new Prefilter(ALL);
424 return info;
425 }
426
427 // Constructs Prefilter::Info for no possible match.
NoMatch()428 Prefilter::Info* Prefilter::Info::NoMatch() {
429 Prefilter::Info* info = new Prefilter::Info();
430 info->match_ = new Prefilter(NONE);
431 return info;
432 }
433
434 // Constructs Prefilter::Info for any possible match.
435 // This Prefilter::Info is valid for any regular expression,
436 // since it makes no assertions whatsoever about the
437 // strings being matched.
AnyMatch()438 Prefilter::Info* Prefilter::Info::AnyMatch() {
439 Prefilter::Info *info = new Prefilter::Info();
440 info->match_ = new Prefilter(ALL);
441 return info;
442 }
443
444 // Constructs Prefilter::Info for just the empty string.
EmptyString()445 Prefilter::Info* Prefilter::Info::EmptyString() {
446 Prefilter::Info* info = new Prefilter::Info();
447 info->is_exact_ = true;
448 info->exact_.insert("");
449 return info;
450 }
451
452 // Constructs Prefilter::Info for a character class.
453 typedef CharClass::iterator CCIter;
CClass(CharClass * cc,bool latin1)454 Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
455 bool latin1) {
456 if (ExtraDebug) {
457 LOG(ERROR) << "CharClassInfo:";
458 for (CCIter i = cc->begin(); i != cc->end(); ++i)
459 LOG(ERROR) << " " << i->lo << "-" << i->hi;
460 }
461
462 // If the class is too large, it's okay to overestimate.
463 if (cc->size() > 10)
464 return AnyCharOrAnyByte();
465
466 Prefilter::Info *a = new Prefilter::Info();
467 for (CCIter i = cc->begin(); i != cc->end(); ++i)
468 for (Rune r = i->lo; r <= i->hi; r++) {
469 if (latin1) {
470 a->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
471 } else {
472 a->exact_.insert(RuneToString(ToLowerRune(r)));
473 }
474 }
475
476
477 a->is_exact_ = true;
478
479 if (ExtraDebug)
480 LOG(ERROR) << " = " << a->ToString();
481
482 return a;
483 }
484
485 class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
486 public:
Walker(bool latin1)487 Walker(bool latin1) : latin1_(latin1) {}
488
489 virtual Info* PostVisit(
490 Regexp* re, Info* parent_arg,
491 Info* pre_arg,
492 Info** child_args, int nchild_args);
493
494 virtual Info* ShortVisit(
495 Regexp* re,
496 Info* parent_arg);
497
latin1()498 bool latin1() { return latin1_; }
499 private:
500 bool latin1_;
501
502 Walker(const Walker&) = delete;
503 Walker& operator=(const Walker&) = delete;
504 };
505
BuildInfo(Regexp * re)506 Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
507 if (ExtraDebug)
508 LOG(ERROR) << "BuildPrefilter::Info: " << re->ToString();
509
510 bool latin1 = (re->parse_flags() & Regexp::Latin1) != 0;
511 Prefilter::Info::Walker w(latin1);
512 Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
513
514 if (w.stopped_early()) {
515 delete info;
516 return NULL;
517 }
518
519 return info;
520 }
521
ShortVisit(Regexp * re,Prefilter::Info * parent_arg)522 Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
523 Regexp* re, Prefilter::Info* parent_arg) {
524 return AnyMatch();
525 }
526
527 // Constructs the Prefilter::Info for the given regular expression.
528 // Assumes re is simplified.
PostVisit(Regexp * re,Prefilter::Info * parent_arg,Prefilter::Info * pre_arg,Prefilter::Info ** child_args,int nchild_args)529 Prefilter::Info* Prefilter::Info::Walker::PostVisit(
530 Regexp* re, Prefilter::Info* parent_arg,
531 Prefilter::Info* pre_arg, Prefilter::Info** child_args,
532 int nchild_args) {
533 Prefilter::Info *info;
534 switch (re->op()) {
535 default:
536 case kRegexpRepeat:
537 LOG(DFATAL) << "Bad regexp op " << re->op();
538 info = EmptyString();
539 break;
540
541 case kRegexpNoMatch:
542 info = NoMatch();
543 break;
544
545 // These ops match the empty string:
546 case kRegexpEmptyMatch: // anywhere
547 case kRegexpBeginLine: // at beginning of line
548 case kRegexpEndLine: // at end of line
549 case kRegexpBeginText: // at beginning of text
550 case kRegexpEndText: // at end of text
551 case kRegexpWordBoundary: // at word boundary
552 case kRegexpNoWordBoundary: // not at word boundary
553 info = EmptyString();
554 break;
555
556 case kRegexpLiteral:
557 if (latin1()) {
558 info = LiteralLatin1(re->rune());
559 }
560 else {
561 info = Literal(re->rune());
562 }
563 break;
564
565 case kRegexpLiteralString:
566 if (re->nrunes() == 0) {
567 info = NoMatch();
568 break;
569 }
570 if (latin1()) {
571 info = LiteralLatin1(re->runes()[0]);
572 for (int i = 1; i < re->nrunes(); i++) {
573 info = Concat(info, LiteralLatin1(re->runes()[i]));
574 }
575 } else {
576 info = Literal(re->runes()[0]);
577 for (int i = 1; i < re->nrunes(); i++) {
578 info = Concat(info, Literal(re->runes()[i]));
579 }
580 }
581 break;
582
583 case kRegexpConcat: {
584 // Accumulate in info.
585 // Exact is concat of recent contiguous exact nodes.
586 info = NULL;
587 Info* exact = NULL;
588 for (int i = 0; i < nchild_args; i++) {
589 Info* ci = child_args[i]; // child info
590 if (!ci->is_exact() ||
591 (exact && ci->exact().size() * exact->exact().size() > 16)) {
592 // Exact run is over.
593 info = And(info, exact);
594 exact = NULL;
595 // Add this child's info.
596 info = And(info, ci);
597 } else {
598 // Append to exact run.
599 exact = Concat(exact, ci);
600 }
601 }
602 info = And(info, exact);
603 }
604 break;
605
606 case kRegexpAlternate:
607 info = child_args[0];
608 for (int i = 1; i < nchild_args; i++)
609 info = Alt(info, child_args[i]);
610 break;
611
612 case kRegexpStar:
613 info = Star(child_args[0]);
614 break;
615
616 case kRegexpQuest:
617 info = Quest(child_args[0]);
618 break;
619
620 case kRegexpPlus:
621 info = Plus(child_args[0]);
622 break;
623
624 case kRegexpAnyChar:
625 case kRegexpAnyByte:
626 // Claim nothing, except that it's not empty.
627 info = AnyCharOrAnyByte();
628 break;
629
630 case kRegexpCharClass:
631 info = CClass(re->cc(), latin1());
632 break;
633
634 case kRegexpCapture:
635 // These don't affect the set of matching strings.
636 info = child_args[0];
637 break;
638 }
639
640 if (ExtraDebug)
641 LOG(ERROR) << "BuildInfo " << re->ToString()
642 << ": " << (info ? info->ToString() : "");
643
644 return info;
645 }
646
647
FromRegexp(Regexp * re)648 Prefilter* Prefilter::FromRegexp(Regexp* re) {
649 if (re == NULL)
650 return NULL;
651
652 Regexp* simple = re->Simplify();
653 Prefilter::Info *info = BuildInfo(simple);
654
655 simple->Decref();
656 if (info == NULL)
657 return NULL;
658
659 Prefilter* m = info->TakeMatch();
660
661 delete info;
662 return m;
663 }
664
DebugString() const665 string Prefilter::DebugString() const {
666 switch (op_) {
667 default:
668 LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
669 return StringPrintf("op%d", op_);
670 case NONE:
671 return "*no-matches*";
672 case ATOM:
673 return atom_;
674 case ALL:
675 return "";
676 case AND: {
677 string s = "";
678 for (size_t i = 0; i < subs_->size(); i++) {
679 if (i > 0)
680 s += " ";
681 Prefilter* sub = (*subs_)[i];
682 s += sub ? sub->DebugString() : "<nil>";
683 }
684 return s;
685 }
686 case OR: {
687 string s = "(";
688 for (size_t i = 0; i < subs_->size(); i++) {
689 if (i > 0)
690 s += "|";
691 Prefilter* sub = (*subs_)[i];
692 s += sub ? sub->DebugString() : "<nil>";
693 }
694 s += ")";
695 return s;
696 }
697 }
698 }
699
FromRE2(const RE2 * re2)700 Prefilter* Prefilter::FromRE2(const RE2* re2) {
701 if (re2 == NULL)
702 return NULL;
703
704 Regexp* regexp = re2->Regexp();
705 if (regexp == NULL)
706 return NULL;
707
708 return FromRegexp(regexp);
709 }
710
711
712 } // namespace re2
713