1 // Copyright 2009 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "re2/prefilter.h"
6
7 #include <stddef.h>
8
9 #include <string>
10 #include <utility>
11 #include <vector>
12
13 #include "absl/log/absl_check.h"
14 #include "absl/log/absl_log.h"
15 #include "absl/strings/str_format.h"
16 #include "re2/re2.h"
17 #include "re2/regexp.h"
18 #include "re2/unicode_casefold.h"
19 #include "re2/walker-inl.h"
20 #include "util/utf.h"
21
22 namespace re2 {
23
24 static const bool ExtraDebug = false;
25
26 // Initializes a Prefilter, allocating subs_ as necessary.
Prefilter(Op op)27 Prefilter::Prefilter(Op op) {
28 op_ = op;
29 subs_ = NULL;
30 if (op_ == AND || op_ == OR)
31 subs_ = new std::vector<Prefilter*>;
32 }
33
34 // Destroys a Prefilter.
~Prefilter()35 Prefilter::~Prefilter() {
36 if (subs_) {
37 for (size_t i = 0; i < subs_->size(); i++)
38 delete (*subs_)[i];
39 delete subs_;
40 subs_ = NULL;
41 }
42 }
43
44 // Simplify if the node is an empty Or or And.
Simplify()45 Prefilter* Prefilter::Simplify() {
46 if (op_ != AND && op_ != OR) {
47 return this;
48 }
49
50 // Nothing left in the AND/OR.
51 if (subs_->empty()) {
52 if (op_ == AND)
53 op_ = ALL; // AND of nothing is true
54 else
55 op_ = NONE; // OR of nothing is false
56
57 return this;
58 }
59
60 // Just one subnode: throw away wrapper.
61 if (subs_->size() == 1) {
62 Prefilter* a = (*subs_)[0];
63 subs_->clear();
64 delete this;
65 return a->Simplify();
66 }
67
68 return this;
69 }
70
71 // Combines two Prefilters together to create an "op" (AND or OR).
72 // The passed Prefilters will be part of the returned Prefilter or deleted.
73 // Does lots of work to avoid creating unnecessarily complicated structures.
AndOr(Op op,Prefilter * a,Prefilter * b)74 Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
75 // If a, b can be rewritten as op, do so.
76 a = a->Simplify();
77 b = b->Simplify();
78
79 // Canonicalize: a->op <= b->op.
80 if (a->op() > b->op()) {
81 Prefilter* t = a;
82 a = b;
83 b = t;
84 }
85
86 // Trivial cases.
87 // ALL AND b = b
88 // NONE OR b = b
89 // ALL OR b = ALL
90 // NONE AND b = NONE
91 // Don't need to look at b, because of canonicalization above.
92 // ALL and NONE are smallest opcodes.
93 if (a->op() == ALL || a->op() == NONE) {
94 if ((a->op() == ALL && op == AND) ||
95 (a->op() == NONE && op == OR)) {
96 delete a;
97 return b;
98 } else {
99 delete b;
100 return a;
101 }
102 }
103
104 // If a and b match op, merge their contents.
105 if (a->op() == op && b->op() == op) {
106 for (size_t i = 0; i < b->subs()->size(); i++) {
107 Prefilter* bb = (*b->subs())[i];
108 a->subs()->push_back(bb);
109 }
110 b->subs()->clear();
111 delete b;
112 return a;
113 }
114
115 // If a already has the same op as the op that is under construction
116 // add in b (similarly if b already has the same op, add in a).
117 if (b->op() == op) {
118 Prefilter* t = a;
119 a = b;
120 b = t;
121 }
122 if (a->op() == op) {
123 a->subs()->push_back(b);
124 return a;
125 }
126
127 // Otherwise just return the op.
128 Prefilter* c = new Prefilter(op);
129 c->subs()->push_back(a);
130 c->subs()->push_back(b);
131 return c;
132 }
133
And(Prefilter * a,Prefilter * b)134 Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
135 return AndOr(AND, a, b);
136 }
137
Or(Prefilter * a,Prefilter * b)138 Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
139 return AndOr(OR, a, b);
140 }
141
SimplifyStringSet(SSet * ss)142 void Prefilter::SimplifyStringSet(SSet* ss) {
143 // Now make sure that the strings aren't redundant. For example, if
144 // we know "ab" is a required string, then it doesn't help at all to
145 // know that "abc" is also a required string, so delete "abc". This
146 // is because, when we are performing a string search to filter
147 // regexps, matching "ab" will already allow this regexp to be a
148 // candidate for match, so further matching "abc" is redundant.
149 // Note that we must ignore "" because find() would find it at the
150 // start of everything and thus we would end up erasing everything.
151 //
152 // The SSet sorts strings by length, then lexicographically. Note that
153 // smaller strings appear first and all strings must be unique. These
154 // observations let us skip string comparisons when possible.
155 SSIter i = ss->begin();
156 if (i != ss->end() && i->empty()) {
157 ++i;
158 }
159 for (; i != ss->end(); ++i) {
160 SSIter j = i;
161 ++j;
162 while (j != ss->end()) {
163 if (j->size() > i->size() && j->find(*i) != std::string::npos) {
164 j = ss->erase(j);
165 continue;
166 }
167 ++j;
168 }
169 }
170 }
171
OrStrings(SSet * ss)172 Prefilter* Prefilter::OrStrings(SSet* ss) {
173 Prefilter* or_prefilter = new Prefilter(NONE);
174 SimplifyStringSet(ss);
175 for (SSIter i = ss->begin(); i != ss->end(); ++i)
176 or_prefilter = Or(or_prefilter, FromString(*i));
177 return or_prefilter;
178 }
179
ToLowerRune(Rune r)180 static Rune ToLowerRune(Rune r) {
181 if (r < Runeself) {
182 if ('A' <= r && r <= 'Z')
183 r += 'a' - 'A';
184 return r;
185 }
186
187 const CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
188 if (f == NULL || r < f->lo)
189 return r;
190 return ApplyFold(f, r);
191 }
192
ToLowerRuneLatin1(Rune r)193 static Rune ToLowerRuneLatin1(Rune r) {
194 if ('A' <= r && r <= 'Z')
195 r += 'a' - 'A';
196 return r;
197 }
198
FromString(const std::string & str)199 Prefilter* Prefilter::FromString(const std::string& str) {
200 Prefilter* m = new Prefilter(Prefilter::ATOM);
201 m->atom_ = str;
202 return m;
203 }
204
205 // Information about a regexp used during computation of Prefilter.
206 // Can be thought of as information about the set of strings matching
207 // the given regular expression.
208 class Prefilter::Info {
209 public:
210 Info();
211 ~Info();
212
213 // More constructors. They delete their Info* arguments.
214 static Info* Alt(Info* a, Info* b);
215 static Info* Concat(Info* a, Info* b);
216 static Info* And(Info* a, Info* b);
217 static Info* Star(Info* a);
218 static Info* Plus(Info* a);
219 static Info* Quest(Info* a);
220 static Info* EmptyString();
221 static Info* NoMatch();
222 static Info* AnyCharOrAnyByte();
223 static Info* CClass(CharClass* cc, bool latin1);
224 static Info* Literal(Rune r);
225 static Info* LiteralLatin1(Rune r);
226 static Info* AnyMatch();
227
228 // Format Info as a string.
229 std::string ToString();
230
231 // Caller takes ownership of the Prefilter.
232 Prefilter* TakeMatch();
233
exact()234 SSet& exact() { return exact_; }
235
is_exact() const236 bool is_exact() const { return is_exact_; }
237
238 class Walker;
239
240 private:
241 SSet exact_;
242
243 // When is_exact_ is true, the strings that match
244 // are placed in exact_. When it is no longer an exact
245 // set of strings that match this RE, then is_exact_
246 // is false and the match_ contains the required match
247 // criteria.
248 bool is_exact_;
249
250 // Accumulated Prefilter query that any
251 // match for this regexp is guaranteed to match.
252 Prefilter* match_;
253 };
254
255
Info()256 Prefilter::Info::Info()
257 : is_exact_(false),
258 match_(NULL) {
259 }
260
~Info()261 Prefilter::Info::~Info() {
262 delete match_;
263 }
264
TakeMatch()265 Prefilter* Prefilter::Info::TakeMatch() {
266 if (is_exact_) {
267 match_ = Prefilter::OrStrings(&exact_);
268 is_exact_ = false;
269 }
270 Prefilter* m = match_;
271 match_ = NULL;
272 return m;
273 }
274
275 // Format a Info in string form.
ToString()276 std::string Prefilter::Info::ToString() {
277 if (is_exact_) {
278 int n = 0;
279 std::string s;
280 for (SSIter i = exact_.begin(); i != exact_.end(); ++i) {
281 if (n++ > 0)
282 s += ",";
283 s += *i;
284 }
285 return s;
286 }
287
288 if (match_)
289 return match_->DebugString();
290
291 return "";
292 }
293
CrossProduct(const SSet & a,const SSet & b,SSet * dst)294 void Prefilter::CrossProduct(const SSet& a, const SSet& b, SSet* dst) {
295 for (ConstSSIter i = a.begin(); i != a.end(); ++i)
296 for (ConstSSIter j = b.begin(); j != b.end(); ++j)
297 dst->insert(*i + *j);
298 }
299
300 // Concats a and b. Requires that both are exact sets.
301 // Forms an exact set that is a crossproduct of a and b.
Concat(Info * a,Info * b)302 Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
303 if (a == NULL)
304 return b;
305 ABSL_DCHECK(a->is_exact_);
306 ABSL_DCHECK(b && b->is_exact_);
307 Info *ab = new Info();
308
309 CrossProduct(a->exact_, b->exact_, &ab->exact_);
310 ab->is_exact_ = true;
311
312 delete a;
313 delete b;
314 return ab;
315 }
316
317 // Constructs an inexact Info for ab given a and b.
318 // Used only when a or b is not exact or when the
319 // exact cross product is likely to be too big.
And(Info * a,Info * b)320 Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
321 if (a == NULL)
322 return b;
323 if (b == NULL)
324 return a;
325
326 Info *ab = new Info();
327
328 ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
329 ab->is_exact_ = false;
330 delete a;
331 delete b;
332 return ab;
333 }
334
335 // Constructs Info for a|b given a and b.
Alt(Info * a,Info * b)336 Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
337 Info *ab = new Info();
338
339 if (a->is_exact_ && b->is_exact_) {
340 // Avoid string copies by moving the larger exact_ set into
341 // ab directly, then merge in the smaller set.
342 if (a->exact_.size() < b->exact_.size()) {
343 using std::swap;
344 swap(a, b);
345 }
346 ab->exact_ = std::move(a->exact_);
347 ab->exact_.insert(b->exact_.begin(), b->exact_.end());
348 ab->is_exact_ = true;
349 } else {
350 // Either a or b has is_exact_ = false. If the other
351 // one has is_exact_ = true, we move it to match_ and
352 // then create a OR of a,b. The resulting Info has
353 // is_exact_ = false.
354 ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
355 ab->is_exact_ = false;
356 }
357
358 delete a;
359 delete b;
360 return ab;
361 }
362
363 // Constructs Info for a? given a.
Quest(Info * a)364 Prefilter::Info* Prefilter::Info::Quest(Info *a) {
365 Info *ab = new Info();
366
367 ab->is_exact_ = false;
368 ab->match_ = new Prefilter(ALL);
369 delete a;
370 return ab;
371 }
372
373 // Constructs Info for a* given a.
374 // Same as a? -- not much to do.
Star(Info * a)375 Prefilter::Info* Prefilter::Info::Star(Info *a) {
376 return Quest(a);
377 }
378
379 // Constructs Info for a+ given a. If a was exact set, it isn't
380 // anymore.
Plus(Info * a)381 Prefilter::Info* Prefilter::Info::Plus(Info *a) {
382 Info *ab = new Info();
383
384 ab->match_ = a->TakeMatch();
385 ab->is_exact_ = false;
386
387 delete a;
388 return ab;
389 }
390
RuneToString(Rune r)391 static std::string RuneToString(Rune r) {
392 char buf[UTFmax];
393 int n = runetochar(buf, &r);
394 return std::string(buf, n);
395 }
396
RuneToStringLatin1(Rune r)397 static std::string RuneToStringLatin1(Rune r) {
398 char c = r & 0xff;
399 return std::string(&c, 1);
400 }
401
402 // Constructs Info for literal rune.
Literal(Rune r)403 Prefilter::Info* Prefilter::Info::Literal(Rune r) {
404 Info* info = new Info();
405 info->exact_.insert(RuneToString(ToLowerRune(r)));
406 info->is_exact_ = true;
407 return info;
408 }
409
410 // Constructs Info for literal rune for Latin1 encoded string.
LiteralLatin1(Rune r)411 Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
412 Info* info = new Info();
413 info->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
414 info->is_exact_ = true;
415 return info;
416 }
417
418 // Constructs Info for dot (any character) or \C (any byte).
AnyCharOrAnyByte()419 Prefilter::Info* Prefilter::Info::AnyCharOrAnyByte() {
420 Prefilter::Info* info = new Prefilter::Info();
421 info->match_ = new Prefilter(ALL);
422 return info;
423 }
424
425 // Constructs Prefilter::Info for no possible match.
NoMatch()426 Prefilter::Info* Prefilter::Info::NoMatch() {
427 Prefilter::Info* info = new Prefilter::Info();
428 info->match_ = new Prefilter(NONE);
429 return info;
430 }
431
432 // Constructs Prefilter::Info for any possible match.
433 // This Prefilter::Info is valid for any regular expression,
434 // since it makes no assertions whatsoever about the
435 // strings being matched.
AnyMatch()436 Prefilter::Info* Prefilter::Info::AnyMatch() {
437 Prefilter::Info *info = new Prefilter::Info();
438 info->match_ = new Prefilter(ALL);
439 return info;
440 }
441
442 // Constructs Prefilter::Info for just the empty string.
EmptyString()443 Prefilter::Info* Prefilter::Info::EmptyString() {
444 Prefilter::Info* info = new Prefilter::Info();
445 info->is_exact_ = true;
446 info->exact_.insert("");
447 return info;
448 }
449
450 // Constructs Prefilter::Info for a character class.
451 typedef CharClass::iterator CCIter;
CClass(CharClass * cc,bool latin1)452 Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
453 bool latin1) {
454 if (ExtraDebug) {
455 ABSL_LOG(ERROR) << "CharClassInfo:";
456 for (CCIter i = cc->begin(); i != cc->end(); ++i)
457 ABSL_LOG(ERROR) << " " << i->lo << "-" << i->hi;
458 }
459
460 // If the class is too large, it's okay to overestimate.
461 if (cc->size() > 10)
462 return AnyCharOrAnyByte();
463
464 Prefilter::Info *a = new Prefilter::Info();
465 for (CCIter i = cc->begin(); i != cc->end(); ++i)
466 for (Rune r = i->lo; r <= i->hi; r++) {
467 if (latin1) {
468 a->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
469 } else {
470 a->exact_.insert(RuneToString(ToLowerRune(r)));
471 }
472 }
473
474
475 a->is_exact_ = true;
476
477 if (ExtraDebug)
478 ABSL_LOG(ERROR) << " = " << a->ToString();
479
480 return a;
481 }
482
483 class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
484 public:
Walker(bool latin1)485 Walker(bool latin1) : latin1_(latin1) {}
486
487 virtual Info* PostVisit(
488 Regexp* re, Info* parent_arg,
489 Info* pre_arg,
490 Info** child_args, int nchild_args);
491
492 virtual Info* ShortVisit(
493 Regexp* re,
494 Info* parent_arg);
495
latin1()496 bool latin1() { return latin1_; }
497 private:
498 bool latin1_;
499
500 Walker(const Walker&) = delete;
501 Walker& operator=(const Walker&) = delete;
502 };
503
BuildInfo(Regexp * re)504 Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
505 if (ExtraDebug)
506 ABSL_LOG(ERROR) << "BuildPrefilter::Info: " << re->ToString();
507
508 bool latin1 = (re->parse_flags() & Regexp::Latin1) != 0;
509 Prefilter::Info::Walker w(latin1);
510 Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
511
512 if (w.stopped_early()) {
513 delete info;
514 return NULL;
515 }
516
517 return info;
518 }
519
ShortVisit(Regexp * re,Prefilter::Info * parent_arg)520 Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
521 Regexp* re, Prefilter::Info* parent_arg) {
522 return AnyMatch();
523 }
524
525 // Constructs the Prefilter::Info for the given regular expression.
526 // Assumes re is simplified.
PostVisit(Regexp * re,Prefilter::Info * parent_arg,Prefilter::Info * pre_arg,Prefilter::Info ** child_args,int nchild_args)527 Prefilter::Info* Prefilter::Info::Walker::PostVisit(
528 Regexp* re, Prefilter::Info* parent_arg,
529 Prefilter::Info* pre_arg, Prefilter::Info** child_args,
530 int nchild_args) {
531 Prefilter::Info *info;
532 switch (re->op()) {
533 default:
534 case kRegexpRepeat:
535 info = EmptyString();
536 ABSL_LOG(DFATAL) << "Bad regexp op " << re->op();
537 break;
538
539 case kRegexpNoMatch:
540 info = NoMatch();
541 break;
542
543 // These ops match the empty string:
544 case kRegexpEmptyMatch: // anywhere
545 case kRegexpBeginLine: // at beginning of line
546 case kRegexpEndLine: // at end of line
547 case kRegexpBeginText: // at beginning of text
548 case kRegexpEndText: // at end of text
549 case kRegexpWordBoundary: // at word boundary
550 case kRegexpNoWordBoundary: // not at word boundary
551 info = EmptyString();
552 break;
553
554 case kRegexpLiteral:
555 if (latin1()) {
556 info = LiteralLatin1(re->rune());
557 }
558 else {
559 info = Literal(re->rune());
560 }
561 break;
562
563 case kRegexpLiteralString:
564 if (re->nrunes() == 0) {
565 info = NoMatch();
566 break;
567 }
568 if (latin1()) {
569 info = LiteralLatin1(re->runes()[0]);
570 for (int i = 1; i < re->nrunes(); i++) {
571 info = Concat(info, LiteralLatin1(re->runes()[i]));
572 }
573 } else {
574 info = Literal(re->runes()[0]);
575 for (int i = 1; i < re->nrunes(); i++) {
576 info = Concat(info, Literal(re->runes()[i]));
577 }
578 }
579 break;
580
581 case kRegexpConcat: {
582 // Accumulate in info.
583 // Exact is concat of recent contiguous exact nodes.
584 info = NULL;
585 Info* exact = NULL;
586 for (int i = 0; i < nchild_args; i++) {
587 Info* ci = child_args[i]; // child info
588 if (!ci->is_exact() ||
589 (exact && ci->exact().size() * exact->exact().size() > 16)) {
590 // Exact run is over.
591 info = And(info, exact);
592 exact = NULL;
593 // Add this child's info.
594 info = And(info, ci);
595 } else {
596 // Append to exact run.
597 exact = Concat(exact, ci);
598 }
599 }
600 info = And(info, exact);
601 }
602 break;
603
604 case kRegexpAlternate:
605 info = child_args[0];
606 for (int i = 1; i < nchild_args; i++)
607 info = Alt(info, child_args[i]);
608 break;
609
610 case kRegexpStar:
611 info = Star(child_args[0]);
612 break;
613
614 case kRegexpQuest:
615 info = Quest(child_args[0]);
616 break;
617
618 case kRegexpPlus:
619 info = Plus(child_args[0]);
620 break;
621
622 case kRegexpAnyChar:
623 case kRegexpAnyByte:
624 // Claim nothing, except that it's not empty.
625 info = AnyCharOrAnyByte();
626 break;
627
628 case kRegexpCharClass:
629 info = CClass(re->cc(), latin1());
630 break;
631
632 case kRegexpCapture:
633 // These don't affect the set of matching strings.
634 info = child_args[0];
635 break;
636 }
637
638 if (ExtraDebug)
639 ABSL_LOG(ERROR) << "BuildInfo " << re->ToString()
640 << ": " << (info ? info->ToString() : "");
641
642 return info;
643 }
644
645
FromRegexp(Regexp * re)646 Prefilter* Prefilter::FromRegexp(Regexp* re) {
647 if (re == NULL)
648 return NULL;
649
650 Regexp* simple = re->Simplify();
651 if (simple == NULL)
652 return NULL;
653
654 Prefilter::Info* info = BuildInfo(simple);
655 simple->Decref();
656 if (info == NULL)
657 return NULL;
658
659 Prefilter* m = info->TakeMatch();
660 delete info;
661 return m;
662 }
663
DebugString() const664 std::string Prefilter::DebugString() const {
665 switch (op_) {
666 default:
667 ABSL_LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
668 return absl::StrFormat("op%d", op_);
669 case NONE:
670 return "*no-matches*";
671 case ATOM:
672 return atom_;
673 case ALL:
674 return "";
675 case AND: {
676 std::string s = "";
677 for (size_t i = 0; i < subs_->size(); i++) {
678 if (i > 0)
679 s += " ";
680 Prefilter* sub = (*subs_)[i];
681 s += sub ? sub->DebugString() : "<nil>";
682 }
683 return s;
684 }
685 case OR: {
686 std::string s = "(";
687 for (size_t i = 0; i < subs_->size(); i++) {
688 if (i > 0)
689 s += "|";
690 Prefilter* sub = (*subs_)[i];
691 s += sub ? sub->DebugString() : "<nil>";
692 }
693 s += ")";
694 return s;
695 }
696 }
697 }
698
FromRE2(const RE2 * re2)699 Prefilter* Prefilter::FromRE2(const RE2* re2) {
700 if (re2 == NULL)
701 return NULL;
702
703 Regexp* regexp = re2->Regexp();
704 if (regexp == NULL)
705 return NULL;
706
707 return FromRegexp(regexp);
708 }
709
710
711 } // namespace re2
712