• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2009 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #include "re2/prefilter.h"
6 
7 #include <stddef.h>
8 
9 #include <string>
10 #include <utility>
11 #include <vector>
12 
13 #include "absl/log/absl_check.h"
14 #include "absl/log/absl_log.h"
15 #include "absl/strings/str_format.h"
16 #include "re2/re2.h"
17 #include "re2/regexp.h"
18 #include "re2/unicode_casefold.h"
19 #include "re2/walker-inl.h"
20 #include "util/utf.h"
21 
22 namespace re2 {
23 
24 static const bool ExtraDebug = false;
25 
26 // Initializes a Prefilter, allocating subs_ as necessary.
Prefilter(Op op)27 Prefilter::Prefilter(Op op) {
28   op_ = op;
29   subs_ = NULL;
30   if (op_ == AND || op_ == OR)
31     subs_ = new std::vector<Prefilter*>;
32 }
33 
34 // Destroys a Prefilter.
~Prefilter()35 Prefilter::~Prefilter() {
36   if (subs_) {
37     for (size_t i = 0; i < subs_->size(); i++)
38       delete (*subs_)[i];
39     delete subs_;
40     subs_ = NULL;
41   }
42 }
43 
44 // Simplify if the node is an empty Or or And.
Simplify()45 Prefilter* Prefilter::Simplify() {
46   if (op_ != AND && op_ != OR) {
47     return this;
48   }
49 
50   // Nothing left in the AND/OR.
51   if (subs_->empty()) {
52     if (op_ == AND)
53       op_ = ALL;  // AND of nothing is true
54     else
55       op_ = NONE;  // OR of nothing is false
56 
57     return this;
58   }
59 
60   // Just one subnode: throw away wrapper.
61   if (subs_->size() == 1) {
62     Prefilter* a = (*subs_)[0];
63     subs_->clear();
64     delete this;
65     return a->Simplify();
66   }
67 
68   return this;
69 }
70 
71 // Combines two Prefilters together to create an "op" (AND or OR).
72 // The passed Prefilters will be part of the returned Prefilter or deleted.
73 // Does lots of work to avoid creating unnecessarily complicated structures.
AndOr(Op op,Prefilter * a,Prefilter * b)74 Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
75   // If a, b can be rewritten as op, do so.
76   a = a->Simplify();
77   b = b->Simplify();
78 
79   // Canonicalize: a->op <= b->op.
80   if (a->op() > b->op()) {
81     Prefilter* t = a;
82     a = b;
83     b = t;
84   }
85 
86   // Trivial cases.
87   //    ALL AND b = b
88   //    NONE OR b = b
89   //    ALL OR b   = ALL
90   //    NONE AND b = NONE
91   // Don't need to look at b, because of canonicalization above.
92   // ALL and NONE are smallest opcodes.
93   if (a->op() == ALL || a->op() == NONE) {
94     if ((a->op() == ALL && op == AND) ||
95         (a->op() == NONE && op == OR)) {
96       delete a;
97       return b;
98     } else {
99       delete b;
100       return a;
101     }
102   }
103 
104   // If a and b match op, merge their contents.
105   if (a->op() == op && b->op() == op) {
106     for (size_t i = 0; i < b->subs()->size(); i++) {
107       Prefilter* bb = (*b->subs())[i];
108       a->subs()->push_back(bb);
109     }
110     b->subs()->clear();
111     delete b;
112     return a;
113   }
114 
115   // If a already has the same op as the op that is under construction
116   // add in b (similarly if b already has the same op, add in a).
117   if (b->op() == op) {
118     Prefilter* t = a;
119     a = b;
120     b = t;
121   }
122   if (a->op() == op) {
123     a->subs()->push_back(b);
124     return a;
125   }
126 
127   // Otherwise just return the op.
128   Prefilter* c = new Prefilter(op);
129   c->subs()->push_back(a);
130   c->subs()->push_back(b);
131   return c;
132 }
133 
And(Prefilter * a,Prefilter * b)134 Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
135   return AndOr(AND, a, b);
136 }
137 
Or(Prefilter * a,Prefilter * b)138 Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
139   return AndOr(OR, a, b);
140 }
141 
SimplifyStringSet(SSet * ss)142 void Prefilter::SimplifyStringSet(SSet* ss) {
143   // Now make sure that the strings aren't redundant.  For example, if
144   // we know "ab" is a required string, then it doesn't help at all to
145   // know that "abc" is also a required string, so delete "abc". This
146   // is because, when we are performing a string search to filter
147   // regexps, matching "ab" will already allow this regexp to be a
148   // candidate for match, so further matching "abc" is redundant.
149   // Note that we must ignore "" because find() would find it at the
150   // start of everything and thus we would end up erasing everything.
151   //
152   // The SSet sorts strings by length, then lexicographically. Note that
153   // smaller strings appear first and all strings must be unique. These
154   // observations let us skip string comparisons when possible.
155   SSIter i = ss->begin();
156   if (i != ss->end() && i->empty()) {
157     ++i;
158   }
159   for (; i != ss->end(); ++i) {
160     SSIter j = i;
161     ++j;
162     while (j != ss->end()) {
163       if (j->size() > i->size() && j->find(*i) != std::string::npos) {
164         j = ss->erase(j);
165         continue;
166       }
167       ++j;
168     }
169   }
170 }
171 
OrStrings(SSet * ss)172 Prefilter* Prefilter::OrStrings(SSet* ss) {
173   Prefilter* or_prefilter = new Prefilter(NONE);
174   SimplifyStringSet(ss);
175   for (SSIter i = ss->begin(); i != ss->end(); ++i)
176     or_prefilter = Or(or_prefilter, FromString(*i));
177   return or_prefilter;
178 }
179 
ToLowerRune(Rune r)180 static Rune ToLowerRune(Rune r) {
181   if (r < Runeself) {
182     if ('A' <= r && r <= 'Z')
183       r += 'a' - 'A';
184     return r;
185   }
186 
187   const CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
188   if (f == NULL || r < f->lo)
189     return r;
190   return ApplyFold(f, r);
191 }
192 
ToLowerRuneLatin1(Rune r)193 static Rune ToLowerRuneLatin1(Rune r) {
194   if ('A' <= r && r <= 'Z')
195     r += 'a' - 'A';
196   return r;
197 }
198 
FromString(const std::string & str)199 Prefilter* Prefilter::FromString(const std::string& str) {
200   Prefilter* m = new Prefilter(Prefilter::ATOM);
201   m->atom_ = str;
202   return m;
203 }
204 
205 // Information about a regexp used during computation of Prefilter.
206 // Can be thought of as information about the set of strings matching
207 // the given regular expression.
208 class Prefilter::Info {
209  public:
210   Info();
211   ~Info();
212 
213   // More constructors.  They delete their Info* arguments.
214   static Info* Alt(Info* a, Info* b);
215   static Info* Concat(Info* a, Info* b);
216   static Info* And(Info* a, Info* b);
217   static Info* Star(Info* a);
218   static Info* Plus(Info* a);
219   static Info* Quest(Info* a);
220   static Info* EmptyString();
221   static Info* NoMatch();
222   static Info* AnyCharOrAnyByte();
223   static Info* CClass(CharClass* cc, bool latin1);
224   static Info* Literal(Rune r);
225   static Info* LiteralLatin1(Rune r);
226   static Info* AnyMatch();
227 
228   // Format Info as a string.
229   std::string ToString();
230 
231   // Caller takes ownership of the Prefilter.
232   Prefilter* TakeMatch();
233 
exact()234   SSet& exact() { return exact_; }
235 
is_exact() const236   bool is_exact() const { return is_exact_; }
237 
238   class Walker;
239 
240  private:
241   SSet exact_;
242 
243   // When is_exact_ is true, the strings that match
244   // are placed in exact_. When it is no longer an exact
245   // set of strings that match this RE, then is_exact_
246   // is false and the match_ contains the required match
247   // criteria.
248   bool is_exact_;
249 
250   // Accumulated Prefilter query that any
251   // match for this regexp is guaranteed to match.
252   Prefilter* match_;
253 };
254 
255 
Info()256 Prefilter::Info::Info()
257   : is_exact_(false),
258     match_(NULL) {
259 }
260 
~Info()261 Prefilter::Info::~Info() {
262   delete match_;
263 }
264 
TakeMatch()265 Prefilter* Prefilter::Info::TakeMatch() {
266   if (is_exact_) {
267     match_ = Prefilter::OrStrings(&exact_);
268     is_exact_ = false;
269   }
270   Prefilter* m = match_;
271   match_ = NULL;
272   return m;
273 }
274 
275 // Format a Info in string form.
ToString()276 std::string Prefilter::Info::ToString() {
277   if (is_exact_) {
278     int n = 0;
279     std::string s;
280     for (SSIter i = exact_.begin(); i != exact_.end(); ++i) {
281       if (n++ > 0)
282         s += ",";
283       s += *i;
284     }
285     return s;
286   }
287 
288   if (match_)
289     return match_->DebugString();
290 
291   return "";
292 }
293 
CrossProduct(const SSet & a,const SSet & b,SSet * dst)294 void Prefilter::CrossProduct(const SSet& a, const SSet& b, SSet* dst) {
295   for (ConstSSIter i = a.begin(); i != a.end(); ++i)
296     for (ConstSSIter j = b.begin(); j != b.end(); ++j)
297       dst->insert(*i + *j);
298 }
299 
300 // Concats a and b. Requires that both are exact sets.
301 // Forms an exact set that is a crossproduct of a and b.
Concat(Info * a,Info * b)302 Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
303   if (a == NULL)
304     return b;
305   ABSL_DCHECK(a->is_exact_);
306   ABSL_DCHECK(b && b->is_exact_);
307   Info *ab = new Info();
308 
309   CrossProduct(a->exact_, b->exact_, &ab->exact_);
310   ab->is_exact_ = true;
311 
312   delete a;
313   delete b;
314   return ab;
315 }
316 
317 // Constructs an inexact Info for ab given a and b.
318 // Used only when a or b is not exact or when the
319 // exact cross product is likely to be too big.
And(Info * a,Info * b)320 Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
321   if (a == NULL)
322     return b;
323   if (b == NULL)
324     return a;
325 
326   Info *ab = new Info();
327 
328   ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
329   ab->is_exact_ = false;
330   delete a;
331   delete b;
332   return ab;
333 }
334 
335 // Constructs Info for a|b given a and b.
Alt(Info * a,Info * b)336 Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
337   Info *ab = new Info();
338 
339   if (a->is_exact_ && b->is_exact_) {
340     // Avoid string copies by moving the larger exact_ set into
341     // ab directly, then merge in the smaller set.
342     if (a->exact_.size() < b->exact_.size()) {
343       using std::swap;
344       swap(a, b);
345     }
346     ab->exact_ = std::move(a->exact_);
347     ab->exact_.insert(b->exact_.begin(), b->exact_.end());
348     ab->is_exact_ = true;
349   } else {
350     // Either a or b has is_exact_ = false. If the other
351     // one has is_exact_ = true, we move it to match_ and
352     // then create a OR of a,b. The resulting Info has
353     // is_exact_ = false.
354     ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
355     ab->is_exact_ = false;
356   }
357 
358   delete a;
359   delete b;
360   return ab;
361 }
362 
363 // Constructs Info for a? given a.
Quest(Info * a)364 Prefilter::Info* Prefilter::Info::Quest(Info *a) {
365   Info *ab = new Info();
366 
367   ab->is_exact_ = false;
368   ab->match_ = new Prefilter(ALL);
369   delete a;
370   return ab;
371 }
372 
373 // Constructs Info for a* given a.
374 // Same as a? -- not much to do.
Star(Info * a)375 Prefilter::Info* Prefilter::Info::Star(Info *a) {
376   return Quest(a);
377 }
378 
379 // Constructs Info for a+ given a. If a was exact set, it isn't
380 // anymore.
Plus(Info * a)381 Prefilter::Info* Prefilter::Info::Plus(Info *a) {
382   Info *ab = new Info();
383 
384   ab->match_ = a->TakeMatch();
385   ab->is_exact_ = false;
386 
387   delete a;
388   return ab;
389 }
390 
RuneToString(Rune r)391 static std::string RuneToString(Rune r) {
392   char buf[UTFmax];
393   int n = runetochar(buf, &r);
394   return std::string(buf, n);
395 }
396 
RuneToStringLatin1(Rune r)397 static std::string RuneToStringLatin1(Rune r) {
398   char c = r & 0xff;
399   return std::string(&c, 1);
400 }
401 
402 // Constructs Info for literal rune.
Literal(Rune r)403 Prefilter::Info* Prefilter::Info::Literal(Rune r) {
404   Info* info = new Info();
405   info->exact_.insert(RuneToString(ToLowerRune(r)));
406   info->is_exact_ = true;
407   return info;
408 }
409 
410 // Constructs Info for literal rune for Latin1 encoded string.
LiteralLatin1(Rune r)411 Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
412   Info* info = new Info();
413   info->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
414   info->is_exact_ = true;
415   return info;
416 }
417 
418 // Constructs Info for dot (any character) or \C (any byte).
AnyCharOrAnyByte()419 Prefilter::Info* Prefilter::Info::AnyCharOrAnyByte() {
420   Prefilter::Info* info = new Prefilter::Info();
421   info->match_ = new Prefilter(ALL);
422   return info;
423 }
424 
425 // Constructs Prefilter::Info for no possible match.
NoMatch()426 Prefilter::Info* Prefilter::Info::NoMatch() {
427   Prefilter::Info* info = new Prefilter::Info();
428   info->match_ = new Prefilter(NONE);
429   return info;
430 }
431 
432 // Constructs Prefilter::Info for any possible match.
433 // This Prefilter::Info is valid for any regular expression,
434 // since it makes no assertions whatsoever about the
435 // strings being matched.
AnyMatch()436 Prefilter::Info* Prefilter::Info::AnyMatch() {
437   Prefilter::Info *info = new Prefilter::Info();
438   info->match_ = new Prefilter(ALL);
439   return info;
440 }
441 
442 // Constructs Prefilter::Info for just the empty string.
EmptyString()443 Prefilter::Info* Prefilter::Info::EmptyString() {
444   Prefilter::Info* info = new Prefilter::Info();
445   info->is_exact_ = true;
446   info->exact_.insert("");
447   return info;
448 }
449 
450 // Constructs Prefilter::Info for a character class.
451 typedef CharClass::iterator CCIter;
CClass(CharClass * cc,bool latin1)452 Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
453                                          bool latin1) {
454   if (ExtraDebug) {
455     ABSL_LOG(ERROR) << "CharClassInfo:";
456     for (CCIter i = cc->begin(); i != cc->end(); ++i)
457       ABSL_LOG(ERROR) << "  " << i->lo << "-" << i->hi;
458   }
459 
460   // If the class is too large, it's okay to overestimate.
461   if (cc->size() > 10)
462     return AnyCharOrAnyByte();
463 
464   Prefilter::Info *a = new Prefilter::Info();
465   for (CCIter i = cc->begin(); i != cc->end(); ++i)
466     for (Rune r = i->lo; r <= i->hi; r++) {
467       if (latin1) {
468         a->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
469       } else {
470         a->exact_.insert(RuneToString(ToLowerRune(r)));
471       }
472     }
473 
474 
475   a->is_exact_ = true;
476 
477   if (ExtraDebug)
478     ABSL_LOG(ERROR) << " = " << a->ToString();
479 
480   return a;
481 }
482 
483 class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
484  public:
Walker(bool latin1)485   Walker(bool latin1) : latin1_(latin1) {}
486 
487   virtual Info* PostVisit(
488       Regexp* re, Info* parent_arg,
489       Info* pre_arg,
490       Info** child_args, int nchild_args);
491 
492   virtual Info* ShortVisit(
493       Regexp* re,
494       Info* parent_arg);
495 
latin1()496   bool latin1() { return latin1_; }
497  private:
498   bool latin1_;
499 
500   Walker(const Walker&) = delete;
501   Walker& operator=(const Walker&) = delete;
502 };
503 
BuildInfo(Regexp * re)504 Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
505   if (ExtraDebug)
506     ABSL_LOG(ERROR) << "BuildPrefilter::Info: " << re->ToString();
507 
508   bool latin1 = (re->parse_flags() & Regexp::Latin1) != 0;
509   Prefilter::Info::Walker w(latin1);
510   Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
511 
512   if (w.stopped_early()) {
513     delete info;
514     return NULL;
515   }
516 
517   return info;
518 }
519 
ShortVisit(Regexp * re,Prefilter::Info * parent_arg)520 Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
521     Regexp* re, Prefilter::Info* parent_arg) {
522   return AnyMatch();
523 }
524 
525 // Constructs the Prefilter::Info for the given regular expression.
526 // Assumes re is simplified.
PostVisit(Regexp * re,Prefilter::Info * parent_arg,Prefilter::Info * pre_arg,Prefilter::Info ** child_args,int nchild_args)527 Prefilter::Info* Prefilter::Info::Walker::PostVisit(
528     Regexp* re, Prefilter::Info* parent_arg,
529     Prefilter::Info* pre_arg, Prefilter::Info** child_args,
530     int nchild_args) {
531   Prefilter::Info *info;
532   switch (re->op()) {
533     default:
534     case kRegexpRepeat:
535       info = EmptyString();
536       ABSL_LOG(DFATAL) << "Bad regexp op " << re->op();
537       break;
538 
539     case kRegexpNoMatch:
540       info = NoMatch();
541       break;
542 
543     // These ops match the empty string:
544     case kRegexpEmptyMatch:      // anywhere
545     case kRegexpBeginLine:       // at beginning of line
546     case kRegexpEndLine:         // at end of line
547     case kRegexpBeginText:       // at beginning of text
548     case kRegexpEndText:         // at end of text
549     case kRegexpWordBoundary:    // at word boundary
550     case kRegexpNoWordBoundary:  // not at word boundary
551       info = EmptyString();
552       break;
553 
554     case kRegexpLiteral:
555       if (latin1()) {
556         info = LiteralLatin1(re->rune());
557       }
558       else {
559         info = Literal(re->rune());
560       }
561       break;
562 
563     case kRegexpLiteralString:
564       if (re->nrunes() == 0) {
565         info = NoMatch();
566         break;
567       }
568       if (latin1()) {
569         info = LiteralLatin1(re->runes()[0]);
570         for (int i = 1; i < re->nrunes(); i++) {
571           info = Concat(info, LiteralLatin1(re->runes()[i]));
572         }
573       } else {
574         info = Literal(re->runes()[0]);
575         for (int i = 1; i < re->nrunes(); i++) {
576           info = Concat(info, Literal(re->runes()[i]));
577         }
578       }
579       break;
580 
581     case kRegexpConcat: {
582       // Accumulate in info.
583       // Exact is concat of recent contiguous exact nodes.
584       info = NULL;
585       Info* exact = NULL;
586       for (int i = 0; i < nchild_args; i++) {
587         Info* ci = child_args[i];  // child info
588         if (!ci->is_exact() ||
589             (exact && ci->exact().size() * exact->exact().size() > 16)) {
590           // Exact run is over.
591           info = And(info, exact);
592           exact = NULL;
593           // Add this child's info.
594           info = And(info, ci);
595         } else {
596           // Append to exact run.
597           exact = Concat(exact, ci);
598         }
599       }
600       info = And(info, exact);
601     }
602       break;
603 
604     case kRegexpAlternate:
605       info = child_args[0];
606       for (int i = 1; i < nchild_args; i++)
607         info = Alt(info, child_args[i]);
608       break;
609 
610     case kRegexpStar:
611       info = Star(child_args[0]);
612       break;
613 
614     case kRegexpQuest:
615       info = Quest(child_args[0]);
616       break;
617 
618     case kRegexpPlus:
619       info = Plus(child_args[0]);
620       break;
621 
622     case kRegexpAnyChar:
623     case kRegexpAnyByte:
624       // Claim nothing, except that it's not empty.
625       info = AnyCharOrAnyByte();
626       break;
627 
628     case kRegexpCharClass:
629       info = CClass(re->cc(), latin1());
630       break;
631 
632     case kRegexpCapture:
633       // These don't affect the set of matching strings.
634       info = child_args[0];
635       break;
636   }
637 
638   if (ExtraDebug)
639     ABSL_LOG(ERROR) << "BuildInfo " << re->ToString()
640                     << ": " << (info ? info->ToString() : "");
641 
642   return info;
643 }
644 
645 
FromRegexp(Regexp * re)646 Prefilter* Prefilter::FromRegexp(Regexp* re) {
647   if (re == NULL)
648     return NULL;
649 
650   Regexp* simple = re->Simplify();
651   if (simple == NULL)
652     return NULL;
653 
654   Prefilter::Info* info = BuildInfo(simple);
655   simple->Decref();
656   if (info == NULL)
657     return NULL;
658 
659   Prefilter* m = info->TakeMatch();
660   delete info;
661   return m;
662 }
663 
DebugString() const664 std::string Prefilter::DebugString() const {
665   switch (op_) {
666     default:
667       ABSL_LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
668       return absl::StrFormat("op%d", op_);
669     case NONE:
670       return "*no-matches*";
671     case ATOM:
672       return atom_;
673     case ALL:
674       return "";
675     case AND: {
676       std::string s = "";
677       for (size_t i = 0; i < subs_->size(); i++) {
678         if (i > 0)
679           s += " ";
680         Prefilter* sub = (*subs_)[i];
681         s += sub ? sub->DebugString() : "<nil>";
682       }
683       return s;
684     }
685     case OR: {
686       std::string s = "(";
687       for (size_t i = 0; i < subs_->size(); i++) {
688         if (i > 0)
689           s += "|";
690         Prefilter* sub = (*subs_)[i];
691         s += sub ? sub->DebugString() : "<nil>";
692       }
693       s += ")";
694       return s;
695     }
696   }
697 }
698 
FromRE2(const RE2 * re2)699 Prefilter* Prefilter::FromRE2(const RE2* re2) {
700   if (re2 == NULL)
701     return NULL;
702 
703   Regexp* regexp = re2->Regexp();
704   if (regexp == NULL)
705     return NULL;
706 
707   return FromRegexp(regexp);
708 }
709 
710 
711 }  // namespace re2
712