• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2009 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #include "re2/prefilter.h"
6 
7 #include <stddef.h>
8 #include <stdint.h>
9 #include <string>
10 #include <vector>
11 
12 #include "util/util.h"
13 #include "util/logging.h"
14 #include "util/strutil.h"
15 #include "util/utf.h"
16 #include "re2/re2.h"
17 #include "re2/unicode_casefold.h"
18 #include "re2/walker-inl.h"
19 
20 namespace re2 {
21 
22 static const bool ExtraDebug = false;
23 
24 typedef std::set<string>::iterator SSIter;
25 typedef std::set<string>::const_iterator ConstSSIter;
26 
27 // Initializes a Prefilter, allocating subs_ as necessary.
Prefilter(Op op)28 Prefilter::Prefilter(Op op) {
29   op_ = op;
30   subs_ = NULL;
31   if (op_ == AND || op_ == OR)
32     subs_ = new std::vector<Prefilter*>;
33 }
34 
35 // Destroys a Prefilter.
~Prefilter()36 Prefilter::~Prefilter() {
37   if (subs_) {
38     for (size_t i = 0; i < subs_->size(); i++)
39       delete (*subs_)[i];
40     delete subs_;
41     subs_ = NULL;
42   }
43 }
44 
45 // Simplify if the node is an empty Or or And.
Simplify()46 Prefilter* Prefilter::Simplify() {
47   if (op_ != AND && op_ != OR) {
48     return this;
49   }
50 
51   // Nothing left in the AND/OR.
52   if (subs_->empty()) {
53     if (op_ == AND)
54       op_ = ALL;  // AND of nothing is true
55     else
56       op_ = NONE;  // OR of nothing is false
57 
58     return this;
59   }
60 
61   // Just one subnode: throw away wrapper.
62   if (subs_->size() == 1) {
63     Prefilter* a = (*subs_)[0];
64     subs_->clear();
65     delete this;
66     return a->Simplify();
67   }
68 
69   return this;
70 }
71 
72 // Combines two Prefilters together to create an "op" (AND or OR).
73 // The passed Prefilters will be part of the returned Prefilter or deleted.
74 // Does lots of work to avoid creating unnecessarily complicated structures.
AndOr(Op op,Prefilter * a,Prefilter * b)75 Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
76   // If a, b can be rewritten as op, do so.
77   a = a->Simplify();
78   b = b->Simplify();
79 
80   // Canonicalize: a->op <= b->op.
81   if (a->op() > b->op()) {
82     Prefilter* t = a;
83     a = b;
84     b = t;
85   }
86 
87   // Trivial cases.
88   //    ALL AND b = b
89   //    NONE OR b = b
90   //    ALL OR b   = ALL
91   //    NONE AND b = NONE
92   // Don't need to look at b, because of canonicalization above.
93   // ALL and NONE are smallest opcodes.
94   if (a->op() == ALL || a->op() == NONE) {
95     if ((a->op() == ALL && op == AND) ||
96         (a->op() == NONE && op == OR)) {
97       delete a;
98       return b;
99     } else {
100       delete b;
101       return a;
102     }
103   }
104 
105   // If a and b match op, merge their contents.
106   if (a->op() == op && b->op() == op) {
107     for (size_t i = 0; i < b->subs()->size(); i++) {
108       Prefilter* bb = (*b->subs())[i];
109       a->subs()->push_back(bb);
110     }
111     b->subs()->clear();
112     delete b;
113     return a;
114   }
115 
116   // If a already has the same op as the op that is under construction
117   // add in b (similarly if b already has the same op, add in a).
118   if (b->op() == op) {
119     Prefilter* t = a;
120     a = b;
121     b = t;
122   }
123   if (a->op() == op) {
124     a->subs()->push_back(b);
125     return a;
126   }
127 
128   // Otherwise just return the op.
129   Prefilter* c = new Prefilter(op);
130   c->subs()->push_back(a);
131   c->subs()->push_back(b);
132   return c;
133 }
134 
And(Prefilter * a,Prefilter * b)135 Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
136   return AndOr(AND, a, b);
137 }
138 
Or(Prefilter * a,Prefilter * b)139 Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
140   return AndOr(OR, a, b);
141 }
142 
SimplifyStringSet(std::set<string> * ss)143 static void SimplifyStringSet(std::set<string> *ss) {
144   // Now make sure that the strings aren't redundant.  For example, if
145   // we know "ab" is a required string, then it doesn't help at all to
146   // know that "abc" is also a required string, so delete "abc". This
147   // is because, when we are performing a string search to filter
148   // regexps, matching ab will already allow this regexp to be a
149   // candidate for match, so further matching abc is redundant.
150 
151   for (SSIter i = ss->begin(); i != ss->end(); ++i) {
152     SSIter j = i;
153     ++j;
154     while (j != ss->end()) {
155       // Increment j early so that we can erase the element it points to.
156       SSIter old_j = j;
157       ++j;
158       if (old_j->find(*i) != string::npos)
159         ss->erase(old_j);
160     }
161   }
162 }
163 
OrStrings(std::set<string> * ss)164 Prefilter* Prefilter::OrStrings(std::set<string>* ss) {
165   SimplifyStringSet(ss);
166   Prefilter* or_prefilter = NULL;
167   if (!ss->empty()) {
168     or_prefilter = new Prefilter(NONE);
169     for (SSIter i = ss->begin(); i != ss->end(); ++i)
170       or_prefilter = Or(or_prefilter, FromString(*i));
171   }
172   return or_prefilter;
173 }
174 
ToLowerRune(Rune r)175 static Rune ToLowerRune(Rune r) {
176   if (r < Runeself) {
177     if ('A' <= r && r <= 'Z')
178       r += 'a' - 'A';
179     return r;
180   }
181 
182   const CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
183   if (f == NULL || r < f->lo)
184     return r;
185   return ApplyFold(f, r);
186 }
187 
ToLowerRuneLatin1(Rune r)188 static Rune ToLowerRuneLatin1(Rune r) {
189   if ('A' <= r && r <= 'Z')
190     r += 'a' - 'A';
191   return r;
192 }
193 
FromString(const string & str)194 Prefilter* Prefilter::FromString(const string& str) {
195   Prefilter* m = new Prefilter(Prefilter::ATOM);
196   m->atom_ = str;
197   return m;
198 }
199 
200 // Information about a regexp used during computation of Prefilter.
201 // Can be thought of as information about the set of strings matching
202 // the given regular expression.
203 class Prefilter::Info {
204  public:
205   Info();
206   ~Info();
207 
208   // More constructors.  They delete their Info* arguments.
209   static Info* Alt(Info* a, Info* b);
210   static Info* Concat(Info* a, Info* b);
211   static Info* And(Info* a, Info* b);
212   static Info* Star(Info* a);
213   static Info* Plus(Info* a);
214   static Info* Quest(Info* a);
215   static Info* EmptyString();
216   static Info* NoMatch();
217   static Info* AnyCharOrAnyByte();
218   static Info* CClass(CharClass* cc, bool latin1);
219   static Info* Literal(Rune r);
220   static Info* LiteralLatin1(Rune r);
221   static Info* AnyMatch();
222 
223   // Format Info as a string.
224   string ToString();
225 
226   // Caller takes ownership of the Prefilter.
227   Prefilter* TakeMatch();
228 
exact()229   std::set<string>& exact() { return exact_; }
230 
is_exact() const231   bool is_exact() const { return is_exact_; }
232 
233   class Walker;
234 
235  private:
236   std::set<string> exact_;
237 
238   // When is_exact_ is true, the strings that match
239   // are placed in exact_. When it is no longer an exact
240   // set of strings that match this RE, then is_exact_
241   // is false and the match_ contains the required match
242   // criteria.
243   bool is_exact_;
244 
245   // Accumulated Prefilter query that any
246   // match for this regexp is guaranteed to match.
247   Prefilter* match_;
248 };
249 
250 
Info()251 Prefilter::Info::Info()
252   : is_exact_(false),
253     match_(NULL) {
254 }
255 
~Info()256 Prefilter::Info::~Info() {
257   delete match_;
258 }
259 
TakeMatch()260 Prefilter* Prefilter::Info::TakeMatch() {
261   if (is_exact_) {
262     match_ = Prefilter::OrStrings(&exact_);
263     is_exact_ = false;
264   }
265   Prefilter* m = match_;
266   match_ = NULL;
267   return m;
268 }
269 
270 // Format a Info in string form.
ToString()271 string Prefilter::Info::ToString() {
272   if (is_exact_) {
273     int n = 0;
274     string s;
275     for (std::set<string>::iterator i = exact_.begin();
276          i != exact_.end();
277          ++i) {
278       if (n++ > 0)
279         s += ",";
280       s += *i;
281     }
282     return s;
283   }
284 
285   if (match_)
286     return match_->DebugString();
287 
288   return "";
289 }
290 
291 // Add the strings from src to dst.
CopyIn(const std::set<string> & src,std::set<string> * dst)292 static void CopyIn(const std::set<string>& src,
293                    std::set<string>* dst) {
294   for (ConstSSIter i = src.begin(); i != src.end(); ++i)
295     dst->insert(*i);
296 }
297 
298 // Add the cross-product of a and b to dst.
299 // (For each string i in a and j in b, add i+j.)
CrossProduct(const std::set<string> & a,const std::set<string> & b,std::set<string> * dst)300 static void CrossProduct(const std::set<string>& a,
301                          const std::set<string>& b,
302                          std::set<string>* dst) {
303   for (ConstSSIter i = a.begin(); i != a.end(); ++i)
304     for (ConstSSIter j = b.begin(); j != b.end(); ++j)
305       dst->insert(*i + *j);
306 }
307 
308 // Concats a and b. Requires that both are exact sets.
309 // Forms an exact set that is a crossproduct of a and b.
Concat(Info * a,Info * b)310 Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
311   if (a == NULL)
312     return b;
313   DCHECK(a->is_exact_);
314   DCHECK(b && b->is_exact_);
315   Info *ab = new Info();
316 
317   CrossProduct(a->exact_, b->exact_, &ab->exact_);
318   ab->is_exact_ = true;
319 
320   delete a;
321   delete b;
322   return ab;
323 }
324 
325 // Constructs an inexact Info for ab given a and b.
326 // Used only when a or b is not exact or when the
327 // exact cross product is likely to be too big.
And(Info * a,Info * b)328 Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
329   if (a == NULL)
330     return b;
331   if (b == NULL)
332     return a;
333 
334   Info *ab = new Info();
335 
336   ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
337   ab->is_exact_ = false;
338   delete a;
339   delete b;
340   return ab;
341 }
342 
343 // Constructs Info for a|b given a and b.
Alt(Info * a,Info * b)344 Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
345   Info *ab = new Info();
346 
347   if (a->is_exact_ && b->is_exact_) {
348     CopyIn(a->exact_, &ab->exact_);
349     CopyIn(b->exact_, &ab->exact_);
350     ab->is_exact_ = true;
351   } else {
352     // Either a or b has is_exact_ = false. If the other
353     // one has is_exact_ = true, we move it to match_ and
354     // then create a OR of a,b. The resulting Info has
355     // is_exact_ = false.
356     ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
357     ab->is_exact_ = false;
358   }
359 
360   delete a;
361   delete b;
362   return ab;
363 }
364 
365 // Constructs Info for a? given a.
Quest(Info * a)366 Prefilter::Info* Prefilter::Info::Quest(Info *a) {
367   Info *ab = new Info();
368 
369   ab->is_exact_ = false;
370   ab->match_ = new Prefilter(ALL);
371   delete a;
372   return ab;
373 }
374 
375 // Constructs Info for a* given a.
376 // Same as a? -- not much to do.
Star(Info * a)377 Prefilter::Info* Prefilter::Info::Star(Info *a) {
378   return Quest(a);
379 }
380 
381 // Constructs Info for a+ given a. If a was exact set, it isn't
382 // anymore.
Plus(Info * a)383 Prefilter::Info* Prefilter::Info::Plus(Info *a) {
384   Info *ab = new Info();
385 
386   ab->match_ = a->TakeMatch();
387   ab->is_exact_ = false;
388 
389   delete a;
390   return ab;
391 }
392 
RuneToString(Rune r)393 static string RuneToString(Rune r) {
394   char buf[UTFmax];
395   int n = runetochar(buf, &r);
396   return string(buf, n);
397 }
398 
RuneToStringLatin1(Rune r)399 static string RuneToStringLatin1(Rune r) {
400   char c = r & 0xff;
401   return string(&c, 1);
402 }
403 
404 // Constructs Info for literal rune.
Literal(Rune r)405 Prefilter::Info* Prefilter::Info::Literal(Rune r) {
406   Info* info = new Info();
407   info->exact_.insert(RuneToString(ToLowerRune(r)));
408   info->is_exact_ = true;
409   return info;
410 }
411 
412 // Constructs Info for literal rune for Latin1 encoded string.
LiteralLatin1(Rune r)413 Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
414   Info* info = new Info();
415   info->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
416   info->is_exact_ = true;
417   return info;
418 }
419 
420 // Constructs Info for dot (any character) or \C (any byte).
AnyCharOrAnyByte()421 Prefilter::Info* Prefilter::Info::AnyCharOrAnyByte() {
422   Prefilter::Info* info = new Prefilter::Info();
423   info->match_ = new Prefilter(ALL);
424   return info;
425 }
426 
427 // Constructs Prefilter::Info for no possible match.
NoMatch()428 Prefilter::Info* Prefilter::Info::NoMatch() {
429   Prefilter::Info* info = new Prefilter::Info();
430   info->match_ = new Prefilter(NONE);
431   return info;
432 }
433 
434 // Constructs Prefilter::Info for any possible match.
435 // This Prefilter::Info is valid for any regular expression,
436 // since it makes no assertions whatsoever about the
437 // strings being matched.
AnyMatch()438 Prefilter::Info* Prefilter::Info::AnyMatch() {
439   Prefilter::Info *info = new Prefilter::Info();
440   info->match_ = new Prefilter(ALL);
441   return info;
442 }
443 
444 // Constructs Prefilter::Info for just the empty string.
EmptyString()445 Prefilter::Info* Prefilter::Info::EmptyString() {
446   Prefilter::Info* info = new Prefilter::Info();
447   info->is_exact_ = true;
448   info->exact_.insert("");
449   return info;
450 }
451 
452 // Constructs Prefilter::Info for a character class.
453 typedef CharClass::iterator CCIter;
CClass(CharClass * cc,bool latin1)454 Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
455                                          bool latin1) {
456   if (ExtraDebug) {
457     LOG(ERROR) << "CharClassInfo:";
458     for (CCIter i = cc->begin(); i != cc->end(); ++i)
459       LOG(ERROR) << "  " << i->lo << "-" << i->hi;
460   }
461 
462   // If the class is too large, it's okay to overestimate.
463   if (cc->size() > 10)
464     return AnyCharOrAnyByte();
465 
466   Prefilter::Info *a = new Prefilter::Info();
467   for (CCIter i = cc->begin(); i != cc->end(); ++i)
468     for (Rune r = i->lo; r <= i->hi; r++) {
469       if (latin1) {
470         a->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
471       } else {
472         a->exact_.insert(RuneToString(ToLowerRune(r)));
473       }
474     }
475 
476 
477   a->is_exact_ = true;
478 
479   if (ExtraDebug)
480     LOG(ERROR) << " = " << a->ToString();
481 
482   return a;
483 }
484 
485 class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
486  public:
Walker(bool latin1)487   Walker(bool latin1) : latin1_(latin1) {}
488 
489   virtual Info* PostVisit(
490       Regexp* re, Info* parent_arg,
491       Info* pre_arg,
492       Info** child_args, int nchild_args);
493 
494   virtual Info* ShortVisit(
495       Regexp* re,
496       Info* parent_arg);
497 
latin1()498   bool latin1() { return latin1_; }
499  private:
500   bool latin1_;
501 
502   Walker(const Walker&) = delete;
503   Walker& operator=(const Walker&) = delete;
504 };
505 
BuildInfo(Regexp * re)506 Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
507   if (ExtraDebug)
508     LOG(ERROR) << "BuildPrefilter::Info: " << re->ToString();
509 
510   bool latin1 = (re->parse_flags() & Regexp::Latin1) != 0;
511   Prefilter::Info::Walker w(latin1);
512   Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
513 
514   if (w.stopped_early()) {
515     delete info;
516     return NULL;
517   }
518 
519   return info;
520 }
521 
ShortVisit(Regexp * re,Prefilter::Info * parent_arg)522 Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
523     Regexp* re, Prefilter::Info* parent_arg) {
524   return AnyMatch();
525 }
526 
527 // Constructs the Prefilter::Info for the given regular expression.
528 // Assumes re is simplified.
PostVisit(Regexp * re,Prefilter::Info * parent_arg,Prefilter::Info * pre_arg,Prefilter::Info ** child_args,int nchild_args)529 Prefilter::Info* Prefilter::Info::Walker::PostVisit(
530     Regexp* re, Prefilter::Info* parent_arg,
531     Prefilter::Info* pre_arg, Prefilter::Info** child_args,
532     int nchild_args) {
533   Prefilter::Info *info;
534   switch (re->op()) {
535     default:
536     case kRegexpRepeat:
537       LOG(DFATAL) << "Bad regexp op " << re->op();
538       info = EmptyString();
539       break;
540 
541     case kRegexpNoMatch:
542       info = NoMatch();
543       break;
544 
545     // These ops match the empty string:
546     case kRegexpEmptyMatch:      // anywhere
547     case kRegexpBeginLine:       // at beginning of line
548     case kRegexpEndLine:         // at end of line
549     case kRegexpBeginText:       // at beginning of text
550     case kRegexpEndText:         // at end of text
551     case kRegexpWordBoundary:    // at word boundary
552     case kRegexpNoWordBoundary:  // not at word boundary
553       info = EmptyString();
554       break;
555 
556     case kRegexpLiteral:
557       if (latin1()) {
558         info = LiteralLatin1(re->rune());
559       }
560       else {
561         info = Literal(re->rune());
562       }
563       break;
564 
565     case kRegexpLiteralString:
566       if (re->nrunes() == 0) {
567         info = NoMatch();
568         break;
569       }
570       if (latin1()) {
571         info = LiteralLatin1(re->runes()[0]);
572         for (int i = 1; i < re->nrunes(); i++) {
573           info = Concat(info, LiteralLatin1(re->runes()[i]));
574         }
575       } else {
576         info = Literal(re->runes()[0]);
577         for (int i = 1; i < re->nrunes(); i++) {
578           info = Concat(info, Literal(re->runes()[i]));
579         }
580       }
581       break;
582 
583     case kRegexpConcat: {
584       // Accumulate in info.
585       // Exact is concat of recent contiguous exact nodes.
586       info = NULL;
587       Info* exact = NULL;
588       for (int i = 0; i < nchild_args; i++) {
589         Info* ci = child_args[i];  // child info
590         if (!ci->is_exact() ||
591             (exact && ci->exact().size() * exact->exact().size() > 16)) {
592           // Exact run is over.
593           info = And(info, exact);
594           exact = NULL;
595           // Add this child's info.
596           info = And(info, ci);
597         } else {
598           // Append to exact run.
599           exact = Concat(exact, ci);
600         }
601       }
602       info = And(info, exact);
603     }
604       break;
605 
606     case kRegexpAlternate:
607       info = child_args[0];
608       for (int i = 1; i < nchild_args; i++)
609         info = Alt(info, child_args[i]);
610       break;
611 
612     case kRegexpStar:
613       info = Star(child_args[0]);
614       break;
615 
616     case kRegexpQuest:
617       info = Quest(child_args[0]);
618       break;
619 
620     case kRegexpPlus:
621       info = Plus(child_args[0]);
622       break;
623 
624     case kRegexpAnyChar:
625     case kRegexpAnyByte:
626       // Claim nothing, except that it's not empty.
627       info = AnyCharOrAnyByte();
628       break;
629 
630     case kRegexpCharClass:
631       info = CClass(re->cc(), latin1());
632       break;
633 
634     case kRegexpCapture:
635       // These don't affect the set of matching strings.
636       info = child_args[0];
637       break;
638   }
639 
640   if (ExtraDebug)
641     LOG(ERROR) << "BuildInfo " << re->ToString()
642                << ": " << (info ? info->ToString() : "");
643 
644   return info;
645 }
646 
647 
FromRegexp(Regexp * re)648 Prefilter* Prefilter::FromRegexp(Regexp* re) {
649   if (re == NULL)
650     return NULL;
651 
652   Regexp* simple = re->Simplify();
653   Prefilter::Info *info = BuildInfo(simple);
654 
655   simple->Decref();
656   if (info == NULL)
657     return NULL;
658 
659   Prefilter* m = info->TakeMatch();
660 
661   delete info;
662   return m;
663 }
664 
DebugString() const665 string Prefilter::DebugString() const {
666   switch (op_) {
667     default:
668       LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
669       return StringPrintf("op%d", op_);
670     case NONE:
671       return "*no-matches*";
672     case ATOM:
673       return atom_;
674     case ALL:
675       return "";
676     case AND: {
677       string s = "";
678       for (size_t i = 0; i < subs_->size(); i++) {
679         if (i > 0)
680           s += " ";
681         Prefilter* sub = (*subs_)[i];
682         s += sub ? sub->DebugString() : "<nil>";
683       }
684       return s;
685     }
686     case OR: {
687       string s = "(";
688       for (size_t i = 0; i < subs_->size(); i++) {
689         if (i > 0)
690           s += "|";
691         Prefilter* sub = (*subs_)[i];
692         s += sub ? sub->DebugString() : "<nil>";
693       }
694       s += ")";
695       return s;
696     }
697   }
698 }
699 
FromRE2(const RE2 * re2)700 Prefilter* Prefilter::FromRE2(const RE2* re2) {
701   if (re2 == NULL)
702     return NULL;
703 
704   Regexp* regexp = re2->Regexp();
705   if (regexp == NULL)
706     return NULL;
707 
708   return FromRegexp(regexp);
709 }
710 
711 
712 }  // namespace re2
713