• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2009 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #include "util/util.h"
6 #include "re2/prefilter.h"
7 #include "re2/re2.h"
8 #include "re2/unicode_casefold.h"
9 #include "re2/walker-inl.h"
10 
11 namespace re2 {
12 
13 static const int Trace = false;
14 
15 typedef set<string>::iterator SSIter;
16 typedef set<string>::const_iterator ConstSSIter;
17 
18 static int alloc_id = 100000;  // Used for debugging.
19 // Initializes a Prefilter, allocating subs_ as necessary.
Prefilter(Op op)20 Prefilter::Prefilter(Op op) {
21   op_ = op;
22   subs_ = NULL;
23   if (op_ == AND || op_ == OR)
24     subs_ = new vector<Prefilter*>;
25 
26   alloc_id_ = alloc_id++;
27   VLOG(10) << "alloc_id: " << alloc_id_;
28 }
29 
30 // Destroys a Prefilter.
~Prefilter()31 Prefilter::~Prefilter() {
32   VLOG(10) << "Deleted: " << alloc_id_;
33   if (subs_) {
34     for (int i = 0; i < subs_->size(); i++)
35       delete (*subs_)[i];
36     delete subs_;
37     subs_ = NULL;
38   }
39 }
40 
41 // Simplify if the node is an empty Or or And.
Simplify()42 Prefilter* Prefilter::Simplify() {
43   if (op_ != AND && op_ != OR) {
44     return this;
45   }
46 
47   // Nothing left in the AND/OR.
48   if (subs_->size() == 0) {
49     if (op_ == AND)
50       op_ = ALL;  // AND of nothing is true
51     else
52       op_ = NONE;  // OR of nothing is false
53 
54     return this;
55   }
56 
57   // Just one subnode: throw away wrapper.
58   if (subs_->size() == 1) {
59     Prefilter* a = (*subs_)[0];
60     subs_->clear();
61     delete this;
62     return a->Simplify();
63   }
64 
65   return this;
66 }
67 
68 // Combines two Prefilters together to create an "op" (AND or OR).
69 // The passed Prefilters will be part of the returned Prefilter or deleted.
70 // Does lots of work to avoid creating unnecessarily complicated structures.
AndOr(Op op,Prefilter * a,Prefilter * b)71 Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
72   // If a, b can be rewritten as op, do so.
73   a = a->Simplify();
74   b = b->Simplify();
75 
76   // Canonicalize: a->op <= b->op.
77   if (a->op() > b->op()) {
78     Prefilter* t = a;
79     a = b;
80     b = t;
81   }
82 
83   // Trivial cases.
84   //    ALL AND b = b
85   //    NONE OR b = b
86   //    ALL OR b   = ALL
87   //    NONE AND b = NONE
88   // Don't need to look at b, because of canonicalization above.
89   // ALL and NONE are smallest opcodes.
90   if (a->op() == ALL || a->op() == NONE) {
91     if ((a->op() == ALL && op == AND) ||
92         (a->op() == NONE && op == OR)) {
93       delete a;
94       return b;
95     } else {
96       delete b;
97       return a;
98     }
99   }
100 
101   // If a and b match op, merge their contents.
102   if (a->op() == op && b->op() == op) {
103     for (int i = 0; i < b->subs()->size(); i++) {
104       Prefilter* bb = (*b->subs())[i];
105       a->subs()->push_back(bb);
106     }
107     b->subs()->clear();
108     delete b;
109     return a;
110   }
111 
112   // If a already has the same op as the op that is under construction
113   // add in b (similarly if b already has the same op, add in a).
114   if (b->op() == op) {
115     Prefilter* t = a;
116     a = b;
117     b = t;
118   }
119   if (a->op() == op) {
120     a->subs()->push_back(b);
121     return a;
122   }
123 
124   // Otherwise just return the op.
125   Prefilter* c = new Prefilter(op);
126   c->subs()->push_back(a);
127   c->subs()->push_back(b);
128   return c;
129 }
130 
And(Prefilter * a,Prefilter * b)131 Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
132   return AndOr(AND, a, b);
133 }
134 
Or(Prefilter * a,Prefilter * b)135 Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
136   return AndOr(OR, a, b);
137 }
138 
SimplifyStringSet(set<string> * ss)139 static void SimplifyStringSet(set<string> *ss) {
140   // Now make sure that the strings aren't redundant.  For example, if
141   // we know "ab" is a required string, then it doesn't help at all to
142   // know that "abc" is also a required string, so delete "abc". This
143   // is because, when we are performing a string search to filter
144   // regexps, matching ab will already allow this regexp to be a
145   // candidate for match, so further matching abc is redundant.
146 
147   for (SSIter i = ss->begin(); i != ss->end(); ++i) {
148     SSIter j = i;
149     ++j;
150     while (j != ss->end()) {
151       // Increment j early so that we can erase the element it points to.
152       SSIter old_j = j;
153       ++j;
154       if (old_j->find(*i) != string::npos)
155         ss->erase(old_j);
156     }
157   }
158 }
159 
OrStrings(set<string> * ss)160 Prefilter* Prefilter::OrStrings(set<string>* ss) {
161   SimplifyStringSet(ss);
162   Prefilter* or_prefilter = NULL;
163   if (!ss->empty()) {
164     or_prefilter = new Prefilter(NONE);
165     for (SSIter i = ss->begin(); i != ss->end(); ++i)
166       or_prefilter = Or(or_prefilter, FromString(*i));
167   }
168   return or_prefilter;
169 }
170 
ToLowerRune(Rune r)171 static Rune ToLowerRune(Rune r) {
172   if (r < Runeself) {
173     if ('A' <= r && r <= 'Z')
174       r += 'a' - 'A';
175     return r;
176   }
177 
178   CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
179   if (f == NULL || r < f->lo)
180     return r;
181   return ApplyFold(f, r);
182 }
183 
FromString(const string & str)184 Prefilter* Prefilter::FromString(const string& str) {
185   Prefilter* m = new Prefilter(Prefilter::ATOM);
186   m->atom_ = str;
187   return m;
188 }
189 
190 // Information about a regexp used during computation of Prefilter.
191 // Can be thought of as information about the set of strings matching
192 // the given regular expression.
193 class Prefilter::Info {
194  public:
195   Info();
196   ~Info();
197 
198   // More constructors.  They delete their Info* arguments.
199   static Info* Alt(Info* a, Info* b);
200   static Info* Concat(Info* a, Info* b);
201   static Info* And(Info* a, Info* b);
202   static Info* Star(Info* a);
203   static Info* Plus(Info* a);
204   static Info* Quest(Info* a);
205   static Info* EmptyString();
206   static Info* NoMatch();
207   static Info* AnyChar();
208   static Info* CClass(CharClass* cc);
209   static Info* Literal(Rune r);
210   static Info* AnyMatch();
211 
212   // Format Info as a string.
213   string ToString();
214 
215   // Caller takes ownership of the Prefilter.
216   Prefilter* TakeMatch();
217 
exact()218   set<string>& exact() { return exact_; }
219 
is_exact() const220   bool is_exact() const { return is_exact_; }
221 
222   class Walker;
223 
224  private:
225   set<string> exact_;
226 
227   // When is_exact_ is true, the strings that match
228   // are placed in exact_. When it is no longer an exact
229   // set of strings that match this RE, then is_exact_
230   // is false and the match_ contains the required match
231   // criteria.
232   bool is_exact_;
233 
234   // Accumulated Prefilter query that any
235   // match for this regexp is guaranteed to match.
236   Prefilter* match_;
237 };
238 
239 
Info()240 Prefilter::Info::Info()
241   : is_exact_(false),
242     match_(NULL) {
243 }
244 
~Info()245 Prefilter::Info::~Info() {
246   delete match_;
247 }
248 
TakeMatch()249 Prefilter* Prefilter::Info::TakeMatch() {
250   if (is_exact_) {
251     match_ = Prefilter::OrStrings(&exact_);
252     is_exact_ = false;
253   }
254   Prefilter* m = match_;
255   match_ = NULL;
256   return m;
257 }
258 
259 // Format a Info in string form.
ToString()260 string Prefilter::Info::ToString() {
261   if (this == NULL) {
262     // Sometimes when iterating on children of a node,
263     // some children might have NULL Info. Adding
264     // the check here for NULL to take care of cases where
265     // the caller is not checking.
266     return "";
267   }
268 
269   if (is_exact_) {
270     int n = 0;
271     string s;
272     for (set<string>::iterator i = exact_.begin(); i != exact_.end(); ++i) {
273       if (n++ > 0)
274         s += ",";
275       s += *i;
276     }
277     return s;
278   }
279 
280   if (match_)
281     return match_->DebugString();
282 
283   return "";
284 }
285 
286 // Add the strings from src to dst.
CopyIn(const set<string> & src,set<string> * dst)287 static void CopyIn(const set<string>& src, set<string>* dst) {
288   for (ConstSSIter i = src.begin(); i != src.end(); ++i)
289     dst->insert(*i);
290 }
291 
292 // Add the cross-product of a and b to dst.
293 // (For each string i in a and j in b, add i+j.)
CrossProduct(const set<string> & a,const set<string> & b,set<string> * dst)294 static void CrossProduct(const set<string>& a,
295                          const set<string>& b,
296                          set<string>* dst) {
297   for (ConstSSIter i = a.begin(); i != a.end(); ++i)
298     for (ConstSSIter j = b.begin(); j != b.end(); ++j)
299       dst->insert(*i + *j);
300 }
301 
302 // Concats a and b. Requires that both are exact sets.
303 // Forms an exact set that is a crossproduct of a and b.
Concat(Info * a,Info * b)304 Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
305   if (a == NULL)
306     return b;
307   DCHECK(a->is_exact_);
308   DCHECK(b && b->is_exact_);
309   Info *ab = new Info();
310 
311   CrossProduct(a->exact_, b->exact_, &ab->exact_);
312   ab->is_exact_ = true;
313 
314   delete a;
315   delete b;
316   return ab;
317 }
318 
319 // Constructs an inexact Info for ab given a and b.
320 // Used only when a or b is not exact or when the
321 // exact cross product is likely to be too big.
And(Info * a,Info * b)322 Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
323   if (a == NULL)
324     return b;
325   if (b == NULL)
326     return a;
327 
328   Info *ab = new Info();
329 
330   ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
331   ab->is_exact_ = false;
332   delete a;
333   delete b;
334   return ab;
335 }
336 
337 // Constructs Info for a|b given a and b.
Alt(Info * a,Info * b)338 Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
339   Info *ab = new Info();
340 
341   if (a->is_exact_ && b->is_exact_) {
342     CopyIn(a->exact_, &ab->exact_);
343     CopyIn(b->exact_, &ab->exact_);
344     ab->is_exact_ = true;
345   } else {
346     // Either a or b has is_exact_ = false. If the other
347     // one has is_exact_ = true, we move it to match_ and
348     // then create a OR of a,b. The resulting Info has
349     // is_exact_ = false.
350     ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
351     ab->is_exact_ = false;
352   }
353 
354   delete a;
355   delete b;
356   return ab;
357 }
358 
359 // Constructs Info for a? given a.
Quest(Info * a)360 Prefilter::Info* Prefilter::Info::Quest(Info *a) {
361   Info *ab = new Info();
362 
363   ab->is_exact_ = false;
364   ab->match_ = new Prefilter(ALL);
365   delete a;
366   return ab;
367 }
368 
369 // Constructs Info for a* given a.
370 // Same as a? -- not much to do.
Star(Info * a)371 Prefilter::Info* Prefilter::Info::Star(Info *a) {
372   return Quest(a);
373 }
374 
375 // Constructs Info for a+ given a. If a was exact set, it isn't
376 // anymore.
Plus(Info * a)377 Prefilter::Info* Prefilter::Info::Plus(Info *a) {
378   Info *ab = new Info();
379 
380   ab->match_ = a->TakeMatch();
381   ab->is_exact_ = false;
382 
383   delete a;
384   return ab;
385 }
386 
RuneToString(Rune r)387 static string RuneToString(Rune r) {
388   char buf[UTFmax];
389   int n = runetochar(buf, &r);
390   return string(buf, n);
391 }
392 
393 // Constructs Info for literal rune.
Literal(Rune r)394 Prefilter::Info* Prefilter::Info::Literal(Rune r) {
395   Info* info = new Info();
396   info->exact_.insert(RuneToString(ToLowerRune(r)));
397   info->is_exact_ = true;
398   return info;
399 }
400 
401 // Constructs Info for dot (any character).
AnyChar()402 Prefilter::Info* Prefilter::Info::AnyChar() {
403   Prefilter::Info* info = new Prefilter::Info();
404   info->match_ = new Prefilter(ALL);
405   return info;
406 }
407 
408 // Constructs Prefilter::Info for no possible match.
NoMatch()409 Prefilter::Info* Prefilter::Info::NoMatch() {
410   Prefilter::Info* info = new Prefilter::Info();
411   info->match_ = new Prefilter(NONE);
412   return info;
413 }
414 
415 // Constructs Prefilter::Info for any possible match.
416 // This Prefilter::Info is valid for any regular expression,
417 // since it makes no assertions whatsoever about the
418 // strings being matched.
AnyMatch()419 Prefilter::Info* Prefilter::Info::AnyMatch() {
420   Prefilter::Info *info = new Prefilter::Info();
421   info->match_ = new Prefilter(ALL);
422   return info;
423 }
424 
425 // Constructs Prefilter::Info for just the empty string.
EmptyString()426 Prefilter::Info* Prefilter::Info::EmptyString() {
427   Prefilter::Info* info = new Prefilter::Info();
428   info->is_exact_ = true;
429   info->exact_.insert("");
430   return info;
431 }
432 
433 // Constructs Prefilter::Info for a character class.
434 typedef CharClass::iterator CCIter;
CClass(CharClass * cc)435 Prefilter::Info* Prefilter::Info::CClass(CharClass *cc) {
436   if (Trace) {
437     VLOG(0) << "CharClassInfo:";
438     for (CCIter i = cc->begin(); i != cc->end(); ++i)
439       VLOG(0) << "  " << i->lo << "-" << i->hi;
440   }
441 
442   // If the class is too large, it's okay to overestimate.
443   if (cc->size() > 10)
444     return AnyChar();
445 
446   Prefilter::Info *a = new Prefilter::Info();
447   for (CCIter i = cc->begin(); i != cc->end(); ++i)
448     for (Rune r = i->lo; r <= i->hi; r++)
449       a->exact_.insert(RuneToString(ToLowerRune(r)));
450 
451   a->is_exact_ = true;
452 
453   if (Trace) {
454     VLOG(0) << " = " << a->ToString();
455   }
456 
457   return a;
458 }
459 
460 class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
461  public:
Walker()462   Walker() {}
463 
464   virtual Info* PostVisit(
465       Regexp* re, Info* parent_arg,
466       Info* pre_arg,
467       Info** child_args, int nchild_args);
468 
469   virtual Info* ShortVisit(
470       Regexp* re,
471       Info* parent_arg);
472 
473  private:
474   DISALLOW_EVIL_CONSTRUCTORS(Walker);
475 };
476 
BuildInfo(Regexp * re)477 Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
478   if (Trace) {
479     LOG(INFO) << "BuildPrefilter::Info: " << re->ToString();
480   }
481   Prefilter::Info::Walker w;
482   Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
483 
484   if (w.stopped_early()) {
485     delete info;
486     return NULL;
487   }
488 
489   return info;
490 }
491 
ShortVisit(Regexp * re,Prefilter::Info * parent_arg)492 Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
493     Regexp* re, Prefilter::Info* parent_arg) {
494   return AnyMatch();
495 }
496 
497 // Constructs the Prefilter::Info for the given regular expression.
498 // Assumes re is simplified.
PostVisit(Regexp * re,Prefilter::Info * parent_arg,Prefilter::Info * pre_arg,Prefilter::Info ** child_args,int nchild_args)499 Prefilter::Info* Prefilter::Info::Walker::PostVisit(
500     Regexp* re, Prefilter::Info* parent_arg,
501     Prefilter::Info* pre_arg, Prefilter::Info** child_args,
502     int nchild_args) {
503   Prefilter::Info *info;
504   switch (re->op()) {
505     default:
506     case kRegexpRepeat:
507       LOG(DFATAL) << "Bad regexp op " << re->op();
508       info = EmptyString();
509       break;
510 
511     case kRegexpNoMatch:
512       info = NoMatch();
513       break;
514 
515     // These ops match the empty string:
516     case kRegexpEmptyMatch:      // anywhere
517     case kRegexpBeginLine:       // at beginning of line
518     case kRegexpEndLine:         // at end of line
519     case kRegexpBeginText:       // at beginning of text
520     case kRegexpEndText:         // at end of text
521     case kRegexpWordBoundary:    // at word boundary
522     case kRegexpNoWordBoundary:  // not at word boundary
523       info = EmptyString();
524       break;
525 
526     case kRegexpLiteral:
527       info = Literal(re->rune());
528       break;
529 
530     case kRegexpLiteralString:
531       if (re->nrunes() == 0) {
532         info = NoMatch();
533         break;
534       }
535       info = Literal(re->runes()[0]);
536       for (int i = 1; i < re->nrunes(); i++)
537         info = Concat(info, Literal(re->runes()[i]));
538       break;
539 
540     case kRegexpConcat: {
541       // Accumulate in info.
542       // Exact is concat of recent contiguous exact nodes.
543       info = NULL;
544       Info* exact = NULL;
545       for (int i = 0; i < nchild_args; i++) {
546         Info* ci = child_args[i];  // child info
547         if (!ci->is_exact() ||
548             (exact && ci->exact().size() * exact->exact().size() > 16)) {
549           // Exact run is over.
550           info = And(info, exact);
551           exact = NULL;
552           // Add this child's info.
553           info = And(info, ci);
554         } else {
555           // Append to exact run.
556           exact = Concat(exact, ci);
557         }
558       }
559       info = And(info, exact);
560     }
561       break;
562 
563     case kRegexpAlternate:
564       info = child_args[0];
565       for (int i = 1; i < nchild_args; i++)
566         info = Alt(info, child_args[i]);
567       VLOG(10) << "Alt: " << info->ToString();
568       break;
569 
570     case kRegexpStar:
571       info = Star(child_args[0]);
572       break;
573 
574     case kRegexpQuest:
575       info = Quest(child_args[0]);
576       break;
577 
578     case kRegexpPlus:
579       info = Plus(child_args[0]);
580       break;
581 
582     case kRegexpAnyChar:
583       // Claim nothing, except that it's not empty.
584       info = AnyChar();
585       break;
586 
587     case kRegexpCharClass:
588       info = CClass(re->cc());
589       break;
590 
591     case kRegexpCapture:
592       // These don't affect the set of matching strings.
593       info = child_args[0];
594       break;
595   }
596 
597   if (Trace) {
598     VLOG(0) << "BuildInfo " << re->ToString()
599             << ": " << info->ToString();
600   }
601 
602   return info;
603 }
604 
605 
FromRegexp(Regexp * re)606 Prefilter* Prefilter::FromRegexp(Regexp* re) {
607   if (re == NULL)
608     return NULL;
609 
610   Regexp* simple = re->Simplify();
611   Prefilter::Info *info = BuildInfo(simple);
612 
613   simple->Decref();
614   if (info == NULL)
615     return NULL;
616 
617   Prefilter* m = info->TakeMatch();
618 
619   delete info;
620   return m;
621 }
622 
DebugString() const623 string Prefilter::DebugString() const {
624   if (this == NULL)
625     return "<nil>";
626 
627   switch (op_) {
628     default:
629       LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
630       return StringPrintf("op%d", op_);
631     case NONE:
632       return "*no-matches*";
633     case ATOM:
634       return atom_;
635     case ALL:
636       return "";
637     case AND: {
638       string s = "";
639       for (int i = 0; i < subs_->size(); i++) {
640         if (i > 0)
641           s += " ";
642         s += (*subs_)[i]->DebugString();
643       }
644       return s;
645     }
646     case OR: {
647       string s = "(";
648       for (int i = 0; i < subs_->size(); i++) {
649         if (i > 0)
650           s += "|";
651         s += (*subs_)[i]->DebugString();
652       }
653       s += ")";
654       return s;
655     }
656   }
657 }
658 
FromRE2(const RE2 * re2)659 Prefilter* Prefilter::FromRE2(const RE2* re2) {
660   if (re2 == NULL)
661     return NULL;
662 
663   Regexp* regexp = re2->Regexp();
664   if (regexp == NULL)
665     return NULL;
666 
667   return FromRegexp(regexp);
668 }
669 
670 
671 }  // namespace re2
672