• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2007 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Compile regular expression to Prog.
6 //
7 // Prog and Inst are defined in prog.h.
8 // This file's external interface is just Regexp::CompileToProg.
9 // The Compiler class defined in this file is private.
10 
11 #include "re2/prog.h"
12 #include "re2/re2.h"
13 #include "re2/regexp.h"
14 #include "re2/walker-inl.h"
15 
16 namespace re2 {
17 
18 // List of pointers to Inst* that need to be filled in (patched).
19 // Because the Inst* haven't been filled in yet,
20 // we can use the Inst* word to hold the list's "next" pointer.
21 // It's kind of sleazy, but it works well in practice.
22 // See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
23 //
24 // Because the out and out1 fields in Inst are no longer pointers,
25 // we can't use pointers directly here either.  Instead, p refers
26 // to inst_[p>>1].out (p&1 == 0) or inst_[p>>1].out1 (p&1 == 1).
27 // p == 0 represents the NULL list.  This is okay because instruction #0
28 // is always the fail instruction, which never appears on a list.
29 
30 struct PatchList {
31   uint32 p;
32 
33   // Returns patch list containing just p.
34   static PatchList Mk(uint32 p);
35 
36   // Patches all the entries on l to have value v.
37   // Caller must not ever use patch list again.
38   static void Patch(Prog::Inst *inst0, PatchList l, uint32 v);
39 
40   // Deref returns the next pointer pointed at by p.
41   static PatchList Deref(Prog::Inst *inst0, PatchList l);
42 
43   // Appends two patch lists and returns result.
44   static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2);
45 };
46 
47 static PatchList nullPatchList = { 0 };
48 
49 // Returns patch list containing just p.
Mk(uint32 p)50 PatchList PatchList::Mk(uint32 p) {
51   PatchList l;
52   l.p = p;
53   return l;
54 }
55 
56 // Returns the next pointer pointed at by l.
Deref(Prog::Inst * inst0,PatchList l)57 PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) {
58   Prog::Inst* ip = &inst0[l.p>>1];
59   if (l.p&1)
60     l.p = ip->out1();
61   else
62     l.p = ip->out();
63   return l;
64 }
65 
66 // Patches all the entries on l to have value v.
Patch(Prog::Inst * inst0,PatchList l,uint32 val)67 void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32 val) {
68   while (l.p != 0) {
69     Prog::Inst* ip = &inst0[l.p>>1];
70     if (l.p&1) {
71       l.p = ip->out1();
72       ip->out1_ = val;
73     } else {
74       l.p = ip->out();
75       ip->set_out(val);
76     }
77   }
78 }
79 
80 // Appends two patch lists and returns result.
Append(Prog::Inst * inst0,PatchList l1,PatchList l2)81 PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) {
82   if (l1.p == 0)
83     return l2;
84   if (l2.p == 0)
85     return l1;
86 
87   PatchList l = l1;
88   for (;;) {
89     PatchList next = PatchList::Deref(inst0, l);
90     if (next.p == 0)
91       break;
92     l = next;
93   }
94 
95   Prog::Inst* ip = &inst0[l.p>>1];
96   if (l.p&1)
97     ip->out1_ = l2.p;
98   else
99     ip->set_out(l2.p);
100 
101   return l1;
102 }
103 
104 // Compiled program fragment.
105 struct Frag {
106   uint32 begin;
107   PatchList end;
108 
Fragre2::Frag109   Frag() : begin(0) { end.p = 0; }  // needed so Frag can go in vector
Fragre2::Frag110   Frag(uint32 begin, PatchList end) : begin(begin), end(end) {}
111 };
112 
113 static Frag kNullFrag;
114 
115 // Input encodings.
116 enum Encoding {
117   kEncodingUTF8 = 1,  // UTF-8 (0-10FFFF)
118   kEncodingLatin1,    // Latin1 (0-FF)
119 };
120 
121 class Compiler : public Regexp::Walker<Frag> {
122  public:
123   explicit Compiler();
124   ~Compiler();
125 
126   // Compiles Regexp to a new Prog.
127   // Caller is responsible for deleting Prog when finished with it.
128   // If reversed is true, compiles for walking over the input
129   // string backward (reverses all concatenations).
130   static Prog *Compile(Regexp* re, bool reversed, int64 max_mem);
131 
132   // Compiles alternation of all the re to a new Prog.
133   // Each re has a match with an id equal to its index in the vector.
134   static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
135                           Regexp* re);
136 
137   // Interface for Regexp::Walker, which helps traverse the Regexp.
138   // The walk is purely post-recursive: given the machines for the
139   // children, PostVisit combines them to create the machine for
140   // the current node.  The child_args are Frags.
141   // The Compiler traverses the Regexp parse tree, visiting
142   // each node in depth-first order.  It invokes PreVisit before
143   // visiting the node's children and PostVisit after visiting
144   // the children.
145   Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop);
146   Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args,
147                  int nchild_args);
148   Frag ShortVisit(Regexp* re, Frag parent_arg);
149   Frag Copy(Frag arg);
150 
151   // Given fragment a, returns a+ or a+?; a* or a*?; a? or a??
152   Frag Plus(Frag a, bool nongreedy);
153   Frag Star(Frag a, bool nongreedy);
154   Frag Quest(Frag a, bool nongreedy);
155 
156   // Given fragment a, returns (a) capturing as \n.
157   Frag Capture(Frag a, int n);
158 
159   // Given fragments a and b, returns ab; a|b
160   Frag Cat(Frag a, Frag b);
161   Frag Alt(Frag a, Frag b);
162 
163   // Returns a fragment that can't match anything.
164   Frag NoMatch();
165 
166   // Returns a fragment that matches the empty string.
167   Frag Match(int32 id);
168 
169   // Returns a no-op fragment.
170   Frag Nop();
171 
172   // Returns a fragment matching the byte range lo-hi.
173   Frag ByteRange(int lo, int hi, bool foldcase);
174 
175   // Returns a fragment matching an empty-width special op.
176   Frag EmptyWidth(EmptyOp op);
177 
178   // Adds n instructions to the program.
179   // Returns the index of the first one.
180   // Returns -1 if no more instructions are available.
181   int AllocInst(int n);
182 
183   // Deletes unused instructions.
184   void Trim();
185 
186   // Rune range compiler.
187 
188   // Begins a new alternation.
189   void BeginRange();
190 
191   // Adds a fragment matching the rune range lo-hi.
192   void AddRuneRange(Rune lo, Rune hi, bool foldcase);
193   void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
194   void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
195   void Add_80_10ffff();
196 
197   // New suffix that matches the byte range lo-hi, then goes to next.
198   int RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next);
199   int UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next);
200 
201   // Adds a suffix to alternation.
202   void AddSuffix(int id);
203 
204   // Returns the alternation of all the added suffixes.
205   Frag EndRange();
206 
207   // Single rune.
208   Frag Literal(Rune r, bool foldcase);
209 
210   void Setup(Regexp::ParseFlags, int64, RE2::Anchor);
211   Prog* Finish();
212 
213   // Returns .* where dot = any byte
214   Frag DotStar();
215 
216  private:
217   Prog* prog_;         // Program being built.
218   bool failed_;        // Did we give up compiling?
219   Encoding encoding_;  // Input encoding
220   bool reversed_;      // Should program run backward over text?
221 
222   int max_inst_;       // Maximum number of instructions.
223 
224   Prog::Inst* inst_;   // Pointer to first instruction.
225   int inst_len_;       // Number of instructions used.
226   int inst_cap_;       // Number of instructions allocated.
227 
228   int64 max_mem_;      // Total memory budget.
229 
230   map<uint64, int> rune_cache_;
231   Frag rune_range_;
232 
233   RE2::Anchor anchor_;  // anchor mode for RE2::Set
234 
235   DISALLOW_EVIL_CONSTRUCTORS(Compiler);
236 };
237 
Compiler()238 Compiler::Compiler() {
239   prog_ = new Prog();
240   failed_ = false;
241   encoding_ = kEncodingUTF8;
242   reversed_ = false;
243   inst_ = NULL;
244   inst_len_ = 0;
245   inst_cap_ = 0;
246   max_inst_ = 1;  // make AllocInst for fail instruction okay
247   max_mem_ = 0;
248   int fail = AllocInst(1);
249   inst_[fail].InitFail();
250   max_inst_ = 0;  // Caller must change
251 }
252 
~Compiler()253 Compiler::~Compiler() {
254   delete prog_;
255   delete[] inst_;
256 }
257 
AllocInst(int n)258 int Compiler::AllocInst(int n) {
259   if (failed_ || inst_len_ + n > max_inst_) {
260     failed_ = true;
261     return -1;
262   }
263 
264   if (inst_len_ + n > inst_cap_) {
265     if (inst_cap_ == 0)
266       inst_cap_ = 8;
267     while (inst_len_ + n > inst_cap_)
268       inst_cap_ *= 2;
269     Prog::Inst* ip = new Prog::Inst[inst_cap_];
270     memmove(ip, inst_, inst_len_ * sizeof ip[0]);
271     memset(ip + inst_len_, 0, (inst_cap_ - inst_len_) * sizeof ip[0]);
272     delete[] inst_;
273     inst_ = ip;
274   }
275   int id = inst_len_;
276   inst_len_ += n;
277   return id;
278 }
279 
Trim()280 void Compiler::Trim() {
281   if (inst_len_ < inst_cap_) {
282     Prog::Inst* ip = new Prog::Inst[inst_len_];
283     memmove(ip, inst_, inst_len_ * sizeof ip[0]);
284     delete[] inst_;
285     inst_ = ip;
286     inst_cap_ = inst_len_;
287   }
288 }
289 
290 // These routines are somewhat hard to visualize in text --
291 // see http://swtch.com/~rsc/regexp/regexp1.html for
292 // pictures explaining what is going on here.
293 
294 // Returns an unmatchable fragment.
NoMatch()295 Frag Compiler::NoMatch() {
296   return Frag(0, nullPatchList);
297 }
298 
299 // Is a an unmatchable fragment?
IsNoMatch(Frag a)300 static bool IsNoMatch(Frag a) {
301   return a.begin == 0;
302 }
303 
304 // Given fragments a and b, returns fragment for ab.
Cat(Frag a,Frag b)305 Frag Compiler::Cat(Frag a, Frag b) {
306   if (IsNoMatch(a) || IsNoMatch(b))
307     return NoMatch();
308 
309   // Elide no-op.
310   Prog::Inst* begin = &inst_[a.begin];
311   if (begin->opcode() == kInstNop &&
312       a.end.p == (a.begin << 1) &&
313       begin->out() == 0) {
314     PatchList::Patch(inst_, a.end, b.begin);  // in case refs to a somewhere
315     return b;
316   }
317 
318   // To run backward over string, reverse all concatenations.
319   if (reversed_) {
320     PatchList::Patch(inst_, b.end, a.begin);
321     return Frag(b.begin, a.end);
322   }
323 
324   PatchList::Patch(inst_, a.end, b.begin);
325   return Frag(a.begin, b.end);
326 }
327 
328 // Given fragments for a and b, returns fragment for a|b.
Alt(Frag a,Frag b)329 Frag Compiler::Alt(Frag a, Frag b) {
330   // Special case for convenience in loops.
331   if (IsNoMatch(a))
332     return b;
333   if (IsNoMatch(b))
334     return a;
335 
336   int id = AllocInst(1);
337   if (id < 0)
338     return NoMatch();
339 
340   inst_[id].InitAlt(a.begin, b.begin);
341   return Frag(id, PatchList::Append(inst_, a.end, b.end));
342 }
343 
344 // When capturing submatches in like-Perl mode, a kOpAlt Inst
345 // treats out_ as the first choice, out1_ as the second.
346 //
347 // For *, +, and ?, if out_ causes another repetition,
348 // then the operator is greedy.  If out1_ is the repetition
349 // (and out_ moves forward), then the operator is non-greedy.
350 
351 // Given a fragment a, returns a fragment for a* or a*? (if nongreedy)
Star(Frag a,bool nongreedy)352 Frag Compiler::Star(Frag a, bool nongreedy) {
353   int id = AllocInst(1);
354   if (id < 0)
355     return NoMatch();
356   inst_[id].InitAlt(0, 0);
357   PatchList::Patch(inst_, a.end, id);
358   if (nongreedy) {
359     inst_[id].out1_ = a.begin;
360     return Frag(id, PatchList::Mk(id << 1));
361   } else {
362     inst_[id].set_out(a.begin);
363     return Frag(id, PatchList::Mk((id << 1) | 1));
364   }
365 }
366 
367 // Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
Plus(Frag a,bool nongreedy)368 Frag Compiler::Plus(Frag a, bool nongreedy) {
369   // a+ is just a* with a different entry point.
370   Frag f = Star(a, nongreedy);
371   return Frag(a.begin, f.end);
372 }
373 
374 // Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
Quest(Frag a,bool nongreedy)375 Frag Compiler::Quest(Frag a, bool nongreedy) {
376   int id = AllocInst(1);
377   if (id < 0)
378     return NoMatch();
379   PatchList pl;
380   if (nongreedy) {
381     inst_[id].InitAlt(0, a.begin);
382     pl = PatchList::Mk(id << 1);
383   } else {
384     inst_[id].InitAlt(a.begin, 0);
385     pl = PatchList::Mk((id << 1) | 1);
386   }
387   return Frag(id, PatchList::Append(inst_, pl, a.end));
388 }
389 
390 // Returns a fragment for the byte range lo-hi.
ByteRange(int lo,int hi,bool foldcase)391 Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
392   int id = AllocInst(1);
393   if (id < 0)
394     return NoMatch();
395   inst_[id].InitByteRange(lo, hi, foldcase, 0);
396   prog_->byte_inst_count_++;
397   prog_->MarkByteRange(lo, hi);
398   if (foldcase && lo <= 'z' && hi >= 'a') {
399     if (lo < 'a')
400       lo = 'a';
401     if (hi > 'z')
402       hi = 'z';
403     if (lo <= hi)
404       prog_->MarkByteRange(lo + 'A' - 'a', hi + 'A' - 'a');
405   }
406   return Frag(id, PatchList::Mk(id << 1));
407 }
408 
409 // Returns a no-op fragment.  Sometimes unavoidable.
Nop()410 Frag Compiler::Nop() {
411   int id = AllocInst(1);
412   if (id < 0)
413     return NoMatch();
414   inst_[id].InitNop(0);
415   return Frag(id, PatchList::Mk(id << 1));
416 }
417 
418 // Returns a fragment that signals a match.
Match(int32 match_id)419 Frag Compiler::Match(int32 match_id) {
420   int id = AllocInst(1);
421   if (id < 0)
422     return NoMatch();
423   inst_[id].InitMatch(match_id);
424   return Frag(id, nullPatchList);
425 }
426 
427 // Returns a fragment matching a particular empty-width op (like ^ or $)
EmptyWidth(EmptyOp empty)428 Frag Compiler::EmptyWidth(EmptyOp empty) {
429   int id = AllocInst(1);
430   if (id < 0)
431     return NoMatch();
432   inst_[id].InitEmptyWidth(empty, 0);
433   if (empty & (kEmptyBeginLine|kEmptyEndLine))
434     prog_->MarkByteRange('\n', '\n');
435   if (empty & (kEmptyWordBoundary|kEmptyNonWordBoundary)) {
436     int j;
437     for (int i = 0; i < 256; i = j) {
438       for (j = i+1; j < 256 && Prog::IsWordChar(i) == Prog::IsWordChar(j); j++)
439         ;
440       prog_->MarkByteRange(i, j-1);
441     }
442   }
443   return Frag(id, PatchList::Mk(id << 1));
444 }
445 
446 // Given a fragment a, returns a fragment with capturing parens around a.
Capture(Frag a,int n)447 Frag Compiler::Capture(Frag a, int n) {
448   int id = AllocInst(2);
449   if (id < 0)
450     return NoMatch();
451   inst_[id].InitCapture(2*n, a.begin);
452   inst_[id+1].InitCapture(2*n+1, 0);
453   PatchList::Patch(inst_, a.end, id+1);
454 
455   return Frag(id, PatchList::Mk((id+1) << 1));
456 }
457 
458 // A Rune is a name for a Unicode code point.
459 // Returns maximum rune encoded by UTF-8 sequence of length len.
MaxRune(int len)460 static int MaxRune(int len) {
461   int b;  // number of Rune blents lenn len-byte UTF-8 sequence (len < UTFmax)
462   if (len == 1)
463     b = 7;
464   else
465     b = 8-(len+1) + 6*(len-1);
466   return (1<<b) - 1;   // maximum Rune for b bits.
467 }
468 
469 // The rune range compiler caches common suffix fragments,
470 // which are very common in UTF-8 (e.g., [80-bf]).
471 // The fragment suffixes are identified by their start
472 // instructions.  NULL denotes the eventual end match.
473 // The Frag accumulates in rune_range_.  Caching common
474 // suffixes reduces the UTF-8 "." from 32 to 24 instructions,
475 // and it reduces the corresponding one-pass NFA from 16 nodes to 8.
476 
BeginRange()477 void Compiler::BeginRange() {
478   rune_cache_.clear();
479   rune_range_.begin = 0;
480   rune_range_.end = nullPatchList;
481 }
482 
UncachedRuneByteSuffix(uint8 lo,uint8 hi,bool foldcase,int next)483 int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase,
484                                      int next) {
485   Frag f = ByteRange(lo, hi, foldcase);
486   if (next != 0) {
487     PatchList::Patch(inst_, f.end, next);
488   } else {
489     rune_range_.end = PatchList::Append(inst_, rune_range_.end, f.end);
490   }
491   return f.begin;
492 }
493 
RuneByteSuffix(uint8 lo,uint8 hi,bool foldcase,int next)494 int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) {
495   // In Latin1 mode, there's no point in caching.
496   // In forward UTF-8 mode, only need to cache continuation bytes.
497   if (encoding_ == kEncodingLatin1 ||
498       (encoding_ == kEncodingUTF8 &&
499        !reversed_ &&
500        !(0x80 <= lo && hi <= 0xbf))) {
501     return UncachedRuneByteSuffix(lo, hi, foldcase, next);
502   }
503 
504   uint64 key = ((uint64)next << 17) | (lo<<9) | (hi<<1) | foldcase;
505   map<uint64, int>::iterator it = rune_cache_.find(key);
506   if (it != rune_cache_.end())
507     return it->second;
508   int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
509   rune_cache_[key] = id;
510   return id;
511 }
512 
AddSuffix(int id)513 void Compiler::AddSuffix(int id) {
514   if (rune_range_.begin == 0) {
515     rune_range_.begin = id;
516     return;
517   }
518 
519   int alt = AllocInst(1);
520   if (alt < 0) {
521     rune_range_.begin = 0;
522     return;
523   }
524   inst_[alt].InitAlt(rune_range_.begin, id);
525   rune_range_.begin = alt;
526 }
527 
EndRange()528 Frag Compiler::EndRange() {
529   return rune_range_;
530 }
531 
532 // Converts rune range lo-hi into a fragment that recognizes
533 // the bytes that would make up those runes in the current
534 // encoding (Latin 1 or UTF-8).
535 // This lets the machine work byte-by-byte even when
536 // using multibyte encodings.
537 
AddRuneRange(Rune lo,Rune hi,bool foldcase)538 void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) {
539   switch (encoding_) {
540     default:
541     case kEncodingUTF8:
542       AddRuneRangeUTF8(lo, hi, foldcase);
543       break;
544     case kEncodingLatin1:
545       AddRuneRangeLatin1(lo, hi, foldcase);
546       break;
547   }
548 }
549 
AddRuneRangeLatin1(Rune lo,Rune hi,bool foldcase)550 void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
551   // Latin1 is easy: runes *are* bytes.
552   if (lo > hi || lo > 0xFF)
553     return;
554   if (hi > 0xFF)
555     hi = 0xFF;
556   AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0));
557 }
558 
559 // Table describing how to make a UTF-8 matching machine
560 // for the rune range 80-10FFFF (Runeself-Runemax).
561 // This range happens frequently enough (for example /./ and /[^a-z]/)
562 // and the rune_cache_ map is slow enough that this is worth
563 // special handling.  Makes compilation of a small expression
564 // with a dot in it about 10% faster.
565 // The * in the comments below mark whole sequences.
566 static struct ByteRangeProg {
567   int next;
568   int lo;
569   int hi;
570 } prog_80_10ffff[] = {
571   // Two-byte
572   { -1, 0x80, 0xBF, },  // 0:  80-BF
573   {  0, 0xC2, 0xDF, },  // 1:  C2-DF 80-BF*
574 
575   // Three-byte
576   {  0, 0xA0, 0xBF, },  // 2:  A0-BF 80-BF
577   {  2, 0xE0, 0xE0, },  // 3:  E0 A0-BF 80-BF*
578   {  0, 0x80, 0xBF, },  // 4:  80-BF 80-BF
579   {  4, 0xE1, 0xEF, },  // 5:  E1-EF 80-BF 80-BF*
580 
581   // Four-byte
582   {  4, 0x90, 0xBF, },  // 6:  90-BF 80-BF 80-BF
583   {  6, 0xF0, 0xF0, },  // 7:  F0 90-BF 80-BF 80-BF*
584   {  4, 0x80, 0xBF, },  // 8:  80-BF 80-BF 80-BF
585   {  8, 0xF1, 0xF3, },  // 9: F1-F3 80-BF 80-BF 80-BF*
586   {  4, 0x80, 0x8F, },  // 10: 80-8F 80-BF 80-BF
587   { 10, 0xF4, 0xF4, },  // 11: F4 80-8F 80-BF 80-BF*
588 };
589 
Add_80_10ffff()590 void Compiler::Add_80_10ffff() {
591   int inst[arraysize(prog_80_10ffff)];
592   for (int i = 0; i < arraysize(prog_80_10ffff); i++) {
593     const ByteRangeProg& p = prog_80_10ffff[i];
594     int next = 0;
595     if (p.next >= 0)
596       next = inst[p.next];
597     inst[i] = UncachedRuneByteSuffix(p.lo, p.hi, false, next);
598     if ((p.lo & 0xC0) != 0x80)
599       AddSuffix(inst[i]);
600   }
601 }
602 
AddRuneRangeUTF8(Rune lo,Rune hi,bool foldcase)603 void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
604   if (lo > hi)
605     return;
606 
607   // Pick off 80-10FFFF as a common special case
608   // that can bypass the slow rune_cache_.
609   if (lo == 0x80 && hi == 0x10ffff && !reversed_) {
610     Add_80_10ffff();
611     return;
612   }
613 
614   // Split range into same-length sized ranges.
615   for (int i = 1; i < UTFmax; i++) {
616     Rune max = MaxRune(i);
617     if (lo <= max && max < hi) {
618       AddRuneRangeUTF8(lo, max, foldcase);
619       AddRuneRangeUTF8(max+1, hi, foldcase);
620       return;
621     }
622   }
623 
624   // ASCII range is always a special case.
625   if (hi < Runeself) {
626     AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0));
627     return;
628   }
629 
630   // Split range into sections that agree on leading bytes.
631   for (int i = 1; i < UTFmax; i++) {
632     uint m = (1<<(6*i)) - 1;  // last i bytes of a UTF-8 sequence
633     if ((lo & ~m) != (hi & ~m)) {
634       if ((lo & m) != 0) {
635         AddRuneRangeUTF8(lo, lo|m, foldcase);
636         AddRuneRangeUTF8((lo|m)+1, hi, foldcase);
637         return;
638       }
639       if ((hi & m) != m) {
640         AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase);
641         AddRuneRangeUTF8(hi&~m, hi, foldcase);
642         return;
643       }
644     }
645   }
646 
647   // Finally.  Generate byte matching equivalent for lo-hi.
648   uint8 ulo[UTFmax], uhi[UTFmax];
649   int n = runetochar(reinterpret_cast<char*>(ulo), &lo);
650   int m = runetochar(reinterpret_cast<char*>(uhi), &hi);
651   (void)m;  // USED(m)
652   DCHECK_EQ(n, m);
653 
654   int id = 0;
655   if (reversed_) {
656     for (int i = 0; i < n; i++)
657       id = RuneByteSuffix(ulo[i], uhi[i], false, id);
658   } else {
659     for (int i = n-1; i >= 0; i--)
660       id = RuneByteSuffix(ulo[i], uhi[i], false, id);
661   }
662   AddSuffix(id);
663 }
664 
665 // Should not be called.
Copy(Frag arg)666 Frag Compiler::Copy(Frag arg) {
667   // We're using WalkExponential; there should be no copying.
668   LOG(DFATAL) << "Compiler::Copy called!";
669   failed_ = true;
670   return NoMatch();
671 }
672 
673 // Visits a node quickly; called once WalkExponential has
674 // decided to cut this walk short.
ShortVisit(Regexp * re,Frag)675 Frag Compiler::ShortVisit(Regexp* re, Frag) {
676   failed_ = true;
677   return NoMatch();
678 }
679 
680 // Called before traversing a node's children during the walk.
PreVisit(Regexp * re,Frag,bool * stop)681 Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) {
682   // Cut off walk if we've already failed.
683   if (failed_)
684     *stop = true;
685 
686   return kNullFrag;  // not used by caller
687 }
688 
Literal(Rune r,bool foldcase)689 Frag Compiler::Literal(Rune r, bool foldcase) {
690   switch (encoding_) {
691     default:
692       return kNullFrag;
693 
694     case kEncodingLatin1:
695       return ByteRange(r, r, foldcase);
696 
697     case kEncodingUTF8: {
698       if (r < Runeself)  // Make common case fast.
699         return ByteRange(r, r, foldcase);
700       uint8 buf[UTFmax];
701       int n = runetochar(reinterpret_cast<char*>(buf), &r);
702       Frag f = ByteRange((uint8)buf[0], buf[0], false);
703       for (int i = 1; i < n; i++)
704         f = Cat(f, ByteRange((uint8)buf[i], buf[i], false));
705       return f;
706     }
707   }
708 }
709 
710 // Called after traversing the node's children during the walk.
711 // Given their frags, build and return the frag for this re.
PostVisit(Regexp * re,Frag,Frag,Frag * child_frags,int nchild_frags)712 Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
713                          int nchild_frags) {
714   // If a child failed, don't bother going forward, especially
715   // since the child_frags might contain Frags with NULLs in them.
716   if (failed_)
717     return NoMatch();
718 
719   // Given the child fragments, return the fragment for this node.
720   switch (re->op()) {
721     case kRegexpRepeat:
722       // Should not see; code at bottom of function will print error
723       break;
724 
725     case kRegexpNoMatch:
726       return NoMatch();
727 
728     case kRegexpEmptyMatch:
729       return Nop();
730 
731     case kRegexpHaveMatch: {
732       Frag f = Match(re->match_id());
733       // Remember unanchored match to end of string.
734       if (anchor_ != RE2::ANCHOR_BOTH)
735         f = Cat(DotStar(), f);
736       return f;
737     }
738 
739     case kRegexpConcat: {
740       Frag f = child_frags[0];
741       for (int i = 1; i < nchild_frags; i++)
742         f = Cat(f, child_frags[i]);
743       return f;
744     }
745 
746     case kRegexpAlternate: {
747       Frag f = child_frags[0];
748       for (int i = 1; i < nchild_frags; i++)
749         f = Alt(f, child_frags[i]);
750       return f;
751     }
752 
753     case kRegexpStar:
754       return Star(child_frags[0], re->parse_flags()&Regexp::NonGreedy);
755 
756     case kRegexpPlus:
757       return Plus(child_frags[0], re->parse_flags()&Regexp::NonGreedy);
758 
759     case kRegexpQuest:
760       return Quest(child_frags[0], re->parse_flags()&Regexp::NonGreedy);
761 
762     case kRegexpLiteral:
763       return Literal(re->rune(), re->parse_flags()&Regexp::FoldCase);
764 
765     case kRegexpLiteralString: {
766       // Concatenation of literals.
767       if (re->nrunes() == 0)
768         return Nop();
769       Frag f;
770       for (int i = 0; i < re->nrunes(); i++) {
771         Frag f1 = Literal(re->runes()[i], re->parse_flags()&Regexp::FoldCase);
772         if (i == 0)
773           f = f1;
774         else
775           f = Cat(f, f1);
776       }
777       return f;
778     }
779 
780     case kRegexpAnyChar:
781       BeginRange();
782       AddRuneRange(0, Runemax, false);
783       return EndRange();
784 
785     case kRegexpAnyByte:
786       return ByteRange(0x00, 0xFF, false);
787 
788     case kRegexpCharClass: {
789       CharClass* cc = re->cc();
790       if (cc->empty()) {
791         // This can't happen.
792         LOG(DFATAL) << "No ranges in char class";
793         failed_ = true;
794         return NoMatch();
795       }
796 
797       // ASCII case-folding optimization: if the char class
798       // behaves the same on A-Z as it does on a-z,
799       // discard any ranges wholly contained in A-Z
800       // and mark the other ranges as foldascii.
801       // This reduces the size of a program for
802       // (?i)abc from 3 insts per letter to 1 per letter.
803       bool foldascii = cc->FoldsASCII();
804 
805       // Character class is just a big OR of the different
806       // character ranges in the class.
807       BeginRange();
808       for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) {
809         // ASCII case-folding optimization (see above).
810         if (foldascii && 'A' <= i->lo && i->hi <= 'Z')
811           continue;
812 
813         // If this range contains all of A-Za-z or none of it,
814         // the fold flag is unnecessary; don't bother.
815         bool fold = foldascii;
816         if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo)
817           fold = false;
818 
819         AddRuneRange(i->lo, i->hi, fold);
820       }
821       return EndRange();
822     }
823 
824     case kRegexpCapture:
825       // If this is a non-capturing parenthesis -- (?:foo) --
826       // just use the inner expression.
827       if (re->cap() < 0)
828         return child_frags[0];
829       return Capture(child_frags[0], re->cap());
830 
831     case kRegexpBeginLine:
832       return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine);
833 
834     case kRegexpEndLine:
835       return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine);
836 
837     case kRegexpBeginText:
838       return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText);
839 
840     case kRegexpEndText:
841       return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText);
842 
843     case kRegexpWordBoundary:
844       return EmptyWidth(kEmptyWordBoundary);
845 
846     case kRegexpNoWordBoundary:
847       return EmptyWidth(kEmptyNonWordBoundary);
848   }
849   LOG(DFATAL) << "Missing case in Compiler: " << re->op();
850   failed_ = true;
851   return NoMatch();
852 }
853 
854 // Is this regexp required to start at the beginning of the text?
855 // Only approximate; can return false for complicated regexps like (\Aa|\Ab),
856 // but handles (\A(a|b)).  Could use the Walker to write a more exact one.
IsAnchorStart(Regexp ** pre,int depth)857 static bool IsAnchorStart(Regexp** pre, int depth) {
858   Regexp* re = *pre;
859   Regexp* sub;
860   // The depth limit makes sure that we don't overflow
861   // the stack on a deeply nested regexp.  As the comment
862   // above says, IsAnchorStart is conservative, so returning
863   // a false negative is okay.  The exact limit is somewhat arbitrary.
864   if (re == NULL || depth >= 4)
865     return false;
866   switch (re->op()) {
867     default:
868       break;
869     case kRegexpConcat:
870       if (re->nsub() > 0) {
871         sub = re->sub()[0]->Incref();
872         if (IsAnchorStart(&sub, depth+1)) {
873           Regexp** subcopy = new Regexp*[re->nsub()];
874           subcopy[0] = sub;  // already have reference
875           for (int i = 1; i < re->nsub(); i++)
876             subcopy[i] = re->sub()[i]->Incref();
877           *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags());
878           delete[] subcopy;
879           re->Decref();
880           return true;
881         }
882         sub->Decref();
883       }
884       break;
885     case kRegexpCapture:
886       sub = re->sub()[0]->Incref();
887       if (IsAnchorStart(&sub, depth+1)) {
888         *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
889         re->Decref();
890         return true;
891       }
892       sub->Decref();
893       break;
894     case kRegexpBeginText:
895       *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
896       re->Decref();
897       return true;
898   }
899   return false;
900 }
901 
902 // Is this regexp required to start at the end of the text?
903 // Only approximate; can return false for complicated regexps like (a\z|b\z),
904 // but handles ((a|b)\z).  Could use the Walker to write a more exact one.
IsAnchorEnd(Regexp ** pre,int depth)905 static bool IsAnchorEnd(Regexp** pre, int depth) {
906   Regexp* re = *pre;
907   Regexp* sub;
908   // The depth limit makes sure that we don't overflow
909   // the stack on a deeply nested regexp.  As the comment
910   // above says, IsAnchorEnd is conservative, so returning
911   // a false negative is okay.  The exact limit is somewhat arbitrary.
912   if (re == NULL || depth >= 4)
913     return false;
914   switch (re->op()) {
915     default:
916       break;
917     case kRegexpConcat:
918       if (re->nsub() > 0) {
919         sub = re->sub()[re->nsub() - 1]->Incref();
920         if (IsAnchorEnd(&sub, depth+1)) {
921           Regexp** subcopy = new Regexp*[re->nsub()];
922           subcopy[re->nsub() - 1] = sub;  // already have reference
923           for (int i = 0; i < re->nsub() - 1; i++)
924             subcopy[i] = re->sub()[i]->Incref();
925           *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags());
926           delete[] subcopy;
927           re->Decref();
928           return true;
929         }
930         sub->Decref();
931       }
932       break;
933     case kRegexpCapture:
934       sub = re->sub()[0]->Incref();
935       if (IsAnchorEnd(&sub, depth+1)) {
936         *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
937         re->Decref();
938         return true;
939       }
940       sub->Decref();
941       break;
942     case kRegexpEndText:
943       *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
944       re->Decref();
945       return true;
946   }
947   return false;
948 }
949 
Setup(Regexp::ParseFlags flags,int64 max_mem,RE2::Anchor anchor)950 void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem,
951                      RE2::Anchor anchor) {
952   prog_->set_flags(flags);
953 
954   if (flags & Regexp::Latin1)
955     encoding_ = kEncodingLatin1;
956   max_mem_ = max_mem;
957   if (max_mem <= 0) {
958     max_inst_ = 100000;  // more than enough
959   } else if (max_mem <= sizeof(Prog)) {
960     // No room for anything.
961     max_inst_ = 0;
962   } else {
963     int64 m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);
964     // Limit instruction count so that inst->id() fits nicely in an int.
965     // SparseArray also assumes that the indices (inst->id()) are ints.
966     // The call to WalkExponential uses 2*max_inst_ below,
967     // and other places in the code use 2 or 3 * prog->size().
968     // Limiting to 2^24 should avoid overflow in those places.
969     // (The point of allowing more than 32 bits of memory is to
970     // have plenty of room for the DFA states, not to use it up
971     // on the program.)
972     if (m >= 1<<24)
973       m = 1<<24;
974 
975     // Inst imposes its own limit (currently bigger than 2^24 but be safe).
976     if (m > Prog::Inst::kMaxInst)
977       m = Prog::Inst::kMaxInst;
978 
979     max_inst_ = m;
980   }
981 
982   anchor_ = anchor;
983 }
984 
985 // Compiles re, returning program.
986 // Caller is responsible for deleting prog_.
987 // If reversed is true, compiles a program that expects
988 // to run over the input string backward (reverses all concatenations).
989 // The reversed flag is also recorded in the returned program.
Compile(Regexp * re,bool reversed,int64 max_mem)990 Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) {
991   Compiler c;
992 
993   c.Setup(re->parse_flags(), max_mem, RE2::ANCHOR_BOTH /* unused */);
994   c.reversed_ = reversed;
995 
996   // Simplify to remove things like counted repetitions
997   // and character classes like \d.
998   Regexp* sre = re->Simplify();
999   if (sre == NULL)
1000     return NULL;
1001 
1002   // Record whether prog is anchored, removing the anchors.
1003   // (They get in the way of other optimizations.)
1004   bool is_anchor_start = IsAnchorStart(&sre, 0);
1005   bool is_anchor_end = IsAnchorEnd(&sre, 0);
1006 
1007   // Generate fragment for entire regexp.
1008   Frag f = c.WalkExponential(sre, kNullFrag, 2*c.max_inst_);
1009   sre->Decref();
1010   if (c.failed_)
1011     return NULL;
1012 
1013   // Success!  Finish by putting Match node at end, and record start.
1014   // Turn off c.reversed_ (if it is set) to force the remaining concatenations
1015   // to behave normally.
1016   c.reversed_ = false;
1017   Frag all = c.Cat(f, c.Match(0));
1018   c.prog_->set_start(all.begin);
1019 
1020   if (reversed) {
1021     c.prog_->set_anchor_start(is_anchor_end);
1022     c.prog_->set_anchor_end(is_anchor_start);
1023   } else {
1024     c.prog_->set_anchor_start(is_anchor_start);
1025     c.prog_->set_anchor_end(is_anchor_end);
1026   }
1027 
1028   // Also create unanchored version, which starts with a .*? loop.
1029   if (c.prog_->anchor_start()) {
1030     c.prog_->set_start_unanchored(c.prog_->start());
1031   } else {
1032     Frag unanchored = c.Cat(c.DotStar(), all);
1033     c.prog_->set_start_unanchored(unanchored.begin);
1034   }
1035 
1036   c.prog_->set_reversed(reversed);
1037 
1038   // Hand ownership of prog_ to caller.
1039   return c.Finish();
1040 }
1041 
Finish()1042 Prog* Compiler::Finish() {
1043   if (failed_)
1044     return NULL;
1045 
1046   if (prog_->start() == 0 && prog_->start_unanchored() == 0) {
1047     // No possible matches; keep Fail instruction only.
1048     inst_len_ = 1;
1049   }
1050 
1051   // Trim instruction to minimum array and transfer to Prog.
1052   Trim();
1053   prog_->inst_ = inst_;
1054   prog_->size_ = inst_len_;
1055   inst_ = NULL;
1056 
1057   // Compute byte map.
1058   prog_->ComputeByteMap();
1059 
1060   prog_->Optimize();
1061 
1062   // Record remaining memory for DFA.
1063   if (max_mem_ <= 0) {
1064     prog_->set_dfa_mem(1<<20);
1065   } else {
1066     int64 m = max_mem_ - sizeof(Prog) - inst_len_*sizeof(Prog::Inst);
1067     if (m < 0)
1068       m = 0;
1069     prog_->set_dfa_mem(m);
1070   }
1071 
1072   Prog* p = prog_;
1073   prog_ = NULL;
1074   return p;
1075 }
1076 
1077 // Converts Regexp to Prog.
CompileToProg(int64 max_mem)1078 Prog* Regexp::CompileToProg(int64 max_mem) {
1079   return Compiler::Compile(this, false, max_mem);
1080 }
1081 
CompileToReverseProg(int64 max_mem)1082 Prog* Regexp::CompileToReverseProg(int64 max_mem) {
1083   return Compiler::Compile(this, true, max_mem);
1084 }
1085 
DotStar()1086 Frag Compiler::DotStar() {
1087   return Star(ByteRange(0x00, 0xff, false), true);
1088 }
1089 
1090 // Compiles RE set to Prog.
CompileSet(const RE2::Options & options,RE2::Anchor anchor,Regexp * re)1091 Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
1092                            Regexp* re) {
1093   Compiler c;
1094 
1095   Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(options.ParseFlags());
1096   c.Setup(pf, options.max_mem(), anchor);
1097 
1098   // Compile alternation of fragments.
1099   Frag all = c.WalkExponential(re, kNullFrag, 2*c.max_inst_);
1100   re->Decref();
1101   if (c.failed_)
1102     return NULL;
1103 
1104   if (anchor == RE2::UNANCHORED) {
1105     // The trailing .* was added while handling kRegexpHaveMatch.
1106     // We just have to add the leading one.
1107     all = c.Cat(c.DotStar(), all);
1108   }
1109 
1110   c.prog_->set_start(all.begin);
1111   c.prog_->set_start_unanchored(all.begin);
1112   c.prog_->set_anchor_start(true);
1113   c.prog_->set_anchor_end(true);
1114 
1115   Prog* prog = c.Finish();
1116   if (prog == NULL)
1117     return NULL;
1118 
1119   // Make sure DFA has enough memory to operate,
1120   // since we're not going to fall back to the NFA.
1121   bool failed;
1122   StringPiece sp = "hello, world";
1123   prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch,
1124                   NULL, &failed, NULL);
1125   if (failed) {
1126     delete prog;
1127     return NULL;
1128   }
1129 
1130   return prog;
1131 }
1132 
CompileSet(const RE2::Options & options,RE2::Anchor anchor,Regexp * re)1133 Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
1134                        Regexp* re) {
1135   return Compiler::CompileSet(options, anchor, re);
1136 }
1137 
1138 }  // namespace re2
1139