• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2007 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Compile regular expression to Prog.
6 //
7 // Prog and Inst are defined in prog.h.
8 // This file's external interface is just Regexp::CompileToProg.
9 // The Compiler class defined in this file is private.
10 
11 #include <stdint.h>
12 #include <string.h>
13 #include <unordered_map>
14 #include <utility>
15 
16 #include "util/logging.h"
17 #include "util/utf.h"
18 #include "re2/pod_array.h"
19 #include "re2/prog.h"
20 #include "re2/re2.h"
21 #include "re2/regexp.h"
22 #include "re2/walker-inl.h"
23 
24 namespace re2 {
25 
26 // List of pointers to Inst* that need to be filled in (patched).
27 // Because the Inst* haven't been filled in yet,
28 // we can use the Inst* word to hold the list's "next" pointer.
29 // It's kind of sleazy, but it works well in practice.
30 // See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
31 //
32 // Because the out and out1 fields in Inst are no longer pointers,
33 // we can't use pointers directly here either.  Instead, p refers
34 // to inst_[p>>1].out (p&1 == 0) or inst_[p>>1].out1 (p&1 == 1).
35 // p == 0 represents the NULL list.  This is okay because instruction #0
36 // is always the fail instruction, which never appears on a list.
37 
38 struct PatchList {
39   uint32_t p;
40 
41   // Returns patch list containing just p.
42   static PatchList Mk(uint32_t p);
43 
44   // Patches all the entries on l to have value v.
45   // Caller must not ever use patch list again.
46   static void Patch(Prog::Inst *inst0, PatchList l, uint32_t v);
47 
48   // Deref returns the next pointer pointed at by p.
49   static PatchList Deref(Prog::Inst *inst0, PatchList l);
50 
51   // Appends two patch lists and returns result.
52   static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2);
53 };
54 
55 static PatchList nullPatchList = { 0 };
56 
57 // Returns patch list containing just p.
Mk(uint32_t p)58 PatchList PatchList::Mk(uint32_t p) {
59   PatchList l;
60   l.p = p;
61   return l;
62 }
63 
64 // Returns the next pointer pointed at by l.
Deref(Prog::Inst * inst0,PatchList l)65 PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) {
66   Prog::Inst* ip = &inst0[l.p>>1];
67   if (l.p&1)
68     l.p = ip->out1();
69   else
70     l.p = ip->out();
71   return l;
72 }
73 
74 // Patches all the entries on l to have value v.
Patch(Prog::Inst * inst0,PatchList l,uint32_t val)75 void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32_t val) {
76   while (l.p != 0) {
77     Prog::Inst* ip = &inst0[l.p>>1];
78     if (l.p&1) {
79       l.p = ip->out1();
80       ip->out1_ = val;
81     } else {
82       l.p = ip->out();
83       ip->set_out(val);
84     }
85   }
86 }
87 
88 // Appends two patch lists and returns result.
Append(Prog::Inst * inst0,PatchList l1,PatchList l2)89 PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) {
90   if (l1.p == 0)
91     return l2;
92   if (l2.p == 0)
93     return l1;
94 
95   PatchList l = l1;
96   for (;;) {
97     PatchList next = PatchList::Deref(inst0, l);
98     if (next.p == 0)
99       break;
100     l = next;
101   }
102 
103   Prog::Inst* ip = &inst0[l.p>>1];
104   if (l.p&1)
105     ip->out1_ = l2.p;
106   else
107     ip->set_out(l2.p);
108 
109   return l1;
110 }
111 
112 // Compiled program fragment.
113 struct Frag {
114   uint32_t begin;
115   PatchList end;
116 
Fragre2::Frag117   Frag() : begin(0) { end.p = 0; }  // needed so Frag can go in vector
Fragre2::Frag118   Frag(uint32_t begin, PatchList end) : begin(begin), end(end) {}
119 };
120 
121 // Input encodings.
122 enum Encoding {
123   kEncodingUTF8 = 1,  // UTF-8 (0-10FFFF)
124   kEncodingLatin1,    // Latin-1 (0-FF)
125 };
126 
127 class Compiler : public Regexp::Walker<Frag> {
128  public:
129   explicit Compiler();
130   ~Compiler();
131 
132   // Compiles Regexp to a new Prog.
133   // Caller is responsible for deleting Prog when finished with it.
134   // If reversed is true, compiles for walking over the input
135   // string backward (reverses all concatenations).
136   static Prog *Compile(Regexp* re, bool reversed, int64_t max_mem);
137 
138   // Compiles alternation of all the re to a new Prog.
139   // Each re has a match with an id equal to its index in the vector.
140   static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
141 
142   // Interface for Regexp::Walker, which helps traverse the Regexp.
143   // The walk is purely post-recursive: given the machines for the
144   // children, PostVisit combines them to create the machine for
145   // the current node.  The child_args are Frags.
146   // The Compiler traverses the Regexp parse tree, visiting
147   // each node in depth-first order.  It invokes PreVisit before
148   // visiting the node's children and PostVisit after visiting
149   // the children.
150   Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop);
151   Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args,
152                  int nchild_args);
153   Frag ShortVisit(Regexp* re, Frag parent_arg);
154   Frag Copy(Frag arg);
155 
156   // Given fragment a, returns a+ or a+?; a* or a*?; a? or a??
157   Frag Plus(Frag a, bool nongreedy);
158   Frag Star(Frag a, bool nongreedy);
159   Frag Quest(Frag a, bool nongreedy);
160 
161   // Given fragment a, returns (a) capturing as \n.
162   Frag Capture(Frag a, int n);
163 
164   // Given fragments a and b, returns ab; a|b
165   Frag Cat(Frag a, Frag b);
166   Frag Alt(Frag a, Frag b);
167 
168   // Returns a fragment that can't match anything.
169   Frag NoMatch();
170 
171   // Returns a fragment that matches the empty string.
172   Frag Match(int32_t id);
173 
174   // Returns a no-op fragment.
175   Frag Nop();
176 
177   // Returns a fragment matching the byte range lo-hi.
178   Frag ByteRange(int lo, int hi, bool foldcase);
179 
180   // Returns a fragment matching an empty-width special op.
181   Frag EmptyWidth(EmptyOp op);
182 
183   // Adds n instructions to the program.
184   // Returns the index of the first one.
185   // Returns -1 if no more instructions are available.
186   int AllocInst(int n);
187 
188   // Rune range compiler.
189 
190   // Begins a new alternation.
191   void BeginRange();
192 
193   // Adds a fragment matching the rune range lo-hi.
194   void AddRuneRange(Rune lo, Rune hi, bool foldcase);
195   void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
196   void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
197   void Add_80_10ffff();
198 
199   // New suffix that matches the byte range lo-hi, then goes to next.
200   int UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);
201   int CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);
202 
203   // Returns true iff the suffix is cached.
204   bool IsCachedRuneByteSuffix(int id);
205 
206   // Adds a suffix to alternation.
207   void AddSuffix(int id);
208 
209   // Adds a suffix to the trie starting from the given root node.
210   // Returns zero iff allocating an instruction fails. Otherwise, returns
211   // the current root node, which might be different from what was given.
212   int AddSuffixRecursive(int root, int id);
213 
214   // Finds the trie node for the given suffix. Returns a Frag in order to
215   // distinguish between pointing at the root node directly (end.p == 0)
216   // and pointing at an Alt's out1 or out (end.p&1 == 1 or 0, respectively).
217   Frag FindByteRange(int root, int id);
218 
219   // Compares two ByteRanges and returns true iff they are equal.
220   bool ByteRangeEqual(int id1, int id2);
221 
222   // Returns the alternation of all the added suffixes.
223   Frag EndRange();
224 
225   // Single rune.
226   Frag Literal(Rune r, bool foldcase);
227 
228   void Setup(Regexp::ParseFlags, int64_t, RE2::Anchor);
229   Prog* Finish();
230 
231   // Returns .* where dot = any byte
232   Frag DotStar();
233 
234  private:
235   Prog* prog_;         // Program being built.
236   bool failed_;        // Did we give up compiling?
237   Encoding encoding_;  // Input encoding
238   bool reversed_;      // Should program run backward over text?
239 
240   PODArray<Prog::Inst> inst_;
241   int ninst_;          // Number of instructions used.
242   int max_ninst_;      // Maximum number of instructions.
243 
244   int64_t max_mem_;    // Total memory budget.
245 
246   std::unordered_map<uint64_t, int> rune_cache_;
247   Frag rune_range_;
248 
249   RE2::Anchor anchor_;  // anchor mode for RE2::Set
250 
251   Compiler(const Compiler&) = delete;
252   Compiler& operator=(const Compiler&) = delete;
253 };
254 
Compiler()255 Compiler::Compiler() {
256   prog_ = new Prog();
257   failed_ = false;
258   encoding_ = kEncodingUTF8;
259   reversed_ = false;
260   ninst_ = 0;
261   max_ninst_ = 1;  // make AllocInst for fail instruction okay
262   max_mem_ = 0;
263   int fail = AllocInst(1);
264   inst_[fail].InitFail();
265   max_ninst_ = 0;  // Caller must change
266 }
267 
~Compiler()268 Compiler::~Compiler() {
269   delete prog_;
270 }
271 
AllocInst(int n)272 int Compiler::AllocInst(int n) {
273   if (failed_ || ninst_ + n > max_ninst_) {
274     failed_ = true;
275     return -1;
276   }
277 
278   if (ninst_ + n > inst_.size()) {
279     int cap = inst_.size();
280     if (cap == 0)
281       cap = 8;
282     while (ninst_ + n > cap)
283       cap *= 2;
284     PODArray<Prog::Inst> inst(cap);
285     if (inst_.data() != NULL)
286       memmove(inst.data(), inst_.data(), ninst_*sizeof inst_[0]);
287     memset(inst.data() + ninst_, 0, (cap - ninst_)*sizeof inst_[0]);
288     inst_ = std::move(inst);
289   }
290   int id = ninst_;
291   ninst_ += n;
292   return id;
293 }
294 
295 // These routines are somewhat hard to visualize in text --
296 // see http://swtch.com/~rsc/regexp/regexp1.html for
297 // pictures explaining what is going on here.
298 
299 // Returns an unmatchable fragment.
NoMatch()300 Frag Compiler::NoMatch() {
301   return Frag(0, nullPatchList);
302 }
303 
304 // Is a an unmatchable fragment?
IsNoMatch(Frag a)305 static bool IsNoMatch(Frag a) {
306   return a.begin == 0;
307 }
308 
309 // Given fragments a and b, returns fragment for ab.
Cat(Frag a,Frag b)310 Frag Compiler::Cat(Frag a, Frag b) {
311   if (IsNoMatch(a) || IsNoMatch(b))
312     return NoMatch();
313 
314   // Elide no-op.
315   Prog::Inst* begin = &inst_[a.begin];
316   if (begin->opcode() == kInstNop &&
317       a.end.p == (a.begin << 1) &&
318       begin->out() == 0) {
319     // in case refs to a somewhere
320     PatchList::Patch(inst_.data(), a.end, b.begin);
321     return b;
322   }
323 
324   // To run backward over string, reverse all concatenations.
325   if (reversed_) {
326     PatchList::Patch(inst_.data(), b.end, a.begin);
327     return Frag(b.begin, a.end);
328   }
329 
330   PatchList::Patch(inst_.data(), a.end, b.begin);
331   return Frag(a.begin, b.end);
332 }
333 
334 // Given fragments for a and b, returns fragment for a|b.
Alt(Frag a,Frag b)335 Frag Compiler::Alt(Frag a, Frag b) {
336   // Special case for convenience in loops.
337   if (IsNoMatch(a))
338     return b;
339   if (IsNoMatch(b))
340     return a;
341 
342   int id = AllocInst(1);
343   if (id < 0)
344     return NoMatch();
345 
346   inst_[id].InitAlt(a.begin, b.begin);
347   return Frag(id, PatchList::Append(inst_.data(), a.end, b.end));
348 }
349 
350 // When capturing submatches in like-Perl mode, a kOpAlt Inst
351 // treats out_ as the first choice, out1_ as the second.
352 //
353 // For *, +, and ?, if out_ causes another repetition,
354 // then the operator is greedy.  If out1_ is the repetition
355 // (and out_ moves forward), then the operator is non-greedy.
356 
357 // Given a fragment a, returns a fragment for a* or a*? (if nongreedy)
Star(Frag a,bool nongreedy)358 Frag Compiler::Star(Frag a, bool nongreedy) {
359   int id = AllocInst(1);
360   if (id < 0)
361     return NoMatch();
362   inst_[id].InitAlt(0, 0);
363   PatchList::Patch(inst_.data(), a.end, id);
364   if (nongreedy) {
365     inst_[id].out1_ = a.begin;
366     return Frag(id, PatchList::Mk(id << 1));
367   } else {
368     inst_[id].set_out(a.begin);
369     return Frag(id, PatchList::Mk((id << 1) | 1));
370   }
371 }
372 
373 // Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
Plus(Frag a,bool nongreedy)374 Frag Compiler::Plus(Frag a, bool nongreedy) {
375   // a+ is just a* with a different entry point.
376   Frag f = Star(a, nongreedy);
377   return Frag(a.begin, f.end);
378 }
379 
380 // Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
Quest(Frag a,bool nongreedy)381 Frag Compiler::Quest(Frag a, bool nongreedy) {
382   if (IsNoMatch(a))
383     return Nop();
384   int id = AllocInst(1);
385   if (id < 0)
386     return NoMatch();
387   PatchList pl;
388   if (nongreedy) {
389     inst_[id].InitAlt(0, a.begin);
390     pl = PatchList::Mk(id << 1);
391   } else {
392     inst_[id].InitAlt(a.begin, 0);
393     pl = PatchList::Mk((id << 1) | 1);
394   }
395   return Frag(id, PatchList::Append(inst_.data(), pl, a.end));
396 }
397 
398 // Returns a fragment for the byte range lo-hi.
ByteRange(int lo,int hi,bool foldcase)399 Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
400   int id = AllocInst(1);
401   if (id < 0)
402     return NoMatch();
403   inst_[id].InitByteRange(lo, hi, foldcase, 0);
404   return Frag(id, PatchList::Mk(id << 1));
405 }
406 
407 // Returns a no-op fragment.  Sometimes unavoidable.
Nop()408 Frag Compiler::Nop() {
409   int id = AllocInst(1);
410   if (id < 0)
411     return NoMatch();
412   inst_[id].InitNop(0);
413   return Frag(id, PatchList::Mk(id << 1));
414 }
415 
416 // Returns a fragment that signals a match.
Match(int32_t match_id)417 Frag Compiler::Match(int32_t match_id) {
418   int id = AllocInst(1);
419   if (id < 0)
420     return NoMatch();
421   inst_[id].InitMatch(match_id);
422   return Frag(id, nullPatchList);
423 }
424 
425 // Returns a fragment matching a particular empty-width op (like ^ or $)
EmptyWidth(EmptyOp empty)426 Frag Compiler::EmptyWidth(EmptyOp empty) {
427   int id = AllocInst(1);
428   if (id < 0)
429     return NoMatch();
430   inst_[id].InitEmptyWidth(empty, 0);
431   return Frag(id, PatchList::Mk(id << 1));
432 }
433 
434 // Given a fragment a, returns a fragment with capturing parens around a.
Capture(Frag a,int n)435 Frag Compiler::Capture(Frag a, int n) {
436   if (IsNoMatch(a))
437     return NoMatch();
438   int id = AllocInst(2);
439   if (id < 0)
440     return NoMatch();
441   inst_[id].InitCapture(2*n, a.begin);
442   inst_[id+1].InitCapture(2*n+1, 0);
443   PatchList::Patch(inst_.data(), a.end, id+1);
444 
445   return Frag(id, PatchList::Mk((id+1) << 1));
446 }
447 
448 // A Rune is a name for a Unicode code point.
449 // Returns maximum rune encoded by UTF-8 sequence of length len.
MaxRune(int len)450 static int MaxRune(int len) {
451   int b;  // number of Rune bits in len-byte UTF-8 sequence (len < UTFmax)
452   if (len == 1)
453     b = 7;
454   else
455     b = 8-(len+1) + 6*(len-1);
456   return (1<<b) - 1;   // maximum Rune for b bits.
457 }
458 
459 // The rune range compiler caches common suffix fragments,
460 // which are very common in UTF-8 (e.g., [80-bf]).
461 // The fragment suffixes are identified by their start
462 // instructions.  NULL denotes the eventual end match.
463 // The Frag accumulates in rune_range_.  Caching common
464 // suffixes reduces the UTF-8 "." from 32 to 24 instructions,
465 // and it reduces the corresponding one-pass NFA from 16 nodes to 8.
466 
BeginRange()467 void Compiler::BeginRange() {
468   rune_cache_.clear();
469   rune_range_.begin = 0;
470   rune_range_.end = nullPatchList;
471 }
472 
UncachedRuneByteSuffix(uint8_t lo,uint8_t hi,bool foldcase,int next)473 int Compiler::UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
474                                      int next) {
475   Frag f = ByteRange(lo, hi, foldcase);
476   if (next != 0) {
477     PatchList::Patch(inst_.data(), f.end, next);
478   } else {
479     rune_range_.end = PatchList::Append(inst_.data(), rune_range_.end, f.end);
480   }
481   return f.begin;
482 }
483 
MakeRuneCacheKey(uint8_t lo,uint8_t hi,bool foldcase,int next)484 static uint64_t MakeRuneCacheKey(uint8_t lo, uint8_t hi, bool foldcase,
485                                  int next) {
486   return (uint64_t)next << 17 |
487          (uint64_t)lo   <<  9 |
488          (uint64_t)hi   <<  1 |
489          (uint64_t)foldcase;
490 }
491 
CachedRuneByteSuffix(uint8_t lo,uint8_t hi,bool foldcase,int next)492 int Compiler::CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
493                                    int next) {
494   uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
495   std::unordered_map<uint64_t, int>::const_iterator it = rune_cache_.find(key);
496   if (it != rune_cache_.end())
497     return it->second;
498   int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
499   rune_cache_[key] = id;
500   return id;
501 }
502 
IsCachedRuneByteSuffix(int id)503 bool Compiler::IsCachedRuneByteSuffix(int id) {
504   uint8_t lo = inst_[id].lo_;
505   uint8_t hi = inst_[id].hi_;
506   bool foldcase = inst_[id].foldcase() != 0;
507   int next = inst_[id].out();
508 
509   uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
510   return rune_cache_.find(key) != rune_cache_.end();
511 }
512 
AddSuffix(int id)513 void Compiler::AddSuffix(int id) {
514   if (failed_)
515     return;
516 
517   if (rune_range_.begin == 0) {
518     rune_range_.begin = id;
519     return;
520   }
521 
522   if (encoding_ == kEncodingUTF8) {
523     // Build a trie in order to reduce fanout.
524     rune_range_.begin = AddSuffixRecursive(rune_range_.begin, id);
525     return;
526   }
527 
528   int alt = AllocInst(1);
529   if (alt < 0) {
530     rune_range_.begin = 0;
531     return;
532   }
533   inst_[alt].InitAlt(rune_range_.begin, id);
534   rune_range_.begin = alt;
535 }
536 
AddSuffixRecursive(int root,int id)537 int Compiler::AddSuffixRecursive(int root, int id) {
538   DCHECK(inst_[root].opcode() == kInstAlt ||
539          inst_[root].opcode() == kInstByteRange);
540 
541   Frag f = FindByteRange(root, id);
542   if (IsNoMatch(f)) {
543     int alt = AllocInst(1);
544     if (alt < 0)
545       return 0;
546     inst_[alt].InitAlt(root, id);
547     return alt;
548   }
549 
550   int br;
551   if (f.end.p == 0)
552     br = root;
553   else if (f.end.p&1)
554     br = inst_[f.begin].out1();
555   else
556     br = inst_[f.begin].out();
557 
558   if (IsCachedRuneByteSuffix(br)) {
559     // We can't fiddle with cached suffixes, so make a clone of the head.
560     int byterange = AllocInst(1);
561     if (byterange < 0)
562       return 0;
563     inst_[byterange].InitByteRange(inst_[br].lo(), inst_[br].hi(),
564                                    inst_[br].foldcase(), inst_[br].out());
565 
566     // Ensure that the parent points to the clone, not to the original.
567     // Note that this could leave the head unreachable except via the cache.
568     br = byterange;
569     if (f.end.p == 0)
570       root = br;
571     else if (f.end.p&1)
572       inst_[f.begin].out1_ = br;
573     else
574       inst_[f.begin].set_out(br);
575   }
576 
577   int out = inst_[id].out();
578   if (!IsCachedRuneByteSuffix(id)) {
579     // The head should be the instruction most recently allocated, so free it
580     // instead of leaving it unreachable.
581     DCHECK_EQ(id, ninst_-1);
582     inst_[id].out_opcode_ = 0;
583     inst_[id].out1_ = 0;
584     ninst_--;
585   }
586 
587   out = AddSuffixRecursive(inst_[br].out(), out);
588   if (out == 0)
589     return 0;
590 
591   inst_[br].set_out(out);
592   return root;
593 }
594 
ByteRangeEqual(int id1,int id2)595 bool Compiler::ByteRangeEqual(int id1, int id2) {
596   return inst_[id1].lo() == inst_[id2].lo() &&
597          inst_[id1].hi() == inst_[id2].hi() &&
598          inst_[id1].foldcase() == inst_[id2].foldcase();
599 }
600 
FindByteRange(int root,int id)601 Frag Compiler::FindByteRange(int root, int id) {
602   if (inst_[root].opcode() == kInstByteRange) {
603     if (ByteRangeEqual(root, id))
604       return Frag(root, nullPatchList);
605     else
606       return NoMatch();
607   }
608 
609   while (inst_[root].opcode() == kInstAlt) {
610     int out1 = inst_[root].out1();
611     if (ByteRangeEqual(out1, id))
612       return Frag(root, PatchList::Mk((root << 1) | 1));
613 
614     // CharClass is a sorted list of ranges, so if out1 of the root Alt wasn't
615     // what we're looking for, then we can stop immediately. Unfortunately, we
616     // can't short-circuit the search in reverse mode.
617     if (!reversed_)
618       return NoMatch();
619 
620     int out = inst_[root].out();
621     if (inst_[out].opcode() == kInstAlt)
622       root = out;
623     else if (ByteRangeEqual(out, id))
624       return Frag(root, PatchList::Mk(root << 1));
625     else
626       return NoMatch();
627   }
628 
629   LOG(DFATAL) << "should never happen";
630   return NoMatch();
631 }
632 
EndRange()633 Frag Compiler::EndRange() {
634   return rune_range_;
635 }
636 
637 // Converts rune range lo-hi into a fragment that recognizes
638 // the bytes that would make up those runes in the current
639 // encoding (Latin 1 or UTF-8).
640 // This lets the machine work byte-by-byte even when
641 // using multibyte encodings.
642 
AddRuneRange(Rune lo,Rune hi,bool foldcase)643 void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) {
644   switch (encoding_) {
645     default:
646     case kEncodingUTF8:
647       AddRuneRangeUTF8(lo, hi, foldcase);
648       break;
649     case kEncodingLatin1:
650       AddRuneRangeLatin1(lo, hi, foldcase);
651       break;
652   }
653 }
654 
AddRuneRangeLatin1(Rune lo,Rune hi,bool foldcase)655 void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
656   // Latin-1 is easy: runes *are* bytes.
657   if (lo > hi || lo > 0xFF)
658     return;
659   if (hi > 0xFF)
660     hi = 0xFF;
661   AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
662                                    static_cast<uint8_t>(hi), foldcase, 0));
663 }
664 
665 // Table describing how to make a UTF-8 matching machine
666 // for the rune range 80-10FFFF (Runeself-Runemax).
667 // This range happens frequently enough (for example /./ and /[^a-z]/)
668 // and the rune_cache_ map is slow enough that this is worth
669 // special handling.  Makes compilation of a small expression
670 // with a dot in it about 10% faster.
671 // The * in the comments below mark whole sequences.
672 static struct ByteRangeProg {
673   int next;
674   int lo;
675   int hi;
676 } prog_80_10ffff[] = {
677   // Two-byte
678   { -1, 0x80, 0xBF, },  // 0:  80-BF
679   {  0, 0xC2, 0xDF, },  // 1:  C2-DF 80-BF*
680 
681   // Three-byte
682   {  0, 0xA0, 0xBF, },  // 2:  A0-BF 80-BF
683   {  2, 0xE0, 0xE0, },  // 3:  E0 A0-BF 80-BF*
684   {  0, 0x80, 0xBF, },  // 4:  80-BF 80-BF
685   {  4, 0xE1, 0xEF, },  // 5:  E1-EF 80-BF 80-BF*
686 
687   // Four-byte
688   {  4, 0x90, 0xBF, },  // 6:  90-BF 80-BF 80-BF
689   {  6, 0xF0, 0xF0, },  // 7:  F0 90-BF 80-BF 80-BF*
690   {  4, 0x80, 0xBF, },  // 8:  80-BF 80-BF 80-BF
691   {  8, 0xF1, 0xF3, },  // 9: F1-F3 80-BF 80-BF 80-BF*
692   {  4, 0x80, 0x8F, },  // 10: 80-8F 80-BF 80-BF
693   { 10, 0xF4, 0xF4, },  // 11: F4 80-8F 80-BF 80-BF*
694 };
695 
Add_80_10ffff()696 void Compiler::Add_80_10ffff() {
697   int inst[arraysize(prog_80_10ffff)] = { 0 }; // does not need to be initialized; silences gcc warning
698   for (size_t i = 0; i < arraysize(prog_80_10ffff); i++) {
699     const ByteRangeProg& p = prog_80_10ffff[i];
700     int next = 0;
701     if (p.next >= 0)
702       next = inst[p.next];
703     inst[i] = UncachedRuneByteSuffix(static_cast<uint8_t>(p.lo),
704                                      static_cast<uint8_t>(p.hi), false, next);
705     if ((p.lo & 0xC0) != 0x80)
706       AddSuffix(inst[i]);
707   }
708 }
709 
AddRuneRangeUTF8(Rune lo,Rune hi,bool foldcase)710 void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
711   if (lo > hi)
712     return;
713 
714   // Pick off 80-10FFFF as a common special case
715   // that can bypass the slow rune_cache_.
716   if (lo == 0x80 && hi == 0x10ffff && !reversed_) {
717     Add_80_10ffff();
718     return;
719   }
720 
721   // Split range into same-length sized ranges.
722   for (int i = 1; i < UTFmax; i++) {
723     Rune max = MaxRune(i);
724     if (lo <= max && max < hi) {
725       AddRuneRangeUTF8(lo, max, foldcase);
726       AddRuneRangeUTF8(max+1, hi, foldcase);
727       return;
728     }
729   }
730 
731   // ASCII range is always a special case.
732   if (hi < Runeself) {
733     AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
734                                      static_cast<uint8_t>(hi), foldcase, 0));
735     return;
736   }
737 
738   // Split range into sections that agree on leading bytes.
739   for (int i = 1; i < UTFmax; i++) {
740     uint32_t m = (1<<(6*i)) - 1;  // last i bytes of a UTF-8 sequence
741     if ((lo & ~m) != (hi & ~m)) {
742       if ((lo & m) != 0) {
743         AddRuneRangeUTF8(lo, lo|m, foldcase);
744         AddRuneRangeUTF8((lo|m)+1, hi, foldcase);
745         return;
746       }
747       if ((hi & m) != m) {
748         AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase);
749         AddRuneRangeUTF8(hi&~m, hi, foldcase);
750         return;
751       }
752     }
753   }
754 
755   // Finally.  Generate byte matching equivalent for lo-hi.
756   uint8_t ulo[UTFmax], uhi[UTFmax];
757   int n = runetochar(reinterpret_cast<char*>(ulo), &lo);
758   int m = runetochar(reinterpret_cast<char*>(uhi), &hi);
759   (void)m;  // USED(m)
760   DCHECK_EQ(n, m);
761 
762   // The logic below encodes this thinking:
763   //
764   // 1. When we have built the whole suffix, we know that it cannot
765   // possibly be a suffix of anything longer: in forward mode, nothing
766   // else can occur before the leading byte; in reverse mode, nothing
767   // else can occur after the last continuation byte or else the leading
768   // byte would have to change. Thus, there is no benefit to caching
769   // the first byte of the suffix whereas there is a cost involved in
770   // cloning it if it begins a common prefix, which is fairly likely.
771   //
772   // 2. Conversely, the last byte of the suffix cannot possibly be a
773   // prefix of anything because next == 0, so we will never want to
774   // clone it, but it is fairly likely to be a common suffix. Perhaps
775   // more so in reverse mode than in forward mode because the former is
776   // "converging" towards lower entropy, but caching is still worthwhile
777   // for the latter in cases such as 80-BF.
778   //
779   // 3. Handling the bytes between the first and the last is less
780   // straightforward and, again, the approach depends on whether we are
781   // "converging" towards lower entropy: in forward mode, a single byte
782   // is unlikely to be part of a common suffix whereas a byte range
783   // is more likely so; in reverse mode, a byte range is unlikely to
784   // be part of a common suffix whereas a single byte is more likely
785   // so. The same benefit versus cost argument applies here.
786   int id = 0;
787   if (reversed_) {
788     for (int i = 0; i < n; i++) {
789       // In reverse UTF-8 mode: cache the leading byte; don't cache the last
790       // continuation byte; cache anything else iff it's a single byte (XX-XX).
791       if (i == 0 || (ulo[i] == uhi[i] && i != n-1))
792         id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
793       else
794         id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
795     }
796   } else {
797     for (int i = n-1; i >= 0; i--) {
798       // In forward UTF-8 mode: don't cache the leading byte; cache the last
799       // continuation byte; cache anything else iff it's a byte range (XX-YY).
800       if (i == n-1 || (ulo[i] < uhi[i] && i != 0))
801         id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
802       else
803         id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
804     }
805   }
806   AddSuffix(id);
807 }
808 
809 // Should not be called.
Copy(Frag arg)810 Frag Compiler::Copy(Frag arg) {
811   // We're using WalkExponential; there should be no copying.
812   LOG(DFATAL) << "Compiler::Copy called!";
813   failed_ = true;
814   return NoMatch();
815 }
816 
817 // Visits a node quickly; called once WalkExponential has
818 // decided to cut this walk short.
ShortVisit(Regexp * re,Frag)819 Frag Compiler::ShortVisit(Regexp* re, Frag) {
820   failed_ = true;
821   return NoMatch();
822 }
823 
824 // Called before traversing a node's children during the walk.
PreVisit(Regexp * re,Frag,bool * stop)825 Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) {
826   // Cut off walk if we've already failed.
827   if (failed_)
828     *stop = true;
829 
830   return Frag();  // not used by caller
831 }
832 
Literal(Rune r,bool foldcase)833 Frag Compiler::Literal(Rune r, bool foldcase) {
834   switch (encoding_) {
835     default:
836       return Frag();
837 
838     case kEncodingLatin1:
839       return ByteRange(r, r, foldcase);
840 
841     case kEncodingUTF8: {
842       if (r < Runeself)  // Make common case fast.
843         return ByteRange(r, r, foldcase);
844       uint8_t buf[UTFmax];
845       int n = runetochar(reinterpret_cast<char*>(buf), &r);
846       Frag f = ByteRange((uint8_t)buf[0], buf[0], false);
847       for (int i = 1; i < n; i++)
848         f = Cat(f, ByteRange((uint8_t)buf[i], buf[i], false));
849       return f;
850     }
851   }
852 }
853 
854 // Called after traversing the node's children during the walk.
855 // Given their frags, build and return the frag for this re.
PostVisit(Regexp * re,Frag,Frag,Frag * child_frags,int nchild_frags)856 Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
857                          int nchild_frags) {
858   // If a child failed, don't bother going forward, especially
859   // since the child_frags might contain Frags with NULLs in them.
860   if (failed_)
861     return NoMatch();
862 
863   // Given the child fragments, return the fragment for this node.
864   switch (re->op()) {
865     case kRegexpRepeat:
866       // Should not see; code at bottom of function will print error
867       break;
868 
869     case kRegexpNoMatch:
870       return NoMatch();
871 
872     case kRegexpEmptyMatch:
873       return Nop();
874 
875     case kRegexpHaveMatch: {
876       Frag f = Match(re->match_id());
877       if (anchor_ == RE2::ANCHOR_BOTH) {
878         // Append \z or else the subexpression will effectively be unanchored.
879         // Complemented by the UNANCHORED case in CompileSet().
880         f = Cat(EmptyWidth(kEmptyEndText), f);
881       }
882       return f;
883     }
884 
885     case kRegexpConcat: {
886       Frag f = child_frags[0];
887       for (int i = 1; i < nchild_frags; i++)
888         f = Cat(f, child_frags[i]);
889       return f;
890     }
891 
892     case kRegexpAlternate: {
893       Frag f = child_frags[0];
894       for (int i = 1; i < nchild_frags; i++)
895         f = Alt(f, child_frags[i]);
896       return f;
897     }
898 
899     case kRegexpStar:
900       return Star(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
901 
902     case kRegexpPlus:
903       return Plus(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
904 
905     case kRegexpQuest:
906       return Quest(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
907 
908     case kRegexpLiteral:
909       return Literal(re->rune(), (re->parse_flags()&Regexp::FoldCase) != 0);
910 
911     case kRegexpLiteralString: {
912       // Concatenation of literals.
913       if (re->nrunes() == 0)
914         return Nop();
915       Frag f;
916       for (int i = 0; i < re->nrunes(); i++) {
917         Frag f1 = Literal(re->runes()[i],
918                           (re->parse_flags()&Regexp::FoldCase) != 0);
919         if (i == 0)
920           f = f1;
921         else
922           f = Cat(f, f1);
923       }
924       return f;
925     }
926 
927     case kRegexpAnyChar:
928       BeginRange();
929       AddRuneRange(0, Runemax, false);
930       return EndRange();
931 
932     case kRegexpAnyByte:
933       return ByteRange(0x00, 0xFF, false);
934 
935     case kRegexpCharClass: {
936       CharClass* cc = re->cc();
937       if (cc->empty()) {
938         // This can't happen.
939         LOG(DFATAL) << "No ranges in char class";
940         failed_ = true;
941         return NoMatch();
942       }
943 
944       // ASCII case-folding optimization: if the char class
945       // behaves the same on A-Z as it does on a-z,
946       // discard any ranges wholly contained in A-Z
947       // and mark the other ranges as foldascii.
948       // This reduces the size of a program for
949       // (?i)abc from 3 insts per letter to 1 per letter.
950       bool foldascii = cc->FoldsASCII();
951 
952       // Character class is just a big OR of the different
953       // character ranges in the class.
954       BeginRange();
955       for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) {
956         // ASCII case-folding optimization (see above).
957         if (foldascii && 'A' <= i->lo && i->hi <= 'Z')
958           continue;
959 
960         // If this range contains all of A-Za-z or none of it,
961         // the fold flag is unnecessary; don't bother.
962         bool fold = foldascii;
963         if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo ||
964             ('Z' < i->lo && i->hi < 'a'))
965           fold = false;
966 
967         AddRuneRange(i->lo, i->hi, fold);
968       }
969       return EndRange();
970     }
971 
972     case kRegexpCapture:
973       // If this is a non-capturing parenthesis -- (?:foo) --
974       // just use the inner expression.
975       if (re->cap() < 0)
976         return child_frags[0];
977       return Capture(child_frags[0], re->cap());
978 
979     case kRegexpBeginLine:
980       return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine);
981 
982     case kRegexpEndLine:
983       return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine);
984 
985     case kRegexpBeginText:
986       return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText);
987 
988     case kRegexpEndText:
989       return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText);
990 
991     case kRegexpWordBoundary:
992       return EmptyWidth(kEmptyWordBoundary);
993 
994     case kRegexpNoWordBoundary:
995       return EmptyWidth(kEmptyNonWordBoundary);
996   }
997   LOG(DFATAL) << "Missing case in Compiler: " << re->op();
998   failed_ = true;
999   return NoMatch();
1000 }
1001 
1002 // Is this regexp required to start at the beginning of the text?
1003 // Only approximate; can return false for complicated regexps like (\Aa|\Ab),
1004 // but handles (\A(a|b)).  Could use the Walker to write a more exact one.
IsAnchorStart(Regexp ** pre,int depth)1005 static bool IsAnchorStart(Regexp** pre, int depth) {
1006   Regexp* re = *pre;
1007   Regexp* sub;
1008   // The depth limit makes sure that we don't overflow
1009   // the stack on a deeply nested regexp.  As the comment
1010   // above says, IsAnchorStart is conservative, so returning
1011   // a false negative is okay.  The exact limit is somewhat arbitrary.
1012   if (re == NULL || depth >= 4)
1013     return false;
1014   switch (re->op()) {
1015     default:
1016       break;
1017     case kRegexpConcat:
1018       if (re->nsub() > 0) {
1019         sub = re->sub()[0]->Incref();
1020         if (IsAnchorStart(&sub, depth+1)) {
1021           PODArray<Regexp*> subcopy(re->nsub());
1022           subcopy[0] = sub;  // already have reference
1023           for (int i = 1; i < re->nsub(); i++)
1024             subcopy[i] = re->sub()[i]->Incref();
1025           *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags());
1026           re->Decref();
1027           return true;
1028         }
1029         sub->Decref();
1030       }
1031       break;
1032     case kRegexpCapture:
1033       sub = re->sub()[0]->Incref();
1034       if (IsAnchorStart(&sub, depth+1)) {
1035         *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
1036         re->Decref();
1037         return true;
1038       }
1039       sub->Decref();
1040       break;
1041     case kRegexpBeginText:
1042       *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
1043       re->Decref();
1044       return true;
1045   }
1046   return false;
1047 }
1048 
1049 // Is this regexp required to start at the end of the text?
1050 // Only approximate; can return false for complicated regexps like (a\z|b\z),
1051 // but handles ((a|b)\z).  Could use the Walker to write a more exact one.
IsAnchorEnd(Regexp ** pre,int depth)1052 static bool IsAnchorEnd(Regexp** pre, int depth) {
1053   Regexp* re = *pre;
1054   Regexp* sub;
1055   // The depth limit makes sure that we don't overflow
1056   // the stack on a deeply nested regexp.  As the comment
1057   // above says, IsAnchorEnd is conservative, so returning
1058   // a false negative is okay.  The exact limit is somewhat arbitrary.
1059   if (re == NULL || depth >= 4)
1060     return false;
1061   switch (re->op()) {
1062     default:
1063       break;
1064     case kRegexpConcat:
1065       if (re->nsub() > 0) {
1066         sub = re->sub()[re->nsub() - 1]->Incref();
1067         if (IsAnchorEnd(&sub, depth+1)) {
1068           PODArray<Regexp*> subcopy(re->nsub());
1069           subcopy[re->nsub() - 1] = sub;  // already have reference
1070           for (int i = 0; i < re->nsub() - 1; i++)
1071             subcopy[i] = re->sub()[i]->Incref();
1072           *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags());
1073           re->Decref();
1074           return true;
1075         }
1076         sub->Decref();
1077       }
1078       break;
1079     case kRegexpCapture:
1080       sub = re->sub()[0]->Incref();
1081       if (IsAnchorEnd(&sub, depth+1)) {
1082         *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
1083         re->Decref();
1084         return true;
1085       }
1086       sub->Decref();
1087       break;
1088     case kRegexpEndText:
1089       *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
1090       re->Decref();
1091       return true;
1092   }
1093   return false;
1094 }
1095 
Setup(Regexp::ParseFlags flags,int64_t max_mem,RE2::Anchor anchor)1096 void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem,
1097                      RE2::Anchor anchor) {
1098   prog_->set_flags(flags);
1099 
1100   if (flags & Regexp::Latin1)
1101     encoding_ = kEncodingLatin1;
1102   max_mem_ = max_mem;
1103   if (max_mem <= 0) {
1104     max_ninst_ = 100000;  // more than enough
1105   } else if (static_cast<size_t>(max_mem) <= sizeof(Prog)) {
1106     // No room for anything.
1107     max_ninst_ = 0;
1108   } else {
1109     int64_t m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);
1110     // Limit instruction count so that inst->id() fits nicely in an int.
1111     // SparseArray also assumes that the indices (inst->id()) are ints.
1112     // The call to WalkExponential uses 2*max_ninst_ below,
1113     // and other places in the code use 2 or 3 * prog->size().
1114     // Limiting to 2^24 should avoid overflow in those places.
1115     // (The point of allowing more than 32 bits of memory is to
1116     // have plenty of room for the DFA states, not to use it up
1117     // on the program.)
1118     if (m >= 1<<24)
1119       m = 1<<24;
1120 
1121     // Inst imposes its own limit (currently bigger than 2^24 but be safe).
1122     if (m > Prog::Inst::kMaxInst)
1123       m = Prog::Inst::kMaxInst;
1124 
1125     max_ninst_ = static_cast<int>(m);
1126   }
1127 
1128   anchor_ = anchor;
1129 }
1130 
1131 // Compiles re, returning program.
1132 // Caller is responsible for deleting prog_.
1133 // If reversed is true, compiles a program that expects
1134 // to run over the input string backward (reverses all concatenations).
1135 // The reversed flag is also recorded in the returned program.
Compile(Regexp * re,bool reversed,int64_t max_mem)1136 Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) {
1137   Compiler c;
1138   c.Setup(re->parse_flags(), max_mem, RE2::UNANCHORED /* unused */);
1139   c.reversed_ = reversed;
1140 
1141   // Simplify to remove things like counted repetitions
1142   // and character classes like \d.
1143   Regexp* sre = re->Simplify();
1144   if (sre == NULL)
1145     return NULL;
1146 
1147   // Record whether prog is anchored, removing the anchors.
1148   // (They get in the way of other optimizations.)
1149   bool is_anchor_start = IsAnchorStart(&sre, 0);
1150   bool is_anchor_end = IsAnchorEnd(&sre, 0);
1151 
1152   // Generate fragment for entire regexp.
1153   Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_);
1154   sre->Decref();
1155   if (c.failed_)
1156     return NULL;
1157 
1158   // Success!  Finish by putting Match node at end, and record start.
1159   // Turn off c.reversed_ (if it is set) to force the remaining concatenations
1160   // to behave normally.
1161   c.reversed_ = false;
1162   all = c.Cat(all, c.Match(0));
1163 
1164   c.prog_->set_reversed(reversed);
1165   if (c.prog_->reversed()) {
1166     c.prog_->set_anchor_start(is_anchor_end);
1167     c.prog_->set_anchor_end(is_anchor_start);
1168   } else {
1169     c.prog_->set_anchor_start(is_anchor_start);
1170     c.prog_->set_anchor_end(is_anchor_end);
1171   }
1172 
1173   c.prog_->set_start(all.begin);
1174   if (!c.prog_->anchor_start()) {
1175     // Also create unanchored version, which starts with a .*? loop.
1176     all = c.Cat(c.DotStar(), all);
1177   }
1178   c.prog_->set_start_unanchored(all.begin);
1179 
1180   // Hand ownership of prog_ to caller.
1181   return c.Finish();
1182 }
1183 
Finish()1184 Prog* Compiler::Finish() {
1185   if (failed_)
1186     return NULL;
1187 
1188   if (prog_->start() == 0 && prog_->start_unanchored() == 0) {
1189     // No possible matches; keep Fail instruction only.
1190     ninst_ = 1;
1191   }
1192 
1193   // Hand off the array to Prog.
1194   prog_->inst_ = std::move(inst_);
1195   prog_->size_ = ninst_;
1196 
1197   prog_->Optimize();
1198   prog_->Flatten();
1199   prog_->ComputeByteMap();
1200 
1201   // Record remaining memory for DFA.
1202   if (max_mem_ <= 0) {
1203     prog_->set_dfa_mem(1<<20);
1204   } else {
1205     int64_t m = max_mem_ - sizeof(Prog);
1206     m -= prog_->size_*sizeof(Prog::Inst);  // account for inst_
1207     if (prog_->CanBitState())
1208       m -= prog_->size_*sizeof(uint16_t);  // account for list_heads_
1209     if (m < 0)
1210       m = 0;
1211     prog_->set_dfa_mem(m);
1212   }
1213 
1214   Prog* p = prog_;
1215   prog_ = NULL;
1216   return p;
1217 }
1218 
1219 // Converts Regexp to Prog.
CompileToProg(int64_t max_mem)1220 Prog* Regexp::CompileToProg(int64_t max_mem) {
1221   return Compiler::Compile(this, false, max_mem);
1222 }
1223 
CompileToReverseProg(int64_t max_mem)1224 Prog* Regexp::CompileToReverseProg(int64_t max_mem) {
1225   return Compiler::Compile(this, true, max_mem);
1226 }
1227 
DotStar()1228 Frag Compiler::DotStar() {
1229   return Star(ByteRange(0x00, 0xff, false), true);
1230 }
1231 
1232 // Compiles RE set to Prog.
CompileSet(Regexp * re,RE2::Anchor anchor,int64_t max_mem)1233 Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
1234   Compiler c;
1235   c.Setup(re->parse_flags(), max_mem, anchor);
1236 
1237   Regexp* sre = re->Simplify();
1238   if (sre == NULL)
1239     return NULL;
1240 
1241   Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_);
1242   sre->Decref();
1243   if (c.failed_)
1244     return NULL;
1245 
1246   c.prog_->set_anchor_start(true);
1247   c.prog_->set_anchor_end(true);
1248 
1249   if (anchor == RE2::UNANCHORED) {
1250     // Prepend .* or else the expression will effectively be anchored.
1251     // Complemented by the ANCHOR_BOTH case in PostVisit().
1252     all = c.Cat(c.DotStar(), all);
1253   }
1254   c.prog_->set_start(all.begin);
1255   c.prog_->set_start_unanchored(all.begin);
1256 
1257   Prog* prog = c.Finish();
1258   if (prog == NULL)
1259     return NULL;
1260 
1261   // Make sure DFA has enough memory to operate,
1262   // since we're not going to fall back to the NFA.
1263   bool dfa_failed = false;
1264   StringPiece sp = "hello, world";
1265   prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch,
1266                   NULL, &dfa_failed, NULL);
1267   if (dfa_failed) {
1268     delete prog;
1269     return NULL;
1270   }
1271 
1272   return prog;
1273 }
1274 
CompileSet(Regexp * re,RE2::Anchor anchor,int64_t max_mem)1275 Prog* Prog::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
1276   return Compiler::CompileSet(re, anchor, max_mem);
1277 }
1278 
1279 }  // namespace re2
1280