• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2007 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Compile regular expression to Prog.
6 //
7 // Prog and Inst are defined in prog.h.
8 // This file's external interface is just Regexp::CompileToProg.
9 // The Compiler class defined in this file is private.
10 
11 #include <stdint.h>
12 #include <string.h>
13 
14 #include <string>
15 #include <utility>
16 
17 #include "absl/container/flat_hash_map.h"
18 #include "absl/log/absl_check.h"
19 #include "absl/log/absl_log.h"
20 #include "absl/strings/string_view.h"
21 #include "re2/pod_array.h"
22 #include "re2/prog.h"
23 #include "re2/re2.h"
24 #include "re2/regexp.h"
25 #include "re2/walker-inl.h"
26 #include "util/utf.h"
27 
28 namespace re2 {
29 
30 // List of pointers to Inst* that need to be filled in (patched).
31 // Because the Inst* haven't been filled in yet,
32 // we can use the Inst* word to hold the list's "next" pointer.
33 // It's kind of sleazy, but it works well in practice.
34 // See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
35 //
36 // Because the out and out1 fields in Inst are no longer pointers,
37 // we can't use pointers directly here either.  Instead, head refers
38 // to inst_[head>>1].out (head&1 == 0) or inst_[head>>1].out1 (head&1 == 1).
39 // head == 0 represents the NULL list.  This is okay because instruction #0
40 // is always the fail instruction, which never appears on a list.
41 struct PatchList {
42   // Returns patch list containing just p.
Mkre2::PatchList43   static PatchList Mk(uint32_t p) {
44     return {p, p};
45   }
46 
47   // Patches all the entries on l to have value p.
48   // Caller must not ever use patch list again.
Patchre2::PatchList49   static void Patch(Prog::Inst* inst0, PatchList l, uint32_t p) {
50     while (l.head != 0) {
51       Prog::Inst* ip = &inst0[l.head>>1];
52       if (l.head&1) {
53         l.head = ip->out1();
54         ip->out1_ = p;
55       } else {
56         l.head = ip->out();
57         ip->set_out(p);
58       }
59     }
60   }
61 
62   // Appends two patch lists and returns result.
Appendre2::PatchList63   static PatchList Append(Prog::Inst* inst0, PatchList l1, PatchList l2) {
64     if (l1.head == 0)
65       return l2;
66     if (l2.head == 0)
67       return l1;
68     Prog::Inst* ip = &inst0[l1.tail>>1];
69     if (l1.tail&1)
70       ip->out1_ = l2.head;
71     else
72       ip->set_out(l2.head);
73     return {l1.head, l2.tail};
74   }
75 
76   uint32_t head;
77   uint32_t tail;  // for constant-time append
78 };
79 
80 static const PatchList kNullPatchList = {0, 0};
81 
82 // Compiled program fragment.
83 struct Frag {
84   uint32_t begin;
85   PatchList end;
86   bool nullable;
87 
Fragre2::Frag88   Frag() : begin(0), end(kNullPatchList), nullable(false) {}
Fragre2::Frag89   Frag(uint32_t begin, PatchList end, bool nullable)
90       : begin(begin), end(end), nullable(nullable) {}
91 };
92 
93 // Input encodings.
94 enum Encoding {
95   kEncodingUTF8 = 1,  // UTF-8 (0-10FFFF)
96   kEncodingLatin1,    // Latin-1 (0-FF)
97 };
98 
99 class Compiler : public Regexp::Walker<Frag> {
100  public:
101   explicit Compiler();
102   ~Compiler();
103 
104   // Compiles Regexp to a new Prog.
105   // Caller is responsible for deleting Prog when finished with it.
106   // If reversed is true, compiles for walking over the input
107   // string backward (reverses all concatenations).
108   static Prog *Compile(Regexp* re, bool reversed, int64_t max_mem);
109 
110   // Compiles alternation of all the re to a new Prog.
111   // Each re has a match with an id equal to its index in the vector.
112   static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
113 
114   // Interface for Regexp::Walker, which helps traverse the Regexp.
115   // The walk is purely post-recursive: given the machines for the
116   // children, PostVisit combines them to create the machine for
117   // the current node.  The child_args are Frags.
118   // The Compiler traverses the Regexp parse tree, visiting
119   // each node in depth-first order.  It invokes PreVisit before
120   // visiting the node's children and PostVisit after visiting
121   // the children.
122   Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop);
123   Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args,
124                  int nchild_args);
125   Frag ShortVisit(Regexp* re, Frag parent_arg);
126   Frag Copy(Frag arg);
127 
128   // Given fragment a, returns a+ or a+?; a* or a*?; a? or a??
129   Frag Plus(Frag a, bool nongreedy);
130   Frag Star(Frag a, bool nongreedy);
131   Frag Quest(Frag a, bool nongreedy);
132 
133   // Given fragment a, returns (a) capturing as \n.
134   Frag Capture(Frag a, int n);
135 
136   // Given fragments a and b, returns ab; a|b
137   Frag Cat(Frag a, Frag b);
138   Frag Alt(Frag a, Frag b);
139 
140   // Returns a fragment that can't match anything.
141   Frag NoMatch();
142 
143   // Returns a fragment that matches the empty string.
144   Frag Match(int32_t id);
145 
146   // Returns a no-op fragment.
147   Frag Nop();
148 
149   // Returns a fragment matching the byte range lo-hi.
150   Frag ByteRange(int lo, int hi, bool foldcase);
151 
152   // Returns a fragment matching an empty-width special op.
153   Frag EmptyWidth(EmptyOp op);
154 
155   // Adds n instructions to the program.
156   // Returns the index of the first one.
157   // Returns -1 if no more instructions are available.
158   int AllocInst(int n);
159 
160   // Rune range compiler.
161 
162   // Begins a new alternation.
163   void BeginRange();
164 
165   // Adds a fragment matching the rune range lo-hi.
166   void AddRuneRange(Rune lo, Rune hi, bool foldcase);
167   void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
168   void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
169   void Add_80_10ffff();
170 
171   // New suffix that matches the byte range lo-hi, then goes to next.
172   int UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);
173   int CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);
174 
175   // Returns true iff the suffix is cached.
176   bool IsCachedRuneByteSuffix(int id);
177 
178   // Adds a suffix to alternation.
179   void AddSuffix(int id);
180 
181   // Adds a suffix to the trie starting from the given root node.
182   // Returns zero iff allocating an instruction fails. Otherwise, returns
183   // the current root node, which might be different from what was given.
184   int AddSuffixRecursive(int root, int id);
185 
186   // Finds the trie node for the given suffix. Returns a Frag in order to
187   // distinguish between pointing at the root node directly (end.head == 0)
188   // and pointing at an Alt's out1 or out (end.head&1 == 1 or 0, respectively).
189   Frag FindByteRange(int root, int id);
190 
191   // Compares two ByteRanges and returns true iff they are equal.
192   bool ByteRangeEqual(int id1, int id2);
193 
194   // Returns the alternation of all the added suffixes.
195   Frag EndRange();
196 
197   // Single rune.
198   Frag Literal(Rune r, bool foldcase);
199 
200   void Setup(Regexp::ParseFlags flags, int64_t max_mem, RE2::Anchor anchor);
201   Prog* Finish(Regexp* re);
202 
203   // Returns .* where dot = any byte
204   Frag DotStar();
205 
206  private:
207   Prog* prog_;         // Program being built.
208   bool failed_;        // Did we give up compiling?
209   Encoding encoding_;  // Input encoding
210   bool reversed_;      // Should program run backward over text?
211 
212   PODArray<Prog::Inst> inst_;
213   int ninst_;          // Number of instructions used.
214   int max_ninst_;      // Maximum number of instructions.
215 
216   int64_t max_mem_;    // Total memory budget.
217 
218   absl::flat_hash_map<uint64_t, int> rune_cache_;
219   Frag rune_range_;
220 
221   RE2::Anchor anchor_;  // anchor mode for RE2::Set
222 
223   Compiler(const Compiler&) = delete;
224   Compiler& operator=(const Compiler&) = delete;
225 };
226 
Compiler()227 Compiler::Compiler() {
228   prog_ = new Prog();
229   failed_ = false;
230   encoding_ = kEncodingUTF8;
231   reversed_ = false;
232   ninst_ = 0;
233   max_ninst_ = 1;  // make AllocInst for fail instruction okay
234   max_mem_ = 0;
235   int fail = AllocInst(1);
236   inst_[fail].InitFail();
237   max_ninst_ = 0;  // Caller must change
238 }
239 
~Compiler()240 Compiler::~Compiler() {
241   delete prog_;
242 }
243 
AllocInst(int n)244 int Compiler::AllocInst(int n) {
245   if (failed_ || ninst_ + n > max_ninst_) {
246     failed_ = true;
247     return -1;
248   }
249 
250   if (ninst_ + n > inst_.size()) {
251     int cap = inst_.size();
252     if (cap == 0)
253       cap = 8;
254     while (ninst_ + n > cap)
255       cap *= 2;
256     PODArray<Prog::Inst> inst(cap);
257     if (inst_.data() != NULL)
258       memmove(inst.data(), inst_.data(), ninst_*sizeof inst_[0]);
259     memset(inst.data() + ninst_, 0, (cap - ninst_)*sizeof inst_[0]);
260     inst_ = std::move(inst);
261   }
262   int id = ninst_;
263   ninst_ += n;
264   return id;
265 }
266 
267 // These routines are somewhat hard to visualize in text --
268 // see http://swtch.com/~rsc/regexp/regexp1.html for
269 // pictures explaining what is going on here.
270 
271 // Returns an unmatchable fragment.
NoMatch()272 Frag Compiler::NoMatch() {
273   return Frag();
274 }
275 
276 // Is a an unmatchable fragment?
IsNoMatch(Frag a)277 static bool IsNoMatch(Frag a) {
278   return a.begin == 0;
279 }
280 
281 // Given fragments a and b, returns fragment for ab.
Cat(Frag a,Frag b)282 Frag Compiler::Cat(Frag a, Frag b) {
283   if (IsNoMatch(a) || IsNoMatch(b))
284     return NoMatch();
285 
286   // Elide no-op.
287   Prog::Inst* begin = &inst_[a.begin];
288   if (begin->opcode() == kInstNop &&
289       a.end.head == (a.begin << 1) &&
290       begin->out() == 0) {
291     // in case refs to a somewhere
292     PatchList::Patch(inst_.data(), a.end, b.begin);
293     return b;
294   }
295 
296   // To run backward over string, reverse all concatenations.
297   if (reversed_) {
298     PatchList::Patch(inst_.data(), b.end, a.begin);
299     return Frag(b.begin, a.end, b.nullable && a.nullable);
300   }
301 
302   PatchList::Patch(inst_.data(), a.end, b.begin);
303   return Frag(a.begin, b.end, a.nullable && b.nullable);
304 }
305 
306 // Given fragments for a and b, returns fragment for a|b.
Alt(Frag a,Frag b)307 Frag Compiler::Alt(Frag a, Frag b) {
308   // Special case for convenience in loops.
309   if (IsNoMatch(a))
310     return b;
311   if (IsNoMatch(b))
312     return a;
313 
314   int id = AllocInst(1);
315   if (id < 0)
316     return NoMatch();
317 
318   inst_[id].InitAlt(a.begin, b.begin);
319   return Frag(id, PatchList::Append(inst_.data(), a.end, b.end),
320               a.nullable || b.nullable);
321 }
322 
323 // When capturing submatches in like-Perl mode, a kOpAlt Inst
324 // treats out_ as the first choice, out1_ as the second.
325 //
326 // For *, +, and ?, if out_ causes another repetition,
327 // then the operator is greedy.  If out1_ is the repetition
328 // (and out_ moves forward), then the operator is non-greedy.
329 
330 // Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
Plus(Frag a,bool nongreedy)331 Frag Compiler::Plus(Frag a, bool nongreedy) {
332   int id = AllocInst(1);
333   if (id < 0)
334     return NoMatch();
335   PatchList pl;
336   if (nongreedy) {
337     inst_[id].InitAlt(0, a.begin);
338     pl = PatchList::Mk(id << 1);
339   } else {
340     inst_[id].InitAlt(a.begin, 0);
341     pl = PatchList::Mk((id << 1) | 1);
342   }
343   PatchList::Patch(inst_.data(), a.end, id);
344   return Frag(a.begin, pl, a.nullable);
345 }
346 
347 // Given a fragment for a, returns a fragment for a* or a*? (if nongreedy)
Star(Frag a,bool nongreedy)348 Frag Compiler::Star(Frag a, bool nongreedy) {
349   // When the subexpression is nullable, one Alt isn't enough to guarantee
350   // correct priority ordering within the transitive closure. The simplest
351   // solution is to handle it as (a+)? instead, which adds the second Alt.
352   if (a.nullable)
353     return Quest(Plus(a, nongreedy), nongreedy);
354 
355   int id = AllocInst(1);
356   if (id < 0)
357     return NoMatch();
358   PatchList pl;
359   if (nongreedy) {
360     inst_[id].InitAlt(0, a.begin);
361     pl = PatchList::Mk(id << 1);
362   } else {
363     inst_[id].InitAlt(a.begin, 0);
364     pl = PatchList::Mk((id << 1) | 1);
365   }
366   PatchList::Patch(inst_.data(), a.end, id);
367   return Frag(id, pl, true);
368 }
369 
370 // Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
Quest(Frag a,bool nongreedy)371 Frag Compiler::Quest(Frag a, bool nongreedy) {
372   if (IsNoMatch(a))
373     return Nop();
374   int id = AllocInst(1);
375   if (id < 0)
376     return NoMatch();
377   PatchList pl;
378   if (nongreedy) {
379     inst_[id].InitAlt(0, a.begin);
380     pl = PatchList::Mk(id << 1);
381   } else {
382     inst_[id].InitAlt(a.begin, 0);
383     pl = PatchList::Mk((id << 1) | 1);
384   }
385   return Frag(id, PatchList::Append(inst_.data(), pl, a.end), true);
386 }
387 
388 // Returns a fragment for the byte range lo-hi.
ByteRange(int lo,int hi,bool foldcase)389 Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
390   int id = AllocInst(1);
391   if (id < 0)
392     return NoMatch();
393   inst_[id].InitByteRange(lo, hi, foldcase, 0);
394   return Frag(id, PatchList::Mk(id << 1), false);
395 }
396 
397 // Returns a no-op fragment.  Sometimes unavoidable.
Nop()398 Frag Compiler::Nop() {
399   int id = AllocInst(1);
400   if (id < 0)
401     return NoMatch();
402   inst_[id].InitNop(0);
403   return Frag(id, PatchList::Mk(id << 1), true);
404 }
405 
406 // Returns a fragment that signals a match.
Match(int32_t match_id)407 Frag Compiler::Match(int32_t match_id) {
408   int id = AllocInst(1);
409   if (id < 0)
410     return NoMatch();
411   inst_[id].InitMatch(match_id);
412   return Frag(id, kNullPatchList, false);
413 }
414 
415 // Returns a fragment matching a particular empty-width op (like ^ or $)
EmptyWidth(EmptyOp empty)416 Frag Compiler::EmptyWidth(EmptyOp empty) {
417   int id = AllocInst(1);
418   if (id < 0)
419     return NoMatch();
420   inst_[id].InitEmptyWidth(empty, 0);
421   return Frag(id, PatchList::Mk(id << 1), true);
422 }
423 
424 // Given a fragment a, returns a fragment with capturing parens around a.
Capture(Frag a,int n)425 Frag Compiler::Capture(Frag a, int n) {
426   if (IsNoMatch(a))
427     return NoMatch();
428   int id = AllocInst(2);
429   if (id < 0)
430     return NoMatch();
431   inst_[id].InitCapture(2*n, a.begin);
432   inst_[id+1].InitCapture(2*n+1, 0);
433   PatchList::Patch(inst_.data(), a.end, id+1);
434 
435   return Frag(id, PatchList::Mk((id+1) << 1), a.nullable);
436 }
437 
438 // A Rune is a name for a Unicode code point.
439 // Returns maximum rune encoded by UTF-8 sequence of length len.
MaxRune(int len)440 static int MaxRune(int len) {
441   int b;  // number of Rune bits in len-byte UTF-8 sequence (len < UTFmax)
442   if (len == 1)
443     b = 7;
444   else
445     b = 8-(len+1) + 6*(len-1);
446   return (1<<b) - 1;   // maximum Rune for b bits.
447 }
448 
449 // The rune range compiler caches common suffix fragments,
450 // which are very common in UTF-8 (e.g., [80-bf]).
451 // The fragment suffixes are identified by their start
452 // instructions.  NULL denotes the eventual end match.
453 // The Frag accumulates in rune_range_.  Caching common
454 // suffixes reduces the UTF-8 "." from 32 to 24 instructions,
455 // and it reduces the corresponding one-pass NFA from 16 nodes to 8.
456 
BeginRange()457 void Compiler::BeginRange() {
458   rune_cache_.clear();
459   rune_range_.begin = 0;
460   rune_range_.end = kNullPatchList;
461 }
462 
UncachedRuneByteSuffix(uint8_t lo,uint8_t hi,bool foldcase,int next)463 int Compiler::UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
464                                      int next) {
465   Frag f = ByteRange(lo, hi, foldcase);
466   if (next != 0) {
467     PatchList::Patch(inst_.data(), f.end, next);
468   } else {
469     rune_range_.end = PatchList::Append(inst_.data(), rune_range_.end, f.end);
470   }
471   return f.begin;
472 }
473 
MakeRuneCacheKey(uint8_t lo,uint8_t hi,bool foldcase,int next)474 static uint64_t MakeRuneCacheKey(uint8_t lo, uint8_t hi, bool foldcase,
475                                  int next) {
476   return (uint64_t)next << 17 |
477          (uint64_t)lo   <<  9 |
478          (uint64_t)hi   <<  1 |
479          (uint64_t)foldcase;
480 }
481 
CachedRuneByteSuffix(uint8_t lo,uint8_t hi,bool foldcase,int next)482 int Compiler::CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
483                                    int next) {
484   uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
485   absl::flat_hash_map<uint64_t, int>::const_iterator it = rune_cache_.find(key);
486   if (it != rune_cache_.end())
487     return it->second;
488   int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
489   rune_cache_[key] = id;
490   return id;
491 }
492 
IsCachedRuneByteSuffix(int id)493 bool Compiler::IsCachedRuneByteSuffix(int id) {
494   uint8_t lo = inst_[id].lo_;
495   uint8_t hi = inst_[id].hi_;
496   bool foldcase = inst_[id].foldcase() != 0;
497   int next = inst_[id].out();
498 
499   uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
500   return rune_cache_.find(key) != rune_cache_.end();
501 }
502 
AddSuffix(int id)503 void Compiler::AddSuffix(int id) {
504   if (failed_)
505     return;
506 
507   if (rune_range_.begin == 0) {
508     rune_range_.begin = id;
509     return;
510   }
511 
512   if (encoding_ == kEncodingUTF8) {
513     // Build a trie in order to reduce fanout.
514     rune_range_.begin = AddSuffixRecursive(rune_range_.begin, id);
515     return;
516   }
517 
518   int alt = AllocInst(1);
519   if (alt < 0) {
520     rune_range_.begin = 0;
521     return;
522   }
523   inst_[alt].InitAlt(rune_range_.begin, id);
524   rune_range_.begin = alt;
525 }
526 
AddSuffixRecursive(int root,int id)527 int Compiler::AddSuffixRecursive(int root, int id) {
528   ABSL_DCHECK(inst_[root].opcode() == kInstAlt ||
529               inst_[root].opcode() == kInstByteRange);
530 
531   Frag f = FindByteRange(root, id);
532   if (IsNoMatch(f)) {
533     int alt = AllocInst(1);
534     if (alt < 0)
535       return 0;
536     inst_[alt].InitAlt(root, id);
537     return alt;
538   }
539 
540   int br;
541   if (f.end.head == 0)
542     br = root;
543   else if (f.end.head&1)
544     br = inst_[f.begin].out1();
545   else
546     br = inst_[f.begin].out();
547 
548   if (IsCachedRuneByteSuffix(br)) {
549     // We can't fiddle with cached suffixes, so make a clone of the head.
550     int byterange = AllocInst(1);
551     if (byterange < 0)
552       return 0;
553     inst_[byterange].InitByteRange(inst_[br].lo(), inst_[br].hi(),
554                                    inst_[br].foldcase(), inst_[br].out());
555 
556     // Ensure that the parent points to the clone, not to the original.
557     // Note that this could leave the head unreachable except via the cache.
558     br = byterange;
559     if (f.end.head == 0)
560       root = br;
561     else if (f.end.head&1)
562       inst_[f.begin].out1_ = br;
563     else
564       inst_[f.begin].set_out(br);
565   }
566 
567   int out = inst_[id].out();
568   if (!IsCachedRuneByteSuffix(id)) {
569     // The head should be the instruction most recently allocated, so free it
570     // instead of leaving it unreachable.
571     ABSL_DCHECK_EQ(id, ninst_-1);
572     inst_[id].out_opcode_ = 0;
573     inst_[id].out1_ = 0;
574     ninst_--;
575   }
576 
577   out = AddSuffixRecursive(inst_[br].out(), out);
578   if (out == 0)
579     return 0;
580 
581   inst_[br].set_out(out);
582   return root;
583 }
584 
ByteRangeEqual(int id1,int id2)585 bool Compiler::ByteRangeEqual(int id1, int id2) {
586   return inst_[id1].lo() == inst_[id2].lo() &&
587          inst_[id1].hi() == inst_[id2].hi() &&
588          inst_[id1].foldcase() == inst_[id2].foldcase();
589 }
590 
FindByteRange(int root,int id)591 Frag Compiler::FindByteRange(int root, int id) {
592   if (inst_[root].opcode() == kInstByteRange) {
593     if (ByteRangeEqual(root, id))
594       return Frag(root, kNullPatchList, false);
595     else
596       return NoMatch();
597   }
598 
599   while (inst_[root].opcode() == kInstAlt) {
600     int out1 = inst_[root].out1();
601     if (ByteRangeEqual(out1, id))
602       return Frag(root, PatchList::Mk((root << 1) | 1), false);
603 
604     // CharClass is a sorted list of ranges, so if out1 of the root Alt wasn't
605     // what we're looking for, then we can stop immediately. Unfortunately, we
606     // can't short-circuit the search in reverse mode.
607     if (!reversed_)
608       return NoMatch();
609 
610     int out = inst_[root].out();
611     if (inst_[out].opcode() == kInstAlt)
612       root = out;
613     else if (ByteRangeEqual(out, id))
614       return Frag(root, PatchList::Mk(root << 1), false);
615     else
616       return NoMatch();
617   }
618 
619   ABSL_LOG(DFATAL) << "should never happen";
620   return NoMatch();
621 }
622 
EndRange()623 Frag Compiler::EndRange() {
624   return rune_range_;
625 }
626 
627 // Converts rune range lo-hi into a fragment that recognizes
628 // the bytes that would make up those runes in the current
629 // encoding (Latin 1 or UTF-8).
630 // This lets the machine work byte-by-byte even when
631 // using multibyte encodings.
632 
AddRuneRange(Rune lo,Rune hi,bool foldcase)633 void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) {
634   switch (encoding_) {
635     default:
636     case kEncodingUTF8:
637       AddRuneRangeUTF8(lo, hi, foldcase);
638       break;
639     case kEncodingLatin1:
640       AddRuneRangeLatin1(lo, hi, foldcase);
641       break;
642   }
643 }
644 
AddRuneRangeLatin1(Rune lo,Rune hi,bool foldcase)645 void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
646   // Latin-1 is easy: runes *are* bytes.
647   if (lo > hi || lo > 0xFF)
648     return;
649   if (hi > 0xFF)
650     hi = 0xFF;
651   AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
652                                    static_cast<uint8_t>(hi), foldcase, 0));
653 }
654 
Add_80_10ffff()655 void Compiler::Add_80_10ffff() {
656   // The 80-10FFFF (Runeself-Runemax) rune range occurs frequently enough
657   // (for example, for /./ and /[^a-z]/) that it is worth simplifying: by
658   // permitting overlong encodings in E0 and F0 sequences and code points
659   // over 10FFFF in F4 sequences, the size of the bytecode and the number
660   // of equivalence classes are reduced significantly.
661   int id;
662   if (reversed_) {
663     // Prefix factoring matters, but we don't have to handle it here
664     // because the rune range trie logic takes care of that already.
665     id = UncachedRuneByteSuffix(0xC2, 0xDF, false, 0);
666     id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
667     AddSuffix(id);
668 
669     id = UncachedRuneByteSuffix(0xE0, 0xEF, false, 0);
670     id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
671     id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
672     AddSuffix(id);
673 
674     id = UncachedRuneByteSuffix(0xF0, 0xF4, false, 0);
675     id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
676     id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
677     id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
678     AddSuffix(id);
679   } else {
680     // Suffix factoring matters - and we do have to handle it here.
681     int cont1 = UncachedRuneByteSuffix(0x80, 0xBF, false, 0);
682     id = UncachedRuneByteSuffix(0xC2, 0xDF, false, cont1);
683     AddSuffix(id);
684 
685     int cont2 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont1);
686     id = UncachedRuneByteSuffix(0xE0, 0xEF, false, cont2);
687     AddSuffix(id);
688 
689     int cont3 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont2);
690     id = UncachedRuneByteSuffix(0xF0, 0xF4, false, cont3);
691     AddSuffix(id);
692   }
693 }
694 
AddRuneRangeUTF8(Rune lo,Rune hi,bool foldcase)695 void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
696   if (lo > hi)
697     return;
698 
699   // Pick off 80-10FFFF as a common special case.
700   if (lo == 0x80 && hi == 0x10ffff) {
701     Add_80_10ffff();
702     return;
703   }
704 
705   // Split range into same-length sized ranges.
706   for (int i = 1; i < UTFmax; i++) {
707     Rune max = MaxRune(i);
708     if (lo <= max && max < hi) {
709       AddRuneRangeUTF8(lo, max, foldcase);
710       AddRuneRangeUTF8(max+1, hi, foldcase);
711       return;
712     }
713   }
714 
715   // ASCII range is always a special case.
716   if (hi < Runeself) {
717     AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
718                                      static_cast<uint8_t>(hi), foldcase, 0));
719     return;
720   }
721 
722   // Split range into sections that agree on leading bytes.
723   for (int i = 1; i < UTFmax; i++) {
724     uint32_t m = (1<<(6*i)) - 1;  // last i bytes of a UTF-8 sequence
725     if ((lo & ~m) != (hi & ~m)) {
726       if ((lo & m) != 0) {
727         AddRuneRangeUTF8(lo, lo|m, foldcase);
728         AddRuneRangeUTF8((lo|m)+1, hi, foldcase);
729         return;
730       }
731       if ((hi & m) != m) {
732         AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase);
733         AddRuneRangeUTF8(hi&~m, hi, foldcase);
734         return;
735       }
736     }
737   }
738 
739   // Finally.  Generate byte matching equivalent for lo-hi.
740   uint8_t ulo[UTFmax], uhi[UTFmax];
741   int n = runetochar(reinterpret_cast<char*>(ulo), &lo);
742   int m = runetochar(reinterpret_cast<char*>(uhi), &hi);
743   (void)m;  // USED(m)
744   ABSL_DCHECK_EQ(n, m);
745 
746   // The logic below encodes this thinking:
747   //
748   // 1. When we have built the whole suffix, we know that it cannot
749   // possibly be a suffix of anything longer: in forward mode, nothing
750   // else can occur before the leading byte; in reverse mode, nothing
751   // else can occur after the last continuation byte or else the leading
752   // byte would have to change. Thus, there is no benefit to caching
753   // the first byte of the suffix whereas there is a cost involved in
754   // cloning it if it begins a common prefix, which is fairly likely.
755   //
756   // 2. Conversely, the last byte of the suffix cannot possibly be a
757   // prefix of anything because next == 0, so we will never want to
758   // clone it, but it is fairly likely to be a common suffix. Perhaps
759   // more so in reverse mode than in forward mode because the former is
760   // "converging" towards lower entropy, but caching is still worthwhile
761   // for the latter in cases such as 80-BF.
762   //
763   // 3. Handling the bytes between the first and the last is less
764   // straightforward and, again, the approach depends on whether we are
765   // "converging" towards lower entropy: in forward mode, a single byte
766   // is unlikely to be part of a common suffix whereas a byte range
767   // is more likely so; in reverse mode, a byte range is unlikely to
768   // be part of a common suffix whereas a single byte is more likely
769   // so. The same benefit versus cost argument applies here.
770   int id = 0;
771   if (reversed_) {
772     for (int i = 0; i < n; i++) {
773       // In reverse UTF-8 mode: cache the leading byte; don't cache the last
774       // continuation byte; cache anything else iff it's a single byte (XX-XX).
775       if (i == 0 || (ulo[i] == uhi[i] && i != n-1))
776         id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
777       else
778         id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
779     }
780   } else {
781     for (int i = n-1; i >= 0; i--) {
782       // In forward UTF-8 mode: don't cache the leading byte; cache the last
783       // continuation byte; cache anything else iff it's a byte range (XX-YY).
784       if (i == n-1 || (ulo[i] < uhi[i] && i != 0))
785         id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
786       else
787         id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
788     }
789   }
790   AddSuffix(id);
791 }
792 
793 // Should not be called.
Copy(Frag arg)794 Frag Compiler::Copy(Frag arg) {
795   // We're using WalkExponential; there should be no copying.
796   failed_ = true;
797   ABSL_LOG(DFATAL) << "Compiler::Copy called!";
798   return NoMatch();
799 }
800 
801 // Visits a node quickly; called once WalkExponential has
802 // decided to cut this walk short.
ShortVisit(Regexp * re,Frag)803 Frag Compiler::ShortVisit(Regexp* re, Frag) {
804   failed_ = true;
805   return NoMatch();
806 }
807 
808 // Called before traversing a node's children during the walk.
PreVisit(Regexp * re,Frag,bool * stop)809 Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) {
810   // Cut off walk if we've already failed.
811   if (failed_)
812     *stop = true;
813 
814   return Frag();  // not used by caller
815 }
816 
Literal(Rune r,bool foldcase)817 Frag Compiler::Literal(Rune r, bool foldcase) {
818   switch (encoding_) {
819     default:
820       return Frag();
821 
822     case kEncodingLatin1:
823       return ByteRange(r, r, foldcase);
824 
825     case kEncodingUTF8: {
826       if (r < Runeself)  // Make common case fast.
827         return ByteRange(r, r, foldcase);
828       uint8_t buf[UTFmax];
829       int n = runetochar(reinterpret_cast<char*>(buf), &r);
830       Frag f = ByteRange((uint8_t)buf[0], buf[0], false);
831       for (int i = 1; i < n; i++)
832         f = Cat(f, ByteRange((uint8_t)buf[i], buf[i], false));
833       return f;
834     }
835   }
836 }
837 
838 // Called after traversing the node's children during the walk.
839 // Given their frags, build and return the frag for this re.
PostVisit(Regexp * re,Frag,Frag,Frag * child_frags,int nchild_frags)840 Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
841                          int nchild_frags) {
842   // If a child failed, don't bother going forward, especially
843   // since the child_frags might contain Frags with NULLs in them.
844   if (failed_)
845     return NoMatch();
846 
847   // Given the child fragments, return the fragment for this node.
848   switch (re->op()) {
849     case kRegexpRepeat:
850       // Should not see; code at bottom of function will print error
851       break;
852 
853     case kRegexpNoMatch:
854       return NoMatch();
855 
856     case kRegexpEmptyMatch:
857       return Nop();
858 
859     case kRegexpHaveMatch: {
860       Frag f = Match(re->match_id());
861       if (anchor_ == RE2::ANCHOR_BOTH) {
862         // Append \z or else the subexpression will effectively be unanchored.
863         // Complemented by the UNANCHORED case in CompileSet().
864         f = Cat(EmptyWidth(kEmptyEndText), f);
865       }
866       return f;
867     }
868 
869     case kRegexpConcat: {
870       Frag f = child_frags[0];
871       for (int i = 1; i < nchild_frags; i++)
872         f = Cat(f, child_frags[i]);
873       return f;
874     }
875 
876     case kRegexpAlternate: {
877       Frag f = child_frags[0];
878       for (int i = 1; i < nchild_frags; i++)
879         f = Alt(f, child_frags[i]);
880       return f;
881     }
882 
883     case kRegexpStar:
884       return Star(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
885 
886     case kRegexpPlus:
887       return Plus(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
888 
889     case kRegexpQuest:
890       return Quest(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
891 
892     case kRegexpLiteral:
893       return Literal(re->rune(), (re->parse_flags()&Regexp::FoldCase) != 0);
894 
895     case kRegexpLiteralString: {
896       // Concatenation of literals.
897       if (re->nrunes() == 0)
898         return Nop();
899       Frag f;
900       for (int i = 0; i < re->nrunes(); i++) {
901         Frag f1 = Literal(re->runes()[i],
902                           (re->parse_flags()&Regexp::FoldCase) != 0);
903         if (i == 0)
904           f = f1;
905         else
906           f = Cat(f, f1);
907       }
908       return f;
909     }
910 
911     case kRegexpAnyChar:
912       BeginRange();
913       AddRuneRange(0, Runemax, false);
914       return EndRange();
915 
916     case kRegexpAnyByte:
917       return ByteRange(0x00, 0xFF, false);
918 
919     case kRegexpCharClass: {
920       CharClass* cc = re->cc();
921       if (cc->empty()) {
922         // This can't happen.
923         failed_ = true;
924         ABSL_LOG(DFATAL) << "No ranges in char class";
925         return NoMatch();
926       }
927 
928       // ASCII case-folding optimization: if the char class
929       // behaves the same on A-Z as it does on a-z,
930       // discard any ranges wholly contained in A-Z
931       // and mark the other ranges as foldascii.
932       // This reduces the size of a program for
933       // (?i)abc from 3 insts per letter to 1 per letter.
934       bool foldascii = cc->FoldsASCII();
935 
936       // Character class is just a big OR of the different
937       // character ranges in the class.
938       BeginRange();
939       for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) {
940         // ASCII case-folding optimization (see above).
941         if (foldascii && 'A' <= i->lo && i->hi <= 'Z')
942           continue;
943 
944         // If this range contains all of A-Za-z or none of it,
945         // the fold flag is unnecessary; don't bother.
946         bool fold = foldascii;
947         if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo ||
948             ('Z' < i->lo && i->hi < 'a'))
949           fold = false;
950 
951         AddRuneRange(i->lo, i->hi, fold);
952       }
953       return EndRange();
954     }
955 
956     case kRegexpCapture:
957       // If this is a non-capturing parenthesis -- (?:foo) --
958       // just use the inner expression.
959       if (re->cap() < 0)
960         return child_frags[0];
961       return Capture(child_frags[0], re->cap());
962 
963     case kRegexpBeginLine:
964       return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine);
965 
966     case kRegexpEndLine:
967       return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine);
968 
969     case kRegexpBeginText:
970       return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText);
971 
972     case kRegexpEndText:
973       return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText);
974 
975     case kRegexpWordBoundary:
976       return EmptyWidth(kEmptyWordBoundary);
977 
978     case kRegexpNoWordBoundary:
979       return EmptyWidth(kEmptyNonWordBoundary);
980   }
981   failed_ = true;
982   ABSL_LOG(DFATAL) << "Missing case in Compiler: " << re->op();
983   return NoMatch();
984 }
985 
986 // Is this regexp required to start at the beginning of the text?
987 // Only approximate; can return false for complicated regexps like (\Aa|\Ab),
988 // but handles (\A(a|b)).  Could use the Walker to write a more exact one.
IsAnchorStart(Regexp ** pre,int depth)989 static bool IsAnchorStart(Regexp** pre, int depth) {
990   Regexp* re = *pre;
991   Regexp* sub;
992   // The depth limit makes sure that we don't overflow
993   // the stack on a deeply nested regexp.  As the comment
994   // above says, IsAnchorStart is conservative, so returning
995   // a false negative is okay.  The exact limit is somewhat arbitrary.
996   if (re == NULL || depth >= 4)
997     return false;
998   switch (re->op()) {
999     default:
1000       break;
1001     case kRegexpConcat:
1002       if (re->nsub() > 0) {
1003         sub = re->sub()[0]->Incref();
1004         if (IsAnchorStart(&sub, depth+1)) {
1005           PODArray<Regexp*> subcopy(re->nsub());
1006           subcopy[0] = sub;  // already have reference
1007           for (int i = 1; i < re->nsub(); i++)
1008             subcopy[i] = re->sub()[i]->Incref();
1009           *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags());
1010           re->Decref();
1011           return true;
1012         }
1013         sub->Decref();
1014       }
1015       break;
1016     case kRegexpCapture:
1017       sub = re->sub()[0]->Incref();
1018       if (IsAnchorStart(&sub, depth+1)) {
1019         *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
1020         re->Decref();
1021         return true;
1022       }
1023       sub->Decref();
1024       break;
1025     case kRegexpBeginText:
1026       *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
1027       re->Decref();
1028       return true;
1029   }
1030   return false;
1031 }
1032 
1033 // Is this regexp required to start at the end of the text?
1034 // Only approximate; can return false for complicated regexps like (a\z|b\z),
1035 // but handles ((a|b)\z).  Could use the Walker to write a more exact one.
IsAnchorEnd(Regexp ** pre,int depth)1036 static bool IsAnchorEnd(Regexp** pre, int depth) {
1037   Regexp* re = *pre;
1038   Regexp* sub;
1039   // The depth limit makes sure that we don't overflow
1040   // the stack on a deeply nested regexp.  As the comment
1041   // above says, IsAnchorEnd is conservative, so returning
1042   // a false negative is okay.  The exact limit is somewhat arbitrary.
1043   if (re == NULL || depth >= 4)
1044     return false;
1045   switch (re->op()) {
1046     default:
1047       break;
1048     case kRegexpConcat:
1049       if (re->nsub() > 0) {
1050         sub = re->sub()[re->nsub() - 1]->Incref();
1051         if (IsAnchorEnd(&sub, depth+1)) {
1052           PODArray<Regexp*> subcopy(re->nsub());
1053           subcopy[re->nsub() - 1] = sub;  // already have reference
1054           for (int i = 0; i < re->nsub() - 1; i++)
1055             subcopy[i] = re->sub()[i]->Incref();
1056           *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags());
1057           re->Decref();
1058           return true;
1059         }
1060         sub->Decref();
1061       }
1062       break;
1063     case kRegexpCapture:
1064       sub = re->sub()[0]->Incref();
1065       if (IsAnchorEnd(&sub, depth+1)) {
1066         *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
1067         re->Decref();
1068         return true;
1069       }
1070       sub->Decref();
1071       break;
1072     case kRegexpEndText:
1073       *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
1074       re->Decref();
1075       return true;
1076   }
1077   return false;
1078 }
1079 
Setup(Regexp::ParseFlags flags,int64_t max_mem,RE2::Anchor anchor)1080 void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem,
1081                      RE2::Anchor anchor) {
1082   if (flags & Regexp::Latin1)
1083     encoding_ = kEncodingLatin1;
1084   max_mem_ = max_mem;
1085   if (max_mem <= 0) {
1086     max_ninst_ = 100000;  // more than enough
1087   } else if (static_cast<size_t>(max_mem) <= sizeof(Prog)) {
1088     // No room for anything.
1089     max_ninst_ = 0;
1090   } else {
1091     int64_t m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);
1092     // Limit instruction count so that inst->id() fits nicely in an int.
1093     // SparseArray also assumes that the indices (inst->id()) are ints.
1094     // The call to WalkExponential uses 2*max_ninst_ below,
1095     // and other places in the code use 2 or 3 * prog->size().
1096     // Limiting to 2^24 should avoid overflow in those places.
1097     // (The point of allowing more than 32 bits of memory is to
1098     // have plenty of room for the DFA states, not to use it up
1099     // on the program.)
1100     if (m >= 1<<24)
1101       m = 1<<24;
1102     // Inst imposes its own limit (currently bigger than 2^24 but be safe).
1103     if (m > Prog::Inst::kMaxInst)
1104       m = Prog::Inst::kMaxInst;
1105     max_ninst_ = static_cast<int>(m);
1106   }
1107   anchor_ = anchor;
1108 }
1109 
1110 // Compiles re, returning program.
1111 // Caller is responsible for deleting prog_.
1112 // If reversed is true, compiles a program that expects
1113 // to run over the input string backward (reverses all concatenations).
1114 // The reversed flag is also recorded in the returned program.
Compile(Regexp * re,bool reversed,int64_t max_mem)1115 Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) {
1116   Compiler c;
1117   c.Setup(re->parse_flags(), max_mem, RE2::UNANCHORED /* unused */);
1118   c.reversed_ = reversed;
1119 
1120   // Simplify to remove things like counted repetitions
1121   // and character classes like \d.
1122   Regexp* sre = re->Simplify();
1123   if (sre == NULL)
1124     return NULL;
1125 
1126   // Record whether prog is anchored, removing the anchors.
1127   // (They get in the way of other optimizations.)
1128   bool is_anchor_start = IsAnchorStart(&sre, 0);
1129   bool is_anchor_end = IsAnchorEnd(&sre, 0);
1130 
1131   // Generate fragment for entire regexp.
1132   Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_);
1133   sre->Decref();
1134   if (c.failed_)
1135     return NULL;
1136 
1137   // Success!  Finish by putting Match node at end, and record start.
1138   // Turn off c.reversed_ (if it is set) to force the remaining concatenations
1139   // to behave normally.
1140   c.reversed_ = false;
1141   all = c.Cat(all, c.Match(0));
1142 
1143   c.prog_->set_reversed(reversed);
1144   if (c.prog_->reversed()) {
1145     c.prog_->set_anchor_start(is_anchor_end);
1146     c.prog_->set_anchor_end(is_anchor_start);
1147   } else {
1148     c.prog_->set_anchor_start(is_anchor_start);
1149     c.prog_->set_anchor_end(is_anchor_end);
1150   }
1151 
1152   c.prog_->set_start(all.begin);
1153   if (!c.prog_->anchor_start()) {
1154     // Also create unanchored version, which starts with a .*? loop.
1155     all = c.Cat(c.DotStar(), all);
1156   }
1157   c.prog_->set_start_unanchored(all.begin);
1158 
1159   // Hand ownership of prog_ to caller.
1160   return c.Finish(re);
1161 }
1162 
Finish(Regexp * re)1163 Prog* Compiler::Finish(Regexp* re) {
1164   if (failed_)
1165     return NULL;
1166 
1167   if (prog_->start() == 0 && prog_->start_unanchored() == 0) {
1168     // No possible matches; keep Fail instruction only.
1169     ninst_ = 1;
1170   }
1171 
1172   // Hand off the array to Prog.
1173   prog_->inst_ = std::move(inst_);
1174   prog_->size_ = ninst_;
1175 
1176   prog_->Optimize();
1177   prog_->Flatten();
1178   prog_->ComputeByteMap();
1179 
1180   if (!prog_->reversed()) {
1181     std::string prefix;
1182     bool prefix_foldcase;
1183     if (re->RequiredPrefixForAccel(&prefix, &prefix_foldcase))
1184       prog_->ConfigurePrefixAccel(prefix, prefix_foldcase);
1185   }
1186 
1187   // Record remaining memory for DFA.
1188   if (max_mem_ <= 0) {
1189     prog_->set_dfa_mem(1<<20);
1190   } else {
1191     int64_t m = max_mem_ - sizeof(Prog);
1192     m -= prog_->size_*sizeof(Prog::Inst);  // account for inst_
1193     if (prog_->CanBitState())
1194       m -= prog_->size_*sizeof(uint16_t);  // account for list_heads_
1195     if (m < 0)
1196       m = 0;
1197     prog_->set_dfa_mem(m);
1198   }
1199 
1200   Prog* p = prog_;
1201   prog_ = NULL;
1202   return p;
1203 }
1204 
1205 // Converts Regexp to Prog.
CompileToProg(int64_t max_mem)1206 Prog* Regexp::CompileToProg(int64_t max_mem) {
1207   return Compiler::Compile(this, false, max_mem);
1208 }
1209 
CompileToReverseProg(int64_t max_mem)1210 Prog* Regexp::CompileToReverseProg(int64_t max_mem) {
1211   return Compiler::Compile(this, true, max_mem);
1212 }
1213 
DotStar()1214 Frag Compiler::DotStar() {
1215   return Star(ByteRange(0x00, 0xff, false), true);
1216 }
1217 
1218 // Compiles RE set to Prog.
CompileSet(Regexp * re,RE2::Anchor anchor,int64_t max_mem)1219 Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
1220   Compiler c;
1221   c.Setup(re->parse_flags(), max_mem, anchor);
1222 
1223   Regexp* sre = re->Simplify();
1224   if (sre == NULL)
1225     return NULL;
1226 
1227   Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_);
1228   sre->Decref();
1229   if (c.failed_)
1230     return NULL;
1231 
1232   c.prog_->set_anchor_start(true);
1233   c.prog_->set_anchor_end(true);
1234 
1235   if (anchor == RE2::UNANCHORED) {
1236     // Prepend .* or else the expression will effectively be anchored.
1237     // Complemented by the ANCHOR_BOTH case in PostVisit().
1238     all = c.Cat(c.DotStar(), all);
1239   }
1240   c.prog_->set_start(all.begin);
1241   c.prog_->set_start_unanchored(all.begin);
1242 
1243   Prog* prog = c.Finish(re);
1244   if (prog == NULL)
1245     return NULL;
1246 
1247   // Make sure DFA has enough memory to operate,
1248   // since we're not going to fall back to the NFA.
1249   bool dfa_failed = false;
1250   absl::string_view sp = "hello, world";
1251   prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch,
1252                   NULL, &dfa_failed, NULL);
1253   if (dfa_failed) {
1254     delete prog;
1255     return NULL;
1256   }
1257 
1258   return prog;
1259 }
1260 
CompileSet(Regexp * re,RE2::Anchor anchor,int64_t max_mem)1261 Prog* Prog::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
1262   return Compiler::CompileSet(re, anchor, max_mem);
1263 }
1264 
1265 }  // namespace re2
1266