1 // Copyright 2007 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Compile regular expression to Prog.
6 //
7 // Prog and Inst are defined in prog.h.
8 // This file's external interface is just Regexp::CompileToProg.
9 // The Compiler class defined in this file is private.
10
11 #include "re2/prog.h"
12 #include "re2/re2.h"
13 #include "re2/regexp.h"
14 #include "re2/walker-inl.h"
15
16 namespace re2 {
17
18 // List of pointers to Inst* that need to be filled in (patched).
19 // Because the Inst* haven't been filled in yet,
20 // we can use the Inst* word to hold the list's "next" pointer.
21 // It's kind of sleazy, but it works well in practice.
22 // See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
23 //
24 // Because the out and out1 fields in Inst are no longer pointers,
25 // we can't use pointers directly here either. Instead, p refers
26 // to inst_[p>>1].out (p&1 == 0) or inst_[p>>1].out1 (p&1 == 1).
27 // p == 0 represents the NULL list. This is okay because instruction #0
28 // is always the fail instruction, which never appears on a list.
29
30 struct PatchList {
31 uint32 p;
32
33 // Returns patch list containing just p.
34 static PatchList Mk(uint32 p);
35
36 // Patches all the entries on l to have value v.
37 // Caller must not ever use patch list again.
38 static void Patch(Prog::Inst *inst0, PatchList l, uint32 v);
39
40 // Deref returns the next pointer pointed at by p.
41 static PatchList Deref(Prog::Inst *inst0, PatchList l);
42
43 // Appends two patch lists and returns result.
44 static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2);
45 };
46
47 static PatchList nullPatchList;
48
49 // Returns patch list containing just p.
Mk(uint32 p)50 PatchList PatchList::Mk(uint32 p) {
51 PatchList l;
52 l.p = p;
53 return l;
54 }
55
56 // Returns the next pointer pointed at by l.
Deref(Prog::Inst * inst0,PatchList l)57 PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) {
58 Prog::Inst* ip = &inst0[l.p>>1];
59 if (l.p&1)
60 l.p = ip->out1();
61 else
62 l.p = ip->out();
63 return l;
64 }
65
66 // Patches all the entries on l to have value v.
Patch(Prog::Inst * inst0,PatchList l,uint32 val)67 void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32 val) {
68 while (l.p != 0) {
69 Prog::Inst* ip = &inst0[l.p>>1];
70 if (l.p&1) {
71 l.p = ip->out1();
72 ip->out1_ = val;
73 } else {
74 l.p = ip->out();
75 ip->set_out(val);
76 }
77 }
78 }
79
80 // Appends two patch lists and returns result.
Append(Prog::Inst * inst0,PatchList l1,PatchList l2)81 PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) {
82 if (l1.p == 0)
83 return l2;
84 if (l2.p == 0)
85 return l1;
86
87 PatchList l = l1;
88 for (;;) {
89 PatchList next = PatchList::Deref(inst0, l);
90 if (next.p == 0)
91 break;
92 l = next;
93 }
94
95 Prog::Inst* ip = &inst0[l.p>>1];
96 if (l.p&1)
97 ip->out1_ = l2.p;
98 else
99 ip->set_out(l2.p);
100
101 return l1;
102 }
103
104 // Compiled program fragment.
105 struct Frag {
106 uint32 begin;
107 PatchList end;
108
Fragre2::Frag109 explicit Frag(LinkerInitialized) {}
Fragre2::Frag110 Frag() : begin(0) { end.p = 0; } // needed so Frag can go in vector
Fragre2::Frag111 Frag(uint32 begin, PatchList end) : begin(begin), end(end) {}
112 };
113
114 static Frag kNullFrag(LINKER_INITIALIZED);
115
116 // Input encodings.
117 enum Encoding {
118 kEncodingUTF8 = 1, // UTF-8 (0-10FFFF)
119 kEncodingLatin1, // Latin1 (0-FF)
120 };
121
122 class Compiler : public Regexp::Walker<Frag> {
123 public:
124 explicit Compiler();
125 ~Compiler();
126
127 // Compiles Regexp to a new Prog.
128 // Caller is responsible for deleting Prog when finished with it.
129 // If reversed is true, compiles for walking over the input
130 // string backward (reverses all concatenations).
131 static Prog *Compile(Regexp* re, bool reversed, int64 max_mem);
132
133 // Compiles alternation of all the re to a new Prog.
134 // Each re has a match with an id equal to its index in the vector.
135 static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
136 Regexp* re);
137
138 // Interface for Regexp::Walker, which helps traverse the Regexp.
139 // The walk is purely post-recursive: given the machines for the
140 // children, PostVisit combines them to create the machine for
141 // the current node. The child_args are Frags.
142 // The Compiler traverses the Regexp parse tree, visiting
143 // each node in depth-first order. It invokes PreVisit before
144 // visiting the node's children and PostVisit after visiting
145 // the children.
146 Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop);
147 Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args,
148 int nchild_args);
149 Frag ShortVisit(Regexp* re, Frag parent_arg);
150 Frag Copy(Frag arg);
151
152 // Given fragment a, returns a+ or a+?; a* or a*?; a? or a??
153 Frag Plus(Frag a, bool nongreedy);
154 Frag Star(Frag a, bool nongreedy);
155 Frag Quest(Frag a, bool nongreedy);
156
157 // Given fragment a, returns (a) capturing as \n.
158 Frag Capture(Frag a, int n);
159
160 // Given fragments a and b, returns ab; a|b
161 Frag Cat(Frag a, Frag b);
162 Frag Alt(Frag a, Frag b);
163
164 // Returns a fragment that can't match anything.
165 Frag NoMatch();
166
167 // Returns a fragment that matches the empty string.
168 Frag Match(int32 id);
169
170 // Returns a no-op fragment.
171 Frag Nop();
172
173 // Returns a fragment matching the byte range lo-hi.
174 Frag ByteRange(int lo, int hi, bool foldcase);
175
176 // Returns a fragment matching an empty-width special op.
177 Frag EmptyWidth(EmptyOp op);
178
179 // Adds n instructions to the program.
180 // Returns the index of the first one.
181 // Returns -1 if no more instructions are available.
182 int AllocInst(int n);
183
184 // Deletes unused instructions.
185 void Trim();
186
187 // Rune range compiler.
188
189 // Begins a new alternation.
190 void BeginRange();
191
192 // Adds a fragment matching the rune range lo-hi.
193 void AddRuneRange(Rune lo, Rune hi, bool foldcase);
194 void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
195 void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
196 void Add_80_10ffff();
197
198 // New suffix that matches the byte range lo-hi, then goes to next.
199 int RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next);
200 int UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next);
201
202 // Adds a suffix to alternation.
203 void AddSuffix(int id);
204
205 // Returns the alternation of all the added suffixes.
206 Frag EndRange();
207
208 // Single rune.
209 Frag Literal(Rune r, bool foldcase);
210
211 void Setup(Regexp::ParseFlags, int64, RE2::Anchor);
212 Prog* Finish();
213
214 // Returns .* where dot = any byte
215 Frag DotStar();
216
217 private:
218 Prog* prog_; // Program being built.
219 bool failed_; // Did we give up compiling?
220 Encoding encoding_; // Input encoding
221 bool reversed_; // Should program run backward over text?
222
223 int max_inst_; // Maximum number of instructions.
224
225 Prog::Inst* inst_; // Pointer to first instruction.
226 int inst_len_; // Number of instructions used.
227 int inst_cap_; // Number of instructions allocated.
228
229 int64 max_mem_; // Total memory budget.
230
231 map<uint64, int> rune_cache_;
232 Frag rune_range_;
233
234 RE2::Anchor anchor_; // anchor mode for RE2::Set
235
236 DISALLOW_EVIL_CONSTRUCTORS(Compiler);
237 };
238
Compiler()239 Compiler::Compiler() {
240 prog_ = new Prog();
241 failed_ = false;
242 encoding_ = kEncodingUTF8;
243 reversed_ = false;
244 inst_ = NULL;
245 inst_len_ = 0;
246 inst_cap_ = 0;
247 max_inst_ = 1; // make AllocInst for fail instruction okay
248 max_mem_ = 0;
249 int fail = AllocInst(1);
250 inst_[fail].InitFail();
251 max_inst_ = 0; // Caller must change
252 }
253
~Compiler()254 Compiler::~Compiler() {
255 delete prog_;
256 delete[] inst_;
257 }
258
AllocInst(int n)259 int Compiler::AllocInst(int n) {
260 if (failed_ || inst_len_ + n > max_inst_) {
261 failed_ = true;
262 return -1;
263 }
264
265 if (inst_len_ + n > inst_cap_) {
266 if (inst_cap_ == 0)
267 inst_cap_ = 8;
268 while (inst_len_ + n > inst_cap_)
269 inst_cap_ *= 2;
270 Prog::Inst* ip = new Prog::Inst[inst_cap_];
271 memmove(ip, inst_, inst_len_ * sizeof ip[0]);
272 memset(ip + inst_len_, 0, (inst_cap_ - inst_len_) * sizeof ip[0]);
273 delete[] inst_;
274 inst_ = ip;
275 }
276 int id = inst_len_;
277 inst_len_ += n;
278 return id;
279 }
280
Trim()281 void Compiler::Trim() {
282 if (inst_len_ < inst_cap_) {
283 Prog::Inst* ip = new Prog::Inst[inst_len_];
284 memmove(ip, inst_, inst_len_ * sizeof ip[0]);
285 delete[] inst_;
286 inst_ = ip;
287 inst_cap_ = inst_len_;
288 }
289 }
290
291 // These routines are somewhat hard to visualize in text --
292 // see http://swtch.com/~rsc/regexp/regexp1.html for
293 // pictures explaining what is going on here.
294
295 // Returns an unmatchable fragment.
NoMatch()296 Frag Compiler::NoMatch() {
297 return Frag(0, nullPatchList);
298 }
299
300 // Is a an unmatchable fragment?
IsNoMatch(Frag a)301 static bool IsNoMatch(Frag a) {
302 return a.begin == 0;
303 }
304
305 // Given fragments a and b, returns fragment for ab.
Cat(Frag a,Frag b)306 Frag Compiler::Cat(Frag a, Frag b) {
307 if (IsNoMatch(a) || IsNoMatch(b))
308 return NoMatch();
309
310 // Elide no-op.
311 Prog::Inst* begin = &inst_[a.begin];
312 if (begin->opcode() == kInstNop &&
313 a.end.p == (a.begin << 1) &&
314 begin->out() == 0) {
315 PatchList::Patch(inst_, a.end, b.begin); // in case refs to a somewhere
316 return b;
317 }
318
319 // To run backward over string, reverse all concatenations.
320 if (reversed_) {
321 PatchList::Patch(inst_, b.end, a.begin);
322 return Frag(b.begin, a.end);
323 }
324
325 PatchList::Patch(inst_, a.end, b.begin);
326 return Frag(a.begin, b.end);
327 }
328
329 // Given fragments for a and b, returns fragment for a|b.
Alt(Frag a,Frag b)330 Frag Compiler::Alt(Frag a, Frag b) {
331 // Special case for convenience in loops.
332 if (IsNoMatch(a))
333 return b;
334 if (IsNoMatch(b))
335 return a;
336
337 int id = AllocInst(1);
338 if (id < 0)
339 return NoMatch();
340
341 inst_[id].InitAlt(a.begin, b.begin);
342 return Frag(id, PatchList::Append(inst_, a.end, b.end));
343 }
344
345 // When capturing submatches in like-Perl mode, a kOpAlt Inst
346 // treats out_ as the first choice, out1_ as the second.
347 //
348 // For *, +, and ?, if out_ causes another repetition,
349 // then the operator is greedy. If out1_ is the repetition
350 // (and out_ moves forward), then the operator is non-greedy.
351
352 // Given a fragment a, returns a fragment for a* or a*? (if nongreedy)
Star(Frag a,bool nongreedy)353 Frag Compiler::Star(Frag a, bool nongreedy) {
354 int id = AllocInst(1);
355 if (id < 0)
356 return NoMatch();
357 inst_[id].InitAlt(0, 0);
358 PatchList::Patch(inst_, a.end, id);
359 if (nongreedy) {
360 inst_[id].out1_ = a.begin;
361 return Frag(id, PatchList::Mk(id << 1));
362 } else {
363 inst_[id].set_out(a.begin);
364 return Frag(id, PatchList::Mk((id << 1) | 1));
365 }
366 }
367
368 // Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
Plus(Frag a,bool nongreedy)369 Frag Compiler::Plus(Frag a, bool nongreedy) {
370 // a+ is just a* with a different entry point.
371 Frag f = Star(a, nongreedy);
372 return Frag(a.begin, f.end);
373 }
374
375 // Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
Quest(Frag a,bool nongreedy)376 Frag Compiler::Quest(Frag a, bool nongreedy) {
377 int id = AllocInst(1);
378 if (id < 0)
379 return NoMatch();
380 PatchList pl;
381 if (nongreedy) {
382 inst_[id].InitAlt(0, a.begin);
383 pl = PatchList::Mk(id << 1);
384 } else {
385 inst_[id].InitAlt(a.begin, 0);
386 pl = PatchList::Mk((id << 1) | 1);
387 }
388 return Frag(id, PatchList::Append(inst_, pl, a.end));
389 }
390
391 // Returns a fragment for the byte range lo-hi.
ByteRange(int lo,int hi,bool foldcase)392 Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
393 int id = AllocInst(1);
394 if (id < 0)
395 return NoMatch();
396 inst_[id].InitByteRange(lo, hi, foldcase, 0);
397 prog_->byte_inst_count_++;
398 prog_->MarkByteRange(lo, hi);
399 if (foldcase && lo <= 'z' && hi >= 'a') {
400 if (lo < 'a')
401 lo = 'a';
402 if (hi > 'z')
403 hi = 'z';
404 if (lo <= hi)
405 prog_->MarkByteRange(lo + 'A' - 'a', hi + 'A' - 'a');
406 }
407 return Frag(id, PatchList::Mk(id << 1));
408 }
409
410 // Returns a no-op fragment. Sometimes unavoidable.
Nop()411 Frag Compiler::Nop() {
412 int id = AllocInst(1);
413 if (id < 0)
414 return NoMatch();
415 inst_[id].InitNop(0);
416 return Frag(id, PatchList::Mk(id << 1));
417 }
418
419 // Returns a fragment that signals a match.
Match(int32 match_id)420 Frag Compiler::Match(int32 match_id) {
421 int id = AllocInst(1);
422 if (id < 0)
423 return NoMatch();
424 inst_[id].InitMatch(match_id);
425 return Frag(id, nullPatchList);
426 }
427
428 // Returns a fragment matching a particular empty-width op (like ^ or $)
EmptyWidth(EmptyOp empty)429 Frag Compiler::EmptyWidth(EmptyOp empty) {
430 int id = AllocInst(1);
431 if (id < 0)
432 return NoMatch();
433 inst_[id].InitEmptyWidth(empty, 0);
434 if (empty & (kEmptyBeginLine|kEmptyEndLine))
435 prog_->MarkByteRange('\n', '\n');
436 if (empty & (kEmptyWordBoundary|kEmptyNonWordBoundary)) {
437 int j;
438 for (int i = 0; i < 256; i = j) {
439 for (j = i+1; j < 256 && Prog::IsWordChar(i) == Prog::IsWordChar(j); j++)
440 ;
441 prog_->MarkByteRange(i, j-1);
442 }
443 }
444 return Frag(id, PatchList::Mk(id << 1));
445 }
446
447 // Given a fragment a, returns a fragment with capturing parens around a.
Capture(Frag a,int n)448 Frag Compiler::Capture(Frag a, int n) {
449 int id = AllocInst(2);
450 if (id < 0)
451 return NoMatch();
452 inst_[id].InitCapture(2*n, a.begin);
453 inst_[id+1].InitCapture(2*n+1, 0);
454 PatchList::Patch(inst_, a.end, id+1);
455
456 return Frag(id, PatchList::Mk((id+1) << 1));
457 }
458
459 // A Rune is a name for a Unicode code point.
460 // Returns maximum rune encoded by UTF-8 sequence of length len.
MaxRune(int len)461 static int MaxRune(int len) {
462 int b; // number of Rune bits in len-byte UTF-8 sequence (len < UTFmax)
463 if (len == 1)
464 b = 7;
465 else
466 b = 8-(len+1) + 6*(len-1);
467 return (1<<b) - 1; // maximum Rune for b bits.
468 }
469
470 // The rune range compiler caches common suffix fragments,
471 // which are very common in UTF-8 (e.g., [80-bf]).
472 // The fragment suffixes are identified by their start
473 // instructions. NULL denotes the eventual end match.
474 // The Frag accumulates in rune_range_. Caching common
475 // suffixes reduces the UTF-8 "." from 32 to 24 instructions,
476 // and it reduces the corresponding one-pass NFA from 16 nodes to 8.
477
BeginRange()478 void Compiler::BeginRange() {
479 rune_cache_.clear();
480 rune_range_.begin = 0;
481 rune_range_.end = nullPatchList;
482 }
483
UncachedRuneByteSuffix(uint8 lo,uint8 hi,bool foldcase,int next)484 int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase,
485 int next) {
486 Frag f = ByteRange(lo, hi, foldcase);
487 if (next != 0) {
488 PatchList::Patch(inst_, f.end, next);
489 } else {
490 rune_range_.end = PatchList::Append(inst_, rune_range_.end, f.end);
491 }
492 return f.begin;
493 }
494
RuneByteSuffix(uint8 lo,uint8 hi,bool foldcase,int next)495 int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) {
496 // In Latin1 mode, there's no point in caching.
497 // In forward UTF-8 mode, only need to cache continuation bytes.
498 if (encoding_ == kEncodingLatin1 ||
499 (encoding_ == kEncodingUTF8 &&
500 !reversed_ &&
501 !(0x80 <= lo && hi <= 0xbf))) {
502 return UncachedRuneByteSuffix(lo, hi, foldcase, next);
503 }
504
505 uint64 key = ((uint64)next << 17) | (lo<<9) | (hi<<1) | foldcase;
506 map<uint64, int>::iterator it = rune_cache_.find(key);
507 if (it != rune_cache_.end())
508 return it->second;
509 int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
510 rune_cache_[key] = id;
511 return id;
512 }
513
AddSuffix(int id)514 void Compiler::AddSuffix(int id) {
515 if (rune_range_.begin == 0) {
516 rune_range_.begin = id;
517 return;
518 }
519
520 int alt = AllocInst(1);
521 if (alt < 0) {
522 rune_range_.begin = 0;
523 return;
524 }
525 inst_[alt].InitAlt(rune_range_.begin, id);
526 rune_range_.begin = alt;
527 }
528
EndRange()529 Frag Compiler::EndRange() {
530 return rune_range_;
531 }
532
533 // Converts rune range lo-hi into a fragment that recognizes
534 // the bytes that would make up those runes in the current
535 // encoding (Latin 1 or UTF-8).
536 // This lets the machine work byte-by-byte even when
537 // using multibyte encodings.
538
AddRuneRange(Rune lo,Rune hi,bool foldcase)539 void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) {
540 switch (encoding_) {
541 default:
542 case kEncodingUTF8:
543 AddRuneRangeUTF8(lo, hi, foldcase);
544 break;
545 case kEncodingLatin1:
546 AddRuneRangeLatin1(lo, hi, foldcase);
547 break;
548 }
549 }
550
AddRuneRangeLatin1(Rune lo,Rune hi,bool foldcase)551 void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
552 // Latin1 is easy: runes *are* bytes.
553 if (lo > hi || lo > 0xFF)
554 return;
555 if (hi > 0xFF)
556 hi = 0xFF;
557 AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0));
558 }
559
560 // Table describing how to make a UTF-8 matching machine
561 // for the rune range 80-10FFFF (Runeself-Runemax).
562 // This range happens frequently enough (for example /./ and /[^a-z]/)
563 // and the rune_cache_ map is slow enough that this is worth
564 // special handling. Makes compilation of a small expression
565 // with a dot in it about 10% faster.
566 // The * in the comments below mark whole sequences.
567 static struct ByteRangeProg {
568 int next;
569 int lo;
570 int hi;
571 } prog_80_10ffff[] = {
572 // Two-byte
573 { -1, 0x80, 0xBF, }, // 0: 80-BF
574 { 0, 0xC2, 0xDF, }, // 1: C2-DF 80-BF*
575
576 // Three-byte
577 { 0, 0xA0, 0xBF, }, // 2: A0-BF 80-BF
578 { 2, 0xE0, 0xE0, }, // 3: E0 A0-BF 80-BF*
579 { 0, 0x80, 0xBF, }, // 4: 80-BF 80-BF
580 { 4, 0xE1, 0xEF, }, // 5: E1-EF 80-BF 80-BF*
581
582 // Four-byte
583 { 4, 0x90, 0xBF, }, // 6: 90-BF 80-BF 80-BF
584 { 6, 0xF0, 0xF0, }, // 7: F0 90-BF 80-BF 80-BF*
585 { 4, 0x80, 0xBF, }, // 8: 80-BF 80-BF 80-BF
586 { 8, 0xF1, 0xF3, }, // 9: F1-F3 80-BF 80-BF 80-BF*
587 { 4, 0x80, 0x8F, }, // 10: 80-8F 80-BF 80-BF
588 { 10, 0xF4, 0xF4, }, // 11: F4 80-8F 80-BF 80-BF*
589 };
590
Add_80_10ffff()591 void Compiler::Add_80_10ffff() {
592 int inst[arraysize(prog_80_10ffff)] = { 0 }; // does not need to be initialized; silences gcc warning
593 for (int i = 0; i < arraysize(prog_80_10ffff); i++) {
594 const ByteRangeProg& p = prog_80_10ffff[i];
595 int next = 0;
596 if (p.next >= 0)
597 next = inst[p.next];
598 inst[i] = UncachedRuneByteSuffix(p.lo, p.hi, false, next);
599 if ((p.lo & 0xC0) != 0x80)
600 AddSuffix(inst[i]);
601 }
602 }
603
AddRuneRangeUTF8(Rune lo,Rune hi,bool foldcase)604 void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
605 if (lo > hi)
606 return;
607
608 // Pick off 80-10FFFF as a common special case
609 // that can bypass the slow rune_cache_.
610 if (lo == 0x80 && hi == 0x10ffff && !reversed_) {
611 Add_80_10ffff();
612 return;
613 }
614
615 // Split range into same-length sized ranges.
616 for (int i = 1; i < UTFmax; i++) {
617 Rune max = MaxRune(i);
618 if (lo <= max && max < hi) {
619 AddRuneRangeUTF8(lo, max, foldcase);
620 AddRuneRangeUTF8(max+1, hi, foldcase);
621 return;
622 }
623 }
624
625 // ASCII range is always a special case.
626 if (hi < Runeself) {
627 AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0));
628 return;
629 }
630
631 // Split range into sections that agree on leading bytes.
632 for (int i = 1; i < UTFmax; i++) {
633 uint m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence
634 if ((lo & ~m) != (hi & ~m)) {
635 if ((lo & m) != 0) {
636 AddRuneRangeUTF8(lo, lo|m, foldcase);
637 AddRuneRangeUTF8((lo|m)+1, hi, foldcase);
638 return;
639 }
640 if ((hi & m) != m) {
641 AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase);
642 AddRuneRangeUTF8(hi&~m, hi, foldcase);
643 return;
644 }
645 }
646 }
647
648 // Finally. Generate byte matching equivalent for lo-hi.
649 uint8 ulo[UTFmax], uhi[UTFmax];
650 int n = runetochar(reinterpret_cast<char*>(ulo), &lo);
651 int m = runetochar(reinterpret_cast<char*>(uhi), &hi);
652 (void)m; // USED(m)
653 DCHECK_EQ(n, m);
654
655 int id = 0;
656 if (reversed_) {
657 for (int i = 0; i < n; i++)
658 id = RuneByteSuffix(ulo[i], uhi[i], false, id);
659 } else {
660 for (int i = n-1; i >= 0; i--)
661 id = RuneByteSuffix(ulo[i], uhi[i], false, id);
662 }
663 AddSuffix(id);
664 }
665
666 // Should not be called.
Copy(Frag arg)667 Frag Compiler::Copy(Frag arg) {
668 // We're using WalkExponential; there should be no copying.
669 LOG(DFATAL) << "Compiler::Copy called!";
670 failed_ = true;
671 return NoMatch();
672 }
673
674 // Visits a node quickly; called once WalkExponential has
675 // decided to cut this walk short.
ShortVisit(Regexp * re,Frag)676 Frag Compiler::ShortVisit(Regexp* re, Frag) {
677 failed_ = true;
678 return NoMatch();
679 }
680
681 // Called before traversing a node's children during the walk.
PreVisit(Regexp * re,Frag,bool * stop)682 Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) {
683 // Cut off walk if we've already failed.
684 if (failed_)
685 *stop = true;
686
687 return kNullFrag; // not used by caller
688 }
689
Literal(Rune r,bool foldcase)690 Frag Compiler::Literal(Rune r, bool foldcase) {
691 switch (encoding_) {
692 default:
693 return kNullFrag;
694
695 case kEncodingLatin1:
696 return ByteRange(r, r, foldcase);
697
698 case kEncodingUTF8: {
699 if (r < Runeself) // Make common case fast.
700 return ByteRange(r, r, foldcase);
701 uint8 buf[UTFmax];
702 int n = runetochar(reinterpret_cast<char*>(buf), &r);
703 Frag f = ByteRange((uint8)buf[0], buf[0], false);
704 for (int i = 1; i < n; i++)
705 f = Cat(f, ByteRange((uint8)buf[i], buf[i], false));
706 return f;
707 }
708 }
709 }
710
711 // Called after traversing the node's children during the walk.
712 // Given their frags, build and return the frag for this re.
PostVisit(Regexp * re,Frag,Frag,Frag * child_frags,int nchild_frags)713 Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
714 int nchild_frags) {
715 // If a child failed, don't bother going forward, especially
716 // since the child_frags might contain Frags with NULLs in them.
717 if (failed_)
718 return NoMatch();
719
720 // Given the child fragments, return the fragment for this node.
721 switch (re->op()) {
722 case kRegexpRepeat:
723 // Should not see; code at bottom of function will print error
724 break;
725
726 case kRegexpNoMatch:
727 return NoMatch();
728
729 case kRegexpEmptyMatch:
730 return Nop();
731
732 case kRegexpHaveMatch: {
733 Frag f = Match(re->match_id());
734 // Remember unanchored match to end of string.
735 if (anchor_ != RE2::ANCHOR_BOTH)
736 f = Cat(DotStar(), Cat(EmptyWidth(kEmptyEndText), f));
737 return f;
738 }
739
740 case kRegexpConcat: {
741 Frag f = child_frags[0];
742 for (int i = 1; i < nchild_frags; i++)
743 f = Cat(f, child_frags[i]);
744 return f;
745 }
746
747 case kRegexpAlternate: {
748 Frag f = child_frags[0];
749 for (int i = 1; i < nchild_frags; i++)
750 f = Alt(f, child_frags[i]);
751 return f;
752 }
753
754 case kRegexpStar:
755 return Star(child_frags[0], re->parse_flags()&Regexp::NonGreedy);
756
757 case kRegexpPlus:
758 return Plus(child_frags[0], re->parse_flags()&Regexp::NonGreedy);
759
760 case kRegexpQuest:
761 return Quest(child_frags[0], re->parse_flags()&Regexp::NonGreedy);
762
763 case kRegexpLiteral:
764 return Literal(re->rune(), re->parse_flags()&Regexp::FoldCase);
765
766 case kRegexpLiteralString: {
767 // Concatenation of literals.
768 if (re->nrunes() == 0)
769 return Nop();
770 Frag f;
771 for (int i = 0; i < re->nrunes(); i++) {
772 Frag f1 = Literal(re->runes()[i], re->parse_flags()&Regexp::FoldCase);
773 if (i == 0)
774 f = f1;
775 else
776 f = Cat(f, f1);
777 }
778 return f;
779 }
780
781 case kRegexpAnyChar:
782 BeginRange();
783 AddRuneRange(0, Runemax, false);
784 return EndRange();
785
786 case kRegexpAnyByte:
787 return ByteRange(0x00, 0xFF, false);
788
789 case kRegexpCharClass: {
790 CharClass* cc = re->cc();
791 if (cc->empty()) {
792 // This can't happen.
793 LOG(DFATAL) << "No ranges in char class";
794 failed_ = true;
795 return NoMatch();
796 }
797
798 // ASCII case-folding optimization: if the char class
799 // behaves the same on A-Z as it does on a-z,
800 // discard any ranges wholly contained in A-Z
801 // and mark the other ranges as foldascii.
802 // This reduces the size of a program for
803 // (?i)abc from 3 insts per letter to 1 per letter.
804 bool foldascii = cc->FoldsASCII();
805
806 // Character class is just a big OR of the different
807 // character ranges in the class.
808 BeginRange();
809 for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) {
810 // ASCII case-folding optimization (see above).
811 if (foldascii && 'A' <= i->lo && i->hi <= 'Z')
812 continue;
813
814 // If this range contains all of A-Za-z or none of it,
815 // the fold flag is unnecessary; don't bother.
816 bool fold = foldascii;
817 if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo)
818 fold = false;
819
820 AddRuneRange(i->lo, i->hi, fold);
821 }
822 return EndRange();
823 }
824
825 case kRegexpCapture:
826 // If this is a non-capturing parenthesis -- (?:foo) --
827 // just use the inner expression.
828 if (re->cap() < 0)
829 return child_frags[0];
830 return Capture(child_frags[0], re->cap());
831
832 case kRegexpBeginLine:
833 return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine);
834
835 case kRegexpEndLine:
836 return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine);
837
838 case kRegexpBeginText:
839 return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText);
840
841 case kRegexpEndText:
842 return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText);
843
844 case kRegexpWordBoundary:
845 return EmptyWidth(kEmptyWordBoundary);
846
847 case kRegexpNoWordBoundary:
848 return EmptyWidth(kEmptyNonWordBoundary);
849 }
850 LOG(DFATAL) << "Missing case in Compiler: " << re->op();
851 failed_ = true;
852 return NoMatch();
853 }
854
855 // Is this regexp required to start at the beginning of the text?
856 // Only approximate; can return false for complicated regexps like (\Aa|\Ab),
857 // but handles (\A(a|b)). Could use the Walker to write a more exact one.
IsAnchorStart(Regexp ** pre,int depth)858 static bool IsAnchorStart(Regexp** pre, int depth) {
859 Regexp* re = *pre;
860 Regexp* sub;
861 // The depth limit makes sure that we don't overflow
862 // the stack on a deeply nested regexp. As the comment
863 // above says, IsAnchorStart is conservative, so returning
864 // a false negative is okay. The exact limit is somewhat arbitrary.
865 if (re == NULL || depth >= 4)
866 return false;
867 switch (re->op()) {
868 default:
869 break;
870 case kRegexpConcat:
871 if (re->nsub() > 0) {
872 sub = re->sub()[0]->Incref();
873 if (IsAnchorStart(&sub, depth+1)) {
874 Regexp** subcopy = new Regexp*[re->nsub()];
875 subcopy[0] = sub; // already have reference
876 for (int i = 1; i < re->nsub(); i++)
877 subcopy[i] = re->sub()[i]->Incref();
878 *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags());
879 delete[] subcopy;
880 re->Decref();
881 return true;
882 }
883 sub->Decref();
884 }
885 break;
886 case kRegexpCapture:
887 sub = re->sub()[0]->Incref();
888 if (IsAnchorStart(&sub, depth+1)) {
889 *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
890 re->Decref();
891 return true;
892 }
893 sub->Decref();
894 break;
895 case kRegexpBeginText:
896 *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
897 re->Decref();
898 return true;
899 }
900 return false;
901 }
902
903 // Is this regexp required to start at the end of the text?
904 // Only approximate; can return false for complicated regexps like (a\z|b\z),
905 // but handles ((a|b)\z). Could use the Walker to write a more exact one.
IsAnchorEnd(Regexp ** pre,int depth)906 static bool IsAnchorEnd(Regexp** pre, int depth) {
907 Regexp* re = *pre;
908 Regexp* sub;
909 // The depth limit makes sure that we don't overflow
910 // the stack on a deeply nested regexp. As the comment
911 // above says, IsAnchorEnd is conservative, so returning
912 // a false negative is okay. The exact limit is somewhat arbitrary.
913 if (re == NULL || depth >= 4)
914 return false;
915 switch (re->op()) {
916 default:
917 break;
918 case kRegexpConcat:
919 if (re->nsub() > 0) {
920 sub = re->sub()[re->nsub() - 1]->Incref();
921 if (IsAnchorEnd(&sub, depth+1)) {
922 Regexp** subcopy = new Regexp*[re->nsub()];
923 subcopy[re->nsub() - 1] = sub; // already have reference
924 for (int i = 0; i < re->nsub() - 1; i++)
925 subcopy[i] = re->sub()[i]->Incref();
926 *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags());
927 delete[] subcopy;
928 re->Decref();
929 return true;
930 }
931 sub->Decref();
932 }
933 break;
934 case kRegexpCapture:
935 sub = re->sub()[0]->Incref();
936 if (IsAnchorEnd(&sub, depth+1)) {
937 *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
938 re->Decref();
939 return true;
940 }
941 sub->Decref();
942 break;
943 case kRegexpEndText:
944 *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
945 re->Decref();
946 return true;
947 }
948 return false;
949 }
950
Setup(Regexp::ParseFlags flags,int64 max_mem,RE2::Anchor anchor)951 void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem,
952 RE2::Anchor anchor) {
953 prog_->set_flags(flags);
954
955 if (flags & Regexp::Latin1)
956 encoding_ = kEncodingLatin1;
957 max_mem_ = max_mem;
958 if (max_mem <= 0) {
959 max_inst_ = 100000; // more than enough
960 } else if (max_mem <= sizeof(Prog)) {
961 // No room for anything.
962 max_inst_ = 0;
963 } else {
964 int64 m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);
965 // Limit instruction count so that inst->id() fits nicely in an int.
966 // SparseArray also assumes that the indices (inst->id()) are ints.
967 // The call to WalkExponential uses 2*max_inst_ below,
968 // and other places in the code use 2 or 3 * prog->size().
969 // Limiting to 2^24 should avoid overflow in those places.
970 // (The point of allowing more than 32 bits of memory is to
971 // have plenty of room for the DFA states, not to use it up
972 // on the program.)
973 if (m >= 1<<24)
974 m = 1<<24;
975
976 // Inst imposes its own limit (currently bigger than 2^24 but be safe).
977 if (m > Prog::Inst::kMaxInst)
978 m = Prog::Inst::kMaxInst;
979
980 max_inst_ = m;
981 }
982
983 anchor_ = anchor;
984 }
985
986 // Compiles re, returning program.
987 // Caller is responsible for deleting prog_.
988 // If reversed is true, compiles a program that expects
989 // to run over the input string backward (reverses all concatenations).
990 // The reversed flag is also recorded in the returned program.
Compile(Regexp * re,bool reversed,int64 max_mem)991 Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) {
992 Compiler c;
993
994 c.Setup(re->parse_flags(), max_mem, RE2::ANCHOR_BOTH /* unused */);
995 c.reversed_ = reversed;
996
997 // Simplify to remove things like counted repetitions
998 // and character classes like \d.
999 Regexp* sre = re->Simplify();
1000 if (sre == NULL)
1001 return NULL;
1002
1003 // Record whether prog is anchored, removing the anchors.
1004 // (They get in the way of other optimizations.)
1005 bool is_anchor_start = IsAnchorStart(&sre, 0);
1006 bool is_anchor_end = IsAnchorEnd(&sre, 0);
1007
1008 // Generate fragment for entire regexp.
1009 Frag f = c.WalkExponential(sre, kNullFrag, 2*c.max_inst_);
1010 sre->Decref();
1011 if (c.failed_)
1012 return NULL;
1013
1014 // Success! Finish by putting Match node at end, and record start.
1015 // Turn off c.reversed_ (if it is set) to force the remaining concatenations
1016 // to behave normally.
1017 c.reversed_ = false;
1018 Frag all = c.Cat(f, c.Match(0));
1019 c.prog_->set_start(all.begin);
1020
1021 if (reversed) {
1022 c.prog_->set_anchor_start(is_anchor_end);
1023 c.prog_->set_anchor_end(is_anchor_start);
1024 } else {
1025 c.prog_->set_anchor_start(is_anchor_start);
1026 c.prog_->set_anchor_end(is_anchor_end);
1027 }
1028
1029 // Also create unanchored version, which starts with a .*? loop.
1030 if (c.prog_->anchor_start()) {
1031 c.prog_->set_start_unanchored(c.prog_->start());
1032 } else {
1033 Frag unanchored = c.Cat(c.DotStar(), all);
1034 c.prog_->set_start_unanchored(unanchored.begin);
1035 }
1036
1037 c.prog_->set_reversed(reversed);
1038
1039 // Hand ownership of prog_ to caller.
1040 return c.Finish();
1041 }
1042
Finish()1043 Prog* Compiler::Finish() {
1044 if (failed_)
1045 return NULL;
1046
1047 if (prog_->start() == 0 && prog_->start_unanchored() == 0) {
1048 // No possible matches; keep Fail instruction only.
1049 inst_len_ = 1;
1050 }
1051
1052 // Trim instruction to minimum array and transfer to Prog.
1053 Trim();
1054 prog_->inst_ = inst_;
1055 prog_->size_ = inst_len_;
1056 inst_ = NULL;
1057
1058 // Compute byte map.
1059 prog_->ComputeByteMap();
1060
1061 prog_->Optimize();
1062
1063 // Record remaining memory for DFA.
1064 if (max_mem_ <= 0) {
1065 prog_->set_dfa_mem(1<<20);
1066 } else {
1067 int64 m = max_mem_ - sizeof(Prog) - inst_len_*sizeof(Prog::Inst);
1068 if (m < 0)
1069 m = 0;
1070 prog_->set_dfa_mem(m);
1071 }
1072
1073 Prog* p = prog_;
1074 prog_ = NULL;
1075 return p;
1076 }
1077
1078 // Converts Regexp to Prog.
CompileToProg(int64 max_mem)1079 Prog* Regexp::CompileToProg(int64 max_mem) {
1080 return Compiler::Compile(this, false, max_mem);
1081 }
1082
CompileToReverseProg(int64 max_mem)1083 Prog* Regexp::CompileToReverseProg(int64 max_mem) {
1084 return Compiler::Compile(this, true, max_mem);
1085 }
1086
DotStar()1087 Frag Compiler::DotStar() {
1088 return Star(ByteRange(0x00, 0xff, false), true);
1089 }
1090
1091 // Compiles RE set to Prog.
CompileSet(const RE2::Options & options,RE2::Anchor anchor,Regexp * re)1092 Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
1093 Regexp* re) {
1094 Compiler c;
1095
1096 Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(options.ParseFlags());
1097 c.Setup(pf, options.max_mem(), anchor);
1098
1099 // Compile alternation of fragments.
1100 Frag all = c.WalkExponential(re, kNullFrag, 2*c.max_inst_);
1101 re->Decref();
1102 if (c.failed_)
1103 return NULL;
1104
1105 if (anchor == RE2::UNANCHORED) {
1106 // The trailing .* was added while handling kRegexpHaveMatch.
1107 // We just have to add the leading one.
1108 all = c.Cat(c.DotStar(), all);
1109 }
1110
1111 c.prog_->set_start(all.begin);
1112 c.prog_->set_start_unanchored(all.begin);
1113 c.prog_->set_anchor_start(true);
1114 c.prog_->set_anchor_end(true);
1115
1116 Prog* prog = c.Finish();
1117 if (prog == NULL)
1118 return NULL;
1119
1120 // Make sure DFA has enough memory to operate,
1121 // since we're not going to fall back to the NFA.
1122 bool failed;
1123 StringPiece sp = "hello, world";
1124 prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch,
1125 NULL, &failed, NULL);
1126 if (failed) {
1127 delete prog;
1128 return NULL;
1129 }
1130
1131 return prog;
1132 }
1133
CompileSet(const RE2::Options & options,RE2::Anchor anchor,Regexp * re)1134 Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
1135 Regexp* re) {
1136 return Compiler::CompileSet(options, anchor, re);
1137 }
1138
1139 } // namespace re2
1140