1 // Copyright 2007 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Compiled regular expression representation.
6 // Tested by compile_test.cc
7
8 #include "re2/prog.h"
9
10 #include <stdint.h>
11 #include <string.h>
12 #include <algorithm>
13 #include <memory>
14 #include <utility>
15
16 #include "util/util.h"
17 #include "util/logging.h"
18 #include "util/strutil.h"
19 #include "re2/bitmap256.h"
20 #include "re2/stringpiece.h"
21
22 namespace re2 {
23
24 // Constructors per Inst opcode
25
InitAlt(uint32_t out,uint32_t out1)26 void Prog::Inst::InitAlt(uint32_t out, uint32_t out1) {
27 DCHECK_EQ(out_opcode_, 0);
28 set_out_opcode(out, kInstAlt);
29 out1_ = out1;
30 }
31
InitByteRange(int lo,int hi,int foldcase,uint32_t out)32 void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32_t out) {
33 DCHECK_EQ(out_opcode_, 0);
34 set_out_opcode(out, kInstByteRange);
35 lo_ = lo & 0xFF;
36 hi_ = hi & 0xFF;
37 hint_foldcase_ = foldcase&1;
38 }
39
InitCapture(int cap,uint32_t out)40 void Prog::Inst::InitCapture(int cap, uint32_t out) {
41 DCHECK_EQ(out_opcode_, 0);
42 set_out_opcode(out, kInstCapture);
43 cap_ = cap;
44 }
45
InitEmptyWidth(EmptyOp empty,uint32_t out)46 void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32_t out) {
47 DCHECK_EQ(out_opcode_, 0);
48 set_out_opcode(out, kInstEmptyWidth);
49 empty_ = empty;
50 }
51
InitMatch(int32_t id)52 void Prog::Inst::InitMatch(int32_t id) {
53 DCHECK_EQ(out_opcode_, 0);
54 set_opcode(kInstMatch);
55 match_id_ = id;
56 }
57
InitNop(uint32_t out)58 void Prog::Inst::InitNop(uint32_t out) {
59 DCHECK_EQ(out_opcode_, 0);
60 set_opcode(kInstNop);
61 }
62
InitFail()63 void Prog::Inst::InitFail() {
64 DCHECK_EQ(out_opcode_, 0);
65 set_opcode(kInstFail);
66 }
67
Dump()68 std::string Prog::Inst::Dump() {
69 switch (opcode()) {
70 default:
71 return StringPrintf("opcode %d", static_cast<int>(opcode()));
72
73 case kInstAlt:
74 return StringPrintf("alt -> %d | %d", out(), out1_);
75
76 case kInstAltMatch:
77 return StringPrintf("altmatch -> %d | %d", out(), out1_);
78
79 case kInstByteRange:
80 return StringPrintf("byte%s [%02x-%02x] %d -> %d",
81 foldcase() ? "/i" : "",
82 lo_, hi_, hint(), out());
83
84 case kInstCapture:
85 return StringPrintf("capture %d -> %d", cap_, out());
86
87 case kInstEmptyWidth:
88 return StringPrintf("emptywidth %#x -> %d",
89 static_cast<int>(empty_), out());
90
91 case kInstMatch:
92 return StringPrintf("match! %d", match_id());
93
94 case kInstNop:
95 return StringPrintf("nop -> %d", out());
96
97 case kInstFail:
98 return StringPrintf("fail");
99 }
100 }
101
Prog()102 Prog::Prog()
103 : anchor_start_(false),
104 anchor_end_(false),
105 reversed_(false),
106 did_flatten_(false),
107 did_onepass_(false),
108 start_(0),
109 start_unanchored_(0),
110 size_(0),
111 bytemap_range_(0),
112 first_byte_(-1),
113 flags_(0),
114 list_count_(0),
115 dfa_mem_(0),
116 dfa_first_(NULL),
117 dfa_longest_(NULL) {
118 }
119
~Prog()120 Prog::~Prog() {
121 DeleteDFA(dfa_longest_);
122 DeleteDFA(dfa_first_);
123 }
124
125 typedef SparseSet Workq;
126
AddToQueue(Workq * q,int id)127 static inline void AddToQueue(Workq* q, int id) {
128 if (id != 0)
129 q->insert(id);
130 }
131
ProgToString(Prog * prog,Workq * q)132 static std::string ProgToString(Prog* prog, Workq* q) {
133 std::string s;
134 for (Workq::iterator i = q->begin(); i != q->end(); ++i) {
135 int id = *i;
136 Prog::Inst* ip = prog->inst(id);
137 s += StringPrintf("%d. %s\n", id, ip->Dump().c_str());
138 AddToQueue(q, ip->out());
139 if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch)
140 AddToQueue(q, ip->out1());
141 }
142 return s;
143 }
144
FlattenedProgToString(Prog * prog,int start)145 static std::string FlattenedProgToString(Prog* prog, int start) {
146 std::string s;
147 for (int id = start; id < prog->size(); id++) {
148 Prog::Inst* ip = prog->inst(id);
149 if (ip->last())
150 s += StringPrintf("%d. %s\n", id, ip->Dump().c_str());
151 else
152 s += StringPrintf("%d+ %s\n", id, ip->Dump().c_str());
153 }
154 return s;
155 }
156
Dump()157 std::string Prog::Dump() {
158 if (did_flatten_)
159 return FlattenedProgToString(this, start_);
160
161 Workq q(size_);
162 AddToQueue(&q, start_);
163 return ProgToString(this, &q);
164 }
165
DumpUnanchored()166 std::string Prog::DumpUnanchored() {
167 if (did_flatten_)
168 return FlattenedProgToString(this, start_unanchored_);
169
170 Workq q(size_);
171 AddToQueue(&q, start_unanchored_);
172 return ProgToString(this, &q);
173 }
174
DumpByteMap()175 std::string Prog::DumpByteMap() {
176 std::string map;
177 for (int c = 0; c < 256; c++) {
178 int b = bytemap_[c];
179 int lo = c;
180 while (c < 256-1 && bytemap_[c+1] == b)
181 c++;
182 int hi = c;
183 map += StringPrintf("[%02x-%02x] -> %d\n", lo, hi, b);
184 }
185 return map;
186 }
187
first_byte()188 int Prog::first_byte() {
189 std::call_once(first_byte_once_, [](Prog* prog) {
190 prog->first_byte_ = prog->ComputeFirstByte();
191 }, this);
192 return first_byte_;
193 }
194
195 static bool IsMatch(Prog*, Prog::Inst*);
196
197 // Peep-hole optimizer.
Optimize()198 void Prog::Optimize() {
199 Workq q(size_);
200
201 // Eliminate nops. Most are taken out during compilation
202 // but a few are hard to avoid.
203 q.clear();
204 AddToQueue(&q, start_);
205 for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
206 int id = *i;
207
208 Inst* ip = inst(id);
209 int j = ip->out();
210 Inst* jp;
211 while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
212 j = jp->out();
213 }
214 ip->set_out(j);
215 AddToQueue(&q, ip->out());
216
217 if (ip->opcode() == kInstAlt) {
218 j = ip->out1();
219 while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
220 j = jp->out();
221 }
222 ip->out1_ = j;
223 AddToQueue(&q, ip->out1());
224 }
225 }
226
227 // Insert kInstAltMatch instructions
228 // Look for
229 // ip: Alt -> j | k
230 // j: ByteRange [00-FF] -> ip
231 // k: Match
232 // or the reverse (the above is the greedy one).
233 // Rewrite Alt to AltMatch.
234 q.clear();
235 AddToQueue(&q, start_);
236 for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
237 int id = *i;
238 Inst* ip = inst(id);
239 AddToQueue(&q, ip->out());
240 if (ip->opcode() == kInstAlt)
241 AddToQueue(&q, ip->out1());
242
243 if (ip->opcode() == kInstAlt) {
244 Inst* j = inst(ip->out());
245 Inst* k = inst(ip->out1());
246 if (j->opcode() == kInstByteRange && j->out() == id &&
247 j->lo() == 0x00 && j->hi() == 0xFF &&
248 IsMatch(this, k)) {
249 ip->set_opcode(kInstAltMatch);
250 continue;
251 }
252 if (IsMatch(this, j) &&
253 k->opcode() == kInstByteRange && k->out() == id &&
254 k->lo() == 0x00 && k->hi() == 0xFF) {
255 ip->set_opcode(kInstAltMatch);
256 }
257 }
258 }
259 }
260
261 // Is ip a guaranteed match at end of text, perhaps after some capturing?
IsMatch(Prog * prog,Prog::Inst * ip)262 static bool IsMatch(Prog* prog, Prog::Inst* ip) {
263 for (;;) {
264 switch (ip->opcode()) {
265 default:
266 LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
267 return false;
268
269 case kInstAlt:
270 case kInstAltMatch:
271 case kInstByteRange:
272 case kInstFail:
273 case kInstEmptyWidth:
274 return false;
275
276 case kInstCapture:
277 case kInstNop:
278 ip = prog->inst(ip->out());
279 break;
280
281 case kInstMatch:
282 return true;
283 }
284 }
285 }
286
EmptyFlags(const StringPiece & text,const char * p)287 uint32_t Prog::EmptyFlags(const StringPiece& text, const char* p) {
288 int flags = 0;
289
290 // ^ and \A
291 if (p == text.data())
292 flags |= kEmptyBeginText | kEmptyBeginLine;
293 else if (p[-1] == '\n')
294 flags |= kEmptyBeginLine;
295
296 // $ and \z
297 if (p == text.data() + text.size())
298 flags |= kEmptyEndText | kEmptyEndLine;
299 else if (p < text.data() + text.size() && p[0] == '\n')
300 flags |= kEmptyEndLine;
301
302 // \b and \B
303 if (p == text.data() && p == text.data() + text.size()) {
304 // no word boundary here
305 } else if (p == text.data()) {
306 if (IsWordChar(p[0]))
307 flags |= kEmptyWordBoundary;
308 } else if (p == text.data() + text.size()) {
309 if (IsWordChar(p[-1]))
310 flags |= kEmptyWordBoundary;
311 } else {
312 if (IsWordChar(p[-1]) != IsWordChar(p[0]))
313 flags |= kEmptyWordBoundary;
314 }
315 if (!(flags & kEmptyWordBoundary))
316 flags |= kEmptyNonWordBoundary;
317
318 return flags;
319 }
320
321 // ByteMapBuilder implements a coloring algorithm.
322 //
323 // The first phase is a series of "mark and merge" batches: we mark one or more
324 // [lo-hi] ranges, then merge them into our internal state. Batching is not for
325 // performance; rather, it means that the ranges are treated indistinguishably.
326 //
327 // Internally, the ranges are represented using a bitmap that stores the splits
328 // and a vector that stores the colors; both of them are indexed by the ranges'
329 // last bytes. Thus, in order to merge a [lo-hi] range, we split at lo-1 and at
330 // hi (if not already split), then recolor each range in between. The color map
331 // (i.e. from the old color to the new color) is maintained for the lifetime of
332 // the batch and so underpins this somewhat obscure approach to set operations.
333 //
334 // The second phase builds the bytemap from our internal state: we recolor each
335 // range, then store the new color (which is now the byte class) in each of the
336 // corresponding array elements. Finally, we output the number of byte classes.
337 class ByteMapBuilder {
338 public:
ByteMapBuilder()339 ByteMapBuilder() {
340 // Initial state: the [0-255] range has color 256.
341 // This will avoid problems during the second phase,
342 // in which we assign byte classes numbered from 0.
343 splits_.Set(255);
344 colors_[255] = 256;
345 nextcolor_ = 257;
346 }
347
348 void Mark(int lo, int hi);
349 void Merge();
350 void Build(uint8_t* bytemap, int* bytemap_range);
351
352 private:
353 int Recolor(int oldcolor);
354
355 Bitmap256 splits_;
356 int colors_[256];
357 int nextcolor_;
358 std::vector<std::pair<int, int>> colormap_;
359 std::vector<std::pair<int, int>> ranges_;
360
361 ByteMapBuilder(const ByteMapBuilder&) = delete;
362 ByteMapBuilder& operator=(const ByteMapBuilder&) = delete;
363 };
364
Mark(int lo,int hi)365 void ByteMapBuilder::Mark(int lo, int hi) {
366 DCHECK_GE(lo, 0);
367 DCHECK_GE(hi, 0);
368 DCHECK_LE(lo, 255);
369 DCHECK_LE(hi, 255);
370 DCHECK_LE(lo, hi);
371
372 // Ignore any [0-255] ranges. They cause us to recolor every range, which
373 // has no effect on the eventual result and is therefore a waste of time.
374 if (lo == 0 && hi == 255)
375 return;
376
377 ranges_.emplace_back(lo, hi);
378 }
379
Merge()380 void ByteMapBuilder::Merge() {
381 for (std::vector<std::pair<int, int>>::const_iterator it = ranges_.begin();
382 it != ranges_.end();
383 ++it) {
384 int lo = it->first-1;
385 int hi = it->second;
386
387 if (0 <= lo && !splits_.Test(lo)) {
388 splits_.Set(lo);
389 int next = splits_.FindNextSetBit(lo+1);
390 colors_[lo] = colors_[next];
391 }
392 if (!splits_.Test(hi)) {
393 splits_.Set(hi);
394 int next = splits_.FindNextSetBit(hi+1);
395 colors_[hi] = colors_[next];
396 }
397
398 int c = lo+1;
399 while (c < 256) {
400 int next = splits_.FindNextSetBit(c);
401 colors_[next] = Recolor(colors_[next]);
402 if (next == hi)
403 break;
404 c = next+1;
405 }
406 }
407 colormap_.clear();
408 ranges_.clear();
409 }
410
Build(uint8_t * bytemap,int * bytemap_range)411 void ByteMapBuilder::Build(uint8_t* bytemap, int* bytemap_range) {
412 // Assign byte classes numbered from 0.
413 nextcolor_ = 0;
414
415 int c = 0;
416 while (c < 256) {
417 int next = splits_.FindNextSetBit(c);
418 uint8_t b = static_cast<uint8_t>(Recolor(colors_[next]));
419 while (c <= next) {
420 bytemap[c] = b;
421 c++;
422 }
423 }
424
425 *bytemap_range = nextcolor_;
426 }
427
Recolor(int oldcolor)428 int ByteMapBuilder::Recolor(int oldcolor) {
429 // Yes, this is a linear search. There can be at most 256
430 // colors and there will typically be far fewer than that.
431 // Also, we need to consider keys *and* values in order to
432 // avoid recoloring a given range more than once per batch.
433 std::vector<std::pair<int, int>>::const_iterator it =
434 std::find_if(colormap_.begin(), colormap_.end(),
435 [=](const std::pair<int, int>& kv) -> bool {
436 return kv.first == oldcolor || kv.second == oldcolor;
437 });
438 if (it != colormap_.end())
439 return it->second;
440 int newcolor = nextcolor_;
441 nextcolor_++;
442 colormap_.emplace_back(oldcolor, newcolor);
443 return newcolor;
444 }
445
ComputeByteMap()446 void Prog::ComputeByteMap() {
447 // Fill in bytemap with byte classes for the program.
448 // Ranges of bytes that are treated indistinguishably
449 // will be mapped to a single byte class.
450 ByteMapBuilder builder;
451
452 // Don't repeat the work for ^ and $.
453 bool marked_line_boundaries = false;
454 // Don't repeat the work for \b and \B.
455 bool marked_word_boundaries = false;
456
457 for (int id = 0; id < size(); id++) {
458 Inst* ip = inst(id);
459 if (ip->opcode() == kInstByteRange) {
460 int lo = ip->lo();
461 int hi = ip->hi();
462 builder.Mark(lo, hi);
463 if (ip->foldcase() && lo <= 'z' && hi >= 'a') {
464 int foldlo = lo;
465 int foldhi = hi;
466 if (foldlo < 'a')
467 foldlo = 'a';
468 if (foldhi > 'z')
469 foldhi = 'z';
470 if (foldlo <= foldhi) {
471 foldlo += 'A' - 'a';
472 foldhi += 'A' - 'a';
473 builder.Mark(foldlo, foldhi);
474 }
475 }
476 // If this Inst is not the last Inst in its list AND the next Inst is
477 // also a ByteRange AND the Insts have the same out, defer the merge.
478 if (!ip->last() &&
479 inst(id+1)->opcode() == kInstByteRange &&
480 ip->out() == inst(id+1)->out())
481 continue;
482 builder.Merge();
483 } else if (ip->opcode() == kInstEmptyWidth) {
484 if (ip->empty() & (kEmptyBeginLine|kEmptyEndLine) &&
485 !marked_line_boundaries) {
486 builder.Mark('\n', '\n');
487 builder.Merge();
488 marked_line_boundaries = true;
489 }
490 if (ip->empty() & (kEmptyWordBoundary|kEmptyNonWordBoundary) &&
491 !marked_word_boundaries) {
492 // We require two batches here: the first for ranges that are word
493 // characters, the second for ranges that are not word characters.
494 for (bool isword : {true, false}) {
495 int j;
496 for (int i = 0; i < 256; i = j) {
497 for (j = i + 1; j < 256 &&
498 Prog::IsWordChar(static_cast<uint8_t>(i)) ==
499 Prog::IsWordChar(static_cast<uint8_t>(j));
500 j++)
501 ;
502 if (Prog::IsWordChar(static_cast<uint8_t>(i)) == isword)
503 builder.Mark(i, j - 1);
504 }
505 builder.Merge();
506 }
507 marked_word_boundaries = true;
508 }
509 }
510 }
511
512 builder.Build(bytemap_, &bytemap_range_);
513
514 if (0) { // For debugging, use trivial bytemap.
515 LOG(ERROR) << "Using trivial bytemap.";
516 for (int i = 0; i < 256; i++)
517 bytemap_[i] = static_cast<uint8_t>(i);
518 bytemap_range_ = 256;
519 }
520 }
521
522 // Prog::Flatten() implements a graph rewriting algorithm.
523 //
524 // The overall process is similar to epsilon removal, but retains some epsilon
525 // transitions: those from Capture and EmptyWidth instructions; and those from
526 // nullable subexpressions. (The latter avoids quadratic blowup in transitions
527 // in the worst case.) It might be best thought of as Alt instruction elision.
528 //
529 // In conceptual terms, it divides the Prog into "trees" of instructions, then
530 // traverses the "trees" in order to produce "lists" of instructions. A "tree"
531 // is one or more instructions that grow from one "root" instruction to one or
532 // more "leaf" instructions; if a "tree" has exactly one instruction, then the
533 // "root" is also the "leaf". In most cases, a "root" is the successor of some
534 // "leaf" (i.e. the "leaf" instruction's out() returns the "root" instruction)
535 // and is considered a "successor root". A "leaf" can be a ByteRange, Capture,
536 // EmptyWidth or Match instruction. However, this is insufficient for handling
537 // nested nullable subexpressions correctly, so in some cases, a "root" is the
538 // dominator of the instructions reachable from some "successor root" (i.e. it
539 // has an unreachable predecessor) and is considered a "dominator root". Since
540 // only Alt instructions can be "dominator roots" (other instructions would be
541 // "leaves"), only Alt instructions are required to be marked as predecessors.
542 //
543 // Dividing the Prog into "trees" comprises two passes: marking the "successor
544 // roots" and the predecessors; and marking the "dominator roots". Sorting the
545 // "successor roots" by their bytecode offsets enables iteration in order from
546 // greatest to least during the second pass; by working backwards in this case
547 // and flooding the graph no further than "leaves" and already marked "roots",
548 // it becomes possible to mark "dominator roots" without doing excessive work.
549 //
550 // Traversing the "trees" is just iterating over the "roots" in order of their
551 // marking and flooding the graph no further than "leaves" and "roots". When a
552 // "leaf" is reached, the instruction is copied with its successor remapped to
553 // its "root" number. When a "root" is reached, a Nop instruction is generated
554 // with its successor remapped similarly. As each "list" is produced, its last
555 // instruction is marked as such. After all of the "lists" have been produced,
556 // a pass over their instructions remaps their successors to bytecode offsets.
Flatten()557 void Prog::Flatten() {
558 if (did_flatten_)
559 return;
560 did_flatten_ = true;
561
562 // Scratch structures. It's important that these are reused by functions
563 // that we call in loops because they would thrash the heap otherwise.
564 SparseSet reachable(size());
565 std::vector<int> stk;
566 stk.reserve(size());
567
568 // First pass: Marks "successor roots" and predecessors.
569 // Builds the mapping from inst-ids to root-ids.
570 SparseArray<int> rootmap(size());
571 SparseArray<int> predmap(size());
572 std::vector<std::vector<int>> predvec;
573 MarkSuccessors(&rootmap, &predmap, &predvec, &reachable, &stk);
574
575 // Second pass: Marks "dominator roots".
576 SparseArray<int> sorted(rootmap);
577 std::sort(sorted.begin(), sorted.end(), sorted.less);
578 for (SparseArray<int>::const_iterator i = sorted.end() - 1;
579 i != sorted.begin();
580 --i) {
581 if (i->index() != start_unanchored() && i->index() != start())
582 MarkDominator(i->index(), &rootmap, &predmap, &predvec, &reachable, &stk);
583 }
584
585 // Third pass: Emits "lists". Remaps outs to root-ids.
586 // Builds the mapping from root-ids to flat-ids.
587 std::vector<int> flatmap(rootmap.size());
588 std::vector<Inst> flat;
589 flat.reserve(size());
590 for (SparseArray<int>::const_iterator i = rootmap.begin();
591 i != rootmap.end();
592 ++i) {
593 flatmap[i->value()] = static_cast<int>(flat.size());
594 EmitList(i->index(), &rootmap, &flat, &reachable, &stk);
595 flat.back().set_last();
596 // We have the bounds of the "list", so this is the
597 // most convenient point at which to compute hints.
598 ComputeHints(&flat, flatmap[i->value()], static_cast<int>(flat.size()));
599 }
600
601 list_count_ = static_cast<int>(flatmap.size());
602 for (int i = 0; i < kNumInst; i++)
603 inst_count_[i] = 0;
604
605 // Fourth pass: Remaps outs to flat-ids.
606 // Counts instructions by opcode.
607 for (int id = 0; id < static_cast<int>(flat.size()); id++) {
608 Inst* ip = &flat[id];
609 if (ip->opcode() != kInstAltMatch) // handled in EmitList()
610 ip->set_out(flatmap[ip->out()]);
611 inst_count_[ip->opcode()]++;
612 }
613
614 int total = 0;
615 for (int i = 0; i < kNumInst; i++)
616 total += inst_count_[i];
617 DCHECK_EQ(total, static_cast<int>(flat.size()));
618
619 // Remap start_unanchored and start.
620 if (start_unanchored() == 0) {
621 DCHECK_EQ(start(), 0);
622 } else if (start_unanchored() == start()) {
623 set_start_unanchored(flatmap[1]);
624 set_start(flatmap[1]);
625 } else {
626 set_start_unanchored(flatmap[1]);
627 set_start(flatmap[2]);
628 }
629
630 // Finally, replace the old instructions with the new instructions.
631 size_ = static_cast<int>(flat.size());
632 inst_ = PODArray<Inst>(size_);
633 memmove(inst_.data(), flat.data(), size_*sizeof inst_[0]);
634
635 // Populate the list heads for BitState.
636 // 512 instructions limits the memory footprint to 1KiB.
637 if (size_ <= 512) {
638 list_heads_ = PODArray<uint16_t>(size_);
639 // 0xFF makes it more obvious if we try to look up a non-head.
640 memset(list_heads_.data(), 0xFF, size_*sizeof list_heads_[0]);
641 for (int i = 0; i < list_count_; ++i)
642 list_heads_[flatmap[i]] = i;
643 }
644 }
645
MarkSuccessors(SparseArray<int> * rootmap,SparseArray<int> * predmap,std::vector<std::vector<int>> * predvec,SparseSet * reachable,std::vector<int> * stk)646 void Prog::MarkSuccessors(SparseArray<int>* rootmap,
647 SparseArray<int>* predmap,
648 std::vector<std::vector<int>>* predvec,
649 SparseSet* reachable, std::vector<int>* stk) {
650 // Mark the kInstFail instruction.
651 rootmap->set_new(0, rootmap->size());
652
653 // Mark the start_unanchored and start instructions.
654 if (!rootmap->has_index(start_unanchored()))
655 rootmap->set_new(start_unanchored(), rootmap->size());
656 if (!rootmap->has_index(start()))
657 rootmap->set_new(start(), rootmap->size());
658
659 reachable->clear();
660 stk->clear();
661 stk->push_back(start_unanchored());
662 while (!stk->empty()) {
663 int id = stk->back();
664 stk->pop_back();
665 Loop:
666 if (reachable->contains(id))
667 continue;
668 reachable->insert_new(id);
669
670 Inst* ip = inst(id);
671 switch (ip->opcode()) {
672 default:
673 LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
674 break;
675
676 case kInstAltMatch:
677 case kInstAlt:
678 // Mark this instruction as a predecessor of each out.
679 for (int out : {ip->out(), ip->out1()}) {
680 if (!predmap->has_index(out)) {
681 predmap->set_new(out, static_cast<int>(predvec->size()));
682 predvec->emplace_back();
683 }
684 (*predvec)[predmap->get_existing(out)].emplace_back(id);
685 }
686 stk->push_back(ip->out1());
687 id = ip->out();
688 goto Loop;
689
690 case kInstByteRange:
691 case kInstCapture:
692 case kInstEmptyWidth:
693 // Mark the out of this instruction as a "root".
694 if (!rootmap->has_index(ip->out()))
695 rootmap->set_new(ip->out(), rootmap->size());
696 id = ip->out();
697 goto Loop;
698
699 case kInstNop:
700 id = ip->out();
701 goto Loop;
702
703 case kInstMatch:
704 case kInstFail:
705 break;
706 }
707 }
708 }
709
MarkDominator(int root,SparseArray<int> * rootmap,SparseArray<int> * predmap,std::vector<std::vector<int>> * predvec,SparseSet * reachable,std::vector<int> * stk)710 void Prog::MarkDominator(int root, SparseArray<int>* rootmap,
711 SparseArray<int>* predmap,
712 std::vector<std::vector<int>>* predvec,
713 SparseSet* reachable, std::vector<int>* stk) {
714 reachable->clear();
715 stk->clear();
716 stk->push_back(root);
717 while (!stk->empty()) {
718 int id = stk->back();
719 stk->pop_back();
720 Loop:
721 if (reachable->contains(id))
722 continue;
723 reachable->insert_new(id);
724
725 if (id != root && rootmap->has_index(id)) {
726 // We reached another "tree" via epsilon transition.
727 continue;
728 }
729
730 Inst* ip = inst(id);
731 switch (ip->opcode()) {
732 default:
733 LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
734 break;
735
736 case kInstAltMatch:
737 case kInstAlt:
738 stk->push_back(ip->out1());
739 id = ip->out();
740 goto Loop;
741
742 case kInstByteRange:
743 case kInstCapture:
744 case kInstEmptyWidth:
745 break;
746
747 case kInstNop:
748 id = ip->out();
749 goto Loop;
750
751 case kInstMatch:
752 case kInstFail:
753 break;
754 }
755 }
756
757 for (SparseSet::const_iterator i = reachable->begin();
758 i != reachable->end();
759 ++i) {
760 int id = *i;
761 if (predmap->has_index(id)) {
762 for (int pred : (*predvec)[predmap->get_existing(id)]) {
763 if (!reachable->contains(pred)) {
764 // id has a predecessor that cannot be reached from root!
765 // Therefore, id must be a "root" too - mark it as such.
766 if (!rootmap->has_index(id))
767 rootmap->set_new(id, rootmap->size());
768 }
769 }
770 }
771 }
772 }
773
EmitList(int root,SparseArray<int> * rootmap,std::vector<Inst> * flat,SparseSet * reachable,std::vector<int> * stk)774 void Prog::EmitList(int root, SparseArray<int>* rootmap,
775 std::vector<Inst>* flat,
776 SparseSet* reachable, std::vector<int>* stk) {
777 reachable->clear();
778 stk->clear();
779 stk->push_back(root);
780 while (!stk->empty()) {
781 int id = stk->back();
782 stk->pop_back();
783 Loop:
784 if (reachable->contains(id))
785 continue;
786 reachable->insert_new(id);
787
788 if (id != root && rootmap->has_index(id)) {
789 // We reached another "tree" via epsilon transition. Emit a kInstNop
790 // instruction so that the Prog does not become quadratically larger.
791 flat->emplace_back();
792 flat->back().set_opcode(kInstNop);
793 flat->back().set_out(rootmap->get_existing(id));
794 continue;
795 }
796
797 Inst* ip = inst(id);
798 switch (ip->opcode()) {
799 default:
800 LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
801 break;
802
803 case kInstAltMatch:
804 flat->emplace_back();
805 flat->back().set_opcode(kInstAltMatch);
806 flat->back().set_out(static_cast<int>(flat->size()));
807 flat->back().out1_ = static_cast<uint32_t>(flat->size())+1;
808 FALLTHROUGH_INTENDED;
809
810 case kInstAlt:
811 stk->push_back(ip->out1());
812 id = ip->out();
813 goto Loop;
814
815 case kInstByteRange:
816 case kInstCapture:
817 case kInstEmptyWidth:
818 flat->emplace_back();
819 memmove(&flat->back(), ip, sizeof *ip);
820 flat->back().set_out(rootmap->get_existing(ip->out()));
821 break;
822
823 case kInstNop:
824 id = ip->out();
825 goto Loop;
826
827 case kInstMatch:
828 case kInstFail:
829 flat->emplace_back();
830 memmove(&flat->back(), ip, sizeof *ip);
831 break;
832 }
833 }
834 }
835
836 // For each ByteRange instruction in [begin, end), computes a hint to execution
837 // engines: the delta to the next instruction (in flat) worth exploring iff the
838 // current instruction matched.
839 //
840 // Implements a coloring algorithm related to ByteMapBuilder, but in this case,
841 // colors are instructions and recoloring ranges precisely identifies conflicts
842 // between instructions. Iterating backwards over [begin, end) is guaranteed to
843 // identify the nearest conflict (if any) with only linear complexity.
ComputeHints(std::vector<Inst> * flat,int begin,int end)844 void Prog::ComputeHints(std::vector<Inst>* flat, int begin, int end) {
845 Bitmap256 splits;
846 int colors[256];
847
848 bool dirty = false;
849 for (int id = end; id >= begin; --id) {
850 if (id == end ||
851 (*flat)[id].opcode() != kInstByteRange) {
852 if (dirty) {
853 dirty = false;
854 splits.Clear();
855 }
856 splits.Set(255);
857 colors[255] = id;
858 // At this point, the [0-255] range is colored with id.
859 // Thus, hints cannot point beyond id; and if id == end,
860 // hints that would have pointed to id will be 0 instead.
861 continue;
862 }
863 dirty = true;
864
865 // We recolor the [lo-hi] range with id. Note that first ratchets backwards
866 // from end to the nearest conflict (if any) during recoloring.
867 int first = end;
868 auto Recolor = [&](int lo, int hi) {
869 // Like ByteMapBuilder, we split at lo-1 and at hi.
870 --lo;
871
872 if (0 <= lo && !splits.Test(lo)) {
873 splits.Set(lo);
874 int next = splits.FindNextSetBit(lo+1);
875 colors[lo] = colors[next];
876 }
877 if (!splits.Test(hi)) {
878 splits.Set(hi);
879 int next = splits.FindNextSetBit(hi+1);
880 colors[hi] = colors[next];
881 }
882
883 int c = lo+1;
884 while (c < 256) {
885 int next = splits.FindNextSetBit(c);
886 // Ratchet backwards...
887 first = std::min(first, colors[next]);
888 // Recolor with id - because it's the new nearest conflict!
889 colors[next] = id;
890 if (next == hi)
891 break;
892 c = next+1;
893 }
894 };
895
896 Inst* ip = &(*flat)[id];
897 int lo = ip->lo();
898 int hi = ip->hi();
899 Recolor(lo, hi);
900 if (ip->foldcase() && lo <= 'z' && hi >= 'a') {
901 int foldlo = lo;
902 int foldhi = hi;
903 if (foldlo < 'a')
904 foldlo = 'a';
905 if (foldhi > 'z')
906 foldhi = 'z';
907 if (foldlo <= foldhi) {
908 foldlo += 'A' - 'a';
909 foldhi += 'A' - 'a';
910 Recolor(foldlo, foldhi);
911 }
912 }
913
914 if (first != end) {
915 uint16_t hint = static_cast<uint16_t>(std::min(first - id, 32767));
916 ip->hint_foldcase_ |= hint<<1;
917 }
918 }
919 }
920
921 } // namespace re2
922