// Copyright 2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Compiled regular expression representation.
// Tested by compile_test.cc

#include "re2/prog.h"

#include <stdint.h>
#include <string.h>

#include <algorithm>
#include <string>
#include <utility>
#include <vector>

#include "absl/base/attributes.h"
#include "absl/log/absl_check.h"
#include "absl/log/absl_log.h"
#include "absl/strings/str_format.h"
#include "absl/strings/string_view.h"
#include "re2/bitmap256.h"
#include "re2/pod_array.h"
#include "re2/sparse_array.h"
#include "re2/sparse_set.h"

#if defined(__AVX2__)
#include <immintrin.h>
#ifdef _MSC_VER
#include <intrin.h>
#endif
#endif

namespace re2 {

// Constructors per Inst opcode

void Prog::Inst::InitAlt(uint32_t out, uint32_t out1) {
  ABSL_DCHECK_EQ(out_opcode_, uint32_t{0});
  set_out_opcode(out, kInstAlt);
  out1_ = out1;
}

void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32_t out) {
  ABSL_DCHECK_EQ(out_opcode_, uint32_t{0});
  set_out_opcode(out, kInstByteRange);
  lo_ = lo & 0xFF;
  hi_ = hi & 0xFF;
  hint_foldcase_ = foldcase&1;
}

void Prog::Inst::InitCapture(int cap, uint32_t out) {
  ABSL_DCHECK_EQ(out_opcode_, uint32_t{0});
  set_out_opcode(out, kInstCapture);
  cap_ = cap;
}

void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32_t out) {
  ABSL_DCHECK_EQ(out_opcode_, uint32_t{0});
  set_out_opcode(out, kInstEmptyWidth);
  empty_ = empty;
}

void Prog::Inst::InitMatch(int32_t id) {
  ABSL_DCHECK_EQ(out_opcode_, uint32_t{0});
  set_opcode(kInstMatch);
  match_id_ = id;
}

void Prog::Inst::InitNop(uint32_t out) {
  ABSL_DCHECK_EQ(out_opcode_, uint32_t{0});
  set_opcode(kInstNop);
}

void Prog::Inst::InitFail() {
  ABSL_DCHECK_EQ(out_opcode_, uint32_t{0});
  set_opcode(kInstFail);
}

std::string Prog::Inst::Dump() {
  switch (opcode()) {
    default:
      return absl::StrFormat("opcode %d", static_cast<int>(opcode()));

    case kInstAlt:
      return absl::StrFormat("alt -> %d | %d", out(), out1_);

    case kInstAltMatch:
      return absl::StrFormat("altmatch -> %d | %d", out(), out1_);

    case kInstByteRange:
      return absl::StrFormat("byte%s [%02x-%02x] %d -> %d",
                             foldcase() ? "/i" : "",
                             lo_, hi_, hint(), out());

    case kInstCapture:
      return absl::StrFormat("capture %d -> %d", cap_, out());

    case kInstEmptyWidth:
      return absl::StrFormat("emptywidth %#x -> %d",
                             static_cast<int>(empty_), out());

    case kInstMatch:
      return absl::StrFormat("match! %d", match_id());

    case kInstNop:
      return absl::StrFormat("nop -> %d", out());

    case kInstFail:
      return absl::StrFormat("fail");
  }
}

Prog::Prog()
    : anchor_start_(false),
      anchor_end_(false),
      reversed_(false),
      did_flatten_(false),
      did_onepass_(false),
      start_(0),
      start_unanchored_(0),
      size_(0),
      bytemap_range_(0),
      prefix_foldcase_(false),
      prefix_size_(0),
      list_count_(0),
      bit_state_text_max_size_(0),
      dfa_mem_(0),
      dfa_first_(NULL),
      dfa_longest_(NULL) {
}

Prog::~Prog() {
  DeleteDFA(dfa_longest_);
  DeleteDFA(dfa_first_);
  if (prefix_foldcase_)
    delete[] prefix_dfa_;
}

typedef SparseSet Workq;

static inline void AddToQueue(Workq* q, int id) {
  if (id != 0)
    q->insert(id);
}

static std::string ProgToString(Prog* prog, Workq* q) {
  std::string s;
  for (Workq::iterator i = q->begin(); i != q->end(); ++i) {
    int id = *i;
    Prog::Inst* ip = prog->inst(id);
    s += absl::StrFormat("%d. %s\n", id, ip->Dump());
    AddToQueue(q, ip->out());
    if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch)
      AddToQueue(q, ip->out1());
  }
  return s;
}

static std::string FlattenedProgToString(Prog* prog, int start) {
  std::string s;
  for (int id = start; id < prog->size(); id++) {
    Prog::Inst* ip = prog->inst(id);
    if (ip->last())
      s += absl::StrFormat("%d. %s\n", id, ip->Dump());
    else
      s += absl::StrFormat("%d+ %s\n", id, ip->Dump());
  }
  return s;
}

std::string Prog::Dump() {
  if (did_flatten_)
    return FlattenedProgToString(this, start_);

  Workq q(size_);
  AddToQueue(&q, start_);
  return ProgToString(this, &q);
}

std::string Prog::DumpUnanchored() {
  if (did_flatten_)
    return FlattenedProgToString(this, start_unanchored_);

  Workq q(size_);
  AddToQueue(&q, start_unanchored_);
  return ProgToString(this, &q);
}

std::string Prog::DumpByteMap() {
  std::string map;
  for (int c = 0; c < 256; c++) {
    int b = bytemap_[c];
    int lo = c;
    while (c < 256-1 && bytemap_[c+1] == b)
      c++;
    int hi = c;
    map += absl::StrFormat("[%02x-%02x] -> %d\n", lo, hi, b);
  }
  return map;
}

// Is ip a guaranteed match at end of text, perhaps after some capturing?
static bool IsMatch(Prog* prog, Prog::Inst* ip) {
  for (;;) {
    switch (ip->opcode()) {
      default:
        ABSL_LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
        return false;

      case kInstAlt:
      case kInstAltMatch:
      case kInstByteRange:
      case kInstFail:
      case kInstEmptyWidth:
        return false;

      case kInstCapture:
      case kInstNop:
        ip = prog->inst(ip->out());
        break;

      case kInstMatch:
        return true;
    }
  }
}

// Peep-hole optimizer.
void Prog::Optimize() {
  Workq q(size_);

  // Eliminate nops. Most are taken out during compilation
  // but a few are hard to avoid.
  q.clear();
  AddToQueue(&q, start_);
  for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
    int id = *i;

    Inst* ip = inst(id);
    int j = ip->out();
    Inst* jp;
    while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
      j = jp->out();
    }
    ip->set_out(j);
    AddToQueue(&q, ip->out());

    if (ip->opcode() == kInstAlt) {
      j = ip->out1();
      while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
        j = jp->out();
      }
      ip->out1_ = j;
      AddToQueue(&q, ip->out1());
    }
  }

  // Insert kInstAltMatch instructions
  // Look for
  //   ip: Alt -> j | k
  //    j: ByteRange [00-FF] -> ip
  //    k: Match
  // or the reverse (the above is the greedy one).
  // Rewrite Alt to AltMatch.
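  //
  // For example (illustrative only), a pattern ending in (?s).* compiles
  // to roughly
  //   1. alt -> 2 | 3
  //   2. byte [00-ff] 0 -> 1
  //   3. match!
  // and this pass rewrites instruction 1 to altmatch -> 2 | 3.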
  q.clear();
  AddToQueue(&q, start_);
  for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
    int id = *i;
    Inst* ip = inst(id);
    AddToQueue(&q, ip->out());
    if (ip->opcode() == kInstAlt)
      AddToQueue(&q, ip->out1());

    if (ip->opcode() == kInstAlt) {
      Inst* j = inst(ip->out());
      Inst* k = inst(ip->out1());
      if (j->opcode() == kInstByteRange && j->out() == id &&
          j->lo() == 0x00 && j->hi() == 0xFF &&
          IsMatch(this, k)) {
        ip->set_opcode(kInstAltMatch);
        continue;
      }
      if (IsMatch(this, j) &&
          k->opcode() == kInstByteRange && k->out() == id &&
          k->lo() == 0x00 && k->hi() == 0xFF) {
        ip->set_opcode(kInstAltMatch);
      }
    }
  }
}

uint32_t Prog::EmptyFlags(absl::string_view text, const char* p) {
  int flags = 0;

  // ^ and \A
  if (p == text.data())
    flags |= kEmptyBeginText | kEmptyBeginLine;
  else if (p[-1] == '\n')
    flags |= kEmptyBeginLine;

  // $ and \z
  if (p == text.data() + text.size())
    flags |= kEmptyEndText | kEmptyEndLine;
  else if (p < text.data() + text.size() && p[0] == '\n')
    flags |= kEmptyEndLine;

  // \b and \B
  if (p == text.data() && p == text.data() + text.size()) {
    // no word boundary here
  } else if (p == text.data()) {
    if (IsWordChar(p[0]))
      flags |= kEmptyWordBoundary;
  } else if (p == text.data() + text.size()) {
    if (IsWordChar(p[-1]))
      flags |= kEmptyWordBoundary;
  } else {
    if (IsWordChar(p[-1]) != IsWordChar(p[0]))
      flags |= kEmptyWordBoundary;
  }
  if (!(flags & kEmptyWordBoundary))
    flags |= kEmptyNonWordBoundary;

  return flags;
}
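
// A worked example of the flag computation above (illustrative only):
// given text "ab\ncd" with p pointing at 'c', p[-1] == '\n' sets
// kEmptyBeginLine; '\n' is a non-word byte and 'c' is a word byte, so
// kEmptyWordBoundary is also set (and kEmptyNonWordBoundary is not).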

// ByteMapBuilder implements a coloring algorithm.
//
// The first phase is a series of "mark and merge" batches: we mark one or more
// [lo-hi] ranges, then merge them into our internal state. Batching is not for
// performance; rather, it means that the ranges are treated indistinguishably.
//
// Internally, the ranges are represented using a bitmap that stores the splits
// and a vector that stores the colors; both of them are indexed by the ranges'
// last bytes. Thus, in order to merge a [lo-hi] range, we split at lo-1 and at
// hi (if not already split), then recolor each range in between. The color map
// (i.e. from the old color to the new color) is maintained for the lifetime of
// the batch and so underpins this somewhat obscure approach to set operations.
//
// The second phase builds the bytemap from our internal state: we recolor each
// range, then store the new color (which is now the byte class) in each of the
// corresponding array elements. Finally, we output the number of byte classes.
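//
// A worked example (illustrative only): marking [a-z] and merging, then
// marking [0-9] and merging, splits [0-255] into five ranges:
//   [00-2f] [30-39] [3a-60] [61-7a] [7b-ff]
// The second phase then assigns byte classes by recoloring in order:
//   [00-2f] -> 0, [30-39] -> 1, [3a-60] -> 0, [61-7a] -> 2, [7b-ff] -> 0
// for a bytemap_range of 3: digits, lowercase letters, everything else.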
class ByteMapBuilder {
 public:
  ByteMapBuilder() {
    // Initial state: the [0-255] range has color 256.
    // This will avoid problems during the second phase,
    // in which we assign byte classes numbered from 0.
    splits_.Set(255);
    colors_[255] = 256;
    nextcolor_ = 257;
  }

  void Mark(int lo, int hi);
  void Merge();
  void Build(uint8_t* bytemap, int* bytemap_range);

 private:
  int Recolor(int oldcolor);

  Bitmap256 splits_;
  int colors_[256];
  int nextcolor_;
  std::vector<std::pair<int, int>> colormap_;
  std::vector<std::pair<int, int>> ranges_;

  ByteMapBuilder(const ByteMapBuilder&) = delete;
  ByteMapBuilder& operator=(const ByteMapBuilder&) = delete;
};

void ByteMapBuilder::Mark(int lo, int hi) {
  ABSL_DCHECK_GE(lo, 0);
  ABSL_DCHECK_GE(hi, 0);
  ABSL_DCHECK_LE(lo, 255);
  ABSL_DCHECK_LE(hi, 255);
  ABSL_DCHECK_LE(lo, hi);

  // Ignore any [0-255] ranges. They cause us to recolor every range, which
  // has no effect on the eventual result and is therefore a waste of time.
  if (lo == 0 && hi == 255)
    return;

  ranges_.emplace_back(lo, hi);
}

void ByteMapBuilder::Merge() {
  for (std::vector<std::pair<int, int>>::const_iterator it = ranges_.begin();
       it != ranges_.end();
       ++it) {
    int lo = it->first-1;
    int hi = it->second;

    if (0 <= lo && !splits_.Test(lo)) {
      splits_.Set(lo);
      int next = splits_.FindNextSetBit(lo+1);
      colors_[lo] = colors_[next];
    }
    if (!splits_.Test(hi)) {
      splits_.Set(hi);
      int next = splits_.FindNextSetBit(hi+1);
      colors_[hi] = colors_[next];
    }

    int c = lo+1;
    while (c < 256) {
      int next = splits_.FindNextSetBit(c);
      colors_[next] = Recolor(colors_[next]);
      if (next == hi)
        break;
      c = next+1;
    }
  }
  colormap_.clear();
  ranges_.clear();
}

void ByteMapBuilder::Build(uint8_t* bytemap, int* bytemap_range) {
  // Assign byte classes numbered from 0.
  nextcolor_ = 0;

  int c = 0;
  while (c < 256) {
    int next = splits_.FindNextSetBit(c);
    uint8_t b = static_cast<uint8_t>(Recolor(colors_[next]));
    while (c <= next) {
      bytemap[c] = b;
      c++;
    }
  }

  *bytemap_range = nextcolor_;
}

int ByteMapBuilder::Recolor(int oldcolor) {
  // Yes, this is a linear search. There can be at most 256
  // colors and there will typically be far fewer than that.
  // Also, we need to consider keys *and* values in order to
  // avoid recoloring a given range more than once per batch.
  std::vector<std::pair<int, int>>::const_iterator it =
      std::find_if(colormap_.begin(), colormap_.end(),
                   [=](const std::pair<int, int>& kv) -> bool {
                     return kv.first == oldcolor || kv.second == oldcolor;
                   });
  if (it != colormap_.end())
    return it->second;
  int newcolor = nextcolor_;
  nextcolor_++;
  colormap_.emplace_back(oldcolor, newcolor);
  return newcolor;
}

void Prog::ComputeByteMap() {
  // Fill in bytemap with byte classes for the program.
  // Ranges of bytes that are treated indistinguishably
  // will be mapped to a single byte class.
  ByteMapBuilder builder;

  // Don't repeat the work for ^ and $.
  bool marked_line_boundaries = false;
  // Don't repeat the work for \b and \B.
  bool marked_word_boundaries = false;

  for (int id = 0; id < size(); id++) {
    Inst* ip = inst(id);
    if (ip->opcode() == kInstByteRange) {
      int lo = ip->lo();
      int hi = ip->hi();
      builder.Mark(lo, hi);
      if (ip->foldcase() && lo <= 'z' && hi >= 'a') {
        int foldlo = lo;
        int foldhi = hi;
        if (foldlo < 'a')
          foldlo = 'a';
        if (foldhi > 'z')
          foldhi = 'z';
        if (foldlo <= foldhi) {
          foldlo += 'A' - 'a';
          foldhi += 'A' - 'a';
          builder.Mark(foldlo, foldhi);
        }
      }
      // If this Inst is not the last Inst in its list AND the next Inst is
      // also a ByteRange AND the Insts have the same out, defer the merge.
      if (!ip->last() &&
          inst(id+1)->opcode() == kInstByteRange &&
          ip->out() == inst(id+1)->out())
        continue;
      builder.Merge();
    } else if (ip->opcode() == kInstEmptyWidth) {
      if (ip->empty() & (kEmptyBeginLine|kEmptyEndLine) &&
          !marked_line_boundaries) {
        builder.Mark('\n', '\n');
        builder.Merge();
        marked_line_boundaries = true;
      }
      if (ip->empty() & (kEmptyWordBoundary|kEmptyNonWordBoundary) &&
          !marked_word_boundaries) {
        // We require two batches here: the first for ranges that are word
        // characters, the second for ranges that are not word characters.
        for (bool isword : {true, false}) {
          int j;
          for (int i = 0; i < 256; i = j) {
            for (j = i + 1; j < 256 &&
                            Prog::IsWordChar(static_cast<uint8_t>(i)) ==
                                Prog::IsWordChar(static_cast<uint8_t>(j));
                 j++)
              ;
            if (Prog::IsWordChar(static_cast<uint8_t>(i)) == isword)
              builder.Mark(i, j - 1);
          }
          builder.Merge();
        }
        marked_word_boundaries = true;
      }
    }
  }

  builder.Build(bytemap_, &bytemap_range_);

  if ((0)) {  // For debugging, use trivial bytemap.
    ABSL_LOG(ERROR) << "Using trivial bytemap.";
    for (int i = 0; i < 256; i++)
      bytemap_[i] = static_cast<uint8_t>(i);
    bytemap_range_ = 256;
  }
}

// Prog::Flatten() implements a graph rewriting algorithm.
//
// The overall process is similar to epsilon removal, but retains some epsilon
// transitions: those from Capture and EmptyWidth instructions; and those from
// nullable subexpressions. (The latter avoids quadratic blowup in transitions
// in the worst case.) It might be best thought of as Alt instruction elision.
//
// In conceptual terms, it divides the Prog into "trees" of instructions, then
// traverses the "trees" in order to produce "lists" of instructions. A "tree"
// is one or more instructions that grow from one "root" instruction to one or
// more "leaf" instructions; if a "tree" has exactly one instruction, then the
// "root" is also the "leaf". In most cases, a "root" is the successor of some
// "leaf" (i.e. the "leaf" instruction's out() returns the "root" instruction)
// and is considered a "successor root". A "leaf" can be a ByteRange, Capture,
// EmptyWidth or Match instruction. However, this is insufficient for handling
// nested nullable subexpressions correctly, so in some cases, a "root" is the
// dominator of the instructions reachable from some "successor root" (i.e. it
// has an unreachable predecessor) and is considered a "dominator root". Since
// only Alt instructions can be "dominator roots" (other instructions would be
// "leaves"), only Alt instructions are required to be marked as predecessors.
//
// Dividing the Prog into "trees" comprises two passes: marking the "successor
// roots" and the predecessors; and marking the "dominator roots". Sorting the
// "successor roots" by their bytecode offsets enables iteration in order from
// greatest to least during the second pass; by working backwards in this case
// and flooding the graph no further than "leaves" and already marked "roots",
// it becomes possible to mark "dominator roots" without doing excessive work.
//
// Traversing the "trees" is just iterating over the "roots" in order of their
// marking and flooding the graph no further than "leaves" and "roots". When a
// "leaf" is reached, the instruction is copied with its successor remapped to
// its "root" number. When a "root" is reached, a Nop instruction is generated
// with its successor remapped similarly. As each "list" is produced, its last
// instruction is marked as such. After all of the "lists" have been produced,
// a pass over their instructions remaps their successors to bytecode offsets.
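//
// A rough sketch (illustrative only): for the pattern /a+/ the unflattened
// Prog is, say,
//   1. byte [61-61] 0 -> 2
//   2. alt -> 1 | 3
//   3. match!
// The ByteRange instruction marks its out (instruction 2) as a "successor
// root", so flattening produces a "list" for the start and a "list" for
// instruction 2 (in addition to the kInstFail "list"):
//   L1. byte [61-61] 0 -> L2
//   L2. byte [61-61] 0 -> L2
//       match!
// where the byte instruction in L2 loops back to L2 itself and the last
// instruction of each "list" is marked via set_last().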
void Prog::Flatten() {
  if (did_flatten_)
    return;
  did_flatten_ = true;

  // Scratch structures. It's important that these are reused by functions
  // that we call in loops because they would thrash the heap otherwise.
  SparseSet reachable(size());
  std::vector<int> stk;
  stk.reserve(size());

  // First pass: Marks "successor roots" and predecessors.
  // Builds the mapping from inst-ids to root-ids.
  SparseArray<int> rootmap(size());
  SparseArray<int> predmap(size());
  std::vector<std::vector<int>> predvec;
  MarkSuccessors(&rootmap, &predmap, &predvec, &reachable, &stk);

  // Second pass: Marks "dominator roots".
  SparseArray<int> sorted(rootmap);
  std::sort(sorted.begin(), sorted.end(), sorted.less);
  for (SparseArray<int>::const_iterator i = sorted.end() - 1;
       i != sorted.begin();
       --i) {
    if (i->index() != start_unanchored() && i->index() != start())
      MarkDominator(i->index(), &rootmap, &predmap, &predvec, &reachable, &stk);
  }

  // Third pass: Emits "lists". Remaps outs to root-ids.
  // Builds the mapping from root-ids to flat-ids.
  std::vector<int> flatmap(rootmap.size());
  std::vector<Inst> flat;
  flat.reserve(size());
  for (SparseArray<int>::const_iterator i = rootmap.begin();
       i != rootmap.end();
       ++i) {
    flatmap[i->value()] = static_cast<int>(flat.size());
    EmitList(i->index(), &rootmap, &flat, &reachable, &stk);
    flat.back().set_last();
    // We have the bounds of the "list", so this is the
    // most convenient point at which to compute hints.
    ComputeHints(&flat, flatmap[i->value()], static_cast<int>(flat.size()));
  }

  list_count_ = static_cast<int>(flatmap.size());
  for (int i = 0; i < kNumInst; i++)
    inst_count_[i] = 0;

  // Fourth pass: Remaps outs to flat-ids.
  // Counts instructions by opcode.
  for (int id = 0; id < static_cast<int>(flat.size()); id++) {
    Inst* ip = &flat[id];
    if (ip->opcode() != kInstAltMatch)  // handled in EmitList()
      ip->set_out(flatmap[ip->out()]);
    inst_count_[ip->opcode()]++;
  }

#if !defined(NDEBUG)
  // Address a `-Wunused-but-set-variable' warning from Clang 13.x.
  size_t total = 0;
  for (int i = 0; i < kNumInst; i++)
    total += inst_count_[i];
  ABSL_CHECK_EQ(total, flat.size());
#endif

  // Remap start_unanchored and start.
  if (start_unanchored() == 0) {
    ABSL_DCHECK_EQ(start(), 0);
  } else if (start_unanchored() == start()) {
    set_start_unanchored(flatmap[1]);
    set_start(flatmap[1]);
  } else {
    set_start_unanchored(flatmap[1]);
    set_start(flatmap[2]);
  }

  // Finally, replace the old instructions with the new instructions.
  size_ = static_cast<int>(flat.size());
  inst_ = PODArray<Inst>(size_);
  memmove(inst_.data(), flat.data(), size_*sizeof inst_[0]);

  // Populate the list heads for BitState.
  // 512 instructions limits the memory footprint to 1KiB.
  if (size_ <= 512) {
    list_heads_ = PODArray<uint16_t>(size_);
    // 0xFF makes it more obvious if we try to look up a non-head.
    memset(list_heads_.data(), 0xFF, size_*sizeof list_heads_[0]);
    for (int i = 0; i < list_count_; ++i)
      list_heads_[flatmap[i]] = i;
  }

  // BitState allocates a bitmap of size list_count_ * (text.size()+1)
  // for tracking pairs of possibilities that it has already explored.
  const size_t kBitStateBitmapMaxSize = 256*1024;  // max size in bits
  bit_state_text_max_size_ = kBitStateBitmapMaxSize / list_count_ - 1;
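  // For example (illustrative only), with list_count_ == 4, BitState can
  // handle text of up to 256*1024/4 - 1 == 65535 bytes.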
}

void Prog::MarkSuccessors(SparseArray<int>* rootmap,
                          SparseArray<int>* predmap,
                          std::vector<std::vector<int>>* predvec,
                          SparseSet* reachable, std::vector<int>* stk) {
  // Mark the kInstFail instruction.
  rootmap->set_new(0, rootmap->size());

  // Mark the start_unanchored and start instructions.
  if (!rootmap->has_index(start_unanchored()))
    rootmap->set_new(start_unanchored(), rootmap->size());
  if (!rootmap->has_index(start()))
    rootmap->set_new(start(), rootmap->size());

  reachable->clear();
  stk->clear();
  stk->push_back(start_unanchored());
  while (!stk->empty()) {
    int id = stk->back();
    stk->pop_back();
  Loop:
    if (reachable->contains(id))
      continue;
    reachable->insert_new(id);

    Inst* ip = inst(id);
    switch (ip->opcode()) {
      default:
        ABSL_LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
        break;

      case kInstAltMatch:
      case kInstAlt:
        // Mark this instruction as a predecessor of each out.
        for (int out : {ip->out(), ip->out1()}) {
          if (!predmap->has_index(out)) {
            predmap->set_new(out, static_cast<int>(predvec->size()));
            predvec->emplace_back();
          }
          (*predvec)[predmap->get_existing(out)].emplace_back(id);
        }
        stk->push_back(ip->out1());
        id = ip->out();
        goto Loop;

      case kInstByteRange:
      case kInstCapture:
      case kInstEmptyWidth:
        // Mark the out of this instruction as a "root".
        if (!rootmap->has_index(ip->out()))
          rootmap->set_new(ip->out(), rootmap->size());
        id = ip->out();
        goto Loop;

      case kInstNop:
        id = ip->out();
        goto Loop;

      case kInstMatch:
      case kInstFail:
        break;
    }
  }
}

void Prog::MarkDominator(int root, SparseArray<int>* rootmap,
                         SparseArray<int>* predmap,
                         std::vector<std::vector<int>>* predvec,
                         SparseSet* reachable, std::vector<int>* stk) {
  reachable->clear();
  stk->clear();
  stk->push_back(root);
  while (!stk->empty()) {
    int id = stk->back();
    stk->pop_back();
  Loop:
    if (reachable->contains(id))
      continue;
    reachable->insert_new(id);

    if (id != root && rootmap->has_index(id)) {
      // We reached another "tree" via epsilon transition.
      continue;
    }

    Inst* ip = inst(id);
    switch (ip->opcode()) {
      default:
        ABSL_LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
        break;

      case kInstAltMatch:
      case kInstAlt:
        stk->push_back(ip->out1());
        id = ip->out();
        goto Loop;

      case kInstByteRange:
      case kInstCapture:
      case kInstEmptyWidth:
        break;

      case kInstNop:
        id = ip->out();
        goto Loop;

      case kInstMatch:
      case kInstFail:
        break;
    }
  }

  for (SparseSet::const_iterator i = reachable->begin();
       i != reachable->end();
       ++i) {
    int id = *i;
    if (predmap->has_index(id)) {
      for (int pred : (*predvec)[predmap->get_existing(id)]) {
        if (!reachable->contains(pred)) {
          // id has a predecessor that cannot be reached from root!
          // Therefore, id must be a "root" too - mark it as such.
          if (!rootmap->has_index(id))
            rootmap->set_new(id, rootmap->size());
        }
      }
    }
  }
}

void Prog::EmitList(int root, SparseArray<int>* rootmap,
                    std::vector<Inst>* flat,
                    SparseSet* reachable, std::vector<int>* stk) {
  reachable->clear();
  stk->clear();
  stk->push_back(root);
  while (!stk->empty()) {
    int id = stk->back();
    stk->pop_back();
  Loop:
    if (reachable->contains(id))
      continue;
    reachable->insert_new(id);

    if (id != root && rootmap->has_index(id)) {
      // We reached another "tree" via epsilon transition. Emit a kInstNop
      // instruction so that the Prog does not become quadratically larger.
      flat->emplace_back();
      flat->back().set_opcode(kInstNop);
      flat->back().set_out(rootmap->get_existing(id));
      continue;
    }

    Inst* ip = inst(id);
    switch (ip->opcode()) {
      default:
        ABSL_LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
        break;

      case kInstAltMatch:
        flat->emplace_back();
        flat->back().set_opcode(kInstAltMatch);
        flat->back().set_out(static_cast<int>(flat->size()));
        flat->back().out1_ = static_cast<uint32_t>(flat->size())+1;
        ABSL_FALLTHROUGH_INTENDED;

      case kInstAlt:
        stk->push_back(ip->out1());
        id = ip->out();
        goto Loop;

      case kInstByteRange:
      case kInstCapture:
      case kInstEmptyWidth:
        flat->emplace_back();
        memmove(&flat->back(), ip, sizeof *ip);
        flat->back().set_out(rootmap->get_existing(ip->out()));
        break;

      case kInstNop:
        id = ip->out();
        goto Loop;

      case kInstMatch:
      case kInstFail:
        flat->emplace_back();
        memmove(&flat->back(), ip, sizeof *ip);
        break;
    }
  }
}

// For each ByteRange instruction in [begin, end), computes a hint to execution
// engines: the delta to the next instruction (in flat) worth exploring iff the
// current instruction matched.
//
// Implements a coloring algorithm related to ByteMapBuilder, but in this case,
// colors are instructions and recoloring ranges precisely identifies conflicts
// between instructions. Iterating backwards over [begin, end) is guaranteed to
// identify the nearest conflict (if any) with only linear complexity.
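//
// A worked example (illustrative only): given a "list"
//   4+ byte [61-61] 0 -> ...
//   5+ byte [62-62] 0 -> ...
//   6. byte [61-61] 0 -> ...
// instruction 4 conflicts with instruction 6 (both match 'a'), so its hint
// is 6-4 = 2: if instruction 4 matched, instruction 5 cannot possibly match
// the same byte, so execution engines may skip straight to instruction 6.
// Instruction 5 has no later conflict, so its hint stays 0.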
void Prog::ComputeHints(std::vector<Inst>* flat, int begin, int end) {
  Bitmap256 splits;
  int colors[256];

  bool dirty = false;
  for (int id = end; id >= begin; --id) {
    if (id == end ||
        (*flat)[id].opcode() != kInstByteRange) {
      if (dirty) {
        dirty = false;
        splits.Clear();
      }
      splits.Set(255);
      colors[255] = id;
      // At this point, the [0-255] range is colored with id.
      // Thus, hints cannot point beyond id; and if id == end,
      // hints that would have pointed to id will be 0 instead.
      continue;
    }
    dirty = true;

    // We recolor the [lo-hi] range with id. Note that first ratchets backwards
    // from end to the nearest conflict (if any) during recoloring.
    int first = end;
    auto Recolor = [&](int lo, int hi) {
      // Like ByteMapBuilder, we split at lo-1 and at hi.
      --lo;

      if (0 <= lo && !splits.Test(lo)) {
        splits.Set(lo);
        int next = splits.FindNextSetBit(lo+1);
        colors[lo] = colors[next];
      }
      if (!splits.Test(hi)) {
        splits.Set(hi);
        int next = splits.FindNextSetBit(hi+1);
        colors[hi] = colors[next];
      }

      int c = lo+1;
      while (c < 256) {
        int next = splits.FindNextSetBit(c);
        // Ratchet backwards...
        first = std::min(first, colors[next]);
        // Recolor with id - because it's the new nearest conflict!
        colors[next] = id;
        if (next == hi)
          break;
        c = next+1;
      }
    };

    Inst* ip = &(*flat)[id];
    int lo = ip->lo();
    int hi = ip->hi();
    Recolor(lo, hi);
    if (ip->foldcase() && lo <= 'z' && hi >= 'a') {
      int foldlo = lo;
      int foldhi = hi;
      if (foldlo < 'a')
        foldlo = 'a';
      if (foldhi > 'z')
        foldhi = 'z';
      if (foldlo <= foldhi) {
        foldlo += 'A' - 'a';
        foldhi += 'A' - 'a';
        Recolor(foldlo, foldhi);
      }
    }

    if (first != end) {
      uint16_t hint = static_cast<uint16_t>(std::min(first - id, 32767));
      ip->hint_foldcase_ |= hint<<1;
    }
  }
}

// The final state will always be this, which frees up a register for the hot
// loop and thus avoids the spilling that can occur when building with Clang.
static const size_t kShiftDFAFinal = 9;

// This function takes the prefix as std::string (i.e. not const std::string&
// as normal) because it's going to clobber it, so a temporary is convenient.
static uint64_t* BuildShiftDFA(std::string prefix) {
  // This constant is for convenience now and also for correctness later when
  // we clobber the prefix, but still need to know how long it was initially.
  const size_t size = prefix.size();

  // Construct the NFA.
  // The table is indexed by input byte; each element is a bitfield of states
  // reachable by the input byte. Given a bitfield of the current states, the
  // bitfield of states reachable from those is - for this specific purpose -
  // always ((ncurr << 1) | 1). Intersecting the reachability bitfields gives
  // the bitfield of the next states reached by stepping over the input byte.
  // Credits for this technique: the Hyperscan paper by Geoff Langdale et al.
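  //
  // A worked example (illustrative only) for prefix "ab":
  //   nfa['a'] == 0b011 (state 0 via `\C*?`, state 1 via 'a')
  //   nfa['b'] == 0b101 (state 0 via `\C*?`, state 2 via 'b')
  // Starting from ncurr == 0b001, stepping over 'a' yields
  //   nfa['a'] & ((0b001 << 1) | 1) == 0b011 & 0b011 == 0b011
  // and stepping over 'b' then yields
  //   nfa['b'] & ((0b011 << 1) | 1) == 0b101 & 0b111 == 0b101,
  // i.e. the final state plus state 0 for the unanchored restart.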
  uint16_t nfa[256]{};
  for (size_t i = 0; i < size; ++i) {
    uint8_t b = prefix[i];
    nfa[b] |= 1 << (i+1);
  }
  // This is the `\C*?` for unanchored search.
  for (int b = 0; b < 256; ++b)
    nfa[b] |= 1;

  // This maps from DFA state to NFA states; the reverse mapping is used when
  // recording transitions and gets implemented with plain old linear search.
  // The "Shift DFA" technique limits this to ten states when using uint64_t;
  // to allow for the initial state, we use at most nine bytes of the prefix.
  // That same limit is also why uint16_t is sufficient for the NFA bitfield.
  uint16_t states[kShiftDFAFinal+1]{};
  states[0] = 1;
  for (size_t dcurr = 0; dcurr < size; ++dcurr) {
    uint8_t b = prefix[dcurr];
    uint16_t ncurr = states[dcurr];
    uint16_t nnext = nfa[b] & ((ncurr << 1) | 1);
    size_t dnext = dcurr+1;
    if (dnext == size)
      dnext = kShiftDFAFinal;
    states[dnext] = nnext;
  }

  // Sort and unique the bytes of the prefix to avoid repeating work while we
  // record transitions. This clobbers the prefix, but it's no longer needed.
  std::sort(prefix.begin(), prefix.end());
  prefix.erase(std::unique(prefix.begin(), prefix.end()), prefix.end());

  // Construct the DFA.
  // The table is indexed by input byte; each element is effectively a packed
  // array of uint6_t; each array value will be multiplied by six in order to
  // avoid having to do so later in the hot loop as well as masking/shifting.
  // Credits for this technique: "Shift-based DFAs" on GitHub by Per Vognsen.
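  //
  // A sketch of the packing (illustrative only): if DFA state 2 goes to DFA
  // state 3 on byte b, then bits [12..17] of dfa[b] hold the value 3*6 = 18.
  // The hot loop computes curr = dfa[b] >> (curr & 63); because (curr & 63)
  // is always (current state)*6, the shift discards the fields for the
  // lower-numbered states and leaves (next state)*6 in the low six bits -
  // no masking, and no multiplying, per input byte.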
  uint64_t* dfa = new uint64_t[256]{};
  // Record a transition from each state for each of the bytes of the prefix.
  // Note that all other input bytes go back to the initial state by default.
  for (size_t dcurr = 0; dcurr < size; ++dcurr) {
    for (uint8_t b : prefix) {
      uint16_t ncurr = states[dcurr];
      uint16_t nnext = nfa[b] & ((ncurr << 1) | 1);
      size_t dnext = 0;
      while (states[dnext] != nnext)
        ++dnext;
      dfa[b] |= static_cast<uint64_t>(dnext * 6) << (dcurr * 6);
      // Convert ASCII letters to uppercase and record the extra transitions.
      // Note that ASCII letters are guaranteed to be lowercase at this point
      // because that's how the parser normalises them. #FunFact: 'k' and 's'
      // match U+212A and U+017F, respectively, so they won't occur here when
      // using UTF-8 encoding because the parser will emit character classes.
      if ('a' <= b && b <= 'z') {
        b -= 'a' - 'A';
        dfa[b] |= static_cast<uint64_t>(dnext * 6) << (dcurr * 6);
      }
    }
  }
  // This lets the final state "saturate", which will matter for performance:
  // in the hot loop, we check for a match only at the end of each iteration,
  // so we must keep signalling the match until we get around to checking it.
  for (int b = 0; b < 256; ++b)
    dfa[b] |= static_cast<uint64_t>(kShiftDFAFinal * 6) << (kShiftDFAFinal * 6);

  return dfa;
}

void Prog::ConfigurePrefixAccel(const std::string& prefix,
                                bool prefix_foldcase) {
  prefix_foldcase_ = prefix_foldcase;
  prefix_size_ = prefix.size();
  if (prefix_foldcase_) {
    // Use PrefixAccel_ShiftDFA().
    // ... and no more than nine bytes of the prefix. (See above for details.)
    prefix_size_ = std::min(prefix_size_, kShiftDFAFinal);
    prefix_dfa_ = BuildShiftDFA(prefix.substr(0, prefix_size_));
  } else if (prefix_size_ != 1) {
    // Use PrefixAccel_FrontAndBack().
    prefix_front_ = prefix.front();
    prefix_back_ = prefix.back();
  } else {
    // Use memchr(3).
    prefix_front_ = prefix.front();
  }
}

const void* Prog::PrefixAccel_ShiftDFA(const void* data, size_t size) {
  if (size < prefix_size_)
    return NULL;

  uint64_t curr = 0;

  // At the time of writing, rough benchmarks on a Broadwell machine showed
  // that this unroll factor (i.e. eight) achieves a speedup factor of two.
  if (size >= 8) {
    const uint8_t* p = reinterpret_cast<const uint8_t*>(data);
    const uint8_t* endp = p + (size&~7);
    do {
      uint8_t b0 = p[0];
      uint8_t b1 = p[1];
      uint8_t b2 = p[2];
      uint8_t b3 = p[3];
      uint8_t b4 = p[4];
      uint8_t b5 = p[5];
      uint8_t b6 = p[6];
      uint8_t b7 = p[7];

      uint64_t next0 = prefix_dfa_[b0];
      uint64_t next1 = prefix_dfa_[b1];
      uint64_t next2 = prefix_dfa_[b2];
      uint64_t next3 = prefix_dfa_[b3];
      uint64_t next4 = prefix_dfa_[b4];
      uint64_t next5 = prefix_dfa_[b5];
      uint64_t next6 = prefix_dfa_[b6];
      uint64_t next7 = prefix_dfa_[b7];

      uint64_t curr0 = next0 >> (curr & 63);
      uint64_t curr1 = next1 >> (curr0 & 63);
      uint64_t curr2 = next2 >> (curr1 & 63);
      uint64_t curr3 = next3 >> (curr2 & 63);
      uint64_t curr4 = next4 >> (curr3 & 63);
      uint64_t curr5 = next5 >> (curr4 & 63);
      uint64_t curr6 = next6 >> (curr5 & 63);
      uint64_t curr7 = next7 >> (curr6 & 63);

      if ((curr7 & 63) == kShiftDFAFinal * 6) {
        // At the time of writing, using the same masking subexpressions from
        // the preceding lines caused Clang to clutter the hot loop computing
        // them - even though they aren't actually needed for shifting! Hence
        // these rewritten conditions, which achieve a speedup factor of two.
        if (((curr7-curr0) & 63) == 0) return p+1-prefix_size_;
        if (((curr7-curr1) & 63) == 0) return p+2-prefix_size_;
        if (((curr7-curr2) & 63) == 0) return p+3-prefix_size_;
        if (((curr7-curr3) & 63) == 0) return p+4-prefix_size_;
        if (((curr7-curr4) & 63) == 0) return p+5-prefix_size_;
        if (((curr7-curr5) & 63) == 0) return p+6-prefix_size_;
        if (((curr7-curr6) & 63) == 0) return p+7-prefix_size_;
        if (((curr7-curr7) & 63) == 0) return p+8-prefix_size_;
      }

      curr = curr7;
      p += 8;
    } while (p != endp);
    data = p;
    size = size&7;
  }

  const uint8_t* p = reinterpret_cast<const uint8_t*>(data);
  const uint8_t* endp = p + size;
  while (p != endp) {
    uint8_t b = *p++;
    uint64_t next = prefix_dfa_[b];
    curr = next >> (curr & 63);
    if ((curr & 63) == kShiftDFAFinal * 6)
      return p-prefix_size_;
  }
  return NULL;
}

#if defined(__AVX2__)
// Finds the least significant non-zero bit in n.
static int FindLSBSet(uint32_t n) {
  ABSL_DCHECK_NE(n, uint32_t{0});
#if defined(__GNUC__)
  return __builtin_ctz(n);
#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
  unsigned long c;
  _BitScanForward(&c, n);
  return static_cast<int>(c);
#else
  int c = 31;
  for (int shift = 1 << 4; shift != 0; shift >>= 1) {
    uint32_t word = n << shift;
    if (word != 0) {
      n = word;
      c -= shift;
    }
  }
  return c;
#endif
}
#endif

const void* Prog::PrefixAccel_FrontAndBack(const void* data, size_t size) {
  ABSL_DCHECK_GE(prefix_size_, size_t{2});
  if (size < prefix_size_)
    return NULL;
  // Don't bother searching the last prefix_size_-1 bytes for prefix_front_.
  // This also means that probing for prefix_back_ doesn't go out of bounds.
  size -= prefix_size_-1;

#if defined(__AVX2__)
  // Use AVX2 to look for prefix_front_ and prefix_back_ 32 bytes at a time.
  if (size >= sizeof(__m256i)) {
    const __m256i* fp = reinterpret_cast<const __m256i*>(
        reinterpret_cast<const char*>(data));
    const __m256i* bp = reinterpret_cast<const __m256i*>(
        reinterpret_cast<const char*>(data) + prefix_size_-1);
    const __m256i* endfp = fp + size/sizeof(__m256i);
    const __m256i f_set1 = _mm256_set1_epi8(prefix_front_);
    const __m256i b_set1 = _mm256_set1_epi8(prefix_back_);
    do {
      const __m256i f_loadu = _mm256_loadu_si256(fp++);
      const __m256i b_loadu = _mm256_loadu_si256(bp++);
      const __m256i f_cmpeq = _mm256_cmpeq_epi8(f_set1, f_loadu);
      const __m256i b_cmpeq = _mm256_cmpeq_epi8(b_set1, b_loadu);
      const int fb_testz = _mm256_testz_si256(f_cmpeq, b_cmpeq);
      if (fb_testz == 0) {  // ZF: 1 means zero, 0 means non-zero.
        const __m256i fb_and = _mm256_and_si256(f_cmpeq, b_cmpeq);
        const int fb_movemask = _mm256_movemask_epi8(fb_and);
        const int fb_ctz = FindLSBSet(fb_movemask);
        return reinterpret_cast<const char*>(fp-1) + fb_ctz;
      }
    } while (fp != endfp);
    data = fp;
    size = size%sizeof(__m256i);
  }
#endif

  const char* p0 = reinterpret_cast<const char*>(data);
  for (const char* p = p0;; p++) {
    ABSL_DCHECK_GE(size, static_cast<size_t>(p-p0));
    p = reinterpret_cast<const char*>(memchr(p, prefix_front_, size - (p-p0)));
    if (p == NULL || p[prefix_size_-1] == prefix_back_)
      return p;
  }
}

}  // namespace re2