• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2007 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Compiled regular expression representation.
6 // Tested by compile_test.cc
7 
8 #include "re2/prog.h"
9 
10 #include <stdint.h>
11 #include <string.h>
12 
13 #include <algorithm>
14 #include <string>
15 #include <utility>
16 #include <vector>
17 
18 #include "absl/base/attributes.h"
19 #include "absl/log/absl_check.h"
20 #include "absl/log/absl_log.h"
21 #include "absl/strings/str_format.h"
22 #include "absl/strings/string_view.h"
23 #include "re2/bitmap256.h"
24 #include "re2/pod_array.h"
25 #include "re2/sparse_array.h"
26 #include "re2/sparse_set.h"
27 
28 #if defined(__AVX2__)
29 #include <immintrin.h>
30 #ifdef _MSC_VER
31 #include <intrin.h>
32 #endif
33 #endif
34 
35 namespace re2 {
36 
37 // Constructors per Inst opcode
38 
InitAlt(uint32_t out,uint32_t out1)39 void Prog::Inst::InitAlt(uint32_t out, uint32_t out1) {
40   ABSL_DCHECK_EQ(out_opcode_, uint32_t{0});
41   set_out_opcode(out, kInstAlt);
42   out1_ = out1;
43 }
44 
InitByteRange(int lo,int hi,int foldcase,uint32_t out)45 void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32_t out) {
46   ABSL_DCHECK_EQ(out_opcode_, uint32_t{0});
47   set_out_opcode(out, kInstByteRange);
48   lo_ = lo & 0xFF;
49   hi_ = hi & 0xFF;
50   hint_foldcase_ = foldcase&1;
51 }
52 
InitCapture(int cap,uint32_t out)53 void Prog::Inst::InitCapture(int cap, uint32_t out) {
54   ABSL_DCHECK_EQ(out_opcode_, uint32_t{0});
55   set_out_opcode(out, kInstCapture);
56   cap_ = cap;
57 }
58 
InitEmptyWidth(EmptyOp empty,uint32_t out)59 void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32_t out) {
60   ABSL_DCHECK_EQ(out_opcode_, uint32_t{0});
61   set_out_opcode(out, kInstEmptyWidth);
62   empty_ = empty;
63 }
64 
InitMatch(int32_t id)65 void Prog::Inst::InitMatch(int32_t id) {
66   ABSL_DCHECK_EQ(out_opcode_, uint32_t{0});
67   set_opcode(kInstMatch);
68   match_id_ = id;
69 }
70 
InitNop(uint32_t out)71 void Prog::Inst::InitNop(uint32_t out) {
72   ABSL_DCHECK_EQ(out_opcode_, uint32_t{0});
73   set_opcode(kInstNop);
74 }
75 
InitFail()76 void Prog::Inst::InitFail() {
77   ABSL_DCHECK_EQ(out_opcode_, uint32_t{0});
78   set_opcode(kInstFail);
79 }
80 
Dump()81 std::string Prog::Inst::Dump() {
82   switch (opcode()) {
83     default:
84       return absl::StrFormat("opcode %d", static_cast<int>(opcode()));
85 
86     case kInstAlt:
87       return absl::StrFormat("alt -> %d | %d", out(), out1_);
88 
89     case kInstAltMatch:
90       return absl::StrFormat("altmatch -> %d | %d", out(), out1_);
91 
92     case kInstByteRange:
93       return absl::StrFormat("byte%s [%02x-%02x] %d -> %d",
94                              foldcase() ? "/i" : "",
95                              lo_, hi_, hint(), out());
96 
97     case kInstCapture:
98       return absl::StrFormat("capture %d -> %d", cap_, out());
99 
100     case kInstEmptyWidth:
101       return absl::StrFormat("emptywidth %#x -> %d",
102                              static_cast<int>(empty_), out());
103 
104     case kInstMatch:
105       return absl::StrFormat("match! %d", match_id());
106 
107     case kInstNop:
108       return absl::StrFormat("nop -> %d", out());
109 
110     case kInstFail:
111       return absl::StrFormat("fail");
112   }
113 }
114 
Prog()115 Prog::Prog()
116   : anchor_start_(false),
117     anchor_end_(false),
118     reversed_(false),
119     did_flatten_(false),
120     did_onepass_(false),
121     start_(0),
122     start_unanchored_(0),
123     size_(0),
124     bytemap_range_(0),
125     prefix_foldcase_(false),
126     prefix_size_(0),
127     list_count_(0),
128     bit_state_text_max_size_(0),
129     dfa_mem_(0),
130     dfa_first_(NULL),
131     dfa_longest_(NULL) {
132 }
133 
~Prog()134 Prog::~Prog() {
135   DeleteDFA(dfa_longest_);
136   DeleteDFA(dfa_first_);
137   if (prefix_foldcase_)
138     delete[] prefix_dfa_;
139 }
140 
141 typedef SparseSet Workq;
142 
AddToQueue(Workq * q,int id)143 static inline void AddToQueue(Workq* q, int id) {
144   if (id != 0)
145     q->insert(id);
146 }
147 
ProgToString(Prog * prog,Workq * q)148 static std::string ProgToString(Prog* prog, Workq* q) {
149   std::string s;
150   for (Workq::iterator i = q->begin(); i != q->end(); ++i) {
151     int id = *i;
152     Prog::Inst* ip = prog->inst(id);
153     s += absl::StrFormat("%d. %s\n", id, ip->Dump());
154     AddToQueue(q, ip->out());
155     if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch)
156       AddToQueue(q, ip->out1());
157   }
158   return s;
159 }
160 
FlattenedProgToString(Prog * prog,int start)161 static std::string FlattenedProgToString(Prog* prog, int start) {
162   std::string s;
163   for (int id = start; id < prog->size(); id++) {
164     Prog::Inst* ip = prog->inst(id);
165     if (ip->last())
166       s += absl::StrFormat("%d. %s\n", id, ip->Dump());
167     else
168       s += absl::StrFormat("%d+ %s\n", id, ip->Dump());
169   }
170   return s;
171 }
172 
Dump()173 std::string Prog::Dump() {
174   if (did_flatten_)
175     return FlattenedProgToString(this, start_);
176 
177   Workq q(size_);
178   AddToQueue(&q, start_);
179   return ProgToString(this, &q);
180 }
181 
DumpUnanchored()182 std::string Prog::DumpUnanchored() {
183   if (did_flatten_)
184     return FlattenedProgToString(this, start_unanchored_);
185 
186   Workq q(size_);
187   AddToQueue(&q, start_unanchored_);
188   return ProgToString(this, &q);
189 }
190 
DumpByteMap()191 std::string Prog::DumpByteMap() {
192   std::string map;
193   for (int c = 0; c < 256; c++) {
194     int b = bytemap_[c];
195     int lo = c;
196     while (c < 256-1 && bytemap_[c+1] == b)
197       c++;
198     int hi = c;
199     map += absl::StrFormat("[%02x-%02x] -> %d\n", lo, hi, b);
200   }
201   return map;
202 }
203 
204 // Is ip a guaranteed match at end of text, perhaps after some capturing?
IsMatch(Prog * prog,Prog::Inst * ip)205 static bool IsMatch(Prog* prog, Prog::Inst* ip) {
206   for (;;) {
207     switch (ip->opcode()) {
208       default:
209         ABSL_LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
210         return false;
211 
212       case kInstAlt:
213       case kInstAltMatch:
214       case kInstByteRange:
215       case kInstFail:
216       case kInstEmptyWidth:
217         return false;
218 
219       case kInstCapture:
220       case kInstNop:
221         ip = prog->inst(ip->out());
222         break;
223 
224       case kInstMatch:
225         return true;
226     }
227   }
228 }
229 
230 // Peep-hole optimizer.
Optimize()231 void Prog::Optimize() {
232   Workq q(size_);
233 
234   // Eliminate nops.  Most are taken out during compilation
235   // but a few are hard to avoid.
236   q.clear();
237   AddToQueue(&q, start_);
238   for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
239     int id = *i;
240 
241     Inst* ip = inst(id);
242     int j = ip->out();
243     Inst* jp;
244     while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
245       j = jp->out();
246     }
247     ip->set_out(j);
248     AddToQueue(&q, ip->out());
249 
250     if (ip->opcode() == kInstAlt) {
251       j = ip->out1();
252       while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
253         j = jp->out();
254       }
255       ip->out1_ = j;
256       AddToQueue(&q, ip->out1());
257     }
258   }
259 
260   // Insert kInstAltMatch instructions
261   // Look for
262   //   ip: Alt -> j | k
263   //	  j: ByteRange [00-FF] -> ip
264   //    k: Match
265   // or the reverse (the above is the greedy one).
266   // Rewrite Alt to AltMatch.
267   q.clear();
268   AddToQueue(&q, start_);
269   for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
270     int id = *i;
271     Inst* ip = inst(id);
272     AddToQueue(&q, ip->out());
273     if (ip->opcode() == kInstAlt)
274       AddToQueue(&q, ip->out1());
275 
276     if (ip->opcode() == kInstAlt) {
277       Inst* j = inst(ip->out());
278       Inst* k = inst(ip->out1());
279       if (j->opcode() == kInstByteRange && j->out() == id &&
280           j->lo() == 0x00 && j->hi() == 0xFF &&
281           IsMatch(this, k)) {
282         ip->set_opcode(kInstAltMatch);
283         continue;
284       }
285       if (IsMatch(this, j) &&
286           k->opcode() == kInstByteRange && k->out() == id &&
287           k->lo() == 0x00 && k->hi() == 0xFF) {
288         ip->set_opcode(kInstAltMatch);
289       }
290     }
291   }
292 }
293 
EmptyFlags(absl::string_view text,const char * p)294 uint32_t Prog::EmptyFlags(absl::string_view text, const char* p) {
295   int flags = 0;
296 
297   // ^ and \A
298   if (p == text.data())
299     flags |= kEmptyBeginText | kEmptyBeginLine;
300   else if (p[-1] == '\n')
301     flags |= kEmptyBeginLine;
302 
303   // $ and \z
304   if (p == text.data() + text.size())
305     flags |= kEmptyEndText | kEmptyEndLine;
306   else if (p < text.data() + text.size() && p[0] == '\n')
307     flags |= kEmptyEndLine;
308 
309   // \b and \B
310   if (p == text.data() && p == text.data() + text.size()) {
311     // no word boundary here
312   } else if (p == text.data()) {
313     if (IsWordChar(p[0]))
314       flags |= kEmptyWordBoundary;
315   } else if (p == text.data() + text.size()) {
316     if (IsWordChar(p[-1]))
317       flags |= kEmptyWordBoundary;
318   } else {
319     if (IsWordChar(p[-1]) != IsWordChar(p[0]))
320       flags |= kEmptyWordBoundary;
321   }
322   if (!(flags & kEmptyWordBoundary))
323     flags |= kEmptyNonWordBoundary;
324 
325   return flags;
326 }
327 
328 // ByteMapBuilder implements a coloring algorithm.
329 //
330 // The first phase is a series of "mark and merge" batches: we mark one or more
331 // [lo-hi] ranges, then merge them into our internal state. Batching is not for
332 // performance; rather, it means that the ranges are treated indistinguishably.
333 //
334 // Internally, the ranges are represented using a bitmap that stores the splits
335 // and a vector that stores the colors; both of them are indexed by the ranges'
336 // last bytes. Thus, in order to merge a [lo-hi] range, we split at lo-1 and at
337 // hi (if not already split), then recolor each range in between. The color map
338 // (i.e. from the old color to the new color) is maintained for the lifetime of
339 // the batch and so underpins this somewhat obscure approach to set operations.
340 //
341 // The second phase builds the bytemap from our internal state: we recolor each
342 // range, then store the new color (which is now the byte class) in each of the
343 // corresponding array elements. Finally, we output the number of byte classes.
344 class ByteMapBuilder {
345  public:
ByteMapBuilder()346   ByteMapBuilder() {
347     // Initial state: the [0-255] range has color 256.
348     // This will avoid problems during the second phase,
349     // in which we assign byte classes numbered from 0.
350     splits_.Set(255);
351     colors_[255] = 256;
352     nextcolor_ = 257;
353   }
354 
355   void Mark(int lo, int hi);
356   void Merge();
357   void Build(uint8_t* bytemap, int* bytemap_range);
358 
359  private:
360   int Recolor(int oldcolor);
361 
362   Bitmap256 splits_;
363   int colors_[256];
364   int nextcolor_;
365   std::vector<std::pair<int, int>> colormap_;
366   std::vector<std::pair<int, int>> ranges_;
367 
368   ByteMapBuilder(const ByteMapBuilder&) = delete;
369   ByteMapBuilder& operator=(const ByteMapBuilder&) = delete;
370 };
371 
Mark(int lo,int hi)372 void ByteMapBuilder::Mark(int lo, int hi) {
373   ABSL_DCHECK_GE(lo, 0);
374   ABSL_DCHECK_GE(hi, 0);
375   ABSL_DCHECK_LE(lo, 255);
376   ABSL_DCHECK_LE(hi, 255);
377   ABSL_DCHECK_LE(lo, hi);
378 
379   // Ignore any [0-255] ranges. They cause us to recolor every range, which
380   // has no effect on the eventual result and is therefore a waste of time.
381   if (lo == 0 && hi == 255)
382     return;
383 
384   ranges_.emplace_back(lo, hi);
385 }
386 
Merge()387 void ByteMapBuilder::Merge() {
388   for (std::vector<std::pair<int, int>>::const_iterator it = ranges_.begin();
389        it != ranges_.end();
390        ++it) {
391     int lo = it->first-1;
392     int hi = it->second;
393 
394     if (0 <= lo && !splits_.Test(lo)) {
395       splits_.Set(lo);
396       int next = splits_.FindNextSetBit(lo+1);
397       colors_[lo] = colors_[next];
398     }
399     if (!splits_.Test(hi)) {
400       splits_.Set(hi);
401       int next = splits_.FindNextSetBit(hi+1);
402       colors_[hi] = colors_[next];
403     }
404 
405     int c = lo+1;
406     while (c < 256) {
407       int next = splits_.FindNextSetBit(c);
408       colors_[next] = Recolor(colors_[next]);
409       if (next == hi)
410         break;
411       c = next+1;
412     }
413   }
414   colormap_.clear();
415   ranges_.clear();
416 }
417 
Build(uint8_t * bytemap,int * bytemap_range)418 void ByteMapBuilder::Build(uint8_t* bytemap, int* bytemap_range) {
419   // Assign byte classes numbered from 0.
420   nextcolor_ = 0;
421 
422   int c = 0;
423   while (c < 256) {
424     int next = splits_.FindNextSetBit(c);
425     uint8_t b = static_cast<uint8_t>(Recolor(colors_[next]));
426     while (c <= next) {
427       bytemap[c] = b;
428       c++;
429     }
430   }
431 
432   *bytemap_range = nextcolor_;
433 }
434 
Recolor(int oldcolor)435 int ByteMapBuilder::Recolor(int oldcolor) {
436   // Yes, this is a linear search. There can be at most 256
437   // colors and there will typically be far fewer than that.
438   // Also, we need to consider keys *and* values in order to
439   // avoid recoloring a given range more than once per batch.
440   std::vector<std::pair<int, int>>::const_iterator it =
441       std::find_if(colormap_.begin(), colormap_.end(),
442                    [=](const std::pair<int, int>& kv) -> bool {
443                      return kv.first == oldcolor || kv.second == oldcolor;
444                    });
445   if (it != colormap_.end())
446     return it->second;
447   int newcolor = nextcolor_;
448   nextcolor_++;
449   colormap_.emplace_back(oldcolor, newcolor);
450   return newcolor;
451 }
452 
ComputeByteMap()453 void Prog::ComputeByteMap() {
454   // Fill in bytemap with byte classes for the program.
455   // Ranges of bytes that are treated indistinguishably
456   // will be mapped to a single byte class.
457   ByteMapBuilder builder;
458 
459   // Don't repeat the work for ^ and $.
460   bool marked_line_boundaries = false;
461   // Don't repeat the work for \b and \B.
462   bool marked_word_boundaries = false;
463 
464   for (int id = 0; id < size(); id++) {
465     Inst* ip = inst(id);
466     if (ip->opcode() == kInstByteRange) {
467       int lo = ip->lo();
468       int hi = ip->hi();
469       builder.Mark(lo, hi);
470       if (ip->foldcase() && lo <= 'z' && hi >= 'a') {
471         int foldlo = lo;
472         int foldhi = hi;
473         if (foldlo < 'a')
474           foldlo = 'a';
475         if (foldhi > 'z')
476           foldhi = 'z';
477         if (foldlo <= foldhi) {
478           foldlo += 'A' - 'a';
479           foldhi += 'A' - 'a';
480           builder.Mark(foldlo, foldhi);
481         }
482       }
483       // If this Inst is not the last Inst in its list AND the next Inst is
484       // also a ByteRange AND the Insts have the same out, defer the merge.
485       if (!ip->last() &&
486           inst(id+1)->opcode() == kInstByteRange &&
487           ip->out() == inst(id+1)->out())
488         continue;
489       builder.Merge();
490     } else if (ip->opcode() == kInstEmptyWidth) {
491       if (ip->empty() & (kEmptyBeginLine|kEmptyEndLine) &&
492           !marked_line_boundaries) {
493         builder.Mark('\n', '\n');
494         builder.Merge();
495         marked_line_boundaries = true;
496       }
497       if (ip->empty() & (kEmptyWordBoundary|kEmptyNonWordBoundary) &&
498           !marked_word_boundaries) {
499         // We require two batches here: the first for ranges that are word
500         // characters, the second for ranges that are not word characters.
501         for (bool isword : {true, false}) {
502           int j;
503           for (int i = 0; i < 256; i = j) {
504             for (j = i + 1; j < 256 &&
505                             Prog::IsWordChar(static_cast<uint8_t>(i)) ==
506                                 Prog::IsWordChar(static_cast<uint8_t>(j));
507                  j++)
508               ;
509             if (Prog::IsWordChar(static_cast<uint8_t>(i)) == isword)
510               builder.Mark(i, j - 1);
511           }
512           builder.Merge();
513         }
514         marked_word_boundaries = true;
515       }
516     }
517   }
518 
519   builder.Build(bytemap_, &bytemap_range_);
520 
521   if ((0)) {  // For debugging, use trivial bytemap.
522     ABSL_LOG(ERROR) << "Using trivial bytemap.";
523     for (int i = 0; i < 256; i++)
524       bytemap_[i] = static_cast<uint8_t>(i);
525     bytemap_range_ = 256;
526   }
527 }
528 
529 // Prog::Flatten() implements a graph rewriting algorithm.
530 //
531 // The overall process is similar to epsilon removal, but retains some epsilon
532 // transitions: those from Capture and EmptyWidth instructions; and those from
533 // nullable subexpressions. (The latter avoids quadratic blowup in transitions
534 // in the worst case.) It might be best thought of as Alt instruction elision.
535 //
536 // In conceptual terms, it divides the Prog into "trees" of instructions, then
537 // traverses the "trees" in order to produce "lists" of instructions. A "tree"
538 // is one or more instructions that grow from one "root" instruction to one or
539 // more "leaf" instructions; if a "tree" has exactly one instruction, then the
540 // "root" is also the "leaf". In most cases, a "root" is the successor of some
541 // "leaf" (i.e. the "leaf" instruction's out() returns the "root" instruction)
542 // and is considered a "successor root". A "leaf" can be a ByteRange, Capture,
543 // EmptyWidth or Match instruction. However, this is insufficient for handling
544 // nested nullable subexpressions correctly, so in some cases, a "root" is the
545 // dominator of the instructions reachable from some "successor root" (i.e. it
546 // has an unreachable predecessor) and is considered a "dominator root". Since
547 // only Alt instructions can be "dominator roots" (other instructions would be
548 // "leaves"), only Alt instructions are required to be marked as predecessors.
549 //
550 // Dividing the Prog into "trees" comprises two passes: marking the "successor
551 // roots" and the predecessors; and marking the "dominator roots". Sorting the
552 // "successor roots" by their bytecode offsets enables iteration in order from
553 // greatest to least during the second pass; by working backwards in this case
554 // and flooding the graph no further than "leaves" and already marked "roots",
555 // it becomes possible to mark "dominator roots" without doing excessive work.
556 //
557 // Traversing the "trees" is just iterating over the "roots" in order of their
558 // marking and flooding the graph no further than "leaves" and "roots". When a
559 // "leaf" is reached, the instruction is copied with its successor remapped to
560 // its "root" number. When a "root" is reached, a Nop instruction is generated
561 // with its successor remapped similarly. As each "list" is produced, its last
562 // instruction is marked as such. After all of the "lists" have been produced,
563 // a pass over their instructions remaps their successors to bytecode offsets.
Flatten()564 void Prog::Flatten() {
565   if (did_flatten_)
566     return;
567   did_flatten_ = true;
568 
569   // Scratch structures. It's important that these are reused by functions
570   // that we call in loops because they would thrash the heap otherwise.
571   SparseSet reachable(size());
572   std::vector<int> stk;
573   stk.reserve(size());
574 
575   // First pass: Marks "successor roots" and predecessors.
576   // Builds the mapping from inst-ids to root-ids.
577   SparseArray<int> rootmap(size());
578   SparseArray<int> predmap(size());
579   std::vector<std::vector<int>> predvec;
580   MarkSuccessors(&rootmap, &predmap, &predvec, &reachable, &stk);
581 
582   // Second pass: Marks "dominator roots".
583   SparseArray<int> sorted(rootmap);
584   std::sort(sorted.begin(), sorted.end(), sorted.less);
585   for (SparseArray<int>::const_iterator i = sorted.end() - 1;
586        i != sorted.begin();
587        --i) {
588     if (i->index() != start_unanchored() && i->index() != start())
589       MarkDominator(i->index(), &rootmap, &predmap, &predvec, &reachable, &stk);
590   }
591 
592   // Third pass: Emits "lists". Remaps outs to root-ids.
593   // Builds the mapping from root-ids to flat-ids.
594   std::vector<int> flatmap(rootmap.size());
595   std::vector<Inst> flat;
596   flat.reserve(size());
597   for (SparseArray<int>::const_iterator i = rootmap.begin();
598        i != rootmap.end();
599        ++i) {
600     flatmap[i->value()] = static_cast<int>(flat.size());
601     EmitList(i->index(), &rootmap, &flat, &reachable, &stk);
602     flat.back().set_last();
603     // We have the bounds of the "list", so this is the
604     // most convenient point at which to compute hints.
605     ComputeHints(&flat, flatmap[i->value()], static_cast<int>(flat.size()));
606   }
607 
608   list_count_ = static_cast<int>(flatmap.size());
609   for (int i = 0; i < kNumInst; i++)
610     inst_count_[i] = 0;
611 
612   // Fourth pass: Remaps outs to flat-ids.
613   // Counts instructions by opcode.
614   for (int id = 0; id < static_cast<int>(flat.size()); id++) {
615     Inst* ip = &flat[id];
616     if (ip->opcode() != kInstAltMatch)  // handled in EmitList()
617       ip->set_out(flatmap[ip->out()]);
618     inst_count_[ip->opcode()]++;
619   }
620 
621 #if !defined(NDEBUG)
622   // Address a `-Wunused-but-set-variable' warning from Clang 13.x.
623   size_t total = 0;
624   for (int i = 0; i < kNumInst; i++)
625     total += inst_count_[i];
626   ABSL_CHECK_EQ(total, flat.size());
627 #endif
628 
629   // Remap start_unanchored and start.
630   if (start_unanchored() == 0) {
631     ABSL_DCHECK_EQ(start(), 0);
632   } else if (start_unanchored() == start()) {
633     set_start_unanchored(flatmap[1]);
634     set_start(flatmap[1]);
635   } else {
636     set_start_unanchored(flatmap[1]);
637     set_start(flatmap[2]);
638   }
639 
640   // Finally, replace the old instructions with the new instructions.
641   size_ = static_cast<int>(flat.size());
642   inst_ = PODArray<Inst>(size_);
643   memmove(inst_.data(), flat.data(), size_*sizeof inst_[0]);
644 
645   // Populate the list heads for BitState.
646   // 512 instructions limits the memory footprint to 1KiB.
647   if (size_ <= 512) {
648     list_heads_ = PODArray<uint16_t>(size_);
649     // 0xFF makes it more obvious if we try to look up a non-head.
650     memset(list_heads_.data(), 0xFF, size_*sizeof list_heads_[0]);
651     for (int i = 0; i < list_count_; ++i)
652       list_heads_[flatmap[i]] = i;
653   }
654 
655   // BitState allocates a bitmap of size list_count_ * (text.size()+1)
656   // for tracking pairs of possibilities that it has already explored.
657   const size_t kBitStateBitmapMaxSize = 256*1024;  // max size in bits
658   bit_state_text_max_size_ = kBitStateBitmapMaxSize / list_count_ - 1;
659 }
660 
MarkSuccessors(SparseArray<int> * rootmap,SparseArray<int> * predmap,std::vector<std::vector<int>> * predvec,SparseSet * reachable,std::vector<int> * stk)661 void Prog::MarkSuccessors(SparseArray<int>* rootmap,
662                           SparseArray<int>* predmap,
663                           std::vector<std::vector<int>>* predvec,
664                           SparseSet* reachable, std::vector<int>* stk) {
665   // Mark the kInstFail instruction.
666   rootmap->set_new(0, rootmap->size());
667 
668   // Mark the start_unanchored and start instructions.
669   if (!rootmap->has_index(start_unanchored()))
670     rootmap->set_new(start_unanchored(), rootmap->size());
671   if (!rootmap->has_index(start()))
672     rootmap->set_new(start(), rootmap->size());
673 
674   reachable->clear();
675   stk->clear();
676   stk->push_back(start_unanchored());
677   while (!stk->empty()) {
678     int id = stk->back();
679     stk->pop_back();
680   Loop:
681     if (reachable->contains(id))
682       continue;
683     reachable->insert_new(id);
684 
685     Inst* ip = inst(id);
686     switch (ip->opcode()) {
687       default:
688         ABSL_LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
689         break;
690 
691       case kInstAltMatch:
692       case kInstAlt:
693         // Mark this instruction as a predecessor of each out.
694         for (int out : {ip->out(), ip->out1()}) {
695           if (!predmap->has_index(out)) {
696             predmap->set_new(out, static_cast<int>(predvec->size()));
697             predvec->emplace_back();
698           }
699           (*predvec)[predmap->get_existing(out)].emplace_back(id);
700         }
701         stk->push_back(ip->out1());
702         id = ip->out();
703         goto Loop;
704 
705       case kInstByteRange:
706       case kInstCapture:
707       case kInstEmptyWidth:
708         // Mark the out of this instruction as a "root".
709         if (!rootmap->has_index(ip->out()))
710           rootmap->set_new(ip->out(), rootmap->size());
711         id = ip->out();
712         goto Loop;
713 
714       case kInstNop:
715         id = ip->out();
716         goto Loop;
717 
718       case kInstMatch:
719       case kInstFail:
720         break;
721     }
722   }
723 }
724 
MarkDominator(int root,SparseArray<int> * rootmap,SparseArray<int> * predmap,std::vector<std::vector<int>> * predvec,SparseSet * reachable,std::vector<int> * stk)725 void Prog::MarkDominator(int root, SparseArray<int>* rootmap,
726                          SparseArray<int>* predmap,
727                          std::vector<std::vector<int>>* predvec,
728                          SparseSet* reachable, std::vector<int>* stk) {
729   reachable->clear();
730   stk->clear();
731   stk->push_back(root);
732   while (!stk->empty()) {
733     int id = stk->back();
734     stk->pop_back();
735   Loop:
736     if (reachable->contains(id))
737       continue;
738     reachable->insert_new(id);
739 
740     if (id != root && rootmap->has_index(id)) {
741       // We reached another "tree" via epsilon transition.
742       continue;
743     }
744 
745     Inst* ip = inst(id);
746     switch (ip->opcode()) {
747       default:
748         ABSL_LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
749         break;
750 
751       case kInstAltMatch:
752       case kInstAlt:
753         stk->push_back(ip->out1());
754         id = ip->out();
755         goto Loop;
756 
757       case kInstByteRange:
758       case kInstCapture:
759       case kInstEmptyWidth:
760         break;
761 
762       case kInstNop:
763         id = ip->out();
764         goto Loop;
765 
766       case kInstMatch:
767       case kInstFail:
768         break;
769     }
770   }
771 
772   for (SparseSet::const_iterator i = reachable->begin();
773        i != reachable->end();
774        ++i) {
775     int id = *i;
776     if (predmap->has_index(id)) {
777       for (int pred : (*predvec)[predmap->get_existing(id)]) {
778         if (!reachable->contains(pred)) {
779           // id has a predecessor that cannot be reached from root!
780           // Therefore, id must be a "root" too - mark it as such.
781           if (!rootmap->has_index(id))
782             rootmap->set_new(id, rootmap->size());
783         }
784       }
785     }
786   }
787 }
788 
EmitList(int root,SparseArray<int> * rootmap,std::vector<Inst> * flat,SparseSet * reachable,std::vector<int> * stk)789 void Prog::EmitList(int root, SparseArray<int>* rootmap,
790                     std::vector<Inst>* flat,
791                     SparseSet* reachable, std::vector<int>* stk) {
792   reachable->clear();
793   stk->clear();
794   stk->push_back(root);
795   while (!stk->empty()) {
796     int id = stk->back();
797     stk->pop_back();
798   Loop:
799     if (reachable->contains(id))
800       continue;
801     reachable->insert_new(id);
802 
803     if (id != root && rootmap->has_index(id)) {
804       // We reached another "tree" via epsilon transition. Emit a kInstNop
805       // instruction so that the Prog does not become quadratically larger.
806       flat->emplace_back();
807       flat->back().set_opcode(kInstNop);
808       flat->back().set_out(rootmap->get_existing(id));
809       continue;
810     }
811 
812     Inst* ip = inst(id);
813     switch (ip->opcode()) {
814       default:
815         ABSL_LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
816         break;
817 
818       case kInstAltMatch:
819         flat->emplace_back();
820         flat->back().set_opcode(kInstAltMatch);
821         flat->back().set_out(static_cast<int>(flat->size()));
822         flat->back().out1_ = static_cast<uint32_t>(flat->size())+1;
823         ABSL_FALLTHROUGH_INTENDED;
824 
825       case kInstAlt:
826         stk->push_back(ip->out1());
827         id = ip->out();
828         goto Loop;
829 
830       case kInstByteRange:
831       case kInstCapture:
832       case kInstEmptyWidth:
833         flat->emplace_back();
834         memmove(&flat->back(), ip, sizeof *ip);
835         flat->back().set_out(rootmap->get_existing(ip->out()));
836         break;
837 
838       case kInstNop:
839         id = ip->out();
840         goto Loop;
841 
842       case kInstMatch:
843       case kInstFail:
844         flat->emplace_back();
845         memmove(&flat->back(), ip, sizeof *ip);
846         break;
847     }
848   }
849 }
850 
851 // For each ByteRange instruction in [begin, end), computes a hint to execution
852 // engines: the delta to the next instruction (in flat) worth exploring iff the
853 // current instruction matched.
854 //
855 // Implements a coloring algorithm related to ByteMapBuilder, but in this case,
856 // colors are instructions and recoloring ranges precisely identifies conflicts
857 // between instructions. Iterating backwards over [begin, end) is guaranteed to
858 // identify the nearest conflict (if any) with only linear complexity.
ComputeHints(std::vector<Inst> * flat,int begin,int end)859 void Prog::ComputeHints(std::vector<Inst>* flat, int begin, int end) {
860   Bitmap256 splits;
861   int colors[256];
862 
863   bool dirty = false;
864   for (int id = end; id >= begin; --id) {
865     if (id == end ||
866         (*flat)[id].opcode() != kInstByteRange) {
867       if (dirty) {
868         dirty = false;
869         splits.Clear();
870       }
871       splits.Set(255);
872       colors[255] = id;
873       // At this point, the [0-255] range is colored with id.
874       // Thus, hints cannot point beyond id; and if id == end,
875       // hints that would have pointed to id will be 0 instead.
876       continue;
877     }
878     dirty = true;
879 
880     // We recolor the [lo-hi] range with id. Note that first ratchets backwards
881     // from end to the nearest conflict (if any) during recoloring.
882     int first = end;
883     auto Recolor = [&](int lo, int hi) {
884       // Like ByteMapBuilder, we split at lo-1 and at hi.
885       --lo;
886 
887       if (0 <= lo && !splits.Test(lo)) {
888         splits.Set(lo);
889         int next = splits.FindNextSetBit(lo+1);
890         colors[lo] = colors[next];
891       }
892       if (!splits.Test(hi)) {
893         splits.Set(hi);
894         int next = splits.FindNextSetBit(hi+1);
895         colors[hi] = colors[next];
896       }
897 
898       int c = lo+1;
899       while (c < 256) {
900         int next = splits.FindNextSetBit(c);
901         // Ratchet backwards...
902         first = std::min(first, colors[next]);
903         // Recolor with id - because it's the new nearest conflict!
904         colors[next] = id;
905         if (next == hi)
906           break;
907         c = next+1;
908       }
909     };
910 
911     Inst* ip = &(*flat)[id];
912     int lo = ip->lo();
913     int hi = ip->hi();
914     Recolor(lo, hi);
915     if (ip->foldcase() && lo <= 'z' && hi >= 'a') {
916       int foldlo = lo;
917       int foldhi = hi;
918       if (foldlo < 'a')
919         foldlo = 'a';
920       if (foldhi > 'z')
921         foldhi = 'z';
922       if (foldlo <= foldhi) {
923         foldlo += 'A' - 'a';
924         foldhi += 'A' - 'a';
925         Recolor(foldlo, foldhi);
926       }
927     }
928 
929     if (first != end) {
930       uint16_t hint = static_cast<uint16_t>(std::min(first - id, 32767));
931       ip->hint_foldcase_ |= hint<<1;
932     }
933   }
934 }
935 
936 // The final state will always be this, which frees up a register for the hot
937 // loop and thus avoids the spilling that can occur when building with Clang.
938 static const size_t kShiftDFAFinal = 9;
939 
940 // This function takes the prefix as std::string (i.e. not const std::string&
941 // as normal) because it's going to clobber it, so a temporary is convenient.
BuildShiftDFA(std::string prefix)942 static uint64_t* BuildShiftDFA(std::string prefix) {
943   // This constant is for convenience now and also for correctness later when
944   // we clobber the prefix, but still need to know how long it was initially.
945   const size_t size = prefix.size();
946 
947   // Construct the NFA.
948   // The table is indexed by input byte; each element is a bitfield of states
949   // reachable by the input byte. Given a bitfield of the current states, the
950   // bitfield of states reachable from those is - for this specific purpose -
951   // always ((ncurr << 1) | 1). Intersecting the reachability bitfields gives
952   // the bitfield of the next states reached by stepping over the input byte.
953   // Credits for this technique: the Hyperscan paper by Geoff Langdale et al.
954   uint16_t nfa[256]{};
955   for (size_t i = 0; i < size; ++i) {
956     uint8_t b = prefix[i];
957     nfa[b] |= 1 << (i+1);
958   }
959   // This is the `\C*?` for unanchored search.
960   for (int b = 0; b < 256; ++b)
961     nfa[b] |= 1;
962 
963   // This maps from DFA state to NFA states; the reverse mapping is used when
964   // recording transitions and gets implemented with plain old linear search.
965   // The "Shift DFA" technique limits this to ten states when using uint64_t;
966   // to allow for the initial state, we use at most nine bytes of the prefix.
967   // That same limit is also why uint16_t is sufficient for the NFA bitfield.
968   uint16_t states[kShiftDFAFinal+1]{};
969   states[0] = 1;
970   for (size_t dcurr = 0; dcurr < size; ++dcurr) {
971     uint8_t b = prefix[dcurr];
972     uint16_t ncurr = states[dcurr];
973     uint16_t nnext = nfa[b] & ((ncurr << 1) | 1);
974     size_t dnext = dcurr+1;
975     if (dnext == size)
976       dnext = kShiftDFAFinal;
977     states[dnext] = nnext;
978   }
979 
980   // Sort and unique the bytes of the prefix to avoid repeating work while we
981   // record transitions. This clobbers the prefix, but it's no longer needed.
982   std::sort(prefix.begin(), prefix.end());
983   prefix.erase(std::unique(prefix.begin(), prefix.end()), prefix.end());
984 
985   // Construct the DFA.
986   // The table is indexed by input byte; each element is effectively a packed
987   // array of uint6_t; each array value will be multiplied by six in order to
988   // avoid having to do so later in the hot loop as well as masking/shifting.
989   // Credits for this technique: "Shift-based DFAs" on GitHub by Per Vognsen.
990   uint64_t* dfa = new uint64_t[256]{};
991   // Record a transition from each state for each of the bytes of the prefix.
992   // Note that all other input bytes go back to the initial state by default.
993   for (size_t dcurr = 0; dcurr < size; ++dcurr) {
994     for (uint8_t b : prefix) {
995       uint16_t ncurr = states[dcurr];
996       uint16_t nnext = nfa[b] & ((ncurr << 1) | 1);
997       size_t dnext = 0;
998       while (states[dnext] != nnext)
999         ++dnext;
1000       dfa[b] |= static_cast<uint64_t>(dnext * 6) << (dcurr * 6);
1001       // Convert ASCII letters to uppercase and record the extra transitions.
1002       // Note that ASCII letters are guaranteed to be lowercase at this point
1003       // because that's how the parser normalises them. #FunFact: 'k' and 's'
1004       // match U+212A and U+017F, respectively, so they won't occur here when
1005       // using UTF-8 encoding because the parser will emit character classes.
1006       if ('a' <= b && b <= 'z') {
1007         b -= 'a' - 'A';
1008         dfa[b] |= static_cast<uint64_t>(dnext * 6) << (dcurr * 6);
1009       }
1010     }
1011   }
1012   // This lets the final state "saturate", which will matter for performance:
1013   // in the hot loop, we check for a match only at the end of each iteration,
1014   // so we must keep signalling the match until we get around to checking it.
1015   for (int b = 0; b < 256; ++b)
1016     dfa[b] |= static_cast<uint64_t>(kShiftDFAFinal * 6) << (kShiftDFAFinal * 6);
1017 
1018   return dfa;
1019 }
1020 
ConfigurePrefixAccel(const std::string & prefix,bool prefix_foldcase)1021 void Prog::ConfigurePrefixAccel(const std::string& prefix,
1022                                 bool prefix_foldcase) {
1023   prefix_foldcase_ = prefix_foldcase;
1024   prefix_size_ = prefix.size();
1025   if (prefix_foldcase_) {
1026     // Use PrefixAccel_ShiftDFA().
1027     // ... and no more than nine bytes of the prefix. (See above for details.)
1028     prefix_size_ = std::min(prefix_size_, kShiftDFAFinal);
1029     prefix_dfa_ = BuildShiftDFA(prefix.substr(0, prefix_size_));
1030   } else if (prefix_size_ != 1) {
1031     // Use PrefixAccel_FrontAndBack().
1032     prefix_front_ = prefix.front();
1033     prefix_back_ = prefix.back();
1034   } else {
1035     // Use memchr(3).
1036     prefix_front_ = prefix.front();
1037   }
1038 }
1039 
PrefixAccel_ShiftDFA(const void * data,size_t size)1040 const void* Prog::PrefixAccel_ShiftDFA(const void* data, size_t size) {
1041   if (size < prefix_size_)
1042     return NULL;
1043 
1044   uint64_t curr = 0;
1045 
1046   // At the time of writing, rough benchmarks on a Broadwell machine showed
1047   // that this unroll factor (i.e. eight) achieves a speedup factor of two.
1048   if (size >= 8) {
1049     const uint8_t* p = reinterpret_cast<const uint8_t*>(data);
1050     const uint8_t* endp = p + (size&~7);
1051     do {
1052       uint8_t b0 = p[0];
1053       uint8_t b1 = p[1];
1054       uint8_t b2 = p[2];
1055       uint8_t b3 = p[3];
1056       uint8_t b4 = p[4];
1057       uint8_t b5 = p[5];
1058       uint8_t b6 = p[6];
1059       uint8_t b7 = p[7];
1060 
1061       uint64_t next0 = prefix_dfa_[b0];
1062       uint64_t next1 = prefix_dfa_[b1];
1063       uint64_t next2 = prefix_dfa_[b2];
1064       uint64_t next3 = prefix_dfa_[b3];
1065       uint64_t next4 = prefix_dfa_[b4];
1066       uint64_t next5 = prefix_dfa_[b5];
1067       uint64_t next6 = prefix_dfa_[b6];
1068       uint64_t next7 = prefix_dfa_[b7];
1069 
1070       uint64_t curr0 = next0 >> (curr  & 63);
1071       uint64_t curr1 = next1 >> (curr0 & 63);
1072       uint64_t curr2 = next2 >> (curr1 & 63);
1073       uint64_t curr3 = next3 >> (curr2 & 63);
1074       uint64_t curr4 = next4 >> (curr3 & 63);
1075       uint64_t curr5 = next5 >> (curr4 & 63);
1076       uint64_t curr6 = next6 >> (curr5 & 63);
1077       uint64_t curr7 = next7 >> (curr6 & 63);
1078 
1079       if ((curr7 & 63) == kShiftDFAFinal * 6) {
1080         // At the time of writing, using the same masking subexpressions from
1081         // the preceding lines caused Clang to clutter the hot loop computing
1082         // them - even though they aren't actually needed for shifting! Hence
1083         // these rewritten conditions, which achieve a speedup factor of two.
1084         if (((curr7-curr0) & 63) == 0) return p+1-prefix_size_;
1085         if (((curr7-curr1) & 63) == 0) return p+2-prefix_size_;
1086         if (((curr7-curr2) & 63) == 0) return p+3-prefix_size_;
1087         if (((curr7-curr3) & 63) == 0) return p+4-prefix_size_;
1088         if (((curr7-curr4) & 63) == 0) return p+5-prefix_size_;
1089         if (((curr7-curr5) & 63) == 0) return p+6-prefix_size_;
1090         if (((curr7-curr6) & 63) == 0) return p+7-prefix_size_;
1091         if (((curr7-curr7) & 63) == 0) return p+8-prefix_size_;
1092       }
1093 
1094       curr = curr7;
1095       p += 8;
1096     } while (p != endp);
1097     data = p;
1098     size = size&7;
1099   }
1100 
1101   const uint8_t* p = reinterpret_cast<const uint8_t*>(data);
1102   const uint8_t* endp = p + size;
1103   while (p != endp) {
1104     uint8_t b = *p++;
1105     uint64_t next = prefix_dfa_[b];
1106     curr = next >> (curr & 63);
1107     if ((curr & 63) == kShiftDFAFinal * 6)
1108       return p-prefix_size_;
1109   }
1110   return NULL;
1111 }
1112 
1113 #if defined(__AVX2__)
1114 // Finds the least significant non-zero bit in n.
FindLSBSet(uint32_t n)1115 static int FindLSBSet(uint32_t n) {
1116   ABSL_DCHECK_NE(n, uint32_t{0});
1117 #if defined(__GNUC__)
1118   return __builtin_ctz(n);
1119 #elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
1120   unsigned long c;
1121   _BitScanForward(&c, n);
1122   return static_cast<int>(c);
1123 #else
1124   int c = 31;
1125   for (int shift = 1 << 4; shift != 0; shift >>= 1) {
1126     uint32_t word = n << shift;
1127     if (word != 0) {
1128       n = word;
1129       c -= shift;
1130     }
1131   }
1132   return c;
1133 #endif
1134 }
1135 #endif
1136 
PrefixAccel_FrontAndBack(const void * data,size_t size)1137 const void* Prog::PrefixAccel_FrontAndBack(const void* data, size_t size) {
1138   ABSL_DCHECK_GE(prefix_size_, size_t{2});
1139   if (size < prefix_size_)
1140     return NULL;
1141   // Don't bother searching the last prefix_size_-1 bytes for prefix_front_.
1142   // This also means that probing for prefix_back_ doesn't go out of bounds.
1143   size -= prefix_size_-1;
1144 
1145 #if defined(__AVX2__)
1146   // Use AVX2 to look for prefix_front_ and prefix_back_ 32 bytes at a time.
1147   if (size >= sizeof(__m256i)) {
1148     const __m256i* fp = reinterpret_cast<const __m256i*>(
1149         reinterpret_cast<const char*>(data));
1150     const __m256i* bp = reinterpret_cast<const __m256i*>(
1151         reinterpret_cast<const char*>(data) + prefix_size_-1);
1152     const __m256i* endfp = fp + size/sizeof(__m256i);
1153     const __m256i f_set1 = _mm256_set1_epi8(prefix_front_);
1154     const __m256i b_set1 = _mm256_set1_epi8(prefix_back_);
1155     do {
1156       const __m256i f_loadu = _mm256_loadu_si256(fp++);
1157       const __m256i b_loadu = _mm256_loadu_si256(bp++);
1158       const __m256i f_cmpeq = _mm256_cmpeq_epi8(f_set1, f_loadu);
1159       const __m256i b_cmpeq = _mm256_cmpeq_epi8(b_set1, b_loadu);
1160       const int fb_testz = _mm256_testz_si256(f_cmpeq, b_cmpeq);
1161       if (fb_testz == 0) {  // ZF: 1 means zero, 0 means non-zero.
1162         const __m256i fb_and = _mm256_and_si256(f_cmpeq, b_cmpeq);
1163         const int fb_movemask = _mm256_movemask_epi8(fb_and);
1164         const int fb_ctz = FindLSBSet(fb_movemask);
1165         return reinterpret_cast<const char*>(fp-1) + fb_ctz;
1166       }
1167     } while (fp != endfp);
1168     data = fp;
1169     size = size%sizeof(__m256i);
1170   }
1171 #endif
1172 
1173   const char* p0 = reinterpret_cast<const char*>(data);
1174   for (const char* p = p0;; p++) {
1175     ABSL_DCHECK_GE(size, static_cast<size_t>(p-p0));
1176     p = reinterpret_cast<const char*>(memchr(p, prefix_front_, size - (p-p0)));
1177     if (p == NULL || p[prefix_size_-1] == prefix_back_)
1178       return p;
1179   }
1180 }
1181 
1182 }  // namespace re2
1183