• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/regexp/regexp-compiler.h"
6 
7 #include "src/execution/isolate.h"
8 #include "src/regexp/regexp.h"
9 #include "src/strings/unicode-inl.h"
10 #include "src/zone/zone-list-inl.h"
11 
12 #ifdef V8_INTL_SUPPORT
13 #include "src/base/strings.h"
14 #include "src/regexp/special-case.h"
15 #include "unicode/locid.h"
16 #include "unicode/uniset.h"
17 #include "unicode/utypes.h"
18 #endif  // V8_INTL_SUPPORT
19 
20 namespace v8 {
21 namespace internal {
22 
23 using namespace regexp_compiler_constants;  // NOLINT(build/namespaces)
24 
25 constexpr base::uc32 kMaxCodePoint = 0x10ffff;
26 constexpr int kMaxUtf16CodeUnit = 0xffff;
27 constexpr uint32_t kMaxUtf16CodeUnitU = 0xffff;
28 constexpr int32_t kMaxOneByteCharCode = unibrow::Latin1::kMaxChar;
29 
30 // -------------------------------------------------------------------
31 // Tree to graph conversion
32 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)33 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
34                                RegExpNode* on_success) {
35   ZoneList<TextElement>* elms =
36       compiler->zone()->New<ZoneList<TextElement>>(1, compiler->zone());
37   elms->Add(TextElement::Atom(this), compiler->zone());
38   return compiler->zone()->New<TextNode>(elms, compiler->read_backward(),
39                                          on_success);
40 }
41 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)42 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
43                                RegExpNode* on_success) {
44   return compiler->zone()->New<TextNode>(elements(), compiler->read_backward(),
45                                          on_success);
46 }
47 
48 namespace {
49 
CompareInverseRanges(ZoneList<CharacterRange> * ranges,const int * special_class,int length)50 bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
51                           const int* special_class, int length) {
52   length--;  // Remove final marker.
53 
54   DCHECK_EQ(kRangeEndMarker, special_class[length]);
55   DCHECK_NE(0, ranges->length());
56   DCHECK_NE(0, length);
57   DCHECK_NE(0, special_class[0]);
58 
59   if (ranges->length() != (length >> 1) + 1) return false;
60 
61   CharacterRange range = ranges->at(0);
62   if (range.from() != 0) return false;
63 
64   for (int i = 0; i < length; i += 2) {
65     if (static_cast<base::uc32>(special_class[i]) != (range.to() + 1)) {
66       return false;
67     }
68     range = ranges->at((i >> 1) + 1);
69     if (static_cast<base::uc32>(special_class[i + 1]) != range.from()) {
70       return false;
71     }
72   }
73 
74   return range.to() == kMaxCodePoint;
75 }
76 
CompareRanges(ZoneList<CharacterRange> * ranges,const int * special_class,int length)77 bool CompareRanges(ZoneList<CharacterRange>* ranges, const int* special_class,
78                    int length) {
79   length--;  // Remove final marker.
80 
81   DCHECK_EQ(kRangeEndMarker, special_class[length]);
82   if (ranges->length() * 2 != length) return false;
83 
84   for (int i = 0; i < length; i += 2) {
85     CharacterRange range = ranges->at(i >> 1);
86     if (range.from() != static_cast<base::uc32>(special_class[i]) ||
87         range.to() != static_cast<base::uc32>(special_class[i + 1] - 1)) {
88       return false;
89     }
90   }
91   return true;
92 }
93 
94 }  // namespace
95 
is_standard(Zone * zone)96 bool RegExpCharacterClass::is_standard(Zone* zone) {
97   // TODO(lrn): Remove need for this function, by not throwing away information
98   // along the way.
99   if (is_negated()) {
100     return false;
101   }
102   if (set_.is_standard()) {
103     return true;
104   }
105   if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
106     set_.set_standard_set_type(StandardCharacterSet::kWhitespace);
107     return true;
108   }
109   if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
110     set_.set_standard_set_type(StandardCharacterSet::kNotWhitespace);
111     return true;
112   }
113   if (CompareInverseRanges(set_.ranges(zone), kLineTerminatorRanges,
114                            kLineTerminatorRangeCount)) {
115     set_.set_standard_set_type(StandardCharacterSet::kNotLineTerminator);
116     return true;
117   }
118   if (CompareRanges(set_.ranges(zone), kLineTerminatorRanges,
119                     kLineTerminatorRangeCount)) {
120     set_.set_standard_set_type(StandardCharacterSet::kLineTerminator);
121     return true;
122   }
123   if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
124     set_.set_standard_set_type(StandardCharacterSet::kWord);
125     return true;
126   }
127   if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
128     set_.set_standard_set_type(StandardCharacterSet::kNotWord);
129     return true;
130   }
131   return false;
132 }
133 
UnicodeRangeSplitter(ZoneList<CharacterRange> * base)134 UnicodeRangeSplitter::UnicodeRangeSplitter(ZoneList<CharacterRange>* base) {
135   // The unicode range splitter categorizes given character ranges into:
136   // - Code points from the BMP representable by one code unit.
137   // - Code points outside the BMP that need to be split into surrogate pairs.
138   // - Lone lead surrogates.
139   // - Lone trail surrogates.
140   // Lone surrogates are valid code points, even though no actual characters.
141   // They require special matching to make sure we do not split surrogate pairs.
142 
143   for (int i = 0; i < base->length(); i++) AddRange(base->at(i));
144 }
145 
AddRange(CharacterRange range)146 void UnicodeRangeSplitter::AddRange(CharacterRange range) {
147   static constexpr base::uc32 kBmp1Start = 0;
148   static constexpr base::uc32 kBmp1End = kLeadSurrogateStart - 1;
149   static constexpr base::uc32 kBmp2Start = kTrailSurrogateEnd + 1;
150   static constexpr base::uc32 kBmp2End = kNonBmpStart - 1;
151 
152   // Ends are all inclusive.
153   STATIC_ASSERT(kBmp1Start == 0);
154   STATIC_ASSERT(kBmp1Start < kBmp1End);
155   STATIC_ASSERT(kBmp1End + 1 == kLeadSurrogateStart);
156   STATIC_ASSERT(kLeadSurrogateStart < kLeadSurrogateEnd);
157   STATIC_ASSERT(kLeadSurrogateEnd + 1 == kTrailSurrogateStart);
158   STATIC_ASSERT(kTrailSurrogateStart < kTrailSurrogateEnd);
159   STATIC_ASSERT(kTrailSurrogateEnd + 1 == kBmp2Start);
160   STATIC_ASSERT(kBmp2Start < kBmp2End);
161   STATIC_ASSERT(kBmp2End + 1 == kNonBmpStart);
162   STATIC_ASSERT(kNonBmpStart < kNonBmpEnd);
163 
164   static constexpr base::uc32 kStarts[] = {
165       kBmp1Start, kLeadSurrogateStart, kTrailSurrogateStart,
166       kBmp2Start, kNonBmpStart,
167   };
168 
169   static constexpr base::uc32 kEnds[] = {
170       kBmp1End, kLeadSurrogateEnd, kTrailSurrogateEnd, kBmp2End, kNonBmpEnd,
171   };
172 
173   CharacterRangeVector* const kTargets[] = {
174       &bmp_, &lead_surrogates_, &trail_surrogates_, &bmp_, &non_bmp_,
175   };
176 
177   static constexpr int kCount = arraysize(kStarts);
178   STATIC_ASSERT(kCount == arraysize(kEnds));
179   STATIC_ASSERT(kCount == arraysize(kTargets));
180 
181   for (int i = 0; i < kCount; i++) {
182     if (kStarts[i] > range.to()) break;
183     const base::uc32 from = std::max(kStarts[i], range.from());
184     const base::uc32 to = std::min(kEnds[i], range.to());
185     if (from > to) continue;
186     kTargets[i]->emplace_back(CharacterRange::Range(from, to));
187   }
188 }
189 
190 namespace {
191 
192 // Translates between new and old V8-isms (SmallVector, ZoneList).
ToCanonicalZoneList(const UnicodeRangeSplitter::CharacterRangeVector * v,Zone * zone)193 ZoneList<CharacterRange>* ToCanonicalZoneList(
194     const UnicodeRangeSplitter::CharacterRangeVector* v, Zone* zone) {
195   if (v->empty()) return nullptr;
196 
197   ZoneList<CharacterRange>* result =
198       zone->New<ZoneList<CharacterRange>>(static_cast<int>(v->size()), zone);
199   for (size_t i = 0; i < v->size(); i++) {
200     result->Add(v->at(i), zone);
201   }
202 
203   CharacterRange::Canonicalize(result);
204   return result;
205 }
206 
AddBmpCharacters(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter)207 void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
208                       RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
209   ZoneList<CharacterRange>* bmp =
210       ToCanonicalZoneList(splitter->bmp(), compiler->zone());
211   if (bmp == nullptr) return;
212   result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
213       compiler->zone(), bmp, compiler->read_backward(), on_success)));
214 }
215 
216 using UC16Range = uint32_t;  // {from, to} packed into one uint32_t.
ToUC16Range(base::uc16 from,base::uc16 to)217 constexpr UC16Range ToUC16Range(base::uc16 from, base::uc16 to) {
218   return (static_cast<uint32_t>(from) << 16) | to;
219 }
ExtractFrom(UC16Range r)220 constexpr base::uc16 ExtractFrom(UC16Range r) {
221   return static_cast<base::uc16>(r >> 16);
222 }
ExtractTo(UC16Range r)223 constexpr base::uc16 ExtractTo(UC16Range r) {
224   return static_cast<base::uc16>(r);
225 }
226 
AddNonBmpSurrogatePairs(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter)227 void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
228                              RegExpNode* on_success,
229                              UnicodeRangeSplitter* splitter) {
230   DCHECK(!compiler->one_byte());
231   Zone* const zone = compiler->zone();
232   ZoneList<CharacterRange>* non_bmp =
233       ToCanonicalZoneList(splitter->non_bmp(), zone);
234   if (non_bmp == nullptr) return;
235 
236   // Translate each 32-bit code point range into the corresponding 16-bit code
237   // unit representation consisting of the lead- and trail surrogate.
238   //
239   // The generated alternatives are grouped by the leading surrogate to avoid
240   // emitting excessive code. For example, for
241   //
242   //  { \ud800[\udc00-\udc01]
243   //  , \ud800[\udc05-\udc06]
244   //  }
245   //
246   // there's no need to emit matching code for the leading surrogate \ud800
247   // twice. We also create a dedicated grouping for full trailing ranges, i.e.
248   // [dc00-dfff].
249   ZoneUnorderedMap<UC16Range, ZoneList<CharacterRange>*> grouped_by_leading(
250       zone);
251   ZoneList<CharacterRange>* leading_with_full_trailing_range =
252       zone->New<ZoneList<CharacterRange>>(1, zone);
253   const auto AddRange = [&](base::uc16 from_l, base::uc16 to_l,
254                             base::uc16 from_t, base::uc16 to_t) {
255     const UC16Range leading_range = ToUC16Range(from_l, to_l);
256     if (grouped_by_leading.count(leading_range) == 0) {
257       if (from_t == kTrailSurrogateStart && to_t == kTrailSurrogateEnd) {
258         leading_with_full_trailing_range->Add(
259             CharacterRange::Range(from_l, to_l), zone);
260         return;
261       }
262       grouped_by_leading[leading_range] =
263           zone->New<ZoneList<CharacterRange>>(2, zone);
264     }
265     grouped_by_leading[leading_range]->Add(CharacterRange::Range(from_t, to_t),
266                                            zone);
267   };
268 
269   // First, create the grouped ranges.
270   CharacterRange::Canonicalize(non_bmp);
271   for (int i = 0; i < non_bmp->length(); i++) {
272     // Match surrogate pair.
273     // E.g. [\u10005-\u11005] becomes
274     //      \ud800[\udc05-\udfff]|
275     //      [\ud801-\ud803][\udc00-\udfff]|
276     //      \ud804[\udc00-\udc05]
277     base::uc32 from = non_bmp->at(i).from();
278     base::uc32 to = non_bmp->at(i).to();
279     base::uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
280     base::uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
281     base::uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
282     base::uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
283 
284     if (from_l == to_l) {
285       // The lead surrogate is the same.
286       AddRange(from_l, to_l, from_t, to_t);
287       continue;
288     }
289 
290     if (from_t != kTrailSurrogateStart) {
291       // Add [from_l][from_t-\udfff].
292       AddRange(from_l, from_l, from_t, kTrailSurrogateEnd);
293       from_l++;
294     }
295     if (to_t != kTrailSurrogateEnd) {
296       // Add [to_l][\udc00-to_t].
297       AddRange(to_l, to_l, kTrailSurrogateStart, to_t);
298       to_l--;
299     }
300     if (from_l <= to_l) {
301       // Add [from_l-to_l][\udc00-\udfff].
302       AddRange(from_l, to_l, kTrailSurrogateStart, kTrailSurrogateEnd);
303     }
304   }
305 
306   // Create the actual TextNode now that ranges are fully grouped.
307   if (!leading_with_full_trailing_range->is_empty()) {
308     CharacterRange::Canonicalize(leading_with_full_trailing_range);
309     result->AddAlternative(GuardedAlternative(TextNode::CreateForSurrogatePair(
310         zone, leading_with_full_trailing_range,
311         CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
312         compiler->read_backward(), on_success)));
313   }
314   for (const auto& it : grouped_by_leading) {
315     CharacterRange leading_range =
316         CharacterRange::Range(ExtractFrom(it.first), ExtractTo(it.first));
317     ZoneList<CharacterRange>* trailing_ranges = it.second;
318     CharacterRange::Canonicalize(trailing_ranges);
319     result->AddAlternative(GuardedAlternative(TextNode::CreateForSurrogatePair(
320         zone, leading_range, trailing_ranges, compiler->read_backward(),
321         on_success)));
322   }
323 }
324 
NegativeLookaroundAgainstReadDirectionAndMatch(RegExpCompiler * compiler,ZoneList<CharacterRange> * lookbehind,ZoneList<CharacterRange> * match,RegExpNode * on_success,bool read_backward)325 RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
326     RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
327     ZoneList<CharacterRange>* match, RegExpNode* on_success,
328     bool read_backward) {
329   Zone* zone = compiler->zone();
330   RegExpNode* match_node = TextNode::CreateForCharacterRanges(
331       zone, match, read_backward, on_success);
332   int stack_register = compiler->UnicodeLookaroundStackRegister();
333   int position_register = compiler->UnicodeLookaroundPositionRegister();
334   RegExpLookaround::Builder lookaround(false, match_node, stack_register,
335                                        position_register);
336   RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
337       zone, lookbehind, !read_backward, lookaround.on_match_success());
338   return lookaround.ForMatch(negative_match);
339 }
340 
MatchAndNegativeLookaroundInReadDirection(RegExpCompiler * compiler,ZoneList<CharacterRange> * match,ZoneList<CharacterRange> * lookahead,RegExpNode * on_success,bool read_backward)341 RegExpNode* MatchAndNegativeLookaroundInReadDirection(
342     RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
343     ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
344     bool read_backward) {
345   Zone* zone = compiler->zone();
346   int stack_register = compiler->UnicodeLookaroundStackRegister();
347   int position_register = compiler->UnicodeLookaroundPositionRegister();
348   RegExpLookaround::Builder lookaround(false, on_success, stack_register,
349                                        position_register);
350   RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
351       zone, lookahead, read_backward, lookaround.on_match_success());
352   return TextNode::CreateForCharacterRanges(
353       zone, match, read_backward, lookaround.ForMatch(negative_match));
354 }
355 
AddLoneLeadSurrogates(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter)356 void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
357                            RegExpNode* on_success,
358                            UnicodeRangeSplitter* splitter) {
359   ZoneList<CharacterRange>* lead_surrogates =
360       ToCanonicalZoneList(splitter->lead_surrogates(), compiler->zone());
361   if (lead_surrogates == nullptr) return;
362   Zone* zone = compiler->zone();
363   // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
364   ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
365       zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
366 
367   RegExpNode* match;
368   if (compiler->read_backward()) {
369     // Reading backward. Assert that reading forward, there is no trail
370     // surrogate, and then backward match the lead surrogate.
371     match = NegativeLookaroundAgainstReadDirectionAndMatch(
372         compiler, trail_surrogates, lead_surrogates, on_success, true);
373   } else {
374     // Reading forward. Forward match the lead surrogate and assert that
375     // no trail surrogate follows.
376     match = MatchAndNegativeLookaroundInReadDirection(
377         compiler, lead_surrogates, trail_surrogates, on_success, false);
378   }
379   result->AddAlternative(GuardedAlternative(match));
380 }
381 
AddLoneTrailSurrogates(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter)382 void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
383                             RegExpNode* on_success,
384                             UnicodeRangeSplitter* splitter) {
385   ZoneList<CharacterRange>* trail_surrogates =
386       ToCanonicalZoneList(splitter->trail_surrogates(), compiler->zone());
387   if (trail_surrogates == nullptr) return;
388   Zone* zone = compiler->zone();
389   // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
390   ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
391       zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
392 
393   RegExpNode* match;
394   if (compiler->read_backward()) {
395     // Reading backward. Backward match the trail surrogate and assert that no
396     // lead surrogate precedes it.
397     match = MatchAndNegativeLookaroundInReadDirection(
398         compiler, trail_surrogates, lead_surrogates, on_success, true);
399   } else {
400     // Reading forward. Assert that reading backward, there is no lead
401     // surrogate, and then forward match the trail surrogate.
402     match = NegativeLookaroundAgainstReadDirectionAndMatch(
403         compiler, lead_surrogates, trail_surrogates, on_success, false);
404   }
405   result->AddAlternative(GuardedAlternative(match));
406 }
407 
UnanchoredAdvance(RegExpCompiler * compiler,RegExpNode * on_success)408 RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
409                               RegExpNode* on_success) {
410   // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
411   DCHECK(!compiler->read_backward());
412   Zone* zone = compiler->zone();
413   // Advance any character. If the character happens to be a lead surrogate and
414   // we advanced into the middle of a surrogate pair, it will work out, as
415   // nothing will match from there. We will have to advance again, consuming
416   // the associated trail surrogate.
417   ZoneList<CharacterRange>* range =
418       CharacterRange::List(zone, CharacterRange::Range(0, kMaxUtf16CodeUnit));
419   return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
420 }
421 
AddUnicodeCaseEquivalents(ZoneList<CharacterRange> * ranges,Zone * zone)422 void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) {
423 #ifdef V8_INTL_SUPPORT
424   DCHECK(CharacterRange::IsCanonical(ranges));
425 
426   // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver.
427   // See also https://crbug.com/v8/6727.
428   // TODO(jgruber): This only covers the special case of the {0,0x10FFFF} range,
429   // which we use frequently internally. But large ranges can also easily be
430   // created by the user. We might want to have a more general caching mechanism
431   // for such ranges.
432   if (ranges->length() == 1 && ranges->at(0).IsEverything(kNonBmpEnd)) return;
433 
434   // Use ICU to compute the case fold closure over the ranges.
435   icu::UnicodeSet set;
436   for (int i = 0; i < ranges->length(); i++) {
437     set.add(ranges->at(i).from(), ranges->at(i).to());
438   }
439   // Clear the ranges list without freeing the backing store.
440   ranges->Rewind(0);
441   set.closeOver(USET_CASE_INSENSITIVE);
442   // Full case mapping map single characters to multiple characters.
443   // Those are represented as strings in the set. Remove them so that
444   // we end up with only simple and common case mappings.
445   set.removeAllStrings();
446   for (int i = 0; i < set.getRangeCount(); i++) {
447     ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
448                 zone);
449   }
450   // No errors and everything we collected have been ranges.
451   CharacterRange::Canonicalize(ranges);
452 #endif  // V8_INTL_SUPPORT
453 }
454 
455 }  // namespace
456 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)457 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
458                                          RegExpNode* on_success) {
459   set_.Canonicalize();
460   Zone* const zone = compiler->zone();
461   ZoneList<CharacterRange>* ranges = this->ranges(zone);
462 
463   if (NeedsUnicodeCaseEquivalents(compiler->flags())) {
464     AddUnicodeCaseEquivalents(ranges, zone);
465   }
466 
467   if (!IsUnicode(compiler->flags()) || compiler->one_byte() ||
468       contains_split_surrogate()) {
469     return zone->New<TextNode>(this, compiler->read_backward(), on_success);
470   }
471 
472   if (is_negated()) {
473     ZoneList<CharacterRange>* negated =
474         zone->New<ZoneList<CharacterRange>>(2, zone);
475     CharacterRange::Negate(ranges, negated, zone);
476     ranges = negated;
477   }
478 
479   if (ranges->length() == 0) {
480     // The empty character class is used as a 'fail' node.
481     RegExpCharacterClass* fail = zone->New<RegExpCharacterClass>(zone, ranges);
482     return zone->New<TextNode>(fail, compiler->read_backward(), on_success);
483   }
484 
485   if (set_.is_standard() &&
486       standard_type() == StandardCharacterSet::kEverything) {
487     return UnanchoredAdvance(compiler, on_success);
488   }
489 
490   // Split ranges in order to handle surrogates correctly:
491   // - Surrogate pairs: translate the 32-bit code point into two uc16 code
492   //   units (irregexp operates only on code units).
493   // - Lone surrogates: these require lookarounds to ensure we don't match in
494   //   the middle of a surrogate pair.
495   ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
496   UnicodeRangeSplitter splitter(ranges);
497   AddBmpCharacters(compiler, result, on_success, &splitter);
498   AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
499   AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
500   AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
501 
502   static constexpr int kMaxRangesToInline = 32;  // Arbitrary.
503   if (ranges->length() > kMaxRangesToInline) result->SetDoNotInline();
504 
505   return result;
506 }
507 
508 namespace {
509 
CompareFirstChar(RegExpTree * const * a,RegExpTree * const * b)510 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
511   RegExpAtom* atom1 = (*a)->AsAtom();
512   RegExpAtom* atom2 = (*b)->AsAtom();
513   base::uc16 character1 = atom1->data().at(0);
514   base::uc16 character2 = atom2->data().at(0);
515   if (character1 < character2) return -1;
516   if (character1 > character2) return 1;
517   return 0;
518 }
519 
520 #ifdef V8_INTL_SUPPORT
521 
CompareCaseInsensitive(const icu::UnicodeString & a,const icu::UnicodeString & b)522 int CompareCaseInsensitive(const icu::UnicodeString& a,
523                            const icu::UnicodeString& b) {
524   return a.caseCompare(b, U_FOLD_CASE_DEFAULT);
525 }
526 
CompareFirstCharCaseInsensitive(RegExpTree * const * a,RegExpTree * const * b)527 int CompareFirstCharCaseInsensitive(RegExpTree* const* a,
528                                     RegExpTree* const* b) {
529   RegExpAtom* atom1 = (*a)->AsAtom();
530   RegExpAtom* atom2 = (*b)->AsAtom();
531   return CompareCaseInsensitive(icu::UnicodeString{atom1->data().at(0)},
532                                 icu::UnicodeString{atom2->data().at(0)});
533 }
534 
Equals(bool ignore_case,const icu::UnicodeString & a,const icu::UnicodeString & b)535 bool Equals(bool ignore_case, const icu::UnicodeString& a,
536             const icu::UnicodeString& b) {
537   if (a == b) return true;
538   if (ignore_case) return CompareCaseInsensitive(a, b) == 0;
539   return false;  // Case-sensitive equality already checked above.
540 }
541 
CharAtEquals(bool ignore_case,int index,const RegExpAtom * a,const RegExpAtom * b)542 bool CharAtEquals(bool ignore_case, int index, const RegExpAtom* a,
543                   const RegExpAtom* b) {
544   return Equals(ignore_case, a->data().at(index), b->data().at(index));
545 }
546 
547 #else
548 
Canonical(unibrow::Mapping<unibrow::Ecma262Canonicalize> * canonicalize,unibrow::uchar c)549 unibrow::uchar Canonical(
550     unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
551     unibrow::uchar c) {
552   unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
553   int length = canonicalize->get(c, '\0', chars);
554   DCHECK_LE(length, 1);
555   unibrow::uchar canonical = c;
556   if (length == 1) canonical = chars[0];
557   return canonical;
558 }
559 
CompareCaseInsensitive(unibrow::Mapping<unibrow::Ecma262Canonicalize> * canonicalize,unibrow::uchar a,unibrow::uchar b)560 int CompareCaseInsensitive(
561     unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
562     unibrow::uchar a, unibrow::uchar b) {
563   if (a == b) return 0;
564   if (a >= 'a' || b >= 'a') {
565     a = Canonical(canonicalize, a);
566     b = Canonical(canonicalize, b);
567   }
568   return static_cast<int>(a) - static_cast<int>(b);
569 }
570 
CompareFirstCharCaseInsensitive(unibrow::Mapping<unibrow::Ecma262Canonicalize> * canonicalize,RegExpTree * const * a,RegExpTree * const * b)571 int CompareFirstCharCaseInsensitive(
572     unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
573     RegExpTree* const* a, RegExpTree* const* b) {
574   RegExpAtom* atom1 = (*a)->AsAtom();
575   RegExpAtom* atom2 = (*b)->AsAtom();
576   return CompareCaseInsensitive(canonicalize, atom1->data().at(0),
577                                 atom2->data().at(0));
578 }
579 
Equals(bool ignore_case,unibrow::Mapping<unibrow::Ecma262Canonicalize> * canonicalize,unibrow::uchar a,unibrow::uchar b)580 bool Equals(bool ignore_case,
581             unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
582             unibrow::uchar a, unibrow::uchar b) {
583   if (a == b) return true;
584   if (ignore_case) {
585     return CompareCaseInsensitive(canonicalize, a, b) == 0;
586   }
587   return false;  // Case-sensitive equality already checked above.
588 }
589 
CharAtEquals(bool ignore_case,unibrow::Mapping<unibrow::Ecma262Canonicalize> * canonicalize,int index,const RegExpAtom * a,const RegExpAtom * b)590 bool CharAtEquals(bool ignore_case,
591                   unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
592                   int index, const RegExpAtom* a, const RegExpAtom* b) {
593   return Equals(ignore_case, canonicalize, a->data().at(index),
594                 b->data().at(index));
595 }
596 
597 #endif  // V8_INTL_SUPPORT
598 
599 }  // namespace
600 
601 // We can stable sort runs of atoms, since the order does not matter if they
602 // start with different characters.
603 // Returns true if any consecutive atoms were found.
SortConsecutiveAtoms(RegExpCompiler * compiler)604 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
605   ZoneList<RegExpTree*>* alternatives = this->alternatives();
606   int length = alternatives->length();
607   bool found_consecutive_atoms = false;
608   for (int i = 0; i < length; i++) {
609     while (i < length) {
610       RegExpTree* alternative = alternatives->at(i);
611       if (alternative->IsAtom()) break;
612       i++;
613     }
614     // i is length or it is the index of an atom.
615     if (i == length) break;
616     int first_atom = i;
617     i++;
618     while (i < length) {
619       RegExpTree* alternative = alternatives->at(i);
620       if (!alternative->IsAtom()) break;
621       i++;
622     }
623     // Sort atoms to get ones with common prefixes together.
624     // This step is more tricky if we are in a case-independent regexp,
625     // because it would change /is|I/ to /I|is/, and order matters when
626     // the regexp parts don't match only disjoint starting points. To fix
627     // this we have a version of CompareFirstChar that uses case-
628     // independent character classes for comparison.
629     DCHECK_LT(first_atom, alternatives->length());
630     DCHECK_LE(i, alternatives->length());
631     DCHECK_LE(first_atom, i);
632     if (IsIgnoreCase(compiler->flags())) {
633 #ifdef V8_INTL_SUPPORT
634       alternatives->StableSort(CompareFirstCharCaseInsensitive, first_atom,
635                                i - first_atom);
636 #else
637       unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
638           compiler->isolate()->regexp_macro_assembler_canonicalize();
639       auto compare_closure = [canonicalize](RegExpTree* const* a,
640                                             RegExpTree* const* b) {
641         return CompareFirstCharCaseInsensitive(canonicalize, a, b);
642       };
643       alternatives->StableSort(compare_closure, first_atom, i - first_atom);
644 #endif  // V8_INTL_SUPPORT
645     } else {
646       alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
647     }
648     if (i - first_atom > 1) found_consecutive_atoms = true;
649   }
650   return found_consecutive_atoms;
651 }
652 
653 // Optimizes ab|ac|az to a(?:b|c|d).
RationalizeConsecutiveAtoms(RegExpCompiler * compiler)654 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
655   Zone* zone = compiler->zone();
656   ZoneList<RegExpTree*>* alternatives = this->alternatives();
657   int length = alternatives->length();
658   const bool ignore_case = IsIgnoreCase(compiler->flags());
659 
660   int write_posn = 0;
661   int i = 0;
662   while (i < length) {
663     RegExpTree* alternative = alternatives->at(i);
664     if (!alternative->IsAtom()) {
665       alternatives->at(write_posn++) = alternatives->at(i);
666       i++;
667       continue;
668     }
669     RegExpAtom* const atom = alternative->AsAtom();
670 #ifdef V8_INTL_SUPPORT
671     icu::UnicodeString common_prefix(atom->data().at(0));
672 #else
673     unibrow::Mapping<unibrow::Ecma262Canonicalize>* const canonicalize =
674         compiler->isolate()->regexp_macro_assembler_canonicalize();
675     unibrow::uchar common_prefix = atom->data().at(0);
676     if (ignore_case) {
677       common_prefix = Canonical(canonicalize, common_prefix);
678     }
679 #endif  // V8_INTL_SUPPORT
680     int first_with_prefix = i;
681     int prefix_length = atom->length();
682     i++;
683     while (i < length) {
684       alternative = alternatives->at(i);
685       if (!alternative->IsAtom()) break;
686       RegExpAtom* const alt_atom = alternative->AsAtom();
687 #ifdef V8_INTL_SUPPORT
688       icu::UnicodeString new_prefix(alt_atom->data().at(0));
689       if (!Equals(ignore_case, new_prefix, common_prefix)) break;
690 #else
691       unibrow::uchar new_prefix = alt_atom->data().at(0);
692       if (!Equals(ignore_case, canonicalize, new_prefix, common_prefix)) break;
693 #endif  // V8_INTL_SUPPORT
694       prefix_length = std::min(prefix_length, alt_atom->length());
695       i++;
696     }
697     if (i > first_with_prefix + 2) {
698       // Found worthwhile run of alternatives with common prefix of at least one
699       // character.  The sorting function above did not sort on more than one
700       // character for reasons of correctness, but there may still be a longer
701       // common prefix if the terms were similar or presorted in the input.
702       // Find out how long the common prefix is.
703       int run_length = i - first_with_prefix;
704       RegExpAtom* const alt_atom =
705           alternatives->at(first_with_prefix)->AsAtom();
706       for (int j = 1; j < run_length && prefix_length > 1; j++) {
707         RegExpAtom* old_atom =
708             alternatives->at(j + first_with_prefix)->AsAtom();
709         for (int k = 1; k < prefix_length; k++) {
710 #ifdef V8_INTL_SUPPORT
711           if (!CharAtEquals(ignore_case, k, alt_atom, old_atom)) {
712 #else
713           if (!CharAtEquals(ignore_case, canonicalize, k, alt_atom, old_atom)) {
714 #endif  // V8_INTL_SUPPORT
715             prefix_length = k;
716             break;
717           }
718         }
719       }
720       RegExpAtom* prefix =
721           zone->New<RegExpAtom>(alt_atom->data().SubVector(0, prefix_length));
722       ZoneList<RegExpTree*>* pair = zone->New<ZoneList<RegExpTree*>>(2, zone);
723       pair->Add(prefix, zone);
724       ZoneList<RegExpTree*>* suffixes =
725           zone->New<ZoneList<RegExpTree*>>(run_length, zone);
726       for (int j = 0; j < run_length; j++) {
727         RegExpAtom* old_atom =
728             alternatives->at(j + first_with_prefix)->AsAtom();
729         int len = old_atom->length();
730         if (len == prefix_length) {
731           suffixes->Add(zone->New<RegExpEmpty>(), zone);
732         } else {
733           RegExpTree* suffix = zone->New<RegExpAtom>(
734               old_atom->data().SubVector(prefix_length, old_atom->length()));
735           suffixes->Add(suffix, zone);
736         }
737       }
738       pair->Add(zone->New<RegExpDisjunction>(suffixes), zone);
739       alternatives->at(write_posn++) = zone->New<RegExpAlternative>(pair);
740     } else {
741       // Just copy any non-worthwhile alternatives.
742       for (int j = first_with_prefix; j < i; j++) {
743         alternatives->at(write_posn++) = alternatives->at(j);
744       }
745     }
746   }
747   alternatives->Rewind(write_posn);  // Trim end of array.
748 }
749 
750 // Optimizes b|c|z to [bcz].
751 void RegExpDisjunction::FixSingleCharacterDisjunctions(
752     RegExpCompiler* compiler) {
753   Zone* zone = compiler->zone();
754   ZoneList<RegExpTree*>* alternatives = this->alternatives();
755   int length = alternatives->length();
756 
757   int write_posn = 0;
758   int i = 0;
759   while (i < length) {
760     RegExpTree* alternative = alternatives->at(i);
761     if (!alternative->IsAtom()) {
762       alternatives->at(write_posn++) = alternatives->at(i);
763       i++;
764       continue;
765     }
766     RegExpAtom* const atom = alternative->AsAtom();
767     if (atom->length() != 1) {
768       alternatives->at(write_posn++) = alternatives->at(i);
769       i++;
770       continue;
771     }
772     const RegExpFlags flags = compiler->flags();
773     DCHECK_IMPLIES(IsUnicode(flags),
774                    !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
775     bool contains_trail_surrogate =
776         unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
777     int first_in_run = i;
778     i++;
779     // Find a run of single-character atom alternatives that have identical
780     // flags (case independence and unicode-ness).
781     while (i < length) {
782       alternative = alternatives->at(i);
783       if (!alternative->IsAtom()) break;
784       RegExpAtom* const alt_atom = alternative->AsAtom();
785       if (alt_atom->length() != 1) break;
786       DCHECK_IMPLIES(IsUnicode(flags),
787                      !unibrow::Utf16::IsLeadSurrogate(alt_atom->data().at(0)));
788       contains_trail_surrogate |=
789           unibrow::Utf16::IsTrailSurrogate(alt_atom->data().at(0));
790       i++;
791     }
792     if (i > first_in_run + 1) {
793       // Found non-trivial run of single-character alternatives.
794       int run_length = i - first_in_run;
795       ZoneList<CharacterRange>* ranges =
796           zone->New<ZoneList<CharacterRange>>(2, zone);
797       for (int j = 0; j < run_length; j++) {
798         RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
799         DCHECK_EQ(old_atom->length(), 1);
800         ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
801       }
802       RegExpCharacterClass::CharacterClassFlags character_class_flags;
803       if (IsUnicode(flags) && contains_trail_surrogate) {
804         character_class_flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE;
805       }
806       alternatives->at(write_posn++) =
807           zone->New<RegExpCharacterClass>(zone, ranges, character_class_flags);
808     } else {
809       // Just copy any trivial alternatives.
810       for (int j = first_in_run; j < i; j++) {
811         alternatives->at(write_posn++) = alternatives->at(j);
812       }
813     }
814   }
815   alternatives->Rewind(write_posn);  // Trim end of array.
816 }
817 
818 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
819                                       RegExpNode* on_success) {
820   compiler->ToNodeMaybeCheckForStackOverflow();
821 
822   ZoneList<RegExpTree*>* alternatives = this->alternatives();
823 
824   if (alternatives->length() > 2) {
825     bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
826     if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
827     FixSingleCharacterDisjunctions(compiler);
828     if (alternatives->length() == 1) {
829       return alternatives->at(0)->ToNode(compiler, on_success);
830     }
831   }
832 
833   int length = alternatives->length();
834 
835   ChoiceNode* result =
836       compiler->zone()->New<ChoiceNode>(length, compiler->zone());
837   for (int i = 0; i < length; i++) {
838     GuardedAlternative alternative(
839         alternatives->at(i)->ToNode(compiler, on_success));
840     result->AddAlternative(alternative);
841   }
842   return result;
843 }
844 
845 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
846                                      RegExpNode* on_success) {
847   return ToNode(min(), max(), is_greedy(), body(), compiler, on_success);
848 }
849 
850 namespace {
851 // Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
852 //         \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
853 RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
854                                           RegExpNode* on_success,
855                                           RegExpAssertion::Type type,
856                                           RegExpFlags flags) {
857   CHECK(NeedsUnicodeCaseEquivalents(flags));
858   Zone* zone = compiler->zone();
859   ZoneList<CharacterRange>* word_range =
860       zone->New<ZoneList<CharacterRange>>(2, zone);
861   CharacterRange::AddClassEscape(StandardCharacterSet::kWord, word_range, true,
862                                  zone);
863   int stack_register = compiler->UnicodeLookaroundStackRegister();
864   int position_register = compiler->UnicodeLookaroundPositionRegister();
865   ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
866   // Add two choices. The (non-)boundary could start with a word or
867   // a non-word-character.
868   for (int i = 0; i < 2; i++) {
869     bool lookbehind_for_word = i == 0;
870     bool lookahead_for_word =
871         (type == RegExpAssertion::Type::BOUNDARY) ^ lookbehind_for_word;
872     // Look to the left.
873     RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
874                                          stack_register, position_register);
875     RegExpNode* backward = TextNode::CreateForCharacterRanges(
876         zone, word_range, true, lookbehind.on_match_success());
877     // Look to the right.
878     RegExpLookaround::Builder lookahead(lookahead_for_word,
879                                         lookbehind.ForMatch(backward),
880                                         stack_register, position_register);
881     RegExpNode* forward = TextNode::CreateForCharacterRanges(
882         zone, word_range, false, lookahead.on_match_success());
883     result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
884   }
885   return result;
886 }
887 }  // anonymous namespace
888 
889 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
890                                     RegExpNode* on_success) {
891   NodeInfo info;
892   Zone* zone = compiler->zone();
893 
894   switch (assertion_type()) {
895     case Type::START_OF_LINE:
896       return AssertionNode::AfterNewline(on_success);
897     case Type::START_OF_INPUT:
898       return AssertionNode::AtStart(on_success);
899     case Type::BOUNDARY:
900       return NeedsUnicodeCaseEquivalents(compiler->flags())
901                  ? BoundaryAssertionAsLookaround(
902                        compiler, on_success, Type::BOUNDARY, compiler->flags())
903                  : AssertionNode::AtBoundary(on_success);
904     case Type::NON_BOUNDARY:
905       return NeedsUnicodeCaseEquivalents(compiler->flags())
906                  ? BoundaryAssertionAsLookaround(compiler, on_success,
907                                                  Type::NON_BOUNDARY,
908                                                  compiler->flags())
909                  : AssertionNode::AtNonBoundary(on_success);
910     case Type::END_OF_INPUT:
911       return AssertionNode::AtEnd(on_success);
912     case Type::END_OF_LINE: {
913       // Compile $ in multiline regexps as an alternation with a positive
914       // lookahead in one side and an end-of-input on the other side.
915       // We need two registers for the lookahead.
916       int stack_pointer_register = compiler->AllocateRegister();
917       int position_register = compiler->AllocateRegister();
918       // The ChoiceNode to distinguish between a newline and end-of-input.
919       ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
920       // Create a newline atom.
921       ZoneList<CharacterRange>* newline_ranges =
922           zone->New<ZoneList<CharacterRange>>(3, zone);
923       CharacterRange::AddClassEscape(StandardCharacterSet::kLineTerminator,
924                                      newline_ranges, false, zone);
925       RegExpCharacterClass* newline_atom = zone->New<RegExpCharacterClass>(
926           StandardCharacterSet::kLineTerminator);
927       TextNode* newline_matcher =
928           zone->New<TextNode>(newline_atom, false,
929                               ActionNode::PositiveSubmatchSuccess(
930                                   stack_pointer_register, position_register,
931                                   0,   // No captures inside.
932                                   -1,  // Ignored if no captures.
933                                   on_success));
934       // Create an end-of-input matcher.
935       RegExpNode* end_of_line = ActionNode::BeginPositiveSubmatch(
936           stack_pointer_register, position_register, newline_matcher);
937       // Add the two alternatives to the ChoiceNode.
938       GuardedAlternative eol_alternative(end_of_line);
939       result->AddAlternative(eol_alternative);
940       GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
941       result->AddAlternative(end_alternative);
942       return result;
943     }
944     default:
945       UNREACHABLE();
946   }
947 }
948 
949 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
950                                         RegExpNode* on_success) {
951   return compiler->zone()->New<BackReferenceNode>(
952       RegExpCapture::StartRegister(index()),
953       RegExpCapture::EndRegister(index()), flags_, compiler->read_backward(),
954       on_success);
955 }
956 
957 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
958                                 RegExpNode* on_success) {
959   return on_success;
960 }
961 
962 RegExpNode* RegExpGroup::ToNode(RegExpCompiler* compiler,
963                                 RegExpNode* on_success) {
964   return body_->ToNode(compiler, on_success);
965 }
966 
967 RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
968                                    int stack_pointer_register,
969                                    int position_register,
970                                    int capture_register_count,
971                                    int capture_register_start)
972     : is_positive_(is_positive),
973       on_success_(on_success),
974       stack_pointer_register_(stack_pointer_register),
975       position_register_(position_register) {
976   if (is_positive_) {
977     on_match_success_ = ActionNode::PositiveSubmatchSuccess(
978         stack_pointer_register, position_register, capture_register_count,
979         capture_register_start, on_success_);
980   } else {
981     Zone* zone = on_success_->zone();
982     on_match_success_ = zone->New<NegativeSubmatchSuccess>(
983         stack_pointer_register, position_register, capture_register_count,
984         capture_register_start, zone);
985   }
986 }
987 
988 RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
989   if (is_positive_) {
990     return ActionNode::BeginPositiveSubmatch(stack_pointer_register_,
991                                              position_register_, match);
992   } else {
993     Zone* zone = on_success_->zone();
994     // We use a ChoiceNode to represent the negative lookaround. The first
995     // alternative is the negative match. On success, the end node backtracks.
996     // On failure, the second alternative is tried and leads to success.
997     // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
998     // first exit when calculating quick checks.
999     ChoiceNode* choice_node = zone->New<NegativeLookaroundChoiceNode>(
1000         GuardedAlternative(match), GuardedAlternative(on_success_), zone);
1001     return ActionNode::BeginNegativeSubmatch(stack_pointer_register_,
1002                                              position_register_, choice_node);
1003   }
1004 }
1005 
1006 RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
1007                                      RegExpNode* on_success) {
1008   int stack_pointer_register = compiler->AllocateRegister();
1009   int position_register = compiler->AllocateRegister();
1010 
1011   const int registers_per_capture = 2;
1012   const int register_of_first_capture = 2;
1013   int register_count = capture_count_ * registers_per_capture;
1014   int register_start =
1015       register_of_first_capture + capture_from_ * registers_per_capture;
1016 
1017   RegExpNode* result;
1018   bool was_reading_backward = compiler->read_backward();
1019   compiler->set_read_backward(type() == LOOKBEHIND);
1020   Builder builder(is_positive(), on_success, stack_pointer_register,
1021                   position_register, register_count, register_start);
1022   RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
1023   result = builder.ForMatch(match);
1024   compiler->set_read_backward(was_reading_backward);
1025   return result;
1026 }
1027 
1028 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
1029                                   RegExpNode* on_success) {
1030   return ToNode(body(), index(), compiler, on_success);
1031 }
1032 
1033 RegExpNode* RegExpCapture::ToNode(RegExpTree* body, int index,
1034                                   RegExpCompiler* compiler,
1035                                   RegExpNode* on_success) {
1036   DCHECK_NOT_NULL(body);
1037   int start_reg = RegExpCapture::StartRegister(index);
1038   int end_reg = RegExpCapture::EndRegister(index);
1039   if (compiler->read_backward()) std::swap(start_reg, end_reg);
1040   RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
1041   RegExpNode* body_node = body->ToNode(compiler, store_end);
1042   return ActionNode::StorePosition(start_reg, true, body_node);
1043 }
1044 
1045 namespace {
1046 
1047 class AssertionSequenceRewriter final {
1048  public:
1049   // TODO(jgruber): Consider moving this to a separate AST tree rewriter pass
1050   // instead of sprinkling rewrites into the AST->Node conversion process.
1051   static void MaybeRewrite(ZoneList<RegExpTree*>* terms, Zone* zone) {
1052     AssertionSequenceRewriter rewriter(terms, zone);
1053 
1054     static constexpr int kNoIndex = -1;
1055     int from = kNoIndex;
1056 
1057     for (int i = 0; i < terms->length(); i++) {
1058       RegExpTree* t = terms->at(i);
1059       if (from == kNoIndex && t->IsAssertion()) {
1060         from = i;  // Start a sequence.
1061       } else if (from != kNoIndex && !t->IsAssertion()) {
1062         // Terminate and process the sequence.
1063         if (i - from > 1) rewriter.Rewrite(from, i);
1064         from = kNoIndex;
1065       }
1066     }
1067 
1068     if (from != kNoIndex && terms->length() - from > 1) {
1069       rewriter.Rewrite(from, terms->length());
1070     }
1071   }
1072 
1073   // All assertions are zero width. A consecutive sequence of assertions is
1074   // order-independent. There's two ways we can optimize here:
1075   // 1. fold all identical assertions.
1076   // 2. if any assertion combinations are known to fail (e.g. \b\B), the entire
1077   //    sequence fails.
1078   void Rewrite(int from, int to) {
1079     DCHECK_GT(to, from + 1);
1080 
1081     // Bitfield of all seen assertions.
1082     uint32_t seen_assertions = 0;
1083     STATIC_ASSERT(static_cast<int>(RegExpAssertion::Type::LAST_ASSERTION_TYPE) <
1084                   kUInt32Size * kBitsPerByte);
1085 
1086     for (int i = from; i < to; i++) {
1087       RegExpAssertion* t = terms_->at(i)->AsAssertion();
1088       const uint32_t bit = 1 << static_cast<int>(t->assertion_type());
1089 
1090       if (seen_assertions & bit) {
1091         // Fold duplicates.
1092         terms_->Set(i, zone_->New<RegExpEmpty>());
1093       }
1094 
1095       seen_assertions |= bit;
1096     }
1097 
1098     // Collapse failures.
1099     const uint32_t always_fails_mask =
1100         1 << static_cast<int>(RegExpAssertion::Type::BOUNDARY) |
1101         1 << static_cast<int>(RegExpAssertion::Type::NON_BOUNDARY);
1102     if ((seen_assertions & always_fails_mask) == always_fails_mask) {
1103       ReplaceSequenceWithFailure(from, to);
1104     }
1105   }
1106 
1107   void ReplaceSequenceWithFailure(int from, int to) {
1108     // Replace the entire sequence with a single node that always fails.
1109     // TODO(jgruber): Consider adding an explicit Fail kind. Until then, the
1110     // negated '*' (everything) range serves the purpose.
1111     ZoneList<CharacterRange>* ranges =
1112         zone_->New<ZoneList<CharacterRange>>(0, zone_);
1113     RegExpCharacterClass* cc = zone_->New<RegExpCharacterClass>(zone_, ranges);
1114     terms_->Set(from, cc);
1115 
1116     // Zero out the rest.
1117     RegExpEmpty* empty = zone_->New<RegExpEmpty>();
1118     for (int i = from + 1; i < to; i++) terms_->Set(i, empty);
1119   }
1120 
1121  private:
1122   AssertionSequenceRewriter(ZoneList<RegExpTree*>* terms, Zone* zone)
1123       : zone_(zone), terms_(terms) {}
1124 
1125   Zone* zone_;
1126   ZoneList<RegExpTree*>* terms_;
1127 };
1128 
1129 }  // namespace
1130 
1131 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
1132                                       RegExpNode* on_success) {
1133   compiler->ToNodeMaybeCheckForStackOverflow();
1134 
1135   ZoneList<RegExpTree*>* children = nodes();
1136 
1137   AssertionSequenceRewriter::MaybeRewrite(children, compiler->zone());
1138 
1139   RegExpNode* current = on_success;
1140   if (compiler->read_backward()) {
1141     for (int i = 0; i < children->length(); i++) {
1142       current = children->at(i)->ToNode(compiler, current);
1143     }
1144   } else {
1145     for (int i = children->length() - 1; i >= 0; i--) {
1146       current = children->at(i)->ToNode(compiler, current);
1147     }
1148   }
1149   return current;
1150 }
1151 
1152 namespace {
1153 
1154 void AddClass(const int* elmv, int elmc, ZoneList<CharacterRange>* ranges,
1155               Zone* zone) {
1156   elmc--;
1157   DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
1158   for (int i = 0; i < elmc; i += 2) {
1159     DCHECK(elmv[i] < elmv[i + 1]);
1160     ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
1161   }
1162 }
1163 
1164 void AddClassNegated(const int* elmv, int elmc,
1165                      ZoneList<CharacterRange>* ranges, Zone* zone) {
1166   elmc--;
1167   DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
1168   DCHECK_NE(0x0000, elmv[0]);
1169   DCHECK_NE(kMaxCodePoint, elmv[elmc - 1]);
1170   base::uc16 last = 0x0000;
1171   for (int i = 0; i < elmc; i += 2) {
1172     DCHECK(last <= elmv[i] - 1);
1173     DCHECK(elmv[i] < elmv[i + 1]);
1174     ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
1175     last = elmv[i + 1];
1176   }
1177   ranges->Add(CharacterRange::Range(last, kMaxCodePoint), zone);
1178 }
1179 
1180 }  // namespace
1181 
1182 void CharacterRange::AddClassEscape(StandardCharacterSet standard_character_set,
1183                                     ZoneList<CharacterRange>* ranges,
1184                                     bool add_unicode_case_equivalents,
1185                                     Zone* zone) {
1186   if (add_unicode_case_equivalents &&
1187       (standard_character_set == StandardCharacterSet::kWord ||
1188        standard_character_set == StandardCharacterSet::kNotWord)) {
1189     // See #sec-runtime-semantics-wordcharacters-abstract-operation
1190     // In case of unicode and ignore_case, we need to create the closure over
1191     // case equivalent characters before negating.
1192     ZoneList<CharacterRange>* new_ranges =
1193         zone->New<ZoneList<CharacterRange>>(2, zone);
1194     AddClass(kWordRanges, kWordRangeCount, new_ranges, zone);
1195     AddUnicodeCaseEquivalents(new_ranges, zone);
1196     if (standard_character_set == StandardCharacterSet::kNotWord) {
1197       ZoneList<CharacterRange>* negated =
1198           zone->New<ZoneList<CharacterRange>>(2, zone);
1199       CharacterRange::Negate(new_ranges, negated, zone);
1200       new_ranges = negated;
1201     }
1202     ranges->AddAll(*new_ranges, zone);
1203     return;
1204   }
1205 
1206   switch (standard_character_set) {
1207     case StandardCharacterSet::kWhitespace:
1208       AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
1209       break;
1210     case StandardCharacterSet::kNotWhitespace:
1211       AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
1212       break;
1213     case StandardCharacterSet::kWord:
1214       AddClass(kWordRanges, kWordRangeCount, ranges, zone);
1215       break;
1216     case StandardCharacterSet::kNotWord:
1217       AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
1218       break;
1219     case StandardCharacterSet::kDigit:
1220       AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
1221       break;
1222     case StandardCharacterSet::kNotDigit:
1223       AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
1224       break;
1225     // This is the set of characters matched by the $ and ^ symbols
1226     // in multiline mode.
1227     case StandardCharacterSet::kLineTerminator:
1228       AddClass(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges, zone);
1229       break;
1230     case StandardCharacterSet::kNotLineTerminator:
1231       AddClassNegated(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges,
1232                       zone);
1233       break;
1234     // This is not a character range as defined by the spec but a
1235     // convenient shorthand for a character class that matches any
1236     // character.
1237     case StandardCharacterSet::kEverything:
1238       ranges->Add(CharacterRange::Everything(), zone);
1239       break;
1240   }
1241 }
1242 
1243 // static
1244 void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
1245                                         ZoneList<CharacterRange>* ranges,
1246                                         bool is_one_byte) {
1247   CharacterRange::Canonicalize(ranges);
1248   int range_count = ranges->length();
1249 #ifdef V8_INTL_SUPPORT
1250   icu::UnicodeSet others;
1251   for (int i = 0; i < range_count; i++) {
1252     CharacterRange range = ranges->at(i);
1253     base::uc32 from = range.from();
1254     if (from > kMaxUtf16CodeUnit) continue;
1255     base::uc32 to = std::min({range.to(), kMaxUtf16CodeUnitU});
1256     // Nothing to be done for surrogates.
1257     if (from >= kLeadSurrogateStart && to <= kTrailSurrogateEnd) continue;
1258     if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
1259       if (from > kMaxOneByteCharCode) continue;
1260       if (to > kMaxOneByteCharCode) to = kMaxOneByteCharCode;
1261     }
1262     others.add(from, to);
1263   }
1264 
1265   // Compute the set of additional characters that should be added,
1266   // using UnicodeSet::closeOver. ECMA 262 defines slightly different
1267   // case-folding rules than Unicode, so some characters that are
1268   // added by closeOver do not match anything other than themselves in
1269   // JS. For example, 'ſ' (U+017F LATIN SMALL LETTER LONG S) is the
1270   // same case-insensitive character as 's' or 'S' according to
1271   // Unicode, but does not match any other character in JS. To handle
1272   // this case, we add such characters to the IgnoreSet and filter
1273   // them out. We filter twice: once before calling closeOver (to
1274   // prevent 'ſ' from adding 's'), and once after calling closeOver
1275   // (to prevent 's' from adding 'ſ'). See regexp/special-case.h for
1276   // more information.
1277   icu::UnicodeSet already_added(others);
1278   others.removeAll(RegExpCaseFolding::IgnoreSet());
1279   others.closeOver(USET_CASE_INSENSITIVE);
1280   others.removeAll(RegExpCaseFolding::IgnoreSet());
1281   others.removeAll(already_added);
1282 
1283   // Add others to the ranges
1284   for (int32_t i = 0; i < others.getRangeCount(); i++) {
1285     UChar32 from = others.getRangeStart(i);
1286     UChar32 to = others.getRangeEnd(i);
1287     if (from == to) {
1288       ranges->Add(CharacterRange::Singleton(from), zone);
1289     } else {
1290       ranges->Add(CharacterRange::Range(from, to), zone);
1291     }
1292   }
1293 #else
1294   for (int i = 0; i < range_count; i++) {
1295     CharacterRange range = ranges->at(i);
1296     base::uc32 bottom = range.from();
1297     if (bottom > kMaxUtf16CodeUnit) continue;
1298     base::uc32 top = std::min({range.to(), kMaxUtf16CodeUnitU});
1299     // Nothing to be done for surrogates.
1300     if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue;
1301     if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
1302       if (bottom > kMaxOneByteCharCode) continue;
1303       if (top > kMaxOneByteCharCode) top = kMaxOneByteCharCode;
1304     }
1305     unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1306     if (top == bottom) {
1307       // If this is a singleton we just expand the one character.
1308       int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
1309       for (int i = 0; i < length; i++) {
1310         base::uc32 chr = chars[i];
1311         if (chr != bottom) {
1312           ranges->Add(CharacterRange::Singleton(chars[i]), zone);
1313         }
1314       }
1315     } else {
1316       // If this is a range we expand the characters block by block, expanding
1317       // contiguous subranges (blocks) one at a time.  The approach is as
1318       // follows.  For a given start character we look up the remainder of the
1319       // block that contains it (represented by the end point), for instance we
1320       // find 'z' if the character is 'c'.  A block is characterized by the
1321       // property that all characters uncanonicalize in the same way, except
1322       // that each entry in the result is incremented by the distance from the
1323       // first element.  So a-z is a block because 'a' uncanonicalizes to ['a',
1324       // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k].  Once
1325       // we've found the end point we look up its uncanonicalization and
1326       // produce a range for each element.  For instance for [c-f] we look up
1327       // ['z', 'Z'] and produce [c-f] and [C-F].  We then only add a range if
1328       // it is not already contained in the input, so [c-f] will be skipped but
1329       // [C-F] will be added.  If this range is not completely contained in a
1330       // block we do this for all the blocks covered by the range (handling
1331       // characters that is not in a block as a "singleton block").
1332       unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1333       base::uc32 pos = bottom;
1334       while (pos <= top) {
1335         int length =
1336             isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
1337         base::uc32 block_end;
1338         if (length == 0) {
1339           block_end = pos;
1340         } else {
1341           DCHECK_EQ(1, length);
1342           block_end = equivalents[0];
1343         }
1344         int end = (block_end > top) ? top : block_end;
1345         length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
1346                                                          equivalents);
1347         for (int i = 0; i < length; i++) {
1348           base::uc32 c = equivalents[i];
1349           base::uc32 range_from = c - (block_end - pos);
1350           base::uc32 range_to = c - (block_end - end);
1351           if (!(bottom <= range_from && range_to <= top)) {
1352             ranges->Add(CharacterRange::Range(range_from, range_to), zone);
1353           }
1354         }
1355         pos = end + 1;
1356       }
1357     }
1358   }
1359 #endif  // V8_INTL_SUPPORT
1360 }
1361 
1362 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
1363   DCHECK_NOT_NULL(ranges);
1364   int n = ranges->length();
1365   if (n <= 1) return true;
1366   base::uc32 max = ranges->at(0).to();
1367   for (int i = 1; i < n; i++) {
1368     CharacterRange next_range = ranges->at(i);
1369     if (next_range.from() <= max + 1) return false;
1370     max = next_range.to();
1371   }
1372   return true;
1373 }
1374 
1375 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
1376   if (ranges_ == nullptr) {
1377     ranges_ = zone->New<ZoneList<CharacterRange>>(2, zone);
1378     CharacterRange::AddClassEscape(standard_set_type_.value(), ranges_, false,
1379                                    zone);
1380   }
1381   return ranges_;
1382 }
1383 
1384 namespace {
1385 
1386 // Move a number of elements in a zonelist to another position
1387 // in the same list. Handles overlapping source and target areas.
1388 void MoveRanges(ZoneList<CharacterRange>* list, int from, int to, int count) {
1389   // Ranges are potentially overlapping.
1390   if (from < to) {
1391     for (int i = count - 1; i >= 0; i--) {
1392       list->at(to + i) = list->at(from + i);
1393     }
1394   } else {
1395     for (int i = 0; i < count; i++) {
1396       list->at(to + i) = list->at(from + i);
1397     }
1398   }
1399 }
1400 
1401 int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, int count,
1402                                CharacterRange insert) {
1403   // Inserts a range into list[0..count[, which must be sorted
1404   // by from value and non-overlapping and non-adjacent, using at most
1405   // list[0..count] for the result. Returns the number of resulting
1406   // canonicalized ranges. Inserting a range may collapse existing ranges into
1407   // fewer ranges, so the return value can be anything in the range 1..count+1.
1408   base::uc32 from = insert.from();
1409   base::uc32 to = insert.to();
1410   int start_pos = 0;
1411   int end_pos = count;
1412   for (int i = count - 1; i >= 0; i--) {
1413     CharacterRange current = list->at(i);
1414     if (current.from() > to + 1) {
1415       end_pos = i;
1416     } else if (current.to() + 1 < from) {
1417       start_pos = i + 1;
1418       break;
1419     }
1420   }
1421 
1422   // Inserted range overlaps, or is adjacent to, ranges at positions
1423   // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
1424   // not affected by the insertion.
1425   // If start_pos == end_pos, the range must be inserted before start_pos.
1426   // if start_pos < end_pos, the entire range from start_pos to end_pos
1427   // must be merged with the insert range.
1428 
1429   if (start_pos == end_pos) {
1430     // Insert between existing ranges at position start_pos.
1431     if (start_pos < count) {
1432       MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
1433     }
1434     list->at(start_pos) = insert;
1435     return count + 1;
1436   }
1437   if (start_pos + 1 == end_pos) {
1438     // Replace single existing range at position start_pos.
1439     CharacterRange to_replace = list->at(start_pos);
1440     int new_from = std::min(to_replace.from(), from);
1441     int new_to = std::max(to_replace.to(), to);
1442     list->at(start_pos) = CharacterRange::Range(new_from, new_to);
1443     return count;
1444   }
1445   // Replace a number of existing ranges from start_pos to end_pos - 1.
1446   // Move the remaining ranges down.
1447 
1448   int new_from = std::min(list->at(start_pos).from(), from);
1449   int new_to = std::max(list->at(end_pos - 1).to(), to);
1450   if (end_pos < count) {
1451     MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
1452   }
1453   list->at(start_pos) = CharacterRange::Range(new_from, new_to);
1454   return count - (end_pos - start_pos) + 1;
1455 }
1456 
1457 }  // namespace
1458 
1459 void CharacterSet::Canonicalize() {
1460   // Special/default classes are always considered canonical. The result
1461   // of calling ranges() will be sorted.
1462   if (ranges_ == nullptr) return;
1463   CharacterRange::Canonicalize(ranges_);
1464 }
1465 
1466 // static
1467 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
1468   if (character_ranges->length() <= 1) return;
1469   // Check whether ranges are already canonical (increasing, non-overlapping,
1470   // non-adjacent).
1471   int n = character_ranges->length();
1472   base::uc32 max = character_ranges->at(0).to();
1473   int i = 1;
1474   while (i < n) {
1475     CharacterRange current = character_ranges->at(i);
1476     if (current.from() <= max + 1) {
1477       break;
1478     }
1479     max = current.to();
1480     i++;
1481   }
1482   // Canonical until the i'th range. If that's all of them, we are done.
1483   if (i == n) return;
1484 
1485   // The ranges at index i and forward are not canonicalized. Make them so by
1486   // doing the equivalent of insertion sort (inserting each into the previous
1487   // list, in order).
1488   // Notice that inserting a range can reduce the number of ranges in the
1489   // result due to combining of adjacent and overlapping ranges.
1490   int read = i;           // Range to insert.
1491   int num_canonical = i;  // Length of canonicalized part of list.
1492   do {
1493     num_canonical = InsertRangeInCanonicalList(character_ranges, num_canonical,
1494                                                character_ranges->at(read));
1495     read++;
1496   } while (read < n);
1497   character_ranges->Rewind(num_canonical);
1498 
1499   DCHECK(CharacterRange::IsCanonical(character_ranges));
1500 }
1501 
1502 // static
1503 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
1504                             ZoneList<CharacterRange>* negated_ranges,
1505                             Zone* zone) {
1506   DCHECK(CharacterRange::IsCanonical(ranges));
1507   DCHECK_EQ(0, negated_ranges->length());
1508   int range_count = ranges->length();
1509   base::uc32 from = 0;
1510   int i = 0;
1511   if (range_count > 0 && ranges->at(0).from() == 0) {
1512     from = ranges->at(0).to() + 1;
1513     i = 1;
1514   }
1515   while (i < range_count) {
1516     CharacterRange range = ranges->at(i);
1517     negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
1518     from = range.to() + 1;
1519     i++;
1520   }
1521   if (from < kMaxCodePoint) {
1522     negated_ranges->Add(CharacterRange::Range(from, kMaxCodePoint), zone);
1523   }
1524 }
1525 
1526 // static
1527 void CharacterRange::ClampToOneByte(ZoneList<CharacterRange>* ranges) {
1528   DCHECK(IsCanonical(ranges));
1529 
1530   // Drop all ranges that don't contain one-byte code units, and clamp the last
1531   // range s.t. it likewise only contains one-byte code units. Note this relies
1532   // on `ranges` being canonicalized, i.e. sorted and non-overlapping.
1533 
1534   static constexpr base::uc32 max_char = String::kMaxOneByteCharCodeU;
1535   int n = ranges->length();
1536   for (; n > 0; n--) {
1537     CharacterRange& r = ranges->at(n - 1);
1538     if (r.from() <= max_char) {
1539       r.to_ = std::min(r.to_, max_char);
1540       break;
1541     }
1542   }
1543 
1544   ranges->Rewind(n);
1545 }
1546 
1547 namespace {
1548 
1549 // Scoped object to keep track of how much we unroll quantifier loops in the
1550 // regexp graph generator.
1551 class RegExpExpansionLimiter {
1552  public:
1553   static const int kMaxExpansionFactor = 6;
1554   RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
1555       : compiler_(compiler),
1556         saved_expansion_factor_(compiler->current_expansion_factor()),
1557         ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
1558     DCHECK_LT(0, factor);
1559     if (ok_to_expand_) {
1560       if (factor > kMaxExpansionFactor) {
1561         // Avoid integer overflow of the current expansion factor.
1562         ok_to_expand_ = false;
1563         compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
1564       } else {
1565         int new_factor = saved_expansion_factor_ * factor;
1566         ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
1567         compiler->set_current_expansion_factor(new_factor);
1568       }
1569     }
1570   }
1571 
1572   ~RegExpExpansionLimiter() {
1573     compiler_->set_current_expansion_factor(saved_expansion_factor_);
1574   }
1575 
1576   bool ok_to_expand() { return ok_to_expand_; }
1577 
1578  private:
1579   RegExpCompiler* compiler_;
1580   int saved_expansion_factor_;
1581   bool ok_to_expand_;
1582 
1583   DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
1584 };
1585 
1586 }  // namespace
1587 
1588 RegExpNode* RegExpQuantifier::ToNode(int min, int max, bool is_greedy,
1589                                      RegExpTree* body, RegExpCompiler* compiler,
1590                                      RegExpNode* on_success,
1591                                      bool not_at_start) {
1592   // x{f, t} becomes this:
1593   //
1594   //             (r++)<-.
1595   //               |     `
1596   //               |     (x)
1597   //               v     ^
1598   //      (r=0)-->(?)---/ [if r < t]
1599   //               |
1600   //   [if r >= f] \----> ...
1601   //
1602 
1603   // 15.10.2.5 RepeatMatcher algorithm.
1604   // The parser has already eliminated the case where max is 0.  In the case
1605   // where max_match is zero the parser has removed the quantifier if min was
1606   // > 0 and removed the atom if min was 0.  See AddQuantifierToAtom.
1607 
1608   // If we know that we cannot match zero length then things are a little
1609   // simpler since we don't need to make the special zero length match check
1610   // from step 2.1.  If the min and max are small we can unroll a little in
1611   // this case.
1612   static const int kMaxUnrolledMinMatches = 3;  // Unroll (foo)+ and (foo){3,}
1613   static const int kMaxUnrolledMaxMatches = 3;  // Unroll (foo)? and (foo){x,3}
1614   if (max == 0) return on_success;  // This can happen due to recursion.
1615   bool body_can_be_empty = (body->min_match() == 0);
1616   int body_start_reg = RegExpCompiler::kNoRegister;
1617   Interval capture_registers = body->CaptureRegisters();
1618   bool needs_capture_clearing = !capture_registers.is_empty();
1619   Zone* zone = compiler->zone();
1620 
1621   if (body_can_be_empty) {
1622     body_start_reg = compiler->AllocateRegister();
1623   } else if (compiler->optimize() && !needs_capture_clearing) {
1624     // Only unroll if there are no captures and the body can't be
1625     // empty.
1626     {
1627       RegExpExpansionLimiter limiter(compiler, min + ((max != min) ? 1 : 0));
1628       if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
1629         int new_max = (max == kInfinity) ? max : max - min;
1630         // Recurse once to get the loop or optional matches after the fixed
1631         // ones.
1632         RegExpNode* answer =
1633             ToNode(0, new_max, is_greedy, body, compiler, on_success, true);
1634         // Unroll the forced matches from 0 to min.  This can cause chains of
1635         // TextNodes (which the parser does not generate).  These should be
1636         // combined if it turns out they hinder good code generation.
1637         for (int i = 0; i < min; i++) {
1638           answer = body->ToNode(compiler, answer);
1639         }
1640         return answer;
1641       }
1642     }
1643     if (max <= kMaxUnrolledMaxMatches && min == 0) {
1644       DCHECK_LT(0, max);  // Due to the 'if' above.
1645       RegExpExpansionLimiter limiter(compiler, max);
1646       if (limiter.ok_to_expand()) {
1647         // Unroll the optional matches up to max.
1648         RegExpNode* answer = on_success;
1649         for (int i = 0; i < max; i++) {
1650           ChoiceNode* alternation = zone->New<ChoiceNode>(2, zone);
1651           if (is_greedy) {
1652             alternation->AddAlternative(
1653                 GuardedAlternative(body->ToNode(compiler, answer)));
1654             alternation->AddAlternative(GuardedAlternative(on_success));
1655           } else {
1656             alternation->AddAlternative(GuardedAlternative(on_success));
1657             alternation->AddAlternative(
1658                 GuardedAlternative(body->ToNode(compiler, answer)));
1659           }
1660           answer = alternation;
1661           if (not_at_start && !compiler->read_backward()) {
1662             alternation->set_not_at_start();
1663           }
1664         }
1665         return answer;
1666       }
1667     }
1668   }
1669   bool has_min = min > 0;
1670   bool has_max = max < RegExpTree::kInfinity;
1671   bool needs_counter = has_min || has_max;
1672   int reg_ctr = needs_counter ? compiler->AllocateRegister()
1673                               : RegExpCompiler::kNoRegister;
1674   LoopChoiceNode* center = zone->New<LoopChoiceNode>(
1675       body->min_match() == 0, compiler->read_backward(), min, zone);
1676   if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
1677   RegExpNode* loop_return =
1678       needs_counter ? static_cast<RegExpNode*>(
1679                           ActionNode::IncrementRegister(reg_ctr, center))
1680                     : static_cast<RegExpNode*>(center);
1681   if (body_can_be_empty) {
1682     // If the body can be empty we need to check if it was and then
1683     // backtrack.
1684     loop_return =
1685         ActionNode::EmptyMatchCheck(body_start_reg, reg_ctr, min, loop_return);
1686   }
1687   RegExpNode* body_node = body->ToNode(compiler, loop_return);
1688   if (body_can_be_empty) {
1689     // If the body can be empty we need to store the start position
1690     // so we can bail out if it was empty.
1691     body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
1692   }
1693   if (needs_capture_clearing) {
1694     // Before entering the body of this loop we need to clear captures.
1695     body_node = ActionNode::ClearCaptures(capture_registers, body_node);
1696   }
1697   GuardedAlternative body_alt(body_node);
1698   if (has_max) {
1699     Guard* body_guard = zone->New<Guard>(reg_ctr, Guard::LT, max);
1700     body_alt.AddGuard(body_guard, zone);
1701   }
1702   GuardedAlternative rest_alt(on_success);
1703   if (has_min) {
1704     Guard* rest_guard = compiler->zone()->New<Guard>(reg_ctr, Guard::GEQ, min);
1705     rest_alt.AddGuard(rest_guard, zone);
1706   }
1707   if (is_greedy) {
1708     center->AddLoopAlternative(body_alt);
1709     center->AddContinueAlternative(rest_alt);
1710   } else {
1711     center->AddContinueAlternative(rest_alt);
1712     center->AddLoopAlternative(body_alt);
1713   }
1714   if (needs_counter) {
1715     return ActionNode::SetRegisterForLoop(reg_ctr, 0, center);
1716   } else {
1717     return center;
1718   }
1719 }
1720 
1721 }  // namespace internal
1722 }  // namespace v8
1723