• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/regexp/regexp-compiler.h"
6 
7 #include "src/execution/isolate.h"
8 #include "src/regexp/regexp.h"
9 #ifdef V8_INTL_SUPPORT
10 #include "src/regexp/special-case.h"
11 #endif  // V8_INTL_SUPPORT
12 #include "src/strings/unicode-inl.h"
13 #include "src/zone/zone-list-inl.h"
14 
15 #ifdef V8_INTL_SUPPORT
16 #include "unicode/locid.h"
17 #include "unicode/uniset.h"
18 #include "unicode/utypes.h"
19 #endif  // V8_INTL_SUPPORT
20 
21 namespace v8 {
22 namespace internal {
23 
24 using namespace regexp_compiler_constants;  // NOLINT(build/namespaces)
25 
26 // -------------------------------------------------------------------
27 // Tree to graph conversion
28 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)29 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
30                                RegExpNode* on_success) {
31   ZoneList<TextElement>* elms =
32       compiler->zone()->New<ZoneList<TextElement>>(1, compiler->zone());
33   elms->Add(TextElement::Atom(this), compiler->zone());
34   return compiler->zone()->New<TextNode>(elms, compiler->read_backward(),
35                                          on_success);
36 }
37 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)38 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
39                                RegExpNode* on_success) {
40   return compiler->zone()->New<TextNode>(elements(), compiler->read_backward(),
41                                          on_success);
42 }
43 
CompareInverseRanges(ZoneList<CharacterRange> * ranges,const int * special_class,int length)44 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
45                                  const int* special_class, int length) {
46   length--;  // Remove final marker.
47   DCHECK_EQ(kRangeEndMarker, special_class[length]);
48   DCHECK_NE(0, ranges->length());
49   DCHECK_NE(0, length);
50   DCHECK_NE(0, special_class[0]);
51   if (ranges->length() != (length >> 1) + 1) {
52     return false;
53   }
54   CharacterRange range = ranges->at(0);
55   if (range.from() != 0) {
56     return false;
57   }
58   for (int i = 0; i < length; i += 2) {
59     if (static_cast<uc32>(special_class[i]) != (range.to() + 1)) {
60       return false;
61     }
62     range = ranges->at((i >> 1) + 1);
63     if (static_cast<uc32>(special_class[i + 1]) != range.from()) {
64       return false;
65     }
66   }
67   if (range.to() != String::kMaxCodePoint) {
68     return false;
69   }
70   return true;
71 }
72 
CompareRanges(ZoneList<CharacterRange> * ranges,const int * special_class,int length)73 static bool CompareRanges(ZoneList<CharacterRange>* ranges,
74                           const int* special_class, int length) {
75   length--;  // Remove final marker.
76   DCHECK_EQ(kRangeEndMarker, special_class[length]);
77   if (ranges->length() * 2 != length) {
78     return false;
79   }
80   for (int i = 0; i < length; i += 2) {
81     CharacterRange range = ranges->at(i >> 1);
82     if (range.from() != static_cast<uc32>(special_class[i]) ||
83         range.to() != static_cast<uc32>(special_class[i + 1] - 1)) {
84       return false;
85     }
86   }
87   return true;
88 }
89 
is_standard(Zone * zone)90 bool RegExpCharacterClass::is_standard(Zone* zone) {
91   // TODO(lrn): Remove need for this function, by not throwing away information
92   // along the way.
93   if (is_negated()) {
94     return false;
95   }
96   if (set_.is_standard()) {
97     return true;
98   }
99   if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
100     set_.set_standard_set_type('s');
101     return true;
102   }
103   if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
104     set_.set_standard_set_type('S');
105     return true;
106   }
107   if (CompareInverseRanges(set_.ranges(zone), kLineTerminatorRanges,
108                            kLineTerminatorRangeCount)) {
109     set_.set_standard_set_type('.');
110     return true;
111   }
112   if (CompareRanges(set_.ranges(zone), kLineTerminatorRanges,
113                     kLineTerminatorRangeCount)) {
114     set_.set_standard_set_type('n');
115     return true;
116   }
117   if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
118     set_.set_standard_set_type('w');
119     return true;
120   }
121   if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
122     set_.set_standard_set_type('W');
123     return true;
124   }
125   return false;
126 }
127 
UnicodeRangeSplitter(ZoneList<CharacterRange> * base)128 UnicodeRangeSplitter::UnicodeRangeSplitter(ZoneList<CharacterRange>* base) {
129   // The unicode range splitter categorizes given character ranges into:
130   // - Code points from the BMP representable by one code unit.
131   // - Code points outside the BMP that need to be split into surrogate pairs.
132   // - Lone lead surrogates.
133   // - Lone trail surrogates.
134   // Lone surrogates are valid code points, even though no actual characters.
135   // They require special matching to make sure we do not split surrogate pairs.
136 
137   for (int i = 0; i < base->length(); i++) AddRange(base->at(i));
138 }
139 
AddRange(CharacterRange range)140 void UnicodeRangeSplitter::AddRange(CharacterRange range) {
141   static constexpr uc32 kBmp1Start = 0;
142   static constexpr uc32 kBmp1End = kLeadSurrogateStart - 1;
143   static constexpr uc32 kBmp2Start = kTrailSurrogateEnd + 1;
144   static constexpr uc32 kBmp2End = kNonBmpStart - 1;
145 
146   // Ends are all inclusive.
147   STATIC_ASSERT(kBmp1Start == 0);
148   STATIC_ASSERT(kBmp1Start < kBmp1End);
149   STATIC_ASSERT(kBmp1End + 1 == kLeadSurrogateStart);
150   STATIC_ASSERT(kLeadSurrogateStart < kLeadSurrogateEnd);
151   STATIC_ASSERT(kLeadSurrogateEnd + 1 == kTrailSurrogateStart);
152   STATIC_ASSERT(kTrailSurrogateStart < kTrailSurrogateEnd);
153   STATIC_ASSERT(kTrailSurrogateEnd + 1 == kBmp2Start);
154   STATIC_ASSERT(kBmp2Start < kBmp2End);
155   STATIC_ASSERT(kBmp2End + 1 == kNonBmpStart);
156   STATIC_ASSERT(kNonBmpStart < kNonBmpEnd);
157 
158   static constexpr uc32 kStarts[] = {
159       kBmp1Start, kLeadSurrogateStart, kTrailSurrogateStart,
160       kBmp2Start, kNonBmpStart,
161   };
162 
163   static constexpr uc32 kEnds[] = {
164       kBmp1End, kLeadSurrogateEnd, kTrailSurrogateEnd, kBmp2End, kNonBmpEnd,
165   };
166 
167   CharacterRangeVector* const kTargets[] = {
168       &bmp_, &lead_surrogates_, &trail_surrogates_, &bmp_, &non_bmp_,
169   };
170 
171   static constexpr int kCount = arraysize(kStarts);
172   STATIC_ASSERT(kCount == arraysize(kEnds));
173   STATIC_ASSERT(kCount == arraysize(kTargets));
174 
175   for (int i = 0; i < kCount; i++) {
176     if (kStarts[i] > range.to()) break;
177     const uc32 from = std::max(kStarts[i], range.from());
178     const uc32 to = std::min(kEnds[i], range.to());
179     if (from > to) continue;
180     kTargets[i]->emplace_back(CharacterRange::Range(from, to));
181   }
182 }
183 
184 namespace {
185 
186 // Translates between new and old V8-isms (SmallVector, ZoneList).
ToCanonicalZoneList(const UnicodeRangeSplitter::CharacterRangeVector * v,Zone * zone)187 ZoneList<CharacterRange>* ToCanonicalZoneList(
188     const UnicodeRangeSplitter::CharacterRangeVector* v, Zone* zone) {
189   if (v->empty()) return nullptr;
190 
191   ZoneList<CharacterRange>* result =
192       zone->New<ZoneList<CharacterRange>>(static_cast<int>(v->size()), zone);
193   for (size_t i = 0; i < v->size(); i++) {
194     result->Add(v->at(i), zone);
195   }
196 
197   CharacterRange::Canonicalize(result);
198   return result;
199 }
200 
AddBmpCharacters(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter,JSRegExp::Flags flags)201 void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
202                       RegExpNode* on_success, UnicodeRangeSplitter* splitter,
203                       JSRegExp::Flags flags) {
204   ZoneList<CharacterRange>* bmp =
205       ToCanonicalZoneList(splitter->bmp(), compiler->zone());
206   if (bmp == nullptr) return;
207   result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
208       compiler->zone(), bmp, compiler->read_backward(), on_success, flags)));
209 }
210 
AddNonBmpSurrogatePairs(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter,JSRegExp::Flags flags)211 void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
212                              RegExpNode* on_success,
213                              UnicodeRangeSplitter* splitter,
214                              JSRegExp::Flags flags) {
215   ZoneList<CharacterRange>* non_bmp =
216       ToCanonicalZoneList(splitter->non_bmp(), compiler->zone());
217   if (non_bmp == nullptr) return;
218   DCHECK(!compiler->one_byte());
219   Zone* zone = compiler->zone();
220   CharacterRange::Canonicalize(non_bmp);
221   for (int i = 0; i < non_bmp->length(); i++) {
222     // Match surrogate pair.
223     // E.g. [\u10005-\u11005] becomes
224     //      \ud800[\udc05-\udfff]|
225     //      [\ud801-\ud803][\udc00-\udfff]|
226     //      \ud804[\udc00-\udc05]
227     uc32 from = non_bmp->at(i).from();
228     uc32 to = non_bmp->at(i).to();
229     uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
230     uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
231     uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
232     uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
233     if (from_l == to_l) {
234       // The lead surrogate is the same.
235       result->AddAlternative(
236           GuardedAlternative(TextNode::CreateForSurrogatePair(
237               zone, CharacterRange::Singleton(from_l),
238               CharacterRange::Range(from_t, to_t), compiler->read_backward(),
239               on_success, flags)));
240     } else {
241       if (from_t != kTrailSurrogateStart) {
242         // Add [from_l][from_t-\udfff]
243         result->AddAlternative(
244             GuardedAlternative(TextNode::CreateForSurrogatePair(
245                 zone, CharacterRange::Singleton(from_l),
246                 CharacterRange::Range(from_t, kTrailSurrogateEnd),
247                 compiler->read_backward(), on_success, flags)));
248         from_l++;
249       }
250       if (to_t != kTrailSurrogateEnd) {
251         // Add [to_l][\udc00-to_t]
252         result->AddAlternative(
253             GuardedAlternative(TextNode::CreateForSurrogatePair(
254                 zone, CharacterRange::Singleton(to_l),
255                 CharacterRange::Range(kTrailSurrogateStart, to_t),
256                 compiler->read_backward(), on_success, flags)));
257         to_l--;
258       }
259       if (from_l <= to_l) {
260         // Add [from_l-to_l][\udc00-\udfff]
261         result->AddAlternative(
262             GuardedAlternative(TextNode::CreateForSurrogatePair(
263                 zone, CharacterRange::Range(from_l, to_l),
264                 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
265                 compiler->read_backward(), on_success, flags)));
266       }
267     }
268   }
269 }
270 
NegativeLookaroundAgainstReadDirectionAndMatch(RegExpCompiler * compiler,ZoneList<CharacterRange> * lookbehind,ZoneList<CharacterRange> * match,RegExpNode * on_success,bool read_backward,JSRegExp::Flags flags)271 RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
272     RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
273     ZoneList<CharacterRange>* match, RegExpNode* on_success, bool read_backward,
274     JSRegExp::Flags flags) {
275   Zone* zone = compiler->zone();
276   RegExpNode* match_node = TextNode::CreateForCharacterRanges(
277       zone, match, read_backward, on_success, flags);
278   int stack_register = compiler->UnicodeLookaroundStackRegister();
279   int position_register = compiler->UnicodeLookaroundPositionRegister();
280   RegExpLookaround::Builder lookaround(false, match_node, stack_register,
281                                        position_register);
282   RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
283       zone, lookbehind, !read_backward, lookaround.on_match_success(), flags);
284   return lookaround.ForMatch(negative_match);
285 }
286 
MatchAndNegativeLookaroundInReadDirection(RegExpCompiler * compiler,ZoneList<CharacterRange> * match,ZoneList<CharacterRange> * lookahead,RegExpNode * on_success,bool read_backward,JSRegExp::Flags flags)287 RegExpNode* MatchAndNegativeLookaroundInReadDirection(
288     RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
289     ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
290     bool read_backward, JSRegExp::Flags flags) {
291   Zone* zone = compiler->zone();
292   int stack_register = compiler->UnicodeLookaroundStackRegister();
293   int position_register = compiler->UnicodeLookaroundPositionRegister();
294   RegExpLookaround::Builder lookaround(false, on_success, stack_register,
295                                        position_register);
296   RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
297       zone, lookahead, read_backward, lookaround.on_match_success(), flags);
298   return TextNode::CreateForCharacterRanges(
299       zone, match, read_backward, lookaround.ForMatch(negative_match), flags);
300 }
301 
AddLoneLeadSurrogates(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter,JSRegExp::Flags flags)302 void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
303                            RegExpNode* on_success,
304                            UnicodeRangeSplitter* splitter,
305                            JSRegExp::Flags flags) {
306   ZoneList<CharacterRange>* lead_surrogates =
307       ToCanonicalZoneList(splitter->lead_surrogates(), compiler->zone());
308   if (lead_surrogates == nullptr) return;
309   Zone* zone = compiler->zone();
310   // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
311   ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
312       zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
313 
314   RegExpNode* match;
315   if (compiler->read_backward()) {
316     // Reading backward. Assert that reading forward, there is no trail
317     // surrogate, and then backward match the lead surrogate.
318     match = NegativeLookaroundAgainstReadDirectionAndMatch(
319         compiler, trail_surrogates, lead_surrogates, on_success, true, flags);
320   } else {
321     // Reading forward. Forward match the lead surrogate and assert that
322     // no trail surrogate follows.
323     match = MatchAndNegativeLookaroundInReadDirection(
324         compiler, lead_surrogates, trail_surrogates, on_success, false, flags);
325   }
326   result->AddAlternative(GuardedAlternative(match));
327 }
328 
AddLoneTrailSurrogates(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter,JSRegExp::Flags flags)329 void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
330                             RegExpNode* on_success,
331                             UnicodeRangeSplitter* splitter,
332                             JSRegExp::Flags flags) {
333   ZoneList<CharacterRange>* trail_surrogates =
334       ToCanonicalZoneList(splitter->trail_surrogates(), compiler->zone());
335   if (trail_surrogates == nullptr) return;
336   Zone* zone = compiler->zone();
337   // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
338   ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
339       zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
340 
341   RegExpNode* match;
342   if (compiler->read_backward()) {
343     // Reading backward. Backward match the trail surrogate and assert that no
344     // lead surrogate precedes it.
345     match = MatchAndNegativeLookaroundInReadDirection(
346         compiler, trail_surrogates, lead_surrogates, on_success, true, flags);
347   } else {
348     // Reading forward. Assert that reading backward, there is no lead
349     // surrogate, and then forward match the trail surrogate.
350     match = NegativeLookaroundAgainstReadDirectionAndMatch(
351         compiler, lead_surrogates, trail_surrogates, on_success, false, flags);
352   }
353   result->AddAlternative(GuardedAlternative(match));
354 }
355 
UnanchoredAdvance(RegExpCompiler * compiler,RegExpNode * on_success)356 RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
357                               RegExpNode* on_success) {
358   // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
359   DCHECK(!compiler->read_backward());
360   Zone* zone = compiler->zone();
361   // Advance any character. If the character happens to be a lead surrogate and
362   // we advanced into the middle of a surrogate pair, it will work out, as
363   // nothing will match from there. We will have to advance again, consuming
364   // the associated trail surrogate.
365   ZoneList<CharacterRange>* range = CharacterRange::List(
366       zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
367   JSRegExp::Flags default_flags = JSRegExp::Flags();
368   return TextNode::CreateForCharacterRanges(zone, range, false, on_success,
369                                             default_flags);
370 }
371 
AddUnicodeCaseEquivalents(ZoneList<CharacterRange> * ranges,Zone * zone)372 void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) {
373 #ifdef V8_INTL_SUPPORT
374   DCHECK(CharacterRange::IsCanonical(ranges));
375 
376   // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver.
377   // See also https://crbug.com/v8/6727.
378   // TODO(jgruber): This only covers the special case of the {0,0x10FFFF} range,
379   // which we use frequently internally. But large ranges can also easily be
380   // created by the user. We might want to have a more general caching mechanism
381   // for such ranges.
382   if (ranges->length() == 1 && ranges->at(0).IsEverything(kNonBmpEnd)) return;
383 
384   // Use ICU to compute the case fold closure over the ranges.
385   icu::UnicodeSet set;
386   for (int i = 0; i < ranges->length(); i++) {
387     set.add(ranges->at(i).from(), ranges->at(i).to());
388   }
389   // Clear the ranges list without freeing the backing store.
390   ranges->Rewind(0);
391   set.closeOver(USET_CASE_INSENSITIVE);
392   // Full case mapping map single characters to multiple characters.
393   // Those are represented as strings in the set. Remove them so that
394   // we end up with only simple and common case mappings.
395   set.removeAllStrings();
396   for (int i = 0; i < set.getRangeCount(); i++) {
397     ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
398                 zone);
399   }
400   // No errors and everything we collected have been ranges.
401   CharacterRange::Canonicalize(ranges);
402 #endif  // V8_INTL_SUPPORT
403 }
404 
405 }  // namespace
406 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)407 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
408                                          RegExpNode* on_success) {
409   set_.Canonicalize();
410   Zone* zone = compiler->zone();
411   ZoneList<CharacterRange>* ranges = this->ranges(zone);
412   if (NeedsUnicodeCaseEquivalents(flags_)) {
413     AddUnicodeCaseEquivalents(ranges, zone);
414   }
415   if (IsUnicode(flags_) && !compiler->one_byte() &&
416       !contains_split_surrogate()) {
417     if (is_negated()) {
418       ZoneList<CharacterRange>* negated =
419           zone->New<ZoneList<CharacterRange>>(2, zone);
420       CharacterRange::Negate(ranges, negated, zone);
421       ranges = negated;
422     }
423     if (ranges->length() == 0) {
424       JSRegExp::Flags default_flags;
425       RegExpCharacterClass* fail =
426           zone->New<RegExpCharacterClass>(zone, ranges, default_flags);
427       return zone->New<TextNode>(fail, compiler->read_backward(), on_success);
428     }
429     if (standard_type() == '*') {
430       return UnanchoredAdvance(compiler, on_success);
431     } else {
432       ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
433       UnicodeRangeSplitter splitter(ranges);
434       AddBmpCharacters(compiler, result, on_success, &splitter, flags_);
435       AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter, flags_);
436       AddLoneLeadSurrogates(compiler, result, on_success, &splitter, flags_);
437       AddLoneTrailSurrogates(compiler, result, on_success, &splitter, flags_);
438       static constexpr int kMaxRangesToInline = 32;  // Arbitrary.
439       if (ranges->length() > kMaxRangesToInline) result->SetDoNotInline();
440       return result;
441     }
442   } else {
443     return zone->New<TextNode>(this, compiler->read_backward(), on_success);
444   }
445 }
446 
CompareFirstChar(RegExpTree * const * a,RegExpTree * const * b)447 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
448   RegExpAtom* atom1 = (*a)->AsAtom();
449   RegExpAtom* atom2 = (*b)->AsAtom();
450   uc16 character1 = atom1->data().at(0);
451   uc16 character2 = atom2->data().at(0);
452   if (character1 < character2) return -1;
453   if (character1 > character2) return 1;
454   return 0;
455 }
456 
457 #ifdef V8_INTL_SUPPORT
458 
459 // Case Insensitve comparesion
CompareFirstCharCaseInsensitve(RegExpTree * const * a,RegExpTree * const * b)460 int CompareFirstCharCaseInsensitve(RegExpTree* const* a, RegExpTree* const* b) {
461   RegExpAtom* atom1 = (*a)->AsAtom();
462   RegExpAtom* atom2 = (*b)->AsAtom();
463   icu::UnicodeString character1(atom1->data().at(0));
464   return character1.caseCompare(atom2->data().at(0), U_FOLD_CASE_DEFAULT);
465 }
466 
467 #else
468 
Canonical(unibrow::Mapping<unibrow::Ecma262Canonicalize> * canonicalize,unibrow::uchar c)469 static unibrow::uchar Canonical(
470     unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
471     unibrow::uchar c) {
472   unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
473   int length = canonicalize->get(c, '\0', chars);
474   DCHECK_LE(length, 1);
475   unibrow::uchar canonical = c;
476   if (length == 1) canonical = chars[0];
477   return canonical;
478 }
479 
CompareFirstCharCaseIndependent(unibrow::Mapping<unibrow::Ecma262Canonicalize> * canonicalize,RegExpTree * const * a,RegExpTree * const * b)480 int CompareFirstCharCaseIndependent(
481     unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
482     RegExpTree* const* a, RegExpTree* const* b) {
483   RegExpAtom* atom1 = (*a)->AsAtom();
484   RegExpAtom* atom2 = (*b)->AsAtom();
485   unibrow::uchar character1 = atom1->data().at(0);
486   unibrow::uchar character2 = atom2->data().at(0);
487   if (character1 == character2) return 0;
488   if (character1 >= 'a' || character2 >= 'a') {
489     character1 = Canonical(canonicalize, character1);
490     character2 = Canonical(canonicalize, character2);
491   }
492   return static_cast<int>(character1) - static_cast<int>(character2);
493 }
494 #endif  // V8_INTL_SUPPORT
495 
496 // We can stable sort runs of atoms, since the order does not matter if they
497 // start with different characters.
498 // Returns true if any consecutive atoms were found.
SortConsecutiveAtoms(RegExpCompiler * compiler)499 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
500   ZoneList<RegExpTree*>* alternatives = this->alternatives();
501   int length = alternatives->length();
502   bool found_consecutive_atoms = false;
503   for (int i = 0; i < length; i++) {
504     while (i < length) {
505       RegExpTree* alternative = alternatives->at(i);
506       if (alternative->IsAtom()) break;
507       i++;
508     }
509     // i is length or it is the index of an atom.
510     if (i == length) break;
511     int first_atom = i;
512     JSRegExp::Flags flags = alternatives->at(i)->AsAtom()->flags();
513     i++;
514     while (i < length) {
515       RegExpTree* alternative = alternatives->at(i);
516       if (!alternative->IsAtom()) break;
517       if (alternative->AsAtom()->flags() != flags) break;
518       i++;
519     }
520     // Sort atoms to get ones with common prefixes together.
521     // This step is more tricky if we are in a case-independent regexp,
522     // because it would change /is|I/ to /I|is/, and order matters when
523     // the regexp parts don't match only disjoint starting points. To fix
524     // this we have a version of CompareFirstChar that uses case-
525     // independent character classes for comparison.
526     DCHECK_LT(first_atom, alternatives->length());
527     DCHECK_LE(i, alternatives->length());
528     DCHECK_LE(first_atom, i);
529     if (IgnoreCase(flags)) {
530 #ifdef V8_INTL_SUPPORT
531       alternatives->StableSort(CompareFirstCharCaseInsensitve, first_atom,
532                                i - first_atom);
533 #else
534       unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
535           compiler->isolate()->regexp_macro_assembler_canonicalize();
536       auto compare_closure = [canonicalize](RegExpTree* const* a,
537                                             RegExpTree* const* b) {
538         return CompareFirstCharCaseIndependent(canonicalize, a, b);
539       };
540       alternatives->StableSort(compare_closure, first_atom, i - first_atom);
541 #endif  // V8_INTL_SUPPORT
542     } else {
543       alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
544     }
545     if (i - first_atom > 1) found_consecutive_atoms = true;
546   }
547   return found_consecutive_atoms;
548 }
549 
550 // Optimizes ab|ac|az to a(?:b|c|d).
RationalizeConsecutiveAtoms(RegExpCompiler * compiler)551 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
552   Zone* zone = compiler->zone();
553   ZoneList<RegExpTree*>* alternatives = this->alternatives();
554   int length = alternatives->length();
555 
556   int write_posn = 0;
557   int i = 0;
558   while (i < length) {
559     RegExpTree* alternative = alternatives->at(i);
560     if (!alternative->IsAtom()) {
561       alternatives->at(write_posn++) = alternatives->at(i);
562       i++;
563       continue;
564     }
565     RegExpAtom* const atom = alternative->AsAtom();
566     JSRegExp::Flags flags = atom->flags();
567 #ifdef V8_INTL_SUPPORT
568     icu::UnicodeString common_prefix(atom->data().at(0));
569 #else
570     unibrow::uchar common_prefix = atom->data().at(0);
571 #endif  // V8_INTL_SUPPORT
572     int first_with_prefix = i;
573     int prefix_length = atom->length();
574     i++;
575     while (i < length) {
576       alternative = alternatives->at(i);
577       if (!alternative->IsAtom()) break;
578       RegExpAtom* const atom = alternative->AsAtom();
579       if (atom->flags() != flags) break;
580 #ifdef V8_INTL_SUPPORT
581       icu::UnicodeString new_prefix(atom->data().at(0));
582       if (new_prefix != common_prefix) {
583         if (!IgnoreCase(flags)) break;
584         if (common_prefix.caseCompare(new_prefix, U_FOLD_CASE_DEFAULT) != 0)
585           break;
586       }
587 #else
588       unibrow::uchar new_prefix = atom->data().at(0);
589       if (new_prefix != common_prefix) {
590         if (!IgnoreCase(flags)) break;
591         unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
592             compiler->isolate()->regexp_macro_assembler_canonicalize();
593         new_prefix = Canonical(canonicalize, new_prefix);
594         common_prefix = Canonical(canonicalize, common_prefix);
595         if (new_prefix != common_prefix) break;
596       }
597 #endif  // V8_INTL_SUPPORT
598       prefix_length = Min(prefix_length, atom->length());
599       i++;
600     }
601     if (i > first_with_prefix + 2) {
602       // Found worthwhile run of alternatives with common prefix of at least one
603       // character.  The sorting function above did not sort on more than one
604       // character for reasons of correctness, but there may still be a longer
605       // common prefix if the terms were similar or presorted in the input.
606       // Find out how long the common prefix is.
607       int run_length = i - first_with_prefix;
608       RegExpAtom* const atom = alternatives->at(first_with_prefix)->AsAtom();
609       for (int j = 1; j < run_length && prefix_length > 1; j++) {
610         RegExpAtom* old_atom =
611             alternatives->at(j + first_with_prefix)->AsAtom();
612         for (int k = 1; k < prefix_length; k++) {
613           if (atom->data().at(k) != old_atom->data().at(k)) {
614             prefix_length = k;
615             break;
616           }
617         }
618       }
619       RegExpAtom* prefix = zone->New<RegExpAtom>(
620           atom->data().SubVector(0, prefix_length), flags);
621       ZoneList<RegExpTree*>* pair = zone->New<ZoneList<RegExpTree*>>(2, zone);
622       pair->Add(prefix, zone);
623       ZoneList<RegExpTree*>* suffixes =
624           zone->New<ZoneList<RegExpTree*>>(run_length, zone);
625       for (int j = 0; j < run_length; j++) {
626         RegExpAtom* old_atom =
627             alternatives->at(j + first_with_prefix)->AsAtom();
628         int len = old_atom->length();
629         if (len == prefix_length) {
630           suffixes->Add(zone->New<RegExpEmpty>(), zone);
631         } else {
632           RegExpTree* suffix = zone->New<RegExpAtom>(
633               old_atom->data().SubVector(prefix_length, old_atom->length()),
634               flags);
635           suffixes->Add(suffix, zone);
636         }
637       }
638       pair->Add(zone->New<RegExpDisjunction>(suffixes), zone);
639       alternatives->at(write_posn++) = zone->New<RegExpAlternative>(pair);
640     } else {
641       // Just copy any non-worthwhile alternatives.
642       for (int j = first_with_prefix; j < i; j++) {
643         alternatives->at(write_posn++) = alternatives->at(j);
644       }
645     }
646   }
647   alternatives->Rewind(write_posn);  // Trim end of array.
648 }
649 
650 // Optimizes b|c|z to [bcz].
FixSingleCharacterDisjunctions(RegExpCompiler * compiler)651 void RegExpDisjunction::FixSingleCharacterDisjunctions(
652     RegExpCompiler* compiler) {
653   Zone* zone = compiler->zone();
654   ZoneList<RegExpTree*>* alternatives = this->alternatives();
655   int length = alternatives->length();
656 
657   int write_posn = 0;
658   int i = 0;
659   while (i < length) {
660     RegExpTree* alternative = alternatives->at(i);
661     if (!alternative->IsAtom()) {
662       alternatives->at(write_posn++) = alternatives->at(i);
663       i++;
664       continue;
665     }
666     RegExpAtom* const atom = alternative->AsAtom();
667     if (atom->length() != 1) {
668       alternatives->at(write_posn++) = alternatives->at(i);
669       i++;
670       continue;
671     }
672     JSRegExp::Flags flags = atom->flags();
673     DCHECK_IMPLIES(IsUnicode(flags),
674                    !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
675     bool contains_trail_surrogate =
676         unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
677     int first_in_run = i;
678     i++;
679     // Find a run of single-character atom alternatives that have identical
680     // flags (case independence and unicode-ness).
681     while (i < length) {
682       alternative = alternatives->at(i);
683       if (!alternative->IsAtom()) break;
684       RegExpAtom* const atom = alternative->AsAtom();
685       if (atom->length() != 1) break;
686       if (atom->flags() != flags) break;
687       DCHECK_IMPLIES(IsUnicode(flags),
688                      !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
689       contains_trail_surrogate |=
690           unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
691       i++;
692     }
693     if (i > first_in_run + 1) {
694       // Found non-trivial run of single-character alternatives.
695       int run_length = i - first_in_run;
696       ZoneList<CharacterRange>* ranges =
697           zone->New<ZoneList<CharacterRange>>(2, zone);
698       for (int j = 0; j < run_length; j++) {
699         RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
700         DCHECK_EQ(old_atom->length(), 1);
701         ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
702       }
703       RegExpCharacterClass::CharacterClassFlags character_class_flags;
704       if (IsUnicode(flags) && contains_trail_surrogate) {
705         character_class_flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE;
706       }
707       alternatives->at(write_posn++) = zone->New<RegExpCharacterClass>(
708           zone, ranges, flags, character_class_flags);
709     } else {
710       // Just copy any trivial alternatives.
711       for (int j = first_in_run; j < i; j++) {
712         alternatives->at(write_posn++) = alternatives->at(j);
713       }
714     }
715   }
716   alternatives->Rewind(write_posn);  // Trim end of array.
717 }
718 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)719 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
720                                       RegExpNode* on_success) {
721   ZoneList<RegExpTree*>* alternatives = this->alternatives();
722 
723   if (alternatives->length() > 2) {
724     bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
725     if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
726     FixSingleCharacterDisjunctions(compiler);
727     if (alternatives->length() == 1) {
728       return alternatives->at(0)->ToNode(compiler, on_success);
729     }
730   }
731 
732   int length = alternatives->length();
733 
734   ChoiceNode* result =
735       compiler->zone()->New<ChoiceNode>(length, compiler->zone());
736   for (int i = 0; i < length; i++) {
737     GuardedAlternative alternative(
738         alternatives->at(i)->ToNode(compiler, on_success));
739     result->AddAlternative(alternative);
740   }
741   return result;
742 }
743 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)744 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
745                                      RegExpNode* on_success) {
746   return ToNode(min(), max(), is_greedy(), body(), compiler, on_success);
747 }
748 
749 namespace {
750 // Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
751 //         \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
BoundaryAssertionAsLookaround(RegExpCompiler * compiler,RegExpNode * on_success,RegExpAssertion::AssertionType type,JSRegExp::Flags flags)752 RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
753                                           RegExpNode* on_success,
754                                           RegExpAssertion::AssertionType type,
755                                           JSRegExp::Flags flags) {
756   DCHECK(NeedsUnicodeCaseEquivalents(flags));
757   Zone* zone = compiler->zone();
758   ZoneList<CharacterRange>* word_range =
759       zone->New<ZoneList<CharacterRange>>(2, zone);
760   CharacterRange::AddClassEscape('w', word_range, true, zone);
761   int stack_register = compiler->UnicodeLookaroundStackRegister();
762   int position_register = compiler->UnicodeLookaroundPositionRegister();
763   ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
764   // Add two choices. The (non-)boundary could start with a word or
765   // a non-word-character.
766   for (int i = 0; i < 2; i++) {
767     bool lookbehind_for_word = i == 0;
768     bool lookahead_for_word =
769         (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word;
770     // Look to the left.
771     RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
772                                          stack_register, position_register);
773     RegExpNode* backward = TextNode::CreateForCharacterRanges(
774         zone, word_range, true, lookbehind.on_match_success(), flags);
775     // Look to the right.
776     RegExpLookaround::Builder lookahead(lookahead_for_word,
777                                         lookbehind.ForMatch(backward),
778                                         stack_register, position_register);
779     RegExpNode* forward = TextNode::CreateForCharacterRanges(
780         zone, word_range, false, lookahead.on_match_success(), flags);
781     result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
782   }
783   return result;
784 }
785 }  // anonymous namespace
786 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)787 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
788                                     RegExpNode* on_success) {
789   NodeInfo info;
790   Zone* zone = compiler->zone();
791 
792   switch (assertion_type()) {
793     case START_OF_LINE:
794       return AssertionNode::AfterNewline(on_success);
795     case START_OF_INPUT:
796       return AssertionNode::AtStart(on_success);
797     case BOUNDARY:
798       return NeedsUnicodeCaseEquivalents(flags_)
799                  ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY,
800                                                  flags_)
801                  : AssertionNode::AtBoundary(on_success);
802     case NON_BOUNDARY:
803       return NeedsUnicodeCaseEquivalents(flags_)
804                  ? BoundaryAssertionAsLookaround(compiler, on_success,
805                                                  NON_BOUNDARY, flags_)
806                  : AssertionNode::AtNonBoundary(on_success);
807     case END_OF_INPUT:
808       return AssertionNode::AtEnd(on_success);
809     case END_OF_LINE: {
810       // Compile $ in multiline regexps as an alternation with a positive
811       // lookahead in one side and an end-of-input on the other side.
812       // We need two registers for the lookahead.
813       int stack_pointer_register = compiler->AllocateRegister();
814       int position_register = compiler->AllocateRegister();
815       // The ChoiceNode to distinguish between a newline and end-of-input.
816       ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
817       // Create a newline atom.
818       ZoneList<CharacterRange>* newline_ranges =
819           zone->New<ZoneList<CharacterRange>>(3, zone);
820       CharacterRange::AddClassEscape('n', newline_ranges, false, zone);
821       JSRegExp::Flags default_flags = JSRegExp::Flags();
822       RegExpCharacterClass* newline_atom =
823           zone->New<RegExpCharacterClass>('n', default_flags);
824       TextNode* newline_matcher =
825           zone->New<TextNode>(newline_atom, false,
826                               ActionNode::PositiveSubmatchSuccess(
827                                   stack_pointer_register, position_register,
828                                   0,   // No captures inside.
829                                   -1,  // Ignored if no captures.
830                                   on_success));
831       // Create an end-of-input matcher.
832       RegExpNode* end_of_line = ActionNode::BeginSubmatch(
833           stack_pointer_register, position_register, newline_matcher);
834       // Add the two alternatives to the ChoiceNode.
835       GuardedAlternative eol_alternative(end_of_line);
836       result->AddAlternative(eol_alternative);
837       GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
838       result->AddAlternative(end_alternative);
839       return result;
840     }
841     default:
842       UNREACHABLE();
843   }
844   return on_success;
845 }
846 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)847 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
848                                         RegExpNode* on_success) {
849   return compiler->zone()->New<BackReferenceNode>(
850       RegExpCapture::StartRegister(index()),
851       RegExpCapture::EndRegister(index()), flags_, compiler->read_backward(),
852       on_success);
853 }
854 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)855 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
856                                 RegExpNode* on_success) {
857   return on_success;
858 }
859 
Builder(bool is_positive,RegExpNode * on_success,int stack_pointer_register,int position_register,int capture_register_count,int capture_register_start)860 RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
861                                    int stack_pointer_register,
862                                    int position_register,
863                                    int capture_register_count,
864                                    int capture_register_start)
865     : is_positive_(is_positive),
866       on_success_(on_success),
867       stack_pointer_register_(stack_pointer_register),
868       position_register_(position_register) {
869   if (is_positive_) {
870     on_match_success_ = ActionNode::PositiveSubmatchSuccess(
871         stack_pointer_register, position_register, capture_register_count,
872         capture_register_start, on_success_);
873   } else {
874     Zone* zone = on_success_->zone();
875     on_match_success_ = zone->New<NegativeSubmatchSuccess>(
876         stack_pointer_register, position_register, capture_register_count,
877         capture_register_start, zone);
878   }
879 }
880 
ForMatch(RegExpNode * match)881 RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
882   if (is_positive_) {
883     return ActionNode::BeginSubmatch(stack_pointer_register_,
884                                      position_register_, match);
885   } else {
886     Zone* zone = on_success_->zone();
887     // We use a ChoiceNode to represent the negative lookaround. The first
888     // alternative is the negative match. On success, the end node backtracks.
889     // On failure, the second alternative is tried and leads to success.
890     // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
891     // first exit when calculating quick checks.
892     ChoiceNode* choice_node = zone->New<NegativeLookaroundChoiceNode>(
893         GuardedAlternative(match), GuardedAlternative(on_success_), zone);
894     return ActionNode::BeginSubmatch(stack_pointer_register_,
895                                      position_register_, choice_node);
896   }
897 }
898 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)899 RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
900                                      RegExpNode* on_success) {
901   int stack_pointer_register = compiler->AllocateRegister();
902   int position_register = compiler->AllocateRegister();
903 
904   const int registers_per_capture = 2;
905   const int register_of_first_capture = 2;
906   int register_count = capture_count_ * registers_per_capture;
907   int register_start =
908       register_of_first_capture + capture_from_ * registers_per_capture;
909 
910   RegExpNode* result;
911   bool was_reading_backward = compiler->read_backward();
912   compiler->set_read_backward(type() == LOOKBEHIND);
913   Builder builder(is_positive(), on_success, stack_pointer_register,
914                   position_register, register_count, register_start);
915   RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
916   result = builder.ForMatch(match);
917   compiler->set_read_backward(was_reading_backward);
918   return result;
919 }
920 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)921 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
922                                   RegExpNode* on_success) {
923   return ToNode(body(), index(), compiler, on_success);
924 }
925 
ToNode(RegExpTree * body,int index,RegExpCompiler * compiler,RegExpNode * on_success)926 RegExpNode* RegExpCapture::ToNode(RegExpTree* body, int index,
927                                   RegExpCompiler* compiler,
928                                   RegExpNode* on_success) {
929   DCHECK_NOT_NULL(body);
930   int start_reg = RegExpCapture::StartRegister(index);
931   int end_reg = RegExpCapture::EndRegister(index);
932   if (compiler->read_backward()) std::swap(start_reg, end_reg);
933   RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
934   RegExpNode* body_node = body->ToNode(compiler, store_end);
935   return ActionNode::StorePosition(start_reg, true, body_node);
936 }
937 
938 namespace {
939 
940 class AssertionSequenceRewriter final {
941  public:
942   // TODO(jgruber): Consider moving this to a separate AST tree rewriter pass
943   // instead of sprinkling rewrites into the AST->Node conversion process.
MaybeRewrite(ZoneList<RegExpTree * > * terms,Zone * zone)944   static void MaybeRewrite(ZoneList<RegExpTree*>* terms, Zone* zone) {
945     AssertionSequenceRewriter rewriter(terms, zone);
946 
947     static constexpr int kNoIndex = -1;
948     int from = kNoIndex;
949 
950     for (int i = 0; i < terms->length(); i++) {
951       RegExpTree* t = terms->at(i);
952       if (from == kNoIndex && t->IsAssertion()) {
953         from = i;  // Start a sequence.
954       } else if (from != kNoIndex && !t->IsAssertion()) {
955         // Terminate and process the sequence.
956         if (i - from > 1) rewriter.Rewrite(from, i);
957         from = kNoIndex;
958       }
959     }
960 
961     if (from != kNoIndex && terms->length() - from > 1) {
962       rewriter.Rewrite(from, terms->length());
963     }
964   }
965 
966   // All assertions are zero width. A consecutive sequence of assertions is
967   // order-independent. There's two ways we can optimize here:
968   // 1. fold all identical assertions.
969   // 2. if any assertion combinations are known to fail (e.g. \b\B), the entire
970   //    sequence fails.
Rewrite(int from,int to)971   void Rewrite(int from, int to) {
972     DCHECK_GT(to, from + 1);
973 
974     // Bitfield of all seen assertions.
975     uint32_t seen_assertions = 0;
976     STATIC_ASSERT(RegExpAssertion::LAST_TYPE < kUInt32Size * kBitsPerByte);
977 
978     // Flags must match for folding.
979     JSRegExp::Flags flags = terms_->at(from)->AsAssertion()->flags();
980     bool saw_mismatched_flags = false;
981 
982     for (int i = from; i < to; i++) {
983       RegExpAssertion* t = terms_->at(i)->AsAssertion();
984       if (t->flags() != flags) saw_mismatched_flags = true;
985       const uint32_t bit = 1 << t->assertion_type();
986 
987       if ((seen_assertions & bit) && !saw_mismatched_flags) {
988         // Fold duplicates.
989         terms_->Set(i, zone_->New<RegExpEmpty>());
990       }
991 
992       seen_assertions |= bit;
993     }
994 
995     // Collapse failures.
996     const uint32_t always_fails_mask =
997         1 << RegExpAssertion::BOUNDARY | 1 << RegExpAssertion::NON_BOUNDARY;
998     if ((seen_assertions & always_fails_mask) == always_fails_mask) {
999       ReplaceSequenceWithFailure(from, to);
1000     }
1001   }
1002 
ReplaceSequenceWithFailure(int from,int to)1003   void ReplaceSequenceWithFailure(int from, int to) {
1004     // Replace the entire sequence with a single node that always fails.
1005     // TODO(jgruber): Consider adding an explicit Fail kind. Until then, the
1006     // negated '*' (everything) range serves the purpose.
1007     ZoneList<CharacterRange>* ranges =
1008         zone_->New<ZoneList<CharacterRange>>(0, zone_);
1009     RegExpCharacterClass* cc =
1010         zone_->New<RegExpCharacterClass>(zone_, ranges, JSRegExp::Flags());
1011     terms_->Set(from, cc);
1012 
1013     // Zero out the rest.
1014     RegExpEmpty* empty = zone_->New<RegExpEmpty>();
1015     for (int i = from + 1; i < to; i++) terms_->Set(i, empty);
1016   }
1017 
1018  private:
AssertionSequenceRewriter(ZoneList<RegExpTree * > * terms,Zone * zone)1019   AssertionSequenceRewriter(ZoneList<RegExpTree*>* terms, Zone* zone)
1020       : zone_(zone), terms_(terms) {}
1021 
1022   Zone* zone_;
1023   ZoneList<RegExpTree*>* terms_;
1024 };
1025 
1026 }  // namespace
1027 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)1028 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
1029                                       RegExpNode* on_success) {
1030   ZoneList<RegExpTree*>* children = nodes();
1031 
1032   AssertionSequenceRewriter::MaybeRewrite(children, compiler->zone());
1033 
1034   RegExpNode* current = on_success;
1035   if (compiler->read_backward()) {
1036     for (int i = 0; i < children->length(); i++) {
1037       current = children->at(i)->ToNode(compiler, current);
1038     }
1039   } else {
1040     for (int i = children->length() - 1; i >= 0; i--) {
1041       current = children->at(i)->ToNode(compiler, current);
1042     }
1043   }
1044   return current;
1045 }
1046 
AddClass(const int * elmv,int elmc,ZoneList<CharacterRange> * ranges,Zone * zone)1047 static void AddClass(const int* elmv, int elmc,
1048                      ZoneList<CharacterRange>* ranges, Zone* zone) {
1049   elmc--;
1050   DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
1051   for (int i = 0; i < elmc; i += 2) {
1052     DCHECK(elmv[i] < elmv[i + 1]);
1053     ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
1054   }
1055 }
1056 
AddClassNegated(const int * elmv,int elmc,ZoneList<CharacterRange> * ranges,Zone * zone)1057 static void AddClassNegated(const int* elmv, int elmc,
1058                             ZoneList<CharacterRange>* ranges, Zone* zone) {
1059   elmc--;
1060   DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
1061   DCHECK_NE(0x0000, elmv[0]);
1062   DCHECK_NE(String::kMaxCodePoint, elmv[elmc - 1]);
1063   uc16 last = 0x0000;
1064   for (int i = 0; i < elmc; i += 2) {
1065     DCHECK(last <= elmv[i] - 1);
1066     DCHECK(elmv[i] < elmv[i + 1]);
1067     ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
1068     last = elmv[i + 1];
1069   }
1070   ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
1071 }
1072 
AddClassEscape(char type,ZoneList<CharacterRange> * ranges,bool add_unicode_case_equivalents,Zone * zone)1073 void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
1074                                     bool add_unicode_case_equivalents,
1075                                     Zone* zone) {
1076   if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) {
1077     // See #sec-runtime-semantics-wordcharacters-abstract-operation
1078     // In case of unicode and ignore_case, we need to create the closure over
1079     // case equivalent characters before negating.
1080     ZoneList<CharacterRange>* new_ranges =
1081         zone->New<ZoneList<CharacterRange>>(2, zone);
1082     AddClass(kWordRanges, kWordRangeCount, new_ranges, zone);
1083     AddUnicodeCaseEquivalents(new_ranges, zone);
1084     if (type == 'W') {
1085       ZoneList<CharacterRange>* negated =
1086           zone->New<ZoneList<CharacterRange>>(2, zone);
1087       CharacterRange::Negate(new_ranges, negated, zone);
1088       new_ranges = negated;
1089     }
1090     ranges->AddAll(*new_ranges, zone);
1091     return;
1092   }
1093   AddClassEscape(type, ranges, zone);
1094 }
1095 
AddClassEscape(char type,ZoneList<CharacterRange> * ranges,Zone * zone)1096 void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
1097                                     Zone* zone) {
1098   switch (type) {
1099     case 's':
1100       AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
1101       break;
1102     case 'S':
1103       AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
1104       break;
1105     case 'w':
1106       AddClass(kWordRanges, kWordRangeCount, ranges, zone);
1107       break;
1108     case 'W':
1109       AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
1110       break;
1111     case 'd':
1112       AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
1113       break;
1114     case 'D':
1115       AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
1116       break;
1117     case '.':
1118       AddClassNegated(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges,
1119                       zone);
1120       break;
1121     // This is not a character range as defined by the spec but a
1122     // convenient shorthand for a character class that matches any
1123     // character.
1124     case '*':
1125       ranges->Add(CharacterRange::Everything(), zone);
1126       break;
1127     // This is the set of characters matched by the $ and ^ symbols
1128     // in multiline mode.
1129     case 'n':
1130       AddClass(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges, zone);
1131       break;
1132     default:
1133       UNREACHABLE();
1134   }
1135 }
1136 
GetWordBounds()1137 Vector<const int> CharacterRange::GetWordBounds() {
1138   return Vector<const int>(kWordRanges, kWordRangeCount - 1);
1139 }
1140 
1141 // static
AddCaseEquivalents(Isolate * isolate,Zone * zone,ZoneList<CharacterRange> * ranges,bool is_one_byte)1142 void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
1143                                         ZoneList<CharacterRange>* ranges,
1144                                         bool is_one_byte) {
1145   CharacterRange::Canonicalize(ranges);
1146   int range_count = ranges->length();
1147 #ifdef V8_INTL_SUPPORT
1148   icu::UnicodeSet others;
1149   for (int i = 0; i < range_count; i++) {
1150     CharacterRange range = ranges->at(i);
1151     uc32 from = range.from();
1152     if (from > String::kMaxUtf16CodeUnit) continue;
1153     uc32 to = Min(range.to(), String::kMaxUtf16CodeUnitU);
1154     // Nothing to be done for surrogates.
1155     if (from >= kLeadSurrogateStart && to <= kTrailSurrogateEnd) continue;
1156     if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
1157       if (from > String::kMaxOneByteCharCode) continue;
1158       if (to > String::kMaxOneByteCharCode) to = String::kMaxOneByteCharCode;
1159     }
1160     others.add(from, to);
1161   }
1162 
1163   // Compute the set of additional characters that should be added,
1164   // using UnicodeSet::closeOver. ECMA 262 defines slightly different
1165   // case-folding rules than Unicode, so some characters that are
1166   // added by closeOver do not match anything other than themselves in
1167   // JS. For example, 'ſ' (U+017F LATIN SMALL LETTER LONG S) is the
1168   // same case-insensitive character as 's' or 'S' according to
1169   // Unicode, but does not match any other character in JS. To handle
1170   // this case, we add such characters to the IgnoreSet and filter
1171   // them out. We filter twice: once before calling closeOver (to
1172   // prevent 'ſ' from adding 's'), and once after calling closeOver
1173   // (to prevent 's' from adding 'ſ'). See regexp/special-case.h for
1174   // more information.
1175   icu::UnicodeSet already_added(others);
1176   others.removeAll(RegExpCaseFolding::IgnoreSet());
1177   others.closeOver(USET_CASE_INSENSITIVE);
1178   others.removeAll(RegExpCaseFolding::IgnoreSet());
1179   others.removeAll(already_added);
1180 
1181   // Add others to the ranges
1182   for (int32_t i = 0; i < others.getRangeCount(); i++) {
1183     UChar32 from = others.getRangeStart(i);
1184     UChar32 to = others.getRangeEnd(i);
1185     if (from == to) {
1186       ranges->Add(CharacterRange::Singleton(from), zone);
1187     } else {
1188       ranges->Add(CharacterRange::Range(from, to), zone);
1189     }
1190   }
1191 #else
1192   for (int i = 0; i < range_count; i++) {
1193     CharacterRange range = ranges->at(i);
1194     uc32 bottom = range.from();
1195     if (bottom > String::kMaxUtf16CodeUnit) continue;
1196     uc32 top = Min(range.to(), String::kMaxUtf16CodeUnitU);
1197     // Nothing to be done for surrogates.
1198     if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue;
1199     if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
1200       if (bottom > String::kMaxOneByteCharCode) continue;
1201       if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
1202     }
1203     unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1204     if (top == bottom) {
1205       // If this is a singleton we just expand the one character.
1206       int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
1207       for (int i = 0; i < length; i++) {
1208         uc32 chr = chars[i];
1209         if (chr != bottom) {
1210           ranges->Add(CharacterRange::Singleton(chars[i]), zone);
1211         }
1212       }
1213     } else {
1214       // If this is a range we expand the characters block by block, expanding
1215       // contiguous subranges (blocks) one at a time.  The approach is as
1216       // follows.  For a given start character we look up the remainder of the
1217       // block that contains it (represented by the end point), for instance we
1218       // find 'z' if the character is 'c'.  A block is characterized by the
1219       // property that all characters uncanonicalize in the same way, except
1220       // that each entry in the result is incremented by the distance from the
1221       // first element.  So a-z is a block because 'a' uncanonicalizes to ['a',
1222       // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k].  Once
1223       // we've found the end point we look up its uncanonicalization and
1224       // produce a range for each element.  For instance for [c-f] we look up
1225       // ['z', 'Z'] and produce [c-f] and [C-F].  We then only add a range if
1226       // it is not already contained in the input, so [c-f] will be skipped but
1227       // [C-F] will be added.  If this range is not completely contained in a
1228       // block we do this for all the blocks covered by the range (handling
1229       // characters that is not in a block as a "singleton block").
1230       unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1231       uc32 pos = bottom;
1232       while (pos <= top) {
1233         int length =
1234             isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
1235         uc32 block_end;
1236         if (length == 0) {
1237           block_end = pos;
1238         } else {
1239           DCHECK_EQ(1, length);
1240           block_end = equivalents[0];
1241         }
1242         int end = (block_end > top) ? top : block_end;
1243         length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
1244                                                          equivalents);
1245         for (int i = 0; i < length; i++) {
1246           uc32 c = equivalents[i];
1247           uc32 range_from = c - (block_end - pos);
1248           uc32 range_to = c - (block_end - end);
1249           if (!(bottom <= range_from && range_to <= top)) {
1250             ranges->Add(CharacterRange::Range(range_from, range_to), zone);
1251           }
1252         }
1253         pos = end + 1;
1254       }
1255     }
1256   }
1257 #endif  // V8_INTL_SUPPORT
1258 }
1259 
IsCanonical(ZoneList<CharacterRange> * ranges)1260 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
1261   DCHECK_NOT_NULL(ranges);
1262   int n = ranges->length();
1263   if (n <= 1) return true;
1264   uc32 max = ranges->at(0).to();
1265   for (int i = 1; i < n; i++) {
1266     CharacterRange next_range = ranges->at(i);
1267     if (next_range.from() <= max + 1) return false;
1268     max = next_range.to();
1269   }
1270   return true;
1271 }
1272 
ranges(Zone * zone)1273 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
1274   if (ranges_ == nullptr) {
1275     ranges_ = zone->New<ZoneList<CharacterRange>>(2, zone);
1276     CharacterRange::AddClassEscape(standard_set_type_, ranges_, false, zone);
1277   }
1278   return ranges_;
1279 }
1280 
1281 // Move a number of elements in a zonelist to another position
1282 // in the same list. Handles overlapping source and target areas.
MoveRanges(ZoneList<CharacterRange> * list,int from,int to,int count)1283 static void MoveRanges(ZoneList<CharacterRange>* list, int from, int to,
1284                        int count) {
1285   // Ranges are potentially overlapping.
1286   if (from < to) {
1287     for (int i = count - 1; i >= 0; i--) {
1288       list->at(to + i) = list->at(from + i);
1289     }
1290   } else {
1291     for (int i = 0; i < count; i++) {
1292       list->at(to + i) = list->at(from + i);
1293     }
1294   }
1295 }
1296 
InsertRangeInCanonicalList(ZoneList<CharacterRange> * list,int count,CharacterRange insert)1297 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, int count,
1298                                       CharacterRange insert) {
1299   // Inserts a range into list[0..count[, which must be sorted
1300   // by from value and non-overlapping and non-adjacent, using at most
1301   // list[0..count] for the result. Returns the number of resulting
1302   // canonicalized ranges. Inserting a range may collapse existing ranges into
1303   // fewer ranges, so the return value can be anything in the range 1..count+1.
1304   uc32 from = insert.from();
1305   uc32 to = insert.to();
1306   int start_pos = 0;
1307   int end_pos = count;
1308   for (int i = count - 1; i >= 0; i--) {
1309     CharacterRange current = list->at(i);
1310     if (current.from() > to + 1) {
1311       end_pos = i;
1312     } else if (current.to() + 1 < from) {
1313       start_pos = i + 1;
1314       break;
1315     }
1316   }
1317 
1318   // Inserted range overlaps, or is adjacent to, ranges at positions
1319   // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
1320   // not affected by the insertion.
1321   // If start_pos == end_pos, the range must be inserted before start_pos.
1322   // if start_pos < end_pos, the entire range from start_pos to end_pos
1323   // must be merged with the insert range.
1324 
1325   if (start_pos == end_pos) {
1326     // Insert between existing ranges at position start_pos.
1327     if (start_pos < count) {
1328       MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
1329     }
1330     list->at(start_pos) = insert;
1331     return count + 1;
1332   }
1333   if (start_pos + 1 == end_pos) {
1334     // Replace single existing range at position start_pos.
1335     CharacterRange to_replace = list->at(start_pos);
1336     int new_from = Min(to_replace.from(), from);
1337     int new_to = Max(to_replace.to(), to);
1338     list->at(start_pos) = CharacterRange::Range(new_from, new_to);
1339     return count;
1340   }
1341   // Replace a number of existing ranges from start_pos to end_pos - 1.
1342   // Move the remaining ranges down.
1343 
1344   int new_from = Min(list->at(start_pos).from(), from);
1345   int new_to = Max(list->at(end_pos - 1).to(), to);
1346   if (end_pos < count) {
1347     MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
1348   }
1349   list->at(start_pos) = CharacterRange::Range(new_from, new_to);
1350   return count - (end_pos - start_pos) + 1;
1351 }
1352 
Canonicalize()1353 void CharacterSet::Canonicalize() {
1354   // Special/default classes are always considered canonical. The result
1355   // of calling ranges() will be sorted.
1356   if (ranges_ == nullptr) return;
1357   CharacterRange::Canonicalize(ranges_);
1358 }
1359 
Canonicalize(ZoneList<CharacterRange> * character_ranges)1360 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
1361   if (character_ranges->length() <= 1) return;
1362   // Check whether ranges are already canonical (increasing, non-overlapping,
1363   // non-adjacent).
1364   int n = character_ranges->length();
1365   uc32 max = character_ranges->at(0).to();
1366   int i = 1;
1367   while (i < n) {
1368     CharacterRange current = character_ranges->at(i);
1369     if (current.from() <= max + 1) {
1370       break;
1371     }
1372     max = current.to();
1373     i++;
1374   }
1375   // Canonical until the i'th range. If that's all of them, we are done.
1376   if (i == n) return;
1377 
1378   // The ranges at index i and forward are not canonicalized. Make them so by
1379   // doing the equivalent of insertion sort (inserting each into the previous
1380   // list, in order).
1381   // Notice that inserting a range can reduce the number of ranges in the
1382   // result due to combining of adjacent and overlapping ranges.
1383   int read = i;           // Range to insert.
1384   int num_canonical = i;  // Length of canonicalized part of list.
1385   do {
1386     num_canonical = InsertRangeInCanonicalList(character_ranges, num_canonical,
1387                                                character_ranges->at(read));
1388     read++;
1389   } while (read < n);
1390   character_ranges->Rewind(num_canonical);
1391 
1392   DCHECK(CharacterRange::IsCanonical(character_ranges));
1393 }
1394 
Negate(ZoneList<CharacterRange> * ranges,ZoneList<CharacterRange> * negated_ranges,Zone * zone)1395 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
1396                             ZoneList<CharacterRange>* negated_ranges,
1397                             Zone* zone) {
1398   DCHECK(CharacterRange::IsCanonical(ranges));
1399   DCHECK_EQ(0, negated_ranges->length());
1400   int range_count = ranges->length();
1401   uc32 from = 0;
1402   int i = 0;
1403   if (range_count > 0 && ranges->at(0).from() == 0) {
1404     from = ranges->at(0).to() + 1;
1405     i = 1;
1406   }
1407   while (i < range_count) {
1408     CharacterRange range = ranges->at(i);
1409     negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
1410     from = range.to() + 1;
1411     i++;
1412   }
1413   if (from < String::kMaxCodePoint) {
1414     negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint),
1415                         zone);
1416   }
1417 }
1418 
1419 // Scoped object to keep track of how much we unroll quantifier loops in the
1420 // regexp graph generator.
1421 class RegExpExpansionLimiter {
1422  public:
1423   static const int kMaxExpansionFactor = 6;
RegExpExpansionLimiter(RegExpCompiler * compiler,int factor)1424   RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
1425       : compiler_(compiler),
1426         saved_expansion_factor_(compiler->current_expansion_factor()),
1427         ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
1428     DCHECK_LT(0, factor);
1429     if (ok_to_expand_) {
1430       if (factor > kMaxExpansionFactor) {
1431         // Avoid integer overflow of the current expansion factor.
1432         ok_to_expand_ = false;
1433         compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
1434       } else {
1435         int new_factor = saved_expansion_factor_ * factor;
1436         ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
1437         compiler->set_current_expansion_factor(new_factor);
1438       }
1439     }
1440   }
1441 
~RegExpExpansionLimiter()1442   ~RegExpExpansionLimiter() {
1443     compiler_->set_current_expansion_factor(saved_expansion_factor_);
1444   }
1445 
ok_to_expand()1446   bool ok_to_expand() { return ok_to_expand_; }
1447 
1448  private:
1449   RegExpCompiler* compiler_;
1450   int saved_expansion_factor_;
1451   bool ok_to_expand_;
1452 
1453   DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
1454 };
1455 
ToNode(int min,int max,bool is_greedy,RegExpTree * body,RegExpCompiler * compiler,RegExpNode * on_success,bool not_at_start)1456 RegExpNode* RegExpQuantifier::ToNode(int min, int max, bool is_greedy,
1457                                      RegExpTree* body, RegExpCompiler* compiler,
1458                                      RegExpNode* on_success,
1459                                      bool not_at_start) {
1460   // x{f, t} becomes this:
1461   //
1462   //             (r++)<-.
1463   //               |     `
1464   //               |     (x)
1465   //               v     ^
1466   //      (r=0)-->(?)---/ [if r < t]
1467   //               |
1468   //   [if r >= f] \----> ...
1469   //
1470 
1471   // 15.10.2.5 RepeatMatcher algorithm.
1472   // The parser has already eliminated the case where max is 0.  In the case
1473   // where max_match is zero the parser has removed the quantifier if min was
1474   // > 0 and removed the atom if min was 0.  See AddQuantifierToAtom.
1475 
1476   // If we know that we cannot match zero length then things are a little
1477   // simpler since we don't need to make the special zero length match check
1478   // from step 2.1.  If the min and max are small we can unroll a little in
1479   // this case.
1480   static const int kMaxUnrolledMinMatches = 3;  // Unroll (foo)+ and (foo){3,}
1481   static const int kMaxUnrolledMaxMatches = 3;  // Unroll (foo)? and (foo){x,3}
1482   if (max == 0) return on_success;  // This can happen due to recursion.
1483   bool body_can_be_empty = (body->min_match() == 0);
1484   int body_start_reg = RegExpCompiler::kNoRegister;
1485   Interval capture_registers = body->CaptureRegisters();
1486   bool needs_capture_clearing = !capture_registers.is_empty();
1487   Zone* zone = compiler->zone();
1488 
1489   if (body_can_be_empty) {
1490     body_start_reg = compiler->AllocateRegister();
1491   } else if (compiler->optimize() && !needs_capture_clearing) {
1492     // Only unroll if there are no captures and the body can't be
1493     // empty.
1494     {
1495       RegExpExpansionLimiter limiter(compiler, min + ((max != min) ? 1 : 0));
1496       if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
1497         int new_max = (max == kInfinity) ? max : max - min;
1498         // Recurse once to get the loop or optional matches after the fixed
1499         // ones.
1500         RegExpNode* answer =
1501             ToNode(0, new_max, is_greedy, body, compiler, on_success, true);
1502         // Unroll the forced matches from 0 to min.  This can cause chains of
1503         // TextNodes (which the parser does not generate).  These should be
1504         // combined if it turns out they hinder good code generation.
1505         for (int i = 0; i < min; i++) {
1506           answer = body->ToNode(compiler, answer);
1507         }
1508         return answer;
1509       }
1510     }
1511     if (max <= kMaxUnrolledMaxMatches && min == 0) {
1512       DCHECK_LT(0, max);  // Due to the 'if' above.
1513       RegExpExpansionLimiter limiter(compiler, max);
1514       if (limiter.ok_to_expand()) {
1515         // Unroll the optional matches up to max.
1516         RegExpNode* answer = on_success;
1517         for (int i = 0; i < max; i++) {
1518           ChoiceNode* alternation = zone->New<ChoiceNode>(2, zone);
1519           if (is_greedy) {
1520             alternation->AddAlternative(
1521                 GuardedAlternative(body->ToNode(compiler, answer)));
1522             alternation->AddAlternative(GuardedAlternative(on_success));
1523           } else {
1524             alternation->AddAlternative(GuardedAlternative(on_success));
1525             alternation->AddAlternative(
1526                 GuardedAlternative(body->ToNode(compiler, answer)));
1527           }
1528           answer = alternation;
1529           if (not_at_start && !compiler->read_backward()) {
1530             alternation->set_not_at_start();
1531           }
1532         }
1533         return answer;
1534       }
1535     }
1536   }
1537   bool has_min = min > 0;
1538   bool has_max = max < RegExpTree::kInfinity;
1539   bool needs_counter = has_min || has_max;
1540   int reg_ctr = needs_counter ? compiler->AllocateRegister()
1541                               : RegExpCompiler::kNoRegister;
1542   LoopChoiceNode* center = zone->New<LoopChoiceNode>(
1543       body->min_match() == 0, compiler->read_backward(), min, zone);
1544   if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
1545   RegExpNode* loop_return =
1546       needs_counter ? static_cast<RegExpNode*>(
1547                           ActionNode::IncrementRegister(reg_ctr, center))
1548                     : static_cast<RegExpNode*>(center);
1549   if (body_can_be_empty) {
1550     // If the body can be empty we need to check if it was and then
1551     // backtrack.
1552     loop_return =
1553         ActionNode::EmptyMatchCheck(body_start_reg, reg_ctr, min, loop_return);
1554   }
1555   RegExpNode* body_node = body->ToNode(compiler, loop_return);
1556   if (body_can_be_empty) {
1557     // If the body can be empty we need to store the start position
1558     // so we can bail out if it was empty.
1559     body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
1560   }
1561   if (needs_capture_clearing) {
1562     // Before entering the body of this loop we need to clear captures.
1563     body_node = ActionNode::ClearCaptures(capture_registers, body_node);
1564   }
1565   GuardedAlternative body_alt(body_node);
1566   if (has_max) {
1567     Guard* body_guard = zone->New<Guard>(reg_ctr, Guard::LT, max);
1568     body_alt.AddGuard(body_guard, zone);
1569   }
1570   GuardedAlternative rest_alt(on_success);
1571   if (has_min) {
1572     Guard* rest_guard = compiler->zone()->New<Guard>(reg_ctr, Guard::GEQ, min);
1573     rest_alt.AddGuard(rest_guard, zone);
1574   }
1575   if (is_greedy) {
1576     center->AddLoopAlternative(body_alt);
1577     center->AddContinueAlternative(rest_alt);
1578   } else {
1579     center->AddContinueAlternative(rest_alt);
1580     center->AddLoopAlternative(body_alt);
1581   }
1582   if (needs_counter) {
1583     return ActionNode::SetRegisterForLoop(reg_ctr, 0, center);
1584   } else {
1585     return center;
1586   }
1587 }
1588 
1589 }  // namespace internal
1590 }  // namespace v8
1591