1 // Copyright 2019 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/regexp/regexp-compiler.h"
6
7 #include "src/execution/isolate.h"
8 #include "src/regexp/regexp.h"
9 #ifdef V8_INTL_SUPPORT
10 #include "src/regexp/special-case.h"
11 #endif // V8_INTL_SUPPORT
12 #include "src/strings/unicode-inl.h"
13 #include "src/zone/zone-list-inl.h"
14
15 #ifdef V8_INTL_SUPPORT
16 #include "unicode/locid.h"
17 #include "unicode/uniset.h"
18 #include "unicode/utypes.h"
19 #endif // V8_INTL_SUPPORT
20
21 namespace v8 {
22 namespace internal {
23
24 using namespace regexp_compiler_constants; // NOLINT(build/namespaces)
25
26 // -------------------------------------------------------------------
27 // Tree to graph conversion
28
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)29 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
30 RegExpNode* on_success) {
31 ZoneList<TextElement>* elms =
32 compiler->zone()->New<ZoneList<TextElement>>(1, compiler->zone());
33 elms->Add(TextElement::Atom(this), compiler->zone());
34 return compiler->zone()->New<TextNode>(elms, compiler->read_backward(),
35 on_success);
36 }
37
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)38 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
39 RegExpNode* on_success) {
40 return compiler->zone()->New<TextNode>(elements(), compiler->read_backward(),
41 on_success);
42 }
43
CompareInverseRanges(ZoneList<CharacterRange> * ranges,const int * special_class,int length)44 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
45 const int* special_class, int length) {
46 length--; // Remove final marker.
47 DCHECK_EQ(kRangeEndMarker, special_class[length]);
48 DCHECK_NE(0, ranges->length());
49 DCHECK_NE(0, length);
50 DCHECK_NE(0, special_class[0]);
51 if (ranges->length() != (length >> 1) + 1) {
52 return false;
53 }
54 CharacterRange range = ranges->at(0);
55 if (range.from() != 0) {
56 return false;
57 }
58 for (int i = 0; i < length; i += 2) {
59 if (static_cast<uc32>(special_class[i]) != (range.to() + 1)) {
60 return false;
61 }
62 range = ranges->at((i >> 1) + 1);
63 if (static_cast<uc32>(special_class[i + 1]) != range.from()) {
64 return false;
65 }
66 }
67 if (range.to() != String::kMaxCodePoint) {
68 return false;
69 }
70 return true;
71 }
72
CompareRanges(ZoneList<CharacterRange> * ranges,const int * special_class,int length)73 static bool CompareRanges(ZoneList<CharacterRange>* ranges,
74 const int* special_class, int length) {
75 length--; // Remove final marker.
76 DCHECK_EQ(kRangeEndMarker, special_class[length]);
77 if (ranges->length() * 2 != length) {
78 return false;
79 }
80 for (int i = 0; i < length; i += 2) {
81 CharacterRange range = ranges->at(i >> 1);
82 if (range.from() != static_cast<uc32>(special_class[i]) ||
83 range.to() != static_cast<uc32>(special_class[i + 1] - 1)) {
84 return false;
85 }
86 }
87 return true;
88 }
89
is_standard(Zone * zone)90 bool RegExpCharacterClass::is_standard(Zone* zone) {
91 // TODO(lrn): Remove need for this function, by not throwing away information
92 // along the way.
93 if (is_negated()) {
94 return false;
95 }
96 if (set_.is_standard()) {
97 return true;
98 }
99 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
100 set_.set_standard_set_type('s');
101 return true;
102 }
103 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
104 set_.set_standard_set_type('S');
105 return true;
106 }
107 if (CompareInverseRanges(set_.ranges(zone), kLineTerminatorRanges,
108 kLineTerminatorRangeCount)) {
109 set_.set_standard_set_type('.');
110 return true;
111 }
112 if (CompareRanges(set_.ranges(zone), kLineTerminatorRanges,
113 kLineTerminatorRangeCount)) {
114 set_.set_standard_set_type('n');
115 return true;
116 }
117 if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
118 set_.set_standard_set_type('w');
119 return true;
120 }
121 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
122 set_.set_standard_set_type('W');
123 return true;
124 }
125 return false;
126 }
127
UnicodeRangeSplitter(ZoneList<CharacterRange> * base)128 UnicodeRangeSplitter::UnicodeRangeSplitter(ZoneList<CharacterRange>* base) {
129 // The unicode range splitter categorizes given character ranges into:
130 // - Code points from the BMP representable by one code unit.
131 // - Code points outside the BMP that need to be split into surrogate pairs.
132 // - Lone lead surrogates.
133 // - Lone trail surrogates.
134 // Lone surrogates are valid code points, even though no actual characters.
135 // They require special matching to make sure we do not split surrogate pairs.
136
137 for (int i = 0; i < base->length(); i++) AddRange(base->at(i));
138 }
139
AddRange(CharacterRange range)140 void UnicodeRangeSplitter::AddRange(CharacterRange range) {
141 static constexpr uc32 kBmp1Start = 0;
142 static constexpr uc32 kBmp1End = kLeadSurrogateStart - 1;
143 static constexpr uc32 kBmp2Start = kTrailSurrogateEnd + 1;
144 static constexpr uc32 kBmp2End = kNonBmpStart - 1;
145
146 // Ends are all inclusive.
147 STATIC_ASSERT(kBmp1Start == 0);
148 STATIC_ASSERT(kBmp1Start < kBmp1End);
149 STATIC_ASSERT(kBmp1End + 1 == kLeadSurrogateStart);
150 STATIC_ASSERT(kLeadSurrogateStart < kLeadSurrogateEnd);
151 STATIC_ASSERT(kLeadSurrogateEnd + 1 == kTrailSurrogateStart);
152 STATIC_ASSERT(kTrailSurrogateStart < kTrailSurrogateEnd);
153 STATIC_ASSERT(kTrailSurrogateEnd + 1 == kBmp2Start);
154 STATIC_ASSERT(kBmp2Start < kBmp2End);
155 STATIC_ASSERT(kBmp2End + 1 == kNonBmpStart);
156 STATIC_ASSERT(kNonBmpStart < kNonBmpEnd);
157
158 static constexpr uc32 kStarts[] = {
159 kBmp1Start, kLeadSurrogateStart, kTrailSurrogateStart,
160 kBmp2Start, kNonBmpStart,
161 };
162
163 static constexpr uc32 kEnds[] = {
164 kBmp1End, kLeadSurrogateEnd, kTrailSurrogateEnd, kBmp2End, kNonBmpEnd,
165 };
166
167 CharacterRangeVector* const kTargets[] = {
168 &bmp_, &lead_surrogates_, &trail_surrogates_, &bmp_, &non_bmp_,
169 };
170
171 static constexpr int kCount = arraysize(kStarts);
172 STATIC_ASSERT(kCount == arraysize(kEnds));
173 STATIC_ASSERT(kCount == arraysize(kTargets));
174
175 for (int i = 0; i < kCount; i++) {
176 if (kStarts[i] > range.to()) break;
177 const uc32 from = std::max(kStarts[i], range.from());
178 const uc32 to = std::min(kEnds[i], range.to());
179 if (from > to) continue;
180 kTargets[i]->emplace_back(CharacterRange::Range(from, to));
181 }
182 }
183
184 namespace {
185
186 // Translates between new and old V8-isms (SmallVector, ZoneList).
ToCanonicalZoneList(const UnicodeRangeSplitter::CharacterRangeVector * v,Zone * zone)187 ZoneList<CharacterRange>* ToCanonicalZoneList(
188 const UnicodeRangeSplitter::CharacterRangeVector* v, Zone* zone) {
189 if (v->empty()) return nullptr;
190
191 ZoneList<CharacterRange>* result =
192 zone->New<ZoneList<CharacterRange>>(static_cast<int>(v->size()), zone);
193 for (size_t i = 0; i < v->size(); i++) {
194 result->Add(v->at(i), zone);
195 }
196
197 CharacterRange::Canonicalize(result);
198 return result;
199 }
200
AddBmpCharacters(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter,JSRegExp::Flags flags)201 void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
202 RegExpNode* on_success, UnicodeRangeSplitter* splitter,
203 JSRegExp::Flags flags) {
204 ZoneList<CharacterRange>* bmp =
205 ToCanonicalZoneList(splitter->bmp(), compiler->zone());
206 if (bmp == nullptr) return;
207 result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
208 compiler->zone(), bmp, compiler->read_backward(), on_success, flags)));
209 }
210
AddNonBmpSurrogatePairs(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter,JSRegExp::Flags flags)211 void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
212 RegExpNode* on_success,
213 UnicodeRangeSplitter* splitter,
214 JSRegExp::Flags flags) {
215 ZoneList<CharacterRange>* non_bmp =
216 ToCanonicalZoneList(splitter->non_bmp(), compiler->zone());
217 if (non_bmp == nullptr) return;
218 DCHECK(!compiler->one_byte());
219 Zone* zone = compiler->zone();
220 CharacterRange::Canonicalize(non_bmp);
221 for (int i = 0; i < non_bmp->length(); i++) {
222 // Match surrogate pair.
223 // E.g. [\u10005-\u11005] becomes
224 // \ud800[\udc05-\udfff]|
225 // [\ud801-\ud803][\udc00-\udfff]|
226 // \ud804[\udc00-\udc05]
227 uc32 from = non_bmp->at(i).from();
228 uc32 to = non_bmp->at(i).to();
229 uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
230 uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
231 uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
232 uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
233 if (from_l == to_l) {
234 // The lead surrogate is the same.
235 result->AddAlternative(
236 GuardedAlternative(TextNode::CreateForSurrogatePair(
237 zone, CharacterRange::Singleton(from_l),
238 CharacterRange::Range(from_t, to_t), compiler->read_backward(),
239 on_success, flags)));
240 } else {
241 if (from_t != kTrailSurrogateStart) {
242 // Add [from_l][from_t-\udfff]
243 result->AddAlternative(
244 GuardedAlternative(TextNode::CreateForSurrogatePair(
245 zone, CharacterRange::Singleton(from_l),
246 CharacterRange::Range(from_t, kTrailSurrogateEnd),
247 compiler->read_backward(), on_success, flags)));
248 from_l++;
249 }
250 if (to_t != kTrailSurrogateEnd) {
251 // Add [to_l][\udc00-to_t]
252 result->AddAlternative(
253 GuardedAlternative(TextNode::CreateForSurrogatePair(
254 zone, CharacterRange::Singleton(to_l),
255 CharacterRange::Range(kTrailSurrogateStart, to_t),
256 compiler->read_backward(), on_success, flags)));
257 to_l--;
258 }
259 if (from_l <= to_l) {
260 // Add [from_l-to_l][\udc00-\udfff]
261 result->AddAlternative(
262 GuardedAlternative(TextNode::CreateForSurrogatePair(
263 zone, CharacterRange::Range(from_l, to_l),
264 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
265 compiler->read_backward(), on_success, flags)));
266 }
267 }
268 }
269 }
270
NegativeLookaroundAgainstReadDirectionAndMatch(RegExpCompiler * compiler,ZoneList<CharacterRange> * lookbehind,ZoneList<CharacterRange> * match,RegExpNode * on_success,bool read_backward,JSRegExp::Flags flags)271 RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
272 RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
273 ZoneList<CharacterRange>* match, RegExpNode* on_success, bool read_backward,
274 JSRegExp::Flags flags) {
275 Zone* zone = compiler->zone();
276 RegExpNode* match_node = TextNode::CreateForCharacterRanges(
277 zone, match, read_backward, on_success, flags);
278 int stack_register = compiler->UnicodeLookaroundStackRegister();
279 int position_register = compiler->UnicodeLookaroundPositionRegister();
280 RegExpLookaround::Builder lookaround(false, match_node, stack_register,
281 position_register);
282 RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
283 zone, lookbehind, !read_backward, lookaround.on_match_success(), flags);
284 return lookaround.ForMatch(negative_match);
285 }
286
MatchAndNegativeLookaroundInReadDirection(RegExpCompiler * compiler,ZoneList<CharacterRange> * match,ZoneList<CharacterRange> * lookahead,RegExpNode * on_success,bool read_backward,JSRegExp::Flags flags)287 RegExpNode* MatchAndNegativeLookaroundInReadDirection(
288 RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
289 ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
290 bool read_backward, JSRegExp::Flags flags) {
291 Zone* zone = compiler->zone();
292 int stack_register = compiler->UnicodeLookaroundStackRegister();
293 int position_register = compiler->UnicodeLookaroundPositionRegister();
294 RegExpLookaround::Builder lookaround(false, on_success, stack_register,
295 position_register);
296 RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
297 zone, lookahead, read_backward, lookaround.on_match_success(), flags);
298 return TextNode::CreateForCharacterRanges(
299 zone, match, read_backward, lookaround.ForMatch(negative_match), flags);
300 }
301
AddLoneLeadSurrogates(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter,JSRegExp::Flags flags)302 void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
303 RegExpNode* on_success,
304 UnicodeRangeSplitter* splitter,
305 JSRegExp::Flags flags) {
306 ZoneList<CharacterRange>* lead_surrogates =
307 ToCanonicalZoneList(splitter->lead_surrogates(), compiler->zone());
308 if (lead_surrogates == nullptr) return;
309 Zone* zone = compiler->zone();
310 // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
311 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
312 zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
313
314 RegExpNode* match;
315 if (compiler->read_backward()) {
316 // Reading backward. Assert that reading forward, there is no trail
317 // surrogate, and then backward match the lead surrogate.
318 match = NegativeLookaroundAgainstReadDirectionAndMatch(
319 compiler, trail_surrogates, lead_surrogates, on_success, true, flags);
320 } else {
321 // Reading forward. Forward match the lead surrogate and assert that
322 // no trail surrogate follows.
323 match = MatchAndNegativeLookaroundInReadDirection(
324 compiler, lead_surrogates, trail_surrogates, on_success, false, flags);
325 }
326 result->AddAlternative(GuardedAlternative(match));
327 }
328
AddLoneTrailSurrogates(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter,JSRegExp::Flags flags)329 void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
330 RegExpNode* on_success,
331 UnicodeRangeSplitter* splitter,
332 JSRegExp::Flags flags) {
333 ZoneList<CharacterRange>* trail_surrogates =
334 ToCanonicalZoneList(splitter->trail_surrogates(), compiler->zone());
335 if (trail_surrogates == nullptr) return;
336 Zone* zone = compiler->zone();
337 // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
338 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
339 zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
340
341 RegExpNode* match;
342 if (compiler->read_backward()) {
343 // Reading backward. Backward match the trail surrogate and assert that no
344 // lead surrogate precedes it.
345 match = MatchAndNegativeLookaroundInReadDirection(
346 compiler, trail_surrogates, lead_surrogates, on_success, true, flags);
347 } else {
348 // Reading forward. Assert that reading backward, there is no lead
349 // surrogate, and then forward match the trail surrogate.
350 match = NegativeLookaroundAgainstReadDirectionAndMatch(
351 compiler, lead_surrogates, trail_surrogates, on_success, false, flags);
352 }
353 result->AddAlternative(GuardedAlternative(match));
354 }
355
UnanchoredAdvance(RegExpCompiler * compiler,RegExpNode * on_success)356 RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
357 RegExpNode* on_success) {
358 // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
359 DCHECK(!compiler->read_backward());
360 Zone* zone = compiler->zone();
361 // Advance any character. If the character happens to be a lead surrogate and
362 // we advanced into the middle of a surrogate pair, it will work out, as
363 // nothing will match from there. We will have to advance again, consuming
364 // the associated trail surrogate.
365 ZoneList<CharacterRange>* range = CharacterRange::List(
366 zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
367 JSRegExp::Flags default_flags = JSRegExp::Flags();
368 return TextNode::CreateForCharacterRanges(zone, range, false, on_success,
369 default_flags);
370 }
371
AddUnicodeCaseEquivalents(ZoneList<CharacterRange> * ranges,Zone * zone)372 void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) {
373 #ifdef V8_INTL_SUPPORT
374 DCHECK(CharacterRange::IsCanonical(ranges));
375
376 // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver.
377 // See also https://crbug.com/v8/6727.
378 // TODO(jgruber): This only covers the special case of the {0,0x10FFFF} range,
379 // which we use frequently internally. But large ranges can also easily be
380 // created by the user. We might want to have a more general caching mechanism
381 // for such ranges.
382 if (ranges->length() == 1 && ranges->at(0).IsEverything(kNonBmpEnd)) return;
383
384 // Use ICU to compute the case fold closure over the ranges.
385 icu::UnicodeSet set;
386 for (int i = 0; i < ranges->length(); i++) {
387 set.add(ranges->at(i).from(), ranges->at(i).to());
388 }
389 // Clear the ranges list without freeing the backing store.
390 ranges->Rewind(0);
391 set.closeOver(USET_CASE_INSENSITIVE);
392 // Full case mapping map single characters to multiple characters.
393 // Those are represented as strings in the set. Remove them so that
394 // we end up with only simple and common case mappings.
395 set.removeAllStrings();
396 for (int i = 0; i < set.getRangeCount(); i++) {
397 ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
398 zone);
399 }
400 // No errors and everything we collected have been ranges.
401 CharacterRange::Canonicalize(ranges);
402 #endif // V8_INTL_SUPPORT
403 }
404
405 } // namespace
406
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)407 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
408 RegExpNode* on_success) {
409 set_.Canonicalize();
410 Zone* zone = compiler->zone();
411 ZoneList<CharacterRange>* ranges = this->ranges(zone);
412 if (NeedsUnicodeCaseEquivalents(flags_)) {
413 AddUnicodeCaseEquivalents(ranges, zone);
414 }
415 if (IsUnicode(flags_) && !compiler->one_byte() &&
416 !contains_split_surrogate()) {
417 if (is_negated()) {
418 ZoneList<CharacterRange>* negated =
419 zone->New<ZoneList<CharacterRange>>(2, zone);
420 CharacterRange::Negate(ranges, negated, zone);
421 ranges = negated;
422 }
423 if (ranges->length() == 0) {
424 JSRegExp::Flags default_flags;
425 RegExpCharacterClass* fail =
426 zone->New<RegExpCharacterClass>(zone, ranges, default_flags);
427 return zone->New<TextNode>(fail, compiler->read_backward(), on_success);
428 }
429 if (standard_type() == '*') {
430 return UnanchoredAdvance(compiler, on_success);
431 } else {
432 ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
433 UnicodeRangeSplitter splitter(ranges);
434 AddBmpCharacters(compiler, result, on_success, &splitter, flags_);
435 AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter, flags_);
436 AddLoneLeadSurrogates(compiler, result, on_success, &splitter, flags_);
437 AddLoneTrailSurrogates(compiler, result, on_success, &splitter, flags_);
438 static constexpr int kMaxRangesToInline = 32; // Arbitrary.
439 if (ranges->length() > kMaxRangesToInline) result->SetDoNotInline();
440 return result;
441 }
442 } else {
443 return zone->New<TextNode>(this, compiler->read_backward(), on_success);
444 }
445 }
446
CompareFirstChar(RegExpTree * const * a,RegExpTree * const * b)447 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
448 RegExpAtom* atom1 = (*a)->AsAtom();
449 RegExpAtom* atom2 = (*b)->AsAtom();
450 uc16 character1 = atom1->data().at(0);
451 uc16 character2 = atom2->data().at(0);
452 if (character1 < character2) return -1;
453 if (character1 > character2) return 1;
454 return 0;
455 }
456
457 #ifdef V8_INTL_SUPPORT
458
459 // Case Insensitve comparesion
CompareFirstCharCaseInsensitve(RegExpTree * const * a,RegExpTree * const * b)460 int CompareFirstCharCaseInsensitve(RegExpTree* const* a, RegExpTree* const* b) {
461 RegExpAtom* atom1 = (*a)->AsAtom();
462 RegExpAtom* atom2 = (*b)->AsAtom();
463 icu::UnicodeString character1(atom1->data().at(0));
464 return character1.caseCompare(atom2->data().at(0), U_FOLD_CASE_DEFAULT);
465 }
466
467 #else
468
Canonical(unibrow::Mapping<unibrow::Ecma262Canonicalize> * canonicalize,unibrow::uchar c)469 static unibrow::uchar Canonical(
470 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
471 unibrow::uchar c) {
472 unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
473 int length = canonicalize->get(c, '\0', chars);
474 DCHECK_LE(length, 1);
475 unibrow::uchar canonical = c;
476 if (length == 1) canonical = chars[0];
477 return canonical;
478 }
479
CompareFirstCharCaseIndependent(unibrow::Mapping<unibrow::Ecma262Canonicalize> * canonicalize,RegExpTree * const * a,RegExpTree * const * b)480 int CompareFirstCharCaseIndependent(
481 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
482 RegExpTree* const* a, RegExpTree* const* b) {
483 RegExpAtom* atom1 = (*a)->AsAtom();
484 RegExpAtom* atom2 = (*b)->AsAtom();
485 unibrow::uchar character1 = atom1->data().at(0);
486 unibrow::uchar character2 = atom2->data().at(0);
487 if (character1 == character2) return 0;
488 if (character1 >= 'a' || character2 >= 'a') {
489 character1 = Canonical(canonicalize, character1);
490 character2 = Canonical(canonicalize, character2);
491 }
492 return static_cast<int>(character1) - static_cast<int>(character2);
493 }
494 #endif // V8_INTL_SUPPORT
495
496 // We can stable sort runs of atoms, since the order does not matter if they
497 // start with different characters.
498 // Returns true if any consecutive atoms were found.
SortConsecutiveAtoms(RegExpCompiler * compiler)499 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
500 ZoneList<RegExpTree*>* alternatives = this->alternatives();
501 int length = alternatives->length();
502 bool found_consecutive_atoms = false;
503 for (int i = 0; i < length; i++) {
504 while (i < length) {
505 RegExpTree* alternative = alternatives->at(i);
506 if (alternative->IsAtom()) break;
507 i++;
508 }
509 // i is length or it is the index of an atom.
510 if (i == length) break;
511 int first_atom = i;
512 JSRegExp::Flags flags = alternatives->at(i)->AsAtom()->flags();
513 i++;
514 while (i < length) {
515 RegExpTree* alternative = alternatives->at(i);
516 if (!alternative->IsAtom()) break;
517 if (alternative->AsAtom()->flags() != flags) break;
518 i++;
519 }
520 // Sort atoms to get ones with common prefixes together.
521 // This step is more tricky if we are in a case-independent regexp,
522 // because it would change /is|I/ to /I|is/, and order matters when
523 // the regexp parts don't match only disjoint starting points. To fix
524 // this we have a version of CompareFirstChar that uses case-
525 // independent character classes for comparison.
526 DCHECK_LT(first_atom, alternatives->length());
527 DCHECK_LE(i, alternatives->length());
528 DCHECK_LE(first_atom, i);
529 if (IgnoreCase(flags)) {
530 #ifdef V8_INTL_SUPPORT
531 alternatives->StableSort(CompareFirstCharCaseInsensitve, first_atom,
532 i - first_atom);
533 #else
534 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
535 compiler->isolate()->regexp_macro_assembler_canonicalize();
536 auto compare_closure = [canonicalize](RegExpTree* const* a,
537 RegExpTree* const* b) {
538 return CompareFirstCharCaseIndependent(canonicalize, a, b);
539 };
540 alternatives->StableSort(compare_closure, first_atom, i - first_atom);
541 #endif // V8_INTL_SUPPORT
542 } else {
543 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
544 }
545 if (i - first_atom > 1) found_consecutive_atoms = true;
546 }
547 return found_consecutive_atoms;
548 }
549
550 // Optimizes ab|ac|az to a(?:b|c|d).
RationalizeConsecutiveAtoms(RegExpCompiler * compiler)551 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
552 Zone* zone = compiler->zone();
553 ZoneList<RegExpTree*>* alternatives = this->alternatives();
554 int length = alternatives->length();
555
556 int write_posn = 0;
557 int i = 0;
558 while (i < length) {
559 RegExpTree* alternative = alternatives->at(i);
560 if (!alternative->IsAtom()) {
561 alternatives->at(write_posn++) = alternatives->at(i);
562 i++;
563 continue;
564 }
565 RegExpAtom* const atom = alternative->AsAtom();
566 JSRegExp::Flags flags = atom->flags();
567 #ifdef V8_INTL_SUPPORT
568 icu::UnicodeString common_prefix(atom->data().at(0));
569 #else
570 unibrow::uchar common_prefix = atom->data().at(0);
571 #endif // V8_INTL_SUPPORT
572 int first_with_prefix = i;
573 int prefix_length = atom->length();
574 i++;
575 while (i < length) {
576 alternative = alternatives->at(i);
577 if (!alternative->IsAtom()) break;
578 RegExpAtom* const atom = alternative->AsAtom();
579 if (atom->flags() != flags) break;
580 #ifdef V8_INTL_SUPPORT
581 icu::UnicodeString new_prefix(atom->data().at(0));
582 if (new_prefix != common_prefix) {
583 if (!IgnoreCase(flags)) break;
584 if (common_prefix.caseCompare(new_prefix, U_FOLD_CASE_DEFAULT) != 0)
585 break;
586 }
587 #else
588 unibrow::uchar new_prefix = atom->data().at(0);
589 if (new_prefix != common_prefix) {
590 if (!IgnoreCase(flags)) break;
591 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
592 compiler->isolate()->regexp_macro_assembler_canonicalize();
593 new_prefix = Canonical(canonicalize, new_prefix);
594 common_prefix = Canonical(canonicalize, common_prefix);
595 if (new_prefix != common_prefix) break;
596 }
597 #endif // V8_INTL_SUPPORT
598 prefix_length = Min(prefix_length, atom->length());
599 i++;
600 }
601 if (i > first_with_prefix + 2) {
602 // Found worthwhile run of alternatives with common prefix of at least one
603 // character. The sorting function above did not sort on more than one
604 // character for reasons of correctness, but there may still be a longer
605 // common prefix if the terms were similar or presorted in the input.
606 // Find out how long the common prefix is.
607 int run_length = i - first_with_prefix;
608 RegExpAtom* const atom = alternatives->at(first_with_prefix)->AsAtom();
609 for (int j = 1; j < run_length && prefix_length > 1; j++) {
610 RegExpAtom* old_atom =
611 alternatives->at(j + first_with_prefix)->AsAtom();
612 for (int k = 1; k < prefix_length; k++) {
613 if (atom->data().at(k) != old_atom->data().at(k)) {
614 prefix_length = k;
615 break;
616 }
617 }
618 }
619 RegExpAtom* prefix = zone->New<RegExpAtom>(
620 atom->data().SubVector(0, prefix_length), flags);
621 ZoneList<RegExpTree*>* pair = zone->New<ZoneList<RegExpTree*>>(2, zone);
622 pair->Add(prefix, zone);
623 ZoneList<RegExpTree*>* suffixes =
624 zone->New<ZoneList<RegExpTree*>>(run_length, zone);
625 for (int j = 0; j < run_length; j++) {
626 RegExpAtom* old_atom =
627 alternatives->at(j + first_with_prefix)->AsAtom();
628 int len = old_atom->length();
629 if (len == prefix_length) {
630 suffixes->Add(zone->New<RegExpEmpty>(), zone);
631 } else {
632 RegExpTree* suffix = zone->New<RegExpAtom>(
633 old_atom->data().SubVector(prefix_length, old_atom->length()),
634 flags);
635 suffixes->Add(suffix, zone);
636 }
637 }
638 pair->Add(zone->New<RegExpDisjunction>(suffixes), zone);
639 alternatives->at(write_posn++) = zone->New<RegExpAlternative>(pair);
640 } else {
641 // Just copy any non-worthwhile alternatives.
642 for (int j = first_with_prefix; j < i; j++) {
643 alternatives->at(write_posn++) = alternatives->at(j);
644 }
645 }
646 }
647 alternatives->Rewind(write_posn); // Trim end of array.
648 }
649
650 // Optimizes b|c|z to [bcz].
FixSingleCharacterDisjunctions(RegExpCompiler * compiler)651 void RegExpDisjunction::FixSingleCharacterDisjunctions(
652 RegExpCompiler* compiler) {
653 Zone* zone = compiler->zone();
654 ZoneList<RegExpTree*>* alternatives = this->alternatives();
655 int length = alternatives->length();
656
657 int write_posn = 0;
658 int i = 0;
659 while (i < length) {
660 RegExpTree* alternative = alternatives->at(i);
661 if (!alternative->IsAtom()) {
662 alternatives->at(write_posn++) = alternatives->at(i);
663 i++;
664 continue;
665 }
666 RegExpAtom* const atom = alternative->AsAtom();
667 if (atom->length() != 1) {
668 alternatives->at(write_posn++) = alternatives->at(i);
669 i++;
670 continue;
671 }
672 JSRegExp::Flags flags = atom->flags();
673 DCHECK_IMPLIES(IsUnicode(flags),
674 !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
675 bool contains_trail_surrogate =
676 unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
677 int first_in_run = i;
678 i++;
679 // Find a run of single-character atom alternatives that have identical
680 // flags (case independence and unicode-ness).
681 while (i < length) {
682 alternative = alternatives->at(i);
683 if (!alternative->IsAtom()) break;
684 RegExpAtom* const atom = alternative->AsAtom();
685 if (atom->length() != 1) break;
686 if (atom->flags() != flags) break;
687 DCHECK_IMPLIES(IsUnicode(flags),
688 !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
689 contains_trail_surrogate |=
690 unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
691 i++;
692 }
693 if (i > first_in_run + 1) {
694 // Found non-trivial run of single-character alternatives.
695 int run_length = i - first_in_run;
696 ZoneList<CharacterRange>* ranges =
697 zone->New<ZoneList<CharacterRange>>(2, zone);
698 for (int j = 0; j < run_length; j++) {
699 RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
700 DCHECK_EQ(old_atom->length(), 1);
701 ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
702 }
703 RegExpCharacterClass::CharacterClassFlags character_class_flags;
704 if (IsUnicode(flags) && contains_trail_surrogate) {
705 character_class_flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE;
706 }
707 alternatives->at(write_posn++) = zone->New<RegExpCharacterClass>(
708 zone, ranges, flags, character_class_flags);
709 } else {
710 // Just copy any trivial alternatives.
711 for (int j = first_in_run; j < i; j++) {
712 alternatives->at(write_posn++) = alternatives->at(j);
713 }
714 }
715 }
716 alternatives->Rewind(write_posn); // Trim end of array.
717 }
718
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)719 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
720 RegExpNode* on_success) {
721 ZoneList<RegExpTree*>* alternatives = this->alternatives();
722
723 if (alternatives->length() > 2) {
724 bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
725 if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
726 FixSingleCharacterDisjunctions(compiler);
727 if (alternatives->length() == 1) {
728 return alternatives->at(0)->ToNode(compiler, on_success);
729 }
730 }
731
732 int length = alternatives->length();
733
734 ChoiceNode* result =
735 compiler->zone()->New<ChoiceNode>(length, compiler->zone());
736 for (int i = 0; i < length; i++) {
737 GuardedAlternative alternative(
738 alternatives->at(i)->ToNode(compiler, on_success));
739 result->AddAlternative(alternative);
740 }
741 return result;
742 }
743
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)744 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
745 RegExpNode* on_success) {
746 return ToNode(min(), max(), is_greedy(), body(), compiler, on_success);
747 }
748
749 namespace {
750 // Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
751 // \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
BoundaryAssertionAsLookaround(RegExpCompiler * compiler,RegExpNode * on_success,RegExpAssertion::AssertionType type,JSRegExp::Flags flags)752 RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
753 RegExpNode* on_success,
754 RegExpAssertion::AssertionType type,
755 JSRegExp::Flags flags) {
756 DCHECK(NeedsUnicodeCaseEquivalents(flags));
757 Zone* zone = compiler->zone();
758 ZoneList<CharacterRange>* word_range =
759 zone->New<ZoneList<CharacterRange>>(2, zone);
760 CharacterRange::AddClassEscape('w', word_range, true, zone);
761 int stack_register = compiler->UnicodeLookaroundStackRegister();
762 int position_register = compiler->UnicodeLookaroundPositionRegister();
763 ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
764 // Add two choices. The (non-)boundary could start with a word or
765 // a non-word-character.
766 for (int i = 0; i < 2; i++) {
767 bool lookbehind_for_word = i == 0;
768 bool lookahead_for_word =
769 (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word;
770 // Look to the left.
771 RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
772 stack_register, position_register);
773 RegExpNode* backward = TextNode::CreateForCharacterRanges(
774 zone, word_range, true, lookbehind.on_match_success(), flags);
775 // Look to the right.
776 RegExpLookaround::Builder lookahead(lookahead_for_word,
777 lookbehind.ForMatch(backward),
778 stack_register, position_register);
779 RegExpNode* forward = TextNode::CreateForCharacterRanges(
780 zone, word_range, false, lookahead.on_match_success(), flags);
781 result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
782 }
783 return result;
784 }
785 } // anonymous namespace
786
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)787 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
788 RegExpNode* on_success) {
789 NodeInfo info;
790 Zone* zone = compiler->zone();
791
792 switch (assertion_type()) {
793 case START_OF_LINE:
794 return AssertionNode::AfterNewline(on_success);
795 case START_OF_INPUT:
796 return AssertionNode::AtStart(on_success);
797 case BOUNDARY:
798 return NeedsUnicodeCaseEquivalents(flags_)
799 ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY,
800 flags_)
801 : AssertionNode::AtBoundary(on_success);
802 case NON_BOUNDARY:
803 return NeedsUnicodeCaseEquivalents(flags_)
804 ? BoundaryAssertionAsLookaround(compiler, on_success,
805 NON_BOUNDARY, flags_)
806 : AssertionNode::AtNonBoundary(on_success);
807 case END_OF_INPUT:
808 return AssertionNode::AtEnd(on_success);
809 case END_OF_LINE: {
810 // Compile $ in multiline regexps as an alternation with a positive
811 // lookahead in one side and an end-of-input on the other side.
812 // We need two registers for the lookahead.
813 int stack_pointer_register = compiler->AllocateRegister();
814 int position_register = compiler->AllocateRegister();
815 // The ChoiceNode to distinguish between a newline and end-of-input.
816 ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
817 // Create a newline atom.
818 ZoneList<CharacterRange>* newline_ranges =
819 zone->New<ZoneList<CharacterRange>>(3, zone);
820 CharacterRange::AddClassEscape('n', newline_ranges, false, zone);
821 JSRegExp::Flags default_flags = JSRegExp::Flags();
822 RegExpCharacterClass* newline_atom =
823 zone->New<RegExpCharacterClass>('n', default_flags);
824 TextNode* newline_matcher =
825 zone->New<TextNode>(newline_atom, false,
826 ActionNode::PositiveSubmatchSuccess(
827 stack_pointer_register, position_register,
828 0, // No captures inside.
829 -1, // Ignored if no captures.
830 on_success));
831 // Create an end-of-input matcher.
832 RegExpNode* end_of_line = ActionNode::BeginSubmatch(
833 stack_pointer_register, position_register, newline_matcher);
834 // Add the two alternatives to the ChoiceNode.
835 GuardedAlternative eol_alternative(end_of_line);
836 result->AddAlternative(eol_alternative);
837 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
838 result->AddAlternative(end_alternative);
839 return result;
840 }
841 default:
842 UNREACHABLE();
843 }
844 return on_success;
845 }
846
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)847 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
848 RegExpNode* on_success) {
849 return compiler->zone()->New<BackReferenceNode>(
850 RegExpCapture::StartRegister(index()),
851 RegExpCapture::EndRegister(index()), flags_, compiler->read_backward(),
852 on_success);
853 }
854
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)855 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
856 RegExpNode* on_success) {
857 return on_success;
858 }
859
Builder(bool is_positive,RegExpNode * on_success,int stack_pointer_register,int position_register,int capture_register_count,int capture_register_start)860 RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
861 int stack_pointer_register,
862 int position_register,
863 int capture_register_count,
864 int capture_register_start)
865 : is_positive_(is_positive),
866 on_success_(on_success),
867 stack_pointer_register_(stack_pointer_register),
868 position_register_(position_register) {
869 if (is_positive_) {
870 on_match_success_ = ActionNode::PositiveSubmatchSuccess(
871 stack_pointer_register, position_register, capture_register_count,
872 capture_register_start, on_success_);
873 } else {
874 Zone* zone = on_success_->zone();
875 on_match_success_ = zone->New<NegativeSubmatchSuccess>(
876 stack_pointer_register, position_register, capture_register_count,
877 capture_register_start, zone);
878 }
879 }
880
ForMatch(RegExpNode * match)881 RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
882 if (is_positive_) {
883 return ActionNode::BeginSubmatch(stack_pointer_register_,
884 position_register_, match);
885 } else {
886 Zone* zone = on_success_->zone();
887 // We use a ChoiceNode to represent the negative lookaround. The first
888 // alternative is the negative match. On success, the end node backtracks.
889 // On failure, the second alternative is tried and leads to success.
890 // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
891 // first exit when calculating quick checks.
892 ChoiceNode* choice_node = zone->New<NegativeLookaroundChoiceNode>(
893 GuardedAlternative(match), GuardedAlternative(on_success_), zone);
894 return ActionNode::BeginSubmatch(stack_pointer_register_,
895 position_register_, choice_node);
896 }
897 }
898
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)899 RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
900 RegExpNode* on_success) {
901 int stack_pointer_register = compiler->AllocateRegister();
902 int position_register = compiler->AllocateRegister();
903
904 const int registers_per_capture = 2;
905 const int register_of_first_capture = 2;
906 int register_count = capture_count_ * registers_per_capture;
907 int register_start =
908 register_of_first_capture + capture_from_ * registers_per_capture;
909
910 RegExpNode* result;
911 bool was_reading_backward = compiler->read_backward();
912 compiler->set_read_backward(type() == LOOKBEHIND);
913 Builder builder(is_positive(), on_success, stack_pointer_register,
914 position_register, register_count, register_start);
915 RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
916 result = builder.ForMatch(match);
917 compiler->set_read_backward(was_reading_backward);
918 return result;
919 }
920
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)921 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
922 RegExpNode* on_success) {
923 return ToNode(body(), index(), compiler, on_success);
924 }
925
ToNode(RegExpTree * body,int index,RegExpCompiler * compiler,RegExpNode * on_success)926 RegExpNode* RegExpCapture::ToNode(RegExpTree* body, int index,
927 RegExpCompiler* compiler,
928 RegExpNode* on_success) {
929 DCHECK_NOT_NULL(body);
930 int start_reg = RegExpCapture::StartRegister(index);
931 int end_reg = RegExpCapture::EndRegister(index);
932 if (compiler->read_backward()) std::swap(start_reg, end_reg);
933 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
934 RegExpNode* body_node = body->ToNode(compiler, store_end);
935 return ActionNode::StorePosition(start_reg, true, body_node);
936 }
937
938 namespace {
939
940 class AssertionSequenceRewriter final {
941 public:
942 // TODO(jgruber): Consider moving this to a separate AST tree rewriter pass
943 // instead of sprinkling rewrites into the AST->Node conversion process.
MaybeRewrite(ZoneList<RegExpTree * > * terms,Zone * zone)944 static void MaybeRewrite(ZoneList<RegExpTree*>* terms, Zone* zone) {
945 AssertionSequenceRewriter rewriter(terms, zone);
946
947 static constexpr int kNoIndex = -1;
948 int from = kNoIndex;
949
950 for (int i = 0; i < terms->length(); i++) {
951 RegExpTree* t = terms->at(i);
952 if (from == kNoIndex && t->IsAssertion()) {
953 from = i; // Start a sequence.
954 } else if (from != kNoIndex && !t->IsAssertion()) {
955 // Terminate and process the sequence.
956 if (i - from > 1) rewriter.Rewrite(from, i);
957 from = kNoIndex;
958 }
959 }
960
961 if (from != kNoIndex && terms->length() - from > 1) {
962 rewriter.Rewrite(from, terms->length());
963 }
964 }
965
966 // All assertions are zero width. A consecutive sequence of assertions is
967 // order-independent. There's two ways we can optimize here:
968 // 1. fold all identical assertions.
969 // 2. if any assertion combinations are known to fail (e.g. \b\B), the entire
970 // sequence fails.
Rewrite(int from,int to)971 void Rewrite(int from, int to) {
972 DCHECK_GT(to, from + 1);
973
974 // Bitfield of all seen assertions.
975 uint32_t seen_assertions = 0;
976 STATIC_ASSERT(RegExpAssertion::LAST_TYPE < kUInt32Size * kBitsPerByte);
977
978 // Flags must match for folding.
979 JSRegExp::Flags flags = terms_->at(from)->AsAssertion()->flags();
980 bool saw_mismatched_flags = false;
981
982 for (int i = from; i < to; i++) {
983 RegExpAssertion* t = terms_->at(i)->AsAssertion();
984 if (t->flags() != flags) saw_mismatched_flags = true;
985 const uint32_t bit = 1 << t->assertion_type();
986
987 if ((seen_assertions & bit) && !saw_mismatched_flags) {
988 // Fold duplicates.
989 terms_->Set(i, zone_->New<RegExpEmpty>());
990 }
991
992 seen_assertions |= bit;
993 }
994
995 // Collapse failures.
996 const uint32_t always_fails_mask =
997 1 << RegExpAssertion::BOUNDARY | 1 << RegExpAssertion::NON_BOUNDARY;
998 if ((seen_assertions & always_fails_mask) == always_fails_mask) {
999 ReplaceSequenceWithFailure(from, to);
1000 }
1001 }
1002
ReplaceSequenceWithFailure(int from,int to)1003 void ReplaceSequenceWithFailure(int from, int to) {
1004 // Replace the entire sequence with a single node that always fails.
1005 // TODO(jgruber): Consider adding an explicit Fail kind. Until then, the
1006 // negated '*' (everything) range serves the purpose.
1007 ZoneList<CharacterRange>* ranges =
1008 zone_->New<ZoneList<CharacterRange>>(0, zone_);
1009 RegExpCharacterClass* cc =
1010 zone_->New<RegExpCharacterClass>(zone_, ranges, JSRegExp::Flags());
1011 terms_->Set(from, cc);
1012
1013 // Zero out the rest.
1014 RegExpEmpty* empty = zone_->New<RegExpEmpty>();
1015 for (int i = from + 1; i < to; i++) terms_->Set(i, empty);
1016 }
1017
1018 private:
AssertionSequenceRewriter(ZoneList<RegExpTree * > * terms,Zone * zone)1019 AssertionSequenceRewriter(ZoneList<RegExpTree*>* terms, Zone* zone)
1020 : zone_(zone), terms_(terms) {}
1021
1022 Zone* zone_;
1023 ZoneList<RegExpTree*>* terms_;
1024 };
1025
1026 } // namespace
1027
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)1028 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
1029 RegExpNode* on_success) {
1030 ZoneList<RegExpTree*>* children = nodes();
1031
1032 AssertionSequenceRewriter::MaybeRewrite(children, compiler->zone());
1033
1034 RegExpNode* current = on_success;
1035 if (compiler->read_backward()) {
1036 for (int i = 0; i < children->length(); i++) {
1037 current = children->at(i)->ToNode(compiler, current);
1038 }
1039 } else {
1040 for (int i = children->length() - 1; i >= 0; i--) {
1041 current = children->at(i)->ToNode(compiler, current);
1042 }
1043 }
1044 return current;
1045 }
1046
AddClass(const int * elmv,int elmc,ZoneList<CharacterRange> * ranges,Zone * zone)1047 static void AddClass(const int* elmv, int elmc,
1048 ZoneList<CharacterRange>* ranges, Zone* zone) {
1049 elmc--;
1050 DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
1051 for (int i = 0; i < elmc; i += 2) {
1052 DCHECK(elmv[i] < elmv[i + 1]);
1053 ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
1054 }
1055 }
1056
AddClassNegated(const int * elmv,int elmc,ZoneList<CharacterRange> * ranges,Zone * zone)1057 static void AddClassNegated(const int* elmv, int elmc,
1058 ZoneList<CharacterRange>* ranges, Zone* zone) {
1059 elmc--;
1060 DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
1061 DCHECK_NE(0x0000, elmv[0]);
1062 DCHECK_NE(String::kMaxCodePoint, elmv[elmc - 1]);
1063 uc16 last = 0x0000;
1064 for (int i = 0; i < elmc; i += 2) {
1065 DCHECK(last <= elmv[i] - 1);
1066 DCHECK(elmv[i] < elmv[i + 1]);
1067 ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
1068 last = elmv[i + 1];
1069 }
1070 ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
1071 }
1072
AddClassEscape(char type,ZoneList<CharacterRange> * ranges,bool add_unicode_case_equivalents,Zone * zone)1073 void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
1074 bool add_unicode_case_equivalents,
1075 Zone* zone) {
1076 if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) {
1077 // See #sec-runtime-semantics-wordcharacters-abstract-operation
1078 // In case of unicode and ignore_case, we need to create the closure over
1079 // case equivalent characters before negating.
1080 ZoneList<CharacterRange>* new_ranges =
1081 zone->New<ZoneList<CharacterRange>>(2, zone);
1082 AddClass(kWordRanges, kWordRangeCount, new_ranges, zone);
1083 AddUnicodeCaseEquivalents(new_ranges, zone);
1084 if (type == 'W') {
1085 ZoneList<CharacterRange>* negated =
1086 zone->New<ZoneList<CharacterRange>>(2, zone);
1087 CharacterRange::Negate(new_ranges, negated, zone);
1088 new_ranges = negated;
1089 }
1090 ranges->AddAll(*new_ranges, zone);
1091 return;
1092 }
1093 AddClassEscape(type, ranges, zone);
1094 }
1095
AddClassEscape(char type,ZoneList<CharacterRange> * ranges,Zone * zone)1096 void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
1097 Zone* zone) {
1098 switch (type) {
1099 case 's':
1100 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
1101 break;
1102 case 'S':
1103 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
1104 break;
1105 case 'w':
1106 AddClass(kWordRanges, kWordRangeCount, ranges, zone);
1107 break;
1108 case 'W':
1109 AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
1110 break;
1111 case 'd':
1112 AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
1113 break;
1114 case 'D':
1115 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
1116 break;
1117 case '.':
1118 AddClassNegated(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges,
1119 zone);
1120 break;
1121 // This is not a character range as defined by the spec but a
1122 // convenient shorthand for a character class that matches any
1123 // character.
1124 case '*':
1125 ranges->Add(CharacterRange::Everything(), zone);
1126 break;
1127 // This is the set of characters matched by the $ and ^ symbols
1128 // in multiline mode.
1129 case 'n':
1130 AddClass(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges, zone);
1131 break;
1132 default:
1133 UNREACHABLE();
1134 }
1135 }
1136
GetWordBounds()1137 Vector<const int> CharacterRange::GetWordBounds() {
1138 return Vector<const int>(kWordRanges, kWordRangeCount - 1);
1139 }
1140
1141 // static
AddCaseEquivalents(Isolate * isolate,Zone * zone,ZoneList<CharacterRange> * ranges,bool is_one_byte)1142 void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
1143 ZoneList<CharacterRange>* ranges,
1144 bool is_one_byte) {
1145 CharacterRange::Canonicalize(ranges);
1146 int range_count = ranges->length();
1147 #ifdef V8_INTL_SUPPORT
1148 icu::UnicodeSet others;
1149 for (int i = 0; i < range_count; i++) {
1150 CharacterRange range = ranges->at(i);
1151 uc32 from = range.from();
1152 if (from > String::kMaxUtf16CodeUnit) continue;
1153 uc32 to = Min(range.to(), String::kMaxUtf16CodeUnitU);
1154 // Nothing to be done for surrogates.
1155 if (from >= kLeadSurrogateStart && to <= kTrailSurrogateEnd) continue;
1156 if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
1157 if (from > String::kMaxOneByteCharCode) continue;
1158 if (to > String::kMaxOneByteCharCode) to = String::kMaxOneByteCharCode;
1159 }
1160 others.add(from, to);
1161 }
1162
1163 // Compute the set of additional characters that should be added,
1164 // using UnicodeSet::closeOver. ECMA 262 defines slightly different
1165 // case-folding rules than Unicode, so some characters that are
1166 // added by closeOver do not match anything other than themselves in
1167 // JS. For example, 'ſ' (U+017F LATIN SMALL LETTER LONG S) is the
1168 // same case-insensitive character as 's' or 'S' according to
1169 // Unicode, but does not match any other character in JS. To handle
1170 // this case, we add such characters to the IgnoreSet and filter
1171 // them out. We filter twice: once before calling closeOver (to
1172 // prevent 'ſ' from adding 's'), and once after calling closeOver
1173 // (to prevent 's' from adding 'ſ'). See regexp/special-case.h for
1174 // more information.
1175 icu::UnicodeSet already_added(others);
1176 others.removeAll(RegExpCaseFolding::IgnoreSet());
1177 others.closeOver(USET_CASE_INSENSITIVE);
1178 others.removeAll(RegExpCaseFolding::IgnoreSet());
1179 others.removeAll(already_added);
1180
1181 // Add others to the ranges
1182 for (int32_t i = 0; i < others.getRangeCount(); i++) {
1183 UChar32 from = others.getRangeStart(i);
1184 UChar32 to = others.getRangeEnd(i);
1185 if (from == to) {
1186 ranges->Add(CharacterRange::Singleton(from), zone);
1187 } else {
1188 ranges->Add(CharacterRange::Range(from, to), zone);
1189 }
1190 }
1191 #else
1192 for (int i = 0; i < range_count; i++) {
1193 CharacterRange range = ranges->at(i);
1194 uc32 bottom = range.from();
1195 if (bottom > String::kMaxUtf16CodeUnit) continue;
1196 uc32 top = Min(range.to(), String::kMaxUtf16CodeUnitU);
1197 // Nothing to be done for surrogates.
1198 if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue;
1199 if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
1200 if (bottom > String::kMaxOneByteCharCode) continue;
1201 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
1202 }
1203 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1204 if (top == bottom) {
1205 // If this is a singleton we just expand the one character.
1206 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
1207 for (int i = 0; i < length; i++) {
1208 uc32 chr = chars[i];
1209 if (chr != bottom) {
1210 ranges->Add(CharacterRange::Singleton(chars[i]), zone);
1211 }
1212 }
1213 } else {
1214 // If this is a range we expand the characters block by block, expanding
1215 // contiguous subranges (blocks) one at a time. The approach is as
1216 // follows. For a given start character we look up the remainder of the
1217 // block that contains it (represented by the end point), for instance we
1218 // find 'z' if the character is 'c'. A block is characterized by the
1219 // property that all characters uncanonicalize in the same way, except
1220 // that each entry in the result is incremented by the distance from the
1221 // first element. So a-z is a block because 'a' uncanonicalizes to ['a',
1222 // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once
1223 // we've found the end point we look up its uncanonicalization and
1224 // produce a range for each element. For instance for [c-f] we look up
1225 // ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if
1226 // it is not already contained in the input, so [c-f] will be skipped but
1227 // [C-F] will be added. If this range is not completely contained in a
1228 // block we do this for all the blocks covered by the range (handling
1229 // characters that is not in a block as a "singleton block").
1230 unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1231 uc32 pos = bottom;
1232 while (pos <= top) {
1233 int length =
1234 isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
1235 uc32 block_end;
1236 if (length == 0) {
1237 block_end = pos;
1238 } else {
1239 DCHECK_EQ(1, length);
1240 block_end = equivalents[0];
1241 }
1242 int end = (block_end > top) ? top : block_end;
1243 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
1244 equivalents);
1245 for (int i = 0; i < length; i++) {
1246 uc32 c = equivalents[i];
1247 uc32 range_from = c - (block_end - pos);
1248 uc32 range_to = c - (block_end - end);
1249 if (!(bottom <= range_from && range_to <= top)) {
1250 ranges->Add(CharacterRange::Range(range_from, range_to), zone);
1251 }
1252 }
1253 pos = end + 1;
1254 }
1255 }
1256 }
1257 #endif // V8_INTL_SUPPORT
1258 }
1259
IsCanonical(ZoneList<CharacterRange> * ranges)1260 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
1261 DCHECK_NOT_NULL(ranges);
1262 int n = ranges->length();
1263 if (n <= 1) return true;
1264 uc32 max = ranges->at(0).to();
1265 for (int i = 1; i < n; i++) {
1266 CharacterRange next_range = ranges->at(i);
1267 if (next_range.from() <= max + 1) return false;
1268 max = next_range.to();
1269 }
1270 return true;
1271 }
1272
ranges(Zone * zone)1273 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
1274 if (ranges_ == nullptr) {
1275 ranges_ = zone->New<ZoneList<CharacterRange>>(2, zone);
1276 CharacterRange::AddClassEscape(standard_set_type_, ranges_, false, zone);
1277 }
1278 return ranges_;
1279 }
1280
1281 // Move a number of elements in a zonelist to another position
1282 // in the same list. Handles overlapping source and target areas.
MoveRanges(ZoneList<CharacterRange> * list,int from,int to,int count)1283 static void MoveRanges(ZoneList<CharacterRange>* list, int from, int to,
1284 int count) {
1285 // Ranges are potentially overlapping.
1286 if (from < to) {
1287 for (int i = count - 1; i >= 0; i--) {
1288 list->at(to + i) = list->at(from + i);
1289 }
1290 } else {
1291 for (int i = 0; i < count; i++) {
1292 list->at(to + i) = list->at(from + i);
1293 }
1294 }
1295 }
1296
InsertRangeInCanonicalList(ZoneList<CharacterRange> * list,int count,CharacterRange insert)1297 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, int count,
1298 CharacterRange insert) {
1299 // Inserts a range into list[0..count[, which must be sorted
1300 // by from value and non-overlapping and non-adjacent, using at most
1301 // list[0..count] for the result. Returns the number of resulting
1302 // canonicalized ranges. Inserting a range may collapse existing ranges into
1303 // fewer ranges, so the return value can be anything in the range 1..count+1.
1304 uc32 from = insert.from();
1305 uc32 to = insert.to();
1306 int start_pos = 0;
1307 int end_pos = count;
1308 for (int i = count - 1; i >= 0; i--) {
1309 CharacterRange current = list->at(i);
1310 if (current.from() > to + 1) {
1311 end_pos = i;
1312 } else if (current.to() + 1 < from) {
1313 start_pos = i + 1;
1314 break;
1315 }
1316 }
1317
1318 // Inserted range overlaps, or is adjacent to, ranges at positions
1319 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
1320 // not affected by the insertion.
1321 // If start_pos == end_pos, the range must be inserted before start_pos.
1322 // if start_pos < end_pos, the entire range from start_pos to end_pos
1323 // must be merged with the insert range.
1324
1325 if (start_pos == end_pos) {
1326 // Insert between existing ranges at position start_pos.
1327 if (start_pos < count) {
1328 MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
1329 }
1330 list->at(start_pos) = insert;
1331 return count + 1;
1332 }
1333 if (start_pos + 1 == end_pos) {
1334 // Replace single existing range at position start_pos.
1335 CharacterRange to_replace = list->at(start_pos);
1336 int new_from = Min(to_replace.from(), from);
1337 int new_to = Max(to_replace.to(), to);
1338 list->at(start_pos) = CharacterRange::Range(new_from, new_to);
1339 return count;
1340 }
1341 // Replace a number of existing ranges from start_pos to end_pos - 1.
1342 // Move the remaining ranges down.
1343
1344 int new_from = Min(list->at(start_pos).from(), from);
1345 int new_to = Max(list->at(end_pos - 1).to(), to);
1346 if (end_pos < count) {
1347 MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
1348 }
1349 list->at(start_pos) = CharacterRange::Range(new_from, new_to);
1350 return count - (end_pos - start_pos) + 1;
1351 }
1352
Canonicalize()1353 void CharacterSet::Canonicalize() {
1354 // Special/default classes are always considered canonical. The result
1355 // of calling ranges() will be sorted.
1356 if (ranges_ == nullptr) return;
1357 CharacterRange::Canonicalize(ranges_);
1358 }
1359
Canonicalize(ZoneList<CharacterRange> * character_ranges)1360 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
1361 if (character_ranges->length() <= 1) return;
1362 // Check whether ranges are already canonical (increasing, non-overlapping,
1363 // non-adjacent).
1364 int n = character_ranges->length();
1365 uc32 max = character_ranges->at(0).to();
1366 int i = 1;
1367 while (i < n) {
1368 CharacterRange current = character_ranges->at(i);
1369 if (current.from() <= max + 1) {
1370 break;
1371 }
1372 max = current.to();
1373 i++;
1374 }
1375 // Canonical until the i'th range. If that's all of them, we are done.
1376 if (i == n) return;
1377
1378 // The ranges at index i and forward are not canonicalized. Make them so by
1379 // doing the equivalent of insertion sort (inserting each into the previous
1380 // list, in order).
1381 // Notice that inserting a range can reduce the number of ranges in the
1382 // result due to combining of adjacent and overlapping ranges.
1383 int read = i; // Range to insert.
1384 int num_canonical = i; // Length of canonicalized part of list.
1385 do {
1386 num_canonical = InsertRangeInCanonicalList(character_ranges, num_canonical,
1387 character_ranges->at(read));
1388 read++;
1389 } while (read < n);
1390 character_ranges->Rewind(num_canonical);
1391
1392 DCHECK(CharacterRange::IsCanonical(character_ranges));
1393 }
1394
Negate(ZoneList<CharacterRange> * ranges,ZoneList<CharacterRange> * negated_ranges,Zone * zone)1395 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
1396 ZoneList<CharacterRange>* negated_ranges,
1397 Zone* zone) {
1398 DCHECK(CharacterRange::IsCanonical(ranges));
1399 DCHECK_EQ(0, negated_ranges->length());
1400 int range_count = ranges->length();
1401 uc32 from = 0;
1402 int i = 0;
1403 if (range_count > 0 && ranges->at(0).from() == 0) {
1404 from = ranges->at(0).to() + 1;
1405 i = 1;
1406 }
1407 while (i < range_count) {
1408 CharacterRange range = ranges->at(i);
1409 negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
1410 from = range.to() + 1;
1411 i++;
1412 }
1413 if (from < String::kMaxCodePoint) {
1414 negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint),
1415 zone);
1416 }
1417 }
1418
1419 // Scoped object to keep track of how much we unroll quantifier loops in the
1420 // regexp graph generator.
1421 class RegExpExpansionLimiter {
1422 public:
1423 static const int kMaxExpansionFactor = 6;
RegExpExpansionLimiter(RegExpCompiler * compiler,int factor)1424 RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
1425 : compiler_(compiler),
1426 saved_expansion_factor_(compiler->current_expansion_factor()),
1427 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
1428 DCHECK_LT(0, factor);
1429 if (ok_to_expand_) {
1430 if (factor > kMaxExpansionFactor) {
1431 // Avoid integer overflow of the current expansion factor.
1432 ok_to_expand_ = false;
1433 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
1434 } else {
1435 int new_factor = saved_expansion_factor_ * factor;
1436 ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
1437 compiler->set_current_expansion_factor(new_factor);
1438 }
1439 }
1440 }
1441
~RegExpExpansionLimiter()1442 ~RegExpExpansionLimiter() {
1443 compiler_->set_current_expansion_factor(saved_expansion_factor_);
1444 }
1445
ok_to_expand()1446 bool ok_to_expand() { return ok_to_expand_; }
1447
1448 private:
1449 RegExpCompiler* compiler_;
1450 int saved_expansion_factor_;
1451 bool ok_to_expand_;
1452
1453 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
1454 };
1455
ToNode(int min,int max,bool is_greedy,RegExpTree * body,RegExpCompiler * compiler,RegExpNode * on_success,bool not_at_start)1456 RegExpNode* RegExpQuantifier::ToNode(int min, int max, bool is_greedy,
1457 RegExpTree* body, RegExpCompiler* compiler,
1458 RegExpNode* on_success,
1459 bool not_at_start) {
1460 // x{f, t} becomes this:
1461 //
1462 // (r++)<-.
1463 // | `
1464 // | (x)
1465 // v ^
1466 // (r=0)-->(?)---/ [if r < t]
1467 // |
1468 // [if r >= f] \----> ...
1469 //
1470
1471 // 15.10.2.5 RepeatMatcher algorithm.
1472 // The parser has already eliminated the case where max is 0. In the case
1473 // where max_match is zero the parser has removed the quantifier if min was
1474 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
1475
1476 // If we know that we cannot match zero length then things are a little
1477 // simpler since we don't need to make the special zero length match check
1478 // from step 2.1. If the min and max are small we can unroll a little in
1479 // this case.
1480 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
1481 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
1482 if (max == 0) return on_success; // This can happen due to recursion.
1483 bool body_can_be_empty = (body->min_match() == 0);
1484 int body_start_reg = RegExpCompiler::kNoRegister;
1485 Interval capture_registers = body->CaptureRegisters();
1486 bool needs_capture_clearing = !capture_registers.is_empty();
1487 Zone* zone = compiler->zone();
1488
1489 if (body_can_be_empty) {
1490 body_start_reg = compiler->AllocateRegister();
1491 } else if (compiler->optimize() && !needs_capture_clearing) {
1492 // Only unroll if there are no captures and the body can't be
1493 // empty.
1494 {
1495 RegExpExpansionLimiter limiter(compiler, min + ((max != min) ? 1 : 0));
1496 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
1497 int new_max = (max == kInfinity) ? max : max - min;
1498 // Recurse once to get the loop or optional matches after the fixed
1499 // ones.
1500 RegExpNode* answer =
1501 ToNode(0, new_max, is_greedy, body, compiler, on_success, true);
1502 // Unroll the forced matches from 0 to min. This can cause chains of
1503 // TextNodes (which the parser does not generate). These should be
1504 // combined if it turns out they hinder good code generation.
1505 for (int i = 0; i < min; i++) {
1506 answer = body->ToNode(compiler, answer);
1507 }
1508 return answer;
1509 }
1510 }
1511 if (max <= kMaxUnrolledMaxMatches && min == 0) {
1512 DCHECK_LT(0, max); // Due to the 'if' above.
1513 RegExpExpansionLimiter limiter(compiler, max);
1514 if (limiter.ok_to_expand()) {
1515 // Unroll the optional matches up to max.
1516 RegExpNode* answer = on_success;
1517 for (int i = 0; i < max; i++) {
1518 ChoiceNode* alternation = zone->New<ChoiceNode>(2, zone);
1519 if (is_greedy) {
1520 alternation->AddAlternative(
1521 GuardedAlternative(body->ToNode(compiler, answer)));
1522 alternation->AddAlternative(GuardedAlternative(on_success));
1523 } else {
1524 alternation->AddAlternative(GuardedAlternative(on_success));
1525 alternation->AddAlternative(
1526 GuardedAlternative(body->ToNode(compiler, answer)));
1527 }
1528 answer = alternation;
1529 if (not_at_start && !compiler->read_backward()) {
1530 alternation->set_not_at_start();
1531 }
1532 }
1533 return answer;
1534 }
1535 }
1536 }
1537 bool has_min = min > 0;
1538 bool has_max = max < RegExpTree::kInfinity;
1539 bool needs_counter = has_min || has_max;
1540 int reg_ctr = needs_counter ? compiler->AllocateRegister()
1541 : RegExpCompiler::kNoRegister;
1542 LoopChoiceNode* center = zone->New<LoopChoiceNode>(
1543 body->min_match() == 0, compiler->read_backward(), min, zone);
1544 if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
1545 RegExpNode* loop_return =
1546 needs_counter ? static_cast<RegExpNode*>(
1547 ActionNode::IncrementRegister(reg_ctr, center))
1548 : static_cast<RegExpNode*>(center);
1549 if (body_can_be_empty) {
1550 // If the body can be empty we need to check if it was and then
1551 // backtrack.
1552 loop_return =
1553 ActionNode::EmptyMatchCheck(body_start_reg, reg_ctr, min, loop_return);
1554 }
1555 RegExpNode* body_node = body->ToNode(compiler, loop_return);
1556 if (body_can_be_empty) {
1557 // If the body can be empty we need to store the start position
1558 // so we can bail out if it was empty.
1559 body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
1560 }
1561 if (needs_capture_clearing) {
1562 // Before entering the body of this loop we need to clear captures.
1563 body_node = ActionNode::ClearCaptures(capture_registers, body_node);
1564 }
1565 GuardedAlternative body_alt(body_node);
1566 if (has_max) {
1567 Guard* body_guard = zone->New<Guard>(reg_ctr, Guard::LT, max);
1568 body_alt.AddGuard(body_guard, zone);
1569 }
1570 GuardedAlternative rest_alt(on_success);
1571 if (has_min) {
1572 Guard* rest_guard = compiler->zone()->New<Guard>(reg_ctr, Guard::GEQ, min);
1573 rest_alt.AddGuard(rest_guard, zone);
1574 }
1575 if (is_greedy) {
1576 center->AddLoopAlternative(body_alt);
1577 center->AddContinueAlternative(rest_alt);
1578 } else {
1579 center->AddContinueAlternative(rest_alt);
1580 center->AddLoopAlternative(body_alt);
1581 }
1582 if (needs_counter) {
1583 return ActionNode::SetRegisterForLoop(reg_ctr, 0, center);
1584 } else {
1585 return center;
1586 }
1587 }
1588
1589 } // namespace internal
1590 } // namespace v8
1591