1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/regexp/regexp-parser.h"
6
7 #include <vector>
8
9 #include "src/execution/isolate.h"
10 #include "src/heap/factory.h"
11 #include "src/objects/objects-inl.h"
12 #include "src/regexp/property-sequences.h"
13 #include "src/regexp/regexp-macro-assembler.h"
14 #include "src/regexp/regexp.h"
15 #include "src/strings/char-predicates-inl.h"
16 #include "src/utils/ostreams.h"
17 #include "src/utils/utils.h"
18 #include "src/zone/zone-list-inl.h"
19
20 #ifdef V8_INTL_SUPPORT
21 #include "unicode/uniset.h"
22 #endif // V8_INTL_SUPPORT
23
24 namespace v8 {
25 namespace internal {
26
RegExpParser(FlatStringReader * in,JSRegExp::Flags flags,Isolate * isolate,Zone * zone)27 RegExpParser::RegExpParser(FlatStringReader* in, JSRegExp::Flags flags,
28 Isolate* isolate, Zone* zone)
29 : isolate_(isolate),
30 zone_(zone),
31 captures_(nullptr),
32 named_captures_(nullptr),
33 named_back_references_(nullptr),
34 in_(in),
35 current_(kEndMarker),
36 top_level_flags_(flags),
37 next_pos_(0),
38 captures_started_(0),
39 capture_count_(0),
40 has_more_(true),
41 simple_(false),
42 contains_anchor_(false),
43 is_scanned_for_captures_(false),
44 has_named_captures_(false),
45 failed_(false) {
46 Advance();
47 }
48
49 template <bool update_position>
ReadNext()50 inline uc32 RegExpParser::ReadNext() {
51 int position = next_pos_;
52 uc32 c0 = in()->Get(position);
53 position++;
54 // Read the whole surrogate pair in case of unicode flag, if possible.
55 if (unicode() && position < in()->length() &&
56 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
57 uc16 c1 = in()->Get(position);
58 if (unibrow::Utf16::IsTrailSurrogate(c1)) {
59 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
60 position++;
61 }
62 }
63 if (update_position) next_pos_ = position;
64 return c0;
65 }
66
67
Next()68 uc32 RegExpParser::Next() {
69 if (has_next()) {
70 return ReadNext<false>();
71 } else {
72 return kEndMarker;
73 }
74 }
75
Advance()76 void RegExpParser::Advance() {
77 if (has_next()) {
78 StackLimitCheck check(isolate());
79 if (check.HasOverflowed()) {
80 if (FLAG_correctness_fuzzer_suppressions) {
81 FATAL("Aborting on stack overflow");
82 }
83 ReportError(RegExpError::kStackOverflow);
84 } else if (zone()->excess_allocation()) {
85 if (FLAG_correctness_fuzzer_suppressions) {
86 FATAL("Aborting on excess zone allocation");
87 }
88 ReportError(RegExpError::kTooLarge);
89 } else {
90 current_ = ReadNext<true>();
91 }
92 } else {
93 current_ = kEndMarker;
94 // Advance so that position() points to 1-after-the-last-character. This is
95 // important so that Reset() to this position works correctly.
96 next_pos_ = in()->length() + 1;
97 has_more_ = false;
98 }
99 }
100
101
Reset(int pos)102 void RegExpParser::Reset(int pos) {
103 next_pos_ = pos;
104 has_more_ = (pos < in()->length());
105 Advance();
106 }
107
Advance(int dist)108 void RegExpParser::Advance(int dist) {
109 next_pos_ += dist - 1;
110 Advance();
111 }
112
113
simple()114 bool RegExpParser::simple() { return simple_; }
115
IsSyntaxCharacterOrSlash(uc32 c)116 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
117 switch (c) {
118 case '^':
119 case '$':
120 case '\\':
121 case '.':
122 case '*':
123 case '+':
124 case '?':
125 case '(':
126 case ')':
127 case '[':
128 case ']':
129 case '{':
130 case '}':
131 case '|':
132 case '/':
133 return true;
134 default:
135 break;
136 }
137 return false;
138 }
139
ReportError(RegExpError error)140 RegExpTree* RegExpParser::ReportError(RegExpError error) {
141 if (failed_) return nullptr; // Do not overwrite any existing error.
142 failed_ = true;
143 error_ = error;
144 error_pos_ = position();
145 // Zip to the end to make sure no more input is read.
146 current_ = kEndMarker;
147 next_pos_ = in()->length();
148 return nullptr;
149 }
150
151 #define CHECK_FAILED /**/); \
152 if (failed_) return nullptr; \
153 ((void)0
154
155 // Pattern ::
156 // Disjunction
ParsePattern()157 RegExpTree* RegExpParser::ParsePattern() {
158 RegExpTree* result = ParseDisjunction(CHECK_FAILED);
159 PatchNamedBackReferences(CHECK_FAILED);
160 DCHECK(!has_more());
161 // If the result of parsing is a literal string atom, and it has the
162 // same length as the input, then the atom is identical to the input.
163 if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
164 simple_ = true;
165 }
166 return result;
167 }
168
169
170 // Disjunction ::
171 // Alternative
172 // Alternative | Disjunction
173 // Alternative ::
174 // [empty]
175 // Term Alternative
176 // Term ::
177 // Assertion
178 // Atom
179 // Atom Quantifier
ParseDisjunction()180 RegExpTree* RegExpParser::ParseDisjunction() {
181 // Used to store current state while parsing subexpressions.
182 RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD,
183 0, nullptr, top_level_flags_, zone());
184 RegExpParserState* state = &initial_state;
185 // Cache the builder in a local variable for quick access.
186 RegExpBuilder* builder = initial_state.builder();
187 while (true) {
188 switch (current()) {
189 case kEndMarker:
190 if (state->IsSubexpression()) {
191 // Inside a parenthesized group when hitting end of input.
192 return ReportError(RegExpError::kUnterminatedGroup);
193 }
194 DCHECK_EQ(INITIAL, state->group_type());
195 // Parsing completed successfully.
196 return builder->ToRegExp();
197 case ')': {
198 if (!state->IsSubexpression()) {
199 return ReportError(RegExpError::kUnmatchedParen);
200 }
201 DCHECK_NE(INITIAL, state->group_type());
202
203 Advance();
204 // End disjunction parsing and convert builder content to new single
205 // regexp atom.
206 RegExpTree* body = builder->ToRegExp();
207
208 int end_capture_index = captures_started();
209
210 int capture_index = state->capture_index();
211 SubexpressionType group_type = state->group_type();
212
213 // Build result of subexpression.
214 if (group_type == CAPTURE) {
215 if (state->IsNamedCapture()) {
216 CreateNamedCaptureAtIndex(state->capture_name(),
217 capture_index CHECK_FAILED);
218 }
219 RegExpCapture* capture = GetCapture(capture_index);
220 capture->set_body(body);
221 body = capture;
222 } else if (group_type == GROUPING) {
223 body = zone()->New<RegExpGroup>(body);
224 } else {
225 DCHECK(group_type == POSITIVE_LOOKAROUND ||
226 group_type == NEGATIVE_LOOKAROUND);
227 bool is_positive = (group_type == POSITIVE_LOOKAROUND);
228 body = zone()->New<RegExpLookaround>(
229 body, is_positive, end_capture_index - capture_index,
230 capture_index, state->lookaround_type());
231 }
232
233 // Restore previous state.
234 state = state->previous_state();
235 builder = state->builder();
236
237 builder->AddAtom(body);
238 // For compatibility with JSC and ES3, we allow quantifiers after
239 // lookaheads, and break in all cases.
240 break;
241 }
242 case '|': {
243 Advance();
244 builder->NewAlternative();
245 continue;
246 }
247 case '*':
248 case '+':
249 case '?':
250 return ReportError(RegExpError::kNothingToRepeat);
251 case '^': {
252 Advance();
253 if (builder->multiline()) {
254 builder->AddAssertion(zone()->New<RegExpAssertion>(
255 RegExpAssertion::START_OF_LINE, builder->flags()));
256 } else {
257 builder->AddAssertion(zone()->New<RegExpAssertion>(
258 RegExpAssertion::START_OF_INPUT, builder->flags()));
259 set_contains_anchor();
260 }
261 continue;
262 }
263 case '$': {
264 Advance();
265 RegExpAssertion::AssertionType assertion_type =
266 builder->multiline() ? RegExpAssertion::END_OF_LINE
267 : RegExpAssertion::END_OF_INPUT;
268 builder->AddAssertion(
269 zone()->New<RegExpAssertion>(assertion_type, builder->flags()));
270 continue;
271 }
272 case '.': {
273 Advance();
274 ZoneList<CharacterRange>* ranges =
275 zone()->New<ZoneList<CharacterRange>>(2, zone());
276
277 if (builder->dotall()) {
278 // Everything.
279 CharacterRange::AddClassEscape('*', ranges, false, zone());
280 } else {
281 // Everything except \x0A, \x0D, \u2028 and \u2029
282 CharacterRange::AddClassEscape('.', ranges, false, zone());
283 }
284
285 RegExpCharacterClass* cc =
286 zone()->New<RegExpCharacterClass>(zone(), ranges, builder->flags());
287 builder->AddCharacterClass(cc);
288 break;
289 }
290 case '(': {
291 state = ParseOpenParenthesis(state CHECK_FAILED);
292 builder = state->builder();
293 continue;
294 }
295 case '[': {
296 RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED);
297 builder->AddCharacterClass(cc->AsCharacterClass());
298 break;
299 }
300 // Atom ::
301 // \ AtomEscape
302 case '\\':
303 switch (Next()) {
304 case kEndMarker:
305 return ReportError(RegExpError::kEscapeAtEndOfPattern);
306 case 'b':
307 Advance(2);
308 builder->AddAssertion(zone()->New<RegExpAssertion>(
309 RegExpAssertion::BOUNDARY, builder->flags()));
310 continue;
311 case 'B':
312 Advance(2);
313 builder->AddAssertion(zone()->New<RegExpAssertion>(
314 RegExpAssertion::NON_BOUNDARY, builder->flags()));
315 continue;
316 // AtomEscape ::
317 // CharacterClassEscape
318 //
319 // CharacterClassEscape :: one of
320 // d D s S w W
321 case 'd':
322 case 'D':
323 case 's':
324 case 'S':
325 case 'w':
326 case 'W': {
327 uc32 c = Next();
328 Advance(2);
329 ZoneList<CharacterRange>* ranges =
330 zone()->New<ZoneList<CharacterRange>>(2, zone());
331 CharacterRange::AddClassEscape(
332 c, ranges, unicode() && builder->ignore_case(), zone());
333 RegExpCharacterClass* cc = zone()->New<RegExpCharacterClass>(
334 zone(), ranges, builder->flags());
335 builder->AddCharacterClass(cc);
336 break;
337 }
338 case 'p':
339 case 'P': {
340 uc32 p = Next();
341 Advance(2);
342 if (unicode()) {
343 ZoneList<CharacterRange>* ranges =
344 zone()->New<ZoneList<CharacterRange>>(2, zone());
345 ZoneVector<char> name_1(zone());
346 ZoneVector<char> name_2(zone());
347 if (ParsePropertyClassName(&name_1, &name_2)) {
348 if (AddPropertyClassRange(ranges, p == 'P', name_1, name_2)) {
349 RegExpCharacterClass* cc = zone()->New<RegExpCharacterClass>(
350 zone(), ranges, builder->flags());
351 builder->AddCharacterClass(cc);
352 break;
353 }
354 if (p == 'p' && name_2.empty()) {
355 RegExpTree* sequence = GetPropertySequence(name_1);
356 if (sequence != nullptr) {
357 builder->AddAtom(sequence);
358 break;
359 }
360 }
361 }
362 return ReportError(RegExpError::kInvalidPropertyName);
363 } else {
364 builder->AddCharacter(p);
365 }
366 break;
367 }
368 case '1':
369 case '2':
370 case '3':
371 case '4':
372 case '5':
373 case '6':
374 case '7':
375 case '8':
376 case '9': {
377 int index = 0;
378 bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
379 if (is_backref) {
380 if (state->IsInsideCaptureGroup(index)) {
381 // The back reference is inside the capture group it refers to.
382 // Nothing can possibly have been captured yet, so we use empty
383 // instead. This ensures that, when checking a back reference,
384 // the capture registers of the referenced capture are either
385 // both set or both cleared.
386 builder->AddEmpty();
387 } else {
388 RegExpCapture* capture = GetCapture(index);
389 RegExpTree* atom =
390 zone()->New<RegExpBackReference>(capture, builder->flags());
391 builder->AddAtom(atom);
392 }
393 break;
394 }
395 // With /u, no identity escapes except for syntax characters
396 // are allowed. Otherwise, all identity escapes are allowed.
397 if (unicode()) {
398 return ReportError(RegExpError::kInvalidEscape);
399 }
400 uc32 first_digit = Next();
401 if (first_digit == '8' || first_digit == '9') {
402 builder->AddCharacter(first_digit);
403 Advance(2);
404 break;
405 }
406 V8_FALLTHROUGH;
407 }
408 case '0': {
409 Advance();
410 if (unicode() && Next() >= '0' && Next() <= '9') {
411 // With /u, decimal escape with leading 0 are not parsed as octal.
412 return ReportError(RegExpError::kInvalidDecimalEscape);
413 }
414 uc32 octal = ParseOctalLiteral();
415 builder->AddCharacter(octal);
416 break;
417 }
418 // ControlEscape :: one of
419 // f n r t v
420 case 'f':
421 Advance(2);
422 builder->AddCharacter('\f');
423 break;
424 case 'n':
425 Advance(2);
426 builder->AddCharacter('\n');
427 break;
428 case 'r':
429 Advance(2);
430 builder->AddCharacter('\r');
431 break;
432 case 't':
433 Advance(2);
434 builder->AddCharacter('\t');
435 break;
436 case 'v':
437 Advance(2);
438 builder->AddCharacter('\v');
439 break;
440 case 'c': {
441 Advance();
442 uc32 controlLetter = Next();
443 // Special case if it is an ASCII letter.
444 // Convert lower case letters to uppercase.
445 uc32 letter = controlLetter & ~('a' ^ 'A');
446 if (letter < 'A' || 'Z' < letter) {
447 // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
448 // Read the backslash as a literal character instead of as
449 // starting an escape.
450 // ES#prod-annexB-ExtendedPatternCharacter
451 if (unicode()) {
452 // With /u, invalid escapes are not treated as identity escapes.
453 return ReportError(RegExpError::kInvalidUnicodeEscape);
454 }
455 builder->AddCharacter('\\');
456 } else {
457 Advance(2);
458 builder->AddCharacter(controlLetter & 0x1F);
459 }
460 break;
461 }
462 case 'x': {
463 Advance(2);
464 uc32 value;
465 if (ParseHexEscape(2, &value)) {
466 builder->AddCharacter(value);
467 } else if (!unicode()) {
468 builder->AddCharacter('x');
469 } else {
470 // With /u, invalid escapes are not treated as identity escapes.
471 return ReportError(RegExpError::kInvalidEscape);
472 }
473 break;
474 }
475 case 'u': {
476 Advance(2);
477 uc32 value;
478 if (ParseUnicodeEscape(&value)) {
479 builder->AddEscapedUnicodeCharacter(value);
480 } else if (!unicode()) {
481 builder->AddCharacter('u');
482 } else {
483 // With /u, invalid escapes are not treated as identity escapes.
484 return ReportError(RegExpError::kInvalidUnicodeEscape);
485 }
486 break;
487 }
488 case 'k':
489 // Either an identity escape or a named back-reference. The two
490 // interpretations are mutually exclusive: '\k' is interpreted as
491 // an identity escape for non-Unicode patterns without named
492 // capture groups, and as the beginning of a named back-reference
493 // in all other cases.
494 if (unicode() || HasNamedCaptures()) {
495 Advance(2);
496 ParseNamedBackReference(builder, state CHECK_FAILED);
497 break;
498 }
499 V8_FALLTHROUGH;
500 default:
501 Advance();
502 // With /u, no identity escapes except for syntax characters
503 // are allowed. Otherwise, all identity escapes are allowed.
504 if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
505 builder->AddCharacter(current());
506 Advance();
507 } else {
508 return ReportError(RegExpError::kInvalidEscape);
509 }
510 break;
511 }
512 break;
513 case '{': {
514 int dummy;
515 bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
516 if (parsed) return ReportError(RegExpError::kNothingToRepeat);
517 V8_FALLTHROUGH;
518 }
519 case '}':
520 case ']':
521 if (unicode()) {
522 return ReportError(RegExpError::kLoneQuantifierBrackets);
523 }
524 V8_FALLTHROUGH;
525 default:
526 builder->AddUnicodeCharacter(current());
527 Advance();
528 break;
529 } // end switch(current())
530
531 int min;
532 int max;
533 switch (current()) {
534 // QuantifierPrefix ::
535 // *
536 // +
537 // ?
538 // {
539 case '*':
540 min = 0;
541 max = RegExpTree::kInfinity;
542 Advance();
543 break;
544 case '+':
545 min = 1;
546 max = RegExpTree::kInfinity;
547 Advance();
548 break;
549 case '?':
550 min = 0;
551 max = 1;
552 Advance();
553 break;
554 case '{':
555 if (ParseIntervalQuantifier(&min, &max)) {
556 if (max < min) {
557 return ReportError(RegExpError::kRangeOutOfOrder);
558 }
559 break;
560 } else if (unicode()) {
561 // With /u, incomplete quantifiers are not allowed.
562 return ReportError(RegExpError::kIncompleteQuantifier);
563 }
564 continue;
565 default:
566 continue;
567 }
568 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
569 if (current() == '?') {
570 quantifier_type = RegExpQuantifier::NON_GREEDY;
571 Advance();
572 } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
573 // FLAG_regexp_possessive_quantifier is a debug-only flag.
574 quantifier_type = RegExpQuantifier::POSSESSIVE;
575 Advance();
576 }
577 if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
578 return ReportError(RegExpError::kInvalidQuantifier);
579 }
580 }
581 }
582
ParseOpenParenthesis(RegExpParserState * state)583 RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
584 RegExpParserState* state) {
585 RegExpLookaround::Type lookaround_type = state->lookaround_type();
586 bool is_named_capture = false;
587 JSRegExp::Flags switch_on = JSRegExp::kNone;
588 JSRegExp::Flags switch_off = JSRegExp::kNone;
589 const ZoneVector<uc16>* capture_name = nullptr;
590 SubexpressionType subexpr_type = CAPTURE;
591 Advance();
592 if (current() == '?') {
593 switch (Next()) {
594 case ':':
595 Advance(2);
596 subexpr_type = GROUPING;
597 break;
598 case '=':
599 Advance(2);
600 lookaround_type = RegExpLookaround::LOOKAHEAD;
601 subexpr_type = POSITIVE_LOOKAROUND;
602 break;
603 case '!':
604 Advance(2);
605 lookaround_type = RegExpLookaround::LOOKAHEAD;
606 subexpr_type = NEGATIVE_LOOKAROUND;
607 break;
608 case '-':
609 case 'i':
610 case 's':
611 case 'm': {
612 if (!FLAG_regexp_mode_modifiers) {
613 ReportError(RegExpError::kInvalidGroup);
614 return nullptr;
615 }
616 Advance();
617 bool flags_sense = true; // Switching on flags.
618 while (subexpr_type != GROUPING) {
619 switch (current()) {
620 case '-':
621 if (!flags_sense) {
622 ReportError(RegExpError::kMultipleFlagDashes);
623 return nullptr;
624 }
625 flags_sense = false;
626 Advance();
627 continue;
628 case 's':
629 case 'i':
630 case 'm': {
631 JSRegExp::Flags bit = JSRegExp::kUnicode;
632 if (current() == 'i') bit = JSRegExp::kIgnoreCase;
633 if (current() == 'm') bit = JSRegExp::kMultiline;
634 if (current() == 's') bit = JSRegExp::kDotAll;
635 if (((switch_on | switch_off) & bit) != 0) {
636 ReportError(RegExpError::kRepeatedFlag);
637 return nullptr;
638 }
639 if (flags_sense) {
640 switch_on |= bit;
641 } else {
642 switch_off |= bit;
643 }
644 Advance();
645 continue;
646 }
647 case ')': {
648 Advance();
649 state->builder()
650 ->FlushText(); // Flush pending text using old flags.
651 // These (?i)-style flag switches don't put us in a subexpression
652 // at all, they just modify the flags in the rest of the current
653 // subexpression.
654 JSRegExp::Flags flags =
655 (state->builder()->flags() | switch_on) & ~switch_off;
656 state->builder()->set_flags(flags);
657 return state;
658 }
659 case ':':
660 Advance();
661 subexpr_type = GROUPING; // Will break us out of the outer loop.
662 continue;
663 default:
664 ReportError(RegExpError::kInvalidFlagGroup);
665 return nullptr;
666 }
667 }
668 break;
669 }
670 case '<':
671 Advance();
672 if (Next() == '=') {
673 Advance(2);
674 lookaround_type = RegExpLookaround::LOOKBEHIND;
675 subexpr_type = POSITIVE_LOOKAROUND;
676 break;
677 } else if (Next() == '!') {
678 Advance(2);
679 lookaround_type = RegExpLookaround::LOOKBEHIND;
680 subexpr_type = NEGATIVE_LOOKAROUND;
681 break;
682 }
683 is_named_capture = true;
684 has_named_captures_ = true;
685 Advance();
686 break;
687 default:
688 ReportError(RegExpError::kInvalidGroup);
689 return nullptr;
690 }
691 }
692 if (subexpr_type == CAPTURE) {
693 if (captures_started_ >= JSRegExp::kMaxCaptures) {
694 ReportError(RegExpError::kTooManyCaptures);
695 return nullptr;
696 }
697 captures_started_++;
698
699 if (is_named_capture) {
700 capture_name = ParseCaptureGroupName(CHECK_FAILED);
701 }
702 }
703 JSRegExp::Flags flags = (state->builder()->flags() | switch_on) & ~switch_off;
704 // Store current state and begin new disjunction parsing.
705 return zone()->New<RegExpParserState>(state, subexpr_type, lookaround_type,
706 captures_started_, capture_name, flags,
707 zone());
708 }
709
710 #ifdef DEBUG
711 // Currently only used in an DCHECK.
IsSpecialClassEscape(uc32 c)712 static bool IsSpecialClassEscape(uc32 c) {
713 switch (c) {
714 case 'd':
715 case 'D':
716 case 's':
717 case 'S':
718 case 'w':
719 case 'W':
720 return true;
721 default:
722 return false;
723 }
724 }
725 #endif
726
727
728 // In order to know whether an escape is a backreference or not we have to scan
729 // the entire regexp and find the number of capturing parentheses. However we
730 // don't want to scan the regexp twice unless it is necessary. This mini-parser
731 // is called when needed. It can see the difference between capturing and
732 // noncapturing parentheses and can skip character classes and backslash-escaped
733 // characters.
ScanForCaptures()734 void RegExpParser::ScanForCaptures() {
735 DCHECK(!is_scanned_for_captures_);
736 const int saved_position = position();
737 // Start with captures started previous to current position
738 int capture_count = captures_started();
739 // Add count of captures after this position.
740 int n;
741 while ((n = current()) != kEndMarker) {
742 Advance();
743 switch (n) {
744 case '\\':
745 Advance();
746 break;
747 case '[': {
748 int c;
749 while ((c = current()) != kEndMarker) {
750 Advance();
751 if (c == '\\') {
752 Advance();
753 } else {
754 if (c == ']') break;
755 }
756 }
757 break;
758 }
759 case '(':
760 if (current() == '?') {
761 // At this point we could be in
762 // * a non-capturing group '(:',
763 // * a lookbehind assertion '(?<=' '(?<!'
764 // * or a named capture '(?<'.
765 //
766 // Of these, only named captures are capturing groups.
767
768 Advance();
769 if (current() != '<') break;
770
771 Advance();
772 if (current() == '=' || current() == '!') break;
773
774 // Found a possible named capture. It could turn out to be a syntax
775 // error (e.g. an unterminated or invalid name), but that distinction
776 // does not matter for our purposes.
777 has_named_captures_ = true;
778 }
779 capture_count++;
780 break;
781 }
782 }
783 capture_count_ = capture_count;
784 is_scanned_for_captures_ = true;
785 Reset(saved_position);
786 }
787
788
ParseBackReferenceIndex(int * index_out)789 bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
790 DCHECK_EQ('\\', current());
791 DCHECK('1' <= Next() && Next() <= '9');
792 // Try to parse a decimal literal that is no greater than the total number
793 // of left capturing parentheses in the input.
794 int start = position();
795 int value = Next() - '0';
796 Advance(2);
797 while (true) {
798 uc32 c = current();
799 if (IsDecimalDigit(c)) {
800 value = 10 * value + (c - '0');
801 if (value > JSRegExp::kMaxCaptures) {
802 Reset(start);
803 return false;
804 }
805 Advance();
806 } else {
807 break;
808 }
809 }
810 if (value > captures_started()) {
811 if (!is_scanned_for_captures_) ScanForCaptures();
812 if (value > capture_count_) {
813 Reset(start);
814 return false;
815 }
816 }
817 *index_out = value;
818 return true;
819 }
820
push_code_unit(ZoneVector<uc16> * v,uint32_t code_unit)821 static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {
822 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
823 v->push_back(code_unit);
824 } else {
825 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
826 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
827 }
828 }
829
ParseCaptureGroupName()830 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
831 ZoneVector<uc16>* name = zone()->New<ZoneVector<uc16>>(zone());
832
833 bool at_start = true;
834 while (true) {
835 uc32 c = current();
836 Advance();
837
838 // Convert unicode escapes.
839 if (c == '\\' && current() == 'u') {
840 Advance();
841 if (!ParseUnicodeEscape(&c)) {
842 ReportError(RegExpError::kInvalidUnicodeEscape);
843 return nullptr;
844 }
845 }
846
847 // The backslash char is misclassified as both ID_Start and ID_Continue.
848 if (c == '\\') {
849 ReportError(RegExpError::kInvalidCaptureGroupName);
850 return nullptr;
851 }
852
853 if (at_start) {
854 if (!IsIdentifierStart(c)) {
855 ReportError(RegExpError::kInvalidCaptureGroupName);
856 return nullptr;
857 }
858 push_code_unit(name, c);
859 at_start = false;
860 } else {
861 if (c == '>') {
862 break;
863 } else if (IsIdentifierPart(c)) {
864 push_code_unit(name, c);
865 } else {
866 ReportError(RegExpError::kInvalidCaptureGroupName);
867 return nullptr;
868 }
869 }
870 }
871
872 return name;
873 }
874
CreateNamedCaptureAtIndex(const ZoneVector<uc16> * name,int index)875 bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
876 int index) {
877 DCHECK(0 < index && index <= captures_started_);
878 DCHECK_NOT_NULL(name);
879
880 RegExpCapture* capture = GetCapture(index);
881 DCHECK_NULL(capture->name());
882
883 capture->set_name(name);
884
885 if (named_captures_ == nullptr) {
886 named_captures_ =
887 zone_->New<ZoneSet<RegExpCapture*, RegExpCaptureNameLess>>(zone());
888 } else {
889 // Check for duplicates and bail if we find any.
890
891 const auto& named_capture_it = named_captures_->find(capture);
892 if (named_capture_it != named_captures_->end()) {
893 ReportError(RegExpError::kDuplicateCaptureGroupName);
894 return false;
895 }
896 }
897
898 named_captures_->emplace(capture);
899
900 return true;
901 }
902
ParseNamedBackReference(RegExpBuilder * builder,RegExpParserState * state)903 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
904 RegExpParserState* state) {
905 // The parser is assumed to be on the '<' in \k<name>.
906 if (current() != '<') {
907 ReportError(RegExpError::kInvalidNamedReference);
908 return false;
909 }
910
911 Advance();
912 const ZoneVector<uc16>* name = ParseCaptureGroupName();
913 if (name == nullptr) {
914 return false;
915 }
916
917 if (state->IsInsideCaptureGroup(name)) {
918 builder->AddEmpty();
919 } else {
920 RegExpBackReference* atom =
921 zone()->New<RegExpBackReference>(builder->flags());
922 atom->set_name(name);
923
924 builder->AddAtom(atom);
925
926 if (named_back_references_ == nullptr) {
927 named_back_references_ =
928 zone()->New<ZoneList<RegExpBackReference*>>(1, zone());
929 }
930 named_back_references_->Add(atom, zone());
931 }
932
933 return true;
934 }
935
PatchNamedBackReferences()936 void RegExpParser::PatchNamedBackReferences() {
937 if (named_back_references_ == nullptr) return;
938
939 if (named_captures_ == nullptr) {
940 ReportError(RegExpError::kInvalidNamedCaptureReference);
941 return;
942 }
943
944 // Look up and patch the actual capture for each named back reference.
945
946 for (int i = 0; i < named_back_references_->length(); i++) {
947 RegExpBackReference* ref = named_back_references_->at(i);
948
949 // Capture used to search the named_captures_ by name, index of the
950 // capture is never used.
951 static const int kInvalidIndex = 0;
952 RegExpCapture* search_capture = zone()->New<RegExpCapture>(kInvalidIndex);
953 DCHECK_NULL(search_capture->name());
954 search_capture->set_name(ref->name());
955
956 int index = -1;
957 const auto& capture_it = named_captures_->find(search_capture);
958 if (capture_it != named_captures_->end()) {
959 index = (*capture_it)->index();
960 } else {
961 ReportError(RegExpError::kInvalidNamedCaptureReference);
962 return;
963 }
964
965 ref->set_capture(GetCapture(index));
966 }
967 }
968
GetCapture(int index)969 RegExpCapture* RegExpParser::GetCapture(int index) {
970 // The index for the capture groups are one-based. Its index in the list is
971 // zero-based.
972 int know_captures =
973 is_scanned_for_captures_ ? capture_count_ : captures_started_;
974 DCHECK(index <= know_captures);
975 if (captures_ == nullptr) {
976 captures_ = zone()->New<ZoneList<RegExpCapture*>>(know_captures, zone());
977 }
978 while (captures_->length() < know_captures) {
979 captures_->Add(zone()->New<RegExpCapture>(captures_->length() + 1), zone());
980 }
981 return captures_->at(index - 1);
982 }
983
984 namespace {
985
986 struct RegExpCaptureIndexLess {
operator ()v8::internal::__anonb7f600e30111::RegExpCaptureIndexLess987 bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const {
988 DCHECK_NOT_NULL(lhs);
989 DCHECK_NOT_NULL(rhs);
990 return lhs->index() < rhs->index();
991 }
992 };
993
994 } // namespace
995
CreateCaptureNameMap()996 Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {
997 if (named_captures_ == nullptr || named_captures_->empty()) {
998 return Handle<FixedArray>();
999 }
1000
1001 // Named captures are sorted by name (because the set is used to ensure
1002 // name uniqueness). But the capture name map must to be sorted by index.
1003
1004 ZoneVector<RegExpCapture*> sorted_named_captures(
1005 named_captures_->begin(), named_captures_->end(), zone());
1006 std::sort(sorted_named_captures.begin(), sorted_named_captures.end(),
1007 RegExpCaptureIndexLess{});
1008 DCHECK_EQ(sorted_named_captures.size(), named_captures_->size());
1009
1010 Factory* factory = isolate()->factory();
1011
1012 int len = static_cast<int>(sorted_named_captures.size()) * 2;
1013 Handle<FixedArray> array = factory->NewFixedArray(len);
1014
1015 int i = 0;
1016 for (const auto& capture : sorted_named_captures) {
1017 Vector<const uc16> capture_name(capture->name()->data(),
1018 capture->name()->size());
1019 // CSA code in ConstructNewResultFromMatchInfo requires these strings to be
1020 // internalized so they can be used as property names in the 'exec' results.
1021 Handle<String> name = factory->InternalizeString(capture_name);
1022 array->set(i * 2, *name);
1023 array->set(i * 2 + 1, Smi::FromInt(capture->index()));
1024
1025 i++;
1026 }
1027 DCHECK_EQ(i * 2, len);
1028
1029 return array;
1030 }
1031
HasNamedCaptures()1032 bool RegExpParser::HasNamedCaptures() {
1033 if (has_named_captures_ || is_scanned_for_captures_) {
1034 return has_named_captures_;
1035 }
1036
1037 ScanForCaptures();
1038 DCHECK(is_scanned_for_captures_);
1039 return has_named_captures_;
1040 }
1041
IsInsideCaptureGroup(int index)1042 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
1043 for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
1044 if (s->group_type() != CAPTURE) continue;
1045 // Return true if we found the matching capture index.
1046 if (index == s->capture_index()) return true;
1047 // Abort if index is larger than what has been parsed up till this state.
1048 if (index > s->capture_index()) return false;
1049 }
1050 return false;
1051 }
1052
IsInsideCaptureGroup(const ZoneVector<uc16> * name)1053 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(
1054 const ZoneVector<uc16>* name) {
1055 DCHECK_NOT_NULL(name);
1056 for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
1057 if (s->capture_name() == nullptr) continue;
1058 if (*s->capture_name() == *name) return true;
1059 }
1060 return false;
1061 }
1062
1063 // QuantifierPrefix ::
1064 // { DecimalDigits }
1065 // { DecimalDigits , }
1066 // { DecimalDigits , DecimalDigits }
1067 //
1068 // Returns true if parsing succeeds, and set the min_out and max_out
1069 // values. Values are truncated to RegExpTree::kInfinity if they overflow.
ParseIntervalQuantifier(int * min_out,int * max_out)1070 bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
1071 DCHECK_EQ(current(), '{');
1072 int start = position();
1073 Advance();
1074 int min = 0;
1075 if (!IsDecimalDigit(current())) {
1076 Reset(start);
1077 return false;
1078 }
1079 while (IsDecimalDigit(current())) {
1080 int next = current() - '0';
1081 if (min > (RegExpTree::kInfinity - next) / 10) {
1082 // Overflow. Skip past remaining decimal digits and return -1.
1083 do {
1084 Advance();
1085 } while (IsDecimalDigit(current()));
1086 min = RegExpTree::kInfinity;
1087 break;
1088 }
1089 min = 10 * min + next;
1090 Advance();
1091 }
1092 int max = 0;
1093 if (current() == '}') {
1094 max = min;
1095 Advance();
1096 } else if (current() == ',') {
1097 Advance();
1098 if (current() == '}') {
1099 max = RegExpTree::kInfinity;
1100 Advance();
1101 } else {
1102 while (IsDecimalDigit(current())) {
1103 int next = current() - '0';
1104 if (max > (RegExpTree::kInfinity - next) / 10) {
1105 do {
1106 Advance();
1107 } while (IsDecimalDigit(current()));
1108 max = RegExpTree::kInfinity;
1109 break;
1110 }
1111 max = 10 * max + next;
1112 Advance();
1113 }
1114 if (current() != '}') {
1115 Reset(start);
1116 return false;
1117 }
1118 Advance();
1119 }
1120 } else {
1121 Reset(start);
1122 return false;
1123 }
1124 *min_out = min;
1125 *max_out = max;
1126 return true;
1127 }
1128
1129
ParseOctalLiteral()1130 uc32 RegExpParser::ParseOctalLiteral() {
1131 DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
1132 // For compatibility with some other browsers (not all), we parse
1133 // up to three octal digits with a value below 256.
1134 // ES#prod-annexB-LegacyOctalEscapeSequence
1135 uc32 value = current() - '0';
1136 Advance();
1137 if ('0' <= current() && current() <= '7') {
1138 value = value * 8 + current() - '0';
1139 Advance();
1140 if (value < 32 && '0' <= current() && current() <= '7') {
1141 value = value * 8 + current() - '0';
1142 Advance();
1143 }
1144 }
1145 return value;
1146 }
1147
1148
ParseHexEscape(int length,uc32 * value)1149 bool RegExpParser::ParseHexEscape(int length, uc32* value) {
1150 int start = position();
1151 uc32 val = 0;
1152 for (int i = 0; i < length; ++i) {
1153 uc32 c = current();
1154 int d = HexValue(c);
1155 if (d < 0) {
1156 Reset(start);
1157 return false;
1158 }
1159 val = val * 16 + d;
1160 Advance();
1161 }
1162 *value = val;
1163 return true;
1164 }
1165
1166 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
ParseUnicodeEscape(uc32 * value)1167 bool RegExpParser::ParseUnicodeEscape(uc32* value) {
1168 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
1169 // allowed). In the latter case, the number of hex digits between { } is
1170 // arbitrary. \ and u have already been read.
1171 if (current() == '{' && unicode()) {
1172 int start = position();
1173 Advance();
1174 if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) {
1175 if (current() == '}') {
1176 Advance();
1177 return true;
1178 }
1179 }
1180 Reset(start);
1181 return false;
1182 }
1183 // \u but no {, or \u{...} escapes not allowed.
1184 bool result = ParseHexEscape(4, value);
1185 if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
1186 current() == '\\') {
1187 // Attempt to read trail surrogate.
1188 int start = position();
1189 if (Next() == 'u') {
1190 Advance(2);
1191 uc32 trail;
1192 if (ParseHexEscape(4, &trail) &&
1193 unibrow::Utf16::IsTrailSurrogate(trail)) {
1194 *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
1195 static_cast<uc16>(trail));
1196 return true;
1197 }
1198 }
1199 Reset(start);
1200 }
1201 return result;
1202 }
1203
1204 #ifdef V8_INTL_SUPPORT
1205
1206 namespace {
1207
IsExactPropertyAlias(const char * property_name,UProperty property)1208 bool IsExactPropertyAlias(const char* property_name, UProperty property) {
1209 const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
1210 if (short_name != nullptr && strcmp(property_name, short_name) == 0)
1211 return true;
1212 for (int i = 0;; i++) {
1213 const char* long_name = u_getPropertyName(
1214 property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1215 if (long_name == nullptr) break;
1216 if (strcmp(property_name, long_name) == 0) return true;
1217 }
1218 return false;
1219 }
1220
IsExactPropertyValueAlias(const char * property_value_name,UProperty property,int32_t property_value)1221 bool IsExactPropertyValueAlias(const char* property_value_name,
1222 UProperty property, int32_t property_value) {
1223 const char* short_name =
1224 u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
1225 if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) {
1226 return true;
1227 }
1228 for (int i = 0;; i++) {
1229 const char* long_name = u_getPropertyValueName(
1230 property, property_value,
1231 static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1232 if (long_name == nullptr) break;
1233 if (strcmp(property_value_name, long_name) == 0) return true;
1234 }
1235 return false;
1236 }
1237
LookupPropertyValueName(UProperty property,const char * property_value_name,bool negate,ZoneList<CharacterRange> * result,Zone * zone)1238 bool LookupPropertyValueName(UProperty property,
1239 const char* property_value_name, bool negate,
1240 ZoneList<CharacterRange>* result, Zone* zone) {
1241 UProperty property_for_lookup = property;
1242 if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
1243 // For the property Script_Extensions, we have to do the property value
1244 // name lookup as if the property is Script.
1245 property_for_lookup = UCHAR_SCRIPT;
1246 }
1247 int32_t property_value =
1248 u_getPropertyValueEnum(property_for_lookup, property_value_name);
1249 if (property_value == UCHAR_INVALID_CODE) return false;
1250
1251 // We require the property name to match exactly to one of the property value
1252 // aliases. However, u_getPropertyValueEnum uses loose matching.
1253 if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
1254 property_value)) {
1255 return false;
1256 }
1257
1258 UErrorCode ec = U_ZERO_ERROR;
1259 icu::UnicodeSet set;
1260 set.applyIntPropertyValue(property, property_value, ec);
1261 bool success = ec == U_ZERO_ERROR && !set.isEmpty();
1262
1263 if (success) {
1264 set.removeAllStrings();
1265 if (negate) set.complement();
1266 for (int i = 0; i < set.getRangeCount(); i++) {
1267 result->Add(
1268 CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
1269 zone);
1270 }
1271 }
1272 return success;
1273 }
1274
1275 template <size_t N>
NameEquals(const char * name,const char (& literal)[N])1276 inline bool NameEquals(const char* name, const char (&literal)[N]) {
1277 return strncmp(name, literal, N + 1) == 0;
1278 }
1279
LookupSpecialPropertyValueName(const char * name,ZoneList<CharacterRange> * result,bool negate,Zone * zone)1280 bool LookupSpecialPropertyValueName(const char* name,
1281 ZoneList<CharacterRange>* result,
1282 bool negate, Zone* zone) {
1283 if (NameEquals(name, "Any")) {
1284 if (negate) {
1285 // Leave the list of character ranges empty, since the negation of 'Any'
1286 // is the empty set.
1287 } else {
1288 result->Add(CharacterRange::Everything(), zone);
1289 }
1290 } else if (NameEquals(name, "ASCII")) {
1291 result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
1292 : CharacterRange::Range(0x0, 0x7F),
1293 zone);
1294 } else if (NameEquals(name, "Assigned")) {
1295 return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
1296 !negate, result, zone);
1297 } else {
1298 return false;
1299 }
1300 return true;
1301 }
1302
1303 // Explicitly allowlist supported binary properties. The spec forbids supporting
1304 // properties outside of this set to ensure interoperability.
IsSupportedBinaryProperty(UProperty property)1305 bool IsSupportedBinaryProperty(UProperty property) {
1306 switch (property) {
1307 case UCHAR_ALPHABETIC:
1308 // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName.
1309 // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName.
1310 case UCHAR_ASCII_HEX_DIGIT:
1311 // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName.
1312 case UCHAR_BIDI_CONTROL:
1313 case UCHAR_BIDI_MIRRORED:
1314 case UCHAR_CASE_IGNORABLE:
1315 case UCHAR_CASED:
1316 case UCHAR_CHANGES_WHEN_CASEFOLDED:
1317 case UCHAR_CHANGES_WHEN_CASEMAPPED:
1318 case UCHAR_CHANGES_WHEN_LOWERCASED:
1319 case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
1320 case UCHAR_CHANGES_WHEN_TITLECASED:
1321 case UCHAR_CHANGES_WHEN_UPPERCASED:
1322 case UCHAR_DASH:
1323 case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
1324 case UCHAR_DEPRECATED:
1325 case UCHAR_DIACRITIC:
1326 case UCHAR_EMOJI:
1327 case UCHAR_EMOJI_COMPONENT:
1328 case UCHAR_EMOJI_MODIFIER_BASE:
1329 case UCHAR_EMOJI_MODIFIER:
1330 case UCHAR_EMOJI_PRESENTATION:
1331 case UCHAR_EXTENDED_PICTOGRAPHIC:
1332 case UCHAR_EXTENDER:
1333 case UCHAR_GRAPHEME_BASE:
1334 case UCHAR_GRAPHEME_EXTEND:
1335 case UCHAR_HEX_DIGIT:
1336 case UCHAR_ID_CONTINUE:
1337 case UCHAR_ID_START:
1338 case UCHAR_IDEOGRAPHIC:
1339 case UCHAR_IDS_BINARY_OPERATOR:
1340 case UCHAR_IDS_TRINARY_OPERATOR:
1341 case UCHAR_JOIN_CONTROL:
1342 case UCHAR_LOGICAL_ORDER_EXCEPTION:
1343 case UCHAR_LOWERCASE:
1344 case UCHAR_MATH:
1345 case UCHAR_NONCHARACTER_CODE_POINT:
1346 case UCHAR_PATTERN_SYNTAX:
1347 case UCHAR_PATTERN_WHITE_SPACE:
1348 case UCHAR_QUOTATION_MARK:
1349 case UCHAR_RADICAL:
1350 case UCHAR_REGIONAL_INDICATOR:
1351 case UCHAR_S_TERM:
1352 case UCHAR_SOFT_DOTTED:
1353 case UCHAR_TERMINAL_PUNCTUATION:
1354 case UCHAR_UNIFIED_IDEOGRAPH:
1355 case UCHAR_UPPERCASE:
1356 case UCHAR_VARIATION_SELECTOR:
1357 case UCHAR_WHITE_SPACE:
1358 case UCHAR_XID_CONTINUE:
1359 case UCHAR_XID_START:
1360 return true;
1361 default:
1362 break;
1363 }
1364 return false;
1365 }
1366
IsUnicodePropertyValueCharacter(char c)1367 bool IsUnicodePropertyValueCharacter(char c) {
1368 // https://tc39.github.io/proposal-regexp-unicode-property-escapes/
1369 //
1370 // Note that using this to validate each parsed char is quite conservative.
1371 // A possible alternative solution would be to only ensure the parsed
1372 // property name/value candidate string does not contain '\0' characters and
1373 // let ICU lookups trigger the final failure.
1374 if ('a' <= c && c <= 'z') return true;
1375 if ('A' <= c && c <= 'Z') return true;
1376 if ('0' <= c && c <= '9') return true;
1377 return (c == '_');
1378 }
1379
1380 } // anonymous namespace
1381
ParsePropertyClassName(ZoneVector<char> * name_1,ZoneVector<char> * name_2)1382 bool RegExpParser::ParsePropertyClassName(ZoneVector<char>* name_1,
1383 ZoneVector<char>* name_2) {
1384 DCHECK(name_1->empty());
1385 DCHECK(name_2->empty());
1386 // Parse the property class as follows:
1387 // - In \p{name}, 'name' is interpreted
1388 // - either as a general category property value name.
1389 // - or as a binary property name.
1390 // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
1391 // and 'value' is interpreted as one of the available property value names.
1392 // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
1393 // - Loose matching is not applied.
1394 if (current() == '{') {
1395 // Parse \p{[PropertyName=]PropertyNameValue}
1396 for (Advance(); current() != '}' && current() != '='; Advance()) {
1397 if (!IsUnicodePropertyValueCharacter(current())) return false;
1398 if (!has_next()) return false;
1399 name_1->push_back(static_cast<char>(current()));
1400 }
1401 if (current() == '=') {
1402 for (Advance(); current() != '}'; Advance()) {
1403 if (!IsUnicodePropertyValueCharacter(current())) return false;
1404 if (!has_next()) return false;
1405 name_2->push_back(static_cast<char>(current()));
1406 }
1407 name_2->push_back(0); // null-terminate string.
1408 }
1409 } else {
1410 return false;
1411 }
1412 Advance();
1413 name_1->push_back(0); // null-terminate string.
1414
1415 DCHECK(name_1->size() - 1 == std::strlen(name_1->data()));
1416 DCHECK(name_2->empty() || name_2->size() - 1 == std::strlen(name_2->data()));
1417 return true;
1418 }
1419
AddPropertyClassRange(ZoneList<CharacterRange> * add_to,bool negate,const ZoneVector<char> & name_1,const ZoneVector<char> & name_2)1420 bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
1421 bool negate,
1422 const ZoneVector<char>& name_1,
1423 const ZoneVector<char>& name_2) {
1424 if (name_2.empty()) {
1425 // First attempt to interpret as general category property value name.
1426 const char* name = name_1.data();
1427 if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
1428 add_to, zone())) {
1429 return true;
1430 }
1431 // Interpret "Any", "ASCII", and "Assigned".
1432 if (LookupSpecialPropertyValueName(name, add_to, negate, zone())) {
1433 return true;
1434 }
1435 // Then attempt to interpret as binary property name with value name 'Y'.
1436 UProperty property = u_getPropertyEnum(name);
1437 if (!IsSupportedBinaryProperty(property)) return false;
1438 if (!IsExactPropertyAlias(name, property)) return false;
1439 return LookupPropertyValueName(property, negate ? "N" : "Y", false, add_to,
1440 zone());
1441 } else {
1442 // Both property name and value name are specified. Attempt to interpret
1443 // the property name as enumerated property.
1444 const char* property_name = name_1.data();
1445 const char* value_name = name_2.data();
1446 UProperty property = u_getPropertyEnum(property_name);
1447 if (!IsExactPropertyAlias(property_name, property)) return false;
1448 if (property == UCHAR_GENERAL_CATEGORY) {
1449 // We want to allow aggregate value names such as "Letter".
1450 property = UCHAR_GENERAL_CATEGORY_MASK;
1451 } else if (property != UCHAR_SCRIPT &&
1452 property != UCHAR_SCRIPT_EXTENSIONS) {
1453 return false;
1454 }
1455 return LookupPropertyValueName(property, value_name, negate, add_to,
1456 zone());
1457 }
1458 }
1459
GetPropertySequence(const ZoneVector<char> & name_1)1460 RegExpTree* RegExpParser::GetPropertySequence(const ZoneVector<char>& name_1) {
1461 if (!FLAG_harmony_regexp_sequence) return nullptr;
1462 const char* name = name_1.data();
1463 const uc32* sequence_list = nullptr;
1464 JSRegExp::Flags flags = JSRegExp::kUnicode;
1465 if (NameEquals(name, "Emoji_Flag_Sequence")) {
1466 sequence_list = UnicodePropertySequences::kEmojiFlagSequences;
1467 } else if (NameEquals(name, "Emoji_Tag_Sequence")) {
1468 sequence_list = UnicodePropertySequences::kEmojiTagSequences;
1469 } else if (NameEquals(name, "Emoji_ZWJ_Sequence")) {
1470 sequence_list = UnicodePropertySequences::kEmojiZWJSequences;
1471 }
1472 if (sequence_list != nullptr) {
1473 // TODO(yangguo): this creates huge regexp code. Alternative to this is
1474 // to create a new operator that checks for these sequences at runtime.
1475 RegExpBuilder builder(zone(), flags);
1476 while (true) { // Iterate through list of sequences.
1477 while (*sequence_list != 0) { // Iterate through sequence.
1478 builder.AddUnicodeCharacter(*sequence_list);
1479 sequence_list++;
1480 }
1481 sequence_list++;
1482 if (*sequence_list == 0) break;
1483 builder.NewAlternative();
1484 }
1485 return builder.ToRegExp();
1486 }
1487
1488 if (NameEquals(name, "Emoji_Keycap_Sequence")) {
1489 // https://unicode.org/reports/tr51/#def_emoji_keycap_sequence
1490 // emoji_keycap_sequence := [0-9#*] \x{FE0F 20E3}
1491 RegExpBuilder builder(zone(), flags);
1492 ZoneList<CharacterRange>* prefix_ranges =
1493 zone()->New<ZoneList<CharacterRange>>(2, zone());
1494 prefix_ranges->Add(CharacterRange::Range('0', '9'), zone());
1495 prefix_ranges->Add(CharacterRange::Singleton('#'), zone());
1496 prefix_ranges->Add(CharacterRange::Singleton('*'), zone());
1497 builder.AddCharacterClass(
1498 zone()->New<RegExpCharacterClass>(zone(), prefix_ranges, flags));
1499 builder.AddCharacter(0xFE0F);
1500 builder.AddCharacter(0x20E3);
1501 return builder.ToRegExp();
1502 } else if (NameEquals(name, "Emoji_Modifier_Sequence")) {
1503 // https://unicode.org/reports/tr51/#def_emoji_modifier_sequence
1504 // emoji_modifier_sequence := emoji_modifier_base emoji_modifier
1505 RegExpBuilder builder(zone(), flags);
1506 ZoneList<CharacterRange>* modifier_base_ranges =
1507 zone()->New<ZoneList<CharacterRange>>(2, zone());
1508 LookupPropertyValueName(UCHAR_EMOJI_MODIFIER_BASE, "Y", false,
1509 modifier_base_ranges, zone());
1510 builder.AddCharacterClass(
1511 zone()->New<RegExpCharacterClass>(zone(), modifier_base_ranges, flags));
1512 ZoneList<CharacterRange>* modifier_ranges =
1513 zone()->New<ZoneList<CharacterRange>>(2, zone());
1514 LookupPropertyValueName(UCHAR_EMOJI_MODIFIER, "Y", false, modifier_ranges,
1515 zone());
1516 builder.AddCharacterClass(
1517 zone()->New<RegExpCharacterClass>(zone(), modifier_ranges, flags));
1518 return builder.ToRegExp();
1519 }
1520
1521 return nullptr;
1522 }
1523
1524 #else // V8_INTL_SUPPORT
1525
ParsePropertyClassName(ZoneVector<char> * name_1,ZoneVector<char> * name_2)1526 bool RegExpParser::ParsePropertyClassName(ZoneVector<char>* name_1,
1527 ZoneVector<char>* name_2) {
1528 return false;
1529 }
1530
AddPropertyClassRange(ZoneList<CharacterRange> * add_to,bool negate,const ZoneVector<char> & name_1,const ZoneVector<char> & name_2)1531 bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
1532 bool negate,
1533 const ZoneVector<char>& name_1,
1534 const ZoneVector<char>& name_2) {
1535 return false;
1536 }
1537
GetPropertySequence(const ZoneVector<char> & name)1538 RegExpTree* RegExpParser::GetPropertySequence(const ZoneVector<char>& name) {
1539 return nullptr;
1540 }
1541
1542 #endif // V8_INTL_SUPPORT
1543
ParseUnlimitedLengthHexNumber(int max_value,uc32 * value)1544 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
1545 uc32 x = 0;
1546 int d = HexValue(current());
1547 if (d < 0) {
1548 return false;
1549 }
1550 while (d >= 0) {
1551 x = x * 16 + d;
1552 if (x > static_cast<uc32>(max_value)) {
1553 return false;
1554 }
1555 Advance();
1556 d = HexValue(current());
1557 }
1558 *value = x;
1559 return true;
1560 }
1561
1562
ParseClassCharacterEscape()1563 uc32 RegExpParser::ParseClassCharacterEscape() {
1564 DCHECK_EQ('\\', current());
1565 DCHECK(has_next() && !IsSpecialClassEscape(Next()));
1566 Advance();
1567 switch (current()) {
1568 case 'b':
1569 Advance();
1570 return '\b';
1571 // ControlEscape :: one of
1572 // f n r t v
1573 case 'f':
1574 Advance();
1575 return '\f';
1576 case 'n':
1577 Advance();
1578 return '\n';
1579 case 'r':
1580 Advance();
1581 return '\r';
1582 case 't':
1583 Advance();
1584 return '\t';
1585 case 'v':
1586 Advance();
1587 return '\v';
1588 case 'c': {
1589 uc32 controlLetter = Next();
1590 uc32 letter = controlLetter & ~('A' ^ 'a');
1591 // Inside a character class, we also accept digits and underscore as
1592 // control characters, unless with /u. See Annex B:
1593 // ES#prod-annexB-ClassControlLetter
1594 if (letter >= 'A' && letter <= 'Z') {
1595 Advance(2);
1596 // Control letters mapped to ASCII control characters in the range
1597 // 0x00-0x1F.
1598 return controlLetter & 0x1F;
1599 }
1600 if (unicode()) {
1601 // With /u, invalid escapes are not treated as identity escapes.
1602 ReportError(RegExpError::kInvalidClassEscape);
1603 return 0;
1604 }
1605 if ((controlLetter >= '0' && controlLetter <= '9') ||
1606 controlLetter == '_') {
1607 Advance(2);
1608 return controlLetter & 0x1F;
1609 }
1610 // We match JSC in reading the backslash as a literal
1611 // character instead of as starting an escape.
1612 // TODO(v8:6201): Not yet covered by the spec.
1613 return '\\';
1614 }
1615 case '0':
1616 // With /u, \0 is interpreted as NUL if not followed by another digit.
1617 if (unicode() && !(Next() >= '0' && Next() <= '9')) {
1618 Advance();
1619 return 0;
1620 }
1621 V8_FALLTHROUGH;
1622 case '1':
1623 case '2':
1624 case '3':
1625 case '4':
1626 case '5':
1627 case '6':
1628 case '7':
1629 // For compatibility, we interpret a decimal escape that isn't
1630 // a back reference (and therefore either \0 or not valid according
1631 // to the specification) as a 1..3 digit octal character code.
1632 // ES#prod-annexB-LegacyOctalEscapeSequence
1633 if (unicode()) {
1634 // With /u, decimal escape is not interpreted as octal character code.
1635 ReportError(RegExpError::kInvalidClassEscape);
1636 return 0;
1637 }
1638 return ParseOctalLiteral();
1639 case 'x': {
1640 Advance();
1641 uc32 value;
1642 if (ParseHexEscape(2, &value)) return value;
1643 if (unicode()) {
1644 // With /u, invalid escapes are not treated as identity escapes.
1645 ReportError(RegExpError::kInvalidEscape);
1646 return 0;
1647 }
1648 // If \x is not followed by a two-digit hexadecimal, treat it
1649 // as an identity escape.
1650 return 'x';
1651 }
1652 case 'u': {
1653 Advance();
1654 uc32 value;
1655 if (ParseUnicodeEscape(&value)) return value;
1656 if (unicode()) {
1657 // With /u, invalid escapes are not treated as identity escapes.
1658 ReportError(RegExpError::kInvalidUnicodeEscape);
1659 return 0;
1660 }
1661 // If \u is not followed by a two-digit hexadecimal, treat it
1662 // as an identity escape.
1663 return 'u';
1664 }
1665 default: {
1666 uc32 result = current();
1667 // With /u, no identity escapes except for syntax characters and '-' are
1668 // allowed. Otherwise, all identity escapes are allowed.
1669 if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
1670 Advance();
1671 return result;
1672 }
1673 ReportError(RegExpError::kInvalidEscape);
1674 return 0;
1675 }
1676 }
1677 UNREACHABLE();
1678 }
1679
ParseClassEscape(ZoneList<CharacterRange> * ranges,Zone * zone,bool add_unicode_case_equivalents,uc32 * char_out,bool * is_class_escape)1680 void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges,
1681 Zone* zone,
1682 bool add_unicode_case_equivalents,
1683 uc32* char_out, bool* is_class_escape) {
1684 uc32 current_char = current();
1685 if (current_char == '\\') {
1686 switch (Next()) {
1687 case 'w':
1688 case 'W':
1689 case 'd':
1690 case 'D':
1691 case 's':
1692 case 'S': {
1693 CharacterRange::AddClassEscape(static_cast<char>(Next()), ranges,
1694 add_unicode_case_equivalents, zone);
1695 Advance(2);
1696 *is_class_escape = true;
1697 return;
1698 }
1699 case kEndMarker:
1700 ReportError(RegExpError::kEscapeAtEndOfPattern);
1701 return;
1702 case 'p':
1703 case 'P':
1704 if (unicode()) {
1705 bool negate = Next() == 'P';
1706 Advance(2);
1707 ZoneVector<char> name_1(zone);
1708 ZoneVector<char> name_2(zone);
1709 if (!ParsePropertyClassName(&name_1, &name_2) ||
1710 !AddPropertyClassRange(ranges, negate, name_1, name_2)) {
1711 ReportError(RegExpError::kInvalidClassPropertyName);
1712 }
1713 *is_class_escape = true;
1714 return;
1715 }
1716 break;
1717 default:
1718 break;
1719 }
1720 *char_out = ParseClassCharacterEscape();
1721 *is_class_escape = false;
1722 } else {
1723 Advance();
1724 *char_out = current_char;
1725 *is_class_escape = false;
1726 }
1727 }
1728
ParseCharacterClass(const RegExpBuilder * builder)1729 RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) {
1730 DCHECK_EQ(current(), '[');
1731 Advance();
1732 bool is_negated = false;
1733 if (current() == '^') {
1734 is_negated = true;
1735 Advance();
1736 }
1737 ZoneList<CharacterRange>* ranges =
1738 zone()->New<ZoneList<CharacterRange>>(2, zone());
1739 bool add_unicode_case_equivalents = unicode() && builder->ignore_case();
1740 while (has_more() && current() != ']') {
1741 uc32 char_1, char_2;
1742 bool is_class_1, is_class_2;
1743 ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1,
1744 &is_class_1 CHECK_FAILED);
1745 if (current() == '-') {
1746 Advance();
1747 if (current() == kEndMarker) {
1748 // If we reach the end we break out of the loop and let the
1749 // following code report an error.
1750 break;
1751 } else if (current() == ']') {
1752 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1753 ranges->Add(CharacterRange::Singleton('-'), zone());
1754 break;
1755 }
1756 ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2,
1757 &is_class_2 CHECK_FAILED);
1758 if (is_class_1 || is_class_2) {
1759 // Either end is an escaped character class. Treat the '-' verbatim.
1760 if (unicode()) {
1761 // ES2015 21.2.2.15.1 step 1.
1762 return ReportError(RegExpError::kInvalidCharacterClass);
1763 }
1764 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1765 ranges->Add(CharacterRange::Singleton('-'), zone());
1766 if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone());
1767 continue;
1768 }
1769 // ES2015 21.2.2.15.1 step 6.
1770 if (char_1 > char_2) {
1771 return ReportError(RegExpError::kOutOfOrderCharacterClass);
1772 }
1773 ranges->Add(CharacterRange::Range(char_1, char_2), zone());
1774 } else {
1775 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1776 }
1777 }
1778 if (!has_more()) {
1779 return ReportError(RegExpError::kUnterminatedCharacterClass);
1780 }
1781 Advance();
1782 RegExpCharacterClass::CharacterClassFlags character_class_flags;
1783 if (is_negated) character_class_flags = RegExpCharacterClass::NEGATED;
1784 return zone()->New<RegExpCharacterClass>(zone(), ranges, builder->flags(),
1785 character_class_flags);
1786 }
1787
1788
1789 #undef CHECK_FAILED
1790
Parse(RegExpCompileData * result,const DisallowHeapAllocation &)1791 bool RegExpParser::Parse(RegExpCompileData* result,
1792 const DisallowHeapAllocation&) {
1793 DCHECK(result != nullptr);
1794 RegExpTree* tree = ParsePattern();
1795 if (failed()) {
1796 DCHECK(tree == nullptr);
1797 DCHECK(error_ != RegExpError::kNone);
1798 result->error = error_;
1799 result->error_pos = error_pos_;
1800 } else {
1801 DCHECK(tree != nullptr);
1802 DCHECK(error_ == RegExpError::kNone);
1803 if (FLAG_trace_regexp_parser) {
1804 StdoutStream os;
1805 tree->Print(os, zone());
1806 os << "\n";
1807 }
1808 result->tree = tree;
1809 int capture_count = captures_started();
1810 result->simple = tree->IsAtom() && simple() && capture_count == 0;
1811 result->contains_anchor = contains_anchor();
1812 result->capture_count = capture_count;
1813 }
1814 return !failed();
1815 }
1816
ParseRegExp(Isolate * isolate,Zone * zone,FlatStringReader * input,JSRegExp::Flags flags,RegExpCompileData * result)1817 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
1818 FlatStringReader* input, JSRegExp::Flags flags,
1819 RegExpCompileData* result) {
1820 RegExpParser parser(input, flags, isolate, zone);
1821 bool success;
1822 {
1823 DisallowHeapAllocation no_gc;
1824 success = parser.Parse(result, no_gc);
1825 }
1826 if (success) {
1827 result->capture_name_map = parser.CreateCaptureNameMap();
1828 }
1829 return success;
1830 }
1831
RegExpBuilder(Zone * zone,JSRegExp::Flags flags)1832 RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags)
1833 : zone_(zone),
1834 pending_empty_(false),
1835 flags_(flags),
1836 characters_(nullptr),
1837 pending_surrogate_(kNoPendingSurrogate),
1838 terms_(),
1839 alternatives_()
1840 #ifdef DEBUG
1841 ,
1842 last_added_(ADD_NONE)
1843 #endif
1844 {
1845 }
1846
1847
AddLeadSurrogate(uc16 lead_surrogate)1848 void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
1849 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1850 FlushPendingSurrogate();
1851 // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
1852 pending_surrogate_ = lead_surrogate;
1853 }
1854
1855
AddTrailSurrogate(uc16 trail_surrogate)1856 void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
1857 DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
1858 if (pending_surrogate_ != kNoPendingSurrogate) {
1859 uc16 lead_surrogate = pending_surrogate_;
1860 pending_surrogate_ = kNoPendingSurrogate;
1861 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1862 uc32 combined =
1863 unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
1864 if (NeedsDesugaringForIgnoreCase(combined)) {
1865 AddCharacterClassForDesugaring(combined);
1866 } else {
1867 ZoneList<uc16> surrogate_pair(2, zone());
1868 surrogate_pair.Add(lead_surrogate, zone());
1869 surrogate_pair.Add(trail_surrogate, zone());
1870 RegExpAtom* atom =
1871 zone()->New<RegExpAtom>(surrogate_pair.ToConstVector(), flags_);
1872 AddAtom(atom);
1873 }
1874 } else {
1875 pending_surrogate_ = trail_surrogate;
1876 FlushPendingSurrogate();
1877 }
1878 }
1879
1880
FlushPendingSurrogate()1881 void RegExpBuilder::FlushPendingSurrogate() {
1882 if (pending_surrogate_ != kNoPendingSurrogate) {
1883 DCHECK(unicode());
1884 uc32 c = pending_surrogate_;
1885 pending_surrogate_ = kNoPendingSurrogate;
1886 AddCharacterClassForDesugaring(c);
1887 }
1888 }
1889
1890
FlushCharacters()1891 void RegExpBuilder::FlushCharacters() {
1892 FlushPendingSurrogate();
1893 pending_empty_ = false;
1894 if (characters_ != nullptr) {
1895 RegExpTree* atom =
1896 zone()->New<RegExpAtom>(characters_->ToConstVector(), flags_);
1897 characters_ = nullptr;
1898 text_.Add(atom, zone());
1899 LAST(ADD_ATOM);
1900 }
1901 }
1902
1903
FlushText()1904 void RegExpBuilder::FlushText() {
1905 FlushCharacters();
1906 int num_text = text_.length();
1907 if (num_text == 0) {
1908 return;
1909 } else if (num_text == 1) {
1910 terms_.Add(text_.last(), zone());
1911 } else {
1912 RegExpText* text = zone()->New<RegExpText>(zone());
1913 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
1914 terms_.Add(text, zone());
1915 }
1916 text_.Clear();
1917 }
1918
1919
AddCharacter(uc16 c)1920 void RegExpBuilder::AddCharacter(uc16 c) {
1921 FlushPendingSurrogate();
1922 pending_empty_ = false;
1923 if (NeedsDesugaringForIgnoreCase(c)) {
1924 AddCharacterClassForDesugaring(c);
1925 } else {
1926 if (characters_ == nullptr) {
1927 characters_ = zone()->New<ZoneList<uc16>>(4, zone());
1928 }
1929 characters_->Add(c, zone());
1930 LAST(ADD_CHAR);
1931 }
1932 }
1933
1934
AddUnicodeCharacter(uc32 c)1935 void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
1936 if (c > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
1937 DCHECK(unicode());
1938 AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
1939 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
1940 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
1941 AddLeadSurrogate(c);
1942 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
1943 AddTrailSurrogate(c);
1944 } else {
1945 AddCharacter(static_cast<uc16>(c));
1946 }
1947 }
1948
AddEscapedUnicodeCharacter(uc32 character)1949 void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
1950 // A lead or trail surrogate parsed via escape sequence will not
1951 // pair up with any preceding lead or following trail surrogate.
1952 FlushPendingSurrogate();
1953 AddUnicodeCharacter(character);
1954 FlushPendingSurrogate();
1955 }
1956
AddEmpty()1957 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
1958
1959
AddCharacterClass(RegExpCharacterClass * cc)1960 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
1961 if (NeedsDesugaringForUnicode(cc)) {
1962 // With /u, character class needs to be desugared, so it
1963 // must be a standalone term instead of being part of a RegExpText.
1964 AddTerm(cc);
1965 } else {
1966 AddAtom(cc);
1967 }
1968 }
1969
AddCharacterClassForDesugaring(uc32 c)1970 void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
1971 AddTerm(zone()->New<RegExpCharacterClass>(
1972 zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c)),
1973 flags_));
1974 }
1975
1976
AddAtom(RegExpTree * term)1977 void RegExpBuilder::AddAtom(RegExpTree* term) {
1978 if (term->IsEmpty()) {
1979 AddEmpty();
1980 return;
1981 }
1982 if (term->IsTextElement()) {
1983 FlushCharacters();
1984 text_.Add(term, zone());
1985 } else {
1986 FlushText();
1987 terms_.Add(term, zone());
1988 }
1989 LAST(ADD_ATOM);
1990 }
1991
1992
AddTerm(RegExpTree * term)1993 void RegExpBuilder::AddTerm(RegExpTree* term) {
1994 FlushText();
1995 terms_.Add(term, zone());
1996 LAST(ADD_ATOM);
1997 }
1998
1999
AddAssertion(RegExpTree * assert)2000 void RegExpBuilder::AddAssertion(RegExpTree* assert) {
2001 FlushText();
2002 terms_.Add(assert, zone());
2003 LAST(ADD_ASSERT);
2004 }
2005
2006
NewAlternative()2007 void RegExpBuilder::NewAlternative() { FlushTerms(); }
2008
2009
FlushTerms()2010 void RegExpBuilder::FlushTerms() {
2011 FlushText();
2012 int num_terms = terms_.length();
2013 RegExpTree* alternative;
2014 if (num_terms == 0) {
2015 alternative = zone()->New<RegExpEmpty>();
2016 } else if (num_terms == 1) {
2017 alternative = terms_.last();
2018 } else {
2019 alternative = zone()->New<RegExpAlternative>(terms_.GetList(zone()));
2020 }
2021 alternatives_.Add(alternative, zone());
2022 terms_.Clear();
2023 LAST(ADD_NONE);
2024 }
2025
2026
NeedsDesugaringForUnicode(RegExpCharacterClass * cc)2027 bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
2028 if (!unicode()) return false;
2029 // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
2030 // necessarily mean that we need to desugar. It's probably nicer to have a
2031 // separate pass to figure out unicode desugarings.
2032 if (ignore_case()) return true;
2033 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2034 CharacterRange::Canonicalize(ranges);
2035 for (int i = ranges->length() - 1; i >= 0; i--) {
2036 uc32 from = ranges->at(i).from();
2037 uc32 to = ranges->at(i).to();
2038 // Check for non-BMP characters.
2039 if (to >= kNonBmpStart) return true;
2040 // Check for lone surrogates.
2041 if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
2042 }
2043 return false;
2044 }
2045
2046
NeedsDesugaringForIgnoreCase(uc32 c)2047 bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
2048 #ifdef V8_INTL_SUPPORT
2049 if (unicode() && ignore_case()) {
2050 icu::UnicodeSet set(c, c);
2051 set.closeOver(USET_CASE_INSENSITIVE);
2052 set.removeAllStrings();
2053 return set.size() > 1;
2054 }
2055 // In the case where ICU is not included, we act as if the unicode flag is
2056 // not set, and do not desugar.
2057 #endif // V8_INTL_SUPPORT
2058 return false;
2059 }
2060
2061
ToRegExp()2062 RegExpTree* RegExpBuilder::ToRegExp() {
2063 FlushTerms();
2064 int num_alternatives = alternatives_.length();
2065 if (num_alternatives == 0) return zone()->New<RegExpEmpty>();
2066 if (num_alternatives == 1) return alternatives_.last();
2067 return zone()->New<RegExpDisjunction>(alternatives_.GetList(zone()));
2068 }
2069
AddQuantifierToAtom(int min,int max,RegExpQuantifier::QuantifierType quantifier_type)2070 bool RegExpBuilder::AddQuantifierToAtom(
2071 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
2072 FlushPendingSurrogate();
2073 if (pending_empty_) {
2074 pending_empty_ = false;
2075 return true;
2076 }
2077 RegExpTree* atom;
2078 if (characters_ != nullptr) {
2079 DCHECK(last_added_ == ADD_CHAR);
2080 // Last atom was character.
2081 Vector<const uc16> char_vector = characters_->ToConstVector();
2082 int num_chars = char_vector.length();
2083 if (num_chars > 1) {
2084 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
2085 text_.Add(zone()->New<RegExpAtom>(prefix, flags_), zone());
2086 char_vector = char_vector.SubVector(num_chars - 1, num_chars);
2087 }
2088 characters_ = nullptr;
2089 atom = zone()->New<RegExpAtom>(char_vector, flags_);
2090 FlushText();
2091 } else if (text_.length() > 0) {
2092 DCHECK(last_added_ == ADD_ATOM);
2093 atom = text_.RemoveLast();
2094 FlushText();
2095 } else if (terms_.length() > 0) {
2096 DCHECK(last_added_ == ADD_ATOM);
2097 atom = terms_.RemoveLast();
2098 if (atom->IsLookaround()) {
2099 // With /u, lookarounds are not quantifiable.
2100 if (unicode()) return false;
2101 // Lookbehinds are not quantifiable.
2102 if (atom->AsLookaround()->type() == RegExpLookaround::LOOKBEHIND) {
2103 return false;
2104 }
2105 }
2106 if (atom->max_match() == 0) {
2107 // Guaranteed to only match an empty string.
2108 LAST(ADD_TERM);
2109 if (min == 0) {
2110 return true;
2111 }
2112 terms_.Add(atom, zone());
2113 return true;
2114 }
2115 } else {
2116 // Only call immediately after adding an atom or character!
2117 UNREACHABLE();
2118 }
2119 terms_.Add(zone()->New<RegExpQuantifier>(min, max, quantifier_type, atom),
2120 zone());
2121 LAST(ADD_TERM);
2122 return true;
2123 }
2124
2125 } // namespace internal
2126 } // namespace v8
2127