1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/regexp/regexp-parser.h"
6
7 #include <vector>
8
9 #include "src/char-predicates-inl.h"
10 #include "src/heap/factory.h"
11 #include "src/isolate.h"
12 #include "src/objects-inl.h"
13 #include "src/ostreams.h"
14 #include "src/regexp/jsregexp.h"
15 #include "src/utils.h"
16
17 #ifdef V8_INTL_SUPPORT
18 #include "unicode/uniset.h"
19 // TODO(mathias): Remove this when we no longer need to check
20 // `U_ICU_VERSION_MAJOR_NUM`.
21 #include "unicode/uvernum.h"
22 #endif // V8_INTL_SUPPORT
23
24 namespace v8 {
25 namespace internal {
26
RegExpParser(FlatStringReader * in,Handle<String> * error,JSRegExp::Flags flags,Isolate * isolate,Zone * zone)27 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
28 JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
29 : isolate_(isolate),
30 zone_(zone),
31 error_(error),
32 captures_(nullptr),
33 named_captures_(nullptr),
34 named_back_references_(nullptr),
35 in_(in),
36 current_(kEndMarker),
37 top_level_flags_(flags),
38 next_pos_(0),
39 captures_started_(0),
40 capture_count_(0),
41 has_more_(true),
42 simple_(false),
43 contains_anchor_(false),
44 is_scanned_for_captures_(false),
45 has_named_captures_(false),
46 failed_(false) {
47 Advance();
48 }
49
50 template <bool update_position>
ReadNext()51 inline uc32 RegExpParser::ReadNext() {
52 int position = next_pos_;
53 uc32 c0 = in()->Get(position);
54 position++;
55 // Read the whole surrogate pair in case of unicode flag, if possible.
56 if (unicode() && position < in()->length() &&
57 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
58 uc16 c1 = in()->Get(position);
59 if (unibrow::Utf16::IsTrailSurrogate(c1)) {
60 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
61 position++;
62 }
63 }
64 if (update_position) next_pos_ = position;
65 return c0;
66 }
67
68
Next()69 uc32 RegExpParser::Next() {
70 if (has_next()) {
71 return ReadNext<false>();
72 } else {
73 return kEndMarker;
74 }
75 }
76
Advance()77 void RegExpParser::Advance() {
78 if (has_next()) {
79 StackLimitCheck check(isolate());
80 if (check.HasOverflowed()) {
81 if (FLAG_abort_on_stack_or_string_length_overflow) {
82 FATAL("Aborting on stack overflow");
83 }
84 ReportError(CStrVector(
85 MessageTemplate::TemplateString(MessageTemplate::kStackOverflow)));
86 } else if (zone()->excess_allocation()) {
87 ReportError(CStrVector("Regular expression too large"));
88 } else {
89 current_ = ReadNext<true>();
90 }
91 } else {
92 current_ = kEndMarker;
93 // Advance so that position() points to 1-after-the-last-character. This is
94 // important so that Reset() to this position works correctly.
95 next_pos_ = in()->length() + 1;
96 has_more_ = false;
97 }
98 }
99
100
Reset(int pos)101 void RegExpParser::Reset(int pos) {
102 next_pos_ = pos;
103 has_more_ = (pos < in()->length());
104 Advance();
105 }
106
Advance(int dist)107 void RegExpParser::Advance(int dist) {
108 next_pos_ += dist - 1;
109 Advance();
110 }
111
112
simple()113 bool RegExpParser::simple() { return simple_; }
114
IsSyntaxCharacterOrSlash(uc32 c)115 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
116 switch (c) {
117 case '^':
118 case '$':
119 case '\\':
120 case '.':
121 case '*':
122 case '+':
123 case '?':
124 case '(':
125 case ')':
126 case '[':
127 case ']':
128 case '{':
129 case '}':
130 case '|':
131 case '/':
132 return true;
133 default:
134 break;
135 }
136 return false;
137 }
138
139
ReportError(Vector<const char> message)140 RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
141 if (failed_) return nullptr; // Do not overwrite any existing error.
142 failed_ = true;
143 *error_ = isolate()
144 ->factory()
145 ->NewStringFromOneByte(Vector<const uint8_t>::cast(message))
146 .ToHandleChecked();
147 // Zip to the end to make sure the no more input is read.
148 current_ = kEndMarker;
149 next_pos_ = in()->length();
150 return nullptr;
151 }
152
153 #define CHECK_FAILED /**/); \
154 if (failed_) return nullptr; \
155 ((void)0
156
157 // Pattern ::
158 // Disjunction
ParsePattern()159 RegExpTree* RegExpParser::ParsePattern() {
160 RegExpTree* result = ParseDisjunction(CHECK_FAILED);
161 PatchNamedBackReferences(CHECK_FAILED);
162 DCHECK(!has_more());
163 // If the result of parsing is a literal string atom, and it has the
164 // same length as the input, then the atom is identical to the input.
165 if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
166 simple_ = true;
167 }
168 return result;
169 }
170
171
172 // Disjunction ::
173 // Alternative
174 // Alternative | Disjunction
175 // Alternative ::
176 // [empty]
177 // Term Alternative
178 // Term ::
179 // Assertion
180 // Atom
181 // Atom Quantifier
ParseDisjunction()182 RegExpTree* RegExpParser::ParseDisjunction() {
183 // Used to store current state while parsing subexpressions.
184 RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD,
185 0, nullptr, top_level_flags_, zone());
186 RegExpParserState* state = &initial_state;
187 // Cache the builder in a local variable for quick access.
188 RegExpBuilder* builder = initial_state.builder();
189 while (true) {
190 switch (current()) {
191 case kEndMarker:
192 if (state->IsSubexpression()) {
193 // Inside a parenthesized group when hitting end of input.
194 return ReportError(CStrVector("Unterminated group"));
195 }
196 DCHECK_EQ(INITIAL, state->group_type());
197 // Parsing completed successfully.
198 return builder->ToRegExp();
199 case ')': {
200 if (!state->IsSubexpression()) {
201 return ReportError(CStrVector("Unmatched ')'"));
202 }
203 DCHECK_NE(INITIAL, state->group_type());
204
205 Advance();
206 // End disjunction parsing and convert builder content to new single
207 // regexp atom.
208 RegExpTree* body = builder->ToRegExp();
209
210 int end_capture_index = captures_started();
211
212 int capture_index = state->capture_index();
213 SubexpressionType group_type = state->group_type();
214
215 // Build result of subexpression.
216 if (group_type == CAPTURE) {
217 if (state->IsNamedCapture()) {
218 CreateNamedCaptureAtIndex(state->capture_name(),
219 capture_index CHECK_FAILED);
220 }
221 RegExpCapture* capture = GetCapture(capture_index);
222 capture->set_body(body);
223 body = capture;
224 } else if (group_type == GROUPING) {
225 body = new (zone()) RegExpGroup(body);
226 } else {
227 DCHECK(group_type == POSITIVE_LOOKAROUND ||
228 group_type == NEGATIVE_LOOKAROUND);
229 bool is_positive = (group_type == POSITIVE_LOOKAROUND);
230 body = new (zone()) RegExpLookaround(
231 body, is_positive, end_capture_index - capture_index,
232 capture_index, state->lookaround_type());
233 }
234
235 // Restore previous state.
236 state = state->previous_state();
237 builder = state->builder();
238
239 builder->AddAtom(body);
240 // For compatibility with JSC and ES3, we allow quantifiers after
241 // lookaheads, and break in all cases.
242 break;
243 }
244 case '|': {
245 Advance();
246 builder->NewAlternative();
247 continue;
248 }
249 case '*':
250 case '+':
251 case '?':
252 return ReportError(CStrVector("Nothing to repeat"));
253 case '^': {
254 Advance();
255 if (builder->multiline()) {
256 builder->AddAssertion(new (zone()) RegExpAssertion(
257 RegExpAssertion::START_OF_LINE, builder->flags()));
258 } else {
259 builder->AddAssertion(new (zone()) RegExpAssertion(
260 RegExpAssertion::START_OF_INPUT, builder->flags()));
261 set_contains_anchor();
262 }
263 continue;
264 }
265 case '$': {
266 Advance();
267 RegExpAssertion::AssertionType assertion_type =
268 builder->multiline() ? RegExpAssertion::END_OF_LINE
269 : RegExpAssertion::END_OF_INPUT;
270 builder->AddAssertion(
271 new (zone()) RegExpAssertion(assertion_type, builder->flags()));
272 continue;
273 }
274 case '.': {
275 Advance();
276 ZoneList<CharacterRange>* ranges =
277 new (zone()) ZoneList<CharacterRange>(2, zone());
278
279 if (builder->dotall()) {
280 // Everything.
281 CharacterRange::AddClassEscape('*', ranges, false, zone());
282 } else {
283 // Everything except \x0A, \x0D, \u2028 and \u2029
284 CharacterRange::AddClassEscape('.', ranges, false, zone());
285 }
286
287 RegExpCharacterClass* cc =
288 new (zone()) RegExpCharacterClass(zone(), ranges, builder->flags());
289 builder->AddCharacterClass(cc);
290 break;
291 }
292 case '(': {
293 state = ParseOpenParenthesis(state CHECK_FAILED);
294 builder = state->builder();
295 continue;
296 }
297 case '[': {
298 RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED);
299 builder->AddCharacterClass(cc->AsCharacterClass());
300 break;
301 }
302 // Atom ::
303 // \ AtomEscape
304 case '\\':
305 switch (Next()) {
306 case kEndMarker:
307 return ReportError(CStrVector("\\ at end of pattern"));
308 case 'b':
309 Advance(2);
310 builder->AddAssertion(new (zone()) RegExpAssertion(
311 RegExpAssertion::BOUNDARY, builder->flags()));
312 continue;
313 case 'B':
314 Advance(2);
315 builder->AddAssertion(new (zone()) RegExpAssertion(
316 RegExpAssertion::NON_BOUNDARY, builder->flags()));
317 continue;
318 // AtomEscape ::
319 // CharacterClassEscape
320 //
321 // CharacterClassEscape :: one of
322 // d D s S w W
323 case 'd':
324 case 'D':
325 case 's':
326 case 'S':
327 case 'w':
328 case 'W': {
329 uc32 c = Next();
330 Advance(2);
331 ZoneList<CharacterRange>* ranges =
332 new (zone()) ZoneList<CharacterRange>(2, zone());
333 CharacterRange::AddClassEscape(
334 c, ranges, unicode() && builder->ignore_case(), zone());
335 RegExpCharacterClass* cc = new (zone())
336 RegExpCharacterClass(zone(), ranges, builder->flags());
337 builder->AddCharacterClass(cc);
338 break;
339 }
340 case 'p':
341 case 'P': {
342 uc32 p = Next();
343 Advance(2);
344 if (unicode()) {
345 ZoneList<CharacterRange>* ranges =
346 new (zone()) ZoneList<CharacterRange>(2, zone());
347 if (!ParsePropertyClass(ranges, p == 'P')) {
348 return ReportError(CStrVector("Invalid property name"));
349 }
350 RegExpCharacterClass* cc = new (zone())
351 RegExpCharacterClass(zone(), ranges, builder->flags());
352 builder->AddCharacterClass(cc);
353
354 } else {
355 builder->AddCharacter(p);
356 }
357 break;
358 }
359 case '1':
360 case '2':
361 case '3':
362 case '4':
363 case '5':
364 case '6':
365 case '7':
366 case '8':
367 case '9': {
368 int index = 0;
369 bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
370 if (is_backref) {
371 if (state->IsInsideCaptureGroup(index)) {
372 // The back reference is inside the capture group it refers to.
373 // Nothing can possibly have been captured yet, so we use empty
374 // instead. This ensures that, when checking a back reference,
375 // the capture registers of the referenced capture are either
376 // both set or both cleared.
377 builder->AddEmpty();
378 } else {
379 RegExpCapture* capture = GetCapture(index);
380 RegExpTree* atom =
381 new (zone()) RegExpBackReference(capture, builder->flags());
382 builder->AddAtom(atom);
383 }
384 break;
385 }
386 // With /u, no identity escapes except for syntax characters
387 // are allowed. Otherwise, all identity escapes are allowed.
388 if (unicode()) {
389 return ReportError(CStrVector("Invalid escape"));
390 }
391 uc32 first_digit = Next();
392 if (first_digit == '8' || first_digit == '9') {
393 builder->AddCharacter(first_digit);
394 Advance(2);
395 break;
396 }
397 V8_FALLTHROUGH;
398 }
399 case '0': {
400 Advance();
401 if (unicode() && Next() >= '0' && Next() <= '9') {
402 // With /u, decimal escape with leading 0 are not parsed as octal.
403 return ReportError(CStrVector("Invalid decimal escape"));
404 }
405 uc32 octal = ParseOctalLiteral();
406 builder->AddCharacter(octal);
407 break;
408 }
409 // ControlEscape :: one of
410 // f n r t v
411 case 'f':
412 Advance(2);
413 builder->AddCharacter('\f');
414 break;
415 case 'n':
416 Advance(2);
417 builder->AddCharacter('\n');
418 break;
419 case 'r':
420 Advance(2);
421 builder->AddCharacter('\r');
422 break;
423 case 't':
424 Advance(2);
425 builder->AddCharacter('\t');
426 break;
427 case 'v':
428 Advance(2);
429 builder->AddCharacter('\v');
430 break;
431 case 'c': {
432 Advance();
433 uc32 controlLetter = Next();
434 // Special case if it is an ASCII letter.
435 // Convert lower case letters to uppercase.
436 uc32 letter = controlLetter & ~('a' ^ 'A');
437 if (letter < 'A' || 'Z' < letter) {
438 // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
439 // Read the backslash as a literal character instead of as
440 // starting an escape.
441 // ES#prod-annexB-ExtendedPatternCharacter
442 if (unicode()) {
443 // With /u, invalid escapes are not treated as identity escapes.
444 return ReportError(CStrVector("Invalid unicode escape"));
445 }
446 builder->AddCharacter('\\');
447 } else {
448 Advance(2);
449 builder->AddCharacter(controlLetter & 0x1F);
450 }
451 break;
452 }
453 case 'x': {
454 Advance(2);
455 uc32 value;
456 if (ParseHexEscape(2, &value)) {
457 builder->AddCharacter(value);
458 } else if (!unicode()) {
459 builder->AddCharacter('x');
460 } else {
461 // With /u, invalid escapes are not treated as identity escapes.
462 return ReportError(CStrVector("Invalid escape"));
463 }
464 break;
465 }
466 case 'u': {
467 Advance(2);
468 uc32 value;
469 if (ParseUnicodeEscape(&value)) {
470 builder->AddEscapedUnicodeCharacter(value);
471 } else if (!unicode()) {
472 builder->AddCharacter('u');
473 } else {
474 // With /u, invalid escapes are not treated as identity escapes.
475 return ReportError(CStrVector("Invalid Unicode escape"));
476 }
477 break;
478 }
479 case 'k':
480 // Either an identity escape or a named back-reference. The two
481 // interpretations are mutually exclusive: '\k' is interpreted as
482 // an identity escape for non-Unicode patterns without named
483 // capture groups, and as the beginning of a named back-reference
484 // in all other cases.
485 if (unicode() || HasNamedCaptures()) {
486 Advance(2);
487 ParseNamedBackReference(builder, state CHECK_FAILED);
488 break;
489 }
490 V8_FALLTHROUGH;
491 default:
492 Advance();
493 // With /u, no identity escapes except for syntax characters
494 // are allowed. Otherwise, all identity escapes are allowed.
495 if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
496 builder->AddCharacter(current());
497 Advance();
498 } else {
499 return ReportError(CStrVector("Invalid escape"));
500 }
501 break;
502 }
503 break;
504 case '{': {
505 int dummy;
506 bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
507 if (parsed) return ReportError(CStrVector("Nothing to repeat"));
508 V8_FALLTHROUGH;
509 }
510 case '}':
511 case ']':
512 if (unicode()) {
513 return ReportError(CStrVector("Lone quantifier brackets"));
514 }
515 V8_FALLTHROUGH;
516 default:
517 builder->AddUnicodeCharacter(current());
518 Advance();
519 break;
520 } // end switch(current())
521
522 int min;
523 int max;
524 switch (current()) {
525 // QuantifierPrefix ::
526 // *
527 // +
528 // ?
529 // {
530 case '*':
531 min = 0;
532 max = RegExpTree::kInfinity;
533 Advance();
534 break;
535 case '+':
536 min = 1;
537 max = RegExpTree::kInfinity;
538 Advance();
539 break;
540 case '?':
541 min = 0;
542 max = 1;
543 Advance();
544 break;
545 case '{':
546 if (ParseIntervalQuantifier(&min, &max)) {
547 if (max < min) {
548 return ReportError(
549 CStrVector("numbers out of order in {} quantifier"));
550 }
551 break;
552 } else if (unicode()) {
553 // With /u, incomplete quantifiers are not allowed.
554 return ReportError(CStrVector("Incomplete quantifier"));
555 }
556 continue;
557 default:
558 continue;
559 }
560 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
561 if (current() == '?') {
562 quantifier_type = RegExpQuantifier::NON_GREEDY;
563 Advance();
564 } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
565 // FLAG_regexp_possessive_quantifier is a debug-only flag.
566 quantifier_type = RegExpQuantifier::POSSESSIVE;
567 Advance();
568 }
569 if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
570 return ReportError(CStrVector("Invalid quantifier"));
571 }
572 }
573 }
574
ParseOpenParenthesis(RegExpParserState * state)575 RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
576 RegExpParserState* state) {
577 RegExpLookaround::Type lookaround_type = state->lookaround_type();
578 bool is_named_capture = false;
579 JSRegExp::Flags switch_on = JSRegExp::kNone;
580 JSRegExp::Flags switch_off = JSRegExp::kNone;
581 const ZoneVector<uc16>* capture_name = nullptr;
582 SubexpressionType subexpr_type = CAPTURE;
583 Advance();
584 if (current() == '?') {
585 switch (Next()) {
586 case ':':
587 Advance(2);
588 subexpr_type = GROUPING;
589 break;
590 case '=':
591 Advance(2);
592 lookaround_type = RegExpLookaround::LOOKAHEAD;
593 subexpr_type = POSITIVE_LOOKAROUND;
594 break;
595 case '!':
596 Advance(2);
597 lookaround_type = RegExpLookaround::LOOKAHEAD;
598 subexpr_type = NEGATIVE_LOOKAROUND;
599 break;
600 case '-':
601 case 'i':
602 case 's':
603 case 'm': {
604 if (!FLAG_regexp_mode_modifiers) {
605 ReportError(CStrVector("Invalid group"));
606 return nullptr;
607 }
608 Advance();
609 bool flags_sense = true; // Switching on flags.
610 while (subexpr_type != GROUPING) {
611 switch (current()) {
612 case '-':
613 if (!flags_sense) {
614 ReportError(CStrVector("Multiple dashes in flag group"));
615 return nullptr;
616 }
617 flags_sense = false;
618 Advance();
619 continue;
620 case 's':
621 case 'i':
622 case 'm': {
623 JSRegExp::Flags bit = JSRegExp::kUnicode;
624 if (current() == 'i') bit = JSRegExp::kIgnoreCase;
625 if (current() == 'm') bit = JSRegExp::kMultiline;
626 if (current() == 's') bit = JSRegExp::kDotAll;
627 if (((switch_on | switch_off) & bit) != 0) {
628 ReportError(CStrVector("Repeated flag in flag group"));
629 return nullptr;
630 }
631 if (flags_sense) {
632 switch_on |= bit;
633 } else {
634 switch_off |= bit;
635 }
636 Advance();
637 continue;
638 }
639 case ')': {
640 Advance();
641 state->builder()
642 ->FlushText(); // Flush pending text using old flags.
643 // These (?i)-style flag switches don't put us in a subexpression
644 // at all, they just modify the flags in the rest of the current
645 // subexpression.
646 JSRegExp::Flags flags =
647 (state->builder()->flags() | switch_on) & ~switch_off;
648 state->builder()->set_flags(flags);
649 return state;
650 }
651 case ':':
652 Advance();
653 subexpr_type = GROUPING; // Will break us out of the outer loop.
654 continue;
655 default:
656 ReportError(CStrVector("Invalid flag group"));
657 return nullptr;
658 }
659 }
660 break;
661 }
662 case '<':
663 Advance();
664 if (Next() == '=') {
665 Advance(2);
666 lookaround_type = RegExpLookaround::LOOKBEHIND;
667 subexpr_type = POSITIVE_LOOKAROUND;
668 break;
669 } else if (Next() == '!') {
670 Advance(2);
671 lookaround_type = RegExpLookaround::LOOKBEHIND;
672 subexpr_type = NEGATIVE_LOOKAROUND;
673 break;
674 }
675 is_named_capture = true;
676 has_named_captures_ = true;
677 Advance();
678 break;
679 default:
680 ReportError(CStrVector("Invalid group"));
681 return nullptr;
682 }
683 }
684 if (subexpr_type == CAPTURE) {
685 if (captures_started_ >= kMaxCaptures) {
686 ReportError(CStrVector("Too many captures"));
687 return nullptr;
688 }
689 captures_started_++;
690
691 if (is_named_capture) {
692 capture_name = ParseCaptureGroupName(CHECK_FAILED);
693 }
694 }
695 JSRegExp::Flags flags = (state->builder()->flags() | switch_on) & ~switch_off;
696 // Store current state and begin new disjunction parsing.
697 return new (zone())
698 RegExpParserState(state, subexpr_type, lookaround_type, captures_started_,
699 capture_name, flags, zone());
700 }
701
702 #ifdef DEBUG
703 // Currently only used in an DCHECK.
IsSpecialClassEscape(uc32 c)704 static bool IsSpecialClassEscape(uc32 c) {
705 switch (c) {
706 case 'd':
707 case 'D':
708 case 's':
709 case 'S':
710 case 'w':
711 case 'W':
712 return true;
713 default:
714 return false;
715 }
716 }
717 #endif
718
719
720 // In order to know whether an escape is a backreference or not we have to scan
721 // the entire regexp and find the number of capturing parentheses. However we
722 // don't want to scan the regexp twice unless it is necessary. This mini-parser
723 // is called when needed. It can see the difference between capturing and
724 // noncapturing parentheses and can skip character classes and backslash-escaped
725 // characters.
ScanForCaptures()726 void RegExpParser::ScanForCaptures() {
727 DCHECK(!is_scanned_for_captures_);
728 const int saved_position = position();
729 // Start with captures started previous to current position
730 int capture_count = captures_started();
731 // Add count of captures after this position.
732 int n;
733 while ((n = current()) != kEndMarker) {
734 Advance();
735 switch (n) {
736 case '\\':
737 Advance();
738 break;
739 case '[': {
740 int c;
741 while ((c = current()) != kEndMarker) {
742 Advance();
743 if (c == '\\') {
744 Advance();
745 } else {
746 if (c == ']') break;
747 }
748 }
749 break;
750 }
751 case '(':
752 if (current() == '?') {
753 // At this point we could be in
754 // * a non-capturing group '(:',
755 // * a lookbehind assertion '(?<=' '(?<!'
756 // * or a named capture '(?<'.
757 //
758 // Of these, only named captures are capturing groups.
759
760 Advance();
761 if (current() != '<') break;
762
763 Advance();
764 if (current() == '=' || current() == '!') break;
765
766 // Found a possible named capture. It could turn out to be a syntax
767 // error (e.g. an unterminated or invalid name), but that distinction
768 // does not matter for our purposes.
769 has_named_captures_ = true;
770 }
771 capture_count++;
772 break;
773 }
774 }
775 capture_count_ = capture_count;
776 is_scanned_for_captures_ = true;
777 Reset(saved_position);
778 }
779
780
ParseBackReferenceIndex(int * index_out)781 bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
782 DCHECK_EQ('\\', current());
783 DCHECK('1' <= Next() && Next() <= '9');
784 // Try to parse a decimal literal that is no greater than the total number
785 // of left capturing parentheses in the input.
786 int start = position();
787 int value = Next() - '0';
788 Advance(2);
789 while (true) {
790 uc32 c = current();
791 if (IsDecimalDigit(c)) {
792 value = 10 * value + (c - '0');
793 if (value > kMaxCaptures) {
794 Reset(start);
795 return false;
796 }
797 Advance();
798 } else {
799 break;
800 }
801 }
802 if (value > captures_started()) {
803 if (!is_scanned_for_captures_) ScanForCaptures();
804 if (value > capture_count_) {
805 Reset(start);
806 return false;
807 }
808 }
809 *index_out = value;
810 return true;
811 }
812
push_code_unit(ZoneVector<uc16> * v,uint32_t code_unit)813 static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {
814 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
815 v->push_back(code_unit);
816 } else {
817 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
818 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
819 }
820 }
821
ParseCaptureGroupName()822 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
823 ZoneVector<uc16>* name =
824 new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
825
826 bool at_start = true;
827 while (true) {
828 uc32 c = current();
829 Advance();
830
831 // Convert unicode escapes.
832 if (c == '\\' && current() == 'u') {
833 Advance();
834 if (!ParseUnicodeEscape(&c)) {
835 ReportError(CStrVector("Invalid Unicode escape sequence"));
836 return nullptr;
837 }
838 }
839
840 // The backslash char is misclassified as both ID_Start and ID_Continue.
841 if (c == '\\') {
842 ReportError(CStrVector("Invalid capture group name"));
843 return nullptr;
844 }
845
846 if (at_start) {
847 if (!IdentifierStart::Is(c)) {
848 ReportError(CStrVector("Invalid capture group name"));
849 return nullptr;
850 }
851 push_code_unit(name, c);
852 at_start = false;
853 } else {
854 if (c == '>') {
855 break;
856 } else if (IdentifierPart::Is(c)) {
857 push_code_unit(name, c);
858 } else {
859 ReportError(CStrVector("Invalid capture group name"));
860 return nullptr;
861 }
862 }
863 }
864
865 return name;
866 }
867
CreateNamedCaptureAtIndex(const ZoneVector<uc16> * name,int index)868 bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
869 int index) {
870 DCHECK(0 < index && index <= captures_started_);
871 DCHECK_NOT_NULL(name);
872
873 if (named_captures_ == nullptr) {
874 named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());
875 } else {
876 // Check for duplicates and bail if we find any.
877 // TODO(jgruber): O(n^2).
878 for (const auto& named_capture : *named_captures_) {
879 if (*named_capture->name() == *name) {
880 ReportError(CStrVector("Duplicate capture group name"));
881 return false;
882 }
883 }
884 }
885
886 RegExpCapture* capture = GetCapture(index);
887 DCHECK_NULL(capture->name());
888
889 capture->set_name(name);
890 named_captures_->Add(capture, zone());
891
892 return true;
893 }
894
ParseNamedBackReference(RegExpBuilder * builder,RegExpParserState * state)895 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
896 RegExpParserState* state) {
897 // The parser is assumed to be on the '<' in \k<name>.
898 if (current() != '<') {
899 ReportError(CStrVector("Invalid named reference"));
900 return false;
901 }
902
903 Advance();
904 const ZoneVector<uc16>* name = ParseCaptureGroupName();
905 if (name == nullptr) {
906 return false;
907 }
908
909 if (state->IsInsideCaptureGroup(name)) {
910 builder->AddEmpty();
911 } else {
912 RegExpBackReference* atom =
913 new (zone()) RegExpBackReference(builder->flags());
914 atom->set_name(name);
915
916 builder->AddAtom(atom);
917
918 if (named_back_references_ == nullptr) {
919 named_back_references_ =
920 new (zone()) ZoneList<RegExpBackReference*>(1, zone());
921 }
922 named_back_references_->Add(atom, zone());
923 }
924
925 return true;
926 }
927
PatchNamedBackReferences()928 void RegExpParser::PatchNamedBackReferences() {
929 if (named_back_references_ == nullptr) return;
930
931 if (named_captures_ == nullptr) {
932 ReportError(CStrVector("Invalid named capture referenced"));
933 return;
934 }
935
936 // Look up and patch the actual capture for each named back reference.
937 // TODO(jgruber): O(n^2), optimize if necessary.
938
939 for (int i = 0; i < named_back_references_->length(); i++) {
940 RegExpBackReference* ref = named_back_references_->at(i);
941
942 int index = -1;
943 for (const auto& capture : *named_captures_) {
944 if (*capture->name() == *ref->name()) {
945 index = capture->index();
946 break;
947 }
948 }
949
950 if (index == -1) {
951 ReportError(CStrVector("Invalid named capture referenced"));
952 return;
953 }
954
955 ref->set_capture(GetCapture(index));
956 }
957 }
958
GetCapture(int index)959 RegExpCapture* RegExpParser::GetCapture(int index) {
960 // The index for the capture groups are one-based. Its index in the list is
961 // zero-based.
962 int know_captures =
963 is_scanned_for_captures_ ? capture_count_ : captures_started_;
964 DCHECK(index <= know_captures);
965 if (captures_ == nullptr) {
966 captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
967 }
968 while (captures_->length() < know_captures) {
969 captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
970 }
971 return captures_->at(index - 1);
972 }
973
CreateCaptureNameMap()974 Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {
975 if (named_captures_ == nullptr || named_captures_->is_empty())
976 return Handle<FixedArray>();
977
978 Factory* factory = isolate()->factory();
979
980 int len = named_captures_->length() * 2;
981 Handle<FixedArray> array = factory->NewFixedArray(len);
982
983 for (int i = 0; i < named_captures_->length(); i++) {
984 RegExpCapture* capture = named_captures_->at(i);
985 MaybeHandle<String> name = factory->NewStringFromTwoByte(capture->name());
986 array->set(i * 2, *name.ToHandleChecked());
987 array->set(i * 2 + 1, Smi::FromInt(capture->index()));
988 }
989
990 return array;
991 }
992
HasNamedCaptures()993 bool RegExpParser::HasNamedCaptures() {
994 if (has_named_captures_ || is_scanned_for_captures_) {
995 return has_named_captures_;
996 }
997
998 ScanForCaptures();
999 DCHECK(is_scanned_for_captures_);
1000 return has_named_captures_;
1001 }
1002
IsInsideCaptureGroup(int index)1003 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
1004 for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
1005 if (s->group_type() != CAPTURE) continue;
1006 // Return true if we found the matching capture index.
1007 if (index == s->capture_index()) return true;
1008 // Abort if index is larger than what has been parsed up till this state.
1009 if (index > s->capture_index()) return false;
1010 }
1011 return false;
1012 }
1013
IsInsideCaptureGroup(const ZoneVector<uc16> * name)1014 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(
1015 const ZoneVector<uc16>* name) {
1016 DCHECK_NOT_NULL(name);
1017 for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
1018 if (s->capture_name() == nullptr) continue;
1019 if (*s->capture_name() == *name) return true;
1020 }
1021 return false;
1022 }
1023
1024 // QuantifierPrefix ::
1025 // { DecimalDigits }
1026 // { DecimalDigits , }
1027 // { DecimalDigits , DecimalDigits }
1028 //
1029 // Returns true if parsing succeeds, and set the min_out and max_out
1030 // values. Values are truncated to RegExpTree::kInfinity if they overflow.
ParseIntervalQuantifier(int * min_out,int * max_out)1031 bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
1032 DCHECK_EQ(current(), '{');
1033 int start = position();
1034 Advance();
1035 int min = 0;
1036 if (!IsDecimalDigit(current())) {
1037 Reset(start);
1038 return false;
1039 }
1040 while (IsDecimalDigit(current())) {
1041 int next = current() - '0';
1042 if (min > (RegExpTree::kInfinity - next) / 10) {
1043 // Overflow. Skip past remaining decimal digits and return -1.
1044 do {
1045 Advance();
1046 } while (IsDecimalDigit(current()));
1047 min = RegExpTree::kInfinity;
1048 break;
1049 }
1050 min = 10 * min + next;
1051 Advance();
1052 }
1053 int max = 0;
1054 if (current() == '}') {
1055 max = min;
1056 Advance();
1057 } else if (current() == ',') {
1058 Advance();
1059 if (current() == '}') {
1060 max = RegExpTree::kInfinity;
1061 Advance();
1062 } else {
1063 while (IsDecimalDigit(current())) {
1064 int next = current() - '0';
1065 if (max > (RegExpTree::kInfinity - next) / 10) {
1066 do {
1067 Advance();
1068 } while (IsDecimalDigit(current()));
1069 max = RegExpTree::kInfinity;
1070 break;
1071 }
1072 max = 10 * max + next;
1073 Advance();
1074 }
1075 if (current() != '}') {
1076 Reset(start);
1077 return false;
1078 }
1079 Advance();
1080 }
1081 } else {
1082 Reset(start);
1083 return false;
1084 }
1085 *min_out = min;
1086 *max_out = max;
1087 return true;
1088 }
1089
1090
ParseOctalLiteral()1091 uc32 RegExpParser::ParseOctalLiteral() {
1092 DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
1093 // For compatibility with some other browsers (not all), we parse
1094 // up to three octal digits with a value below 256.
1095 // ES#prod-annexB-LegacyOctalEscapeSequence
1096 uc32 value = current() - '0';
1097 Advance();
1098 if ('0' <= current() && current() <= '7') {
1099 value = value * 8 + current() - '0';
1100 Advance();
1101 if (value < 32 && '0' <= current() && current() <= '7') {
1102 value = value * 8 + current() - '0';
1103 Advance();
1104 }
1105 }
1106 return value;
1107 }
1108
1109
ParseHexEscape(int length,uc32 * value)1110 bool RegExpParser::ParseHexEscape(int length, uc32* value) {
1111 int start = position();
1112 uc32 val = 0;
1113 for (int i = 0; i < length; ++i) {
1114 uc32 c = current();
1115 int d = HexValue(c);
1116 if (d < 0) {
1117 Reset(start);
1118 return false;
1119 }
1120 val = val * 16 + d;
1121 Advance();
1122 }
1123 *value = val;
1124 return true;
1125 }
1126
1127 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
ParseUnicodeEscape(uc32 * value)1128 bool RegExpParser::ParseUnicodeEscape(uc32* value) {
1129 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
1130 // allowed). In the latter case, the number of hex digits between { } is
1131 // arbitrary. \ and u have already been read.
1132 if (current() == '{' && unicode()) {
1133 int start = position();
1134 Advance();
1135 if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) {
1136 if (current() == '}') {
1137 Advance();
1138 return true;
1139 }
1140 }
1141 Reset(start);
1142 return false;
1143 }
1144 // \u but no {, or \u{...} escapes not allowed.
1145 bool result = ParseHexEscape(4, value);
1146 if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
1147 current() == '\\') {
1148 // Attempt to read trail surrogate.
1149 int start = position();
1150 if (Next() == 'u') {
1151 Advance(2);
1152 uc32 trail;
1153 if (ParseHexEscape(4, &trail) &&
1154 unibrow::Utf16::IsTrailSurrogate(trail)) {
1155 *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
1156 static_cast<uc16>(trail));
1157 return true;
1158 }
1159 }
1160 Reset(start);
1161 }
1162 return result;
1163 }
1164
1165 #ifdef V8_INTL_SUPPORT
1166
1167 namespace {
1168
IsExactPropertyAlias(const char * property_name,UProperty property)1169 bool IsExactPropertyAlias(const char* property_name, UProperty property) {
1170 const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
1171 if (short_name != nullptr && strcmp(property_name, short_name) == 0)
1172 return true;
1173 for (int i = 0;; i++) {
1174 const char* long_name = u_getPropertyName(
1175 property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1176 if (long_name == nullptr) break;
1177 if (strcmp(property_name, long_name) == 0) return true;
1178 }
1179 return false;
1180 }
1181
IsExactPropertyValueAlias(const char * property_value_name,UProperty property,int32_t property_value)1182 bool IsExactPropertyValueAlias(const char* property_value_name,
1183 UProperty property, int32_t property_value) {
1184 const char* short_name =
1185 u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
1186 if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) {
1187 return true;
1188 }
1189 for (int i = 0;; i++) {
1190 const char* long_name = u_getPropertyValueName(
1191 property, property_value,
1192 static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1193 if (long_name == nullptr) break;
1194 if (strcmp(property_value_name, long_name) == 0) return true;
1195 }
1196 return false;
1197 }
1198
LookupPropertyValueName(UProperty property,const char * property_value_name,bool negate,ZoneList<CharacterRange> * result,Zone * zone)1199 bool LookupPropertyValueName(UProperty property,
1200 const char* property_value_name, bool negate,
1201 ZoneList<CharacterRange>* result, Zone* zone) {
1202 UProperty property_for_lookup = property;
1203 if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
1204 // For the property Script_Extensions, we have to do the property value
1205 // name lookup as if the property is Script.
1206 property_for_lookup = UCHAR_SCRIPT;
1207 }
1208 int32_t property_value =
1209 u_getPropertyValueEnum(property_for_lookup, property_value_name);
1210 if (property_value == UCHAR_INVALID_CODE) return false;
1211
1212 // We require the property name to match exactly to one of the property value
1213 // aliases. However, u_getPropertyValueEnum uses loose matching.
1214 if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
1215 property_value)) {
1216 return false;
1217 }
1218
1219 UErrorCode ec = U_ZERO_ERROR;
1220 icu::UnicodeSet set;
1221 set.applyIntPropertyValue(property, property_value, ec);
1222 bool success = ec == U_ZERO_ERROR && !set.isEmpty();
1223
1224 if (success) {
1225 set.removeAllStrings();
1226 if (negate) set.complement();
1227 for (int i = 0; i < set.getRangeCount(); i++) {
1228 result->Add(
1229 CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
1230 zone);
1231 }
1232 }
1233 return success;
1234 }
1235
1236 template <size_t N>
NameEquals(const char * name,const char (& literal)[N])1237 inline bool NameEquals(const char* name, const char (&literal)[N]) {
1238 return strncmp(name, literal, N + 1) == 0;
1239 }
1240
LookupSpecialPropertyValueName(const char * name,ZoneList<CharacterRange> * result,bool negate,Zone * zone)1241 bool LookupSpecialPropertyValueName(const char* name,
1242 ZoneList<CharacterRange>* result,
1243 bool negate, Zone* zone) {
1244 if (NameEquals(name, "Any")) {
1245 if (negate) {
1246 // Leave the list of character ranges empty, since the negation of 'Any'
1247 // is the empty set.
1248 } else {
1249 result->Add(CharacterRange::Everything(), zone);
1250 }
1251 } else if (NameEquals(name, "ASCII")) {
1252 result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
1253 : CharacterRange::Range(0x0, 0x7F),
1254 zone);
1255 } else if (NameEquals(name, "Assigned")) {
1256 return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
1257 !negate, result, zone);
1258 } else {
1259 return false;
1260 }
1261 return true;
1262 }
1263
1264 // Explicitly whitelist supported binary properties. The spec forbids supporting
1265 // properties outside of this set to ensure interoperability.
IsSupportedBinaryProperty(UProperty property)1266 bool IsSupportedBinaryProperty(UProperty property) {
1267 switch (property) {
1268 case UCHAR_ALPHABETIC:
1269 // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName.
1270 // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName.
1271 case UCHAR_ASCII_HEX_DIGIT:
1272 // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName.
1273 case UCHAR_BIDI_CONTROL:
1274 case UCHAR_BIDI_MIRRORED:
1275 case UCHAR_CASE_IGNORABLE:
1276 case UCHAR_CASED:
1277 case UCHAR_CHANGES_WHEN_CASEFOLDED:
1278 case UCHAR_CHANGES_WHEN_CASEMAPPED:
1279 case UCHAR_CHANGES_WHEN_LOWERCASED:
1280 case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
1281 case UCHAR_CHANGES_WHEN_TITLECASED:
1282 case UCHAR_CHANGES_WHEN_UPPERCASED:
1283 case UCHAR_DASH:
1284 case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
1285 case UCHAR_DEPRECATED:
1286 case UCHAR_DIACRITIC:
1287 case UCHAR_EMOJI:
1288 #if U_ICU_VERSION_MAJOR_NUM >= 60
1289 case UCHAR_EMOJI_COMPONENT:
1290 #endif
1291 case UCHAR_EMOJI_MODIFIER_BASE:
1292 case UCHAR_EMOJI_MODIFIER:
1293 case UCHAR_EMOJI_PRESENTATION:
1294 #if U_ICU_VERSION_MAJOR_NUM >= 62
1295 case UCHAR_EXTENDED_PICTOGRAPHIC:
1296 #endif
1297 case UCHAR_EXTENDER:
1298 case UCHAR_GRAPHEME_BASE:
1299 case UCHAR_GRAPHEME_EXTEND:
1300 case UCHAR_HEX_DIGIT:
1301 case UCHAR_ID_CONTINUE:
1302 case UCHAR_ID_START:
1303 case UCHAR_IDEOGRAPHIC:
1304 case UCHAR_IDS_BINARY_OPERATOR:
1305 case UCHAR_IDS_TRINARY_OPERATOR:
1306 case UCHAR_JOIN_CONTROL:
1307 case UCHAR_LOGICAL_ORDER_EXCEPTION:
1308 case UCHAR_LOWERCASE:
1309 case UCHAR_MATH:
1310 case UCHAR_NONCHARACTER_CODE_POINT:
1311 case UCHAR_PATTERN_SYNTAX:
1312 case UCHAR_PATTERN_WHITE_SPACE:
1313 case UCHAR_QUOTATION_MARK:
1314 case UCHAR_RADICAL:
1315 #if U_ICU_VERSION_MAJOR_NUM >= 60
1316 case UCHAR_REGIONAL_INDICATOR:
1317 #endif
1318 case UCHAR_S_TERM:
1319 case UCHAR_SOFT_DOTTED:
1320 case UCHAR_TERMINAL_PUNCTUATION:
1321 case UCHAR_UNIFIED_IDEOGRAPH:
1322 case UCHAR_UPPERCASE:
1323 case UCHAR_VARIATION_SELECTOR:
1324 case UCHAR_WHITE_SPACE:
1325 case UCHAR_XID_CONTINUE:
1326 case UCHAR_XID_START:
1327 return true;
1328 default:
1329 break;
1330 }
1331 return false;
1332 }
1333
IsUnicodePropertyValueCharacter(char c)1334 bool IsUnicodePropertyValueCharacter(char c) {
1335 // https://tc39.github.io/proposal-regexp-unicode-property-escapes/
1336 //
1337 // Note that using this to validate each parsed char is quite conservative.
1338 // A possible alternative solution would be to only ensure the parsed
1339 // property name/value candidate string does not contain '\0' characters and
1340 // let ICU lookups trigger the final failure.
1341 if ('a' <= c && c <= 'z') return true;
1342 if ('A' <= c && c <= 'Z') return true;
1343 if ('0' <= c && c <= '9') return true;
1344 return (c == '_');
1345 }
1346
1347 } // anonymous namespace
1348
ParsePropertyClass(ZoneList<CharacterRange> * result,bool negate)1349 bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
1350 bool negate) {
1351 // Parse the property class as follows:
1352 // - In \p{name}, 'name' is interpreted
1353 // - either as a general category property value name.
1354 // - or as a binary property name.
1355 // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
1356 // and 'value' is interpreted as one of the available property value names.
1357 // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
1358 // - Loose matching is not applied.
1359 std::vector<char> first_part;
1360 std::vector<char> second_part;
1361 if (current() == '{') {
1362 // Parse \p{[PropertyName=]PropertyNameValue}
1363 for (Advance(); current() != '}' && current() != '='; Advance()) {
1364 if (!IsUnicodePropertyValueCharacter(current())) return false;
1365 if (!has_next()) return false;
1366 first_part.push_back(static_cast<char>(current()));
1367 }
1368 if (current() == '=') {
1369 for (Advance(); current() != '}'; Advance()) {
1370 if (!IsUnicodePropertyValueCharacter(current())) return false;
1371 if (!has_next()) return false;
1372 second_part.push_back(static_cast<char>(current()));
1373 }
1374 second_part.push_back(0); // null-terminate string.
1375 }
1376 } else {
1377 return false;
1378 }
1379 Advance();
1380 first_part.push_back(0); // null-terminate string.
1381
1382 DCHECK(first_part.size() - 1 == std::strlen(first_part.data()));
1383 DCHECK(second_part.empty() ||
1384 second_part.size() - 1 == std::strlen(second_part.data()));
1385
1386 if (second_part.empty()) {
1387 // First attempt to interpret as general category property value name.
1388 const char* name = first_part.data();
1389 if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
1390 result, zone())) {
1391 return true;
1392 }
1393 // Interpret "Any", "ASCII", and "Assigned".
1394 if (LookupSpecialPropertyValueName(name, result, negate, zone())) {
1395 return true;
1396 }
1397 // Then attempt to interpret as binary property name with value name 'Y'.
1398 UProperty property = u_getPropertyEnum(name);
1399 if (!IsSupportedBinaryProperty(property)) return false;
1400 if (!IsExactPropertyAlias(name, property)) return false;
1401 return LookupPropertyValueName(property, negate ? "N" : "Y", false, result,
1402 zone());
1403 } else {
1404 // Both property name and value name are specified. Attempt to interpret
1405 // the property name as enumerated property.
1406 const char* property_name = first_part.data();
1407 const char* value_name = second_part.data();
1408 UProperty property = u_getPropertyEnum(property_name);
1409 if (!IsExactPropertyAlias(property_name, property)) return false;
1410 if (property == UCHAR_GENERAL_CATEGORY) {
1411 // We want to allow aggregate value names such as "Letter".
1412 property = UCHAR_GENERAL_CATEGORY_MASK;
1413 } else if (property != UCHAR_SCRIPT &&
1414 property != UCHAR_SCRIPT_EXTENSIONS) {
1415 return false;
1416 }
1417 return LookupPropertyValueName(property, value_name, negate, result,
1418 zone());
1419 }
1420 }
1421
1422 #else // V8_INTL_SUPPORT
1423
ParsePropertyClass(ZoneList<CharacterRange> * result,bool negate)1424 bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
1425 bool negate) {
1426 return false;
1427 }
1428
1429 #endif // V8_INTL_SUPPORT
1430
ParseUnlimitedLengthHexNumber(int max_value,uc32 * value)1431 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
1432 uc32 x = 0;
1433 int d = HexValue(current());
1434 if (d < 0) {
1435 return false;
1436 }
1437 while (d >= 0) {
1438 x = x * 16 + d;
1439 if (x > max_value) {
1440 return false;
1441 }
1442 Advance();
1443 d = HexValue(current());
1444 }
1445 *value = x;
1446 return true;
1447 }
1448
1449
ParseClassCharacterEscape()1450 uc32 RegExpParser::ParseClassCharacterEscape() {
1451 DCHECK_EQ('\\', current());
1452 DCHECK(has_next() && !IsSpecialClassEscape(Next()));
1453 Advance();
1454 switch (current()) {
1455 case 'b':
1456 Advance();
1457 return '\b';
1458 // ControlEscape :: one of
1459 // f n r t v
1460 case 'f':
1461 Advance();
1462 return '\f';
1463 case 'n':
1464 Advance();
1465 return '\n';
1466 case 'r':
1467 Advance();
1468 return '\r';
1469 case 't':
1470 Advance();
1471 return '\t';
1472 case 'v':
1473 Advance();
1474 return '\v';
1475 case 'c': {
1476 uc32 controlLetter = Next();
1477 uc32 letter = controlLetter & ~('A' ^ 'a');
1478 // Inside a character class, we also accept digits and underscore as
1479 // control characters, unless with /u. See Annex B:
1480 // ES#prod-annexB-ClassControlLetter
1481 if (letter >= 'A' && letter <= 'Z') {
1482 Advance(2);
1483 // Control letters mapped to ASCII control characters in the range
1484 // 0x00-0x1F.
1485 return controlLetter & 0x1F;
1486 }
1487 if (unicode()) {
1488 // With /u, invalid escapes are not treated as identity escapes.
1489 ReportError(CStrVector("Invalid class escape"));
1490 return 0;
1491 }
1492 if ((controlLetter >= '0' && controlLetter <= '9') ||
1493 controlLetter == '_') {
1494 Advance(2);
1495 return controlLetter & 0x1F;
1496 }
1497 // We match JSC in reading the backslash as a literal
1498 // character instead of as starting an escape.
1499 // TODO(v8:6201): Not yet covered by the spec.
1500 return '\\';
1501 }
1502 case '0':
1503 // With /u, \0 is interpreted as NUL if not followed by another digit.
1504 if (unicode() && !(Next() >= '0' && Next() <= '9')) {
1505 Advance();
1506 return 0;
1507 }
1508 V8_FALLTHROUGH;
1509 case '1':
1510 case '2':
1511 case '3':
1512 case '4':
1513 case '5':
1514 case '6':
1515 case '7':
1516 // For compatibility, we interpret a decimal escape that isn't
1517 // a back reference (and therefore either \0 or not valid according
1518 // to the specification) as a 1..3 digit octal character code.
1519 // ES#prod-annexB-LegacyOctalEscapeSequence
1520 if (unicode()) {
1521 // With /u, decimal escape is not interpreted as octal character code.
1522 ReportError(CStrVector("Invalid class escape"));
1523 return 0;
1524 }
1525 return ParseOctalLiteral();
1526 case 'x': {
1527 Advance();
1528 uc32 value;
1529 if (ParseHexEscape(2, &value)) return value;
1530 if (unicode()) {
1531 // With /u, invalid escapes are not treated as identity escapes.
1532 ReportError(CStrVector("Invalid escape"));
1533 return 0;
1534 }
1535 // If \x is not followed by a two-digit hexadecimal, treat it
1536 // as an identity escape.
1537 return 'x';
1538 }
1539 case 'u': {
1540 Advance();
1541 uc32 value;
1542 if (ParseUnicodeEscape(&value)) return value;
1543 if (unicode()) {
1544 // With /u, invalid escapes are not treated as identity escapes.
1545 ReportError(CStrVector("Invalid unicode escape"));
1546 return 0;
1547 }
1548 // If \u is not followed by a two-digit hexadecimal, treat it
1549 // as an identity escape.
1550 return 'u';
1551 }
1552 default: {
1553 uc32 result = current();
1554 // With /u, no identity escapes except for syntax characters and '-' are
1555 // allowed. Otherwise, all identity escapes are allowed.
1556 if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
1557 Advance();
1558 return result;
1559 }
1560 ReportError(CStrVector("Invalid escape"));
1561 return 0;
1562 }
1563 }
1564 return 0;
1565 }
1566
ParseClassEscape(ZoneList<CharacterRange> * ranges,Zone * zone,bool add_unicode_case_equivalents,uc32 * char_out,bool * is_class_escape)1567 void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges,
1568 Zone* zone,
1569 bool add_unicode_case_equivalents,
1570 uc32* char_out, bool* is_class_escape) {
1571 uc32 current_char = current();
1572 if (current_char == '\\') {
1573 switch (Next()) {
1574 case 'w':
1575 case 'W':
1576 case 'd':
1577 case 'D':
1578 case 's':
1579 case 'S': {
1580 CharacterRange::AddClassEscape(static_cast<char>(Next()), ranges,
1581 add_unicode_case_equivalents, zone);
1582 Advance(2);
1583 *is_class_escape = true;
1584 return;
1585 }
1586 case kEndMarker:
1587 ReportError(CStrVector("\\ at end of pattern"));
1588 return;
1589 case 'p':
1590 case 'P':
1591 if (unicode()) {
1592 bool negate = Next() == 'P';
1593 Advance(2);
1594 if (!ParsePropertyClass(ranges, negate)) {
1595 ReportError(CStrVector("Invalid property name in character class"));
1596 }
1597 *is_class_escape = true;
1598 return;
1599 }
1600 break;
1601 default:
1602 break;
1603 }
1604 *char_out = ParseClassCharacterEscape();
1605 *is_class_escape = false;
1606 } else {
1607 Advance();
1608 *char_out = current_char;
1609 *is_class_escape = false;
1610 }
1611 }
1612
ParseCharacterClass(const RegExpBuilder * builder)1613 RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) {
1614 static const char* kUnterminated = "Unterminated character class";
1615 static const char* kRangeInvalid = "Invalid character class";
1616 static const char* kRangeOutOfOrder = "Range out of order in character class";
1617
1618 DCHECK_EQ(current(), '[');
1619 Advance();
1620 bool is_negated = false;
1621 if (current() == '^') {
1622 is_negated = true;
1623 Advance();
1624 }
1625 ZoneList<CharacterRange>* ranges =
1626 new (zone()) ZoneList<CharacterRange>(2, zone());
1627 bool add_unicode_case_equivalents = unicode() && builder->ignore_case();
1628 while (has_more() && current() != ']') {
1629 uc32 char_1, char_2;
1630 bool is_class_1, is_class_2;
1631 ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1,
1632 &is_class_1 CHECK_FAILED);
1633 if (current() == '-') {
1634 Advance();
1635 if (current() == kEndMarker) {
1636 // If we reach the end we break out of the loop and let the
1637 // following code report an error.
1638 break;
1639 } else if (current() == ']') {
1640 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1641 ranges->Add(CharacterRange::Singleton('-'), zone());
1642 break;
1643 }
1644 ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2,
1645 &is_class_2 CHECK_FAILED);
1646 if (is_class_1 || is_class_2) {
1647 // Either end is an escaped character class. Treat the '-' verbatim.
1648 if (unicode()) {
1649 // ES2015 21.2.2.15.1 step 1.
1650 return ReportError(CStrVector(kRangeInvalid));
1651 }
1652 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1653 ranges->Add(CharacterRange::Singleton('-'), zone());
1654 if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone());
1655 continue;
1656 }
1657 // ES2015 21.2.2.15.1 step 6.
1658 if (char_1 > char_2) {
1659 return ReportError(CStrVector(kRangeOutOfOrder));
1660 }
1661 ranges->Add(CharacterRange::Range(char_1, char_2), zone());
1662 } else {
1663 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1664 }
1665 }
1666 if (!has_more()) {
1667 return ReportError(CStrVector(kUnterminated));
1668 }
1669 Advance();
1670 RegExpCharacterClass::CharacterClassFlags character_class_flags;
1671 if (is_negated) character_class_flags = RegExpCharacterClass::NEGATED;
1672 return new (zone()) RegExpCharacterClass(zone(), ranges, builder->flags(),
1673 character_class_flags);
1674 }
1675
1676
1677 #undef CHECK_FAILED
1678
1679
ParseRegExp(Isolate * isolate,Zone * zone,FlatStringReader * input,JSRegExp::Flags flags,RegExpCompileData * result)1680 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
1681 FlatStringReader* input, JSRegExp::Flags flags,
1682 RegExpCompileData* result) {
1683 DCHECK(result != nullptr);
1684 RegExpParser parser(input, &result->error, flags, isolate, zone);
1685 RegExpTree* tree = parser.ParsePattern();
1686 if (parser.failed()) {
1687 DCHECK(tree == nullptr);
1688 DCHECK(!result->error.is_null());
1689 } else {
1690 DCHECK(tree != nullptr);
1691 DCHECK(result->error.is_null());
1692 if (FLAG_trace_regexp_parser) {
1693 StdoutStream os;
1694 tree->Print(os, zone);
1695 os << "\n";
1696 }
1697 result->tree = tree;
1698 int capture_count = parser.captures_started();
1699 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
1700 result->contains_anchor = parser.contains_anchor();
1701 result->capture_name_map = parser.CreateCaptureNameMap();
1702 result->capture_count = capture_count;
1703 }
1704 return !parser.failed();
1705 }
1706
RegExpBuilder(Zone * zone,JSRegExp::Flags flags)1707 RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags)
1708 : zone_(zone),
1709 pending_empty_(false),
1710 flags_(flags),
1711 characters_(nullptr),
1712 pending_surrogate_(kNoPendingSurrogate),
1713 terms_(),
1714 alternatives_()
1715 #ifdef DEBUG
1716 ,
1717 last_added_(ADD_NONE)
1718 #endif
1719 {
1720 }
1721
1722
AddLeadSurrogate(uc16 lead_surrogate)1723 void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
1724 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1725 FlushPendingSurrogate();
1726 // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
1727 pending_surrogate_ = lead_surrogate;
1728 }
1729
1730
AddTrailSurrogate(uc16 trail_surrogate)1731 void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
1732 DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
1733 if (pending_surrogate_ != kNoPendingSurrogate) {
1734 uc16 lead_surrogate = pending_surrogate_;
1735 pending_surrogate_ = kNoPendingSurrogate;
1736 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1737 uc32 combined =
1738 unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
1739 if (NeedsDesugaringForIgnoreCase(combined)) {
1740 AddCharacterClassForDesugaring(combined);
1741 } else {
1742 ZoneList<uc16> surrogate_pair(2, zone());
1743 surrogate_pair.Add(lead_surrogate, zone());
1744 surrogate_pair.Add(trail_surrogate, zone());
1745 RegExpAtom* atom =
1746 new (zone()) RegExpAtom(surrogate_pair.ToConstVector(), flags_);
1747 AddAtom(atom);
1748 }
1749 } else {
1750 pending_surrogate_ = trail_surrogate;
1751 FlushPendingSurrogate();
1752 }
1753 }
1754
1755
FlushPendingSurrogate()1756 void RegExpBuilder::FlushPendingSurrogate() {
1757 if (pending_surrogate_ != kNoPendingSurrogate) {
1758 DCHECK(unicode());
1759 uc32 c = pending_surrogate_;
1760 pending_surrogate_ = kNoPendingSurrogate;
1761 AddCharacterClassForDesugaring(c);
1762 }
1763 }
1764
1765
FlushCharacters()1766 void RegExpBuilder::FlushCharacters() {
1767 FlushPendingSurrogate();
1768 pending_empty_ = false;
1769 if (characters_ != nullptr) {
1770 RegExpTree* atom =
1771 new (zone()) RegExpAtom(characters_->ToConstVector(), flags_);
1772 characters_ = nullptr;
1773 text_.Add(atom, zone());
1774 LAST(ADD_ATOM);
1775 }
1776 }
1777
1778
FlushText()1779 void RegExpBuilder::FlushText() {
1780 FlushCharacters();
1781 int num_text = text_.length();
1782 if (num_text == 0) {
1783 return;
1784 } else if (num_text == 1) {
1785 terms_.Add(text_.last(), zone());
1786 } else {
1787 RegExpText* text = new (zone()) RegExpText(zone());
1788 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
1789 terms_.Add(text, zone());
1790 }
1791 text_.Clear();
1792 }
1793
1794
AddCharacter(uc16 c)1795 void RegExpBuilder::AddCharacter(uc16 c) {
1796 FlushPendingSurrogate();
1797 pending_empty_ = false;
1798 if (NeedsDesugaringForIgnoreCase(c)) {
1799 AddCharacterClassForDesugaring(c);
1800 } else {
1801 if (characters_ == nullptr) {
1802 characters_ = new (zone()) ZoneList<uc16>(4, zone());
1803 }
1804 characters_->Add(c, zone());
1805 LAST(ADD_CHAR);
1806 }
1807 }
1808
1809
AddUnicodeCharacter(uc32 c)1810 void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
1811 if (c > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
1812 DCHECK(unicode());
1813 AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
1814 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
1815 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
1816 AddLeadSurrogate(c);
1817 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
1818 AddTrailSurrogate(c);
1819 } else {
1820 AddCharacter(static_cast<uc16>(c));
1821 }
1822 }
1823
AddEscapedUnicodeCharacter(uc32 character)1824 void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
1825 // A lead or trail surrogate parsed via escape sequence will not
1826 // pair up with any preceding lead or following trail surrogate.
1827 FlushPendingSurrogate();
1828 AddUnicodeCharacter(character);
1829 FlushPendingSurrogate();
1830 }
1831
AddEmpty()1832 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
1833
1834
AddCharacterClass(RegExpCharacterClass * cc)1835 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
1836 if (NeedsDesugaringForUnicode(cc)) {
1837 // With /u, character class needs to be desugared, so it
1838 // must be a standalone term instead of being part of a RegExpText.
1839 AddTerm(cc);
1840 } else {
1841 AddAtom(cc);
1842 }
1843 }
1844
AddCharacterClassForDesugaring(uc32 c)1845 void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
1846 AddTerm(new (zone()) RegExpCharacterClass(
1847 zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c)),
1848 flags_));
1849 }
1850
1851
AddAtom(RegExpTree * term)1852 void RegExpBuilder::AddAtom(RegExpTree* term) {
1853 if (term->IsEmpty()) {
1854 AddEmpty();
1855 return;
1856 }
1857 if (term->IsTextElement()) {
1858 FlushCharacters();
1859 text_.Add(term, zone());
1860 } else {
1861 FlushText();
1862 terms_.Add(term, zone());
1863 }
1864 LAST(ADD_ATOM);
1865 }
1866
1867
AddTerm(RegExpTree * term)1868 void RegExpBuilder::AddTerm(RegExpTree* term) {
1869 FlushText();
1870 terms_.Add(term, zone());
1871 LAST(ADD_ATOM);
1872 }
1873
1874
AddAssertion(RegExpTree * assert)1875 void RegExpBuilder::AddAssertion(RegExpTree* assert) {
1876 FlushText();
1877 if (terms_.length() > 0 && terms_.last()->IsAssertion()) {
1878 // Omit repeated assertions of the same type.
1879 RegExpAssertion* last = terms_.last()->AsAssertion();
1880 RegExpAssertion* next = assert->AsAssertion();
1881 if (last->assertion_type() == next->assertion_type()) return;
1882 }
1883 terms_.Add(assert, zone());
1884 LAST(ADD_ASSERT);
1885 }
1886
1887
NewAlternative()1888 void RegExpBuilder::NewAlternative() { FlushTerms(); }
1889
1890
FlushTerms()1891 void RegExpBuilder::FlushTerms() {
1892 FlushText();
1893 int num_terms = terms_.length();
1894 RegExpTree* alternative;
1895 if (num_terms == 0) {
1896 alternative = new (zone()) RegExpEmpty();
1897 } else if (num_terms == 1) {
1898 alternative = terms_.last();
1899 } else {
1900 alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));
1901 }
1902 alternatives_.Add(alternative, zone());
1903 terms_.Clear();
1904 LAST(ADD_NONE);
1905 }
1906
1907
NeedsDesugaringForUnicode(RegExpCharacterClass * cc)1908 bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
1909 if (!unicode()) return false;
1910 // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
1911 // necessarily mean that we need to desugar. It's probably nicer to have a
1912 // separate pass to figure out unicode desugarings.
1913 if (ignore_case()) return true;
1914 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
1915 CharacterRange::Canonicalize(ranges);
1916 for (int i = ranges->length() - 1; i >= 0; i--) {
1917 uc32 from = ranges->at(i).from();
1918 uc32 to = ranges->at(i).to();
1919 // Check for non-BMP characters.
1920 if (to >= kNonBmpStart) return true;
1921 // Check for lone surrogates.
1922 if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
1923 }
1924 return false;
1925 }
1926
1927
NeedsDesugaringForIgnoreCase(uc32 c)1928 bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
1929 #ifdef V8_INTL_SUPPORT
1930 if (unicode() && ignore_case()) {
1931 icu::UnicodeSet set(c, c);
1932 set.closeOver(USET_CASE_INSENSITIVE);
1933 set.removeAllStrings();
1934 return set.size() > 1;
1935 }
1936 // In the case where ICU is not included, we act as if the unicode flag is
1937 // not set, and do not desugar.
1938 #endif // V8_INTL_SUPPORT
1939 return false;
1940 }
1941
1942
ToRegExp()1943 RegExpTree* RegExpBuilder::ToRegExp() {
1944 FlushTerms();
1945 int num_alternatives = alternatives_.length();
1946 if (num_alternatives == 0) return new (zone()) RegExpEmpty();
1947 if (num_alternatives == 1) return alternatives_.last();
1948 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
1949 }
1950
AddQuantifierToAtom(int min,int max,RegExpQuantifier::QuantifierType quantifier_type)1951 bool RegExpBuilder::AddQuantifierToAtom(
1952 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
1953 FlushPendingSurrogate();
1954 if (pending_empty_) {
1955 pending_empty_ = false;
1956 return true;
1957 }
1958 RegExpTree* atom;
1959 if (characters_ != nullptr) {
1960 DCHECK(last_added_ == ADD_CHAR);
1961 // Last atom was character.
1962 Vector<const uc16> char_vector = characters_->ToConstVector();
1963 int num_chars = char_vector.length();
1964 if (num_chars > 1) {
1965 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
1966 text_.Add(new (zone()) RegExpAtom(prefix, flags_), zone());
1967 char_vector = char_vector.SubVector(num_chars - 1, num_chars);
1968 }
1969 characters_ = nullptr;
1970 atom = new (zone()) RegExpAtom(char_vector, flags_);
1971 FlushText();
1972 } else if (text_.length() > 0) {
1973 DCHECK(last_added_ == ADD_ATOM);
1974 atom = text_.RemoveLast();
1975 FlushText();
1976 } else if (terms_.length() > 0) {
1977 DCHECK(last_added_ == ADD_ATOM);
1978 atom = terms_.RemoveLast();
1979 if (atom->IsLookaround()) {
1980 // With /u, lookarounds are not quantifiable.
1981 if (unicode()) return false;
1982 // Lookbehinds are not quantifiable.
1983 if (atom->AsLookaround()->type() == RegExpLookaround::LOOKBEHIND) {
1984 return false;
1985 }
1986 }
1987 if (atom->max_match() == 0) {
1988 // Guaranteed to only match an empty string.
1989 LAST(ADD_TERM);
1990 if (min == 0) {
1991 return true;
1992 }
1993 terms_.Add(atom, zone());
1994 return true;
1995 }
1996 } else {
1997 // Only call immediately after adding an atom or character!
1998 UNREACHABLE();
1999 }
2000 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
2001 zone());
2002 LAST(ADD_TERM);
2003 return true;
2004 }
2005
2006 } // namespace internal
2007 } // namespace v8
2008