1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/regexp/regexp-parser.h"
6
7 #include "src/char-predicates-inl.h"
8 #include "src/factory.h"
9 #include "src/isolate.h"
10 #include "src/objects-inl.h"
11 #include "src/ostreams.h"
12 #include "src/regexp/jsregexp.h"
13 #include "src/utils.h"
14
15 #ifdef V8_I18N_SUPPORT
16 #include "unicode/uniset.h"
17 #endif // V8_I18N_SUPPORT
18
19 namespace v8 {
20 namespace internal {
21
RegExpParser(FlatStringReader * in,Handle<String> * error,JSRegExp::Flags flags,Isolate * isolate,Zone * zone)22 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
23 JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
24 : isolate_(isolate),
25 zone_(zone),
26 error_(error),
27 captures_(NULL),
28 named_captures_(NULL),
29 named_back_references_(NULL),
30 in_(in),
31 current_(kEndMarker),
32 ignore_case_(flags & JSRegExp::kIgnoreCase),
33 multiline_(flags & JSRegExp::kMultiline),
34 unicode_(flags & JSRegExp::kUnicode),
35 next_pos_(0),
36 captures_started_(0),
37 capture_count_(0),
38 has_more_(true),
39 simple_(false),
40 contains_anchor_(false),
41 is_scanned_for_captures_(false),
42 failed_(false) {
43 Advance();
44 }
45
46 template <bool update_position>
ReadNext()47 inline uc32 RegExpParser::ReadNext() {
48 int position = next_pos_;
49 uc32 c0 = in()->Get(position);
50 position++;
51 // Read the whole surrogate pair in case of unicode flag, if possible.
52 if (unicode() && position < in()->length() &&
53 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
54 uc16 c1 = in()->Get(position);
55 if (unibrow::Utf16::IsTrailSurrogate(c1)) {
56 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
57 position++;
58 }
59 }
60 if (update_position) next_pos_ = position;
61 return c0;
62 }
63
64
Next()65 uc32 RegExpParser::Next() {
66 if (has_next()) {
67 return ReadNext<false>();
68 } else {
69 return kEndMarker;
70 }
71 }
72
73
Advance()74 void RegExpParser::Advance() {
75 if (has_next()) {
76 StackLimitCheck check(isolate());
77 if (check.HasOverflowed()) {
78 if (FLAG_abort_on_stack_overflow) FATAL("Aborting on stack overflow");
79 ReportError(CStrVector(
80 MessageTemplate::TemplateString(MessageTemplate::kStackOverflow)));
81 } else if (zone()->excess_allocation()) {
82 ReportError(CStrVector("Regular expression too large"));
83 } else {
84 current_ = ReadNext<true>();
85 }
86 } else {
87 current_ = kEndMarker;
88 // Advance so that position() points to 1-after-the-last-character. This is
89 // important so that Reset() to this position works correctly.
90 next_pos_ = in()->length() + 1;
91 has_more_ = false;
92 }
93 }
94
95
Reset(int pos)96 void RegExpParser::Reset(int pos) {
97 next_pos_ = pos;
98 has_more_ = (pos < in()->length());
99 Advance();
100 }
101
102
Advance(int dist)103 void RegExpParser::Advance(int dist) {
104 next_pos_ += dist - 1;
105 Advance();
106 }
107
108
simple()109 bool RegExpParser::simple() { return simple_; }
110
IsSyntaxCharacterOrSlash(uc32 c)111 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
112 switch (c) {
113 case '^':
114 case '$':
115 case '\\':
116 case '.':
117 case '*':
118 case '+':
119 case '?':
120 case '(':
121 case ')':
122 case '[':
123 case ']':
124 case '{':
125 case '}':
126 case '|':
127 case '/':
128 return true;
129 default:
130 break;
131 }
132 return false;
133 }
134
135
ReportError(Vector<const char> message)136 RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
137 if (failed_) return NULL; // Do not overwrite any existing error.
138 failed_ = true;
139 *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked();
140 // Zip to the end to make sure the no more input is read.
141 current_ = kEndMarker;
142 next_pos_ = in()->length();
143 return NULL;
144 }
145
146
147 #define CHECK_FAILED /**/); \
148 if (failed_) return NULL; \
149 ((void)0
150
151
152 // Pattern ::
153 // Disjunction
ParsePattern()154 RegExpTree* RegExpParser::ParsePattern() {
155 RegExpTree* result = ParseDisjunction(CHECK_FAILED);
156 PatchNamedBackReferences(CHECK_FAILED);
157 DCHECK(!has_more());
158 // If the result of parsing is a literal string atom, and it has the
159 // same length as the input, then the atom is identical to the input.
160 if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
161 simple_ = true;
162 }
163 return result;
164 }
165
166
167 // Disjunction ::
168 // Alternative
169 // Alternative | Disjunction
170 // Alternative ::
171 // [empty]
172 // Term Alternative
173 // Term ::
174 // Assertion
175 // Atom
176 // Atom Quantifier
ParseDisjunction()177 RegExpTree* RegExpParser::ParseDisjunction() {
178 // Used to store current state while parsing subexpressions.
179 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,
180 nullptr, ignore_case(), unicode(), zone());
181 RegExpParserState* state = &initial_state;
182 // Cache the builder in a local variable for quick access.
183 RegExpBuilder* builder = initial_state.builder();
184 while (true) {
185 switch (current()) {
186 case kEndMarker:
187 if (state->IsSubexpression()) {
188 // Inside a parenthesized group when hitting end of input.
189 return ReportError(CStrVector("Unterminated group"));
190 }
191 DCHECK_EQ(INITIAL, state->group_type());
192 // Parsing completed successfully.
193 return builder->ToRegExp();
194 case ')': {
195 if (!state->IsSubexpression()) {
196 return ReportError(CStrVector("Unmatched ')'"));
197 }
198 DCHECK_NE(INITIAL, state->group_type());
199
200 Advance();
201 // End disjunction parsing and convert builder content to new single
202 // regexp atom.
203 RegExpTree* body = builder->ToRegExp();
204
205 int end_capture_index = captures_started();
206
207 int capture_index = state->capture_index();
208 SubexpressionType group_type = state->group_type();
209
210 // Build result of subexpression.
211 if (group_type == CAPTURE) {
212 if (state->IsNamedCapture()) {
213 CreateNamedCaptureAtIndex(state->capture_name(),
214 capture_index CHECK_FAILED);
215 }
216 RegExpCapture* capture = GetCapture(capture_index);
217 capture->set_body(body);
218 body = capture;
219 } else if (group_type == GROUPING) {
220 body = new (zone()) RegExpGroup(body);
221 } else {
222 DCHECK(group_type == POSITIVE_LOOKAROUND ||
223 group_type == NEGATIVE_LOOKAROUND);
224 bool is_positive = (group_type == POSITIVE_LOOKAROUND);
225 body = new (zone()) RegExpLookaround(
226 body, is_positive, end_capture_index - capture_index,
227 capture_index, state->lookaround_type());
228 }
229
230 // Restore previous state.
231 state = state->previous_state();
232 builder = state->builder();
233
234 builder->AddAtom(body);
235 // For compatability with JSC and ES3, we allow quantifiers after
236 // lookaheads, and break in all cases.
237 break;
238 }
239 case '|': {
240 Advance();
241 builder->NewAlternative();
242 continue;
243 }
244 case '*':
245 case '+':
246 case '?':
247 return ReportError(CStrVector("Nothing to repeat"));
248 case '^': {
249 Advance();
250 if (multiline()) {
251 builder->AddAssertion(
252 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
253 } else {
254 builder->AddAssertion(
255 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));
256 set_contains_anchor();
257 }
258 continue;
259 }
260 case '$': {
261 Advance();
262 RegExpAssertion::AssertionType assertion_type =
263 multiline() ? RegExpAssertion::END_OF_LINE
264 : RegExpAssertion::END_OF_INPUT;
265 builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));
266 continue;
267 }
268 case '.': {
269 Advance();
270 // everything except \x0a, \x0d, \u2028 and \u2029
271 ZoneList<CharacterRange>* ranges =
272 new (zone()) ZoneList<CharacterRange>(2, zone());
273 CharacterRange::AddClassEscape('.', ranges, zone());
274 RegExpCharacterClass* cc =
275 new (zone()) RegExpCharacterClass(ranges, false);
276 builder->AddCharacterClass(cc);
277 break;
278 }
279 case '(': {
280 SubexpressionType subexpr_type = CAPTURE;
281 RegExpLookaround::Type lookaround_type = state->lookaround_type();
282 bool is_named_capture = false;
283 Advance();
284 if (current() == '?') {
285 switch (Next()) {
286 case ':':
287 subexpr_type = GROUPING;
288 Advance(2);
289 break;
290 case '=':
291 lookaround_type = RegExpLookaround::LOOKAHEAD;
292 subexpr_type = POSITIVE_LOOKAROUND;
293 Advance(2);
294 break;
295 case '!':
296 lookaround_type = RegExpLookaround::LOOKAHEAD;
297 subexpr_type = NEGATIVE_LOOKAROUND;
298 Advance(2);
299 break;
300 case '<':
301 Advance();
302 if (FLAG_harmony_regexp_lookbehind) {
303 if (Next() == '=') {
304 subexpr_type = POSITIVE_LOOKAROUND;
305 lookaround_type = RegExpLookaround::LOOKBEHIND;
306 Advance(2);
307 break;
308 } else if (Next() == '!') {
309 subexpr_type = NEGATIVE_LOOKAROUND;
310 lookaround_type = RegExpLookaround::LOOKBEHIND;
311 Advance(2);
312 break;
313 }
314 }
315 if (FLAG_harmony_regexp_named_captures && unicode()) {
316 is_named_capture = true;
317 Advance();
318 break;
319 }
320 // Fall through.
321 default:
322 return ReportError(CStrVector("Invalid group"));
323 }
324 }
325
326 const ZoneVector<uc16>* capture_name = nullptr;
327 if (subexpr_type == CAPTURE) {
328 if (captures_started_ >= kMaxCaptures) {
329 return ReportError(CStrVector("Too many captures"));
330 }
331 captures_started_++;
332
333 if (is_named_capture) {
334 capture_name = ParseCaptureGroupName(CHECK_FAILED);
335 }
336 }
337 // Store current state and begin new disjunction parsing.
338 state = new (zone()) RegExpParserState(
339 state, subexpr_type, lookaround_type, captures_started_,
340 capture_name, ignore_case(), unicode(), zone());
341 builder = state->builder();
342 continue;
343 }
344 case '[': {
345 RegExpTree* cc = ParseCharacterClass(CHECK_FAILED);
346 builder->AddCharacterClass(cc->AsCharacterClass());
347 break;
348 }
349 // Atom ::
350 // \ AtomEscape
351 case '\\':
352 switch (Next()) {
353 case kEndMarker:
354 return ReportError(CStrVector("\\ at end of pattern"));
355 case 'b':
356 Advance(2);
357 builder->AddAssertion(
358 new (zone()) RegExpAssertion(RegExpAssertion::BOUNDARY));
359 continue;
360 case 'B':
361 Advance(2);
362 builder->AddAssertion(
363 new (zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
364 continue;
365 // AtomEscape ::
366 // CharacterClassEscape
367 //
368 // CharacterClassEscape :: one of
369 // d D s S w W
370 case 'd':
371 case 'D':
372 case 's':
373 case 'S':
374 case 'w':
375 case 'W': {
376 uc32 c = Next();
377 Advance(2);
378 ZoneList<CharacterRange>* ranges =
379 new (zone()) ZoneList<CharacterRange>(2, zone());
380 CharacterRange::AddClassEscape(c, ranges, zone());
381 RegExpCharacterClass* cc =
382 new (zone()) RegExpCharacterClass(ranges, false);
383 builder->AddCharacterClass(cc);
384 break;
385 }
386 case 'p':
387 case 'P': {
388 uc32 p = Next();
389 Advance(2);
390 if (unicode()) {
391 if (FLAG_harmony_regexp_property) {
392 ZoneList<CharacterRange>* ranges =
393 new (zone()) ZoneList<CharacterRange>(2, zone());
394 if (!ParsePropertyClass(ranges, p == 'P')) {
395 return ReportError(CStrVector("Invalid property name"));
396 }
397 RegExpCharacterClass* cc =
398 new (zone()) RegExpCharacterClass(ranges, false);
399 builder->AddCharacterClass(cc);
400 } else {
401 // With /u, no identity escapes except for syntax characters
402 // are allowed. Otherwise, all identity escapes are allowed.
403 return ReportError(CStrVector("Invalid escape"));
404 }
405 } else {
406 builder->AddCharacter(p);
407 }
408 break;
409 }
410 case '1':
411 case '2':
412 case '3':
413 case '4':
414 case '5':
415 case '6':
416 case '7':
417 case '8':
418 case '9': {
419 int index = 0;
420 bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
421 if (is_backref) {
422 if (state->IsInsideCaptureGroup(index)) {
423 // The back reference is inside the capture group it refers to.
424 // Nothing can possibly have been captured yet, so we use empty
425 // instead. This ensures that, when checking a back reference,
426 // the capture registers of the referenced capture are either
427 // both set or both cleared.
428 builder->AddEmpty();
429 } else {
430 RegExpCapture* capture = GetCapture(index);
431 RegExpTree* atom = new (zone()) RegExpBackReference(capture);
432 builder->AddAtom(atom);
433 }
434 break;
435 }
436 // With /u, no identity escapes except for syntax characters
437 // are allowed. Otherwise, all identity escapes are allowed.
438 if (unicode()) {
439 return ReportError(CStrVector("Invalid escape"));
440 }
441 uc32 first_digit = Next();
442 if (first_digit == '8' || first_digit == '9') {
443 builder->AddCharacter(first_digit);
444 Advance(2);
445 break;
446 }
447 }
448 // Fall through.
449 case '0': {
450 Advance();
451 if (unicode() && Next() >= '0' && Next() <= '9') {
452 // With /u, decimal escape with leading 0 are not parsed as octal.
453 return ReportError(CStrVector("Invalid decimal escape"));
454 }
455 uc32 octal = ParseOctalLiteral();
456 builder->AddCharacter(octal);
457 break;
458 }
459 // ControlEscape :: one of
460 // f n r t v
461 case 'f':
462 Advance(2);
463 builder->AddCharacter('\f');
464 break;
465 case 'n':
466 Advance(2);
467 builder->AddCharacter('\n');
468 break;
469 case 'r':
470 Advance(2);
471 builder->AddCharacter('\r');
472 break;
473 case 't':
474 Advance(2);
475 builder->AddCharacter('\t');
476 break;
477 case 'v':
478 Advance(2);
479 builder->AddCharacter('\v');
480 break;
481 case 'c': {
482 Advance();
483 uc32 controlLetter = Next();
484 // Special case if it is an ASCII letter.
485 // Convert lower case letters to uppercase.
486 uc32 letter = controlLetter & ~('a' ^ 'A');
487 if (letter < 'A' || 'Z' < letter) {
488 // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
489 // This is outside the specification. We match JSC in
490 // reading the backslash as a literal character instead
491 // of as starting an escape.
492 if (unicode()) {
493 // With /u, invalid escapes are not treated as identity escapes.
494 return ReportError(CStrVector("Invalid unicode escape"));
495 }
496 builder->AddCharacter('\\');
497 } else {
498 Advance(2);
499 builder->AddCharacter(controlLetter & 0x1f);
500 }
501 break;
502 }
503 case 'x': {
504 Advance(2);
505 uc32 value;
506 if (ParseHexEscape(2, &value)) {
507 builder->AddCharacter(value);
508 } else if (!unicode()) {
509 builder->AddCharacter('x');
510 } else {
511 // With /u, invalid escapes are not treated as identity escapes.
512 return ReportError(CStrVector("Invalid escape"));
513 }
514 break;
515 }
516 case 'u': {
517 Advance(2);
518 uc32 value;
519 if (ParseUnicodeEscape(&value)) {
520 builder->AddEscapedUnicodeCharacter(value);
521 } else if (!unicode()) {
522 builder->AddCharacter('u');
523 } else {
524 // With /u, invalid escapes are not treated as identity escapes.
525 return ReportError(CStrVector("Invalid unicode escape"));
526 }
527 break;
528 }
529 case 'k':
530 if (FLAG_harmony_regexp_named_captures && unicode()) {
531 Advance(2);
532 ParseNamedBackReference(builder, state CHECK_FAILED);
533 break;
534 }
535 // Fall through.
536 default:
537 Advance();
538 // With /u, no identity escapes except for syntax characters
539 // are allowed. Otherwise, all identity escapes are allowed.
540 if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
541 builder->AddCharacter(current());
542 Advance();
543 } else {
544 return ReportError(CStrVector("Invalid escape"));
545 }
546 break;
547 }
548 break;
549 case '{': {
550 int dummy;
551 bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
552 if (parsed) return ReportError(CStrVector("Nothing to repeat"));
553 // Fall through.
554 }
555 case '}':
556 case ']':
557 if (unicode()) {
558 return ReportError(CStrVector("Lone quantifier brackets"));
559 }
560 // Fall through.
561 default:
562 builder->AddUnicodeCharacter(current());
563 Advance();
564 break;
565 } // end switch(current())
566
567 int min;
568 int max;
569 switch (current()) {
570 // QuantifierPrefix ::
571 // *
572 // +
573 // ?
574 // {
575 case '*':
576 min = 0;
577 max = RegExpTree::kInfinity;
578 Advance();
579 break;
580 case '+':
581 min = 1;
582 max = RegExpTree::kInfinity;
583 Advance();
584 break;
585 case '?':
586 min = 0;
587 max = 1;
588 Advance();
589 break;
590 case '{':
591 if (ParseIntervalQuantifier(&min, &max)) {
592 if (max < min) {
593 return ReportError(
594 CStrVector("numbers out of order in {} quantifier"));
595 }
596 break;
597 } else if (unicode()) {
598 // With /u, incomplete quantifiers are not allowed.
599 return ReportError(CStrVector("Incomplete quantifier"));
600 }
601 continue;
602 default:
603 continue;
604 }
605 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
606 if (current() == '?') {
607 quantifier_type = RegExpQuantifier::NON_GREEDY;
608 Advance();
609 } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
610 // FLAG_regexp_possessive_quantifier is a debug-only flag.
611 quantifier_type = RegExpQuantifier::POSSESSIVE;
612 Advance();
613 }
614 if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
615 return ReportError(CStrVector("Invalid quantifier"));
616 }
617 }
618 }
619
620
621 #ifdef DEBUG
622 // Currently only used in an DCHECK.
IsSpecialClassEscape(uc32 c)623 static bool IsSpecialClassEscape(uc32 c) {
624 switch (c) {
625 case 'd':
626 case 'D':
627 case 's':
628 case 'S':
629 case 'w':
630 case 'W':
631 return true;
632 default:
633 return false;
634 }
635 }
636 #endif
637
638
639 // In order to know whether an escape is a backreference or not we have to scan
640 // the entire regexp and find the number of capturing parentheses. However we
641 // don't want to scan the regexp twice unless it is necessary. This mini-parser
642 // is called when needed. It can see the difference between capturing and
643 // noncapturing parentheses and can skip character classes and backslash-escaped
644 // characters.
ScanForCaptures()645 void RegExpParser::ScanForCaptures() {
646 // Start with captures started previous to current position
647 int capture_count = captures_started();
648 // Add count of captures after this position.
649 int n;
650 while ((n = current()) != kEndMarker) {
651 Advance();
652 switch (n) {
653 case '\\':
654 Advance();
655 break;
656 case '[': {
657 int c;
658 while ((c = current()) != kEndMarker) {
659 Advance();
660 if (c == '\\') {
661 Advance();
662 } else {
663 if (c == ']') break;
664 }
665 }
666 break;
667 }
668 case '(':
669 if (current() != '?') capture_count++;
670 break;
671 }
672 }
673 capture_count_ = capture_count;
674 is_scanned_for_captures_ = true;
675 }
676
677
ParseBackReferenceIndex(int * index_out)678 bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
679 DCHECK_EQ('\\', current());
680 DCHECK('1' <= Next() && Next() <= '9');
681 // Try to parse a decimal literal that is no greater than the total number
682 // of left capturing parentheses in the input.
683 int start = position();
684 int value = Next() - '0';
685 Advance(2);
686 while (true) {
687 uc32 c = current();
688 if (IsDecimalDigit(c)) {
689 value = 10 * value + (c - '0');
690 if (value > kMaxCaptures) {
691 Reset(start);
692 return false;
693 }
694 Advance();
695 } else {
696 break;
697 }
698 }
699 if (value > captures_started()) {
700 if (!is_scanned_for_captures_) {
701 int saved_position = position();
702 ScanForCaptures();
703 Reset(saved_position);
704 }
705 if (value > capture_count_) {
706 Reset(start);
707 return false;
708 }
709 }
710 *index_out = value;
711 return true;
712 }
713
push_code_unit(ZoneVector<uc16> * v,uint32_t code_unit)714 static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {
715 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
716 v->push_back(code_unit);
717 } else {
718 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
719 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
720 }
721 }
722
ParseCaptureGroupName()723 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
724 DCHECK(FLAG_harmony_regexp_named_captures);
725 DCHECK(unicode());
726
727 ZoneVector<uc16>* name =
728 new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
729
730 bool at_start = true;
731 while (true) {
732 uc32 c = current();
733 Advance();
734
735 // Convert unicode escapes.
736 if (c == '\\' && current() == 'u') {
737 Advance();
738 if (!ParseUnicodeEscape(&c)) {
739 ReportError(CStrVector("Invalid Unicode escape sequence"));
740 return nullptr;
741 }
742 }
743
744 if (at_start) {
745 if (!IdentifierStart::Is(c)) {
746 ReportError(CStrVector("Invalid capture group name"));
747 return nullptr;
748 }
749 push_code_unit(name, c);
750 at_start = false;
751 } else {
752 if (c == '>') {
753 break;
754 } else if (IdentifierPart::Is(c)) {
755 push_code_unit(name, c);
756 } else {
757 ReportError(CStrVector("Invalid capture group name"));
758 return nullptr;
759 }
760 }
761 }
762
763 return name;
764 }
765
CreateNamedCaptureAtIndex(const ZoneVector<uc16> * name,int index)766 bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
767 int index) {
768 DCHECK(FLAG_harmony_regexp_named_captures);
769 DCHECK(unicode());
770 DCHECK(0 < index && index <= captures_started_);
771 DCHECK_NOT_NULL(name);
772
773 if (named_captures_ == nullptr) {
774 named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());
775 } else {
776 // Check for duplicates and bail if we find any.
777 for (const auto& named_capture : *named_captures_) {
778 if (*named_capture->name() == *name) {
779 ReportError(CStrVector("Duplicate capture group name"));
780 return false;
781 }
782 }
783 }
784
785 RegExpCapture* capture = GetCapture(index);
786 DCHECK(capture->name() == nullptr);
787
788 capture->set_name(name);
789 named_captures_->Add(capture, zone());
790
791 return true;
792 }
793
ParseNamedBackReference(RegExpBuilder * builder,RegExpParserState * state)794 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
795 RegExpParserState* state) {
796 // The parser is assumed to be on the '<' in \k<name>.
797 if (current() != '<') {
798 ReportError(CStrVector("Invalid named reference"));
799 return false;
800 }
801
802 Advance();
803 const ZoneVector<uc16>* name = ParseCaptureGroupName();
804 if (name == nullptr) {
805 return false;
806 }
807
808 if (state->IsInsideCaptureGroup(name)) {
809 builder->AddEmpty();
810 } else {
811 RegExpBackReference* atom = new (zone()) RegExpBackReference();
812 atom->set_name(name);
813
814 builder->AddAtom(atom);
815
816 if (named_back_references_ == nullptr) {
817 named_back_references_ =
818 new (zone()) ZoneList<RegExpBackReference*>(1, zone());
819 }
820 named_back_references_->Add(atom, zone());
821 }
822
823 return true;
824 }
825
PatchNamedBackReferences()826 void RegExpParser::PatchNamedBackReferences() {
827 if (named_back_references_ == nullptr) return;
828
829 if (named_captures_ == nullptr) {
830 ReportError(CStrVector("Invalid named capture referenced"));
831 return;
832 }
833
834 // Look up and patch the actual capture for each named back reference.
835 // TODO(jgruber): O(n^2), optimize if necessary.
836
837 for (int i = 0; i < named_back_references_->length(); i++) {
838 RegExpBackReference* ref = named_back_references_->at(i);
839
840 int index = -1;
841 for (const auto& capture : *named_captures_) {
842 if (*capture->name() == *ref->name()) {
843 index = capture->index();
844 break;
845 }
846 }
847
848 if (index == -1) {
849 ReportError(CStrVector("Invalid named capture referenced"));
850 return;
851 }
852
853 ref->set_capture(GetCapture(index));
854 }
855 }
856
GetCapture(int index)857 RegExpCapture* RegExpParser::GetCapture(int index) {
858 // The index for the capture groups are one-based. Its index in the list is
859 // zero-based.
860 int know_captures =
861 is_scanned_for_captures_ ? capture_count_ : captures_started_;
862 DCHECK(index <= know_captures);
863 if (captures_ == NULL) {
864 captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
865 }
866 while (captures_->length() < know_captures) {
867 captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
868 }
869 return captures_->at(index - 1);
870 }
871
CreateCaptureNameMap()872 Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {
873 if (named_captures_ == nullptr || named_captures_->is_empty())
874 return Handle<FixedArray>();
875
876 Factory* factory = isolate()->factory();
877
878 int len = named_captures_->length() * 2;
879 Handle<FixedArray> array = factory->NewFixedArray(len);
880
881 for (int i = 0; i < named_captures_->length(); i++) {
882 RegExpCapture* capture = named_captures_->at(i);
883 MaybeHandle<String> name = factory->NewStringFromTwoByte(capture->name());
884 array->set(i * 2, *name.ToHandleChecked());
885 array->set(i * 2 + 1, Smi::FromInt(capture->index()));
886 }
887
888 return array;
889 }
890
IsInsideCaptureGroup(int index)891 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
892 for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
893 if (s->group_type() != CAPTURE) continue;
894 // Return true if we found the matching capture index.
895 if (index == s->capture_index()) return true;
896 // Abort if index is larger than what has been parsed up till this state.
897 if (index > s->capture_index()) return false;
898 }
899 return false;
900 }
901
IsInsideCaptureGroup(const ZoneVector<uc16> * name)902 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(
903 const ZoneVector<uc16>* name) {
904 DCHECK_NOT_NULL(name);
905 for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
906 if (s->capture_name() == nullptr) continue;
907 if (*s->capture_name() == *name) return true;
908 }
909 return false;
910 }
911
912 // QuantifierPrefix ::
913 // { DecimalDigits }
914 // { DecimalDigits , }
915 // { DecimalDigits , DecimalDigits }
916 //
917 // Returns true if parsing succeeds, and set the min_out and max_out
918 // values. Values are truncated to RegExpTree::kInfinity if they overflow.
ParseIntervalQuantifier(int * min_out,int * max_out)919 bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
920 DCHECK_EQ(current(), '{');
921 int start = position();
922 Advance();
923 int min = 0;
924 if (!IsDecimalDigit(current())) {
925 Reset(start);
926 return false;
927 }
928 while (IsDecimalDigit(current())) {
929 int next = current() - '0';
930 if (min > (RegExpTree::kInfinity - next) / 10) {
931 // Overflow. Skip past remaining decimal digits and return -1.
932 do {
933 Advance();
934 } while (IsDecimalDigit(current()));
935 min = RegExpTree::kInfinity;
936 break;
937 }
938 min = 10 * min + next;
939 Advance();
940 }
941 int max = 0;
942 if (current() == '}') {
943 max = min;
944 Advance();
945 } else if (current() == ',') {
946 Advance();
947 if (current() == '}') {
948 max = RegExpTree::kInfinity;
949 Advance();
950 } else {
951 while (IsDecimalDigit(current())) {
952 int next = current() - '0';
953 if (max > (RegExpTree::kInfinity - next) / 10) {
954 do {
955 Advance();
956 } while (IsDecimalDigit(current()));
957 max = RegExpTree::kInfinity;
958 break;
959 }
960 max = 10 * max + next;
961 Advance();
962 }
963 if (current() != '}') {
964 Reset(start);
965 return false;
966 }
967 Advance();
968 }
969 } else {
970 Reset(start);
971 return false;
972 }
973 *min_out = min;
974 *max_out = max;
975 return true;
976 }
977
978
ParseOctalLiteral()979 uc32 RegExpParser::ParseOctalLiteral() {
980 DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
981 // For compatibility with some other browsers (not all), we parse
982 // up to three octal digits with a value below 256.
983 uc32 value = current() - '0';
984 Advance();
985 if ('0' <= current() && current() <= '7') {
986 value = value * 8 + current() - '0';
987 Advance();
988 if (value < 32 && '0' <= current() && current() <= '7') {
989 value = value * 8 + current() - '0';
990 Advance();
991 }
992 }
993 return value;
994 }
995
996
ParseHexEscape(int length,uc32 * value)997 bool RegExpParser::ParseHexEscape(int length, uc32* value) {
998 int start = position();
999 uc32 val = 0;
1000 for (int i = 0; i < length; ++i) {
1001 uc32 c = current();
1002 int d = HexValue(c);
1003 if (d < 0) {
1004 Reset(start);
1005 return false;
1006 }
1007 val = val * 16 + d;
1008 Advance();
1009 }
1010 *value = val;
1011 return true;
1012 }
1013
1014 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
ParseUnicodeEscape(uc32 * value)1015 bool RegExpParser::ParseUnicodeEscape(uc32* value) {
1016 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
1017 // allowed). In the latter case, the number of hex digits between { } is
1018 // arbitrary. \ and u have already been read.
1019 if (current() == '{' && unicode()) {
1020 int start = position();
1021 Advance();
1022 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
1023 if (current() == '}') {
1024 Advance();
1025 return true;
1026 }
1027 }
1028 Reset(start);
1029 return false;
1030 }
1031 // \u but no {, or \u{...} escapes not allowed.
1032 bool result = ParseHexEscape(4, value);
1033 if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
1034 current() == '\\') {
1035 // Attempt to read trail surrogate.
1036 int start = position();
1037 if (Next() == 'u') {
1038 Advance(2);
1039 uc32 trail;
1040 if (ParseHexEscape(4, &trail) &&
1041 unibrow::Utf16::IsTrailSurrogate(trail)) {
1042 *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
1043 static_cast<uc16>(trail));
1044 return true;
1045 }
1046 }
1047 Reset(start);
1048 }
1049 return result;
1050 }
1051
1052 #ifdef V8_I18N_SUPPORT
1053
1054 namespace {
1055
IsExactPropertyAlias(const char * property_name,UProperty property)1056 bool IsExactPropertyAlias(const char* property_name, UProperty property) {
1057 const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
1058 if (short_name != NULL && strcmp(property_name, short_name) == 0) return true;
1059 for (int i = 0;; i++) {
1060 const char* long_name = u_getPropertyName(
1061 property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1062 if (long_name == NULL) break;
1063 if (strcmp(property_name, long_name) == 0) return true;
1064 }
1065 return false;
1066 }
1067
IsExactPropertyValueAlias(const char * property_value_name,UProperty property,int32_t property_value)1068 bool IsExactPropertyValueAlias(const char* property_value_name,
1069 UProperty property, int32_t property_value) {
1070 const char* short_name =
1071 u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
1072 if (short_name != NULL && strcmp(property_value_name, short_name) == 0) {
1073 return true;
1074 }
1075 for (int i = 0;; i++) {
1076 const char* long_name = u_getPropertyValueName(
1077 property, property_value,
1078 static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1079 if (long_name == NULL) break;
1080 if (strcmp(property_value_name, long_name) == 0) return true;
1081 }
1082 return false;
1083 }
1084
LookupPropertyValueName(UProperty property,const char * property_value_name,bool negate,ZoneList<CharacterRange> * result,Zone * zone)1085 bool LookupPropertyValueName(UProperty property,
1086 const char* property_value_name, bool negate,
1087 ZoneList<CharacterRange>* result, Zone* zone) {
1088 UProperty property_for_lookup = property;
1089 if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
1090 // For the property Script_Extensions, we have to do the property value
1091 // name lookup as if the property is Script.
1092 property_for_lookup = UCHAR_SCRIPT;
1093 }
1094 int32_t property_value =
1095 u_getPropertyValueEnum(property_for_lookup, property_value_name);
1096 if (property_value == UCHAR_INVALID_CODE) return false;
1097
1098 // We require the property name to match exactly to one of the property value
1099 // aliases. However, u_getPropertyValueEnum uses loose matching.
1100 if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
1101 property_value)) {
1102 return false;
1103 }
1104
1105 UErrorCode ec = U_ZERO_ERROR;
1106 icu::UnicodeSet set;
1107 set.applyIntPropertyValue(property, property_value, ec);
1108 bool success = ec == U_ZERO_ERROR && !set.isEmpty();
1109
1110 if (success) {
1111 set.removeAllStrings();
1112 if (negate) set.complement();
1113 for (int i = 0; i < set.getRangeCount(); i++) {
1114 result->Add(
1115 CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
1116 zone);
1117 }
1118 }
1119 return success;
1120 }
1121
1122 template <size_t N>
NameEquals(const char * name,const char (& literal)[N])1123 inline bool NameEquals(const char* name, const char (&literal)[N]) {
1124 return strncmp(name, literal, N + 1) == 0;
1125 }
1126
LookupSpecialPropertyValueName(const char * name,ZoneList<CharacterRange> * result,bool negate,Zone * zone)1127 bool LookupSpecialPropertyValueName(const char* name,
1128 ZoneList<CharacterRange>* result,
1129 bool negate, Zone* zone) {
1130 if (NameEquals(name, "Any")) {
1131 if (!negate) result->Add(CharacterRange::Everything(), zone);
1132 } else if (NameEquals(name, "ASCII")) {
1133 result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
1134 : CharacterRange::Range(0x0, 0x7f),
1135 zone);
1136 } else if (NameEquals(name, "Assigned")) {
1137 return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
1138 !negate, result, zone);
1139 } else {
1140 return false;
1141 }
1142 return true;
1143 }
1144
1145 } // anonymous namespace
1146
ParsePropertyClass(ZoneList<CharacterRange> * result,bool negate)1147 bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
1148 bool negate) {
1149 // Parse the property class as follows:
1150 // - In \p{name}, 'name' is interpreted
1151 // - either as a general category property value name.
1152 // - or as a binary property name.
1153 // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
1154 // and 'value' is interpreted as one of the available property value names.
1155 // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
1156 // - Loose matching is not applied.
1157 List<char> first_part;
1158 List<char> second_part;
1159 if (current() == '{') {
1160 // Parse \p{[PropertyName=]PropertyNameValue}
1161 for (Advance(); current() != '}' && current() != '='; Advance()) {
1162 if (!has_next()) return false;
1163 first_part.Add(static_cast<char>(current()));
1164 }
1165 if (current() == '=') {
1166 for (Advance(); current() != '}'; Advance()) {
1167 if (!has_next()) return false;
1168 second_part.Add(static_cast<char>(current()));
1169 }
1170 second_part.Add(0); // null-terminate string.
1171 }
1172 } else {
1173 return false;
1174 }
1175 Advance();
1176 first_part.Add(0); // null-terminate string.
1177
1178 if (second_part.is_empty()) {
1179 // First attempt to interpret as general category property value name.
1180 const char* name = first_part.ToConstVector().start();
1181 if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
1182 result, zone())) {
1183 return true;
1184 }
1185 // Interpret "Any", "ASCII", and "Assigned".
1186 if (LookupSpecialPropertyValueName(name, result, negate, zone())) {
1187 return true;
1188 }
1189 // Then attempt to interpret as binary property name with value name 'Y'.
1190 UProperty property = u_getPropertyEnum(name);
1191 if (property < UCHAR_BINARY_START) return false;
1192 if (property >= UCHAR_BINARY_LIMIT) return false;
1193 if (!IsExactPropertyAlias(name, property)) return false;
1194 return LookupPropertyValueName(property, negate ? "N" : "Y", false, result,
1195 zone());
1196 } else {
1197 // Both property name and value name are specified. Attempt to interpret
1198 // the property name as enumerated property.
1199 const char* property_name = first_part.ToConstVector().start();
1200 const char* value_name = second_part.ToConstVector().start();
1201 UProperty property = u_getPropertyEnum(property_name);
1202 if (!IsExactPropertyAlias(property_name, property)) return false;
1203 if (property == UCHAR_GENERAL_CATEGORY) {
1204 // We want to allow aggregate value names such as "Letter".
1205 property = UCHAR_GENERAL_CATEGORY_MASK;
1206 } else if (property != UCHAR_SCRIPT &&
1207 property != UCHAR_SCRIPT_EXTENSIONS) {
1208 return false;
1209 }
1210 return LookupPropertyValueName(property, value_name, negate, result,
1211 zone());
1212 }
1213 }
1214
1215 #else // V8_I18N_SUPPORT
1216
ParsePropertyClass(ZoneList<CharacterRange> * result,bool negate)1217 bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
1218 bool negate) {
1219 return false;
1220 }
1221
1222 #endif // V8_I18N_SUPPORT
1223
ParseUnlimitedLengthHexNumber(int max_value,uc32 * value)1224 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
1225 uc32 x = 0;
1226 int d = HexValue(current());
1227 if (d < 0) {
1228 return false;
1229 }
1230 while (d >= 0) {
1231 x = x * 16 + d;
1232 if (x > max_value) {
1233 return false;
1234 }
1235 Advance();
1236 d = HexValue(current());
1237 }
1238 *value = x;
1239 return true;
1240 }
1241
1242
ParseClassCharacterEscape()1243 uc32 RegExpParser::ParseClassCharacterEscape() {
1244 DCHECK(current() == '\\');
1245 DCHECK(has_next() && !IsSpecialClassEscape(Next()));
1246 Advance();
1247 switch (current()) {
1248 case 'b':
1249 Advance();
1250 return '\b';
1251 // ControlEscape :: one of
1252 // f n r t v
1253 case 'f':
1254 Advance();
1255 return '\f';
1256 case 'n':
1257 Advance();
1258 return '\n';
1259 case 'r':
1260 Advance();
1261 return '\r';
1262 case 't':
1263 Advance();
1264 return '\t';
1265 case 'v':
1266 Advance();
1267 return '\v';
1268 case 'c': {
1269 uc32 controlLetter = Next();
1270 uc32 letter = controlLetter & ~('A' ^ 'a');
1271 // For compatibility with JSC, inside a character class. We also accept
1272 // digits and underscore as control characters, unless with /u.
1273 if (letter >= 'A' && letter <= 'Z') {
1274 Advance(2);
1275 // Control letters mapped to ASCII control characters in the range
1276 // 0x00-0x1f.
1277 return controlLetter & 0x1f;
1278 }
1279 if (unicode()) {
1280 // With /u, invalid escapes are not treated as identity escapes.
1281 ReportError(CStrVector("Invalid class escape"));
1282 return 0;
1283 }
1284 if ((controlLetter >= '0' && controlLetter <= '9') ||
1285 controlLetter == '_') {
1286 Advance(2);
1287 return controlLetter & 0x1f;
1288 }
1289 // We match JSC in reading the backslash as a literal
1290 // character instead of as starting an escape.
1291 return '\\';
1292 }
1293 case '0':
1294 // With /u, \0 is interpreted as NUL if not followed by another digit.
1295 if (unicode() && !(Next() >= '0' && Next() <= '9')) {
1296 Advance();
1297 return 0;
1298 }
1299 // Fall through.
1300 case '1':
1301 case '2':
1302 case '3':
1303 case '4':
1304 case '5':
1305 case '6':
1306 case '7':
1307 // For compatibility, we interpret a decimal escape that isn't
1308 // a back reference (and therefore either \0 or not valid according
1309 // to the specification) as a 1..3 digit octal character code.
1310 if (unicode()) {
1311 // With /u, decimal escape is not interpreted as octal character code.
1312 ReportError(CStrVector("Invalid class escape"));
1313 return 0;
1314 }
1315 return ParseOctalLiteral();
1316 case 'x': {
1317 Advance();
1318 uc32 value;
1319 if (ParseHexEscape(2, &value)) return value;
1320 if (unicode()) {
1321 // With /u, invalid escapes are not treated as identity escapes.
1322 ReportError(CStrVector("Invalid escape"));
1323 return 0;
1324 }
1325 // If \x is not followed by a two-digit hexadecimal, treat it
1326 // as an identity escape.
1327 return 'x';
1328 }
1329 case 'u': {
1330 Advance();
1331 uc32 value;
1332 if (ParseUnicodeEscape(&value)) return value;
1333 if (unicode()) {
1334 // With /u, invalid escapes are not treated as identity escapes.
1335 ReportError(CStrVector("Invalid unicode escape"));
1336 return 0;
1337 }
1338 // If \u is not followed by a two-digit hexadecimal, treat it
1339 // as an identity escape.
1340 return 'u';
1341 }
1342 default: {
1343 uc32 result = current();
1344 // With /u, no identity escapes except for syntax characters and '-' are
1345 // allowed. Otherwise, all identity escapes are allowed.
1346 if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
1347 Advance();
1348 return result;
1349 }
1350 ReportError(CStrVector("Invalid escape"));
1351 return 0;
1352 }
1353 }
1354 return 0;
1355 }
1356
1357
ParseClassAtom(uc16 * char_class)1358 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
1359 DCHECK_EQ(0, *char_class);
1360 uc32 first = current();
1361 if (first == '\\') {
1362 switch (Next()) {
1363 case 'w':
1364 case 'W':
1365 case 'd':
1366 case 'D':
1367 case 's':
1368 case 'S': {
1369 *char_class = Next();
1370 Advance(2);
1371 return CharacterRange::Singleton(0); // Return dummy value.
1372 }
1373 case kEndMarker:
1374 return ReportError(CStrVector("\\ at end of pattern"));
1375 default:
1376 first = ParseClassCharacterEscape(CHECK_FAILED);
1377 }
1378 } else {
1379 Advance();
1380 }
1381
1382 return CharacterRange::Singleton(first);
1383 }
1384
1385 static const uc16 kNoCharClass = 0;
1386
1387 // Adds range or pre-defined character class to character ranges.
1388 // If char_class is not kInvalidClass, it's interpreted as a class
1389 // escape (i.e., 's' means whitespace, from '\s').
AddRangeOrEscape(ZoneList<CharacterRange> * ranges,uc16 char_class,CharacterRange range,Zone * zone)1390 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
1391 uc16 char_class, CharacterRange range,
1392 Zone* zone) {
1393 if (char_class != kNoCharClass) {
1394 CharacterRange::AddClassEscape(char_class, ranges, zone);
1395 } else {
1396 ranges->Add(range, zone);
1397 }
1398 }
1399
ParseClassProperty(ZoneList<CharacterRange> * ranges)1400 bool RegExpParser::ParseClassProperty(ZoneList<CharacterRange>* ranges) {
1401 if (!FLAG_harmony_regexp_property) return false;
1402 if (!unicode()) return false;
1403 if (current() != '\\') return false;
1404 uc32 next = Next();
1405 bool parse_success = false;
1406 if (next == 'p') {
1407 Advance(2);
1408 parse_success = ParsePropertyClass(ranges, false);
1409 } else if (next == 'P') {
1410 Advance(2);
1411 parse_success = ParsePropertyClass(ranges, true);
1412 } else {
1413 return false;
1414 }
1415 if (!parse_success)
1416 ReportError(CStrVector("Invalid property name in character class"));
1417 return parse_success;
1418 }
1419
ParseCharacterClass()1420 RegExpTree* RegExpParser::ParseCharacterClass() {
1421 static const char* kUnterminated = "Unterminated character class";
1422 static const char* kRangeInvalid = "Invalid character class";
1423 static const char* kRangeOutOfOrder = "Range out of order in character class";
1424
1425 DCHECK_EQ(current(), '[');
1426 Advance();
1427 bool is_negated = false;
1428 if (current() == '^') {
1429 is_negated = true;
1430 Advance();
1431 }
1432 ZoneList<CharacterRange>* ranges =
1433 new (zone()) ZoneList<CharacterRange>(2, zone());
1434 while (has_more() && current() != ']') {
1435 bool parsed_property = ParseClassProperty(ranges CHECK_FAILED);
1436 if (parsed_property) continue;
1437 uc16 char_class = kNoCharClass;
1438 CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED);
1439 if (current() == '-') {
1440 Advance();
1441 if (current() == kEndMarker) {
1442 // If we reach the end we break out of the loop and let the
1443 // following code report an error.
1444 break;
1445 } else if (current() == ']') {
1446 AddRangeOrEscape(ranges, char_class, first, zone());
1447 ranges->Add(CharacterRange::Singleton('-'), zone());
1448 break;
1449 }
1450 uc16 char_class_2 = kNoCharClass;
1451 CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
1452 if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
1453 // Either end is an escaped character class. Treat the '-' verbatim.
1454 if (unicode()) {
1455 // ES2015 21.2.2.15.1 step 1.
1456 return ReportError(CStrVector(kRangeInvalid));
1457 }
1458 AddRangeOrEscape(ranges, char_class, first, zone());
1459 ranges->Add(CharacterRange::Singleton('-'), zone());
1460 AddRangeOrEscape(ranges, char_class_2, next, zone());
1461 continue;
1462 }
1463 // ES2015 21.2.2.15.1 step 6.
1464 if (first.from() > next.to()) {
1465 return ReportError(CStrVector(kRangeOutOfOrder));
1466 }
1467 ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
1468 } else {
1469 AddRangeOrEscape(ranges, char_class, first, zone());
1470 }
1471 }
1472 if (!has_more()) {
1473 return ReportError(CStrVector(kUnterminated));
1474 }
1475 Advance();
1476 if (ranges->length() == 0) {
1477 ranges->Add(CharacterRange::Everything(), zone());
1478 is_negated = !is_negated;
1479 }
1480 return new (zone()) RegExpCharacterClass(ranges, is_negated);
1481 }
1482
1483
1484 #undef CHECK_FAILED
1485
1486
ParseRegExp(Isolate * isolate,Zone * zone,FlatStringReader * input,JSRegExp::Flags flags,RegExpCompileData * result)1487 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
1488 FlatStringReader* input, JSRegExp::Flags flags,
1489 RegExpCompileData* result) {
1490 DCHECK(result != NULL);
1491 RegExpParser parser(input, &result->error, flags, isolate, zone);
1492 RegExpTree* tree = parser.ParsePattern();
1493 if (parser.failed()) {
1494 DCHECK(tree == NULL);
1495 DCHECK(!result->error.is_null());
1496 } else {
1497 DCHECK(tree != NULL);
1498 DCHECK(result->error.is_null());
1499 if (FLAG_trace_regexp_parser) {
1500 OFStream os(stdout);
1501 tree->Print(os, zone);
1502 os << "\n";
1503 }
1504 result->tree = tree;
1505 int capture_count = parser.captures_started();
1506 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
1507 result->contains_anchor = parser.contains_anchor();
1508 result->capture_name_map = parser.CreateCaptureNameMap();
1509 result->capture_count = capture_count;
1510 }
1511 return !parser.failed();
1512 }
1513
RegExpBuilder(Zone * zone,bool ignore_case,bool unicode)1514 RegExpBuilder::RegExpBuilder(Zone* zone, bool ignore_case, bool unicode)
1515 : zone_(zone),
1516 pending_empty_(false),
1517 ignore_case_(ignore_case),
1518 unicode_(unicode),
1519 characters_(NULL),
1520 pending_surrogate_(kNoPendingSurrogate),
1521 terms_(),
1522 alternatives_()
1523 #ifdef DEBUG
1524 ,
1525 last_added_(ADD_NONE)
1526 #endif
1527 {
1528 }
1529
1530
AddLeadSurrogate(uc16 lead_surrogate)1531 void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
1532 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1533 FlushPendingSurrogate();
1534 // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
1535 pending_surrogate_ = lead_surrogate;
1536 }
1537
1538
AddTrailSurrogate(uc16 trail_surrogate)1539 void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
1540 DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
1541 if (pending_surrogate_ != kNoPendingSurrogate) {
1542 uc16 lead_surrogate = pending_surrogate_;
1543 pending_surrogate_ = kNoPendingSurrogate;
1544 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1545 uc32 combined =
1546 unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
1547 if (NeedsDesugaringForIgnoreCase(combined)) {
1548 AddCharacterClassForDesugaring(combined);
1549 } else {
1550 ZoneList<uc16> surrogate_pair(2, zone());
1551 surrogate_pair.Add(lead_surrogate, zone());
1552 surrogate_pair.Add(trail_surrogate, zone());
1553 RegExpAtom* atom =
1554 new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
1555 AddAtom(atom);
1556 }
1557 } else {
1558 pending_surrogate_ = trail_surrogate;
1559 FlushPendingSurrogate();
1560 }
1561 }
1562
1563
FlushPendingSurrogate()1564 void RegExpBuilder::FlushPendingSurrogate() {
1565 if (pending_surrogate_ != kNoPendingSurrogate) {
1566 DCHECK(unicode());
1567 uc32 c = pending_surrogate_;
1568 pending_surrogate_ = kNoPendingSurrogate;
1569 AddCharacterClassForDesugaring(c);
1570 }
1571 }
1572
1573
FlushCharacters()1574 void RegExpBuilder::FlushCharacters() {
1575 FlushPendingSurrogate();
1576 pending_empty_ = false;
1577 if (characters_ != NULL) {
1578 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());
1579 characters_ = NULL;
1580 text_.Add(atom, zone());
1581 LAST(ADD_ATOM);
1582 }
1583 }
1584
1585
FlushText()1586 void RegExpBuilder::FlushText() {
1587 FlushCharacters();
1588 int num_text = text_.length();
1589 if (num_text == 0) {
1590 return;
1591 } else if (num_text == 1) {
1592 terms_.Add(text_.last(), zone());
1593 } else {
1594 RegExpText* text = new (zone()) RegExpText(zone());
1595 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
1596 terms_.Add(text, zone());
1597 }
1598 text_.Clear();
1599 }
1600
1601
AddCharacter(uc16 c)1602 void RegExpBuilder::AddCharacter(uc16 c) {
1603 FlushPendingSurrogate();
1604 pending_empty_ = false;
1605 if (NeedsDesugaringForIgnoreCase(c)) {
1606 AddCharacterClassForDesugaring(c);
1607 } else {
1608 if (characters_ == NULL) {
1609 characters_ = new (zone()) ZoneList<uc16>(4, zone());
1610 }
1611 characters_->Add(c, zone());
1612 LAST(ADD_CHAR);
1613 }
1614 }
1615
1616
AddUnicodeCharacter(uc32 c)1617 void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
1618 if (c > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
1619 DCHECK(unicode());
1620 AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
1621 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
1622 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
1623 AddLeadSurrogate(c);
1624 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
1625 AddTrailSurrogate(c);
1626 } else {
1627 AddCharacter(static_cast<uc16>(c));
1628 }
1629 }
1630
AddEscapedUnicodeCharacter(uc32 character)1631 void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
1632 // A lead or trail surrogate parsed via escape sequence will not
1633 // pair up with any preceding lead or following trail surrogate.
1634 FlushPendingSurrogate();
1635 AddUnicodeCharacter(character);
1636 FlushPendingSurrogate();
1637 }
1638
AddEmpty()1639 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
1640
1641
AddCharacterClass(RegExpCharacterClass * cc)1642 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
1643 if (NeedsDesugaringForUnicode(cc)) {
1644 // With /u, character class needs to be desugared, so it
1645 // must be a standalone term instead of being part of a RegExpText.
1646 AddTerm(cc);
1647 } else {
1648 AddAtom(cc);
1649 }
1650 }
1651
AddCharacterClassForDesugaring(uc32 c)1652 void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
1653 AddTerm(new (zone()) RegExpCharacterClass(
1654 CharacterRange::List(zone(), CharacterRange::Singleton(c)), false));
1655 }
1656
1657
AddAtom(RegExpTree * term)1658 void RegExpBuilder::AddAtom(RegExpTree* term) {
1659 if (term->IsEmpty()) {
1660 AddEmpty();
1661 return;
1662 }
1663 if (term->IsTextElement()) {
1664 FlushCharacters();
1665 text_.Add(term, zone());
1666 } else {
1667 FlushText();
1668 terms_.Add(term, zone());
1669 }
1670 LAST(ADD_ATOM);
1671 }
1672
1673
AddTerm(RegExpTree * term)1674 void RegExpBuilder::AddTerm(RegExpTree* term) {
1675 FlushText();
1676 terms_.Add(term, zone());
1677 LAST(ADD_ATOM);
1678 }
1679
1680
AddAssertion(RegExpTree * assert)1681 void RegExpBuilder::AddAssertion(RegExpTree* assert) {
1682 FlushText();
1683 terms_.Add(assert, zone());
1684 LAST(ADD_ASSERT);
1685 }
1686
1687
NewAlternative()1688 void RegExpBuilder::NewAlternative() { FlushTerms(); }
1689
1690
FlushTerms()1691 void RegExpBuilder::FlushTerms() {
1692 FlushText();
1693 int num_terms = terms_.length();
1694 RegExpTree* alternative;
1695 if (num_terms == 0) {
1696 alternative = new (zone()) RegExpEmpty();
1697 } else if (num_terms == 1) {
1698 alternative = terms_.last();
1699 } else {
1700 alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));
1701 }
1702 alternatives_.Add(alternative, zone());
1703 terms_.Clear();
1704 LAST(ADD_NONE);
1705 }
1706
1707
NeedsDesugaringForUnicode(RegExpCharacterClass * cc)1708 bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
1709 if (!unicode()) return false;
1710 // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
1711 // necessarily mean that we need to desugar. It's probably nicer to have a
1712 // separate pass to figure out unicode desugarings.
1713 if (ignore_case()) return true;
1714 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
1715 CharacterRange::Canonicalize(ranges);
1716 for (int i = ranges->length() - 1; i >= 0; i--) {
1717 uc32 from = ranges->at(i).from();
1718 uc32 to = ranges->at(i).to();
1719 // Check for non-BMP characters.
1720 if (to >= kNonBmpStart) return true;
1721 // Check for lone surrogates.
1722 if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
1723 }
1724 return false;
1725 }
1726
1727
NeedsDesugaringForIgnoreCase(uc32 c)1728 bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
1729 #ifdef V8_I18N_SUPPORT
1730 if (unicode() && ignore_case()) {
1731 icu::UnicodeSet set(c, c);
1732 set.closeOver(USET_CASE_INSENSITIVE);
1733 set.removeAllStrings();
1734 return set.size() > 1;
1735 }
1736 // In the case where ICU is not included, we act as if the unicode flag is
1737 // not set, and do not desugar.
1738 #endif // V8_I18N_SUPPORT
1739 return false;
1740 }
1741
1742
ToRegExp()1743 RegExpTree* RegExpBuilder::ToRegExp() {
1744 FlushTerms();
1745 int num_alternatives = alternatives_.length();
1746 if (num_alternatives == 0) return new (zone()) RegExpEmpty();
1747 if (num_alternatives == 1) return alternatives_.last();
1748 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
1749 }
1750
AddQuantifierToAtom(int min,int max,RegExpQuantifier::QuantifierType quantifier_type)1751 bool RegExpBuilder::AddQuantifierToAtom(
1752 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
1753 FlushPendingSurrogate();
1754 if (pending_empty_) {
1755 pending_empty_ = false;
1756 return true;
1757 }
1758 RegExpTree* atom;
1759 if (characters_ != NULL) {
1760 DCHECK(last_added_ == ADD_CHAR);
1761 // Last atom was character.
1762 Vector<const uc16> char_vector = characters_->ToConstVector();
1763 int num_chars = char_vector.length();
1764 if (num_chars > 1) {
1765 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
1766 text_.Add(new (zone()) RegExpAtom(prefix), zone());
1767 char_vector = char_vector.SubVector(num_chars - 1, num_chars);
1768 }
1769 characters_ = NULL;
1770 atom = new (zone()) RegExpAtom(char_vector);
1771 FlushText();
1772 } else if (text_.length() > 0) {
1773 DCHECK(last_added_ == ADD_ATOM);
1774 atom = text_.RemoveLast();
1775 FlushText();
1776 } else if (terms_.length() > 0) {
1777 DCHECK(last_added_ == ADD_ATOM);
1778 atom = terms_.RemoveLast();
1779 // With /u, lookarounds are not quantifiable.
1780 if (unicode() && atom->IsLookaround()) return false;
1781 if (atom->max_match() == 0) {
1782 // Guaranteed to only match an empty string.
1783 LAST(ADD_TERM);
1784 if (min == 0) {
1785 return true;
1786 }
1787 terms_.Add(atom, zone());
1788 return true;
1789 }
1790 } else {
1791 // Only call immediately after adding an atom or character!
1792 UNREACHABLE();
1793 return false;
1794 }
1795 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
1796 zone());
1797 LAST(ADD_TERM);
1798 return true;
1799 }
1800
1801 } // namespace internal
1802 } // namespace v8
1803