1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/regexp/regexp-parser.h"
6
7 #include "src/base/small-vector.h"
8 #include "src/execution/isolate.h"
9 #include "src/objects/string-inl.h"
10 #include "src/regexp/property-sequences.h"
11 #include "src/regexp/regexp-ast.h"
12 #include "src/regexp/regexp-macro-assembler.h"
13 #include "src/regexp/regexp.h"
14 #include "src/strings/char-predicates-inl.h"
15 #include "src/utils/ostreams.h"
16 #include "src/utils/utils.h"
17 #include "src/zone/zone-allocator.h"
18 #include "src/zone/zone-list-inl.h"
19
20 #ifdef V8_INTL_SUPPORT
21 #include "unicode/uniset.h"
22 #endif // V8_INTL_SUPPORT
23
24 namespace v8 {
25 namespace internal {
26
27 namespace {
28
29 // Whether we're currently inside the ClassEscape production
30 // (tc39.es/ecma262/#prod-annexB-CharacterEscape).
31 enum class InClassEscapeState {
32 kInClass,
33 kNotInClass,
34 };
35
36 // Accumulates RegExp atoms and assertions into lists of terms and alternatives.
37 class RegExpBuilder {
38 public:
RegExpBuilder(Zone * zone,RegExpFlags flags)39 RegExpBuilder(Zone* zone, RegExpFlags flags)
40 : zone_(zone),
41 flags_(flags),
42 terms_(ZoneAllocator<RegExpTree*>{zone}),
43 text_(ZoneAllocator<RegExpTree*>{zone}),
44 alternatives_(ZoneAllocator<RegExpTree*>{zone}) {}
45 void AddCharacter(base::uc16 character);
46 void AddUnicodeCharacter(base::uc32 character);
47 void AddEscapedUnicodeCharacter(base::uc32 character);
48 // "Adds" an empty expression. Does nothing except consume a
49 // following quantifier
50 void AddEmpty();
51 void AddCharacterClass(RegExpCharacterClass* cc);
52 void AddCharacterClassForDesugaring(base::uc32 c);
53 void AddAtom(RegExpTree* tree);
54 void AddTerm(RegExpTree* tree);
55 void AddAssertion(RegExpTree* tree);
56 void NewAlternative(); // '|'
57 bool AddQuantifierToAtom(int min, int max,
58 RegExpQuantifier::QuantifierType type);
59 void FlushText();
60 RegExpTree* ToRegExp();
flags() const61 RegExpFlags flags() const { return flags_; }
62
ignore_case() const63 bool ignore_case() const { return IsIgnoreCase(flags_); }
multiline() const64 bool multiline() const { return IsMultiline(flags_); }
dotall() const65 bool dotall() const { return IsDotAll(flags_); }
66
67 private:
68 static const base::uc16 kNoPendingSurrogate = 0;
69 void AddLeadSurrogate(base::uc16 lead_surrogate);
70 void AddTrailSurrogate(base::uc16 trail_surrogate);
71 void FlushPendingSurrogate();
72 void FlushCharacters();
73 void FlushTerms();
74 bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
75 bool NeedsDesugaringForIgnoreCase(base::uc32 c);
zone() const76 Zone* zone() const { return zone_; }
unicode() const77 bool unicode() const { return IsUnicode(flags_); }
78
79 Zone* const zone_;
80 bool pending_empty_ = false;
81 const RegExpFlags flags_;
82 ZoneList<base::uc16>* characters_ = nullptr;
83 base::uc16 pending_surrogate_ = kNoPendingSurrogate;
84
85 using SmallRegExpTreeVector =
86 base::SmallVector<RegExpTree*, 8, ZoneAllocator<RegExpTree*>>;
87 SmallRegExpTreeVector terms_;
88 SmallRegExpTreeVector text_;
89 SmallRegExpTreeVector alternatives_;
90 #ifdef DEBUG
91 enum {
92 ADD_NONE,
93 ADD_CHAR,
94 ADD_TERM,
95 ADD_ASSERT,
96 ADD_ATOM
97 } last_added_ = ADD_NONE;
98 #define LAST(x) last_added_ = x;
99 #else
100 #define LAST(x)
101 #endif
102 };
103
104 enum SubexpressionType {
105 INITIAL,
106 CAPTURE, // All positive values represent captures.
107 POSITIVE_LOOKAROUND,
108 NEGATIVE_LOOKAROUND,
109 GROUPING
110 };
111
112 class RegExpParserState : public ZoneObject {
113 public:
114 // Push a state on the stack.
RegExpParserState(RegExpParserState * previous_state,SubexpressionType group_type,RegExpLookaround::Type lookaround_type,int disjunction_capture_index,const ZoneVector<base::uc16> * capture_name,RegExpFlags flags,Zone * zone)115 RegExpParserState(RegExpParserState* previous_state,
116 SubexpressionType group_type,
117 RegExpLookaround::Type lookaround_type,
118 int disjunction_capture_index,
119 const ZoneVector<base::uc16>* capture_name,
120 RegExpFlags flags, Zone* zone)
121 : previous_state_(previous_state),
122 builder_(zone, flags),
123 group_type_(group_type),
124 lookaround_type_(lookaround_type),
125 disjunction_capture_index_(disjunction_capture_index),
126 capture_name_(capture_name) {}
127 // Parser state of containing expression, if any.
previous_state() const128 RegExpParserState* previous_state() const { return previous_state_; }
IsSubexpression()129 bool IsSubexpression() { return previous_state_ != nullptr; }
130 // RegExpBuilder building this regexp's AST.
builder()131 RegExpBuilder* builder() { return &builder_; }
132 // Type of regexp being parsed (parenthesized group or entire regexp).
group_type() const133 SubexpressionType group_type() const { return group_type_; }
134 // Lookahead or Lookbehind.
lookaround_type() const135 RegExpLookaround::Type lookaround_type() const { return lookaround_type_; }
136 // Index in captures array of first capture in this sub-expression, if any.
137 // Also the capture index of this sub-expression itself, if group_type
138 // is CAPTURE.
capture_index() const139 int capture_index() const { return disjunction_capture_index_; }
140 // The name of the current sub-expression, if group_type is CAPTURE. Only
141 // used for named captures.
capture_name() const142 const ZoneVector<base::uc16>* capture_name() const { return capture_name_; }
143
IsNamedCapture() const144 bool IsNamedCapture() const { return capture_name_ != nullptr; }
145
146 // Check whether the parser is inside a capture group with the given index.
IsInsideCaptureGroup(int index) const147 bool IsInsideCaptureGroup(int index) const {
148 for (const RegExpParserState* s = this; s != nullptr;
149 s = s->previous_state()) {
150 if (s->group_type() != CAPTURE) continue;
151 // Return true if we found the matching capture index.
152 if (index == s->capture_index()) return true;
153 // Abort if index is larger than what has been parsed up till this state.
154 if (index > s->capture_index()) return false;
155 }
156 return false;
157 }
158
159 // Check whether the parser is inside a capture group with the given name.
IsInsideCaptureGroup(const ZoneVector<base::uc16> * name) const160 bool IsInsideCaptureGroup(const ZoneVector<base::uc16>* name) const {
161 DCHECK_NOT_NULL(name);
162 for (const RegExpParserState* s = this; s != nullptr;
163 s = s->previous_state()) {
164 if (s->capture_name() == nullptr) continue;
165 if (*s->capture_name() == *name) return true;
166 }
167 return false;
168 }
169
170 private:
171 // Linked list implementation of stack of states.
172 RegExpParserState* const previous_state_;
173 // Builder for the stored disjunction.
174 RegExpBuilder builder_;
175 // Stored disjunction type (capture, look-ahead or grouping), if any.
176 const SubexpressionType group_type_;
177 // Stored read direction.
178 const RegExpLookaround::Type lookaround_type_;
179 // Stored disjunction's capture index (if any).
180 const int disjunction_capture_index_;
181 // Stored capture name (if any).
182 const ZoneVector<base::uc16>* const capture_name_;
183 };
184
185 template <class CharT>
186 class RegExpParserImpl final {
187 private:
188 RegExpParserImpl(const CharT* input, int input_length, RegExpFlags flags,
189 uintptr_t stack_limit, Zone* zone,
190 const DisallowGarbageCollection& no_gc);
191
192 bool Parse(RegExpCompileData* result);
193
194 RegExpTree* ParsePattern();
195 RegExpTree* ParseDisjunction();
196 RegExpTree* ParseGroup();
197
198 // Parses a {...,...} quantifier and stores the range in the given
199 // out parameters.
200 bool ParseIntervalQuantifier(int* min_out, int* max_out);
201
202 // Checks whether the following is a length-digit hexadecimal number,
203 // and sets the value if it is.
204 bool ParseHexEscape(int length, base::uc32* value);
205 bool ParseUnicodeEscape(base::uc32* value);
206 bool ParseUnlimitedLengthHexNumber(int max_value, base::uc32* value);
207
208 bool ParsePropertyClassName(ZoneVector<char>* name_1,
209 ZoneVector<char>* name_2);
210 bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate,
211 const ZoneVector<char>& name_1,
212 const ZoneVector<char>& name_2);
213
214 RegExpTree* ParseCharacterClass(const RegExpBuilder* state);
215
216 base::uc32 ParseOctalLiteral();
217
218 // Tries to parse the input as a back reference. If successful it
219 // stores the result in the output parameter and returns true. If
220 // it fails it will push back the characters read so the same characters
221 // can be reparsed.
222 bool ParseBackReferenceIndex(int* index_out);
223
224 // Parse inside a class. Either add escaped class to the range, or return
225 // false and pass parsed single character through |char_out|.
226 void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone,
227 bool add_unicode_case_equivalents, base::uc32* char_out,
228 bool* is_class_escape);
229 // Returns true iff parsing was successful.
230 bool TryParseCharacterClassEscape(base::uc32 next,
231 InClassEscapeState in_class_escape_state,
232 ZoneList<CharacterRange>* ranges,
233 Zone* zone,
234 bool add_unicode_case_equivalents);
235 // Parses and returns a single escaped character.
236 base::uc32 ParseCharacterEscape(InClassEscapeState in_class_escape_state,
237 bool* is_escaped_unicode_character);
238
239 RegExpTree* ReportError(RegExpError error);
240 void Advance();
241 void Advance(int dist);
242 void RewindByOneCodepoint(); // Rewinds to before the previous Advance().
243 void Reset(int pos);
244
245 // Reports whether the pattern might be used as a literal search string.
246 // Only use if the result of the parse is a single atom node.
simple() const247 bool simple() const { return simple_; }
contains_anchor() const248 bool contains_anchor() const { return contains_anchor_; }
set_contains_anchor()249 void set_contains_anchor() { contains_anchor_ = true; }
captures_started() const250 int captures_started() const { return captures_started_; }
position() const251 int position() const { return next_pos_ - 1; }
failed() const252 bool failed() const { return failed_; }
unicode() const253 bool unicode() const { return IsUnicode(top_level_flags_) || force_unicode_; }
254
255 static bool IsSyntaxCharacterOrSlash(base::uc32 c);
256
257 static const base::uc32 kEndMarker = (1 << 21);
258
259 private:
260 // Return the 1-indexed RegExpCapture object, allocate if necessary.
261 RegExpCapture* GetCapture(int index);
262
263 // Creates a new named capture at the specified index. Must be called exactly
264 // once for each named capture. Fails if a capture with the same name is
265 // encountered.
266 bool CreateNamedCaptureAtIndex(const ZoneVector<base::uc16>* name, int index);
267
268 // Parses the name of a capture group (?<name>pattern). The name must adhere
269 // to IdentifierName in the ECMAScript standard.
270 const ZoneVector<base::uc16>* ParseCaptureGroupName();
271
272 bool ParseNamedBackReference(RegExpBuilder* builder,
273 RegExpParserState* state);
274 RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);
275
276 // After the initial parsing pass, patch corresponding RegExpCapture objects
277 // into all RegExpBackReferences. This is done after initial parsing in order
278 // to avoid complicating cases in which references comes before the capture.
279 void PatchNamedBackReferences();
280
281 ZoneVector<RegExpCapture*>* GetNamedCaptures() const;
282
283 // Returns true iff the pattern contains named captures. May call
284 // ScanForCaptures to look ahead at the remaining pattern.
285 bool HasNamedCaptures(InClassEscapeState in_class_escape_state);
286
zone() const287 Zone* zone() const { return zone_; }
288
current() const289 base::uc32 current() const { return current_; }
has_more() const290 bool has_more() const { return has_more_; }
has_next() const291 bool has_next() const { return next_pos_ < input_length(); }
292 base::uc32 Next();
293 template <bool update_position>
294 base::uc32 ReadNext();
InputAt(int index) const295 CharT InputAt(int index) const {
296 DCHECK(0 <= index && index < input_length());
297 return input_[index];
298 }
input_length() const299 int input_length() const { return input_length_; }
300 void ScanForCaptures(InClassEscapeState in_class_escape_state);
301
302 struct RegExpCaptureNameLess {
operator ()v8::internal::__anonfae8d4fb0111::RegExpParserImpl::RegExpCaptureNameLess303 bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const {
304 DCHECK_NOT_NULL(lhs);
305 DCHECK_NOT_NULL(rhs);
306 return *lhs->name() < *rhs->name();
307 }
308 };
309
310 class ForceUnicodeScope final {
311 public:
ForceUnicodeScope(RegExpParserImpl<CharT> * parser)312 explicit ForceUnicodeScope(RegExpParserImpl<CharT>* parser)
313 : parser_(parser) {
314 DCHECK(!parser_->force_unicode_);
315 parser_->force_unicode_ = true;
316 }
~ForceUnicodeScope()317 ~ForceUnicodeScope() {
318 DCHECK(parser_->force_unicode_);
319 parser_->force_unicode_ = false;
320 }
321
322 private:
323 RegExpParserImpl<CharT>* const parser_;
324 };
325
326 const DisallowGarbageCollection no_gc_;
327 Zone* const zone_;
328 RegExpError error_ = RegExpError::kNone;
329 int error_pos_ = 0;
330 ZoneList<RegExpCapture*>* captures_;
331 ZoneSet<RegExpCapture*, RegExpCaptureNameLess>* named_captures_;
332 ZoneList<RegExpBackReference*>* named_back_references_;
333 const CharT* const input_;
334 const int input_length_;
335 base::uc32 current_;
336 const RegExpFlags top_level_flags_;
337 bool force_unicode_ = false; // Force parser to act as if unicode were set.
338 int next_pos_;
339 int captures_started_;
340 int capture_count_; // Only valid after we have scanned for captures.
341 bool has_more_;
342 bool simple_;
343 bool contains_anchor_;
344 bool is_scanned_for_captures_;
345 bool has_named_captures_; // Only valid after we have scanned for captures.
346 bool failed_;
347 const uintptr_t stack_limit_;
348
349 friend bool RegExpParser::ParseRegExpFromHeapString(Isolate*, Zone*,
350 Handle<String>,
351 RegExpFlags,
352 RegExpCompileData*);
353 friend bool RegExpParser::VerifyRegExpSyntax<CharT>(
354 Zone*, uintptr_t, const CharT*, int, RegExpFlags, RegExpCompileData*,
355 const DisallowGarbageCollection&);
356 };
357
358 template <class CharT>
RegExpParserImpl(const CharT * input,int input_length,RegExpFlags flags,uintptr_t stack_limit,Zone * zone,const DisallowGarbageCollection & no_gc)359 RegExpParserImpl<CharT>::RegExpParserImpl(
360 const CharT* input, int input_length, RegExpFlags flags,
361 uintptr_t stack_limit, Zone* zone, const DisallowGarbageCollection& no_gc)
362 : zone_(zone),
363 captures_(nullptr),
364 named_captures_(nullptr),
365 named_back_references_(nullptr),
366 input_(input),
367 input_length_(input_length),
368 current_(kEndMarker),
369 top_level_flags_(flags),
370 next_pos_(0),
371 captures_started_(0),
372 capture_count_(0),
373 has_more_(true),
374 simple_(false),
375 contains_anchor_(false),
376 is_scanned_for_captures_(false),
377 has_named_captures_(false),
378 failed_(false),
379 stack_limit_(stack_limit) {
380 Advance();
381 }
382
383 template <>
384 template <bool update_position>
ReadNext()385 inline base::uc32 RegExpParserImpl<uint8_t>::ReadNext() {
386 int position = next_pos_;
387 base::uc16 c0 = InputAt(position);
388 position++;
389 DCHECK(!unibrow::Utf16::IsLeadSurrogate(c0));
390 if (update_position) next_pos_ = position;
391 return c0;
392 }
393
394 template <>
395 template <bool update_position>
ReadNext()396 inline base::uc32 RegExpParserImpl<base::uc16>::ReadNext() {
397 int position = next_pos_;
398 base::uc16 c0 = InputAt(position);
399 base::uc32 result = c0;
400 position++;
401 // Read the whole surrogate pair in case of unicode flag, if possible.
402 if (unicode() && position < input_length() &&
403 unibrow::Utf16::IsLeadSurrogate(c0)) {
404 base::uc16 c1 = InputAt(position);
405 if (unibrow::Utf16::IsTrailSurrogate(c1)) {
406 result = unibrow::Utf16::CombineSurrogatePair(c0, c1);
407 position++;
408 }
409 }
410 if (update_position) next_pos_ = position;
411 return result;
412 }
413
414 template <class CharT>
Next()415 base::uc32 RegExpParserImpl<CharT>::Next() {
416 if (has_next()) {
417 return ReadNext<false>();
418 } else {
419 return kEndMarker;
420 }
421 }
422
423 template <class CharT>
Advance()424 void RegExpParserImpl<CharT>::Advance() {
425 if (has_next()) {
426 if (GetCurrentStackPosition() < stack_limit_) {
427 if (FLAG_correctness_fuzzer_suppressions) {
428 FATAL("Aborting on stack overflow");
429 }
430 ReportError(RegExpError::kStackOverflow);
431 } else {
432 current_ = ReadNext<true>();
433 }
434 } else {
435 current_ = kEndMarker;
436 // Advance so that position() points to 1-after-the-last-character. This is
437 // important so that Reset() to this position works correctly.
438 next_pos_ = input_length() + 1;
439 has_more_ = false;
440 }
441 }
442
443 template <class CharT>
RewindByOneCodepoint()444 void RegExpParserImpl<CharT>::RewindByOneCodepoint() {
445 if (current() == kEndMarker) return;
446 // Rewinds by one code point, i.e.: two code units if `current` is outside
447 // the basic multilingual plane (= composed of a lead and trail surrogate),
448 // or one code unit otherwise.
449 const int rewind_by =
450 current() > unibrow::Utf16::kMaxNonSurrogateCharCode ? -2 : -1;
451 Advance(rewind_by); // Undo the last Advance.
452 }
453
454 template <class CharT>
Reset(int pos)455 void RegExpParserImpl<CharT>::Reset(int pos) {
456 next_pos_ = pos;
457 has_more_ = (pos < input_length());
458 Advance();
459 }
460
461 template <class CharT>
Advance(int dist)462 void RegExpParserImpl<CharT>::Advance(int dist) {
463 next_pos_ += dist - 1;
464 Advance();
465 }
466
467 template <class CharT>
IsSyntaxCharacterOrSlash(base::uc32 c)468 bool RegExpParserImpl<CharT>::IsSyntaxCharacterOrSlash(base::uc32 c) {
469 switch (c) {
470 case '^':
471 case '$':
472 case '\\':
473 case '.':
474 case '*':
475 case '+':
476 case '?':
477 case '(':
478 case ')':
479 case '[':
480 case ']':
481 case '{':
482 case '}':
483 case '|':
484 case '/':
485 return true;
486 default:
487 break;
488 }
489 return false;
490 }
491
492 template <class CharT>
ReportError(RegExpError error)493 RegExpTree* RegExpParserImpl<CharT>::ReportError(RegExpError error) {
494 if (failed_) return nullptr; // Do not overwrite any existing error.
495 failed_ = true;
496 error_ = error;
497 error_pos_ = position();
498 // Zip to the end to make sure no more input is read.
499 current_ = kEndMarker;
500 next_pos_ = input_length();
501 return nullptr;
502 }
503
504 #define CHECK_FAILED /**/); \
505 if (failed_) return nullptr; \
506 ((void)0
507
508 // Pattern ::
509 // Disjunction
510 template <class CharT>
ParsePattern()511 RegExpTree* RegExpParserImpl<CharT>::ParsePattern() {
512 RegExpTree* result = ParseDisjunction(CHECK_FAILED);
513 PatchNamedBackReferences(CHECK_FAILED);
514 DCHECK(!has_more());
515 // If the result of parsing is a literal string atom, and it has the
516 // same length as the input, then the atom is identical to the input.
517 if (result->IsAtom() && result->AsAtom()->length() == input_length()) {
518 simple_ = true;
519 }
520 return result;
521 }
522
523 // Disjunction ::
524 // Alternative
525 // Alternative | Disjunction
526 // Alternative ::
527 // [empty]
528 // Term Alternative
529 // Term ::
530 // Assertion
531 // Atom
532 // Atom Quantifier
533 template <class CharT>
ParseDisjunction()534 RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() {
535 // Used to store current state while parsing subexpressions.
536 RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD,
537 0, nullptr, top_level_flags_, zone());
538 RegExpParserState* state = &initial_state;
539 // Cache the builder in a local variable for quick access.
540 RegExpBuilder* builder = initial_state.builder();
541 while (true) {
542 switch (current()) {
543 case kEndMarker:
544 if (failed()) return nullptr; // E.g. the initial Advance failed.
545 if (state->IsSubexpression()) {
546 // Inside a parenthesized group when hitting end of input.
547 return ReportError(RegExpError::kUnterminatedGroup);
548 }
549 DCHECK_EQ(INITIAL, state->group_type());
550 // Parsing completed successfully.
551 return builder->ToRegExp();
552 case ')': {
553 if (!state->IsSubexpression()) {
554 return ReportError(RegExpError::kUnmatchedParen);
555 }
556 DCHECK_NE(INITIAL, state->group_type());
557
558 Advance();
559 // End disjunction parsing and convert builder content to new single
560 // regexp atom.
561 RegExpTree* body = builder->ToRegExp();
562
563 int end_capture_index = captures_started();
564
565 int capture_index = state->capture_index();
566 SubexpressionType group_type = state->group_type();
567
568 // Build result of subexpression.
569 if (group_type == CAPTURE) {
570 if (state->IsNamedCapture()) {
571 CreateNamedCaptureAtIndex(state->capture_name(),
572 capture_index CHECK_FAILED);
573 }
574 RegExpCapture* capture = GetCapture(capture_index);
575 capture->set_body(body);
576 body = capture;
577 } else if (group_type == GROUPING) {
578 body = zone()->template New<RegExpGroup>(body);
579 } else {
580 DCHECK(group_type == POSITIVE_LOOKAROUND ||
581 group_type == NEGATIVE_LOOKAROUND);
582 bool is_positive = (group_type == POSITIVE_LOOKAROUND);
583 body = zone()->template New<RegExpLookaround>(
584 body, is_positive, end_capture_index - capture_index,
585 capture_index, state->lookaround_type());
586 }
587
588 // Restore previous state.
589 state = state->previous_state();
590 builder = state->builder();
591
592 builder->AddAtom(body);
593 // For compatibility with JSC and ES3, we allow quantifiers after
594 // lookaheads, and break in all cases.
595 break;
596 }
597 case '|': {
598 Advance();
599 builder->NewAlternative();
600 continue;
601 }
602 case '*':
603 case '+':
604 case '?':
605 return ReportError(RegExpError::kNothingToRepeat);
606 case '^': {
607 Advance();
608 builder->AddAssertion(zone()->template New<RegExpAssertion>(
609 builder->multiline() ? RegExpAssertion::Type::START_OF_LINE
610 : RegExpAssertion::Type::START_OF_INPUT));
611 set_contains_anchor();
612 continue;
613 }
614 case '$': {
615 Advance();
616 RegExpAssertion::Type assertion_type =
617 builder->multiline() ? RegExpAssertion::Type::END_OF_LINE
618 : RegExpAssertion::Type::END_OF_INPUT;
619 builder->AddAssertion(
620 zone()->template New<RegExpAssertion>(assertion_type));
621 continue;
622 }
623 case '.': {
624 Advance();
625 ZoneList<CharacterRange>* ranges =
626 zone()->template New<ZoneList<CharacterRange>>(2, zone());
627
628 if (builder->dotall()) {
629 // Everything.
630 CharacterRange::AddClassEscape(StandardCharacterSet::kEverything,
631 ranges, false, zone());
632 } else {
633 // Everything except \x0A, \x0D, \u2028 and \u2029.
634 CharacterRange::AddClassEscape(
635 StandardCharacterSet::kNotLineTerminator, ranges, false, zone());
636 }
637
638 RegExpCharacterClass* cc =
639 zone()->template New<RegExpCharacterClass>(zone(), ranges);
640 builder->AddCharacterClass(cc);
641 break;
642 }
643 case '(': {
644 state = ParseOpenParenthesis(state CHECK_FAILED);
645 builder = state->builder();
646 continue;
647 }
648 case '[': {
649 RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED);
650 builder->AddCharacterClass(cc->AsCharacterClass());
651 break;
652 }
653 // Atom ::
654 // \ AtomEscape
655 case '\\':
656 switch (Next()) {
657 case kEndMarker:
658 return ReportError(RegExpError::kEscapeAtEndOfPattern);
659 // AtomEscape ::
660 // [+UnicodeMode] DecimalEscape
661 // [~UnicodeMode] DecimalEscape but only if the CapturingGroupNumber
662 // of DecimalEscape is ≤ NcapturingParens
663 // CharacterEscape (some cases of this mixed in too)
664 //
665 // TODO(jgruber): It may make sense to disentangle all the different
666 // cases and make the structure mirror the spec, e.g. for AtomEscape:
667 //
668 // if (TryParseDecimalEscape(...)) return;
669 // if (TryParseCharacterClassEscape(...)) return;
670 // if (TryParseCharacterEscape(...)) return;
671 // if (TryParseGroupName(...)) return;
672 case '1':
673 case '2':
674 case '3':
675 case '4':
676 case '5':
677 case '6':
678 case '7':
679 case '8':
680 case '9': {
681 int index = 0;
682 const bool is_backref =
683 ParseBackReferenceIndex(&index CHECK_FAILED);
684 if (is_backref) {
685 if (state->IsInsideCaptureGroup(index)) {
686 // The back reference is inside the capture group it refers to.
687 // Nothing can possibly have been captured yet, so we use empty
688 // instead. This ensures that, when checking a back reference,
689 // the capture registers of the referenced capture are either
690 // both set or both cleared.
691 builder->AddEmpty();
692 } else {
693 RegExpCapture* capture = GetCapture(index);
694 RegExpTree* atom = zone()->template New<RegExpBackReference>(
695 capture, builder->flags());
696 builder->AddAtom(atom);
697 }
698 break;
699 }
700 // With /u, no identity escapes except for syntax characters
701 // are allowed. Otherwise, all identity escapes are allowed.
702 if (unicode()) {
703 return ReportError(RegExpError::kInvalidEscape);
704 }
705 base::uc32 first_digit = Next();
706 if (first_digit == '8' || first_digit == '9') {
707 builder->AddCharacter(first_digit);
708 Advance(2);
709 break;
710 }
711 V8_FALLTHROUGH;
712 }
713 case '0': {
714 Advance();
715 if (unicode() && Next() >= '0' && Next() <= '9') {
716 // With /u, decimal escape with leading 0 are not parsed as octal.
717 return ReportError(RegExpError::kInvalidDecimalEscape);
718 }
719 base::uc32 octal = ParseOctalLiteral();
720 builder->AddCharacter(octal);
721 break;
722 }
723 case 'b':
724 Advance(2);
725 builder->AddAssertion(zone()->template New<RegExpAssertion>(
726 RegExpAssertion::Type::BOUNDARY));
727 continue;
728 case 'B':
729 Advance(2);
730 builder->AddAssertion(zone()->template New<RegExpAssertion>(
731 RegExpAssertion::Type::NON_BOUNDARY));
732 continue;
733 // AtomEscape ::
734 // CharacterClassEscape
735 case 'd':
736 case 'D':
737 case 's':
738 case 'S':
739 case 'w':
740 case 'W':
741 case 'p':
742 case 'P': {
743 base::uc32 next = Next();
744 ZoneList<CharacterRange>* ranges =
745 zone()->template New<ZoneList<CharacterRange>>(2, zone());
746 bool add_unicode_case_equivalents =
747 unicode() && builder->ignore_case();
748 bool parsed_character_class_escape = TryParseCharacterClassEscape(
749 next, InClassEscapeState::kNotInClass, ranges, zone(),
750 add_unicode_case_equivalents CHECK_FAILED);
751
752 if (parsed_character_class_escape) {
753 RegExpCharacterClass* cc =
754 zone()->template New<RegExpCharacterClass>(zone(), ranges);
755 builder->AddCharacterClass(cc);
756 } else {
757 CHECK(!unicode());
758 Advance(2);
759 builder->AddCharacter(next); // IdentityEscape.
760 }
761 break;
762 }
763 // AtomEscape ::
764 // k GroupName
765 case 'k': {
766 // Either an identity escape or a named back-reference. The two
767 // interpretations are mutually exclusive: '\k' is interpreted as
768 // an identity escape for non-Unicode patterns without named
769 // capture groups, and as the beginning of a named back-reference
770 // in all other cases.
771 const bool has_named_captures =
772 HasNamedCaptures(InClassEscapeState::kNotInClass CHECK_FAILED);
773 if (unicode() || has_named_captures) {
774 Advance(2);
775 ParseNamedBackReference(builder, state CHECK_FAILED);
776 break;
777 }
778 }
779 V8_FALLTHROUGH;
780 // AtomEscape ::
781 // CharacterEscape
782 default: {
783 bool is_escaped_unicode_character = false;
784 base::uc32 c = ParseCharacterEscape(
785 InClassEscapeState::kNotInClass,
786 &is_escaped_unicode_character CHECK_FAILED);
787 if (is_escaped_unicode_character) {
788 builder->AddEscapedUnicodeCharacter(c);
789 } else {
790 builder->AddCharacter(c);
791 }
792 break;
793 }
794 }
795 break;
796 case '{': {
797 int dummy;
798 bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
799 if (parsed) return ReportError(RegExpError::kNothingToRepeat);
800 V8_FALLTHROUGH;
801 }
802 case '}':
803 case ']':
804 if (unicode()) {
805 return ReportError(RegExpError::kLoneQuantifierBrackets);
806 }
807 V8_FALLTHROUGH;
808 default:
809 builder->AddUnicodeCharacter(current());
810 Advance();
811 break;
812 } // end switch(current())
813
814 int min;
815 int max;
816 switch (current()) {
817 // QuantifierPrefix ::
818 // *
819 // +
820 // ?
821 // {
822 case '*':
823 min = 0;
824 max = RegExpTree::kInfinity;
825 Advance();
826 break;
827 case '+':
828 min = 1;
829 max = RegExpTree::kInfinity;
830 Advance();
831 break;
832 case '?':
833 min = 0;
834 max = 1;
835 Advance();
836 break;
837 case '{':
838 if (ParseIntervalQuantifier(&min, &max)) {
839 if (max < min) {
840 return ReportError(RegExpError::kRangeOutOfOrder);
841 }
842 break;
843 } else if (unicode()) {
844 // With /u, incomplete quantifiers are not allowed.
845 return ReportError(RegExpError::kIncompleteQuantifier);
846 }
847 continue;
848 default:
849 continue;
850 }
851 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
852 if (current() == '?') {
853 quantifier_type = RegExpQuantifier::NON_GREEDY;
854 Advance();
855 } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
856 // FLAG_regexp_possessive_quantifier is a debug-only flag.
857 quantifier_type = RegExpQuantifier::POSSESSIVE;
858 Advance();
859 }
860 if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
861 return ReportError(RegExpError::kInvalidQuantifier);
862 }
863 }
864 }
865
866 template <class CharT>
ParseOpenParenthesis(RegExpParserState * state)867 RegExpParserState* RegExpParserImpl<CharT>::ParseOpenParenthesis(
868 RegExpParserState* state) {
869 RegExpLookaround::Type lookaround_type = state->lookaround_type();
870 bool is_named_capture = false;
871 const ZoneVector<base::uc16>* capture_name = nullptr;
872 SubexpressionType subexpr_type = CAPTURE;
873 Advance();
874 if (current() == '?') {
875 switch (Next()) {
876 case ':':
877 Advance(2);
878 subexpr_type = GROUPING;
879 break;
880 case '=':
881 Advance(2);
882 lookaround_type = RegExpLookaround::LOOKAHEAD;
883 subexpr_type = POSITIVE_LOOKAROUND;
884 break;
885 case '!':
886 Advance(2);
887 lookaround_type = RegExpLookaround::LOOKAHEAD;
888 subexpr_type = NEGATIVE_LOOKAROUND;
889 break;
890 case '<':
891 Advance();
892 if (Next() == '=') {
893 Advance(2);
894 lookaround_type = RegExpLookaround::LOOKBEHIND;
895 subexpr_type = POSITIVE_LOOKAROUND;
896 break;
897 } else if (Next() == '!') {
898 Advance(2);
899 lookaround_type = RegExpLookaround::LOOKBEHIND;
900 subexpr_type = NEGATIVE_LOOKAROUND;
901 break;
902 }
903 is_named_capture = true;
904 has_named_captures_ = true;
905 Advance();
906 break;
907 default:
908 ReportError(RegExpError::kInvalidGroup);
909 return nullptr;
910 }
911 }
912 if (subexpr_type == CAPTURE) {
913 if (captures_started_ >= RegExpMacroAssembler::kMaxRegisterCount) {
914 ReportError(RegExpError::kTooManyCaptures);
915 return nullptr;
916 }
917 captures_started_++;
918
919 if (is_named_capture) {
920 capture_name = ParseCaptureGroupName(CHECK_FAILED);
921 }
922 }
923 // Store current state and begin new disjunction parsing.
924 return zone()->template New<RegExpParserState>(
925 state, subexpr_type, lookaround_type, captures_started_, capture_name,
926 state->builder()->flags(), zone());
927 }
928
929 #ifdef DEBUG
930 namespace {
931
IsSpecialClassEscape(base::uc32 c)932 bool IsSpecialClassEscape(base::uc32 c) {
933 switch (c) {
934 case 'd':
935 case 'D':
936 case 's':
937 case 'S':
938 case 'w':
939 case 'W':
940 return true;
941 default:
942 return false;
943 }
944 }
945
946 } // namespace
947 #endif
948
949 // In order to know whether an escape is a backreference or not we have to scan
950 // the entire regexp and find the number of capturing parentheses. However we
951 // don't want to scan the regexp twice unless it is necessary. This mini-parser
952 // is called when needed. It can see the difference between capturing and
953 // noncapturing parentheses and can skip character classes and backslash-escaped
954 // characters.
955 //
956 // Important: The scanner has to be in a consistent state when calling
957 // ScanForCaptures, e.g. not in the middle of an escape sequence '\['.
958 template <class CharT>
ScanForCaptures(InClassEscapeState in_class_escape_state)959 void RegExpParserImpl<CharT>::ScanForCaptures(
960 InClassEscapeState in_class_escape_state) {
961 DCHECK(!is_scanned_for_captures_);
962 const int saved_position = position();
963 // Start with captures started previous to current position
964 int capture_count = captures_started();
965 // When we start inside a character class, skip everything inside the class.
966 if (in_class_escape_state == InClassEscapeState::kInClass) {
967 int c;
968 while ((c = current()) != kEndMarker) {
969 Advance();
970 if (c == '\\') {
971 Advance();
972 } else {
973 if (c == ']') break;
974 }
975 }
976 }
977 // Add count of captures after this position.
978 int n;
979 while ((n = current()) != kEndMarker) {
980 Advance();
981 switch (n) {
982 case '\\':
983 Advance();
984 break;
985 case '[': {
986 int c;
987 while ((c = current()) != kEndMarker) {
988 Advance();
989 if (c == '\\') {
990 Advance();
991 } else {
992 if (c == ']') break;
993 }
994 }
995 break;
996 }
997 case '(':
998 if (current() == '?') {
999 // At this point we could be in
1000 // * a non-capturing group '(:',
1001 // * a lookbehind assertion '(?<=' '(?<!'
1002 // * or a named capture '(?<'.
1003 //
1004 // Of these, only named captures are capturing groups.
1005
1006 Advance();
1007 if (current() != '<') break;
1008
1009 Advance();
1010 if (current() == '=' || current() == '!') break;
1011
1012 // Found a possible named capture. It could turn out to be a syntax
1013 // error (e.g. an unterminated or invalid name), but that distinction
1014 // does not matter for our purposes.
1015 has_named_captures_ = true;
1016 }
1017 capture_count++;
1018 break;
1019 }
1020 }
1021 capture_count_ = capture_count;
1022 is_scanned_for_captures_ = true;
1023 Reset(saved_position);
1024 }
1025
1026 template <class CharT>
ParseBackReferenceIndex(int * index_out)1027 bool RegExpParserImpl<CharT>::ParseBackReferenceIndex(int* index_out) {
1028 DCHECK_EQ('\\', current());
1029 DCHECK('1' <= Next() && Next() <= '9');
1030 // Try to parse a decimal literal that is no greater than the total number
1031 // of left capturing parentheses in the input.
1032 int start = position();
1033 int value = Next() - '0';
1034 Advance(2);
1035 while (true) {
1036 base::uc32 c = current();
1037 if (IsDecimalDigit(c)) {
1038 value = 10 * value + (c - '0');
1039 if (value > RegExpMacroAssembler::kMaxRegisterCount) {
1040 Reset(start);
1041 return false;
1042 }
1043 Advance();
1044 } else {
1045 break;
1046 }
1047 }
1048 if (value > captures_started()) {
1049 if (!is_scanned_for_captures_)
1050 ScanForCaptures(InClassEscapeState::kNotInClass);
1051 if (value > capture_count_) {
1052 Reset(start);
1053 return false;
1054 }
1055 }
1056 *index_out = value;
1057 return true;
1058 }
1059
1060 namespace {
1061
push_code_unit(ZoneVector<base::uc16> * v,uint32_t code_unit)1062 void push_code_unit(ZoneVector<base::uc16>* v, uint32_t code_unit) {
1063 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
1064 v->push_back(code_unit);
1065 } else {
1066 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
1067 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
1068 }
1069 }
1070
1071 } // namespace
1072
1073 template <class CharT>
ParseCaptureGroupName()1074 const ZoneVector<base::uc16>* RegExpParserImpl<CharT>::ParseCaptureGroupName() {
1075 // Due to special Advance requirements (see the next comment), rewind by one
1076 // such that names starting with a surrogate pair are parsed correctly for
1077 // patterns where the unicode flag is unset.
1078 //
1079 // Note that we use this odd pattern of rewinding the last advance in order
1080 // to adhere to the common parser behavior of expecting `current` to point at
1081 // the first candidate character for a function (e.g. when entering ParseFoo,
1082 // `current` should point at the first character of Foo).
1083 RewindByOneCodepoint();
1084
1085 ZoneVector<base::uc16>* name =
1086 zone()->template New<ZoneVector<base::uc16>>(zone());
1087
1088 {
1089 // Advance behavior inside this function is tricky since
1090 // RegExpIdentifierName explicitly enables unicode (in spec terms, sets +U)
1091 // and thus allows surrogate pairs and \u{}-style escapes even in
1092 // non-unicode patterns. Therefore Advance within the capture group name
1093 // has to force-enable unicode, and outside the name revert to default
1094 // behavior.
1095 ForceUnicodeScope force_unicode(this);
1096
1097 bool at_start = true;
1098 while (true) {
1099 Advance();
1100 base::uc32 c = current();
1101
1102 // Convert unicode escapes.
1103 if (c == '\\' && Next() == 'u') {
1104 Advance(2);
1105 if (!ParseUnicodeEscape(&c)) {
1106 ReportError(RegExpError::kInvalidUnicodeEscape);
1107 return nullptr;
1108 }
1109 RewindByOneCodepoint();
1110 }
1111
1112 // The backslash char is misclassified as both ID_Start and ID_Continue.
1113 if (c == '\\') {
1114 ReportError(RegExpError::kInvalidCaptureGroupName);
1115 return nullptr;
1116 }
1117
1118 if (at_start) {
1119 if (!IsIdentifierStart(c)) {
1120 ReportError(RegExpError::kInvalidCaptureGroupName);
1121 return nullptr;
1122 }
1123 push_code_unit(name, c);
1124 at_start = false;
1125 } else {
1126 if (c == '>') {
1127 break;
1128 } else if (IsIdentifierPart(c)) {
1129 push_code_unit(name, c);
1130 } else {
1131 ReportError(RegExpError::kInvalidCaptureGroupName);
1132 return nullptr;
1133 }
1134 }
1135 }
1136 }
1137
1138 // This final advance goes back into the state of pointing at the next
1139 // relevant char, which the rest of the parser expects. See also the previous
1140 // comments in this function.
1141 Advance();
1142 return name;
1143 }
1144
1145 template <class CharT>
CreateNamedCaptureAtIndex(const ZoneVector<base::uc16> * name,int index)1146 bool RegExpParserImpl<CharT>::CreateNamedCaptureAtIndex(
1147 const ZoneVector<base::uc16>* name, int index) {
1148 DCHECK(0 < index && index <= captures_started_);
1149 DCHECK_NOT_NULL(name);
1150
1151 RegExpCapture* capture = GetCapture(index);
1152 DCHECK_NULL(capture->name());
1153
1154 capture->set_name(name);
1155
1156 if (named_captures_ == nullptr) {
1157 named_captures_ =
1158 zone_->template New<ZoneSet<RegExpCapture*, RegExpCaptureNameLess>>(
1159 zone());
1160 } else {
1161 // Check for duplicates and bail if we find any.
1162
1163 const auto& named_capture_it = named_captures_->find(capture);
1164 if (named_capture_it != named_captures_->end()) {
1165 ReportError(RegExpError::kDuplicateCaptureGroupName);
1166 return false;
1167 }
1168 }
1169
1170 named_captures_->emplace(capture);
1171
1172 return true;
1173 }
1174
1175 template <class CharT>
ParseNamedBackReference(RegExpBuilder * builder,RegExpParserState * state)1176 bool RegExpParserImpl<CharT>::ParseNamedBackReference(
1177 RegExpBuilder* builder, RegExpParserState* state) {
1178 // The parser is assumed to be on the '<' in \k<name>.
1179 if (current() != '<') {
1180 ReportError(RegExpError::kInvalidNamedReference);
1181 return false;
1182 }
1183
1184 Advance();
1185 const ZoneVector<base::uc16>* name = ParseCaptureGroupName();
1186 if (name == nullptr) {
1187 return false;
1188 }
1189
1190 if (state->IsInsideCaptureGroup(name)) {
1191 builder->AddEmpty();
1192 } else {
1193 RegExpBackReference* atom =
1194 zone()->template New<RegExpBackReference>(builder->flags());
1195 atom->set_name(name);
1196
1197 builder->AddAtom(atom);
1198
1199 if (named_back_references_ == nullptr) {
1200 named_back_references_ =
1201 zone()->template New<ZoneList<RegExpBackReference*>>(1, zone());
1202 }
1203 named_back_references_->Add(atom, zone());
1204 }
1205
1206 return true;
1207 }
1208
1209 template <class CharT>
PatchNamedBackReferences()1210 void RegExpParserImpl<CharT>::PatchNamedBackReferences() {
1211 if (named_back_references_ == nullptr) return;
1212
1213 if (named_captures_ == nullptr) {
1214 ReportError(RegExpError::kInvalidNamedCaptureReference);
1215 return;
1216 }
1217
1218 // Look up and patch the actual capture for each named back reference.
1219
1220 for (int i = 0; i < named_back_references_->length(); i++) {
1221 RegExpBackReference* ref = named_back_references_->at(i);
1222
1223 // Capture used to search the named_captures_ by name, index of the
1224 // capture is never used.
1225 static const int kInvalidIndex = 0;
1226 RegExpCapture* search_capture =
1227 zone()->template New<RegExpCapture>(kInvalidIndex);
1228 DCHECK_NULL(search_capture->name());
1229 search_capture->set_name(ref->name());
1230
1231 int index = -1;
1232 const auto& capture_it = named_captures_->find(search_capture);
1233 if (capture_it != named_captures_->end()) {
1234 index = (*capture_it)->index();
1235 } else {
1236 ReportError(RegExpError::kInvalidNamedCaptureReference);
1237 return;
1238 }
1239
1240 ref->set_capture(GetCapture(index));
1241 }
1242 }
1243
1244 template <class CharT>
GetCapture(int index)1245 RegExpCapture* RegExpParserImpl<CharT>::GetCapture(int index) {
1246 // The index for the capture groups are one-based. Its index in the list is
1247 // zero-based.
1248 const int known_captures =
1249 is_scanned_for_captures_ ? capture_count_ : captures_started_;
1250 DCHECK(index <= known_captures);
1251 if (captures_ == nullptr) {
1252 captures_ =
1253 zone()->template New<ZoneList<RegExpCapture*>>(known_captures, zone());
1254 }
1255 while (captures_->length() < known_captures) {
1256 captures_->Add(zone()->template New<RegExpCapture>(captures_->length() + 1),
1257 zone());
1258 }
1259 return captures_->at(index - 1);
1260 }
1261
1262 template <class CharT>
GetNamedCaptures() const1263 ZoneVector<RegExpCapture*>* RegExpParserImpl<CharT>::GetNamedCaptures() const {
1264 if (named_captures_ == nullptr || named_captures_->empty()) {
1265 return nullptr;
1266 }
1267
1268 return zone()->template New<ZoneVector<RegExpCapture*>>(
1269 named_captures_->begin(), named_captures_->end(), zone());
1270 }
1271
1272 template <class CharT>
HasNamedCaptures(InClassEscapeState in_class_escape_state)1273 bool RegExpParserImpl<CharT>::HasNamedCaptures(
1274 InClassEscapeState in_class_escape_state) {
1275 if (has_named_captures_ || is_scanned_for_captures_) {
1276 return has_named_captures_;
1277 }
1278
1279 ScanForCaptures(in_class_escape_state);
1280 DCHECK(is_scanned_for_captures_);
1281 return has_named_captures_;
1282 }
1283
1284 // QuantifierPrefix ::
1285 // { DecimalDigits }
1286 // { DecimalDigits , }
1287 // { DecimalDigits , DecimalDigits }
1288 //
1289 // Returns true if parsing succeeds, and set the min_out and max_out
1290 // values. Values are truncated to RegExpTree::kInfinity if they overflow.
1291 template <class CharT>
ParseIntervalQuantifier(int * min_out,int * max_out)1292 bool RegExpParserImpl<CharT>::ParseIntervalQuantifier(int* min_out,
1293 int* max_out) {
1294 DCHECK_EQ(current(), '{');
1295 int start = position();
1296 Advance();
1297 int min = 0;
1298 if (!IsDecimalDigit(current())) {
1299 Reset(start);
1300 return false;
1301 }
1302 while (IsDecimalDigit(current())) {
1303 int next = current() - '0';
1304 if (min > (RegExpTree::kInfinity - next) / 10) {
1305 // Overflow. Skip past remaining decimal digits and return -1.
1306 do {
1307 Advance();
1308 } while (IsDecimalDigit(current()));
1309 min = RegExpTree::kInfinity;
1310 break;
1311 }
1312 min = 10 * min + next;
1313 Advance();
1314 }
1315 int max = 0;
1316 if (current() == '}') {
1317 max = min;
1318 Advance();
1319 } else if (current() == ',') {
1320 Advance();
1321 if (current() == '}') {
1322 max = RegExpTree::kInfinity;
1323 Advance();
1324 } else {
1325 while (IsDecimalDigit(current())) {
1326 int next = current() - '0';
1327 if (max > (RegExpTree::kInfinity - next) / 10) {
1328 do {
1329 Advance();
1330 } while (IsDecimalDigit(current()));
1331 max = RegExpTree::kInfinity;
1332 break;
1333 }
1334 max = 10 * max + next;
1335 Advance();
1336 }
1337 if (current() != '}') {
1338 Reset(start);
1339 return false;
1340 }
1341 Advance();
1342 }
1343 } else {
1344 Reset(start);
1345 return false;
1346 }
1347 *min_out = min;
1348 *max_out = max;
1349 return true;
1350 }
1351
1352 template <class CharT>
ParseOctalLiteral()1353 base::uc32 RegExpParserImpl<CharT>::ParseOctalLiteral() {
1354 DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
1355 // For compatibility with some other browsers (not all), we parse
1356 // up to three octal digits with a value below 256.
1357 // ES#prod-annexB-LegacyOctalEscapeSequence
1358 base::uc32 value = current() - '0';
1359 Advance();
1360 if ('0' <= current() && current() <= '7') {
1361 value = value * 8 + current() - '0';
1362 Advance();
1363 if (value < 32 && '0' <= current() && current() <= '7') {
1364 value = value * 8 + current() - '0';
1365 Advance();
1366 }
1367 }
1368 return value;
1369 }
1370
1371 template <class CharT>
ParseHexEscape(int length,base::uc32 * value)1372 bool RegExpParserImpl<CharT>::ParseHexEscape(int length, base::uc32* value) {
1373 int start = position();
1374 base::uc32 val = 0;
1375 for (int i = 0; i < length; ++i) {
1376 base::uc32 c = current();
1377 int d = base::HexValue(c);
1378 if (d < 0) {
1379 Reset(start);
1380 return false;
1381 }
1382 val = val * 16 + d;
1383 Advance();
1384 }
1385 *value = val;
1386 return true;
1387 }
1388
1389 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
1390 template <class CharT>
ParseUnicodeEscape(base::uc32 * value)1391 bool RegExpParserImpl<CharT>::ParseUnicodeEscape(base::uc32* value) {
1392 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
1393 // allowed). In the latter case, the number of hex digits between { } is
1394 // arbitrary. \ and u have already been read.
1395 if (current() == '{' && unicode()) {
1396 int start = position();
1397 Advance();
1398 if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) {
1399 if (current() == '}') {
1400 Advance();
1401 return true;
1402 }
1403 }
1404 Reset(start);
1405 return false;
1406 }
1407 // \u but no {, or \u{...} escapes not allowed.
1408 bool result = ParseHexEscape(4, value);
1409 if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
1410 current() == '\\') {
1411 // Attempt to read trail surrogate.
1412 int start = position();
1413 if (Next() == 'u') {
1414 Advance(2);
1415 base::uc32 trail;
1416 if (ParseHexEscape(4, &trail) &&
1417 unibrow::Utf16::IsTrailSurrogate(trail)) {
1418 *value = unibrow::Utf16::CombineSurrogatePair(
1419 static_cast<base::uc16>(*value), static_cast<base::uc16>(trail));
1420 return true;
1421 }
1422 }
1423 Reset(start);
1424 }
1425 return result;
1426 }
1427
1428 #ifdef V8_INTL_SUPPORT
1429
1430 namespace {
1431
IsExactPropertyAlias(const char * property_name,UProperty property)1432 bool IsExactPropertyAlias(const char* property_name, UProperty property) {
1433 const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
1434 if (short_name != nullptr && strcmp(property_name, short_name) == 0)
1435 return true;
1436 for (int i = 0;; i++) {
1437 const char* long_name = u_getPropertyName(
1438 property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1439 if (long_name == nullptr) break;
1440 if (strcmp(property_name, long_name) == 0) return true;
1441 }
1442 return false;
1443 }
1444
IsExactPropertyValueAlias(const char * property_value_name,UProperty property,int32_t property_value)1445 bool IsExactPropertyValueAlias(const char* property_value_name,
1446 UProperty property, int32_t property_value) {
1447 const char* short_name =
1448 u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
1449 if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) {
1450 return true;
1451 }
1452 for (int i = 0;; i++) {
1453 const char* long_name = u_getPropertyValueName(
1454 property, property_value,
1455 static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1456 if (long_name == nullptr) break;
1457 if (strcmp(property_value_name, long_name) == 0) return true;
1458 }
1459 return false;
1460 }
1461
LookupPropertyValueName(UProperty property,const char * property_value_name,bool negate,ZoneList<CharacterRange> * result,Zone * zone)1462 bool LookupPropertyValueName(UProperty property,
1463 const char* property_value_name, bool negate,
1464 ZoneList<CharacterRange>* result, Zone* zone) {
1465 UProperty property_for_lookup = property;
1466 if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
1467 // For the property Script_Extensions, we have to do the property value
1468 // name lookup as if the property is Script.
1469 property_for_lookup = UCHAR_SCRIPT;
1470 }
1471 int32_t property_value =
1472 u_getPropertyValueEnum(property_for_lookup, property_value_name);
1473 if (property_value == UCHAR_INVALID_CODE) return false;
1474
1475 // We require the property name to match exactly to one of the property value
1476 // aliases. However, u_getPropertyValueEnum uses loose matching.
1477 if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
1478 property_value)) {
1479 return false;
1480 }
1481
1482 UErrorCode ec = U_ZERO_ERROR;
1483 icu::UnicodeSet set;
1484 set.applyIntPropertyValue(property, property_value, ec);
1485 bool success = ec == U_ZERO_ERROR && !set.isEmpty();
1486
1487 if (success) {
1488 set.removeAllStrings();
1489 if (negate) set.complement();
1490 for (int i = 0; i < set.getRangeCount(); i++) {
1491 result->Add(
1492 CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
1493 zone);
1494 }
1495 }
1496 return success;
1497 }
1498
1499 template <size_t N>
NameEquals(const char * name,const char (& literal)[N])1500 inline bool NameEquals(const char* name, const char (&literal)[N]) {
1501 return strncmp(name, literal, N + 1) == 0;
1502 }
1503
LookupSpecialPropertyValueName(const char * name,ZoneList<CharacterRange> * result,bool negate,Zone * zone)1504 bool LookupSpecialPropertyValueName(const char* name,
1505 ZoneList<CharacterRange>* result,
1506 bool negate, Zone* zone) {
1507 if (NameEquals(name, "Any")) {
1508 if (negate) {
1509 // Leave the list of character ranges empty, since the negation of 'Any'
1510 // is the empty set.
1511 } else {
1512 result->Add(CharacterRange::Everything(), zone);
1513 }
1514 } else if (NameEquals(name, "ASCII")) {
1515 result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
1516 : CharacterRange::Range(0x0, 0x7F),
1517 zone);
1518 } else if (NameEquals(name, "Assigned")) {
1519 return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
1520 !negate, result, zone);
1521 } else {
1522 return false;
1523 }
1524 return true;
1525 }
1526
1527 // Explicitly allowlist supported binary properties. The spec forbids supporting
1528 // properties outside of this set to ensure interoperability.
IsSupportedBinaryProperty(UProperty property)1529 bool IsSupportedBinaryProperty(UProperty property) {
1530 switch (property) {
1531 case UCHAR_ALPHABETIC:
1532 // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName.
1533 // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName.
1534 case UCHAR_ASCII_HEX_DIGIT:
1535 // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName.
1536 case UCHAR_BIDI_CONTROL:
1537 case UCHAR_BIDI_MIRRORED:
1538 case UCHAR_CASE_IGNORABLE:
1539 case UCHAR_CASED:
1540 case UCHAR_CHANGES_WHEN_CASEFOLDED:
1541 case UCHAR_CHANGES_WHEN_CASEMAPPED:
1542 case UCHAR_CHANGES_WHEN_LOWERCASED:
1543 case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
1544 case UCHAR_CHANGES_WHEN_TITLECASED:
1545 case UCHAR_CHANGES_WHEN_UPPERCASED:
1546 case UCHAR_DASH:
1547 case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
1548 case UCHAR_DEPRECATED:
1549 case UCHAR_DIACRITIC:
1550 case UCHAR_EMOJI:
1551 case UCHAR_EMOJI_COMPONENT:
1552 case UCHAR_EMOJI_MODIFIER_BASE:
1553 case UCHAR_EMOJI_MODIFIER:
1554 case UCHAR_EMOJI_PRESENTATION:
1555 case UCHAR_EXTENDED_PICTOGRAPHIC:
1556 case UCHAR_EXTENDER:
1557 case UCHAR_GRAPHEME_BASE:
1558 case UCHAR_GRAPHEME_EXTEND:
1559 case UCHAR_HEX_DIGIT:
1560 case UCHAR_ID_CONTINUE:
1561 case UCHAR_ID_START:
1562 case UCHAR_IDEOGRAPHIC:
1563 case UCHAR_IDS_BINARY_OPERATOR:
1564 case UCHAR_IDS_TRINARY_OPERATOR:
1565 case UCHAR_JOIN_CONTROL:
1566 case UCHAR_LOGICAL_ORDER_EXCEPTION:
1567 case UCHAR_LOWERCASE:
1568 case UCHAR_MATH:
1569 case UCHAR_NONCHARACTER_CODE_POINT:
1570 case UCHAR_PATTERN_SYNTAX:
1571 case UCHAR_PATTERN_WHITE_SPACE:
1572 case UCHAR_QUOTATION_MARK:
1573 case UCHAR_RADICAL:
1574 case UCHAR_REGIONAL_INDICATOR:
1575 case UCHAR_S_TERM:
1576 case UCHAR_SOFT_DOTTED:
1577 case UCHAR_TERMINAL_PUNCTUATION:
1578 case UCHAR_UNIFIED_IDEOGRAPH:
1579 case UCHAR_UPPERCASE:
1580 case UCHAR_VARIATION_SELECTOR:
1581 case UCHAR_WHITE_SPACE:
1582 case UCHAR_XID_CONTINUE:
1583 case UCHAR_XID_START:
1584 return true;
1585 default:
1586 break;
1587 }
1588 return false;
1589 }
1590
IsUnicodePropertyValueCharacter(char c)1591 bool IsUnicodePropertyValueCharacter(char c) {
1592 // https://tc39.github.io/proposal-regexp-unicode-property-escapes/
1593 //
1594 // Note that using this to validate each parsed char is quite conservative.
1595 // A possible alternative solution would be to only ensure the parsed
1596 // property name/value candidate string does not contain '\0' characters and
1597 // let ICU lookups trigger the final failure.
1598 if ('a' <= c && c <= 'z') return true;
1599 if ('A' <= c && c <= 'Z') return true;
1600 if ('0' <= c && c <= '9') return true;
1601 return (c == '_');
1602 }
1603
1604 } // namespace
1605
1606 template <class CharT>
ParsePropertyClassName(ZoneVector<char> * name_1,ZoneVector<char> * name_2)1607 bool RegExpParserImpl<CharT>::ParsePropertyClassName(ZoneVector<char>* name_1,
1608 ZoneVector<char>* name_2) {
1609 DCHECK(name_1->empty());
1610 DCHECK(name_2->empty());
1611 // Parse the property class as follows:
1612 // - In \p{name}, 'name' is interpreted
1613 // - either as a general category property value name.
1614 // - or as a binary property name.
1615 // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
1616 // and 'value' is interpreted as one of the available property value names.
1617 // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
1618 // - Loose matching is not applied.
1619 if (current() == '{') {
1620 // Parse \p{[PropertyName=]PropertyNameValue}
1621 for (Advance(); current() != '}' && current() != '='; Advance()) {
1622 if (!IsUnicodePropertyValueCharacter(current())) return false;
1623 if (!has_next()) return false;
1624 name_1->push_back(static_cast<char>(current()));
1625 }
1626 if (current() == '=') {
1627 for (Advance(); current() != '}'; Advance()) {
1628 if (!IsUnicodePropertyValueCharacter(current())) return false;
1629 if (!has_next()) return false;
1630 name_2->push_back(static_cast<char>(current()));
1631 }
1632 name_2->push_back(0); // null-terminate string.
1633 }
1634 } else {
1635 return false;
1636 }
1637 Advance();
1638 name_1->push_back(0); // null-terminate string.
1639
1640 DCHECK(name_1->size() - 1 == std::strlen(name_1->data()));
1641 DCHECK(name_2->empty() || name_2->size() - 1 == std::strlen(name_2->data()));
1642 return true;
1643 }
1644
1645 template <class CharT>
AddPropertyClassRange(ZoneList<CharacterRange> * add_to,bool negate,const ZoneVector<char> & name_1,const ZoneVector<char> & name_2)1646 bool RegExpParserImpl<CharT>::AddPropertyClassRange(
1647 ZoneList<CharacterRange>* add_to, bool negate,
1648 const ZoneVector<char>& name_1, const ZoneVector<char>& name_2) {
1649 if (name_2.empty()) {
1650 // First attempt to interpret as general category property value name.
1651 const char* name = name_1.data();
1652 if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
1653 add_to, zone())) {
1654 return true;
1655 }
1656 // Interpret "Any", "ASCII", and "Assigned".
1657 if (LookupSpecialPropertyValueName(name, add_to, negate, zone())) {
1658 return true;
1659 }
1660 // Then attempt to interpret as binary property name with value name 'Y'.
1661 UProperty property = u_getPropertyEnum(name);
1662 if (!IsSupportedBinaryProperty(property)) return false;
1663 if (!IsExactPropertyAlias(name, property)) return false;
1664 return LookupPropertyValueName(property, negate ? "N" : "Y", false, add_to,
1665 zone());
1666 } else {
1667 // Both property name and value name are specified. Attempt to interpret
1668 // the property name as enumerated property.
1669 const char* property_name = name_1.data();
1670 const char* value_name = name_2.data();
1671 UProperty property = u_getPropertyEnum(property_name);
1672 if (!IsExactPropertyAlias(property_name, property)) return false;
1673 if (property == UCHAR_GENERAL_CATEGORY) {
1674 // We want to allow aggregate value names such as "Letter".
1675 property = UCHAR_GENERAL_CATEGORY_MASK;
1676 } else if (property != UCHAR_SCRIPT &&
1677 property != UCHAR_SCRIPT_EXTENSIONS) {
1678 return false;
1679 }
1680 return LookupPropertyValueName(property, value_name, negate, add_to,
1681 zone());
1682 }
1683 }
1684
1685 #else // V8_INTL_SUPPORT
1686
1687 template <class CharT>
ParsePropertyClassName(ZoneVector<char> * name_1,ZoneVector<char> * name_2)1688 bool RegExpParserImpl<CharT>::ParsePropertyClassName(ZoneVector<char>* name_1,
1689 ZoneVector<char>* name_2) {
1690 return false;
1691 }
1692
1693 template <class CharT>
AddPropertyClassRange(ZoneList<CharacterRange> * add_to,bool negate,const ZoneVector<char> & name_1,const ZoneVector<char> & name_2)1694 bool RegExpParserImpl<CharT>::AddPropertyClassRange(
1695 ZoneList<CharacterRange>* add_to, bool negate,
1696 const ZoneVector<char>& name_1, const ZoneVector<char>& name_2) {
1697 return false;
1698 }
1699
1700 #endif // V8_INTL_SUPPORT
1701
1702 template <class CharT>
ParseUnlimitedLengthHexNumber(int max_value,base::uc32 * value)1703 bool RegExpParserImpl<CharT>::ParseUnlimitedLengthHexNumber(int max_value,
1704 base::uc32* value) {
1705 base::uc32 x = 0;
1706 int d = base::HexValue(current());
1707 if (d < 0) {
1708 return false;
1709 }
1710 while (d >= 0) {
1711 x = x * 16 + d;
1712 if (x > static_cast<base::uc32>(max_value)) {
1713 return false;
1714 }
1715 Advance();
1716 d = base::HexValue(current());
1717 }
1718 *value = x;
1719 return true;
1720 }
1721
1722 // https://tc39.es/ecma262/#prod-CharacterEscape
1723 template <class CharT>
ParseCharacterEscape(InClassEscapeState in_class_escape_state,bool * is_escaped_unicode_character)1724 base::uc32 RegExpParserImpl<CharT>::ParseCharacterEscape(
1725 InClassEscapeState in_class_escape_state,
1726 bool* is_escaped_unicode_character) {
1727 DCHECK_EQ('\\', current());
1728 DCHECK(has_next() && !IsSpecialClassEscape(Next()));
1729
1730 Advance();
1731
1732 const base::uc32 c = current();
1733 switch (c) {
1734 // CharacterEscape ::
1735 // ControlEscape :: one of
1736 // f n r t v
1737 case 'f':
1738 Advance();
1739 return '\f';
1740 case 'n':
1741 Advance();
1742 return '\n';
1743 case 'r':
1744 Advance();
1745 return '\r';
1746 case 't':
1747 Advance();
1748 return '\t';
1749 case 'v':
1750 Advance();
1751 return '\v';
1752 // CharacterEscape ::
1753 // c ControlLetter
1754 case 'c': {
1755 base::uc32 controlLetter = Next();
1756 base::uc32 letter = controlLetter & ~('A' ^ 'a');
1757 if (letter >= 'A' && letter <= 'Z') {
1758 Advance(2);
1759 // Control letters mapped to ASCII control characters in the range
1760 // 0x00-0x1F.
1761 return controlLetter & 0x1F;
1762 }
1763 if (unicode()) {
1764 // With /u, invalid escapes are not treated as identity escapes.
1765 ReportError(RegExpError::kInvalidUnicodeEscape);
1766 return 0;
1767 }
1768 if (in_class_escape_state == InClassEscapeState::kInClass) {
1769 // Inside a character class, we also accept digits and underscore as
1770 // control characters, unless with /u. See Annex B:
1771 // ES#prod-annexB-ClassControlLetter
1772 if ((controlLetter >= '0' && controlLetter <= '9') ||
1773 controlLetter == '_') {
1774 Advance(2);
1775 return controlLetter & 0x1F;
1776 }
1777 }
1778 // We match JSC in reading the backslash as a literal
1779 // character instead of as starting an escape.
1780 return '\\';
1781 }
1782 // CharacterEscape ::
1783 // 0 [lookahead ∉ DecimalDigit]
1784 // [~UnicodeMode] LegacyOctalEscapeSequence
1785 case '0':
1786 // \0 is interpreted as NUL if not followed by another digit.
1787 if (Next() < '0' || Next() > '9') {
1788 Advance();
1789 return 0;
1790 }
1791 V8_FALLTHROUGH;
1792 case '1':
1793 case '2':
1794 case '3':
1795 case '4':
1796 case '5':
1797 case '6':
1798 case '7':
1799 // For compatibility, we interpret a decimal escape that isn't
1800 // a back reference (and therefore either \0 or not valid according
1801 // to the specification) as a 1..3 digit octal character code.
1802 // ES#prod-annexB-LegacyOctalEscapeSequence
1803 if (unicode()) {
1804 // With /u, decimal escape is not interpreted as octal character code.
1805 ReportError(RegExpError::kInvalidClassEscape);
1806 return 0;
1807 }
1808 return ParseOctalLiteral();
1809 // CharacterEscape ::
1810 // HexEscapeSequence
1811 case 'x': {
1812 Advance();
1813 base::uc32 value;
1814 if (ParseHexEscape(2, &value)) return value;
1815 if (unicode()) {
1816 // With /u, invalid escapes are not treated as identity escapes.
1817 ReportError(RegExpError::kInvalidEscape);
1818 return 0;
1819 }
1820 // If \x is not followed by a two-digit hexadecimal, treat it
1821 // as an identity escape.
1822 return 'x';
1823 }
1824 // CharacterEscape ::
1825 // RegExpUnicodeEscapeSequence [?UnicodeMode]
1826 case 'u': {
1827 Advance();
1828 base::uc32 value;
1829 if (ParseUnicodeEscape(&value)) {
1830 *is_escaped_unicode_character = true;
1831 return value;
1832 }
1833 if (unicode()) {
1834 // With /u, invalid escapes are not treated as identity escapes.
1835 ReportError(RegExpError::kInvalidUnicodeEscape);
1836 return 0;
1837 }
1838 // If \u is not followed by a two-digit hexadecimal, treat it
1839 // as an identity escape.
1840 return 'u';
1841 }
1842 default:
1843 break;
1844 }
1845
1846 // CharacterEscape ::
1847 // IdentityEscape[?UnicodeMode, ?N]
1848 //
1849 // * With /u, no identity escapes except for syntax characters are
1850 // allowed.
1851 // * Without /u:
1852 // * '\c' is not an IdentityEscape.
1853 // * '\k' is not an IdentityEscape when named captures exist.
1854 // * Otherwise, all identity escapes are allowed.
1855 if (unicode()) {
1856 if (!IsSyntaxCharacterOrSlash(c)) {
1857 ReportError(RegExpError::kInvalidEscape);
1858 return 0;
1859 }
1860 Advance();
1861 return c;
1862 }
1863 DCHECK(!unicode());
1864 if (c == 'c') {
1865 ReportError(RegExpError::kInvalidEscape);
1866 return 0;
1867 }
1868 Advance();
1869 // Note: It's important to Advance before the HasNamedCaptures call s.t. we
1870 // don't start scanning in the middle of an escape.
1871 if (c == 'k' && HasNamedCaptures(in_class_escape_state)) {
1872 ReportError(RegExpError::kInvalidEscape);
1873 return 0;
1874 }
1875 return c;
1876 }
1877
1878 // https://tc39.es/ecma262/#prod-ClassEscape
1879 template <class CharT>
ParseClassEscape(ZoneList<CharacterRange> * ranges,Zone * zone,bool add_unicode_case_equivalents,base::uc32 * char_out,bool * is_class_escape)1880 void RegExpParserImpl<CharT>::ParseClassEscape(
1881 ZoneList<CharacterRange>* ranges, Zone* zone,
1882 bool add_unicode_case_equivalents, base::uc32* char_out,
1883 bool* is_class_escape) {
1884 *is_class_escape = false;
1885
1886 if (current() != '\\') {
1887 // Not a ClassEscape.
1888 *char_out = current();
1889 Advance();
1890 return;
1891 }
1892
1893 const base::uc32 next = Next();
1894 switch (next) {
1895 case 'b':
1896 *char_out = '\b';
1897 Advance(2);
1898 return;
1899 case '-':
1900 if (unicode()) {
1901 *char_out = next;
1902 Advance(2);
1903 return;
1904 }
1905 break;
1906 case kEndMarker:
1907 ReportError(RegExpError::kEscapeAtEndOfPattern);
1908 return;
1909 default:
1910 break;
1911 }
1912
1913 static constexpr InClassEscapeState kInClassEscape =
1914 InClassEscapeState::kInClass;
1915 *is_class_escape = TryParseCharacterClassEscape(
1916 next, kInClassEscape, ranges, zone, add_unicode_case_equivalents);
1917 if (*is_class_escape) return;
1918
1919 bool dummy = false; // Unused.
1920 *char_out = ParseCharacterEscape(kInClassEscape, &dummy);
1921 }
1922
1923 // https://tc39.es/ecma262/#prod-CharacterClassEscape
1924 template <class CharT>
TryParseCharacterClassEscape(base::uc32 next,InClassEscapeState in_class_escape_state,ZoneList<CharacterRange> * ranges,Zone * zone,bool add_unicode_case_equivalents)1925 bool RegExpParserImpl<CharT>::TryParseCharacterClassEscape(
1926 base::uc32 next, InClassEscapeState in_class_escape_state,
1927 ZoneList<CharacterRange>* ranges, Zone* zone,
1928 bool add_unicode_case_equivalents) {
1929 DCHECK_EQ(current(), '\\');
1930 DCHECK_EQ(Next(), next);
1931
1932 switch (next) {
1933 case 'd':
1934 case 'D':
1935 case 's':
1936 case 'S':
1937 case 'w':
1938 case 'W':
1939 CharacterRange::AddClassEscape(static_cast<StandardCharacterSet>(next),
1940 ranges, add_unicode_case_equivalents,
1941 zone);
1942 Advance(2);
1943 return true;
1944 case 'p':
1945 case 'P': {
1946 if (!unicode()) return false;
1947 bool negate = next == 'P';
1948 Advance(2);
1949 ZoneVector<char> name_1(zone);
1950 ZoneVector<char> name_2(zone);
1951 if (!ParsePropertyClassName(&name_1, &name_2) ||
1952 !AddPropertyClassRange(ranges, negate, name_1, name_2)) {
1953 ReportError(in_class_escape_state == InClassEscapeState::kInClass
1954 ? RegExpError::kInvalidClassPropertyName
1955 : RegExpError::kInvalidPropertyName);
1956 }
1957 return true;
1958 }
1959 default:
1960 return false;
1961 }
1962 }
1963
1964 template <class CharT>
ParseCharacterClass(const RegExpBuilder * builder)1965 RegExpTree* RegExpParserImpl<CharT>::ParseCharacterClass(
1966 const RegExpBuilder* builder) {
1967 DCHECK_EQ(current(), '[');
1968 Advance();
1969 bool is_negated = false;
1970 if (current() == '^') {
1971 is_negated = true;
1972 Advance();
1973 }
1974 ZoneList<CharacterRange>* ranges =
1975 zone()->template New<ZoneList<CharacterRange>>(2, zone());
1976 bool add_unicode_case_equivalents = unicode() && builder->ignore_case();
1977 while (has_more() && current() != ']') {
1978 base::uc32 char_1, char_2;
1979 bool is_class_1, is_class_2;
1980 ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1,
1981 &is_class_1 CHECK_FAILED);
1982 if (current() == '-') {
1983 Advance();
1984 if (current() == kEndMarker) {
1985 // If we reach the end we break out of the loop and let the
1986 // following code report an error.
1987 break;
1988 } else if (current() == ']') {
1989 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1990 ranges->Add(CharacterRange::Singleton('-'), zone());
1991 break;
1992 }
1993 ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2,
1994 &is_class_2 CHECK_FAILED);
1995 if (is_class_1 || is_class_2) {
1996 // Either end is an escaped character class. Treat the '-' verbatim.
1997 if (unicode()) {
1998 // ES2015 21.2.2.15.1 step 1.
1999 return ReportError(RegExpError::kInvalidCharacterClass);
2000 }
2001 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
2002 ranges->Add(CharacterRange::Singleton('-'), zone());
2003 if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone());
2004 continue;
2005 }
2006 // ES2015 21.2.2.15.1 step 6.
2007 if (char_1 > char_2) {
2008 return ReportError(RegExpError::kOutOfOrderCharacterClass);
2009 }
2010 ranges->Add(CharacterRange::Range(char_1, char_2), zone());
2011 } else {
2012 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
2013 }
2014 }
2015 if (!has_more()) {
2016 return ReportError(RegExpError::kUnterminatedCharacterClass);
2017 }
2018 Advance();
2019 RegExpCharacterClass::CharacterClassFlags character_class_flags;
2020 if (is_negated) character_class_flags = RegExpCharacterClass::NEGATED;
2021 return zone()->template New<RegExpCharacterClass>(zone(), ranges,
2022 character_class_flags);
2023 }
2024
2025 #undef CHECK_FAILED
2026
2027 template <class CharT>
Parse(RegExpCompileData * result)2028 bool RegExpParserImpl<CharT>::Parse(RegExpCompileData* result) {
2029 DCHECK_NOT_NULL(result);
2030 RegExpTree* tree = ParsePattern();
2031
2032 if (failed()) {
2033 DCHECK_NULL(tree);
2034 DCHECK_NE(error_, RegExpError::kNone);
2035 result->error = error_;
2036 result->error_pos = error_pos_;
2037 return false;
2038 }
2039
2040 DCHECK_NOT_NULL(tree);
2041 DCHECK_EQ(error_, RegExpError::kNone);
2042 if (FLAG_trace_regexp_parser) {
2043 StdoutStream os;
2044 tree->Print(os, zone());
2045 os << "\n";
2046 }
2047
2048 result->tree = tree;
2049 const int capture_count = captures_started();
2050 result->simple = tree->IsAtom() && simple() && capture_count == 0;
2051 result->contains_anchor = contains_anchor();
2052 result->capture_count = capture_count;
2053 result->named_captures = GetNamedCaptures();
2054 return true;
2055 }
2056
AddLeadSurrogate(base::uc16 lead_surrogate)2057 void RegExpBuilder::AddLeadSurrogate(base::uc16 lead_surrogate) {
2058 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
2059 FlushPendingSurrogate();
2060 // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
2061 pending_surrogate_ = lead_surrogate;
2062 }
2063
AddTrailSurrogate(base::uc16 trail_surrogate)2064 void RegExpBuilder::AddTrailSurrogate(base::uc16 trail_surrogate) {
2065 DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
2066 if (pending_surrogate_ != kNoPendingSurrogate) {
2067 base::uc16 lead_surrogate = pending_surrogate_;
2068 pending_surrogate_ = kNoPendingSurrogate;
2069 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
2070 base::uc32 combined =
2071 unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
2072 if (NeedsDesugaringForIgnoreCase(combined)) {
2073 AddCharacterClassForDesugaring(combined);
2074 } else {
2075 ZoneList<base::uc16> surrogate_pair(2, zone());
2076 surrogate_pair.Add(lead_surrogate, zone());
2077 surrogate_pair.Add(trail_surrogate, zone());
2078 RegExpAtom* atom =
2079 zone()->New<RegExpAtom>(surrogate_pair.ToConstVector());
2080 AddAtom(atom);
2081 }
2082 } else {
2083 pending_surrogate_ = trail_surrogate;
2084 FlushPendingSurrogate();
2085 }
2086 }
2087
FlushPendingSurrogate()2088 void RegExpBuilder::FlushPendingSurrogate() {
2089 if (pending_surrogate_ != kNoPendingSurrogate) {
2090 DCHECK(unicode());
2091 base::uc32 c = pending_surrogate_;
2092 pending_surrogate_ = kNoPendingSurrogate;
2093 AddCharacterClassForDesugaring(c);
2094 }
2095 }
2096
FlushCharacters()2097 void RegExpBuilder::FlushCharacters() {
2098 FlushPendingSurrogate();
2099 pending_empty_ = false;
2100 if (characters_ != nullptr) {
2101 RegExpTree* atom = zone()->New<RegExpAtom>(characters_->ToConstVector());
2102 characters_ = nullptr;
2103 text_.emplace_back(atom);
2104 LAST(ADD_ATOM);
2105 }
2106 }
2107
FlushText()2108 void RegExpBuilder::FlushText() {
2109 FlushCharacters();
2110 size_t num_text = text_.size();
2111 if (num_text == 0) {
2112 return;
2113 } else if (num_text == 1) {
2114 terms_.emplace_back(text_.back());
2115 } else {
2116 RegExpText* text = zone()->New<RegExpText>(zone());
2117 for (size_t i = 0; i < num_text; i++) {
2118 text_[i]->AppendToText(text, zone());
2119 }
2120 terms_.emplace_back(text);
2121 }
2122 text_.clear();
2123 }
2124
AddCharacter(base::uc16 c)2125 void RegExpBuilder::AddCharacter(base::uc16 c) {
2126 FlushPendingSurrogate();
2127 pending_empty_ = false;
2128 if (NeedsDesugaringForIgnoreCase(c)) {
2129 AddCharacterClassForDesugaring(c);
2130 } else {
2131 if (characters_ == nullptr) {
2132 characters_ = zone()->New<ZoneList<base::uc16>>(4, zone());
2133 }
2134 characters_->Add(c, zone());
2135 LAST(ADD_CHAR);
2136 }
2137 }
2138
AddUnicodeCharacter(base::uc32 c)2139 void RegExpBuilder::AddUnicodeCharacter(base::uc32 c) {
2140 if (c > static_cast<base::uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
2141 DCHECK(unicode());
2142 AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
2143 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
2144 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
2145 AddLeadSurrogate(c);
2146 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
2147 AddTrailSurrogate(c);
2148 } else {
2149 AddCharacter(static_cast<base::uc16>(c));
2150 }
2151 }
2152
AddEscapedUnicodeCharacter(base::uc32 character)2153 void RegExpBuilder::AddEscapedUnicodeCharacter(base::uc32 character) {
2154 // A lead or trail surrogate parsed via escape sequence will not
2155 // pair up with any preceding lead or following trail surrogate.
2156 FlushPendingSurrogate();
2157 AddUnicodeCharacter(character);
2158 FlushPendingSurrogate();
2159 }
2160
AddEmpty()2161 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
2162
AddCharacterClass(RegExpCharacterClass * cc)2163 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
2164 if (NeedsDesugaringForUnicode(cc)) {
2165 // With /u, character class needs to be desugared, so it
2166 // must be a standalone term instead of being part of a RegExpText.
2167 AddTerm(cc);
2168 } else {
2169 AddAtom(cc);
2170 }
2171 }
2172
AddCharacterClassForDesugaring(base::uc32 c)2173 void RegExpBuilder::AddCharacterClassForDesugaring(base::uc32 c) {
2174 AddTerm(zone()->New<RegExpCharacterClass>(
2175 zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c))));
2176 }
2177
AddAtom(RegExpTree * term)2178 void RegExpBuilder::AddAtom(RegExpTree* term) {
2179 if (term->IsEmpty()) {
2180 AddEmpty();
2181 return;
2182 }
2183 if (term->IsTextElement()) {
2184 FlushCharacters();
2185 text_.emplace_back(term);
2186 } else {
2187 FlushText();
2188 terms_.emplace_back(term);
2189 }
2190 LAST(ADD_ATOM);
2191 }
2192
AddTerm(RegExpTree * term)2193 void RegExpBuilder::AddTerm(RegExpTree* term) {
2194 FlushText();
2195 terms_.emplace_back(term);
2196 LAST(ADD_ATOM);
2197 }
2198
AddAssertion(RegExpTree * assert)2199 void RegExpBuilder::AddAssertion(RegExpTree* assert) {
2200 FlushText();
2201 terms_.emplace_back(assert);
2202 LAST(ADD_ASSERT);
2203 }
2204
NewAlternative()2205 void RegExpBuilder::NewAlternative() { FlushTerms(); }
2206
FlushTerms()2207 void RegExpBuilder::FlushTerms() {
2208 FlushText();
2209 size_t num_terms = terms_.size();
2210 RegExpTree* alternative;
2211 if (num_terms == 0) {
2212 alternative = zone()->New<RegExpEmpty>();
2213 } else if (num_terms == 1) {
2214 alternative = terms_.back();
2215 } else {
2216 alternative =
2217 zone()->New<RegExpAlternative>(zone()->New<ZoneList<RegExpTree*>>(
2218 base::VectorOf(terms_.begin(), terms_.size()), zone()));
2219 }
2220 alternatives_.emplace_back(alternative);
2221 terms_.clear();
2222 LAST(ADD_NONE);
2223 }
2224
NeedsDesugaringForUnicode(RegExpCharacterClass * cc)2225 bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
2226 if (!unicode()) return false;
2227 // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
2228 // necessarily mean that we need to desugar. It's probably nicer to have a
2229 // separate pass to figure out unicode desugarings.
2230 if (ignore_case()) return true;
2231 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2232 CharacterRange::Canonicalize(ranges);
2233 for (int i = ranges->length() - 1; i >= 0; i--) {
2234 base::uc32 from = ranges->at(i).from();
2235 base::uc32 to = ranges->at(i).to();
2236 // Check for non-BMP characters.
2237 if (to >= kNonBmpStart) return true;
2238 // Check for lone surrogates.
2239 if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
2240 }
2241 return false;
2242 }
2243
NeedsDesugaringForIgnoreCase(base::uc32 c)2244 bool RegExpBuilder::NeedsDesugaringForIgnoreCase(base::uc32 c) {
2245 #ifdef V8_INTL_SUPPORT
2246 if (unicode() && ignore_case()) {
2247 icu::UnicodeSet set(c, c);
2248 set.closeOver(USET_CASE_INSENSITIVE);
2249 set.removeAllStrings();
2250 return set.size() > 1;
2251 }
2252 // In the case where ICU is not included, we act as if the unicode flag is
2253 // not set, and do not desugar.
2254 #endif // V8_INTL_SUPPORT
2255 return false;
2256 }
2257
ToRegExp()2258 RegExpTree* RegExpBuilder::ToRegExp() {
2259 FlushTerms();
2260 size_t num_alternatives = alternatives_.size();
2261 if (num_alternatives == 0) return zone()->New<RegExpEmpty>();
2262 if (num_alternatives == 1) return alternatives_.back();
2263 return zone()->New<RegExpDisjunction>(zone()->New<ZoneList<RegExpTree*>>(
2264 base::VectorOf(alternatives_.begin(), alternatives_.size()), zone()));
2265 }
2266
AddQuantifierToAtom(int min,int max,RegExpQuantifier::QuantifierType quantifier_type)2267 bool RegExpBuilder::AddQuantifierToAtom(
2268 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
2269 FlushPendingSurrogate();
2270 if (pending_empty_) {
2271 pending_empty_ = false;
2272 return true;
2273 }
2274 RegExpTree* atom;
2275 if (characters_ != nullptr) {
2276 DCHECK(last_added_ == ADD_CHAR);
2277 // Last atom was character.
2278 base::Vector<const base::uc16> char_vector = characters_->ToConstVector();
2279 int num_chars = char_vector.length();
2280 if (num_chars > 1) {
2281 base::Vector<const base::uc16> prefix =
2282 char_vector.SubVector(0, num_chars - 1);
2283 text_.emplace_back(zone()->New<RegExpAtom>(prefix));
2284 char_vector = char_vector.SubVector(num_chars - 1, num_chars);
2285 }
2286 characters_ = nullptr;
2287 atom = zone()->New<RegExpAtom>(char_vector);
2288 FlushText();
2289 } else if (text_.size() > 0) {
2290 DCHECK(last_added_ == ADD_ATOM);
2291 atom = text_.back();
2292 text_.pop_back();
2293 FlushText();
2294 } else if (terms_.size() > 0) {
2295 DCHECK(last_added_ == ADD_ATOM);
2296 atom = terms_.back();
2297 terms_.pop_back();
2298 if (atom->IsLookaround()) {
2299 // With /u, lookarounds are not quantifiable.
2300 if (unicode()) return false;
2301 // Lookbehinds are not quantifiable.
2302 if (atom->AsLookaround()->type() == RegExpLookaround::LOOKBEHIND) {
2303 return false;
2304 }
2305 }
2306 if (atom->max_match() == 0) {
2307 // Guaranteed to only match an empty string.
2308 LAST(ADD_TERM);
2309 if (min == 0) {
2310 return true;
2311 }
2312 terms_.emplace_back(atom);
2313 return true;
2314 }
2315 } else {
2316 // Only call immediately after adding an atom or character!
2317 UNREACHABLE();
2318 }
2319 terms_.emplace_back(
2320 zone()->New<RegExpQuantifier>(min, max, quantifier_type, atom));
2321 LAST(ADD_TERM);
2322 return true;
2323 }
2324
2325 template class RegExpParserImpl<uint8_t>;
2326 template class RegExpParserImpl<base::uc16>;
2327
2328 } // namespace
2329
2330 // static
ParseRegExpFromHeapString(Isolate * isolate,Zone * zone,Handle<String> input,RegExpFlags flags,RegExpCompileData * result)2331 bool RegExpParser::ParseRegExpFromHeapString(Isolate* isolate, Zone* zone,
2332 Handle<String> input,
2333 RegExpFlags flags,
2334 RegExpCompileData* result) {
2335 DisallowGarbageCollection no_gc;
2336 uintptr_t stack_limit = isolate->stack_guard()->real_climit();
2337 String::FlatContent content = input->GetFlatContent(no_gc);
2338 if (content.IsOneByte()) {
2339 base::Vector<const uint8_t> v = content.ToOneByteVector();
2340 return RegExpParserImpl<uint8_t>{v.begin(), v.length(), flags,
2341 stack_limit, zone, no_gc}
2342 .Parse(result);
2343 } else {
2344 base::Vector<const base::uc16> v = content.ToUC16Vector();
2345 return RegExpParserImpl<base::uc16>{v.begin(), v.length(), flags,
2346 stack_limit, zone, no_gc}
2347 .Parse(result);
2348 }
2349 }
2350
2351 // static
2352 template <class CharT>
VerifyRegExpSyntax(Zone * zone,uintptr_t stack_limit,const CharT * input,int input_length,RegExpFlags flags,RegExpCompileData * result,const DisallowGarbageCollection & no_gc)2353 bool RegExpParser::VerifyRegExpSyntax(Zone* zone, uintptr_t stack_limit,
2354 const CharT* input, int input_length,
2355 RegExpFlags flags,
2356 RegExpCompileData* result,
2357 const DisallowGarbageCollection& no_gc) {
2358 return RegExpParserImpl<CharT>{input, input_length, flags,
2359 stack_limit, zone, no_gc}
2360 .Parse(result);
2361 }
2362
2363 template bool RegExpParser::VerifyRegExpSyntax<uint8_t>(
2364 Zone*, uintptr_t, const uint8_t*, int, RegExpFlags, RegExpCompileData*,
2365 const DisallowGarbageCollection&);
2366 template bool RegExpParser::VerifyRegExpSyntax<base::uc16>(
2367 Zone*, uintptr_t, const base::uc16*, int, RegExpFlags, RegExpCompileData*,
2368 const DisallowGarbageCollection&);
2369
2370 // static
VerifyRegExpSyntax(Isolate * isolate,Zone * zone,Handle<String> input,RegExpFlags flags,RegExpCompileData * result,const DisallowGarbageCollection &)2371 bool RegExpParser::VerifyRegExpSyntax(Isolate* isolate, Zone* zone,
2372 Handle<String> input, RegExpFlags flags,
2373 RegExpCompileData* result,
2374 const DisallowGarbageCollection&) {
2375 return ParseRegExpFromHeapString(isolate, zone, input, flags, result);
2376 }
2377
2378 #undef LAST
2379
2380 } // namespace internal
2381 } // namespace v8
2382