• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Features shared by parsing and pre-parsing scanners.
6 
7 #include "src/parsing/scanner.h"
8 
9 #include <stdint.h>
10 
11 #include <cmath>
12 
13 #include "src/ast/ast-value-factory.h"
14 #include "src/base/platform/wrappers.h"
15 #include "src/base/strings.h"
16 #include "src/numbers/conversions-inl.h"
17 #include "src/objects/bigint.h"
18 #include "src/parsing/parse-info.h"
19 #include "src/parsing/scanner-inl.h"
20 #include "src/zone/zone.h"
21 
22 namespace v8 {
23 namespace internal {
24 
25 class Scanner::ErrorState {
26  public:
ErrorState(MessageTemplate * message_stack,Scanner::Location * location_stack)27   ErrorState(MessageTemplate* message_stack, Scanner::Location* location_stack)
28       : message_stack_(message_stack),
29         old_message_(*message_stack),
30         location_stack_(location_stack),
31         old_location_(*location_stack) {
32     *message_stack_ = MessageTemplate::kNone;
33     *location_stack_ = Location::invalid();
34   }
35 
~ErrorState()36   ~ErrorState() {
37     *message_stack_ = old_message_;
38     *location_stack_ = old_location_;
39   }
40 
MoveErrorTo(TokenDesc * dest)41   void MoveErrorTo(TokenDesc* dest) {
42     if (*message_stack_ == MessageTemplate::kNone) {
43       return;
44     }
45     if (dest->invalid_template_escape_message == MessageTemplate::kNone) {
46       dest->invalid_template_escape_message = *message_stack_;
47       dest->invalid_template_escape_location = *location_stack_;
48     }
49     *message_stack_ = MessageTemplate::kNone;
50     *location_stack_ = Location::invalid();
51   }
52 
53  private:
54   MessageTemplate* const message_stack_;
55   MessageTemplate const old_message_;
56   Scanner::Location* const location_stack_;
57   Scanner::Location const old_location_;
58 };
59 
60 // ----------------------------------------------------------------------------
61 // Scanner::BookmarkScope
62 
63 const size_t Scanner::BookmarkScope::kNoBookmark =
64     std::numeric_limits<size_t>::max() - 1;
65 const size_t Scanner::BookmarkScope::kBookmarkWasApplied =
66     std::numeric_limits<size_t>::max();
67 
Set(size_t position)68 void Scanner::BookmarkScope::Set(size_t position) {
69   DCHECK_EQ(bookmark_, kNoBookmark);
70   bookmark_ = position;
71 }
72 
Apply()73 void Scanner::BookmarkScope::Apply() {
74   DCHECK(HasBeenSet());  // Caller hasn't called SetBookmark.
75   if (had_parser_error_) {
76     scanner_->set_parser_error();
77   } else {
78     scanner_->reset_parser_error_flag();
79     scanner_->SeekNext(bookmark_);
80   }
81   bookmark_ = kBookmarkWasApplied;
82 }
83 
HasBeenSet() const84 bool Scanner::BookmarkScope::HasBeenSet() const {
85   return bookmark_ != kNoBookmark && bookmark_ != kBookmarkWasApplied;
86 }
87 
HasBeenApplied() const88 bool Scanner::BookmarkScope::HasBeenApplied() const {
89   return bookmark_ == kBookmarkWasApplied;
90 }
91 
92 // ----------------------------------------------------------------------------
93 // Scanner
94 
Scanner(Utf16CharacterStream * source,UnoptimizedCompileFlags flags)95 Scanner::Scanner(Utf16CharacterStream* source, UnoptimizedCompileFlags flags)
96     : flags_(flags),
97       source_(source),
98       found_html_comment_(false),
99       octal_pos_(Location::invalid()),
100       octal_message_(MessageTemplate::kNone) {
101   DCHECK_NOT_NULL(source);
102 }
103 
Initialize()104 void Scanner::Initialize() {
105   // Need to capture identifiers in order to recognize "get" and "set"
106   // in object literals.
107   Init();
108   next().after_line_terminator = true;
109   Scan();
110 }
111 
112 // static
IsInvalid(base::uc32 c)113 bool Scanner::IsInvalid(base::uc32 c) {
114   DCHECK(c == Invalid() || base::IsInRange(c, 0u, String::kMaxCodePoint));
115   return c == Scanner::Invalid();
116 }
117 
118 template <bool capture_raw, bool unicode>
ScanHexNumber(int expected_length)119 base::uc32 Scanner::ScanHexNumber(int expected_length) {
120   DCHECK_LE(expected_length, 4);  // prevent overflow
121 
122   int begin = source_pos() - 2;
123   base::uc32 x = 0;
124   for (int i = 0; i < expected_length; i++) {
125     int d = base::HexValue(c0_);
126     if (d < 0) {
127       ReportScannerError(Location(begin, begin + expected_length + 2),
128                          unicode
129                              ? MessageTemplate::kInvalidUnicodeEscapeSequence
130                              : MessageTemplate::kInvalidHexEscapeSequence);
131       return Invalid();
132     }
133     x = x * 16 + d;
134     Advance<capture_raw>();
135   }
136 
137   return x;
138 }
139 
140 template <bool capture_raw>
ScanUnlimitedLengthHexNumber(base::uc32 max_value,int beg_pos)141 base::uc32 Scanner::ScanUnlimitedLengthHexNumber(base::uc32 max_value,
142                                                  int beg_pos) {
143   base::uc32 x = 0;
144   int d = base::HexValue(c0_);
145   if (d < 0) return Invalid();
146 
147   while (d >= 0) {
148     x = x * 16 + d;
149     if (x > max_value) {
150       ReportScannerError(Location(beg_pos, source_pos() + 1),
151                          MessageTemplate::kUndefinedUnicodeCodePoint);
152       return Invalid();
153     }
154     Advance<capture_raw>();
155     d = base::HexValue(c0_);
156   }
157 
158   return x;
159 }
160 
Next()161 Token::Value Scanner::Next() {
162   // Rotate through tokens.
163   TokenDesc* previous = current_;
164   current_ = next_;
165   // Either we already have the next token lined up, in which case next_next_
166   // simply becomes next_. In that case we use current_ as new next_next_ and
167   // clear its token to indicate that it wasn't scanned yet. Otherwise we use
168   // current_ as next_ and scan into it, leaving next_next_ uninitialized.
169   if (V8_LIKELY(next_next().token == Token::UNINITIALIZED)) {
170     next_ = previous;
171     // User 'previous' instead of 'next_' because for some reason the compiler
172     // thinks 'next_' could be modified before the entry into Scan.
173     previous->after_line_terminator = false;
174     Scan(previous);
175   } else {
176     next_ = next_next_;
177     next_next_ = previous;
178     previous->token = Token::UNINITIALIZED;
179     DCHECK_NE(Token::UNINITIALIZED, current().token);
180   }
181   return current().token;
182 }
183 
PeekAhead()184 Token::Value Scanner::PeekAhead() {
185   DCHECK(next().token != Token::DIV);
186   DCHECK(next().token != Token::ASSIGN_DIV);
187 
188   if (next_next().token != Token::UNINITIALIZED) {
189     return next_next().token;
190   }
191   TokenDesc* temp = next_;
192   next_ = next_next_;
193   next().after_line_terminator = false;
194   Scan();
195   next_next_ = next_;
196   next_ = temp;
197   return next_next().token;
198 }
199 
SkipSingleHTMLComment()200 Token::Value Scanner::SkipSingleHTMLComment() {
201   if (flags_.is_module()) {
202     ReportScannerError(source_pos(), MessageTemplate::kHtmlCommentInModule);
203     return Token::ILLEGAL;
204   }
205   return SkipSingleLineComment();
206 }
207 
SkipSingleLineComment()208 Token::Value Scanner::SkipSingleLineComment() {
209   // The line terminator at the end of the line is not considered
210   // to be part of the single-line comment; it is recognized
211   // separately by the lexical grammar and becomes part of the
212   // stream of input elements for the syntactic grammar (see
213   // ECMA-262, section 7.4).
214   AdvanceUntil([](base::uc32 c0) { return unibrow::IsLineTerminator(c0); });
215 
216   return Token::WHITESPACE;
217 }
218 
SkipSourceURLComment()219 Token::Value Scanner::SkipSourceURLComment() {
220   TryToParseSourceURLComment();
221   if (unibrow::IsLineTerminator(c0_) || c0_ == kEndOfInput) {
222     return Token::WHITESPACE;
223   }
224   return SkipSingleLineComment();
225 }
226 
TryToParseSourceURLComment()227 void Scanner::TryToParseSourceURLComment() {
228   // Magic comments are of the form: //[#@]\s<name>=\s*<value>\s*.* and this
229   // function will just return if it cannot parse a magic comment.
230   DCHECK(!IsWhiteSpaceOrLineTerminator(kEndOfInput));
231   if (!IsWhiteSpace(c0_)) return;
232   Advance();
233   LiteralBuffer name;
234   name.Start();
235 
236   while (c0_ != kEndOfInput && !IsWhiteSpaceOrLineTerminator(c0_) &&
237          c0_ != '=') {
238     name.AddChar(c0_);
239     Advance();
240   }
241   if (!name.is_one_byte()) return;
242   base::Vector<const uint8_t> name_literal = name.one_byte_literal();
243   LiteralBuffer* value;
244   if (name_literal == base::StaticOneByteVector("sourceURL")) {
245     value = &source_url_;
246   } else if (name_literal == base::StaticOneByteVector("sourceMappingURL")) {
247     value = &source_mapping_url_;
248   } else {
249     return;
250   }
251   if (c0_ != '=')
252     return;
253   value->Start();
254   Advance();
255   while (IsWhiteSpace(c0_)) {
256     Advance();
257   }
258   while (c0_ != kEndOfInput && !unibrow::IsLineTerminator(c0_)) {
259     if (IsWhiteSpace(c0_)) {
260       break;
261     }
262     value->AddChar(c0_);
263     Advance();
264   }
265   // Allow whitespace at the end.
266   while (c0_ != kEndOfInput && !unibrow::IsLineTerminator(c0_)) {
267     if (!IsWhiteSpace(c0_)) {
268       value->Start();
269       break;
270     }
271     Advance();
272   }
273 }
274 
SkipMultiLineComment()275 Token::Value Scanner::SkipMultiLineComment() {
276   DCHECK_EQ(c0_, '*');
277 
278   // Until we see the first newline, check for * and newline characters.
279   if (!next().after_line_terminator) {
280     do {
281       AdvanceUntil([](base::uc32 c0) {
282         if (V8_UNLIKELY(static_cast<uint32_t>(c0) > kMaxAscii)) {
283           return unibrow::IsLineTerminator(c0);
284         }
285         uint8_t char_flags = character_scan_flags[c0];
286         return MultilineCommentCharacterNeedsSlowPath(char_flags);
287       });
288 
289       while (c0_ == '*') {
290         Advance();
291         if (c0_ == '/') {
292           Advance();
293           return Token::WHITESPACE;
294         }
295       }
296 
297       if (unibrow::IsLineTerminator(c0_)) {
298         next().after_line_terminator = true;
299         break;
300       }
301     } while (c0_ != kEndOfInput);
302   }
303 
304   // After we've seen newline, simply try to find '*/'.
305   while (c0_ != kEndOfInput) {
306     AdvanceUntil([](base::uc32 c0) { return c0 == '*'; });
307 
308     while (c0_ == '*') {
309       Advance();
310       if (c0_ == '/') {
311         Advance();
312         return Token::WHITESPACE;
313       }
314     }
315   }
316 
317   return Token::ILLEGAL;
318 }
319 
ScanHtmlComment()320 Token::Value Scanner::ScanHtmlComment() {
321   // Check for <!-- comments.
322   DCHECK_EQ(c0_, '!');
323   Advance();
324   if (c0_ != '-' || Peek() != '-') {
325     PushBack('!');  // undo Advance()
326     return Token::LT;
327   }
328   Advance();
329 
330   found_html_comment_ = true;
331   return SkipSingleHTMLComment();
332 }
333 
#ifdef DEBUG
// Debug-only consistency check: only TEMPLATE_* tokens can have an
// invalid_template_escape_message. ILLEGAL and UNINITIALIZED tokens are
// exempt because their fields (including literal_chars) might be garbage.
void Scanner::SanityCheckTokenDesc(const TokenDesc& token) const {
  const Token::Value kind = token.token;
  const bool exempt =
      kind == Token::UNINITIALIZED || kind == Token::ILLEGAL ||
      kind == Token::TEMPLATE_SPAN || kind == Token::TEMPLATE_TAIL;
  if (exempt) return;

  DCHECK_EQ(token.invalid_template_escape_message, MessageTemplate::kNone);
}
#endif  // DEBUG
352 
SeekForward(int pos)353 void Scanner::SeekForward(int pos) {
354   // After this call, we will have the token at the given position as
355   // the "next" token. The "current" token will be invalid.
356   if (pos == next().location.beg_pos) return;
357   int current_pos = source_pos();
358   DCHECK_EQ(next().location.end_pos, current_pos);
359   // Positions inside the lookahead token aren't supported.
360   DCHECK(pos >= current_pos);
361   if (pos != current_pos) {
362     source_->Seek(pos);
363     Advance();
364     // This function is only called to seek to the location
365     // of the end of a function (at the "}" token). It doesn't matter
366     // whether there was a line terminator in the part we skip.
367     next().after_line_terminator = false;
368   }
369   Scan();
370 }
371 
372 template <bool capture_raw>
ScanEscape()373 bool Scanner::ScanEscape() {
374   base::uc32 c = c0_;
375   Advance<capture_raw>();
376 
377   // Skip escaped newlines.
378   DCHECK(!unibrow::IsLineTerminator(kEndOfInput));
379   if (!capture_raw && unibrow::IsLineTerminator(c)) {
380     // Allow escaped CR+LF newlines in multiline string literals.
381     if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
382     return true;
383   }
384 
385   switch (c) {
386     case 'b' : c = '\b'; break;
387     case 'f' : c = '\f'; break;
388     case 'n' : c = '\n'; break;
389     case 'r' : c = '\r'; break;
390     case 't' : c = '\t'; break;
391     case 'u' : {
392       c = ScanUnicodeEscape<capture_raw>();
393       if (IsInvalid(c)) return false;
394       break;
395     }
396     case 'v':
397       c = '\v';
398       break;
399     case 'x': {
400       c = ScanHexNumber<capture_raw>(2);
401       if (IsInvalid(c)) return false;
402       break;
403     }
404     case '0':
405     case '1':
406     case '2':
407     case '3':
408     case '4':
409     case '5':
410     case '6':
411     case '7':
412       c = ScanOctalEscape<capture_raw>(c, 2);
413       break;
414     case '8':
415     case '9':
416       // '\8' and '\9' are disallowed in strict mode.
417       // Re-use the octal error state to propagate the error.
418       octal_pos_ = Location(source_pos() - 2, source_pos() - 1);
419       octal_message_ = capture_raw ? MessageTemplate::kTemplate8Or9Escape
420                                    : MessageTemplate::kStrict8Or9Escape;
421       break;
422   }
423 
424   // Other escaped characters are interpreted as their non-escaped version.
425   AddLiteralChar(c);
426   return true;
427 }
428 
429 template <bool capture_raw>
ScanOctalEscape(base::uc32 c,int length)430 base::uc32 Scanner::ScanOctalEscape(base::uc32 c, int length) {
431   DCHECK('0' <= c && c <= '7');
432   base::uc32 x = c - '0';
433   int i = 0;
434   for (; i < length; i++) {
435     int d = c0_ - '0';
436     if (d < 0 || d > 7) break;
437     int nx = x * 8 + d;
438     if (nx >= 256) break;
439     x = nx;
440     Advance<capture_raw>();
441   }
442   // Anything except '\0' is an octal escape sequence, illegal in strict mode.
443   // Remember the position of octal escape sequences so that an error
444   // can be reported later (in strict mode).
445   // We don't report the error immediately, because the octal escape can
446   // occur before the "use strict" directive.
447   if (c != '0' || i > 0 || IsNonOctalDecimalDigit(c0_)) {
448     octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
449     octal_message_ = capture_raw ? MessageTemplate::kTemplateOctalLiteral
450                                  : MessageTemplate::kStrictOctalEscape;
451   }
452   return x;
453 }
454 
ScanString()455 Token::Value Scanner::ScanString() {
456   base::uc32 quote = c0_;
457 
458   next().literal_chars.Start();
459   while (true) {
460     AdvanceUntil([this](base::uc32 c0) {
461       if (V8_UNLIKELY(static_cast<uint32_t>(c0) > kMaxAscii)) {
462         if (V8_UNLIKELY(unibrow::IsStringLiteralLineTerminator(c0))) {
463           return true;
464         }
465         AddLiteralChar(c0);
466         return false;
467       }
468       uint8_t char_flags = character_scan_flags[c0];
469       if (MayTerminateString(char_flags)) return true;
470       AddLiteralChar(c0);
471       return false;
472     });
473 
474     while (c0_ == '\\') {
475       Advance();
476       // TODO(verwaest): Check whether we can remove the additional check.
477       if (V8_UNLIKELY(c0_ == kEndOfInput || !ScanEscape<false>())) {
478         return Token::ILLEGAL;
479       }
480     }
481 
482     if (c0_ == quote) {
483       Advance();
484       return Token::STRING;
485     }
486 
487     if (V8_UNLIKELY(c0_ == kEndOfInput ||
488                     unibrow::IsStringLiteralLineTerminator(c0_))) {
489       return Token::ILLEGAL;
490     }
491 
492     AddLiteralChar(c0_);
493   }
494 }
495 
ScanPrivateName()496 Token::Value Scanner::ScanPrivateName() {
497   next().literal_chars.Start();
498   DCHECK_EQ(c0_, '#');
499   DCHECK(!IsIdentifierStart(kEndOfInput));
500   int pos = source_pos();
501   Advance();
502   if (IsIdentifierStart(c0_) ||
503       (CombineSurrogatePair() && IsIdentifierStart(c0_))) {
504     AddLiteralChar('#');
505     Token::Value token = ScanIdentifierOrKeywordInner();
506     return token == Token::ILLEGAL ? Token::ILLEGAL : Token::PRIVATE_NAME;
507   }
508 
509   ReportScannerError(pos, MessageTemplate::kInvalidOrUnexpectedToken);
510   return Token::ILLEGAL;
511 }
512 
ScanTemplateSpan()513 Token::Value Scanner::ScanTemplateSpan() {
514   // When scanning a TemplateSpan, we are looking for the following construct:
515   // TEMPLATE_SPAN ::
516   //     ` LiteralChars* ${
517   //   | } LiteralChars* ${
518   //
519   // TEMPLATE_TAIL ::
520   //     ` LiteralChars* `
521   //   | } LiteralChar* `
522   //
523   // A TEMPLATE_SPAN should always be followed by an Expression, while a
524   // TEMPLATE_TAIL terminates a TemplateLiteral and does not need to be
525   // followed by an Expression.
526 
527   // These scoped helpers save and restore the original error state, so that we
528   // can specially treat invalid escape sequences in templates (which are
529   // handled by the parser).
530   ErrorState scanner_error_state(&scanner_error_, &scanner_error_location_);
531   ErrorState octal_error_state(&octal_message_, &octal_pos_);
532 
533   Token::Value result = Token::TEMPLATE_SPAN;
534   next().literal_chars.Start();
535   next().raw_literal_chars.Start();
536   const bool capture_raw = true;
537   while (true) {
538     base::uc32 c = c0_;
539     if (c == '`') {
540       Advance();  // Consume '`'
541       result = Token::TEMPLATE_TAIL;
542       break;
543     } else if (c == '$' && Peek() == '{') {
544       Advance();  // Consume '$'
545       Advance();  // Consume '{'
546       break;
547     } else if (c == '\\') {
548       Advance();  // Consume '\\'
549       DCHECK(!unibrow::IsLineTerminator(kEndOfInput));
550       if (capture_raw) AddRawLiteralChar('\\');
551       if (unibrow::IsLineTerminator(c0_)) {
552         // The TV of LineContinuation :: \ LineTerminatorSequence is the empty
553         // code unit sequence.
554         base::uc32 lastChar = c0_;
555         Advance();
556         if (lastChar == '\r') {
557           // Also skip \n.
558           if (c0_ == '\n') Advance();
559           lastChar = '\n';
560         }
561         if (capture_raw) AddRawLiteralChar(lastChar);
562       } else {
563         bool success = ScanEscape<capture_raw>();
564         USE(success);
565         DCHECK_EQ(!success, has_error());
566         // For templates, invalid escape sequence checking is handled in the
567         // parser.
568         scanner_error_state.MoveErrorTo(next_);
569         octal_error_state.MoveErrorTo(next_);
570       }
571     } else if (c == kEndOfInput) {
572       // Unterminated template literal
573       break;
574     } else {
575       Advance();  // Consume c.
576       // The TRV of LineTerminatorSequence :: <CR> is the CV 0x000A.
577       // The TRV of LineTerminatorSequence :: <CR><LF> is the sequence
578       // consisting of the CV 0x000A.
579       if (c == '\r') {
580         if (c0_ == '\n') Advance();  // Consume '\n'
581         c = '\n';
582       }
583       if (capture_raw) AddRawLiteralChar(c);
584       AddLiteralChar(c);
585     }
586   }
587   next().location.end_pos = source_pos();
588   next().token = result;
589 
590   return result;
591 }
592 
593 template <typename IsolateT>
SourceUrl(IsolateT * isolate) const594 Handle<String> Scanner::SourceUrl(IsolateT* isolate) const {
595   Handle<String> tmp;
596   if (source_url_.length() > 0) {
597     tmp = source_url_.Internalize(isolate);
598   }
599   return tmp;
600 }
601 
602 template Handle<String> Scanner::SourceUrl(Isolate* isolate) const;
603 template Handle<String> Scanner::SourceUrl(LocalIsolate* isolate) const;
604 
605 template <typename IsolateT>
SourceMappingUrl(IsolateT * isolate) const606 Handle<String> Scanner::SourceMappingUrl(IsolateT* isolate) const {
607   Handle<String> tmp;
608   if (source_mapping_url_.length() > 0) {
609     tmp = source_mapping_url_.Internalize(isolate);
610   }
611   return tmp;
612 }
613 
614 template Handle<String> Scanner::SourceMappingUrl(Isolate* isolate) const;
615 template Handle<String> Scanner::SourceMappingUrl(LocalIsolate* isolate) const;
616 
ScanDigitsWithNumericSeparators(bool (* predicate)(base::uc32 ch),bool is_check_first_digit)617 bool Scanner::ScanDigitsWithNumericSeparators(bool (*predicate)(base::uc32 ch),
618                                               bool is_check_first_digit) {
619   // we must have at least one digit after 'x'/'b'/'o'
620   if (is_check_first_digit && !predicate(c0_)) return false;
621 
622   bool separator_seen = false;
623   while (predicate(c0_) || c0_ == '_') {
624     if (c0_ == '_') {
625       Advance();
626       if (c0_ == '_') {
627         ReportScannerError(Location(source_pos(), source_pos() + 1),
628                            MessageTemplate::kContinuousNumericSeparator);
629         return false;
630       }
631       separator_seen = true;
632       continue;
633     }
634     separator_seen = false;
635     AddLiteralCharAdvance();
636   }
637 
638   if (separator_seen) {
639     ReportScannerError(Location(source_pos(), source_pos() + 1),
640                        MessageTemplate::kTrailingNumericSeparator);
641     return false;
642   }
643 
644   return true;
645 }
646 
ScanDecimalDigits(bool allow_numeric_separator)647 bool Scanner::ScanDecimalDigits(bool allow_numeric_separator) {
648   if (allow_numeric_separator) {
649     return ScanDigitsWithNumericSeparators(&IsDecimalDigit, false);
650   }
651   while (IsDecimalDigit(c0_)) {
652     AddLiteralCharAdvance();
653   }
654   if (c0_ == '_') {
655     ReportScannerError(Location(source_pos(), source_pos() + 1),
656                        MessageTemplate::kInvalidOrUnexpectedToken);
657     return false;
658   }
659   return true;
660 }
661 
ScanDecimalAsSmiWithNumericSeparators(uint64_t * value)662 bool Scanner::ScanDecimalAsSmiWithNumericSeparators(uint64_t* value) {
663   bool separator_seen = false;
664   while (IsDecimalDigit(c0_) || c0_ == '_') {
665     if (c0_ == '_') {
666       Advance();
667       if (c0_ == '_') {
668         ReportScannerError(Location(source_pos(), source_pos() + 1),
669                            MessageTemplate::kContinuousNumericSeparator);
670         return false;
671       }
672       separator_seen = true;
673       continue;
674     }
675     separator_seen = false;
676     *value = 10 * *value + (c0_ - '0');
677     base::uc32 first_char = c0_;
678     Advance();
679     AddLiteralChar(first_char);
680   }
681 
682   if (separator_seen) {
683     ReportScannerError(Location(source_pos(), source_pos() + 1),
684                        MessageTemplate::kTrailingNumericSeparator);
685     return false;
686   }
687 
688   return true;
689 }
690 
ScanDecimalAsSmi(uint64_t * value,bool allow_numeric_separator)691 bool Scanner::ScanDecimalAsSmi(uint64_t* value, bool allow_numeric_separator) {
692   if (allow_numeric_separator) {
693     return ScanDecimalAsSmiWithNumericSeparators(value);
694   }
695 
696   while (IsDecimalDigit(c0_)) {
697     *value = 10 * *value + (c0_ - '0');
698     base::uc32 first_char = c0_;
699     Advance();
700     AddLiteralChar(first_char);
701   }
702   return true;
703 }
704 
ScanBinaryDigits()705 bool Scanner::ScanBinaryDigits() {
706   return ScanDigitsWithNumericSeparators(&IsBinaryDigit, true);
707 }
708 
ScanOctalDigits()709 bool Scanner::ScanOctalDigits() {
710   return ScanDigitsWithNumericSeparators(&IsOctalDigit, true);
711 }
712 
ScanImplicitOctalDigits(int start_pos,Scanner::NumberKind * kind)713 bool Scanner::ScanImplicitOctalDigits(int start_pos,
714                                       Scanner::NumberKind* kind) {
715   *kind = IMPLICIT_OCTAL;
716 
717   while (true) {
718     // (possible) octal number
719     if (IsNonOctalDecimalDigit(c0_)) {
720       *kind = DECIMAL_WITH_LEADING_ZERO;
721       return true;
722     }
723     if (!IsOctalDigit(c0_)) {
724       // Octal literal finished.
725       octal_pos_ = Location(start_pos, source_pos());
726       octal_message_ = MessageTemplate::kStrictOctalLiteral;
727       return true;
728     }
729     AddLiteralCharAdvance();
730   }
731 }
732 
ScanHexDigits()733 bool Scanner::ScanHexDigits() {
734   return ScanDigitsWithNumericSeparators(&IsHexDigit, true);
735 }
736 
ScanSignedInteger()737 bool Scanner::ScanSignedInteger() {
738   if (c0_ == '+' || c0_ == '-') AddLiteralCharAdvance();
739   // we must have at least one decimal digit after 'e'/'E'
740   if (!IsDecimalDigit(c0_)) return false;
741   return ScanDecimalDigits(true);
742 }
743 
ScanNumber(bool seen_period)744 Token::Value Scanner::ScanNumber(bool seen_period) {
745   DCHECK(IsDecimalDigit(c0_));  // the first digit of the number or the fraction
746 
747   NumberKind kind = DECIMAL;
748 
749   next().literal_chars.Start();
750   bool at_start = !seen_period;
751   int start_pos = source_pos();  // For reporting octal positions.
752   if (seen_period) {
753     // we have already seen a decimal point of the float
754     AddLiteralChar('.');
755     if (c0_ == '_') {
756       return Token::ILLEGAL;
757     }
758     // we know we have at least one digit
759     if (!ScanDecimalDigits(true)) return Token::ILLEGAL;
760   } else {
761     // if the first character is '0' we must check for octals and hex
762     if (c0_ == '0') {
763       AddLiteralCharAdvance();
764 
765       // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or
766       // an octal number.
767       if (AsciiAlphaToLower(c0_) == 'x') {
768         AddLiteralCharAdvance();
769         kind = HEX;
770         if (!ScanHexDigits()) return Token::ILLEGAL;
771       } else if (AsciiAlphaToLower(c0_) == 'o') {
772         AddLiteralCharAdvance();
773         kind = OCTAL;
774         if (!ScanOctalDigits()) return Token::ILLEGAL;
775       } else if (AsciiAlphaToLower(c0_) == 'b') {
776         AddLiteralCharAdvance();
777         kind = BINARY;
778         if (!ScanBinaryDigits()) return Token::ILLEGAL;
779       } else if (IsOctalDigit(c0_)) {
780         kind = IMPLICIT_OCTAL;
781         if (!ScanImplicitOctalDigits(start_pos, &kind)) {
782           return Token::ILLEGAL;
783         }
784         if (kind == DECIMAL_WITH_LEADING_ZERO) {
785           at_start = false;
786         }
787       } else if (IsNonOctalDecimalDigit(c0_)) {
788         kind = DECIMAL_WITH_LEADING_ZERO;
789       } else if (c0_ == '_') {
790         ReportScannerError(Location(source_pos(), source_pos() + 1),
791                            MessageTemplate::kZeroDigitNumericSeparator);
792         return Token::ILLEGAL;
793       }
794     }
795 
796     // Parse decimal digits and allow trailing fractional part.
797     if (IsDecimalNumberKind(kind)) {
798       bool allow_numeric_separator = kind != DECIMAL_WITH_LEADING_ZERO;
799       // This is an optimization for parsing Decimal numbers as Smi's.
800       if (at_start) {
801         uint64_t value = 0;
802         // scan subsequent decimal digits
803         if (!ScanDecimalAsSmi(&value, allow_numeric_separator)) {
804           return Token::ILLEGAL;
805         }
806 
807         if (next().literal_chars.one_byte_literal().length() <= 10 &&
808             value <= Smi::kMaxValue && c0_ != '.' && !IsIdentifierStart(c0_)) {
809           next().smi_value_ = static_cast<uint32_t>(value);
810 
811           if (kind == DECIMAL_WITH_LEADING_ZERO) {
812             octal_pos_ = Location(start_pos, source_pos());
813             octal_message_ = MessageTemplate::kStrictDecimalWithLeadingZero;
814           }
815           return Token::SMI;
816         }
817       }
818 
819       if (!ScanDecimalDigits(allow_numeric_separator)) {
820         return Token::ILLEGAL;
821       }
822       if (c0_ == '.') {
823         seen_period = true;
824         AddLiteralCharAdvance();
825         if (c0_ == '_') {
826           return Token::ILLEGAL;
827         }
828         if (!ScanDecimalDigits(true)) return Token::ILLEGAL;
829       }
830     }
831   }
832 
833   bool is_bigint = false;
834   if (c0_ == 'n' && !seen_period && IsValidBigIntKind(kind)) {
835     // Check that the literal is within our limits for BigInt length.
836     // For simplicity, use 4 bits per character to calculate the maximum
837     // allowed literal length.
838     static const int kMaxBigIntCharacters = BigInt::kMaxLengthBits / 4;
839     int length = source_pos() - start_pos - (kind != DECIMAL ? 2 : 0);
840     if (length > kMaxBigIntCharacters) {
841       ReportScannerError(Location(start_pos, source_pos()),
842                          MessageTemplate::kBigIntTooBig);
843       return Token::ILLEGAL;
844     }
845 
846     is_bigint = true;
847     Advance();
848   } else if (AsciiAlphaToLower(c0_) == 'e') {
849     // scan exponent, if any
850     DCHECK(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
851 
852     if (!IsDecimalNumberKind(kind)) return Token::ILLEGAL;
853 
854     // scan exponent
855     AddLiteralCharAdvance();
856 
857     if (!ScanSignedInteger()) return Token::ILLEGAL;
858   }
859 
860   // The source character immediately following a numeric literal must
861   // not be an identifier start or a decimal digit; see ECMA-262
862   // section 7.8.3, page 17 (note that we read only one decimal digit
863   // if the value is 0).
864   if (IsDecimalDigit(c0_) || IsIdentifierStart(c0_)) {
865     return Token::ILLEGAL;
866   }
867 
868   if (kind == DECIMAL_WITH_LEADING_ZERO) {
869     octal_pos_ = Location(start_pos, source_pos());
870     octal_message_ = MessageTemplate::kStrictDecimalWithLeadingZero;
871   }
872 
873   return is_bigint ? Token::BIGINT : Token::NUMBER;
874 }
875 
ScanIdentifierUnicodeEscape()876 base::uc32 Scanner::ScanIdentifierUnicodeEscape() {
877   Advance();
878   if (c0_ != 'u') return Invalid();
879   Advance();
880   return ScanUnicodeEscape<false>();
881 }
882 
883 template <bool capture_raw>
ScanUnicodeEscape()884 base::uc32 Scanner::ScanUnicodeEscape() {
885   // Accept both \uxxxx and \u{xxxxxx}. In the latter case, the number of
886   // hex digits between { } is arbitrary. \ and u have already been read.
887   if (c0_ == '{') {
888     int begin = source_pos() - 2;
889     Advance<capture_raw>();
890     base::uc32 cp =
891         ScanUnlimitedLengthHexNumber<capture_raw>(String::kMaxCodePoint, begin);
892     if (cp == kInvalidSequence || c0_ != '}') {
893       ReportScannerError(source_pos(),
894                          MessageTemplate::kInvalidUnicodeEscapeSequence);
895       return Invalid();
896     }
897     Advance<capture_raw>();
898     return cp;
899   }
900   const bool unicode = true;
901   return ScanHexNumber<capture_raw, unicode>(4);
902 }
903 
ScanIdentifierOrKeywordInnerSlow(bool escaped,bool can_be_keyword)904 Token::Value Scanner::ScanIdentifierOrKeywordInnerSlow(bool escaped,
905                                                        bool can_be_keyword) {
906   while (true) {
907     if (c0_ == '\\') {
908       escaped = true;
909       base::uc32 c = ScanIdentifierUnicodeEscape();
910       // Only allow legal identifier part characters.
911       // TODO(verwaest): Make this true.
912       // DCHECK(!IsIdentifierPart('\'));
913       DCHECK(!IsIdentifierPart(Invalid()));
914       if (c == '\\' || !IsIdentifierPart(c)) {
915         return Token::ILLEGAL;
916       }
917       can_be_keyword = can_be_keyword && CharCanBeKeyword(c);
918       AddLiteralChar(c);
919     } else if (IsIdentifierPart(c0_) ||
920                (CombineSurrogatePair() && IsIdentifierPart(c0_))) {
921       can_be_keyword = can_be_keyword && CharCanBeKeyword(c0_);
922       AddLiteralCharAdvance();
923     } else {
924       break;
925     }
926   }
927 
928   if (can_be_keyword && next().literal_chars.is_one_byte()) {
929     base::Vector<const uint8_t> chars = next().literal_chars.one_byte_literal();
930     Token::Value token =
931         KeywordOrIdentifierToken(chars.begin(), chars.length());
932     if (base::IsInRange(token, Token::IDENTIFIER, Token::YIELD)) return token;
933 
934     if (token == Token::FUTURE_STRICT_RESERVED_WORD) {
935       if (escaped) return Token::ESCAPED_STRICT_RESERVED_WORD;
936       return token;
937     }
938 
939     if (!escaped) return token;
940 
941     STATIC_ASSERT(Token::LET + 1 == Token::STATIC);
942     if (base::IsInRange(token, Token::LET, Token::STATIC)) {
943       return Token::ESCAPED_STRICT_RESERVED_WORD;
944     }
945     return Token::ESCAPED_KEYWORD;
946   }
947 
948   return Token::IDENTIFIER;
949 }
950 
ScanRegExpPattern()951 bool Scanner::ScanRegExpPattern() {
952   DCHECK_EQ(Token::UNINITIALIZED, next_next().token);
953   DCHECK(next().token == Token::DIV || next().token == Token::ASSIGN_DIV);
954 
955   // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
956   bool in_character_class = false;
957 
958   // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
959   // the scanner should pass uninterpreted bodies to the RegExp
960   // constructor.
961   next().literal_chars.Start();
962   if (next().token == Token::ASSIGN_DIV) {
963     AddLiteralChar('=');
964   }
965 
966   while (c0_ != '/' || in_character_class) {
967     if (c0_ == kEndOfInput || unibrow::IsLineTerminator(c0_)) {
968       return false;
969     }
970     if (c0_ == '\\') {  // Escape sequence.
971       AddLiteralCharAdvance();
972       if (c0_ == kEndOfInput || unibrow::IsLineTerminator(c0_)) {
973         return false;
974       }
975       AddLiteralCharAdvance();
976       // If the escape allows more characters, i.e., \x??, \u????, or \c?,
977       // only "safe" characters are allowed (letters, digits, underscore),
978       // otherwise the escape isn't valid and the invalid character has
979       // its normal meaning. I.e., we can just continue scanning without
980       // worrying whether the following characters are part of the escape
981       // or not, since any '/', '\\' or '[' is guaranteed to not be part
982       // of the escape sequence.
983     } else {  // Unescaped character.
984       if (c0_ == '[') in_character_class = true;
985       if (c0_ == ']') in_character_class = false;
986       AddLiteralCharAdvance();
987     }
988   }
989   Advance();  // consume '/'
990 
991   next().token = Token::REGEXP_LITERAL;
992   return true;
993 }
994 
ScanRegExpFlags()995 base::Optional<RegExpFlags> Scanner::ScanRegExpFlags() {
996   DCHECK_EQ(Token::REGEXP_LITERAL, next().token);
997 
998   RegExpFlags flags;
999   while (IsIdentifierPart(c0_)) {
1000     base::Optional<RegExpFlag> maybe_flag = JSRegExp::FlagFromChar(c0_);
1001     if (!maybe_flag.has_value()) return {};
1002     RegExpFlag flag = maybe_flag.value();
1003     if (flags & flag) return {};
1004     Advance();
1005     flags |= flag;
1006   }
1007 
1008   next().location.end_pos = source_pos();
1009   return flags;
1010 }
1011 
CurrentSymbol(AstValueFactory * ast_value_factory) const1012 const AstRawString* Scanner::CurrentSymbol(
1013     AstValueFactory* ast_value_factory) const {
1014   if (is_literal_one_byte()) {
1015     return ast_value_factory->GetOneByteString(literal_one_byte_string());
1016   }
1017   return ast_value_factory->GetTwoByteString(literal_two_byte_string());
1018 }
1019 
NextSymbol(AstValueFactory * ast_value_factory) const1020 const AstRawString* Scanner::NextSymbol(
1021     AstValueFactory* ast_value_factory) const {
1022   if (is_next_literal_one_byte()) {
1023     return ast_value_factory->GetOneByteString(next_literal_one_byte_string());
1024   }
1025   return ast_value_factory->GetTwoByteString(next_literal_two_byte_string());
1026 }
1027 
CurrentRawSymbol(AstValueFactory * ast_value_factory) const1028 const AstRawString* Scanner::CurrentRawSymbol(
1029     AstValueFactory* ast_value_factory) const {
1030   if (is_raw_literal_one_byte()) {
1031     return ast_value_factory->GetOneByteString(raw_literal_one_byte_string());
1032   }
1033   return ast_value_factory->GetTwoByteString(raw_literal_two_byte_string());
1034 }
1035 
1036 
DoubleValue()1037 double Scanner::DoubleValue() {
1038   DCHECK(is_literal_one_byte());
1039   return StringToDouble(
1040       literal_one_byte_string(),
1041       ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY);
1042 }
1043 
CurrentLiteralAsCString(Zone * zone) const1044 const char* Scanner::CurrentLiteralAsCString(Zone* zone) const {
1045   DCHECK(is_literal_one_byte());
1046   base::Vector<const uint8_t> vector = literal_one_byte_string();
1047   int length = vector.length();
1048   char* buffer = zone->NewArray<char>(length + 1);
1049   memcpy(buffer, vector.begin(), length);
1050   buffer[length] = '\0';
1051   return buffer;
1052 }
1053 
SeekNext(size_t position)1054 void Scanner::SeekNext(size_t position) {
1055   // Use with care: This cleanly resets most, but not all scanner state.
1056   // TODO(vogelheim): Fix this, or at least DCHECK the relevant conditions.
1057 
1058   // To re-scan from a given character position, we need to:
1059   // 1, Reset the current_, next_ and next_next_ tokens
1060   //    (next_ + next_next_ will be overwrittem by Next(),
1061   //     current_ will remain unchanged, so overwrite it fully.)
1062   for (TokenDesc& token : token_storage_) {
1063     token.token = Token::UNINITIALIZED;
1064     token.invalid_template_escape_message = MessageTemplate::kNone;
1065   }
1066   // 2, reset the source to the desired position,
1067   source_->Seek(position);
1068   // 3, re-scan, by scanning the look-ahead char + 1 token (next_).
1069   c0_ = source_->Advance();
1070   next().after_line_terminator = false;
1071   Scan();
1072   DCHECK_EQ(next().location.beg_pos, static_cast<int>(position));
1073 }
1074 
1075 }  // namespace internal
1076 }  // namespace v8
1077