1 // Copyright 2011 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Features shared by parsing and pre-parsing scanners. 6 7 #ifndef V8_PARSING_SCANNER_H_ 8 #define V8_PARSING_SCANNER_H_ 9 10 #include <algorithm> 11 12 #include "src/allocation.h" 13 #include "src/base/logging.h" 14 #include "src/char-predicates.h" 15 #include "src/globals.h" 16 #include "src/messages.h" 17 #include "src/parsing/token.h" 18 #include "src/unicode-decoder.h" 19 #include "src/unicode.h" 20 21 namespace v8 { 22 namespace internal { 23 24 25 class AstRawString; 26 class AstValueFactory; 27 class DuplicateFinder; 28 class ExternalOneByteString; 29 class ExternalTwoByteString; 30 class ParserRecorder; 31 class UnicodeCache; 32 33 // --------------------------------------------------------------------- 34 // Buffered stream of UTF-16 code units, using an internal UTF-16 buffer. 35 // A code unit is a 16 bit value representing either a 16 bit code point 36 // or one part of a surrogate pair that make a single 21 bit code point. 37 class Utf16CharacterStream { 38 public: 39 static const uc32 kEndOfInput = -1; 40 ~Utf16CharacterStream()41 virtual ~Utf16CharacterStream() {} 42 Peek()43 inline uc32 Peek() { 44 if (V8_LIKELY(buffer_cursor_ < buffer_end_)) { 45 return static_cast<uc32>(*buffer_cursor_); 46 } else if (ReadBlockChecked()) { 47 return static_cast<uc32>(*buffer_cursor_); 48 } else { 49 return kEndOfInput; 50 } 51 } 52 53 // Returns and advances past the next UTF-16 code unit in the input 54 // stream. If there are no more code units it returns kEndOfInput. Advance()55 inline uc32 Advance() { 56 uc32 result = Peek(); 57 buffer_cursor_++; 58 return result; 59 } 60 61 // Returns and advances past the next UTF-16 code unit in the input stream 62 // that meets the checks requirement. If there are no more code units it 63 // returns kEndOfInput. 64 template <typename FunctionType> AdvanceUntil(FunctionType check)65 V8_INLINE uc32 AdvanceUntil(FunctionType check) { 66 while (true) { 67 auto next_cursor_pos = 68 std::find_if(buffer_cursor_, buffer_end_, [&check](uint16_t raw_c0_) { 69 uc32 c0_ = static_cast<uc32>(raw_c0_); 70 return check(c0_); 71 }); 72 73 if (next_cursor_pos == buffer_end_) { 74 buffer_cursor_ = buffer_end_; 75 if (!ReadBlockChecked()) { 76 buffer_cursor_++; 77 return kEndOfInput; 78 } 79 } else { 80 buffer_cursor_ = next_cursor_pos + 1; 81 return static_cast<uc32>(*next_cursor_pos); 82 } 83 } 84 } 85 86 // Go back one by one character in the input stream. 87 // This undoes the most recent Advance(). Back()88 inline void Back() { 89 // The common case - if the previous character is within 90 // buffer_start_ .. buffer_end_ will be handles locally. 91 // Otherwise, a new block is requested. 92 if (V8_LIKELY(buffer_cursor_ > buffer_start_)) { 93 buffer_cursor_--; 94 } else { 95 ReadBlockAt(pos() - 1); 96 } 97 } 98 pos()99 inline size_t pos() const { 100 return buffer_pos_ + (buffer_cursor_ - buffer_start_); 101 } 102 Seek(size_t pos)103 inline void Seek(size_t pos) { 104 if (V8_LIKELY(pos >= buffer_pos_ && 105 pos < (buffer_pos_ + (buffer_end_ - buffer_start_)))) { 106 buffer_cursor_ = buffer_start_ + (pos - buffer_pos_); 107 } else { 108 ReadBlockAt(pos); 109 } 110 } 111 112 // Returns true if the stream could access the V8 heap after construction. 113 virtual bool can_access_heap() = 0; 114 115 protected: Utf16CharacterStream(const uint16_t * buffer_start,const uint16_t * buffer_cursor,const uint16_t * buffer_end,size_t buffer_pos)116 Utf16CharacterStream(const uint16_t* buffer_start, 117 const uint16_t* buffer_cursor, 118 const uint16_t* buffer_end, size_t buffer_pos) 119 : buffer_start_(buffer_start), 120 buffer_cursor_(buffer_cursor), 121 buffer_end_(buffer_end), 122 buffer_pos_(buffer_pos) {} Utf16CharacterStream()123 Utf16CharacterStream() : Utf16CharacterStream(nullptr, nullptr, nullptr, 0) {} 124 ReadBlockChecked()125 bool ReadBlockChecked() { 126 size_t position = pos(); 127 USE(position); 128 bool success = ReadBlock(); 129 130 // Post-conditions: 1, We should always be at the right position. 131 // 2, Cursor should be inside the buffer. 132 // 3, We should have more characters available iff success. 133 DCHECK_EQ(pos(), position); 134 DCHECK_LE(buffer_cursor_, buffer_end_); 135 DCHECK_LE(buffer_start_, buffer_cursor_); 136 DCHECK_EQ(success, buffer_cursor_ < buffer_end_); 137 return success; 138 } 139 ReadBlockAt(size_t new_pos)140 void ReadBlockAt(size_t new_pos) { 141 // The callers of this method (Back/Back2/Seek) should handle the easy 142 // case (seeking within the current buffer), and we should only get here 143 // if we actually require new data. 144 // (This is really an efficiency check, not a correctness invariant.) 145 DCHECK(new_pos < buffer_pos_ || 146 new_pos >= buffer_pos_ + (buffer_end_ - buffer_start_)); 147 148 // Change pos() to point to new_pos. 149 buffer_pos_ = new_pos; 150 buffer_cursor_ = buffer_start_; 151 DCHECK_EQ(pos(), new_pos); 152 ReadBlockChecked(); 153 } 154 155 // Read more data, and update buffer_*_ to point to it. 156 // Returns true if more data was available. 157 // 158 // ReadBlock() may modify any of the buffer_*_ members, but must sure that 159 // the result of pos() remains unaffected. 160 // 161 // Examples: 162 // - a stream could either fill a separate buffer. Then buffer_start_ and 163 // buffer_cursor_ would point to the beginning of the buffer, and 164 // buffer_pos would be the old pos(). 165 // - a stream with existing buffer chunks would set buffer_start_ and 166 // buffer_end_ to cover the full chunk, and then buffer_cursor_ would 167 // point into the middle of the buffer, while buffer_pos_ would describe 168 // the start of the buffer. 169 virtual bool ReadBlock() = 0; 170 171 const uint16_t* buffer_start_; 172 const uint16_t* buffer_cursor_; 173 const uint16_t* buffer_end_; 174 size_t buffer_pos_; 175 }; 176 177 // ---------------------------------------------------------------------------- 178 // JavaScript Scanner. 179 180 class Scanner { 181 public: 182 // Scoped helper for a re-settable bookmark. 183 class BookmarkScope { 184 public: BookmarkScope(Scanner * scanner)185 explicit BookmarkScope(Scanner* scanner) 186 : scanner_(scanner), bookmark_(kNoBookmark) { 187 DCHECK_NOT_NULL(scanner_); 188 } ~BookmarkScope()189 ~BookmarkScope() {} 190 191 void Set(); 192 void Apply(); 193 bool HasBeenSet(); 194 bool HasBeenApplied(); 195 196 private: 197 static const size_t kNoBookmark; 198 static const size_t kBookmarkWasApplied; 199 static const size_t kBookmarkAtFirstPos; 200 201 Scanner* scanner_; 202 size_t bookmark_; 203 204 DISALLOW_COPY_AND_ASSIGN(BookmarkScope); 205 }; 206 207 // Representation of an interval of source positions. 208 struct Location { LocationLocation209 Location(int b, int e) : beg_pos(b), end_pos(e) { } LocationLocation210 Location() : beg_pos(0), end_pos(0) { } 211 IsValidLocation212 bool IsValid() const { 213 return beg_pos >= 0 && end_pos >= beg_pos; 214 } 215 invalidLocation216 static Location invalid() { return Location(-1, -1); } 217 218 int beg_pos; 219 int end_pos; 220 }; 221 222 // -1 is outside of the range of any real source code. 223 static const int kNoOctalLocation = -1; 224 static const uc32 kEndOfInput = Utf16CharacterStream::kEndOfInput; 225 226 explicit Scanner(UnicodeCache* scanner_contants, Utf16CharacterStream* source, 227 bool is_module); 228 229 void Initialize(); 230 231 // Returns the next token and advances input. 232 Token::Value Next(); 233 // Returns the token following peek() 234 Token::Value PeekAhead(); 235 // Returns the current token again. current_token()236 Token::Value current_token() { return current().token; } 237 current_contextual_token()238 Token::Value current_contextual_token() { return current().contextual_token; } next_contextual_token()239 Token::Value next_contextual_token() { return next().contextual_token; } 240 241 // Returns the location information for the current token 242 // (the token last returned by Next()). location()243 Location location() const { return current().location; } 244 245 // This error is specifically an invalid hex or unicode escape sequence. has_error()246 bool has_error() const { return scanner_error_ != MessageTemplate::kNone; } error()247 MessageTemplate::Template error() const { return scanner_error_; } error_location()248 Location error_location() const { return scanner_error_location_; } 249 has_invalid_template_escape()250 bool has_invalid_template_escape() const { 251 return current().invalid_template_escape_message != MessageTemplate::kNone; 252 } invalid_template_escape_message()253 MessageTemplate::Template invalid_template_escape_message() const { 254 DCHECK(has_invalid_template_escape()); 255 return current().invalid_template_escape_message; 256 } invalid_template_escape_location()257 Location invalid_template_escape_location() const { 258 DCHECK(has_invalid_template_escape()); 259 return current().invalid_template_escape_location; 260 } 261 262 // Similar functions for the upcoming token. 263 264 // One token look-ahead (past the token returned by Next()). peek()265 Token::Value peek() const { return next().token; } 266 peek_location()267 Location peek_location() const { return next().location; } 268 literal_contains_escapes()269 bool literal_contains_escapes() const { 270 return LiteralContainsEscapes(current()); 271 } 272 273 const AstRawString* CurrentSymbol(AstValueFactory* ast_value_factory) const; 274 const AstRawString* NextSymbol(AstValueFactory* ast_value_factory) const; 275 const AstRawString* CurrentRawSymbol( 276 AstValueFactory* ast_value_factory) const; 277 278 double DoubleValue(); 279 280 const char* CurrentLiteralAsCString(Zone* zone) const; 281 CurrentMatches(Token::Value token)282 inline bool CurrentMatches(Token::Value token) const { 283 DCHECK(Token::IsKeyword(token)); 284 return current().token == token; 285 } 286 CurrentMatchesContextual(Token::Value token)287 inline bool CurrentMatchesContextual(Token::Value token) const { 288 DCHECK(Token::IsContextualKeyword(token)); 289 return current().contextual_token == token; 290 } 291 292 // Match the token against the contextual keyword or literal buffer. CurrentMatchesContextualEscaped(Token::Value token)293 inline bool CurrentMatchesContextualEscaped(Token::Value token) const { 294 DCHECK(Token::IsContextualKeyword(token) || token == Token::LET); 295 // Escaped keywords are not matched as tokens. So if we require escape 296 // and/or string processing we need to look at the literal content 297 // (which was escape-processed already). 298 // Conveniently, !current().literal_chars.is_used() for all proper 299 // keywords, so this second condition should exit early in common cases. 300 return (current().contextual_token == token) || 301 (current().literal_chars.is_used() && 302 current().literal_chars.Equals(Vector<const char>( 303 Token::String(token), Token::StringLength(token)))); 304 } 305 IsUseStrict()306 bool IsUseStrict() const { 307 return current().token == Token::STRING && 308 current().literal_chars.Equals( 309 Vector<const char>("use strict", strlen("use strict"))); 310 } IsGetOrSet(bool * is_get,bool * is_set)311 bool IsGetOrSet(bool* is_get, bool* is_set) const { 312 *is_get = CurrentMatchesContextual(Token::GET); 313 *is_set = CurrentMatchesContextual(Token::SET); 314 return *is_get || *is_set; 315 } IsLet()316 bool IsLet() const { 317 return CurrentMatches(Token::LET) || 318 CurrentMatchesContextualEscaped(Token::LET); 319 } 320 321 // Check whether the CurrentSymbol() has already been seen. 322 // The DuplicateFinder holds the data, so different instances can be used 323 // for different sets of duplicates to check for. 324 bool IsDuplicateSymbol(DuplicateFinder* duplicate_finder, 325 AstValueFactory* ast_value_factory) const; 326 unicode_cache()327 UnicodeCache* unicode_cache() { return unicode_cache_; } 328 329 // Returns the location of the last seen octal literal. octal_position()330 Location octal_position() const { return octal_pos_; } clear_octal_position()331 void clear_octal_position() { 332 octal_pos_ = Location::invalid(); 333 octal_message_ = MessageTemplate::kNone; 334 } octal_message()335 MessageTemplate::Template octal_message() const { return octal_message_; } 336 337 // Returns the value of the last smi that was scanned. smi_value()338 uint32_t smi_value() const { return current().smi_value_; } 339 340 // Seek forward to the given position. This operation does not 341 // work in general, for instance when there are pushed back 342 // characters, but works for seeking forward until simple delimiter 343 // tokens, which is what it is used for. 344 void SeekForward(int pos); 345 346 // Returns true if there was a line terminator before the peek'ed token, 347 // possibly inside a multi-line comment. HasLineTerminatorBeforeNext()348 bool HasLineTerminatorBeforeNext() const { 349 return next().after_line_terminator; 350 } 351 HasLineTerminatorAfterNext()352 bool HasLineTerminatorAfterNext() { 353 Token::Value ensure_next_next = PeekAhead(); 354 USE(ensure_next_next); 355 return next_next().after_line_terminator; 356 } 357 358 // Scans the input as a regular expression pattern, next token must be /(=). 359 // Returns true if a pattern is scanned. 360 bool ScanRegExpPattern(); 361 // Scans the input as regular expression flags. Returns the flags on success. 362 Maybe<RegExp::Flags> ScanRegExpFlags(); 363 364 // Scans the input as a template literal 365 Token::Value ScanTemplateStart(); ScanTemplateContinuation()366 Token::Value ScanTemplateContinuation() { 367 DCHECK_EQ(next().token, Token::RBRACE); 368 next().location.beg_pos = source_pos() - 1; // We already consumed } 369 return ScanTemplateSpan(); 370 } 371 372 Handle<String> SourceUrl(Isolate* isolate) const; 373 Handle<String> SourceMappingUrl(Isolate* isolate) const; 374 FoundHtmlComment()375 bool FoundHtmlComment() const { return found_html_comment_; } 376 allow_harmony_bigint()377 bool allow_harmony_bigint() const { return allow_harmony_bigint_; } set_allow_harmony_bigint(bool allow)378 void set_allow_harmony_bigint(bool allow) { allow_harmony_bigint_ = allow; } allow_harmony_private_fields()379 bool allow_harmony_private_fields() const { 380 return allow_harmony_private_fields_; 381 } set_allow_harmony_private_fields(bool allow)382 void set_allow_harmony_private_fields(bool allow) { 383 allow_harmony_private_fields_ = allow; 384 } allow_harmony_numeric_separator()385 bool allow_harmony_numeric_separator() const { 386 return allow_harmony_numeric_separator_; 387 } set_allow_harmony_numeric_separator(bool allow)388 void set_allow_harmony_numeric_separator(bool allow) { 389 allow_harmony_numeric_separator_ = allow; 390 } 391 392 private: 393 // Scoped helper for saving & restoring scanner error state. 394 // This is used for tagged template literals, in which normally forbidden 395 // escape sequences are allowed. 396 class ErrorState; 397 398 // Scoped helper for literal recording. Automatically drops the literal 399 // if aborting the scanning before it's complete. 400 class LiteralScope { 401 public: LiteralScope(Scanner * self)402 explicit LiteralScope(Scanner* self) : scanner_(self), complete_(false) { 403 scanner_->StartLiteral(); 404 } ~LiteralScope()405 ~LiteralScope() { 406 if (!complete_) scanner_->DropLiteral(); 407 } Complete()408 void Complete() { complete_ = true; } 409 410 private: 411 Scanner* scanner_; 412 bool complete_; 413 }; 414 415 // LiteralBuffer - Collector of chars of literals. 416 class LiteralBuffer { 417 public: LiteralBuffer()418 LiteralBuffer() 419 : position_(0), is_one_byte_(true), is_used_(false), backing_store_() {} 420 ~LiteralBuffer()421 ~LiteralBuffer() { backing_store_.Dispose(); } 422 AddChar(char code_unit)423 V8_INLINE void AddChar(char code_unit) { 424 DCHECK(is_used_); 425 DCHECK(IsValidAscii(code_unit)); 426 AddOneByteChar(static_cast<byte>(code_unit)); 427 } 428 AddChar(uc32 code_unit)429 V8_INLINE void AddChar(uc32 code_unit) { 430 DCHECK(is_used_); 431 if (is_one_byte_) { 432 if (code_unit <= static_cast<uc32>(unibrow::Latin1::kMaxChar)) { 433 AddOneByteChar(static_cast<byte>(code_unit)); 434 return; 435 } 436 ConvertToTwoByte(); 437 } 438 AddTwoByteChar(code_unit); 439 } 440 is_one_byte()441 bool is_one_byte() const { return is_one_byte_; } 442 Equals(Vector<const char> keyword)443 bool Equals(Vector<const char> keyword) const { 444 DCHECK(is_used_); 445 return is_one_byte() && keyword.length() == position_ && 446 (memcmp(keyword.start(), backing_store_.start(), position_) == 0); 447 } 448 two_byte_literal()449 Vector<const uint16_t> two_byte_literal() const { 450 DCHECK(!is_one_byte_); 451 DCHECK(is_used_); 452 DCHECK_EQ(position_ & 0x1, 0); 453 return Vector<const uint16_t>( 454 reinterpret_cast<const uint16_t*>(backing_store_.start()), 455 position_ >> 1); 456 } 457 one_byte_literal()458 Vector<const uint8_t> one_byte_literal() const { 459 DCHECK(is_one_byte_); 460 DCHECK(is_used_); 461 return Vector<const uint8_t>( 462 reinterpret_cast<const uint8_t*>(backing_store_.start()), position_); 463 } 464 length()465 int length() const { return is_one_byte_ ? position_ : (position_ >> 1); } 466 Start()467 void Start() { 468 DCHECK(!is_used_); 469 DCHECK_EQ(0, position_); 470 is_used_ = true; 471 } 472 is_used()473 bool is_used() const { return is_used_; } 474 Drop()475 void Drop() { 476 is_used_ = false; 477 position_ = 0; 478 is_one_byte_ = true; 479 } 480 481 Handle<String> Internalize(Isolate* isolate) const; 482 483 private: 484 static const int kInitialCapacity = 16; 485 static const int kGrowthFactory = 4; 486 static const int kMinConversionSlack = 256; 487 static const int kMaxGrowth = 1 * MB; 488 IsValidAscii(char code_unit)489 inline bool IsValidAscii(char code_unit) { 490 // Control characters and printable characters span the range of 491 // valid ASCII characters (0-127). Chars are unsigned on some 492 // platforms which causes compiler warnings if the validity check 493 // tests the lower bound >= 0 as it's always true. 494 return iscntrl(code_unit) || isprint(code_unit); 495 } 496 AddOneByteChar(byte one_byte_char)497 V8_INLINE void AddOneByteChar(byte one_byte_char) { 498 DCHECK(is_one_byte_); 499 if (position_ >= backing_store_.length()) ExpandBuffer(); 500 backing_store_[position_] = one_byte_char; 501 position_ += kOneByteSize; 502 } 503 504 void AddTwoByteChar(uc32 code_unit); 505 int NewCapacity(int min_capacity); 506 void ExpandBuffer(); 507 void ConvertToTwoByte(); 508 509 int position_; 510 bool is_one_byte_; 511 bool is_used_; 512 Vector<byte> backing_store_; 513 514 DISALLOW_COPY_AND_ASSIGN(LiteralBuffer); 515 }; 516 517 // The current and look-ahead token. 518 struct TokenDesc { 519 Location location = {0, 0}; 520 LiteralBuffer literal_chars; 521 LiteralBuffer raw_literal_chars; 522 Token::Value token = Token::UNINITIALIZED; 523 MessageTemplate::Template invalid_template_escape_message = 524 MessageTemplate::kNone; 525 Location invalid_template_escape_location; 526 Token::Value contextual_token = Token::UNINITIALIZED; 527 uint32_t smi_value_ = 0; 528 bool after_line_terminator = false; 529 }; 530 531 enum NumberKind { 532 BINARY, 533 OCTAL, 534 IMPLICIT_OCTAL, 535 HEX, 536 DECIMAL, 537 DECIMAL_WITH_LEADING_ZERO 538 }; 539 540 static const int kCharacterLookaheadBufferSize = 1; 541 const int kMaxAscii = 127; 542 543 // Scans octal escape sequence. Also accepts "\0" decimal escape sequence. 544 template <bool capture_raw> 545 uc32 ScanOctalEscape(uc32 c, int length); 546 547 // Call this after setting source_ to the input. Init()548 void Init() { 549 // Set c0_ (one character ahead) 550 STATIC_ASSERT(kCharacterLookaheadBufferSize == 1); 551 Advance(); 552 553 current_ = &token_storage_[0]; 554 next_ = &token_storage_[1]; 555 next_next_ = &token_storage_[2]; 556 557 found_html_comment_ = false; 558 scanner_error_ = MessageTemplate::kNone; 559 } 560 ReportScannerError(const Location & location,MessageTemplate::Template error)561 void ReportScannerError(const Location& location, 562 MessageTemplate::Template error) { 563 if (has_error()) return; 564 scanner_error_ = error; 565 scanner_error_location_ = location; 566 } 567 ReportScannerError(int pos,MessageTemplate::Template error)568 void ReportScannerError(int pos, MessageTemplate::Template error) { 569 if (has_error()) return; 570 scanner_error_ = error; 571 scanner_error_location_ = Location(pos, pos + 1); 572 } 573 574 // Seek to the next_ token at the given position. 575 void SeekNext(size_t position); 576 577 // Literal buffer support StartLiteral()578 inline void StartLiteral() { next().literal_chars.Start(); } 579 StartRawLiteral()580 inline void StartRawLiteral() { next().raw_literal_chars.Start(); } 581 AddLiteralChar(uc32 c)582 V8_INLINE void AddLiteralChar(uc32 c) { next().literal_chars.AddChar(c); } 583 AddLiteralChar(char c)584 V8_INLINE void AddLiteralChar(char c) { next().literal_chars.AddChar(c); } 585 AddRawLiteralChar(uc32 c)586 V8_INLINE void AddRawLiteralChar(uc32 c) { 587 next().raw_literal_chars.AddChar(c); 588 } 589 590 // Stops scanning of a literal and drop the collected characters, 591 // e.g., due to an encountered error. DropLiteral()592 inline void DropLiteral() { 593 next().literal_chars.Drop(); 594 next().raw_literal_chars.Drop(); 595 } 596 AddLiteralCharAdvance()597 inline void AddLiteralCharAdvance() { 598 AddLiteralChar(c0_); 599 Advance(); 600 } 601 602 // Low-level scanning support. 603 template <bool capture_raw = false> Advance()604 void Advance() { 605 if (capture_raw) { 606 AddRawLiteralChar(c0_); 607 } 608 c0_ = source_->Advance(); 609 } 610 611 template <typename FunctionType> AdvanceUntil(FunctionType check)612 V8_INLINE void AdvanceUntil(FunctionType check) { 613 c0_ = source_->AdvanceUntil(check); 614 } 615 CombineSurrogatePair()616 bool CombineSurrogatePair() { 617 DCHECK(!unibrow::Utf16::IsLeadSurrogate(kEndOfInput)); 618 if (unibrow::Utf16::IsLeadSurrogate(c0_)) { 619 uc32 c1 = source_->Advance(); 620 DCHECK(!unibrow::Utf16::IsTrailSurrogate(kEndOfInput)); 621 if (unibrow::Utf16::IsTrailSurrogate(c1)) { 622 c0_ = unibrow::Utf16::CombineSurrogatePair(c0_, c1); 623 return true; 624 } 625 source_->Back(); 626 } 627 return false; 628 } 629 PushBack(uc32 ch)630 void PushBack(uc32 ch) { 631 DCHECK_LE(c0_, static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)); 632 source_->Back(); 633 c0_ = ch; 634 } 635 Peek()636 uc32 Peek() const { return source_->Peek(); } 637 Select(Token::Value tok)638 inline Token::Value Select(Token::Value tok) { 639 Advance(); 640 return tok; 641 } 642 Select(uc32 next,Token::Value then,Token::Value else_)643 inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) { 644 Advance(); 645 if (c0_ == next) { 646 Advance(); 647 return then; 648 } else { 649 return else_; 650 } 651 } 652 // Returns the literal string, if any, for the current token (the 653 // token last returned by Next()). The string is 0-terminated. 654 // Literal strings are collected for identifiers, strings, numbers as well 655 // as for template literals. For template literals we also collect the raw 656 // form. 657 // These functions only give the correct result if the literal was scanned 658 // when a LiteralScope object is alive. 659 // 660 // Current usage of these functions is unfortunately a little undisciplined, 661 // and is_literal_one_byte() + is_literal_one_byte_string() is also 662 // requested for tokens that do not have a literal. Hence, we treat any 663 // token as a one-byte literal. E.g. Token::FUNCTION pretends to have a 664 // literal "function". literal_one_byte_string()665 Vector<const uint8_t> literal_one_byte_string() const { 666 if (current().literal_chars.is_used()) 667 return current().literal_chars.one_byte_literal(); 668 const char* str = Token::String(current().token); 669 const uint8_t* str_as_uint8 = reinterpret_cast<const uint8_t*>(str); 670 return Vector<const uint8_t>(str_as_uint8, 671 Token::StringLength(current().token)); 672 } literal_two_byte_string()673 Vector<const uint16_t> literal_two_byte_string() const { 674 DCHECK(current().literal_chars.is_used()); 675 return current().literal_chars.two_byte_literal(); 676 } is_literal_one_byte()677 bool is_literal_one_byte() const { 678 return !current().literal_chars.is_used() || 679 current().literal_chars.is_one_byte(); 680 } 681 // Returns the literal string for the next token (the token that 682 // would be returned if Next() were called). next_literal_one_byte_string()683 Vector<const uint8_t> next_literal_one_byte_string() const { 684 DCHECK(next().literal_chars.is_used()); 685 return next().literal_chars.one_byte_literal(); 686 } next_literal_two_byte_string()687 Vector<const uint16_t> next_literal_two_byte_string() const { 688 DCHECK(next().literal_chars.is_used()); 689 return next().literal_chars.two_byte_literal(); 690 } is_next_literal_one_byte()691 bool is_next_literal_one_byte() const { 692 DCHECK(next().literal_chars.is_used()); 693 return next().literal_chars.is_one_byte(); 694 } raw_literal_one_byte_string()695 Vector<const uint8_t> raw_literal_one_byte_string() const { 696 DCHECK(current().raw_literal_chars.is_used()); 697 return current().raw_literal_chars.one_byte_literal(); 698 } raw_literal_two_byte_string()699 Vector<const uint16_t> raw_literal_two_byte_string() const { 700 DCHECK(current().raw_literal_chars.is_used()); 701 return current().raw_literal_chars.two_byte_literal(); 702 } is_raw_literal_one_byte()703 bool is_raw_literal_one_byte() const { 704 DCHECK(current().raw_literal_chars.is_used()); 705 return current().raw_literal_chars.is_one_byte(); 706 } 707 708 template <bool capture_raw, bool unicode = false> 709 uc32 ScanHexNumber(int expected_length); 710 // Scan a number of any length but not bigger than max_value. For example, the 711 // number can be 000000001, so it's very long in characters but its value is 712 // small. 713 template <bool capture_raw> 714 uc32 ScanUnlimitedLengthHexNumber(int max_value, int beg_pos); 715 716 // Scans a single JavaScript token. 717 void Scan(); 718 719 V8_INLINE Token::Value SkipWhiteSpace(); 720 Token::Value SkipSingleHTMLComment(); 721 Token::Value SkipSingleLineComment(); 722 Token::Value SkipSourceURLComment(); 723 void TryToParseSourceURLComment(); 724 Token::Value SkipMultiLineComment(); 725 // Scans a possible HTML comment -- begins with '<!'. 726 Token::Value ScanHtmlComment(); 727 728 bool ScanDigitsWithNumericSeparators(bool (*predicate)(uc32 ch), 729 bool is_check_first_digit); 730 bool ScanDecimalDigits(); 731 // Optimized function to scan decimal number as Smi. 732 bool ScanDecimalAsSmi(uint64_t* value); 733 bool ScanDecimalAsSmiWithNumericSeparators(uint64_t* value); 734 bool ScanHexDigits(); 735 bool ScanBinaryDigits(); 736 bool ScanSignedInteger(); 737 bool ScanOctalDigits(); 738 bool ScanImplicitOctalDigits(int start_pos, NumberKind* kind); 739 740 Token::Value ScanNumber(bool seen_period); 741 Token::Value ScanIdentifierOrKeyword(); 742 Token::Value ScanIdentifierOrKeywordInner(LiteralScope* literal); 743 744 Token::Value ScanString(); 745 Token::Value ScanPrivateName(); 746 747 // Scans an escape-sequence which is part of a string and adds the 748 // decoded character to the current literal. Returns true if a pattern 749 // is scanned. 750 template <bool capture_raw> 751 bool ScanEscape(); 752 753 // Decodes a Unicode escape-sequence which is part of an identifier. 754 // If the escape sequence cannot be decoded the result is kBadChar. 755 uc32 ScanIdentifierUnicodeEscape(); 756 // Helper for the above functions. 757 template <bool capture_raw> 758 uc32 ScanUnicodeEscape(); 759 760 Token::Value ScanTemplateSpan(); 761 762 // Return the current source position. source_pos()763 int source_pos() { 764 return static_cast<int>(source_->pos()) - kCharacterLookaheadBufferSize; 765 } 766 LiteralContainsEscapes(const TokenDesc & token)767 static bool LiteralContainsEscapes(const TokenDesc& token) { 768 Location location = token.location; 769 int source_length = (location.end_pos - location.beg_pos); 770 if (token.token == Token::STRING) { 771 // Subtract delimiters. 772 source_length -= 2; 773 } 774 return token.literal_chars.is_used() && 775 (token.literal_chars.length() != source_length); 776 } 777 778 #ifdef DEBUG 779 void SanityCheckTokenDesc(const TokenDesc&) const; 780 #endif 781 782 UnicodeCache* unicode_cache_; 783 784 // Values parsed from magic comments. 785 LiteralBuffer source_url_; 786 LiteralBuffer source_mapping_url_; 787 788 TokenDesc token_storage_[3]; 789 next()790 TokenDesc& next() { return *next_; } 791 current()792 const TokenDesc& current() const { return *current_; } next()793 const TokenDesc& next() const { return *next_; } next_next()794 const TokenDesc& next_next() const { return *next_next_; } 795 796 TokenDesc* current_; // desc for current token (as returned by Next()) 797 TokenDesc* next_; // desc for next token (one token look-ahead) 798 TokenDesc* next_next_; // desc for the token after next (after PeakAhead()) 799 800 // Input stream. Must be initialized to an Utf16CharacterStream. 801 Utf16CharacterStream* const source_; 802 803 // Last-seen positions of potentially problematic tokens. 804 Location octal_pos_; 805 MessageTemplate::Template octal_message_; 806 807 // One Unicode character look-ahead; c0_ < 0 at the end of the input. 808 uc32 c0_; 809 810 // Whether this scanner encountered an HTML comment. 811 bool found_html_comment_; 812 813 // Harmony flags to allow ESNext features. 814 bool allow_harmony_bigint_; 815 bool allow_harmony_private_fields_; 816 bool allow_harmony_numeric_separator_; 817 818 const bool is_module_; 819 820 MessageTemplate::Template scanner_error_; 821 Location scanner_error_location_; 822 }; 823 824 } // namespace internal 825 } // namespace v8 826 827 #endif // V8_PARSING_SCANNER_H_ 828