1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Features shared by parsing and pre-parsing scanners.
6
7 #include "src/parsing/scanner.h"
8
9 #include <stdint.h>
10
11 #include <cmath>
12
13 #include "src/ast/ast-value-factory.h"
14 #include "src/char-predicates-inl.h"
15 #include "src/conversions-inl.h"
16 #include "src/objects/bigint.h"
17 #include "src/parsing/duplicate-finder.h" // For Scanner::FindSymbol
18 #include "src/parsing/scanner-inl.h"
19
20 namespace v8 {
21 namespace internal {
22
23 class Scanner::ErrorState {
24 public:
ErrorState(MessageTemplate::Template * message_stack,Scanner::Location * location_stack)25 ErrorState(MessageTemplate::Template* message_stack,
26 Scanner::Location* location_stack)
27 : message_stack_(message_stack),
28 old_message_(*message_stack),
29 location_stack_(location_stack),
30 old_location_(*location_stack) {
31 *message_stack_ = MessageTemplate::kNone;
32 *location_stack_ = Location::invalid();
33 }
34
~ErrorState()35 ~ErrorState() {
36 *message_stack_ = old_message_;
37 *location_stack_ = old_location_;
38 }
39
MoveErrorTo(TokenDesc * dest)40 void MoveErrorTo(TokenDesc* dest) {
41 if (*message_stack_ == MessageTemplate::kNone) {
42 return;
43 }
44 if (dest->invalid_template_escape_message == MessageTemplate::kNone) {
45 dest->invalid_template_escape_message = *message_stack_;
46 dest->invalid_template_escape_location = *location_stack_;
47 }
48 *message_stack_ = MessageTemplate::kNone;
49 *location_stack_ = Location::invalid();
50 }
51
52 private:
53 MessageTemplate::Template* const message_stack_;
54 MessageTemplate::Template const old_message_;
55 Scanner::Location* const location_stack_;
56 Scanner::Location const old_location_;
57 };
58
59 // ----------------------------------------------------------------------------
60 // Scanner::LiteralBuffer
61
Internalize(Isolate * isolate) const62 Handle<String> Scanner::LiteralBuffer::Internalize(Isolate* isolate) const {
63 DCHECK(is_used_);
64 if (is_one_byte()) {
65 return isolate->factory()->InternalizeOneByteString(one_byte_literal());
66 }
67 return isolate->factory()->InternalizeTwoByteString(two_byte_literal());
68 }
69
NewCapacity(int min_capacity)70 int Scanner::LiteralBuffer::NewCapacity(int min_capacity) {
71 int capacity = Max(min_capacity, backing_store_.length());
72 int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth);
73 return new_capacity;
74 }
75
ExpandBuffer()76 void Scanner::LiteralBuffer::ExpandBuffer() {
77 Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));
78 MemCopy(new_store.start(), backing_store_.start(), position_);
79 backing_store_.Dispose();
80 backing_store_ = new_store;
81 }
82
ConvertToTwoByte()83 void Scanner::LiteralBuffer::ConvertToTwoByte() {
84 DCHECK(is_one_byte_);
85 Vector<byte> new_store;
86 int new_content_size = position_ * kUC16Size;
87 if (new_content_size >= backing_store_.length()) {
88 // Ensure room for all currently read code units as UC16 as well
89 // as the code unit about to be stored.
90 new_store = Vector<byte>::New(NewCapacity(new_content_size));
91 } else {
92 new_store = backing_store_;
93 }
94 uint8_t* src = backing_store_.start();
95 uint16_t* dst = reinterpret_cast<uint16_t*>(new_store.start());
96 for (int i = position_ - 1; i >= 0; i--) {
97 dst[i] = src[i];
98 }
99 if (new_store.start() != backing_store_.start()) {
100 backing_store_.Dispose();
101 backing_store_ = new_store;
102 }
103 position_ = new_content_size;
104 is_one_byte_ = false;
105 }
106
AddTwoByteChar(uc32 code_unit)107 void Scanner::LiteralBuffer::AddTwoByteChar(uc32 code_unit) {
108 DCHECK(!is_one_byte_);
109 if (position_ >= backing_store_.length()) ExpandBuffer();
110 if (code_unit <=
111 static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
112 *reinterpret_cast<uint16_t*>(&backing_store_[position_]) = code_unit;
113 position_ += kUC16Size;
114 } else {
115 *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
116 unibrow::Utf16::LeadSurrogate(code_unit);
117 position_ += kUC16Size;
118 if (position_ >= backing_store_.length()) ExpandBuffer();
119 *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
120 unibrow::Utf16::TrailSurrogate(code_unit);
121 position_ += kUC16Size;
122 }
123 }
124
125 // ----------------------------------------------------------------------------
126 // Scanner::BookmarkScope
127
128 const size_t Scanner::BookmarkScope::kBookmarkAtFirstPos =
129 std::numeric_limits<size_t>::max() - 2;
130 const size_t Scanner::BookmarkScope::kNoBookmark =
131 std::numeric_limits<size_t>::max() - 1;
132 const size_t Scanner::BookmarkScope::kBookmarkWasApplied =
133 std::numeric_limits<size_t>::max();
134
Set()135 void Scanner::BookmarkScope::Set() {
136 DCHECK_EQ(bookmark_, kNoBookmark);
137 DCHECK_EQ(scanner_->next_next().token, Token::UNINITIALIZED);
138
139 // The first token is a bit special, since current_ will still be
140 // uninitialized. In this case, store kBookmarkAtFirstPos and special-case it
141 // when
142 // applying the bookmark.
143 DCHECK_IMPLIES(scanner_->current().token == Token::UNINITIALIZED,
144 scanner_->current().location.beg_pos ==
145 scanner_->next().location.beg_pos);
146 bookmark_ = (scanner_->current().token == Token::UNINITIALIZED)
147 ? kBookmarkAtFirstPos
148 : scanner_->location().beg_pos;
149 }
150
Apply()151 void Scanner::BookmarkScope::Apply() {
152 DCHECK(HasBeenSet()); // Caller hasn't called SetBookmark.
153 if (bookmark_ == kBookmarkAtFirstPos) {
154 scanner_->SeekNext(0);
155 } else {
156 scanner_->SeekNext(bookmark_);
157 scanner_->Next();
158 DCHECK_EQ(scanner_->location().beg_pos, static_cast<int>(bookmark_));
159 }
160 bookmark_ = kBookmarkWasApplied;
161 }
162
HasBeenSet()163 bool Scanner::BookmarkScope::HasBeenSet() {
164 return bookmark_ != kNoBookmark && bookmark_ != kBookmarkWasApplied;
165 }
166
HasBeenApplied()167 bool Scanner::BookmarkScope::HasBeenApplied() {
168 return bookmark_ == kBookmarkWasApplied;
169 }
170
171 // ----------------------------------------------------------------------------
172 // Scanner
173
Scanner(UnicodeCache * unicode_cache,Utf16CharacterStream * source,bool is_module)174 Scanner::Scanner(UnicodeCache* unicode_cache, Utf16CharacterStream* source,
175 bool is_module)
176 : unicode_cache_(unicode_cache),
177 source_(source),
178 octal_pos_(Location::invalid()),
179 octal_message_(MessageTemplate::kNone),
180 found_html_comment_(false),
181 allow_harmony_bigint_(false),
182 allow_harmony_numeric_separator_(false),
183 is_module_(is_module) {
184 DCHECK_NOT_NULL(source);
185 }
186
Initialize()187 void Scanner::Initialize() {
188 // Need to capture identifiers in order to recognize "get" and "set"
189 // in object literals.
190 Init();
191 next().after_line_terminator = true;
192 Scan();
193 }
194
195 template <bool capture_raw, bool unicode>
ScanHexNumber(int expected_length)196 uc32 Scanner::ScanHexNumber(int expected_length) {
197 DCHECK_LE(expected_length, 4); // prevent overflow
198
199 int begin = source_pos() - 2;
200 uc32 x = 0;
201 for (int i = 0; i < expected_length; i++) {
202 int d = HexValue(c0_);
203 if (d < 0) {
204 ReportScannerError(Location(begin, begin + expected_length + 2),
205 unicode
206 ? MessageTemplate::kInvalidUnicodeEscapeSequence
207 : MessageTemplate::kInvalidHexEscapeSequence);
208 return -1;
209 }
210 x = x * 16 + d;
211 Advance<capture_raw>();
212 }
213
214 return x;
215 }
216
217 template <bool capture_raw>
ScanUnlimitedLengthHexNumber(int max_value,int beg_pos)218 uc32 Scanner::ScanUnlimitedLengthHexNumber(int max_value, int beg_pos) {
219 uc32 x = 0;
220 int d = HexValue(c0_);
221 if (d < 0) return -1;
222
223 while (d >= 0) {
224 x = x * 16 + d;
225 if (x > max_value) {
226 ReportScannerError(Location(beg_pos, source_pos() + 1),
227 MessageTemplate::kUndefinedUnicodeCodePoint);
228 return -1;
229 }
230 Advance<capture_raw>();
231 d = HexValue(c0_);
232 }
233
234 return x;
235 }
236
237
238 // Ensure that tokens can be stored in a byte.
239 STATIC_ASSERT(Token::NUM_TOKENS <= 0x100);
240
241 // Table of one-character tokens, by character (0x00..0x7F only).
242 // clang-format off
243 static const byte one_char_tokens[] = {
244 Token::ILLEGAL,
245 Token::ILLEGAL,
246 Token::ILLEGAL,
247 Token::ILLEGAL,
248 Token::ILLEGAL,
249 Token::ILLEGAL,
250 Token::ILLEGAL,
251 Token::ILLEGAL,
252 Token::ILLEGAL,
253 Token::ILLEGAL,
254 Token::ILLEGAL,
255 Token::ILLEGAL,
256 Token::ILLEGAL,
257 Token::ILLEGAL,
258 Token::ILLEGAL,
259 Token::ILLEGAL,
260 Token::ILLEGAL,
261 Token::ILLEGAL,
262 Token::ILLEGAL,
263 Token::ILLEGAL,
264 Token::ILLEGAL,
265 Token::ILLEGAL,
266 Token::ILLEGAL,
267 Token::ILLEGAL,
268 Token::ILLEGAL,
269 Token::ILLEGAL,
270 Token::ILLEGAL,
271 Token::ILLEGAL,
272 Token::ILLEGAL,
273 Token::ILLEGAL,
274 Token::ILLEGAL,
275 Token::ILLEGAL,
276 Token::ILLEGAL,
277 Token::ILLEGAL,
278 Token::ILLEGAL,
279 Token::ILLEGAL,
280 Token::ILLEGAL,
281 Token::ILLEGAL,
282 Token::ILLEGAL,
283 Token::ILLEGAL,
284 Token::LPAREN, // 0x28
285 Token::RPAREN, // 0x29
286 Token::ILLEGAL,
287 Token::ILLEGAL,
288 Token::COMMA, // 0x2C
289 Token::ILLEGAL,
290 Token::ILLEGAL,
291 Token::ILLEGAL,
292 Token::ILLEGAL,
293 Token::ILLEGAL,
294 Token::ILLEGAL,
295 Token::ILLEGAL,
296 Token::ILLEGAL,
297 Token::ILLEGAL,
298 Token::ILLEGAL,
299 Token::ILLEGAL,
300 Token::ILLEGAL,
301 Token::ILLEGAL,
302 Token::COLON, // 0x3A
303 Token::SEMICOLON, // 0x3B
304 Token::ILLEGAL,
305 Token::ILLEGAL,
306 Token::ILLEGAL,
307 Token::CONDITIONAL, // 0x3F
308 Token::ILLEGAL,
309 Token::ILLEGAL,
310 Token::ILLEGAL,
311 Token::ILLEGAL,
312 Token::ILLEGAL,
313 Token::ILLEGAL,
314 Token::ILLEGAL,
315 Token::ILLEGAL,
316 Token::ILLEGAL,
317 Token::ILLEGAL,
318 Token::ILLEGAL,
319 Token::ILLEGAL,
320 Token::ILLEGAL,
321 Token::ILLEGAL,
322 Token::ILLEGAL,
323 Token::ILLEGAL,
324 Token::ILLEGAL,
325 Token::ILLEGAL,
326 Token::ILLEGAL,
327 Token::ILLEGAL,
328 Token::ILLEGAL,
329 Token::ILLEGAL,
330 Token::ILLEGAL,
331 Token::ILLEGAL,
332 Token::ILLEGAL,
333 Token::ILLEGAL,
334 Token::ILLEGAL,
335 Token::LBRACK, // 0x5B
336 Token::ILLEGAL,
337 Token::RBRACK, // 0x5D
338 Token::ILLEGAL,
339 Token::ILLEGAL,
340 Token::ILLEGAL,
341 Token::ILLEGAL,
342 Token::ILLEGAL,
343 Token::ILLEGAL,
344 Token::ILLEGAL,
345 Token::ILLEGAL,
346 Token::ILLEGAL,
347 Token::ILLEGAL,
348 Token::ILLEGAL,
349 Token::ILLEGAL,
350 Token::ILLEGAL,
351 Token::ILLEGAL,
352 Token::ILLEGAL,
353 Token::ILLEGAL,
354 Token::ILLEGAL,
355 Token::ILLEGAL,
356 Token::ILLEGAL,
357 Token::ILLEGAL,
358 Token::ILLEGAL,
359 Token::ILLEGAL,
360 Token::ILLEGAL,
361 Token::ILLEGAL,
362 Token::ILLEGAL,
363 Token::ILLEGAL,
364 Token::ILLEGAL,
365 Token::ILLEGAL,
366 Token::ILLEGAL,
367 Token::LBRACE, // 0x7B
368 Token::ILLEGAL,
369 Token::RBRACE, // 0x7D
370 Token::BIT_NOT, // 0x7E
371 Token::ILLEGAL
372 };
373 // clang-format on
374
Next()375 Token::Value Scanner::Next() {
376 if (next().token == Token::EOS) next().location = current().location;
377 // Rotate through tokens.
378 TokenDesc* previous = current_;
379 current_ = next_;
380 // Either we already have the next token lined up, in which case next_next_
381 // simply becomes next_. In that case we use current_ as new next_next_ and
382 // clear its token to indicate that it wasn't scanned yet. Otherwise we use
383 // current_ as next_ and scan into it, leaving next_next_ uninitialized.
384 if (V8_LIKELY(next_next().token == Token::UNINITIALIZED)) {
385 next_ = previous;
386 next().after_line_terminator = false;
387 Scan();
388 } else {
389 next_ = next_next_;
390 next_next_ = previous;
391 previous->token = Token::UNINITIALIZED;
392 previous->contextual_token = Token::UNINITIALIZED;
393 DCHECK_NE(Token::UNINITIALIZED, current().token);
394 }
395 return current().token;
396 }
397
398
PeekAhead()399 Token::Value Scanner::PeekAhead() {
400 DCHECK(next().token != Token::DIV);
401 DCHECK(next().token != Token::ASSIGN_DIV);
402
403 if (next_next().token != Token::UNINITIALIZED) {
404 return next_next().token;
405 }
406 TokenDesc* temp = next_;
407 next_ = next_next_;
408 next().after_line_terminator = false;
409 Scan();
410 next_next_ = next_;
411 next_ = temp;
412 return next_next().token;
413 }
414
SkipSingleHTMLComment()415 Token::Value Scanner::SkipSingleHTMLComment() {
416 if (is_module_) {
417 ReportScannerError(source_pos(), MessageTemplate::kHtmlCommentInModule);
418 return Token::ILLEGAL;
419 }
420 return SkipSingleLineComment();
421 }
422
SkipSingleLineComment()423 Token::Value Scanner::SkipSingleLineComment() {
424 // The line terminator at the end of the line is not considered
425 // to be part of the single-line comment; it is recognized
426 // separately by the lexical grammar and becomes part of the
427 // stream of input elements for the syntactic grammar (see
428 // ECMA-262, section 7.4).
429 AdvanceUntil([](uc32 c0_) { return unibrow::IsLineTerminator(c0_); });
430
431 return Token::WHITESPACE;
432 }
433
SkipSourceURLComment()434 Token::Value Scanner::SkipSourceURLComment() {
435 TryToParseSourceURLComment();
436 while (c0_ != kEndOfInput && !unibrow::IsLineTerminator(c0_)) {
437 Advance();
438 }
439
440 return Token::WHITESPACE;
441 }
442
TryToParseSourceURLComment()443 void Scanner::TryToParseSourceURLComment() {
444 // Magic comments are of the form: //[#@]\s<name>=\s*<value>\s*.* and this
445 // function will just return if it cannot parse a magic comment.
446 DCHECK(!unicode_cache_->IsWhiteSpaceOrLineTerminator(kEndOfInput));
447 if (!unicode_cache_->IsWhiteSpace(c0_)) return;
448 Advance();
449 LiteralBuffer name;
450 name.Start();
451
452 while (c0_ != kEndOfInput &&
453 !unicode_cache_->IsWhiteSpaceOrLineTerminator(c0_) && c0_ != '=') {
454 name.AddChar(c0_);
455 Advance();
456 }
457 if (!name.is_one_byte()) return;
458 Vector<const uint8_t> name_literal = name.one_byte_literal();
459 LiteralBuffer* value;
460 if (name_literal == STATIC_CHAR_VECTOR("sourceURL")) {
461 value = &source_url_;
462 } else if (name_literal == STATIC_CHAR_VECTOR("sourceMappingURL")) {
463 value = &source_mapping_url_;
464 } else {
465 return;
466 }
467 if (c0_ != '=')
468 return;
469 value->Drop();
470 value->Start();
471 Advance();
472 while (unicode_cache_->IsWhiteSpace(c0_)) {
473 Advance();
474 }
475 while (c0_ != kEndOfInput && !unibrow::IsLineTerminator(c0_)) {
476 // Disallowed characters.
477 if (c0_ == '"' || c0_ == '\'') {
478 value->Drop();
479 return;
480 }
481 if (unicode_cache_->IsWhiteSpace(c0_)) {
482 break;
483 }
484 value->AddChar(c0_);
485 Advance();
486 }
487 // Allow whitespace at the end.
488 while (c0_ != kEndOfInput && !unibrow::IsLineTerminator(c0_)) {
489 if (!unicode_cache_->IsWhiteSpace(c0_)) {
490 value->Drop();
491 break;
492 }
493 Advance();
494 }
495 }
496
SkipMultiLineComment()497 Token::Value Scanner::SkipMultiLineComment() {
498 DCHECK_EQ(c0_, '*');
499 Advance();
500
501 while (c0_ != kEndOfInput) {
502 DCHECK(!unibrow::IsLineTerminator(kEndOfInput));
503 if (!HasLineTerminatorBeforeNext() && unibrow::IsLineTerminator(c0_)) {
504 // Following ECMA-262, section 7.4, a comment containing
505 // a newline will make the comment count as a line-terminator.
506 next().after_line_terminator = true;
507 }
508
509 while (V8_UNLIKELY(c0_ == '*')) {
510 Advance();
511 if (c0_ == '/') {
512 Advance();
513 return Token::WHITESPACE;
514 }
515 }
516 Advance();
517 }
518
519 // Unterminated multi-line comment.
520 return Token::ILLEGAL;
521 }
522
ScanHtmlComment()523 Token::Value Scanner::ScanHtmlComment() {
524 // Check for <!-- comments.
525 DCHECK_EQ(c0_, '!');
526 Advance();
527 if (c0_ != '-' || Peek() != '-') {
528 PushBack('!'); // undo Advance()
529 return Token::LT;
530 }
531 Advance();
532
533 found_html_comment_ = true;
534 return SkipSingleHTMLComment();
535 }
536
Scan()537 void Scanner::Scan() {
538 next().literal_chars.Drop();
539 next().raw_literal_chars.Drop();
540 next().invalid_template_escape_message = MessageTemplate::kNone;
541
542 Token::Value token;
543 do {
544 if (static_cast<unsigned>(c0_) <= 0x7F) {
545 Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]);
546 if (token != Token::ILLEGAL) {
547 int pos = source_pos();
548 next().token = token;
549 next().contextual_token = Token::UNINITIALIZED;
550 next().location.beg_pos = pos;
551 next().location.end_pos = pos + 1;
552 Advance();
553 return;
554 }
555 }
556
557 // Remember the position of the next token
558 next().location.beg_pos = source_pos();
559
560 switch (c0_) {
561 case '"':
562 case '\'':
563 token = ScanString();
564 break;
565
566 case '<':
567 // < <= << <<= <!--
568 Advance();
569 if (c0_ == '=') {
570 token = Select(Token::LTE);
571 } else if (c0_ == '<') {
572 token = Select('=', Token::ASSIGN_SHL, Token::SHL);
573 } else if (c0_ == '!') {
574 token = ScanHtmlComment();
575 } else {
576 token = Token::LT;
577 }
578 break;
579
580 case '>':
581 // > >= >> >>= >>> >>>=
582 Advance();
583 if (c0_ == '=') {
584 token = Select(Token::GTE);
585 } else if (c0_ == '>') {
586 // >> >>= >>> >>>=
587 Advance();
588 if (c0_ == '=') {
589 token = Select(Token::ASSIGN_SAR);
590 } else if (c0_ == '>') {
591 token = Select('=', Token::ASSIGN_SHR, Token::SHR);
592 } else {
593 token = Token::SAR;
594 }
595 } else {
596 token = Token::GT;
597 }
598 break;
599
600 case '=':
601 // = == === =>
602 Advance();
603 if (c0_ == '=') {
604 token = Select('=', Token::EQ_STRICT, Token::EQ);
605 } else if (c0_ == '>') {
606 token = Select(Token::ARROW);
607 } else {
608 token = Token::ASSIGN;
609 }
610 break;
611
612 case '!':
613 // ! != !==
614 Advance();
615 if (c0_ == '=') {
616 token = Select('=', Token::NE_STRICT, Token::NE);
617 } else {
618 token = Token::NOT;
619 }
620 break;
621
622 case '+':
623 // + ++ +=
624 Advance();
625 if (c0_ == '+') {
626 token = Select(Token::INC);
627 } else if (c0_ == '=') {
628 token = Select(Token::ASSIGN_ADD);
629 } else {
630 token = Token::ADD;
631 }
632 break;
633
634 case '-':
635 // - -- --> -=
636 Advance();
637 if (c0_ == '-') {
638 Advance();
639 if (c0_ == '>' && HasLineTerminatorBeforeNext()) {
640 // For compatibility with SpiderMonkey, we skip lines that
641 // start with an HTML comment end '-->'.
642 token = SkipSingleHTMLComment();
643 } else {
644 token = Token::DEC;
645 }
646 } else if (c0_ == '=') {
647 token = Select(Token::ASSIGN_SUB);
648 } else {
649 token = Token::SUB;
650 }
651 break;
652
653 case '*':
654 // * *=
655 Advance();
656 if (c0_ == '*') {
657 token = Select('=', Token::ASSIGN_EXP, Token::EXP);
658 } else if (c0_ == '=') {
659 token = Select(Token::ASSIGN_MUL);
660 } else {
661 token = Token::MUL;
662 }
663 break;
664
665 case '%':
666 // % %=
667 token = Select('=', Token::ASSIGN_MOD, Token::MOD);
668 break;
669
670 case '/':
671 // / // /* /=
672 Advance();
673 if (c0_ == '/') {
674 uc32 c = Peek();
675 if (c == '#' || c == '@') {
676 Advance();
677 Advance();
678 token = SkipSourceURLComment();
679 } else {
680 token = SkipSingleLineComment();
681 }
682 } else if (c0_ == '*') {
683 token = SkipMultiLineComment();
684 } else if (c0_ == '=') {
685 token = Select(Token::ASSIGN_DIV);
686 } else {
687 token = Token::DIV;
688 }
689 break;
690
691 case '&':
692 // & && &=
693 Advance();
694 if (c0_ == '&') {
695 token = Select(Token::AND);
696 } else if (c0_ == '=') {
697 token = Select(Token::ASSIGN_BIT_AND);
698 } else {
699 token = Token::BIT_AND;
700 }
701 break;
702
703 case '|':
704 // | || |=
705 Advance();
706 if (c0_ == '|') {
707 token = Select(Token::OR);
708 } else if (c0_ == '=') {
709 token = Select(Token::ASSIGN_BIT_OR);
710 } else {
711 token = Token::BIT_OR;
712 }
713 break;
714
715 case '^':
716 // ^ ^=
717 token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
718 break;
719
720 case '.':
721 // . Number
722 Advance();
723 if (IsDecimalDigit(c0_)) {
724 token = ScanNumber(true);
725 } else {
726 token = Token::PERIOD;
727 if (c0_ == '.') {
728 if (Peek() == '.') {
729 Advance();
730 Advance();
731 token = Token::ELLIPSIS;
732 }
733 }
734 }
735 break;
736
737 case '`':
738 token = ScanTemplateStart();
739 break;
740
741 case '#':
742 token = ScanPrivateName();
743 break;
744
745 default:
746 if (unicode_cache_->IsIdentifierStart(c0_) ||
747 (CombineSurrogatePair() &&
748 unicode_cache_->IsIdentifierStart(c0_))) {
749 token = ScanIdentifierOrKeyword();
750 } else if (IsDecimalDigit(c0_)) {
751 token = ScanNumber(false);
752 } else if (c0_ == kEndOfInput) {
753 token = Token::EOS;
754 } else {
755 token = SkipWhiteSpace();
756 if (token == Token::ILLEGAL) Advance();
757 }
758 break;
759 }
760
761 // Continue scanning for tokens as long as we're just skipping
762 // whitespace.
763 } while (token == Token::WHITESPACE);
764
765 next().location.end_pos = source_pos();
766 if (Token::IsContextualKeyword(token)) {
767 next().token = Token::IDENTIFIER;
768 next().contextual_token = token;
769 } else {
770 next().token = token;
771 next().contextual_token = Token::UNINITIALIZED;
772 }
773
774 #ifdef DEBUG
775 SanityCheckTokenDesc(current());
776 SanityCheckTokenDesc(next());
777 SanityCheckTokenDesc(next_next());
778 #endif
779 }
780
#ifdef DEBUG
// Debug-only consistency check of a token descriptor.
//
// Most tokens should not have literal_chars or even raw_literal chars.
// The rules are:
// - UNINITIALIZED: we don't care.
// - TEMPLATE_*: need both literal + raw literal chars.
// - IDENTIFIERS, STRINGS, etc.: need a literal, but no raw literal.
// - all others: should have neither.
// Furthermore, only TEMPLATE_* tokens can have a
// invalid_template_escape_message.
void Scanner::SanityCheckTokenDesc(const TokenDesc& token) const {
  switch (token.token) {
    case Token::UNINITIALIZED:
      // token.literal_chars & other members might be garbage. That's ok.
      break;

    case Token::TEMPLATE_SPAN:
    case Token::TEMPLATE_TAIL:
      DCHECK(token.raw_literal_chars.is_used());
      DCHECK(token.literal_chars.is_used());
      break;

    case Token::ESCAPED_KEYWORD:
    case Token::ESCAPED_STRICT_RESERVED_WORD:
    case Token::FUTURE_STRICT_RESERVED_WORD:
    case Token::IDENTIFIER:
    case Token::NUMBER:
    case Token::BIGINT:
    case Token::REGEXP_LITERAL:
    case Token::SMI:
    case Token::STRING:
    case Token::PRIVATE_NAME:
      DCHECK(token.literal_chars.is_used());
      DCHECK(!token.raw_literal_chars.is_used());
      DCHECK_EQ(token.invalid_template_escape_message, MessageTemplate::kNone);
      break;

    default:
      DCHECK(!token.literal_chars.is_used());
      DCHECK(!token.raw_literal_chars.is_used());
      DCHECK_EQ(token.invalid_template_escape_message, MessageTemplate::kNone);
      break;
  }

  // A contextual keyword is only ever attached to an IDENTIFIER token and is
  // never stored in the primary token field.
  DCHECK_IMPLIES(token.token != Token::IDENTIFIER,
                 token.contextual_token == Token::UNINITIALIZED);
  DCHECK_IMPLIES(token.contextual_token != Token::UNINITIALIZED,
                 token.token == Token::IDENTIFIER &&
                     Token::IsContextualKeyword(token.contextual_token));
  DCHECK(!Token::IsContextualKeyword(token.token));
}
#endif  // DEBUG
830
SeekForward(int pos)831 void Scanner::SeekForward(int pos) {
832 // After this call, we will have the token at the given position as
833 // the "next" token. The "current" token will be invalid.
834 if (pos == next().location.beg_pos) return;
835 int current_pos = source_pos();
836 DCHECK_EQ(next().location.end_pos, current_pos);
837 // Positions inside the lookahead token aren't supported.
838 DCHECK(pos >= current_pos);
839 if (pos != current_pos) {
840 source_->Seek(pos);
841 Advance();
842 // This function is only called to seek to the location
843 // of the end of a function (at the "}" token). It doesn't matter
844 // whether there was a line terminator in the part we skip.
845 next().after_line_terminator = false;
846 }
847 Scan();
848 }
849
850 template <bool capture_raw>
ScanEscape()851 bool Scanner::ScanEscape() {
852 uc32 c = c0_;
853 Advance<capture_raw>();
854
855 // Skip escaped newlines.
856 DCHECK(!unibrow::IsLineTerminator(kEndOfInput));
857 if (!capture_raw && unibrow::IsLineTerminator(c)) {
858 // Allow escaped CR+LF newlines in multiline string literals.
859 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
860 return true;
861 }
862
863 switch (c) {
864 case '\'': // fall through
865 case '"' : // fall through
866 case '\\': break;
867 case 'b' : c = '\b'; break;
868 case 'f' : c = '\f'; break;
869 case 'n' : c = '\n'; break;
870 case 'r' : c = '\r'; break;
871 case 't' : c = '\t'; break;
872 case 'u' : {
873 c = ScanUnicodeEscape<capture_raw>();
874 if (c < 0) return false;
875 break;
876 }
877 case 'v':
878 c = '\v';
879 break;
880 case 'x': {
881 c = ScanHexNumber<capture_raw>(2);
882 if (c < 0) return false;
883 break;
884 }
885 case '0': // Fall through.
886 case '1': // fall through
887 case '2': // fall through
888 case '3': // fall through
889 case '4': // fall through
890 case '5': // fall through
891 case '6': // fall through
892 case '7':
893 c = ScanOctalEscape<capture_raw>(c, 2);
894 break;
895 }
896
897 // Other escaped characters are interpreted as their non-escaped version.
898 AddLiteralChar(c);
899 return true;
900 }
901
902 template <bool capture_raw>
ScanOctalEscape(uc32 c,int length)903 uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
904 uc32 x = c - '0';
905 int i = 0;
906 for (; i < length; i++) {
907 int d = c0_ - '0';
908 if (d < 0 || d > 7) break;
909 int nx = x * 8 + d;
910 if (nx >= 256) break;
911 x = nx;
912 Advance<capture_raw>();
913 }
914 // Anything except '\0' is an octal escape sequence, illegal in strict mode.
915 // Remember the position of octal escape sequences so that an error
916 // can be reported later (in strict mode).
917 // We don't report the error immediately, because the octal escape can
918 // occur before the "use strict" directive.
919 if (c != '0' || i > 0 || c0_ == '8' || c0_ == '9') {
920 octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
921 octal_message_ = capture_raw ? MessageTemplate::kTemplateOctalLiteral
922 : MessageTemplate::kStrictOctalEscape;
923 }
924 return x;
925 }
926
ScanString()927 Token::Value Scanner::ScanString() {
928 uc32 quote = c0_;
929 Advance(); // consume quote
930
931 LiteralScope literal(this);
932 while (true) {
933 if (c0_ == quote) {
934 literal.Complete();
935 Advance();
936 return Token::STRING;
937 }
938 if (c0_ == kEndOfInput || unibrow::IsStringLiteralLineTerminator(c0_)) {
939 return Token::ILLEGAL;
940 }
941 if (c0_ == '\\') {
942 Advance();
943 // TODO(verwaest): Check whether we can remove the additional check.
944 if (c0_ == kEndOfInput || !ScanEscape<false>()) {
945 return Token::ILLEGAL;
946 }
947 continue;
948 }
949 AddLiteralCharAdvance();
950 }
951 }
952
ScanPrivateName()953 Token::Value Scanner::ScanPrivateName() {
954 if (!allow_harmony_private_fields()) {
955 ReportScannerError(source_pos(),
956 MessageTemplate::kInvalidOrUnexpectedToken);
957 return Token::ILLEGAL;
958 }
959
960 LiteralScope literal(this);
961 DCHECK_EQ(c0_, '#');
962 DCHECK(!unicode_cache_->IsIdentifierStart(kEndOfInput));
963 if (!unicode_cache_->IsIdentifierStart(Peek())) {
964 ReportScannerError(source_pos(),
965 MessageTemplate::kInvalidOrUnexpectedToken);
966 return Token::ILLEGAL;
967 }
968
969 AddLiteralCharAdvance();
970 Token::Value token = ScanIdentifierOrKeywordInner(&literal);
971 return token == Token::ILLEGAL ? Token::ILLEGAL : Token::PRIVATE_NAME;
972 }
973
ScanTemplateSpan()974 Token::Value Scanner::ScanTemplateSpan() {
975 // When scanning a TemplateSpan, we are looking for the following construct:
976 // TEMPLATE_SPAN ::
977 // ` LiteralChars* ${
978 // | } LiteralChars* ${
979 //
980 // TEMPLATE_TAIL ::
981 // ` LiteralChars* `
982 // | } LiteralChar* `
983 //
984 // A TEMPLATE_SPAN should always be followed by an Expression, while a
985 // TEMPLATE_TAIL terminates a TemplateLiteral and does not need to be
986 // followed by an Expression.
987
988 // These scoped helpers save and restore the original error state, so that we
989 // can specially treat invalid escape sequences in templates (which are
990 // handled by the parser).
991 ErrorState scanner_error_state(&scanner_error_, &scanner_error_location_);
992 ErrorState octal_error_state(&octal_message_, &octal_pos_);
993
994 Token::Value result = Token::TEMPLATE_SPAN;
995 LiteralScope literal(this);
996 StartRawLiteral();
997 const bool capture_raw = true;
998 while (true) {
999 uc32 c = c0_;
1000 if (c == '`') {
1001 Advance(); // Consume '`'
1002 result = Token::TEMPLATE_TAIL;
1003 break;
1004 } else if (c == '$' && Peek() == '{') {
1005 Advance(); // Consume '$'
1006 Advance(); // Consume '{'
1007 break;
1008 } else if (c == '\\') {
1009 Advance(); // Consume '\\'
1010 DCHECK(!unibrow::IsLineTerminator(kEndOfInput));
1011 if (capture_raw) AddRawLiteralChar('\\');
1012 if (unibrow::IsLineTerminator(c0_)) {
1013 // The TV of LineContinuation :: \ LineTerminatorSequence is the empty
1014 // code unit sequence.
1015 uc32 lastChar = c0_;
1016 Advance();
1017 if (lastChar == '\r') {
1018 // Also skip \n.
1019 if (c0_ == '\n') Advance();
1020 lastChar = '\n';
1021 }
1022 if (capture_raw) AddRawLiteralChar(lastChar);
1023 } else {
1024 bool success = ScanEscape<capture_raw>();
1025 USE(success);
1026 DCHECK_EQ(!success, has_error());
1027 // For templates, invalid escape sequence checking is handled in the
1028 // parser.
1029 scanner_error_state.MoveErrorTo(next_);
1030 octal_error_state.MoveErrorTo(next_);
1031 }
1032 } else if (c < 0) {
1033 // Unterminated template literal
1034 break;
1035 } else {
1036 Advance(); // Consume c.
1037 // The TRV of LineTerminatorSequence :: <CR> is the CV 0x000A.
1038 // The TRV of LineTerminatorSequence :: <CR><LF> is the sequence
1039 // consisting of the CV 0x000A.
1040 if (c == '\r') {
1041 if (c0_ == '\n') Advance(); // Consume '\n'
1042 c = '\n';
1043 }
1044 if (capture_raw) AddRawLiteralChar(c);
1045 AddLiteralChar(c);
1046 }
1047 }
1048 literal.Complete();
1049 next().location.end_pos = source_pos();
1050 next().token = result;
1051 next().contextual_token = Token::UNINITIALIZED;
1052
1053 return result;
1054 }
1055
ScanTemplateStart()1056 Token::Value Scanner::ScanTemplateStart() {
1057 DCHECK_EQ(next_next().token, Token::UNINITIALIZED);
1058 DCHECK_EQ(c0_, '`');
1059 next().location.beg_pos = source_pos();
1060 Advance(); // Consume `
1061 return ScanTemplateSpan();
1062 }
1063
SourceUrl(Isolate * isolate) const1064 Handle<String> Scanner::SourceUrl(Isolate* isolate) const {
1065 Handle<String> tmp;
1066 if (source_url_.length() > 0) {
1067 DCHECK(source_url_.is_used());
1068 tmp = source_url_.Internalize(isolate);
1069 }
1070 return tmp;
1071 }
1072
SourceMappingUrl(Isolate * isolate) const1073 Handle<String> Scanner::SourceMappingUrl(Isolate* isolate) const {
1074 Handle<String> tmp;
1075 if (source_mapping_url_.length() > 0) {
1076 DCHECK(source_mapping_url_.is_used());
1077 tmp = source_mapping_url_.Internalize(isolate);
1078 }
1079 return tmp;
1080 }
1081
ScanDigitsWithNumericSeparators(bool (* predicate)(uc32 ch),bool is_check_first_digit)1082 bool Scanner::ScanDigitsWithNumericSeparators(bool (*predicate)(uc32 ch),
1083 bool is_check_first_digit) {
1084 // we must have at least one digit after 'x'/'b'/'o'
1085 if (is_check_first_digit && !predicate(c0_)) return false;
1086
1087 bool separator_seen = false;
1088 while (predicate(c0_) || c0_ == '_') {
1089 if (c0_ == '_') {
1090 Advance();
1091 if (c0_ == '_') {
1092 ReportScannerError(Location(source_pos(), source_pos() + 1),
1093 MessageTemplate::kContinuousNumericSeparator);
1094 return false;
1095 }
1096 separator_seen = true;
1097 continue;
1098 }
1099 separator_seen = false;
1100 AddLiteralCharAdvance();
1101 }
1102
1103 if (separator_seen) {
1104 ReportScannerError(Location(source_pos(), source_pos() + 1),
1105 MessageTemplate::kTrailingNumericSeparator);
1106 return false;
1107 }
1108
1109 return true;
1110 }
1111
ScanDecimalDigits()1112 bool Scanner::ScanDecimalDigits() {
1113 if (allow_harmony_numeric_separator()) {
1114 return ScanDigitsWithNumericSeparators(&IsDecimalDigit, false);
1115 }
1116 while (IsDecimalDigit(c0_)) {
1117 AddLiteralCharAdvance();
1118 }
1119 return true;
1120 }
1121
ScanDecimalAsSmiWithNumericSeparators(uint64_t * value)1122 bool Scanner::ScanDecimalAsSmiWithNumericSeparators(uint64_t* value) {
1123 bool separator_seen = false;
1124 while (IsDecimalDigit(c0_) || c0_ == '_') {
1125 if (c0_ == '_') {
1126 Advance();
1127 if (c0_ == '_') {
1128 ReportScannerError(Location(source_pos(), source_pos() + 1),
1129 MessageTemplate::kContinuousNumericSeparator);
1130 return false;
1131 }
1132 separator_seen = true;
1133 continue;
1134 }
1135 separator_seen = false;
1136 *value = 10 * *value + (c0_ - '0');
1137 uc32 first_char = c0_;
1138 Advance();
1139 AddLiteralChar(first_char);
1140 }
1141
1142 if (separator_seen) {
1143 ReportScannerError(Location(source_pos(), source_pos() + 1),
1144 MessageTemplate::kTrailingNumericSeparator);
1145 return false;
1146 }
1147
1148 return true;
1149 }
1150
ScanDecimalAsSmi(uint64_t * value)1151 bool Scanner::ScanDecimalAsSmi(uint64_t* value) {
1152 if (allow_harmony_numeric_separator()) {
1153 return ScanDecimalAsSmiWithNumericSeparators(value);
1154 }
1155
1156 while (IsDecimalDigit(c0_)) {
1157 *value = 10 * *value + (c0_ - '0');
1158 uc32 first_char = c0_;
1159 Advance();
1160 AddLiteralChar(first_char);
1161 }
1162 return true;
1163 }
1164
ScanBinaryDigits()1165 bool Scanner::ScanBinaryDigits() {
1166 if (allow_harmony_numeric_separator()) {
1167 return ScanDigitsWithNumericSeparators(&IsBinaryDigit, true);
1168 }
1169
1170 // we must have at least one binary digit after 'b'/'B'
1171 if (!IsBinaryDigit(c0_)) {
1172 return false;
1173 }
1174
1175 while (IsBinaryDigit(c0_)) {
1176 AddLiteralCharAdvance();
1177 }
1178 return true;
1179 }
1180
ScanOctalDigits()1181 bool Scanner::ScanOctalDigits() {
1182 if (allow_harmony_numeric_separator()) {
1183 return ScanDigitsWithNumericSeparators(&IsOctalDigit, true);
1184 }
1185
1186 // we must have at least one octal digit after 'o'/'O'
1187 if (!IsOctalDigit(c0_)) {
1188 return false;
1189 }
1190
1191 while (IsOctalDigit(c0_)) {
1192 AddLiteralCharAdvance();
1193 }
1194 return true;
1195 }
1196
ScanImplicitOctalDigits(int start_pos,Scanner::NumberKind * kind)1197 bool Scanner::ScanImplicitOctalDigits(int start_pos,
1198 Scanner::NumberKind* kind) {
1199 *kind = IMPLICIT_OCTAL;
1200
1201 while (true) {
1202 // (possible) octal number
1203 if (c0_ == '8' || c0_ == '9') {
1204 *kind = DECIMAL_WITH_LEADING_ZERO;
1205 return true;
1206 }
1207 if (c0_ < '0' || '7' < c0_) {
1208 // Octal literal finished.
1209 octal_pos_ = Location(start_pos, source_pos());
1210 octal_message_ = MessageTemplate::kStrictOctalLiteral;
1211 return true;
1212 }
1213 AddLiteralCharAdvance();
1214 }
1215 }
1216
ScanHexDigits()1217 bool Scanner::ScanHexDigits() {
1218 if (allow_harmony_numeric_separator()) {
1219 return ScanDigitsWithNumericSeparators(&IsHexDigit, true);
1220 }
1221
1222 // we must have at least one hex digit after 'x'/'X'
1223 if (!IsHexDigit(c0_)) {
1224 return false;
1225 }
1226
1227 while (IsHexDigit(c0_)) {
1228 AddLiteralCharAdvance();
1229 }
1230 return true;
1231 }
1232
ScanSignedInteger()1233 bool Scanner::ScanSignedInteger() {
1234 if (c0_ == '+' || c0_ == '-') AddLiteralCharAdvance();
1235 // we must have at least one decimal digit after 'e'/'E'
1236 if (!IsDecimalDigit(c0_)) return false;
1237 return ScanDecimalDigits();
1238 }
1239
ScanNumber(bool seen_period)1240 Token::Value Scanner::ScanNumber(bool seen_period) {
1241 DCHECK(IsDecimalDigit(c0_)); // the first digit of the number or the fraction
1242
1243 NumberKind kind = DECIMAL;
1244
1245 LiteralScope literal(this);
1246 bool at_start = !seen_period;
1247 int start_pos = source_pos(); // For reporting octal positions.
1248 if (seen_period) {
1249 // we have already seen a decimal point of the float
1250 AddLiteralChar('.');
1251 if (allow_harmony_numeric_separator() && c0_ == '_') {
1252 return Token::ILLEGAL;
1253 }
1254 // we know we have at least one digit
1255 if (!ScanDecimalDigits()) return Token::ILLEGAL;
1256 } else {
1257 // if the first character is '0' we must check for octals and hex
1258 if (c0_ == '0') {
1259 AddLiteralCharAdvance();
1260
1261 // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or
1262 // an octal number.
1263 if (c0_ == 'x' || c0_ == 'X') {
1264 AddLiteralCharAdvance();
1265 kind = HEX;
1266 if (!ScanHexDigits()) return Token::ILLEGAL;
1267 } else if (c0_ == 'o' || c0_ == 'O') {
1268 AddLiteralCharAdvance();
1269 kind = OCTAL;
1270 if (!ScanOctalDigits()) return Token::ILLEGAL;
1271 } else if (c0_ == 'b' || c0_ == 'B') {
1272 AddLiteralCharAdvance();
1273 kind = BINARY;
1274 if (!ScanBinaryDigits()) return Token::ILLEGAL;
1275 } else if ('0' <= c0_ && c0_ <= '7') {
1276 kind = IMPLICIT_OCTAL;
1277 if (!ScanImplicitOctalDigits(start_pos, &kind)) {
1278 return Token::ILLEGAL;
1279 }
1280 if (kind == DECIMAL_WITH_LEADING_ZERO) {
1281 at_start = false;
1282 }
1283 } else if (c0_ == '8' || c0_ == '9') {
1284 kind = DECIMAL_WITH_LEADING_ZERO;
1285 } else if (allow_harmony_numeric_separator() && c0_ == '_') {
1286 ReportScannerError(Location(source_pos(), source_pos() + 1),
1287 MessageTemplate::kZeroDigitNumericSeparator);
1288 return Token::ILLEGAL;
1289 }
1290 }
1291
1292 // Parse decimal digits and allow trailing fractional part.
1293 if (kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO) {
1294 // This is an optimization for parsing Decimal numbers as Smi's.
1295 if (at_start) {
1296 uint64_t value = 0;
1297 // scan subsequent decimal digits
1298 if (!ScanDecimalAsSmi(&value)) {
1299 return Token::ILLEGAL;
1300 }
1301
1302 if (next().literal_chars.one_byte_literal().length() <= 10 &&
1303 value <= Smi::kMaxValue && c0_ != '.' &&
1304 !unicode_cache_->IsIdentifierStart(c0_)) {
1305 next().smi_value_ = static_cast<uint32_t>(value);
1306 literal.Complete();
1307
1308 if (kind == DECIMAL_WITH_LEADING_ZERO) {
1309 octal_pos_ = Location(start_pos, source_pos());
1310 octal_message_ = MessageTemplate::kStrictDecimalWithLeadingZero;
1311 }
1312 return Token::SMI;
1313 }
1314 }
1315
1316 if (!ScanDecimalDigits()) return Token::ILLEGAL;
1317 if (c0_ == '.') {
1318 seen_period = true;
1319 AddLiteralCharAdvance();
1320 if (allow_harmony_numeric_separator() && c0_ == '_') {
1321 return Token::ILLEGAL;
1322 }
1323 if (!ScanDecimalDigits()) return Token::ILLEGAL;
1324 }
1325 }
1326 }
1327
1328 bool is_bigint = false;
1329 if (allow_harmony_bigint() && c0_ == 'n' && !seen_period &&
1330 (kind == DECIMAL || kind == HEX || kind == OCTAL || kind == BINARY)) {
1331 // Check that the literal is within our limits for BigInt length.
1332 // For simplicity, use 4 bits per character to calculate the maximum
1333 // allowed literal length.
1334 static const int kMaxBigIntCharacters = BigInt::kMaxLengthBits / 4;
1335 int length = source_pos() - start_pos - (kind != DECIMAL ? 2 : 0);
1336 if (length > kMaxBigIntCharacters) {
1337 ReportScannerError(Location(start_pos, source_pos()),
1338 MessageTemplate::kBigIntTooBig);
1339 return Token::ILLEGAL;
1340 }
1341
1342 is_bigint = true;
1343 Advance();
1344 } else if (c0_ == 'e' || c0_ == 'E') {
1345 // scan exponent, if any
1346 DCHECK(kind != HEX); // 'e'/'E' must be scanned as part of the hex number
1347
1348 if (!(kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO))
1349 return Token::ILLEGAL;
1350
1351 // scan exponent
1352 AddLiteralCharAdvance();
1353
1354 if (!ScanSignedInteger()) return Token::ILLEGAL;
1355 }
1356
1357 // The source character immediately following a numeric literal must
1358 // not be an identifier start or a decimal digit; see ECMA-262
1359 // section 7.8.3, page 17 (note that we read only one decimal digit
1360 // if the value is 0).
1361 if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_)) {
1362 return Token::ILLEGAL;
1363 }
1364
1365 literal.Complete();
1366
1367 if (kind == DECIMAL_WITH_LEADING_ZERO) {
1368 octal_pos_ = Location(start_pos, source_pos());
1369 octal_message_ = MessageTemplate::kStrictDecimalWithLeadingZero;
1370 }
1371
1372 return is_bigint ? Token::BIGINT : Token::NUMBER;
1373 }
1374
ScanIdentifierUnicodeEscape()1375 uc32 Scanner::ScanIdentifierUnicodeEscape() {
1376 Advance();
1377 if (c0_ != 'u') return -1;
1378 Advance();
1379 return ScanUnicodeEscape<false>();
1380 }
1381
1382 template <bool capture_raw>
ScanUnicodeEscape()1383 uc32 Scanner::ScanUnicodeEscape() {
1384 // Accept both \uxxxx and \u{xxxxxx}. In the latter case, the number of
1385 // hex digits between { } is arbitrary. \ and u have already been read.
1386 if (c0_ == '{') {
1387 int begin = source_pos() - 2;
1388 Advance<capture_raw>();
1389 uc32 cp = ScanUnlimitedLengthHexNumber<capture_raw>(0x10FFFF, begin);
1390 if (cp < 0 || c0_ != '}') {
1391 ReportScannerError(source_pos(),
1392 MessageTemplate::kInvalidUnicodeEscapeSequence);
1393 return -1;
1394 }
1395 Advance<capture_raw>();
1396 return cp;
1397 }
1398 const bool unicode = true;
1399 return ScanHexNumber<capture_raw, unicode>(4);
1400 }
1401
1402
1403 // ----------------------------------------------------------------------------
1404 // Keyword Matcher
1405
// Table of the keyword strings recognized by KeywordOrIdentifierToken and the
// tokens they map to. Entries are grouped by their first character:
// KEYWORD_GROUP(ch) opens the group for |ch| and each KEYWORD(string, token)
// that follows belongs to it, so every string in a group starts with that
// character. The two macro parameters are supplied by the user of the table.
#define KEYWORDS(KEYWORD_GROUP, KEYWORD)                    \
  KEYWORD_GROUP('a')                                        \
  KEYWORD("arguments", Token::ARGUMENTS)                    \
  KEYWORD("as", Token::AS)                                  \
  KEYWORD("async", Token::ASYNC)                            \
  KEYWORD("await", Token::AWAIT)                            \
  KEYWORD("anonymous", Token::ANONYMOUS)                    \
  KEYWORD_GROUP('b')                                        \
  KEYWORD("break", Token::BREAK)                            \
  KEYWORD_GROUP('c')                                        \
  KEYWORD("case", Token::CASE)                              \
  KEYWORD("catch", Token::CATCH)                            \
  KEYWORD("class", Token::CLASS)                            \
  KEYWORD("const", Token::CONST)                            \
  KEYWORD("constructor", Token::CONSTRUCTOR)                \
  KEYWORD("continue", Token::CONTINUE)                      \
  KEYWORD_GROUP('d')                                        \
  KEYWORD("debugger", Token::DEBUGGER)                      \
  KEYWORD("default", Token::DEFAULT)                        \
  KEYWORD("delete", Token::DELETE)                          \
  KEYWORD("do", Token::DO)                                  \
  KEYWORD_GROUP('e')                                        \
  KEYWORD("else", Token::ELSE)                              \
  KEYWORD("enum", Token::ENUM)                              \
  KEYWORD("eval", Token::EVAL)                              \
  KEYWORD("export", Token::EXPORT)                          \
  KEYWORD("extends", Token::EXTENDS)                        \
  KEYWORD_GROUP('f')                                        \
  KEYWORD("false", Token::FALSE_LITERAL)                    \
  KEYWORD("finally", Token::FINALLY)                        \
  KEYWORD("for", Token::FOR)                                \
  KEYWORD("from", Token::FROM)                              \
  KEYWORD("function", Token::FUNCTION)                      \
  KEYWORD_GROUP('g')                                        \
  KEYWORD("get", Token::GET)                                \
  KEYWORD_GROUP('i')                                        \
  KEYWORD("if", Token::IF)                                  \
  KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("import", Token::IMPORT)                          \
  KEYWORD("in", Token::IN)                                  \
  KEYWORD("instanceof", Token::INSTANCEOF)                  \
  KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD)  \
  KEYWORD_GROUP('l')                                        \
  KEYWORD("let", Token::LET)                                \
  KEYWORD_GROUP('m')                                        \
  KEYWORD("meta", Token::META)                              \
  KEYWORD_GROUP('n')                                        \
  KEYWORD("name", Token::NAME)                              \
  KEYWORD("new", Token::NEW)                                \
  KEYWORD("null", Token::NULL_LITERAL)                      \
  KEYWORD_GROUP('o')                                        \
  KEYWORD("of", Token::OF)                                  \
  KEYWORD_GROUP('p')                                        \
  KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD)    \
  KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD)    \
  KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD)  \
  KEYWORD("prototype", Token::PROTOTYPE)                    \
  KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD)     \
  KEYWORD_GROUP('r')                                        \
  KEYWORD("return", Token::RETURN)                          \
  KEYWORD_GROUP('s')                                        \
  KEYWORD("set", Token::SET)                                \
  KEYWORD("static", Token::STATIC)                          \
  KEYWORD("super", Token::SUPER)                            \
  KEYWORD("switch", Token::SWITCH)                          \
  KEYWORD_GROUP('t')                                        \
  KEYWORD("target", Token::TARGET)                          \
  KEYWORD("this", Token::THIS)                              \
  KEYWORD("throw", Token::THROW)                            \
  KEYWORD("true", Token::TRUE_LITERAL)                      \
  KEYWORD("try", Token::TRY)                                \
  KEYWORD("typeof", Token::TYPEOF)                          \
  KEYWORD_GROUP('u')                                        \
  KEYWORD("undefined", Token::UNDEFINED)                    \
  KEYWORD_GROUP('v')                                        \
  KEYWORD("var", Token::VAR)                                \
  KEYWORD("void", Token::VOID)                              \
  KEYWORD_GROUP('w')                                        \
  KEYWORD("while", Token::WHILE)                            \
  KEYWORD("with", Token::WITH)                              \
  KEYWORD_GROUP('y')                                        \
  KEYWORD("yield", Token::YIELD)                            \
  KEYWORD_GROUP('_')                                        \
  KEYWORD("__proto__", Token::PROTO_UNDERSCORED)            \
  KEYWORD_GROUP('#')                                        \
  KEYWORD("#constructor", Token::PRIVATE_CONSTRUCTOR)
1492
KeywordOrIdentifierToken(const uint8_t * input,int input_length)1493 static Token::Value KeywordOrIdentifierToken(const uint8_t* input,
1494 int input_length) {
1495 DCHECK_GE(input_length, 1);
1496 const int kMinLength = 2;
1497 const int kMaxLength = 12;
1498 if (input_length < kMinLength || input_length > kMaxLength) {
1499 return Token::IDENTIFIER;
1500 }
1501 switch (input[0]) {
1502 default:
1503 #define KEYWORD_GROUP_CASE(ch) \
1504 break; \
1505 case ch:
1506 #define KEYWORD(keyword, token) \
1507 { \
1508 /* 'keyword' is a char array, so sizeof(keyword) is */ \
1509 /* strlen(keyword) plus 1 for the NUL char. */ \
1510 const int keyword_length = sizeof(keyword) - 1; \
1511 STATIC_ASSERT(keyword_length >= kMinLength); \
1512 STATIC_ASSERT(keyword_length <= kMaxLength); \
1513 DCHECK_EQ(input[0], keyword[0]); \
1514 DCHECK(token == Token::FUTURE_STRICT_RESERVED_WORD || \
1515 0 == strncmp(keyword, Token::String(token), sizeof(keyword))); \
1516 if (input_length == keyword_length && input[1] == keyword[1] && \
1517 (keyword_length <= 2 || input[2] == keyword[2]) && \
1518 (keyword_length <= 3 || input[3] == keyword[3]) && \
1519 (keyword_length <= 4 || input[4] == keyword[4]) && \
1520 (keyword_length <= 5 || input[5] == keyword[5]) && \
1521 (keyword_length <= 6 || input[6] == keyword[6]) && \
1522 (keyword_length <= 7 || input[7] == keyword[7]) && \
1523 (keyword_length <= 8 || input[8] == keyword[8]) && \
1524 (keyword_length <= 9 || input[9] == keyword[9]) && \
1525 (keyword_length <= 10 || input[10] == keyword[10])) { \
1526 return token; \
1527 } \
1528 }
1529 KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
1530 }
1531 return Token::IDENTIFIER;
1532 #undef KEYWORDS
1533 #undef KEYWORD
1534 #undef KEYWORD_GROUP_CASE
1535 }
1536
ScanIdentifierOrKeyword()1537 Token::Value Scanner::ScanIdentifierOrKeyword() {
1538 LiteralScope literal(this);
1539 return ScanIdentifierOrKeywordInner(&literal);
1540 }
1541
ScanIdentifierOrKeywordInner(LiteralScope * literal)1542 Token::Value Scanner::ScanIdentifierOrKeywordInner(LiteralScope* literal) {
1543 DCHECK(unicode_cache_->IsIdentifierStart(c0_));
1544 bool escaped = false;
1545 if (IsInRange(c0_, 'a', 'z') || c0_ == '_') {
1546 do {
1547 AddLiteralChar(static_cast<char>(c0_));
1548 Advance();
1549 } while (IsInRange(c0_, 'a', 'z') || c0_ == '_');
1550
1551 if (IsDecimalDigit(c0_) || IsInRange(c0_, 'A', 'Z') || c0_ == '$') {
1552 // Identifier starting with lowercase or _.
1553 do {
1554 AddLiteralChar(static_cast<char>(c0_));
1555 Advance();
1556 } while (IsAsciiIdentifier(c0_));
1557
1558 if (c0_ <= kMaxAscii && c0_ != '\\') {
1559 literal->Complete();
1560 return Token::IDENTIFIER;
1561 }
1562 } else if (c0_ <= kMaxAscii && c0_ != '\\') {
1563 // Only a-z+ or _: could be a keyword or identifier.
1564 Vector<const uint8_t> chars = next().literal_chars.one_byte_literal();
1565 Token::Value token =
1566 KeywordOrIdentifierToken(chars.start(), chars.length());
1567 if (token == Token::IDENTIFIER ||
1568 token == Token::FUTURE_STRICT_RESERVED_WORD ||
1569 Token::IsContextualKeyword(token))
1570 literal->Complete();
1571 return token;
1572 }
1573 } else if (IsInRange(c0_, 'A', 'Z') || c0_ == '$') {
1574 do {
1575 AddLiteralChar(static_cast<char>(c0_));
1576 Advance();
1577 } while (IsAsciiIdentifier(c0_));
1578
1579 if (c0_ <= kMaxAscii && c0_ != '\\') {
1580 literal->Complete();
1581 return Token::IDENTIFIER;
1582 }
1583 } else if (c0_ == '\\') {
1584 escaped = true;
1585 uc32 c = ScanIdentifierUnicodeEscape();
1586 DCHECK(!unicode_cache_->IsIdentifierStart(-1));
1587 if (c == '\\' || !unicode_cache_->IsIdentifierStart(c)) {
1588 return Token::ILLEGAL;
1589 }
1590 AddLiteralChar(c);
1591 }
1592
1593 while (true) {
1594 if (c0_ == '\\') {
1595 escaped = true;
1596 uc32 c = ScanIdentifierUnicodeEscape();
1597 // Only allow legal identifier part characters.
1598 // TODO(verwaest): Make this true.
1599 // DCHECK(!unicode_cache_->IsIdentifierPart('\\'));
1600 DCHECK(!unicode_cache_->IsIdentifierPart(-1));
1601 if (c == '\\' || !unicode_cache_->IsIdentifierPart(c)) {
1602 return Token::ILLEGAL;
1603 }
1604 AddLiteralChar(c);
1605 } else if (unicode_cache_->IsIdentifierPart(c0_) ||
1606 (CombineSurrogatePair() &&
1607 unicode_cache_->IsIdentifierPart(c0_))) {
1608 AddLiteralCharAdvance();
1609 } else {
1610 break;
1611 }
1612 }
1613
1614 if (next().literal_chars.is_one_byte()) {
1615 Vector<const uint8_t> chars = next().literal_chars.one_byte_literal();
1616 Token::Value token =
1617 KeywordOrIdentifierToken(chars.start(), chars.length());
1618 /* TODO(adamk): YIELD should be handled specially. */
1619 if (token == Token::FUTURE_STRICT_RESERVED_WORD) {
1620 literal->Complete();
1621 if (escaped) return Token::ESCAPED_STRICT_RESERVED_WORD;
1622 return token;
1623 }
1624 if (token == Token::IDENTIFIER || Token::IsContextualKeyword(token)) {
1625 literal->Complete();
1626 return token;
1627 }
1628
1629 if (!escaped) return token;
1630
1631 literal->Complete();
1632 if (token == Token::LET || token == Token::STATIC) {
1633 return Token::ESCAPED_STRICT_RESERVED_WORD;
1634 }
1635 return Token::ESCAPED_KEYWORD;
1636 }
1637
1638 literal->Complete();
1639 return Token::IDENTIFIER;
1640 }
1641
ScanRegExpPattern()1642 bool Scanner::ScanRegExpPattern() {
1643 DCHECK_EQ(Token::UNINITIALIZED, next_next().token);
1644 DCHECK(next().token == Token::DIV || next().token == Token::ASSIGN_DIV);
1645
1646 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
1647 bool in_character_class = false;
1648 bool seen_equal = (next().token == Token::ASSIGN_DIV);
1649
1650 // Previous token is either '/' or '/=', in the second case, the
1651 // pattern starts at =.
1652 next().location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
1653 next().location.end_pos = source_pos() - (seen_equal ? 1 : 0);
1654
1655 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
1656 // the scanner should pass uninterpreted bodies to the RegExp
1657 // constructor.
1658 LiteralScope literal(this);
1659 if (seen_equal) {
1660 AddLiteralChar('=');
1661 }
1662
1663 while (c0_ != '/' || in_character_class) {
1664 if (c0_ == kEndOfInput || unibrow::IsLineTerminator(c0_)) {
1665 return false;
1666 }
1667 if (c0_ == '\\') { // Escape sequence.
1668 AddLiteralCharAdvance();
1669 if (c0_ == kEndOfInput || unibrow::IsLineTerminator(c0_)) {
1670 return false;
1671 }
1672 AddLiteralCharAdvance();
1673 // If the escape allows more characters, i.e., \x??, \u????, or \c?,
1674 // only "safe" characters are allowed (letters, digits, underscore),
1675 // otherwise the escape isn't valid and the invalid character has
1676 // its normal meaning. I.e., we can just continue scanning without
1677 // worrying whether the following characters are part of the escape
1678 // or not, since any '/', '\\' or '[' is guaranteed to not be part
1679 // of the escape sequence.
1680
1681 // TODO(896): At some point, parse RegExps more thoroughly to capture
1682 // octal esacpes in strict mode.
1683 } else { // Unescaped character.
1684 if (c0_ == '[') in_character_class = true;
1685 if (c0_ == ']') in_character_class = false;
1686 AddLiteralCharAdvance();
1687 }
1688 }
1689 Advance(); // consume '/'
1690
1691 literal.Complete();
1692 next().token = Token::REGEXP_LITERAL;
1693 next().contextual_token = Token::UNINITIALIZED;
1694 return true;
1695 }
1696
1697
ScanRegExpFlags()1698 Maybe<RegExp::Flags> Scanner::ScanRegExpFlags() {
1699 DCHECK_EQ(Token::REGEXP_LITERAL, next().token);
1700
1701 // Scan regular expression flags.
1702 int flags = 0;
1703 while (unicode_cache_->IsIdentifierPart(c0_)) {
1704 RegExp::Flags flag = RegExp::kNone;
1705 switch (c0_) {
1706 case 'g':
1707 flag = RegExp::kGlobal;
1708 break;
1709 case 'i':
1710 flag = RegExp::kIgnoreCase;
1711 break;
1712 case 'm':
1713 flag = RegExp::kMultiline;
1714 break;
1715 case 's':
1716 flag = RegExp::kDotAll;
1717 break;
1718 case 'u':
1719 flag = RegExp::kUnicode;
1720 break;
1721 case 'y':
1722 flag = RegExp::kSticky;
1723 break;
1724 default:
1725 return Nothing<RegExp::Flags>();
1726 }
1727 if (flags & flag) {
1728 return Nothing<RegExp::Flags>();
1729 }
1730 Advance();
1731 flags |= flag;
1732 }
1733
1734 next().location.end_pos = source_pos();
1735 return Just(RegExp::Flags(flags));
1736 }
1737
CurrentSymbol(AstValueFactory * ast_value_factory) const1738 const AstRawString* Scanner::CurrentSymbol(
1739 AstValueFactory* ast_value_factory) const {
1740 if (is_literal_one_byte()) {
1741 return ast_value_factory->GetOneByteString(literal_one_byte_string());
1742 }
1743 return ast_value_factory->GetTwoByteString(literal_two_byte_string());
1744 }
1745
NextSymbol(AstValueFactory * ast_value_factory) const1746 const AstRawString* Scanner::NextSymbol(
1747 AstValueFactory* ast_value_factory) const {
1748 if (is_next_literal_one_byte()) {
1749 return ast_value_factory->GetOneByteString(next_literal_one_byte_string());
1750 }
1751 return ast_value_factory->GetTwoByteString(next_literal_two_byte_string());
1752 }
1753
CurrentRawSymbol(AstValueFactory * ast_value_factory) const1754 const AstRawString* Scanner::CurrentRawSymbol(
1755 AstValueFactory* ast_value_factory) const {
1756 if (is_raw_literal_one_byte()) {
1757 return ast_value_factory->GetOneByteString(raw_literal_one_byte_string());
1758 }
1759 return ast_value_factory->GetTwoByteString(raw_literal_two_byte_string());
1760 }
1761
1762
DoubleValue()1763 double Scanner::DoubleValue() {
1764 DCHECK(is_literal_one_byte());
1765 return StringToDouble(
1766 unicode_cache_,
1767 literal_one_byte_string(),
1768 ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY);
1769 }
1770
CurrentLiteralAsCString(Zone * zone) const1771 const char* Scanner::CurrentLiteralAsCString(Zone* zone) const {
1772 DCHECK(is_literal_one_byte());
1773 Vector<const uint8_t> vector = literal_one_byte_string();
1774 int length = vector.length();
1775 char* buffer = zone->NewArray<char>(length + 1);
1776 memcpy(buffer, vector.start(), length);
1777 buffer[length] = '\0';
1778 return buffer;
1779 }
1780
IsDuplicateSymbol(DuplicateFinder * duplicate_finder,AstValueFactory * ast_value_factory) const1781 bool Scanner::IsDuplicateSymbol(DuplicateFinder* duplicate_finder,
1782 AstValueFactory* ast_value_factory) const {
1783 DCHECK_NOT_NULL(duplicate_finder);
1784 DCHECK_NOT_NULL(ast_value_factory);
1785 const AstRawString* string = CurrentSymbol(ast_value_factory);
1786 return !duplicate_finder->known_symbols_.insert(string).second;
1787 }
1788
SeekNext(size_t position)1789 void Scanner::SeekNext(size_t position) {
1790 // Use with care: This cleanly resets most, but not all scanner state.
1791 // TODO(vogelheim): Fix this, or at least DCHECK the relevant conditions.
1792
1793 // To re-scan from a given character position, we need to:
1794 // 1, Reset the current_, next_ and next_next_ tokens
1795 // (next_ + next_next_ will be overwrittem by Next(),
1796 // current_ will remain unchanged, so overwrite it fully.)
1797 for (TokenDesc& token : token_storage_) {
1798 token.token = Token::UNINITIALIZED;
1799 token.contextual_token = Token::UNINITIALIZED;
1800 }
1801 // 2, reset the source to the desired position,
1802 source_->Seek(position);
1803 // 3, re-scan, by scanning the look-ahead char + 1 token (next_).
1804 c0_ = source_->Advance();
1805 next().after_line_terminator = false;
1806 Scan();
1807 DCHECK_EQ(next().location.beg_pos, static_cast<int>(position));
1808 }
1809
1810 } // namespace internal
1811 } // namespace v8
1812