1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Features shared by parsing and pre-parsing scanners.
6
7 #include "src/parsing/scanner.h"
8
9 #include <stdint.h>
10
11 #include <cmath>
12
13 #include "src/ast/ast-value-factory.h"
14 #include "src/char-predicates-inl.h"
15 #include "src/conversions-inl.h"
16 #include "src/list-inl.h"
17 #include "src/parsing/duplicate-finder.h" // For Scanner::FindSymbol
18
19 namespace v8 {
20 namespace internal {
21
22 // Scoped helper for saving & restoring scanner error state.
23 // This is used for tagged template literals, in which normally forbidden
24 // escape sequences are allowed.
25 class ErrorState {
26 public:
ErrorState(MessageTemplate::Template * message_stack,Scanner::Location * location_stack)27 ErrorState(MessageTemplate::Template* message_stack,
28 Scanner::Location* location_stack)
29 : message_stack_(message_stack),
30 old_message_(*message_stack),
31 location_stack_(location_stack),
32 old_location_(*location_stack) {
33 *message_stack_ = MessageTemplate::kNone;
34 *location_stack_ = Scanner::Location::invalid();
35 }
36
~ErrorState()37 ~ErrorState() {
38 *message_stack_ = old_message_;
39 *location_stack_ = old_location_;
40 }
41
MoveErrorTo(MessageTemplate::Template * message_dest,Scanner::Location * location_dest)42 void MoveErrorTo(MessageTemplate::Template* message_dest,
43 Scanner::Location* location_dest) {
44 if (*message_stack_ == MessageTemplate::kNone) {
45 return;
46 }
47 if (*message_dest == MessageTemplate::kNone) {
48 *message_dest = *message_stack_;
49 *location_dest = *location_stack_;
50 }
51 *message_stack_ = MessageTemplate::kNone;
52 *location_stack_ = Scanner::Location::invalid();
53 }
54
55 private:
56 MessageTemplate::Template* const message_stack_;
57 MessageTemplate::Template const old_message_;
58 Scanner::Location* const location_stack_;
59 Scanner::Location const old_location_;
60 };
61
Internalize(Isolate * isolate) const62 Handle<String> Scanner::LiteralBuffer::Internalize(Isolate* isolate) const {
63 if (is_one_byte()) {
64 return isolate->factory()->InternalizeOneByteString(one_byte_literal());
65 }
66 return isolate->factory()->InternalizeTwoByteString(two_byte_literal());
67 }
68
NewCapacity(int min_capacity)69 int Scanner::LiteralBuffer::NewCapacity(int min_capacity) {
70 int capacity = Max(min_capacity, backing_store_.length());
71 int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth);
72 return new_capacity;
73 }
74
ExpandBuffer()75 void Scanner::LiteralBuffer::ExpandBuffer() {
76 Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));
77 MemCopy(new_store.start(), backing_store_.start(), position_);
78 backing_store_.Dispose();
79 backing_store_ = new_store;
80 }
81
ConvertToTwoByte()82 void Scanner::LiteralBuffer::ConvertToTwoByte() {
83 DCHECK(is_one_byte_);
84 Vector<byte> new_store;
85 int new_content_size = position_ * kUC16Size;
86 if (new_content_size >= backing_store_.length()) {
87 // Ensure room for all currently read code units as UC16 as well
88 // as the code unit about to be stored.
89 new_store = Vector<byte>::New(NewCapacity(new_content_size));
90 } else {
91 new_store = backing_store_;
92 }
93 uint8_t* src = backing_store_.start();
94 uint16_t* dst = reinterpret_cast<uint16_t*>(new_store.start());
95 for (int i = position_ - 1; i >= 0; i--) {
96 dst[i] = src[i];
97 }
98 if (new_store.start() != backing_store_.start()) {
99 backing_store_.Dispose();
100 backing_store_ = new_store;
101 }
102 position_ = new_content_size;
103 is_one_byte_ = false;
104 }
105
AddCharSlow(uc32 code_unit)106 void Scanner::LiteralBuffer::AddCharSlow(uc32 code_unit) {
107 if (position_ >= backing_store_.length()) ExpandBuffer();
108 if (is_one_byte_) {
109 if (code_unit <= static_cast<uc32>(unibrow::Latin1::kMaxChar)) {
110 backing_store_[position_] = static_cast<byte>(code_unit);
111 position_ += kOneByteSize;
112 return;
113 }
114 ConvertToTwoByte();
115 }
116 if (code_unit <=
117 static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
118 *reinterpret_cast<uint16_t*>(&backing_store_[position_]) = code_unit;
119 position_ += kUC16Size;
120 } else {
121 *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
122 unibrow::Utf16::LeadSurrogate(code_unit);
123 position_ += kUC16Size;
124 if (position_ >= backing_store_.length()) ExpandBuffer();
125 *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
126 unibrow::Utf16::TrailSurrogate(code_unit);
127 position_ += kUC16Size;
128 }
129 }
130
131 // ----------------------------------------------------------------------------
132 // Scanner::BookmarkScope
133
134 const size_t Scanner::BookmarkScope::kBookmarkAtFirstPos =
135 std::numeric_limits<size_t>::max() - 2;
136 const size_t Scanner::BookmarkScope::kNoBookmark =
137 std::numeric_limits<size_t>::max() - 1;
138 const size_t Scanner::BookmarkScope::kBookmarkWasApplied =
139 std::numeric_limits<size_t>::max();
140
Set()141 void Scanner::BookmarkScope::Set() {
142 DCHECK_EQ(bookmark_, kNoBookmark);
143 DCHECK_EQ(scanner_->next_next_.token, Token::UNINITIALIZED);
144
145 // The first token is a bit special, since current_ will still be
146 // uninitialized. In this case, store kBookmarkAtFirstPos and special-case it
147 // when
148 // applying the bookmark.
149 DCHECK_IMPLIES(
150 scanner_->current_.token == Token::UNINITIALIZED,
151 scanner_->current_.location.beg_pos == scanner_->next_.location.beg_pos);
152 bookmark_ = (scanner_->current_.token == Token::UNINITIALIZED)
153 ? kBookmarkAtFirstPos
154 : scanner_->location().beg_pos;
155 }
156
Apply()157 void Scanner::BookmarkScope::Apply() {
158 DCHECK(HasBeenSet()); // Caller hasn't called SetBookmark.
159 if (bookmark_ == kBookmarkAtFirstPos) {
160 scanner_->SeekNext(0);
161 } else {
162 scanner_->SeekNext(bookmark_);
163 scanner_->Next();
164 DCHECK_EQ(scanner_->location().beg_pos, static_cast<int>(bookmark_));
165 }
166 bookmark_ = kBookmarkWasApplied;
167 }
168
HasBeenSet()169 bool Scanner::BookmarkScope::HasBeenSet() {
170 return bookmark_ != kNoBookmark && bookmark_ != kBookmarkWasApplied;
171 }
172
HasBeenApplied()173 bool Scanner::BookmarkScope::HasBeenApplied() {
174 return bookmark_ == kBookmarkWasApplied;
175 }
176
177 // ----------------------------------------------------------------------------
178 // Scanner
179
Scanner(UnicodeCache * unicode_cache)180 Scanner::Scanner(UnicodeCache* unicode_cache)
181 : unicode_cache_(unicode_cache),
182 octal_pos_(Location::invalid()),
183 octal_message_(MessageTemplate::kNone),
184 found_html_comment_(false) {}
185
Initialize(Utf16CharacterStream * source)186 void Scanner::Initialize(Utf16CharacterStream* source) {
187 source_ = source;
188 // Need to capture identifiers in order to recognize "get" and "set"
189 // in object literals.
190 Init();
191 // Skip initial whitespace allowing HTML comment ends just like
192 // after a newline and scan first token.
193 has_line_terminator_before_next_ = true;
194 SkipWhiteSpace();
195 Scan();
196 }
197
198 template <bool capture_raw, bool unicode>
ScanHexNumber(int expected_length)199 uc32 Scanner::ScanHexNumber(int expected_length) {
200 DCHECK(expected_length <= 4); // prevent overflow
201
202 int begin = source_pos() - 2;
203 uc32 x = 0;
204 for (int i = 0; i < expected_length; i++) {
205 int d = HexValue(c0_);
206 if (d < 0) {
207 ReportScannerError(Location(begin, begin + expected_length + 2),
208 unicode
209 ? MessageTemplate::kInvalidUnicodeEscapeSequence
210 : MessageTemplate::kInvalidHexEscapeSequence);
211 return -1;
212 }
213 x = x * 16 + d;
214 Advance<capture_raw>();
215 }
216
217 return x;
218 }
219
220 template <bool capture_raw>
ScanUnlimitedLengthHexNumber(int max_value,int beg_pos)221 uc32 Scanner::ScanUnlimitedLengthHexNumber(int max_value, int beg_pos) {
222 uc32 x = 0;
223 int d = HexValue(c0_);
224 if (d < 0) return -1;
225
226 while (d >= 0) {
227 x = x * 16 + d;
228 if (x > max_value) {
229 ReportScannerError(Location(beg_pos, source_pos() + 1),
230 MessageTemplate::kUndefinedUnicodeCodePoint);
231 return -1;
232 }
233 Advance<capture_raw>();
234 d = HexValue(c0_);
235 }
236
237 return x;
238 }
239
240
241 // Ensure that tokens can be stored in a byte.
242 STATIC_ASSERT(Token::NUM_TOKENS <= 0x100);
243
244 // Table of one-character tokens, by character (0x00..0x7f only).
245 static const byte one_char_tokens[] = {
246 Token::ILLEGAL,
247 Token::ILLEGAL,
248 Token::ILLEGAL,
249 Token::ILLEGAL,
250 Token::ILLEGAL,
251 Token::ILLEGAL,
252 Token::ILLEGAL,
253 Token::ILLEGAL,
254 Token::ILLEGAL,
255 Token::ILLEGAL,
256 Token::ILLEGAL,
257 Token::ILLEGAL,
258 Token::ILLEGAL,
259 Token::ILLEGAL,
260 Token::ILLEGAL,
261 Token::ILLEGAL,
262 Token::ILLEGAL,
263 Token::ILLEGAL,
264 Token::ILLEGAL,
265 Token::ILLEGAL,
266 Token::ILLEGAL,
267 Token::ILLEGAL,
268 Token::ILLEGAL,
269 Token::ILLEGAL,
270 Token::ILLEGAL,
271 Token::ILLEGAL,
272 Token::ILLEGAL,
273 Token::ILLEGAL,
274 Token::ILLEGAL,
275 Token::ILLEGAL,
276 Token::ILLEGAL,
277 Token::ILLEGAL,
278 Token::ILLEGAL,
279 Token::ILLEGAL,
280 Token::ILLEGAL,
281 Token::ILLEGAL,
282 Token::ILLEGAL,
283 Token::ILLEGAL,
284 Token::ILLEGAL,
285 Token::ILLEGAL,
286 Token::LPAREN, // 0x28
287 Token::RPAREN, // 0x29
288 Token::ILLEGAL,
289 Token::ILLEGAL,
290 Token::COMMA, // 0x2c
291 Token::ILLEGAL,
292 Token::ILLEGAL,
293 Token::ILLEGAL,
294 Token::ILLEGAL,
295 Token::ILLEGAL,
296 Token::ILLEGAL,
297 Token::ILLEGAL,
298 Token::ILLEGAL,
299 Token::ILLEGAL,
300 Token::ILLEGAL,
301 Token::ILLEGAL,
302 Token::ILLEGAL,
303 Token::ILLEGAL,
304 Token::COLON, // 0x3a
305 Token::SEMICOLON, // 0x3b
306 Token::ILLEGAL,
307 Token::ILLEGAL,
308 Token::ILLEGAL,
309 Token::CONDITIONAL, // 0x3f
310 Token::ILLEGAL,
311 Token::ILLEGAL,
312 Token::ILLEGAL,
313 Token::ILLEGAL,
314 Token::ILLEGAL,
315 Token::ILLEGAL,
316 Token::ILLEGAL,
317 Token::ILLEGAL,
318 Token::ILLEGAL,
319 Token::ILLEGAL,
320 Token::ILLEGAL,
321 Token::ILLEGAL,
322 Token::ILLEGAL,
323 Token::ILLEGAL,
324 Token::ILLEGAL,
325 Token::ILLEGAL,
326 Token::ILLEGAL,
327 Token::ILLEGAL,
328 Token::ILLEGAL,
329 Token::ILLEGAL,
330 Token::ILLEGAL,
331 Token::ILLEGAL,
332 Token::ILLEGAL,
333 Token::ILLEGAL,
334 Token::ILLEGAL,
335 Token::ILLEGAL,
336 Token::ILLEGAL,
337 Token::LBRACK, // 0x5b
338 Token::ILLEGAL,
339 Token::RBRACK, // 0x5d
340 Token::ILLEGAL,
341 Token::ILLEGAL,
342 Token::ILLEGAL,
343 Token::ILLEGAL,
344 Token::ILLEGAL,
345 Token::ILLEGAL,
346 Token::ILLEGAL,
347 Token::ILLEGAL,
348 Token::ILLEGAL,
349 Token::ILLEGAL,
350 Token::ILLEGAL,
351 Token::ILLEGAL,
352 Token::ILLEGAL,
353 Token::ILLEGAL,
354 Token::ILLEGAL,
355 Token::ILLEGAL,
356 Token::ILLEGAL,
357 Token::ILLEGAL,
358 Token::ILLEGAL,
359 Token::ILLEGAL,
360 Token::ILLEGAL,
361 Token::ILLEGAL,
362 Token::ILLEGAL,
363 Token::ILLEGAL,
364 Token::ILLEGAL,
365 Token::ILLEGAL,
366 Token::ILLEGAL,
367 Token::ILLEGAL,
368 Token::ILLEGAL,
369 Token::LBRACE, // 0x7b
370 Token::ILLEGAL,
371 Token::RBRACE, // 0x7d
372 Token::BIT_NOT, // 0x7e
373 Token::ILLEGAL
374 };
375
376
Next()377 Token::Value Scanner::Next() {
378 if (next_.token == Token::EOS) {
379 next_.location.beg_pos = current_.location.beg_pos;
380 next_.location.end_pos = current_.location.end_pos;
381 }
382 current_ = next_;
383 if (V8_UNLIKELY(next_next_.token != Token::UNINITIALIZED)) {
384 next_ = next_next_;
385 next_next_.token = Token::UNINITIALIZED;
386 has_line_terminator_before_next_ = has_line_terminator_after_next_;
387 return current_.token;
388 }
389 has_line_terminator_before_next_ = false;
390 has_multiline_comment_before_next_ = false;
391 if (static_cast<unsigned>(c0_) <= 0x7f) {
392 Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]);
393 if (token != Token::ILLEGAL) {
394 int pos = source_pos();
395 next_.token = token;
396 next_.location.beg_pos = pos;
397 next_.location.end_pos = pos + 1;
398 next_.literal_chars = nullptr;
399 next_.raw_literal_chars = nullptr;
400 Advance();
401 return current_.token;
402 }
403 }
404 Scan();
405 return current_.token;
406 }
407
408
PeekAhead()409 Token::Value Scanner::PeekAhead() {
410 DCHECK(next_.token != Token::DIV);
411 DCHECK(next_.token != Token::ASSIGN_DIV);
412
413 if (next_next_.token != Token::UNINITIALIZED) {
414 return next_next_.token;
415 }
416 TokenDesc prev = current_;
417 bool has_line_terminator_before_next =
418 has_line_terminator_before_next_ || has_multiline_comment_before_next_;
419 Next();
420 has_line_terminator_after_next_ =
421 has_line_terminator_before_next_ || has_multiline_comment_before_next_;
422 has_line_terminator_before_next_ = has_line_terminator_before_next;
423 Token::Value ret = next_.token;
424 next_next_ = next_;
425 next_ = current_;
426 current_ = prev;
427 return ret;
428 }
429
430
431 // TODO(yangguo): check whether this is actually necessary.
IsLittleEndianByteOrderMark(uc32 c)432 static inline bool IsLittleEndianByteOrderMark(uc32 c) {
433 // The Unicode value U+FFFE is guaranteed never to be assigned as a
434 // Unicode character; this implies that in a Unicode context the
435 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
436 // character expressed in little-endian byte order (since it could
437 // not be a U+FFFE character expressed in big-endian byte
438 // order). Nevertheless, we check for it to be compatible with
439 // Spidermonkey.
440 return c == 0xFFFE;
441 }
442
SkipWhiteSpace()443 bool Scanner::SkipWhiteSpace() {
444 int start_position = source_pos();
445
446 while (true) {
447 while (true) {
448 // Don't skip behind the end of input.
449 if (c0_ == kEndOfInput) break;
450
451 // Advance as long as character is a WhiteSpace or LineTerminator.
452 // Remember if the latter is the case.
453 if (unicode_cache_->IsLineTerminator(c0_)) {
454 has_line_terminator_before_next_ = true;
455 } else if (!unicode_cache_->IsWhiteSpace(c0_) &&
456 !IsLittleEndianByteOrderMark(c0_)) {
457 break;
458 }
459 Advance();
460 }
461
462 // If there is an HTML comment end '-->' at the beginning of a
463 // line (with only whitespace in front of it), we treat the rest
464 // of the line as a comment. This is in line with the way
465 // SpiderMonkey handles it.
466 if (c0_ != '-' || !has_line_terminator_before_next_) break;
467
468 Advance();
469 if (c0_ != '-') {
470 PushBack('-'); // undo Advance()
471 break;
472 }
473
474 Advance();
475 if (c0_ != '>') {
476 PushBack2('-', '-'); // undo 2x Advance();
477 break;
478 }
479
480 // Treat the rest of the line as a comment.
481 SkipSingleLineComment();
482 }
483
484 // Return whether or not we skipped any characters.
485 return source_pos() != start_position;
486 }
487
SkipSingleLineComment()488 Token::Value Scanner::SkipSingleLineComment() {
489 Advance();
490
491 // The line terminator at the end of the line is not considered
492 // to be part of the single-line comment; it is recognized
493 // separately by the lexical grammar and becomes part of the
494 // stream of input elements for the syntactic grammar (see
495 // ECMA-262, section 7.4).
496 while (c0_ != kEndOfInput && !unicode_cache_->IsLineTerminator(c0_)) {
497 Advance();
498 }
499
500 return Token::WHITESPACE;
501 }
502
503
SkipSourceURLComment()504 Token::Value Scanner::SkipSourceURLComment() {
505 TryToParseSourceURLComment();
506 while (c0_ != kEndOfInput && !unicode_cache_->IsLineTerminator(c0_)) {
507 Advance();
508 }
509
510 return Token::WHITESPACE;
511 }
512
513
TryToParseSourceURLComment()514 void Scanner::TryToParseSourceURLComment() {
515 // Magic comments are of the form: //[#@]\s<name>=\s*<value>\s*.* and this
516 // function will just return if it cannot parse a magic comment.
517 if (c0_ == kEndOfInput || !unicode_cache_->IsWhiteSpace(c0_)) return;
518 Advance();
519 LiteralBuffer name;
520 while (c0_ != kEndOfInput &&
521 !unicode_cache_->IsWhiteSpaceOrLineTerminator(c0_) && c0_ != '=') {
522 name.AddChar(c0_);
523 Advance();
524 }
525 if (!name.is_one_byte()) return;
526 Vector<const uint8_t> name_literal = name.one_byte_literal();
527 LiteralBuffer* value;
528 if (name_literal == STATIC_CHAR_VECTOR("sourceURL")) {
529 value = &source_url_;
530 } else if (name_literal == STATIC_CHAR_VECTOR("sourceMappingURL")) {
531 value = &source_mapping_url_;
532 } else {
533 return;
534 }
535 if (c0_ != '=')
536 return;
537 Advance();
538 value->Reset();
539 while (c0_ != kEndOfInput && unicode_cache_->IsWhiteSpace(c0_)) {
540 Advance();
541 }
542 while (c0_ != kEndOfInput && !unicode_cache_->IsLineTerminator(c0_)) {
543 // Disallowed characters.
544 if (c0_ == '"' || c0_ == '\'') {
545 value->Reset();
546 return;
547 }
548 if (unicode_cache_->IsWhiteSpace(c0_)) {
549 break;
550 }
551 value->AddChar(c0_);
552 Advance();
553 }
554 // Allow whitespace at the end.
555 while (c0_ != kEndOfInput && !unicode_cache_->IsLineTerminator(c0_)) {
556 if (!unicode_cache_->IsWhiteSpace(c0_)) {
557 value->Reset();
558 break;
559 }
560 Advance();
561 }
562 }
563
564
SkipMultiLineComment()565 Token::Value Scanner::SkipMultiLineComment() {
566 DCHECK(c0_ == '*');
567 Advance();
568
569 while (c0_ != kEndOfInput) {
570 uc32 ch = c0_;
571 Advance();
572 if (c0_ != kEndOfInput && unicode_cache_->IsLineTerminator(ch)) {
573 // Following ECMA-262, section 7.4, a comment containing
574 // a newline will make the comment count as a line-terminator.
575 has_multiline_comment_before_next_ = true;
576 }
577 // If we have reached the end of the multi-line comment, we
578 // consume the '/' and insert a whitespace. This way all
579 // multi-line comments are treated as whitespace.
580 if (ch == '*' && c0_ == '/') {
581 c0_ = ' ';
582 return Token::WHITESPACE;
583 }
584 }
585
586 // Unterminated multi-line comment.
587 return Token::ILLEGAL;
588 }
589
ScanHtmlComment()590 Token::Value Scanner::ScanHtmlComment() {
591 // Check for <!-- comments.
592 DCHECK(c0_ == '!');
593 Advance();
594 if (c0_ != '-') {
595 PushBack('!'); // undo Advance()
596 return Token::LT;
597 }
598
599 Advance();
600 if (c0_ != '-') {
601 PushBack2('-', '!'); // undo 2x Advance()
602 return Token::LT;
603 }
604
605 found_html_comment_ = true;
606 return SkipSingleLineComment();
607 }
608
Scan()609 void Scanner::Scan() {
610 next_.literal_chars = NULL;
611 next_.raw_literal_chars = NULL;
612 Token::Value token;
613 do {
614 // Remember the position of the next token
615 next_.location.beg_pos = source_pos();
616
617 switch (c0_) {
618 case ' ':
619 case '\t':
620 Advance();
621 token = Token::WHITESPACE;
622 break;
623
624 case '\n':
625 Advance();
626 has_line_terminator_before_next_ = true;
627 token = Token::WHITESPACE;
628 break;
629
630 case '"': case '\'':
631 token = ScanString();
632 break;
633
634 case '<':
635 // < <= << <<= <!--
636 Advance();
637 if (c0_ == '=') {
638 token = Select(Token::LTE);
639 } else if (c0_ == '<') {
640 token = Select('=', Token::ASSIGN_SHL, Token::SHL);
641 } else if (c0_ == '!') {
642 token = ScanHtmlComment();
643 } else {
644 token = Token::LT;
645 }
646 break;
647
648 case '>':
649 // > >= >> >>= >>> >>>=
650 Advance();
651 if (c0_ == '=') {
652 token = Select(Token::GTE);
653 } else if (c0_ == '>') {
654 // >> >>= >>> >>>=
655 Advance();
656 if (c0_ == '=') {
657 token = Select(Token::ASSIGN_SAR);
658 } else if (c0_ == '>') {
659 token = Select('=', Token::ASSIGN_SHR, Token::SHR);
660 } else {
661 token = Token::SAR;
662 }
663 } else {
664 token = Token::GT;
665 }
666 break;
667
668 case '=':
669 // = == === =>
670 Advance();
671 if (c0_ == '=') {
672 token = Select('=', Token::EQ_STRICT, Token::EQ);
673 } else if (c0_ == '>') {
674 token = Select(Token::ARROW);
675 } else {
676 token = Token::ASSIGN;
677 }
678 break;
679
680 case '!':
681 // ! != !==
682 Advance();
683 if (c0_ == '=') {
684 token = Select('=', Token::NE_STRICT, Token::NE);
685 } else {
686 token = Token::NOT;
687 }
688 break;
689
690 case '+':
691 // + ++ +=
692 Advance();
693 if (c0_ == '+') {
694 token = Select(Token::INC);
695 } else if (c0_ == '=') {
696 token = Select(Token::ASSIGN_ADD);
697 } else {
698 token = Token::ADD;
699 }
700 break;
701
702 case '-':
703 // - -- --> -=
704 Advance();
705 if (c0_ == '-') {
706 Advance();
707 if (c0_ == '>' && HasAnyLineTerminatorBeforeNext()) {
708 // For compatibility with SpiderMonkey, we skip lines that
709 // start with an HTML comment end '-->'.
710 token = SkipSingleLineComment();
711 } else {
712 token = Token::DEC;
713 }
714 } else if (c0_ == '=') {
715 token = Select(Token::ASSIGN_SUB);
716 } else {
717 token = Token::SUB;
718 }
719 break;
720
721 case '*':
722 // * *=
723 Advance();
724 if (c0_ == '*') {
725 token = Select('=', Token::ASSIGN_EXP, Token::EXP);
726 } else if (c0_ == '=') {
727 token = Select(Token::ASSIGN_MUL);
728 } else {
729 token = Token::MUL;
730 }
731 break;
732
733 case '%':
734 // % %=
735 token = Select('=', Token::ASSIGN_MOD, Token::MOD);
736 break;
737
738 case '/':
739 // / // /* /=
740 Advance();
741 if (c0_ == '/') {
742 Advance();
743 if (c0_ == '#' || c0_ == '@') {
744 Advance();
745 token = SkipSourceURLComment();
746 } else {
747 PushBack(c0_);
748 token = SkipSingleLineComment();
749 }
750 } else if (c0_ == '*') {
751 token = SkipMultiLineComment();
752 } else if (c0_ == '=') {
753 token = Select(Token::ASSIGN_DIV);
754 } else {
755 token = Token::DIV;
756 }
757 break;
758
759 case '&':
760 // & && &=
761 Advance();
762 if (c0_ == '&') {
763 token = Select(Token::AND);
764 } else if (c0_ == '=') {
765 token = Select(Token::ASSIGN_BIT_AND);
766 } else {
767 token = Token::BIT_AND;
768 }
769 break;
770
771 case '|':
772 // | || |=
773 Advance();
774 if (c0_ == '|') {
775 token = Select(Token::OR);
776 } else if (c0_ == '=') {
777 token = Select(Token::ASSIGN_BIT_OR);
778 } else {
779 token = Token::BIT_OR;
780 }
781 break;
782
783 case '^':
784 // ^ ^=
785 token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
786 break;
787
788 case '.':
789 // . Number
790 Advance();
791 if (IsDecimalDigit(c0_)) {
792 token = ScanNumber(true);
793 } else {
794 token = Token::PERIOD;
795 if (c0_ == '.') {
796 Advance();
797 if (c0_ == '.') {
798 Advance();
799 token = Token::ELLIPSIS;
800 } else {
801 PushBack('.');
802 }
803 }
804 }
805 break;
806
807 case ':':
808 token = Select(Token::COLON);
809 break;
810
811 case ';':
812 token = Select(Token::SEMICOLON);
813 break;
814
815 case ',':
816 token = Select(Token::COMMA);
817 break;
818
819 case '(':
820 token = Select(Token::LPAREN);
821 break;
822
823 case ')':
824 token = Select(Token::RPAREN);
825 break;
826
827 case '[':
828 token = Select(Token::LBRACK);
829 break;
830
831 case ']':
832 token = Select(Token::RBRACK);
833 break;
834
835 case '{':
836 token = Select(Token::LBRACE);
837 break;
838
839 case '}':
840 token = Select(Token::RBRACE);
841 break;
842
843 case '?':
844 token = Select(Token::CONDITIONAL);
845 break;
846
847 case '~':
848 token = Select(Token::BIT_NOT);
849 break;
850
851 case '`':
852 token = ScanTemplateStart();
853 break;
854
855 default:
856 if (c0_ == kEndOfInput) {
857 token = Token::EOS;
858 } else if (unicode_cache_->IsIdentifierStart(c0_)) {
859 token = ScanIdentifierOrKeyword();
860 } else if (IsDecimalDigit(c0_)) {
861 token = ScanNumber(false);
862 } else if (SkipWhiteSpace()) {
863 token = Token::WHITESPACE;
864 } else {
865 token = Select(Token::ILLEGAL);
866 }
867 break;
868 }
869
870 // Continue scanning for tokens as long as we're just skipping
871 // whitespace.
872 } while (token == Token::WHITESPACE);
873
874 next_.location.end_pos = source_pos();
875 next_.token = token;
876
877 #ifdef DEBUG
878 SanityCheckTokenDesc(current_);
879 SanityCheckTokenDesc(next_);
880 SanityCheckTokenDesc(next_next_);
881 #endif
882 }
883
#ifdef DEBUG
// Debug-only consistency check of which literal buffers are attached to
// |token|:
// - UNINITIALIZED: we don't care.
// - TEMPLATE_*: need both literal + raw literal chars.
// - IDENTIFIERS, STRINGS, etc.: need a literal, but no raw literal.
// - all others: should have neither.
void Scanner::SanityCheckTokenDesc(const TokenDesc& token) const {
  switch (token.token) {
    case Token::UNINITIALIZED:
      // token.literal_chars & other members might be garbage. That's ok.
      return;

    case Token::TEMPLATE_TAIL:
    case Token::TEMPLATE_SPAN:
      DCHECK_NOT_NULL(token.raw_literal_chars);
      DCHECK_NOT_NULL(token.literal_chars);
      return;

    case Token::IDENTIFIER:
    case Token::STRING:
    case Token::NUMBER:
    case Token::SMI:
    case Token::REGEXP_LITERAL:
    case Token::FUTURE_STRICT_RESERVED_WORD:
    case Token::ESCAPED_KEYWORD:
    case Token::ESCAPED_STRICT_RESERVED_WORD:
      DCHECK_NOT_NULL(token.literal_chars);
      DCHECK_NULL(token.raw_literal_chars);
      return;

    default:
      DCHECK_NULL(token.literal_chars);
      DCHECK_NULL(token.raw_literal_chars);
      return;
  }
}
#endif  // DEBUG
920
SeekForward(int pos)921 void Scanner::SeekForward(int pos) {
922 // After this call, we will have the token at the given position as
923 // the "next" token. The "current" token will be invalid.
924 if (pos == next_.location.beg_pos) return;
925 int current_pos = source_pos();
926 DCHECK_EQ(next_.location.end_pos, current_pos);
927 // Positions inside the lookahead token aren't supported.
928 DCHECK(pos >= current_pos);
929 if (pos != current_pos) {
930 source_->Seek(pos);
931 Advance();
932 // This function is only called to seek to the location
933 // of the end of a function (at the "}" token). It doesn't matter
934 // whether there was a line terminator in the part we skip.
935 has_line_terminator_before_next_ = false;
936 has_multiline_comment_before_next_ = false;
937 }
938 Scan();
939 }
940
941
942 template <bool capture_raw, bool in_template_literal>
ScanEscape()943 bool Scanner::ScanEscape() {
944 uc32 c = c0_;
945 Advance<capture_raw>();
946
947 // Skip escaped newlines.
948 if (!in_template_literal && c0_ != kEndOfInput &&
949 unicode_cache_->IsLineTerminator(c)) {
950 // Allow CR+LF newlines in multiline string literals.
951 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance<capture_raw>();
952 // Allow LF+CR newlines in multiline string literals.
953 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance<capture_raw>();
954 return true;
955 }
956
957 switch (c) {
958 case '\'': // fall through
959 case '"' : // fall through
960 case '\\': break;
961 case 'b' : c = '\b'; break;
962 case 'f' : c = '\f'; break;
963 case 'n' : c = '\n'; break;
964 case 'r' : c = '\r'; break;
965 case 't' : c = '\t'; break;
966 case 'u' : {
967 c = ScanUnicodeEscape<capture_raw>();
968 if (c < 0) return false;
969 break;
970 }
971 case 'v':
972 c = '\v';
973 break;
974 case 'x': {
975 c = ScanHexNumber<capture_raw>(2);
976 if (c < 0) return false;
977 break;
978 }
979 case '0': // Fall through.
980 case '1': // fall through
981 case '2': // fall through
982 case '3': // fall through
983 case '4': // fall through
984 case '5': // fall through
985 case '6': // fall through
986 case '7':
987 c = ScanOctalEscape<capture_raw>(c, 2);
988 break;
989 }
990
991 // Other escaped characters are interpreted as their non-escaped version.
992 AddLiteralChar(c);
993 return true;
994 }
995
996
997 template <bool capture_raw>
ScanOctalEscape(uc32 c,int length)998 uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
999 uc32 x = c - '0';
1000 int i = 0;
1001 for (; i < length; i++) {
1002 int d = c0_ - '0';
1003 if (d < 0 || d > 7) break;
1004 int nx = x * 8 + d;
1005 if (nx >= 256) break;
1006 x = nx;
1007 Advance<capture_raw>();
1008 }
1009 // Anything except '\0' is an octal escape sequence, illegal in strict mode.
1010 // Remember the position of octal escape sequences so that an error
1011 // can be reported later (in strict mode).
1012 // We don't report the error immediately, because the octal escape can
1013 // occur before the "use strict" directive.
1014 if (c != '0' || i > 0) {
1015 octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
1016 octal_message_ = MessageTemplate::kStrictOctalEscape;
1017 }
1018 return x;
1019 }
1020
1021
ScanString()1022 Token::Value Scanner::ScanString() {
1023 uc32 quote = c0_;
1024 Advance<false, false>(); // consume quote
1025
1026 LiteralScope literal(this);
1027 while (true) {
1028 if (c0_ > kMaxAscii) {
1029 HandleLeadSurrogate();
1030 break;
1031 }
1032 if (c0_ == kEndOfInput || c0_ == '\n' || c0_ == '\r') return Token::ILLEGAL;
1033 if (c0_ == quote) {
1034 literal.Complete();
1035 Advance<false, false>();
1036 return Token::STRING;
1037 }
1038 char c = static_cast<char>(c0_);
1039 if (c == '\\') break;
1040 Advance<false, false>();
1041 AddLiteralChar(c);
1042 }
1043
1044 while (c0_ != quote && c0_ != kEndOfInput &&
1045 !unicode_cache_->IsLineTerminator(c0_)) {
1046 uc32 c = c0_;
1047 Advance();
1048 if (c == '\\') {
1049 if (c0_ == kEndOfInput || !ScanEscape<false, false>()) {
1050 return Token::ILLEGAL;
1051 }
1052 } else {
1053 AddLiteralChar(c);
1054 }
1055 }
1056 if (c0_ != quote) return Token::ILLEGAL;
1057 literal.Complete();
1058
1059 Advance(); // consume quote
1060 return Token::STRING;
1061 }
1062
1063
ScanTemplateSpan()1064 Token::Value Scanner::ScanTemplateSpan() {
1065 // When scanning a TemplateSpan, we are looking for the following construct:
1066 // TEMPLATE_SPAN ::
1067 // ` LiteralChars* ${
1068 // | } LiteralChars* ${
1069 //
1070 // TEMPLATE_TAIL ::
1071 // ` LiteralChars* `
1072 // | } LiteralChar* `
1073 //
1074 // A TEMPLATE_SPAN should always be followed by an Expression, while a
1075 // TEMPLATE_TAIL terminates a TemplateLiteral and does not need to be
1076 // followed by an Expression.
1077
1078 // These scoped helpers save and restore the original error state, so that we
1079 // can specially treat invalid escape sequences in templates (which are
1080 // handled by the parser).
1081 ErrorState scanner_error_state(&scanner_error_, &scanner_error_location_);
1082 ErrorState octal_error_state(&octal_message_, &octal_pos_);
1083
1084 Token::Value result = Token::TEMPLATE_SPAN;
1085 LiteralScope literal(this);
1086 StartRawLiteral();
1087 const bool capture_raw = true;
1088 const bool in_template_literal = true;
1089 while (true) {
1090 uc32 c = c0_;
1091 Advance<capture_raw>();
1092 if (c == '`') {
1093 result = Token::TEMPLATE_TAIL;
1094 ReduceRawLiteralLength(1);
1095 break;
1096 } else if (c == '$' && c0_ == '{') {
1097 Advance<capture_raw>(); // Consume '{'
1098 ReduceRawLiteralLength(2);
1099 break;
1100 } else if (c == '\\') {
1101 if (c0_ != kEndOfInput && unicode_cache_->IsLineTerminator(c0_)) {
1102 // The TV of LineContinuation :: \ LineTerminatorSequence is the empty
1103 // code unit sequence.
1104 uc32 lastChar = c0_;
1105 Advance<capture_raw>();
1106 if (lastChar == '\r') {
1107 ReduceRawLiteralLength(1); // Remove \r
1108 if (c0_ == '\n') {
1109 Advance<capture_raw>(); // Adds \n
1110 } else {
1111 AddRawLiteralChar('\n');
1112 }
1113 }
1114 } else {
1115 bool success = ScanEscape<capture_raw, in_template_literal>();
1116 USE(success);
1117 DCHECK_EQ(!success, has_error());
1118 // For templates, invalid escape sequence checking is handled in the
1119 // parser.
1120 scanner_error_state.MoveErrorTo(&invalid_template_escape_message_,
1121 &invalid_template_escape_location_);
1122 octal_error_state.MoveErrorTo(&invalid_template_escape_message_,
1123 &invalid_template_escape_location_);
1124 }
1125 } else if (c < 0) {
1126 // Unterminated template literal
1127 PushBack(c);
1128 break;
1129 } else {
1130 // The TRV of LineTerminatorSequence :: <CR> is the CV 0x000A.
1131 // The TRV of LineTerminatorSequence :: <CR><LF> is the sequence
1132 // consisting of the CV 0x000A.
1133 if (c == '\r') {
1134 ReduceRawLiteralLength(1); // Remove \r
1135 if (c0_ == '\n') {
1136 Advance<capture_raw>(); // Adds \n
1137 } else {
1138 AddRawLiteralChar('\n');
1139 }
1140 c = '\n';
1141 }
1142 AddLiteralChar(c);
1143 }
1144 }
1145 literal.Complete();
1146 next_.location.end_pos = source_pos();
1147 next_.token = result;
1148
1149 return result;
1150 }
1151
1152
ScanTemplateStart()1153 Token::Value Scanner::ScanTemplateStart() {
1154 DCHECK(next_next_.token == Token::UNINITIALIZED);
1155 DCHECK(c0_ == '`');
1156 next_.location.beg_pos = source_pos();
1157 Advance(); // Consume `
1158 return ScanTemplateSpan();
1159 }
1160
1161
ScanTemplateContinuation()1162 Token::Value Scanner::ScanTemplateContinuation() {
1163 DCHECK_EQ(next_.token, Token::RBRACE);
1164 next_.location.beg_pos = source_pos() - 1; // We already consumed }
1165 return ScanTemplateSpan();
1166 }
1167
1168
ScanDecimalDigits()1169 void Scanner::ScanDecimalDigits() {
1170 while (IsDecimalDigit(c0_))
1171 AddLiteralCharAdvance();
1172 }
1173
1174
ScanNumber(bool seen_period)1175 Token::Value Scanner::ScanNumber(bool seen_period) {
1176 DCHECK(IsDecimalDigit(c0_)); // the first digit of the number or the fraction
1177
1178 enum {
1179 DECIMAL,
1180 DECIMAL_WITH_LEADING_ZERO,
1181 HEX,
1182 OCTAL,
1183 IMPLICIT_OCTAL,
1184 BINARY
1185 } kind = DECIMAL;
1186
1187 LiteralScope literal(this);
1188 bool at_start = !seen_period;
1189 int start_pos = source_pos(); // For reporting octal positions.
1190 if (seen_period) {
1191 // we have already seen a decimal point of the float
1192 AddLiteralChar('.');
1193 ScanDecimalDigits(); // we know we have at least one digit
1194
1195 } else {
1196 // if the first character is '0' we must check for octals and hex
1197 if (c0_ == '0') {
1198 AddLiteralCharAdvance();
1199
1200 // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or
1201 // an octal number.
1202 if (c0_ == 'x' || c0_ == 'X') {
1203 // hex number
1204 kind = HEX;
1205 AddLiteralCharAdvance();
1206 if (!IsHexDigit(c0_)) {
1207 // we must have at least one hex digit after 'x'/'X'
1208 return Token::ILLEGAL;
1209 }
1210 while (IsHexDigit(c0_)) {
1211 AddLiteralCharAdvance();
1212 }
1213 } else if (c0_ == 'o' || c0_ == 'O') {
1214 kind = OCTAL;
1215 AddLiteralCharAdvance();
1216 if (!IsOctalDigit(c0_)) {
1217 // we must have at least one octal digit after 'o'/'O'
1218 return Token::ILLEGAL;
1219 }
1220 while (IsOctalDigit(c0_)) {
1221 AddLiteralCharAdvance();
1222 }
1223 } else if (c0_ == 'b' || c0_ == 'B') {
1224 kind = BINARY;
1225 AddLiteralCharAdvance();
1226 if (!IsBinaryDigit(c0_)) {
1227 // we must have at least one binary digit after 'b'/'B'
1228 return Token::ILLEGAL;
1229 }
1230 while (IsBinaryDigit(c0_)) {
1231 AddLiteralCharAdvance();
1232 }
1233 } else if ('0' <= c0_ && c0_ <= '7') {
1234 // (possible) octal number
1235 kind = IMPLICIT_OCTAL;
1236 while (true) {
1237 if (c0_ == '8' || c0_ == '9') {
1238 at_start = false;
1239 kind = DECIMAL_WITH_LEADING_ZERO;
1240 break;
1241 }
1242 if (c0_ < '0' || '7' < c0_) {
1243 // Octal literal finished.
1244 octal_pos_ = Location(start_pos, source_pos());
1245 octal_message_ = MessageTemplate::kStrictOctalLiteral;
1246 break;
1247 }
1248 AddLiteralCharAdvance();
1249 }
1250 } else if (c0_ == '8' || c0_ == '9') {
1251 kind = DECIMAL_WITH_LEADING_ZERO;
1252 }
1253 }
1254
1255 // Parse decimal digits and allow trailing fractional part.
1256 if (kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO) {
1257 if (at_start) {
1258 uint64_t value = 0;
1259 while (IsDecimalDigit(c0_)) {
1260 value = 10 * value + (c0_ - '0');
1261
1262 uc32 first_char = c0_;
1263 Advance<false, false>();
1264 AddLiteralChar(first_char);
1265 }
1266
1267 if (next_.literal_chars->one_byte_literal().length() <= 10 &&
1268 value <= Smi::kMaxValue && c0_ != '.' &&
1269 (c0_ == kEndOfInput || !unicode_cache_->IsIdentifierStart(c0_))) {
1270 next_.smi_value_ = static_cast<uint32_t>(value);
1271 literal.Complete();
1272 HandleLeadSurrogate();
1273
1274 if (kind == DECIMAL_WITH_LEADING_ZERO) {
1275 octal_pos_ = Location(start_pos, source_pos());
1276 octal_message_ = MessageTemplate::kStrictDecimalWithLeadingZero;
1277 }
1278 return Token::SMI;
1279 }
1280 HandleLeadSurrogate();
1281 }
1282
1283 ScanDecimalDigits(); // optional
1284 if (c0_ == '.') {
1285 AddLiteralCharAdvance();
1286 ScanDecimalDigits(); // optional
1287 }
1288 }
1289 }
1290
1291 // scan exponent, if any
1292 if (c0_ == 'e' || c0_ == 'E') {
1293 DCHECK(kind != HEX); // 'e'/'E' must be scanned as part of the hex number
1294 if (!(kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO))
1295 return Token::ILLEGAL;
1296 // scan exponent
1297 AddLiteralCharAdvance();
1298 if (c0_ == '+' || c0_ == '-')
1299 AddLiteralCharAdvance();
1300 if (!IsDecimalDigit(c0_)) {
1301 // we must have at least one decimal digit after 'e'/'E'
1302 return Token::ILLEGAL;
1303 }
1304 ScanDecimalDigits();
1305 }
1306
1307 // The source character immediately following a numeric literal must
1308 // not be an identifier start or a decimal digit; see ECMA-262
1309 // section 7.8.3, page 17 (note that we read only one decimal digit
1310 // if the value is 0).
1311 if (IsDecimalDigit(c0_) ||
1312 (c0_ != kEndOfInput && unicode_cache_->IsIdentifierStart(c0_)))
1313 return Token::ILLEGAL;
1314
1315 literal.Complete();
1316
1317 if (kind == DECIMAL_WITH_LEADING_ZERO) {
1318 octal_pos_ = Location(start_pos, source_pos());
1319 octal_message_ = MessageTemplate::kStrictDecimalWithLeadingZero;
1320 }
1321 return Token::NUMBER;
1322 }
1323
1324
ScanIdentifierUnicodeEscape()1325 uc32 Scanner::ScanIdentifierUnicodeEscape() {
1326 Advance();
1327 if (c0_ != 'u') return -1;
1328 Advance();
1329 return ScanUnicodeEscape<false>();
1330 }
1331
1332
1333 template <bool capture_raw>
ScanUnicodeEscape()1334 uc32 Scanner::ScanUnicodeEscape() {
1335 // Accept both \uxxxx and \u{xxxxxx}. In the latter case, the number of
1336 // hex digits between { } is arbitrary. \ and u have already been read.
1337 if (c0_ == '{') {
1338 int begin = source_pos() - 2;
1339 Advance<capture_raw>();
1340 uc32 cp = ScanUnlimitedLengthHexNumber<capture_raw>(0x10ffff, begin);
1341 if (cp < 0 || c0_ != '}') {
1342 ReportScannerError(source_pos(),
1343 MessageTemplate::kInvalidUnicodeEscapeSequence);
1344 return -1;
1345 }
1346 Advance<capture_raw>();
1347 return cp;
1348 }
1349 const bool unicode = true;
1350 return ScanHexNumber<capture_raw, unicode>(4);
1351 }
1352
1353
1354 // ----------------------------------------------------------------------------
1355 // Keyword Matcher
1356
// Table of the keywords known to the scanner, grouped by first character.
// KEYWORD_GROUP(ch) opens the group of keywords that start with ch, and
// KEYWORD(spelling, token) pairs a keyword's spelling with the token it maps
// to. Both are supplied by the code that expands the table.
#define KEYWORDS(KEYWORD_GROUP, KEYWORD)                    \
  KEYWORD_GROUP('a')                                        \
  KEYWORD("async", Token::ASYNC)                            \
  KEYWORD("await", Token::AWAIT)                            \
  KEYWORD_GROUP('b')                                        \
  KEYWORD("break", Token::BREAK)                            \
  KEYWORD_GROUP('c')                                        \
  KEYWORD("case", Token::CASE)                              \
  KEYWORD("catch", Token::CATCH)                            \
  KEYWORD("class", Token::CLASS)                            \
  KEYWORD("const", Token::CONST)                            \
  KEYWORD("continue", Token::CONTINUE)                      \
  KEYWORD_GROUP('d')                                        \
  KEYWORD("debugger", Token::DEBUGGER)                      \
  KEYWORD("default", Token::DEFAULT)                        \
  KEYWORD("delete", Token::DELETE)                          \
  KEYWORD("do", Token::DO)                                  \
  KEYWORD_GROUP('e')                                        \
  KEYWORD("else", Token::ELSE)                              \
  KEYWORD("enum", Token::ENUM)                              \
  KEYWORD("export", Token::EXPORT)                          \
  KEYWORD("extends", Token::EXTENDS)                        \
  KEYWORD_GROUP('f')                                        \
  KEYWORD("false", Token::FALSE_LITERAL)                    \
  KEYWORD("finally", Token::FINALLY)                        \
  KEYWORD("for", Token::FOR)                                \
  KEYWORD("function", Token::FUNCTION)                      \
  KEYWORD_GROUP('i')                                        \
  KEYWORD("if", Token::IF)                                  \
  KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("import", Token::IMPORT)                          \
  KEYWORD("in", Token::IN)                                  \
  KEYWORD("instanceof", Token::INSTANCEOF)                  \
  KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD)  \
  KEYWORD_GROUP('l')                                        \
  KEYWORD("let", Token::LET)                                \
  KEYWORD_GROUP('n')                                        \
  KEYWORD("new", Token::NEW)                                \
  KEYWORD("null", Token::NULL_LITERAL)                      \
  KEYWORD_GROUP('p')                                        \
  KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD)    \
  KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD)    \
  KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD)  \
  KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD)     \
  KEYWORD_GROUP('r')                                        \
  KEYWORD("return", Token::RETURN)                          \
  KEYWORD_GROUP('s')                                        \
  KEYWORD("static", Token::STATIC)                          \
  KEYWORD("super", Token::SUPER)                            \
  KEYWORD("switch", Token::SWITCH)                          \
  KEYWORD_GROUP('t')                                        \
  KEYWORD("this", Token::THIS)                              \
  KEYWORD("throw", Token::THROW)                            \
  KEYWORD("true", Token::TRUE_LITERAL)                      \
  KEYWORD("try", Token::TRY)                                \
  KEYWORD("typeof", Token::TYPEOF)                          \
  KEYWORD_GROUP('v')                                        \
  KEYWORD("var", Token::VAR)                                \
  KEYWORD("void", Token::VOID)                              \
  KEYWORD_GROUP('w')                                        \
  KEYWORD("while", Token::WHILE)                            \
  KEYWORD("with", Token::WITH)                              \
  KEYWORD_GROUP('y')                                        \
  KEYWORD("yield", Token::YIELD)
1421
KeywordOrIdentifierToken(const uint8_t * input,int input_length)1422 static Token::Value KeywordOrIdentifierToken(const uint8_t* input,
1423 int input_length) {
1424 DCHECK(input_length >= 1);
1425 const int kMinLength = 2;
1426 const int kMaxLength = 10;
1427 if (input_length < kMinLength || input_length > kMaxLength) {
1428 return Token::IDENTIFIER;
1429 }
1430 switch (input[0]) {
1431 default:
1432 #define KEYWORD_GROUP_CASE(ch) \
1433 break; \
1434 case ch:
1435 #define KEYWORD(keyword, token) \
1436 { \
1437 /* 'keyword' is a char array, so sizeof(keyword) is */ \
1438 /* strlen(keyword) plus 1 for the NUL char. */ \
1439 const int keyword_length = sizeof(keyword) - 1; \
1440 STATIC_ASSERT(keyword_length >= kMinLength); \
1441 STATIC_ASSERT(keyword_length <= kMaxLength); \
1442 if (input_length == keyword_length && input[1] == keyword[1] && \
1443 (keyword_length <= 2 || input[2] == keyword[2]) && \
1444 (keyword_length <= 3 || input[3] == keyword[3]) && \
1445 (keyword_length <= 4 || input[4] == keyword[4]) && \
1446 (keyword_length <= 5 || input[5] == keyword[5]) && \
1447 (keyword_length <= 6 || input[6] == keyword[6]) && \
1448 (keyword_length <= 7 || input[7] == keyword[7]) && \
1449 (keyword_length <= 8 || input[8] == keyword[8]) && \
1450 (keyword_length <= 9 || input[9] == keyword[9])) { \
1451 return token; \
1452 } \
1453 }
1454 KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
1455 }
1456 return Token::IDENTIFIER;
1457 }
1458
1459
ScanIdentifierOrKeyword()1460 Token::Value Scanner::ScanIdentifierOrKeyword() {
1461 DCHECK(unicode_cache_->IsIdentifierStart(c0_));
1462 LiteralScope literal(this);
1463 if (IsInRange(c0_, 'a', 'z')) {
1464 do {
1465 char first_char = static_cast<char>(c0_);
1466 Advance<false, false>();
1467 AddLiteralChar(first_char);
1468 } while (IsInRange(c0_, 'a', 'z'));
1469
1470 if (IsDecimalDigit(c0_) || IsInRange(c0_, 'A', 'Z') || c0_ == '_' ||
1471 c0_ == '$') {
1472 // Identifier starting with lowercase.
1473 char first_char = static_cast<char>(c0_);
1474 Advance<false, false>();
1475 AddLiteralChar(first_char);
1476 while (IsAsciiIdentifier(c0_)) {
1477 char first_char = static_cast<char>(c0_);
1478 Advance<false, false>();
1479 AddLiteralChar(first_char);
1480 }
1481 if (c0_ <= kMaxAscii && c0_ != '\\') {
1482 literal.Complete();
1483 return Token::IDENTIFIER;
1484 }
1485 } else if (c0_ <= kMaxAscii && c0_ != '\\') {
1486 // Only a-z+: could be a keyword or identifier.
1487 Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
1488 Token::Value token =
1489 KeywordOrIdentifierToken(chars.start(), chars.length());
1490 if (token == Token::IDENTIFIER ||
1491 token == Token::FUTURE_STRICT_RESERVED_WORD)
1492 literal.Complete();
1493 return token;
1494 }
1495
1496 HandleLeadSurrogate();
1497 } else if (IsInRange(c0_, 'A', 'Z') || c0_ == '_' || c0_ == '$') {
1498 do {
1499 char first_char = static_cast<char>(c0_);
1500 Advance<false, false>();
1501 AddLiteralChar(first_char);
1502 } while (IsAsciiIdentifier(c0_));
1503
1504 if (c0_ <= kMaxAscii && c0_ != '\\') {
1505 literal.Complete();
1506 return Token::IDENTIFIER;
1507 }
1508
1509 HandleLeadSurrogate();
1510 } else if (c0_ == '\\') {
1511 // Scan identifier start character.
1512 uc32 c = ScanIdentifierUnicodeEscape();
1513 // Only allow legal identifier start characters.
1514 if (c < 0 ||
1515 c == '\\' || // No recursive escapes.
1516 !unicode_cache_->IsIdentifierStart(c)) {
1517 return Token::ILLEGAL;
1518 }
1519 AddLiteralChar(c);
1520 return ScanIdentifierSuffix(&literal, true);
1521 } else {
1522 uc32 first_char = c0_;
1523 Advance();
1524 AddLiteralChar(first_char);
1525 }
1526
1527 // Scan the rest of the identifier characters.
1528 while (c0_ != kEndOfInput && unicode_cache_->IsIdentifierPart(c0_)) {
1529 if (c0_ != '\\') {
1530 uc32 next_char = c0_;
1531 Advance();
1532 AddLiteralChar(next_char);
1533 continue;
1534 }
1535 // Fallthrough if no longer able to complete keyword.
1536 return ScanIdentifierSuffix(&literal, false);
1537 }
1538
1539 if (next_.literal_chars->is_one_byte()) {
1540 Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
1541 Token::Value token =
1542 KeywordOrIdentifierToken(chars.start(), chars.length());
1543 if (token == Token::IDENTIFIER ||
1544 token == Token::FUTURE_STRICT_RESERVED_WORD)
1545 literal.Complete();
1546 return token;
1547 }
1548 literal.Complete();
1549 return Token::IDENTIFIER;
1550 }
1551
1552
ScanIdentifierSuffix(LiteralScope * literal,bool escaped)1553 Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal,
1554 bool escaped) {
1555 // Scan the rest of the identifier characters.
1556 while (c0_ != kEndOfInput && unicode_cache_->IsIdentifierPart(c0_)) {
1557 if (c0_ == '\\') {
1558 uc32 c = ScanIdentifierUnicodeEscape();
1559 escaped = true;
1560 // Only allow legal identifier part characters.
1561 if (c < 0 ||
1562 c == '\\' ||
1563 !unicode_cache_->IsIdentifierPart(c)) {
1564 return Token::ILLEGAL;
1565 }
1566 AddLiteralChar(c);
1567 } else {
1568 AddLiteralChar(c0_);
1569 Advance();
1570 }
1571 }
1572 literal->Complete();
1573
1574 if (escaped && next_.literal_chars->is_one_byte()) {
1575 Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
1576 Token::Value token =
1577 KeywordOrIdentifierToken(chars.start(), chars.length());
1578 /* TODO(adamk): YIELD should be handled specially. */
1579 if (token == Token::IDENTIFIER) {
1580 return Token::IDENTIFIER;
1581 } else if (token == Token::FUTURE_STRICT_RESERVED_WORD ||
1582 token == Token::LET || token == Token::STATIC) {
1583 return Token::ESCAPED_STRICT_RESERVED_WORD;
1584 } else {
1585 return Token::ESCAPED_KEYWORD;
1586 }
1587 }
1588 return Token::IDENTIFIER;
1589 }
1590
ScanRegExpPattern()1591 bool Scanner::ScanRegExpPattern() {
1592 DCHECK(next_next_.token == Token::UNINITIALIZED);
1593 DCHECK(next_.token == Token::DIV || next_.token == Token::ASSIGN_DIV);
1594
1595 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
1596 bool in_character_class = false;
1597 bool seen_equal = (next_.token == Token::ASSIGN_DIV);
1598
1599 // Previous token is either '/' or '/=', in the second case, the
1600 // pattern starts at =.
1601 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
1602 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
1603
1604 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
1605 // the scanner should pass uninterpreted bodies to the RegExp
1606 // constructor.
1607 LiteralScope literal(this);
1608 if (seen_equal) {
1609 AddLiteralChar('=');
1610 }
1611
1612 while (c0_ != '/' || in_character_class) {
1613 if (c0_ == kEndOfInput || unicode_cache_->IsLineTerminator(c0_))
1614 return false;
1615 if (c0_ == '\\') { // Escape sequence.
1616 AddLiteralCharAdvance();
1617 if (c0_ == kEndOfInput || unicode_cache_->IsLineTerminator(c0_))
1618 return false;
1619 AddLiteralCharAdvance();
1620 // If the escape allows more characters, i.e., \x??, \u????, or \c?,
1621 // only "safe" characters are allowed (letters, digits, underscore),
1622 // otherwise the escape isn't valid and the invalid character has
1623 // its normal meaning. I.e., we can just continue scanning without
1624 // worrying whether the following characters are part of the escape
1625 // or not, since any '/', '\\' or '[' is guaranteed to not be part
1626 // of the escape sequence.
1627
1628 // TODO(896): At some point, parse RegExps more throughly to capture
1629 // octal esacpes in strict mode.
1630 } else { // Unescaped character.
1631 if (c0_ == '[') in_character_class = true;
1632 if (c0_ == ']') in_character_class = false;
1633 AddLiteralCharAdvance();
1634 }
1635 }
1636 Advance(); // consume '/'
1637
1638 literal.Complete();
1639 next_.token = Token::REGEXP_LITERAL;
1640 return true;
1641 }
1642
1643
ScanRegExpFlags()1644 Maybe<RegExp::Flags> Scanner::ScanRegExpFlags() {
1645 DCHECK(next_.token == Token::REGEXP_LITERAL);
1646
1647 // Scan regular expression flags.
1648 int flags = 0;
1649 while (c0_ != kEndOfInput && unicode_cache_->IsIdentifierPart(c0_)) {
1650 RegExp::Flags flag = RegExp::kNone;
1651 switch (c0_) {
1652 case 'g':
1653 flag = RegExp::kGlobal;
1654 break;
1655 case 'i':
1656 flag = RegExp::kIgnoreCase;
1657 break;
1658 case 'm':
1659 flag = RegExp::kMultiline;
1660 break;
1661 case 'u':
1662 flag = RegExp::kUnicode;
1663 break;
1664 case 'y':
1665 flag = RegExp::kSticky;
1666 break;
1667 default:
1668 return Nothing<RegExp::Flags>();
1669 }
1670 if (flags & flag) {
1671 return Nothing<RegExp::Flags>();
1672 }
1673 Advance();
1674 flags |= flag;
1675 }
1676
1677 next_.location.end_pos = source_pos();
1678 return Just(RegExp::Flags(flags));
1679 }
1680
1681
CurrentSymbol(AstValueFactory * ast_value_factory)1682 const AstRawString* Scanner::CurrentSymbol(AstValueFactory* ast_value_factory) {
1683 if (is_literal_one_byte()) {
1684 return ast_value_factory->GetOneByteString(literal_one_byte_string());
1685 }
1686 return ast_value_factory->GetTwoByteString(literal_two_byte_string());
1687 }
1688
1689
NextSymbol(AstValueFactory * ast_value_factory)1690 const AstRawString* Scanner::NextSymbol(AstValueFactory* ast_value_factory) {
1691 if (is_next_literal_one_byte()) {
1692 return ast_value_factory->GetOneByteString(next_literal_one_byte_string());
1693 }
1694 return ast_value_factory->GetTwoByteString(next_literal_two_byte_string());
1695 }
1696
1697
CurrentRawSymbol(AstValueFactory * ast_value_factory)1698 const AstRawString* Scanner::CurrentRawSymbol(
1699 AstValueFactory* ast_value_factory) {
1700 if (is_raw_literal_one_byte()) {
1701 return ast_value_factory->GetOneByteString(raw_literal_one_byte_string());
1702 }
1703 return ast_value_factory->GetTwoByteString(raw_literal_two_byte_string());
1704 }
1705
1706
DoubleValue()1707 double Scanner::DoubleValue() {
1708 DCHECK(is_literal_one_byte());
1709 return StringToDouble(
1710 unicode_cache_,
1711 literal_one_byte_string(),
1712 ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY);
1713 }
1714
1715
ContainsDot()1716 bool Scanner::ContainsDot() {
1717 DCHECK(is_literal_one_byte());
1718 Vector<const uint8_t> str = literal_one_byte_string();
1719 return std::find(str.begin(), str.end(), '.') != str.end();
1720 }
1721
FindSymbol(DuplicateFinder * finder)1722 bool Scanner::FindSymbol(DuplicateFinder* finder) {
1723 // TODO(vogelheim): Move this logic into the calling class; this can be fully
1724 // implemented using the public interface.
1725 if (is_literal_one_byte()) {
1726 return finder->AddOneByteSymbol(literal_one_byte_string());
1727 }
1728 return finder->AddTwoByteSymbol(literal_two_byte_string());
1729 }
1730
SeekNext(size_t position)1731 void Scanner::SeekNext(size_t position) {
1732 // Use with care: This cleanly resets most, but not all scanner state.
1733 // TODO(vogelheim): Fix this, or at least DCHECK the relevant conditions.
1734
1735 // To re-scan from a given character position, we need to:
1736 // 1, Reset the current_, next_ and next_next_ tokens
1737 // (next_ + next_next_ will be overwrittem by Next(),
1738 // current_ will remain unchanged, so overwrite it fully.)
1739 current_ = {{0, 0}, nullptr, nullptr, 0, Token::UNINITIALIZED};
1740 next_.token = Token::UNINITIALIZED;
1741 next_next_.token = Token::UNINITIALIZED;
1742 // 2, reset the source to the desired position,
1743 source_->Seek(position);
1744 // 3, re-scan, by scanning the look-ahead char + 1 token (next_).
1745 c0_ = source_->Advance();
1746 Next();
1747 DCHECK_EQ(next_.location.beg_pos, static_cast<int>(position));
1748 }
1749
1750 } // namespace internal
1751 } // namespace v8
1752