• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Features shared by parsing and pre-parsing scanners.
6 
7 #include "src/parsing/scanner.h"
8 
9 #include <stdint.h>
10 
11 #include <cmath>
12 
13 #include "src/ast/ast-value-factory.h"
14 #include "src/char-predicates-inl.h"
15 #include "src/conversions-inl.h"
16 #include "src/list-inl.h"
17 #include "src/parsing/duplicate-finder.h"  // For Scanner::FindSymbol
18 
19 namespace v8 {
20 namespace internal {
21 
22 // Scoped helper for saving & restoring scanner error state.
23 // This is used for tagged template literals, in which normally forbidden
24 // escape sequences are allowed.
25 class ErrorState {
26  public:
ErrorState(MessageTemplate::Template * message_stack,Scanner::Location * location_stack)27   ErrorState(MessageTemplate::Template* message_stack,
28              Scanner::Location* location_stack)
29       : message_stack_(message_stack),
30         old_message_(*message_stack),
31         location_stack_(location_stack),
32         old_location_(*location_stack) {
33     *message_stack_ = MessageTemplate::kNone;
34     *location_stack_ = Scanner::Location::invalid();
35   }
36 
~ErrorState()37   ~ErrorState() {
38     *message_stack_ = old_message_;
39     *location_stack_ = old_location_;
40   }
41 
MoveErrorTo(MessageTemplate::Template * message_dest,Scanner::Location * location_dest)42   void MoveErrorTo(MessageTemplate::Template* message_dest,
43                    Scanner::Location* location_dest) {
44     if (*message_stack_ == MessageTemplate::kNone) {
45       return;
46     }
47     if (*message_dest == MessageTemplate::kNone) {
48       *message_dest = *message_stack_;
49       *location_dest = *location_stack_;
50     }
51     *message_stack_ = MessageTemplate::kNone;
52     *location_stack_ = Scanner::Location::invalid();
53   }
54 
55  private:
56   MessageTemplate::Template* const message_stack_;
57   MessageTemplate::Template const old_message_;
58   Scanner::Location* const location_stack_;
59   Scanner::Location const old_location_;
60 };
61 
Internalize(Isolate * isolate) const62 Handle<String> Scanner::LiteralBuffer::Internalize(Isolate* isolate) const {
63   if (is_one_byte()) {
64     return isolate->factory()->InternalizeOneByteString(one_byte_literal());
65   }
66   return isolate->factory()->InternalizeTwoByteString(two_byte_literal());
67 }
68 
NewCapacity(int min_capacity)69 int Scanner::LiteralBuffer::NewCapacity(int min_capacity) {
70   int capacity = Max(min_capacity, backing_store_.length());
71   int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth);
72   return new_capacity;
73 }
74 
ExpandBuffer()75 void Scanner::LiteralBuffer::ExpandBuffer() {
76   Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));
77   MemCopy(new_store.start(), backing_store_.start(), position_);
78   backing_store_.Dispose();
79   backing_store_ = new_store;
80 }
81 
ConvertToTwoByte()82 void Scanner::LiteralBuffer::ConvertToTwoByte() {
83   DCHECK(is_one_byte_);
84   Vector<byte> new_store;
85   int new_content_size = position_ * kUC16Size;
86   if (new_content_size >= backing_store_.length()) {
87     // Ensure room for all currently read code units as UC16 as well
88     // as the code unit about to be stored.
89     new_store = Vector<byte>::New(NewCapacity(new_content_size));
90   } else {
91     new_store = backing_store_;
92   }
93   uint8_t* src = backing_store_.start();
94   uint16_t* dst = reinterpret_cast<uint16_t*>(new_store.start());
95   for (int i = position_ - 1; i >= 0; i--) {
96     dst[i] = src[i];
97   }
98   if (new_store.start() != backing_store_.start()) {
99     backing_store_.Dispose();
100     backing_store_ = new_store;
101   }
102   position_ = new_content_size;
103   is_one_byte_ = false;
104 }
105 
AddCharSlow(uc32 code_unit)106 void Scanner::LiteralBuffer::AddCharSlow(uc32 code_unit) {
107   if (position_ >= backing_store_.length()) ExpandBuffer();
108   if (is_one_byte_) {
109     if (code_unit <= static_cast<uc32>(unibrow::Latin1::kMaxChar)) {
110       backing_store_[position_] = static_cast<byte>(code_unit);
111       position_ += kOneByteSize;
112       return;
113     }
114     ConvertToTwoByte();
115   }
116   if (code_unit <=
117       static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
118     *reinterpret_cast<uint16_t*>(&backing_store_[position_]) = code_unit;
119     position_ += kUC16Size;
120   } else {
121     *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
122         unibrow::Utf16::LeadSurrogate(code_unit);
123     position_ += kUC16Size;
124     if (position_ >= backing_store_.length()) ExpandBuffer();
125     *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
126         unibrow::Utf16::TrailSurrogate(code_unit);
127     position_ += kUC16Size;
128   }
129 }
130 
// ----------------------------------------------------------------------------
// Scanner::BookmarkScope

// Sentinel values for BookmarkScope::bookmark_. They occupy the three largest
// size_t values; any other value of bookmark_ is a source position.

// Stored by Set() when the scanner's current token is still uninitialized;
// Apply() then seeks to position 0.
const size_t Scanner::BookmarkScope::kBookmarkAtFirstPos =
    std::numeric_limits<size_t>::max() - 2;
// Set() expects to find this value; HasBeenSet() reports false for it.
const size_t Scanner::BookmarkScope::kNoBookmark =
    std::numeric_limits<size_t>::max() - 1;
// Stored by Apply() once the bookmark has been used.
const size_t Scanner::BookmarkScope::kBookmarkWasApplied =
    std::numeric_limits<size_t>::max();
140 
Set()141 void Scanner::BookmarkScope::Set() {
142   DCHECK_EQ(bookmark_, kNoBookmark);
143   DCHECK_EQ(scanner_->next_next_.token, Token::UNINITIALIZED);
144 
145   // The first token is a bit special, since current_ will still be
146   // uninitialized. In this case, store kBookmarkAtFirstPos and special-case it
147   // when
148   // applying the bookmark.
149   DCHECK_IMPLIES(
150       scanner_->current_.token == Token::UNINITIALIZED,
151       scanner_->current_.location.beg_pos == scanner_->next_.location.beg_pos);
152   bookmark_ = (scanner_->current_.token == Token::UNINITIALIZED)
153                   ? kBookmarkAtFirstPos
154                   : scanner_->location().beg_pos;
155 }
156 
Apply()157 void Scanner::BookmarkScope::Apply() {
158   DCHECK(HasBeenSet());  // Caller hasn't called SetBookmark.
159   if (bookmark_ == kBookmarkAtFirstPos) {
160     scanner_->SeekNext(0);
161   } else {
162     scanner_->SeekNext(bookmark_);
163     scanner_->Next();
164     DCHECK_EQ(scanner_->location().beg_pos, static_cast<int>(bookmark_));
165   }
166   bookmark_ = kBookmarkWasApplied;
167 }
168 
HasBeenSet()169 bool Scanner::BookmarkScope::HasBeenSet() {
170   return bookmark_ != kNoBookmark && bookmark_ != kBookmarkWasApplied;
171 }
172 
// True once Apply() has consumed the bookmark (Apply() stores
// kBookmarkWasApplied as its last step).
bool Scanner::BookmarkScope::HasBeenApplied() {
  return bookmark_ == kBookmarkWasApplied;
}
176 
// ----------------------------------------------------------------------------
// Scanner

// Creates a scanner without an input stream; Initialize() attaches one.
// Starts with no recorded octal position/message and no HTML comment seen.
Scanner::Scanner(UnicodeCache* unicode_cache)
    : unicode_cache_(unicode_cache),
      octal_pos_(Location::invalid()),
      octal_message_(MessageTemplate::kNone),
      found_html_comment_(false) {}
185 
// Attaches |source| as the input stream and scans the first token, so that
// the scanner has a valid look-ahead token when this returns.
void Scanner::Initialize(Utf16CharacterStream* source) {
  source_ = source;
  // Need to capture identifiers in order to recognize "get" and "set"
  // in object literals.
  Init();
  // Skip initial whitespace allowing HTML comment ends just like
  // after a newline and scan first token.
  // (SkipWhiteSpace() only honours a leading '-->' when this flag is set.)
  has_line_terminator_before_next_ = true;
  SkipWhiteSpace();
  Scan();
}
197 
198 template <bool capture_raw, bool unicode>
ScanHexNumber(int expected_length)199 uc32 Scanner::ScanHexNumber(int expected_length) {
200   DCHECK(expected_length <= 4);  // prevent overflow
201 
202   int begin = source_pos() - 2;
203   uc32 x = 0;
204   for (int i = 0; i < expected_length; i++) {
205     int d = HexValue(c0_);
206     if (d < 0) {
207       ReportScannerError(Location(begin, begin + expected_length + 2),
208                          unicode
209                              ? MessageTemplate::kInvalidUnicodeEscapeSequence
210                              : MessageTemplate::kInvalidHexEscapeSequence);
211       return -1;
212     }
213     x = x * 16 + d;
214     Advance<capture_raw>();
215   }
216 
217   return x;
218 }
219 
220 template <bool capture_raw>
ScanUnlimitedLengthHexNumber(int max_value,int beg_pos)221 uc32 Scanner::ScanUnlimitedLengthHexNumber(int max_value, int beg_pos) {
222   uc32 x = 0;
223   int d = HexValue(c0_);
224   if (d < 0) return -1;
225 
226   while (d >= 0) {
227     x = x * 16 + d;
228     if (x > max_value) {
229       ReportScannerError(Location(beg_pos, source_pos() + 1),
230                          MessageTemplate::kUndefinedUnicodeCodePoint);
231       return -1;
232     }
233     Advance<capture_raw>();
234     d = HexValue(c0_);
235   }
236 
237   return x;
238 }
239 
240 
241 // Ensure that tokens can be stored in a byte.
242 STATIC_ASSERT(Token::NUM_TOKENS <= 0x100);
243 
244 // Table of one-character tokens, by character (0x00..0x7f only).
245 static const byte one_char_tokens[] = {
246   Token::ILLEGAL,
247   Token::ILLEGAL,
248   Token::ILLEGAL,
249   Token::ILLEGAL,
250   Token::ILLEGAL,
251   Token::ILLEGAL,
252   Token::ILLEGAL,
253   Token::ILLEGAL,
254   Token::ILLEGAL,
255   Token::ILLEGAL,
256   Token::ILLEGAL,
257   Token::ILLEGAL,
258   Token::ILLEGAL,
259   Token::ILLEGAL,
260   Token::ILLEGAL,
261   Token::ILLEGAL,
262   Token::ILLEGAL,
263   Token::ILLEGAL,
264   Token::ILLEGAL,
265   Token::ILLEGAL,
266   Token::ILLEGAL,
267   Token::ILLEGAL,
268   Token::ILLEGAL,
269   Token::ILLEGAL,
270   Token::ILLEGAL,
271   Token::ILLEGAL,
272   Token::ILLEGAL,
273   Token::ILLEGAL,
274   Token::ILLEGAL,
275   Token::ILLEGAL,
276   Token::ILLEGAL,
277   Token::ILLEGAL,
278   Token::ILLEGAL,
279   Token::ILLEGAL,
280   Token::ILLEGAL,
281   Token::ILLEGAL,
282   Token::ILLEGAL,
283   Token::ILLEGAL,
284   Token::ILLEGAL,
285   Token::ILLEGAL,
286   Token::LPAREN,       // 0x28
287   Token::RPAREN,       // 0x29
288   Token::ILLEGAL,
289   Token::ILLEGAL,
290   Token::COMMA,        // 0x2c
291   Token::ILLEGAL,
292   Token::ILLEGAL,
293   Token::ILLEGAL,
294   Token::ILLEGAL,
295   Token::ILLEGAL,
296   Token::ILLEGAL,
297   Token::ILLEGAL,
298   Token::ILLEGAL,
299   Token::ILLEGAL,
300   Token::ILLEGAL,
301   Token::ILLEGAL,
302   Token::ILLEGAL,
303   Token::ILLEGAL,
304   Token::COLON,        // 0x3a
305   Token::SEMICOLON,    // 0x3b
306   Token::ILLEGAL,
307   Token::ILLEGAL,
308   Token::ILLEGAL,
309   Token::CONDITIONAL,  // 0x3f
310   Token::ILLEGAL,
311   Token::ILLEGAL,
312   Token::ILLEGAL,
313   Token::ILLEGAL,
314   Token::ILLEGAL,
315   Token::ILLEGAL,
316   Token::ILLEGAL,
317   Token::ILLEGAL,
318   Token::ILLEGAL,
319   Token::ILLEGAL,
320   Token::ILLEGAL,
321   Token::ILLEGAL,
322   Token::ILLEGAL,
323   Token::ILLEGAL,
324   Token::ILLEGAL,
325   Token::ILLEGAL,
326   Token::ILLEGAL,
327   Token::ILLEGAL,
328   Token::ILLEGAL,
329   Token::ILLEGAL,
330   Token::ILLEGAL,
331   Token::ILLEGAL,
332   Token::ILLEGAL,
333   Token::ILLEGAL,
334   Token::ILLEGAL,
335   Token::ILLEGAL,
336   Token::ILLEGAL,
337   Token::LBRACK,     // 0x5b
338   Token::ILLEGAL,
339   Token::RBRACK,     // 0x5d
340   Token::ILLEGAL,
341   Token::ILLEGAL,
342   Token::ILLEGAL,
343   Token::ILLEGAL,
344   Token::ILLEGAL,
345   Token::ILLEGAL,
346   Token::ILLEGAL,
347   Token::ILLEGAL,
348   Token::ILLEGAL,
349   Token::ILLEGAL,
350   Token::ILLEGAL,
351   Token::ILLEGAL,
352   Token::ILLEGAL,
353   Token::ILLEGAL,
354   Token::ILLEGAL,
355   Token::ILLEGAL,
356   Token::ILLEGAL,
357   Token::ILLEGAL,
358   Token::ILLEGAL,
359   Token::ILLEGAL,
360   Token::ILLEGAL,
361   Token::ILLEGAL,
362   Token::ILLEGAL,
363   Token::ILLEGAL,
364   Token::ILLEGAL,
365   Token::ILLEGAL,
366   Token::ILLEGAL,
367   Token::ILLEGAL,
368   Token::ILLEGAL,
369   Token::LBRACE,       // 0x7b
370   Token::ILLEGAL,
371   Token::RBRACE,       // 0x7d
372   Token::BIT_NOT,      // 0x7e
373   Token::ILLEGAL
374 };
375 
376 
Next()377 Token::Value Scanner::Next() {
378   if (next_.token == Token::EOS) {
379     next_.location.beg_pos = current_.location.beg_pos;
380     next_.location.end_pos = current_.location.end_pos;
381   }
382   current_ = next_;
383   if (V8_UNLIKELY(next_next_.token != Token::UNINITIALIZED)) {
384     next_ = next_next_;
385     next_next_.token = Token::UNINITIALIZED;
386     has_line_terminator_before_next_ = has_line_terminator_after_next_;
387     return current_.token;
388   }
389   has_line_terminator_before_next_ = false;
390   has_multiline_comment_before_next_ = false;
391   if (static_cast<unsigned>(c0_) <= 0x7f) {
392     Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]);
393     if (token != Token::ILLEGAL) {
394       int pos = source_pos();
395       next_.token = token;
396       next_.location.beg_pos = pos;
397       next_.location.end_pos = pos + 1;
398       next_.literal_chars = nullptr;
399       next_.raw_literal_chars = nullptr;
400       Advance();
401       return current_.token;
402     }
403   }
404   Scan();
405   return current_.token;
406 }
407 
408 
PeekAhead()409 Token::Value Scanner::PeekAhead() {
410   DCHECK(next_.token != Token::DIV);
411   DCHECK(next_.token != Token::ASSIGN_DIV);
412 
413   if (next_next_.token != Token::UNINITIALIZED) {
414     return next_next_.token;
415   }
416   TokenDesc prev = current_;
417   bool has_line_terminator_before_next =
418       has_line_terminator_before_next_ || has_multiline_comment_before_next_;
419   Next();
420   has_line_terminator_after_next_ =
421       has_line_terminator_before_next_ || has_multiline_comment_before_next_;
422   has_line_terminator_before_next_ = has_line_terminator_before_next;
423   Token::Value ret = next_.token;
424   next_next_ = next_;
425   next_ = current_;
426   current_ = prev;
427   return ret;
428 }
429 
430 
// Returns true if |c| is 0xFFFE, i.e. a byte order mark (U+FEFF) as it
// appears when read with the wrong endianness.
// TODO(yangguo): check whether this is actually necessary.
static inline bool IsLittleEndianByteOrderMark(uc32 c) {
  // The Unicode value U+FFFE is guaranteed never to be assigned as a
  // Unicode character; this implies that in a Unicode context the
  // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
  // character expressed in little-endian byte order (since it could
  // not be a U+FFFE character expressed in big-endian byte
  // order). Nevertheless, we check for it to be compatible with
  // SpiderMonkey.
  return c == 0xFFFE;
}
442 
SkipWhiteSpace()443 bool Scanner::SkipWhiteSpace() {
444   int start_position = source_pos();
445 
446   while (true) {
447     while (true) {
448       // Don't skip behind the end of input.
449       if (c0_ == kEndOfInput) break;
450 
451       // Advance as long as character is a WhiteSpace or LineTerminator.
452       // Remember if the latter is the case.
453       if (unicode_cache_->IsLineTerminator(c0_)) {
454         has_line_terminator_before_next_ = true;
455       } else if (!unicode_cache_->IsWhiteSpace(c0_) &&
456                  !IsLittleEndianByteOrderMark(c0_)) {
457         break;
458       }
459       Advance();
460     }
461 
462     // If there is an HTML comment end '-->' at the beginning of a
463     // line (with only whitespace in front of it), we treat the rest
464     // of the line as a comment. This is in line with the way
465     // SpiderMonkey handles it.
466     if (c0_ != '-' || !has_line_terminator_before_next_) break;
467 
468     Advance();
469     if (c0_ != '-') {
470       PushBack('-');  // undo Advance()
471       break;
472     }
473 
474     Advance();
475     if (c0_ != '>') {
476       PushBack2('-', '-');  // undo 2x Advance();
477       break;
478     }
479 
480     // Treat the rest of the line as a comment.
481     SkipSingleLineComment();
482   }
483 
484   // Return whether or not we skipped any characters.
485   return source_pos() != start_position;
486 }
487 
SkipSingleLineComment()488 Token::Value Scanner::SkipSingleLineComment() {
489   Advance();
490 
491   // The line terminator at the end of the line is not considered
492   // to be part of the single-line comment; it is recognized
493   // separately by the lexical grammar and becomes part of the
494   // stream of input elements for the syntactic grammar (see
495   // ECMA-262, section 7.4).
496   while (c0_ != kEndOfInput && !unicode_cache_->IsLineTerminator(c0_)) {
497     Advance();
498   }
499 
500   return Token::WHITESPACE;
501 }
502 
503 
SkipSourceURLComment()504 Token::Value Scanner::SkipSourceURLComment() {
505   TryToParseSourceURLComment();
506   while (c0_ != kEndOfInput && !unicode_cache_->IsLineTerminator(c0_)) {
507     Advance();
508   }
509 
510   return Token::WHITESPACE;
511 }
512 
513 
TryToParseSourceURLComment()514 void Scanner::TryToParseSourceURLComment() {
515   // Magic comments are of the form: //[#@]\s<name>=\s*<value>\s*.* and this
516   // function will just return if it cannot parse a magic comment.
517   if (c0_ == kEndOfInput || !unicode_cache_->IsWhiteSpace(c0_)) return;
518   Advance();
519   LiteralBuffer name;
520   while (c0_ != kEndOfInput &&
521          !unicode_cache_->IsWhiteSpaceOrLineTerminator(c0_) && c0_ != '=') {
522     name.AddChar(c0_);
523     Advance();
524   }
525   if (!name.is_one_byte()) return;
526   Vector<const uint8_t> name_literal = name.one_byte_literal();
527   LiteralBuffer* value;
528   if (name_literal == STATIC_CHAR_VECTOR("sourceURL")) {
529     value = &source_url_;
530   } else if (name_literal == STATIC_CHAR_VECTOR("sourceMappingURL")) {
531     value = &source_mapping_url_;
532   } else {
533     return;
534   }
535   if (c0_ != '=')
536     return;
537   Advance();
538   value->Reset();
539   while (c0_ != kEndOfInput && unicode_cache_->IsWhiteSpace(c0_)) {
540     Advance();
541   }
542   while (c0_ != kEndOfInput && !unicode_cache_->IsLineTerminator(c0_)) {
543     // Disallowed characters.
544     if (c0_ == '"' || c0_ == '\'') {
545       value->Reset();
546       return;
547     }
548     if (unicode_cache_->IsWhiteSpace(c0_)) {
549       break;
550     }
551     value->AddChar(c0_);
552     Advance();
553   }
554   // Allow whitespace at the end.
555   while (c0_ != kEndOfInput && !unicode_cache_->IsLineTerminator(c0_)) {
556     if (!unicode_cache_->IsWhiteSpace(c0_)) {
557       value->Reset();
558       break;
559     }
560     Advance();
561   }
562 }
563 
564 
SkipMultiLineComment()565 Token::Value Scanner::SkipMultiLineComment() {
566   DCHECK(c0_ == '*');
567   Advance();
568 
569   while (c0_ != kEndOfInput) {
570     uc32 ch = c0_;
571     Advance();
572     if (c0_ != kEndOfInput && unicode_cache_->IsLineTerminator(ch)) {
573       // Following ECMA-262, section 7.4, a comment containing
574       // a newline will make the comment count as a line-terminator.
575       has_multiline_comment_before_next_ = true;
576     }
577     // If we have reached the end of the multi-line comment, we
578     // consume the '/' and insert a whitespace. This way all
579     // multi-line comments are treated as whitespace.
580     if (ch == '*' && c0_ == '/') {
581       c0_ = ' ';
582       return Token::WHITESPACE;
583     }
584   }
585 
586   // Unterminated multi-line comment.
587   return Token::ILLEGAL;
588 }
589 
ScanHtmlComment()590 Token::Value Scanner::ScanHtmlComment() {
591   // Check for <!-- comments.
592   DCHECK(c0_ == '!');
593   Advance();
594   if (c0_ != '-') {
595     PushBack('!');  // undo Advance()
596     return Token::LT;
597   }
598 
599   Advance();
600   if (c0_ != '-') {
601     PushBack2('-', '!');  // undo 2x Advance()
602     return Token::LT;
603   }
604 
605   found_html_comment_ = true;
606   return SkipSingleLineComment();
607 }
608 
Scan()609 void Scanner::Scan() {
610   next_.literal_chars = NULL;
611   next_.raw_literal_chars = NULL;
612   Token::Value token;
613   do {
614     // Remember the position of the next token
615     next_.location.beg_pos = source_pos();
616 
617     switch (c0_) {
618       case ' ':
619       case '\t':
620         Advance();
621         token = Token::WHITESPACE;
622         break;
623 
624       case '\n':
625         Advance();
626         has_line_terminator_before_next_ = true;
627         token = Token::WHITESPACE;
628         break;
629 
630       case '"': case '\'':
631         token = ScanString();
632         break;
633 
634       case '<':
635         // < <= << <<= <!--
636         Advance();
637         if (c0_ == '=') {
638           token = Select(Token::LTE);
639         } else if (c0_ == '<') {
640           token = Select('=', Token::ASSIGN_SHL, Token::SHL);
641         } else if (c0_ == '!') {
642           token = ScanHtmlComment();
643         } else {
644           token = Token::LT;
645         }
646         break;
647 
648       case '>':
649         // > >= >> >>= >>> >>>=
650         Advance();
651         if (c0_ == '=') {
652           token = Select(Token::GTE);
653         } else if (c0_ == '>') {
654           // >> >>= >>> >>>=
655           Advance();
656           if (c0_ == '=') {
657             token = Select(Token::ASSIGN_SAR);
658           } else if (c0_ == '>') {
659             token = Select('=', Token::ASSIGN_SHR, Token::SHR);
660           } else {
661             token = Token::SAR;
662           }
663         } else {
664           token = Token::GT;
665         }
666         break;
667 
668       case '=':
669         // = == === =>
670         Advance();
671         if (c0_ == '=') {
672           token = Select('=', Token::EQ_STRICT, Token::EQ);
673         } else if (c0_ == '>') {
674           token = Select(Token::ARROW);
675         } else {
676           token = Token::ASSIGN;
677         }
678         break;
679 
680       case '!':
681         // ! != !==
682         Advance();
683         if (c0_ == '=') {
684           token = Select('=', Token::NE_STRICT, Token::NE);
685         } else {
686           token = Token::NOT;
687         }
688         break;
689 
690       case '+':
691         // + ++ +=
692         Advance();
693         if (c0_ == '+') {
694           token = Select(Token::INC);
695         } else if (c0_ == '=') {
696           token = Select(Token::ASSIGN_ADD);
697         } else {
698           token = Token::ADD;
699         }
700         break;
701 
702       case '-':
703         // - -- --> -=
704         Advance();
705         if (c0_ == '-') {
706           Advance();
707           if (c0_ == '>' && HasAnyLineTerminatorBeforeNext()) {
708             // For compatibility with SpiderMonkey, we skip lines that
709             // start with an HTML comment end '-->'.
710             token = SkipSingleLineComment();
711           } else {
712             token = Token::DEC;
713           }
714         } else if (c0_ == '=') {
715           token = Select(Token::ASSIGN_SUB);
716         } else {
717           token = Token::SUB;
718         }
719         break;
720 
721       case '*':
722         // * *=
723         Advance();
724         if (c0_ == '*') {
725           token = Select('=', Token::ASSIGN_EXP, Token::EXP);
726         } else if (c0_ == '=') {
727           token = Select(Token::ASSIGN_MUL);
728         } else {
729           token = Token::MUL;
730         }
731         break;
732 
733       case '%':
734         // % %=
735         token = Select('=', Token::ASSIGN_MOD, Token::MOD);
736         break;
737 
738       case '/':
739         // /  // /* /=
740         Advance();
741         if (c0_ == '/') {
742           Advance();
743           if (c0_ == '#' || c0_ == '@') {
744             Advance();
745             token = SkipSourceURLComment();
746           } else {
747             PushBack(c0_);
748             token = SkipSingleLineComment();
749           }
750         } else if (c0_ == '*') {
751           token = SkipMultiLineComment();
752         } else if (c0_ == '=') {
753           token = Select(Token::ASSIGN_DIV);
754         } else {
755           token = Token::DIV;
756         }
757         break;
758 
759       case '&':
760         // & && &=
761         Advance();
762         if (c0_ == '&') {
763           token = Select(Token::AND);
764         } else if (c0_ == '=') {
765           token = Select(Token::ASSIGN_BIT_AND);
766         } else {
767           token = Token::BIT_AND;
768         }
769         break;
770 
771       case '|':
772         // | || |=
773         Advance();
774         if (c0_ == '|') {
775           token = Select(Token::OR);
776         } else if (c0_ == '=') {
777           token = Select(Token::ASSIGN_BIT_OR);
778         } else {
779           token = Token::BIT_OR;
780         }
781         break;
782 
783       case '^':
784         // ^ ^=
785         token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
786         break;
787 
788       case '.':
789         // . Number
790         Advance();
791         if (IsDecimalDigit(c0_)) {
792           token = ScanNumber(true);
793         } else {
794           token = Token::PERIOD;
795           if (c0_ == '.') {
796             Advance();
797             if (c0_ == '.') {
798               Advance();
799               token = Token::ELLIPSIS;
800             } else {
801               PushBack('.');
802             }
803           }
804         }
805         break;
806 
807       case ':':
808         token = Select(Token::COLON);
809         break;
810 
811       case ';':
812         token = Select(Token::SEMICOLON);
813         break;
814 
815       case ',':
816         token = Select(Token::COMMA);
817         break;
818 
819       case '(':
820         token = Select(Token::LPAREN);
821         break;
822 
823       case ')':
824         token = Select(Token::RPAREN);
825         break;
826 
827       case '[':
828         token = Select(Token::LBRACK);
829         break;
830 
831       case ']':
832         token = Select(Token::RBRACK);
833         break;
834 
835       case '{':
836         token = Select(Token::LBRACE);
837         break;
838 
839       case '}':
840         token = Select(Token::RBRACE);
841         break;
842 
843       case '?':
844         token = Select(Token::CONDITIONAL);
845         break;
846 
847       case '~':
848         token = Select(Token::BIT_NOT);
849         break;
850 
851       case '`':
852         token = ScanTemplateStart();
853         break;
854 
855       default:
856         if (c0_ == kEndOfInput) {
857           token = Token::EOS;
858         } else if (unicode_cache_->IsIdentifierStart(c0_)) {
859           token = ScanIdentifierOrKeyword();
860         } else if (IsDecimalDigit(c0_)) {
861           token = ScanNumber(false);
862         } else if (SkipWhiteSpace()) {
863           token = Token::WHITESPACE;
864         } else {
865           token = Select(Token::ILLEGAL);
866         }
867         break;
868     }
869 
870     // Continue scanning for tokens as long as we're just skipping
871     // whitespace.
872   } while (token == Token::WHITESPACE);
873 
874   next_.location.end_pos = source_pos();
875   next_.token = token;
876 
877 #ifdef DEBUG
878   SanityCheckTokenDesc(current_);
879   SanityCheckTokenDesc(next_);
880   SanityCheckTokenDesc(next_next_);
881 #endif
882 }
883 
#ifdef DEBUG
// Debug-only consistency check: verifies that |token| carries exactly the
// literal buffers that its token kind calls for.
void Scanner::SanityCheckTokenDesc(const TokenDesc& token) const {
  // Most tokens should not have literal_chars or even raw_literal chars.
  // The rules are:
  // - UNINITIALIZED: we don't care.
  // - TEMPLATE_*: need both literal + raw literal chars.
  // - IDENTIFIERS, STRINGS, etc.: need a literal, but no raw literal.
  // - all others: should have neither.

  switch (token.token) {
    // Literal required, raw literal forbidden.
    case Token::IDENTIFIER:
    case Token::ESCAPED_KEYWORD:
    case Token::ESCAPED_STRICT_RESERVED_WORD:
    case Token::FUTURE_STRICT_RESERVED_WORD:
    case Token::STRING:
    case Token::NUMBER:
    case Token::SMI:
    case Token::REGEXP_LITERAL:
      DCHECK_NOT_NULL(token.literal_chars);
      DCHECK_NULL(token.raw_literal_chars);
      break;

    // Both buffers required.
    case Token::TEMPLATE_SPAN:
    case Token::TEMPLATE_TAIL:
      DCHECK_NOT_NULL(token.raw_literal_chars);
      DCHECK_NOT_NULL(token.literal_chars);
      break;

    case Token::UNINITIALIZED:
      // token.literal_chars & other members might be garbage. That's ok.
      break;

    // Everything else must not carry any literal.
    default:
      DCHECK_NULL(token.literal_chars);
      DCHECK_NULL(token.raw_literal_chars);
      break;
  }
}
#endif  // DEBUG
920 
SeekForward(int pos)921 void Scanner::SeekForward(int pos) {
922   // After this call, we will have the token at the given position as
923   // the "next" token. The "current" token will be invalid.
924   if (pos == next_.location.beg_pos) return;
925   int current_pos = source_pos();
926   DCHECK_EQ(next_.location.end_pos, current_pos);
927   // Positions inside the lookahead token aren't supported.
928   DCHECK(pos >= current_pos);
929   if (pos != current_pos) {
930     source_->Seek(pos);
931     Advance();
932     // This function is only called to seek to the location
933     // of the end of a function (at the "}" token). It doesn't matter
934     // whether there was a line terminator in the part we skip.
935     has_line_terminator_before_next_ = false;
936     has_multiline_comment_before_next_ = false;
937   }
938   Scan();
939 }
940 
941 
942 template <bool capture_raw, bool in_template_literal>
ScanEscape()943 bool Scanner::ScanEscape() {
944   uc32 c = c0_;
945   Advance<capture_raw>();
946 
947   // Skip escaped newlines.
948   if (!in_template_literal && c0_ != kEndOfInput &&
949       unicode_cache_->IsLineTerminator(c)) {
950     // Allow CR+LF newlines in multiline string literals.
951     if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance<capture_raw>();
952     // Allow LF+CR newlines in multiline string literals.
953     if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance<capture_raw>();
954     return true;
955   }
956 
957   switch (c) {
958     case '\'':  // fall through
959     case '"' :  // fall through
960     case '\\': break;
961     case 'b' : c = '\b'; break;
962     case 'f' : c = '\f'; break;
963     case 'n' : c = '\n'; break;
964     case 'r' : c = '\r'; break;
965     case 't' : c = '\t'; break;
966     case 'u' : {
967       c = ScanUnicodeEscape<capture_raw>();
968       if (c < 0) return false;
969       break;
970     }
971     case 'v':
972       c = '\v';
973       break;
974     case 'x': {
975       c = ScanHexNumber<capture_raw>(2);
976       if (c < 0) return false;
977       break;
978     }
979     case '0':  // Fall through.
980     case '1':  // fall through
981     case '2':  // fall through
982     case '3':  // fall through
983     case '4':  // fall through
984     case '5':  // fall through
985     case '6':  // fall through
986     case '7':
987       c = ScanOctalEscape<capture_raw>(c, 2);
988       break;
989   }
990 
991   // Other escaped characters are interpreted as their non-escaped version.
992   AddLiteralChar(c);
993   return true;
994 }
995 
996 
997 template <bool capture_raw>
ScanOctalEscape(uc32 c,int length)998 uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
999   uc32 x = c - '0';
1000   int i = 0;
1001   for (; i < length; i++) {
1002     int d = c0_ - '0';
1003     if (d < 0 || d > 7) break;
1004     int nx = x * 8 + d;
1005     if (nx >= 256) break;
1006     x = nx;
1007     Advance<capture_raw>();
1008   }
1009   // Anything except '\0' is an octal escape sequence, illegal in strict mode.
1010   // Remember the position of octal escape sequences so that an error
1011   // can be reported later (in strict mode).
1012   // We don't report the error immediately, because the octal escape can
1013   // occur before the "use strict" directive.
1014   if (c != '0' || i > 0) {
1015     octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
1016     octal_message_ = MessageTemplate::kStrictOctalEscape;
1017   }
1018   return x;
1019 }
1020 
1021 
ScanString()1022 Token::Value Scanner::ScanString() {
1023   uc32 quote = c0_;
1024   Advance<false, false>();  // consume quote
1025 
1026   LiteralScope literal(this);
1027   while (true) {
1028     if (c0_ > kMaxAscii) {
1029       HandleLeadSurrogate();
1030       break;
1031     }
1032     if (c0_ == kEndOfInput || c0_ == '\n' || c0_ == '\r') return Token::ILLEGAL;
1033     if (c0_ == quote) {
1034       literal.Complete();
1035       Advance<false, false>();
1036       return Token::STRING;
1037     }
1038     char c = static_cast<char>(c0_);
1039     if (c == '\\') break;
1040     Advance<false, false>();
1041     AddLiteralChar(c);
1042   }
1043 
1044   while (c0_ != quote && c0_ != kEndOfInput &&
1045          !unicode_cache_->IsLineTerminator(c0_)) {
1046     uc32 c = c0_;
1047     Advance();
1048     if (c == '\\') {
1049       if (c0_ == kEndOfInput || !ScanEscape<false, false>()) {
1050         return Token::ILLEGAL;
1051       }
1052     } else {
1053       AddLiteralChar(c);
1054     }
1055   }
1056   if (c0_ != quote) return Token::ILLEGAL;
1057   literal.Complete();
1058 
1059   Advance();  // consume quote
1060   return Token::STRING;
1061 }
1062 
1063 
ScanTemplateSpan()1064 Token::Value Scanner::ScanTemplateSpan() {
1065   // When scanning a TemplateSpan, we are looking for the following construct:
1066   // TEMPLATE_SPAN ::
1067   //     ` LiteralChars* ${
1068   //   | } LiteralChars* ${
1069   //
1070   // TEMPLATE_TAIL ::
1071   //     ` LiteralChars* `
1072   //   | } LiteralChar* `
1073   //
1074   // A TEMPLATE_SPAN should always be followed by an Expression, while a
1075   // TEMPLATE_TAIL terminates a TemplateLiteral and does not need to be
1076   // followed by an Expression.
1077 
1078   // These scoped helpers save and restore the original error state, so that we
1079   // can specially treat invalid escape sequences in templates (which are
1080   // handled by the parser).
1081   ErrorState scanner_error_state(&scanner_error_, &scanner_error_location_);
1082   ErrorState octal_error_state(&octal_message_, &octal_pos_);
1083 
1084   Token::Value result = Token::TEMPLATE_SPAN;
1085   LiteralScope literal(this);
1086   StartRawLiteral();
1087   const bool capture_raw = true;
1088   const bool in_template_literal = true;
1089   while (true) {
1090     uc32 c = c0_;
1091     Advance<capture_raw>();
1092     if (c == '`') {
1093       result = Token::TEMPLATE_TAIL;
1094       ReduceRawLiteralLength(1);
1095       break;
1096     } else if (c == '$' && c0_ == '{') {
1097       Advance<capture_raw>();  // Consume '{'
1098       ReduceRawLiteralLength(2);
1099       break;
1100     } else if (c == '\\') {
1101       if (c0_ != kEndOfInput && unicode_cache_->IsLineTerminator(c0_)) {
1102         // The TV of LineContinuation :: \ LineTerminatorSequence is the empty
1103         // code unit sequence.
1104         uc32 lastChar = c0_;
1105         Advance<capture_raw>();
1106         if (lastChar == '\r') {
1107           ReduceRawLiteralLength(1);  // Remove \r
1108           if (c0_ == '\n') {
1109             Advance<capture_raw>();  // Adds \n
1110           } else {
1111             AddRawLiteralChar('\n');
1112           }
1113         }
1114       } else {
1115         bool success = ScanEscape<capture_raw, in_template_literal>();
1116         USE(success);
1117         DCHECK_EQ(!success, has_error());
1118         // For templates, invalid escape sequence checking is handled in the
1119         // parser.
1120         scanner_error_state.MoveErrorTo(&invalid_template_escape_message_,
1121                                         &invalid_template_escape_location_);
1122         octal_error_state.MoveErrorTo(&invalid_template_escape_message_,
1123                                       &invalid_template_escape_location_);
1124       }
1125     } else if (c < 0) {
1126       // Unterminated template literal
1127       PushBack(c);
1128       break;
1129     } else {
1130       // The TRV of LineTerminatorSequence :: <CR> is the CV 0x000A.
1131       // The TRV of LineTerminatorSequence :: <CR><LF> is the sequence
1132       // consisting of the CV 0x000A.
1133       if (c == '\r') {
1134         ReduceRawLiteralLength(1);  // Remove \r
1135         if (c0_ == '\n') {
1136           Advance<capture_raw>();  // Adds \n
1137         } else {
1138           AddRawLiteralChar('\n');
1139         }
1140         c = '\n';
1141       }
1142       AddLiteralChar(c);
1143     }
1144   }
1145   literal.Complete();
1146   next_.location.end_pos = source_pos();
1147   next_.token = result;
1148 
1149   return result;
1150 }
1151 
1152 
ScanTemplateStart()1153 Token::Value Scanner::ScanTemplateStart() {
1154   DCHECK(next_next_.token == Token::UNINITIALIZED);
1155   DCHECK(c0_ == '`');
1156   next_.location.beg_pos = source_pos();
1157   Advance();  // Consume `
1158   return ScanTemplateSpan();
1159 }
1160 
1161 
ScanTemplateContinuation()1162 Token::Value Scanner::ScanTemplateContinuation() {
1163   DCHECK_EQ(next_.token, Token::RBRACE);
1164   next_.location.beg_pos = source_pos() - 1;  // We already consumed }
1165   return ScanTemplateSpan();
1166 }
1167 
1168 
ScanDecimalDigits()1169 void Scanner::ScanDecimalDigits() {
1170   while (IsDecimalDigit(c0_))
1171     AddLiteralCharAdvance();
1172 }
1173 
1174 
ScanNumber(bool seen_period)1175 Token::Value Scanner::ScanNumber(bool seen_period) {
1176   DCHECK(IsDecimalDigit(c0_));  // the first digit of the number or the fraction
1177 
1178   enum {
1179     DECIMAL,
1180     DECIMAL_WITH_LEADING_ZERO,
1181     HEX,
1182     OCTAL,
1183     IMPLICIT_OCTAL,
1184     BINARY
1185   } kind = DECIMAL;
1186 
1187   LiteralScope literal(this);
1188   bool at_start = !seen_period;
1189   int start_pos = source_pos();  // For reporting octal positions.
1190   if (seen_period) {
1191     // we have already seen a decimal point of the float
1192     AddLiteralChar('.');
1193     ScanDecimalDigits();  // we know we have at least one digit
1194 
1195   } else {
1196     // if the first character is '0' we must check for octals and hex
1197     if (c0_ == '0') {
1198       AddLiteralCharAdvance();
1199 
1200       // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or
1201       // an octal number.
1202       if (c0_ == 'x' || c0_ == 'X') {
1203         // hex number
1204         kind = HEX;
1205         AddLiteralCharAdvance();
1206         if (!IsHexDigit(c0_)) {
1207           // we must have at least one hex digit after 'x'/'X'
1208           return Token::ILLEGAL;
1209         }
1210         while (IsHexDigit(c0_)) {
1211           AddLiteralCharAdvance();
1212         }
1213       } else if (c0_ == 'o' || c0_ == 'O') {
1214         kind = OCTAL;
1215         AddLiteralCharAdvance();
1216         if (!IsOctalDigit(c0_)) {
1217           // we must have at least one octal digit after 'o'/'O'
1218           return Token::ILLEGAL;
1219         }
1220         while (IsOctalDigit(c0_)) {
1221           AddLiteralCharAdvance();
1222         }
1223       } else if (c0_ == 'b' || c0_ == 'B') {
1224         kind = BINARY;
1225         AddLiteralCharAdvance();
1226         if (!IsBinaryDigit(c0_)) {
1227           // we must have at least one binary digit after 'b'/'B'
1228           return Token::ILLEGAL;
1229         }
1230         while (IsBinaryDigit(c0_)) {
1231           AddLiteralCharAdvance();
1232         }
1233       } else if ('0' <= c0_ && c0_ <= '7') {
1234         // (possible) octal number
1235         kind = IMPLICIT_OCTAL;
1236         while (true) {
1237           if (c0_ == '8' || c0_ == '9') {
1238             at_start = false;
1239             kind = DECIMAL_WITH_LEADING_ZERO;
1240             break;
1241           }
1242           if (c0_  < '0' || '7'  < c0_) {
1243             // Octal literal finished.
1244             octal_pos_ = Location(start_pos, source_pos());
1245             octal_message_ = MessageTemplate::kStrictOctalLiteral;
1246             break;
1247           }
1248           AddLiteralCharAdvance();
1249         }
1250       } else if (c0_ == '8' || c0_ == '9') {
1251         kind = DECIMAL_WITH_LEADING_ZERO;
1252       }
1253     }
1254 
1255     // Parse decimal digits and allow trailing fractional part.
1256     if (kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO) {
1257       if (at_start) {
1258         uint64_t value = 0;
1259         while (IsDecimalDigit(c0_)) {
1260           value = 10 * value + (c0_ - '0');
1261 
1262           uc32 first_char = c0_;
1263           Advance<false, false>();
1264           AddLiteralChar(first_char);
1265         }
1266 
1267         if (next_.literal_chars->one_byte_literal().length() <= 10 &&
1268             value <= Smi::kMaxValue && c0_ != '.' &&
1269             (c0_ == kEndOfInput || !unicode_cache_->IsIdentifierStart(c0_))) {
1270           next_.smi_value_ = static_cast<uint32_t>(value);
1271           literal.Complete();
1272           HandleLeadSurrogate();
1273 
1274           if (kind == DECIMAL_WITH_LEADING_ZERO) {
1275             octal_pos_ = Location(start_pos, source_pos());
1276             octal_message_ = MessageTemplate::kStrictDecimalWithLeadingZero;
1277           }
1278           return Token::SMI;
1279         }
1280         HandleLeadSurrogate();
1281       }
1282 
1283       ScanDecimalDigits();  // optional
1284       if (c0_ == '.') {
1285         AddLiteralCharAdvance();
1286         ScanDecimalDigits();  // optional
1287       }
1288     }
1289   }
1290 
1291   // scan exponent, if any
1292   if (c0_ == 'e' || c0_ == 'E') {
1293     DCHECK(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
1294     if (!(kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO))
1295       return Token::ILLEGAL;
1296     // scan exponent
1297     AddLiteralCharAdvance();
1298     if (c0_ == '+' || c0_ == '-')
1299       AddLiteralCharAdvance();
1300     if (!IsDecimalDigit(c0_)) {
1301       // we must have at least one decimal digit after 'e'/'E'
1302       return Token::ILLEGAL;
1303     }
1304     ScanDecimalDigits();
1305   }
1306 
1307   // The source character immediately following a numeric literal must
1308   // not be an identifier start or a decimal digit; see ECMA-262
1309   // section 7.8.3, page 17 (note that we read only one decimal digit
1310   // if the value is 0).
1311   if (IsDecimalDigit(c0_) ||
1312       (c0_ != kEndOfInput && unicode_cache_->IsIdentifierStart(c0_)))
1313     return Token::ILLEGAL;
1314 
1315   literal.Complete();
1316 
1317   if (kind == DECIMAL_WITH_LEADING_ZERO) {
1318     octal_pos_ = Location(start_pos, source_pos());
1319     octal_message_ = MessageTemplate::kStrictDecimalWithLeadingZero;
1320   }
1321   return Token::NUMBER;
1322 }
1323 
1324 
ScanIdentifierUnicodeEscape()1325 uc32 Scanner::ScanIdentifierUnicodeEscape() {
1326   Advance();
1327   if (c0_ != 'u') return -1;
1328   Advance();
1329   return ScanUnicodeEscape<false>();
1330 }
1331 
1332 
1333 template <bool capture_raw>
ScanUnicodeEscape()1334 uc32 Scanner::ScanUnicodeEscape() {
1335   // Accept both \uxxxx and \u{xxxxxx}. In the latter case, the number of
1336   // hex digits between { } is arbitrary. \ and u have already been read.
1337   if (c0_ == '{') {
1338     int begin = source_pos() - 2;
1339     Advance<capture_raw>();
1340     uc32 cp = ScanUnlimitedLengthHexNumber<capture_raw>(0x10ffff, begin);
1341     if (cp < 0 || c0_ != '}') {
1342       ReportScannerError(source_pos(),
1343                          MessageTemplate::kInvalidUnicodeEscapeSequence);
1344       return -1;
1345     }
1346     Advance<capture_raw>();
1347     return cp;
1348   }
1349   const bool unicode = true;
1350   return ScanHexNumber<capture_raw, unicode>(4);
1351 }
1352 
1353 
1354 // ----------------------------------------------------------------------------
1355 // Keyword Matcher
1356 
1357 #define KEYWORDS(KEYWORD_GROUP, KEYWORD)                    \
1358   KEYWORD_GROUP('a')                                        \
1359   KEYWORD("async", Token::ASYNC)                            \
1360   KEYWORD("await", Token::AWAIT)                            \
1361   KEYWORD_GROUP('b')                                        \
1362   KEYWORD("break", Token::BREAK)                            \
1363   KEYWORD_GROUP('c')                                        \
1364   KEYWORD("case", Token::CASE)                              \
1365   KEYWORD("catch", Token::CATCH)                            \
1366   KEYWORD("class", Token::CLASS)                            \
1367   KEYWORD("const", Token::CONST)                            \
1368   KEYWORD("continue", Token::CONTINUE)                      \
1369   KEYWORD_GROUP('d')                                        \
1370   KEYWORD("debugger", Token::DEBUGGER)                      \
1371   KEYWORD("default", Token::DEFAULT)                        \
1372   KEYWORD("delete", Token::DELETE)                          \
1373   KEYWORD("do", Token::DO)                                  \
1374   KEYWORD_GROUP('e')                                        \
1375   KEYWORD("else", Token::ELSE)                              \
1376   KEYWORD("enum", Token::ENUM)                              \
1377   KEYWORD("export", Token::EXPORT)                          \
1378   KEYWORD("extends", Token::EXTENDS)                        \
1379   KEYWORD_GROUP('f')                                        \
1380   KEYWORD("false", Token::FALSE_LITERAL)                    \
1381   KEYWORD("finally", Token::FINALLY)                        \
1382   KEYWORD("for", Token::FOR)                                \
1383   KEYWORD("function", Token::FUNCTION)                      \
1384   KEYWORD_GROUP('i')                                        \
1385   KEYWORD("if", Token::IF)                                  \
1386   KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \
1387   KEYWORD("import", Token::IMPORT)                          \
1388   KEYWORD("in", Token::IN)                                  \
1389   KEYWORD("instanceof", Token::INSTANCEOF)                  \
1390   KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD)  \
1391   KEYWORD_GROUP('l')                                        \
1392   KEYWORD("let", Token::LET)                                \
1393   KEYWORD_GROUP('n')                                        \
1394   KEYWORD("new", Token::NEW)                                \
1395   KEYWORD("null", Token::NULL_LITERAL)                      \
1396   KEYWORD_GROUP('p')                                        \
1397   KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD)    \
1398   KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD)    \
1399   KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD)  \
1400   KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD)     \
1401   KEYWORD_GROUP('r')                                        \
1402   KEYWORD("return", Token::RETURN)                          \
1403   KEYWORD_GROUP('s')                                        \
1404   KEYWORD("static", Token::STATIC)                          \
1405   KEYWORD("super", Token::SUPER)                            \
1406   KEYWORD("switch", Token::SWITCH)                          \
1407   KEYWORD_GROUP('t')                                        \
1408   KEYWORD("this", Token::THIS)                              \
1409   KEYWORD("throw", Token::THROW)                            \
1410   KEYWORD("true", Token::TRUE_LITERAL)                      \
1411   KEYWORD("try", Token::TRY)                                \
1412   KEYWORD("typeof", Token::TYPEOF)                          \
1413   KEYWORD_GROUP('v')                                        \
1414   KEYWORD("var", Token::VAR)                                \
1415   KEYWORD("void", Token::VOID)                              \
1416   KEYWORD_GROUP('w')                                        \
1417   KEYWORD("while", Token::WHILE)                            \
1418   KEYWORD("with", Token::WITH)                              \
1419   KEYWORD_GROUP('y')                                        \
1420   KEYWORD("yield", Token::YIELD)
1421 
KeywordOrIdentifierToken(const uint8_t * input,int input_length)1422 static Token::Value KeywordOrIdentifierToken(const uint8_t* input,
1423                                              int input_length) {
1424   DCHECK(input_length >= 1);
1425   const int kMinLength = 2;
1426   const int kMaxLength = 10;
1427   if (input_length < kMinLength || input_length > kMaxLength) {
1428     return Token::IDENTIFIER;
1429   }
1430   switch (input[0]) {
1431     default:
1432 #define KEYWORD_GROUP_CASE(ch)                                \
1433       break;                                                  \
1434     case ch:
1435 #define KEYWORD(keyword, token)                                     \
1436   {                                                                 \
1437     /* 'keyword' is a char array, so sizeof(keyword) is */          \
1438     /* strlen(keyword) plus 1 for the NUL char. */                  \
1439     const int keyword_length = sizeof(keyword) - 1;                 \
1440     STATIC_ASSERT(keyword_length >= kMinLength);                    \
1441     STATIC_ASSERT(keyword_length <= kMaxLength);                    \
1442     if (input_length == keyword_length && input[1] == keyword[1] && \
1443         (keyword_length <= 2 || input[2] == keyword[2]) &&          \
1444         (keyword_length <= 3 || input[3] == keyword[3]) &&          \
1445         (keyword_length <= 4 || input[4] == keyword[4]) &&          \
1446         (keyword_length <= 5 || input[5] == keyword[5]) &&          \
1447         (keyword_length <= 6 || input[6] == keyword[6]) &&          \
1448         (keyword_length <= 7 || input[7] == keyword[7]) &&          \
1449         (keyword_length <= 8 || input[8] == keyword[8]) &&          \
1450         (keyword_length <= 9 || input[9] == keyword[9])) {          \
1451       return token;                                                 \
1452     }                                                               \
1453   }
1454     KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
1455   }
1456   return Token::IDENTIFIER;
1457 }
1458 
1459 
ScanIdentifierOrKeyword()1460 Token::Value Scanner::ScanIdentifierOrKeyword() {
1461   DCHECK(unicode_cache_->IsIdentifierStart(c0_));
1462   LiteralScope literal(this);
1463   if (IsInRange(c0_, 'a', 'z')) {
1464     do {
1465       char first_char = static_cast<char>(c0_);
1466       Advance<false, false>();
1467       AddLiteralChar(first_char);
1468     } while (IsInRange(c0_, 'a', 'z'));
1469 
1470     if (IsDecimalDigit(c0_) || IsInRange(c0_, 'A', 'Z') || c0_ == '_' ||
1471         c0_ == '$') {
1472       // Identifier starting with lowercase.
1473       char first_char = static_cast<char>(c0_);
1474       Advance<false, false>();
1475       AddLiteralChar(first_char);
1476       while (IsAsciiIdentifier(c0_)) {
1477         char first_char = static_cast<char>(c0_);
1478         Advance<false, false>();
1479         AddLiteralChar(first_char);
1480       }
1481       if (c0_ <= kMaxAscii && c0_ != '\\') {
1482         literal.Complete();
1483         return Token::IDENTIFIER;
1484       }
1485     } else if (c0_ <= kMaxAscii && c0_ != '\\') {
1486       // Only a-z+: could be a keyword or identifier.
1487       Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
1488       Token::Value token =
1489           KeywordOrIdentifierToken(chars.start(), chars.length());
1490       if (token == Token::IDENTIFIER ||
1491           token == Token::FUTURE_STRICT_RESERVED_WORD)
1492         literal.Complete();
1493       return token;
1494     }
1495 
1496     HandleLeadSurrogate();
1497   } else if (IsInRange(c0_, 'A', 'Z') || c0_ == '_' || c0_ == '$') {
1498     do {
1499       char first_char = static_cast<char>(c0_);
1500       Advance<false, false>();
1501       AddLiteralChar(first_char);
1502     } while (IsAsciiIdentifier(c0_));
1503 
1504     if (c0_ <= kMaxAscii && c0_ != '\\') {
1505       literal.Complete();
1506       return Token::IDENTIFIER;
1507     }
1508 
1509     HandleLeadSurrogate();
1510   } else if (c0_ == '\\') {
1511     // Scan identifier start character.
1512     uc32 c = ScanIdentifierUnicodeEscape();
1513     // Only allow legal identifier start characters.
1514     if (c < 0 ||
1515         c == '\\' ||  // No recursive escapes.
1516         !unicode_cache_->IsIdentifierStart(c)) {
1517       return Token::ILLEGAL;
1518     }
1519     AddLiteralChar(c);
1520     return ScanIdentifierSuffix(&literal, true);
1521   } else {
1522     uc32 first_char = c0_;
1523     Advance();
1524     AddLiteralChar(first_char);
1525   }
1526 
1527   // Scan the rest of the identifier characters.
1528   while (c0_ != kEndOfInput && unicode_cache_->IsIdentifierPart(c0_)) {
1529     if (c0_ != '\\') {
1530       uc32 next_char = c0_;
1531       Advance();
1532       AddLiteralChar(next_char);
1533       continue;
1534     }
1535     // Fallthrough if no longer able to complete keyword.
1536     return ScanIdentifierSuffix(&literal, false);
1537   }
1538 
1539   if (next_.literal_chars->is_one_byte()) {
1540     Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
1541     Token::Value token =
1542         KeywordOrIdentifierToken(chars.start(), chars.length());
1543     if (token == Token::IDENTIFIER ||
1544         token == Token::FUTURE_STRICT_RESERVED_WORD)
1545       literal.Complete();
1546     return token;
1547   }
1548   literal.Complete();
1549   return Token::IDENTIFIER;
1550 }
1551 
1552 
ScanIdentifierSuffix(LiteralScope * literal,bool escaped)1553 Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal,
1554                                            bool escaped) {
1555   // Scan the rest of the identifier characters.
1556   while (c0_ != kEndOfInput && unicode_cache_->IsIdentifierPart(c0_)) {
1557     if (c0_ == '\\') {
1558       uc32 c = ScanIdentifierUnicodeEscape();
1559       escaped = true;
1560       // Only allow legal identifier part characters.
1561       if (c < 0 ||
1562           c == '\\' ||
1563           !unicode_cache_->IsIdentifierPart(c)) {
1564         return Token::ILLEGAL;
1565       }
1566       AddLiteralChar(c);
1567     } else {
1568       AddLiteralChar(c0_);
1569       Advance();
1570     }
1571   }
1572   literal->Complete();
1573 
1574   if (escaped && next_.literal_chars->is_one_byte()) {
1575     Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
1576     Token::Value token =
1577         KeywordOrIdentifierToken(chars.start(), chars.length());
1578     /* TODO(adamk): YIELD should be handled specially. */
1579     if (token == Token::IDENTIFIER) {
1580       return Token::IDENTIFIER;
1581     } else if (token == Token::FUTURE_STRICT_RESERVED_WORD ||
1582                token == Token::LET || token == Token::STATIC) {
1583       return Token::ESCAPED_STRICT_RESERVED_WORD;
1584     } else {
1585       return Token::ESCAPED_KEYWORD;
1586     }
1587   }
1588   return Token::IDENTIFIER;
1589 }
1590 
ScanRegExpPattern()1591 bool Scanner::ScanRegExpPattern() {
1592   DCHECK(next_next_.token == Token::UNINITIALIZED);
1593   DCHECK(next_.token == Token::DIV || next_.token == Token::ASSIGN_DIV);
1594 
1595   // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
1596   bool in_character_class = false;
1597   bool seen_equal = (next_.token == Token::ASSIGN_DIV);
1598 
1599   // Previous token is either '/' or '/=', in the second case, the
1600   // pattern starts at =.
1601   next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
1602   next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
1603 
1604   // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
1605   // the scanner should pass uninterpreted bodies to the RegExp
1606   // constructor.
1607   LiteralScope literal(this);
1608   if (seen_equal) {
1609     AddLiteralChar('=');
1610   }
1611 
1612   while (c0_ != '/' || in_character_class) {
1613     if (c0_ == kEndOfInput || unicode_cache_->IsLineTerminator(c0_))
1614       return false;
1615     if (c0_ == '\\') {  // Escape sequence.
1616       AddLiteralCharAdvance();
1617       if (c0_ == kEndOfInput || unicode_cache_->IsLineTerminator(c0_))
1618         return false;
1619       AddLiteralCharAdvance();
1620       // If the escape allows more characters, i.e., \x??, \u????, or \c?,
1621       // only "safe" characters are allowed (letters, digits, underscore),
1622       // otherwise the escape isn't valid and the invalid character has
1623       // its normal meaning. I.e., we can just continue scanning without
1624       // worrying whether the following characters are part of the escape
1625       // or not, since any '/', '\\' or '[' is guaranteed to not be part
1626       // of the escape sequence.
1627 
1628       // TODO(896): At some point, parse RegExps more throughly to capture
1629       // octal esacpes in strict mode.
1630     } else {  // Unescaped character.
1631       if (c0_ == '[') in_character_class = true;
1632       if (c0_ == ']') in_character_class = false;
1633       AddLiteralCharAdvance();
1634     }
1635   }
1636   Advance();  // consume '/'
1637 
1638   literal.Complete();
1639   next_.token = Token::REGEXP_LITERAL;
1640   return true;
1641 }
1642 
1643 
ScanRegExpFlags()1644 Maybe<RegExp::Flags> Scanner::ScanRegExpFlags() {
1645   DCHECK(next_.token == Token::REGEXP_LITERAL);
1646 
1647   // Scan regular expression flags.
1648   int flags = 0;
1649   while (c0_ != kEndOfInput && unicode_cache_->IsIdentifierPart(c0_)) {
1650     RegExp::Flags flag = RegExp::kNone;
1651     switch (c0_) {
1652       case 'g':
1653         flag = RegExp::kGlobal;
1654         break;
1655       case 'i':
1656         flag = RegExp::kIgnoreCase;
1657         break;
1658       case 'm':
1659         flag = RegExp::kMultiline;
1660         break;
1661       case 'u':
1662         flag = RegExp::kUnicode;
1663         break;
1664       case 'y':
1665         flag = RegExp::kSticky;
1666         break;
1667       default:
1668         return Nothing<RegExp::Flags>();
1669     }
1670     if (flags & flag) {
1671       return Nothing<RegExp::Flags>();
1672     }
1673     Advance();
1674     flags |= flag;
1675   }
1676 
1677   next_.location.end_pos = source_pos();
1678   return Just(RegExp::Flags(flags));
1679 }
1680 
1681 
CurrentSymbol(AstValueFactory * ast_value_factory)1682 const AstRawString* Scanner::CurrentSymbol(AstValueFactory* ast_value_factory) {
1683   if (is_literal_one_byte()) {
1684     return ast_value_factory->GetOneByteString(literal_one_byte_string());
1685   }
1686   return ast_value_factory->GetTwoByteString(literal_two_byte_string());
1687 }
1688 
1689 
NextSymbol(AstValueFactory * ast_value_factory)1690 const AstRawString* Scanner::NextSymbol(AstValueFactory* ast_value_factory) {
1691   if (is_next_literal_one_byte()) {
1692     return ast_value_factory->GetOneByteString(next_literal_one_byte_string());
1693   }
1694   return ast_value_factory->GetTwoByteString(next_literal_two_byte_string());
1695 }
1696 
1697 
CurrentRawSymbol(AstValueFactory * ast_value_factory)1698 const AstRawString* Scanner::CurrentRawSymbol(
1699     AstValueFactory* ast_value_factory) {
1700   if (is_raw_literal_one_byte()) {
1701     return ast_value_factory->GetOneByteString(raw_literal_one_byte_string());
1702   }
1703   return ast_value_factory->GetTwoByteString(raw_literal_two_byte_string());
1704 }
1705 
1706 
DoubleValue()1707 double Scanner::DoubleValue() {
1708   DCHECK(is_literal_one_byte());
1709   return StringToDouble(
1710       unicode_cache_,
1711       literal_one_byte_string(),
1712       ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY);
1713 }
1714 
1715 
ContainsDot()1716 bool Scanner::ContainsDot() {
1717   DCHECK(is_literal_one_byte());
1718   Vector<const uint8_t> str = literal_one_byte_string();
1719   return std::find(str.begin(), str.end(), '.') != str.end();
1720 }
1721 
FindSymbol(DuplicateFinder * finder)1722 bool Scanner::FindSymbol(DuplicateFinder* finder) {
1723   // TODO(vogelheim): Move this logic into the calling class; this can be fully
1724   //                  implemented using the public interface.
1725   if (is_literal_one_byte()) {
1726     return finder->AddOneByteSymbol(literal_one_byte_string());
1727   }
1728   return finder->AddTwoByteSymbol(literal_two_byte_string());
1729 }
1730 
SeekNext(size_t position)1731 void Scanner::SeekNext(size_t position) {
1732   // Use with care: This cleanly resets most, but not all scanner state.
1733   // TODO(vogelheim): Fix this, or at least DCHECK the relevant conditions.
1734 
1735   // To re-scan from a given character position, we need to:
1736   // 1, Reset the current_, next_ and next_next_ tokens
1737   //    (next_ + next_next_ will be overwrittem by Next(),
1738   //     current_ will remain unchanged, so overwrite it fully.)
1739   current_ = {{0, 0}, nullptr, nullptr, 0, Token::UNINITIALIZED};
1740   next_.token = Token::UNINITIALIZED;
1741   next_next_.token = Token::UNINITIALIZED;
1742   // 2, reset the source to the desired position,
1743   source_->Seek(position);
1744   // 3, re-scan, by scanning the look-ahead char + 1 token (next_).
1745   c0_ = source_->Advance();
1746   Next();
1747   DCHECK_EQ(next_.location.beg_pos, static_cast<int>(position));
1748 }
1749 
1750 }  // namespace internal
1751 }  // namespace v8
1752