• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Features shared by parsing and pre-parsing scanners.
6 
7 #include "src/parsing/scanner.h"
8 
9 #include <stdint.h>
10 
11 #include <cmath>
12 
13 #include "src/ast/ast-value-factory.h"
14 #include "src/char-predicates-inl.h"
15 #include "src/conversions-inl.h"
16 #include "src/list-inl.h"
17 #include "src/parsing/parser.h"
18 
19 namespace v8 {
20 namespace internal {
21 
22 
Internalize(Isolate * isolate) const23 Handle<String> LiteralBuffer::Internalize(Isolate* isolate) const {
24   if (is_one_byte()) {
25     return isolate->factory()->InternalizeOneByteString(one_byte_literal());
26   }
27   return isolate->factory()->InternalizeTwoByteString(two_byte_literal());
28 }
29 
30 
31 // Default implementation for streams that do not support bookmarks.
SetBookmark()32 bool Utf16CharacterStream::SetBookmark() { return false; }
ResetToBookmark()33 void Utf16CharacterStream::ResetToBookmark() { UNREACHABLE(); }
34 
35 
36 // ----------------------------------------------------------------------------
37 // Scanner
38 
Scanner(UnicodeCache * unicode_cache)39 Scanner::Scanner(UnicodeCache* unicode_cache)
40     : unicode_cache_(unicode_cache),
41       bookmark_c0_(kNoBookmark),
42       octal_pos_(Location::invalid()),
43       decimal_with_leading_zero_pos_(Location::invalid()),
44       found_html_comment_(false),
45       allow_harmony_exponentiation_operator_(false) {
46   bookmark_current_.literal_chars = &bookmark_current_literal_;
47   bookmark_current_.raw_literal_chars = &bookmark_current_raw_literal_;
48   bookmark_next_.literal_chars = &bookmark_next_literal_;
49   bookmark_next_.raw_literal_chars = &bookmark_next_raw_literal_;
50 }
51 
52 
Initialize(Utf16CharacterStream * source)53 void Scanner::Initialize(Utf16CharacterStream* source) {
54   source_ = source;
55   // Need to capture identifiers in order to recognize "get" and "set"
56   // in object literals.
57   Init();
58   // Skip initial whitespace allowing HTML comment ends just like
59   // after a newline and scan first token.
60   has_line_terminator_before_next_ = true;
61   SkipWhiteSpace();
62   Scan();
63 }
64 
65 template <bool capture_raw, bool unicode>
ScanHexNumber(int expected_length)66 uc32 Scanner::ScanHexNumber(int expected_length) {
67   DCHECK(expected_length <= 4);  // prevent overflow
68 
69   int begin = source_pos() - 2;
70   uc32 x = 0;
71   for (int i = 0; i < expected_length; i++) {
72     int d = HexValue(c0_);
73     if (d < 0) {
74       ReportScannerError(Location(begin, begin + expected_length + 2),
75                          unicode
76                              ? MessageTemplate::kInvalidUnicodeEscapeSequence
77                              : MessageTemplate::kInvalidHexEscapeSequence);
78       return -1;
79     }
80     x = x * 16 + d;
81     Advance<capture_raw>();
82   }
83 
84   return x;
85 }
86 
87 template <bool capture_raw>
ScanUnlimitedLengthHexNumber(int max_value,int beg_pos)88 uc32 Scanner::ScanUnlimitedLengthHexNumber(int max_value, int beg_pos) {
89   uc32 x = 0;
90   int d = HexValue(c0_);
91   if (d < 0) return -1;
92 
93   while (d >= 0) {
94     x = x * 16 + d;
95     if (x > max_value) {
96       ReportScannerError(Location(beg_pos, source_pos() + 1),
97                          MessageTemplate::kUndefinedUnicodeCodePoint);
98       return -1;
99     }
100     Advance<capture_raw>();
101     d = HexValue(c0_);
102   }
103 
104   return x;
105 }
106 
107 
108 // Ensure that tokens can be stored in a byte.
109 STATIC_ASSERT(Token::NUM_TOKENS <= 0x100);
110 
111 // Table of one-character tokens, by character (0x00..0x7f only).
112 static const byte one_char_tokens[] = {
113   Token::ILLEGAL,
114   Token::ILLEGAL,
115   Token::ILLEGAL,
116   Token::ILLEGAL,
117   Token::ILLEGAL,
118   Token::ILLEGAL,
119   Token::ILLEGAL,
120   Token::ILLEGAL,
121   Token::ILLEGAL,
122   Token::ILLEGAL,
123   Token::ILLEGAL,
124   Token::ILLEGAL,
125   Token::ILLEGAL,
126   Token::ILLEGAL,
127   Token::ILLEGAL,
128   Token::ILLEGAL,
129   Token::ILLEGAL,
130   Token::ILLEGAL,
131   Token::ILLEGAL,
132   Token::ILLEGAL,
133   Token::ILLEGAL,
134   Token::ILLEGAL,
135   Token::ILLEGAL,
136   Token::ILLEGAL,
137   Token::ILLEGAL,
138   Token::ILLEGAL,
139   Token::ILLEGAL,
140   Token::ILLEGAL,
141   Token::ILLEGAL,
142   Token::ILLEGAL,
143   Token::ILLEGAL,
144   Token::ILLEGAL,
145   Token::ILLEGAL,
146   Token::ILLEGAL,
147   Token::ILLEGAL,
148   Token::ILLEGAL,
149   Token::ILLEGAL,
150   Token::ILLEGAL,
151   Token::ILLEGAL,
152   Token::ILLEGAL,
153   Token::LPAREN,       // 0x28
154   Token::RPAREN,       // 0x29
155   Token::ILLEGAL,
156   Token::ILLEGAL,
157   Token::COMMA,        // 0x2c
158   Token::ILLEGAL,
159   Token::ILLEGAL,
160   Token::ILLEGAL,
161   Token::ILLEGAL,
162   Token::ILLEGAL,
163   Token::ILLEGAL,
164   Token::ILLEGAL,
165   Token::ILLEGAL,
166   Token::ILLEGAL,
167   Token::ILLEGAL,
168   Token::ILLEGAL,
169   Token::ILLEGAL,
170   Token::ILLEGAL,
171   Token::COLON,        // 0x3a
172   Token::SEMICOLON,    // 0x3b
173   Token::ILLEGAL,
174   Token::ILLEGAL,
175   Token::ILLEGAL,
176   Token::CONDITIONAL,  // 0x3f
177   Token::ILLEGAL,
178   Token::ILLEGAL,
179   Token::ILLEGAL,
180   Token::ILLEGAL,
181   Token::ILLEGAL,
182   Token::ILLEGAL,
183   Token::ILLEGAL,
184   Token::ILLEGAL,
185   Token::ILLEGAL,
186   Token::ILLEGAL,
187   Token::ILLEGAL,
188   Token::ILLEGAL,
189   Token::ILLEGAL,
190   Token::ILLEGAL,
191   Token::ILLEGAL,
192   Token::ILLEGAL,
193   Token::ILLEGAL,
194   Token::ILLEGAL,
195   Token::ILLEGAL,
196   Token::ILLEGAL,
197   Token::ILLEGAL,
198   Token::ILLEGAL,
199   Token::ILLEGAL,
200   Token::ILLEGAL,
201   Token::ILLEGAL,
202   Token::ILLEGAL,
203   Token::ILLEGAL,
204   Token::LBRACK,     // 0x5b
205   Token::ILLEGAL,
206   Token::RBRACK,     // 0x5d
207   Token::ILLEGAL,
208   Token::ILLEGAL,
209   Token::ILLEGAL,
210   Token::ILLEGAL,
211   Token::ILLEGAL,
212   Token::ILLEGAL,
213   Token::ILLEGAL,
214   Token::ILLEGAL,
215   Token::ILLEGAL,
216   Token::ILLEGAL,
217   Token::ILLEGAL,
218   Token::ILLEGAL,
219   Token::ILLEGAL,
220   Token::ILLEGAL,
221   Token::ILLEGAL,
222   Token::ILLEGAL,
223   Token::ILLEGAL,
224   Token::ILLEGAL,
225   Token::ILLEGAL,
226   Token::ILLEGAL,
227   Token::ILLEGAL,
228   Token::ILLEGAL,
229   Token::ILLEGAL,
230   Token::ILLEGAL,
231   Token::ILLEGAL,
232   Token::ILLEGAL,
233   Token::ILLEGAL,
234   Token::ILLEGAL,
235   Token::ILLEGAL,
236   Token::LBRACE,       // 0x7b
237   Token::ILLEGAL,
238   Token::RBRACE,       // 0x7d
239   Token::BIT_NOT,      // 0x7e
240   Token::ILLEGAL
241 };
242 
243 
Next()244 Token::Value Scanner::Next() {
245   if (next_.token == Token::EOS) {
246     next_.location.beg_pos = current_.location.beg_pos;
247     next_.location.end_pos = current_.location.end_pos;
248   }
249   current_ = next_;
250   if (V8_UNLIKELY(next_next_.token != Token::UNINITIALIZED)) {
251     next_ = next_next_;
252     next_next_.token = Token::UNINITIALIZED;
253     has_line_terminator_before_next_ = has_line_terminator_after_next_;
254     return current_.token;
255   }
256   has_line_terminator_before_next_ = false;
257   has_multiline_comment_before_next_ = false;
258   if (static_cast<unsigned>(c0_) <= 0x7f) {
259     Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]);
260     if (token != Token::ILLEGAL) {
261       int pos = source_pos();
262       next_.token = token;
263       next_.location.beg_pos = pos;
264       next_.location.end_pos = pos + 1;
265       Advance();
266       return current_.token;
267     }
268   }
269   Scan();
270   return current_.token;
271 }
272 
273 
PeekAhead()274 Token::Value Scanner::PeekAhead() {
275   if (next_next_.token != Token::UNINITIALIZED) {
276     return next_next_.token;
277   }
278   TokenDesc prev = current_;
279   bool has_line_terminator_before_next =
280       has_line_terminator_before_next_ || has_multiline_comment_before_next_;
281   Next();
282   has_line_terminator_after_next_ =
283       has_line_terminator_before_next_ || has_multiline_comment_before_next_;
284   has_line_terminator_before_next_ = has_line_terminator_before_next;
285   Token::Value ret = next_.token;
286   next_next_ = next_;
287   next_ = current_;
288   current_ = prev;
289   return ret;
290 }
291 
292 
293 // TODO(yangguo): check whether this is actually necessary.
IsLittleEndianByteOrderMark(uc32 c)294 static inline bool IsLittleEndianByteOrderMark(uc32 c) {
295   // The Unicode value U+FFFE is guaranteed never to be assigned as a
296   // Unicode character; this implies that in a Unicode context the
297   // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
298   // character expressed in little-endian byte order (since it could
299   // not be a U+FFFE character expressed in big-endian byte
300   // order). Nevertheless, we check for it to be compatible with
301   // Spidermonkey.
302   return c == 0xFFFE;
303 }
304 
305 
SkipWhiteSpace()306 bool Scanner::SkipWhiteSpace() {
307   int start_position = source_pos();
308 
309   while (true) {
310     while (true) {
311       // The unicode cache accepts unsigned inputs.
312       if (c0_ < 0) break;
313       // Advance as long as character is a WhiteSpace or LineTerminator.
314       // Remember if the latter is the case.
315       if (unicode_cache_->IsLineTerminator(c0_)) {
316         has_line_terminator_before_next_ = true;
317       } else if (!unicode_cache_->IsWhiteSpace(c0_) &&
318                  !IsLittleEndianByteOrderMark(c0_)) {
319         break;
320       }
321       Advance();
322     }
323 
324     // If there is an HTML comment end '-->' at the beginning of a
325     // line (with only whitespace in front of it), we treat the rest
326     // of the line as a comment. This is in line with the way
327     // SpiderMonkey handles it.
328     if (c0_ == '-' && has_line_terminator_before_next_) {
329       Advance();
330       if (c0_ == '-') {
331         Advance();
332         if (c0_ == '>') {
333           // Treat the rest of the line as a comment.
334           SkipSingleLineComment();
335           // Continue skipping white space after the comment.
336           continue;
337         }
338         PushBack('-');  // undo Advance()
339       }
340       PushBack('-');  // undo Advance()
341     }
342     // Return whether or not we skipped any characters.
343     return source_pos() != start_position;
344   }
345 }
346 
347 
SkipSingleLineComment()348 Token::Value Scanner::SkipSingleLineComment() {
349   Advance();
350 
351   // The line terminator at the end of the line is not considered
352   // to be part of the single-line comment; it is recognized
353   // separately by the lexical grammar and becomes part of the
354   // stream of input elements for the syntactic grammar (see
355   // ECMA-262, section 7.4).
356   while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
357     Advance();
358   }
359 
360   return Token::WHITESPACE;
361 }
362 
363 
SkipSourceURLComment()364 Token::Value Scanner::SkipSourceURLComment() {
365   TryToParseSourceURLComment();
366   while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
367     Advance();
368   }
369 
370   return Token::WHITESPACE;
371 }
372 
373 
TryToParseSourceURLComment()374 void Scanner::TryToParseSourceURLComment() {
375   // Magic comments are of the form: //[#@]\s<name>=\s*<value>\s*.* and this
376   // function will just return if it cannot parse a magic comment.
377   if (c0_ < 0 || !unicode_cache_->IsWhiteSpace(c0_)) return;
378   Advance();
379   LiteralBuffer name;
380   while (c0_ >= 0 && !unicode_cache_->IsWhiteSpaceOrLineTerminator(c0_) &&
381          c0_ != '=') {
382     name.AddChar(c0_);
383     Advance();
384   }
385   if (!name.is_one_byte()) return;
386   Vector<const uint8_t> name_literal = name.one_byte_literal();
387   LiteralBuffer* value;
388   if (name_literal == STATIC_CHAR_VECTOR("sourceURL")) {
389     value = &source_url_;
390   } else if (name_literal == STATIC_CHAR_VECTOR("sourceMappingURL")) {
391     value = &source_mapping_url_;
392   } else {
393     return;
394   }
395   if (c0_ != '=')
396     return;
397   Advance();
398   value->Reset();
399   while (c0_ >= 0 && unicode_cache_->IsWhiteSpace(c0_)) {
400     Advance();
401   }
402   while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
403     // Disallowed characters.
404     if (c0_ == '"' || c0_ == '\'') {
405       value->Reset();
406       return;
407     }
408     if (unicode_cache_->IsWhiteSpace(c0_)) {
409       break;
410     }
411     value->AddChar(c0_);
412     Advance();
413   }
414   // Allow whitespace at the end.
415   while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
416     if (!unicode_cache_->IsWhiteSpace(c0_)) {
417       value->Reset();
418       break;
419     }
420     Advance();
421   }
422 }
423 
424 
SkipMultiLineComment()425 Token::Value Scanner::SkipMultiLineComment() {
426   DCHECK(c0_ == '*');
427   Advance();
428 
429   while (c0_ >= 0) {
430     uc32 ch = c0_;
431     Advance();
432     if (c0_ >= 0 && unicode_cache_->IsLineTerminator(ch)) {
433       // Following ECMA-262, section 7.4, a comment containing
434       // a newline will make the comment count as a line-terminator.
435       has_multiline_comment_before_next_ = true;
436     }
437     // If we have reached the end of the multi-line comment, we
438     // consume the '/' and insert a whitespace. This way all
439     // multi-line comments are treated as whitespace.
440     if (ch == '*' && c0_ == '/') {
441       c0_ = ' ';
442       return Token::WHITESPACE;
443     }
444   }
445 
446   // Unterminated multi-line comment.
447   return Token::ILLEGAL;
448 }
449 
450 
ScanHtmlComment()451 Token::Value Scanner::ScanHtmlComment() {
452   // Check for <!-- comments.
453   DCHECK(c0_ == '!');
454   Advance();
455   if (c0_ == '-') {
456     Advance();
457     if (c0_ == '-') {
458       found_html_comment_ = true;
459       return SkipSingleLineComment();
460     }
461     PushBack('-');  // undo Advance()
462   }
463   PushBack('!');  // undo Advance()
464   DCHECK(c0_ == '!');
465   return Token::LT;
466 }
467 
468 
Scan()469 void Scanner::Scan() {
470   next_.literal_chars = NULL;
471   next_.raw_literal_chars = NULL;
472   Token::Value token;
473   do {
474     // Remember the position of the next token
475     next_.location.beg_pos = source_pos();
476 
477     switch (c0_) {
478       case ' ':
479       case '\t':
480         Advance();
481         token = Token::WHITESPACE;
482         break;
483 
484       case '\n':
485         Advance();
486         has_line_terminator_before_next_ = true;
487         token = Token::WHITESPACE;
488         break;
489 
490       case '"': case '\'':
491         token = ScanString();
492         break;
493 
494       case '<':
495         // < <= << <<= <!--
496         Advance();
497         if (c0_ == '=') {
498           token = Select(Token::LTE);
499         } else if (c0_ == '<') {
500           token = Select('=', Token::ASSIGN_SHL, Token::SHL);
501         } else if (c0_ == '!') {
502           token = ScanHtmlComment();
503         } else {
504           token = Token::LT;
505         }
506         break;
507 
508       case '>':
509         // > >= >> >>= >>> >>>=
510         Advance();
511         if (c0_ == '=') {
512           token = Select(Token::GTE);
513         } else if (c0_ == '>') {
514           // >> >>= >>> >>>=
515           Advance();
516           if (c0_ == '=') {
517             token = Select(Token::ASSIGN_SAR);
518           } else if (c0_ == '>') {
519             token = Select('=', Token::ASSIGN_SHR, Token::SHR);
520           } else {
521             token = Token::SAR;
522           }
523         } else {
524           token = Token::GT;
525         }
526         break;
527 
528       case '=':
529         // = == === =>
530         Advance();
531         if (c0_ == '=') {
532           token = Select('=', Token::EQ_STRICT, Token::EQ);
533         } else if (c0_ == '>') {
534           token = Select(Token::ARROW);
535         } else {
536           token = Token::ASSIGN;
537         }
538         break;
539 
540       case '!':
541         // ! != !==
542         Advance();
543         if (c0_ == '=') {
544           token = Select('=', Token::NE_STRICT, Token::NE);
545         } else {
546           token = Token::NOT;
547         }
548         break;
549 
550       case '+':
551         // + ++ +=
552         Advance();
553         if (c0_ == '+') {
554           token = Select(Token::INC);
555         } else if (c0_ == '=') {
556           token = Select(Token::ASSIGN_ADD);
557         } else {
558           token = Token::ADD;
559         }
560         break;
561 
562       case '-':
563         // - -- --> -=
564         Advance();
565         if (c0_ == '-') {
566           Advance();
567           if (c0_ == '>' && has_line_terminator_before_next_) {
568             // For compatibility with SpiderMonkey, we skip lines that
569             // start with an HTML comment end '-->'.
570             token = SkipSingleLineComment();
571           } else {
572             token = Token::DEC;
573           }
574         } else if (c0_ == '=') {
575           token = Select(Token::ASSIGN_SUB);
576         } else {
577           token = Token::SUB;
578         }
579         break;
580 
581       case '*':
582         // * *=
583         Advance();
584         if (c0_ == '*' && allow_harmony_exponentiation_operator()) {
585           token = Select('=', Token::ASSIGN_EXP, Token::EXP);
586         } else if (c0_ == '=') {
587           token = Select(Token::ASSIGN_MUL);
588         } else {
589           token = Token::MUL;
590         }
591         break;
592 
593       case '%':
594         // % %=
595         token = Select('=', Token::ASSIGN_MOD, Token::MOD);
596         break;
597 
598       case '/':
599         // /  // /* /=
600         Advance();
601         if (c0_ == '/') {
602           Advance();
603           if (c0_ == '#' || c0_ == '@') {
604             Advance();
605             token = SkipSourceURLComment();
606           } else {
607             PushBack(c0_);
608             token = SkipSingleLineComment();
609           }
610         } else if (c0_ == '*') {
611           token = SkipMultiLineComment();
612         } else if (c0_ == '=') {
613           token = Select(Token::ASSIGN_DIV);
614         } else {
615           token = Token::DIV;
616         }
617         break;
618 
619       case '&':
620         // & && &=
621         Advance();
622         if (c0_ == '&') {
623           token = Select(Token::AND);
624         } else if (c0_ == '=') {
625           token = Select(Token::ASSIGN_BIT_AND);
626         } else {
627           token = Token::BIT_AND;
628         }
629         break;
630 
631       case '|':
632         // | || |=
633         Advance();
634         if (c0_ == '|') {
635           token = Select(Token::OR);
636         } else if (c0_ == '=') {
637           token = Select(Token::ASSIGN_BIT_OR);
638         } else {
639           token = Token::BIT_OR;
640         }
641         break;
642 
643       case '^':
644         // ^ ^=
645         token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
646         break;
647 
648       case '.':
649         // . Number
650         Advance();
651         if (IsDecimalDigit(c0_)) {
652           token = ScanNumber(true);
653         } else {
654           token = Token::PERIOD;
655           if (c0_ == '.') {
656             Advance();
657             if (c0_ == '.') {
658               Advance();
659               token = Token::ELLIPSIS;
660             } else {
661               PushBack('.');
662             }
663           }
664         }
665         break;
666 
667       case ':':
668         token = Select(Token::COLON);
669         break;
670 
671       case ';':
672         token = Select(Token::SEMICOLON);
673         break;
674 
675       case ',':
676         token = Select(Token::COMMA);
677         break;
678 
679       case '(':
680         token = Select(Token::LPAREN);
681         break;
682 
683       case ')':
684         token = Select(Token::RPAREN);
685         break;
686 
687       case '[':
688         token = Select(Token::LBRACK);
689         break;
690 
691       case ']':
692         token = Select(Token::RBRACK);
693         break;
694 
695       case '{':
696         token = Select(Token::LBRACE);
697         break;
698 
699       case '}':
700         token = Select(Token::RBRACE);
701         break;
702 
703       case '?':
704         token = Select(Token::CONDITIONAL);
705         break;
706 
707       case '~':
708         token = Select(Token::BIT_NOT);
709         break;
710 
711       case '`':
712         token = ScanTemplateStart();
713         break;
714 
715       default:
716         if (c0_ < 0) {
717           token = Token::EOS;
718         } else if (unicode_cache_->IsIdentifierStart(c0_)) {
719           token = ScanIdentifierOrKeyword();
720         } else if (IsDecimalDigit(c0_)) {
721           token = ScanNumber(false);
722         } else if (SkipWhiteSpace()) {
723           token = Token::WHITESPACE;
724         } else {
725           token = Select(Token::ILLEGAL);
726         }
727         break;
728     }
729 
730     // Continue scanning for tokens as long as we're just skipping
731     // whitespace.
732   } while (token == Token::WHITESPACE);
733 
734   next_.location.end_pos = source_pos();
735   next_.token = token;
736 }
737 
738 
SeekForward(int pos)739 void Scanner::SeekForward(int pos) {
740   // After this call, we will have the token at the given position as
741   // the "next" token. The "current" token will be invalid.
742   if (pos == next_.location.beg_pos) return;
743   int current_pos = source_pos();
744   DCHECK_EQ(next_.location.end_pos, current_pos);
745   // Positions inside the lookahead token aren't supported.
746   DCHECK(pos >= current_pos);
747   if (pos != current_pos) {
748     source_->SeekForward(pos - source_->pos());
749     Advance();
750     // This function is only called to seek to the location
751     // of the end of a function (at the "}" token). It doesn't matter
752     // whether there was a line terminator in the part we skip.
753     has_line_terminator_before_next_ = false;
754     has_multiline_comment_before_next_ = false;
755   }
756   Scan();
757 }
758 
759 
760 template <bool capture_raw, bool in_template_literal>
ScanEscape()761 bool Scanner::ScanEscape() {
762   uc32 c = c0_;
763   Advance<capture_raw>();
764 
765   // Skip escaped newlines.
766   if (!in_template_literal && c0_ >= 0 && unicode_cache_->IsLineTerminator(c)) {
767     // Allow CR+LF newlines in multiline string literals.
768     if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance<capture_raw>();
769     // Allow LF+CR newlines in multiline string literals.
770     if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance<capture_raw>();
771     return true;
772   }
773 
774   switch (c) {
775     case '\'':  // fall through
776     case '"' :  // fall through
777     case '\\': break;
778     case 'b' : c = '\b'; break;
779     case 'f' : c = '\f'; break;
780     case 'n' : c = '\n'; break;
781     case 'r' : c = '\r'; break;
782     case 't' : c = '\t'; break;
783     case 'u' : {
784       c = ScanUnicodeEscape<capture_raw>();
785       if (c < 0) return false;
786       break;
787     }
788     case 'v':
789       c = '\v';
790       break;
791     case 'x': {
792       c = ScanHexNumber<capture_raw>(2);
793       if (c < 0) return false;
794       break;
795     }
796     case '0':  // Fall through.
797     case '1':  // fall through
798     case '2':  // fall through
799     case '3':  // fall through
800     case '4':  // fall through
801     case '5':  // fall through
802     case '6':  // fall through
803     case '7':
804       c = ScanOctalEscape<capture_raw>(c, 2);
805       break;
806   }
807 
808   // According to ECMA-262, section 7.8.4, characters not covered by the
809   // above cases should be illegal, but they are commonly handled as
810   // non-escaped characters by JS VMs.
811   AddLiteralChar(c);
812   return true;
813 }
814 
815 
816 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of
817 // ECMA-262. Other JS VMs support them.
818 template <bool capture_raw>
ScanOctalEscape(uc32 c,int length)819 uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
820   uc32 x = c - '0';
821   int i = 0;
822   for (; i < length; i++) {
823     int d = c0_ - '0';
824     if (d < 0 || d > 7) break;
825     int nx = x * 8 + d;
826     if (nx >= 256) break;
827     x = nx;
828     Advance<capture_raw>();
829   }
830   // Anything except '\0' is an octal escape sequence, illegal in strict mode.
831   // Remember the position of octal escape sequences so that an error
832   // can be reported later (in strict mode).
833   // We don't report the error immediately, because the octal escape can
834   // occur before the "use strict" directive.
835   if (c != '0' || i > 0) {
836     octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
837   }
838   return x;
839 }
840 
841 
ScanString()842 Token::Value Scanner::ScanString() {
843   uc32 quote = c0_;
844   Advance<false, false>();  // consume quote
845 
846   LiteralScope literal(this);
847   while (true) {
848     if (c0_ > kMaxAscii) {
849       HandleLeadSurrogate();
850       break;
851     }
852     if (c0_ < 0 || c0_ == '\n' || c0_ == '\r') return Token::ILLEGAL;
853     if (c0_ == quote) {
854       literal.Complete();
855       Advance<false, false>();
856       return Token::STRING;
857     }
858     char c = static_cast<char>(c0_);
859     if (c == '\\') break;
860     Advance<false, false>();
861     AddLiteralChar(c);
862   }
863 
864   while (c0_ != quote && c0_ >= 0
865          && !unicode_cache_->IsLineTerminator(c0_)) {
866     uc32 c = c0_;
867     Advance();
868     if (c == '\\') {
869       if (c0_ < 0 || !ScanEscape<false, false>()) {
870         return Token::ILLEGAL;
871       }
872     } else {
873       AddLiteralChar(c);
874     }
875   }
876   if (c0_ != quote) return Token::ILLEGAL;
877   literal.Complete();
878 
879   Advance();  // consume quote
880   return Token::STRING;
881 }
882 
883 
ScanTemplateSpan()884 Token::Value Scanner::ScanTemplateSpan() {
885   // When scanning a TemplateSpan, we are looking for the following construct:
886   // TEMPLATE_SPAN ::
887   //     ` LiteralChars* ${
888   //   | } LiteralChars* ${
889   //
890   // TEMPLATE_TAIL ::
891   //     ` LiteralChars* `
892   //   | } LiteralChar* `
893   //
894   // A TEMPLATE_SPAN should always be followed by an Expression, while a
895   // TEMPLATE_TAIL terminates a TemplateLiteral and does not need to be
896   // followed by an Expression.
897 
898   Token::Value result = Token::TEMPLATE_SPAN;
899   LiteralScope literal(this);
900   StartRawLiteral();
901   const bool capture_raw = true;
902   const bool in_template_literal = true;
903   while (true) {
904     uc32 c = c0_;
905     Advance<capture_raw>();
906     if (c == '`') {
907       result = Token::TEMPLATE_TAIL;
908       ReduceRawLiteralLength(1);
909       break;
910     } else if (c == '$' && c0_ == '{') {
911       Advance<capture_raw>();  // Consume '{'
912       ReduceRawLiteralLength(2);
913       break;
914     } else if (c == '\\') {
915       if (c0_ > 0 && unicode_cache_->IsLineTerminator(c0_)) {
916         // The TV of LineContinuation :: \ LineTerminatorSequence is the empty
917         // code unit sequence.
918         uc32 lastChar = c0_;
919         Advance<capture_raw>();
920         if (lastChar == '\r') {
921           ReduceRawLiteralLength(1);  // Remove \r
922           if (c0_ == '\n') {
923             Advance<capture_raw>();  // Adds \n
924           } else {
925             AddRawLiteralChar('\n');
926           }
927         }
928       } else if (!ScanEscape<capture_raw, in_template_literal>()) {
929         return Token::ILLEGAL;
930       }
931     } else if (c < 0) {
932       // Unterminated template literal
933       PushBack(c);
934       break;
935     } else {
936       // The TRV of LineTerminatorSequence :: <CR> is the CV 0x000A.
937       // The TRV of LineTerminatorSequence :: <CR><LF> is the sequence
938       // consisting of the CV 0x000A.
939       if (c == '\r') {
940         ReduceRawLiteralLength(1);  // Remove \r
941         if (c0_ == '\n') {
942           Advance<capture_raw>();  // Adds \n
943         } else {
944           AddRawLiteralChar('\n');
945         }
946         c = '\n';
947       }
948       AddLiteralChar(c);
949     }
950   }
951   literal.Complete();
952   next_.location.end_pos = source_pos();
953   next_.token = result;
954   return result;
955 }
956 
957 
ScanTemplateStart()958 Token::Value Scanner::ScanTemplateStart() {
959   DCHECK(c0_ == '`');
960   next_.location.beg_pos = source_pos();
961   Advance();  // Consume `
962   return ScanTemplateSpan();
963 }
964 
965 
ScanTemplateContinuation()966 Token::Value Scanner::ScanTemplateContinuation() {
967   DCHECK_EQ(next_.token, Token::RBRACE);
968   next_.location.beg_pos = source_pos() - 1;  // We already consumed }
969   return ScanTemplateSpan();
970 }
971 
972 
ScanDecimalDigits()973 void Scanner::ScanDecimalDigits() {
974   while (IsDecimalDigit(c0_))
975     AddLiteralCharAdvance();
976 }
977 
978 
ScanNumber(bool seen_period)979 Token::Value Scanner::ScanNumber(bool seen_period) {
980   DCHECK(IsDecimalDigit(c0_));  // the first digit of the number or the fraction
981 
982   enum {
983     DECIMAL,
984     DECIMAL_WITH_LEADING_ZERO,
985     HEX,
986     OCTAL,
987     IMPLICIT_OCTAL,
988     BINARY
989   } kind = DECIMAL;
990 
991   LiteralScope literal(this);
992   bool at_start = !seen_period;
993   int start_pos = source_pos();  // For reporting octal positions.
994   if (seen_period) {
995     // we have already seen a decimal point of the float
996     AddLiteralChar('.');
997     ScanDecimalDigits();  // we know we have at least one digit
998 
999   } else {
1000     // if the first character is '0' we must check for octals and hex
1001     if (c0_ == '0') {
1002       AddLiteralCharAdvance();
1003 
1004       // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or
1005       // an octal number.
1006       if (c0_ == 'x' || c0_ == 'X') {
1007         // hex number
1008         kind = HEX;
1009         AddLiteralCharAdvance();
1010         if (!IsHexDigit(c0_)) {
1011           // we must have at least one hex digit after 'x'/'X'
1012           return Token::ILLEGAL;
1013         }
1014         while (IsHexDigit(c0_)) {
1015           AddLiteralCharAdvance();
1016         }
1017       } else if (c0_ == 'o' || c0_ == 'O') {
1018         kind = OCTAL;
1019         AddLiteralCharAdvance();
1020         if (!IsOctalDigit(c0_)) {
1021           // we must have at least one octal digit after 'o'/'O'
1022           return Token::ILLEGAL;
1023         }
1024         while (IsOctalDigit(c0_)) {
1025           AddLiteralCharAdvance();
1026         }
1027       } else if (c0_ == 'b' || c0_ == 'B') {
1028         kind = BINARY;
1029         AddLiteralCharAdvance();
1030         if (!IsBinaryDigit(c0_)) {
1031           // we must have at least one binary digit after 'b'/'B'
1032           return Token::ILLEGAL;
1033         }
1034         while (IsBinaryDigit(c0_)) {
1035           AddLiteralCharAdvance();
1036         }
1037       } else if ('0' <= c0_ && c0_ <= '7') {
1038         // (possible) octal number
1039         kind = IMPLICIT_OCTAL;
1040         while (true) {
1041           if (c0_ == '8' || c0_ == '9') {
1042             at_start = false;
1043             kind = DECIMAL_WITH_LEADING_ZERO;
1044             break;
1045           }
1046           if (c0_  < '0' || '7'  < c0_) {
1047             // Octal literal finished.
1048             octal_pos_ = Location(start_pos, source_pos());
1049             break;
1050           }
1051           AddLiteralCharAdvance();
1052         }
1053       } else if (c0_ == '8' || c0_ == '9') {
1054         kind = DECIMAL_WITH_LEADING_ZERO;
1055       }
1056     }
1057 
1058     // Parse decimal digits and allow trailing fractional part.
1059     if (kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO) {
1060       if (at_start) {
1061         uint64_t value = 0;
1062         while (IsDecimalDigit(c0_)) {
1063           value = 10 * value + (c0_ - '0');
1064 
1065           uc32 first_char = c0_;
1066           Advance<false, false>();
1067           AddLiteralChar(first_char);
1068         }
1069 
1070         if (next_.literal_chars->one_byte_literal().length() <= 10 &&
1071             value <= Smi::kMaxValue && c0_ != '.' && c0_ != 'e' && c0_ != 'E') {
1072           next_.smi_value_ = static_cast<int>(value);
1073           literal.Complete();
1074           HandleLeadSurrogate();
1075 
1076           if (kind == DECIMAL_WITH_LEADING_ZERO)
1077             decimal_with_leading_zero_pos_ = Location(start_pos, source_pos());
1078           return Token::SMI;
1079         }
1080         HandleLeadSurrogate();
1081       }
1082 
1083       ScanDecimalDigits();  // optional
1084       if (c0_ == '.') {
1085         AddLiteralCharAdvance();
1086         ScanDecimalDigits();  // optional
1087       }
1088     }
1089   }
1090 
1091   // scan exponent, if any
1092   if (c0_ == 'e' || c0_ == 'E') {
1093     DCHECK(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
1094     if (!(kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO))
1095       return Token::ILLEGAL;
1096     // scan exponent
1097     AddLiteralCharAdvance();
1098     if (c0_ == '+' || c0_ == '-')
1099       AddLiteralCharAdvance();
1100     if (!IsDecimalDigit(c0_)) {
1101       // we must have at least one decimal digit after 'e'/'E'
1102       return Token::ILLEGAL;
1103     }
1104     ScanDecimalDigits();
1105   }
1106 
1107   // The source character immediately following a numeric literal must
1108   // not be an identifier start or a decimal digit; see ECMA-262
1109   // section 7.8.3, page 17 (note that we read only one decimal digit
1110   // if the value is 0).
1111   if (IsDecimalDigit(c0_) ||
1112       (c0_ >= 0 && unicode_cache_->IsIdentifierStart(c0_)))
1113     return Token::ILLEGAL;
1114 
1115   literal.Complete();
1116 
1117   if (kind == DECIMAL_WITH_LEADING_ZERO)
1118     decimal_with_leading_zero_pos_ = Location(start_pos, source_pos());
1119   return Token::NUMBER;
1120 }
1121 
1122 
ScanIdentifierUnicodeEscape()1123 uc32 Scanner::ScanIdentifierUnicodeEscape() {
1124   Advance();
1125   if (c0_ != 'u') return -1;
1126   Advance();
1127   return ScanUnicodeEscape<false>();
1128 }
1129 
1130 
1131 template <bool capture_raw>
ScanUnicodeEscape()1132 uc32 Scanner::ScanUnicodeEscape() {
1133   // Accept both \uxxxx and \u{xxxxxx}. In the latter case, the number of
1134   // hex digits between { } is arbitrary. \ and u have already been read.
1135   if (c0_ == '{') {
1136     int begin = source_pos() - 2;
1137     Advance<capture_raw>();
1138     uc32 cp = ScanUnlimitedLengthHexNumber<capture_raw>(0x10ffff, begin);
1139     if (cp < 0 || c0_ != '}') {
1140       ReportScannerError(source_pos(),
1141                          MessageTemplate::kInvalidUnicodeEscapeSequence);
1142       return -1;
1143     }
1144     Advance<capture_raw>();
1145     return cp;
1146   }
1147   const bool unicode = true;
1148   return ScanHexNumber<capture_raw, unicode>(4);
1149 }
1150 
1151 
1152 // ----------------------------------------------------------------------------
1153 // Keyword Matcher
1154 
1155 #define KEYWORDS(KEYWORD_GROUP, KEYWORD)                    \
1156   KEYWORD_GROUP('a')                                        \
1157   KEYWORD("async", Token::ASYNC)                            \
1158   KEYWORD("await", Token::AWAIT)                            \
1159   KEYWORD_GROUP('b')                                        \
1160   KEYWORD("break", Token::BREAK)                            \
1161   KEYWORD_GROUP('c')                                        \
1162   KEYWORD("case", Token::CASE)                              \
1163   KEYWORD("catch", Token::CATCH)                            \
1164   KEYWORD("class", Token::CLASS)                            \
1165   KEYWORD("const", Token::CONST)                            \
1166   KEYWORD("continue", Token::CONTINUE)                      \
1167   KEYWORD_GROUP('d')                                        \
1168   KEYWORD("debugger", Token::DEBUGGER)                      \
1169   KEYWORD("default", Token::DEFAULT)                        \
1170   KEYWORD("delete", Token::DELETE)                          \
1171   KEYWORD("do", Token::DO)                                  \
1172   KEYWORD_GROUP('e')                                        \
1173   KEYWORD("else", Token::ELSE)                              \
1174   KEYWORD("enum", Token::ENUM)                              \
1175   KEYWORD("export", Token::EXPORT)                          \
1176   KEYWORD("extends", Token::EXTENDS)                        \
1177   KEYWORD_GROUP('f')                                        \
1178   KEYWORD("false", Token::FALSE_LITERAL)                    \
1179   KEYWORD("finally", Token::FINALLY)                        \
1180   KEYWORD("for", Token::FOR)                                \
1181   KEYWORD("function", Token::FUNCTION)                      \
1182   KEYWORD_GROUP('i')                                        \
1183   KEYWORD("if", Token::IF)                                  \
1184   KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \
1185   KEYWORD("import", Token::IMPORT)                          \
1186   KEYWORD("in", Token::IN)                                  \
1187   KEYWORD("instanceof", Token::INSTANCEOF)                  \
1188   KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD)  \
1189   KEYWORD_GROUP('l')                                        \
1190   KEYWORD("let", Token::LET)                                \
1191   KEYWORD_GROUP('n')                                        \
1192   KEYWORD("new", Token::NEW)                                \
1193   KEYWORD("null", Token::NULL_LITERAL)                      \
1194   KEYWORD_GROUP('p')                                        \
1195   KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD)    \
1196   KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD)    \
1197   KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD)  \
1198   KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD)     \
1199   KEYWORD_GROUP('r')                                        \
1200   KEYWORD("return", Token::RETURN)                          \
1201   KEYWORD_GROUP('s')                                        \
1202   KEYWORD("static", Token::STATIC)                          \
1203   KEYWORD("super", Token::SUPER)                            \
1204   KEYWORD("switch", Token::SWITCH)                          \
1205   KEYWORD_GROUP('t')                                        \
1206   KEYWORD("this", Token::THIS)                              \
1207   KEYWORD("throw", Token::THROW)                            \
1208   KEYWORD("true", Token::TRUE_LITERAL)                      \
1209   KEYWORD("try", Token::TRY)                                \
1210   KEYWORD("typeof", Token::TYPEOF)                          \
1211   KEYWORD_GROUP('v')                                        \
1212   KEYWORD("var", Token::VAR)                                \
1213   KEYWORD("void", Token::VOID)                              \
1214   KEYWORD_GROUP('w')                                        \
1215   KEYWORD("while", Token::WHILE)                            \
1216   KEYWORD("with", Token::WITH)                              \
1217   KEYWORD_GROUP('y')                                        \
1218   KEYWORD("yield", Token::YIELD)
1219 
KeywordOrIdentifierToken(const uint8_t * input,int input_length,bool escaped)1220 static Token::Value KeywordOrIdentifierToken(const uint8_t* input,
1221                                              int input_length, bool escaped) {
1222   DCHECK(input_length >= 1);
1223   const int kMinLength = 2;
1224   const int kMaxLength = 10;
1225   if (input_length < kMinLength || input_length > kMaxLength) {
1226     return Token::IDENTIFIER;
1227   }
1228   switch (input[0]) {
1229     default:
1230 #define KEYWORD_GROUP_CASE(ch)                                \
1231       break;                                                  \
1232     case ch:
1233 #define KEYWORD(keyword, token)                                     \
1234   {                                                                 \
1235     /* 'keyword' is a char array, so sizeof(keyword) is */          \
1236     /* strlen(keyword) plus 1 for the NUL char. */                  \
1237     const int keyword_length = sizeof(keyword) - 1;                 \
1238     STATIC_ASSERT(keyword_length >= kMinLength);                    \
1239     STATIC_ASSERT(keyword_length <= kMaxLength);                    \
1240     if (input_length == keyword_length && input[1] == keyword[1] && \
1241         (keyword_length <= 2 || input[2] == keyword[2]) &&          \
1242         (keyword_length <= 3 || input[3] == keyword[3]) &&          \
1243         (keyword_length <= 4 || input[4] == keyword[4]) &&          \
1244         (keyword_length <= 5 || input[5] == keyword[5]) &&          \
1245         (keyword_length <= 6 || input[6] == keyword[6]) &&          \
1246         (keyword_length <= 7 || input[7] == keyword[7]) &&          \
1247         (keyword_length <= 8 || input[8] == keyword[8]) &&          \
1248         (keyword_length <= 9 || input[9] == keyword[9])) {          \
1249       if (escaped) {                                                \
1250         /* TODO(adamk): YIELD should be handled specially. */       \
1251         return (token == Token::FUTURE_STRICT_RESERVED_WORD ||      \
1252                 token == Token::LET || token == Token::STATIC)      \
1253                    ? Token::ESCAPED_STRICT_RESERVED_WORD            \
1254                    : Token::ESCAPED_KEYWORD;                        \
1255       }                                                             \
1256       return token;                                                 \
1257     }                                                               \
1258   }
1259     KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
1260   }
1261   return Token::IDENTIFIER;
1262 }
1263 
1264 
IdentifierIsFutureStrictReserved(const AstRawString * string) const1265 bool Scanner::IdentifierIsFutureStrictReserved(
1266     const AstRawString* string) const {
1267   // Keywords are always 1-byte strings.
1268   if (!string->is_one_byte()) return false;
1269   if (string->IsOneByteEqualTo("let") || string->IsOneByteEqualTo("static") ||
1270       string->IsOneByteEqualTo("yield")) {
1271     return true;
1272   }
1273   return Token::FUTURE_STRICT_RESERVED_WORD ==
1274          KeywordOrIdentifierToken(string->raw_data(), string->length(), false);
1275 }
1276 
1277 
ScanIdentifierOrKeyword()1278 Token::Value Scanner::ScanIdentifierOrKeyword() {
1279   DCHECK(unicode_cache_->IsIdentifierStart(c0_));
1280   LiteralScope literal(this);
1281   if (IsInRange(c0_, 'a', 'z')) {
1282     do {
1283       char first_char = static_cast<char>(c0_);
1284       Advance<false, false>();
1285       AddLiteralChar(first_char);
1286     } while (IsInRange(c0_, 'a', 'z'));
1287 
1288     if (IsDecimalDigit(c0_) || IsInRange(c0_, 'A', 'Z') || c0_ == '_' ||
1289         c0_ == '$') {
1290       // Identifier starting with lowercase.
1291       char first_char = static_cast<char>(c0_);
1292       Advance<false, false>();
1293       AddLiteralChar(first_char);
1294       while (IsAsciiIdentifier(c0_)) {
1295         char first_char = static_cast<char>(c0_);
1296         Advance<false, false>();
1297         AddLiteralChar(first_char);
1298       }
1299       if (c0_ <= kMaxAscii && c0_ != '\\') {
1300         literal.Complete();
1301         return Token::IDENTIFIER;
1302       }
1303     } else if (c0_ <= kMaxAscii && c0_ != '\\') {
1304       // Only a-z+: could be a keyword or identifier.
1305       literal.Complete();
1306       Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
1307       return KeywordOrIdentifierToken(chars.start(), chars.length(), false);
1308     }
1309 
1310     HandleLeadSurrogate();
1311   } else if (IsInRange(c0_, 'A', 'Z') || c0_ == '_' || c0_ == '$') {
1312     do {
1313       char first_char = static_cast<char>(c0_);
1314       Advance<false, false>();
1315       AddLiteralChar(first_char);
1316     } while (IsAsciiIdentifier(c0_));
1317 
1318     if (c0_ <= kMaxAscii && c0_ != '\\') {
1319       literal.Complete();
1320       return Token::IDENTIFIER;
1321     }
1322 
1323     HandleLeadSurrogate();
1324   } else if (c0_ == '\\') {
1325     // Scan identifier start character.
1326     uc32 c = ScanIdentifierUnicodeEscape();
1327     // Only allow legal identifier start characters.
1328     if (c < 0 ||
1329         c == '\\' ||  // No recursive escapes.
1330         !unicode_cache_->IsIdentifierStart(c)) {
1331       return Token::ILLEGAL;
1332     }
1333     AddLiteralChar(c);
1334     return ScanIdentifierSuffix(&literal, true);
1335   } else {
1336     uc32 first_char = c0_;
1337     Advance();
1338     AddLiteralChar(first_char);
1339   }
1340 
1341   // Scan the rest of the identifier characters.
1342   while (c0_ >= 0 && unicode_cache_->IsIdentifierPart(c0_)) {
1343     if (c0_ != '\\') {
1344       uc32 next_char = c0_;
1345       Advance();
1346       AddLiteralChar(next_char);
1347       continue;
1348     }
1349     // Fallthrough if no longer able to complete keyword.
1350     return ScanIdentifierSuffix(&literal, false);
1351   }
1352 
1353   literal.Complete();
1354 
1355   if (next_.literal_chars->is_one_byte()) {
1356     Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
1357     return KeywordOrIdentifierToken(chars.start(), chars.length(), false);
1358   }
1359   return Token::IDENTIFIER;
1360 }
1361 
1362 
ScanIdentifierSuffix(LiteralScope * literal,bool escaped)1363 Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal,
1364                                            bool escaped) {
1365   // Scan the rest of the identifier characters.
1366   while (c0_ >= 0 && unicode_cache_->IsIdentifierPart(c0_)) {
1367     if (c0_ == '\\') {
1368       uc32 c = ScanIdentifierUnicodeEscape();
1369       escaped = true;
1370       // Only allow legal identifier part characters.
1371       if (c < 0 ||
1372           c == '\\' ||
1373           !unicode_cache_->IsIdentifierPart(c)) {
1374         return Token::ILLEGAL;
1375       }
1376       AddLiteralChar(c);
1377     } else {
1378       AddLiteralChar(c0_);
1379       Advance();
1380     }
1381   }
1382   literal->Complete();
1383 
1384   if (escaped && next_.literal_chars->is_one_byte()) {
1385     Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
1386     return KeywordOrIdentifierToken(chars.start(), chars.length(), true);
1387   }
1388   return Token::IDENTIFIER;
1389 }
1390 
1391 
ScanRegExpPattern(bool seen_equal)1392 bool Scanner::ScanRegExpPattern(bool seen_equal) {
1393   // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
1394   bool in_character_class = false;
1395 
1396   // Previous token is either '/' or '/=', in the second case, the
1397   // pattern starts at =.
1398   next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
1399   next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
1400 
1401   // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
1402   // the scanner should pass uninterpreted bodies to the RegExp
1403   // constructor.
1404   LiteralScope literal(this);
1405   if (seen_equal) {
1406     AddLiteralChar('=');
1407   }
1408 
1409   while (c0_ != '/' || in_character_class) {
1410     if (c0_ < 0 || unicode_cache_->IsLineTerminator(c0_)) return false;
1411     if (c0_ == '\\') {  // Escape sequence.
1412       AddLiteralCharAdvance();
1413       if (c0_ < 0 || unicode_cache_->IsLineTerminator(c0_)) return false;
1414       AddLiteralCharAdvance();
1415       // If the escape allows more characters, i.e., \x??, \u????, or \c?,
1416       // only "safe" characters are allowed (letters, digits, underscore),
1417       // otherwise the escape isn't valid and the invalid character has
1418       // its normal meaning. I.e., we can just continue scanning without
1419       // worrying whether the following characters are part of the escape
1420       // or not, since any '/', '\\' or '[' is guaranteed to not be part
1421       // of the escape sequence.
1422 
1423       // TODO(896): At some point, parse RegExps more throughly to capture
1424       // octal esacpes in strict mode.
1425     } else {  // Unescaped character.
1426       if (c0_ == '[') in_character_class = true;
1427       if (c0_ == ']') in_character_class = false;
1428       AddLiteralCharAdvance();
1429     }
1430   }
1431   Advance();  // consume '/'
1432 
1433   literal.Complete();
1434 
1435   return true;
1436 }
1437 
1438 
ScanRegExpFlags()1439 Maybe<RegExp::Flags> Scanner::ScanRegExpFlags() {
1440   // Scan regular expression flags.
1441   LiteralScope literal(this);
1442   int flags = 0;
1443   while (c0_ >= 0 && unicode_cache_->IsIdentifierPart(c0_)) {
1444     RegExp::Flags flag = RegExp::kNone;
1445     switch (c0_) {
1446       case 'g':
1447         flag = RegExp::kGlobal;
1448         break;
1449       case 'i':
1450         flag = RegExp::kIgnoreCase;
1451         break;
1452       case 'm':
1453         flag = RegExp::kMultiline;
1454         break;
1455       case 'u':
1456         flag = RegExp::kUnicode;
1457         break;
1458       case 'y':
1459         flag = RegExp::kSticky;
1460         break;
1461       default:
1462         return Nothing<RegExp::Flags>();
1463     }
1464     if (flags & flag) return Nothing<RegExp::Flags>();
1465     AddLiteralCharAdvance();
1466     flags |= flag;
1467   }
1468   literal.Complete();
1469 
1470   next_.location.end_pos = source_pos();
1471   return Just(RegExp::Flags(flags));
1472 }
1473 
1474 
CurrentSymbol(AstValueFactory * ast_value_factory)1475 const AstRawString* Scanner::CurrentSymbol(AstValueFactory* ast_value_factory) {
1476   if (is_literal_one_byte()) {
1477     return ast_value_factory->GetOneByteString(literal_one_byte_string());
1478   }
1479   return ast_value_factory->GetTwoByteString(literal_two_byte_string());
1480 }
1481 
1482 
NextSymbol(AstValueFactory * ast_value_factory)1483 const AstRawString* Scanner::NextSymbol(AstValueFactory* ast_value_factory) {
1484   if (is_next_literal_one_byte()) {
1485     return ast_value_factory->GetOneByteString(next_literal_one_byte_string());
1486   }
1487   return ast_value_factory->GetTwoByteString(next_literal_two_byte_string());
1488 }
1489 
1490 
CurrentRawSymbol(AstValueFactory * ast_value_factory)1491 const AstRawString* Scanner::CurrentRawSymbol(
1492     AstValueFactory* ast_value_factory) {
1493   if (is_raw_literal_one_byte()) {
1494     return ast_value_factory->GetOneByteString(raw_literal_one_byte_string());
1495   }
1496   return ast_value_factory->GetTwoByteString(raw_literal_two_byte_string());
1497 }
1498 
1499 
DoubleValue()1500 double Scanner::DoubleValue() {
1501   DCHECK(is_literal_one_byte());
1502   return StringToDouble(
1503       unicode_cache_,
1504       literal_one_byte_string(),
1505       ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY);
1506 }
1507 
1508 
ContainsDot()1509 bool Scanner::ContainsDot() {
1510   DCHECK(is_literal_one_byte());
1511   Vector<const uint8_t> str = literal_one_byte_string();
1512   return std::find(str.begin(), str.end(), '.') != str.end();
1513 }
1514 
1515 
FindSymbol(DuplicateFinder * finder,int value)1516 int Scanner::FindSymbol(DuplicateFinder* finder, int value) {
1517   if (is_literal_one_byte()) {
1518     return finder->AddOneByteSymbol(literal_one_byte_string(), value);
1519   }
1520   return finder->AddTwoByteSymbol(literal_two_byte_string(), value);
1521 }
1522 
1523 
SetBookmark()1524 bool Scanner::SetBookmark() {
1525   if (c0_ != kNoBookmark && bookmark_c0_ == kNoBookmark &&
1526       next_next_.token == Token::UNINITIALIZED && source_->SetBookmark()) {
1527     bookmark_c0_ = c0_;
1528     CopyTokenDesc(&bookmark_current_, &current_);
1529     CopyTokenDesc(&bookmark_next_, &next_);
1530     return true;
1531   }
1532   return false;
1533 }
1534 
1535 
ResetToBookmark()1536 void Scanner::ResetToBookmark() {
1537   DCHECK(BookmarkHasBeenSet());  // Caller hasn't called SetBookmark.
1538 
1539   source_->ResetToBookmark();
1540   c0_ = bookmark_c0_;
1541   StartLiteral();
1542   StartRawLiteral();
1543   CopyTokenDesc(&next_, &bookmark_current_);
1544   current_ = next_;
1545   StartLiteral();
1546   StartRawLiteral();
1547   CopyTokenDesc(&next_, &bookmark_next_);
1548 
1549   bookmark_c0_ = kBookmarkWasApplied;
1550 }
1551 
1552 
BookmarkHasBeenSet()1553 bool Scanner::BookmarkHasBeenSet() { return bookmark_c0_ >= 0; }
1554 
1555 
BookmarkHasBeenReset()1556 bool Scanner::BookmarkHasBeenReset() {
1557   return bookmark_c0_ == kBookmarkWasApplied;
1558 }
1559 
1560 
DropBookmark()1561 void Scanner::DropBookmark() { bookmark_c0_ = kNoBookmark; }
1562 
1563 
CopyTokenDesc(TokenDesc * to,TokenDesc * from)1564 void Scanner::CopyTokenDesc(TokenDesc* to, TokenDesc* from) {
1565   DCHECK_NOT_NULL(to);
1566   DCHECK_NOT_NULL(from);
1567   to->token = from->token;
1568   to->location = from->location;
1569   to->literal_chars->CopyFrom(from->literal_chars);
1570   to->raw_literal_chars->CopyFrom(from->raw_literal_chars);
1571 }
1572 
1573 
AddOneByteSymbol(Vector<const uint8_t> key,int value)1574 int DuplicateFinder::AddOneByteSymbol(Vector<const uint8_t> key, int value) {
1575   return AddSymbol(key, true, value);
1576 }
1577 
1578 
AddTwoByteSymbol(Vector<const uint16_t> key,int value)1579 int DuplicateFinder::AddTwoByteSymbol(Vector<const uint16_t> key, int value) {
1580   return AddSymbol(Vector<const uint8_t>::cast(key), false, value);
1581 }
1582 
1583 
AddSymbol(Vector<const uint8_t> key,bool is_one_byte,int value)1584 int DuplicateFinder::AddSymbol(Vector<const uint8_t> key,
1585                                bool is_one_byte,
1586                                int value) {
1587   uint32_t hash = Hash(key, is_one_byte);
1588   byte* encoding = BackupKey(key, is_one_byte);
1589   base::HashMap::Entry* entry = map_.LookupOrInsert(encoding, hash);
1590   int old_value = static_cast<int>(reinterpret_cast<intptr_t>(entry->value));
1591   entry->value =
1592     reinterpret_cast<void*>(static_cast<intptr_t>(value | old_value));
1593   return old_value;
1594 }
1595 
1596 
AddNumber(Vector<const uint8_t> key,int value)1597 int DuplicateFinder::AddNumber(Vector<const uint8_t> key, int value) {
1598   DCHECK(key.length() > 0);
1599   // Quick check for already being in canonical form.
1600   if (IsNumberCanonical(key)) {
1601     return AddOneByteSymbol(key, value);
1602   }
1603 
1604   int flags = ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY;
1605   double double_value = StringToDouble(
1606       unicode_constants_, key, flags, 0.0);
1607   int length;
1608   const char* string;
1609   if (!std::isfinite(double_value)) {
1610     string = "Infinity";
1611     length = 8;  // strlen("Infinity");
1612   } else {
1613     string = DoubleToCString(double_value,
1614                              Vector<char>(number_buffer_, kBufferSize));
1615     length = StrLength(string);
1616   }
1617   return AddSymbol(Vector<const byte>(reinterpret_cast<const byte*>(string),
1618                                       length), true, value);
1619 }
1620 
1621 
IsNumberCanonical(Vector<const uint8_t> number)1622 bool DuplicateFinder::IsNumberCanonical(Vector<const uint8_t> number) {
1623   // Test for a safe approximation of number literals that are already
1624   // in canonical form: max 15 digits, no leading zeroes, except an
1625   // integer part that is a single zero, and no trailing zeros below
1626   // the decimal point.
1627   int pos = 0;
1628   int length = number.length();
1629   if (number.length() > 15) return false;
1630   if (number[pos] == '0') {
1631     pos++;
1632   } else {
1633     while (pos < length &&
1634            static_cast<unsigned>(number[pos] - '0') <= ('9' - '0')) pos++;
1635   }
1636   if (length == pos) return true;
1637   if (number[pos] != '.') return false;
1638   pos++;
1639   bool invalid_last_digit = true;
1640   while (pos < length) {
1641     uint8_t digit = number[pos] - '0';
1642     if (digit > '9' - '0') return false;
1643     invalid_last_digit = (digit == 0);
1644     pos++;
1645   }
1646   return !invalid_last_digit;
1647 }
1648 
1649 
Hash(Vector<const uint8_t> key,bool is_one_byte)1650 uint32_t DuplicateFinder::Hash(Vector<const uint8_t> key, bool is_one_byte) {
1651   // Primitive hash function, almost identical to the one used
1652   // for strings (except that it's seeded by the length and representation).
1653   int length = key.length();
1654   uint32_t hash = (length << 1) | (is_one_byte ? 1 : 0);
1655   for (int i = 0; i < length; i++) {
1656     uint32_t c = key[i];
1657     hash = (hash + c) * 1025;
1658     hash ^= (hash >> 6);
1659   }
1660   return hash;
1661 }
1662 
1663 
Match(void * first,void * second)1664 bool DuplicateFinder::Match(void* first, void* second) {
1665   // Decode lengths.
1666   // Length + representation is encoded as base 128, most significant heptet
1667   // first, with a 8th bit being non-zero while there are more heptets.
1668   // The value encodes the number of bytes following, and whether the original
1669   // was Latin1.
1670   byte* s1 = reinterpret_cast<byte*>(first);
1671   byte* s2 = reinterpret_cast<byte*>(second);
1672   uint32_t length_one_byte_field = 0;
1673   byte c1;
1674   do {
1675     c1 = *s1;
1676     if (c1 != *s2) return false;
1677     length_one_byte_field = (length_one_byte_field << 7) | (c1 & 0x7f);
1678     s1++;
1679     s2++;
1680   } while ((c1 & 0x80) != 0);
1681   int length = static_cast<int>(length_one_byte_field >> 1);
1682   return memcmp(s1, s2, length) == 0;
1683 }
1684 
1685 
BackupKey(Vector<const uint8_t> bytes,bool is_one_byte)1686 byte* DuplicateFinder::BackupKey(Vector<const uint8_t> bytes,
1687                                  bool is_one_byte) {
1688   uint32_t one_byte_length = (bytes.length() << 1) | (is_one_byte ? 1 : 0);
1689   backing_store_.StartSequence();
1690   // Emit one_byte_length as base-128 encoded number, with the 7th bit set
1691   // on the byte of every heptet except the last, least significant, one.
1692   if (one_byte_length >= (1 << 7)) {
1693     if (one_byte_length >= (1 << 14)) {
1694       if (one_byte_length >= (1 << 21)) {
1695         if (one_byte_length >= (1 << 28)) {
1696           backing_store_.Add(
1697               static_cast<uint8_t>((one_byte_length >> 28) | 0x80));
1698         }
1699         backing_store_.Add(
1700             static_cast<uint8_t>((one_byte_length >> 21) | 0x80u));
1701       }
1702       backing_store_.Add(
1703           static_cast<uint8_t>((one_byte_length >> 14) | 0x80u));
1704     }
1705     backing_store_.Add(static_cast<uint8_t>((one_byte_length >> 7) | 0x80u));
1706   }
1707   backing_store_.Add(static_cast<uint8_t>(one_byte_length & 0x7f));
1708 
1709   backing_store_.AddBlock(bytes);
1710   return backing_store_.EndSequence().start();
1711 }
1712 
1713 }  // namespace internal
1714 }  // namespace v8
1715