• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Features shared by parsing and pre-parsing scanners.
6 
7 #include <cmath>
8 
9 #include "src/scanner.h"
10 
11 #include "include/v8stdint.h"
12 #include "src/char-predicates-inl.h"
13 #include "src/conversions-inl.h"
14 #include "src/list-inl.h"
15 #include "src/v8.h"
16 #include "src/parser.h"
17 
18 namespace v8 {
19 namespace internal {
20 
21 // ----------------------------------------------------------------------------
22 // Scanner
23 
Scanner(UnicodeCache * unicode_cache)24 Scanner::Scanner(UnicodeCache* unicode_cache)
25     : unicode_cache_(unicode_cache),
26       octal_pos_(Location::invalid()),
27       harmony_scoping_(false),
28       harmony_modules_(false),
29       harmony_numeric_literals_(false) { }
30 
31 
Initialize(Utf16CharacterStream * source)32 void Scanner::Initialize(Utf16CharacterStream* source) {
33   source_ = source;
34   // Need to capture identifiers in order to recognize "get" and "set"
35   // in object literals.
36   Init();
37   // Skip initial whitespace allowing HTML comment ends just like
38   // after a newline and scan first token.
39   has_line_terminator_before_next_ = true;
40   SkipWhiteSpace();
41   Scan();
42 }
43 
44 
ScanHexNumber(int expected_length)45 uc32 Scanner::ScanHexNumber(int expected_length) {
46   ASSERT(expected_length <= 4);  // prevent overflow
47 
48   uc32 digits[4] = { 0, 0, 0, 0 };
49   uc32 x = 0;
50   for (int i = 0; i < expected_length; i++) {
51     digits[i] = c0_;
52     int d = HexValue(c0_);
53     if (d < 0) {
54       // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
55       // should be illegal, but other JS VMs just return the
56       // non-escaped version of the original character.
57 
58       // Push back digits that we have advanced past.
59       for (int j = i-1; j >= 0; j--) {
60         PushBack(digits[j]);
61       }
62       return -1;
63     }
64     x = x * 16 + d;
65     Advance();
66   }
67 
68   return x;
69 }
70 
71 
72 // Ensure that tokens can be stored in a byte.
73 STATIC_ASSERT(Token::NUM_TOKENS <= 0x100);
74 
75 // Table of one-character tokens, by character (0x00..0x7f only).
76 static const byte one_char_tokens[] = {
77   Token::ILLEGAL,
78   Token::ILLEGAL,
79   Token::ILLEGAL,
80   Token::ILLEGAL,
81   Token::ILLEGAL,
82   Token::ILLEGAL,
83   Token::ILLEGAL,
84   Token::ILLEGAL,
85   Token::ILLEGAL,
86   Token::ILLEGAL,
87   Token::ILLEGAL,
88   Token::ILLEGAL,
89   Token::ILLEGAL,
90   Token::ILLEGAL,
91   Token::ILLEGAL,
92   Token::ILLEGAL,
93   Token::ILLEGAL,
94   Token::ILLEGAL,
95   Token::ILLEGAL,
96   Token::ILLEGAL,
97   Token::ILLEGAL,
98   Token::ILLEGAL,
99   Token::ILLEGAL,
100   Token::ILLEGAL,
101   Token::ILLEGAL,
102   Token::ILLEGAL,
103   Token::ILLEGAL,
104   Token::ILLEGAL,
105   Token::ILLEGAL,
106   Token::ILLEGAL,
107   Token::ILLEGAL,
108   Token::ILLEGAL,
109   Token::ILLEGAL,
110   Token::ILLEGAL,
111   Token::ILLEGAL,
112   Token::ILLEGAL,
113   Token::ILLEGAL,
114   Token::ILLEGAL,
115   Token::ILLEGAL,
116   Token::ILLEGAL,
117   Token::LPAREN,       // 0x28
118   Token::RPAREN,       // 0x29
119   Token::ILLEGAL,
120   Token::ILLEGAL,
121   Token::COMMA,        // 0x2c
122   Token::ILLEGAL,
123   Token::ILLEGAL,
124   Token::ILLEGAL,
125   Token::ILLEGAL,
126   Token::ILLEGAL,
127   Token::ILLEGAL,
128   Token::ILLEGAL,
129   Token::ILLEGAL,
130   Token::ILLEGAL,
131   Token::ILLEGAL,
132   Token::ILLEGAL,
133   Token::ILLEGAL,
134   Token::ILLEGAL,
135   Token::COLON,        // 0x3a
136   Token::SEMICOLON,    // 0x3b
137   Token::ILLEGAL,
138   Token::ILLEGAL,
139   Token::ILLEGAL,
140   Token::CONDITIONAL,  // 0x3f
141   Token::ILLEGAL,
142   Token::ILLEGAL,
143   Token::ILLEGAL,
144   Token::ILLEGAL,
145   Token::ILLEGAL,
146   Token::ILLEGAL,
147   Token::ILLEGAL,
148   Token::ILLEGAL,
149   Token::ILLEGAL,
150   Token::ILLEGAL,
151   Token::ILLEGAL,
152   Token::ILLEGAL,
153   Token::ILLEGAL,
154   Token::ILLEGAL,
155   Token::ILLEGAL,
156   Token::ILLEGAL,
157   Token::ILLEGAL,
158   Token::ILLEGAL,
159   Token::ILLEGAL,
160   Token::ILLEGAL,
161   Token::ILLEGAL,
162   Token::ILLEGAL,
163   Token::ILLEGAL,
164   Token::ILLEGAL,
165   Token::ILLEGAL,
166   Token::ILLEGAL,
167   Token::ILLEGAL,
168   Token::LBRACK,     // 0x5b
169   Token::ILLEGAL,
170   Token::RBRACK,     // 0x5d
171   Token::ILLEGAL,
172   Token::ILLEGAL,
173   Token::ILLEGAL,
174   Token::ILLEGAL,
175   Token::ILLEGAL,
176   Token::ILLEGAL,
177   Token::ILLEGAL,
178   Token::ILLEGAL,
179   Token::ILLEGAL,
180   Token::ILLEGAL,
181   Token::ILLEGAL,
182   Token::ILLEGAL,
183   Token::ILLEGAL,
184   Token::ILLEGAL,
185   Token::ILLEGAL,
186   Token::ILLEGAL,
187   Token::ILLEGAL,
188   Token::ILLEGAL,
189   Token::ILLEGAL,
190   Token::ILLEGAL,
191   Token::ILLEGAL,
192   Token::ILLEGAL,
193   Token::ILLEGAL,
194   Token::ILLEGAL,
195   Token::ILLEGAL,
196   Token::ILLEGAL,
197   Token::ILLEGAL,
198   Token::ILLEGAL,
199   Token::ILLEGAL,
200   Token::LBRACE,       // 0x7b
201   Token::ILLEGAL,
202   Token::RBRACE,       // 0x7d
203   Token::BIT_NOT,      // 0x7e
204   Token::ILLEGAL
205 };
206 
207 
Next()208 Token::Value Scanner::Next() {
209   current_ = next_;
210   has_line_terminator_before_next_ = false;
211   has_multiline_comment_before_next_ = false;
212   if (static_cast<unsigned>(c0_) <= 0x7f) {
213     Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]);
214     if (token != Token::ILLEGAL) {
215       int pos = source_pos();
216       next_.token = token;
217       next_.location.beg_pos = pos;
218       next_.location.end_pos = pos + 1;
219       Advance();
220       return current_.token;
221     }
222   }
223   Scan();
224   return current_.token;
225 }
226 
227 
228 // TODO(yangguo): check whether this is actually necessary.
IsLittleEndianByteOrderMark(uc32 c)229 static inline bool IsLittleEndianByteOrderMark(uc32 c) {
230   // The Unicode value U+FFFE is guaranteed never to be assigned as a
231   // Unicode character; this implies that in a Unicode context the
232   // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
233   // character expressed in little-endian byte order (since it could
234   // not be a U+FFFE character expressed in big-endian byte
235   // order). Nevertheless, we check for it to be compatible with
236   // Spidermonkey.
237   return c == 0xFFFE;
238 }
239 
240 
SkipWhiteSpace()241 bool Scanner::SkipWhiteSpace() {
242   int start_position = source_pos();
243 
244   while (true) {
245     while (true) {
246       // Advance as long as character is a WhiteSpace or LineTerminator.
247       // Remember if the latter is the case.
248       if (unicode_cache_->IsLineTerminator(c0_)) {
249         has_line_terminator_before_next_ = true;
250       } else if (!unicode_cache_->IsWhiteSpace(c0_) &&
251                  !IsLittleEndianByteOrderMark(c0_)) {
252         break;
253       }
254       Advance();
255     }
256 
257     // If there is an HTML comment end '-->' at the beginning of a
258     // line (with only whitespace in front of it), we treat the rest
259     // of the line as a comment. This is in line with the way
260     // SpiderMonkey handles it.
261     if (c0_ == '-' && has_line_terminator_before_next_) {
262       Advance();
263       if (c0_ == '-') {
264         Advance();
265         if (c0_ == '>') {
266           // Treat the rest of the line as a comment.
267           SkipSingleLineComment();
268           // Continue skipping white space after the comment.
269           continue;
270         }
271         PushBack('-');  // undo Advance()
272       }
273       PushBack('-');  // undo Advance()
274     }
275     // Return whether or not we skipped any characters.
276     return source_pos() != start_position;
277   }
278 }
279 
280 
SkipSingleLineComment()281 Token::Value Scanner::SkipSingleLineComment() {
282   Advance();
283 
284   // The line terminator at the end of the line is not considered
285   // to be part of the single-line comment; it is recognized
286   // separately by the lexical grammar and becomes part of the
287   // stream of input elements for the syntactic grammar (see
288   // ECMA-262, section 7.4).
289   while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
290     Advance();
291   }
292 
293   return Token::WHITESPACE;
294 }
295 
296 
SkipMultiLineComment()297 Token::Value Scanner::SkipMultiLineComment() {
298   ASSERT(c0_ == '*');
299   Advance();
300 
301   while (c0_ >= 0) {
302     uc32 ch = c0_;
303     Advance();
304     if (unicode_cache_->IsLineTerminator(ch)) {
305       // Following ECMA-262, section 7.4, a comment containing
306       // a newline will make the comment count as a line-terminator.
307       has_multiline_comment_before_next_ = true;
308     }
309     // If we have reached the end of the multi-line comment, we
310     // consume the '/' and insert a whitespace. This way all
311     // multi-line comments are treated as whitespace.
312     if (ch == '*' && c0_ == '/') {
313       c0_ = ' ';
314       return Token::WHITESPACE;
315     }
316   }
317 
318   // Unterminated multi-line comment.
319   return Token::ILLEGAL;
320 }
321 
322 
ScanHtmlComment()323 Token::Value Scanner::ScanHtmlComment() {
324   // Check for <!-- comments.
325   ASSERT(c0_ == '!');
326   Advance();
327   if (c0_ == '-') {
328     Advance();
329     if (c0_ == '-') return SkipSingleLineComment();
330     PushBack('-');  // undo Advance()
331   }
332   PushBack('!');  // undo Advance()
333   ASSERT(c0_ == '!');
334   return Token::LT;
335 }
336 
337 
Scan()338 void Scanner::Scan() {
339   next_.literal_chars = NULL;
340   Token::Value token;
341   do {
342     // Remember the position of the next token
343     next_.location.beg_pos = source_pos();
344 
345     switch (c0_) {
346       case ' ':
347       case '\t':
348         Advance();
349         token = Token::WHITESPACE;
350         break;
351 
352       case '\n':
353         Advance();
354         has_line_terminator_before_next_ = true;
355         token = Token::WHITESPACE;
356         break;
357 
358       case '"': case '\'':
359         token = ScanString();
360         break;
361 
362       case '<':
363         // < <= << <<= <!--
364         Advance();
365         if (c0_ == '=') {
366           token = Select(Token::LTE);
367         } else if (c0_ == '<') {
368           token = Select('=', Token::ASSIGN_SHL, Token::SHL);
369         } else if (c0_ == '!') {
370           token = ScanHtmlComment();
371         } else {
372           token = Token::LT;
373         }
374         break;
375 
376       case '>':
377         // > >= >> >>= >>> >>>=
378         Advance();
379         if (c0_ == '=') {
380           token = Select(Token::GTE);
381         } else if (c0_ == '>') {
382           // >> >>= >>> >>>=
383           Advance();
384           if (c0_ == '=') {
385             token = Select(Token::ASSIGN_SAR);
386           } else if (c0_ == '>') {
387             token = Select('=', Token::ASSIGN_SHR, Token::SHR);
388           } else {
389             token = Token::SAR;
390           }
391         } else {
392           token = Token::GT;
393         }
394         break;
395 
396       case '=':
397         // = == ===
398         Advance();
399         if (c0_ == '=') {
400           token = Select('=', Token::EQ_STRICT, Token::EQ);
401         } else {
402           token = Token::ASSIGN;
403         }
404         break;
405 
406       case '!':
407         // ! != !==
408         Advance();
409         if (c0_ == '=') {
410           token = Select('=', Token::NE_STRICT, Token::NE);
411         } else {
412           token = Token::NOT;
413         }
414         break;
415 
416       case '+':
417         // + ++ +=
418         Advance();
419         if (c0_ == '+') {
420           token = Select(Token::INC);
421         } else if (c0_ == '=') {
422           token = Select(Token::ASSIGN_ADD);
423         } else {
424           token = Token::ADD;
425         }
426         break;
427 
428       case '-':
429         // - -- --> -=
430         Advance();
431         if (c0_ == '-') {
432           Advance();
433           if (c0_ == '>' && has_line_terminator_before_next_) {
434             // For compatibility with SpiderMonkey, we skip lines that
435             // start with an HTML comment end '-->'.
436             token = SkipSingleLineComment();
437           } else {
438             token = Token::DEC;
439           }
440         } else if (c0_ == '=') {
441           token = Select(Token::ASSIGN_SUB);
442         } else {
443           token = Token::SUB;
444         }
445         break;
446 
447       case '*':
448         // * *=
449         token = Select('=', Token::ASSIGN_MUL, Token::MUL);
450         break;
451 
452       case '%':
453         // % %=
454         token = Select('=', Token::ASSIGN_MOD, Token::MOD);
455         break;
456 
457       case '/':
458         // /  // /* /=
459         Advance();
460         if (c0_ == '/') {
461           token = SkipSingleLineComment();
462         } else if (c0_ == '*') {
463           token = SkipMultiLineComment();
464         } else if (c0_ == '=') {
465           token = Select(Token::ASSIGN_DIV);
466         } else {
467           token = Token::DIV;
468         }
469         break;
470 
471       case '&':
472         // & && &=
473         Advance();
474         if (c0_ == '&') {
475           token = Select(Token::AND);
476         } else if (c0_ == '=') {
477           token = Select(Token::ASSIGN_BIT_AND);
478         } else {
479           token = Token::BIT_AND;
480         }
481         break;
482 
483       case '|':
484         // | || |=
485         Advance();
486         if (c0_ == '|') {
487           token = Select(Token::OR);
488         } else if (c0_ == '=') {
489           token = Select(Token::ASSIGN_BIT_OR);
490         } else {
491           token = Token::BIT_OR;
492         }
493         break;
494 
495       case '^':
496         // ^ ^=
497         token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
498         break;
499 
500       case '.':
501         // . Number
502         Advance();
503         if (IsDecimalDigit(c0_)) {
504           token = ScanNumber(true);
505         } else {
506           token = Token::PERIOD;
507         }
508         break;
509 
510       case ':':
511         token = Select(Token::COLON);
512         break;
513 
514       case ';':
515         token = Select(Token::SEMICOLON);
516         break;
517 
518       case ',':
519         token = Select(Token::COMMA);
520         break;
521 
522       case '(':
523         token = Select(Token::LPAREN);
524         break;
525 
526       case ')':
527         token = Select(Token::RPAREN);
528         break;
529 
530       case '[':
531         token = Select(Token::LBRACK);
532         break;
533 
534       case ']':
535         token = Select(Token::RBRACK);
536         break;
537 
538       case '{':
539         token = Select(Token::LBRACE);
540         break;
541 
542       case '}':
543         token = Select(Token::RBRACE);
544         break;
545 
546       case '?':
547         token = Select(Token::CONDITIONAL);
548         break;
549 
550       case '~':
551         token = Select(Token::BIT_NOT);
552         break;
553 
554       default:
555         if (unicode_cache_->IsIdentifierStart(c0_)) {
556           token = ScanIdentifierOrKeyword();
557         } else if (IsDecimalDigit(c0_)) {
558           token = ScanNumber(false);
559         } else if (SkipWhiteSpace()) {
560           token = Token::WHITESPACE;
561         } else if (c0_ < 0) {
562           token = Token::EOS;
563         } else {
564           token = Select(Token::ILLEGAL);
565         }
566         break;
567     }
568 
569     // Continue scanning for tokens as long as we're just skipping
570     // whitespace.
571   } while (token == Token::WHITESPACE);
572 
573   next_.location.end_pos = source_pos();
574   next_.token = token;
575 }
576 
577 
SeekForward(int pos)578 void Scanner::SeekForward(int pos) {
579   // After this call, we will have the token at the given position as
580   // the "next" token. The "current" token will be invalid.
581   if (pos == next_.location.beg_pos) return;
582   int current_pos = source_pos();
583   ASSERT_EQ(next_.location.end_pos, current_pos);
584   // Positions inside the lookahead token aren't supported.
585   ASSERT(pos >= current_pos);
586   if (pos != current_pos) {
587     source_->SeekForward(pos - source_->pos());
588     Advance();
589     // This function is only called to seek to the location
590     // of the end of a function (at the "}" token). It doesn't matter
591     // whether there was a line terminator in the part we skip.
592     has_line_terminator_before_next_ = false;
593     has_multiline_comment_before_next_ = false;
594   }
595   Scan();
596 }
597 
598 
ScanEscape()599 bool Scanner::ScanEscape() {
600   uc32 c = c0_;
601   Advance();
602 
603   // Skip escaped newlines.
604   if (unicode_cache_->IsLineTerminator(c)) {
605     // Allow CR+LF newlines in multiline string literals.
606     if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
607     // Allow LF+CR newlines in multiline string literals.
608     if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
609     return true;
610   }
611 
612   switch (c) {
613     case '\'':  // fall through
614     case '"' :  // fall through
615     case '\\': break;
616     case 'b' : c = '\b'; break;
617     case 'f' : c = '\f'; break;
618     case 'n' : c = '\n'; break;
619     case 'r' : c = '\r'; break;
620     case 't' : c = '\t'; break;
621     case 'u' : {
622       c = ScanHexNumber(4);
623       if (c < 0) return false;
624       break;
625     }
626     case 'v' : c = '\v'; break;
627     case 'x' : {
628       c = ScanHexNumber(2);
629       if (c < 0) return false;
630       break;
631     }
632     case '0' :  // fall through
633     case '1' :  // fall through
634     case '2' :  // fall through
635     case '3' :  // fall through
636     case '4' :  // fall through
637     case '5' :  // fall through
638     case '6' :  // fall through
639     case '7' : c = ScanOctalEscape(c, 2); break;
640   }
641 
642   // According to ECMA-262, section 7.8.4, characters not covered by the
643   // above cases should be illegal, but they are commonly handled as
644   // non-escaped characters by JS VMs.
645   AddLiteralChar(c);
646   return true;
647 }
648 
649 
650 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of
651 // ECMA-262. Other JS VMs support them.
ScanOctalEscape(uc32 c,int length)652 uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
653   uc32 x = c - '0';
654   int i = 0;
655   for (; i < length; i++) {
656     int d = c0_ - '0';
657     if (d < 0 || d > 7) break;
658     int nx = x * 8 + d;
659     if (nx >= 256) break;
660     x = nx;
661     Advance();
662   }
663   // Anything except '\0' is an octal escape sequence, illegal in strict mode.
664   // Remember the position of octal escape sequences so that an error
665   // can be reported later (in strict mode).
666   // We don't report the error immediately, because the octal escape can
667   // occur before the "use strict" directive.
668   if (c != '0' || i > 0) {
669     octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
670   }
671   return x;
672 }
673 
674 
ScanString()675 Token::Value Scanner::ScanString() {
676   uc32 quote = c0_;
677   Advance();  // consume quote
678 
679   LiteralScope literal(this);
680   while (c0_ != quote && c0_ >= 0
681          && !unicode_cache_->IsLineTerminator(c0_)) {
682     uc32 c = c0_;
683     Advance();
684     if (c == '\\') {
685       if (c0_ < 0 || !ScanEscape()) return Token::ILLEGAL;
686     } else {
687       AddLiteralChar(c);
688     }
689   }
690   if (c0_ != quote) return Token::ILLEGAL;
691   literal.Complete();
692 
693   Advance();  // consume quote
694   return Token::STRING;
695 }
696 
697 
ScanDecimalDigits()698 void Scanner::ScanDecimalDigits() {
699   while (IsDecimalDigit(c0_))
700     AddLiteralCharAdvance();
701 }
702 
703 
ScanNumber(bool seen_period)704 Token::Value Scanner::ScanNumber(bool seen_period) {
705   ASSERT(IsDecimalDigit(c0_));  // the first digit of the number or the fraction
706 
707   enum { DECIMAL, HEX, OCTAL, IMPLICIT_OCTAL, BINARY } kind = DECIMAL;
708 
709   LiteralScope literal(this);
710   if (seen_period) {
711     // we have already seen a decimal point of the float
712     AddLiteralChar('.');
713     ScanDecimalDigits();  // we know we have at least one digit
714 
715   } else {
716     // if the first character is '0' we must check for octals and hex
717     if (c0_ == '0') {
718       int start_pos = source_pos();  // For reporting octal positions.
719       AddLiteralCharAdvance();
720 
721       // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or
722       // an octal number.
723       if (c0_ == 'x' || c0_ == 'X') {
724         // hex number
725         kind = HEX;
726         AddLiteralCharAdvance();
727         if (!IsHexDigit(c0_)) {
728           // we must have at least one hex digit after 'x'/'X'
729           return Token::ILLEGAL;
730         }
731         while (IsHexDigit(c0_)) {
732           AddLiteralCharAdvance();
733         }
734       } else if (harmony_numeric_literals_ && (c0_ == 'o' || c0_ == 'O')) {
735         kind = OCTAL;
736         AddLiteralCharAdvance();
737         if (!IsOctalDigit(c0_)) {
738           // we must have at least one octal digit after 'o'/'O'
739           return Token::ILLEGAL;
740         }
741         while (IsOctalDigit(c0_)) {
742           AddLiteralCharAdvance();
743         }
744       } else if (harmony_numeric_literals_ && (c0_ == 'b' || c0_ == 'B')) {
745         kind = BINARY;
746         AddLiteralCharAdvance();
747         if (!IsBinaryDigit(c0_)) {
748           // we must have at least one binary digit after 'b'/'B'
749           return Token::ILLEGAL;
750         }
751         while (IsBinaryDigit(c0_)) {
752           AddLiteralCharAdvance();
753         }
754       } else if ('0' <= c0_ && c0_ <= '7') {
755         // (possible) octal number
756         kind = IMPLICIT_OCTAL;
757         while (true) {
758           if (c0_ == '8' || c0_ == '9') {
759             kind = DECIMAL;
760             break;
761           }
762           if (c0_  < '0' || '7'  < c0_) {
763             // Octal literal finished.
764             octal_pos_ = Location(start_pos, source_pos());
765             break;
766           }
767           AddLiteralCharAdvance();
768         }
769       }
770     }
771 
772     // Parse decimal digits and allow trailing fractional part.
773     if (kind == DECIMAL) {
774       ScanDecimalDigits();  // optional
775       if (c0_ == '.') {
776         AddLiteralCharAdvance();
777         ScanDecimalDigits();  // optional
778       }
779     }
780   }
781 
782   // scan exponent, if any
783   if (c0_ == 'e' || c0_ == 'E') {
784     ASSERT(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
785     if (kind != DECIMAL) return Token::ILLEGAL;
786     // scan exponent
787     AddLiteralCharAdvance();
788     if (c0_ == '+' || c0_ == '-')
789       AddLiteralCharAdvance();
790     if (!IsDecimalDigit(c0_)) {
791       // we must have at least one decimal digit after 'e'/'E'
792       return Token::ILLEGAL;
793     }
794     ScanDecimalDigits();
795   }
796 
797   // The source character immediately following a numeric literal must
798   // not be an identifier start or a decimal digit; see ECMA-262
799   // section 7.8.3, page 17 (note that we read only one decimal digit
800   // if the value is 0).
801   if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_))
802     return Token::ILLEGAL;
803 
804   literal.Complete();
805 
806   return Token::NUMBER;
807 }
808 
809 
ScanIdentifierUnicodeEscape()810 uc32 Scanner::ScanIdentifierUnicodeEscape() {
811   Advance();
812   if (c0_ != 'u') return -1;
813   Advance();
814   uc32 result = ScanHexNumber(4);
815   if (result < 0) PushBack('u');
816   return result;
817 }
818 
819 
820 // ----------------------------------------------------------------------------
821 // Keyword Matcher
822 
// List of all keywords, grouped by their first character.
// KEYWORD_GROUP(ch) is expanded once in front of each group of keywords
// that start with |ch|; KEYWORD(keyword, token) is expanded once for
// every keyword. Some token expressions refer to the names
// harmony_scoping and harmony_modules, which must be in scope wherever
// this list is expanded.
#define KEYWORDS(KEYWORD_GROUP, KEYWORD) \
  KEYWORD_GROUP('b') \
  KEYWORD("break", Token::BREAK) \
  KEYWORD_GROUP('c') \
  KEYWORD("case", Token::CASE) \
  KEYWORD("catch", Token::CATCH) \
  KEYWORD("class", Token::FUTURE_RESERVED_WORD) \
  KEYWORD("const", Token::CONST) \
  KEYWORD("continue", Token::CONTINUE) \
  KEYWORD_GROUP('d') \
  KEYWORD("debugger", Token::DEBUGGER) \
  KEYWORD("default", Token::DEFAULT) \
  KEYWORD("delete", Token::DELETE) \
  KEYWORD("do", Token::DO) \
  KEYWORD_GROUP('e') \
  KEYWORD("else", Token::ELSE) \
  KEYWORD("enum", Token::FUTURE_RESERVED_WORD) \
  KEYWORD("export", \
          harmony_modules ? Token::EXPORT : Token::FUTURE_RESERVED_WORD) \
  KEYWORD("extends", Token::FUTURE_RESERVED_WORD) \
  KEYWORD_GROUP('f') \
  KEYWORD("false", Token::FALSE_LITERAL) \
  KEYWORD("finally", Token::FINALLY) \
  KEYWORD("for", Token::FOR) \
  KEYWORD("function", Token::FUNCTION) \
  KEYWORD_GROUP('i') \
  KEYWORD("if", Token::IF) \
  KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("import", \
          harmony_modules ? Token::IMPORT : Token::FUTURE_RESERVED_WORD) \
  KEYWORD("in", Token::IN) \
  KEYWORD("instanceof", Token::INSTANCEOF) \
  KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD_GROUP('l') \
  KEYWORD("let", \
          harmony_scoping ? Token::LET : Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD_GROUP('n') \
  KEYWORD("new", Token::NEW) \
  KEYWORD("null", Token::NULL_LITERAL) \
  KEYWORD_GROUP('p') \
  KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD_GROUP('r') \
  KEYWORD("return", Token::RETURN) \
  KEYWORD_GROUP('s') \
  KEYWORD("static", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("super", Token::FUTURE_RESERVED_WORD) \
  KEYWORD("switch", Token::SWITCH) \
  KEYWORD_GROUP('t') \
  KEYWORD("this", Token::THIS) \
  KEYWORD("throw", Token::THROW) \
  KEYWORD("true", Token::TRUE_LITERAL) \
  KEYWORD("try", Token::TRY) \
  KEYWORD("typeof", Token::TYPEOF) \
  KEYWORD_GROUP('v') \
  KEYWORD("var", Token::VAR) \
  KEYWORD("void", Token::VOID) \
  KEYWORD_GROUP('w') \
  KEYWORD("while", Token::WHILE) \
  KEYWORD("with", Token::WITH) \
  KEYWORD_GROUP('y') \
  KEYWORD("yield", Token::YIELD)
887 
888 
KeywordOrIdentifierToken(const uint8_t * input,int input_length,bool harmony_scoping,bool harmony_modules)889 static Token::Value KeywordOrIdentifierToken(const uint8_t* input,
890                                              int input_length,
891                                              bool harmony_scoping,
892                                              bool harmony_modules) {
893   ASSERT(input_length >= 1);
894   const int kMinLength = 2;
895   const int kMaxLength = 10;
896   if (input_length < kMinLength || input_length > kMaxLength) {
897     return Token::IDENTIFIER;
898   }
899   switch (input[0]) {
900     default:
901 #define KEYWORD_GROUP_CASE(ch)                                \
902       break;                                                  \
903     case ch:
904 #define KEYWORD(keyword, token)                               \
905     {                                                         \
906       /* 'keyword' is a char array, so sizeof(keyword) is */  \
907       /* strlen(keyword) plus 1 for the NUL char. */          \
908       const int keyword_length = sizeof(keyword) - 1;         \
909       STATIC_ASSERT(keyword_length >= kMinLength);            \
910       STATIC_ASSERT(keyword_length <= kMaxLength);            \
911       if (input_length == keyword_length &&                   \
912           input[1] == keyword[1] &&                           \
913           (keyword_length <= 2 || input[2] == keyword[2]) &&  \
914           (keyword_length <= 3 || input[3] == keyword[3]) &&  \
915           (keyword_length <= 4 || input[4] == keyword[4]) &&  \
916           (keyword_length <= 5 || input[5] == keyword[5]) &&  \
917           (keyword_length <= 6 || input[6] == keyword[6]) &&  \
918           (keyword_length <= 7 || input[7] == keyword[7]) &&  \
919           (keyword_length <= 8 || input[8] == keyword[8]) &&  \
920           (keyword_length <= 9 || input[9] == keyword[9])) {  \
921         return token;                                         \
922       }                                                       \
923     }
924     KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
925   }
926   return Token::IDENTIFIER;
927 }
928 
929 
ScanIdentifierOrKeyword()930 Token::Value Scanner::ScanIdentifierOrKeyword() {
931   ASSERT(unicode_cache_->IsIdentifierStart(c0_));
932   LiteralScope literal(this);
933   // Scan identifier start character.
934   if (c0_ == '\\') {
935     uc32 c = ScanIdentifierUnicodeEscape();
936     // Only allow legal identifier start characters.
937     if (c < 0 ||
938         c == '\\' ||  // No recursive escapes.
939         !unicode_cache_->IsIdentifierStart(c)) {
940       return Token::ILLEGAL;
941     }
942     AddLiteralChar(c);
943     return ScanIdentifierSuffix(&literal);
944   }
945 
946   uc32 first_char = c0_;
947   Advance();
948   AddLiteralChar(first_char);
949 
950   // Scan the rest of the identifier characters.
951   while (unicode_cache_->IsIdentifierPart(c0_)) {
952     if (c0_ != '\\') {
953       uc32 next_char = c0_;
954       Advance();
955       AddLiteralChar(next_char);
956       continue;
957     }
958     // Fallthrough if no longer able to complete keyword.
959     return ScanIdentifierSuffix(&literal);
960   }
961 
962   literal.Complete();
963 
964   if (next_.literal_chars->is_one_byte()) {
965     Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
966     return KeywordOrIdentifierToken(chars.start(),
967                                     chars.length(),
968                                     harmony_scoping_,
969                                     harmony_modules_);
970   }
971 
972   return Token::IDENTIFIER;
973 }
974 
975 
ScanIdentifierSuffix(LiteralScope * literal)976 Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal) {
977   // Scan the rest of the identifier characters.
978   while (unicode_cache_->IsIdentifierPart(c0_)) {
979     if (c0_ == '\\') {
980       uc32 c = ScanIdentifierUnicodeEscape();
981       // Only allow legal identifier part characters.
982       if (c < 0 ||
983           c == '\\' ||
984           !unicode_cache_->IsIdentifierPart(c)) {
985         return Token::ILLEGAL;
986       }
987       AddLiteralChar(c);
988     } else {
989       AddLiteralChar(c0_);
990       Advance();
991     }
992   }
993   literal->Complete();
994 
995   return Token::IDENTIFIER;
996 }
997 
998 
ScanRegExpPattern(bool seen_equal)999 bool Scanner::ScanRegExpPattern(bool seen_equal) {
1000   // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
1001   bool in_character_class = false;
1002 
1003   // Previous token is either '/' or '/=', in the second case, the
1004   // pattern starts at =.
1005   next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
1006   next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
1007 
1008   // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
1009   // the scanner should pass uninterpreted bodies to the RegExp
1010   // constructor.
1011   LiteralScope literal(this);
1012   if (seen_equal) {
1013     AddLiteralChar('=');
1014   }
1015 
1016   while (c0_ != '/' || in_character_class) {
1017     if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
1018     if (c0_ == '\\') {  // Escape sequence.
1019       AddLiteralCharAdvance();
1020       if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
1021       AddLiteralCharAdvance();
1022       // If the escape allows more characters, i.e., \x??, \u????, or \c?,
1023       // only "safe" characters are allowed (letters, digits, underscore),
1024       // otherwise the escape isn't valid and the invalid character has
1025       // its normal meaning. I.e., we can just continue scanning without
1026       // worrying whether the following characters are part of the escape
1027       // or not, since any '/', '\\' or '[' is guaranteed to not be part
1028       // of the escape sequence.
1029 
1030       // TODO(896): At some point, parse RegExps more throughly to capture
1031       // octal esacpes in strict mode.
1032     } else {  // Unescaped character.
1033       if (c0_ == '[') in_character_class = true;
1034       if (c0_ == ']') in_character_class = false;
1035       AddLiteralCharAdvance();
1036     }
1037   }
1038   Advance();  // consume '/'
1039 
1040   literal.Complete();
1041 
1042   return true;
1043 }
1044 
1045 
ScanLiteralUnicodeEscape()1046 bool Scanner::ScanLiteralUnicodeEscape() {
1047   ASSERT(c0_ == '\\');
1048   uc32 chars_read[6] = {'\\', 'u', 0, 0, 0, 0};
1049   Advance();
1050   int i = 1;
1051   if (c0_ == 'u') {
1052     i++;
1053     while (i < 6) {
1054       Advance();
1055       if (!IsHexDigit(c0_)) break;
1056       chars_read[i] = c0_;
1057       i++;
1058     }
1059   }
1060   if (i < 6) {
1061     // Incomplete escape. Undo all advances and return false.
1062     while (i > 0) {
1063       i--;
1064       PushBack(chars_read[i]);
1065     }
1066     return false;
1067   }
1068   // Complete escape. Add all chars to current literal buffer.
1069   for (int i = 0; i < 6; i++) {
1070     AddLiteralChar(chars_read[i]);
1071   }
1072   return true;
1073 }
1074 
1075 
ScanRegExpFlags()1076 bool Scanner::ScanRegExpFlags() {
1077   // Scan regular expression flags.
1078   LiteralScope literal(this);
1079   while (unicode_cache_->IsIdentifierPart(c0_)) {
1080     if (c0_ != '\\') {
1081       AddLiteralCharAdvance();
1082     } else {
1083       if (!ScanLiteralUnicodeEscape()) {
1084         break;
1085       }
1086       Advance();
1087     }
1088   }
1089   literal.Complete();
1090 
1091   next_.location.end_pos = source_pos() - 1;
1092   return true;
1093 }
1094 
1095 
AllocateNextLiteralString(Isolate * isolate,PretenureFlag tenured)1096 Handle<String> Scanner::AllocateNextLiteralString(Isolate* isolate,
1097                                                   PretenureFlag tenured) {
1098   if (is_next_literal_one_byte()) {
1099     return isolate->factory()->NewStringFromOneByte(
1100         next_literal_one_byte_string(), tenured).ToHandleChecked();
1101   } else {
1102     return isolate->factory()->NewStringFromTwoByte(
1103         next_literal_two_byte_string(), tenured).ToHandleChecked();
1104   }
1105 }
1106 
1107 
AllocateInternalizedString(Isolate * isolate)1108 Handle<String> Scanner::AllocateInternalizedString(Isolate* isolate) {
1109   if (is_literal_one_byte()) {
1110     return isolate->factory()->InternalizeOneByteString(
1111         literal_one_byte_string());
1112   } else {
1113     return isolate->factory()->InternalizeTwoByteString(
1114         literal_two_byte_string());
1115   }
1116 }
1117 
1118 
DoubleValue()1119 double Scanner::DoubleValue() {
1120   ASSERT(is_literal_one_byte());
1121   return StringToDouble(
1122       unicode_cache_,
1123       literal_one_byte_string(),
1124       ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY);
1125 }
1126 
1127 
FindNumber(DuplicateFinder * finder,int value)1128 int Scanner::FindNumber(DuplicateFinder* finder, int value) {
1129   return finder->AddNumber(literal_one_byte_string(), value);
1130 }
1131 
1132 
FindSymbol(DuplicateFinder * finder,int value)1133 int Scanner::FindSymbol(DuplicateFinder* finder, int value) {
1134   if (is_literal_one_byte()) {
1135     return finder->AddOneByteSymbol(literal_one_byte_string(), value);
1136   }
1137   return finder->AddTwoByteSymbol(literal_two_byte_string(), value);
1138 }
1139 
1140 
AddOneByteSymbol(Vector<const uint8_t> key,int value)1141 int DuplicateFinder::AddOneByteSymbol(Vector<const uint8_t> key, int value) {
1142   return AddSymbol(key, true, value);
1143 }
1144 
1145 
AddTwoByteSymbol(Vector<const uint16_t> key,int value)1146 int DuplicateFinder::AddTwoByteSymbol(Vector<const uint16_t> key, int value) {
1147   return AddSymbol(Vector<const uint8_t>::cast(key), false, value);
1148 }
1149 
1150 
AddSymbol(Vector<const uint8_t> key,bool is_one_byte,int value)1151 int DuplicateFinder::AddSymbol(Vector<const uint8_t> key,
1152                                bool is_one_byte,
1153                                int value) {
1154   uint32_t hash = Hash(key, is_one_byte);
1155   byte* encoding = BackupKey(key, is_one_byte);
1156   HashMap::Entry* entry = map_.Lookup(encoding, hash, true);
1157   int old_value = static_cast<int>(reinterpret_cast<intptr_t>(entry->value));
1158   entry->value =
1159     reinterpret_cast<void*>(static_cast<intptr_t>(value | old_value));
1160   return old_value;
1161 }
1162 
1163 
AddNumber(Vector<const uint8_t> key,int value)1164 int DuplicateFinder::AddNumber(Vector<const uint8_t> key, int value) {
1165   ASSERT(key.length() > 0);
1166   // Quick check for already being in canonical form.
1167   if (IsNumberCanonical(key)) {
1168     return AddOneByteSymbol(key, value);
1169   }
1170 
1171   int flags = ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY;
1172   double double_value = StringToDouble(
1173       unicode_constants_, key, flags, 0.0);
1174   int length;
1175   const char* string;
1176   if (!std::isfinite(double_value)) {
1177     string = "Infinity";
1178     length = 8;  // strlen("Infinity");
1179   } else {
1180     string = DoubleToCString(double_value,
1181                              Vector<char>(number_buffer_, kBufferSize));
1182     length = StrLength(string);
1183   }
1184   return AddSymbol(Vector<const byte>(reinterpret_cast<const byte*>(string),
1185                                       length), true, value);
1186 }
1187 
1188 
IsNumberCanonical(Vector<const uint8_t> number)1189 bool DuplicateFinder::IsNumberCanonical(Vector<const uint8_t> number) {
1190   // Test for a safe approximation of number literals that are already
1191   // in canonical form: max 15 digits, no leading zeroes, except an
1192   // integer part that is a single zero, and no trailing zeros below
1193   // the decimal point.
1194   int pos = 0;
1195   int length = number.length();
1196   if (number.length() > 15) return false;
1197   if (number[pos] == '0') {
1198     pos++;
1199   } else {
1200     while (pos < length &&
1201            static_cast<unsigned>(number[pos] - '0') <= ('9' - '0')) pos++;
1202   }
1203   if (length == pos) return true;
1204   if (number[pos] != '.') return false;
1205   pos++;
1206   bool invalid_last_digit = true;
1207   while (pos < length) {
1208     uint8_t digit = number[pos] - '0';
1209     if (digit > '9' - '0') return false;
1210     invalid_last_digit = (digit == 0);
1211     pos++;
1212   }
1213   return !invalid_last_digit;
1214 }
1215 
1216 
Hash(Vector<const uint8_t> key,bool is_one_byte)1217 uint32_t DuplicateFinder::Hash(Vector<const uint8_t> key, bool is_one_byte) {
1218   // Primitive hash function, almost identical to the one used
1219   // for strings (except that it's seeded by the length and ASCII-ness).
1220   int length = key.length();
1221   uint32_t hash = (length << 1) | (is_one_byte ? 1 : 0) ;
1222   for (int i = 0; i < length; i++) {
1223     uint32_t c = key[i];
1224     hash = (hash + c) * 1025;
1225     hash ^= (hash >> 6);
1226   }
1227   return hash;
1228 }
1229 
1230 
Match(void * first,void * second)1231 bool DuplicateFinder::Match(void* first, void* second) {
1232   // Decode lengths.
1233   // Length + ASCII-bit is encoded as base 128, most significant heptet first,
1234   // with a 8th bit being non-zero while there are more heptets.
1235   // The value encodes the number of bytes following, and whether the original
1236   // was ASCII.
1237   byte* s1 = reinterpret_cast<byte*>(first);
1238   byte* s2 = reinterpret_cast<byte*>(second);
1239   uint32_t length_one_byte_field = 0;
1240   byte c1;
1241   do {
1242     c1 = *s1;
1243     if (c1 != *s2) return false;
1244     length_one_byte_field = (length_one_byte_field << 7) | (c1 & 0x7f);
1245     s1++;
1246     s2++;
1247   } while ((c1 & 0x80) != 0);
1248   int length = static_cast<int>(length_one_byte_field >> 1);
1249   return memcmp(s1, s2, length) == 0;
1250 }
1251 
1252 
BackupKey(Vector<const uint8_t> bytes,bool is_one_byte)1253 byte* DuplicateFinder::BackupKey(Vector<const uint8_t> bytes,
1254                                  bool is_one_byte) {
1255   uint32_t one_byte_length = (bytes.length() << 1) | (is_one_byte ? 1 : 0);
1256   backing_store_.StartSequence();
1257   // Emit one_byte_length as base-128 encoded number, with the 7th bit set
1258   // on the byte of every heptet except the last, least significant, one.
1259   if (one_byte_length >= (1 << 7)) {
1260     if (one_byte_length >= (1 << 14)) {
1261       if (one_byte_length >= (1 << 21)) {
1262         if (one_byte_length >= (1 << 28)) {
1263           backing_store_.Add(
1264               static_cast<uint8_t>((one_byte_length >> 28) | 0x80));
1265         }
1266         backing_store_.Add(
1267             static_cast<uint8_t>((one_byte_length >> 21) | 0x80u));
1268       }
1269       backing_store_.Add(
1270           static_cast<uint8_t>((one_byte_length >> 14) | 0x80u));
1271     }
1272     backing_store_.Add(static_cast<uint8_t>((one_byte_length >> 7) | 0x80u));
1273   }
1274   backing_store_.Add(static_cast<uint8_t>(one_byte_length & 0x7f));
1275 
1276   backing_store_.AddBlock(bytes);
1277   return backing_store_.EndSequence().start();
1278 }
1279 
1280 } }  // namespace v8::internal
1281