• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are
4 // met:
5 //
6 //     * Redistributions of source code must retain the above copyright
7 //       notice, this list of conditions and the following disclaimer.
8 //     * Redistributions in binary form must reproduce the above
9 //       copyright notice, this list of conditions and the following
10 //       disclaimer in the documentation and/or other materials provided
11 //       with the distribution.
12 //     * Neither the name of Google Inc. nor the names of its
13 //       contributors may be used to endorse or promote products derived
14 //       from this software without specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 
28 // Features shared by parsing and pre-parsing scanners.
29 
30 #include <cmath>
31 
32 #include "scanner.h"
33 
34 #include "../include/v8stdint.h"
35 #include "char-predicates-inl.h"
36 #include "conversions-inl.h"
37 #include "list-inl.h"
38 
39 namespace v8 {
40 namespace internal {
41 
42 // ----------------------------------------------------------------------------
43 // Scanner
44 
Scanner(UnicodeCache * unicode_cache)45 Scanner::Scanner(UnicodeCache* unicode_cache)
46     : unicode_cache_(unicode_cache),
47       octal_pos_(Location::invalid()),
48       harmony_scoping_(false),
49       harmony_modules_(false),
50       harmony_numeric_literals_(false) { }
51 
52 
Initialize(Utf16CharacterStream * source)53 void Scanner::Initialize(Utf16CharacterStream* source) {
54   source_ = source;
55   // Need to capture identifiers in order to recognize "get" and "set"
56   // in object literals.
57   Init();
58   // Skip initial whitespace allowing HTML comment ends just like
59   // after a newline and scan first token.
60   has_line_terminator_before_next_ = true;
61   SkipWhiteSpace();
62   Scan();
63 }
64 
65 
ScanHexNumber(int expected_length)66 uc32 Scanner::ScanHexNumber(int expected_length) {
67   ASSERT(expected_length <= 4);  // prevent overflow
68 
69   uc32 digits[4] = { 0, 0, 0, 0 };
70   uc32 x = 0;
71   for (int i = 0; i < expected_length; i++) {
72     digits[i] = c0_;
73     int d = HexValue(c0_);
74     if (d < 0) {
75       // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
76       // should be illegal, but other JS VMs just return the
77       // non-escaped version of the original character.
78 
79       // Push back digits that we have advanced past.
80       for (int j = i-1; j >= 0; j--) {
81         PushBack(digits[j]);
82       }
83       return -1;
84     }
85     x = x * 16 + d;
86     Advance();
87   }
88 
89   return x;
90 }
91 
92 
93 // Ensure that tokens can be stored in a byte.
94 STATIC_ASSERT(Token::NUM_TOKENS <= 0x100);
95 
96 // Table of one-character tokens, by character (0x00..0x7f only).
97 static const byte one_char_tokens[] = {
98   Token::ILLEGAL,
99   Token::ILLEGAL,
100   Token::ILLEGAL,
101   Token::ILLEGAL,
102   Token::ILLEGAL,
103   Token::ILLEGAL,
104   Token::ILLEGAL,
105   Token::ILLEGAL,
106   Token::ILLEGAL,
107   Token::ILLEGAL,
108   Token::ILLEGAL,
109   Token::ILLEGAL,
110   Token::ILLEGAL,
111   Token::ILLEGAL,
112   Token::ILLEGAL,
113   Token::ILLEGAL,
114   Token::ILLEGAL,
115   Token::ILLEGAL,
116   Token::ILLEGAL,
117   Token::ILLEGAL,
118   Token::ILLEGAL,
119   Token::ILLEGAL,
120   Token::ILLEGAL,
121   Token::ILLEGAL,
122   Token::ILLEGAL,
123   Token::ILLEGAL,
124   Token::ILLEGAL,
125   Token::ILLEGAL,
126   Token::ILLEGAL,
127   Token::ILLEGAL,
128   Token::ILLEGAL,
129   Token::ILLEGAL,
130   Token::ILLEGAL,
131   Token::ILLEGAL,
132   Token::ILLEGAL,
133   Token::ILLEGAL,
134   Token::ILLEGAL,
135   Token::ILLEGAL,
136   Token::ILLEGAL,
137   Token::ILLEGAL,
138   Token::LPAREN,       // 0x28
139   Token::RPAREN,       // 0x29
140   Token::ILLEGAL,
141   Token::ILLEGAL,
142   Token::COMMA,        // 0x2c
143   Token::ILLEGAL,
144   Token::ILLEGAL,
145   Token::ILLEGAL,
146   Token::ILLEGAL,
147   Token::ILLEGAL,
148   Token::ILLEGAL,
149   Token::ILLEGAL,
150   Token::ILLEGAL,
151   Token::ILLEGAL,
152   Token::ILLEGAL,
153   Token::ILLEGAL,
154   Token::ILLEGAL,
155   Token::ILLEGAL,
156   Token::COLON,        // 0x3a
157   Token::SEMICOLON,    // 0x3b
158   Token::ILLEGAL,
159   Token::ILLEGAL,
160   Token::ILLEGAL,
161   Token::CONDITIONAL,  // 0x3f
162   Token::ILLEGAL,
163   Token::ILLEGAL,
164   Token::ILLEGAL,
165   Token::ILLEGAL,
166   Token::ILLEGAL,
167   Token::ILLEGAL,
168   Token::ILLEGAL,
169   Token::ILLEGAL,
170   Token::ILLEGAL,
171   Token::ILLEGAL,
172   Token::ILLEGAL,
173   Token::ILLEGAL,
174   Token::ILLEGAL,
175   Token::ILLEGAL,
176   Token::ILLEGAL,
177   Token::ILLEGAL,
178   Token::ILLEGAL,
179   Token::ILLEGAL,
180   Token::ILLEGAL,
181   Token::ILLEGAL,
182   Token::ILLEGAL,
183   Token::ILLEGAL,
184   Token::ILLEGAL,
185   Token::ILLEGAL,
186   Token::ILLEGAL,
187   Token::ILLEGAL,
188   Token::ILLEGAL,
189   Token::LBRACK,     // 0x5b
190   Token::ILLEGAL,
191   Token::RBRACK,     // 0x5d
192   Token::ILLEGAL,
193   Token::ILLEGAL,
194   Token::ILLEGAL,
195   Token::ILLEGAL,
196   Token::ILLEGAL,
197   Token::ILLEGAL,
198   Token::ILLEGAL,
199   Token::ILLEGAL,
200   Token::ILLEGAL,
201   Token::ILLEGAL,
202   Token::ILLEGAL,
203   Token::ILLEGAL,
204   Token::ILLEGAL,
205   Token::ILLEGAL,
206   Token::ILLEGAL,
207   Token::ILLEGAL,
208   Token::ILLEGAL,
209   Token::ILLEGAL,
210   Token::ILLEGAL,
211   Token::ILLEGAL,
212   Token::ILLEGAL,
213   Token::ILLEGAL,
214   Token::ILLEGAL,
215   Token::ILLEGAL,
216   Token::ILLEGAL,
217   Token::ILLEGAL,
218   Token::ILLEGAL,
219   Token::ILLEGAL,
220   Token::ILLEGAL,
221   Token::LBRACE,       // 0x7b
222   Token::ILLEGAL,
223   Token::RBRACE,       // 0x7d
224   Token::BIT_NOT,      // 0x7e
225   Token::ILLEGAL
226 };
227 
228 
Next()229 Token::Value Scanner::Next() {
230   current_ = next_;
231   has_line_terminator_before_next_ = false;
232   has_multiline_comment_before_next_ = false;
233   if (static_cast<unsigned>(c0_) <= 0x7f) {
234     Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]);
235     if (token != Token::ILLEGAL) {
236       int pos = source_pos();
237       next_.token = token;
238       next_.location.beg_pos = pos;
239       next_.location.end_pos = pos + 1;
240       Advance();
241       return current_.token;
242     }
243   }
244   Scan();
245   return current_.token;
246 }
247 
248 
IsByteOrderMark(uc32 c)249 static inline bool IsByteOrderMark(uc32 c) {
250   // The Unicode value U+FFFE is guaranteed never to be assigned as a
251   // Unicode character; this implies that in a Unicode context the
252   // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
253   // character expressed in little-endian byte order (since it could
254   // not be a U+FFFE character expressed in big-endian byte
255   // order). Nevertheless, we check for it to be compatible with
256   // Spidermonkey.
257   return c == 0xFEFF || c == 0xFFFE;
258 }
259 
260 
SkipWhiteSpace()261 bool Scanner::SkipWhiteSpace() {
262   int start_position = source_pos();
263 
264   while (true) {
265     // We treat byte-order marks (BOMs) as whitespace for better
266     // compatibility with Spidermonkey and other JavaScript engines.
267     while (unicode_cache_->IsWhiteSpace(c0_) || IsByteOrderMark(c0_)) {
268       // IsWhiteSpace() includes line terminators!
269       if (unicode_cache_->IsLineTerminator(c0_)) {
270         // Ignore line terminators, but remember them. This is necessary
271         // for automatic semicolon insertion.
272         has_line_terminator_before_next_ = true;
273       }
274       Advance();
275     }
276 
277     // If there is an HTML comment end '-->' at the beginning of a
278     // line (with only whitespace in front of it), we treat the rest
279     // of the line as a comment. This is in line with the way
280     // SpiderMonkey handles it.
281     if (c0_ == '-' && has_line_terminator_before_next_) {
282       Advance();
283       if (c0_ == '-') {
284         Advance();
285         if (c0_ == '>') {
286           // Treat the rest of the line as a comment.
287           SkipSingleLineComment();
288           // Continue skipping white space after the comment.
289           continue;
290         }
291         PushBack('-');  // undo Advance()
292       }
293       PushBack('-');  // undo Advance()
294     }
295     // Return whether or not we skipped any characters.
296     return source_pos() != start_position;
297   }
298 }
299 
300 
SkipSingleLineComment()301 Token::Value Scanner::SkipSingleLineComment() {
302   Advance();
303 
304   // The line terminator at the end of the line is not considered
305   // to be part of the single-line comment; it is recognized
306   // separately by the lexical grammar and becomes part of the
307   // stream of input elements for the syntactic grammar (see
308   // ECMA-262, section 7.4).
309   while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
310     Advance();
311   }
312 
313   return Token::WHITESPACE;
314 }
315 
316 
SkipMultiLineComment()317 Token::Value Scanner::SkipMultiLineComment() {
318   ASSERT(c0_ == '*');
319   Advance();
320 
321   while (c0_ >= 0) {
322     uc32 ch = c0_;
323     Advance();
324     if (unicode_cache_->IsLineTerminator(ch)) {
325       // Following ECMA-262, section 7.4, a comment containing
326       // a newline will make the comment count as a line-terminator.
327       has_multiline_comment_before_next_ = true;
328     }
329     // If we have reached the end of the multi-line comment, we
330     // consume the '/' and insert a whitespace. This way all
331     // multi-line comments are treated as whitespace.
332     if (ch == '*' && c0_ == '/') {
333       c0_ = ' ';
334       return Token::WHITESPACE;
335     }
336   }
337 
338   // Unterminated multi-line comment.
339   return Token::ILLEGAL;
340 }
341 
342 
ScanHtmlComment()343 Token::Value Scanner::ScanHtmlComment() {
344   // Check for <!-- comments.
345   ASSERT(c0_ == '!');
346   Advance();
347   if (c0_ == '-') {
348     Advance();
349     if (c0_ == '-') return SkipSingleLineComment();
350     PushBack('-');  // undo Advance()
351   }
352   PushBack('!');  // undo Advance()
353   ASSERT(c0_ == '!');
354   return Token::LT;
355 }
356 
357 
Scan()358 void Scanner::Scan() {
359   next_.literal_chars = NULL;
360   Token::Value token;
361   do {
362     // Remember the position of the next token
363     next_.location.beg_pos = source_pos();
364 
365     switch (c0_) {
366       case ' ':
367       case '\t':
368         Advance();
369         token = Token::WHITESPACE;
370         break;
371 
372       case '\n':
373         Advance();
374         has_line_terminator_before_next_ = true;
375         token = Token::WHITESPACE;
376         break;
377 
378       case '"': case '\'':
379         token = ScanString();
380         break;
381 
382       case '<':
383         // < <= << <<= <!--
384         Advance();
385         if (c0_ == '=') {
386           token = Select(Token::LTE);
387         } else if (c0_ == '<') {
388           token = Select('=', Token::ASSIGN_SHL, Token::SHL);
389         } else if (c0_ == '!') {
390           token = ScanHtmlComment();
391         } else {
392           token = Token::LT;
393         }
394         break;
395 
396       case '>':
397         // > >= >> >>= >>> >>>=
398         Advance();
399         if (c0_ == '=') {
400           token = Select(Token::GTE);
401         } else if (c0_ == '>') {
402           // >> >>= >>> >>>=
403           Advance();
404           if (c0_ == '=') {
405             token = Select(Token::ASSIGN_SAR);
406           } else if (c0_ == '>') {
407             token = Select('=', Token::ASSIGN_SHR, Token::SHR);
408           } else {
409             token = Token::SAR;
410           }
411         } else {
412           token = Token::GT;
413         }
414         break;
415 
416       case '=':
417         // = == ===
418         Advance();
419         if (c0_ == '=') {
420           token = Select('=', Token::EQ_STRICT, Token::EQ);
421         } else {
422           token = Token::ASSIGN;
423         }
424         break;
425 
426       case '!':
427         // ! != !==
428         Advance();
429         if (c0_ == '=') {
430           token = Select('=', Token::NE_STRICT, Token::NE);
431         } else {
432           token = Token::NOT;
433         }
434         break;
435 
436       case '+':
437         // + ++ +=
438         Advance();
439         if (c0_ == '+') {
440           token = Select(Token::INC);
441         } else if (c0_ == '=') {
442           token = Select(Token::ASSIGN_ADD);
443         } else {
444           token = Token::ADD;
445         }
446         break;
447 
448       case '-':
449         // - -- --> -=
450         Advance();
451         if (c0_ == '-') {
452           Advance();
453           if (c0_ == '>' && has_line_terminator_before_next_) {
454             // For compatibility with SpiderMonkey, we skip lines that
455             // start with an HTML comment end '-->'.
456             token = SkipSingleLineComment();
457           } else {
458             token = Token::DEC;
459           }
460         } else if (c0_ == '=') {
461           token = Select(Token::ASSIGN_SUB);
462         } else {
463           token = Token::SUB;
464         }
465         break;
466 
467       case '*':
468         // * *=
469         token = Select('=', Token::ASSIGN_MUL, Token::MUL);
470         break;
471 
472       case '%':
473         // % %=
474         token = Select('=', Token::ASSIGN_MOD, Token::MOD);
475         break;
476 
477       case '/':
478         // /  // /* /=
479         Advance();
480         if (c0_ == '/') {
481           token = SkipSingleLineComment();
482         } else if (c0_ == '*') {
483           token = SkipMultiLineComment();
484         } else if (c0_ == '=') {
485           token = Select(Token::ASSIGN_DIV);
486         } else {
487           token = Token::DIV;
488         }
489         break;
490 
491       case '&':
492         // & && &=
493         Advance();
494         if (c0_ == '&') {
495           token = Select(Token::AND);
496         } else if (c0_ == '=') {
497           token = Select(Token::ASSIGN_BIT_AND);
498         } else {
499           token = Token::BIT_AND;
500         }
501         break;
502 
503       case '|':
504         // | || |=
505         Advance();
506         if (c0_ == '|') {
507           token = Select(Token::OR);
508         } else if (c0_ == '=') {
509           token = Select(Token::ASSIGN_BIT_OR);
510         } else {
511           token = Token::BIT_OR;
512         }
513         break;
514 
515       case '^':
516         // ^ ^=
517         token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
518         break;
519 
520       case '.':
521         // . Number
522         Advance();
523         if (IsDecimalDigit(c0_)) {
524           token = ScanNumber(true);
525         } else {
526           token = Token::PERIOD;
527         }
528         break;
529 
530       case ':':
531         token = Select(Token::COLON);
532         break;
533 
534       case ';':
535         token = Select(Token::SEMICOLON);
536         break;
537 
538       case ',':
539         token = Select(Token::COMMA);
540         break;
541 
542       case '(':
543         token = Select(Token::LPAREN);
544         break;
545 
546       case ')':
547         token = Select(Token::RPAREN);
548         break;
549 
550       case '[':
551         token = Select(Token::LBRACK);
552         break;
553 
554       case ']':
555         token = Select(Token::RBRACK);
556         break;
557 
558       case '{':
559         token = Select(Token::LBRACE);
560         break;
561 
562       case '}':
563         token = Select(Token::RBRACE);
564         break;
565 
566       case '?':
567         token = Select(Token::CONDITIONAL);
568         break;
569 
570       case '~':
571         token = Select(Token::BIT_NOT);
572         break;
573 
574       default:
575         if (unicode_cache_->IsIdentifierStart(c0_)) {
576           token = ScanIdentifierOrKeyword();
577         } else if (IsDecimalDigit(c0_)) {
578           token = ScanNumber(false);
579         } else if (SkipWhiteSpace()) {
580           token = Token::WHITESPACE;
581         } else if (c0_ < 0) {
582           token = Token::EOS;
583         } else {
584           token = Select(Token::ILLEGAL);
585         }
586         break;
587     }
588 
589     // Continue scanning for tokens as long as we're just skipping
590     // whitespace.
591   } while (token == Token::WHITESPACE);
592 
593   next_.location.end_pos = source_pos();
594   next_.token = token;
595 }
596 
597 
SeekForward(int pos)598 void Scanner::SeekForward(int pos) {
599   // After this call, we will have the token at the given position as
600   // the "next" token. The "current" token will be invalid.
601   if (pos == next_.location.beg_pos) return;
602   int current_pos = source_pos();
603   ASSERT_EQ(next_.location.end_pos, current_pos);
604   // Positions inside the lookahead token aren't supported.
605   ASSERT(pos >= current_pos);
606   if (pos != current_pos) {
607     source_->SeekForward(pos - source_->pos());
608     Advance();
609     // This function is only called to seek to the location
610     // of the end of a function (at the "}" token). It doesn't matter
611     // whether there was a line terminator in the part we skip.
612     has_line_terminator_before_next_ = false;
613     has_multiline_comment_before_next_ = false;
614   }
615   Scan();
616 }
617 
618 
ScanEscape()619 bool Scanner::ScanEscape() {
620   uc32 c = c0_;
621   Advance();
622 
623   // Skip escaped newlines.
624   if (unicode_cache_->IsLineTerminator(c)) {
625     // Allow CR+LF newlines in multiline string literals.
626     if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
627     // Allow LF+CR newlines in multiline string literals.
628     if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
629     return true;
630   }
631 
632   switch (c) {
633     case '\'':  // fall through
634     case '"' :  // fall through
635     case '\\': break;
636     case 'b' : c = '\b'; break;
637     case 'f' : c = '\f'; break;
638     case 'n' : c = '\n'; break;
639     case 'r' : c = '\r'; break;
640     case 't' : c = '\t'; break;
641     case 'u' : {
642       c = ScanHexNumber(4);
643       if (c < 0) return false;
644       break;
645     }
646     case 'v' : c = '\v'; break;
647     case 'x' : {
648       c = ScanHexNumber(2);
649       if (c < 0) return false;
650       break;
651     }
652     case '0' :  // fall through
653     case '1' :  // fall through
654     case '2' :  // fall through
655     case '3' :  // fall through
656     case '4' :  // fall through
657     case '5' :  // fall through
658     case '6' :  // fall through
659     case '7' : c = ScanOctalEscape(c, 2); break;
660   }
661 
662   // According to ECMA-262, section 7.8.4, characters not covered by the
663   // above cases should be illegal, but they are commonly handled as
664   // non-escaped characters by JS VMs.
665   AddLiteralChar(c);
666   return true;
667 }
668 
669 
670 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of
671 // ECMA-262. Other JS VMs support them.
ScanOctalEscape(uc32 c,int length)672 uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
673   uc32 x = c - '0';
674   int i = 0;
675   for (; i < length; i++) {
676     int d = c0_ - '0';
677     if (d < 0 || d > 7) break;
678     int nx = x * 8 + d;
679     if (nx >= 256) break;
680     x = nx;
681     Advance();
682   }
683   // Anything except '\0' is an octal escape sequence, illegal in strict mode.
684   // Remember the position of octal escape sequences so that an error
685   // can be reported later (in strict mode).
686   // We don't report the error immediately, because the octal escape can
687   // occur before the "use strict" directive.
688   if (c != '0' || i > 0) {
689     octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
690   }
691   return x;
692 }
693 
694 
ScanString()695 Token::Value Scanner::ScanString() {
696   uc32 quote = c0_;
697   Advance();  // consume quote
698 
699   LiteralScope literal(this);
700   while (c0_ != quote && c0_ >= 0
701          && !unicode_cache_->IsLineTerminator(c0_)) {
702     uc32 c = c0_;
703     Advance();
704     if (c == '\\') {
705       if (c0_ < 0 || !ScanEscape()) return Token::ILLEGAL;
706     } else {
707       AddLiteralChar(c);
708     }
709   }
710   if (c0_ != quote) return Token::ILLEGAL;
711   literal.Complete();
712 
713   Advance();  // consume quote
714   return Token::STRING;
715 }
716 
717 
ScanDecimalDigits()718 void Scanner::ScanDecimalDigits() {
719   while (IsDecimalDigit(c0_))
720     AddLiteralCharAdvance();
721 }
722 
723 
ScanNumber(bool seen_period)724 Token::Value Scanner::ScanNumber(bool seen_period) {
725   ASSERT(IsDecimalDigit(c0_));  // the first digit of the number or the fraction
726 
727   enum { DECIMAL, HEX, OCTAL, IMPLICIT_OCTAL, BINARY } kind = DECIMAL;
728 
729   LiteralScope literal(this);
730   if (seen_period) {
731     // we have already seen a decimal point of the float
732     AddLiteralChar('.');
733     ScanDecimalDigits();  // we know we have at least one digit
734 
735   } else {
736     // if the first character is '0' we must check for octals and hex
737     if (c0_ == '0') {
738       int start_pos = source_pos();  // For reporting octal positions.
739       AddLiteralCharAdvance();
740 
741       // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or
742       // an octal number.
743       if (c0_ == 'x' || c0_ == 'X') {
744         // hex number
745         kind = HEX;
746         AddLiteralCharAdvance();
747         if (!IsHexDigit(c0_)) {
748           // we must have at least one hex digit after 'x'/'X'
749           return Token::ILLEGAL;
750         }
751         while (IsHexDigit(c0_)) {
752           AddLiteralCharAdvance();
753         }
754       } else if (harmony_numeric_literals_ && (c0_ == 'o' || c0_ == 'O')) {
755         kind = OCTAL;
756         AddLiteralCharAdvance();
757         if (!IsOctalDigit(c0_)) {
758           // we must have at least one octal digit after 'o'/'O'
759           return Token::ILLEGAL;
760         }
761         while (IsOctalDigit(c0_)) {
762           AddLiteralCharAdvance();
763         }
764       } else if (harmony_numeric_literals_ && (c0_ == 'b' || c0_ == 'B')) {
765         kind = BINARY;
766         AddLiteralCharAdvance();
767         if (!IsBinaryDigit(c0_)) {
768           // we must have at least one binary digit after 'b'/'B'
769           return Token::ILLEGAL;
770         }
771         while (IsBinaryDigit(c0_)) {
772           AddLiteralCharAdvance();
773         }
774       } else if ('0' <= c0_ && c0_ <= '7') {
775         // (possible) octal number
776         kind = IMPLICIT_OCTAL;
777         while (true) {
778           if (c0_ == '8' || c0_ == '9') {
779             kind = DECIMAL;
780             break;
781           }
782           if (c0_  < '0' || '7'  < c0_) {
783             // Octal literal finished.
784             octal_pos_ = Location(start_pos, source_pos());
785             break;
786           }
787           AddLiteralCharAdvance();
788         }
789       }
790     }
791 
792     // Parse decimal digits and allow trailing fractional part.
793     if (kind == DECIMAL) {
794       ScanDecimalDigits();  // optional
795       if (c0_ == '.') {
796         AddLiteralCharAdvance();
797         ScanDecimalDigits();  // optional
798       }
799     }
800   }
801 
802   // scan exponent, if any
803   if (c0_ == 'e' || c0_ == 'E') {
804     ASSERT(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
805     if (kind != DECIMAL) return Token::ILLEGAL;
806     // scan exponent
807     AddLiteralCharAdvance();
808     if (c0_ == '+' || c0_ == '-')
809       AddLiteralCharAdvance();
810     if (!IsDecimalDigit(c0_)) {
811       // we must have at least one decimal digit after 'e'/'E'
812       return Token::ILLEGAL;
813     }
814     ScanDecimalDigits();
815   }
816 
817   // The source character immediately following a numeric literal must
818   // not be an identifier start or a decimal digit; see ECMA-262
819   // section 7.8.3, page 17 (note that we read only one decimal digit
820   // if the value is 0).
821   if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_))
822     return Token::ILLEGAL;
823 
824   literal.Complete();
825 
826   return Token::NUMBER;
827 }
828 
829 
ScanIdentifierUnicodeEscape()830 uc32 Scanner::ScanIdentifierUnicodeEscape() {
831   Advance();
832   if (c0_ != 'u') return -1;
833   Advance();
834   uc32 result = ScanHexNumber(4);
835   if (result < 0) PushBack('u');
836   return result;
837 }
838 
839 
840 // ----------------------------------------------------------------------------
841 // Keyword Matcher
842 
// Keyword table. KEYWORD_GROUP(ch) introduces the keywords starting with the
// character |ch|; KEYWORD(text, token) maps the keyword |text| to |token|.
// Some token expressions refer to variables named harmony_scoping and
// harmony_modules, which must be in scope wherever the table is expanded.
#define KEYWORDS(KEYWORD_GROUP, KEYWORD) \
  KEYWORD_GROUP('b') \
  KEYWORD("break", Token::BREAK) \
  KEYWORD_GROUP('c') \
  KEYWORD("case", Token::CASE) \
  KEYWORD("catch", Token::CATCH) \
  KEYWORD("class", Token::FUTURE_RESERVED_WORD) \
  KEYWORD("const", Token::CONST) \
  KEYWORD("continue", Token::CONTINUE) \
  KEYWORD_GROUP('d') \
  KEYWORD("debugger", Token::DEBUGGER) \
  KEYWORD("default", Token::DEFAULT) \
  KEYWORD("delete", Token::DELETE) \
  KEYWORD("do", Token::DO) \
  KEYWORD_GROUP('e') \
  KEYWORD("else", Token::ELSE) \
  KEYWORD("enum", Token::FUTURE_RESERVED_WORD) \
  KEYWORD("export", \
          harmony_modules ? Token::EXPORT : Token::FUTURE_RESERVED_WORD) \
  KEYWORD("extends", Token::FUTURE_RESERVED_WORD) \
  KEYWORD_GROUP('f') \
  KEYWORD("false", Token::FALSE_LITERAL) \
  KEYWORD("finally", Token::FINALLY) \
  KEYWORD("for", Token::FOR) \
  KEYWORD("function", Token::FUNCTION) \
  KEYWORD_GROUP('i') \
  KEYWORD("if", Token::IF) \
  KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("import", \
          harmony_modules ? Token::IMPORT : Token::FUTURE_RESERVED_WORD) \
  KEYWORD("in", Token::IN) \
  KEYWORD("instanceof", Token::INSTANCEOF) \
  KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD_GROUP('l') \
  KEYWORD("let", \
          harmony_scoping ? Token::LET : Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD_GROUP('n') \
  KEYWORD("new", Token::NEW) \
  KEYWORD("null", Token::NULL_LITERAL) \
  KEYWORD_GROUP('p') \
  KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD_GROUP('r') \
  KEYWORD("return", Token::RETURN) \
  KEYWORD_GROUP('s') \
  KEYWORD("static", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("super", Token::FUTURE_RESERVED_WORD) \
  KEYWORD("switch", Token::SWITCH) \
  KEYWORD_GROUP('t') \
  KEYWORD("this", Token::THIS) \
  KEYWORD("throw", Token::THROW) \
  KEYWORD("true", Token::TRUE_LITERAL) \
  KEYWORD("try", Token::TRY) \
  KEYWORD("typeof", Token::TYPEOF) \
  KEYWORD_GROUP('v') \
  KEYWORD("var", Token::VAR) \
  KEYWORD("void", Token::VOID) \
  KEYWORD_GROUP('w') \
  KEYWORD("while", Token::WHILE) \
  KEYWORD("with", Token::WITH) \
  KEYWORD_GROUP('y') \
  KEYWORD("yield", Token::YIELD)
907 
908 
// Classifies an already-scanned ASCII identifier as either a keyword token
// or a plain identifier.
//
//   input          - the identifier characters. Only indices below
//                    input_length are read: every character comparison is
//                    guarded by the length-equality test that precedes it.
//   input_length   - number of characters in |input|; asserted to be >= 1.
//   harmony_scoping,
//   harmony_modules - not referenced directly in this body. NOTE(review):
//                    presumably they are used by token expressions inside the
//                    KEYWORDS table defined above -- confirm against the table.
//
// Returns the token of the first KEYWORDS entry that matches |input| exactly,
// or Token::IDENTIFIER when nothing matches.
static Token::Value KeywordOrIdentifierToken(const char* input,
                                             int input_length,
                                             bool harmony_scoping,
                                             bool harmony_modules) {
  ASSERT(input_length >= 1);
  // Bounds on the length of any entry in KEYWORDS; the STATIC_ASSERTs in the
  // KEYWORD macro below enforce them for every entry at compile time.
  const int kMinLength = 2;
  const int kMaxLength = 10;
  // Anything shorter or longer than every keyword cannot be a keyword.
  if (input_length < kMinLength || input_length > kMaxLength) {
    return Token::IDENTIFIER;
  }
  // The switch body is generated from the KEYWORDS table: the first character
  // selects a group, and each keyword of that group is then tested in turn.
  switch (input[0]) {
    // 'default' comes first so that the 'break' emitted at the start of the
    // first KEYWORD_GROUP_CASE expansion terminates it.
    default:
// Expands to "break; case ch:", i.e. it closes the previous group and opens
// the group of keywords that start with |ch|.
#define KEYWORD_GROUP_CASE(ch)                                \
      break;                                                  \
    case ch:
// Expands to a block that returns |token| when |input| equals |keyword|.
// input[0] is not compared here because the enclosing case label has already
// selected on it. The 'keyword_length <= k' guards are compile-time constants
// that keep the comparison from indexing past the end of either string.
#define KEYWORD(keyword, token)                               \
    {                                                         \
      /* 'keyword' is a char array, so sizeof(keyword) is */  \
      /* strlen(keyword) plus 1 for the NUL char. */          \
      const int keyword_length = sizeof(keyword) - 1;         \
      STATIC_ASSERT(keyword_length >= kMinLength);            \
      STATIC_ASSERT(keyword_length <= kMaxLength);            \
      if (input_length == keyword_length &&                   \
          input[1] == keyword[1] &&                           \
          (keyword_length <= 2 || input[2] == keyword[2]) &&  \
          (keyword_length <= 3 || input[3] == keyword[3]) &&  \
          (keyword_length <= 4 || input[4] == keyword[4]) &&  \
          (keyword_length <= 5 || input[5] == keyword[5]) &&  \
          (keyword_length <= 6 || input[6] == keyword[6]) &&  \
          (keyword_length <= 7 || input[7] == keyword[7]) &&  \
          (keyword_length <= 8 || input[8] == keyword[8]) &&  \
          (keyword_length <= 9 || input[9] == keyword[9])) {  \
        return token;                                         \
      }                                                       \
    }
    // Instantiate the table with the two macros defined above.
    KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
  }
  // No keyword in the selected group matched (or no group was selected).
  return Token::IDENTIFIER;
}
948 
949 
ScanIdentifierOrKeyword()950 Token::Value Scanner::ScanIdentifierOrKeyword() {
951   ASSERT(unicode_cache_->IsIdentifierStart(c0_));
952   LiteralScope literal(this);
953   // Scan identifier start character.
954   if (c0_ == '\\') {
955     uc32 c = ScanIdentifierUnicodeEscape();
956     // Only allow legal identifier start characters.
957     if (c < 0 ||
958         c == '\\' ||  // No recursive escapes.
959         !unicode_cache_->IsIdentifierStart(c)) {
960       return Token::ILLEGAL;
961     }
962     AddLiteralChar(c);
963     return ScanIdentifierSuffix(&literal);
964   }
965 
966   uc32 first_char = c0_;
967   Advance();
968   AddLiteralChar(first_char);
969 
970   // Scan the rest of the identifier characters.
971   while (unicode_cache_->IsIdentifierPart(c0_)) {
972     if (c0_ != '\\') {
973       uc32 next_char = c0_;
974       Advance();
975       AddLiteralChar(next_char);
976       continue;
977     }
978     // Fallthrough if no longer able to complete keyword.
979     return ScanIdentifierSuffix(&literal);
980   }
981 
982   literal.Complete();
983 
984   if (next_.literal_chars->is_ascii()) {
985     Vector<const char> chars = next_.literal_chars->ascii_literal();
986     return KeywordOrIdentifierToken(chars.start(),
987                                     chars.length(),
988                                     harmony_scoping_,
989                                     harmony_modules_);
990   }
991 
992   return Token::IDENTIFIER;
993 }
994 
995 
ScanIdentifierSuffix(LiteralScope * literal)996 Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal) {
997   // Scan the rest of the identifier characters.
998   while (unicode_cache_->IsIdentifierPart(c0_)) {
999     if (c0_ == '\\') {
1000       uc32 c = ScanIdentifierUnicodeEscape();
1001       // Only allow legal identifier part characters.
1002       if (c < 0 ||
1003           c == '\\' ||
1004           !unicode_cache_->IsIdentifierPart(c)) {
1005         return Token::ILLEGAL;
1006       }
1007       AddLiteralChar(c);
1008     } else {
1009       AddLiteralChar(c0_);
1010       Advance();
1011     }
1012   }
1013   literal->Complete();
1014 
1015   return Token::IDENTIFIER;
1016 }
1017 
1018 
ScanRegExpPattern(bool seen_equal)1019 bool Scanner::ScanRegExpPattern(bool seen_equal) {
1020   // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
1021   bool in_character_class = false;
1022 
1023   // Previous token is either '/' or '/=', in the second case, the
1024   // pattern starts at =.
1025   next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
1026   next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
1027 
1028   // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
1029   // the scanner should pass uninterpreted bodies to the RegExp
1030   // constructor.
1031   LiteralScope literal(this);
1032   if (seen_equal) {
1033     AddLiteralChar('=');
1034   }
1035 
1036   while (c0_ != '/' || in_character_class) {
1037     if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
1038     if (c0_ == '\\') {  // Escape sequence.
1039       AddLiteralCharAdvance();
1040       if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
1041       AddLiteralCharAdvance();
1042       // If the escape allows more characters, i.e., \x??, \u????, or \c?,
1043       // only "safe" characters are allowed (letters, digits, underscore),
1044       // otherwise the escape isn't valid and the invalid character has
1045       // its normal meaning. I.e., we can just continue scanning without
1046       // worrying whether the following characters are part of the escape
1047       // or not, since any '/', '\\' or '[' is guaranteed to not be part
1048       // of the escape sequence.
1049 
1050       // TODO(896): At some point, parse RegExps more throughly to capture
1051       // octal esacpes in strict mode.
1052     } else {  // Unescaped character.
1053       if (c0_ == '[') in_character_class = true;
1054       if (c0_ == ']') in_character_class = false;
1055       AddLiteralCharAdvance();
1056     }
1057   }
1058   Advance();  // consume '/'
1059 
1060   literal.Complete();
1061 
1062   return true;
1063 }
1064 
1065 
ScanLiteralUnicodeEscape()1066 bool Scanner::ScanLiteralUnicodeEscape() {
1067   ASSERT(c0_ == '\\');
1068   uc32 chars_read[6] = {'\\', 'u', 0, 0, 0, 0};
1069   Advance();
1070   int i = 1;
1071   if (c0_ == 'u') {
1072     i++;
1073     while (i < 6) {
1074       Advance();
1075       if (!IsHexDigit(c0_)) break;
1076       chars_read[i] = c0_;
1077       i++;
1078     }
1079   }
1080   if (i < 6) {
1081     // Incomplete escape. Undo all advances and return false.
1082     while (i > 0) {
1083       i--;
1084       PushBack(chars_read[i]);
1085     }
1086     return false;
1087   }
1088   // Complete escape. Add all chars to current literal buffer.
1089   for (int i = 0; i < 6; i++) {
1090     AddLiteralChar(chars_read[i]);
1091   }
1092   return true;
1093 }
1094 
1095 
ScanRegExpFlags()1096 bool Scanner::ScanRegExpFlags() {
1097   // Scan regular expression flags.
1098   LiteralScope literal(this);
1099   while (unicode_cache_->IsIdentifierPart(c0_)) {
1100     if (c0_ != '\\') {
1101       AddLiteralCharAdvance();
1102     } else {
1103       if (!ScanLiteralUnicodeEscape()) {
1104         break;
1105       }
1106       Advance();
1107     }
1108   }
1109   literal.Complete();
1110 
1111   next_.location.end_pos = source_pos() - 1;
1112   return true;
1113 }
1114 
1115 
AddAsciiSymbol(Vector<const char> key,int value)1116 int DuplicateFinder::AddAsciiSymbol(Vector<const char> key, int value) {
1117   return AddSymbol(Vector<const byte>::cast(key), true, value);
1118 }
1119 
1120 
AddUtf16Symbol(Vector<const uint16_t> key,int value)1121 int DuplicateFinder::AddUtf16Symbol(Vector<const uint16_t> key, int value) {
1122   return AddSymbol(Vector<const byte>::cast(key), false, value);
1123 }
1124 
1125 
AddSymbol(Vector<const byte> key,bool is_ascii,int value)1126 int DuplicateFinder::AddSymbol(Vector<const byte> key,
1127                                bool is_ascii,
1128                                int value) {
1129   uint32_t hash = Hash(key, is_ascii);
1130   byte* encoding = BackupKey(key, is_ascii);
1131   HashMap::Entry* entry = map_.Lookup(encoding, hash, true);
1132   int old_value = static_cast<int>(reinterpret_cast<intptr_t>(entry->value));
1133   entry->value =
1134     reinterpret_cast<void*>(static_cast<intptr_t>(value | old_value));
1135   return old_value;
1136 }
1137 
1138 
AddNumber(Vector<const char> key,int value)1139 int DuplicateFinder::AddNumber(Vector<const char> key, int value) {
1140   ASSERT(key.length() > 0);
1141   // Quick check for already being in canonical form.
1142   if (IsNumberCanonical(key)) {
1143     return AddAsciiSymbol(key, value);
1144   }
1145 
1146   int flags = ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY;
1147   double double_value = StringToDouble(unicode_constants_, key, flags, 0.0);
1148   int length;
1149   const char* string;
1150   if (!std::isfinite(double_value)) {
1151     string = "Infinity";
1152     length = 8;  // strlen("Infinity");
1153   } else {
1154     string = DoubleToCString(double_value,
1155                              Vector<char>(number_buffer_, kBufferSize));
1156     length = StrLength(string);
1157   }
1158   return AddSymbol(Vector<const byte>(reinterpret_cast<const byte*>(string),
1159                                       length), true, value);
1160 }
1161 
1162 
IsNumberCanonical(Vector<const char> number)1163 bool DuplicateFinder::IsNumberCanonical(Vector<const char> number) {
1164   // Test for a safe approximation of number literals that are already
1165   // in canonical form: max 15 digits, no leading zeroes, except an
1166   // integer part that is a single zero, and no trailing zeros below
1167   // the decimal point.
1168   int pos = 0;
1169   int length = number.length();
1170   if (number.length() > 15) return false;
1171   if (number[pos] == '0') {
1172     pos++;
1173   } else {
1174     while (pos < length &&
1175            static_cast<unsigned>(number[pos] - '0') <= ('9' - '0')) pos++;
1176   }
1177   if (length == pos) return true;
1178   if (number[pos] != '.') return false;
1179   pos++;
1180   bool invalid_last_digit = true;
1181   while (pos < length) {
1182     byte digit = number[pos] - '0';
1183     if (digit > '9' - '0') return false;
1184     invalid_last_digit = (digit == 0);
1185     pos++;
1186   }
1187   return !invalid_last_digit;
1188 }
1189 
1190 
Hash(Vector<const byte> key,bool is_ascii)1191 uint32_t DuplicateFinder::Hash(Vector<const byte> key, bool is_ascii) {
1192   // Primitive hash function, almost identical to the one used
1193   // for strings (except that it's seeded by the length and ASCII-ness).
1194   int length = key.length();
1195   uint32_t hash = (length << 1) | (is_ascii ? 1 : 0) ;
1196   for (int i = 0; i < length; i++) {
1197     uint32_t c = key[i];
1198     hash = (hash + c) * 1025;
1199     hash ^= (hash >> 6);
1200   }
1201   return hash;
1202 }
1203 
1204 
Match(void * first,void * second)1205 bool DuplicateFinder::Match(void* first, void* second) {
1206   // Decode lengths.
1207   // Length + ASCII-bit is encoded as base 128, most significant heptet first,
1208   // with a 8th bit being non-zero while there are more heptets.
1209   // The value encodes the number of bytes following, and whether the original
1210   // was ASCII.
1211   byte* s1 = reinterpret_cast<byte*>(first);
1212   byte* s2 = reinterpret_cast<byte*>(second);
1213   uint32_t length_ascii_field = 0;
1214   byte c1;
1215   do {
1216     c1 = *s1;
1217     if (c1 != *s2) return false;
1218     length_ascii_field = (length_ascii_field << 7) | (c1 & 0x7f);
1219     s1++;
1220     s2++;
1221   } while ((c1 & 0x80) != 0);
1222   int length = static_cast<int>(length_ascii_field >> 1);
1223   return memcmp(s1, s2, length) == 0;
1224 }
1225 
1226 
BackupKey(Vector<const byte> bytes,bool is_ascii)1227 byte* DuplicateFinder::BackupKey(Vector<const byte> bytes,
1228                                  bool is_ascii) {
1229   uint32_t ascii_length = (bytes.length() << 1) | (is_ascii ? 1 : 0);
1230   backing_store_.StartSequence();
1231   // Emit ascii_length as base-128 encoded number, with the 7th bit set
1232   // on the byte of every heptet except the last, least significant, one.
1233   if (ascii_length >= (1 << 7)) {
1234     if (ascii_length >= (1 << 14)) {
1235       if (ascii_length >= (1 << 21)) {
1236         if (ascii_length >= (1 << 28)) {
1237           backing_store_.Add(static_cast<byte>((ascii_length >> 28) | 0x80));
1238         }
1239         backing_store_.Add(static_cast<byte>((ascii_length >> 21) | 0x80u));
1240       }
1241       backing_store_.Add(static_cast<byte>((ascii_length >> 14) | 0x80u));
1242     }
1243     backing_store_.Add(static_cast<byte>((ascii_length >> 7) | 0x80u));
1244   }
1245   backing_store_.Add(static_cast<byte>(ascii_length & 0x7f));
1246 
1247   backing_store_.AddBlock(bytes);
1248   return backing_store_.EndSequence().start();
1249 }
1250 
1251 } }  // namespace v8::internal
1252