1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Features shared by parsing and pre-parsing scanners.
6
7 #include <cmath>
8
9 #include "src/scanner.h"
10
11 #include "include/v8stdint.h"
12 #include "src/char-predicates-inl.h"
13 #include "src/conversions-inl.h"
14 #include "src/list-inl.h"
15 #include "src/v8.h"
16 #include "src/parser.h"
17
18 namespace v8 {
19 namespace internal {
20
21 // ----------------------------------------------------------------------------
22 // Scanner
23
Scanner(UnicodeCache * unicode_cache)24 Scanner::Scanner(UnicodeCache* unicode_cache)
25 : unicode_cache_(unicode_cache),
26 octal_pos_(Location::invalid()),
27 harmony_scoping_(false),
28 harmony_modules_(false),
29 harmony_numeric_literals_(false) { }
30
31
Initialize(Utf16CharacterStream * source)32 void Scanner::Initialize(Utf16CharacterStream* source) {
33 source_ = source;
34 // Need to capture identifiers in order to recognize "get" and "set"
35 // in object literals.
36 Init();
37 // Skip initial whitespace allowing HTML comment ends just like
38 // after a newline and scan first token.
39 has_line_terminator_before_next_ = true;
40 SkipWhiteSpace();
41 Scan();
42 }
43
44
ScanHexNumber(int expected_length)45 uc32 Scanner::ScanHexNumber(int expected_length) {
46 ASSERT(expected_length <= 4); // prevent overflow
47
48 uc32 digits[4] = { 0, 0, 0, 0 };
49 uc32 x = 0;
50 for (int i = 0; i < expected_length; i++) {
51 digits[i] = c0_;
52 int d = HexValue(c0_);
53 if (d < 0) {
54 // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
55 // should be illegal, but other JS VMs just return the
56 // non-escaped version of the original character.
57
58 // Push back digits that we have advanced past.
59 for (int j = i-1; j >= 0; j--) {
60 PushBack(digits[j]);
61 }
62 return -1;
63 }
64 x = x * 16 + d;
65 Advance();
66 }
67
68 return x;
69 }
70
71
72 // Ensure that tokens can be stored in a byte.
73 STATIC_ASSERT(Token::NUM_TOKENS <= 0x100);
74
75 // Table of one-character tokens, by character (0x00..0x7f only).
76 static const byte one_char_tokens[] = {
77 Token::ILLEGAL,
78 Token::ILLEGAL,
79 Token::ILLEGAL,
80 Token::ILLEGAL,
81 Token::ILLEGAL,
82 Token::ILLEGAL,
83 Token::ILLEGAL,
84 Token::ILLEGAL,
85 Token::ILLEGAL,
86 Token::ILLEGAL,
87 Token::ILLEGAL,
88 Token::ILLEGAL,
89 Token::ILLEGAL,
90 Token::ILLEGAL,
91 Token::ILLEGAL,
92 Token::ILLEGAL,
93 Token::ILLEGAL,
94 Token::ILLEGAL,
95 Token::ILLEGAL,
96 Token::ILLEGAL,
97 Token::ILLEGAL,
98 Token::ILLEGAL,
99 Token::ILLEGAL,
100 Token::ILLEGAL,
101 Token::ILLEGAL,
102 Token::ILLEGAL,
103 Token::ILLEGAL,
104 Token::ILLEGAL,
105 Token::ILLEGAL,
106 Token::ILLEGAL,
107 Token::ILLEGAL,
108 Token::ILLEGAL,
109 Token::ILLEGAL,
110 Token::ILLEGAL,
111 Token::ILLEGAL,
112 Token::ILLEGAL,
113 Token::ILLEGAL,
114 Token::ILLEGAL,
115 Token::ILLEGAL,
116 Token::ILLEGAL,
117 Token::LPAREN, // 0x28
118 Token::RPAREN, // 0x29
119 Token::ILLEGAL,
120 Token::ILLEGAL,
121 Token::COMMA, // 0x2c
122 Token::ILLEGAL,
123 Token::ILLEGAL,
124 Token::ILLEGAL,
125 Token::ILLEGAL,
126 Token::ILLEGAL,
127 Token::ILLEGAL,
128 Token::ILLEGAL,
129 Token::ILLEGAL,
130 Token::ILLEGAL,
131 Token::ILLEGAL,
132 Token::ILLEGAL,
133 Token::ILLEGAL,
134 Token::ILLEGAL,
135 Token::COLON, // 0x3a
136 Token::SEMICOLON, // 0x3b
137 Token::ILLEGAL,
138 Token::ILLEGAL,
139 Token::ILLEGAL,
140 Token::CONDITIONAL, // 0x3f
141 Token::ILLEGAL,
142 Token::ILLEGAL,
143 Token::ILLEGAL,
144 Token::ILLEGAL,
145 Token::ILLEGAL,
146 Token::ILLEGAL,
147 Token::ILLEGAL,
148 Token::ILLEGAL,
149 Token::ILLEGAL,
150 Token::ILLEGAL,
151 Token::ILLEGAL,
152 Token::ILLEGAL,
153 Token::ILLEGAL,
154 Token::ILLEGAL,
155 Token::ILLEGAL,
156 Token::ILLEGAL,
157 Token::ILLEGAL,
158 Token::ILLEGAL,
159 Token::ILLEGAL,
160 Token::ILLEGAL,
161 Token::ILLEGAL,
162 Token::ILLEGAL,
163 Token::ILLEGAL,
164 Token::ILLEGAL,
165 Token::ILLEGAL,
166 Token::ILLEGAL,
167 Token::ILLEGAL,
168 Token::LBRACK, // 0x5b
169 Token::ILLEGAL,
170 Token::RBRACK, // 0x5d
171 Token::ILLEGAL,
172 Token::ILLEGAL,
173 Token::ILLEGAL,
174 Token::ILLEGAL,
175 Token::ILLEGAL,
176 Token::ILLEGAL,
177 Token::ILLEGAL,
178 Token::ILLEGAL,
179 Token::ILLEGAL,
180 Token::ILLEGAL,
181 Token::ILLEGAL,
182 Token::ILLEGAL,
183 Token::ILLEGAL,
184 Token::ILLEGAL,
185 Token::ILLEGAL,
186 Token::ILLEGAL,
187 Token::ILLEGAL,
188 Token::ILLEGAL,
189 Token::ILLEGAL,
190 Token::ILLEGAL,
191 Token::ILLEGAL,
192 Token::ILLEGAL,
193 Token::ILLEGAL,
194 Token::ILLEGAL,
195 Token::ILLEGAL,
196 Token::ILLEGAL,
197 Token::ILLEGAL,
198 Token::ILLEGAL,
199 Token::ILLEGAL,
200 Token::LBRACE, // 0x7b
201 Token::ILLEGAL,
202 Token::RBRACE, // 0x7d
203 Token::BIT_NOT, // 0x7e
204 Token::ILLEGAL
205 };
206
207
Next()208 Token::Value Scanner::Next() {
209 current_ = next_;
210 has_line_terminator_before_next_ = false;
211 has_multiline_comment_before_next_ = false;
212 if (static_cast<unsigned>(c0_) <= 0x7f) {
213 Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]);
214 if (token != Token::ILLEGAL) {
215 int pos = source_pos();
216 next_.token = token;
217 next_.location.beg_pos = pos;
218 next_.location.end_pos = pos + 1;
219 Advance();
220 return current_.token;
221 }
222 }
223 Scan();
224 return current_.token;
225 }
226
227
228 // TODO(yangguo): check whether this is actually necessary.
IsLittleEndianByteOrderMark(uc32 c)229 static inline bool IsLittleEndianByteOrderMark(uc32 c) {
230 // The Unicode value U+FFFE is guaranteed never to be assigned as a
231 // Unicode character; this implies that in a Unicode context the
232 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
233 // character expressed in little-endian byte order (since it could
234 // not be a U+FFFE character expressed in big-endian byte
235 // order). Nevertheless, we check for it to be compatible with
236 // Spidermonkey.
237 return c == 0xFFFE;
238 }
239
240
SkipWhiteSpace()241 bool Scanner::SkipWhiteSpace() {
242 int start_position = source_pos();
243
244 while (true) {
245 while (true) {
246 // Advance as long as character is a WhiteSpace or LineTerminator.
247 // Remember if the latter is the case.
248 if (unicode_cache_->IsLineTerminator(c0_)) {
249 has_line_terminator_before_next_ = true;
250 } else if (!unicode_cache_->IsWhiteSpace(c0_) &&
251 !IsLittleEndianByteOrderMark(c0_)) {
252 break;
253 }
254 Advance();
255 }
256
257 // If there is an HTML comment end '-->' at the beginning of a
258 // line (with only whitespace in front of it), we treat the rest
259 // of the line as a comment. This is in line with the way
260 // SpiderMonkey handles it.
261 if (c0_ == '-' && has_line_terminator_before_next_) {
262 Advance();
263 if (c0_ == '-') {
264 Advance();
265 if (c0_ == '>') {
266 // Treat the rest of the line as a comment.
267 SkipSingleLineComment();
268 // Continue skipping white space after the comment.
269 continue;
270 }
271 PushBack('-'); // undo Advance()
272 }
273 PushBack('-'); // undo Advance()
274 }
275 // Return whether or not we skipped any characters.
276 return source_pos() != start_position;
277 }
278 }
279
280
SkipSingleLineComment()281 Token::Value Scanner::SkipSingleLineComment() {
282 Advance();
283
284 // The line terminator at the end of the line is not considered
285 // to be part of the single-line comment; it is recognized
286 // separately by the lexical grammar and becomes part of the
287 // stream of input elements for the syntactic grammar (see
288 // ECMA-262, section 7.4).
289 while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
290 Advance();
291 }
292
293 return Token::WHITESPACE;
294 }
295
296
SkipMultiLineComment()297 Token::Value Scanner::SkipMultiLineComment() {
298 ASSERT(c0_ == '*');
299 Advance();
300
301 while (c0_ >= 0) {
302 uc32 ch = c0_;
303 Advance();
304 if (unicode_cache_->IsLineTerminator(ch)) {
305 // Following ECMA-262, section 7.4, a comment containing
306 // a newline will make the comment count as a line-terminator.
307 has_multiline_comment_before_next_ = true;
308 }
309 // If we have reached the end of the multi-line comment, we
310 // consume the '/' and insert a whitespace. This way all
311 // multi-line comments are treated as whitespace.
312 if (ch == '*' && c0_ == '/') {
313 c0_ = ' ';
314 return Token::WHITESPACE;
315 }
316 }
317
318 // Unterminated multi-line comment.
319 return Token::ILLEGAL;
320 }
321
322
ScanHtmlComment()323 Token::Value Scanner::ScanHtmlComment() {
324 // Check for <!-- comments.
325 ASSERT(c0_ == '!');
326 Advance();
327 if (c0_ == '-') {
328 Advance();
329 if (c0_ == '-') return SkipSingleLineComment();
330 PushBack('-'); // undo Advance()
331 }
332 PushBack('!'); // undo Advance()
333 ASSERT(c0_ == '!');
334 return Token::LT;
335 }
336
337
Scan()338 void Scanner::Scan() {
339 next_.literal_chars = NULL;
340 Token::Value token;
341 do {
342 // Remember the position of the next token
343 next_.location.beg_pos = source_pos();
344
345 switch (c0_) {
346 case ' ':
347 case '\t':
348 Advance();
349 token = Token::WHITESPACE;
350 break;
351
352 case '\n':
353 Advance();
354 has_line_terminator_before_next_ = true;
355 token = Token::WHITESPACE;
356 break;
357
358 case '"': case '\'':
359 token = ScanString();
360 break;
361
362 case '<':
363 // < <= << <<= <!--
364 Advance();
365 if (c0_ == '=') {
366 token = Select(Token::LTE);
367 } else if (c0_ == '<') {
368 token = Select('=', Token::ASSIGN_SHL, Token::SHL);
369 } else if (c0_ == '!') {
370 token = ScanHtmlComment();
371 } else {
372 token = Token::LT;
373 }
374 break;
375
376 case '>':
377 // > >= >> >>= >>> >>>=
378 Advance();
379 if (c0_ == '=') {
380 token = Select(Token::GTE);
381 } else if (c0_ == '>') {
382 // >> >>= >>> >>>=
383 Advance();
384 if (c0_ == '=') {
385 token = Select(Token::ASSIGN_SAR);
386 } else if (c0_ == '>') {
387 token = Select('=', Token::ASSIGN_SHR, Token::SHR);
388 } else {
389 token = Token::SAR;
390 }
391 } else {
392 token = Token::GT;
393 }
394 break;
395
396 case '=':
397 // = == ===
398 Advance();
399 if (c0_ == '=') {
400 token = Select('=', Token::EQ_STRICT, Token::EQ);
401 } else {
402 token = Token::ASSIGN;
403 }
404 break;
405
406 case '!':
407 // ! != !==
408 Advance();
409 if (c0_ == '=') {
410 token = Select('=', Token::NE_STRICT, Token::NE);
411 } else {
412 token = Token::NOT;
413 }
414 break;
415
416 case '+':
417 // + ++ +=
418 Advance();
419 if (c0_ == '+') {
420 token = Select(Token::INC);
421 } else if (c0_ == '=') {
422 token = Select(Token::ASSIGN_ADD);
423 } else {
424 token = Token::ADD;
425 }
426 break;
427
428 case '-':
429 // - -- --> -=
430 Advance();
431 if (c0_ == '-') {
432 Advance();
433 if (c0_ == '>' && has_line_terminator_before_next_) {
434 // For compatibility with SpiderMonkey, we skip lines that
435 // start with an HTML comment end '-->'.
436 token = SkipSingleLineComment();
437 } else {
438 token = Token::DEC;
439 }
440 } else if (c0_ == '=') {
441 token = Select(Token::ASSIGN_SUB);
442 } else {
443 token = Token::SUB;
444 }
445 break;
446
447 case '*':
448 // * *=
449 token = Select('=', Token::ASSIGN_MUL, Token::MUL);
450 break;
451
452 case '%':
453 // % %=
454 token = Select('=', Token::ASSIGN_MOD, Token::MOD);
455 break;
456
457 case '/':
458 // / // /* /=
459 Advance();
460 if (c0_ == '/') {
461 token = SkipSingleLineComment();
462 } else if (c0_ == '*') {
463 token = SkipMultiLineComment();
464 } else if (c0_ == '=') {
465 token = Select(Token::ASSIGN_DIV);
466 } else {
467 token = Token::DIV;
468 }
469 break;
470
471 case '&':
472 // & && &=
473 Advance();
474 if (c0_ == '&') {
475 token = Select(Token::AND);
476 } else if (c0_ == '=') {
477 token = Select(Token::ASSIGN_BIT_AND);
478 } else {
479 token = Token::BIT_AND;
480 }
481 break;
482
483 case '|':
484 // | || |=
485 Advance();
486 if (c0_ == '|') {
487 token = Select(Token::OR);
488 } else if (c0_ == '=') {
489 token = Select(Token::ASSIGN_BIT_OR);
490 } else {
491 token = Token::BIT_OR;
492 }
493 break;
494
495 case '^':
496 // ^ ^=
497 token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
498 break;
499
500 case '.':
501 // . Number
502 Advance();
503 if (IsDecimalDigit(c0_)) {
504 token = ScanNumber(true);
505 } else {
506 token = Token::PERIOD;
507 }
508 break;
509
510 case ':':
511 token = Select(Token::COLON);
512 break;
513
514 case ';':
515 token = Select(Token::SEMICOLON);
516 break;
517
518 case ',':
519 token = Select(Token::COMMA);
520 break;
521
522 case '(':
523 token = Select(Token::LPAREN);
524 break;
525
526 case ')':
527 token = Select(Token::RPAREN);
528 break;
529
530 case '[':
531 token = Select(Token::LBRACK);
532 break;
533
534 case ']':
535 token = Select(Token::RBRACK);
536 break;
537
538 case '{':
539 token = Select(Token::LBRACE);
540 break;
541
542 case '}':
543 token = Select(Token::RBRACE);
544 break;
545
546 case '?':
547 token = Select(Token::CONDITIONAL);
548 break;
549
550 case '~':
551 token = Select(Token::BIT_NOT);
552 break;
553
554 default:
555 if (unicode_cache_->IsIdentifierStart(c0_)) {
556 token = ScanIdentifierOrKeyword();
557 } else if (IsDecimalDigit(c0_)) {
558 token = ScanNumber(false);
559 } else if (SkipWhiteSpace()) {
560 token = Token::WHITESPACE;
561 } else if (c0_ < 0) {
562 token = Token::EOS;
563 } else {
564 token = Select(Token::ILLEGAL);
565 }
566 break;
567 }
568
569 // Continue scanning for tokens as long as we're just skipping
570 // whitespace.
571 } while (token == Token::WHITESPACE);
572
573 next_.location.end_pos = source_pos();
574 next_.token = token;
575 }
576
577
SeekForward(int pos)578 void Scanner::SeekForward(int pos) {
579 // After this call, we will have the token at the given position as
580 // the "next" token. The "current" token will be invalid.
581 if (pos == next_.location.beg_pos) return;
582 int current_pos = source_pos();
583 ASSERT_EQ(next_.location.end_pos, current_pos);
584 // Positions inside the lookahead token aren't supported.
585 ASSERT(pos >= current_pos);
586 if (pos != current_pos) {
587 source_->SeekForward(pos - source_->pos());
588 Advance();
589 // This function is only called to seek to the location
590 // of the end of a function (at the "}" token). It doesn't matter
591 // whether there was a line terminator in the part we skip.
592 has_line_terminator_before_next_ = false;
593 has_multiline_comment_before_next_ = false;
594 }
595 Scan();
596 }
597
598
ScanEscape()599 bool Scanner::ScanEscape() {
600 uc32 c = c0_;
601 Advance();
602
603 // Skip escaped newlines.
604 if (unicode_cache_->IsLineTerminator(c)) {
605 // Allow CR+LF newlines in multiline string literals.
606 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
607 // Allow LF+CR newlines in multiline string literals.
608 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
609 return true;
610 }
611
612 switch (c) {
613 case '\'': // fall through
614 case '"' : // fall through
615 case '\\': break;
616 case 'b' : c = '\b'; break;
617 case 'f' : c = '\f'; break;
618 case 'n' : c = '\n'; break;
619 case 'r' : c = '\r'; break;
620 case 't' : c = '\t'; break;
621 case 'u' : {
622 c = ScanHexNumber(4);
623 if (c < 0) return false;
624 break;
625 }
626 case 'v' : c = '\v'; break;
627 case 'x' : {
628 c = ScanHexNumber(2);
629 if (c < 0) return false;
630 break;
631 }
632 case '0' : // fall through
633 case '1' : // fall through
634 case '2' : // fall through
635 case '3' : // fall through
636 case '4' : // fall through
637 case '5' : // fall through
638 case '6' : // fall through
639 case '7' : c = ScanOctalEscape(c, 2); break;
640 }
641
642 // According to ECMA-262, section 7.8.4, characters not covered by the
643 // above cases should be illegal, but they are commonly handled as
644 // non-escaped characters by JS VMs.
645 AddLiteralChar(c);
646 return true;
647 }
648
649
650 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of
651 // ECMA-262. Other JS VMs support them.
ScanOctalEscape(uc32 c,int length)652 uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
653 uc32 x = c - '0';
654 int i = 0;
655 for (; i < length; i++) {
656 int d = c0_ - '0';
657 if (d < 0 || d > 7) break;
658 int nx = x * 8 + d;
659 if (nx >= 256) break;
660 x = nx;
661 Advance();
662 }
663 // Anything except '\0' is an octal escape sequence, illegal in strict mode.
664 // Remember the position of octal escape sequences so that an error
665 // can be reported later (in strict mode).
666 // We don't report the error immediately, because the octal escape can
667 // occur before the "use strict" directive.
668 if (c != '0' || i > 0) {
669 octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
670 }
671 return x;
672 }
673
674
ScanString()675 Token::Value Scanner::ScanString() {
676 uc32 quote = c0_;
677 Advance(); // consume quote
678
679 LiteralScope literal(this);
680 while (c0_ != quote && c0_ >= 0
681 && !unicode_cache_->IsLineTerminator(c0_)) {
682 uc32 c = c0_;
683 Advance();
684 if (c == '\\') {
685 if (c0_ < 0 || !ScanEscape()) return Token::ILLEGAL;
686 } else {
687 AddLiteralChar(c);
688 }
689 }
690 if (c0_ != quote) return Token::ILLEGAL;
691 literal.Complete();
692
693 Advance(); // consume quote
694 return Token::STRING;
695 }
696
697
ScanDecimalDigits()698 void Scanner::ScanDecimalDigits() {
699 while (IsDecimalDigit(c0_))
700 AddLiteralCharAdvance();
701 }
702
703
ScanNumber(bool seen_period)704 Token::Value Scanner::ScanNumber(bool seen_period) {
705 ASSERT(IsDecimalDigit(c0_)); // the first digit of the number or the fraction
706
707 enum { DECIMAL, HEX, OCTAL, IMPLICIT_OCTAL, BINARY } kind = DECIMAL;
708
709 LiteralScope literal(this);
710 if (seen_period) {
711 // we have already seen a decimal point of the float
712 AddLiteralChar('.');
713 ScanDecimalDigits(); // we know we have at least one digit
714
715 } else {
716 // if the first character is '0' we must check for octals and hex
717 if (c0_ == '0') {
718 int start_pos = source_pos(); // For reporting octal positions.
719 AddLiteralCharAdvance();
720
721 // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or
722 // an octal number.
723 if (c0_ == 'x' || c0_ == 'X') {
724 // hex number
725 kind = HEX;
726 AddLiteralCharAdvance();
727 if (!IsHexDigit(c0_)) {
728 // we must have at least one hex digit after 'x'/'X'
729 return Token::ILLEGAL;
730 }
731 while (IsHexDigit(c0_)) {
732 AddLiteralCharAdvance();
733 }
734 } else if (harmony_numeric_literals_ && (c0_ == 'o' || c0_ == 'O')) {
735 kind = OCTAL;
736 AddLiteralCharAdvance();
737 if (!IsOctalDigit(c0_)) {
738 // we must have at least one octal digit after 'o'/'O'
739 return Token::ILLEGAL;
740 }
741 while (IsOctalDigit(c0_)) {
742 AddLiteralCharAdvance();
743 }
744 } else if (harmony_numeric_literals_ && (c0_ == 'b' || c0_ == 'B')) {
745 kind = BINARY;
746 AddLiteralCharAdvance();
747 if (!IsBinaryDigit(c0_)) {
748 // we must have at least one binary digit after 'b'/'B'
749 return Token::ILLEGAL;
750 }
751 while (IsBinaryDigit(c0_)) {
752 AddLiteralCharAdvance();
753 }
754 } else if ('0' <= c0_ && c0_ <= '7') {
755 // (possible) octal number
756 kind = IMPLICIT_OCTAL;
757 while (true) {
758 if (c0_ == '8' || c0_ == '9') {
759 kind = DECIMAL;
760 break;
761 }
762 if (c0_ < '0' || '7' < c0_) {
763 // Octal literal finished.
764 octal_pos_ = Location(start_pos, source_pos());
765 break;
766 }
767 AddLiteralCharAdvance();
768 }
769 }
770 }
771
772 // Parse decimal digits and allow trailing fractional part.
773 if (kind == DECIMAL) {
774 ScanDecimalDigits(); // optional
775 if (c0_ == '.') {
776 AddLiteralCharAdvance();
777 ScanDecimalDigits(); // optional
778 }
779 }
780 }
781
782 // scan exponent, if any
783 if (c0_ == 'e' || c0_ == 'E') {
784 ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number
785 if (kind != DECIMAL) return Token::ILLEGAL;
786 // scan exponent
787 AddLiteralCharAdvance();
788 if (c0_ == '+' || c0_ == '-')
789 AddLiteralCharAdvance();
790 if (!IsDecimalDigit(c0_)) {
791 // we must have at least one decimal digit after 'e'/'E'
792 return Token::ILLEGAL;
793 }
794 ScanDecimalDigits();
795 }
796
797 // The source character immediately following a numeric literal must
798 // not be an identifier start or a decimal digit; see ECMA-262
799 // section 7.8.3, page 17 (note that we read only one decimal digit
800 // if the value is 0).
801 if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_))
802 return Token::ILLEGAL;
803
804 literal.Complete();
805
806 return Token::NUMBER;
807 }
808
809
ScanIdentifierUnicodeEscape()810 uc32 Scanner::ScanIdentifierUnicodeEscape() {
811 Advance();
812 if (c0_ != 'u') return -1;
813 Advance();
814 uc32 result = ScanHexNumber(4);
815 if (result < 0) PushBack('u');
816 return result;
817 }
818
819
// ----------------------------------------------------------------------------
// Keyword Matcher

// The list of keywords, grouped by their first character. A user supplies
// two macros: KEYWORD_GROUP(ch) is expanded once at the start of every group
// of keywords beginning with |ch|, and KEYWORD(text, token) once per keyword.
// Some entries refer to the identifiers harmony_scoping and harmony_modules,
// so both have to be in scope wherever this list is expanded.
#define KEYWORDS(KEYWORD_GROUP, KEYWORD) \
  KEYWORD_GROUP('b') \
  KEYWORD("break", Token::BREAK) \
  KEYWORD_GROUP('c') \
  KEYWORD("case", Token::CASE) \
  KEYWORD("catch", Token::CATCH) \
  KEYWORD("class", Token::FUTURE_RESERVED_WORD) \
  KEYWORD("const", Token::CONST) \
  KEYWORD("continue", Token::CONTINUE) \
  KEYWORD_GROUP('d') \
  KEYWORD("debugger", Token::DEBUGGER) \
  KEYWORD("default", Token::DEFAULT) \
  KEYWORD("delete", Token::DELETE) \
  KEYWORD("do", Token::DO) \
  KEYWORD_GROUP('e') \
  KEYWORD("else", Token::ELSE) \
  KEYWORD("enum", Token::FUTURE_RESERVED_WORD) \
  KEYWORD("export", \
          harmony_modules ? Token::EXPORT : Token::FUTURE_RESERVED_WORD) \
  KEYWORD("extends", Token::FUTURE_RESERVED_WORD) \
  KEYWORD_GROUP('f') \
  KEYWORD("false", Token::FALSE_LITERAL) \
  KEYWORD("finally", Token::FINALLY) \
  KEYWORD("for", Token::FOR) \
  KEYWORD("function", Token::FUNCTION) \
  KEYWORD_GROUP('i') \
  KEYWORD("if", Token::IF) \
  KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("import", \
          harmony_modules ? Token::IMPORT : Token::FUTURE_RESERVED_WORD) \
  KEYWORD("in", Token::IN) \
  KEYWORD("instanceof", Token::INSTANCEOF) \
  KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD_GROUP('l') \
  KEYWORD("let", \
          harmony_scoping ? Token::LET : Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD_GROUP('n') \
  KEYWORD("new", Token::NEW) \
  KEYWORD("null", Token::NULL_LITERAL) \
  KEYWORD_GROUP('p') \
  KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD_GROUP('r') \
  KEYWORD("return", Token::RETURN) \
  KEYWORD_GROUP('s') \
  KEYWORD("static", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("super", Token::FUTURE_RESERVED_WORD) \
  KEYWORD("switch", Token::SWITCH) \
  KEYWORD_GROUP('t') \
  KEYWORD("this", Token::THIS) \
  KEYWORD("throw", Token::THROW) \
  KEYWORD("true", Token::TRUE_LITERAL) \
  KEYWORD("try", Token::TRY) \
  KEYWORD("typeof", Token::TYPEOF) \
  KEYWORD_GROUP('v') \
  KEYWORD("var", Token::VAR) \
  KEYWORD("void", Token::VOID) \
  KEYWORD_GROUP('w') \
  KEYWORD("while", Token::WHILE) \
  KEYWORD("with", Token::WITH) \
  KEYWORD_GROUP('y') \
  KEYWORD("yield", Token::YIELD)
887
888
KeywordOrIdentifierToken(const uint8_t * input,int input_length,bool harmony_scoping,bool harmony_modules)889 static Token::Value KeywordOrIdentifierToken(const uint8_t* input,
890 int input_length,
891 bool harmony_scoping,
892 bool harmony_modules) {
893 ASSERT(input_length >= 1);
894 const int kMinLength = 2;
895 const int kMaxLength = 10;
896 if (input_length < kMinLength || input_length > kMaxLength) {
897 return Token::IDENTIFIER;
898 }
899 switch (input[0]) {
900 default:
901 #define KEYWORD_GROUP_CASE(ch) \
902 break; \
903 case ch:
904 #define KEYWORD(keyword, token) \
905 { \
906 /* 'keyword' is a char array, so sizeof(keyword) is */ \
907 /* strlen(keyword) plus 1 for the NUL char. */ \
908 const int keyword_length = sizeof(keyword) - 1; \
909 STATIC_ASSERT(keyword_length >= kMinLength); \
910 STATIC_ASSERT(keyword_length <= kMaxLength); \
911 if (input_length == keyword_length && \
912 input[1] == keyword[1] && \
913 (keyword_length <= 2 || input[2] == keyword[2]) && \
914 (keyword_length <= 3 || input[3] == keyword[3]) && \
915 (keyword_length <= 4 || input[4] == keyword[4]) && \
916 (keyword_length <= 5 || input[5] == keyword[5]) && \
917 (keyword_length <= 6 || input[6] == keyword[6]) && \
918 (keyword_length <= 7 || input[7] == keyword[7]) && \
919 (keyword_length <= 8 || input[8] == keyword[8]) && \
920 (keyword_length <= 9 || input[9] == keyword[9])) { \
921 return token; \
922 } \
923 }
924 KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
925 }
926 return Token::IDENTIFIER;
927 }
928
929
ScanIdentifierOrKeyword()930 Token::Value Scanner::ScanIdentifierOrKeyword() {
931 ASSERT(unicode_cache_->IsIdentifierStart(c0_));
932 LiteralScope literal(this);
933 // Scan identifier start character.
934 if (c0_ == '\\') {
935 uc32 c = ScanIdentifierUnicodeEscape();
936 // Only allow legal identifier start characters.
937 if (c < 0 ||
938 c == '\\' || // No recursive escapes.
939 !unicode_cache_->IsIdentifierStart(c)) {
940 return Token::ILLEGAL;
941 }
942 AddLiteralChar(c);
943 return ScanIdentifierSuffix(&literal);
944 }
945
946 uc32 first_char = c0_;
947 Advance();
948 AddLiteralChar(first_char);
949
950 // Scan the rest of the identifier characters.
951 while (unicode_cache_->IsIdentifierPart(c0_)) {
952 if (c0_ != '\\') {
953 uc32 next_char = c0_;
954 Advance();
955 AddLiteralChar(next_char);
956 continue;
957 }
958 // Fallthrough if no longer able to complete keyword.
959 return ScanIdentifierSuffix(&literal);
960 }
961
962 literal.Complete();
963
964 if (next_.literal_chars->is_one_byte()) {
965 Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
966 return KeywordOrIdentifierToken(chars.start(),
967 chars.length(),
968 harmony_scoping_,
969 harmony_modules_);
970 }
971
972 return Token::IDENTIFIER;
973 }
974
975
ScanIdentifierSuffix(LiteralScope * literal)976 Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal) {
977 // Scan the rest of the identifier characters.
978 while (unicode_cache_->IsIdentifierPart(c0_)) {
979 if (c0_ == '\\') {
980 uc32 c = ScanIdentifierUnicodeEscape();
981 // Only allow legal identifier part characters.
982 if (c < 0 ||
983 c == '\\' ||
984 !unicode_cache_->IsIdentifierPart(c)) {
985 return Token::ILLEGAL;
986 }
987 AddLiteralChar(c);
988 } else {
989 AddLiteralChar(c0_);
990 Advance();
991 }
992 }
993 literal->Complete();
994
995 return Token::IDENTIFIER;
996 }
997
998
ScanRegExpPattern(bool seen_equal)999 bool Scanner::ScanRegExpPattern(bool seen_equal) {
1000 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
1001 bool in_character_class = false;
1002
1003 // Previous token is either '/' or '/=', in the second case, the
1004 // pattern starts at =.
1005 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
1006 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
1007
1008 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
1009 // the scanner should pass uninterpreted bodies to the RegExp
1010 // constructor.
1011 LiteralScope literal(this);
1012 if (seen_equal) {
1013 AddLiteralChar('=');
1014 }
1015
1016 while (c0_ != '/' || in_character_class) {
1017 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
1018 if (c0_ == '\\') { // Escape sequence.
1019 AddLiteralCharAdvance();
1020 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
1021 AddLiteralCharAdvance();
1022 // If the escape allows more characters, i.e., \x??, \u????, or \c?,
1023 // only "safe" characters are allowed (letters, digits, underscore),
1024 // otherwise the escape isn't valid and the invalid character has
1025 // its normal meaning. I.e., we can just continue scanning without
1026 // worrying whether the following characters are part of the escape
1027 // or not, since any '/', '\\' or '[' is guaranteed to not be part
1028 // of the escape sequence.
1029
1030 // TODO(896): At some point, parse RegExps more throughly to capture
1031 // octal esacpes in strict mode.
1032 } else { // Unescaped character.
1033 if (c0_ == '[') in_character_class = true;
1034 if (c0_ == ']') in_character_class = false;
1035 AddLiteralCharAdvance();
1036 }
1037 }
1038 Advance(); // consume '/'
1039
1040 literal.Complete();
1041
1042 return true;
1043 }
1044
1045
ScanLiteralUnicodeEscape()1046 bool Scanner::ScanLiteralUnicodeEscape() {
1047 ASSERT(c0_ == '\\');
1048 uc32 chars_read[6] = {'\\', 'u', 0, 0, 0, 0};
1049 Advance();
1050 int i = 1;
1051 if (c0_ == 'u') {
1052 i++;
1053 while (i < 6) {
1054 Advance();
1055 if (!IsHexDigit(c0_)) break;
1056 chars_read[i] = c0_;
1057 i++;
1058 }
1059 }
1060 if (i < 6) {
1061 // Incomplete escape. Undo all advances and return false.
1062 while (i > 0) {
1063 i--;
1064 PushBack(chars_read[i]);
1065 }
1066 return false;
1067 }
1068 // Complete escape. Add all chars to current literal buffer.
1069 for (int i = 0; i < 6; i++) {
1070 AddLiteralChar(chars_read[i]);
1071 }
1072 return true;
1073 }
1074
1075
ScanRegExpFlags()1076 bool Scanner::ScanRegExpFlags() {
1077 // Scan regular expression flags.
1078 LiteralScope literal(this);
1079 while (unicode_cache_->IsIdentifierPart(c0_)) {
1080 if (c0_ != '\\') {
1081 AddLiteralCharAdvance();
1082 } else {
1083 if (!ScanLiteralUnicodeEscape()) {
1084 break;
1085 }
1086 Advance();
1087 }
1088 }
1089 literal.Complete();
1090
1091 next_.location.end_pos = source_pos() - 1;
1092 return true;
1093 }
1094
1095
AllocateNextLiteralString(Isolate * isolate,PretenureFlag tenured)1096 Handle<String> Scanner::AllocateNextLiteralString(Isolate* isolate,
1097 PretenureFlag tenured) {
1098 if (is_next_literal_one_byte()) {
1099 return isolate->factory()->NewStringFromOneByte(
1100 next_literal_one_byte_string(), tenured).ToHandleChecked();
1101 } else {
1102 return isolate->factory()->NewStringFromTwoByte(
1103 next_literal_two_byte_string(), tenured).ToHandleChecked();
1104 }
1105 }
1106
1107
AllocateInternalizedString(Isolate * isolate)1108 Handle<String> Scanner::AllocateInternalizedString(Isolate* isolate) {
1109 if (is_literal_one_byte()) {
1110 return isolate->factory()->InternalizeOneByteString(
1111 literal_one_byte_string());
1112 } else {
1113 return isolate->factory()->InternalizeTwoByteString(
1114 literal_two_byte_string());
1115 }
1116 }
1117
1118
DoubleValue()1119 double Scanner::DoubleValue() {
1120 ASSERT(is_literal_one_byte());
1121 return StringToDouble(
1122 unicode_cache_,
1123 literal_one_byte_string(),
1124 ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY);
1125 }
1126
1127
FindNumber(DuplicateFinder * finder,int value)1128 int Scanner::FindNumber(DuplicateFinder* finder, int value) {
1129 return finder->AddNumber(literal_one_byte_string(), value);
1130 }
1131
1132
FindSymbol(DuplicateFinder * finder,int value)1133 int Scanner::FindSymbol(DuplicateFinder* finder, int value) {
1134 if (is_literal_one_byte()) {
1135 return finder->AddOneByteSymbol(literal_one_byte_string(), value);
1136 }
1137 return finder->AddTwoByteSymbol(literal_two_byte_string(), value);
1138 }
1139
1140
AddOneByteSymbol(Vector<const uint8_t> key,int value)1141 int DuplicateFinder::AddOneByteSymbol(Vector<const uint8_t> key, int value) {
1142 return AddSymbol(key, true, value);
1143 }
1144
1145
AddTwoByteSymbol(Vector<const uint16_t> key,int value)1146 int DuplicateFinder::AddTwoByteSymbol(Vector<const uint16_t> key, int value) {
1147 return AddSymbol(Vector<const uint8_t>::cast(key), false, value);
1148 }
1149
1150
AddSymbol(Vector<const uint8_t> key,bool is_one_byte,int value)1151 int DuplicateFinder::AddSymbol(Vector<const uint8_t> key,
1152 bool is_one_byte,
1153 int value) {
1154 uint32_t hash = Hash(key, is_one_byte);
1155 byte* encoding = BackupKey(key, is_one_byte);
1156 HashMap::Entry* entry = map_.Lookup(encoding, hash, true);
1157 int old_value = static_cast<int>(reinterpret_cast<intptr_t>(entry->value));
1158 entry->value =
1159 reinterpret_cast<void*>(static_cast<intptr_t>(value | old_value));
1160 return old_value;
1161 }
1162
1163
AddNumber(Vector<const uint8_t> key,int value)1164 int DuplicateFinder::AddNumber(Vector<const uint8_t> key, int value) {
1165 ASSERT(key.length() > 0);
1166 // Quick check for already being in canonical form.
1167 if (IsNumberCanonical(key)) {
1168 return AddOneByteSymbol(key, value);
1169 }
1170
1171 int flags = ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY;
1172 double double_value = StringToDouble(
1173 unicode_constants_, key, flags, 0.0);
1174 int length;
1175 const char* string;
1176 if (!std::isfinite(double_value)) {
1177 string = "Infinity";
1178 length = 8; // strlen("Infinity");
1179 } else {
1180 string = DoubleToCString(double_value,
1181 Vector<char>(number_buffer_, kBufferSize));
1182 length = StrLength(string);
1183 }
1184 return AddSymbol(Vector<const byte>(reinterpret_cast<const byte*>(string),
1185 length), true, value);
1186 }
1187
1188
IsNumberCanonical(Vector<const uint8_t> number)1189 bool DuplicateFinder::IsNumberCanonical(Vector<const uint8_t> number) {
1190 // Test for a safe approximation of number literals that are already
1191 // in canonical form: max 15 digits, no leading zeroes, except an
1192 // integer part that is a single zero, and no trailing zeros below
1193 // the decimal point.
1194 int pos = 0;
1195 int length = number.length();
1196 if (number.length() > 15) return false;
1197 if (number[pos] == '0') {
1198 pos++;
1199 } else {
1200 while (pos < length &&
1201 static_cast<unsigned>(number[pos] - '0') <= ('9' - '0')) pos++;
1202 }
1203 if (length == pos) return true;
1204 if (number[pos] != '.') return false;
1205 pos++;
1206 bool invalid_last_digit = true;
1207 while (pos < length) {
1208 uint8_t digit = number[pos] - '0';
1209 if (digit > '9' - '0') return false;
1210 invalid_last_digit = (digit == 0);
1211 pos++;
1212 }
1213 return !invalid_last_digit;
1214 }
1215
1216
Hash(Vector<const uint8_t> key,bool is_one_byte)1217 uint32_t DuplicateFinder::Hash(Vector<const uint8_t> key, bool is_one_byte) {
1218 // Primitive hash function, almost identical to the one used
1219 // for strings (except that it's seeded by the length and ASCII-ness).
1220 int length = key.length();
1221 uint32_t hash = (length << 1) | (is_one_byte ? 1 : 0) ;
1222 for (int i = 0; i < length; i++) {
1223 uint32_t c = key[i];
1224 hash = (hash + c) * 1025;
1225 hash ^= (hash >> 6);
1226 }
1227 return hash;
1228 }
1229
1230
Match(void * first,void * second)1231 bool DuplicateFinder::Match(void* first, void* second) {
1232 // Decode lengths.
1233 // Length + ASCII-bit is encoded as base 128, most significant heptet first,
1234 // with a 8th bit being non-zero while there are more heptets.
1235 // The value encodes the number of bytes following, and whether the original
1236 // was ASCII.
1237 byte* s1 = reinterpret_cast<byte*>(first);
1238 byte* s2 = reinterpret_cast<byte*>(second);
1239 uint32_t length_one_byte_field = 0;
1240 byte c1;
1241 do {
1242 c1 = *s1;
1243 if (c1 != *s2) return false;
1244 length_one_byte_field = (length_one_byte_field << 7) | (c1 & 0x7f);
1245 s1++;
1246 s2++;
1247 } while ((c1 & 0x80) != 0);
1248 int length = static_cast<int>(length_one_byte_field >> 1);
1249 return memcmp(s1, s2, length) == 0;
1250 }
1251
1252
BackupKey(Vector<const uint8_t> bytes,bool is_one_byte)1253 byte* DuplicateFinder::BackupKey(Vector<const uint8_t> bytes,
1254 bool is_one_byte) {
1255 uint32_t one_byte_length = (bytes.length() << 1) | (is_one_byte ? 1 : 0);
1256 backing_store_.StartSequence();
1257 // Emit one_byte_length as base-128 encoded number, with the 7th bit set
1258 // on the byte of every heptet except the last, least significant, one.
1259 if (one_byte_length >= (1 << 7)) {
1260 if (one_byte_length >= (1 << 14)) {
1261 if (one_byte_length >= (1 << 21)) {
1262 if (one_byte_length >= (1 << 28)) {
1263 backing_store_.Add(
1264 static_cast<uint8_t>((one_byte_length >> 28) | 0x80));
1265 }
1266 backing_store_.Add(
1267 static_cast<uint8_t>((one_byte_length >> 21) | 0x80u));
1268 }
1269 backing_store_.Add(
1270 static_cast<uint8_t>((one_byte_length >> 14) | 0x80u));
1271 }
1272 backing_store_.Add(static_cast<uint8_t>((one_byte_length >> 7) | 0x80u));
1273 }
1274 backing_store_.Add(static_cast<uint8_t>(one_byte_length & 0x7f));
1275
1276 backing_store_.AddBlock(bytes);
1277 return backing_store_.EndSequence().start();
1278 }
1279
1280 } } // namespace v8::internal
1281