1 // Copyright 2006-2008 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are
4 // met:
5 //
6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided
11 // with the distribution.
12 // * Neither the name of Google Inc. nor the names of its
13 // contributors may be used to endorse or promote products derived
14 // from this software without specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28 #include "v8.h"
29
30 #include "ast.h"
31 #include "scanner.h"
32
33 namespace v8 {
34 namespace internal {
35
36 // ----------------------------------------------------------------------------
37 // Character predicates
38
39
40 unibrow::Predicate<IdentifierStart, 128> Scanner::kIsIdentifierStart;
41 unibrow::Predicate<IdentifierPart, 128> Scanner::kIsIdentifierPart;
42 unibrow::Predicate<unibrow::LineTerminator, 128> Scanner::kIsLineTerminator;
43 unibrow::Predicate<unibrow::WhiteSpace, 128> Scanner::kIsWhiteSpace;
44
45
46 StaticResource<Scanner::Utf8Decoder> Scanner::utf8_decoder_;
47
48
49 // ----------------------------------------------------------------------------
50 // UTF8Buffer
51
UTF8Buffer()52 UTF8Buffer::UTF8Buffer() : data_(NULL), limit_(NULL) { }
53
54
~UTF8Buffer()55 UTF8Buffer::~UTF8Buffer() {
56 if (data_ != NULL) DeleteArray(data_);
57 }
58
59
AddCharSlow(uc32 c)60 void UTF8Buffer::AddCharSlow(uc32 c) {
61 static const int kCapacityGrowthLimit = 1 * MB;
62 if (cursor_ > limit_) {
63 int old_capacity = Capacity();
64 int old_position = pos();
65 int new_capacity =
66 Min(old_capacity * 3, old_capacity + kCapacityGrowthLimit);
67 char* new_data = NewArray<char>(new_capacity);
68 memcpy(new_data, data_, old_position);
69 DeleteArray(data_);
70 data_ = new_data;
71 cursor_ = new_data + old_position;
72 limit_ = ComputeLimit(new_data, new_capacity);
73 ASSERT(Capacity() == new_capacity && pos() == old_position);
74 }
75 if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {
76 *cursor_++ = c; // Common case: 7-bit ASCII.
77 } else {
78 cursor_ += unibrow::Utf8::Encode(cursor_, c);
79 }
80 ASSERT(pos() <= Capacity());
81 }
82
83
84 // ----------------------------------------------------------------------------
85 // UTF16Buffer
86
87
UTF16Buffer()88 UTF16Buffer::UTF16Buffer()
89 : pos_(0), size_(0) { }
90
91
SubString(int start,int end)92 Handle<String> UTF16Buffer::SubString(int start, int end) {
93 return internal::SubString(data_, start, end);
94 }
95
96
97 // CharacterStreamUTF16Buffer
CharacterStreamUTF16Buffer()98 CharacterStreamUTF16Buffer::CharacterStreamUTF16Buffer()
99 : pushback_buffer_(0), last_(0), stream_(NULL) { }
100
101
Initialize(Handle<String> data,unibrow::CharacterStream * input)102 void CharacterStreamUTF16Buffer::Initialize(Handle<String> data,
103 unibrow::CharacterStream* input) {
104 data_ = data;
105 pos_ = 0;
106 stream_ = input;
107 }
108
109
PushBack(uc32 ch)110 void CharacterStreamUTF16Buffer::PushBack(uc32 ch) {
111 pushback_buffer()->Add(last_);
112 last_ = ch;
113 pos_--;
114 }
115
116
Advance()117 uc32 CharacterStreamUTF16Buffer::Advance() {
118 // NOTE: It is of importance to Persian / Farsi resources that we do
119 // *not* strip format control characters in the scanner; see
120 //
121 // https://bugzilla.mozilla.org/show_bug.cgi?id=274152
122 //
123 // So, even though ECMA-262, section 7.1, page 11, dictates that we
124 // must remove Unicode format-control characters, we do not. This is
125 // in line with how IE and SpiderMonkey handles it.
126 if (!pushback_buffer()->is_empty()) {
127 pos_++;
128 return last_ = pushback_buffer()->RemoveLast();
129 } else if (stream_->has_more()) {
130 pos_++;
131 uc32 next = stream_->GetNext();
132 return last_ = next;
133 } else {
134 // Note: currently the following increment is necessary to avoid a
135 // test-parser problem!
136 pos_++;
137 return last_ = static_cast<uc32>(-1);
138 }
139 }
140
141
SeekForward(int pos)142 void CharacterStreamUTF16Buffer::SeekForward(int pos) {
143 pos_ = pos;
144 ASSERT(pushback_buffer()->is_empty());
145 stream_->Seek(pos);
146 }
147
148
149 // TwoByteStringUTF16Buffer
TwoByteStringUTF16Buffer()150 TwoByteStringUTF16Buffer::TwoByteStringUTF16Buffer()
151 : raw_data_(NULL) { }
152
153
Initialize(Handle<ExternalTwoByteString> data)154 void TwoByteStringUTF16Buffer::Initialize(
155 Handle<ExternalTwoByteString> data) {
156 ASSERT(!data.is_null());
157
158 data_ = data;
159 pos_ = 0;
160
161 raw_data_ = data->resource()->data();
162 size_ = data->length();
163 }
164
165
Advance()166 uc32 TwoByteStringUTF16Buffer::Advance() {
167 if (pos_ < size_) {
168 return raw_data_[pos_++];
169 } else {
170 // note: currently the following increment is necessary to avoid a
171 // test-parser problem!
172 pos_++;
173 return static_cast<uc32>(-1);
174 }
175 }
176
177
PushBack(uc32 ch)178 void TwoByteStringUTF16Buffer::PushBack(uc32 ch) {
179 pos_--;
180 ASSERT(pos_ >= Scanner::kCharacterLookaheadBufferSize);
181 ASSERT(raw_data_[pos_ - Scanner::kCharacterLookaheadBufferSize] == ch);
182 }
183
184
SeekForward(int pos)185 void TwoByteStringUTF16Buffer::SeekForward(int pos) {
186 pos_ = pos;
187 }
188
189
190 // ----------------------------------------------------------------------------
191 // Keyword Matcher
192 KeywordMatcher::FirstState KeywordMatcher::first_states_[] = {
193 { "break", KEYWORD_PREFIX, Token::BREAK },
194 { NULL, C, Token::ILLEGAL },
195 { NULL, D, Token::ILLEGAL },
196 { "else", KEYWORD_PREFIX, Token::ELSE },
197 { NULL, F, Token::ILLEGAL },
198 { NULL, UNMATCHABLE, Token::ILLEGAL },
199 { NULL, UNMATCHABLE, Token::ILLEGAL },
200 { NULL, I, Token::ILLEGAL },
201 { NULL, UNMATCHABLE, Token::ILLEGAL },
202 { NULL, UNMATCHABLE, Token::ILLEGAL },
203 { NULL, UNMATCHABLE, Token::ILLEGAL },
204 { NULL, UNMATCHABLE, Token::ILLEGAL },
205 { NULL, N, Token::ILLEGAL },
206 { NULL, UNMATCHABLE, Token::ILLEGAL },
207 { NULL, UNMATCHABLE, Token::ILLEGAL },
208 { NULL, UNMATCHABLE, Token::ILLEGAL },
209 { "return", KEYWORD_PREFIX, Token::RETURN },
210 { "switch", KEYWORD_PREFIX, Token::SWITCH },
211 { NULL, T, Token::ILLEGAL },
212 { NULL, UNMATCHABLE, Token::ILLEGAL },
213 { NULL, V, Token::ILLEGAL },
214 { NULL, W, Token::ILLEGAL }
215 };
216
217
Step(uc32 input)218 void KeywordMatcher::Step(uc32 input) {
219 switch (state_) {
220 case INITIAL: {
221 // matching the first character is the only state with significant fanout.
222 // Match only lower-case letters in range 'b'..'w'.
223 unsigned int offset = input - kFirstCharRangeMin;
224 if (offset < kFirstCharRangeLength) {
225 state_ = first_states_[offset].state;
226 if (state_ == KEYWORD_PREFIX) {
227 keyword_ = first_states_[offset].keyword;
228 counter_ = 1;
229 keyword_token_ = first_states_[offset].token;
230 }
231 return;
232 }
233 break;
234 }
235 case KEYWORD_PREFIX:
236 if (keyword_[counter_] == input) {
237 ASSERT_NE(input, '\0');
238 counter_++;
239 if (keyword_[counter_] == '\0') {
240 state_ = KEYWORD_MATCHED;
241 token_ = keyword_token_;
242 }
243 return;
244 }
245 break;
246 case KEYWORD_MATCHED:
247 token_ = Token::IDENTIFIER;
248 break;
249 case C:
250 if (MatchState(input, 'a', CA)) return;
251 if (MatchState(input, 'o', CO)) return;
252 break;
253 case CA:
254 if (MatchKeywordStart(input, "case", 2, Token::CASE)) return;
255 if (MatchKeywordStart(input, "catch", 2, Token::CATCH)) return;
256 break;
257 case CO:
258 if (MatchState(input, 'n', CON)) return;
259 break;
260 case CON:
261 if (MatchKeywordStart(input, "const", 3, Token::CONST)) return;
262 if (MatchKeywordStart(input, "continue", 3, Token::CONTINUE)) return;
263 break;
264 case D:
265 if (MatchState(input, 'e', DE)) return;
266 if (MatchKeyword(input, 'o', KEYWORD_MATCHED, Token::DO)) return;
267 break;
268 case DE:
269 if (MatchKeywordStart(input, "debugger", 2, Token::DEBUGGER)) return;
270 if (MatchKeywordStart(input, "default", 2, Token::DEFAULT)) return;
271 if (MatchKeywordStart(input, "delete", 2, Token::DELETE)) return;
272 break;
273 case F:
274 if (MatchKeywordStart(input, "false", 1, Token::FALSE_LITERAL)) return;
275 if (MatchKeywordStart(input, "finally", 1, Token::FINALLY)) return;
276 if (MatchKeywordStart(input, "for", 1, Token::FOR)) return;
277 if (MatchKeywordStart(input, "function", 1, Token::FUNCTION)) return;
278 break;
279 case I:
280 if (MatchKeyword(input, 'f', KEYWORD_MATCHED, Token::IF)) return;
281 if (MatchKeyword(input, 'n', IN, Token::IN)) return;
282 break;
283 case IN:
284 token_ = Token::IDENTIFIER;
285 if (MatchKeywordStart(input, "instanceof", 2, Token::INSTANCEOF)) {
286 return;
287 }
288 break;
289 case N:
290 if (MatchKeywordStart(input, "native", 1, Token::NATIVE)) return;
291 if (MatchKeywordStart(input, "new", 1, Token::NEW)) return;
292 if (MatchKeywordStart(input, "null", 1, Token::NULL_LITERAL)) return;
293 break;
294 case T:
295 if (MatchState(input, 'h', TH)) return;
296 if (MatchState(input, 'r', TR)) return;
297 if (MatchKeywordStart(input, "typeof", 1, Token::TYPEOF)) return;
298 break;
299 case TH:
300 if (MatchKeywordStart(input, "this", 2, Token::THIS)) return;
301 if (MatchKeywordStart(input, "throw", 2, Token::THROW)) return;
302 break;
303 case TR:
304 if (MatchKeywordStart(input, "true", 2, Token::TRUE_LITERAL)) return;
305 if (MatchKeyword(input, 'y', KEYWORD_MATCHED, Token::TRY)) return;
306 break;
307 case V:
308 if (MatchKeywordStart(input, "var", 1, Token::VAR)) return;
309 if (MatchKeywordStart(input, "void", 1, Token::VOID)) return;
310 break;
311 case W:
312 if (MatchKeywordStart(input, "while", 1, Token::WHILE)) return;
313 if (MatchKeywordStart(input, "with", 1, Token::WITH)) return;
314 break;
315 default:
316 UNREACHABLE();
317 }
318 // On fallthrough, it's a failure.
319 state_ = UNMATCHABLE;
320 }
321
322
323 // ----------------------------------------------------------------------------
324 // Scanner
325
Scanner(ParserMode pre)326 Scanner::Scanner(ParserMode pre)
327 : stack_overflow_(false), is_pre_parsing_(pre == PREPARSE) { }
328
329
Init(Handle<String> source,unibrow::CharacterStream * stream,int position,ParserLanguage language)330 void Scanner::Init(Handle<String> source,
331 unibrow::CharacterStream* stream,
332 int position,
333 ParserLanguage language) {
334 // Initialize the source buffer.
335 if (!source.is_null() && StringShape(*source).IsExternalTwoByte()) {
336 two_byte_string_buffer_.Initialize(
337 Handle<ExternalTwoByteString>::cast(source));
338 source_ = &two_byte_string_buffer_;
339 } else {
340 char_stream_buffer_.Initialize(source, stream);
341 source_ = &char_stream_buffer_;
342 }
343
344 position_ = position;
345 is_parsing_json_ = (language == JSON);
346
347 // Set c0_ (one character ahead)
348 ASSERT(kCharacterLookaheadBufferSize == 1);
349 Advance();
350 // Initializer current_ to not refer to a literal buffer.
351 current_.literal_buffer = NULL;
352
353 // Skip initial whitespace allowing HTML comment ends just like
354 // after a newline and scan first token.
355 has_line_terminator_before_next_ = true;
356 SkipWhiteSpace();
357 Scan();
358 }
359
360
SubString(int start,int end)361 Handle<String> Scanner::SubString(int start, int end) {
362 return source_->SubString(start - position_, end - position_);
363 }
364
365
Next()366 Token::Value Scanner::Next() {
367 // BUG 1215673: Find a thread safe way to set a stack limit in
368 // pre-parse mode. Otherwise, we cannot safely pre-parse from other
369 // threads.
370 current_ = next_;
371 // Check for stack-overflow before returning any tokens.
372 StackLimitCheck check;
373 if (check.HasOverflowed()) {
374 stack_overflow_ = true;
375 next_.token = Token::ILLEGAL;
376 } else {
377 Scan();
378 }
379 return current_.token;
380 }
381
382
StartLiteral()383 void Scanner::StartLiteral() {
384 // Use the first buffer unless it's currently in use by the current_ token.
385 // In most cases we won't have two literals/identifiers in a row, so
386 // the second buffer won't be used very often and is unlikely to grow much.
387 UTF8Buffer* free_buffer =
388 (current_.literal_buffer != &literal_buffer_1_) ? &literal_buffer_1_
389 : &literal_buffer_2_;
390 next_.literal_buffer = free_buffer;
391 free_buffer->Reset();
392 }
393
394
AddChar(uc32 c)395 void Scanner::AddChar(uc32 c) {
396 next_.literal_buffer->AddChar(c);
397 }
398
399
TerminateLiteral()400 void Scanner::TerminateLiteral() {
401 AddChar(0);
402 }
403
404
AddCharAdvance()405 void Scanner::AddCharAdvance() {
406 AddChar(c0_);
407 Advance();
408 }
409
410
IsByteOrderMark(uc32 c)411 static inline bool IsByteOrderMark(uc32 c) {
412 // The Unicode value U+FFFE is guaranteed never to be assigned as a
413 // Unicode character; this implies that in a Unicode context the
414 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
415 // character expressed in little-endian byte order (since it could
416 // not be a U+FFFE character expressed in big-endian byte
417 // order). Nevertheless, we check for it to be compatible with
418 // Spidermonkey.
419 return c == 0xFEFF || c == 0xFFFE;
420 }
421
422
SkipJsonWhiteSpace()423 bool Scanner::SkipJsonWhiteSpace() {
424 int start_position = source_pos();
425 // JSON WhiteSpace is tab, carrige-return, newline and space.
426 while (c0_ == ' ' || c0_ == '\n' || c0_ == '\r' || c0_ == '\t') {
427 Advance();
428 }
429 return source_pos() != start_position;
430 }
431
432
SkipJavaScriptWhiteSpace()433 bool Scanner::SkipJavaScriptWhiteSpace() {
434 int start_position = source_pos();
435
436 while (true) {
437 // We treat byte-order marks (BOMs) as whitespace for better
438 // compatibility with Spidermonkey and other JavaScript engines.
439 while (kIsWhiteSpace.get(c0_) || IsByteOrderMark(c0_)) {
440 // IsWhiteSpace() includes line terminators!
441 if (kIsLineTerminator.get(c0_)) {
442 // Ignore line terminators, but remember them. This is necessary
443 // for automatic semicolon insertion.
444 has_line_terminator_before_next_ = true;
445 }
446 Advance();
447 }
448
449 // If there is an HTML comment end '-->' at the beginning of a
450 // line (with only whitespace in front of it), we treat the rest
451 // of the line as a comment. This is in line with the way
452 // SpiderMonkey handles it.
453 if (c0_ == '-' && has_line_terminator_before_next_) {
454 Advance();
455 if (c0_ == '-') {
456 Advance();
457 if (c0_ == '>') {
458 // Treat the rest of the line as a comment.
459 SkipSingleLineComment();
460 // Continue skipping white space after the comment.
461 continue;
462 }
463 PushBack('-'); // undo Advance()
464 }
465 PushBack('-'); // undo Advance()
466 }
467 // Return whether or not we skipped any characters.
468 return source_pos() != start_position;
469 }
470 }
471
472
SkipSingleLineComment()473 Token::Value Scanner::SkipSingleLineComment() {
474 Advance();
475
476 // The line terminator at the end of the line is not considered
477 // to be part of the single-line comment; it is recognized
478 // separately by the lexical grammar and becomes part of the
479 // stream of input elements for the syntactic grammar (see
480 // ECMA-262, section 7.4, page 12).
481 while (c0_ >= 0 && !kIsLineTerminator.get(c0_)) {
482 Advance();
483 }
484
485 return Token::WHITESPACE;
486 }
487
488
SkipMultiLineComment()489 Token::Value Scanner::SkipMultiLineComment() {
490 ASSERT(c0_ == '*');
491 Advance();
492
493 while (c0_ >= 0) {
494 char ch = c0_;
495 Advance();
496 // If we have reached the end of the multi-line comment, we
497 // consume the '/' and insert a whitespace. This way all
498 // multi-line comments are treated as whitespace - even the ones
499 // containing line terminators. This contradicts ECMA-262, section
500 // 7.4, page 12, that says that multi-line comments containing
501 // line terminators should be treated as a line terminator, but it
502 // matches the behaviour of SpiderMonkey and KJS.
503 if (ch == '*' && c0_ == '/') {
504 c0_ = ' ';
505 return Token::WHITESPACE;
506 }
507 }
508
509 // Unterminated multi-line comment.
510 return Token::ILLEGAL;
511 }
512
513
ScanHtmlComment()514 Token::Value Scanner::ScanHtmlComment() {
515 // Check for <!-- comments.
516 ASSERT(c0_ == '!');
517 Advance();
518 if (c0_ == '-') {
519 Advance();
520 if (c0_ == '-') return SkipSingleLineComment();
521 PushBack('-'); // undo Advance()
522 }
523 PushBack('!'); // undo Advance()
524 ASSERT(c0_ == '!');
525 return Token::LT;
526 }
527
528
529
ScanJson()530 void Scanner::ScanJson() {
531 next_.literal_buffer = NULL;
532 Token::Value token;
533 has_line_terminator_before_next_ = false;
534 do {
535 // Remember the position of the next token
536 next_.location.beg_pos = source_pos();
537 switch (c0_) {
538 case '\t':
539 case '\r':
540 case '\n':
541 case ' ':
542 Advance();
543 token = Token::WHITESPACE;
544 break;
545 case '{':
546 Advance();
547 token = Token::LBRACE;
548 break;
549 case '}':
550 Advance();
551 token = Token::RBRACE;
552 break;
553 case '[':
554 Advance();
555 token = Token::LBRACK;
556 break;
557 case ']':
558 Advance();
559 token = Token::RBRACK;
560 break;
561 case ':':
562 Advance();
563 token = Token::COLON;
564 break;
565 case ',':
566 Advance();
567 token = Token::COMMA;
568 break;
569 case '"':
570 token = ScanJsonString();
571 break;
572 case '-':
573 case '0':
574 case '1':
575 case '2':
576 case '3':
577 case '4':
578 case '5':
579 case '6':
580 case '7':
581 case '8':
582 case '9':
583 token = ScanJsonNumber();
584 break;
585 case 't':
586 token = ScanJsonIdentifier("true", Token::TRUE_LITERAL);
587 break;
588 case 'f':
589 token = ScanJsonIdentifier("false", Token::FALSE_LITERAL);
590 break;
591 case 'n':
592 token = ScanJsonIdentifier("null", Token::NULL_LITERAL);
593 break;
594 default:
595 if (c0_ < 0) {
596 Advance();
597 token = Token::EOS;
598 } else {
599 Advance();
600 token = Select(Token::ILLEGAL);
601 }
602 }
603 } while (token == Token::WHITESPACE);
604
605 next_.location.end_pos = source_pos();
606 next_.token = token;
607 }
608
609
ScanJsonString()610 Token::Value Scanner::ScanJsonString() {
611 ASSERT_EQ('"', c0_);
612 Advance();
613 StartLiteral();
614 while (c0_ != '"' && c0_ > 0) {
615 // Check for control character (0x00-0x1f) or unterminated string (<0).
616 if (c0_ < 0x20) return Token::ILLEGAL;
617 if (c0_ != '\\') {
618 AddCharAdvance();
619 } else {
620 Advance();
621 switch (c0_) {
622 case '"':
623 case '\\':
624 case '/':
625 AddChar(c0_);
626 break;
627 case 'b':
628 AddChar('\x08');
629 break;
630 case 'f':
631 AddChar('\x0c');
632 break;
633 case 'n':
634 AddChar('\x0a');
635 break;
636 case 'r':
637 AddChar('\x0d');
638 break;
639 case 't':
640 AddChar('\x09');
641 break;
642 case 'u': {
643 uc32 value = 0;
644 for (int i = 0; i < 4; i++) {
645 Advance();
646 int digit = HexValue(c0_);
647 if (digit < 0) return Token::ILLEGAL;
648 value = value * 16 + digit;
649 }
650 AddChar(value);
651 break;
652 }
653 default:
654 return Token::ILLEGAL;
655 }
656 Advance();
657 }
658 }
659 if (c0_ != '"') {
660 return Token::ILLEGAL;
661 }
662 TerminateLiteral();
663 Advance();
664 return Token::STRING;
665 }
666
667
ScanJsonNumber()668 Token::Value Scanner::ScanJsonNumber() {
669 StartLiteral();
670 if (c0_ == '-') AddCharAdvance();
671 if (c0_ == '0') {
672 AddCharAdvance();
673 // Prefix zero is only allowed if it's the only digit before
674 // a decimal point or exponent.
675 if ('0' <= c0_ && c0_ <= '9') return Token::ILLEGAL;
676 } else {
677 if (c0_ < '1' || c0_ > '9') return Token::ILLEGAL;
678 do {
679 AddCharAdvance();
680 } while (c0_ >= '0' && c0_ <= '9');
681 }
682 if (c0_ == '.') {
683 AddCharAdvance();
684 if (c0_ < '0' || c0_ > '9') return Token::ILLEGAL;
685 do {
686 AddCharAdvance();
687 } while (c0_ >= '0' && c0_ <= '9');
688 }
689 if ((c0_ | 0x20) == 'e') {
690 AddCharAdvance();
691 if (c0_ == '-' || c0_ == '+') AddCharAdvance();
692 if (c0_ < '0' || c0_ > '9') return Token::ILLEGAL;
693 do {
694 AddCharAdvance();
695 } while (c0_ >= '0' && c0_ <= '9');
696 }
697 TerminateLiteral();
698 return Token::NUMBER;
699 }
700
701
ScanJsonIdentifier(const char * text,Token::Value token)702 Token::Value Scanner::ScanJsonIdentifier(const char* text,
703 Token::Value token) {
704 StartLiteral();
705 while (*text != '\0') {
706 if (c0_ != *text) return Token::ILLEGAL;
707 Advance();
708 text++;
709 }
710 if (kIsIdentifierPart.get(c0_)) return Token::ILLEGAL;
711 TerminateLiteral();
712 return token;
713 }
714
715
ScanJavaScript()716 void Scanner::ScanJavaScript() {
717 next_.literal_buffer = NULL;
718 Token::Value token;
719 has_line_terminator_before_next_ = false;
720 do {
721 // Remember the position of the next token
722 next_.location.beg_pos = source_pos();
723
724 switch (c0_) {
725 case ' ':
726 case '\t':
727 Advance();
728 token = Token::WHITESPACE;
729 break;
730
731 case '\n':
732 Advance();
733 has_line_terminator_before_next_ = true;
734 token = Token::WHITESPACE;
735 break;
736
737 case '"': case '\'':
738 token = ScanString();
739 break;
740
741 case '<':
742 // < <= << <<= <!--
743 Advance();
744 if (c0_ == '=') {
745 token = Select(Token::LTE);
746 } else if (c0_ == '<') {
747 token = Select('=', Token::ASSIGN_SHL, Token::SHL);
748 } else if (c0_ == '!') {
749 token = ScanHtmlComment();
750 } else {
751 token = Token::LT;
752 }
753 break;
754
755 case '>':
756 // > >= >> >>= >>> >>>=
757 Advance();
758 if (c0_ == '=') {
759 token = Select(Token::GTE);
760 } else if (c0_ == '>') {
761 // >> >>= >>> >>>=
762 Advance();
763 if (c0_ == '=') {
764 token = Select(Token::ASSIGN_SAR);
765 } else if (c0_ == '>') {
766 token = Select('=', Token::ASSIGN_SHR, Token::SHR);
767 } else {
768 token = Token::SAR;
769 }
770 } else {
771 token = Token::GT;
772 }
773 break;
774
775 case '=':
776 // = == ===
777 Advance();
778 if (c0_ == '=') {
779 token = Select('=', Token::EQ_STRICT, Token::EQ);
780 } else {
781 token = Token::ASSIGN;
782 }
783 break;
784
785 case '!':
786 // ! != !==
787 Advance();
788 if (c0_ == '=') {
789 token = Select('=', Token::NE_STRICT, Token::NE);
790 } else {
791 token = Token::NOT;
792 }
793 break;
794
795 case '+':
796 // + ++ +=
797 Advance();
798 if (c0_ == '+') {
799 token = Select(Token::INC);
800 } else if (c0_ == '=') {
801 token = Select(Token::ASSIGN_ADD);
802 } else {
803 token = Token::ADD;
804 }
805 break;
806
807 case '-':
808 // - -- --> -=
809 Advance();
810 if (c0_ == '-') {
811 Advance();
812 if (c0_ == '>' && has_line_terminator_before_next_) {
813 // For compatibility with SpiderMonkey, we skip lines that
814 // start with an HTML comment end '-->'.
815 token = SkipSingleLineComment();
816 } else {
817 token = Token::DEC;
818 }
819 } else if (c0_ == '=') {
820 token = Select(Token::ASSIGN_SUB);
821 } else {
822 token = Token::SUB;
823 }
824 break;
825
826 case '*':
827 // * *=
828 token = Select('=', Token::ASSIGN_MUL, Token::MUL);
829 break;
830
831 case '%':
832 // % %=
833 token = Select('=', Token::ASSIGN_MOD, Token::MOD);
834 break;
835
836 case '/':
837 // / // /* /=
838 Advance();
839 if (c0_ == '/') {
840 token = SkipSingleLineComment();
841 } else if (c0_ == '*') {
842 token = SkipMultiLineComment();
843 } else if (c0_ == '=') {
844 token = Select(Token::ASSIGN_DIV);
845 } else {
846 token = Token::DIV;
847 }
848 break;
849
850 case '&':
851 // & && &=
852 Advance();
853 if (c0_ == '&') {
854 token = Select(Token::AND);
855 } else if (c0_ == '=') {
856 token = Select(Token::ASSIGN_BIT_AND);
857 } else {
858 token = Token::BIT_AND;
859 }
860 break;
861
862 case '|':
863 // | || |=
864 Advance();
865 if (c0_ == '|') {
866 token = Select(Token::OR);
867 } else if (c0_ == '=') {
868 token = Select(Token::ASSIGN_BIT_OR);
869 } else {
870 token = Token::BIT_OR;
871 }
872 break;
873
874 case '^':
875 // ^ ^=
876 token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
877 break;
878
879 case '.':
880 // . Number
881 Advance();
882 if (IsDecimalDigit(c0_)) {
883 token = ScanNumber(true);
884 } else {
885 token = Token::PERIOD;
886 }
887 break;
888
889 case ':':
890 token = Select(Token::COLON);
891 break;
892
893 case ';':
894 token = Select(Token::SEMICOLON);
895 break;
896
897 case ',':
898 token = Select(Token::COMMA);
899 break;
900
901 case '(':
902 token = Select(Token::LPAREN);
903 break;
904
905 case ')':
906 token = Select(Token::RPAREN);
907 break;
908
909 case '[':
910 token = Select(Token::LBRACK);
911 break;
912
913 case ']':
914 token = Select(Token::RBRACK);
915 break;
916
917 case '{':
918 token = Select(Token::LBRACE);
919 break;
920
921 case '}':
922 token = Select(Token::RBRACE);
923 break;
924
925 case '?':
926 token = Select(Token::CONDITIONAL);
927 break;
928
929 case '~':
930 token = Select(Token::BIT_NOT);
931 break;
932
933 default:
934 if (kIsIdentifierStart.get(c0_)) {
935 token = ScanIdentifier();
936 } else if (IsDecimalDigit(c0_)) {
937 token = ScanNumber(false);
938 } else if (SkipWhiteSpace()) {
939 token = Token::WHITESPACE;
940 } else if (c0_ < 0) {
941 token = Token::EOS;
942 } else {
943 token = Select(Token::ILLEGAL);
944 }
945 break;
946 }
947
948 // Continue scanning for tokens as long as we're just skipping
949 // whitespace.
950 } while (token == Token::WHITESPACE);
951
952 next_.location.end_pos = source_pos();
953 next_.token = token;
954 }
955
956
SeekForward(int pos)957 void Scanner::SeekForward(int pos) {
958 source_->SeekForward(pos - 1);
959 Advance();
960 Scan();
961 }
962
963
ScanHexEscape(uc32 c,int length)964 uc32 Scanner::ScanHexEscape(uc32 c, int length) {
965 ASSERT(length <= 4); // prevent overflow
966
967 uc32 digits[4];
968 uc32 x = 0;
969 for (int i = 0; i < length; i++) {
970 digits[i] = c0_;
971 int d = HexValue(c0_);
972 if (d < 0) {
973 // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
974 // should be illegal, but other JS VMs just return the
975 // non-escaped version of the original character.
976
977 // Push back digits read, except the last one (in c0_).
978 for (int j = i-1; j >= 0; j--) {
979 PushBack(digits[j]);
980 }
981 // Notice: No handling of error - treat it as "\u"->"u".
982 return c;
983 }
984 x = x * 16 + d;
985 Advance();
986 }
987
988 return x;
989 }
990
991
992 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of
993 // ECMA-262. Other JS VMs support them.
ScanOctalEscape(uc32 c,int length)994 uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
995 uc32 x = c - '0';
996 for (int i = 0; i < length; i++) {
997 int d = c0_ - '0';
998 if (d < 0 || d > 7) break;
999 int nx = x * 8 + d;
1000 if (nx >= 256) break;
1001 x = nx;
1002 Advance();
1003 }
1004 return x;
1005 }
1006
1007
ScanEscape()1008 void Scanner::ScanEscape() {
1009 uc32 c = c0_;
1010 Advance();
1011
1012 // Skip escaped newlines.
1013 if (kIsLineTerminator.get(c)) {
1014 // Allow CR+LF newlines in multiline string literals.
1015 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
1016 // Allow LF+CR newlines in multiline string literals.
1017 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
1018 return;
1019 }
1020
1021 switch (c) {
1022 case '\'': // fall through
1023 case '"' : // fall through
1024 case '\\': break;
1025 case 'b' : c = '\b'; break;
1026 case 'f' : c = '\f'; break;
1027 case 'n' : c = '\n'; break;
1028 case 'r' : c = '\r'; break;
1029 case 't' : c = '\t'; break;
1030 case 'u' : c = ScanHexEscape(c, 4); break;
1031 case 'v' : c = '\v'; break;
1032 case 'x' : c = ScanHexEscape(c, 2); break;
1033 case '0' : // fall through
1034 case '1' : // fall through
1035 case '2' : // fall through
1036 case '3' : // fall through
1037 case '4' : // fall through
1038 case '5' : // fall through
1039 case '6' : // fall through
1040 case '7' : c = ScanOctalEscape(c, 2); break;
1041 }
1042
1043 // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these
1044 // should be illegal, but they are commonly handled
1045 // as non-escaped characters by JS VMs.
1046 AddChar(c);
1047 }
1048
1049
ScanString()1050 Token::Value Scanner::ScanString() {
1051 uc32 quote = c0_;
1052 Advance(); // consume quote
1053
1054 StartLiteral();
1055 while (c0_ != quote && c0_ >= 0 && !kIsLineTerminator.get(c0_)) {
1056 uc32 c = c0_;
1057 Advance();
1058 if (c == '\\') {
1059 if (c0_ < 0) return Token::ILLEGAL;
1060 ScanEscape();
1061 } else {
1062 AddChar(c);
1063 }
1064 }
1065 if (c0_ != quote) {
1066 return Token::ILLEGAL;
1067 }
1068 TerminateLiteral();
1069
1070 Advance(); // consume quote
1071 return Token::STRING;
1072 }
1073
1074
Select(Token::Value tok)1075 Token::Value Scanner::Select(Token::Value tok) {
1076 Advance();
1077 return tok;
1078 }
1079
1080
Select(uc32 next,Token::Value then,Token::Value else_)1081 Token::Value Scanner::Select(uc32 next, Token::Value then, Token::Value else_) {
1082 Advance();
1083 if (c0_ == next) {
1084 Advance();
1085 return then;
1086 } else {
1087 return else_;
1088 }
1089 }
1090
1091
1092 // Returns true if any decimal digits were scanned, returns false otherwise.
ScanDecimalDigits()1093 void Scanner::ScanDecimalDigits() {
1094 while (IsDecimalDigit(c0_))
1095 AddCharAdvance();
1096 }
1097
1098
ScanNumber(bool seen_period)1099 Token::Value Scanner::ScanNumber(bool seen_period) {
1100 ASSERT(IsDecimalDigit(c0_)); // the first digit of the number or the fraction
1101
1102 enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;
1103
1104 StartLiteral();
1105 if (seen_period) {
1106 // we have already seen a decimal point of the float
1107 AddChar('.');
1108 ScanDecimalDigits(); // we know we have at least one digit
1109
1110 } else {
1111 // if the first character is '0' we must check for octals and hex
1112 if (c0_ == '0') {
1113 AddCharAdvance();
1114
1115 // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number
1116 if (c0_ == 'x' || c0_ == 'X') {
1117 // hex number
1118 kind = HEX;
1119 AddCharAdvance();
1120 if (!IsHexDigit(c0_))
1121 // we must have at least one hex digit after 'x'/'X'
1122 return Token::ILLEGAL;
1123 while (IsHexDigit(c0_))
1124 AddCharAdvance();
1125
1126 } else if ('0' <= c0_ && c0_ <= '7') {
1127 // (possible) octal number
1128 kind = OCTAL;
1129 while (true) {
1130 if (c0_ == '8' || c0_ == '9') {
1131 kind = DECIMAL;
1132 break;
1133 }
1134 if (c0_ < '0' || '7' < c0_) break;
1135 AddCharAdvance();
1136 }
1137 }
1138 }
1139
1140 // Parse decimal digits and allow trailing fractional part.
1141 if (kind == DECIMAL) {
1142 ScanDecimalDigits(); // optional
1143 if (c0_ == '.') {
1144 AddCharAdvance();
1145 ScanDecimalDigits(); // optional
1146 }
1147 }
1148 }
1149
1150 // scan exponent, if any
1151 if (c0_ == 'e' || c0_ == 'E') {
1152 ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number
1153 if (kind == OCTAL) return Token::ILLEGAL; // no exponent for octals allowed
1154 // scan exponent
1155 AddCharAdvance();
1156 if (c0_ == '+' || c0_ == '-')
1157 AddCharAdvance();
1158 if (!IsDecimalDigit(c0_))
1159 // we must have at least one decimal digit after 'e'/'E'
1160 return Token::ILLEGAL;
1161 ScanDecimalDigits();
1162 }
1163 TerminateLiteral();
1164
1165 // The source character immediately following a numeric literal must
1166 // not be an identifier start or a decimal digit; see ECMA-262
1167 // section 7.8.3, page 17 (note that we read only one decimal digit
1168 // if the value is 0).
1169 if (IsDecimalDigit(c0_) || kIsIdentifierStart.get(c0_))
1170 return Token::ILLEGAL;
1171
1172 return Token::NUMBER;
1173 }
1174
1175
ScanIdentifierUnicodeEscape()1176 uc32 Scanner::ScanIdentifierUnicodeEscape() {
1177 Advance();
1178 if (c0_ != 'u') return unibrow::Utf8::kBadChar;
1179 Advance();
1180 uc32 c = ScanHexEscape('u', 4);
1181 // We do not allow a unicode escape sequence to start another
1182 // unicode escape sequence.
1183 if (c == '\\') return unibrow::Utf8::kBadChar;
1184 return c;
1185 }
1186
1187
ScanIdentifier()1188 Token::Value Scanner::ScanIdentifier() {
1189 ASSERT(kIsIdentifierStart.get(c0_));
1190
1191 StartLiteral();
1192 KeywordMatcher keyword_match;
1193
1194 // Scan identifier start character.
1195 if (c0_ == '\\') {
1196 uc32 c = ScanIdentifierUnicodeEscape();
1197 // Only allow legal identifier start characters.
1198 if (!kIsIdentifierStart.get(c)) return Token::ILLEGAL;
1199 AddChar(c);
1200 keyword_match.Fail();
1201 } else {
1202 AddChar(c0_);
1203 keyword_match.AddChar(c0_);
1204 Advance();
1205 }
1206
1207 // Scan the rest of the identifier characters.
1208 while (kIsIdentifierPart.get(c0_)) {
1209 if (c0_ == '\\') {
1210 uc32 c = ScanIdentifierUnicodeEscape();
1211 // Only allow legal identifier part characters.
1212 if (!kIsIdentifierPart.get(c)) return Token::ILLEGAL;
1213 AddChar(c);
1214 keyword_match.Fail();
1215 } else {
1216 AddChar(c0_);
1217 keyword_match.AddChar(c0_);
1218 Advance();
1219 }
1220 }
1221 TerminateLiteral();
1222
1223 return keyword_match.token();
1224 }
1225
1226
1227
IsIdentifier(unibrow::CharacterStream * buffer)1228 bool Scanner::IsIdentifier(unibrow::CharacterStream* buffer) {
1229 // Checks whether the buffer contains an identifier (no escape).
1230 if (!buffer->has_more()) return false;
1231 if (!kIsIdentifierStart.get(buffer->GetNext())) return false;
1232 while (buffer->has_more()) {
1233 if (!kIsIdentifierPart.get(buffer->GetNext())) return false;
1234 }
1235 return true;
1236 }
1237
1238
ScanRegExpPattern(bool seen_equal)1239 bool Scanner::ScanRegExpPattern(bool seen_equal) {
1240 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
1241 bool in_character_class = false;
1242
1243 // Previous token is either '/' or '/=', in the second case, the
1244 // pattern starts at =.
1245 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
1246 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
1247
1248 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
1249 // the scanner should pass uninterpreted bodies to the RegExp
1250 // constructor.
1251 StartLiteral();
1252 if (seen_equal)
1253 AddChar('=');
1254
1255 while (c0_ != '/' || in_character_class) {
1256 if (kIsLineTerminator.get(c0_) || c0_ < 0)
1257 return false;
1258 if (c0_ == '\\') { // escaped character
1259 AddCharAdvance();
1260 if (kIsLineTerminator.get(c0_) || c0_ < 0)
1261 return false;
1262 AddCharAdvance();
1263 } else { // unescaped character
1264 if (c0_ == '[')
1265 in_character_class = true;
1266 if (c0_ == ']')
1267 in_character_class = false;
1268 AddCharAdvance();
1269 }
1270 }
1271 Advance(); // consume '/'
1272
1273 TerminateLiteral();
1274
1275 return true;
1276 }
1277
ScanRegExpFlags()1278 bool Scanner::ScanRegExpFlags() {
1279 // Scan regular expression flags.
1280 StartLiteral();
1281 while (kIsIdentifierPart.get(c0_)) {
1282 if (c0_ == '\\') {
1283 uc32 c = ScanIdentifierUnicodeEscape();
1284 if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) {
1285 // We allow any escaped character, unlike the restriction on
1286 // IdentifierPart when it is used to build an IdentifierName.
1287 AddChar(c);
1288 continue;
1289 }
1290 }
1291 AddCharAdvance();
1292 }
1293 TerminateLiteral();
1294
1295 next_.location.end_pos = source_pos() - 1;
1296 return true;
1297 }
1298
1299 } } // namespace v8::internal
1300