• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2006-2008 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are
4 // met:
5 //
6 //     * Redistributions of source code must retain the above copyright
7 //       notice, this list of conditions and the following disclaimer.
8 //     * Redistributions in binary form must reproduce the above
9 //       copyright notice, this list of conditions and the following
10 //       disclaimer in the documentation and/or other materials provided
11 //       with the distribution.
12 //     * Neither the name of Google Inc. nor the names of its
13 //       contributors may be used to endorse or promote products derived
14 //       from this software without specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 
28 #include "v8.h"
29 
30 #include "ast.h"
31 #include "scanner.h"
32 
33 namespace v8 {
34 namespace internal {
35 
36 // ----------------------------------------------------------------------------
37 // Character predicates
38 
39 
40 unibrow::Predicate<IdentifierStart, 128> Scanner::kIsIdentifierStart;
41 unibrow::Predicate<IdentifierPart, 128> Scanner::kIsIdentifierPart;
42 unibrow::Predicate<unibrow::LineTerminator, 128> Scanner::kIsLineTerminator;
43 unibrow::Predicate<unibrow::WhiteSpace, 128> Scanner::kIsWhiteSpace;
44 
45 
46 StaticResource<Scanner::Utf8Decoder> Scanner::utf8_decoder_;
47 
48 
49 // ----------------------------------------------------------------------------
50 // UTF8Buffer
51 
UTF8Buffer()52 UTF8Buffer::UTF8Buffer() : data_(NULL), limit_(NULL) { }
53 
54 
~UTF8Buffer()55 UTF8Buffer::~UTF8Buffer() {
56   if (data_ != NULL) DeleteArray(data_);
57 }
58 
59 
AddCharSlow(uc32 c)60 void UTF8Buffer::AddCharSlow(uc32 c) {
61   static const int kCapacityGrowthLimit = 1 * MB;
62   if (cursor_ > limit_) {
63     int old_capacity = Capacity();
64     int old_position = pos();
65     int new_capacity =
66         Min(old_capacity * 3, old_capacity + kCapacityGrowthLimit);
67     char* new_data = NewArray<char>(new_capacity);
68     memcpy(new_data, data_, old_position);
69     DeleteArray(data_);
70     data_ = new_data;
71     cursor_ = new_data + old_position;
72     limit_ = ComputeLimit(new_data, new_capacity);
73     ASSERT(Capacity() == new_capacity && pos() == old_position);
74   }
75   if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {
76     *cursor_++ = c;  // Common case: 7-bit ASCII.
77   } else {
78     cursor_ += unibrow::Utf8::Encode(cursor_, c);
79   }
80   ASSERT(pos() <= Capacity());
81 }
82 
83 
84 // ----------------------------------------------------------------------------
85 // UTF16Buffer
86 
87 
UTF16Buffer()88 UTF16Buffer::UTF16Buffer()
89     : pos_(0), size_(0) { }
90 
91 
SubString(int start,int end)92 Handle<String> UTF16Buffer::SubString(int start, int end) {
93   return internal::SubString(data_, start, end);
94 }
95 
96 
97 // CharacterStreamUTF16Buffer
CharacterStreamUTF16Buffer()98 CharacterStreamUTF16Buffer::CharacterStreamUTF16Buffer()
99     : pushback_buffer_(0), last_(0), stream_(NULL) { }
100 
101 
Initialize(Handle<String> data,unibrow::CharacterStream * input)102 void CharacterStreamUTF16Buffer::Initialize(Handle<String> data,
103                                             unibrow::CharacterStream* input) {
104   data_ = data;
105   pos_ = 0;
106   stream_ = input;
107 }
108 
109 
PushBack(uc32 ch)110 void CharacterStreamUTF16Buffer::PushBack(uc32 ch) {
111   pushback_buffer()->Add(last_);
112   last_ = ch;
113   pos_--;
114 }
115 
116 
Advance()117 uc32 CharacterStreamUTF16Buffer::Advance() {
118   // NOTE: It is of importance to Persian / Farsi resources that we do
119   // *not* strip format control characters in the scanner; see
120   //
121   //    https://bugzilla.mozilla.org/show_bug.cgi?id=274152
122   //
123   // So, even though ECMA-262, section 7.1, page 11, dictates that we
124   // must remove Unicode format-control characters, we do not. This is
125   // in line with how IE and SpiderMonkey handles it.
126   if (!pushback_buffer()->is_empty()) {
127     pos_++;
128     return last_ = pushback_buffer()->RemoveLast();
129   } else if (stream_->has_more()) {
130     pos_++;
131     uc32 next = stream_->GetNext();
132     return last_ = next;
133   } else {
134     // Note: currently the following increment is necessary to avoid a
135     // test-parser problem!
136     pos_++;
137     return last_ = static_cast<uc32>(-1);
138   }
139 }
140 
141 
SeekForward(int pos)142 void CharacterStreamUTF16Buffer::SeekForward(int pos) {
143   pos_ = pos;
144   ASSERT(pushback_buffer()->is_empty());
145   stream_->Seek(pos);
146 }
147 
148 
149 // TwoByteStringUTF16Buffer
TwoByteStringUTF16Buffer()150 TwoByteStringUTF16Buffer::TwoByteStringUTF16Buffer()
151     : raw_data_(NULL) { }
152 
153 
Initialize(Handle<ExternalTwoByteString> data)154 void TwoByteStringUTF16Buffer::Initialize(
155      Handle<ExternalTwoByteString> data) {
156   ASSERT(!data.is_null());
157 
158   data_ = data;
159   pos_ = 0;
160 
161   raw_data_ = data->resource()->data();
162   size_ = data->length();
163 }
164 
165 
Advance()166 uc32 TwoByteStringUTF16Buffer::Advance() {
167   if (pos_ < size_) {
168     return raw_data_[pos_++];
169   } else {
170     // note: currently the following increment is necessary to avoid a
171     // test-parser problem!
172     pos_++;
173     return static_cast<uc32>(-1);
174   }
175 }
176 
177 
PushBack(uc32 ch)178 void TwoByteStringUTF16Buffer::PushBack(uc32 ch) {
179   pos_--;
180   ASSERT(pos_ >= Scanner::kCharacterLookaheadBufferSize);
181   ASSERT(raw_data_[pos_ - Scanner::kCharacterLookaheadBufferSize] == ch);
182 }
183 
184 
SeekForward(int pos)185 void TwoByteStringUTF16Buffer::SeekForward(int pos) {
186   pos_ = pos;
187 }
188 
189 
190 // ----------------------------------------------------------------------------
191 // Keyword Matcher
192 KeywordMatcher::FirstState KeywordMatcher::first_states_[] = {
193   { "break",  KEYWORD_PREFIX, Token::BREAK },
194   { NULL,     C,              Token::ILLEGAL },
195   { NULL,     D,              Token::ILLEGAL },
196   { "else",   KEYWORD_PREFIX, Token::ELSE },
197   { NULL,     F,              Token::ILLEGAL },
198   { NULL,     UNMATCHABLE,    Token::ILLEGAL },
199   { NULL,     UNMATCHABLE,    Token::ILLEGAL },
200   { NULL,     I,              Token::ILLEGAL },
201   { NULL,     UNMATCHABLE,    Token::ILLEGAL },
202   { NULL,     UNMATCHABLE,    Token::ILLEGAL },
203   { NULL,     UNMATCHABLE,    Token::ILLEGAL },
204   { NULL,     UNMATCHABLE,    Token::ILLEGAL },
205   { NULL,     N,              Token::ILLEGAL },
206   { NULL,     UNMATCHABLE,    Token::ILLEGAL },
207   { NULL,     UNMATCHABLE,    Token::ILLEGAL },
208   { NULL,     UNMATCHABLE,    Token::ILLEGAL },
209   { "return", KEYWORD_PREFIX, Token::RETURN },
210   { "switch", KEYWORD_PREFIX, Token::SWITCH },
211   { NULL,     T,              Token::ILLEGAL },
212   { NULL,     UNMATCHABLE,    Token::ILLEGAL },
213   { NULL,     V,              Token::ILLEGAL },
214   { NULL,     W,              Token::ILLEGAL }
215 };
216 
217 
Step(uc32 input)218 void KeywordMatcher::Step(uc32 input) {
219   switch (state_) {
220     case INITIAL: {
221       // matching the first character is the only state with significant fanout.
222       // Match only lower-case letters in range 'b'..'w'.
223       unsigned int offset = input - kFirstCharRangeMin;
224       if (offset < kFirstCharRangeLength) {
225         state_ = first_states_[offset].state;
226         if (state_ == KEYWORD_PREFIX) {
227           keyword_ = first_states_[offset].keyword;
228           counter_ = 1;
229           keyword_token_ = first_states_[offset].token;
230         }
231         return;
232       }
233       break;
234     }
235     case KEYWORD_PREFIX:
236       if (keyword_[counter_] == input) {
237         ASSERT_NE(input, '\0');
238         counter_++;
239         if (keyword_[counter_] == '\0') {
240           state_ = KEYWORD_MATCHED;
241           token_ = keyword_token_;
242         }
243         return;
244       }
245       break;
246     case KEYWORD_MATCHED:
247       token_ = Token::IDENTIFIER;
248       break;
249     case C:
250       if (MatchState(input, 'a', CA)) return;
251       if (MatchState(input, 'o', CO)) return;
252       break;
253     case CA:
254       if (MatchKeywordStart(input, "case", 2, Token::CASE)) return;
255       if (MatchKeywordStart(input, "catch", 2, Token::CATCH)) return;
256       break;
257     case CO:
258       if (MatchState(input, 'n', CON)) return;
259       break;
260     case CON:
261       if (MatchKeywordStart(input, "const", 3, Token::CONST)) return;
262       if (MatchKeywordStart(input, "continue", 3, Token::CONTINUE)) return;
263       break;
264     case D:
265       if (MatchState(input, 'e', DE)) return;
266       if (MatchKeyword(input, 'o', KEYWORD_MATCHED, Token::DO)) return;
267       break;
268     case DE:
269       if (MatchKeywordStart(input, "debugger", 2, Token::DEBUGGER)) return;
270       if (MatchKeywordStart(input, "default", 2, Token::DEFAULT)) return;
271       if (MatchKeywordStart(input, "delete", 2, Token::DELETE)) return;
272       break;
273     case F:
274       if (MatchKeywordStart(input, "false", 1, Token::FALSE_LITERAL)) return;
275       if (MatchKeywordStart(input, "finally", 1, Token::FINALLY)) return;
276       if (MatchKeywordStart(input, "for", 1, Token::FOR)) return;
277       if (MatchKeywordStart(input, "function", 1, Token::FUNCTION)) return;
278       break;
279     case I:
280       if (MatchKeyword(input, 'f', KEYWORD_MATCHED, Token::IF)) return;
281       if (MatchKeyword(input, 'n', IN, Token::IN)) return;
282       break;
283     case IN:
284       token_ = Token::IDENTIFIER;
285       if (MatchKeywordStart(input, "instanceof", 2, Token::INSTANCEOF)) {
286         return;
287       }
288       break;
289     case N:
290       if (MatchKeywordStart(input, "native", 1, Token::NATIVE)) return;
291       if (MatchKeywordStart(input, "new", 1, Token::NEW)) return;
292       if (MatchKeywordStart(input, "null", 1, Token::NULL_LITERAL)) return;
293       break;
294     case T:
295       if (MatchState(input, 'h', TH)) return;
296       if (MatchState(input, 'r', TR)) return;
297       if (MatchKeywordStart(input, "typeof", 1, Token::TYPEOF)) return;
298       break;
299     case TH:
300       if (MatchKeywordStart(input, "this", 2, Token::THIS)) return;
301       if (MatchKeywordStart(input, "throw", 2, Token::THROW)) return;
302       break;
303     case TR:
304       if (MatchKeywordStart(input, "true", 2, Token::TRUE_LITERAL)) return;
305       if (MatchKeyword(input, 'y', KEYWORD_MATCHED, Token::TRY)) return;
306       break;
307     case V:
308       if (MatchKeywordStart(input, "var", 1, Token::VAR)) return;
309       if (MatchKeywordStart(input, "void", 1, Token::VOID)) return;
310       break;
311     case W:
312       if (MatchKeywordStart(input, "while", 1, Token::WHILE)) return;
313       if (MatchKeywordStart(input, "with", 1, Token::WITH)) return;
314       break;
315     default:
316       UNREACHABLE();
317   }
318   // On fallthrough, it's a failure.
319   state_ = UNMATCHABLE;
320 }
321 
322 
323 // ----------------------------------------------------------------------------
324 // Scanner
325 
Scanner(ParserMode pre)326 Scanner::Scanner(ParserMode pre)
327     : stack_overflow_(false), is_pre_parsing_(pre == PREPARSE) { }
328 
329 
Init(Handle<String> source,unibrow::CharacterStream * stream,int position,ParserLanguage language)330 void Scanner::Init(Handle<String> source,
331                    unibrow::CharacterStream* stream,
332                    int position,
333                    ParserLanguage language) {
334   // Initialize the source buffer.
335   if (!source.is_null() && StringShape(*source).IsExternalTwoByte()) {
336     two_byte_string_buffer_.Initialize(
337         Handle<ExternalTwoByteString>::cast(source));
338     source_ = &two_byte_string_buffer_;
339   } else {
340     char_stream_buffer_.Initialize(source, stream);
341     source_ = &char_stream_buffer_;
342   }
343 
344   position_ = position;
345   is_parsing_json_ = (language == JSON);
346 
347   // Set c0_ (one character ahead)
348   ASSERT(kCharacterLookaheadBufferSize == 1);
349   Advance();
350   // Initializer current_ to not refer to a literal buffer.
351   current_.literal_buffer = NULL;
352 
353   // Skip initial whitespace allowing HTML comment ends just like
354   // after a newline and scan first token.
355   has_line_terminator_before_next_ = true;
356   SkipWhiteSpace();
357   Scan();
358 }
359 
360 
SubString(int start,int end)361 Handle<String> Scanner::SubString(int start, int end) {
362   return source_->SubString(start - position_, end - position_);
363 }
364 
365 
Next()366 Token::Value Scanner::Next() {
367   // BUG 1215673: Find a thread safe way to set a stack limit in
368   // pre-parse mode. Otherwise, we cannot safely pre-parse from other
369   // threads.
370   current_ = next_;
371   // Check for stack-overflow before returning any tokens.
372   StackLimitCheck check;
373   if (check.HasOverflowed()) {
374     stack_overflow_ = true;
375     next_.token = Token::ILLEGAL;
376   } else {
377     Scan();
378   }
379   return current_.token;
380 }
381 
382 
StartLiteral()383 void Scanner::StartLiteral() {
384   // Use the first buffer unless it's currently in use by the current_ token.
385   // In most cases we won't have two literals/identifiers in a row, so
386   // the second buffer won't be used very often and is unlikely to grow much.
387   UTF8Buffer* free_buffer =
388       (current_.literal_buffer != &literal_buffer_1_) ? &literal_buffer_1_
389                                                       : &literal_buffer_2_;
390   next_.literal_buffer = free_buffer;
391   free_buffer->Reset();
392 }
393 
394 
AddChar(uc32 c)395 void Scanner::AddChar(uc32 c) {
396   next_.literal_buffer->AddChar(c);
397 }
398 
399 
TerminateLiteral()400 void Scanner::TerminateLiteral() {
401   AddChar(0);
402 }
403 
404 
AddCharAdvance()405 void Scanner::AddCharAdvance() {
406   AddChar(c0_);
407   Advance();
408 }
409 
410 
IsByteOrderMark(uc32 c)411 static inline bool IsByteOrderMark(uc32 c) {
412   // The Unicode value U+FFFE is guaranteed never to be assigned as a
413   // Unicode character; this implies that in a Unicode context the
414   // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
415   // character expressed in little-endian byte order (since it could
416   // not be a U+FFFE character expressed in big-endian byte
417   // order). Nevertheless, we check for it to be compatible with
418   // Spidermonkey.
419   return c == 0xFEFF || c == 0xFFFE;
420 }
421 
422 
SkipJsonWhiteSpace()423 bool Scanner::SkipJsonWhiteSpace() {
424   int start_position = source_pos();
425   // JSON WhiteSpace is tab, carrige-return, newline and space.
426   while (c0_ == ' ' || c0_ == '\n' || c0_ == '\r' || c0_ == '\t') {
427     Advance();
428   }
429   return source_pos() != start_position;
430 }
431 
432 
SkipJavaScriptWhiteSpace()433 bool Scanner::SkipJavaScriptWhiteSpace() {
434   int start_position = source_pos();
435 
436   while (true) {
437     // We treat byte-order marks (BOMs) as whitespace for better
438     // compatibility with Spidermonkey and other JavaScript engines.
439     while (kIsWhiteSpace.get(c0_) || IsByteOrderMark(c0_)) {
440       // IsWhiteSpace() includes line terminators!
441       if (kIsLineTerminator.get(c0_)) {
442         // Ignore line terminators, but remember them. This is necessary
443         // for automatic semicolon insertion.
444         has_line_terminator_before_next_ = true;
445       }
446       Advance();
447     }
448 
449     // If there is an HTML comment end '-->' at the beginning of a
450     // line (with only whitespace in front of it), we treat the rest
451     // of the line as a comment. This is in line with the way
452     // SpiderMonkey handles it.
453     if (c0_ == '-' && has_line_terminator_before_next_) {
454       Advance();
455       if (c0_ == '-') {
456         Advance();
457         if (c0_ == '>') {
458           // Treat the rest of the line as a comment.
459           SkipSingleLineComment();
460           // Continue skipping white space after the comment.
461           continue;
462         }
463         PushBack('-');  // undo Advance()
464       }
465       PushBack('-');  // undo Advance()
466     }
467     // Return whether or not we skipped any characters.
468     return source_pos() != start_position;
469   }
470 }
471 
472 
SkipSingleLineComment()473 Token::Value Scanner::SkipSingleLineComment() {
474   Advance();
475 
476   // The line terminator at the end of the line is not considered
477   // to be part of the single-line comment; it is recognized
478   // separately by the lexical grammar and becomes part of the
479   // stream of input elements for the syntactic grammar (see
480   // ECMA-262, section 7.4, page 12).
481   while (c0_ >= 0 && !kIsLineTerminator.get(c0_)) {
482     Advance();
483   }
484 
485   return Token::WHITESPACE;
486 }
487 
488 
SkipMultiLineComment()489 Token::Value Scanner::SkipMultiLineComment() {
490   ASSERT(c0_ == '*');
491   Advance();
492 
493   while (c0_ >= 0) {
494     char ch = c0_;
495     Advance();
496     // If we have reached the end of the multi-line comment, we
497     // consume the '/' and insert a whitespace. This way all
498     // multi-line comments are treated as whitespace - even the ones
499     // containing line terminators. This contradicts ECMA-262, section
500     // 7.4, page 12, that says that multi-line comments containing
501     // line terminators should be treated as a line terminator, but it
502     // matches the behaviour of SpiderMonkey and KJS.
503     if (ch == '*' && c0_ == '/') {
504       c0_ = ' ';
505       return Token::WHITESPACE;
506     }
507   }
508 
509   // Unterminated multi-line comment.
510   return Token::ILLEGAL;
511 }
512 
513 
ScanHtmlComment()514 Token::Value Scanner::ScanHtmlComment() {
515   // Check for <!-- comments.
516   ASSERT(c0_ == '!');
517   Advance();
518   if (c0_ == '-') {
519     Advance();
520     if (c0_ == '-') return SkipSingleLineComment();
521     PushBack('-');  // undo Advance()
522   }
523   PushBack('!');  // undo Advance()
524   ASSERT(c0_ == '!');
525   return Token::LT;
526 }
527 
528 
529 
ScanJson()530 void Scanner::ScanJson() {
531   next_.literal_buffer = NULL;
532   Token::Value token;
533   has_line_terminator_before_next_ = false;
534   do {
535     // Remember the position of the next token
536     next_.location.beg_pos = source_pos();
537     switch (c0_) {
538       case '\t':
539       case '\r':
540       case '\n':
541       case ' ':
542         Advance();
543         token = Token::WHITESPACE;
544         break;
545       case '{':
546         Advance();
547         token = Token::LBRACE;
548         break;
549       case '}':
550         Advance();
551         token = Token::RBRACE;
552         break;
553       case '[':
554         Advance();
555         token = Token::LBRACK;
556         break;
557       case ']':
558         Advance();
559         token = Token::RBRACK;
560         break;
561       case ':':
562         Advance();
563         token = Token::COLON;
564         break;
565       case ',':
566         Advance();
567         token = Token::COMMA;
568         break;
569       case '"':
570         token = ScanJsonString();
571         break;
572       case '-':
573       case '0':
574       case '1':
575       case '2':
576       case '3':
577       case '4':
578       case '5':
579       case '6':
580       case '7':
581       case '8':
582       case '9':
583         token = ScanJsonNumber();
584         break;
585       case 't':
586         token = ScanJsonIdentifier("true", Token::TRUE_LITERAL);
587         break;
588       case 'f':
589         token = ScanJsonIdentifier("false", Token::FALSE_LITERAL);
590         break;
591       case 'n':
592         token = ScanJsonIdentifier("null", Token::NULL_LITERAL);
593         break;
594       default:
595         if (c0_ < 0) {
596           Advance();
597           token = Token::EOS;
598         } else {
599           Advance();
600           token = Select(Token::ILLEGAL);
601         }
602     }
603   } while (token == Token::WHITESPACE);
604 
605   next_.location.end_pos = source_pos();
606   next_.token = token;
607 }
608 
609 
ScanJsonString()610 Token::Value Scanner::ScanJsonString() {
611   ASSERT_EQ('"', c0_);
612   Advance();
613   StartLiteral();
614   while (c0_ != '"' && c0_ > 0) {
615     // Check for control character (0x00-0x1f) or unterminated string (<0).
616     if (c0_ < 0x20) return Token::ILLEGAL;
617     if (c0_ != '\\') {
618       AddCharAdvance();
619     } else {
620       Advance();
621       switch (c0_) {
622         case '"':
623         case '\\':
624         case '/':
625           AddChar(c0_);
626           break;
627         case 'b':
628           AddChar('\x08');
629           break;
630         case 'f':
631           AddChar('\x0c');
632           break;
633         case 'n':
634           AddChar('\x0a');
635           break;
636         case 'r':
637           AddChar('\x0d');
638           break;
639         case 't':
640           AddChar('\x09');
641           break;
642         case 'u': {
643           uc32 value = 0;
644           for (int i = 0; i < 4; i++) {
645             Advance();
646             int digit = HexValue(c0_);
647             if (digit < 0) return Token::ILLEGAL;
648             value = value * 16 + digit;
649           }
650           AddChar(value);
651           break;
652         }
653         default:
654           return Token::ILLEGAL;
655       }
656       Advance();
657     }
658   }
659   if (c0_ != '"') {
660     return Token::ILLEGAL;
661   }
662   TerminateLiteral();
663   Advance();
664   return Token::STRING;
665 }
666 
667 
ScanJsonNumber()668 Token::Value Scanner::ScanJsonNumber() {
669   StartLiteral();
670   if (c0_ == '-') AddCharAdvance();
671   if (c0_ == '0') {
672     AddCharAdvance();
673     // Prefix zero is only allowed if it's the only digit before
674     // a decimal point or exponent.
675     if ('0' <= c0_ && c0_ <= '9') return Token::ILLEGAL;
676   } else {
677     if (c0_ < '1' || c0_ > '9') return Token::ILLEGAL;
678     do {
679       AddCharAdvance();
680     } while (c0_ >= '0' && c0_ <= '9');
681   }
682   if (c0_ == '.') {
683     AddCharAdvance();
684     if (c0_ < '0' || c0_ > '9') return Token::ILLEGAL;
685     do {
686       AddCharAdvance();
687     } while (c0_ >= '0' && c0_ <= '9');
688   }
689   if ((c0_ | 0x20) == 'e') {
690     AddCharAdvance();
691     if (c0_ == '-' || c0_ == '+') AddCharAdvance();
692     if (c0_ < '0' || c0_ > '9') return Token::ILLEGAL;
693     do {
694       AddCharAdvance();
695     } while (c0_ >= '0' && c0_ <= '9');
696   }
697   TerminateLiteral();
698   return Token::NUMBER;
699 }
700 
701 
ScanJsonIdentifier(const char * text,Token::Value token)702 Token::Value Scanner::ScanJsonIdentifier(const char* text,
703                                          Token::Value token) {
704   StartLiteral();
705   while (*text != '\0') {
706     if (c0_ != *text) return Token::ILLEGAL;
707     Advance();
708     text++;
709   }
710   if (kIsIdentifierPart.get(c0_)) return Token::ILLEGAL;
711   TerminateLiteral();
712   return token;
713 }
714 
715 
ScanJavaScript()716 void Scanner::ScanJavaScript() {
717   next_.literal_buffer = NULL;
718   Token::Value token;
719   has_line_terminator_before_next_ = false;
720   do {
721     // Remember the position of the next token
722     next_.location.beg_pos = source_pos();
723 
724     switch (c0_) {
725       case ' ':
726       case '\t':
727         Advance();
728         token = Token::WHITESPACE;
729         break;
730 
731       case '\n':
732         Advance();
733         has_line_terminator_before_next_ = true;
734         token = Token::WHITESPACE;
735         break;
736 
737       case '"': case '\'':
738         token = ScanString();
739         break;
740 
741       case '<':
742         // < <= << <<= <!--
743         Advance();
744         if (c0_ == '=') {
745           token = Select(Token::LTE);
746         } else if (c0_ == '<') {
747           token = Select('=', Token::ASSIGN_SHL, Token::SHL);
748         } else if (c0_ == '!') {
749           token = ScanHtmlComment();
750         } else {
751           token = Token::LT;
752         }
753         break;
754 
755       case '>':
756         // > >= >> >>= >>> >>>=
757         Advance();
758         if (c0_ == '=') {
759           token = Select(Token::GTE);
760         } else if (c0_ == '>') {
761           // >> >>= >>> >>>=
762           Advance();
763           if (c0_ == '=') {
764             token = Select(Token::ASSIGN_SAR);
765           } else if (c0_ == '>') {
766             token = Select('=', Token::ASSIGN_SHR, Token::SHR);
767           } else {
768             token = Token::SAR;
769           }
770         } else {
771           token = Token::GT;
772         }
773         break;
774 
775       case '=':
776         // = == ===
777         Advance();
778         if (c0_ == '=') {
779           token = Select('=', Token::EQ_STRICT, Token::EQ);
780         } else {
781           token = Token::ASSIGN;
782         }
783         break;
784 
785       case '!':
786         // ! != !==
787         Advance();
788         if (c0_ == '=') {
789           token = Select('=', Token::NE_STRICT, Token::NE);
790         } else {
791           token = Token::NOT;
792         }
793         break;
794 
795       case '+':
796         // + ++ +=
797         Advance();
798         if (c0_ == '+') {
799           token = Select(Token::INC);
800         } else if (c0_ == '=') {
801           token = Select(Token::ASSIGN_ADD);
802         } else {
803           token = Token::ADD;
804         }
805         break;
806 
807       case '-':
808         // - -- --> -=
809         Advance();
810         if (c0_ == '-') {
811           Advance();
812           if (c0_ == '>' && has_line_terminator_before_next_) {
813             // For compatibility with SpiderMonkey, we skip lines that
814             // start with an HTML comment end '-->'.
815             token = SkipSingleLineComment();
816           } else {
817             token = Token::DEC;
818           }
819         } else if (c0_ == '=') {
820           token = Select(Token::ASSIGN_SUB);
821         } else {
822           token = Token::SUB;
823         }
824         break;
825 
826       case '*':
827         // * *=
828         token = Select('=', Token::ASSIGN_MUL, Token::MUL);
829         break;
830 
831       case '%':
832         // % %=
833         token = Select('=', Token::ASSIGN_MOD, Token::MOD);
834         break;
835 
836       case '/':
837         // /  // /* /=
838         Advance();
839         if (c0_ == '/') {
840           token = SkipSingleLineComment();
841         } else if (c0_ == '*') {
842           token = SkipMultiLineComment();
843         } else if (c0_ == '=') {
844           token = Select(Token::ASSIGN_DIV);
845         } else {
846           token = Token::DIV;
847         }
848         break;
849 
850       case '&':
851         // & && &=
852         Advance();
853         if (c0_ == '&') {
854           token = Select(Token::AND);
855         } else if (c0_ == '=') {
856           token = Select(Token::ASSIGN_BIT_AND);
857         } else {
858           token = Token::BIT_AND;
859         }
860         break;
861 
862       case '|':
863         // | || |=
864         Advance();
865         if (c0_ == '|') {
866           token = Select(Token::OR);
867         } else if (c0_ == '=') {
868           token = Select(Token::ASSIGN_BIT_OR);
869         } else {
870           token = Token::BIT_OR;
871         }
872         break;
873 
874       case '^':
875         // ^ ^=
876         token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
877         break;
878 
879       case '.':
880         // . Number
881         Advance();
882         if (IsDecimalDigit(c0_)) {
883           token = ScanNumber(true);
884         } else {
885           token = Token::PERIOD;
886         }
887         break;
888 
889       case ':':
890         token = Select(Token::COLON);
891         break;
892 
893       case ';':
894         token = Select(Token::SEMICOLON);
895         break;
896 
897       case ',':
898         token = Select(Token::COMMA);
899         break;
900 
901       case '(':
902         token = Select(Token::LPAREN);
903         break;
904 
905       case ')':
906         token = Select(Token::RPAREN);
907         break;
908 
909       case '[':
910         token = Select(Token::LBRACK);
911         break;
912 
913       case ']':
914         token = Select(Token::RBRACK);
915         break;
916 
917       case '{':
918         token = Select(Token::LBRACE);
919         break;
920 
921       case '}':
922         token = Select(Token::RBRACE);
923         break;
924 
925       case '?':
926         token = Select(Token::CONDITIONAL);
927         break;
928 
929       case '~':
930         token = Select(Token::BIT_NOT);
931         break;
932 
933       default:
934         if (kIsIdentifierStart.get(c0_)) {
935           token = ScanIdentifier();
936         } else if (IsDecimalDigit(c0_)) {
937           token = ScanNumber(false);
938         } else if (SkipWhiteSpace()) {
939           token = Token::WHITESPACE;
940         } else if (c0_ < 0) {
941           token = Token::EOS;
942         } else {
943           token = Select(Token::ILLEGAL);
944         }
945         break;
946     }
947 
948     // Continue scanning for tokens as long as we're just skipping
949     // whitespace.
950   } while (token == Token::WHITESPACE);
951 
952   next_.location.end_pos = source_pos();
953   next_.token = token;
954 }
955 
956 
SeekForward(int pos)957 void Scanner::SeekForward(int pos) {
958   source_->SeekForward(pos - 1);
959   Advance();
960   Scan();
961 }
962 
963 
ScanHexEscape(uc32 c,int length)964 uc32 Scanner::ScanHexEscape(uc32 c, int length) {
965   ASSERT(length <= 4);  // prevent overflow
966 
967   uc32 digits[4];
968   uc32 x = 0;
969   for (int i = 0; i < length; i++) {
970     digits[i] = c0_;
971     int d = HexValue(c0_);
972     if (d < 0) {
973       // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
974       // should be illegal, but other JS VMs just return the
975       // non-escaped version of the original character.
976 
977       // Push back digits read, except the last one (in c0_).
978       for (int j = i-1; j >= 0; j--) {
979         PushBack(digits[j]);
980       }
981       // Notice: No handling of error - treat it as "\u"->"u".
982       return c;
983     }
984     x = x * 16 + d;
985     Advance();
986   }
987 
988   return x;
989 }
990 
991 
992 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of
993 // ECMA-262. Other JS VMs support them.
ScanOctalEscape(uc32 c,int length)994 uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
995   uc32 x = c - '0';
996   for (int i = 0; i < length; i++) {
997     int d = c0_ - '0';
998     if (d < 0 || d > 7) break;
999     int nx = x * 8 + d;
1000     if (nx >= 256) break;
1001     x = nx;
1002     Advance();
1003   }
1004   return x;
1005 }
1006 
1007 
ScanEscape()1008 void Scanner::ScanEscape() {
1009   uc32 c = c0_;
1010   Advance();
1011 
1012   // Skip escaped newlines.
1013   if (kIsLineTerminator.get(c)) {
1014     // Allow CR+LF newlines in multiline string literals.
1015     if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
1016     // Allow LF+CR newlines in multiline string literals.
1017     if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
1018     return;
1019   }
1020 
1021   switch (c) {
1022     case '\'':  // fall through
1023     case '"' :  // fall through
1024     case '\\': break;
1025     case 'b' : c = '\b'; break;
1026     case 'f' : c = '\f'; break;
1027     case 'n' : c = '\n'; break;
1028     case 'r' : c = '\r'; break;
1029     case 't' : c = '\t'; break;
1030     case 'u' : c = ScanHexEscape(c, 4); break;
1031     case 'v' : c = '\v'; break;
1032     case 'x' : c = ScanHexEscape(c, 2); break;
1033     case '0' :  // fall through
1034     case '1' :  // fall through
1035     case '2' :  // fall through
1036     case '3' :  // fall through
1037     case '4' :  // fall through
1038     case '5' :  // fall through
1039     case '6' :  // fall through
1040     case '7' : c = ScanOctalEscape(c, 2); break;
1041   }
1042 
1043   // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these
1044   // should be illegal, but they are commonly handled
1045   // as non-escaped characters by JS VMs.
1046   AddChar(c);
1047 }
1048 
1049 
ScanString()1050 Token::Value Scanner::ScanString() {
1051   uc32 quote = c0_;
1052   Advance();  // consume quote
1053 
1054   StartLiteral();
1055   while (c0_ != quote && c0_ >= 0 && !kIsLineTerminator.get(c0_)) {
1056     uc32 c = c0_;
1057     Advance();
1058     if (c == '\\') {
1059       if (c0_ < 0) return Token::ILLEGAL;
1060       ScanEscape();
1061     } else {
1062       AddChar(c);
1063     }
1064   }
1065   if (c0_ != quote) {
1066     return Token::ILLEGAL;
1067   }
1068   TerminateLiteral();
1069 
1070   Advance();  // consume quote
1071   return Token::STRING;
1072 }
1073 
1074 
Select(Token::Value tok)1075 Token::Value Scanner::Select(Token::Value tok) {
1076   Advance();
1077   return tok;
1078 }
1079 
1080 
Select(uc32 next,Token::Value then,Token::Value else_)1081 Token::Value Scanner::Select(uc32 next, Token::Value then, Token::Value else_) {
1082   Advance();
1083   if (c0_ == next) {
1084     Advance();
1085     return then;
1086   } else {
1087     return else_;
1088   }
1089 }
1090 
1091 
1092 // Returns true if any decimal digits were scanned, returns false otherwise.
ScanDecimalDigits()1093 void Scanner::ScanDecimalDigits() {
1094   while (IsDecimalDigit(c0_))
1095     AddCharAdvance();
1096 }
1097 
1098 
ScanNumber(bool seen_period)1099 Token::Value Scanner::ScanNumber(bool seen_period) {
1100   ASSERT(IsDecimalDigit(c0_));  // the first digit of the number or the fraction
1101 
1102   enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;
1103 
1104   StartLiteral();
1105   if (seen_period) {
1106     // we have already seen a decimal point of the float
1107     AddChar('.');
1108     ScanDecimalDigits();  // we know we have at least one digit
1109 
1110   } else {
1111     // if the first character is '0' we must check for octals and hex
1112     if (c0_ == '0') {
1113       AddCharAdvance();
1114 
1115       // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number
1116       if (c0_ == 'x' || c0_ == 'X') {
1117         // hex number
1118         kind = HEX;
1119         AddCharAdvance();
1120         if (!IsHexDigit(c0_))
1121           // we must have at least one hex digit after 'x'/'X'
1122           return Token::ILLEGAL;
1123         while (IsHexDigit(c0_))
1124           AddCharAdvance();
1125 
1126       } else if ('0' <= c0_ && c0_ <= '7') {
1127         // (possible) octal number
1128         kind = OCTAL;
1129         while (true) {
1130           if (c0_ == '8' || c0_ == '9') {
1131             kind = DECIMAL;
1132             break;
1133           }
1134           if (c0_  < '0' || '7'  < c0_) break;
1135           AddCharAdvance();
1136         }
1137       }
1138     }
1139 
1140     // Parse decimal digits and allow trailing fractional part.
1141     if (kind == DECIMAL) {
1142       ScanDecimalDigits();  // optional
1143       if (c0_ == '.') {
1144         AddCharAdvance();
1145         ScanDecimalDigits();  // optional
1146       }
1147     }
1148   }
1149 
1150   // scan exponent, if any
1151   if (c0_ == 'e' || c0_ == 'E') {
1152     ASSERT(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
1153     if (kind == OCTAL) return Token::ILLEGAL;  // no exponent for octals allowed
1154     // scan exponent
1155     AddCharAdvance();
1156     if (c0_ == '+' || c0_ == '-')
1157       AddCharAdvance();
1158     if (!IsDecimalDigit(c0_))
1159       // we must have at least one decimal digit after 'e'/'E'
1160       return Token::ILLEGAL;
1161     ScanDecimalDigits();
1162   }
1163   TerminateLiteral();
1164 
1165   // The source character immediately following a numeric literal must
1166   // not be an identifier start or a decimal digit; see ECMA-262
1167   // section 7.8.3, page 17 (note that we read only one decimal digit
1168   // if the value is 0).
1169   if (IsDecimalDigit(c0_) || kIsIdentifierStart.get(c0_))
1170     return Token::ILLEGAL;
1171 
1172   return Token::NUMBER;
1173 }
1174 
1175 
ScanIdentifierUnicodeEscape()1176 uc32 Scanner::ScanIdentifierUnicodeEscape() {
1177   Advance();
1178   if (c0_ != 'u') return unibrow::Utf8::kBadChar;
1179   Advance();
1180   uc32 c = ScanHexEscape('u', 4);
1181   // We do not allow a unicode escape sequence to start another
1182   // unicode escape sequence.
1183   if (c == '\\') return unibrow::Utf8::kBadChar;
1184   return c;
1185 }
1186 
1187 
ScanIdentifier()1188 Token::Value Scanner::ScanIdentifier() {
1189   ASSERT(kIsIdentifierStart.get(c0_));
1190 
1191   StartLiteral();
1192   KeywordMatcher keyword_match;
1193 
1194   // Scan identifier start character.
1195   if (c0_ == '\\') {
1196     uc32 c = ScanIdentifierUnicodeEscape();
1197     // Only allow legal identifier start characters.
1198     if (!kIsIdentifierStart.get(c)) return Token::ILLEGAL;
1199     AddChar(c);
1200     keyword_match.Fail();
1201   } else {
1202     AddChar(c0_);
1203     keyword_match.AddChar(c0_);
1204     Advance();
1205   }
1206 
1207   // Scan the rest of the identifier characters.
1208   while (kIsIdentifierPart.get(c0_)) {
1209     if (c0_ == '\\') {
1210       uc32 c = ScanIdentifierUnicodeEscape();
1211       // Only allow legal identifier part characters.
1212       if (!kIsIdentifierPart.get(c)) return Token::ILLEGAL;
1213       AddChar(c);
1214       keyword_match.Fail();
1215     } else {
1216       AddChar(c0_);
1217       keyword_match.AddChar(c0_);
1218       Advance();
1219     }
1220   }
1221   TerminateLiteral();
1222 
1223   return keyword_match.token();
1224 }
1225 
1226 
1227 
IsIdentifier(unibrow::CharacterStream * buffer)1228 bool Scanner::IsIdentifier(unibrow::CharacterStream* buffer) {
1229   // Checks whether the buffer contains an identifier (no escape).
1230   if (!buffer->has_more()) return false;
1231   if (!kIsIdentifierStart.get(buffer->GetNext())) return false;
1232   while (buffer->has_more()) {
1233     if (!kIsIdentifierPart.get(buffer->GetNext())) return false;
1234   }
1235   return true;
1236 }
1237 
1238 
ScanRegExpPattern(bool seen_equal)1239 bool Scanner::ScanRegExpPattern(bool seen_equal) {
1240   // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
1241   bool in_character_class = false;
1242 
1243   // Previous token is either '/' or '/=', in the second case, the
1244   // pattern starts at =.
1245   next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
1246   next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
1247 
1248   // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
1249   // the scanner should pass uninterpreted bodies to the RegExp
1250   // constructor.
1251   StartLiteral();
1252   if (seen_equal)
1253     AddChar('=');
1254 
1255   while (c0_ != '/' || in_character_class) {
1256     if (kIsLineTerminator.get(c0_) || c0_ < 0)
1257       return false;
1258     if (c0_ == '\\') {  // escaped character
1259       AddCharAdvance();
1260       if (kIsLineTerminator.get(c0_) || c0_ < 0)
1261         return false;
1262       AddCharAdvance();
1263     } else {  // unescaped character
1264       if (c0_ == '[')
1265         in_character_class = true;
1266       if (c0_ == ']')
1267         in_character_class = false;
1268       AddCharAdvance();
1269     }
1270   }
1271   Advance();  // consume '/'
1272 
1273   TerminateLiteral();
1274 
1275   return true;
1276 }
1277 
ScanRegExpFlags()1278 bool Scanner::ScanRegExpFlags() {
1279   // Scan regular expression flags.
1280   StartLiteral();
1281   while (kIsIdentifierPart.get(c0_)) {
1282     if (c0_ == '\\') {
1283       uc32 c = ScanIdentifierUnicodeEscape();
1284       if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) {
1285         // We allow any escaped character, unlike the restriction on
1286         // IdentifierPart when it is used to build an IdentifierName.
1287         AddChar(c);
1288         continue;
1289       }
1290     }
1291     AddCharAdvance();
1292   }
1293   TerminateLiteral();
1294 
1295   next_.location.end_pos = source_pos() - 1;
1296   return true;
1297 }
1298 
1299 } }  // namespace v8::internal
1300