• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are
4 // met:
5 //
6 //     * Redistributions of source code must retain the above copyright
7 //       notice, this list of conditions and the following disclaimer.
8 //     * Redistributions in binary form must reproduce the above
9 //       copyright notice, this list of conditions and the following
10 //       disclaimer in the documentation and/or other materials provided
11 //       with the distribution.
12 //     * Neither the name of Google Inc. nor the names of its
13 //       contributors may be used to endorse or promote products derived
14 //       from this software without specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 
28 // Features shared by parsing and pre-parsing scanners.
29 
30 #include "../include/v8stdint.h"
31 #include "scanner-base.h"
32 #include "char-predicates-inl.h"
33 
34 namespace v8 {
35 namespace internal {
36 
37 // ----------------------------------------------------------------------------
38 // Scanner
39 
Scanner(UnicodeCache * unicode_cache)40 Scanner::Scanner(UnicodeCache* unicode_cache)
41     : unicode_cache_(unicode_cache),
42       octal_pos_(kNoOctalLocation) { }
43 
44 
ScanHexEscape(uc32 c,int length)45 uc32 Scanner::ScanHexEscape(uc32 c, int length) {
46   ASSERT(length <= 4);  // prevent overflow
47 
48   uc32 digits[4];
49   uc32 x = 0;
50   for (int i = 0; i < length; i++) {
51     digits[i] = c0_;
52     int d = HexValue(c0_);
53     if (d < 0) {
54       // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
55       // should be illegal, but other JS VMs just return the
56       // non-escaped version of the original character.
57 
58       // Push back digits read, except the last one (in c0_).
59       for (int j = i-1; j >= 0; j--) {
60         PushBack(digits[j]);
61       }
62       // Notice: No handling of error - treat it as "\u"->"u".
63       return c;
64     }
65     x = x * 16 + d;
66     Advance();
67   }
68 
69   return x;
70 }
71 
72 
73 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of
74 // ECMA-262. Other JS VMs support them.
ScanOctalEscape(uc32 c,int length)75 uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
76   uc32 x = c - '0';
77   int i = 0;
78   for (; i < length; i++) {
79     int d = c0_ - '0';
80     if (d < 0 || d > 7) break;
81     int nx = x * 8 + d;
82     if (nx >= 256) break;
83     x = nx;
84     Advance();
85   }
86   // Anything excelt '\0' is an octal escape sequence, illegal in strict mode.
87   // Remember the position of octal escape sequences so that better error
88   // can be reported later (in strict mode).
89   if (c != '0' || i > 0) {
90     octal_pos_ = source_pos() - i - 1;     // Already advanced
91   }
92   return x;
93 }
94 
95 
96 // ----------------------------------------------------------------------------
97 // JavaScriptScanner
98 
JavaScriptScanner(UnicodeCache * scanner_contants)99 JavaScriptScanner::JavaScriptScanner(UnicodeCache* scanner_contants)
100     : Scanner(scanner_contants) { }
101 
102 
Next()103 Token::Value JavaScriptScanner::Next() {
104   current_ = next_;
105   has_line_terminator_before_next_ = false;
106   Scan();
107   return current_.token;
108 }
109 
110 
IsByteOrderMark(uc32 c)111 static inline bool IsByteOrderMark(uc32 c) {
112   // The Unicode value U+FFFE is guaranteed never to be assigned as a
113   // Unicode character; this implies that in a Unicode context the
114   // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
115   // character expressed in little-endian byte order (since it could
116   // not be a U+FFFE character expressed in big-endian byte
117   // order). Nevertheless, we check for it to be compatible with
118   // Spidermonkey.
119   return c == 0xFEFF || c == 0xFFFE;
120 }
121 
122 
SkipWhiteSpace()123 bool JavaScriptScanner::SkipWhiteSpace() {
124   int start_position = source_pos();
125 
126   while (true) {
127     // We treat byte-order marks (BOMs) as whitespace for better
128     // compatibility with Spidermonkey and other JavaScript engines.
129     while (unicode_cache_->IsWhiteSpace(c0_) || IsByteOrderMark(c0_)) {
130       // IsWhiteSpace() includes line terminators!
131       if (unicode_cache_->IsLineTerminator(c0_)) {
132         // Ignore line terminators, but remember them. This is necessary
133         // for automatic semicolon insertion.
134         has_line_terminator_before_next_ = true;
135       }
136       Advance();
137     }
138 
139     // If there is an HTML comment end '-->' at the beginning of a
140     // line (with only whitespace in front of it), we treat the rest
141     // of the line as a comment. This is in line with the way
142     // SpiderMonkey handles it.
143     if (c0_ == '-' && has_line_terminator_before_next_) {
144       Advance();
145       if (c0_ == '-') {
146         Advance();
147         if (c0_ == '>') {
148           // Treat the rest of the line as a comment.
149           SkipSingleLineComment();
150           // Continue skipping white space after the comment.
151           continue;
152         }
153         PushBack('-');  // undo Advance()
154       }
155       PushBack('-');  // undo Advance()
156     }
157     // Return whether or not we skipped any characters.
158     return source_pos() != start_position;
159   }
160 }
161 
162 
SkipSingleLineComment()163 Token::Value JavaScriptScanner::SkipSingleLineComment() {
164   Advance();
165 
166   // The line terminator at the end of the line is not considered
167   // to be part of the single-line comment; it is recognized
168   // separately by the lexical grammar and becomes part of the
169   // stream of input elements for the syntactic grammar (see
170   // ECMA-262, section 7.4, page 12).
171   while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
172     Advance();
173   }
174 
175   return Token::WHITESPACE;
176 }
177 
178 
SkipMultiLineComment()179 Token::Value JavaScriptScanner::SkipMultiLineComment() {
180   ASSERT(c0_ == '*');
181   Advance();
182 
183   while (c0_ >= 0) {
184     char ch = c0_;
185     Advance();
186     // If we have reached the end of the multi-line comment, we
187     // consume the '/' and insert a whitespace. This way all
188     // multi-line comments are treated as whitespace - even the ones
189     // containing line terminators. This contradicts ECMA-262, section
190     // 7.4, page 12, that says that multi-line comments containing
191     // line terminators should be treated as a line terminator, but it
192     // matches the behaviour of SpiderMonkey and KJS.
193     if (ch == '*' && c0_ == '/') {
194       c0_ = ' ';
195       return Token::WHITESPACE;
196     }
197   }
198 
199   // Unterminated multi-line comment.
200   return Token::ILLEGAL;
201 }
202 
203 
ScanHtmlComment()204 Token::Value JavaScriptScanner::ScanHtmlComment() {
205   // Check for <!-- comments.
206   ASSERT(c0_ == '!');
207   Advance();
208   if (c0_ == '-') {
209     Advance();
210     if (c0_ == '-') return SkipSingleLineComment();
211     PushBack('-');  // undo Advance()
212   }
213   PushBack('!');  // undo Advance()
214   ASSERT(c0_ == '!');
215   return Token::LT;
216 }
217 
218 
Scan()219 void JavaScriptScanner::Scan() {
220   next_.literal_chars = NULL;
221   Token::Value token;
222   do {
223     // Remember the position of the next token
224     next_.location.beg_pos = source_pos();
225 
226     switch (c0_) {
227       case ' ':
228       case '\t':
229         Advance();
230         token = Token::WHITESPACE;
231         break;
232 
233       case '\n':
234         Advance();
235         has_line_terminator_before_next_ = true;
236         token = Token::WHITESPACE;
237         break;
238 
239       case '"': case '\'':
240         token = ScanString();
241         break;
242 
243       case '<':
244         // < <= << <<= <!--
245         Advance();
246         if (c0_ == '=') {
247           token = Select(Token::LTE);
248         } else if (c0_ == '<') {
249           token = Select('=', Token::ASSIGN_SHL, Token::SHL);
250         } else if (c0_ == '!') {
251           token = ScanHtmlComment();
252         } else {
253           token = Token::LT;
254         }
255         break;
256 
257       case '>':
258         // > >= >> >>= >>> >>>=
259         Advance();
260         if (c0_ == '=') {
261           token = Select(Token::GTE);
262         } else if (c0_ == '>') {
263           // >> >>= >>> >>>=
264           Advance();
265           if (c0_ == '=') {
266             token = Select(Token::ASSIGN_SAR);
267           } else if (c0_ == '>') {
268             token = Select('=', Token::ASSIGN_SHR, Token::SHR);
269           } else {
270             token = Token::SAR;
271           }
272         } else {
273           token = Token::GT;
274         }
275         break;
276 
277       case '=':
278         // = == ===
279         Advance();
280         if (c0_ == '=') {
281           token = Select('=', Token::EQ_STRICT, Token::EQ);
282         } else {
283           token = Token::ASSIGN;
284         }
285         break;
286 
287       case '!':
288         // ! != !==
289         Advance();
290         if (c0_ == '=') {
291           token = Select('=', Token::NE_STRICT, Token::NE);
292         } else {
293           token = Token::NOT;
294         }
295         break;
296 
297       case '+':
298         // + ++ +=
299         Advance();
300         if (c0_ == '+') {
301           token = Select(Token::INC);
302         } else if (c0_ == '=') {
303           token = Select(Token::ASSIGN_ADD);
304         } else {
305           token = Token::ADD;
306         }
307         break;
308 
309       case '-':
310         // - -- --> -=
311         Advance();
312         if (c0_ == '-') {
313           Advance();
314           if (c0_ == '>' && has_line_terminator_before_next_) {
315             // For compatibility with SpiderMonkey, we skip lines that
316             // start with an HTML comment end '-->'.
317             token = SkipSingleLineComment();
318           } else {
319             token = Token::DEC;
320           }
321         } else if (c0_ == '=') {
322           token = Select(Token::ASSIGN_SUB);
323         } else {
324           token = Token::SUB;
325         }
326         break;
327 
328       case '*':
329         // * *=
330         token = Select('=', Token::ASSIGN_MUL, Token::MUL);
331         break;
332 
333       case '%':
334         // % %=
335         token = Select('=', Token::ASSIGN_MOD, Token::MOD);
336         break;
337 
338       case '/':
339         // /  // /* /=
340         Advance();
341         if (c0_ == '/') {
342           token = SkipSingleLineComment();
343         } else if (c0_ == '*') {
344           token = SkipMultiLineComment();
345         } else if (c0_ == '=') {
346           token = Select(Token::ASSIGN_DIV);
347         } else {
348           token = Token::DIV;
349         }
350         break;
351 
352       case '&':
353         // & && &=
354         Advance();
355         if (c0_ == '&') {
356           token = Select(Token::AND);
357         } else if (c0_ == '=') {
358           token = Select(Token::ASSIGN_BIT_AND);
359         } else {
360           token = Token::BIT_AND;
361         }
362         break;
363 
364       case '|':
365         // | || |=
366         Advance();
367         if (c0_ == '|') {
368           token = Select(Token::OR);
369         } else if (c0_ == '=') {
370           token = Select(Token::ASSIGN_BIT_OR);
371         } else {
372           token = Token::BIT_OR;
373         }
374         break;
375 
376       case '^':
377         // ^ ^=
378         token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
379         break;
380 
381       case '.':
382         // . Number
383         Advance();
384         if (IsDecimalDigit(c0_)) {
385           token = ScanNumber(true);
386         } else {
387           token = Token::PERIOD;
388         }
389         break;
390 
391       case ':':
392         token = Select(Token::COLON);
393         break;
394 
395       case ';':
396         token = Select(Token::SEMICOLON);
397         break;
398 
399       case ',':
400         token = Select(Token::COMMA);
401         break;
402 
403       case '(':
404         token = Select(Token::LPAREN);
405         break;
406 
407       case ')':
408         token = Select(Token::RPAREN);
409         break;
410 
411       case '[':
412         token = Select(Token::LBRACK);
413         break;
414 
415       case ']':
416         token = Select(Token::RBRACK);
417         break;
418 
419       case '{':
420         token = Select(Token::LBRACE);
421         break;
422 
423       case '}':
424         token = Select(Token::RBRACE);
425         break;
426 
427       case '?':
428         token = Select(Token::CONDITIONAL);
429         break;
430 
431       case '~':
432         token = Select(Token::BIT_NOT);
433         break;
434 
435       default:
436         if (unicode_cache_->IsIdentifierStart(c0_)) {
437           token = ScanIdentifierOrKeyword();
438         } else if (IsDecimalDigit(c0_)) {
439           token = ScanNumber(false);
440         } else if (SkipWhiteSpace()) {
441           token = Token::WHITESPACE;
442         } else if (c0_ < 0) {
443           token = Token::EOS;
444         } else {
445           token = Select(Token::ILLEGAL);
446         }
447         break;
448     }
449 
450     // Continue scanning for tokens as long as we're just skipping
451     // whitespace.
452   } while (token == Token::WHITESPACE);
453 
454   next_.location.end_pos = source_pos();
455   next_.token = token;
456 }
457 
458 
SeekForward(int pos)459 void JavaScriptScanner::SeekForward(int pos) {
460   // After this call, we will have the token at the given position as
461   // the "next" token. The "current" token will be invalid.
462   if (pos == next_.location.beg_pos) return;
463   int current_pos = source_pos();
464   ASSERT_EQ(next_.location.end_pos, current_pos);
465   // Positions inside the lookahead token aren't supported.
466   ASSERT(pos >= current_pos);
467   if (pos != current_pos) {
468     source_->SeekForward(pos - source_->pos());
469     Advance();
470     // This function is only called to seek to the location
471     // of the end of a function (at the "}" token). It doesn't matter
472     // whether there was a line terminator in the part we skip.
473     has_line_terminator_before_next_ = false;
474   }
475   Scan();
476 }
477 
478 
ScanEscape()479 void JavaScriptScanner::ScanEscape() {
480   uc32 c = c0_;
481   Advance();
482 
483   // Skip escaped newlines.
484   if (unicode_cache_->IsLineTerminator(c)) {
485     // Allow CR+LF newlines in multiline string literals.
486     if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
487     // Allow LF+CR newlines in multiline string literals.
488     if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
489     return;
490   }
491 
492   switch (c) {
493     case '\'':  // fall through
494     case '"' :  // fall through
495     case '\\': break;
496     case 'b' : c = '\b'; break;
497     case 'f' : c = '\f'; break;
498     case 'n' : c = '\n'; break;
499     case 'r' : c = '\r'; break;
500     case 't' : c = '\t'; break;
501     case 'u' : c = ScanHexEscape(c, 4); break;
502     case 'v' : c = '\v'; break;
503     case 'x' : c = ScanHexEscape(c, 2); break;
504     case '0' :  // fall through
505     case '1' :  // fall through
506     case '2' :  // fall through
507     case '3' :  // fall through
508     case '4' :  // fall through
509     case '5' :  // fall through
510     case '6' :  // fall through
511     case '7' : c = ScanOctalEscape(c, 2); break;
512   }
513 
514   // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these
515   // should be illegal, but they are commonly handled
516   // as non-escaped characters by JS VMs.
517   AddLiteralChar(c);
518 }
519 
520 
ScanString()521 Token::Value JavaScriptScanner::ScanString() {
522   uc32 quote = c0_;
523   Advance();  // consume quote
524 
525   LiteralScope literal(this);
526   while (c0_ != quote && c0_ >= 0
527          && !unicode_cache_->IsLineTerminator(c0_)) {
528     uc32 c = c0_;
529     Advance();
530     if (c == '\\') {
531       if (c0_ < 0) return Token::ILLEGAL;
532       ScanEscape();
533     } else {
534       AddLiteralChar(c);
535     }
536   }
537   if (c0_ != quote) return Token::ILLEGAL;
538   literal.Complete();
539 
540   Advance();  // consume quote
541   return Token::STRING;
542 }
543 
544 
ScanDecimalDigits()545 void JavaScriptScanner::ScanDecimalDigits() {
546   while (IsDecimalDigit(c0_))
547     AddLiteralCharAdvance();
548 }
549 
550 
ScanNumber(bool seen_period)551 Token::Value JavaScriptScanner::ScanNumber(bool seen_period) {
552   ASSERT(IsDecimalDigit(c0_));  // the first digit of the number or the fraction
553 
554   enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;
555 
556   LiteralScope literal(this);
557   if (seen_period) {
558     // we have already seen a decimal point of the float
559     AddLiteralChar('.');
560     ScanDecimalDigits();  // we know we have at least one digit
561 
562   } else {
563     // if the first character is '0' we must check for octals and hex
564     if (c0_ == '0') {
565       AddLiteralCharAdvance();
566 
567       // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number
568       if (c0_ == 'x' || c0_ == 'X') {
569         // hex number
570         kind = HEX;
571         AddLiteralCharAdvance();
572         if (!IsHexDigit(c0_)) {
573           // we must have at least one hex digit after 'x'/'X'
574           return Token::ILLEGAL;
575         }
576         while (IsHexDigit(c0_)) {
577           AddLiteralCharAdvance();
578         }
579       } else if ('0' <= c0_ && c0_ <= '7') {
580         // (possible) octal number
581         kind = OCTAL;
582         while (true) {
583           if (c0_ == '8' || c0_ == '9') {
584             kind = DECIMAL;
585             break;
586           }
587           if (c0_  < '0' || '7'  < c0_) {
588             // Octal literal finished.
589             octal_pos_ = next_.location.beg_pos;
590             break;
591           }
592           AddLiteralCharAdvance();
593         }
594       }
595     }
596 
597     // Parse decimal digits and allow trailing fractional part.
598     if (kind == DECIMAL) {
599       ScanDecimalDigits();  // optional
600       if (c0_ == '.') {
601         AddLiteralCharAdvance();
602         ScanDecimalDigits();  // optional
603       }
604     }
605   }
606 
607   // scan exponent, if any
608   if (c0_ == 'e' || c0_ == 'E') {
609     ASSERT(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
610     if (kind == OCTAL) return Token::ILLEGAL;  // no exponent for octals allowed
611     // scan exponent
612     AddLiteralCharAdvance();
613     if (c0_ == '+' || c0_ == '-')
614       AddLiteralCharAdvance();
615     if (!IsDecimalDigit(c0_)) {
616       // we must have at least one decimal digit after 'e'/'E'
617       return Token::ILLEGAL;
618     }
619     ScanDecimalDigits();
620   }
621 
622   // The source character immediately following a numeric literal must
623   // not be an identifier start or a decimal digit; see ECMA-262
624   // section 7.8.3, page 17 (note that we read only one decimal digit
625   // if the value is 0).
626   if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_))
627     return Token::ILLEGAL;
628 
629   literal.Complete();
630 
631   return Token::NUMBER;
632 }
633 
634 
ScanIdentifierUnicodeEscape()635 uc32 JavaScriptScanner::ScanIdentifierUnicodeEscape() {
636   Advance();
637   if (c0_ != 'u') return unibrow::Utf8::kBadChar;
638   Advance();
639   uc32 c = ScanHexEscape('u', 4);
640   // We do not allow a unicode escape sequence to start another
641   // unicode escape sequence.
642   if (c == '\\') return unibrow::Utf8::kBadChar;
643   return c;
644 }
645 
646 
ScanIdentifierOrKeyword()647 Token::Value JavaScriptScanner::ScanIdentifierOrKeyword() {
648   ASSERT(unicode_cache_->IsIdentifierStart(c0_));
649   LiteralScope literal(this);
650   KeywordMatcher keyword_match;
651   // Scan identifier start character.
652   if (c0_ == '\\') {
653     uc32 c = ScanIdentifierUnicodeEscape();
654     // Only allow legal identifier start characters.
655     if (!unicode_cache_->IsIdentifierStart(c)) return Token::ILLEGAL;
656     AddLiteralChar(c);
657     return ScanIdentifierSuffix(&literal);
658   }
659 
660   uc32 first_char = c0_;
661   Advance();
662   AddLiteralChar(first_char);
663   if (!keyword_match.AddChar(first_char)) {
664     return ScanIdentifierSuffix(&literal);
665   }
666 
667   // Scan the rest of the identifier characters.
668   while (unicode_cache_->IsIdentifierPart(c0_)) {
669     if (c0_ != '\\') {
670       uc32 next_char = c0_;
671       Advance();
672       AddLiteralChar(next_char);
673       if (keyword_match.AddChar(next_char)) continue;
674     }
675     // Fallthrough if no loner able to complete keyword.
676     return ScanIdentifierSuffix(&literal);
677   }
678   literal.Complete();
679 
680   return keyword_match.token();
681 }
682 
683 
ScanIdentifierSuffix(LiteralScope * literal)684 Token::Value JavaScriptScanner::ScanIdentifierSuffix(LiteralScope* literal) {
685   // Scan the rest of the identifier characters.
686   while (unicode_cache_->IsIdentifierPart(c0_)) {
687     if (c0_ == '\\') {
688       uc32 c = ScanIdentifierUnicodeEscape();
689       // Only allow legal identifier part characters.
690       if (!unicode_cache_->IsIdentifierPart(c)) return Token::ILLEGAL;
691       AddLiteralChar(c);
692     } else {
693       AddLiteralChar(c0_);
694       Advance();
695     }
696   }
697   literal->Complete();
698 
699   return Token::IDENTIFIER;
700 }
701 
702 
ScanRegExpPattern(bool seen_equal)703 bool JavaScriptScanner::ScanRegExpPattern(bool seen_equal) {
704   // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
705   bool in_character_class = false;
706 
707   // Previous token is either '/' or '/=', in the second case, the
708   // pattern starts at =.
709   next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
710   next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
711 
712   // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
713   // the scanner should pass uninterpreted bodies to the RegExp
714   // constructor.
715   LiteralScope literal(this);
716   if (seen_equal)
717     AddLiteralChar('=');
718 
719   while (c0_ != '/' || in_character_class) {
720     if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
721     if (c0_ == '\\') {  // Escape sequence.
722       AddLiteralCharAdvance();
723       if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
724       AddLiteralCharAdvance();
725       // If the escape allows more characters, i.e., \x??, \u????, or \c?,
726       // only "safe" characters are allowed (letters, digits, underscore),
727       // otherwise the escape isn't valid and the invalid character has
728       // its normal meaning. I.e., we can just continue scanning without
729       // worrying whether the following characters are part of the escape
730       // or not, since any '/', '\\' or '[' is guaranteed to not be part
731       // of the escape sequence.
732     } else {  // Unescaped character.
733       if (c0_ == '[') in_character_class = true;
734       if (c0_ == ']') in_character_class = false;
735       AddLiteralCharAdvance();
736     }
737   }
738   Advance();  // consume '/'
739 
740   literal.Complete();
741 
742   return true;
743 }
744 
745 
ScanRegExpFlags()746 bool JavaScriptScanner::ScanRegExpFlags() {
747   // Scan regular expression flags.
748   LiteralScope literal(this);
749   while (unicode_cache_->IsIdentifierPart(c0_)) {
750     if (c0_ == '\\') {
751       uc32 c = ScanIdentifierUnicodeEscape();
752       if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) {
753         // We allow any escaped character, unlike the restriction on
754         // IdentifierPart when it is used to build an IdentifierName.
755         AddLiteralChar(c);
756         continue;
757       }
758     }
759     AddLiteralCharAdvance();
760   }
761   literal.Complete();
762 
763   next_.location.end_pos = source_pos() - 1;
764   return true;
765 }
766 
767 // ----------------------------------------------------------------------------
768 // Keyword Matcher
769 
// Table consulted by KeywordMatcher::Step() for the first character of an
// identifier; it is indexed by (input - kFirstCharRangeMin). Each entry is
// { keyword, state, token }. Entries with state KEYWORD_PREFIX name the
// single keyword to be matched from that letter, together with its token;
// the other entries only select the next matcher state. The trailing
// letters below assume the range 'b'..'y' stated in Step().
770 KeywordMatcher::FirstState KeywordMatcher::first_states_[] = {
771   { "break",  KEYWORD_PREFIX, Token::BREAK },  // 'b'
772   { NULL,     C,              Token::ILLEGAL },  // 'c'
773   { NULL,     D,              Token::ILLEGAL },  // 'd'
774   { NULL,     E,              Token::ILLEGAL },  // 'e'
775   { NULL,     F,              Token::ILLEGAL },  // 'f'
776   { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // 'g'
777   { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // 'h'
778   { NULL,     I,              Token::ILLEGAL },  // 'i'
779   { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // 'j'
780   { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // 'k'
781   { "let",    KEYWORD_PREFIX, Token::FUTURE_RESERVED_WORD },  // 'l'
782   { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // 'm'
783   { NULL,     N,              Token::ILLEGAL },  // 'n'
784   { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // 'o'
785   { NULL,     P,              Token::ILLEGAL },  // 'p'
786   { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // 'q'
787   { "return", KEYWORD_PREFIX, Token::RETURN },  // 'r'
788   { NULL,     S,              Token::ILLEGAL },  // 's'
789   { NULL,     T,              Token::ILLEGAL },  // 't'
790   { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // 'u'
791   { NULL,     V,              Token::ILLEGAL },  // 'v'
792   { NULL,     W,              Token::ILLEGAL },  // 'w'
793   { NULL,     UNMATCHABLE,    Token::ILLEGAL },  // 'x'
794   { "yield",  KEYWORD_PREFIX, Token::FUTURE_RESERVED_WORD }  // 'y'
795 };
796 
797 
Step(unibrow::uchar input)798 void KeywordMatcher::Step(unibrow::uchar input) {
799   switch (state_) {
800     case INITIAL: {
801       // matching the first character is the only state with significant fanout.
802       // Match only lower-case letters in range 'b'..'y'.
803       unsigned int offset = input - kFirstCharRangeMin;
804       if (offset < kFirstCharRangeLength) {
805         state_ = first_states_[offset].state;
806         if (state_ == KEYWORD_PREFIX) {
807           keyword_ = first_states_[offset].keyword;
808           counter_ = 1;
809           keyword_token_ = first_states_[offset].token;
810         }
811         return;
812       }
813       break;
814     }
815     case KEYWORD_PREFIX:
816       if (static_cast<unibrow::uchar>(keyword_[counter_]) == input) {
817         counter_++;
818         if (keyword_[counter_] == '\0') {
819           state_ = KEYWORD_MATCHED;
820           token_ = keyword_token_;
821         }
822         return;
823       }
824       break;
825     case KEYWORD_MATCHED:
826       token_ = Token::IDENTIFIER;
827       break;
828     case C:
829       if (MatchState(input, 'a', CA)) return;
830       if (MatchKeywordStart(input, "class", 1,
831           Token::FUTURE_RESERVED_WORD)) return;
832       if (MatchState(input, 'o', CO)) return;
833       break;
834     case CA:
835       if (MatchKeywordStart(input, "case", 2, Token::CASE)) return;
836       if (MatchKeywordStart(input, "catch", 2, Token::CATCH)) return;
837       break;
838     case CO:
839       if (MatchState(input, 'n', CON)) return;
840       break;
841     case CON:
842       if (MatchKeywordStart(input, "const", 3, Token::CONST)) return;
843       if (MatchKeywordStart(input, "continue", 3, Token::CONTINUE)) return;
844       break;
845     case D:
846       if (MatchState(input, 'e', DE)) return;
847       if (MatchKeyword(input, 'o', KEYWORD_MATCHED, Token::DO)) return;
848       break;
849     case DE:
850       if (MatchKeywordStart(input, "debugger", 2, Token::DEBUGGER)) return;
851       if (MatchKeywordStart(input, "default", 2, Token::DEFAULT)) return;
852       if (MatchKeywordStart(input, "delete", 2, Token::DELETE)) return;
853       break;
854     case E:
855       if (MatchKeywordStart(input, "else", 1, Token::ELSE)) return;
856       if (MatchKeywordStart(input, "enum", 1,
857           Token::FUTURE_RESERVED_WORD)) return;
858       if (MatchState(input, 'x', EX)) return;
859       break;
860     case EX:
861       if (MatchKeywordStart(input, "export", 2,
862           Token::FUTURE_RESERVED_WORD)) return;
863       if (MatchKeywordStart(input, "extends", 2,
864           Token::FUTURE_RESERVED_WORD)) return;
865       break;
866     case F:
867       if (MatchKeywordStart(input, "false", 1, Token::FALSE_LITERAL)) return;
868       if (MatchKeywordStart(input, "finally", 1, Token::FINALLY)) return;
869       if (MatchKeywordStart(input, "for", 1, Token::FOR)) return;
870       if (MatchKeywordStart(input, "function", 1, Token::FUNCTION)) return;
871       break;
872     case I:
873       if (MatchKeyword(input, 'f', KEYWORD_MATCHED, Token::IF)) return;
874       if (MatchState(input, 'm', IM)) return;
875       if (MatchKeyword(input, 'n', IN, Token::IN)) return;
876       break;
877     case IM:
878       if (MatchState(input, 'p', IMP)) return;
879       break;
880     case IMP:
881       if (MatchKeywordStart(input, "implements", 3,
882          Token::FUTURE_RESERVED_WORD )) return;
883       if (MatchKeywordStart(input, "import", 3,
884          Token::FUTURE_RESERVED_WORD)) return;
885       break;
886     case IN:
887       token_ = Token::IDENTIFIER;
888       if (MatchKeywordStart(input, "interface", 2,
889          Token::FUTURE_RESERVED_WORD)) return;
890       if (MatchKeywordStart(input, "instanceof", 2, Token::INSTANCEOF)) return;
891       break;
892     case N:
893       if (MatchKeywordStart(input, "native", 1, Token::NATIVE)) return;
894       if (MatchKeywordStart(input, "new", 1, Token::NEW)) return;
895       if (MatchKeywordStart(input, "null", 1, Token::NULL_LITERAL)) return;
896       break;
897     case P:
898       if (MatchKeywordStart(input, "package", 1,
899           Token::FUTURE_RESERVED_WORD)) return;
900       if (MatchState(input, 'r', PR)) return;
901       if (MatchKeywordStart(input, "public", 1,
902           Token::FUTURE_RESERVED_WORD)) return;
903       break;
904     case PR:
905       if (MatchKeywordStart(input, "private", 2,
906           Token::FUTURE_RESERVED_WORD)) return;
907       if (MatchKeywordStart(input, "protected", 2,
908           Token::FUTURE_RESERVED_WORD)) return;
909       break;
910     case S:
911       if (MatchKeywordStart(input, "static", 1,
912           Token::FUTURE_RESERVED_WORD)) return;
913       if (MatchKeywordStart(input, "super", 1,
914           Token::FUTURE_RESERVED_WORD)) return;
915       if (MatchKeywordStart(input, "switch", 1,
916           Token::SWITCH)) return;
917       break;
918     case T:
919       if (MatchState(input, 'h', TH)) return;
920       if (MatchState(input, 'r', TR)) return;
921       if (MatchKeywordStart(input, "typeof", 1, Token::TYPEOF)) return;
922       break;
923     case TH:
924       if (MatchKeywordStart(input, "this", 2, Token::THIS)) return;
925       if (MatchKeywordStart(input, "throw", 2, Token::THROW)) return;
926       break;
927     case TR:
928       if (MatchKeywordStart(input, "true", 2, Token::TRUE_LITERAL)) return;
929       if (MatchKeyword(input, 'y', KEYWORD_MATCHED, Token::TRY)) return;
930       break;
931     case V:
932       if (MatchKeywordStart(input, "var", 1, Token::VAR)) return;
933       if (MatchKeywordStart(input, "void", 1, Token::VOID)) return;
934       break;
935     case W:
936       if (MatchKeywordStart(input, "while", 1, Token::WHILE)) return;
937       if (MatchKeywordStart(input, "with", 1, Token::WITH)) return;
938       break;
939     case UNMATCHABLE:
940       break;
941   }
942   // On fallthrough, it's a failure.
943   state_ = UNMATCHABLE;
944 }
945 
946 } }  // namespace v8::internal
947