1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are
4 // met:
5 //
6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided
11 // with the distribution.
12 // * Neither the name of Google Inc. nor the names of its
13 // contributors may be used to endorse or promote products derived
14 // from this software without specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28 // Features shared by parsing and pre-parsing scanners.
29
30 #include "../include/v8stdint.h"
31 #include "scanner-base.h"
32 #include "char-predicates-inl.h"
33
34 namespace v8 {
35 namespace internal {
36
37 // ----------------------------------------------------------------------------
38 // Scanner
39
Scanner(UnicodeCache * unicode_cache)40 Scanner::Scanner(UnicodeCache* unicode_cache)
41 : unicode_cache_(unicode_cache),
42 octal_pos_(kNoOctalLocation) { }
43
44
ScanHexEscape(uc32 c,int length)45 uc32 Scanner::ScanHexEscape(uc32 c, int length) {
46 ASSERT(length <= 4); // prevent overflow
47
48 uc32 digits[4];
49 uc32 x = 0;
50 for (int i = 0; i < length; i++) {
51 digits[i] = c0_;
52 int d = HexValue(c0_);
53 if (d < 0) {
54 // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
55 // should be illegal, but other JS VMs just return the
56 // non-escaped version of the original character.
57
58 // Push back digits read, except the last one (in c0_).
59 for (int j = i-1; j >= 0; j--) {
60 PushBack(digits[j]);
61 }
62 // Notice: No handling of error - treat it as "\u"->"u".
63 return c;
64 }
65 x = x * 16 + d;
66 Advance();
67 }
68
69 return x;
70 }
71
72
73 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of
74 // ECMA-262. Other JS VMs support them.
ScanOctalEscape(uc32 c,int length)75 uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
76 uc32 x = c - '0';
77 int i = 0;
78 for (; i < length; i++) {
79 int d = c0_ - '0';
80 if (d < 0 || d > 7) break;
81 int nx = x * 8 + d;
82 if (nx >= 256) break;
83 x = nx;
84 Advance();
85 }
86 // Anything excelt '\0' is an octal escape sequence, illegal in strict mode.
87 // Remember the position of octal escape sequences so that better error
88 // can be reported later (in strict mode).
89 if (c != '0' || i > 0) {
90 octal_pos_ = source_pos() - i - 1; // Already advanced
91 }
92 return x;
93 }
94
95
96 // ----------------------------------------------------------------------------
97 // JavaScriptScanner
98
JavaScriptScanner(UnicodeCache * scanner_contants)99 JavaScriptScanner::JavaScriptScanner(UnicodeCache* scanner_contants)
100 : Scanner(scanner_contants) { }
101
102
Next()103 Token::Value JavaScriptScanner::Next() {
104 current_ = next_;
105 has_line_terminator_before_next_ = false;
106 Scan();
107 return current_.token;
108 }
109
110
IsByteOrderMark(uc32 c)111 static inline bool IsByteOrderMark(uc32 c) {
112 // The Unicode value U+FFFE is guaranteed never to be assigned as a
113 // Unicode character; this implies that in a Unicode context the
114 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
115 // character expressed in little-endian byte order (since it could
116 // not be a U+FFFE character expressed in big-endian byte
117 // order). Nevertheless, we check for it to be compatible with
118 // Spidermonkey.
119 return c == 0xFEFF || c == 0xFFFE;
120 }
121
122
SkipWhiteSpace()123 bool JavaScriptScanner::SkipWhiteSpace() {
124 int start_position = source_pos();
125
126 while (true) {
127 // We treat byte-order marks (BOMs) as whitespace for better
128 // compatibility with Spidermonkey and other JavaScript engines.
129 while (unicode_cache_->IsWhiteSpace(c0_) || IsByteOrderMark(c0_)) {
130 // IsWhiteSpace() includes line terminators!
131 if (unicode_cache_->IsLineTerminator(c0_)) {
132 // Ignore line terminators, but remember them. This is necessary
133 // for automatic semicolon insertion.
134 has_line_terminator_before_next_ = true;
135 }
136 Advance();
137 }
138
139 // If there is an HTML comment end '-->' at the beginning of a
140 // line (with only whitespace in front of it), we treat the rest
141 // of the line as a comment. This is in line with the way
142 // SpiderMonkey handles it.
143 if (c0_ == '-' && has_line_terminator_before_next_) {
144 Advance();
145 if (c0_ == '-') {
146 Advance();
147 if (c0_ == '>') {
148 // Treat the rest of the line as a comment.
149 SkipSingleLineComment();
150 // Continue skipping white space after the comment.
151 continue;
152 }
153 PushBack('-'); // undo Advance()
154 }
155 PushBack('-'); // undo Advance()
156 }
157 // Return whether or not we skipped any characters.
158 return source_pos() != start_position;
159 }
160 }
161
162
SkipSingleLineComment()163 Token::Value JavaScriptScanner::SkipSingleLineComment() {
164 Advance();
165
166 // The line terminator at the end of the line is not considered
167 // to be part of the single-line comment; it is recognized
168 // separately by the lexical grammar and becomes part of the
169 // stream of input elements for the syntactic grammar (see
170 // ECMA-262, section 7.4, page 12).
171 while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
172 Advance();
173 }
174
175 return Token::WHITESPACE;
176 }
177
178
SkipMultiLineComment()179 Token::Value JavaScriptScanner::SkipMultiLineComment() {
180 ASSERT(c0_ == '*');
181 Advance();
182
183 while (c0_ >= 0) {
184 char ch = c0_;
185 Advance();
186 // If we have reached the end of the multi-line comment, we
187 // consume the '/' and insert a whitespace. This way all
188 // multi-line comments are treated as whitespace - even the ones
189 // containing line terminators. This contradicts ECMA-262, section
190 // 7.4, page 12, that says that multi-line comments containing
191 // line terminators should be treated as a line terminator, but it
192 // matches the behaviour of SpiderMonkey and KJS.
193 if (ch == '*' && c0_ == '/') {
194 c0_ = ' ';
195 return Token::WHITESPACE;
196 }
197 }
198
199 // Unterminated multi-line comment.
200 return Token::ILLEGAL;
201 }
202
203
ScanHtmlComment()204 Token::Value JavaScriptScanner::ScanHtmlComment() {
205 // Check for <!-- comments.
206 ASSERT(c0_ == '!');
207 Advance();
208 if (c0_ == '-') {
209 Advance();
210 if (c0_ == '-') return SkipSingleLineComment();
211 PushBack('-'); // undo Advance()
212 }
213 PushBack('!'); // undo Advance()
214 ASSERT(c0_ == '!');
215 return Token::LT;
216 }
217
218
Scan()219 void JavaScriptScanner::Scan() {
220 next_.literal_chars = NULL;
221 Token::Value token;
222 do {
223 // Remember the position of the next token
224 next_.location.beg_pos = source_pos();
225
226 switch (c0_) {
227 case ' ':
228 case '\t':
229 Advance();
230 token = Token::WHITESPACE;
231 break;
232
233 case '\n':
234 Advance();
235 has_line_terminator_before_next_ = true;
236 token = Token::WHITESPACE;
237 break;
238
239 case '"': case '\'':
240 token = ScanString();
241 break;
242
243 case '<':
244 // < <= << <<= <!--
245 Advance();
246 if (c0_ == '=') {
247 token = Select(Token::LTE);
248 } else if (c0_ == '<') {
249 token = Select('=', Token::ASSIGN_SHL, Token::SHL);
250 } else if (c0_ == '!') {
251 token = ScanHtmlComment();
252 } else {
253 token = Token::LT;
254 }
255 break;
256
257 case '>':
258 // > >= >> >>= >>> >>>=
259 Advance();
260 if (c0_ == '=') {
261 token = Select(Token::GTE);
262 } else if (c0_ == '>') {
263 // >> >>= >>> >>>=
264 Advance();
265 if (c0_ == '=') {
266 token = Select(Token::ASSIGN_SAR);
267 } else if (c0_ == '>') {
268 token = Select('=', Token::ASSIGN_SHR, Token::SHR);
269 } else {
270 token = Token::SAR;
271 }
272 } else {
273 token = Token::GT;
274 }
275 break;
276
277 case '=':
278 // = == ===
279 Advance();
280 if (c0_ == '=') {
281 token = Select('=', Token::EQ_STRICT, Token::EQ);
282 } else {
283 token = Token::ASSIGN;
284 }
285 break;
286
287 case '!':
288 // ! != !==
289 Advance();
290 if (c0_ == '=') {
291 token = Select('=', Token::NE_STRICT, Token::NE);
292 } else {
293 token = Token::NOT;
294 }
295 break;
296
297 case '+':
298 // + ++ +=
299 Advance();
300 if (c0_ == '+') {
301 token = Select(Token::INC);
302 } else if (c0_ == '=') {
303 token = Select(Token::ASSIGN_ADD);
304 } else {
305 token = Token::ADD;
306 }
307 break;
308
309 case '-':
310 // - -- --> -=
311 Advance();
312 if (c0_ == '-') {
313 Advance();
314 if (c0_ == '>' && has_line_terminator_before_next_) {
315 // For compatibility with SpiderMonkey, we skip lines that
316 // start with an HTML comment end '-->'.
317 token = SkipSingleLineComment();
318 } else {
319 token = Token::DEC;
320 }
321 } else if (c0_ == '=') {
322 token = Select(Token::ASSIGN_SUB);
323 } else {
324 token = Token::SUB;
325 }
326 break;
327
328 case '*':
329 // * *=
330 token = Select('=', Token::ASSIGN_MUL, Token::MUL);
331 break;
332
333 case '%':
334 // % %=
335 token = Select('=', Token::ASSIGN_MOD, Token::MOD);
336 break;
337
338 case '/':
339 // / // /* /=
340 Advance();
341 if (c0_ == '/') {
342 token = SkipSingleLineComment();
343 } else if (c0_ == '*') {
344 token = SkipMultiLineComment();
345 } else if (c0_ == '=') {
346 token = Select(Token::ASSIGN_DIV);
347 } else {
348 token = Token::DIV;
349 }
350 break;
351
352 case '&':
353 // & && &=
354 Advance();
355 if (c0_ == '&') {
356 token = Select(Token::AND);
357 } else if (c0_ == '=') {
358 token = Select(Token::ASSIGN_BIT_AND);
359 } else {
360 token = Token::BIT_AND;
361 }
362 break;
363
364 case '|':
365 // | || |=
366 Advance();
367 if (c0_ == '|') {
368 token = Select(Token::OR);
369 } else if (c0_ == '=') {
370 token = Select(Token::ASSIGN_BIT_OR);
371 } else {
372 token = Token::BIT_OR;
373 }
374 break;
375
376 case '^':
377 // ^ ^=
378 token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
379 break;
380
381 case '.':
382 // . Number
383 Advance();
384 if (IsDecimalDigit(c0_)) {
385 token = ScanNumber(true);
386 } else {
387 token = Token::PERIOD;
388 }
389 break;
390
391 case ':':
392 token = Select(Token::COLON);
393 break;
394
395 case ';':
396 token = Select(Token::SEMICOLON);
397 break;
398
399 case ',':
400 token = Select(Token::COMMA);
401 break;
402
403 case '(':
404 token = Select(Token::LPAREN);
405 break;
406
407 case ')':
408 token = Select(Token::RPAREN);
409 break;
410
411 case '[':
412 token = Select(Token::LBRACK);
413 break;
414
415 case ']':
416 token = Select(Token::RBRACK);
417 break;
418
419 case '{':
420 token = Select(Token::LBRACE);
421 break;
422
423 case '}':
424 token = Select(Token::RBRACE);
425 break;
426
427 case '?':
428 token = Select(Token::CONDITIONAL);
429 break;
430
431 case '~':
432 token = Select(Token::BIT_NOT);
433 break;
434
435 default:
436 if (unicode_cache_->IsIdentifierStart(c0_)) {
437 token = ScanIdentifierOrKeyword();
438 } else if (IsDecimalDigit(c0_)) {
439 token = ScanNumber(false);
440 } else if (SkipWhiteSpace()) {
441 token = Token::WHITESPACE;
442 } else if (c0_ < 0) {
443 token = Token::EOS;
444 } else {
445 token = Select(Token::ILLEGAL);
446 }
447 break;
448 }
449
450 // Continue scanning for tokens as long as we're just skipping
451 // whitespace.
452 } while (token == Token::WHITESPACE);
453
454 next_.location.end_pos = source_pos();
455 next_.token = token;
456 }
457
458
SeekForward(int pos)459 void JavaScriptScanner::SeekForward(int pos) {
460 // After this call, we will have the token at the given position as
461 // the "next" token. The "current" token will be invalid.
462 if (pos == next_.location.beg_pos) return;
463 int current_pos = source_pos();
464 ASSERT_EQ(next_.location.end_pos, current_pos);
465 // Positions inside the lookahead token aren't supported.
466 ASSERT(pos >= current_pos);
467 if (pos != current_pos) {
468 source_->SeekForward(pos - source_->pos());
469 Advance();
470 // This function is only called to seek to the location
471 // of the end of a function (at the "}" token). It doesn't matter
472 // whether there was a line terminator in the part we skip.
473 has_line_terminator_before_next_ = false;
474 }
475 Scan();
476 }
477
478
ScanEscape()479 void JavaScriptScanner::ScanEscape() {
480 uc32 c = c0_;
481 Advance();
482
483 // Skip escaped newlines.
484 if (unicode_cache_->IsLineTerminator(c)) {
485 // Allow CR+LF newlines in multiline string literals.
486 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
487 // Allow LF+CR newlines in multiline string literals.
488 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
489 return;
490 }
491
492 switch (c) {
493 case '\'': // fall through
494 case '"' : // fall through
495 case '\\': break;
496 case 'b' : c = '\b'; break;
497 case 'f' : c = '\f'; break;
498 case 'n' : c = '\n'; break;
499 case 'r' : c = '\r'; break;
500 case 't' : c = '\t'; break;
501 case 'u' : c = ScanHexEscape(c, 4); break;
502 case 'v' : c = '\v'; break;
503 case 'x' : c = ScanHexEscape(c, 2); break;
504 case '0' : // fall through
505 case '1' : // fall through
506 case '2' : // fall through
507 case '3' : // fall through
508 case '4' : // fall through
509 case '5' : // fall through
510 case '6' : // fall through
511 case '7' : c = ScanOctalEscape(c, 2); break;
512 }
513
514 // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these
515 // should be illegal, but they are commonly handled
516 // as non-escaped characters by JS VMs.
517 AddLiteralChar(c);
518 }
519
520
ScanString()521 Token::Value JavaScriptScanner::ScanString() {
522 uc32 quote = c0_;
523 Advance(); // consume quote
524
525 LiteralScope literal(this);
526 while (c0_ != quote && c0_ >= 0
527 && !unicode_cache_->IsLineTerminator(c0_)) {
528 uc32 c = c0_;
529 Advance();
530 if (c == '\\') {
531 if (c0_ < 0) return Token::ILLEGAL;
532 ScanEscape();
533 } else {
534 AddLiteralChar(c);
535 }
536 }
537 if (c0_ != quote) return Token::ILLEGAL;
538 literal.Complete();
539
540 Advance(); // consume quote
541 return Token::STRING;
542 }
543
544
ScanDecimalDigits()545 void JavaScriptScanner::ScanDecimalDigits() {
546 while (IsDecimalDigit(c0_))
547 AddLiteralCharAdvance();
548 }
549
550
ScanNumber(bool seen_period)551 Token::Value JavaScriptScanner::ScanNumber(bool seen_period) {
552 ASSERT(IsDecimalDigit(c0_)); // the first digit of the number or the fraction
553
554 enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;
555
556 LiteralScope literal(this);
557 if (seen_period) {
558 // we have already seen a decimal point of the float
559 AddLiteralChar('.');
560 ScanDecimalDigits(); // we know we have at least one digit
561
562 } else {
563 // if the first character is '0' we must check for octals and hex
564 if (c0_ == '0') {
565 AddLiteralCharAdvance();
566
567 // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number
568 if (c0_ == 'x' || c0_ == 'X') {
569 // hex number
570 kind = HEX;
571 AddLiteralCharAdvance();
572 if (!IsHexDigit(c0_)) {
573 // we must have at least one hex digit after 'x'/'X'
574 return Token::ILLEGAL;
575 }
576 while (IsHexDigit(c0_)) {
577 AddLiteralCharAdvance();
578 }
579 } else if ('0' <= c0_ && c0_ <= '7') {
580 // (possible) octal number
581 kind = OCTAL;
582 while (true) {
583 if (c0_ == '8' || c0_ == '9') {
584 kind = DECIMAL;
585 break;
586 }
587 if (c0_ < '0' || '7' < c0_) {
588 // Octal literal finished.
589 octal_pos_ = next_.location.beg_pos;
590 break;
591 }
592 AddLiteralCharAdvance();
593 }
594 }
595 }
596
597 // Parse decimal digits and allow trailing fractional part.
598 if (kind == DECIMAL) {
599 ScanDecimalDigits(); // optional
600 if (c0_ == '.') {
601 AddLiteralCharAdvance();
602 ScanDecimalDigits(); // optional
603 }
604 }
605 }
606
607 // scan exponent, if any
608 if (c0_ == 'e' || c0_ == 'E') {
609 ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number
610 if (kind == OCTAL) return Token::ILLEGAL; // no exponent for octals allowed
611 // scan exponent
612 AddLiteralCharAdvance();
613 if (c0_ == '+' || c0_ == '-')
614 AddLiteralCharAdvance();
615 if (!IsDecimalDigit(c0_)) {
616 // we must have at least one decimal digit after 'e'/'E'
617 return Token::ILLEGAL;
618 }
619 ScanDecimalDigits();
620 }
621
622 // The source character immediately following a numeric literal must
623 // not be an identifier start or a decimal digit; see ECMA-262
624 // section 7.8.3, page 17 (note that we read only one decimal digit
625 // if the value is 0).
626 if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_))
627 return Token::ILLEGAL;
628
629 literal.Complete();
630
631 return Token::NUMBER;
632 }
633
634
ScanIdentifierUnicodeEscape()635 uc32 JavaScriptScanner::ScanIdentifierUnicodeEscape() {
636 Advance();
637 if (c0_ != 'u') return unibrow::Utf8::kBadChar;
638 Advance();
639 uc32 c = ScanHexEscape('u', 4);
640 // We do not allow a unicode escape sequence to start another
641 // unicode escape sequence.
642 if (c == '\\') return unibrow::Utf8::kBadChar;
643 return c;
644 }
645
646
ScanIdentifierOrKeyword()647 Token::Value JavaScriptScanner::ScanIdentifierOrKeyword() {
648 ASSERT(unicode_cache_->IsIdentifierStart(c0_));
649 LiteralScope literal(this);
650 KeywordMatcher keyword_match;
651 // Scan identifier start character.
652 if (c0_ == '\\') {
653 uc32 c = ScanIdentifierUnicodeEscape();
654 // Only allow legal identifier start characters.
655 if (!unicode_cache_->IsIdentifierStart(c)) return Token::ILLEGAL;
656 AddLiteralChar(c);
657 return ScanIdentifierSuffix(&literal);
658 }
659
660 uc32 first_char = c0_;
661 Advance();
662 AddLiteralChar(first_char);
663 if (!keyword_match.AddChar(first_char)) {
664 return ScanIdentifierSuffix(&literal);
665 }
666
667 // Scan the rest of the identifier characters.
668 while (unicode_cache_->IsIdentifierPart(c0_)) {
669 if (c0_ != '\\') {
670 uc32 next_char = c0_;
671 Advance();
672 AddLiteralChar(next_char);
673 if (keyword_match.AddChar(next_char)) continue;
674 }
675 // Fallthrough if no loner able to complete keyword.
676 return ScanIdentifierSuffix(&literal);
677 }
678 literal.Complete();
679
680 return keyword_match.token();
681 }
682
683
ScanIdentifierSuffix(LiteralScope * literal)684 Token::Value JavaScriptScanner::ScanIdentifierSuffix(LiteralScope* literal) {
685 // Scan the rest of the identifier characters.
686 while (unicode_cache_->IsIdentifierPart(c0_)) {
687 if (c0_ == '\\') {
688 uc32 c = ScanIdentifierUnicodeEscape();
689 // Only allow legal identifier part characters.
690 if (!unicode_cache_->IsIdentifierPart(c)) return Token::ILLEGAL;
691 AddLiteralChar(c);
692 } else {
693 AddLiteralChar(c0_);
694 Advance();
695 }
696 }
697 literal->Complete();
698
699 return Token::IDENTIFIER;
700 }
701
702
ScanRegExpPattern(bool seen_equal)703 bool JavaScriptScanner::ScanRegExpPattern(bool seen_equal) {
704 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
705 bool in_character_class = false;
706
707 // Previous token is either '/' or '/=', in the second case, the
708 // pattern starts at =.
709 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
710 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
711
712 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
713 // the scanner should pass uninterpreted bodies to the RegExp
714 // constructor.
715 LiteralScope literal(this);
716 if (seen_equal)
717 AddLiteralChar('=');
718
719 while (c0_ != '/' || in_character_class) {
720 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
721 if (c0_ == '\\') { // Escape sequence.
722 AddLiteralCharAdvance();
723 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
724 AddLiteralCharAdvance();
725 // If the escape allows more characters, i.e., \x??, \u????, or \c?,
726 // only "safe" characters are allowed (letters, digits, underscore),
727 // otherwise the escape isn't valid and the invalid character has
728 // its normal meaning. I.e., we can just continue scanning without
729 // worrying whether the following characters are part of the escape
730 // or not, since any '/', '\\' or '[' is guaranteed to not be part
731 // of the escape sequence.
732 } else { // Unescaped character.
733 if (c0_ == '[') in_character_class = true;
734 if (c0_ == ']') in_character_class = false;
735 AddLiteralCharAdvance();
736 }
737 }
738 Advance(); // consume '/'
739
740 literal.Complete();
741
742 return true;
743 }
744
745
ScanRegExpFlags()746 bool JavaScriptScanner::ScanRegExpFlags() {
747 // Scan regular expression flags.
748 LiteralScope literal(this);
749 while (unicode_cache_->IsIdentifierPart(c0_)) {
750 if (c0_ == '\\') {
751 uc32 c = ScanIdentifierUnicodeEscape();
752 if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) {
753 // We allow any escaped character, unlike the restriction on
754 // IdentifierPart when it is used to build an IdentifierName.
755 AddLiteralChar(c);
756 continue;
757 }
758 }
759 AddLiteralCharAdvance();
760 }
761 literal.Complete();
762
763 next_.location.end_pos = source_pos() - 1;
764 return true;
765 }
766
767 // ----------------------------------------------------------------------------
768 // Keyword Matcher
769
770 KeywordMatcher::FirstState KeywordMatcher::first_states_[] = {
771 { "break", KEYWORD_PREFIX, Token::BREAK },
772 { NULL, C, Token::ILLEGAL },
773 { NULL, D, Token::ILLEGAL },
774 { NULL, E, Token::ILLEGAL },
775 { NULL, F, Token::ILLEGAL },
776 { NULL, UNMATCHABLE, Token::ILLEGAL },
777 { NULL, UNMATCHABLE, Token::ILLEGAL },
778 { NULL, I, Token::ILLEGAL },
779 { NULL, UNMATCHABLE, Token::ILLEGAL },
780 { NULL, UNMATCHABLE, Token::ILLEGAL },
781 { "let", KEYWORD_PREFIX, Token::FUTURE_RESERVED_WORD },
782 { NULL, UNMATCHABLE, Token::ILLEGAL },
783 { NULL, N, Token::ILLEGAL },
784 { NULL, UNMATCHABLE, Token::ILLEGAL },
785 { NULL, P, Token::ILLEGAL },
786 { NULL, UNMATCHABLE, Token::ILLEGAL },
787 { "return", KEYWORD_PREFIX, Token::RETURN },
788 { NULL, S, Token::ILLEGAL },
789 { NULL, T, Token::ILLEGAL },
790 { NULL, UNMATCHABLE, Token::ILLEGAL },
791 { NULL, V, Token::ILLEGAL },
792 { NULL, W, Token::ILLEGAL },
793 { NULL, UNMATCHABLE, Token::ILLEGAL },
794 { "yield", KEYWORD_PREFIX, Token::FUTURE_RESERVED_WORD }
795 };
796
797
Step(unibrow::uchar input)798 void KeywordMatcher::Step(unibrow::uchar input) {
799 switch (state_) {
800 case INITIAL: {
801 // matching the first character is the only state with significant fanout.
802 // Match only lower-case letters in range 'b'..'y'.
803 unsigned int offset = input - kFirstCharRangeMin;
804 if (offset < kFirstCharRangeLength) {
805 state_ = first_states_[offset].state;
806 if (state_ == KEYWORD_PREFIX) {
807 keyword_ = first_states_[offset].keyword;
808 counter_ = 1;
809 keyword_token_ = first_states_[offset].token;
810 }
811 return;
812 }
813 break;
814 }
815 case KEYWORD_PREFIX:
816 if (static_cast<unibrow::uchar>(keyword_[counter_]) == input) {
817 counter_++;
818 if (keyword_[counter_] == '\0') {
819 state_ = KEYWORD_MATCHED;
820 token_ = keyword_token_;
821 }
822 return;
823 }
824 break;
825 case KEYWORD_MATCHED:
826 token_ = Token::IDENTIFIER;
827 break;
828 case C:
829 if (MatchState(input, 'a', CA)) return;
830 if (MatchKeywordStart(input, "class", 1,
831 Token::FUTURE_RESERVED_WORD)) return;
832 if (MatchState(input, 'o', CO)) return;
833 break;
834 case CA:
835 if (MatchKeywordStart(input, "case", 2, Token::CASE)) return;
836 if (MatchKeywordStart(input, "catch", 2, Token::CATCH)) return;
837 break;
838 case CO:
839 if (MatchState(input, 'n', CON)) return;
840 break;
841 case CON:
842 if (MatchKeywordStart(input, "const", 3, Token::CONST)) return;
843 if (MatchKeywordStart(input, "continue", 3, Token::CONTINUE)) return;
844 break;
845 case D:
846 if (MatchState(input, 'e', DE)) return;
847 if (MatchKeyword(input, 'o', KEYWORD_MATCHED, Token::DO)) return;
848 break;
849 case DE:
850 if (MatchKeywordStart(input, "debugger", 2, Token::DEBUGGER)) return;
851 if (MatchKeywordStart(input, "default", 2, Token::DEFAULT)) return;
852 if (MatchKeywordStart(input, "delete", 2, Token::DELETE)) return;
853 break;
854 case E:
855 if (MatchKeywordStart(input, "else", 1, Token::ELSE)) return;
856 if (MatchKeywordStart(input, "enum", 1,
857 Token::FUTURE_RESERVED_WORD)) return;
858 if (MatchState(input, 'x', EX)) return;
859 break;
860 case EX:
861 if (MatchKeywordStart(input, "export", 2,
862 Token::FUTURE_RESERVED_WORD)) return;
863 if (MatchKeywordStart(input, "extends", 2,
864 Token::FUTURE_RESERVED_WORD)) return;
865 break;
866 case F:
867 if (MatchKeywordStart(input, "false", 1, Token::FALSE_LITERAL)) return;
868 if (MatchKeywordStart(input, "finally", 1, Token::FINALLY)) return;
869 if (MatchKeywordStart(input, "for", 1, Token::FOR)) return;
870 if (MatchKeywordStart(input, "function", 1, Token::FUNCTION)) return;
871 break;
872 case I:
873 if (MatchKeyword(input, 'f', KEYWORD_MATCHED, Token::IF)) return;
874 if (MatchState(input, 'm', IM)) return;
875 if (MatchKeyword(input, 'n', IN, Token::IN)) return;
876 break;
877 case IM:
878 if (MatchState(input, 'p', IMP)) return;
879 break;
880 case IMP:
881 if (MatchKeywordStart(input, "implements", 3,
882 Token::FUTURE_RESERVED_WORD )) return;
883 if (MatchKeywordStart(input, "import", 3,
884 Token::FUTURE_RESERVED_WORD)) return;
885 break;
886 case IN:
887 token_ = Token::IDENTIFIER;
888 if (MatchKeywordStart(input, "interface", 2,
889 Token::FUTURE_RESERVED_WORD)) return;
890 if (MatchKeywordStart(input, "instanceof", 2, Token::INSTANCEOF)) return;
891 break;
892 case N:
893 if (MatchKeywordStart(input, "native", 1, Token::NATIVE)) return;
894 if (MatchKeywordStart(input, "new", 1, Token::NEW)) return;
895 if (MatchKeywordStart(input, "null", 1, Token::NULL_LITERAL)) return;
896 break;
897 case P:
898 if (MatchKeywordStart(input, "package", 1,
899 Token::FUTURE_RESERVED_WORD)) return;
900 if (MatchState(input, 'r', PR)) return;
901 if (MatchKeywordStart(input, "public", 1,
902 Token::FUTURE_RESERVED_WORD)) return;
903 break;
904 case PR:
905 if (MatchKeywordStart(input, "private", 2,
906 Token::FUTURE_RESERVED_WORD)) return;
907 if (MatchKeywordStart(input, "protected", 2,
908 Token::FUTURE_RESERVED_WORD)) return;
909 break;
910 case S:
911 if (MatchKeywordStart(input, "static", 1,
912 Token::FUTURE_RESERVED_WORD)) return;
913 if (MatchKeywordStart(input, "super", 1,
914 Token::FUTURE_RESERVED_WORD)) return;
915 if (MatchKeywordStart(input, "switch", 1,
916 Token::SWITCH)) return;
917 break;
918 case T:
919 if (MatchState(input, 'h', TH)) return;
920 if (MatchState(input, 'r', TR)) return;
921 if (MatchKeywordStart(input, "typeof", 1, Token::TYPEOF)) return;
922 break;
923 case TH:
924 if (MatchKeywordStart(input, "this", 2, Token::THIS)) return;
925 if (MatchKeywordStart(input, "throw", 2, Token::THROW)) return;
926 break;
927 case TR:
928 if (MatchKeywordStart(input, "true", 2, Token::TRUE_LITERAL)) return;
929 if (MatchKeyword(input, 'y', KEYWORD_MATCHED, Token::TRY)) return;
930 break;
931 case V:
932 if (MatchKeywordStart(input, "var", 1, Token::VAR)) return;
933 if (MatchKeywordStart(input, "void", 1, Token::VOID)) return;
934 break;
935 case W:
936 if (MatchKeywordStart(input, "while", 1, Token::WHILE)) return;
937 if (MatchKeywordStart(input, "with", 1, Token::WITH)) return;
938 break;
939 case UNMATCHABLE:
940 break;
941 }
942 // On fallthrough, it's a failure.
943 state_ = UNMATCHABLE;
944 }
945
946 } } // namespace v8::internal
947