• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2018 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_PARSING_SCANNER_INL_H_
6 #define V8_PARSING_SCANNER_INL_H_
7 
8 #include "src/parsing/keywords-gen.h"
9 #include "src/parsing/scanner.h"
10 #include "src/strings/char-predicates-inl.h"
11 #include "src/utils/utils.h"
12 
13 namespace v8 {
14 namespace internal {
15 
16 // ----------------------------------------------------------------------------
17 // Keyword Matcher
18 
// X-macro table of the keywords handled by the helpers in this file, grouped
// by first character. For every distinct first character the table invokes
// KEYWORD_GROUP(ch) once, followed by one KEYWORD(spelling, token) entry per
// keyword beginning with that character. Users pass their own two macros to
// decide what each entry expands to (see IsKeywordStart and
// CanBeKeywordCharacter). Several spellings share the token
// Token::FUTURE_STRICT_RESERVED_WORD.
19 #define KEYWORDS(KEYWORD_GROUP, KEYWORD)                    \
20   KEYWORD_GROUP('a')                                        \
21   KEYWORD("async", Token::ASYNC)                            \
22   KEYWORD("await", Token::AWAIT)                            \
23   KEYWORD_GROUP('b')                                        \
24   KEYWORD("break", Token::BREAK)                            \
25   KEYWORD_GROUP('c')                                        \
26   KEYWORD("case", Token::CASE)                              \
27   KEYWORD("catch", Token::CATCH)                            \
28   KEYWORD("class", Token::CLASS)                            \
29   KEYWORD("const", Token::CONST)                            \
30   KEYWORD("continue", Token::CONTINUE)                      \
31   KEYWORD_GROUP('d')                                        \
32   KEYWORD("debugger", Token::DEBUGGER)                      \
33   KEYWORD("default", Token::DEFAULT)                        \
34   KEYWORD("delete", Token::DELETE)                          \
35   KEYWORD("do", Token::DO)                                  \
36   KEYWORD_GROUP('e')                                        \
37   KEYWORD("else", Token::ELSE)                              \
38   KEYWORD("enum", Token::ENUM)                              \
39   KEYWORD("export", Token::EXPORT)                          \
40   KEYWORD("extends", Token::EXTENDS)                        \
41   KEYWORD_GROUP('f')                                        \
42   KEYWORD("false", Token::FALSE_LITERAL)                    \
43   KEYWORD("finally", Token::FINALLY)                        \
44   KEYWORD("for", Token::FOR)                                \
45   KEYWORD("function", Token::FUNCTION)                      \
46   KEYWORD_GROUP('g')                                        \
47   KEYWORD("get", Token::GET)                                \
48   KEYWORD_GROUP('i')                                        \
49   KEYWORD("if", Token::IF)                                  \
50   KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \
51   KEYWORD("import", Token::IMPORT)                          \
52   KEYWORD("in", Token::IN)                                  \
53   KEYWORD("instanceof", Token::INSTANCEOF)                  \
54   KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD)  \
55   KEYWORD_GROUP('l')                                        \
56   KEYWORD("let", Token::LET)                                \
57   KEYWORD_GROUP('n')                                        \
58   KEYWORD("new", Token::NEW)                                \
59   KEYWORD("null", Token::NULL_LITERAL)                      \
60   KEYWORD_GROUP('p')                                        \
61   KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD)    \
62   KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD)    \
63   KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD)  \
64   KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD)     \
65   KEYWORD_GROUP('r')                                        \
66   KEYWORD("return", Token::RETURN)                          \
67   KEYWORD_GROUP('s')                                        \
68   KEYWORD("set", Token::SET)                                \
69   KEYWORD("static", Token::STATIC)                          \
70   KEYWORD("super", Token::SUPER)                            \
71   KEYWORD("switch", Token::SWITCH)                          \
72   KEYWORD_GROUP('t')                                        \
73   KEYWORD("this", Token::THIS)                              \
74   KEYWORD("throw", Token::THROW)                            \
75   KEYWORD("true", Token::TRUE_LITERAL)                      \
76   KEYWORD("try", Token::TRY)                                \
77   KEYWORD("typeof", Token::TYPEOF)                          \
78   KEYWORD_GROUP('v')                                        \
79   KEYWORD("var", Token::VAR)                                \
80   KEYWORD("void", Token::VOID)                              \
81   KEYWORD_GROUP('w')                                        \
82   KEYWORD("while", Token::WHILE)                            \
83   KEYWORD("with", Token::WITH)                              \
84   KEYWORD_GROUP('y')                                        \
85   KEYWORD("yield", Token::YIELD)
86 
IsKeywordStart(char c)87 constexpr bool IsKeywordStart(char c) {
88 #define KEYWORD_GROUP_CHECK(ch) c == ch ||
89 #define KEYWORD_CHECK(keyword, token)
90   return KEYWORDS(KEYWORD_GROUP_CHECK, KEYWORD_CHECK) /* || */ false;
91 #undef KEYWORD_GROUP_CHECK
92 #undef KEYWORD_CHECK
93 }
94 
KeywordOrIdentifierToken(const uint8_t * input,int input_length)95 V8_INLINE Token::Value KeywordOrIdentifierToken(const uint8_t* input,
96                                                 int input_length) {
97   DCHECK_GE(input_length, 1);
98   return PerfectKeywordHash::GetToken(reinterpret_cast<const char*>(input),
99                                       input_length);
100 }
101 
// Compile-time membership test: returns true if |c| occurs in the character
// array |s| at index |i| or later. The whole array is searched, so for a
// string literal the terminating '\0' counts as a member. Written as a single
// recursive expression so it remains usable in constant expressions.
template <int N>
constexpr bool IsInString(const char (&s)[N], char c, size_t i = 0) {
  return i < N && (s[i] == c || IsInString(s, c, i + 1));
}
108 
CanBeKeywordCharacter(char c)109 inline constexpr bool CanBeKeywordCharacter(char c) {
110   return IsInString(
111 #define KEYWORD_GROUP_CASE(ch)  // Nothing
112 #define KEYWORD(keyword, token) keyword
113       // Use C string literal concatenation ("a" "b" becomes "ab") to build one
114       // giant string containing all the keywords.
115       KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
116 #undef KEYWORD
117 #undef KEYWORD_GROUP_CASE
118           ,
119       c);
120 }
121 
122 // Make sure tokens are stored as a single byte.
// (Compile-time check; the one_char_tokens table below is a 128-entry array
// of Token::Value.)
123 STATIC_ASSERT(sizeof(Token::Value) == 1);
124 
125 // Get the shortest token that this character starts, the token may change
126 // depending on subsequent characters.
GetOneCharToken(char c)127 constexpr Token::Value GetOneCharToken(char c) {
128   // clang-format off
129   return
130       c == '(' ? Token::LPAREN :
131       c == ')' ? Token::RPAREN :
132       c == '{' ? Token::LBRACE :
133       c == '}' ? Token::RBRACE :
134       c == '[' ? Token::LBRACK :
135       c == ']' ? Token::RBRACK :
136       c == '?' ? Token::CONDITIONAL :
137       c == ':' ? Token::COLON :
138       c == ';' ? Token::SEMICOLON :
139       c == ',' ? Token::COMMA :
140       c == '.' ? Token::PERIOD :
141       c == '|' ? Token::BIT_OR :
142       c == '&' ? Token::BIT_AND :
143       c == '^' ? Token::BIT_XOR :
144       c == '~' ? Token::BIT_NOT :
145       c == '!' ? Token::NOT :
146       c == '<' ? Token::LT :
147       c == '>' ? Token::GT :
148       c == '%' ? Token::MOD :
149       c == '=' ? Token::ASSIGN :
150       c == '+' ? Token::ADD :
151       c == '-' ? Token::SUB :
152       c == '*' ? Token::MUL :
153       c == '/' ? Token::DIV :
154       c == '#' ? Token::PRIVATE_NAME :
155       c == '"' ? Token::STRING :
156       c == '\'' ? Token::STRING :
157       c == '`' ? Token::TEMPLATE_SPAN :
158       c == '\\' ? Token::IDENTIFIER :
159       // Whitespace or line terminator
160       c == ' ' ? Token::WHITESPACE :
161       c == '\t' ? Token::WHITESPACE :
162       c == '\v' ? Token::WHITESPACE :
163       c == '\f' ? Token::WHITESPACE :
164       c == '\r' ? Token::WHITESPACE :
165       c == '\n' ? Token::WHITESPACE :
166       // IsDecimalDigit must be tested before IsAsciiIdentifier
167       IsDecimalDigit(c) ? Token::NUMBER :
168       IsAsciiIdentifier(c) ? Token::IDENTIFIER :
169       Token::ILLEGAL;
170   // clang-format on
171 }
172 
173 // Table of one-character tokens, by character (0x00..0x7F only).
174 static const constexpr Token::Value one_char_tokens[128] = {
175 #define CALL_GET_SCAN_FLAGS(N) GetOneCharToken(N),
176     INT_0_TO_127_LIST(CALL_GET_SCAN_FLAGS)
177 #undef CALL_GET_SCAN_FLAGS
178 };
179 
180 #undef KEYWORDS
181 
ScanIdentifierOrKeyword()182 V8_INLINE Token::Value Scanner::ScanIdentifierOrKeyword() {
183   next().literal_chars.Start();
184   return ScanIdentifierOrKeywordInner();
185 }
186 
// Per-character bit flags used by the fast paths of the scanner. Each
// enumerator occupies its own bit so that the flags of several characters can
// be ORed together.
enum class ScanFlags : uint8_t {
  kTerminatesLiteral = 0x01,
  // Phrased as "cannot" rather than "can" so that ORing the flags of multiple
  // characters accumulates the property.
  kCannotBeKeyword = 0x02,
  kCannotBeKeywordStart = 0x04,
  kStringTerminator = 0x08,
  kIdentifierNeedsSlowPath = 0x10,
  kMultilineCommentCharacterNeedsSlowPath = 0x20,
};
GetScanFlags(char c)198 constexpr uint8_t GetScanFlags(char c) {
199   return
200       // Keywords are all lowercase and only contain letters.
201       // Note that non-identifier characters do not set this flag, so
202       // that it plays well with kTerminatesLiteral.
203       (IsAsciiIdentifier(c) && !CanBeKeywordCharacter(c)
204            ? static_cast<uint8_t>(ScanFlags::kCannotBeKeyword)
205            : 0) |
206       (IsKeywordStart(c)
207            ? 0
208            : static_cast<uint8_t>(ScanFlags::kCannotBeKeywordStart)) |
209       // Anything that isn't an identifier character will terminate the
210       // literal, or at least terminates the literal fast path processing
211       // (like an escape).
212       (!IsAsciiIdentifier(c)
213            ? static_cast<uint8_t>(ScanFlags::kTerminatesLiteral)
214            : 0) |
215       // Possible string termination characters.
216       ((c == '\'' || c == '"' || c == '\n' || c == '\r' || c == '\\')
217            ? static_cast<uint8_t>(ScanFlags::kStringTerminator)
218            : 0) |
219       // Escapes are processed on the slow path.
220       (c == '\\' ? static_cast<uint8_t>(ScanFlags::kIdentifierNeedsSlowPath)
221                  : 0) |
222       // Newlines and * are interesting characters for multiline comment
223       // scanning.
224       (c == '\n' || c == '\r' || c == '*'
225            ? static_cast<uint8_t>(
226                  ScanFlags::kMultilineCommentCharacterNeedsSlowPath)
227            : 0);
228 }
TerminatesLiteral(uint8_t scan_flags)229 inline bool TerminatesLiteral(uint8_t scan_flags) {
230   return (scan_flags & static_cast<uint8_t>(ScanFlags::kTerminatesLiteral));
231 }
CanBeKeyword(uint8_t scan_flags)232 inline bool CanBeKeyword(uint8_t scan_flags) {
233   return !(scan_flags & static_cast<uint8_t>(ScanFlags::kCannotBeKeyword));
234 }
IdentifierNeedsSlowPath(uint8_t scan_flags)235 inline bool IdentifierNeedsSlowPath(uint8_t scan_flags) {
236   return (scan_flags &
237           static_cast<uint8_t>(ScanFlags::kIdentifierNeedsSlowPath));
238 }
MultilineCommentCharacterNeedsSlowPath(uint8_t scan_flags)239 inline bool MultilineCommentCharacterNeedsSlowPath(uint8_t scan_flags) {
240   return (scan_flags & static_cast<uint8_t>(
241                            ScanFlags::kMultilineCommentCharacterNeedsSlowPath));
242 }
MayTerminateString(uint8_t scan_flags)243 inline bool MayTerminateString(uint8_t scan_flags) {
244   return (scan_flags & static_cast<uint8_t>(ScanFlags::kStringTerminator));
245 }
246 // Table of precomputed scan flags for the 128 ASCII characters, for branchless
247 // flag calculation during the scan.
248 static constexpr const uint8_t character_scan_flags[128] = {
249 #define CALL_GET_SCAN_FLAGS(N) GetScanFlags(N),
250     INT_0_TO_127_LIST(CALL_GET_SCAN_FLAGS)
251 #undef CALL_GET_SCAN_FLAGS
252 };
253 
CharCanBeKeyword(uc32 c)254 inline bool CharCanBeKeyword(uc32 c) {
255   return static_cast<uint32_t>(c) < arraysize(character_scan_flags) &&
256          CanBeKeyword(character_scan_flags[c]);
257 }
258 
ScanIdentifierOrKeywordInner()259 V8_INLINE Token::Value Scanner::ScanIdentifierOrKeywordInner() {
260   DCHECK(IsIdentifierStart(c0_));
261   bool escaped = false;
262   bool can_be_keyword = true;
263 
264   STATIC_ASSERT(arraysize(character_scan_flags) == kMaxAscii + 1);
265   if (V8_LIKELY(static_cast<uint32_t>(c0_) <= kMaxAscii)) {
266     if (V8_LIKELY(c0_ != '\\')) {
267       uint8_t scan_flags = character_scan_flags[c0_];
268       DCHECK(!TerminatesLiteral(scan_flags));
269       STATIC_ASSERT(static_cast<uint8_t>(ScanFlags::kCannotBeKeywordStart) ==
270                     static_cast<uint8_t>(ScanFlags::kCannotBeKeyword) << 1);
271       scan_flags >>= 1;
272       // Make sure the shifting above doesn't set IdentifierNeedsSlowPath.
273       // Otherwise we'll fall into the slow path after scanning the identifier.
274       DCHECK(!IdentifierNeedsSlowPath(scan_flags));
275       AddLiteralChar(static_cast<char>(c0_));
276       AdvanceUntil([this, &scan_flags](uc32 c0) {
277         if (V8_UNLIKELY(static_cast<uint32_t>(c0) > kMaxAscii)) {
278           // A non-ascii character means we need to drop through to the slow
279           // path.
280           // TODO(leszeks): This would be most efficient as a goto to the slow
281           // path, check codegen and maybe use a bool instead.
282           scan_flags |=
283               static_cast<uint8_t>(ScanFlags::kIdentifierNeedsSlowPath);
284           return true;
285         }
286         uint8_t char_flags = character_scan_flags[c0];
287         scan_flags |= char_flags;
288         if (TerminatesLiteral(char_flags)) {
289           return true;
290         } else {
291           AddLiteralChar(static_cast<char>(c0));
292           return false;
293         }
294       });
295 
296       if (V8_LIKELY(!IdentifierNeedsSlowPath(scan_flags))) {
297         if (!CanBeKeyword(scan_flags)) return Token::IDENTIFIER;
298         // Could be a keyword or identifier.
299         Vector<const uint8_t> chars = next().literal_chars.one_byte_literal();
300         return KeywordOrIdentifierToken(chars.begin(), chars.length());
301       }
302 
303       can_be_keyword = CanBeKeyword(scan_flags);
304     } else {
305       // Special case for escapes at the start of an identifier.
306       escaped = true;
307       uc32 c = ScanIdentifierUnicodeEscape();
308       DCHECK(!IsIdentifierStart(Invalid()));
309       if (c == '\\' || !IsIdentifierStart(c)) {
310         return Token::ILLEGAL;
311       }
312       AddLiteralChar(c);
313       can_be_keyword = CharCanBeKeyword(c);
314     }
315   }
316 
317   return ScanIdentifierOrKeywordInnerSlow(escaped, can_be_keyword);
318 }
319 
SkipWhiteSpace()320 V8_INLINE Token::Value Scanner::SkipWhiteSpace() {
321   int start_position = source_pos();
322 
323   // We won't skip behind the end of input.
324   DCHECK(!IsWhiteSpaceOrLineTerminator(kEndOfInput));
325 
326   // Advance as long as character is a WhiteSpace or LineTerminator.
327   while (IsWhiteSpaceOrLineTerminator(c0_)) {
328     if (!next().after_line_terminator && unibrow::IsLineTerminator(c0_)) {
329       next().after_line_terminator = true;
330     }
331     Advance();
332   }
333 
334   // Return whether or not we skipped any characters.
335   if (source_pos() == start_position) {
336     DCHECK_NE('0', c0_);
337     return Token::ILLEGAL;
338   }
339 
340   return Token::WHITESPACE;
341 }
342 
// Scans one token starting at c0_ and returns its Token::Value.
//
// Each loop iteration records the token's start position in
// next().location.beg_pos. ASCII characters are classified through the
// one_char_tokens table and dispatched by the switch; every case either
// returns a token or assigns |token| and executes `continue`. A `continue`
// jumps to the loop condition, so scanning restarts only when the helper
// returned Token::WHITESPACE; any other value leaves the loop and is returned.
// Characters outside the ASCII range are handled after the switch.
ScanSingleToken()343 V8_INLINE Token::Value Scanner::ScanSingleToken() {
344   Token::Value token;
345   do {
346     next().location.beg_pos = source_pos();
347 
348     if (V8_LIKELY(static_cast<unsigned>(c0_) <= kMaxAscii)) {
349       token = one_char_tokens[c0_];
350 
351       switch (token) {
352         case Token::LPAREN:
353         case Token::RPAREN:
354         case Token::LBRACE:
355         case Token::RBRACE:
356         case Token::LBRACK:
357         case Token::RBRACK:
358         case Token::COLON:
359         case Token::SEMICOLON:
360         case Token::COMMA:
361         case Token::BIT_NOT:
362         case Token::ILLEGAL:
363           // One character tokens.
364           return Select(token);
365 
366         case Token::CONDITIONAL:
367           // ? ?. ?? ??=
368           Advance();
369           if (c0_ == '.') {
370             Advance();
              // "?." followed by a decimal digit is not QUESTION_PERIOD: the
              // '.' is pushed back and plain CONDITIONAL is returned.
371             if (!IsDecimalDigit(c0_)) return Token::QUESTION_PERIOD;
372             PushBack('.');
373           } else if (c0_ == '?') {
374             return Select('=', Token::ASSIGN_NULLISH, Token::NULLISH);
375           }
376           return Token::CONDITIONAL;
377 
378         case Token::STRING:
379           return ScanString();
380 
381         case Token::LT:
382           // < <= << <<= <!--
383           Advance();
384           if (c0_ == '=') return Select(Token::LTE);
385           if (c0_ == '<') return Select('=', Token::ASSIGN_SHL, Token::SHL);
386           if (c0_ == '!') {
387             token = ScanHtmlComment();
388             continue;
389           }
390           return Token::LT;
391 
392         case Token::GT:
393           // > >= >> >>= >>> >>>=
394           Advance();
395           if (c0_ == '=') return Select(Token::GTE);
396           if (c0_ == '>') {
397             // >> >>= >>> >>>=
398             Advance();
399             if (c0_ == '=') return Select(Token::ASSIGN_SAR);
400             if (c0_ == '>') return Select('=', Token::ASSIGN_SHR, Token::SHR);
401             return Token::SAR;
402           }
403           return Token::GT;
404 
405         case Token::ASSIGN:
406           // = == === =>
407           Advance();
408           if (c0_ == '=') return Select('=', Token::EQ_STRICT, Token::EQ);
409           if (c0_ == '>') return Select(Token::ARROW);
410           return Token::ASSIGN;
411 
412         case Token::NOT:
413           // ! != !==
414           Advance();
415           if (c0_ == '=') return Select('=', Token::NE_STRICT, Token::NE);
416           return Token::NOT;
417 
418         case Token::ADD:
419           // + ++ +=
420           Advance();
421           if (c0_ == '+') return Select(Token::INC);
422           if (c0_ == '=') return Select(Token::ASSIGN_ADD);
423           return Token::ADD;
424 
425         case Token::SUB:
426           // - -- --> -=
427           Advance();
428           if (c0_ == '-') {
429             Advance();
430             if (c0_ == '>' && next().after_line_terminator) {
431               // For compatibility with SpiderMonkey, we skip lines that
432               // start with an HTML comment end '-->'.
433               token = SkipSingleHTMLComment();
434               continue;
435             }
436             return Token::DEC;
437           }
438           if (c0_ == '=') return Select(Token::ASSIGN_SUB);
439           return Token::SUB;
440 
441         case Token::MUL:
442           // * *=
443           Advance();
444           if (c0_ == '*') return Select('=', Token::ASSIGN_EXP, Token::EXP);
445           if (c0_ == '=') return Select(Token::ASSIGN_MUL);
446           return Token::MUL;
447 
448         case Token::MOD:
449           // % %=
450           return Select('=', Token::ASSIGN_MOD, Token::MOD);
451 
452         case Token::DIV:
453           // /  // /* /=
454           Advance();
455           if (c0_ == '/') {
              // "//#" and "//@" comments go to SkipSourceURLComment; other
              // "//" comments go to SkipSingleLineComment.
456             uc32 c = Peek();
457             if (c == '#' || c == '@') {
458               Advance();
459               Advance();
460               token = SkipSourceURLComment();
461               continue;
462             }
463             token = SkipSingleLineComment();
464             continue;
465           }
466           if (c0_ == '*') {
467             token = SkipMultiLineComment();
468             continue;
469           }
470           if (c0_ == '=') return Select(Token::ASSIGN_DIV);
471           return Token::DIV;
472 
473         case Token::BIT_AND:
474           // & && &= &&=
475           Advance();
476           if (c0_ == '&') return Select('=', Token::ASSIGN_AND, Token::AND);
477           if (c0_ == '=') return Select(Token::ASSIGN_BIT_AND);
478           return Token::BIT_AND;
479 
480         case Token::BIT_OR:
481           // | || |= ||=
482           Advance();
483           if (c0_ == '|') return Select('=', Token::ASSIGN_OR, Token::OR);
484           if (c0_ == '=') return Select(Token::ASSIGN_BIT_OR);
485           return Token::BIT_OR;
486 
487         case Token::BIT_XOR:
488           // ^ ^=
489           return Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
490 
491         case Token::PERIOD:
492           // . Number
493           Advance();
494           if (IsDecimalDigit(c0_)) return ScanNumber(true);
              // "..." is ELLIPSIS; a lone ".." falls through and yields PERIOD
              // for the first dot only.
495           if (c0_ == '.') {
496             if (Peek() == '.') {
497               Advance();
498               Advance();
499               return Token::ELLIPSIS;
500             }
501           }
502           return Token::PERIOD;
503 
504         case Token::TEMPLATE_SPAN:
505           Advance();
506           return ScanTemplateSpan();
507 
508         case Token::PRIVATE_NAME:
              // "#!" at source position 0 is skipped as a single-line comment.
509           if (source_pos() == 0 && Peek() == '!') {
510             token = SkipSingleLineComment();
511             continue;
512           }
513           return ScanPrivateName();
514 
515         case Token::WHITESPACE:
516           token = SkipWhiteSpace();
517           continue;
518 
519         case Token::NUMBER:
520           return ScanNumber(false);
521 
522         case Token::IDENTIFIER:
523           return ScanIdentifierOrKeyword();
524 
525         default:
526           UNREACHABLE();
527       }
528     }
529 
    // Reached only when c0_ is outside the ASCII range (compared as an
    // unsigned value): every switch case above returns or continues.
530     if (IsIdentifierStart(c0_) ||
531         (CombineSurrogatePair() && IsIdentifierStart(c0_))) {
532       return ScanIdentifierOrKeyword();
533     }
534     if (c0_ == kEndOfInput) {
535       return source_->has_parser_error() ? Token::ILLEGAL : Token::EOS;
536     }
537     token = SkipWhiteSpace();
538 
539     // Continue scanning for tokens as long as we're just skipping whitespace.
540   } while (token == Token::WHITESPACE);
541 
542   return token;
543 }
544 
Scan(TokenDesc * next_desc)545 void Scanner::Scan(TokenDesc* next_desc) {
546   DCHECK_EQ(next_desc, &next());
547 
548   next_desc->token = ScanSingleToken();
549   DCHECK_IMPLIES(has_parser_error(), next_desc->token == Token::ILLEGAL);
550   next_desc->location.end_pos = source_pos();
551 
552 #ifdef DEBUG
553   SanityCheckTokenDesc(current());
554   SanityCheckTokenDesc(next());
555   SanityCheckTokenDesc(next_next());
556 #endif
557 }
558 
Scan()559 void Scanner::Scan() { Scan(next_); }
560 
561 }  // namespace internal
562 }  // namespace v8
563 
564 #endif  // V8_PARSING_SCANNER_INL_H_
565