1 // Copyright 2018 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifndef V8_PARSING_SCANNER_INL_H_
6 #define V8_PARSING_SCANNER_INL_H_
7
8 #include "src/parsing/keywords-gen.h"
9 #include "src/parsing/scanner.h"
10 #include "src/strings/char-predicates-inl.h"
11 #include "src/utils/utils.h"
12
13 namespace v8 {
14 namespace internal {
15
16 // ----------------------------------------------------------------------------
17 // Keyword Matcher
18
// X-macro listing every keyword known to the scanner together with the token
// it maps to. Entries are grouped by their first character: each group starts
// with KEYWORD_GROUP(first_char) and is followed by one
// KEYWORD(spelling, token) entry per keyword beginning with that character.
// Users pass in the two macros to expand the list into the form they need
// (for example a chain of comparisons, or one concatenated string literal).
#define KEYWORDS(KEYWORD_GROUP, KEYWORD) \
  KEYWORD_GROUP('a') \
  KEYWORD("async", Token::ASYNC) \
  KEYWORD("await", Token::AWAIT) \
  KEYWORD_GROUP('b') \
  KEYWORD("break", Token::BREAK) \
  KEYWORD_GROUP('c') \
  KEYWORD("case", Token::CASE) \
  KEYWORD("catch", Token::CATCH) \
  KEYWORD("class", Token::CLASS) \
  KEYWORD("const", Token::CONST) \
  KEYWORD("continue", Token::CONTINUE) \
  KEYWORD_GROUP('d') \
  KEYWORD("debugger", Token::DEBUGGER) \
  KEYWORD("default", Token::DEFAULT) \
  KEYWORD("delete", Token::DELETE) \
  KEYWORD("do", Token::DO) \
  KEYWORD_GROUP('e') \
  KEYWORD("else", Token::ELSE) \
  KEYWORD("enum", Token::ENUM) \
  KEYWORD("export", Token::EXPORT) \
  KEYWORD("extends", Token::EXTENDS) \
  KEYWORD_GROUP('f') \
  KEYWORD("false", Token::FALSE_LITERAL) \
  KEYWORD("finally", Token::FINALLY) \
  KEYWORD("for", Token::FOR) \
  KEYWORD("function", Token::FUNCTION) \
  KEYWORD_GROUP('g') \
  KEYWORD("get", Token::GET) \
  KEYWORD_GROUP('i') \
  KEYWORD("if", Token::IF) \
  KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("import", Token::IMPORT) \
  KEYWORD("in", Token::IN) \
  KEYWORD("instanceof", Token::INSTANCEOF) \
  KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD_GROUP('l') \
  KEYWORD("let", Token::LET) \
  KEYWORD_GROUP('n') \
  KEYWORD("new", Token::NEW) \
  KEYWORD("null", Token::NULL_LITERAL) \
  KEYWORD_GROUP('p') \
  KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD_GROUP('r') \
  KEYWORD("return", Token::RETURN) \
  KEYWORD_GROUP('s') \
  KEYWORD("set", Token::SET) \
  KEYWORD("static", Token::STATIC) \
  KEYWORD("super", Token::SUPER) \
  KEYWORD("switch", Token::SWITCH) \
  KEYWORD_GROUP('t') \
  KEYWORD("this", Token::THIS) \
  KEYWORD("throw", Token::THROW) \
  KEYWORD("true", Token::TRUE_LITERAL) \
  KEYWORD("try", Token::TRY) \
  KEYWORD("typeof", Token::TYPEOF) \
  KEYWORD_GROUP('v') \
  KEYWORD("var", Token::VAR) \
  KEYWORD("void", Token::VOID) \
  KEYWORD_GROUP('w') \
  KEYWORD("while", Token::WHILE) \
  KEYWORD("with", Token::WITH) \
  KEYWORD_GROUP('y') \
  KEYWORD("yield", Token::YIELD)
86
IsKeywordStart(char c)87 constexpr bool IsKeywordStart(char c) {
88 #define KEYWORD_GROUP_CHECK(ch) c == ch ||
89 #define KEYWORD_CHECK(keyword, token)
90 return KEYWORDS(KEYWORD_GROUP_CHECK, KEYWORD_CHECK) /* || */ false;
91 #undef KEYWORD_GROUP_CHECK
92 #undef KEYWORD_CHECK
93 }
94
KeywordOrIdentifierToken(const uint8_t * input,int input_length)95 V8_INLINE Token::Value KeywordOrIdentifierToken(const uint8_t* input,
96 int input_length) {
97 DCHECK_GE(input_length, 1);
98 return PerfectKeywordHash::GetToken(reinterpret_cast<const char*>(input),
99 input_length);
100 }
101
// Compile-time capable membership test: returns true if |c| occurs in the
// character array |s| at index |i| or later. All N array elements are
// examined, so for a string literal the terminating '\0' counts as a member.
template <int N>
constexpr bool IsInString(const char (&s)[N], char c, size_t i = 0) {
  for (size_t pos = i; pos < static_cast<size_t>(N); ++pos) {
    if (s[pos] == c) return true;
  }
  return false;
}
108
CanBeKeywordCharacter(char c)109 inline constexpr bool CanBeKeywordCharacter(char c) {
110 return IsInString(
111 #define KEYWORD_GROUP_CASE(ch) // Nothing
112 #define KEYWORD(keyword, token) keyword
113 // Use C string literal concatenation ("a" "b" becomes "ab") to build one
114 // giant string containing all the keywords.
115 KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
116 #undef KEYWORD
117 #undef KEYWORD_GROUP_CASE
118 ,
119 c);
120 }
121
122 // Make sure tokens are stored as a single byte.
123 STATIC_ASSERT(sizeof(Token::Value) == 1);
124
125 // Get the shortest token that this character starts, the token may change
126 // depending on subsequent characters.
GetOneCharToken(char c)127 constexpr Token::Value GetOneCharToken(char c) {
128 // clang-format off
129 return
130 c == '(' ? Token::LPAREN :
131 c == ')' ? Token::RPAREN :
132 c == '{' ? Token::LBRACE :
133 c == '}' ? Token::RBRACE :
134 c == '[' ? Token::LBRACK :
135 c == ']' ? Token::RBRACK :
136 c == '?' ? Token::CONDITIONAL :
137 c == ':' ? Token::COLON :
138 c == ';' ? Token::SEMICOLON :
139 c == ',' ? Token::COMMA :
140 c == '.' ? Token::PERIOD :
141 c == '|' ? Token::BIT_OR :
142 c == '&' ? Token::BIT_AND :
143 c == '^' ? Token::BIT_XOR :
144 c == '~' ? Token::BIT_NOT :
145 c == '!' ? Token::NOT :
146 c == '<' ? Token::LT :
147 c == '>' ? Token::GT :
148 c == '%' ? Token::MOD :
149 c == '=' ? Token::ASSIGN :
150 c == '+' ? Token::ADD :
151 c == '-' ? Token::SUB :
152 c == '*' ? Token::MUL :
153 c == '/' ? Token::DIV :
154 c == '#' ? Token::PRIVATE_NAME :
155 c == '"' ? Token::STRING :
156 c == '\'' ? Token::STRING :
157 c == '`' ? Token::TEMPLATE_SPAN :
158 c == '\\' ? Token::IDENTIFIER :
159 // Whitespace or line terminator
160 c == ' ' ? Token::WHITESPACE :
161 c == '\t' ? Token::WHITESPACE :
162 c == '\v' ? Token::WHITESPACE :
163 c == '\f' ? Token::WHITESPACE :
164 c == '\r' ? Token::WHITESPACE :
165 c == '\n' ? Token::WHITESPACE :
166 // IsDecimalDigit must be tested before IsAsciiIdentifier
167 IsDecimalDigit(c) ? Token::NUMBER :
168 IsAsciiIdentifier(c) ? Token::IDENTIFIER :
169 Token::ILLEGAL;
170 // clang-format on
171 }
172
173 // Table of one-character tokens, by character (0x00..0x7F only).
174 static const constexpr Token::Value one_char_tokens[128] = {
175 #define CALL_GET_SCAN_FLAGS(N) GetOneCharToken(N),
176 INT_0_TO_127_LIST(CALL_GET_SCAN_FLAGS)
177 #undef CALL_GET_SCAN_FLAGS
178 };
179
180 #undef KEYWORDS
181
ScanIdentifierOrKeyword()182 V8_INLINE Token::Value Scanner::ScanIdentifierOrKeyword() {
183 next().literal_chars.Start();
184 return ScanIdentifierOrKeywordInner();
185 }
186
// Per-character bit flags used by the fast paths of the scanner. Each
// enumerator occupies its own bit so that the flags of several characters can
// be combined with bitwise OR.
enum class ScanFlags : uint8_t {
  kTerminatesLiteral = 0x01,
  // Phrased as "cannot" rather than "can" so that ORing the flags of multiple
  // characters yields the right answer for the whole sequence.
  kCannotBeKeyword = 0x02,
  kCannotBeKeywordStart = 0x04,
  kStringTerminator = 0x08,
  kIdentifierNeedsSlowPath = 0x10,
  kMultilineCommentCharacterNeedsSlowPath = 0x20,
};
GetScanFlags(char c)198 constexpr uint8_t GetScanFlags(char c) {
199 return
200 // Keywords are all lowercase and only contain letters.
201 // Note that non-identifier characters do not set this flag, so
202 // that it plays well with kTerminatesLiteral.
203 (IsAsciiIdentifier(c) && !CanBeKeywordCharacter(c)
204 ? static_cast<uint8_t>(ScanFlags::kCannotBeKeyword)
205 : 0) |
206 (IsKeywordStart(c)
207 ? 0
208 : static_cast<uint8_t>(ScanFlags::kCannotBeKeywordStart)) |
209 // Anything that isn't an identifier character will terminate the
210 // literal, or at least terminates the literal fast path processing
211 // (like an escape).
212 (!IsAsciiIdentifier(c)
213 ? static_cast<uint8_t>(ScanFlags::kTerminatesLiteral)
214 : 0) |
215 // Possible string termination characters.
216 ((c == '\'' || c == '"' || c == '\n' || c == '\r' || c == '\\')
217 ? static_cast<uint8_t>(ScanFlags::kStringTerminator)
218 : 0) |
219 // Escapes are processed on the slow path.
220 (c == '\\' ? static_cast<uint8_t>(ScanFlags::kIdentifierNeedsSlowPath)
221 : 0) |
222 // Newlines and * are interesting characters for multiline comment
223 // scanning.
224 (c == '\n' || c == '\r' || c == '*'
225 ? static_cast<uint8_t>(
226 ScanFlags::kMultilineCommentCharacterNeedsSlowPath)
227 : 0);
228 }
TerminatesLiteral(uint8_t scan_flags)229 inline bool TerminatesLiteral(uint8_t scan_flags) {
230 return (scan_flags & static_cast<uint8_t>(ScanFlags::kTerminatesLiteral));
231 }
CanBeKeyword(uint8_t scan_flags)232 inline bool CanBeKeyword(uint8_t scan_flags) {
233 return !(scan_flags & static_cast<uint8_t>(ScanFlags::kCannotBeKeyword));
234 }
IdentifierNeedsSlowPath(uint8_t scan_flags)235 inline bool IdentifierNeedsSlowPath(uint8_t scan_flags) {
236 return (scan_flags &
237 static_cast<uint8_t>(ScanFlags::kIdentifierNeedsSlowPath));
238 }
MultilineCommentCharacterNeedsSlowPath(uint8_t scan_flags)239 inline bool MultilineCommentCharacterNeedsSlowPath(uint8_t scan_flags) {
240 return (scan_flags & static_cast<uint8_t>(
241 ScanFlags::kMultilineCommentCharacterNeedsSlowPath));
242 }
MayTerminateString(uint8_t scan_flags)243 inline bool MayTerminateString(uint8_t scan_flags) {
244 return (scan_flags & static_cast<uint8_t>(ScanFlags::kStringTerminator));
245 }
246 // Table of precomputed scan flags for the 128 ASCII characters, for branchless
247 // flag calculation during the scan.
248 static constexpr const uint8_t character_scan_flags[128] = {
249 #define CALL_GET_SCAN_FLAGS(N) GetScanFlags(N),
250 INT_0_TO_127_LIST(CALL_GET_SCAN_FLAGS)
251 #undef CALL_GET_SCAN_FLAGS
252 };
253
CharCanBeKeyword(base::uc32 c)254 inline bool CharCanBeKeyword(base::uc32 c) {
255 return static_cast<uint32_t>(c) < arraysize(character_scan_flags) &&
256 CanBeKeyword(character_scan_flags[c]);
257 }
258
ScanIdentifierOrKeywordInner()259 V8_INLINE Token::Value Scanner::ScanIdentifierOrKeywordInner() {
260 DCHECK(IsIdentifierStart(c0_));
261 bool escaped = false;
262 bool can_be_keyword = true;
263
264 STATIC_ASSERT(arraysize(character_scan_flags) == kMaxAscii + 1);
265 if (V8_LIKELY(static_cast<uint32_t>(c0_) <= kMaxAscii)) {
266 if (V8_LIKELY(c0_ != '\\')) {
267 uint8_t scan_flags = character_scan_flags[c0_];
268 DCHECK(!TerminatesLiteral(scan_flags));
269 STATIC_ASSERT(static_cast<uint8_t>(ScanFlags::kCannotBeKeywordStart) ==
270 static_cast<uint8_t>(ScanFlags::kCannotBeKeyword) << 1);
271 scan_flags >>= 1;
272 // Make sure the shifting above doesn't set IdentifierNeedsSlowPath.
273 // Otherwise we'll fall into the slow path after scanning the identifier.
274 DCHECK(!IdentifierNeedsSlowPath(scan_flags));
275 AddLiteralChar(static_cast<char>(c0_));
276 AdvanceUntil([this, &scan_flags](base::uc32 c0) {
277 if (V8_UNLIKELY(static_cast<uint32_t>(c0) > kMaxAscii)) {
278 // A non-ascii character means we need to drop through to the slow
279 // path.
280 // TODO(leszeks): This would be most efficient as a goto to the slow
281 // path, check codegen and maybe use a bool instead.
282 scan_flags |=
283 static_cast<uint8_t>(ScanFlags::kIdentifierNeedsSlowPath);
284 return true;
285 }
286 uint8_t char_flags = character_scan_flags[c0];
287 scan_flags |= char_flags;
288 if (TerminatesLiteral(char_flags)) {
289 return true;
290 } else {
291 AddLiteralChar(static_cast<char>(c0));
292 return false;
293 }
294 });
295
296 if (V8_LIKELY(!IdentifierNeedsSlowPath(scan_flags))) {
297 if (!CanBeKeyword(scan_flags)) return Token::IDENTIFIER;
298 // Could be a keyword or identifier.
299 base::Vector<const uint8_t> chars =
300 next().literal_chars.one_byte_literal();
301 return KeywordOrIdentifierToken(chars.begin(), chars.length());
302 }
303
304 can_be_keyword = CanBeKeyword(scan_flags);
305 } else {
306 // Special case for escapes at the start of an identifier.
307 escaped = true;
308 base::uc32 c = ScanIdentifierUnicodeEscape();
309 DCHECK(!IsIdentifierStart(Invalid()));
310 if (c == '\\' || !IsIdentifierStart(c)) {
311 return Token::ILLEGAL;
312 }
313 AddLiteralChar(c);
314 can_be_keyword = CharCanBeKeyword(c);
315 }
316 }
317
318 return ScanIdentifierOrKeywordInnerSlow(escaped, can_be_keyword);
319 }
320
SkipWhiteSpace()321 V8_INLINE Token::Value Scanner::SkipWhiteSpace() {
322 int start_position = source_pos();
323
324 // We won't skip behind the end of input.
325 DCHECK(!IsWhiteSpaceOrLineTerminator(kEndOfInput));
326
327 // Advance as long as character is a WhiteSpace or LineTerminator.
328 while (IsWhiteSpaceOrLineTerminator(c0_)) {
329 if (!next().after_line_terminator && unibrow::IsLineTerminator(c0_)) {
330 next().after_line_terminator = true;
331 }
332 Advance();
333 }
334
335 // Return whether or not we skipped any characters.
336 if (source_pos() == start_position) {
337 DCHECK_NE('0', c0_);
338 return Token::ILLEGAL;
339 }
340
341 return Token::WHITESPACE;
342 }
343
ScanSingleToken()344 V8_INLINE Token::Value Scanner::ScanSingleToken() {
345 Token::Value token;
346 do {
347 next().location.beg_pos = source_pos();
348
349 if (V8_LIKELY(static_cast<unsigned>(c0_) <= kMaxAscii)) {
350 token = one_char_tokens[c0_];
351
352 switch (token) {
353 case Token::LPAREN:
354 case Token::RPAREN:
355 case Token::LBRACE:
356 case Token::RBRACE:
357 case Token::LBRACK:
358 case Token::RBRACK:
359 case Token::COLON:
360 case Token::SEMICOLON:
361 case Token::COMMA:
362 case Token::BIT_NOT:
363 case Token::ILLEGAL:
364 // One character tokens.
365 return Select(token);
366
367 case Token::CONDITIONAL:
368 // ? ?. ?? ??=
369 Advance();
370 if (c0_ == '.') {
371 Advance();
372 if (!IsDecimalDigit(c0_)) return Token::QUESTION_PERIOD;
373 PushBack('.');
374 } else if (c0_ == '?') {
375 return Select('=', Token::ASSIGN_NULLISH, Token::NULLISH);
376 }
377 return Token::CONDITIONAL;
378
379 case Token::STRING:
380 return ScanString();
381
382 case Token::LT:
383 // < <= << <<= <!--
384 Advance();
385 if (c0_ == '=') return Select(Token::LTE);
386 if (c0_ == '<') return Select('=', Token::ASSIGN_SHL, Token::SHL);
387 if (c0_ == '!') {
388 token = ScanHtmlComment();
389 continue;
390 }
391 return Token::LT;
392
393 case Token::GT:
394 // > >= >> >>= >>> >>>=
395 Advance();
396 if (c0_ == '=') return Select(Token::GTE);
397 if (c0_ == '>') {
398 // >> >>= >>> >>>=
399 Advance();
400 if (c0_ == '=') return Select(Token::ASSIGN_SAR);
401 if (c0_ == '>') return Select('=', Token::ASSIGN_SHR, Token::SHR);
402 return Token::SAR;
403 }
404 return Token::GT;
405
406 case Token::ASSIGN:
407 // = == === =>
408 Advance();
409 if (c0_ == '=') return Select('=', Token::EQ_STRICT, Token::EQ);
410 if (c0_ == '>') return Select(Token::ARROW);
411 return Token::ASSIGN;
412
413 case Token::NOT:
414 // ! != !==
415 Advance();
416 if (c0_ == '=') return Select('=', Token::NE_STRICT, Token::NE);
417 return Token::NOT;
418
419 case Token::ADD:
420 // + ++ +=
421 Advance();
422 if (c0_ == '+') return Select(Token::INC);
423 if (c0_ == '=') return Select(Token::ASSIGN_ADD);
424 return Token::ADD;
425
426 case Token::SUB:
427 // - -- --> -=
428 Advance();
429 if (c0_ == '-') {
430 Advance();
431 if (c0_ == '>' && next().after_line_terminator) {
432 // For compatibility with SpiderMonkey, we skip lines that
433 // start with an HTML comment end '-->'.
434 token = SkipSingleHTMLComment();
435 continue;
436 }
437 return Token::DEC;
438 }
439 if (c0_ == '=') return Select(Token::ASSIGN_SUB);
440 return Token::SUB;
441
442 case Token::MUL:
443 // * *=
444 Advance();
445 if (c0_ == '*') return Select('=', Token::ASSIGN_EXP, Token::EXP);
446 if (c0_ == '=') return Select(Token::ASSIGN_MUL);
447 return Token::MUL;
448
449 case Token::MOD:
450 // % %=
451 return Select('=', Token::ASSIGN_MOD, Token::MOD);
452
453 case Token::DIV:
454 // / // /* /=
455 Advance();
456 if (c0_ == '/') {
457 base::uc32 c = Peek();
458 if (c == '#' || c == '@') {
459 Advance();
460 Advance();
461 token = SkipSourceURLComment();
462 continue;
463 }
464 token = SkipSingleLineComment();
465 continue;
466 }
467 if (c0_ == '*') {
468 token = SkipMultiLineComment();
469 continue;
470 }
471 if (c0_ == '=') return Select(Token::ASSIGN_DIV);
472 return Token::DIV;
473
474 case Token::BIT_AND:
475 // & && &= &&=
476 Advance();
477 if (c0_ == '&') return Select('=', Token::ASSIGN_AND, Token::AND);
478 if (c0_ == '=') return Select(Token::ASSIGN_BIT_AND);
479 return Token::BIT_AND;
480
481 case Token::BIT_OR:
482 // | || |= ||=
483 Advance();
484 if (c0_ == '|') return Select('=', Token::ASSIGN_OR, Token::OR);
485 if (c0_ == '=') return Select(Token::ASSIGN_BIT_OR);
486 return Token::BIT_OR;
487
488 case Token::BIT_XOR:
489 // ^ ^=
490 return Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
491
492 case Token::PERIOD:
493 // . Number
494 Advance();
495 if (IsDecimalDigit(c0_)) return ScanNumber(true);
496 if (c0_ == '.') {
497 if (Peek() == '.') {
498 Advance();
499 Advance();
500 return Token::ELLIPSIS;
501 }
502 }
503 return Token::PERIOD;
504
505 case Token::TEMPLATE_SPAN:
506 Advance();
507 return ScanTemplateSpan();
508
509 case Token::PRIVATE_NAME:
510 if (source_pos() == 0 && Peek() == '!') {
511 token = SkipSingleLineComment();
512 continue;
513 }
514 return ScanPrivateName();
515
516 case Token::WHITESPACE:
517 token = SkipWhiteSpace();
518 continue;
519
520 case Token::NUMBER:
521 return ScanNumber(false);
522
523 case Token::IDENTIFIER:
524 return ScanIdentifierOrKeyword();
525
526 default:
527 UNREACHABLE();
528 }
529 }
530
531 if (IsIdentifierStart(c0_) ||
532 (CombineSurrogatePair() && IsIdentifierStart(c0_))) {
533 return ScanIdentifierOrKeyword();
534 }
535 if (c0_ == kEndOfInput) {
536 return source_->has_parser_error() ? Token::ILLEGAL : Token::EOS;
537 }
538 token = SkipWhiteSpace();
539
540 // Continue scanning for tokens as long as we're just skipping whitespace.
541 } while (token == Token::WHITESPACE);
542
543 return token;
544 }
545
Scan(TokenDesc * next_desc)546 void Scanner::Scan(TokenDesc* next_desc) {
547 DCHECK_EQ(next_desc, &next());
548
549 next_desc->token = ScanSingleToken();
550 DCHECK_IMPLIES(has_parser_error(), next_desc->token == Token::ILLEGAL);
551 next_desc->location.end_pos = source_pos();
552
553 #ifdef DEBUG
554 SanityCheckTokenDesc(current());
555 SanityCheckTokenDesc(next());
556 SanityCheckTokenDesc(next_next());
557 #endif
558 }
559
Scan()560 void Scanner::Scan() { Scan(next_); }
561
562 } // namespace internal
563 } // namespace v8
564
565 #endif // V8_PARSING_SCANNER_INL_H_
566