1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Features shared by parsing and pre-parsing scanners.
6
7 #include "src/parsing/scanner.h"
8
9 #include <stdint.h>
10
11 #include <cmath>
12
13 #include "src/ast/ast-value-factory.h"
14 #include "src/numbers/conversions-inl.h"
15 #include "src/objects/bigint.h"
16 #include "src/parsing/parse-info.h"
17 #include "src/parsing/scanner-inl.h"
18 #include "src/zone/zone.h"
19
20 namespace v8 {
21 namespace internal {
22
23 class Scanner::ErrorState {
24 public:
ErrorState(MessageTemplate * message_stack,Scanner::Location * location_stack)25 ErrorState(MessageTemplate* message_stack, Scanner::Location* location_stack)
26 : message_stack_(message_stack),
27 old_message_(*message_stack),
28 location_stack_(location_stack),
29 old_location_(*location_stack) {
30 *message_stack_ = MessageTemplate::kNone;
31 *location_stack_ = Location::invalid();
32 }
33
~ErrorState()34 ~ErrorState() {
35 *message_stack_ = old_message_;
36 *location_stack_ = old_location_;
37 }
38
MoveErrorTo(TokenDesc * dest)39 void MoveErrorTo(TokenDesc* dest) {
40 if (*message_stack_ == MessageTemplate::kNone) {
41 return;
42 }
43 if (dest->invalid_template_escape_message == MessageTemplate::kNone) {
44 dest->invalid_template_escape_message = *message_stack_;
45 dest->invalid_template_escape_location = *location_stack_;
46 }
47 *message_stack_ = MessageTemplate::kNone;
48 *location_stack_ = Location::invalid();
49 }
50
51 private:
52 MessageTemplate* const message_stack_;
53 MessageTemplate const old_message_;
54 Scanner::Location* const location_stack_;
55 Scanner::Location const old_location_;
56 };
57
58 // ----------------------------------------------------------------------------
59 // Scanner::BookmarkScope
60
61 const size_t Scanner::BookmarkScope::kNoBookmark =
62 std::numeric_limits<size_t>::max() - 1;
63 const size_t Scanner::BookmarkScope::kBookmarkWasApplied =
64 std::numeric_limits<size_t>::max();
65
Set(size_t position)66 void Scanner::BookmarkScope::Set(size_t position) {
67 DCHECK_EQ(bookmark_, kNoBookmark);
68 bookmark_ = position;
69 }
70
Apply()71 void Scanner::BookmarkScope::Apply() {
72 DCHECK(HasBeenSet()); // Caller hasn't called SetBookmark.
73 if (had_parser_error_) {
74 scanner_->set_parser_error();
75 } else {
76 scanner_->reset_parser_error_flag();
77 scanner_->SeekNext(bookmark_);
78 }
79 bookmark_ = kBookmarkWasApplied;
80 }
81
HasBeenSet() const82 bool Scanner::BookmarkScope::HasBeenSet() const {
83 return bookmark_ != kNoBookmark && bookmark_ != kBookmarkWasApplied;
84 }
85
HasBeenApplied() const86 bool Scanner::BookmarkScope::HasBeenApplied() const {
87 return bookmark_ == kBookmarkWasApplied;
88 }
89
90 // ----------------------------------------------------------------------------
91 // Scanner
92
Scanner(Utf16CharacterStream * source,UnoptimizedCompileFlags flags)93 Scanner::Scanner(Utf16CharacterStream* source, UnoptimizedCompileFlags flags)
94 : flags_(flags),
95 source_(source),
96 found_html_comment_(false),
97 octal_pos_(Location::invalid()),
98 octal_message_(MessageTemplate::kNone) {
99 DCHECK_NOT_NULL(source);
100 }
101
Initialize()102 void Scanner::Initialize() {
103 // Need to capture identifiers in order to recognize "get" and "set"
104 // in object literals.
105 Init();
106 next().after_line_terminator = true;
107 Scan();
108 }
109
110 // static
IsInvalid(uc32 c)111 bool Scanner::IsInvalid(uc32 c) {
112 DCHECK(c == Invalid() || base::IsInRange(c, 0u, String::kMaxCodePoint));
113 return c == Scanner::Invalid();
114 }
115
116 template <bool capture_raw, bool unicode>
ScanHexNumber(int expected_length)117 uc32 Scanner::ScanHexNumber(int expected_length) {
118 DCHECK_LE(expected_length, 4); // prevent overflow
119
120 int begin = source_pos() - 2;
121 uc32 x = 0;
122 for (int i = 0; i < expected_length; i++) {
123 int d = HexValue(c0_);
124 if (d < 0) {
125 ReportScannerError(Location(begin, begin + expected_length + 2),
126 unicode
127 ? MessageTemplate::kInvalidUnicodeEscapeSequence
128 : MessageTemplate::kInvalidHexEscapeSequence);
129 return Invalid();
130 }
131 x = x * 16 + d;
132 Advance<capture_raw>();
133 }
134
135 return x;
136 }
137
138 template <bool capture_raw>
ScanUnlimitedLengthHexNumber(uc32 max_value,int beg_pos)139 uc32 Scanner::ScanUnlimitedLengthHexNumber(uc32 max_value, int beg_pos) {
140 uc32 x = 0;
141 int d = HexValue(c0_);
142 if (d < 0) return Invalid();
143
144 while (d >= 0) {
145 x = x * 16 + d;
146 if (x > max_value) {
147 ReportScannerError(Location(beg_pos, source_pos() + 1),
148 MessageTemplate::kUndefinedUnicodeCodePoint);
149 return Invalid();
150 }
151 Advance<capture_raw>();
152 d = HexValue(c0_);
153 }
154
155 return x;
156 }
157
Next()158 Token::Value Scanner::Next() {
159 // Rotate through tokens.
160 TokenDesc* previous = current_;
161 current_ = next_;
162 // Either we already have the next token lined up, in which case next_next_
163 // simply becomes next_. In that case we use current_ as new next_next_ and
164 // clear its token to indicate that it wasn't scanned yet. Otherwise we use
165 // current_ as next_ and scan into it, leaving next_next_ uninitialized.
166 if (V8_LIKELY(next_next().token == Token::UNINITIALIZED)) {
167 next_ = previous;
168 // User 'previous' instead of 'next_' because for some reason the compiler
169 // thinks 'next_' could be modified before the entry into Scan.
170 previous->after_line_terminator = false;
171 Scan(previous);
172 } else {
173 next_ = next_next_;
174 next_next_ = previous;
175 previous->token = Token::UNINITIALIZED;
176 DCHECK_NE(Token::UNINITIALIZED, current().token);
177 }
178 return current().token;
179 }
180
PeekAhead()181 Token::Value Scanner::PeekAhead() {
182 DCHECK(next().token != Token::DIV);
183 DCHECK(next().token != Token::ASSIGN_DIV);
184
185 if (next_next().token != Token::UNINITIALIZED) {
186 return next_next().token;
187 }
188 TokenDesc* temp = next_;
189 next_ = next_next_;
190 next().after_line_terminator = false;
191 Scan();
192 next_next_ = next_;
193 next_ = temp;
194 return next_next().token;
195 }
196
SkipSingleHTMLComment()197 Token::Value Scanner::SkipSingleHTMLComment() {
198 if (flags_.is_module()) {
199 ReportScannerError(source_pos(), MessageTemplate::kHtmlCommentInModule);
200 return Token::ILLEGAL;
201 }
202 return SkipSingleLineComment();
203 }
204
SkipSingleLineComment()205 Token::Value Scanner::SkipSingleLineComment() {
206 // The line terminator at the end of the line is not considered
207 // to be part of the single-line comment; it is recognized
208 // separately by the lexical grammar and becomes part of the
209 // stream of input elements for the syntactic grammar (see
210 // ECMA-262, section 7.4).
211 AdvanceUntil([](uc32 c0_) { return unibrow::IsLineTerminator(c0_); });
212
213 return Token::WHITESPACE;
214 }
215
SkipSourceURLComment()216 Token::Value Scanner::SkipSourceURLComment() {
217 TryToParseSourceURLComment();
218 if (unibrow::IsLineTerminator(c0_) || c0_ == kEndOfInput) {
219 return Token::WHITESPACE;
220 }
221 return SkipSingleLineComment();
222 }
223
TryToParseSourceURLComment()224 void Scanner::TryToParseSourceURLComment() {
225 // Magic comments are of the form: //[#@]\s<name>=\s*<value>\s*.* and this
226 // function will just return if it cannot parse a magic comment.
227 DCHECK(!IsWhiteSpaceOrLineTerminator(kEndOfInput));
228 if (!IsWhiteSpace(c0_)) return;
229 Advance();
230 LiteralBuffer name;
231 name.Start();
232
233 while (c0_ != kEndOfInput && !IsWhiteSpaceOrLineTerminator(c0_) &&
234 c0_ != '=') {
235 name.AddChar(c0_);
236 Advance();
237 }
238 if (!name.is_one_byte()) return;
239 Vector<const uint8_t> name_literal = name.one_byte_literal();
240 LiteralBuffer* value;
241 if (name_literal == StaticOneByteVector("sourceURL")) {
242 value = &source_url_;
243 } else if (name_literal == StaticOneByteVector("sourceMappingURL")) {
244 value = &source_mapping_url_;
245 } else {
246 return;
247 }
248 if (c0_ != '=')
249 return;
250 value->Start();
251 Advance();
252 while (IsWhiteSpace(c0_)) {
253 Advance();
254 }
255 while (c0_ != kEndOfInput && !unibrow::IsLineTerminator(c0_)) {
256 // Disallowed characters.
257 if (c0_ == '"' || c0_ == '\'') {
258 value->Start();
259 return;
260 }
261 if (IsWhiteSpace(c0_)) {
262 break;
263 }
264 value->AddChar(c0_);
265 Advance();
266 }
267 // Allow whitespace at the end.
268 while (c0_ != kEndOfInput && !unibrow::IsLineTerminator(c0_)) {
269 if (!IsWhiteSpace(c0_)) {
270 value->Start();
271 break;
272 }
273 Advance();
274 }
275 }
276
SkipMultiLineComment()277 Token::Value Scanner::SkipMultiLineComment() {
278 DCHECK_EQ(c0_, '*');
279
280 // Until we see the first newline, check for * and newline characters.
281 if (!next().after_line_terminator) {
282 do {
283 AdvanceUntil([](uc32 c0) {
284 if (V8_UNLIKELY(static_cast<uint32_t>(c0) > kMaxAscii)) {
285 return unibrow::IsLineTerminator(c0);
286 }
287 uint8_t char_flags = character_scan_flags[c0];
288 return MultilineCommentCharacterNeedsSlowPath(char_flags);
289 });
290
291 while (c0_ == '*') {
292 Advance();
293 if (c0_ == '/') {
294 Advance();
295 return Token::WHITESPACE;
296 }
297 }
298
299 if (unibrow::IsLineTerminator(c0_)) {
300 next().after_line_terminator = true;
301 break;
302 }
303 } while (c0_ != kEndOfInput);
304 }
305
306 // After we've seen newline, simply try to find '*/'.
307 while (c0_ != kEndOfInput) {
308 AdvanceUntil([](uc32 c0) { return c0 == '*'; });
309
310 while (c0_ == '*') {
311 Advance();
312 if (c0_ == '/') {
313 Advance();
314 return Token::WHITESPACE;
315 }
316 }
317 }
318
319 return Token::ILLEGAL;
320 }
321
ScanHtmlComment()322 Token::Value Scanner::ScanHtmlComment() {
323 // Check for <!-- comments.
324 DCHECK_EQ(c0_, '!');
325 Advance();
326 if (c0_ != '-' || Peek() != '-') {
327 PushBack('!'); // undo Advance()
328 return Token::LT;
329 }
330 Advance();
331
332 found_html_comment_ = true;
333 return SkipSingleHTMLComment();
334 }
335
#ifdef DEBUG
// Debug-only consistency check: only TEMPLATE_* tokens may carry an
// invalid_template_escape_message.
void Scanner::SanityCheckTokenDesc(const TokenDesc& token) const {
  // UNINITIALIZED and ILLEGAL tokens are exempt as well: their
  // literal_chars and other members might be garbage, and that's ok.
  const bool exempt = token.token == Token::UNINITIALIZED ||
                      token.token == Token::ILLEGAL ||
                      token.token == Token::TEMPLATE_SPAN ||
                      token.token == Token::TEMPLATE_TAIL;
  if (exempt) return;

  DCHECK_EQ(token.invalid_template_escape_message, MessageTemplate::kNone);
}
#endif  // DEBUG
354
SeekForward(int pos)355 void Scanner::SeekForward(int pos) {
356 // After this call, we will have the token at the given position as
357 // the "next" token. The "current" token will be invalid.
358 if (pos == next().location.beg_pos) return;
359 int current_pos = source_pos();
360 DCHECK_EQ(next().location.end_pos, current_pos);
361 // Positions inside the lookahead token aren't supported.
362 DCHECK(pos >= current_pos);
363 if (pos != current_pos) {
364 source_->Seek(pos);
365 Advance();
366 // This function is only called to seek to the location
367 // of the end of a function (at the "}" token). It doesn't matter
368 // whether there was a line terminator in the part we skip.
369 next().after_line_terminator = false;
370 }
371 Scan();
372 }
373
374 template <bool capture_raw>
ScanEscape()375 bool Scanner::ScanEscape() {
376 uc32 c = c0_;
377 Advance<capture_raw>();
378
379 // Skip escaped newlines.
380 DCHECK(!unibrow::IsLineTerminator(kEndOfInput));
381 if (!capture_raw && unibrow::IsLineTerminator(c)) {
382 // Allow escaped CR+LF newlines in multiline string literals.
383 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
384 return true;
385 }
386
387 switch (c) {
388 case 'b' : c = '\b'; break;
389 case 'f' : c = '\f'; break;
390 case 'n' : c = '\n'; break;
391 case 'r' : c = '\r'; break;
392 case 't' : c = '\t'; break;
393 case 'u' : {
394 c = ScanUnicodeEscape<capture_raw>();
395 if (IsInvalid(c)) return false;
396 break;
397 }
398 case 'v':
399 c = '\v';
400 break;
401 case 'x': {
402 c = ScanHexNumber<capture_raw>(2);
403 if (IsInvalid(c)) return false;
404 break;
405 }
406 case '0':
407 case '1':
408 case '2':
409 case '3':
410 case '4':
411 case '5':
412 case '6':
413 case '7':
414 c = ScanOctalEscape<capture_raw>(c, 2);
415 break;
416 case '8':
417 case '9':
418 // '\8' and '\9' are disallowed in strict mode.
419 // Re-use the octal error state to propagate the error.
420 octal_pos_ = Location(source_pos() - 2, source_pos() - 1);
421 octal_message_ = capture_raw ? MessageTemplate::kTemplate8Or9Escape
422 : MessageTemplate::kStrict8Or9Escape;
423 break;
424 }
425
426 // Other escaped characters are interpreted as their non-escaped version.
427 AddLiteralChar(c);
428 return true;
429 }
430
431 template <bool capture_raw>
ScanOctalEscape(uc32 c,int length)432 uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
433 DCHECK('0' <= c && c <= '7');
434 uc32 x = c - '0';
435 int i = 0;
436 for (; i < length; i++) {
437 int d = c0_ - '0';
438 if (d < 0 || d > 7) break;
439 int nx = x * 8 + d;
440 if (nx >= 256) break;
441 x = nx;
442 Advance<capture_raw>();
443 }
444 // Anything except '\0' is an octal escape sequence, illegal in strict mode.
445 // Remember the position of octal escape sequences so that an error
446 // can be reported later (in strict mode).
447 // We don't report the error immediately, because the octal escape can
448 // occur before the "use strict" directive.
449 if (c != '0' || i > 0 || IsNonOctalDecimalDigit(c0_)) {
450 octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
451 octal_message_ = capture_raw ? MessageTemplate::kTemplateOctalLiteral
452 : MessageTemplate::kStrictOctalEscape;
453 }
454 return x;
455 }
456
ScanString()457 Token::Value Scanner::ScanString() {
458 uc32 quote = c0_;
459
460 next().literal_chars.Start();
461 while (true) {
462 AdvanceUntil([this](uc32 c0) {
463 if (V8_UNLIKELY(static_cast<uint32_t>(c0) > kMaxAscii)) {
464 if (V8_UNLIKELY(unibrow::IsStringLiteralLineTerminator(c0))) {
465 return true;
466 }
467 AddLiteralChar(c0);
468 return false;
469 }
470 uint8_t char_flags = character_scan_flags[c0];
471 if (MayTerminateString(char_flags)) return true;
472 AddLiteralChar(c0);
473 return false;
474 });
475
476 while (c0_ == '\\') {
477 Advance();
478 // TODO(verwaest): Check whether we can remove the additional check.
479 if (V8_UNLIKELY(c0_ == kEndOfInput || !ScanEscape<false>())) {
480 return Token::ILLEGAL;
481 }
482 }
483
484 if (c0_ == quote) {
485 Advance();
486 return Token::STRING;
487 }
488
489 if (V8_UNLIKELY(c0_ == kEndOfInput ||
490 unibrow::IsStringLiteralLineTerminator(c0_))) {
491 return Token::ILLEGAL;
492 }
493
494 AddLiteralChar(c0_);
495 }
496 }
497
ScanPrivateName()498 Token::Value Scanner::ScanPrivateName() {
499 next().literal_chars.Start();
500 DCHECK_EQ(c0_, '#');
501 DCHECK(!IsIdentifierStart(kEndOfInput));
502 if (!IsIdentifierStart(Peek())) {
503 ReportScannerError(source_pos(),
504 MessageTemplate::kInvalidOrUnexpectedToken);
505 return Token::ILLEGAL;
506 }
507
508 AddLiteralCharAdvance();
509 Token::Value token = ScanIdentifierOrKeywordInner();
510 return token == Token::ILLEGAL ? Token::ILLEGAL : Token::PRIVATE_NAME;
511 }
512
ScanTemplateSpan()513 Token::Value Scanner::ScanTemplateSpan() {
514 // When scanning a TemplateSpan, we are looking for the following construct:
515 // TEMPLATE_SPAN ::
516 // ` LiteralChars* ${
517 // | } LiteralChars* ${
518 //
519 // TEMPLATE_TAIL ::
520 // ` LiteralChars* `
521 // | } LiteralChar* `
522 //
523 // A TEMPLATE_SPAN should always be followed by an Expression, while a
524 // TEMPLATE_TAIL terminates a TemplateLiteral and does not need to be
525 // followed by an Expression.
526
527 // These scoped helpers save and restore the original error state, so that we
528 // can specially treat invalid escape sequences in templates (which are
529 // handled by the parser).
530 ErrorState scanner_error_state(&scanner_error_, &scanner_error_location_);
531 ErrorState octal_error_state(&octal_message_, &octal_pos_);
532
533 Token::Value result = Token::TEMPLATE_SPAN;
534 next().literal_chars.Start();
535 next().raw_literal_chars.Start();
536 const bool capture_raw = true;
537 while (true) {
538 uc32 c = c0_;
539 if (c == '`') {
540 Advance(); // Consume '`'
541 result = Token::TEMPLATE_TAIL;
542 break;
543 } else if (c == '$' && Peek() == '{') {
544 Advance(); // Consume '$'
545 Advance(); // Consume '{'
546 break;
547 } else if (c == '\\') {
548 Advance(); // Consume '\\'
549 DCHECK(!unibrow::IsLineTerminator(kEndOfInput));
550 if (capture_raw) AddRawLiteralChar('\\');
551 if (unibrow::IsLineTerminator(c0_)) {
552 // The TV of LineContinuation :: \ LineTerminatorSequence is the empty
553 // code unit sequence.
554 uc32 lastChar = c0_;
555 Advance();
556 if (lastChar == '\r') {
557 // Also skip \n.
558 if (c0_ == '\n') Advance();
559 lastChar = '\n';
560 }
561 if (capture_raw) AddRawLiteralChar(lastChar);
562 } else {
563 bool success = ScanEscape<capture_raw>();
564 USE(success);
565 DCHECK_EQ(!success, has_error());
566 // For templates, invalid escape sequence checking is handled in the
567 // parser.
568 scanner_error_state.MoveErrorTo(next_);
569 octal_error_state.MoveErrorTo(next_);
570 }
571 } else if (c == kEndOfInput) {
572 // Unterminated template literal
573 break;
574 } else {
575 Advance(); // Consume c.
576 // The TRV of LineTerminatorSequence :: <CR> is the CV 0x000A.
577 // The TRV of LineTerminatorSequence :: <CR><LF> is the sequence
578 // consisting of the CV 0x000A.
579 if (c == '\r') {
580 if (c0_ == '\n') Advance(); // Consume '\n'
581 c = '\n';
582 }
583 if (capture_raw) AddRawLiteralChar(c);
584 AddLiteralChar(c);
585 }
586 }
587 next().location.end_pos = source_pos();
588 next().token = result;
589
590 return result;
591 }
592
593 template <typename LocalIsolate>
SourceUrl(LocalIsolate * isolate) const594 Handle<String> Scanner::SourceUrl(LocalIsolate* isolate) const {
595 Handle<String> tmp;
596 if (source_url_.length() > 0) {
597 tmp = source_url_.Internalize(isolate);
598 }
599 return tmp;
600 }
601
602 template Handle<String> Scanner::SourceUrl(Isolate* isolate) const;
603 template Handle<String> Scanner::SourceUrl(LocalIsolate* isolate) const;
604
605 template <typename LocalIsolate>
SourceMappingUrl(LocalIsolate * isolate) const606 Handle<String> Scanner::SourceMappingUrl(LocalIsolate* isolate) const {
607 Handle<String> tmp;
608 if (source_mapping_url_.length() > 0) {
609 tmp = source_mapping_url_.Internalize(isolate);
610 }
611 return tmp;
612 }
613
614 template Handle<String> Scanner::SourceMappingUrl(Isolate* isolate) const;
615 template Handle<String> Scanner::SourceMappingUrl(LocalIsolate* isolate) const;
616
ScanDigitsWithNumericSeparators(bool (* predicate)(uc32 ch),bool is_check_first_digit)617 bool Scanner::ScanDigitsWithNumericSeparators(bool (*predicate)(uc32 ch),
618 bool is_check_first_digit) {
619 // we must have at least one digit after 'x'/'b'/'o'
620 if (is_check_first_digit && !predicate(c0_)) return false;
621
622 bool separator_seen = false;
623 while (predicate(c0_) || c0_ == '_') {
624 if (c0_ == '_') {
625 Advance();
626 if (c0_ == '_') {
627 ReportScannerError(Location(source_pos(), source_pos() + 1),
628 MessageTemplate::kContinuousNumericSeparator);
629 return false;
630 }
631 separator_seen = true;
632 continue;
633 }
634 separator_seen = false;
635 AddLiteralCharAdvance();
636 }
637
638 if (separator_seen) {
639 ReportScannerError(Location(source_pos(), source_pos() + 1),
640 MessageTemplate::kTrailingNumericSeparator);
641 return false;
642 }
643
644 return true;
645 }
646
ScanDecimalDigits(bool allow_numeric_separator)647 bool Scanner::ScanDecimalDigits(bool allow_numeric_separator) {
648 if (allow_numeric_separator) {
649 return ScanDigitsWithNumericSeparators(&IsDecimalDigit, false);
650 }
651 while (IsDecimalDigit(c0_)) {
652 AddLiteralCharAdvance();
653 }
654 if (c0_ == '_') {
655 ReportScannerError(Location(source_pos(), source_pos() + 1),
656 MessageTemplate::kInvalidOrUnexpectedToken);
657 return false;
658 }
659 return true;
660 }
661
ScanDecimalAsSmiWithNumericSeparators(uint64_t * value)662 bool Scanner::ScanDecimalAsSmiWithNumericSeparators(uint64_t* value) {
663 bool separator_seen = false;
664 while (IsDecimalDigit(c0_) || c0_ == '_') {
665 if (c0_ == '_') {
666 Advance();
667 if (c0_ == '_') {
668 ReportScannerError(Location(source_pos(), source_pos() + 1),
669 MessageTemplate::kContinuousNumericSeparator);
670 return false;
671 }
672 separator_seen = true;
673 continue;
674 }
675 separator_seen = false;
676 *value = 10 * *value + (c0_ - '0');
677 uc32 first_char = c0_;
678 Advance();
679 AddLiteralChar(first_char);
680 }
681
682 if (separator_seen) {
683 ReportScannerError(Location(source_pos(), source_pos() + 1),
684 MessageTemplate::kTrailingNumericSeparator);
685 return false;
686 }
687
688 return true;
689 }
690
ScanDecimalAsSmi(uint64_t * value,bool allow_numeric_separator)691 bool Scanner::ScanDecimalAsSmi(uint64_t* value, bool allow_numeric_separator) {
692 if (allow_numeric_separator) {
693 return ScanDecimalAsSmiWithNumericSeparators(value);
694 }
695
696 while (IsDecimalDigit(c0_)) {
697 *value = 10 * *value + (c0_ - '0');
698 uc32 first_char = c0_;
699 Advance();
700 AddLiteralChar(first_char);
701 }
702 return true;
703 }
704
ScanBinaryDigits()705 bool Scanner::ScanBinaryDigits() {
706 return ScanDigitsWithNumericSeparators(&IsBinaryDigit, true);
707 }
708
ScanOctalDigits()709 bool Scanner::ScanOctalDigits() {
710 return ScanDigitsWithNumericSeparators(&IsOctalDigit, true);
711 }
712
ScanImplicitOctalDigits(int start_pos,Scanner::NumberKind * kind)713 bool Scanner::ScanImplicitOctalDigits(int start_pos,
714 Scanner::NumberKind* kind) {
715 *kind = IMPLICIT_OCTAL;
716
717 while (true) {
718 // (possible) octal number
719 if (IsNonOctalDecimalDigit(c0_)) {
720 *kind = DECIMAL_WITH_LEADING_ZERO;
721 return true;
722 }
723 if (!IsOctalDigit(c0_)) {
724 // Octal literal finished.
725 octal_pos_ = Location(start_pos, source_pos());
726 octal_message_ = MessageTemplate::kStrictOctalLiteral;
727 return true;
728 }
729 AddLiteralCharAdvance();
730 }
731 }
732
ScanHexDigits()733 bool Scanner::ScanHexDigits() {
734 return ScanDigitsWithNumericSeparators(&IsHexDigit, true);
735 }
736
ScanSignedInteger()737 bool Scanner::ScanSignedInteger() {
738 if (c0_ == '+' || c0_ == '-') AddLiteralCharAdvance();
739 // we must have at least one decimal digit after 'e'/'E'
740 if (!IsDecimalDigit(c0_)) return false;
741 return ScanDecimalDigits(true);
742 }
743
ScanNumber(bool seen_period)744 Token::Value Scanner::ScanNumber(bool seen_period) {
745 DCHECK(IsDecimalDigit(c0_)); // the first digit of the number or the fraction
746
747 NumberKind kind = DECIMAL;
748
749 next().literal_chars.Start();
750 bool at_start = !seen_period;
751 int start_pos = source_pos(); // For reporting octal positions.
752 if (seen_period) {
753 // we have already seen a decimal point of the float
754 AddLiteralChar('.');
755 if (c0_ == '_') {
756 return Token::ILLEGAL;
757 }
758 // we know we have at least one digit
759 if (!ScanDecimalDigits(true)) return Token::ILLEGAL;
760 } else {
761 // if the first character is '0' we must check for octals and hex
762 if (c0_ == '0') {
763 AddLiteralCharAdvance();
764
765 // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or
766 // an octal number.
767 if (AsciiAlphaToLower(c0_) == 'x') {
768 AddLiteralCharAdvance();
769 kind = HEX;
770 if (!ScanHexDigits()) return Token::ILLEGAL;
771 } else if (AsciiAlphaToLower(c0_) == 'o') {
772 AddLiteralCharAdvance();
773 kind = OCTAL;
774 if (!ScanOctalDigits()) return Token::ILLEGAL;
775 } else if (AsciiAlphaToLower(c0_) == 'b') {
776 AddLiteralCharAdvance();
777 kind = BINARY;
778 if (!ScanBinaryDigits()) return Token::ILLEGAL;
779 } else if (IsOctalDigit(c0_)) {
780 kind = IMPLICIT_OCTAL;
781 if (!ScanImplicitOctalDigits(start_pos, &kind)) {
782 return Token::ILLEGAL;
783 }
784 if (kind == DECIMAL_WITH_LEADING_ZERO) {
785 at_start = false;
786 }
787 } else if (IsNonOctalDecimalDigit(c0_)) {
788 kind = DECIMAL_WITH_LEADING_ZERO;
789 } else if (c0_ == '_') {
790 ReportScannerError(Location(source_pos(), source_pos() + 1),
791 MessageTemplate::kZeroDigitNumericSeparator);
792 return Token::ILLEGAL;
793 }
794 }
795
796 // Parse decimal digits and allow trailing fractional part.
797 if (IsDecimalNumberKind(kind)) {
798 bool allow_numeric_separator = kind != DECIMAL_WITH_LEADING_ZERO;
799 // This is an optimization for parsing Decimal numbers as Smi's.
800 if (at_start) {
801 uint64_t value = 0;
802 // scan subsequent decimal digits
803 if (!ScanDecimalAsSmi(&value, allow_numeric_separator)) {
804 return Token::ILLEGAL;
805 }
806
807 if (next().literal_chars.one_byte_literal().length() <= 10 &&
808 value <= Smi::kMaxValue && c0_ != '.' && !IsIdentifierStart(c0_)) {
809 next().smi_value_ = static_cast<uint32_t>(value);
810
811 if (kind == DECIMAL_WITH_LEADING_ZERO) {
812 octal_pos_ = Location(start_pos, source_pos());
813 octal_message_ = MessageTemplate::kStrictDecimalWithLeadingZero;
814 }
815 return Token::SMI;
816 }
817 }
818
819 if (!ScanDecimalDigits(allow_numeric_separator)) {
820 return Token::ILLEGAL;
821 }
822 if (c0_ == '.') {
823 seen_period = true;
824 AddLiteralCharAdvance();
825 if (c0_ == '_') {
826 return Token::ILLEGAL;
827 }
828 if (!ScanDecimalDigits(true)) return Token::ILLEGAL;
829 }
830 }
831 }
832
833 bool is_bigint = false;
834 if (c0_ == 'n' && !seen_period && IsValidBigIntKind(kind)) {
835 // Check that the literal is within our limits for BigInt length.
836 // For simplicity, use 4 bits per character to calculate the maximum
837 // allowed literal length.
838 static const int kMaxBigIntCharacters = BigInt::kMaxLengthBits / 4;
839 int length = source_pos() - start_pos - (kind != DECIMAL ? 2 : 0);
840 if (length > kMaxBigIntCharacters) {
841 ReportScannerError(Location(start_pos, source_pos()),
842 MessageTemplate::kBigIntTooBig);
843 return Token::ILLEGAL;
844 }
845
846 is_bigint = true;
847 Advance();
848 } else if (AsciiAlphaToLower(c0_) == 'e') {
849 // scan exponent, if any
850 DCHECK(kind != HEX); // 'e'/'E' must be scanned as part of the hex number
851
852 if (!IsDecimalNumberKind(kind)) return Token::ILLEGAL;
853
854 // scan exponent
855 AddLiteralCharAdvance();
856
857 if (!ScanSignedInteger()) return Token::ILLEGAL;
858 }
859
860 // The source character immediately following a numeric literal must
861 // not be an identifier start or a decimal digit; see ECMA-262
862 // section 7.8.3, page 17 (note that we read only one decimal digit
863 // if the value is 0).
864 if (IsDecimalDigit(c0_) || IsIdentifierStart(c0_)) {
865 return Token::ILLEGAL;
866 }
867
868 if (kind == DECIMAL_WITH_LEADING_ZERO) {
869 octal_pos_ = Location(start_pos, source_pos());
870 octal_message_ = MessageTemplate::kStrictDecimalWithLeadingZero;
871 }
872
873 return is_bigint ? Token::BIGINT : Token::NUMBER;
874 }
875
ScanIdentifierUnicodeEscape()876 uc32 Scanner::ScanIdentifierUnicodeEscape() {
877 Advance();
878 if (c0_ != 'u') return Invalid();
879 Advance();
880 return ScanUnicodeEscape<false>();
881 }
882
883 template <bool capture_raw>
ScanUnicodeEscape()884 uc32 Scanner::ScanUnicodeEscape() {
885 // Accept both \uxxxx and \u{xxxxxx}. In the latter case, the number of
886 // hex digits between { } is arbitrary. \ and u have already been read.
887 if (c0_ == '{') {
888 int begin = source_pos() - 2;
889 Advance<capture_raw>();
890 uc32 cp =
891 ScanUnlimitedLengthHexNumber<capture_raw>(String::kMaxCodePoint, begin);
892 if (cp == kInvalidSequence || c0_ != '}') {
893 ReportScannerError(source_pos(),
894 MessageTemplate::kInvalidUnicodeEscapeSequence);
895 return Invalid();
896 }
897 Advance<capture_raw>();
898 return cp;
899 }
900 const bool unicode = true;
901 return ScanHexNumber<capture_raw, unicode>(4);
902 }
903
ScanIdentifierOrKeywordInnerSlow(bool escaped,bool can_be_keyword)904 Token::Value Scanner::ScanIdentifierOrKeywordInnerSlow(bool escaped,
905 bool can_be_keyword) {
906 while (true) {
907 if (c0_ == '\\') {
908 escaped = true;
909 uc32 c = ScanIdentifierUnicodeEscape();
910 // Only allow legal identifier part characters.
911 // TODO(verwaest): Make this true.
912 // DCHECK(!IsIdentifierPart('\'));
913 DCHECK(!IsIdentifierPart(Invalid()));
914 if (c == '\\' || !IsIdentifierPart(c)) {
915 return Token::ILLEGAL;
916 }
917 can_be_keyword = can_be_keyword && CharCanBeKeyword(c);
918 AddLiteralChar(c);
919 } else if (IsIdentifierPart(c0_) ||
920 (CombineSurrogatePair() && IsIdentifierPart(c0_))) {
921 can_be_keyword = can_be_keyword && CharCanBeKeyword(c0_);
922 AddLiteralCharAdvance();
923 } else {
924 break;
925 }
926 }
927
928 if (can_be_keyword && next().literal_chars.is_one_byte()) {
929 Vector<const uint8_t> chars = next().literal_chars.one_byte_literal();
930 Token::Value token =
931 KeywordOrIdentifierToken(chars.begin(), chars.length());
932 if (base::IsInRange(token, Token::IDENTIFIER, Token::YIELD)) return token;
933
934 if (token == Token::FUTURE_STRICT_RESERVED_WORD) {
935 if (escaped) return Token::ESCAPED_STRICT_RESERVED_WORD;
936 return token;
937 }
938
939 if (!escaped) return token;
940
941 STATIC_ASSERT(Token::LET + 1 == Token::STATIC);
942 if (base::IsInRange(token, Token::LET, Token::STATIC)) {
943 return Token::ESCAPED_STRICT_RESERVED_WORD;
944 }
945 return Token::ESCAPED_KEYWORD;
946 }
947
948 return Token::IDENTIFIER;
949 }
950
ScanRegExpPattern()951 bool Scanner::ScanRegExpPattern() {
952 DCHECK_EQ(Token::UNINITIALIZED, next_next().token);
953 DCHECK(next().token == Token::DIV || next().token == Token::ASSIGN_DIV);
954
955 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
956 bool in_character_class = false;
957
958 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
959 // the scanner should pass uninterpreted bodies to the RegExp
960 // constructor.
961 next().literal_chars.Start();
962 if (next().token == Token::ASSIGN_DIV) {
963 AddLiteralChar('=');
964 }
965
966 while (c0_ != '/' || in_character_class) {
967 if (c0_ == kEndOfInput || unibrow::IsLineTerminator(c0_)) {
968 return false;
969 }
970 if (c0_ == '\\') { // Escape sequence.
971 AddLiteralCharAdvance();
972 if (c0_ == kEndOfInput || unibrow::IsLineTerminator(c0_)) {
973 return false;
974 }
975 AddLiteralCharAdvance();
976 // If the escape allows more characters, i.e., \x??, \u????, or \c?,
977 // only "safe" characters are allowed (letters, digits, underscore),
978 // otherwise the escape isn't valid and the invalid character has
979 // its normal meaning. I.e., we can just continue scanning without
980 // worrying whether the following characters are part of the escape
981 // or not, since any '/', '\\' or '[' is guaranteed to not be part
982 // of the escape sequence.
983
984 // TODO(896): At some point, parse RegExps more thoroughly to capture
985 // octal esacpes in strict mode.
986 } else { // Unescaped character.
987 if (c0_ == '[') in_character_class = true;
988 if (c0_ == ']') in_character_class = false;
989 AddLiteralCharAdvance();
990 }
991 }
992 Advance(); // consume '/'
993
994 next().token = Token::REGEXP_LITERAL;
995 return true;
996 }
997
ScanRegExpFlags()998 Maybe<int> Scanner::ScanRegExpFlags() {
999 DCHECK_EQ(Token::REGEXP_LITERAL, next().token);
1000
1001 // Scan regular expression flags.
1002 JSRegExp::Flags flags;
1003 while (IsIdentifierPart(c0_)) {
1004 base::Optional<JSRegExp::Flags> maybe_flag = JSRegExp::FlagFromChar(c0_);
1005 if (!maybe_flag.has_value()) return Nothing<int>();
1006 JSRegExp::Flags flag = *maybe_flag;
1007 if (flags & flag) return Nothing<int>();
1008 Advance();
1009 flags |= flag;
1010 }
1011
1012 next().location.end_pos = source_pos();
1013 return Just<int>(flags);
1014 }
1015
CurrentSymbol(AstValueFactory * ast_value_factory) const1016 const AstRawString* Scanner::CurrentSymbol(
1017 AstValueFactory* ast_value_factory) const {
1018 if (is_literal_one_byte()) {
1019 return ast_value_factory->GetOneByteString(literal_one_byte_string());
1020 }
1021 return ast_value_factory->GetTwoByteString(literal_two_byte_string());
1022 }
1023
NextSymbol(AstValueFactory * ast_value_factory) const1024 const AstRawString* Scanner::NextSymbol(
1025 AstValueFactory* ast_value_factory) const {
1026 if (is_next_literal_one_byte()) {
1027 return ast_value_factory->GetOneByteString(next_literal_one_byte_string());
1028 }
1029 return ast_value_factory->GetTwoByteString(next_literal_two_byte_string());
1030 }
1031
CurrentRawSymbol(AstValueFactory * ast_value_factory) const1032 const AstRawString* Scanner::CurrentRawSymbol(
1033 AstValueFactory* ast_value_factory) const {
1034 if (is_raw_literal_one_byte()) {
1035 return ast_value_factory->GetOneByteString(raw_literal_one_byte_string());
1036 }
1037 return ast_value_factory->GetTwoByteString(raw_literal_two_byte_string());
1038 }
1039
1040
DoubleValue()1041 double Scanner::DoubleValue() {
1042 DCHECK(is_literal_one_byte());
1043 return StringToDouble(
1044 literal_one_byte_string(),
1045 ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY);
1046 }
1047
CurrentLiteralAsCString(Zone * zone) const1048 const char* Scanner::CurrentLiteralAsCString(Zone* zone) const {
1049 DCHECK(is_literal_one_byte());
1050 Vector<const uint8_t> vector = literal_one_byte_string();
1051 int length = vector.length();
1052 char* buffer = zone->NewArray<char>(length + 1);
1053 memcpy(buffer, vector.begin(), length);
1054 buffer[length] = '\0';
1055 return buffer;
1056 }
1057
SeekNext(size_t position)1058 void Scanner::SeekNext(size_t position) {
1059 // Use with care: This cleanly resets most, but not all scanner state.
1060 // TODO(vogelheim): Fix this, or at least DCHECK the relevant conditions.
1061
1062 // To re-scan from a given character position, we need to:
1063 // 1, Reset the current_, next_ and next_next_ tokens
1064 // (next_ + next_next_ will be overwrittem by Next(),
1065 // current_ will remain unchanged, so overwrite it fully.)
1066 for (TokenDesc& token : token_storage_) {
1067 token.token = Token::UNINITIALIZED;
1068 token.invalid_template_escape_message = MessageTemplate::kNone;
1069 }
1070 // 2, reset the source to the desired position,
1071 source_->Seek(position);
1072 // 3, re-scan, by scanning the look-ahead char + 1 token (next_).
1073 c0_ = source_->Advance();
1074 next().after_line_terminator = false;
1075 Scan();
1076 DCHECK_EQ(next().location.beg_pos, static_cast<int>(position));
1077 }
1078
1079 } // namespace internal
1080 } // namespace v8
1081