1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Features shared by parsing and pre-parsing scanners.
6
7 #include "src/parsing/scanner.h"
8
9 #include <stdint.h>
10
11 #include <cmath>
12
13 #include "src/ast/ast-value-factory.h"
14 #include "src/base/platform/wrappers.h"
15 #include "src/base/strings.h"
16 #include "src/numbers/conversions-inl.h"
17 #include "src/objects/bigint.h"
18 #include "src/parsing/parse-info.h"
19 #include "src/parsing/scanner-inl.h"
20 #include "src/zone/zone.h"
21
22 namespace v8 {
23 namespace internal {
24
25 class Scanner::ErrorState {
26 public:
ErrorState(MessageTemplate * message_stack,Scanner::Location * location_stack)27 ErrorState(MessageTemplate* message_stack, Scanner::Location* location_stack)
28 : message_stack_(message_stack),
29 old_message_(*message_stack),
30 location_stack_(location_stack),
31 old_location_(*location_stack) {
32 *message_stack_ = MessageTemplate::kNone;
33 *location_stack_ = Location::invalid();
34 }
35
~ErrorState()36 ~ErrorState() {
37 *message_stack_ = old_message_;
38 *location_stack_ = old_location_;
39 }
40
MoveErrorTo(TokenDesc * dest)41 void MoveErrorTo(TokenDesc* dest) {
42 if (*message_stack_ == MessageTemplate::kNone) {
43 return;
44 }
45 if (dest->invalid_template_escape_message == MessageTemplate::kNone) {
46 dest->invalid_template_escape_message = *message_stack_;
47 dest->invalid_template_escape_location = *location_stack_;
48 }
49 *message_stack_ = MessageTemplate::kNone;
50 *location_stack_ = Location::invalid();
51 }
52
53 private:
54 MessageTemplate* const message_stack_;
55 MessageTemplate const old_message_;
56 Scanner::Location* const location_stack_;
57 Scanner::Location const old_location_;
58 };
59
60 // ----------------------------------------------------------------------------
61 // Scanner::BookmarkScope
62
63 const size_t Scanner::BookmarkScope::kNoBookmark =
64 std::numeric_limits<size_t>::max() - 1;
65 const size_t Scanner::BookmarkScope::kBookmarkWasApplied =
66 std::numeric_limits<size_t>::max();
67
Set(size_t position)68 void Scanner::BookmarkScope::Set(size_t position) {
69 DCHECK_EQ(bookmark_, kNoBookmark);
70 bookmark_ = position;
71 }
72
Apply()73 void Scanner::BookmarkScope::Apply() {
74 DCHECK(HasBeenSet()); // Caller hasn't called SetBookmark.
75 if (had_parser_error_) {
76 scanner_->set_parser_error();
77 } else {
78 scanner_->reset_parser_error_flag();
79 scanner_->SeekNext(bookmark_);
80 }
81 bookmark_ = kBookmarkWasApplied;
82 }
83
HasBeenSet() const84 bool Scanner::BookmarkScope::HasBeenSet() const {
85 return bookmark_ != kNoBookmark && bookmark_ != kBookmarkWasApplied;
86 }
87
HasBeenApplied() const88 bool Scanner::BookmarkScope::HasBeenApplied() const {
89 return bookmark_ == kBookmarkWasApplied;
90 }
91
92 // ----------------------------------------------------------------------------
93 // Scanner
94
Scanner(Utf16CharacterStream * source,UnoptimizedCompileFlags flags)95 Scanner::Scanner(Utf16CharacterStream* source, UnoptimizedCompileFlags flags)
96 : flags_(flags),
97 source_(source),
98 found_html_comment_(false),
99 octal_pos_(Location::invalid()),
100 octal_message_(MessageTemplate::kNone) {
101 DCHECK_NOT_NULL(source);
102 }
103
Initialize()104 void Scanner::Initialize() {
105 // Need to capture identifiers in order to recognize "get" and "set"
106 // in object literals.
107 Init();
108 next().after_line_terminator = true;
109 Scan();
110 }
111
112 // static
IsInvalid(base::uc32 c)113 bool Scanner::IsInvalid(base::uc32 c) {
114 DCHECK(c == Invalid() || base::IsInRange(c, 0u, String::kMaxCodePoint));
115 return c == Scanner::Invalid();
116 }
117
118 template <bool capture_raw, bool unicode>
ScanHexNumber(int expected_length)119 base::uc32 Scanner::ScanHexNumber(int expected_length) {
120 DCHECK_LE(expected_length, 4); // prevent overflow
121
122 int begin = source_pos() - 2;
123 base::uc32 x = 0;
124 for (int i = 0; i < expected_length; i++) {
125 int d = base::HexValue(c0_);
126 if (d < 0) {
127 ReportScannerError(Location(begin, begin + expected_length + 2),
128 unicode
129 ? MessageTemplate::kInvalidUnicodeEscapeSequence
130 : MessageTemplate::kInvalidHexEscapeSequence);
131 return Invalid();
132 }
133 x = x * 16 + d;
134 Advance<capture_raw>();
135 }
136
137 return x;
138 }
139
140 template <bool capture_raw>
ScanUnlimitedLengthHexNumber(base::uc32 max_value,int beg_pos)141 base::uc32 Scanner::ScanUnlimitedLengthHexNumber(base::uc32 max_value,
142 int beg_pos) {
143 base::uc32 x = 0;
144 int d = base::HexValue(c0_);
145 if (d < 0) return Invalid();
146
147 while (d >= 0) {
148 x = x * 16 + d;
149 if (x > max_value) {
150 ReportScannerError(Location(beg_pos, source_pos() + 1),
151 MessageTemplate::kUndefinedUnicodeCodePoint);
152 return Invalid();
153 }
154 Advance<capture_raw>();
155 d = base::HexValue(c0_);
156 }
157
158 return x;
159 }
160
Next()161 Token::Value Scanner::Next() {
162 // Rotate through tokens.
163 TokenDesc* previous = current_;
164 current_ = next_;
165 // Either we already have the next token lined up, in which case next_next_
166 // simply becomes next_. In that case we use current_ as new next_next_ and
167 // clear its token to indicate that it wasn't scanned yet. Otherwise we use
168 // current_ as next_ and scan into it, leaving next_next_ uninitialized.
169 if (V8_LIKELY(next_next().token == Token::UNINITIALIZED)) {
170 next_ = previous;
171 // User 'previous' instead of 'next_' because for some reason the compiler
172 // thinks 'next_' could be modified before the entry into Scan.
173 previous->after_line_terminator = false;
174 Scan(previous);
175 } else {
176 next_ = next_next_;
177 next_next_ = previous;
178 previous->token = Token::UNINITIALIZED;
179 DCHECK_NE(Token::UNINITIALIZED, current().token);
180 }
181 return current().token;
182 }
183
PeekAhead()184 Token::Value Scanner::PeekAhead() {
185 DCHECK(next().token != Token::DIV);
186 DCHECK(next().token != Token::ASSIGN_DIV);
187
188 if (next_next().token != Token::UNINITIALIZED) {
189 return next_next().token;
190 }
191 TokenDesc* temp = next_;
192 next_ = next_next_;
193 next().after_line_terminator = false;
194 Scan();
195 next_next_ = next_;
196 next_ = temp;
197 return next_next().token;
198 }
199
SkipSingleHTMLComment()200 Token::Value Scanner::SkipSingleHTMLComment() {
201 if (flags_.is_module()) {
202 ReportScannerError(source_pos(), MessageTemplate::kHtmlCommentInModule);
203 return Token::ILLEGAL;
204 }
205 return SkipSingleLineComment();
206 }
207
SkipSingleLineComment()208 Token::Value Scanner::SkipSingleLineComment() {
209 // The line terminator at the end of the line is not considered
210 // to be part of the single-line comment; it is recognized
211 // separately by the lexical grammar and becomes part of the
212 // stream of input elements for the syntactic grammar (see
213 // ECMA-262, section 7.4).
214 AdvanceUntil([](base::uc32 c0) { return unibrow::IsLineTerminator(c0); });
215
216 return Token::WHITESPACE;
217 }
218
SkipSourceURLComment()219 Token::Value Scanner::SkipSourceURLComment() {
220 TryToParseSourceURLComment();
221 if (unibrow::IsLineTerminator(c0_) || c0_ == kEndOfInput) {
222 return Token::WHITESPACE;
223 }
224 return SkipSingleLineComment();
225 }
226
TryToParseSourceURLComment()227 void Scanner::TryToParseSourceURLComment() {
228 // Magic comments are of the form: //[#@]\s<name>=\s*<value>\s*.* and this
229 // function will just return if it cannot parse a magic comment.
230 DCHECK(!IsWhiteSpaceOrLineTerminator(kEndOfInput));
231 if (!IsWhiteSpace(c0_)) return;
232 Advance();
233 LiteralBuffer name;
234 name.Start();
235
236 while (c0_ != kEndOfInput && !IsWhiteSpaceOrLineTerminator(c0_) &&
237 c0_ != '=') {
238 name.AddChar(c0_);
239 Advance();
240 }
241 if (!name.is_one_byte()) return;
242 base::Vector<const uint8_t> name_literal = name.one_byte_literal();
243 LiteralBuffer* value;
244 if (name_literal == base::StaticOneByteVector("sourceURL")) {
245 value = &source_url_;
246 } else if (name_literal == base::StaticOneByteVector("sourceMappingURL")) {
247 value = &source_mapping_url_;
248 } else {
249 return;
250 }
251 if (c0_ != '=')
252 return;
253 value->Start();
254 Advance();
255 while (IsWhiteSpace(c0_)) {
256 Advance();
257 }
258 while (c0_ != kEndOfInput && !unibrow::IsLineTerminator(c0_)) {
259 if (IsWhiteSpace(c0_)) {
260 break;
261 }
262 value->AddChar(c0_);
263 Advance();
264 }
265 // Allow whitespace at the end.
266 while (c0_ != kEndOfInput && !unibrow::IsLineTerminator(c0_)) {
267 if (!IsWhiteSpace(c0_)) {
268 value->Start();
269 break;
270 }
271 Advance();
272 }
273 }
274
SkipMultiLineComment()275 Token::Value Scanner::SkipMultiLineComment() {
276 DCHECK_EQ(c0_, '*');
277
278 // Until we see the first newline, check for * and newline characters.
279 if (!next().after_line_terminator) {
280 do {
281 AdvanceUntil([](base::uc32 c0) {
282 if (V8_UNLIKELY(static_cast<uint32_t>(c0) > kMaxAscii)) {
283 return unibrow::IsLineTerminator(c0);
284 }
285 uint8_t char_flags = character_scan_flags[c0];
286 return MultilineCommentCharacterNeedsSlowPath(char_flags);
287 });
288
289 while (c0_ == '*') {
290 Advance();
291 if (c0_ == '/') {
292 Advance();
293 return Token::WHITESPACE;
294 }
295 }
296
297 if (unibrow::IsLineTerminator(c0_)) {
298 next().after_line_terminator = true;
299 break;
300 }
301 } while (c0_ != kEndOfInput);
302 }
303
304 // After we've seen newline, simply try to find '*/'.
305 while (c0_ != kEndOfInput) {
306 AdvanceUntil([](base::uc32 c0) { return c0 == '*'; });
307
308 while (c0_ == '*') {
309 Advance();
310 if (c0_ == '/') {
311 Advance();
312 return Token::WHITESPACE;
313 }
314 }
315 }
316
317 return Token::ILLEGAL;
318 }
319
ScanHtmlComment()320 Token::Value Scanner::ScanHtmlComment() {
321 // Check for <!-- comments.
322 DCHECK_EQ(c0_, '!');
323 Advance();
324 if (c0_ != '-' || Peek() != '-') {
325 PushBack('!'); // undo Advance()
326 return Token::LT;
327 }
328 Advance();
329
330 found_html_comment_ = true;
331 return SkipSingleHTMLComment();
332 }
333
#ifdef DEBUG
// Debug-only consistency check on a token descriptor.
void Scanner::SanityCheckTokenDesc(const TokenDesc& token) const {
  // Only TEMPLATE_* tokens can have an invalid_template_escape_message.
  // ILLEGAL and UNINITIALIZED can have garbage for the field (their
  // literal_chars and other members might be garbage as well; that's ok).
  const bool field_is_unconstrained = token.token == Token::UNINITIALIZED ||
                                      token.token == Token::ILLEGAL ||
                                      token.token == Token::TEMPLATE_SPAN ||
                                      token.token == Token::TEMPLATE_TAIL;
  if (field_is_unconstrained) return;

  DCHECK_EQ(token.invalid_template_escape_message, MessageTemplate::kNone);
}
#endif  // DEBUG
352
SeekForward(int pos)353 void Scanner::SeekForward(int pos) {
354 // After this call, we will have the token at the given position as
355 // the "next" token. The "current" token will be invalid.
356 if (pos == next().location.beg_pos) return;
357 int current_pos = source_pos();
358 DCHECK_EQ(next().location.end_pos, current_pos);
359 // Positions inside the lookahead token aren't supported.
360 DCHECK(pos >= current_pos);
361 if (pos != current_pos) {
362 source_->Seek(pos);
363 Advance();
364 // This function is only called to seek to the location
365 // of the end of a function (at the "}" token). It doesn't matter
366 // whether there was a line terminator in the part we skip.
367 next().after_line_terminator = false;
368 }
369 Scan();
370 }
371
372 template <bool capture_raw>
ScanEscape()373 bool Scanner::ScanEscape() {
374 base::uc32 c = c0_;
375 Advance<capture_raw>();
376
377 // Skip escaped newlines.
378 DCHECK(!unibrow::IsLineTerminator(kEndOfInput));
379 if (!capture_raw && unibrow::IsLineTerminator(c)) {
380 // Allow escaped CR+LF newlines in multiline string literals.
381 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
382 return true;
383 }
384
385 switch (c) {
386 case 'b' : c = '\b'; break;
387 case 'f' : c = '\f'; break;
388 case 'n' : c = '\n'; break;
389 case 'r' : c = '\r'; break;
390 case 't' : c = '\t'; break;
391 case 'u' : {
392 c = ScanUnicodeEscape<capture_raw>();
393 if (IsInvalid(c)) return false;
394 break;
395 }
396 case 'v':
397 c = '\v';
398 break;
399 case 'x': {
400 c = ScanHexNumber<capture_raw>(2);
401 if (IsInvalid(c)) return false;
402 break;
403 }
404 case '0':
405 case '1':
406 case '2':
407 case '3':
408 case '4':
409 case '5':
410 case '6':
411 case '7':
412 c = ScanOctalEscape<capture_raw>(c, 2);
413 break;
414 case '8':
415 case '9':
416 // '\8' and '\9' are disallowed in strict mode.
417 // Re-use the octal error state to propagate the error.
418 octal_pos_ = Location(source_pos() - 2, source_pos() - 1);
419 octal_message_ = capture_raw ? MessageTemplate::kTemplate8Or9Escape
420 : MessageTemplate::kStrict8Or9Escape;
421 break;
422 }
423
424 // Other escaped characters are interpreted as their non-escaped version.
425 AddLiteralChar(c);
426 return true;
427 }
428
429 template <bool capture_raw>
ScanOctalEscape(base::uc32 c,int length)430 base::uc32 Scanner::ScanOctalEscape(base::uc32 c, int length) {
431 DCHECK('0' <= c && c <= '7');
432 base::uc32 x = c - '0';
433 int i = 0;
434 for (; i < length; i++) {
435 int d = c0_ - '0';
436 if (d < 0 || d > 7) break;
437 int nx = x * 8 + d;
438 if (nx >= 256) break;
439 x = nx;
440 Advance<capture_raw>();
441 }
442 // Anything except '\0' is an octal escape sequence, illegal in strict mode.
443 // Remember the position of octal escape sequences so that an error
444 // can be reported later (in strict mode).
445 // We don't report the error immediately, because the octal escape can
446 // occur before the "use strict" directive.
447 if (c != '0' || i > 0 || IsNonOctalDecimalDigit(c0_)) {
448 octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
449 octal_message_ = capture_raw ? MessageTemplate::kTemplateOctalLiteral
450 : MessageTemplate::kStrictOctalEscape;
451 }
452 return x;
453 }
454
ScanString()455 Token::Value Scanner::ScanString() {
456 base::uc32 quote = c0_;
457
458 next().literal_chars.Start();
459 while (true) {
460 AdvanceUntil([this](base::uc32 c0) {
461 if (V8_UNLIKELY(static_cast<uint32_t>(c0) > kMaxAscii)) {
462 if (V8_UNLIKELY(unibrow::IsStringLiteralLineTerminator(c0))) {
463 return true;
464 }
465 AddLiteralChar(c0);
466 return false;
467 }
468 uint8_t char_flags = character_scan_flags[c0];
469 if (MayTerminateString(char_flags)) return true;
470 AddLiteralChar(c0);
471 return false;
472 });
473
474 while (c0_ == '\\') {
475 Advance();
476 // TODO(verwaest): Check whether we can remove the additional check.
477 if (V8_UNLIKELY(c0_ == kEndOfInput || !ScanEscape<false>())) {
478 return Token::ILLEGAL;
479 }
480 }
481
482 if (c0_ == quote) {
483 Advance();
484 return Token::STRING;
485 }
486
487 if (V8_UNLIKELY(c0_ == kEndOfInput ||
488 unibrow::IsStringLiteralLineTerminator(c0_))) {
489 return Token::ILLEGAL;
490 }
491
492 AddLiteralChar(c0_);
493 }
494 }
495
ScanPrivateName()496 Token::Value Scanner::ScanPrivateName() {
497 next().literal_chars.Start();
498 DCHECK_EQ(c0_, '#');
499 DCHECK(!IsIdentifierStart(kEndOfInput));
500 int pos = source_pos();
501 Advance();
502 if (IsIdentifierStart(c0_) ||
503 (CombineSurrogatePair() && IsIdentifierStart(c0_))) {
504 AddLiteralChar('#');
505 Token::Value token = ScanIdentifierOrKeywordInner();
506 return token == Token::ILLEGAL ? Token::ILLEGAL : Token::PRIVATE_NAME;
507 }
508
509 ReportScannerError(pos, MessageTemplate::kInvalidOrUnexpectedToken);
510 return Token::ILLEGAL;
511 }
512
ScanTemplateSpan()513 Token::Value Scanner::ScanTemplateSpan() {
514 // When scanning a TemplateSpan, we are looking for the following construct:
515 // TEMPLATE_SPAN ::
516 // ` LiteralChars* ${
517 // | } LiteralChars* ${
518 //
519 // TEMPLATE_TAIL ::
520 // ` LiteralChars* `
521 // | } LiteralChar* `
522 //
523 // A TEMPLATE_SPAN should always be followed by an Expression, while a
524 // TEMPLATE_TAIL terminates a TemplateLiteral and does not need to be
525 // followed by an Expression.
526
527 // These scoped helpers save and restore the original error state, so that we
528 // can specially treat invalid escape sequences in templates (which are
529 // handled by the parser).
530 ErrorState scanner_error_state(&scanner_error_, &scanner_error_location_);
531 ErrorState octal_error_state(&octal_message_, &octal_pos_);
532
533 Token::Value result = Token::TEMPLATE_SPAN;
534 next().literal_chars.Start();
535 next().raw_literal_chars.Start();
536 const bool capture_raw = true;
537 while (true) {
538 base::uc32 c = c0_;
539 if (c == '`') {
540 Advance(); // Consume '`'
541 result = Token::TEMPLATE_TAIL;
542 break;
543 } else if (c == '$' && Peek() == '{') {
544 Advance(); // Consume '$'
545 Advance(); // Consume '{'
546 break;
547 } else if (c == '\\') {
548 Advance(); // Consume '\\'
549 DCHECK(!unibrow::IsLineTerminator(kEndOfInput));
550 if (capture_raw) AddRawLiteralChar('\\');
551 if (unibrow::IsLineTerminator(c0_)) {
552 // The TV of LineContinuation :: \ LineTerminatorSequence is the empty
553 // code unit sequence.
554 base::uc32 lastChar = c0_;
555 Advance();
556 if (lastChar == '\r') {
557 // Also skip \n.
558 if (c0_ == '\n') Advance();
559 lastChar = '\n';
560 }
561 if (capture_raw) AddRawLiteralChar(lastChar);
562 } else {
563 bool success = ScanEscape<capture_raw>();
564 USE(success);
565 DCHECK_EQ(!success, has_error());
566 // For templates, invalid escape sequence checking is handled in the
567 // parser.
568 scanner_error_state.MoveErrorTo(next_);
569 octal_error_state.MoveErrorTo(next_);
570 }
571 } else if (c == kEndOfInput) {
572 // Unterminated template literal
573 break;
574 } else {
575 Advance(); // Consume c.
576 // The TRV of LineTerminatorSequence :: <CR> is the CV 0x000A.
577 // The TRV of LineTerminatorSequence :: <CR><LF> is the sequence
578 // consisting of the CV 0x000A.
579 if (c == '\r') {
580 if (c0_ == '\n') Advance(); // Consume '\n'
581 c = '\n';
582 }
583 if (capture_raw) AddRawLiteralChar(c);
584 AddLiteralChar(c);
585 }
586 }
587 next().location.end_pos = source_pos();
588 next().token = result;
589
590 return result;
591 }
592
593 template <typename IsolateT>
SourceUrl(IsolateT * isolate) const594 Handle<String> Scanner::SourceUrl(IsolateT* isolate) const {
595 Handle<String> tmp;
596 if (source_url_.length() > 0) {
597 tmp = source_url_.Internalize(isolate);
598 }
599 return tmp;
600 }
601
602 template Handle<String> Scanner::SourceUrl(Isolate* isolate) const;
603 template Handle<String> Scanner::SourceUrl(LocalIsolate* isolate) const;
604
605 template <typename IsolateT>
SourceMappingUrl(IsolateT * isolate) const606 Handle<String> Scanner::SourceMappingUrl(IsolateT* isolate) const {
607 Handle<String> tmp;
608 if (source_mapping_url_.length() > 0) {
609 tmp = source_mapping_url_.Internalize(isolate);
610 }
611 return tmp;
612 }
613
614 template Handle<String> Scanner::SourceMappingUrl(Isolate* isolate) const;
615 template Handle<String> Scanner::SourceMappingUrl(LocalIsolate* isolate) const;
616
ScanDigitsWithNumericSeparators(bool (* predicate)(base::uc32 ch),bool is_check_first_digit)617 bool Scanner::ScanDigitsWithNumericSeparators(bool (*predicate)(base::uc32 ch),
618 bool is_check_first_digit) {
619 // we must have at least one digit after 'x'/'b'/'o'
620 if (is_check_first_digit && !predicate(c0_)) return false;
621
622 bool separator_seen = false;
623 while (predicate(c0_) || c0_ == '_') {
624 if (c0_ == '_') {
625 Advance();
626 if (c0_ == '_') {
627 ReportScannerError(Location(source_pos(), source_pos() + 1),
628 MessageTemplate::kContinuousNumericSeparator);
629 return false;
630 }
631 separator_seen = true;
632 continue;
633 }
634 separator_seen = false;
635 AddLiteralCharAdvance();
636 }
637
638 if (separator_seen) {
639 ReportScannerError(Location(source_pos(), source_pos() + 1),
640 MessageTemplate::kTrailingNumericSeparator);
641 return false;
642 }
643
644 return true;
645 }
646
ScanDecimalDigits(bool allow_numeric_separator)647 bool Scanner::ScanDecimalDigits(bool allow_numeric_separator) {
648 if (allow_numeric_separator) {
649 return ScanDigitsWithNumericSeparators(&IsDecimalDigit, false);
650 }
651 while (IsDecimalDigit(c0_)) {
652 AddLiteralCharAdvance();
653 }
654 if (c0_ == '_') {
655 ReportScannerError(Location(source_pos(), source_pos() + 1),
656 MessageTemplate::kInvalidOrUnexpectedToken);
657 return false;
658 }
659 return true;
660 }
661
ScanDecimalAsSmiWithNumericSeparators(uint64_t * value)662 bool Scanner::ScanDecimalAsSmiWithNumericSeparators(uint64_t* value) {
663 bool separator_seen = false;
664 while (IsDecimalDigit(c0_) || c0_ == '_') {
665 if (c0_ == '_') {
666 Advance();
667 if (c0_ == '_') {
668 ReportScannerError(Location(source_pos(), source_pos() + 1),
669 MessageTemplate::kContinuousNumericSeparator);
670 return false;
671 }
672 separator_seen = true;
673 continue;
674 }
675 separator_seen = false;
676 *value = 10 * *value + (c0_ - '0');
677 base::uc32 first_char = c0_;
678 Advance();
679 AddLiteralChar(first_char);
680 }
681
682 if (separator_seen) {
683 ReportScannerError(Location(source_pos(), source_pos() + 1),
684 MessageTemplate::kTrailingNumericSeparator);
685 return false;
686 }
687
688 return true;
689 }
690
ScanDecimalAsSmi(uint64_t * value,bool allow_numeric_separator)691 bool Scanner::ScanDecimalAsSmi(uint64_t* value, bool allow_numeric_separator) {
692 if (allow_numeric_separator) {
693 return ScanDecimalAsSmiWithNumericSeparators(value);
694 }
695
696 while (IsDecimalDigit(c0_)) {
697 *value = 10 * *value + (c0_ - '0');
698 base::uc32 first_char = c0_;
699 Advance();
700 AddLiteralChar(first_char);
701 }
702 return true;
703 }
704
ScanBinaryDigits()705 bool Scanner::ScanBinaryDigits() {
706 return ScanDigitsWithNumericSeparators(&IsBinaryDigit, true);
707 }
708
ScanOctalDigits()709 bool Scanner::ScanOctalDigits() {
710 return ScanDigitsWithNumericSeparators(&IsOctalDigit, true);
711 }
712
ScanImplicitOctalDigits(int start_pos,Scanner::NumberKind * kind)713 bool Scanner::ScanImplicitOctalDigits(int start_pos,
714 Scanner::NumberKind* kind) {
715 *kind = IMPLICIT_OCTAL;
716
717 while (true) {
718 // (possible) octal number
719 if (IsNonOctalDecimalDigit(c0_)) {
720 *kind = DECIMAL_WITH_LEADING_ZERO;
721 return true;
722 }
723 if (!IsOctalDigit(c0_)) {
724 // Octal literal finished.
725 octal_pos_ = Location(start_pos, source_pos());
726 octal_message_ = MessageTemplate::kStrictOctalLiteral;
727 return true;
728 }
729 AddLiteralCharAdvance();
730 }
731 }
732
ScanHexDigits()733 bool Scanner::ScanHexDigits() {
734 return ScanDigitsWithNumericSeparators(&IsHexDigit, true);
735 }
736
ScanSignedInteger()737 bool Scanner::ScanSignedInteger() {
738 if (c0_ == '+' || c0_ == '-') AddLiteralCharAdvance();
739 // we must have at least one decimal digit after 'e'/'E'
740 if (!IsDecimalDigit(c0_)) return false;
741 return ScanDecimalDigits(true);
742 }
743
ScanNumber(bool seen_period)744 Token::Value Scanner::ScanNumber(bool seen_period) {
745 DCHECK(IsDecimalDigit(c0_)); // the first digit of the number or the fraction
746
747 NumberKind kind = DECIMAL;
748
749 next().literal_chars.Start();
750 bool at_start = !seen_period;
751 int start_pos = source_pos(); // For reporting octal positions.
752 if (seen_period) {
753 // we have already seen a decimal point of the float
754 AddLiteralChar('.');
755 if (c0_ == '_') {
756 return Token::ILLEGAL;
757 }
758 // we know we have at least one digit
759 if (!ScanDecimalDigits(true)) return Token::ILLEGAL;
760 } else {
761 // if the first character is '0' we must check for octals and hex
762 if (c0_ == '0') {
763 AddLiteralCharAdvance();
764
765 // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or
766 // an octal number.
767 if (AsciiAlphaToLower(c0_) == 'x') {
768 AddLiteralCharAdvance();
769 kind = HEX;
770 if (!ScanHexDigits()) return Token::ILLEGAL;
771 } else if (AsciiAlphaToLower(c0_) == 'o') {
772 AddLiteralCharAdvance();
773 kind = OCTAL;
774 if (!ScanOctalDigits()) return Token::ILLEGAL;
775 } else if (AsciiAlphaToLower(c0_) == 'b') {
776 AddLiteralCharAdvance();
777 kind = BINARY;
778 if (!ScanBinaryDigits()) return Token::ILLEGAL;
779 } else if (IsOctalDigit(c0_)) {
780 kind = IMPLICIT_OCTAL;
781 if (!ScanImplicitOctalDigits(start_pos, &kind)) {
782 return Token::ILLEGAL;
783 }
784 if (kind == DECIMAL_WITH_LEADING_ZERO) {
785 at_start = false;
786 }
787 } else if (IsNonOctalDecimalDigit(c0_)) {
788 kind = DECIMAL_WITH_LEADING_ZERO;
789 } else if (c0_ == '_') {
790 ReportScannerError(Location(source_pos(), source_pos() + 1),
791 MessageTemplate::kZeroDigitNumericSeparator);
792 return Token::ILLEGAL;
793 }
794 }
795
796 // Parse decimal digits and allow trailing fractional part.
797 if (IsDecimalNumberKind(kind)) {
798 bool allow_numeric_separator = kind != DECIMAL_WITH_LEADING_ZERO;
799 // This is an optimization for parsing Decimal numbers as Smi's.
800 if (at_start) {
801 uint64_t value = 0;
802 // scan subsequent decimal digits
803 if (!ScanDecimalAsSmi(&value, allow_numeric_separator)) {
804 return Token::ILLEGAL;
805 }
806
807 if (next().literal_chars.one_byte_literal().length() <= 10 &&
808 value <= Smi::kMaxValue && c0_ != '.' && !IsIdentifierStart(c0_)) {
809 next().smi_value_ = static_cast<uint32_t>(value);
810
811 if (kind == DECIMAL_WITH_LEADING_ZERO) {
812 octal_pos_ = Location(start_pos, source_pos());
813 octal_message_ = MessageTemplate::kStrictDecimalWithLeadingZero;
814 }
815 return Token::SMI;
816 }
817 }
818
819 if (!ScanDecimalDigits(allow_numeric_separator)) {
820 return Token::ILLEGAL;
821 }
822 if (c0_ == '.') {
823 seen_period = true;
824 AddLiteralCharAdvance();
825 if (c0_ == '_') {
826 return Token::ILLEGAL;
827 }
828 if (!ScanDecimalDigits(true)) return Token::ILLEGAL;
829 }
830 }
831 }
832
833 bool is_bigint = false;
834 if (c0_ == 'n' && !seen_period && IsValidBigIntKind(kind)) {
835 // Check that the literal is within our limits for BigInt length.
836 // For simplicity, use 4 bits per character to calculate the maximum
837 // allowed literal length.
838 static const int kMaxBigIntCharacters = BigInt::kMaxLengthBits / 4;
839 int length = source_pos() - start_pos - (kind != DECIMAL ? 2 : 0);
840 if (length > kMaxBigIntCharacters) {
841 ReportScannerError(Location(start_pos, source_pos()),
842 MessageTemplate::kBigIntTooBig);
843 return Token::ILLEGAL;
844 }
845
846 is_bigint = true;
847 Advance();
848 } else if (AsciiAlphaToLower(c0_) == 'e') {
849 // scan exponent, if any
850 DCHECK(kind != HEX); // 'e'/'E' must be scanned as part of the hex number
851
852 if (!IsDecimalNumberKind(kind)) return Token::ILLEGAL;
853
854 // scan exponent
855 AddLiteralCharAdvance();
856
857 if (!ScanSignedInteger()) return Token::ILLEGAL;
858 }
859
860 // The source character immediately following a numeric literal must
861 // not be an identifier start or a decimal digit; see ECMA-262
862 // section 7.8.3, page 17 (note that we read only one decimal digit
863 // if the value is 0).
864 if (IsDecimalDigit(c0_) || IsIdentifierStart(c0_)) {
865 return Token::ILLEGAL;
866 }
867
868 if (kind == DECIMAL_WITH_LEADING_ZERO) {
869 octal_pos_ = Location(start_pos, source_pos());
870 octal_message_ = MessageTemplate::kStrictDecimalWithLeadingZero;
871 }
872
873 return is_bigint ? Token::BIGINT : Token::NUMBER;
874 }
875
ScanIdentifierUnicodeEscape()876 base::uc32 Scanner::ScanIdentifierUnicodeEscape() {
877 Advance();
878 if (c0_ != 'u') return Invalid();
879 Advance();
880 return ScanUnicodeEscape<false>();
881 }
882
883 template <bool capture_raw>
ScanUnicodeEscape()884 base::uc32 Scanner::ScanUnicodeEscape() {
885 // Accept both \uxxxx and \u{xxxxxx}. In the latter case, the number of
886 // hex digits between { } is arbitrary. \ and u have already been read.
887 if (c0_ == '{') {
888 int begin = source_pos() - 2;
889 Advance<capture_raw>();
890 base::uc32 cp =
891 ScanUnlimitedLengthHexNumber<capture_raw>(String::kMaxCodePoint, begin);
892 if (cp == kInvalidSequence || c0_ != '}') {
893 ReportScannerError(source_pos(),
894 MessageTemplate::kInvalidUnicodeEscapeSequence);
895 return Invalid();
896 }
897 Advance<capture_raw>();
898 return cp;
899 }
900 const bool unicode = true;
901 return ScanHexNumber<capture_raw, unicode>(4);
902 }
903
ScanIdentifierOrKeywordInnerSlow(bool escaped,bool can_be_keyword)904 Token::Value Scanner::ScanIdentifierOrKeywordInnerSlow(bool escaped,
905 bool can_be_keyword) {
906 while (true) {
907 if (c0_ == '\\') {
908 escaped = true;
909 base::uc32 c = ScanIdentifierUnicodeEscape();
910 // Only allow legal identifier part characters.
911 // TODO(verwaest): Make this true.
912 // DCHECK(!IsIdentifierPart('\'));
913 DCHECK(!IsIdentifierPart(Invalid()));
914 if (c == '\\' || !IsIdentifierPart(c)) {
915 return Token::ILLEGAL;
916 }
917 can_be_keyword = can_be_keyword && CharCanBeKeyword(c);
918 AddLiteralChar(c);
919 } else if (IsIdentifierPart(c0_) ||
920 (CombineSurrogatePair() && IsIdentifierPart(c0_))) {
921 can_be_keyword = can_be_keyword && CharCanBeKeyword(c0_);
922 AddLiteralCharAdvance();
923 } else {
924 break;
925 }
926 }
927
928 if (can_be_keyword && next().literal_chars.is_one_byte()) {
929 base::Vector<const uint8_t> chars = next().literal_chars.one_byte_literal();
930 Token::Value token =
931 KeywordOrIdentifierToken(chars.begin(), chars.length());
932 if (base::IsInRange(token, Token::IDENTIFIER, Token::YIELD)) return token;
933
934 if (token == Token::FUTURE_STRICT_RESERVED_WORD) {
935 if (escaped) return Token::ESCAPED_STRICT_RESERVED_WORD;
936 return token;
937 }
938
939 if (!escaped) return token;
940
941 STATIC_ASSERT(Token::LET + 1 == Token::STATIC);
942 if (base::IsInRange(token, Token::LET, Token::STATIC)) {
943 return Token::ESCAPED_STRICT_RESERVED_WORD;
944 }
945 return Token::ESCAPED_KEYWORD;
946 }
947
948 return Token::IDENTIFIER;
949 }
950
ScanRegExpPattern()951 bool Scanner::ScanRegExpPattern() {
952 DCHECK_EQ(Token::UNINITIALIZED, next_next().token);
953 DCHECK(next().token == Token::DIV || next().token == Token::ASSIGN_DIV);
954
955 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
956 bool in_character_class = false;
957
958 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
959 // the scanner should pass uninterpreted bodies to the RegExp
960 // constructor.
961 next().literal_chars.Start();
962 if (next().token == Token::ASSIGN_DIV) {
963 AddLiteralChar('=');
964 }
965
966 while (c0_ != '/' || in_character_class) {
967 if (c0_ == kEndOfInput || unibrow::IsLineTerminator(c0_)) {
968 return false;
969 }
970 if (c0_ == '\\') { // Escape sequence.
971 AddLiteralCharAdvance();
972 if (c0_ == kEndOfInput || unibrow::IsLineTerminator(c0_)) {
973 return false;
974 }
975 AddLiteralCharAdvance();
976 // If the escape allows more characters, i.e., \x??, \u????, or \c?,
977 // only "safe" characters are allowed (letters, digits, underscore),
978 // otherwise the escape isn't valid and the invalid character has
979 // its normal meaning. I.e., we can just continue scanning without
980 // worrying whether the following characters are part of the escape
981 // or not, since any '/', '\\' or '[' is guaranteed to not be part
982 // of the escape sequence.
983 } else { // Unescaped character.
984 if (c0_ == '[') in_character_class = true;
985 if (c0_ == ']') in_character_class = false;
986 AddLiteralCharAdvance();
987 }
988 }
989 Advance(); // consume '/'
990
991 next().token = Token::REGEXP_LITERAL;
992 return true;
993 }
994
ScanRegExpFlags()995 base::Optional<RegExpFlags> Scanner::ScanRegExpFlags() {
996 DCHECK_EQ(Token::REGEXP_LITERAL, next().token);
997
998 RegExpFlags flags;
999 while (IsIdentifierPart(c0_)) {
1000 base::Optional<RegExpFlag> maybe_flag = JSRegExp::FlagFromChar(c0_);
1001 if (!maybe_flag.has_value()) return {};
1002 RegExpFlag flag = maybe_flag.value();
1003 if (flags & flag) return {};
1004 Advance();
1005 flags |= flag;
1006 }
1007
1008 next().location.end_pos = source_pos();
1009 return flags;
1010 }
1011
CurrentSymbol(AstValueFactory * ast_value_factory) const1012 const AstRawString* Scanner::CurrentSymbol(
1013 AstValueFactory* ast_value_factory) const {
1014 if (is_literal_one_byte()) {
1015 return ast_value_factory->GetOneByteString(literal_one_byte_string());
1016 }
1017 return ast_value_factory->GetTwoByteString(literal_two_byte_string());
1018 }
1019
NextSymbol(AstValueFactory * ast_value_factory) const1020 const AstRawString* Scanner::NextSymbol(
1021 AstValueFactory* ast_value_factory) const {
1022 if (is_next_literal_one_byte()) {
1023 return ast_value_factory->GetOneByteString(next_literal_one_byte_string());
1024 }
1025 return ast_value_factory->GetTwoByteString(next_literal_two_byte_string());
1026 }
1027
CurrentRawSymbol(AstValueFactory * ast_value_factory) const1028 const AstRawString* Scanner::CurrentRawSymbol(
1029 AstValueFactory* ast_value_factory) const {
1030 if (is_raw_literal_one_byte()) {
1031 return ast_value_factory->GetOneByteString(raw_literal_one_byte_string());
1032 }
1033 return ast_value_factory->GetTwoByteString(raw_literal_two_byte_string());
1034 }
1035
1036
DoubleValue()1037 double Scanner::DoubleValue() {
1038 DCHECK(is_literal_one_byte());
1039 return StringToDouble(
1040 literal_one_byte_string(),
1041 ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY);
1042 }
1043
CurrentLiteralAsCString(Zone * zone) const1044 const char* Scanner::CurrentLiteralAsCString(Zone* zone) const {
1045 DCHECK(is_literal_one_byte());
1046 base::Vector<const uint8_t> vector = literal_one_byte_string();
1047 int length = vector.length();
1048 char* buffer = zone->NewArray<char>(length + 1);
1049 memcpy(buffer, vector.begin(), length);
1050 buffer[length] = '\0';
1051 return buffer;
1052 }
1053
SeekNext(size_t position)1054 void Scanner::SeekNext(size_t position) {
1055 // Use with care: This cleanly resets most, but not all scanner state.
1056 // TODO(vogelheim): Fix this, or at least DCHECK the relevant conditions.
1057
1058 // To re-scan from a given character position, we need to:
1059 // 1, Reset the current_, next_ and next_next_ tokens
1060 // (next_ + next_next_ will be overwrittem by Next(),
1061 // current_ will remain unchanged, so overwrite it fully.)
1062 for (TokenDesc& token : token_storage_) {
1063 token.token = Token::UNINITIALIZED;
1064 token.invalid_template_escape_message = MessageTemplate::kNone;
1065 }
1066 // 2, reset the source to the desired position,
1067 source_->Seek(position);
1068 // 3, re-scan, by scanning the look-ahead char + 1 token (next_).
1069 c0_ = source_->Advance();
1070 next().after_line_terminator = false;
1071 Scan();
1072 DCHECK_EQ(next().location.beg_pos, static_cast<int>(position));
1073 }
1074
1075 } // namespace internal
1076 } // namespace v8
1077