// Copyright 2011 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Features shared by parsing and pre-parsing scanners.
6 
7 #ifndef V8_PARSING_SCANNER_H_
8 #define V8_PARSING_SCANNER_H_
9 
10 #include "src/allocation.h"
11 #include "src/base/logging.h"
12 #include "src/char-predicates.h"
13 #include "src/globals.h"
14 #include "src/messages.h"
15 #include "src/parsing/token.h"
16 #include "src/unicode-decoder.h"
17 #include "src/unicode.h"
18 
19 namespace v8 {
20 namespace internal {
21 
22 
23 class AstRawString;
24 class AstValueFactory;
25 class DuplicateFinder;
26 class ExternalOneByteString;
27 class ExternalTwoByteString;
28 class ParserRecorder;
29 class UnicodeCache;
30 
31 // ---------------------------------------------------------------------
32 // Buffered stream of UTF-16 code units, using an internal UTF-16 buffer.
33 // A code unit is a 16 bit value representing either a 16 bit code point
34 // or one part of a surrogate pair that make a single 21 bit code point.
35 class Utf16CharacterStream {
36  public:
37   static const uc32 kEndOfInput = -1;
38 
~Utf16CharacterStream()39   virtual ~Utf16CharacterStream() { }
40 
41   // Returns and advances past the next UTF-16 code unit in the input
42   // stream. If there are no more code units it returns kEndOfInput.
Advance()43   inline uc32 Advance() {
44     if (V8_LIKELY(buffer_cursor_ < buffer_end_)) {
45       return static_cast<uc32>(*(buffer_cursor_++));
46     } else if (ReadBlock()) {
47       return static_cast<uc32>(*(buffer_cursor_++));
48     } else {
49       // Note: currently the following increment is necessary to avoid a
50       // parser problem! The scanner treats the final kEndOfInput as
51       // a code unit with a position, and does math relative to that
52       // position.
53       buffer_cursor_++;
54       return kEndOfInput;
55     }
56   }
57 
58   // Go back one by one character in the input stream.
59   // This undoes the most recent Advance().
Back()60   inline void Back() {
61     // The common case - if the previous character is within
62     // buffer_start_ .. buffer_end_ will be handles locally.
63     // Otherwise, a new block is requested.
64     if (V8_LIKELY(buffer_cursor_ > buffer_start_)) {
65       buffer_cursor_--;
66     } else {
67       ReadBlockAt(pos() - 1);
68     }
69   }
70 
71   // Go back one by two characters in the input stream. (This is the same as
72   // calling Back() twice. But Back() may - in some instances - do substantial
73   // work. Back2() guarantees this work will be done only once.)
Back2()74   inline void Back2() {
75     if (V8_LIKELY(buffer_cursor_ - 2 >= buffer_start_)) {
76       buffer_cursor_ -= 2;
77     } else {
78       ReadBlockAt(pos() - 2);
79     }
80   }
81 
pos()82   inline size_t pos() const {
83     return buffer_pos_ + (buffer_cursor_ - buffer_start_);
84   }
85 
Seek(size_t pos)86   inline void Seek(size_t pos) {
87     if (V8_LIKELY(pos >= buffer_pos_ &&
88                   pos < (buffer_pos_ + (buffer_end_ - buffer_start_)))) {
89       buffer_cursor_ = buffer_start_ + (pos - buffer_pos_);
90     } else {
91       ReadBlockAt(pos);
92     }
93   }
94 
95  protected:
Utf16CharacterStream(const uint16_t * buffer_start,const uint16_t * buffer_cursor,const uint16_t * buffer_end,size_t buffer_pos)96   Utf16CharacterStream(const uint16_t* buffer_start,
97                        const uint16_t* buffer_cursor,
98                        const uint16_t* buffer_end, size_t buffer_pos)
99       : buffer_start_(buffer_start),
100         buffer_cursor_(buffer_cursor),
101         buffer_end_(buffer_end),
102         buffer_pos_(buffer_pos) {}
Utf16CharacterStream()103   Utf16CharacterStream() : Utf16CharacterStream(nullptr, nullptr, nullptr, 0) {}
104 
ReadBlockAt(size_t new_pos)105   void ReadBlockAt(size_t new_pos) {
106     // The callers of this method (Back/Back2/Seek) should handle the easy
107     // case (seeking within the current buffer), and we should only get here
108     // if we actually require new data.
109     // (This is really an efficiency check, not a correctness invariant.)
110     DCHECK(new_pos < buffer_pos_ ||
111            new_pos >= buffer_pos_ + (buffer_end_ - buffer_start_));
112 
113     // Change pos() to point to new_pos.
114     buffer_pos_ = new_pos;
115     buffer_cursor_ = buffer_start_;
116     bool success = ReadBlock();
117     USE(success);
118 
119     // Post-conditions: 1, on success, we should be at the right position.
120     //                  2, success == we should have more characters available.
121     DCHECK_IMPLIES(success, pos() == new_pos);
122     DCHECK_EQ(success, buffer_cursor_ < buffer_end_);
123     DCHECK_EQ(success, buffer_start_ < buffer_end_);
124   }
125 
126   // Read more data, and update buffer_*_ to point to it.
127   // Returns true if more data was available.
128   //
129   // ReadBlock() may modify any of the buffer_*_ members, but must sure that
130   // the result of pos() remains unaffected.
131   //
132   // Examples:
133   // - a stream could either fill a separate buffer. Then buffer_start_ and
134   //   buffer_cursor_ would point to the beginning of the buffer, and
135   //   buffer_pos would be the old pos().
136   // - a stream with existing buffer chunks would set buffer_start_ and
137   //   buffer_end_ to cover the full chunk, and then buffer_cursor_ would
138   //   point into the middle of the buffer, while buffer_pos_ would describe
139   //   the start of the buffer.
140   virtual bool ReadBlock() = 0;
141 
142   const uint16_t* buffer_start_;
143   const uint16_t* buffer_cursor_;
144   const uint16_t* buffer_end_;
145   size_t buffer_pos_;
146 };
147 
148 
149 // ----------------------------------------------------------------------------
150 // JavaScript Scanner.
151 
152 class Scanner {
153  public:
154   // Scoped helper for a re-settable bookmark.
155   class BookmarkScope {
156    public:
BookmarkScope(Scanner * scanner)157     explicit BookmarkScope(Scanner* scanner)
158         : scanner_(scanner), bookmark_(kNoBookmark) {
159       DCHECK_NOT_NULL(scanner_);
160     }
~BookmarkScope()161     ~BookmarkScope() {}
162 
163     void Set();
164     void Apply();
165     bool HasBeenSet();
166     bool HasBeenApplied();
167 
168    private:
169     static const size_t kNoBookmark;
170     static const size_t kBookmarkWasApplied;
171     static const size_t kBookmarkAtFirstPos;
172 
173     Scanner* scanner_;
174     size_t bookmark_;
175 
176     DISALLOW_COPY_AND_ASSIGN(BookmarkScope);
177   };
178 
179   // Representation of an interval of source positions.
180   struct Location {
LocationLocation181     Location(int b, int e) : beg_pos(b), end_pos(e) { }
LocationLocation182     Location() : beg_pos(0), end_pos(0) { }
183 
IsValidLocation184     bool IsValid() const {
185       return beg_pos >= 0 && end_pos >= beg_pos;
186     }
187 
invalidLocation188     static Location invalid() { return Location(-1, -1); }
189 
190     int beg_pos;
191     int end_pos;
192   };
193 
194   // -1 is outside of the range of any real source code.
195   static const int kNoOctalLocation = -1;
196   static const uc32 kEndOfInput = Utf16CharacterStream::kEndOfInput;
197 
198   explicit Scanner(UnicodeCache* scanner_contants);
199 
200   void Initialize(Utf16CharacterStream* source);
201 
202   // Returns the next token and advances input.
203   Token::Value Next();
204   // Returns the token following peek()
205   Token::Value PeekAhead();
206   // Returns the current token again.
current_token()207   Token::Value current_token() { return current_.token; }
208   // Returns the location information for the current token
209   // (the token last returned by Next()).
location()210   Location location() const { return current_.location; }
211 
212   // This error is specifically an invalid hex or unicode escape sequence.
has_error()213   bool has_error() const { return scanner_error_ != MessageTemplate::kNone; }
error()214   MessageTemplate::Template error() const { return scanner_error_; }
error_location()215   Location error_location() const { return scanner_error_location_; }
216 
has_invalid_template_escape()217   bool has_invalid_template_escape() const {
218     return invalid_template_escape_message_ != MessageTemplate::kNone;
219   }
invalid_template_escape_message()220   MessageTemplate::Template invalid_template_escape_message() const {
221     return invalid_template_escape_message_;
222   }
invalid_template_escape_location()223   Location invalid_template_escape_location() const {
224     return invalid_template_escape_location_;
225   }
226 
clear_invalid_template_escape()227   void clear_invalid_template_escape() {
228     DCHECK(has_invalid_template_escape());
229     invalid_template_escape_message_ = MessageTemplate::kNone;
230     invalid_template_escape_location_ = Location::invalid();
231   }
232 
233   // Similar functions for the upcoming token.
234 
235   // One token look-ahead (past the token returned by Next()).
peek()236   Token::Value peek() const { return next_.token; }
237 
peek_location()238   Location peek_location() const { return next_.location; }
239 
literal_contains_escapes()240   bool literal_contains_escapes() const {
241     return LiteralContainsEscapes(current_);
242   }
is_literal_contextual_keyword(Vector<const char> keyword)243   bool is_literal_contextual_keyword(Vector<const char> keyword) {
244     DCHECK(current_.token == Token::IDENTIFIER ||
245            current_.token == Token::ESCAPED_STRICT_RESERVED_WORD);
246     DCHECK_NOT_NULL(current_.literal_chars);
247     return current_.literal_chars->is_contextual_keyword(keyword);
248   }
is_next_contextual_keyword(Vector<const char> keyword)249   bool is_next_contextual_keyword(Vector<const char> keyword) {
250     DCHECK_NOT_NULL(next_.literal_chars);
251     return next_.literal_chars->is_contextual_keyword(keyword);
252   }
253 
254   const AstRawString* CurrentSymbol(AstValueFactory* ast_value_factory);
255   const AstRawString* NextSymbol(AstValueFactory* ast_value_factory);
256   const AstRawString* CurrentRawSymbol(AstValueFactory* ast_value_factory);
257 
258   double DoubleValue();
259   bool ContainsDot();
260   bool LiteralMatches(const char* data, int length, bool allow_escapes = true) {
261     if (!current_.literal_chars) {
262       return !strncmp(Token::Name(current_.token), data, length);
263     } else if (is_literal_one_byte() && literal_length() == length &&
264                (allow_escapes || !literal_contains_escapes())) {
265       const char* token =
266           reinterpret_cast<const char*>(literal_one_byte_string().start());
267       return !strncmp(token, data, length);
268     }
269     return false;
270   }
UnescapedLiteralMatches(const char * data,int length)271   inline bool UnescapedLiteralMatches(const char* data, int length) {
272     return LiteralMatches(data, length, false);
273   }
274 
IsGetOrSet(bool * is_get,bool * is_set)275   bool IsGetOrSet(bool* is_get, bool* is_set) {
276     if (is_literal_one_byte() &&
277         literal_length() == 3 &&
278         !literal_contains_escapes()) {
279       const char* token =
280           reinterpret_cast<const char*>(literal_one_byte_string().start());
281       *is_get = strncmp(token, "get", 3) == 0;
282       *is_set = !*is_get && strncmp(token, "set", 3) == 0;
283       return *is_get || *is_set;
284     }
285     return false;
286   }
287 
288   bool FindSymbol(DuplicateFinder* finder);
289 
unicode_cache()290   UnicodeCache* unicode_cache() { return unicode_cache_; }
291 
292   // Returns the location of the last seen octal literal.
octal_position()293   Location octal_position() const { return octal_pos_; }
clear_octal_position()294   void clear_octal_position() {
295     octal_pos_ = Location::invalid();
296     octal_message_ = MessageTemplate::kNone;
297   }
octal_message()298   MessageTemplate::Template octal_message() const { return octal_message_; }
299 
300   // Returns the value of the last smi that was scanned.
smi_value()301   uint32_t smi_value() const { return current_.smi_value_; }
302 
303   // Seek forward to the given position.  This operation does not
304   // work in general, for instance when there are pushed back
305   // characters, but works for seeking forward until simple delimiter
306   // tokens, which is what it is used for.
307   void SeekForward(int pos);
308 
309   // Returns true if there was a line terminator before the peek'ed token,
310   // possibly inside a multi-line comment.
HasAnyLineTerminatorBeforeNext()311   bool HasAnyLineTerminatorBeforeNext() const {
312     return has_line_terminator_before_next_ ||
313            has_multiline_comment_before_next_;
314   }
315 
HasAnyLineTerminatorAfterNext()316   bool HasAnyLineTerminatorAfterNext() {
317     Token::Value ensure_next_next = PeekAhead();
318     USE(ensure_next_next);
319     return has_line_terminator_after_next_;
320   }
321 
322   // Scans the input as a regular expression pattern, next token must be /(=).
323   // Returns true if a pattern is scanned.
324   bool ScanRegExpPattern();
325   // Scans the input as regular expression flags. Returns the flags on success.
326   Maybe<RegExp::Flags> ScanRegExpFlags();
327 
328   // Scans the input as a template literal
329   Token::Value ScanTemplateStart();
330   Token::Value ScanTemplateContinuation();
331 
SourceUrl(Isolate * isolate)332   Handle<String> SourceUrl(Isolate* isolate) const {
333     Handle<String> tmp;
334     if (source_url_.length() > 0) tmp = source_url_.Internalize(isolate);
335     return tmp;
336   }
337 
SourceMappingUrl(Isolate * isolate)338   Handle<String> SourceMappingUrl(Isolate* isolate) const {
339     Handle<String> tmp;
340     if (source_mapping_url_.length() > 0)
341       tmp = source_mapping_url_.Internalize(isolate);
342     return tmp;
343   }
344 
FoundHtmlComment()345   bool FoundHtmlComment() const { return found_html_comment_; }
346 
347  private:
348   // Scoped helper for literal recording. Automatically drops the literal
349   // if aborting the scanning before it's complete.
350   class LiteralScope {
351    public:
LiteralScope(Scanner * self)352     explicit LiteralScope(Scanner* self) : scanner_(self), complete_(false) {
353       scanner_->StartLiteral();
354     }
~LiteralScope()355     ~LiteralScope() {
356       if (!complete_) scanner_->DropLiteral();
357     }
Complete()358     void Complete() { complete_ = true; }
359 
360    private:
361     Scanner* scanner_;
362     bool complete_;
363   };
364 
365   // LiteralBuffer -  Collector of chars of literals.
366   class LiteralBuffer {
367    public:
LiteralBuffer()368     LiteralBuffer() : is_one_byte_(true), position_(0), backing_store_() {}
369 
~LiteralBuffer()370     ~LiteralBuffer() { backing_store_.Dispose(); }
371 
INLINE(void AddChar (char code_unit))372     INLINE(void AddChar(char code_unit)) {
373       DCHECK(IsValidAscii(code_unit));
374       AddOneByteChar(static_cast<byte>(code_unit));
375     }
376 
INLINE(void AddChar (uc32 code_unit))377     INLINE(void AddChar(uc32 code_unit)) {
378       if (is_one_byte_ &&
379           code_unit <= static_cast<uc32>(unibrow::Latin1::kMaxChar)) {
380         AddOneByteChar(static_cast<byte>(code_unit));
381       } else {
382         AddCharSlow(code_unit);
383       }
384     }
385 
is_one_byte()386     bool is_one_byte() const { return is_one_byte_; }
387 
is_contextual_keyword(Vector<const char> keyword)388     bool is_contextual_keyword(Vector<const char> keyword) const {
389       return is_one_byte() && keyword.length() == position_ &&
390              (memcmp(keyword.start(), backing_store_.start(), position_) == 0);
391     }
392 
two_byte_literal()393     Vector<const uint16_t> two_byte_literal() const {
394       DCHECK(!is_one_byte_);
395       DCHECK((position_ & 0x1) == 0);
396       return Vector<const uint16_t>(
397           reinterpret_cast<const uint16_t*>(backing_store_.start()),
398           position_ >> 1);
399     }
400 
one_byte_literal()401     Vector<const uint8_t> one_byte_literal() const {
402       DCHECK(is_one_byte_);
403       return Vector<const uint8_t>(
404           reinterpret_cast<const uint8_t*>(backing_store_.start()), position_);
405     }
406 
length()407     int length() const { return is_one_byte_ ? position_ : (position_ >> 1); }
408 
ReduceLength(int delta)409     void ReduceLength(int delta) {
410       position_ -= delta * (is_one_byte_ ? kOneByteSize : kUC16Size);
411     }
412 
Reset()413     void Reset() {
414       position_ = 0;
415       is_one_byte_ = true;
416     }
417 
418     Handle<String> Internalize(Isolate* isolate) const;
419 
420    private:
421     static const int kInitialCapacity = 16;
422     static const int kGrowthFactory = 4;
423     static const int kMinConversionSlack = 256;
424     static const int kMaxGrowth = 1 * MB;
425 
IsValidAscii(char code_unit)426     inline bool IsValidAscii(char code_unit) {
427       // Control characters and printable characters span the range of
428       // valid ASCII characters (0-127). Chars are unsigned on some
429       // platforms which causes compiler warnings if the validity check
430       // tests the lower bound >= 0 as it's always true.
431       return iscntrl(code_unit) || isprint(code_unit);
432     }
433 
INLINE(void AddOneByteChar (byte one_byte_char))434     INLINE(void AddOneByteChar(byte one_byte_char)) {
435       DCHECK(is_one_byte_);
436       if (position_ >= backing_store_.length()) ExpandBuffer();
437       backing_store_[position_] = one_byte_char;
438       position_ += kOneByteSize;
439     }
440 
441     void AddCharSlow(uc32 code_unit);
442     int NewCapacity(int min_capacity);
443     void ExpandBuffer();
444     void ConvertToTwoByte();
445 
446     bool is_one_byte_;
447     int position_;
448     Vector<byte> backing_store_;
449 
450     DISALLOW_COPY_AND_ASSIGN(LiteralBuffer);
451   };
452 
453   // The current and look-ahead token.
454   struct TokenDesc {
455     Location location;
456     LiteralBuffer* literal_chars;
457     LiteralBuffer* raw_literal_chars;
458     uint32_t smi_value_;
459     Token::Value token;
460   };
461 
462   static const int kCharacterLookaheadBufferSize = 1;
463   const int kMaxAscii = 127;
464 
465   // Scans octal escape sequence. Also accepts "\0" decimal escape sequence.
466   template <bool capture_raw>
467   uc32 ScanOctalEscape(uc32 c, int length);
468 
469   // Call this after setting source_ to the input.
Init()470   void Init() {
471     // Set c0_ (one character ahead)
472     STATIC_ASSERT(kCharacterLookaheadBufferSize == 1);
473     Advance();
474     // Initialize current_ to not refer to a literal.
475     current_.token = Token::UNINITIALIZED;
476     current_.literal_chars = NULL;
477     current_.raw_literal_chars = NULL;
478     next_.token = Token::UNINITIALIZED;
479     next_.literal_chars = NULL;
480     next_.raw_literal_chars = NULL;
481     next_next_.token = Token::UNINITIALIZED;
482     next_next_.literal_chars = NULL;
483     next_next_.raw_literal_chars = NULL;
484     found_html_comment_ = false;
485     scanner_error_ = MessageTemplate::kNone;
486     invalid_template_escape_message_ = MessageTemplate::kNone;
487   }
488 
ReportScannerError(const Location & location,MessageTemplate::Template error)489   void ReportScannerError(const Location& location,
490                           MessageTemplate::Template error) {
491     if (has_error()) return;
492     scanner_error_ = error;
493     scanner_error_location_ = location;
494   }
495 
ReportScannerError(int pos,MessageTemplate::Template error)496   void ReportScannerError(int pos, MessageTemplate::Template error) {
497     if (has_error()) return;
498     scanner_error_ = error;
499     scanner_error_location_ = Location(pos, pos + 1);
500   }
501 
502   // Seek to the next_ token at the given position.
503   void SeekNext(size_t position);
504 
505   // Literal buffer support
StartLiteral()506   inline void StartLiteral() {
507     LiteralBuffer* free_buffer =
508         (current_.literal_chars == &literal_buffer0_)
509             ? &literal_buffer1_
510             : (current_.literal_chars == &literal_buffer1_) ? &literal_buffer2_
511                                                             : &literal_buffer0_;
512     free_buffer->Reset();
513     next_.literal_chars = free_buffer;
514   }
515 
StartRawLiteral()516   inline void StartRawLiteral() {
517     LiteralBuffer* free_buffer =
518         (current_.raw_literal_chars == &raw_literal_buffer0_)
519             ? &raw_literal_buffer1_
520             : (current_.raw_literal_chars == &raw_literal_buffer1_)
521                   ? &raw_literal_buffer2_
522                   : &raw_literal_buffer0_;
523     free_buffer->Reset();
524     next_.raw_literal_chars = free_buffer;
525   }
526 
INLINE(void AddLiteralChar (uc32 c))527   INLINE(void AddLiteralChar(uc32 c)) {
528     DCHECK_NOT_NULL(next_.literal_chars);
529     next_.literal_chars->AddChar(c);
530   }
531 
INLINE(void AddLiteralChar (char c))532   INLINE(void AddLiteralChar(char c)) {
533     DCHECK_NOT_NULL(next_.literal_chars);
534     next_.literal_chars->AddChar(c);
535   }
536 
INLINE(void AddRawLiteralChar (uc32 c))537   INLINE(void AddRawLiteralChar(uc32 c)) {
538     DCHECK_NOT_NULL(next_.raw_literal_chars);
539     next_.raw_literal_chars->AddChar(c);
540   }
541 
INLINE(void ReduceRawLiteralLength (int delta))542   INLINE(void ReduceRawLiteralLength(int delta)) {
543     DCHECK_NOT_NULL(next_.raw_literal_chars);
544     next_.raw_literal_chars->ReduceLength(delta);
545   }
546 
547   // Stops scanning of a literal and drop the collected characters,
548   // e.g., due to an encountered error.
DropLiteral()549   inline void DropLiteral() {
550     next_.literal_chars = NULL;
551     next_.raw_literal_chars = NULL;
552   }
553 
AddLiteralCharAdvance()554   inline void AddLiteralCharAdvance() {
555     AddLiteralChar(c0_);
556     Advance();
557   }
558 
559   // Low-level scanning support.
560   template <bool capture_raw = false, bool check_surrogate = true>
Advance()561   void Advance() {
562     if (capture_raw) {
563       AddRawLiteralChar(c0_);
564     }
565     c0_ = source_->Advance();
566     if (check_surrogate) HandleLeadSurrogate();
567   }
568 
HandleLeadSurrogate()569   void HandleLeadSurrogate() {
570     if (unibrow::Utf16::IsLeadSurrogate(c0_)) {
571       uc32 c1 = source_->Advance();
572       if (!unibrow::Utf16::IsTrailSurrogate(c1)) {
573         source_->Back();
574       } else {
575         c0_ = unibrow::Utf16::CombineSurrogatePair(c0_, c1);
576       }
577     }
578   }
579 
PushBack(uc32 ch)580   void PushBack(uc32 ch) {
581     if (c0_ > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
582       source_->Back2();
583     } else {
584       source_->Back();
585     }
586     c0_ = ch;
587   }
588 
589   // Same as PushBack(ch1); PushBack(ch2).
590   // - Potentially more efficient as it uses Back2() on the stream.
591   // - Uses char as parameters, since we're only calling it with ASCII chars in
592   //   practice. This way, we can avoid a few edge cases.
PushBack2(char ch1,char ch2)593   void PushBack2(char ch1, char ch2) {
594     source_->Back2();
595     c0_ = ch2;
596   }
597 
Select(Token::Value tok)598   inline Token::Value Select(Token::Value tok) {
599     Advance();
600     return tok;
601   }
602 
Select(uc32 next,Token::Value then,Token::Value else_)603   inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) {
604     Advance();
605     if (c0_ == next) {
606       Advance();
607       return then;
608     } else {
609       return else_;
610     }
611   }
612 
613   // Returns the literal string, if any, for the current token (the
614   // token last returned by Next()). The string is 0-terminated.
615   // Literal strings are collected for identifiers, strings, numbers as well
616   // as for template literals. For template literals we also collect the raw
617   // form.
618   // These functions only give the correct result if the literal was scanned
619   // when a LiteralScope object is alive.
620   //
621   // Current usage of these functions is unfortunately a little undisciplined,
622   // and is_literal_one_byte() + is_literal_one_byte_string() is also
623   // requested for tokens that do not have a literal. Hence, we treat any
624   // token as a one-byte literal. E.g. Token::FUNCTION pretends to have a
625   // literal "function".
literal_one_byte_string()626   Vector<const uint8_t> literal_one_byte_string() {
627     if (current_.literal_chars)
628       return current_.literal_chars->one_byte_literal();
629     const char* str = Token::String(current_.token);
630     const uint8_t* str_as_uint8 = reinterpret_cast<const uint8_t*>(str);
631     return Vector<const uint8_t>(str_as_uint8,
632                                  Token::StringLength(current_.token));
633   }
literal_two_byte_string()634   Vector<const uint16_t> literal_two_byte_string() {
635     DCHECK_NOT_NULL(current_.literal_chars);
636     return current_.literal_chars->two_byte_literal();
637   }
is_literal_one_byte()638   bool is_literal_one_byte() {
639     return !current_.literal_chars || current_.literal_chars->is_one_byte();
640   }
literal_length()641   int literal_length() const {
642     if (current_.literal_chars) return current_.literal_chars->length();
643     return Token::StringLength(current_.token);
644   }
645   // Returns the literal string for the next token (the token that
646   // would be returned if Next() were called).
next_literal_one_byte_string()647   Vector<const uint8_t> next_literal_one_byte_string() {
648     DCHECK_NOT_NULL(next_.literal_chars);
649     return next_.literal_chars->one_byte_literal();
650   }
next_literal_two_byte_string()651   Vector<const uint16_t> next_literal_two_byte_string() {
652     DCHECK_NOT_NULL(next_.literal_chars);
653     return next_.literal_chars->two_byte_literal();
654   }
is_next_literal_one_byte()655   bool is_next_literal_one_byte() {
656     DCHECK_NOT_NULL(next_.literal_chars);
657     return next_.literal_chars->is_one_byte();
658   }
raw_literal_one_byte_string()659   Vector<const uint8_t> raw_literal_one_byte_string() {
660     DCHECK_NOT_NULL(current_.raw_literal_chars);
661     return current_.raw_literal_chars->one_byte_literal();
662   }
raw_literal_two_byte_string()663   Vector<const uint16_t> raw_literal_two_byte_string() {
664     DCHECK_NOT_NULL(current_.raw_literal_chars);
665     return current_.raw_literal_chars->two_byte_literal();
666   }
is_raw_literal_one_byte()667   bool is_raw_literal_one_byte() {
668     DCHECK_NOT_NULL(current_.raw_literal_chars);
669     return current_.raw_literal_chars->is_one_byte();
670   }
671 
672   template <bool capture_raw, bool unicode = false>
673   uc32 ScanHexNumber(int expected_length);
674   // Scan a number of any length but not bigger than max_value. For example, the
675   // number can be 000000001, so it's very long in characters but its value is
676   // small.
677   template <bool capture_raw>
678   uc32 ScanUnlimitedLengthHexNumber(int max_value, int beg_pos);
679 
680   // Scans a single JavaScript token.
681   void Scan();
682 
683   bool SkipWhiteSpace();
684   Token::Value SkipSingleLineComment();
685   Token::Value SkipSourceURLComment();
686   void TryToParseSourceURLComment();
687   Token::Value SkipMultiLineComment();
688   // Scans a possible HTML comment -- begins with '<!'.
689   Token::Value ScanHtmlComment();
690 
691   void ScanDecimalDigits();
692   Token::Value ScanNumber(bool seen_period);
693   Token::Value ScanIdentifierOrKeyword();
694   Token::Value ScanIdentifierSuffix(LiteralScope* literal, bool escaped);
695 
696   Token::Value ScanString();
697 
698   // Scans an escape-sequence which is part of a string and adds the
699   // decoded character to the current literal. Returns true if a pattern
700   // is scanned.
701   template <bool capture_raw, bool in_template_literal>
702   bool ScanEscape();
703 
704   // Decodes a Unicode escape-sequence which is part of an identifier.
705   // If the escape sequence cannot be decoded the result is kBadChar.
706   uc32 ScanIdentifierUnicodeEscape();
707   // Helper for the above functions.
708   template <bool capture_raw>
709   uc32 ScanUnicodeEscape();
710 
711   Token::Value ScanTemplateSpan();
712 
713   // Return the current source position.
source_pos()714   int source_pos() {
715     return static_cast<int>(source_->pos()) - kCharacterLookaheadBufferSize;
716   }
717 
LiteralContainsEscapes(const TokenDesc & token)718   static bool LiteralContainsEscapes(const TokenDesc& token) {
719     Location location = token.location;
720     int source_length = (location.end_pos - location.beg_pos);
721     if (token.token == Token::STRING) {
722       // Subtract delimiters.
723       source_length -= 2;
724     }
725     return token.literal_chars &&
726            (token.literal_chars->length() != source_length);
727   }
728 
729 #ifdef DEBUG
730   void SanityCheckTokenDesc(const TokenDesc&) const;
731 #endif
732 
733   UnicodeCache* unicode_cache_;
734 
735   // Buffers collecting literal strings, numbers, etc.
736   LiteralBuffer literal_buffer0_;
737   LiteralBuffer literal_buffer1_;
738   LiteralBuffer literal_buffer2_;
739 
740   // Values parsed from magic comments.
741   LiteralBuffer source_url_;
742   LiteralBuffer source_mapping_url_;
743 
744   // Buffer to store raw string values
745   LiteralBuffer raw_literal_buffer0_;
746   LiteralBuffer raw_literal_buffer1_;
747   LiteralBuffer raw_literal_buffer2_;
748 
749   TokenDesc current_;    // desc for current token (as returned by Next())
750   TokenDesc next_;       // desc for next token (one token look-ahead)
751   TokenDesc next_next_;  // desc for the token after next (after PeakAhead())
752 
753   // Input stream. Must be initialized to an Utf16CharacterStream.
754   Utf16CharacterStream* source_;
755 
756   // Last-seen positions of potentially problematic tokens.
757   Location octal_pos_;
758   MessageTemplate::Template octal_message_;
759 
760   // One Unicode character look-ahead; c0_ < 0 at the end of the input.
761   uc32 c0_;
762 
763   // Whether there is a line terminator whitespace character after
764   // the current token, and  before the next. Does not count newlines
765   // inside multiline comments.
766   bool has_line_terminator_before_next_;
767   // Whether there is a multi-line comment that contains a
768   // line-terminator after the current token, and before the next.
769   bool has_multiline_comment_before_next_;
770   bool has_line_terminator_after_next_;
771 
772   // Whether this scanner encountered an HTML comment.
773   bool found_html_comment_;
774 
775   MessageTemplate::Template scanner_error_;
776   Location scanner_error_location_;
777 
778   MessageTemplate::Template invalid_template_escape_message_;
779   Location invalid_template_escape_location_;
780 };
781 
782 }  // namespace internal
783 }  // namespace v8
784 
785 #endif  // V8_PARSING_SCANNER_H_
786