• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are
4 // met:
5 //
6 //     * Redistributions of source code must retain the above copyright
7 //       notice, this list of conditions and the following disclaimer.
8 //     * Redistributions in binary form must reproduce the above
9 //       copyright notice, this list of conditions and the following
10 //       disclaimer in the documentation and/or other materials provided
11 //       with the distribution.
12 //     * Neither the name of Google Inc. nor the names of its
13 //       contributors may be used to endorse or promote products derived
14 //       from this software without specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 
28 // Features shared by parsing and pre-parsing scanners.
29 
30 #ifndef V8_SCANNER_BASE_H_
31 #define V8_SCANNER_BASE_H_
32 
33 #include "globals.h"
34 #include "checks.h"
35 #include "allocation.h"
36 #include "token.h"
37 #include "unicode-inl.h"
38 #include "char-predicates.h"
39 #include "utils.h"
40 #include "list-inl.h"
41 
42 namespace v8 {
43 namespace internal {
44 
45 // Returns the value (0 .. 15) of a hexadecimal character c.
46 // If c is not a legal hexadecimal character, returns a value < 0.
HexValue(uc32 c)47 inline int HexValue(uc32 c) {
48   c -= '0';
49   if (static_cast<unsigned>(c) <= 9) return c;
50   c = (c | 0x20) - ('a' - '0');  // detect 0x11..0x16 and 0x31..0x36.
51   if (static_cast<unsigned>(c) <= 5) return c + 10;
52   return -1;
53 }
54 
55 
56 // ---------------------------------------------------------------------
57 // Buffered stream of characters, using an internal UC16 buffer.
58 
59 class UC16CharacterStream {
60  public:
UC16CharacterStream()61   UC16CharacterStream() : pos_(0) { }
~UC16CharacterStream()62   virtual ~UC16CharacterStream() { }
63 
64   // Returns and advances past the next UC16 character in the input
65   // stream. If there are no more characters, it returns a negative
66   // value.
Advance()67   inline uc32 Advance() {
68     if (buffer_cursor_ < buffer_end_ || ReadBlock()) {
69       pos_++;
70       return static_cast<uc32>(*(buffer_cursor_++));
71     }
72     // Note: currently the following increment is necessary to avoid a
73     // parser problem! The scanner treats the final kEndOfInput as
74     // a character with a position, and does math relative to that
75     // position.
76     pos_++;
77 
78     return kEndOfInput;
79   }
80 
81   // Return the current position in the character stream.
82   // Starts at zero.
pos()83   inline unsigned pos() const { return pos_; }
84 
85   // Skips forward past the next character_count UC16 characters
86   // in the input, or until the end of input if that comes sooner.
87   // Returns the number of characters actually skipped. If less
88   // than character_count,
SeekForward(unsigned character_count)89   inline unsigned SeekForward(unsigned character_count) {
90     unsigned buffered_chars =
91         static_cast<unsigned>(buffer_end_ - buffer_cursor_);
92     if (character_count <= buffered_chars) {
93       buffer_cursor_ += character_count;
94       pos_ += character_count;
95       return character_count;
96     }
97     return SlowSeekForward(character_count);
98   }
99 
100   // Pushes back the most recently read UC16 character (or negative
101   // value if at end of input), i.e., the value returned by the most recent
102   // call to Advance.
103   // Must not be used right after calling SeekForward.
104   virtual void PushBack(int32_t character) = 0;
105 
106  protected:
107   static const uc32 kEndOfInput = -1;
108 
109   // Ensures that the buffer_cursor_ points to the character at
110   // position pos_ of the input, if possible. If the position
111   // is at or after the end of the input, return false. If there
112   // are more characters available, return true.
113   virtual bool ReadBlock() = 0;
114   virtual unsigned SlowSeekForward(unsigned character_count) = 0;
115 
116   const uc16* buffer_cursor_;
117   const uc16* buffer_end_;
118   unsigned pos_;
119 };
120 
121 
122 class UnicodeCache {
123 // ---------------------------------------------------------------------
124 // Caching predicates used by scanners.
125  public:
UnicodeCache()126   UnicodeCache() {}
127   typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
128 
utf8_decoder()129   StaticResource<Utf8Decoder>* utf8_decoder() {
130     return &utf8_decoder_;
131   }
132 
IsIdentifierStart(unibrow::uchar c)133   bool IsIdentifierStart(unibrow::uchar c) { return kIsIdentifierStart.get(c); }
IsIdentifierPart(unibrow::uchar c)134   bool IsIdentifierPart(unibrow::uchar c) { return kIsIdentifierPart.get(c); }
IsLineTerminator(unibrow::uchar c)135   bool IsLineTerminator(unibrow::uchar c) { return kIsLineTerminator.get(c); }
IsWhiteSpace(unibrow::uchar c)136   bool IsWhiteSpace(unibrow::uchar c) { return kIsWhiteSpace.get(c); }
137 
138  private:
139 
140   unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;
141   unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;
142   unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;
143   unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;
144   StaticResource<Utf8Decoder> utf8_decoder_;
145 
146   DISALLOW_COPY_AND_ASSIGN(UnicodeCache);
147 };
148 
149 
150 // ----------------------------------------------------------------------------
151 // LiteralBuffer -  Collector of chars of literals.
152 
153 class LiteralBuffer {
154  public:
LiteralBuffer()155   LiteralBuffer() : is_ascii_(true), position_(0), backing_store_() { }
156 
~LiteralBuffer()157   ~LiteralBuffer() {
158     if (backing_store_.length() > 0) {
159       backing_store_.Dispose();
160     }
161   }
162 
AddChar(uc16 character)163   inline void AddChar(uc16 character) {
164     if (position_ >= backing_store_.length()) ExpandBuffer();
165     if (is_ascii_) {
166       if (character < kMaxAsciiCharCodeU) {
167         backing_store_[position_] = static_cast<byte>(character);
168         position_ += kASCIISize;
169         return;
170       }
171       ConvertToUC16();
172     }
173     *reinterpret_cast<uc16*>(&backing_store_[position_]) = character;
174     position_ += kUC16Size;
175   }
176 
is_ascii()177   bool is_ascii() { return is_ascii_; }
178 
uc16_literal()179   Vector<const uc16> uc16_literal() {
180     ASSERT(!is_ascii_);
181     ASSERT((position_ & 0x1) == 0);
182     return Vector<const uc16>(
183         reinterpret_cast<const uc16*>(backing_store_.start()),
184         position_ >> 1);
185   }
186 
ascii_literal()187   Vector<const char> ascii_literal() {
188     ASSERT(is_ascii_);
189     return Vector<const char>(
190         reinterpret_cast<const char*>(backing_store_.start()),
191         position_);
192   }
193 
length()194   int length() {
195     return is_ascii_ ? position_ : (position_ >> 1);
196   }
197 
Reset()198   void Reset() {
199     position_ = 0;
200     is_ascii_ = true;
201   }
202  private:
203   static const int kInitialCapacity = 16;
204   static const int kGrowthFactory = 4;
205   static const int kMinConversionSlack = 256;
206   static const int kMaxGrowth = 1 * MB;
NewCapacity(int min_capacity)207   inline int NewCapacity(int min_capacity) {
208     int capacity = Max(min_capacity, backing_store_.length());
209     int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth);
210     return new_capacity;
211   }
212 
ExpandBuffer()213   void ExpandBuffer() {
214     Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));
215     memcpy(new_store.start(), backing_store_.start(), position_);
216     backing_store_.Dispose();
217     backing_store_ = new_store;
218   }
219 
ConvertToUC16()220   void ConvertToUC16() {
221     ASSERT(is_ascii_);
222     Vector<byte> new_store;
223     int new_content_size = position_ * kUC16Size;
224     if (new_content_size >= backing_store_.length()) {
225       // Ensure room for all currently read characters as UC16 as well
226       // as the character about to be stored.
227       new_store = Vector<byte>::New(NewCapacity(new_content_size));
228     } else {
229       new_store = backing_store_;
230     }
231     char* src = reinterpret_cast<char*>(backing_store_.start());
232     uc16* dst = reinterpret_cast<uc16*>(new_store.start());
233     for (int i = position_ - 1; i >= 0; i--) {
234       dst[i] = src[i];
235     }
236     if (new_store.start() != backing_store_.start()) {
237       backing_store_.Dispose();
238       backing_store_ = new_store;
239     }
240     position_ = new_content_size;
241     is_ascii_ = false;
242   }
243 
244   bool is_ascii_;
245   int position_;
246   Vector<byte> backing_store_;
247 
248   DISALLOW_COPY_AND_ASSIGN(LiteralBuffer);
249 };
250 
251 
252 // ----------------------------------------------------------------------------
253 // Scanner base-class.
254 
255 // Generic functionality used by both JSON and JavaScript scanners.
256 class Scanner {
257  public:
258   // -1 is outside of the range of any real source code.
259   static const int kNoOctalLocation = -1;
260 
261   typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
262 
263   class LiteralScope {
264    public:
265     explicit LiteralScope(Scanner* self);
266     ~LiteralScope();
267     void Complete();
268 
269    private:
270     Scanner* scanner_;
271     bool complete_;
272   };
273 
274   explicit Scanner(UnicodeCache* scanner_contants);
275 
276   // Returns the current token again.
current_token()277   Token::Value current_token() { return current_.token; }
278 
279   // One token look-ahead (past the token returned by Next()).
peek()280   Token::Value peek() const { return next_.token; }
281 
282   struct Location {
LocationLocation283     Location(int b, int e) : beg_pos(b), end_pos(e) { }
LocationLocation284     Location() : beg_pos(0), end_pos(0) { }
285 
IsValidLocation286     bool IsValid() const {
287       return beg_pos >= 0 && end_pos >= beg_pos;
288     }
289 
290     int beg_pos;
291     int end_pos;
292   };
293 
NoLocation()294   static Location NoLocation() {
295     return Location(-1, -1);
296   }
297 
298   // Returns the location information for the current token
299   // (the token returned by Next()).
location()300   Location location() const { return current_.location; }
peek_location()301   Location peek_location() const { return next_.location; }
302 
303   // Returns the location of the last seen octal literal
octal_position()304   int octal_position() const { return octal_pos_; }
clear_octal_position()305   void clear_octal_position() { octal_pos_ = -1; }
306 
307   // Returns the literal string, if any, for the current token (the
308   // token returned by Next()). The string is 0-terminated and in
309   // UTF-8 format; they may contain 0-characters. Literal strings are
310   // collected for identifiers, strings, and numbers.
311   // These functions only give the correct result if the literal
312   // was scanned between calls to StartLiteral() and TerminateLiteral().
is_literal_ascii()313   bool is_literal_ascii() {
314     ASSERT_NOT_NULL(current_.literal_chars);
315     return current_.literal_chars->is_ascii();
316   }
literal_ascii_string()317   Vector<const char> literal_ascii_string() {
318     ASSERT_NOT_NULL(current_.literal_chars);
319     return current_.literal_chars->ascii_literal();
320   }
literal_uc16_string()321   Vector<const uc16> literal_uc16_string() {
322     ASSERT_NOT_NULL(current_.literal_chars);
323     return current_.literal_chars->uc16_literal();
324   }
literal_length()325   int literal_length() const {
326     ASSERT_NOT_NULL(current_.literal_chars);
327     return current_.literal_chars->length();
328   }
329 
330   // Returns the literal string for the next token (the token that
331   // would be returned if Next() were called).
is_next_literal_ascii()332   bool is_next_literal_ascii() {
333     ASSERT_NOT_NULL(next_.literal_chars);
334     return next_.literal_chars->is_ascii();
335   }
next_literal_ascii_string()336   Vector<const char> next_literal_ascii_string() {
337     ASSERT_NOT_NULL(next_.literal_chars);
338     return next_.literal_chars->ascii_literal();
339   }
next_literal_uc16_string()340   Vector<const uc16> next_literal_uc16_string() {
341     ASSERT_NOT_NULL(next_.literal_chars);
342     return next_.literal_chars->uc16_literal();
343   }
next_literal_length()344   int next_literal_length() const {
345     ASSERT_NOT_NULL(next_.literal_chars);
346     return next_.literal_chars->length();
347   }
348 
349   static const int kCharacterLookaheadBufferSize = 1;
350 
351  protected:
352   // The current and look-ahead token.
353   struct TokenDesc {
354     Token::Value token;
355     Location location;
356     LiteralBuffer* literal_chars;
357   };
358 
359   // Call this after setting source_ to the input.
Init()360   void Init() {
361     // Set c0_ (one character ahead)
362     ASSERT(kCharacterLookaheadBufferSize == 1);
363     Advance();
364     // Initialize current_ to not refer to a literal.
365     current_.literal_chars = NULL;
366   }
367 
368   // Literal buffer support
StartLiteral()369   inline void StartLiteral() {
370     LiteralBuffer* free_buffer = (current_.literal_chars == &literal_buffer1_) ?
371             &literal_buffer2_ : &literal_buffer1_;
372     free_buffer->Reset();
373     next_.literal_chars = free_buffer;
374   }
375 
AddLiteralChar(uc32 c)376   inline void AddLiteralChar(uc32 c) {
377     ASSERT_NOT_NULL(next_.literal_chars);
378     next_.literal_chars->AddChar(c);
379   }
380 
381   // Complete scanning of a literal.
TerminateLiteral()382   inline void TerminateLiteral() {
383     // Does nothing in the current implementation.
384   }
385 
386   // Stops scanning of a literal and drop the collected characters,
387   // e.g., due to an encountered error.
DropLiteral()388   inline void DropLiteral() {
389     next_.literal_chars = NULL;
390   }
391 
AddLiteralCharAdvance()392   inline void AddLiteralCharAdvance() {
393     AddLiteralChar(c0_);
394     Advance();
395   }
396 
397   // Low-level scanning support.
Advance()398   void Advance() { c0_ = source_->Advance(); }
PushBack(uc32 ch)399   void PushBack(uc32 ch) {
400     source_->PushBack(c0_);
401     c0_ = ch;
402   }
403 
Select(Token::Value tok)404   inline Token::Value Select(Token::Value tok) {
405     Advance();
406     return tok;
407   }
408 
Select(uc32 next,Token::Value then,Token::Value else_)409   inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) {
410     Advance();
411     if (c0_ == next) {
412       Advance();
413       return then;
414     } else {
415       return else_;
416     }
417   }
418 
419   uc32 ScanHexEscape(uc32 c, int length);
420 
421   // Scans octal escape sequence. Also accepts "\0" decimal escape sequence.
422   uc32 ScanOctalEscape(uc32 c, int length);
423 
424   // Return the current source position.
source_pos()425   int source_pos() {
426     return source_->pos() - kCharacterLookaheadBufferSize;
427   }
428 
429   UnicodeCache* unicode_cache_;
430 
431   // Buffers collecting literal strings, numbers, etc.
432   LiteralBuffer literal_buffer1_;
433   LiteralBuffer literal_buffer2_;
434 
435   TokenDesc current_;  // desc for current token (as returned by Next())
436   TokenDesc next_;     // desc for next token (one token look-ahead)
437 
438   // Input stream. Must be initialized to an UC16CharacterStream.
439   UC16CharacterStream* source_;
440 
441   // Start position of the octal literal last scanned.
442   int octal_pos_;
443 
444   // One Unicode character look-ahead; c0_ < 0 at the end of the input.
445   uc32 c0_;
446 };
447 
448 // ----------------------------------------------------------------------------
449 // JavaScriptScanner - base logic for JavaScript scanning.
450 
451 class JavaScriptScanner : public Scanner {
452  public:
453   // A LiteralScope that disables recording of some types of JavaScript
454   // literals. If the scanner is configured to not record the specific
455   // type of literal, the scope will not call StartLiteral.
456   class LiteralScope {
457    public:
LiteralScope(JavaScriptScanner * self)458     explicit LiteralScope(JavaScriptScanner* self)
459         : scanner_(self), complete_(false) {
460       scanner_->StartLiteral();
461     }
~LiteralScope()462      ~LiteralScope() {
463        if (!complete_) scanner_->DropLiteral();
464      }
Complete()465     void Complete() {
466       scanner_->TerminateLiteral();
467       complete_ = true;
468     }
469 
470    private:
471     JavaScriptScanner* scanner_;
472     bool complete_;
473   };
474 
475   explicit JavaScriptScanner(UnicodeCache* scanner_contants);
476 
477   // Returns the next token.
478   Token::Value Next();
479 
480   // Returns true if there was a line terminator before the peek'ed token.
has_line_terminator_before_next()481   bool has_line_terminator_before_next() const {
482     return has_line_terminator_before_next_;
483   }
484 
485   // Scans the input as a regular expression pattern, previous
486   // character(s) must be /(=). Returns true if a pattern is scanned.
487   bool ScanRegExpPattern(bool seen_equal);
488   // Returns true if regexp flags are scanned (always since flags can
489   // be empty).
490   bool ScanRegExpFlags();
491 
492   // Tells whether the buffer contains an identifier (no escapes).
493   // Used for checking if a property name is an identifier.
494   static bool IsIdentifier(unibrow::CharacterStream* buffer);
495 
496   // Seek forward to the given position.  This operation does not
497   // work in general, for instance when there are pushed back
498   // characters, but works for seeking forward until simple delimiter
499   // tokens, which is what it is used for.
500   void SeekForward(int pos);
501 
502  protected:
503   bool SkipWhiteSpace();
504   Token::Value SkipSingleLineComment();
505   Token::Value SkipMultiLineComment();
506 
507   // Scans a single JavaScript token.
508   void Scan();
509 
510   void ScanDecimalDigits();
511   Token::Value ScanNumber(bool seen_period);
512   Token::Value ScanIdentifierOrKeyword();
513   Token::Value ScanIdentifierSuffix(LiteralScope* literal);
514 
515   void ScanEscape();
516   Token::Value ScanString();
517 
518   // Scans a possible HTML comment -- begins with '<!'.
519   Token::Value ScanHtmlComment();
520 
521   // Decodes a unicode escape-sequence which is part of an identifier.
522   // If the escape sequence cannot be decoded the result is kBadChar.
523   uc32 ScanIdentifierUnicodeEscape();
524 
525   bool has_line_terminator_before_next_;
526 };
527 
528 
529 // ----------------------------------------------------------------------------
530 // Keyword matching state machine.
531 
532 class KeywordMatcher {
533 //  Incrementally recognize keywords.
534 //
535 //  Recognized keywords:
536 //      break case catch const* continue debugger* default delete do else
537 //      finally false for function if in instanceof native* new null
538 //      return switch this throw true try typeof var void while with
539 //
540 //  *: Actually "future reserved keywords". These are the only ones we
541 //     recognize, the remaining are allowed as identifiers.
542 //     In ES5 strict mode, we should disallow all reserved keywords.
543  public:
KeywordMatcher()544   KeywordMatcher()
545       : state_(INITIAL),
546         token_(Token::IDENTIFIER),
547         keyword_(NULL),
548         counter_(0),
549         keyword_token_(Token::ILLEGAL) {}
550 
token()551   Token::Value token() { return token_; }
552 
AddChar(unibrow::uchar input)553   inline bool AddChar(unibrow::uchar input) {
554     if (state_ != UNMATCHABLE) {
555       Step(input);
556     }
557     return state_ != UNMATCHABLE;
558   }
559 
Fail()560   void Fail() {
561     token_ = Token::IDENTIFIER;
562     state_ = UNMATCHABLE;
563   }
564 
565  private:
566   enum State {
567     UNMATCHABLE,
568     INITIAL,
569     KEYWORD_PREFIX,
570     KEYWORD_MATCHED,
571     C,
572     CA,
573     CO,
574     CON,
575     D,
576     DE,
577     E,
578     EX,
579     F,
580     I,
581     IM,
582     IMP,
583     IN,
584     N,
585     P,
586     PR,
587     S,
588     T,
589     TH,
590     TR,
591     V,
592     W
593   };
594 
595   struct FirstState {
596     const char* keyword;
597     State state;
598     Token::Value token;
599   };
600 
601   // Range of possible first characters of a keyword.
602   static const unsigned int kFirstCharRangeMin = 'b';
603   static const unsigned int kFirstCharRangeMax = 'y';
604   static const unsigned int kFirstCharRangeLength =
605       kFirstCharRangeMax - kFirstCharRangeMin + 1;
606   // State map for first keyword character range.
607   static FirstState first_states_[kFirstCharRangeLength];
608 
609   // If input equals keyword's character at position, continue matching keyword
610   // from that position.
MatchKeywordStart(unibrow::uchar input,const char * keyword,int position,Token::Value token_if_match)611   inline bool MatchKeywordStart(unibrow::uchar input,
612                                 const char* keyword,
613                                 int position,
614                                 Token::Value token_if_match) {
615     if (input != static_cast<unibrow::uchar>(keyword[position])) {
616       return false;
617     }
618     state_ = KEYWORD_PREFIX;
619     this->keyword_ = keyword;
620     this->counter_ = position + 1;
621     this->keyword_token_ = token_if_match;
622     return true;
623   }
624 
625   // If input equals match character, transition to new state and return true.
MatchState(unibrow::uchar input,char match,State new_state)626   inline bool MatchState(unibrow::uchar input, char match, State new_state) {
627     if (input != static_cast<unibrow::uchar>(match)) {
628       return false;
629     }
630     state_ = new_state;
631     return true;
632   }
633 
MatchKeyword(unibrow::uchar input,char match,State new_state,Token::Value keyword_token)634   inline bool MatchKeyword(unibrow::uchar input,
635                            char match,
636                            State new_state,
637                            Token::Value keyword_token) {
638     if (input != static_cast<unibrow::uchar>(match)) {
639       return false;
640     }
641     state_ = new_state;
642     token_ = keyword_token;
643     return true;
644   }
645 
646   void Step(unibrow::uchar input);
647 
648   // Current state.
649   State state_;
650   // Token for currently added characters.
651   Token::Value token_;
652 
653   // Matching a specific keyword string (there is only one possible valid
654   // keyword with the current prefix).
655   const char* keyword_;
656   int counter_;
657   Token::Value keyword_token_;
658 };
659 
660 
661 } }  // namespace v8::internal
662 
663 #endif  // V8_SCANNER_BASE_H_
664