// Copyright 2006-2008 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

28 #ifndef V8_SCANNER_H_
29 #define V8_SCANNER_H_
30 
31 #include "token.h"
32 #include "char-predicates-inl.h"
33 
34 namespace v8 {
35 namespace internal {
36 
37 
38 class UTF8Buffer {
39  public:
40   UTF8Buffer();
41   ~UTF8Buffer();
42 
AddChar(uc32 c)43   void AddChar(uc32 c) {
44     ASSERT_NOT_NULL(data_);
45     if (cursor_ <= limit_ &&
46         static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {
47       *cursor_++ = static_cast<char>(c);
48     } else {
49       AddCharSlow(c);
50     }
51   }
52 
Reset()53   void Reset() {
54     if (data_ == NULL) {
55       data_ = NewArray<char>(kInitialCapacity);
56       limit_ = ComputeLimit(data_, kInitialCapacity);
57     }
58     cursor_ = data_;
59   }
60 
pos()61   int pos() const {
62     ASSERT_NOT_NULL(data_);
63     return static_cast<int>(cursor_ - data_);
64   }
65 
data()66   char* data() const { return data_; }
67 
68  private:
69   static const int kInitialCapacity = 256;
70   char* data_;
71   char* cursor_;
72   char* limit_;
73 
Capacity()74   int Capacity() const {
75     ASSERT_NOT_NULL(data_);
76     return static_cast<int>(limit_ - data_) + unibrow::Utf8::kMaxEncodedSize;
77   }
78 
ComputeLimit(char * data,int capacity)79   static char* ComputeLimit(char* data, int capacity) {
80     return (data + capacity) - unibrow::Utf8::kMaxEncodedSize;
81   }
82 
83   void AddCharSlow(uc32 c);
84 };
85 
86 
87 class UTF16Buffer {
88  public:
89   UTF16Buffer();
~UTF16Buffer()90   virtual ~UTF16Buffer() {}
91 
92   virtual void PushBack(uc32 ch) = 0;
93   // returns a value < 0 when the buffer end is reached
94   virtual uc32 Advance() = 0;
95   virtual void SeekForward(int pos) = 0;
96 
pos()97   int pos() const { return pos_; }
size()98   int size() const { return size_; }
99   Handle<String> SubString(int start, int end);
100 
101  protected:
102   Handle<String> data_;
103   int pos_;
104   int size_;
105 };
106 
107 
108 class CharacterStreamUTF16Buffer: public UTF16Buffer {
109  public:
110   CharacterStreamUTF16Buffer();
~CharacterStreamUTF16Buffer()111   virtual ~CharacterStreamUTF16Buffer() {}
112   void Initialize(Handle<String> data, unibrow::CharacterStream* stream);
113   virtual void PushBack(uc32 ch);
114   virtual uc32 Advance();
115   virtual void SeekForward(int pos);
116 
117  private:
118   List<uc32> pushback_buffer_;
119   uc32 last_;
120   unibrow::CharacterStream* stream_;
121 
pushback_buffer()122   List<uc32>* pushback_buffer() { return &pushback_buffer_; }
123 };
124 
125 
126 class TwoByteStringUTF16Buffer: public UTF16Buffer {
127  public:
128   TwoByteStringUTF16Buffer();
~TwoByteStringUTF16Buffer()129   virtual ~TwoByteStringUTF16Buffer() {}
130   void Initialize(Handle<ExternalTwoByteString> data);
131   virtual void PushBack(uc32 ch);
132   virtual uc32 Advance();
133   virtual void SeekForward(int pos);
134 
135  private:
136   const uint16_t* raw_data_;
137 };
138 
139 
140 class KeywordMatcher {
141 //  Incrementally recognize keywords.
142 //
143 //  Recognized keywords:
144 //      break case catch const* continue debugger* default delete do else
145 //      finally false for function if in instanceof native* new null
146 //      return switch this throw true try typeof var void while with
147 //
148 //  *: Actually "future reserved keywords". These are the only ones we
149 //     recognized, the remaining are allowed as identifiers.
150  public:
KeywordMatcher()151   KeywordMatcher() : state_(INITIAL), token_(Token::IDENTIFIER) {}
152 
token()153   Token::Value token() { return token_; }
154 
AddChar(uc32 input)155   inline void AddChar(uc32 input) {
156     if (state_ != UNMATCHABLE) {
157       Step(input);
158     }
159   }
160 
Fail()161   void Fail() {
162     token_ = Token::IDENTIFIER;
163     state_ = UNMATCHABLE;
164   }
165 
166  private:
167   enum State {
168     UNMATCHABLE,
169     INITIAL,
170     KEYWORD_PREFIX,
171     KEYWORD_MATCHED,
172     C,
173     CA,
174     CO,
175     CON,
176     D,
177     DE,
178     F,
179     I,
180     IN,
181     N,
182     T,
183     TH,
184     TR,
185     V,
186     W
187   };
188 
189   struct FirstState {
190     const char* keyword;
191     State state;
192     Token::Value token;
193   };
194 
195   // Range of possible first characters of a keyword.
196   static const unsigned int kFirstCharRangeMin = 'b';
197   static const unsigned int kFirstCharRangeMax = 'w';
198   static const unsigned int kFirstCharRangeLength =
199       kFirstCharRangeMax - kFirstCharRangeMin + 1;
200   // State map for first keyword character range.
201   static FirstState first_states_[kFirstCharRangeLength];
202 
203   // Current state.
204   State state_;
205   // Token for currently added characters.
206   Token::Value token_;
207 
208   // Matching a specific keyword string (there is only one possible valid
209   // keyword with the current prefix).
210   const char* keyword_;
211   int counter_;
212   Token::Value keyword_token_;
213 
214   // If input equals keyword's character at position, continue matching keyword
215   // from that position.
MatchKeywordStart(uc32 input,const char * keyword,int position,Token::Value token_if_match)216   inline bool MatchKeywordStart(uc32 input,
217                                 const char* keyword,
218                                 int position,
219                                 Token::Value token_if_match) {
220     if (input == keyword[position]) {
221       state_ = KEYWORD_PREFIX;
222       this->keyword_ = keyword;
223       this->counter_ = position + 1;
224       this->keyword_token_ = token_if_match;
225       return true;
226     }
227     return false;
228   }
229 
230   // If input equals match character, transition to new state and return true.
MatchState(uc32 input,char match,State new_state)231   inline bool MatchState(uc32 input, char match, State new_state) {
232     if (input == match) {
233       state_ = new_state;
234       return true;
235     }
236     return false;
237   }
238 
MatchKeyword(uc32 input,char match,State new_state,Token::Value keyword_token)239   inline bool MatchKeyword(uc32 input,
240                            char match,
241                            State new_state,
242                            Token::Value keyword_token) {
243     if (input == match) {  // Matched "do".
244       state_ = new_state;
245       token_ = keyword_token;
246       return true;
247     }
248     return false;
249   }
250 
251   void Step(uc32 input);
252 };
253 
254 
// Mode passed to the Scanner constructor.
enum ParserMode { PARSE, PREPARSE };
// Language passed to Scanner::Init().
enum ParserLanguage { JAVASCRIPT, JSON };


259 class Scanner {
260  public:
261   typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
262 
263   // Construction
264   explicit Scanner(ParserMode parse_mode);
265 
266   // Initialize the Scanner to scan source:
267   void Init(Handle<String> source,
268             unibrow::CharacterStream* stream,
269             int position,
270             ParserLanguage language);
271 
272   // Returns the next token.
273   Token::Value Next();
274 
275   // One token look-ahead (past the token returned by Next()).
peek()276   Token::Value peek() const  { return next_.token; }
277 
278   // Returns true if there was a line terminator before the peek'ed token.
has_line_terminator_before_next()279   bool has_line_terminator_before_next() const {
280     return has_line_terminator_before_next_;
281   }
282 
283   struct Location {
LocationLocation284     Location(int b, int e) : beg_pos(b), end_pos(e) { }
LocationLocation285     Location() : beg_pos(0), end_pos(0) { }
286     int beg_pos;
287     int end_pos;
288   };
289 
290   // Returns the location information for the current token
291   // (the token returned by Next()).
location()292   Location location() const  { return current_.location; }
peek_location()293   Location peek_location() const  { return next_.location; }
294 
295   // Returns the literal string, if any, for the current token (the
296   // token returned by Next()). The string is 0-terminated and in
297   // UTF-8 format; they may contain 0-characters. Literal strings are
298   // collected for identifiers, strings, and numbers.
299   // These functions only give the correct result if the literal
300   // was scanned between calls to StartLiteral() and TerminateLiteral().
literal_string()301   const char* literal_string() const {
302     return current_.literal_buffer->data();
303   }
literal_length()304   int literal_length() const {
305     // Excluding terminal '\0' added by TerminateLiteral().
306     return current_.literal_buffer->pos() - 1;
307   }
308 
309   // Returns the literal string for the next token (the token that
310   // would be returned if Next() were called).
next_literal_string()311   const char* next_literal_string() const {
312     return next_.literal_buffer->data();
313   }
314   // Returns the length of the next token (that would be returned if
315   // Next() were called).
next_literal_length()316   int next_literal_length() const {
317     return next_.literal_buffer->pos() - 1;
318   }
319 
next_literal()320   Vector<const char> next_literal() const {
321     return Vector<const char>(next_literal_string(),
322                               next_literal_length());
323   }
324 
325   // Scans the input as a regular expression pattern, previous
326   // character(s) must be /(=). Returns true if a pattern is scanned.
327   bool ScanRegExpPattern(bool seen_equal);
328   // Returns true if regexp flags are scanned (always since flags can
329   // be empty).
330   bool ScanRegExpFlags();
331 
332   // Seek forward to the given position.  This operation does not
333   // work in general, for instance when there are pushed back
334   // characters, but works for seeking forward until simple delimiter
335   // tokens, which is what it is used for.
336   void SeekForward(int pos);
337 
338   Handle<String> SubString(int start_pos, int end_pos);
stack_overflow()339   bool stack_overflow() { return stack_overflow_; }
340 
utf8_decoder()341   static StaticResource<Utf8Decoder>* utf8_decoder() { return &utf8_decoder_; }
342 
343   // Tells whether the buffer contains an identifier (no escapes).
344   // Used for checking if a property name is an identifier.
345   static bool IsIdentifier(unibrow::CharacterStream* buffer);
346 
347   static unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;
348   static unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;
349   static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;
350   static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;
351 
352   static const int kCharacterLookaheadBufferSize = 1;
353 
354  private:
355   CharacterStreamUTF16Buffer char_stream_buffer_;
356   TwoByteStringUTF16Buffer two_byte_string_buffer_;
357 
358   // Source.
359   UTF16Buffer* source_;
360   int position_;
361 
362   // Buffer to hold literal values (identifiers, strings, numbers)
363   // using 0-terminated UTF-8 encoding.
364   UTF8Buffer literal_buffer_1_;
365   UTF8Buffer literal_buffer_2_;
366 
367   bool stack_overflow_;
368   static StaticResource<Utf8Decoder> utf8_decoder_;
369 
370   // One Unicode character look-ahead; c0_ < 0 at the end of the input.
371   uc32 c0_;
372 
373   // The current and look-ahead token.
374   struct TokenDesc {
375     Token::Value token;
376     Location location;
377     UTF8Buffer* literal_buffer;
378   };
379 
380   TokenDesc current_;  // desc for current token (as returned by Next())
381   TokenDesc next_;     // desc for next token (one token look-ahead)
382   bool has_line_terminator_before_next_;
383   bool is_pre_parsing_;
384   bool is_parsing_json_;
385 
386   // Literal buffer support
387   void StartLiteral();
388   void AddChar(uc32 ch);
389   void AddCharAdvance();
390   void TerminateLiteral();
391 
392   // Low-level scanning support.
Advance()393   void Advance() { c0_ = source_->Advance(); }
PushBack(uc32 ch)394   void PushBack(uc32 ch) {
395     source_->PushBack(ch);
396     c0_ = ch;
397   }
398 
SkipWhiteSpace()399   bool SkipWhiteSpace() {
400     if (is_parsing_json_) {
401       return SkipJsonWhiteSpace();
402     } else {
403       return SkipJavaScriptWhiteSpace();
404     }
405   }
406   bool SkipJavaScriptWhiteSpace();
407   bool SkipJsonWhiteSpace();
408   Token::Value SkipSingleLineComment();
409   Token::Value SkipMultiLineComment();
410 
411   inline Token::Value Select(Token::Value tok);
412   inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_);
413 
Scan()414   inline void Scan() {
415     if (is_parsing_json_) {
416       ScanJson();
417     } else {
418       ScanJavaScript();
419     }
420   }
421 
422   // Scans a single JavaScript token.
423   void ScanJavaScript();
424 
425   // Scan a single JSON token. The JSON lexical grammar is specified in the
426   // ECMAScript 5 standard, section 15.12.1.1.
427   // Recognizes all of the single-character tokens directly, or calls a function
428   // to scan a number, string or identifier literal.
429   // The only allowed whitespace characters between tokens are tab,
430   // carrige-return, newline and space.
431   void ScanJson();
432 
433   // A JSON number (production JSONNumber) is a subset of the valid JavaScript
434   // decimal number literals.
435   // It includes an optional minus sign, must have at least one
436   // digit before and after a decimal point, may not have prefixed zeros (unless
437   // the integer part is zero), and may include an exponent part (e.g., "e-10").
438   // Hexadecimal and octal numbers are not allowed.
439   Token::Value ScanJsonNumber();
440   // A JSON string (production JSONString) is subset of valid JavaScript string
441   // literals. The string must only be double-quoted (not single-quoted), and
442   // the only allowed backslash-escapes are ", /, \, b, f, n, r, t and
443   // four-digit hex escapes (uXXXX). Any other use of backslashes is invalid.
444   Token::Value ScanJsonString();
445   // Used to recognizes one of the literals "true", "false", or "null". These
446   // are the only valid JSON identifiers (productions JSONBooleanLiteral,
447   // JSONNullLiteral).
448   Token::Value ScanJsonIdentifier(const char* text, Token::Value token);
449 
450   void ScanDecimalDigits();
451   Token::Value ScanNumber(bool seen_period);
452   Token::Value ScanIdentifier();
453   uc32 ScanHexEscape(uc32 c, int length);
454   uc32 ScanOctalEscape(uc32 c, int length);
455   void ScanEscape();
456   Token::Value ScanString();
457 
458   // Scans a possible HTML comment -- begins with '<!'.
459   Token::Value ScanHtmlComment();
460 
461   // Return the current source position.
source_pos()462   int source_pos() {
463     return source_->pos() - kCharacterLookaheadBufferSize + position_;
464   }
465 
466   // Decodes a unicode escape-sequence which is part of an identifier.
467   // If the escape sequence cannot be decoded the result is kBadRune.
468   uc32 ScanIdentifierUnicodeEscape();
469 };
470 
471 } }  // namespace v8::internal
472 
473 #endif  // V8_SCANNER_H_
474