// Copyright 2006-2008 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 28 #ifndef V8_SCANNER_H_ 29 #define V8_SCANNER_H_ 30 31 #include "token.h" 32 #include "char-predicates-inl.h" 33 34 namespace v8 { 35 namespace internal { 36 37 38 class UTF8Buffer { 39 public: 40 UTF8Buffer(); 41 ~UTF8Buffer(); 42 AddChar(uc32 c)43 void AddChar(uc32 c) { 44 ASSERT_NOT_NULL(data_); 45 if (cursor_ <= limit_ && 46 static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) { 47 *cursor_++ = static_cast<char>(c); 48 } else { 49 AddCharSlow(c); 50 } 51 } 52 Reset()53 void Reset() { 54 if (data_ == NULL) { 55 data_ = NewArray<char>(kInitialCapacity); 56 limit_ = ComputeLimit(data_, kInitialCapacity); 57 } 58 cursor_ = data_; 59 } 60 pos()61 int pos() const { 62 ASSERT_NOT_NULL(data_); 63 return static_cast<int>(cursor_ - data_); 64 } 65 data()66 char* data() const { return data_; } 67 68 private: 69 static const int kInitialCapacity = 256; 70 char* data_; 71 char* cursor_; 72 char* limit_; 73 Capacity()74 int Capacity() const { 75 ASSERT_NOT_NULL(data_); 76 return static_cast<int>(limit_ - data_) + unibrow::Utf8::kMaxEncodedSize; 77 } 78 ComputeLimit(char * data,int capacity)79 static char* ComputeLimit(char* data, int capacity) { 80 return (data + capacity) - unibrow::Utf8::kMaxEncodedSize; 81 } 82 83 void AddCharSlow(uc32 c); 84 }; 85 86 87 class UTF16Buffer { 88 public: 89 UTF16Buffer(); ~UTF16Buffer()90 virtual ~UTF16Buffer() {} 91 92 virtual void PushBack(uc32 ch) = 0; 93 // returns a value < 0 when the buffer end is reached 94 virtual uc32 Advance() = 0; 95 virtual void SeekForward(int pos) = 0; 96 pos()97 int pos() const { return pos_; } size()98 int size() const { return size_; } 99 Handle<String> SubString(int start, int end); 100 101 protected: 102 Handle<String> data_; 103 int pos_; 104 int size_; 105 }; 106 107 108 class CharacterStreamUTF16Buffer: public UTF16Buffer { 109 public: 110 CharacterStreamUTF16Buffer(); ~CharacterStreamUTF16Buffer()111 virtual ~CharacterStreamUTF16Buffer() {} 112 void Initialize(Handle<String> data, 
unibrow::CharacterStream* stream); 113 virtual void PushBack(uc32 ch); 114 virtual uc32 Advance(); 115 virtual void SeekForward(int pos); 116 117 private: 118 List<uc32> pushback_buffer_; 119 uc32 last_; 120 unibrow::CharacterStream* stream_; 121 pushback_buffer()122 List<uc32>* pushback_buffer() { return &pushback_buffer_; } 123 }; 124 125 126 class TwoByteStringUTF16Buffer: public UTF16Buffer { 127 public: 128 TwoByteStringUTF16Buffer(); ~TwoByteStringUTF16Buffer()129 virtual ~TwoByteStringUTF16Buffer() {} 130 void Initialize(Handle<ExternalTwoByteString> data); 131 virtual void PushBack(uc32 ch); 132 virtual uc32 Advance(); 133 virtual void SeekForward(int pos); 134 135 private: 136 const uint16_t* raw_data_; 137 }; 138 139 140 class KeywordMatcher { 141 // Incrementally recognize keywords. 142 // 143 // Recognized keywords: 144 // break case catch const* continue debugger* default delete do else 145 // finally false for function if in instanceof native* new null 146 // return switch this throw true try typeof var void while with 147 // 148 // *: Actually "future reserved keywords". These are the only ones we 149 // recognized, the remaining are allowed as identifiers. 150 public: KeywordMatcher()151 KeywordMatcher() : state_(INITIAL), token_(Token::IDENTIFIER) {} 152 token()153 Token::Value token() { return token_; } 154 AddChar(uc32 input)155 inline void AddChar(uc32 input) { 156 if (state_ != UNMATCHABLE) { 157 Step(input); 158 } 159 } 160 Fail()161 void Fail() { 162 token_ = Token::IDENTIFIER; 163 state_ = UNMATCHABLE; 164 } 165 166 private: 167 enum State { 168 UNMATCHABLE, 169 INITIAL, 170 KEYWORD_PREFIX, 171 KEYWORD_MATCHED, 172 C, 173 CA, 174 CO, 175 CON, 176 D, 177 DE, 178 F, 179 I, 180 IN, 181 N, 182 T, 183 TH, 184 TR, 185 V, 186 W 187 }; 188 189 struct FirstState { 190 const char* keyword; 191 State state; 192 Token::Value token; 193 }; 194 195 // Range of possible first characters of a keyword. 
196 static const unsigned int kFirstCharRangeMin = 'b'; 197 static const unsigned int kFirstCharRangeMax = 'w'; 198 static const unsigned int kFirstCharRangeLength = 199 kFirstCharRangeMax - kFirstCharRangeMin + 1; 200 // State map for first keyword character range. 201 static FirstState first_states_[kFirstCharRangeLength]; 202 203 // Current state. 204 State state_; 205 // Token for currently added characters. 206 Token::Value token_; 207 208 // Matching a specific keyword string (there is only one possible valid 209 // keyword with the current prefix). 210 const char* keyword_; 211 int counter_; 212 Token::Value keyword_token_; 213 214 // If input equals keyword's character at position, continue matching keyword 215 // from that position. MatchKeywordStart(uc32 input,const char * keyword,int position,Token::Value token_if_match)216 inline bool MatchKeywordStart(uc32 input, 217 const char* keyword, 218 int position, 219 Token::Value token_if_match) { 220 if (input == keyword[position]) { 221 state_ = KEYWORD_PREFIX; 222 this->keyword_ = keyword; 223 this->counter_ = position + 1; 224 this->keyword_token_ = token_if_match; 225 return true; 226 } 227 return false; 228 } 229 230 // If input equals match character, transition to new state and return true. MatchState(uc32 input,char match,State new_state)231 inline bool MatchState(uc32 input, char match, State new_state) { 232 if (input == match) { 233 state_ = new_state; 234 return true; 235 } 236 return false; 237 } 238 MatchKeyword(uc32 input,char match,State new_state,Token::Value keyword_token)239 inline bool MatchKeyword(uc32 input, 240 char match, 241 State new_state, 242 Token::Value keyword_token) { 243 if (input == match) { // Matched "do". 
244 state_ = new_state; 245 token_ = keyword_token; 246 return true; 247 } 248 return false; 249 } 250 251 void Step(uc32 input); 252 }; 253 254 255 enum ParserMode { PARSE, PREPARSE }; 256 enum ParserLanguage { JAVASCRIPT, JSON }; 257 258 259 class Scanner { 260 public: 261 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; 262 263 // Construction 264 explicit Scanner(ParserMode parse_mode); 265 266 // Initialize the Scanner to scan source: 267 void Init(Handle<String> source, 268 unibrow::CharacterStream* stream, 269 int position, 270 ParserLanguage language); 271 272 // Returns the next token. 273 Token::Value Next(); 274 275 // One token look-ahead (past the token returned by Next()). peek()276 Token::Value peek() const { return next_.token; } 277 278 // Returns true if there was a line terminator before the peek'ed token. has_line_terminator_before_next()279 bool has_line_terminator_before_next() const { 280 return has_line_terminator_before_next_; 281 } 282 283 struct Location { LocationLocation284 Location(int b, int e) : beg_pos(b), end_pos(e) { } LocationLocation285 Location() : beg_pos(0), end_pos(0) { } 286 int beg_pos; 287 int end_pos; 288 }; 289 290 // Returns the location information for the current token 291 // (the token returned by Next()). location()292 Location location() const { return current_.location; } peek_location()293 Location peek_location() const { return next_.location; } 294 295 // Returns the literal string, if any, for the current token (the 296 // token returned by Next()). The string is 0-terminated and in 297 // UTF-8 format; they may contain 0-characters. Literal strings are 298 // collected for identifiers, strings, and numbers. 299 // These functions only give the correct result if the literal 300 // was scanned between calls to StartLiteral() and TerminateLiteral(). 
literal_string()301 const char* literal_string() const { 302 return current_.literal_buffer->data(); 303 } literal_length()304 int literal_length() const { 305 // Excluding terminal '\0' added by TerminateLiteral(). 306 return current_.literal_buffer->pos() - 1; 307 } 308 309 // Returns the literal string for the next token (the token that 310 // would be returned if Next() were called). next_literal_string()311 const char* next_literal_string() const { 312 return next_.literal_buffer->data(); 313 } 314 // Returns the length of the next token (that would be returned if 315 // Next() were called). next_literal_length()316 int next_literal_length() const { 317 return next_.literal_buffer->pos() - 1; 318 } 319 next_literal()320 Vector<const char> next_literal() const { 321 return Vector<const char>(next_literal_string(), 322 next_literal_length()); 323 } 324 325 // Scans the input as a regular expression pattern, previous 326 // character(s) must be /(=). Returns true if a pattern is scanned. 327 bool ScanRegExpPattern(bool seen_equal); 328 // Returns true if regexp flags are scanned (always since flags can 329 // be empty). 330 bool ScanRegExpFlags(); 331 332 // Seek forward to the given position. This operation does not 333 // work in general, for instance when there are pushed back 334 // characters, but works for seeking forward until simple delimiter 335 // tokens, which is what it is used for. 336 void SeekForward(int pos); 337 338 Handle<String> SubString(int start_pos, int end_pos); stack_overflow()339 bool stack_overflow() { return stack_overflow_; } 340 utf8_decoder()341 static StaticResource<Utf8Decoder>* utf8_decoder() { return &utf8_decoder_; } 342 343 // Tells whether the buffer contains an identifier (no escapes). 344 // Used for checking if a property name is an identifier. 
345 static bool IsIdentifier(unibrow::CharacterStream* buffer); 346 347 static unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart; 348 static unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart; 349 static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator; 350 static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace; 351 352 static const int kCharacterLookaheadBufferSize = 1; 353 354 private: 355 CharacterStreamUTF16Buffer char_stream_buffer_; 356 TwoByteStringUTF16Buffer two_byte_string_buffer_; 357 358 // Source. 359 UTF16Buffer* source_; 360 int position_; 361 362 // Buffer to hold literal values (identifiers, strings, numbers) 363 // using 0-terminated UTF-8 encoding. 364 UTF8Buffer literal_buffer_1_; 365 UTF8Buffer literal_buffer_2_; 366 367 bool stack_overflow_; 368 static StaticResource<Utf8Decoder> utf8_decoder_; 369 370 // One Unicode character look-ahead; c0_ < 0 at the end of the input. 371 uc32 c0_; 372 373 // The current and look-ahead token. 374 struct TokenDesc { 375 Token::Value token; 376 Location location; 377 UTF8Buffer* literal_buffer; 378 }; 379 380 TokenDesc current_; // desc for current token (as returned by Next()) 381 TokenDesc next_; // desc for next token (one token look-ahead) 382 bool has_line_terminator_before_next_; 383 bool is_pre_parsing_; 384 bool is_parsing_json_; 385 386 // Literal buffer support 387 void StartLiteral(); 388 void AddChar(uc32 ch); 389 void AddCharAdvance(); 390 void TerminateLiteral(); 391 392 // Low-level scanning support. 
Advance()393 void Advance() { c0_ = source_->Advance(); } PushBack(uc32 ch)394 void PushBack(uc32 ch) { 395 source_->PushBack(ch); 396 c0_ = ch; 397 } 398 SkipWhiteSpace()399 bool SkipWhiteSpace() { 400 if (is_parsing_json_) { 401 return SkipJsonWhiteSpace(); 402 } else { 403 return SkipJavaScriptWhiteSpace(); 404 } 405 } 406 bool SkipJavaScriptWhiteSpace(); 407 bool SkipJsonWhiteSpace(); 408 Token::Value SkipSingleLineComment(); 409 Token::Value SkipMultiLineComment(); 410 411 inline Token::Value Select(Token::Value tok); 412 inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_); 413 Scan()414 inline void Scan() { 415 if (is_parsing_json_) { 416 ScanJson(); 417 } else { 418 ScanJavaScript(); 419 } 420 } 421 422 // Scans a single JavaScript token. 423 void ScanJavaScript(); 424 425 // Scan a single JSON token. The JSON lexical grammar is specified in the 426 // ECMAScript 5 standard, section 15.12.1.1. 427 // Recognizes all of the single-character tokens directly, or calls a function 428 // to scan a number, string or identifier literal. 429 // The only allowed whitespace characters between tokens are tab, 430 // carrige-return, newline and space. 431 void ScanJson(); 432 433 // A JSON number (production JSONNumber) is a subset of the valid JavaScript 434 // decimal number literals. 435 // It includes an optional minus sign, must have at least one 436 // digit before and after a decimal point, may not have prefixed zeros (unless 437 // the integer part is zero), and may include an exponent part (e.g., "e-10"). 438 // Hexadecimal and octal numbers are not allowed. 439 Token::Value ScanJsonNumber(); 440 // A JSON string (production JSONString) is subset of valid JavaScript string 441 // literals. The string must only be double-quoted (not single-quoted), and 442 // the only allowed backslash-escapes are ", /, \, b, f, n, r, t and 443 // four-digit hex escapes (uXXXX). Any other use of backslashes is invalid. 
444 Token::Value ScanJsonString(); 445 // Used to recognizes one of the literals "true", "false", or "null". These 446 // are the only valid JSON identifiers (productions JSONBooleanLiteral, 447 // JSONNullLiteral). 448 Token::Value ScanJsonIdentifier(const char* text, Token::Value token); 449 450 void ScanDecimalDigits(); 451 Token::Value ScanNumber(bool seen_period); 452 Token::Value ScanIdentifier(); 453 uc32 ScanHexEscape(uc32 c, int length); 454 uc32 ScanOctalEscape(uc32 c, int length); 455 void ScanEscape(); 456 Token::Value ScanString(); 457 458 // Scans a possible HTML comment -- begins with '<!'. 459 Token::Value ScanHtmlComment(); 460 461 // Return the current source position. source_pos()462 int source_pos() { 463 return source_->pos() - kCharacterLookaheadBufferSize + position_; 464 } 465 466 // Decodes a unicode escape-sequence which is part of an identifier. 467 // If the escape sequence cannot be decoded the result is kBadRune. 468 uc32 ScanIdentifierUnicodeEscape(); 469 }; 470 471 } } // namespace v8::internal 472 473 #endif // V8_SCANNER_H_ 474