1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are
4 // met:
5 //
6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided
11 // with the distribution.
12 // * Neither the name of Google Inc. nor the names of its
13 // contributors may be used to endorse or promote products derived
14 // from this software without specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28 // Features shared by parsing and pre-parsing scanners.
29
30 #ifndef V8_SCANNER_BASE_H_
31 #define V8_SCANNER_BASE_H_
32
33 #include "globals.h"
34 #include "checks.h"
35 #include "allocation.h"
36 #include "token.h"
37 #include "unicode-inl.h"
38 #include "char-predicates.h"
39 #include "utils.h"
40 #include "list-inl.h"
41
42 namespace v8 {
43 namespace internal {
44
45 // Returns the value (0 .. 15) of a hexadecimal character c.
46 // If c is not a legal hexadecimal character, returns a value < 0.
HexValue(uc32 c)47 inline int HexValue(uc32 c) {
48 c -= '0';
49 if (static_cast<unsigned>(c) <= 9) return c;
50 c = (c | 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36.
51 if (static_cast<unsigned>(c) <= 5) return c + 10;
52 return -1;
53 }
54
55
56 // ---------------------------------------------------------------------
57 // Buffered stream of characters, using an internal UC16 buffer.
58
59 class UC16CharacterStream {
60 public:
UC16CharacterStream()61 UC16CharacterStream() : pos_(0) { }
~UC16CharacterStream()62 virtual ~UC16CharacterStream() { }
63
64 // Returns and advances past the next UC16 character in the input
65 // stream. If there are no more characters, it returns a negative
66 // value.
Advance()67 inline uc32 Advance() {
68 if (buffer_cursor_ < buffer_end_ || ReadBlock()) {
69 pos_++;
70 return static_cast<uc32>(*(buffer_cursor_++));
71 }
72 // Note: currently the following increment is necessary to avoid a
73 // parser problem! The scanner treats the final kEndOfInput as
74 // a character with a position, and does math relative to that
75 // position.
76 pos_++;
77
78 return kEndOfInput;
79 }
80
81 // Return the current position in the character stream.
82 // Starts at zero.
pos()83 inline unsigned pos() const { return pos_; }
84
85 // Skips forward past the next character_count UC16 characters
86 // in the input, or until the end of input if that comes sooner.
87 // Returns the number of characters actually skipped. If less
88 // than character_count,
SeekForward(unsigned character_count)89 inline unsigned SeekForward(unsigned character_count) {
90 unsigned buffered_chars =
91 static_cast<unsigned>(buffer_end_ - buffer_cursor_);
92 if (character_count <= buffered_chars) {
93 buffer_cursor_ += character_count;
94 pos_ += character_count;
95 return character_count;
96 }
97 return SlowSeekForward(character_count);
98 }
99
100 // Pushes back the most recently read UC16 character (or negative
101 // value if at end of input), i.e., the value returned by the most recent
102 // call to Advance.
103 // Must not be used right after calling SeekForward.
104 virtual void PushBack(int32_t character) = 0;
105
106 protected:
107 static const uc32 kEndOfInput = -1;
108
109 // Ensures that the buffer_cursor_ points to the character at
110 // position pos_ of the input, if possible. If the position
111 // is at or after the end of the input, return false. If there
112 // are more characters available, return true.
113 virtual bool ReadBlock() = 0;
114 virtual unsigned SlowSeekForward(unsigned character_count) = 0;
115
116 const uc16* buffer_cursor_;
117 const uc16* buffer_end_;
118 unsigned pos_;
119 };
120
121
122 class UnicodeCache {
123 // ---------------------------------------------------------------------
124 // Caching predicates used by scanners.
125 public:
UnicodeCache()126 UnicodeCache() {}
127 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
128
utf8_decoder()129 StaticResource<Utf8Decoder>* utf8_decoder() {
130 return &utf8_decoder_;
131 }
132
IsIdentifierStart(unibrow::uchar c)133 bool IsIdentifierStart(unibrow::uchar c) { return kIsIdentifierStart.get(c); }
IsIdentifierPart(unibrow::uchar c)134 bool IsIdentifierPart(unibrow::uchar c) { return kIsIdentifierPart.get(c); }
IsLineTerminator(unibrow::uchar c)135 bool IsLineTerminator(unibrow::uchar c) { return kIsLineTerminator.get(c); }
IsWhiteSpace(unibrow::uchar c)136 bool IsWhiteSpace(unibrow::uchar c) { return kIsWhiteSpace.get(c); }
137
138 private:
139
140 unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;
141 unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;
142 unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;
143 unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;
144 StaticResource<Utf8Decoder> utf8_decoder_;
145
146 DISALLOW_COPY_AND_ASSIGN(UnicodeCache);
147 };
148
149
150 // ----------------------------------------------------------------------------
151 // LiteralBuffer - Collector of chars of literals.
152
153 class LiteralBuffer {
154 public:
LiteralBuffer()155 LiteralBuffer() : is_ascii_(true), position_(0), backing_store_() { }
156
~LiteralBuffer()157 ~LiteralBuffer() {
158 if (backing_store_.length() > 0) {
159 backing_store_.Dispose();
160 }
161 }
162
AddChar(uc16 character)163 inline void AddChar(uc16 character) {
164 if (position_ >= backing_store_.length()) ExpandBuffer();
165 if (is_ascii_) {
166 if (character < kMaxAsciiCharCodeU) {
167 backing_store_[position_] = static_cast<byte>(character);
168 position_ += kASCIISize;
169 return;
170 }
171 ConvertToUC16();
172 }
173 *reinterpret_cast<uc16*>(&backing_store_[position_]) = character;
174 position_ += kUC16Size;
175 }
176
is_ascii()177 bool is_ascii() { return is_ascii_; }
178
uc16_literal()179 Vector<const uc16> uc16_literal() {
180 ASSERT(!is_ascii_);
181 ASSERT((position_ & 0x1) == 0);
182 return Vector<const uc16>(
183 reinterpret_cast<const uc16*>(backing_store_.start()),
184 position_ >> 1);
185 }
186
ascii_literal()187 Vector<const char> ascii_literal() {
188 ASSERT(is_ascii_);
189 return Vector<const char>(
190 reinterpret_cast<const char*>(backing_store_.start()),
191 position_);
192 }
193
length()194 int length() {
195 return is_ascii_ ? position_ : (position_ >> 1);
196 }
197
Reset()198 void Reset() {
199 position_ = 0;
200 is_ascii_ = true;
201 }
202 private:
203 static const int kInitialCapacity = 16;
204 static const int kGrowthFactory = 4;
205 static const int kMinConversionSlack = 256;
206 static const int kMaxGrowth = 1 * MB;
NewCapacity(int min_capacity)207 inline int NewCapacity(int min_capacity) {
208 int capacity = Max(min_capacity, backing_store_.length());
209 int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth);
210 return new_capacity;
211 }
212
ExpandBuffer()213 void ExpandBuffer() {
214 Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));
215 memcpy(new_store.start(), backing_store_.start(), position_);
216 backing_store_.Dispose();
217 backing_store_ = new_store;
218 }
219
ConvertToUC16()220 void ConvertToUC16() {
221 ASSERT(is_ascii_);
222 Vector<byte> new_store;
223 int new_content_size = position_ * kUC16Size;
224 if (new_content_size >= backing_store_.length()) {
225 // Ensure room for all currently read characters as UC16 as well
226 // as the character about to be stored.
227 new_store = Vector<byte>::New(NewCapacity(new_content_size));
228 } else {
229 new_store = backing_store_;
230 }
231 char* src = reinterpret_cast<char*>(backing_store_.start());
232 uc16* dst = reinterpret_cast<uc16*>(new_store.start());
233 for (int i = position_ - 1; i >= 0; i--) {
234 dst[i] = src[i];
235 }
236 if (new_store.start() != backing_store_.start()) {
237 backing_store_.Dispose();
238 backing_store_ = new_store;
239 }
240 position_ = new_content_size;
241 is_ascii_ = false;
242 }
243
244 bool is_ascii_;
245 int position_;
246 Vector<byte> backing_store_;
247
248 DISALLOW_COPY_AND_ASSIGN(LiteralBuffer);
249 };
250
251
252 // ----------------------------------------------------------------------------
253 // Scanner base-class.
254
255 // Generic functionality used by both JSON and JavaScript scanners.
256 class Scanner {
257 public:
258 // -1 is outside of the range of any real source code.
259 static const int kNoOctalLocation = -1;
260
261 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
262
263 class LiteralScope {
264 public:
265 explicit LiteralScope(Scanner* self);
266 ~LiteralScope();
267 void Complete();
268
269 private:
270 Scanner* scanner_;
271 bool complete_;
272 };
273
274 explicit Scanner(UnicodeCache* scanner_contants);
275
276 // Returns the current token again.
current_token()277 Token::Value current_token() { return current_.token; }
278
279 // One token look-ahead (past the token returned by Next()).
peek()280 Token::Value peek() const { return next_.token; }
281
282 struct Location {
LocationLocation283 Location(int b, int e) : beg_pos(b), end_pos(e) { }
LocationLocation284 Location() : beg_pos(0), end_pos(0) { }
285
IsValidLocation286 bool IsValid() const {
287 return beg_pos >= 0 && end_pos >= beg_pos;
288 }
289
290 int beg_pos;
291 int end_pos;
292 };
293
NoLocation()294 static Location NoLocation() {
295 return Location(-1, -1);
296 }
297
298 // Returns the location information for the current token
299 // (the token returned by Next()).
location()300 Location location() const { return current_.location; }
peek_location()301 Location peek_location() const { return next_.location; }
302
303 // Returns the location of the last seen octal literal
octal_position()304 int octal_position() const { return octal_pos_; }
clear_octal_position()305 void clear_octal_position() { octal_pos_ = -1; }
306
307 // Returns the literal string, if any, for the current token (the
308 // token returned by Next()). The string is 0-terminated and in
309 // UTF-8 format; they may contain 0-characters. Literal strings are
310 // collected for identifiers, strings, and numbers.
311 // These functions only give the correct result if the literal
312 // was scanned between calls to StartLiteral() and TerminateLiteral().
is_literal_ascii()313 bool is_literal_ascii() {
314 ASSERT_NOT_NULL(current_.literal_chars);
315 return current_.literal_chars->is_ascii();
316 }
literal_ascii_string()317 Vector<const char> literal_ascii_string() {
318 ASSERT_NOT_NULL(current_.literal_chars);
319 return current_.literal_chars->ascii_literal();
320 }
literal_uc16_string()321 Vector<const uc16> literal_uc16_string() {
322 ASSERT_NOT_NULL(current_.literal_chars);
323 return current_.literal_chars->uc16_literal();
324 }
literal_length()325 int literal_length() const {
326 ASSERT_NOT_NULL(current_.literal_chars);
327 return current_.literal_chars->length();
328 }
329
330 // Returns the literal string for the next token (the token that
331 // would be returned if Next() were called).
is_next_literal_ascii()332 bool is_next_literal_ascii() {
333 ASSERT_NOT_NULL(next_.literal_chars);
334 return next_.literal_chars->is_ascii();
335 }
next_literal_ascii_string()336 Vector<const char> next_literal_ascii_string() {
337 ASSERT_NOT_NULL(next_.literal_chars);
338 return next_.literal_chars->ascii_literal();
339 }
next_literal_uc16_string()340 Vector<const uc16> next_literal_uc16_string() {
341 ASSERT_NOT_NULL(next_.literal_chars);
342 return next_.literal_chars->uc16_literal();
343 }
next_literal_length()344 int next_literal_length() const {
345 ASSERT_NOT_NULL(next_.literal_chars);
346 return next_.literal_chars->length();
347 }
348
349 static const int kCharacterLookaheadBufferSize = 1;
350
351 protected:
352 // The current and look-ahead token.
353 struct TokenDesc {
354 Token::Value token;
355 Location location;
356 LiteralBuffer* literal_chars;
357 };
358
359 // Call this after setting source_ to the input.
Init()360 void Init() {
361 // Set c0_ (one character ahead)
362 ASSERT(kCharacterLookaheadBufferSize == 1);
363 Advance();
364 // Initialize current_ to not refer to a literal.
365 current_.literal_chars = NULL;
366 }
367
368 // Literal buffer support
StartLiteral()369 inline void StartLiteral() {
370 LiteralBuffer* free_buffer = (current_.literal_chars == &literal_buffer1_) ?
371 &literal_buffer2_ : &literal_buffer1_;
372 free_buffer->Reset();
373 next_.literal_chars = free_buffer;
374 }
375
AddLiteralChar(uc32 c)376 inline void AddLiteralChar(uc32 c) {
377 ASSERT_NOT_NULL(next_.literal_chars);
378 next_.literal_chars->AddChar(c);
379 }
380
381 // Complete scanning of a literal.
TerminateLiteral()382 inline void TerminateLiteral() {
383 // Does nothing in the current implementation.
384 }
385
386 // Stops scanning of a literal and drop the collected characters,
387 // e.g., due to an encountered error.
DropLiteral()388 inline void DropLiteral() {
389 next_.literal_chars = NULL;
390 }
391
AddLiteralCharAdvance()392 inline void AddLiteralCharAdvance() {
393 AddLiteralChar(c0_);
394 Advance();
395 }
396
397 // Low-level scanning support.
Advance()398 void Advance() { c0_ = source_->Advance(); }
PushBack(uc32 ch)399 void PushBack(uc32 ch) {
400 source_->PushBack(c0_);
401 c0_ = ch;
402 }
403
Select(Token::Value tok)404 inline Token::Value Select(Token::Value tok) {
405 Advance();
406 return tok;
407 }
408
Select(uc32 next,Token::Value then,Token::Value else_)409 inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) {
410 Advance();
411 if (c0_ == next) {
412 Advance();
413 return then;
414 } else {
415 return else_;
416 }
417 }
418
419 uc32 ScanHexEscape(uc32 c, int length);
420
421 // Scans octal escape sequence. Also accepts "\0" decimal escape sequence.
422 uc32 ScanOctalEscape(uc32 c, int length);
423
424 // Return the current source position.
source_pos()425 int source_pos() {
426 return source_->pos() - kCharacterLookaheadBufferSize;
427 }
428
429 UnicodeCache* unicode_cache_;
430
431 // Buffers collecting literal strings, numbers, etc.
432 LiteralBuffer literal_buffer1_;
433 LiteralBuffer literal_buffer2_;
434
435 TokenDesc current_; // desc for current token (as returned by Next())
436 TokenDesc next_; // desc for next token (one token look-ahead)
437
438 // Input stream. Must be initialized to an UC16CharacterStream.
439 UC16CharacterStream* source_;
440
441 // Start position of the octal literal last scanned.
442 int octal_pos_;
443
444 // One Unicode character look-ahead; c0_ < 0 at the end of the input.
445 uc32 c0_;
446 };
447
448 // ----------------------------------------------------------------------------
449 // JavaScriptScanner - base logic for JavaScript scanning.
450
451 class JavaScriptScanner : public Scanner {
452 public:
453 // A LiteralScope that disables recording of some types of JavaScript
454 // literals. If the scanner is configured to not record the specific
455 // type of literal, the scope will not call StartLiteral.
456 class LiteralScope {
457 public:
LiteralScope(JavaScriptScanner * self)458 explicit LiteralScope(JavaScriptScanner* self)
459 : scanner_(self), complete_(false) {
460 scanner_->StartLiteral();
461 }
~LiteralScope()462 ~LiteralScope() {
463 if (!complete_) scanner_->DropLiteral();
464 }
Complete()465 void Complete() {
466 scanner_->TerminateLiteral();
467 complete_ = true;
468 }
469
470 private:
471 JavaScriptScanner* scanner_;
472 bool complete_;
473 };
474
475 explicit JavaScriptScanner(UnicodeCache* scanner_contants);
476
477 // Returns the next token.
478 Token::Value Next();
479
480 // Returns true if there was a line terminator before the peek'ed token.
has_line_terminator_before_next()481 bool has_line_terminator_before_next() const {
482 return has_line_terminator_before_next_;
483 }
484
485 // Scans the input as a regular expression pattern, previous
486 // character(s) must be /(=). Returns true if a pattern is scanned.
487 bool ScanRegExpPattern(bool seen_equal);
488 // Returns true if regexp flags are scanned (always since flags can
489 // be empty).
490 bool ScanRegExpFlags();
491
492 // Tells whether the buffer contains an identifier (no escapes).
493 // Used for checking if a property name is an identifier.
494 static bool IsIdentifier(unibrow::CharacterStream* buffer);
495
496 // Seek forward to the given position. This operation does not
497 // work in general, for instance when there are pushed back
498 // characters, but works for seeking forward until simple delimiter
499 // tokens, which is what it is used for.
500 void SeekForward(int pos);
501
502 protected:
503 bool SkipWhiteSpace();
504 Token::Value SkipSingleLineComment();
505 Token::Value SkipMultiLineComment();
506
507 // Scans a single JavaScript token.
508 void Scan();
509
510 void ScanDecimalDigits();
511 Token::Value ScanNumber(bool seen_period);
512 Token::Value ScanIdentifierOrKeyword();
513 Token::Value ScanIdentifierSuffix(LiteralScope* literal);
514
515 void ScanEscape();
516 Token::Value ScanString();
517
518 // Scans a possible HTML comment -- begins with '<!'.
519 Token::Value ScanHtmlComment();
520
521 // Decodes a unicode escape-sequence which is part of an identifier.
522 // If the escape sequence cannot be decoded the result is kBadChar.
523 uc32 ScanIdentifierUnicodeEscape();
524
525 bool has_line_terminator_before_next_;
526 };
527
528
529 // ----------------------------------------------------------------------------
530 // Keyword matching state machine.
531
532 class KeywordMatcher {
533 // Incrementally recognize keywords.
534 //
535 // Recognized keywords:
536 // break case catch const* continue debugger* default delete do else
537 // finally false for function if in instanceof native* new null
538 // return switch this throw true try typeof var void while with
539 //
540 // *: Actually "future reserved keywords". These are the only ones we
541 // recognize, the remaining are allowed as identifiers.
542 // In ES5 strict mode, we should disallow all reserved keywords.
543 public:
KeywordMatcher()544 KeywordMatcher()
545 : state_(INITIAL),
546 token_(Token::IDENTIFIER),
547 keyword_(NULL),
548 counter_(0),
549 keyword_token_(Token::ILLEGAL) {}
550
token()551 Token::Value token() { return token_; }
552
AddChar(unibrow::uchar input)553 inline bool AddChar(unibrow::uchar input) {
554 if (state_ != UNMATCHABLE) {
555 Step(input);
556 }
557 return state_ != UNMATCHABLE;
558 }
559
Fail()560 void Fail() {
561 token_ = Token::IDENTIFIER;
562 state_ = UNMATCHABLE;
563 }
564
565 private:
566 enum State {
567 UNMATCHABLE,
568 INITIAL,
569 KEYWORD_PREFIX,
570 KEYWORD_MATCHED,
571 C,
572 CA,
573 CO,
574 CON,
575 D,
576 DE,
577 E,
578 EX,
579 F,
580 I,
581 IM,
582 IMP,
583 IN,
584 N,
585 P,
586 PR,
587 S,
588 T,
589 TH,
590 TR,
591 V,
592 W
593 };
594
595 struct FirstState {
596 const char* keyword;
597 State state;
598 Token::Value token;
599 };
600
601 // Range of possible first characters of a keyword.
602 static const unsigned int kFirstCharRangeMin = 'b';
603 static const unsigned int kFirstCharRangeMax = 'y';
604 static const unsigned int kFirstCharRangeLength =
605 kFirstCharRangeMax - kFirstCharRangeMin + 1;
606 // State map for first keyword character range.
607 static FirstState first_states_[kFirstCharRangeLength];
608
609 // If input equals keyword's character at position, continue matching keyword
610 // from that position.
MatchKeywordStart(unibrow::uchar input,const char * keyword,int position,Token::Value token_if_match)611 inline bool MatchKeywordStart(unibrow::uchar input,
612 const char* keyword,
613 int position,
614 Token::Value token_if_match) {
615 if (input != static_cast<unibrow::uchar>(keyword[position])) {
616 return false;
617 }
618 state_ = KEYWORD_PREFIX;
619 this->keyword_ = keyword;
620 this->counter_ = position + 1;
621 this->keyword_token_ = token_if_match;
622 return true;
623 }
624
625 // If input equals match character, transition to new state and return true.
MatchState(unibrow::uchar input,char match,State new_state)626 inline bool MatchState(unibrow::uchar input, char match, State new_state) {
627 if (input != static_cast<unibrow::uchar>(match)) {
628 return false;
629 }
630 state_ = new_state;
631 return true;
632 }
633
MatchKeyword(unibrow::uchar input,char match,State new_state,Token::Value keyword_token)634 inline bool MatchKeyword(unibrow::uchar input,
635 char match,
636 State new_state,
637 Token::Value keyword_token) {
638 if (input != static_cast<unibrow::uchar>(match)) {
639 return false;
640 }
641 state_ = new_state;
642 token_ = keyword_token;
643 return true;
644 }
645
646 void Step(unibrow::uchar input);
647
648 // Current state.
649 State state_;
650 // Token for currently added characters.
651 Token::Value token_;
652
653 // Matching a specific keyword string (there is only one possible valid
654 // keyword with the current prefix).
655 const char* keyword_;
656 int counter_;
657 Token::Value keyword_token_;
658 };
659
660
661 } } // namespace v8::internal
662
663 #endif // V8_SCANNER_BASE_H_
664