1 /*
2 * Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
3 * Copyright (C) 2006, 2007, 2008, 2009 Apple Inc. All Rights Reserved.
4 * Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
5 * Copyright (C) 2010 Zoltan Herczeg (zherczeg@inf.u-szeged.hu)
6 *
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Library General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Library General Public License for more details.
16 *
17 * You should have received a copy of the GNU Library General Public License
18 * along with this library; see the file COPYING.LIB. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20 * Boston, MA 02110-1301, USA.
21 *
22 */
23
24 #include "config.h"
25 #include "Lexer.h"
26
27 #include "JSFunction.h"
28
29 #include "JSGlobalObjectFunctions.h"
30 #include "Identifier.h"
31 #include "NodeInfo.h"
32 #include "Nodes.h"
33 #include "dtoa.h"
34 #include <ctype.h>
35 #include <limits.h>
36 #include <string.h>
37 #include <wtf/Assertions.h>
38
39 using namespace WTF;
40 using namespace Unicode;
41
42 #include "JSParser.h"
43 #include "Lookup.h"
44 #include "Lexer.lut.h"
45
46 namespace JSC {
47
48
// Classification assigned to a source character; the lexer's main switch
// dispatches on these values.
enum CharacterType {
    // These three must remain the smallest values, in this order: isIdentStart
    // and isIdentPart compare table entries against them to recognize ASCII
    // alphabetic and alphanumeric characters.
    CharacterIdentifierStart = 0,
    CharacterZero = 1,
    CharacterNumber = 2,

    // Punctuation and structural characters.
    CharacterInvalid,
    CharacterLineTerminator,
    CharacterExclamationMark,
    CharacterOpenParen,
    CharacterCloseParen,
    CharacterOpenBracket,
    CharacterCloseBracket,
    CharacterComma,
    CharacterColon,
    CharacterQuestion,
    CharacterTilde,
    CharacterQuote,
    CharacterDot,
    CharacterSlash,
    CharacterBackSlash,
    CharacterSemicolon,
    CharacterOpenBrace,
    CharacterCloseBrace,

    // Characters that start an operator.
    CharacterAdd,
    CharacterSub,
    CharacterMultiply,
    CharacterModulo,
    CharacterAnd,
    CharacterXor,
    CharacterOr,
    CharacterLess,
    CharacterGreater,
    CharacterEqual,

    // Other types (only one so far).
    CharacterWhiteSpace
};
91
92 // 128 ASCII codes
93 static const unsigned short typesOfASCIICharacters[128] = {
94 /* 0 - Null */ CharacterInvalid,
95 /* 1 - Start of Heading */ CharacterInvalid,
96 /* 2 - Start of Text */ CharacterInvalid,
97 /* 3 - End of Text */ CharacterInvalid,
98 /* 4 - End of Transm. */ CharacterInvalid,
99 /* 5 - Enquiry */ CharacterInvalid,
100 /* 6 - Acknowledgment */ CharacterInvalid,
101 /* 7 - Bell */ CharacterInvalid,
102 /* 8 - Back Space */ CharacterInvalid,
103 /* 9 - Horizontal Tab */ CharacterWhiteSpace,
104 /* 10 - Line Feed */ CharacterLineTerminator,
105 /* 11 - Vertical Tab */ CharacterWhiteSpace,
106 /* 12 - Form Feed */ CharacterWhiteSpace,
107 /* 13 - Carriage Return */ CharacterLineTerminator,
108 /* 14 - Shift Out */ CharacterInvalid,
109 /* 15 - Shift In */ CharacterInvalid,
110 /* 16 - Data Line Escape */ CharacterInvalid,
111 /* 17 - Device Control 1 */ CharacterInvalid,
112 /* 18 - Device Control 2 */ CharacterInvalid,
113 /* 19 - Device Control 3 */ CharacterInvalid,
114 /* 20 - Device Control 4 */ CharacterInvalid,
115 /* 21 - Negative Ack. */ CharacterInvalid,
116 /* 22 - Synchronous Idle */ CharacterInvalid,
117 /* 23 - End of Transmit */ CharacterInvalid,
118 /* 24 - Cancel */ CharacterInvalid,
119 /* 25 - End of Medium */ CharacterInvalid,
120 /* 26 - Substitute */ CharacterInvalid,
121 /* 27 - Escape */ CharacterInvalid,
122 /* 28 - File Separator */ CharacterInvalid,
123 /* 29 - Group Separator */ CharacterInvalid,
124 /* 30 - Record Separator */ CharacterInvalid,
125 /* 31 - Unit Separator */ CharacterInvalid,
126 /* 32 - Space */ CharacterWhiteSpace,
127 /* 33 - ! */ CharacterExclamationMark,
128 /* 34 - " */ CharacterQuote,
129 /* 35 - # */ CharacterInvalid,
130 /* 36 - $ */ CharacterIdentifierStart,
131 /* 37 - % */ CharacterModulo,
132 /* 38 - & */ CharacterAnd,
133 /* 39 - ' */ CharacterQuote,
134 /* 40 - ( */ CharacterOpenParen,
135 /* 41 - ) */ CharacterCloseParen,
136 /* 42 - * */ CharacterMultiply,
137 /* 43 - + */ CharacterAdd,
138 /* 44 - , */ CharacterComma,
139 /* 45 - - */ CharacterSub,
140 /* 46 - . */ CharacterDot,
141 /* 47 - / */ CharacterSlash,
142 /* 48 - 0 */ CharacterZero,
143 /* 49 - 1 */ CharacterNumber,
144 /* 50 - 2 */ CharacterNumber,
145 /* 51 - 3 */ CharacterNumber,
146 /* 52 - 4 */ CharacterNumber,
147 /* 53 - 5 */ CharacterNumber,
148 /* 54 - 6 */ CharacterNumber,
149 /* 55 - 7 */ CharacterNumber,
150 /* 56 - 8 */ CharacterNumber,
151 /* 57 - 9 */ CharacterNumber,
152 /* 58 - : */ CharacterColon,
153 /* 59 - ; */ CharacterSemicolon,
154 /* 60 - < */ CharacterLess,
155 /* 61 - = */ CharacterEqual,
156 /* 62 - > */ CharacterGreater,
157 /* 63 - ? */ CharacterQuestion,
158 /* 64 - @ */ CharacterInvalid,
159 /* 65 - A */ CharacterIdentifierStart,
160 /* 66 - B */ CharacterIdentifierStart,
161 /* 67 - C */ CharacterIdentifierStart,
162 /* 68 - D */ CharacterIdentifierStart,
163 /* 69 - E */ CharacterIdentifierStart,
164 /* 70 - F */ CharacterIdentifierStart,
165 /* 71 - G */ CharacterIdentifierStart,
166 /* 72 - H */ CharacterIdentifierStart,
167 /* 73 - I */ CharacterIdentifierStart,
168 /* 74 - J */ CharacterIdentifierStart,
169 /* 75 - K */ CharacterIdentifierStart,
170 /* 76 - L */ CharacterIdentifierStart,
171 /* 77 - M */ CharacterIdentifierStart,
172 /* 78 - N */ CharacterIdentifierStart,
173 /* 79 - O */ CharacterIdentifierStart,
174 /* 80 - P */ CharacterIdentifierStart,
175 /* 81 - Q */ CharacterIdentifierStart,
176 /* 82 - R */ CharacterIdentifierStart,
177 /* 83 - S */ CharacterIdentifierStart,
178 /* 84 - T */ CharacterIdentifierStart,
179 /* 85 - U */ CharacterIdentifierStart,
180 /* 86 - V */ CharacterIdentifierStart,
181 /* 87 - W */ CharacterIdentifierStart,
182 /* 88 - X */ CharacterIdentifierStart,
183 /* 89 - Y */ CharacterIdentifierStart,
184 /* 90 - Z */ CharacterIdentifierStart,
185 /* 91 - [ */ CharacterOpenBracket,
186 /* 92 - \ */ CharacterBackSlash,
187 /* 93 - ] */ CharacterCloseBracket,
188 /* 94 - ^ */ CharacterXor,
189 /* 95 - _ */ CharacterIdentifierStart,
190 /* 96 - ` */ CharacterInvalid,
191 /* 97 - a */ CharacterIdentifierStart,
192 /* 98 - b */ CharacterIdentifierStart,
193 /* 99 - c */ CharacterIdentifierStart,
194 /* 100 - d */ CharacterIdentifierStart,
195 /* 101 - e */ CharacterIdentifierStart,
196 /* 102 - f */ CharacterIdentifierStart,
197 /* 103 - g */ CharacterIdentifierStart,
198 /* 104 - h */ CharacterIdentifierStart,
199 /* 105 - i */ CharacterIdentifierStart,
200 /* 106 - j */ CharacterIdentifierStart,
201 /* 107 - k */ CharacterIdentifierStart,
202 /* 108 - l */ CharacterIdentifierStart,
203 /* 109 - m */ CharacterIdentifierStart,
204 /* 110 - n */ CharacterIdentifierStart,
205 /* 111 - o */ CharacterIdentifierStart,
206 /* 112 - p */ CharacterIdentifierStart,
207 /* 113 - q */ CharacterIdentifierStart,
208 /* 114 - r */ CharacterIdentifierStart,
209 /* 115 - s */ CharacterIdentifierStart,
210 /* 116 - t */ CharacterIdentifierStart,
211 /* 117 - u */ CharacterIdentifierStart,
212 /* 118 - v */ CharacterIdentifierStart,
213 /* 119 - w */ CharacterIdentifierStart,
214 /* 120 - x */ CharacterIdentifierStart,
215 /* 121 - y */ CharacterIdentifierStart,
216 /* 122 - z */ CharacterIdentifierStart,
217 /* 123 - { */ CharacterOpenBrace,
218 /* 124 - | */ CharacterOr,
219 /* 125 - } */ CharacterCloseBrace,
220 /* 126 - ~ */ CharacterTilde,
221 /* 127 - Delete */ CharacterInvalid,
222 };
223
Lexer(JSGlobalData * globalData)224 Lexer::Lexer(JSGlobalData* globalData)
225 : m_isReparsing(false)
226 , m_globalData(globalData)
227 , m_keywordTable(JSC::mainTable)
228 {
229 }
230
~Lexer()231 Lexer::~Lexer()
232 {
233 m_keywordTable.deleteTable();
234 }
235
currentCharacter() const236 ALWAYS_INLINE const UChar* Lexer::currentCharacter() const
237 {
238 ASSERT(m_code <= m_codeEnd);
239 return m_code;
240 }
241
currentOffset() const242 ALWAYS_INLINE int Lexer::currentOffset() const
243 {
244 return currentCharacter() - m_codeStart;
245 }
246
setCode(const SourceCode & source,ParserArena & arena)247 void Lexer::setCode(const SourceCode& source, ParserArena& arena)
248 {
249 m_arena = &arena.identifierArena();
250
251 m_lineNumber = source.firstLine();
252 m_delimited = false;
253 m_lastToken = -1;
254
255 const UChar* data = source.provider()->data();
256
257 m_source = &source;
258 m_codeStart = data;
259 m_code = data + source.startOffset();
260 m_codeEnd = data + source.endOffset();
261 m_error = false;
262 m_atLineStart = true;
263
264 m_buffer8.reserveInitialCapacity(initialReadBufferCapacity);
265 m_buffer16.reserveInitialCapacity((m_codeEnd - m_code) / 2);
266
267 if (LIKELY(m_code < m_codeEnd))
268 m_current = *m_code;
269 else
270 m_current = -1;
271 ASSERT(currentOffset() == source.startOffset());
272 }
273
shift()274 ALWAYS_INLINE void Lexer::shift()
275 {
276 // Faster than an if-else sequence
277 ASSERT(m_current != -1);
278 m_current = -1;
279 ++m_code;
280 if (LIKELY(m_code < m_codeEnd))
281 m_current = *m_code;
282 }
283
peek(int offset)284 ALWAYS_INLINE int Lexer::peek(int offset)
285 {
286 // Only use if necessary
287 ASSERT(offset > 0 && offset < 5);
288 const UChar* code = m_code + offset;
289 return (code < m_codeEnd) ? *code : -1;
290 }
291
getUnicodeCharacter()292 int Lexer::getUnicodeCharacter()
293 {
294 int char1 = peek(1);
295 int char2 = peek(2);
296 int char3 = peek(3);
297
298 if (UNLIKELY(!isASCIIHexDigit(m_current) || !isASCIIHexDigit(char1) || !isASCIIHexDigit(char2) || !isASCIIHexDigit(char3)))
299 return -1;
300
301 int result = convertUnicode(m_current, char1, char2, char3);
302 shift();
303 shift();
304 shift();
305 shift();
306 return result;
307 }
308
shiftLineTerminator()309 void Lexer::shiftLineTerminator()
310 {
311 ASSERT(isLineTerminator(m_current));
312
313 int m_prev = m_current;
314 shift();
315
316 // Allow both CRLF and LFCR.
317 if (m_prev + m_current == '\n' + '\r')
318 shift();
319
320 ++m_lineNumber;
321 }
322
makeIdentifier(const UChar * characters,size_t length)323 ALWAYS_INLINE const Identifier* Lexer::makeIdentifier(const UChar* characters, size_t length)
324 {
325 return &m_arena->makeIdentifier(m_globalData, characters, length);
326 }
327
lastTokenWasRestrKeyword() const328 ALWAYS_INLINE bool Lexer::lastTokenWasRestrKeyword() const
329 {
330 return m_lastToken == CONTINUE || m_lastToken == BREAK || m_lastToken == RETURN || m_lastToken == THROW;
331 }
332
isNonASCIIIdentStart(int c)333 static NEVER_INLINE bool isNonASCIIIdentStart(int c)
334 {
335 return category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other);
336 }
337
isIdentStart(int c)338 static inline bool isIdentStart(int c)
339 {
340 return isASCII(c) ? typesOfASCIICharacters[c] == CharacterIdentifierStart : isNonASCIIIdentStart(c);
341 }
342
isNonASCIIIdentPart(int c)343 static NEVER_INLINE bool isNonASCIIIdentPart(int c)
344 {
345 return category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other
346 | Mark_NonSpacing | Mark_SpacingCombining | Number_DecimalDigit | Punctuation_Connector);
347 }
348
isIdentPart(int c)349 static inline bool isIdentPart(int c)
350 {
351 // Character types are divided into two groups depending on whether they can be part of an
352 // identifier or not. Those whose type value is less or equal than CharacterNumber can be
353 // part of an identifier. (See the CharacterType definition for more details.)
354 return isASCII(c) ? typesOfASCIICharacters[c] <= CharacterNumber : isNonASCIIIdentPart(c);
355 }
356
// Maps the character following a backslash to the character it denotes for
// the single-character escapes (\b \t \n \v \f \r \\ \' \"). Returns 0 when
// |c| is not one of them.
static inline int singleEscape(int c)
{
    switch (c) {
    case 'b': return '\b';
    case 't': return '\t';
    case 'n': return '\n';
    case 'v': return '\v';
    case 'f': return '\f';
    case 'r': return '\r';
    // These three escape to themselves.
    case '\\':
    case '\'':
    case '"':
        return c;
    default:
        return 0;
    }
}
382
record8(int c)383 inline void Lexer::record8(int c)
384 {
385 ASSERT(c >= 0);
386 ASSERT(c <= 0xFF);
387 m_buffer8.append(static_cast<char>(c));
388 }
389
record16(UChar c)390 inline void Lexer::record16(UChar c)
391 {
392 m_buffer16.append(c);
393 }
394
record16(int c)395 inline void Lexer::record16(int c)
396 {
397 ASSERT(c >= 0);
398 ASSERT(c <= USHRT_MAX);
399 record16(UChar(static_cast<unsigned short>(c)));
400 }
401
parseIdentifier(JSTokenData * lvalp,LexType lexType)402 ALWAYS_INLINE JSTokenType Lexer::parseIdentifier(JSTokenData* lvalp, LexType lexType)
403 {
404 bool bufferRequired = false;
405 const UChar* identifierStart = currentCharacter();
406 int identifierLength;
407
408 while (true) {
409 if (LIKELY(isIdentPart(m_current))) {
410 shift();
411 continue;
412 }
413 if (LIKELY(m_current != '\\'))
414 break;
415
416 // \uXXXX unicode characters.
417 bufferRequired = true;
418 if (identifierStart != currentCharacter())
419 m_buffer16.append(identifierStart, currentCharacter() - identifierStart);
420 shift();
421 if (UNLIKELY(m_current != 'u'))
422 return ERRORTOK;
423 shift();
424 int character = getUnicodeCharacter();
425 if (UNLIKELY(character == -1))
426 return ERRORTOK;
427 if (UNLIKELY(m_buffer16.size() ? !isIdentPart(character) : !isIdentStart(character)))
428 return ERRORTOK;
429 record16(character);
430 identifierStart = currentCharacter();
431 }
432
433 if (!bufferRequired)
434 identifierLength = currentCharacter() - identifierStart;
435 else {
436 if (identifierStart != currentCharacter())
437 m_buffer16.append(identifierStart, currentCharacter() - identifierStart);
438 identifierStart = m_buffer16.data();
439 identifierLength = m_buffer16.size();
440 }
441
442 const Identifier* ident = makeIdentifier(identifierStart, identifierLength);
443 lvalp->ident = ident;
444 m_delimited = false;
445
446 if (LIKELY(!bufferRequired && lexType == IdentifyReservedWords)) {
447 // Keywords must not be recognized if there was an \uXXXX in the identifier.
448 const HashEntry* entry = m_keywordTable.entry(m_globalData, *ident);
449 return entry ? static_cast<JSTokenType>(entry->lexerValue()) : IDENT;
450 }
451
452 m_buffer16.resize(0);
453 return IDENT;
454 }
455
parseString(JSTokenData * lvalp,bool strictMode)456 ALWAYS_INLINE bool Lexer::parseString(JSTokenData* lvalp, bool strictMode)
457 {
458 int stringQuoteCharacter = m_current;
459 shift();
460
461 const UChar* stringStart = currentCharacter();
462
463 while (m_current != stringQuoteCharacter) {
464 if (UNLIKELY(m_current == '\\')) {
465 if (stringStart != currentCharacter())
466 m_buffer16.append(stringStart, currentCharacter() - stringStart);
467 shift();
468
469 int escape = singleEscape(m_current);
470
471 // Most common escape sequences first
472 if (escape) {
473 record16(escape);
474 shift();
475 } else if (UNLIKELY(isLineTerminator(m_current)))
476 shiftLineTerminator();
477 else if (m_current == 'x') {
478 shift();
479 if (isASCIIHexDigit(m_current) && isASCIIHexDigit(peek(1))) {
480 int prev = m_current;
481 shift();
482 record16(convertHex(prev, m_current));
483 shift();
484 } else
485 record16('x');
486 } else if (m_current == 'u') {
487 shift();
488 int character = getUnicodeCharacter();
489 if (character != -1)
490 record16(character);
491 else if (m_current == stringQuoteCharacter)
492 record16('u');
493 else // Only stringQuoteCharacter allowed after \u
494 return false;
495 } else if (strictMode && isASCIIDigit(m_current)) {
496 // The only valid numeric escape in strict mode is '\0', and this must not be followed by a decimal digit.
497 int character1 = m_current;
498 shift();
499 if (character1 != '0' || isASCIIDigit(m_current))
500 return false;
501 record16(0);
502 } else if (!strictMode && isASCIIOctalDigit(m_current)) {
503 // Octal character sequences
504 int character1 = m_current;
505 shift();
506 if (isASCIIOctalDigit(m_current)) {
507 // Two octal characters
508 int character2 = m_current;
509 shift();
510 if (character1 >= '0' && character1 <= '3' && isASCIIOctalDigit(m_current)) {
511 record16((character1 - '0') * 64 + (character2 - '0') * 8 + m_current - '0');
512 shift();
513 } else
514 record16((character1 - '0') * 8 + character2 - '0');
515 } else
516 record16(character1 - '0');
517 } else if (m_current != -1) {
518 record16(m_current);
519 shift();
520 } else
521 return false;
522
523 stringStart = currentCharacter();
524 continue;
525 }
526 // Fast check for characters that require special handling.
527 // Catches -1, \n, \r, 0x2028, and 0x2029 as efficiently
528 // as possible, and lets through all common ASCII characters.
529 if (UNLIKELY(((static_cast<unsigned>(m_current) - 0xE) & 0x2000))) {
530 // New-line or end of input is not allowed
531 if (UNLIKELY(isLineTerminator(m_current)) || UNLIKELY(m_current == -1))
532 return false;
533 // Anything else is just a normal character
534 }
535 shift();
536 }
537
538 if (currentCharacter() != stringStart)
539 m_buffer16.append(stringStart, currentCharacter() - stringStart);
540 lvalp->ident = makeIdentifier(m_buffer16.data(), m_buffer16.size());
541 m_buffer16.resize(0);
542 return true;
543 }
544
parseHex(double & returnValue)545 ALWAYS_INLINE void Lexer::parseHex(double& returnValue)
546 {
547 // Optimization: most hexadecimal values fit into 4 bytes.
548 uint32_t hexValue = 0;
549 int maximumDigits = 7;
550
551 // Shift out the 'x' prefix.
552 shift();
553
554 do {
555 hexValue = (hexValue << 4) + toASCIIHexValue(m_current);
556 shift();
557 --maximumDigits;
558 } while (isASCIIHexDigit(m_current) && maximumDigits >= 0);
559
560 if (maximumDigits >= 0) {
561 returnValue = hexValue;
562 return;
563 }
564
565 // No more place in the hexValue buffer.
566 // The values are shifted out and placed into the m_buffer8 vector.
567 for (int i = 0; i < 8; ++i) {
568 int digit = hexValue >> 28;
569 if (digit < 10)
570 record8(digit + '0');
571 else
572 record8(digit - 10 + 'a');
573 hexValue <<= 4;
574 }
575
576 while (isASCIIHexDigit(m_current)) {
577 record8(m_current);
578 shift();
579 }
580
581 returnValue = parseIntOverflow(m_buffer8.data(), m_buffer8.size(), 16);
582 }
583
parseOctal(double & returnValue)584 ALWAYS_INLINE bool Lexer::parseOctal(double& returnValue)
585 {
586 // Optimization: most octal values fit into 4 bytes.
587 uint32_t octalValue = 0;
588 int maximumDigits = 9;
589 // Temporary buffer for the digits. Makes easier
590 // to reconstruct the input characters when needed.
591 char digits[10];
592
593 do {
594 octalValue = octalValue * 8 + (m_current - '0');
595 digits[maximumDigits] = m_current;
596 shift();
597 --maximumDigits;
598 } while (isASCIIOctalDigit(m_current) && maximumDigits >= 0);
599
600 if (!isASCIIDigit(m_current) && maximumDigits >= 0) {
601 returnValue = octalValue;
602 return true;
603 }
604
605 for (int i = 9; i > maximumDigits; --i)
606 record8(digits[i]);
607
608 while (isASCIIOctalDigit(m_current)) {
609 record8(m_current);
610 shift();
611 }
612
613 if (isASCIIDigit(m_current))
614 return false;
615
616 returnValue = parseIntOverflow(m_buffer8.data(), m_buffer8.size(), 8);
617 return true;
618 }
619
parseDecimal(double & returnValue)620 ALWAYS_INLINE bool Lexer::parseDecimal(double& returnValue)
621 {
622 // Optimization: most decimal values fit into 4 bytes.
623 uint32_t decimalValue = 0;
624
625 // Since parseOctal may be executed before parseDecimal,
626 // the m_buffer8 may hold ascii digits.
627 if (!m_buffer8.size()) {
628 int maximumDigits = 9;
629 // Temporary buffer for the digits. Makes easier
630 // to reconstruct the input characters when needed.
631 char digits[10];
632
633 do {
634 decimalValue = decimalValue * 10 + (m_current - '0');
635 digits[maximumDigits] = m_current;
636 shift();
637 --maximumDigits;
638 } while (isASCIIDigit(m_current) && maximumDigits >= 0);
639
640 if (maximumDigits >= 0 && m_current != '.' && (m_current | 0x20) != 'e') {
641 returnValue = decimalValue;
642 return true;
643 }
644
645 for (int i = 9; i > maximumDigits; --i)
646 record8(digits[i]);
647 }
648
649 while (isASCIIDigit(m_current)) {
650 record8(m_current);
651 shift();
652 }
653
654 return false;
655 }
656
parseNumberAfterDecimalPoint()657 ALWAYS_INLINE void Lexer::parseNumberAfterDecimalPoint()
658 {
659 record8('.');
660 while (isASCIIDigit(m_current)) {
661 record8(m_current);
662 shift();
663 }
664 }
665
parseNumberAfterExponentIndicator()666 ALWAYS_INLINE bool Lexer::parseNumberAfterExponentIndicator()
667 {
668 record8('e');
669 shift();
670 if (m_current == '+' || m_current == '-') {
671 record8(m_current);
672 shift();
673 }
674
675 if (!isASCIIDigit(m_current))
676 return false;
677
678 do {
679 record8(m_current);
680 shift();
681 } while (isASCIIDigit(m_current));
682 return true;
683 }
684
parseMultilineComment()685 ALWAYS_INLINE bool Lexer::parseMultilineComment()
686 {
687 while (true) {
688 while (UNLIKELY(m_current == '*')) {
689 shift();
690 if (m_current == '/') {
691 shift();
692 return true;
693 }
694 }
695
696 if (UNLIKELY(m_current == -1))
697 return false;
698
699 if (isLineTerminator(m_current))
700 shiftLineTerminator();
701 else
702 shift();
703 }
704 }
705
nextTokenIsColon()706 bool Lexer::nextTokenIsColon()
707 {
708 const UChar* code = m_code;
709 while (code < m_codeEnd && (isWhiteSpace(*code) || isLineTerminator(*code)))
710 code++;
711
712 return code < m_codeEnd && *code == ':';
713 }
714
// Scans and returns the next token.
//
// Whitespace is skipped first; end of input yields EOFTOK. The current
// character is then classified (ASCII characters through
// typesOfASCIICharacters, others as identifier start, line terminator or
// invalid) and the switch below handles each class.
//
// lvalp      - receives the token payload: doubleValue for NUMBER, ident for
//              identifiers and STRING, intValue (the brace's offset) for
//              OPENBRACE and CLOSEBRACE.
// llocp      - receives the line number and start/end offsets of the token
//              (written only on the returnToken path, not for EOFTOK or
//              returnError).
// lexType    - forwarded to parseIdentifier.
// strictMode - forwarded to parseString; also makes an octal literal an error.
//
// Lexical errors jump to returnError, which sets m_error and returns ERRORTOK.
// An ERRORTOK produced by parseIdentifier is returned through returnToken
// instead, without setting m_error.
lex(JSTokenData * lvalp,JSTokenInfo * llocp,LexType lexType,bool strictMode)715 JSTokenType Lexer::lex(JSTokenData* lvalp, JSTokenInfo* llocp, LexType lexType, bool strictMode)
716 {
717 ASSERT(!m_error);
718 ASSERT(m_buffer8.isEmpty());
719 ASSERT(m_buffer16.isEmpty());
720
721 JSTokenType token = ERRORTOK;
722 m_terminator = false;
723
// Scanning restarts here after a multi-line comment, a line terminator, or a
// single-line comment that does not produce a token.
724 start:
725 while (isWhiteSpace(m_current))
726 shift();
727
728 int startOffset = currentOffset();
729
730 if (UNLIKELY(m_current == -1))
731 return EOFTOK;
732
733 m_delimited = false;
734
// Classify the current character for the dispatch below.
735 CharacterType type;
736 if (LIKELY(isASCII(m_current)))
737 type = static_cast<CharacterType>(typesOfASCIICharacters[m_current]);
738 else if (isNonASCIIIdentStart(m_current))
739 type = CharacterIdentifierStart;
740 else if (isLineTerminator(m_current))
741 type = CharacterLineTerminator;
742 else
743 type = CharacterInvalid;
744
745 switch (type) {
746 case CharacterGreater:
747 shift();
748 if (m_current == '>') {
749 shift();
750 if (m_current == '>') {
751 shift();
752 if (m_current == '=') {
753 shift();
754 token = URSHIFTEQUAL;
755 break;
756 }
757 token = URSHIFT;
758 break;
759 }
760 if (m_current == '=') {
761 shift();
762 token = RSHIFTEQUAL;
763 break;
764 }
765 token = RSHIFT;
766 break;
767 }
768 if (m_current == '=') {
769 shift();
770 token = GE;
771 break;
772 }
773 token = GT;
774 break;
775 case CharacterEqual:
776 shift();
777 if (m_current == '=') {
778 shift();
779 if (m_current == '=') {
780 shift();
781 token = STREQ;
782 break;
783 }
784 token = EQEQ;
785 break;
786 }
787 token = EQUAL;
788 break;
789 case CharacterLess:
790 shift();
791 if (m_current == '!' && peek(1) == '-' && peek(2) == '-') {
792 // <!-- marks the beginning of a line comment (for www usage)
793 goto inSingleLineComment;
794 }
795 if (m_current == '<') {
796 shift();
797 if (m_current == '=') {
798 shift();
799 token = LSHIFTEQUAL;
800 break;
801 }
802 token = LSHIFT;
803 break;
804 }
805 if (m_current == '=') {
806 shift();
807 token = LE;
808 break;
809 }
810 token = LT;
811 break;
812 case CharacterExclamationMark:
813 shift();
814 if (m_current == '=') {
815 shift();
816 if (m_current == '=') {
817 shift();
818 token = STRNEQ;
819 break;
820 }
821 token = NE;
822 break;
823 }
824 token = EXCLAMATION;
825 break;
826 case CharacterAdd:
827 shift();
828 if (m_current == '+') {
829 shift();
// AUTOPLUSPLUS is produced when a line terminator was seen earlier in this
// lex() call (m_terminator).
830 token = (!m_terminator) ? PLUSPLUS : AUTOPLUSPLUS;
831 break;
832 }
833 if (m_current == '=') {
834 shift();
835 token = PLUSEQUAL;
836 break;
837 }
838 token = PLUS;
839 break;
840 case CharacterSub:
841 shift();
842 if (m_current == '-') {
843 shift();
// "-->" is treated as the start of a single-line comment, but only when
// m_atLineStart is set.
844 if (m_atLineStart && m_current == '>') {
845 shift();
846 goto inSingleLineComment;
847 }
848 token = (!m_terminator) ? MINUSMINUS : AUTOMINUSMINUS;
849 break;
850 }
851 if (m_current == '=') {
852 shift();
853 token = MINUSEQUAL;
854 break;
855 }
856 token = MINUS;
857 break;
858 case CharacterMultiply:
859 shift();
860 if (m_current == '=') {
861 shift();
862 token = MULTEQUAL;
863 break;
864 }
865 token = TIMES;
866 break;
867 case CharacterSlash:
868 shift();
869 if (m_current == '/') {
870 shift();
871 goto inSingleLineComment;
872 }
873 if (m_current == '*') {
874 shift();
// An unterminated multi-line comment is a lexical error.
875 if (parseMultilineComment())
876 goto start;
877 goto returnError;
878 }
879 if (m_current == '=') {
880 shift();
881 token = DIVEQUAL;
882 break;
883 }
884 token = DIVIDE;
885 break;
886 case CharacterAnd:
887 shift();
888 if (m_current == '&') {
889 shift();
890 token = AND;
891 break;
892 }
893 if (m_current == '=') {
894 shift();
895 token = ANDEQUAL;
896 break;
897 }
898 token = BITAND;
899 break;
900 case CharacterXor:
901 shift();
902 if (m_current == '=') {
903 shift();
904 token = XOREQUAL;
905 break;
906 }
907 token = BITXOR;
908 break;
909 case CharacterModulo:
910 shift();
911 if (m_current == '=') {
912 shift();
913 token = MODEQUAL;
914 break;
915 }
916 token = MOD;
917 break;
918 case CharacterOr:
919 shift();
920 if (m_current == '=') {
921 shift();
922 token = OREQUAL;
923 break;
924 }
925 if (m_current == '|') {
926 shift();
927 token = OR;
928 break;
929 }
930 token = BITOR;
931 break;
932 case CharacterOpenParen:
933 token = OPENPAREN;
934 shift();
935 break;
936 case CharacterCloseParen:
937 token = CLOSEPAREN;
938 shift();
939 break;
940 case CharacterOpenBracket:
941 token = OPENBRACKET;
942 shift();
943 break;
944 case CharacterCloseBracket:
945 token = CLOSEBRACKET;
946 shift();
947 break;
948 case CharacterComma:
949 token = COMMA;
950 shift();
951 break;
952 case CharacterColon:
953 token = COLON;
954 shift();
955 break;
956 case CharacterQuestion:
957 token = QUESTION;
958 shift();
959 break;
960 case CharacterTilde:
961 token = TILDE;
962 shift();
963 break;
964 case CharacterSemicolon:
965 m_delimited = true;
966 shift();
967 token = SEMICOLON;
968 break;
969 case CharacterOpenBrace:
// The brace's own offset is recorded before it is consumed.
970 lvalp->intValue = currentOffset();
971 shift();
972 token = OPENBRACE;
973 break;
974 case CharacterCloseBrace:
975 lvalp->intValue = currentOffset();
976 m_delimited = true;
977 shift();
978 token = CLOSEBRACE;
979 break;
980 case CharacterDot:
981 shift();
982 if (!isASCIIDigit(m_current)) {
983 token = DOT;
984 break;
985 }
// A dot followed by a digit starts a number: continue inside the
// CharacterNumber case.
986 goto inNumberAfterDecimalPoint;
987 case CharacterZero:
988 shift();
// A leading zero may introduce a hexadecimal literal ("0x" followed by a hex
// digit) or an octal literal; otherwise the '0' is recorded and decimal
// parsing continues in the CharacterNumber case.
989 if ((m_current | 0x20) == 'x' && isASCIIHexDigit(peek(1))) {
990 parseHex(lvalp->doubleValue);
991 token = NUMBER;
992 } else {
993 record8('0');
994 if (isASCIIOctalDigit(m_current)) {
995 if (parseOctal(lvalp->doubleValue)) {
996 if (strictMode)
997 goto returnError;
998 token = NUMBER;
999 }
1000 }
1001 }
1002 // Fall through into CharacterNumber
1003 case CharacterNumber:
// token is already NUMBER when the hex or octal path above produced a value.
1004 if (LIKELY(token != NUMBER)) {
1005 if (!parseDecimal(lvalp->doubleValue)) {
1006 if (m_current == '.') {
1007 shift();
1008 inNumberAfterDecimalPoint:
1009 parseNumberAfterDecimalPoint();
1010 }
1011 if ((m_current | 0x20) == 'e')
1012 if (!parseNumberAfterExponentIndicator())
1013 goto returnError;
1014 // Null-terminate string for strtod.
1015 m_buffer8.append('\0');
1016 lvalp->doubleValue = WTF::strtod(m_buffer8.data(), 0);
1017 }
1018 token = NUMBER;
1019 }
1020
1021 // No identifiers allowed directly after numeric literal, e.g. "3in" is bad.
1022 if (UNLIKELY(isIdentStart(m_current)))
1023 goto returnError;
1024 m_buffer8.resize(0);
1025 m_delimited = false;
1026 break;
1027 case CharacterQuote:
1028 if (UNLIKELY(!parseString(lvalp, strictMode)))
1029 goto returnError;
// parseString stops on the closing quote; consume it here.
1030 shift();
1031 m_delimited = false;
1032 token = STRING;
1033 break;
1034 case CharacterIdentifierStart:
1035 ASSERT(isIdentStart(m_current));
1036 // Fall through into CharacterBackSlash.
1037 case CharacterBackSlash:
1038 token = parseIdentifier(lvalp, lexType);
1039 break;
1040 case CharacterLineTerminator:
1041 ASSERT(isLineTerminator(m_current));
1042 shiftLineTerminator();
1043 m_atLineStart = true;
1044 m_terminator = true;
1045 goto start;
1046 case CharacterInvalid:
1047 goto returnError;
1048 default:
1049 ASSERT_NOT_REACHED();
1050 goto returnError;
1051 }
1052
1053 m_atLineStart = false;
1054 goto returnToken;
1055
// Skips the rest of a single-line comment. If the input ends inside the
// comment, EOFTOK is returned directly.
1056 inSingleLineComment:
1057 while (!isLineTerminator(m_current)) {
1058 if (UNLIKELY(m_current == -1))
1059 return EOFTOK;
1060 shift();
1061 }
1062 shiftLineTerminator();
1063 m_atLineStart = true;
1064 m_terminator = true;
// When the previous token was continue, break, return or throw, the line
// break ending the comment is reported as a SEMICOLON token; otherwise
// scanning simply resumes.
1065 if (!lastTokenWasRestrKeyword())
1066 goto start;
1067
1068 token = SEMICOLON;
1069 m_delimited = true;
1070 // Fall through into returnToken.
1071
// Records the token's location and remembers it as the last token.
1072 returnToken:
1073 llocp->line = m_lineNumber;
1074 llocp->startOffset = startOffset;
1075 llocp->endOffset = currentOffset();
1076 m_lastToken = token;
1077 return token;
1078
1079 returnError:
1080 m_error = true;
1081 return ERRORTOK;
1082 }
1083
scanRegExp(const Identifier * & pattern,const Identifier * & flags,UChar patternPrefix)1084 bool Lexer::scanRegExp(const Identifier*& pattern, const Identifier*& flags, UChar patternPrefix)
1085 {
1086 ASSERT(m_buffer16.isEmpty());
1087
1088 bool lastWasEscape = false;
1089 bool inBrackets = false;
1090
1091 if (patternPrefix) {
1092 ASSERT(!isLineTerminator(patternPrefix));
1093 ASSERT(patternPrefix != '/');
1094 ASSERT(patternPrefix != '[');
1095 record16(patternPrefix);
1096 }
1097
1098 while (true) {
1099 int current = m_current;
1100
1101 if (isLineTerminator(current) || current == -1) {
1102 m_buffer16.resize(0);
1103 return false;
1104 }
1105
1106 shift();
1107
1108 if (current == '/' && !lastWasEscape && !inBrackets)
1109 break;
1110
1111 record16(current);
1112
1113 if (lastWasEscape) {
1114 lastWasEscape = false;
1115 continue;
1116 }
1117
1118 switch (current) {
1119 case '[':
1120 inBrackets = true;
1121 break;
1122 case ']':
1123 inBrackets = false;
1124 break;
1125 case '\\':
1126 lastWasEscape = true;
1127 break;
1128 }
1129 }
1130
1131 pattern = makeIdentifier(m_buffer16.data(), m_buffer16.size());
1132 m_buffer16.resize(0);
1133
1134 while (isIdentPart(m_current)) {
1135 record16(m_current);
1136 shift();
1137 }
1138
1139 flags = makeIdentifier(m_buffer16.data(), m_buffer16.size());
1140 m_buffer16.resize(0);
1141
1142 return true;
1143 }
1144
skipRegExp()1145 bool Lexer::skipRegExp()
1146 {
1147 bool lastWasEscape = false;
1148 bool inBrackets = false;
1149
1150 while (true) {
1151 int current = m_current;
1152
1153 if (isLineTerminator(current) || current == -1)
1154 return false;
1155
1156 shift();
1157
1158 if (current == '/' && !lastWasEscape && !inBrackets)
1159 break;
1160
1161 if (lastWasEscape) {
1162 lastWasEscape = false;
1163 continue;
1164 }
1165
1166 switch (current) {
1167 case '[':
1168 inBrackets = true;
1169 break;
1170 case ']':
1171 inBrackets = false;
1172 break;
1173 case '\\':
1174 lastWasEscape = true;
1175 break;
1176 }
1177 }
1178
1179 while (isIdentPart(m_current))
1180 shift();
1181
1182 return true;
1183 }
1184
clear()1185 void Lexer::clear()
1186 {
1187 m_arena = 0;
1188
1189 Vector<char> newBuffer8;
1190 m_buffer8.swap(newBuffer8);
1191
1192 Vector<UChar> newBuffer16;
1193 m_buffer16.swap(newBuffer16);
1194
1195 m_isReparsing = false;
1196 }
1197
sourceCode(int openBrace,int closeBrace,int firstLine)1198 SourceCode Lexer::sourceCode(int openBrace, int closeBrace, int firstLine)
1199 {
1200 ASSERT(m_source->provider()->data()[openBrace] == '{');
1201 ASSERT(m_source->provider()->data()[closeBrace] == '}');
1202 return SourceCode(m_source->provider(), openBrace, closeBrace + 1, firstLine);
1203 }
1204
1205 } // namespace JSC
1206