• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
3  *  Copyright (C) 2006, 2007, 2008, 2009 Apple Inc. All Rights Reserved.
4  *  Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
5  *  Copyright (C) 2010 Zoltan Herczeg (zherczeg@inf.u-szeged.hu)
6  *
7  *  This library is free software; you can redistribute it and/or
8  *  modify it under the terms of the GNU Library General Public
9  *  License as published by the Free Software Foundation; either
10  *  version 2 of the License, or (at your option) any later version.
11  *
12  *  This library is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  *  Library General Public License for more details.
16  *
17  *  You should have received a copy of the GNU Library General Public License
18  *  along with this library; see the file COPYING.LIB.  If not, write to
19  *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20  *  Boston, MA 02110-1301, USA.
21  *
22  */
23 
24 #include "config.h"
25 #include "Lexer.h"
26 
27 #include "JSFunction.h"
28 
29 #include "JSGlobalObjectFunctions.h"
30 #include "Identifier.h"
31 #include "NodeInfo.h"
32 #include "Nodes.h"
33 #include "dtoa.h"
34 #include <ctype.h>
35 #include <limits.h>
36 #include <string.h>
37 #include <wtf/Assertions.h>
38 
39 using namespace WTF;
40 using namespace Unicode;
41 
42 #include "JSParser.h"
43 #include "Lookup.h"
44 #include "Lexer.lut.h"
45 
46 namespace JSC {
47 
48 
// Classification assigned to a character before the lexer dispatches on it.
enum CharacterType {
    // The three leading values are pinned: isIdentStart() tests for equality
    // with CharacterIdentifierStart and isIdentPart() tests for
    // "<= CharacterNumber", so nothing may be inserted before or between them.
    CharacterIdentifierStart = 0,
    CharacterZero = 1,
    CharacterNumber = 2,

    // Remaining classes handled by the main switch.
    CharacterInvalid,
    CharacterLineTerminator,
    CharacterExclamationMark,
    CharacterOpenParen,
    CharacterCloseParen,
    CharacterOpenBracket,
    CharacterCloseBracket,
    CharacterComma,
    CharacterColon,
    CharacterQuestion,
    CharacterTilde,
    CharacterQuote,
    CharacterDot,
    CharacterSlash,
    CharacterBackSlash,
    CharacterSemicolon,
    CharacterOpenBrace,
    CharacterCloseBrace,

    // Operator characters.
    CharacterAdd,
    CharacterSub,
    CharacterMultiply,
    CharacterModulo,
    CharacterAnd,
    CharacterXor,
    CharacterOr,
    CharacterLess,
    CharacterGreater,
    CharacterEqual,

    // Classes outside the main switch (currently just this one).
    CharacterWhiteSpace,
};
91 
92 // 128 ASCII codes
93 static const unsigned short typesOfASCIICharacters[128] = {
94 /*   0 - Null               */ CharacterInvalid,
95 /*   1 - Start of Heading   */ CharacterInvalid,
96 /*   2 - Start of Text      */ CharacterInvalid,
97 /*   3 - End of Text        */ CharacterInvalid,
98 /*   4 - End of Transm.     */ CharacterInvalid,
99 /*   5 - Enquiry            */ CharacterInvalid,
100 /*   6 - Acknowledgment     */ CharacterInvalid,
101 /*   7 - Bell               */ CharacterInvalid,
102 /*   8 - Back Space         */ CharacterInvalid,
103 /*   9 - Horizontal Tab     */ CharacterWhiteSpace,
104 /*  10 - Line Feed          */ CharacterLineTerminator,
105 /*  11 - Vertical Tab       */ CharacterWhiteSpace,
106 /*  12 - Form Feed          */ CharacterWhiteSpace,
107 /*  13 - Carriage Return    */ CharacterLineTerminator,
108 /*  14 - Shift Out          */ CharacterInvalid,
109 /*  15 - Shift In           */ CharacterInvalid,
110 /*  16 - Data Line Escape   */ CharacterInvalid,
111 /*  17 - Device Control 1   */ CharacterInvalid,
112 /*  18 - Device Control 2   */ CharacterInvalid,
113 /*  19 - Device Control 3   */ CharacterInvalid,
114 /*  20 - Device Control 4   */ CharacterInvalid,
115 /*  21 - Negative Ack.      */ CharacterInvalid,
116 /*  22 - Synchronous Idle   */ CharacterInvalid,
117 /*  23 - End of Transmit    */ CharacterInvalid,
118 /*  24 - Cancel             */ CharacterInvalid,
119 /*  25 - End of Medium      */ CharacterInvalid,
120 /*  26 - Substitute         */ CharacterInvalid,
121 /*  27 - Escape             */ CharacterInvalid,
122 /*  28 - File Separator     */ CharacterInvalid,
123 /*  29 - Group Separator    */ CharacterInvalid,
124 /*  30 - Record Separator   */ CharacterInvalid,
125 /*  31 - Unit Separator     */ CharacterInvalid,
126 /*  32 - Space              */ CharacterWhiteSpace,
127 /*  33 - !                  */ CharacterExclamationMark,
128 /*  34 - "                  */ CharacterQuote,
129 /*  35 - #                  */ CharacterInvalid,
130 /*  36 - $                  */ CharacterIdentifierStart,
131 /*  37 - %                  */ CharacterModulo,
132 /*  38 - &                  */ CharacterAnd,
133 /*  39 - '                  */ CharacterQuote,
134 /*  40 - (                  */ CharacterOpenParen,
135 /*  41 - )                  */ CharacterCloseParen,
136 /*  42 - *                  */ CharacterMultiply,
137 /*  43 - +                  */ CharacterAdd,
138 /*  44 - ,                  */ CharacterComma,
139 /*  45 - -                  */ CharacterSub,
140 /*  46 - .                  */ CharacterDot,
141 /*  47 - /                  */ CharacterSlash,
142 /*  48 - 0                  */ CharacterZero,
143 /*  49 - 1                  */ CharacterNumber,
144 /*  50 - 2                  */ CharacterNumber,
145 /*  51 - 3                  */ CharacterNumber,
146 /*  52 - 4                  */ CharacterNumber,
147 /*  53 - 5                  */ CharacterNumber,
148 /*  54 - 6                  */ CharacterNumber,
149 /*  55 - 7                  */ CharacterNumber,
150 /*  56 - 8                  */ CharacterNumber,
151 /*  57 - 9                  */ CharacterNumber,
152 /*  58 - :                  */ CharacterColon,
153 /*  59 - ;                  */ CharacterSemicolon,
154 /*  60 - <                  */ CharacterLess,
155 /*  61 - =                  */ CharacterEqual,
156 /*  62 - >                  */ CharacterGreater,
157 /*  63 - ?                  */ CharacterQuestion,
158 /*  64 - @                  */ CharacterInvalid,
159 /*  65 - A                  */ CharacterIdentifierStart,
160 /*  66 - B                  */ CharacterIdentifierStart,
161 /*  67 - C                  */ CharacterIdentifierStart,
162 /*  68 - D                  */ CharacterIdentifierStart,
163 /*  69 - E                  */ CharacterIdentifierStart,
164 /*  70 - F                  */ CharacterIdentifierStart,
165 /*  71 - G                  */ CharacterIdentifierStart,
166 /*  72 - H                  */ CharacterIdentifierStart,
167 /*  73 - I                  */ CharacterIdentifierStart,
168 /*  74 - J                  */ CharacterIdentifierStart,
169 /*  75 - K                  */ CharacterIdentifierStart,
170 /*  76 - L                  */ CharacterIdentifierStart,
171 /*  77 - M                  */ CharacterIdentifierStart,
172 /*  78 - N                  */ CharacterIdentifierStart,
173 /*  79 - O                  */ CharacterIdentifierStart,
174 /*  80 - P                  */ CharacterIdentifierStart,
175 /*  81 - Q                  */ CharacterIdentifierStart,
176 /*  82 - R                  */ CharacterIdentifierStart,
177 /*  83 - S                  */ CharacterIdentifierStart,
178 /*  84 - T                  */ CharacterIdentifierStart,
179 /*  85 - U                  */ CharacterIdentifierStart,
180 /*  86 - V                  */ CharacterIdentifierStart,
181 /*  87 - W                  */ CharacterIdentifierStart,
182 /*  88 - X                  */ CharacterIdentifierStart,
183 /*  89 - Y                  */ CharacterIdentifierStart,
184 /*  90 - Z                  */ CharacterIdentifierStart,
185 /*  91 - [                  */ CharacterOpenBracket,
186 /*  92 - \                  */ CharacterBackSlash,
187 /*  93 - ]                  */ CharacterCloseBracket,
188 /*  94 - ^                  */ CharacterXor,
189 /*  95 - _                  */ CharacterIdentifierStart,
190 /*  96 - `                  */ CharacterInvalid,
191 /*  97 - a                  */ CharacterIdentifierStart,
192 /*  98 - b                  */ CharacterIdentifierStart,
193 /*  99 - c                  */ CharacterIdentifierStart,
194 /* 100 - d                  */ CharacterIdentifierStart,
195 /* 101 - e                  */ CharacterIdentifierStart,
196 /* 102 - f                  */ CharacterIdentifierStart,
197 /* 103 - g                  */ CharacterIdentifierStart,
198 /* 104 - h                  */ CharacterIdentifierStart,
199 /* 105 - i                  */ CharacterIdentifierStart,
200 /* 106 - j                  */ CharacterIdentifierStart,
201 /* 107 - k                  */ CharacterIdentifierStart,
202 /* 108 - l                  */ CharacterIdentifierStart,
203 /* 109 - m                  */ CharacterIdentifierStart,
204 /* 110 - n                  */ CharacterIdentifierStart,
205 /* 111 - o                  */ CharacterIdentifierStart,
206 /* 112 - p                  */ CharacterIdentifierStart,
207 /* 113 - q                  */ CharacterIdentifierStart,
208 /* 114 - r                  */ CharacterIdentifierStart,
209 /* 115 - s                  */ CharacterIdentifierStart,
210 /* 116 - t                  */ CharacterIdentifierStart,
211 /* 117 - u                  */ CharacterIdentifierStart,
212 /* 118 - v                  */ CharacterIdentifierStart,
213 /* 119 - w                  */ CharacterIdentifierStart,
214 /* 120 - x                  */ CharacterIdentifierStart,
215 /* 121 - y                  */ CharacterIdentifierStart,
216 /* 122 - z                  */ CharacterIdentifierStart,
217 /* 123 - {                  */ CharacterOpenBrace,
218 /* 124 - |                  */ CharacterOr,
219 /* 125 - }                  */ CharacterCloseBrace,
220 /* 126 - ~                  */ CharacterTilde,
221 /* 127 - Delete             */ CharacterInvalid,
222 };
223 
Lexer(JSGlobalData * globalData)224 Lexer::Lexer(JSGlobalData* globalData)
225     : m_isReparsing(false)
226     , m_globalData(globalData)
227     , m_keywordTable(JSC::mainTable)
228 {
229 }
230 
~Lexer()231 Lexer::~Lexer()
232 {
233     m_keywordTable.deleteTable();
234 }
235 
currentCharacter() const236 ALWAYS_INLINE const UChar* Lexer::currentCharacter() const
237 {
238     ASSERT(m_code <= m_codeEnd);
239     return m_code;
240 }
241 
currentOffset() const242 ALWAYS_INLINE int Lexer::currentOffset() const
243 {
244     return currentCharacter() - m_codeStart;
245 }
246 
setCode(const SourceCode & source,ParserArena & arena)247 void Lexer::setCode(const SourceCode& source, ParserArena& arena)
248 {
249     m_arena = &arena.identifierArena();
250 
251     m_lineNumber = source.firstLine();
252     m_delimited = false;
253     m_lastToken = -1;
254 
255     const UChar* data = source.provider()->data();
256 
257     m_source = &source;
258     m_codeStart = data;
259     m_code = data + source.startOffset();
260     m_codeEnd = data + source.endOffset();
261     m_error = false;
262     m_atLineStart = true;
263 
264     m_buffer8.reserveInitialCapacity(initialReadBufferCapacity);
265     m_buffer16.reserveInitialCapacity((m_codeEnd - m_code) / 2);
266 
267     if (LIKELY(m_code < m_codeEnd))
268         m_current = *m_code;
269     else
270         m_current = -1;
271     ASSERT(currentOffset() == source.startOffset());
272 }
273 
shift()274 ALWAYS_INLINE void Lexer::shift()
275 {
276     // Faster than an if-else sequence
277     ASSERT(m_current != -1);
278     m_current = -1;
279     ++m_code;
280     if (LIKELY(m_code < m_codeEnd))
281         m_current = *m_code;
282 }
283 
peek(int offset)284 ALWAYS_INLINE int Lexer::peek(int offset)
285 {
286     // Only use if necessary
287     ASSERT(offset > 0 && offset < 5);
288     const UChar* code = m_code + offset;
289     return (code < m_codeEnd) ? *code : -1;
290 }
291 
getUnicodeCharacter()292 int Lexer::getUnicodeCharacter()
293 {
294     int char1 = peek(1);
295     int char2 = peek(2);
296     int char3 = peek(3);
297 
298     if (UNLIKELY(!isASCIIHexDigit(m_current) || !isASCIIHexDigit(char1) || !isASCIIHexDigit(char2) || !isASCIIHexDigit(char3)))
299         return -1;
300 
301     int result = convertUnicode(m_current, char1, char2, char3);
302     shift();
303     shift();
304     shift();
305     shift();
306     return result;
307 }
308 
shiftLineTerminator()309 void Lexer::shiftLineTerminator()
310 {
311     ASSERT(isLineTerminator(m_current));
312 
313     int m_prev = m_current;
314     shift();
315 
316     // Allow both CRLF and LFCR.
317     if (m_prev + m_current == '\n' + '\r')
318         shift();
319 
320     ++m_lineNumber;
321 }
322 
makeIdentifier(const UChar * characters,size_t length)323 ALWAYS_INLINE const Identifier* Lexer::makeIdentifier(const UChar* characters, size_t length)
324 {
325     return &m_arena->makeIdentifier(m_globalData, characters, length);
326 }
327 
lastTokenWasRestrKeyword() const328 ALWAYS_INLINE bool Lexer::lastTokenWasRestrKeyword() const
329 {
330     return m_lastToken == CONTINUE || m_lastToken == BREAK || m_lastToken == RETURN || m_lastToken == THROW;
331 }
332 
isNonASCIIIdentStart(int c)333 static NEVER_INLINE bool isNonASCIIIdentStart(int c)
334 {
335     return category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other);
336 }
337 
isIdentStart(int c)338 static inline bool isIdentStart(int c)
339 {
340     return isASCII(c) ? typesOfASCIICharacters[c] == CharacterIdentifierStart : isNonASCIIIdentStart(c);
341 }
342 
isNonASCIIIdentPart(int c)343 static NEVER_INLINE bool isNonASCIIIdentPart(int c)
344 {
345     return category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other
346         | Mark_NonSpacing | Mark_SpacingCombining | Number_DecimalDigit | Punctuation_Connector);
347 }
348 
isIdentPart(int c)349 static inline bool isIdentPart(int c)
350 {
351     // Character types are divided into two groups depending on whether they can be part of an
352     // identifier or not. Those whose type value is less or equal than CharacterNumber can be
353     // part of an identifier. (See the CharacterType definition for more details.)
354     return isASCII(c) ? typesOfASCIICharacters[c] <= CharacterNumber : isNonASCIIIdentPart(c);
355 }
356 
// Maps the character following a backslash to the character it denotes for
// the single-character escapes (b, t, n, v, f, r, backslash and both quotes).
// Returns 0 for any other character.
static inline int singleEscape(int c)
{
    if (c == 'b')
        return 0x08;
    if (c == 't')
        return 0x09;
    if (c == 'n')
        return 0x0A;
    if (c == 'v')
        return 0x0B;
    if (c == 'f')
        return 0x0C;
    if (c == 'r')
        return 0x0D;
    // Backslash and the two quote characters stand for themselves.
    if (c == '\\' || c == '\'' || c == '"')
        return c;
    return 0;
}
382 
record8(int c)383 inline void Lexer::record8(int c)
384 {
385     ASSERT(c >= 0);
386     ASSERT(c <= 0xFF);
387     m_buffer8.append(static_cast<char>(c));
388 }
389 
record16(UChar c)390 inline void Lexer::record16(UChar c)
391 {
392     m_buffer16.append(c);
393 }
394 
record16(int c)395 inline void Lexer::record16(int c)
396 {
397     ASSERT(c >= 0);
398     ASSERT(c <= USHRT_MAX);
399     record16(UChar(static_cast<unsigned short>(c)));
400 }
401 
parseIdentifier(JSTokenData * lvalp,LexType lexType)402 ALWAYS_INLINE JSTokenType Lexer::parseIdentifier(JSTokenData* lvalp, LexType lexType)
403 {
404     bool bufferRequired = false;
405     const UChar* identifierStart = currentCharacter();
406     int identifierLength;
407 
408     while (true) {
409         if (LIKELY(isIdentPart(m_current))) {
410             shift();
411             continue;
412         }
413         if (LIKELY(m_current != '\\'))
414             break;
415 
416         // \uXXXX unicode characters.
417         bufferRequired = true;
418         if (identifierStart != currentCharacter())
419             m_buffer16.append(identifierStart, currentCharacter() - identifierStart);
420         shift();
421         if (UNLIKELY(m_current != 'u'))
422             return ERRORTOK;
423         shift();
424         int character = getUnicodeCharacter();
425         if (UNLIKELY(character == -1))
426             return ERRORTOK;
427         if (UNLIKELY(m_buffer16.size() ? !isIdentPart(character) : !isIdentStart(character)))
428             return ERRORTOK;
429         record16(character);
430         identifierStart = currentCharacter();
431     }
432 
433     if (!bufferRequired)
434         identifierLength = currentCharacter() - identifierStart;
435     else {
436         if (identifierStart != currentCharacter())
437             m_buffer16.append(identifierStart, currentCharacter() - identifierStart);
438         identifierStart = m_buffer16.data();
439         identifierLength = m_buffer16.size();
440     }
441 
442     const Identifier* ident = makeIdentifier(identifierStart, identifierLength);
443     lvalp->ident = ident;
444     m_delimited = false;
445 
446     if (LIKELY(!bufferRequired && lexType == IdentifyReservedWords)) {
447         // Keywords must not be recognized if there was an \uXXXX in the identifier.
448         const HashEntry* entry = m_keywordTable.entry(m_globalData, *ident);
449         return entry ? static_cast<JSTokenType>(entry->lexerValue()) : IDENT;
450     }
451 
452     m_buffer16.resize(0);
453     return IDENT;
454 }
455 
parseString(JSTokenData * lvalp,bool strictMode)456 ALWAYS_INLINE bool Lexer::parseString(JSTokenData* lvalp, bool strictMode)
457 {
458     int stringQuoteCharacter = m_current;
459     shift();
460 
461     const UChar* stringStart = currentCharacter();
462 
463     while (m_current != stringQuoteCharacter) {
464         if (UNLIKELY(m_current == '\\')) {
465             if (stringStart != currentCharacter())
466                 m_buffer16.append(stringStart, currentCharacter() - stringStart);
467             shift();
468 
469             int escape = singleEscape(m_current);
470 
471             // Most common escape sequences first
472             if (escape) {
473                 record16(escape);
474                 shift();
475             } else if (UNLIKELY(isLineTerminator(m_current)))
476                 shiftLineTerminator();
477             else if (m_current == 'x') {
478                 shift();
479                 if (isASCIIHexDigit(m_current) && isASCIIHexDigit(peek(1))) {
480                     int prev = m_current;
481                     shift();
482                     record16(convertHex(prev, m_current));
483                     shift();
484                 } else
485                     record16('x');
486             } else if (m_current == 'u') {
487                 shift();
488                 int character = getUnicodeCharacter();
489                 if (character != -1)
490                     record16(character);
491                 else if (m_current == stringQuoteCharacter)
492                     record16('u');
493                 else // Only stringQuoteCharacter allowed after \u
494                     return false;
495             } else if (strictMode && isASCIIDigit(m_current)) {
496                 // The only valid numeric escape in strict mode is '\0', and this must not be followed by a decimal digit.
497                 int character1 = m_current;
498                 shift();
499                 if (character1 != '0' || isASCIIDigit(m_current))
500                     return false;
501                 record16(0);
502             } else if (!strictMode && isASCIIOctalDigit(m_current)) {
503                 // Octal character sequences
504                 int character1 = m_current;
505                 shift();
506                 if (isASCIIOctalDigit(m_current)) {
507                     // Two octal characters
508                     int character2 = m_current;
509                     shift();
510                     if (character1 >= '0' && character1 <= '3' && isASCIIOctalDigit(m_current)) {
511                         record16((character1 - '0') * 64 + (character2 - '0') * 8 + m_current - '0');
512                         shift();
513                     } else
514                         record16((character1 - '0') * 8 + character2 - '0');
515                 } else
516                     record16(character1 - '0');
517             } else if (m_current != -1) {
518                 record16(m_current);
519                 shift();
520             } else
521                 return false;
522 
523             stringStart = currentCharacter();
524             continue;
525         }
526         // Fast check for characters that require special handling.
527         // Catches -1, \n, \r, 0x2028, and 0x2029 as efficiently
528         // as possible, and lets through all common ASCII characters.
529         if (UNLIKELY(((static_cast<unsigned>(m_current) - 0xE) & 0x2000))) {
530             // New-line or end of input is not allowed
531             if (UNLIKELY(isLineTerminator(m_current)) || UNLIKELY(m_current == -1))
532                 return false;
533             // Anything else is just a normal character
534         }
535         shift();
536     }
537 
538     if (currentCharacter() != stringStart)
539         m_buffer16.append(stringStart, currentCharacter() - stringStart);
540     lvalp->ident = makeIdentifier(m_buffer16.data(), m_buffer16.size());
541     m_buffer16.resize(0);
542     return true;
543 }
544 
parseHex(double & returnValue)545 ALWAYS_INLINE void Lexer::parseHex(double& returnValue)
546 {
547     // Optimization: most hexadecimal values fit into 4 bytes.
548     uint32_t hexValue = 0;
549     int maximumDigits = 7;
550 
551     // Shift out the 'x' prefix.
552     shift();
553 
554     do {
555         hexValue = (hexValue << 4) + toASCIIHexValue(m_current);
556         shift();
557         --maximumDigits;
558     } while (isASCIIHexDigit(m_current) && maximumDigits >= 0);
559 
560     if (maximumDigits >= 0) {
561         returnValue = hexValue;
562         return;
563     }
564 
565     // No more place in the hexValue buffer.
566     // The values are shifted out and placed into the m_buffer8 vector.
567     for (int i = 0; i < 8; ++i) {
568          int digit = hexValue >> 28;
569          if (digit < 10)
570              record8(digit + '0');
571          else
572              record8(digit - 10 + 'a');
573          hexValue <<= 4;
574     }
575 
576     while (isASCIIHexDigit(m_current)) {
577         record8(m_current);
578         shift();
579     }
580 
581     returnValue = parseIntOverflow(m_buffer8.data(), m_buffer8.size(), 16);
582 }
583 
parseOctal(double & returnValue)584 ALWAYS_INLINE bool Lexer::parseOctal(double& returnValue)
585 {
586     // Optimization: most octal values fit into 4 bytes.
587     uint32_t octalValue = 0;
588     int maximumDigits = 9;
589     // Temporary buffer for the digits. Makes easier
590     // to reconstruct the input characters when needed.
591     char digits[10];
592 
593     do {
594         octalValue = octalValue * 8 + (m_current - '0');
595         digits[maximumDigits] = m_current;
596         shift();
597         --maximumDigits;
598     } while (isASCIIOctalDigit(m_current) && maximumDigits >= 0);
599 
600     if (!isASCIIDigit(m_current) && maximumDigits >= 0) {
601         returnValue = octalValue;
602         return true;
603     }
604 
605     for (int i = 9; i > maximumDigits; --i)
606          record8(digits[i]);
607 
608     while (isASCIIOctalDigit(m_current)) {
609         record8(m_current);
610         shift();
611     }
612 
613     if (isASCIIDigit(m_current))
614         return false;
615 
616     returnValue = parseIntOverflow(m_buffer8.data(), m_buffer8.size(), 8);
617     return true;
618 }
619 
parseDecimal(double & returnValue)620 ALWAYS_INLINE bool Lexer::parseDecimal(double& returnValue)
621 {
622     // Optimization: most decimal values fit into 4 bytes.
623     uint32_t decimalValue = 0;
624 
625     // Since parseOctal may be executed before parseDecimal,
626     // the m_buffer8 may hold ascii digits.
627     if (!m_buffer8.size()) {
628         int maximumDigits = 9;
629         // Temporary buffer for the digits. Makes easier
630         // to reconstruct the input characters when needed.
631         char digits[10];
632 
633         do {
634             decimalValue = decimalValue * 10 + (m_current - '0');
635             digits[maximumDigits] = m_current;
636             shift();
637             --maximumDigits;
638         } while (isASCIIDigit(m_current) && maximumDigits >= 0);
639 
640         if (maximumDigits >= 0 && m_current != '.' && (m_current | 0x20) != 'e') {
641             returnValue = decimalValue;
642             return true;
643         }
644 
645         for (int i = 9; i > maximumDigits; --i)
646             record8(digits[i]);
647     }
648 
649     while (isASCIIDigit(m_current)) {
650         record8(m_current);
651         shift();
652     }
653 
654     return false;
655 }
656 
parseNumberAfterDecimalPoint()657 ALWAYS_INLINE void Lexer::parseNumberAfterDecimalPoint()
658 {
659     record8('.');
660     while (isASCIIDigit(m_current)) {
661         record8(m_current);
662         shift();
663     }
664 }
665 
parseNumberAfterExponentIndicator()666 ALWAYS_INLINE bool Lexer::parseNumberAfterExponentIndicator()
667 {
668     record8('e');
669     shift();
670     if (m_current == '+' || m_current == '-') {
671         record8(m_current);
672         shift();
673     }
674 
675     if (!isASCIIDigit(m_current))
676         return false;
677 
678     do {
679         record8(m_current);
680         shift();
681     } while (isASCIIDigit(m_current));
682     return true;
683 }
684 
parseMultilineComment()685 ALWAYS_INLINE bool Lexer::parseMultilineComment()
686 {
687     while (true) {
688         while (UNLIKELY(m_current == '*')) {
689             shift();
690             if (m_current == '/') {
691                 shift();
692                 return true;
693             }
694         }
695 
696         if (UNLIKELY(m_current == -1))
697             return false;
698 
699         if (isLineTerminator(m_current))
700             shiftLineTerminator();
701         else
702             shift();
703     }
704 }
705 
nextTokenIsColon()706 bool Lexer::nextTokenIsColon()
707 {
708     const UChar* code = m_code;
709     while (code < m_codeEnd && (isWhiteSpace(*code) || isLineTerminator(*code)))
710         code++;
711 
712     return code < m_codeEnd && *code == ':';
713 }
714 
lex(JSTokenData * lvalp,JSTokenInfo * llocp,LexType lexType,bool strictMode)715 JSTokenType Lexer::lex(JSTokenData* lvalp, JSTokenInfo* llocp, LexType lexType, bool strictMode)
716 {
717     ASSERT(!m_error);
718     ASSERT(m_buffer8.isEmpty());
719     ASSERT(m_buffer16.isEmpty());
720 
721     JSTokenType token = ERRORTOK;
722     m_terminator = false;
723 
724 start:
725     while (isWhiteSpace(m_current))
726         shift();
727 
728     int startOffset = currentOffset();
729 
730     if (UNLIKELY(m_current == -1))
731         return EOFTOK;
732 
733     m_delimited = false;
734 
735     CharacterType type;
736     if (LIKELY(isASCII(m_current)))
737         type = static_cast<CharacterType>(typesOfASCIICharacters[m_current]);
738     else if (isNonASCIIIdentStart(m_current))
739         type = CharacterIdentifierStart;
740     else if (isLineTerminator(m_current))
741         type = CharacterLineTerminator;
742     else
743         type = CharacterInvalid;
744 
745     switch (type) {
746     case CharacterGreater:
747         shift();
748         if (m_current == '>') {
749             shift();
750             if (m_current == '>') {
751                 shift();
752                 if (m_current == '=') {
753                     shift();
754                     token = URSHIFTEQUAL;
755                     break;
756                 }
757                 token = URSHIFT;
758                 break;
759             }
760             if (m_current == '=') {
761                 shift();
762                 token = RSHIFTEQUAL;
763                 break;
764             }
765             token = RSHIFT;
766             break;
767         }
768         if (m_current == '=') {
769             shift();
770             token = GE;
771             break;
772         }
773         token = GT;
774         break;
775     case CharacterEqual:
776         shift();
777         if (m_current == '=') {
778             shift();
779             if (m_current == '=') {
780                 shift();
781                 token = STREQ;
782                 break;
783             }
784             token = EQEQ;
785             break;
786         }
787         token = EQUAL;
788         break;
789     case CharacterLess:
790         shift();
791         if (m_current == '!' && peek(1) == '-' && peek(2) == '-') {
792             // <!-- marks the beginning of a line comment (for www usage)
793             goto inSingleLineComment;
794         }
795         if (m_current == '<') {
796             shift();
797             if (m_current == '=') {
798                 shift();
799                 token = LSHIFTEQUAL;
800                 break;
801             }
802             token = LSHIFT;
803             break;
804         }
805         if (m_current == '=') {
806             shift();
807             token = LE;
808             break;
809         }
810         token = LT;
811         break;
812     case CharacterExclamationMark:
813         shift();
814         if (m_current == '=') {
815             shift();
816             if (m_current == '=') {
817                 shift();
818                 token = STRNEQ;
819                 break;
820             }
821             token = NE;
822             break;
823         }
824         token = EXCLAMATION;
825         break;
826     case CharacterAdd:
827         shift();
828         if (m_current == '+') {
829             shift();
830             token = (!m_terminator) ? PLUSPLUS : AUTOPLUSPLUS;
831             break;
832         }
833         if (m_current == '=') {
834             shift();
835             token = PLUSEQUAL;
836             break;
837         }
838         token = PLUS;
839         break;
840     case CharacterSub:
841         shift();
842         if (m_current == '-') {
843             shift();
844             if (m_atLineStart && m_current == '>') {
845                 shift();
846                 goto inSingleLineComment;
847             }
848             token = (!m_terminator) ? MINUSMINUS : AUTOMINUSMINUS;
849             break;
850         }
851         if (m_current == '=') {
852             shift();
853             token = MINUSEQUAL;
854             break;
855         }
856         token = MINUS;
857         break;
858     case CharacterMultiply:
859         shift();
860         if (m_current == '=') {
861             shift();
862             token = MULTEQUAL;
863             break;
864         }
865         token = TIMES;
866         break;
867     case CharacterSlash:
868         shift();
869         if (m_current == '/') {
870             shift();
871             goto inSingleLineComment;
872         }
873         if (m_current == '*') {
874             shift();
875             if (parseMultilineComment())
876                 goto start;
877             goto returnError;
878         }
879         if (m_current == '=') {
880             shift();
881             token = DIVEQUAL;
882             break;
883         }
884         token = DIVIDE;
885         break;
886     case CharacterAnd:
887         shift();
888         if (m_current == '&') {
889             shift();
890             token = AND;
891             break;
892         }
893         if (m_current == '=') {
894             shift();
895             token = ANDEQUAL;
896             break;
897         }
898         token = BITAND;
899         break;
900     case CharacterXor:
901         shift();
902         if (m_current == '=') {
903             shift();
904             token = XOREQUAL;
905             break;
906         }
907         token = BITXOR;
908         break;
909     case CharacterModulo:
910         shift();
911         if (m_current == '=') {
912             shift();
913             token = MODEQUAL;
914             break;
915         }
916         token = MOD;
917         break;
918     case CharacterOr:
919         shift();
920         if (m_current == '=') {
921             shift();
922             token = OREQUAL;
923             break;
924         }
925         if (m_current == '|') {
926             shift();
927             token = OR;
928             break;
929         }
930         token = BITOR;
931         break;
932     case CharacterOpenParen:
933         token = OPENPAREN;
934         shift();
935         break;
936     case CharacterCloseParen:
937         token = CLOSEPAREN;
938         shift();
939         break;
940     case CharacterOpenBracket:
941         token = OPENBRACKET;
942         shift();
943         break;
944     case CharacterCloseBracket:
945         token = CLOSEBRACKET;
946         shift();
947         break;
948     case CharacterComma:
949         token = COMMA;
950         shift();
951         break;
952     case CharacterColon:
953         token = COLON;
954         shift();
955         break;
956     case CharacterQuestion:
957         token = QUESTION;
958         shift();
959         break;
960     case CharacterTilde:
961         token = TILDE;
962         shift();
963         break;
964     case CharacterSemicolon:
965         m_delimited = true;
966         shift();
967         token = SEMICOLON;
968         break;
969     case CharacterOpenBrace:
970         lvalp->intValue = currentOffset();
971         shift();
972         token = OPENBRACE;
973         break;
974     case CharacterCloseBrace:
975         lvalp->intValue = currentOffset();
976         m_delimited = true;
977         shift();
978         token = CLOSEBRACE;
979         break;
980     case CharacterDot:
981         shift();
982         if (!isASCIIDigit(m_current)) {
983             token = DOT;
984             break;
985         }
986         goto inNumberAfterDecimalPoint;
987     case CharacterZero:
988         shift();
989         if ((m_current | 0x20) == 'x' && isASCIIHexDigit(peek(1))) {
990             parseHex(lvalp->doubleValue);
991             token = NUMBER;
992         } else {
993             record8('0');
994             if (isASCIIOctalDigit(m_current)) {
995                 if (parseOctal(lvalp->doubleValue)) {
996                     if (strictMode)
997                         goto returnError;
998                     token = NUMBER;
999                 }
1000             }
1001         }
1002         // Fall through into CharacterNumber
1003     case CharacterNumber:
1004         if (LIKELY(token != NUMBER)) {
1005             if (!parseDecimal(lvalp->doubleValue)) {
1006                 if (m_current == '.') {
1007                     shift();
1008 inNumberAfterDecimalPoint:
1009                     parseNumberAfterDecimalPoint();
1010                 }
1011                 if ((m_current | 0x20) == 'e')
1012                     if (!parseNumberAfterExponentIndicator())
1013                         goto returnError;
1014                 // Null-terminate string for strtod.
1015                 m_buffer8.append('\0');
1016                 lvalp->doubleValue = WTF::strtod(m_buffer8.data(), 0);
1017             }
1018             token = NUMBER;
1019         }
1020 
1021         // No identifiers allowed directly after numeric literal, e.g. "3in" is bad.
1022         if (UNLIKELY(isIdentStart(m_current)))
1023             goto returnError;
1024         m_buffer8.resize(0);
1025         m_delimited = false;
1026         break;
1027     case CharacterQuote:
1028         if (UNLIKELY(!parseString(lvalp, strictMode)))
1029             goto returnError;
1030         shift();
1031         m_delimited = false;
1032         token = STRING;
1033         break;
1034     case CharacterIdentifierStart:
1035         ASSERT(isIdentStart(m_current));
1036         // Fall through into CharacterBackSlash.
1037     case CharacterBackSlash:
1038         token = parseIdentifier(lvalp, lexType);
1039         break;
1040     case CharacterLineTerminator:
1041         ASSERT(isLineTerminator(m_current));
1042         shiftLineTerminator();
1043         m_atLineStart = true;
1044         m_terminator = true;
1045         goto start;
1046     case CharacterInvalid:
1047         goto returnError;
1048     default:
1049         ASSERT_NOT_REACHED();
1050         goto returnError;
1051     }
1052 
1053     m_atLineStart = false;
1054     goto returnToken;
1055 
1056 inSingleLineComment:
1057     while (!isLineTerminator(m_current)) {
1058         if (UNLIKELY(m_current == -1))
1059             return EOFTOK;
1060         shift();
1061     }
1062     shiftLineTerminator();
1063     m_atLineStart = true;
1064     m_terminator = true;
1065     if (!lastTokenWasRestrKeyword())
1066         goto start;
1067 
1068     token = SEMICOLON;
1069     m_delimited = true;
1070     // Fall through into returnToken.
1071 
1072 returnToken:
1073     llocp->line = m_lineNumber;
1074     llocp->startOffset = startOffset;
1075     llocp->endOffset = currentOffset();
1076     m_lastToken = token;
1077     return token;
1078 
1079 returnError:
1080     m_error = true;
1081     return ERRORTOK;
1082 }
1083 
scanRegExp(const Identifier * & pattern,const Identifier * & flags,UChar patternPrefix)1084 bool Lexer::scanRegExp(const Identifier*& pattern, const Identifier*& flags, UChar patternPrefix)
1085 {
1086     ASSERT(m_buffer16.isEmpty());
1087 
1088     bool lastWasEscape = false;
1089     bool inBrackets = false;
1090 
1091     if (patternPrefix) {
1092         ASSERT(!isLineTerminator(patternPrefix));
1093         ASSERT(patternPrefix != '/');
1094         ASSERT(patternPrefix != '[');
1095         record16(patternPrefix);
1096     }
1097 
1098     while (true) {
1099         int current = m_current;
1100 
1101         if (isLineTerminator(current) || current == -1) {
1102             m_buffer16.resize(0);
1103             return false;
1104         }
1105 
1106         shift();
1107 
1108         if (current == '/' && !lastWasEscape && !inBrackets)
1109             break;
1110 
1111         record16(current);
1112 
1113         if (lastWasEscape) {
1114             lastWasEscape = false;
1115             continue;
1116         }
1117 
1118         switch (current) {
1119         case '[':
1120             inBrackets = true;
1121             break;
1122         case ']':
1123             inBrackets = false;
1124             break;
1125         case '\\':
1126             lastWasEscape = true;
1127             break;
1128         }
1129     }
1130 
1131     pattern = makeIdentifier(m_buffer16.data(), m_buffer16.size());
1132     m_buffer16.resize(0);
1133 
1134     while (isIdentPart(m_current)) {
1135         record16(m_current);
1136         shift();
1137     }
1138 
1139     flags = makeIdentifier(m_buffer16.data(), m_buffer16.size());
1140     m_buffer16.resize(0);
1141 
1142     return true;
1143 }
1144 
skipRegExp()1145 bool Lexer::skipRegExp()
1146 {
1147     bool lastWasEscape = false;
1148     bool inBrackets = false;
1149 
1150     while (true) {
1151         int current = m_current;
1152 
1153         if (isLineTerminator(current) || current == -1)
1154             return false;
1155 
1156         shift();
1157 
1158         if (current == '/' && !lastWasEscape && !inBrackets)
1159             break;
1160 
1161         if (lastWasEscape) {
1162             lastWasEscape = false;
1163             continue;
1164         }
1165 
1166         switch (current) {
1167         case '[':
1168             inBrackets = true;
1169             break;
1170         case ']':
1171             inBrackets = false;
1172             break;
1173         case '\\':
1174             lastWasEscape = true;
1175             break;
1176         }
1177     }
1178 
1179     while (isIdentPart(m_current))
1180         shift();
1181 
1182     return true;
1183 }
1184 
clear()1185 void Lexer::clear()
1186 {
1187     m_arena = 0;
1188 
1189     Vector<char> newBuffer8;
1190     m_buffer8.swap(newBuffer8);
1191 
1192     Vector<UChar> newBuffer16;
1193     m_buffer16.swap(newBuffer16);
1194 
1195     m_isReparsing = false;
1196 }
1197 
sourceCode(int openBrace,int closeBrace,int firstLine)1198 SourceCode Lexer::sourceCode(int openBrace, int closeBrace, int firstLine)
1199 {
1200     ASSERT(m_source->provider()->data()[openBrace] == '{');
1201     ASSERT(m_source->provider()->data()[closeBrace] == '}');
1202     return SourceCode(m_source->provider(), openBrace, closeBrace + 1, firstLine);
1203 }
1204 
1205 } // namespace JSC
1206