/**
 * Copyright (c) 2021 Huawei Device Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
15 
16 #ifndef ES2PANDA_PARSER_CORE_LEXER_H
17 #define ES2PANDA_PARSER_CORE_LEXER_H
18 
19 #include "lexer/regexp/regexp.h"
20 #include "lexer/token/letters.h"
21 #include "lexer/token/token.h"
22 #include "util/enumbitops.h"
23 
24 namespace panda::es2panda::parser {
25 class ParserContext;
26 }  // namespace panda::es2panda::parser
27 
28 namespace panda::es2panda::lexer {
29 class Keywords;
30 
31 enum class NextTokenFlags : uint32_t {
32     NONE = 0U,
33     KEYWORD_TO_IDENT = 1U << 0U,
34     NUMERIC_SEPARATOR_ALLOWED = 1U << 1U,
35     BIGINT_ALLOWED = 1U << 2U,
36 };
37 
DEFINE_BITOPS(NextTokenFlags)38 DEFINE_BITOPS(NextTokenFlags)
39 
40 class LexerPosition {
41 public:
42     explicit LexerPosition(const util::StringView &source);
43     DEFAULT_COPY_SEMANTIC(LexerPosition);
44     DEFAULT_MOVE_SEMANTIC(LexerPosition);
45     ~LexerPosition() = default;
46 
47     util::StringView::Iterator &Iterator()
48     {
49         return iterator_;
50     }
51 
52     const util::StringView::Iterator &Iterator() const
53     {
54         return iterator_;
55     }
56 
57     size_t Line() const
58     {
59         return line_;
60     }
61 
62     Token &GetToken()
63     {
64         return token_;
65     }
66 
67     const Token &GetToken() const
68     {
69         return token_;
70     }
71 
72     size_t &NextTokenLine()
73     {
74         return nextTokenLine_;
75     }
76 
77 private:
78     friend class Lexer;
79 
80     Token token_ {};
81     util::StringView::Iterator iterator_;
82     size_t line_ {};
83     size_t nextTokenLine_ {};
84 };
85 
86 class LexerTemplateString {
87 public:
LexerTemplateString(ArenaAllocator * allocator)88     explicit LexerTemplateString(ArenaAllocator *allocator) : str(allocator) {}
89     DEFAULT_COPY_SEMANTIC(LexerTemplateString);
90     DEFAULT_MOVE_SEMANTIC(LexerTemplateString);
91     ~LexerTemplateString() = default;
92 
93     // NOLINTBEGIN(misc-non-private-member-variables-in-classes)
94     util::UString str;
95     size_t end {};
96     bool scanExpression {};
97     // NOLINTEND(misc-non-private-member-variables-in-classes)
98 };
99 
100 class TemplateLiteralParserContext;
101 
102 class Lexer {
103 public:
104     explicit Lexer(const parser::ParserContext *parserContext, bool startLexer = true);
105     NO_COPY_SEMANTIC(Lexer);
106     NO_MOVE_SEMANTIC(Lexer);
107     virtual ~Lexer() = default;
108 
109     // NOLINTNEXTLINE(google-default-arguments)
110     virtual void NextToken(NextTokenFlags flags = NextTokenFlags::NONE);
111     virtual void ScanAsteriskPunctuator();
112 
113     Token &GetToken();
114     const Token &GetToken() const;
115     size_t Line() const;
116 
117     LexerPosition Save() const;
118     void Rewind(const LexerPosition &pos);
119     void BackwardToken(TokenType type, size_t offset);
120     void ForwardToken(TokenType type, size_t offset);
121 
122     char32_t Lookahead();
123     bool CheckArrow();
124 
125     RegExp ScanRegExp();
126     template <char32_t END>
127     void ScanString();
128     void ResetTokenEnd();
129     LexerTemplateString ScanTemplateString();
130     void ScanTemplateStringEnd();
131     void PushTemplateContext(TemplateLiteralParserContext *ctx);
ThrowUnexpectedStrictModeReservedKeyword()132     [[noreturn]] void ThrowUnexpectedStrictModeReservedKeyword() const
133     {
134         ThrowError("Unexpected strict mode reserved keyword");
135     }
136 
137     enum class ConversionResult : uint8_t {
138         SUCCESS,
139         INVALID_ARGUMENT,
140         OUT_OF_RANGE,
141     };
142 
143     template <typename Tret, typename Ret = Tret, typename... Base>
StrToNumeric(Tret (* converter)(const char *,char **,Base...),const char * str,ConversionResult & result,Base...base)144     static Ret StrToNumeric(Tret (*converter)(const char *, char **, Base...), const char *str,
145                             ConversionResult &result, Base... base) noexcept
146     {
147         Ret ret {};
148         char *endPtr;
149         // NOLINTBEGIN(cppcoreguidelines-special-member-functions)
150         struct SaveErrno {
151             explicit SaveErrno() : errno_(errno)
152             {
153                 errno = 0;
154             }
155             ~SaveErrno()
156             {
157                 if (errno == 0) {
158                     errno = errno_;
159                 }
160             }
161 
162         private:
163             decltype(errno) errno_;
164         } const savedErrno;
165         // NOLINTEND(cppcoreguidelines-special-member-functions)
166 
167         const Tret tmp = converter(str, &endPtr, base...);
168 
169         bool outOfRange = false;
170         if constexpr (std::is_same_v<Ret, int>) {
171             outOfRange = tmp < static_cast<Tret>(std::numeric_limits<int>::min()) ||
172                          tmp > static_cast<Tret>(std::numeric_limits<int>::max());
173         }
174 
175         if (endPtr == str) {
176             result = ConversionResult::INVALID_ARGUMENT;
177         } else if (errno == ERANGE || outOfRange) {
178             result = ConversionResult::OUT_OF_RANGE;
179         } else {
180             result = ConversionResult::SUCCESS;
181             ret = tmp;
182         }
183 
184         return ret;
185     }
186 
187     util::StringView SourceView(size_t begin, size_t end) const;
188 
189 protected:
190     void NextToken(Keywords *kws);
191     ArenaAllocator *Allocator();
192     bool IsLineTerminatorOrEos() const;
193     void ScanRegExpPattern();
194     RegExpFlags ScanRegExpFlags();
195 
196     [[noreturn]] void ThrowError(std::string_view message) const;
197     [[noreturn]] void ThrowUnexpectedToken(lexer::TokenType tokenType) const;
198 
199     void SetTokenStart();
200     void SetTokenEnd();
201 
Iterator()202     inline util::StringView::Iterator &Iterator()
203     {
204         return pos_.iterator_;
205     }
206 
Iterator()207     inline const util::StringView::Iterator &Iterator() const
208     {
209         return pos_.iterator_;
210     }
211 
212     util::StringView SourceView(const util::StringView::Iterator &begin, const util::StringView::Iterator &end) const;
213 
214     void SkipWhiteSpaces();
215     void SkipSingleLineComment();
216 
217     bool ScanPunctuator();
218     void ScanQuestionPunctuator();
219     void ScanLessThanPunctuator();
220     void ScanGreaterThanPunctuator();
221     virtual void ScanEqualsPunctuator();
222     virtual void ScanExclamationPunctuator();
223     void ScanAmpersandPunctuator();
224     void ScanVLinePunctuator();
225     void ScanCircumflexPunctuator();
226     void ScanPlusPunctuator();
227     void ScanMinusPunctuator();
228     void ScanSlashPunctuator();
229     void ScanPercentPunctuator();
230     void ScanDotPunctuator();
231     void ScanColonPunctuator();
232     virtual bool ScanDollarPunctuator();
233     void ScanAtPunctuator();
234 
235     virtual void SkipMultiLineComment();
236     virtual void ScanHashMark();
237     virtual void ScanBackTick();
238 
ScanCharLiteral()239     virtual bool ScanCharLiteral()
240     {
241         return false;
242     }
243 
244     char32_t ScanUnicodeEscapeSequence();
245     template <int N, bool IN_AS = false>
246     char32_t ScanHexEscape();
247     char32_t ScanUnicodeCodePointEscape();
248 
249     void ScanStringUnicodePart(util::UString *str);
250     char32_t ScanUnicodeCharacter();
251 
252     void ScanDecimalNumbers();
253 
ScanNumberLeadingZero()254     virtual void ScanNumberLeadingZero()
255     {
256         ScanNumberLeadingZeroImpl<double>();
257     }
258 
259     template <typename RadixType, typename RadixLimit = void *>
260     void ScanNumberLeadingZeroImpl();
261     void ScanNumberLeadingZeroImplNonAllowedCases();
262     template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
263     void ScanNumberRadix(bool allowNumericSeparator = true);
264     void ScanNumber(bool allowBigInt = true);
265     template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
266     void ScanTooLargeNumber(RadixType number);
267     virtual void ConvertNumber(const std::string &utf8, NumberFlags flags);
268     void ScanDecimalLiteral();
269     void ScanDecimalDigits(bool allowNumericSeparator);
270     virtual void CheckNumberLiteralEnd();
271 
272     inline static uint32_t HexValue(char32_t ch);
273     inline static bool IsDecimalDigit(uint32_t cp);
274     inline static bool IsHexDigit(char32_t ch);
275     inline static bool IsBinaryDigit(char32_t ch);
276     inline static bool IsOctalDigit(char32_t ch);
277 
278     friend class KeywordsUtil;
279     friend class TemplateLiteralParserContext;
280 
281     LexerPosition &Pos();
282     const LexerPosition &Pos() const;
283 
284 private:
285     TemplateLiteralParserContext *tlCtx_ {};
286     ArenaAllocator *allocator_;
287     Keywords *kws_ {};
288     const parser::ParserContext *parserContext_;
289     util::StringView source_;
290     LexerPosition pos_;
291 };
292 
293 class TemplateLiteralParserContext {
294 public:
TemplateLiteralParserContext(Lexer * lexer)295     explicit TemplateLiteralParserContext(Lexer *lexer) : lexer_(lexer), prev_(lexer_->tlCtx_) {}
296     NO_MOVE_SEMANTIC(TemplateLiteralParserContext);
297     NO_COPY_SEMANTIC(TemplateLiteralParserContext);
298 
~TemplateLiteralParserContext()299     ~TemplateLiteralParserContext()
300     {
301         lexer_->tlCtx_ = prev_;
302     }
303 
ConsumeLeftBrace()304     void ConsumeLeftBrace()
305     {
306         braceDepth_++;
307     }
308 
ConsumeRightBrace()309     bool ConsumeRightBrace()
310     {
311         braceDepth_--;
312 
313         return braceDepth_ == 0;
314     }
315 
316 private:
317     Lexer *lexer_;
318     TemplateLiteralParserContext *prev_ {};
319     size_t braceDepth_ {1};
320 };
321 
322 template <char32_t END>
ScanString()323 void Lexer::ScanString()
324 {
325     util::UString str(Allocator());
326     GetToken().type_ = TokenType::LITERAL_STRING;
327     GetToken().keywordType_ = TokenType::LITERAL_STRING;
328 
329     const auto startPos = Iterator().Index();
330     auto escapeEnd = startPos;
331 
332     do {
333         char32_t cp = Iterator().Peek();
334 
335         switch (cp) {
336             case util::StringView::Iterator::INVALID_CP: {
337                 ThrowError("Unterminated string");
338                 break;
339             }
340             case LEX_CHAR_CR:
341             case LEX_CHAR_LF: {
342                 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
343                 if constexpr (END != LEX_CHAR_BACK_TICK) {
344                     ThrowError("Newline is not allowed in strings");
345                 }
346 
347                 GetToken().flags_ |= TokenFlags::HAS_ESCAPE;
348                 str.Append(SourceView(escapeEnd, Iterator().Index()));
349 
350                 if (cp == LEX_CHAR_CR) {
351                     Iterator().Forward(1);
352 
353                     if (Iterator().Peek() != LEX_CHAR_LF) {
354                         Iterator().Backward(1);
355                     }
356                 }
357 
358                 pos_.line_++;
359                 str.Append(LEX_CHAR_LF);
360                 Iterator().Forward(1);
361                 escapeEnd = Iterator().Index();
362                 continue;
363             }
364             case LEX_CHAR_BACKSLASH: {
365                 GetToken().flags_ |= TokenFlags::HAS_ESCAPE;
366                 str.Append(SourceView(escapeEnd, Iterator().Index()));
367 
368                 Iterator().Forward(1);
369                 ScanStringUnicodePart(&str);
370                 escapeEnd = Iterator().Index();
371                 continue;
372             }
373             case LEX_CHAR_BACK_TICK:
374             case LEX_CHAR_SINGLE_QUOTE:
375             case LEX_CHAR_DOUBLE_QUOTE: {
376                 if (END == cp) {
377                     break;
378                 }
379 
380                 Iterator().Forward(1);
381                 continue;
382             }
383             case LEX_CHAR_DOLLAR_SIGN: {
384                 Iterator().Forward(1);
385 
386                 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
387                 if constexpr (END == LEX_CHAR_BACK_TICK) {
388                     if (Iterator().Peek() == LEX_CHAR_LEFT_BRACE) {
389                         Iterator().Backward(1);
390                         break;
391                     }
392                 }
393 
394                 continue;
395             }
396             default: {
397                 Iterator().SkipCp();
398                 continue;
399             }
400         }
401 
402         if (GetToken().flags_ & TokenFlags::HAS_ESCAPE) {
403             str.Append(SourceView(escapeEnd, Iterator().Index()));
404             GetToken().src_ = str.View();
405         } else {
406             GetToken().src_ = SourceView(startPos, Iterator().Index());
407         }
408 
409         break;
410     } while (true);
411 
412     // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
413     if constexpr (END != LEX_CHAR_BACK_TICK) {
414         Iterator().Forward(1);
415     }
416 }
417 
418 template <int N, bool IN_AS>
ScanHexEscape()419 char32_t Lexer::ScanHexEscape()
420 {
421     char32_t code = 0;
422 
423     for (size_t i = 0; i < N; ++i) {
424         const auto cp = Iterator().Peek();
425         if (IN_AS && cp == LEX_CHAR_BACK_TICK) {
426             break;
427         }
428 
429         Iterator().Forward(1);
430 
431         if (!IsHexDigit(cp)) {
432             ThrowError("Invalid unicode escape sequence");
433         }
434 
435         constexpr auto MULTIPLIER = 16;
436         code = code * MULTIPLIER + HexValue(cp);
437     }
438 
439     return code;
440 }
441 
442 template <typename RadixType, typename RadixLimit>
ScanNumberLeadingZeroImpl()443 void Lexer::ScanNumberLeadingZeroImpl()
444 {
445     GetToken().type_ = TokenType::LITERAL_NUMBER;
446     GetToken().keywordType_ = TokenType::LITERAL_NUMBER;
447 
448     switch (Iterator().Peek()) {
449         case LEX_CHAR_LOWERCASE_X:
450         case LEX_CHAR_UPPERCASE_X: {
451             Iterator().Forward(1);
452             constexpr auto RADIX = 16;
453             ScanNumberRadix<IsHexDigit, RADIX, RadixType, RadixLimit>();
454             CheckNumberLiteralEnd();
455             return;
456         }
457         case LEX_CHAR_LOWERCASE_B:
458         case LEX_CHAR_UPPERCASE_B: {
459             Iterator().Forward(1);
460             constexpr auto RADIX = 2;
461             ScanNumberRadix<IsBinaryDigit, RADIX, RadixType, RadixLimit>();
462             CheckNumberLiteralEnd();
463             return;
464         }
465         case LEX_CHAR_LOWERCASE_O:
466         case LEX_CHAR_UPPERCASE_O: {
467             Iterator().Forward(1);
468             constexpr auto RADIX = 8;
469             ScanNumberRadix<IsOctalDigit, RADIX, RadixType, RadixLimit>();
470 
471             switch (Iterator().Peek()) {
472                 case LEX_CHAR_8:
473                 case LEX_CHAR_9: {
474                     ThrowError("Invalid octal digit");
475                 }
476                 default: {
477                     break;
478                 }
479             }
480 
481             CheckNumberLiteralEnd();
482             return;
483         }
484         default: {
485             ScanNumberLeadingZeroImplNonAllowedCases();
486             break;
487         }
488     }
489 
490     ScanNumber();
491 }
492 
493 template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
ScanTooLargeNumber(RadixType number)494 void Lexer::ScanTooLargeNumber([[maybe_unused]] RadixType number)
495 {
496     if constexpr (std::is_arithmetic_v<RadixLimit>) {
497         if (number > std::numeric_limits<RadixLimit>::max() / RADIX) {
498             ThrowError("Number is too large");
499         }
500     }
501 }
502 
503 template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
ScanNumberRadix(bool allowNumericSeparator)504 void Lexer::ScanNumberRadix(bool allowNumericSeparator)
505 {
506     RadixType number {};
507 
508     auto cp = Iterator().Peek();
509     if (!RANGE_CHECK(cp)) {
510         ThrowError("Invalid digit");
511     }
512 
513     bool allowNumericOnNext = true;
514 
515     do {
516         cp = Iterator().Peek();
517         if (RANGE_CHECK(cp)) {
518             auto digit = HexValue(cp);
519 
520             ScanTooLargeNumber<RANGE_CHECK, RADIX, RadixType, RadixLimit>(number);
521 
522             number = number * RADIX + digit;
523             Iterator().Forward(1);
524             allowNumericOnNext = true;
525             continue;
526         }
527 
528         if (cp == LEX_CHAR_UNDERSCORE) {
529             if (!allowNumericSeparator || !allowNumericOnNext) {
530                 ThrowError("Invalid numeric separator");
531             }
532 
533             GetToken().flags_ |= TokenFlags::NUMBER_HAS_UNDERSCORE;
534             Iterator().Forward(1);
535             allowNumericOnNext = false;
536             continue;
537         }
538 
539         if (!allowNumericOnNext) {
540             Iterator().Backward(1);
541             ThrowError("Numeric separators are not allowed at the end of numeric literals");
542         }
543 
544         break;
545     } while (true);
546 
547     GetToken().number_ = lexer::Number(number);
548 }
549 
HexValue(char32_t ch)550 inline uint32_t Lexer::HexValue(char32_t ch)
551 {
552     constexpr uint32_t HEX_MASK = 0xF;
553     constexpr uint32_t DEC_OFFSET = 10;
554     return ch < LEX_CHAR_UPPERCASE_A ? ch - LEX_CHAR_0 : ((ch - LEX_CHAR_UPPERCASE_A + DEC_OFFSET) & HEX_MASK);
555 }
556 
IsDecimalDigit(uint32_t cp)557 inline bool Lexer::IsDecimalDigit(uint32_t cp)
558 {
559     return (cp >= LEX_CHAR_0 && cp <= LEX_CHAR_9);
560 }
561 
IsHexDigit(char32_t ch)562 inline bool Lexer::IsHexDigit(char32_t ch)
563 {
564     return ch < LEX_ASCII_MAX_BITS && (std::isxdigit(static_cast<unsigned char>(ch)) != 0);
565 }
566 
IsBinaryDigit(char32_t ch)567 inline bool Lexer::IsBinaryDigit(char32_t ch)
568 {
569     return ch == LEX_CHAR_0 || ch == LEX_CHAR_1;
570 }
571 
IsOctalDigit(char32_t ch)572 inline bool Lexer::IsOctalDigit(char32_t ch)
573 {
574     return (ch >= LEX_CHAR_0 && ch <= LEX_CHAR_7);
575 }
576 }  // namespace panda::es2panda::lexer
577 
578 #endif
579