• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/**
 * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
15 
16 #ifndef ES2PANDA_PARSER_CORE_LEXER_H
17 #define ES2PANDA_PARSER_CORE_LEXER_H
18 
19 #include <ios>
20 #include "lexer/regexp/regexp.h"
21 #include "lexer/token/letters.h"
22 #include "lexer/token/token.h"
23 #include "util/enumbitops.h"
24 
25 namespace ark::es2panda::parser {
26 class ParserContext;
27 class ETSNolintParser;
28 }  // namespace ark::es2panda::parser
29 
30 namespace ark::es2panda::lexer {
31 class Keywords;
32 
33 using ENUMBITOPS_OPERATORS;
34 
35 enum class NextTokenFlags : uint32_t {
36     NONE = 0U,
37     KEYWORD_TO_IDENT = 1U << 0U,
38     NUMERIC_SEPARATOR_ALLOWED = 1U << 1U,
39     BIGINT_ALLOWED = 1U << 2U,
40 };
41 
42 class LexerPosition {
43 public:
44     explicit LexerPosition(const util::StringView &source);
45     DEFAULT_COPY_SEMANTIC(LexerPosition);
46     DEFAULT_MOVE_SEMANTIC(LexerPosition);
47     ~LexerPosition() = default;
48 
Iterator()49     util::StringView::Iterator &Iterator()
50     {
51         return iterator_;
52     }
53 
Iterator()54     const util::StringView::Iterator &Iterator() const
55     {
56         return iterator_;
57     }
58 
Line()59     size_t Line() const
60     {
61         return line_;
62     }
63 
GetToken()64     Token &GetToken()
65     {
66         return token_;
67     }
68 
GetToken()69     const Token &GetToken() const
70     {
71         return token_;
72     }
73 
NextTokenLine()74     size_t &NextTokenLine()
75     {
76         return nextTokenLine_;
77     }
78 
79 private:
80     friend class Lexer;
81 
82     Token token_ {};
83     util::StringView::Iterator iterator_;
84     size_t line_ {};
85     size_t nextTokenLine_ {};
86 };
87 
88 class LexerTemplateString {
89 public:
LexerTemplateString(ArenaAllocator * allocator)90     explicit LexerTemplateString(ArenaAllocator *allocator) : str(allocator) {}
91     DEFAULT_COPY_SEMANTIC(LexerTemplateString);
92     DEFAULT_MOVE_SEMANTIC(LexerTemplateString);
93     ~LexerTemplateString() = default;
94 
95     // NOLINTBEGIN(misc-non-private-member-variables-in-classes)
96     util::UString str;
97     size_t end {};
98     bool scanExpression {};
99     // NOLINTEND(misc-non-private-member-variables-in-classes)
100 };
101 
102 class TemplateLiteralParserContext;
103 
104 class Lexer {
105 public:
106     explicit Lexer(const parser::ParserContext *parserContext, bool startLexer = true);
107     NO_COPY_SEMANTIC(Lexer);
108     NO_MOVE_SEMANTIC(Lexer);
109     virtual ~Lexer() = default;
110 
111     // NOLINTNEXTLINE(google-default-arguments)
112     virtual void NextToken(NextTokenFlags flags = NextTokenFlags::NONE);
113     virtual void ScanAsteriskPunctuator();
114 
115     Token &GetToken();
116     const Token &GetToken() const;
117     size_t Line() const;
118 
TryEatTokenType(lexer::TokenType type)119     bool TryEatTokenType(lexer::TokenType type)
120     {
121         auto token = GetToken();
122         if (token.Type() == type) {
123             NextToken();
124             return true;
125         }
126         return false;
127     }
128 
TryEatTokenKeyword(lexer::TokenType type)129     std::optional<Token> TryEatTokenKeyword(lexer::TokenType type)
130     {
131         auto token = GetToken();
132         if (token.KeywordType() == type) {
133             NextToken();
134             return token;
135         }
136         return std::nullopt;
137     }
138 
139     LexerPosition Save() const;
140     void Rewind(const LexerPosition &pos);
141     void BackwardToken(TokenType type, size_t offset);
142     void ForwardToken(TokenType type, size_t offset);
143 
144     char32_t Lookahead();
145     bool CheckArrow();
146 
147     RegExp ScanRegExp();
148     template <char32_t END>
149     void ScanString();
150     void ResetTokenEnd();
151     LexerTemplateString ScanTemplateString();
152     void ScanTemplateStringEnd();
153     void PushTemplateContext(TemplateLiteralParserContext *ctx);
ThrowUnexpectedStrictModeReservedKeyword()154     [[noreturn]] void ThrowUnexpectedStrictModeReservedKeyword() const
155     {
156         ThrowError("Unexpected strict mode reserved keyword");
157     }
158 
159     enum class ConversionResult : uint8_t {
160         SUCCESS,
161         INVALID_ARGUMENT,
162         OUT_OF_RANGE,
163     };
164 
165     template <typename Tret, typename Ret = Tret, typename... Base>
StrToNumeric(Tret (* converter)(const char *,char **,Base...),const char * str,ConversionResult & result,Base...base)166     static Ret StrToNumeric(Tret (*converter)(const char *, char **, Base...), const char *str,
167                             ConversionResult &result, Base... base) noexcept
168     {
169         Ret ret {};
170         char *endPtr;
171         // NOLINTBEGIN(cppcoreguidelines-special-member-functions)
172         struct SaveErrno {
173             explicit SaveErrno() : errno_(errno)
174             {
175                 errno = 0;
176             }
177             ~SaveErrno()
178             {
179                 if (errno == 0) {
180                     errno = errno_;
181                 }
182             }
183 
184         private:
185             decltype(errno) errno_;
186         } const savedErrno;
187         // NOLINTEND(cppcoreguidelines-special-member-functions)
188 
189         const Tret tmp = converter(str, &endPtr, base...);
190 
191         bool outOfRange = false;
192         if constexpr (std::is_same_v<Ret, int>) {
193             outOfRange = tmp < static_cast<Tret>(std::numeric_limits<int>::min()) ||
194                          tmp > static_cast<Tret>(std::numeric_limits<int>::max());
195         }
196 
197         if (endPtr == str) {
198             result = ConversionResult::INVALID_ARGUMENT;
199         } else if (errno == ERANGE || outOfRange) {
200             result = ConversionResult::OUT_OF_RANGE;
201         } else {
202             result = ConversionResult::SUCCESS;
203             ret = tmp;
204         }
205 
206         return ret;
207     }
208 
209     util::StringView SourceView(size_t begin, size_t end) const;
210 
211 protected:
212     void NextToken(Keywords *kws);
213     ArenaAllocator *Allocator();
214     bool IsLineTerminatorOrEos() const;
215     void ScanRegExpPattern();
216     RegExpFlags ScanRegExpFlags();
217 
218     [[noreturn]] void ThrowError(std::string_view message) const;
219     [[noreturn]] void ThrowUnexpectedToken(lexer::TokenType tokenType) const;
220 
221     void SetTokenStart();
222     void SetTokenEnd();
223 
Iterator()224     inline util::StringView::Iterator &Iterator()
225     {
226         return pos_.iterator_;
227     }
228 
Iterator()229     inline const util::StringView::Iterator &Iterator() const
230     {
231         return pos_.iterator_;
232     }
233 
234     util::StringView SourceView(const util::StringView::Iterator &begin, const util::StringView::Iterator &end) const;
235 
236     void SkipWhiteSpaces();
237     void SkipSingleLineComment();
238 
239     bool ScanPunctuator();
240     void ScanQuestionPunctuator();
241     void ScanLessThanPunctuator();
242     void ScanGreaterThanPunctuator();
243     virtual void ScanEqualsPunctuator();
244     virtual void ScanExclamationPunctuator();
245     void ScanAmpersandPunctuator();
246     void ScanVLinePunctuator();
247     void ScanCircumflexPunctuator();
248     void ScanPlusPunctuator();
249     void ScanMinusPunctuator();
250     void ScanSlashPunctuator();
251     void ScanPercentPunctuator();
252     void ScanDotPunctuator();
253     void ScanColonPunctuator();
254     virtual bool ScanDollarPunctuator();
255     void ScanAtPunctuator();
256 
257     virtual void SkipMultiLineComment();
258     virtual void ScanHashMark();
259     virtual void ScanBackTick();
260 
ScanCharLiteral()261     virtual bool ScanCharLiteral()
262     {
263         return false;
264     }
265 
266     char32_t ScanUnicodeEscapeSequence();
267     template <int N, bool IN_AS = false>
268     char32_t ScanHexEscape();
269     char32_t ScanUnicodeCodePointEscape();
270 
271     void ScanStringUnicodePart(util::UString *str);
272     char32_t ScanUnicodeCharacter();
273 
274     void ScanDecimalNumbers();
275 
ScanNumberLeadingZero()276     virtual void ScanNumberLeadingZero()
277     {
278         ScanNumberLeadingZeroImpl<double>();
279     }
280 
281     template <typename RadixType, typename RadixLimit = void *>
282     void ScanNumberLeadingZeroImpl();
283     void ScanNumberLeadingZeroImplNonAllowedCases();
284     template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
285     void ScanNumberRadix(bool allowNumericSeparator = true);
286     void ScanNumber(bool allowBigInt = true);
287     std::tuple<size_t, bool, NumberFlags> ScanCharLex(bool allowBigInt, bool parseExponent, NumberFlags flags);
288     size_t ScanSignOfNumber();
289     template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
290     void ScanTooLargeNumber(RadixType number);
291     virtual void ConvertNumber(const std::string &utf8, NumberFlags flags);
292     void ScanDecimalLiteral();
293     void ScanDecimalDigits(bool allowNumericSeparator);
294     virtual void CheckNumberLiteralEnd();
295 
296     inline static uint32_t HexValue(char32_t ch);
297     inline static bool IsDecimalDigit(uint32_t cp);
298     inline static bool IsHexDigit(char32_t ch);
299     inline static bool IsBinaryDigit(char32_t ch);
300     inline static bool IsOctalDigit(char32_t ch);
301 
302     friend class KeywordsUtil;
303     friend class TemplateLiteralParserContext;
304     friend class parser::ETSNolintParser;
305 
306     LexerPosition &Pos();
307     const LexerPosition &Pos() const;
308 
309 private:
310     TemplateLiteralParserContext *tlCtx_ {};
311     ArenaAllocator *allocator_;
312     Keywords *kws_ {};
313     const parser::ParserContext *parserContext_;
314     util::StringView source_;
315     LexerPosition pos_;
316 };
317 
318 class TemplateLiteralParserContext {
319 public:
TemplateLiteralParserContext(Lexer * lexer)320     explicit TemplateLiteralParserContext(Lexer *lexer) : lexer_(lexer), prev_(lexer_->tlCtx_) {}
321     NO_MOVE_SEMANTIC(TemplateLiteralParserContext);
322     NO_COPY_SEMANTIC(TemplateLiteralParserContext);
323 
~TemplateLiteralParserContext()324     ~TemplateLiteralParserContext()
325     {
326         lexer_->tlCtx_ = prev_;
327     }
328 
ConsumeLeftBrace()329     void ConsumeLeftBrace()
330     {
331         braceDepth_++;
332     }
333 
ConsumeRightBrace()334     bool ConsumeRightBrace()
335     {
336         braceDepth_--;
337 
338         return braceDepth_ == 0;
339     }
340 
341 private:
342     Lexer *lexer_;
343     TemplateLiteralParserContext *prev_ {};
344     size_t braceDepth_ {1};
345 };
346 
347 template <char32_t END>
ScanString()348 void Lexer::ScanString()
349 {
350     util::UString str(Allocator());
351     GetToken().type_ = TokenType::LITERAL_STRING;
352     GetToken().keywordType_ = TokenType::LITERAL_STRING;
353 
354     const auto startPos = Iterator().Index();
355     auto escapeEnd = startPos;
356 
357     do {
358         char32_t cp = Iterator().Peek();
359 
360         switch (cp) {
361             case util::StringView::Iterator::INVALID_CP: {
362                 ThrowError("Unterminated string");
363                 break;
364             }
365             case LEX_CHAR_CR:
366             case LEX_CHAR_LF: {
367                 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
368                 if constexpr (END != LEX_CHAR_BACK_TICK) {
369                     ThrowError("Newline is not allowed in strings");
370                 }
371 
372                 GetToken().flags_ |= TokenFlags::HAS_ESCAPE;
373                 str.Append(SourceView(escapeEnd, Iterator().Index()));
374 
375                 if (cp == LEX_CHAR_CR) {
376                     Iterator().Forward(1);
377 
378                     if (Iterator().Peek() != LEX_CHAR_LF) {
379                         Iterator().Backward(1);
380                     }
381                 }
382 
383                 pos_.line_++;
384                 str.Append(LEX_CHAR_LF);
385                 Iterator().Forward(1);
386                 escapeEnd = Iterator().Index();
387                 continue;
388             }
389             case LEX_CHAR_BACKSLASH: {
390                 GetToken().flags_ |= TokenFlags::HAS_ESCAPE;
391                 str.Append(SourceView(escapeEnd, Iterator().Index()));
392 
393                 Iterator().Forward(1);
394                 ScanStringUnicodePart(&str);
395                 escapeEnd = Iterator().Index();
396                 continue;
397             }
398             case LEX_CHAR_BACK_TICK:
399             case LEX_CHAR_SINGLE_QUOTE:
400             case LEX_CHAR_DOUBLE_QUOTE: {
401                 if (END == cp) {
402                     break;
403                 }
404 
405                 Iterator().Forward(1);
406                 continue;
407             }
408             case LEX_CHAR_DOLLAR_SIGN: {
409                 Iterator().Forward(1);
410 
411                 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
412                 if constexpr (END == LEX_CHAR_BACK_TICK) {
413                     if (Iterator().Peek() == LEX_CHAR_LEFT_BRACE) {
414                         Iterator().Backward(1);
415                         break;
416                     }
417                 }
418 
419                 continue;
420             }
421             default: {
422                 Iterator().SkipCp();
423                 continue;
424             }
425         }
426 
427         if (GetToken().flags_ & TokenFlags::HAS_ESCAPE) {
428             str.Append(SourceView(escapeEnd, Iterator().Index()));
429             GetToken().src_ = str.View();
430         } else {
431             GetToken().src_ = SourceView(startPos, Iterator().Index());
432         }
433 
434         break;
435     } while (true);
436 
437     // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
438     if constexpr (END != LEX_CHAR_BACK_TICK) {
439         Iterator().Forward(1);
440     }
441 }
442 
443 template <int N, bool IN_AS>
ScanHexEscape()444 char32_t Lexer::ScanHexEscape()
445 {
446     char32_t code = 0;
447 
448     for (size_t i = 0; i < N; ++i) {
449         const auto cp = Iterator().Peek();
450         if (IN_AS && cp == LEX_CHAR_BACK_TICK) {
451             break;
452         }
453 
454         Iterator().Forward(1);
455 
456         if (!IsHexDigit(cp)) {
457             ThrowError("Invalid unicode escape sequence");
458         }
459 
460         constexpr auto MULTIPLIER = 16;
461         code = code * MULTIPLIER + HexValue(cp);
462     }
463 
464     return code;
465 }
466 
467 template <typename RadixType, typename RadixLimit>
ScanNumberLeadingZeroImpl()468 void Lexer::ScanNumberLeadingZeroImpl()
469 {
470     GetToken().type_ = TokenType::LITERAL_NUMBER;
471     GetToken().keywordType_ = TokenType::LITERAL_NUMBER;
472 
473     switch (Iterator().Peek()) {
474         case LEX_CHAR_LOWERCASE_X:
475         case LEX_CHAR_UPPERCASE_X: {
476             Iterator().Forward(1);
477             constexpr auto RADIX = 16;
478             ScanNumberRadix<IsHexDigit, RADIX, RadixType, RadixLimit>();
479             CheckNumberLiteralEnd();
480             return;
481         }
482         case LEX_CHAR_LOWERCASE_B:
483         case LEX_CHAR_UPPERCASE_B: {
484             Iterator().Forward(1);
485             constexpr auto RADIX = 2;
486             ScanNumberRadix<IsBinaryDigit, RADIX, RadixType, RadixLimit>();
487             CheckNumberLiteralEnd();
488             return;
489         }
490         case LEX_CHAR_LOWERCASE_O:
491         case LEX_CHAR_UPPERCASE_O: {
492             Iterator().Forward(1);
493             constexpr auto RADIX = 8;
494             ScanNumberRadix<IsOctalDigit, RADIX, RadixType, RadixLimit>();
495 
496             switch (Iterator().Peek()) {
497                 case LEX_CHAR_8:
498                 case LEX_CHAR_9: {
499                     ThrowError("Invalid octal digit");
500                 }
501                 default: {
502                     break;
503                 }
504             }
505 
506             CheckNumberLiteralEnd();
507             return;
508         }
509         default: {
510             ScanNumberLeadingZeroImplNonAllowedCases();
511             break;
512         }
513     }
514 
515     ScanNumber();
516 }
517 
518 template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
ScanTooLargeNumber(RadixType number)519 void Lexer::ScanTooLargeNumber([[maybe_unused]] RadixType number)
520 {
521     if constexpr (std::is_arithmetic_v<RadixLimit>) {
522         if (number > std::numeric_limits<RadixLimit>::max() / RADIX) {
523             ThrowError("Number is too large");
524         }
525     }
526 }
527 
528 template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
ScanNumberRadix(bool allowNumericSeparator)529 void Lexer::ScanNumberRadix(bool allowNumericSeparator)
530 {
531     RadixType number {};
532 
533     auto cp = Iterator().Peek();
534     if (!RANGE_CHECK(cp)) {
535         ThrowError("Invalid digit");
536     }
537 
538     bool allowNumericOnNext = true;
539 
540     do {
541         cp = Iterator().Peek();
542         if (RANGE_CHECK(cp)) {
543             auto digit = HexValue(cp);
544 
545             ScanTooLargeNumber<RANGE_CHECK, RADIX, RadixType, RadixLimit>(number);
546 
547             number = number * RADIX + digit;
548             Iterator().Forward(1);
549             allowNumericOnNext = true;
550             continue;
551         }
552 
553         if (cp == LEX_CHAR_UNDERSCORE) {
554             if (!allowNumericSeparator || !allowNumericOnNext) {
555                 ThrowError("Invalid numeric separator");
556             }
557 
558             GetToken().flags_ |= TokenFlags::NUMBER_HAS_UNDERSCORE;
559             Iterator().Forward(1);
560             allowNumericOnNext = false;
561             continue;
562         }
563 
564         if (!allowNumericOnNext) {
565             Iterator().Backward(1);
566             ThrowError("Numeric separators are not allowed at the end of numeric literals");
567         }
568 
569         break;
570     } while (true);
571 
572     GetToken().number_ = lexer::Number(number);
573 }
574 
HexValue(char32_t ch)575 inline uint32_t Lexer::HexValue(char32_t ch)
576 {
577     constexpr uint32_t HEX_MASK = 0xF;
578     constexpr uint32_t DEC_OFFSET = 10;
579     return ch < LEX_CHAR_UPPERCASE_A ? ch - LEX_CHAR_0 : ((ch - LEX_CHAR_UPPERCASE_A + DEC_OFFSET) & HEX_MASK);
580 }
581 
IsDecimalDigit(uint32_t cp)582 inline bool Lexer::IsDecimalDigit(uint32_t cp)
583 {
584     return (cp >= LEX_CHAR_0 && cp <= LEX_CHAR_9);
585 }
586 
IsHexDigit(char32_t ch)587 inline bool Lexer::IsHexDigit(char32_t ch)
588 {
589     return ch < LEX_ASCII_MAX_BITS && (std::isxdigit(static_cast<unsigned char>(ch)) != 0);
590 }
591 
IsBinaryDigit(char32_t ch)592 inline bool Lexer::IsBinaryDigit(char32_t ch)
593 {
594     return ch == LEX_CHAR_0 || ch == LEX_CHAR_1;
595 }
596 
IsOctalDigit(char32_t ch)597 inline bool Lexer::IsOctalDigit(char32_t ch)
598 {
599     return (ch >= LEX_CHAR_0 && ch <= LEX_CHAR_7);
600 }
601 }  // namespace ark::es2panda::lexer
602 
603 template <>
604 struct enumbitops::IsAllowedType<ark::es2panda::lexer::NextTokenFlags> : std::true_type {
605 };
606 
607 #endif
608