• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #ifndef ES2PANDA_PARSER_CORE_LEXER_H
17 #define ES2PANDA_PARSER_CORE_LEXER_H
18 
19 #include "lexer/regexp/regexp.h"
20 #include "lexer/token/letters.h"
21 #include "lexer/token/token.h"
22 #include "util/enumbitops.h"
23 
24 namespace ark::es2panda::parser {
25 class ParserContext;
26 class ETSNolintParser;
27 }  // namespace ark::es2panda::parser
28 
29 namespace ark::es2panda::lexer {
30 class Keywords;
31 
32 using ENUMBITOPS_OPERATORS;
33 
// Bit flags accepted by Lexer::NextToken(); every enumerator other than NONE occupies its own bit.
// NOTE(review): the per-flag remarks below are inferred from the enumerator names only — confirm
// against the lexer implementation before relying on them.
enum class NextTokenFlags : uint32_t {
    NONE = 0U,                            // no flag set (default argument of Lexer::NextToken)
    KEYWORD_TO_IDENT = 1U << 0U,          // bit 0; presumably: treat keywords as identifiers
    NUMERIC_SEPARATOR_ALLOWED = 1U << 1U,  // bit 1; presumably: permit '_' separators in numbers
    BIGINT_ALLOWED = 1U << 2U,            // bit 2; presumably: permit BigInt literals
    UNARY_MINUS = 1U << 3U,               // bit 3; presumably: the number is preceded by a unary minus
};
41 
42 class LexerPosition {
43 public:
44     explicit LexerPosition(const util::StringView &source);
45     DEFAULT_COPY_SEMANTIC(LexerPosition);
46     DEFAULT_MOVE_SEMANTIC(LexerPosition);
47     ~LexerPosition() = default;
48 
Iterator()49     util::StringView::Iterator &Iterator()
50     {
51         return iterator_;
52     }
53 
Iterator()54     const util::StringView::Iterator &Iterator() const
55     {
56         return iterator_;
57     }
58 
Line()59     size_t Line() const
60     {
61         return line_;
62     }
63 
GetToken()64     Token &GetToken()
65     {
66         return token_;
67     }
68 
GetToken()69     const Token &GetToken() const
70     {
71         return token_;
72     }
73 
NextTokenLine()74     size_t &NextTokenLine()
75     {
76         return nextTokenLine_;
77     }
78 
79     bool operator==(const LexerPosition &other) const
80     {
81         return iterator_.Save() == other.iterator_.Save();
82     }
83 
84     bool operator!=(const LexerPosition &other) const
85     {
86         return !(*this == other);
87     }
88 
89 private:
90     friend class Lexer;
91 
92     Token token_ {};
93     util::StringView::Iterator iterator_;
94     size_t line_ {};
95     size_t nextTokenLine_ {};
96 };
97 
// Plain record describing a scanned piece of a template string; it is the return type of
// Lexer::ScanTemplateString() and is passed through Lexer::ScanTemplateStringCpHelper().
// All data members are intentionally public (see the NOLINT markers).
class LexerTemplateString {
public:
    // The text buffer is backed by the supplied arena allocator.
    explicit LexerTemplateString(ArenaAllocator *allocator) : str(allocator) {}
    DEFAULT_COPY_SEMANTIC(LexerTemplateString);
    DEFAULT_MOVE_SEMANTIC(LexerTemplateString);
    ~LexerTemplateString() = default;

    // NOLINTBEGIN(misc-non-private-member-variables-in-classes)
    util::UString str;          // accumulated text
    size_t end {};              // NOTE(review): presumably the source offset where the piece ends — confirm
    bool scanExpression {};     // NOTE(review): presumably set when a `${` expression follows — confirm
    bool validSequence {true};  // NOTE(review): presumably cleared on an invalid escape sequence — confirm
    // NOLINTEND(misc-non-private-member-variables-in-classes)
};
112 
113 class TemplateLiteralParserContext;
114 
115 class Lexer {
116 public:
117     explicit Lexer(const parser::ParserContext *parserContext, util::ErrorLogger *errorLogger, bool startLexer = true);
118     NO_COPY_SEMANTIC(Lexer);
119     NO_MOVE_SEMANTIC(Lexer);
120     virtual ~Lexer() = default;
121 
122     // NOLINTNEXTLINE(google-default-arguments)
123     virtual void NextToken(NextTokenFlags flags = NextTokenFlags::NONE);
124     virtual void ScanAsteriskPunctuator();
125 
126     Token &GetToken();
127     const Token &GetToken() const;
128     size_t Line() const;
129 
TryEatTokenType(lexer::TokenType type)130     bool TryEatTokenType(lexer::TokenType type)
131     {
132         auto token = GetToken();
133         if (token.Type() == type) {
134             NextToken();
135             return true;
136         }
137         return false;
138     }
139 
ErrorLogger()140     const util::ErrorLogger *ErrorLogger()
141     {
142         return errorLogger_;
143     }
144 
TryEatTokenKeyword(lexer::TokenType type)145     std::optional<Token> TryEatTokenKeyword(lexer::TokenType type)
146     {
147         auto token = GetToken();
148         if (token.KeywordType() == type) {
149             NextToken();
150             return token;
151         }
152         return std::nullopt;
153     }
154 
155     LexerPosition Save() const;
156     void Rewind(const LexerPosition &pos);
157     void BackwardToken(TokenType type, size_t offset);
158     void ForwardToken(TokenType type, size_t offset);
159 
160     char32_t Lookahead();
161     bool CheckArrow();
162 
163     RegExp ScanRegExp();
164 
165     void HandleNewlineHelper(util::UString *str, size_t *escapeEnd);
166     bool HandleBackslashHelper(util::UString *str, size_t *escapeEnd);
167     bool HandleDollarSignHelper(const char32_t &end);
168     bool HandleDoubleQuoteHelper(const char32_t &end, const char32_t &cp);
169     void PrepareStringTokenHelper();
170     void FinalizeTokenHelper(util::UString *str, const size_t &startPos, size_t escapeEnd, bool finalize = true);
171     template <char32_t END>
172     void ScanString();
173 
174     void ResetTokenEnd();
175     bool CheckOctalDigit(char32_t const nextCp);
176     std::tuple<bool, bool, LexerTemplateString> ScanTemplateStringCpHelper(char32_t cp,
177                                                                            LexerTemplateString templateStr);
178     LexerTemplateString ScanTemplateString();
179     void ScanTemplateStringEnd();
180     void PushTemplateContext(TemplateLiteralParserContext *ctx);
LogUnexpectedStrictModeReservedKeyword()181     void LogUnexpectedStrictModeReservedKeyword() const
182     {
183         LogSyntaxError("Unexpected strict mode reserved keyword");
184     }
185 
186     enum class ConversionResult : uint8_t {
187         SUCCESS,
188         INVALID_ARGUMENT,
189         OUT_OF_RANGE,
190     };
191 
192     template <typename Tret, typename Ret = Tret, typename... Base>
StrToNumeric(Tret (* converter)(const char *,char **,Base...),const char * str,ConversionResult & result,Base...base)193     static Ret StrToNumeric(Tret (*converter)(const char *, char **, Base...), const char *str,
194                             ConversionResult &result, Base... base) noexcept
195     {
196         Ret ret {};
197         char *endPtr;
198         // NOLINTBEGIN(cppcoreguidelines-special-member-functions)
199         struct SaveErrno {
200             explicit SaveErrno() : errno_(errno)
201             {
202                 errno = 0;
203             }
204             ~SaveErrno()
205             {
206                 if (errno == 0) {
207                     errno = errno_;
208                 }
209             }
210 
211         private:
212             decltype(errno) errno_;
213         } const savedErrno;
214         // NOLINTEND(cppcoreguidelines-special-member-functions)
215 
216         const Tret tmp = converter(str, &endPtr, base...);
217 
218         bool outOfRange = false;
219         if constexpr (!std::is_same_v<Ret, Tret>) {
220             outOfRange = tmp < static_cast<Tret>(std::numeric_limits<Ret>::min()) ||
221                          tmp > static_cast<Tret>(std::numeric_limits<Ret>::max());
222         }
223 
224         if (endPtr == str) {
225             result = ConversionResult::INVALID_ARGUMENT;
226         } else if (errno == ERANGE || outOfRange) {
227             result = ConversionResult::OUT_OF_RANGE;
228         } else {
229             result = ConversionResult::SUCCESS;
230             ret = tmp;
231         }
232 
233         return ret;
234     }
235 
236     util::StringView SourceView(size_t begin, size_t end) const;
237 
238 protected:
239     void NextToken(Keywords *kws);
240     ArenaAllocator *Allocator();
241     bool IsLineTerminatorOrEos() const;
242     bool ScanRegExpPattern();
243     RegExpFlags ScanRegExpFlags();
244 
245     void LogSyntaxError(std::string_view const errorMessage) const;
246     void LogSyntaxError(std::string_view const errorMessage, const lexer::SourcePosition &pos) const;
247     void LogUnexpectedToken(lexer::TokenType const tokenType) const;
248 
249     void SetTokenStart();
250     void SetTokenEnd();
251 
Iterator()252     inline util::StringView::Iterator &Iterator()
253     {
254         return pos_.iterator_;
255     }
256 
Iterator()257     inline const util::StringView::Iterator &Iterator() const
258     {
259         return pos_.iterator_;
260     }
261 
262     util::StringView SourceView(const util::StringView::Iterator &begin, const util::StringView::Iterator &end) const;
263 
264     bool SkipWhiteSpacesHelperSlash(char32_t *cp);
265     bool SkipWhiteSpacesHelperDefault(const char32_t &cp);
266     void SkipWhiteSpaces();
267     void SkipSingleLineComment();
268 
269     bool ScanPunctuator();
270     void ScanQuestionPunctuator();
271     void ScanLessThanPunctuator();
272     void ScanGreaterThanPunctuator();
273     virtual void ScanEqualsPunctuator();
274     virtual void ScanExclamationPunctuator();
275     void ScanAmpersandPunctuator();
276     void ScanVLinePunctuator();
277     void ScanCircumflexPunctuator();
278     void ScanPlusPunctuator();
279     void ScanMinusPunctuator();
280     void ScanSlashPunctuator();
281     void ScanPercentPunctuator();
282     void ScanDotPunctuator();
283     void ScanColonPunctuator();
284     virtual bool ScanDollarPunctuator();
285     void ScanAtPunctuator();
286 
287     virtual void SkipMultiLineComment();
288     virtual void ScanHashMark();
289     virtual void ScanBackTick();
290 
ScanCharLiteral()291     virtual bool ScanCharLiteral()
292     {
293         return false;
294     }
295 
296     char32_t ScanUnicodeEscapeSequence();
297     template <int N, bool IN_AS = false>
298     char32_t ScanHexEscape();
299     char32_t ScanUnicodeCodePointEscape();
300 
301     bool ScanStringUnicodePart(util::UString *str);
302     char32_t ScanUnicodeCharacterHelper(size_t cpSize, char32_t cp);
303     char32_t ScanUnicodeCharacter();
304 
305     void ScanDecimalNumbers();
306 
ScanNumberLeadingZero(bool const leadingMinus)307     virtual void ScanNumberLeadingZero(bool const leadingMinus)
308     {
309         ScanNumberLeadingZeroImpl<double>(leadingMinus);
310     }
311 
312     template <typename RadixType, typename RadixLimit = RadixType>
313     bool ScanNumberLeadingZeroImpl(bool const leadingMinus);
314     void ScanNumberLeadingZeroImplNonAllowedCases();
315     template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
316     bool ScanNumberRadix(bool leadingMinus, bool allowNumericSeparator = true);
317     void ScanNumber(bool const leadingMinus = false, bool allowBigInt = true);
318     std::optional<std::size_t> ScanCharLex(bool parseExponent, bool &allowBigInt, NumberFlags &flags);
319     std::optional<std::size_t> ScanSignOfNumber() noexcept;
320     virtual void ConvertNumber(NumberFlags flags);
321     void ScanDecimalLiteral();
322     void ScanDecimalDigits(bool allowNumericSeparator);
323     virtual void CheckNumberLiteralEnd();
324     void CheckOctal();
325 
326     inline static uint32_t HexValue(char32_t ch);
327     inline static bool IsDecimalDigit(uint32_t cp);
328     inline static bool IsHexDigit(char32_t ch);
329     inline static bool IsBinaryDigit(char32_t ch);
330     inline static bool IsOctalDigit(char32_t ch);
331 
332     friend class KeywordsUtil;
333     friend class TemplateLiteralParserContext;
334     friend class parser::ETSNolintParser;
335 
336     LexerPosition &Pos();
337     const LexerPosition &Pos() const;
338 
339 private:
340     TemplateLiteralParserContext *tlCtx_ {};
341     ArenaAllocator *allocator_;
342     Keywords *kws_ {};
343     const parser::ParserContext *parserContext_;
344     util::StringView source_;
345     LexerPosition pos_;
346     util::ErrorLogger *const errorLogger_;
347 };
348 
349 class TemplateLiteralParserContext {
350 public:
TemplateLiteralParserContext(Lexer * lexer)351     explicit TemplateLiteralParserContext(Lexer *lexer) : lexer_(lexer), prev_(lexer_->tlCtx_) {}
352     NO_MOVE_SEMANTIC(TemplateLiteralParserContext);
353     NO_COPY_SEMANTIC(TemplateLiteralParserContext);
354 
~TemplateLiteralParserContext()355     ~TemplateLiteralParserContext()
356     {
357         lexer_->tlCtx_ = prev_;
358     }
359 
ConsumeLeftBrace()360     void ConsumeLeftBrace()
361     {
362         braceDepth_++;
363     }
364 
ConsumeRightBrace()365     bool ConsumeRightBrace()
366     {
367         braceDepth_--;
368 
369         return braceDepth_ == 0;
370     }
371 
372 private:
373     Lexer *lexer_;
374     TemplateLiteralParserContext *prev_ {};
375     size_t braceDepth_ {1};
376 };
377 
// Scans the body of a string-like literal whose terminator is the template argument END
// (the back-tick instantiation is treated specially for newlines and for the trailing advance).
// Text is accumulated in a UString and handed to FinalizeTokenHelper() exactly once, when the
// loop ends. Within the switch, `continue` keeps scanning while `break` leaves the switch and
// falls through to finalization.
template <char32_t END>
// CC-OFFNXT(huge_method,G.FUN.01) big switch-case, solid logic
void Lexer::ScanString()
{
    util::UString str(Allocator());
    PrepareStringTokenHelper();
    const auto startPos = Iterator().Index();
    auto escapeEnd = startPos;
    // Stays true only while every backslash sequence so far was reported valid by the helper.
    bool validEscape = true;

    do {
        const char32_t cp = Iterator().Peek();
        switch (cp) {
            case util::StringView::Iterator::INVALID_CP: {
                // The iterator yielded no valid code point before a terminator was seen.
                LogSyntaxError("Unterminated string");
                break;
            }
            case LEX_CHAR_CR:
            case LEX_CHAR_LF: {
                // Raw newlines are an error unless the literal is back-tick delimited.
                if constexpr (END != LEX_CHAR_BACK_TICK) {
                    LogSyntaxError("Newline is not allowed in strings");
                    break;
                }
                HandleNewlineHelper(&str, &escapeEnd);
                continue;
            }
            case LEX_CHAR_BACKSLASH: {
                validEscape &= HandleBackslashHelper(&str, &escapeEnd);
                continue;
            }
            case LEX_CHAR_BACK_TICK:
            case LEX_CHAR_SINGLE_QUOTE:
            case LEX_CHAR_DOUBLE_QUOTE: {
                // A false result from the helper ends the scan.
                // NOTE(review): presumably false means `cp` is the terminator END — confirm in the helper.
                if (!HandleDoubleQuoteHelper(END, cp)) {
                    break;
                }
                continue;
            }
            case LEX_CHAR_DOLLAR_SIGN: {
                // A true result from the helper ends the scan.
                // NOTE(review): presumably true means a `${` substitution starts here — confirm in the helper.
                if (HandleDollarSignHelper(END)) {
                    break;
                }
                continue;
            }
            default: {
                // Ordinary code point: just step over it.
                Iterator().SkipCp();
                continue;
            }
        }

        // Reached only via `break` from the switch; `validEscape` is passed as the `finalize` argument.
        FinalizeTokenHelper(&str, startPos, escapeEnd, validEscape);
        break;
    } while (true);

    // For non-back-tick literals the iterator is advanced by one more position.
    // NOTE(review): presumably this steps past the closing quote — confirm.
    // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
    if constexpr (END != LEX_CHAR_BACK_TICK) {
        Iterator().Forward(1);
    }
}
437 
438 template <int N, bool IN_AS>
ScanHexEscape()439 char32_t Lexer::ScanHexEscape()
440 {
441     char32_t code = 0;
442 
443     for (size_t i = 0; i < N; ++i) {
444         const auto cp = Iterator().Peek();
445         if (IN_AS && cp == LEX_CHAR_BACK_TICK) {
446             break;
447         }
448 
449         Iterator().Forward(1);
450 
451         if (!IsHexDigit(cp)) {
452             LogSyntaxError("Invalid unicode escape sequence");
453             return UNICODE_INVALID_CP;
454         }
455 
456         constexpr auto MULTIPLIER = 16;
457         code = code * MULTIPLIER + HexValue(cp);
458     }
459 
460     return code;
461 }
462 
463 template <typename RadixType, typename RadixLimit>
ScanNumberLeadingZeroImpl(bool const leadingMinus)464 bool Lexer::ScanNumberLeadingZeroImpl(bool const leadingMinus)
465 {
466     GetToken().type_ = TokenType::LITERAL_NUMBER;
467     GetToken().keywordType_ = TokenType::LITERAL_NUMBER;
468 
469     switch (Iterator().Peek()) {
470         case LEX_CHAR_LOWERCASE_X:
471         case LEX_CHAR_UPPERCASE_X: {
472             Iterator().Forward(1);
473             constexpr auto RADIX = 16;
474             if (!ScanNumberRadix<IsHexDigit, RADIX, RadixType, RadixLimit>(leadingMinus)) {
475                 return false;
476             }
477             CheckNumberLiteralEnd();
478             return true;
479         }
480         case LEX_CHAR_LOWERCASE_B:
481         case LEX_CHAR_UPPERCASE_B: {
482             Iterator().Forward(1);
483             constexpr auto RADIX = 2;
484             if (!ScanNumberRadix<IsBinaryDigit, RADIX, RadixType, RadixLimit>(leadingMinus)) {
485                 return false;
486             }
487             CheckNumberLiteralEnd();
488             return true;
489         }
490         case LEX_CHAR_LOWERCASE_O:
491         case LEX_CHAR_UPPERCASE_O: {
492             Iterator().Forward(1);
493             constexpr auto RADIX = 8;
494             if (!ScanNumberRadix<IsOctalDigit, RADIX, RadixType, RadixLimit>(leadingMinus)) {
495                 return false;
496             }
497             CheckOctal();
498             CheckNumberLiteralEnd();
499             return true;
500         }
501         default: {
502             ScanNumberLeadingZeroImplNonAllowedCases();
503             break;
504         }
505     }
506 
507     ScanNumber(leadingMinus);
508     return true;
509 }
510 
// Overflow guard for accumulating `number * RADIX + digit`.
// Despite the name, the function returns true when the next digit still FITS and false when
// appending it would exceed std::numeric_limits<RadixLimit>::max(). For a non-integral
// RadixLimit no check is performed and the result is always true.
template <int RADIX, typename RadixType, typename RadixLimit>
bool ScanTooLargeNumber([[maybe_unused]] RadixType const number, [[maybe_unused]] std::uint32_t const digit)
{
    // NOTE (DZ): probably more sophisticates check will be required for general usage
    if constexpr (std::is_integral_v<RadixLimit>) {
        const auto limit = static_cast<RadixType>(std::numeric_limits<RadixLimit>::max());

        // The multiplication alone would already exceed the limit.
        if (limit / RADIX < number) {
            return false;
        }

        // The multiplication fits; check that there is still room for the digit.
        return !(limit - number * RADIX < digit);
    } else {
        return true;
    }
}
523 
524 template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
ScanNumberRadix(bool leadingMinus,bool allowNumericSeparator)525 bool Lexer::ScanNumberRadix(bool leadingMinus, bool allowNumericSeparator)
526 {
527     RadixType number {};
528 
529     auto cp = Iterator().Peek();
530     if (!RANGE_CHECK(cp)) {
531         LogSyntaxError("Invalid digit");
532     }
533 
534     bool allowNumericOnNext = true;
535 
536     do {
537         cp = Iterator().Peek();
538         if (RANGE_CHECK(cp)) {
539             auto const digit = HexValue(cp);
540             if (!ScanTooLargeNumber<RADIX, RadixType, RadixLimit>(number, digit)) {
541                 return false;
542             }
543             number = number * RADIX + digit;
544 
545             Iterator().Forward(1);
546             allowNumericOnNext = true;
547             continue;
548         }
549 
550         if (cp == LEX_CHAR_UNDERSCORE) {
551             if (!allowNumericSeparator || !allowNumericOnNext) {
552                 LogSyntaxError("Invalid numeric separator");
553             }
554 
555             GetToken().flags_ |= TokenFlags::NUMBER_HAS_UNDERSCORE;
556             Iterator().Forward(1);
557             allowNumericOnNext = false;
558             continue;
559         }
560 
561         if (!allowNumericOnNext) {
562             Iterator().Backward(1);
563             LogSyntaxError("Numeric separators are not allowed at the end of numeric literals");
564         }
565 
566         break;
567     } while (true);
568 
569     if (leadingMinus) {
570         if constexpr (std::is_integral_v<RadixType>) {
571             number = ~number + static_cast<RadixType>(1);
572         } else {
573             number = -number;
574         }
575     }
576 
577     GetToken().number_ = lexer::Number(number);
578     return true;
579 }
580 
HexValue(char32_t ch)581 inline uint32_t Lexer::HexValue(char32_t ch)
582 {
583     constexpr uint32_t HEX_MASK = 0xF;
584     constexpr uint32_t DEC_OFFSET = 10;
585     return ch < LEX_CHAR_UPPERCASE_A ? ch - LEX_CHAR_0 : ((ch - LEX_CHAR_UPPERCASE_A + DEC_OFFSET) & HEX_MASK);
586 }
587 
IsDecimalDigit(uint32_t cp)588 inline bool Lexer::IsDecimalDigit(uint32_t cp)
589 {
590     return (cp >= LEX_CHAR_0 && cp <= LEX_CHAR_9);
591 }
592 
IsHexDigit(char32_t ch)593 inline bool Lexer::IsHexDigit(char32_t ch)
594 {
595     return ch < LEX_ASCII_MAX_BITS && (std::isxdigit(static_cast<unsigned char>(ch)) != 0);
596 }
597 
IsBinaryDigit(char32_t ch)598 inline bool Lexer::IsBinaryDigit(char32_t ch)
599 {
600     return ch == LEX_CHAR_0 || ch == LEX_CHAR_1;
601 }
602 
IsOctalDigit(char32_t ch)603 inline bool Lexer::IsOctalDigit(char32_t ch)
604 {
605     return (ch >= LEX_CHAR_0 && ch <= LEX_CHAR_7);
606 }
607 }  // namespace ark::es2panda::lexer
608 
// Specializes the enumbitops::IsAllowedType trait for NextTokenFlags so that it derives from
// std::true_type.
// NOTE(review): presumably this is what enables the ENUMBITOPS_OPERATORS bitwise operators for
// the enum — confirm in util/enumbitops.h.
template <>
struct enumbitops::IsAllowedType<ark::es2panda::lexer::NextTokenFlags> : std::true_type {
};
612 
613 #endif
614