• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/**
 * Copyright (c) 2021-2025 Huawei Device Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
15 
16 #ifndef ES2PANDA_PARSER_CORE_LEXER_H
17 #define ES2PANDA_PARSER_CORE_LEXER_H
18 
19 #include "lexer/regexp/regexp.h"
20 #include "lexer/token/letters.h"
21 #include "lexer/token/token.h"
22 #include "util/enumbitops.h"
23 
24 namespace ark::es2panda::parser {
25 class ParserContext;
26 class ETSNolintParser;
27 }  // namespace ark::es2panda::parser
28 
29 namespace ark::es2panda::lexer {
30 class Keywords;
31 class KeywordsUtil;
32 
33 using ENUMBITOPS_OPERATORS;
34 
/**
 * Bit flags accepted by Lexer::NextToken(). Each enumerator occupies a single
 * bit of the underlying uint32_t, so several flags can be combined in one value.
 */
enum class NextTokenFlags : uint32_t {
    NONE = 0x0U,                       // no bits set
    KEYWORD_TO_IDENT = 0x1U,           // bit 0
    NUMERIC_SEPARATOR_ALLOWED = 0x2U,  // bit 1
    BIGINT_ALLOWED = 0x4U,             // bit 2
    UNARY_MINUS = 0x8U,                // bit 3
};
42 
43 class LexerPosition {
44 public:
45     explicit LexerPosition(const util::StringView &source);
46     DEFAULT_COPY_SEMANTIC(LexerPosition);
47     DEFAULT_MOVE_SEMANTIC(LexerPosition);
48     ~LexerPosition() = default;
49 
Iterator()50     util::StringView::Iterator &Iterator()
51     {
52         return iterator_;
53     }
54 
Iterator()55     const util::StringView::Iterator &Iterator() const
56     {
57         return iterator_;
58     }
59 
Line()60     size_t Line() const
61     {
62         return line_;
63     }
64 
GetToken()65     Token &GetToken()
66     {
67         return token_;
68     }
69 
GetToken()70     const Token &GetToken() const
71     {
72         return token_;
73     }
74 
NextTokenLine()75     size_t &NextTokenLine()
76     {
77         return nextTokenLine_;
78     }
79 
80     bool operator==(const LexerPosition &other) const
81     {
82         return iterator_.Save() == other.iterator_.Save();
83     }
84 
85     bool operator!=(const LexerPosition &other) const
86     {
87         return !(*this == other);
88     }
89 
90 private:
91     friend class Lexer;
92 
93     Token token_ {};
94     util::StringView::Iterator iterator_;
95     size_t line_ {};
96     size_t nextTokenLine_ {};
97 };
98 
99 class LexerTemplateString {
100 public:
LexerTemplateString(ArenaAllocator * allocator)101     explicit LexerTemplateString(ArenaAllocator *allocator) : str(allocator) {}
102     DEFAULT_COPY_SEMANTIC(LexerTemplateString);
103     DEFAULT_MOVE_SEMANTIC(LexerTemplateString);
104     ~LexerTemplateString() = default;
105 
106     // NOLINTBEGIN(misc-non-private-member-variables-in-classes)
107     util::UString str;
108     size_t end {};
109     bool scanExpression {};
110     bool validSequence {true};
111     // NOLINTEND(misc-non-private-member-variables-in-classes)
112 };
113 
114 class TemplateLiteralParserContext;
115 
116 class Lexer {
117 public:
118     explicit Lexer(const parser::ParserContext *parserContext, util::DiagnosticEngine &diagnosticEngine,
119                    bool startLexer = true);
120     NO_COPY_SEMANTIC(Lexer);
121     NO_MOVE_SEMANTIC(Lexer);
122     virtual ~Lexer() = default;
123 
124     // NOLINTNEXTLINE(google-default-arguments)
125     virtual void NextToken(NextTokenFlags flags = NextTokenFlags::NONE);
126     virtual void ScanAsteriskPunctuator();
127     bool IsEnableParseJsdoc() const;
128 
129     Token &GetToken();
130     const Token &GetToken() const;
131     size_t Line() const;
132     const parser::Program *GetProgram() const;
133 
TryEatTokenType(lexer::TokenType type)134     bool TryEatTokenType(lexer::TokenType type)
135     {
136         auto token = GetToken();
137         if (token.Type() == type) {
138             NextToken();
139             return true;
140         }
141         return false;
142     }
143 
TryEatTokenFromKeywordType(lexer::TokenType type)144     bool TryEatTokenFromKeywordType(lexer::TokenType type)
145     {
146         auto token = GetToken();
147         if (token.KeywordType() == type) {
148             NextToken();
149             return true;
150         }
151         return false;
152     }
153 
SkipCp()154     void SkipCp()
155     {
156         Iterator().SkipCp();
157     }
158 
DiagnosticEngine()159     util::DiagnosticEngine &DiagnosticEngine()
160     {
161         return diagnosticEngine_;
162     }
163 
TryEatTokenKeyword(lexer::TokenType type)164     std::optional<Token> TryEatTokenKeyword(lexer::TokenType type)
165     {
166         auto token = GetToken();
167         if (token.KeywordType() == type) {
168             NextToken();
169             return token;
170         }
171         return std::nullopt;
172     }
173 
174     LexerPosition Save() const;
175     void Rewind(const LexerPosition &pos);
176     void BackwardToken(TokenType type, size_t offset);
177     void ForwardToken(TokenType type, size_t offset);
178     void ForwardToken(TokenType type);
179 
180     char32_t Lookahead();
181     bool CheckArrow();
182 
183     RegExp ScanRegExp();
184 
185     void HandleNewlineHelper(util::UString *str, size_t *escapeEnd);
186     bool HandleBackslashHelper(util::UString *str, size_t *escapeEnd);
187     bool HandleDollarSignHelper(const char32_t &end);
188     bool HandleDoubleQuoteHelper(const char32_t &end, const char32_t &cp);
189     void PrepareStringTokenHelper();
190     void FinalizeTokenHelper(util::UString *str, const size_t &startPos, size_t escapeEnd, bool finalize = true);
191     void FinalizeJsDocInfoHelper(util::UString *str, const size_t &startPos, size_t escapeEnd);
192     template <char32_t END>
193     void ScanString();
194 
195     void ResetTokenEnd();
196     bool CheckOctalDigit(char32_t const nextCp);
197     std::tuple<bool, bool, LexerTemplateString> ScanTemplateStringCpHelper(char32_t cp,
198                                                                            LexerTemplateString templateStr);
199     LexerTemplateString ScanTemplateString();
200     util::StringView ScanMultilineString();
201     void ScanTemplateStringEnd();
202     void PushTemplateContext(TemplateLiteralParserContext *ctx);
LogUnexpectedStrictModeReservedKeyword()203     void LogUnexpectedStrictModeReservedKeyword() const
204     {
205         LogError(diagnostic::UNEXPECTED_STRICT_MODE_RESERVED_KEYWORD);
206     }
207 
208     enum class ConversionResult : uint8_t {
209         SUCCESS,
210         INVALID_ARGUMENT,
211         OUT_OF_RANGE,
212     };
213 
214     template <typename Tret, typename Ret = Tret, typename... Base>
StrToNumeric(Tret (* converter)(const char *,char **,Base...),const char * str,ConversionResult & result,Base...base)215     static Ret StrToNumeric(Tret (*converter)(const char *, char **, Base...), const char *str,
216                             ConversionResult &result, Base... base) noexcept
217     {
218         Ret ret {};
219         char *endPtr;
220         // NOLINTBEGIN(cppcoreguidelines-special-member-functions)
221         struct SaveErrno {
222             explicit SaveErrno() : errno_(errno)
223             {
224                 errno = 0;
225             }
226             ~SaveErrno()
227             {
228                 if (errno == 0) {
229                     errno = errno_;
230                 }
231             }
232 
233         private:
234             decltype(errno) errno_;
235         } const savedErrno;
236         // NOLINTEND(cppcoreguidelines-special-member-functions)
237 
238         const Tret tmp = converter(str, &endPtr, base...);
239 
240         bool outOfRange = false;
241         if constexpr (!std::is_same_v<Ret, Tret>) {
242             outOfRange = tmp < static_cast<Tret>(std::numeric_limits<Ret>::min()) ||
243                          tmp > static_cast<Tret>(std::numeric_limits<Ret>::max());
244         }
245 
246         if (endPtr == str) {
247             result = ConversionResult::INVALID_ARGUMENT;
248         } else if (errno == ERANGE || outOfRange) {
249             result = ConversionResult::OUT_OF_RANGE;
250         } else {
251             result = ConversionResult::SUCCESS;
252             ret = tmp;
253         }
254 
255         return ret;
256     }
257 
258     util::StringView SourceView(size_t begin, size_t end) const;
259 
GetPositionForDiagnostic()260     lexer::SourcePosition GetPositionForDiagnostic() const
261     {
262         return GetToken().Start();
263     }
264 
265     size_t GetIndex();
266 
267 protected:
268     void NextToken(Keywords *kws);
269     ArenaAllocator *Allocator();
270     bool IsLineTerminatorOrEos() const;
271     bool ScanRegExpPattern();
272     RegExpFlags ScanRegExpFlags();
273 
274     void LogUnexpectedToken(lexer::TokenType const tokenType) const;
275 
276     void LogError(const diagnostic::DiagnosticKind &diagnostic,
277                   const util::DiagnosticMessageParams &diagnosticParams = {}) const;
278     void LogError(const diagnostic::DiagnosticKind &diagnostic, const util::DiagnosticMessageParams &diagnosticParams,
279                   const lexer::SourcePosition &pos) const;
280 
281     void SetTokenStart();
282     void SetTokenEnd();
283 
Iterator()284     inline util::StringView::Iterator &Iterator()
285     {
286         return pos_.iterator_;
287     }
288 
Iterator()289     inline const util::StringView::Iterator &Iterator() const
290     {
291         return pos_.iterator_;
292     }
293 
294     util::StringView SourceView(const util::StringView::Iterator &begin, const util::StringView::Iterator &end) const;
295 
296     bool SkipWhiteSpacesHelperSlash(char32_t *cp);
297     bool IsValidJsDocStart(char32_t *cp);
298     bool IsValidJsDocEnd(char32_t *cp);
299     bool SkipWhiteSpacesHelperDefault(const char32_t &cp);
300     void SkipWhiteSpaces();
301     void SkipSingleLineComment();
302 
303     bool ScanPunctuator();
304     void ScanQuestionPunctuator();
305     void ScanLessThanPunctuator();
306     void ScanGreaterThanPunctuator();
307     virtual void ScanEqualsPunctuator();
308     virtual void ScanExclamationPunctuator();
309     void ScanAmpersandPunctuator();
310     void ScanVLinePunctuator();
311     void ScanCircumflexPunctuator();
312     void ScanPlusPunctuator();
313     void ScanMinusPunctuator();
314     void ScanSlashPunctuator();
315     void ScanPercentPunctuator();
316     void ScanDotPunctuator(KeywordsUtil &kwu);
317     void ScanColonPunctuator();
318     virtual bool ScanDollarPunctuator();
319     void ScanAtPunctuator();
320 
321     virtual void SkipMultiLineComment();
322     virtual void ScanHashMark();
323     virtual void ScanBackTick();
324 
ScanCharLiteral()325     virtual bool ScanCharLiteral()
326     {
327         return false;
328     }
329 
330     char32_t ScanUnicodeEscapeSequence();
331     template <int N, bool IN_AS = false>
332     char32_t ScanHexEscape();
333     char32_t ScanUnicodeCodePointEscape();
334 
335     bool ScanStringUnicodePart(util::UString *str);
336     char32_t ScanUnicodeCharacterHelper(size_t cpSize, char32_t cp);
337     char32_t ScanUnicodeCharacter();
338 
339     void ScanDecimalNumbers();
340 
ScanNumberLeadingZero(bool const leadingMinus)341     virtual void ScanNumberLeadingZero(bool const leadingMinus)
342     {
343         ScanNumberLeadingZeroImpl<double>(leadingMinus);
344     }
345 
346     template <typename RadixType, typename RadixLimit = RadixType>
347     bool ScanNumberLeadingZeroImpl(bool const leadingMinus);
348     void ScanNumberLeadingZeroImplNonAllowedCases();
349     template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
350     bool ScanNumberRadix(bool leadingMinus, bool allowNumericSeparator = true);
351     void ScanNumber(bool const leadingMinus = false, bool allowBigInt = true);
352     std::optional<std::size_t> ScanCharLex(bool parseExponent, bool &allowBigInt, NumberFlags &flags);
353     std::optional<std::size_t> ScanSignOfNumber() noexcept;
354     virtual void ConvertNumber(NumberFlags flags);
355     void ScanDecimalLiteral();
356     void ScanDecimalDigits(bool allowNumericSeparator);
357     virtual void CheckNumberLiteralEnd();
358     virtual void CheckNumberLiteralEndForIdentifier();
359     void CheckOctal();
360 
361     inline static uint32_t HexValue(char32_t ch);
362     inline static bool IsDecimalDigit(uint32_t cp);
363     inline static bool IsHexDigit(char32_t ch);
364     inline static bool IsBinaryDigit(char32_t ch);
365     inline static bool IsOctalDigit(char32_t ch);
366 
367     friend class KeywordsUtil;
368     friend class TemplateLiteralParserContext;
369     friend class parser::ETSNolintParser;
370 
371     LexerPosition &Pos();
372     const LexerPosition &Pos() const;
373 
374 private:
375     TemplateLiteralParserContext *tlCtx_ {};
376     ArenaAllocator *allocator_;
377     Keywords *kws_ {};
378     const parser::ParserContext *parserContext_;
379     util::StringView source_;
380     LexerPosition pos_;
381     util::DiagnosticEngine &diagnosticEngine_;
GetContext()382     const parser::ParserContext *GetContext()
383     {
384         return parserContext_;
385     }
386 };
387 
388 class TemplateLiteralParserContext {
389 public:
TemplateLiteralParserContext(Lexer * lexer)390     explicit TemplateLiteralParserContext(Lexer *lexer) : lexer_(lexer), prev_(lexer_->tlCtx_) {}
391     NO_MOVE_SEMANTIC(TemplateLiteralParserContext);
392     NO_COPY_SEMANTIC(TemplateLiteralParserContext);
393 
~TemplateLiteralParserContext()394     ~TemplateLiteralParserContext()
395     {
396         lexer_->tlCtx_ = prev_;
397     }
398 
ConsumeLeftBrace()399     void ConsumeLeftBrace()
400     {
401         braceDepth_++;
402     }
403 
ConsumeRightBrace()404     bool ConsumeRightBrace()
405     {
406         braceDepth_--;
407 
408         return braceDepth_ == 0;
409     }
410 
411 private:
412     Lexer *lexer_;
413     TemplateLiteralParserContext *prev_ {};
414     size_t braceDepth_ {1};
415 };
416 
417 template <char32_t END>
418 // CC-OFFNXT(huge_method,G.FUN.01) big switch-case, solid logic
ScanString()419 void Lexer::ScanString()
420 {
421     util::UString str(Allocator());
422     PrepareStringTokenHelper();
423     const auto startPos = Iterator().Index();
424     auto escapeEnd = startPos;
425     bool isFinalizedStr = true;
426 
427     do {
428         const char32_t cp = Iterator().Peek();
429         switch (cp) {
430             case util::StringView::Iterator::INVALID_CP: {
431                 LogError(diagnostic::UNTERMINATED_STRING);
432                 isFinalizedStr = false;
433                 break;
434             }
435             case LEX_CHAR_CR:
436             case LEX_CHAR_LF: {
437                 if constexpr (END != LEX_CHAR_BACK_TICK) {
438                     LogError(diagnostic::NEWLINE_NOT_ALLOWED_IN_STRING);
439                     break;
440                 }
441                 HandleNewlineHelper(&str, &escapeEnd);
442                 continue;
443             }
444             case LEX_CHAR_BACKSLASH: {
445                 isFinalizedStr &= HandleBackslashHelper(&str, &escapeEnd);
446                 continue;
447             }
448             case LEX_CHAR_BACK_TICK:
449             case LEX_CHAR_SINGLE_QUOTE:
450             case LEX_CHAR_DOUBLE_QUOTE: {
451                 if (!HandleDoubleQuoteHelper(END, cp)) {
452                     break;
453                 }
454                 continue;
455             }
456             case LEX_CHAR_DOLLAR_SIGN: {
457                 if (HandleDollarSignHelper(END)) {
458                     break;
459                 }
460                 continue;
461             }
462             default: {
463                 Iterator().SkipCp();
464                 continue;
465             }
466         }
467 
468         FinalizeTokenHelper(&str, startPos, escapeEnd, isFinalizedStr);
469         break;
470     } while (true);
471 
472     // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
473     if constexpr (END != LEX_CHAR_BACK_TICK) {
474         Iterator().Forward(1);
475     }
476 }
477 
478 template <int N, bool IN_AS>
ScanHexEscape()479 char32_t Lexer::ScanHexEscape()
480 {
481     char32_t code = 0;
482 
483     for (size_t i = 0; i < N; ++i) {
484         const auto cp = Iterator().Peek();
485         if (IN_AS && cp == LEX_CHAR_BACK_TICK) {
486             break;
487         }
488 
489         Iterator().Forward(1);
490 
491         if (!IsHexDigit(cp)) {
492             LogError(diagnostic::INVALID_UNICODE_ESCAPE);
493             return UNICODE_INVALID_CP;
494         }
495 
496         constexpr auto MULTIPLIER = 16;
497         code = code * MULTIPLIER + HexValue(cp);
498     }
499 
500     return code;
501 }
502 
503 template <typename RadixType, typename RadixLimit>
ScanNumberLeadingZeroImpl(bool const leadingMinus)504 bool Lexer::ScanNumberLeadingZeroImpl(bool const leadingMinus)
505 {
506     GetToken().type_ = TokenType::LITERAL_NUMBER;
507     GetToken().keywordType_ = TokenType::LITERAL_NUMBER;
508 
509     switch (Iterator().Peek()) {
510         case LEX_CHAR_LOWERCASE_X:
511         case LEX_CHAR_UPPERCASE_X: {
512             Iterator().Forward(1);
513             constexpr auto RADIX = 16;
514             if (!ScanNumberRadix<IsHexDigit, RADIX, RadixType, RadixLimit>(leadingMinus)) {
515                 return false;
516             }
517             CheckNumberLiteralEnd();
518             return true;
519         }
520         case LEX_CHAR_LOWERCASE_B:
521         case LEX_CHAR_UPPERCASE_B: {
522             Iterator().Forward(1);
523             constexpr auto RADIX = 2;
524             if (!ScanNumberRadix<IsBinaryDigit, RADIX, RadixType, RadixLimit>(leadingMinus)) {
525                 return false;
526             }
527             CheckNumberLiteralEnd();
528             return true;
529         }
530         case LEX_CHAR_LOWERCASE_O:
531         case LEX_CHAR_UPPERCASE_O: {
532             Iterator().Forward(1);
533             constexpr auto RADIX = 8;
534             if (!ScanNumberRadix<IsOctalDigit, RADIX, RadixType, RadixLimit>(leadingMinus)) {
535                 return false;
536             }
537             CheckOctal();
538             CheckNumberLiteralEnd();
539             return true;
540         }
541         default: {
542             ScanNumberLeadingZeroImplNonAllowedCases();
543             break;
544         }
545     }
546 
547     ScanNumber(leadingMinus);
548     return true;
549 }
550 
/**
 * Checks whether appending `digit` to `number` in base RADIX stays within
 * std::numeric_limits<RadixLimit>::max().
 *
 * Note the polarity: despite the name, the function returns true when the value
 * still FITS and false when it would become too large. When RadixLimit is not an
 * integral type no check is made and the result is always true.
 */
template <int RADIX, typename RadixType, typename RadixLimit>
bool ScanTooLargeNumber([[maybe_unused]] RadixType const number, [[maybe_unused]] std::uint32_t const digit)
{
    // NOTE (DZ): probably more sophisticates check will be required for general usage
    if constexpr (!std::is_integral_v<RadixLimit>) {
        return true;
    } else {
        const auto limit = static_cast<RadixType>(std::numeric_limits<RadixLimit>::max());
        // First test: would `number * RADIX` alone exceed the limit?
        // Second test (evaluated only if the first passes): is there room left for `digit`?
        const bool tooLarge = limit / RADIX < number || limit - number * RADIX < digit;
        return !tooLarge;
    }
}
563 
564 template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
ScanNumberRadix(bool leadingMinus,bool allowNumericSeparator)565 bool Lexer::ScanNumberRadix(bool leadingMinus, bool allowNumericSeparator)
566 {
567     RadixType number {};
568 
569     auto cp = Iterator().Peek();
570     if (!RANGE_CHECK(cp)) {
571         LogError(diagnostic::INVALID_DIGIT);
572     }
573 
574     bool allowNumericOnNext = true;
575 
576     do {
577         cp = Iterator().Peek();
578         if (RANGE_CHECK(cp)) {
579             auto const digit = HexValue(cp);
580             if (!ScanTooLargeNumber<RADIX, RadixType, RadixLimit>(number, digit)) {
581                 return false;
582             }
583             number = number * RADIX + digit;
584 
585             Iterator().Forward(1);
586             allowNumericOnNext = true;
587             continue;
588         }
589 
590         if (cp == LEX_CHAR_UNDERSCORE) {
591             if (!allowNumericSeparator || !allowNumericOnNext) {
592                 LogError(diagnostic::INVALID_NUMERIC_SEP);
593             }
594 
595             GetToken().flags_ |= TokenFlags::NUMBER_HAS_UNDERSCORE;
596             Iterator().Forward(1);
597             allowNumericOnNext = false;
598             continue;
599         }
600 
601         if (!allowNumericOnNext) {
602             Iterator().Backward(1);
603             LogError(diagnostic::INVALID_NUMERIC_SEP_AT_END_OF_NUM);
604         }
605 
606         break;
607     } while (true);
608 
609     if (leadingMinus) {
610         if constexpr (std::is_integral_v<RadixType>) {
611             number = ~number + static_cast<RadixType>(1);
612         } else {
613             number = -number;
614         }
615     }
616 
617     GetToken().number_ = lexer::Number(number);
618     return true;
619 }
620 
HexValue(char32_t ch)621 inline uint32_t Lexer::HexValue(char32_t ch)
622 {
623     constexpr uint32_t HEX_MASK = 0xF;
624     constexpr uint32_t DEC_OFFSET = 10;
625     return ch < LEX_CHAR_UPPERCASE_A ? ch - LEX_CHAR_0 : ((ch - LEX_CHAR_UPPERCASE_A + DEC_OFFSET) & HEX_MASK);
626 }
627 
IsDecimalDigit(uint32_t cp)628 inline bool Lexer::IsDecimalDigit(uint32_t cp)
629 {
630     return (cp >= LEX_CHAR_0 && cp <= LEX_CHAR_9);
631 }
632 
IsHexDigit(char32_t ch)633 inline bool Lexer::IsHexDigit(char32_t ch)
634 {
635     return ch < LEX_ASCII_MAX_BITS && (std::isxdigit(static_cast<unsigned char>(ch)) != 0);
636 }
637 
IsBinaryDigit(char32_t ch)638 inline bool Lexer::IsBinaryDigit(char32_t ch)
639 {
640     return ch == LEX_CHAR_0 || ch == LEX_CHAR_1;
641 }
642 
IsOctalDigit(char32_t ch)643 inline bool Lexer::IsOctalDigit(char32_t ch)
644 {
645     return (ch >= LEX_CHAR_0 && ch <= LEX_CHAR_7);
646 }
647 }  // namespace ark::es2panda::lexer
648 
649 template <>
650 struct enumbitops::IsAllowedType<ark::es2panda::lexer::NextTokenFlags> : std::true_type {
651 };
652 
653 #endif
654