1 /**
2 * Copyright (c) 2021-2025 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #ifndef ES2PANDA_PARSER_CORE_LEXER_H
17 #define ES2PANDA_PARSER_CORE_LEXER_H
18
19 #include "lexer/regexp/regexp.h"
20 #include "lexer/token/letters.h"
21 #include "lexer/token/token.h"
22 #include "util/enumbitops.h"
23
24 namespace ark::es2panda::parser {
25 class ParserContext;
26 class ETSNolintParser;
27 } // namespace ark::es2panda::parser
28
29 namespace ark::es2panda::lexer {
30 class Keywords;
31 class KeywordsUtil;
32
33 using ENUMBITOPS_OPERATORS;
34
/**
 * Bit flags passed to Lexer::NextToken(). Each enumerator occupies its own bit,
 * so values can be combined in a single uint32_t mask.
 */
enum class NextTokenFlags : uint32_t {
    NONE = 0x0U,                       // no flag set
    KEYWORD_TO_IDENT = 0x1U,           // bit 0
    NUMERIC_SEPARATOR_ALLOWED = 0x2U,  // bit 1
    BIGINT_ALLOWED = 0x4U,             // bit 2
    UNARY_MINUS = 0x8U,                // bit 3
};
42
43 class LexerPosition {
44 public:
45 explicit LexerPosition(const util::StringView &source);
46 DEFAULT_COPY_SEMANTIC(LexerPosition);
47 DEFAULT_MOVE_SEMANTIC(LexerPosition);
48 ~LexerPosition() = default;
49
Iterator()50 util::StringView::Iterator &Iterator()
51 {
52 return iterator_;
53 }
54
Iterator()55 const util::StringView::Iterator &Iterator() const
56 {
57 return iterator_;
58 }
59
Line()60 size_t Line() const
61 {
62 return line_;
63 }
64
GetToken()65 Token &GetToken()
66 {
67 return token_;
68 }
69
GetToken()70 const Token &GetToken() const
71 {
72 return token_;
73 }
74
NextTokenLine()75 size_t &NextTokenLine()
76 {
77 return nextTokenLine_;
78 }
79
80 bool operator==(const LexerPosition &other) const
81 {
82 return iterator_.Save() == other.iterator_.Save();
83 }
84
85 bool operator!=(const LexerPosition &other) const
86 {
87 return !(*this == other);
88 }
89
90 private:
91 friend class Lexer;
92
93 Token token_ {};
94 util::StringView::Iterator iterator_;
95 size_t line_ {};
96 size_t nextTokenLine_ {};
97 };
98
99 class LexerTemplateString {
100 public:
LexerTemplateString(ArenaAllocator * allocator)101 explicit LexerTemplateString(ArenaAllocator *allocator) : str(allocator) {}
102 DEFAULT_COPY_SEMANTIC(LexerTemplateString);
103 DEFAULT_MOVE_SEMANTIC(LexerTemplateString);
104 ~LexerTemplateString() = default;
105
106 // NOLINTBEGIN(misc-non-private-member-variables-in-classes)
107 util::UString str;
108 size_t end {};
109 bool scanExpression {};
110 bool validSequence {true};
111 // NOLINTEND(misc-non-private-member-variables-in-classes)
112 };
113
114 class TemplateLiteralParserContext;
115
116 class Lexer {
117 public:
118 explicit Lexer(const parser::ParserContext *parserContext, util::DiagnosticEngine &diagnosticEngine,
119 bool startLexer = true);
120 NO_COPY_SEMANTIC(Lexer);
121 NO_MOVE_SEMANTIC(Lexer);
122 virtual ~Lexer() = default;
123
124 // NOLINTNEXTLINE(google-default-arguments)
125 virtual void NextToken(NextTokenFlags flags = NextTokenFlags::NONE);
126 virtual void ScanAsteriskPunctuator();
127 bool IsEnableParseJsdoc() const;
128
129 Token &GetToken();
130 const Token &GetToken() const;
131 size_t Line() const;
132 const parser::Program *GetProgram() const;
133
TryEatTokenType(lexer::TokenType type)134 bool TryEatTokenType(lexer::TokenType type)
135 {
136 auto token = GetToken();
137 if (token.Type() == type) {
138 NextToken();
139 return true;
140 }
141 return false;
142 }
143
TryEatTokenFromKeywordType(lexer::TokenType type)144 bool TryEatTokenFromKeywordType(lexer::TokenType type)
145 {
146 auto token = GetToken();
147 if (token.KeywordType() == type) {
148 NextToken();
149 return true;
150 }
151 return false;
152 }
153
SkipCp()154 void SkipCp()
155 {
156 Iterator().SkipCp();
157 }
158
DiagnosticEngine()159 util::DiagnosticEngine &DiagnosticEngine()
160 {
161 return diagnosticEngine_;
162 }
163
TryEatTokenKeyword(lexer::TokenType type)164 std::optional<Token> TryEatTokenKeyword(lexer::TokenType type)
165 {
166 auto token = GetToken();
167 if (token.KeywordType() == type) {
168 NextToken();
169 return token;
170 }
171 return std::nullopt;
172 }
173
174 LexerPosition Save() const;
175 void Rewind(const LexerPosition &pos);
176 void BackwardToken(TokenType type, size_t offset);
177 void ForwardToken(TokenType type, size_t offset);
178 void ForwardToken(TokenType type);
179
180 char32_t Lookahead();
181 bool CheckArrow();
182
183 RegExp ScanRegExp();
184
185 void HandleNewlineHelper(util::UString *str, size_t *escapeEnd);
186 bool HandleBackslashHelper(util::UString *str, size_t *escapeEnd);
187 bool HandleDollarSignHelper(const char32_t &end);
188 bool HandleDoubleQuoteHelper(const char32_t &end, const char32_t &cp);
189 void PrepareStringTokenHelper();
190 void FinalizeTokenHelper(util::UString *str, const size_t &startPos, size_t escapeEnd, bool finalize = true);
191 void FinalizeJsDocInfoHelper(util::UString *str, const size_t &startPos, size_t escapeEnd);
192 template <char32_t END>
193 void ScanString();
194
195 void ResetTokenEnd();
196 bool CheckOctalDigit(char32_t const nextCp);
197 std::tuple<bool, bool, LexerTemplateString> ScanTemplateStringCpHelper(char32_t cp,
198 LexerTemplateString templateStr);
199 LexerTemplateString ScanTemplateString();
200 util::StringView ScanMultilineString();
201 void ScanTemplateStringEnd();
202 void PushTemplateContext(TemplateLiteralParserContext *ctx);
LogUnexpectedStrictModeReservedKeyword()203 void LogUnexpectedStrictModeReservedKeyword() const
204 {
205 LogError(diagnostic::UNEXPECTED_STRICT_MODE_RESERVED_KEYWORD);
206 }
207
208 enum class ConversionResult : uint8_t {
209 SUCCESS,
210 INVALID_ARGUMENT,
211 OUT_OF_RANGE,
212 };
213
214 template <typename Tret, typename Ret = Tret, typename... Base>
StrToNumeric(Tret (* converter)(const char *,char **,Base...),const char * str,ConversionResult & result,Base...base)215 static Ret StrToNumeric(Tret (*converter)(const char *, char **, Base...), const char *str,
216 ConversionResult &result, Base... base) noexcept
217 {
218 Ret ret {};
219 char *endPtr;
220 // NOLINTBEGIN(cppcoreguidelines-special-member-functions)
221 struct SaveErrno {
222 explicit SaveErrno() : errno_(errno)
223 {
224 errno = 0;
225 }
226 ~SaveErrno()
227 {
228 if (errno == 0) {
229 errno = errno_;
230 }
231 }
232
233 private:
234 decltype(errno) errno_;
235 } const savedErrno;
236 // NOLINTEND(cppcoreguidelines-special-member-functions)
237
238 const Tret tmp = converter(str, &endPtr, base...);
239
240 bool outOfRange = false;
241 if constexpr (!std::is_same_v<Ret, Tret>) {
242 outOfRange = tmp < static_cast<Tret>(std::numeric_limits<Ret>::min()) ||
243 tmp > static_cast<Tret>(std::numeric_limits<Ret>::max());
244 }
245
246 if (endPtr == str) {
247 result = ConversionResult::INVALID_ARGUMENT;
248 } else if (errno == ERANGE || outOfRange) {
249 result = ConversionResult::OUT_OF_RANGE;
250 } else {
251 result = ConversionResult::SUCCESS;
252 ret = tmp;
253 }
254
255 return ret;
256 }
257
258 util::StringView SourceView(size_t begin, size_t end) const;
259
GetPositionForDiagnostic()260 lexer::SourcePosition GetPositionForDiagnostic() const
261 {
262 return GetToken().Start();
263 }
264
265 size_t GetIndex();
266
267 protected:
268 void NextToken(Keywords *kws);
269 ArenaAllocator *Allocator();
270 bool IsLineTerminatorOrEos() const;
271 bool ScanRegExpPattern();
272 RegExpFlags ScanRegExpFlags();
273
274 void LogUnexpectedToken(lexer::TokenType const tokenType) const;
275
276 void LogError(const diagnostic::DiagnosticKind &diagnostic,
277 const util::DiagnosticMessageParams &diagnosticParams = {}) const;
278 void LogError(const diagnostic::DiagnosticKind &diagnostic, const util::DiagnosticMessageParams &diagnosticParams,
279 const lexer::SourcePosition &pos) const;
280
281 void SetTokenStart();
282 void SetTokenEnd();
283
Iterator()284 inline util::StringView::Iterator &Iterator()
285 {
286 return pos_.iterator_;
287 }
288
Iterator()289 inline const util::StringView::Iterator &Iterator() const
290 {
291 return pos_.iterator_;
292 }
293
294 util::StringView SourceView(const util::StringView::Iterator &begin, const util::StringView::Iterator &end) const;
295
296 bool SkipWhiteSpacesHelperSlash(char32_t *cp);
297 bool IsValidJsDocStart(char32_t *cp);
298 bool IsValidJsDocEnd(char32_t *cp);
299 bool SkipWhiteSpacesHelperDefault(const char32_t &cp);
300 void SkipWhiteSpaces();
301 void SkipSingleLineComment();
302
303 bool ScanPunctuator();
304 void ScanQuestionPunctuator();
305 void ScanLessThanPunctuator();
306 void ScanGreaterThanPunctuator();
307 virtual void ScanEqualsPunctuator();
308 virtual void ScanExclamationPunctuator();
309 void ScanAmpersandPunctuator();
310 void ScanVLinePunctuator();
311 void ScanCircumflexPunctuator();
312 void ScanPlusPunctuator();
313 void ScanMinusPunctuator();
314 void ScanSlashPunctuator();
315 void ScanPercentPunctuator();
316 void ScanDotPunctuator(KeywordsUtil &kwu);
317 void ScanColonPunctuator();
318 virtual bool ScanDollarPunctuator();
319 void ScanAtPunctuator();
320
321 virtual void SkipMultiLineComment();
322 virtual void ScanHashMark();
323 virtual void ScanBackTick();
324
ScanCharLiteral()325 virtual bool ScanCharLiteral()
326 {
327 return false;
328 }
329
330 char32_t ScanUnicodeEscapeSequence();
331 template <int N, bool IN_AS = false>
332 char32_t ScanHexEscape();
333 char32_t ScanUnicodeCodePointEscape();
334
335 bool ScanStringUnicodePart(util::UString *str);
336 char32_t ScanUnicodeCharacterHelper(size_t cpSize, char32_t cp);
337 char32_t ScanUnicodeCharacter();
338
339 void ScanDecimalNumbers();
340
ScanNumberLeadingZero(bool const leadingMinus)341 virtual void ScanNumberLeadingZero(bool const leadingMinus)
342 {
343 ScanNumberLeadingZeroImpl<double>(leadingMinus);
344 }
345
346 template <typename RadixType, typename RadixLimit = RadixType>
347 bool ScanNumberLeadingZeroImpl(bool const leadingMinus);
348 void ScanNumberLeadingZeroImplNonAllowedCases();
349 template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
350 bool ScanNumberRadix(bool leadingMinus, bool allowNumericSeparator = true);
351 void ScanNumber(bool const leadingMinus = false, bool allowBigInt = true);
352 std::optional<std::size_t> ScanCharLex(bool parseExponent, bool &allowBigInt, NumberFlags &flags);
353 std::optional<std::size_t> ScanSignOfNumber() noexcept;
354 virtual void ConvertNumber(NumberFlags flags);
355 void ScanDecimalLiteral();
356 void ScanDecimalDigits(bool allowNumericSeparator);
357 virtual void CheckNumberLiteralEnd();
358 virtual void CheckNumberLiteralEndForIdentifier();
359 void CheckOctal();
360
361 inline static uint32_t HexValue(char32_t ch);
362 inline static bool IsDecimalDigit(uint32_t cp);
363 inline static bool IsHexDigit(char32_t ch);
364 inline static bool IsBinaryDigit(char32_t ch);
365 inline static bool IsOctalDigit(char32_t ch);
366
367 friend class KeywordsUtil;
368 friend class TemplateLiteralParserContext;
369 friend class parser::ETSNolintParser;
370
371 LexerPosition &Pos();
372 const LexerPosition &Pos() const;
373
374 private:
375 TemplateLiteralParserContext *tlCtx_ {};
376 ArenaAllocator *allocator_;
377 Keywords *kws_ {};
378 const parser::ParserContext *parserContext_;
379 util::StringView source_;
380 LexerPosition pos_;
381 util::DiagnosticEngine &diagnosticEngine_;
GetContext()382 const parser::ParserContext *GetContext()
383 {
384 return parserContext_;
385 }
386 };
387
388 class TemplateLiteralParserContext {
389 public:
TemplateLiteralParserContext(Lexer * lexer)390 explicit TemplateLiteralParserContext(Lexer *lexer) : lexer_(lexer), prev_(lexer_->tlCtx_) {}
391 NO_MOVE_SEMANTIC(TemplateLiteralParserContext);
392 NO_COPY_SEMANTIC(TemplateLiteralParserContext);
393
~TemplateLiteralParserContext()394 ~TemplateLiteralParserContext()
395 {
396 lexer_->tlCtx_ = prev_;
397 }
398
ConsumeLeftBrace()399 void ConsumeLeftBrace()
400 {
401 braceDepth_++;
402 }
403
ConsumeRightBrace()404 bool ConsumeRightBrace()
405 {
406 braceDepth_--;
407
408 return braceDepth_ == 0;
409 }
410
411 private:
412 Lexer *lexer_;
413 TemplateLiteralParserContext *prev_ {};
414 size_t braceDepth_ {1};
415 };
416
417 template <char32_t END>
418 // CC-OFFNXT(huge_method,G.FUN.01) big switch-case, solid logic
ScanString()419 void Lexer::ScanString()
420 {
421 util::UString str(Allocator());
422 PrepareStringTokenHelper();
423 const auto startPos = Iterator().Index();
424 auto escapeEnd = startPos;
425 bool isFinalizedStr = true;
426
427 do {
428 const char32_t cp = Iterator().Peek();
429 switch (cp) {
430 case util::StringView::Iterator::INVALID_CP: {
431 LogError(diagnostic::UNTERMINATED_STRING);
432 isFinalizedStr = false;
433 break;
434 }
435 case LEX_CHAR_CR:
436 case LEX_CHAR_LF: {
437 if constexpr (END != LEX_CHAR_BACK_TICK) {
438 LogError(diagnostic::NEWLINE_NOT_ALLOWED_IN_STRING);
439 break;
440 }
441 HandleNewlineHelper(&str, &escapeEnd);
442 continue;
443 }
444 case LEX_CHAR_BACKSLASH: {
445 isFinalizedStr &= HandleBackslashHelper(&str, &escapeEnd);
446 continue;
447 }
448 case LEX_CHAR_BACK_TICK:
449 case LEX_CHAR_SINGLE_QUOTE:
450 case LEX_CHAR_DOUBLE_QUOTE: {
451 if (!HandleDoubleQuoteHelper(END, cp)) {
452 break;
453 }
454 continue;
455 }
456 case LEX_CHAR_DOLLAR_SIGN: {
457 if (HandleDollarSignHelper(END)) {
458 break;
459 }
460 continue;
461 }
462 default: {
463 Iterator().SkipCp();
464 continue;
465 }
466 }
467
468 FinalizeTokenHelper(&str, startPos, escapeEnd, isFinalizedStr);
469 break;
470 } while (true);
471
472 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
473 if constexpr (END != LEX_CHAR_BACK_TICK) {
474 Iterator().Forward(1);
475 }
476 }
477
478 template <int N, bool IN_AS>
ScanHexEscape()479 char32_t Lexer::ScanHexEscape()
480 {
481 char32_t code = 0;
482
483 for (size_t i = 0; i < N; ++i) {
484 const auto cp = Iterator().Peek();
485 if (IN_AS && cp == LEX_CHAR_BACK_TICK) {
486 break;
487 }
488
489 Iterator().Forward(1);
490
491 if (!IsHexDigit(cp)) {
492 LogError(diagnostic::INVALID_UNICODE_ESCAPE);
493 return UNICODE_INVALID_CP;
494 }
495
496 constexpr auto MULTIPLIER = 16;
497 code = code * MULTIPLIER + HexValue(cp);
498 }
499
500 return code;
501 }
502
503 template <typename RadixType, typename RadixLimit>
ScanNumberLeadingZeroImpl(bool const leadingMinus)504 bool Lexer::ScanNumberLeadingZeroImpl(bool const leadingMinus)
505 {
506 GetToken().type_ = TokenType::LITERAL_NUMBER;
507 GetToken().keywordType_ = TokenType::LITERAL_NUMBER;
508
509 switch (Iterator().Peek()) {
510 case LEX_CHAR_LOWERCASE_X:
511 case LEX_CHAR_UPPERCASE_X: {
512 Iterator().Forward(1);
513 constexpr auto RADIX = 16;
514 if (!ScanNumberRadix<IsHexDigit, RADIX, RadixType, RadixLimit>(leadingMinus)) {
515 return false;
516 }
517 CheckNumberLiteralEnd();
518 return true;
519 }
520 case LEX_CHAR_LOWERCASE_B:
521 case LEX_CHAR_UPPERCASE_B: {
522 Iterator().Forward(1);
523 constexpr auto RADIX = 2;
524 if (!ScanNumberRadix<IsBinaryDigit, RADIX, RadixType, RadixLimit>(leadingMinus)) {
525 return false;
526 }
527 CheckNumberLiteralEnd();
528 return true;
529 }
530 case LEX_CHAR_LOWERCASE_O:
531 case LEX_CHAR_UPPERCASE_O: {
532 Iterator().Forward(1);
533 constexpr auto RADIX = 8;
534 if (!ScanNumberRadix<IsOctalDigit, RADIX, RadixType, RadixLimit>(leadingMinus)) {
535 return false;
536 }
537 CheckOctal();
538 CheckNumberLiteralEnd();
539 return true;
540 }
541 default: {
542 ScanNumberLeadingZeroImplNonAllowedCases();
543 break;
544 }
545 }
546
547 ScanNumber(leadingMinus);
548 return true;
549 }
550
/**
 * Checks whether appending `digit` to `number` (i.e. number * RADIX + digit) stays
 * within std::numeric_limits<RadixLimit>::max().
 *
 * @return true when the extended value still fits, or when RadixLimit is not an
 *         integral type (no check is performed then); false when it would exceed
 *         the limit.
 */
template <int RADIX, typename RadixType, typename RadixLimit>
bool ScanTooLargeNumber([[maybe_unused]] RadixType const number, [[maybe_unused]] std::uint32_t const digit)
{
    // NOTE (DZ): a more sophisticated check will probably be required for general usage
    if constexpr (std::is_integral_v<RadixLimit>) {
        // The first test guards the multiplication in the second one against exceeding the limit.
        if (static_cast<RadixType>(std::numeric_limits<RadixLimit>::max()) / RADIX < number) {
            return false;
        }
        if (static_cast<RadixType>(std::numeric_limits<RadixLimit>::max()) - number * RADIX < digit) {
            return false;
        }
    }
    return true;
}
563
564 template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
ScanNumberRadix(bool leadingMinus,bool allowNumericSeparator)565 bool Lexer::ScanNumberRadix(bool leadingMinus, bool allowNumericSeparator)
566 {
567 RadixType number {};
568
569 auto cp = Iterator().Peek();
570 if (!RANGE_CHECK(cp)) {
571 LogError(diagnostic::INVALID_DIGIT);
572 }
573
574 bool allowNumericOnNext = true;
575
576 do {
577 cp = Iterator().Peek();
578 if (RANGE_CHECK(cp)) {
579 auto const digit = HexValue(cp);
580 if (!ScanTooLargeNumber<RADIX, RadixType, RadixLimit>(number, digit)) {
581 return false;
582 }
583 number = number * RADIX + digit;
584
585 Iterator().Forward(1);
586 allowNumericOnNext = true;
587 continue;
588 }
589
590 if (cp == LEX_CHAR_UNDERSCORE) {
591 if (!allowNumericSeparator || !allowNumericOnNext) {
592 LogError(diagnostic::INVALID_NUMERIC_SEP);
593 }
594
595 GetToken().flags_ |= TokenFlags::NUMBER_HAS_UNDERSCORE;
596 Iterator().Forward(1);
597 allowNumericOnNext = false;
598 continue;
599 }
600
601 if (!allowNumericOnNext) {
602 Iterator().Backward(1);
603 LogError(diagnostic::INVALID_NUMERIC_SEP_AT_END_OF_NUM);
604 }
605
606 break;
607 } while (true);
608
609 if (leadingMinus) {
610 if constexpr (std::is_integral_v<RadixType>) {
611 number = ~number + static_cast<RadixType>(1);
612 } else {
613 number = -number;
614 }
615 }
616
617 GetToken().number_ = lexer::Number(number);
618 return true;
619 }
620
HexValue(char32_t ch)621 inline uint32_t Lexer::HexValue(char32_t ch)
622 {
623 constexpr uint32_t HEX_MASK = 0xF;
624 constexpr uint32_t DEC_OFFSET = 10;
625 return ch < LEX_CHAR_UPPERCASE_A ? ch - LEX_CHAR_0 : ((ch - LEX_CHAR_UPPERCASE_A + DEC_OFFSET) & HEX_MASK);
626 }
627
IsDecimalDigit(uint32_t cp)628 inline bool Lexer::IsDecimalDigit(uint32_t cp)
629 {
630 return (cp >= LEX_CHAR_0 && cp <= LEX_CHAR_9);
631 }
632
IsHexDigit(char32_t ch)633 inline bool Lexer::IsHexDigit(char32_t ch)
634 {
635 return ch < LEX_ASCII_MAX_BITS && (std::isxdigit(static_cast<unsigned char>(ch)) != 0);
636 }
637
IsBinaryDigit(char32_t ch)638 inline bool Lexer::IsBinaryDigit(char32_t ch)
639 {
640 return ch == LEX_CHAR_0 || ch == LEX_CHAR_1;
641 }
642
IsOctalDigit(char32_t ch)643 inline bool Lexer::IsOctalDigit(char32_t ch)
644 {
645 return (ch >= LEX_CHAR_0 && ch <= LEX_CHAR_7);
646 }
647 } // namespace ark::es2panda::lexer
648
649 template <>
650 struct enumbitops::IsAllowedType<ark::es2panda::lexer::NextTokenFlags> : std::true_type {
651 };
652
653 #endif
654