1 /**
2 * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #ifndef ES2PANDA_PARSER_CORE_LEXER_H
17 #define ES2PANDA_PARSER_CORE_LEXER_H
18
19 #include "lexer/regexp/regexp.h"
20 #include "lexer/token/letters.h"
21 #include "lexer/token/token.h"
22 #include "util/enumbitops.h"
23
24 namespace ark::es2panda::parser {
25 class ParserContext;
26 class ETSNolintParser;
27 } // namespace ark::es2panda::parser
28
29 namespace ark::es2panda::lexer {
30 class Keywords;
31
32 using ENUMBITOPS_OPERATORS;
33
/// Single-bit options accepted by Lexer::NextToken(); each enumerator occupies its own bit.
enum class NextTokenFlags : uint32_t {
    NONE = 0x0U,
    KEYWORD_TO_IDENT = 0x1U,
    NUMERIC_SEPARATOR_ALLOWED = 0x2U,
    BIGINT_ALLOWED = 0x4U,
    UNARY_MINUS = 0x8U,
};
41
42 class LexerPosition {
43 public:
44 explicit LexerPosition(const util::StringView &source);
45 DEFAULT_COPY_SEMANTIC(LexerPosition);
46 DEFAULT_MOVE_SEMANTIC(LexerPosition);
47 ~LexerPosition() = default;
48
Iterator()49 util::StringView::Iterator &Iterator()
50 {
51 return iterator_;
52 }
53
Iterator()54 const util::StringView::Iterator &Iterator() const
55 {
56 return iterator_;
57 }
58
Line()59 size_t Line() const
60 {
61 return line_;
62 }
63
GetToken()64 Token &GetToken()
65 {
66 return token_;
67 }
68
GetToken()69 const Token &GetToken() const
70 {
71 return token_;
72 }
73
NextTokenLine()74 size_t &NextTokenLine()
75 {
76 return nextTokenLine_;
77 }
78
79 bool operator==(const LexerPosition &other) const
80 {
81 return iterator_.Save() == other.iterator_.Save();
82 }
83
84 bool operator!=(const LexerPosition &other) const
85 {
86 return !(*this == other);
87 }
88
89 private:
90 friend class Lexer;
91
92 Token token_ {};
93 util::StringView::Iterator iterator_;
94 size_t line_ {};
95 size_t nextTokenLine_ {};
96 };
97
98 class LexerTemplateString {
99 public:
LexerTemplateString(ArenaAllocator * allocator)100 explicit LexerTemplateString(ArenaAllocator *allocator) : str(allocator) {}
101 DEFAULT_COPY_SEMANTIC(LexerTemplateString);
102 DEFAULT_MOVE_SEMANTIC(LexerTemplateString);
103 ~LexerTemplateString() = default;
104
105 // NOLINTBEGIN(misc-non-private-member-variables-in-classes)
106 util::UString str;
107 size_t end {};
108 bool scanExpression {};
109 bool validSequence {true};
110 // NOLINTEND(misc-non-private-member-variables-in-classes)
111 };
112
113 class TemplateLiteralParserContext;
114
115 class Lexer {
116 public:
117 explicit Lexer(const parser::ParserContext *parserContext, util::ErrorLogger *errorLogger, bool startLexer = true);
118 NO_COPY_SEMANTIC(Lexer);
119 NO_MOVE_SEMANTIC(Lexer);
120 virtual ~Lexer() = default;
121
122 // NOLINTNEXTLINE(google-default-arguments)
123 virtual void NextToken(NextTokenFlags flags = NextTokenFlags::NONE);
124 virtual void ScanAsteriskPunctuator();
125
126 Token &GetToken();
127 const Token &GetToken() const;
128 size_t Line() const;
129
TryEatTokenType(lexer::TokenType type)130 bool TryEatTokenType(lexer::TokenType type)
131 {
132 auto token = GetToken();
133 if (token.Type() == type) {
134 NextToken();
135 return true;
136 }
137 return false;
138 }
139
ErrorLogger()140 const util::ErrorLogger *ErrorLogger()
141 {
142 return errorLogger_;
143 }
144
TryEatTokenKeyword(lexer::TokenType type)145 std::optional<Token> TryEatTokenKeyword(lexer::TokenType type)
146 {
147 auto token = GetToken();
148 if (token.KeywordType() == type) {
149 NextToken();
150 return token;
151 }
152 return std::nullopt;
153 }
154
155 LexerPosition Save() const;
156 void Rewind(const LexerPosition &pos);
157 void BackwardToken(TokenType type, size_t offset);
158 void ForwardToken(TokenType type, size_t offset);
159
160 char32_t Lookahead();
161 bool CheckArrow();
162
163 RegExp ScanRegExp();
164
165 void HandleNewlineHelper(util::UString *str, size_t *escapeEnd);
166 bool HandleBackslashHelper(util::UString *str, size_t *escapeEnd);
167 bool HandleDollarSignHelper(const char32_t &end);
168 bool HandleDoubleQuoteHelper(const char32_t &end, const char32_t &cp);
169 void PrepareStringTokenHelper();
170 void FinalizeTokenHelper(util::UString *str, const size_t &startPos, size_t escapeEnd, bool finalize = true);
171 template <char32_t END>
172 void ScanString();
173
174 void ResetTokenEnd();
175 bool CheckOctalDigit(char32_t const nextCp);
176 std::tuple<bool, bool, LexerTemplateString> ScanTemplateStringCpHelper(char32_t cp,
177 LexerTemplateString templateStr);
178 LexerTemplateString ScanTemplateString();
179 void ScanTemplateStringEnd();
180 void PushTemplateContext(TemplateLiteralParserContext *ctx);
LogUnexpectedStrictModeReservedKeyword()181 void LogUnexpectedStrictModeReservedKeyword() const
182 {
183 LogSyntaxError("Unexpected strict mode reserved keyword");
184 }
185
186 enum class ConversionResult : uint8_t {
187 SUCCESS,
188 INVALID_ARGUMENT,
189 OUT_OF_RANGE,
190 };
191
192 template <typename Tret, typename Ret = Tret, typename... Base>
StrToNumeric(Tret (* converter)(const char *,char **,Base...),const char * str,ConversionResult & result,Base...base)193 static Ret StrToNumeric(Tret (*converter)(const char *, char **, Base...), const char *str,
194 ConversionResult &result, Base... base) noexcept
195 {
196 Ret ret {};
197 char *endPtr;
198 // NOLINTBEGIN(cppcoreguidelines-special-member-functions)
199 struct SaveErrno {
200 explicit SaveErrno() : errno_(errno)
201 {
202 errno = 0;
203 }
204 ~SaveErrno()
205 {
206 if (errno == 0) {
207 errno = errno_;
208 }
209 }
210
211 private:
212 decltype(errno) errno_;
213 } const savedErrno;
214 // NOLINTEND(cppcoreguidelines-special-member-functions)
215
216 const Tret tmp = converter(str, &endPtr, base...);
217
218 bool outOfRange = false;
219 if constexpr (!std::is_same_v<Ret, Tret>) {
220 outOfRange = tmp < static_cast<Tret>(std::numeric_limits<Ret>::min()) ||
221 tmp > static_cast<Tret>(std::numeric_limits<Ret>::max());
222 }
223
224 if (endPtr == str) {
225 result = ConversionResult::INVALID_ARGUMENT;
226 } else if (errno == ERANGE || outOfRange) {
227 result = ConversionResult::OUT_OF_RANGE;
228 } else {
229 result = ConversionResult::SUCCESS;
230 ret = tmp;
231 }
232
233 return ret;
234 }
235
236 util::StringView SourceView(size_t begin, size_t end) const;
237
238 protected:
239 void NextToken(Keywords *kws);
240 ArenaAllocator *Allocator();
241 bool IsLineTerminatorOrEos() const;
242 bool ScanRegExpPattern();
243 RegExpFlags ScanRegExpFlags();
244
245 void LogSyntaxError(std::string_view const errorMessage) const;
246 void LogSyntaxError(std::string_view const errorMessage, const lexer::SourcePosition &pos) const;
247 void LogUnexpectedToken(lexer::TokenType const tokenType) const;
248
249 void SetTokenStart();
250 void SetTokenEnd();
251
Iterator()252 inline util::StringView::Iterator &Iterator()
253 {
254 return pos_.iterator_;
255 }
256
Iterator()257 inline const util::StringView::Iterator &Iterator() const
258 {
259 return pos_.iterator_;
260 }
261
262 util::StringView SourceView(const util::StringView::Iterator &begin, const util::StringView::Iterator &end) const;
263
264 bool SkipWhiteSpacesHelperSlash(char32_t *cp);
265 bool SkipWhiteSpacesHelperDefault(const char32_t &cp);
266 void SkipWhiteSpaces();
267 void SkipSingleLineComment();
268
269 bool ScanPunctuator();
270 void ScanQuestionPunctuator();
271 void ScanLessThanPunctuator();
272 void ScanGreaterThanPunctuator();
273 virtual void ScanEqualsPunctuator();
274 virtual void ScanExclamationPunctuator();
275 void ScanAmpersandPunctuator();
276 void ScanVLinePunctuator();
277 void ScanCircumflexPunctuator();
278 void ScanPlusPunctuator();
279 void ScanMinusPunctuator();
280 void ScanSlashPunctuator();
281 void ScanPercentPunctuator();
282 void ScanDotPunctuator();
283 void ScanColonPunctuator();
284 virtual bool ScanDollarPunctuator();
285 void ScanAtPunctuator();
286
287 virtual void SkipMultiLineComment();
288 virtual void ScanHashMark();
289 virtual void ScanBackTick();
290
ScanCharLiteral()291 virtual bool ScanCharLiteral()
292 {
293 return false;
294 }
295
296 char32_t ScanUnicodeEscapeSequence();
297 template <int N, bool IN_AS = false>
298 char32_t ScanHexEscape();
299 char32_t ScanUnicodeCodePointEscape();
300
301 bool ScanStringUnicodePart(util::UString *str);
302 char32_t ScanUnicodeCharacterHelper(size_t cpSize, char32_t cp);
303 char32_t ScanUnicodeCharacter();
304
305 void ScanDecimalNumbers();
306
ScanNumberLeadingZero(bool const leadingMinus)307 virtual void ScanNumberLeadingZero(bool const leadingMinus)
308 {
309 ScanNumberLeadingZeroImpl<double>(leadingMinus);
310 }
311
312 template <typename RadixType, typename RadixLimit = RadixType>
313 bool ScanNumberLeadingZeroImpl(bool const leadingMinus);
314 void ScanNumberLeadingZeroImplNonAllowedCases();
315 template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
316 bool ScanNumberRadix(bool leadingMinus, bool allowNumericSeparator = true);
317 void ScanNumber(bool const leadingMinus = false, bool allowBigInt = true);
318 std::optional<std::size_t> ScanCharLex(bool parseExponent, bool &allowBigInt, NumberFlags &flags);
319 std::optional<std::size_t> ScanSignOfNumber() noexcept;
320 virtual void ConvertNumber(NumberFlags flags);
321 void ScanDecimalLiteral();
322 void ScanDecimalDigits(bool allowNumericSeparator);
323 virtual void CheckNumberLiteralEnd();
324 void CheckOctal();
325
326 inline static uint32_t HexValue(char32_t ch);
327 inline static bool IsDecimalDigit(uint32_t cp);
328 inline static bool IsHexDigit(char32_t ch);
329 inline static bool IsBinaryDigit(char32_t ch);
330 inline static bool IsOctalDigit(char32_t ch);
331
332 friend class KeywordsUtil;
333 friend class TemplateLiteralParserContext;
334 friend class parser::ETSNolintParser;
335
336 LexerPosition &Pos();
337 const LexerPosition &Pos() const;
338
339 private:
340 TemplateLiteralParserContext *tlCtx_ {};
341 ArenaAllocator *allocator_;
342 Keywords *kws_ {};
343 const parser::ParserContext *parserContext_;
344 util::StringView source_;
345 LexerPosition pos_;
346 util::ErrorLogger *const errorLogger_;
347 };
348
349 class TemplateLiteralParserContext {
350 public:
TemplateLiteralParserContext(Lexer * lexer)351 explicit TemplateLiteralParserContext(Lexer *lexer) : lexer_(lexer), prev_(lexer_->tlCtx_) {}
352 NO_MOVE_SEMANTIC(TemplateLiteralParserContext);
353 NO_COPY_SEMANTIC(TemplateLiteralParserContext);
354
~TemplateLiteralParserContext()355 ~TemplateLiteralParserContext()
356 {
357 lexer_->tlCtx_ = prev_;
358 }
359
ConsumeLeftBrace()360 void ConsumeLeftBrace()
361 {
362 braceDepth_++;
363 }
364
ConsumeRightBrace()365 bool ConsumeRightBrace()
366 {
367 braceDepth_--;
368
369 return braceDepth_ == 0;
370 }
371
372 private:
373 Lexer *lexer_;
374 TemplateLiteralParserContext *prev_ {};
375 size_t braceDepth_ {1};
376 };
377
378 template <char32_t END>
379 // CC-OFFNXT(huge_method,G.FUN.01) big switch-case, solid logic
ScanString()380 void Lexer::ScanString()
381 {
382 util::UString str(Allocator());
383 PrepareStringTokenHelper();
384 const auto startPos = Iterator().Index();
385 auto escapeEnd = startPos;
386 bool validEscape = true;
387
388 do {
389 const char32_t cp = Iterator().Peek();
390 switch (cp) {
391 case util::StringView::Iterator::INVALID_CP: {
392 LogSyntaxError("Unterminated string");
393 break;
394 }
395 case LEX_CHAR_CR:
396 case LEX_CHAR_LF: {
397 if constexpr (END != LEX_CHAR_BACK_TICK) {
398 LogSyntaxError("Newline is not allowed in strings");
399 break;
400 }
401 HandleNewlineHelper(&str, &escapeEnd);
402 continue;
403 }
404 case LEX_CHAR_BACKSLASH: {
405 validEscape &= HandleBackslashHelper(&str, &escapeEnd);
406 continue;
407 }
408 case LEX_CHAR_BACK_TICK:
409 case LEX_CHAR_SINGLE_QUOTE:
410 case LEX_CHAR_DOUBLE_QUOTE: {
411 if (!HandleDoubleQuoteHelper(END, cp)) {
412 break;
413 }
414 continue;
415 }
416 case LEX_CHAR_DOLLAR_SIGN: {
417 if (HandleDollarSignHelper(END)) {
418 break;
419 }
420 continue;
421 }
422 default: {
423 Iterator().SkipCp();
424 continue;
425 }
426 }
427
428 FinalizeTokenHelper(&str, startPos, escapeEnd, validEscape);
429 break;
430 } while (true);
431
432 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
433 if constexpr (END != LEX_CHAR_BACK_TICK) {
434 Iterator().Forward(1);
435 }
436 }
437
438 template <int N, bool IN_AS>
ScanHexEscape()439 char32_t Lexer::ScanHexEscape()
440 {
441 char32_t code = 0;
442
443 for (size_t i = 0; i < N; ++i) {
444 const auto cp = Iterator().Peek();
445 if (IN_AS && cp == LEX_CHAR_BACK_TICK) {
446 break;
447 }
448
449 Iterator().Forward(1);
450
451 if (!IsHexDigit(cp)) {
452 LogSyntaxError("Invalid unicode escape sequence");
453 return UNICODE_INVALID_CP;
454 }
455
456 constexpr auto MULTIPLIER = 16;
457 code = code * MULTIPLIER + HexValue(cp);
458 }
459
460 return code;
461 }
462
463 template <typename RadixType, typename RadixLimit>
ScanNumberLeadingZeroImpl(bool const leadingMinus)464 bool Lexer::ScanNumberLeadingZeroImpl(bool const leadingMinus)
465 {
466 GetToken().type_ = TokenType::LITERAL_NUMBER;
467 GetToken().keywordType_ = TokenType::LITERAL_NUMBER;
468
469 switch (Iterator().Peek()) {
470 case LEX_CHAR_LOWERCASE_X:
471 case LEX_CHAR_UPPERCASE_X: {
472 Iterator().Forward(1);
473 constexpr auto RADIX = 16;
474 if (!ScanNumberRadix<IsHexDigit, RADIX, RadixType, RadixLimit>(leadingMinus)) {
475 return false;
476 }
477 CheckNumberLiteralEnd();
478 return true;
479 }
480 case LEX_CHAR_LOWERCASE_B:
481 case LEX_CHAR_UPPERCASE_B: {
482 Iterator().Forward(1);
483 constexpr auto RADIX = 2;
484 if (!ScanNumberRadix<IsBinaryDigit, RADIX, RadixType, RadixLimit>(leadingMinus)) {
485 return false;
486 }
487 CheckNumberLiteralEnd();
488 return true;
489 }
490 case LEX_CHAR_LOWERCASE_O:
491 case LEX_CHAR_UPPERCASE_O: {
492 Iterator().Forward(1);
493 constexpr auto RADIX = 8;
494 if (!ScanNumberRadix<IsOctalDigit, RADIX, RadixType, RadixLimit>(leadingMinus)) {
495 return false;
496 }
497 CheckOctal();
498 CheckNumberLiteralEnd();
499 return true;
500 }
501 default: {
502 ScanNumberLeadingZeroImplNonAllowedCases();
503 break;
504 }
505 }
506
507 ScanNumber(leadingMinus);
508 return true;
509 }
510
/**
 * Checks whether `number * RADIX + digit` still fits into RadixLimit.
 * Despite the name, the result is true when the value FITS and false when it would exceed
 * std::numeric_limits<RadixLimit>::max(). Non-integral RadixLimit types are never limited.
 */
template <int RADIX, typename RadixType, typename RadixLimit>
bool ScanTooLargeNumber([[maybe_unused]] RadixType const number, [[maybe_unused]] std::uint32_t const digit)
{
    // NOTE (DZ): probably a more sophisticated check will be required for general usage
    if constexpr (!std::is_integral_v<RadixLimit>) {
        return true;
    } else {
        const auto limit = static_cast<RadixType>(std::numeric_limits<RadixLimit>::max());
        // First make sure the multiplication cannot pass the limit, then the addition.
        const bool overflows = limit / RADIX < number || limit - number * RADIX < digit;
        return !overflows;
    }
}
523
524 template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
ScanNumberRadix(bool leadingMinus,bool allowNumericSeparator)525 bool Lexer::ScanNumberRadix(bool leadingMinus, bool allowNumericSeparator)
526 {
527 RadixType number {};
528
529 auto cp = Iterator().Peek();
530 if (!RANGE_CHECK(cp)) {
531 LogSyntaxError("Invalid digit");
532 }
533
534 bool allowNumericOnNext = true;
535
536 do {
537 cp = Iterator().Peek();
538 if (RANGE_CHECK(cp)) {
539 auto const digit = HexValue(cp);
540 if (!ScanTooLargeNumber<RADIX, RadixType, RadixLimit>(number, digit)) {
541 return false;
542 }
543 number = number * RADIX + digit;
544
545 Iterator().Forward(1);
546 allowNumericOnNext = true;
547 continue;
548 }
549
550 if (cp == LEX_CHAR_UNDERSCORE) {
551 if (!allowNumericSeparator || !allowNumericOnNext) {
552 LogSyntaxError("Invalid numeric separator");
553 }
554
555 GetToken().flags_ |= TokenFlags::NUMBER_HAS_UNDERSCORE;
556 Iterator().Forward(1);
557 allowNumericOnNext = false;
558 continue;
559 }
560
561 if (!allowNumericOnNext) {
562 Iterator().Backward(1);
563 LogSyntaxError("Numeric separators are not allowed at the end of numeric literals");
564 }
565
566 break;
567 } while (true);
568
569 if (leadingMinus) {
570 if constexpr (std::is_integral_v<RadixType>) {
571 number = ~number + static_cast<RadixType>(1);
572 } else {
573 number = -number;
574 }
575 }
576
577 GetToken().number_ = lexer::Number(number);
578 return true;
579 }
580
HexValue(char32_t ch)581 inline uint32_t Lexer::HexValue(char32_t ch)
582 {
583 constexpr uint32_t HEX_MASK = 0xF;
584 constexpr uint32_t DEC_OFFSET = 10;
585 return ch < LEX_CHAR_UPPERCASE_A ? ch - LEX_CHAR_0 : ((ch - LEX_CHAR_UPPERCASE_A + DEC_OFFSET) & HEX_MASK);
586 }
587
IsDecimalDigit(uint32_t cp)588 inline bool Lexer::IsDecimalDigit(uint32_t cp)
589 {
590 return (cp >= LEX_CHAR_0 && cp <= LEX_CHAR_9);
591 }
592
IsHexDigit(char32_t ch)593 inline bool Lexer::IsHexDigit(char32_t ch)
594 {
595 return ch < LEX_ASCII_MAX_BITS && (std::isxdigit(static_cast<unsigned char>(ch)) != 0);
596 }
597
IsBinaryDigit(char32_t ch)598 inline bool Lexer::IsBinaryDigit(char32_t ch)
599 {
600 return ch == LEX_CHAR_0 || ch == LEX_CHAR_1;
601 }
602
IsOctalDigit(char32_t ch)603 inline bool Lexer::IsOctalDigit(char32_t ch)
604 {
605 return (ch >= LEX_CHAR_0 && ch <= LEX_CHAR_7);
606 }
607 } // namespace ark::es2panda::lexer
608
609 template <>
610 struct enumbitops::IsAllowedType<ark::es2panda::lexer::NextTokenFlags> : std::true_type {
611 };
612
613 #endif
614