1 /**
2 * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #ifndef ES2PANDA_PARSER_CORE_LEXER_H
17 #define ES2PANDA_PARSER_CORE_LEXER_H
18
19 #include <ios>
20 #include "lexer/regexp/regexp.h"
21 #include "lexer/token/letters.h"
22 #include "lexer/token/token.h"
23 #include "util/enumbitops.h"
24
25 namespace ark::es2panda::parser {
26 class ParserContext;
27 class ETSNolintParser;
28 } // namespace ark::es2panda::parser
29
30 namespace ark::es2panda::lexer {
31 class Keywords;
32
33 using ENUMBITOPS_OPERATORS;
34
/**
 * Option bits accepted by Lexer::NextToken(). Every non-zero enumerator occupies its own bit,
 * so several options can be combined into a single value.
 * NOTE(review): the exact effect of each flag lives in the lexer implementation, not in this header.
 */
enum class NextTokenFlags : uint32_t {
    NONE = 0x0U,                       // no option set
    KEYWORD_TO_IDENT = 0x1U,           // bit 0
    NUMERIC_SEPARATOR_ALLOWED = 0x2U,  // bit 1
    BIGINT_ALLOWED = 0x4U,             // bit 2
};
41
42 class LexerPosition {
43 public:
44 explicit LexerPosition(const util::StringView &source);
45 DEFAULT_COPY_SEMANTIC(LexerPosition);
46 DEFAULT_MOVE_SEMANTIC(LexerPosition);
47 ~LexerPosition() = default;
48
Iterator()49 util::StringView::Iterator &Iterator()
50 {
51 return iterator_;
52 }
53
Iterator()54 const util::StringView::Iterator &Iterator() const
55 {
56 return iterator_;
57 }
58
Line()59 size_t Line() const
60 {
61 return line_;
62 }
63
GetToken()64 Token &GetToken()
65 {
66 return token_;
67 }
68
GetToken()69 const Token &GetToken() const
70 {
71 return token_;
72 }
73
NextTokenLine()74 size_t &NextTokenLine()
75 {
76 return nextTokenLine_;
77 }
78
79 private:
80 friend class Lexer;
81
82 Token token_ {};
83 util::StringView::Iterator iterator_;
84 size_t line_ {};
85 size_t nextTokenLine_ {};
86 };
87
88 class LexerTemplateString {
89 public:
LexerTemplateString(ArenaAllocator * allocator)90 explicit LexerTemplateString(ArenaAllocator *allocator) : str(allocator) {}
91 DEFAULT_COPY_SEMANTIC(LexerTemplateString);
92 DEFAULT_MOVE_SEMANTIC(LexerTemplateString);
93 ~LexerTemplateString() = default;
94
95 // NOLINTBEGIN(misc-non-private-member-variables-in-classes)
96 util::UString str;
97 size_t end {};
98 bool scanExpression {};
99 // NOLINTEND(misc-non-private-member-variables-in-classes)
100 };
101
102 class TemplateLiteralParserContext;
103
104 class Lexer {
105 public:
106 explicit Lexer(const parser::ParserContext *parserContext, bool startLexer = true);
107 NO_COPY_SEMANTIC(Lexer);
108 NO_MOVE_SEMANTIC(Lexer);
109 virtual ~Lexer() = default;
110
111 // NOLINTNEXTLINE(google-default-arguments)
112 virtual void NextToken(NextTokenFlags flags = NextTokenFlags::NONE);
113 virtual void ScanAsteriskPunctuator();
114
115 Token &GetToken();
116 const Token &GetToken() const;
117 size_t Line() const;
118
TryEatTokenType(lexer::TokenType type)119 bool TryEatTokenType(lexer::TokenType type)
120 {
121 auto token = GetToken();
122 if (token.Type() == type) {
123 NextToken();
124 return true;
125 }
126 return false;
127 }
128
TryEatTokenKeyword(lexer::TokenType type)129 std::optional<Token> TryEatTokenKeyword(lexer::TokenType type)
130 {
131 auto token = GetToken();
132 if (token.KeywordType() == type) {
133 NextToken();
134 return token;
135 }
136 return std::nullopt;
137 }
138
139 LexerPosition Save() const;
140 void Rewind(const LexerPosition &pos);
141 void BackwardToken(TokenType type, size_t offset);
142 void ForwardToken(TokenType type, size_t offset);
143
144 char32_t Lookahead();
145 bool CheckArrow();
146
147 RegExp ScanRegExp();
148 template <char32_t END>
149 void ScanString();
150 void ResetTokenEnd();
151 LexerTemplateString ScanTemplateString();
152 void ScanTemplateStringEnd();
153 void PushTemplateContext(TemplateLiteralParserContext *ctx);
ThrowUnexpectedStrictModeReservedKeyword()154 [[noreturn]] void ThrowUnexpectedStrictModeReservedKeyword() const
155 {
156 ThrowError("Unexpected strict mode reserved keyword");
157 }
158
159 enum class ConversionResult : uint8_t {
160 SUCCESS,
161 INVALID_ARGUMENT,
162 OUT_OF_RANGE,
163 };
164
165 template <typename Tret, typename Ret = Tret, typename... Base>
StrToNumeric(Tret (* converter)(const char *,char **,Base...),const char * str,ConversionResult & result,Base...base)166 static Ret StrToNumeric(Tret (*converter)(const char *, char **, Base...), const char *str,
167 ConversionResult &result, Base... base) noexcept
168 {
169 Ret ret {};
170 char *endPtr;
171 // NOLINTBEGIN(cppcoreguidelines-special-member-functions)
172 struct SaveErrno {
173 explicit SaveErrno() : errno_(errno)
174 {
175 errno = 0;
176 }
177 ~SaveErrno()
178 {
179 if (errno == 0) {
180 errno = errno_;
181 }
182 }
183
184 private:
185 decltype(errno) errno_;
186 } const savedErrno;
187 // NOLINTEND(cppcoreguidelines-special-member-functions)
188
189 const Tret tmp = converter(str, &endPtr, base...);
190
191 bool outOfRange = false;
192 if constexpr (std::is_same_v<Ret, int>) {
193 outOfRange = tmp < static_cast<Tret>(std::numeric_limits<int>::min()) ||
194 tmp > static_cast<Tret>(std::numeric_limits<int>::max());
195 }
196
197 if (endPtr == str) {
198 result = ConversionResult::INVALID_ARGUMENT;
199 } else if (errno == ERANGE || outOfRange) {
200 result = ConversionResult::OUT_OF_RANGE;
201 } else {
202 result = ConversionResult::SUCCESS;
203 ret = tmp;
204 }
205
206 return ret;
207 }
208
209 util::StringView SourceView(size_t begin, size_t end) const;
210
211 protected:
212 void NextToken(Keywords *kws);
213 ArenaAllocator *Allocator();
214 bool IsLineTerminatorOrEos() const;
215 void ScanRegExpPattern();
216 RegExpFlags ScanRegExpFlags();
217
218 [[noreturn]] void ThrowError(std::string_view message) const;
219 [[noreturn]] void ThrowUnexpectedToken(lexer::TokenType tokenType) const;
220
221 void SetTokenStart();
222 void SetTokenEnd();
223
Iterator()224 inline util::StringView::Iterator &Iterator()
225 {
226 return pos_.iterator_;
227 }
228
Iterator()229 inline const util::StringView::Iterator &Iterator() const
230 {
231 return pos_.iterator_;
232 }
233
234 util::StringView SourceView(const util::StringView::Iterator &begin, const util::StringView::Iterator &end) const;
235
236 void SkipWhiteSpaces();
237 void SkipSingleLineComment();
238
239 bool ScanPunctuator();
240 void ScanQuestionPunctuator();
241 void ScanLessThanPunctuator();
242 void ScanGreaterThanPunctuator();
243 virtual void ScanEqualsPunctuator();
244 virtual void ScanExclamationPunctuator();
245 void ScanAmpersandPunctuator();
246 void ScanVLinePunctuator();
247 void ScanCircumflexPunctuator();
248 void ScanPlusPunctuator();
249 void ScanMinusPunctuator();
250 void ScanSlashPunctuator();
251 void ScanPercentPunctuator();
252 void ScanDotPunctuator();
253 void ScanColonPunctuator();
254 virtual bool ScanDollarPunctuator();
255 void ScanAtPunctuator();
256
257 virtual void SkipMultiLineComment();
258 virtual void ScanHashMark();
259 virtual void ScanBackTick();
260
ScanCharLiteral()261 virtual bool ScanCharLiteral()
262 {
263 return false;
264 }
265
266 char32_t ScanUnicodeEscapeSequence();
267 template <int N, bool IN_AS = false>
268 char32_t ScanHexEscape();
269 char32_t ScanUnicodeCodePointEscape();
270
271 void ScanStringUnicodePart(util::UString *str);
272 char32_t ScanUnicodeCharacter();
273
274 void ScanDecimalNumbers();
275
ScanNumberLeadingZero()276 virtual void ScanNumberLeadingZero()
277 {
278 ScanNumberLeadingZeroImpl<double>();
279 }
280
281 template <typename RadixType, typename RadixLimit = void *>
282 void ScanNumberLeadingZeroImpl();
283 void ScanNumberLeadingZeroImplNonAllowedCases();
284 template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
285 void ScanNumberRadix(bool allowNumericSeparator = true);
286 void ScanNumber(bool allowBigInt = true);
287 std::tuple<size_t, bool, NumberFlags> ScanCharLex(bool allowBigInt, bool parseExponent, NumberFlags flags);
288 size_t ScanSignOfNumber();
289 template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
290 void ScanTooLargeNumber(RadixType number);
291 virtual void ConvertNumber(const std::string &utf8, NumberFlags flags);
292 void ScanDecimalLiteral();
293 void ScanDecimalDigits(bool allowNumericSeparator);
294 virtual void CheckNumberLiteralEnd();
295
296 inline static uint32_t HexValue(char32_t ch);
297 inline static bool IsDecimalDigit(uint32_t cp);
298 inline static bool IsHexDigit(char32_t ch);
299 inline static bool IsBinaryDigit(char32_t ch);
300 inline static bool IsOctalDigit(char32_t ch);
301
302 friend class KeywordsUtil;
303 friend class TemplateLiteralParserContext;
304 friend class parser::ETSNolintParser;
305
306 LexerPosition &Pos();
307 const LexerPosition &Pos() const;
308
309 private:
310 TemplateLiteralParserContext *tlCtx_ {};
311 ArenaAllocator *allocator_;
312 Keywords *kws_ {};
313 const parser::ParserContext *parserContext_;
314 util::StringView source_;
315 LexerPosition pos_;
316 };
317
318 class TemplateLiteralParserContext {
319 public:
TemplateLiteralParserContext(Lexer * lexer)320 explicit TemplateLiteralParserContext(Lexer *lexer) : lexer_(lexer), prev_(lexer_->tlCtx_) {}
321 NO_MOVE_SEMANTIC(TemplateLiteralParserContext);
322 NO_COPY_SEMANTIC(TemplateLiteralParserContext);
323
~TemplateLiteralParserContext()324 ~TemplateLiteralParserContext()
325 {
326 lexer_->tlCtx_ = prev_;
327 }
328
ConsumeLeftBrace()329 void ConsumeLeftBrace()
330 {
331 braceDepth_++;
332 }
333
ConsumeRightBrace()334 bool ConsumeRightBrace()
335 {
336 braceDepth_--;
337
338 return braceDepth_ == 0;
339 }
340
341 private:
342 Lexer *lexer_;
343 TemplateLiteralParserContext *prev_ {};
344 size_t braceDepth_ {1};
345 };
346
347 template <char32_t END>
ScanString()348 void Lexer::ScanString()
349 {
350 util::UString str(Allocator());
351 GetToken().type_ = TokenType::LITERAL_STRING;
352 GetToken().keywordType_ = TokenType::LITERAL_STRING;
353
354 const auto startPos = Iterator().Index();
355 auto escapeEnd = startPos;
356
357 do {
358 char32_t cp = Iterator().Peek();
359
360 switch (cp) {
361 case util::StringView::Iterator::INVALID_CP: {
362 ThrowError("Unterminated string");
363 break;
364 }
365 case LEX_CHAR_CR:
366 case LEX_CHAR_LF: {
367 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
368 if constexpr (END != LEX_CHAR_BACK_TICK) {
369 ThrowError("Newline is not allowed in strings");
370 }
371
372 GetToken().flags_ |= TokenFlags::HAS_ESCAPE;
373 str.Append(SourceView(escapeEnd, Iterator().Index()));
374
375 if (cp == LEX_CHAR_CR) {
376 Iterator().Forward(1);
377
378 if (Iterator().Peek() != LEX_CHAR_LF) {
379 Iterator().Backward(1);
380 }
381 }
382
383 pos_.line_++;
384 str.Append(LEX_CHAR_LF);
385 Iterator().Forward(1);
386 escapeEnd = Iterator().Index();
387 continue;
388 }
389 case LEX_CHAR_BACKSLASH: {
390 GetToken().flags_ |= TokenFlags::HAS_ESCAPE;
391 str.Append(SourceView(escapeEnd, Iterator().Index()));
392
393 Iterator().Forward(1);
394 ScanStringUnicodePart(&str);
395 escapeEnd = Iterator().Index();
396 continue;
397 }
398 case LEX_CHAR_BACK_TICK:
399 case LEX_CHAR_SINGLE_QUOTE:
400 case LEX_CHAR_DOUBLE_QUOTE: {
401 if (END == cp) {
402 break;
403 }
404
405 Iterator().Forward(1);
406 continue;
407 }
408 case LEX_CHAR_DOLLAR_SIGN: {
409 Iterator().Forward(1);
410
411 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
412 if constexpr (END == LEX_CHAR_BACK_TICK) {
413 if (Iterator().Peek() == LEX_CHAR_LEFT_BRACE) {
414 Iterator().Backward(1);
415 break;
416 }
417 }
418
419 continue;
420 }
421 default: {
422 Iterator().SkipCp();
423 continue;
424 }
425 }
426
427 if (GetToken().flags_ & TokenFlags::HAS_ESCAPE) {
428 str.Append(SourceView(escapeEnd, Iterator().Index()));
429 GetToken().src_ = str.View();
430 } else {
431 GetToken().src_ = SourceView(startPos, Iterator().Index());
432 }
433
434 break;
435 } while (true);
436
437 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
438 if constexpr (END != LEX_CHAR_BACK_TICK) {
439 Iterator().Forward(1);
440 }
441 }
442
443 template <int N, bool IN_AS>
ScanHexEscape()444 char32_t Lexer::ScanHexEscape()
445 {
446 char32_t code = 0;
447
448 for (size_t i = 0; i < N; ++i) {
449 const auto cp = Iterator().Peek();
450 if (IN_AS && cp == LEX_CHAR_BACK_TICK) {
451 break;
452 }
453
454 Iterator().Forward(1);
455
456 if (!IsHexDigit(cp)) {
457 ThrowError("Invalid unicode escape sequence");
458 }
459
460 constexpr auto MULTIPLIER = 16;
461 code = code * MULTIPLIER + HexValue(cp);
462 }
463
464 return code;
465 }
466
467 template <typename RadixType, typename RadixLimit>
ScanNumberLeadingZeroImpl()468 void Lexer::ScanNumberLeadingZeroImpl()
469 {
470 GetToken().type_ = TokenType::LITERAL_NUMBER;
471 GetToken().keywordType_ = TokenType::LITERAL_NUMBER;
472
473 switch (Iterator().Peek()) {
474 case LEX_CHAR_LOWERCASE_X:
475 case LEX_CHAR_UPPERCASE_X: {
476 Iterator().Forward(1);
477 constexpr auto RADIX = 16;
478 ScanNumberRadix<IsHexDigit, RADIX, RadixType, RadixLimit>();
479 CheckNumberLiteralEnd();
480 return;
481 }
482 case LEX_CHAR_LOWERCASE_B:
483 case LEX_CHAR_UPPERCASE_B: {
484 Iterator().Forward(1);
485 constexpr auto RADIX = 2;
486 ScanNumberRadix<IsBinaryDigit, RADIX, RadixType, RadixLimit>();
487 CheckNumberLiteralEnd();
488 return;
489 }
490 case LEX_CHAR_LOWERCASE_O:
491 case LEX_CHAR_UPPERCASE_O: {
492 Iterator().Forward(1);
493 constexpr auto RADIX = 8;
494 ScanNumberRadix<IsOctalDigit, RADIX, RadixType, RadixLimit>();
495
496 switch (Iterator().Peek()) {
497 case LEX_CHAR_8:
498 case LEX_CHAR_9: {
499 ThrowError("Invalid octal digit");
500 }
501 default: {
502 break;
503 }
504 }
505
506 CheckNumberLiteralEnd();
507 return;
508 }
509 default: {
510 ScanNumberLeadingZeroImplNonAllowedCases();
511 break;
512 }
513 }
514
515 ScanNumber();
516 }
517
518 template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
ScanTooLargeNumber(RadixType number)519 void Lexer::ScanTooLargeNumber([[maybe_unused]] RadixType number)
520 {
521 if constexpr (std::is_arithmetic_v<RadixLimit>) {
522 if (number > std::numeric_limits<RadixLimit>::max() / RADIX) {
523 ThrowError("Number is too large");
524 }
525 }
526 }
527
528 template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
ScanNumberRadix(bool allowNumericSeparator)529 void Lexer::ScanNumberRadix(bool allowNumericSeparator)
530 {
531 RadixType number {};
532
533 auto cp = Iterator().Peek();
534 if (!RANGE_CHECK(cp)) {
535 ThrowError("Invalid digit");
536 }
537
538 bool allowNumericOnNext = true;
539
540 do {
541 cp = Iterator().Peek();
542 if (RANGE_CHECK(cp)) {
543 auto digit = HexValue(cp);
544
545 ScanTooLargeNumber<RANGE_CHECK, RADIX, RadixType, RadixLimit>(number);
546
547 number = number * RADIX + digit;
548 Iterator().Forward(1);
549 allowNumericOnNext = true;
550 continue;
551 }
552
553 if (cp == LEX_CHAR_UNDERSCORE) {
554 if (!allowNumericSeparator || !allowNumericOnNext) {
555 ThrowError("Invalid numeric separator");
556 }
557
558 GetToken().flags_ |= TokenFlags::NUMBER_HAS_UNDERSCORE;
559 Iterator().Forward(1);
560 allowNumericOnNext = false;
561 continue;
562 }
563
564 if (!allowNumericOnNext) {
565 Iterator().Backward(1);
566 ThrowError("Numeric separators are not allowed at the end of numeric literals");
567 }
568
569 break;
570 } while (true);
571
572 GetToken().number_ = lexer::Number(number);
573 }
574
HexValue(char32_t ch)575 inline uint32_t Lexer::HexValue(char32_t ch)
576 {
577 constexpr uint32_t HEX_MASK = 0xF;
578 constexpr uint32_t DEC_OFFSET = 10;
579 return ch < LEX_CHAR_UPPERCASE_A ? ch - LEX_CHAR_0 : ((ch - LEX_CHAR_UPPERCASE_A + DEC_OFFSET) & HEX_MASK);
580 }
581
IsDecimalDigit(uint32_t cp)582 inline bool Lexer::IsDecimalDigit(uint32_t cp)
583 {
584 return (cp >= LEX_CHAR_0 && cp <= LEX_CHAR_9);
585 }
586
IsHexDigit(char32_t ch)587 inline bool Lexer::IsHexDigit(char32_t ch)
588 {
589 return ch < LEX_ASCII_MAX_BITS && (std::isxdigit(static_cast<unsigned char>(ch)) != 0);
590 }
591
IsBinaryDigit(char32_t ch)592 inline bool Lexer::IsBinaryDigit(char32_t ch)
593 {
594 return ch == LEX_CHAR_0 || ch == LEX_CHAR_1;
595 }
596
IsOctalDigit(char32_t ch)597 inline bool Lexer::IsOctalDigit(char32_t ch)
598 {
599 return (ch >= LEX_CHAR_0 && ch <= LEX_CHAR_7);
600 }
601 } // namespace ark::es2panda::lexer
602
603 template <>
604 struct enumbitops::IsAllowedType<ark::es2panda::lexer::NextTokenFlags> : std::true_type {
605 };
606
607 #endif
608