1 /**
2 * Copyright (c) 2021 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #ifndef ES2PANDA_PARSER_CORE_LEXER_H
17 #define ES2PANDA_PARSER_CORE_LEXER_H
18
19 #include "lexer/regexp/regexp.h"
20 #include "lexer/token/letters.h"
21 #include "lexer/token/token.h"
22 #include "util/enumbitops.h"
23
24 namespace panda::es2panda::parser {
25 class ParserContext;
26 } // namespace panda::es2panda::parser
27
28 namespace panda::es2panda::lexer {
29 class Keywords;
30
/**
 * Option bits accepted by Lexer::NextToken(). Every enumerator owns a distinct bit,
 * so several options can be combined into one value.
 * NOTE(review): the per-flag meanings below are read off the enumerator names — confirm
 * against the lexer implementation.
 */
enum class NextTokenFlags : uint32_t {
    NONE = 0x0U,                       // no option set
    KEYWORD_TO_IDENT = 0x1U,           // bit 0: presumably "treat keywords as identifiers"
    NUMERIC_SEPARATOR_ALLOWED = 0x2U,  // bit 1: presumably "accept '_' inside numeric literals"
    BIGINT_ALLOWED = 0x4U,             // bit 2: presumably "accept BigInt literals"
};
37
// Expands the project's DEFINE_BITOPS macro for NextTokenFlags. The macro is presumably supplied by
// util/enumbitops.h and provides the bitwise operators for the enum — confirm against that header.
DEFINE_BITOPS(NextTokenFlags)
39
40 class LexerPosition {
41 public:
42 explicit LexerPosition(const util::StringView &source);
43 DEFAULT_COPY_SEMANTIC(LexerPosition);
44 DEFAULT_MOVE_SEMANTIC(LexerPosition);
45 ~LexerPosition() = default;
46
47 util::StringView::Iterator &Iterator()
48 {
49 return iterator_;
50 }
51
52 const util::StringView::Iterator &Iterator() const
53 {
54 return iterator_;
55 }
56
57 size_t Line() const
58 {
59 return line_;
60 }
61
62 Token &GetToken()
63 {
64 return token_;
65 }
66
67 const Token &GetToken() const
68 {
69 return token_;
70 }
71
72 size_t &NextTokenLine()
73 {
74 return nextTokenLine_;
75 }
76
77 private:
78 friend class Lexer;
79
80 Token token_ {};
81 util::StringView::Iterator iterator_;
82 size_t line_ {};
83 size_t nextTokenLine_ {};
84 };
85
86 class LexerTemplateString {
87 public:
LexerTemplateString(ArenaAllocator * allocator)88 explicit LexerTemplateString(ArenaAllocator *allocator) : str(allocator) {}
89 DEFAULT_COPY_SEMANTIC(LexerTemplateString);
90 DEFAULT_MOVE_SEMANTIC(LexerTemplateString);
91 ~LexerTemplateString() = default;
92
93 // NOLINTBEGIN(misc-non-private-member-variables-in-classes)
94 util::UString str;
95 size_t end {};
96 bool scanExpression {};
97 // NOLINTEND(misc-non-private-member-variables-in-classes)
98 };
99
100 class TemplateLiteralParserContext;
101
102 class Lexer {
103 public:
104 explicit Lexer(const parser::ParserContext *parserContext, bool startLexer = true);
105 NO_COPY_SEMANTIC(Lexer);
106 NO_MOVE_SEMANTIC(Lexer);
107 virtual ~Lexer() = default;
108
109 // NOLINTNEXTLINE(google-default-arguments)
110 virtual void NextToken(NextTokenFlags flags = NextTokenFlags::NONE);
111 virtual void ScanAsteriskPunctuator();
112
113 Token &GetToken();
114 const Token &GetToken() const;
115 size_t Line() const;
116
117 LexerPosition Save() const;
118 void Rewind(const LexerPosition &pos);
119 void BackwardToken(TokenType type, size_t offset);
120 void ForwardToken(TokenType type, size_t offset);
121
122 char32_t Lookahead();
123 bool CheckArrow();
124
125 RegExp ScanRegExp();
126 template <char32_t END>
127 void ScanString();
128 void ResetTokenEnd();
129 LexerTemplateString ScanTemplateString();
130 void ScanTemplateStringEnd();
131 void PushTemplateContext(TemplateLiteralParserContext *ctx);
ThrowUnexpectedStrictModeReservedKeyword()132 [[noreturn]] void ThrowUnexpectedStrictModeReservedKeyword() const
133 {
134 ThrowError("Unexpected strict mode reserved keyword");
135 }
136
137 enum class ConversionResult : uint8_t {
138 SUCCESS,
139 INVALID_ARGUMENT,
140 OUT_OF_RANGE,
141 };
142
143 template <typename Tret, typename Ret = Tret, typename... Base>
StrToNumeric(Tret (* converter)(const char *,char **,Base...),const char * str,ConversionResult & result,Base...base)144 static Ret StrToNumeric(Tret (*converter)(const char *, char **, Base...), const char *str,
145 ConversionResult &result, Base... base) noexcept
146 {
147 Ret ret {};
148 char *endPtr;
149 // NOLINTBEGIN(cppcoreguidelines-special-member-functions)
150 struct SaveErrno {
151 explicit SaveErrno() : errno_(errno)
152 {
153 errno = 0;
154 }
155 ~SaveErrno()
156 {
157 if (errno == 0) {
158 errno = errno_;
159 }
160 }
161
162 private:
163 decltype(errno) errno_;
164 } const savedErrno;
165 // NOLINTEND(cppcoreguidelines-special-member-functions)
166
167 const Tret tmp = converter(str, &endPtr, base...);
168
169 bool outOfRange = false;
170 if constexpr (std::is_same_v<Ret, int>) {
171 outOfRange = tmp < static_cast<Tret>(std::numeric_limits<int>::min()) ||
172 tmp > static_cast<Tret>(std::numeric_limits<int>::max());
173 }
174
175 if (endPtr == str) {
176 result = ConversionResult::INVALID_ARGUMENT;
177 } else if (errno == ERANGE || outOfRange) {
178 result = ConversionResult::OUT_OF_RANGE;
179 } else {
180 result = ConversionResult::SUCCESS;
181 ret = tmp;
182 }
183
184 return ret;
185 }
186
187 util::StringView SourceView(size_t begin, size_t end) const;
188
189 protected:
190 void NextToken(Keywords *kws);
191 ArenaAllocator *Allocator();
192 bool IsLineTerminatorOrEos() const;
193 void ScanRegExpPattern();
194 RegExpFlags ScanRegExpFlags();
195
196 [[noreturn]] void ThrowError(std::string_view message) const;
197 [[noreturn]] void ThrowUnexpectedToken(lexer::TokenType tokenType) const;
198
199 void SetTokenStart();
200 void SetTokenEnd();
201
Iterator()202 inline util::StringView::Iterator &Iterator()
203 {
204 return pos_.iterator_;
205 }
206
Iterator()207 inline const util::StringView::Iterator &Iterator() const
208 {
209 return pos_.iterator_;
210 }
211
212 util::StringView SourceView(const util::StringView::Iterator &begin, const util::StringView::Iterator &end) const;
213
214 void SkipWhiteSpaces();
215 void SkipSingleLineComment();
216
217 bool ScanPunctuator();
218 void ScanQuestionPunctuator();
219 void ScanLessThanPunctuator();
220 void ScanGreaterThanPunctuator();
221 virtual void ScanEqualsPunctuator();
222 virtual void ScanExclamationPunctuator();
223 void ScanAmpersandPunctuator();
224 void ScanVLinePunctuator();
225 void ScanCircumflexPunctuator();
226 void ScanPlusPunctuator();
227 void ScanMinusPunctuator();
228 void ScanSlashPunctuator();
229 void ScanPercentPunctuator();
230 void ScanDotPunctuator();
231 void ScanColonPunctuator();
232 virtual bool ScanDollarPunctuator();
233 void ScanAtPunctuator();
234
235 virtual void SkipMultiLineComment();
236 virtual void ScanHashMark();
237 virtual void ScanBackTick();
238
ScanCharLiteral()239 virtual bool ScanCharLiteral()
240 {
241 return false;
242 }
243
244 char32_t ScanUnicodeEscapeSequence();
245 template <int N, bool IN_AS = false>
246 char32_t ScanHexEscape();
247 char32_t ScanUnicodeCodePointEscape();
248
249 void ScanStringUnicodePart(util::UString *str);
250 char32_t ScanUnicodeCharacter();
251
252 void ScanDecimalNumbers();
253
ScanNumberLeadingZero()254 virtual void ScanNumberLeadingZero()
255 {
256 ScanNumberLeadingZeroImpl<double>();
257 }
258
259 template <typename RadixType, typename RadixLimit = void *>
260 void ScanNumberLeadingZeroImpl();
261 void ScanNumberLeadingZeroImplNonAllowedCases();
262 template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
263 void ScanNumberRadix(bool allowNumericSeparator = true);
264 void ScanNumber(bool allowBigInt = true);
265 template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
266 void ScanTooLargeNumber(RadixType number);
267 virtual void ConvertNumber(const std::string &utf8, NumberFlags flags);
268 void ScanDecimalLiteral();
269 void ScanDecimalDigits(bool allowNumericSeparator);
270 virtual void CheckNumberLiteralEnd();
271
272 inline static uint32_t HexValue(char32_t ch);
273 inline static bool IsDecimalDigit(uint32_t cp);
274 inline static bool IsHexDigit(char32_t ch);
275 inline static bool IsBinaryDigit(char32_t ch);
276 inline static bool IsOctalDigit(char32_t ch);
277
278 friend class KeywordsUtil;
279 friend class TemplateLiteralParserContext;
280
281 LexerPosition &Pos();
282 const LexerPosition &Pos() const;
283
284 private:
285 TemplateLiteralParserContext *tlCtx_ {};
286 ArenaAllocator *allocator_;
287 Keywords *kws_ {};
288 const parser::ParserContext *parserContext_;
289 util::StringView source_;
290 LexerPosition pos_;
291 };
292
293 class TemplateLiteralParserContext {
294 public:
TemplateLiteralParserContext(Lexer * lexer)295 explicit TemplateLiteralParserContext(Lexer *lexer) : lexer_(lexer), prev_(lexer_->tlCtx_) {}
296 NO_MOVE_SEMANTIC(TemplateLiteralParserContext);
297 NO_COPY_SEMANTIC(TemplateLiteralParserContext);
298
~TemplateLiteralParserContext()299 ~TemplateLiteralParserContext()
300 {
301 lexer_->tlCtx_ = prev_;
302 }
303
ConsumeLeftBrace()304 void ConsumeLeftBrace()
305 {
306 braceDepth_++;
307 }
308
ConsumeRightBrace()309 bool ConsumeRightBrace()
310 {
311 braceDepth_--;
312
313 return braceDepth_ == 0;
314 }
315
316 private:
317 Lexer *lexer_;
318 TemplateLiteralParserContext *prev_ {};
319 size_t braceDepth_ {1};
320 };
321
322 template <char32_t END>
ScanString()323 void Lexer::ScanString()
324 {
325 util::UString str(Allocator());
326 GetToken().type_ = TokenType::LITERAL_STRING;
327 GetToken().keywordType_ = TokenType::LITERAL_STRING;
328
329 const auto startPos = Iterator().Index();
330 auto escapeEnd = startPos;
331
332 do {
333 char32_t cp = Iterator().Peek();
334
335 switch (cp) {
336 case util::StringView::Iterator::INVALID_CP: {
337 ThrowError("Unterminated string");
338 break;
339 }
340 case LEX_CHAR_CR:
341 case LEX_CHAR_LF: {
342 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
343 if constexpr (END != LEX_CHAR_BACK_TICK) {
344 ThrowError("Newline is not allowed in strings");
345 }
346
347 GetToken().flags_ |= TokenFlags::HAS_ESCAPE;
348 str.Append(SourceView(escapeEnd, Iterator().Index()));
349
350 if (cp == LEX_CHAR_CR) {
351 Iterator().Forward(1);
352
353 if (Iterator().Peek() != LEX_CHAR_LF) {
354 Iterator().Backward(1);
355 }
356 }
357
358 pos_.line_++;
359 str.Append(LEX_CHAR_LF);
360 Iterator().Forward(1);
361 escapeEnd = Iterator().Index();
362 continue;
363 }
364 case LEX_CHAR_BACKSLASH: {
365 GetToken().flags_ |= TokenFlags::HAS_ESCAPE;
366 str.Append(SourceView(escapeEnd, Iterator().Index()));
367
368 Iterator().Forward(1);
369 ScanStringUnicodePart(&str);
370 escapeEnd = Iterator().Index();
371 continue;
372 }
373 case LEX_CHAR_BACK_TICK:
374 case LEX_CHAR_SINGLE_QUOTE:
375 case LEX_CHAR_DOUBLE_QUOTE: {
376 if (END == cp) {
377 break;
378 }
379
380 Iterator().Forward(1);
381 continue;
382 }
383 case LEX_CHAR_DOLLAR_SIGN: {
384 Iterator().Forward(1);
385
386 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
387 if constexpr (END == LEX_CHAR_BACK_TICK) {
388 if (Iterator().Peek() == LEX_CHAR_LEFT_BRACE) {
389 Iterator().Backward(1);
390 break;
391 }
392 }
393
394 continue;
395 }
396 default: {
397 Iterator().SkipCp();
398 continue;
399 }
400 }
401
402 if (GetToken().flags_ & TokenFlags::HAS_ESCAPE) {
403 str.Append(SourceView(escapeEnd, Iterator().Index()));
404 GetToken().src_ = str.View();
405 } else {
406 GetToken().src_ = SourceView(startPos, Iterator().Index());
407 }
408
409 break;
410 } while (true);
411
412 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
413 if constexpr (END != LEX_CHAR_BACK_TICK) {
414 Iterator().Forward(1);
415 }
416 }
417
418 template <int N, bool IN_AS>
ScanHexEscape()419 char32_t Lexer::ScanHexEscape()
420 {
421 char32_t code = 0;
422
423 for (size_t i = 0; i < N; ++i) {
424 const auto cp = Iterator().Peek();
425 if (IN_AS && cp == LEX_CHAR_BACK_TICK) {
426 break;
427 }
428
429 Iterator().Forward(1);
430
431 if (!IsHexDigit(cp)) {
432 ThrowError("Invalid unicode escape sequence");
433 }
434
435 constexpr auto MULTIPLIER = 16;
436 code = code * MULTIPLIER + HexValue(cp);
437 }
438
439 return code;
440 }
441
442 template <typename RadixType, typename RadixLimit>
ScanNumberLeadingZeroImpl()443 void Lexer::ScanNumberLeadingZeroImpl()
444 {
445 GetToken().type_ = TokenType::LITERAL_NUMBER;
446 GetToken().keywordType_ = TokenType::LITERAL_NUMBER;
447
448 switch (Iterator().Peek()) {
449 case LEX_CHAR_LOWERCASE_X:
450 case LEX_CHAR_UPPERCASE_X: {
451 Iterator().Forward(1);
452 constexpr auto RADIX = 16;
453 ScanNumberRadix<IsHexDigit, RADIX, RadixType, RadixLimit>();
454 CheckNumberLiteralEnd();
455 return;
456 }
457 case LEX_CHAR_LOWERCASE_B:
458 case LEX_CHAR_UPPERCASE_B: {
459 Iterator().Forward(1);
460 constexpr auto RADIX = 2;
461 ScanNumberRadix<IsBinaryDigit, RADIX, RadixType, RadixLimit>();
462 CheckNumberLiteralEnd();
463 return;
464 }
465 case LEX_CHAR_LOWERCASE_O:
466 case LEX_CHAR_UPPERCASE_O: {
467 Iterator().Forward(1);
468 constexpr auto RADIX = 8;
469 ScanNumberRadix<IsOctalDigit, RADIX, RadixType, RadixLimit>();
470
471 switch (Iterator().Peek()) {
472 case LEX_CHAR_8:
473 case LEX_CHAR_9: {
474 ThrowError("Invalid octal digit");
475 }
476 default: {
477 break;
478 }
479 }
480
481 CheckNumberLiteralEnd();
482 return;
483 }
484 default: {
485 ScanNumberLeadingZeroImplNonAllowedCases();
486 break;
487 }
488 }
489
490 ScanNumber();
491 }
492
493 template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
ScanTooLargeNumber(RadixType number)494 void Lexer::ScanTooLargeNumber([[maybe_unused]] RadixType number)
495 {
496 if constexpr (std::is_arithmetic_v<RadixLimit>) {
497 if (number > std::numeric_limits<RadixLimit>::max() / RADIX) {
498 ThrowError("Number is too large");
499 }
500 }
501 }
502
503 template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
ScanNumberRadix(bool allowNumericSeparator)504 void Lexer::ScanNumberRadix(bool allowNumericSeparator)
505 {
506 RadixType number {};
507
508 auto cp = Iterator().Peek();
509 if (!RANGE_CHECK(cp)) {
510 ThrowError("Invalid digit");
511 }
512
513 bool allowNumericOnNext = true;
514
515 do {
516 cp = Iterator().Peek();
517 if (RANGE_CHECK(cp)) {
518 auto digit = HexValue(cp);
519
520 ScanTooLargeNumber<RANGE_CHECK, RADIX, RadixType, RadixLimit>(number);
521
522 number = number * RADIX + digit;
523 Iterator().Forward(1);
524 allowNumericOnNext = true;
525 continue;
526 }
527
528 if (cp == LEX_CHAR_UNDERSCORE) {
529 if (!allowNumericSeparator || !allowNumericOnNext) {
530 ThrowError("Invalid numeric separator");
531 }
532
533 GetToken().flags_ |= TokenFlags::NUMBER_HAS_UNDERSCORE;
534 Iterator().Forward(1);
535 allowNumericOnNext = false;
536 continue;
537 }
538
539 if (!allowNumericOnNext) {
540 Iterator().Backward(1);
541 ThrowError("Numeric separators are not allowed at the end of numeric literals");
542 }
543
544 break;
545 } while (true);
546
547 GetToken().number_ = lexer::Number(number);
548 }
549
HexValue(char32_t ch)550 inline uint32_t Lexer::HexValue(char32_t ch)
551 {
552 constexpr uint32_t HEX_MASK = 0xF;
553 constexpr uint32_t DEC_OFFSET = 10;
554 return ch < LEX_CHAR_UPPERCASE_A ? ch - LEX_CHAR_0 : ((ch - LEX_CHAR_UPPERCASE_A + DEC_OFFSET) & HEX_MASK);
555 }
556
IsDecimalDigit(uint32_t cp)557 inline bool Lexer::IsDecimalDigit(uint32_t cp)
558 {
559 return (cp >= LEX_CHAR_0 && cp <= LEX_CHAR_9);
560 }
561
IsHexDigit(char32_t ch)562 inline bool Lexer::IsHexDigit(char32_t ch)
563 {
564 return ch < LEX_ASCII_MAX_BITS && (std::isxdigit(static_cast<unsigned char>(ch)) != 0);
565 }
566
IsBinaryDigit(char32_t ch)567 inline bool Lexer::IsBinaryDigit(char32_t ch)
568 {
569 return ch == LEX_CHAR_0 || ch == LEX_CHAR_1;
570 }
571
IsOctalDigit(char32_t ch)572 inline bool Lexer::IsOctalDigit(char32_t ch)
573 {
574 return (ch >= LEX_CHAR_0 && ch <= LEX_CHAR_7);
575 }
576 } // namespace panda::es2panda::lexer
577
578 #endif
579