1 /**
2 * Copyright (c) 2021 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #ifndef ES2PANDA_PARSER_CORE_LEXER_H
17 #define ES2PANDA_PARSER_CORE_LEXER_H
18
19 #include <lexer/regexp/regexp.h>
20 #include <lexer/token/letters.h>
21 #include <lexer/token/token.h>
22 #include <util/enumbitops.h>
23
24 namespace panda::es2panda::parser {
25 class ParserContext;
26 } // namespace panda::es2panda::parser
27
28 namespace panda::es2panda::lexer {
29
/**
 * Option bits that can be passed to Lexer::NextToken(). Every enumerator other
 * than NONE occupies its own bit, so the values do not overlap.
 */
enum class LexerNextTokenFlags {
    NONE = 0,
    KEYWORD_TO_IDENT = 1U << 0U,
    NUMERIC_SEPARATOR_ALLOWED = 1U << 1U,
    BIGINT_ALLOWED = 1U << 2U,
};
36
// Expands the DEFINE_BITOPS macro for LexerNextTokenFlags. Presumably this supplies the bitwise
// operators needed to combine and test the scoped-enum flags - confirm against <util/enumbitops.h>.
DEFINE_BITOPS(LexerNextTokenFlags)
38
39 class LexerPosition {
40 public:
41 explicit LexerPosition(const util::StringView &source);
42 DEFAULT_COPY_SEMANTIC(LexerPosition);
43 DEFAULT_MOVE_SEMANTIC(LexerPosition);
44 ~LexerPosition() = default;
45
46 Token token {};
47 util::StringView::Iterator iterator;
48 size_t line {};
49 size_t nextTokenLine {};
50 };
51
52 class LexerTemplateString {
53 public:
LexerTemplateString(ArenaAllocator * allocator)54 explicit LexerTemplateString(ArenaAllocator *allocator) : str(allocator) {}
55 DEFAULT_COPY_SEMANTIC(LexerTemplateString);
56 DEFAULT_MOVE_SEMANTIC(LexerTemplateString);
57 ~LexerTemplateString() = default;
58
59 util::UString str;
60 size_t end {};
61 bool scanExpression {};
62 };
63
64 class TemplateLiteralParserContext;
65
66 class Lexer {
67 public:
68 explicit Lexer(const parser::ParserContext *parserContext);
69 NO_COPY_SEMANTIC(Lexer);
70 NO_MOVE_SEMANTIC(Lexer);
71 ~Lexer() = default;
72
73 void NextToken(LexerNextTokenFlags flags = LexerNextTokenFlags::NONE);
74
75 Token &GetToken();
76 const Token &GetToken() const;
77 size_t Line() const;
78
79 LexerPosition Save() const;
80 void Rewind(const LexerPosition &pos);
81 void BackwardToken(TokenType type, size_t offset);
82 void ForwardToken(TokenType type, size_t offset);
83
84 char32_t Lookahead();
85 bool CheckArrow();
86
87 RegExp ScanRegExp();
88 template <char32_t end>
89 void ScanString();
90 void ResetTokenEnd();
91 LexerTemplateString ScanTemplateString();
92 void ScanTemplateStringEnd();
93 void PushTemplateContext(TemplateLiteralParserContext *ctx);
94
95 private:
96 ArenaAllocator *Allocator();
97 bool IsLineTerminatorOrEos() const;
98 void ScanRegExpPattern();
99 RegExpFlags ScanRegExpFlags();
100
101 void ThrowError(std::string_view message);
102
103 void SetTokenStart();
104 void SetTokenEnd();
105
Iterator()106 inline util::StringView::Iterator &Iterator()
107 {
108 return pos_.iterator;
109 }
110
Iterator()111 inline const util::StringView::Iterator &Iterator() const
112 {
113 return pos_.iterator;
114 }
115
116 util::StringView SourceView(const util::StringView::Iterator &begin, const util::StringView::Iterator &end) const;
117 util::StringView SourceView(size_t begin, size_t end) const;
118
119 void SkipWhiteSpaces();
120 void SkipSingleLineComment();
121 void SkipMultiLineComment();
122 template <TokenType keyword_type>
123 void CheckKeyword([[maybe_unused]] TokenType type, [[maybe_unused]] LexerNextTokenFlags flags);
124 void CheckKeywordEscape(TokenType type);
125 void CheckAwaitKeyword();
126 void CheckEnumKeyword();
127 void CheckLetKeyword();
128 void CheckYieldKeyword();
129 void CheckFutureReservedKeyword(TokenType keywordType);
130
131 bool ScanPunctuator();
132 void ScanQuestionPunctuator();
133 void ScanLessThanPunctuator();
134 void ScanGreaterThanPunctuator();
135 void ScanEqualsPunctuator();
136 void ScanExclamationPunctuator();
137 void ScanAmpersandPunctuator();
138 void ScanVLinePunctuator();
139 void ScanCircumflexPunctuator();
140 void ScanPlusPunctuator();
141 void ScanMinusPunctuator();
142 void ScanAsterixPunctuator();
143 void ScanSlashPunctuator();
144 void ScanPercentPunctuator();
145 void ScanDotPunctuator();
146
147 char32_t ScanUnicodeEscapeSequence();
148 template <int N>
149 char32_t ScanHexEscape();
150 char32_t ScanUnicodeCodePointEscape();
151
152 void ScanStringUnicodePart(util::UString *str);
153
154 void ScanNumberLeadingZero();
155 void ScanDecimalNumbers(bool allowNumericSeparator);
156 template <bool rangeCheck(char32_t), int radix>
157 void ScanNumberRadix(bool allowNumericSeparator = true);
158 void ScanNumber(bool allowNumericSeparator = true, bool allowBigInt = true);
159 void ConvertNumber(size_t exponentSignPos);
160 void ScanDecimalLiteral();
161 void ScanDecimalDigits(bool allowNumericSeparator);
162 void CheckNumberLiteralEnd();
163
164 inline static uint32_t HexValue(char32_t ch);
165 inline static bool IsDecimalDigit(uint32_t cp);
166 inline static bool IsHexDigit(char32_t ch);
167 inline static bool IsBinaryDigit(char32_t ch);
168 inline static bool IsOctalDigit(char32_t ch);
169
170 friend class KeywordsUtil;
171 friend class TemplateLiteralParserContext;
172 TemplateLiteralParserContext *tlCtx_ {};
173 ArenaAllocator *allocator_;
174 const parser::ParserContext *parserContext_;
175 util::StringView source_;
176 LexerPosition pos_;
177 bool isUnderscore_ = false;
178 };
179
180 class TemplateLiteralParserContext {
181 public:
TemplateLiteralParserContext(Lexer * lexer)182 explicit TemplateLiteralParserContext(Lexer *lexer) : lexer_(lexer), prev_(lexer_->tlCtx_) {}
183 NO_MOVE_SEMANTIC(TemplateLiteralParserContext);
184 NO_COPY_SEMANTIC(TemplateLiteralParserContext);
185
~TemplateLiteralParserContext()186 ~TemplateLiteralParserContext()
187 {
188 lexer_->tlCtx_ = prev_;
189 }
190
ConsumeLeftBrace()191 void ConsumeLeftBrace()
192 {
193 braceDepth_++;
194 }
195
ConsumeRightBrace()196 bool ConsumeRightBrace()
197 {
198 braceDepth_--;
199
200 return braceDepth_ == 0;
201 }
202
203 private:
204 Lexer *lexer_;
205 TemplateLiteralParserContext *prev_ {};
206 size_t braceDepth_ {1};
207 };
208
209 template <char32_t end>
ScanString()210 void Lexer::ScanString()
211 {
212 util::UString str(Allocator());
213 GetToken().type_ = TokenType::LITERAL_STRING;
214
215 const auto startPos = Iterator().Index();
216 auto escapeEnd = startPos;
217
218 do {
219 char32_t cp = Iterator().Peek();
220
221 switch (cp) {
222 case util::StringView::Iterator::INVALID_CP: {
223 ThrowError("Unterminated string");
224 break;
225 }
226 case LEX_CHAR_CR:
227 case LEX_CHAR_LF: {
228 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
229 if constexpr (end != LEX_CHAR_BACK_TICK) {
230 ThrowError("Newline is not allowed in strings");
231 }
232
233 GetToken().flags_ |= TokenFlags::HAS_ESCAPE;
234 str.Append(SourceView(escapeEnd, Iterator().Index()));
235
236 if (cp == LEX_CHAR_CR) {
237 Iterator().Forward(1);
238
239 if (Iterator().Peek() != LEX_CHAR_LF) {
240 Iterator().Backward(1);
241 }
242 }
243
244 pos_.line++;
245 str.Append(LEX_CHAR_LF);
246 Iterator().Forward(1);
247 escapeEnd = Iterator().Index();
248 continue;
249 }
250 case LEX_CHAR_BACKSLASH: {
251 GetToken().flags_ |= TokenFlags::HAS_ESCAPE;
252 str.Append(SourceView(escapeEnd, Iterator().Index()));
253
254 Iterator().Forward(1);
255 ScanStringUnicodePart(&str);
256 escapeEnd = Iterator().Index();
257 continue;
258 }
259 case LEX_CHAR_BACK_TICK:
260 case LEX_CHAR_SINGLE_QUOTE:
261 case LEX_CHAR_DOUBLE_QUOTE: {
262 if (end == cp) {
263 break;
264 }
265
266 Iterator().Forward(1);
267 continue;
268 }
269 case LEX_CHAR_DOLLAR_SIGN: {
270 Iterator().Forward(1);
271
272 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
273 if constexpr (end == LEX_CHAR_BACK_TICK) {
274 if (Iterator().Peek() == LEX_CHAR_LEFT_BRACE) {
275 Iterator().Backward(1);
276 break;
277 }
278 }
279
280 continue;
281 }
282 default: {
283 Iterator().SkipCp();
284 continue;
285 }
286 }
287
288 if (GetToken().flags_ & TokenFlags::HAS_ESCAPE) {
289 str.Append(SourceView(escapeEnd, Iterator().Index()));
290 GetToken().src_ = str.View();
291 } else {
292 GetToken().src_ = SourceView(startPos, Iterator().Index());
293 }
294
295 break;
296 } while (true);
297
298 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
299 if constexpr (end != LEX_CHAR_BACK_TICK) {
300 Iterator().Forward(1);
301 }
302 }
303
304 template <int N>
ScanHexEscape()305 char32_t Lexer::ScanHexEscape()
306 {
307 char32_t code = 0;
308
309 for (size_t i = 0; i < N; ++i) {
310 const auto cp = Iterator().Peek();
311 Iterator().Forward(1);
312
313 if (!IsHexDigit(cp)) {
314 ThrowError("Invalid unicode escape sequence");
315 }
316
317 constexpr auto MULTIPLIER = 16;
318 code = code * MULTIPLIER + HexValue(cp);
319 }
320
321 return code;
322 }
323
324 template <bool rangeCheck(char32_t), int radix>
ScanNumberRadix(bool allowNumericSeparator)325 void Lexer::ScanNumberRadix(bool allowNumericSeparator)
326 {
327 double number = 0.0;
328
329 auto cp = Iterator().Peek();
330 if (!rangeCheck(cp)) {
331 ThrowError("Invalid digit");
332 }
333
334 bool allowNumericOnNext = true;
335
336 do {
337 cp = Iterator().Peek();
338 if (rangeCheck(cp)) {
339 number = number * radix + HexValue(cp);
340 Iterator().Forward(1);
341 allowNumericOnNext = true;
342 continue;
343 }
344
345 if (cp == LEX_CHAR_UNDERSCORE) {
346 if (!allowNumericSeparator || !allowNumericOnNext) {
347 ThrowError("Invalid numeric separator");
348 }
349
350 GetToken().flags_ |= TokenFlags::NUMBER_HAS_UNDERSCORE;
351 Iterator().Forward(1);
352 allowNumericOnNext = false;
353 continue;
354 }
355
356 if (!allowNumericOnNext) {
357 Iterator().Backward(1);
358 ThrowError("Numeric separators are not allowed at the end of numeric literals");
359 }
360
361 break;
362 } while (true);
363
364 GetToken().number_ = number;
365 }
366
367 template <TokenType keyword_type>
CheckKeyword(TokenType type,LexerNextTokenFlags flags)368 void Lexer::CheckKeyword([[maybe_unused]] TokenType type, [[maybe_unused]] LexerNextTokenFlags flags)
369 {
370 // NOLINTNEXTLINE
371 if constexpr (keyword_type == TokenType::KEYW_AWAIT) {
372 CheckAwaitKeyword();
373 return;
374 }
375
376 // NOLINTNEXTLINE
377 if constexpr (keyword_type == TokenType::KEYW_ENUM) {
378 CheckEnumKeyword();
379 return;
380 }
381
382 // NOLINTNEXTLINE
383 if constexpr (keyword_type == TokenType::KEYW_YIELD) {
384 CheckYieldKeyword();
385 return;
386 }
387
388 // NOLINTNEXTLINE
389 if constexpr (keyword_type == TokenType::KEYW_LET) {
390 CheckLetKeyword();
391 return;
392 }
393
394 // NOLINTNEXTLINE
395 if constexpr (keyword_type <= TokenType::KEYW_ASYNC) {
396 CheckKeywordEscape(type);
397 return;
398 }
399
400 // NOLINTNEXTLINE
401 if constexpr (keyword_type >= TokenType::KEYW_PUBLIC) {
402 // NOLINTNEXTLINE
403 CheckFutureReservedKeyword(keyword_type);
404 return;
405 }
406
407 GetToken().type_ = TokenType::LITERAL_IDENT;
408 }
409
HexValue(char32_t ch)410 inline uint32_t Lexer::HexValue(char32_t ch)
411 {
412 constexpr uint32_t HEX_MASK = 0xF;
413 constexpr uint32_t DEC_OFFSET = 10;
414 return ch < LEX_CHAR_UPPERCASE_A ? ch - LEX_CHAR_0 : ((ch - LEX_CHAR_UPPERCASE_A + DEC_OFFSET) & HEX_MASK);
415 }
416
IsDecimalDigit(uint32_t cp)417 inline bool Lexer::IsDecimalDigit(uint32_t cp)
418 {
419 return (cp >= LEX_CHAR_0 && cp <= LEX_CHAR_9);
420 }
421
IsHexDigit(char32_t ch)422 inline bool Lexer::IsHexDigit(char32_t ch)
423 {
424 return ch < LEX_ASCII_MAX_BITS && std::isxdigit(static_cast<unsigned char>(ch));
425 }
426
IsBinaryDigit(char32_t ch)427 inline bool Lexer::IsBinaryDigit(char32_t ch)
428 {
429 return ch == LEX_CHAR_0 || ch == LEX_CHAR_1;
430 }
431
IsOctalDigit(char32_t ch)432 inline bool Lexer::IsOctalDigit(char32_t ch)
433 {
434 return (ch >= LEX_CHAR_0 && ch <= LEX_CHAR_7);
435 }
436
437 } // namespace panda::es2panda::lexer
438
439 #endif
440