1 /**
2 * Copyright (c) 2021-2022 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "keywordsUtil.h"
17
18 #include <gen/keywords.h>
19 #include <lexer/lexer.h>
20 #include <unicode/uchar.h>
21 #include <util/enumbitops.h>
22
23 namespace panda::es2panda::lexer {
24
/**
 * Classification bits attached to a single ASCII code point.
 * The values are distinct bits so that they can be combined with operator| and
 * tested with operator&.
 */
enum class AsciiFlags : uint8_t {
    NONE = 0U,               /* neither of the bits below is set */
    ID_START = 1U << 0U,     /* bit 0: tested by KeywordsUtil::IsIdentifierStart */
    ID_CONTINUE = 1U << 1U,  /* bit 1: tested by KeywordsUtil::IsIdentifierPart */
};
30
operator |(AsciiFlags a,AsciiFlags b)31 constexpr AsciiFlags operator|(AsciiFlags a, AsciiFlags b)
32 {
33 using utype = std::underlying_type_t<AsciiFlags>;
34 return static_cast<AsciiFlags>(static_cast<utype>(a) | static_cast<utype>(b));
35 }
36
operator &(AsciiFlags a,AsciiFlags b)37 inline std::underlying_type_t<AsciiFlags> operator&(AsciiFlags a, AsciiFlags b)
38 {
39 using utype = std::underlying_type_t<AsciiFlags>;
40 /* NOLINTNEXTLINE(hicpp-signed-bitwise) */
41 return static_cast<utype>(static_cast<utype>(a) & static_cast<utype>(b));
42 }
43
44 constexpr std::array<AsciiFlags, 128> ASCII_FLAGS = {{
45 AsciiFlags::NONE, /* NUL */
46 AsciiFlags::NONE, /* SOH */
47 AsciiFlags::NONE, /* STX */
48 AsciiFlags::NONE, /* ETX */
49 AsciiFlags::NONE, /* EOT */
50 AsciiFlags::NONE, /* ENQ */
51 AsciiFlags::NONE, /* ACK */
52 AsciiFlags::NONE, /* BEL */
53 AsciiFlags::NONE, /* BS */
54 AsciiFlags::NONE, /* TAB */
55 AsciiFlags::NONE, /* LF */
56 AsciiFlags::NONE, /* VT */
57 AsciiFlags::NONE, /* FF */
58 AsciiFlags::NONE, /* CR */
59 AsciiFlags::NONE, /* SO */
60 AsciiFlags::NONE, /* SI */
61 AsciiFlags::NONE, /* DLE */
62 AsciiFlags::NONE, /* DC1 */
63 AsciiFlags::NONE, /* DC2 */
64 AsciiFlags::NONE, /* DC3 */
65 AsciiFlags::NONE, /* DC4 */
66 AsciiFlags::NONE, /* NAK */
67 AsciiFlags::NONE, /* SYN */
68 AsciiFlags::NONE, /* ETB */
69 AsciiFlags::NONE, /* CAN */
70 AsciiFlags::NONE, /* EM */
71 AsciiFlags::NONE, /* SUB */
72 AsciiFlags::NONE, /* ESC */
73 AsciiFlags::NONE, /* FS */
74 AsciiFlags::NONE, /* GS */
75 AsciiFlags::NONE, /* RS */
76 AsciiFlags::NONE, /* US */
77 AsciiFlags::NONE, /* Space */
78 AsciiFlags::NONE, /* ! */
79 AsciiFlags::NONE, /* " */
80 AsciiFlags::NONE, /* # */
81 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* $ */
82 AsciiFlags::NONE, /* % */
83 AsciiFlags::NONE, /* & */
84 AsciiFlags::NONE, /* ' */
85 AsciiFlags::NONE, /* ( */
86 AsciiFlags::NONE, /* ) */
87 AsciiFlags::NONE, /* * */
88 AsciiFlags::NONE, /* + */
89 AsciiFlags::NONE, /* , */
90 AsciiFlags::NONE, /* - */
91 AsciiFlags::NONE, /* . */
92 AsciiFlags::NONE, /* / */
93 AsciiFlags::ID_CONTINUE, /* 0 */
94 AsciiFlags::ID_CONTINUE, /* 1 */
95 AsciiFlags::ID_CONTINUE, /* 2 */
96 AsciiFlags::ID_CONTINUE, /* 3 */
97 AsciiFlags::ID_CONTINUE, /* 4 */
98 AsciiFlags::ID_CONTINUE, /* 5 */
99 AsciiFlags::ID_CONTINUE, /* 6 */
100 AsciiFlags::ID_CONTINUE, /* 7 */
101 AsciiFlags::ID_CONTINUE, /* 8 */
102 AsciiFlags::ID_CONTINUE, /* 9 */
103 AsciiFlags::NONE, /* : */
104 AsciiFlags::NONE, /* ; */
105 AsciiFlags::NONE, /* < */
106 AsciiFlags::NONE, /* = */
107 AsciiFlags::NONE, /* > */
108 AsciiFlags::NONE, /* ? */
109 AsciiFlags::NONE, /* @ */
110 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* A */
111 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* B */
112 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* C */
113 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* D */
114 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* E */
115 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* F */
116 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* G */
117 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* H */
118 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* I */
119 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* J */
120 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* K */
121 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* L */
122 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* M */
123 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* N */
124 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* O */
125 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* P */
126 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* Q */
127 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* R */
128 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* S */
129 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* T */
130 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* U */
131 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* V */
132 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* W */
133 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* X */
134 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* Y */
135 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* Z */
136 AsciiFlags::NONE, /* [ */
137 AsciiFlags::NONE, /* \ */
138 AsciiFlags::NONE, /* ] */
139 AsciiFlags::NONE, /* ^ */
140 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* _ */
141 AsciiFlags::NONE, /* ` */
142 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* a */
143 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* b */
144 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* c */
145 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* d */
146 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* e */
147 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* f */
148 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* g */
149 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* h */
150 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* i */
151 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* j */
152 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* k */
153 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* l */
154 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* m */
155 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* n */
156 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* o */
157 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* p */
158 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* q */
159 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* r */
160 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* s */
161 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* t */
162 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* u */
163 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* v */
164 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* w */
165 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* x */
166 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* y */
167 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* z */
168 AsciiFlags::NONE, /* { */
169 AsciiFlags::NONE, /* | */
170 AsciiFlags::NONE, /* } */
171 AsciiFlags::NONE, /* ~ */
172 AsciiFlags::NONE /* DEL */
173 }};
174
IsIdentifierStart(char32_t cp)175 bool KeywordsUtil::IsIdentifierStart(char32_t cp)
176 {
177 if (cp < LEX_ASCII_MAX_BITS) {
178 return (ASCII_FLAGS[cp] & AsciiFlags::ID_START) != 0;
179 }
180 // Unicode {xxxxx} may consist of 4 bytes information and cannot be forcibly converted to 2 bytes
181 auto uchar = static_cast<UChar32>(cp);
182 return u_hasBinaryProperty(uchar, UCHAR_ID_START);
183 }
184
IsIdentifierPart(char32_t cp)185 bool KeywordsUtil::IsIdentifierPart(char32_t cp)
186 {
187 if (cp < LEX_ASCII_MAX_BITS) {
188 return (ASCII_FLAGS[cp] & AsciiFlags::ID_CONTINUE) != 0;
189 }
190
191 /**
192 * u_isIDPart or Other_ID_Continue characters or ZWJ/ZWNJ.
193 * Unicode {xxxxx} may consist of 4 bytes information and cannot be forcibly converted to 2 bytes
194 */
195 auto uchar = static_cast<UChar32>(cp);
196 return (u_hasBinaryProperty(uchar, UCHAR_ID_CONTINUE) || cp == LEX_CHAR_ZWNJ || cp == LEX_CHAR_ZWJ);
197 }
198
ScanIdentifierStart(char32_t cp)199 void KeywordsUtil::ScanIdentifierStart(char32_t cp)
200 {
201 if (!KeywordsUtil::IsIdentifierStart(cp)) {
202 lexer_->ThrowError("Expected an identifier");
203 }
204
205 cp_ = cp;
206 const auto map = KeywordsMap::Map(cp);
207 ScanIdContinueMaybeKeyword(map);
208 }
209
ScanIdContinue()210 void KeywordsUtil::ScanIdContinue()
211 {
212 util::UString ident(lexer_->Allocator());
213 size_t startPos = lexer_->GetToken().Start().index;
214
215 if (HasEscape()) {
216 ident.Append(cp_);
217 startPos = Iterator().Index();
218 }
219
220 auto escapeEnd = startPos;
221
222 do {
223 if (Iterator().Peek() == LEX_CHAR_BACKSLASH) {
224 ident.Append(lexer_->SourceView(escapeEnd, Iterator().Index()));
225
226 auto cp = ScanUnicodeEscapeSequence();
227 if (!IsIdentifierPart(cp)) {
228 lexer_->ThrowError("Invalid identifier part");
229 }
230
231 escapeEnd = Iterator().Index();
232 ident.Append(cp);
233 continue;
234 }
235
236 size_t cpSize {};
237 auto cp = Iterator().PeekCp(&cpSize);
238 if (!IsIdentifierPart(cp)) {
239 break;
240 }
241
242 Iterator().Forward(cpSize);
243 } while (true);
244
245 lexer_->GetToken().type_ = TokenType::LITERAL_IDENT;
246 lexer_->GetToken().keywordType_ = TokenType::EOS;
247
248 if (HasEscape()) {
249 ident.Append(lexer_->SourceView(escapeEnd, Iterator().Index()));
250 lexer_->GetToken().src_ = ident.View();
251 } else {
252 lexer_->GetToken().src_ = lexer_->SourceView(startPos, Iterator().Index());
253 }
254 }
255
ScanIdContinueMaybeKeyword(Span<const KeywordString> map)256 void KeywordsUtil::ScanIdContinueMaybeKeyword(Span<const KeywordString> map)
257 {
258 ScanIdContinue();
259
260 if (!HasEscape() || map.empty()) {
261 return;
262 }
263
264 const auto &str = lexer_->GetToken().Ident().Utf8();
265
266 int start = 0;
267 int end = static_cast<int>(map.size());
268 int middle = end / 2;
269
270 while (true) {
271 const auto &kws = map[middle];
272
273 int relation = str.compare(kws.str);
274 if (relation == 0) {
275 Keywords::SetKeyword(this, kws);
276 }
277
278 if (relation > 0) {
279 start = middle + 1;
280 } else {
281 end = middle;
282 }
283
284 middle = (start + end) / 2;
285
286 if (start >= end) {
287 return;
288 }
289 }
290 }
291
ScanUnicodeEscapeSequence()292 char32_t KeywordsUtil::ScanUnicodeEscapeSequence()
293 {
294 ASSERT(Iterator().Peek() == LEX_CHAR_BACKSLASH);
295
296 lexer_->GetToken().flags_ |= lexer::TokenFlags::HAS_ESCAPE;
297
298 Iterator().Forward(1);
299
300 if (Iterator().Peek() != LEX_CHAR_LOWERCASE_U) {
301 return util::StringView::Iterator::INVALID_CP;
302 }
303
304 return lexer_->ScanUnicodeEscapeSequence();
305 }
306
307 } // namespace panda::es2panda::lexer
308