• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright (c) 2021-2022 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "keywordsUtil.h"
17 
18 #include <gen/keywords.h>
19 #include <lexer/lexer.h>
20 #include <unicode/uchar.h>
21 #include <util/enumbitops.h>
22 
23 namespace panda::es2panda::lexer {
24 
/**
 * Per-character classification bits for the ASCII fast path.
 * The values are distinct single bits so they can be combined with operator|
 * and tested with operator&.
 */
enum class AsciiFlags : uint8_t {
    NONE = 0x00,         // no identifier role
    ID_START = 0x01,     // may begin an identifier
    ID_CONTINUE = 0x02,  // may appear after the first identifier character
};
30 
operator |(AsciiFlags a,AsciiFlags b)31 constexpr AsciiFlags operator|(AsciiFlags a, AsciiFlags b)
32 {
33     using utype = std::underlying_type_t<AsciiFlags>;
34     return static_cast<AsciiFlags>(static_cast<utype>(a) | static_cast<utype>(b));
35 }
36 
operator &(AsciiFlags a,AsciiFlags b)37 inline std::underlying_type_t<AsciiFlags> operator&(AsciiFlags a, AsciiFlags b)
38 {
39     using utype = std::underlying_type_t<AsciiFlags>;
40     /* NOLINTNEXTLINE(hicpp-signed-bitwise) */
41     return static_cast<utype>(static_cast<utype>(a) & static_cast<utype>(b));
42 }
43 
44 constexpr std::array<AsciiFlags, 128> ASCII_FLAGS = {{
45     AsciiFlags::NONE,                               /* NUL */
46     AsciiFlags::NONE,                               /* SOH */
47     AsciiFlags::NONE,                               /* STX */
48     AsciiFlags::NONE,                               /* ETX */
49     AsciiFlags::NONE,                               /* EOT */
50     AsciiFlags::NONE,                               /* ENQ */
51     AsciiFlags::NONE,                               /* ACK */
52     AsciiFlags::NONE,                               /* BEL */
53     AsciiFlags::NONE,                               /* BS */
54     AsciiFlags::NONE,                               /* TAB */
55     AsciiFlags::NONE,                               /* LF */
56     AsciiFlags::NONE,                               /* VT */
57     AsciiFlags::NONE,                               /* FF */
58     AsciiFlags::NONE,                               /* CR */
59     AsciiFlags::NONE,                               /* SO */
60     AsciiFlags::NONE,                               /* SI */
61     AsciiFlags::NONE,                               /* DLE */
62     AsciiFlags::NONE,                               /* DC1 */
63     AsciiFlags::NONE,                               /* DC2 */
64     AsciiFlags::NONE,                               /* DC3 */
65     AsciiFlags::NONE,                               /* DC4 */
66     AsciiFlags::NONE,                               /* NAK */
67     AsciiFlags::NONE,                               /* SYN */
68     AsciiFlags::NONE,                               /* ETB */
69     AsciiFlags::NONE,                               /* CAN */
70     AsciiFlags::NONE,                               /* EM */
71     AsciiFlags::NONE,                               /* SUB */
72     AsciiFlags::NONE,                               /* ESC */
73     AsciiFlags::NONE,                               /* FS */
74     AsciiFlags::NONE,                               /* GS */
75     AsciiFlags::NONE,                               /* RS */
76     AsciiFlags::NONE,                               /* US */
77     AsciiFlags::NONE,                               /* Space */
78     AsciiFlags::NONE,                               /* ! */
79     AsciiFlags::NONE,                               /* " */
80     AsciiFlags::NONE,                               /* # */
81     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* $ */
82     AsciiFlags::NONE,                               /* % */
83     AsciiFlags::NONE,                               /* & */
84     AsciiFlags::NONE,                               /* ' */
85     AsciiFlags::NONE,                               /* ( */
86     AsciiFlags::NONE,                               /* ) */
87     AsciiFlags::NONE,                               /* * */
88     AsciiFlags::NONE,                               /* + */
89     AsciiFlags::NONE,                               /* , */
90     AsciiFlags::NONE,                               /* - */
91     AsciiFlags::NONE,                               /* . */
92     AsciiFlags::NONE,                               /* / */
93     AsciiFlags::ID_CONTINUE,                        /* 0 */
94     AsciiFlags::ID_CONTINUE,                        /* 1 */
95     AsciiFlags::ID_CONTINUE,                        /* 2 */
96     AsciiFlags::ID_CONTINUE,                        /* 3 */
97     AsciiFlags::ID_CONTINUE,                        /* 4 */
98     AsciiFlags::ID_CONTINUE,                        /* 5 */
99     AsciiFlags::ID_CONTINUE,                        /* 6 */
100     AsciiFlags::ID_CONTINUE,                        /* 7 */
101     AsciiFlags::ID_CONTINUE,                        /* 8 */
102     AsciiFlags::ID_CONTINUE,                        /* 9 */
103     AsciiFlags::NONE,                               /* : */
104     AsciiFlags::NONE,                               /* ; */
105     AsciiFlags::NONE,                               /* < */
106     AsciiFlags::NONE,                               /* = */
107     AsciiFlags::NONE,                               /* > */
108     AsciiFlags::NONE,                               /* ? */
109     AsciiFlags::NONE,                               /* @ */
110     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* A */
111     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* B */
112     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* C */
113     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* D */
114     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* E */
115     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* F */
116     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* G */
117     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* H */
118     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* I */
119     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* J */
120     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* K */
121     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* L */
122     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* M */
123     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* N */
124     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* O */
125     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* P */
126     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* Q */
127     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* R */
128     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* S */
129     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* T */
130     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* U */
131     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* V */
132     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* W */
133     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* X */
134     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* Y */
135     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* Z */
136     AsciiFlags::NONE,                               /* [ */
137     AsciiFlags::NONE,                               /* \ */
138     AsciiFlags::NONE,                               /* ] */
139     AsciiFlags::NONE,                               /* ^ */
140     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* _ */
141     AsciiFlags::NONE,                               /* ` */
142     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* a */
143     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* b */
144     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* c */
145     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* d */
146     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* e */
147     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* f */
148     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* g */
149     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* h */
150     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* i */
151     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* j */
152     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* k */
153     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* l */
154     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* m */
155     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* n */
156     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* o */
157     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* p */
158     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* q */
159     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* r */
160     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* s */
161     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* t */
162     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* u */
163     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* v */
164     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* w */
165     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* x */
166     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* y */
167     AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* z */
168     AsciiFlags::NONE,                               /* { */
169     AsciiFlags::NONE,                               /* | */
170     AsciiFlags::NONE,                               /* } */
171     AsciiFlags::NONE,                               /* ~ */
172     AsciiFlags::NONE                                /* DEL */
173 }};
174 
IsIdentifierStart(char32_t cp)175 bool KeywordsUtil::IsIdentifierStart(char32_t cp)
176 {
177     if (cp < LEX_ASCII_MAX_BITS) {
178         return (ASCII_FLAGS[cp] & AsciiFlags::ID_START) != 0;
179     }
180     // Unicode {xxxxx} may consist of 4 bytes information and cannot be forcibly converted to 2 bytes
181     auto uchar = static_cast<UChar32>(cp);
182     return u_hasBinaryProperty(uchar, UCHAR_ID_START);
183 }
184 
IsIdentifierPart(char32_t cp)185 bool KeywordsUtil::IsIdentifierPart(char32_t cp)
186 {
187     if (cp < LEX_ASCII_MAX_BITS) {
188         return (ASCII_FLAGS[cp] & AsciiFlags::ID_CONTINUE) != 0;
189     }
190 
191     /**
192      * u_isIDPart or Other_ID_Continue characters or ZWJ/ZWNJ.
193      * Unicode {xxxxx} may consist of 4 bytes information and cannot be forcibly converted to 2 bytes
194      */
195     auto uchar = static_cast<UChar32>(cp);
196     return (u_hasBinaryProperty(uchar, UCHAR_ID_CONTINUE) || cp == LEX_CHAR_ZWNJ || cp == LEX_CHAR_ZWJ);
197 }
198 
ScanIdentifierStart(char32_t cp)199 void KeywordsUtil::ScanIdentifierStart(char32_t cp)
200 {
201     if (!KeywordsUtil::IsIdentifierStart(cp)) {
202         lexer_->ThrowError("Expected an identifier");
203     }
204 
205     cp_ = cp;
206     const auto map = KeywordsMap::Map(cp);
207     ScanIdContinueMaybeKeyword(map);
208 }
209 
ScanIdContinue()210 void KeywordsUtil::ScanIdContinue()
211 {
212     util::UString ident(lexer_->Allocator());
213     size_t startPos = lexer_->GetToken().Start().index;
214 
215     if (HasEscape()) {
216         ident.Append(cp_);
217         startPos = Iterator().Index();
218     }
219 
220     auto escapeEnd = startPos;
221 
222     do {
223         if (Iterator().Peek() == LEX_CHAR_BACKSLASH) {
224             ident.Append(lexer_->SourceView(escapeEnd, Iterator().Index()));
225 
226             auto cp = ScanUnicodeEscapeSequence();
227             if (!IsIdentifierPart(cp)) {
228                 lexer_->ThrowError("Invalid identifier part");
229             }
230 
231             escapeEnd = Iterator().Index();
232             ident.Append(cp);
233             continue;
234         }
235 
236         size_t cpSize {};
237         auto cp = Iterator().PeekCp(&cpSize);
238         if (!IsIdentifierPart(cp)) {
239             break;
240         }
241 
242         Iterator().Forward(cpSize);
243     } while (true);
244 
245     lexer_->GetToken().type_ = TokenType::LITERAL_IDENT;
246     lexer_->GetToken().keywordType_ = TokenType::EOS;
247 
248     if (HasEscape()) {
249         ident.Append(lexer_->SourceView(escapeEnd, Iterator().Index()));
250         lexer_->GetToken().src_ = ident.View();
251     } else {
252         lexer_->GetToken().src_ = lexer_->SourceView(startPos, Iterator().Index());
253     }
254 }
255 
ScanIdContinueMaybeKeyword(Span<const KeywordString> map)256 void KeywordsUtil::ScanIdContinueMaybeKeyword(Span<const KeywordString> map)
257 {
258     ScanIdContinue();
259 
260     if (!HasEscape() || map.empty()) {
261         return;
262     }
263 
264     const auto &str = lexer_->GetToken().Ident().Utf8();
265 
266     int start = 0;
267     int end = static_cast<int>(map.size());
268     int middle = end / 2;
269 
270     while (true) {
271         const auto &kws = map[middle];
272 
273         int relation = str.compare(kws.str);
274         if (relation == 0) {
275             Keywords::SetKeyword(this, kws);
276         }
277 
278         if (relation > 0) {
279             start = middle + 1;
280         } else {
281             end = middle;
282         }
283 
284         middle = (start + end) / 2;
285 
286         if (start >= end) {
287             return;
288         }
289     }
290 }
291 
ScanUnicodeEscapeSequence()292 char32_t KeywordsUtil::ScanUnicodeEscapeSequence()
293 {
294     ASSERT(Iterator().Peek() == LEX_CHAR_BACKSLASH);
295 
296     lexer_->GetToken().flags_ |= lexer::TokenFlags::HAS_ESCAPE;
297 
298     Iterator().Forward(1);
299 
300     if (Iterator().Peek() != LEX_CHAR_LOWERCASE_U) {
301         return util::StringView::Iterator::INVALID_CP;
302     }
303 
304     return lexer_->ScanUnicodeEscapeSequence();
305 }
306 
307 }  // namespace panda::es2panda::lexer
308