/*
 * Copyright (c) 2021-2023 Huawei Device Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "lexer.h"

namespace panda::pandasm {

/*-------------------------------*/

/* Is this a delimiter? */
Token::Type FindDelim(char c)
{
    /* The map of delimiters */
    static const std::unordered_map<char, Token::Type> DELIM = {{',', Token::Type::DEL_COMMA},
                                                                {':', Token::Type::DEL_COLON},
                                                                {'{', Token::Type::DEL_BRACE_L},
                                                                {'}', Token::Type::DEL_BRACE_R},
                                                                {'(', Token::Type::DEL_BRACKET_L},
                                                                {')', Token::Type::DEL_BRACKET_R},
                                                                {'<', Token::Type::DEL_LT},
                                                                {'>', Token::Type::DEL_GT},
                                                                {'=', Token::Type::DEL_EQ},
                                                                {'[', Token::Type::DEL_SQUARE_BRACKET_L},
                                                                {']', Token::Type::DEL_SQUARE_BRACKET_R}};

    auto iter = DELIM.find(c);
    if (iter == DELIM.end()) {
        return Token::Type::ID_BAD;
    }

    return iter->second;
}

Token::Type FindOperation(std::string_view s)
{
    /* Generate the map of OPERATIONS from ISA: */
    static const std::unordered_map<std::string_view, Token::Type> OPERATIONS = {
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define OPLIST(inst_code, name, optype, width, flags, dst_idx, use_idxs, prof_size) \
    {std::string_view(name), Token::Type::ID_OP_##inst_code},
        PANDA_INSTRUCTION_LIST(OPLIST)
#undef OPLIST
    };

    auto iter = OPERATIONS.find(s);
    if (iter == OPERATIONS.end()) {
        return Token::Type::ID_BAD;
    }

    return iter->second;
}

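/*
 * For illustration: the OPLIST expansion in FindOperation() above contributes
 * one map entry per ISA instruction. Assuming a hypothetical ISA entry whose
 * inst_code is MOV and whose name is "mov", the generated initializer element
 * would be equivalent to:
 *
 *   {std::string_view("mov"), Token::Type::ID_OP_MOV},
 */
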
Token::Type Findkeyword(std::string_view s)
{
    /* Generate the map of KEYWORDS: */
    static const std::unordered_map<std::string_view, Token::Type> KEYWORDS = {
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define KEYWORDS(name, inst_code) {std::string_view(name), Token::Type::ID_##inst_code},
        KEYWORDS_LIST(KEYWORDS)
#undef KEYWORDS
    };

    auto iter = KEYWORDS.find(s);
    if (iter == KEYWORDS.end()) {
        return Token::Type::ID_BAD;
    }

    return iter->second;
}

std::string_view TokenTypeWhat(Token::Type t)
{
    if (t >= Token::Type::OPERATION && t < Token::Type::KEYWORD) {
        return "OPERATION";
    }

    if (t >= Token::Type::KEYWORD) {
        return "KEYWORD";
    }

    switch (t) {
        case Token::Type::ID_BAD: {
            return "ID_BAD";
        }
        case Token::Type::DEL_COMMA: {
            return "DEL_COMMA";
        }
        case Token::Type::DEL_COLON: {
            return "DEL_COLON";
        }
        case Token::Type::DEL_BRACE_L: {
            return "DEL_BRACE_L";
        }
        case Token::Type::DEL_BRACE_R: {
            return "DEL_BRACE_R";
        }
        case Token::Type::DEL_BRACKET_L: {
            return "DEL_BRACKET_L";
        }
        case Token::Type::DEL_BRACKET_R: {
            return "DEL_BRACKET_R";
        }
        case Token::Type::DEL_SQUARE_BRACKET_L: {
            return "DEL_SQUARE_BRACKET_L";
        }
        case Token::Type::DEL_SQUARE_BRACKET_R: {
            return "DEL_SQUARE_BRACKET_R";
        }
        case Token::Type::DEL_GT: {
            return "DEL_GT";
        }
        case Token::Type::DEL_LT: {
            return "DEL_LT";
        }
        case Token::Type::DEL_EQ: {
            return "DEL_EQ";
        }
        case Token::Type::DEL_DOT: {
            return "DEL_DOT";
        }
        case Token::Type::ID: {
            return "ID";
        }
        case Token::Type::ID_STRING: {
            return "ID_STRING";
        }
        default:
            return "NONE";
    }
}

static bool IsQuote(char c)
{
    return c == '"';
}

Lexer::Lexer()
{
    LOG(DEBUG, ASSEMBLER) << "instance of class Lexer initialized";
}

Lexer::~Lexer()
{
    LOG(DEBUG, ASSEMBLER) << "instance of class Lexer destructed";
}

Tokens Lexer::TokenizeString(const std::string &sourceStr)
{
    LOG(DEBUG, ASSEMBLER) << "started tokenizing line " << (lines_.size() + 1) << ": ";

    lines_.emplace_back(sourceStr);

    currLine_ = &lines_.back();

    LOG(DEBUG, ASSEMBLER) << std::string_view(&*(currLine_->buffer.begin() + currLine_->pos),
                                              currLine_->end - currLine_->pos);

    AnalyzeLine();

    LOG(DEBUG, ASSEMBLER) << "tokenization of line " << lines_.size() << " is successful";
    LOG(DEBUG, ASSEMBLER) << " tokens identified: ";

    for (const auto &fI : lines_.back().tokens) {
        LOG(DEBUG, ASSEMBLER) << "\n "
                              << std::string_view(&*(fI.wholeLine.begin() + fI.boundLeft), fI.boundRight - fI.boundLeft)
                              << " (type: " << TokenTypeWhat(fI.type) << ")";

        LOG(DEBUG, ASSEMBLER);
        LOG(DEBUG, ASSEMBLER);
    }
    return std::pair<std::vector<Token>, Error>(lines_.back().tokens, err_);
}

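/*
 * Usage sketch (hypothetical caller, for illustration only):
 *
 *   Lexer lexer;
 *   Tokens res = lexer.TokenizeString("main:");
 *   // res.first holds the tokens of the line ("main" and ":" here),
 *   // res.second holds the error state of the lexer.
 */
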
/* End of line? */
bool Lexer::Eol() const
{
    return currLine_->pos == currLine_->end;
}

/* Return the type of token */
Token::Type Lexer::LexGetType(size_t beg, size_t end) const
{
    Token::Type type = FindDelim(currLine_->buffer[beg]);
    if (type != Token::Type::ID_BAD) { /* delimiter */
        return type;
    }

    std::string_view p(&*(currLine_->buffer.begin() + beg), end - beg);

    type = Findkeyword(p);
    if (type != Token::Type::ID_BAD) {
        return type;
    }

    type = FindOperation(p);
    if (type != Token::Type::ID_BAD) {
        return type;
    }

    if (IsQuote(currLine_->buffer[beg])) {
        return Token::Type::ID_STRING;
    }

    return Token::Type::ID; /* other */
}

/* Handle string literal */
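/*
 * For example, the input
 *   "ab\"cd"
 * is consumed as a single ID_STRING token that spans both enclosing quotes:
 * the backslash marks an escape sequence, so the escaped inner quote does
 * not terminate the literal.
 */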
bool Lexer::LexString()
{
    bool isEscapeSeq = false;
    char quote = currLine_->buffer[currLine_->pos];
    size_t begin = currLine_->pos;
    while (!Eol()) {
        ++(currLine_->pos);

        char c = currLine_->buffer[currLine_->pos];

        if (isEscapeSeq) {
            isEscapeSeq = false;
            continue;
        }

        if (c == '\\') {
            isEscapeSeq = true;
        }

        if (c == quote) {
            break;
        }
    }

    if (currLine_->buffer[currLine_->pos] != quote) {
        err_ = Error(std::string("Missing terminating ") + quote + " character", 0,
                     Error::ErrorType::ERR_STRING_MISSING_TERMINATING_CHARACTER, "", begin, currLine_->pos,
                     currLine_->buffer);
        return false;
    }

    ++(currLine_->pos);

    return true;
}

void Lexer::UpdateCurLinePos()
{
    if (FindDelim(currLine_->buffer[currLine_->pos]) != Token::Type::ID_BAD) {
        /* A delimiter forms a one-character token */
        ++(currLine_->pos);
    } else if (IsQuote(currLine_->buffer[currLine_->pos])) {
        if (!LexString()) {
            return;
        }
    } else {
        /* Consume an identifier-like token; square brackets that are followed
         * by more non-space characters before the end of the line are absorbed
         * into the same token. */
        while (!Eol() && FindDelim(currLine_->buffer[currLine_->pos]) == Token::Type::ID_BAD &&
               isspace(currLine_->buffer[currLine_->pos]) == 0) {
            ++(currLine_->pos);
            size_t position = currLine_->pos;
            while (FindDelim(currLine_->buffer[position]) == Token::Type::DEL_SQUARE_BRACKET_L ||
                   FindDelim(currLine_->buffer[position]) == Token::Type::DEL_SQUARE_BRACKET_R) {
                position++;
            }
            if (isspace(currLine_->buffer[position]) == 0 && (position != currLine_->end)) {
                currLine_->pos = position;
            }
        }
    }
}

/*
 * Token handling: set the boundLeft and boundRight fields (bound_l and
 * bound_r below) of each element of the tokens array to the position of
 * the first character of the corresponding token and to the position one
 * past its last character.
 *
 *                                          bound_r1  bound_r2  bound_r3
 *                                          |         |         |
 *                                          v         v         v
 *   token1 token2 token3 ...   =>   token1    token2    token3    ...
 *                                   ^         ^         ^
 *                                   |         |         |
 *                                   bound_l1  bound_l2  bound_l3
 */
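/*
 * For example, given the line "mov v0, v1" (assuming "mov" is a mnemonic
 * from PANDA_INSTRUCTION_LIST), LexTokens() records four tokens with the
 * half-open bounds [boundLeft, boundRight):
 *
 *   "mov" [0, 3)   "v0" [4, 6)   "," [6, 7)   "v1" [8, 10)
 */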
void Lexer::LexTokens()
{
    if (Eol()) {
        return;
    }

    LOG(DEBUG, ASSEMBLER) << "token search started (line " << lines_.size() << "): "
                          << std::string_view(&*(currLine_->buffer.begin() + currLine_->pos),
                                              currLine_->end - currLine_->pos);

    while (currLine_->end > currLine_->pos && isspace(currLine_->buffer[currLine_->end - 1]) != 0) {
        --(currLine_->end);
    }

    while (isspace(currLine_->buffer[currLine_->pos]) != 0 && !Eol()) {
        ++(currLine_->pos);
    }

    size_t boundRight;

    size_t boundLeft;

    while (!Eol()) {
        boundLeft = currLine_->pos;

        UpdateCurLinePos();

        boundRight = currLine_->pos;

        LOG(DEBUG, ASSEMBLER) << "token identified (line " << lines_.size() << ", "
                              << "token " << currLine_->tokens.size() + 1 << "): "
                              << std::string_view(&*(currLine_->buffer.begin() + boundLeft), boundRight - boundLeft)
                              << " ("
                              << "type: " << TokenTypeWhat(LexGetType(boundLeft, boundRight)) << ")";

        currLine_->tokens.emplace_back(boundLeft, boundRight, LexGetType(boundLeft, boundRight), currLine_->buffer);

        while (isspace(currLine_->buffer[currLine_->pos]) != 0 && !Eol()) {
            ++(currLine_->pos);
        }
    }

    LOG(DEBUG, ASSEMBLER) << "all tokens identified (line " << lines_.size() << ")";
}

/*
 * Ignore comments:
 * find PARSE_COMMENT_MARKER and move line->end to the position just past
 * the last character of the last significant (i.e. non-comment) element
 * of the current line (line->buffer).
 *
 * Ex:
 *   [Label:] operation operand[,operand] [# comment]
 *
 *   L1: mov v0, v1 # moving!          L1: mov v0, v1 # moving!
 *                            ^    =>                 ^
 *                            |                       |
 *                           end                     end
 */
void Lexer::LexPreprocess()
{
    LOG(DEBUG, ASSEMBLER) << "started removing comments (line " << lines_.size() << "): "
                          << std::string_view(&*(currLine_->buffer.begin() + currLine_->pos),
                                              currLine_->end - currLine_->pos);

    // Searching for a comment marker located outside of string literals.
    bool insideStrLit = !currLine_->buffer.empty() && currLine_->buffer[0] == '\"';
    size_t cmtPos = currLine_->buffer.find_first_of("\"#", 0);
    if (cmtPos != std::string::npos) {
        do {
            if (cmtPos != 0 && currLine_->buffer[cmtPos - 1] != '\\' && currLine_->buffer[cmtPos] == '\"') {
                insideStrLit = !insideStrLit;
            } else if (currLine_->buffer[cmtPos] == PARSE_COMMENT_MARKER && !insideStrLit) {
                break;
            }
        } while ((cmtPos = currLine_->buffer.find_first_of("\"#", cmtPos + 1)) != std::string::npos);
    }

    if (cmtPos != std::string::npos) {
        currLine_->end = cmtPos;
    }

    while (currLine_->end > currLine_->pos && isspace(currLine_->buffer[currLine_->end - 1]) != 0) {
        --(currLine_->end);
    }

    LOG(DEBUG, ASSEMBLER) << "comments removed (line " << lines_.size() << "): "
                          << std::string_view(&*(currLine_->buffer.begin() + currLine_->pos),
                                              currLine_->end - currLine_->pos);
}


void Lexer::SkipSpace()
{
    while (!Eol() && isspace(currLine_->buffer[currLine_->pos]) != 0) {
        ++(currLine_->pos);
    }
}

void Lexer::AnalyzeLine()
{
    LexPreprocess();

    SkipSpace();

    LexTokens();
}

/*-------------------------------*/

} // namespace panda::pandasm