1 /**
2 * Copyright (c) 2021-2022 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "lexer.h"
17
18 namespace panda::pandasm {
19
20 /*-------------------------------*/
21
22 /* Is this a delimiter ? */
FindDelim(char c)23 Token::Type FindDelim(char c)
24 {
25 /* The map of delimiters */
26 static const std::unordered_map<char, Token::Type> DELIM = {{',', Token::Type::DEL_COMMA},
27 {':', Token::Type::DEL_COLON},
28 {'{', Token::Type::DEL_BRACE_L},
29 {'}', Token::Type::DEL_BRACE_R},
30 {'(', Token::Type::DEL_BRACKET_L},
31 {')', Token::Type::DEL_BRACKET_R},
32 {'<', Token::Type::DEL_LT},
33 {'>', Token::Type::DEL_GT},
34 {'=', Token::Type::DEL_EQ},
35 {'[', Token::Type::DEL_SQUARE_BRACKET_L},
36 {']', Token::Type::DEL_SQUARE_BRACKET_R}};
37
38 auto iter = DELIM.find(c);
39
40 if (iter == DELIM.end()) {
41 return Token::Type::ID_BAD;
42 }
43
44 return DELIM.at(c);
45 }
46
FindOperation(std::string_view s)47 Token::Type FindOperation(std::string_view s)
48 {
49 /* Generate the map of OPERATIONS from ISA: */
50 static const std::unordered_map<std::string_view, Token::Type> OPERATIONS = {
51 // NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
52 #define OPLIST(inst_code, name, optype, width, flags, dst_idx, use_idxs) \
53 {std::string_view(name), Token::Type::ID_OP_##inst_code},
54 PANDA_INSTRUCTION_LIST(OPLIST)
55 #undef OPLIST
56 };
57
58 auto iter = OPERATIONS.find(s);
59
60 if (iter == OPERATIONS.end()) {
61 return Token::Type::ID_BAD;
62 }
63
64 return OPERATIONS.at(s);
65 }
66
Findkeyword(std::string_view s)67 Token::Type Findkeyword(std::string_view s)
68 {
69 /* Generate the map of KEYWORDS: */
70 static const std::unordered_map<std::string_view, Token::Type> KEYWORDS = {
71 // NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
72 #define KEYWORDS(name, inst_code) {std::string_view(name), Token::Type::ID_##inst_code},
73 KEYWORDS_LIST(KEYWORDS)
74 #undef KEYWORDS
75 };
76
77 auto iter = KEYWORDS.find(s);
78
79 if (iter == KEYWORDS.end()) {
80 return Token::Type::ID_BAD;
81 }
82
83 return KEYWORDS.at(s);
84 }
85
TokenTypeWhat(Token::Type t)86 std::string_view TokenTypeWhat(Token::Type t)
87 {
88 if (t >= Token::Type::OPERATION && t < Token::Type::KEYWORD) {
89 return "OPERATION";
90 }
91
92 if (t >= Token::Type::KEYWORD) {
93 return "KEYWORD";
94 }
95
96 switch (t) {
97 case Token::Type::ID_BAD: {
98 return "ID_BAD";
99 }
100 case Token::Type::DEL_COMMA: {
101 return "DEL_COMMA";
102 }
103 case Token::Type::DEL_COLON: {
104 return "DEL_COLON";
105 }
106 case Token::Type::DEL_BRACE_L: {
107 return "DEL_BRACE_L";
108 }
109 case Token::Type::DEL_BRACE_R: {
110 return "DEL_BRACE_R";
111 }
112 case Token::Type::DEL_BRACKET_L: {
113 return "DEL_BRACKET_L";
114 }
115 case Token::Type::DEL_BRACKET_R: {
116 return "DEL_BRACKET_R";
117 }
118 case Token::Type::DEL_SQUARE_BRACKET_L: {
119 return "DEL_SQUARE_BRACKET_L";
120 }
121 case Token::Type::DEL_SQUARE_BRACKET_R: {
122 return "DEL_SQUARE_BRACKET_R";
123 }
124 case Token::Type::DEL_GT: {
125 return "DEL_GT";
126 }
127 case Token::Type::DEL_LT: {
128 return "DEL_LT";
129 }
130 case Token::Type::DEL_EQ: {
131 return "DEL_EQ";
132 }
133 case Token::Type::DEL_DOT: {
134 return "DEL_DOT";
135 }
136 case Token::Type::ID: {
137 return "ID";
138 }
139 case Token::Type::ID_STRING: {
140 return "ID_STRING";
141 }
142 default:
143 return "NONE";
144 }
145 }
146
/* True when the character is the double-quote that delimits string literals. */
static bool IsQuote(char c)
{
    return '"' == c;
}
151
/* Construct a lexer with no current line selected yet. */
Lexer::Lexer() : curr_line_(nullptr)
{
    LOG(DEBUG, ASSEMBLER) << "element of class Lexer initialized";
}
156
/* Destructor only emits a debug trace; members clean up via RAII. */
Lexer::~Lexer()
{
    LOG(DEBUG, ASSEMBLER) << "element of class Lexer destructed";
}
161
/*
 * Tokenize one source line.
 * The line is appended to the lexer's line history, preprocessed
 * (comments stripped, whitespace skipped) and split into tokens.
 * Returns the tokens of this line paired with the current error state.
 */
Tokens Lexer::TokenizeString(const std::string &source_str)
{
    LOG(DEBUG, ASSEMBLER) << "started tokenizing of line " << lines_.size() + 1 << ": ";

    // The new line becomes the current one; curr_line_ stays valid because
    // lines_ only grows while this lexer is used.
    lines_.emplace_back(source_str);

    curr_line_ = &lines_.back();

    LOG(DEBUG, ASSEMBLER) << std::string_view(&*(curr_line_->buffer.begin() + curr_line_->pos),
                                              curr_line_->end - curr_line_->pos);

    // Strip comments, skip leading whitespace, split into tokens.
    AnalyzeLine();

    LOG(DEBUG, ASSEMBLER) << "tokenization of line " << lines_.size() << " is successful";
    LOG(DEBUG, ASSEMBLER) << "    tokens identified: ";

    for (const auto &f_i : lines_.back().tokens) {
        LOG(DEBUG, ASSEMBLER) << "\n                           " 
                              << std::string_view(&*(f_i.whole_line.begin() + f_i.bound_left),
                                                  f_i.bound_right - f_i.bound_left)
                              << " (type: " << TokenTypeWhat(f_i.type) << ")";

        LOG(DEBUG, ASSEMBLER);
        LOG(DEBUG, ASSEMBLER);
    }
    // err_ may carry an error from LexString(); the caller inspects it.
    return std::pair<std::vector<Token>, Error>(lines_.back().tokens, err_);
}
189
190 /* End of line? */
Eol() const191 bool Lexer::Eol() const
192 {
193 return curr_line_->pos == curr_line_->end;
194 }
195
196 /* Return the type of token */
LexGetType(size_t beg,size_t end) const197 Token::Type Lexer::LexGetType(size_t beg, size_t end) const
198 {
199 if (FindDelim(curr_line_->buffer[beg]) != Token::Type::ID_BAD) { /* delimiter */
200 return FindDelim(curr_line_->buffer[beg]);
201 }
202
203 std::string_view p(&*(curr_line_->buffer.begin() + beg), end - beg);
204
205 Token::Type type = Findkeyword(p);
206
207 if (type != Token::Type::ID_BAD) {
208 return type;
209 }
210
211 type = FindOperation(p);
212
213 if (type != Token::Type::ID_BAD) {
214 return type;
215 }
216
217 if (IsQuote(curr_line_->buffer[beg])) {
218 return Token::Type::ID_STRING;
219 }
220
221 return Token::Type::ID; /* other */
222 }
223
224 /* Handle string literal */
/*
 * Consume a quoted string literal starting at the current cursor
 * position. Backslash escape sequences are skipped so an escaped quote
 * does not terminate the literal. On success the cursor is left one
 * past the closing quote and true is returned; if the line ends before
 * a closing quote is found, err_ is set and false is returned.
 */
bool Lexer::LexString()
{
    bool is_escape_seq = false;
    // The opening quote character; the literal ends at its unescaped twin.
    char quote = curr_line_->buffer[curr_line_->pos];
    size_t begin = curr_line_->pos;
    while (!Eol()) {
        ++(curr_line_->pos);

        char c = curr_line_->buffer[curr_line_->pos];

        // The character right after a backslash is part of an escape
        // sequence: never treat it as a terminator or a new escape.
        if (is_escape_seq) {
            is_escape_seq = false;
            continue;
        }

        if (c == '\\') {
            is_escape_seq = true;
        }

        if (c == quote) {
            break;
        }
    }

    // If we fell off the end of the line, the cursor is not on a quote.
    if (curr_line_->buffer[curr_line_->pos] != quote) {
        err_ = Error(std::string("Missing terminating ") + quote + " character", 0,
                     Error::ErrorType::ERR_STRING_MISSING_TERMINATING_CHARACTER, "", begin, curr_line_->pos,
                     curr_line_->buffer);
        return false;
    }

    // Step past the closing quote so it is included in the token span.
    ++(curr_line_->pos);

    return true;
}
260
261 /*
262 * Tokens handling: set a corresponding
263 * elements bound_left and bound_right of the array tokens
264 * to the first and last characters of a corresponding token.
265 *
266 * bound_r1 bound_r2 bound_r3
267 * | | |
268 * v v v
269 * token1 token2 token3 ... token1 token2 token3 ...
270 * => ^ ^ ^
271 * | | |
272 * bound1 bound2 bound3 ... bound_l1 bound_l2 bound_l3 ...
273 *
274 */
/*
 * Split the current line into tokens and append them to
 * curr_line_->tokens. A token is a single delimiter character, a
 * quoted string literal, or a run of non-delimiter non-space
 * characters. Stops early (leaving err_ set) if a string literal is
 * unterminated.
 */
void Lexer::LexTokens()
{
    if (Eol()) {
        return;
    }

    LOG(DEBUG, ASSEMBLER) << "token search started (line " << lines_.size() << "): "
                          << std::string_view(&*(curr_line_->buffer.begin() + curr_line_->pos),
                                              curr_line_->end - curr_line_->pos);

    // Trim trailing whitespace by pulling the end bound back.
    while (curr_line_->end > curr_line_->pos && isspace(curr_line_->buffer[curr_line_->end - 1]) != 0) {
        --(curr_line_->end);
    }

    // Skip leading whitespace before the first token.
    while (isspace(curr_line_->buffer[curr_line_->pos]) != 0 && !Eol()) {
        ++(curr_line_->pos);
    }

    size_t bound_right;

    size_t bound_left;

    for (int i = 0; !Eol(); ++i) {
        bound_left = curr_line_->pos;

        // A delimiter is always a one-character token; a quote starts a
        // string literal; anything else extends until a delimiter or space.
        if (FindDelim(curr_line_->buffer[curr_line_->pos]) != Token::Type::ID_BAD) {
            ++(curr_line_->pos);
        } else if (IsQuote(curr_line_->buffer[curr_line_->pos])) {
            if (!LexString()) {
                // err_ was set by LexString(); abandon the rest of the line.
                return;
            }
        } else {
            while (!Eol() && FindDelim(curr_line_->buffer[curr_line_->pos]) == Token::Type::ID_BAD &&
                   isspace(curr_line_->buffer[curr_line_->pos]) == 0) {
                ++(curr_line_->pos);
            }
        }

        bound_right = curr_line_->pos;

        LOG(DEBUG, ASSEMBLER) << "token identified (line " << lines_.size() << ", "
                              << "token " << curr_line_->tokens.size() + 1 << "): "
                              << std::string_view(&*(curr_line_->buffer.begin() + bound_left), bound_right - bound_left)
                              << " ("
                              << "type: " << TokenTypeWhat(LexGetType(bound_left, bound_right)) << ")";

        curr_line_->tokens.emplace_back(bound_left, bound_right, LexGetType(bound_left, bound_right),
                                        curr_line_->buffer);

        // Skip the whitespace separating this token from the next one.
        while (isspace(curr_line_->buffer[curr_line_->pos]) != 0 && !Eol()) {
            ++(curr_line_->pos);
        }
    }

    LOG(DEBUG, ASSEMBLER) << "all tokens identified (line " << lines_.size() << ")";
}
331
332 /*
333 * Ignore comments:
334 * find PARSE_COMMENT_MARKER and move line->end
335 * to another position (next after the last character of the last
 * significant (that is, not a comment) element in the current
337 * line: line->buffer).
338 *
339 * Ex:
340 * [Label:] operation operand[,operand] [# comment]
341 *
342 * L1: mov v0, v1 # moving! L1: mov v0, v1 # moving!
343 * ^ => ^
344 * | |
345 * end end
346 */
/*
 * Strip a trailing comment from the current line by moving
 * curr_line_->end to the comment marker. A '#' inside a string literal
 * is not a comment, so quotes (and escaped quotes) are tracked while
 * scanning. Trailing whitespace left after the cut is also trimmed.
 */
void Lexer::LexPreprocess()
{
    LOG(DEBUG, ASSEMBLER) << "started removing comments (line " << lines_.size() << "): "
                          << std::string_view(&*(curr_line_->buffer.begin() + curr_line_->pos),
                                              curr_line_->end - curr_line_->pos);

    // Searching for comment marker located outside of string literals.
    // A quote in column 0 has no preceding character, so it is handled
    // by the initializer rather than inside the loop below.
    bool inside_str_lit = curr_line_->buffer.size() > 0 && curr_line_->buffer[0] == '\"';
    size_t cmt_pos = curr_line_->buffer.find_first_of("\"#", 0);
    if (cmt_pos != std::string::npos) {
        do {
            // An unescaped quote toggles literal state; an unquoted '#'
            // marks the start of the comment.
            if (cmt_pos != 0 && curr_line_->buffer[cmt_pos - 1] != '\\' && curr_line_->buffer[cmt_pos] == '\"') {
                inside_str_lit = !inside_str_lit;
            } else if (curr_line_->buffer[cmt_pos] == PARSE_COMMENT_MARKER && !inside_str_lit) {
                break;
            }
        } while ((cmt_pos = curr_line_->buffer.find_first_of("\"#", cmt_pos + 1)) != std::string::npos);
    }

    if (cmt_pos != std::string::npos) {
        curr_line_->end = cmt_pos;
    }

    // Trim whitespace that now sits at the (possibly shortened) end.
    while (curr_line_->end > curr_line_->pos && isspace(curr_line_->buffer[curr_line_->end - 1]) != 0) {
        --(curr_line_->end);
    }

    LOG(DEBUG, ASSEMBLER) << "comments removed (line " << lines_.size() << "): "
                          << std::string_view(&*(curr_line_->buffer.begin() + curr_line_->pos),
                                              curr_line_->end - curr_line_->pos);
}
378
SkipSpace()379 void Lexer::SkipSpace()
380 {
381 while (!Eol() && isspace(curr_line_->buffer[curr_line_->pos]) != 0) {
382 ++(curr_line_->pos);
383 }
384 }
385
/*
 * Run the full lexing pipeline on the current line:
 * strip comments, skip leading whitespace, then split into tokens.
 */
void Lexer::AnalyzeLine()
{
    LexPreprocess();

    SkipSpace();

    LexTokens();
}
394
395 /*-------------------------------*/
396
397 } // namespace panda::pandasm
398