1 /*
2 * Copyright (c) 2021 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "lexer.h"
17
18 namespace panda::pandasm {
19
FindDelim(char c)20 Token::Type FindDelim(char c)
21 {
22 // The map of delimiters
23 static const std::unordered_map<char, Token::Type> DELIM = {{',', Token::Type::DEL_COMMA},
24 {':', Token::Type::DEL_COLON},
25 {'{', Token::Type::DEL_BRACE_L},
26 {'}', Token::Type::DEL_BRACE_R},
27 {'(', Token::Type::DEL_BRACKET_L},
28 {')', Token::Type::DEL_BRACKET_R},
29 {'<', Token::Type::DEL_LT},
30 {'>', Token::Type::DEL_GT},
31 {'=', Token::Type::DEL_EQ},
32 {'[', Token::Type::DEL_SQUARE_BRACKET_L},
33 {']', Token::Type::DEL_SQUARE_BRACKET_R}};
34
35 auto iter = DELIM.find(c);
36 if (iter == DELIM.end()) {
37 return Token::Type::ID_BAD;
38 }
39
40 return DELIM.at(c);
41 }
42
FindOperation(std::string_view s)43 Token::Type FindOperation(std::string_view s)
44 {
45 // Generate the map of OPERATIONS from ISA
46 static const std::unordered_map<std::string_view, Token::Type> OPERATIONS = {
47 // NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
48 #define OPLIST(inst_code, name, optype, width, flags, dst_idx, use_idxs) \
49 {std::string_view(name), Token::Type::ID_OP_##inst_code},
50 PANDA_INSTRUCTION_LIST(OPLIST)
51 #undef OPLIST
52 };
53
54 auto iter = OPERATIONS.find(s);
55 if (iter == OPERATIONS.end()) {
56 return Token::Type::ID_BAD;
57 }
58
59 return OPERATIONS.at(s);
60 }
61
Findkeyword(std::string_view s)62 Token::Type Findkeyword(std::string_view s)
63 {
64 // Generate the map of KEYWORDS
65 static const std::unordered_map<std::string_view, Token::Type> KEYWORDS = {
66 // NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
67 #define KEYWORDS(name, inst_code) {std::string_view(name), Token::Type::ID_##inst_code},
68 KEYWORDS_LIST(KEYWORDS)
69 #undef KEYWORDS
70 };
71
72 auto iter = KEYWORDS.find(s);
73 if (iter == KEYWORDS.end()) {
74 return Token::Type::ID_BAD;
75 }
76
77 return KEYWORDS.at(s);
78 }
79
80 // CODECHECK-NOLINTNEXTLINE(C_RULE_ID_FUNCTION_SIZE)
TokenTypeWhat(Token::Type t)81 std::string_view TokenTypeWhat(Token::Type t)
82 {
83 if (t >= Token::Type::OPERATION && t < Token::Type::KEYWORD) {
84 return "OPERATION";
85 }
86
87 if (t >= Token::Type::KEYWORD) {
88 return "KEYWORD";
89 }
90
91 switch (t) {
92 case Token::Type::ID_BAD: {
93 return "ID_BAD";
94 }
95 case Token::Type::DEL_COMMA: {
96 return "DEL_COMMA";
97 }
98 case Token::Type::DEL_COLON: {
99 return "DEL_COLON";
100 }
101 case Token::Type::DEL_BRACE_L: {
102 return "DEL_BRACE_L";
103 }
104 case Token::Type::DEL_BRACE_R: {
105 return "DEL_BRACE_R";
106 }
107 case Token::Type::DEL_BRACKET_L: {
108 return "DEL_BRACKET_L";
109 }
110 case Token::Type::DEL_BRACKET_R: {
111 return "DEL_BRACKET_R";
112 }
113 case Token::Type::DEL_SQUARE_BRACKET_L: {
114 return "DEL_SQUARE_BRACKET_L";
115 }
116 case Token::Type::DEL_SQUARE_BRACKET_R: {
117 return "DEL_SQUARE_BRACKET_R";
118 }
119 case Token::Type::DEL_GT: {
120 return "DEL_GT";
121 }
122 case Token::Type::DEL_LT: {
123 return "DEL_LT";
124 }
125 case Token::Type::DEL_EQ: {
126 return "DEL_EQ";
127 }
128 case Token::Type::DEL_DOT: {
129 return "DEL_DOT";
130 }
131 case Token::Type::ID: {
132 return "ID";
133 }
134 case Token::Type::ID_STRING: {
135 return "ID_STRING";
136 }
137 default:
138 return "NONE";
139 }
140 }
141
// True when c opens/closes a string literal (only double quotes are recognized).
static bool IsQuote(char c)
{
    constexpr char QUOTE = '"';
    return c == QUOTE;
}
146
// Constructs a lexer with no current line; TokenizeString() installs curr_line_.
Lexer::Lexer() : curr_line_(nullptr)
{
    LOG(DEBUG, ASSEMBLER) << "element of class Lexer initialized";
}
151
// Destructor only logs; lines_ and tokens are released by their own destructors.
Lexer::~Lexer()
{
    LOG(DEBUG, ASSEMBLER) << "element of class Lexer destructed";
}
156
// Tokenizes one source line.
// Appends source_str as a new line, strips comments and surrounding blanks,
// splits the rest into tokens, and returns the line's token list paired with
// err_ (err_ is set by the Lex* helpers, e.g. on an unterminated string).
Tokens Lexer::TokenizeString(const std::string &source_str)
{
    LOG(DEBUG, ASSEMBLER) << "started tokenizing of line " << lines_.size() + 1 << ": ";

    lines_.emplace_back(source_str);

    // All Lex* helpers below operate on the freshly added line.
    curr_line_ = &lines_.back();

    LOG(DEBUG, ASSEMBLER) << std::string_view(&*(curr_line_->buffer.begin() + curr_line_->pos),
                                              curr_line_->end - curr_line_->pos);

    AnalyzeLine();

    LOG(DEBUG, ASSEMBLER) << "tokenization of line " << lines_.size() << " is successful";
    LOG(DEBUG, ASSEMBLER) << " tokens identified: ";

    // Debug dump of each recognized token: its text slice and classified type.
    for (const auto &f_i : lines_.back().tokens) {
        LOG(DEBUG, ASSEMBLER) << "\n "
                              << std::string_view(&*(f_i.whole_line.begin() + f_i.bound_left),
                                                  f_i.bound_right - f_i.bound_left)
                              << " (type: " << TokenTypeWhat(f_i.type) << ")";

        LOG(DEBUG, ASSEMBLER);
        LOG(DEBUG, ASSEMBLER);
    }
    // NOTE(review): the token vector is copied into the returned pair.
    return std::pair<std::vector<Token>, Error>(lines_.back().tokens, err_);
}
184
185 // End of line
Eol() const186 bool Lexer::Eol() const
187 {
188 return curr_line_->pos == curr_line_->end;
189 }
190
191 // Return the type of token
LexGetType(size_t beg,size_t end) const192 Token::Type Lexer::LexGetType(size_t beg, size_t end) const
193 {
194 if (FindDelim(curr_line_->buffer[beg]) != Token::Type::ID_BAD) { /* delimiter */
195 return FindDelim(curr_line_->buffer[beg]);
196 }
197
198 std::string_view p(&*(curr_line_->buffer.begin() + beg), end - beg);
199 Token::Type type = Findkeyword(p);
200 if (type != Token::Type::ID_BAD) {
201 return type;
202 }
203
204 type = FindOperation(p);
205 if (type != Token::Type::ID_BAD) {
206 return type;
207 }
208
209 if (IsQuote(curr_line_->buffer[beg])) {
210 return Token::Type::ID_STRING;
211 }
212
213 return Token::Type::ID; // other
214 }
215
// Handle string literal
// On entry pos sits on the opening quote; on success pos is advanced one past
// the closing quote and true is returned. On a missing terminator err_ is set
// and false is returned.
bool Lexer::LexString()
{
    bool is_escape_seq = false;
    char quote = curr_line_->buffer[curr_line_->pos];
    size_t begin = curr_line_->pos;
    while (!Eol()) {
        ++(curr_line_->pos);

        // NOTE(review): when pos reaches end this reads buffer[end]; assumes
        // buffer is a std::string so that index yields '\0' — confirm.
        char c = curr_line_->buffer[curr_line_->pos];

        // A character following a backslash is consumed verbatim, so an
        // escaped quote (\") does not terminate the literal.
        if (is_escape_seq) {
            is_escape_seq = false;
            continue;
        }

        if (c == '\\') {
            is_escape_seq = true;
        }

        if (c == quote) {
            break;
        }
    }

    // If the loop ended without landing on the closing quote, the literal
    // ran off the end of the line.
    if (curr_line_->buffer[curr_line_->pos] != quote) {
        err_ = Error(std::string("Missing terminating ") + quote + " character", 0,
                     Error::ErrorType::ERR_STRING_MISSING_TERMINATING_CHARACTER, "", begin, curr_line_->pos,
                     curr_line_->buffer);
        return false;
    }

    // Step past the closing quote so the next token starts after the literal.
    ++(curr_line_->pos);

    return true;
}
252
253 /*
254 * Tokens handling: set the corresponding
255 * elements bound_left and bound_right of the array tokens
256 * to the first and last characters of a corresponding token.
257 *
258 * bound_r1 bound_r2 bound_r3
259 * | | |
260 * v v v
261 * token1 token2 token3 ... token1 token2 token3 ...
262 * => ^ ^ ^
263 * | | |
264 * bound1 bound2 bound3 ... bound_l1 bound_l2 bound_l3 ...
265 *
266 */
LexTokens()267 void Lexer::LexTokens()
268 {
269 if (Eol()) {
270 return;
271 }
272
273 LOG(DEBUG, ASSEMBLER) << "token search started (line " << lines_.size() << "): "
274 << std::string_view(&*(curr_line_->buffer.begin() + curr_line_->pos),
275 curr_line_->end - curr_line_->pos);
276
277 while (curr_line_->end > curr_line_->pos && isspace(curr_line_->buffer[curr_line_->end - 1]) != 0) {
278 --(curr_line_->end);
279 }
280
281 while (isspace(curr_line_->buffer[curr_line_->pos]) != 0 && !Eol()) {
282 ++(curr_line_->pos);
283 }
284
285 size_t bound_right;
286 size_t bound_left;
287
288 for (int i = 0; !Eol(); ++i) {
289 bound_left = curr_line_->pos;
290
291 if (FindDelim(curr_line_->buffer[curr_line_->pos]) != Token::Type::ID_BAD) {
292 ++(curr_line_->pos);
293 } else if (IsQuote(curr_line_->buffer[curr_line_->pos])) {
294 if (!LexString()) {
295 return;
296 }
297 } else {
298 while (!Eol() && FindDelim(curr_line_->buffer[curr_line_->pos]) == Token::Type::ID_BAD &&
299 isspace(curr_line_->buffer[curr_line_->pos]) == 0) {
300 ++(curr_line_->pos);
301 }
302 }
303
304 bound_right = curr_line_->pos;
305
306 LOG(DEBUG, ASSEMBLER) << "token identified (line " << lines_.size() << ", "
307 << "token " << curr_line_->tokens.size() + 1 << "): "
308 << std::string_view(&*(curr_line_->buffer.begin() + bound_left), bound_right - bound_left)
309 << " ("
310 << "type: " << TokenTypeWhat(LexGetType(bound_left, bound_right)) << ")";
311
312 curr_line_->tokens.emplace_back(bound_left, bound_right, LexGetType(bound_left, bound_right),
313 curr_line_->buffer);
314
315 while (isspace(curr_line_->buffer[curr_line_->pos]) != 0 && !Eol()) {
316 ++(curr_line_->pos);
317 }
318 }
319
320 LOG(DEBUG, ASSEMBLER) << "all tokens identified (line " << lines_.size() << ")";
321 }
322
323 /*
324 * Ignore comments:
325 * find PARSE_COMMENT_MARKER and move line->end to another position
326 * next after the last character of the last significant (not a comment)
327 * element in a current line: line->buffer.
328 *
329 * Ex:
330 * [Label:] operation operand[,operand] [# comment]
331 *
332 * L1: mov v0, v1 # moving! L1: mov v0, v1 # moving!
333 * ^ => ^
334 * | |
335 * end end
336 */
// Strips a trailing comment from the current line by moving curr_line_->end
// to just before the first comment marker that lies outside any string
// literal, then trims trailing whitespace.
void Lexer::LexPreprocess()
{
    LOG(DEBUG, ASSEMBLER) << "started removing comments (line " << lines_.size() << "): "
                          << std::string_view(&*(curr_line_->buffer.begin() + curr_line_->pos),
                                              curr_line_->end - curr_line_->pos);

    size_t cmt_pos;
    bool inside_str_lit;

    // Searching for comment marker located outside of the string literals.
    // A quote at position 0 cannot be matched by the toggle below (which only
    // looks at cmt_pos != 0), so the in-literal state is seeded here.
    inside_str_lit = curr_line_->buffer.size() > 0 && curr_line_->buffer[0] == '\"';
    cmt_pos = curr_line_->buffer.find_first_of("\"#", 0);
    if (cmt_pos != std::string::npos) {
        do {
            // An unescaped quote toggles the in-literal state; a comment
            // marker outside a literal ends the scan.
            if (cmt_pos != 0 && curr_line_->buffer[cmt_pos - 1] != '\\' && curr_line_->buffer[cmt_pos] == '\"') {
                inside_str_lit = !inside_str_lit;
            } else if (curr_line_->buffer[cmt_pos] == PARSE_COMMENT_MARKER && !inside_str_lit) {
                break;
            }
        } while ((cmt_pos = curr_line_->buffer.find_first_of("\"#", cmt_pos + 1)) != std::string::npos);
    }

    // Cut the line at the comment marker, if one was found.
    if (cmt_pos != std::string::npos) {
        curr_line_->end = cmt_pos;
    }

    // Trim trailing whitespace left in front of the removed comment.
    while (curr_line_->end > curr_line_->pos && isspace(curr_line_->buffer[curr_line_->end - 1]) != 0) {
        --(curr_line_->end);
    }

    LOG(DEBUG, ASSEMBLER) << "comments removed (line " << lines_.size() << "): "
                          << std::string_view(&*(curr_line_->buffer.begin() + curr_line_->pos),
                                              curr_line_->end - curr_line_->pos);
}
371
SkipSpace()372 void Lexer::SkipSpace()
373 {
374 while (!Eol() && isspace(curr_line_->buffer[curr_line_->pos]) != 0) {
375 ++(curr_line_->pos);
376 }
377 }
378
// Full pipeline for one line: strip comments, skip leading blanks, tokenize.
// The order matters — tokenization relies on end/pos set by the first two steps.
void Lexer::AnalyzeLine()
{
    LexPreprocess();

    SkipSpace();

    LexTokens();
}
387
388 } // namespace panda::pandasm
389