/*
 * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
15
16 #include "lexer.h"
17
18 namespace ark::pandasm {
19
20 /*-------------------------------*/
21
22 /* Is this a delimiter ? */
FindDelim(char c)23 Token::Type FindDelim(char c)
24 {
25 /* The map of delimiters */
26 static const std::unordered_map<char, Token::Type> DELIM = {{',', Token::Type::DEL_COMMA},
27 {':', Token::Type::DEL_COLON},
28 {'{', Token::Type::DEL_BRACE_L},
29 {'}', Token::Type::DEL_BRACE_R},
30 {'(', Token::Type::DEL_BRACKET_L},
31 {')', Token::Type::DEL_BRACKET_R},
32 {'<', Token::Type::DEL_LT},
33 {'>', Token::Type::DEL_GT},
34 {'=', Token::Type::DEL_EQ},
35 {'[', Token::Type::DEL_SQUARE_BRACKET_L},
36 {']', Token::Type::DEL_SQUARE_BRACKET_R}};
37
38 auto iter = DELIM.find(c);
39 if (iter == DELIM.end()) {
40 return Token::Type::ID_BAD;
41 }
42
43 return DELIM.at(c);
44 }
45
FindOperation(std::string_view s)46 Token::Type FindOperation(std::string_view s)
47 {
48 /* Generate the map of OPERATIONS from ISA: */
49 static const std::unordered_map<std::string_view, Token::Type> OPERATIONS = {
50 // NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
51 #define OPLIST(inst_code, name, optype, width, flags, dst_idx, use_idxs, prof_size) \
52 {std::string_view(name), Token::Type::ID_OP_##inst_code},
53 PANDA_INSTRUCTION_LIST(OPLIST)
54 #undef OPLIST
55 };
56
57 auto iter = OPERATIONS.find(s);
58 if (iter == OPERATIONS.end()) {
59 return Token::Type::ID_BAD;
60 }
61
62 return OPERATIONS.at(s);
63 }
64
Findkeyword(std::string_view s)65 Token::Type Findkeyword(std::string_view s)
66 {
67 /* Generate the map of KEYWORDS: */
68 static const std::unordered_map<std::string_view, Token::Type> KEYWORDS = {
69 // NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
70 #define KEYWORDS(name, inst_code) {std::string_view(name), Token::Type::ID_##inst_code},
71 KEYWORDS_LIST(KEYWORDS)
72 #undef KEYWORDS
73 };
74
75 auto iter = KEYWORDS.find(s);
76 if (iter == KEYWORDS.end()) {
77 return Token::Type::ID_BAD;
78 }
79
80 return KEYWORDS.at(s);
81 }
82
83 // CC-OFFNXT(huge_method[C++], G.FUN.01-CPP) big switch case
TokenTypeWhat(Token::Type t)84 std::string_view TokenTypeWhat(Token::Type t)
85 {
86 if (t >= Token::Type::OPERATION && t < Token::Type::KEYWORD) {
87 return "OPERATION";
88 }
89
90 if (t >= Token::Type::KEYWORD) {
91 return "KEYWORD";
92 }
93
94 switch (t) {
95 case Token::Type::ID_BAD: {
96 return "ID_BAD";
97 }
98 case Token::Type::DEL_COMMA: {
99 return "DEL_COMMA";
100 }
101 case Token::Type::DEL_COLON: {
102 return "DEL_COLON";
103 }
104 case Token::Type::DEL_BRACE_L: {
105 return "DEL_BRACE_L";
106 }
107 case Token::Type::DEL_BRACE_R: {
108 return "DEL_BRACE_R";
109 }
110 case Token::Type::DEL_BRACKET_L: {
111 return "DEL_BRACKET_L";
112 }
113 case Token::Type::DEL_BRACKET_R: {
114 return "DEL_BRACKET_R";
115 }
116 case Token::Type::DEL_SQUARE_BRACKET_L: {
117 return "DEL_SQUARE_BRACKET_L";
118 }
119 case Token::Type::DEL_SQUARE_BRACKET_R: {
120 return "DEL_SQUARE_BRACKET_R";
121 }
122 case Token::Type::DEL_GT: {
123 return "DEL_GT";
124 }
125 case Token::Type::DEL_LT: {
126 return "DEL_LT";
127 }
128 case Token::Type::DEL_EQ: {
129 return "DEL_EQ";
130 }
131 case Token::Type::DEL_DOT: {
132 return "DEL_DOT";
133 }
134 case Token::Type::ID: {
135 return "ID";
136 }
137 case Token::Type::ID_STRING: {
138 return "ID_STRING";
139 }
140 default:
141 return "NONE";
142 }
143 }
144
/* True when c is the double-quote character that starts/ends a string literal. */
static bool IsQuote(char c)
{
    constexpr char QUOTE = '"';
    return QUOTE == c;
}
149
/* Constructor: no state to set up beyond defaults; logs creation for debugging. */
Lexer::Lexer()
{
    LOG(DEBUG, ASSEMBLER) << "element of class Lexer initialized";
}
154
/* Destructor: nothing to release explicitly; logs destruction for debugging. */
Lexer::~Lexer()
{
    LOG(DEBUG, ASSEMBLER) << "element of class Lexer destructed";
}
159
/*
 * Tokenize one source line.
 * Appends sourceStr as a new line record, strips comments and whitespace,
 * splits it into tokens, and returns the token list together with err_
 * (tokens gathered before a failure are still returned alongside the error).
 */
Tokens Lexer::TokenizeString(const std::string &sourceStr)
{
    LOG(DEBUG, ASSEMBLER) << "started tokenizing of line " << (lines_.size() + 1) << ": ";

    lines_.emplace_back(sourceStr);

    /* currLine_ points at the record just added; all Lex* helpers work on it. */
    currLine_ = &lines_.back();

    LOG(DEBUG, ASSEMBLER) << std::string_view(&*(currLine_->buffer.begin() + currLine_->pos),
                                              currLine_->end - currLine_->pos);

    /* Preprocess (drop comments), skip leading blanks, then lex the tokens. */
    AnalyzeLine();

    LOG(DEBUG, ASSEMBLER) << "tokenization of line " << lines_.size() << " is successful";
    LOG(DEBUG, ASSEMBLER) << " tokens identified: ";

    for (const auto &fI : lines_.back().tokens) {
        LOG(DEBUG, ASSEMBLER) << "\n "
                              << std::string_view(&*(fI.wholeLine.begin() + fI.boundLeft), fI.boundRight - fI.boundLeft)
                              << " (type: " << TokenTypeWhat(fI.type) << ")";

        LOG(DEBUG, ASSEMBLER);
        LOG(DEBUG, ASSEMBLER);
    }
    /* Tokens is a pair of the token vector and the accumulated error state. */
    return std::pair<std::vector<Token>, Error>(lines_.back().tokens, err_);
}
186
187 /* End of line? */
Eol() const188 bool Lexer::Eol() const
189 {
190 return currLine_->pos == currLine_->end;
191 }
192
193 /* Return the type of token */
LexGetType(size_t beg,size_t end) const194 Token::Type Lexer::LexGetType(size_t beg, size_t end) const
195 {
196 if (FindDelim(currLine_->buffer[beg]) != Token::Type::ID_BAD) { /* delimiter */
197 return FindDelim(currLine_->buffer[beg]);
198 }
199
200 std::string_view p(&*(currLine_->buffer.begin() + beg), end - beg);
201
202 Token::Type type = Findkeyword(p);
203 if (type != Token::Type::ID_BAD) {
204 return type;
205 }
206
207 type = FindOperation(p);
208 if (type != Token::Type::ID_BAD) {
209 return type;
210 }
211
212 if (IsQuote(currLine_->buffer[beg])) {
213 return Token::Type::ID_STRING;
214 }
215
216 return Token::Type::ID; /* other */
217 }
218
/* Handle string literal.
 * On entry pos is at the quote that triggered this call (see LexTokens).
 * Scans forward honoring backslash escapes; on success pos is advanced past
 * the closing quote and true is returned. If the line ends without a closing
 * quote, err_ is set and false is returned. */
bool Lexer::LexString()
{
    bool isEscapeSeq = false;
    char quote = currLine_->buffer[currLine_->pos]; /* the opening quote character */
    size_t begin = currLine_->pos;                  /* literal start, kept for error reporting */
    while (!Eol()) {
        ++(currLine_->pos);

        char c = currLine_->buffer[currLine_->pos];

        /* Previous character was a backslash: the current character is
         * escaped, so it can neither open an escape nor close the literal. */
        if (isEscapeSeq) {
            isEscapeSeq = false;
            continue;
        }

        if (c == '\\') {
            isEscapeSeq = true;
        }

        if (c == quote) {
            break;
        }
    }

    /* NOTE(review): if the loop stopped because of Eol(), this reads
     * buffer[pos] with pos == end. That is safe when buffer is a std::string
     * and end <= buffer.size() (operator[] at size() yields '\0'), but relies
     * on end never exceeding the buffer length — TODO confirm. */
    if (currLine_->buffer[currLine_->pos] != quote) {
        err_ = Error(std::string("Missing terminating ") + quote + " character", 0,
                     Error::ErrorType::ERR_STRING_MISSING_TERMINATING_CHARACTER, "", begin, currLine_->pos,
                     currLine_->buffer);
        return false;
    }

    /* Step past the closing quote so the token bound lands after it. */
    ++(currLine_->pos);

    return true;
}
255
256 /*
257 * Tokens handling: set a corresponding
258 * elements bound_left and bound_right of the array tokens
259 * to the first and last characters of a corresponding token.
260 *
261 * bound_r1 bound_r2 bound_r3
262 * | | |
263 * v v v
264 * token1 token2 token3 ... token1 token2 token3 ...
265 * => ^ ^ ^
266 * | | |
267 * bound1 bound2 bound3 ... bound_l1 bound_l2 bound_l3 ...
268 *
269 */
LexTokens()270 void Lexer::LexTokens()
271 {
272 if (Eol()) {
273 return;
274 }
275
276 LOG(DEBUG, ASSEMBLER) << "token search started (line " << lines_.size() << "): "
277 << std::string_view(&*(currLine_->buffer.begin() + currLine_->pos),
278 currLine_->end - currLine_->pos);
279
280 while (currLine_->end > currLine_->pos && isspace(currLine_->buffer[currLine_->end - 1]) != 0) {
281 --(currLine_->end);
282 }
283
284 while (isspace(currLine_->buffer[currLine_->pos]) != 0 && !Eol()) {
285 ++(currLine_->pos);
286 }
287
288 size_t boundRight;
289 size_t boundLeft;
290
291 while (!Eol()) {
292 boundLeft = currLine_->pos;
293
294 if (FindDelim(currLine_->buffer[currLine_->pos]) != Token::Type::ID_BAD) {
295 ++(currLine_->pos);
296 } else if (IsQuote(currLine_->buffer[currLine_->pos])) {
297 if (!LexString()) {
298 return;
299 }
300 } else {
301 LexBadTokens();
302 }
303
304 boundRight = currLine_->pos;
305
306 LOG(DEBUG, ASSEMBLER) << "token identified (line " << lines_.size() << ", "
307 << "token " << currLine_->tokens.size() + 1 << "): "
308 << std::string_view(&*(currLine_->buffer.begin() + boundLeft), boundRight - boundLeft)
309 << " ("
310 << "type: " << TokenTypeWhat(LexGetType(boundLeft, boundRight)) << ")";
311
312 currLine_->tokens.emplace_back(boundLeft, boundRight, LexGetType(boundLeft, boundRight), currLine_->buffer);
313
314 while (isspace(currLine_->buffer[currLine_->pos]) != 0 && !Eol()) {
315 ++(currLine_->pos);
316 }
317 }
318
319 LOG(DEBUG, ASSEMBLER) << "all tokens identified (line " << lines_.size() << ")";
320 }
321
/* Consume a run of characters forming a non-delimiter, non-string token:
 * advance pos until a delimiter or whitespace is reached. Runs of square
 * brackets glued to the token are absorbed into it when more non-space
 * characters follow — presumably to keep array-type suffixes such as "[]"
 * inside one token; TODO confirm against the parser's expectations. */
void Lexer::LexBadTokens()
{
    while (!Eol() && FindDelim(currLine_->buffer[currLine_->pos]) == Token::Type::ID_BAD &&
           isspace(currLine_->buffer[currLine_->pos]) == 0) {
        ++(currLine_->pos);
        size_t position = currLine_->pos;
        /* Look ahead over consecutive '[' / ']' characters.
         * NOTE(review): this scan has no explicit bound; it stops only when a
         * non-bracket character is found. Safe for std::string (operator[] at
         * size() yields '\0', which is not a bracket) — TODO confirm the
         * buffer type guarantees this. */
        while (FindDelim(currLine_->buffer[position]) == Token::Type::DEL_SQUARE_BRACKET_L ||
               FindDelim(currLine_->buffer[position]) == Token::Type::DEL_SQUARE_BRACKET_R) {
            position++;
        }
        /* Absorb the brackets only when they are not at the line end and are
         * immediately followed by a non-space character. */
        if (isspace(currLine_->buffer[position]) == 0 && (position != currLine_->end)) {
            currLine_->pos = position;
        }
    }
}
337
/*
 * Ignore comments:
 * find PARSE_COMMENT_MARKER and move line->end
 * to another position (next after the last character of the last
 * significant (this is no a comment) element in a current
 * line: line->buffer).
 *
 * Ex:
 *   [Label:] operation operand[,operand] [# comment]
 *
 *   L1: mov v0, v1 # moving!        L1: mov v0, v1 # moving!
 *                          ^    =>                ^
 *                          |                      |
 *                         end                    end
 */
void Lexer::LexPreprocess()
{
    LOG(DEBUG, ASSEMBLER) << "started removing comments (line " << lines_.size() << "): "
                          << std::string_view(&*(currLine_->buffer.begin() + currLine_->pos),
                                              currLine_->end - currLine_->pos);

    // Searching for comment marker located outside of string literals.
    // A line that opens with '"' starts inside a literal (the loop below only
    // toggles on quotes at index > 0, so index 0 is seeded here).
    bool insideStrLit = !currLine_->buffer.empty() && currLine_->buffer[0] == '\"';
    size_t cmtPos = currLine_->buffer.find_first_of("\"#", 0);
    if (cmtPos != std::string::npos) {
        do {
            // An unescaped quote toggles literal state; a '#' outside a
            // literal is the comment start.
            // NOTE(review): the single-character backslash check does not
            // distinguish an escaped backslash before a quote (\\") — such a
            // quote is treated as escaped; confirm this matches the string
            // grammar.
            if (cmtPos != 0 && currLine_->buffer[cmtPos - 1] != '\\' && currLine_->buffer[cmtPos] == '\"') {
                insideStrLit = !insideStrLit;
            } else if (currLine_->buffer[cmtPos] == PARSE_COMMENT_MARKER && !insideStrLit) {
                break;
            }
        } while ((cmtPos = currLine_->buffer.find_first_of("\"#", cmtPos + 1)) != std::string::npos);
    }

    // Truncate the logical line at the comment marker, if one was found.
    if (cmtPos != std::string::npos) {
        currLine_->end = cmtPos;
    }

    // Drop trailing whitespace left in front of the (removed) comment.
    while (currLine_->end > currLine_->pos && isspace(currLine_->buffer[currLine_->end - 1]) != 0) {
        --(currLine_->end);
    }

    LOG(DEBUG, ASSEMBLER) << "comments removed (line " << lines_.size() << "): "
                          << std::string_view(&*(currLine_->buffer.begin() + currLine_->pos),
                                              currLine_->end - currLine_->pos);
}
384
SkipSpace()385 void Lexer::SkipSpace()
386 {
387 while (!Eol() && isspace(currLine_->buffer[currLine_->pos]) != 0) {
388 ++(currLine_->pos);
389 }
390 }
391
/* Process the current line in three ordered phases:
 * strip comments, skip leading whitespace, then split into tokens. */
void Lexer::AnalyzeLine()
{
    LexPreprocess();

    SkipSpace();

    LexTokens();
}
400
401 /*-------------------------------*/
402
403 } // namespace ark::pandasm
404