1 /*
2 * Copyright (c) 2021-2025 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "lexer.h"
17
18 namespace ark::pandasm {
19
20 /*-------------------------------*/
21
22 /* Is this a delimiter ? */
FindDelim(char c)23 Token::Type FindDelim(char c)
24 {
25 /* The map of delimiters */
26 static const std::unordered_map<char, Token::Type> DELIM = {{',', Token::Type::DEL_COMMA},
27 {':', Token::Type::DEL_COLON},
28 {'{', Token::Type::DEL_BRACE_L},
29 {'}', Token::Type::DEL_BRACE_R},
30 {'(', Token::Type::DEL_BRACKET_L},
31 {')', Token::Type::DEL_BRACKET_R},
32 {'<', Token::Type::DEL_LT},
33 {'>', Token::Type::DEL_GT},
34 {'=', Token::Type::DEL_EQ},
35 {'[', Token::Type::DEL_SQUARE_BRACKET_L},
36 {']', Token::Type::DEL_SQUARE_BRACKET_R}};
37
38 auto iter = DELIM.find(c);
39 if (iter == DELIM.end()) {
40 return Token::Type::ID_BAD;
41 }
42
43 return DELIM.at(c);
44 }
45
FindOperation(std::string_view s)46 Token::Type FindOperation(std::string_view s)
47 {
48 /* Generate the map of OPERATIONS from ISA: */
49 static const std::unordered_map<std::string_view, Token::Type> OPERATIONS = {
50 // NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
51 #define OPLIST(inst_code, name, optype, width, flags, dst_idx, use_idxs, prof_size) \
52 {std::string_view(name), Token::Type::ID_OP_##inst_code},
53 PANDA_INSTRUCTION_LIST(OPLIST)
54 #undef OPLIST
55 };
56
57 auto iter = OPERATIONS.find(s);
58 if (iter == OPERATIONS.end()) {
59 return Token::Type::ID_BAD;
60 }
61
62 return OPERATIONS.at(s);
63 }
64
Findkeyword(std::string_view s)65 Token::Type Findkeyword(std::string_view s)
66 {
67 /* Generate the map of KEYWORDS: */
68 static const std::unordered_map<std::string_view, Token::Type> KEYWORDS = {
69 // NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
70 #define KEYWORDS(name, inst_code) {std::string_view(name), Token::Type::ID_##inst_code},
71 KEYWORDS_LIST(KEYWORDS)
72 #undef KEYWORDS
73 };
74
75 auto iter = KEYWORDS.find(s);
76 if (iter == KEYWORDS.end()) {
77 return Token::Type::ID_BAD;
78 }
79
80 return KEYWORDS.at(s);
81 }
82
83 // CC-OFFNXT(huge_method[C++], G.FUN.01-CPP) big switch case
TokenTypeWhat(Token::Type t)84 std::string_view TokenTypeWhat(Token::Type t)
85 {
86 if (t >= Token::Type::OPERATION && t < Token::Type::KEYWORD) {
87 return "OPERATION";
88 }
89
90 if (t >= Token::Type::KEYWORD) {
91 return "KEYWORD";
92 }
93
94 switch (t) {
95 case Token::Type::ID_BAD: {
96 return "ID_BAD";
97 }
98 case Token::Type::DEL_COMMA: {
99 return "DEL_COMMA";
100 }
101 case Token::Type::DEL_COLON: {
102 return "DEL_COLON";
103 }
104 case Token::Type::DEL_BRACE_L: {
105 return "DEL_BRACE_L";
106 }
107 case Token::Type::DEL_BRACE_R: {
108 return "DEL_BRACE_R";
109 }
110 case Token::Type::DEL_BRACKET_L: {
111 return "DEL_BRACKET_L";
112 }
113 case Token::Type::DEL_BRACKET_R: {
114 return "DEL_BRACKET_R";
115 }
116 case Token::Type::DEL_SQUARE_BRACKET_L: {
117 return "DEL_SQUARE_BRACKET_L";
118 }
119 case Token::Type::DEL_SQUARE_BRACKET_R: {
120 return "DEL_SQUARE_BRACKET_R";
121 }
122 case Token::Type::DEL_GT: {
123 return "DEL_GT";
124 }
125 case Token::Type::DEL_LT: {
126 return "DEL_LT";
127 }
128 case Token::Type::DEL_EQ: {
129 return "DEL_EQ";
130 }
131 case Token::Type::DEL_DOT: {
132 return "DEL_DOT";
133 }
134 case Token::Type::ID: {
135 return "ID";
136 }
137 case Token::Type::ID_STRING: {
138 return "ID_STRING";
139 }
140 default:
141 return "NONE";
142 }
143 }
144
/* Does this character open/close a string literal? Only the double quote does. */
static bool IsQuote(char c)
{
    constexpr char DOUBLE_QUOTE = '"';
    return DOUBLE_QUOTE == c;
}
149
Lexer()150 Lexer::Lexer()
151 {
152 LOG(DEBUG, ASSEMBLER) << "element of class Lexer initialized";
153 }
154
~Lexer()155 Lexer::~Lexer()
156 {
157 LOG(DEBUG, ASSEMBLER) << "element of class Lexer destructed";
158 }
159
TokenizeString(const std::string & sourceStr)160 Tokens Lexer::TokenizeString(const std::string &sourceStr)
161 {
162 LOG(DEBUG, ASSEMBLER) << "started tokenizing of line " << (lines_.size() + 1) << ": ";
163
164 lines_.emplace_back(sourceStr);
165
166 currLine_ = &lines_.back();
167
168 LOG(DEBUG, ASSEMBLER) << std::string_view(&*(currLine_->buffer.begin() + currLine_->pos),
169 currLine_->end - currLine_->pos);
170
171 AnalyzeLine();
172
173 LOG(DEBUG, ASSEMBLER) << "tokenization of line " << lines_.size() << " is successful";
174 LOG(DEBUG, ASSEMBLER) << " tokens identified: ";
175
176 for (const auto &fI : lines_.back().tokens) {
177 LOG(DEBUG, ASSEMBLER) << "\n "
178 << std::string_view(&*(fI.wholeLine.begin() + fI.boundLeft), fI.boundRight - fI.boundLeft)
179 << " (type: " << TokenTypeWhat(fI.type) << ")";
180
181 LOG(DEBUG, ASSEMBLER);
182 LOG(DEBUG, ASSEMBLER);
183 }
184 return std::pair<std::vector<Token>, Error>(lines_.back().tokens, err_);
185 }
186
187 /* End of line? */
Eol() const188 bool Lexer::Eol() const
189 {
190 return currLine_->pos == currLine_->end;
191 }
192
193 /* Return the type of token */
LexGetType(size_t beg,size_t end) const194 Token::Type Lexer::LexGetType(size_t beg, size_t end) const
195 {
196 if (FindDelim(currLine_->buffer[beg]) != Token::Type::ID_BAD) { /* delimiter */
197 return FindDelim(currLine_->buffer[beg]);
198 }
199
200 std::string_view p(&*(currLine_->buffer.begin() + beg), end - beg);
201
202 Token::Type type = Findkeyword(p);
203 if (type != Token::Type::ID_BAD) {
204 return type;
205 }
206
207 type = FindOperation(p);
208 if (type != Token::Type::ID_BAD) {
209 return type;
210 }
211
212 if (IsQuote(currLine_->buffer[beg])) {
213 return Token::Type::ID_STRING;
214 }
215
216 return Token::Type::ID; /* other */
217 }
218
219 /* Handle string literal */
LexString()220 bool Lexer::LexString()
221 {
222 bool isEscapeSeq = false;
223 char quote = currLine_->buffer[currLine_->pos];
224 size_t begin = currLine_->pos;
225 while (!Eol()) {
226 ++(currLine_->pos);
227
228 char c = currLine_->buffer[currLine_->pos];
229
230 if (isEscapeSeq) {
231 isEscapeSeq = false;
232 continue;
233 }
234
235 if (c == '\\') {
236 isEscapeSeq = true;
237 }
238
239 if (c == quote) {
240 break;
241 }
242 }
243
244 if (currLine_->buffer[currLine_->pos] != quote) {
245 err_ = Error(std::string("Missing terminating ") + quote + " character", 0,
246 Error::ErrorType::ERR_STRING_MISSING_TERMINATING_CHARACTER, "", begin, currLine_->pos,
247 currLine_->buffer);
248 return false;
249 }
250
251 ++(currLine_->pos);
252
253 return true;
254 }
255
IsAngleBracketInFunctionName(char c,Line * currLine)256 bool Lexer::IsAngleBracketInFunctionName(char c, Line *currLine)
257 {
258 // <get> and <set> are used for mangling function name for setter and getter
259 // ensure "<" and ">" are only valid for function name:
260 // .function return_type <get>...(...)
261
262 // CC-OFFNXT(G.NAM.03-CPP) project code style
263 constexpr size_t FUNCTION_KEY_WORD_OFFSET = 2;
264 size_t currTokenSize = currLine->tokens.size();
265 if (currTokenSize < FUNCTION_KEY_WORD_OFFSET) {
266 return false;
267 }
268 bool isManglingName = (FindDelim(c) == Token::Type::DEL_LT || FindDelim(c) == Token::Type::DEL_GT);
269 return currLine->tokens[currTokenSize - FUNCTION_KEY_WORD_OFFSET].type == Token::Type::ID_FUN && isManglingName;
270 }
271
EatSpace()272 void Lexer::EatSpace()
273 {
274 while (currLine_->end > currLine_->pos && isspace(currLine_->buffer[currLine_->end - 1]) != 0) {
275 --(currLine_->end);
276 }
277
278 while (isspace(currLine_->buffer[currLine_->pos]) != 0 && !Eol()) {
279 ++(currLine_->pos);
280 }
281 }
282
HandleBrackets()283 void Lexer::HandleBrackets()
284 {
285 size_t position = currLine_->pos;
286 while (FindDelim(currLine_->buffer[position]) == Token::Type::DEL_SQUARE_BRACKET_L ||
287 FindDelim(currLine_->buffer[position]) == Token::Type::DEL_SQUARE_BRACKET_R) {
288 position++;
289 }
290 if (IsAngleBracketInFunctionName(currLine_->buffer[position], currLine_)) {
291 position++;
292 }
293 if (isspace(currLine_->buffer[position]) == 0 && (position != currLine_->end)) {
294 currLine_->pos = position;
295 }
296 }
297
298 /*
299 * Tokens handling: set a corresponding
300 * elements bound_left and bound_right of the array tokens
301 * to the first and last characters of a corresponding token.
302 *
303 * bound_r1 bound_r2 bound_r3
304 * | | |
305 * v v v
306 * token1 token2 token3 ... token1 token2 token3 ...
307 * => ^ ^ ^
308 * | | |
309 * bound1 bound2 bound3 ... bound_l1 bound_l2 bound_l3 ...
310 *
311 */
LexTokens()312 void Lexer::LexTokens()
313 {
314 if (Eol()) {
315 return;
316 }
317
318 LOG(DEBUG, ASSEMBLER) << "token search started (line " << lines_.size() << "): "
319 << std::string_view(&*(currLine_->buffer.begin() + currLine_->pos),
320 currLine_->end - currLine_->pos);
321 EatSpace();
322 size_t boundRight;
323 size_t boundLeft;
324
325 while (!Eol()) {
326 boundLeft = currLine_->pos;
327
328 if (FindDelim(currLine_->buffer[currLine_->pos]) != Token::Type::ID_BAD) {
329 ++(currLine_->pos);
330 } else if (IsQuote(currLine_->buffer[currLine_->pos])) {
331 if (!LexString()) {
332 return;
333 }
334 } else {
335 LexBadTokens();
336 }
337
338 boundRight = currLine_->pos;
339
340 LOG(DEBUG, ASSEMBLER) << "token identified (line " << lines_.size() << ", "
341 << "token " << currLine_->tokens.size() + 1 << "): "
342 << std::string_view(&*(currLine_->buffer.begin() + boundLeft), boundRight - boundLeft)
343 << " ("
344 << "type: " << TokenTypeWhat(LexGetType(boundLeft, boundRight)) << ")";
345
346 currLine_->tokens.emplace_back(boundLeft, boundRight, LexGetType(boundLeft, boundRight), currLine_->buffer);
347
348 EatSpace();
349 }
350
351 LOG(DEBUG, ASSEMBLER) << "all tokens identified (line " << lines_.size() << ")";
352 }
353
LexBadTokens()354 void Lexer::LexBadTokens()
355 {
356 while (!Eol() && FindDelim(currLine_->buffer[currLine_->pos]) == Token::Type::ID_BAD &&
357 isspace(currLine_->buffer[currLine_->pos]) == 0) {
358 ++(currLine_->pos);
359 HandleBrackets();
360 }
361 }
362
363 /*
364 * Ignore comments:
365 * find PARSE_COMMENT_MARKER and move line->end
366 * to another position (next after the last character of the last
367 * significant (this is no a comment) element in a current
368 * line: line->buffer).
369 *
370 * Ex:
371 * [Label:] operation operand[,operand] [# comment]
372 *
373 * L1: mov v0, v1 # moving! L1: mov v0, v1 # moving!
374 * ^ => ^
375 * | |
376 * end end
377 */
LexPreprocess()378 void Lexer::LexPreprocess()
379 {
380 LOG(DEBUG, ASSEMBLER) << "started removing comments (line " << lines_.size() << "): "
381 << std::string_view(&*(currLine_->buffer.begin() + currLine_->pos),
382 currLine_->end - currLine_->pos);
383
384 // Searching for comment marker located outside of string literals.
385 bool insideStrLit = !currLine_->buffer.empty() && currLine_->buffer[0] == '\"';
386 size_t cmtPos = currLine_->buffer.find_first_of("\"#", 0);
387 if (cmtPos != std::string::npos) {
388 do {
389 if (cmtPos != 0 && currLine_->buffer[cmtPos - 1] != '\\' && currLine_->buffer[cmtPos] == '\"') {
390 insideStrLit = !insideStrLit;
391 } else if (currLine_->buffer[cmtPos] == PARSE_COMMENT_MARKER && !insideStrLit) {
392 break;
393 }
394 } while ((cmtPos = currLine_->buffer.find_first_of("\"#", cmtPos + 1)) != std::string::npos);
395 }
396
397 if (cmtPos != std::string::npos) {
398 currLine_->end = cmtPos;
399 }
400
401 while (currLine_->end > currLine_->pos && isspace(currLine_->buffer[currLine_->end - 1]) != 0) {
402 --(currLine_->end);
403 }
404
405 LOG(DEBUG, ASSEMBLER) << "comments removed (line " << lines_.size() << "): "
406 << std::string_view(&*(currLine_->buffer.begin() + currLine_->pos),
407 currLine_->end - currLine_->pos);
408 }
409
SkipSpace()410 void Lexer::SkipSpace()
411 {
412 while (!Eol() && isspace(currLine_->buffer[currLine_->pos]) != 0) {
413 ++(currLine_->pos);
414 }
415 }
416
AnalyzeLine()417 void Lexer::AnalyzeLine()
418 {
419 LexPreprocess();
420
421 SkipSpace();
422
423 LexTokens();
424 }
425
426 /*-------------------------------*/
427
428 } // namespace ark::pandasm
429