• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2021-2025 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "lexer.h"
17 
18 namespace ark::pandasm {
19 
20 /*-------------------------------*/
21 
22 /* Is this a delimiter ? */
FindDelim(char c)23 Token::Type FindDelim(char c)
24 {
25     /* The map of delimiters */
26     static const std::unordered_map<char, Token::Type> DELIM = {{',', Token::Type::DEL_COMMA},
27                                                                 {':', Token::Type::DEL_COLON},
28                                                                 {'{', Token::Type::DEL_BRACE_L},
29                                                                 {'}', Token::Type::DEL_BRACE_R},
30                                                                 {'(', Token::Type::DEL_BRACKET_L},
31                                                                 {')', Token::Type::DEL_BRACKET_R},
32                                                                 {'<', Token::Type::DEL_LT},
33                                                                 {'>', Token::Type::DEL_GT},
34                                                                 {'=', Token::Type::DEL_EQ},
35                                                                 {'[', Token::Type::DEL_SQUARE_BRACKET_L},
36                                                                 {']', Token::Type::DEL_SQUARE_BRACKET_R}};
37 
38     auto iter = DELIM.find(c);
39     if (iter == DELIM.end()) {
40         return Token::Type::ID_BAD;
41     }
42 
43     return DELIM.at(c);
44 }
45 
FindOperation(std::string_view s)46 Token::Type FindOperation(std::string_view s)
47 {
48     /* Generate the map of OPERATIONS from ISA: */
49     static const std::unordered_map<std::string_view, Token::Type> OPERATIONS = {
50 // NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
51 #define OPLIST(inst_code, name, optype, width, flags, dst_idx, use_idxs, prof_size) \
52     {std::string_view(name), Token::Type::ID_OP_##inst_code},
53         PANDA_INSTRUCTION_LIST(OPLIST)
54 #undef OPLIST
55     };
56 
57     auto iter = OPERATIONS.find(s);
58     if (iter == OPERATIONS.end()) {
59         return Token::Type::ID_BAD;
60     }
61 
62     return OPERATIONS.at(s);
63 }
64 
Findkeyword(std::string_view s)65 Token::Type Findkeyword(std::string_view s)
66 {
67     /* Generate the map of KEYWORDS: */
68     static const std::unordered_map<std::string_view, Token::Type> KEYWORDS = {
69 // NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
70 #define KEYWORDS(name, inst_code) {std::string_view(name), Token::Type::ID_##inst_code},
71         KEYWORDS_LIST(KEYWORDS)
72 #undef KEYWORDS
73     };
74 
75     auto iter = KEYWORDS.find(s);
76     if (iter == KEYWORDS.end()) {
77         return Token::Type::ID_BAD;
78     }
79 
80     return KEYWORDS.at(s);
81 }
82 
83 // CC-OFFNXT(huge_method[C++], G.FUN.01-CPP) big switch case
TokenTypeWhat(Token::Type t)84 std::string_view TokenTypeWhat(Token::Type t)
85 {
86     if (t >= Token::Type::OPERATION && t < Token::Type::KEYWORD) {
87         return "OPERATION";
88     }
89 
90     if (t >= Token::Type::KEYWORD) {
91         return "KEYWORD";
92     }
93 
94     switch (t) {
95         case Token::Type::ID_BAD: {
96             return "ID_BAD";
97         }
98         case Token::Type::DEL_COMMA: {
99             return "DEL_COMMA";
100         }
101         case Token::Type::DEL_COLON: {
102             return "DEL_COLON";
103         }
104         case Token::Type::DEL_BRACE_L: {
105             return "DEL_BRACE_L";
106         }
107         case Token::Type::DEL_BRACE_R: {
108             return "DEL_BRACE_R";
109         }
110         case Token::Type::DEL_BRACKET_L: {
111             return "DEL_BRACKET_L";
112         }
113         case Token::Type::DEL_BRACKET_R: {
114             return "DEL_BRACKET_R";
115         }
116         case Token::Type::DEL_SQUARE_BRACKET_L: {
117             return "DEL_SQUARE_BRACKET_L";
118         }
119         case Token::Type::DEL_SQUARE_BRACKET_R: {
120             return "DEL_SQUARE_BRACKET_R";
121         }
122         case Token::Type::DEL_GT: {
123             return "DEL_GT";
124         }
125         case Token::Type::DEL_LT: {
126             return "DEL_LT";
127         }
128         case Token::Type::DEL_EQ: {
129             return "DEL_EQ";
130         }
131         case Token::Type::DEL_DOT: {
132             return "DEL_DOT";
133         }
134         case Token::Type::ID: {
135             return "ID";
136         }
137         case Token::Type::ID_STRING: {
138             return "ID_STRING";
139         }
140         default:
141             return "NONE";
142     }
143 }
144 
/* True when the character opens or closes a string literal. */
static bool IsQuote(char c)
{
    constexpr char QUOTE_MARK = '"';
    return c == QUOTE_MARK;
}
149 
/* Default constructor: only emits a debug trace; all members are default-initialized. */
Lexer::Lexer()
{
    LOG(DEBUG, ASSEMBLER) << "element of class Lexer initialized";
}
154 
/* Destructor: only emits a debug trace; members clean up via their own destructors. */
Lexer::~Lexer()
{
    LOG(DEBUG, ASSEMBLER) << "element of class Lexer destructed";
}
159 
/*
 * Tokenize one source line.
 * The line is appended to lines_ and becomes the current line; AnalyzeLine()
 * strips comments and splits the remainder into tokens.
 * Returns a pair of (tokens of this line, current error state err_).
 */
Tokens Lexer::TokenizeString(const std::string &sourceStr)
{
    LOG(DEBUG, ASSEMBLER) << "started tokenizing of line " << (lines_.size() + 1) << ": ";

    lines_.emplace_back(sourceStr);

    // All Lex* helpers operate through this pointer to the freshly added line.
    currLine_ = &lines_.back();

    LOG(DEBUG, ASSEMBLER) << std::string_view(&*(currLine_->buffer.begin() + currLine_->pos),
                                              currLine_->end - currLine_->pos);

    AnalyzeLine();

    LOG(DEBUG, ASSEMBLER) << "tokenization of line " << lines_.size() << " is successful";
    LOG(DEBUG, ASSEMBLER) << "         tokens identified: ";

    // Debug dump of every recognized token and its classified type.
    for (const auto &fI : lines_.back().tokens) {
        LOG(DEBUG, ASSEMBLER) << "\n                           "
                              << std::string_view(&*(fI.wholeLine.begin() + fI.boundLeft), fI.boundRight - fI.boundLeft)
                              << " (type: " << TokenTypeWhat(fI.type) << ")";

        LOG(DEBUG, ASSEMBLER);
        LOG(DEBUG, ASSEMBLER);
    }
    // NOTE: the token vector is copied into the returned pair.
    return std::pair<std::vector<Token>, Error>(lines_.back().tokens, err_);
}
186 
/* End of line? */
bool Lexer::Eol() const
{
    // `end` is the logical end of the line (it may have been pulled back past
    // a stripped comment and trailing whitespace), not buffer.size().
    return currLine_->pos == currLine_->end;
}
192 
193 /* Return the type of token */
LexGetType(size_t beg,size_t end) const194 Token::Type Lexer::LexGetType(size_t beg, size_t end) const
195 {
196     if (FindDelim(currLine_->buffer[beg]) != Token::Type::ID_BAD) { /* delimiter */
197         return FindDelim(currLine_->buffer[beg]);
198     }
199 
200     std::string_view p(&*(currLine_->buffer.begin() + beg), end - beg);
201 
202     Token::Type type = Findkeyword(p);
203     if (type != Token::Type::ID_BAD) {
204         return type;
205     }
206 
207     type = FindOperation(p);
208     if (type != Token::Type::ID_BAD) {
209         return type;
210     }
211 
212     if (IsQuote(currLine_->buffer[beg])) {
213         return Token::Type::ID_STRING;
214     }
215 
216     return Token::Type::ID; /* other */
217 }
218 
/*
 * Handle string literal.
 * Scans from the opening quote at currLine_->pos towards the matching closing
 * quote, honouring backslash escapes. On success the cursor is left one past
 * the closing quote and true is returned; on a missing terminator err_ is set
 * and false is returned.
 */
bool Lexer::LexString()
{
    bool isEscapeSeq = false;
    char quote = currLine_->buffer[currLine_->pos];  // the opening quote character
    size_t begin = currLine_->pos;                   // remembered for error reporting
    while (!Eol()) {
        ++(currLine_->pos);

        char c = currLine_->buffer[currLine_->pos];

        // The character right after a backslash is consumed verbatim,
        // so an escaped quote does not terminate the literal.
        if (isEscapeSeq) {
            isEscapeSeq = false;
            continue;
        }

        if (c == '\\') {
            isEscapeSeq = true;
        }

        if (c == quote) {
            break;
        }
    }

    // If the loop stopped at end-of-line without finding the closing quote,
    // report an unterminated string literal.
    // NOTE(review): at Eol this reads buffer[end]; presumably end == buffer.size()
    // here because LexPreprocess does not truncate inside a literal — confirm.
    if (currLine_->buffer[currLine_->pos] != quote) {
        err_ = Error(std::string("Missing terminating ") + quote + " character", 0,
                     Error::ErrorType::ERR_STRING_MISSING_TERMINATING_CHARACTER, "", begin, currLine_->pos,
                     currLine_->buffer);
        return false;
    }

    // Step past the closing quote so the caller continues after the literal.
    ++(currLine_->pos);

    return true;
}
255 
IsAngleBracketInFunctionName(char c,Line * currLine)256 bool Lexer::IsAngleBracketInFunctionName(char c, Line *currLine)
257 {
258     // <get> and <set> are used for mangling function name for setter and getter
259     // ensure "<" and ">" are only valid for function name:
260     // .function return_type <get>...(...)
261 
262     // CC-OFFNXT(G.NAM.03-CPP) project code style
263     constexpr size_t FUNCTION_KEY_WORD_OFFSET = 2;
264     size_t currTokenSize = currLine->tokens.size();
265     if (currTokenSize < FUNCTION_KEY_WORD_OFFSET) {
266         return false;
267     }
268     bool isManglingName = (FindDelim(c) == Token::Type::DEL_LT || FindDelim(c) == Token::Type::DEL_GT);
269     return currLine->tokens[currTokenSize - FUNCTION_KEY_WORD_OFFSET].type == Token::Type::ID_FUN && isManglingName;
270 }
271 
EatSpace()272 void Lexer::EatSpace()
273 {
274     while (currLine_->end > currLine_->pos && isspace(currLine_->buffer[currLine_->end - 1]) != 0) {
275         --(currLine_->end);
276     }
277 
278     while (isspace(currLine_->buffer[currLine_->pos]) != 0 && !Eol()) {
279         ++(currLine_->pos);
280     }
281 }
282 
/*
 * Lookahead used while scanning an identifier-like token: skips over a run of
 * square brackets (e.g. array-type suffixes) and a single angle bracket that
 * belongs to a mangled accessor name (<get>/<set>). The cursor is advanced
 * only when the lookahead position still lies inside the token (not on
 * whitespace and not at end of line).
 */
void Lexer::HandleBrackets()
{
    size_t position = currLine_->pos;
    // Consume consecutive '[' / ']' characters.
    while (FindDelim(currLine_->buffer[position]) == Token::Type::DEL_SQUARE_BRACKET_L ||
           FindDelim(currLine_->buffer[position]) == Token::Type::DEL_SQUARE_BRACKET_R) {
        position++;
    }
    // '<' / '>' count as part of the token only inside a mangled function name.
    if (IsAngleBracketInFunctionName(currLine_->buffer[position], currLine_)) {
        position++;
    }
    // Commit the lookahead only if it did not land on whitespace or line end.
    if (isspace(currLine_->buffer[position]) == 0 && (position != currLine_->end)) {
        currLine_->pos = position;
    }
}
297 
298 /*
299  * Tokens handling: set a corresponding
300  * elements bound_left and bound_right of the array tokens
301  * to the first and last characters of a corresponding token.
302  *
303  *                                                  bound_r1   bound_r2    bound_r3
304  *                                                  |          |           |
305  *                                                  v          v           v
306  *       token1 token2 token3 ...             token1     token2      token3 ...
307  *                                       =>   ^          ^           ^
308  *                                            |          |           |
309  *    bound1    bound2    bound3 ...          bound_l1   bound_l2    bound_l3 ...
310  *
311  */
LexTokens()312 void Lexer::LexTokens()
313 {
314     if (Eol()) {
315         return;
316     }
317 
318     LOG(DEBUG, ASSEMBLER) << "token search started (line " << lines_.size() << "): "
319                           << std::string_view(&*(currLine_->buffer.begin() + currLine_->pos),
320                                               currLine_->end - currLine_->pos);
321     EatSpace();
322     size_t boundRight;
323     size_t boundLeft;
324 
325     while (!Eol()) {
326         boundLeft = currLine_->pos;
327 
328         if (FindDelim(currLine_->buffer[currLine_->pos]) != Token::Type::ID_BAD) {
329             ++(currLine_->pos);
330         } else if (IsQuote(currLine_->buffer[currLine_->pos])) {
331             if (!LexString()) {
332                 return;
333             }
334         } else {
335             LexBadTokens();
336         }
337 
338         boundRight = currLine_->pos;
339 
340         LOG(DEBUG, ASSEMBLER) << "token identified (line " << lines_.size() << ", "
341                               << "token " << currLine_->tokens.size() + 1 << "): "
342                               << std::string_view(&*(currLine_->buffer.begin() + boundLeft), boundRight - boundLeft)
343                               << " ("
344                               << "type: " << TokenTypeWhat(LexGetType(boundLeft, boundRight)) << ")";
345 
346         currLine_->tokens.emplace_back(boundLeft, boundRight, LexGetType(boundLeft, boundRight), currLine_->buffer);
347 
348         EatSpace();
349     }
350 
351     LOG(DEBUG, ASSEMBLER) << "all tokens identified (line " << lines_.size() << ")";
352 }
353 
/*
 * Consume a run of non-delimiter, non-whitespace characters (an identifier,
 * keyword or mnemonic), letting HandleBrackets() pull in bracket characters
 * that legitimately belong to the token.
 */
void Lexer::LexBadTokens()
{
    while (!Eol() && FindDelim(currLine_->buffer[currLine_->pos]) == Token::Type::ID_BAD &&
           isspace(currLine_->buffer[currLine_->pos]) == 0) {
        ++(currLine_->pos);
        HandleBrackets();
    }
}
362 
363 /*
364  * Ignore comments:
365  * find PARSE_COMMENT_MARKER and move line->end
366  * to another position (next after the last character of the last
367  * significant (this is no a comment) element in a current
368  * line: line->buffer).
369  *
370  * Ex:
371  *   [Label:] operation operand[,operand] [# comment]
372  *
373  *   L1: mov v0, v1 # moving!        L1: mov v0, v1 # moving!
374  *                          ^   =>                 ^
375  *                          |                      |
376  *                         end                    end
377  */
void Lexer::LexPreprocess()
{
    LOG(DEBUG, ASSEMBLER) << "started removing comments (line " << lines_.size() << "): "
                          << std::string_view(&*(currLine_->buffer.begin() + currLine_->pos),
                                              currLine_->end - currLine_->pos);

    // Searching for comment marker located outside of string literals.
    bool insideStrLit = !currLine_->buffer.empty() && currLine_->buffer[0] == '\"';
    size_t cmtPos = currLine_->buffer.find_first_of("\"#", 0);
    if (cmtPos != std::string::npos) {
        do {
            // An unescaped quote toggles the "inside string literal" state;
            // a '#' starts a comment only while outside any literal.
            if (cmtPos != 0 && currLine_->buffer[cmtPos - 1] != '\\' && currLine_->buffer[cmtPos] == '\"') {
                insideStrLit = !insideStrLit;
            } else if (currLine_->buffer[cmtPos] == PARSE_COMMENT_MARKER && !insideStrLit) {
                break;
            }
        } while ((cmtPos = currLine_->buffer.find_first_of("\"#", cmtPos + 1)) != std::string::npos);
    }

    // Truncate the logical line at the comment marker, if one was found.
    if (cmtPos != std::string::npos) {
        currLine_->end = cmtPos;
    }

    // Drop trailing whitespace that preceded the (now removed) comment.
    while (currLine_->end > currLine_->pos && isspace(currLine_->buffer[currLine_->end - 1]) != 0) {
        --(currLine_->end);
    }

    LOG(DEBUG, ASSEMBLER) << "comments removed (line " << lines_.size() << "): "
                          << std::string_view(&*(currLine_->buffer.begin() + currLine_->pos),
                                              currLine_->end - currLine_->pos);
}
409 
SkipSpace()410 void Lexer::SkipSpace()
411 {
412     while (!Eol() && isspace(currLine_->buffer[currLine_->pos]) != 0) {
413         ++(currLine_->pos);
414     }
415 }
416 
/*
 * Full pipeline for one line: strip comments (LexPreprocess), skip leading
 * whitespace, then split the remainder into tokens (LexTokens).
 */
void Lexer::AnalyzeLine()
{
    LexPreprocess();

    SkipSpace();

    LexTokens();
}
425 
426 /*-------------------------------*/
427 
428 }  // namespace ark::pandasm
429