• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "lexer.h"
17 
18 namespace ark::pandasm {
19 
20 /*-------------------------------*/
21 
22 /* Is this a delimiter ? */
FindDelim(char c)23 Token::Type FindDelim(char c)
24 {
25     /* The map of delimiters */
26     static const std::unordered_map<char, Token::Type> DELIM = {{',', Token::Type::DEL_COMMA},
27                                                                 {':', Token::Type::DEL_COLON},
28                                                                 {'{', Token::Type::DEL_BRACE_L},
29                                                                 {'}', Token::Type::DEL_BRACE_R},
30                                                                 {'(', Token::Type::DEL_BRACKET_L},
31                                                                 {')', Token::Type::DEL_BRACKET_R},
32                                                                 {'<', Token::Type::DEL_LT},
33                                                                 {'>', Token::Type::DEL_GT},
34                                                                 {'=', Token::Type::DEL_EQ},
35                                                                 {'[', Token::Type::DEL_SQUARE_BRACKET_L},
36                                                                 {']', Token::Type::DEL_SQUARE_BRACKET_R}};
37 
38     auto iter = DELIM.find(c);
39     if (iter == DELIM.end()) {
40         return Token::Type::ID_BAD;
41     }
42 
43     return DELIM.at(c);
44 }
45 
FindOperation(std::string_view s)46 Token::Type FindOperation(std::string_view s)
47 {
48     /* Generate the map of OPERATIONS from ISA: */
49     static const std::unordered_map<std::string_view, Token::Type> OPERATIONS = {
50 // NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
51 #define OPLIST(inst_code, name, optype, width, flags, dst_idx, use_idxs, prof_size) \
52     {std::string_view(name), Token::Type::ID_OP_##inst_code},
53         PANDA_INSTRUCTION_LIST(OPLIST)
54 #undef OPLIST
55     };
56 
57     auto iter = OPERATIONS.find(s);
58     if (iter == OPERATIONS.end()) {
59         return Token::Type::ID_BAD;
60     }
61 
62     return OPERATIONS.at(s);
63 }
64 
Findkeyword(std::string_view s)65 Token::Type Findkeyword(std::string_view s)
66 {
67     /* Generate the map of KEYWORDS: */
68     static const std::unordered_map<std::string_view, Token::Type> KEYWORDS = {
69 // NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
70 #define KEYWORDS(name, inst_code) {std::string_view(name), Token::Type::ID_##inst_code},
71         KEYWORDS_LIST(KEYWORDS)
72 #undef KEYWORDS
73     };
74 
75     auto iter = KEYWORDS.find(s);
76     if (iter == KEYWORDS.end()) {
77         return Token::Type::ID_BAD;
78     }
79 
80     return KEYWORDS.at(s);
81 }
82 
83 // CC-OFFNXT(huge_method[C++], G.FUN.01-CPP) big switch case
TokenTypeWhat(Token::Type t)84 std::string_view TokenTypeWhat(Token::Type t)
85 {
86     if (t >= Token::Type::OPERATION && t < Token::Type::KEYWORD) {
87         return "OPERATION";
88     }
89 
90     if (t >= Token::Type::KEYWORD) {
91         return "KEYWORD";
92     }
93 
94     switch (t) {
95         case Token::Type::ID_BAD: {
96             return "ID_BAD";
97         }
98         case Token::Type::DEL_COMMA: {
99             return "DEL_COMMA";
100         }
101         case Token::Type::DEL_COLON: {
102             return "DEL_COLON";
103         }
104         case Token::Type::DEL_BRACE_L: {
105             return "DEL_BRACE_L";
106         }
107         case Token::Type::DEL_BRACE_R: {
108             return "DEL_BRACE_R";
109         }
110         case Token::Type::DEL_BRACKET_L: {
111             return "DEL_BRACKET_L";
112         }
113         case Token::Type::DEL_BRACKET_R: {
114             return "DEL_BRACKET_R";
115         }
116         case Token::Type::DEL_SQUARE_BRACKET_L: {
117             return "DEL_SQUARE_BRACKET_L";
118         }
119         case Token::Type::DEL_SQUARE_BRACKET_R: {
120             return "DEL_SQUARE_BRACKET_R";
121         }
122         case Token::Type::DEL_GT: {
123             return "DEL_GT";
124         }
125         case Token::Type::DEL_LT: {
126             return "DEL_LT";
127         }
128         case Token::Type::DEL_EQ: {
129             return "DEL_EQ";
130         }
131         case Token::Type::DEL_DOT: {
132             return "DEL_DOT";
133         }
134         case Token::Type::ID: {
135             return "ID";
136         }
137         case Token::Type::ID_STRING: {
138             return "ID_STRING";
139         }
140         default:
141             return "NONE";
142     }
143 }
144 
/* True iff the character delimits a string literal. */
static bool IsQuote(char c)
{
    constexpr char STRING_DELIM = '"';
    return c == STRING_DELIM;
}
149 
/* Construct a lexer; only emits a debug trace, no other setup is done here. */
Lexer::Lexer()
{
    LOG(DEBUG, ASSEMBLER) << "element of class Lexer initialized";
}
154 
/* Destroy the lexer; only emits a debug trace (members clean up themselves). */
Lexer::~Lexer()
{
    LOG(DEBUG, ASSEMBLER) << "element of class Lexer destructed";
}
159 
TokenizeString(const std::string & sourceStr)160 Tokens Lexer::TokenizeString(const std::string &sourceStr)
161 {
162     LOG(DEBUG, ASSEMBLER) << "started tokenizing of line " << (lines_.size() + 1) << ": ";
163 
164     lines_.emplace_back(sourceStr);
165 
166     currLine_ = &lines_.back();
167 
168     LOG(DEBUG, ASSEMBLER) << std::string_view(&*(currLine_->buffer.begin() + currLine_->pos),
169                                               currLine_->end - currLine_->pos);
170 
171     AnalyzeLine();
172 
173     LOG(DEBUG, ASSEMBLER) << "tokenization of line " << lines_.size() << " is successful";
174     LOG(DEBUG, ASSEMBLER) << "         tokens identified: ";
175 
176     for (const auto &fI : lines_.back().tokens) {
177         LOG(DEBUG, ASSEMBLER) << "\n                           "
178                               << std::string_view(&*(fI.wholeLine.begin() + fI.boundLeft), fI.boundRight - fI.boundLeft)
179                               << " (type: " << TokenTypeWhat(fI.type) << ")";
180 
181         LOG(DEBUG, ASSEMBLER);
182         LOG(DEBUG, ASSEMBLER);
183     }
184     return std::pair<std::vector<Token>, Error>(lines_.back().tokens, err_);
185 }
186 
187 /* End of line? */
Eol() const188 bool Lexer::Eol() const
189 {
190     return currLine_->pos == currLine_->end;
191 }
192 
193 /* Return the type of token */
LexGetType(size_t beg,size_t end) const194 Token::Type Lexer::LexGetType(size_t beg, size_t end) const
195 {
196     if (FindDelim(currLine_->buffer[beg]) != Token::Type::ID_BAD) { /* delimiter */
197         return FindDelim(currLine_->buffer[beg]);
198     }
199 
200     std::string_view p(&*(currLine_->buffer.begin() + beg), end - beg);
201 
202     Token::Type type = Findkeyword(p);
203     if (type != Token::Type::ID_BAD) {
204         return type;
205     }
206 
207     type = FindOperation(p);
208     if (type != Token::Type::ID_BAD) {
209         return type;
210     }
211 
212     if (IsQuote(currLine_->buffer[beg])) {
213         return Token::Type::ID_STRING;
214     }
215 
216     return Token::Type::ID; /* other */
217 }
218 
/* Handle a string literal: advance pos from the opening quote up to and past
 * the matching closing quote.  Backslash escapes are honored, so an escaped
 * quote (\") does not terminate the literal.  Returns false and records
 * ERR_STRING_MISSING_TERMINATING_CHARACTER if the line ends first. */
bool Lexer::LexString()
{
    bool isEscapeSeq = false;
    char quote = currLine_->buffer[currLine_->pos];  // opening delimiter to match
    size_t begin = currLine_->pos;                   // kept for error reporting
    while (!Eol()) {
        ++(currLine_->pos);

        char c = currLine_->buffer[currLine_->pos];

        // The character following a backslash is consumed verbatim,
        // before the quote/backslash checks below can see it.
        if (isEscapeSeq) {
            isEscapeSeq = false;
            continue;
        }

        if (c == '\\') {
            isEscapeSeq = true;
        }

        if (c == quote) {
            break;
        }
    }

    // NOTE(review): if the loop stops via Eol(), this reads buffer[pos] with
    // pos == end.  That is safe while end <= buffer.size() (std::string's
    // operator[] at size() yields '\0') — confirm callers uphold that bound.
    if (currLine_->buffer[currLine_->pos] != quote) {
        err_ = Error(std::string("Missing terminating ") + quote + " character", 0,
                     Error::ErrorType::ERR_STRING_MISSING_TERMINATING_CHARACTER, "", begin, currLine_->pos,
                     currLine_->buffer);
        return false;
    }

    /* Step past the closing quote so the token includes both delimiters. */
    ++(currLine_->pos);

    return true;
}
255 
256 /*
257  * Tokens handling: set a corresponding
258  * elements bound_left and bound_right of the array tokens
259  * to the first and last characters of a corresponding token.
260  *
261  *                                                  bound_r1   bound_r2    bound_r3
262  *                                                  |          |           |
263  *                                                  v          v           v
264  *       token1 token2 token3 ...             token1     token2      token3 ...
265  *                                       =>   ^          ^           ^
266  *                                            |          |           |
267  *    bound1    bound2    bound3 ...          bound_l1   bound_l2    bound_l3 ...
268  *
269  */
LexTokens()270 void Lexer::LexTokens()
271 {
272     if (Eol()) {
273         return;
274     }
275 
276     LOG(DEBUG, ASSEMBLER) << "token search started (line " << lines_.size() << "): "
277                           << std::string_view(&*(currLine_->buffer.begin() + currLine_->pos),
278                                               currLine_->end - currLine_->pos);
279 
280     while (currLine_->end > currLine_->pos && isspace(currLine_->buffer[currLine_->end - 1]) != 0) {
281         --(currLine_->end);
282     }
283 
284     while (isspace(currLine_->buffer[currLine_->pos]) != 0 && !Eol()) {
285         ++(currLine_->pos);
286     }
287 
288     size_t boundRight;
289     size_t boundLeft;
290 
291     while (!Eol()) {
292         boundLeft = currLine_->pos;
293 
294         if (FindDelim(currLine_->buffer[currLine_->pos]) != Token::Type::ID_BAD) {
295             ++(currLine_->pos);
296         } else if (IsQuote(currLine_->buffer[currLine_->pos])) {
297             if (!LexString()) {
298                 return;
299             }
300         } else {
301             LexBadTokens();
302         }
303 
304         boundRight = currLine_->pos;
305 
306         LOG(DEBUG, ASSEMBLER) << "token identified (line " << lines_.size() << ", "
307                               << "token " << currLine_->tokens.size() + 1 << "): "
308                               << std::string_view(&*(currLine_->buffer.begin() + boundLeft), boundRight - boundLeft)
309                               << " ("
310                               << "type: " << TokenTypeWhat(LexGetType(boundLeft, boundRight)) << ")";
311 
312         currLine_->tokens.emplace_back(boundLeft, boundRight, LexGetType(boundLeft, boundRight), currLine_->buffer);
313 
314         while (isspace(currLine_->buffer[currLine_->pos]) != 0 && !Eol()) {
315             ++(currLine_->pos);
316         }
317     }
318 
319     LOG(DEBUG, ASSEMBLER) << "all tokens identified (line " << lines_.size() << ")";
320 }
321 
LexBadTokens()322 void Lexer::LexBadTokens()
323 {
324     while (!Eol() && FindDelim(currLine_->buffer[currLine_->pos]) == Token::Type::ID_BAD &&
325            isspace(currLine_->buffer[currLine_->pos]) == 0) {
326         ++(currLine_->pos);
327         size_t position = currLine_->pos;
328         while (FindDelim(currLine_->buffer[position]) == Token::Type::DEL_SQUARE_BRACKET_L ||
329                FindDelim(currLine_->buffer[position]) == Token::Type::DEL_SQUARE_BRACKET_R) {
330             position++;
331         }
332         if (isspace(currLine_->buffer[position]) == 0 && (position != currLine_->end)) {
333             currLine_->pos = position;
334         }
335     }
336 }
337 
338 /*
339  * Ignore comments:
340  * find PARSE_COMMENT_MARKER and move line->end
341  * to another position (next after the last character of the last
342  * significant (this is no a comment) element in a current
343  * line: line->buffer).
344  *
345  * Ex:
346  *   [Label:] operation operand[,operand] [# comment]
347  *
348  *   L1: mov v0, v1 # moving!        L1: mov v0, v1 # moving!
349  *                          ^   =>                 ^
350  *                          |                      |
351  *                         end                    end
352  */
LexPreprocess()353 void Lexer::LexPreprocess()
354 {
355     LOG(DEBUG, ASSEMBLER) << "started removing comments (line " << lines_.size() << "): "
356                           << std::string_view(&*(currLine_->buffer.begin() + currLine_->pos),
357                                               currLine_->end - currLine_->pos);
358 
359     // Searching for comment marker located outside of string literals.
360     bool insideStrLit = !currLine_->buffer.empty() && currLine_->buffer[0] == '\"';
361     size_t cmtPos = currLine_->buffer.find_first_of("\"#", 0);
362     if (cmtPos != std::string::npos) {
363         do {
364             if (cmtPos != 0 && currLine_->buffer[cmtPos - 1] != '\\' && currLine_->buffer[cmtPos] == '\"') {
365                 insideStrLit = !insideStrLit;
366             } else if (currLine_->buffer[cmtPos] == PARSE_COMMENT_MARKER && !insideStrLit) {
367                 break;
368             }
369         } while ((cmtPos = currLine_->buffer.find_first_of("\"#", cmtPos + 1)) != std::string::npos);
370     }
371 
372     if (cmtPos != std::string::npos) {
373         currLine_->end = cmtPos;
374     }
375 
376     while (currLine_->end > currLine_->pos && isspace(currLine_->buffer[currLine_->end - 1]) != 0) {
377         --(currLine_->end);
378     }
379 
380     LOG(DEBUG, ASSEMBLER) << "comments removed (line " << lines_.size() << "): "
381                           << std::string_view(&*(currLine_->buffer.begin() + currLine_->pos),
382                                               currLine_->end - currLine_->pos);
383 }
384 
SkipSpace()385 void Lexer::SkipSpace()
386 {
387     while (!Eol() && isspace(currLine_->buffer[currLine_->pos]) != 0) {
388         ++(currLine_->pos);
389     }
390 }
391 
/* Run the per-line pipeline, in this fixed order:
 * strip comments, skip leading whitespace, then emit tokens. */
void Lexer::AnalyzeLine()
{
    LexPreprocess();

    SkipSpace();

    LexTokens();
}
400 
401 /*-------------------------------*/
402 
403 }  // namespace ark::pandasm
404