• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2021-2023 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "lexer.h"
17 
18 namespace panda::pandasm {
19 
20 /*-------------------------------*/
21 
22 /* Is this a delimiter ? */
FindDelim(char c)23 Token::Type FindDelim(char c)
24 {
25     /* The map of delimiters */
26     static const std::unordered_map<char, Token::Type> DELIM = {{',', Token::Type::DEL_COMMA},
27                                                                 {':', Token::Type::DEL_COLON},
28                                                                 {'{', Token::Type::DEL_BRACE_L},
29                                                                 {'}', Token::Type::DEL_BRACE_R},
30                                                                 {'(', Token::Type::DEL_BRACKET_L},
31                                                                 {')', Token::Type::DEL_BRACKET_R},
32                                                                 {'<', Token::Type::DEL_LT},
33                                                                 {'>', Token::Type::DEL_GT},
34                                                                 {'=', Token::Type::DEL_EQ},
35                                                                 {'[', Token::Type::DEL_SQUARE_BRACKET_L},
36                                                                 {']', Token::Type::DEL_SQUARE_BRACKET_R}};
37 
38     auto iter = DELIM.find(c);
39     if (iter == DELIM.end()) {
40         return Token::Type::ID_BAD;
41     }
42 
43     return DELIM.at(c);
44 }
45 
FindOperation(std::string_view s)46 Token::Type FindOperation(std::string_view s)
47 {
48     /* Generate the map of OPERATIONS from ISA: */
49     static const std::unordered_map<std::string_view, Token::Type> OPERATIONS = {
50 // NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
51 #define OPLIST(inst_code, name, optype, width, flags, dst_idx, use_idxs, prof_size) \
52     {std::string_view(name), Token::Type::ID_OP_##inst_code},
53         PANDA_INSTRUCTION_LIST(OPLIST)
54 #undef OPLIST
55     };
56 
57     auto iter = OPERATIONS.find(s);
58     if (iter == OPERATIONS.end()) {
59         return Token::Type::ID_BAD;
60     }
61 
62     return OPERATIONS.at(s);
63 }
64 
Findkeyword(std::string_view s)65 Token::Type Findkeyword(std::string_view s)
66 {
67     /* Generate the map of KEYWORDS: */
68     static const std::unordered_map<std::string_view, Token::Type> KEYWORDS = {
69 // NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
70 #define KEYWORDS(name, inst_code) {std::string_view(name), Token::Type::ID_##inst_code},
71         KEYWORDS_LIST(KEYWORDS)
72 #undef KEYWORDS
73     };
74 
75     auto iter = KEYWORDS.find(s);
76     if (iter == KEYWORDS.end()) {
77         return Token::Type::ID_BAD;
78     }
79 
80     return KEYWORDS.at(s);
81 }
82 
TokenTypeWhat(Token::Type t)83 std::string_view TokenTypeWhat(Token::Type t)
84 {
85     if (t >= Token::Type::OPERATION && t < Token::Type::KEYWORD) {
86         return "OPERATION";
87     }
88 
89     if (t >= Token::Type::KEYWORD) {
90         return "KEYWORD";
91     }
92 
93     switch (t) {
94         case Token::Type::ID_BAD: {
95             return "ID_BAD";
96         }
97         case Token::Type::DEL_COMMA: {
98             return "DEL_COMMA";
99         }
100         case Token::Type::DEL_COLON: {
101             return "DEL_COLON";
102         }
103         case Token::Type::DEL_BRACE_L: {
104             return "DEL_BRACE_L";
105         }
106         case Token::Type::DEL_BRACE_R: {
107             return "DEL_BRACE_R";
108         }
109         case Token::Type::DEL_BRACKET_L: {
110             return "DEL_BRACKET_L";
111         }
112         case Token::Type::DEL_BRACKET_R: {
113             return "DEL_BRACKET_R";
114         }
115         case Token::Type::DEL_SQUARE_BRACKET_L: {
116             return "DEL_SQUARE_BRACKET_L";
117         }
118         case Token::Type::DEL_SQUARE_BRACKET_R: {
119             return "DEL_SQUARE_BRACKET_R";
120         }
121         case Token::Type::DEL_GT: {
122             return "DEL_GT";
123         }
124         case Token::Type::DEL_LT: {
125             return "DEL_LT";
126         }
127         case Token::Type::DEL_EQ: {
128             return "DEL_EQ";
129         }
130         case Token::Type::DEL_DOT: {
131             return "DEL_DOT";
132         }
133         case Token::Type::ID: {
134             return "ID";
135         }
136         case Token::Type::ID_STRING: {
137             return "ID_STRING";
138         }
139         default:
140             return "NONE";
141     }
142 }
143 
/* True when c is the double-quote character that opens/closes a string literal */
static bool IsQuote(char c)
{
    constexpr char QUOTE = '"';
    return c == QUOTE;
}
148 
/* Default constructor: only logs creation for debugging */
Lexer::Lexer()
{
    LOG(DEBUG, ASSEMBLER) << "element of class Lexer initialized";
}
153 
/* Destructor: only logs destruction for debugging */
Lexer::~Lexer()
{
    LOG(DEBUG, ASSEMBLER) << "element of class Lexer destructed";
}
158 
TokenizeString(const std::string & sourceStr)159 Tokens Lexer::TokenizeString(const std::string &sourceStr)
160 {
161     LOG(DEBUG, ASSEMBLER) << "started tokenizing of line " << (lines_.size() + 1) << ": ";
162 
163     lines_.emplace_back(sourceStr);
164 
165     currLine_ = &lines_.back();
166 
167     LOG(DEBUG, ASSEMBLER) << std::string_view(&*(currLine_->buffer.begin() + currLine_->pos),
168                                               currLine_->end - currLine_->pos);
169 
170     AnalyzeLine();
171 
172     LOG(DEBUG, ASSEMBLER) << "tokenization of line " << lines_.size() << " is successful";
173     LOG(DEBUG, ASSEMBLER) << "         tokens identified: ";
174 
175     for (const auto &fI : lines_.back().tokens) {
176         LOG(DEBUG, ASSEMBLER) << "\n                           "
177                               << std::string_view(&*(fI.wholeLine.begin() + fI.boundLeft), fI.boundRight - fI.boundLeft)
178                               << " (type: " << TokenTypeWhat(fI.type) << ")";
179 
180         LOG(DEBUG, ASSEMBLER);
181         LOG(DEBUG, ASSEMBLER);
182     }
183     return std::pair<std::vector<Token>, Error>(lines_.back().tokens, err_);
184 }
185 
/* End of line? True when the cursor has reached the (possibly comment-trimmed) end */
bool Lexer::Eol() const
{
    return currLine_->pos == currLine_->end;
}
191 
192 /* Return the type of token */
LexGetType(size_t beg,size_t end) const193 Token::Type Lexer::LexGetType(size_t beg, size_t end) const
194 {
195     if (FindDelim(currLine_->buffer[beg]) != Token::Type::ID_BAD) { /* delimiter */
196         return FindDelim(currLine_->buffer[beg]);
197     }
198 
199     std::string_view p(&*(currLine_->buffer.begin() + beg), end - beg);
200 
201     Token::Type type = Findkeyword(p);
202     if (type != Token::Type::ID_BAD) {
203         return type;
204     }
205 
206     type = FindOperation(p);
207     if (type != Token::Type::ID_BAD) {
208         return type;
209     }
210 
211     if (IsQuote(currLine_->buffer[beg])) {
212         return Token::Type::ID_STRING;
213     }
214 
215     return Token::Type::ID; /* other */
216 }
217 
/*
 * Handle string literal.
 * Consumes a quoted literal starting at the current position, honoring
 * backslash escape sequences. On success advances pos past the closing
 * quote and returns true; on a missing terminator records an error in
 * err_ and returns false.
 */
bool Lexer::LexString()
{
    bool isEscapeSeq = false;
    char quote = currLine_->buffer[currLine_->pos];  // opening quote character
    size_t begin = currLine_->pos;                   // literal start, kept for error reporting
    while (!Eol()) {
        ++(currLine_->pos);

        char c = currLine_->buffer[currLine_->pos];

        // The character immediately after a backslash is consumed literally
        if (isEscapeSeq) {
            isEscapeSeq = false;
            continue;
        }

        if (c == '\\') {
            isEscapeSeq = true;
        }

        if (c == quote) {
            break;
        }
    }

    // Loop may also end by hitting end-of-line: then there was no closing quote
    if (currLine_->buffer[currLine_->pos] != quote) {
        err_ = Error(std::string("Missing terminating ") + quote + " character", 0,
                     Error::ErrorType::ERR_STRING_MISSING_TERMINATING_CHARACTER, "", begin, currLine_->pos,
                     currLine_->buffer);
        return false;
    }

    ++(currLine_->pos);  // step past the closing quote

    return true;
}
254 
/*
 * Advance pos to just past the token that starts at the current position:
 * either a single delimiter character, a quoted string literal, or a run
 * of non-space, non-delimiter characters.
 */
void Lexer::UpdateCurLinePos()
{
    if (FindDelim(currLine_->buffer[currLine_->pos]) != Token::Type::ID_BAD) {
        // A delimiter is always a one-character token
        ++(currLine_->pos);
    } else if (IsQuote(currLine_->buffer[currLine_->pos])) {
        if (!LexString()) {
            return;
        }
    } else {
        while (!Eol() && FindDelim(currLine_->buffer[currLine_->pos]) == Token::Type::ID_BAD &&
               isspace(currLine_->buffer[currLine_->pos]) == 0) {
            ++(currLine_->pos);
            size_t position = currLine_->pos;
            // Absorb adjacent '[' / ']' into the current token — presumably so
            // array-typed identifiers (e.g. "i32[]") lex as one token; TODO confirm
            while (FindDelim(currLine_->buffer[position]) == Token::Type::DEL_SQUARE_BRACKET_L ||
                   FindDelim(currLine_->buffer[position]) == Token::Type::DEL_SQUARE_BRACKET_R) {
                position++;
            }
            // Only commit the bracket run if it is followed by more non-space
            // characters before the end of the line
            if (isspace(currLine_->buffer[position]) == 0 && (position != currLine_->end)) {
                currLine_->pos = position;
            }
        }
    }
}
278 
279 /*
280  * Tokens handling: set a corresponding
281  * elements bound_left and bound_right of the array tokens
282  * to the first and last characters of a corresponding token.
283  *
284  *                                                  bound_r1   bound_r2    bound_r3
285  *                                                  |          |           |
286  *                                                  v          v           v
287  *       token1 token2 token3 ...             token1     token2      token3 ...
288  *                                       =>   ^          ^           ^
289  *                                            |          |           |
290  *    bound1    bound2    bound3 ...          bound_l1   bound_l2    bound_l3 ...
291  *
292  */
LexTokens()293 void Lexer::LexTokens()
294 {
295     if (Eol()) {
296         return;
297     }
298 
299     LOG(DEBUG, ASSEMBLER) << "token search started (line " << lines_.size() << "): "
300                           << std::string_view(&*(currLine_->buffer.begin() + currLine_->pos),
301                                               currLine_->end - currLine_->pos);
302 
303     while (currLine_->end > currLine_->pos && isspace(currLine_->buffer[currLine_->end - 1]) != 0) {
304         --(currLine_->end);
305     }
306 
307     while (isspace(currLine_->buffer[currLine_->pos]) != 0 && !Eol()) {
308         ++(currLine_->pos);
309     }
310 
311     size_t boundRight;
312 
313     size_t boundLeft;
314 
315     while (!Eol()) {
316         boundLeft = currLine_->pos;
317 
318         UpdateCurLinePos();
319 
320         boundRight = currLine_->pos;
321 
322         LOG(DEBUG, ASSEMBLER) << "token identified (line " << lines_.size() << ", "
323                               << "token " << currLine_->tokens.size() + 1 << "): "
324                               << std::string_view(&*(currLine_->buffer.begin() + boundLeft), boundRight - boundLeft)
325                               << " ("
326                               << "type: " << TokenTypeWhat(LexGetType(boundLeft, boundRight)) << ")";
327 
328         currLine_->tokens.emplace_back(boundLeft, boundRight, LexGetType(boundLeft, boundRight), currLine_->buffer);
329 
330         while (isspace(currLine_->buffer[currLine_->pos]) != 0 && !Eol()) {
331             ++(currLine_->pos);
332         }
333     }
334 
335     LOG(DEBUG, ASSEMBLER) << "all tokens identified (line " << lines_.size() << ")";
336 }
337 
/*
 * Ignore comments:
 * find PARSE_COMMENT_MARKER and move line->end
 * to another position (next after the last character of the last
 * significant (this is no a comment) element in a current
 * line: line->buffer).
 *
 * Ex:
 *   [Label:] operation operand[,operand] [# comment]
 *
 *   L1: mov v0, v1 # moving!        L1: mov v0, v1 # moving!
 *                          ^   =>                 ^
 *                          |                      |
 *                         end                    end
 */
void Lexer::LexPreprocess()
{
    LOG(DEBUG, ASSEMBLER) << "started removing comments (line " << lines_.size() << "): "
                          << std::string_view(&*(currLine_->buffer.begin() + currLine_->pos),
                                              currLine_->end - currLine_->pos);

    // Searching for comment marker located outside of string literals.
    // A quote as the very first character starts a literal immediately
    // (the loop below only toggles on quotes at positions > 0).
    bool insideStrLit = !currLine_->buffer.empty() && currLine_->buffer[0] == '\"';
    size_t cmtPos = currLine_->buffer.find_first_of("\"#", 0);
    if (cmtPos != std::string::npos) {
        do {
            // An unescaped quote toggles literal state; a '#' outside
            // of any literal is the start of the comment.
            if (cmtPos != 0 && currLine_->buffer[cmtPos - 1] != '\\' && currLine_->buffer[cmtPos] == '\"') {
                insideStrLit = !insideStrLit;
            } else if (currLine_->buffer[cmtPos] == PARSE_COMMENT_MARKER && !insideStrLit) {
                break;
            }
        } while ((cmtPos = currLine_->buffer.find_first_of("\"#", cmtPos + 1)) != std::string::npos);
    }

    // cmtPos is either the position of a real comment marker or npos
    if (cmtPos != std::string::npos) {
        currLine_->end = cmtPos;
    }

    // Trim whitespace left dangling before the removed comment
    while (currLine_->end > currLine_->pos && isspace(currLine_->buffer[currLine_->end - 1]) != 0) {
        --(currLine_->end);
    }

    LOG(DEBUG, ASSEMBLER) << "comments removed (line " << lines_.size() << "): "
                          << std::string_view(&*(currLine_->buffer.begin() + currLine_->pos),
                                              currLine_->end - currLine_->pos);
}
384 
SkipSpace()385 void Lexer::SkipSpace()
386 {
387     while (!Eol() && isspace(currLine_->buffer[currLine_->pos]) != 0) {
388         ++(currLine_->pos);
389     }
390 }
391 
/*
 * Run the tokenization pipeline for the current line:
 * strip comments, skip leading whitespace, then lex the tokens.
 */
void Lexer::AnalyzeLine()
{
    LexPreprocess();

    SkipSpace();

    LexTokens();
}
400 
401 /*-------------------------------*/
402 
403 }  // namespace panda::pandasm
404