• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2021 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "lexer.h"
17 
18 namespace panda::pandasm {
19 
FindDelim(char c)20 Token::Type FindDelim(char c)
21 {
22     // The map of delimiters
23     static const std::unordered_map<char, Token::Type> DELIM = {{',', Token::Type::DEL_COMMA},
24                                                                 {':', Token::Type::DEL_COLON},
25                                                                 {'{', Token::Type::DEL_BRACE_L},
26                                                                 {'}', Token::Type::DEL_BRACE_R},
27                                                                 {'(', Token::Type::DEL_BRACKET_L},
28                                                                 {')', Token::Type::DEL_BRACKET_R},
29                                                                 {'<', Token::Type::DEL_LT},
30                                                                 {'>', Token::Type::DEL_GT},
31                                                                 {'=', Token::Type::DEL_EQ},
32                                                                 {'[', Token::Type::DEL_SQUARE_BRACKET_L},
33                                                                 {']', Token::Type::DEL_SQUARE_BRACKET_R}};
34 
35     auto iter = DELIM.find(c);
36     if (iter == DELIM.end()) {
37         return Token::Type::ID_BAD;
38     }
39 
40     return DELIM.at(c);
41 }
42 
FindOperation(std::string_view s)43 Token::Type FindOperation(std::string_view s)
44 {
45     // Generate the map of OPERATIONS from ISA
46     static const std::unordered_map<std::string_view, Token::Type> OPERATIONS = {
47 // NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
48 #define OPLIST(inst_code, name, optype, width, flags, dst_idx, use_idxs) \
49     {std::string_view(name), Token::Type::ID_OP_##inst_code},
50         PANDA_INSTRUCTION_LIST(OPLIST)
51 #undef OPLIST
52     };
53 
54     auto iter = OPERATIONS.find(s);
55     if (iter == OPERATIONS.end()) {
56         return Token::Type::ID_BAD;
57     }
58 
59     return OPERATIONS.at(s);
60 }
61 
Findkeyword(std::string_view s)62 Token::Type Findkeyword(std::string_view s)
63 {
64     // Generate the map of KEYWORDS
65     static const std::unordered_map<std::string_view, Token::Type> KEYWORDS = {
66 // NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
67 #define KEYWORDS(name, inst_code) {std::string_view(name), Token::Type::ID_##inst_code},
68         KEYWORDS_LIST(KEYWORDS)
69 #undef KEYWORDS
70     };
71 
72     auto iter = KEYWORDS.find(s);
73     if (iter == KEYWORDS.end()) {
74         return Token::Type::ID_BAD;
75     }
76 
77     return KEYWORDS.at(s);
78 }
79 
80 // CODECHECK-NOLINTNEXTLINE(C_RULE_ID_FUNCTION_SIZE)
TokenTypeWhat(Token::Type t)81 std::string_view TokenTypeWhat(Token::Type t)
82 {
83     if (t >= Token::Type::OPERATION && t < Token::Type::KEYWORD) {
84         return "OPERATION";
85     }
86 
87     if (t >= Token::Type::KEYWORD) {
88         return "KEYWORD";
89     }
90 
91     switch (t) {
92         case Token::Type::ID_BAD: {
93             return "ID_BAD";
94         }
95         case Token::Type::DEL_COMMA: {
96             return "DEL_COMMA";
97         }
98         case Token::Type::DEL_COLON: {
99             return "DEL_COLON";
100         }
101         case Token::Type::DEL_BRACE_L: {
102             return "DEL_BRACE_L";
103         }
104         case Token::Type::DEL_BRACE_R: {
105             return "DEL_BRACE_R";
106         }
107         case Token::Type::DEL_BRACKET_L: {
108             return "DEL_BRACKET_L";
109         }
110         case Token::Type::DEL_BRACKET_R: {
111             return "DEL_BRACKET_R";
112         }
113         case Token::Type::DEL_SQUARE_BRACKET_L: {
114             return "DEL_SQUARE_BRACKET_L";
115         }
116         case Token::Type::DEL_SQUARE_BRACKET_R: {
117             return "DEL_SQUARE_BRACKET_R";
118         }
119         case Token::Type::DEL_GT: {
120             return "DEL_GT";
121         }
122         case Token::Type::DEL_LT: {
123             return "DEL_LT";
124         }
125         case Token::Type::DEL_EQ: {
126             return "DEL_EQ";
127         }
128         case Token::Type::DEL_DOT: {
129             return "DEL_DOT";
130         }
131         case Token::Type::ID: {
132             return "ID";
133         }
134         case Token::Type::ID_STRING: {
135             return "ID_STRING";
136         }
137         default:
138             return "NONE";
139     }
140 }
141 
// True when the character is the double-quote that opens/closes a string literal.
static bool IsQuote(char c)
{
    return (c == '\"');
}
146 
// A Lexer starts with no current line; one is attached per TokenizeString() call.
Lexer::Lexer() : curr_line_(nullptr)
{
    LOG(DEBUG, ASSEMBLER) << "element of class Lexer initialized";
}
151 
// No explicit cleanup needed: lines_ and err_ release themselves (Rule of Zero).
Lexer::~Lexer()
{
    LOG(DEBUG, ASSEMBLER) << "element of class Lexer destructed";
}
156 
TokenizeString(const std::string & source_str)157 Tokens Lexer::TokenizeString(const std::string &source_str)
158 {
159     LOG(DEBUG, ASSEMBLER) << "started tokenizing of line " << lines_.size() + 1 << ": ";
160 
161     lines_.emplace_back(source_str);
162 
163     curr_line_ = &lines_.back();
164 
165     LOG(DEBUG, ASSEMBLER) << std::string_view(&*(curr_line_->buffer.begin() + curr_line_->pos),
166                                               curr_line_->end - curr_line_->pos);
167 
168     AnalyzeLine();
169 
170     LOG(DEBUG, ASSEMBLER) << "tokenization of line " << lines_.size() << " is successful";
171     LOG(DEBUG, ASSEMBLER) << "         tokens identified: ";
172 
173     for (const auto &f_i : lines_.back().tokens) {
174         LOG(DEBUG, ASSEMBLER) << "\n                           "
175                               << std::string_view(&*(f_i.whole_line.begin() + f_i.bound_left),
176                                                   f_i.bound_right - f_i.bound_left)
177                               << " (type: " << TokenTypeWhat(f_i.type) << ")";
178 
179         LOG(DEBUG, ASSEMBLER);
180         LOG(DEBUG, ASSEMBLER);
181     }
182     return std::pair<std::vector<Token>, Error>(lines_.back().tokens, err_);
183 }
184 
// End of line: true when the scan position has reached the (comment-trimmed) end.
bool Lexer::Eol() const
{
    return curr_line_->pos == curr_line_->end;
}
190 
191 // Return the type of token
LexGetType(size_t beg,size_t end) const192 Token::Type Lexer::LexGetType(size_t beg, size_t end) const
193 {
194     if (FindDelim(curr_line_->buffer[beg]) != Token::Type::ID_BAD) { /* delimiter */
195         return FindDelim(curr_line_->buffer[beg]);
196     }
197 
198     std::string_view p(&*(curr_line_->buffer.begin() + beg), end - beg);
199     Token::Type type = Findkeyword(p);
200     if (type != Token::Type::ID_BAD) {
201         return type;
202     }
203 
204     type = FindOperation(p);
205     if (type != Token::Type::ID_BAD) {
206         return type;
207     }
208 
209     if (IsQuote(curr_line_->buffer[beg])) {
210         return Token::Type::ID_STRING;
211     }
212 
213     return Token::Type::ID;  // other
214 }
215 
// Handle string literal
// Scans a quoted literal starting at the current position (which must be on
// the opening quote). On success leaves pos one past the closing quote and
// returns true; on a missing terminator records err_ and returns false.
bool Lexer::LexString()
{
    bool is_escape_seq = false;
    char quote = curr_line_->buffer[curr_line_->pos];  // the opening quote character
    size_t begin = curr_line_->pos;
    while (!Eol()) {
        ++(curr_line_->pos);

        char c = curr_line_->buffer[curr_line_->pos];

        // The character right after a backslash is consumed verbatim,
        // so an escaped quote (\") does not terminate the literal.
        if (is_escape_seq) {
            is_escape_seq = false;
            continue;
        }

        if (c == '\\') {
            is_escape_seq = true;
        }

        if (c == quote) {
            break;
        }
    }

    // If the loop ended without landing on the closing quote, the literal
    // ran off the end of the line: report an unterminated string.
    if (curr_line_->buffer[curr_line_->pos] != quote) {
        err_ = Error(std::string("Missing terminating ") + quote + " character", 0,
                     Error::ErrorType::ERR_STRING_MISSING_TERMINATING_CHARACTER, "", begin, curr_line_->pos,
                     curr_line_->buffer);
        return false;
    }

    // Step past the closing quote so the caller continues after the literal.
    ++(curr_line_->pos);

    return true;
}
252 
253 /*
254  * Tokens handling: set the corresponding
255  * elements bound_left and bound_right of the array tokens
256  * to the first and last characters of a corresponding token.
257  *
258  *                                                  bound_r1   bound_r2    bound_r3
259  *                                                  |          |           |
260  *                                                  v          v           v
261  *       token1 token2 token3 ...             token1     token2      token3 ...
262  *                                       =>   ^          ^           ^
263  *                                            |          |           |
264  *    bound1    bound2    bound3 ...          bound_l1   bound_l2    bound_l3 ...
265  *
266  */
LexTokens()267 void Lexer::LexTokens()
268 {
269     if (Eol()) {
270         return;
271     }
272 
273     LOG(DEBUG, ASSEMBLER) << "token search started (line " << lines_.size() << "): "
274                           << std::string_view(&*(curr_line_->buffer.begin() + curr_line_->pos),
275                                               curr_line_->end - curr_line_->pos);
276 
277     while (curr_line_->end > curr_line_->pos && isspace(curr_line_->buffer[curr_line_->end - 1]) != 0) {
278         --(curr_line_->end);
279     }
280 
281     while (isspace(curr_line_->buffer[curr_line_->pos]) != 0 && !Eol()) {
282         ++(curr_line_->pos);
283     }
284 
285     size_t bound_right;
286     size_t bound_left;
287 
288     for (int i = 0; !Eol(); ++i) {
289         bound_left = curr_line_->pos;
290 
291         if (FindDelim(curr_line_->buffer[curr_line_->pos]) != Token::Type::ID_BAD) {
292             ++(curr_line_->pos);
293         } else if (IsQuote(curr_line_->buffer[curr_line_->pos])) {
294             if (!LexString()) {
295                 return;
296             }
297         } else {
298             while (!Eol() && FindDelim(curr_line_->buffer[curr_line_->pos]) == Token::Type::ID_BAD &&
299                    isspace(curr_line_->buffer[curr_line_->pos]) == 0) {
300                 ++(curr_line_->pos);
301             }
302         }
303 
304         bound_right = curr_line_->pos;
305 
306         LOG(DEBUG, ASSEMBLER) << "token identified (line " << lines_.size() << ", "
307                               << "token " << curr_line_->tokens.size() + 1 << "): "
308                               << std::string_view(&*(curr_line_->buffer.begin() + bound_left), bound_right - bound_left)
309                               << " ("
310                               << "type: " << TokenTypeWhat(LexGetType(bound_left, bound_right)) << ")";
311 
312         curr_line_->tokens.emplace_back(bound_left, bound_right, LexGetType(bound_left, bound_right),
313                                         curr_line_->buffer);
314 
315         while (isspace(curr_line_->buffer[curr_line_->pos]) != 0 && !Eol()) {
316             ++(curr_line_->pos);
317         }
318     }
319 
320     LOG(DEBUG, ASSEMBLER) << "all tokens identified (line " << lines_.size() << ")";
321 }
322 
/*
 * Ignore comments:
 * find PARSE_COMMENT_MARKER and move line->end to another position
 * next after the last character of the last significant (not a comment)
 * element in a current line: line->buffer.
 *
 * Ex:
 *   [Label:] operation operand[,operand] [# comment]
 *
 *   L1: mov v0, v1 # moving!        L1: mov v0, v1 # moving!
 *                          ^   =>                 ^
 *                          |                      |
 *                         end                    end
 */
void Lexer::LexPreprocess()
{
    LOG(DEBUG, ASSEMBLER) << "started removing comments (line " << lines_.size() << "): "
                          << std::string_view(&*(curr_line_->buffer.begin() + curr_line_->pos),
                                              curr_line_->end - curr_line_->pos);

    size_t cmt_pos;
    bool inside_str_lit;

    // Searching for comment marker located outside of the string literals.
    // Walk all '"' and '#' occurrences, toggling inside_str_lit on each
    // unescaped quote; the first '#' seen outside a literal is the comment.
    inside_str_lit = curr_line_->buffer.size() > 0 && curr_line_->buffer[0] == '\"';
    cmt_pos = curr_line_->buffer.find_first_of("\"#", 0);
    if (cmt_pos != std::string::npos) {
        do {
            // A quote preceded by a backslash is escaped and does not toggle
            // the literal state (a quote at position 0 was handled above).
            if (cmt_pos != 0 && curr_line_->buffer[cmt_pos - 1] != '\\' && curr_line_->buffer[cmt_pos] == '\"') {
                inside_str_lit = !inside_str_lit;
            } else if (curr_line_->buffer[cmt_pos] == PARSE_COMMENT_MARKER && !inside_str_lit) {
                break;
            }
        } while ((cmt_pos = curr_line_->buffer.find_first_of("\"#", cmt_pos + 1)) != std::string::npos);
    }

    // Truncate the visible line at the comment marker (if one was found).
    if (cmt_pos != std::string::npos) {
        curr_line_->end = cmt_pos;
    }

    // Drop trailing whitespace left in front of the (removed) comment.
    // NOTE(review): isspace() on a plain char is UB for negative (non-ASCII)
    // bytes — an unsigned char cast would harden this; confirm input charset.
    while (curr_line_->end > curr_line_->pos && isspace(curr_line_->buffer[curr_line_->end - 1]) != 0) {
        --(curr_line_->end);
    }

    LOG(DEBUG, ASSEMBLER) << "comments removed (line " << lines_.size() << "): "
                          << std::string_view(&*(curr_line_->buffer.begin() + curr_line_->pos),
                                              curr_line_->end - curr_line_->pos);
}
371 
SkipSpace()372 void Lexer::SkipSpace()
373 {
374     while (!Eol() && isspace(curr_line_->buffer[curr_line_->pos]) != 0) {
375         ++(curr_line_->pos);
376     }
377 }
378 
// Per-line pipeline: strip trailing comments, skip leading whitespace,
// then split the remainder into tokens.
void Lexer::AnalyzeLine()
{
    LexPreprocess();

    SkipSpace();

    LexTokens();
}
387 
388 }  // namespace panda::pandasm
389