• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright (c) 2021-2022 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "lexer.h"
17 
18 namespace panda::pandasm {
19 
20 /*-------------------------------*/
21 
22 /* Is this a delimiter ? */
FindDelim(char c)23 Token::Type FindDelim(char c)
24 {
25     /* The map of delimiters */
26     static const std::unordered_map<char, Token::Type> DELIM = {{',', Token::Type::DEL_COMMA},
27                                                                 {':', Token::Type::DEL_COLON},
28                                                                 {'{', Token::Type::DEL_BRACE_L},
29                                                                 {'}', Token::Type::DEL_BRACE_R},
30                                                                 {'(', Token::Type::DEL_BRACKET_L},
31                                                                 {')', Token::Type::DEL_BRACKET_R},
32                                                                 {'<', Token::Type::DEL_LT},
33                                                                 {'>', Token::Type::DEL_GT},
34                                                                 {'=', Token::Type::DEL_EQ},
35                                                                 {'[', Token::Type::DEL_SQUARE_BRACKET_L},
36                                                                 {']', Token::Type::DEL_SQUARE_BRACKET_R}};
37 
38     auto iter = DELIM.find(c);
39 
40     if (iter == DELIM.end()) {
41         return Token::Type::ID_BAD;
42     }
43 
44     return DELIM.at(c);
45 }
46 
FindOperation(std::string_view s)47 Token::Type FindOperation(std::string_view s)
48 {
49     /* Generate the map of OPERATIONS from ISA: */
50     static const std::unordered_map<std::string_view, Token::Type> OPERATIONS = {
51 // NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
52 #define OPLIST(inst_code, name, optype, width, flags, dst_idx, use_idxs) \
53     {std::string_view(name), Token::Type::ID_OP_##inst_code},
54         PANDA_INSTRUCTION_LIST(OPLIST)
55 #undef OPLIST
56     };
57 
58     auto iter = OPERATIONS.find(s);
59 
60     if (iter == OPERATIONS.end()) {
61         return Token::Type::ID_BAD;
62     }
63 
64     return OPERATIONS.at(s);
65 }
66 
Findkeyword(std::string_view s)67 Token::Type Findkeyword(std::string_view s)
68 {
69     /* Generate the map of KEYWORDS: */
70     static const std::unordered_map<std::string_view, Token::Type> KEYWORDS = {
71 // NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
72 #define KEYWORDS(name, inst_code) {std::string_view(name), Token::Type::ID_##inst_code},
73         KEYWORDS_LIST(KEYWORDS)
74 #undef KEYWORDS
75     };
76 
77     auto iter = KEYWORDS.find(s);
78 
79     if (iter == KEYWORDS.end()) {
80         return Token::Type::ID_BAD;
81     }
82 
83     return KEYWORDS.at(s);
84 }
85 
TokenTypeWhat(Token::Type t)86 std::string_view TokenTypeWhat(Token::Type t)
87 {
88     if (t >= Token::Type::OPERATION && t < Token::Type::KEYWORD) {
89         return "OPERATION";
90     }
91 
92     if (t >= Token::Type::KEYWORD) {
93         return "KEYWORD";
94     }
95 
96     switch (t) {
97         case Token::Type::ID_BAD: {
98             return "ID_BAD";
99         }
100         case Token::Type::DEL_COMMA: {
101             return "DEL_COMMA";
102         }
103         case Token::Type::DEL_COLON: {
104             return "DEL_COLON";
105         }
106         case Token::Type::DEL_BRACE_L: {
107             return "DEL_BRACE_L";
108         }
109         case Token::Type::DEL_BRACE_R: {
110             return "DEL_BRACE_R";
111         }
112         case Token::Type::DEL_BRACKET_L: {
113             return "DEL_BRACKET_L";
114         }
115         case Token::Type::DEL_BRACKET_R: {
116             return "DEL_BRACKET_R";
117         }
118         case Token::Type::DEL_SQUARE_BRACKET_L: {
119             return "DEL_SQUARE_BRACKET_L";
120         }
121         case Token::Type::DEL_SQUARE_BRACKET_R: {
122             return "DEL_SQUARE_BRACKET_R";
123         }
124         case Token::Type::DEL_GT: {
125             return "DEL_GT";
126         }
127         case Token::Type::DEL_LT: {
128             return "DEL_LT";
129         }
130         case Token::Type::DEL_EQ: {
131             return "DEL_EQ";
132         }
133         case Token::Type::DEL_DOT: {
134             return "DEL_DOT";
135         }
136         case Token::Type::ID: {
137             return "ID";
138         }
139         case Token::Type::ID_STRING: {
140             return "ID_STRING";
141         }
142         default:
143             return "NONE";
144     }
145 }
146 
/* Returns true if the character opens or closes a string literal. */
static bool IsQuote(char c)
{
    return c == '"';
}
151 
/* Construct a lexer with no current line; TokenizeString() sets curr_line_. */
Lexer::Lexer() : curr_line_(nullptr)
{
    LOG(DEBUG, ASSEMBLER) << "element of class Lexer initialized";
}
156 
/* Destructor only logs; members clean themselves up. */
Lexer::~Lexer()
{
    LOG(DEBUG, ASSEMBLER) << "element of class Lexer destructed";
}
161 
/*
 * Tokenize one source line.
 * Appends source_str to lines_, makes it the current line, and splits it
 * into tokens via AnalyzeLine().  Returns the produced token vector paired
 * with the current error state (err_ is set by LexString() when a string
 * literal is unterminated).
 */
Tokens Lexer::TokenizeString(const std::string &source_str)
{
    LOG(DEBUG, ASSEMBLER) << "started tokenizing of line " << lines_.size() + 1 << ": ";

    lines_.emplace_back(source_str);

    /* All lexing helpers below operate on the most recently added line. */
    curr_line_ = &lines_.back();

    LOG(DEBUG, ASSEMBLER) << std::string_view(&*(curr_line_->buffer.begin() + curr_line_->pos),
                                              curr_line_->end - curr_line_->pos);

    AnalyzeLine();

    LOG(DEBUG, ASSEMBLER) << "tokenization of line " << lines_.size() << " is successful";
    LOG(DEBUG, ASSEMBLER) << "         tokens identified: ";

    for (const auto &f_i : lines_.back().tokens) {
        LOG(DEBUG, ASSEMBLER) << "\n                           "
                              << std::string_view(&*(f_i.whole_line.begin() + f_i.bound_left),
                                                  f_i.bound_right - f_i.bound_left)
                              << " (type: " << TokenTypeWhat(f_i.type) << ")";

        LOG(DEBUG, ASSEMBLER);
        LOG(DEBUG, ASSEMBLER);
    }
    return std::pair<std::vector<Token>, Error>(lines_.back().tokens, err_);
}
189 
190 /* End of line? */
Eol() const191 bool Lexer::Eol() const
192 {
193     return curr_line_->pos == curr_line_->end;
194 }
195 
196 /* Return the type of token */
LexGetType(size_t beg,size_t end) const197 Token::Type Lexer::LexGetType(size_t beg, size_t end) const
198 {
199     if (FindDelim(curr_line_->buffer[beg]) != Token::Type::ID_BAD) { /* delimiter */
200         return FindDelim(curr_line_->buffer[beg]);
201     }
202 
203     std::string_view p(&*(curr_line_->buffer.begin() + beg), end - beg);
204 
205     Token::Type type = Findkeyword(p);
206 
207     if (type != Token::Type::ID_BAD) {
208         return type;
209     }
210 
211     type = FindOperation(p);
212 
213     if (type != Token::Type::ID_BAD) {
214         return type;
215     }
216 
217     if (IsQuote(curr_line_->buffer[beg])) {
218         return Token::Type::ID_STRING;
219     }
220 
221     return Token::Type::ID; /* other */
222 }
223 
224 /* Handle string literal */
/*
 * Scan a quoted string literal starting at the current position
 * (curr_line_->buffer[pos] is the opening quote).  On success, advances
 * pos past the closing quote and returns true.  On a missing terminator,
 * records ERR_STRING_MISSING_TERMINATING_CHARACTER in err_ and returns
 * false.
 */
bool Lexer::LexString()
{
    bool is_escape_seq = false;
    char quote = curr_line_->buffer[curr_line_->pos];
    size_t begin = curr_line_->pos;
    while (!Eol()) {
        ++(curr_line_->pos);

        char c = curr_line_->buffer[curr_line_->pos];

        /* The character immediately after a backslash can never terminate
         * the literal, whatever it is. */
        if (is_escape_seq) {
            is_escape_seq = false;
            continue;
        }

        if (c == '\\') {
            is_escape_seq = true;
        }

        if (c == quote) {
            break;
        }
    }

    /* NOTE(review): when the loop exits via Eol(), pos == end and this reads
     * buffer[pos] at the logical end of the line — for a std::string buffer
     * with end == size() that is the terminating '\0', which cannot match
     * the quote, so the error path is taken; confirm buffer's type. */
    if (curr_line_->buffer[curr_line_->pos] != quote) {
        err_ = Error(std::string("Missing terminating ") + quote + " character", 0,
                     Error::ErrorType::ERR_STRING_MISSING_TERMINATING_CHARACTER, "", begin, curr_line_->pos,
                     curr_line_->buffer);
        return false;
    }

    /* Step past the closing quote so the next token starts after it. */
    ++(curr_line_->pos);

    return true;
}
260 
261 /*
262  * Tokens handling: set a corresponding
263  * elements bound_left and bound_right of the array tokens
264  * to the first and last characters of a corresponding token.
265  *
266  *                                                  bound_r1   bound_r2    bound_r3
267  *                                                  |          |           |
268  *                                                  v          v           v
269  *       token1 token2 token3 ...             token1     token2      token3 ...
270  *                                       =>   ^          ^           ^
271  *                                            |          |           |
272  *    bound1    bound2    bound3 ...          bound_l1   bound_l2    bound_l3 ...
273  *
274  */
LexTokens()275 void Lexer::LexTokens()
276 {
277     if (Eol()) {
278         return;
279     }
280 
281     LOG(DEBUG, ASSEMBLER) << "token search started (line " << lines_.size() << "): "
282                           << std::string_view(&*(curr_line_->buffer.begin() + curr_line_->pos),
283                                               curr_line_->end - curr_line_->pos);
284 
285     while (curr_line_->end > curr_line_->pos && isspace(curr_line_->buffer[curr_line_->end - 1]) != 0) {
286         --(curr_line_->end);
287     }
288 
289     while (isspace(curr_line_->buffer[curr_line_->pos]) != 0 && !Eol()) {
290         ++(curr_line_->pos);
291     }
292 
293     size_t bound_right;
294 
295     size_t bound_left;
296 
297     for (int i = 0; !Eol(); ++i) {
298         bound_left = curr_line_->pos;
299 
300         if (FindDelim(curr_line_->buffer[curr_line_->pos]) != Token::Type::ID_BAD) {
301             ++(curr_line_->pos);
302         } else if (IsQuote(curr_line_->buffer[curr_line_->pos])) {
303             if (!LexString()) {
304                 return;
305             }
306         } else {
307             while (!Eol() && FindDelim(curr_line_->buffer[curr_line_->pos]) == Token::Type::ID_BAD &&
308                    isspace(curr_line_->buffer[curr_line_->pos]) == 0) {
309                 ++(curr_line_->pos);
310             }
311         }
312 
313         bound_right = curr_line_->pos;
314 
315         LOG(DEBUG, ASSEMBLER) << "token identified (line " << lines_.size() << ", "
316                               << "token " << curr_line_->tokens.size() + 1 << "): "
317                               << std::string_view(&*(curr_line_->buffer.begin() + bound_left), bound_right - bound_left)
318                               << " ("
319                               << "type: " << TokenTypeWhat(LexGetType(bound_left, bound_right)) << ")";
320 
321         curr_line_->tokens.emplace_back(bound_left, bound_right, LexGetType(bound_left, bound_right),
322                                         curr_line_->buffer);
323 
324         while (isspace(curr_line_->buffer[curr_line_->pos]) != 0 && !Eol()) {
325             ++(curr_line_->pos);
326         }
327     }
328 
329     LOG(DEBUG, ASSEMBLER) << "all tokens identified (line " << lines_.size() << ")";
330 }
331 
332 /*
333  * Ignore comments:
334  * find PARSE_COMMENT_MARKER and move line->end
335  * to another position (next after the last character of the last
336  * significant (this is no a comment) element in a current
337  * line: line->buffer).
338  *
339  * Ex:
340  *   [Label:] operation operand[,operand] [# comment]
341  *
342  *   L1: mov v0, v1 # moving!        L1: mov v0, v1 # moving!
343  *                          ^   =>                 ^
344  *                          |                      |
345  *                         end                    end
346  */
void Lexer::LexPreprocess()
{
    LOG(DEBUG, ASSEMBLER) << "started removing comments (line " << lines_.size() << "): "
                          << std::string_view(&*(curr_line_->buffer.begin() + curr_line_->pos),
                                              curr_line_->end - curr_line_->pos);

    // Searching for comment marker located outside of string literals.
    // Track whether the scan position is inside a string literal, so a '#'
    // within a literal is not mistaken for a comment.  The first character
    // is special-cased because the toggle below only fires when the quote
    // has a preceding character to inspect.
    bool inside_str_lit = curr_line_->buffer.size() > 0 && curr_line_->buffer[0] == '\"';
    size_t cmt_pos = curr_line_->buffer.find_first_of("\"#", 0);
    if (cmt_pos != std::string::npos) {
        do {
            // An unescaped quote toggles the in-literal state.
            // NOTE(review): a quote preceded by an escaped backslash (\\")
            // is treated here as escaped — confirm against the assembler's
            // string-literal grammar.
            if (cmt_pos != 0 && curr_line_->buffer[cmt_pos - 1] != '\\' && curr_line_->buffer[cmt_pos] == '\"') {
                inside_str_lit = !inside_str_lit;
            } else if (curr_line_->buffer[cmt_pos] == PARSE_COMMENT_MARKER && !inside_str_lit) {
                break;
            }
        } while ((cmt_pos = curr_line_->buffer.find_first_of("\"#", cmt_pos + 1)) != std::string::npos);
    }

    // Truncate the line at the comment marker, if one was found.
    if (cmt_pos != std::string::npos) {
        curr_line_->end = cmt_pos;
    }

    // Drop trailing whitespace left in front of the removed comment.
    while (curr_line_->end > curr_line_->pos && isspace(curr_line_->buffer[curr_line_->end - 1]) != 0) {
        --(curr_line_->end);
    }

    LOG(DEBUG, ASSEMBLER) << "comments removed (line " << lines_.size() << "): "
                          << std::string_view(&*(curr_line_->buffer.begin() + curr_line_->pos),
                                              curr_line_->end - curr_line_->pos);
}
378 
SkipSpace()379 void Lexer::SkipSpace()
380 {
381     while (!Eol() && isspace(curr_line_->buffer[curr_line_->pos]) != 0) {
382         ++(curr_line_->pos);
383     }
384 }
385 
/* Process the current line end-to-end: strip trailing comments, skip
 * leading whitespace, then split the remainder into tokens. */
void Lexer::AnalyzeLine()
{
    LexPreprocess();

    SkipSpace();

    LexTokens();
}
394 
395 /*-------------------------------*/
396 
397 }  // namespace panda::pandasm
398