// Copyright 2018 The Amber Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14
15 #include "src/tokenizer.h"
16
17 #include <cctype>
18 #include <cstdlib>
19 #include <limits>
20 #include <sstream>
21
22 #include "src/make_unique.h"
23
24 namespace amber {
25
Token(TokenType type)26 Token::Token(TokenType type) : type_(type) {}
27
28 Token::~Token() = default;
29
ConvertToDouble()30 Result Token::ConvertToDouble() {
31 if (IsDouble())
32 return {};
33
34 if (IsIdentifier() || IsEOL() || IsEOS())
35 return Result("Invalid conversion to double");
36
37 if (IsInteger()) {
38 if (is_negative_ ||
39 uint_value_ <=
40 static_cast<uint64_t>(std::numeric_limits<int64_t>::max())) {
41 double_value_ = static_cast<double>(AsInt64());
42 } else {
43 return Result("uint64_t value too big to fit in double");
44 }
45
46 uint_value_ = 0;
47 } else if (IsHex()) {
48 double_value_ = static_cast<double>(AsHex());
49 string_value_ = "";
50 }
51 type_ = TokenType::kDouble;
52 return {};
53 }
54
Tokenizer(const std::string & data)55 Tokenizer::Tokenizer(const std::string& data) : data_(data) {}
56
57 Tokenizer::~Tokenizer() = default;
58
NextToken()59 std::unique_ptr<Token> Tokenizer::NextToken() {
60 SkipWhitespace();
61 if (current_position_ >= data_.length())
62 return MakeUnique<Token>(TokenType::kEOS);
63
64 if (data_[current_position_] == '#') {
65 SkipComment();
66 SkipWhitespace();
67 }
68 if (current_position_ >= data_.length())
69 return MakeUnique<Token>(TokenType::kEOS);
70
71 if (data_[current_position_] == '\n') {
72 ++current_line_;
73 ++current_position_;
74 return MakeUnique<Token>(TokenType::kEOL);
75 }
76
77 if (data_[current_position_] == '"') {
78 current_position_++; // Skip opening quote
79 std::string tok_str;
80 bool escape = false;
81 for (; current_position_ < data_.length(); current_position_++) {
82 auto c = data_[current_position_];
83 switch (c) {
84 case '\\':
85 if (!escape) {
86 escape = true;
87 continue;
88 }
89 break;
90 case '"':
91 if (!escape) {
92 current_position_++; // Skip closing quote
93 auto tok = MakeUnique<Token>(TokenType::kString);
94 tok->SetStringValue(tok_str);
95 return tok;
96 }
97 break;
98 case 'a':
99 if (escape) {
100 tok_str += '\a';
101 escape = false;
102 continue;
103 }
104 break;
105 case 'b':
106 if (escape) {
107 tok_str += '\b';
108 escape = false;
109 continue;
110 }
111 break;
112 case 't':
113 if (escape) {
114 tok_str += '\t';
115 escape = false;
116 continue;
117 }
118 break;
119 case 'n':
120 if (escape) {
121 tok_str += '\n';
122 escape = false;
123 continue;
124 }
125 break;
126 case 'v':
127 if (escape) {
128 tok_str += '\v';
129 escape = false;
130 continue;
131 }
132 break;
133 case 'f':
134 if (escape) {
135 tok_str += '\f';
136 escape = false;
137 continue;
138 }
139 break;
140 case 'r':
141 if (escape) {
142 tok_str += '\r';
143 escape = false;
144 continue;
145 }
146 break;
147 }
148 escape = false;
149 tok_str += c;
150 }
151
152 auto tok = MakeUnique<Token>(TokenType::kString);
153 tok->SetStringValue(tok_str);
154 return tok;
155 }
156
157 // If the current position is a , ( or ) then handle it specially as we don't
158 // want to consume any other characters.
159 if (data_[current_position_] == ',' || data_[current_position_] == '(' ||
160 data_[current_position_] == ')') {
161 auto tok = MakeUnique<Token>(TokenType::kIdentifier);
162 std::string str(1, data_[current_position_]);
163 tok->SetStringValue(str);
164 ++current_position_;
165 return tok;
166 }
167
168 size_t end_pos = current_position_;
169 while (end_pos < data_.length()) {
170 if (data_[end_pos] == ' ' || data_[end_pos] == '\r' ||
171 data_[end_pos] == '\n' || data_[end_pos] == ')' ||
172 data_[end_pos] == ',' || data_[end_pos] == '(') {
173 break;
174 }
175 ++end_pos;
176 }
177
178 std::string tok_str =
179 data_.substr(current_position_, end_pos - current_position_);
180 current_position_ = end_pos;
181
182 // Check for "NaN" explicitly.
183 bool is_nan =
184 (tok_str.size() == 3 && std::tolower(tok_str[0]) == 'n' &&
185 std::tolower(tok_str[1]) == 'a' && std::tolower(tok_str[2]) == 'n');
186
187 // Starts with an alpha is a string.
188 if (!is_nan && !std::isdigit(tok_str[0]) &&
189 !(tok_str[0] == '-' && tok_str.size() >= 2 && std::isdigit(tok_str[1])) &&
190 !(tok_str[0] == '.' && tok_str.size() >= 2 && std::isdigit(tok_str[1]))) {
191 // If we've got a continuation, skip over the end of line and get the next
192 // token.
193 if (tok_str == "\\") {
194 if ((current_position_ < data_.length() &&
195 data_[current_position_] == '\n')) {
196 ++current_line_;
197 ++current_position_;
198 return NextToken();
199 } else if (current_position_ + 1 < data_.length() &&
200 data_[current_position_] == '\r' &&
201 data_[current_position_ + 1] == '\n') {
202 ++current_line_;
203 current_position_ += 2;
204 return NextToken();
205 }
206 }
207
208 auto tok = MakeUnique<Token>(TokenType::kIdentifier);
209 tok->SetStringValue(tok_str);
210 return tok;
211 }
212
213 // Handle hex strings
214 if (!is_nan && tok_str.size() > 2 && tok_str[0] == '0' && tok_str[1] == 'x') {
215 auto tok = MakeUnique<Token>(TokenType::kHex);
216 tok->SetStringValue(tok_str);
217 return tok;
218 }
219
220 bool is_double = false;
221 if (is_nan) {
222 is_double = true;
223 } else {
224 for (const char ch : tok_str) {
225 if (ch == '.') {
226 is_double = true;
227 break;
228 }
229 }
230 }
231
232 std::unique_ptr<Token> tok;
233
234 char* final_pos = nullptr;
235 if (is_double) {
236 tok = MakeUnique<Token>(TokenType::kDouble);
237
238 double val = strtod(tok_str.c_str(), &final_pos);
239 tok->SetDoubleValue(val);
240 } else {
241 tok = MakeUnique<Token>(TokenType::kInteger);
242
243 uint64_t val = uint64_t(std::strtoull(tok_str.c_str(), &final_pos, 10));
244 tok->SetUint64Value(static_cast<uint64_t>(val));
245 }
246 if (tok_str.size() > 1 && tok_str[0] == '-')
247 tok->SetNegative();
248
249 tok->SetOriginalString(
250 tok_str.substr(0, static_cast<size_t>(final_pos - tok_str.c_str())));
251
252 // If the number isn't the whole token then move back so we can then parse
253 // the string portion.
254 auto diff = size_t(final_pos - tok_str.c_str());
255 if (diff > 0)
256 current_position_ -= tok_str.length() - diff;
257
258 return tok;
259 }
260
PeekNextToken()261 std::unique_ptr<Token> Tokenizer::PeekNextToken() {
262 // Use NextToken() and restore location pointers.
263 auto orig_position = current_position_;
264 auto orig_line = current_line_;
265 std::unique_ptr<Token> tok = NextToken();
266 current_position_ = orig_position;
267 current_line_ = orig_line;
268
269 return tok;
270 }
271
ExtractToNext(const std::string & str)272 std::string Tokenizer::ExtractToNext(const std::string& str) {
273 size_t pos = data_.find(str, current_position_);
274 std::string ret;
275 if (pos == std::string::npos) {
276 ret = data_.substr(current_position_);
277 current_position_ = data_.length();
278 } else {
279 ret = data_.substr(current_position_, pos - current_position_);
280 current_position_ = pos;
281 }
282
283 // Account for any new lines in the extracted text so our current line
284 // number stays correct.
285 for (const char c : ret) {
286 if (c == '\n')
287 ++current_line_;
288 }
289
290 return ret;
291 }
292
IsWhitespace(char ch)293 bool Tokenizer::IsWhitespace(char ch) {
294 return ch == '\0' || ch == '\t' || ch == '\r' || ch == 0x0c /* ff */ ||
295 ch == ' ';
296 }
297
SkipWhitespace()298 void Tokenizer::SkipWhitespace() {
299 while (current_position_ < data_.size() &&
300 IsWhitespace(data_[current_position_])) {
301 ++current_position_;
302 }
303 }
304
SkipComment()305 void Tokenizer::SkipComment() {
306 while (current_position_ < data_.length() &&
307 data_[current_position_] != '\n') {
308 ++current_position_;
309 }
310 }
311
312 } // namespace amber
313