1 // Copyright 2018 The Amber Authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "src/tokenizer.h"
16
17 #include <cctype>
18 #include <cstdlib>
19 #include <limits>
20 #include <sstream>
21
22 #include "src/make_unique.h"
23
24 namespace amber {
25
Token(TokenType type)26 Token::Token(TokenType type) : type_(type) {}
27
28 Token::~Token() = default;
29
ConvertToDouble()30 Result Token::ConvertToDouble() {
31 if (IsDouble())
32 return {};
33
34 if (IsString() || IsEOL() || IsEOS())
35 return Result("Invalid conversion to double");
36
37 if (IsInteger()) {
38 if (is_negative_ ||
39 uint_value_ <=
40 static_cast<uint64_t>(std::numeric_limits<int64_t>::max())) {
41 double_value_ = static_cast<double>(AsInt64());
42 } else {
43 return Result("uint64_t value too big to fit in double");
44 }
45
46 uint_value_ = 0;
47 } else if (IsHex()) {
48 double_value_ = static_cast<double>(AsHex());
49 string_value_ = "";
50 }
51 type_ = TokenType::kDouble;
52 return {};
53 }
54
Tokenizer(const std::string & data)55 Tokenizer::Tokenizer(const std::string& data) : data_(data) {}
56
57 Tokenizer::~Tokenizer() = default;
58
NextToken()59 std::unique_ptr<Token> Tokenizer::NextToken() {
60 SkipWhitespace();
61 if (current_position_ >= data_.length())
62 return MakeUnique<Token>(TokenType::kEOS);
63
64 if (data_[current_position_] == '#') {
65 SkipComment();
66 SkipWhitespace();
67 }
68 if (current_position_ >= data_.length())
69 return MakeUnique<Token>(TokenType::kEOS);
70
71 if (data_[current_position_] == '\n') {
72 ++current_line_;
73 ++current_position_;
74 return MakeUnique<Token>(TokenType::kEOL);
75 }
76
77 // If the current position is a , ( or ) then handle it specially as we don't
78 // want to consume any other characters.
79 if (data_[current_position_] == ',' || data_[current_position_] == '(' ||
80 data_[current_position_] == ')') {
81 auto tok = MakeUnique<Token>(TokenType::kString);
82 std::string str(1, data_[current_position_]);
83 tok->SetStringValue(str);
84 ++current_position_;
85 return tok;
86 }
87
88 size_t end_pos = current_position_;
89 while (end_pos < data_.length()) {
90 if (data_[end_pos] == ' ' || data_[end_pos] == '\r' ||
91 data_[end_pos] == '\n' || data_[end_pos] == ')' ||
92 data_[end_pos] == ',' || data_[end_pos] == '(') {
93 break;
94 }
95 ++end_pos;
96 }
97
98 std::string tok_str =
99 data_.substr(current_position_, end_pos - current_position_);
100 current_position_ = end_pos;
101
102 // Check for "NaN" explicitly.
103 bool is_nan =
104 (tok_str.size() == 3 && std::tolower(tok_str[0]) == 'n' &&
105 std::tolower(tok_str[1]) == 'a' && std::tolower(tok_str[2]) == 'n');
106
107 // Starts with an alpha is a string.
108 if (!is_nan && !std::isdigit(tok_str[0]) &&
109 !(tok_str[0] == '-' && tok_str.size() >= 2 && std::isdigit(tok_str[1])) &&
110 !(tok_str[0] == '.' && tok_str.size() >= 2 && std::isdigit(tok_str[1]))) {
111 // If we've got a continuation, skip over the end of line and get the next
112 // token.
113 if (tok_str == "\\") {
114 if ((current_position_ < data_.length() &&
115 data_[current_position_] == '\n')) {
116 ++current_line_;
117 ++current_position_;
118 return NextToken();
119 } else if (current_position_ + 1 < data_.length() &&
120 data_[current_position_] == '\r' &&
121 data_[current_position_ + 1] == '\n') {
122 ++current_line_;
123 current_position_ += 2;
124 return NextToken();
125 }
126 }
127
128 auto tok = MakeUnique<Token>(TokenType::kString);
129 tok->SetStringValue(tok_str);
130 return tok;
131 }
132
133 // Handle hex strings
134 if (!is_nan && tok_str.size() > 2 && tok_str[0] == '0' && tok_str[1] == 'x') {
135 auto tok = MakeUnique<Token>(TokenType::kHex);
136 tok->SetStringValue(tok_str);
137 return tok;
138 }
139
140 bool is_double = false;
141 if (is_nan) {
142 is_double = true;
143 } else {
144 for (const char ch : tok_str) {
145 if (ch == '.') {
146 is_double = true;
147 break;
148 }
149 }
150 }
151
152 std::unique_ptr<Token> tok;
153
154 char* final_pos = nullptr;
155 if (is_double) {
156 tok = MakeUnique<Token>(TokenType::kDouble);
157
158 double val = strtod(tok_str.c_str(), &final_pos);
159 tok->SetDoubleValue(val);
160 } else {
161 tok = MakeUnique<Token>(TokenType::kInteger);
162
163 uint64_t val = uint64_t(std::strtoull(tok_str.c_str(), &final_pos, 10));
164 tok->SetUint64Value(static_cast<uint64_t>(val));
165 }
166 if (tok_str.size() > 1 && tok_str[0] == '-')
167 tok->SetNegative();
168
169 tok->SetOriginalString(
170 tok_str.substr(0, static_cast<size_t>(final_pos - tok_str.c_str())));
171
172 // If the number isn't the whole token then move back so we can then parse
173 // the string portion.
174 auto diff = size_t(final_pos - tok_str.c_str());
175 if (diff > 0)
176 current_position_ -= tok_str.length() - diff;
177
178 return tok;
179 }
180
ExtractToNext(const std::string & str)181 std::string Tokenizer::ExtractToNext(const std::string& str) {
182 size_t pos = data_.find(str, current_position_);
183 std::string ret;
184 if (pos == std::string::npos) {
185 ret = data_.substr(current_position_);
186 current_position_ = data_.length();
187 } else {
188 ret = data_.substr(current_position_, pos - current_position_);
189 current_position_ = pos;
190 }
191
192 // Account for any new lines in the extracted text so our current line
193 // number stays correct.
194 for (const char c : ret) {
195 if (c == '\n')
196 ++current_line_;
197 }
198
199 return ret;
200 }
201
IsWhitespace(char ch)202 bool Tokenizer::IsWhitespace(char ch) {
203 return ch == '\0' || ch == '\t' || ch == '\r' || ch == 0x0c /* ff */ ||
204 ch == ' ';
205 }
206
SkipWhitespace()207 void Tokenizer::SkipWhitespace() {
208 while (current_position_ < data_.size() &&
209 IsWhitespace(data_[current_position_])) {
210 ++current_position_;
211 }
212 }
213
SkipComment()214 void Tokenizer::SkipComment() {
215 while (current_position_ < data_.length() &&
216 data_[current_position_] != '\n') {
217 ++current_position_;
218 }
219 }
220
221 } // namespace amber
222