• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2018 The Amber Authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "src/tokenizer.h"
16 
17 #include <cctype>
18 #include <cstdlib>
19 #include <limits>
20 #include <sstream>
21 
22 #include "src/make_unique.h"
23 
24 namespace amber {
25 
Token(TokenType type)26 Token::Token(TokenType type) : type_(type) {}
27 
28 Token::~Token() = default;
29 
ConvertToDouble()30 Result Token::ConvertToDouble() {
31   if (IsDouble())
32     return {};
33 
34   if (IsString() || IsEOL() || IsEOS())
35     return Result("Invalid conversion to double");
36 
37   if (IsInteger()) {
38     if (is_negative_ ||
39         uint_value_ <=
40             static_cast<uint64_t>(std::numeric_limits<int64_t>::max())) {
41       double_value_ = static_cast<double>(AsInt64());
42     } else {
43       return Result("uint64_t value too big to fit in double");
44     }
45 
46     uint_value_ = 0;
47   } else if (IsHex()) {
48     double_value_ = static_cast<double>(AsHex());
49     string_value_ = "";
50   }
51   type_ = TokenType::kDouble;
52   return {};
53 }
54 
Tokenizer(const std::string & data)55 Tokenizer::Tokenizer(const std::string& data) : data_(data) {}
56 
57 Tokenizer::~Tokenizer() = default;
58 
NextToken()59 std::unique_ptr<Token> Tokenizer::NextToken() {
60   SkipWhitespace();
61   if (current_position_ >= data_.length())
62     return MakeUnique<Token>(TokenType::kEOS);
63 
64   if (data_[current_position_] == '#') {
65     SkipComment();
66     SkipWhitespace();
67   }
68   if (current_position_ >= data_.length())
69     return MakeUnique<Token>(TokenType::kEOS);
70 
71   if (data_[current_position_] == '\n') {
72     ++current_line_;
73     ++current_position_;
74     return MakeUnique<Token>(TokenType::kEOL);
75   }
76 
77   // If the current position is a , ( or ) then handle it specially as we don't
78   // want to consume any other characters.
79   if (data_[current_position_] == ',' || data_[current_position_] == '(' ||
80       data_[current_position_] == ')') {
81     auto tok = MakeUnique<Token>(TokenType::kString);
82     std::string str(1, data_[current_position_]);
83     tok->SetStringValue(str);
84     ++current_position_;
85     return tok;
86   }
87 
88   size_t end_pos = current_position_;
89   while (end_pos < data_.length()) {
90     if (data_[end_pos] == ' ' || data_[end_pos] == '\r' ||
91         data_[end_pos] == '\n' || data_[end_pos] == ')' ||
92         data_[end_pos] == ',' || data_[end_pos] == '(') {
93       break;
94     }
95     ++end_pos;
96   }
97 
98   std::string tok_str =
99       data_.substr(current_position_, end_pos - current_position_);
100   current_position_ = end_pos;
101 
102   // Check for "NaN" explicitly.
103   bool is_nan =
104       (tok_str.size() == 3 && std::tolower(tok_str[0]) == 'n' &&
105        std::tolower(tok_str[1]) == 'a' && std::tolower(tok_str[2]) == 'n');
106 
107   // Starts with an alpha is a string.
108   if (!is_nan && !std::isdigit(tok_str[0]) &&
109       !(tok_str[0] == '-' && tok_str.size() >= 2 && std::isdigit(tok_str[1])) &&
110       !(tok_str[0] == '.' && tok_str.size() >= 2 && std::isdigit(tok_str[1]))) {
111     // If we've got a continuation, skip over the end of line and get the next
112     // token.
113     if (tok_str == "\\") {
114       if ((current_position_ < data_.length() &&
115            data_[current_position_] == '\n')) {
116         ++current_line_;
117         ++current_position_;
118         return NextToken();
119       } else if (current_position_ + 1 < data_.length() &&
120                  data_[current_position_] == '\r' &&
121                  data_[current_position_ + 1] == '\n') {
122         ++current_line_;
123         current_position_ += 2;
124         return NextToken();
125       }
126     }
127 
128     auto tok = MakeUnique<Token>(TokenType::kString);
129     tok->SetStringValue(tok_str);
130     return tok;
131   }
132 
133   // Handle hex strings
134   if (!is_nan && tok_str.size() > 2 && tok_str[0] == '0' && tok_str[1] == 'x') {
135     auto tok = MakeUnique<Token>(TokenType::kHex);
136     tok->SetStringValue(tok_str);
137     return tok;
138   }
139 
140   bool is_double = false;
141   if (is_nan) {
142     is_double = true;
143   } else {
144     for (const char ch : tok_str) {
145       if (ch == '.') {
146         is_double = true;
147         break;
148       }
149     }
150   }
151 
152   std::unique_ptr<Token> tok;
153 
154   char* final_pos = nullptr;
155   if (is_double) {
156     tok = MakeUnique<Token>(TokenType::kDouble);
157 
158     double val = strtod(tok_str.c_str(), &final_pos);
159     tok->SetDoubleValue(val);
160   } else {
161     tok = MakeUnique<Token>(TokenType::kInteger);
162 
163     uint64_t val = uint64_t(std::strtoull(tok_str.c_str(), &final_pos, 10));
164     tok->SetUint64Value(static_cast<uint64_t>(val));
165   }
166   if (tok_str.size() > 1 && tok_str[0] == '-')
167     tok->SetNegative();
168 
169   tok->SetOriginalString(
170       tok_str.substr(0, static_cast<size_t>(final_pos - tok_str.c_str())));
171 
172   // If the number isn't the whole token then move back so we can then parse
173   // the string portion.
174   auto diff = size_t(final_pos - tok_str.c_str());
175   if (diff > 0)
176     current_position_ -= tok_str.length() - diff;
177 
178   return tok;
179 }
180 
ExtractToNext(const std::string & str)181 std::string Tokenizer::ExtractToNext(const std::string& str) {
182   size_t pos = data_.find(str, current_position_);
183   std::string ret;
184   if (pos == std::string::npos) {
185     ret = data_.substr(current_position_);
186     current_position_ = data_.length();
187   } else {
188     ret = data_.substr(current_position_, pos - current_position_);
189     current_position_ = pos;
190   }
191 
192   // Account for any new lines in the extracted text so our current line
193   // number stays correct.
194   for (const char c : ret) {
195     if (c == '\n')
196       ++current_line_;
197   }
198 
199   return ret;
200 }
201 
IsWhitespace(char ch)202 bool Tokenizer::IsWhitespace(char ch) {
203   return ch == '\0' || ch == '\t' || ch == '\r' || ch == 0x0c /* ff */ ||
204          ch == ' ';
205 }
206 
SkipWhitespace()207 void Tokenizer::SkipWhitespace() {
208   while (current_position_ < data_.size() &&
209          IsWhitespace(data_[current_position_])) {
210     ++current_position_;
211   }
212 }
213 
SkipComment()214 void Tokenizer::SkipComment() {
215   while (current_position_ < data_.length() &&
216          data_[current_position_] != '\n') {
217     ++current_position_;
218   }
219 }
220 
221 }  // namespace amber
222