1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "gn/tokenizer.h"
6
7 #include "base/logging.h"
8 #include "base/strings/string_util.h"
9 #include "gn/input_file.h"
10
11 namespace {
12
// Returns true if |c| may be the first character of a two-character operator
// such as "<=", "!=", "+=" or "&&".
bool CouldBeTwoCharOperatorBegin(char c) {
  switch (c) {
    case '<':
    case '>':
    case '!':
    case '=':
    case '-':
    case '+':
    case '|':
    case '&':
      return true;
    default:
      return false;
  }
}
17
// Returns true if |c| may be the second character of a two-character
// operator (the '=' in "<=", or the repeated char in "||" and "&&").
bool CouldBeTwoCharOperatorEnd(char c) {
  switch (c) {
    case '=':
    case '|':
    case '&':
      return true;
    default:
      return false;
  }
}
21
// Returns true if |c| on its own is treated as a candidate single-character
// operator by the tokenizer.
bool CouldBeOneCharOperator(char c) {
  switch (c) {
    case '=':
    case '<':
    case '>':
    case '+':
    case '!':
    case ':':
    case '|':
    case '&':
    case '-':
      return true;
    default:
      return false;
  }
}
26
CouldBeOperator(char c)27 bool CouldBeOperator(char c) {
28 return CouldBeOneCharOperator(c) || CouldBeTwoCharOperatorBegin(c);
29 }
30
// Returns true if |c| is one of the bracketing characters: parentheses,
// square brackets or curly braces.
bool IsScoperChar(char c) {
  switch (c) {
    case '(':
    case ')':
    case '[':
    case ']':
    case '{':
    case '}':
      return true;
    default:
      return false;
  }
}
34
GetSpecificOperatorType(std::string_view value)35 Token::Type GetSpecificOperatorType(std::string_view value) {
36 if (value == "=")
37 return Token::EQUAL;
38 if (value == "+")
39 return Token::PLUS;
40 if (value == "-")
41 return Token::MINUS;
42 if (value == "+=")
43 return Token::PLUS_EQUALS;
44 if (value == "-=")
45 return Token::MINUS_EQUALS;
46 if (value == "==")
47 return Token::EQUAL_EQUAL;
48 if (value == "!=")
49 return Token::NOT_EQUAL;
50 if (value == "<=")
51 return Token::LESS_EQUAL;
52 if (value == ">=")
53 return Token::GREATER_EQUAL;
54 if (value == "<")
55 return Token::LESS_THAN;
56 if (value == ">")
57 return Token::GREATER_THAN;
58 if (value == "&&")
59 return Token::BOOLEAN_AND;
60 if (value == "||")
61 return Token::BOOLEAN_OR;
62 if (value == "!")
63 return Token::BANG;
64 if (value == ".")
65 return Token::DOT;
66 return Token::INVALID;
67 }
68
69 } // namespace
70
Tokenizer(const InputFile * input_file,Err * err,WhitespaceTransform whitespace_transform)71 Tokenizer::Tokenizer(const InputFile* input_file,
72 Err* err,
73 WhitespaceTransform whitespace_transform)
74 : input_file_(input_file),
75 input_(input_file->contents()),
76 err_(err),
77 whitespace_transform_(whitespace_transform) {}
78
79 Tokenizer::~Tokenizer() = default;
80
81 // static
Tokenize(const InputFile * input_file,Err * err,WhitespaceTransform whitespace_transform)82 std::vector<Token> Tokenizer::Tokenize(
83 const InputFile* input_file,
84 Err* err,
85 WhitespaceTransform whitespace_transform) {
86 Tokenizer t(input_file, err, whitespace_transform);
87 return t.Run();
88 }
89
Run()90 std::vector<Token> Tokenizer::Run() {
91 DCHECK(tokens_.empty());
92 while (!done()) {
93 AdvanceToNextToken();
94 if (done())
95 break;
96 Location location = GetCurrentLocation();
97
98 Token::Type type = ClassifyCurrent();
99 if (type == Token::INVALID) {
100 *err_ = GetErrorForInvalidToken(location);
101 break;
102 }
103 size_t token_begin = cur_;
104 AdvanceToEndOfToken(location, type);
105 if (has_error())
106 break;
107 size_t token_end = cur_;
108
109 std::string_view token_value(&input_.data()[token_begin],
110 token_end - token_begin);
111
112 if (type == Token::UNCLASSIFIED_OPERATOR) {
113 type = GetSpecificOperatorType(token_value);
114 } else if (type == Token::IDENTIFIER) {
115 if (token_value == "if")
116 type = Token::IF;
117 else if (token_value == "else")
118 type = Token::ELSE;
119 else if (token_value == "true")
120 type = Token::TRUE_TOKEN;
121 else if (token_value == "false")
122 type = Token::FALSE_TOKEN;
123 } else if (type == Token::UNCLASSIFIED_COMMENT) {
124 if (AtStartOfLine(token_begin) &&
125 // If it's a standalone comment, but is a continuation of a comment on
126 // a previous line, then instead make it a continued suffix comment.
127 (tokens_.empty() || tokens_.back().type() != Token::SUFFIX_COMMENT ||
128 tokens_.back().location().line_number() + 1 !=
129 location.line_number() ||
130 tokens_.back().location().column_number() !=
131 location.column_number())) {
132 type = Token::LINE_COMMENT;
133 if (!at_end()) // Could be EOF.
134 Advance(); // The current \n.
135 // If this comment is separated from the next syntax element, then we
136 // want to tag it as a block comment. This will become a standalone
137 // statement at the parser level to keep this comment separate, rather
138 // than attached to the subsequent statement.
139 while (!at_end() && IsCurrentWhitespace()) {
140 if (IsCurrentNewline()) {
141 type = Token::BLOCK_COMMENT;
142 break;
143 }
144 Advance();
145 }
146 } else {
147 type = Token::SUFFIX_COMMENT;
148 }
149 }
150
151 tokens_.push_back(Token(location, type, token_value));
152 }
153 if (err_->has_error())
154 tokens_.clear();
155 return tokens_;
156 }
157
158 // static
ByteOffsetOfNthLine(std::string_view buf,int n)159 size_t Tokenizer::ByteOffsetOfNthLine(std::string_view buf, int n) {
160 DCHECK_GT(n, 0);
161
162 if (n == 1)
163 return 0;
164
165 int cur_line = 1;
166 size_t cur_byte = 0;
167 while (cur_byte < buf.size()) {
168 if (IsNewline(buf, cur_byte)) {
169 cur_line++;
170 if (cur_line == n)
171 return cur_byte + 1;
172 }
173 cur_byte++;
174 }
175 return static_cast<size_t>(-1);
176 }
177
178 // static
IsNewline(std::string_view buffer,size_t offset)179 bool Tokenizer::IsNewline(std::string_view buffer, size_t offset) {
180 DCHECK(offset < buffer.size());
181 // We may need more logic here to handle different line ending styles.
182 return buffer[offset] == '\n';
183 }
184
185 // static
IsIdentifierFirstChar(char c)186 bool Tokenizer::IsIdentifierFirstChar(char c) {
187 return base::IsAsciiAlpha(c) || c == '_';
188 }
189
190 // static
IsIdentifierContinuingChar(char c)191 bool Tokenizer::IsIdentifierContinuingChar(char c) {
192 // Also allow digits after the first char.
193 return IsIdentifierFirstChar(c) || base::IsAsciiDigit(c);
194 }
195
AdvanceToNextToken()196 void Tokenizer::AdvanceToNextToken() {
197 while (!at_end() && IsCurrentWhitespace())
198 Advance();
199 }
200
201 // static
ClassifyToken(char next_char,char following_char)202 Token::Type Tokenizer::ClassifyToken(char next_char, char following_char) {
203 if (base::IsAsciiDigit(next_char))
204 return Token::INTEGER;
205 if (next_char == '"')
206 return Token::STRING;
207
208 // Note: '-' handled specially below.
209 if (next_char != '-' && CouldBeOperator(next_char))
210 return Token::UNCLASSIFIED_OPERATOR;
211
212 if (IsIdentifierFirstChar(next_char))
213 return Token::IDENTIFIER;
214
215 if (next_char == '[')
216 return Token::LEFT_BRACKET;
217 if (next_char == ']')
218 return Token::RIGHT_BRACKET;
219 if (next_char == '(')
220 return Token::LEFT_PAREN;
221 if (next_char == ')')
222 return Token::RIGHT_PAREN;
223 if (next_char == '{')
224 return Token::LEFT_BRACE;
225 if (next_char == '}')
226 return Token::RIGHT_BRACE;
227
228 if (next_char == '.')
229 return Token::DOT;
230 if (next_char == ',')
231 return Token::COMMA;
232
233 if (next_char == '#')
234 return Token::UNCLASSIFIED_COMMENT;
235
236 // For the case of '-' differentiate between a negative number and anything
237 // else.
238 if (next_char == '-') {
239 if (following_char == '\0')
240 return Token::UNCLASSIFIED_OPERATOR; // Just the minus before end of
241 // file.
242 if (base::IsAsciiDigit(following_char))
243 return Token::INTEGER;
244 return Token::UNCLASSIFIED_OPERATOR;
245 }
246
247 return Token::INVALID;
248 }
249
ClassifyCurrent() const250 Token::Type Tokenizer::ClassifyCurrent() const {
251 DCHECK(!at_end());
252 char next_char = cur_char();
253 char following_char = CanIncrement() ? input_[cur_ + 1] : '\0';
254 return ClassifyToken(next_char, following_char);
255 }
256
AdvanceToEndOfToken(const Location & location,Token::Type type)257 void Tokenizer::AdvanceToEndOfToken(const Location& location,
258 Token::Type type) {
259 switch (type) {
260 case Token::INTEGER:
261 do {
262 Advance();
263 } while (!at_end() && base::IsAsciiDigit(cur_char()));
264 if (!at_end()) {
265 // Require the char after a number to be some kind of space, scope,
266 // or operator.
267 char c = cur_char();
268 if (!IsCurrentWhitespace() && !CouldBeOperator(c) && !IsScoperChar(c) &&
269 c != ',') {
270 *err_ = Err(GetCurrentLocation(), "This is not a valid number.");
271 // Highlight the number.
272 err_->AppendRange(LocationRange(location, GetCurrentLocation()));
273 }
274 }
275 break;
276
277 case Token::STRING: {
278 char initial = cur_char();
279 Advance(); // Advance past initial "
280 for (;;) {
281 if (at_end()) {
282 *err_ = Err(LocationRange(location, GetCurrentLocation()),
283 "Unterminated string literal.",
284 "Don't leave me hanging like this!");
285 break;
286 }
287 if (IsCurrentStringTerminator(initial)) {
288 Advance(); // Skip past last "
289 break;
290 } else if (IsCurrentNewline()) {
291 *err_ = Err(LocationRange(location, GetCurrentLocation()),
292 "Newline in string constant.");
293 }
294 Advance();
295 }
296 break;
297 }
298
299 case Token::UNCLASSIFIED_OPERATOR:
300 // Some operators are two characters, some are one.
301 if (CouldBeTwoCharOperatorBegin(cur_char())) {
302 if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1]))
303 Advance();
304 }
305 Advance();
306 break;
307
308 case Token::IDENTIFIER:
309 while (!at_end() && IsIdentifierContinuingChar(cur_char()))
310 Advance();
311 break;
312
313 case Token::LEFT_BRACKET:
314 case Token::RIGHT_BRACKET:
315 case Token::LEFT_BRACE:
316 case Token::RIGHT_BRACE:
317 case Token::LEFT_PAREN:
318 case Token::RIGHT_PAREN:
319 case Token::DOT:
320 case Token::COMMA:
321 Advance(); // All are one char.
322 break;
323
324 case Token::UNCLASSIFIED_COMMENT:
325 // Eat to EOL.
326 while (!at_end() && !IsCurrentNewline())
327 Advance();
328 break;
329
330 case Token::INVALID:
331 default:
332 *err_ = Err(location, "Everything is all messed up",
333 "Please insert system disk in drive A: and press any key.");
334 NOTREACHED();
335 return;
336 }
337 }
338
AtStartOfLine(size_t location) const339 bool Tokenizer::AtStartOfLine(size_t location) const {
340 while (location > 0) {
341 --location;
342 char c = input_[location];
343 if (c == '\n')
344 return true;
345 if (c != ' ')
346 return false;
347 }
348 return true;
349 }
350
IsCurrentWhitespace() const351 bool Tokenizer::IsCurrentWhitespace() const {
352 DCHECK(!at_end());
353 char c = input_[cur_];
354 // Note that tab (0x09), vertical tab (0x0B), and formfeed (0x0C) are illegal.
355 return c == 0x0A || c == 0x0D || c == 0x20 ||
356 (whitespace_transform_ == WhitespaceTransform::kInvalidToSpace &&
357 (c == 0x09 || c == 0x0B || c == 0x0C));
358 }
359
IsCurrentStringTerminator(char quote_char) const360 bool Tokenizer::IsCurrentStringTerminator(char quote_char) const {
361 DCHECK(!at_end());
362 if (cur_char() != quote_char)
363 return false;
364
365 // Check for escaping. \" is not a string terminator, but \\" is. Count
366 // the number of preceding backslashes.
367 int num_backslashes = 0;
368 for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--)
369 num_backslashes++;
370
371 // Even backslashes mean that they were escaping each other and don't count
372 // as escaping this quote.
373 return (num_backslashes % 2) == 0;
374 }
375
IsCurrentNewline() const376 bool Tokenizer::IsCurrentNewline() const {
377 return IsNewline(input_, cur_);
378 }
379
Advance()380 void Tokenizer::Advance() {
381 DCHECK(cur_ < input_.size());
382 if (IsCurrentNewline()) {
383 line_number_++;
384 column_number_ = 1;
385 } else {
386 column_number_++;
387 }
388 cur_++;
389 }
390
GetCurrentLocation() const391 Location Tokenizer::GetCurrentLocation() const {
392 return Location(input_file_, line_number_, column_number_);
393 }
394
GetErrorForInvalidToken(const Location & location) const395 Err Tokenizer::GetErrorForInvalidToken(const Location& location) const {
396 std::string help;
397 if (cur_char() == ';') {
398 // Semicolon.
399 help = "Semicolons are not needed, delete this one.";
400 } else if (cur_char() == '\t') {
401 // Tab.
402 help =
403 "You got a tab character in here. Tabs are evil. "
404 "Convert to spaces.";
405 } else if (cur_char() == '/' && cur_ + 1 < input_.size() &&
406 (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) {
407 // Different types of comments.
408 help = "Comments should start with # instead";
409 } else if (cur_char() == '\'') {
410 help = "Strings are delimited by \" characters, not apostrophes.";
411 } else {
412 help = "I have no idea what this is.";
413 }
414
415 return Err(location, "Invalid token.", help);
416 }
417