// Residual navigation labels from the source browser this file was copied
// from: Home | Line# | Scopes# | Navigate# | Raw | Download
// Copyright (c) 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4 
#include "gn/tokenizer.h"

#include <utility>

#include "base/logging.h"
#include "base/strings/string_util.h"
#include "gn/input_file.h"
10 
11 namespace {
12 
// Returns true if |c| may be the first character of a two-character operator
// (for example the '<' of "<=" or the '&' of "&&").
bool CouldBeTwoCharOperatorBegin(char c) {
  return c == '<' || c == '>' || c == '!' || c == '=' || c == '-' || c == '+' ||
         c == '|' || c == '&';
}
17 
// Returns true if |c| may be the second character of a two-character operator
// (the '=' of "+=", "==", "<=" ..., the '|' of "||", or the '&' of "&&").
bool CouldBeTwoCharOperatorEnd(char c) {
  return c == '=' || c == '|' || c == '&';
}
21 
// Returns true if |c| is one of the characters this tokenizer accepts as the
// start of a single-character operator token. Note that the set is a lexical
// filter only: whether the resulting text is a real operator is decided later
// when the token text is mapped to a specific Token::Type.
bool CouldBeOneCharOperator(char c) {
  return c == '=' || c == '<' || c == '>' || c == '+' || c == '!' || c == ':' ||
         c == '|' || c == '&' || c == '-';
}
26 
CouldBeOperator(char c)27 bool CouldBeOperator(char c) {
28   return CouldBeOneCharOperator(c) || CouldBeTwoCharOperatorBegin(c);
29 }
30 
// Returns true if |c| is an opening or closing parenthesis, square bracket,
// or curly brace.
bool IsScoperChar(char c) {
  return c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || c == '}';
}
34 
GetSpecificOperatorType(std::string_view value)35 Token::Type GetSpecificOperatorType(std::string_view value) {
36   if (value == "=")
37     return Token::EQUAL;
38   if (value == "+")
39     return Token::PLUS;
40   if (value == "-")
41     return Token::MINUS;
42   if (value == "+=")
43     return Token::PLUS_EQUALS;
44   if (value == "-=")
45     return Token::MINUS_EQUALS;
46   if (value == "==")
47     return Token::EQUAL_EQUAL;
48   if (value == "!=")
49     return Token::NOT_EQUAL;
50   if (value == "<=")
51     return Token::LESS_EQUAL;
52   if (value == ">=")
53     return Token::GREATER_EQUAL;
54   if (value == "<")
55     return Token::LESS_THAN;
56   if (value == ">")
57     return Token::GREATER_THAN;
58   if (value == "&&")
59     return Token::BOOLEAN_AND;
60   if (value == "||")
61     return Token::BOOLEAN_OR;
62   if (value == "!")
63     return Token::BANG;
64   if (value == ".")
65     return Token::DOT;
66   return Token::INVALID;
67 }
68 
69 }  // namespace
70 
Tokenizer(const InputFile * input_file,Err * err,WhitespaceTransform whitespace_transform)71 Tokenizer::Tokenizer(const InputFile* input_file,
72                      Err* err,
73                      WhitespaceTransform whitespace_transform)
74     : input_file_(input_file),
75       input_(input_file->contents()),
76       err_(err),
77       whitespace_transform_(whitespace_transform) {}
78 
79 Tokenizer::~Tokenizer() = default;
80 
81 // static
Tokenize(const InputFile * input_file,Err * err,WhitespaceTransform whitespace_transform)82 std::vector<Token> Tokenizer::Tokenize(
83     const InputFile* input_file,
84     Err* err,
85     WhitespaceTransform whitespace_transform) {
86   Tokenizer t(input_file, err, whitespace_transform);
87   return t.Run();
88 }
89 
Run()90 std::vector<Token> Tokenizer::Run() {
91   DCHECK(tokens_.empty());
92   while (!done()) {
93     AdvanceToNextToken();
94     if (done())
95       break;
96     Location location = GetCurrentLocation();
97 
98     Token::Type type = ClassifyCurrent();
99     if (type == Token::INVALID) {
100       *err_ = GetErrorForInvalidToken(location);
101       break;
102     }
103     size_t token_begin = cur_;
104     AdvanceToEndOfToken(location, type);
105     if (has_error())
106       break;
107     size_t token_end = cur_;
108 
109     std::string_view token_value(&input_.data()[token_begin],
110                                  token_end - token_begin);
111 
112     if (type == Token::UNCLASSIFIED_OPERATOR) {
113       type = GetSpecificOperatorType(token_value);
114     } else if (type == Token::IDENTIFIER) {
115       if (token_value == "if")
116         type = Token::IF;
117       else if (token_value == "else")
118         type = Token::ELSE;
119       else if (token_value == "true")
120         type = Token::TRUE_TOKEN;
121       else if (token_value == "false")
122         type = Token::FALSE_TOKEN;
123     } else if (type == Token::UNCLASSIFIED_COMMENT) {
124       if (AtStartOfLine(token_begin) &&
125           // If it's a standalone comment, but is a continuation of a comment on
126           // a previous line, then instead make it a continued suffix comment.
127           (tokens_.empty() || tokens_.back().type() != Token::SUFFIX_COMMENT ||
128            tokens_.back().location().line_number() + 1 !=
129                location.line_number() ||
130            tokens_.back().location().column_number() !=
131                location.column_number())) {
132         type = Token::LINE_COMMENT;
133         if (!at_end())  // Could be EOF.
134           Advance();    // The current \n.
135         // If this comment is separated from the next syntax element, then we
136         // want to tag it as a block comment. This will become a standalone
137         // statement at the parser level to keep this comment separate, rather
138         // than attached to the subsequent statement.
139         while (!at_end() && IsCurrentWhitespace()) {
140           if (IsCurrentNewline()) {
141             type = Token::BLOCK_COMMENT;
142             break;
143           }
144           Advance();
145         }
146       } else {
147         type = Token::SUFFIX_COMMENT;
148       }
149     }
150 
151     tokens_.push_back(Token(location, type, token_value));
152   }
153   if (err_->has_error())
154     tokens_.clear();
155   return tokens_;
156 }
157 
158 // static
ByteOffsetOfNthLine(std::string_view buf,int n)159 size_t Tokenizer::ByteOffsetOfNthLine(std::string_view buf, int n) {
160   DCHECK_GT(n, 0);
161 
162   if (n == 1)
163     return 0;
164 
165   int cur_line = 1;
166   size_t cur_byte = 0;
167   while (cur_byte < buf.size()) {
168     if (IsNewline(buf, cur_byte)) {
169       cur_line++;
170       if (cur_line == n)
171         return cur_byte + 1;
172     }
173     cur_byte++;
174   }
175   return static_cast<size_t>(-1);
176 }
177 
178 // static
IsNewline(std::string_view buffer,size_t offset)179 bool Tokenizer::IsNewline(std::string_view buffer, size_t offset) {
180   DCHECK(offset < buffer.size());
181   // We may need more logic here to handle different line ending styles.
182   return buffer[offset] == '\n';
183 }
184 
185 // static
IsIdentifierFirstChar(char c)186 bool Tokenizer::IsIdentifierFirstChar(char c) {
187   return base::IsAsciiAlpha(c) || c == '_';
188 }
189 
190 // static
IsIdentifierContinuingChar(char c)191 bool Tokenizer::IsIdentifierContinuingChar(char c) {
192   // Also allow digits after the first char.
193   return IsIdentifierFirstChar(c) || base::IsAsciiDigit(c);
194 }
195 
AdvanceToNextToken()196 void Tokenizer::AdvanceToNextToken() {
197   while (!at_end() && IsCurrentWhitespace())
198     Advance();
199 }
200 
201 // static
ClassifyToken(char next_char,char following_char)202 Token::Type Tokenizer::ClassifyToken(char next_char, char following_char) {
203   if (base::IsAsciiDigit(next_char))
204     return Token::INTEGER;
205   if (next_char == '"')
206     return Token::STRING;
207 
208   // Note: '-' handled specially below.
209   if (next_char != '-' && CouldBeOperator(next_char))
210     return Token::UNCLASSIFIED_OPERATOR;
211 
212   if (IsIdentifierFirstChar(next_char))
213     return Token::IDENTIFIER;
214 
215   if (next_char == '[')
216     return Token::LEFT_BRACKET;
217   if (next_char == ']')
218     return Token::RIGHT_BRACKET;
219   if (next_char == '(')
220     return Token::LEFT_PAREN;
221   if (next_char == ')')
222     return Token::RIGHT_PAREN;
223   if (next_char == '{')
224     return Token::LEFT_BRACE;
225   if (next_char == '}')
226     return Token::RIGHT_BRACE;
227 
228   if (next_char == '.')
229     return Token::DOT;
230   if (next_char == ',')
231     return Token::COMMA;
232 
233   if (next_char == '#')
234     return Token::UNCLASSIFIED_COMMENT;
235 
236   // For the case of '-' differentiate between a negative number and anything
237   // else.
238   if (next_char == '-') {
239     if (following_char == '\0')
240       return Token::UNCLASSIFIED_OPERATOR;  // Just the minus before end of
241                                             // file.
242     if (base::IsAsciiDigit(following_char))
243       return Token::INTEGER;
244     return Token::UNCLASSIFIED_OPERATOR;
245   }
246 
247   return Token::INVALID;
248 }
249 
ClassifyCurrent() const250 Token::Type Tokenizer::ClassifyCurrent() const {
251   DCHECK(!at_end());
252   char next_char = cur_char();
253   char following_char = CanIncrement() ? input_[cur_ + 1] : '\0';
254   return ClassifyToken(next_char, following_char);
255 }
256 
AdvanceToEndOfToken(const Location & location,Token::Type type)257 void Tokenizer::AdvanceToEndOfToken(const Location& location,
258                                     Token::Type type) {
259   switch (type) {
260     case Token::INTEGER:
261       do {
262         Advance();
263       } while (!at_end() && base::IsAsciiDigit(cur_char()));
264       if (!at_end()) {
265         // Require the char after a number to be some kind of space, scope,
266         // or operator.
267         char c = cur_char();
268         if (!IsCurrentWhitespace() && !CouldBeOperator(c) && !IsScoperChar(c) &&
269             c != ',') {
270           *err_ = Err(GetCurrentLocation(), "This is not a valid number.");
271           // Highlight the number.
272           err_->AppendRange(LocationRange(location, GetCurrentLocation()));
273         }
274       }
275       break;
276 
277     case Token::STRING: {
278       char initial = cur_char();
279       Advance();  // Advance past initial "
280       for (;;) {
281         if (at_end()) {
282           *err_ = Err(LocationRange(location, GetCurrentLocation()),
283                       "Unterminated string literal.",
284                       "Don't leave me hanging like this!");
285           break;
286         }
287         if (IsCurrentStringTerminator(initial)) {
288           Advance();  // Skip past last "
289           break;
290         } else if (IsCurrentNewline()) {
291           *err_ = Err(LocationRange(location, GetCurrentLocation()),
292                       "Newline in string constant.");
293         }
294         Advance();
295       }
296       break;
297     }
298 
299     case Token::UNCLASSIFIED_OPERATOR:
300       // Some operators are two characters, some are one.
301       if (CouldBeTwoCharOperatorBegin(cur_char())) {
302         if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1]))
303           Advance();
304       }
305       Advance();
306       break;
307 
308     case Token::IDENTIFIER:
309       while (!at_end() && IsIdentifierContinuingChar(cur_char()))
310         Advance();
311       break;
312 
313     case Token::LEFT_BRACKET:
314     case Token::RIGHT_BRACKET:
315     case Token::LEFT_BRACE:
316     case Token::RIGHT_BRACE:
317     case Token::LEFT_PAREN:
318     case Token::RIGHT_PAREN:
319     case Token::DOT:
320     case Token::COMMA:
321       Advance();  // All are one char.
322       break;
323 
324     case Token::UNCLASSIFIED_COMMENT:
325       // Eat to EOL.
326       while (!at_end() && !IsCurrentNewline())
327         Advance();
328       break;
329 
330     case Token::INVALID:
331     default:
332       *err_ = Err(location, "Everything is all messed up",
333                   "Please insert system disk in drive A: and press any key.");
334       NOTREACHED();
335       return;
336   }
337 }
338 
AtStartOfLine(size_t location) const339 bool Tokenizer::AtStartOfLine(size_t location) const {
340   while (location > 0) {
341     --location;
342     char c = input_[location];
343     if (c == '\n')
344       return true;
345     if (c != ' ')
346       return false;
347   }
348   return true;
349 }
350 
IsCurrentWhitespace() const351 bool Tokenizer::IsCurrentWhitespace() const {
352   DCHECK(!at_end());
353   char c = input_[cur_];
354   // Note that tab (0x09), vertical tab (0x0B), and formfeed (0x0C) are illegal.
355   return c == 0x0A || c == 0x0D || c == 0x20 ||
356          (whitespace_transform_ == WhitespaceTransform::kInvalidToSpace &&
357           (c == 0x09 || c == 0x0B || c == 0x0C));
358 }
359 
IsCurrentStringTerminator(char quote_char) const360 bool Tokenizer::IsCurrentStringTerminator(char quote_char) const {
361   DCHECK(!at_end());
362   if (cur_char() != quote_char)
363     return false;
364 
365   // Check for escaping. \" is not a string terminator, but \\" is. Count
366   // the number of preceding backslashes.
367   int num_backslashes = 0;
368   for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--)
369     num_backslashes++;
370 
371   // Even backslashes mean that they were escaping each other and don't count
372   // as escaping this quote.
373   return (num_backslashes % 2) == 0;
374 }
375 
IsCurrentNewline() const376 bool Tokenizer::IsCurrentNewline() const {
377   return IsNewline(input_, cur_);
378 }
379 
Advance()380 void Tokenizer::Advance() {
381   DCHECK(cur_ < input_.size());
382   if (IsCurrentNewline()) {
383     line_number_++;
384     column_number_ = 1;
385   } else {
386     column_number_++;
387   }
388   cur_++;
389 }
390 
GetCurrentLocation() const391 Location Tokenizer::GetCurrentLocation() const {
392   return Location(input_file_, line_number_, column_number_);
393 }
394 
GetErrorForInvalidToken(const Location & location) const395 Err Tokenizer::GetErrorForInvalidToken(const Location& location) const {
396   std::string help;
397   if (cur_char() == ';') {
398     // Semicolon.
399     help = "Semicolons are not needed, delete this one.";
400   } else if (cur_char() == '\t') {
401     // Tab.
402     help =
403         "You got a tab character in here. Tabs are evil. "
404         "Convert to spaces.";
405   } else if (cur_char() == '/' && cur_ + 1 < input_.size() &&
406              (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) {
407     // Different types of comments.
408     help = "Comments should start with # instead";
409   } else if (cur_char() == '\'') {
410     help = "Strings are delimited by \" characters, not apostrophes.";
411   } else {
412     help = "I have no idea what this is.";
413   }
414 
415   return Err(location, "Invalid token.", help);
416 }
417