• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef TOOLS_GN_TOKENIZER_H_
6 #define TOOLS_GN_TOKENIZER_H_
7 
8 #include <stddef.h>
9 
10 #include <string_view>
11 #include <vector>
12 
13 #include "gn/err.h"
14 #include "gn/token.h"
15 
16 class InputFile;
17 
// Controls how the tokenizer treats whitespace characters that GN does not
// permit in build files: tab (0x09), vertical tab (0x0B), and formfeed (0x0C).
//
// Encountering one of these is almost always an error. The formatter is the
// exception: there it is more useful to treat them as spaces so the input can
// still be parsed and then rewritten correctly.
enum class WhitespaceTransform {
  // Leave the input exactly as it was read.
  kMaintainOriginalInput,

  // Treat the invalid whitespace characters listed above as spaces.
  kInvalidToSpace,
};
26 
27 class Tokenizer {
28  public:
29   static std::vector<Token> Tokenize(
30       const InputFile* input_file,
31       Err* err,
32       WhitespaceTransform whitespace_transform =
33           WhitespaceTransform::kMaintainOriginalInput);
34 
35   // Counts lines in the given buffer (the first line is "1") and returns
36   // the byte offset of the beginning of that line, or (size_t)-1 if there
37   // aren't that many lines in the file. Note that this will return the byte
38   // one past the end of the input if the last character is a newline.
39   //
40   // This is a helper function for error output so that the tokenizer's
41   // notion of lines can be used elsewhere.
42   static size_t ByteOffsetOfNthLine(std::string_view buf, int n);
43 
44   // Returns true if the given offset of the string piece counts as a newline.
45   // The offset must be in the buffer.
46   static bool IsNewline(std::string_view buffer, size_t offset);
47 
48   static bool IsIdentifierFirstChar(char c);
49 
50   static bool IsIdentifierContinuingChar(char c);
51 
52   static Token::Type ClassifyToken(char next_char, char following_char);
53 
54  private:
55   // InputFile must outlive the tokenizer and all generated tokens.
56   Tokenizer(const InputFile* input_file,
57             Err* err,
58             WhitespaceTransform whitespace_transform);
59   ~Tokenizer();
60 
61   std::vector<Token> Run();
62 
63   void AdvanceToNextToken();
64   Token::Type ClassifyCurrent() const;
65   void AdvanceToEndOfToken(const Location& location, Token::Type type);
66 
67   // Whether from this location back to the beginning of the line is only
68   // whitespace. |location| should be the first character of the token to be
69   // checked.
70   bool AtStartOfLine(size_t location) const;
71 
72   bool IsCurrentWhitespace() const;
73   bool IsCurrentNewline() const;
74   bool IsCurrentStringTerminator(char quote_char) const;
75 
CanIncrement()76   bool CanIncrement() const { return cur_ < input_.size() - 1; }
77 
78   // Increments the current location by one.
79   void Advance();
80 
81   // Returns the current character in the file as a location.
82   Location GetCurrentLocation() const;
83 
84   Err GetErrorForInvalidToken(const Location& location) const;
85 
done()86   bool done() const { return at_end() || has_error(); }
87 
at_end()88   bool at_end() const { return cur_ == input_.size(); }
cur_char()89   char cur_char() const { return input_[cur_]; }
90 
has_error()91   bool has_error() const { return err_->has_error(); }
92 
93   std::vector<Token> tokens_;
94 
95   const InputFile* input_file_;
96   const std::string_view input_;
97   Err* err_;
98   WhitespaceTransform whitespace_transform_;
99   size_t cur_ = 0;  // Byte offset into input buffer.
100 
101   int line_number_ = 1;
102   int column_number_ = 1;
103 
104   Tokenizer(const Tokenizer&) = delete;
105   Tokenizer& operator=(const Tokenizer&) = delete;
106 };
107 
108 #endif  // TOOLS_GN_TOKENIZER_H_
109