1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef TOOLS_GN_TOKENIZER_H_ 6 #define TOOLS_GN_TOKENIZER_H_ 7 8 #include <stddef.h> 9 10 #include <string_view> 11 #include <vector> 12 13 #include "base/macros.h" 14 #include "gn/err.h" 15 #include "gn/token.h" 16 17 class InputFile; 18 19 // Tab (0x09), vertical tab (0x0B), and formfeed (0x0C) are illegal in GN files. 20 // Almost always these are errors. However, in the case of running the formatter 21 // it's nice to convert these to spaces when encountered so that the input can 22 // still be parsed and rewritten correctly by the formatter. 23 enum class WhitespaceTransform { 24 kMaintainOriginalInput, 25 kInvalidToSpace, 26 }; 27 28 class Tokenizer { 29 public: 30 static std::vector<Token> Tokenize( 31 const InputFile* input_file, 32 Err* err, 33 WhitespaceTransform whitespace_transform = 34 WhitespaceTransform::kMaintainOriginalInput); 35 36 // Counts lines in the given buffer (the first line is "1") and returns 37 // the byte offset of the beginning of that line, or (size_t)-1 if there 38 // aren't that many lines in the file. Note that this will return the byte 39 // one past the end of the input if the last character is a newline. 40 // 41 // This is a helper function for error output so that the tokenizer's 42 // notion of lines can be used elsewhere. 43 static size_t ByteOffsetOfNthLine(const std::string_view& buf, int n); 44 45 // Returns true if the given offset of the string piece counts as a newline. 46 // The offset must be in the buffer. 47 static bool IsNewline(const std::string_view& buffer, size_t offset); 48 49 static bool IsIdentifierFirstChar(char c); 50 51 static bool IsIdentifierContinuingChar(char c); 52 53 private: 54 // InputFile must outlive the tokenizer and all generated tokens. 55 Tokenizer(const InputFile* input_file, 56 Err* err, 57 WhitespaceTransform whitespace_transform); 58 ~Tokenizer(); 59 60 std::vector<Token> Run(); 61 62 void AdvanceToNextToken(); 63 Token::Type ClassifyCurrent() const; 64 void AdvanceToEndOfToken(const Location& location, Token::Type type); 65 66 // Whether from this location back to the beginning of the line is only 67 // whitespace. |location| should be the first character of the token to be 68 // checked. 69 bool AtStartOfLine(size_t location) const; 70 71 bool IsCurrentWhitespace() const; 72 bool IsCurrentNewline() const; 73 bool IsCurrentStringTerminator(char quote_char) const; 74 CanIncrement()75 bool CanIncrement() const { return cur_ < input_.size() - 1; } 76 77 // Increments the current location by one. 78 void Advance(); 79 80 // Returns the current character in the file as a location. 81 Location GetCurrentLocation() const; 82 83 Err GetErrorForInvalidToken(const Location& location) const; 84 done()85 bool done() const { return at_end() || has_error(); } 86 at_end()87 bool at_end() const { return cur_ == input_.size(); } cur_char()88 char cur_char() const { return input_[cur_]; } 89 has_error()90 bool has_error() const { return err_->has_error(); } 91 92 std::vector<Token> tokens_; 93 94 const InputFile* input_file_; 95 const std::string_view input_; 96 Err* err_; 97 WhitespaceTransform whitespace_transform_; 98 size_t cur_ = 0; // Byte offset into input buffer. 99 100 int line_number_ = 1; 101 int column_number_ = 1; 102 103 DISALLOW_COPY_AND_ASSIGN(Tokenizer); 104 }; 105 106 #endif // TOOLS_GN_TOKENIZER_H_ 107