1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef TOOLS_GN_TOKENIZER_H_ 6 #define TOOLS_GN_TOKENIZER_H_ 7 8 #include <stddef.h> 9 10 #include <string_view> 11 #include <vector> 12 13 #include "gn/err.h" 14 #include "gn/token.h" 15 16 class InputFile; 17 18 // Tab (0x09), vertical tab (0x0B), and formfeed (0x0C) are illegal in GN files. 19 // Almost always these are errors. However, in the case of running the formatter 20 // it's nice to convert these to spaces when encountered so that the input can 21 // still be parsed and rewritten correctly by the formatter. 22 enum class WhitespaceTransform { 23 kMaintainOriginalInput, 24 kInvalidToSpace, 25 }; 26 27 class Tokenizer { 28 public: 29 static std::vector<Token> Tokenize( 30 const InputFile* input_file, 31 Err* err, 32 WhitespaceTransform whitespace_transform = 33 WhitespaceTransform::kMaintainOriginalInput); 34 35 // Counts lines in the given buffer (the first line is "1") and returns 36 // the byte offset of the beginning of that line, or (size_t)-1 if there 37 // aren't that many lines in the file. Note that this will return the byte 38 // one past the end of the input if the last character is a newline. 39 // 40 // This is a helper function for error output so that the tokenizer's 41 // notion of lines can be used elsewhere. 42 static size_t ByteOffsetOfNthLine(std::string_view buf, int n); 43 44 // Returns true if the given offset of the string piece counts as a newline. 45 // The offset must be in the buffer. 46 static bool IsNewline(std::string_view buffer, size_t offset); 47 48 static bool IsIdentifierFirstChar(char c); 49 50 static bool IsIdentifierContinuingChar(char c); 51 52 static Token::Type ClassifyToken(char next_char, char following_char); 53 54 private: 55 // InputFile must outlive the tokenizer and all generated tokens. 56 Tokenizer(const InputFile* input_file, 57 Err* err, 58 WhitespaceTransform whitespace_transform); 59 ~Tokenizer(); 60 61 std::vector<Token> Run(); 62 63 void AdvanceToNextToken(); 64 Token::Type ClassifyCurrent() const; 65 void AdvanceToEndOfToken(const Location& location, Token::Type type); 66 67 // Whether from this location back to the beginning of the line is only 68 // whitespace. |location| should be the first character of the token to be 69 // checked. 70 bool AtStartOfLine(size_t location) const; 71 72 bool IsCurrentWhitespace() const; 73 bool IsCurrentNewline() const; 74 bool IsCurrentStringTerminator(char quote_char) const; 75 CanIncrement()76 bool CanIncrement() const { return cur_ < input_.size() - 1; } 77 78 // Increments the current location by one. 79 void Advance(); 80 81 // Returns the current character in the file as a location. 82 Location GetCurrentLocation() const; 83 84 Err GetErrorForInvalidToken(const Location& location) const; 85 done()86 bool done() const { return at_end() || has_error(); } 87 at_end()88 bool at_end() const { return cur_ == input_.size(); } cur_char()89 char cur_char() const { return input_[cur_]; } 90 has_error()91 bool has_error() const { return err_->has_error(); } 92 93 std::vector<Token> tokens_; 94 95 const InputFile* input_file_; 96 const std::string_view input_; 97 Err* err_; 98 WhitespaceTransform whitespace_transform_; 99 size_t cur_ = 0; // Byte offset into input buffer. 100 101 int line_number_ = 1; 102 int column_number_ = 1; 103 104 Tokenizer(const Tokenizer&) = delete; 105 Tokenizer& operator=(const Tokenizer&) = delete; 106 }; 107 108 #endif // TOOLS_GN_TOKENIZER_H_ 109