1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef BASE_JSON_JSON_PARSER_H_ 6 #define BASE_JSON_JSON_PARSER_H_ 7 8 #include <stddef.h> 9 #include <stdint.h> 10 11 #include <memory> 12 #include <optional> 13 #include <string> 14 #include <string_view> 15 16 #include "base/compiler_specific.h" 17 #include "base/gtest_prod_util.h" 18 #include "base/json/json_reader.h" 19 20 namespace base { 21 22 class Value; 23 24 namespace internal { 25 26 class JSONParserTest; 27 28 // The implementation behind the JSONReader interface. This class is not meant 29 // to be used directly; it encapsulates logic that need not be exposed publicly. 30 // 31 // This parser guarantees O(n) time through the input string. Iteration happens 32 // on the byte level, with the functions ConsumeChars() and ConsumeChar(). The 33 // conversion from byte to JSON token happens without advancing the parser in 34 // GetNextToken/ParseToken, that is tokenization operates on the current parser 35 // position without advancing. 36 // 37 // Built on top of these are a family of Consume functions that iterate 38 // internally. Invariant: on entry of a Consume function, the parser is wound 39 // to the first byte of a valid JSON token. On exit, it is on the first byte 40 // after the token that was just consumed, which would likely be the first byte 41 // of the next token. 42 class JSONParser { 43 public: 44 JSONParser(int options, int max_depth = JSONReader::kStackMaxDepth); 45 ~JSONParser(); 46 47 // Parses the input string according to the set options and returns the 48 // result as a Value. 49 // Wrap this in base::FooValue::From() to check the Value is of type Foo and 50 // convert to a FooValue at the same time. 51 std::optional<Value> Parse(std::string_view input); 52 53 // Returns the error code. 54 JSONReader::JsonParseError error_code() const; 55 56 // Returns the human-friendly error message. 57 std::string GetErrorMessage() const; 58 59 // Returns the error line number if parse error happened. Otherwise always 60 // returns 0. 61 int error_line() const; 62 63 // Returns the error column number if parse error happened. Otherwise always 64 // returns 0. 65 int error_column() const; 66 67 private: 68 enum Token { 69 T_OBJECT_BEGIN, // { 70 T_OBJECT_END, // } 71 T_ARRAY_BEGIN, // [ 72 T_ARRAY_END, // ] 73 T_STRING, 74 T_NUMBER, 75 T_BOOL_TRUE, // true 76 T_BOOL_FALSE, // false 77 T_NULL, // null 78 T_LIST_SEPARATOR, // , 79 T_OBJECT_PAIR_SEPARATOR, // : 80 T_END_OF_INPUT, 81 T_INVALID_TOKEN, 82 }; 83 84 // A helper class used for parsing strings. One optimization performed is to 85 // create base::Value with a std::string_view to avoid unnecessary std::string 86 // copies. This is not possible if the input string needs to be decoded from 87 // UTF-16 to UTF-8, or if an escape sequence causes characters to be skipped. 88 // This class centralizes that logic. 89 class StringBuilder { 90 public: 91 // Empty constructor. Used for creating a builder with which to assign to. 92 StringBuilder(); 93 94 // |pos| is the beginning of an input string, excluding the |"|. 95 explicit StringBuilder(const char* pos); 96 97 ~StringBuilder(); 98 99 StringBuilder& operator=(StringBuilder&& other); 100 101 // Appends the Unicode code point |point| to the string, either by 102 // increasing the |length_| of the string if the string has not been 103 // converted, or by appending the UTF8 bytes for the code point. 104 void Append(uint32_t point); 105 106 // Converts the builder from its default std::string_view to a full 107 // std::string, performing a copy. Once a builder is converted, it cannot be 108 // made a std::string_view again. 109 void Convert(); 110 111 // Returns the builder as a string, invalidating all state. This allows 112 // the internal string buffer representation to be destructively moved 113 // in cases where the builder will not be needed any more. 114 std::string DestructiveAsString(); 115 116 private: 117 // The beginning of the input string. 118 const char* pos_; 119 120 // Number of bytes in |pos_| that make up the string being built. 121 size_t length_; 122 123 // The copied string representation. Will be unset until Convert() is 124 // called. 125 std::optional<std::string> string_; 126 }; 127 128 // Returns the next |count| bytes of the input stream, or nullopt if fewer 129 // than |count| bytes remain. 130 std::optional<std::string_view> PeekChars(int count); 131 132 // Calls PeekChars() with a |count| of 1. 133 std::optional<char> PeekChar(); 134 135 // Returns the next |count| bytes of the input stream, or nullopt if fewer 136 // than |count| bytes remain, and advances the parser position by |count|. 137 std::optional<std::string_view> ConsumeChars(int count); 138 139 // Calls ConsumeChars() with a |count| of 1. 140 std::optional<char> ConsumeChar(); 141 142 // Returns a pointer to the current character position. 143 const char* pos(); 144 145 // Skips over whitespace and comments to find the next token in the stream. 146 // This does not advance the parser for non-whitespace or comment chars. 147 Token GetNextToken(); 148 149 // Consumes whitespace characters and comments until the next non-that is 150 // encountered. 151 void EatWhitespaceAndComments(); 152 // Helper function that consumes a comment, assuming that the parser is 153 // currently wound to a '/'. 154 bool EatComment(); 155 156 // Calls GetNextToken() and then ParseToken(). 157 std::optional<Value> ParseNextToken(); 158 159 // Takes a token that represents the start of a Value ("a structural token" 160 // in RFC terms) and consumes it, returning the result as a Value. 161 std::optional<Value> ParseToken(Token token); 162 163 // Assuming that the parser is currently wound to '{', this parses a JSON 164 // object into a Value. 165 std::optional<Value> ConsumeDictionary(); 166 167 // Assuming that the parser is wound to '[', this parses a JSON list into a 168 // Value. 169 std::optional<Value> ConsumeList(); 170 171 // Calls through ConsumeStringRaw and wraps it in a value. 172 std::optional<Value> ConsumeString(); 173 174 // Assuming that the parser is wound to a double quote, this parses a string, 175 // decoding any escape sequences and converts UTF-16 to UTF-8. Returns true on 176 // success and places result into |out|. Returns false on failure with 177 // error information set. 178 bool ConsumeStringRaw(StringBuilder* out); 179 // Helper function for ConsumeStringRaw() that consumes the next four or 10 180 // bytes (parser is wound to the first character of a HEX sequence, with the 181 // potential for consuming another \uXXXX for a surrogate). Returns true on 182 // success and places the code point |out_code_point|, and false on failure. 183 bool DecodeUTF16(uint32_t* out_code_point); 184 185 // Assuming that the parser is wound to the start of a valid JSON number, 186 // this parses and converts it to either an int or double value. 187 std::optional<Value> ConsumeNumber(); 188 // Helper that reads characters that are ints. Returns true if a number was 189 // read and false on error. 190 bool ReadInt(bool allow_leading_zeros); 191 192 // Consumes the literal values of |true|, |false|, and |null|, assuming the 193 // parser is wound to the first character of any of those. 194 std::optional<Value> ConsumeLiteral(); 195 196 // Helper function that returns true if the byte sequence |match| can be 197 // consumed at the current parser position. Returns false if there are fewer 198 // than |match|-length bytes or if the sequence does not match, and the 199 // parser state is unchanged. 200 bool ConsumeIfMatch(std::string_view match); 201 202 // Sets the error information to |code| at the current column, based on 203 // |index_| and |index_last_line_|, with an optional positive/negative 204 // adjustment by |column_adjust|. 205 void ReportError(JSONReader::JsonParseError code, int column_adjust); 206 207 // Given the line and column number of an error, formats one of the error 208 // message contants from json_reader.h for human display. 209 static std::string FormatErrorMessage(int line, 210 int column, 211 const std::string& description); 212 213 // base::JSONParserOptions that control parsing. 214 const int options_; 215 216 // Maximum depth to parse. 217 const int max_depth_; 218 219 // The input stream being parsed. Note: Not guaranteed to NUL-terminated. 220 std::string_view input_; 221 222 // The index in the input stream to which the parser is wound. 223 int index_; 224 225 // The number of times the parser has recursed (current stack depth). 226 int stack_depth_; 227 228 // The line number that the parser is at currently. 229 int line_number_; 230 231 // The last value of |index_| on the previous line. 232 int index_last_line_; 233 234 // Error information. 235 JSONReader::JsonParseError error_code_; 236 int error_line_; 237 int error_column_; 238 239 friend class JSONParserTest; 240 FRIEND_TEST_ALL_PREFIXES(JSONParserTest, NextChar); 241 FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeDictionary); 242 FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeList); 243 FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeString); 244 FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeLiterals); 245 FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeNumbers); 246 FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ErrorMessages); 247 FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ReplaceInvalidCharacters); 248 FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ReplaceInvalidUTF16EscapeSequence); 249 250 JSONParser(const JSONParser&) = delete; 251 JSONParser& operator=(const JSONParser&) = delete; 252 }; 253 254 // Used when decoding and an invalid utf-8 sequence is encountered. 255 extern const char kUnicodeReplacementString[]; 256 257 } // namespace internal 258 } // namespace base 259 260 #endif // BASE_JSON_JSON_PARSER_H_ 261