// Copyright 2012 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef BASE_JSON_JSON_PARSER_H_
#define BASE_JSON_JSON_PARSER_H_

#include <stddef.h>
#include <stdint.h>

#include <memory>
#include <optional>
#include <string>
#include <string_view>
#include <utility>

#include "base/base_export.h"
#include "base/compiler_specific.h"
#include "base/gtest_prod_util.h"
#include "base/json/json_common.h"
#include "base/third_party/icu/icu_utf.h"
#include "base/values.h"

namespace base {

class Value;

namespace internal {

class JSONParserTest;

// The implementation behind the JSONReader interface. This class is not meant
// to be used directly; it encapsulates logic that need not be exposed publicly.
//
// This parser guarantees O(n) time through the input string. Iteration happens
// on the byte level, with the functions ConsumeChars() and ConsumeChar(). The
// conversion from byte to JSON token happens without advancing the parser in
// GetNextToken/ParseToken; that is, tokenization operates on the current
// parser position without advancing.
//
// Built on top of these are a family of Consume functions that iterate
// internally. Invariant: on entry of a Consume function, the parser is wound
// to the first byte of a valid JSON token. On exit, it is on the first byte
// after the token that was just consumed, which would likely be the first byte
// of the next token.
class BASE_EXPORT JSONParser {
 public:
  // Error codes during parsing. The first two values alias the generic
  // base::ValueDeserializer error codes; the remaining values follow on from
  // JSON_SYNTAX_ERROR. JSON_PARSE_ERROR_COUNT is the last enumerator.
  enum JsonParseError {
    JSON_NO_ERROR = base::ValueDeserializer::kErrorCodeNoError,
    JSON_SYNTAX_ERROR = base::ValueDeserializer::kErrorCodeInvalidFormat,
    JSON_INVALID_ESCAPE,
    JSON_UNEXPECTED_TOKEN,
    JSON_TRAILING_COMMA,
    JSON_TOO_MUCH_NESTING,
    JSON_UNEXPECTED_DATA_AFTER_ROOT,
    JSON_UNSUPPORTED_ENCODING,
    JSON_UNQUOTED_DICTIONARY_KEY,
    JSON_UNREPRESENTABLE_NUMBER,
    JSON_PARSE_ERROR_COUNT
  };

  // String versions of parse error codes.
  static const char kSyntaxError[];
  static const char kInvalidEscape[];
  static const char kUnexpectedToken[];
  static const char kTrailingComma[];
  static const char kTooMuchNesting[];
  static const char kUnexpectedDataAfterRoot[];
  static const char kUnsupportedEncoding[];
  static const char kUnquotedDictionaryKey[];
  static const char kUnrepresentableNumber[];

  // |options| are the base::JSONParserOptions that control parsing.
  // |max_depth| is the maximum depth to parse; it defaults to
  // kAbsoluteMaxDepth.
  explicit JSONParser(int options, size_t max_depth = kAbsoluteMaxDepth);

  // Not copyable.
  JSONParser(const JSONParser&) = delete;
  JSONParser& operator=(const JSONParser&) = delete;

  ~JSONParser();

  // Parses the input string according to the set options and returns the
  // result as a Value.
  // Wrap this in base::FooValue::From() to check the Value is of type Foo and
  // convert to a FooValue at the same time.
  std::optional<Value> Parse(std::string_view input);

  // Returns the error code.
  JsonParseError error_code() const;

  // Returns the human-friendly error message.
  std::string GetErrorMessage() const;

  // Returns the error line number if a parse error happened. Otherwise always
  // returns 0.
  int error_line() const;

  // Returns the error column number if a parse error happened. Otherwise
  // always returns 0.
  int error_column() const;

 private:
  // The kinds of token that GetNextToken() can report.
  enum Token {
    T_OBJECT_BEGIN,           // {
    T_OBJECT_END,             // }
    T_ARRAY_BEGIN,            // [
    T_ARRAY_END,              // ]
    T_STRING,
    T_NUMBER,
    T_BOOL_TRUE,              // true
    T_BOOL_FALSE,             // false
    T_NULL,                   // null
    T_LIST_SEPARATOR,         // ,
    T_OBJECT_PAIR_SEPARATOR,  // :
    T_END_OF_INPUT,
    T_INVALID_TOKEN,
  };

  // Returns the next |count| bytes of the input stream, or nullopt if fewer
  // than |count| bytes remain.
  std::optional<std::string_view> PeekChars(size_t count);

  // Calls PeekChars() with a |count| of 1.
  std::optional<char> PeekChar();

  // Returns the next |count| bytes of the input stream, or nullopt if fewer
  // than |count| bytes remain, and advances the parser position by |count|.
  std::optional<std::string_view> ConsumeChars(size_t count);

  // Calls ConsumeChars() with a |count| of 1.
  std::optional<char> ConsumeChar();

  // Returns a pointer to the current character position.
  const char* pos();

  // Skips over whitespace and comments to find the next token in the stream.
  // This does not advance the parser for non-whitespace or comment chars.
  Token GetNextToken();

  // Consumes whitespace characters and comments until a character that is
  // neither is encountered.
  void EatWhitespaceAndComments();
  // Helper function that consumes a comment, assuming that the parser is
  // currently wound to a '/'.
  bool EatComment();

  // Calls GetNextToken() and then ParseToken().
  std::optional<Value> ParseNextToken();

  // Takes a token that represents the start of a Value ("a structural token"
  // in RFC terms) and consumes it, returning the result as a Value.
  std::optional<Value> ParseToken(Token token);

  // Assuming that the parser is currently wound to '{', this parses a JSON
  // object into a Value.
  std::optional<Value> ConsumeDictionary();

  // Assuming that the parser is wound to '[', this parses a JSON list into a
  // Value.
  std::optional<Value> ConsumeList();

  // Calls through ConsumeStringRaw and wraps it in a value.
  std::optional<Value> ConsumeString();

  // Assuming that the parser is wound to a double quote, this parses a string,
  // decoding any escape sequences and validating UTF-8. Returns the string on
  // success or std::nullopt on error, with error information set.
  std::optional<std::string> ConsumeStringRaw();

  // The reason ConsumeStringPart() stopped consuming input.
  enum class StringResult {
    // Parsing stopped because of invalid input. Error information has been set.
    // The caller should return failure.
    kError,
    // Parsing stopped because the string is finished. The parser is wound to
    // just past the closing quote. The caller should stop parsing the string.
    kDone,
    // Parsing stopped because of invalid Unicode which should be replaced with
    // a replacement character. The parser is wound to just past the input that
    // should be a replacement character. The caller should add a replacement
    // character and continue parsing.
    kReplacementCharacter,
    // Parsing stopped because of an escape sequence. The parser is wound to
    // just past the backslash. The caller should consume the escape sequence
    // and continue parsing.
    kEscape,
  };

  // Consumes the portion of a JavaScript string which may be copied to the
  // input with no conversions, stopping at one of the events above. Returns the
  // reason parsing stopped and the data that was consumed. This should be
  // called in a loop, handling all the cases above until reaching kDone.
  // NOTE(review): "copied to the input" presumably means copied to the decoded
  // output string -- confirm against the implementation.
  std::pair<StringResult, std::string_view> ConsumeStringPart();

  // Helper function for ConsumeStringRaw() that consumes the next 4 or 10
  // bytes (parser is wound to the first character of a HEX sequence, with the
  // potential for consuming another \uXXXX for a surrogate). Returns true on
  // success and places the code point in |out_code_point|, and false on
  // failure.
  bool DecodeUTF16(base_icu::UChar32* out_code_point);

  // Assuming that the parser is wound to the start of a valid JSON number,
  // this parses and converts it to either an int or double value.
  std::optional<Value> ConsumeNumber();
  // Helper that reads characters that are ints. Returns true if a number was
  // read and false on error.
  bool ReadInt(bool allow_leading_zeros);

  // Consumes the literal values of |true|, |false|, and |null|, assuming the
  // parser is wound to the first character of any of those.
  std::optional<Value> ConsumeLiteral();

  // Helper function that returns true if the byte sequence |match| can be
  // consumed at the current parser position. Returns false if there are fewer
  // than |match|-length bytes or if the sequence does not match, and the
  // parser state is unchanged.
  bool ConsumeIfMatch(std::string_view match);

  // Sets the error information to |code| at the current column, based on
  // |index_| and |index_last_line_|, with an optional positive/negative
  // adjustment by |column_adjust|.
  void ReportError(JsonParseError code, int column_adjust);

  // Given the line and column number of an error, formats one of the error
  // message constants from json_reader.h for human display.
  static std::string FormatErrorMessage(int line, int column,
                                        const std::string& description);

  // base::JSONParserOptions that control parsing.
  const int options_;

  // Maximum depth to parse.
  const size_t max_depth_;

  // The input stream being parsed. Note: Not guaranteed to be NUL-terminated.
  std::string_view input_;

  // The index in the input stream to which the parser is wound.
  size_t index_;

  // The number of times the parser has recursed (current stack depth).
  size_t stack_depth_;

  // The line number that the parser is at currently.
  int line_number_;

  // The last value of |index_| on the previous line.
  size_t index_last_line_;

  // Error information.
  JsonParseError error_code_;
  int error_line_;
  int error_column_;

  // The test fixture and individual test cases that are granted access to the
  // private members above.
  friend class JSONParserTest;
  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, NextChar);
  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeDictionary);
  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeList);
  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeString);
  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeLiterals);
  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeNumbers);
  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ErrorMessages);
};

// Used when decoding and an invalid UTF-8 sequence is encountered.
BASE_EXPORT extern const char kUnicodeReplacementString[];

}  // namespace internal
}  // namespace base

#endif  // BASE_JSON_JSON_PARSER_H_