// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef BASE_JSON_JSON_PARSER_H_
#define BASE_JSON_JSON_PARSER_H_

#include <stddef.h>
#include <stdint.h>

#include <memory>
#include <string>

#include "base/base_export.h"
#include "base/compiler_specific.h"
#include "base/gtest_prod_util.h"
#include "base/json/json_reader.h"
#include "base/macros.h"
#include "base/memory/manual_constructor.h"
#include "base/strings/string_piece.h"

namespace base {

class Value;

namespace internal {

// Forward declaration of the test fixture that is befriended by JSONParser.
class JSONParserTest;

// The implementation behind the JSONReader interface. This class is not meant
// to be used directly; it encapsulates logic that need not be exposed publicly.
//
// This parser guarantees O(n) time through the input string. It also avoids
// unnecessary string copies by creating base::Value objects from a StringPiece
// where possible, using "hidden roots," discussed in the implementation.
//
// Iteration happens on the byte level, with the functions CanConsume and
// NextChar. The conversion from byte to JSON token happens without advancing
// the parser in GetNextToken/ParseToken; that is, tokenization operates on
// the current parser position without advancing.
//
// Built on top of these are a family of Consume functions that iterate
// internally. Invariant: on entry of a Consume function, the parser is wound
// to the first byte of a valid JSON token. On exit, it is on the last byte
// of a token, such that the next iteration of the parser will be at the byte
// immediately following the token, which would likely be the first byte of the
// next token.
class BASE_EXPORT JSONParser {
 public:
  // |options| is presumably a combination of base::JSONParserOptions values
  // (see |options_| below) -- confirm against the implementation.
  explicit JSONParser(int options);
  ~JSONParser();

  // Parses the input string according to the set options and returns the
  // result as a Value.
  // Wrap this in base::FooValue::From() to check the Value is of type Foo and
  // convert to a FooValue at the same time.
  std::unique_ptr<Value> Parse(StringPiece input);

  // Returns the error code.
  JSONReader::JsonParseError error_code() const;

  // Returns the human-friendly error message.
  std::string GetErrorMessage() const;

  // Returns the error line number if parse error happened. Otherwise always
  // returns 0.
  int error_line() const;

  // Returns the error column number if parse error happened. Otherwise always
  // returns 0.
  int error_column() const;

 private:
  // The kinds of token produced by GetNextToken() and accepted by
  // ParseToken(). Where a token corresponds to fixed text, that text is shown
  // in the trailing comment.
  enum Token {
    T_OBJECT_BEGIN,           // {
    T_OBJECT_END,             // }
    T_ARRAY_BEGIN,            // [
    T_ARRAY_END,              // ]
    T_STRING,
    T_NUMBER,
    T_BOOL_TRUE,              // true
    T_BOOL_FALSE,             // false
    T_NULL,                   // null
    T_LIST_SEPARATOR,         // ,
    T_OBJECT_PAIR_SEPARATOR,  // :
    T_END_OF_INPUT,
    T_INVALID_TOKEN,
  };

  // A helper class used for parsing strings. One optimization performed is to
  // create base::Value with a StringPiece to avoid unnecessary std::string
  // copies. This is not possible if the input string needs to be decoded from
  // UTF-16 to UTF-8, or if an escape sequence causes characters to be skipped.
  // This class centralizes that logic.
  class StringBuilder {
   public:
    // Empty constructor. Used for creating a builder with which to assign to.
    StringBuilder();

    // |pos| is the beginning of an input string, excluding the |"|.
    explicit StringBuilder(const char* pos);

    ~StringBuilder();

    // Move-assigns from |other|. Returns void, so assignments cannot be
    // chained.
    void operator=(StringBuilder&& other);

    // Either increases the |length_| of the string or copies the character if
    // the StringBuilder has been converted. |c| must be in the basic ASCII
    // plane; all other characters need to be in UTF-8 units, appended with
    // AppendString below.
    void Append(const char& c);

    // Appends |len| bytes starting at |str| to the std::string. The builder
    // must have been Convert()ed before this is used.
    void AppendString(const char* str, size_t len);

    // Converts the builder from its default StringPiece to a full std::string,
    // performing a copy. Once a builder is converted, it cannot be made a
    // StringPiece again.
    void Convert();

    // Returns the builder as a StringPiece.
    StringPiece AsStringPiece();

    // Returns the builder as a std::string.
    const std::string& AsString();

    // Returns the builder as a string, invalidating all state. This allows
    // the internal string buffer representation to be destructively moved
    // in cases where the builder will not be needed any more.
    std::string DestructiveAsString();

   private:
    // The beginning of the input string.
    const char* pos_;

    // Number of bytes in |pos_| that make up the string being built.
    size_t length_;

    // The copied string representation. Will be uninitialized until Convert()
    // is called, which will set has_string_ to true.
    bool has_string_;
    base::ManualConstructor<std::string> string_;
  };

  // Quick check that the stream has capacity to consume |length| more bytes.
  bool CanConsume(int length);

  // The basic way to consume a single character in the stream. Consumes one
  // byte of the input stream and returns a pointer to the rest of it.
  const char* NextChar();

  // Performs the equivalent of NextChar N times.
  void NextNChars(int n);

  // Skips over whitespace and comments to find the next token in the stream.
  // This does not advance the parser for non-whitespace or comment chars.
  Token GetNextToken();

  // Consumes whitespace characters and comments until the next character that
  // is neither whitespace nor part of a comment is encountered.
  void EatWhitespaceAndComments();
  // Helper function that consumes a comment, assuming that the parser is
  // currently wound to a '/'.
  bool EatComment();

  // Calls GetNextToken() and then ParseToken().
  std::unique_ptr<Value> ParseNextToken();

  // Takes a token that represents the start of a Value ("a structural token"
  // in RFC terms) and consumes it, returning the result as a Value.
  std::unique_ptr<Value> ParseToken(Token token);

  // Assuming that the parser is currently wound to '{', this parses a JSON
  // object into a DictionaryValue.
  std::unique_ptr<Value> ConsumeDictionary();

  // Assuming that the parser is wound to '[', this parses a JSON list into a
  // std::unique_ptr<ListValue>.
  std::unique_ptr<Value> ConsumeList();

  // Calls through ConsumeStringRaw and wraps it in a value.
  std::unique_ptr<Value> ConsumeString();

  // Assuming that the parser is wound to a double quote, this parses a string,
  // decoding any escape sequences and converts UTF-16 to UTF-8. Returns true on
  // success and places result into |out|. Returns false on failure with
  // error information set.
  bool ConsumeStringRaw(StringBuilder* out);
  // Helper function for ConsumeStringRaw() that consumes the next four or 10
  // bytes (parser is wound to the first character of a HEX sequence, with the
  // potential for consuming another \uXXXX for a surrogate). Returns true on
  // success and places the UTF8 code units in |dest_string|, and false on
  // failure.
  bool DecodeUTF16(std::string* dest_string);
  // Helper function for ConsumeStringRaw() that takes a single code point,
  // encodes it as UTF-8 units, and appends it to the given builder. The
  // point must be valid.
  void DecodeUTF8(const int32_t& point, StringBuilder* dest);

  // Assuming that the parser is wound to the start of a valid JSON number,
  // this parses and converts it to either an int or double value.
  std::unique_ptr<Value> ConsumeNumber();
  // Helper that reads characters that are ints. Returns true if a number was
  // read and false on error.
  bool ReadInt(bool allow_leading_zeros);

  // Consumes the literal values of |true|, |false|, and |null|, assuming the
  // parser is wound to the first character of any of those.
  std::unique_ptr<Value> ConsumeLiteral();

  // Compares two string buffers of a given length.
  static bool StringsAreEqual(const char* left, const char* right, size_t len);

  // Sets the error information to |code| at the current column, based on
  // |index_| and |index_last_line_|, with an optional positive/negative
  // adjustment by |column_adjust|.
  void ReportError(JSONReader::JsonParseError code, int column_adjust);

  // Given the line and column number of an error, formats one of the error
  // message constants from json_reader.h for human display.
  static std::string FormatErrorMessage(int line, int column,
                                        const std::string& description);

  // base::JSONParserOptions that control parsing.
  const int options_;

  // Pointer to the start of the input data.
  const char* start_pos_;

  // Pointer to the current position in the input data. Equivalent to
  // |start_pos_ + index_|.
  const char* pos_;

  // Pointer to the last character of the input data.
  const char* end_pos_;

  // The index in the input stream to which the parser is wound.
  int index_;

  // The number of times the parser has recursed (current stack depth).
  int stack_depth_;

  // The line number that the parser is at currently.
  int line_number_;

  // The last value of |index_| on the previous line.
  int index_last_line_;

  // Error information.
  JSONReader::JsonParseError error_code_;
  int error_line_;
  int error_column_;

  // Give the unit tests access to the private members declared above.
  friend class JSONParserTest;
  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, NextChar);
  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeDictionary);
  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeList);
  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeString);
  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeLiterals);
  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeNumbers);
  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ErrorMessages);
  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ReplaceInvalidCharacters);

  DISALLOW_COPY_AND_ASSIGN(JSONParser);
};

// Used when decoding and an invalid utf-8 sequence is encountered.
BASE_EXPORT extern const char kUnicodeReplacementString[];

}  // namespace internal
}  // namespace base

#endif  // BASE_JSON_JSON_PARSER_H_