1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef BASE_JSON_JSON_PARSER_H_ 6 #define BASE_JSON_JSON_PARSER_H_ 7 8 #include <stddef.h> 9 #include <stdint.h> 10 11 #include <memory> 12 #include <string> 13 14 #include "base/base_export.h" 15 #include "base/compiler_specific.h" 16 #include "base/gtest_prod_util.h" 17 #include "base/json/json_reader.h" 18 #include "base/macros.h" 19 #include "base/optional.h" 20 #include "base/strings/string_piece.h" 21 22 namespace base { 23 24 class Value; 25 26 namespace internal { 27 28 class JSONParserTest; 29 30 // The implementation behind the JSONReader interface. This class is not meant 31 // to be used directly; it encapsulates logic that need not be exposed publicly. 32 // 33 // This parser guarantees O(n) time through the input string. Iteration happens 34 // on the byte level, with the functions ConsumeChars() and ConsumeChar(). The 35 // conversion from byte to JSON token happens without advancing the parser in 36 // GetNextToken/ParseToken, that is tokenization operates on the current parser 37 // position without advancing. 38 // 39 // Built on top of these are a family of Consume functions that iterate 40 // internally. Invariant: on entry of a Consume function, the parser is wound 41 // to the first byte of a valid JSON token. On exit, it is on the first byte 42 // after the token that was just consumed, which would likely be the first byte 43 // of the next token. 44 class BASE_EXPORT JSONParser { 45 public: 46 JSONParser(int options, int max_depth = JSONReader::kStackMaxDepth); 47 ~JSONParser(); 48 49 // Parses the input string according to the set options and returns the 50 // result as a Value. 51 // Wrap this in base::FooValue::From() to check the Value is of type Foo and 52 // convert to a FooValue at the same time. 53 Optional<Value> Parse(StringPiece input); 54 55 // Returns the error code. 56 JSONReader::JsonParseError error_code() const; 57 58 // Returns the human-friendly error message. 59 std::string GetErrorMessage() const; 60 61 // Returns the error line number if parse error happened. Otherwise always 62 // returns 0. 63 int error_line() const; 64 65 // Returns the error column number if parse error happened. Otherwise always 66 // returns 0. 67 int error_column() const; 68 69 private: 70 enum Token { 71 T_OBJECT_BEGIN, // { 72 T_OBJECT_END, // } 73 T_ARRAY_BEGIN, // [ 74 T_ARRAY_END, // ] 75 T_STRING, 76 T_NUMBER, 77 T_BOOL_TRUE, // true 78 T_BOOL_FALSE, // false 79 T_NULL, // null 80 T_LIST_SEPARATOR, // , 81 T_OBJECT_PAIR_SEPARATOR, // : 82 T_END_OF_INPUT, 83 T_INVALID_TOKEN, 84 }; 85 86 // A helper class used for parsing strings. One optimization performed is to 87 // create base::Value with a StringPiece to avoid unnecessary std::string 88 // copies. This is not possible if the input string needs to be decoded from 89 // UTF-16 to UTF-8, or if an escape sequence causes characters to be skipped. 90 // This class centralizes that logic. 91 class StringBuilder { 92 public: 93 // Empty constructor. Used for creating a builder with which to assign to. 94 StringBuilder(); 95 96 // |pos| is the beginning of an input string, excluding the |"|. 97 explicit StringBuilder(const char* pos); 98 99 ~StringBuilder(); 100 101 StringBuilder& operator=(StringBuilder&& other); 102 103 // Appends the Unicode code point |point| to the string, either by 104 // increasing the |length_| of the string if the string has not been 105 // converted, or by appending the UTF8 bytes for the code point. 106 void Append(uint32_t point); 107 108 // Converts the builder from its default StringPiece to a full std::string, 109 // performing a copy. Once a builder is converted, it cannot be made a 110 // StringPiece again. 111 void Convert(); 112 113 // Returns the builder as a string, invalidating all state. This allows 114 // the internal string buffer representation to be destructively moved 115 // in cases where the builder will not be needed any more. 116 std::string DestructiveAsString(); 117 118 private: 119 // The beginning of the input string. 120 const char* pos_; 121 122 // Number of bytes in |pos_| that make up the string being built. 123 size_t length_; 124 125 // The copied string representation. Will be unset until Convert() is 126 // called. 127 base::Optional<std::string> string_; 128 }; 129 130 // Returns the next |count| bytes of the input stream, or nullopt if fewer 131 // than |count| bytes remain. 132 Optional<StringPiece> PeekChars(int count); 133 134 // Calls PeekChars() with a |count| of 1. 135 Optional<char> PeekChar(); 136 137 // Returns the next |count| bytes of the input stream, or nullopt if fewer 138 // than |count| bytes remain, and advances the parser position by |count|. 139 Optional<StringPiece> ConsumeChars(int count); 140 141 // Calls ConsumeChars() with a |count| of 1. 142 Optional<char> ConsumeChar(); 143 144 // Returns a pointer to the current character position. 145 const char* pos(); 146 147 // Skips over whitespace and comments to find the next token in the stream. 148 // This does not advance the parser for non-whitespace or comment chars. 149 Token GetNextToken(); 150 151 // Consumes whitespace characters and comments until the next non-that is 152 // encountered. 153 void EatWhitespaceAndComments(); 154 // Helper function that consumes a comment, assuming that the parser is 155 // currently wound to a '/'. 156 bool EatComment(); 157 158 // Calls GetNextToken() and then ParseToken(). 159 Optional<Value> ParseNextToken(); 160 161 // Takes a token that represents the start of a Value ("a structural token" 162 // in RFC terms) and consumes it, returning the result as a Value. 163 Optional<Value> ParseToken(Token token); 164 165 // Assuming that the parser is currently wound to '{', this parses a JSON 166 // object into a Value. 167 Optional<Value> ConsumeDictionary(); 168 169 // Assuming that the parser is wound to '[', this parses a JSON list into a 170 // Value. 171 Optional<Value> ConsumeList(); 172 173 // Calls through ConsumeStringRaw and wraps it in a value. 174 Optional<Value> ConsumeString(); 175 176 // Assuming that the parser is wound to a double quote, this parses a string, 177 // decoding any escape sequences and converts UTF-16 to UTF-8. Returns true on 178 // success and places result into |out|. Returns false on failure with 179 // error information set. 180 bool ConsumeStringRaw(StringBuilder* out); 181 // Helper function for ConsumeStringRaw() that consumes the next four or 10 182 // bytes (parser is wound to the first character of a HEX sequence, with the 183 // potential for consuming another \uXXXX for a surrogate). Returns true on 184 // success and places the code point |out_code_point|, and false on failure. 185 bool DecodeUTF16(uint32_t* out_code_point); 186 187 // Assuming that the parser is wound to the start of a valid JSON number, 188 // this parses and converts it to either an int or double value. 189 Optional<Value> ConsumeNumber(); 190 // Helper that reads characters that are ints. Returns true if a number was 191 // read and false on error. 192 bool ReadInt(bool allow_leading_zeros); 193 194 // Consumes the literal values of |true|, |false|, and |null|, assuming the 195 // parser is wound to the first character of any of those. 196 Optional<Value> ConsumeLiteral(); 197 198 // Helper function that returns true if the byte squence |match| can be 199 // consumed at the current parser position. Returns false if there are fewer 200 // than |match|-length bytes or if the sequence does not match, and the 201 // parser state is unchanged. 202 bool ConsumeIfMatch(StringPiece match); 203 204 // Sets the error information to |code| at the current column, based on 205 // |index_| and |index_last_line_|, with an optional positive/negative 206 // adjustment by |column_adjust|. 207 void ReportError(JSONReader::JsonParseError code, int column_adjust); 208 209 // Given the line and column number of an error, formats one of the error 210 // message contants from json_reader.h for human display. 211 static std::string FormatErrorMessage(int line, int column, 212 const std::string& description); 213 214 // base::JSONParserOptions that control parsing. 215 const int options_; 216 217 // Maximum depth to parse. 218 const int max_depth_; 219 220 // The input stream being parsed. Note: Not guaranteed to NUL-terminated. 221 StringPiece input_; 222 223 // The index in the input stream to which the parser is wound. 224 int index_; 225 226 // The number of times the parser has recursed (current stack depth). 227 int stack_depth_; 228 229 // The line number that the parser is at currently. 230 int line_number_; 231 232 // The last value of |index_| on the previous line. 233 int index_last_line_; 234 235 // Error information. 236 JSONReader::JsonParseError error_code_; 237 int error_line_; 238 int error_column_; 239 240 friend class JSONParserTest; 241 FRIEND_TEST_ALL_PREFIXES(JSONParserTest, NextChar); 242 FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeDictionary); 243 FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeList); 244 FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeString); 245 FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeLiterals); 246 FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeNumbers); 247 FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ErrorMessages); 248 FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ReplaceInvalidCharacters); 249 FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ReplaceInvalidUTF16EscapeSequence); 250 251 DISALLOW_COPY_AND_ASSIGN(JSONParser); 252 }; 253 254 // Used when decoding and an invalid utf-8 sequence is encountered. 255 BASE_EXPORT extern const char kUnicodeReplacementString[]; 256 257 } // namespace internal 258 } // namespace base 259 260 #endif // BASE_JSON_JSON_PARSER_H_ 261