• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef BASE_JSON_JSON_PARSER_H_
6 #define BASE_JSON_JSON_PARSER_H_
7 
8 #include <stddef.h>
9 #include <stdint.h>
10 
11 #include <memory>
12 #include <string>
13 
14 #include "base/base_export.h"
15 #include "base/compiler_specific.h"
16 #include "base/gtest_prod_util.h"
17 #include "base/json/json_reader.h"
18 #include "base/macros.h"
19 #include "base/memory/manual_constructor.h"
20 #include "base/strings/string_piece.h"
21 
22 namespace base {
23 
24 class Value;
25 
26 namespace internal {
27 
28 class JSONParserTest;
29 
30 // The implementation behind the JSONReader interface. This class is not meant
31 // to be used directly; it encapsulates logic that need not be exposed publicly.
32 //
33 // This parser guarantees O(n) time through the input string. It also avoids
34 // string copies where possible by building returned base::Value objects from
35 // StringPiece, using "hidden roots," discussed in the implementation.
36 //
37 // Iteration happens on the byte level, with the functions CanConsume and
38 // NextChar. The conversion from byte to JSON token happens without advancing
39 // the parser in GetNextToken/ParseToken; that is, tokenization operates on
40 // the current parser position without advancing.
41 //
42 // Built on top of these is a family of Consume functions that iterate
43 // internally. Invariant: on entry of a Consume function, the parser is wound
44 // to the first byte of a valid JSON token. On exit, it is on the last byte
45 // of a token, such that the next iteration of the parser will be at the byte
46 // immediately following the token, which would likely be the first byte of the
47 // next token.
48 class BASE_EXPORT JSONParser {
49  public:
50   explicit JSONParser(int options);  // presumably kept in |options_| - confirm
51   ~JSONParser();
52 
53   // Parses the input string according to the set options and returns the
54   // result as a Value.
55   // Wrap this in base::FooValue::From() to check the Value is of type Foo and
56   // convert to a FooValue at the same time.
57   std::unique_ptr<Value> Parse(StringPiece input);
58 
59   // Returns the error code.
60   JSONReader::JsonParseError error_code() const;
61 
62   // Returns the human-friendly error message.
63   std::string GetErrorMessage() const;
64 
65   // Returns the line number of the error if a parse error happened. Otherwise
66   // always returns 0.
67   int error_line() const;
68 
69   // Returns the column number of the error if a parse error happened. Otherwise
70   // always returns 0.
71   int error_column() const;
72 
73  private:
74   enum Token {
75     T_OBJECT_BEGIN,           // {
76     T_OBJECT_END,             // }
77     T_ARRAY_BEGIN,            // [
78     T_ARRAY_END,              // ]
79     T_STRING,
80     T_NUMBER,
81     T_BOOL_TRUE,              // true
82     T_BOOL_FALSE,             // false
83     T_NULL,                   // null
84     T_LIST_SEPARATOR,         // ,
85     T_OBJECT_PAIR_SEPARATOR,  // :
86     T_END_OF_INPUT,
87     T_INVALID_TOKEN,
88   };
89 
90   // A helper class used for parsing strings. One optimization performed is to
91   // create base::Value with a StringPiece to avoid unnecessary std::string
92   // copies. This is not possible if the input string needs to be decoded from
93   // UTF-16 to UTF-8, or if an escape sequence causes characters to be skipped.
94   // This class centralizes that logic.
95   class StringBuilder {
96    public:
97     // Default constructor. Creates an empty builder to be move-assigned to later.
98     StringBuilder();
99 
100     // |pos| is the beginning of an input string, excluding the |"|.
101     explicit StringBuilder(const char* pos);
102 
103     ~StringBuilder();
104 
105     void operator=(StringBuilder&& other);
106 
107     // Either increases the |length_| of the string or copies the character if
108     // the StringBuilder has been converted. |c| must be in the basic ASCII
109     // plane; all other characters need to be in UTF-8 units, appended with
110     // AppendString below.
111     void Append(const char& c);
112 
113     // Appends a string to the std::string. Requires a prior call to Convert().
114     void AppendString(const char* str, size_t len);
115 
116     // Converts the builder from its default StringPiece to a full std::string,
117     // performing a copy. Once a builder is converted, it cannot be made a
118     // StringPiece again.
119     void Convert();
120 
121     // Returns the builder as a StringPiece.
122     StringPiece AsStringPiece();
123 
124     // Returns the builder as a std::string.
125     const std::string& AsString();
126 
127     // Returns the builder as a string, invalidating all state. This allows
128     // the internal string buffer representation to be destructively moved
129     // in cases where the builder will not be needed any more.
130     std::string DestructiveAsString();
131 
132    private:
133     // The beginning of the input string.
134     const char* pos_;
135 
136     // Number of bytes in |pos_| that make up the string being built.
137     size_t length_;
138 
139     // The copied string representation. Will be uninitialized until Convert()
140     // is called, which will set has_string_ to true.
141     bool has_string_;
142     base::ManualConstructor<std::string> string_;
143   };
144 
145   // Quick check that the stream has capacity to consume |length| more bytes.
146   bool CanConsume(int length);
147 
148   // The basic way to consume a single character in the stream. Consumes one
149   // byte of the input stream and returns a pointer to the rest of it.
150   const char* NextChar();
151 
152   // Performs the equivalent of NextChar N times.
153   void NextNChars(int n);
154 
155   // Skips over whitespace and comments to find the next token in the stream.
156   // This does not advance the parser for non-whitespace or comment chars.
157   Token GetNextToken();
158 
159   // Consumes whitespace characters and comments until a character that is
160   // neither is encountered.
161   void EatWhitespaceAndComments();
162   // Helper function that consumes a comment, assuming that the parser is
163   // currently wound to a '/'.
164   bool EatComment();
165 
166   // Calls GetNextToken() and then ParseToken().
167   std::unique_ptr<Value> ParseNextToken();
168 
169   // Takes a token that represents the start of a Value ("a structural token"
170   // in RFC terms) and consumes it, returning the result as a Value.
171   std::unique_ptr<Value> ParseToken(Token token);
172 
173   // Assuming that the parser is currently wound to '{', this parses a JSON
174   // object into a DictionaryValue.
175   std::unique_ptr<Value> ConsumeDictionary();
176 
177   // Assuming that the parser is wound to '[', this parses a JSON list into a
178   // std::unique_ptr<ListValue>.
179   std::unique_ptr<Value> ConsumeList();
180 
181   // Calls through to ConsumeStringRaw() and wraps the result in a Value.
182   std::unique_ptr<Value> ConsumeString();
183 
184   // Assuming that the parser is wound to a double quote, this parses a string,
185   // decoding any escape sequences and converts UTF-16 to UTF-8. Returns true on
186   // success and places result into |out|. Returns false on failure with
187   // error information set.
188   bool ConsumeStringRaw(StringBuilder* out);
189   // Helper function for ConsumeStringRaw() that consumes the next 4 or 10
190   // bytes (parser is wound to the first character of a HEX sequence, with the
191   // potential for consuming another \uXXXX for a surrogate). Returns true on
192   // success and places the UTF8 code units in |dest_string|, and false on
193   // failure.
194   bool DecodeUTF16(std::string* dest_string);
195   // Helper function for ConsumeStringRaw() that takes a single code point,
196   // decodes it into UTF-8 units, and appends it to the given builder. The
197   // point must be valid.
198   void DecodeUTF8(const int32_t& point, StringBuilder* dest);
199 
200   // Assuming that the parser is wound to the start of a valid JSON number,
201   // this parses and converts it to either an int or double value.
202   std::unique_ptr<Value> ConsumeNumber();
203   // Helper that reads characters that are ints. Returns true if a number was
204   // read and false on error.
205   bool ReadInt(bool allow_leading_zeros);
206 
207   // Consumes the literal values of |true|, |false|, and |null|, assuming the
208   // parser is wound to the first character of any of those.
209   std::unique_ptr<Value> ConsumeLiteral();
210 
211   // Compares two string buffers of a given length.
212   static bool StringsAreEqual(const char* left, const char* right, size_t len);
213 
214   // Sets the error information to |code| at the current column, based on
215   // |index_| and |index_last_line_|, with an optional positive/negative
216   // adjustment by |column_adjust|.
217   void ReportError(JSONReader::JsonParseError code, int column_adjust);
218 
219   // Given the line and column number of an error, formats one of the error
220   // message constants from json_reader.h for human display.
221   static std::string FormatErrorMessage(int line, int column,
222                                         const std::string& description);
223 
224   // base::JSONParserOptions that control parsing.
225   const int options_;
226 
227   // Pointer to the start of the input data.
228   const char* start_pos_;
229 
230   // Pointer to the current position in the input data. Equivalent to
231   // |start_pos_ + index_|.
232   const char* pos_;
233 
234   // Pointer to the last character of the input data.
235   const char* end_pos_;
236 
237   // The index in the input stream to which the parser is wound.
238   int index_;
239 
240   // The number of times the parser has recursed (current stack depth).
241   int stack_depth_;
242 
243   // The line number that the parser is at currently.
244   int line_number_;
245 
246   // The last value of |index_| on the previous line.
247   int index_last_line_;
248 
249   // Error information.
250   JSONReader::JsonParseError error_code_;
251   int error_line_;
252   int error_column_;
253 
254   friend class JSONParserTest;
255   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, NextChar);
256   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeDictionary);
257   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeList);
258   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeString);
259   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeLiterals);
260   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeNumbers);
261   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ErrorMessages);
262   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ReplaceInvalidCharacters);
263 
264   DISALLOW_COPY_AND_ASSIGN(JSONParser);
265 };
266 
267 // Used during decoding when an invalid UTF-8 sequence is encountered.
268 BASE_EXPORT extern const char kUnicodeReplacementString[];
269 
270 }  // namespace internal
271 }  // namespace base
272 
273 #endif  // BASE_JSON_JSON_PARSER_H_
274