• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef BASE_JSON_JSON_PARSER_H_
6 #define BASE_JSON_JSON_PARSER_H_
7 
8 #include <stddef.h>
9 #include <stdint.h>
10 
11 #include <memory>
12 #include <optional>
13 #include <string>
14 #include <string_view>
15 
16 #include "base/compiler_specific.h"
17 #include "base/gtest_prod_util.h"
18 #include "base/json/json_reader.h"
19 
20 namespace base {
21 
22 class Value;
23 
24 namespace internal {
25 
26 class JSONParserTest;
27 
28 // The implementation behind the JSONReader interface. This class is not meant
29 // to be used directly; it encapsulates logic that need not be exposed publicly.
30 //
31 // This parser guarantees O(n) time through the input string. Iteration happens
32 // on the byte level, with the functions ConsumeChars() and ConsumeChar(). The
33 // conversion from byte to JSON token happens without advancing the parser in
34 // GetNextToken/ParseToken; that is, tokenization operates on the current parser
35 // position without advancing.
36 //
37 // Built on top of these are a family of Consume functions that iterate
38 // internally. Invariant: on entry of a Consume function, the parser is wound
39 // to the first byte of a valid JSON token. On exit, it is on the first byte
40 // after the token that was just consumed, which would likely be the first byte
41 // of the next token.
42 class JSONParser {
43  public:
44   JSONParser(int options, int max_depth = JSONReader::kStackMaxDepth);
45   ~JSONParser();
46 
47   // Parses the input string according to the set options and returns the
48   // result as a Value.
49   // Wrap this in base::FooValue::From() to check the Value is of type Foo and
50   // convert to a FooValue at the same time.
51   std::optional<Value> Parse(std::string_view input);
52 
53   // Returns the error code.
54   JSONReader::JsonParseError error_code() const;
55 
56   // Returns the human-friendly error message.
57   std::string GetErrorMessage() const;
58 
59   // Returns the line number of the error if a parse error happened. Otherwise
60   // always returns 0.
61   int error_line() const;
62 
63   // Returns the column number of the error if a parse error happened. Otherwise
64   // always returns 0.
65   int error_column() const;
66 
67  private:
68   enum Token {
69     T_OBJECT_BEGIN,  // {
70     T_OBJECT_END,    // }
71     T_ARRAY_BEGIN,   // [
72     T_ARRAY_END,     // ]
73     T_STRING,
74     T_NUMBER,
75     T_BOOL_TRUE,              // true
76     T_BOOL_FALSE,             // false
77     T_NULL,                   // null
78     T_LIST_SEPARATOR,         // ,
79     T_OBJECT_PAIR_SEPARATOR,  // :
80     T_END_OF_INPUT,
81     T_INVALID_TOKEN,
82   };
83 
84   // A helper class used for parsing strings. One optimization performed is to
85   // create base::Value with a std::string_view to avoid unnecessary std::string
86   // copies. This is not possible if the input string needs to be decoded from
87   // UTF-16 to UTF-8, or if an escape sequence causes characters to be skipped.
88   // This class centralizes that logic.
89   class StringBuilder {
90    public:
91     // Empty constructor. Used for creating a builder to be assigned to later.
92     StringBuilder();
93 
94     // |pos| is the beginning of an input string, excluding the |"|.
95     explicit StringBuilder(const char* pos);
96 
97     ~StringBuilder();
98 
99     StringBuilder& operator=(StringBuilder&& other);
100 
101     // Appends the Unicode code point |point| to the string, either by
102     // increasing the |length_| of the string if the string has not been
103     // converted, or by appending the UTF8 bytes for the code point.
104     void Append(uint32_t point);
105 
106     // Converts the builder from its default std::string_view to a full
107     // std::string, performing a copy. Once a builder is converted, it cannot be
108     // made a std::string_view again.
109     void Convert();
110 
111     // Returns the builder as a string, invalidating all state. This allows
112     // the internal string buffer representation to be destructively moved
113     // in cases where the builder will not be needed any more.
114     std::string DestructiveAsString();
115 
116    private:
117     // The beginning of the input string.
118     const char* pos_;
119 
120     // Number of bytes in |pos_| that make up the string being built.
121     size_t length_;
122 
123     // The copied string representation. Will be unset until Convert() is
124     // called.
125     std::optional<std::string> string_;
126   };
127 
128   // Returns the next |count| bytes of the input stream, or nullopt if fewer
129   // than |count| bytes remain.
130   std::optional<std::string_view> PeekChars(int count);
131 
132   // Calls PeekChars() with a |count| of 1.
133   std::optional<char> PeekChar();
134 
135   // Returns the next |count| bytes of the input stream, or nullopt if fewer
136   // than |count| bytes remain, and advances the parser position by |count|.
137   std::optional<std::string_view> ConsumeChars(int count);
138 
139   // Calls ConsumeChars() with a |count| of 1.
140   std::optional<char> ConsumeChar();
141 
142   // Returns a pointer to the current character position.
143   const char* pos();
144 
145   // Skips over whitespace and comments to find the next token in the stream.
146   // Does not advance the parser past non-whitespace, non-comment characters.
147   Token GetNextToken();
148 
149   // Consumes whitespace characters and comments until a character that is
150   // neither is encountered.
151   void EatWhitespaceAndComments();
152   // Helper function that consumes a comment, assuming that the parser is
153   // currently wound to a '/'.
154   bool EatComment();
155 
156   // Calls GetNextToken() and then ParseToken().
157   std::optional<Value> ParseNextToken();
158 
159   // Takes a token that represents the start of a Value ("a structural token"
160   // in RFC terms) and consumes it, returning the result as a Value.
161   std::optional<Value> ParseToken(Token token);
162 
163   // Assuming that the parser is currently wound to '{', this parses a JSON
164   // object into a Value.
165   std::optional<Value> ConsumeDictionary();
166 
167   // Assuming that the parser is wound to '[', this parses a JSON list into a
168   // Value.
169   std::optional<Value> ConsumeList();
170 
171   // Calls through ConsumeStringRaw and wraps it in a value.
172   std::optional<Value> ConsumeString();
173 
174   // Assuming that the parser is wound to a double quote, this parses a string,
175   // decoding any escape sequences and converts UTF-16 to UTF-8. Returns true on
176   // success and places result into |out|. Returns false on failure with
177   // error information set.
178   bool ConsumeStringRaw(StringBuilder* out);
179   // Helper function for ConsumeStringRaw() that consumes the next four or ten
180   // bytes (parser is wound to the first character of a HEX sequence, with the
181   // potential for consuming another \uXXXX for a surrogate). Returns true on
182   // success, placing the code point in |out_code_point|, and false on failure.
183   bool DecodeUTF16(uint32_t* out_code_point);
184 
185   // Assuming that the parser is wound to the start of a valid JSON number,
186   // this parses and converts it to either an int or double value.
187   std::optional<Value> ConsumeNumber();
188   // Helper that reads characters that are ints. Returns true if a number was
189   // read and false on error.
190   bool ReadInt(bool allow_leading_zeros);
191 
192   // Consumes the literal values of |true|, |false|, and |null|, assuming the
193   // parser is wound to the first character of any of those.
194   std::optional<Value> ConsumeLiteral();
195 
196   // Helper function that returns true if the byte sequence |match| can be
197   // consumed at the current parser position. Returns false if there are fewer
198   // than |match|-length bytes or if the sequence does not match, and the
199   // parser state is unchanged.
200   bool ConsumeIfMatch(std::string_view match);
201 
202   // Sets the error information to |code| at the current column, based on
203   // |index_| and |index_last_line_|, with an optional positive/negative
204   // adjustment by |column_adjust|.
205   void ReportError(JSONReader::JsonParseError code, int column_adjust);
206 
207   // Given the line and column number of an error, formats one of the error
208   // message constants from json_reader.h for human display.
209   static std::string FormatErrorMessage(int line,
210                                         int column,
211                                         const std::string& description);
212 
213   // base::JSONParserOptions that control parsing.
214   const int options_;
215 
216   // Maximum depth to parse.
217   const int max_depth_;
218 
219   // The input stream being parsed. Note: Not guaranteed to be NUL-terminated.
220   std::string_view input_;
221 
222   // The index in the input stream to which the parser is wound.
223   int index_;
224 
225   // The number of times the parser has recursed (current stack depth).
226   int stack_depth_;
227 
228   // The line number that the parser is at currently.
229   int line_number_;
230 
231   // The last value of |index_| on the previous line.
232   int index_last_line_;
233 
234   // Error information.
235   JSONReader::JsonParseError error_code_;
236   int error_line_;
237   int error_column_;
238 
239   friend class JSONParserTest;
240   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, NextChar);
241   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeDictionary);
242   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeList);
243   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeString);
244   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeLiterals);
245   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeNumbers);
246   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ErrorMessages);
247   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ReplaceInvalidCharacters);
248   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ReplaceInvalidUTF16EscapeSequence);
249 
250   JSONParser(const JSONParser&) = delete;
251   JSONParser& operator=(const JSONParser&) = delete;
252 };
253 
254 // Used when an invalid UTF-8 sequence is encountered while decoding.
255 extern const char kUnicodeReplacementString[];
256 
257 }  // namespace internal
258 }  // namespace base
259 
260 #endif  // BASE_JSON_JSON_PARSER_H_
261