• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc.  All rights reserved.
3 //
4 // Use of this source code is governed by a BSD-style
5 // license that can be found in the LICENSE file or at
6 // https://developers.google.com/open-source/licenses/bsd
7 
8 // Internal JSON tokenization utilities; not public API.
9 #ifndef GOOGLE_PROTOBUF_JSON_INTERNAL_LEXER_H__
10 #define GOOGLE_PROTOBUF_JSON_INTERNAL_LEXER_H__
11 
12 #include <array>
13 #include <cfloat>
14 #include <cmath>
15 #include <cstdint>
16 #include <iostream>
17 #include <limits>
18 #include <ostream>
19 #include <string>
20 #include <utility>
21 
22 #include "absl/status/status.h"
23 #include "absl/status/statusor.h"
24 #include "absl/strings/match.h"
25 #include "absl/strings/str_format.h"
26 #include "absl/strings/string_view.h"
27 #include "google/protobuf/descriptor.h"
28 #include "google/protobuf/io/zero_copy_stream.h"
29 #include "google/protobuf/json/internal/message_path.h"
30 #include "google/protobuf/json/internal/zero_copy_buffered_stream.h"
31 #include "google/protobuf/stubs/status_macros.h"
32 
33 
34 // Must be included last.
35 #include "google/protobuf/port_def.inc"
36 
37 namespace google {
38 namespace protobuf {
39 namespace json_internal {
// Mirror of JsonParseOptions from json_util.h. It is re-declared here so that
// :json_lexer does not have to depend on :json_util.
struct ParseOptions {
  bool ignore_unknown_fields = false;
  bool case_insensitive_enum_parsing = false;

  // Default value for `recursion_depth`.
  static constexpr size_t kDefaultDepth = 100;

  // How many more levels of nesting may be entered before parsing bails out
  // in order to avoid pathological input.
  int recursion_depth = static_cast<int>(kDefaultDepth);

  // Enables the non-standard extensions that the original parser used by
  // json_util2 accepted.
  //
  // Those extensions are deliberately undocumented beyond what the unit tests
  // exercise; the intent is to remove this setting eventually. See
  // b/234868512.
  bool allow_legacy_syntax = false;
};
60 
61 // A position in JSON input, for error context.
62 struct JsonLocation {
63   // This type exists to work around an absl type that has not yet been
64   // released.
65   struct SourceLocation {
currentJsonLocation::SourceLocation66     static SourceLocation current() { return {}; }
67   };
68 
69   // Line and column are both zero-indexed in-memory.
70   size_t offset = 0;
71   size_t line = 0;
72   size_t col = 0;
73   const MessagePath* path = nullptr;
74 
75   // Creates an absl::InvalidArgumentError with line/column information.
76   absl::Status Invalid(absl::string_view message,
77                        SourceLocation sl = SourceLocation::current()) const;
78 };
79 
80 template <typename T>
81 struct LocationWith {
82   T value;
83   JsonLocation loc;
84 };
85 
86 class JsonLexer {
87  public:
88   // A kind of token that PeekKind() can detect.
89   enum Kind {
90     kObj,
91     kArr,
92     kStr,
93     kNum,
94     kTrue,
95     kFalse,
96     kNull,
97   };
98 
99   using SourceLocation = JsonLocation::SourceLocation;
100 
101   JsonLexer(io::ZeroCopyInputStream* stream, const ParseOptions& options,
102             MessagePath* path = nullptr, JsonLocation start = {})
stream_(stream)103       : stream_(stream), options_(options), json_loc_(start), path_(path) {
104     json_loc_.path = path_;
105   }
106 
options()107   const ParseOptions& options() const { return options_; }
108 
path()109   const MessagePath& path() const { return *path_; }
path()110   MessagePath& path() { return *path_; }
111 
112   // Creates an absl::InvalidArgumentError with line/column information.
113   absl::Status Invalid(absl::string_view message,
114                        SourceLocation sl = SourceLocation::current()) {
115     return json_loc_.Invalid(message, sl);
116   }
117 
118   // Expects the next bytes to be parsed (after consuming whitespace) to be
119   // exactly `literal`. If they are, consumes them; otherwise returns an error.
120   absl::Status Expect(absl::string_view literal,
121                       SourceLocation sl = SourceLocation::current()) {
122     RETURN_IF_ERROR(SkipToToken());
123     auto buffering = stream_.BufferAtLeast(literal.size());
124     RETURN_IF_ERROR(buffering.status());
125 
126     if (!absl::StartsWith(stream_.Unread(), literal)) {
127       return Invalid(
128           absl::StrFormat("unexpected character: '%c'; expected '%s'",
129                           stream_.PeekChar(), literal),
130           sl);
131     }
132 
133     return Advance(literal.size());
134   }
135 
136   // Like Expect(), but returns a boolean. This makes it clear that the
137   // lookahead is failible.
Peek(absl::string_view literal)138   bool Peek(absl::string_view literal) {
139     // Suppress the error; this can only fail on EOF in which case we would
140     // return false regardless.
141     (void)SkipToToken();
142     auto ignored = stream_.BufferAtLeast(literal.size());
143     if (!absl::StartsWith(stream_.Unread(), literal)) {
144       return false;
145     }
146 
147     // We just ensured we had enough buffered so we can suppress this error.
148     (void)Advance(literal.size());
149     return true;
150   }
151 
152   // Like Peek(string), but returns true if and only if a token of the given
153   // kind can be lexed next. Returns false on EOF, just like Peek(string).
Peek(Kind needle)154   bool Peek(Kind needle) {
155     auto kind = PeekKind();
156     return kind.ok() && *kind == needle;
157   }
158 
159   // Consumes all whitespace and other ignored characters until the next
160   // token.
161   //
162   // This function returns an error on EOF, so PeekChar() can be safely
163   // called if it returns ok.
164   absl::Status SkipToToken();
165 
166   // Returns which kind of value token (i.e., something that can occur after
167   // a `:`) is next up to be parsed.
168   absl::StatusOr<Kind> PeekKind();
169 
170   // Parses a JSON number.
171   absl::StatusOr<LocationWith<double>> ParseNumber();
172 
173   // Parses a number as a string, without turning it into an integer.
174   absl::StatusOr<LocationWith<MaybeOwnedString>> ParseRawNumber();
175 
176   // Parses a UTF-8 string. If the contents of the string happen to actually be
177   // UTF-8, it will return a zero-copy view; otherwise it will allocate.
178   absl::StatusOr<LocationWith<MaybeOwnedString>> ParseUtf8();
179 
180   // Walks over an array, calling `f` each time an element is reached.
181   //
182   // `f` should have type `() -> absl::Status`.
183   template <typename F>
184   absl::Status VisitArray(F f);
185 
186   // Walks over an object, calling `f` just after parsing each `:`.
187   //
188   // `f` should have type `(absl::string_view) -> absl::Status`.
189   template <typename F>
190   absl::Status VisitObject(F f);
191 
192   // Parses a single value and discards it.
193   absl::Status SkipValue();
194 
195   // Forwards of functions from ZeroCopyBufferedStream.
196 
AtEof()197   bool AtEof() {
198     // Ignore whitespace for the purposes of finding the EOF. This will return
199     // an error if we hit EOF, so we discard it.
200     (void)SkipToToken();
201     return stream_.AtEof();
202   }
203 
Take(size_t len)204   absl::StatusOr<LocationWith<MaybeOwnedString>> Take(size_t len) {
205     JsonLocation loc = json_loc_;
206     auto taken = stream_.Take(len);
207     RETURN_IF_ERROR(taken.status());
208     return LocationWith<MaybeOwnedString>{*std::move(taken), loc};
209   }
210 
211   template <typename Pred>
TakeWhile(Pred p)212   absl::StatusOr<LocationWith<MaybeOwnedString>> TakeWhile(Pred p) {
213     JsonLocation loc = json_loc_;
214     auto taken = stream_.TakeWhile(std::move(p));
215     RETURN_IF_ERROR(taken.status());
216     return LocationWith<MaybeOwnedString>{*std::move(taken), loc};
217   }
218 
BeginMark()219   LocationWith<Mark> BeginMark() { return {stream_.BeginMark(), json_loc_}; }
220 
221  private:
222   friend BufferingGuard;
223   friend Mark;
224   friend MaybeOwnedString;
225 
Push()226   absl::Status Push() {
227     if (options_.recursion_depth == 0) {
228       return Invalid("JSON content was too deeply nested");
229     }
230     --options_.recursion_depth;
231     return absl::OkStatus();
232   }
233 
Pop()234   void Pop() { ++options_.recursion_depth; }
235 
236   // Parses the next four bytes as a 16-bit hex numeral.
237   absl::StatusOr<uint16_t> ParseU16HexCodepoint();
238 
239   // Parses a Unicode escape (\uXXXX); this may be a surrogate pair, so it may
240   // consume the character that follows. Both are encoded as utf8 into
241   // `out_utf8`; returns the number of bytes written.
242   absl::StatusOr<size_t> ParseUnicodeEscape(char out_utf8[4]);
243 
244   // Parses an alphanumeric "identifier", for use with the non-standard
245   // "unquoted keys" extension.
246   absl::StatusOr<LocationWith<MaybeOwnedString>> ParseBareWord();
247 
Advance(size_t bytes)248   absl::Status Advance(size_t bytes) {
249     RETURN_IF_ERROR(stream_.Advance(bytes));
250     json_loc_.offset += static_cast<int>(bytes);
251     json_loc_.col += static_cast<int>(bytes);
252     return absl::OkStatus();
253   }
254 
255   ZeroCopyBufferedStream stream_;
256 
257   ParseOptions options_;
258   JsonLocation json_loc_;
259   MessagePath* path_;
260 };
261 
262 template <typename F>
VisitArray(F f)263 absl::Status JsonLexer::VisitArray(F f) {
264   RETURN_IF_ERROR(Expect("["));
265   RETURN_IF_ERROR(Push());
266 
267   if (Peek("]")) {
268     Pop();
269     return absl::OkStatus();
270   }
271 
272   bool has_comma = true;
273   do {
274     if (!has_comma) {
275       return Invalid("expected ','");
276     }
277     RETURN_IF_ERROR(f());
278     has_comma = Peek(",");
279   } while (!Peek("]"));
280 
281   if (!options_.allow_legacy_syntax && has_comma) {
282     return Invalid("expected ']'");
283   }
284 
285   Pop();
286   return absl::OkStatus();
287 }
288 
289 // Walks over an object, calling `f` just after parsing each `:`.
290 //
291 // `f` should have type `(MaybeOwnedString&) -> absl::Status`.
292 template <typename F>
VisitObject(F f)293 absl::Status JsonLexer::VisitObject(F f) {
294   RETURN_IF_ERROR(Expect("{"));
295   RETURN_IF_ERROR(Push());
296 
297   if (Peek("}")) {
298     Pop();
299     return absl::OkStatus();
300   }
301 
302   bool has_comma = true;
303   do {
304     if (!has_comma) {
305       return Invalid("expected ','");
306     }
307     RETURN_IF_ERROR(SkipToToken());
308 
309     absl::StatusOr<LocationWith<MaybeOwnedString>> key;
310     if (stream_.PeekChar() == '"' || stream_.PeekChar() == '\'') {
311       key = ParseUtf8();
312     } else if (options_.allow_legacy_syntax) {
313       key = ParseBareWord();
314     } else {
315       return Invalid("expected '\"'");
316     }
317 
318     RETURN_IF_ERROR(key.status());
319     RETURN_IF_ERROR(Expect(":"));
320     RETURN_IF_ERROR(f(*key));
321     has_comma = Peek(",");
322   } while (!Peek("}"));
323   Pop();
324 
325   if (!options_.allow_legacy_syntax && has_comma) {
326     return Invalid("expected '}'");
327   }
328 
329   return absl::OkStatus();
330 }
331 }  // namespace json_internal
332 }  // namespace protobuf
333 }  // namespace google
334 
335 #include "google/protobuf/port_undef.inc"
336 #endif  // GOOGLE_PROTOBUF_JSON_INTERNAL_LEXER_H__
337