1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc. All rights reserved.
3 //
4 // Use of this source code is governed by a BSD-style
5 // license that can be found in the LICENSE file or at
6 // https://developers.google.com/open-source/licenses/bsd
7
8 // Internal JSON tokenization utilities; not public API.
9 #ifndef GOOGLE_PROTOBUF_JSON_INTERNAL_LEXER_H__
10 #define GOOGLE_PROTOBUF_JSON_INTERNAL_LEXER_H__
11
12 #include <array>
13 #include <cfloat>
14 #include <cmath>
15 #include <cstdint>
16 #include <iostream>
17 #include <limits>
18 #include <ostream>
19 #include <string>
20 #include <utility>
21
22 #include "absl/status/status.h"
23 #include "absl/status/statusor.h"
24 #include "absl/strings/match.h"
25 #include "absl/strings/str_format.h"
26 #include "absl/strings/string_view.h"
27 #include "google/protobuf/descriptor.h"
28 #include "google/protobuf/io/zero_copy_stream.h"
29 #include "google/protobuf/json/internal/message_path.h"
30 #include "google/protobuf/json/internal/zero_copy_buffered_stream.h"
31 #include "google/protobuf/stubs/status_macros.h"
32
33
34 // Must be included last.
35 #include "google/protobuf/port_def.inc"
36
37 namespace google {
38 namespace protobuf {
39 namespace json_internal {
// Mirror of JsonParseOptions from json_util.h. It is declared a second time
// here so that :json_lexer does not have to depend on :json_util.
struct ParseOptions {
  // Default value for `recursion_depth`.
  static constexpr size_t kDefaultDepth = 100;

  bool ignore_unknown_fields = false;
  bool case_insensitive_enum_parsing = false;

  // How many more times the parser may recurse before it gives up, so that
  // pathological (deeply nested) input is rejected.
  int recursion_depth = static_cast<int>(kDefaultDepth);

  // Enables a number of non-standard syntax extensions that the original
  // json_util2 parser accepted.
  //
  // Which extensions those are is deliberately left undocumented beyond what
  // the unit tests cover; the intent is to remove this setting eventually.
  // See b/234868512.
  bool allow_legacy_syntax = false;
};
60
61 // A position in JSON input, for error context.
62 struct JsonLocation {
63 // This type exists to work around an absl type that has not yet been
64 // released.
65 struct SourceLocation {
currentJsonLocation::SourceLocation66 static SourceLocation current() { return {}; }
67 };
68
69 // Line and column are both zero-indexed in-memory.
70 size_t offset = 0;
71 size_t line = 0;
72 size_t col = 0;
73 const MessagePath* path = nullptr;
74
75 // Creates an absl::InvalidArgumentError with line/column information.
76 absl::Status Invalid(absl::string_view message,
77 SourceLocation sl = SourceLocation::current()) const;
78 };
79
80 template <typename T>
81 struct LocationWith {
82 T value;
83 JsonLocation loc;
84 };
85
86 class JsonLexer {
87 public:
88 // A kind of token that PeekKind() can detect.
89 enum Kind {
90 kObj,
91 kArr,
92 kStr,
93 kNum,
94 kTrue,
95 kFalse,
96 kNull,
97 };
98
99 using SourceLocation = JsonLocation::SourceLocation;
100
101 JsonLexer(io::ZeroCopyInputStream* stream, const ParseOptions& options,
102 MessagePath* path = nullptr, JsonLocation start = {})
stream_(stream)103 : stream_(stream), options_(options), json_loc_(start), path_(path) {
104 json_loc_.path = path_;
105 }
106
options()107 const ParseOptions& options() const { return options_; }
108
path()109 const MessagePath& path() const { return *path_; }
path()110 MessagePath& path() { return *path_; }
111
112 // Creates an absl::InvalidArgumentError with line/column information.
113 absl::Status Invalid(absl::string_view message,
114 SourceLocation sl = SourceLocation::current()) {
115 return json_loc_.Invalid(message, sl);
116 }
117
118 // Expects the next bytes to be parsed (after consuming whitespace) to be
119 // exactly `literal`. If they are, consumes them; otherwise returns an error.
120 absl::Status Expect(absl::string_view literal,
121 SourceLocation sl = SourceLocation::current()) {
122 RETURN_IF_ERROR(SkipToToken());
123 auto buffering = stream_.BufferAtLeast(literal.size());
124 RETURN_IF_ERROR(buffering.status());
125
126 if (!absl::StartsWith(stream_.Unread(), literal)) {
127 return Invalid(
128 absl::StrFormat("unexpected character: '%c'; expected '%s'",
129 stream_.PeekChar(), literal),
130 sl);
131 }
132
133 return Advance(literal.size());
134 }
135
136 // Like Expect(), but returns a boolean. This makes it clear that the
137 // lookahead is failible.
Peek(absl::string_view literal)138 bool Peek(absl::string_view literal) {
139 // Suppress the error; this can only fail on EOF in which case we would
140 // return false regardless.
141 (void)SkipToToken();
142 auto ignored = stream_.BufferAtLeast(literal.size());
143 if (!absl::StartsWith(stream_.Unread(), literal)) {
144 return false;
145 }
146
147 // We just ensured we had enough buffered so we can suppress this error.
148 (void)Advance(literal.size());
149 return true;
150 }
151
152 // Like Peek(string), but returns true if and only if a token of the given
153 // kind can be lexed next. Returns false on EOF, just like Peek(string).
Peek(Kind needle)154 bool Peek(Kind needle) {
155 auto kind = PeekKind();
156 return kind.ok() && *kind == needle;
157 }
158
159 // Consumes all whitespace and other ignored characters until the next
160 // token.
161 //
162 // This function returns an error on EOF, so PeekChar() can be safely
163 // called if it returns ok.
164 absl::Status SkipToToken();
165
166 // Returns which kind of value token (i.e., something that can occur after
167 // a `:`) is next up to be parsed.
168 absl::StatusOr<Kind> PeekKind();
169
170 // Parses a JSON number.
171 absl::StatusOr<LocationWith<double>> ParseNumber();
172
173 // Parses a number as a string, without turning it into an integer.
174 absl::StatusOr<LocationWith<MaybeOwnedString>> ParseRawNumber();
175
176 // Parses a UTF-8 string. If the contents of the string happen to actually be
177 // UTF-8, it will return a zero-copy view; otherwise it will allocate.
178 absl::StatusOr<LocationWith<MaybeOwnedString>> ParseUtf8();
179
180 // Walks over an array, calling `f` each time an element is reached.
181 //
182 // `f` should have type `() -> absl::Status`.
183 template <typename F>
184 absl::Status VisitArray(F f);
185
186 // Walks over an object, calling `f` just after parsing each `:`.
187 //
188 // `f` should have type `(absl::string_view) -> absl::Status`.
189 template <typename F>
190 absl::Status VisitObject(F f);
191
192 // Parses a single value and discards it.
193 absl::Status SkipValue();
194
195 // Forwards of functions from ZeroCopyBufferedStream.
196
AtEof()197 bool AtEof() {
198 // Ignore whitespace for the purposes of finding the EOF. This will return
199 // an error if we hit EOF, so we discard it.
200 (void)SkipToToken();
201 return stream_.AtEof();
202 }
203
Take(size_t len)204 absl::StatusOr<LocationWith<MaybeOwnedString>> Take(size_t len) {
205 JsonLocation loc = json_loc_;
206 auto taken = stream_.Take(len);
207 RETURN_IF_ERROR(taken.status());
208 return LocationWith<MaybeOwnedString>{*std::move(taken), loc};
209 }
210
211 template <typename Pred>
TakeWhile(Pred p)212 absl::StatusOr<LocationWith<MaybeOwnedString>> TakeWhile(Pred p) {
213 JsonLocation loc = json_loc_;
214 auto taken = stream_.TakeWhile(std::move(p));
215 RETURN_IF_ERROR(taken.status());
216 return LocationWith<MaybeOwnedString>{*std::move(taken), loc};
217 }
218
BeginMark()219 LocationWith<Mark> BeginMark() { return {stream_.BeginMark(), json_loc_}; }
220
221 private:
222 friend BufferingGuard;
223 friend Mark;
224 friend MaybeOwnedString;
225
Push()226 absl::Status Push() {
227 if (options_.recursion_depth == 0) {
228 return Invalid("JSON content was too deeply nested");
229 }
230 --options_.recursion_depth;
231 return absl::OkStatus();
232 }
233
Pop()234 void Pop() { ++options_.recursion_depth; }
235
236 // Parses the next four bytes as a 16-bit hex numeral.
237 absl::StatusOr<uint16_t> ParseU16HexCodepoint();
238
239 // Parses a Unicode escape (\uXXXX); this may be a surrogate pair, so it may
240 // consume the character that follows. Both are encoded as utf8 into
241 // `out_utf8`; returns the number of bytes written.
242 absl::StatusOr<size_t> ParseUnicodeEscape(char out_utf8[4]);
243
244 // Parses an alphanumeric "identifier", for use with the non-standard
245 // "unquoted keys" extension.
246 absl::StatusOr<LocationWith<MaybeOwnedString>> ParseBareWord();
247
Advance(size_t bytes)248 absl::Status Advance(size_t bytes) {
249 RETURN_IF_ERROR(stream_.Advance(bytes));
250 json_loc_.offset += static_cast<int>(bytes);
251 json_loc_.col += static_cast<int>(bytes);
252 return absl::OkStatus();
253 }
254
255 ZeroCopyBufferedStream stream_;
256
257 ParseOptions options_;
258 JsonLocation json_loc_;
259 MessagePath* path_;
260 };
261
262 template <typename F>
VisitArray(F f)263 absl::Status JsonLexer::VisitArray(F f) {
264 RETURN_IF_ERROR(Expect("["));
265 RETURN_IF_ERROR(Push());
266
267 if (Peek("]")) {
268 Pop();
269 return absl::OkStatus();
270 }
271
272 bool has_comma = true;
273 do {
274 if (!has_comma) {
275 return Invalid("expected ','");
276 }
277 RETURN_IF_ERROR(f());
278 has_comma = Peek(",");
279 } while (!Peek("]"));
280
281 if (!options_.allow_legacy_syntax && has_comma) {
282 return Invalid("expected ']'");
283 }
284
285 Pop();
286 return absl::OkStatus();
287 }
288
289 // Walks over an object, calling `f` just after parsing each `:`.
290 //
291 // `f` should have type `(MaybeOwnedString&) -> absl::Status`.
292 template <typename F>
VisitObject(F f)293 absl::Status JsonLexer::VisitObject(F f) {
294 RETURN_IF_ERROR(Expect("{"));
295 RETURN_IF_ERROR(Push());
296
297 if (Peek("}")) {
298 Pop();
299 return absl::OkStatus();
300 }
301
302 bool has_comma = true;
303 do {
304 if (!has_comma) {
305 return Invalid("expected ','");
306 }
307 RETURN_IF_ERROR(SkipToToken());
308
309 absl::StatusOr<LocationWith<MaybeOwnedString>> key;
310 if (stream_.PeekChar() == '"' || stream_.PeekChar() == '\'') {
311 key = ParseUtf8();
312 } else if (options_.allow_legacy_syntax) {
313 key = ParseBareWord();
314 } else {
315 return Invalid("expected '\"'");
316 }
317
318 RETURN_IF_ERROR(key.status());
319 RETURN_IF_ERROR(Expect(":"));
320 RETURN_IF_ERROR(f(*key));
321 has_comma = Peek(",");
322 } while (!Peek("}"));
323 Pop();
324
325 if (!options_.allow_legacy_syntax && has_comma) {
326 return Invalid("expected '}'");
327 }
328
329 return absl::OkStatus();
330 }
331 } // namespace json_internal
332 } // namespace protobuf
333 } // namespace google
334
335 #include "google/protobuf/port_undef.inc"
336 #endif // GOOGLE_PROTOBUF_JSON_INTERNAL_LEXER_H__
337