1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc. All rights reserved.
3 //
4 // Use of this source code is governed by a BSD-style
5 // license that can be found in the LICENSE file or at
6 // https://developers.google.com/open-source/licenses/bsd
7
8 #include "google/protobuf/json/internal/lexer.h"
9
10 #include <sys/types.h>
11
12 #include <atomic>
13 #include <cfloat>
14 #include <cmath>
15 #include <cstdint>
16 #include <iostream>
17 #include <limits>
18 #include <ostream>
19 #include <string>
20 #include <utility>
21
22 #include "absl/algorithm/container.h"
23 #include "absl/log/absl_check.h"
24 #include "absl/numeric/bits.h"
25 #include "absl/status/status.h"
26 #include "absl/status/statusor.h"
27 #include "absl/strings/ascii.h"
28 #include "absl/strings/numbers.h"
29 #include "absl/strings/str_cat.h"
30 #include "absl/strings/str_format.h"
31 #include "absl/strings/string_view.h"
32 #include "utf8_validity.h"
33 #include "google/protobuf/stubs/status_macros.h"
34
35 // Must be included last.
36 #include "google/protobuf/port_def.inc"
37
38 namespace google {
39 namespace protobuf {
40 namespace json_internal {
41 namespace {
42 // Randomly inserts bonus whitespace of a few different kinds into a string.
43 //
44 // This utility is intended to make error messages hostile to machine
45 // interpretation as a Hyrum's Law countermeasure, without potentially confusing
46 // human readers.
HardenAgainstHyrumsLaw(absl::string_view to_obfuscate,std::string & out)47 void HardenAgainstHyrumsLaw(absl::string_view to_obfuscate, std::string& out) {
48 // Get some simple randomness from ASLR, which is enabled in most
49 // environments. Our goal is to be annoying, not secure.
50 static const void* const kAslrSeed = &kAslrSeed;
51 // Per-call randomness from a relaxed atomic.
52 static std::atomic<uintptr_t> kCounterSeed{0};
53
54 constexpr uint64_t kA = 0x5851f42d4c957f2dull;
55 constexpr uint64_t kB = 0x14057b7ef767814full;
56
57 uint64_t state = absl::bit_cast<uintptr_t>(kAslrSeed) + kB +
58 kCounterSeed.fetch_add(1, std::memory_order_relaxed);
59 auto rng = [&state, &kA, &kB] {
60 state = state * kA + kB;
61 return absl::rotr(static_cast<uint32_t>(((state >> 18) ^ state) >> 27),
62 state >> 59);
63 };
64 (void)rng(); // Advance state once.
65
66 out.reserve(to_obfuscate.size() + absl::c_count(to_obfuscate, ' '));
67 for (char c : to_obfuscate) {
68 out.push_back(c);
69 if (c != ' ' || rng() % 3 != 0) {
70 continue;
71 }
72
73 size_t count = rng() % 2 + 1;
74 for (size_t i = 0; i < count; ++i) {
75 out.push_back(' ');
76 }
77 }
78 }
79 } // namespace
80
// Namespace-scope definition of the static constexpr data member
// ParseOptions::kDefaultDepth (presumably declared with its initializer in
// lexer.h -- confirm there). Before C++17 such a definition is required if the
// member is ever odr-used, e.g. bound to a const reference; from C++17 on the
// member is implicitly inline and this line is redundant but harmless.
81 constexpr size_t ParseOptions::kDefaultDepth;
82
Invalid(absl::string_view message,SourceLocation sl) const83 absl::Status JsonLocation::Invalid(absl::string_view message,
84 SourceLocation sl) const {
85 // NOTE: we intentionally do not harden the "invalid JSON" part, so that
86 // people have a hope of grepping for it in logs. That part is easy to
87 // commit to, as stability goes.
88 //
89 // This copies the error twice. Because this is the "unhappy" path, this
90 // function is cold and can afford the waste.
91 std::string status_message = "invalid JSON";
92 std::string to_obfuscate;
93 if (path != nullptr) {
94 absl::StrAppend(&to_obfuscate, " in ");
95 path->Describe(to_obfuscate);
96 to_obfuscate.push_back(',');
97 }
98 absl::StrAppendFormat(&to_obfuscate, " near %zu:%zu (offset %zu): %s",
99 line + 1, col + 1, offset, message);
100 HardenAgainstHyrumsLaw(to_obfuscate, status_message);
101
102 return absl::InvalidArgumentError(std::move(status_message));
103 }
104
PeekKind()105 absl::StatusOr<JsonLexer::Kind> JsonLexer::PeekKind() {
106 RETURN_IF_ERROR(SkipToToken());
107 char c = stream_.PeekChar();
108 switch (c) {
109 case '{':
110 return JsonLexer::kObj;
111 case '[':
112 return JsonLexer::kArr;
113 case '"':
114 case '\'':
115 return JsonLexer::kStr;
116 case '-':
117 case '0':
118 case '1':
119 case '2':
120 case '3':
121 case '4':
122 case '5':
123 case '6':
124 case '7':
125 case '8':
126 case '9':
127 return JsonLexer::kNum;
128 case 't':
129 return JsonLexer::kTrue;
130 case 'f':
131 return JsonLexer::kFalse;
132 case 'n':
133 return JsonLexer::kNull;
134 default:
135 return Invalid(absl::StrFormat("unexpected character: '%c'", c));
136 }
137 }
138
SkipValue()139 absl::Status JsonLexer::SkipValue() {
140 absl::StatusOr<Kind> kind = PeekKind();
141 RETURN_IF_ERROR(kind.status());
142
143 switch (*kind) {
144 case JsonLexer::kObj:
145 return VisitObject(
146 [this](LocationWith<MaybeOwnedString>&) { return SkipValue(); });
147 case JsonLexer::kArr:
148 return VisitArray([this] { return SkipValue(); });
149 case JsonLexer::kStr:
150 return ParseUtf8().status();
151 case JsonLexer::kNum:
152 return ParseNumber().status();
153 case JsonLexer::kTrue:
154 return Expect("true");
155 case JsonLexer::kFalse:
156 return Expect("false");
157 case JsonLexer::kNull:
158 return Expect("null");
159 default:
160 break;
161 }
162 // Some compilers seem to fail to realize this is a basic block
163 // terminator and incorrectly believe this function is missing
164 // a return.
165 ABSL_CHECK(false) << "unreachable";
166 return absl::OkStatus();
167 }
168
ParseU16HexCodepoint()169 absl::StatusOr<uint16_t> JsonLexer::ParseU16HexCodepoint() {
170 absl::StatusOr<LocationWith<MaybeOwnedString>> escape = Take(4);
171 RETURN_IF_ERROR(escape.status());
172
173 uint16_t u16 = 0;
174 for (char c : escape->value.AsView()) {
175 if (c >= '0' && c <= '9') {
176 c -= '0';
177 } else if (c >= 'a' && c <= 'f') {
178 c = c - 'a' + 10;
179 } else if (c >= 'A' && c <= 'F') {
180 c = c - 'A' + 10;
181 } else {
182 return Invalid("invalid Unicode escape");
183 }
184 u16 <<= 4;
185 u16 |= c;
186 }
187
188 return u16;
189 }
190
SkipToToken()191 absl::Status JsonLexer::SkipToToken() {
192 while (true) {
193 RETURN_IF_ERROR(stream_.BufferAtLeast(1).status());
194 switch (stream_.PeekChar()) {
195 case '\n':
196 RETURN_IF_ERROR(Advance(1));
197 ++json_loc_.line;
198 json_loc_.col = 0;
199 break;
200 case '\r':
201 case '\t':
202 case ' ':
203 RETURN_IF_ERROR(Advance(1));
204 break;
205 default:
206 return absl::OkStatus();
207 }
208 }
209 }
210
ParseRawNumber()211 absl::StatusOr<LocationWith<MaybeOwnedString>> JsonLexer::ParseRawNumber() {
212 RETURN_IF_ERROR(SkipToToken());
213
214 enum { kInt, kFraction, kExponent } state = kInt;
215 char prev_var = 0;
216 auto number = TakeWhile([state, prev_var](size_t index, char c) mutable {
217 char prev = prev_var;
218 prev_var = c;
219 if (absl::ascii_isdigit(c)) {
220 return true;
221 }
222
223 bool last_was_int = absl::ascii_isdigit(prev);
224 // These checks handle transitions between the integer, fractional, and
225 // exponent part of a number. This will cut off at the first syntax error.
226 // Because all numbers must be followed by `,`, `]`, or `}`, we can let
227 // that catch what's left behind.
228 if (state == kInt && c == '-') {
229 return !last_was_int;
230 }
231 if (state == kInt && last_was_int && c == '.') {
232 state = kFraction;
233 return true;
234 }
235 if (state != kExponent && last_was_int && (c == 'e' || c == 'E')) {
236 state = kExponent;
237 return true;
238 }
239 if ((prev == 'e' || prev == 'E') && (c == '-' || c == '+')) {
240 return true;
241 }
242
243 return false;
244 });
245
246 RETURN_IF_ERROR(number.status());
247 absl::string_view number_text = number->value.AsView();
248
249 if (number_text.empty() || number_text == "-") {
250 return number->loc.Invalid("expected a number");
251 }
252
253 auto without_minus =
254 number_text[0] == '-' ? number_text.substr(1) : number_text;
255 if (without_minus.size() > 1 && without_minus[0] == '0' &&
256 absl::ascii_isdigit(without_minus[1])) {
257 return number->loc.Invalid("number cannot have extraneous leading zero");
258 }
259
260 if (number_text.back() == '.') {
261 return number->loc.Invalid("number cannot have trailing period");
262 }
263
264 double d;
265 if (!absl::SimpleAtod(number_text, &d) || !std::isfinite(d)) {
266 return number->loc.Invalid(
267 absl::StrFormat("invalid number: '%s'", number_text));
268 }
269
270 // Find the next token, to make sure we didn't leave something behind we
271 // shouldn't have.
272 if (!stream_.AtEof()) {
273 RETURN_IF_ERROR(SkipToToken());
274 switch (stream_.PeekChar()) {
275 case ',':
276 case ']':
277 case '}':
278 break;
279 default:
280 return Invalid(
281 absl::StrFormat("unexpected character: '%c'", stream_.PeekChar()));
282 }
283 }
284
285 return number;
286 }
287
ParseNumber()288 absl::StatusOr<LocationWith<double>> JsonLexer::ParseNumber() {
289 auto number = ParseRawNumber();
290 RETURN_IF_ERROR(number.status());
291
292 double d;
293 if (!absl::SimpleAtod(number->value.AsView(), &d) || !std::isfinite(d)) {
294 return number->loc.Invalid(
295 absl::StrFormat("invalid number: '%s'", number->value.AsView()));
296 }
297
298 return LocationWith<double>{d, number->loc};
299 }
300
ParseUnicodeEscape(char out_utf8[4])301 absl::StatusOr<size_t> JsonLexer::ParseUnicodeEscape(char out_utf8[4]) {
302 auto hex = ParseU16HexCodepoint();
303 RETURN_IF_ERROR(hex.status());
304
305 uint32_t rune = *hex;
306 if (rune >= 0xd800 && rune <= 0xdbff) {
307 // Surrogate pair: two 16-bit codepoints become a 32-bit codepoint.
308 uint32_t high = rune;
309
310 RETURN_IF_ERROR(Expect("\\u"));
311 auto hex = ParseU16HexCodepoint();
312 RETURN_IF_ERROR(hex.status());
313
314 uint32_t low = *hex;
315 if (low < 0xdc00 || low > 0xdfff) {
316 return Invalid("invalid low surrogate");
317 }
318
319 rune = (high & 0x3ff) << 10;
320 rune |= (low & 0x3ff);
321 rune += 0x10000;
322 } else if (rune >= 0xdc00 && rune <= 0xdfff) {
323 return Invalid("unpaired low surrogate");
324 }
325
326 // Write as UTF-8.
327 if (rune <= 0x7f) {
328 out_utf8[0] = rune;
329 return 1;
330 } else if (rune <= 0x07ff) {
331 out_utf8[0] = ((rune >> 6) & 0x1f) | 0xc0;
332 out_utf8[1] = ((rune >> 0) & 0x3f) | 0x80;
333 return 2;
334 } else if (rune <= 0xffff) {
335 out_utf8[0] = ((rune >> 12) & 0x0f) | 0xe0;
336 out_utf8[1] = ((rune >> 6) & 0x3f) | 0x80;
337 out_utf8[2] = ((rune >> 0) & 0x3f) | 0x80;
338 return 3;
339 } else if (rune <= 0x10ffff) {
340 out_utf8[0] = ((rune >> 18) & 0x07) | 0xF0;
341 out_utf8[1] = ((rune >> 12) & 0x3f) | 0x80;
342 out_utf8[2] = ((rune >> 6) & 0x3f) | 0x80;
343 out_utf8[3] = ((rune >> 0) & 0x3f) | 0x80;
344 return 4;
345 } else {
346 return Invalid("invalid codepoint");
347 }
348 }
349
// Maps the character following a backslash to the character it stands for.
//
// Handles the single-character escapes `"`, `\`, `/`, `b`, `f`, `n`, `r` and
// `t`; `'` is additionally accepted when `allow_legacy_syntax` is true.
// Returns 0 for any character that is not a recognized escape.
static char ParseSimpleEscape(char c, bool allow_legacy_syntax) {
  // The single quote is the only escape whose validity depends on the mode.
  if (c == '\'') {
    return allow_legacy_syntax ? '\'' : '\0';
  }

  // {escape character, resulting character}
  static constexpr char kEscapes[][2] = {
      {'"', '"'},  {'\\', '\\'}, {'/', '/'},  {'b', '\b'},
      {'f', '\f'}, {'n', '\n'},  {'r', '\r'}, {'t', '\t'},
  };
  for (const auto& entry : kEscapes) {
    if (entry[0] == c) {
      return entry[1];
    }
  }
  return '\0';
}
377
ParseUtf8()378 absl::StatusOr<LocationWith<MaybeOwnedString>> JsonLexer::ParseUtf8() {
379 RETURN_IF_ERROR(SkipToToken());
380 // This is a non-standard extension accepted by the ESF parser that we will
381 // need to accept for backwards-compat.
382 bool is_single_quote = stream_.PeekChar() == '\'';
383 if (!options_.allow_legacy_syntax && is_single_quote) {
384 return Invalid("expected '\"'");
385 }
386
387 JsonLocation loc = json_loc_;
388 RETURN_IF_ERROR(Expect(is_single_quote ? "'" : "\""));
389
390 // on_heap is empty if we do not need to heap-allocate the string.
391 std::string on_heap;
392 LocationWith<Mark> mark = BeginMark();
393 while (true) {
394 RETURN_IF_ERROR(stream_.BufferAtLeast(1).status());
395
396 char c = stream_.PeekChar();
397 RETURN_IF_ERROR(Advance(1));
398 switch (c) {
399 case '"':
400 case '\'': {
401 if (c != (is_single_quote ? '\'' : '"')) {
402 goto normal_character;
403 }
404
405 // NOTE: the 1 below clips off the " from the end of the string.
406 MaybeOwnedString result = on_heap.empty()
407 ? mark.value.UpToUnread(1)
408 : MaybeOwnedString{std::move(on_heap)};
409 if (utf8_range::IsStructurallyValid(result)) {
410 return LocationWith<MaybeOwnedString>{std::move(result), loc};
411 }
412 return Invalid("Invalid UTF-8 string");
413 }
414 case '\\': {
415 if (on_heap.empty()) {
416 // The 1 skips over the `\`.
417 on_heap = std::string(mark.value.UpToUnread(1).AsView());
418 // Clang-tidy incorrectly notes this as being moved-from multiple
419 // times, but it can only occur in one loop iteration. The mark is
420 // destroyed only if we need to handle an escape when on_heap is
421 // empty. Because this branch unconditionally pushes to on_heap, this
422 // condition can never be reached in any iteration that follows it.
423 // Thus, at most one move ever actually occurs.
424 std::move(mark).value.Discard();
425 }
426 RETURN_IF_ERROR(stream_.BufferAtLeast(1).status());
427
428 char c = stream_.PeekChar();
429 RETURN_IF_ERROR(Advance(1));
430 if (c == 'u' || (c == 'U' && options_.allow_legacy_syntax)) {
431 // Ensure there is actual space to scribble the UTF-8 onto.
432 on_heap.resize(on_heap.size() + 4);
433 auto written = ParseUnicodeEscape(&on_heap[on_heap.size() - 4]);
434 RETURN_IF_ERROR(written.status());
435 on_heap.resize(on_heap.size() - 4 + *written);
436 } else {
437 char escape = ParseSimpleEscape(c, options_.allow_legacy_syntax);
438 if (escape == 0) {
439 return Invalid(absl::StrFormat("invalid escape char: '%c'", c));
440 }
441 on_heap.push_back(escape);
442 }
443 break;
444 }
445 normal_character:
446 default: {
447 uint8_t uc = static_cast<uint8_t>(c);
448 // If people have newlines in their strings, that's their problem; it
449 // is too difficult to support correctly in our location tracking, and
450 // is out of spec, so users will get slightly wrong locations in errors.
451 if ((uc < 0x20 || uc == 0xff) && !options_.allow_legacy_syntax) {
452 return Invalid(absl::StrFormat(
453 "invalid control character 0x%02x in string", uc));
454 }
455
456 // Process this UTF-8 code point. We do not need to fully validate it
457 // at this stage; we just need to interpret it enough to know how many
458 // bytes to read. UTF-8 is a varint encoding satisfying one of the
459 // following (big-endian) patterns:
460 //
461 // 0b0xxxxxxx
462 // 0b110xxxxx'10xxxxxx
463 // 0b1110xxxx'10xxxxxx'10xxxxxx
464 // 0b11110xxx'10xxxxxx'10xxxxxx'10xxxxxx
465 size_t lookahead = 0;
466 switch (absl::countl_one(uc)) {
467 case 0:
468 break;
469 case 2:
470 lookahead = 1;
471 break;
472 case 3:
473 lookahead = 2;
474 break;
475 case 4:
476 lookahead = 3;
477 break;
478 default:
479 return Invalid("invalid UTF-8 in string");
480 }
481
482 if (!on_heap.empty()) {
483 on_heap.push_back(c);
484 }
485 auto lookahead_bytes = stream_.Take(lookahead);
486 RETURN_IF_ERROR(lookahead_bytes.status());
487 if (!on_heap.empty()) {
488 absl::string_view view = lookahead_bytes->AsView();
489 on_heap.append(view.data(), view.size());
490 }
491 break;
492 }
493 }
494 }
495
496 return Invalid("EOF inside string");
497 }
498
ParseBareWord()499 absl::StatusOr<LocationWith<MaybeOwnedString>> JsonLexer::ParseBareWord() {
500 RETURN_IF_ERROR(SkipToToken());
501 auto ident = TakeWhile(
502 [](size_t, char c) { return c == '_' || absl::ascii_isalnum(c); });
503 RETURN_IF_ERROR(ident.status());
504 absl::string_view text = ident->value.AsView();
505
506 if (text.empty() || absl::ascii_isdigit(text[0]) || text == "null" ||
507 text == "true" || text == "false") {
508 return ident->loc.Invalid("expected bare word");
509 }
510 return ident;
511 }
512
513 } // namespace json_internal
514 } // namespace protobuf
515 } // namespace google
516
517 #include "google/protobuf/port_undef.inc"
518