• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc.  All rights reserved.
3 //
4 // Use of this source code is governed by a BSD-style
5 // license that can be found in the LICENSE file or at
6 // https://developers.google.com/open-source/licenses/bsd
7 
8 #include "google/protobuf/json/internal/lexer.h"
9 
10 #include <sys/types.h>
11 
12 #include <atomic>
13 #include <cfloat>
14 #include <cmath>
15 #include <cstdint>
16 #include <iostream>
17 #include <limits>
18 #include <ostream>
19 #include <string>
20 #include <utility>
21 
22 #include "absl/algorithm/container.h"
23 #include "absl/log/absl_check.h"
24 #include "absl/numeric/bits.h"
25 #include "absl/status/status.h"
26 #include "absl/status/statusor.h"
27 #include "absl/strings/ascii.h"
28 #include "absl/strings/numbers.h"
29 #include "absl/strings/str_cat.h"
30 #include "absl/strings/str_format.h"
31 #include "absl/strings/string_view.h"
32 #include "utf8_validity.h"
33 #include "google/protobuf/stubs/status_macros.h"
34 
35 // Must be included last.
36 #include "google/protobuf/port_def.inc"
37 
38 namespace google {
39 namespace protobuf {
40 namespace json_internal {
41 namespace {
42 // Randomly inserts bonus whitespace of a few different kinds into a string.
43 //
44 // This utility is intended to make error messages hostile to machine
45 // interpretation as a Hyrum's Law countermeasure, without potentially confusing
46 // human readers.
HardenAgainstHyrumsLaw(absl::string_view to_obfuscate,std::string & out)47 void HardenAgainstHyrumsLaw(absl::string_view to_obfuscate, std::string& out) {
48   // Get some simple randomness from ASLR, which is enabled in most
49   // environments. Our goal is to be annoying, not secure.
50   static const void* const kAslrSeed = &kAslrSeed;
51   // Per-call randomness from a relaxed atomic.
52   static std::atomic<uintptr_t> kCounterSeed{0};
53 
54   constexpr uint64_t kA = 0x5851f42d4c957f2dull;
55   constexpr uint64_t kB = 0x14057b7ef767814full;
56 
57   uint64_t state = absl::bit_cast<uintptr_t>(kAslrSeed) + kB +
58                    kCounterSeed.fetch_add(1, std::memory_order_relaxed);
59   auto rng = [&state, &kA, &kB] {
60     state = state * kA + kB;
61     return absl::rotr(static_cast<uint32_t>(((state >> 18) ^ state) >> 27),
62                       state >> 59);
63   };
64   (void)rng();  // Advance state once.
65 
66   out.reserve(to_obfuscate.size() + absl::c_count(to_obfuscate, ' '));
67   for (char c : to_obfuscate) {
68     out.push_back(c);
69     if (c != ' ' || rng() % 3 != 0) {
70       continue;
71     }
72 
73     size_t count = rng() % 2 + 1;
74     for (size_t i = 0; i < count; ++i) {
75       out.push_back(' ');
76     }
77   }
78 }
79 }  // namespace
80 
81 constexpr size_t ParseOptions::kDefaultDepth;
82 
Invalid(absl::string_view message,SourceLocation sl) const83 absl::Status JsonLocation::Invalid(absl::string_view message,
84                                    SourceLocation sl) const {
85   // NOTE: we intentionally do not harden the "invalid JSON" part, so that
86   // people have a hope of grepping for it in logs. That part is easy to
87   // commit to, as stability goes.
88   //
89   // This copies the error twice. Because this is the "unhappy" path, this
90   // function is cold and can afford the waste.
91   std::string status_message = "invalid JSON";
92   std::string to_obfuscate;
93   if (path != nullptr) {
94     absl::StrAppend(&to_obfuscate, " in ");
95     path->Describe(to_obfuscate);
96     to_obfuscate.push_back(',');
97   }
98   absl::StrAppendFormat(&to_obfuscate, " near %zu:%zu (offset %zu): %s",
99                         line + 1, col + 1, offset, message);
100   HardenAgainstHyrumsLaw(to_obfuscate, status_message);
101 
102   return absl::InvalidArgumentError(std::move(status_message));
103 }
104 
PeekKind()105 absl::StatusOr<JsonLexer::Kind> JsonLexer::PeekKind() {
106   RETURN_IF_ERROR(SkipToToken());
107   char c = stream_.PeekChar();
108   switch (c) {
109     case '{':
110       return JsonLexer::kObj;
111     case '[':
112       return JsonLexer::kArr;
113     case '"':
114     case '\'':
115       return JsonLexer::kStr;
116     case '-':
117     case '0':
118     case '1':
119     case '2':
120     case '3':
121     case '4':
122     case '5':
123     case '6':
124     case '7':
125     case '8':
126     case '9':
127       return JsonLexer::kNum;
128     case 't':
129       return JsonLexer::kTrue;
130     case 'f':
131       return JsonLexer::kFalse;
132     case 'n':
133       return JsonLexer::kNull;
134     default:
135       return Invalid(absl::StrFormat("unexpected character: '%c'", c));
136   }
137 }
138 
SkipValue()139 absl::Status JsonLexer::SkipValue() {
140   absl::StatusOr<Kind> kind = PeekKind();
141   RETURN_IF_ERROR(kind.status());
142 
143   switch (*kind) {
144     case JsonLexer::kObj:
145       return VisitObject(
146           [this](LocationWith<MaybeOwnedString>&) { return SkipValue(); });
147     case JsonLexer::kArr:
148       return VisitArray([this] { return SkipValue(); });
149     case JsonLexer::kStr:
150       return ParseUtf8().status();
151     case JsonLexer::kNum:
152       return ParseNumber().status();
153     case JsonLexer::kTrue:
154       return Expect("true");
155     case JsonLexer::kFalse:
156       return Expect("false");
157     case JsonLexer::kNull:
158       return Expect("null");
159     default:
160       break;
161   }
162   // Some compilers seem to fail to realize this is a basic block
163   // terminator and incorrectly believe this function is missing
164   // a return.
165   ABSL_CHECK(false) << "unreachable";
166   return absl::OkStatus();
167 }
168 
ParseU16HexCodepoint()169 absl::StatusOr<uint16_t> JsonLexer::ParseU16HexCodepoint() {
170   absl::StatusOr<LocationWith<MaybeOwnedString>> escape = Take(4);
171   RETURN_IF_ERROR(escape.status());
172 
173   uint16_t u16 = 0;
174   for (char c : escape->value.AsView()) {
175     if (c >= '0' && c <= '9') {
176       c -= '0';
177     } else if (c >= 'a' && c <= 'f') {
178       c = c - 'a' + 10;
179     } else if (c >= 'A' && c <= 'F') {
180       c = c - 'A' + 10;
181     } else {
182       return Invalid("invalid Unicode escape");
183     }
184     u16 <<= 4;
185     u16 |= c;
186   }
187 
188   return u16;
189 }
190 
SkipToToken()191 absl::Status JsonLexer::SkipToToken() {
192   while (true) {
193     RETURN_IF_ERROR(stream_.BufferAtLeast(1).status());
194     switch (stream_.PeekChar()) {
195       case '\n':
196         RETURN_IF_ERROR(Advance(1));
197         ++json_loc_.line;
198         json_loc_.col = 0;
199         break;
200       case '\r':
201       case '\t':
202       case ' ':
203         RETURN_IF_ERROR(Advance(1));
204         break;
205       default:
206         return absl::OkStatus();
207     }
208   }
209 }
210 
ParseRawNumber()211 absl::StatusOr<LocationWith<MaybeOwnedString>> JsonLexer::ParseRawNumber() {
212   RETURN_IF_ERROR(SkipToToken());
213 
214   enum { kInt, kFraction, kExponent } state = kInt;
215   char prev_var = 0;
216   auto number = TakeWhile([state, prev_var](size_t index, char c) mutable {
217     char prev = prev_var;
218     prev_var = c;
219     if (absl::ascii_isdigit(c)) {
220       return true;
221     }
222 
223     bool last_was_int = absl::ascii_isdigit(prev);
224     // These checks handle transitions between the integer, fractional, and
225     // exponent part of a number. This will cut off at the first syntax error.
226     // Because all numbers must be followed by `,`, `]`, or `}`, we can let
227     // that catch what's left behind.
228     if (state == kInt && c == '-') {
229       return !last_was_int;
230     }
231     if (state == kInt && last_was_int && c == '.') {
232       state = kFraction;
233       return true;
234     }
235     if (state != kExponent && last_was_int && (c == 'e' || c == 'E')) {
236       state = kExponent;
237       return true;
238     }
239     if ((prev == 'e' || prev == 'E') && (c == '-' || c == '+')) {
240       return true;
241     }
242 
243     return false;
244   });
245 
246   RETURN_IF_ERROR(number.status());
247   absl::string_view number_text = number->value.AsView();
248 
249   if (number_text.empty() || number_text == "-") {
250     return number->loc.Invalid("expected a number");
251   }
252 
253   auto without_minus =
254       number_text[0] == '-' ? number_text.substr(1) : number_text;
255   if (without_minus.size() > 1 && without_minus[0] == '0' &&
256       absl::ascii_isdigit(without_minus[1])) {
257     return number->loc.Invalid("number cannot have extraneous leading zero");
258   }
259 
260   if (number_text.back() == '.') {
261     return number->loc.Invalid("number cannot have trailing period");
262   }
263 
264   double d;
265   if (!absl::SimpleAtod(number_text, &d) || !std::isfinite(d)) {
266     return number->loc.Invalid(
267         absl::StrFormat("invalid number: '%s'", number_text));
268   }
269 
270   // Find the next token, to make sure we didn't leave something behind we
271   // shouldn't have.
272   if (!stream_.AtEof()) {
273     RETURN_IF_ERROR(SkipToToken());
274     switch (stream_.PeekChar()) {
275       case ',':
276       case ']':
277       case '}':
278         break;
279       default:
280         return Invalid(
281             absl::StrFormat("unexpected character: '%c'", stream_.PeekChar()));
282     }
283   }
284 
285   return number;
286 }
287 
ParseNumber()288 absl::StatusOr<LocationWith<double>> JsonLexer::ParseNumber() {
289   auto number = ParseRawNumber();
290   RETURN_IF_ERROR(number.status());
291 
292   double d;
293   if (!absl::SimpleAtod(number->value.AsView(), &d) || !std::isfinite(d)) {
294     return number->loc.Invalid(
295         absl::StrFormat("invalid number: '%s'", number->value.AsView()));
296   }
297 
298   return LocationWith<double>{d, number->loc};
299 }
300 
ParseUnicodeEscape(char out_utf8[4])301 absl::StatusOr<size_t> JsonLexer::ParseUnicodeEscape(char out_utf8[4]) {
302   auto hex = ParseU16HexCodepoint();
303   RETURN_IF_ERROR(hex.status());
304 
305   uint32_t rune = *hex;
306   if (rune >= 0xd800 && rune <= 0xdbff) {
307     // Surrogate pair: two 16-bit codepoints become a 32-bit codepoint.
308     uint32_t high = rune;
309 
310     RETURN_IF_ERROR(Expect("\\u"));
311     auto hex = ParseU16HexCodepoint();
312     RETURN_IF_ERROR(hex.status());
313 
314     uint32_t low = *hex;
315     if (low < 0xdc00 || low > 0xdfff) {
316       return Invalid("invalid low surrogate");
317     }
318 
319     rune = (high & 0x3ff) << 10;
320     rune |= (low & 0x3ff);
321     rune += 0x10000;
322   } else if (rune >= 0xdc00 && rune <= 0xdfff) {
323     return Invalid("unpaired low surrogate");
324   }
325 
326   // Write as UTF-8.
327   if (rune <= 0x7f) {
328     out_utf8[0] = rune;
329     return 1;
330   } else if (rune <= 0x07ff) {
331     out_utf8[0] = ((rune >> 6) & 0x1f) | 0xc0;
332     out_utf8[1] = ((rune >> 0) & 0x3f) | 0x80;
333     return 2;
334   } else if (rune <= 0xffff) {
335     out_utf8[0] = ((rune >> 12) & 0x0f) | 0xe0;
336     out_utf8[1] = ((rune >> 6) & 0x3f) | 0x80;
337     out_utf8[2] = ((rune >> 0) & 0x3f) | 0x80;
338     return 3;
339   } else if (rune <= 0x10ffff) {
340     out_utf8[0] = ((rune >> 18) & 0x07) | 0xF0;
341     out_utf8[1] = ((rune >> 12) & 0x3f) | 0x80;
342     out_utf8[2] = ((rune >> 6) & 0x3f) | 0x80;
343     out_utf8[3] = ((rune >> 0) & 0x3f) | 0x80;
344     return 4;
345   } else {
346     return Invalid("invalid codepoint");
347   }
348 }
349 
// Maps the character following a backslash to the character it stands for.
//
// Returns 0 when `c` is not a recognised single-character escape. The `\'`
// escape is only recognised when `allow_legacy_syntax` is set.
static char ParseSimpleEscape(char c, bool allow_legacy_syntax) {
  switch (c) {
    // These escapes stand for themselves.
    case '"':
    case '\\':
    case '/':
      return c;
    case 'b':
      return '\b';
    case 'f':
      return '\f';
    case 'n':
      return '\n';
    case 'r':
      return '\r';
    case 't':
      return '\t';
    case '\'':
      return allow_legacy_syntax ? '\'' : '\0';
    default:
      return '\0';
  }
}
377 
ParseUtf8()378 absl::StatusOr<LocationWith<MaybeOwnedString>> JsonLexer::ParseUtf8() {
379   RETURN_IF_ERROR(SkipToToken());
380   // This is a non-standard extension accepted by the ESF parser that we will
381   // need to accept for backwards-compat.
382   bool is_single_quote = stream_.PeekChar() == '\'';
383   if (!options_.allow_legacy_syntax && is_single_quote) {
384     return Invalid("expected '\"'");
385   }
386 
387   JsonLocation loc = json_loc_;
388   RETURN_IF_ERROR(Expect(is_single_quote ? "'" : "\""));
389 
390   // on_heap is empty if we do not need to heap-allocate the string.
391   std::string on_heap;
392   LocationWith<Mark> mark = BeginMark();
393   while (true) {
394     RETURN_IF_ERROR(stream_.BufferAtLeast(1).status());
395 
396     char c = stream_.PeekChar();
397     RETURN_IF_ERROR(Advance(1));
398     switch (c) {
399       case '"':
400       case '\'': {
401         if (c != (is_single_quote ? '\'' : '"')) {
402           goto normal_character;
403         }
404 
405         // NOTE: the 1 below clips off the " from the end of the string.
406         MaybeOwnedString result = on_heap.empty()
407                                       ? mark.value.UpToUnread(1)
408                                       : MaybeOwnedString{std::move(on_heap)};
409         if (utf8_range::IsStructurallyValid(result)) {
410           return LocationWith<MaybeOwnedString>{std::move(result), loc};
411         }
412         return Invalid("Invalid UTF-8 string");
413       }
414       case '\\': {
415         if (on_heap.empty()) {
416           // The 1 skips over the `\`.
417           on_heap = std::string(mark.value.UpToUnread(1).AsView());
418           // Clang-tidy incorrectly notes this as being moved-from multiple
419           // times, but it can only occur in one loop iteration. The mark is
420           // destroyed only if we need to handle an escape when on_heap is
421           // empty. Because this branch unconditionally pushes to on_heap, this
422           // condition can never be reached in any iteration that follows it.
423           // Thus, at most one move ever actually occurs.
424           std::move(mark).value.Discard();
425         }
426         RETURN_IF_ERROR(stream_.BufferAtLeast(1).status());
427 
428         char c = stream_.PeekChar();
429         RETURN_IF_ERROR(Advance(1));
430         if (c == 'u' || (c == 'U' && options_.allow_legacy_syntax)) {
431           // Ensure there is actual space to scribble the UTF-8 onto.
432           on_heap.resize(on_heap.size() + 4);
433           auto written = ParseUnicodeEscape(&on_heap[on_heap.size() - 4]);
434           RETURN_IF_ERROR(written.status());
435           on_heap.resize(on_heap.size() - 4 + *written);
436         } else {
437           char escape = ParseSimpleEscape(c, options_.allow_legacy_syntax);
438           if (escape == 0) {
439             return Invalid(absl::StrFormat("invalid escape char: '%c'", c));
440           }
441           on_heap.push_back(escape);
442         }
443         break;
444       }
445       normal_character:
446       default: {
447         uint8_t uc = static_cast<uint8_t>(c);
448         // If people have newlines in their strings, that's their problem; it
449         // is too difficult to support correctly in our location tracking, and
450         // is out of spec, so users will get slightly wrong locations in errors.
451         if ((uc < 0x20 || uc == 0xff) && !options_.allow_legacy_syntax) {
452           return Invalid(absl::StrFormat(
453               "invalid control character 0x%02x in string", uc));
454         }
455 
456         // Process this UTF-8 code point. We do not need to fully validate it
457         // at this stage; we just need to interpret it enough to know how many
458         // bytes to read. UTF-8 is a varint encoding satisfying one of the
459         // following (big-endian) patterns:
460         //
461         // 0b0xxxxxxx
462         // 0b110xxxxx'10xxxxxx
463         // 0b1110xxxx'10xxxxxx'10xxxxxx
464         // 0b11110xxx'10xxxxxx'10xxxxxx'10xxxxxx
465         size_t lookahead = 0;
466         switch (absl::countl_one(uc)) {
467           case 0:
468             break;
469           case 2:
470             lookahead = 1;
471             break;
472           case 3:
473             lookahead = 2;
474             break;
475           case 4:
476             lookahead = 3;
477             break;
478           default:
479             return Invalid("invalid UTF-8 in string");
480         }
481 
482         if (!on_heap.empty()) {
483           on_heap.push_back(c);
484         }
485         auto lookahead_bytes = stream_.Take(lookahead);
486         RETURN_IF_ERROR(lookahead_bytes.status());
487         if (!on_heap.empty()) {
488           absl::string_view view = lookahead_bytes->AsView();
489           on_heap.append(view.data(), view.size());
490         }
491         break;
492       }
493     }
494   }
495 
496   return Invalid("EOF inside string");
497 }
498 
ParseBareWord()499 absl::StatusOr<LocationWith<MaybeOwnedString>> JsonLexer::ParseBareWord() {
500   RETURN_IF_ERROR(SkipToToken());
501   auto ident = TakeWhile(
502       [](size_t, char c) { return c == '_' || absl::ascii_isalnum(c); });
503   RETURN_IF_ERROR(ident.status());
504   absl::string_view text = ident->value.AsView();
505 
506   if (text.empty() || absl::ascii_isdigit(text[0]) || text == "null" ||
507       text == "true" || text == "false") {
508     return ident->loc.Invalid("expected bare word");
509   }
510   return ident;
511 }
512 
513 }  // namespace json_internal
514 }  // namespace protobuf
515 }  // namespace google
516 
517 #include "google/protobuf/port_undef.inc"
518