1 // Copyright 2019 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "encoding.h"
6
7 #include <algorithm>
8 #include <cassert>
9 #include <cmath>
10 #include <cstring>
11 #include <limits>
12 #include <stack>
13
14 namespace v8_inspector_protocol_encoding {
15 // =============================================================================
16 // Status and Error codes
17 // =============================================================================
18
ToASCIIString() const19 std::string Status::ToASCIIString() const {
20 switch (error) {
21 case Error::OK:
22 return "OK";
23 case Error::JSON_PARSER_UNPROCESSED_INPUT_REMAINS:
24 return ToASCIIString("JSON: unprocessed input remains");
25 case Error::JSON_PARSER_STACK_LIMIT_EXCEEDED:
26 return ToASCIIString("JSON: stack limit exceeded");
27 case Error::JSON_PARSER_NO_INPUT:
28 return ToASCIIString("JSON: no input");
29 case Error::JSON_PARSER_INVALID_TOKEN:
30 return ToASCIIString("JSON: invalid token");
31 case Error::JSON_PARSER_INVALID_NUMBER:
32 return ToASCIIString("JSON: invalid number");
33 case Error::JSON_PARSER_INVALID_STRING:
34 return ToASCIIString("JSON: invalid string");
35 case Error::JSON_PARSER_UNEXPECTED_ARRAY_END:
36 return ToASCIIString("JSON: unexpected array end");
37 case Error::JSON_PARSER_COMMA_OR_ARRAY_END_EXPECTED:
38 return ToASCIIString("JSON: comma or array end expected");
39 case Error::JSON_PARSER_STRING_LITERAL_EXPECTED:
40 return ToASCIIString("JSON: string literal expected");
41 case Error::JSON_PARSER_COLON_EXPECTED:
42 return ToASCIIString("JSON: colon expected");
43 case Error::JSON_PARSER_UNEXPECTED_MAP_END:
44 return ToASCIIString("JSON: unexpected map end");
45 case Error::JSON_PARSER_COMMA_OR_MAP_END_EXPECTED:
46 return ToASCIIString("JSON: comma or map end expected");
47 case Error::JSON_PARSER_VALUE_EXPECTED:
48 return ToASCIIString("JSON: value expected");
49
50 case Error::CBOR_INVALID_INT32:
51 return ToASCIIString("CBOR: invalid int32");
52 case Error::CBOR_INVALID_DOUBLE:
53 return ToASCIIString("CBOR: invalid double");
54 case Error::CBOR_INVALID_ENVELOPE:
55 return ToASCIIString("CBOR: invalid envelope");
56 case Error::CBOR_INVALID_STRING8:
57 return ToASCIIString("CBOR: invalid string8");
58 case Error::CBOR_INVALID_STRING16:
59 return ToASCIIString("CBOR: invalid string16");
60 case Error::CBOR_INVALID_BINARY:
61 return ToASCIIString("CBOR: invalid binary");
62 case Error::CBOR_UNSUPPORTED_VALUE:
63 return ToASCIIString("CBOR: unsupported value");
64 case Error::CBOR_NO_INPUT:
65 return ToASCIIString("CBOR: no input");
66 case Error::CBOR_INVALID_START_BYTE:
67 return ToASCIIString("CBOR: invalid start byte");
68 case Error::CBOR_UNEXPECTED_EOF_EXPECTED_VALUE:
69 return ToASCIIString("CBOR: unexpected eof expected value");
70 case Error::CBOR_UNEXPECTED_EOF_IN_ARRAY:
71 return ToASCIIString("CBOR: unexpected eof in array");
72 case Error::CBOR_UNEXPECTED_EOF_IN_MAP:
73 return ToASCIIString("CBOR: unexpected eof in map");
74 case Error::CBOR_INVALID_MAP_KEY:
75 return ToASCIIString("CBOR: invalid map key");
76 case Error::CBOR_STACK_LIMIT_EXCEEDED:
77 return ToASCIIString("CBOR: stack limit exceeded");
78 case Error::CBOR_TRAILING_JUNK:
79 return ToASCIIString("CBOR: trailing junk");
80 case Error::CBOR_MAP_START_EXPECTED:
81 return ToASCIIString("CBOR: map start expected");
82 case Error::CBOR_MAP_STOP_EXPECTED:
83 return ToASCIIString("CBOR: map stop expected");
84 case Error::CBOR_ENVELOPE_SIZE_LIMIT_EXCEEDED:
85 return ToASCIIString("CBOR: envelope size limit exceeded");
86 }
87 // Some compilers can't figure out that we can't get here.
88 return "INVALID ERROR CODE";
89 }
90
ToASCIIString(const char * msg) const91 std::string Status::ToASCIIString(const char* msg) const {
92 return std::string(msg) + " at position " + std::to_string(pos);
93 }
94
95 namespace cbor {
96 namespace {
// Layout of the CBOR "initial byte": the major type lives in the three
// high-order bits, the additional information in the five low-order bits.

// Number of bits by which the initial byte, after masking with
// |kMajorTypeMask|, is shifted right to bring the major type into the
// lowermost bits.
static constexpr uint8_t kMajorTypeBitShift = 5;
// Selects the five low-order bits of the initial byte (the additional
// information).
static constexpr uint8_t kAdditionalInformationMask = 0x1F;
// Selects the three high-order bits of the initial byte (the major type).
static constexpr uint8_t kMajorTypeMask = 0xE0;

// Additional information values announcing that the integer is stored in
// the bytes following the initial byte:
// ... in the next byte.
static constexpr uint8_t kAdditionalInformation1Byte = 24;
// ... in the next 2 bytes.
static constexpr uint8_t kAdditionalInformation2Bytes = 25;
// ... in the next 4 bytes.
static constexpr uint8_t kAdditionalInformation4Bytes = 26;
// ... in the next 8 bytes.
static constexpr uint8_t kAdditionalInformation8Bytes = 27;
115
116 // Encodes the initial byte, consisting of the |type| in the first 3 bits
117 // followed by 5 bits of |additional_info|.
EncodeInitialByte(MajorType type,uint8_t additional_info)118 constexpr uint8_t EncodeInitialByte(MajorType type, uint8_t additional_info) {
119 return (static_cast<uint8_t>(type) << kMajorTypeBitShift) |
120 (additional_info & kAdditionalInformationMask);
121 }
122
123 // TAG 24 indicates that what follows is a byte string which is
124 // encoded in CBOR format. We use this as a wrapper for
125 // maps and arrays, allowing us to skip them, because the
126 // byte string carries its size (byte length).
127 // https://tools.ietf.org/html/rfc7049#section-2.4.4.1
128 static constexpr uint8_t kInitialByteForEnvelope =
129 EncodeInitialByte(MajorType::TAG, 24);
130 // The initial byte for a byte string with at most 2^32 bytes
131 // of payload. This is used for envelope encoding, even if
132 // the byte string is shorter.
133 static constexpr uint8_t kInitialByteFor32BitLengthByteString =
134 EncodeInitialByte(MajorType::BYTE_STRING, 26);
135
136 // See RFC 7049 Section 2.2.1, indefinite length arrays / maps have additional
137 // info = 31.
138 static constexpr uint8_t kInitialByteIndefiniteLengthArray =
139 EncodeInitialByte(MajorType::ARRAY, 31);
140 static constexpr uint8_t kInitialByteIndefiniteLengthMap =
141 EncodeInitialByte(MajorType::MAP, 31);
142 // See RFC 7049 Section 2.3, Table 1; this is used for finishing indefinite
143 // length maps / arrays.
144 static constexpr uint8_t kStopByte =
145 EncodeInitialByte(MajorType::SIMPLE_VALUE, 31);
146
147 // See RFC 7049 Section 2.3, Table 2.
148 static constexpr uint8_t kEncodedTrue =
149 EncodeInitialByte(MajorType::SIMPLE_VALUE, 21);
150 static constexpr uint8_t kEncodedFalse =
151 EncodeInitialByte(MajorType::SIMPLE_VALUE, 20);
152 static constexpr uint8_t kEncodedNull =
153 EncodeInitialByte(MajorType::SIMPLE_VALUE, 22);
154 static constexpr uint8_t kInitialByteForDouble =
155 EncodeInitialByte(MajorType::SIMPLE_VALUE, 27);
156
157 // See RFC 7049 Table 3 and Section 2.4.4.2. This is used as a prefix for
158 // arbitrary binary data encoded as BYTE_STRING.
159 static constexpr uint8_t kExpectedConversionToBase64Tag =
160 EncodeInitialByte(MajorType::TAG, 22);
161
// Appends the sizeof(T) bytes of |v| to |out| via push_back, beginning with
// the most significant byte.
// See also: https://commandcenter.blogspot.com/2012/04/byte-order-fallacy.html
template <typename T, class C>
void WriteBytesMostSignificantByteFirst(T v, C* out) {
  size_t bytes_left = sizeof(T);
  while (bytes_left > 0) {
    --bytes_left;
    // |bytes_left| is now the index of the byte to emit, counted from the
    // least significant end.
    out->push_back(0xff & (v >> (bytes_left * 8)));
  }
}
169
170 // Extracts sizeof(T) bytes from |in| to extract a value of type T
171 // (e.g. uint64_t, uint32_t, ...), most significant byte first.
172 // See also: https://commandcenter.blogspot.com/2012/04/byte-order-fallacy.html
173 template <typename T>
ReadBytesMostSignificantByteFirst(span<uint8_t> in)174 T ReadBytesMostSignificantByteFirst(span<uint8_t> in) {
175 assert(in.size() >= sizeof(T));
176 T result = 0;
177 for (size_t shift_bytes = 0; shift_bytes < sizeof(T); ++shift_bytes)
178 result |= T(in[sizeof(T) - 1 - shift_bytes]) << (shift_bytes * 8);
179 return result;
180 }
181 } // namespace
182
183 namespace internals {
184 // Reads the start of a token with definitive size from |bytes|.
185 // |type| is the major type as specified in RFC 7049 Section 2.1.
186 // |value| is the payload (e.g. for MajorType::UNSIGNED) or is the size
187 // (e.g. for BYTE_STRING).
188 // If successful, returns the number of bytes read. Otherwise returns -1.
189 // TODO(johannes): change return type to size_t and use 0 for error.
ReadTokenStart(span<uint8_t> bytes,MajorType * type,uint64_t * value)190 int8_t ReadTokenStart(span<uint8_t> bytes, MajorType* type, uint64_t* value) {
191 if (bytes.empty())
192 return -1;
193 uint8_t initial_byte = bytes[0];
194 *type = MajorType((initial_byte & kMajorTypeMask) >> kMajorTypeBitShift);
195
196 uint8_t additional_information = initial_byte & kAdditionalInformationMask;
197 if (additional_information < 24) {
198 // Values 0-23 are encoded directly into the additional info of the
199 // initial byte.
200 *value = additional_information;
201 return 1;
202 }
203 if (additional_information == kAdditionalInformation1Byte) {
204 // Values 24-255 are encoded with one initial byte, followed by the value.
205 if (bytes.size() < 2)
206 return -1;
207 *value = ReadBytesMostSignificantByteFirst<uint8_t>(bytes.subspan(1));
208 return 2;
209 }
210 if (additional_information == kAdditionalInformation2Bytes) {
211 // Values 256-65535: 1 initial byte + 2 bytes payload.
212 if (bytes.size() < 1 + sizeof(uint16_t))
213 return -1;
214 *value = ReadBytesMostSignificantByteFirst<uint16_t>(bytes.subspan(1));
215 return 3;
216 }
217 if (additional_information == kAdditionalInformation4Bytes) {
218 // 32 bit uint: 1 initial byte + 4 bytes payload.
219 if (bytes.size() < 1 + sizeof(uint32_t))
220 return -1;
221 *value = ReadBytesMostSignificantByteFirst<uint32_t>(bytes.subspan(1));
222 return 5;
223 }
224 if (additional_information == kAdditionalInformation8Bytes) {
225 // 64 bit uint: 1 initial byte + 8 bytes payload.
226 if (bytes.size() < 1 + sizeof(uint64_t))
227 return -1;
228 *value = ReadBytesMostSignificantByteFirst<uint64_t>(bytes.subspan(1));
229 return 9;
230 }
231 return -1;
232 }
233
234 // Writes the start of a token with |type|. The |value| may indicate the size,
235 // or it may be the payload if the value is an unsigned integer.
236 template <typename C>
WriteTokenStartTmpl(MajorType type,uint64_t value,C * encoded)237 void WriteTokenStartTmpl(MajorType type, uint64_t value, C* encoded) {
238 if (value < 24) {
239 // Values 0-23 are encoded directly into the additional info of the
240 // initial byte.
241 encoded->push_back(EncodeInitialByte(type, /*additional_info=*/value));
242 return;
243 }
244 if (value <= std::numeric_limits<uint8_t>::max()) {
245 // Values 24-255 are encoded with one initial byte, followed by the value.
246 encoded->push_back(EncodeInitialByte(type, kAdditionalInformation1Byte));
247 encoded->push_back(value);
248 return;
249 }
250 if (value <= std::numeric_limits<uint16_t>::max()) {
251 // Values 256-65535: 1 initial byte + 2 bytes payload.
252 encoded->push_back(EncodeInitialByte(type, kAdditionalInformation2Bytes));
253 WriteBytesMostSignificantByteFirst<uint16_t>(value, encoded);
254 return;
255 }
256 if (value <= std::numeric_limits<uint32_t>::max()) {
257 // 32 bit uint: 1 initial byte + 4 bytes payload.
258 encoded->push_back(EncodeInitialByte(type, kAdditionalInformation4Bytes));
259 WriteBytesMostSignificantByteFirst<uint32_t>(static_cast<uint32_t>(value),
260 encoded);
261 return;
262 }
263 // 64 bit uint: 1 initial byte + 8 bytes payload.
264 encoded->push_back(EncodeInitialByte(type, kAdditionalInformation8Bytes));
265 WriteBytesMostSignificantByteFirst<uint64_t>(value, encoded);
266 }
WriteTokenStart(MajorType type,uint64_t value,std::vector<uint8_t> * encoded)267 void WriteTokenStart(MajorType type,
268 uint64_t value,
269 std::vector<uint8_t>* encoded) {
270 WriteTokenStartTmpl(type, value, encoded);
271 }
WriteTokenStart(MajorType type,uint64_t value,std::string * encoded)272 void WriteTokenStart(MajorType type, uint64_t value, std::string* encoded) {
273 WriteTokenStartTmpl(type, value, encoded);
274 }
275 } // namespace internals
276
277 // =============================================================================
278 // Detecting CBOR content
279 // =============================================================================
280
InitialByteForEnvelope()281 uint8_t InitialByteForEnvelope() {
282 return kInitialByteForEnvelope;
283 }
InitialByteFor32BitLengthByteString()284 uint8_t InitialByteFor32BitLengthByteString() {
285 return kInitialByteFor32BitLengthByteString;
286 }
IsCBORMessage(span<uint8_t> msg)287 bool IsCBORMessage(span<uint8_t> msg) {
288 return msg.size() >= 6 && msg[0] == InitialByteForEnvelope() &&
289 msg[1] == InitialByteFor32BitLengthByteString();
290 }
291
292 // =============================================================================
// Encoding individual CBOR items
294 // =============================================================================
295
EncodeTrue()296 uint8_t EncodeTrue() {
297 return kEncodedTrue;
298 }
EncodeFalse()299 uint8_t EncodeFalse() {
300 return kEncodedFalse;
301 }
EncodeNull()302 uint8_t EncodeNull() {
303 return kEncodedNull;
304 }
305
EncodeIndefiniteLengthArrayStart()306 uint8_t EncodeIndefiniteLengthArrayStart() {
307 return kInitialByteIndefiniteLengthArray;
308 }
309
EncodeIndefiniteLengthMapStart()310 uint8_t EncodeIndefiniteLengthMapStart() {
311 return kInitialByteIndefiniteLengthMap;
312 }
313
EncodeStop()314 uint8_t EncodeStop() {
315 return kStopByte;
316 }
317
318 template <typename C>
EncodeInt32Tmpl(int32_t value,C * out)319 void EncodeInt32Tmpl(int32_t value, C* out) {
320 if (value >= 0) {
321 internals::WriteTokenStart(MajorType::UNSIGNED, value, out);
322 } else {
323 uint64_t representation = static_cast<uint64_t>(-(value + 1));
324 internals::WriteTokenStart(MajorType::NEGATIVE, representation, out);
325 }
326 }
EncodeInt32(int32_t value,std::vector<uint8_t> * out)327 void EncodeInt32(int32_t value, std::vector<uint8_t>* out) {
328 EncodeInt32Tmpl(value, out);
329 }
EncodeInt32(int32_t value,std::string * out)330 void EncodeInt32(int32_t value, std::string* out) {
331 EncodeInt32Tmpl(value, out);
332 }
333
334 template <typename C>
EncodeString16Tmpl(span<uint16_t> in,C * out)335 void EncodeString16Tmpl(span<uint16_t> in, C* out) {
336 uint64_t byte_length = static_cast<uint64_t>(in.size_bytes());
337 internals::WriteTokenStart(MajorType::BYTE_STRING, byte_length, out);
338 // When emitting UTF16 characters, we always write the least significant byte
339 // first; this is because it's the native representation for X86.
340 // TODO(johannes): Implement a more efficient thing here later, e.g.
341 // casting *iff* the machine has this byte order.
342 // The wire format for UTF16 chars will probably remain the same
343 // (least significant byte first) since this way we can have
344 // golden files, unittests, etc. that port easily and universally.
345 // See also:
346 // https://commandcenter.blogspot.com/2012/04/byte-order-fallacy.html
347 for (const uint16_t two_bytes : in) {
348 out->push_back(two_bytes);
349 out->push_back(two_bytes >> 8);
350 }
351 }
EncodeString16(span<uint16_t> in,std::vector<uint8_t> * out)352 void EncodeString16(span<uint16_t> in, std::vector<uint8_t>* out) {
353 EncodeString16Tmpl(in, out);
354 }
EncodeString16(span<uint16_t> in,std::string * out)355 void EncodeString16(span<uint16_t> in, std::string* out) {
356 EncodeString16Tmpl(in, out);
357 }
358
359 template <typename C>
EncodeString8Tmpl(span<uint8_t> in,C * out)360 void EncodeString8Tmpl(span<uint8_t> in, C* out) {
361 internals::WriteTokenStart(MajorType::STRING,
362 static_cast<uint64_t>(in.size_bytes()), out);
363 out->insert(out->end(), in.begin(), in.end());
364 }
EncodeString8(span<uint8_t> in,std::vector<uint8_t> * out)365 void EncodeString8(span<uint8_t> in, std::vector<uint8_t>* out) {
366 EncodeString8Tmpl(in, out);
367 }
EncodeString8(span<uint8_t> in,std::string * out)368 void EncodeString8(span<uint8_t> in, std::string* out) {
369 EncodeString8Tmpl(in, out);
370 }
371
372 template <typename C>
EncodeFromLatin1Tmpl(span<uint8_t> latin1,C * out)373 void EncodeFromLatin1Tmpl(span<uint8_t> latin1, C* out) {
374 for (size_t ii = 0; ii < latin1.size(); ++ii) {
375 if (latin1[ii] <= 127)
376 continue;
377 // If there's at least one non-ASCII char, convert to UTF8.
378 std::vector<uint8_t> utf8(latin1.begin(), latin1.begin() + ii);
379 for (; ii < latin1.size(); ++ii) {
380 if (latin1[ii] <= 127) {
381 utf8.push_back(latin1[ii]);
382 } else {
383 // 0xC0 means it's a UTF8 sequence with 2 bytes.
384 utf8.push_back((latin1[ii] >> 6) | 0xc0);
385 utf8.push_back((latin1[ii] | 0x80) & 0xbf);
386 }
387 }
388 EncodeString8(SpanFrom(utf8), out);
389 return;
390 }
391 EncodeString8(latin1, out);
392 }
EncodeFromLatin1(span<uint8_t> latin1,std::vector<uint8_t> * out)393 void EncodeFromLatin1(span<uint8_t> latin1, std::vector<uint8_t>* out) {
394 EncodeFromLatin1Tmpl(latin1, out);
395 }
EncodeFromLatin1(span<uint8_t> latin1,std::string * out)396 void EncodeFromLatin1(span<uint8_t> latin1, std::string* out) {
397 EncodeFromLatin1Tmpl(latin1, out);
398 }
399
400 template <typename C>
EncodeFromUTF16Tmpl(span<uint16_t> utf16,C * out)401 void EncodeFromUTF16Tmpl(span<uint16_t> utf16, C* out) {
402 // If there's at least one non-ASCII char, encode as STRING16 (UTF16).
403 for (uint16_t ch : utf16) {
404 if (ch <= 127)
405 continue;
406 EncodeString16(utf16, out);
407 return;
408 }
409 // It's all US-ASCII, strip out every second byte and encode as UTF8.
410 internals::WriteTokenStart(MajorType::STRING,
411 static_cast<uint64_t>(utf16.size()), out);
412 out->insert(out->end(), utf16.begin(), utf16.end());
413 }
EncodeFromUTF16(span<uint16_t> utf16,std::vector<uint8_t> * out)414 void EncodeFromUTF16(span<uint16_t> utf16, std::vector<uint8_t>* out) {
415 EncodeFromUTF16Tmpl(utf16, out);
416 }
EncodeFromUTF16(span<uint16_t> utf16,std::string * out)417 void EncodeFromUTF16(span<uint16_t> utf16, std::string* out) {
418 EncodeFromUTF16Tmpl(utf16, out);
419 }
420
421 template <typename C>
EncodeBinaryTmpl(span<uint8_t> in,C * out)422 void EncodeBinaryTmpl(span<uint8_t> in, C* out) {
423 out->push_back(kExpectedConversionToBase64Tag);
424 uint64_t byte_length = static_cast<uint64_t>(in.size_bytes());
425 internals::WriteTokenStart(MajorType::BYTE_STRING, byte_length, out);
426 out->insert(out->end(), in.begin(), in.end());
427 }
EncodeBinary(span<uint8_t> in,std::vector<uint8_t> * out)428 void EncodeBinary(span<uint8_t> in, std::vector<uint8_t>* out) {
429 EncodeBinaryTmpl(in, out);
430 }
EncodeBinary(span<uint8_t> in,std::string * out)431 void EncodeBinary(span<uint8_t> in, std::string* out) {
432 EncodeBinaryTmpl(in, out);
433 }
434
// Encoded size of a double: the initial byte (kInitialByteForDouble)
// followed by the 64 bits of payload for its value.
constexpr size_t kEncodedDoubleSize = sizeof(uint8_t) + sizeof(uint64_t);

// Encoded size of an envelope header: the initial byte
// (kInitialByteForEnvelope), the start byte for a BYTE_STRING with a 32 bit
// wide length, and the 32 bit length of that string.
constexpr size_t kEncodedEnvelopeHeaderSize =
    sizeof(uint8_t) + sizeof(uint8_t) + sizeof(uint32_t);
443
444 template <typename C>
EncodeDoubleTmpl(double value,C * out)445 void EncodeDoubleTmpl(double value, C* out) {
446 // The additional_info=27 indicates 64 bits for the double follow.
447 // See RFC 7049 Section 2.3, Table 1.
448 out->push_back(kInitialByteForDouble);
449 union {
450 double from_double;
451 uint64_t to_uint64;
452 } reinterpret;
453 reinterpret.from_double = value;
454 WriteBytesMostSignificantByteFirst<uint64_t>(reinterpret.to_uint64, out);
455 }
EncodeDouble(double value,std::vector<uint8_t> * out)456 void EncodeDouble(double value, std::vector<uint8_t>* out) {
457 EncodeDoubleTmpl(value, out);
458 }
EncodeDouble(double value,std::string * out)459 void EncodeDouble(double value, std::string* out) {
460 EncodeDoubleTmpl(value, out);
461 }
462
463 // =============================================================================
464 // cbor::EnvelopeEncoder - for wrapping submessages
465 // =============================================================================
466
467 template <typename C>
EncodeStartTmpl(C * out,size_t * byte_size_pos)468 void EncodeStartTmpl(C* out, size_t* byte_size_pos) {
469 assert(*byte_size_pos == 0);
470 out->push_back(kInitialByteForEnvelope);
471 out->push_back(kInitialByteFor32BitLengthByteString);
472 *byte_size_pos = out->size();
473 out->resize(out->size() + sizeof(uint32_t));
474 }
475
EncodeStart(std::vector<uint8_t> * out)476 void EnvelopeEncoder::EncodeStart(std::vector<uint8_t>* out) {
477 EncodeStartTmpl<std::vector<uint8_t>>(out, &byte_size_pos_);
478 }
479
EncodeStart(std::string * out)480 void EnvelopeEncoder::EncodeStart(std::string* out) {
481 EncodeStartTmpl<std::string>(out, &byte_size_pos_);
482 }
483
484 template <typename C>
EncodeStopTmpl(C * out,size_t * byte_size_pos)485 bool EncodeStopTmpl(C* out, size_t* byte_size_pos) {
486 assert(*byte_size_pos != 0);
487 // The byte size is the size of the payload, that is, all the
488 // bytes that were written past the byte size position itself.
489 uint64_t byte_size = out->size() - (*byte_size_pos + sizeof(uint32_t));
490 // We store exactly 4 bytes, so at most INT32MAX, with most significant
491 // byte first.
492 if (byte_size > std::numeric_limits<uint32_t>::max())
493 return false;
494 for (int shift_bytes = sizeof(uint32_t) - 1; shift_bytes >= 0;
495 --shift_bytes) {
496 (*out)[(*byte_size_pos)++] = 0xff & (byte_size >> (shift_bytes * 8));
497 }
498 return true;
499 }
500
EncodeStop(std::vector<uint8_t> * out)501 bool EnvelopeEncoder::EncodeStop(std::vector<uint8_t>* out) {
502 return EncodeStopTmpl(out, &byte_size_pos_);
503 }
504
EncodeStop(std::string * out)505 bool EnvelopeEncoder::EncodeStop(std::string* out) {
506 return EncodeStopTmpl(out, &byte_size_pos_);
507 }
508
509 // =============================================================================
510 // cbor::NewCBOREncoder - for encoding from a streaming parser
511 // =============================================================================
512
513 namespace {
514 template <typename C>
515 class CBOREncoder : public StreamingParserHandler {
516 public:
CBOREncoder(C * out,Status * status)517 CBOREncoder(C* out, Status* status) : out_(out), status_(status) {
518 *status_ = Status();
519 }
520
HandleMapBegin()521 void HandleMapBegin() override {
522 if (!status_->ok())
523 return;
524 envelopes_.emplace_back();
525 envelopes_.back().EncodeStart(out_);
526 out_->push_back(kInitialByteIndefiniteLengthMap);
527 }
528
HandleMapEnd()529 void HandleMapEnd() override {
530 if (!status_->ok())
531 return;
532 out_->push_back(kStopByte);
533 assert(!envelopes_.empty());
534 if (!envelopes_.back().EncodeStop(out_)) {
535 HandleError(
536 Status(Error::CBOR_ENVELOPE_SIZE_LIMIT_EXCEEDED, out_->size()));
537 return;
538 }
539 envelopes_.pop_back();
540 }
541
HandleArrayBegin()542 void HandleArrayBegin() override {
543 if (!status_->ok())
544 return;
545 envelopes_.emplace_back();
546 envelopes_.back().EncodeStart(out_);
547 out_->push_back(kInitialByteIndefiniteLengthArray);
548 }
549
HandleArrayEnd()550 void HandleArrayEnd() override {
551 if (!status_->ok())
552 return;
553 out_->push_back(kStopByte);
554 assert(!envelopes_.empty());
555 if (!envelopes_.back().EncodeStop(out_)) {
556 HandleError(
557 Status(Error::CBOR_ENVELOPE_SIZE_LIMIT_EXCEEDED, out_->size()));
558 return;
559 }
560 envelopes_.pop_back();
561 }
562
HandleString8(span<uint8_t> chars)563 void HandleString8(span<uint8_t> chars) override {
564 if (!status_->ok())
565 return;
566 EncodeString8(chars, out_);
567 }
568
HandleString16(span<uint16_t> chars)569 void HandleString16(span<uint16_t> chars) override {
570 if (!status_->ok())
571 return;
572 EncodeFromUTF16(chars, out_);
573 }
574
HandleBinary(span<uint8_t> bytes)575 void HandleBinary(span<uint8_t> bytes) override {
576 if (!status_->ok())
577 return;
578 EncodeBinary(bytes, out_);
579 }
580
HandleDouble(double value)581 void HandleDouble(double value) override {
582 if (!status_->ok())
583 return;
584 EncodeDouble(value, out_);
585 }
586
HandleInt32(int32_t value)587 void HandleInt32(int32_t value) override {
588 if (!status_->ok())
589 return;
590 EncodeInt32(value, out_);
591 }
592
HandleBool(bool value)593 void HandleBool(bool value) override {
594 if (!status_->ok())
595 return;
596 // See RFC 7049 Section 2.3, Table 2.
597 out_->push_back(value ? kEncodedTrue : kEncodedFalse);
598 }
599
HandleNull()600 void HandleNull() override {
601 if (!status_->ok())
602 return;
603 // See RFC 7049 Section 2.3, Table 2.
604 out_->push_back(kEncodedNull);
605 }
606
HandleError(Status error)607 void HandleError(Status error) override {
608 if (!status_->ok())
609 return;
610 *status_ = error;
611 out_->clear();
612 }
613
614 private:
615 C* out_;
616 std::vector<EnvelopeEncoder> envelopes_;
617 Status* status_;
618 };
619 } // namespace
620
NewCBOREncoder(std::vector<uint8_t> * out,Status * status)621 std::unique_ptr<StreamingParserHandler> NewCBOREncoder(
622 std::vector<uint8_t>* out,
623 Status* status) {
624 return std::unique_ptr<StreamingParserHandler>(
625 new CBOREncoder<std::vector<uint8_t>>(out, status));
626 }
NewCBOREncoder(std::string * out,Status * status)627 std::unique_ptr<StreamingParserHandler> NewCBOREncoder(std::string* out,
628 Status* status) {
629 return std::unique_ptr<StreamingParserHandler>(
630 new CBOREncoder<std::string>(out, status));
631 }
632
633 // =============================================================================
634 // cbor::CBORTokenizer - for parsing individual CBOR items
635 // =============================================================================
636
CBORTokenizer(span<uint8_t> bytes)637 CBORTokenizer::CBORTokenizer(span<uint8_t> bytes) : bytes_(bytes) {
638 ReadNextToken(/*enter_envelope=*/false);
639 }
~CBORTokenizer()640 CBORTokenizer::~CBORTokenizer() {}
641
TokenTag() const642 CBORTokenTag CBORTokenizer::TokenTag() const {
643 return token_tag_;
644 }
645
Next()646 void CBORTokenizer::Next() {
647 if (token_tag_ == CBORTokenTag::ERROR_VALUE ||
648 token_tag_ == CBORTokenTag::DONE)
649 return;
650 ReadNextToken(/*enter_envelope=*/false);
651 }
652
EnterEnvelope()653 void CBORTokenizer::EnterEnvelope() {
654 assert(token_tag_ == CBORTokenTag::ENVELOPE);
655 ReadNextToken(/*enter_envelope=*/true);
656 }
657
Status() const658 Status CBORTokenizer::Status() const {
659 return status_;
660 }
661
662 // The following accessor functions ::GetInt32, ::GetDouble,
663 // ::GetString8, ::GetString16WireRep, ::GetBinary, ::GetEnvelopeContents
664 // assume that a particular token was recognized in ::ReadNextToken.
665 // That's where all the error checking is done. By design,
666 // the accessors (assuming the token was recognized) never produce
667 // an error.
668
GetInt32() const669 int32_t CBORTokenizer::GetInt32() const {
670 assert(token_tag_ == CBORTokenTag::INT32);
671 // The range checks happen in ::ReadNextToken().
672 return static_cast<int32_t>(
673 token_start_type_ == MajorType::UNSIGNED
674 ? token_start_internal_value_
675 : -static_cast<int64_t>(token_start_internal_value_) - 1);
676 }
677
GetDouble() const678 double CBORTokenizer::GetDouble() const {
679 assert(token_tag_ == CBORTokenTag::DOUBLE);
680 union {
681 uint64_t from_uint64;
682 double to_double;
683 } reinterpret;
684 reinterpret.from_uint64 = ReadBytesMostSignificantByteFirst<uint64_t>(
685 bytes_.subspan(status_.pos + 1));
686 return reinterpret.to_double;
687 }
688
GetString8() const689 span<uint8_t> CBORTokenizer::GetString8() const {
690 assert(token_tag_ == CBORTokenTag::STRING8);
691 auto length = static_cast<size_t>(token_start_internal_value_);
692 return bytes_.subspan(status_.pos + (token_byte_length_ - length), length);
693 }
694
GetString16WireRep() const695 span<uint8_t> CBORTokenizer::GetString16WireRep() const {
696 assert(token_tag_ == CBORTokenTag::STRING16);
697 auto length = static_cast<size_t>(token_start_internal_value_);
698 return bytes_.subspan(status_.pos + (token_byte_length_ - length), length);
699 }
700
GetBinary() const701 span<uint8_t> CBORTokenizer::GetBinary() const {
702 assert(token_tag_ == CBORTokenTag::BINARY);
703 auto length = static_cast<size_t>(token_start_internal_value_);
704 return bytes_.subspan(status_.pos + (token_byte_length_ - length), length);
705 }
706
GetEnvelopeContents() const707 span<uint8_t> CBORTokenizer::GetEnvelopeContents() const {
708 assert(token_tag_ == CBORTokenTag::ENVELOPE);
709 auto length = static_cast<size_t>(token_start_internal_value_);
710 return bytes_.subspan(status_.pos + kEncodedEnvelopeHeaderSize, length);
711 }
712
713 // All error checking happens in ::ReadNextToken, so that the accessors
714 // can avoid having to carry an error return value.
715 //
716 // With respect to checking the encoded lengths of strings, arrays, etc:
717 // On the wire, CBOR uses 1,2,4, and 8 byte unsigned integers, so
718 // we initially read them as uint64_t, usually into token_start_internal_value_.
719 //
720 // However, since these containers have a representation on the machine,
721 // we need to do corresponding size computations on the input byte array,
722 // output span (e.g. the payload for a string), etc., and size_t is
723 // machine specific (in practice either 32 bit or 64 bit).
724 //
725 // Further, we must avoid overflowing size_t. Therefore, we use this
726 // kMaxValidLength constant to:
727 // - Reject values that are larger than the architecture specific
728 // max size_t (differs between 32 bit and 64 bit arch).
729 // - Reserve at least one bit so that we can check against overflows
730 // when adding lengths (array / string length / etc.); we do this by
731 // ensuring that the inputs to an addition are <= kMaxValidLength,
732 // and then checking whether the sum went past it.
733 //
734 // See also
735 // https://chromium.googlesource.com/chromium/src/+/master/docs/security/integer-semantics.md
// The smaller of the largest size_t value and a quarter of the uint64_t
// range.
static const uint64_t kMaxValidLength = std::min<uint64_t>(
    std::numeric_limits<size_t>::max(),
    std::numeric_limits<uint64_t>::max() >> 2);
739
ReadNextToken(bool enter_envelope)740 void CBORTokenizer::ReadNextToken(bool enter_envelope) {
741 if (enter_envelope) {
742 status_.pos += kEncodedEnvelopeHeaderSize;
743 } else {
744 status_.pos =
745 status_.pos == Status::npos() ? 0 : status_.pos + token_byte_length_;
746 }
747 status_.error = Error::OK;
748 if (status_.pos >= bytes_.size()) {
749 token_tag_ = CBORTokenTag::DONE;
750 return;
751 }
752 const size_t remaining_bytes = bytes_.size() - status_.pos;
753 switch (bytes_[status_.pos]) {
754 case kStopByte:
755 SetToken(CBORTokenTag::STOP, 1);
756 return;
757 case kInitialByteIndefiniteLengthMap:
758 SetToken(CBORTokenTag::MAP_START, 1);
759 return;
760 case kInitialByteIndefiniteLengthArray:
761 SetToken(CBORTokenTag::ARRAY_START, 1);
762 return;
763 case kEncodedTrue:
764 SetToken(CBORTokenTag::TRUE_VALUE, 1);
765 return;
766 case kEncodedFalse:
767 SetToken(CBORTokenTag::FALSE_VALUE, 1);
768 return;
769 case kEncodedNull:
770 SetToken(CBORTokenTag::NULL_VALUE, 1);
771 return;
772 case kExpectedConversionToBase64Tag: { // BINARY
773 const int8_t bytes_read = internals::ReadTokenStart(
774 bytes_.subspan(status_.pos + 1), &token_start_type_,
775 &token_start_internal_value_);
776 if (bytes_read < 0 || token_start_type_ != MajorType::BYTE_STRING ||
777 token_start_internal_value_ > kMaxValidLength) {
778 SetError(Error::CBOR_INVALID_BINARY);
779 return;
780 }
781 const uint64_t token_byte_length = token_start_internal_value_ +
782 /* tag before token start: */ 1 +
783 /* token start: */ bytes_read;
784 if (token_byte_length > remaining_bytes) {
785 SetError(Error::CBOR_INVALID_BINARY);
786 return;
787 }
788 SetToken(CBORTokenTag::BINARY, static_cast<size_t>(token_byte_length));
789 return;
790 }
791 case kInitialByteForDouble: { // DOUBLE
792 if (kEncodedDoubleSize > remaining_bytes) {
793 SetError(Error::CBOR_INVALID_DOUBLE);
794 return;
795 }
796 SetToken(CBORTokenTag::DOUBLE, kEncodedDoubleSize);
797 return;
798 }
799 case kInitialByteForEnvelope: { // ENVELOPE
800 if (kEncodedEnvelopeHeaderSize > remaining_bytes) {
801 SetError(Error::CBOR_INVALID_ENVELOPE);
802 return;
803 }
804 // The envelope must be a byte string with 32 bit length.
805 if (bytes_[status_.pos + 1] != kInitialByteFor32BitLengthByteString) {
806 SetError(Error::CBOR_INVALID_ENVELOPE);
807 return;
808 }
809 // Read the length of the byte string.
810 token_start_internal_value_ = ReadBytesMostSignificantByteFirst<uint32_t>(
811 bytes_.subspan(status_.pos + 2));
812 if (token_start_internal_value_ > kMaxValidLength) {
813 SetError(Error::CBOR_INVALID_ENVELOPE);
814 return;
815 }
816 uint64_t token_byte_length =
817 token_start_internal_value_ + kEncodedEnvelopeHeaderSize;
818 if (token_byte_length > remaining_bytes) {
819 SetError(Error::CBOR_INVALID_ENVELOPE);
820 return;
821 }
822 SetToken(CBORTokenTag::ENVELOPE, static_cast<size_t>(token_byte_length));
823 return;
824 }
825 default: {
826 const int8_t token_start_length = internals::ReadTokenStart(
827 bytes_.subspan(status_.pos), &token_start_type_,
828 &token_start_internal_value_);
829 const bool success = token_start_length >= 0;
830 switch (token_start_type_) {
831 case MajorType::UNSIGNED: // INT32.
832 // INT32 is a signed int32 (int32 makes sense for the
833 // inspector_protocol, it's not a CBOR limitation), so we check
834 // against the signed max, so that the allowable values are
835 // 0, 1, 2, ... 2^31 - 1.
836 if (!success || std::numeric_limits<int32_t>::max() <
837 token_start_internal_value_) {
838 SetError(Error::CBOR_INVALID_INT32);
839 return;
840 }
841 SetToken(CBORTokenTag::INT32, token_start_length);
842 return;
843 case MajorType::NEGATIVE: { // INT32.
844 // INT32 is a signed int32 (int32 makes sense for the
845 // inspector_protocol, it's not a CBOR limitation); in CBOR,
846 // the negative values for INT32 are represented as NEGATIVE,
847 // that is, -1 INT32 is represented as 1 << 5 | 0 (major type 1,
848 // additional info value 0). So here, we compute the INT32 value
849 // and then check it against the INT32 min.
850 int64_t actual_value =
851 -static_cast<int64_t>(token_start_internal_value_) - 1;
852 if (!success || actual_value < std::numeric_limits<int32_t>::min()) {
853 SetError(Error::CBOR_INVALID_INT32);
854 return;
855 }
856 SetToken(CBORTokenTag::INT32, token_start_length);
857 return;
858 }
859 case MajorType::STRING: { // STRING8.
860 if (!success || token_start_internal_value_ > kMaxValidLength) {
861 SetError(Error::CBOR_INVALID_STRING8);
862 return;
863 }
864 uint64_t token_byte_length =
865 token_start_internal_value_ + token_start_length;
866 if (token_byte_length > remaining_bytes) {
867 SetError(Error::CBOR_INVALID_STRING8);
868 return;
869 }
870 SetToken(CBORTokenTag::STRING8,
871 static_cast<size_t>(token_byte_length));
872 return;
873 }
874 case MajorType::BYTE_STRING: { // STRING16.
875 // Length must be divisible by 2 since UTF16 is 2 bytes per
876 // character, hence the &1 check.
877 if (!success || token_start_internal_value_ > kMaxValidLength ||
878 token_start_internal_value_ & 1) {
879 SetError(Error::CBOR_INVALID_STRING16);
880 return;
881 }
882 uint64_t token_byte_length =
883 token_start_internal_value_ + token_start_length;
884 if (token_byte_length > remaining_bytes) {
885 SetError(Error::CBOR_INVALID_STRING16);
886 return;
887 }
888 SetToken(CBORTokenTag::STRING16,
889 static_cast<size_t>(token_byte_length));
890 return;
891 }
892 case MajorType::ARRAY:
893 case MajorType::MAP:
894 case MajorType::TAG:
895 case MajorType::SIMPLE_VALUE:
896 SetError(Error::CBOR_UNSUPPORTED_VALUE);
897 return;
898 }
899 }
900 }
901 }
902
SetToken(CBORTokenTag token_tag,size_t token_byte_length)903 void CBORTokenizer::SetToken(CBORTokenTag token_tag, size_t token_byte_length) {
904 token_tag_ = token_tag;
905 token_byte_length_ = token_byte_length;
906 }
907
SetError(Error error)908 void CBORTokenizer::SetError(Error error) {
909 token_tag_ = CBORTokenTag::ERROR_VALUE;
910 status_.error = error;
911 }
912
913 // =============================================================================
914 // cbor::ParseCBOR - for receiving streaming parser events for CBOR messages
915 // =============================================================================
916
917 namespace {
// Upper bound on the nesting depth of maps and arrays while parsing CBOR;
// ParseValue reports CBOR_STACK_LIMIT_EXCEEDED beyond it.
constexpr int kStackLimit = 300;
921
922 // Below are three parsing routines for CBOR, which cover enough
923 // to roundtrip JSON messages.
924 bool ParseMap(int32_t stack_depth,
925 CBORTokenizer* tokenizer,
926 StreamingParserHandler* out);
927 bool ParseArray(int32_t stack_depth,
928 CBORTokenizer* tokenizer,
929 StreamingParserHandler* out);
930 bool ParseValue(int32_t stack_depth,
931 CBORTokenizer* tokenizer,
932 StreamingParserHandler* out);
933
ParseUTF16String(CBORTokenizer * tokenizer,StreamingParserHandler * out)934 void ParseUTF16String(CBORTokenizer* tokenizer, StreamingParserHandler* out) {
935 std::vector<uint16_t> value;
936 span<uint8_t> rep = tokenizer->GetString16WireRep();
937 for (size_t ii = 0; ii < rep.size(); ii += 2)
938 value.push_back((rep[ii + 1] << 8) | rep[ii]);
939 out->HandleString16(span<uint16_t>(value.data(), value.size()));
940 tokenizer->Next();
941 }
942
ParseUTF8String(CBORTokenizer * tokenizer,StreamingParserHandler * out)943 bool ParseUTF8String(CBORTokenizer* tokenizer, StreamingParserHandler* out) {
944 assert(tokenizer->TokenTag() == CBORTokenTag::STRING8);
945 out->HandleString8(tokenizer->GetString8());
946 tokenizer->Next();
947 return true;
948 }
949
ParseValue(int32_t stack_depth,CBORTokenizer * tokenizer,StreamingParserHandler * out)950 bool ParseValue(int32_t stack_depth,
951 CBORTokenizer* tokenizer,
952 StreamingParserHandler* out) {
953 if (stack_depth > kStackLimit) {
954 out->HandleError(
955 Status{Error::CBOR_STACK_LIMIT_EXCEEDED, tokenizer->Status().pos});
956 return false;
957 }
958 // Skip past the envelope to get to what's inside.
959 if (tokenizer->TokenTag() == CBORTokenTag::ENVELOPE)
960 tokenizer->EnterEnvelope();
961 switch (tokenizer->TokenTag()) {
962 case CBORTokenTag::ERROR_VALUE:
963 out->HandleError(tokenizer->Status());
964 return false;
965 case CBORTokenTag::DONE:
966 out->HandleError(Status{Error::CBOR_UNEXPECTED_EOF_EXPECTED_VALUE,
967 tokenizer->Status().pos});
968 return false;
969 case CBORTokenTag::TRUE_VALUE:
970 out->HandleBool(true);
971 tokenizer->Next();
972 return true;
973 case CBORTokenTag::FALSE_VALUE:
974 out->HandleBool(false);
975 tokenizer->Next();
976 return true;
977 case CBORTokenTag::NULL_VALUE:
978 out->HandleNull();
979 tokenizer->Next();
980 return true;
981 case CBORTokenTag::INT32:
982 out->HandleInt32(tokenizer->GetInt32());
983 tokenizer->Next();
984 return true;
985 case CBORTokenTag::DOUBLE:
986 out->HandleDouble(tokenizer->GetDouble());
987 tokenizer->Next();
988 return true;
989 case CBORTokenTag::STRING8:
990 return ParseUTF8String(tokenizer, out);
991 case CBORTokenTag::STRING16:
992 ParseUTF16String(tokenizer, out);
993 return true;
994 case CBORTokenTag::BINARY: {
995 out->HandleBinary(tokenizer->GetBinary());
996 tokenizer->Next();
997 return true;
998 }
999 case CBORTokenTag::MAP_START:
1000 return ParseMap(stack_depth + 1, tokenizer, out);
1001 case CBORTokenTag::ARRAY_START:
1002 return ParseArray(stack_depth + 1, tokenizer, out);
1003 default:
1004 out->HandleError(
1005 Status{Error::CBOR_UNSUPPORTED_VALUE, tokenizer->Status().pos});
1006 return false;
1007 }
1008 }
1009
1010 // |bytes| must start with the indefinite length array byte, so basically,
1011 // ParseArray may only be called after an indefinite length array has been
1012 // detected.
ParseArray(int32_t stack_depth,CBORTokenizer * tokenizer,StreamingParserHandler * out)1013 bool ParseArray(int32_t stack_depth,
1014 CBORTokenizer* tokenizer,
1015 StreamingParserHandler* out) {
1016 assert(tokenizer->TokenTag() == CBORTokenTag::ARRAY_START);
1017 tokenizer->Next();
1018 out->HandleArrayBegin();
1019 while (tokenizer->TokenTag() != CBORTokenTag::STOP) {
1020 if (tokenizer->TokenTag() == CBORTokenTag::DONE) {
1021 out->HandleError(
1022 Status{Error::CBOR_UNEXPECTED_EOF_IN_ARRAY, tokenizer->Status().pos});
1023 return false;
1024 }
1025 if (tokenizer->TokenTag() == CBORTokenTag::ERROR_VALUE) {
1026 out->HandleError(tokenizer->Status());
1027 return false;
1028 }
1029 // Parse value.
1030 if (!ParseValue(stack_depth, tokenizer, out))
1031 return false;
1032 }
1033 out->HandleArrayEnd();
1034 tokenizer->Next();
1035 return true;
1036 }
1037
1038 // |bytes| must start with the indefinite length array byte, so basically,
1039 // ParseArray may only be called after an indefinite length array has been
1040 // detected.
ParseMap(int32_t stack_depth,CBORTokenizer * tokenizer,StreamingParserHandler * out)1041 bool ParseMap(int32_t stack_depth,
1042 CBORTokenizer* tokenizer,
1043 StreamingParserHandler* out) {
1044 assert(tokenizer->TokenTag() == CBORTokenTag::MAP_START);
1045 out->HandleMapBegin();
1046 tokenizer->Next();
1047 while (tokenizer->TokenTag() != CBORTokenTag::STOP) {
1048 if (tokenizer->TokenTag() == CBORTokenTag::DONE) {
1049 out->HandleError(
1050 Status{Error::CBOR_UNEXPECTED_EOF_IN_MAP, tokenizer->Status().pos});
1051 return false;
1052 }
1053 if (tokenizer->TokenTag() == CBORTokenTag::ERROR_VALUE) {
1054 out->HandleError(tokenizer->Status());
1055 return false;
1056 }
1057 // Parse key.
1058 if (tokenizer->TokenTag() == CBORTokenTag::STRING8) {
1059 if (!ParseUTF8String(tokenizer, out))
1060 return false;
1061 } else if (tokenizer->TokenTag() == CBORTokenTag::STRING16) {
1062 ParseUTF16String(tokenizer, out);
1063 } else {
1064 out->HandleError(
1065 Status{Error::CBOR_INVALID_MAP_KEY, tokenizer->Status().pos});
1066 return false;
1067 }
1068 // Parse value.
1069 if (!ParseValue(stack_depth, tokenizer, out))
1070 return false;
1071 }
1072 out->HandleMapEnd();
1073 tokenizer->Next();
1074 return true;
1075 }
1076 } // namespace
1077
ParseCBOR(span<uint8_t> bytes,StreamingParserHandler * out)1078 void ParseCBOR(span<uint8_t> bytes, StreamingParserHandler* out) {
1079 if (bytes.empty()) {
1080 out->HandleError(Status{Error::CBOR_NO_INPUT, 0});
1081 return;
1082 }
1083 if (bytes[0] != kInitialByteForEnvelope) {
1084 out->HandleError(Status{Error::CBOR_INVALID_START_BYTE, 0});
1085 return;
1086 }
1087 CBORTokenizer tokenizer(bytes);
1088 if (tokenizer.TokenTag() == CBORTokenTag::ERROR_VALUE) {
1089 out->HandleError(tokenizer.Status());
1090 return;
1091 }
1092 // We checked for the envelope start byte above, so the tokenizer
1093 // must agree here, since it's not an error.
1094 assert(tokenizer.TokenTag() == CBORTokenTag::ENVELOPE);
1095 tokenizer.EnterEnvelope();
1096 if (tokenizer.TokenTag() != CBORTokenTag::MAP_START) {
1097 out->HandleError(
1098 Status{Error::CBOR_MAP_START_EXPECTED, tokenizer.Status().pos});
1099 return;
1100 }
1101 if (!ParseMap(/*stack_depth=*/1, &tokenizer, out))
1102 return;
1103 if (tokenizer.TokenTag() == CBORTokenTag::DONE)
1104 return;
1105 if (tokenizer.TokenTag() == CBORTokenTag::ERROR_VALUE) {
1106 out->HandleError(tokenizer.Status());
1107 return;
1108 }
1109 out->HandleError(Status{Error::CBOR_TRAILING_JUNK, tokenizer.Status().pos});
1110 }
1111
1112 // =============================================================================
// cbor::AppendString8EntryToCBORMap - for limited in-place editing of messages
1114 // =============================================================================
1115
1116 template <typename C>
AppendString8EntryToCBORMapTmpl(span<uint8_t> string8_key,span<uint8_t> string8_value,C * cbor)1117 Status AppendString8EntryToCBORMapTmpl(span<uint8_t> string8_key,
1118 span<uint8_t> string8_value,
1119 C* cbor) {
1120 // Careful below: Don't compare (*cbor)[idx] with a uint8_t, since
1121 // it could be a char (signed!). Instead, use bytes.
1122 span<uint8_t> bytes(reinterpret_cast<const uint8_t*>(cbor->data()),
1123 cbor->size());
1124 CBORTokenizer tokenizer(bytes);
1125 if (tokenizer.TokenTag() == CBORTokenTag::ERROR_VALUE)
1126 return tokenizer.Status();
1127 if (tokenizer.TokenTag() != CBORTokenTag::ENVELOPE)
1128 return Status(Error::CBOR_INVALID_ENVELOPE, 0);
1129 size_t envelope_size = tokenizer.GetEnvelopeContents().size();
1130 size_t old_size = cbor->size();
1131 if (old_size != envelope_size + kEncodedEnvelopeHeaderSize)
1132 return Status(Error::CBOR_INVALID_ENVELOPE, 0);
1133 if (envelope_size == 0 ||
1134 (tokenizer.GetEnvelopeContents()[0] != EncodeIndefiniteLengthMapStart()))
1135 return Status(Error::CBOR_MAP_START_EXPECTED, kEncodedEnvelopeHeaderSize);
1136 if (bytes[bytes.size() - 1] != EncodeStop())
1137 return Status(Error::CBOR_MAP_STOP_EXPECTED, cbor->size() - 1);
1138 cbor->pop_back();
1139 EncodeString8(string8_key, cbor);
1140 EncodeString8(string8_value, cbor);
1141 cbor->push_back(EncodeStop());
1142 size_t new_envelope_size = envelope_size + (cbor->size() - old_size);
1143 if (new_envelope_size > std::numeric_limits<uint32_t>::max())
1144 return Status(Error::CBOR_ENVELOPE_SIZE_LIMIT_EXCEEDED, 0);
1145 size_t size_pos = cbor->size() - new_envelope_size - sizeof(uint32_t);
1146 uint8_t* out = reinterpret_cast<uint8_t*>(&cbor->at(size_pos));
1147 *(out++) = (new_envelope_size >> 24) & 0xff;
1148 *(out++) = (new_envelope_size >> 16) & 0xff;
1149 *(out++) = (new_envelope_size >> 8) & 0xff;
1150 *(out) = new_envelope_size & 0xff;
1151 return Status();
1152 }
AppendString8EntryToCBORMap(span<uint8_t> string8_key,span<uint8_t> string8_value,std::vector<uint8_t> * cbor)1153 Status AppendString8EntryToCBORMap(span<uint8_t> string8_key,
1154 span<uint8_t> string8_value,
1155 std::vector<uint8_t>* cbor) {
1156 return AppendString8EntryToCBORMapTmpl(string8_key, string8_value, cbor);
1157 }
AppendString8EntryToCBORMap(span<uint8_t> string8_key,span<uint8_t> string8_value,std::string * cbor)1158 Status AppendString8EntryToCBORMap(span<uint8_t> string8_key,
1159 span<uint8_t> string8_value,
1160 std::string* cbor) {
1161 return AppendString8EntryToCBORMapTmpl(string8_key, string8_value, cbor);
1162 }
1163 } // namespace cbor
1164
1165 namespace json {
1166
1167 // =============================================================================
1168 // json::NewJSONEncoder - for encoding streaming parser events as JSON
1169 // =============================================================================
1170
1171 namespace {
// Appends |value| to |out| as exactly four lowercase hexadecimal digits,
// most significant digit first.
template <typename C>
void PrintHex(uint16_t value, C* out) {
  static const char kHexDigits[] = "0123456789abcdef";
  // Walk the four nibbles from the top (bits 15..12) down to bits 3..0.
  for (int shift = 12; shift >= 0; shift -= 4)
    out->push_back(kHexDigits[(value >> shift) & 0xf]);
}
1180
1181 // In the writer below, we maintain a stack of State instances.
1182 // It is just enough to emit the appropriate delimiters and brackets
1183 // in JSON.
// The kind of JSON container, if any, that an encoder State describes.
enum class Container {
  NONE,   // Used for the top-level, initial state.
  MAP,    // Inside a JSON object.
  ARRAY,  // Inside a JSON array.
};
1192 class State {
1193 public:
State(Container container)1194 explicit State(Container container) : container_(container) {}
StartElement(std::vector<uint8_t> * out)1195 void StartElement(std::vector<uint8_t>* out) { StartElementTmpl(out); }
StartElement(std::string * out)1196 void StartElement(std::string* out) { StartElementTmpl(out); }
container() const1197 Container container() const { return container_; }
1198
1199 private:
1200 template <typename C>
StartElementTmpl(C * out)1201 void StartElementTmpl(C* out) {
1202 assert(container_ != Container::NONE || size_ == 0);
1203 if (size_ != 0) {
1204 char delim = (!(size_ & 1) || container_ == Container::ARRAY) ? ',' : ':';
1205 out->push_back(delim);
1206 }
1207 ++size_;
1208 }
1209
1210 Container container_ = Container::NONE;
1211 int size_ = 0;
1212 };
1213
// The 64 output characters of Base64, indexed by their 6 bit value.
constexpr char kBase64Table[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789+/";
1217
1218 template <typename C>
Base64Encode(const span<uint8_t> & in,C * out)1219 void Base64Encode(const span<uint8_t>& in, C* out) {
1220 // The following three cases are based on the tables in the example
1221 // section in https://en.wikipedia.org/wiki/Base64. We process three
1222 // input bytes at a time, emitting 4 output bytes at a time.
1223 size_t ii = 0;
1224
1225 // While possible, process three input bytes.
1226 for (; ii + 3 <= in.size(); ii += 3) {
1227 uint32_t twentyfour_bits = (in[ii] << 16) | (in[ii + 1] << 8) | in[ii + 2];
1228 out->push_back(kBase64Table[(twentyfour_bits >> 18)]);
1229 out->push_back(kBase64Table[(twentyfour_bits >> 12) & 0x3f]);
1230 out->push_back(kBase64Table[(twentyfour_bits >> 6) & 0x3f]);
1231 out->push_back(kBase64Table[twentyfour_bits & 0x3f]);
1232 }
1233 if (ii + 2 <= in.size()) { // Process two input bytes.
1234 uint32_t twentyfour_bits = (in[ii] << 16) | (in[ii + 1] << 8);
1235 out->push_back(kBase64Table[(twentyfour_bits >> 18)]);
1236 out->push_back(kBase64Table[(twentyfour_bits >> 12) & 0x3f]);
1237 out->push_back(kBase64Table[(twentyfour_bits >> 6) & 0x3f]);
1238 out->push_back('='); // Emit padding.
1239 return;
1240 }
1241 if (ii + 1 <= in.size()) { // Process a single input byte.
1242 uint32_t twentyfour_bits = (in[ii] << 16);
1243 out->push_back(kBase64Table[(twentyfour_bits >> 18)]);
1244 out->push_back(kBase64Table[(twentyfour_bits >> 12) & 0x3f]);
1245 out->push_back('='); // Emit padding.
1246 out->push_back('='); // Emit padding.
1247 }
1248 }
1249
1250 // Implements a handler for JSON parser events to emit a JSON string.
1251 template <typename C>
1252 class JSONEncoder : public StreamingParserHandler {
1253 public:
JSONEncoder(const Platform * platform,C * out,Status * status)1254 JSONEncoder(const Platform* platform, C* out, Status* status)
1255 : platform_(platform), out_(out), status_(status) {
1256 *status_ = Status();
1257 state_.emplace(Container::NONE);
1258 }
1259
HandleMapBegin()1260 void HandleMapBegin() override {
1261 if (!status_->ok())
1262 return;
1263 assert(!state_.empty());
1264 state_.top().StartElement(out_);
1265 state_.emplace(Container::MAP);
1266 Emit('{');
1267 }
1268
HandleMapEnd()1269 void HandleMapEnd() override {
1270 if (!status_->ok())
1271 return;
1272 assert(state_.size() >= 2 && state_.top().container() == Container::MAP);
1273 state_.pop();
1274 Emit('}');
1275 }
1276
HandleArrayBegin()1277 void HandleArrayBegin() override {
1278 if (!status_->ok())
1279 return;
1280 state_.top().StartElement(out_);
1281 state_.emplace(Container::ARRAY);
1282 Emit('[');
1283 }
1284
HandleArrayEnd()1285 void HandleArrayEnd() override {
1286 if (!status_->ok())
1287 return;
1288 assert(state_.size() >= 2 && state_.top().container() == Container::ARRAY);
1289 state_.pop();
1290 Emit(']');
1291 }
1292
HandleString16(span<uint16_t> chars)1293 void HandleString16(span<uint16_t> chars) override {
1294 if (!status_->ok())
1295 return;
1296 state_.top().StartElement(out_);
1297 Emit('"');
1298 for (const uint16_t ch : chars) {
1299 if (ch == '"') {
1300 Emit("\\\"");
1301 } else if (ch == '\\') {
1302 Emit("\\\\");
1303 } else if (ch == '\b') {
1304 Emit("\\b");
1305 } else if (ch == '\f') {
1306 Emit("\\f");
1307 } else if (ch == '\n') {
1308 Emit("\\n");
1309 } else if (ch == '\r') {
1310 Emit("\\r");
1311 } else if (ch == '\t') {
1312 Emit("\\t");
1313 } else if (ch >= 32 && ch <= 126) {
1314 Emit(ch);
1315 } else {
1316 Emit("\\u");
1317 PrintHex(ch, out_);
1318 }
1319 }
1320 Emit('"');
1321 }
1322
HandleString8(span<uint8_t> chars)1323 void HandleString8(span<uint8_t> chars) override {
1324 if (!status_->ok())
1325 return;
1326 state_.top().StartElement(out_);
1327 Emit('"');
1328 for (size_t ii = 0; ii < chars.size(); ++ii) {
1329 uint8_t c = chars[ii];
1330 if (c == '"') {
1331 Emit("\\\"");
1332 } else if (c == '\\') {
1333 Emit("\\\\");
1334 } else if (c == '\b') {
1335 Emit("\\b");
1336 } else if (c == '\f') {
1337 Emit("\\f");
1338 } else if (c == '\n') {
1339 Emit("\\n");
1340 } else if (c == '\r') {
1341 Emit("\\r");
1342 } else if (c == '\t') {
1343 Emit("\\t");
1344 } else if (c >= 32 && c <= 126) {
1345 Emit(c);
1346 } else if (c < 32) {
1347 Emit("\\u");
1348 PrintHex(static_cast<uint16_t>(c), out_);
1349 } else {
1350 // Inspect the leading byte to figure out how long the utf8
1351 // byte sequence is; while doing this initialize |codepoint|
1352 // with the first few bits.
1353 // See table in: https://en.wikipedia.org/wiki/UTF-8
1354 // byte one is 110x xxxx -> 2 byte utf8 sequence
1355 // byte one is 1110 xxxx -> 3 byte utf8 sequence
1356 // byte one is 1111 0xxx -> 4 byte utf8 sequence
1357 uint32_t codepoint;
1358 int num_bytes_left;
1359 if ((c & 0xe0) == 0xc0) { // 2 byte utf8 sequence
1360 num_bytes_left = 1;
1361 codepoint = c & 0x1f;
1362 } else if ((c & 0xf0) == 0xe0) { // 3 byte utf8 sequence
1363 num_bytes_left = 2;
1364 codepoint = c & 0x0f;
1365 } else if ((c & 0xf8) == 0xf0) { // 4 byte utf8 sequence
1366 codepoint = c & 0x07;
1367 num_bytes_left = 3;
1368 } else {
1369 continue; // invalid leading byte
1370 }
1371
1372 // If we have enough bytes in our input, decode the remaining ones
1373 // belonging to this Unicode character into |codepoint|.
1374 if (ii + num_bytes_left > chars.size())
1375 continue;
1376 while (num_bytes_left > 0) {
1377 c = chars[++ii];
1378 --num_bytes_left;
1379 // Check the next byte is a continuation byte, that is 10xx xxxx.
1380 if ((c & 0xc0) != 0x80)
1381 continue;
1382 codepoint = (codepoint << 6) | (c & 0x3f);
1383 }
1384
1385 // Disallow overlong encodings for ascii characters, as these
1386 // would include " and other characters significant to JSON
1387 // string termination / control.
1388 if (codepoint < 0x7f)
1389 continue;
1390 // Invalid in UTF8, and can't be represented in UTF16 anyway.
1391 if (codepoint > 0x10ffff)
1392 continue;
1393
1394 // So, now we transcode to UTF16,
1395 // using the math described at https://en.wikipedia.org/wiki/UTF-16,
1396 // for either one or two 16 bit characters.
1397 if (codepoint < 0xffff) {
1398 Emit("\\u");
1399 PrintHex(static_cast<uint16_t>(codepoint), out_);
1400 continue;
1401 }
1402 codepoint -= 0x10000;
1403 // high surrogate
1404 Emit("\\u");
1405 PrintHex(static_cast<uint16_t>((codepoint >> 10) + 0xd800), out_);
1406 // low surrogate
1407 Emit("\\u");
1408 PrintHex(static_cast<uint16_t>((codepoint & 0x3ff) + 0xdc00), out_);
1409 }
1410 }
1411 Emit('"');
1412 }
1413
HandleBinary(span<uint8_t> bytes)1414 void HandleBinary(span<uint8_t> bytes) override {
1415 if (!status_->ok())
1416 return;
1417 state_.top().StartElement(out_);
1418 Emit('"');
1419 Base64Encode(bytes, out_);
1420 Emit('"');
1421 }
1422
HandleDouble(double value)1423 void HandleDouble(double value) override {
1424 if (!status_->ok())
1425 return;
1426 state_.top().StartElement(out_);
1427 // JSON cannot represent NaN or Infinity. So, for compatibility,
1428 // we behave like the JSON object in web browsers: emit 'null'.
1429 if (!std::isfinite(value)) {
1430 Emit("null");
1431 return;
1432 }
1433 std::unique_ptr<char[]> str_value = platform_->DToStr(value);
1434
1435 // DToStr may fail to emit a 0 before the decimal dot. E.g. this is
1436 // the case in base::NumberToString in Chromium (which is based on
1437 // dmg_fp). So, much like
1438 // https://cs.chromium.org/chromium/src/base/json/json_writer.cc
1439 // we probe for this and emit the leading 0 anyway if necessary.
1440 const char* chars = str_value.get();
1441 if (chars[0] == '.') {
1442 Emit('0');
1443 } else if (chars[0] == '-' && chars[1] == '.') {
1444 Emit("-0");
1445 ++chars;
1446 }
1447 Emit(chars);
1448 }
1449
HandleInt32(int32_t value)1450 void HandleInt32(int32_t value) override {
1451 if (!status_->ok())
1452 return;
1453 state_.top().StartElement(out_);
1454 Emit(std::to_string(value));
1455 }
1456
HandleBool(bool value)1457 void HandleBool(bool value) override {
1458 if (!status_->ok())
1459 return;
1460 state_.top().StartElement(out_);
1461 Emit(value ? "true" : "false");
1462 }
1463
HandleNull()1464 void HandleNull() override {
1465 if (!status_->ok())
1466 return;
1467 state_.top().StartElement(out_);
1468 Emit("null");
1469 }
1470
HandleError(Status error)1471 void HandleError(Status error) override {
1472 assert(!error.ok());
1473 *status_ = error;
1474 out_->clear();
1475 }
1476
1477 private:
Emit(char c)1478 void Emit(char c) { out_->push_back(c); }
Emit(const char * str)1479 void Emit(const char* str) {
1480 out_->insert(out_->end(), str, str + strlen(str));
1481 }
Emit(const std::string & str)1482 void Emit(const std::string& str) {
1483 out_->insert(out_->end(), str.begin(), str.end());
1484 }
1485
1486 const Platform* platform_;
1487 C* out_;
1488 Status* status_;
1489 std::stack<State> state_;
1490 };
1491 } // namespace
1492
NewJSONEncoder(const Platform * platform,std::vector<uint8_t> * out,Status * status)1493 std::unique_ptr<StreamingParserHandler> NewJSONEncoder(
1494 const Platform* platform,
1495 std::vector<uint8_t>* out,
1496 Status* status) {
1497 return std::unique_ptr<StreamingParserHandler>(
1498 new JSONEncoder<std::vector<uint8_t>>(platform, out, status));
1499 }
NewJSONEncoder(const Platform * platform,std::string * out,Status * status)1500 std::unique_ptr<StreamingParserHandler> NewJSONEncoder(const Platform* platform,
1501 std::string* out,
1502 Status* status) {
1503 return std::unique_ptr<StreamingParserHandler>(
1504 new JSONEncoder<std::string>(platform, out, status));
1505 }
1506
1507 // =============================================================================
1508 // json::ParseJSON - for receiving streaming parser events for JSON.
1509 // =============================================================================
1510
1511 namespace {
// NOTE(review): presumably the maximum nesting depth accepted by the JSON
// parser (cf. Error::JSON_PARSER_STACK_LIMIT_EXCEEDED) - confirm at its use.
constexpr int kStackLimit = 300;
1513
// The kinds of token that JsonParser::ParseToken can return.
enum Token {
  // Container delimiters.
  ObjectBegin,
  ObjectEnd,
  ArrayBegin,
  ArrayEnd,
  // Values.
  StringLiteral,
  Number,
  BoolTrue,
  BoolFalse,
  NullToken,
  // Separators.
  ListSeparator,
  ObjectPairSeparator,
  // Failure and end-of-input markers.
  InvalidToken,
  NoInput,
};
1529
// Spellings of the three JSON literal names, \0 terminated.
constexpr const char* kNullString = "null";
constexpr const char* kTrueString = "true";
constexpr const char* kFalseString = "false";
1533
1534 template <typename Char>
1535 class JsonParser {
1536 public:
JsonParser(const Platform * platform,StreamingParserHandler * handler)1537 JsonParser(const Platform* platform, StreamingParserHandler* handler)
1538 : platform_(platform), handler_(handler) {}
1539
Parse(const Char * start,size_t length)1540 void Parse(const Char* start, size_t length) {
1541 start_pos_ = start;
1542 const Char* end = start + length;
1543 const Char* tokenEnd = nullptr;
1544 ParseValue(start, end, &tokenEnd, 0);
1545 if (error_)
1546 return;
1547 if (tokenEnd != end) {
1548 HandleError(Error::JSON_PARSER_UNPROCESSED_INPUT_REMAINS, tokenEnd);
1549 }
1550 }
1551
1552 private:
CharsToDouble(const uint16_t * chars,size_t length,double * result)1553 bool CharsToDouble(const uint16_t* chars, size_t length, double* result) {
1554 std::string buffer;
1555 buffer.reserve(length + 1);
1556 for (size_t ii = 0; ii < length; ++ii) {
1557 bool is_ascii = !(chars[ii] & ~0x7F);
1558 if (!is_ascii)
1559 return false;
1560 buffer.push_back(static_cast<char>(chars[ii]));
1561 }
1562 return platform_->StrToD(buffer.c_str(), result);
1563 }
1564
CharsToDouble(const uint8_t * chars,size_t length,double * result)1565 bool CharsToDouble(const uint8_t* chars, size_t length, double* result) {
1566 std::string buffer(reinterpret_cast<const char*>(chars), length);
1567 return platform_->StrToD(buffer.c_str(), result);
1568 }
1569
// Checks whether the input [start, end) begins with the literal |token|.
// On a match, stores the position just past the literal in |*token_end| and
// returns true; otherwise returns false and leaves |*token_end| untouched.
static bool ParseConstToken(const Char* start,
                            const Char* end,
                            const Char** token_end,
                            const char* token) {
  // |token| is \0 terminated, it's one of the constants at top of the file.
  // Only step past characters that actually matched. Advancing as part of
  // the comparison would also step past a mismatching final character, and
  // input such as "nulx" would then be taken for "null".
  while (start < end && *token != '\0' && *start == *token) {
    ++start;
    ++token;
  }
  // Anything left in |token| means a mismatch or premature end of input.
  if (*token != '\0')
    return false;
  *token_end = start;
  return true;
}
1582
ReadInt(const Char * start,const Char * end,const Char ** token_end,bool allow_leading_zeros)1583 static bool ReadInt(const Char* start,
1584 const Char* end,
1585 const Char** token_end,
1586 bool allow_leading_zeros) {
1587 if (start == end)
1588 return false;
1589 bool has_leading_zero = '0' == *start;
1590 int length = 0;
1591 while (start < end && '0' <= *start && *start <= '9') {
1592 ++start;
1593 ++length;
1594 }
1595 if (!length)
1596 return false;
1597 if (!allow_leading_zeros && length > 1 && has_leading_zero)
1598 return false;
1599 *token_end = start;
1600 return true;
1601 }
1602
ParseNumberToken(const Char * start,const Char * end,const Char ** token_end)1603 static bool ParseNumberToken(const Char* start,
1604 const Char* end,
1605 const Char** token_end) {
1606 // We just grab the number here. We validate the size in DecodeNumber.
1607 // According to RFC4627, a valid number is: [minus] int [frac] [exp]
1608 if (start == end)
1609 return false;
1610 Char c = *start;
1611 if ('-' == c)
1612 ++start;
1613
1614 if (!ReadInt(start, end, &start, /*allow_leading_zeros=*/false))
1615 return false;
1616 if (start == end) {
1617 *token_end = start;
1618 return true;
1619 }
1620
1621 // Optional fraction part
1622 c = *start;
1623 if ('.' == c) {
1624 ++start;
1625 if (!ReadInt(start, end, &start, /*allow_leading_zeros=*/true))
1626 return false;
1627 if (start == end) {
1628 *token_end = start;
1629 return true;
1630 }
1631 c = *start;
1632 }
1633
1634 // Optional exponent part
1635 if ('e' == c || 'E' == c) {
1636 ++start;
1637 if (start == end)
1638 return false;
1639 c = *start;
1640 if ('-' == c || '+' == c) {
1641 ++start;
1642 if (start == end)
1643 return false;
1644 }
1645 if (!ReadInt(start, end, &start, /*allow_leading_zeros=*/true))
1646 return false;
1647 }
1648
1649 *token_end = start;
1650 return true;
1651 }
1652
ReadHexDigits(const Char * start,const Char * end,const Char ** token_end,int digits)1653 static bool ReadHexDigits(const Char* start,
1654 const Char* end,
1655 const Char** token_end,
1656 int digits) {
1657 if (end - start < digits)
1658 return false;
1659 for (int i = 0; i < digits; ++i) {
1660 Char c = *start++;
1661 if (!(('0' <= c && c <= '9') || ('a' <= c && c <= 'f') ||
1662 ('A' <= c && c <= 'F')))
1663 return false;
1664 }
1665 *token_end = start;
1666 return true;
1667 }
1668
ParseStringToken(const Char * start,const Char * end,const Char ** token_end)1669 static bool ParseStringToken(const Char* start,
1670 const Char* end,
1671 const Char** token_end) {
1672 while (start < end) {
1673 Char c = *start++;
1674 if ('\\' == c) {
1675 if (start == end)
1676 return false;
1677 c = *start++;
1678 // Make sure the escaped char is valid.
1679 switch (c) {
1680 case 'x':
1681 if (!ReadHexDigits(start, end, &start, 2))
1682 return false;
1683 break;
1684 case 'u':
1685 if (!ReadHexDigits(start, end, &start, 4))
1686 return false;
1687 break;
1688 case '\\':
1689 case '/':
1690 case 'b':
1691 case 'f':
1692 case 'n':
1693 case 'r':
1694 case 't':
1695 case 'v':
1696 case '"':
1697 break;
1698 default:
1699 return false;
1700 }
1701 } else if ('"' == c) {
1702 *token_end = start;
1703 return true;
1704 }
1705 }
1706 return false;
1707 }
1708
SkipComment(const Char * start,const Char * end,const Char ** comment_end)1709 static bool SkipComment(const Char* start,
1710 const Char* end,
1711 const Char** comment_end) {
1712 if (start == end)
1713 return false;
1714
1715 if (*start != '/' || start + 1 >= end)
1716 return false;
1717 ++start;
1718
1719 if (*start == '/') {
1720 // Single line comment, read to newline.
1721 for (++start; start < end; ++start) {
1722 if (*start == '\n' || *start == '\r') {
1723 *comment_end = start + 1;
1724 return true;
1725 }
1726 }
1727 *comment_end = end;
1728 // Comment reaches end-of-input, which is fine.
1729 return true;
1730 }
1731
1732 if (*start == '*') {
1733 Char previous = '\0';
1734 // Block comment, read until end marker.
1735 for (++start; start < end; previous = *start++) {
1736 if (previous == '*' && *start == '/') {
1737 *comment_end = start + 1;
1738 return true;
1739 }
1740 }
1741 // Block comment must close before end-of-input.
1742 return false;
1743 }
1744
1745 return false;
1746 }
1747
IsSpaceOrNewLine(Char c)1748 static bool IsSpaceOrNewLine(Char c) {
1749 // \v = vertial tab; \f = form feed page break.
1750 return c == ' ' || c == '\n' || c == '\v' || c == '\f' || c == '\r' ||
1751 c == '\t';
1752 }
1753
SkipWhitespaceAndComments(const Char * start,const Char * end,const Char ** whitespace_end)1754 static void SkipWhitespaceAndComments(const Char* start,
1755 const Char* end,
1756 const Char** whitespace_end) {
1757 while (start < end) {
1758 if (IsSpaceOrNewLine(*start)) {
1759 ++start;
1760 } else if (*start == '/') {
1761 const Char* comment_end = nullptr;
1762 if (!SkipComment(start, end, &comment_end))
1763 break;
1764 start = comment_end;
1765 } else {
1766 break;
1767 }
1768 }
1769 *whitespace_end = start;
1770 }
1771
ParseToken(const Char * start,const Char * end,const Char ** tokenStart,const Char ** token_end)1772 static Token ParseToken(const Char* start,
1773 const Char* end,
1774 const Char** tokenStart,
1775 const Char** token_end) {
1776 SkipWhitespaceAndComments(start, end, tokenStart);
1777 start = *tokenStart;
1778
1779 if (start == end)
1780 return NoInput;
1781
1782 switch (*start) {
1783 case 'n':
1784 if (ParseConstToken(start, end, token_end, kNullString))
1785 return NullToken;
1786 break;
1787 case 't':
1788 if (ParseConstToken(start, end, token_end, kTrueString))
1789 return BoolTrue;
1790 break;
1791 case 'f':
1792 if (ParseConstToken(start, end, token_end, kFalseString))
1793 return BoolFalse;
1794 break;
1795 case '[':
1796 *token_end = start + 1;
1797 return ArrayBegin;
1798 case ']':
1799 *token_end = start + 1;
1800 return ArrayEnd;
1801 case ',':
1802 *token_end = start + 1;
1803 return ListSeparator;
1804 case '{':
1805 *token_end = start + 1;
1806 return ObjectBegin;
1807 case '}':
1808 *token_end = start + 1;
1809 return ObjectEnd;
1810 case ':':
1811 *token_end = start + 1;
1812 return ObjectPairSeparator;
1813 case '0':
1814 case '1':
1815 case '2':
1816 case '3':
1817 case '4':
1818 case '5':
1819 case '6':
1820 case '7':
1821 case '8':
1822 case '9':
1823 case '-':
1824 if (ParseNumberToken(start, end, token_end))
1825 return Number;
1826 break;
1827 case '"':
1828 if (ParseStringToken(start + 1, end, token_end))
1829 return StringLiteral;
1830 break;
1831 }
1832 return InvalidToken;
1833 }
1834
HexToInt(Char c)1835 static int HexToInt(Char c) {
1836 if ('0' <= c && c <= '9')
1837 return c - '0';
1838 if ('A' <= c && c <= 'F')
1839 return c - 'A' + 10;
1840 if ('a' <= c && c <= 'f')
1841 return c - 'a' + 10;
1842 assert(false); // Unreachable.
1843 return 0;
1844 }
1845
DecodeString(const Char * start,const Char * end,std::vector<uint16_t> * output)1846 static bool DecodeString(const Char* start,
1847 const Char* end,
1848 std::vector<uint16_t>* output) {
1849 if (start == end)
1850 return true;
1851 if (start > end)
1852 return false;
1853 output->reserve(end - start);
1854 while (start < end) {
1855 uint16_t c = *start++;
1856 // If the |Char| we're dealing with is really a byte, then
1857 // we have utf8 here, and we need to check for multibyte characters
1858 // and transcode them to utf16 (either one or two utf16 chars).
1859 if (sizeof(Char) == sizeof(uint8_t) && c >= 0x7f) {
1860 // Inspect the leading byte to figure out how long the utf8
1861 // byte sequence is; while doing this initialize |codepoint|
1862 // with the first few bits.
1863 // See table in: https://en.wikipedia.org/wiki/UTF-8
1864 // byte one is 110x xxxx -> 2 byte utf8 sequence
1865 // byte one is 1110 xxxx -> 3 byte utf8 sequence
1866 // byte one is 1111 0xxx -> 4 byte utf8 sequence
1867 uint32_t codepoint;
1868 int num_bytes_left;
1869 if ((c & 0xe0) == 0xc0) { // 2 byte utf8 sequence
1870 num_bytes_left = 1;
1871 codepoint = c & 0x1f;
1872 } else if ((c & 0xf0) == 0xe0) { // 3 byte utf8 sequence
1873 num_bytes_left = 2;
1874 codepoint = c & 0x0f;
1875 } else if ((c & 0xf8) == 0xf0) { // 4 byte utf8 sequence
1876 codepoint = c & 0x07;
1877 num_bytes_left = 3;
1878 } else {
1879 return false; // invalid leading byte
1880 }
1881
1882 // If we have enough bytes in our inpput, decode the remaining ones
1883 // belonging to this Unicode character into |codepoint|.
1884 if (start + num_bytes_left > end)
1885 return false;
1886 while (num_bytes_left > 0) {
1887 c = *start++;
1888 --num_bytes_left;
1889 // Check the next byte is a continuation byte, that is 10xx xxxx.
1890 if ((c & 0xc0) != 0x80)
1891 return false;
1892 codepoint = (codepoint << 6) | (c & 0x3f);
1893 }
1894
1895 // Disallow overlong encodings for ascii characters, as these
1896 // would include " and other characters significant to JSON
1897 // string termination / control.
1898 if (codepoint < 0x7f)
1899 return false;
1900 // Invalid in UTF8, and can't be represented in UTF16 anyway.
1901 if (codepoint > 0x10ffff)
1902 return false;
1903
1904 // So, now we transcode to UTF16,
1905 // using the math described at https://en.wikipedia.org/wiki/UTF-16,
1906 // for either one or two 16 bit characters.
1907 if (codepoint < 0xffff) {
1908 output->push_back(codepoint);
1909 continue;
1910 }
1911 codepoint -= 0x10000;
1912 output->push_back((codepoint >> 10) + 0xd800); // high surrogate
1913 output->push_back((codepoint & 0x3ff) + 0xdc00); // low surrogate
1914 continue;
1915 }
1916 if ('\\' != c) {
1917 output->push_back(c);
1918 continue;
1919 }
1920 if (start == end)
1921 return false;
1922 c = *start++;
1923
1924 if (c == 'x') {
1925 // \x is not supported.
1926 return false;
1927 }
1928
1929 switch (c) {
1930 case '"':
1931 case '/':
1932 case '\\':
1933 break;
1934 case 'b':
1935 c = '\b';
1936 break;
1937 case 'f':
1938 c = '\f';
1939 break;
1940 case 'n':
1941 c = '\n';
1942 break;
1943 case 'r':
1944 c = '\r';
1945 break;
1946 case 't':
1947 c = '\t';
1948 break;
1949 case 'v':
1950 c = '\v';
1951 break;
1952 case 'u':
1953 c = (HexToInt(*start) << 12) + (HexToInt(*(start + 1)) << 8) +
1954 (HexToInt(*(start + 2)) << 4) + HexToInt(*(start + 3));
1955 start += 4;
1956 break;
1957 default:
1958 return false;
1959 }
1960 output->push_back(c);
1961 }
1962 return true;
1963 }
1964
ParseValue(const Char * start,const Char * end,const Char ** value_token_end,int depth)1965 void ParseValue(const Char* start,
1966 const Char* end,
1967 const Char** value_token_end,
1968 int depth) {
1969 if (depth > kStackLimit) {
1970 HandleError(Error::JSON_PARSER_STACK_LIMIT_EXCEEDED, start);
1971 return;
1972 }
1973 const Char* token_start = nullptr;
1974 const Char* token_end = nullptr;
1975 Token token = ParseToken(start, end, &token_start, &token_end);
1976 switch (token) {
1977 case NoInput:
1978 HandleError(Error::JSON_PARSER_NO_INPUT, token_start);
1979 return;
1980 case InvalidToken:
1981 HandleError(Error::JSON_PARSER_INVALID_TOKEN, token_start);
1982 return;
1983 case NullToken:
1984 handler_->HandleNull();
1985 break;
1986 case BoolTrue:
1987 handler_->HandleBool(true);
1988 break;
1989 case BoolFalse:
1990 handler_->HandleBool(false);
1991 break;
1992 case Number: {
1993 double value;
1994 if (!CharsToDouble(token_start, token_end - token_start, &value)) {
1995 HandleError(Error::JSON_PARSER_INVALID_NUMBER, token_start);
1996 return;
1997 }
1998 if (value >= std::numeric_limits<int32_t>::min() &&
1999 value <= std::numeric_limits<int32_t>::max() &&
2000 static_cast<int32_t>(value) == value)
2001 handler_->HandleInt32(static_cast<int32_t>(value));
2002 else
2003 handler_->HandleDouble(value);
2004 break;
2005 }
2006 case StringLiteral: {
2007 std::vector<uint16_t> value;
2008 bool ok = DecodeString(token_start + 1, token_end - 1, &value);
2009 if (!ok) {
2010 HandleError(Error::JSON_PARSER_INVALID_STRING, token_start);
2011 return;
2012 }
2013 handler_->HandleString16(span<uint16_t>(value.data(), value.size()));
2014 break;
2015 }
2016 case ArrayBegin: {
2017 handler_->HandleArrayBegin();
2018 start = token_end;
2019 token = ParseToken(start, end, &token_start, &token_end);
2020 while (token != ArrayEnd) {
2021 ParseValue(start, end, &token_end, depth + 1);
2022 if (error_)
2023 return;
2024
2025 // After a list value, we expect a comma or the end of the list.
2026 start = token_end;
2027 token = ParseToken(start, end, &token_start, &token_end);
2028 if (token == ListSeparator) {
2029 start = token_end;
2030 token = ParseToken(start, end, &token_start, &token_end);
2031 if (token == ArrayEnd) {
2032 HandleError(Error::JSON_PARSER_UNEXPECTED_ARRAY_END, token_start);
2033 return;
2034 }
2035 } else if (token != ArrayEnd) {
2036 // Unexpected value after list value. Bail out.
2037 HandleError(Error::JSON_PARSER_COMMA_OR_ARRAY_END_EXPECTED,
2038 token_start);
2039 return;
2040 }
2041 }
2042 handler_->HandleArrayEnd();
2043 break;
2044 }
2045 case ObjectBegin: {
2046 handler_->HandleMapBegin();
2047 start = token_end;
2048 token = ParseToken(start, end, &token_start, &token_end);
2049 while (token != ObjectEnd) {
2050 if (token != StringLiteral) {
2051 HandleError(Error::JSON_PARSER_STRING_LITERAL_EXPECTED,
2052 token_start);
2053 return;
2054 }
2055 std::vector<uint16_t> key;
2056 if (!DecodeString(token_start + 1, token_end - 1, &key)) {
2057 HandleError(Error::JSON_PARSER_INVALID_STRING, token_start);
2058 return;
2059 }
2060 handler_->HandleString16(span<uint16_t>(key.data(), key.size()));
2061 start = token_end;
2062
2063 token = ParseToken(start, end, &token_start, &token_end);
2064 if (token != ObjectPairSeparator) {
2065 HandleError(Error::JSON_PARSER_COLON_EXPECTED, token_start);
2066 return;
2067 }
2068 start = token_end;
2069
2070 ParseValue(start, end, &token_end, depth + 1);
2071 if (error_)
2072 return;
2073 start = token_end;
2074
2075 // After a key/value pair, we expect a comma or the end of the
2076 // object.
2077 token = ParseToken(start, end, &token_start, &token_end);
2078 if (token == ListSeparator) {
2079 start = token_end;
2080 token = ParseToken(start, end, &token_start, &token_end);
2081 if (token == ObjectEnd) {
2082 HandleError(Error::JSON_PARSER_UNEXPECTED_MAP_END, token_start);
2083 return;
2084 }
2085 } else if (token != ObjectEnd) {
2086 // Unexpected value after last object value. Bail out.
2087 HandleError(Error::JSON_PARSER_COMMA_OR_MAP_END_EXPECTED,
2088 token_start);
2089 return;
2090 }
2091 }
2092 handler_->HandleMapEnd();
2093 break;
2094 }
2095
2096 default:
2097 // We got a token that's not a value.
2098 HandleError(Error::JSON_PARSER_VALUE_EXPECTED, token_start);
2099 return;
2100 }
2101
2102 SkipWhitespaceAndComments(token_end, end, value_token_end);
2103 }
2104
HandleError(Error error,const Char * pos)2105 void HandleError(Error error, const Char* pos) {
2106 assert(error != Error::OK);
2107 if (!error_) {
2108 handler_->HandleError(
2109 Status{error, static_cast<size_t>(pos - start_pos_)});
2110 error_ = true;
2111 }
2112 }
2113
2114 const Char* start_pos_ = nullptr;
2115 bool error_ = false;
2116 const Platform* platform_;
2117 StreamingParserHandler* handler_;
2118 };
2119 } // namespace
2120
ParseJSON(const Platform & platform,span<uint8_t> chars,StreamingParserHandler * handler)2121 void ParseJSON(const Platform& platform,
2122 span<uint8_t> chars,
2123 StreamingParserHandler* handler) {
2124 JsonParser<uint8_t> parser(&platform, handler);
2125 parser.Parse(chars.data(), chars.size());
2126 }
2127
ParseJSON(const Platform & platform,span<uint16_t> chars,StreamingParserHandler * handler)2128 void ParseJSON(const Platform& platform,
2129 span<uint16_t> chars,
2130 StreamingParserHandler* handler) {
2131 JsonParser<uint16_t> parser(&platform, handler);
2132 parser.Parse(chars.data(), chars.size());
2133 }
2134
2135 // =============================================================================
2136 // json::ConvertCBORToJSON, json::ConvertJSONToCBOR - for transcoding
2137 // =============================================================================
2138 template <typename C>
ConvertCBORToJSONTmpl(const Platform & platform,span<uint8_t> cbor,C * json)2139 Status ConvertCBORToJSONTmpl(const Platform& platform,
2140 span<uint8_t> cbor,
2141 C* json) {
2142 Status status;
2143 std::unique_ptr<StreamingParserHandler> json_writer =
2144 NewJSONEncoder(&platform, json, &status);
2145 cbor::ParseCBOR(cbor, json_writer.get());
2146 return status;
2147 }
2148
ConvertCBORToJSON(const Platform & platform,span<uint8_t> cbor,std::vector<uint8_t> * json)2149 Status ConvertCBORToJSON(const Platform& platform,
2150 span<uint8_t> cbor,
2151 std::vector<uint8_t>* json) {
2152 return ConvertCBORToJSONTmpl(platform, cbor, json);
2153 }
ConvertCBORToJSON(const Platform & platform,span<uint8_t> cbor,std::string * json)2154 Status ConvertCBORToJSON(const Platform& platform,
2155 span<uint8_t> cbor,
2156 std::string* json) {
2157 return ConvertCBORToJSONTmpl(platform, cbor, json);
2158 }
2159
2160 template <typename T, typename C>
ConvertJSONToCBORTmpl(const Platform & platform,span<T> json,C * cbor)2161 Status ConvertJSONToCBORTmpl(const Platform& platform, span<T> json, C* cbor) {
2162 Status status;
2163 std::unique_ptr<StreamingParserHandler> encoder =
2164 cbor::NewCBOREncoder(cbor, &status);
2165 ParseJSON(platform, json, encoder.get());
2166 return status;
2167 }
ConvertJSONToCBOR(const Platform & platform,span<uint8_t> json,std::string * cbor)2168 Status ConvertJSONToCBOR(const Platform& platform,
2169 span<uint8_t> json,
2170 std::string* cbor) {
2171 return ConvertJSONToCBORTmpl(platform, json, cbor);
2172 }
ConvertJSONToCBOR(const Platform & platform,span<uint16_t> json,std::string * cbor)2173 Status ConvertJSONToCBOR(const Platform& platform,
2174 span<uint16_t> json,
2175 std::string* cbor) {
2176 return ConvertJSONToCBORTmpl(platform, json, cbor);
2177 }
ConvertJSONToCBOR(const Platform & platform,span<uint8_t> json,std::vector<uint8_t> * cbor)2178 Status ConvertJSONToCBOR(const Platform& platform,
2179 span<uint8_t> json,
2180 std::vector<uint8_t>* cbor) {
2181 return ConvertJSONToCBORTmpl(platform, json, cbor);
2182 }
ConvertJSONToCBOR(const Platform & platform,span<uint16_t> json,std::vector<uint8_t> * cbor)2183 Status ConvertJSONToCBOR(const Platform& platform,
2184 span<uint16_t> json,
2185 std::vector<uint8_t>* cbor) {
2186 return ConvertJSONToCBORTmpl(platform, json, cbor);
2187 }
2188 } // namespace json
2189 } // namespace v8_inspector_protocol_encoding
2190