1 // Copyright 2019 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "encoding.h"
6
7 #include <algorithm>
8 #include <cassert>
9 #include <cmath>
10 #include <cstring>
11 #include <limits>
12 #include <stack>
13
14 namespace v8_inspector_protocol_encoding {
15 // =============================================================================
16 // Status and Error codes
17 // =============================================================================
18
ToASCIIString() const19 std::string Status::ToASCIIString() const {
20 switch (error) {
21 case Error::OK:
22 return "OK";
23 case Error::JSON_PARSER_UNPROCESSED_INPUT_REMAINS:
24 return ToASCIIString("JSON: unprocessed input remains");
25 case Error::JSON_PARSER_STACK_LIMIT_EXCEEDED:
26 return ToASCIIString("JSON: stack limit exceeded");
27 case Error::JSON_PARSER_NO_INPUT:
28 return ToASCIIString("JSON: no input");
29 case Error::JSON_PARSER_INVALID_TOKEN:
30 return ToASCIIString("JSON: invalid token");
31 case Error::JSON_PARSER_INVALID_NUMBER:
32 return ToASCIIString("JSON: invalid number");
33 case Error::JSON_PARSER_INVALID_STRING:
34 return ToASCIIString("JSON: invalid string");
35 case Error::JSON_PARSER_UNEXPECTED_ARRAY_END:
36 return ToASCIIString("JSON: unexpected array end");
37 case Error::JSON_PARSER_COMMA_OR_ARRAY_END_EXPECTED:
38 return ToASCIIString("JSON: comma or array end expected");
39 case Error::JSON_PARSER_STRING_LITERAL_EXPECTED:
40 return ToASCIIString("JSON: string literal expected");
41 case Error::JSON_PARSER_COLON_EXPECTED:
42 return ToASCIIString("JSON: colon expected");
43 case Error::JSON_PARSER_UNEXPECTED_MAP_END:
44 return ToASCIIString("JSON: unexpected map end");
45 case Error::JSON_PARSER_COMMA_OR_MAP_END_EXPECTED:
46 return ToASCIIString("JSON: comma or map end expected");
47 case Error::JSON_PARSER_VALUE_EXPECTED:
48 return ToASCIIString("JSON: value expected");
49
50 case Error::CBOR_INVALID_INT32:
51 return ToASCIIString("CBOR: invalid int32");
52 case Error::CBOR_INVALID_DOUBLE:
53 return ToASCIIString("CBOR: invalid double");
54 case Error::CBOR_INVALID_ENVELOPE:
55 return ToASCIIString("CBOR: invalid envelope");
56 case Error::CBOR_INVALID_STRING8:
57 return ToASCIIString("CBOR: invalid string8");
58 case Error::CBOR_INVALID_STRING16:
59 return ToASCIIString("CBOR: invalid string16");
60 case Error::CBOR_INVALID_BINARY:
61 return ToASCIIString("CBOR: invalid binary");
62 case Error::CBOR_UNSUPPORTED_VALUE:
63 return ToASCIIString("CBOR: unsupported value");
64 case Error::CBOR_NO_INPUT:
65 return ToASCIIString("CBOR: no input");
66 case Error::CBOR_INVALID_START_BYTE:
67 return ToASCIIString("CBOR: invalid start byte");
68 case Error::CBOR_UNEXPECTED_EOF_EXPECTED_VALUE:
69 return ToASCIIString("CBOR: unexpected eof expected value");
70 case Error::CBOR_UNEXPECTED_EOF_IN_ARRAY:
71 return ToASCIIString("CBOR: unexpected eof in array");
72 case Error::CBOR_UNEXPECTED_EOF_IN_MAP:
73 return ToASCIIString("CBOR: unexpected eof in map");
74 case Error::CBOR_INVALID_MAP_KEY:
75 return ToASCIIString("CBOR: invalid map key");
76 case Error::CBOR_STACK_LIMIT_EXCEEDED:
77 return ToASCIIString("CBOR: stack limit exceeded");
78 case Error::CBOR_TRAILING_JUNK:
79 return ToASCIIString("CBOR: trailing junk");
80 case Error::CBOR_MAP_START_EXPECTED:
81 return ToASCIIString("CBOR: map start expected");
82 case Error::CBOR_MAP_STOP_EXPECTED:
83 return ToASCIIString("CBOR: map stop expected");
84 case Error::CBOR_ENVELOPE_SIZE_LIMIT_EXCEEDED:
85 return ToASCIIString("CBOR: envelope size limit exceeded");
86 }
87 // Some compilers can't figure out that we can't get here.
88 return "INVALID ERROR CODE";
89 }
90
ToASCIIString(const char * msg) const91 std::string Status::ToASCIIString(const char* msg) const {
92 return std::string(msg) + " at position " + std::to_string(pos);
93 }
94
95 namespace cbor {
96 namespace {
// Layout of the CBOR "initial byte": the high-order 3 bits carry the major
// type, the low-order 5 bits carry the additional information.

// Selects the major type bits of the initial byte.
static constexpr uint8_t kMajorTypeMask = 0xE0;
// Right shift which, applied after |kMajorTypeMask|, moves the major type
// into the lowermost bits.
static constexpr uint8_t kMajorTypeBitShift = 5;
// Selects the additional information bits of the initial byte.
static constexpr uint8_t kAdditionalInformationMask = 0x1F;

// Additional information values announcing how many bytes following the
// initial byte hold the integer.
static constexpr uint8_t kAdditionalInformation1Byte = 24;   // next byte
static constexpr uint8_t kAdditionalInformation2Bytes = 25;  // next 2 bytes
static constexpr uint8_t kAdditionalInformation4Bytes = 26;  // next 4 bytes
static constexpr uint8_t kAdditionalInformation8Bytes = 27;  // next 8 bytes
115
116 // Encodes the initial byte, consisting of the |type| in the first 3 bits
117 // followed by 5 bits of |additional_info|.
EncodeInitialByte(MajorType type,uint8_t additional_info)118 constexpr uint8_t EncodeInitialByte(MajorType type, uint8_t additional_info) {
119 return (static_cast<uint8_t>(type) << kMajorTypeBitShift) |
120 (additional_info & kAdditionalInformationMask);
121 }
122
123 // TAG 24 indicates that what follows is a byte string which is
124 // encoded in CBOR format. We use this as a wrapper for
125 // maps and arrays, allowing us to skip them, because the
126 // byte string carries its size (byte length).
127 // https://tools.ietf.org/html/rfc7049#section-2.4.4.1
128 static constexpr uint8_t kInitialByteForEnvelope =
129 EncodeInitialByte(MajorType::TAG, 24);
130 // The initial byte for a byte string with at most 2^32 bytes
131 // of payload. This is used for envelope encoding, even if
132 // the byte string is shorter.
133 static constexpr uint8_t kInitialByteFor32BitLengthByteString =
134 EncodeInitialByte(MajorType::BYTE_STRING, 26);
135
136 // See RFC 7049 Section 2.2.1, indefinite length arrays / maps have additional
137 // info = 31.
138 static constexpr uint8_t kInitialByteIndefiniteLengthArray =
139 EncodeInitialByte(MajorType::ARRAY, 31);
140 static constexpr uint8_t kInitialByteIndefiniteLengthMap =
141 EncodeInitialByte(MajorType::MAP, 31);
142 // See RFC 7049 Section 2.3, Table 1; this is used for finishing indefinite
143 // length maps / arrays.
144 static constexpr uint8_t kStopByte =
145 EncodeInitialByte(MajorType::SIMPLE_VALUE, 31);
146
147 // See RFC 7049 Section 2.3, Table 2.
148 static constexpr uint8_t kEncodedTrue =
149 EncodeInitialByte(MajorType::SIMPLE_VALUE, 21);
150 static constexpr uint8_t kEncodedFalse =
151 EncodeInitialByte(MajorType::SIMPLE_VALUE, 20);
152 static constexpr uint8_t kEncodedNull =
153 EncodeInitialByte(MajorType::SIMPLE_VALUE, 22);
154 static constexpr uint8_t kInitialByteForDouble =
155 EncodeInitialByte(MajorType::SIMPLE_VALUE, 27);
156
157 // See RFC 7049 Table 3 and Section 2.4.4.2. This is used as a prefix for
158 // arbitrary binary data encoded as BYTE_STRING.
159 static constexpr uint8_t kExpectedConversionToBase64Tag =
160 EncodeInitialByte(MajorType::TAG, 22);
161
// Appends the sizeof(T) bytes of |v| to |out| via push_back, most
// significant byte first. The bytes are obtained by shifting, so the result
// does not depend on the byte order of the machine.
// See also: https://commandcenter.blogspot.com/2012/04/byte-order-fallacy.html
template <typename T, class C>
void WriteBytesMostSignificantByteFirst(T v, C* out) {
  size_t bytes_left = sizeof(T);
  while (bytes_left > 0) {
    --bytes_left;
    out->push_back(0xff & (v >> (bytes_left * 8)));
  }
}
169
170 // Extracts sizeof(T) bytes from |in| to extract a value of type T
171 // (e.g. uint64_t, uint32_t, ...), most significant byte first.
172 // See also: https://commandcenter.blogspot.com/2012/04/byte-order-fallacy.html
173 template <typename T>
ReadBytesMostSignificantByteFirst(span<uint8_t> in)174 T ReadBytesMostSignificantByteFirst(span<uint8_t> in) {
175 assert(in.size() >= sizeof(T));
176 T result = 0;
177 for (size_t shift_bytes = 0; shift_bytes < sizeof(T); ++shift_bytes)
178 result |= T(in[sizeof(T) - 1 - shift_bytes]) << (shift_bytes * 8);
179 return result;
180 }
181 } // namespace
182
183 namespace internals {
184 // Reads the start of a token with definitive size from |bytes|.
185 // |type| is the major type as specified in RFC 7049 Section 2.1.
186 // |value| is the payload (e.g. for MajorType::UNSIGNED) or is the size
187 // (e.g. for BYTE_STRING).
188 // If successful, returns the number of bytes read. Otherwise returns 0.
ReadTokenStart(span<uint8_t> bytes,MajorType * type,uint64_t * value)189 size_t ReadTokenStart(span<uint8_t> bytes, MajorType* type, uint64_t* value) {
190 if (bytes.empty())
191 return 0;
192 uint8_t initial_byte = bytes[0];
193 *type = MajorType((initial_byte & kMajorTypeMask) >> kMajorTypeBitShift);
194
195 uint8_t additional_information = initial_byte & kAdditionalInformationMask;
196 if (additional_information < 24) {
197 // Values 0-23 are encoded directly into the additional info of the
198 // initial byte.
199 *value = additional_information;
200 return 1;
201 }
202 if (additional_information == kAdditionalInformation1Byte) {
203 // Values 24-255 are encoded with one initial byte, followed by the value.
204 if (bytes.size() < 2)
205 return 0;
206 *value = ReadBytesMostSignificantByteFirst<uint8_t>(bytes.subspan(1));
207 return 2;
208 }
209 if (additional_information == kAdditionalInformation2Bytes) {
210 // Values 256-65535: 1 initial byte + 2 bytes payload.
211 if (bytes.size() < 1 + sizeof(uint16_t))
212 return 0;
213 *value = ReadBytesMostSignificantByteFirst<uint16_t>(bytes.subspan(1));
214 return 3;
215 }
216 if (additional_information == kAdditionalInformation4Bytes) {
217 // 32 bit uint: 1 initial byte + 4 bytes payload.
218 if (bytes.size() < 1 + sizeof(uint32_t))
219 return 0;
220 *value = ReadBytesMostSignificantByteFirst<uint32_t>(bytes.subspan(1));
221 return 5;
222 }
223 if (additional_information == kAdditionalInformation8Bytes) {
224 // 64 bit uint: 1 initial byte + 8 bytes payload.
225 if (bytes.size() < 1 + sizeof(uint64_t))
226 return 0;
227 *value = ReadBytesMostSignificantByteFirst<uint64_t>(bytes.subspan(1));
228 return 9;
229 }
230 return 0;
231 }
232
233 // Writes the start of a token with |type|. The |value| may indicate the size,
234 // or it may be the payload if the value is an unsigned integer.
235 template <typename C>
WriteTokenStartTmpl(MajorType type,uint64_t value,C * encoded)236 void WriteTokenStartTmpl(MajorType type, uint64_t value, C* encoded) {
237 if (value < 24) {
238 // Values 0-23 are encoded directly into the additional info of the
239 // initial byte.
240 encoded->push_back(EncodeInitialByte(type, /*additional_info=*/value));
241 return;
242 }
243 if (value <= std::numeric_limits<uint8_t>::max()) {
244 // Values 24-255 are encoded with one initial byte, followed by the value.
245 encoded->push_back(EncodeInitialByte(type, kAdditionalInformation1Byte));
246 encoded->push_back(value);
247 return;
248 }
249 if (value <= std::numeric_limits<uint16_t>::max()) {
250 // Values 256-65535: 1 initial byte + 2 bytes payload.
251 encoded->push_back(EncodeInitialByte(type, kAdditionalInformation2Bytes));
252 WriteBytesMostSignificantByteFirst<uint16_t>(value, encoded);
253 return;
254 }
255 if (value <= std::numeric_limits<uint32_t>::max()) {
256 // 32 bit uint: 1 initial byte + 4 bytes payload.
257 encoded->push_back(EncodeInitialByte(type, kAdditionalInformation4Bytes));
258 WriteBytesMostSignificantByteFirst<uint32_t>(static_cast<uint32_t>(value),
259 encoded);
260 return;
261 }
262 // 64 bit uint: 1 initial byte + 8 bytes payload.
263 encoded->push_back(EncodeInitialByte(type, kAdditionalInformation8Bytes));
264 WriteBytesMostSignificantByteFirst<uint64_t>(value, encoded);
265 }
WriteTokenStart(MajorType type,uint64_t value,std::vector<uint8_t> * encoded)266 void WriteTokenStart(MajorType type,
267 uint64_t value,
268 std::vector<uint8_t>* encoded) {
269 WriteTokenStartTmpl(type, value, encoded);
270 }
WriteTokenStart(MajorType type,uint64_t value,std::string * encoded)271 void WriteTokenStart(MajorType type, uint64_t value, std::string* encoded) {
272 WriteTokenStartTmpl(type, value, encoded);
273 }
274 } // namespace internals
275
276 // =============================================================================
277 // Detecting CBOR content
278 // =============================================================================
279
InitialByteForEnvelope()280 uint8_t InitialByteForEnvelope() {
281 return kInitialByteForEnvelope;
282 }
InitialByteFor32BitLengthByteString()283 uint8_t InitialByteFor32BitLengthByteString() {
284 return kInitialByteFor32BitLengthByteString;
285 }
IsCBORMessage(span<uint8_t> msg)286 bool IsCBORMessage(span<uint8_t> msg) {
287 return msg.size() >= 6 && msg[0] == InitialByteForEnvelope() &&
288 msg[1] == InitialByteFor32BitLengthByteString();
289 }
290
291 // =============================================================================
// Encoding individual CBOR items
293 // =============================================================================
294
EncodeTrue()295 uint8_t EncodeTrue() {
296 return kEncodedTrue;
297 }
EncodeFalse()298 uint8_t EncodeFalse() {
299 return kEncodedFalse;
300 }
EncodeNull()301 uint8_t EncodeNull() {
302 return kEncodedNull;
303 }
304
EncodeIndefiniteLengthArrayStart()305 uint8_t EncodeIndefiniteLengthArrayStart() {
306 return kInitialByteIndefiniteLengthArray;
307 }
308
EncodeIndefiniteLengthMapStart()309 uint8_t EncodeIndefiniteLengthMapStart() {
310 return kInitialByteIndefiniteLengthMap;
311 }
312
EncodeStop()313 uint8_t EncodeStop() {
314 return kStopByte;
315 }
316
317 template <typename C>
EncodeInt32Tmpl(int32_t value,C * out)318 void EncodeInt32Tmpl(int32_t value, C* out) {
319 if (value >= 0) {
320 internals::WriteTokenStart(MajorType::UNSIGNED, value, out);
321 } else {
322 uint64_t representation = static_cast<uint64_t>(-(value + 1));
323 internals::WriteTokenStart(MajorType::NEGATIVE, representation, out);
324 }
325 }
EncodeInt32(int32_t value,std::vector<uint8_t> * out)326 void EncodeInt32(int32_t value, std::vector<uint8_t>* out) {
327 EncodeInt32Tmpl(value, out);
328 }
EncodeInt32(int32_t value,std::string * out)329 void EncodeInt32(int32_t value, std::string* out) {
330 EncodeInt32Tmpl(value, out);
331 }
332
333 template <typename C>
EncodeString16Tmpl(span<uint16_t> in,C * out)334 void EncodeString16Tmpl(span<uint16_t> in, C* out) {
335 uint64_t byte_length = static_cast<uint64_t>(in.size_bytes());
336 internals::WriteTokenStart(MajorType::BYTE_STRING, byte_length, out);
337 // When emitting UTF16 characters, we always write the least significant byte
338 // first; this is because it's the native representation for X86.
339 // TODO(johannes): Implement a more efficient thing here later, e.g.
340 // casting *iff* the machine has this byte order.
341 // The wire format for UTF16 chars will probably remain the same
342 // (least significant byte first) since this way we can have
343 // golden files, unittests, etc. that port easily and universally.
344 // See also:
345 // https://commandcenter.blogspot.com/2012/04/byte-order-fallacy.html
346 for (const uint16_t two_bytes : in) {
347 out->push_back(two_bytes);
348 out->push_back(two_bytes >> 8);
349 }
350 }
EncodeString16(span<uint16_t> in,std::vector<uint8_t> * out)351 void EncodeString16(span<uint16_t> in, std::vector<uint8_t>* out) {
352 EncodeString16Tmpl(in, out);
353 }
EncodeString16(span<uint16_t> in,std::string * out)354 void EncodeString16(span<uint16_t> in, std::string* out) {
355 EncodeString16Tmpl(in, out);
356 }
357
358 template <typename C>
EncodeString8Tmpl(span<uint8_t> in,C * out)359 void EncodeString8Tmpl(span<uint8_t> in, C* out) {
360 internals::WriteTokenStart(MajorType::STRING,
361 static_cast<uint64_t>(in.size_bytes()), out);
362 out->insert(out->end(), in.begin(), in.end());
363 }
EncodeString8(span<uint8_t> in,std::vector<uint8_t> * out)364 void EncodeString8(span<uint8_t> in, std::vector<uint8_t>* out) {
365 EncodeString8Tmpl(in, out);
366 }
EncodeString8(span<uint8_t> in,std::string * out)367 void EncodeString8(span<uint8_t> in, std::string* out) {
368 EncodeString8Tmpl(in, out);
369 }
370
371 template <typename C>
EncodeFromLatin1Tmpl(span<uint8_t> latin1,C * out)372 void EncodeFromLatin1Tmpl(span<uint8_t> latin1, C* out) {
373 for (size_t ii = 0; ii < latin1.size(); ++ii) {
374 if (latin1[ii] <= 127)
375 continue;
376 // If there's at least one non-ASCII char, convert to UTF8.
377 std::vector<uint8_t> utf8(latin1.begin(), latin1.begin() + ii);
378 for (; ii < latin1.size(); ++ii) {
379 if (latin1[ii] <= 127) {
380 utf8.push_back(latin1[ii]);
381 } else {
382 // 0xC0 means it's a UTF8 sequence with 2 bytes.
383 utf8.push_back((latin1[ii] >> 6) | 0xc0);
384 utf8.push_back((latin1[ii] | 0x80) & 0xbf);
385 }
386 }
387 EncodeString8(SpanFrom(utf8), out);
388 return;
389 }
390 EncodeString8(latin1, out);
391 }
EncodeFromLatin1(span<uint8_t> latin1,std::vector<uint8_t> * out)392 void EncodeFromLatin1(span<uint8_t> latin1, std::vector<uint8_t>* out) {
393 EncodeFromLatin1Tmpl(latin1, out);
394 }
EncodeFromLatin1(span<uint8_t> latin1,std::string * out)395 void EncodeFromLatin1(span<uint8_t> latin1, std::string* out) {
396 EncodeFromLatin1Tmpl(latin1, out);
397 }
398
399 template <typename C>
EncodeFromUTF16Tmpl(span<uint16_t> utf16,C * out)400 void EncodeFromUTF16Tmpl(span<uint16_t> utf16, C* out) {
401 // If there's at least one non-ASCII char, encode as STRING16 (UTF16).
402 for (uint16_t ch : utf16) {
403 if (ch <= 127)
404 continue;
405 EncodeString16(utf16, out);
406 return;
407 }
408 // It's all US-ASCII, strip out every second byte and encode as UTF8.
409 internals::WriteTokenStart(MajorType::STRING,
410 static_cast<uint64_t>(utf16.size()), out);
411 out->insert(out->end(), utf16.begin(), utf16.end());
412 }
EncodeFromUTF16(span<uint16_t> utf16,std::vector<uint8_t> * out)413 void EncodeFromUTF16(span<uint16_t> utf16, std::vector<uint8_t>* out) {
414 EncodeFromUTF16Tmpl(utf16, out);
415 }
EncodeFromUTF16(span<uint16_t> utf16,std::string * out)416 void EncodeFromUTF16(span<uint16_t> utf16, std::string* out) {
417 EncodeFromUTF16Tmpl(utf16, out);
418 }
419
420 template <typename C>
EncodeBinaryTmpl(span<uint8_t> in,C * out)421 void EncodeBinaryTmpl(span<uint8_t> in, C* out) {
422 out->push_back(kExpectedConversionToBase64Tag);
423 uint64_t byte_length = static_cast<uint64_t>(in.size_bytes());
424 internals::WriteTokenStart(MajorType::BYTE_STRING, byte_length, out);
425 out->insert(out->end(), in.begin(), in.end());
426 }
EncodeBinary(span<uint8_t> in,std::vector<uint8_t> * out)427 void EncodeBinary(span<uint8_t> in, std::vector<uint8_t>* out) {
428 EncodeBinaryTmpl(in, out);
429 }
EncodeBinary(span<uint8_t> in,std::string * out)430 void EncodeBinary(span<uint8_t> in, std::string* out) {
431 EncodeBinaryTmpl(in, out);
432 }
433
// Size of an encoded double: its initial byte (kInitialByteForDouble)
// followed by the 64 bits of payload holding its value.
constexpr size_t kEncodedDoubleSize = sizeof(uint8_t) + sizeof(uint64_t);

// Size of an envelope header: the envelope's initial byte
// (kInitialByteForEnvelope), the start byte of a BYTE_STRING with a 32 bit
// wide length, and the 32 bit length of that string.
constexpr size_t kEncodedEnvelopeHeaderSize =
    2 * sizeof(uint8_t) + sizeof(uint32_t);
442
443 template <typename C>
EncodeDoubleTmpl(double value,C * out)444 void EncodeDoubleTmpl(double value, C* out) {
445 // The additional_info=27 indicates 64 bits for the double follow.
446 // See RFC 7049 Section 2.3, Table 1.
447 out->push_back(kInitialByteForDouble);
448 union {
449 double from_double;
450 uint64_t to_uint64;
451 } reinterpret;
452 reinterpret.from_double = value;
453 WriteBytesMostSignificantByteFirst<uint64_t>(reinterpret.to_uint64, out);
454 }
EncodeDouble(double value,std::vector<uint8_t> * out)455 void EncodeDouble(double value, std::vector<uint8_t>* out) {
456 EncodeDoubleTmpl(value, out);
457 }
EncodeDouble(double value,std::string * out)458 void EncodeDouble(double value, std::string* out) {
459 EncodeDoubleTmpl(value, out);
460 }
461
462 // =============================================================================
463 // cbor::EnvelopeEncoder - for wrapping submessages
464 // =============================================================================
465
466 template <typename C>
EncodeStartTmpl(C * out,size_t * byte_size_pos)467 void EncodeStartTmpl(C* out, size_t* byte_size_pos) {
468 assert(*byte_size_pos == 0);
469 out->push_back(kInitialByteForEnvelope);
470 out->push_back(kInitialByteFor32BitLengthByteString);
471 *byte_size_pos = out->size();
472 out->resize(out->size() + sizeof(uint32_t));
473 }
474
EncodeStart(std::vector<uint8_t> * out)475 void EnvelopeEncoder::EncodeStart(std::vector<uint8_t>* out) {
476 EncodeStartTmpl<std::vector<uint8_t>>(out, &byte_size_pos_);
477 }
478
EncodeStart(std::string * out)479 void EnvelopeEncoder::EncodeStart(std::string* out) {
480 EncodeStartTmpl<std::string>(out, &byte_size_pos_);
481 }
482
// Completes an envelope: stores the payload size, most significant byte
// first, in the four bytes of |out| starting at |*byte_size_pos|. The
// payload is everything in |out| past those four bytes.
// Returns false, leaving |out| untouched, if the size does not fit into an
// unsigned 32 bit integer. On success |*byte_size_pos| has been advanced
// past the length field.
template <typename C>
bool EncodeStopTmpl(C* out, size_t* byte_size_pos) {
  assert(*byte_size_pos != 0);
  const size_t payload_start = *byte_size_pos + sizeof(uint32_t);
  const uint64_t payload_size = out->size() - payload_start;
  if (payload_size > std::numeric_limits<uint32_t>::max())
    return false;
  for (size_t written = 0; written < sizeof(uint32_t); ++written) {
    const size_t shift_bits = (sizeof(uint32_t) - 1 - written) * 8;
    (*out)[*byte_size_pos] = 0xff & (payload_size >> shift_bits);
    ++*byte_size_pos;
  }
  return true;
}
499
EncodeStop(std::vector<uint8_t> * out)500 bool EnvelopeEncoder::EncodeStop(std::vector<uint8_t>* out) {
501 return EncodeStopTmpl(out, &byte_size_pos_);
502 }
503
EncodeStop(std::string * out)504 bool EnvelopeEncoder::EncodeStop(std::string* out) {
505 return EncodeStopTmpl(out, &byte_size_pos_);
506 }
507
508 // =============================================================================
509 // cbor::NewCBOREncoder - for encoding from a streaming parser
510 // =============================================================================
511
512 namespace {
513 template <typename C>
514 class CBOREncoder : public StreamingParserHandler {
515 public:
CBOREncoder(C * out,Status * status)516 CBOREncoder(C* out, Status* status) : out_(out), status_(status) {
517 *status_ = Status();
518 }
519
HandleMapBegin()520 void HandleMapBegin() override {
521 if (!status_->ok())
522 return;
523 envelopes_.emplace_back();
524 envelopes_.back().EncodeStart(out_);
525 out_->push_back(kInitialByteIndefiniteLengthMap);
526 }
527
HandleMapEnd()528 void HandleMapEnd() override {
529 if (!status_->ok())
530 return;
531 out_->push_back(kStopByte);
532 assert(!envelopes_.empty());
533 if (!envelopes_.back().EncodeStop(out_)) {
534 HandleError(
535 Status(Error::CBOR_ENVELOPE_SIZE_LIMIT_EXCEEDED, out_->size()));
536 return;
537 }
538 envelopes_.pop_back();
539 }
540
HandleArrayBegin()541 void HandleArrayBegin() override {
542 if (!status_->ok())
543 return;
544 envelopes_.emplace_back();
545 envelopes_.back().EncodeStart(out_);
546 out_->push_back(kInitialByteIndefiniteLengthArray);
547 }
548
HandleArrayEnd()549 void HandleArrayEnd() override {
550 if (!status_->ok())
551 return;
552 out_->push_back(kStopByte);
553 assert(!envelopes_.empty());
554 if (!envelopes_.back().EncodeStop(out_)) {
555 HandleError(
556 Status(Error::CBOR_ENVELOPE_SIZE_LIMIT_EXCEEDED, out_->size()));
557 return;
558 }
559 envelopes_.pop_back();
560 }
561
HandleString8(span<uint8_t> chars)562 void HandleString8(span<uint8_t> chars) override {
563 if (!status_->ok())
564 return;
565 EncodeString8(chars, out_);
566 }
567
HandleString16(span<uint16_t> chars)568 void HandleString16(span<uint16_t> chars) override {
569 if (!status_->ok())
570 return;
571 EncodeFromUTF16(chars, out_);
572 }
573
HandleBinary(span<uint8_t> bytes)574 void HandleBinary(span<uint8_t> bytes) override {
575 if (!status_->ok())
576 return;
577 EncodeBinary(bytes, out_);
578 }
579
HandleDouble(double value)580 void HandleDouble(double value) override {
581 if (!status_->ok())
582 return;
583 EncodeDouble(value, out_);
584 }
585
HandleInt32(int32_t value)586 void HandleInt32(int32_t value) override {
587 if (!status_->ok())
588 return;
589 EncodeInt32(value, out_);
590 }
591
HandleBool(bool value)592 void HandleBool(bool value) override {
593 if (!status_->ok())
594 return;
595 // See RFC 7049 Section 2.3, Table 2.
596 out_->push_back(value ? kEncodedTrue : kEncodedFalse);
597 }
598
HandleNull()599 void HandleNull() override {
600 if (!status_->ok())
601 return;
602 // See RFC 7049 Section 2.3, Table 2.
603 out_->push_back(kEncodedNull);
604 }
605
HandleError(Status error)606 void HandleError(Status error) override {
607 if (!status_->ok())
608 return;
609 *status_ = error;
610 out_->clear();
611 }
612
613 private:
614 C* out_;
615 std::vector<EnvelopeEncoder> envelopes_;
616 Status* status_;
617 };
618 } // namespace
619
NewCBOREncoder(std::vector<uint8_t> * out,Status * status)620 std::unique_ptr<StreamingParserHandler> NewCBOREncoder(
621 std::vector<uint8_t>* out,
622 Status* status) {
623 return std::unique_ptr<StreamingParserHandler>(
624 new CBOREncoder<std::vector<uint8_t>>(out, status));
625 }
NewCBOREncoder(std::string * out,Status * status)626 std::unique_ptr<StreamingParserHandler> NewCBOREncoder(std::string* out,
627 Status* status) {
628 return std::unique_ptr<StreamingParserHandler>(
629 new CBOREncoder<std::string>(out, status));
630 }
631
632 // =============================================================================
633 // cbor::CBORTokenizer - for parsing individual CBOR items
634 // =============================================================================
635
CBORTokenizer(span<uint8_t> bytes)636 CBORTokenizer::CBORTokenizer(span<uint8_t> bytes) : bytes_(bytes) {
637 ReadNextToken(/*enter_envelope=*/false);
638 }
~CBORTokenizer()639 CBORTokenizer::~CBORTokenizer() {}
640
TokenTag() const641 CBORTokenTag CBORTokenizer::TokenTag() const {
642 return token_tag_;
643 }
644
Next()645 void CBORTokenizer::Next() {
646 if (token_tag_ == CBORTokenTag::ERROR_VALUE ||
647 token_tag_ == CBORTokenTag::DONE)
648 return;
649 ReadNextToken(/*enter_envelope=*/false);
650 }
651
EnterEnvelope()652 void CBORTokenizer::EnterEnvelope() {
653 assert(token_tag_ == CBORTokenTag::ENVELOPE);
654 ReadNextToken(/*enter_envelope=*/true);
655 }
656
Status() const657 Status CBORTokenizer::Status() const {
658 return status_;
659 }
660
661 // The following accessor functions ::GetInt32, ::GetDouble,
662 // ::GetString8, ::GetString16WireRep, ::GetBinary, ::GetEnvelopeContents
663 // assume that a particular token was recognized in ::ReadNextToken.
664 // That's where all the error checking is done. By design,
665 // the accessors (assuming the token was recognized) never produce
666 // an error.
667
GetInt32() const668 int32_t CBORTokenizer::GetInt32() const {
669 assert(token_tag_ == CBORTokenTag::INT32);
670 // The range checks happen in ::ReadNextToken().
671 return static_cast<int32_t>(
672 token_start_type_ == MajorType::UNSIGNED
673 ? token_start_internal_value_
674 : -static_cast<int64_t>(token_start_internal_value_) - 1);
675 }
676
GetDouble() const677 double CBORTokenizer::GetDouble() const {
678 assert(token_tag_ == CBORTokenTag::DOUBLE);
679 union {
680 uint64_t from_uint64;
681 double to_double;
682 } reinterpret;
683 reinterpret.from_uint64 = ReadBytesMostSignificantByteFirst<uint64_t>(
684 bytes_.subspan(status_.pos + 1));
685 return reinterpret.to_double;
686 }
687
GetString8() const688 span<uint8_t> CBORTokenizer::GetString8() const {
689 assert(token_tag_ == CBORTokenTag::STRING8);
690 auto length = static_cast<size_t>(token_start_internal_value_);
691 return bytes_.subspan(status_.pos + (token_byte_length_ - length), length);
692 }
693
GetString16WireRep() const694 span<uint8_t> CBORTokenizer::GetString16WireRep() const {
695 assert(token_tag_ == CBORTokenTag::STRING16);
696 auto length = static_cast<size_t>(token_start_internal_value_);
697 return bytes_.subspan(status_.pos + (token_byte_length_ - length), length);
698 }
699
GetBinary() const700 span<uint8_t> CBORTokenizer::GetBinary() const {
701 assert(token_tag_ == CBORTokenTag::BINARY);
702 auto length = static_cast<size_t>(token_start_internal_value_);
703 return bytes_.subspan(status_.pos + (token_byte_length_ - length), length);
704 }
705
GetEnvelopeContents() const706 span<uint8_t> CBORTokenizer::GetEnvelopeContents() const {
707 assert(token_tag_ == CBORTokenTag::ENVELOPE);
708 auto length = static_cast<size_t>(token_start_internal_value_);
709 return bytes_.subspan(status_.pos + kEncodedEnvelopeHeaderSize, length);
710 }
711
// ::ReadNextToken performs all the error checking, which is what allows the
// accessors to do without an error return value.
//
// A note on the encoded lengths of strings, arrays, etc.: on the wire, CBOR
// uses 1, 2, 4, and 8 byte unsigned integers, so they are initially read as
// uint64_t, usually into token_start_internal_value_.
//
// These containers also have a representation on the machine, however, so
// the corresponding size computations on the input byte array, output span
// (e.g. the payload for a string), etc. are done in size_t, which is machine
// specific (in practice either 32 bit or 64 bit).
//
// size_t must not overflow in these computations. kMaxValidLength is
// therefore used to:
// - Reject values larger than the architecture specific max size_t (which
//   differs between 32 bit and 64 bit architectures).
// - Reserve at least one bit so that overflows can be detected when adding
//   lengths (array / string length / etc.): the inputs to an addition are
//   required to be <= kMaxValidLength, and the sum is then checked for
//   having gone past it.
//
// See also
// https://chromium.googlesource.com/chromium/src/+/master/docs/security/integer-semantics.md
static const uint64_t kMaxValidLength =
    std::min<uint64_t>(std::numeric_limits<size_t>::max(),
                       std::numeric_limits<uint64_t>::max() >> 2);
738
ReadNextToken(bool enter_envelope)739 void CBORTokenizer::ReadNextToken(bool enter_envelope) {
740 if (enter_envelope) {
741 status_.pos += kEncodedEnvelopeHeaderSize;
742 } else {
743 status_.pos =
744 status_.pos == Status::npos() ? 0 : status_.pos + token_byte_length_;
745 }
746 status_.error = Error::OK;
747 if (status_.pos >= bytes_.size()) {
748 token_tag_ = CBORTokenTag::DONE;
749 return;
750 }
751 const size_t remaining_bytes = bytes_.size() - status_.pos;
752 switch (bytes_[status_.pos]) {
753 case kStopByte:
754 SetToken(CBORTokenTag::STOP, 1);
755 return;
756 case kInitialByteIndefiniteLengthMap:
757 SetToken(CBORTokenTag::MAP_START, 1);
758 return;
759 case kInitialByteIndefiniteLengthArray:
760 SetToken(CBORTokenTag::ARRAY_START, 1);
761 return;
762 case kEncodedTrue:
763 SetToken(CBORTokenTag::TRUE_VALUE, 1);
764 return;
765 case kEncodedFalse:
766 SetToken(CBORTokenTag::FALSE_VALUE, 1);
767 return;
768 case kEncodedNull:
769 SetToken(CBORTokenTag::NULL_VALUE, 1);
770 return;
771 case kExpectedConversionToBase64Tag: { // BINARY
772 const size_t bytes_read = internals::ReadTokenStart(
773 bytes_.subspan(status_.pos + 1), &token_start_type_,
774 &token_start_internal_value_);
775 if (!bytes_read || token_start_type_ != MajorType::BYTE_STRING ||
776 token_start_internal_value_ > kMaxValidLength) {
777 SetError(Error::CBOR_INVALID_BINARY);
778 return;
779 }
780 const uint64_t token_byte_length = token_start_internal_value_ +
781 /* tag before token start: */ 1 +
782 /* token start: */ bytes_read;
783 if (token_byte_length > remaining_bytes) {
784 SetError(Error::CBOR_INVALID_BINARY);
785 return;
786 }
787 SetToken(CBORTokenTag::BINARY, static_cast<size_t>(token_byte_length));
788 return;
789 }
790 case kInitialByteForDouble: { // DOUBLE
791 if (kEncodedDoubleSize > remaining_bytes) {
792 SetError(Error::CBOR_INVALID_DOUBLE);
793 return;
794 }
795 SetToken(CBORTokenTag::DOUBLE, kEncodedDoubleSize);
796 return;
797 }
798 case kInitialByteForEnvelope: { // ENVELOPE
799 if (kEncodedEnvelopeHeaderSize > remaining_bytes) {
800 SetError(Error::CBOR_INVALID_ENVELOPE);
801 return;
802 }
803 // The envelope must be a byte string with 32 bit length.
804 if (bytes_[status_.pos + 1] != kInitialByteFor32BitLengthByteString) {
805 SetError(Error::CBOR_INVALID_ENVELOPE);
806 return;
807 }
808 // Read the length of the byte string.
809 token_start_internal_value_ = ReadBytesMostSignificantByteFirst<uint32_t>(
810 bytes_.subspan(status_.pos + 2));
811 if (token_start_internal_value_ > kMaxValidLength) {
812 SetError(Error::CBOR_INVALID_ENVELOPE);
813 return;
814 }
815 uint64_t token_byte_length =
816 token_start_internal_value_ + kEncodedEnvelopeHeaderSize;
817 if (token_byte_length > remaining_bytes) {
818 SetError(Error::CBOR_INVALID_ENVELOPE);
819 return;
820 }
821 SetToken(CBORTokenTag::ENVELOPE, static_cast<size_t>(token_byte_length));
822 return;
823 }
824 default: {
825 const size_t bytes_read = internals::ReadTokenStart(
826 bytes_.subspan(status_.pos), &token_start_type_,
827 &token_start_internal_value_);
828 switch (token_start_type_) {
829 case MajorType::UNSIGNED: // INT32.
830 // INT32 is a signed int32 (int32 makes sense for the
831 // inspector_protocol, it's not a CBOR limitation), so we check
832 // against the signed max, so that the allowable values are
833 // 0, 1, 2, ... 2^31 - 1.
834 if (!bytes_read ||
835 static_cast<int64_t>(std::numeric_limits<int32_t>::max()) <
836 static_cast<int64_t>(token_start_internal_value_)) {
837 SetError(Error::CBOR_INVALID_INT32);
838 return;
839 }
840 SetToken(CBORTokenTag::INT32, bytes_read);
841 return;
842 case MajorType::NEGATIVE: { // INT32.
843 // INT32 is a signed int32 (int32 makes sense for the
844 // inspector_protocol, it's not a CBOR limitation); in CBOR, the
845 // negative values for INT32 are represented as NEGATIVE, that is, -1
846 // INT32 is represented as 1 << 5 | 0 (major type 1, additional info
847 // value 0).
848 // The represented allowed values range is -1 to -2^31.
849 // They are mapped into the encoded range of 0 to 2^31-1.
850 // We check the the payload in token_start_internal_value_ against
851 // that range (2^31-1 is also known as
852 // std::numeric_limits<int32_t>::max()).
853 if (!bytes_read ||
854 static_cast<int64_t>(token_start_internal_value_) >
855 static_cast<int64_t>(std::numeric_limits<int32_t>::max())) {
856 SetError(Error::CBOR_INVALID_INT32);
857 return;
858 }
859 SetToken(CBORTokenTag::INT32, bytes_read);
860 return;
861 }
862 case MajorType::STRING: { // STRING8.
863 if (!bytes_read || token_start_internal_value_ > kMaxValidLength) {
864 SetError(Error::CBOR_INVALID_STRING8);
865 return;
866 }
867 uint64_t token_byte_length = token_start_internal_value_ + bytes_read;
868 if (token_byte_length > remaining_bytes) {
869 SetError(Error::CBOR_INVALID_STRING8);
870 return;
871 }
872 SetToken(CBORTokenTag::STRING8,
873 static_cast<size_t>(token_byte_length));
874 return;
875 }
876 case MajorType::BYTE_STRING: { // STRING16.
877 // Length must be divisible by 2 since UTF16 is 2 bytes per
878 // character, hence the &1 check.
879 if (!bytes_read || token_start_internal_value_ > kMaxValidLength ||
880 token_start_internal_value_ & 1) {
881 SetError(Error::CBOR_INVALID_STRING16);
882 return;
883 }
884 uint64_t token_byte_length = token_start_internal_value_ + bytes_read;
885 if (token_byte_length > remaining_bytes) {
886 SetError(Error::CBOR_INVALID_STRING16);
887 return;
888 }
889 SetToken(CBORTokenTag::STRING16,
890 static_cast<size_t>(token_byte_length));
891 return;
892 }
893 case MajorType::ARRAY:
894 case MajorType::MAP:
895 case MajorType::TAG:
896 case MajorType::SIMPLE_VALUE:
897 SetError(Error::CBOR_UNSUPPORTED_VALUE);
898 return;
899 }
900 }
901 }
902 }
903
SetToken(CBORTokenTag token_tag,size_t token_byte_length)904 void CBORTokenizer::SetToken(CBORTokenTag token_tag, size_t token_byte_length) {
905 token_tag_ = token_tag;
906 token_byte_length_ = token_byte_length;
907 }
908
SetError(Error error)909 void CBORTokenizer::SetError(Error error) {
910 token_tag_ = CBORTokenTag::ERROR_VALUE;
911 status_.error = error;
912 }
913
914 // =============================================================================
915 // cbor::ParseCBOR - for receiving streaming parser events for CBOR messages
916 // =============================================================================
917
918 namespace {
// Maximum nesting depth of maps and arrays accepted while parsing CBOR;
// input nested more deeply is reported as an error.
constexpr int kStackLimit = 300;
922
923 // Below are three parsing routines for CBOR, which cover enough
924 // to roundtrip JSON messages.
925 bool ParseMap(int32_t stack_depth,
926 CBORTokenizer* tokenizer,
927 StreamingParserHandler* out);
928 bool ParseArray(int32_t stack_depth,
929 CBORTokenizer* tokenizer,
930 StreamingParserHandler* out);
931 bool ParseValue(int32_t stack_depth,
932 CBORTokenizer* tokenizer,
933 StreamingParserHandler* out);
934
ParseUTF16String(CBORTokenizer * tokenizer,StreamingParserHandler * out)935 void ParseUTF16String(CBORTokenizer* tokenizer, StreamingParserHandler* out) {
936 std::vector<uint16_t> value;
937 span<uint8_t> rep = tokenizer->GetString16WireRep();
938 for (size_t ii = 0; ii < rep.size(); ii += 2)
939 value.push_back((rep[ii + 1] << 8) | rep[ii]);
940 out->HandleString16(span<uint16_t>(value.data(), value.size()));
941 tokenizer->Next();
942 }
943
ParseUTF8String(CBORTokenizer * tokenizer,StreamingParserHandler * out)944 bool ParseUTF8String(CBORTokenizer* tokenizer, StreamingParserHandler* out) {
945 assert(tokenizer->TokenTag() == CBORTokenTag::STRING8);
946 out->HandleString8(tokenizer->GetString8());
947 tokenizer->Next();
948 return true;
949 }
950
ParseValue(int32_t stack_depth,CBORTokenizer * tokenizer,StreamingParserHandler * out)951 bool ParseValue(int32_t stack_depth,
952 CBORTokenizer* tokenizer,
953 StreamingParserHandler* out) {
954 if (stack_depth > kStackLimit) {
955 out->HandleError(
956 Status{Error::CBOR_STACK_LIMIT_EXCEEDED, tokenizer->Status().pos});
957 return false;
958 }
959 // Skip past the envelope to get to what's inside.
960 if (tokenizer->TokenTag() == CBORTokenTag::ENVELOPE)
961 tokenizer->EnterEnvelope();
962 switch (tokenizer->TokenTag()) {
963 case CBORTokenTag::ERROR_VALUE:
964 out->HandleError(tokenizer->Status());
965 return false;
966 case CBORTokenTag::DONE:
967 out->HandleError(Status{Error::CBOR_UNEXPECTED_EOF_EXPECTED_VALUE,
968 tokenizer->Status().pos});
969 return false;
970 case CBORTokenTag::TRUE_VALUE:
971 out->HandleBool(true);
972 tokenizer->Next();
973 return true;
974 case CBORTokenTag::FALSE_VALUE:
975 out->HandleBool(false);
976 tokenizer->Next();
977 return true;
978 case CBORTokenTag::NULL_VALUE:
979 out->HandleNull();
980 tokenizer->Next();
981 return true;
982 case CBORTokenTag::INT32:
983 out->HandleInt32(tokenizer->GetInt32());
984 tokenizer->Next();
985 return true;
986 case CBORTokenTag::DOUBLE:
987 out->HandleDouble(tokenizer->GetDouble());
988 tokenizer->Next();
989 return true;
990 case CBORTokenTag::STRING8:
991 return ParseUTF8String(tokenizer, out);
992 case CBORTokenTag::STRING16:
993 ParseUTF16String(tokenizer, out);
994 return true;
995 case CBORTokenTag::BINARY: {
996 out->HandleBinary(tokenizer->GetBinary());
997 tokenizer->Next();
998 return true;
999 }
1000 case CBORTokenTag::MAP_START:
1001 return ParseMap(stack_depth + 1, tokenizer, out);
1002 case CBORTokenTag::ARRAY_START:
1003 return ParseArray(stack_depth + 1, tokenizer, out);
1004 default:
1005 out->HandleError(
1006 Status{Error::CBOR_UNSUPPORTED_VALUE, tokenizer->Status().pos});
1007 return false;
1008 }
1009 }
1010
1011 // |bytes| must start with the indefinite length array byte, so basically,
1012 // ParseArray may only be called after an indefinite length array has been
1013 // detected.
ParseArray(int32_t stack_depth,CBORTokenizer * tokenizer,StreamingParserHandler * out)1014 bool ParseArray(int32_t stack_depth,
1015 CBORTokenizer* tokenizer,
1016 StreamingParserHandler* out) {
1017 assert(tokenizer->TokenTag() == CBORTokenTag::ARRAY_START);
1018 tokenizer->Next();
1019 out->HandleArrayBegin();
1020 while (tokenizer->TokenTag() != CBORTokenTag::STOP) {
1021 if (tokenizer->TokenTag() == CBORTokenTag::DONE) {
1022 out->HandleError(
1023 Status{Error::CBOR_UNEXPECTED_EOF_IN_ARRAY, tokenizer->Status().pos});
1024 return false;
1025 }
1026 if (tokenizer->TokenTag() == CBORTokenTag::ERROR_VALUE) {
1027 out->HandleError(tokenizer->Status());
1028 return false;
1029 }
1030 // Parse value.
1031 if (!ParseValue(stack_depth, tokenizer, out))
1032 return false;
1033 }
1034 out->HandleArrayEnd();
1035 tokenizer->Next();
1036 return true;
1037 }
1038
1039 // |bytes| must start with the indefinite length array byte, so basically,
1040 // ParseArray may only be called after an indefinite length array has been
1041 // detected.
ParseMap(int32_t stack_depth,CBORTokenizer * tokenizer,StreamingParserHandler * out)1042 bool ParseMap(int32_t stack_depth,
1043 CBORTokenizer* tokenizer,
1044 StreamingParserHandler* out) {
1045 assert(tokenizer->TokenTag() == CBORTokenTag::MAP_START);
1046 out->HandleMapBegin();
1047 tokenizer->Next();
1048 while (tokenizer->TokenTag() != CBORTokenTag::STOP) {
1049 if (tokenizer->TokenTag() == CBORTokenTag::DONE) {
1050 out->HandleError(
1051 Status{Error::CBOR_UNEXPECTED_EOF_IN_MAP, tokenizer->Status().pos});
1052 return false;
1053 }
1054 if (tokenizer->TokenTag() == CBORTokenTag::ERROR_VALUE) {
1055 out->HandleError(tokenizer->Status());
1056 return false;
1057 }
1058 // Parse key.
1059 if (tokenizer->TokenTag() == CBORTokenTag::STRING8) {
1060 if (!ParseUTF8String(tokenizer, out))
1061 return false;
1062 } else if (tokenizer->TokenTag() == CBORTokenTag::STRING16) {
1063 ParseUTF16String(tokenizer, out);
1064 } else {
1065 out->HandleError(
1066 Status{Error::CBOR_INVALID_MAP_KEY, tokenizer->Status().pos});
1067 return false;
1068 }
1069 // Parse value.
1070 if (!ParseValue(stack_depth, tokenizer, out))
1071 return false;
1072 }
1073 out->HandleMapEnd();
1074 tokenizer->Next();
1075 return true;
1076 }
1077 } // namespace
1078
ParseCBOR(span<uint8_t> bytes,StreamingParserHandler * out)1079 void ParseCBOR(span<uint8_t> bytes, StreamingParserHandler* out) {
1080 if (bytes.empty()) {
1081 out->HandleError(Status{Error::CBOR_NO_INPUT, 0});
1082 return;
1083 }
1084 if (bytes[0] != kInitialByteForEnvelope) {
1085 out->HandleError(Status{Error::CBOR_INVALID_START_BYTE, 0});
1086 return;
1087 }
1088 CBORTokenizer tokenizer(bytes);
1089 if (tokenizer.TokenTag() == CBORTokenTag::ERROR_VALUE) {
1090 out->HandleError(tokenizer.Status());
1091 return;
1092 }
1093 // We checked for the envelope start byte above, so the tokenizer
1094 // must agree here, since it's not an error.
1095 assert(tokenizer.TokenTag() == CBORTokenTag::ENVELOPE);
1096 tokenizer.EnterEnvelope();
1097 if (tokenizer.TokenTag() != CBORTokenTag::MAP_START) {
1098 out->HandleError(
1099 Status{Error::CBOR_MAP_START_EXPECTED, tokenizer.Status().pos});
1100 return;
1101 }
1102 if (!ParseMap(/*stack_depth=*/1, &tokenizer, out))
1103 return;
1104 if (tokenizer.TokenTag() == CBORTokenTag::DONE)
1105 return;
1106 if (tokenizer.TokenTag() == CBORTokenTag::ERROR_VALUE) {
1107 out->HandleError(tokenizer.Status());
1108 return;
1109 }
1110 out->HandleError(Status{Error::CBOR_TRAILING_JUNK, tokenizer.Status().pos});
1111 }
1112
1113 // =============================================================================
1114 // cbor::AppendString8EntryToMap - for limited in-place editing of messages
1115 // =============================================================================
1116
1117 template <typename C>
AppendString8EntryToCBORMapTmpl(span<uint8_t> string8_key,span<uint8_t> string8_value,C * cbor)1118 Status AppendString8EntryToCBORMapTmpl(span<uint8_t> string8_key,
1119 span<uint8_t> string8_value,
1120 C* cbor) {
1121 // Careful below: Don't compare (*cbor)[idx] with a uint8_t, since
1122 // it could be a char (signed!). Instead, use bytes.
1123 span<uint8_t> bytes(reinterpret_cast<const uint8_t*>(cbor->data()),
1124 cbor->size());
1125 CBORTokenizer tokenizer(bytes);
1126 if (tokenizer.TokenTag() == CBORTokenTag::ERROR_VALUE)
1127 return tokenizer.Status();
1128 if (tokenizer.TokenTag() != CBORTokenTag::ENVELOPE)
1129 return Status(Error::CBOR_INVALID_ENVELOPE, 0);
1130 size_t envelope_size = tokenizer.GetEnvelopeContents().size();
1131 size_t old_size = cbor->size();
1132 if (old_size != envelope_size + kEncodedEnvelopeHeaderSize)
1133 return Status(Error::CBOR_INVALID_ENVELOPE, 0);
1134 if (envelope_size == 0 ||
1135 (tokenizer.GetEnvelopeContents()[0] != EncodeIndefiniteLengthMapStart()))
1136 return Status(Error::CBOR_MAP_START_EXPECTED, kEncodedEnvelopeHeaderSize);
1137 if (bytes[bytes.size() - 1] != EncodeStop())
1138 return Status(Error::CBOR_MAP_STOP_EXPECTED, cbor->size() - 1);
1139 cbor->pop_back();
1140 EncodeString8(string8_key, cbor);
1141 EncodeString8(string8_value, cbor);
1142 cbor->push_back(EncodeStop());
1143 size_t new_envelope_size = envelope_size + (cbor->size() - old_size);
1144 if (new_envelope_size > std::numeric_limits<uint32_t>::max())
1145 return Status(Error::CBOR_ENVELOPE_SIZE_LIMIT_EXCEEDED, 0);
1146 size_t size_pos = cbor->size() - new_envelope_size - sizeof(uint32_t);
1147 uint8_t* out = reinterpret_cast<uint8_t*>(&cbor->at(size_pos));
1148 *(out++) = (new_envelope_size >> 24) & 0xff;
1149 *(out++) = (new_envelope_size >> 16) & 0xff;
1150 *(out++) = (new_envelope_size >> 8) & 0xff;
1151 *(out) = new_envelope_size & 0xff;
1152 return Status();
1153 }
AppendString8EntryToCBORMap(span<uint8_t> string8_key,span<uint8_t> string8_value,std::vector<uint8_t> * cbor)1154 Status AppendString8EntryToCBORMap(span<uint8_t> string8_key,
1155 span<uint8_t> string8_value,
1156 std::vector<uint8_t>* cbor) {
1157 return AppendString8EntryToCBORMapTmpl(string8_key, string8_value, cbor);
1158 }
AppendString8EntryToCBORMap(span<uint8_t> string8_key,span<uint8_t> string8_value,std::string * cbor)1159 Status AppendString8EntryToCBORMap(span<uint8_t> string8_key,
1160 span<uint8_t> string8_value,
1161 std::string* cbor) {
1162 return AppendString8EntryToCBORMapTmpl(string8_key, string8_value, cbor);
1163 }
1164 } // namespace cbor
1165
1166 namespace json {
1167
1168 // =============================================================================
1169 // json::NewJSONEncoder - for encoding streaming parser events as JSON
1170 // =============================================================================
1171
1172 namespace {
// Appends |value| to |out| as exactly four lowercase hexadecimal digits,
// most significant digit first.
template <typename C>
void PrintHex(uint16_t value, C* out) {
  static const char kHexDigits[] = "0123456789abcdef";
  for (int shift = 12; shift >= 0; shift -= 4)
    out->push_back(kHexDigits[(value >> shift) & 0xf]);
}
1181
// The JSON writer below keeps a stack of State instances, one per open
// container. A State knows just enough to emit the right delimiter in
// front of each element.
enum class Container {
  // Used for the top-level, initial state.
  NONE,
  // Inside a JSON object.
  MAP,
  // Inside a JSON array.
  ARRAY
};

class State {
 public:
  explicit State(Container container) : container_(container) {}

  // Must be called before each element written into this container; emits
  // the delimiter that separates the element from its predecessor, if any.
  void StartElement(std::vector<uint8_t>* out) { StartElementTmpl(out); }
  void StartElement(std::string* out) { StartElementTmpl(out); }

  Container container() const { return container_; }

 private:
  template <typename C>
  void StartElementTmpl(C* out) {
    // At the top level only a single element is expected.
    assert(container_ != Container::NONE || size_ == 0);
    const bool is_first = (size_ == 0);
    if (!is_first) {
      // Outside of arrays, keys and values alternate: after an odd number
      // of elements a value follows its key, so a ':' is needed. In all
      // other cases elements are separated by ','.
      const bool value_follows_key =
          container_ != Container::ARRAY && (size_ & 1);
      out->push_back(value_follows_key ? ':' : ',');
    }
    ++size_;
  }

  Container container_ = Container::NONE;
  // Number of elements started so far (in a map, keys and values each
  // count as one).
  int size_ = 0;
};
1214
1215 constexpr char kBase64Table[] =
1216 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1217 "abcdefghijklmnopqrstuvwxyz0123456789+/";
1218
1219 template <typename C>
Base64Encode(const span<uint8_t> & in,C * out)1220 void Base64Encode(const span<uint8_t>& in, C* out) {
1221 // The following three cases are based on the tables in the example
1222 // section in https://en.wikipedia.org/wiki/Base64. We process three
1223 // input bytes at a time, emitting 4 output bytes at a time.
1224 size_t ii = 0;
1225
1226 // While possible, process three input bytes.
1227 for (; ii + 3 <= in.size(); ii += 3) {
1228 uint32_t twentyfour_bits = (in[ii] << 16) | (in[ii + 1] << 8) | in[ii + 2];
1229 out->push_back(kBase64Table[(twentyfour_bits >> 18)]);
1230 out->push_back(kBase64Table[(twentyfour_bits >> 12) & 0x3f]);
1231 out->push_back(kBase64Table[(twentyfour_bits >> 6) & 0x3f]);
1232 out->push_back(kBase64Table[twentyfour_bits & 0x3f]);
1233 }
1234 if (ii + 2 <= in.size()) { // Process two input bytes.
1235 uint32_t twentyfour_bits = (in[ii] << 16) | (in[ii + 1] << 8);
1236 out->push_back(kBase64Table[(twentyfour_bits >> 18)]);
1237 out->push_back(kBase64Table[(twentyfour_bits >> 12) & 0x3f]);
1238 out->push_back(kBase64Table[(twentyfour_bits >> 6) & 0x3f]);
1239 out->push_back('='); // Emit padding.
1240 return;
1241 }
1242 if (ii + 1 <= in.size()) { // Process a single input byte.
1243 uint32_t twentyfour_bits = (in[ii] << 16);
1244 out->push_back(kBase64Table[(twentyfour_bits >> 18)]);
1245 out->push_back(kBase64Table[(twentyfour_bits >> 12) & 0x3f]);
1246 out->push_back('='); // Emit padding.
1247 out->push_back('='); // Emit padding.
1248 }
1249 }
1250
1251 // Implements a handler for JSON parser events to emit a JSON string.
1252 template <typename C>
1253 class JSONEncoder : public StreamingParserHandler {
1254 public:
JSONEncoder(const Platform * platform,C * out,Status * status)1255 JSONEncoder(const Platform* platform, C* out, Status* status)
1256 : platform_(platform), out_(out), status_(status) {
1257 *status_ = Status();
1258 state_.emplace(Container::NONE);
1259 }
1260
HandleMapBegin()1261 void HandleMapBegin() override {
1262 if (!status_->ok())
1263 return;
1264 assert(!state_.empty());
1265 state_.top().StartElement(out_);
1266 state_.emplace(Container::MAP);
1267 Emit('{');
1268 }
1269
HandleMapEnd()1270 void HandleMapEnd() override {
1271 if (!status_->ok())
1272 return;
1273 assert(state_.size() >= 2 && state_.top().container() == Container::MAP);
1274 state_.pop();
1275 Emit('}');
1276 }
1277
HandleArrayBegin()1278 void HandleArrayBegin() override {
1279 if (!status_->ok())
1280 return;
1281 state_.top().StartElement(out_);
1282 state_.emplace(Container::ARRAY);
1283 Emit('[');
1284 }
1285
HandleArrayEnd()1286 void HandleArrayEnd() override {
1287 if (!status_->ok())
1288 return;
1289 assert(state_.size() >= 2 && state_.top().container() == Container::ARRAY);
1290 state_.pop();
1291 Emit(']');
1292 }
1293
HandleString16(span<uint16_t> chars)1294 void HandleString16(span<uint16_t> chars) override {
1295 if (!status_->ok())
1296 return;
1297 state_.top().StartElement(out_);
1298 Emit('"');
1299 for (const uint16_t ch : chars) {
1300 if (ch == '"') {
1301 Emit("\\\"");
1302 } else if (ch == '\\') {
1303 Emit("\\\\");
1304 } else if (ch == '\b') {
1305 Emit("\\b");
1306 } else if (ch == '\f') {
1307 Emit("\\f");
1308 } else if (ch == '\n') {
1309 Emit("\\n");
1310 } else if (ch == '\r') {
1311 Emit("\\r");
1312 } else if (ch == '\t') {
1313 Emit("\\t");
1314 } else if (ch >= 32 && ch <= 126) {
1315 Emit(ch);
1316 } else {
1317 Emit("\\u");
1318 PrintHex(ch, out_);
1319 }
1320 }
1321 Emit('"');
1322 }
1323
HandleString8(span<uint8_t> chars)1324 void HandleString8(span<uint8_t> chars) override {
1325 if (!status_->ok())
1326 return;
1327 state_.top().StartElement(out_);
1328 Emit('"');
1329 for (size_t ii = 0; ii < chars.size(); ++ii) {
1330 uint8_t c = chars[ii];
1331 if (c == '"') {
1332 Emit("\\\"");
1333 } else if (c == '\\') {
1334 Emit("\\\\");
1335 } else if (c == '\b') {
1336 Emit("\\b");
1337 } else if (c == '\f') {
1338 Emit("\\f");
1339 } else if (c == '\n') {
1340 Emit("\\n");
1341 } else if (c == '\r') {
1342 Emit("\\r");
1343 } else if (c == '\t') {
1344 Emit("\\t");
1345 } else if (c >= 32 && c <= 126) {
1346 Emit(c);
1347 } else if (c < 32) {
1348 Emit("\\u");
1349 PrintHex(static_cast<uint16_t>(c), out_);
1350 } else {
1351 // Inspect the leading byte to figure out how long the utf8
1352 // byte sequence is; while doing this initialize |codepoint|
1353 // with the first few bits.
1354 // See table in: https://en.wikipedia.org/wiki/UTF-8
1355 // byte one is 110x xxxx -> 2 byte utf8 sequence
1356 // byte one is 1110 xxxx -> 3 byte utf8 sequence
1357 // byte one is 1111 0xxx -> 4 byte utf8 sequence
1358 uint32_t codepoint;
1359 int num_bytes_left;
1360 if ((c & 0xe0) == 0xc0) { // 2 byte utf8 sequence
1361 num_bytes_left = 1;
1362 codepoint = c & 0x1f;
1363 } else if ((c & 0xf0) == 0xe0) { // 3 byte utf8 sequence
1364 num_bytes_left = 2;
1365 codepoint = c & 0x0f;
1366 } else if ((c & 0xf8) == 0xf0) { // 4 byte utf8 sequence
1367 codepoint = c & 0x07;
1368 num_bytes_left = 3;
1369 } else {
1370 continue; // invalid leading byte
1371 }
1372
1373 // If we have enough bytes in our input, decode the remaining ones
1374 // belonging to this Unicode character into |codepoint|.
1375 if (ii + num_bytes_left > chars.size())
1376 continue;
1377 while (num_bytes_left > 0) {
1378 c = chars[++ii];
1379 --num_bytes_left;
1380 // Check the next byte is a continuation byte, that is 10xx xxxx.
1381 if ((c & 0xc0) != 0x80)
1382 continue;
1383 codepoint = (codepoint << 6) | (c & 0x3f);
1384 }
1385
1386 // Disallow overlong encodings for ascii characters, as these
1387 // would include " and other characters significant to JSON
1388 // string termination / control.
1389 if (codepoint < 0x7f)
1390 continue;
1391 // Invalid in UTF8, and can't be represented in UTF16 anyway.
1392 if (codepoint > 0x10ffff)
1393 continue;
1394
1395 // So, now we transcode to UTF16,
1396 // using the math described at https://en.wikipedia.org/wiki/UTF-16,
1397 // for either one or two 16 bit characters.
1398 if (codepoint < 0xffff) {
1399 Emit("\\u");
1400 PrintHex(static_cast<uint16_t>(codepoint), out_);
1401 continue;
1402 }
1403 codepoint -= 0x10000;
1404 // high surrogate
1405 Emit("\\u");
1406 PrintHex(static_cast<uint16_t>((codepoint >> 10) + 0xd800), out_);
1407 // low surrogate
1408 Emit("\\u");
1409 PrintHex(static_cast<uint16_t>((codepoint & 0x3ff) + 0xdc00), out_);
1410 }
1411 }
1412 Emit('"');
1413 }
1414
HandleBinary(span<uint8_t> bytes)1415 void HandleBinary(span<uint8_t> bytes) override {
1416 if (!status_->ok())
1417 return;
1418 state_.top().StartElement(out_);
1419 Emit('"');
1420 Base64Encode(bytes, out_);
1421 Emit('"');
1422 }
1423
HandleDouble(double value)1424 void HandleDouble(double value) override {
1425 if (!status_->ok())
1426 return;
1427 state_.top().StartElement(out_);
1428 // JSON cannot represent NaN or Infinity. So, for compatibility,
1429 // we behave like the JSON object in web browsers: emit 'null'.
1430 if (!std::isfinite(value)) {
1431 Emit("null");
1432 return;
1433 }
1434 std::unique_ptr<char[]> str_value = platform_->DToStr(value);
1435
1436 // DToStr may fail to emit a 0 before the decimal dot. E.g. this is
1437 // the case in base::NumberToString in Chromium (which is based on
1438 // dmg_fp). So, much like
1439 // https://cs.chromium.org/chromium/src/base/json/json_writer.cc
1440 // we probe for this and emit the leading 0 anyway if necessary.
1441 const char* chars = str_value.get();
1442 if (chars[0] == '.') {
1443 Emit('0');
1444 } else if (chars[0] == '-' && chars[1] == '.') {
1445 Emit("-0");
1446 ++chars;
1447 }
1448 Emit(chars);
1449 }
1450
HandleInt32(int32_t value)1451 void HandleInt32(int32_t value) override {
1452 if (!status_->ok())
1453 return;
1454 state_.top().StartElement(out_);
1455 Emit(std::to_string(value));
1456 }
1457
HandleBool(bool value)1458 void HandleBool(bool value) override {
1459 if (!status_->ok())
1460 return;
1461 state_.top().StartElement(out_);
1462 Emit(value ? "true" : "false");
1463 }
1464
HandleNull()1465 void HandleNull() override {
1466 if (!status_->ok())
1467 return;
1468 state_.top().StartElement(out_);
1469 Emit("null");
1470 }
1471
HandleError(Status error)1472 void HandleError(Status error) override {
1473 assert(!error.ok());
1474 *status_ = error;
1475 out_->clear();
1476 }
1477
1478 private:
Emit(char c)1479 void Emit(char c) { out_->push_back(c); }
Emit(const char * str)1480 void Emit(const char* str) {
1481 out_->insert(out_->end(), str, str + strlen(str));
1482 }
Emit(const std::string & str)1483 void Emit(const std::string& str) {
1484 out_->insert(out_->end(), str.begin(), str.end());
1485 }
1486
1487 const Platform* platform_;
1488 C* out_;
1489 Status* status_;
1490 std::stack<State> state_;
1491 };
1492 } // namespace
1493
NewJSONEncoder(const Platform * platform,std::vector<uint8_t> * out,Status * status)1494 std::unique_ptr<StreamingParserHandler> NewJSONEncoder(
1495 const Platform* platform,
1496 std::vector<uint8_t>* out,
1497 Status* status) {
1498 return std::unique_ptr<StreamingParserHandler>(
1499 new JSONEncoder<std::vector<uint8_t>>(platform, out, status));
1500 }
NewJSONEncoder(const Platform * platform,std::string * out,Status * status)1501 std::unique_ptr<StreamingParserHandler> NewJSONEncoder(const Platform* platform,
1502 std::string* out,
1503 Status* status) {
1504 return std::unique_ptr<StreamingParserHandler>(
1505 new JSONEncoder<std::string>(platform, out, status));
1506 }
1507
1508 // =============================================================================
1509 // json::ParseJSON - for receiving streaming parser events for JSON.
1510 // =============================================================================
1511
1512 namespace {
// NOTE(review): presumably the maximum nesting depth accepted by the JSON
// parser; the code using it is not visible here - confirm.
constexpr int kStackLimit = 300;
1514
// Kinds of tokens distinguished by the JSON parser.
// NOTE(review): the tokenizer producing these values (ParseToken) is not
// fully visible from here; the groupings below are inferred from the
// enumerator names - confirm against ParseToken.
enum Token {
  // Structural tokens opening / closing objects and arrays.
  ObjectBegin,
  ObjectEnd,
  ArrayBegin,
  ArrayEnd,
  // Values.
  StringLiteral,
  Number,
  BoolTrue,
  BoolFalse,
  NullToken,
  // Separators between list elements, and between a key and its value.
  ListSeparator,
  ObjectPairSeparator,
  // Presumably: input that does not form a token / no input left.
  InvalidToken,
  NoInput
};
1530
// Spellings of the JSON literals, as \0 terminated strings.
// NOTE(review): presumably passed as |token| to ParseConstToken - confirm
// at the call sites.
const char* const kNullString = "null";
const char* const kTrueString = "true";
const char* const kFalseString = "false";
1534
1535 template <typename Char>
1536 class JsonParser {
1537 public:
// Stores |platform| and |handler| in the corresponding members; neither
// pointer is dereferenced here.
JsonParser(const Platform* platform, StreamingParserHandler* handler)
    : platform_(platform), handler_(handler) {}
1540
Parse(const Char * start,size_t length)1541 void Parse(const Char* start, size_t length) {
1542 start_pos_ = start;
1543 const Char* end = start + length;
1544 const Char* tokenEnd = nullptr;
1545 ParseValue(start, end, &tokenEnd, 0);
1546 if (error_)
1547 return;
1548 if (tokenEnd != end) {
1549 HandleError(Error::JSON_PARSER_UNPROCESSED_INPUT_REMAINS, tokenEnd);
1550 }
1551 }
1552
1553 private:
CharsToDouble(const uint16_t * chars,size_t length,double * result)1554 bool CharsToDouble(const uint16_t* chars, size_t length, double* result) {
1555 std::string buffer;
1556 buffer.reserve(length + 1);
1557 for (size_t ii = 0; ii < length; ++ii) {
1558 bool is_ascii = !(chars[ii] & ~0x7F);
1559 if (!is_ascii)
1560 return false;
1561 buffer.push_back(static_cast<char>(chars[ii]));
1562 }
1563 return platform_->StrToD(buffer.c_str(), result);
1564 }
1565
CharsToDouble(const uint8_t * chars,size_t length,double * result)1566 bool CharsToDouble(const uint8_t* chars, size_t length, double* result) {
1567 std::string buffer(reinterpret_cast<const char*>(chars), length);
1568 return platform_->StrToD(buffer.c_str(), result);
1569 }
1570
// Checks whether the input [start, end) begins with the literal |token|.
// |token| is \0 terminated, it's one of the constants at top of the file.
// On a match, stores the position right after the literal in |*token_end|
// and returns true. Returns false, leaving |*token_end| untouched, if the
// input is too short or differs from |token| in any character.
static bool ParseConstToken(const Char* start,
                            const Char* end,
                            const Char** token_end,
                            const char* token) {
  // Advance both cursors only after a character has matched. Advancing
  // |token| as part of the comparison would step past a mismatching last
  // character, so that e.g. "nulx" would be accepted as "null".
  for (; *token != '\0'; ++start, ++token) {
    if (start == end || *start != *token)
      return false;
  }
  *token_end = start;
  return true;
}
1583
ReadInt(const Char * start,const Char * end,const Char ** token_end,bool allow_leading_zeros)1584 static bool ReadInt(const Char* start,
1585 const Char* end,
1586 const Char** token_end,
1587 bool allow_leading_zeros) {
1588 if (start == end)
1589 return false;
1590 bool has_leading_zero = '0' == *start;
1591 int length = 0;
1592 while (start < end && '0' <= *start && *start <= '9') {
1593 ++start;
1594 ++length;
1595 }
1596 if (!length)
1597 return false;
1598 if (!allow_leading_zeros && length > 1 && has_leading_zero)
1599 return false;
1600 *token_end = start;
1601 return true;
1602 }
1603
ParseNumberToken(const Char * start,const Char * end,const Char ** token_end)1604 static bool ParseNumberToken(const Char* start,
1605 const Char* end,
1606 const Char** token_end) {
1607 // We just grab the number here. We validate the size in DecodeNumber.
1608 // According to RFC4627, a valid number is: [minus] int [frac] [exp]
1609 if (start == end)
1610 return false;
1611 Char c = *start;
1612 if ('-' == c)
1613 ++start;
1614
1615 if (!ReadInt(start, end, &start, /*allow_leading_zeros=*/false))
1616 return false;
1617 if (start == end) {
1618 *token_end = start;
1619 return true;
1620 }
1621
1622 // Optional fraction part
1623 c = *start;
1624 if ('.' == c) {
1625 ++start;
1626 if (!ReadInt(start, end, &start, /*allow_leading_zeros=*/true))
1627 return false;
1628 if (start == end) {
1629 *token_end = start;
1630 return true;
1631 }
1632 c = *start;
1633 }
1634
1635 // Optional exponent part
1636 if ('e' == c || 'E' == c) {
1637 ++start;
1638 if (start == end)
1639 return false;
1640 c = *start;
1641 if ('-' == c || '+' == c) {
1642 ++start;
1643 if (start == end)
1644 return false;
1645 }
1646 if (!ReadInt(start, end, &start, /*allow_leading_zeros=*/true))
1647 return false;
1648 }
1649
1650 *token_end = start;
1651 return true;
1652 }
1653
ReadHexDigits(const Char * start,const Char * end,const Char ** token_end,int digits)1654 static bool ReadHexDigits(const Char* start,
1655 const Char* end,
1656 const Char** token_end,
1657 int digits) {
1658 if (end - start < digits)
1659 return false;
1660 for (int i = 0; i < digits; ++i) {
1661 Char c = *start++;
1662 if (!(('0' <= c && c <= '9') || ('a' <= c && c <= 'f') ||
1663 ('A' <= c && c <= 'F')))
1664 return false;
1665 }
1666 *token_end = start;
1667 return true;
1668 }
1669
ParseStringToken(const Char * start,const Char * end,const Char ** token_end)1670 static bool ParseStringToken(const Char* start,
1671 const Char* end,
1672 const Char** token_end) {
1673 while (start < end) {
1674 Char c = *start++;
1675 if ('\\' == c) {
1676 if (start == end)
1677 return false;
1678 c = *start++;
1679 // Make sure the escaped char is valid.
1680 switch (c) {
1681 case 'x':
1682 if (!ReadHexDigits(start, end, &start, 2))
1683 return false;
1684 break;
1685 case 'u':
1686 if (!ReadHexDigits(start, end, &start, 4))
1687 return false;
1688 break;
1689 case '\\':
1690 case '/':
1691 case 'b':
1692 case 'f':
1693 case 'n':
1694 case 'r':
1695 case 't':
1696 case 'v':
1697 case '"':
1698 break;
1699 default:
1700 return false;
1701 }
1702 } else if ('"' == c) {
1703 *token_end = start;
1704 return true;
1705 }
1706 }
1707 return false;
1708 }
1709
SkipComment(const Char * start,const Char * end,const Char ** comment_end)1710 static bool SkipComment(const Char* start,
1711 const Char* end,
1712 const Char** comment_end) {
1713 if (start == end)
1714 return false;
1715
1716 if (*start != '/' || start + 1 >= end)
1717 return false;
1718 ++start;
1719
1720 if (*start == '/') {
1721 // Single line comment, read to newline.
1722 for (++start; start < end; ++start) {
1723 if (*start == '\n' || *start == '\r') {
1724 *comment_end = start + 1;
1725 return true;
1726 }
1727 }
1728 *comment_end = end;
1729 // Comment reaches end-of-input, which is fine.
1730 return true;
1731 }
1732
1733 if (*start == '*') {
1734 Char previous = '\0';
1735 // Block comment, read until end marker.
1736 for (++start; start < end; previous = *start++) {
1737 if (previous == '*' && *start == '/') {
1738 *comment_end = start + 1;
1739 return true;
1740 }
1741 }
1742 // Block comment must close before end-of-input.
1743 return false;
1744 }
1745
1746 return false;
1747 }
1748
IsSpaceOrNewLine(Char c)1749 static bool IsSpaceOrNewLine(Char c) {
1750 // \v = vertial tab; \f = form feed page break.
1751 return c == ' ' || c == '\n' || c == '\v' || c == '\f' || c == '\r' ||
1752 c == '\t';
1753 }
1754
SkipWhitespaceAndComments(const Char * start,const Char * end,const Char ** whitespace_end)1755 static void SkipWhitespaceAndComments(const Char* start,
1756 const Char* end,
1757 const Char** whitespace_end) {
1758 while (start < end) {
1759 if (IsSpaceOrNewLine(*start)) {
1760 ++start;
1761 } else if (*start == '/') {
1762 const Char* comment_end = nullptr;
1763 if (!SkipComment(start, end, &comment_end))
1764 break;
1765 start = comment_end;
1766 } else {
1767 break;
1768 }
1769 }
1770 *whitespace_end = start;
1771 }
1772
ParseToken(const Char * start,const Char * end,const Char ** tokenStart,const Char ** token_end)1773 static Token ParseToken(const Char* start,
1774 const Char* end,
1775 const Char** tokenStart,
1776 const Char** token_end) {
1777 SkipWhitespaceAndComments(start, end, tokenStart);
1778 start = *tokenStart;
1779
1780 if (start == end)
1781 return NoInput;
1782
1783 switch (*start) {
1784 case 'n':
1785 if (ParseConstToken(start, end, token_end, kNullString))
1786 return NullToken;
1787 break;
1788 case 't':
1789 if (ParseConstToken(start, end, token_end, kTrueString))
1790 return BoolTrue;
1791 break;
1792 case 'f':
1793 if (ParseConstToken(start, end, token_end, kFalseString))
1794 return BoolFalse;
1795 break;
1796 case '[':
1797 *token_end = start + 1;
1798 return ArrayBegin;
1799 case ']':
1800 *token_end = start + 1;
1801 return ArrayEnd;
1802 case ',':
1803 *token_end = start + 1;
1804 return ListSeparator;
1805 case '{':
1806 *token_end = start + 1;
1807 return ObjectBegin;
1808 case '}':
1809 *token_end = start + 1;
1810 return ObjectEnd;
1811 case ':':
1812 *token_end = start + 1;
1813 return ObjectPairSeparator;
1814 case '0':
1815 case '1':
1816 case '2':
1817 case '3':
1818 case '4':
1819 case '5':
1820 case '6':
1821 case '7':
1822 case '8':
1823 case '9':
1824 case '-':
1825 if (ParseNumberToken(start, end, token_end))
1826 return Number;
1827 break;
1828 case '"':
1829 if (ParseStringToken(start + 1, end, token_end))
1830 return StringLiteral;
1831 break;
1832 }
1833 return InvalidToken;
1834 }
1835
HexToInt(Char c)1836 static int HexToInt(Char c) {
1837 if ('0' <= c && c <= '9')
1838 return c - '0';
1839 if ('A' <= c && c <= 'F')
1840 return c - 'A' + 10;
1841 if ('a' <= c && c <= 'f')
1842 return c - 'a' + 10;
1843 assert(false); // Unreachable.
1844 return 0;
1845 }
1846
DecodeString(const Char * start,const Char * end,std::vector<uint16_t> * output)1847 static bool DecodeString(const Char* start,
1848 const Char* end,
1849 std::vector<uint16_t>* output) {
1850 if (start == end)
1851 return true;
1852 if (start > end)
1853 return false;
1854 output->reserve(end - start);
1855 while (start < end) {
1856 uint16_t c = *start++;
1857 // If the |Char| we're dealing with is really a byte, then
1858 // we have utf8 here, and we need to check for multibyte characters
1859 // and transcode them to utf16 (either one or two utf16 chars).
1860 if (sizeof(Char) == sizeof(uint8_t) && c > 0x7f) {
1861 // Inspect the leading byte to figure out how long the utf8
1862 // byte sequence is; while doing this initialize |codepoint|
1863 // with the first few bits.
1864 // See table in: https://en.wikipedia.org/wiki/UTF-8
1865 // byte one is 110x xxxx -> 2 byte utf8 sequence
1866 // byte one is 1110 xxxx -> 3 byte utf8 sequence
1867 // byte one is 1111 0xxx -> 4 byte utf8 sequence
1868 uint32_t codepoint;
1869 int num_bytes_left;
1870 if ((c & 0xe0) == 0xc0) { // 2 byte utf8 sequence
1871 num_bytes_left = 1;
1872 codepoint = c & 0x1f;
1873 } else if ((c & 0xf0) == 0xe0) { // 3 byte utf8 sequence
1874 num_bytes_left = 2;
1875 codepoint = c & 0x0f;
1876 } else if ((c & 0xf8) == 0xf0) { // 4 byte utf8 sequence
1877 codepoint = c & 0x07;
1878 num_bytes_left = 3;
1879 } else {
1880 return false; // invalid leading byte
1881 }
1882
1883 // If we have enough bytes in our inpput, decode the remaining ones
1884 // belonging to this Unicode character into |codepoint|.
1885 if (start + num_bytes_left > end)
1886 return false;
1887 while (num_bytes_left > 0) {
1888 c = *start++;
1889 --num_bytes_left;
1890 // Check the next byte is a continuation byte, that is 10xx xxxx.
1891 if ((c & 0xc0) != 0x80)
1892 return false;
1893 codepoint = (codepoint << 6) | (c & 0x3f);
1894 }
1895
1896 // Disallow overlong encodings for ascii characters, as these
1897 // would include " and other characters significant to JSON
1898 // string termination / control.
1899 if (codepoint <= 0x7f)
1900 return false;
1901 // Invalid in UTF8, and can't be represented in UTF16 anyway.
1902 if (codepoint > 0x10ffff)
1903 return false;
1904
1905 // So, now we transcode to UTF16,
1906 // using the math described at https://en.wikipedia.org/wiki/UTF-16,
1907 // for either one or two 16 bit characters.
1908 if (codepoint < 0xffff) {
1909 output->push_back(codepoint);
1910 continue;
1911 }
1912 codepoint -= 0x10000;
1913 output->push_back((codepoint >> 10) + 0xd800); // high surrogate
1914 output->push_back((codepoint & 0x3ff) + 0xdc00); // low surrogate
1915 continue;
1916 }
1917 if ('\\' != c) {
1918 output->push_back(c);
1919 continue;
1920 }
1921 if (start == end)
1922 return false;
1923 c = *start++;
1924
1925 if (c == 'x') {
1926 // \x is not supported.
1927 return false;
1928 }
1929
1930 switch (c) {
1931 case '"':
1932 case '/':
1933 case '\\':
1934 break;
1935 case 'b':
1936 c = '\b';
1937 break;
1938 case 'f':
1939 c = '\f';
1940 break;
1941 case 'n':
1942 c = '\n';
1943 break;
1944 case 'r':
1945 c = '\r';
1946 break;
1947 case 't':
1948 c = '\t';
1949 break;
1950 case 'v':
1951 c = '\v';
1952 break;
1953 case 'u':
1954 c = (HexToInt(*start) << 12) + (HexToInt(*(start + 1)) << 8) +
1955 (HexToInt(*(start + 2)) << 4) + HexToInt(*(start + 3));
1956 start += 4;
1957 break;
1958 default:
1959 return false;
1960 }
1961 output->push_back(c);
1962 }
1963 return true;
1964 }
1965
ParseValue(const Char * start,const Char * end,const Char ** value_token_end,int depth)1966 void ParseValue(const Char* start,
1967 const Char* end,
1968 const Char** value_token_end,
1969 int depth) {
1970 if (depth > kStackLimit) {
1971 HandleError(Error::JSON_PARSER_STACK_LIMIT_EXCEEDED, start);
1972 return;
1973 }
1974 const Char* token_start = nullptr;
1975 const Char* token_end = nullptr;
1976 Token token = ParseToken(start, end, &token_start, &token_end);
1977 switch (token) {
1978 case NoInput:
1979 HandleError(Error::JSON_PARSER_NO_INPUT, token_start);
1980 return;
1981 case InvalidToken:
1982 HandleError(Error::JSON_PARSER_INVALID_TOKEN, token_start);
1983 return;
1984 case NullToken:
1985 handler_->HandleNull();
1986 break;
1987 case BoolTrue:
1988 handler_->HandleBool(true);
1989 break;
1990 case BoolFalse:
1991 handler_->HandleBool(false);
1992 break;
1993 case Number: {
1994 double value;
1995 if (!CharsToDouble(token_start, token_end - token_start, &value)) {
1996 HandleError(Error::JSON_PARSER_INVALID_NUMBER, token_start);
1997 return;
1998 }
1999 if (value >= std::numeric_limits<int32_t>::min() &&
2000 value <= std::numeric_limits<int32_t>::max() &&
2001 static_cast<int32_t>(value) == value)
2002 handler_->HandleInt32(static_cast<int32_t>(value));
2003 else
2004 handler_->HandleDouble(value);
2005 break;
2006 }
2007 case StringLiteral: {
2008 std::vector<uint16_t> value;
2009 bool ok = DecodeString(token_start + 1, token_end - 1, &value);
2010 if (!ok) {
2011 HandleError(Error::JSON_PARSER_INVALID_STRING, token_start);
2012 return;
2013 }
2014 handler_->HandleString16(span<uint16_t>(value.data(), value.size()));
2015 break;
2016 }
2017 case ArrayBegin: {
2018 handler_->HandleArrayBegin();
2019 start = token_end;
2020 token = ParseToken(start, end, &token_start, &token_end);
2021 while (token != ArrayEnd) {
2022 ParseValue(start, end, &token_end, depth + 1);
2023 if (error_)
2024 return;
2025
2026 // After a list value, we expect a comma or the end of the list.
2027 start = token_end;
2028 token = ParseToken(start, end, &token_start, &token_end);
2029 if (token == ListSeparator) {
2030 start = token_end;
2031 token = ParseToken(start, end, &token_start, &token_end);
2032 if (token == ArrayEnd) {
2033 HandleError(Error::JSON_PARSER_UNEXPECTED_ARRAY_END, token_start);
2034 return;
2035 }
2036 } else if (token != ArrayEnd) {
2037 // Unexpected value after list value. Bail out.
2038 HandleError(Error::JSON_PARSER_COMMA_OR_ARRAY_END_EXPECTED,
2039 token_start);
2040 return;
2041 }
2042 }
2043 handler_->HandleArrayEnd();
2044 break;
2045 }
2046 case ObjectBegin: {
2047 handler_->HandleMapBegin();
2048 start = token_end;
2049 token = ParseToken(start, end, &token_start, &token_end);
2050 while (token != ObjectEnd) {
2051 if (token != StringLiteral) {
2052 HandleError(Error::JSON_PARSER_STRING_LITERAL_EXPECTED,
2053 token_start);
2054 return;
2055 }
2056 std::vector<uint16_t> key;
2057 if (!DecodeString(token_start + 1, token_end - 1, &key)) {
2058 HandleError(Error::JSON_PARSER_INVALID_STRING, token_start);
2059 return;
2060 }
2061 handler_->HandleString16(span<uint16_t>(key.data(), key.size()));
2062 start = token_end;
2063
2064 token = ParseToken(start, end, &token_start, &token_end);
2065 if (token != ObjectPairSeparator) {
2066 HandleError(Error::JSON_PARSER_COLON_EXPECTED, token_start);
2067 return;
2068 }
2069 start = token_end;
2070
2071 ParseValue(start, end, &token_end, depth + 1);
2072 if (error_)
2073 return;
2074 start = token_end;
2075
2076 // After a key/value pair, we expect a comma or the end of the
2077 // object.
2078 token = ParseToken(start, end, &token_start, &token_end);
2079 if (token == ListSeparator) {
2080 start = token_end;
2081 token = ParseToken(start, end, &token_start, &token_end);
2082 if (token == ObjectEnd) {
2083 HandleError(Error::JSON_PARSER_UNEXPECTED_MAP_END, token_start);
2084 return;
2085 }
2086 } else if (token != ObjectEnd) {
2087 // Unexpected value after last object value. Bail out.
2088 HandleError(Error::JSON_PARSER_COMMA_OR_MAP_END_EXPECTED,
2089 token_start);
2090 return;
2091 }
2092 }
2093 handler_->HandleMapEnd();
2094 break;
2095 }
2096
2097 default:
2098 // We got a token that's not a value.
2099 HandleError(Error::JSON_PARSER_VALUE_EXPECTED, token_start);
2100 return;
2101 }
2102
2103 SkipWhitespaceAndComments(token_end, end, value_token_end);
2104 }
2105
HandleError(Error error,const Char * pos)2106 void HandleError(Error error, const Char* pos) {
2107 assert(error != Error::OK);
2108 if (!error_) {
2109 handler_->HandleError(
2110 Status{error, static_cast<size_t>(pos - start_pos_)});
2111 error_ = true;
2112 }
2113 }
2114
2115 const Char* start_pos_ = nullptr;
2116 bool error_ = false;
2117 const Platform* platform_;
2118 StreamingParserHandler* handler_;
2119 };
2120 } // namespace
2121
ParseJSON(const Platform & platform,span<uint8_t> chars,StreamingParserHandler * handler)2122 void ParseJSON(const Platform& platform,
2123 span<uint8_t> chars,
2124 StreamingParserHandler* handler) {
2125 JsonParser<uint8_t> parser(&platform, handler);
2126 parser.Parse(chars.data(), chars.size());
2127 }
2128
ParseJSON(const Platform & platform,span<uint16_t> chars,StreamingParserHandler * handler)2129 void ParseJSON(const Platform& platform,
2130 span<uint16_t> chars,
2131 StreamingParserHandler* handler) {
2132 JsonParser<uint16_t> parser(&platform, handler);
2133 parser.Parse(chars.data(), chars.size());
2134 }
2135
2136 // =============================================================================
2137 // json::ConvertCBORToJSON, json::ConvertJSONToCBOR - for transcoding
2138 // =============================================================================
2139 template <typename C>
ConvertCBORToJSONTmpl(const Platform & platform,span<uint8_t> cbor,C * json)2140 Status ConvertCBORToJSONTmpl(const Platform& platform,
2141 span<uint8_t> cbor,
2142 C* json) {
2143 Status status;
2144 std::unique_ptr<StreamingParserHandler> json_writer =
2145 NewJSONEncoder(&platform, json, &status);
2146 cbor::ParseCBOR(cbor, json_writer.get());
2147 return status;
2148 }
2149
ConvertCBORToJSON(const Platform & platform,span<uint8_t> cbor,std::vector<uint8_t> * json)2150 Status ConvertCBORToJSON(const Platform& platform,
2151 span<uint8_t> cbor,
2152 std::vector<uint8_t>* json) {
2153 return ConvertCBORToJSONTmpl(platform, cbor, json);
2154 }
ConvertCBORToJSON(const Platform & platform,span<uint8_t> cbor,std::string * json)2155 Status ConvertCBORToJSON(const Platform& platform,
2156 span<uint8_t> cbor,
2157 std::string* json) {
2158 return ConvertCBORToJSONTmpl(platform, cbor, json);
2159 }
2160
2161 template <typename T, typename C>
ConvertJSONToCBORTmpl(const Platform & platform,span<T> json,C * cbor)2162 Status ConvertJSONToCBORTmpl(const Platform& platform, span<T> json, C* cbor) {
2163 Status status;
2164 std::unique_ptr<StreamingParserHandler> encoder =
2165 cbor::NewCBOREncoder(cbor, &status);
2166 ParseJSON(platform, json, encoder.get());
2167 return status;
2168 }
ConvertJSONToCBOR(const Platform & platform,span<uint8_t> json,std::string * cbor)2169 Status ConvertJSONToCBOR(const Platform& platform,
2170 span<uint8_t> json,
2171 std::string* cbor) {
2172 return ConvertJSONToCBORTmpl(platform, json, cbor);
2173 }
ConvertJSONToCBOR(const Platform & platform,span<uint16_t> json,std::string * cbor)2174 Status ConvertJSONToCBOR(const Platform& platform,
2175 span<uint16_t> json,
2176 std::string* cbor) {
2177 return ConvertJSONToCBORTmpl(platform, json, cbor);
2178 }
ConvertJSONToCBOR(const Platform & platform,span<uint8_t> json,std::vector<uint8_t> * cbor)2179 Status ConvertJSONToCBOR(const Platform& platform,
2180 span<uint8_t> json,
2181 std::vector<uint8_t>* cbor) {
2182 return ConvertJSONToCBORTmpl(platform, json, cbor);
2183 }
ConvertJSONToCBOR(const Platform & platform,span<uint16_t> json,std::vector<uint8_t> * cbor)2184 Status ConvertJSONToCBOR(const Platform& platform,
2185 span<uint16_t> json,
2186 std::vector<uint8_t>* cbor) {
2187 return ConvertJSONToCBORTmpl(platform, json, cbor);
2188 }
2189 } // namespace json
2190 } // namespace v8_inspector_protocol_encoding
2191