1 // Copyright 2019 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "cbor.h"
6
7 #include <algorithm>
8 #include <cassert>
9 #include <cmath>
10 #include <cstring>
11 #include <limits>
12 #include <stack>
13
14 namespace v8_crdtp {
15 namespace cbor {
16 namespace {
// Layout of the CBOR "initial byte": the high-order 3 bits hold the major
// type, the low-order 5 bits hold the additional information.

// Number of bits the masked initial byte is shifted right by to bring the
// major type into the lowermost bits.
constexpr uint8_t kMajorTypeBitShift = 5;
// Selects the low-order 5 bits (additional information) of the initial byte.
constexpr uint8_t kAdditionalInformationMask = (1u << kMajorTypeBitShift) - 1;
// Selects the high-order 3 bits (major type) of the initial byte.
constexpr uint8_t kMajorTypeMask = 0xff & ~kAdditionalInformationMask;

// Additional information values announcing that the integer is stored in
// the 1, 2, 4 or 8 bytes following the initial byte.
constexpr uint8_t kAdditionalInformation1Byte = 24;
constexpr uint8_t kAdditionalInformation2Bytes = 25;
constexpr uint8_t kAdditionalInformation4Bytes = 26;
constexpr uint8_t kAdditionalInformation8Bytes = 27;
35
36 // Encodes the initial byte, consisting of the |type| in the first 3 bits
37 // followed by 5 bits of |additional_info|.
EncodeInitialByte(MajorType type,uint8_t additional_info)38 constexpr uint8_t EncodeInitialByte(MajorType type, uint8_t additional_info) {
39 return (static_cast<uint8_t>(type) << kMajorTypeBitShift) |
40 (additional_info & kAdditionalInformationMask);
41 }
42
43 // TAG 24 indicates that what follows is a byte string which is
44 // encoded in CBOR format. We use this as a wrapper for
45 // maps and arrays, allowing us to skip them, because the
46 // byte string carries its size (byte length).
47 // https://tools.ietf.org/html/rfc7049#section-2.4.4.1
48 static constexpr uint8_t kInitialByteForEnvelope =
49 EncodeInitialByte(MajorType::TAG, 24);
50 // The initial byte for a byte string with at most 2^32 bytes
51 // of payload. This is used for envelope encoding, even if
52 // the byte string is shorter.
53 static constexpr uint8_t kInitialByteFor32BitLengthByteString =
54 EncodeInitialByte(MajorType::BYTE_STRING, 26);
55
56 // See RFC 7049 Section 2.2.1, indefinite length arrays / maps have additional
57 // info = 31.
58 static constexpr uint8_t kInitialByteIndefiniteLengthArray =
59 EncodeInitialByte(MajorType::ARRAY, 31);
60 static constexpr uint8_t kInitialByteIndefiniteLengthMap =
61 EncodeInitialByte(MajorType::MAP, 31);
62 // See RFC 7049 Section 2.3, Table 1; this is used for finishing indefinite
63 // length maps / arrays.
64 static constexpr uint8_t kStopByte =
65 EncodeInitialByte(MajorType::SIMPLE_VALUE, 31);
66
67 // See RFC 7049 Section 2.3, Table 2.
68 static constexpr uint8_t kEncodedTrue =
69 EncodeInitialByte(MajorType::SIMPLE_VALUE, 21);
70 static constexpr uint8_t kEncodedFalse =
71 EncodeInitialByte(MajorType::SIMPLE_VALUE, 20);
72 static constexpr uint8_t kEncodedNull =
73 EncodeInitialByte(MajorType::SIMPLE_VALUE, 22);
74 static constexpr uint8_t kInitialByteForDouble =
75 EncodeInitialByte(MajorType::SIMPLE_VALUE, 27);
76
77 // See RFC 7049 Table 3 and Section 2.4.4.2. This is used as a prefix for
78 // arbitrary binary data encoded as BYTE_STRING.
79 static constexpr uint8_t kExpectedConversionToBase64Tag =
80 EncodeInitialByte(MajorType::TAG, 22);
81
// Appends the sizeof(T) bytes of |v| to |out| in big-endian order, i.e. the
// most significant byte is written first.
// See also: https://commandcenter.blogspot.com/2012/04/byte-order-fallacy.html
template <typename T>
void WriteBytesMostSignificantByteFirst(T v, std::vector<uint8_t>* out) {
  size_t bytes_left = sizeof(T);
  while (bytes_left-- > 0) {
    // |bytes_left| is now the index of the byte to emit, counted from the
    // least significant one.
    out->push_back(0xff & (v >> (bytes_left * 8)));
  }
}
89
90 // Extracts sizeof(T) bytes from |in| to extract a value of type T
91 // (e.g. uint64_t, uint32_t, ...), most significant byte first.
92 // See also: https://commandcenter.blogspot.com/2012/04/byte-order-fallacy.html
93 template <typename T>
ReadBytesMostSignificantByteFirst(span<uint8_t> in)94 T ReadBytesMostSignificantByteFirst(span<uint8_t> in) {
95 assert(in.size() >= sizeof(T));
96 T result = 0;
97 for (size_t shift_bytes = 0; shift_bytes < sizeof(T); ++shift_bytes)
98 result |= T(in[sizeof(T) - 1 - shift_bytes]) << (shift_bytes * 8);
99 return result;
100 }
101 } // namespace
102
103 namespace internals {
104 // Reads the start of a token with definitive size from |bytes|.
105 // |type| is the major type as specified in RFC 7049 Section 2.1.
106 // |value| is the payload (e.g. for MajorType::UNSIGNED) or is the size
107 // (e.g. for BYTE_STRING).
108 // If successful, returns the number of bytes read. Otherwise returns 0.
ReadTokenStart(span<uint8_t> bytes,MajorType * type,uint64_t * value)109 size_t ReadTokenStart(span<uint8_t> bytes, MajorType* type, uint64_t* value) {
110 if (bytes.empty())
111 return 0;
112 uint8_t initial_byte = bytes[0];
113 *type = MajorType((initial_byte & kMajorTypeMask) >> kMajorTypeBitShift);
114
115 uint8_t additional_information = initial_byte & kAdditionalInformationMask;
116 if (additional_information < 24) {
117 // Values 0-23 are encoded directly into the additional info of the
118 // initial byte.
119 *value = additional_information;
120 return 1;
121 }
122 if (additional_information == kAdditionalInformation1Byte) {
123 // Values 24-255 are encoded with one initial byte, followed by the value.
124 if (bytes.size() < 2)
125 return 0;
126 *value = ReadBytesMostSignificantByteFirst<uint8_t>(bytes.subspan(1));
127 return 2;
128 }
129 if (additional_information == kAdditionalInformation2Bytes) {
130 // Values 256-65535: 1 initial byte + 2 bytes payload.
131 if (bytes.size() < 1 + sizeof(uint16_t))
132 return 0;
133 *value = ReadBytesMostSignificantByteFirst<uint16_t>(bytes.subspan(1));
134 return 3;
135 }
136 if (additional_information == kAdditionalInformation4Bytes) {
137 // 32 bit uint: 1 initial byte + 4 bytes payload.
138 if (bytes.size() < 1 + sizeof(uint32_t))
139 return 0;
140 *value = ReadBytesMostSignificantByteFirst<uint32_t>(bytes.subspan(1));
141 return 5;
142 }
143 if (additional_information == kAdditionalInformation8Bytes) {
144 // 64 bit uint: 1 initial byte + 8 bytes payload.
145 if (bytes.size() < 1 + sizeof(uint64_t))
146 return 0;
147 *value = ReadBytesMostSignificantByteFirst<uint64_t>(bytes.subspan(1));
148 return 9;
149 }
150 return 0;
151 }
152
153 // Writes the start of a token with |type|. The |value| may indicate the size,
154 // or it may be the payload if the value is an unsigned integer.
WriteTokenStart(MajorType type,uint64_t value,std::vector<uint8_t> * encoded)155 void WriteTokenStart(MajorType type,
156 uint64_t value,
157 std::vector<uint8_t>* encoded) {
158 if (value < 24) {
159 // Values 0-23 are encoded directly into the additional info of the
160 // initial byte.
161 encoded->push_back(EncodeInitialByte(type, /*additional_info=*/value));
162 return;
163 }
164 if (value <= std::numeric_limits<uint8_t>::max()) {
165 // Values 24-255 are encoded with one initial byte, followed by the value.
166 encoded->push_back(EncodeInitialByte(type, kAdditionalInformation1Byte));
167 encoded->push_back(value);
168 return;
169 }
170 if (value <= std::numeric_limits<uint16_t>::max()) {
171 // Values 256-65535: 1 initial byte + 2 bytes payload.
172 encoded->push_back(EncodeInitialByte(type, kAdditionalInformation2Bytes));
173 WriteBytesMostSignificantByteFirst<uint16_t>(value, encoded);
174 return;
175 }
176 if (value <= std::numeric_limits<uint32_t>::max()) {
177 // 32 bit uint: 1 initial byte + 4 bytes payload.
178 encoded->push_back(EncodeInitialByte(type, kAdditionalInformation4Bytes));
179 WriteBytesMostSignificantByteFirst<uint32_t>(static_cast<uint32_t>(value),
180 encoded);
181 return;
182 }
183 // 64 bit uint: 1 initial byte + 8 bytes payload.
184 encoded->push_back(EncodeInitialByte(type, kAdditionalInformation8Bytes));
185 WriteBytesMostSignificantByteFirst<uint64_t>(value, encoded);
186 }
187 } // namespace internals
188
189 // =============================================================================
190 // Detecting CBOR content
191 // =============================================================================
192
InitialByteForEnvelope()193 uint8_t InitialByteForEnvelope() {
194 return kInitialByteForEnvelope;
195 }
196
InitialByteFor32BitLengthByteString()197 uint8_t InitialByteFor32BitLengthByteString() {
198 return kInitialByteFor32BitLengthByteString;
199 }
200
IsCBORMessage(span<uint8_t> msg)201 bool IsCBORMessage(span<uint8_t> msg) {
202 return msg.size() >= 6 && msg[0] == InitialByteForEnvelope() &&
203 msg[1] == InitialByteFor32BitLengthByteString();
204 }
205
CheckCBORMessage(span<uint8_t> msg)206 Status CheckCBORMessage(span<uint8_t> msg) {
207 if (msg.empty())
208 return Status(Error::CBOR_NO_INPUT, 0);
209 if (msg[0] != InitialByteForEnvelope())
210 return Status(Error::CBOR_INVALID_START_BYTE, 0);
211 if (msg.size() < 6 || msg[1] != InitialByteFor32BitLengthByteString())
212 return Status(Error::CBOR_INVALID_ENVELOPE, 1);
213 if (msg[2] == 0 && msg[3] == 0 && msg[4] == 0 && msg[5] == 0)
214 return Status(Error::CBOR_INVALID_ENVELOPE, 1);
215 if (msg.size() < 7 || msg[6] != EncodeIndefiniteLengthMapStart())
216 return Status(Error::CBOR_MAP_START_EXPECTED, 6);
217 return Status();
218 }
219
220 // =============================================================================
// Encoding individual CBOR items
222 // =============================================================================
223
EncodeTrue()224 uint8_t EncodeTrue() {
225 return kEncodedTrue;
226 }
227
EncodeFalse()228 uint8_t EncodeFalse() {
229 return kEncodedFalse;
230 }
231
EncodeNull()232 uint8_t EncodeNull() {
233 return kEncodedNull;
234 }
235
EncodeIndefiniteLengthArrayStart()236 uint8_t EncodeIndefiniteLengthArrayStart() {
237 return kInitialByteIndefiniteLengthArray;
238 }
239
EncodeIndefiniteLengthMapStart()240 uint8_t EncodeIndefiniteLengthMapStart() {
241 return kInitialByteIndefiniteLengthMap;
242 }
243
EncodeStop()244 uint8_t EncodeStop() {
245 return kStopByte;
246 }
247
EncodeInt32(int32_t value,std::vector<uint8_t> * out)248 void EncodeInt32(int32_t value, std::vector<uint8_t>* out) {
249 if (value >= 0) {
250 internals::WriteTokenStart(MajorType::UNSIGNED, value, out);
251 } else {
252 uint64_t representation = static_cast<uint64_t>(-(value + 1));
253 internals::WriteTokenStart(MajorType::NEGATIVE, representation, out);
254 }
255 }
256
EncodeString16(span<uint16_t> in,std::vector<uint8_t> * out)257 void EncodeString16(span<uint16_t> in, std::vector<uint8_t>* out) {
258 uint64_t byte_length = static_cast<uint64_t>(in.size_bytes());
259 internals::WriteTokenStart(MajorType::BYTE_STRING, byte_length, out);
260 // When emitting UTF16 characters, we always write the least significant byte
261 // first; this is because it's the native representation for X86.
262 // TODO(johannes): Implement a more efficient thing here later, e.g.
263 // casting *iff* the machine has this byte order.
264 // The wire format for UTF16 chars will probably remain the same
265 // (least significant byte first) since this way we can have
266 // golden files, unittests, etc. that port easily and universally.
267 // See also:
268 // https://commandcenter.blogspot.com/2012/04/byte-order-fallacy.html
269 for (const uint16_t two_bytes : in) {
270 out->push_back(two_bytes);
271 out->push_back(two_bytes >> 8);
272 }
273 }
274
EncodeString8(span<uint8_t> in,std::vector<uint8_t> * out)275 void EncodeString8(span<uint8_t> in, std::vector<uint8_t>* out) {
276 internals::WriteTokenStart(MajorType::STRING,
277 static_cast<uint64_t>(in.size_bytes()), out);
278 out->insert(out->end(), in.begin(), in.end());
279 }
280
EncodeFromLatin1(span<uint8_t> latin1,std::vector<uint8_t> * out)281 void EncodeFromLatin1(span<uint8_t> latin1, std::vector<uint8_t>* out) {
282 for (size_t ii = 0; ii < latin1.size(); ++ii) {
283 if (latin1[ii] <= 127)
284 continue;
285 // If there's at least one non-ASCII char, convert to UTF8.
286 std::vector<uint8_t> utf8(latin1.begin(), latin1.begin() + ii);
287 for (; ii < latin1.size(); ++ii) {
288 if (latin1[ii] <= 127) {
289 utf8.push_back(latin1[ii]);
290 } else {
291 // 0xC0 means it's a UTF8 sequence with 2 bytes.
292 utf8.push_back((latin1[ii] >> 6) | 0xc0);
293 utf8.push_back((latin1[ii] | 0x80) & 0xbf);
294 }
295 }
296 EncodeString8(SpanFrom(utf8), out);
297 return;
298 }
299 EncodeString8(latin1, out);
300 }
301
EncodeFromUTF16(span<uint16_t> utf16,std::vector<uint8_t> * out)302 void EncodeFromUTF16(span<uint16_t> utf16, std::vector<uint8_t>* out) {
303 // If there's at least one non-ASCII char, encode as STRING16 (UTF16).
304 for (uint16_t ch : utf16) {
305 if (ch <= 127)
306 continue;
307 EncodeString16(utf16, out);
308 return;
309 }
310 // It's all US-ASCII, strip out every second byte and encode as UTF8.
311 internals::WriteTokenStart(MajorType::STRING,
312 static_cast<uint64_t>(utf16.size()), out);
313 out->insert(out->end(), utf16.begin(), utf16.end());
314 }
315
EncodeBinary(span<uint8_t> in,std::vector<uint8_t> * out)316 void EncodeBinary(span<uint8_t> in, std::vector<uint8_t>* out) {
317 out->push_back(kExpectedConversionToBase64Tag);
318 uint64_t byte_length = static_cast<uint64_t>(in.size_bytes());
319 internals::WriteTokenStart(MajorType::BYTE_STRING, byte_length, out);
320 out->insert(out->end(), in.begin(), in.end());
321 }
322
// Size of an encoded double: the initial byte (kInitialByteForDouble)
// followed by the 64 bits of payload for its value.
constexpr size_t kEncodedDoubleSize = sizeof(uint8_t) + sizeof(uint64_t);

// Size of an envelope header: the initial byte (kInitialByteForEnvelope),
// the start byte of a BYTE_STRING with a 32 bit wide length, and the 32 bit
// length of that string.
constexpr size_t kEncodedEnvelopeHeaderSize =
    2 * sizeof(uint8_t) + sizeof(uint32_t);
331
EncodeDouble(double value,std::vector<uint8_t> * out)332 void EncodeDouble(double value, std::vector<uint8_t>* out) {
333 // The additional_info=27 indicates 64 bits for the double follow.
334 // See RFC 7049 Section 2.3, Table 1.
335 out->push_back(kInitialByteForDouble);
336 union {
337 double from_double;
338 uint64_t to_uint64;
339 } reinterpret;
340 reinterpret.from_double = value;
341 WriteBytesMostSignificantByteFirst<uint64_t>(reinterpret.to_uint64, out);
342 }
343
344 // =============================================================================
345 // cbor::EnvelopeEncoder - for wrapping submessages
346 // =============================================================================
347
EncodeStart(std::vector<uint8_t> * out)348 void EnvelopeEncoder::EncodeStart(std::vector<uint8_t>* out) {
349 assert(byte_size_pos_ == 0);
350 out->push_back(kInitialByteForEnvelope);
351 out->push_back(kInitialByteFor32BitLengthByteString);
352 byte_size_pos_ = out->size();
353 out->resize(out->size() + sizeof(uint32_t));
354 }
355
EncodeStop(std::vector<uint8_t> * out)356 bool EnvelopeEncoder::EncodeStop(std::vector<uint8_t>* out) {
357 assert(byte_size_pos_ != 0);
358 // The byte size is the size of the payload, that is, all the
359 // bytes that were written past the byte size position itself.
360 uint64_t byte_size = out->size() - (byte_size_pos_ + sizeof(uint32_t));
361 // We store exactly 4 bytes, so at most INT32MAX, with most significant
362 // byte first.
363 if (byte_size > std::numeric_limits<uint32_t>::max())
364 return false;
365 for (int shift_bytes = sizeof(uint32_t) - 1; shift_bytes >= 0;
366 --shift_bytes) {
367 (*out)[byte_size_pos_++] = 0xff & (byte_size >> (shift_bytes * 8));
368 }
369 return true;
370 }
371
372 // =============================================================================
373 // cbor::NewCBOREncoder - for encoding from a streaming parser
374 // =============================================================================
375
376 namespace {
377 class CBOREncoder : public ParserHandler {
378 public:
CBOREncoder(std::vector<uint8_t> * out,Status * status)379 CBOREncoder(std::vector<uint8_t>* out, Status* status)
380 : out_(out), status_(status) {
381 *status_ = Status();
382 }
383
HandleMapBegin()384 void HandleMapBegin() override {
385 if (!status_->ok())
386 return;
387 envelopes_.emplace_back();
388 envelopes_.back().EncodeStart(out_);
389 out_->push_back(kInitialByteIndefiniteLengthMap);
390 }
391
HandleMapEnd()392 void HandleMapEnd() override {
393 if (!status_->ok())
394 return;
395 out_->push_back(kStopByte);
396 assert(!envelopes_.empty());
397 if (!envelopes_.back().EncodeStop(out_)) {
398 HandleError(
399 Status(Error::CBOR_ENVELOPE_SIZE_LIMIT_EXCEEDED, out_->size()));
400 return;
401 }
402 envelopes_.pop_back();
403 }
404
HandleArrayBegin()405 void HandleArrayBegin() override {
406 if (!status_->ok())
407 return;
408 envelopes_.emplace_back();
409 envelopes_.back().EncodeStart(out_);
410 out_->push_back(kInitialByteIndefiniteLengthArray);
411 }
412
HandleArrayEnd()413 void HandleArrayEnd() override {
414 if (!status_->ok())
415 return;
416 out_->push_back(kStopByte);
417 assert(!envelopes_.empty());
418 if (!envelopes_.back().EncodeStop(out_)) {
419 HandleError(
420 Status(Error::CBOR_ENVELOPE_SIZE_LIMIT_EXCEEDED, out_->size()));
421 return;
422 }
423 envelopes_.pop_back();
424 }
425
HandleString8(span<uint8_t> chars)426 void HandleString8(span<uint8_t> chars) override {
427 if (!status_->ok())
428 return;
429 EncodeString8(chars, out_);
430 }
431
HandleString16(span<uint16_t> chars)432 void HandleString16(span<uint16_t> chars) override {
433 if (!status_->ok())
434 return;
435 EncodeFromUTF16(chars, out_);
436 }
437
HandleBinary(span<uint8_t> bytes)438 void HandleBinary(span<uint8_t> bytes) override {
439 if (!status_->ok())
440 return;
441 EncodeBinary(bytes, out_);
442 }
443
HandleDouble(double value)444 void HandleDouble(double value) override {
445 if (!status_->ok())
446 return;
447 EncodeDouble(value, out_);
448 }
449
HandleInt32(int32_t value)450 void HandleInt32(int32_t value) override {
451 if (!status_->ok())
452 return;
453 EncodeInt32(value, out_);
454 }
455
HandleBool(bool value)456 void HandleBool(bool value) override {
457 if (!status_->ok())
458 return;
459 // See RFC 7049 Section 2.3, Table 2.
460 out_->push_back(value ? kEncodedTrue : kEncodedFalse);
461 }
462
HandleNull()463 void HandleNull() override {
464 if (!status_->ok())
465 return;
466 // See RFC 7049 Section 2.3, Table 2.
467 out_->push_back(kEncodedNull);
468 }
469
HandleError(Status error)470 void HandleError(Status error) override {
471 if (!status_->ok())
472 return;
473 *status_ = error;
474 out_->clear();
475 }
476
477 private:
478 std::vector<uint8_t>* out_;
479 std::vector<EnvelopeEncoder> envelopes_;
480 Status* status_;
481 };
482 } // namespace
483
NewCBOREncoder(std::vector<uint8_t> * out,Status * status)484 std::unique_ptr<ParserHandler> NewCBOREncoder(std::vector<uint8_t>* out,
485 Status* status) {
486 return std::unique_ptr<ParserHandler>(new CBOREncoder(out, status));
487 }
488
489 // =============================================================================
490 // cbor::CBORTokenizer - for parsing individual CBOR items
491 // =============================================================================
492
CBORTokenizer(span<uint8_t> bytes)493 CBORTokenizer::CBORTokenizer(span<uint8_t> bytes) : bytes_(bytes) {
494 ReadNextToken(/*enter_envelope=*/false);
495 }
496
~CBORTokenizer()497 CBORTokenizer::~CBORTokenizer() {}
498
TokenTag() const499 CBORTokenTag CBORTokenizer::TokenTag() const {
500 return token_tag_;
501 }
502
Next()503 void CBORTokenizer::Next() {
504 if (token_tag_ == CBORTokenTag::ERROR_VALUE ||
505 token_tag_ == CBORTokenTag::DONE)
506 return;
507 ReadNextToken(/*enter_envelope=*/false);
508 }
509
EnterEnvelope()510 void CBORTokenizer::EnterEnvelope() {
511 assert(token_tag_ == CBORTokenTag::ENVELOPE);
512 ReadNextToken(/*enter_envelope=*/true);
513 }
514
Status() const515 Status CBORTokenizer::Status() const {
516 return status_;
517 }
518
519 // The following accessor functions ::GetInt32, ::GetDouble,
520 // ::GetString8, ::GetString16WireRep, ::GetBinary, ::GetEnvelopeContents
521 // assume that a particular token was recognized in ::ReadNextToken.
522 // That's where all the error checking is done. By design,
523 // the accessors (assuming the token was recognized) never produce
524 // an error.
525
GetInt32() const526 int32_t CBORTokenizer::GetInt32() const {
527 assert(token_tag_ == CBORTokenTag::INT32);
528 // The range checks happen in ::ReadNextToken().
529 return static_cast<int32_t>(
530 token_start_type_ == MajorType::UNSIGNED
531 ? token_start_internal_value_
532 : -static_cast<int64_t>(token_start_internal_value_) - 1);
533 }
534
GetDouble() const535 double CBORTokenizer::GetDouble() const {
536 assert(token_tag_ == CBORTokenTag::DOUBLE);
537 union {
538 uint64_t from_uint64;
539 double to_double;
540 } reinterpret;
541 reinterpret.from_uint64 = ReadBytesMostSignificantByteFirst<uint64_t>(
542 bytes_.subspan(status_.pos + 1));
543 return reinterpret.to_double;
544 }
545
GetString8() const546 span<uint8_t> CBORTokenizer::GetString8() const {
547 assert(token_tag_ == CBORTokenTag::STRING8);
548 auto length = static_cast<size_t>(token_start_internal_value_);
549 return bytes_.subspan(status_.pos + (token_byte_length_ - length), length);
550 }
551
GetString16WireRep() const552 span<uint8_t> CBORTokenizer::GetString16WireRep() const {
553 assert(token_tag_ == CBORTokenTag::STRING16);
554 auto length = static_cast<size_t>(token_start_internal_value_);
555 return bytes_.subspan(status_.pos + (token_byte_length_ - length), length);
556 }
557
GetBinary() const558 span<uint8_t> CBORTokenizer::GetBinary() const {
559 assert(token_tag_ == CBORTokenTag::BINARY);
560 auto length = static_cast<size_t>(token_start_internal_value_);
561 return bytes_.subspan(status_.pos + (token_byte_length_ - length), length);
562 }
563
GetEnvelope() const564 span<uint8_t> CBORTokenizer::GetEnvelope() const {
565 assert(token_tag_ == CBORTokenTag::ENVELOPE);
566 auto length = static_cast<size_t>(token_start_internal_value_);
567 return bytes_.subspan(status_.pos, length + kEncodedEnvelopeHeaderSize);
568 }
569
GetEnvelopeContents() const570 span<uint8_t> CBORTokenizer::GetEnvelopeContents() const {
571 assert(token_tag_ == CBORTokenTag::ENVELOPE);
572 auto length = static_cast<size_t>(token_start_internal_value_);
573 return bytes_.subspan(status_.pos + kEncodedEnvelopeHeaderSize, length);
574 }
575
// All error checking happens in ::ReadNextToken, so that the accessors
// can avoid having to carry an error return value.
//
// About the encoded lengths of strings, arrays, etc.: on the wire, CBOR
// uses 1, 2, 4 and 8 byte unsigned integers, so they are first read as
// uint64_t, usually into token_start_internal_value_.
//
// These containers also have a representation on the machine, so size
// computations on the input byte array, output spans (e.g. the payload of a
// string), etc. are done in size_t, which is machine specific (in practice
// either 32 bit or 64 bit).
//
// size_t must not overflow. kMaxValidLength is therefore used to:
// - Reject values larger than the architecture specific max size_t
//   (differs between 32 bit and 64 bit arch).
// - Keep headroom in uint64_t (the top two bits are always clear) so that
//   overflow can be detected when adding lengths (array / string length /
//   etc.): the inputs to an addition are checked to be <= kMaxValidLength,
//   and then the sum is checked.
//
// See also
// https://chromium.googlesource.com/chromium/src/+/master/docs/security/integer-semantics.md
static const uint64_t kMaxValidLength = std::min(
    static_cast<uint64_t>(std::numeric_limits<uint64_t>::max() >> 2),
    static_cast<uint64_t>(std::numeric_limits<size_t>::max()));
602
ReadNextToken(bool enter_envelope)603 void CBORTokenizer::ReadNextToken(bool enter_envelope) {
604 if (enter_envelope) {
605 status_.pos += kEncodedEnvelopeHeaderSize;
606 } else {
607 status_.pos =
608 status_.pos == Status::npos() ? 0 : status_.pos + token_byte_length_;
609 }
610 status_.error = Error::OK;
611 if (status_.pos >= bytes_.size()) {
612 token_tag_ = CBORTokenTag::DONE;
613 return;
614 }
615 const size_t remaining_bytes = bytes_.size() - status_.pos;
616 switch (bytes_[status_.pos]) {
617 case kStopByte:
618 SetToken(CBORTokenTag::STOP, 1);
619 return;
620 case kInitialByteIndefiniteLengthMap:
621 SetToken(CBORTokenTag::MAP_START, 1);
622 return;
623 case kInitialByteIndefiniteLengthArray:
624 SetToken(CBORTokenTag::ARRAY_START, 1);
625 return;
626 case kEncodedTrue:
627 SetToken(CBORTokenTag::TRUE_VALUE, 1);
628 return;
629 case kEncodedFalse:
630 SetToken(CBORTokenTag::FALSE_VALUE, 1);
631 return;
632 case kEncodedNull:
633 SetToken(CBORTokenTag::NULL_VALUE, 1);
634 return;
635 case kExpectedConversionToBase64Tag: { // BINARY
636 const size_t bytes_read = internals::ReadTokenStart(
637 bytes_.subspan(status_.pos + 1), &token_start_type_,
638 &token_start_internal_value_);
639 if (!bytes_read || token_start_type_ != MajorType::BYTE_STRING ||
640 token_start_internal_value_ > kMaxValidLength) {
641 SetError(Error::CBOR_INVALID_BINARY);
642 return;
643 }
644 const uint64_t token_byte_length = token_start_internal_value_ +
645 /* tag before token start: */ 1 +
646 /* token start: */ bytes_read;
647 if (token_byte_length > remaining_bytes) {
648 SetError(Error::CBOR_INVALID_BINARY);
649 return;
650 }
651 SetToken(CBORTokenTag::BINARY, static_cast<size_t>(token_byte_length));
652 return;
653 }
654 case kInitialByteForDouble: { // DOUBLE
655 if (kEncodedDoubleSize > remaining_bytes) {
656 SetError(Error::CBOR_INVALID_DOUBLE);
657 return;
658 }
659 SetToken(CBORTokenTag::DOUBLE, kEncodedDoubleSize);
660 return;
661 }
662 case kInitialByteForEnvelope: { // ENVELOPE
663 if (kEncodedEnvelopeHeaderSize > remaining_bytes) {
664 SetError(Error::CBOR_INVALID_ENVELOPE);
665 return;
666 }
667 // The envelope must be a byte string with 32 bit length.
668 if (bytes_[status_.pos + 1] != kInitialByteFor32BitLengthByteString) {
669 SetError(Error::CBOR_INVALID_ENVELOPE);
670 return;
671 }
672 // Read the length of the byte string.
673 token_start_internal_value_ = ReadBytesMostSignificantByteFirst<uint32_t>(
674 bytes_.subspan(status_.pos + 2));
675 if (token_start_internal_value_ > kMaxValidLength) {
676 SetError(Error::CBOR_INVALID_ENVELOPE);
677 return;
678 }
679 uint64_t token_byte_length =
680 token_start_internal_value_ + kEncodedEnvelopeHeaderSize;
681 if (token_byte_length > remaining_bytes) {
682 SetError(Error::CBOR_INVALID_ENVELOPE);
683 return;
684 }
685 SetToken(CBORTokenTag::ENVELOPE, static_cast<size_t>(token_byte_length));
686 return;
687 }
688 default: {
689 const size_t bytes_read = internals::ReadTokenStart(
690 bytes_.subspan(status_.pos), &token_start_type_,
691 &token_start_internal_value_);
692 switch (token_start_type_) {
693 case MajorType::UNSIGNED: // INT32.
694 // INT32 is a signed int32 (int32 makes sense for the
695 // inspector protocol, it's not a CBOR limitation), so we check
696 // against the signed max, so that the allowable values are
697 // 0, 1, 2, ... 2^31 - 1.
698 if (!bytes_read ||
699 static_cast<uint64_t>(std::numeric_limits<int32_t>::max()) <
700 static_cast<uint64_t>(token_start_internal_value_)) {
701 SetError(Error::CBOR_INVALID_INT32);
702 return;
703 }
704 SetToken(CBORTokenTag::INT32, bytes_read);
705 return;
706 case MajorType::NEGATIVE: { // INT32.
707 // INT32 is a signed int32 (int32 makes sense for the
708 // inspector protocol, it's not a CBOR limitation); in CBOR, the
709 // negative values for INT32 are represented as NEGATIVE, that is, -1
710 // INT32 is represented as 1 << 5 | 0 (major type 1, additional info
711 // value 0).
712 // The represented allowed values range is -1 to -2^31.
713 // They are mapped into the encoded range of 0 to 2^31-1.
714 // We check the payload in token_start_internal_value_ against
715 // that range (2^31-1 is also known as
716 // std::numeric_limits<int32_t>::max()).
717 if (!bytes_read ||
718 static_cast<uint64_t>(token_start_internal_value_) >
719 static_cast<uint64_t>(std::numeric_limits<int32_t>::max())) {
720 SetError(Error::CBOR_INVALID_INT32);
721 return;
722 }
723 SetToken(CBORTokenTag::INT32, bytes_read);
724 return;
725 }
726 case MajorType::STRING: { // STRING8.
727 if (!bytes_read || token_start_internal_value_ > kMaxValidLength) {
728 SetError(Error::CBOR_INVALID_STRING8);
729 return;
730 }
731 uint64_t token_byte_length = token_start_internal_value_ + bytes_read;
732 if (token_byte_length > remaining_bytes) {
733 SetError(Error::CBOR_INVALID_STRING8);
734 return;
735 }
736 SetToken(CBORTokenTag::STRING8,
737 static_cast<size_t>(token_byte_length));
738 return;
739 }
740 case MajorType::BYTE_STRING: { // STRING16.
741 // Length must be divisible by 2 since UTF16 is 2 bytes per
742 // character, hence the &1 check.
743 if (!bytes_read || token_start_internal_value_ > kMaxValidLength ||
744 token_start_internal_value_ & 1) {
745 SetError(Error::CBOR_INVALID_STRING16);
746 return;
747 }
748 uint64_t token_byte_length = token_start_internal_value_ + bytes_read;
749 if (token_byte_length > remaining_bytes) {
750 SetError(Error::CBOR_INVALID_STRING16);
751 return;
752 }
753 SetToken(CBORTokenTag::STRING16,
754 static_cast<size_t>(token_byte_length));
755 return;
756 }
757 case MajorType::ARRAY:
758 case MajorType::MAP:
759 case MajorType::TAG:
760 case MajorType::SIMPLE_VALUE:
761 SetError(Error::CBOR_UNSUPPORTED_VALUE);
762 return;
763 }
764 }
765 }
766 }
767
SetToken(CBORTokenTag token_tag,size_t token_byte_length)768 void CBORTokenizer::SetToken(CBORTokenTag token_tag, size_t token_byte_length) {
769 token_tag_ = token_tag;
770 token_byte_length_ = token_byte_length;
771 }
772
SetError(Error error)773 void CBORTokenizer::SetError(Error error) {
774 token_tag_ = CBORTokenTag::ERROR_VALUE;
775 status_.error = error;
776 }
777
778 // =============================================================================
779 // cbor::ParseCBOR - for receiving streaming parser events for CBOR messages
780 // =============================================================================
781
782 namespace {
783 // When parsing CBOR, we limit recursion depth for objects and arrays
784 // to this constant.
785 static constexpr int kStackLimit = 300;
786
787 // Below are three parsing routines for CBOR, which cover enough
788 // to roundtrip JSON messages.
789 bool ParseMap(int32_t stack_depth,
790 CBORTokenizer* tokenizer,
791 ParserHandler* out);
792 bool ParseArray(int32_t stack_depth,
793 CBORTokenizer* tokenizer,
794 ParserHandler* out);
795 bool ParseValue(int32_t stack_depth,
796 CBORTokenizer* tokenizer,
797 ParserHandler* out);
798 bool ParseEnvelope(int32_t stack_depth,
799 CBORTokenizer* tokenizer,
800 ParserHandler* out);
801
ParseUTF16String(CBORTokenizer * tokenizer,ParserHandler * out)802 void ParseUTF16String(CBORTokenizer* tokenizer, ParserHandler* out) {
803 std::vector<uint16_t> value;
804 span<uint8_t> rep = tokenizer->GetString16WireRep();
805 for (size_t ii = 0; ii < rep.size(); ii += 2)
806 value.push_back((rep[ii + 1] << 8) | rep[ii]);
807 out->HandleString16(span<uint16_t>(value.data(), value.size()));
808 tokenizer->Next();
809 }
810
ParseUTF8String(CBORTokenizer * tokenizer,ParserHandler * out)811 bool ParseUTF8String(CBORTokenizer* tokenizer, ParserHandler* out) {
812 assert(tokenizer->TokenTag() == CBORTokenTag::STRING8);
813 out->HandleString8(tokenizer->GetString8());
814 tokenizer->Next();
815 return true;
816 }
817
ParseEnvelope(int32_t stack_depth,CBORTokenizer * tokenizer,ParserHandler * out)818 bool ParseEnvelope(int32_t stack_depth,
819 CBORTokenizer* tokenizer,
820 ParserHandler* out) {
821 assert(tokenizer->TokenTag() == CBORTokenTag::ENVELOPE);
822 // Before we enter the envelope, we save the position that we
823 // expect to see after we're done parsing the envelope contents.
824 // This way we can compare and produce an error if the contents
825 // didn't fit exactly into the envelope length.
826 size_t pos_past_envelope = tokenizer->Status().pos +
827 kEncodedEnvelopeHeaderSize +
828 tokenizer->GetEnvelopeContents().size();
829 tokenizer->EnterEnvelope();
830 switch (tokenizer->TokenTag()) {
831 case CBORTokenTag::ERROR_VALUE:
832 out->HandleError(tokenizer->Status());
833 return false;
834 case CBORTokenTag::MAP_START:
835 if (!ParseMap(stack_depth + 1, tokenizer, out))
836 return false;
837 break; // Continue to check pos_past_envelope below.
838 case CBORTokenTag::ARRAY_START:
839 if (!ParseArray(stack_depth + 1, tokenizer, out))
840 return false;
841 break; // Continue to check pos_past_envelope below.
842 default:
843 out->HandleError(Status{Error::CBOR_MAP_OR_ARRAY_EXPECTED_IN_ENVELOPE,
844 tokenizer->Status().pos});
845 return false;
846 }
847 // The contents of the envelope parsed OK, now check that we're at
848 // the expected position.
849 if (pos_past_envelope != tokenizer->Status().pos) {
850 out->HandleError(Status{Error::CBOR_ENVELOPE_CONTENTS_LENGTH_MISMATCH,
851 tokenizer->Status().pos});
852 return false;
853 }
854 return true;
855 }
856
ParseValue(int32_t stack_depth,CBORTokenizer * tokenizer,ParserHandler * out)857 bool ParseValue(int32_t stack_depth,
858 CBORTokenizer* tokenizer,
859 ParserHandler* out) {
860 if (stack_depth > kStackLimit) {
861 out->HandleError(
862 Status{Error::CBOR_STACK_LIMIT_EXCEEDED, tokenizer->Status().pos});
863 return false;
864 }
865 switch (tokenizer->TokenTag()) {
866 case CBORTokenTag::ERROR_VALUE:
867 out->HandleError(tokenizer->Status());
868 return false;
869 case CBORTokenTag::DONE:
870 out->HandleError(Status{Error::CBOR_UNEXPECTED_EOF_EXPECTED_VALUE,
871 tokenizer->Status().pos});
872 return false;
873 case CBORTokenTag::ENVELOPE:
874 return ParseEnvelope(stack_depth, tokenizer, out);
875 case CBORTokenTag::TRUE_VALUE:
876 out->HandleBool(true);
877 tokenizer->Next();
878 return true;
879 case CBORTokenTag::FALSE_VALUE:
880 out->HandleBool(false);
881 tokenizer->Next();
882 return true;
883 case CBORTokenTag::NULL_VALUE:
884 out->HandleNull();
885 tokenizer->Next();
886 return true;
887 case CBORTokenTag::INT32:
888 out->HandleInt32(tokenizer->GetInt32());
889 tokenizer->Next();
890 return true;
891 case CBORTokenTag::DOUBLE:
892 out->HandleDouble(tokenizer->GetDouble());
893 tokenizer->Next();
894 return true;
895 case CBORTokenTag::STRING8:
896 return ParseUTF8String(tokenizer, out);
897 case CBORTokenTag::STRING16:
898 ParseUTF16String(tokenizer, out);
899 return true;
900 case CBORTokenTag::BINARY: {
901 out->HandleBinary(tokenizer->GetBinary());
902 tokenizer->Next();
903 return true;
904 }
905 case CBORTokenTag::MAP_START:
906 return ParseMap(stack_depth + 1, tokenizer, out);
907 case CBORTokenTag::ARRAY_START:
908 return ParseArray(stack_depth + 1, tokenizer, out);
909 default:
910 out->HandleError(
911 Status{Error::CBOR_UNSUPPORTED_VALUE, tokenizer->Status().pos});
912 return false;
913 }
914 }
915
916 // |bytes| must start with the indefinite length array byte, so basically,
917 // ParseArray may only be called after an indefinite length array has been
918 // detected.
ParseArray(int32_t stack_depth,CBORTokenizer * tokenizer,ParserHandler * out)919 bool ParseArray(int32_t stack_depth,
920 CBORTokenizer* tokenizer,
921 ParserHandler* out) {
922 assert(tokenizer->TokenTag() == CBORTokenTag::ARRAY_START);
923 tokenizer->Next();
924 out->HandleArrayBegin();
925 while (tokenizer->TokenTag() != CBORTokenTag::STOP) {
926 if (tokenizer->TokenTag() == CBORTokenTag::DONE) {
927 out->HandleError(
928 Status{Error::CBOR_UNEXPECTED_EOF_IN_ARRAY, tokenizer->Status().pos});
929 return false;
930 }
931 if (tokenizer->TokenTag() == CBORTokenTag::ERROR_VALUE) {
932 out->HandleError(tokenizer->Status());
933 return false;
934 }
935 // Parse value.
936 if (!ParseValue(stack_depth, tokenizer, out))
937 return false;
938 }
939 out->HandleArrayEnd();
940 tokenizer->Next();
941 return true;
942 }
943
944 // |bytes| must start with the indefinite length array byte, so basically,
945 // ParseArray may only be called after an indefinite length array has been
946 // detected.
ParseMap(int32_t stack_depth,CBORTokenizer * tokenizer,ParserHandler * out)947 bool ParseMap(int32_t stack_depth,
948 CBORTokenizer* tokenizer,
949 ParserHandler* out) {
950 assert(tokenizer->TokenTag() == CBORTokenTag::MAP_START);
951 out->HandleMapBegin();
952 tokenizer->Next();
953 while (tokenizer->TokenTag() != CBORTokenTag::STOP) {
954 if (tokenizer->TokenTag() == CBORTokenTag::DONE) {
955 out->HandleError(
956 Status{Error::CBOR_UNEXPECTED_EOF_IN_MAP, tokenizer->Status().pos});
957 return false;
958 }
959 if (tokenizer->TokenTag() == CBORTokenTag::ERROR_VALUE) {
960 out->HandleError(tokenizer->Status());
961 return false;
962 }
963 // Parse key.
964 if (tokenizer->TokenTag() == CBORTokenTag::STRING8) {
965 if (!ParseUTF8String(tokenizer, out))
966 return false;
967 } else if (tokenizer->TokenTag() == CBORTokenTag::STRING16) {
968 ParseUTF16String(tokenizer, out);
969 } else {
970 out->HandleError(
971 Status{Error::CBOR_INVALID_MAP_KEY, tokenizer->Status().pos});
972 return false;
973 }
974 // Parse value.
975 if (!ParseValue(stack_depth, tokenizer, out))
976 return false;
977 }
978 out->HandleMapEnd();
979 tokenizer->Next();
980 return true;
981 }
982 } // namespace
983
ParseCBOR(span<uint8_t> bytes,ParserHandler * out)984 void ParseCBOR(span<uint8_t> bytes, ParserHandler* out) {
985 if (bytes.empty()) {
986 out->HandleError(Status{Error::CBOR_NO_INPUT, 0});
987 return;
988 }
989 CBORTokenizer tokenizer(bytes);
990 if (tokenizer.TokenTag() == CBORTokenTag::ERROR_VALUE) {
991 out->HandleError(tokenizer.Status());
992 return;
993 }
994 if (!ParseValue(/*stack_depth=*/0, &tokenizer, out))
995 return;
996 if (tokenizer.TokenTag() == CBORTokenTag::DONE)
997 return;
998 if (tokenizer.TokenTag() == CBORTokenTag::ERROR_VALUE) {
999 out->HandleError(tokenizer.Status());
1000 return;
1001 }
1002 out->HandleError(Status{Error::CBOR_TRAILING_JUNK, tokenizer.Status().pos});
1003 }
1004
1005 // =============================================================================
// cbor::AppendString8EntryToCBORMap - for limited in-place editing of messages
1007 // =============================================================================
1008
AppendString8EntryToCBORMap(span<uint8_t> string8_key,span<uint8_t> string8_value,std::vector<uint8_t> * cbor)1009 Status AppendString8EntryToCBORMap(span<uint8_t> string8_key,
1010 span<uint8_t> string8_value,
1011 std::vector<uint8_t>* cbor) {
1012 // Careful below: Don't compare (*cbor)[idx] with a uint8_t, since
1013 // it could be a char (signed!). Instead, use bytes.
1014 span<uint8_t> bytes(reinterpret_cast<const uint8_t*>(cbor->data()),
1015 cbor->size());
1016 CBORTokenizer tokenizer(bytes);
1017 if (tokenizer.TokenTag() == CBORTokenTag::ERROR_VALUE)
1018 return tokenizer.Status();
1019 if (tokenizer.TokenTag() != CBORTokenTag::ENVELOPE)
1020 return Status(Error::CBOR_INVALID_ENVELOPE, 0);
1021 size_t envelope_size = tokenizer.GetEnvelopeContents().size();
1022 size_t old_size = cbor->size();
1023 if (old_size != envelope_size + kEncodedEnvelopeHeaderSize)
1024 return Status(Error::CBOR_INVALID_ENVELOPE, 0);
1025 if (envelope_size == 0 ||
1026 (tokenizer.GetEnvelopeContents()[0] != EncodeIndefiniteLengthMapStart()))
1027 return Status(Error::CBOR_MAP_START_EXPECTED, kEncodedEnvelopeHeaderSize);
1028 if (bytes[bytes.size() - 1] != EncodeStop())
1029 return Status(Error::CBOR_MAP_STOP_EXPECTED, cbor->size() - 1);
1030 cbor->pop_back();
1031 EncodeString8(string8_key, cbor);
1032 EncodeString8(string8_value, cbor);
1033 cbor->push_back(EncodeStop());
1034 size_t new_envelope_size = envelope_size + (cbor->size() - old_size);
1035 if (new_envelope_size > std::numeric_limits<uint32_t>::max())
1036 return Status(Error::CBOR_ENVELOPE_SIZE_LIMIT_EXCEEDED, 0);
1037 size_t size_pos = cbor->size() - new_envelope_size - sizeof(uint32_t);
1038 uint8_t* out = reinterpret_cast<uint8_t*>(&cbor->at(size_pos));
1039 *(out++) = (new_envelope_size >> 24) & 0xff;
1040 *(out++) = (new_envelope_size >> 16) & 0xff;
1041 *(out++) = (new_envelope_size >> 8) & 0xff;
1042 *(out) = new_envelope_size & 0xff;
1043 return Status();
1044 }
1045 } // namespace cbor
1046 } // namespace v8_crdtp
1047