• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "cbor.h"
6 
7 #include <algorithm>
8 #include <cassert>
9 #include <cmath>
10 #include <cstring>
11 #include <limits>
12 #include <stack>
13 
14 namespace v8_crdtp {
15 namespace cbor {
16 namespace {
// The CBOR "initial byte" carries the major type in its high-order 3 bits and
// the additional information in its low-order 5 bits. The constants below
// describe that layout.

// Number of bits the initial byte must be shifted to the right, after
// masking with |kMajorTypeMask|, so that the major type ends up in the
// lowermost bits.
constexpr uint8_t kMajorTypeBitShift = 5;
// Selects the high-order 3 bits of the initial byte (the major type).
constexpr uint8_t kMajorTypeMask = 0xe0;
// Selects the low-order 5 bits of the initial byte (the additional
// information).
constexpr uint8_t kAdditionalInformationMask = 0x1f;
// Additional information values announcing where the integer is stored:
constexpr uint8_t kAdditionalInformation1Byte = 24;   // In the following byte.
constexpr uint8_t kAdditionalInformation2Bytes = 25;  // In the next 2 bytes.
constexpr uint8_t kAdditionalInformation4Bytes = 26;  // In the next 4 bytes.
constexpr uint8_t kAdditionalInformation8Bytes = 27;  // In the next 8 bytes.
35 
36 // Encodes the initial byte, consisting of the |type| in the first 3 bits
37 // followed by 5 bits of |additional_info|.
EncodeInitialByte(MajorType type,uint8_t additional_info)38 constexpr uint8_t EncodeInitialByte(MajorType type, uint8_t additional_info) {
39   return (static_cast<uint8_t>(type) << kMajorTypeBitShift) |
40          (additional_info & kAdditionalInformationMask);
41 }
42 
43 // TAG 24 indicates that what follows is a byte string which is
44 // encoded in CBOR format. We use this as a wrapper for
45 // maps and arrays, allowing us to skip them, because the
46 // byte string carries its size (byte length).
47 // https://tools.ietf.org/html/rfc7049#section-2.4.4.1
48 static constexpr uint8_t kInitialByteForEnvelope =
49     EncodeInitialByte(MajorType::TAG, 24);
50 // The initial byte for a byte string with at most 2^32 bytes
51 // of payload. This is used for envelope encoding, even if
52 // the byte string is shorter.
53 static constexpr uint8_t kInitialByteFor32BitLengthByteString =
54     EncodeInitialByte(MajorType::BYTE_STRING, 26);
55 
56 // See RFC 7049 Section 2.2.1, indefinite length arrays / maps have additional
57 // info = 31.
58 static constexpr uint8_t kInitialByteIndefiniteLengthArray =
59     EncodeInitialByte(MajorType::ARRAY, 31);
60 static constexpr uint8_t kInitialByteIndefiniteLengthMap =
61     EncodeInitialByte(MajorType::MAP, 31);
62 // See RFC 7049 Section 2.3, Table 1; this is used for finishing indefinite
63 // length maps / arrays.
64 static constexpr uint8_t kStopByte =
65     EncodeInitialByte(MajorType::SIMPLE_VALUE, 31);
66 
67 // See RFC 7049 Section 2.3, Table 2.
68 static constexpr uint8_t kEncodedTrue =
69     EncodeInitialByte(MajorType::SIMPLE_VALUE, 21);
70 static constexpr uint8_t kEncodedFalse =
71     EncodeInitialByte(MajorType::SIMPLE_VALUE, 20);
72 static constexpr uint8_t kEncodedNull =
73     EncodeInitialByte(MajorType::SIMPLE_VALUE, 22);
74 static constexpr uint8_t kInitialByteForDouble =
75     EncodeInitialByte(MajorType::SIMPLE_VALUE, 27);
76 
77 // See RFC 7049 Table 3 and Section 2.4.4.2. This is used as a prefix for
78 // arbitrary binary data encoded as BYTE_STRING.
79 static constexpr uint8_t kExpectedConversionToBase64Tag =
80     EncodeInitialByte(MajorType::TAG, 22);
81 
// Appends the sizeof(T) bytes of |v| to |out| in big-endian order, i.e. the
// most significant byte is written first.
// See also: https://commandcenter.blogspot.com/2012/04/byte-order-fallacy.html
template <typename T>
void WriteBytesMostSignificantByteFirst(T v, std::vector<uint8_t>* out) {
  size_t bytes_left = sizeof(T);
  while (bytes_left > 0) {
    --bytes_left;
    // |bytes_left| is now the index of the byte to emit, counted from the
    // least significant end.
    const int shift_bits = static_cast<int>(bytes_left) * 8;
    out->push_back(static_cast<uint8_t>((v >> shift_bits) & 0xff));
  }
}
89 
90 // Extracts sizeof(T) bytes from |in| to extract a value of type T
91 // (e.g. uint64_t, uint32_t, ...), most significant byte first.
92 // See also: https://commandcenter.blogspot.com/2012/04/byte-order-fallacy.html
93 template <typename T>
ReadBytesMostSignificantByteFirst(span<uint8_t> in)94 T ReadBytesMostSignificantByteFirst(span<uint8_t> in) {
95   assert(in.size() >= sizeof(T));
96   T result = 0;
97   for (size_t shift_bytes = 0; shift_bytes < sizeof(T); ++shift_bytes)
98     result |= T(in[sizeof(T) - 1 - shift_bytes]) << (shift_bytes * 8);
99   return result;
100 }
101 }  // namespace
102 
103 namespace internals {
104 // Reads the start of a token with definitive size from |bytes|.
105 // |type| is the major type as specified in RFC 7049 Section 2.1.
106 // |value| is the payload (e.g. for MajorType::UNSIGNED) or is the size
107 // (e.g. for BYTE_STRING).
108 // If successful, returns the number of bytes read. Otherwise returns 0.
ReadTokenStart(span<uint8_t> bytes,MajorType * type,uint64_t * value)109 size_t ReadTokenStart(span<uint8_t> bytes, MajorType* type, uint64_t* value) {
110   if (bytes.empty())
111     return 0;
112   uint8_t initial_byte = bytes[0];
113   *type = MajorType((initial_byte & kMajorTypeMask) >> kMajorTypeBitShift);
114 
115   uint8_t additional_information = initial_byte & kAdditionalInformationMask;
116   if (additional_information < 24) {
117     // Values 0-23 are encoded directly into the additional info of the
118     // initial byte.
119     *value = additional_information;
120     return 1;
121   }
122   if (additional_information == kAdditionalInformation1Byte) {
123     // Values 24-255 are encoded with one initial byte, followed by the value.
124     if (bytes.size() < 2)
125       return 0;
126     *value = ReadBytesMostSignificantByteFirst<uint8_t>(bytes.subspan(1));
127     return 2;
128   }
129   if (additional_information == kAdditionalInformation2Bytes) {
130     // Values 256-65535: 1 initial byte + 2 bytes payload.
131     if (bytes.size() < 1 + sizeof(uint16_t))
132       return 0;
133     *value = ReadBytesMostSignificantByteFirst<uint16_t>(bytes.subspan(1));
134     return 3;
135   }
136   if (additional_information == kAdditionalInformation4Bytes) {
137     // 32 bit uint: 1 initial byte + 4 bytes payload.
138     if (bytes.size() < 1 + sizeof(uint32_t))
139       return 0;
140     *value = ReadBytesMostSignificantByteFirst<uint32_t>(bytes.subspan(1));
141     return 5;
142   }
143   if (additional_information == kAdditionalInformation8Bytes) {
144     // 64 bit uint: 1 initial byte + 8 bytes payload.
145     if (bytes.size() < 1 + sizeof(uint64_t))
146       return 0;
147     *value = ReadBytesMostSignificantByteFirst<uint64_t>(bytes.subspan(1));
148     return 9;
149   }
150   return 0;
151 }
152 
153 // Writes the start of a token with |type|. The |value| may indicate the size,
154 // or it may be the payload if the value is an unsigned integer.
WriteTokenStart(MajorType type,uint64_t value,std::vector<uint8_t> * encoded)155 void WriteTokenStart(MajorType type,
156                      uint64_t value,
157                      std::vector<uint8_t>* encoded) {
158   if (value < 24) {
159     // Values 0-23 are encoded directly into the additional info of the
160     // initial byte.
161     encoded->push_back(EncodeInitialByte(type, /*additional_info=*/value));
162     return;
163   }
164   if (value <= std::numeric_limits<uint8_t>::max()) {
165     // Values 24-255 are encoded with one initial byte, followed by the value.
166     encoded->push_back(EncodeInitialByte(type, kAdditionalInformation1Byte));
167     encoded->push_back(value);
168     return;
169   }
170   if (value <= std::numeric_limits<uint16_t>::max()) {
171     // Values 256-65535: 1 initial byte + 2 bytes payload.
172     encoded->push_back(EncodeInitialByte(type, kAdditionalInformation2Bytes));
173     WriteBytesMostSignificantByteFirst<uint16_t>(value, encoded);
174     return;
175   }
176   if (value <= std::numeric_limits<uint32_t>::max()) {
177     // 32 bit uint: 1 initial byte + 4 bytes payload.
178     encoded->push_back(EncodeInitialByte(type, kAdditionalInformation4Bytes));
179     WriteBytesMostSignificantByteFirst<uint32_t>(static_cast<uint32_t>(value),
180                                                  encoded);
181     return;
182   }
183   // 64 bit uint: 1 initial byte + 8 bytes payload.
184   encoded->push_back(EncodeInitialByte(type, kAdditionalInformation8Bytes));
185   WriteBytesMostSignificantByteFirst<uint64_t>(value, encoded);
186 }
187 }  // namespace internals
188 
189 // =============================================================================
190 // Detecting CBOR content
191 // =============================================================================
192 
InitialByteForEnvelope()193 uint8_t InitialByteForEnvelope() {
194   return kInitialByteForEnvelope;
195 }
196 
InitialByteFor32BitLengthByteString()197 uint8_t InitialByteFor32BitLengthByteString() {
198   return kInitialByteFor32BitLengthByteString;
199 }
200 
IsCBORMessage(span<uint8_t> msg)201 bool IsCBORMessage(span<uint8_t> msg) {
202   return msg.size() >= 6 && msg[0] == InitialByteForEnvelope() &&
203          msg[1] == InitialByteFor32BitLengthByteString();
204 }
205 
CheckCBORMessage(span<uint8_t> msg)206 Status CheckCBORMessage(span<uint8_t> msg) {
207   if (msg.empty())
208     return Status(Error::CBOR_NO_INPUT, 0);
209   if (msg[0] != InitialByteForEnvelope())
210     return Status(Error::CBOR_INVALID_START_BYTE, 0);
211   if (msg.size() < 6 || msg[1] != InitialByteFor32BitLengthByteString())
212     return Status(Error::CBOR_INVALID_ENVELOPE, 1);
213   if (msg[2] == 0 && msg[3] == 0 && msg[4] == 0 && msg[5] == 0)
214     return Status(Error::CBOR_INVALID_ENVELOPE, 1);
215   if (msg.size() < 7 || msg[6] != EncodeIndefiniteLengthMapStart())
216     return Status(Error::CBOR_MAP_START_EXPECTED, 6);
217   return Status();
218 }
219 
// =============================================================================
// Encoding individual CBOR items
// =============================================================================
223 
EncodeTrue()224 uint8_t EncodeTrue() {
225   return kEncodedTrue;
226 }
227 
EncodeFalse()228 uint8_t EncodeFalse() {
229   return kEncodedFalse;
230 }
231 
EncodeNull()232 uint8_t EncodeNull() {
233   return kEncodedNull;
234 }
235 
EncodeIndefiniteLengthArrayStart()236 uint8_t EncodeIndefiniteLengthArrayStart() {
237   return kInitialByteIndefiniteLengthArray;
238 }
239 
EncodeIndefiniteLengthMapStart()240 uint8_t EncodeIndefiniteLengthMapStart() {
241   return kInitialByteIndefiniteLengthMap;
242 }
243 
EncodeStop()244 uint8_t EncodeStop() {
245   return kStopByte;
246 }
247 
EncodeInt32(int32_t value,std::vector<uint8_t> * out)248 void EncodeInt32(int32_t value, std::vector<uint8_t>* out) {
249   if (value >= 0) {
250     internals::WriteTokenStart(MajorType::UNSIGNED, value, out);
251   } else {
252     uint64_t representation = static_cast<uint64_t>(-(value + 1));
253     internals::WriteTokenStart(MajorType::NEGATIVE, representation, out);
254   }
255 }
256 
EncodeString16(span<uint16_t> in,std::vector<uint8_t> * out)257 void EncodeString16(span<uint16_t> in, std::vector<uint8_t>* out) {
258   uint64_t byte_length = static_cast<uint64_t>(in.size_bytes());
259   internals::WriteTokenStart(MajorType::BYTE_STRING, byte_length, out);
260   // When emitting UTF16 characters, we always write the least significant byte
261   // first; this is because it's the native representation for X86.
262   // TODO(johannes): Implement a more efficient thing here later, e.g.
263   // casting *iff* the machine has this byte order.
264   // The wire format for UTF16 chars will probably remain the same
265   // (least significant byte first) since this way we can have
266   // golden files, unittests, etc. that port easily and universally.
267   // See also:
268   // https://commandcenter.blogspot.com/2012/04/byte-order-fallacy.html
269   for (const uint16_t two_bytes : in) {
270     out->push_back(two_bytes);
271     out->push_back(two_bytes >> 8);
272   }
273 }
274 
EncodeString8(span<uint8_t> in,std::vector<uint8_t> * out)275 void EncodeString8(span<uint8_t> in, std::vector<uint8_t>* out) {
276   internals::WriteTokenStart(MajorType::STRING,
277                              static_cast<uint64_t>(in.size_bytes()), out);
278   out->insert(out->end(), in.begin(), in.end());
279 }
280 
EncodeFromLatin1(span<uint8_t> latin1,std::vector<uint8_t> * out)281 void EncodeFromLatin1(span<uint8_t> latin1, std::vector<uint8_t>* out) {
282   for (size_t ii = 0; ii < latin1.size(); ++ii) {
283     if (latin1[ii] <= 127)
284       continue;
285     // If there's at least one non-ASCII char, convert to UTF8.
286     std::vector<uint8_t> utf8(latin1.begin(), latin1.begin() + ii);
287     for (; ii < latin1.size(); ++ii) {
288       if (latin1[ii] <= 127) {
289         utf8.push_back(latin1[ii]);
290       } else {
291         // 0xC0 means it's a UTF8 sequence with 2 bytes.
292         utf8.push_back((latin1[ii] >> 6) | 0xc0);
293         utf8.push_back((latin1[ii] | 0x80) & 0xbf);
294       }
295     }
296     EncodeString8(SpanFrom(utf8), out);
297     return;
298   }
299   EncodeString8(latin1, out);
300 }
301 
EncodeFromUTF16(span<uint16_t> utf16,std::vector<uint8_t> * out)302 void EncodeFromUTF16(span<uint16_t> utf16, std::vector<uint8_t>* out) {
303   // If there's at least one non-ASCII char, encode as STRING16 (UTF16).
304   for (uint16_t ch : utf16) {
305     if (ch <= 127)
306       continue;
307     EncodeString16(utf16, out);
308     return;
309   }
310   // It's all US-ASCII, strip out every second byte and encode as UTF8.
311   internals::WriteTokenStart(MajorType::STRING,
312                              static_cast<uint64_t>(utf16.size()), out);
313   out->insert(out->end(), utf16.begin(), utf16.end());
314 }
315 
EncodeBinary(span<uint8_t> in,std::vector<uint8_t> * out)316 void EncodeBinary(span<uint8_t> in, std::vector<uint8_t>* out) {
317   out->push_back(kExpectedConversionToBase64Tag);
318   uint64_t byte_length = static_cast<uint64_t>(in.size_bytes());
319   internals::WriteTokenStart(MajorType::BYTE_STRING, byte_length, out);
320   out->insert(out->end(), in.begin(), in.end());
321 }
322 
// Size of an encoded double: one initial byte (kInitialByteForDouble)
// followed by the 64 bits of payload for its value.
constexpr size_t kEncodedDoubleSize = sizeof(uint8_t) + sizeof(uint64_t);

// Size of an envelope header: one initial byte (kInitialByteForEnvelope),
// one start byte for a BYTE_STRING with a 32 bit wide length, and the 32 bit
// length of that string.
constexpr size_t kEncodedEnvelopeHeaderSize =
    sizeof(uint8_t) + sizeof(uint8_t) + sizeof(uint32_t);
331 
EncodeDouble(double value,std::vector<uint8_t> * out)332 void EncodeDouble(double value, std::vector<uint8_t>* out) {
333   // The additional_info=27 indicates 64 bits for the double follow.
334   // See RFC 7049 Section 2.3, Table 1.
335   out->push_back(kInitialByteForDouble);
336   union {
337     double from_double;
338     uint64_t to_uint64;
339   } reinterpret;
340   reinterpret.from_double = value;
341   WriteBytesMostSignificantByteFirst<uint64_t>(reinterpret.to_uint64, out);
342 }
343 
344 // =============================================================================
345 // cbor::EnvelopeEncoder - for wrapping submessages
346 // =============================================================================
347 
EncodeStart(std::vector<uint8_t> * out)348 void EnvelopeEncoder::EncodeStart(std::vector<uint8_t>* out) {
349   assert(byte_size_pos_ == 0);
350   out->push_back(kInitialByteForEnvelope);
351   out->push_back(kInitialByteFor32BitLengthByteString);
352   byte_size_pos_ = out->size();
353   out->resize(out->size() + sizeof(uint32_t));
354 }
355 
EncodeStop(std::vector<uint8_t> * out)356 bool EnvelopeEncoder::EncodeStop(std::vector<uint8_t>* out) {
357   assert(byte_size_pos_ != 0);
358   // The byte size is the size of the payload, that is, all the
359   // bytes that were written past the byte size position itself.
360   uint64_t byte_size = out->size() - (byte_size_pos_ + sizeof(uint32_t));
361   // We store exactly 4 bytes, so at most INT32MAX, with most significant
362   // byte first.
363   if (byte_size > std::numeric_limits<uint32_t>::max())
364     return false;
365   for (int shift_bytes = sizeof(uint32_t) - 1; shift_bytes >= 0;
366        --shift_bytes) {
367     (*out)[byte_size_pos_++] = 0xff & (byte_size >> (shift_bytes * 8));
368   }
369   return true;
370 }
371 
372 // =============================================================================
373 // cbor::NewCBOREncoder - for encoding from a streaming parser
374 // =============================================================================
375 
376 namespace {
377 class CBOREncoder : public ParserHandler {
378  public:
CBOREncoder(std::vector<uint8_t> * out,Status * status)379   CBOREncoder(std::vector<uint8_t>* out, Status* status)
380       : out_(out), status_(status) {
381     *status_ = Status();
382   }
383 
HandleMapBegin()384   void HandleMapBegin() override {
385     if (!status_->ok())
386       return;
387     envelopes_.emplace_back();
388     envelopes_.back().EncodeStart(out_);
389     out_->push_back(kInitialByteIndefiniteLengthMap);
390   }
391 
HandleMapEnd()392   void HandleMapEnd() override {
393     if (!status_->ok())
394       return;
395     out_->push_back(kStopByte);
396     assert(!envelopes_.empty());
397     if (!envelopes_.back().EncodeStop(out_)) {
398       HandleError(
399           Status(Error::CBOR_ENVELOPE_SIZE_LIMIT_EXCEEDED, out_->size()));
400       return;
401     }
402     envelopes_.pop_back();
403   }
404 
HandleArrayBegin()405   void HandleArrayBegin() override {
406     if (!status_->ok())
407       return;
408     envelopes_.emplace_back();
409     envelopes_.back().EncodeStart(out_);
410     out_->push_back(kInitialByteIndefiniteLengthArray);
411   }
412 
HandleArrayEnd()413   void HandleArrayEnd() override {
414     if (!status_->ok())
415       return;
416     out_->push_back(kStopByte);
417     assert(!envelopes_.empty());
418     if (!envelopes_.back().EncodeStop(out_)) {
419       HandleError(
420           Status(Error::CBOR_ENVELOPE_SIZE_LIMIT_EXCEEDED, out_->size()));
421       return;
422     }
423     envelopes_.pop_back();
424   }
425 
HandleString8(span<uint8_t> chars)426   void HandleString8(span<uint8_t> chars) override {
427     if (!status_->ok())
428       return;
429     EncodeString8(chars, out_);
430   }
431 
HandleString16(span<uint16_t> chars)432   void HandleString16(span<uint16_t> chars) override {
433     if (!status_->ok())
434       return;
435     EncodeFromUTF16(chars, out_);
436   }
437 
HandleBinary(span<uint8_t> bytes)438   void HandleBinary(span<uint8_t> bytes) override {
439     if (!status_->ok())
440       return;
441     EncodeBinary(bytes, out_);
442   }
443 
HandleDouble(double value)444   void HandleDouble(double value) override {
445     if (!status_->ok())
446       return;
447     EncodeDouble(value, out_);
448   }
449 
HandleInt32(int32_t value)450   void HandleInt32(int32_t value) override {
451     if (!status_->ok())
452       return;
453     EncodeInt32(value, out_);
454   }
455 
HandleBool(bool value)456   void HandleBool(bool value) override {
457     if (!status_->ok())
458       return;
459     // See RFC 7049 Section 2.3, Table 2.
460     out_->push_back(value ? kEncodedTrue : kEncodedFalse);
461   }
462 
HandleNull()463   void HandleNull() override {
464     if (!status_->ok())
465       return;
466     // See RFC 7049 Section 2.3, Table 2.
467     out_->push_back(kEncodedNull);
468   }
469 
HandleError(Status error)470   void HandleError(Status error) override {
471     if (!status_->ok())
472       return;
473     *status_ = error;
474     out_->clear();
475   }
476 
477  private:
478   std::vector<uint8_t>* out_;
479   std::vector<EnvelopeEncoder> envelopes_;
480   Status* status_;
481 };
482 }  // namespace
483 
NewCBOREncoder(std::vector<uint8_t> * out,Status * status)484 std::unique_ptr<ParserHandler> NewCBOREncoder(std::vector<uint8_t>* out,
485                                               Status* status) {
486   return std::unique_ptr<ParserHandler>(new CBOREncoder(out, status));
487 }
488 
489 // =============================================================================
490 // cbor::CBORTokenizer - for parsing individual CBOR items
491 // =============================================================================
492 
CBORTokenizer(span<uint8_t> bytes)493 CBORTokenizer::CBORTokenizer(span<uint8_t> bytes) : bytes_(bytes) {
494   ReadNextToken(/*enter_envelope=*/false);
495 }
496 
~CBORTokenizer()497 CBORTokenizer::~CBORTokenizer() {}
498 
TokenTag() const499 CBORTokenTag CBORTokenizer::TokenTag() const {
500   return token_tag_;
501 }
502 
Next()503 void CBORTokenizer::Next() {
504   if (token_tag_ == CBORTokenTag::ERROR_VALUE ||
505       token_tag_ == CBORTokenTag::DONE)
506     return;
507   ReadNextToken(/*enter_envelope=*/false);
508 }
509 
EnterEnvelope()510 void CBORTokenizer::EnterEnvelope() {
511   assert(token_tag_ == CBORTokenTag::ENVELOPE);
512   ReadNextToken(/*enter_envelope=*/true);
513 }
514 
Status() const515 Status CBORTokenizer::Status() const {
516   return status_;
517 }
518 
519 // The following accessor functions ::GetInt32, ::GetDouble,
520 // ::GetString8, ::GetString16WireRep, ::GetBinary, ::GetEnvelopeContents
521 // assume that a particular token was recognized in ::ReadNextToken.
522 // That's where all the error checking is done. By design,
523 // the accessors (assuming the token was recognized) never produce
524 // an error.
525 
GetInt32() const526 int32_t CBORTokenizer::GetInt32() const {
527   assert(token_tag_ == CBORTokenTag::INT32);
528   // The range checks happen in ::ReadNextToken().
529   return static_cast<int32_t>(
530       token_start_type_ == MajorType::UNSIGNED
531           ? token_start_internal_value_
532           : -static_cast<int64_t>(token_start_internal_value_) - 1);
533 }
534 
GetDouble() const535 double CBORTokenizer::GetDouble() const {
536   assert(token_tag_ == CBORTokenTag::DOUBLE);
537   union {
538     uint64_t from_uint64;
539     double to_double;
540   } reinterpret;
541   reinterpret.from_uint64 = ReadBytesMostSignificantByteFirst<uint64_t>(
542       bytes_.subspan(status_.pos + 1));
543   return reinterpret.to_double;
544 }
545 
GetString8() const546 span<uint8_t> CBORTokenizer::GetString8() const {
547   assert(token_tag_ == CBORTokenTag::STRING8);
548   auto length = static_cast<size_t>(token_start_internal_value_);
549   return bytes_.subspan(status_.pos + (token_byte_length_ - length), length);
550 }
551 
GetString16WireRep() const552 span<uint8_t> CBORTokenizer::GetString16WireRep() const {
553   assert(token_tag_ == CBORTokenTag::STRING16);
554   auto length = static_cast<size_t>(token_start_internal_value_);
555   return bytes_.subspan(status_.pos + (token_byte_length_ - length), length);
556 }
557 
GetBinary() const558 span<uint8_t> CBORTokenizer::GetBinary() const {
559   assert(token_tag_ == CBORTokenTag::BINARY);
560   auto length = static_cast<size_t>(token_start_internal_value_);
561   return bytes_.subspan(status_.pos + (token_byte_length_ - length), length);
562 }
563 
GetEnvelope() const564 span<uint8_t> CBORTokenizer::GetEnvelope() const {
565   assert(token_tag_ == CBORTokenTag::ENVELOPE);
566   auto length = static_cast<size_t>(token_start_internal_value_);
567   return bytes_.subspan(status_.pos, length + kEncodedEnvelopeHeaderSize);
568 }
569 
GetEnvelopeContents() const570 span<uint8_t> CBORTokenizer::GetEnvelopeContents() const {
571   assert(token_tag_ == CBORTokenTag::ENVELOPE);
572   auto length = static_cast<size_t>(token_start_internal_value_);
573   return bytes_.subspan(status_.pos + kEncodedEnvelopeHeaderSize, length);
574 }
575 
576 // All error checking happens in ::ReadNextToken, so that the accessors
577 // can avoid having to carry an error return value.
578 //
579 // With respect to checking the encoded lengths of strings, arrays, etc:
580 // On the wire, CBOR uses 1,2,4, and 8 byte unsigned integers, so
581 // we initially read them as uint64_t, usually into token_start_internal_value_.
582 //
583 // However, since these containers have a representation on the machine,
584 // we need to do corresponding size computations on the input byte array,
585 // output span (e.g. the payload for a string), etc., and size_t is
586 // machine specific (in practice either 32 bit or 64 bit).
587 //
588 // Further, we must avoid overflowing size_t. Therefore, we use this
589 // kMaxValidLength constant to:
590 // - Reject values that are larger than the architecture specific
591 //   max size_t (differs between 32 bit and 64 bit arch).
592 // - Reserve at least one bit so that we can check against overflows
593 //   when adding lengths (array / string length / etc.); we do this by
594 //   ensuring that the inputs to an addition are <= kMaxValidLength,
595 //   and then checking whether the sum went past it.
596 //
597 // See also
598 // https://chromium.googlesource.com/chromium/src/+/master/docs/security/integer-semantics.md
// Largest length accepted for strings, binaries and envelopes: the smaller
// of the maximum size_t and a quarter of the uint64_t range, which leaves
// headroom for detecting overflow when lengths are added.
static const uint64_t kMaxValidLength =
    std::min<uint64_t>(std::numeric_limits<size_t>::max(),
                       std::numeric_limits<uint64_t>::max() >> 2);
602 
ReadNextToken(bool enter_envelope)603 void CBORTokenizer::ReadNextToken(bool enter_envelope) {
604   if (enter_envelope) {
605     status_.pos += kEncodedEnvelopeHeaderSize;
606   } else {
607     status_.pos =
608         status_.pos == Status::npos() ? 0 : status_.pos + token_byte_length_;
609   }
610   status_.error = Error::OK;
611   if (status_.pos >= bytes_.size()) {
612     token_tag_ = CBORTokenTag::DONE;
613     return;
614   }
615   const size_t remaining_bytes = bytes_.size() - status_.pos;
616   switch (bytes_[status_.pos]) {
617     case kStopByte:
618       SetToken(CBORTokenTag::STOP, 1);
619       return;
620     case kInitialByteIndefiniteLengthMap:
621       SetToken(CBORTokenTag::MAP_START, 1);
622       return;
623     case kInitialByteIndefiniteLengthArray:
624       SetToken(CBORTokenTag::ARRAY_START, 1);
625       return;
626     case kEncodedTrue:
627       SetToken(CBORTokenTag::TRUE_VALUE, 1);
628       return;
629     case kEncodedFalse:
630       SetToken(CBORTokenTag::FALSE_VALUE, 1);
631       return;
632     case kEncodedNull:
633       SetToken(CBORTokenTag::NULL_VALUE, 1);
634       return;
635     case kExpectedConversionToBase64Tag: {  // BINARY
636       const size_t bytes_read = internals::ReadTokenStart(
637           bytes_.subspan(status_.pos + 1), &token_start_type_,
638           &token_start_internal_value_);
639       if (!bytes_read || token_start_type_ != MajorType::BYTE_STRING ||
640           token_start_internal_value_ > kMaxValidLength) {
641         SetError(Error::CBOR_INVALID_BINARY);
642         return;
643       }
644       const uint64_t token_byte_length = token_start_internal_value_ +
645                                          /* tag before token start: */ 1 +
646                                          /* token start: */ bytes_read;
647       if (token_byte_length > remaining_bytes) {
648         SetError(Error::CBOR_INVALID_BINARY);
649         return;
650       }
651       SetToken(CBORTokenTag::BINARY, static_cast<size_t>(token_byte_length));
652       return;
653     }
654     case kInitialByteForDouble: {  // DOUBLE
655       if (kEncodedDoubleSize > remaining_bytes) {
656         SetError(Error::CBOR_INVALID_DOUBLE);
657         return;
658       }
659       SetToken(CBORTokenTag::DOUBLE, kEncodedDoubleSize);
660       return;
661     }
662     case kInitialByteForEnvelope: {  // ENVELOPE
663       if (kEncodedEnvelopeHeaderSize > remaining_bytes) {
664         SetError(Error::CBOR_INVALID_ENVELOPE);
665         return;
666       }
667       // The envelope must be a byte string with 32 bit length.
668       if (bytes_[status_.pos + 1] != kInitialByteFor32BitLengthByteString) {
669         SetError(Error::CBOR_INVALID_ENVELOPE);
670         return;
671       }
672       // Read the length of the byte string.
673       token_start_internal_value_ = ReadBytesMostSignificantByteFirst<uint32_t>(
674           bytes_.subspan(status_.pos + 2));
675       if (token_start_internal_value_ > kMaxValidLength) {
676         SetError(Error::CBOR_INVALID_ENVELOPE);
677         return;
678       }
679       uint64_t token_byte_length =
680           token_start_internal_value_ + kEncodedEnvelopeHeaderSize;
681       if (token_byte_length > remaining_bytes) {
682         SetError(Error::CBOR_INVALID_ENVELOPE);
683         return;
684       }
685       SetToken(CBORTokenTag::ENVELOPE, static_cast<size_t>(token_byte_length));
686       return;
687     }
688     default: {
689       const size_t bytes_read = internals::ReadTokenStart(
690           bytes_.subspan(status_.pos), &token_start_type_,
691           &token_start_internal_value_);
692       switch (token_start_type_) {
693         case MajorType::UNSIGNED:  // INT32.
694           // INT32 is a signed int32 (int32 makes sense for the
695           // inspector protocol, it's not a CBOR limitation), so we check
696           // against the signed max, so that the allowable values are
697           // 0, 1, 2, ... 2^31 - 1.
698           if (!bytes_read ||
699               static_cast<uint64_t>(std::numeric_limits<int32_t>::max()) <
700                   static_cast<uint64_t>(token_start_internal_value_)) {
701             SetError(Error::CBOR_INVALID_INT32);
702             return;
703           }
704           SetToken(CBORTokenTag::INT32, bytes_read);
705           return;
706         case MajorType::NEGATIVE: {  // INT32.
707           // INT32 is a signed int32 (int32 makes sense for the
708           // inspector protocol, it's not a CBOR limitation); in CBOR, the
709           // negative values for INT32 are represented as NEGATIVE, that is, -1
710           // INT32 is represented as 1 << 5 | 0 (major type 1, additional info
711           // value 0).
712           // The represented allowed values range is -1 to -2^31.
713           // They are mapped into the encoded range of 0 to 2^31-1.
714           // We check the payload in token_start_internal_value_ against
715           // that range (2^31-1 is also known as
716           // std::numeric_limits<int32_t>::max()).
717           if (!bytes_read ||
718               static_cast<uint64_t>(token_start_internal_value_) >
719                   static_cast<uint64_t>(std::numeric_limits<int32_t>::max())) {
720             SetError(Error::CBOR_INVALID_INT32);
721             return;
722           }
723           SetToken(CBORTokenTag::INT32, bytes_read);
724           return;
725         }
726         case MajorType::STRING: {  // STRING8.
727           if (!bytes_read || token_start_internal_value_ > kMaxValidLength) {
728             SetError(Error::CBOR_INVALID_STRING8);
729             return;
730           }
731           uint64_t token_byte_length = token_start_internal_value_ + bytes_read;
732           if (token_byte_length > remaining_bytes) {
733             SetError(Error::CBOR_INVALID_STRING8);
734             return;
735           }
736           SetToken(CBORTokenTag::STRING8,
737                    static_cast<size_t>(token_byte_length));
738           return;
739         }
740         case MajorType::BYTE_STRING: {  // STRING16.
741           // Length must be divisible by 2 since UTF16 is 2 bytes per
742           // character, hence the &1 check.
743           if (!bytes_read || token_start_internal_value_ > kMaxValidLength ||
744               token_start_internal_value_ & 1) {
745             SetError(Error::CBOR_INVALID_STRING16);
746             return;
747           }
748           uint64_t token_byte_length = token_start_internal_value_ + bytes_read;
749           if (token_byte_length > remaining_bytes) {
750             SetError(Error::CBOR_INVALID_STRING16);
751             return;
752           }
753           SetToken(CBORTokenTag::STRING16,
754                    static_cast<size_t>(token_byte_length));
755           return;
756         }
757         case MajorType::ARRAY:
758         case MajorType::MAP:
759         case MajorType::TAG:
760         case MajorType::SIMPLE_VALUE:
761           SetError(Error::CBOR_UNSUPPORTED_VALUE);
762           return;
763       }
764     }
765   }
766 }
767 
SetToken(CBORTokenTag token_tag,size_t token_byte_length)768 void CBORTokenizer::SetToken(CBORTokenTag token_tag, size_t token_byte_length) {
769   token_tag_ = token_tag;
770   token_byte_length_ = token_byte_length;
771 }
772 
SetError(Error error)773 void CBORTokenizer::SetError(Error error) {
774   token_tag_ = CBORTokenTag::ERROR_VALUE;
775   status_.error = error;
776 }
777 
778 // =============================================================================
779 // cbor::ParseCBOR - for receiving streaming parser events for CBOR messages
780 // =============================================================================
781 
782 namespace {
// Upper bound on the nesting depth of objects and arrays accepted while
// parsing CBOR.
constexpr int kStackLimit = 300;
786 
787 // Below are three parsing routines for CBOR, which cover enough
788 // to roundtrip JSON messages.
789 bool ParseMap(int32_t stack_depth,
790               CBORTokenizer* tokenizer,
791               ParserHandler* out);
792 bool ParseArray(int32_t stack_depth,
793                 CBORTokenizer* tokenizer,
794                 ParserHandler* out);
795 bool ParseValue(int32_t stack_depth,
796                 CBORTokenizer* tokenizer,
797                 ParserHandler* out);
798 bool ParseEnvelope(int32_t stack_depth,
799                    CBORTokenizer* tokenizer,
800                    ParserHandler* out);
801 
ParseUTF16String(CBORTokenizer * tokenizer,ParserHandler * out)802 void ParseUTF16String(CBORTokenizer* tokenizer, ParserHandler* out) {
803   std::vector<uint16_t> value;
804   span<uint8_t> rep = tokenizer->GetString16WireRep();
805   for (size_t ii = 0; ii < rep.size(); ii += 2)
806     value.push_back((rep[ii + 1] << 8) | rep[ii]);
807   out->HandleString16(span<uint16_t>(value.data(), value.size()));
808   tokenizer->Next();
809 }
810 
ParseUTF8String(CBORTokenizer * tokenizer,ParserHandler * out)811 bool ParseUTF8String(CBORTokenizer* tokenizer, ParserHandler* out) {
812   assert(tokenizer->TokenTag() == CBORTokenTag::STRING8);
813   out->HandleString8(tokenizer->GetString8());
814   tokenizer->Next();
815   return true;
816 }
817 
ParseEnvelope(int32_t stack_depth,CBORTokenizer * tokenizer,ParserHandler * out)818 bool ParseEnvelope(int32_t stack_depth,
819                    CBORTokenizer* tokenizer,
820                    ParserHandler* out) {
821   assert(tokenizer->TokenTag() == CBORTokenTag::ENVELOPE);
822   // Before we enter the envelope, we save the position that we
823   // expect to see after we're done parsing the envelope contents.
824   // This way we can compare and produce an error if the contents
825   // didn't fit exactly into the envelope length.
826   size_t pos_past_envelope = tokenizer->Status().pos +
827                              kEncodedEnvelopeHeaderSize +
828                              tokenizer->GetEnvelopeContents().size();
829   tokenizer->EnterEnvelope();
830   switch (tokenizer->TokenTag()) {
831     case CBORTokenTag::ERROR_VALUE:
832       out->HandleError(tokenizer->Status());
833       return false;
834     case CBORTokenTag::MAP_START:
835       if (!ParseMap(stack_depth + 1, tokenizer, out))
836         return false;
837       break;  // Continue to check pos_past_envelope below.
838     case CBORTokenTag::ARRAY_START:
839       if (!ParseArray(stack_depth + 1, tokenizer, out))
840         return false;
841       break;  // Continue to check pos_past_envelope below.
842     default:
843       out->HandleError(Status{Error::CBOR_MAP_OR_ARRAY_EXPECTED_IN_ENVELOPE,
844                               tokenizer->Status().pos});
845       return false;
846   }
847   // The contents of the envelope parsed OK, now check that we're at
848   // the expected position.
849   if (pos_past_envelope != tokenizer->Status().pos) {
850     out->HandleError(Status{Error::CBOR_ENVELOPE_CONTENTS_LENGTH_MISMATCH,
851                             tokenizer->Status().pos});
852     return false;
853   }
854   return true;
855 }
856 
ParseValue(int32_t stack_depth,CBORTokenizer * tokenizer,ParserHandler * out)857 bool ParseValue(int32_t stack_depth,
858                 CBORTokenizer* tokenizer,
859                 ParserHandler* out) {
860   if (stack_depth > kStackLimit) {
861     out->HandleError(
862         Status{Error::CBOR_STACK_LIMIT_EXCEEDED, tokenizer->Status().pos});
863     return false;
864   }
865   switch (tokenizer->TokenTag()) {
866     case CBORTokenTag::ERROR_VALUE:
867       out->HandleError(tokenizer->Status());
868       return false;
869     case CBORTokenTag::DONE:
870       out->HandleError(Status{Error::CBOR_UNEXPECTED_EOF_EXPECTED_VALUE,
871                               tokenizer->Status().pos});
872       return false;
873     case CBORTokenTag::ENVELOPE:
874       return ParseEnvelope(stack_depth, tokenizer, out);
875     case CBORTokenTag::TRUE_VALUE:
876       out->HandleBool(true);
877       tokenizer->Next();
878       return true;
879     case CBORTokenTag::FALSE_VALUE:
880       out->HandleBool(false);
881       tokenizer->Next();
882       return true;
883     case CBORTokenTag::NULL_VALUE:
884       out->HandleNull();
885       tokenizer->Next();
886       return true;
887     case CBORTokenTag::INT32:
888       out->HandleInt32(tokenizer->GetInt32());
889       tokenizer->Next();
890       return true;
891     case CBORTokenTag::DOUBLE:
892       out->HandleDouble(tokenizer->GetDouble());
893       tokenizer->Next();
894       return true;
895     case CBORTokenTag::STRING8:
896       return ParseUTF8String(tokenizer, out);
897     case CBORTokenTag::STRING16:
898       ParseUTF16String(tokenizer, out);
899       return true;
900     case CBORTokenTag::BINARY: {
901       out->HandleBinary(tokenizer->GetBinary());
902       tokenizer->Next();
903       return true;
904     }
905     case CBORTokenTag::MAP_START:
906       return ParseMap(stack_depth + 1, tokenizer, out);
907     case CBORTokenTag::ARRAY_START:
908       return ParseArray(stack_depth + 1, tokenizer, out);
909     default:
910       out->HandleError(
911           Status{Error::CBOR_UNSUPPORTED_VALUE, tokenizer->Status().pos});
912       return false;
913   }
914 }
915 
916 // |bytes| must start with the indefinite length array byte, so basically,
917 // ParseArray may only be called after an indefinite length array has been
918 // detected.
ParseArray(int32_t stack_depth,CBORTokenizer * tokenizer,ParserHandler * out)919 bool ParseArray(int32_t stack_depth,
920                 CBORTokenizer* tokenizer,
921                 ParserHandler* out) {
922   assert(tokenizer->TokenTag() == CBORTokenTag::ARRAY_START);
923   tokenizer->Next();
924   out->HandleArrayBegin();
925   while (tokenizer->TokenTag() != CBORTokenTag::STOP) {
926     if (tokenizer->TokenTag() == CBORTokenTag::DONE) {
927       out->HandleError(
928           Status{Error::CBOR_UNEXPECTED_EOF_IN_ARRAY, tokenizer->Status().pos});
929       return false;
930     }
931     if (tokenizer->TokenTag() == CBORTokenTag::ERROR_VALUE) {
932       out->HandleError(tokenizer->Status());
933       return false;
934     }
935     // Parse value.
936     if (!ParseValue(stack_depth, tokenizer, out))
937       return false;
938   }
939   out->HandleArrayEnd();
940   tokenizer->Next();
941   return true;
942 }
943 
944 // |bytes| must start with the indefinite length array byte, so basically,
945 // ParseArray may only be called after an indefinite length array has been
946 // detected.
ParseMap(int32_t stack_depth,CBORTokenizer * tokenizer,ParserHandler * out)947 bool ParseMap(int32_t stack_depth,
948               CBORTokenizer* tokenizer,
949               ParserHandler* out) {
950   assert(tokenizer->TokenTag() == CBORTokenTag::MAP_START);
951   out->HandleMapBegin();
952   tokenizer->Next();
953   while (tokenizer->TokenTag() != CBORTokenTag::STOP) {
954     if (tokenizer->TokenTag() == CBORTokenTag::DONE) {
955       out->HandleError(
956           Status{Error::CBOR_UNEXPECTED_EOF_IN_MAP, tokenizer->Status().pos});
957       return false;
958     }
959     if (tokenizer->TokenTag() == CBORTokenTag::ERROR_VALUE) {
960       out->HandleError(tokenizer->Status());
961       return false;
962     }
963     // Parse key.
964     if (tokenizer->TokenTag() == CBORTokenTag::STRING8) {
965       if (!ParseUTF8String(tokenizer, out))
966         return false;
967     } else if (tokenizer->TokenTag() == CBORTokenTag::STRING16) {
968       ParseUTF16String(tokenizer, out);
969     } else {
970       out->HandleError(
971           Status{Error::CBOR_INVALID_MAP_KEY, tokenizer->Status().pos});
972       return false;
973     }
974     // Parse value.
975     if (!ParseValue(stack_depth, tokenizer, out))
976       return false;
977   }
978   out->HandleMapEnd();
979   tokenizer->Next();
980   return true;
981 }
982 }  // namespace
983 
ParseCBOR(span<uint8_t> bytes,ParserHandler * out)984 void ParseCBOR(span<uint8_t> bytes, ParserHandler* out) {
985   if (bytes.empty()) {
986     out->HandleError(Status{Error::CBOR_NO_INPUT, 0});
987     return;
988   }
989   CBORTokenizer tokenizer(bytes);
990   if (tokenizer.TokenTag() == CBORTokenTag::ERROR_VALUE) {
991     out->HandleError(tokenizer.Status());
992     return;
993   }
994   if (!ParseValue(/*stack_depth=*/0, &tokenizer, out))
995     return;
996   if (tokenizer.TokenTag() == CBORTokenTag::DONE)
997     return;
998   if (tokenizer.TokenTag() == CBORTokenTag::ERROR_VALUE) {
999     out->HandleError(tokenizer.Status());
1000     return;
1001   }
1002   out->HandleError(Status{Error::CBOR_TRAILING_JUNK, tokenizer.Status().pos});
1003 }
1004 
1005 // =============================================================================
1006 // cbor::AppendString8EntryToMap - for limited in-place editing of messages
1007 // =============================================================================
1008 
AppendString8EntryToCBORMap(span<uint8_t> string8_key,span<uint8_t> string8_value,std::vector<uint8_t> * cbor)1009 Status AppendString8EntryToCBORMap(span<uint8_t> string8_key,
1010                                    span<uint8_t> string8_value,
1011                                    std::vector<uint8_t>* cbor) {
1012   // Careful below: Don't compare (*cbor)[idx] with a uint8_t, since
1013   // it could be a char (signed!). Instead, use bytes.
1014   span<uint8_t> bytes(reinterpret_cast<const uint8_t*>(cbor->data()),
1015                       cbor->size());
1016   CBORTokenizer tokenizer(bytes);
1017   if (tokenizer.TokenTag() == CBORTokenTag::ERROR_VALUE)
1018     return tokenizer.Status();
1019   if (tokenizer.TokenTag() != CBORTokenTag::ENVELOPE)
1020     return Status(Error::CBOR_INVALID_ENVELOPE, 0);
1021   size_t envelope_size = tokenizer.GetEnvelopeContents().size();
1022   size_t old_size = cbor->size();
1023   if (old_size != envelope_size + kEncodedEnvelopeHeaderSize)
1024     return Status(Error::CBOR_INVALID_ENVELOPE, 0);
1025   if (envelope_size == 0 ||
1026       (tokenizer.GetEnvelopeContents()[0] != EncodeIndefiniteLengthMapStart()))
1027     return Status(Error::CBOR_MAP_START_EXPECTED, kEncodedEnvelopeHeaderSize);
1028   if (bytes[bytes.size() - 1] != EncodeStop())
1029     return Status(Error::CBOR_MAP_STOP_EXPECTED, cbor->size() - 1);
1030   cbor->pop_back();
1031   EncodeString8(string8_key, cbor);
1032   EncodeString8(string8_value, cbor);
1033   cbor->push_back(EncodeStop());
1034   size_t new_envelope_size = envelope_size + (cbor->size() - old_size);
1035   if (new_envelope_size > std::numeric_limits<uint32_t>::max())
1036     return Status(Error::CBOR_ENVELOPE_SIZE_LIMIT_EXCEEDED, 0);
1037   size_t size_pos = cbor->size() - new_envelope_size - sizeof(uint32_t);
1038   uint8_t* out = reinterpret_cast<uint8_t*>(&cbor->at(size_pos));
1039   *(out++) = (new_envelope_size >> 24) & 0xff;
1040   *(out++) = (new_envelope_size >> 16) & 0xff;
1041   *(out++) = (new_envelope_size >> 8) & 0xff;
1042   *(out) = new_envelope_size & 0xff;
1043   return Status();
1044 }
1045 }  // namespace cbor
1046 }  // namespace v8_crdtp
1047