1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef INCLUDE_PERFETTO_PROTOZERO_PROTO_DECODER_H_ 18 #define INCLUDE_PERFETTO_PROTOZERO_PROTO_DECODER_H_ 19 20 #include <stdint.h> 21 #include <array> 22 #include <memory> 23 #include <vector> 24 25 #include "perfetto/base/compiler.h" 26 #include "perfetto/base/logging.h" 27 #include "perfetto/protozero/field.h" 28 #include "perfetto/protozero/proto_utils.h" 29 30 namespace protozero { 31 32 // A generic protobuf decoder. Doesn't require any knowledge about the proto 33 // schema. It tokenizes fields, retrieves their ID and type and exposes 34 // accessors to retrieve its values. 35 // It does NOT recurse in nested submessages, instead it just computes their 36 // boundaries, recursion is left to the caller. 37 // This class is designed to be used in perf-sensitive contexts. It does not 38 // allocate and does not perform any proto semantic checks (e.g. repeated / 39 // required / optional). It's supposedly safe wrt out-of-bounds memory accesses 40 // (see proto_decoder_fuzzer.cc). 41 // This class serves also as a building block for TypedProtoDecoder, used when 42 // the schema is known at compile time. 43 class PERFETTO_EXPORT_COMPONENT ProtoDecoder { 44 public: 45 // Creates a ProtoDecoder using the given |buffer| with size |length| bytes. ProtoDecoder(const void * buffer,size_t length)46 ProtoDecoder(const void* buffer, size_t length) 47 : begin_(reinterpret_cast<const uint8_t*>(buffer)), 48 end_(begin_ + length), 49 read_ptr_(begin_) {} ProtoDecoder(const std::string & str)50 ProtoDecoder(const std::string& str) : ProtoDecoder(str.data(), str.size()) {} ProtoDecoder(const ConstBytes & cb)51 ProtoDecoder(const ConstBytes& cb) : ProtoDecoder(cb.data, cb.size) {} 52 53 // Reads the next field from the buffer and advances the read cursor. If a 54 // full field cannot be read, the returned Field will be invalid (i.e. 55 // field.valid() == false). 56 Field ReadField(); 57 58 // Finds the first field with the given id. Doesn't affect the read cursor. 59 Field FindField(uint32_t field_id); 60 61 // Resets the read cursor to the start of the buffer. Reset()62 void Reset() { read_ptr_ = begin_; } 63 64 // Resets the read cursor to the given position (must be within the buffer). Reset(const uint8_t * pos)65 void Reset(const uint8_t* pos) { 66 PERFETTO_DCHECK(pos >= begin_ && pos < end_); 67 read_ptr_ = pos; 68 } 69 70 // Returns the position of read cursor, relative to the start of the buffer. read_offset()71 size_t read_offset() const { return static_cast<size_t>(read_ptr_ - begin_); } 72 bytes_left()73 size_t bytes_left() const { 74 PERFETTO_DCHECK(read_ptr_ <= end_); 75 return static_cast<size_t>(end_ - read_ptr_); 76 } 77 begin()78 const uint8_t* begin() const { return begin_; } end()79 const uint8_t* end() const { return end_; } 80 81 protected: 82 const uint8_t* const begin_; 83 const uint8_t* const end_; 84 const uint8_t* read_ptr_ = nullptr; 85 }; 86 87 // An iterator-like class used to iterate through repeated fields. Used by 88 // TypedProtoDecoder. The iteration sequence is a bit counter-intuitive due to 89 // the fact that fields_[field_id] holds the *last* value of the field, not the 90 // first, but the remaining storage holds repeated fields in FIFO order. 91 // Assume that we push the 10,11,12 into a repeated field with ID=1. 92 // 93 // Decoder memory layout: [ fields storage ] [ repeated fields storage ] 94 // 1st iteration: 10 95 // 2nd iteration: 11 10 96 // 3rd iteration: 12 10 11 97 // 98 // We start the iteration @ fields_[num_fields], which is the start of the 99 // repeated fields storage, proceed until the end and lastly jump @ fields_[id]. 100 template <typename T> 101 class RepeatedFieldIterator { 102 public: RepeatedFieldIterator(uint32_t field_id,const Field * begin,const Field * end,const Field * last)103 RepeatedFieldIterator(uint32_t field_id, 104 const Field* begin, 105 const Field* end, 106 const Field* last) 107 : field_id_(field_id), iter_(begin), end_(end), last_(last) { 108 FindNextMatchingId(); 109 } 110 111 // Constructs an invalid iterator. RepeatedFieldIterator()112 RepeatedFieldIterator() 113 : field_id_(0u), iter_(nullptr), end_(nullptr), last_(nullptr) {} 114 115 explicit operator bool() const { return iter_ != end_; } field()116 const Field& field() const { return *iter_; } 117 118 T operator*() const { 119 T val{}; 120 iter_->get(&val); 121 return val; 122 } 123 const Field* operator->() const { return iter_; } 124 125 RepeatedFieldIterator& operator++() { 126 PERFETTO_DCHECK(iter_ != end_); 127 if (iter_ == last_) { 128 iter_ = end_; 129 return *this; 130 } 131 ++iter_; 132 FindNextMatchingId(); 133 return *this; 134 } 135 136 RepeatedFieldIterator operator++(int) { 137 PERFETTO_DCHECK(iter_ != end_); 138 RepeatedFieldIterator it(*this); 139 ++(*this); 140 return it; 141 } 142 143 private: FindNextMatchingId()144 void FindNextMatchingId() { 145 PERFETTO_DCHECK(iter_ != last_); 146 for (; iter_ != end_; ++iter_) { 147 if (iter_->id() == field_id_) 148 return; 149 } 150 iter_ = last_->valid() ? last_ : end_; 151 } 152 153 uint32_t field_id_; 154 155 // Initially points to the beginning of the repeated field storage, then is 156 // incremented as we call operator++(). 157 const Field* iter_; 158 159 // Always points to fields_[size_], i.e. past the end of the storage. 160 const Field* end_; 161 162 // Always points to fields_[field_id]. 163 const Field* last_; 164 }; 165 166 // As RepeatedFieldIterator, but allows iterating over a packed repeated field 167 // (which will be initially stored as a single length-delimited field). 168 // See |GetPackedRepeatedField| for details. 169 // 170 // Assumes little endianness, and that the input buffers are well formed - 171 // containing an exact multiple of encoded elements. 172 template <proto_utils::ProtoWireType wire_type, typename CppType> 173 class PackedRepeatedFieldIterator { 174 public: PackedRepeatedFieldIterator(const uint8_t * data_begin,size_t size,bool * parse_error_ptr)175 PackedRepeatedFieldIterator(const uint8_t* data_begin, 176 size_t size, 177 bool* parse_error_ptr) 178 : data_end_(data_begin ? data_begin + size : nullptr), 179 read_ptr_(data_begin), 180 parse_error_(parse_error_ptr) { 181 using proto_utils::ProtoWireType; 182 static_assert(wire_type == ProtoWireType::kVarInt || 183 wire_type == ProtoWireType::kFixed32 || 184 wire_type == ProtoWireType::kFixed64, 185 "invalid type"); 186 187 PERFETTO_DCHECK(parse_error_ptr); 188 189 // Either the field is unset (and there are no data pointer), or the field 190 // is set with a zero length payload. Mark the iterator as invalid in both 191 // cases. 192 if (size == 0) { 193 curr_value_valid_ = false; 194 return; 195 } 196 197 if ((wire_type == ProtoWireType::kFixed32 && (size % 4) != 0) || 198 (wire_type == ProtoWireType::kFixed64 && (size % 8) != 0)) { 199 *parse_error_ = true; 200 curr_value_valid_ = false; 201 return; 202 } 203 204 ++(*this); 205 } 206 207 const CppType operator*() const { return curr_value_; } 208 explicit operator bool() const { return curr_value_valid_; } 209 210 PackedRepeatedFieldIterator& operator++() { 211 using proto_utils::ProtoWireType; 212 213 if (PERFETTO_UNLIKELY(!curr_value_valid_)) 214 return *this; 215 216 if (PERFETTO_UNLIKELY(read_ptr_ == data_end_)) { 217 curr_value_valid_ = false; 218 return *this; 219 } 220 221 if (wire_type == ProtoWireType::kVarInt) { 222 uint64_t new_value = 0; 223 const uint8_t* new_pos = 224 proto_utils::ParseVarInt(read_ptr_, data_end_, &new_value); 225 226 if (PERFETTO_UNLIKELY(new_pos == read_ptr_)) { 227 // Failed to decode the varint (probably incomplete buffer). 228 *parse_error_ = true; 229 curr_value_valid_ = false; 230 } else { 231 read_ptr_ = new_pos; 232 curr_value_ = static_cast<CppType>(new_value); 233 } 234 } else { // kFixed32 or kFixed64 235 constexpr size_t kStep = wire_type == ProtoWireType::kFixed32 ? 4 : 8; 236 237 // NB: the raw buffer is not guaranteed to be aligned, so neither are 238 // these copies. 239 memcpy(&curr_value_, read_ptr_, sizeof(CppType)); 240 read_ptr_ += kStep; 241 } 242 243 return *this; 244 } 245 246 PackedRepeatedFieldIterator operator++(int) { 247 PackedRepeatedFieldIterator it(*this); 248 ++(*this); 249 return it; 250 } 251 252 private: 253 // Might be null if the backing proto field isn't set. 254 const uint8_t* const data_end_; 255 256 // The iterator looks ahead by an element, so |curr_value| holds the value 257 // to be returned when the caller dereferences the iterator, and |read_ptr_| 258 // points at the start of the next element to be decoded. 259 // |read_ptr_| might be null if the backing proto field isn't set. 260 const uint8_t* read_ptr_; 261 CppType curr_value_ = {}; 262 263 // Set to false once we've exhausted the iterator, or encountered an error. 264 bool curr_value_valid_ = true; 265 266 // Where to set parsing errors, supplied by the caller. 267 bool* const parse_error_; 268 }; 269 270 // This decoder loads all fields upfront, without recursing in nested messages. 271 // It is used as a base class for typed decoders generated by the pbzero plugin. 272 // The split between TypedProtoDecoderBase and TypedProtoDecoder<> is to have 273 // unique definition of functions like ParseAllFields() and ExpandHeapStorage(). 274 // The storage (either on-stack or on-heap) for this class is organized as 275 // follows: 276 // |-------------------------- fields_ ----------------------| 277 // [ field 0 (invalid) ] [ fields 1 .. N ] [ repeated fields ] 278 // ^ ^ 279 // num_fields_ size_ 280 // Note that if a message has high field numbers, upon creation |size_| can be 281 // < |num_fields_| (until a heap expansion is hit while inserting). 282 class PERFETTO_EXPORT_COMPONENT TypedProtoDecoderBase : public ProtoDecoder { 283 public: 284 // If the field |id| is known at compile time, prefer the templated 285 // specialization at<kFieldNumber>(). Get(uint32_t id)286 const Field& Get(uint32_t id) const { 287 if (PERFETTO_LIKELY(id < num_fields_ && id < size_)) 288 return fields_[id]; 289 // If id >= num_fields_, the field id is invalid (was not known in the 290 // .proto) and we return the 0th field, which is always !valid(). 291 // If id >= size_ and <= num_fields, the id is valid but the field has not 292 // been seen while decoding (hence the stack storage has not been expanded) 293 // so we return the 0th invalid field. 294 return fields_[0]; 295 } 296 297 // Returns an object that allows to iterate over all instances of a repeated 298 // field given its id. Example usage: 299 // for (auto it = decoder.GetRepeated<int32_t>(N); it; ++it) { ... } 300 template <typename T> GetRepeated(uint32_t field_id)301 RepeatedFieldIterator<T> GetRepeated(uint32_t field_id) const { 302 const Field* repeated_begin; 303 // The storage for repeated fields starts after the slot for the highest 304 // field id (refer to the diagram in the class-level comment). However, if 305 // a message has more than INITIAL_STACK_CAPACITY field there will be no 306 // slots available for the repeated fields (if ExpandHeapStorage() was not 307 // called). Imagine a message that has highest field id = 102 and that is 308 // still using the stack: 309 // [ F0 ] [ F1 ] ... [ F100 ] [ F101 ] [ F1012] [ repeated fields ] 310 // ^ num_fields_ 311 // ^ size (== capacity) 312 if (PERFETTO_LIKELY(num_fields_ < size_)) { 313 repeated_begin = &fields_[num_fields_]; 314 } else { 315 // This is the case of not having any storage space for repeated fields. 316 // This makes it so begin == end, so the iterator will just skip @ last. 317 repeated_begin = &fields_[size_]; 318 } 319 const Field* repeated_end = &fields_[size_]; 320 const Field* last = &Get(field_id); 321 return RepeatedFieldIterator<T>(field_id, repeated_begin, repeated_end, 322 last); 323 } 324 325 // Returns an objects that allows to iterate over all entries of a packed 326 // repeated field given its id and type. The |wire_type| is necessary for 327 // decoding the packed field, the |cpp_type| is for convenience & stronger 328 // typing. 329 // 330 // The caller must also supply a pointer to a bool that is set to true if the 331 // packed buffer is found to be malformed while iterating (so you need to 332 // exhaust the iterator if you want to check the full extent of the buffer). 333 // 334 // Note that unlike standard protobuf parsers, protozero does not allow 335 // treating of packed repeated fields as non-packed and vice-versa (therefore 336 // not making the packed option forwards and backwards compatible). So 337 // the caller needs to use the right accessor for correct results. 338 template <proto_utils::ProtoWireType wire_type, typename cpp_type> GetPackedRepeated(uint32_t field_id,bool * parse_error_location)339 PackedRepeatedFieldIterator<wire_type, cpp_type> GetPackedRepeated( 340 uint32_t field_id, 341 bool* parse_error_location) const { 342 const Field& field = Get(field_id); 343 if (field.valid() && 344 field.type() == proto_utils::ProtoWireType::kLengthDelimited) { 345 return PackedRepeatedFieldIterator<wire_type, cpp_type>( 346 field.data(), field.size(), parse_error_location); 347 } 348 return PackedRepeatedFieldIterator<wire_type, cpp_type>( 349 nullptr, 0, parse_error_location); 350 } 351 352 protected: TypedProtoDecoderBase(Field * storage,uint32_t num_fields,uint32_t capacity,const uint8_t * buffer,size_t length)353 TypedProtoDecoderBase(Field* storage, 354 uint32_t num_fields, 355 uint32_t capacity, 356 const uint8_t* buffer, 357 size_t length) 358 : ProtoDecoder(buffer, length), 359 fields_(storage), 360 num_fields_(num_fields), 361 // The reason for "capacity -1" is to avoid hitting the expansion path 362 // in TypedProtoDecoderBase::ParseAllFields() when we are just setting 363 // fields < INITIAL_STACK_CAPACITY (which is the most common case). 364 size_(std::min(num_fields, capacity - 1)), 365 capacity_(capacity) { 366 // The reason why Field needs to be trivially de/constructible is to avoid 367 // implicit initializers on all the ~1000 entries. We need it to initialize 368 // only on the first |max_field_id| fields, the remaining capacity doesn't 369 // require initialization. 370 static_assert(std::is_trivially_constructible<Field>::value && 371 std::is_trivially_destructible<Field>::value && 372 std::is_trivial<Field>::value, 373 "Field must be a trivial aggregate type"); 374 memset(fields_, 0, sizeof(Field) * capacity_); 375 PERFETTO_DCHECK(capacity > 0); 376 } 377 378 void ParseAllFields(); 379 380 // Called when the default on-stack storage is exhausted and new repeated 381 // fields need to be pushed. 382 void ExpandHeapStorage(); 383 384 // Used only in presence of a large number of repeated fields, when the 385 // default on-stack storage is exhausted. 386 std::unique_ptr<Field[]> heap_storage_; 387 388 // Points to the storage, either on-stack (default, provided by the template 389 // specialization) or |heap_storage_| after ExpandHeapStorage() is called, in 390 // case of a large number of repeated fields. 391 Field* fields_; 392 393 // Number of known fields, without accounting repeated storage. This is equal 394 // to MAX_FIELD_ID + 1 (to account for the invalid 0th field). It never 395 // changes after construction. 396 // This is unrelated with |size_| and |capacity_|. If the highest field id of 397 // a proto message is 131, |num_fields_| will be = 132 but, on initialization, 398 // |size_| = |capacity_| = 100 (INITIAL_STACK_CAPACITY). 399 // One cannot generally assume that |fields_| has enough storage to 400 // dereference every field. That is only true: 401 // - For field ids < INITIAL_STACK_CAPACITY. 402 // - After the first call to ExpandHeapStorage(). 403 uint32_t num_fields_; 404 405 // Number of active |fields_| entries. This is initially equal to 406 // min(num_fields_, INITIAL_STACK_CAPACITY - 1) and after ExpandHeapStorage() 407 // becomes == |num_fields_|. If the message has non-packed repeated fields, it 408 // can grow further, up to |capacity_|. 409 // |size_| is always <= |capacity_|. But |num_fields_| can be > |size_|. 410 uint32_t size_; 411 412 // Initially equal to kFieldsCapacity of the TypedProtoDecoder 413 // specialization. Can grow when falling back on heap-based storage, in which 414 // case it represents the size (#fields with each entry of a repeated field 415 // counted individually) of the |heap_storage_| array. 416 uint32_t capacity_; 417 }; 418 419 // This constant is a tradeoff between having a larger stack frame and being 420 // able to decode field IDs up to N (or N - num_fields repeated fields) without 421 // falling back on the heap. 422 #define PROTOZERO_DECODER_INITIAL_STACK_CAPACITY 100 423 424 // Template class instantiated by the auto-generated decoder classes declared in 425 // xxx.pbzero.h files. 426 template <int MAX_FIELD_ID, bool HAS_NONPACKED_REPEATED_FIELDS> 427 class TypedProtoDecoder : public TypedProtoDecoderBase { 428 public: TypedProtoDecoder(const uint8_t * buffer,size_t length)429 TypedProtoDecoder(const uint8_t* buffer, size_t length) 430 : TypedProtoDecoderBase(on_stack_storage_, 431 /*num_fields=*/MAX_FIELD_ID + 1, 432 PROTOZERO_DECODER_INITIAL_STACK_CAPACITY, 433 buffer, 434 length) { 435 TypedProtoDecoderBase::ParseAllFields(); 436 } 437 438 template <uint32_t FIELD_ID> at()439 const Field& at() const { 440 static_assert(FIELD_ID <= MAX_FIELD_ID, "FIELD_ID > MAX_FIELD_ID"); 441 // If the field id is < the on-stack capacity, it's safe to always 442 // dereference |fields_|, whether it's still using the stack or it fell 443 // back on the heap. Because both terms of the if () are known at compile 444 // time, the compiler elides the branch for ids < INITIAL_STACK_CAPACITY. 445 if (FIELD_ID < PROTOZERO_DECODER_INITIAL_STACK_CAPACITY) { 446 return fields_[FIELD_ID]; 447 } else { 448 // Otherwise use the slowpath Get() which will do a runtime check. 449 return Get(FIELD_ID); 450 } 451 } 452 TypedProtoDecoder(TypedProtoDecoder && other)453 TypedProtoDecoder(TypedProtoDecoder&& other) noexcept 454 : TypedProtoDecoderBase(std::move(other)) { 455 // If the moved-from decoder was using on-stack storage, we need to update 456 // our pointer to point to this decoder's on-stack storage. 457 if (fields_ == other.on_stack_storage_) { 458 fields_ = on_stack_storage_; 459 memcpy(on_stack_storage_, other.on_stack_storage_, 460 sizeof(on_stack_storage_)); 461 } 462 } 463 464 private: 465 Field on_stack_storage_[PROTOZERO_DECODER_INITIAL_STACK_CAPACITY]; 466 }; 467 468 } // namespace protozero 469 470 #endif // INCLUDE_PERFETTO_PROTOZERO_PROTO_DECODER_H_ 471