1 /* 2 * Copyright (C) 2021 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef SRC_PROTOZERO_FILTERING_MESSAGE_TOKENIZER_H_ 18 #define SRC_PROTOZERO_FILTERING_MESSAGE_TOKENIZER_H_ 19 20 #include <stdint.h> 21 22 #include "perfetto/base/compiler.h" 23 #include "perfetto/base/logging.h" 24 #include "perfetto/protozero/proto_utils.h" 25 26 namespace protozero { 27 28 // A helper class for schema-less tokenizing of protobuf messages. 29 // This class takes a stream of proto-encoded bytes, pushed one by one in input 30 // via Push(octet), and returns a stream of tokens (each Push() call can return 31 // 0 or 1 token). 32 // A "token" contains metadata about a field, specifically: its ID, its wire 33 // type and: 34 // - For varint and fixed32/64 fields: its payload. 35 // - For string and bytes fields: the length of its payload. 36 // In this case the caller is supposed to "eat" those N bytes before calling 37 // Push() again. 38 // Note that this class cannot differentiate between a string/bytes field or 39 // a submessage, because they are encoded in the same way. The caller is 40 // supposed to know whether a field can be recursed into by just keep calling 41 // Push() or is a string that should be skipped. 42 // This is inline to allow the compiler to see through the Push method and 43 // avoid a function call for each byte. 44 class MessageTokenizer { 45 public: 46 struct Token { 47 uint32_t field_id; // 0 == not valid. 48 proto_utils::ProtoWireType type; 49 50 // For kLengthDelimited, |value| represent the length of the payload. 51 uint64_t value; 52 validToken53 inline bool valid() const { return field_id != 0; } 54 bool operator==(const Token& o) const { 55 return field_id == o.field_id && type == o.type && value == o.value; 56 } 57 }; 58 59 // Pushes a byte in input and returns a token, only when getting to the last 60 // byte of each field. Specifically: 61 // - For varint and fixed32 fields, the Token is returned after the last byte 62 // of the numeric payload is pushed. 63 // - For length-delimited fields, this returns after the last byte of the 64 // length is pushed (i.e. right before the payload starts). The caller is 65 // expected to either skip the next |value| bytes (in the case of a string 66 // or bytes fields) or keep calling Push, in the case of a submessage. Push(uint8_t octet)67 inline Token Push(uint8_t octet) { 68 using protozero::proto_utils::ProtoWireType; 69 70 // Parsing a fixed32/64 field is the only case where we don't have to do 71 // any varint decoding. This is why this block is before the remaining 72 // switch statement below (all the rest is a varint). 73 if (PERFETTO_UNLIKELY(state_ == kFixedIntValue)) { 74 PERFETTO_DCHECK(fixed_int_bits_ == 32 || fixed_int_bits_ == 64); 75 fixed_int_value_ |= static_cast<uint64_t>(octet) << fixed_int_shift_; 76 fixed_int_shift_ += 8; 77 if (fixed_int_shift_ < fixed_int_bits_) 78 return Token{}; // Intermediate byte of a fixed32/64. 79 auto wire_type = fixed_int_bits_ == 32 ? ProtoWireType::kFixed32 80 : ProtoWireType::kFixed64; 81 uint64_t fixed_int_value = fixed_int_value_; 82 fixed_int_value_ = fixed_int_shift_ = fixed_int_bits_ = 0; 83 state_ = kFieldPreamble; 84 return Token{field_id_, wire_type, fixed_int_value}; 85 } 86 87 // At this point either we are: (i) parsing a field preamble; (ii) parsing a 88 // varint field paylod; (iii) parsing the length of a length-delimited 89 // field. In all cases, we need to decode a varint before proceeding. 90 varint_ |= static_cast<uint64_t>(octet & 0x7F) << varint_shift_; 91 if (octet & 0x80) { 92 varint_shift_ += 7; 93 if (PERFETTO_UNLIKELY(varint_shift_ >= 64)) { 94 varint_shift_ = 0; 95 state_ = kInvalidVarInt; 96 } 97 return Token{}; // Still parsing a varint. 98 } 99 100 uint64_t varint = varint_; 101 varint_ = 0; 102 varint_shift_ = 0; 103 104 switch (state_) { 105 case kFieldPreamble: { 106 auto field_type = static_cast<uint32_t>(varint & 7u); // 7 = 0..0111 107 field_id_ = static_cast<uint32_t>(varint >> 3); 108 109 // The field type is legit, now check it's well formed and within 110 // boundaries. 111 if (field_type == static_cast<uint32_t>(ProtoWireType::kVarInt)) { 112 state_ = kVarIntValue; 113 } else if (field_type == 114 static_cast<uint32_t>(ProtoWireType::kFixed32) || 115 field_type == 116 static_cast<uint32_t>(ProtoWireType::kFixed64)) { 117 state_ = kFixedIntValue; 118 fixed_int_shift_ = 0; 119 fixed_int_value_ = 0; 120 fixed_int_bits_ = 121 field_type == static_cast<uint32_t>(ProtoWireType::kFixed32) ? 32 122 : 64; 123 } else if (field_type == 124 static_cast<uint32_t>(ProtoWireType::kLengthDelimited)) { 125 state_ = kLenDelimited; 126 } else { 127 state_ = kInvalidFieldType; 128 } 129 return Token{}; 130 } 131 132 case kVarIntValue: { 133 // Return the varint field payload and go back to the next field. 134 state_ = kFieldPreamble; 135 return Token{field_id_, ProtoWireType::kVarInt, varint}; 136 } 137 138 case kLenDelimited: { 139 const auto payload_len = varint; 140 if (payload_len > protozero::proto_utils::kMaxMessageLength) { 141 state_ = kMessageTooBig; 142 return Token{}; 143 } 144 state_ = kFieldPreamble; 145 // At this point the caller is expected to consume the next 146 // |payload_len| bytes. 147 return Token{field_id_, ProtoWireType::kLengthDelimited, payload_len}; 148 } 149 150 case kFixedIntValue: 151 // Unreacheable because of the if before the switch. 152 PERFETTO_DCHECK(false); 153 break; 154 155 // Unrecoverable error states. 156 case kInvalidFieldType: 157 case kMessageTooBig: 158 case kInvalidVarInt: 159 break; 160 } // switch(state_) 161 162 return Token{}; // Keep GCC happy. 163 } 164 165 // Returns true if the tokenizer FSM has reached quiescence (i.e. if we are 166 // NOT in the middle of parsing a field). idle()167 bool idle() const { 168 return state_ == kFieldPreamble && varint_shift_ == 0 && 169 fixed_int_shift_ == 0; 170 } 171 172 // Only for reporting parser errors in the trace. state()173 uint32_t state() const { return static_cast<uint32_t>(state_); } 174 175 private: 176 enum State { 177 kFieldPreamble = 0, // Parsing the varint for the field preamble. 178 kVarIntValue = 1, // Parsing the payload of a varint field. 179 kFixedIntValue = 2, // Parsing the payload of a fixed32/64 field. 180 kLenDelimited = 3, // Parsing the length of a length-delimited field. 181 182 // Unrecoverable error states: 183 kInvalidFieldType = 4, // Encountered an invalid field type. 184 kMessageTooBig = 5, // Size of the length delimited message was too big. 185 kInvalidVarInt = 6, // Varint larger than 64 bits. 186 }; 187 188 State state_ = kFieldPreamble; 189 uint32_t field_id_ = 0; 190 uint64_t varint_ = 0; 191 uint32_t varint_shift_ = 0; 192 uint32_t fixed_int_shift_ = 0; 193 uint32_t fixed_int_bits_ = 0; 194 uint64_t fixed_int_value_ = 0; 195 }; 196 197 } // namespace protozero 198 199 #endif // SRC_PROTOZERO_FILTERING_MESSAGE_TOKENIZER_H_ 200