1 /* 2 * Copyright (C) 2021 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef SRC_PROTOZERO_FILTERING_MESSAGE_FILTER_H_ 18 #define SRC_PROTOZERO_FILTERING_MESSAGE_FILTER_H_ 19 20 #include <stdint.h> 21 22 #include <memory> 23 #include <string> 24 #include <unordered_map> 25 26 #include "src/protozero/filtering/filter_bytecode_parser.h" 27 #include "src/protozero/filtering/message_tokenizer.h" 28 #include "src/protozero/filtering/string_filter.h" 29 30 namespace protozero { 31 32 // A class to filter binary-encoded proto messages using an allow-list of field 33 // ids, also known as "filter bytecode". The filter determines which fields are 34 // allowed to be passed through in output and strips all the other fields. 35 // See go/trace-filtering for full design. 36 // This class takes in input: 37 // 1) The filter bytecode, loaded once via the LoadFilterBytecode() method. 38 // 2) A proto-encoded binary message. The message doesn't have to be contiguous, 39 // it can be passed as an array of arbitrarily chunked fragments. 40 // The FilterMessage*() method returns in output a proto message, stripping out 41 // all unknown fields. If the input is malformed (e.g., unknown proto field wire 42 // types, lengths out of bound) the whole filtering failed and the |error| flag 43 // of the FilteredMessage object is set to true. 44 // The filtering operation is based on rewriting a copy of the message into a 45 // self-allocated buffer, which is then returned in the output. The input buffer 46 // is NOT altered. 47 // Note also that the process of rewriting the protos gets rid of most redundant 48 // varint encoding (if present). So even if all fields are allow-listed, the 49 // output might NOT be bitwise identical to the input (but it will be 50 // semantically equivalent). 51 // Furthermore the enable_field_usage_tracking() method allows to keep track of 52 // a histogram of allowed / denied fields. It slows down filtering and is 53 // intended only on host tools. 54 class MessageFilter { 55 public: 56 MessageFilter(); 57 explicit MessageFilter(const MessageFilter&); 58 ~MessageFilter(); 59 60 struct InputSlice { 61 const void* data; 62 size_t len; 63 }; 64 65 struct FilteredMessage { FilteredMessageFilteredMessage66 FilteredMessage(std::unique_ptr<uint8_t[]> d, size_t s) 67 : data(std::move(d)), size(s) {} 68 std::unique_ptr<uint8_t[]> data; 69 size_t size; // The used bytes in |data|. This is <= sizeof(data). 70 bool error = false; 71 }; 72 73 // Loads the filter bytecode that will be used to filter any subsequent 74 // message. Must be called before the first call to FilterMessage*(). 75 // |filter_data| must point to a byte buffer for a proto-encoded ProtoFilter 76 // message (see proto_filter.proto). 77 bool LoadFilterBytecode(const void* filter_data, size_t len); 78 79 // This affects the filter starting point of the subsequent FilterMessage*() 80 // calls. By default the filtering process starts from the message @ index 0, 81 // the root message passed to proto_filter when generating the bytecode 82 // (in typical tracing use-cases, this is perfetto.protos.Trace). However, the 83 // caller (TracingServiceImpl) might want to filter packets from the 2nd level 84 // (perfetto.protos.TracePacket) because the root level is pre-pended after 85 // the fact. This call allows to change the root message for the filter. 86 // The argument |field_ids| is an array of proto field ids and determines the 87 // path to the new root. For instance, in the case of [1,2,3] SetFilterRoot 88 // will identify the sub-message for the field "root.1.2.3" and use that. 89 // In order for this to succeed all the fields in the path must be allowed 90 // in the filter and must be a nested message type. 91 bool SetFilterRoot(const uint32_t* field_ids, size_t num_fields); 92 93 // Takes an input message, fragmented in arbitrary slices, and returns a 94 // filtered message in output. 95 FilteredMessage FilterMessageFragments(const InputSlice*, size_t num_slices); 96 97 // Helper for tests, where the input is a contiguous buffer. FilterMessage(const void * data,size_t len)98 FilteredMessage FilterMessage(const void* data, size_t len) { 99 InputSlice slice{data, len}; 100 return FilterMessageFragments(&slice, 1); 101 } 102 103 // When enabled returns a map of "field path" to "usage counter". 104 // The key (std::string) is a binary buffer (i.e. NOT an ASCII/UTF-8 string) 105 // which contains a varint for each field. Consider the following: 106 // message Root { Sub1 f1 = 1; }; 107 // message Sub1 { Sub2 f2 = 7;} 108 // message Sub2 { string f3 = 5; } 109 // The field .f1.f2.f3 will be encoded as \x01\0x07\x05. 110 // The value is the number of times that field has been encountered. If the 111 // field is not allow-listed in the bytecode (the field is stripped in output) 112 // the count will be negative. enable_field_usage_tracking(bool x)113 void enable_field_usage_tracking(bool x) { track_field_usage_ = x; } field_usage()114 const std::unordered_map<std::string, int32_t>& field_usage() const { 115 return field_usage_; 116 } 117 118 // Exposed only for DCHECKS in TracingServiceImpl. root_msg_index()119 uint32_t root_msg_index() { return root_msg_index_; } 120 121 // Retuns the helper class used to perform string filtering. string_filter()122 StringFilter& string_filter() { return string_filter_; } 123 124 private: 125 // This is called by FilterMessageFragments(). 126 // Inlining allows the compiler turn the per-byte call/return into a for loop, 127 // while, at the same time, keeping the code easy to read and reason about. 128 // It gives a 20-25% speedup (265ms vs 215ms for a 25MB trace). 129 void FilterOneByte(uint8_t octet) PERFETTO_ALWAYS_INLINE; 130 131 // No-inline because this is a slowpath (only when usage tracking is enabled). 132 void IncrementCurrentFieldUsage(uint32_t field_id, 133 bool allowed) PERFETTO_NO_INLINE; 134 135 // Gets into an error state which swallows all the input and emits no output. 136 void SetUnrecoverableErrorState(); 137 138 // We keep track of the nest of messages in a stack. Each StackState 139 // object corresponds to a level of nesting in the proto message structure. 140 // Every time a new field of type len-delimited that has a corresponding 141 // sub-message in the bytecode is encountered, a new StackState is pushed in 142 // |stack_|. stack_[0] is a sentinel to prevent over-popping without adding 143 // extra branches in the fastpath. 144 // |stack_|. stack_[1] is the state of the root message. 145 struct StackState { 146 uint32_t in_bytes = 0; // Number of input bytes processed. 147 148 // When |in_bytes| reaches this value, the current state should be popped. 149 // This is set when recursing into nested submessages. This is 0 only for 150 // stack_[0] (we don't know the size of the root message upfront). 151 uint32_t in_bytes_limit = 0; 152 153 // This is set when a len-delimited message is encountered, either a string 154 // or a nested submessage that is NOT allow-listed in the bytecode. 155 // This causes input bytes to be consumed without being parsed from the 156 // input stream. If |passthrough_eaten_bytes| == true, they will be copied 157 // as-is in output (e.g. in the case of an allowed string/bytes field). 158 uint32_t eat_next_bytes = 0; 159 160 // Keeps tracks of the stream_writer output counter (out_.written()) then 161 // the StackState is pushed. This is used to work out, when popping, how 162 // many bytes have been written for the current submessage. 163 uint32_t out_bytes_written_at_start = 0; 164 165 uint32_t field_id = 0; // The proto field id for the current message. 166 uint32_t msg_index = 0; // The index of the message filter in the bytecode. 167 168 // This is a pointer to the proto preamble for the current submessage 169 // (it's nullptr for stack_[0] and non-null elsewhere). This will be filled 170 // with the actual size of the message (out_.written() - 171 // |out_bytes_written_at_start|) when finishing (popping) the message. 172 // This must be filled using WriteRedundantVarint(). Note that the 173 // |size_field_len| is variable and depends on the actual length of the 174 // input message. If the output message has roughly the same size of the 175 // input message, the length will not be redundant. 176 // In other words: the length of the field is reserved when the submessage 177 // starts. At that point we know the upper-bound for the output message 178 // (a filtered submessage can be <= the original one, but not >). So we 179 // reserve as many bytes it takes to write the input length in varint. 180 // Then, when the message is finalized and we know the actual output size 181 // we backfill the field. 182 // Consider the example of a submessage where the input size = 130 (>127, 183 // 2 varint bytes) and the output is 120 bytes. The length will be 2 bytes 184 // wide even though could have been encoded with just one byte. 185 uint8_t* size_field = nullptr; 186 uint32_t size_field_len = 0; 187 188 // The pointer to the start of the string to update the string if it is 189 // filtered. 190 uint8_t* filter_string_ptr = nullptr; 191 192 // How |eat_next_bytes| should be handled. It seems that keeping this field 193 // at the end rather than next to |eat_next_bytes| makes the filter a little 194 // (but measurably) faster. (likely something related with struct layout vs 195 // cache sizes). 196 enum FilterAction { 197 kDrop, 198 kPassthrough, 199 kFilterString, 200 }; 201 FilterAction action = FilterAction::kDrop; 202 }; 203 out_written()204 uint32_t out_written() { return static_cast<uint32_t>(out_ - &out_buf_[0]); } 205 206 // WARNING: Some of these fields should be in the copy constructor. 207 std::unique_ptr<uint8_t[]> out_buf_; 208 uint8_t* out_ = nullptr; 209 uint8_t* out_end_ = nullptr; 210 uint32_t root_msg_index_ = 0; 211 212 FilterBytecodeParser filter_; 213 MessageTokenizer tokenizer_; 214 StringFilter string_filter_; 215 std::vector<StackState> stack_; 216 217 bool error_ = false; 218 bool track_field_usage_ = false; 219 std::unordered_map<std::string, int32_t> field_usage_; 220 }; 221 222 } // namespace protozero 223 224 #endif // SRC_PROTOZERO_FILTERING_MESSAGE_FILTER_H_ 225