1 /*
2 * Copyright (C) 2021 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "src/protozero/filtering/filter_util.h"
18
19 #include <algorithm>
20 #include <deque>
21 #include <map>
22 #include <memory>
23 #include <set>
24
25 #include <google/protobuf/compiler/importer.h>
26
27 #include "perfetto/base/build_config.h"
28 #include "perfetto/ext/base/file_utils.h"
29 #include "perfetto/ext/base/getopt.h"
30 #include "perfetto/ext/base/string_utils.h"
31 #include "perfetto/protozero/proto_utils.h"
32 #include "src/protozero/filtering/filter_bytecode_generator.h"
33 #include "src/protozero/filtering/filter_bytecode_parser.h"
34
35 namespace protozero {
36
37 namespace {
38
39 class MultiFileErrorCollectorImpl
40 : public google::protobuf::compiler::MultiFileErrorCollector {
41 public:
42 ~MultiFileErrorCollectorImpl() override;
43 void AddError(const std::string&, int, int, const std::string&) override;
44 void AddWarning(const std::string&, int, int, const std::string&) override;
45 };
46
47 MultiFileErrorCollectorImpl::~MultiFileErrorCollectorImpl() = default;
48
AddError(const std::string & filename,int line,int column,const std::string & message)49 void MultiFileErrorCollectorImpl::AddError(const std::string& filename,
50 int line,
51 int column,
52 const std::string& message) {
53 PERFETTO_ELOG("Error %s %d:%d: %s", filename.c_str(), line, column,
54 message.c_str());
55 }
56
AddWarning(const std::string & filename,int line,int column,const std::string & message)57 void MultiFileErrorCollectorImpl::AddWarning(const std::string& filename,
58 int line,
59 int column,
60 const std::string& message) {
61 PERFETTO_ELOG("Warning %s %d:%d: %s", filename.c_str(), line, column,
62 message.c_str());
63 }
64
65 } // namespace
66
67 FilterUtil::FilterUtil() = default;
68 FilterUtil::~FilterUtil() = default;
69
LoadMessageDefinition(const std::string & proto_file,const std::string & root_message,const std::string & proto_dir_path,const std::set<std::string> & passthrough_fields,const std::set<std::string> & string_filter_fields)70 bool FilterUtil::LoadMessageDefinition(
71 const std::string& proto_file,
72 const std::string& root_message,
73 const std::string& proto_dir_path,
74 const std::set<std::string>& passthrough_fields,
75 const std::set<std::string>& string_filter_fields) {
76 passthrough_fields_ = passthrough_fields;
77 passthrough_fields_seen_.clear();
78 filter_string_fields_ = string_filter_fields;
79 filter_string_fields_seen_.clear();
80
81 // The protobuf compiler doesn't like backslashes and prints an error like:
82 // Error C:\it7mjanpw3\perfetto-a16500 -1:0: Backslashes, consecutive slashes,
83 // ".", or ".." are not allowed in the virtual path.
84 // Given that C:\foo\bar is a legit path on windows, fix it at this level
85 // because the problem is really the protobuf compiler being too picky.
86 static auto normalize_for_win = [](const std::string& path) {
87 #if PERFETTO_BUILDFLAG(PERFETTO_OS_WIN)
88 return perfetto::base::ReplaceAll(path, "\\", "/");
89 #else
90 return path;
91 #endif
92 };
93
94 google::protobuf::compiler::DiskSourceTree dst;
95 #if PERFETTO_BUILDFLAG(PERFETTO_OS_WIN)
96 // If the path is absolute, maps "C:/" -> "C:/" (without hardcoding 'C').
97 if (proto_file.size() > 3 && proto_file[1] == ':') {
98 char win_drive[4]{proto_file[0], ':', '/', '\0'};
99 dst.MapPath(win_drive, win_drive);
100 }
101 #endif
102 dst.MapPath("/", "/"); // We might still need this on Win under cygwin.
103 dst.MapPath("", normalize_for_win(proto_dir_path));
104 MultiFileErrorCollectorImpl mfe;
105 google::protobuf::compiler::Importer importer(&dst, &mfe);
106 const google::protobuf::FileDescriptor* root_file =
107 importer.Import(normalize_for_win(proto_file));
108 const google::protobuf::Descriptor* root_msg = nullptr;
109 if (!root_message.empty()) {
110 root_msg = importer.pool()->FindMessageTypeByName(root_message);
111 } else if (root_file->message_type_count() > 0) {
112 // The user didn't specfy the root type. Pick the first type in the file,
113 // most times it's the right guess.
114 root_msg = root_file->message_type(0);
115 if (root_msg)
116 PERFETTO_LOG(
117 "The guessed root message name is \"%.*s\". Pass -r com.MyName to "
118 "override",
119 int(root_msg->full_name().size()), root_msg->full_name().data());
120 }
121
122 if (!root_msg) {
123 PERFETTO_ELOG("Could not find the root message \"%s\" in %s",
124 root_message.c_str(), proto_file.c_str());
125 return false;
126 }
127
128 // |descriptors_by_full_name| is passed by argument rather than being a member
129 // field so that we don't risk leaving it out of sync (and depending on it in
130 // future without realizing) when performing the Dedupe() pass.
131 DescriptorsByNameMap descriptors_by_full_name;
132 ParseProtoDescriptor(root_msg, &descriptors_by_full_name);
133
134 // If the user specified a set of fields to pass through, print an error and
135 // fail if any of the passed fields have not been seen while recursing in the
136 // schema. This is to avoid typos or naming changes to be silently ignored.
137 std::vector<std::string> unused;
138 std::set_difference(passthrough_fields_.begin(), passthrough_fields_.end(),
139 passthrough_fields_seen_.begin(),
140 passthrough_fields_seen_.end(),
141 std::back_inserter(unused));
142 for (const std::string& message_and_field : unused) {
143 PERFETTO_ELOG("Field not found %s", message_and_field.c_str());
144 }
145 if (!unused.empty()) {
146 PERFETTO_ELOG("Passthrough syntax: perfetto.protos.MessageName:field_name");
147 return false;
148 }
149 std::set_difference(
150 filter_string_fields_.begin(), filter_string_fields_.end(),
151 filter_string_fields_seen_.begin(), filter_string_fields_seen_.end(),
152 std::back_inserter(unused));
153 for (const std::string& message_and_field : unused) {
154 PERFETTO_ELOG("Field not found %s", message_and_field.c_str());
155 }
156 if (!unused.empty()) {
157 PERFETTO_ELOG(
158 "Filter string syntax: perfetto.protos.MessageName:field_name");
159 return false;
160 }
161 return true;
162 }
163
164 // Generates a Message object for the given libprotobuf message descriptor.
165 // Recurses as needed into nested fields.
ParseProtoDescriptor(const google::protobuf::Descriptor * proto,DescriptorsByNameMap * descriptors_by_full_name)166 FilterUtil::Message* FilterUtil::ParseProtoDescriptor(
167 const google::protobuf::Descriptor* proto,
168 DescriptorsByNameMap* descriptors_by_full_name) {
169 auto descr_it =
170 descriptors_by_full_name->find(std::string(proto->full_name()));
171 if (descr_it != descriptors_by_full_name->end())
172 return descr_it->second;
173
174 descriptors_.emplace_back();
175 Message* msg = &descriptors_.back();
176 msg->full_name = std::string(proto->full_name());
177 (*descriptors_by_full_name)[msg->full_name] = msg;
178 for (int i = 0; i < proto->field_count(); ++i) {
179 const auto* proto_field = proto->field(i);
180 const uint32_t field_id = static_cast<uint32_t>(proto_field->number());
181 PERFETTO_CHECK(msg->fields.count(field_id) == 0);
182 auto& field = msg->fields[field_id];
183 field.name = proto_field->name();
184 field.type = proto_field->type_name();
185
186 std::string message_and_field = msg->full_name + ":" + field.name;
187 bool passthrough = false;
188 if (passthrough_fields_.count(message_and_field)) {
189 field.type = "bytes";
190 passthrough = true;
191 passthrough_fields_seen_.insert(message_and_field);
192 }
193 if (filter_string_fields_.count(message_and_field)) {
194 PERFETTO_CHECK(field.type == "string");
195 field.filter_string = true;
196 msg->has_filter_string_fields = true;
197 filter_string_fields_seen_.insert(message_and_field);
198 }
199 if (proto_field->message_type() && !passthrough) {
200 msg->has_nested_fields = true;
201 // Recurse.
202 field.nested_type = ParseProtoDescriptor(proto_field->message_type(),
203 descriptors_by_full_name);
204 }
205 }
206 return msg;
207 }
208
Dedupe()209 void FilterUtil::Dedupe() {
210 std::map<std::string /*identity*/, Message*> index;
211
212 std::map<Message*, Message*> dupe_graph; // K,V: K shall be duped against V.
213
214 // As a first pass, generate an |identity| string for each leaf message. The
215 // identity is simply the comma-separated stringification of its field ids.
216 // If another message with the same identity exists, add an edge to the graph.
217 const size_t initial_count = descriptors_.size();
218 size_t field_count = 0;
219 for (auto& descr : descriptors_) {
220 // Dedupe only leaf messages without nested or string filter fields.
221 if (descr.has_nested_fields || descr.has_filter_string_fields)
222 continue;
223 std::string identity;
224 for (const auto& id_and_field : descr.fields)
225 identity.append(std::to_string(id_and_field.first) + ",");
226 auto it_and_inserted = index.emplace(identity, &descr);
227 if (!it_and_inserted.second) {
228 // insertion failed, a dupe exists already.
229 Message* dupe_against = it_and_inserted.first->second;
230 dupe_graph.emplace(&descr, dupe_against);
231 }
232 }
233
234 // Now apply de-duplications by re-directing the nested_type pointer to the
235 // equivalent descriptors that have the same set of allowed field ids.
236 std::set<Message*> referenced_descriptors;
237 referenced_descriptors.emplace(&descriptors_.front()); // The root.
238 for (auto& descr : descriptors_) {
239 for (auto& id_and_field : descr.fields) {
240 Message* target = id_and_field.second.nested_type;
241 if (!target)
242 continue; // Only try to dedupe nested types.
243 auto it = dupe_graph.find(target);
244 if (it == dupe_graph.end()) {
245 referenced_descriptors.emplace(target);
246 continue;
247 }
248 ++field_count;
249 // Replace with the dupe.
250 id_and_field.second.nested_type = it->second;
251 } // for (nested_fields).
252 } // for (descriptors_).
253
254 // Remove unreferenced descriptors. We should much rather crash in the case of
255 // a logic bug rathern than trying to use them but don't emit them.
256 size_t removed_count = 0;
257 for (auto it = descriptors_.begin(); it != descriptors_.end();) {
258 if (referenced_descriptors.count(&*it)) {
259 ++it;
260 } else {
261 ++removed_count;
262 it = descriptors_.erase(it);
263 }
264 }
265 PERFETTO_LOG(
266 "Deduplication removed %zu duped descriptors out of %zu descriptors from "
267 "%zu fields",
268 removed_count, initial_count, field_count);
269 }
270
271 // Prints the list of messages and fields in a diff-friendly text format.
PrintAsText(std::optional<std::string> filter_bytecode)272 void FilterUtil::PrintAsText(std::optional<std::string> filter_bytecode) {
273 using perfetto::base::StripPrefix;
274 const std::string& root_name = descriptors_.front().full_name;
275 std::string root_prefix = root_name.substr(0, root_name.rfind('.'));
276 if (!root_prefix.empty())
277 root_prefix.append(".");
278
279 FilterBytecodeParser parser;
280 if (filter_bytecode) {
281 PERFETTO_CHECK(
282 parser.Load(filter_bytecode->data(), filter_bytecode->size()));
283 }
284
285 // <Filter msg_index, Message>
286 std::deque<std::pair<uint32_t, const Message*>> queue;
287 std::set<const Message*> seen_msgs{&descriptors_.front()};
288 queue.emplace_back(0u, &descriptors_.front());
289
290 while (!queue.empty()) {
291 auto index_and_descr = queue.front();
292 queue.pop_front();
293 uint32_t msg_index = index_and_descr.first;
294 const auto& descr = *index_and_descr.second;
295
296 for (const auto& id_and_field : descr.fields) {
297 const uint32_t field_id = id_and_field.first;
298 const auto& field = id_and_field.second;
299
300 FilterBytecodeParser::QueryResult result{0, false};
301 if (filter_bytecode) {
302 result = parser.Query(msg_index, field_id);
303 if (!result.allowed) {
304 continue;
305 }
306 }
307
308 const Message* nested_type = id_and_field.second.nested_type;
309 bool passthrough = false;
310 if (nested_type) {
311 // result.simple_field might be true if the generated bytecode is
312 // passing through a whole submessage without recursing.
313 passthrough = result.allowed && result.simple_field();
314 if (seen_msgs.find(nested_type) == seen_msgs.end()) {
315 seen_msgs.insert(nested_type);
316 queue.emplace_back(result.nested_msg_index, nested_type);
317 }
318 } else { // simple field
319 PERFETTO_CHECK(result.simple_field() || result.filter_string_field() ||
320 !filter_bytecode);
321 PERFETTO_CHECK(result.filter_string_field() == field.filter_string ||
322 !filter_bytecode);
323 }
324
325 auto stripped_name = StripPrefix(descr.full_name, root_prefix);
326 std::string stripped_nested =
327 nested_type ? " " + StripPrefix(nested_type->full_name, root_prefix)
328 : "";
329 if (passthrough)
330 stripped_nested += " # PASSTHROUGH";
331 if (field.filter_string)
332 stripped_nested += " # FILTER STRING";
333 fprintf(print_stream_, "%-60s %3u %-8s %-32s%s\n", stripped_name.c_str(),
334 field_id, field.type.c_str(), field.name.c_str(),
335 stripped_nested.c_str());
336 }
337 }
338 }
339
GenerateFilterBytecode()340 std::string FilterUtil::GenerateFilterBytecode() {
341 protozero::FilterBytecodeGenerator bytecode_gen;
342
343 // Assign indexes to descriptors, simply by counting them in order;
344 std::map<Message*, uint32_t> descr_to_idx;
345 for (auto& descr : descriptors_)
346 descr_to_idx[&descr] = static_cast<uint32_t>(descr_to_idx.size());
347
348 for (auto& descr : descriptors_) {
349 for (auto it = descr.fields.begin(); it != descr.fields.end();) {
350 uint32_t field_id = it->first;
351 const Message::Field& field = it->second;
352 if (field.nested_type) {
353 // Append the index of the target submessage.
354 PERFETTO_CHECK(descr_to_idx.count(field.nested_type));
355 uint32_t nested_msg_index = descr_to_idx[field.nested_type];
356 bytecode_gen.AddNestedField(field_id, nested_msg_index);
357 ++it;
358 continue;
359 }
360 if (field.filter_string) {
361 bytecode_gen.AddFilterStringField(field_id);
362 ++it;
363 continue;
364 }
365 // Simple field. Lookahead to see if we have a range of contiguous simple
366 // fields.
367 for (uint32_t range_len = 1;; ++range_len) {
368 ++it;
369 if (it != descr.fields.end() && it->first == field_id + range_len &&
370 it->second.is_simple()) {
371 continue;
372 }
373 // At this point it points to either the end() of the vector or a
374 // non-contiguous or non-simple field (which will be picked up by the
375 // next iteration).
376 if (range_len == 1) {
377 bytecode_gen.AddSimpleField(field_id);
378 } else {
379 bytecode_gen.AddSimpleFieldRange(field_id, range_len);
380 }
381 break;
382 } // for (range_len)
383 } // for (descr.fields)
384 bytecode_gen.EndMessage();
385 } // for (descriptors)
386 return bytecode_gen.Serialize();
387 }
388
LookupField(const std::string & varint_encoded_path)389 std::string FilterUtil::LookupField(const std::string& varint_encoded_path) {
390 const uint8_t* ptr =
391 reinterpret_cast<const uint8_t*>(varint_encoded_path.data());
392 const uint8_t* const end = ptr + varint_encoded_path.size();
393
394 std::vector<uint32_t> fields;
395 while (ptr < end) {
396 uint64_t varint;
397 const uint8_t* next = proto_utils::ParseVarInt(ptr, end, &varint);
398 PERFETTO_CHECK(next != ptr);
399 fields.emplace_back(static_cast<uint32_t>(varint));
400 ptr = next;
401 }
402 return LookupField(fields.data(), fields.size());
403 }
404
LookupField(const uint32_t * field_ids,size_t num_fields)405 std::string FilterUtil::LookupField(const uint32_t* field_ids,
406 size_t num_fields) {
407 const Message* msg = descriptors_.empty() ? nullptr : &descriptors_.front();
408 std::string res;
409 for (size_t i = 0; i < num_fields; ++i) {
410 const uint32_t field_id = field_ids[i];
411 const Message::Field* field = nullptr;
412 if (msg) {
413 auto it = msg->fields.find(field_id);
414 field = it == msg->fields.end() ? nullptr : &it->second;
415 }
416 res.append(".");
417 if (field) {
418 res.append(field->name);
419 msg = field->nested_type;
420 } else {
421 res.append(std::to_string(field_id));
422 }
423 }
424 return res;
425 }
426
427 } // namespace protozero
428