/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/traced/probes/ftrace/cpu_reader.h"

#include <dirent.h>
#include <fcntl.h>
#include <signal.h>

#include <algorithm>
#include <optional>
#include <utility>

#include "perfetto/base/build_config.h"
#include "perfetto/base/logging.h"
#include "perfetto/ext/base/metatrace.h"
#include "perfetto/ext/base/string_splitter.h"
#include "perfetto/ext/base/string_utils.h"
#include "perfetto/ext/base/utils.h"
#include "perfetto/ext/tracing/core/trace_writer.h"
#include "src/kallsyms/kernel_symbol_map.h"
#include "src/kallsyms/lazy_kernel_symbolizer.h"
#include "src/traced/probes/ftrace/cpu_stats_parser.h"
#include "src/traced/probes/ftrace/ftrace_config_muxer.h"
#include "src/traced/probes/ftrace/ftrace_controller.h"
#include "src/traced/probes/ftrace/ftrace_data_source.h"
#include "src/traced/probes/ftrace/ftrace_print_filter.h"
#include "src/traced/probes/ftrace/proto_translation_table.h"

#include "protos/perfetto/trace/ftrace/ftrace_event.pbzero.h"
#include "protos/perfetto/trace/ftrace/ftrace_event_bundle.pbzero.h"
#include "protos/perfetto/trace/ftrace/generic.pbzero.h"
#include "protos/perfetto/trace/interned_data/interned_data.pbzero.h"
#include "protos/perfetto/trace/profiling/profile_common.pbzero.h"
#include "protos/perfetto/trace/trace_packet.pbzero.h"

namespace perfetto {
namespace {

// If the compact_sched buffer accumulates more unique strings than this
// threshold, the reader will flush it to reset the interning state (and make
// it cheap again). This is not an exact cap, since we check only at tracing
// page boundaries.
constexpr size_t kCompactSchedInternerThreshold = 64;

// For further documentation of these constants see the kernel source:
//   linux/include/linux/ring_buffer.h
// Some of this is also available to userspace at runtime via:
//   /sys/kernel/tracing/events/header_event
constexpr uint32_t kTypeDataTypeLengthMax = 28;
constexpr uint32_t kTypePadding = 29;
constexpr uint32_t kTypeTimeExtend = 30;
constexpr uint32_t kTypeTimeStamp = 31;
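// Record types in [1, kTypeDataTypeLengthMax] directly encode the length of
// the data record in 4-byte words, and 0 means the 32-bit length is stored
// inline at the start of the payload; see the data-record handling in
// ParsePagePayload below.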

struct EventHeader {
  // bottom 5 bits
  uint32_t type_or_length : 5;
  // top 27 bits
  uint32_t time_delta : 27;
};
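
// A worked example of the split above (illustrative value, assuming the
// little-endian bitfield layout this struct relies on): a header word of
// 0x000000c3 gives type_or_length = 0xc3 & 0x1f = 3 (a 3 * 4 = 12 byte data
// record) and time_delta = 0xc3 >> 5 = 6 ticks since the previous event.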

// Reads a string from `start` until the first '\0' byte or until fixed_len
// characters have been read. Appends it to `*out` as field `field_id`.
void ReadIntoString(const uint8_t* start,
                    size_t fixed_len,
                    uint32_t field_id,
                    protozero::Message* out) {
  size_t len = strnlen(reinterpret_cast<const char*>(start), fixed_len);
  out->AppendBytes(field_id, reinterpret_cast<const char*>(start), len);
}

bool ReadDataLoc(const uint8_t* start,
                 const uint8_t* field_start,
                 const uint8_t* end,
                 const Field& field,
                 protozero::Message* message) {
  PERFETTO_DCHECK(field.ftrace_size == 4);
  // See kernel header include/trace/trace_events.h
  uint32_t data = 0;
  const uint8_t* ptr = field_start;
  if (!CpuReader::ReadAndAdvance(&ptr, end, &data)) {
    PERFETTO_DFATAL("couldn't read __data_loc value");
    return false;
  }

  const uint16_t offset = data & 0xffff;
  const uint16_t len = (data >> 16) & 0xffff;
  const uint8_t* const string_start = start + offset;

  if (PERFETTO_UNLIKELY(len == 0))
    return true;
  if (PERFETTO_UNLIKELY(string_start < start || string_start + len > end)) {
    PERFETTO_DFATAL("__data_loc points at invalid location");
    return false;
  }
  ReadIntoString(string_start, len, field.proto_field_id, message);
  return true;
}
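
// A worked example of the __data_loc encoding consumed above (illustrative
// value): a 32-bit word of 0x000c0028 encodes (len << 16) | offset, i.e. a
// 0xc (12) byte string stored at offset 0x28 (40 bytes) from the start of
// the event.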

template <typename T>
T ReadValue(const uint8_t* ptr) {
  T t;
  memcpy(&t, reinterpret_cast<const void*>(ptr), sizeof(T));
  return t;
}

// Reads a signed ftrace value as an int64_t, sign extending if necessary.
static int64_t ReadSignedFtraceValue(const uint8_t* ptr,
                                     FtraceFieldType ftrace_type) {
  if (ftrace_type == kFtraceInt32) {
    int32_t value;
    memcpy(&value, reinterpret_cast<const void*>(ptr), sizeof(value));
    return int64_t(value);
  }
  if (ftrace_type == kFtraceInt64) {
    int64_t value;
    memcpy(&value, reinterpret_cast<const void*>(ptr), sizeof(value));
    return value;
  }
  PERFETTO_FATAL("unexpected ftrace type");
}

bool SetBlocking(int fd, bool is_blocking) {
  int flags = fcntl(fd, F_GETFL, 0);
  flags = (is_blocking) ? (flags & ~O_NONBLOCK) : (flags | O_NONBLOCK);
  return fcntl(fd, F_SETFL, flags) == 0;
}

bool ZeroPaddedPageTail(const uint8_t* start, const uint8_t* end) {
  for (auto p = start; p < end; p++) {
    if (*p != 0)
      return false;
  }
  return true;
}

void LogInvalidPage(const void* start, size_t size) {
  PERFETTO_ELOG("Invalid ftrace page");
  std::string hexdump = base::HexDump(start, size);
  // Only a single line per log message, because log message size might be
  // limited.
  for (base::StringSplitter ss(std::move(hexdump), '\n'); ss.Next();) {
    PERFETTO_ELOG("%s", ss.cur_token());
  }
}

}  // namespace

using protos::pbzero::GenericFtraceEvent;

CpuReader::CpuReader(size_t cpu,
                     const ProtoTranslationTable* table,
                     LazyKernelSymbolizer* symbolizer,
                     const FtraceClockSnapshot* ftrace_clock_snapshot,
                     base::ScopedFile trace_fd)
    : cpu_(cpu),
      table_(table),
      symbolizer_(symbolizer),
      ftrace_clock_snapshot_(ftrace_clock_snapshot),
      trace_fd_(std::move(trace_fd)) {
  PERFETTO_CHECK(trace_fd_);
  PERFETTO_CHECK(SetBlocking(*trace_fd_, false));
}

CpuReader::~CpuReader() = default;

size_t CpuReader::ReadCycle(
    uint8_t* parsing_buf,
    size_t parsing_buf_size_pages,
    size_t max_pages,
    const std::set<FtraceDataSource*>& started_data_sources) {
  PERFETTO_DCHECK(max_pages > 0 && parsing_buf_size_pages > 0);
  metatrace::ScopedEvent evt(metatrace::TAG_FTRACE,
                             metatrace::FTRACE_CPU_READ_CYCLE);

  // Work in batches to keep cache locality, and limit memory usage.
  size_t total_pages_read = 0;
  for (bool is_first_batch = true;; is_first_batch = false) {
    size_t batch_pages =
        std::min(parsing_buf_size_pages, max_pages - total_pages_read);
    size_t pages_read = ReadAndProcessBatch(
        parsing_buf, batch_pages, is_first_batch, started_data_sources);

    PERFETTO_DCHECK(pages_read <= batch_pages);
    total_pages_read += pages_read;

    // Check whether we've caught up to the writer, or whether we're giving up
    // on this attempt due to some error.
    if (pages_read != batch_pages)
      break;
    // Check if we've hit the limit of work for this cycle.
    if (total_pages_read >= max_pages)
      break;
  }
  PERFETTO_METATRACE_COUNTER(TAG_FTRACE, FTRACE_PAGES_DRAINED,
                             total_pages_read);
  return total_pages_read;
}

// metatrace note: mark the reading phase as FTRACE_CPU_READ_BATCH, but let the
// parsing time be implied (by the difference between the caller's span, and
// this reading span). Makes it easier to estimate the read/parse ratio when
// looking at the trace in the UI.
size_t CpuReader::ReadAndProcessBatch(
    uint8_t* parsing_buf,
    size_t max_pages,
    bool first_batch_in_cycle,
    const std::set<FtraceDataSource*>& started_data_sources) {
  size_t pages_read = 0;
  {
    metatrace::ScopedEvent evt(metatrace::TAG_FTRACE,
                               metatrace::FTRACE_CPU_READ_BATCH);
    for (; pages_read < max_pages;) {
      uint8_t* curr_page = parsing_buf + (pages_read * base::kPageSize);
      ssize_t res =
          PERFETTO_EINTR(read(*trace_fd_, curr_page, base::kPageSize));
      if (res < 0) {
        // Expected errors:
        // EAGAIN: no data (since we're in non-blocking mode).
        // ENOMEM, EBUSY: temporary ftrace failures (they happen).
        // ENODEV: the cpu is offline (b/145583318).
        if (errno != EAGAIN && errno != ENOMEM && errno != EBUSY &&
            errno != ENODEV) {
          PERFETTO_PLOG("Unexpected error on raw ftrace read");
        }
        break;  // stop reading regardless of errno
      }

      // As long as all of our reads are for a single page, the kernel should
      // return exactly a well-formed raw ftrace page (if not in the steady
      // state of reading out fully-written pages, the kernel will construct
      // pages as necessary, copying over events and zero-filling at the end).
      // A sub-page read() is therefore not expected in practice (unless
      // there's a concurrent reader requesting less than a page?). Crash if
      // encountering this situation. Kernel source pointer: see usage of
      // |info->read| within |tracing_buffers_read|.
      if (res == 0) {
        // Very rare, but possible. Stop for now, should recover.
        PERFETTO_DLOG("[cpu%zu]: 0-sized read from ftrace pipe.", cpu_);
        break;
      }
      PERFETTO_CHECK(res == static_cast<ssize_t>(base::kPageSize));

      pages_read += 1;

      // Compare the amount of ftrace data read against an empirical threshold
      // to make an educated guess on whether we should read more. To figure
      // out the amount of ftrace data, we need to parse the page header (since
      // the read always returns a page, zero-filled at the end). If we read
      // fewer bytes than the threshold, it means that we caught up with the
      // write pointer and we started consuming ftrace events in real-time.
      // This cannot be just 4096 because it needs to account for
      // fragmentation, i.e. for the fact that the last trace event didn't fit
      // in the current page and hence the current page was terminated
      // prematurely.
      static constexpr size_t kRoughlyAPage = base::kPageSize - 512;
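      // E.g. (assuming the typical 4 KiB tracing page) the threshold is
      // 4096 - 512 = 3584 bytes of payload; a smaller page suggests we've
      // caught up with the writer, modulo the first read of a cycle, which is
      // handled below.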
      const uint8_t* scratch_ptr = curr_page;
      std::optional<PageHeader> hdr =
          ParsePageHeader(&scratch_ptr, table_->page_header_size_len());
      PERFETTO_DCHECK(hdr && hdr->size > 0 && hdr->size <= base::kPageSize);
      if (!hdr.has_value()) {
        PERFETTO_ELOG("[cpu%zu]: can't parse page header", cpu_);
        break;
      }
      // Note that it is normal for the first read after starting a read cycle
      // to be small: it means that we're given the remainder of events from a
      // page that we partially consumed during the last read of the previous
      // cycle (having caught up to the writer).
      if (hdr->size < kRoughlyAPage &&
          !(first_batch_in_cycle && pages_read == 1)) {
        break;
      }
    }
  }  // end of metatrace::FTRACE_CPU_READ_BATCH

  // Parse the pages and write to the trace for all relevant data
  // sources.
  if (pages_read == 0)
    return pages_read;

  for (FtraceDataSource* data_source : started_data_sources) {
    size_t pages_parsed_ok = ProcessPagesForDataSource(
        data_source->trace_writer(), data_source->mutable_metadata(), cpu_,
        data_source->parsing_config(), parsing_buf, pages_read, table_,
        symbolizer_, ftrace_clock_snapshot_, ftrace_clock_);
    // If this happens, it means that we did not know how to parse the kernel
    // binary format. This is a bug in either perfetto or the kernel, and must
    // be investigated. Hence we abort instead of recording a bit in the ftrace
    // stats proto, which is easier to overlook.
    if (pages_parsed_ok != pages_read) {
      const size_t first_bad_page_idx = pages_parsed_ok;
      const uint8_t* curr_page =
          parsing_buf + (first_bad_page_idx * base::kPageSize);
      LogInvalidPage(curr_page, base::kPageSize);
      PERFETTO_FATAL("Failed to parse ftrace page");
    }
  }

  return pages_read;
}

void CpuReader::Bundler::StartNewPacket(bool lost_events) {
  FinalizeAndRunSymbolizer();
  packet_ = trace_writer_->NewTracePacket();
  bundle_ = packet_->set_ftrace_events();
  if (ftrace_clock_) {
    bundle_->set_ftrace_clock(ftrace_clock_);

    if (ftrace_clock_snapshot_ && ftrace_clock_snapshot_->ftrace_clock_ts) {
      bundle_->set_ftrace_timestamp(ftrace_clock_snapshot_->ftrace_clock_ts);
      bundle_->set_boot_timestamp(ftrace_clock_snapshot_->boot_clock_ts);
    }
  }

  bundle_->set_cpu(static_cast<uint32_t>(cpu_));
  if (lost_events) {
    bundle_->set_lost_events(true);
  }
}

void CpuReader::Bundler::FinalizeAndRunSymbolizer() {
  if (!packet_) {
    return;
  }

  if (compact_sched_enabled_) {
    compact_sched_buffer_.WriteAndReset(bundle_);
  }

  bundle_->Finalize();
  bundle_ = nullptr;
  // Write the kernel symbol index (mangled address) -> name table.
  // |metadata| is shared across all cpus, is distinct per |data_source| (i.e.
  // tracing session) and is cleared after each FtraceController::ReadTick().
  if (symbolizer_) {
    // Symbol indexes are assigned monotonically as |kernel_addrs.size()|,
    // starting from index 1 (no symbol has index 0). Here we remember the
    // size() (which is also == the highest index in |kernel_addrs|) at the
    // beginning and only write newer indexes bigger than that.
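    // E.g. (illustrative): if a previous bundle in this read tick already
    // interned symbols with indexes 1..5, then max_index_at_start == 5 and
    // this pass emits only entries with index >= 6.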
    uint32_t max_index_at_start = metadata_->last_kernel_addr_index_written;
    PERFETTO_DCHECK(max_index_at_start <= metadata_->kernel_addrs.size());
    protos::pbzero::InternedData* interned_data = nullptr;
    auto* ksyms_map = symbolizer_->GetOrCreateKernelSymbolMap();
    bool wrote_at_least_one_symbol = false;
    for (const FtraceMetadata::KernelAddr& kaddr : metadata_->kernel_addrs) {
      if (kaddr.index <= max_index_at_start)
        continue;
      std::string sym_name = ksyms_map->Lookup(kaddr.addr);
      if (sym_name.empty()) {
        // Lookup failed. This can genuinely happen on many occasions. E.g.,
        // workqueue_execute_start has two pointers: one is a pointer to a
        // function (which we expect to be symbolized), the other (|work|) is
        // a pointer to a heap struct, which is unsymbolizable, even when
        // using the textual ftrace endpoint.
        continue;
      }

      if (!interned_data) {
        // If this is the very first write, clear the start of the sequence
        // so the trace processor knows that all previous indexes can be
        // discarded and that the mapping is restarting.
        // In most cases this occurs with cpu==0. But if cpu0 is idle, this
        // will happen with the first CPU that has any ftrace data.
        if (max_index_at_start == 0) {
          packet_->set_sequence_flags(
              protos::pbzero::TracePacket::SEQ_INCREMENTAL_STATE_CLEARED);
        }
        interned_data = packet_->set_interned_data();
      }
      auto* interned_sym = interned_data->add_kernel_symbols();
      interned_sym->set_iid(kaddr.index);
      interned_sym->set_str(sym_name);
      wrote_at_least_one_symbol = true;
    }

    auto max_it_at_end = static_cast<uint32_t>(metadata_->kernel_addrs.size());

    // Rationale for the if (wrote_at_least_one_symbol) check: in rare cases,
    // all symbols seen in a ProcessPagesForDataSource() call can fail the
    // ksyms_map->Lookup(). If that happens we don't want to bump the
    // last_kernel_addr_index_written watermark, as that would cause the next
    // call to NOT emit the SEQ_INCREMENTAL_STATE_CLEARED.
    if (wrote_at_least_one_symbol) {
      metadata_->last_kernel_addr_index_written = max_it_at_end;
    }
  }
  packet_ = TraceWriter::TracePacketHandle(nullptr);
}

// static
size_t CpuReader::ProcessPagesForDataSource(
    TraceWriter* trace_writer,
    FtraceMetadata* metadata,
    size_t cpu,
    const FtraceDataSourceConfig* ds_config,
    const uint8_t* parsing_buf,
    const size_t pages_read,
    const ProtoTranslationTable* table,
    LazyKernelSymbolizer* symbolizer,
    const FtraceClockSnapshot* ftrace_clock_snapshot,
    protos::pbzero::FtraceClock ftrace_clock) {
  Bundler bundler(trace_writer, metadata,
                  ds_config->symbolize_ksyms ? symbolizer : nullptr, cpu,
                  ftrace_clock_snapshot, ftrace_clock,
                  ds_config->compact_sched.enabled);

  size_t pages_parsed = 0;
  bool compact_sched_enabled = ds_config->compact_sched.enabled;
  for (; pages_parsed < pages_read; pages_parsed++) {
    const uint8_t* curr_page = parsing_buf + (pages_parsed * base::kPageSize);
    const uint8_t* curr_page_end = curr_page + base::kPageSize;
    const uint8_t* parse_pos = curr_page;
    std::optional<PageHeader> page_header =
        ParsePageHeader(&parse_pos, table->page_header_size_len());

    if (!page_header.has_value() || page_header->size == 0 ||
        parse_pos >= curr_page_end ||
        parse_pos + page_header->size > curr_page_end) {
      break;
    }

    // Start a new bundle if either:
    // * The page we're about to read indicates that there was a kernel ring
    //   buffer overrun since our last read from that per-cpu buffer. We have
    //   a single |lost_events| field per bundle, so start a new packet.
    // * The compact_sched buffer is holding more unique interned strings than
    //   a threshold. We need to flush the compact buffer to make the
    //   interning lookups cheap again.
    bool interner_past_threshold =
        compact_sched_enabled &&
        bundler.compact_sched_buffer()->interner().interned_comms_size() >
            kCompactSchedInternerThreshold;

    if (page_header->lost_events || interner_past_threshold) {
      bundler.StartNewPacket(page_header->lost_events);
    }

    size_t evt_size = ParsePagePayload(parse_pos, &page_header.value(), table,
                                       ds_config, &bundler, metadata);

    if (evt_size != page_header->size) {
      break;
    }
  }
  // |bundler|'s FinalizeAndRunSymbolizer() will run as part of its destructor.

  return pages_parsed;
}

// A page header consists of:
// * timestamp: 8 bytes
// * commit: 8 bytes on 64 bit, 4 bytes on 32 bit kernels
//
// The kernel reports this at /sys/kernel/debug/tracing/events/header_page.
//
// |commit|'s bottom bits represent the length of the payload following this
// header. The top bits have been repurposed as a bitset of flags pertaining to
// data loss. We look only at the "there has been some data lost" flag
// (RB_MISSED_EVENTS), and ignore the relatively tricky "appended the precise
// lost events count past the end of the valid data, as there was room to do so"
// flag (RB_MISSED_STORED).
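//
// Worked example (illustrative): if the low 32 bits of |commit| read as
// 0x80000ff0 on a little-endian kernel, the payload size is
// 0x80000ff0 & ((1 << 27) - 1) = 0xff0 (4080) bytes, and the RB_MISSED_EVENTS
// bit (bit 31) is set, i.e. events were lost before this page was written.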
//
// static
std::optional<CpuReader::PageHeader> CpuReader::ParsePageHeader(
    const uint8_t** ptr,
    uint16_t page_header_size_len) {
  // Mask for the data length portion of the |commit| field. Note that the
  // kernel implementation never explicitly defines the boundary (beyond using
  // bits 30 and 31 as flags), but 27 bits are mentioned as sufficient in the
  // original commit message, and that is the constant used by trace-cmd.
  constexpr static uint64_t kDataSizeMask = (1ull << 27) - 1;
  // If set, indicates that the relevant cpu has lost events since the last
  // read (the bit is cleared internally by the read).
  constexpr static uint64_t kMissedEventsFlag = (1ull << 31);

  const uint8_t* end_of_page = *ptr + base::kPageSize;
  PageHeader page_header;
  if (!CpuReader::ReadAndAdvance<uint64_t>(ptr, end_of_page,
                                           &page_header.timestamp))
    return std::nullopt;

  uint32_t size_and_flags;

  // On little endian, we can just read a uint32_t and reject the rest of the
  // number later.
  if (!CpuReader::ReadAndAdvance<uint32_t>(
          ptr, end_of_page, base::AssumeLittleEndian(&size_and_flags)))
    return std::nullopt;

  page_header.size = size_and_flags & kDataSizeMask;
  page_header.lost_events = bool(size_and_flags & kMissedEventsFlag);
  PERFETTO_DCHECK(page_header.size <= base::kPageSize);

  // Reject the rest of the number, if applicable. On 32-bit,
  // page_header_size_len - 4 will evaluate to 0 and this will be a no-op. On
  // 64-bit, this will advance the pointer by 4 bytes.
  PERFETTO_DCHECK(page_header_size_len >= 4);
  *ptr += page_header_size_len - 4;

  return std::make_optional(page_header);
}

// A raw ftrace buffer page consists of a header followed by a sequence of
// binary ftrace events. See |ParsePageHeader| for the format of the former.
//
// This method is deliberately static so it can be tested independently.
size_t CpuReader::ParsePagePayload(const uint8_t* start_of_payload,
                                   const PageHeader* page_header,
                                   const ProtoTranslationTable* table,
                                   const FtraceDataSourceConfig* ds_config,
                                   Bundler* bundler,
                                   FtraceMetadata* metadata) {
  const uint8_t* ptr = start_of_payload;
  const uint8_t* const end = ptr + page_header->size;

  uint64_t timestamp = page_header->timestamp;

  while (ptr < end) {
    EventHeader event_header;
    if (!ReadAndAdvance(&ptr, end, &event_header))
      return 0;

    timestamp += event_header.time_delta;

    switch (event_header.type_or_length) {
      case kTypePadding: {
        // Left over page padding or discarded event.
        if (event_header.time_delta == 0) {
          // Not clear what the correct behaviour is in this case.
          PERFETTO_DFATAL("Empty padding event.");
          return 0;
        }
        uint32_t length = 0;
        if (!ReadAndAdvance<uint32_t>(&ptr, end, &length))
          return 0;
        // length includes itself (4 bytes)
        if (length < 4)
          return 0;
        ptr += length - 4;
        break;
      }
      case kTypeTimeExtend: {
        // Extend the time delta.
        uint32_t time_delta_ext = 0;
        if (!ReadAndAdvance<uint32_t>(&ptr, end, &time_delta_ext))
          return 0;
        timestamp += (static_cast<uint64_t>(time_delta_ext)) << 27;
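        // Note: combined with the 27 low bits already added from the event
        // header above, the effective delta is
        // event_header.time_delta + (time_delta_ext << 27). E.g.
        // (illustrative) a header delta of 5 with an extension word of 2
        // advances the timestamp by 5 + (2 << 27) ticks.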
        break;
      }
      case kTypeTimeStamp: {
        // Absolute timestamp. This was historically partially implemented, but
        // not written. Kernels 4.17+ reimplemented this record, changing its
        // size in the process. We assume the newer layout. It is parsed the
        // same way as kTypeTimeExtend, except that the value is interpreted as
        // an absolute timestamp instead of a delta on top of the previous
        // state.
        uint32_t time_delta_ext = 0;
        if (!ReadAndAdvance<uint32_t>(&ptr, end, &time_delta_ext))
          return 0;
        timestamp = event_header.time_delta +
                    (static_cast<uint64_t>(time_delta_ext) << 27);
        break;
      }
      // Data record:
      default: {
        PERFETTO_CHECK(event_header.type_or_length <= kTypeDataTypeLengthMax);
        // type_or_length is <= 28, so it represents the length of a data
        // record in 4-byte words. If == 0, this is an extended record and the
        // size of the record is stored in the first uint32_t word of the
        // payload. See the kernel's include/linux/ring_buffer.h.
        uint32_t event_size = 0;
        if (event_header.type_or_length == 0) {
          if (!ReadAndAdvance<uint32_t>(&ptr, end, &event_size)) {
            return 0;
          }
          // Size includes the size field itself. Special case for invalid
          // tracing pages seen on select Android 4.19 kernels: the page header
          // says there's still valid data, but the rest of the page is full of
          // zeroes (which would not decode to a valid event). We pretend that
          // such pages have been fully parsed. b/204564312.
          if (PERFETTO_UNLIKELY(event_size == 0 &&
                                event_header.time_delta == 0 &&
                                ZeroPaddedPageTail(ptr, end))) {
            return static_cast<size_t>(page_header->size);
          } else if (PERFETTO_UNLIKELY(event_size < 4)) {
            return 0;
          }
          event_size -= 4;
        } else {
          event_size = 4 * event_header.type_or_length;
        }
        const uint8_t* start = ptr;
        const uint8_t* next = ptr + event_size;

        if (next > end)
          return 0;

        uint16_t ftrace_event_id;
        if (!ReadAndAdvance<uint16_t>(&ptr, end, &ftrace_event_id))
          return 0;

        if (ds_config->event_filter.IsEventEnabled(ftrace_event_id)) {
          // Special-cased handling of some scheduler events when compact
          // format is enabled.
          bool compact_sched_enabled = ds_config->compact_sched.enabled;
          const CompactSchedSwitchFormat& sched_switch_format =
              table->compact_sched_format().sched_switch;
          const CompactSchedWakingFormat& sched_waking_format =
              table->compact_sched_format().sched_waking;

          bool ftrace_print_filter_enabled =
              ds_config->print_filter.has_value();

          // compact sched_switch
          if (compact_sched_enabled &&
              ftrace_event_id == sched_switch_format.event_id) {
            if (event_size < sched_switch_format.size)
              return 0;

            ParseSchedSwitchCompact(start, timestamp, &sched_switch_format,
                                    bundler->compact_sched_buffer(), metadata);

            // compact sched_waking
          } else if (compact_sched_enabled &&
                     ftrace_event_id == sched_waking_format.event_id) {
            if (event_size < sched_waking_format.size)
              return 0;

            ParseSchedWakingCompact(start, timestamp, &sched_waking_format,
                                    bundler->compact_sched_buffer(), metadata);

          } else if (ftrace_print_filter_enabled &&
                     ftrace_event_id == ds_config->print_filter->event_id()) {
            if (ds_config->print_filter->IsEventInteresting(start, next)) {
              protos::pbzero::FtraceEvent* event =
                  bundler->GetOrCreateBundle()->add_event();
              event->set_timestamp(timestamp);
              if (!ParseEvent(ftrace_event_id, start, next, table, ds_config,
                              event, metadata))
                return 0;
            }
          } else {
            // Common case: parse all other types of enabled events.
            protos::pbzero::FtraceEvent* event =
                bundler->GetOrCreateBundle()->add_event();
            event->set_timestamp(timestamp);
            if (!ParseEvent(ftrace_event_id, start, next, table, ds_config,
                            event, metadata))
              return 0;
          }
        }

        // Jump to next event.
        ptr = next;
      }
    }
  }
  return static_cast<size_t>(ptr - start_of_payload);
}

// |start| is the start of the current event.
// |end| is the end of the buffer.
bool CpuReader::ParseEvent(uint16_t ftrace_event_id,
                           const uint8_t* start,
                           const uint8_t* end,
                           const ProtoTranslationTable* table,
                           const FtraceDataSourceConfig* ds_config,
                           protozero::Message* message,
                           FtraceMetadata* metadata) {
  PERFETTO_DCHECK(start < end);
  const size_t length = static_cast<size_t>(end - start);

  // TODO(hjd): Rework to work even if the event is unknown.
  const Event& info = *table->GetEventById(ftrace_event_id);

  // TODO(hjd): Test truncated events.
  // If the end of the buffer is before the end of the event, give up.
  if (info.size > length) {
    PERFETTO_DFATAL("Buffer overflowed.");
    return false;
  }

  bool success = true;
  const Field* common_pid_field = table->common_pid();
  if (PERFETTO_LIKELY(common_pid_field))
    success &=
        ParseField(*common_pid_field, start, end, table, message, metadata);

  protozero::Message* nested =
      message->BeginNestedMessage<protozero::Message>(info.proto_field_id);

  // Parse generic event.
  if (PERFETTO_UNLIKELY(info.proto_field_id ==
                        protos::pbzero::FtraceEvent::kGenericFieldNumber)) {
    nested->AppendString(GenericFtraceEvent::kEventNameFieldNumber, info.name);
    for (const Field& field : info.fields) {
      auto generic_field = nested->BeginNestedMessage<protozero::Message>(
          GenericFtraceEvent::kFieldFieldNumber);
      generic_field->AppendString(GenericFtraceEvent::Field::kNameFieldNumber,
                                  field.ftrace_name);
      success &= ParseField(field, start, end, table, generic_field, metadata);
    }
  } else if (PERFETTO_UNLIKELY(
                 info.proto_field_id ==
                 protos::pbzero::FtraceEvent::kSysEnterFieldNumber)) {
    success &= ParseSysEnter(info, start, end, nested, metadata);
  } else if (PERFETTO_UNLIKELY(
                 info.proto_field_id ==
                 protos::pbzero::FtraceEvent::kSysExitFieldNumber)) {
    success &= ParseSysExit(info, start, end, ds_config, nested, metadata);
  } else {  // Parse all other events.
    for (const Field& field : info.fields) {
      success &= ParseField(field, start, end, table, nested, metadata);
    }
  }

  if (PERFETTO_UNLIKELY(info.proto_field_id ==
                        protos::pbzero::FtraceEvent::kTaskRenameFieldNumber)) {
    // For task renames, we want to store that the pid was renamed. We use the
    // common pid to reduce code complexity as in all the cases we care about,
    // the common pid is the same as the renamed pid (the pid inside the
    // event).
    PERFETTO_DCHECK(metadata->last_seen_common_pid);
    metadata->AddRenamePid(metadata->last_seen_common_pid);
  }

  // This finalizes |nested| and |proto_field| automatically.
  message->Finalize();
  metadata->FinishEvent();
  return success;
}

// The caller must guarantee that the field fits in the range, explicitly:
//   start + field.ftrace_offset + field.ftrace_size <= end
// The only exception is fields with strategy = kCStringToString, where the
// total size isn't known up front. In that case ParseField will check that
// the string terminates within the bounds and won't read past |end|.
bool CpuReader::ParseField(const Field& field,
                           const uint8_t* start,
                           const uint8_t* end,
                           const ProtoTranslationTable* table,
                           protozero::Message* message,
                           FtraceMetadata* metadata) {
  PERFETTO_DCHECK(start + field.ftrace_offset + field.ftrace_size <= end);
  const uint8_t* field_start = start + field.ftrace_offset;
  uint32_t field_id = field.proto_field_id;

  switch (field.strategy) {
    case kUint8ToUint32:
    case kUint8ToUint64:
      ReadIntoVarInt<uint8_t>(field_start, field_id, message);
      return true;
    case kUint16ToUint32:
    case kUint16ToUint64:
      ReadIntoVarInt<uint16_t>(field_start, field_id, message);
      return true;
    case kUint32ToUint32:
    case kUint32ToUint64:
      ReadIntoVarInt<uint32_t>(field_start, field_id, message);
      return true;
    case kUint64ToUint64:
      ReadIntoVarInt<uint64_t>(field_start, field_id, message);
      return true;
    case kInt8ToInt32:
    case kInt8ToInt64:
      ReadIntoVarInt<int8_t>(field_start, field_id, message);
      return true;
    case kInt16ToInt32:
    case kInt16ToInt64:
      ReadIntoVarInt<int16_t>(field_start, field_id, message);
      return true;
    case kInt32ToInt32:
    case kInt32ToInt64:
      ReadIntoVarInt<int32_t>(field_start, field_id, message);
      return true;
    case kInt64ToInt64:
      ReadIntoVarInt<int64_t>(field_start, field_id, message);
      return true;
    case kFixedCStringToString:
      // TODO(hjd): Kernel-dive to check how size:0 char fields work.
      ReadIntoString(field_start, field.ftrace_size, field_id, message);
      return true;
    case kCStringToString:
      // TODO(hjd): Kernel-dive to check how size:0 char fields work.
      ReadIntoString(field_start, static_cast<size_t>(end - field_start),
                     field_id, message);
      return true;
    case kStringPtrToString: {
      uint64_t n = 0;
      // The ftrace field may be 8 or 4 bytes and we need to copy it into the
      // bottom of n. In the unlikely case where the field is >8 bytes we
      // should avoid making things worse by corrupting the stack, but we
      // don't need to handle it correctly.
      size_t size = std::min<size_t>(field.ftrace_size, sizeof(n));
      memcpy(base::AssumeLittleEndian(&n),
             reinterpret_cast<const void*>(field_start), size);
      // Look up the address in the printk format map and write it into the
      // proto.
      base::StringView name = table->LookupTraceString(n);
      message->AppendBytes(field_id, name.begin(), name.size());
      return true;
    }
    case kDataLocToString:
      return ReadDataLoc(start, field_start, end, field, message);
    case kBoolToUint32:
    case kBoolToUint64:
      ReadIntoVarInt<uint8_t>(field_start, field_id, message);
      return true;
    case kInode32ToUint64:
      ReadInode<uint32_t>(field_start, field_id, message, metadata);
      return true;
    case kInode64ToUint64:
      ReadInode<uint64_t>(field_start, field_id, message, metadata);
      return true;
    case kPid32ToInt32:
    case kPid32ToInt64:
      ReadPid(field_start, field_id, message, metadata);
      return true;
    case kCommonPid32ToInt32:
    case kCommonPid32ToInt64:
      ReadCommonPid(field_start, field_id, message, metadata);
      return true;
    case kDevId32ToUint64:
      ReadDevId<uint32_t>(field_start, field_id, message, metadata);
      return true;
    case kDevId64ToUint64:
      ReadDevId<uint64_t>(field_start, field_id, message, metadata);
      return true;
    case kFtraceSymAddr64ToUint64:
      ReadSymbolAddr<uint64_t>(field_start, field_id, message, metadata);
      return true;
    case kInvalidTranslationStrategy:
      break;
  }
  PERFETTO_FATAL("Unexpected translation strategy");
}

bool CpuReader::ParseSysEnter(const Event& info,
                              const uint8_t* start,
                              const uint8_t* end,
                              protozero::Message* message,
                              FtraceMetadata* /* metadata */) {
  if (info.fields.size() != 2) {
    PERFETTO_DLOG("Unexpected number of fields for sys_enter");
    return false;
  }
  const auto& id_field = info.fields[0];
  const auto& args_field = info.fields[1];
  if (start + id_field.ftrace_size + args_field.ftrace_size > end) {
    return false;
  }
  // field:long id;
  if (id_field.ftrace_type != kFtraceInt32 &&
      id_field.ftrace_type != kFtraceInt64) {
    return false;
  }
  const int64_t syscall_id = ReadSignedFtraceValue(
      start + id_field.ftrace_offset, id_field.ftrace_type);
  message->AppendVarInt(id_field.proto_field_id, syscall_id);
  // field:unsigned long args[6];
  // proto_translation_table will only allow an exactly 6-element array, so we
  // can make the same hard assumption here.
  constexpr uint16_t arg_count = 6;
  size_t element_size = 0;
  if (args_field.ftrace_type == kFtraceUint32) {
    element_size = 4u;
  } else if (args_field.ftrace_type == kFtraceUint64) {
    element_size = 8u;
  } else {
    return false;
  }
  for (uint16_t i = 0; i < arg_count; ++i) {
    const uint8_t* element_ptr =
        start + args_field.ftrace_offset + i * element_size;
    uint64_t arg_value = 0;
    if (element_size == 8) {
      arg_value = ReadValue<uint64_t>(element_ptr);
    } else {
      arg_value = ReadValue<uint32_t>(element_ptr);
    }
    message->AppendVarInt(args_field.proto_field_id, arg_value);
  }
  return true;
}

bool CpuReader::ParseSysExit(const Event& info,
                             const uint8_t* start,
                             const uint8_t* end,
                             const FtraceDataSourceConfig* ds_config,
                             protozero::Message* message,
                             FtraceMetadata* metadata) {
  if (info.fields.size() != 2) {
    PERFETTO_DLOG("Unexpected number of fields for sys_exit");
    return false;
  }
  const auto& id_field = info.fields[0];
  const auto& ret_field = info.fields[1];
  if (start + id_field.ftrace_size + ret_field.ftrace_size > end) {
    return false;
  }
  // field:long id;
  if (id_field.ftrace_type != kFtraceInt32 &&
      id_field.ftrace_type != kFtraceInt64) {
    return false;
  }
  const int64_t syscall_id = ReadSignedFtraceValue(
      start + id_field.ftrace_offset, id_field.ftrace_type);
  message->AppendVarInt(id_field.proto_field_id, syscall_id);
  // field:long ret;
  if (ret_field.ftrace_type != kFtraceInt32 &&
      ret_field.ftrace_type != kFtraceInt64) {
    return false;
  }
  const int64_t syscall_ret = ReadSignedFtraceValue(
      start + ret_field.ftrace_offset, ret_field.ftrace_type);
  message->AppendVarInt(ret_field.proto_field_id, syscall_ret);
  // For any syscall that returns a new file descriptor, mark the fd as a
  // potential candidate for scraping, provided that the call succeeded and
  // the value is within fd bounds.
  if (ds_config->syscalls_returning_fd.count(syscall_id) && syscall_ret >= 0 &&
      syscall_ret <= std::numeric_limits<int>::max()) {
    const auto pid = metadata->last_seen_common_pid;
    const auto syscall_ret_u = static_cast<uint64_t>(syscall_ret);
    metadata->fds.insert(std::make_pair(pid, syscall_ret_u));
  }
  return true;
}

// Parses a sched_switch event according to the pre-validated format, and
// buffers the individual fields in the current compact batch. See the code
// populating |CompactSchedSwitchFormat| for the assumptions made around the
// format, which this code is closely tied to.
// static
void CpuReader::ParseSchedSwitchCompact(const uint8_t* start,
                                        uint64_t timestamp,
                                        const CompactSchedSwitchFormat* format,
                                        CompactSchedBuffer* compact_buf,
                                        FtraceMetadata* metadata) {
  compact_buf->sched_switch().AppendTimestamp(timestamp);

  int32_t next_pid = ReadValue<int32_t>(start + format->next_pid_offset);
  compact_buf->sched_switch().next_pid().Append(next_pid);
  metadata->AddPid(next_pid);

  int32_t next_prio = ReadValue<int32_t>(start + format->next_prio_offset);
  compact_buf->sched_switch().next_prio().Append(next_prio);

  // Varint encoding of int32 and int64 is the same, so treat the value as
  // int64 after reading.
  int64_t prev_state = ReadSignedFtraceValue(start + format->prev_state_offset,
                                             format->prev_state_type);
  compact_buf->sched_switch().prev_state().Append(prev_state);

  // next_comm
  const char* comm_ptr =
      reinterpret_cast<const char*>(start + format->next_comm_offset);
  size_t iid = compact_buf->interner().InternComm(comm_ptr);
  compact_buf->sched_switch().next_comm_index().Append(iid);
}

// static
void CpuReader::ParseSchedWakingCompact(const uint8_t* start,
                                        uint64_t timestamp,
                                        const CompactSchedWakingFormat* format,
                                        CompactSchedBuffer* compact_buf,
                                        FtraceMetadata* metadata) {
  compact_buf->sched_waking().AppendTimestamp(timestamp);

  int32_t pid = ReadValue<int32_t>(start + format->pid_offset);
  compact_buf->sched_waking().pid().Append(pid);
  metadata->AddPid(pid);

  int32_t target_cpu = ReadValue<int32_t>(start + format->target_cpu_offset);
  compact_buf->sched_waking().target_cpu().Append(target_cpu);

  int32_t prio = ReadValue<int32_t>(start + format->prio_offset);
  compact_buf->sched_waking().prio().Append(prio);

  // comm
  const char* comm_ptr =
      reinterpret_cast<const char*>(start + format->comm_offset);
  size_t iid = compact_buf->interner().InternComm(comm_ptr);
  compact_buf->sched_waking().comm_index().Append(iid);

  uint32_t common_flags =
      ReadValue<uint8_t>(start + format->common_flags_offset);
  compact_buf->sched_waking().common_flags().Append(common_flags);
}

}  // namespace perfetto