/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/traced/probes/ftrace/cpu_reader.h"

#include <dirent.h>
#include <fcntl.h>
#include <signal.h>

#include <algorithm>
#include <utility>

#include "perfetto/base/build_config.h"
#include "perfetto/base/logging.h"
#include "perfetto/ext/base/metatrace.h"
#include "perfetto/ext/base/optional.h"
#include "perfetto/ext/base/string_splitter.h"
#include "perfetto/ext/base/string_utils.h"
#include "perfetto/ext/base/utils.h"
#include "perfetto/ext/tracing/core/trace_writer.h"
#include "src/kallsyms/kernel_symbol_map.h"
#include "src/kallsyms/lazy_kernel_symbolizer.h"
#include "src/traced/probes/ftrace/cpu_stats_parser.h"
#include "src/traced/probes/ftrace/ftrace_config_muxer.h"
#include "src/traced/probes/ftrace/ftrace_controller.h"
#include "src/traced/probes/ftrace/ftrace_data_source.h"
#include "src/traced/probes/ftrace/proto_translation_table.h"

#include "protos/perfetto/trace/ftrace/ftrace_event.pbzero.h"
#include "protos/perfetto/trace/ftrace/ftrace_event_bundle.pbzero.h"
#include "protos/perfetto/trace/ftrace/generic.pbzero.h"
#include "protos/perfetto/trace/interned_data/interned_data.pbzero.h"
#include "protos/perfetto/trace/profiling/profile_common.pbzero.h"
#include "protos/perfetto/trace/trace_packet.pbzero.h"

namespace perfetto {
namespace {

// If the compact_sched buffer accumulates more unique strings than this
// threshold, the reader will flush it to reset the interning state (and make
// it cheap again). This is not an exact cap, since we check only at tracing
// page boundaries.
constexpr size_t kCompactSchedInternerThreshold = 64;

// For further documentation of these constants see the kernel source:
//   linux/include/linux/ring_buffer.h
// Some of this is also available to userspace at runtime via:
//   /sys/kernel/tracing/events/header_event
constexpr uint32_t kTypeDataTypeLengthMax = 28;
constexpr uint32_t kTypePadding = 29;
constexpr uint32_t kTypeTimeExtend = 30;
constexpr uint32_t kTypeTimeStamp = 31;

struct EventHeader {
  // bottom 5 bits
  uint32_t type_or_length : 5;
  // top 27 bits
  uint32_t time_delta : 27;
};
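
// Decoding sketch (illustrative values, assuming the bitfield layout above):
// a raw 32-bit header word of 0x000000a3 has
// type_or_length == 0xa3 & 0x1f == 3, i.e. a 3 * 4 == 12 byte data record,
// and time_delta == 0xa3 >> 5 == 5 timestamp units.
static_assert(sizeof(EventHeader) == 4,
              "EventHeader is expected to pack into a single 32-bit word");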
72 
73 // Reads a string from `start` until the first '\0' byte or until fixed_len
74 // characters have been read. Appends it to `*out` as field `field_id`.
ReadIntoString(const uint8_t * start,size_t fixed_len,uint32_t field_id,protozero::Message * out)75 void ReadIntoString(const uint8_t* start,
76                     size_t fixed_len,
77                     uint32_t field_id,
78                     protozero::Message* out) {
79   size_t len = strnlen(reinterpret_cast<const char*>(start), fixed_len);
80   out->AppendBytes(field_id, reinterpret_cast<const char*>(start), len);
81 }
82 
ReadDataLoc(const uint8_t * start,const uint8_t * field_start,const uint8_t * end,const Field & field,protozero::Message * message)83 bool ReadDataLoc(const uint8_t* start,
84                  const uint8_t* field_start,
85                  const uint8_t* end,
86                  const Field& field,
87                  protozero::Message* message) {
88   PERFETTO_DCHECK(field.ftrace_size == 4);
89   // See kernel header include/trace/trace_events.h
90   uint32_t data = 0;
91   const uint8_t* ptr = field_start;
92   if (!CpuReader::ReadAndAdvance(&ptr, end, &data)) {
93     PERFETTO_DFATAL("couldn't read __data_loc value");
94     return false;
95   }
96 
97   const uint16_t offset = data & 0xffff;
98   const uint16_t len = (data >> 16) & 0xffff;
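  // Worked example (illustrative value): if data == 0x000c0040, the low half
  // gives offset == 0x0040 and the high half gives len == 12, i.e. a 12-byte
  // string stored 64 bytes past |start|.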
  const uint8_t* const string_start = start + offset;

  if (PERFETTO_UNLIKELY(len == 0))
    return true;
  if (PERFETTO_UNLIKELY(string_start < start || string_start + len > end)) {
    PERFETTO_DFATAL("__data_loc points at invalid location");
    return false;
  }
  ReadIntoString(string_start, len, field.proto_field_id, message);
  return true;
}

template <typename T>
T ReadValue(const uint8_t* ptr) {
  T t;
  memcpy(&t, reinterpret_cast<const void*>(ptr), sizeof(T));
  return t;
}

// Reads a signed ftrace value as an int64_t, sign extending if necessary.
static int64_t ReadSignedFtraceValue(const uint8_t* ptr,
                                     FtraceFieldType ftrace_type) {
  if (ftrace_type == kFtraceInt32) {
    int32_t value;
    memcpy(&value, reinterpret_cast<const void*>(ptr), sizeof(value));
    return int64_t(value);
  }
  if (ftrace_type == kFtraceInt64) {
    int64_t value;
    memcpy(&value, reinterpret_cast<const void*>(ptr), sizeof(value));
    return value;
  }
  PERFETTO_FATAL("unexpected ftrace type");
}

bool SetBlocking(int fd, bool is_blocking) {
  int flags = fcntl(fd, F_GETFL, 0);
  flags = (is_blocking) ? (flags & ~O_NONBLOCK) : (flags | O_NONBLOCK);
  return fcntl(fd, F_SETFL, flags) == 0;
}

void LogInvalidPage(const void* start, size_t size) {
  PERFETTO_ELOG("Invalid ftrace page");
  std::string hexdump = base::HexDump(start, size);
  // Only a single line per log message, because log message size might be
  // limited.
  for (base::StringSplitter ss(std::move(hexdump), '\n'); ss.Next();) {
    PERFETTO_ELOG("%s", ss.cur_token());
  }
}

}  // namespace

using protos::pbzero::GenericFtraceEvent;

CpuReader::CpuReader(size_t cpu,
                     const ProtoTranslationTable* table,
                     LazyKernelSymbolizer* symbolizer,
                     const FtraceClockSnapshot* ftrace_clock_snapshot,
                     base::ScopedFile trace_fd)
    : cpu_(cpu),
      table_(table),
      symbolizer_(symbolizer),
      ftrace_clock_snapshot_(ftrace_clock_snapshot),
      trace_fd_(std::move(trace_fd)) {
  PERFETTO_CHECK(trace_fd_);
  PERFETTO_CHECK(SetBlocking(*trace_fd_, false));
}

CpuReader::~CpuReader() = default;

size_t CpuReader::ReadCycle(
    uint8_t* parsing_buf,
    size_t parsing_buf_size_pages,
    size_t max_pages,
    const std::set<FtraceDataSource*>& started_data_sources) {
  PERFETTO_DCHECK(max_pages > 0 && parsing_buf_size_pages > 0);
  metatrace::ScopedEvent evt(metatrace::TAG_FTRACE,
                             metatrace::FTRACE_CPU_READ_CYCLE);

  // Work in batches to keep cache locality, and limit memory usage.
  size_t batch_pages = std::min(parsing_buf_size_pages, max_pages);
  size_t total_pages_read = 0;
  for (bool is_first_batch = true;; is_first_batch = false) {
    size_t pages_read = ReadAndProcessBatch(
        parsing_buf, batch_pages, is_first_batch, started_data_sources);

    PERFETTO_DCHECK(pages_read <= batch_pages);
    total_pages_read += pages_read;

    // Check whether we've caught up to the writer, or are giving up on this
    // attempt due to an error.
    if (pages_read != batch_pages)
      break;
    // Check if we've hit the limit of work for this cycle.
    if (total_pages_read >= max_pages)
      break;
  }
  PERFETTO_METATRACE_COUNTER(TAG_FTRACE, FTRACE_PAGES_DRAINED,
                             total_pages_read);
  return total_pages_read;
}

// metatrace note: mark the reading phase as FTRACE_CPU_READ_BATCH, but let the
// parsing time be implied (by the difference between the caller's span, and
// this reading span). Makes it easier to estimate the read/parse ratio when
// looking at the trace in the UI.
size_t CpuReader::ReadAndProcessBatch(
    uint8_t* parsing_buf,
    size_t max_pages,
    bool first_batch_in_cycle,
    const std::set<FtraceDataSource*>& started_data_sources) {
  size_t pages_read = 0;
  {
    metatrace::ScopedEvent evt(metatrace::TAG_FTRACE,
                               metatrace::FTRACE_CPU_READ_BATCH);
    for (; pages_read < max_pages;) {
      uint8_t* curr_page = parsing_buf + (pages_read * base::kPageSize);
      ssize_t res =
          PERFETTO_EINTR(read(*trace_fd_, curr_page, base::kPageSize));
      if (res < 0) {
        // Expected errors:
        // EAGAIN: no data (since we're in non-blocking mode).
        // ENOMEM, EBUSY: temporary ftrace failures (they happen).
        // ENODEV: the cpu is offline (b/145583318).
        if (errno != EAGAIN && errno != ENOMEM && errno != EBUSY &&
            errno != ENODEV) {
          PERFETTO_PLOG("Unexpected error on raw ftrace read");
        }
        break;  // stop reading regardless of errno
      }

      // As long as all of our reads are for a single page, the kernel should
      // return exactly a well-formed raw ftrace page (if not in the steady
      // state of reading out fully-written pages, the kernel will construct
      // pages as necessary, copying over events and zero-filling at the end).
      // A sub-page read() is therefore not expected in practice (unless
      // there's a concurrent reader requesting less than a page?). Crash if
      // encountering this situation. Kernel source pointer: see usage of
      // |info->read| within |tracing_buffers_read|.
      if (res == 0) {
        // Very rare, but possible. Stop for now, should recover.
        PERFETTO_DLOG("[cpu%zu]: 0-sized read from ftrace pipe.", cpu_);
        break;
      }
      PERFETTO_CHECK(res == static_cast<ssize_t>(base::kPageSize));

      pages_read += 1;

      // Compare the amount of ftrace data read against an empirical threshold
      // to make an educated guess on whether we should read more. To figure
      // out the amount of ftrace data, we need to parse the page header (since
      // the read always returns a page, zero-filled at the end). If we read
      // fewer bytes than the threshold, it means that we caught up with the
      // write pointer and we started consuming ftrace events in real-time.
      // This cannot be just 4096 because it needs to account for
      // fragmentation, i.e. for the fact that the last trace event didn't fit
      // in the current page and hence the current page was terminated
      // prematurely.
      static constexpr size_t kRoughlyAPage = base::kPageSize - 512;
      const uint8_t* scratch_ptr = curr_page;
      base::Optional<PageHeader> hdr =
          ParsePageHeader(&scratch_ptr, table_->page_header_size_len());
      PERFETTO_DCHECK(hdr && hdr->size > 0 && hdr->size <= base::kPageSize);
      if (!hdr.has_value()) {
        PERFETTO_ELOG("[cpu%zu]: can't parse page header", cpu_);
        break;
      }
      // Note that it is normal for the first read after starting a read cycle
      // to be small: it contains the remainder of events from a page that we
      // partially consumed during the last read of the previous cycle (having
      // caught up to the writer).
      if (hdr->size < kRoughlyAPage &&
          !(first_batch_in_cycle && pages_read == 1)) {
        break;
      }
    }
  }  // end of metatrace::FTRACE_CPU_READ_BATCH

  // Parse the pages and write to the trace for all relevant data
  // sources.
  if (pages_read == 0)
    return pages_read;

  for (FtraceDataSource* data_source : started_data_sources) {
    size_t pages_parsed_ok = ProcessPagesForDataSource(
        data_source->trace_writer(), data_source->mutable_metadata(), cpu_,
        data_source->parsing_config(), parsing_buf, pages_read, table_,
        symbolizer_, ftrace_clock_snapshot_, ftrace_clock_);
    // If this happens, it means that we did not know how to parse the kernel
    // binary format. This is a bug in either perfetto or the kernel, and must
    // be investigated. Hence we abort instead of recording a bit in the ftrace
    // stats proto, which is easier to overlook.
    if (pages_parsed_ok != pages_read) {
      const size_t first_bad_page_idx = pages_parsed_ok;
      const uint8_t* curr_page =
          parsing_buf + (first_bad_page_idx * base::kPageSize);
      LogInvalidPage(curr_page, base::kPageSize);
      PERFETTO_FATAL("Failed to parse ftrace page");
    }
  }

  return pages_read;
}

// static
size_t CpuReader::ProcessPagesForDataSource(
    TraceWriter* trace_writer,
    FtraceMetadata* metadata,
    size_t cpu,
    const FtraceDataSourceConfig* ds_config,
    const uint8_t* parsing_buf,
    const size_t pages_read,
    const ProtoTranslationTable* table,
    LazyKernelSymbolizer* symbolizer,
    const FtraceClockSnapshot* ftrace_clock_snapshot,
    protos::pbzero::FtraceClock ftrace_clock) {
  // Allocate the buffer for compact scheduler events (which will be unused if
  // the compact option isn't enabled).
  CompactSchedBuffer compact_sched;
  bool compact_sched_enabled = ds_config->compact_sched.enabled;

  TraceWriter::TracePacketHandle packet;
  protos::pbzero::FtraceEventBundle* bundle = nullptr;

  // This function is called after the contents of a FtraceBundle are written.
  auto finalize_cur_packet = [&] {
    PERFETTO_DCHECK(packet);
    if (compact_sched_enabled)
      compact_sched.WriteAndReset(bundle);

    bundle->Finalize();
    bundle = nullptr;

    // Write the kernel symbol index (mangled address) -> name table.
    // |metadata| is shared across all cpus, is distinct per |data_source| (i.e.
    // tracing session) and is cleared after each FtraceController::ReadTick().
    if (ds_config->symbolize_ksyms) {
      // Symbol indexes are assigned monotonically as |kernel_addrs.size()|,
      // starting from index 1 (no symbol has index 0). Here we remember the
      // size() (which is also == the highest value in |kernel_addrs|) at the
      // beginning and only write newer indexes bigger than that.
      uint32_t max_index_at_start = metadata->last_kernel_addr_index_written;
      PERFETTO_DCHECK(max_index_at_start <= metadata->kernel_addrs.size());
      protos::pbzero::InternedData* interned_data = nullptr;
      auto* ksyms_map = symbolizer->GetOrCreateKernelSymbolMap();
      bool wrote_at_least_one_symbol = false;
      for (const FtraceMetadata::KernelAddr& kaddr : metadata->kernel_addrs) {
        if (kaddr.index <= max_index_at_start)
          continue;
        std::string sym_name = ksyms_map->Lookup(kaddr.addr);
        if (sym_name.empty()) {
          // Lookup failed. This can genuinely happen on many occasions. E.g.,
          // workqueue_execute_start has two pointers: one is a pointer to a
          // function (which we expect to be symbolized), the other (|work|) is
          // a pointer to a heap struct, which is unsymbolizable, even when
          // using the textual ftrace endpoint.
          continue;
        }

        if (!interned_data) {
          // If this is the very first write, clear the start of the sequence
          // so the trace processor knows that all previous indexes can be
          // discarded and that the mapping is restarting.
          // In most cases this occurs with cpu==0. But if cpu0 is idle, this
          // will happen with the first CPU that has any ftrace data.
          if (max_index_at_start == 0) {
            packet->set_sequence_flags(
                protos::pbzero::TracePacket::SEQ_INCREMENTAL_STATE_CLEARED);
          }
          interned_data = packet->set_interned_data();
        }
        auto* interned_sym = interned_data->add_kernel_symbols();
        interned_sym->set_iid(kaddr.index);
        interned_sym->set_str(sym_name);
        wrote_at_least_one_symbol = true;
      }

      auto max_it_at_end = static_cast<uint32_t>(metadata->kernel_addrs.size());

      // Rationale for the if (wrote_at_least_one_symbol) check: in rare cases,
      // all symbols seen in a ProcessPagesForDataSource() call can fail the
      // ksyms_map->Lookup(). If that happens we don't want to bump the
      // last_kernel_addr_index_written watermark, as that would cause the next
      // call to NOT emit the SEQ_INCREMENTAL_STATE_CLEARED.
      if (wrote_at_least_one_symbol)
        metadata->last_kernel_addr_index_written = max_it_at_end;
    }

    packet->Finalize();
  };  // finalize_cur_packet().

  auto start_new_packet = [&](bool lost_events) {
    if (packet)
      finalize_cur_packet();
    packet = trace_writer->NewTracePacket();
    bundle = packet->set_ftrace_events();
    if (ftrace_clock) {
      bundle->set_ftrace_clock(ftrace_clock);

      if (ftrace_clock_snapshot && ftrace_clock_snapshot->ftrace_clock_ts) {
        bundle->set_ftrace_timestamp(ftrace_clock_snapshot->ftrace_clock_ts);
        bundle->set_boot_timestamp(ftrace_clock_snapshot->boot_clock_ts);
      }
    }

    // Note: The fastpath in proto_trace_parser.cc speculates on the fact
    // that the cpu field is the first field of the proto message. If this
    // changes, change proto_trace_parser.cc accordingly.
    bundle->set_cpu(static_cast<uint32_t>(cpu));
    if (lost_events)
      bundle->set_lost_events(true);
  };

  start_new_packet(/*lost_events=*/false);
  size_t pages_parsed = 0;
  for (; pages_parsed < pages_read; pages_parsed++) {
    const uint8_t* curr_page = parsing_buf + (pages_parsed * base::kPageSize);
    const uint8_t* curr_page_end = curr_page + base::kPageSize;
    const uint8_t* parse_pos = curr_page;
    base::Optional<PageHeader> page_header =
        ParsePageHeader(&parse_pos, table->page_header_size_len());

    if (!page_header.has_value() || page_header->size == 0 ||
        parse_pos >= curr_page_end ||
        parse_pos + page_header->size > curr_page_end) {
      break;
    }

    // Start a new bundle if either:
    // * The page we're about to read indicates that there was a kernel ring
    //   buffer overrun since our last read from that per-cpu buffer. We have
    //   a single |lost_events| field per bundle, so start a new packet.
    // * The compact_sched buffer is holding more unique interned strings than
    //   a threshold. We need to flush the compact buffer to make the
    //   interning lookups cheap again.
    bool interner_past_threshold =
        compact_sched_enabled &&
        compact_sched.interner().interned_comms_size() >
            kCompactSchedInternerThreshold;

    if (page_header->lost_events || interner_past_threshold)
      start_new_packet(page_header->lost_events);

    size_t evt_size =
        ParsePagePayload(parse_pos, &page_header.value(), table, ds_config,
                         &compact_sched, bundle, metadata);

    if (evt_size != page_header->size) {
      break;
    }
  }
  finalize_cur_packet();

  return pages_parsed;
}

// A page header consists of:
// * timestamp: 8 bytes
// * commit: 8 bytes on 64 bit, 4 bytes on 32 bit kernels
//
// The kernel reports this at /sys/kernel/debug/tracing/events/header_page.
//
// |commit|'s bottom bits represent the length of the payload following this
// header. The top bits have been repurposed as a bitset of flags pertaining to
// data loss. We look only at the "there has been some data lost" flag
// (RB_MISSED_EVENTS), and ignore the relatively tricky "appended the precise
// lost events count past the end of the valid data, as there was room to do so"
// flag (RB_MISSED_STORED).
//
// static
base::Optional<CpuReader::PageHeader> CpuReader::ParsePageHeader(
    const uint8_t** ptr,
    uint16_t page_header_size_len) {
  // Mask for the data length portion of the |commit| field. Note that the
  // kernel implementation never explicitly defines the boundary (beyond using
  // bits 30 and 31 as flags), but 27 bits are mentioned as sufficient in the
  // original commit message, and it is the constant used by trace-cmd.
  constexpr static uint64_t kDataSizeMask = (1ull << 27) - 1;
  // If set, indicates that the relevant cpu has lost events since the last
  // read (the read itself clears the bit).
  constexpr static uint64_t kMissedEventsFlag = (1ull << 31);

  const uint8_t* end_of_page = *ptr + base::kPageSize;
  PageHeader page_header;
  if (!CpuReader::ReadAndAdvance<uint64_t>(ptr, end_of_page,
                                           &page_header.timestamp))
    return base::nullopt;

  uint32_t size_and_flags;

  // On little endian, we can just read a uint32_t and reject the rest of the
  // number later.
  if (!CpuReader::ReadAndAdvance<uint32_t>(
          ptr, end_of_page, base::AssumeLittleEndian(&size_and_flags)))
    return base::nullopt;

  page_header.size = size_and_flags & kDataSizeMask;
  page_header.lost_events = bool(size_and_flags & kMissedEventsFlag);
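  // Worked example (illustrative value): size_and_flags == 0x80000ff0 decodes
  // to size == 0xff0 (4080 payload bytes) with the missed-events bit set,
  // i.e. lost_events == true.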
  PERFETTO_DCHECK(page_header.size <= base::kPageSize);

  // Reject the rest of the number, if applicable. On 32-bit,
  // page_header_size_len - 4 will evaluate to 0 and this will be a no-op. On
  // 64-bit, this will advance |ptr| by 4 bytes.
  PERFETTO_DCHECK(page_header_size_len >= 4);
  *ptr += page_header_size_len - 4;

  return base::make_optional(page_header);
}

// A raw ftrace buffer page consists of a header followed by a sequence of
// binary ftrace events. See |ParsePageHeader| for the format of the former.
//
// This method is deliberately static so it can be tested independently.
size_t CpuReader::ParsePagePayload(const uint8_t* start_of_payload,
                                   const PageHeader* page_header,
                                   const ProtoTranslationTable* table,
                                   const FtraceDataSourceConfig* ds_config,
                                   CompactSchedBuffer* compact_sched_buffer,
                                   FtraceEventBundle* bundle,
                                   FtraceMetadata* metadata) {
  const uint8_t* ptr = start_of_payload;
  const uint8_t* const end = ptr + page_header->size;

  uint64_t timestamp = page_header->timestamp;

  while (ptr < end) {
    EventHeader event_header;
    if (!ReadAndAdvance(&ptr, end, &event_header))
      return 0;

    timestamp += event_header.time_delta;

    switch (event_header.type_or_length) {
      case kTypePadding: {
        // Left over page padding or discarded event.
        if (event_header.time_delta == 0) {
          // Not clear what the correct behaviour is in this case.
          PERFETTO_DFATAL("Empty padding event.");
          return 0;
        }
        uint32_t length = 0;
        if (!ReadAndAdvance<uint32_t>(&ptr, end, &length))
          return 0;
        // length includes itself (4 bytes)
        if (length < 4)
          return 0;
        ptr += length - 4;
        break;
      }
      case kTypeTimeExtend: {
        // Extend the time delta.
        uint32_t time_delta_ext = 0;
        if (!ReadAndAdvance<uint32_t>(&ptr, end, &time_delta_ext))
          return 0;
        timestamp += (static_cast<uint64_t>(time_delta_ext)) << 27;
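        // E.g. (illustrative values): with event_header.time_delta == 100 and
        // time_delta_ext == 2, the total delta applied for this record is
        // 100 + (2 << 27) == 268435556 timestamp units (the 100 was already
        // added before the switch).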
        break;
      }
      case kTypeTimeStamp: {
        // Absolute timestamp. This was historically partially implemented, but
        // not written. Kernels 4.17+ reimplemented this record, changing its
        // size in the process. We assume the newer layout. It is parsed the
        // same way as kTypeTimeExtend, except that the timestamp is
        // interpreted as an absolute value, instead of a delta on top of the
        // previous state.
        uint32_t time_delta_ext = 0;
        if (!ReadAndAdvance<uint32_t>(&ptr, end, &time_delta_ext))
          return 0;
        timestamp = event_header.time_delta +
                    (static_cast<uint64_t>(time_delta_ext) << 27);
        break;
      }
      // Data record:
      default: {
        PERFETTO_CHECK(event_header.type_or_length <= kTypeDataTypeLengthMax);
        // type_or_length is <= 28, so it represents the length of a data
        // record. If == 0, this is an extended record and the size of the
        // record is stored in the first uint32_t word of the payload. See
        // the kernel's include/linux/ring_buffer.h.
        uint32_t event_size = 0;
        if (event_header.type_or_length == 0) {
          if (!ReadAndAdvance<uint32_t>(&ptr, end, &event_size))
            return 0;
          // Size includes the size field itself.
          if (event_size < 4)
            return 0;
          event_size -= 4;
        } else {
          event_size = 4 * event_header.type_or_length;
        }
        const uint8_t* start = ptr;
        const uint8_t* next = ptr + event_size;

        if (next > end)
          return 0;

        uint16_t ftrace_event_id;
        if (!ReadAndAdvance<uint16_t>(&ptr, end, &ftrace_event_id))
          return 0;

        if (ds_config->event_filter.IsEventEnabled(ftrace_event_id)) {
          // Special-cased handling of some scheduler events when compact format
          // is enabled.
          bool compact_sched_enabled = ds_config->compact_sched.enabled;
          const CompactSchedSwitchFormat& sched_switch_format =
              table->compact_sched_format().sched_switch;
          const CompactSchedWakingFormat& sched_waking_format =
              table->compact_sched_format().sched_waking;

          // compact sched_switch
          if (compact_sched_enabled &&
              ftrace_event_id == sched_switch_format.event_id) {
            if (event_size < sched_switch_format.size)
              return 0;

            ParseSchedSwitchCompact(start, timestamp, &sched_switch_format,
                                    compact_sched_buffer, metadata);

            // compact sched_waking
          } else if (compact_sched_enabled &&
                     ftrace_event_id == sched_waking_format.event_id) {
            if (event_size < sched_waking_format.size)
              return 0;

            ParseSchedWakingCompact(start, timestamp, &sched_waking_format,
                                    compact_sched_buffer, metadata);

          } else {
            // Common case: parse all other types of enabled events.
            protos::pbzero::FtraceEvent* event = bundle->add_event();
            event->set_timestamp(timestamp);
            if (!ParseEvent(ftrace_event_id, start, next, table, event,
                            metadata))
              return 0;
          }
        }

        // Jump to next event.
        ptr = next;
      }
    }
  }
  return static_cast<size_t>(ptr - start_of_payload);
}

// |start| is the start of the current event.
// |end| is the end of the buffer.
bool CpuReader::ParseEvent(uint16_t ftrace_event_id,
                           const uint8_t* start,
                           const uint8_t* end,
                           const ProtoTranslationTable* table,
                           protozero::Message* message,
                           FtraceMetadata* metadata) {
  PERFETTO_DCHECK(start < end);
  const size_t length = static_cast<size_t>(end - start);

  // TODO(hjd): Rework to work even if the event is unknown.
  const Event& info = *table->GetEventById(ftrace_event_id);

  // TODO(hjd): Test truncated events.
  // If the end of the buffer is before the end of the event, give up.
  if (info.size > length) {
    PERFETTO_DFATAL("Buffer overflowed.");
    return false;
  }

  bool success = true;
  for (const Field& field : table->common_fields())
    success &= ParseField(field, start, end, table, message, metadata);

  protozero::Message* nested =
      message->BeginNestedMessage<protozero::Message>(info.proto_field_id);

  // Parse generic event.
  if (PERFETTO_UNLIKELY(info.proto_field_id ==
                        protos::pbzero::FtraceEvent::kGenericFieldNumber)) {
    nested->AppendString(GenericFtraceEvent::kEventNameFieldNumber, info.name);
    for (const Field& field : info.fields) {
      auto generic_field = nested->BeginNestedMessage<protozero::Message>(
          GenericFtraceEvent::kFieldFieldNumber);
      generic_field->AppendString(GenericFtraceEvent::Field::kNameFieldNumber,
                                  field.ftrace_name);
      success &= ParseField(field, start, end, table, generic_field, metadata);
    }
  } else {  // Parse all other events.
    for (const Field& field : info.fields) {
      success &= ParseField(field, start, end, table, nested, metadata);
    }
  }

  if (PERFETTO_UNLIKELY(info.proto_field_id ==
                        protos::pbzero::FtraceEvent::kTaskRenameFieldNumber)) {
    // For task renames, we want to store that the pid was renamed. We use the
    // common pid to reduce code complexity as in all the cases we care about,
    // the common pid is the same as the renamed pid (the pid inside the event).
    PERFETTO_DCHECK(metadata->last_seen_common_pid);
    metadata->AddRenamePid(metadata->last_seen_common_pid);
  }

  // This finalizes |nested| and |proto_field| automatically.
  message->Finalize();
  metadata->FinishEvent();
  return success;
}

// The caller must guarantee that the field fits in the range, explicitly:
//   start + field.ftrace_offset + field.ftrace_size <= end
// The only exception is fields with strategy = kCStringToString, where the
// total size isn't known up front. In this case ParseField will check that
// the string terminates within bounds and won't read past |end|.
bool CpuReader::ParseField(const Field& field,
                           const uint8_t* start,
                           const uint8_t* end,
                           const ProtoTranslationTable* table,
                           protozero::Message* message,
                           FtraceMetadata* metadata) {
  PERFETTO_DCHECK(start + field.ftrace_offset + field.ftrace_size <= end);
  const uint8_t* field_start = start + field.ftrace_offset;
  uint32_t field_id = field.proto_field_id;

  switch (field.strategy) {
    case kUint8ToUint32:
    case kUint8ToUint64:
      ReadIntoVarInt<uint8_t>(field_start, field_id, message);
      return true;
    case kUint16ToUint32:
    case kUint16ToUint64:
      ReadIntoVarInt<uint16_t>(field_start, field_id, message);
      return true;
    case kUint32ToUint32:
    case kUint32ToUint64:
      ReadIntoVarInt<uint32_t>(field_start, field_id, message);
      return true;
    case kUint64ToUint64:
      ReadIntoVarInt<uint64_t>(field_start, field_id, message);
      return true;
    case kInt8ToInt32:
    case kInt8ToInt64:
      ReadIntoVarInt<int8_t>(field_start, field_id, message);
      return true;
    case kInt16ToInt32:
    case kInt16ToInt64:
      ReadIntoVarInt<int16_t>(field_start, field_id, message);
      return true;
    case kInt32ToInt32:
    case kInt32ToInt64:
      ReadIntoVarInt<int32_t>(field_start, field_id, message);
      return true;
    case kInt64ToInt64:
      ReadIntoVarInt<int64_t>(field_start, field_id, message);
      return true;
    case kFixedCStringToString:
      // TODO(hjd): Kernel-dive to check how size:0 char fields work.
      ReadIntoString(field_start, field.ftrace_size, field_id, message);
      return true;
    case kCStringToString:
      // TODO(hjd): Kernel-dive to check how size:0 char fields work.
      ReadIntoString(field_start, static_cast<size_t>(end - field_start),
                     field_id, message);
      return true;
    case kStringPtrToString: {
      uint64_t n = 0;
      // The ftrace field may be 8 or 4 bytes and we need to copy it into the
      // bottom of n. In the unlikely case where the field is >8 bytes we
      // should avoid making things worse by corrupting the stack, but we
      // don't need to handle it correctly.
      size_t size = std::min<size_t>(field.ftrace_size, sizeof(n));
      memcpy(base::AssumeLittleEndian(&n),
             reinterpret_cast<const void*>(field_start), size);
      // Look up the address in the printk format map and write it into the
      // proto.
      base::StringView name = table->LookupTraceString(n);
      message->AppendBytes(field_id, name.begin(), name.size());
      return true;
    }
    case kDataLocToString:
      return ReadDataLoc(start, field_start, end, field, message);
    case kBoolToUint32:
    case kBoolToUint64:
      ReadIntoVarInt<uint8_t>(field_start, field_id, message);
      return true;
    case kInode32ToUint64:
      ReadInode<uint32_t>(field_start, field_id, message, metadata);
      return true;
    case kInode64ToUint64:
      ReadInode<uint64_t>(field_start, field_id, message, metadata);
      return true;
    case kPid32ToInt32:
    case kPid32ToInt64:
      ReadPid(field_start, field_id, message, metadata);
      return true;
    case kCommonPid32ToInt32:
    case kCommonPid32ToInt64:
      ReadCommonPid(field_start, field_id, message, metadata);
      return true;
    case kDevId32ToUint64:
      ReadDevId<uint32_t>(field_start, field_id, message, metadata);
      return true;
    case kDevId64ToUint64:
      ReadDevId<uint64_t>(field_start, field_id, message, metadata);
      return true;
    case kFtraceSymAddr64ToUint64:
      ReadSymbolAddr<uint64_t>(field_start, field_id, message, metadata);
      return true;
    case kInvalidTranslationStrategy:
      break;
  }
  PERFETTO_FATAL("Unexpected translation strategy");
}

// Parse a sched_switch event according to a pre-validated format, and buffer
// the individual fields in the current compact batch. See the code populating
// |CompactSchedSwitchFormat| for the assumptions made around the format, which
// this code is closely tied to.
// static
void CpuReader::ParseSchedSwitchCompact(const uint8_t* start,
                                        uint64_t timestamp,
                                        const CompactSchedSwitchFormat* format,
                                        CompactSchedBuffer* compact_buf,
                                        FtraceMetadata* metadata) {
  compact_buf->sched_switch().AppendTimestamp(timestamp);

  int32_t next_pid = ReadValue<int32_t>(start + format->next_pid_offset);
  compact_buf->sched_switch().next_pid().Append(next_pid);
  metadata->AddPid(next_pid);

  int32_t next_prio = ReadValue<int32_t>(start + format->next_prio_offset);
  compact_buf->sched_switch().next_prio().Append(next_prio);

  // Varint encoding of int32 and int64 is the same, so treat the value as
  // int64 after reading.
  int64_t prev_state = ReadSignedFtraceValue(start + format->prev_state_offset,
                                             format->prev_state_type);
  compact_buf->sched_switch().prev_state().Append(prev_state);

  // next_comm
  const char* comm_ptr =
      reinterpret_cast<const char*>(start + format->next_comm_offset);
  size_t iid = compact_buf->interner().InternComm(comm_ptr);
  compact_buf->sched_switch().next_comm_index().Append(iid);
}

// static
void CpuReader::ParseSchedWakingCompact(const uint8_t* start,
                                        uint64_t timestamp,
                                        const CompactSchedWakingFormat* format,
                                        CompactSchedBuffer* compact_buf,
                                        FtraceMetadata* metadata) {
  compact_buf->sched_waking().AppendTimestamp(timestamp);

  int32_t pid = ReadValue<int32_t>(start + format->pid_offset);
  compact_buf->sched_waking().pid().Append(pid);
  metadata->AddPid(pid);

  int32_t target_cpu = ReadValue<int32_t>(start + format->target_cpu_offset);
  compact_buf->sched_waking().target_cpu().Append(target_cpu);

  int32_t prio = ReadValue<int32_t>(start + format->prio_offset);
  compact_buf->sched_waking().prio().Append(prio);

  // comm
  const char* comm_ptr =
      reinterpret_cast<const char*>(start + format->comm_offset);
  size_t iid = compact_buf->interner().InternComm(comm_ptr);
  compact_buf->sched_waking().comm_index().Append(iid);
}

}  // namespace perfetto