/*
 * Copyright (C) 2019 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/profiling/perf/event_reader.h"

#include <inttypes.h>
#include <linux/perf_event.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

#include <atomic>

#include "perfetto/ext/base/utils.h"
#include "src/profiling/perf/regs_parsing.h"

namespace perfetto {
namespace profiling {

namespace {

template <typename T>
const char* ReadValue(T* value_out, const char* ptr) {
  memcpy(value_out, reinterpret_cast<const void*>(ptr), sizeof(T));
  return ptr + sizeof(T);
}

bool IsPowerOfTwo(size_t v) {
  return (v != 0 && ((v & (v - 1)) == 0));
}

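// glibc does not provide a wrapper for perf_event_open(2), so the helper
// below invokes the raw syscall directly.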
static int perf_event_open(perf_event_attr* attr,
                           pid_t pid,
                           int cpu,
                           int group_fd,
                           unsigned long flags) {
  return static_cast<int>(
      syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags));
}

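// Opens a cpu-scoped event: per perf_event_open(2), pid == -1 together with
// cpu >= 0 measures all processes/threads on that CPU (subject to the usual
// perf privilege checks).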
base::ScopedFile PerfEventOpen(uint32_t cpu, const EventConfig& event_cfg) {
  base::ScopedFile perf_fd{
      perf_event_open(event_cfg.perf_attr(), /*pid=*/-1, static_cast<int>(cpu),
                      /*group_fd=*/-1, PERF_FLAG_FD_CLOEXEC)};
  return perf_fd;
}

}  // namespace

PerfRingBuffer::PerfRingBuffer(PerfRingBuffer&& other) noexcept
    : metadata_page_(other.metadata_page_),
      mmap_sz_(other.mmap_sz_),
      data_buf_(other.data_buf_),
      data_buf_sz_(other.data_buf_sz_) {
  other.metadata_page_ = nullptr;
  other.mmap_sz_ = 0;
  other.data_buf_ = nullptr;
  other.data_buf_sz_ = 0;
}

PerfRingBuffer& PerfRingBuffer::operator=(PerfRingBuffer&& other) noexcept {
  if (this == &other)
    return *this;

  this->~PerfRingBuffer();
  new (this) PerfRingBuffer(std::move(other));
  return *this;
}

PerfRingBuffer::~PerfRingBuffer() {
  if (!valid())
    return;

  if (munmap(reinterpret_cast<void*>(metadata_page_), mmap_sz_) != 0)
    PERFETTO_PLOG("failed munmap");
}

base::Optional<PerfRingBuffer> PerfRingBuffer::Allocate(
    int perf_fd,
    size_t data_page_count) {
  // perf_event_open requires the data section of the ring buffer to be a
  // power-of-two number of pages.
  PERFETTO_DCHECK(IsPowerOfTwo(data_page_count));

  PerfRingBuffer ret;

  // The mmap request is one page larger than the buffer size (for the
  // metadata page).
  ret.data_buf_sz_ = data_page_count * base::kPageSize;
  ret.mmap_sz_ = ret.data_buf_sz_ + base::kPageSize;

  // If PROT_WRITE, the kernel won't overwrite unread samples.
  void* mmap_addr = mmap(nullptr, ret.mmap_sz_, PROT_READ | PROT_WRITE,
                         MAP_SHARED, perf_fd, 0);
  if (mmap_addr == MAP_FAILED) {
    PERFETTO_PLOG("failed mmap");
    return base::nullopt;
  }

  // Expected layout is [ metadata page ] [ data pages ... ]
  ret.metadata_page_ = reinterpret_cast<perf_event_mmap_page*>(mmap_addr);
  ret.data_buf_ = reinterpret_cast<char*>(mmap_addr) + base::kPageSize;
  PERFETTO_CHECK(ret.metadata_page_->data_offset == base::kPageSize);
  PERFETTO_CHECK(ret.metadata_page_->data_size == ret.data_buf_sz_);

  return base::make_optional(std::move(ret));
}

// See |perf_output_put_handle| for the necessary synchronization between the
// kernel and this userspace thread (which are using the same shared memory,
// but might be on different cores).
// TODO(rsavitski): is there false sharing between |data_tail| and |data_head|?
// Is there an argument for maintaining our own copy of |data_tail| instead of
// reloading it?
char* PerfRingBuffer::ReadRecordNonconsuming() {
  static_assert(sizeof(std::atomic<uint64_t>) == sizeof(uint64_t), "");

  PERFETTO_CHECK(valid());

  // |data_tail| is written only by this userspace thread, so we can safely
  // read it without any synchronization.
  uint64_t read_offset = metadata_page_->data_tail;

  // |data_head| is written by the kernel, perform an acquiring load such that
  // the payload reads below are ordered after this load.
  uint64_t write_offset =
      reinterpret_cast<std::atomic<uint64_t>*>(&metadata_page_->data_head)
          ->load(std::memory_order_acquire);

  PERFETTO_DCHECK(read_offset <= write_offset);
  if (write_offset == read_offset)
    return nullptr;  // no new data

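  // |data_buf_sz_| is a power of two (checked in Allocate), so masking with
  // (size - 1) is equivalent to taking the offset modulo the buffer size.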
  size_t read_pos = static_cast<size_t>(read_offset & (data_buf_sz_ - 1));

  // event header (64 bits) guaranteed to be contiguous
  PERFETTO_DCHECK(read_pos <= data_buf_sz_ - sizeof(perf_event_header));
  PERFETTO_DCHECK(0 == reinterpret_cast<size_t>(data_buf_ + read_pos) %
                           alignof(perf_event_header));

  perf_event_header* evt_header =
      reinterpret_cast<perf_event_header*>(data_buf_ + read_pos);
  uint16_t evt_size = evt_header->size;

  // event wrapped - reconstruct it, and return a pointer to the buffer
  if (read_pos + evt_size > data_buf_sz_) {
    PERFETTO_DCHECK(read_pos + evt_size !=
                    ((read_pos + evt_size) & (data_buf_sz_ - 1)));
    PERFETTO_DLOG("PerfRingBuffer: returning reconstructed event");

    size_t prefix_sz = data_buf_sz_ - read_pos;
    memcpy(&reconstructed_record_[0], data_buf_ + read_pos, prefix_sz);
    memcpy(&reconstructed_record_[0] + prefix_sz, data_buf_,
           evt_size - prefix_sz);
    return &reconstructed_record_[0];
  } else {
    // usual case - contiguous sample
    PERFETTO_DCHECK(read_pos + evt_size ==
                    ((read_pos + evt_size) & (data_buf_sz_ - 1)));

    return data_buf_ + read_pos;
  }
}

void PerfRingBuffer::Consume(size_t bytes) {
  PERFETTO_CHECK(valid());

  // Advance |data_tail|, which is written only by this thread. The store of
  // the updated value needs to have release semantics such that the preceding
  // payload reads are ordered before it. The reader in this case is the
  // kernel, which reads |data_tail| to calculate the available ring buffer
  // capacity before trying to store a new record.
  uint64_t updated_tail = metadata_page_->data_tail + bytes;
  reinterpret_cast<std::atomic<uint64_t>*>(&metadata_page_->data_tail)
      ->store(updated_tail, std::memory_order_release);
}

EventReader::EventReader(uint32_t cpu,
                         perf_event_attr event_attr,
                         base::ScopedFile perf_fd,
                         PerfRingBuffer ring_buffer)
    : cpu_(cpu),
      event_attr_(event_attr),
      perf_fd_(std::move(perf_fd)),
      ring_buffer_(std::move(ring_buffer)) {}

EventReader::EventReader(EventReader&& other) noexcept
    : cpu_(other.cpu_),
      event_attr_(other.event_attr_),
      perf_fd_(std::move(other.perf_fd_)),
      ring_buffer_(std::move(other.ring_buffer_)) {}

EventReader& EventReader::operator=(EventReader&& other) noexcept {
  if (this == &other)
    return *this;

  this->~EventReader();
  new (this) EventReader(std::move(other));
  return *this;
}

base::Optional<EventReader> EventReader::ConfigureEvents(
    uint32_t cpu,
    const EventConfig& event_cfg) {
  auto perf_fd = PerfEventOpen(cpu, event_cfg);
  if (!perf_fd) {
    PERFETTO_PLOG("failed perf_event_open");
    return base::nullopt;
  }

  auto ring_buffer =
      PerfRingBuffer::Allocate(perf_fd.get(), event_cfg.ring_buffer_pages());
  if (!ring_buffer.has_value()) {
    return base::nullopt;
  }

  return base::make_optional<EventReader>(cpu, *event_cfg.perf_attr(),
                                          std::move(perf_fd),
                                          std::move(ring_buffer.value()));
}

base::Optional<ParsedSample> EventReader::ReadUntilSample(
    std::function<void(uint64_t)> records_lost_callback) {
  for (;;) {
    char* event = ring_buffer_.ReadRecordNonconsuming();
    if (!event)
      return base::nullopt;  // caught up with the writer

    auto* event_hdr = reinterpret_cast<const perf_event_header*>(event);

    if (event_hdr->type == PERF_RECORD_SAMPLE) {
      ParsedSample sample = ParseSampleRecord(cpu_, event);
      ring_buffer_.Consume(event_hdr->size);
      return base::make_optional(std::move(sample));
    }

    if (event_hdr->type == PERF_RECORD_LOST) {
      /*
       * struct {
       *   struct perf_event_header header;
       *   u64 id;
       *   u64 lost;
       *   struct sample_id sample_id;
       * };
       */
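      // |lost| is the second u64 after the header, so the read below skips
      // past the leading |id| field.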
      uint64_t records_lost = *reinterpret_cast<const uint64_t*>(
          event + sizeof(perf_event_header) + sizeof(uint64_t));

      records_lost_callback(records_lost);
      ring_buffer_.Consume(event_hdr->size);
      continue;  // keep looking for a sample
    }

    // Kernel had to throttle irqs.
    if (event_hdr->type == PERF_RECORD_THROTTLE ||
        event_hdr->type == PERF_RECORD_UNTHROTTLE) {
      ring_buffer_.Consume(event_hdr->size);
      continue;  // keep looking for a sample
    }

    PERFETTO_DFATAL_OR_ELOG("Unsupported event type [%zu]",
                            static_cast<size_t>(event_hdr->type));
    ring_buffer_.Consume(event_hdr->size);
  }
}

// Generally, samples can belong to any cpu (which can be recorded with
// PERF_SAMPLE_CPU). However, this producer uses only cpu-scoped events, so
// the cpu is already known at parsing time.
ParsedSample EventReader::ParseSampleRecord(uint32_t cpu,
                                            const char* record_start) {
  if (event_attr_.sample_type &
      (~uint64_t(PERF_SAMPLE_TID | PERF_SAMPLE_TIME | PERF_SAMPLE_STACK_USER |
                 PERF_SAMPLE_REGS_USER))) {
    PERFETTO_FATAL("Unsupported sampling option");
  }

  auto* event_hdr = reinterpret_cast<const perf_event_header*>(record_start);
  size_t sample_size = event_hdr->size;

  ParsedSample sample = {};
  sample.cpu = cpu;
  sample.cpu_mode = event_hdr->misc & PERF_RECORD_MISC_CPUMODE_MASK;

  // Parse the payload, which consists of concatenated data for each
  // |attr.sample_type| flag.
  const char* parse_pos = record_start + sizeof(perf_event_header);

  if (event_attr_.sample_type & PERF_SAMPLE_TID) {
    uint32_t pid = 0;
    uint32_t tid = 0;
    parse_pos = ReadValue(&pid, parse_pos);
    parse_pos = ReadValue(&tid, parse_pos);
    sample.pid = static_cast<pid_t>(pid);
    sample.tid = static_cast<pid_t>(tid);
  }

  if (event_attr_.sample_type & PERF_SAMPLE_TIME) {
    parse_pos = ReadValue(&sample.timestamp, parse_pos);
  }

  if (event_attr_.sample_type & PERF_SAMPLE_REGS_USER) {
    // Can be empty, e.g. if we sampled a kernel thread.
    sample.regs = ReadPerfUserRegsData(&parse_pos);
  }

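  // PERF_SAMPLE_STACK_USER payload layout (see linux/perf_event.h):
  //   u64  size;        // requested snapshot size
  //   char data[size];  // raw stack bytes
  //   u64  dyn_size;    // valid bytes within |data| (omitted if size == 0)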
  if (event_attr_.sample_type & PERF_SAMPLE_STACK_USER) {
    uint64_t max_stack_size;  // the requested size
    parse_pos = ReadValue(&max_stack_size, parse_pos);

    const char* stack_start = parse_pos;
    parse_pos += max_stack_size;  // skip to dyn_size

    // Payload written conditionally, e.g. kernel threads don't have a
    // user stack.
    if (max_stack_size > 0) {
      uint64_t filled_stack_size;
      parse_pos = ReadValue(&filled_stack_size, parse_pos);
      PERFETTO_DLOG("sampled stack size: %" PRIu64 " / %" PRIu64 "",
                    filled_stack_size, max_stack_size);

      // copy stack bytes into a vector
      size_t payload_sz = static_cast<size_t>(filled_stack_size);
      sample.stack.resize(payload_sz);
      memcpy(sample.stack.data(), stack_start, payload_sz);
    }
  }

  PERFETTO_CHECK(parse_pos == record_start + sample_size);
  return sample;
}

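// Disables the perf event, which stops the kernel from producing further
// samples for it. Records already written to the ring buffer remain readable.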
void EventReader::PauseEvents() {
  int ret = ioctl(perf_fd_.get(), PERF_EVENT_IOC_DISABLE);
  PERFETTO_CHECK(ret == 0);
}

}  // namespace profiling
}  // namespace perfetto