/**
 * @file cpu_buffer.c
 *
 * @remark Copyright 2002-2009 OProfile authors
 * @remark Read the file COPYING
 *
 * @author John Levon <levon@movementarian.org>
 * @author Barry Kasindorf <barry.kasindorf@amd.com>
 * @author Robert Richter <robert.richter@amd.com>
 *
 * Each CPU has a local buffer that stores PC value/event
 * pairs. We also log context switches when we notice them.
 * Eventually each CPU's buffer is processed into the global
 * event buffer by sync_buffer().
 *
 * We use a local buffer for two reasons: an NMI or similar
 * interrupt cannot synchronise, and high sampling rates
 * would lead to catastrophic global synchronisation if
 * a global buffer was used.
 */

#include <linux/sched.h>
#include <linux/oprofile.h>
#include <linux/errno.h>

#include <asm/ptrace.h>

#include "event_buffer.h"
#include "cpu_buffer.h"
#include "buffer_sync.h"
#include "oprof.h"

#define OP_BUFFER_FLAGS	0

static struct ring_buffer *op_ring_buffer;
DEFINE_PER_CPU(struct oprofile_cpu_buffer, op_cpu_buffer);

static void wq_sync_buffer(struct work_struct *work);

#define DEFAULT_TIMER_EXPIRE (HZ / 10)
static int work_enabled;

unsigned long oprofile_get_cpu_buffer_size(void)
{
	return oprofile_cpu_buffer_size;
}

void oprofile_cpu_buffer_inc_smpl_lost(void)
{
	struct oprofile_cpu_buffer *cpu_buf = this_cpu_ptr(&op_cpu_buffer);

	cpu_buf->sample_lost_overflow++;
}

void free_cpu_buffers(void)
{
	if (op_ring_buffer)
		ring_buffer_free(op_ring_buffer);
	op_ring_buffer = NULL;
}

#define RB_EVENT_HDR_SIZE 4

int alloc_cpu_buffers(void)
{
	int i;

	unsigned long buffer_size = oprofile_cpu_buffer_size;
	unsigned long byte_size = buffer_size * (sizeof(struct op_sample) +
						 RB_EVENT_HDR_SIZE);

	op_ring_buffer = ring_buffer_alloc(byte_size, OP_BUFFER_FLAGS);
	if (!op_ring_buffer)
		goto fail;

	for_each_possible_cpu(i) {
		struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i);

		b->last_task = NULL;
		b->last_is_kernel = -1;
		b->tracing = 0;
		b->buffer_size = buffer_size;
		b->sample_received = 0;
		b->sample_lost_overflow = 0;
		b->backtrace_aborted = 0;
		b->sample_invalid_eip = 0;
		b->cpu = i;
		INIT_DELAYED_WORK(&b->work, wq_sync_buffer);
	}
	return 0;

fail:
	free_cpu_buffers();
	return -ENOMEM;
}

void start_cpu_work(void)
{
	int i;

	work_enabled = 1;

	for_each_online_cpu(i) {
		struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i);

		/*
		 * Spread the work by 1 jiffy per cpu so they don't all
		 * fire at once.
		 */
		schedule_delayed_work_on(i, &b->work, DEFAULT_TIMER_EXPIRE + i);
	}
}

void end_cpu_work(void)
{
	work_enabled = 0;
}

void flush_cpu_work(void)
{
	int i;

	for_each_online_cpu(i) {
		struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i);

		/* these work items are per-cpu, no need for flush_sync */
		flush_delayed_work(&b->work);
	}
}

/*
 * This function prepares the cpu buffer to write a sample.
 *
 * Struct op_entry is used during operations on the ring buffer while
 * struct op_sample contains the data that is stored in the ring
 * buffer. Struct op_entry can be uninitialized. The function reserves
 * a data array of the given size. Use op_cpu_buffer_write_commit()
 * after preparing the sample. On error a NULL pointer is returned,
 * otherwise a pointer to the sample.
 */
struct op_sample
*op_cpu_buffer_write_reserve(struct op_entry *entry, unsigned long size)
{
	entry->event = ring_buffer_lock_reserve
		(op_ring_buffer, sizeof(struct op_sample) +
		 size * sizeof(entry->sample->data[0]));
	if (!entry->event)
		return NULL;
	entry->sample = ring_buffer_event_data(entry->event);
	entry->size = size;
	entry->data = entry->sample->data;

	return entry->sample;
}

int op_cpu_buffer_write_commit(struct op_entry *entry)
{
	return ring_buffer_unlock_commit(op_ring_buffer, entry->event);
}

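/*
 * Usage sketch (added for illustration, not part of the original
 * driver): a writer reserves one op_sample plus a single data word,
 * fills it in and commits it. This mirrors how op_add_code() below
 * uses the reserve/commit pair; pc, event and data are placeholder
 * values.
 *
 *	struct op_entry entry;
 *	struct op_sample *sample;
 *
 *	sample = op_cpu_buffer_write_reserve(&entry, 1);
 *	if (!sample)
 *		return -ENOMEM;
 *	sample->eip = pc;
 *	sample->event = event;
 *	op_cpu_buffer_add_data(&entry, data);
 *	return op_cpu_buffer_write_commit(&entry);
 */
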
struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu)
{
	struct ring_buffer_event *e;
	e = ring_buffer_consume(op_ring_buffer, cpu, NULL, NULL);
	if (!e)
		return NULL;

	entry->event = e;
	entry->sample = ring_buffer_event_data(e);
	entry->size = (ring_buffer_event_length(e) - sizeof(struct op_sample))
		/ sizeof(entry->sample->data[0]);
	entry->data = entry->sample->data;
	return entry->sample;
}

unsigned long op_cpu_buffer_entries(int cpu)
{
	return ring_buffer_entries_cpu(op_ring_buffer, cpu);
}

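/*
 * Reader-side sketch (illustration only; the real consumer is
 * sync_buffer() in buffer_sync.c, which drains a per-cpu buffer
 * roughly like this):
 *
 *	unsigned long i, available = op_cpu_buffer_entries(cpu);
 *	struct op_entry entry;
 *	struct op_sample *sample;
 *
 *	for (i = 0; i < available; i++) {
 *		sample = op_cpu_buffer_read_entry(&entry, cpu);
 *		if (!sample)
 *			break;
 *		... process sample->eip, sample->event and entry.data ...
 *	}
 */
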
static int
op_add_code(struct oprofile_cpu_buffer *cpu_buf, unsigned long backtrace,
	    int is_kernel, struct task_struct *task)
{
	struct op_entry entry;
	struct op_sample *sample;
	unsigned long flags;
	int size;

	flags = 0;

	if (backtrace)
		flags |= TRACE_BEGIN;

	/* notice a switch from user->kernel or vice versa */
	is_kernel = !!is_kernel;
	if (cpu_buf->last_is_kernel != is_kernel) {
		cpu_buf->last_is_kernel = is_kernel;
		flags |= KERNEL_CTX_SWITCH;
		if (is_kernel)
			flags |= IS_KERNEL;
	}

	/* notice a task switch */
	if (cpu_buf->last_task != task) {
		cpu_buf->last_task = task;
		flags |= USER_CTX_SWITCH;
	}

	if (!flags)
		/* nothing to do */
		return 0;

	if (flags & USER_CTX_SWITCH)
		size = 1;
	else
		size = 0;

	sample = op_cpu_buffer_write_reserve(&entry, size);
	if (!sample)
		return -ENOMEM;

	sample->eip = ESCAPE_CODE;
	sample->event = flags;

	if (size)
		op_cpu_buffer_add_data(&entry, (unsigned long)task);

	op_cpu_buffer_write_commit(&entry);

	return 0;
}

static inline int
op_add_sample(struct oprofile_cpu_buffer *cpu_buf,
	      unsigned long pc, unsigned long event)
{
	struct op_entry entry;
	struct op_sample *sample;

	sample = op_cpu_buffer_write_reserve(&entry, 0);
	if (!sample)
		return -ENOMEM;

	sample->eip = pc;
	sample->event = event;

	return op_cpu_buffer_write_commit(&entry);
}

/*
 * This must be safe from any context.
 *
 * is_kernel is needed because on some architectures you cannot
 * tell if you are in kernel or user space simply by looking at
 * pc. We tag this in the buffer by generating kernel enter/exit
 * events whenever is_kernel changes.
 */
static int
log_sample(struct oprofile_cpu_buffer *cpu_buf, unsigned long pc,
	   unsigned long backtrace, int is_kernel, unsigned long event,
	   struct task_struct *task)
{
	struct task_struct *tsk = task ? task : current;
	cpu_buf->sample_received++;

	if (pc == ESCAPE_CODE) {
		cpu_buf->sample_invalid_eip++;
		return 0;
	}

	if (op_add_code(cpu_buf, backtrace, is_kernel, tsk))
		goto fail;

	if (op_add_sample(cpu_buf, pc, event))
		goto fail;

	return 1;

fail:
	cpu_buf->sample_lost_overflow++;
	return 0;
}

static inline void oprofile_begin_trace(struct oprofile_cpu_buffer *cpu_buf)
{
	cpu_buf->tracing = 1;
}

static inline void oprofile_end_trace(struct oprofile_cpu_buffer *cpu_buf)
{
	cpu_buf->tracing = 0;
}

static inline void
__oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
			  unsigned long event, int is_kernel,
			  struct task_struct *task)
{
	struct oprofile_cpu_buffer *cpu_buf = this_cpu_ptr(&op_cpu_buffer);
	unsigned long backtrace = oprofile_backtrace_depth;

	/*
	 * if log_sample() fails we can't backtrace since we lost the
	 * source of this event
	 */
	if (!log_sample(cpu_buf, pc, backtrace, is_kernel, event, task))
		/* failed */
		return;

	if (!backtrace)
		return;

	oprofile_begin_trace(cpu_buf);
	oprofile_ops.backtrace(regs, backtrace);
	oprofile_end_trace(cpu_buf);
}

void oprofile_add_ext_hw_sample(unsigned long pc, struct pt_regs * const regs,
				unsigned long event, int is_kernel,
				struct task_struct *task)
{
	__oprofile_add_ext_sample(pc, regs, event, is_kernel, task);
}

void oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
			     unsigned long event, int is_kernel)
{
	__oprofile_add_ext_sample(pc, regs, event, is_kernel, NULL);
}

void oprofile_add_sample(struct pt_regs * const regs, unsigned long event)
{
	int is_kernel;
	unsigned long pc;

	if (likely(regs)) {
		is_kernel = !user_mode(regs);
		pc = profile_pc(regs);
	} else {
		is_kernel = 0;    /* This value will not be used */
		pc = ESCAPE_CODE; /* as this causes an early return. */
	}

	__oprofile_add_ext_sample(pc, regs, event, is_kernel, NULL);
}

/*
 * Add samples with data to the ring buffer.
 *
 * Use oprofile_add_data(&entry, val) to add data and
 * oprofile_write_commit(&entry) to commit the sample.
 */
void
oprofile_write_reserve(struct op_entry *entry, struct pt_regs * const regs,
		       unsigned long pc, int code, int size)
{
	struct op_sample *sample;
	int is_kernel = !user_mode(regs);
	struct oprofile_cpu_buffer *cpu_buf = this_cpu_ptr(&op_cpu_buffer);

	cpu_buf->sample_received++;

	/* no backtraces for samples with data */
	if (op_add_code(cpu_buf, 0, is_kernel, current))
		goto fail;

	sample = op_cpu_buffer_write_reserve(entry, size + 2);
	if (!sample)
		goto fail;
	sample->eip = ESCAPE_CODE;
	sample->event = 0;		/* no flags */

	op_cpu_buffer_add_data(entry, code);
	op_cpu_buffer_add_data(entry, pc);

	return;

fail:
	entry->event = NULL;
	cpu_buf->sample_lost_overflow++;
}

int oprofile_add_data(struct op_entry *entry, unsigned long val)
{
	if (!entry->event)
		return 0;
	return op_cpu_buffer_add_data(entry, val);
}

int oprofile_add_data64(struct op_entry *entry, u64 val)
{
	if (!entry->event)
		return 0;
	if (op_cpu_buffer_get_size(entry) < 2)
		/*
		 * the function returns 0 to indicate a too small
		 * buffer, even if there is some space left
		 */
		return 0;
	if (!op_cpu_buffer_add_data(entry, (u32)val))
		return 0;
	return op_cpu_buffer_add_data(entry, (u32)(val >> 32));
}

int oprofile_write_commit(struct op_entry *entry)
{
	if (!entry->event)
		return -EINVAL;
	return op_cpu_buffer_write_commit(entry);
}

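/*
 * Usage sketch for the three calls above (illustration only; the
 * in-tree user is the AMD IBS model code, and pc/code/val/val64 are
 * placeholders). Note that oprofile_add_data64() consumes two of the
 * reserved data words, so three words are reserved here:
 *
 *	struct op_entry entry;
 *
 *	oprofile_write_reserve(&entry, regs, pc, code, 3);
 *	oprofile_add_data(&entry, val);
 *	oprofile_add_data64(&entry, val64);
 *	oprofile_write_commit(&entry);
 */
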
void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event)
{
	struct oprofile_cpu_buffer *cpu_buf = this_cpu_ptr(&op_cpu_buffer);
	log_sample(cpu_buf, pc, 0, is_kernel, event, NULL);
}

void oprofile_add_trace(unsigned long pc)
{
	struct oprofile_cpu_buffer *cpu_buf = this_cpu_ptr(&op_cpu_buffer);

	if (!cpu_buf->tracing)
		return;

	/*
	 * a broken frame can give an eip with the same value as an
	 * escape code, abort the trace if we get it
	 */
	if (pc == ESCAPE_CODE)
		goto fail;

	if (op_add_sample(cpu_buf, pc, 0))
		goto fail;

	return;
fail:
	cpu_buf->tracing = 0;
	cpu_buf->backtrace_aborted++;
	return;
}

/*
 * This serves to avoid cpu buffer overflow, and makes sure
 * the task mortuary progresses.
 *
 * By using schedule_delayed_work_on and then schedule_delayed_work
 * we guarantee this will stay on the correct cpu.
 */
static void wq_sync_buffer(struct work_struct *work)
{
	struct oprofile_cpu_buffer *b =
		container_of(work, struct oprofile_cpu_buffer, work.work);
	if (b->cpu != smp_processor_id() && !cpu_online(b->cpu)) {
		cancel_delayed_work(&b->work);
		return;
	}
	sync_buffer(b->cpu);

	/* don't re-add the work if we're shutting down */
	if (work_enabled)
		schedule_delayed_work(&b->work, DEFAULT_TIMER_EXPIRE);
}
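
/*
 * Backtrace path sketch (illustration; arch code provides
 * oprofile_ops.backtrace, and unwind_next_frame() here is a
 * hypothetical unwinder): between oprofile_begin_trace() and
 * oprofile_end_trace(), the callback reports one address per frame
 * via oprofile_add_trace():
 *
 *	static void arch_backtrace(struct pt_regs * const regs,
 *				   unsigned int depth)
 *	{
 *		unsigned long addr;
 *
 *		while (depth-- && unwind_next_frame(regs, &addr))
 *			oprofile_add_trace(addr);
 *	}
 */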