1// Copyright 2019 The Bazel Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5package starlark 6 7// This file defines a simple execution-time profiler for Starlark. 8// It measures the wall time spent executing Starlark code, and emits a 9// gzipped protocol message in pprof format (github.com/google/pprof). 10// 11// When profiling is enabled, the interpreter calls the profiler to 12// indicate the start and end of each "span" or time interval. A leaf 13// function (whether Go or Starlark) has a single span. A function that 14// calls another function has spans for each interval in which it is the 15// top of the stack. (A LOAD instruction also ends a span.) 16// 17// At the start of a span, the interpreter records the current time in 18// the thread's topmost frame. At the end of the span, it obtains the 19// time again and subtracts the span start time. The difference is added 20// to an accumulator variable in the thread. If the accumulator exceeds 21// some fixed quantum (10ms, say), the profiler records the current call 22// stack and sends it to the profiler goroutine, along with the number 23// of quanta, which are subtracted. For example, if the accumulator 24// holds 3ms and then a completed span adds 25ms to it, its value is 28ms, 25// which exceeeds 10ms. The profiler records a stack with the value 20ms 26// (2 quanta), and the accumulator is left with 8ms. 27// 28// The profiler goroutine converts the stacks into the pprof format and 29// emits a gzip-compressed protocol message to the designated output 30// file. We use a hand-written streaming proto encoder to avoid 31// dependencies on pprof and proto, and to avoid the need to 32// materialize the profile data structure in memory. 33// 34// A limitation of this profiler is that it measures wall time, which 35// does not necessarily correspond to CPU time. A CPU profiler requires 36// that only running (not runnable) threads are sampled; this is 37// commonly achieved by having the kernel deliver a (PROF) signal to an 38// arbitrary running thread, through setitimer(2). The CPU profiler in the 39// Go runtime uses this mechanism, but it is not possible for a Go 40// application to register a SIGPROF handler, nor is it possible for a 41// Go handler for some other signal to read the stack pointer of 42// the interrupted thread. 43// 44// Two caveats: 45// (1) it is tempting to send the leaf Frame directly to the profiler 46// goroutine instead of making a copy of the stack, since a Frame is a 47// spaghetti stack--a linked list. However, as soon as execution 48// resumes, the stack's Frame.pc values may be mutated, so Frames are 49// not safe to share with the asynchronous profiler goroutine. 50// (2) it is tempting to use Callables as keys in a map when tabulating 51// the pprof protocols's Function entities. However, we cannot assume 52// that Callables are valid map keys, and furthermore we must not 53// pin function values in memory indefinitely as this may cause lambda 54// values to keep their free variables live much longer than necessary. 55 56// TODO(adonovan): 57// - make Start/Stop fully thread-safe. 58// - fix the pc hack. 59// - experiment with other values of quantum. 60 61import ( 62 "bufio" 63 "bytes" 64 "compress/gzip" 65 "encoding/binary" 66 "fmt" 67 "io" 68 "log" 69 "reflect" 70 "sync/atomic" 71 "time" 72 "unsafe" 73 74 "go.starlark.net/syntax" 75) 76 77// StartProfile enables time profiling of all Starlark threads, 78// and writes a profile in pprof format to w. 79// It must be followed by a call to StopProfiler to stop 80// the profiler and finalize the profile. 81// 82// StartProfile returns an error if profiling was already enabled. 83// 84// StartProfile must not be called concurrently with Starlark execution. 85func StartProfile(w io.Writer) error { 86 if !atomic.CompareAndSwapUint32(&profiler.on, 0, 1) { 87 return fmt.Errorf("profiler already running") 88 } 89 90 // TODO(adonovan): make the API fully concurrency-safe. 91 // The main challenge is racy reads/writes of profiler.events, 92 // and of send/close races on the channel it refers to. 93 // It's easy to solve them with a mutex but harder to do 94 // it efficiently. 95 96 profiler.events = make(chan *profEvent, 1) 97 profiler.done = make(chan error) 98 99 go profile(w) 100 101 return nil 102} 103 104// StopProfiler stops the profiler started by a prior call to 105// StartProfile and finalizes the profile. It returns an error if the 106// profile could not be completed. 107// 108// StopProfiler must not be called concurrently with Starlark execution. 109func StopProfile() error { 110 // Terminate the profiler goroutine and get its result. 111 close(profiler.events) 112 err := <-profiler.done 113 114 profiler.done = nil 115 profiler.events = nil 116 atomic.StoreUint32(&profiler.on, 0) 117 118 return err 119} 120 121// globals 122var profiler struct { 123 on uint32 // nonzero => profiler running 124 events chan *profEvent // profile events from interpreter threads 125 done chan error // indicates profiler goroutine is ready 126} 127 128func (thread *Thread) beginProfSpan() { 129 if profiler.events == nil { 130 return // profiling not enabled 131 } 132 133 thread.frameAt(0).spanStart = nanotime() 134} 135 136// TODO(adonovan): experiment with smaller values, 137// which trade space and time for greater precision. 138const quantum = 10 * time.Millisecond 139 140func (thread *Thread) endProfSpan() { 141 if profiler.events == nil { 142 return // profiling not enabled 143 } 144 145 // Add the span to the thread's accumulator. 146 thread.proftime += time.Duration(nanotime() - thread.frameAt(0).spanStart) 147 if thread.proftime < quantum { 148 return 149 } 150 151 // Only record complete quanta. 152 n := thread.proftime / quantum 153 thread.proftime -= n * quantum 154 155 // Copy the stack. 156 // (We can't save thread.frame because its pc will change.) 157 ev := &profEvent{ 158 thread: thread, 159 time: n * quantum, 160 } 161 ev.stack = ev.stackSpace[:0] 162 for i := range thread.stack { 163 fr := thread.frameAt(i) 164 ev.stack = append(ev.stack, profFrame{ 165 pos: fr.Position(), 166 fn: fr.Callable(), 167 pc: fr.pc, 168 }) 169 } 170 171 profiler.events <- ev 172} 173 174type profEvent struct { 175 thread *Thread // currently unused 176 time time.Duration 177 stack []profFrame 178 stackSpace [8]profFrame // initial space for stack 179} 180 181type profFrame struct { 182 fn Callable // don't hold this live for too long (prevents GC of lambdas) 183 pc uint32 // program counter (Starlark frames only) 184 pos syntax.Position // position of pc within this frame 185} 186 187// profile is the profiler goroutine. 188// It runs until StopProfiler is called. 189func profile(w io.Writer) { 190 // Field numbers from pprof protocol. 191 // See https://github.com/google/pprof/blob/master/proto/profile.proto 192 const ( 193 Profile_sample_type = 1 // repeated ValueType 194 Profile_sample = 2 // repeated Sample 195 Profile_mapping = 3 // repeated Mapping 196 Profile_location = 4 // repeated Location 197 Profile_function = 5 // repeated Function 198 Profile_string_table = 6 // repeated string 199 Profile_time_nanos = 9 // int64 200 Profile_duration_nanos = 10 // int64 201 Profile_period_type = 11 // ValueType 202 Profile_period = 12 // int64 203 204 ValueType_type = 1 // int64 205 ValueType_unit = 2 // int64 206 207 Sample_location_id = 1 // repeated uint64 208 Sample_value = 2 // repeated int64 209 Sample_label = 3 // repeated Label 210 211 Label_key = 1 // int64 212 Label_str = 2 // int64 213 Label_num = 3 // int64 214 Label_num_unit = 4 // int64 215 216 Location_id = 1 // uint64 217 Location_mapping_id = 2 // uint64 218 Location_address = 3 // uint64 219 Location_line = 4 // repeated Line 220 221 Line_function_id = 1 // uint64 222 Line_line = 2 // int64 223 224 Function_id = 1 // uint64 225 Function_name = 2 // int64 226 Function_system_name = 3 // int64 227 Function_filename = 4 // int64 228 Function_start_line = 5 // int64 229 ) 230 231 bufw := bufio.NewWriter(w) // write file in 4KB (not 240B flate-sized) chunks 232 gz := gzip.NewWriter(bufw) 233 enc := protoEncoder{w: gz} 234 235 // strings 236 stringIndex := make(map[string]int64) 237 str := func(s string) int64 { 238 i, ok := stringIndex[s] 239 if !ok { 240 i = int64(len(stringIndex)) 241 enc.string(Profile_string_table, s) 242 stringIndex[s] = i 243 } 244 return i 245 } 246 str("") // entry 0 247 248 // functions 249 // 250 // function returns the ID of a Callable for use in Line.FunctionId. 251 // The ID is the same as the function's logical address, 252 // which is supplied by the caller to avoid the need to recompute it. 253 functionId := make(map[uintptr]uint64) 254 function := func(fn Callable, addr uintptr) uint64 { 255 id, ok := functionId[addr] 256 if !ok { 257 id = uint64(addr) 258 259 var pos syntax.Position 260 if fn, ok := fn.(callableWithPosition); ok { 261 pos = fn.Position() 262 } 263 264 name := fn.Name() 265 if name == "<toplevel>" { 266 name = pos.Filename() 267 } 268 269 nameIndex := str(name) 270 271 fun := new(bytes.Buffer) 272 funenc := protoEncoder{w: fun} 273 funenc.uint(Function_id, id) 274 funenc.int(Function_name, nameIndex) 275 funenc.int(Function_system_name, nameIndex) 276 funenc.int(Function_filename, str(pos.Filename())) 277 funenc.int(Function_start_line, int64(pos.Line)) 278 enc.bytes(Profile_function, fun.Bytes()) 279 280 functionId[addr] = id 281 } 282 return id 283 } 284 285 // locations 286 // 287 // location returns the ID of the location denoted by fr. 288 // For Starlark frames, this is the Frame pc. 289 locationId := make(map[uintptr]uint64) 290 location := func(fr profFrame) uint64 { 291 fnAddr := profFuncAddr(fr.fn) 292 293 // For Starlark functions, the frame position 294 // represents the current PC value. 295 // Mix it into the low bits of the address. 296 // This is super hacky and may result in collisions 297 // in large functions or if functions are numerous. 298 // TODO(adonovan): fix: try making this cleaner by treating 299 // each bytecode segment as a Profile.Mapping. 300 pcAddr := fnAddr 301 if _, ok := fr.fn.(*Function); ok { 302 pcAddr = (pcAddr << 16) ^ uintptr(fr.pc) 303 } 304 305 id, ok := locationId[pcAddr] 306 if !ok { 307 id = uint64(pcAddr) 308 309 line := new(bytes.Buffer) 310 lineenc := protoEncoder{w: line} 311 lineenc.uint(Line_function_id, function(fr.fn, fnAddr)) 312 lineenc.int(Line_line, int64(fr.pos.Line)) 313 loc := new(bytes.Buffer) 314 locenc := protoEncoder{w: loc} 315 locenc.uint(Location_id, id) 316 locenc.uint(Location_address, uint64(pcAddr)) 317 locenc.bytes(Location_line, line.Bytes()) 318 enc.bytes(Profile_location, loc.Bytes()) 319 320 locationId[pcAddr] = id 321 } 322 return id 323 } 324 325 wallNanos := new(bytes.Buffer) 326 wnenc := protoEncoder{w: wallNanos} 327 wnenc.int(ValueType_type, str("wall")) 328 wnenc.int(ValueType_unit, str("nanoseconds")) 329 330 // informational fields of Profile 331 enc.bytes(Profile_sample_type, wallNanos.Bytes()) 332 enc.int(Profile_period, quantum.Nanoseconds()) // magnitude of sampling period 333 enc.bytes(Profile_period_type, wallNanos.Bytes()) // dimension and unit of period 334 enc.int(Profile_time_nanos, time.Now().UnixNano()) // start (real) time of profile 335 336 startNano := nanotime() 337 338 // Read profile events from the channel 339 // until it is closed by StopProfiler. 340 for e := range profiler.events { 341 sample := new(bytes.Buffer) 342 sampleenc := protoEncoder{w: sample} 343 sampleenc.int(Sample_value, e.time.Nanoseconds()) // wall nanoseconds 344 for _, fr := range e.stack { 345 sampleenc.uint(Sample_location_id, location(fr)) 346 } 347 enc.bytes(Profile_sample, sample.Bytes()) 348 } 349 350 endNano := nanotime() 351 enc.int(Profile_duration_nanos, endNano-startNano) 352 353 err := gz.Close() // Close reports any prior write error 354 if flushErr := bufw.Flush(); err == nil { 355 err = flushErr 356 } 357 profiler.done <- err 358} 359 360// nanotime returns the time in nanoseconds since epoch. 361// It is implemented by runtime.nanotime using the linkname hack; 362// runtime.nanotime is defined for all OSs/ARCHS and uses the 363// monotonic system clock, which there is no portable way to access. 364// Should that function ever go away, these alternatives exist: 365// 366// // POSIX only. REALTIME not MONOTONIC. 17ns. 367// var tv syscall.Timeval 368// syscall.Gettimeofday(&tv) // can't fail 369// return tv.Nano() 370// 371// // Portable. REALTIME not MONOTONIC. 46ns. 372// return time.Now().Nanoseconds() 373// 374// // POSIX only. Adds a dependency. 375// import "golang.org/x/sys/unix" 376// var ts unix.Timespec 377// unix.ClockGettime(CLOCK_MONOTONIC, &ts) // can't fail 378// return unix.TimespecToNsec(ts) 379// 380//go:linkname nanotime runtime.nanotime 381func nanotime() int64 382 383// profFuncAddr returns the canonical "address" 384// of a Callable for use by the profiler. 385func profFuncAddr(fn Callable) uintptr { 386 switch fn := fn.(type) { 387 case *Builtin: 388 return reflect.ValueOf(fn.fn).Pointer() 389 case *Function: 390 return uintptr(unsafe.Pointer(fn.funcode)) 391 } 392 393 // User-defined callable types are typically of 394 // of kind pointer-to-struct. Handle them specially. 395 if v := reflect.ValueOf(fn); v.Type().Kind() == reflect.Ptr { 396 return v.Pointer() 397 } 398 399 // Address zero is reserved by the protocol. 400 // Use 1 for callables we don't recognize. 401 log.Printf("Starlark profiler: no address for Callable %T", fn) 402 return 1 403} 404 405// We encode the protocol message by hand to avoid making 406// the interpreter depend on both github.com/google/pprof 407// and github.com/golang/protobuf. 408// 409// This also avoids the need to materialize a protocol message object 410// tree of unbounded size and serialize it all at the end. 411// The pprof format appears to have been designed to 412// permit streaming implementations such as this one. 413// 414// See https://developers.google.com/protocol-buffers/docs/encoding. 415type protoEncoder struct { 416 w io.Writer // *bytes.Buffer or *gzip.Writer 417 tmp [binary.MaxVarintLen64]byte 418} 419 420func (e *protoEncoder) uvarint(x uint64) { 421 n := binary.PutUvarint(e.tmp[:], x) 422 e.w.Write(e.tmp[:n]) 423} 424 425func (e *protoEncoder) tag(field, wire uint) { 426 e.uvarint(uint64(field<<3 | wire)) 427} 428 429func (e *protoEncoder) string(field uint, s string) { 430 e.tag(field, 2) // length-delimited 431 e.uvarint(uint64(len(s))) 432 io.WriteString(e.w, s) 433} 434 435func (e *protoEncoder) bytes(field uint, b []byte) { 436 e.tag(field, 2) // length-delimited 437 e.uvarint(uint64(len(b))) 438 e.w.Write(b) 439} 440 441func (e *protoEncoder) uint(field uint, x uint64) { 442 e.tag(field, 0) // varint 443 e.uvarint(x) 444} 445 446func (e *protoEncoder) int(field uint, x int64) { 447 e.tag(field, 0) // varint 448 e.uvarint(uint64(x)) 449} 450