1// Copyright 2009 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Malloc profiling.
6// Patterned after tcmalloc's algorithms; shorter code.
7
8package runtime
9
10import (
11	"internal/abi"
12	"internal/goarch"
13	"internal/profilerecord"
14	"internal/runtime/atomic"
15	"runtime/internal/sys"
16	"unsafe"
17)
18
19// NOTE(rsc): Everything here could use cas if contention became an issue.
20var (
21	// profInsertLock protects changes to the start of all *bucket linked lists
22	profInsertLock mutex
23	// profBlockLock protects the contents of every blockRecord struct
24	profBlockLock mutex
25	// profMemActiveLock protects the active field of every memRecord struct
26	profMemActiveLock mutex
27	// profMemFutureLock is a set of locks that protect the respective elements
28	// of the future array of every memRecord struct
29	profMemFutureLock [len(memRecord{}.future)]mutex
30)
31
32// All memory allocations are local and do not escape outside of the profiler.
33// The profiler is forbidden from referring to garbage-collected memory.
34
35const (
36	// profile types
37	memProfile bucketType = 1 + iota
38	blockProfile
39	mutexProfile
40
41	// size of bucket hash table
42	buckHashSize = 179999
43
44	// maxSkip is to account for deferred inline expansion
45	// when using frame pointer unwinding. We record the stack
46	// with "physical" frame pointers but handle skipping "logical"
47	// frames at some point after collecting the stack. So
48	// we need extra space in order to avoid getting fewer than the
49	// desired maximum number of frames after expansion.
50	// This should be at least as large as the largest skip value
51	// used for profiling; otherwise stacks may be truncated inconsistently.
52	maxSkip = 5
53
54	// maxProfStackDepth is the highest valid value for debug.profstackdepth.
55	// It's used for the bucket.stk func.
56	// TODO(fg): can we get rid of this?
57	maxProfStackDepth = 1024
58)
59
60type bucketType int
61
62// A bucket holds per-call-stack profiling information.
63// The representation is a bit sleazy, inherited from C.
64// This struct defines the bucket header. It is followed in
65// memory by the stack words and then the actual record
66// data, either a memRecord or a blockRecord.
67//
68// Per-call-stack profiling information.
69// Lookup by hashing call stack into a linked-list hash table.
70//
71// None of the fields in this bucket header are modified after
72// creation, including its next and allnext links.
73//
74// No heap pointers.
75type bucket struct {
76	_       sys.NotInHeap
77	next    *bucket
78	allnext *bucket
79	typ     bucketType // memProfile or blockProfile (includes mutexProfile)
80	hash    uintptr
81	size    uintptr
82	nstk    uintptr
83}
84
85// A memRecord is the bucket data for a bucket of type memProfile,
86// part of the memory profile.
87type memRecord struct {
88	// The following complex 3-stage scheme of stats accumulation
89	// is required to obtain a consistent picture of mallocs and frees
90	// for some point in time.
91	// The problem is that mallocs come in real time, while frees
92	// come only after a GC during concurrent sweeping. So if we
93	// naively counted them, we would get a skew toward mallocs.
94	//
95	// Hence, we delay information to get consistent snapshots as
96	// of mark termination. Allocations count toward the next mark
97	// termination's snapshot, while sweep frees count toward the
98	// previous mark termination's snapshot:
99	//
100	//              MT          MT          MT          MT
101	//             .·|         .·|         .·|         .·|
102	//          .·˙  |      .·˙  |      .·˙  |      .·˙  |
103	//       .·˙     |   .·˙     |   .·˙     |   .·˙     |
104	//    .·˙        |.·˙        |.·˙        |.·˙        |
105	//
106	//       alloc → ▲ ← free
107	//               ┠┅┅┅┅┅┅┅┅┅┅┅P
108	//       C+2     →    C+1    →  C
109	//
110	//                   alloc → ▲ ← free
111	//                           ┠┅┅┅┅┅┅┅┅┅┅┅P
112	//                   C+2     →    C+1    →  C
113	//
114	// Since we can't publish a consistent snapshot until all of
115	// the sweep frees are accounted for, we wait until the next
116	// mark termination ("MT" above) to publish the previous mark
117	// termination's snapshot ("P" above). To do this, allocation
118	// and free events are accounted to *future* heap profile
119	// cycles ("C+n" above) and we only publish a cycle once all
120	// of the events from that cycle are known to be done. Specifically:
121	//
122	// Mallocs are accounted to cycle C+2.
123	// Explicit frees are accounted to cycle C+2.
124	// GC frees (done during sweeping) are accounted to cycle C+1.
125	//
126	// After mark termination, we increment the global heap
127	// profile cycle counter and accumulate the stats from cycle C
128	// into the active profile.
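	//
	// A small worked example of the indexing (illustrative, not part of
	// the original comment): suppose the global cycle counter currently
	// reads 7. Then mProf_Malloc records into future[(7+2)%3] = future[0],
	// sweep frees recorded by mProf_Free land in future[(7+1)%3] =
	// future[2], and future[7%3] = future[1] is the slot that was folded
	// into the active profile when the counter reached 7. When the counter
	// advances to 8 at the next mark termination, mProf_Flush folds
	// future[8%3] = future[2], whose events are then complete, into the
	// active profile and clears it for reuse.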
129
130	// active is the currently published profile. A profiling
131	// cycle can be accumulated into active once it's complete.
132	active memRecordCycle
133
134	// future records the profile events we're counting for cycles
135	// that have not yet been published. This is a ring buffer
136	// indexed by the global heap profile cycle C and stores
137	// cycles C, C+1, and C+2. Unlike active, these counts are
138	// only for a single cycle; they are not cumulative across
139	// cycles.
140	//
141	// We store cycle C here because there's a window between when
142	// C becomes the active cycle and when we've flushed it to
143	// active.
144	future [3]memRecordCycle
145}
146
147// A memRecordCycle holds the allocation and free counts accumulated for a single heap profile cycle.
148type memRecordCycle struct {
149	allocs, frees           uintptr
150	alloc_bytes, free_bytes uintptr
151}
152
153// add accumulates b into a. It does not zero b.
154func (a *memRecordCycle) add(b *memRecordCycle) {
155	a.allocs += b.allocs
156	a.frees += b.frees
157	a.alloc_bytes += b.alloc_bytes
158	a.free_bytes += b.free_bytes
159}
160
161// A blockRecord is the bucket data for a bucket of type blockProfile,
162// which is used in blocking and mutex profiles.
163type blockRecord struct {
164	count  float64
165	cycles int64
166}
167
168var (
169	mbuckets atomic.UnsafePointer // *bucket, memory profile buckets
170	bbuckets atomic.UnsafePointer // *bucket, blocking profile buckets
171	xbuckets atomic.UnsafePointer // *bucket, mutex profile buckets
172	buckhash atomic.UnsafePointer // *buckhashArray
173
174	mProfCycle mProfCycleHolder
175)
176
177type buckhashArray [buckHashSize]atomic.UnsafePointer // *bucket
178
179const mProfCycleWrap = uint32(len(memRecord{}.future)) * (2 << 24)
180
181// mProfCycleHolder holds the global heap profile cycle number (wrapped at
182// mProfCycleWrap, stored starting at bit 1), and a flag (stored at bit 0) to
183// indicate whether future[cycle] in all buckets has been queued to flush into
184// the active profile.
185type mProfCycleHolder struct {
186	value atomic.Uint32
187}
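
// A small illustrative example of the encoding (an addition, not from the
// original source): a stored value of 0b1011 encodes cycle 0b101 = 5 with the
// flushed flag set, so read() returns 5 and setFlushed() reports
// alreadyFlushed = true. increment() would replace it with
// ((5+1) % mProfCycleWrap) << 1 = 12, i.e. cycle 6 with the flag cleared.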
188
189// read returns the current cycle count.
190func (c *mProfCycleHolder) read() (cycle uint32) {
191	v := c.value.Load()
192	cycle = v >> 1
193	return cycle
194}
195
196// setFlushed sets the flushed flag. It returns the current cycle count and the
197// previous value of the flushed flag.
198func (c *mProfCycleHolder) setFlushed() (cycle uint32, alreadyFlushed bool) {
199	for {
200		prev := c.value.Load()
201		cycle = prev >> 1
202		alreadyFlushed = (prev & 0x1) != 0
203		next := prev | 0x1
204		if c.value.CompareAndSwap(prev, next) {
205			return cycle, alreadyFlushed
206		}
207	}
208}
209
210// increment increases the cycle count by one, wrapping the value at
211// mProfCycleWrap. It clears the flushed flag.
212func (c *mProfCycleHolder) increment() {
213	// We explicitly wrap mProfCycle rather than depending on
214	// uint wraparound because the memRecord.future ring does not
215	// itself wrap at a power of two.
216	for {
217		prev := c.value.Load()
218		cycle := prev >> 1
219		cycle = (cycle + 1) % mProfCycleWrap
220		next := cycle << 1
221		if c.value.CompareAndSwap(prev, next) {
222			break
223		}
224	}
225}
226
227// newBucket allocates a bucket with the given type and number of stack entries.
228func newBucket(typ bucketType, nstk int) *bucket {
229	size := unsafe.Sizeof(bucket{}) + uintptr(nstk)*unsafe.Sizeof(uintptr(0))
230	switch typ {
231	default:
232		throw("invalid profile bucket type")
233	case memProfile:
234		size += unsafe.Sizeof(memRecord{})
235	case blockProfile, mutexProfile:
236		size += unsafe.Sizeof(blockRecord{})
237	}
238
239	b := (*bucket)(persistentalloc(size, 0, &memstats.buckhash_sys))
240	b.typ = typ
241	b.nstk = uintptr(nstk)
242	return b
243}
244
245// stk returns the slice in b holding the stack. The caller can assume that the
246// backing array is immutable.
247func (b *bucket) stk() []uintptr {
248	stk := (*[maxProfStackDepth]uintptr)(add(unsafe.Pointer(b), unsafe.Sizeof(*b)))
249	if b.nstk > maxProfStackDepth {
250		// prove that slicing works; otherwise a failure requires a P
251		throw("bad profile stack count")
252	}
253	return stk[:b.nstk:b.nstk]
254}
255
256// mp returns the memRecord associated with the memProfile bucket b.
257func (b *bucket) mp() *memRecord {
258	if b.typ != memProfile {
259		throw("bad use of bucket.mp")
260	}
261	data := add(unsafe.Pointer(b), unsafe.Sizeof(*b)+b.nstk*unsafe.Sizeof(uintptr(0)))
262	return (*memRecord)(data)
263}
264
265// bp returns the blockRecord associated with the blockProfile bucket b.
266func (b *bucket) bp() *blockRecord {
267	if b.typ != blockProfile && b.typ != mutexProfile {
268		throw("bad use of bucket.bp")
269	}
270	data := add(unsafe.Pointer(b), unsafe.Sizeof(*b)+b.nstk*unsafe.Sizeof(uintptr(0)))
271	return (*blockRecord)(data)
272}
273
274// Return the bucket for stk[0:nstk], allocating new bucket if needed.
275func stkbucket(typ bucketType, size uintptr, stk []uintptr, alloc bool) *bucket {
276	bh := (*buckhashArray)(buckhash.Load())
277	if bh == nil {
278		lock(&profInsertLock)
279		// check again under the lock
280		bh = (*buckhashArray)(buckhash.Load())
281		if bh == nil {
282			bh = (*buckhashArray)(sysAlloc(unsafe.Sizeof(buckhashArray{}), &memstats.buckhash_sys))
283			if bh == nil {
284				throw("runtime: cannot allocate memory")
285			}
286			buckhash.StoreNoWB(unsafe.Pointer(bh))
287		}
288		unlock(&profInsertLock)
289	}
290
291	// Hash stack.
292	var h uintptr
293	for _, pc := range stk {
294		h += pc
295		h += h << 10
296		h ^= h >> 6
297	}
298	// hash in size
299	h += size
300	h += h << 10
301	h ^= h >> 6
302	// finalize
303	h += h << 3
304	h ^= h >> 11
305
306	i := int(h % buckHashSize)
307	// first check optimistically, without the lock
308	for b := (*bucket)(bh[i].Load()); b != nil; b = b.next {
309		if b.typ == typ && b.hash == h && b.size == size && eqslice(b.stk(), stk) {
310			return b
311		}
312	}
313
314	if !alloc {
315		return nil
316	}
317
318	lock(&profInsertLock)
319	// check again under the insertion lock
320	for b := (*bucket)(bh[i].Load()); b != nil; b = b.next {
321		if b.typ == typ && b.hash == h && b.size == size && eqslice(b.stk(), stk) {
322			unlock(&profInsertLock)
323			return b
324		}
325	}
326
327	// Create new bucket.
328	b := newBucket(typ, len(stk))
329	copy(b.stk(), stk)
330	b.hash = h
331	b.size = size
332
333	var allnext *atomic.UnsafePointer
334	if typ == memProfile {
335		allnext = &mbuckets
336	} else if typ == mutexProfile {
337		allnext = &xbuckets
338	} else {
339		allnext = &bbuckets
340	}
341
342	b.next = (*bucket)(bh[i].Load())
343	b.allnext = (*bucket)(allnext.Load())
344
345	bh[i].StoreNoWB(unsafe.Pointer(b))
346	allnext.StoreNoWB(unsafe.Pointer(b))
347
348	unlock(&profInsertLock)
349	return b
350}
351
352func eqslice(x, y []uintptr) bool {
353	if len(x) != len(y) {
354		return false
355	}
356	for i, xi := range x {
357		if xi != y[i] {
358			return false
359		}
360	}
361	return true
362}
363
364// mProf_NextCycle publishes the next heap profile cycle and creates a
365// fresh heap profile cycle. This operation is fast and can be done
366// during STW. The caller must call mProf_Flush before calling
367// mProf_NextCycle again.
368//
369// This is called by mark termination during STW so allocations and
370// frees after the world is started again count towards a new heap
371// profiling cycle.
372func mProf_NextCycle() {
373	mProfCycle.increment()
374}
375
376// mProf_Flush flushes the events from the current heap profiling
377// cycle into the active profile. After this it is safe to start a new
378// heap profiling cycle with mProf_NextCycle.
379//
380// This is called by GC after mark termination starts the world. In
381// contrast with mProf_NextCycle, this is somewhat expensive, but safe
382// to do concurrently.
383func mProf_Flush() {
384	cycle, alreadyFlushed := mProfCycle.setFlushed()
385	if alreadyFlushed {
386		return
387	}
388
389	index := cycle % uint32(len(memRecord{}.future))
390	lock(&profMemActiveLock)
391	lock(&profMemFutureLock[index])
392	mProf_FlushLocked(index)
393	unlock(&profMemFutureLock[index])
394	unlock(&profMemActiveLock)
395}
396
397// mProf_FlushLocked flushes the events from the heap profiling cycle at index
398// into the active profile. The caller must hold the lock for the active profile
399// (profMemActiveLock) and for the profiling cycle at index
400// (profMemFutureLock[index]).
401func mProf_FlushLocked(index uint32) {
402	assertLockHeld(&profMemActiveLock)
403	assertLockHeld(&profMemFutureLock[index])
404	head := (*bucket)(mbuckets.Load())
405	for b := head; b != nil; b = b.allnext {
406		mp := b.mp()
407
408		// Flush cycle C into the published profile and clear
409		// it for reuse.
410		mpc := &mp.future[index]
411		mp.active.add(mpc)
412		*mpc = memRecordCycle{}
413	}
414}
415
416// mProf_PostSweep records that all sweep frees for this GC cycle have
417// completed. This has the effect of publishing the heap profile
418// snapshot as of the last mark termination without advancing the heap
419// profile cycle.
420func mProf_PostSweep() {
421	// Flush cycle C+1 to the active profile so everything as of
422	// the last mark termination becomes visible. *Don't* advance
423	// the cycle, since we're still accumulating allocs in cycle
424	// C+2, which have to become C+1 in the next mark termination
425	// and so on.
426	cycle := mProfCycle.read() + 1
427
428	index := cycle % uint32(len(memRecord{}.future))
429	lock(&profMemActiveLock)
430	lock(&profMemFutureLock[index])
431	mProf_FlushLocked(index)
432	unlock(&profMemFutureLock[index])
433	unlock(&profMemActiveLock)
434}
435
436// Called by malloc to record a profiled block.
437func mProf_Malloc(mp *m, p unsafe.Pointer, size uintptr) {
438	if mp.profStack == nil {
439		// mp.profStack is nil if we happen to sample an allocation during the
440		// initialization of mp. This case is rare, so we just ignore such
441		// allocations. Change MemProfileRate to 1 if you need to reproduce such
442		// cases for testing purposes.
443		return
444	}
445	// Only use the part of mp.profStack we need and ignore the extra space
446	// reserved for delayed inline expansion with frame pointer unwinding.
447	nstk := callers(4, mp.profStack[:debug.profstackdepth])
448	index := (mProfCycle.read() + 2) % uint32(len(memRecord{}.future))
449
450	b := stkbucket(memProfile, size, mp.profStack[:nstk], true)
451	mr := b.mp()
452	mpc := &mr.future[index]
453
454	lock(&profMemFutureLock[index])
455	mpc.allocs++
456	mpc.alloc_bytes += size
457	unlock(&profMemFutureLock[index])
458
459	// Setprofilebucket locks a bunch of other mutexes, so we call it outside of
460	// the profiler locks. This reduces potential contention and chances of
461	// deadlocks. Since the object must be alive during the call to
462	// mProf_Malloc, it's fine to do this non-atomically.
463	systemstack(func() {
464		setprofilebucket(p, b)
465	})
466}
467
468// Called when freeing a profiled block.
469func mProf_Free(b *bucket, size uintptr) {
470	index := (mProfCycle.read() + 1) % uint32(len(memRecord{}.future))
471
472	mp := b.mp()
473	mpc := &mp.future[index]
474
475	lock(&profMemFutureLock[index])
476	mpc.frees++
477	mpc.free_bytes += size
478	unlock(&profMemFutureLock[index])
479}
480
481var blockprofilerate uint64 // in CPU ticks
482
483// SetBlockProfileRate controls the fraction of goroutine blocking events
484// that are reported in the blocking profile. The profiler aims to sample
485// an average of one blocking event per rate nanoseconds spent blocked.
486//
487// To include every blocking event in the profile, pass rate = 1.
488// To turn off profiling entirely, pass rate <= 0.
489func SetBlockProfileRate(rate int) {
490	var r int64
491	if rate <= 0 {
492		r = 0 // disable profiling
493	} else if rate == 1 {
494		r = 1 // profile everything
495	} else {
496		// convert ns to cycles, use float64 to prevent overflow during multiplication
497		r = int64(float64(rate) * float64(ticksPerSecond()) / (1000 * 1000 * 1000))
498		if r == 0 {
499			r = 1
500		}
501	}
502
503	atomic.Store64(&blockprofilerate, uint64(r))
504}
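
// An illustrative sketch (not part of the runtime) of how a program might
// enable block profiling and dump the result through runtime/pprof; the
// rate and file name here are arbitrary choices:
//
//	package main
//
//	import (
//		"os"
//		"runtime"
//		"runtime/pprof"
//	)
//
//	func main() {
//		runtime.SetBlockProfileRate(1) // record every blocking event
//		defer runtime.SetBlockProfileRate(0)
//
//		// ... run the workload of interest ...
//
//		f, err := os.Create("block.out")
//		if err != nil {
//			panic(err)
//		}
//		defer f.Close()
//		pprof.Lookup("block").WriteTo(f, 0)
//	}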
505
506func blockevent(cycles int64, skip int) {
507	if cycles <= 0 {
508		cycles = 1
509	}
510
511	rate := int64(atomic.Load64(&blockprofilerate))
512	if blocksampled(cycles, rate) {
513		saveblockevent(cycles, rate, skip+1, blockProfile)
514	}
515}
516
517// blocksampled returns true for all events where cycles >= rate. Shorter
518// events have a cycles/rate random chance of returning true.
519func blocksampled(cycles, rate int64) bool {
520	if rate <= 0 || (rate > cycles && cheaprand64()%rate > cycles) {
521		return false
522	}
523	return true
524}
525
526// saveblockevent records a profile event of the type specified by which.
527// cycles is the quantity associated with this event and rate is the sampling rate,
528// used to adjust the cycles value in the manner determined by the profile type.
529// skip is the number of frames to omit from the traceback associated with the event.
530// The traceback will be recorded from the stack of the goroutine associated with the current m.
531// skip should be positive if this event is recorded from the current stack
532// (e.g. when this is not called from a system stack)
533func saveblockevent(cycles, rate int64, skip int, which bucketType) {
534	if debug.profstackdepth == 0 {
535		// profstackdepth is set to 0 by the user, so mp.profStack is nil and we
536		// can't record a stack trace.
537		return
538	}
539	if skip > maxSkip {
540		print("requested skip=", skip)
541		throw("invalid skip value")
542	}
543	gp := getg()
544	mp := acquirem() // we must not be preempted while accessing profstack
545
546	var nstk int
547	if tracefpunwindoff() || gp.m.hasCgoOnStack() {
548		if gp.m.curg == nil || gp.m.curg == gp {
549			nstk = callers(skip, mp.profStack)
550		} else {
551			nstk = gcallers(gp.m.curg, skip, mp.profStack)
552		}
553	} else {
554		if gp.m.curg == nil || gp.m.curg == gp {
555			if skip > 0 {
556				// We skip one fewer frame than the provided value for frame
557				// pointer unwinding because the skip value includes the current
558				// frame, whereas the saved frame pointer will give us the
559				// caller's return address first (so, not including
560				// saveblockevent)
561				skip -= 1
562			}
563			nstk = fpTracebackPartialExpand(skip, unsafe.Pointer(getfp()), mp.profStack)
564		} else {
565			mp.profStack[0] = gp.m.curg.sched.pc
566			nstk = 1 + fpTracebackPartialExpand(skip, unsafe.Pointer(gp.m.curg.sched.bp), mp.profStack[1:])
567		}
568	}
569
570	saveBlockEventStack(cycles, rate, mp.profStack[:nstk], which)
571	releasem(mp)
572}
573
574// fpTracebackPartialExpand records a call stack obtained starting from fp.
575// This function will skip the given number of frames, properly accounting for
576// inlining, and save remaining frames as "physical" return addresses. The
577// consumer should later use CallersFrames or similar to expand inline frames.
578func fpTracebackPartialExpand(skip int, fp unsafe.Pointer, pcBuf []uintptr) int {
579	var n int
580	lastFuncID := abi.FuncIDNormal
581	skipOrAdd := func(retPC uintptr) bool {
582		if skip > 0 {
583			skip--
584		} else if n < len(pcBuf) {
585			pcBuf[n] = retPC
586			n++
587		}
588		return n < len(pcBuf)
589	}
590	for n < len(pcBuf) && fp != nil {
591		// return addr sits one word above the frame pointer
592		pc := *(*uintptr)(unsafe.Pointer(uintptr(fp) + goarch.PtrSize))
593
594		if skip > 0 {
595			callPC := pc - 1
596			fi := findfunc(callPC)
597			u, uf := newInlineUnwinder(fi, callPC)
598			for ; uf.valid(); uf = u.next(uf) {
599				sf := u.srcFunc(uf)
600				if sf.funcID == abi.FuncIDWrapper && elideWrapperCalling(lastFuncID) {
601					// ignore wrappers
602				} else if more := skipOrAdd(uf.pc + 1); !more {
603					return n
604				}
605				lastFuncID = sf.funcID
606			}
607		} else {
608			// We've skipped the desired number of frames, so no need
609			// to perform further inline expansion now.
610			pcBuf[n] = pc
611			n++
612		}
613
614		// follow the frame pointer to the next one
615		fp = unsafe.Pointer(*(*uintptr)(fp))
616	}
617	return n
618}
619
620// lockTimer assists with profiling contention on runtime-internal locks.
621//
622// There are several steps between the time that an M experiences contention and
623// when that contention may be added to the profile. This comes from our
624// constraints: We need to keep the critical section of each lock small,
625// especially when those locks are contended. The reporting code cannot acquire
626// new locks until the M has released all other locks, which means no memory
627// allocations and encourages use of (temporary) M-local storage.
628//
629// The M will have space for storing one call stack that caused contention, and
630// for the magnitude of that contention. It will also have space to store the
631// magnitude of additional contention the M caused, since it only has space to
632// remember one call stack and might encounter several contention events before
633// it releases all of its locks and is thus able to transfer the local buffer
634// into the profile.
635//
636// The M will collect the call stack when it unlocks the contended lock. That
637// minimizes the impact on the critical section of the contended lock, and
638// matches the mutex profile's behavior for contention in sync.Mutex: measured
639// at the Unlock method.
640//
641// The profile for contention on sync.Mutex blames the caller of Unlock for the
642// amount of contention experienced by the callers of Lock which had to wait.
643// When there are several critical sections, this allows identifying which of
644// them is responsible.
645//
646// Matching that behavior for runtime-internal locks will require identifying
647// which Ms are blocked on the mutex. The semaphore-based implementation is
648// ready to allow that, but the futex-based implementation will require a bit
649// more work. Until then, we report contention on runtime-internal locks with a
650// call stack taken from the unlock call (like the rest of the user-space
651// "mutex" profile), but assign it a duration value based on how long the
652// previous lock call took (like the user-space "block" profile).
653//
654// Thus, reporting the call stacks of runtime-internal lock contention is
655// guarded by GODEBUG for now. Set GODEBUG=runtimecontentionstacks=1 to enable.
656//
657// TODO(rhysh): plumb through the delay duration, remove GODEBUG, update comment
658//
659// The M will track this by storing a pointer to the lock; lock/unlock pairs for
660// runtime-internal locks are always on the same M.
661//
662// Together, that demands several steps for recording contention. First, when
663// finally acquiring a contended lock, the M decides whether it should plan to
664// profile that event by storing a pointer to the lock in its "to be profiled
665// upon unlock" field. If that field is already set, it uses the relative
666// magnitudes to weight a random choice between itself and the other lock, with
667// the loser's time being added to the "additional contention" field. Otherwise
668// if the M's call stack buffer is occupied, it does the comparison against that
669// sample's magnitude.
670//
671// Second, having unlocked a mutex the M checks to see if it should capture the
672// call stack into its local buffer. Finally, when the M unlocks its last mutex,
673// it transfers the local buffer into the profile. As part of that step, it also
674// transfers any "additional contention" time to the profile. Any lock
675// contention that it experiences while adding samples to the profile will be
676// recorded later as "additional contention" and not include a call stack, to
677// avoid an echo.
678type lockTimer struct {
679	lock      *mutex
680	timeRate  int64
681	timeStart int64
682	tickStart int64
683}
684
685func (lt *lockTimer) begin() {
686	rate := int64(atomic.Load64(&mutexprofilerate))
687
688	lt.timeRate = gTrackingPeriod
689	if rate != 0 && rate < lt.timeRate {
690		lt.timeRate = rate
691	}
692	if int64(cheaprand())%lt.timeRate == 0 {
693		lt.timeStart = nanotime()
694	}
695
696	if rate > 0 && int64(cheaprand())%rate == 0 {
697		lt.tickStart = cputicks()
698	}
699}
700
701func (lt *lockTimer) end() {
702	gp := getg()
703
704	if lt.timeStart != 0 {
705		nowTime := nanotime()
706		gp.m.mLockProfile.waitTime.Add((nowTime - lt.timeStart) * lt.timeRate)
707	}
708
709	if lt.tickStart != 0 {
710		nowTick := cputicks()
711		gp.m.mLockProfile.recordLock(nowTick-lt.tickStart, lt.lock)
712	}
713}
714
715type mLockProfile struct {
716	waitTime   atomic.Int64 // total nanoseconds spent waiting in runtime.lockWithRank
717	stack      []uintptr    // stack that experienced contention in runtime.lockWithRank
718	pending    uintptr      // *mutex that experienced contention (to be traceback-ed)
719	cycles     int64        // cycles attributable to "pending" (if set), otherwise to "stack"
720	cyclesLost int64        // contention for which we weren't able to record a call stack
721	disabled   bool         // attribute all time to "lost"
722}
723
724func (prof *mLockProfile) recordLock(cycles int64, l *mutex) {
725	if cycles <= 0 {
726		return
727	}
728
729	if prof.disabled {
730		// We're experiencing contention while attempting to report contention.
731		// Make a note of its magnitude, but don't allow it to be the sole cause
732		// of another contention report.
733		prof.cyclesLost += cycles
734		return
735	}
736
737	if uintptr(unsafe.Pointer(l)) == prof.pending {
738		// Optimization: we'd already planned to profile this same lock (though
739		// possibly from a different unlock site).
740		prof.cycles += cycles
741		return
742	}
743
744	if prev := prof.cycles; prev > 0 {
745		// We can only store one call stack for runtime-internal lock contention
746		// on this M, and we've already got one. Decide which should stay, and
747		// add the other to the report for runtime._LostContendedRuntimeLock.
748		prevScore := uint64(cheaprand64()) % uint64(prev)
749		thisScore := uint64(cheaprand64()) % uint64(cycles)
750		if prevScore > thisScore {
751			prof.cyclesLost += cycles
752			return
753		} else {
754			prof.cyclesLost += prev
755		}
756	}
757	// Saving the *mutex as a uintptr is safe because:
758	//  - lockrank_on.go does this too, which gives it regular exercise
759	//  - the lock would only move if it's stack allocated, which means it
760	//      cannot experience multi-M contention
761	prof.pending = uintptr(unsafe.Pointer(l))
762	prof.cycles = cycles
763}
764
765// From unlock2, we might not be holding a p in this code.
766//
767//go:nowritebarrierrec
768func (prof *mLockProfile) recordUnlock(l *mutex) {
769	if uintptr(unsafe.Pointer(l)) == prof.pending {
770		prof.captureStack()
771	}
772	if gp := getg(); gp.m.locks == 1 && gp.m.mLockProfile.cycles != 0 {
773		prof.store()
774	}
775}
776
777func (prof *mLockProfile) captureStack() {
778	if debug.profstackdepth == 0 {
779		// profstackdepth is set to 0 by the user, so mp.profStack is nil and we
780		// can't record a stack trace.
781		return
782	}
783
784	skip := 3 // runtime.(*mLockProfile).recordUnlock runtime.unlock2 runtime.unlockWithRank
785	if staticLockRanking {
786		// When static lock ranking is enabled, we'll always be on the system
787		// stack at this point. There will be a runtime.unlockWithRank.func1
788		// frame, and if the call to runtime.unlock took place on a user stack
789		// then there'll also be a runtime.systemstack frame. To keep stack
790		// traces somewhat consistent whether or not static lock ranking is
791		// enabled, we'd like to skip those. But it's hard to tell how long
792		// we've been on the system stack so accept an extra frame in that case,
793		// with a leaf of "runtime.unlockWithRank runtime.unlock" instead of
794		// "runtime.unlock".
795		skip += 1 // runtime.unlockWithRank.func1
796	}
797	prof.pending = 0
798
799	prof.stack[0] = logicalStackSentinel
800	if debug.runtimeContentionStacks.Load() == 0 {
801		prof.stack[1] = abi.FuncPCABIInternal(_LostContendedRuntimeLock) + sys.PCQuantum
802		prof.stack[2] = 0
803		return
804	}
805
806	var nstk int
807	gp := getg()
808	sp := getcallersp()
809	pc := getcallerpc()
810	systemstack(func() {
811		var u unwinder
812		u.initAt(pc, sp, 0, gp, unwindSilentErrors|unwindJumpStack)
813		nstk = 1 + tracebackPCs(&u, skip, prof.stack[1:])
814	})
815	if nstk < len(prof.stack) {
816		prof.stack[nstk] = 0
817	}
818}
819
820func (prof *mLockProfile) store() {
821	// Report any contention we experience within this function as "lost"; it's
822	// important that the act of reporting a contention event not lead to a
823	// reportable contention event. This also means we can use prof.stack
824	// without copying, since it won't change during this function.
825	mp := acquirem()
826	prof.disabled = true
827
828	nstk := int(debug.profstackdepth)
829	for i := 0; i < nstk; i++ {
830		if pc := prof.stack[i]; pc == 0 {
831			nstk = i
832			break
833		}
834	}
835
836	cycles, lost := prof.cycles, prof.cyclesLost
837	prof.cycles, prof.cyclesLost = 0, 0
838
839	rate := int64(atomic.Load64(&mutexprofilerate))
840	saveBlockEventStack(cycles, rate, prof.stack[:nstk], mutexProfile)
841	if lost > 0 {
842		lostStk := [...]uintptr{
843			logicalStackSentinel,
844			abi.FuncPCABIInternal(_LostContendedRuntimeLock) + sys.PCQuantum,
845		}
846		saveBlockEventStack(lost, rate, lostStk[:], mutexProfile)
847	}
848
849	prof.disabled = false
850	releasem(mp)
851}
852
853func saveBlockEventStack(cycles, rate int64, stk []uintptr, which bucketType) {
854	b := stkbucket(which, 0, stk, true)
855	bp := b.bp()
856
857	lock(&profBlockLock)
858	// We want to up-scale the count and cycles according to the
859	// probability that the event was sampled. For block profile events,
860	// the sample probability is 1 if cycles >= rate, and cycles / rate
861	// otherwise. For mutex profile events, the sample probability is 1 / rate.
862	// We scale the events by 1 / (probability the event was sampled).
863	if which == blockProfile && cycles < rate {
864		// Remove sampling bias, see discussion on http://golang.org/cl/299991.
865		bp.count += float64(rate) / float64(cycles)
866		bp.cycles += rate
867	} else if which == mutexProfile {
868		bp.count += float64(rate)
869		bp.cycles += rate * cycles
870	} else {
871		bp.count++
872		bp.cycles += cycles
873	}
874	unlock(&profBlockLock)
875}
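
// A worked example of the scaling above (illustrative): with a block profile
// rate of 10000 cycles, an event that blocked for 2500 cycles survives
// sampling with probability roughly 2500/10000 = 1/4; if kept, it is recorded
// as count += 10000/2500 = 4 and cycles += 10000, so in expectation it
// contributes one event of 2500 cycles, matching what an unsampled profile
// would record. A mutex profile event sampled with SetMutexProfileFraction(5)
// is recorded as count += 5 and cycles += 5*cycles, undoing the 1-in-5
// sampling.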
876
877var mutexprofilerate uint64 // fraction sampled
878
879// SetMutexProfileFraction controls the fraction of mutex contention events
880// that are reported in the mutex profile. On average 1/rate events are
881// reported. The previous rate is returned.
882//
883// To turn off profiling entirely, pass rate 0.
884// To just read the current rate, pass rate < 0.
885// (For rate > 1, the details of sampling may change.)
886func SetMutexProfileFraction(rate int) int {
887	if rate < 0 {
888		return int(mutexprofilerate)
889	}
890	old := mutexprofilerate
891	atomic.Store64(&mutexprofilerate, uint64(rate))
892	return int(old)
893}
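
// An illustrative sketch (not part of the runtime) of typical use together
// with runtime/pprof, assuming the same imports as the sketch after
// SetBlockProfileRate; the fraction and file name are arbitrary:
//
//	runtime.SetMutexProfileFraction(5) // sample ~1 in 5 contention events
//	defer runtime.SetMutexProfileFraction(0)
//
//	// ... run the workload of interest ...
//
//	f, err := os.Create("mutex.out")
//	if err != nil {
//		panic(err)
//	}
//	defer f.Close()
//	pprof.Lookup("mutex").WriteTo(f, 0)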
894
895//go:linkname mutexevent sync.event
896func mutexevent(cycles int64, skip int) {
897	if cycles < 0 {
898		cycles = 0
899	}
900	rate := int64(atomic.Load64(&mutexprofilerate))
901	if rate > 0 && cheaprand64()%rate == 0 {
902		saveblockevent(cycles, rate, skip+1, mutexProfile)
903	}
904}
905
906// Go interface to profile data.
907
908// A StackRecord describes a single execution stack.
909type StackRecord struct {
910	Stack0 [32]uintptr // stack trace for this record; ends at first 0 entry
911}
912
913// Stack returns the stack trace associated with the record,
914// a prefix of r.Stack0.
915func (r *StackRecord) Stack() []uintptr {
916	for i, v := range r.Stack0 {
917		if v == 0 {
918			return r.Stack0[0:i]
919		}
920	}
921	return r.Stack0[0:]
922}
923
924// MemProfileRate controls the fraction of memory allocations
925// that are recorded and reported in the memory profile.
926// The profiler aims to sample an average of
927// one allocation per MemProfileRate bytes allocated.
928//
929// To include every allocated block in the profile, set MemProfileRate to 1.
930// To turn off profiling entirely, set MemProfileRate to 0.
931//
932// The tools that process the memory profiles assume that the
933// profile rate is constant across the lifetime of the program
934// and equal to the current value. Programs that change the
935// memory profiling rate should do so just once, as early as
936// possible in the execution of the program (for example,
937// at the beginning of main).
938var MemProfileRate int = 512 * 1024
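
// An illustrative sketch (not part of the runtime): a program that wants a
// finer-grained memory profile sets the rate once, before the allocations of
// interest happen; the value 4096 here is an arbitrary example.
//
//	func main() {
//		runtime.MemProfileRate = 4096 // sample ~1 allocation per 4 KiB
//		// ... rest of the program ...
//	}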
939
940// disableMemoryProfiling is set by the linker if memory profiling
941// is not used and the link type guarantees nobody else could use it
942// elsewhere.
943// We check if the runtime.memProfileInternal symbol is present.
944var disableMemoryProfiling bool
945
946// A MemProfileRecord describes the live objects allocated
947// by a particular call sequence (stack trace).
948type MemProfileRecord struct {
949	AllocBytes, FreeBytes     int64       // number of bytes allocated, freed
950	AllocObjects, FreeObjects int64       // number of objects allocated, freed
951	Stack0                    [32]uintptr // stack trace for this record; ends at first 0 entry
952}
953
954// InUseBytes returns the number of bytes in use (AllocBytes - FreeBytes).
955func (r *MemProfileRecord) InUseBytes() int64 { return r.AllocBytes - r.FreeBytes }
956
957// InUseObjects returns the number of objects in use (AllocObjects - FreeObjects).
958func (r *MemProfileRecord) InUseObjects() int64 {
959	return r.AllocObjects - r.FreeObjects
960}
961
962// Stack returns the stack trace associated with the record,
963// a prefix of r.Stack0.
964func (r *MemProfileRecord) Stack() []uintptr {
965	for i, v := range r.Stack0 {
966		if v == 0 {
967			return r.Stack0[0:i]
968		}
969	}
970	return r.Stack0[0:]
971}
972
973// MemProfile returns a profile of memory allocated and freed per allocation
974// site.
975//
976// MemProfile returns n, the number of records in the current memory profile.
977// If len(p) >= n, MemProfile copies the profile into p and returns n, true.
978// If len(p) < n, MemProfile does not change p and returns n, false.
979//
980// If inuseZero is true, the profile includes allocation records
981// where r.AllocBytes > 0 but r.AllocBytes == r.FreeBytes.
982// These are sites where memory was allocated, but it has all
983// been released back to the runtime.
984//
985// The returned profile may be up to two garbage collection cycles old.
986// This is to avoid skewing the profile toward allocations; because
987// allocations happen in real time but frees are delayed until the garbage
988// collector performs sweeping, the profile only accounts for allocations
989// that have had a chance to be freed by the garbage collector.
990//
991// Most clients should use the runtime/pprof package or
992// the testing package's -test.memprofile flag instead
993// of calling MemProfile directly.
994func MemProfile(p []MemProfileRecord, inuseZero bool) (n int, ok bool) {
995	return memProfileInternal(len(p), inuseZero, func(r profilerecord.MemProfileRecord) {
996		copyMemProfileRecord(&p[0], r)
997		p = p[1:]
998	})
999}
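
// An illustrative sketch (not part of the runtime) of the n/ok contract when
// calling MemProfile directly; most programs should prefer
// pprof.WriteHeapProfile or pprof.Lookup("heap") instead:
//
//	var p []runtime.MemProfileRecord
//	n, ok := runtime.MemProfile(nil, true)
//	for !ok {
//		// Grow with some headroom, since records may be added
//		// between calls.
//		p = make([]runtime.MemProfileRecord, n+50)
//		n, ok = runtime.MemProfile(p, true)
//	}
//	p = p[:n]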
1000
1001// memProfileInternal returns the number of records n in the profile. If there
1002// are no more than size records, copyFn is invoked for each record, and ok returns
1003// true.
1004//
1005// The linker sets disableMemoryProfiling to true to disable memory profiling
1006// if this function is not reachable. Mark it noinline to ensure the symbol exists.
1007// (This function is big and normally not inlined anyway.)
1008// See also disableMemoryProfiling above and cmd/link/internal/ld/lib.go:linksetup.
1009//
1010//go:noinline
1011func memProfileInternal(size int, inuseZero bool, copyFn func(profilerecord.MemProfileRecord)) (n int, ok bool) {
1012	cycle := mProfCycle.read()
1013	// If we're between mProf_NextCycle and mProf_Flush, take care
1014	// of flushing to the active profile so we only have to look
1015	// at the active profile below.
1016	index := cycle % uint32(len(memRecord{}.future))
1017	lock(&profMemActiveLock)
1018	lock(&profMemFutureLock[index])
1019	mProf_FlushLocked(index)
1020	unlock(&profMemFutureLock[index])
1021	clear := true
1022	head := (*bucket)(mbuckets.Load())
1023	for b := head; b != nil; b = b.allnext {
1024		mp := b.mp()
1025		if inuseZero || mp.active.alloc_bytes != mp.active.free_bytes {
1026			n++
1027		}
1028		if mp.active.allocs != 0 || mp.active.frees != 0 {
1029			clear = false
1030		}
1031	}
1032	if clear {
1033		// Absolutely no data, suggesting that a garbage collection
1034		// has not yet happened. In order to allow profiling when
1035		// garbage collection is disabled from the beginning of execution,
1036		// accumulate all of the cycles, and recount buckets.
1037		n = 0
1038		for b := head; b != nil; b = b.allnext {
1039			mp := b.mp()
1040			for c := range mp.future {
1041				lock(&profMemFutureLock[c])
1042				mp.active.add(&mp.future[c])
1043				mp.future[c] = memRecordCycle{}
1044				unlock(&profMemFutureLock[c])
1045			}
1046			if inuseZero || mp.active.alloc_bytes != mp.active.free_bytes {
1047				n++
1048			}
1049		}
1050	}
1051	if n <= size {
1052		ok = true
1053		for b := head; b != nil; b = b.allnext {
1054			mp := b.mp()
1055			if inuseZero || mp.active.alloc_bytes != mp.active.free_bytes {
1056				r := profilerecord.MemProfileRecord{
1057					AllocBytes:   int64(mp.active.alloc_bytes),
1058					FreeBytes:    int64(mp.active.free_bytes),
1059					AllocObjects: int64(mp.active.allocs),
1060					FreeObjects:  int64(mp.active.frees),
1061					Stack:        b.stk(),
1062				}
1063				copyFn(r)
1064			}
1065		}
1066	}
1067	unlock(&profMemActiveLock)
1068	return
1069}
1070
1071func copyMemProfileRecord(dst *MemProfileRecord, src profilerecord.MemProfileRecord) {
1072	dst.AllocBytes = src.AllocBytes
1073	dst.FreeBytes = src.FreeBytes
1074	dst.AllocObjects = src.AllocObjects
1075	dst.FreeObjects = src.FreeObjects
1076	if raceenabled {
1077		racewriterangepc(unsafe.Pointer(&dst.Stack0[0]), unsafe.Sizeof(dst.Stack0), getcallerpc(), abi.FuncPCABIInternal(MemProfile))
1078	}
1079	if msanenabled {
1080		msanwrite(unsafe.Pointer(&dst.Stack0[0]), unsafe.Sizeof(dst.Stack0))
1081	}
1082	if asanenabled {
1083		asanwrite(unsafe.Pointer(&dst.Stack0[0]), unsafe.Sizeof(dst.Stack0))
1084	}
1085	i := copy(dst.Stack0[:], src.Stack)
1086	clear(dst.Stack0[i:])
1087}
1088
1089//go:linkname pprof_memProfileInternal
1090func pprof_memProfileInternal(p []profilerecord.MemProfileRecord, inuseZero bool) (n int, ok bool) {
1091	return memProfileInternal(len(p), inuseZero, func(r profilerecord.MemProfileRecord) {
1092		p[0] = r
1093		p = p[1:]
1094	})
1095}
1096
1097func iterate_memprof(fn func(*bucket, uintptr, *uintptr, uintptr, uintptr, uintptr)) {
1098	lock(&profMemActiveLock)
1099	head := (*bucket)(mbuckets.Load())
1100	for b := head; b != nil; b = b.allnext {
1101		mp := b.mp()
1102		fn(b, b.nstk, &b.stk()[0], b.size, mp.active.allocs, mp.active.frees)
1103	}
1104	unlock(&profMemActiveLock)
1105}
1106
1107// BlockProfileRecord describes blocking events originated
1108// at a particular call sequence (stack trace).
1109type BlockProfileRecord struct {
1110	Count  int64
1111	Cycles int64
1112	StackRecord
1113}
1114
1115// BlockProfile returns n, the number of records in the current blocking profile.
1116// If len(p) >= n, BlockProfile copies the profile into p and returns n, true.
1117// If len(p) < n, BlockProfile does not change p and returns n, false.
1118//
1119// Most clients should use the [runtime/pprof] package or
1120// the [testing] package's -test.blockprofile flag instead
1121// of calling BlockProfile directly.
1122func BlockProfile(p []BlockProfileRecord) (n int, ok bool) {
1123	var m int
1124	n, ok = blockProfileInternal(len(p), func(r profilerecord.BlockProfileRecord) {
1125		copyBlockProfileRecord(&p[m], r)
1126		m++
1127	})
1128	if ok {
1129		expandFrames(p[:n])
1130	}
1131	return
1132}
1133
1134func expandFrames(p []BlockProfileRecord) {
1135	expandedStack := makeProfStack()
1136	for i := range p {
1137		cf := CallersFrames(p[i].Stack())
1138		j := 0
1139		for j < len(expandedStack) {
1140			f, more := cf.Next()
1141			// f.PC is a "call PC", but later consumers will expect
1142			// "return PCs"
1143			expandedStack[j] = f.PC + 1
1144			j++
1145			if !more {
1146				break
1147			}
1148		}
1149		k := copy(p[i].Stack0[:], expandedStack[:j])
1150		clear(p[i].Stack0[k:])
1151	}
1152}
1153
1154// blockProfileInternal returns the number of records n in the profile. If there
1155// are no more than size records, copyFn is invoked for each record, and ok returns
1156// true.
1157func blockProfileInternal(size int, copyFn func(profilerecord.BlockProfileRecord)) (n int, ok bool) {
1158	lock(&profBlockLock)
1159	head := (*bucket)(bbuckets.Load())
1160	for b := head; b != nil; b = b.allnext {
1161		n++
1162	}
1163	if n <= size {
1164		ok = true
1165		for b := head; b != nil; b = b.allnext {
1166			bp := b.bp()
1167			r := profilerecord.BlockProfileRecord{
1168				Count:  int64(bp.count),
1169				Cycles: bp.cycles,
1170				Stack:  b.stk(),
1171			}
1172			// Prevent callers from having to worry about division by zero errors.
1173			// See discussion on http://golang.org/cl/299991.
1174			if r.Count == 0 {
1175				r.Count = 1
1176			}
1177			copyFn(r)
1178		}
1179	}
1180	unlock(&profBlockLock)
1181	return
1182}
1183
1184// copyBlockProfileRecord copies the sample values and call stack from src to dst.
1185// The call stack is copied as-is. The caller is responsible for handling inline
1186// expansion, needed when the call stack was collected with frame pointer unwinding.
1187func copyBlockProfileRecord(dst *BlockProfileRecord, src profilerecord.BlockProfileRecord) {
1188	dst.Count = src.Count
1189	dst.Cycles = src.Cycles
1190	if raceenabled {
1191		racewriterangepc(unsafe.Pointer(&dst.Stack0[0]), unsafe.Sizeof(dst.Stack0), getcallerpc(), abi.FuncPCABIInternal(BlockProfile))
1192	}
1193	if msanenabled {
1194		msanwrite(unsafe.Pointer(&dst.Stack0[0]), unsafe.Sizeof(dst.Stack0))
1195	}
1196	if asanenabled {
1197		asanwrite(unsafe.Pointer(&dst.Stack0[0]), unsafe.Sizeof(dst.Stack0))
1198	}
1199	// We just copy the stack here without inline expansion
1200	// (needed if frame pointer unwinding is used)
1201	// since this function is called under the profile lock,
1202	// and doing something that might allocate can violate lock ordering.
1203	i := copy(dst.Stack0[:], src.Stack)
1204	clear(dst.Stack0[i:])
1205}
1206
1207//go:linkname pprof_blockProfileInternal
1208func pprof_blockProfileInternal(p []profilerecord.BlockProfileRecord) (n int, ok bool) {
1209	return blockProfileInternal(len(p), func(r profilerecord.BlockProfileRecord) {
1210		p[0] = r
1211		p = p[1:]
1212	})
1213}
1214
1215// MutexProfile returns n, the number of records in the current mutex profile.
1216// If len(p) >= n, MutexProfile copies the profile into p and returns n, true.
1217// Otherwise, MutexProfile does not change p, and returns n, false.
1218//
1219// Most clients should use the [runtime/pprof] package
1220// instead of calling MutexProfile directly.
1221func MutexProfile(p []BlockProfileRecord) (n int, ok bool) {
1222	var m int
1223	n, ok = mutexProfileInternal(len(p), func(r profilerecord.BlockProfileRecord) {
1224		copyBlockProfileRecord(&p[m], r)
1225		m++
1226	})
1227	if ok {
1228		expandFrames(p[:n])
1229	}
1230	return
1231}
1232
1233// mutexProfileInternal returns the number of records n in the profile. If there
1234// are no more than size records, copyFn is invoked for each record, and ok returns
1235// true.
1236func mutexProfileInternal(size int, copyFn func(profilerecord.BlockProfileRecord)) (n int, ok bool) {
1237	lock(&profBlockLock)
1238	head := (*bucket)(xbuckets.Load())
1239	for b := head; b != nil; b = b.allnext {
1240		n++
1241	}
1242	if n <= size {
1243		ok = true
1244		for b := head; b != nil; b = b.allnext {
1245			bp := b.bp()
1246			r := profilerecord.BlockProfileRecord{
1247				Count:  int64(bp.count),
1248				Cycles: bp.cycles,
1249				Stack:  b.stk(),
1250			}
1251			copyFn(r)
1252		}
1253	}
1254	unlock(&profBlockLock)
1255	return
1256}
1257
1258//go:linkname pprof_mutexProfileInternal
1259func pprof_mutexProfileInternal(p []profilerecord.BlockProfileRecord) (n int, ok bool) {
1260	return mutexProfileInternal(len(p), func(r profilerecord.BlockProfileRecord) {
1261		p[0] = r
1262		p = p[1:]
1263	})
1264}
1265
1266// ThreadCreateProfile returns n, the number of records in the thread creation profile.
1267// If len(p) >= n, ThreadCreateProfile copies the profile into p and returns n, true.
1268// If len(p) < n, ThreadCreateProfile does not change p and returns n, false.
1269//
1270// Most clients should use the runtime/pprof package instead
1271// of calling ThreadCreateProfile directly.
1272func ThreadCreateProfile(p []StackRecord) (n int, ok bool) {
1273	return threadCreateProfileInternal(len(p), func(r profilerecord.StackRecord) {
1274		i := copy(p[0].Stack0[:], r.Stack)
1275		clear(p[0].Stack0[i:])
1276		p = p[1:]
1277	})
1278}
1279
1280// threadCreateProfileInternal returns the number of records n in the profile.
1281// If there are no more than size records, copyFn is invoked for each record, and
1282// ok returns true.
1283func threadCreateProfileInternal(size int, copyFn func(profilerecord.StackRecord)) (n int, ok bool) {
1284	first := (*m)(atomic.Loadp(unsafe.Pointer(&allm)))
1285	for mp := first; mp != nil; mp = mp.alllink {
1286		n++
1287	}
1288	if n <= size {
1289		ok = true
1290		for mp := first; mp != nil; mp = mp.alllink {
1291			r := profilerecord.StackRecord{Stack: mp.createstack[:]}
1292			copyFn(r)
1293		}
1294	}
1295	return
1296}
1297
1298//go:linkname pprof_threadCreateInternal
1299func pprof_threadCreateInternal(p []profilerecord.StackRecord) (n int, ok bool) {
1300	return threadCreateProfileInternal(len(p), func(r profilerecord.StackRecord) {
1301		p[0] = r
1302		p = p[1:]
1303	})
1304}
1305
1306//go:linkname pprof_goroutineProfileWithLabels
1307func pprof_goroutineProfileWithLabels(p []profilerecord.StackRecord, labels []unsafe.Pointer) (n int, ok bool) {
1308	return goroutineProfileWithLabels(p, labels)
1309}
1310
1311// labels may be nil. If labels is non-nil, it must have the same length as p.
1312func goroutineProfileWithLabels(p []profilerecord.StackRecord, labels []unsafe.Pointer) (n int, ok bool) {
1313	if labels != nil && len(labels) != len(p) {
1314		labels = nil
1315	}
1316
1317	return goroutineProfileWithLabelsConcurrent(p, labels)
1318}
1319
1320var goroutineProfile = struct {
1321	sema    uint32
1322	active  bool
1323	offset  atomic.Int64
1324	records []profilerecord.StackRecord
1325	labels  []unsafe.Pointer
1326}{
1327	sema: 1,
1328}
1329
1330// goroutineProfileState indicates the status of a goroutine's stack for the
1331// current in-progress goroutine profile. Goroutines' stacks are initially
1332// "Absent" from the profile, and end up "Satisfied" by the time the profile is
1333// complete. While a goroutine's stack is being captured, its
1334// goroutineProfileState will be "InProgress" and it will not be able to run
1335// until the capture completes and the state moves to "Satisfied".
1336//
1337// Some goroutines (the finalizer goroutine, which at various times can be
1338// either a "system" or a "user" goroutine, and the goroutine that is
1339// coordinating the profile, and any goroutines created during the profile) move
1340// directly to the "Satisfied" state.
1341type goroutineProfileState uint32
1342
1343const (
1344	goroutineProfileAbsent goroutineProfileState = iota
1345	goroutineProfileInProgress
1346	goroutineProfileSatisfied
1347)
1348
1349type goroutineProfileStateHolder atomic.Uint32
1350
1351func (p *goroutineProfileStateHolder) Load() goroutineProfileState {
1352	return goroutineProfileState((*atomic.Uint32)(p).Load())
1353}
1354
1355func (p *goroutineProfileStateHolder) Store(value goroutineProfileState) {
1356	(*atomic.Uint32)(p).Store(uint32(value))
1357}
1358
1359func (p *goroutineProfileStateHolder) CompareAndSwap(old, new goroutineProfileState) bool {
1360	return (*atomic.Uint32)(p).CompareAndSwap(uint32(old), uint32(new))
1361}
1362
1363func goroutineProfileWithLabelsConcurrent(p []profilerecord.StackRecord, labels []unsafe.Pointer) (n int, ok bool) {
1364	if len(p) == 0 {
1365		// An empty slice is obviously too small. Return a rough
1366		// allocation estimate without bothering to STW. As long as
1367		// this is close, then we'll only need to STW once (on the next
1368		// call).
1369		return int(gcount()), false
1370	}
1371
1372	semacquire(&goroutineProfile.sema)
1373
1374	ourg := getg()
1375
1376	pcbuf := makeProfStack() // see saveg() for explanation
1377	stw := stopTheWorld(stwGoroutineProfile)
1378	// Using gcount while the world is stopped should give us a consistent view
1379	// of the number of live goroutines, minus the number of goroutines that are
1380	// alive and permanently marked as "system". But to make this count agree
1381	// with what we'd get from isSystemGoroutine, we need special handling for
1382	// goroutines that can vary between user and system to ensure that the count
1383	// doesn't change during the collection. So, check the finalizer goroutine
1384	// in particular.
1385	n = int(gcount())
1386	if fingStatus.Load()&fingRunningFinalizer != 0 {
1387		n++
1388	}
1389
1390	if n > len(p) {
1391		// There's not enough space in p to store the whole profile, so (per the
1392		// contract of runtime.GoroutineProfile) we're not allowed to write to p
1393		// at all and must return n, false.
1394		startTheWorld(stw)
1395		semrelease(&goroutineProfile.sema)
1396		return n, false
1397	}
1398
1399	// Save current goroutine.
1400	sp := getcallersp()
1401	pc := getcallerpc()
1402	systemstack(func() {
1403		saveg(pc, sp, ourg, &p[0], pcbuf)
1404	})
1405	if labels != nil {
1406		labels[0] = ourg.labels
1407	}
1408	ourg.goroutineProfiled.Store(goroutineProfileSatisfied)
1409	goroutineProfile.offset.Store(1)
1410
1411	// Prepare for all other goroutines to enter the profile. Aside from ourg,
1412	// every goroutine struct in the allgs list has its goroutineProfiled field
1413	// cleared. Any goroutine created from this point on (while
1414	// goroutineProfile.active is set) will start with its goroutineProfiled
1415	// field set to goroutineProfileSatisfied.
1416	goroutineProfile.active = true
1417	goroutineProfile.records = p
1418	goroutineProfile.labels = labels
1419	// The finalizer goroutine needs special handling because it can vary over
1420	// time between being a user goroutine (eligible for this profile) and a
1421	// system goroutine (to be excluded). Pick one before restarting the world.
1422	if fing != nil {
1423		fing.goroutineProfiled.Store(goroutineProfileSatisfied)
1424		if readgstatus(fing) != _Gdead && !isSystemGoroutine(fing, false) {
1425			doRecordGoroutineProfile(fing, pcbuf)
1426		}
1427	}
1428	startTheWorld(stw)
1429
1430	// Visit each goroutine that existed as of the startTheWorld call above.
1431	//
1432	// New goroutines may not be in this list, but we didn't want to know about
1433	// them anyway. If they do appear in this list (via reusing a dead goroutine
1434	// struct, or racing to launch between the world restarting and us getting
1435	// the list), they will already have their goroutineProfiled field set to
1436	// goroutineProfileSatisfied before their state transitions out of _Gdead.
1437	//
1438	// Any goroutine that the scheduler tries to execute concurrently with this
1439	// call will start by adding itself to the profile (before the act of
1440	// executing can cause any changes in its stack).
1441	forEachGRace(func(gp1 *g) {
1442		tryRecordGoroutineProfile(gp1, pcbuf, Gosched)
1443	})
1444
1445	stw = stopTheWorld(stwGoroutineProfileCleanup)
1446	endOffset := goroutineProfile.offset.Swap(0)
1447	goroutineProfile.active = false
1448	goroutineProfile.records = nil
1449	goroutineProfile.labels = nil
1450	startTheWorld(stw)
1451
1452	// Restore the invariant that every goroutine struct in allgs has its
1453	// goroutineProfiled field cleared.
1454	forEachGRace(func(gp1 *g) {
1455		gp1.goroutineProfiled.Store(goroutineProfileAbsent)
1456	})
1457
1458	if raceenabled {
1459		raceacquire(unsafe.Pointer(&labelSync))
1460	}
1461
1462	if n != int(endOffset) {
1463		// It's a big surprise that the number of goroutines changed while we
1464		// were collecting the profile. But probably better to return a
1465		// truncated profile than to crash the whole process.
1466		//
1467		// For instance, needm moves a goroutine out of the _Gdead state and so
1468		// might be able to change the goroutine count without interacting with
1469		// the scheduler. For code like that, the race windows are small and the
1470		// combination of features is uncommon, so it's hard to be (and remain)
1471		// sure we've caught them all.
1472	}
1473
1474	semrelease(&goroutineProfile.sema)
1475	return n, true
1476}
1477
1478// tryRecordGoroutineProfileWB asserts that write barriers are allowed and calls
1479// tryRecordGoroutineProfile.
1480//
1481//go:yeswritebarrierrec
1482func tryRecordGoroutineProfileWB(gp1 *g) {
1483	if getg().m.p.ptr() == nil {
1484		throw("no P available, write barriers are forbidden")
1485	}
1486	tryRecordGoroutineProfile(gp1, nil, osyield)
1487}
1488
1489// tryRecordGoroutineProfile ensures that gp1 has the appropriate representation
1490// in the current goroutine profile: either that it should not be profiled, or
1491// that a snapshot of its call stack and labels are now in the profile.
1492func tryRecordGoroutineProfile(gp1 *g, pcbuf []uintptr, yield func()) {
1493	if readgstatus(gp1) == _Gdead {
1494		// Dead goroutines should not appear in the profile. Goroutines that
1495		// start while profile collection is active will get goroutineProfiled
1496		// set to goroutineProfileSatisfied before transitioning out of _Gdead,
1497		// so here we check _Gdead first.
1498		return
1499	}
1500	if isSystemGoroutine(gp1, true) {
1501		// System goroutines should not appear in the profile. (The finalizer
1502		// goroutine is marked as "already profiled".)
1503		return
1504	}
1505
1506	for {
1507		prev := gp1.goroutineProfiled.Load()
1508		if prev == goroutineProfileSatisfied {
1509			// This goroutine is already in the profile (or is new since the
1510			// start of collection, so shouldn't appear in the profile).
1511			break
1512		}
1513		if prev == goroutineProfileInProgress {
1514			// Something else is adding gp1 to the goroutine profile right now.
1515			// Give that a moment to finish.
1516			yield()
1517			continue
1518		}
1519
1520		// While we have gp1.goroutineProfiled set to
1521		// goroutineProfileInProgress, gp1 may appear _Grunnable but will not
1522		// actually be able to run. Disable preemption for ourselves, to make
1523		// sure we finish profiling gp1 right away instead of leaving it stuck
1524		// in this limbo.
1525		mp := acquirem()
1526		if gp1.goroutineProfiled.CompareAndSwap(goroutineProfileAbsent, goroutineProfileInProgress) {
1527			doRecordGoroutineProfile(gp1, pcbuf)
1528			gp1.goroutineProfiled.Store(goroutineProfileSatisfied)
1529		}
1530		releasem(mp)
1531	}
1532}
1533
1534// doRecordGoroutineProfile writes gp1's call stack and labels to an in-progress
1535// goroutine profile. Preemption is disabled.
1536//
1537// This may be called via tryRecordGoroutineProfile in two ways: by the
1538// goroutine that is coordinating the goroutine profile (running on its own
1539// stack), or from the scheduler in preparation to execute gp1 (running on the
1540// system stack).
1541func doRecordGoroutineProfile(gp1 *g, pcbuf []uintptr) {
1542	if readgstatus(gp1) == _Grunning {
1543		print("doRecordGoroutineProfile gp1=", gp1.goid, "\n")
1544		throw("cannot read stack of running goroutine")
1545	}
1546
1547	offset := int(goroutineProfile.offset.Add(1)) - 1
1548
1549	if offset >= len(goroutineProfile.records) {
1550		// Should be impossible, but better to return a truncated profile than
1551		// to crash the entire process at this point. Instead, deal with it in
1552		// goroutineProfileWithLabelsConcurrent where we have more context.
1553		return
1554	}
1555
1556	// saveg calls gentraceback, which may call cgo traceback functions. When
1557	// called from the scheduler, this is on the system stack already so
1558	// traceback.go:cgoContextPCs will avoid calling back into the scheduler.
1559	//
1560	// When called from the goroutine coordinating the profile, we still have
1561	// set gp1.goroutineProfiled to goroutineProfileInProgress and so are still
1562	// preventing it from being truly _Grunnable. So we'll use the system stack
1563	// to avoid schedule delays.
1564	systemstack(func() { saveg(^uintptr(0), ^uintptr(0), gp1, &goroutineProfile.records[offset], pcbuf) })
1565
1566	if goroutineProfile.labels != nil {
1567		goroutineProfile.labels[offset] = gp1.labels
1568	}
1569}
1570
1571func goroutineProfileWithLabelsSync(p []profilerecord.StackRecord, labels []unsafe.Pointer) (n int, ok bool) {
1572	gp := getg()
1573
1574	isOK := func(gp1 *g) bool {
1575		// Checking isSystemGoroutine here makes GoroutineProfile
1576		// consistent with both NumGoroutine and Stack.
1577		return gp1 != gp && readgstatus(gp1) != _Gdead && !isSystemGoroutine(gp1, false)
1578	}
1579
1580	pcbuf := makeProfStack() // see saveg() for explanation
1581	stw := stopTheWorld(stwGoroutineProfile)
1582
1583	// World is stopped, no locking required.
1584	n = 1
1585	forEachGRace(func(gp1 *g) {
1586		if isOK(gp1) {
1587			n++
1588		}
1589	})
1590
1591	if n <= len(p) {
1592		ok = true
1593		r, lbl := p, labels
1594
1595		// Save current goroutine.
1596		sp := getcallersp()
1597		pc := getcallerpc()
1598		systemstack(func() {
1599			saveg(pc, sp, gp, &r[0], pcbuf)
1600		})
1601		r = r[1:]
1602
1603		// If we have a place to put our goroutine labelmap, insert it there.
1604		if labels != nil {
1605			lbl[0] = gp.labels
1606			lbl = lbl[1:]
1607		}
1608
1609		// Save other goroutines.
1610		forEachGRace(func(gp1 *g) {
1611			if !isOK(gp1) {
1612				return
1613			}
1614
1615			if len(r) == 0 {
1616				// Should be impossible, but better to return a
1617				// truncated profile than to crash the entire process.
1618				return
1619			}
1620			// saveg calls gentraceback, which may call cgo traceback functions.
1621			// The world is stopped, so it cannot use cgocall (which will be
1622			// blocked at exitsyscall). Do it on the system stack so it won't
1623	// call into the scheduler (see traceback.go:cgoContextPCs).
1624			systemstack(func() { saveg(^uintptr(0), ^uintptr(0), gp1, &r[0], pcbuf) })
1625			if labels != nil {
1626				lbl[0] = gp1.labels
1627				lbl = lbl[1:]
1628			}
1629			r = r[1:]
1630		})
1631	}
1632
1633	if raceenabled {
1634		raceacquire(unsafe.Pointer(&labelSync))
1635	}
1636
1637	startTheWorld(stw)
1638	return n, ok
1639}
1640
1641// GoroutineProfile returns n, the number of records in the active goroutine stack profile.
1642// If len(p) >= n, GoroutineProfile copies the profile into p and returns n, true.
1643// If len(p) < n, GoroutineProfile does not change p and returns n, false.
1644//
1645// Most clients should use the [runtime/pprof] package instead
1646// of calling GoroutineProfile directly.
1647func GoroutineProfile(p []StackRecord) (n int, ok bool) {
1648	records := make([]profilerecord.StackRecord, len(p))
1649	n, ok = goroutineProfileInternal(records)
1650	if !ok {
1651		return
1652	}
1653	for i, mr := range records[0:n] {
1654		l := copy(p[i].Stack0[:], mr.Stack)
1655		clear(p[i].Stack0[l:])
1656	}
1657	return
1658}
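
// An illustrative sketch (not part of the runtime): dumping goroutine stacks
// through runtime/pprof rather than calling GoroutineProfile directly; debug
// level 2 prints goroutines in the same form as an unrecovered panic.
//
//	pprof.Lookup("goroutine").WriteTo(os.Stdout, 2)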
1659
1660func goroutineProfileInternal(p []profilerecord.StackRecord) (n int, ok bool) {
1661	return goroutineProfileWithLabels(p, nil)
1662}
1663
1664func saveg(pc, sp uintptr, gp *g, r *profilerecord.StackRecord, pcbuf []uintptr) {
1665	// To reduce memory usage, we want to allocate a r.Stack that is just big
1666	// enough to hold gp's stack trace. Naively we might achieve this by
1667	// recording our stack trace into mp.profStack, and then allocating a
1668	// r.Stack of the right size. However, mp.profStack is also used for
1669	// allocation profiling, so it could get overwritten if the slice allocation
1670	// gets profiled. So instead we record the stack trace into a temporary
1671	// pcbuf which is usually given to us by our caller. When it's not, we have
1672	// to allocate one here. This will only happen for goroutines that were in a
1673	// syscall when the goroutine profile started or for goroutines that manage
1674	// to execute before we finish iterating over all the goroutines.
1675	if pcbuf == nil {
1676		pcbuf = makeProfStack()
1677	}
1678
1679	var u unwinder
1680	u.initAt(pc, sp, 0, gp, unwindSilentErrors)
1681	n := tracebackPCs(&u, 0, pcbuf)
1682	r.Stack = make([]uintptr, n)
1683	copy(r.Stack, pcbuf)
1684}
1685
1686// Stack formats a stack trace of the calling goroutine into buf
1687// and returns the number of bytes written to buf.
1688// If all is true, Stack formats stack traces of all other goroutines
1689// into buf after the trace for the current goroutine.
1690func Stack(buf []byte, all bool) int {
1691	var stw worldStop
1692	if all {
1693		stw = stopTheWorld(stwAllGoroutinesStack)
1694	}
1695
1696	n := 0
1697	if len(buf) > 0 {
1698		gp := getg()
1699		sp := getcallersp()
1700		pc := getcallerpc()
1701		systemstack(func() {
1702			g0 := getg()
1703			// Force traceback=1 to override GOTRACEBACK setting,
1704			// so that Stack's results are consistent.
1705			// GOTRACEBACK is only about crash dumps.
1706			g0.m.traceback = 1
1707			g0.writebuf = buf[0:0:len(buf)]
1708			goroutineheader(gp)
1709			traceback(pc, sp, 0, gp)
1710			if all {
1711				tracebackothers(gp)
1712			}
1713			g0.m.traceback = 0
1714			n = len(g0.writebuf)
1715			g0.writebuf = nil
1716		})
1717	}
1718
1719	if all {
1720		startTheWorld(stw)
1721	}
1722	return n
1723}
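
// An illustrative sketch (not part of the runtime): Stack does not grow buf,
// so callers that want a complete dump typically retry with a larger buffer
// until the output fits.
//
//	buf := make([]byte, 1<<16)
//	for {
//		n := runtime.Stack(buf, true)
//		if n < len(buf) {
//			os.Stderr.Write(buf[:n])
//			break
//		}
//		buf = make([]byte, 2*len(buf))
//	}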
1724