Mirror of https://github.com/golang/go.git
The current cheaprand performs a 128-bit multiplication on 64-bit numbers
and truncates the result to 32 bits, which is inefficient.
A 32-bit-specific implementation is faster because it performs a 64-bit
multiplication on 32-bit numbers instead.
The current cheaprand64 involves two cheaprand calls.
Implementing it as 64-bit wyrand is significantly faster.
Since cheaprand64 discards one bit, I have preserved this behavior.
The underlying uint64 function is made available as cheaprandu64.
                │     old     │                new                 │
                │   sec/op    │   sec/op      vs base              │
Cheaprand-8       1.358n ± 0%    1.218n ± 0%   -10.31% (n=100)
Cheaprand64-8     2.424n ± 0%    1.391n ± 0%   -42.62% (n=100)
Blocksampled-8    8.347n ± 0%    2.022n ± 0%   -75.78% (n=100)
Fixes #77149
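For illustration only, here is a minimal sketch (not the patch itself; the 32-bit constants below are placeholders) of the two mixing shapes the message contrasts: a single 64-bit multiply of 32-bit values versus a full 128-bit product of 64-bit values.

// Illustrative sketch, not the runtime's implementation.
package main

import (
	"fmt"
	"math/bits"
)

// rand32 mixes a 32-bit state with one 64-bit multiplication,
// folding the two halves of the product together.
func rand32(state *uint32) uint32 {
	*state += 0x9e3779b9 // placeholder additive constant
	x := uint64(*state) * uint64(*state^0x85ebca6b)
	return uint32(x) ^ uint32(x>>32)
}

// rand64 mixes a 64-bit state wyrand-style, which requires the
// full 128-bit product that bits.Mul64 computes.
func rand64(state *uint64) uint64 {
	*state += 0xa0761d6478bd642f
	hi, lo := bits.Mul64(*state, *state^0xe7037ed1a0b428db)
	return hi ^ lo
}

func main() {
	var s32 uint32 = 1
	var s64 uint64 = 1
	fmt.Println(rand32(&s32), rand64(&s64))
}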
461 lines
14 KiB
Go
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !wasm

package runtime

import (
	"internal/goarch"
	"internal/runtime/atomic"
	"internal/runtime/gc"
	"unsafe"
)

// This implementation depends on OS-specific implementations of
//
// func semacreate(mp *m)
//	Create a semaphore for mp, if it does not already have one.
//
// func semasleep(ns int64) int32
//	If ns < 0, acquire m's semaphore and return 0.
//	If ns >= 0, try to acquire m's semaphore for at most ns nanoseconds.
//	Return 0 if the semaphore was acquired, -1 if interrupted or timed out.
//
// func semawakeup(mp *m)
//	Wake up mp, which is or will soon be sleeping on its semaphore.

// The mutex state consists of four flags and a pointer. The flag at bit 0,
// mutexLocked, represents the lock itself. Bit 1, mutexSleeping, is a hint that
// the pointer is non-nil. The fast paths for locking and unlocking the mutex
// are based on atomic 8-bit swap operations on the low byte; bits 2 through 7
// are unused.
//
// Bit 8, mutexSpinning, is a try-lock that grants a waiting M permission to
// spin on the state word. Most other Ms must attempt to spend their time
// sleeping to reduce traffic on the cache line. This is the "spin bit" for
// which the implementation is named. (The anti-starvation mechanism also grants
// temporary permission for an M to spin.)
//
// Bit 9, mutexStackLocked, is a try-lock that grants an unlocking M permission
// to inspect the list of waiting Ms and to pop an M off of that stack.
//
// The upper bits hold a (partial) pointer to the M that most recently went to
// sleep. The sleeping Ms form a stack linked by their mWaitList.next fields.
// Because the fast paths use an 8-bit swap on the low byte of the state word,
// we'll need to reconstruct the full M pointer from the bits we have. Most Ms
// are allocated on the heap, and have a known alignment and base offset. (The
// offset is due to mallocgc's allocation headers.) The main program thread uses
// a static M value, m0. We check for m0 specifically and add a known offset
// otherwise.

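// In summary, the layout of the state word is:
//
//	bits 10 and up: partial pointer to the most recently slept M
//	bit  9:         mutexStackLocked
//	bit  8:         mutexSpinning
//	bits 2..7:      unused
//	bit  1:         mutexSleeping
//	bit  0:         mutexLocked
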
const (
	active_spin     = 4  // referenced in proc.go for sync.Mutex implementation
	active_spin_cnt = 30 // referenced in proc.go for sync.Mutex implementation
)

const (
	mutexLocked      = 0x001
	mutexSleeping    = 0x002
	mutexSpinning    = 0x100
	mutexStackLocked = 0x200
	mutexMMask       = 0x3FF
	mutexMOffset     = gc.MallocHeaderSize // alignment of heap-allocated Ms (those other than m0)

	mutexActiveSpinCount  = 4
	mutexActiveSpinSize   = 30
	mutexPassiveSpinCount = 1

	mutexTailWakePeriod = 16
)

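// key8 returns a pointer to the least significant byte of *p, so that the
// mutex fast paths can operate on the flag byte with 8-bit atomics.
//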
//go:nosplit
func key8(p *uintptr) *uint8 {
	if goarch.BigEndian {
		return &(*[8]uint8)(unsafe.Pointer(p))[goarch.PtrSize/1-1]
	}
	return &(*[8]uint8)(unsafe.Pointer(p))[0]
}

// mWaitList is part of the M struct, and holds the list of Ms that are waiting
// for a particular runtime.mutex.
//
// When an M is unable to immediately obtain a lock, it adds itself to the list
// of Ms waiting for the lock. It does that via this struct's next field,
// forming a singly-linked list with the mutex's key field pointing to the head
// of the list.
type mWaitList struct {
	next       muintptr // next m waiting for lock
	startTicks int64    // when this m started waiting for the current lock holder, in cputicks
}

// lockVerifyMSize confirms that we can recreate the low bits of the M pointer.
func lockVerifyMSize() {
	size := roundupsize(unsafe.Sizeof(mPadded{}), false) + gc.MallocHeaderSize
	if size&mutexMMask != 0 {
		print("M structure uses sizeclass ", size, "/", hex(size), " bytes; ",
			"incompatible with mutex flag mask ", hex(mutexMMask), "\n")
		throw("runtime.m memory alignment too small for spinbit mutex")
	}
}

// mutexWaitListHead recovers a full muintptr that was missing its low bits.
// With the exception of the static m0 value, it requires allocating runtime.m
// values in a size class with a particular minimum alignment. The 2048-byte
// size class allows recovering the full muintptr value even after overwriting
// the low 11 bits with flags. We can use those 11 bits as 3 flags and an
// atomically-swapped byte.
//
//go:nosplit
func mutexWaitListHead(v uintptr) muintptr {
	if highBits := v &^ mutexMMask; highBits == 0 {
		return 0
	} else if m0bits := muintptr(unsafe.Pointer(&m0)); highBits == uintptr(m0bits)&^mutexMMask {
		return m0bits
	} else {
		return muintptr(highBits + mutexMOffset)
	}
}

// mutexPreferLowLatency reports if this mutex prefers low latency at the risk
// of performance collapse. If so, we can allow all waiting threads to spin on
// the state word rather than go to sleep.
//
// TODO: We could have the waiting Ms each spin on their own private cache line,
// especially if we can put a bound on the on-CPU time that would consume.
//
// TODO: If there's a small set of mutex values with special requirements, they
// could make use of a more specialized lock2/unlock2 implementation. Otherwise,
// we're constrained to what we can fit within a single uintptr with no
// additional storage on the M for each lock held.
//
//go:nosplit
func mutexPreferLowLatency(l *mutex) bool {
	switch l {
	default:
		return false
	case &sched.lock:
		// We often expect sched.lock to pass quickly between Ms in a way that
		// each M has unique work to do: for instance when we stop-the-world
		// (bringing each P to idle) or add new netpoller-triggered work to the
		// global run queue.
		return true
	}
}

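// mutexContended reports whether the mutex currently has sleeping waiters
// (the state word's pointer bits are non-zero).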
func mutexContended(l *mutex) bool {
	return atomic.Loaduintptr(&l.key)&^mutexMMask != 0
}

func lock(l *mutex) {
	lockWithRank(l, getLockRank(l))
}

func lock2(l *mutex) {
	gp := getg()
	if gp.m.locks < 0 {
		throw("runtime·lock: lock count")
	}
	gp.m.locks++

	k8 := key8(&l.key)

	// Speculative grab for lock.
	v8 := atomic.Xchg8(k8, mutexLocked)
	if v8&mutexLocked == 0 {
		if v8&mutexSleeping != 0 {
			atomic.Or8(k8, mutexSleeping)
		}
		return
	}
	semacreate(gp.m)

	var startTime int64
	// On uniprocessors, no point spinning.
	// On multiprocessors, spin for mutexActiveSpinCount attempts.
	spin := 0
	if numCPUStartup > 1 {
		spin = mutexActiveSpinCount
	}

	var weSpin, atTail, haveTimers bool
	v := atomic.Loaduintptr(&l.key)
tryAcquire:
	for i := 0; ; i++ {
		if v&mutexLocked == 0 {
			if weSpin {
				next := (v &^ mutexSpinning) | mutexSleeping | mutexLocked
				if next&^mutexMMask == 0 {
					// The fast-path Xchg8 may have cleared mutexSleeping. Fix
					// the hint so unlock2 knows when to use its slow path.
					next = next &^ mutexSleeping
				}
				if atomic.Casuintptr(&l.key, v, next) {
					gp.m.mLockProfile.end(startTime)
					return
				}
			} else {
				prev8 := atomic.Xchg8(k8, mutexLocked|mutexSleeping)
				if prev8&mutexLocked == 0 {
					gp.m.mLockProfile.end(startTime)
					return
				}
			}
			v = atomic.Loaduintptr(&l.key)
			continue tryAcquire
		}

		if !weSpin && v&mutexSpinning == 0 && atomic.Casuintptr(&l.key, v, v|mutexSpinning) {
			v |= mutexSpinning
			weSpin = true
		}

		if weSpin || atTail || mutexPreferLowLatency(l) {
			if i < spin {
				procyield(mutexActiveSpinSize)
				v = atomic.Loaduintptr(&l.key)
				continue tryAcquire
			} else if i < spin+mutexPassiveSpinCount {
				osyield() // TODO: Consider removing this step. See https://go.dev/issue/69268.
				v = atomic.Loaduintptr(&l.key)
				continue tryAcquire
			}
		}

		// Go to sleep
		if v&mutexLocked == 0 {
			throw("runtime·lock: sleeping while lock is available")
		}

		// Collect times for mutex profile (seen in unlock2 only via mWaitList),
		// and for "/sync/mutex/wait/total:seconds" metric (to match).
		if !haveTimers {
			gp.m.mWaitList.startTicks = cputicks()
			startTime = gp.m.mLockProfile.start()
			haveTimers = true
		}
		// Store the current head of the list of sleeping Ms in our gp.m.mWaitList.next field
		gp.m.mWaitList.next = mutexWaitListHead(v)

		// Pack a (partial) pointer to this M with the current lock state bits
		next := (uintptr(unsafe.Pointer(gp.m)) &^ mutexMMask) | v&mutexMMask | mutexSleeping
		if weSpin { // If we were spinning, prepare to retire
			next = next &^ mutexSpinning
		}

		if atomic.Casuintptr(&l.key, v, next) {
			weSpin = false
			// We've pushed ourselves onto the stack of waiters. Wait.
			semasleep(-1)
			atTail = gp.m.mWaitList.next == 0 // we were at risk of starving
			i = 0
		}

		gp.m.mWaitList.next = 0
		v = atomic.Loaduintptr(&l.key)
	}
}

func unlock(l *mutex) {
	unlockWithRank(l)
}

// We might not be holding a p in this code.
//
//go:nowritebarrier
func unlock2(l *mutex) {
	gp := getg()

	var prev8 uint8
	var haveStackLock bool
	var endTicks int64
	if !mutexSampleContention() {
		// Not collecting a sample for the contention profile, do the quick release
		prev8 = atomic.Xchg8(key8(&l.key), 0)
	} else {
		// If there's contention, we'll sample it. Don't allow another
		// lock2/unlock2 pair to finish before us and take our blame. Prevent
		// that by trading for the stack lock with a CAS.
		v := atomic.Loaduintptr(&l.key)
		for {
			if v&^mutexMMask == 0 || v&mutexStackLocked != 0 {
				// No contention, or (stack lock unavailable) no way to calculate it
				prev8 = atomic.Xchg8(key8(&l.key), 0)
				endTicks = 0
				break
			}

			// There's contention, the stack lock appeared to be available, and
			// we'd like to collect a sample for the contention profile.
			if endTicks == 0 {
				// Read the time before releasing the lock. The profile will be
				// strictly smaller than what other threads would see by timing
				// their lock calls.
				endTicks = cputicks()
			}
			next := (v | mutexStackLocked) &^ (mutexLocked | mutexSleeping)
			if atomic.Casuintptr(&l.key, v, next) {
				haveStackLock = true
				prev8 = uint8(v)
				// The fast path of lock2 may have cleared mutexSleeping.
				// Restore it so we're sure to call unlock2Wake below.
				prev8 |= mutexSleeping
				break
			}
			v = atomic.Loaduintptr(&l.key)
		}
	}
	if prev8&mutexLocked == 0 {
		throw("unlock of unlocked lock")
	}

	if prev8&mutexSleeping != 0 {
		unlock2Wake(l, haveStackLock, endTicks)
	}

	gp.m.mLockProfile.store()
	gp.m.locks--
	if gp.m.locks < 0 {
		throw("runtime·unlock: lock count")
	}
	if gp.m.locks == 0 && gp.preempt { // restore the preemption request in case we've cleared it in newstack
		gp.stackguard0 = stackPreempt
	}
}

// mutexSampleContention returns whether the current mutex operation should
// report any contention it discovers.
func mutexSampleContention() bool {
	rate := atomic.Load64(&mutexprofilerate)
	return rate > 0 && cheaprandu64()%rate == 0
}

// unlock2Wake updates the list of Ms waiting on l, waking an M if necessary.
//
//go:nowritebarrier
func unlock2Wake(l *mutex, haveStackLock bool, endTicks int64) {
	v := atomic.Loaduintptr(&l.key)

	// On occasion, seek out and wake the M at the bottom of the stack so it
	// doesn't starve.
	antiStarve := cheaprandn(mutexTailWakePeriod) == 0

	if haveStackLock {
		goto useStackLock
	}

	if !(antiStarve || // avoiding starvation may require a wake
		v&mutexSpinning == 0 || // no spinners means we must wake
		mutexPreferLowLatency(l)) { // prefer waiters be awake as much as possible
		return
	}

	for {
		if v&^mutexMMask == 0 || v&mutexStackLocked != 0 {
			// No waiting Ms means nothing to do.
			//
			// If the stack lock is unavailable, its owner would make the same
			// wake decisions that we would, so there's nothing for us to do.
			//
			// Although: This thread may have a different call stack, which
			// would result in a different entry in the mutex contention profile
			// (upon completion of go.dev/issue/66999). That could lead to weird
			// results if a slow critical section ends but another thread
			// quickly takes the lock, finishes its own critical section,
			// releases the lock, and then grabs the stack lock. That quick
			// thread would then take credit (blame) for the delay that this
			// slow thread caused. The alternative is to have more expensive
			// atomic operations (a CAS) on the critical path of unlock2.
			return
		}
		// Other M's are waiting for the lock.
		// Obtain the stack lock, and pop off an M.
		next := v | mutexStackLocked
		if atomic.Casuintptr(&l.key, v, next) {
			break
		}
		v = atomic.Loaduintptr(&l.key)
	}

	// We own the mutexStackLocked flag. New Ms may push themselves onto the
	// stack concurrently, but we're now the only thread that can remove or
	// modify the Ms that are sleeping in the list.
useStackLock:

	if endTicks != 0 {
		// Find the M at the bottom of the stack of waiters, which has been
		// asleep for the longest. Take the average of its wait time and the
		// head M's wait time for the mutex contention profile, matching the
		// estimate we do in semrelease1 (for sync.Mutex contention).
		//
		// We don't keep track of the tail node (we don't need it often), so do
		// an O(N) walk on the list of sleeping Ms to find it.
		head := mutexWaitListHead(v).ptr()
		for node, n := head, 0; ; {
			n++
			next := node.mWaitList.next.ptr()
			if next == nil {
				cycles := ((endTicks - head.mWaitList.startTicks) + (endTicks - node.mWaitList.startTicks)) / 2
				node.mWaitList.startTicks = endTicks
				head.mWaitList.startTicks = endTicks
				getg().m.mLockProfile.recordUnlock(cycles * int64(n))
				break
			}
			node = next
		}
	}

	var committed *m // If we choose an M within the stack, we've made a promise to wake it
	for {
		headM := v &^ mutexMMask
		flags := v & (mutexMMask &^ mutexStackLocked) // preserve low bits, but release stack lock

		mp := mutexWaitListHead(v).ptr()
		wakem := committed
		if committed == nil {
			if v&mutexSpinning == 0 || mutexPreferLowLatency(l) {
				wakem = mp
			}
			if antiStarve {
				// Wake the M at the bottom of the stack of waiters. (This is
				// O(N) with the number of waiters.)
				wakem = mp
				prev := mp
				for {
					next := wakem.mWaitList.next.ptr()
					if next == nil {
						break
					}
					prev, wakem = wakem, next
				}
				if wakem != mp {
					committed = wakem
					prev.mWaitList.next = wakem.mWaitList.next
					// An M sets its own startTicks when it first goes to sleep.
					// When an unlock operation is sampled for the mutex
					// contention profile, it takes blame for the entire list of
					// waiting Ms but only updates the startTicks value at the
					// tail. Copy any updates to the next-oldest M.
					prev.mWaitList.startTicks = wakem.mWaitList.startTicks
				}
			}
		}

		if wakem == mp {
			headM = uintptr(mp.mWaitList.next) &^ mutexMMask
		}

		next := headM | flags
		if atomic.Casuintptr(&l.key, v, next) {
			if wakem != nil {
				// Claimed an M. Wake it.
				semawakeup(wakem)
			}
			return
		}

		v = atomic.Loaduintptr(&l.key)
	}
}