Mirror of https://github.com/golang/go.git
The current cheaprand performs a 128-bit multiplication on 64-bit numbers
and truncates the result to 32 bits, which is inefficient.
A 32-bit-specific implementation is faster because it performs a 64-bit
multiplication on 32-bit numbers instead.
The current cheaprand64 involves two cheaprand calls.
Implementing it as 64-bit wyrand is significantly faster.
Since cheaprand64 discards one bit, I have preserved this behavior.
The underlying uint64 function is made available as cheaprandu64.
                │     old     │                new                 │
                │   sec/op    │   sec/op      vs base              │
Cheaprand-8       1.358n ± 0%    1.218n ± 0%   -10.31% (n=100)
Cheaprand64-8     2.424n ± 0%    1.391n ± 0%   -42.62% (n=100)
Blocksampled-8    8.347n ± 0%    2.022n ± 0%   -75.78% (n=100)
Fixes #77149
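For illustration only, here is a minimal sketch (not the patch itself; the 32-bit constants below are placeholders) of the two mixing shapes the message contrasts: a single 64-bit multiply of 32-bit values versus a full 128-bit product of 64-bit values.

// Illustrative sketch, not the runtime's implementation.
package main

import (
	"fmt"
	"math/bits"
)

// rand32 mixes a 32-bit state with one 64-bit multiplication,
// folding the two halves of the product together.
func rand32(state *uint32) uint32 {
	*state += 0x9e3779b9 // placeholder additive constant
	x := uint64(*state) * uint64(*state^0x85ebca6b)
	return uint32(x) ^ uint32(x>>32)
}

// rand64 mixes a 64-bit state wyrand-style, which requires the
// full 128-bit product that bits.Mul64 computes.
func rand64(state *uint64) uint64 {
	*state += 0xa0761d6478bd642f
	hi, lo := bits.Mul64(*state, *state^0xe7037ed1a0b428db)
	return hi ^ lo
}

func main() {
	var s32 uint32 = 1
	var s64 uint64 = 1
	fmt.Println(rand32(&s32), rand64(&s64))
}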
461 lines
14 KiB
Go
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !wasm

package runtime

import (
	"internal/goarch"
	"internal/runtime/atomic"
	"internal/runtime/gc"
	"unsafe"
)

// This implementation depends on OS-specific implementations of
//
// func semacreate(mp *m)
//	Create a semaphore for mp, if it does not already have one.
//
// func semasleep(ns int64) int32
//	If ns < 0, acquire m's semaphore and return 0.
//	If ns >= 0, try to acquire m's semaphore for at most ns nanoseconds.
//	Return 0 if the semaphore was acquired, -1 if interrupted or timed out.
//
// func semawakeup(mp *m)
//	Wake up mp, which is or will soon be sleeping on its semaphore.

// The mutex state consists of four flags and a pointer. The flag at bit 0,
// mutexLocked, represents the lock itself. Bit 1, mutexSleeping, is a hint that
// the pointer is non-nil. The fast paths for locking and unlocking the mutex
// are based on atomic 8-bit swap operations on the low byte; bits 2 through 7
// are unused.
//
// Bit 8, mutexSpinning, is a try-lock that grants a waiting M permission to
// spin on the state word. Most other Ms must attempt to spend their time
// sleeping to reduce traffic on the cache line. This is the "spin bit" for
// which the implementation is named. (The anti-starvation mechanism also grants
// temporary permission for an M to spin.)
//
// Bit 9, mutexStackLocked, is a try-lock that grants an unlocking M permission
// to inspect the list of waiting Ms and to pop an M off of that stack.
//
// The upper bits hold a (partial) pointer to the M that most recently went to
// sleep. The sleeping Ms form a stack linked by their mWaitList.next fields.
// Because the fast paths use an 8-bit swap on the low byte of the state word,
// we'll need to reconstruct the full M pointer from the bits we have. Most Ms
// are allocated on the heap, and have a known alignment and base offset. (The
// offset is due to mallocgc's allocation headers.) The main program thread uses
// a static M value, m0. We check for m0 specifically and add a known offset
// otherwise.

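// In summary, the layout of the state word is:
//
//	bits 10 and up: partial pointer to the most recently slept M
//	bit  9:         mutexStackLocked
//	bit  8:         mutexSpinning
//	bits 2..7:      unused
//	bit  1:         mutexSleeping
//	bit  0:         mutexLocked
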
const (
	active_spin     = 4  // referenced in proc.go for sync.Mutex implementation
	active_spin_cnt = 30 // referenced in proc.go for sync.Mutex implementation
)

const (
	mutexLocked      = 0x001
	mutexSleeping    = 0x002
	mutexSpinning    = 0x100
	mutexStackLocked = 0x200
	mutexMMask       = 0x3FF
	mutexMOffset     = gc.MallocHeaderSize // alignment of heap-allocated Ms (those other than m0)

	mutexActiveSpinCount  = 4
	mutexActiveSpinSize   = 30
	mutexPassiveSpinCount = 1

	mutexTailWakePeriod = 16
)

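// key8 returns a pointer to the least significant byte of *p, so that the
// mutex fast paths can operate on the flag byte with 8-bit atomics.
//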
//go:nosplit
func key8(p *uintptr) *uint8 {
	if goarch.BigEndian {
		return &(*[8]uint8)(unsafe.Pointer(p))[goarch.PtrSize/1-1]
	}
	return &(*[8]uint8)(unsafe.Pointer(p))[0]
}

// mWaitList is part of the M struct, and holds the list of Ms that are waiting
// for a particular runtime.mutex.
//
// When an M is unable to immediately obtain a lock, it adds itself to the list
// of Ms waiting for the lock. It does that via this struct's next field,
// forming a singly-linked list with the mutex's key field pointing to the head
// of the list.
type mWaitList struct {
	next       muintptr // next m waiting for lock
	startTicks int64    // when this m started waiting for the current lock holder, in cputicks
}

// lockVerifyMSize confirms that we can recreate the low bits of the M pointer.
func lockVerifyMSize() {
	size := roundupsize(unsafe.Sizeof(mPadded{}), false) + gc.MallocHeaderSize
	if size&mutexMMask != 0 {
		print("M structure uses sizeclass ", size, "/", hex(size), " bytes; ",
			"incompatible with mutex flag mask ", hex(mutexMMask), "\n")
		throw("runtime.m memory alignment too small for spinbit mutex")
	}
}

// mutexWaitListHead recovers a full muintptr that was missing its low bits.
// With the exception of the static m0 value, it requires allocating runtime.m
// values in a size class with a particular minimum alignment. The 2048-byte
// size class allows recovering the full muintptr value even after overwriting
// the low 11 bits with flags. We can use those 11 bits as 3 flags and an
// atomically-swapped byte.
//
//go:nosplit
func mutexWaitListHead(v uintptr) muintptr {
	if highBits := v &^ mutexMMask; highBits == 0 {
		return 0
	} else if m0bits := muintptr(unsafe.Pointer(&m0)); highBits == uintptr(m0bits)&^mutexMMask {
		return m0bits
	} else {
		return muintptr(highBits + mutexMOffset)
	}
}

// mutexPreferLowLatency reports if this mutex prefers low latency at the risk
// of performance collapse. If so, we can allow all waiting threads to spin on
// the state word rather than go to sleep.
//
// TODO: We could have the waiting Ms each spin on their own private cache line,
// especially if we can put a bound on the on-CPU time that would consume.
//
// TODO: If there's a small set of mutex values with special requirements, they
// could make use of a more specialized lock2/unlock2 implementation. Otherwise,
// we're constrained to what we can fit within a single uintptr with no
// additional storage on the M for each lock held.
//
//go:nosplit
func mutexPreferLowLatency(l *mutex) bool {
	switch l {
	default:
		return false
	case &sched.lock:
		// We often expect sched.lock to pass quickly between Ms in a way that
		// each M has unique work to do: for instance when we stop-the-world
		// (bringing each P to idle) or add new netpoller-triggered work to the
		// global run queue.
		return true
	}
}

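// mutexContended reports whether the mutex currently has sleeping waiters
// (the state word's pointer bits are non-zero).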
func mutexContended(l *mutex) bool {
	return atomic.Loaduintptr(&l.key)&^mutexMMask != 0
}

func lock(l *mutex) {
	lockWithRank(l, getLockRank(l))
}

func lock2(l *mutex) {
	gp := getg()
	if gp.m.locks < 0 {
		throw("runtime·lock: lock count")
	}
	gp.m.locks++

	k8 := key8(&l.key)

	// Speculative grab for lock.
	v8 := atomic.Xchg8(k8, mutexLocked)
	if v8&mutexLocked == 0 {
		if v8&mutexSleeping != 0 {
			atomic.Or8(k8, mutexSleeping)
		}
		return
	}
	semacreate(gp.m)

	var startTime int64
	// On uniprocessors, no point spinning.
	// On multiprocessors, spin for mutexActiveSpinCount attempts.
	spin := 0
	if numCPUStartup > 1 {
		spin = mutexActiveSpinCount
	}

	var weSpin, atTail, haveTimers bool
	v := atomic.Loaduintptr(&l.key)
tryAcquire:
	for i := 0; ; i++ {
		if v&mutexLocked == 0 {
			if weSpin {
				next := (v &^ mutexSpinning) | mutexSleeping | mutexLocked
				if next&^mutexMMask == 0 {
					// The fast-path Xchg8 may have cleared mutexSleeping. Fix
					// the hint so unlock2 knows when to use its slow path.
					next = next &^ mutexSleeping
				}
				if atomic.Casuintptr(&l.key, v, next) {
					gp.m.mLockProfile.end(startTime)
					return
				}
			} else {
				prev8 := atomic.Xchg8(k8, mutexLocked|mutexSleeping)
				if prev8&mutexLocked == 0 {
					gp.m.mLockProfile.end(startTime)
					return
				}
			}
			v = atomic.Loaduintptr(&l.key)
			continue tryAcquire
		}

		if !weSpin && v&mutexSpinning == 0 && atomic.Casuintptr(&l.key, v, v|mutexSpinning) {
			v |= mutexSpinning
			weSpin = true
		}

		if weSpin || atTail || mutexPreferLowLatency(l) {
			if i < spin {
				procyield(mutexActiveSpinSize)
				v = atomic.Loaduintptr(&l.key)
				continue tryAcquire
			} else if i < spin+mutexPassiveSpinCount {
				osyield() // TODO: Consider removing this step. See https://go.dev/issue/69268.
				v = atomic.Loaduintptr(&l.key)
				continue tryAcquire
			}
		}

		// Go to sleep
		if v&mutexLocked == 0 {
			throw("runtime·lock: sleeping while lock is available")
		}

		// Collect times for mutex profile (seen in unlock2 only via mWaitList),
		// and for "/sync/mutex/wait/total:seconds" metric (to match).
		if !haveTimers {
			gp.m.mWaitList.startTicks = cputicks()
			startTime = gp.m.mLockProfile.start()
			haveTimers = true
		}
		// Store the current head of the list of sleeping Ms in our gp.m.mWaitList.next field
		gp.m.mWaitList.next = mutexWaitListHead(v)

		// Pack a (partial) pointer to this M with the current lock state bits
		next := (uintptr(unsafe.Pointer(gp.m)) &^ mutexMMask) | v&mutexMMask | mutexSleeping
		if weSpin { // If we were spinning, prepare to retire
			next = next &^ mutexSpinning
		}

		if atomic.Casuintptr(&l.key, v, next) {
			weSpin = false
			// We've pushed ourselves onto the stack of waiters. Wait.
			semasleep(-1)
			atTail = gp.m.mWaitList.next == 0 // we were at risk of starving
			i = 0
		}

		gp.m.mWaitList.next = 0
		v = atomic.Loaduintptr(&l.key)
	}
}

func unlock(l *mutex) {
	unlockWithRank(l)
}

// We might not be holding a p in this code.
//
//go:nowritebarrier
func unlock2(l *mutex) {
	gp := getg()

	var prev8 uint8
	var haveStackLock bool
	var endTicks int64
	if !mutexSampleContention() {
		// Not collecting a sample for the contention profile, do the quick release
		prev8 = atomic.Xchg8(key8(&l.key), 0)
	} else {
		// If there's contention, we'll sample it. Don't allow another
		// lock2/unlock2 pair to finish before us and take our blame. Prevent
		// that by trading for the stack lock with a CAS.
		v := atomic.Loaduintptr(&l.key)
		for {
			if v&^mutexMMask == 0 || v&mutexStackLocked != 0 {
				// No contention, or (stack lock unavailable) no way to calculate it
				prev8 = atomic.Xchg8(key8(&l.key), 0)
				endTicks = 0
				break
			}

			// There's contention, the stack lock appeared to be available, and
			// we'd like to collect a sample for the contention profile.
			if endTicks == 0 {
				// Read the time before releasing the lock. The profile will be
				// strictly smaller than what other threads would see by timing
				// their lock calls.
				endTicks = cputicks()
			}
			next := (v | mutexStackLocked) &^ (mutexLocked | mutexSleeping)
			if atomic.Casuintptr(&l.key, v, next) {
				haveStackLock = true
				prev8 = uint8(v)
				// The fast path of lock2 may have cleared mutexSleeping.
				// Restore it so we're sure to call unlock2Wake below.
				prev8 |= mutexSleeping
				break
			}
			v = atomic.Loaduintptr(&l.key)
		}
	}
	if prev8&mutexLocked == 0 {
		throw("unlock of unlocked lock")
	}

	if prev8&mutexSleeping != 0 {
		unlock2Wake(l, haveStackLock, endTicks)
	}

	gp.m.mLockProfile.store()
	gp.m.locks--
	if gp.m.locks < 0 {
		throw("runtime·unlock: lock count")
	}
	if gp.m.locks == 0 && gp.preempt { // restore the preemption request in case we've cleared it in newstack
		gp.stackguard0 = stackPreempt
	}
}

// mutexSampleContention returns whether the current mutex operation should
// report any contention it discovers.
func mutexSampleContention() bool {
	rate := atomic.Load64(&mutexprofilerate)
	return rate > 0 && cheaprandu64()%rate == 0
}

// unlock2Wake updates the list of Ms waiting on l, waking an M if necessary.
//
//go:nowritebarrier
func unlock2Wake(l *mutex, haveStackLock bool, endTicks int64) {
	v := atomic.Loaduintptr(&l.key)

	// On occasion, seek out and wake the M at the bottom of the stack so it
	// doesn't starve.
	antiStarve := cheaprandn(mutexTailWakePeriod) == 0

	if haveStackLock {
		goto useStackLock
	}

	if !(antiStarve || // avoiding starvation may require a wake
		v&mutexSpinning == 0 || // no spinners means we must wake
		mutexPreferLowLatency(l)) { // prefer waiters be awake as much as possible
		return
	}

	for {
		if v&^mutexMMask == 0 || v&mutexStackLocked != 0 {
			// No waiting Ms means nothing to do.
			//
			// If the stack lock is unavailable, its owner would make the same
			// wake decisions that we would, so there's nothing for us to do.
			//
			// Although: This thread may have a different call stack, which
			// would result in a different entry in the mutex contention profile
			// (upon completion of go.dev/issue/66999). That could lead to weird
			// results if a slow critical section ends but another thread
			// quickly takes the lock, finishes its own critical section,
			// releases the lock, and then grabs the stack lock. That quick
			// thread would then take credit (blame) for the delay that this
			// slow thread caused. The alternative is to have more expensive
			// atomic operations (a CAS) on the critical path of unlock2.
			return
		}
		// Other M's are waiting for the lock.
		// Obtain the stack lock, and pop off an M.
		next := v | mutexStackLocked
		if atomic.Casuintptr(&l.key, v, next) {
			break
		}
		v = atomic.Loaduintptr(&l.key)
	}

	// We own the mutexStackLocked flag. New Ms may push themselves onto the
	// stack concurrently, but we're now the only thread that can remove or
	// modify the Ms that are sleeping in the list.
useStackLock:

	if endTicks != 0 {
		// Find the M at the bottom of the stack of waiters, which has been
		// asleep for the longest. Take the average of its wait time and the
		// head M's wait time for the mutex contention profile, matching the
		// estimate we do in semrelease1 (for sync.Mutex contention).
		//
		// We don't keep track of the tail node (we don't need it often), so do
		// an O(N) walk on the list of sleeping Ms to find it.
		head := mutexWaitListHead(v).ptr()
		for node, n := head, 0; ; {
			n++
			next := node.mWaitList.next.ptr()
			if next == nil {
				cycles := ((endTicks - head.mWaitList.startTicks) + (endTicks - node.mWaitList.startTicks)) / 2
				node.mWaitList.startTicks = endTicks
				head.mWaitList.startTicks = endTicks
				getg().m.mLockProfile.recordUnlock(cycles * int64(n))
				break
			}
			node = next
		}
	}

	var committed *m // If we choose an M within the stack, we've made a promise to wake it
	for {
		headM := v &^ mutexMMask
		flags := v & (mutexMMask &^ mutexStackLocked) // preserve low bits, but release stack lock

		mp := mutexWaitListHead(v).ptr()
		wakem := committed
		if committed == nil {
			if v&mutexSpinning == 0 || mutexPreferLowLatency(l) {
				wakem = mp
			}
			if antiStarve {
				// Wake the M at the bottom of the stack of waiters. (This is
				// O(N) with the number of waiters.)
				wakem = mp
				prev := mp
				for {
					next := wakem.mWaitList.next.ptr()
					if next == nil {
						break
					}
					prev, wakem = wakem, next
				}
				if wakem != mp {
					committed = wakem
					prev.mWaitList.next = wakem.mWaitList.next
					// An M sets its own startTicks when it first goes to sleep.
					// When an unlock operation is sampled for the mutex
					// contention profile, it takes blame for the entire list of
					// waiting Ms but only updates the startTicks value at the
					// tail. Copy any updates to the next-oldest M.
					prev.mWaitList.startTicks = wakem.mWaitList.startTicks
				}
			}
		}

		if wakem == mp {
			headM = uintptr(mp.mWaitList.next) &^ mutexMMask
		}

		next := headM | flags
		if atomic.Casuintptr(&l.key, v, next) {
			if wakem != nil {
				// Claimed an M. Wake it.
				semawakeup(wakem)
			}
			return
		}

		v = atomic.Loaduintptr(&l.key)
	}
}