runtime: speed up cheaprand and cheaprand64

The current cheaprand performs 128-bit multiplication on 64-bit numbers
and truncates the result to 32 bits, which is inefficient.

A 32-bit specific implementation is more performant because it performs
64-bit multiplication on 32-bit numbers instead.

The current cheaprand64 involves two cheaprand calls.
Implementing it as 64-bit wyrand is significantly faster.

Since cheaprand64 discards one bit, I have preserved this behavior.
The underlying uint64 function is made available as cheaprandu64.

               │    old      │                new           │
               │   sec/op    │   sec/op     vs base         │
Cheaprand-8      1.358n ± 0%   1.218n ± 0%  -10.31% (n=100)
Cheaprand64-8    2.424n ± 0%   1.391n ± 0%  -42.62% (n=100)
Blocksampled-8   8.347n ± 0%   2.022n ± 0%  -75.78% (n=100)

Fixes #77149
This commit is contained in:
Gavin Lam
2026-01-09 23:33:38 -05:00
parent e0c4ad77cf
commit 549d8d407e
7 changed files with 75 additions and 25 deletions

View File

@@ -273,6 +273,10 @@ func CountPagesInUse() (pagesInUse, counted uintptr) {
return
}
func Blocksampled(cycles, rate int64) bool { return blocksampled(cycles, rate) }
func Cheaprand() uint32 { return cheaprand() }
func Cheaprand64() int64 { return cheaprand64() }
func Fastrand() uint32 { return uint32(rand()) }
func Fastrand64() uint64 { return rand() }
func Fastrandn(n uint32) uint32 { return randn(n) }

View File

@@ -327,16 +327,8 @@ func unlock2(l *mutex) {
// mutexSampleContention returns whether the current mutex operation should
// report any contention it discovers.
func mutexSampleContention() bool {
if rate := int64(atomic.Load64(&mutexprofilerate)); rate <= 0 {
return false
} else {
// TODO: have SetMutexProfileFraction do the clamping
rate32 := uint32(rate)
if int64(rate32) != rate {
rate32 = ^uint32(0)
}
return cheaprandn(rate32) == 0
}
rate := atomic.Load64(&mutexprofilerate)
return rate > 0 && cheaprandu64()%rate == 0
}
// unlock2Wake updates the list of Ms waiting on l, waking an M if necessary.

View File

@@ -696,8 +696,8 @@ func (prof *mLockProfile) recordUnlock(cycles int64) {
if cycles == 0 {
return
}
prevScore := uint64(cheaprand64()) % uint64(prev)
thisScore := uint64(cheaprand64()) % uint64(cycles)
prevScore := cheaprandu64() % uint64(prev)
thisScore := cheaprandu64() % uint64(cycles)
if prevScore > thisScore {
prof.cyclesLost += cycles
return

16
src/runtime/mprof_test.go Normal file
View File

@@ -0,0 +1,16 @@
// Copyright 2026 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package runtime_test
import (
. "runtime"
"testing"
)
func BenchmarkBlocksampled(b *testing.B) {
for b.Loop() {
Blocksampled(42, 1337)
}
}

View File

@@ -192,7 +192,8 @@ func mrandinit(mp *m) {
}
bootstrapRandReseed() // erase key we just extracted
mp.chacha8.Init64(seed)
mp.cheaprand = rand()
mp.cheaprand = uint32(rand())
mp.cheaprand64 = rand()
}
// randn is like rand() % n but faster.
@@ -227,14 +228,12 @@ func randn(n uint32) uint32 {
func cheaprand() uint32 {
mp := getg().m
// Implement wyrand: https://github.com/wangyi-fudan/wyhash
// Only the platform that bits.Mul64 can be lowered
// by the compiler should be in this list.
if goarch.IsAmd64|goarch.IsArm64|goarch.IsPpc64|
goarch.IsPpc64le|goarch.IsMips64|goarch.IsMips64le|
goarch.IsS390x|goarch.IsRiscv64|goarch.IsLoong64 == 1 {
mp.cheaprand += 0xa0761d6478bd642f
hi, lo := bits.Mul64(mp.cheaprand, mp.cheaprand^0xe7037ed1a0b428db)
return uint32(hi ^ lo)
// Only platforms that support 64-bit multiplication
// natively should take this path.
if bits.UintSize == 64 {
mp.cheaprand += 0x53c5ca59
hi, lo := bits.Mul32(mp.cheaprand, mp.cheaprand^0x74743c1b)
return hi ^ lo
}
// Implement xorshift64+: 2 32-bit xorshift sequences added together.
@@ -242,7 +241,7 @@ func cheaprand() uint32 {
// Xorshift paper: https://www.jstatsoft.org/article/view/v008i14/xorshift.pdf
// This generator passes the SmallCrush suite, part of TestU01 framework:
// http://simul.iro.umontreal.ca/testu01/tu01.html
t := (*[2]uint32)(unsafe.Pointer(&mp.cheaprand))
t := (*[2]uint32)(unsafe.Pointer(&mp.cheaprand64))
s1, s0 := t[0], t[1]
s1 ^= s1 << 17
s1 = s1 ^ s0 ^ s1>>7 ^ s0>>16
@@ -269,7 +268,33 @@ func cheaprand() uint32 {
//go:linkname cheaprand64
//go:nosplit
func cheaprand64() int64 {
return int64(cheaprand())<<31 ^ int64(cheaprand())
return int64(cheaprandu64() & ^(uint64(1) << 63))
}
// cheaprandu64 is a non-cryptographic-quality 64-bit random generator
// suitable for calling at very high frequency (such as during sampling decisions).
// It is "cheap" in the sense of both expense and quality.
//
// cheaprandu64 must not be exported to other packages:
// the rule is that other packages using runtime-provided
// randomness must always use rand.
//
//go:nosplit
func cheaprandu64() uint64 {
// Implement wyrand: https://github.com/wangyi-fudan/wyhash
// Only platforms where bits.Mul64 can be lowered
// by the compiler should be in this list.
if goarch.IsAmd64|goarch.IsArm64|goarch.IsPpc64|
goarch.IsPpc64le|goarch.IsMips64|goarch.IsMips64le|
goarch.IsS390x|goarch.IsRiscv64|goarch.IsLoong64 == 1 {
mp := getg().m
// Implement wyrand: https://github.com/wangyi-fudan/wyhash
mp.cheaprand64 += 0xa0761d6478bd642f
hi, lo := bits.Mul64(mp.cheaprand64, mp.cheaprand64^0xe7037ed1a0b428db)
return hi ^ lo
}
return uint64(cheaprand())<<32 | uint64(cheaprand())
}
// cheaprandn is like cheaprand() % n but faster.

View File

@@ -22,6 +22,18 @@ func TestReadRandom(t *testing.T) {
}
}
func BenchmarkCheaprand(b *testing.B) {
for b.Loop() {
Cheaprand()
}
}
func BenchmarkCheaprand64(b *testing.B) {
for b.Loop() {
Cheaprand64()
}
}
func BenchmarkFastrand(b *testing.B) {
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {

View File

@@ -715,8 +715,9 @@ type m struct {
mOS
chacha8 chacha8rand.State
cheaprand uint64
chacha8 chacha8rand.State
cheaprand uint32
cheaprand64 uint64
// Up to 10 locks held by this m, maintained by the lock ranking code.
locksHeldLen int