runtime: speed up cheaprand and cheaprand64

The current cheaprand performs 128-bit multiplication on 64-bit numbers
and truncates the result to 32 bits, which is inefficient.

A 32-bit specific implementation is more performant because it performs
64-bit multiplication on 32-bit numbers instead.

The current cheaprand64 involves two cheaprand calls.
Implementing it as 64-bit wyrand is significantly faster.

Since cheaprand64 discards one bit, I have preserved this behavior.
The underlying uint64 function is made available as cheaprandu64.

               │    old      │                new           │
               │   sec/op    │   sec/op     vs base         │
Cheaprand-8      1.358n ± 0%   1.218n ± 0%  -10.31% (n=100)
Cheaprand64-8    2.424n ± 0%   1.391n ± 0%  -42.62% (n=100)
Blocksampled-8   8.347n ± 0%   2.022n ± 0%  -75.78% (n=100)

Fixes #77149
This commit is contained in:
Gavin Lam
2026-01-09 23:33:38 -05:00
parent e0c4ad77cf
commit 549d8d407e
7 changed files with 75 additions and 25 deletions

View File

@@ -273,6 +273,10 @@ func CountPagesInUse() (pagesInUse, counted uintptr) {
return
}
func Blocksampled(cycles, rate int64) bool { return blocksampled(cycles, rate) }
func Cheaprand() uint32 { return cheaprand() }
func Cheaprand64() int64 { return cheaprand64() }
func Fastrand() uint32 { return uint32(rand()) }
func Fastrand64() uint64 { return rand() }
func Fastrandn(n uint32) uint32 { return randn(n) }

View File

@@ -327,16 +327,8 @@ func unlock2(l *mutex) {
// mutexSampleContention returns whether the current mutex operation should
// report any contention it discovers.
func mutexSampleContention() bool {
if rate := int64(atomic.Load64(&mutexprofilerate)); rate <= 0 {
return false
} else {
// TODO: have SetMutexProfileFraction do the clamping
rate32 := uint32(rate)
if int64(rate32) != rate {
rate32 = ^uint32(0)
}
return cheaprandn(rate32) == 0
}
rate := atomic.Load64(&mutexprofilerate)
return rate > 0 && cheaprandu64()%rate == 0
}
// unlock2Wake updates the list of Ms waiting on l, waking an M if necessary.

View File

@@ -696,8 +696,8 @@ func (prof *mLockProfile) recordUnlock(cycles int64) {
if cycles == 0 {
return
}
prevScore := uint64(cheaprand64()) % uint64(prev)
thisScore := uint64(cheaprand64()) % uint64(cycles)
prevScore := cheaprandu64() % uint64(prev)
thisScore := cheaprandu64() % uint64(cycles)
if prevScore > thisScore {
prof.cyclesLost += cycles
return

16
src/runtime/mprof_test.go Normal file
View File

@@ -0,0 +1,16 @@
// Copyright 2026 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package runtime_test
import (
. "runtime"
"testing"
)
func BenchmarkBlocksampled(b *testing.B) {
for b.Loop() {
Blocksampled(42, 1337)
}
}

View File

@@ -192,7 +192,8 @@ func mrandinit(mp *m) {
}
bootstrapRandReseed() // erase key we just extracted
mp.chacha8.Init64(seed)
mp.cheaprand = rand()
mp.cheaprand = uint32(rand())
mp.cheaprand64 = rand()
}
// randn is like rand() % n but faster.
@@ -227,14 +228,12 @@ func randn(n uint32) uint32 {
func cheaprand() uint32 {
mp := getg().m
// Implement wyrand: https://github.com/wangyi-fudan/wyhash
// Only the platform that bits.Mul64 can be lowered
// by the compiler should be in this list.
if goarch.IsAmd64|goarch.IsArm64|goarch.IsPpc64|
goarch.IsPpc64le|goarch.IsMips64|goarch.IsMips64le|
goarch.IsS390x|goarch.IsRiscv64|goarch.IsLoong64 == 1 {
mp.cheaprand += 0xa0761d6478bd642f
hi, lo := bits.Mul64(mp.cheaprand, mp.cheaprand^0xe7037ed1a0b428db)
return uint32(hi ^ lo)
// Only platforms that support 64-bit multiplication
// natively should take this path.
if bits.UintSize == 64 {
mp.cheaprand += 0x53c5ca59
hi, lo := bits.Mul32(mp.cheaprand, mp.cheaprand^0x74743c1b)
return hi ^ lo
}
// Implement xorshift64+: 2 32-bit xorshift sequences added together.
@@ -242,7 +241,7 @@ func cheaprand() uint32 {
// Xorshift paper: https://www.jstatsoft.org/article/view/v008i14/xorshift.pdf
// This generator passes the SmallCrush suite, part of TestU01 framework:
// http://simul.iro.umontreal.ca/testu01/tu01.html
t := (*[2]uint32)(unsafe.Pointer(&mp.cheaprand))
t := (*[2]uint32)(unsafe.Pointer(&mp.cheaprand64))
s1, s0 := t[0], t[1]
s1 ^= s1 << 17
s1 = s1 ^ s0 ^ s1>>7 ^ s0>>16
@@ -269,7 +268,33 @@ func cheaprand() uint32 {
//go:linkname cheaprand64
//go:nosplit
func cheaprand64() int64 {
return int64(cheaprand())<<31 ^ int64(cheaprand())
return int64(cheaprandu64() & ^(uint64(1) << 63))
}
// cheaprandu64 is a non-cryptographic-quality 64-bit random generator
// suitable for calling at very high frequency (such as during sampling decisions).
// It is "cheap" in the sense of both expense and quality.
//
// cheaprandu64 must not be exported to other packages:
// the rule is that other packages using runtime-provided
// randomness must always use rand.
//
//go:nosplit
func cheaprandu64() uint64 {
// Implement wyrand: https://github.com/wangyi-fudan/wyhash
// Only platforms where bits.Mul64 can be lowered
// by the compiler should be in this list.
if goarch.IsAmd64|goarch.IsArm64|goarch.IsPpc64|
goarch.IsPpc64le|goarch.IsMips64|goarch.IsMips64le|
goarch.IsS390x|goarch.IsRiscv64|goarch.IsLoong64 == 1 {
mp := getg().m
// Implement wyrand: https://github.com/wangyi-fudan/wyhash
mp.cheaprand64 += 0xa0761d6478bd642f
hi, lo := bits.Mul64(mp.cheaprand64, mp.cheaprand64^0xe7037ed1a0b428db)
return hi ^ lo
}
return uint64(cheaprand())<<32 | uint64(cheaprand())
}
// cheaprandn is like cheaprand() % n but faster.

View File

@@ -22,6 +22,18 @@ func TestReadRandom(t *testing.T) {
}
}
func BenchmarkCheaprand(b *testing.B) {
for b.Loop() {
Cheaprand()
}
}
func BenchmarkCheaprand64(b *testing.B) {
for b.Loop() {
Cheaprand64()
}
}
func BenchmarkFastrand(b *testing.B) {
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {

View File

@@ -715,8 +715,9 @@ type m struct {
mOS
chacha8 chacha8rand.State
cheaprand uint64
chacha8 chacha8rand.State
cheaprand uint32
cheaprand64 uint64
// Up to 10 locks held by this m, maintained by the lock ranking code.
locksHeldLen int