internal/runtime/gc/scan: import scan kernel from gclab [green tea]

This change imports the AVX512 GC scanning kernel from CL 593938 into a
new package, internal/runtime/gc/scan. Credit to Austin Clements for
most of this work. I did some cleanup, added support for more size
classes to the expanders, and added more testing. I also restructured
the code to make it easier and clearer to add new scan kernels for new
architectures.

For #73581.

Change-Id: I76bcbc889fa6cad73ba0084620fae084a5912e6b
Cq-Include-Trybots: luci.golang.try:gotip-linux-amd64_avx512,gotip-linux-amd64_avx512-greenteagc
Reviewed-on: https://go-review.googlesource.com/c/go/+/655280
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
Author: Michael Anthony Knyszek
Date: 2025-03-05 20:12:47 +00:00
Committed by: Gopher Robot
Parent: 182336bf05
Commit: 889ab74169
34 changed files with 5426 additions and 14 deletions


@@ -52,6 +52,7 @@ var runtimePkgs = []string{
"internal/runtime/cgroup",
"internal/runtime/exithook",
"internal/runtime/gc",
"internal/runtime/gc/scan",
"internal/runtime/maps",
"internal/runtime/math",
"internal/runtime/strconv",


@@ -100,6 +100,7 @@ var depsRules = `
< internal/runtime/maps
< internal/runtime/strconv
< internal/runtime/cgroup
< internal/runtime/gc/scan
< runtime
< sync/atomic
< internal/sync
@@ -797,6 +798,20 @@ var depsRules = `
FMT, testing < internal/cgrouptest;
C, CGO < internal/runtime/cgobench;
# Generate-only packages can have anything they want
container/heap,
encoding/binary,
fmt,
hash/maphash,
io,
log,
math/bits,
os,
reflect,
strings,
sync
< internal/runtime/gc/internal/gen;
`
// listStdPkgs returns the same list of packages as "go list std".


@@ -34,15 +34,19 @@ var X86 struct {
HasAVX512 bool // Virtual feature: F+CD+BW+DQ+VL
HasAVX512F bool
HasAVX512CD bool
HasAVX512BITALG bool
HasAVX512BW bool
HasAVX512DQ bool
HasAVX512VL bool
HasAVX512VPCLMULQDQ bool
HasAVX512VBMI bool
HasAVX512VBMI2 bool
HasBMI1 bool
HasBMI2 bool
HasERMS bool
HasFSRM bool
HasFMA bool
HasGFNI bool
HasOSXSAVE bool
HasPCLMULQDQ bool
HasPOPCNT bool


@@ -18,7 +18,7 @@ func xgetbv() (eax, edx uint32)
func getGOAMD64level() int32
const (
// ecx bits
// Bits returned in ECX for CPUID EAX=0x1 ECX=0x0
cpuid_SSE3 = 1 << 0
cpuid_PCLMULQDQ = 1 << 1
cpuid_SSSE3 = 1 << 9
@@ -30,7 +30,7 @@ const (
cpuid_OSXSAVE = 1 << 27
cpuid_AVX = 1 << 28
// ebx bits
// "Extended Feature Flag" bits returned in EBX for CPUID EAX=0x7 ECX=0x0
cpuid_BMI1 = 1 << 3
cpuid_AVX2 = 1 << 5
cpuid_BMI2 = 1 << 8
@@ -43,8 +43,12 @@ const (
cpuid_AVX512BW = 1 << 30
cpuid_AVX512VL = 1 << 31
// ecx bits
// "Extended Feature Flag" bits returned in ECX for CPUID EAX=0x7 ECX=0x0
cpuid_AVX512_VBMI = 1 << 1
cpuid_AVX512_VBMI2 = 1 << 6
cpuid_GFNI = 1 << 8
cpuid_AVX512VPCLMULQDQ = 1 << 10
cpuid_AVX512_BITALG = 1 << 12
// edx bits
cpuid_FSRM = 1 << 4
@@ -163,6 +167,10 @@ func doinit() {
X86.HasAVX512DQ = isSet(ebx7, cpuid_AVX512DQ)
X86.HasAVX512VL = isSet(ebx7, cpuid_AVX512VL)
X86.HasAVX512VPCLMULQDQ = isSet(ecx7, cpuid_AVX512VPCLMULQDQ)
X86.HasAVX512VBMI = isSet(ecx7, cpuid_AVX512_VBMI)
X86.HasAVX512VBMI2 = isSet(ecx7, cpuid_AVX512_VBMI2)
X86.HasGFNI = isSet(ecx7, cpuid_GFNI)
X86.HasAVX512BITALG = isSet(ecx7, cpuid_AVX512_BITALG)
}
X86.HasFSRM = isSet(edx7, cpuid_FSRM)


@@ -0,0 +1,11 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !386 && !amd64
package cpu
func DataCacheSizes() []uintptr {
return nil
}


@@ -0,0 +1,121 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build 386 || amd64
package cpu
// DataCacheSizes returns the size of each data cache from lowest
// level in the hierarchy to highest.
//
// Unlike other parts of this package's public API, it is not safe
// to reference early in runtime initialization because it allocates.
// It's intended for testing only.
func DataCacheSizes() []uintptr {
maxFunctionInformation, ebx0, ecx0, edx0 := cpuid(0, 0)
if maxFunctionInformation < 1 {
return nil
}
switch {
// Check for "GenuineIntel"
case ebx0 == 0x756E6547 && ecx0 == 0x6C65746E && edx0 == 0x49656E69:
return getDataCacheSizesIntel(maxFunctionInformation)
// Check for "AuthenticAMD"
case ebx0 == 0x68747541 && ecx0 == 0x444D4163 && edx0 == 0x69746E65:
return getDataCacheSizesAMD()
}
return nil
}
func extractBits(arg uint32, l int, r int) uint32 {
if l > r {
panic("bad bit range")
}
return (arg >> l) & ((1 << (r - l + 1)) - 1)
}
func getDataCacheSizesIntel(maxID uint32) []uintptr {
// Constants for cache types
const (
noCache = 0
dataCache = 1
instructionCache = 2
unifiedCache = 3
)
if maxID < 4 {
return nil
}
// Iterate through CPUID leaf 4 (deterministic cache parameters)
var caches []uintptr
for i := uint32(0); i < 0xFFFF; i++ {
eax, ebx, ecx, _ := cpuid(4, i)
cacheType := eax & 0xF // EAX bits 4-0: Cache Type
if cacheType == 0 {
break
}
// Report only data caches.
if !(cacheType == dataCache || cacheType == unifiedCache) {
continue
}
// Guaranteed to always start counting from 1.
level := (eax >> 5) & 0x7
lineSize := extractBits(ebx, 0, 11) + 1 // Bits 11-0: Line size in bytes - 1
partitions := extractBits(ebx, 12, 21) + 1 // Bits 21-12: Physical line partitions - 1
ways := extractBits(ebx, 22, 31) + 1 // Bits 31-22: Ways of associativity - 1
sets := uint64(ecx) + 1 // Number of sets - 1
size := uint64(ways*partitions*lineSize) * sets // Calculate cache size in bytes
caches = append(caches, uintptr(size))
// If we see more than one cache described per level, or they appear
// out of order, crash.
//
// Going by the SDM, it's not clear whether this is actually possible,
// so this code is purely defensive.
if level != uint32(len(caches)) {
panic("expected levels to be in order and for there to be one data/unified cache per level")
}
}
return caches
}
func getDataCacheSizesAMD() []uintptr {
maxExtendedFunctionInformation, _, _, _ := cpuid(0x80000000, 0)
if maxExtendedFunctionInformation < 0x80000006 {
return nil
}
var caches []uintptr
_, _, ecx5, _ := cpuid(0x80000005, 0)
_, _, ecx6, edx6 := cpuid(0x80000006, 0)
// The sizes are reported in KB; shift left by 10 to convert to bytes.
l1dSize := uintptr(extractBits(ecx5, 24, 31) << 10)
caches = append(caches, l1dSize)
// Check that L2 cache is present.
if l2Assoc := extractBits(ecx6, 12, 15); l2Assoc == 0 {
return caches
}
l2Size := uintptr(extractBits(ecx6, 16, 31) << 10)
caches = append(caches, l2Size)
// Check that L3 cache is present.
if l3Assoc := extractBits(edx6, 12, 15); l3Assoc == 0 {
return caches
}
// Specifies the L3 cache size is within the following range:
// (L3Size[31:18] * 512KB) <= L3 cache size < ((L3Size[31:18]+1) * 512KB).
l3Size := uintptr(extractBits(edx6, 18, 31) * (512 << 10))
caches = append(caches, l3Size)
return caches
}
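
As a worked example of the CPUID leaf 4 arithmetic above (illustrative only; the register values below are hypothetical, not taken from a real CPU), a 12-way, 64-set data cache with 64-byte lines and one partition works out to 48 KiB:

package main

import "fmt"

func main() {
	// Hypothetical CPUID leaf 4 fields, each encoded as value-1 per the SDM.
	lineSize := uint32(63) + 1  // EBX bits 11-0:  64-byte cache lines
	partitions := uint32(0) + 1 // EBX bits 21-12: 1 physical line partition
	ways := uint32(11) + 1      // EBX bits 31-22: 12 ways of associativity
	sets := uint64(63) + 1      // ECX:            64 sets

	size := uint64(ways*partitions*lineSize) * sets
	fmt.Println(size) // 49152 bytes, i.e. a 48 KiB data cache
}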


@@ -0,0 +1,26 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build 386 || amd64
package cpu_test
import (
"internal/cpu"
"testing"
)
// Tests fetching data cache sizes. This test only checks that DataCacheSizes
// won't explode. Otherwise it's just informational, and dumps the current
// data cache sizes.
func TestDataCacheSizes(t *testing.T) {
// N.B. Don't try to check these values because we don't know what
// kind of environment we're running in. We don't want this test to
// fail on some random x86 chip that happens to not support the right
// CPUID bits for some reason.
caches := cpu.DataCacheSizes()
for i, size := range caches {
t.Logf("L%d: %d", i+1, size)
}
}


@@ -34,6 +34,9 @@ const (
// It is also the size of the machine's native word size (that is, 4 on 32-bit systems, 8 on 64-bit).
const PtrSize = 4 << (^uintptr(0) >> 63)
// PtrBits is the size of a pointer in bits.
const PtrBits = PtrSize * 8
// ArchFamily is the architecture family (AMD64, ARM, ...)
const ArchFamily ArchFamilyType = _ArchFamily


@@ -0,0 +1,537 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package gen
import (
"container/heap"
"encoding/binary"
"fmt"
"hash/maphash"
"io"
"log"
"os"
"reflect"
"strings"
)
const logCompile = true
func fatalf(f string, args ...any) {
panic(fmt.Sprintf(f, args...))
}
type File struct {
w io.Writer
funcs []*Func
consts []fileConst
}
func NewFile(w io.Writer) *File {
return &File{w: w}
}
func (f *File) AddFunc(fn *Func) {
f.funcs = append(f.funcs, fn)
}
type fileConst struct {
name string
data any
}
func (f *File) AddConst(name string, data any) {
// TODO: It would be nice if this were unified with "const" ops, but the
// reason I added this was for []*Func consts, which would take an overhaul
// to represent in "const" ops.
f.consts = append(f.consts, fileConst{name, data})
}
type Func struct {
name string
nArgs int
idGen int
ops []*op
}
func NewFunc(name string) *Func {
fn := &Func{name: name}
return fn
}
// attach adds x to fn's op list. If x has any unattached arguments, this adds
// those first (recursively).
func (fn *Func) attach(x *op) {
// Make sure the arguments are attached to the function.
for _, arg := range x.args {
argFn := arg.fn
if argFn == nil {
fn.attach(arg)
} else if argFn != fn {
panic("ops from different functions")
}
}
x.fn = fn
x.id = fn.idGen
fn.idGen++
fn.ops = append(fn.ops, x)
}
func Arg[W wrap[T], T Word](fn *Func) T {
loc := locReg{cls: regClassGP, reg: fn.nArgs}
fn.nArgs++
var x W
o := &op{op: "arg", kind: x.kind(), c: loc}
fn.attach(o)
return x.wrap(o)
}
func Return(results ...Value) {
args := make([]*op, len(results))
for i, res := range results {
args[i] = res.getOp()
}
var x void
x.initOp(&op{op: "return", kind: voidKind, args: args})
}
type op struct {
op string
kind *kind
args []*op
id int
fn *Func
// c depends on "op".
//
// arg locReg - The register containing the argument value
// const any - The constant value
// deref int - Byte offset from args[0]
c any
name string
}
func (o *op) String() string {
return fmt.Sprintf("v%02d", o.id)
}
func imm(val any) *op {
return &op{op: "imm", c: val}
}
func (o *op) equalNoName(o2 *op) bool {
if o.op != o2.op || o.c != o2.c || len(o.args) != len(o2.args) {
return false
}
for i, arg := range o.args {
if o2.args[i] != arg {
return false
}
}
return true
}
func (o *op) write(w io.Writer) {
fmt.Fprintf(w, "v%02d = %s", o.id, o.op)
for _, arg := range o.args {
fmt.Fprintf(w, " v%02d", arg.id)
}
if o.c != nil {
fmt.Fprintf(w, " %v", o.c)
}
if o.name != "" {
fmt.Fprintf(w, " %q", o.name)
}
if o.kind != nil {
fmt.Fprintf(w, " [%s]", o.kind.typ)
}
fmt.Fprintf(w, "\n")
}
func (fn *Func) write(w io.Writer) {
fmt.Fprintf(w, "FUNC %s\n", fn.name)
for _, op := range fn.ops {
op.write(w)
}
}
func (f *File) Compile() {
// TODO: CSE constants across the whole file
fmt.Fprintf(f.w, `#include "go_asm.h"
#include "textflag.h"
`)
for _, c := range f.consts {
f.emitConst(c.name, c.data)
}
trace := func(fn *Func, step string) {
if !logCompile {
return
}
log.Printf("## Compiling %s: %s", fn.name, step)
fn.write(os.Stderr)
}
for _, fn := range f.funcs {
trace(fn, "initial")
for {
if fn.cse() {
trace(fn, "post cse")
continue
}
if fn.deadcode() {
trace(fn, "post deadcode")
continue
}
break
}
fn.addLoads()
trace(fn, "post addLoads")
// Assigning locations requires ops to be in dependency order.
fn.schedule()
trace(fn, "post schedule")
locs := fn.assignLocs()
fn.emit(f, locs)
}
}
// cse performs common subexpression elimination.
func (fn *Func) cse() bool {
// Compute structural hashes
hashes := make(map[*op]uint64)
var h maphash.Hash
var bbuf [8]byte
for _, op := range fn.ops {
// We ignore the name for canonicalization.
h.Reset()
h.WriteString(op.op)
// TODO: Ideally we would hash o1.c, but we don't have a good way to do that.
for _, arg := range op.args {
if _, ok := hashes[arg]; !ok {
panic("ops not in dependency order")
}
binary.NativeEndian.PutUint64(bbuf[:], hashes[arg])
h.Write(bbuf[:])
}
hashes[op] = h.Sum64()
}
canon := make(map[uint64][]*op)
lookup := func(o *op) *op {
hash := hashes[o]
for _, o2 := range canon[hash] {
if o.equalNoName(o2) {
return o2
}
}
canon[hash] = append(canon[hash], o)
return o
}
// Canonicalize ops.
dirty := false
for _, op := range fn.ops {
for i, arg := range op.args {
newArg := lookup(arg)
if arg != newArg {
dirty = true
op.args[i] = newArg
}
}
}
return dirty
}
// deadcode eliminates unused ops.
func (fn *Func) deadcode() bool {
marks := make(map[*op]bool)
var mark func(o *op)
mark = func(o *op) {
if marks[o] {
return
}
marks[o] = true
for _, arg := range o.args {
mark(arg)
}
}
// Mark operations that have a side-effect.
for _, op := range fn.ops {
switch op.op {
case "return":
mark(op)
}
}
// Discard unmarked operations
if len(marks) == len(fn.ops) {
return false
}
newOps := make([]*op, 0, len(marks))
for _, op := range fn.ops {
if marks[op] {
newOps = append(newOps, op)
}
}
fn.ops = newOps
return true
}
// canMem is a map from operation to a bitmap of which arguments can use a
// direct memory reference.
var canMem = map[string]uint64{
"VPERMB": 1 << 0,
"VPERMI2B": 1 << 0,
"VPERMT2B": 1 << 0,
"VGF2P8AFFINEQB": 1 << 0,
"VPORQ": 1 << 0,
"VPSUBQ": 1 << 0,
"VPSHUFBITQMB": 1 << 0,
}
// addLoads inserts load ops for ops that can't take memory inputs directly.
func (fn *Func) addLoads() {
// A lot of operations can directly take memory locations. If there's only a
// single reference to a deref operation, and the operation can do the deref
// itself, eliminate the deref. If there's more than one reference, then we
// leave the load so we can share the value in the register.
nRefs := fn.opRefs()
loads := make(map[*op]*op) // deref -> load
for _, o := range fn.ops {
canMask := canMem[o.op]
for i, arg := range o.args {
// TODO: Many AVX-512 operations that support memory operands also
// support a ".BCST" suffix that performs a broadcasting memory
// load. If the const can be broadcast and all uses support
// broadcast load, it would be nice to use .BCST. I'm not sure if
// that belongs in this pass or a different one.
if arg.op == "deref" || arg.op == "const" {
// These produce memory locations.
if canMask&(1<<i) == 0 || nRefs[arg] > 1 {
// This argument needs to be loaded into a register.
load, ok := loads[arg]
if !ok {
load = makeLoad(arg)
fn.attach(load)
loads[arg] = load
}
o.args[i] = load
}
}
}
}
}
func (fn *Func) opRefs() map[*op]int {
refs := make(map[*op]int)
for _, o1 := range fn.ops {
for _, arg := range o1.args {
refs[arg]++
}
}
return refs
}
func makeLoad(deref *op) *op {
var inst string
switch deref.kind.reg {
default:
fatalf("don't know how to load %v", deref.kind.reg)
case regClassGP:
inst = "MOVQ"
case regClassZ:
inst = "VMOVDQU64"
}
// The load references deref rather than deref.args[0] because when we
// assign locations, the deref op gets the memory location to load from,
// while its argument has some other location (like a register). Also, the
// offset to deref is attached to the deref op.
return &op{op: inst, kind: deref.kind, args: []*op{deref}}
}
type opHeap []*op
func (h opHeap) Len() int { return len(h) }
func (h opHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
func (h opHeap) Less(i, j int) bool {
priority := func(o *op) int {
if o.op == "deref" || o.op == "const" {
// Input to memory load
return 1
}
if len(o.args) > 0 && (o.args[0].op == "deref" || o.args[0].op == "const") {
// Memory load
return 2
}
return 100
}
if p1, p2 := priority(h[i]), priority(h[j]); p1 != p2 {
return p1 < p2
}
return h[i].id < h[j].id
}
func (h *opHeap) Push(x any) {
*h = append(*h, x.(*op))
}
func (h *opHeap) Pop() any {
old := *h
n := len(old)
x := old[n-1]
*h = old[0 : n-1]
return x
}
// schedule ensures fn's ops are in dependency order.
func (fn *Func) schedule() {
// TODO: This tends to generate a huge amount of register pressure, mostly
// because it floats loads as early as possible and partly because it has no
// concept of rematerialization and CSE can make rematerializable values
live for a very long time. In some sense it doesn't matter because we
// don't run out of registers for anything we need.
missing := make(map[*op]int)
uses := make(map[*op][]*op)
var h opHeap
for _, op := range fn.ops {
if len(op.args) == 0 {
h = append(h, op)
} else {
missing[op] = len(op.args)
}
for _, arg := range op.args {
uses[arg] = append(uses[arg], op)
}
}
heap.Init(&h)
newOps := make([]*op, 0, len(fn.ops))
for len(h) > 0 {
if false {
log.Printf("schedule: %s", h)
}
top := h[0]
newOps = append(newOps, top)
heap.Pop(&h)
for _, o := range uses[top] {
missing[o]--
if missing[o] == 0 {
heap.Push(&h, o)
}
}
}
if len(newOps) != len(fn.ops) {
log.Print("schedule didn't schedule all ops")
log.Print("before:")
fn.write(os.Stderr)
fn.ops = newOps
log.Print("after:")
fn.write(os.Stderr)
log.Fatal("bad schedule")
}
fn.ops = newOps
}
func (fn *Func) emit(f *File, locs map[*op]loc) {
w := f.w
// Emit constants first
for _, o := range fn.ops {
if o.op == "const" {
name := locs[o].(locMem).name
f.emitConst(name, o.c)
}
}
fmt.Fprintf(w, "TEXT %s(SB), NOSPLIT, $0-0\n", fn.name)
// Emit body
for _, o := range fn.ops {
switch o.op {
case "const", "arg", "return", "deref", "imm":
// Does not produce code
continue
}
switch o.op {
case "addConst":
fatalf("addConst not lowered")
}
opName := o.op
// A ".mask" suffix is used to distinguish AVX-512 ops that use the same
// mnemonic for regular and masked mode.
opName = strings.TrimSuffix(opName, ".mask")
fmt.Fprintf(w, "\t%s", opName)
if o.op == "VGF2P8AFFINEQB" {
// Hidden immediate, but always 0
//
// TODO: Replace this with an imm input.
fmt.Fprintf(w, " $0,")
}
for i, arg := range o.args {
if i == 0 {
fmt.Fprintf(w, " ")
} else {
fmt.Fprintf(w, ", ")
}
if arg.op == "imm" {
fmt.Fprintf(w, "$0x%x", arg.c)
} else {
fmt.Fprint(w, locs[arg].LocString())
}
}
if _, ok := opRMW[o.op]; ok {
// Read-modify-write instructions, so the output is already in the
// arguments above.
} else {
fmt.Fprintf(w, ", %s", locs[o].LocString())
}
fmt.Fprintf(w, "\n")
}
fmt.Fprintf(w, "\tRET\n")
fmt.Fprintf(w, "\n")
}
func (f *File) emitConst(name string, data any) {
switch data := data.(type) {
case []*Func:
fmt.Fprintf(f.w, "GLOBL %s(SB), RODATA, $%#x\n", name, len(data)*8)
for i, fn := range data {
fmt.Fprintf(f.w, "DATA %s+%#02x(SB)/8, ", name, 8*i)
if fn == nil {
fmt.Fprintf(f.w, "$0\n")
} else {
fmt.Fprintf(f.w, "$%s(SB)\n", fn.name)
}
}
fmt.Fprintf(f.w, "\n")
return
}
// Assume it's a numeric slice or array
rv := reflect.ValueOf(data)
sz := int(rv.Type().Elem().Size())
fmt.Fprintf(f.w, "GLOBL %s(SB), RODATA, $%#x\n", name, rv.Len()*sz)
for wi := 0; wi < sz*rv.Len()/8; wi++ { // Iterate over words
var word uint64
for j := 0; j < 8/sz; j++ { // Iterate over elements in this word
d := rv.Index(wi*8/sz + j).Uint()
word |= d << (j * sz * 8)
}
fmt.Fprintf(f.w, "DATA %s+%#02x(SB)/8, $%#016x\n", name, 8*wi, word)
}
fmt.Fprintf(f.w, "\n")
}
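
The gen package above is a small expression-graph-to-assembly compiler: you build a Func out of values (Arg, the Const* helpers, Deref, and the arithmetic/shuffle methods), call Return, add the Func to a File, and Compile runs CSE, dead-code elimination, load insertion, scheduling, and register assignment before emitting Go assembly. Below is a minimal, hypothetical driver sketch; like mkasm.go further down, it assumes it is built inside the Go tree where these internal packages are importable, and the function and constant names are made up for illustration.

package main

import (
	"os"

	"internal/runtime/gc/internal/gen"
)

func main() {
	f := gen.NewFile(os.Stdout)

	// A trivial kernel: load a [8]uint64 vector through the argument pointer,
	// OR it with a constant, and return the result in a Z register.
	fn := gen.NewFunc("orOnes<>")
	p := gen.Arg[gen.Ptr[gen.Uint64x8]](fn)
	v := gen.Deref(p)
	ones := gen.ConstUint64x8([8]uint64{1, 1, 1, 1, 1, 1, 1, 1}, "*_ones<>")
	gen.Return(v.Or(ones))
	f.AddFunc(fn)

	// Emits the go_asm.h/textflag.h includes, the constant data, and the TEXT body.
	f.Compile()
}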


@@ -0,0 +1,26 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package gen
type Uint64 struct {
valGP
}
var kindUint64 = &kind{typ: "Uint64", reg: regClassGP}
func ConstUint64(c uint64, name string) (y Uint64) {
y.initOp(&op{op: "const", kind: y.kind(), c: c, name: name})
return y
}
func (Uint64) kind() *kind {
return kindUint64
}
func (Uint64) wrap(x *op) Uint64 {
var y Uint64
y.initOp(x)
return y
}


@@ -0,0 +1,338 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package gen
import (
"fmt"
"log"
"math/bits"
"strings"
)
const traceRegAlloc = true
type regClass uint8
const (
regClassFixed regClass = iota
regClassGP
regClassZ
regClassK
numRegClasses
regClassNone = ^regClass(0)
)
type locReg struct {
cls regClass
reg int
}
func (l locReg) LocString() string {
switch l.cls {
case regClassFixed:
return fixedRegs[l.reg]
case regClassGP:
return gpRegs[l.reg]
case regClassZ:
return fmt.Sprintf("Z%d", l.reg)
case regClassK:
return fmt.Sprintf("K%d", l.reg)
}
panic("bad register class")
}
func (l locReg) Deref(off int) (loc, error) {
return locMem{l, off, ""}, nil
}
func (l locReg) Reg() (locReg, bool) {
return l, true
}
type locMem struct {
base locReg
off int
name string
}
func (l locMem) LocString() string {
if l.base.cls == regClassFixed && l.base.reg == regSB && l.off == 0 {
return l.name + "(SB)"
}
if l.name != "" {
return fmt.Sprintf("%s+%d(%s)", l.name, l.off, l.base.LocString())
}
if l.off != 0 {
return fmt.Sprintf("%d(%s)", l.off, l.base.LocString())
}
return "(" + l.base.LocString() + ")"
}
func (l locMem) Deref(off int) (loc, error) {
return nil, fmt.Errorf("cannot dereference already memory address %s", l.LocString())
}
func (l locMem) Reg() (locReg, bool) {
if l.base.cls == regClassFixed {
return locReg{}, false
}
return l.base, true
}
type loc interface {
LocString() string // Return the assembly syntax for this location
Deref(off int) (loc, error) // Treat this location as an address and return a location with the contents of memory at that address
Reg() (locReg, bool) // Register used by this location
}
var opRMW = map[string]int{
"VPERMI2B": 2, // Overwrites third argument
"VPERMI2B.Z": 3, // Overwrites fourth argument
"VPERMI2B.mask": 3, // Overwrites fourth argument
"VPERMT2B": 1, // Overwrites second argument TODO: Check this. Unused for now.
"VPBROADCASTQ.mask": 2, // Overwrites last argument
}
// TODO: Should we have a general rule that all ".mask" instructions overwrite
// their last argument?
const (
regSB = iota
regFP
)
var fixedRegs = []string{regSB: "SB", regFP: "FP"}
var gpRegs = []string{"AX", "BX", "CX", "DI", "SI", "R8", "R9", "R10", "R11"} // ABI argument order
type regSet struct {
inUse [numRegClasses]uint32
}
func (s *regSet) used(o *op, l loc) {
if l == nil {
return
}
reg, ok := l.Reg()
if !ok {
return
}
if traceRegAlloc {
log.Printf(" alloc %s @ v%02d", reg.LocString(), o.id)
}
if s.inUse[reg.cls]&(1<<reg.reg) != 0 {
fatalf("register %s already used", reg.LocString())
}
s.inUse[reg.cls] |= 1 << reg.reg
}
func (s *regSet) free(l loc) {
if l == nil {
return
}
reg, ok := l.Reg()
if !ok {
return
}
if traceRegAlloc {
log.Printf(" free %s", reg.LocString())
}
if s.inUse[reg.cls]&(1<<reg.reg) == 0 {
fatalf("register %s is not in use", reg.LocString())
}
s.inUse[reg.cls] &^= 1 << reg.reg
}
func (fn *Func) assignLocs() map[*op]loc {
// Remove static indicator on name, if any. We'll add it back.
nameBase := strings.TrimSuffix(fn.name, "<>")
// Create map from op -> fn.ops index
opIndexes := make(map[*op]int, len(fn.ops))
for i, o := range fn.ops {
opIndexes[o] = i
}
// Read-modify-write operations share a location with one of their inputs.
// Likewise, deref ops extend the lifetime of their input (but in a shared
// way, unlike RMW ops).
//
// Compute a map from each op to the earliest "canonical" op whose live
// range we'll use.
canon := make(map[*op]*op)
overwritten := make(map[*op]bool)
for _, o := range fn.ops {
// Check that this op doesn't use any overwritten inputs.
for _, arg := range o.args {
if overwritten[arg] {
// TODO: The solution to this is to insert copy ops.
fatalf("op %+v uses overwritten input %+v", o, arg)
}
}
// Record canonical op.
rmw, ok := opRMW[o.op]
if ok {
canon[o] = canon[o.args[rmw]]
// Record that the input is dead now and must not be referenced.
overwritten[o.args[rmw]] = true
} else if o.op == "deref" {
canon[o] = canon[o.args[0]]
} else {
canon[o] = o
}
}
// Compute live ranges of each canonical op.
//
// First, find the last use of each op.
lastUses := make(map[*op]*op) // Canonical creation op -> last use op
for _, op := range fn.ops {
for _, arg := range op.args {
lastUses[canon[arg]] = op
}
}
// Invert the last uses map to get a map from op to the (canonical) values
// that die at that op.
lastUseMap := make(map[*op][]*op) // op of last use -> (canonical) creation ops
for def, lastUse := range lastUses {
lastUseMap[lastUse] = append(lastUseMap[lastUse], def)
}
// Prepare for assignments
regUsed := make([]regSet, len(fn.ops)) // In-use registers at each op
for i := range regUsed {
// X15/Y15/Z15 is reserved by the Go ABI
regUsed[i].inUse[regClassZ] |= 1 << 15
// K0 is contextual (if used as an opmask, it means no mask). Too
// complicated, so just ignore it.
regUsed[i].inUse[regClassK] |= 1 << 0
}
locs := make(map[*op]loc)
assign := func(o *op, l loc) {
if have, ok := locs[o]; ok {
fatalf("op %+v already assigned location %v (new %v)", o, have, l)
return
}
if o == canon[o] {
// Mark this location used over o's live range
for i := opIndexes[o]; i < opIndexes[lastUses[o]]; i++ {
regUsed[i].used(fn.ops[i], l)
}
}
locs[o] = l
}
// Assign fixed locations
id := 0
for _, o := range fn.ops {
switch o.op {
case "arg":
if traceRegAlloc {
log.Printf("fixed op %+v", o)
}
assign(o, o.c.(locReg))
case "const":
if traceRegAlloc {
log.Printf("fixed op %+v", o)
}
name := o.name
if name == "" {
name = fmt.Sprintf("%s_%d<>", nameBase, id)
id++
} else if name[0] == '*' {
name = nameBase + name[1:]
}
assign(o, locMem{locReg{cls: regClassFixed, reg: regSB}, 0, name})
case "return":
if traceRegAlloc {
log.Printf("fixed op %+v", o)
}
assign(o, nil) // no location
// TODO: argZ should start at 0.
argGP, argZ := 0, 1
for _, arg := range o.args {
switch arg.kind.reg {
default:
fatalf("bad register class for return value")
case regClassGP:
assign(canon[arg], locReg{regClassGP, argGP})
argGP++
case regClassZ:
assign(canon[arg], locReg{regClassZ, argZ})
argZ++
}
}
case "imm":
assign(o, nil) // no location
}
}
// Assign locations.
for _, o := range fn.ops {
if traceRegAlloc {
log.Printf("assign %+v", o)
}
if _, ok := locs[o]; ok {
// Already assigned a fixed location above.
continue
}
if o.op == "deref" {
loc, err := locs[o.args[0]].Deref(o.c.(int))
if err != nil {
fatalf("%v", err)
}
// We don't "assign" here because we've already processed the
// canonical op, which marked loc's register as in-use for the whole
// live range.
locs[o] = loc
continue
}
if canon[o] != o {
// Copy the canonical op's location.
locs[o] = locs[canon[o]]
continue
}
// Below here we know that o is already a canonical op.
if _, ok := opRMW[o.op]; ok {
fatalf("read-modify-write op not canonicalized")
}
// Find a free register of the right class.
cls := o.kind.reg
var used uint32
for i := opIndexes[o]; i < opIndexes[lastUses[o]]; i++ {
used |= regUsed[i].inUse[cls]
}
// Assign result location.
num := bits.TrailingZeros32(^used)
switch cls {
default:
fatalf("unknown reg class %v", cls)
case regClassGP:
if num >= len(gpRegs) {
panic("out of GP regs")
}
case regClassZ:
if num >= 32 {
panic("out of Z regs")
}
case regClassK:
if num >= 8 {
panic("out of K regs")
}
}
loc := locReg{cls, num}
assign(o, loc)
}
return locs
}
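
To make the operand syntax concrete, here is a small hypothetical in-package test sketch (it would only build inside this package, since the location types are unexported) showing how locReg and locMem render into Go assembly operands:

package gen

import "testing"

func TestLocStrings(t *testing.T) {
	// AX is gpRegs[0]; a plain register operand.
	if got := (locReg{cls: regClassGP, reg: 0}).LocString(); got != "AX" {
		t.Errorf("got %q, want AX", got)
	}
	// A 16-byte offset from AX renders as a memory operand.
	if got := (locMem{base: locReg{cls: regClassGP, reg: 0}, off: 16}).LocString(); got != "16(AX)" {
		t.Errorf("got %q, want 16(AX)", got)
	}
	// Named symbols in the static base pseudo-register render as name(SB).
	if got := (locMem{base: locReg{cls: regClassFixed, reg: regSB}, name: "foo<>"}).LocString(); got != "foo<>(SB)" {
		t.Errorf("got %q, want foo<>(SB)", got)
	}
}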


@@ -0,0 +1,246 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package gen
type Uint8x64 struct {
valAny
}
var kindUint8x64 = &kind{typ: "Uint8x64", reg: regClassZ}
func ConstUint8x64(c [64]uint8, name string) (y Uint8x64) {
y.initOp(&op{op: "const", kind: y.kind(), c: c, name: name})
return y
}
func (Uint8x64) kind() *kind {
return kindUint8x64
}
func (Uint8x64) wrap(x *op) Uint8x64 {
var y Uint8x64
y.initOp(x)
return y
}
func (x Uint8x64) ToUint64x8() (z Uint64x8) {
z.op = x.op
return z
}
func (x Uint8x64) Shuffle(shuf Uint8x64) (y Uint8x64) {
if shuf.op.op == "const" {
// TODO: There are often patterns we can take advantage of here. Sometimes
// we can do a broadcast. Sometimes we can at least do a quadword
// permutation instead of a full byte permutation.
// Range check the shuffle
for i, inp := range shuf.op.c.([64]uint8) {
// 0xff is a special "don't care" value
if !(inp == 0xff || inp < 64) {
fatalf("shuffle[%d] = %d out of range [0, %d) or 0xff", i, inp, 64)
}
}
}
args := []*op{x.op, shuf.op}
y.initOp(&op{op: "VPERMB", kind: y.kind(), args: args})
return y
}
func (x Uint8x64) ShuffleZeroed(shuf Uint8x64, mask Mask64) (y Uint8x64) {
args := []*op{x.op, shuf.op, mask.op}
y.initOp(&op{op: "VPERMB.Z", kind: y.kind(), args: args})
return y
}
func (x Uint8x64) ShuffleMasked(shuf Uint8x64, mask Mask64) (y Uint8x64) {
args := []*op{x.op, shuf.op, mask.op}
y.initOp(&op{op: "VPERMB.mask", kind: y.kind(), args: args})
return y
}
// TODO: The two-argument shuffle is a little weird. You almost want the
// receiver to be the shuffle and the two arguments to be the two inputs, but
// that's almost certainly *not* what you want for the single input shuffle.
func (x Uint8x64) Shuffle2(y Uint8x64, shuf Uint8x64) (z Uint8x64) {
// Confusingly, the inputs are in the opposite order from what you'd expect.
args := []*op{y.op, x.op, shuf.op}
z.initOp(&op{op: "VPERMI2B", kind: z.kind(), args: args})
return z
}
func (x Uint8x64) Shuffle2Zeroed(y Uint8x64, shuf Uint8x64, mask Mask64) (z Uint8x64) {
// Confusingly, the inputs are in the opposite order from what you'd expect.
args := []*op{y.op, x.op, mask.op, shuf.op}
z.initOp(&op{op: "VPERMI2B.Z", kind: z.kind(), args: args})
return z
}
func (x Uint8x64) Shuffle2Masked(y Uint8x64, shuf Uint8x64, mask Mask64) (z Uint8x64) {
// Confusingly, the inputs are in the opposite order from what you'd expect.
args := []*op{y.op, x.op, mask.op, shuf.op}
z.initOp(&op{op: "VPERMI2B.mask", kind: z.kind(), args: args})
return z
}
type Uint64x8 struct {
valAny
}
var kindUint64x8 = &kind{typ: "Uint64x8", reg: regClassZ}
func ConstUint64x8(c [8]uint64, name string) (y Uint64x8) {
// TODO: Sometimes these can be optimized into broadcast loads.
y.initOp(&op{op: "const", kind: y.kind(), c: c, name: name})
return y
}
func BroadcastUint64x8Zeroed(src Uint64, mask Mask8) (z Uint64x8) {
z.initOp(&op{op: "VPBROADCASTQ.Z", kind: z.kind(), args: []*op{src.op, mask.op}})
return z
}
func (x Uint64x8) BroadcastMasked(src Uint64, mask Mask8) (z Uint64x8) {
z.initOp(&op{op: "VPBROADCASTQ.mask", kind: z.kind(), args: []*op{src.op, mask.op, x.op}})
return z
}
func (Uint64x8) kind() *kind {
return kindUint64x8
}
func (Uint64x8) wrap(x *op) Uint64x8 {
var y Uint64x8
y.initOp(x)
return y
}
func (x Uint64x8) Or(y Uint64x8) (z Uint64x8) {
z.initOp(&op{op: "VPORQ", kind: z.kind(), args: []*op{y.op, x.op}})
return z
}
func (x Uint64x8) Sub(y Uint64x8) (z Uint64x8) {
// Arguments are backwards
z.initOp(&op{op: "VPSUBQ", kind: z.kind(), args: []*op{y.op, x.op}})
return z
}
func (x Uint64x8) ToUint8x64() (z Uint8x64) {
z.op = x.op
return z
}
func (x Uint64x8) GF2P8Affine(y Uint8x64) (z Uint8x64) {
// matrix, vector
z.initOp(&op{op: "VGF2P8AFFINEQB", kind: z.kind(), args: []*op{x.op, y.op}})
return z
}
func (x Uint64x8) ShuffleBits(y Uint8x64) (z Mask64) {
z.initOp(&op{op: "VPSHUFBITQMB", kind: z.kind(), args: []*op{y.op, x.op}})
return z
}
func (x Uint64x8) ShuffleBitsMasked(y Uint8x64, mask Mask64) (z Mask64) {
// This is always zeroing if the mask is provided.
z.initOp(&op{op: "VPSHUFBITQMB", kind: z.kind(), args: []*op{y.op, x.op, mask.op}})
return z
}
type Mask8 struct {
valAny
}
var kindMask8 = &kind{typ: "Mask8", reg: regClassK}
func ConstMask8(c uint8) (y Mask8) {
var tmp Uint64
tmp.initOp(&op{op: "MOVQ", kind: tmp.kind(), args: []*op{imm(c)}})
y.initOp(&op{op: "KMOVB", kind: y.kind(), args: []*op{tmp.op}})
return y
}
func (Mask8) kind() *kind {
return kindMask8
}
func (Mask8) wrap(x *op) Mask8 {
var y Mask8
y.initOp(x)
return y
}
func (x Mask8) ToUint8() (z Uint64) {
z.initOp(&op{op: "KMOVB", kind: z.kind(), args: []*op{x.op}})
return z
}
func (x Mask8) Or(y Mask8) (z Mask8) {
z.initOp(&op{op: "KORQ", kind: z.kind(), args: []*op{y.op, x.op}})
return z
}
func (x Mask8) ShiftLeft(c uint8) (z Mask8) {
if c == 0 {
z = x
} else {
z.initOp(&op{op: "KSHIFTLB", kind: z.kind(), args: []*op{imm(c), x.op}})
}
return z
}
type Mask64 struct {
valAny
}
var kindMask64 = &kind{typ: "Mask64", reg: regClassK}
func ConstMask64(c uint64) (y Mask64) {
var tmp Uint64
tmp.initOp(&op{op: "MOVQ", kind: tmp.kind(), args: []*op{imm(c)}})
y.initOp(&op{op: "KMOVQ", kind: y.kind(), args: []*op{tmp.op}})
return y
}
func (Mask64) kind() *kind {
return kindMask64
}
func (Mask64) wrap(x *op) Mask64 {
var y Mask64
y.initOp(x)
return y
}
func (x Mask64) ToUint64() (z Uint64) {
z.initOp(&op{op: "KMOVQ", kind: z.kind(), args: []*op{x.op}})
return z
}
func (x Mask64) Or(y Mask64) (z Mask64) {
z.initOp(&op{op: "KORQ", kind: z.kind(), args: []*op{y.op, x.op}})
return z
}
func (x Mask64) ShiftLeft(c uint8) (z Mask64) {
if c == 0 {
z = x
} else {
z.initOp(&op{op: "KSHIFTLQ", kind: z.kind(), args: []*op{imm(c), x.op}})
}
return z
}
func (x Mask64) ShiftRight(c uint8) (z Mask64) {
if c == 0 {
z = x
} else {
z.initOp(&op{op: "KSHIFTRQ", kind: z.kind(), args: []*op{imm(c), x.op}})
}
return z
}


@@ -0,0 +1,137 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package gen
import "sync"
type Value interface {
kind() *kind
getOp() *op
}
type Word interface {
Value
isWord()
}
// wrap is an unfortunate necessity so that we can pass Value types around as
// values (not pointers), but still have generic functions that can construct a
// new Value. Ideally we would just have a method on Value to initialize its op,
// but that needs to have a non-pointer receiver to satisfy the interface and
// then it can't mutate the Value.
type wrap[T Value] interface {
Value
wrap(x *op) T
}
type kind struct {
typ string
reg regClass
}
type void struct {
valAny
}
var voidKind = &kind{typ: "void", reg: regClassNone}
func (void) kind() *kind { return voidKind }
type Ptr[T Value] struct {
valGP
}
// Ptr is a Word
var _ Word = Ptr[void]{}
var ptrKinds = sync.Map{} // *kind -> *kind
func (Ptr[T]) kind() *kind {
var x T
xk := x.kind()
pk, ok := ptrKinds.Load(xk)
if !ok {
k := &kind{typ: "Ptr[" + x.kind().typ + "]", reg: regClassGP}
pk, _ = ptrKinds.LoadOrStore(xk, k)
}
return pk.(*kind)
}
func (Ptr[T]) wrap(x *op) Ptr[T] {
var y Ptr[T]
y.initOp(x)
return y
}
func (x Ptr[T]) AddConst(off int) (y Ptr[T]) {
base := x.op
for base.op == "addConst" {
off += base.args[1].c.(int)
base = base.args[0]
}
y.initOp(&op{op: "addConst", kind: y.kind(), args: []*op{base, imm(off)}})
return y
}
func Deref[W wrap[T], T Value](ptr Ptr[W]) T {
var off int
base := ptr.op
for base.op == "addConst" {
off += base.args[1].c.(int)
base = base.args[0]
}
var y W
return y.wrap(&op{op: "deref", kind: y.kind(), args: []*op{base}, c: off})
}
type Array[T Value] struct {
valAny
}
func ConstArray[T Value](vals []T, name string) (y Array[T]) {
// TODO: This probably doesn't actually work because emitConst won't
// understand vals.
y.initOp(&op{op: "const", kind: y.kind(), c: vals, name: name})
return y
}
func (Array[T]) kind() *kind {
// TODO: Cache this like Ptr.kind.
var x T
return &kind{typ: "Array[" + x.kind().typ + "]", reg: regClassNone}
}
type valGP struct {
valAny
}
func (valGP) isWord() {}
type valAny struct {
*op
}
func (v *valAny) initOp(x *op) {
if v.op != nil {
panic("double init of val")
}
if x.kind == nil {
panic("val missing kind")
}
v.op = x
// Figure out this value's function.
for _, arg := range x.args {
if fn := arg.fn; fn != nil {
fn.attach(x)
break
}
}
}
func (v valAny) getOp() *op {
return v.op
}
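
One non-obvious property of the pointer helpers above is that AddConst chains fold into a single byte offset, which Deref then attaches to the resulting "deref" op. A hypothetical in-package sketch of that behavior (illustrative only; it relies on unexported fields, so it only makes sense inside package gen):

package gen

import "testing"

func TestAddConstFolding(t *testing.T) {
	fn := NewFunc("example<>")
	p := Arg[Ptr[Uint8x64]](fn)

	// Two AddConst steps and the Deref collapse into one "deref" op whose
	// constant c holds the combined byte offset 8+56 = 64.
	v := Deref(p.AddConst(8).AddConst(56))
	if off := v.getOp().c.(int); off != 64 {
		t.Errorf("deref offset = %d, want 64", off)
	}
}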


@@ -7,7 +7,8 @@ package gc
import "internal/goarch"
const (
ptrBits = 8 * goarch.PtrSize
// PageWords is the number of pointer-words per page.
PageWords = PageSize / goarch.PtrSize
// A malloc header is functionally a single type pointer, but
// we need to use 8 here to ensure 8-byte alignment of allocations
@@ -43,7 +44,7 @@ const (
// would not be invariant to size-class rounding. Eschewing this property means a
// more complex check or possibly storing additional state to determine whether a
// span has malloc headers.
MinSizeForMallocHeader = goarch.PtrSize * ptrBits
MinSizeForMallocHeader = goarch.PtrSize * goarch.PtrBits
// PageSize is the increment in which spans are managed.
PageSize = 1 << PageShift


@@ -52,7 +52,7 @@ func main() {
fmt.Fprintln(&b, "// Code generated by mksizeclasses.go; DO NOT EDIT.")
fmt.Fprintln(&b, "//go:generate go run mksizeclasses.go")
fmt.Fprintln(&b)
fmt.Fprintln(&b, "package runtime")
fmt.Fprintln(&b, "package gc")
classes := makeClasses()
printComment(&b, classes)
@@ -287,6 +287,14 @@ func maxObjsPerSpan(classes []class) int {
return most
}
func maxNPages(classes []class) int {
most := 0
for _, c := range classes[1:] {
most = max(most, c.npages)
}
return most
}
func printClasses(w io.Writer, classes []class) {
fmt.Fprintln(w, "const (")
fmt.Fprintf(w, "MinHeapAlign = %d\n", minHeapAlign)
@@ -297,6 +305,7 @@ func printClasses(w io.Writer, classes []class) {
fmt.Fprintf(w, "NumSizeClasses = %d\n", len(classes))
fmt.Fprintf(w, "PageShift = %d\n", pageShift)
fmt.Fprintf(w, "MaxObjsPerSpan = %d\n", maxObjsPerSpan(classes))
fmt.Fprintf(w, "MaxSizeClassNPages = %d\n", maxNPages(classes))
fmt.Fprintln(w, ")")
fmt.Fprint(w, "var SizeClassToSize = [NumSizeClasses]uint16 {")


@@ -0,0 +1,22 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package scan
import "internal/runtime/gc"
// ExpandAVX512 expands each bit in packed into f consecutive bits in unpacked,
// where f is the word size of objects in sizeClass.
//
// This is a testing entrypoint to the expanders used by scanSpanPacked*.
//
//go:noescape
func ExpandAVX512(sizeClass int, packed *gc.ObjMask, unpacked *gc.PtrMask)
// gcExpandersAVX512 holds the PCs of the expander functions. These cannot be called directly
// as they don't follow the Go ABI, but you can use this to check if a given
// expander PC is 0.
//
// It is defined in assembly.
var gcExpandersAVX512 [len(gc.SizeClassToSize)]uintptr

File diff suppressed because it is too large.


@@ -0,0 +1,19 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64
package scan_test
import (
"internal/runtime/gc/scan"
"testing"
)
func TestExpandAVX512(t *testing.T) {
if !scan.CanAVX512() {
t.Skip("no AVX512")
}
testExpand(t, scan.ExpandAVX512)
}


@@ -0,0 +1,39 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package scan
import (
"internal/goarch"
"internal/runtime/gc"
)
// ExpandReference is a reference implementation of an expander function
// that translates object mark bits into a bitmap of one bit per word of
// marked object, assuming the object is of the provided size class.
func ExpandReference(sizeClass int, packed *gc.ObjMask, unpacked *gc.PtrMask) {
// Look up the size and derive the number of objects in a span.
// We're only concerned with small objects in single-page spans,
// and gc.PtrMask enforces this by being statically sized to
// accommodate only such spans.
size := uintptr(gc.SizeClassToSize[sizeClass])
nObj := uintptr(gc.SizeClassToNPages[sizeClass]) * gc.PageSize / size
// f is the expansion factor. For example, if our objects are of size 48,
// then each mark bit will translate into 6 (48/8 = 6) set bits in the
// pointer bitmap.
f := size / goarch.PtrSize
for i := range nObj {
// Check if the object is marked.
if packed[i/goarch.PtrBits]&(uintptr(1)<<(i%goarch.PtrBits)) == 0 {
continue
}
// Propagate that mark into the destination into one bit per the
// expansion factor f, offset to the object's offset within the span.
for j := range f {
b := i*f + j // i*f is the start bit for the object, j indexes into each corresponding word after.
unpacked[b/goarch.PtrBits] |= uintptr(1) << (b % goarch.PtrBits)
}
}
}
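
A small worked example of this expansion (hypothetical, test-style; it only builds inside the Go tree, and the 48-byte size class is looked up rather than hard-coded): marking object 3 in a span of 48-byte objects sets f = 48/8 = 6 consecutive bits starting at bit 3*6 = 18 of the pointer mask.

package scan_test

import (
	"testing"

	"internal/runtime/gc"
	"internal/runtime/gc/scan"
)

func TestExpandWorkedExample(t *testing.T) {
	// Find the size class for 48-byte objects rather than assuming its index.
	sizeClass := -1
	for sc, size := range gc.SizeClassToSize {
		if size == 48 {
			sizeClass = sc
			break
		}
	}
	if sizeClass < 0 {
		t.Skip("no 48-byte size class")
	}

	var packed gc.ObjMask
	var unpacked gc.PtrMask
	packed[0] = 1 << 3 // mark object 3

	scan.ExpandReference(sizeClass, &packed, &unpacked)

	// f = 48/8 = 6, so bits [18, 24) of the pointer mask should be set.
	want := uintptr(0b111111) << 18
	if unpacked[0] != want {
		t.Errorf("unpacked[0] = %#x, want %#x", unpacked[0], want)
	}
}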


@@ -0,0 +1,37 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package scan_test
import (
"internal/goarch"
"internal/runtime/gc"
"internal/runtime/gc/scan"
"testing"
)
type expandFunc func(sizeClass int, packed *gc.ObjMask, unpacked *gc.PtrMask)
func testExpand(t *testing.T, expF expandFunc) {
expR := scan.ExpandReference
testObjs(t, func(t *testing.T, sizeClass int, objs *gc.ObjMask) {
var want, got gc.PtrMask
expR(sizeClass, objs, &want)
expF(sizeClass, objs, &got)
for i := range want {
if got[i] != want[i] {
t.Errorf("expansion differs from reference at bit %d", i*goarch.PtrSize)
if goarch.PtrSize == 4 {
t.Logf("got: %032b", got[i])
t.Logf("want: %032b", want[i])
} else {
t.Logf("got: %064b", got[i])
t.Logf("want: %064b", want[i])
}
}
}
})
}


@@ -0,0 +1,35 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package scan
import "unsafe"
// FilterNil packs non-nil (non-zero) values in bufp together
// at the beginning of bufp, returning the length of the
// packed buffer. It treats bufp as an array of size n.
//
// TODO(mknyszek): Add a faster SIMD-based implementation.
func FilterNil(bufp *uintptr, n int32) int32 {
buf := unsafe.Slice(bufp, int(n))
lo := 0
hi := len(buf) - 1
for lo < hi {
for lo < hi && buf[hi] == 0 {
hi--
}
for lo < hi && buf[lo] != 0 {
lo++
}
if lo >= hi {
break
}
buf[lo] = buf[hi]
hi--
}
if hi >= 0 && buf[hi] == 0 {
hi--
}
return int32(hi) + 1
}


@@ -0,0 +1,94 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package scan_test
import (
"internal/runtime/gc/scan"
"testing"
)
func TestFilterNil(t *testing.T) {
t.Run("empty", func(t *testing.T) {
testFilterNil(t, []uintptr{}, []uintptr{})
})
t.Run("one", func(t *testing.T) {
testFilterNil(t, []uintptr{4}, []uintptr{4})
})
t.Run("elimOne", func(t *testing.T) {
testFilterNil(t, []uintptr{0}, []uintptr{})
})
t.Run("oneElimBegin", func(t *testing.T) {
testFilterNil(t, []uintptr{0, 4}, []uintptr{4})
})
t.Run("oneElimEnd", func(t *testing.T) {
testFilterNil(t, []uintptr{4, 0}, []uintptr{4})
})
t.Run("oneElimMultiBegin", func(t *testing.T) {
testFilterNil(t, []uintptr{0, 0, 0, 4}, []uintptr{4})
})
t.Run("oneElimMultiEnd", func(t *testing.T) {
testFilterNil(t, []uintptr{4, 0, 0, 0}, []uintptr{4})
})
t.Run("oneElimMulti", func(t *testing.T) {
testFilterNil(t, []uintptr{0, 0, 0, 4, 0}, []uintptr{4})
})
t.Run("two", func(t *testing.T) {
testFilterNil(t, []uintptr{5, 12}, []uintptr{5, 12})
})
t.Run("twoElimBegin", func(t *testing.T) {
testFilterNil(t, []uintptr{0, 5, 12}, []uintptr{5, 12})
})
t.Run("twoElimMid", func(t *testing.T) {
testFilterNil(t, []uintptr{5, 0, 12}, []uintptr{5, 12})
})
t.Run("twoElimEnd", func(t *testing.T) {
testFilterNil(t, []uintptr{5, 12, 0}, []uintptr{5, 12})
})
t.Run("twoElimMulti", func(t *testing.T) {
testFilterNil(t, []uintptr{0, 5, 0, 12, 0}, []uintptr{5, 12})
})
t.Run("Multi", func(t *testing.T) {
testFilterNil(t, []uintptr{1, 5, 5, 0, 0, 0, 12, 0, 121, 5, 0}, []uintptr{1, 5, 5, 12, 121, 5})
})
}
func testFilterNil(t *testing.T, buf, want []uintptr) {
var bufp *uintptr
if len(buf) != 0 {
bufp = &buf[0]
}
n := scan.FilterNil(bufp, int32(len(buf)))
if n > int32(len(buf)) {
t.Errorf("bogus new length returned: %d > %d", n, len(buf))
return
}
buf = buf[:n]
if len(buf) != len(want) {
t.Errorf("lengths differ: got %d, want %d", len(buf), len(want))
}
wantMap := make(map[uintptr]int)
gotMap := make(map[uintptr]int)
for _, p := range want {
wantMap[p]++
}
for _, p := range buf {
gotMap[p]++
}
for p, nWant := range wantMap {
if nGot, ok := gotMap[p]; !ok {
t.Errorf("want %d, but missing from output", p)
} else if nGot != nWant {
t.Errorf("want %d copies of %d, but got %d", nWant, p, nGot)
}
}
for p := range gotMap {
if _, ok := wantMap[p]; !ok {
t.Errorf("got %d, but didn't want it", p)
}
}
t.Logf("got: %v", buf)
t.Logf("want: %v", want)
}


@@ -0,0 +1,16 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !unix
package scan_test
import (
"testing"
)
func makeMem(t testing.TB, nPages int) ([]uintptr, func()) {
t.Skip("mmap unsupported")
return nil, nil
}


@@ -0,0 +1,25 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build unix
package scan_test
import (
"internal/runtime/gc"
"syscall"
"testing"
"unsafe"
)
func makeMem(t testing.TB, nPages int) ([]uintptr, func()) {
mem, err := syscall.Mmap(-1, 0, int(gc.PageSize*nPages), syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_PRIVATE|syscall.MAP_ANON)
if err != nil {
t.Fatalf("mmap failed: %s", err)
}
free := func() {
syscall.Munmap(mem)
}
return unsafe.Slice((*uintptr)(unsafe.Pointer(unsafe.SliceData(mem))), len(mem)/8), free
}


@@ -0,0 +1,412 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build ignore
package main
import (
"bytes"
"fmt"
"io"
"log"
"os"
"slices"
"strconv"
"internal/runtime/gc"
"internal/runtime/gc/internal/gen"
)
const header = "// Code generated by mkasm.go. DO NOT EDIT.\n\n"
func main() {
generate("expand_amd64.s", genExpanders)
}
func generate(fileName string, genFunc func(*gen.File)) {
var buf bytes.Buffer
tee := io.MultiWriter(&buf, os.Stdout)
file := gen.NewFile(tee)
genFunc(file)
fmt.Fprintf(tee, header)
file.Compile()
f, err := os.Create(fileName)
if err != nil {
log.Fatal(err)
}
defer f.Close()
_, err = f.Write(buf.Bytes())
if err != nil {
log.Fatal(err)
}
}
func genExpanders(file *gen.File) {
gcExpandersAVX512 := make([]*gen.Func, len(gc.SizeClassToSize))
for sc, ob := range gc.SizeClassToSize {
if gc.SizeClassToNPages[sc] != 1 {
// These functions all produce a bitmap that covers exactly one
// page.
continue
}
if ob > gc.MinSizeForMallocHeader {
// This size class is too big to have a packed pointer/scalar bitmap.
break
}
xf := int(ob) / 8
log.Printf("size class %d bytes, expansion %dx", ob, xf)
fn := gen.NewFunc(fmt.Sprintf("expandAVX512_%d<>", xf))
ptrObjBits := gen.Arg[gen.Ptr[gen.Uint8x64]](fn)
if xf == 1 {
expandIdentity(ptrObjBits)
} else {
ok := gfExpander(xf, ptrObjBits)
if !ok {
log.Printf("failed to generate expander for size class %d", sc)
}
}
file.AddFunc(fn)
gcExpandersAVX512[sc] = fn
}
// Generate table mapping size class to expander PC
file.AddConst("·gcExpandersAVX512", gcExpandersAVX512)
}
// mat8x8 is an 8x8 bit matrix.
type mat8x8 struct {
mat [8]uint8
}
func matGroupToVec(mats *[8]mat8x8) [8]uint64 {
var out [8]uint64
for i, mat := range mats {
for j, row := range mat.mat {
// For some reason, Intel flips the rows.
out[i] |= uint64(row) << ((7 - j) * 8)
}
}
return out
}
// expandIdentity implements 1x expansion (that is, no expansion).
func expandIdentity(ptrObjBits gen.Ptr[gen.Uint8x64]) {
objBitsLo := gen.Deref(ptrObjBits)
objBitsHi := gen.Deref(ptrObjBits.AddConst(64))
gen.Return(objBitsLo, objBitsHi)
}
// gfExpander produces a function that expands each bit in an input bitmap into
// f consecutive bits in an output bitmap.
//
// The input is
//
// AX *[8]uint64 = A pointer to floor(1024/f) bits (f >= 2, so at most 512 bits)
//
// The output is
//
// Z1 [64]uint8 = The bottom 512 bits of the expanded bitmap
// Z2 [64]uint8 = The top 512 bits of the expanded bitmap
//
// TODO(austin): This should be Z0/Z1.
func gfExpander(f int, ptrObjBits gen.Ptr[gen.Uint8x64]) bool {
// TODO(austin): For powers of 2 >= 8, we can use mask expansion ops to make this much simpler.
// TODO(austin): For f >= 8, I suspect there are better ways to do this.
//
// For example, we could use a mask expansion to get a full byte for each
// input bit, and separately create the bytes that blend adjacent bits, then
// shuffle those bytes together. Certainly for f >= 16 this makes sense
// because each of those bytes will be used, possibly more than once.
objBits := gen.Deref(ptrObjBits)
type term struct {
iByte, oByte int
mat mat8x8
}
var terms []term
// Iterate over all output bytes and construct the 8x8 GF2 matrix to compute
// the output byte from the appropriate input byte. Gather all of these into
// "terms".
for oByte := 0; oByte < 1024/8; oByte++ {
var byteMat mat8x8
iByte := -1
for oBit := oByte * 8; oBit < oByte*8+8; oBit++ {
iBit := oBit / f
if iByte == -1 {
iByte = iBit / 8
} else if iByte != iBit/8 {
log.Printf("output byte %d straddles input bytes %d and %d", oByte, iByte, iBit/8)
return false
}
// One way to view this is that the i'th row of the matrix will be
// ANDed with the input byte, and the parity of the result will set
// the i'th bit in the output. We use a simple 1 bit mask, so the
// parity is irrelevant beyond selecting out that one bit.
byteMat.mat[oBit%8] = 1 << (iBit % 8)
}
terms = append(terms, term{iByte, oByte, byteMat})
}
if false {
// Print input byte -> output byte as a matrix
maxIByte, maxOByte := 0, 0
for _, term := range terms {
maxIByte = max(maxIByte, term.iByte)
maxOByte = max(maxOByte, term.oByte)
}
iToO := make([][]rune, maxIByte+1)
for i := range iToO {
iToO[i] = make([]rune, maxOByte+1)
}
matMap := make(map[mat8x8]int)
for _, term := range terms {
i, ok := matMap[term.mat]
if !ok {
i = len(matMap)
matMap[term.mat] = i
}
iToO[term.iByte][term.oByte] = 'A' + rune(i)
}
for o := range maxOByte + 1 {
fmt.Printf("%d", o)
for i := range maxIByte + 1 {
fmt.Printf(",")
if mat := iToO[i][o]; mat != 0 {
fmt.Printf("%c", mat)
}
}
fmt.Println()
}
}
// In hardware, each (8 byte) matrix applies to 8 bytes of data in parallel,
// and we get to operate on up to 8 matrixes in parallel (or 64 values). That is:
//
// abcdefgh ijklmnop qrstuvwx yzABCDEF GHIJKLMN OPQRSTUV WXYZ0123 456789_+
// mat0 mat1 mat2 mat3 mat4 mat5 mat6 mat7
// Group the terms by matrix, but limit each group to 8 terms.
const termsPerGroup = 8 // Number of terms we can multiply by the same matrix.
const groupsPerSuperGroup = 8 // Number of matrixes we can fit in a vector.
matMap := make(map[mat8x8]int)
allMats := make(map[mat8x8]bool)
var termGroups [][]term
for _, term := range terms {
allMats[term.mat] = true
i, ok := matMap[term.mat]
if ok && f > groupsPerSuperGroup {
// The output is ultimately produced in two [64]uint8 registers.
// Getting every byte in the right place of each of these requires a
// final permutation that often requires more than one source.
//
// Up to 8x expansion, we can get a really nice grouping so we can use
// the same 8 matrix vector several times, without producing
// permutations that require more than two sources.
//
// Above 8x, however, we can't get nice matrixes anyway, so we
// instead prefer reducing the complexity of the permutations we
// need to produce the final outputs. To do this, avoid grouping
// together terms that are split across the two registers.
outRegister := termGroups[i][0].oByte / 64
if term.oByte/64 != outRegister {
ok = false
}
}
if !ok {
// Start a new term group.
i = len(termGroups)
matMap[term.mat] = i
termGroups = append(termGroups, nil)
}
termGroups[i] = append(termGroups[i], term)
if len(termGroups[i]) == termsPerGroup {
// This term group is full.
delete(matMap, term.mat)
}
}
for i, termGroup := range termGroups {
log.Printf("term group %d:", i)
for _, term := range termGroup {
log.Printf(" %+v", term)
}
}
// We can do 8 matrix multiplies in parallel, which is 8 term groups. Pack
// as many term groups as we can into each super-group to minimize the
// number of matrix multiplies.
//
// Ideally, we use the same matrix in each super-group, which might mean
// doing fewer than 8 multiplies at a time. That's fine because it never
// increases the total number of matrix multiplies.
//
// TODO: Packing the matrixes less densely may let us use more broadcast
// loads instead of general permutations, though. That replaces a load of
// the permutation with a load of the matrix, but is probably still slightly
// better.
var sgSize, nSuperGroups int
oneMatVec := f <= groupsPerSuperGroup
if oneMatVec {
// We can use the same matrix in each multiply by doing sgSize
// multiplies at a time.
sgSize = groupsPerSuperGroup / len(allMats) * len(allMats)
nSuperGroups = (len(termGroups) + sgSize - 1) / sgSize
} else {
// We can't use the same matrix for each multiply. Just do as many at a
// time as we can.
//
// TODO: This is going to produce several distinct matrixes, when we
// probably only need two. Be smarter about how we create super-groups
// in this case. Maybe we build up an array of super-groups and then the
// loop below just turns them into ops?
sgSize = 8
nSuperGroups = (len(termGroups) + groupsPerSuperGroup - 1) / groupsPerSuperGroup
}
// Construct each super-group.
var matGroup [8]mat8x8
var matMuls []gen.Uint8x64
var perm [128]int
for sgi := range nSuperGroups {
var iperm [64]uint8
for i := range iperm {
iperm[i] = 0xff // "Don't care"
}
// Pick off sgSize term groups.
superGroup := termGroups[:min(len(termGroups), sgSize)]
termGroups = termGroups[len(superGroup):]
// Build the matrix and permutations for this super-group.
var thisMatGroup [8]mat8x8
for i, termGroup := range superGroup {
// All terms in this group have the same matrix. Pick one.
thisMatGroup[i] = termGroup[0].mat
for j, term := range termGroup {
// Build the input permutation.
iperm[i*termsPerGroup+j] = uint8(term.iByte)
// Build the output permutation.
perm[term.oByte] = sgi*groupsPerSuperGroup*termsPerGroup + i*termsPerGroup + j
}
}
log.Printf("input permutation %d: %v", sgi, iperm)
// Check that we're not making more distinct matrixes than expected.
if oneMatVec {
if sgi == 0 {
matGroup = thisMatGroup
} else if matGroup != thisMatGroup {
log.Printf("super-groups have different matrixes:\n%+v\n%+v", matGroup, thisMatGroup)
return false
}
}
// Emit matrix op.
matConst := gen.ConstUint64x8(matGroupToVec(&thisMatGroup), fmt.Sprintf("*_mat%d<>", sgi))
inOp := objBits.Shuffle(gen.ConstUint8x64(iperm, fmt.Sprintf("*_inShuf%d<>", sgi)))
matMul := matConst.GF2P8Affine(inOp)
matMuls = append(matMuls, matMul)
}
log.Printf("output permutation: %v", perm)
outLo, ok := genShuffle("*_outShufLo", (*[64]int)(perm[:64]), matMuls...)
if !ok {
log.Printf("bad number of inputs to final shuffle: %d != 1, 2, or 4", len(matMuls))
return false
}
outHi, ok := genShuffle("*_outShufHi", (*[64]int)(perm[64:]), matMuls...)
if !ok {
log.Printf("bad number of inputs to final shuffle: %d != 1, 2, or 4", len(matMuls))
return false
}
gen.Return(outLo, outHi)
return true
}
func genShuffle(name string, perm *[64]int, args ...gen.Uint8x64) (gen.Uint8x64, bool) {
// Construct flattened permutation.
var vperm [64]byte
// Get the inputs used by this permutation.
var inputs []int
for i, src := range perm {
inputIdx := slices.Index(inputs, src/64)
if inputIdx == -1 {
inputIdx = len(inputs)
inputs = append(inputs, src/64)
}
vperm[i] = byte(src%64 | (inputIdx << 6))
}
// Emit instructions for easy cases.
switch len(inputs) {
case 1:
constOp := gen.ConstUint8x64(vperm, name)
return args[inputs[0]].Shuffle(constOp), true
case 2:
constOp := gen.ConstUint8x64(vperm, name)
return args[inputs[0]].Shuffle2(args[inputs[1]], constOp), true
}
// Harder case, we need to shuffle in from up to 2 more tables.
//
// Perform two shuffles. One shuffle will get its data from the first
// two inputs, the other shuffle will get its data from the other one
// or two inputs. Any lanes a shuffle doesn't care about are zeroed.
var vperms [2][64]byte
var masks [2]uint64
for j, idx := range vperm {
for i := range vperms {
vperms[i][j] = 0xff // "Don't care"
}
if idx == 0xff {
continue
}
vperms[idx/128][j] = idx % 128
masks[idx/128] |= uint64(1) << j
}
// Validate that the masks are fully disjoint.
if masks[0]^masks[1] != ^uint64(0) {
panic("bad shuffle!")
}
// Generate constants.
constOps := make([]gen.Uint8x64, len(vperms))
for i, v := range vperms {
constOps[i] = gen.ConstUint8x64(v, name+strconv.Itoa(i))
}
// Generate shuffles.
switch len(inputs) {
case 3:
r0 := args[inputs[0]].Shuffle2Zeroed(args[inputs[1]], constOps[0], gen.ConstMask64(masks[0]))
r1 := args[inputs[2]].ShuffleZeroed(constOps[1], gen.ConstMask64(masks[1]))
return r0.ToUint64x8().Or(r1.ToUint64x8()).ToUint8x64(), true
case 4:
r0 := args[inputs[0]].Shuffle2Zeroed(args[inputs[1]], constOps[0], gen.ConstMask64(masks[0]))
r1 := args[inputs[2]].Shuffle2Zeroed(args[inputs[3]], constOps[1], gen.ConstMask64(masks[1]))
return r0.ToUint64x8().Or(r1.ToUint64x8()).ToUint8x64(), true
}
// Too many inputs. To support more, we'd need to separate tables much earlier.
// Right now all the indices fit in a byte, but with >4 inputs they might not (>256 bytes).
return args[0], false
}
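For the 3- and 4-input fallback above, the scalar sketch below (mergeTwoMaskedShuffles is hypothetical, not part of this file) spells out the invariant the emitted code relies on: the two half-permutations own disjoint lane sets that together cover all 64 lanes, each half zeroes the lanes it does not own, and a plain OR merges the halves.
func mergeTwoMaskedShuffles(srcs [4][64]byte, perm [64]int) (out [64]byte) {
	var halves [2][64]byte
	var masks [2]uint64
	for lane, src := range perm {
		half := src / 128 // sources 0 and 1 feed half 0; sources 2 and 3 feed half 1
		halves[half][lane] = srcs[src/64][src%64]
		masks[half] |= uint64(1) << lane
	}
	if masks[0]^masks[1] != ^uint64(0) {
		panic("halves must be disjoint and cover every lane")
	}
	for lane := range out {
		// Lanes a half doesn't own were left zero, so OR selects the live value.
		out[lane] = halves[0][lane] | halves[1][lane]
	}
	return out
}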

View File

@@ -0,0 +1,41 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package scan
import (
"internal/cpu"
"internal/runtime/gc"
"unsafe"
)
func ScanSpanPacked(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
if CanAVX512() {
return ScanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask)
}
panic("not implemented")
}
func HasFastScanSpanPacked() bool {
return avx512ScanPackedReqsMet
}
// -- AVX512 --
func CanAVX512() bool {
return avx512ScanPackedReqsMet
}
func ScanSpanPackedAVX512(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
return FilterNil(bufp, scanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask))
}
//go:noescape
func scanSpanPackedAVX512(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32)
var avx512ScanPackedReqsMet = cpu.X86.HasAVX512VL &&
cpu.X86.HasAVX512BW &&
cpu.X86.HasGFNI &&
cpu.X86.HasAVX512BITALG &&
cpu.X86.HasAVX512VBMI
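A hedged usage sketch (scanOnePage is hypothetical and assumes imports of unsafe, internal/runtime/gc, and internal/runtime/gc/scan): callers are expected to check HasFastScanSpanPacked once and only then lean on ScanSpanPacked as their hot path. mem must point at a full, gc.PageSize-aligned page and bufp must have room for gc.PageWords pointers.
func scanOnePage(mem unsafe.Pointer, bufp *uintptr, marks *gc.ObjMask, sizeClass uintptr, ptrs *gc.PtrMask) int32 {
	if scan.HasFastScanSpanPacked() {
		// On amd64 this dispatches to the AVX512 kernel when the CPU has
		// AVX512VL, AVX512BW, GFNI, BITALG, and VBMI.
		return scan.ScanSpanPacked(mem, bufp, marks, sizeClass, ptrs)
	}
	// Portable fallback; the runtime itself prefers per-object scanning when
	// no fast kernel is available (see the !amd64 comment below).
	return scan.ScanSpanPackedGo(mem, bufp, marks, sizeClass, ptrs)
}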

View File

@@ -0,0 +1,103 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// Test-only.
TEXT ·ExpandAVX512(SB), NOSPLIT, $0-24
MOVQ sizeClass+0(FP), CX
MOVQ packed+8(FP), AX
// Call the expander for this size class
LEAQ ·gcExpandersAVX512(SB), BX
CALL (BX)(CX*8)
MOVQ unpacked+16(FP), DI // Expanded output bitmap pointer
VMOVDQU64 Z1, 0(DI)
VMOVDQU64 Z2, 64(DI)
VZEROUPPER
RET
TEXT ·scanSpanPackedAVX512(SB), NOSPLIT, $256-44
// Z1+Z2 = Expand the grey object mask into a grey word mask
MOVQ objMarks+16(FP), AX
MOVQ sizeClass+24(FP), CX
LEAQ ·gcExpandersAVX512(SB), BX
CALL (BX)(CX*8)
// Z3+Z4 = Load the pointer mask
MOVQ ptrMask+32(FP), AX
VMOVDQU64 0(AX), Z3
VMOVDQU64 64(AX), Z4
// Z1+Z2 = Combine the grey word mask with the pointer mask to get the scan mask
VPANDQ Z1, Z3, Z1
VPANDQ Z2, Z4, Z2
// Now each bit of Z1+Z2 represents one word of the span.
// Thus, each byte covers 64 bytes of memory, which is also how
// much we can fit in a Z register.
//
// We do a load/compress for each 64 byte frame.
//
// Z3+Z4 [128]uint8 = Number of memory words to scan in each 64 byte frame
VPOPCNTB Z1, Z3 // Requires BITALG
VPOPCNTB Z2, Z4
// Store the scan mask and word counts at 0(SP) and 128(SP).
//
// TODO: Is it better to read directly from the registers?
VMOVDQU64 Z1, 0(SP)
VMOVDQU64 Z2, 64(SP)
VMOVDQU64 Z3, 128(SP)
VMOVDQU64 Z4, 192(SP)
// SI = Current address in span
MOVQ mem+0(FP), SI
// DI = Scan buffer base
MOVQ bufp+8(FP), DI
// DX = Index in scan buffer, (DI)(DX*8) = Current position in scan buffer
MOVQ $0, DX
// AX = address in scan mask, 128(AX) = address in popcount
LEAQ 0(SP), AX
// Loop over the 64 byte frames in this span.
// BX = 1 past the end of the scan mask
LEAQ 128(SP), BX
// Align loop to a cache line so that performance is less sensitive
// to how this function ends up laid out in memory. This is a hot
// function in the GC, and this is a tight loop. We don't want
// performance to waver wildly due to unrelated changes.
PCALIGN $64
loop:
// CX = Fetch the mask of words to load from this frame.
MOVBQZX 0(AX), CX
// Skip empty frames.
TESTQ CX, CX
JZ skip
// Load the 64 byte frame.
KMOVB CX, K1
VMOVDQA64 0(SI), Z1
// Collect just the pointers from the greyed objects into the scan buffer,
// i.e., copy the word indices in the mask from Z1 into contiguous memory.
VPCOMPRESSQ Z1, K1, (DI)(DX*8)
// Advance the scan buffer position by the number of pointers.
MOVBQZX 128(AX), CX
ADDQ CX, DX
skip:
ADDQ $64, SI
ADDQ $1, AX
CMPQ AX, BX
JB loop
end:
MOVL DX, count+40(FP)
VZEROUPPER
RET
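For readers not fluent in AVX512, the following pure-Go model (scanFramesModel is hypothetical; assumes unsafe and internal/runtime/gc are imported) mirrors the loop above: scanMask stands in for the 128-byte combined mask stored at 0(SP), each of its bytes selects words from one 64-byte frame just like the KMOVB/VPCOMPRESSQ pair, and, as in the assembly, nil pointers are not filtered here but later by FilterNil in the Go wrapper.
func scanFramesModel(mem unsafe.Pointer, buf []uintptr, scanMask *[gc.PageWords / 8]uint8) int32 {
	n := 0
	for frame, mask := range scanMask {
		if mask == 0 {
			continue // TESTQ/JZ: skip empty frames
		}
		words := (*[8]uintptr)(unsafe.Add(mem, frame*64))
		for bit := 0; bit < 8; bit++ {
			if mask&(1<<bit) != 0 {
				// VPCOMPRESSQ packs the selected words contiguously.
				buf[n] = words[bit]
				n++
			}
		}
	}
	return int32(n)
}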

View File

@@ -0,0 +1,19 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64
package scan_test
import (
"internal/runtime/gc/scan"
"testing"
)
func TestScanSpanPackedAVX512(t *testing.T) {
if !scan.CanAVX512() {
t.Skip("no AVX512")
}
testScanSpanPacked(t, scan.ScanSpanPackedAVX512)
}

View File

@@ -0,0 +1,23 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !amd64
package scan
import (
"internal/runtime/gc"
"unsafe"
)
func HasFastScanSpanPacked() bool {
// N.B. ScanSpanPackedGo isn't actually fast enough to serve as a general-purpose implementation.
// The runtime's alternative of jumping between each object is still substantially better, even at
// relatively high object densities.
return false
}
func ScanSpanPacked(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
return ScanSpanPackedGo(mem, bufp, objMarks, sizeClass, ptrMask)
}

View File

@@ -0,0 +1,14 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package scan_test
import (
"internal/runtime/gc/scan"
"testing"
)
func TestScanSpanPackedGo(t *testing.T) {
testScanSpanPacked(t, scan.ScanSpanPackedGo)
}

View File

@@ -0,0 +1,104 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package scan
import (
"internal/goarch"
"internal/runtime/gc"
"internal/runtime/sys"
"unsafe"
)
// ScanSpanPackedGo is an optimized pure Go implementation of ScanSpanPacked.
func ScanSpanPackedGo(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
buf := newUnsafeBuf(bufp)
objBytes := uintptr(gc.SizeClassToSize[sizeClass])
// TODO(austin): Trim objMarks to the number of objects in this size class?
for markI, markWord := range objMarks {
for range sys.OnesCount64(uint64(markWord)) {
bitI := sys.TrailingZeros64(uint64(markWord))
markWord &^= 1 << bitI
objIndex := markI*goarch.PtrBits + bitI
// objStartInSpan is the index of the word from mem where the
// object starts. objEndInSpan points to the next object, i.e.
// it's an exclusive upper bound.
objStartInSpan := objBytes * uintptr(objIndex) / goarch.PtrSize
objEndInSpan := objStartInSpan + objBytes/goarch.PtrSize
// TODO: Another way to do this would be to extract the pointer mask
// for this object (it's at most 64 bits) and do a bit iteration
// over that.
for wordI := objStartInSpan; wordI < objEndInSpan; wordI++ {
val := *(*uintptr)(unsafe.Add(mem, wordI*goarch.PtrSize))
// Check if we should enqueue this word.
//
// We load the word before the check because, even though this
// can lead to loading much more than necessary, it's faster.
// Most likely this is because it warms up the hardware
// prefetcher much better, and gives us more time before we need
// the value.
//
// We discard values that can't possibly be useful pointers
// here, too, because this filters out a lot of words and does
// so with as little processing as possible.
//
// TODO: This is close to, but not entirely branchless.
isPtr := bool2int(ptrMask[wordI/goarch.PtrBits]&(1<<(wordI%goarch.PtrBits)) != 0)
isNonNil := bool2int(val >= 4096)
pred := isPtr&isNonNil != 0
buf.addIf(val, pred)
}
}
}
// We don't know the true size of bufp, but we can at least catch obvious errors
// in this function by making sure we didn't write more than gc.PageWords pointers
// into the buffer.
buf.check(gc.PageWords)
return int32(buf.n)
}
// unsafeBuf allows for appending to a buffer without bounds-checks or branches.
type unsafeBuf[T any] struct {
base *T
n int
}
func newUnsafeBuf[T any](base *T) unsafeBuf[T] {
return unsafeBuf[T]{base, 0}
}
// addIf appends a value to the buffer if the predicate is true.
//
// addIf speculatively writes to the next index of the buffer, so the caller
// must be certain that such a write will still be in-bounds with respect
// to the buffer's true capacity.
func (b *unsafeBuf[T]) addIf(val T, pred bool) {
*(*T)(unsafe.Add(unsafe.Pointer(b.base), b.n*int(unsafe.Sizeof(val)))) = val
b.n += bool2int(pred)
}
// check performs a bounds check on the speculative writes made by addIf.
// Call it shortly after a series of addIf calls so that any misuse is
// caught as early as possible. Separating the bounds check from the append
// is what makes addIf cheap, and a single check covering several appends
// retains most of that benefit while still catching overflows.
func (b unsafeBuf[T]) check(cap int) {
// We fail even if b.n == cap because addIf may have speculatively written to index b.n.
if b.n >= cap {
panic("unsafeBuf overflow")
}
}
func bool2int(x bool) int {
// This particular pattern gets optimized by the compiler.
var b int
if x {
b = 1
}
return b
}
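A hedged usage sketch of the addIf/check pattern above (collectNonNil is hypothetical and would have to live in this package, since unsafeBuf is unexported): store every candidate unconditionally, advance the length only when the predicate holds, and amortize one bounds check over the whole batch. dst must be longer than vals because addIf writes to index n even for rejected values and check treats n == cap as an overflow.
func collectNonNil(dst []uintptr, vals []uintptr) int {
	buf := newUnsafeBuf(&dst[0])
	for _, v := range vals {
		// Unconditional store, conditional advance: no branch on the hot path.
		buf.addIf(v, v >= 4096)
	}
	buf.check(len(dst))
	return buf.n
}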

View File

@@ -0,0 +1,40 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package scan
import (
"internal/goarch"
"internal/runtime/gc"
"unsafe"
)
// ScanSpanPackedReference is the reference implementation of ScanSpanPacked. It prioritizes clarity over performance.
//
// Concretely, ScanSpanPacked functions read pointers from mem, assumed to be gc.PageSize-aligned and gc.PageSize in size,
// and write them to bufp, which is large enough to guarantee that even if every pointer-word of mem is a pointer, it will fit.
// Therefore, bufp is always at least gc.PageSize in size.
//
// ScanSpanPacked is supposed to identify pointers by first filtering words by objMarks, where each bit of the mask
// represents gc.SizeClassToSize[sizeClass] bytes of memory, and then filtering again by the bits in ptrMask.
func ScanSpanPackedReference(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
buf := unsafe.Slice(bufp, gc.PageWords)
expandBy := uintptr(gc.SizeClassToSize[sizeClass]) / goarch.PtrSize
for word := range gc.PageWords {
objI := uintptr(word) / expandBy
if objMarks[objI/goarch.PtrBits]&(1<<(objI%goarch.PtrBits)) == 0 {
continue
}
if ptrMask[word/goarch.PtrBits]&(1<<(word%goarch.PtrBits)) == 0 {
continue
}
ptr := *(*uintptr)(unsafe.Add(mem, word*goarch.PtrSize))
if ptr == 0 {
continue
}
buf[count] = ptr
count++
}
return count
}
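For a concrete reading of the two filters, take the 48-byte size class: expandBy = 48/8 = 6 words per object, so word 13 of the page belongs to object 13/6 = 2, and that word is copied into buf only if bit 2 of objMarks, bit 13 of ptrMask, and the loaded word itself are all non-zero.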

View File

@@ -0,0 +1,254 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package scan_test
import (
"fmt"
"internal/cpu"
"internal/goarch"
"internal/runtime/gc"
"internal/runtime/gc/scan"
"math/bits"
"math/rand/v2"
"slices"
"sync"
"testing"
"unsafe"
)
type scanFunc func(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32)
func testScanSpanPacked(t *testing.T, scanF scanFunc) {
scanR := scan.ScanSpanPackedReference
// Construct a fake memory
mem, free := makeMem(t, 1)
defer free()
for i := range mem {
// Use values > heap.PageSize because a scan function can discard
// pointers smaller than this.
mem[i] = uintptr(int(gc.PageSize) + i + 1)
}
// Construct a random pointer mask
rnd := rand.New(rand.NewPCG(42, 42))
var ptrs gc.PtrMask
for i := range ptrs {
ptrs[i] = uintptr(rnd.Uint64())
}
bufF := make([]uintptr, gc.PageWords)
bufR := make([]uintptr, gc.PageWords)
testObjs(t, func(t *testing.T, sizeClass int, objs *gc.ObjMask) {
nF := scanF(unsafe.Pointer(&mem[0]), &bufF[0], objs, uintptr(sizeClass), &ptrs)
nR := scanR(unsafe.Pointer(&mem[0]), &bufR[0], objs, uintptr(sizeClass), &ptrs)
if nR != nF {
t.Errorf("want %d count, got %d", nR, nF)
} else if !slices.Equal(bufF[:nF], bufR[:nR]) {
t.Errorf("want scanned pointers %d, got %d", bufR[:nR], bufF[:nF])
}
})
}
func testObjs(t *testing.T, f func(t *testing.T, sizeClass int, objMask *gc.ObjMask)) {
for sizeClass := range gc.NumSizeClasses {
if sizeClass == 0 {
continue
}
size := uintptr(gc.SizeClassToSize[sizeClass])
if size > gc.MinSizeForMallocHeader {
break // Pointer/scalar metadata is not packed for larger sizes.
}
t.Run(fmt.Sprintf("size=%d", size), func(t *testing.T) {
// Scan a few objects near i to test boundary conditions.
const objMask = 0x101
nObj := uintptr(gc.SizeClassToNPages[sizeClass]) * gc.PageSize / size
for i := range nObj - uintptr(bits.Len(objMask)-1) {
t.Run(fmt.Sprintf("objs=0x%x<<%d", objMask, i), func(t *testing.T) {
var objs gc.ObjMask
objs[i/goarch.PtrBits] = objMask << (i % goarch.PtrBits)
f(t, sizeClass, &objs)
})
}
})
}
}
var dataCacheSizes = sync.OnceValue(func() []uintptr {
cs := cpu.DataCacheSizes()
for i, c := range cs {
fmt.Printf("# L%d cache: %d (%d Go pages)\n", i+1, c, c/gc.PageSize)
}
return cs
})
func BenchmarkScanSpanPacked(b *testing.B) {
benchmarkCacheSizes(b, benchmarkScanSpanPackedAllSizeClasses)
}
func benchmarkCacheSizes(b *testing.B, fn func(b *testing.B, heapPages int)) {
cacheSizes := dataCacheSizes()
b.Run("cache=tiny/pages=1", func(b *testing.B) {
fn(b, 1)
})
for i, cacheBytes := range cacheSizes {
pages := int(cacheBytes*3/4) / gc.PageSize
b.Run(fmt.Sprintf("cache=L%d/pages=%d", i+1, pages), func(b *testing.B) {
fn(b, pages)
})
}
ramPages := int(cacheSizes[len(cacheSizes)-1]*3/2) / gc.PageSize
b.Run(fmt.Sprintf("cache=ram/pages=%d", ramPages), func(b *testing.B) {
fn(b, ramPages)
})
}
func benchmarkScanSpanPackedAllSizeClasses(b *testing.B, nPages int) {
for sc := range gc.NumSizeClasses {
if sc == 0 {
continue
}
// Pointer/scalar metadata is not packed for larger sizes.
if uintptr(gc.SizeClassToSize[sc]) > gc.MinSizeForMallocHeader {
break
}
b.Run(fmt.Sprintf("sizeclass=%d", sc), func(b *testing.B) {
benchmarkScanSpanPacked(b, nPages, sc)
})
}
}
func benchmarkScanSpanPacked(b *testing.B, nPages int, sizeClass int) {
rnd := rand.New(rand.NewPCG(42, 42))
// Construct a fake memory
mem, free := makeMem(b, nPages)
defer free()
for i := range mem {
// Use values > gc.PageSize because a scan function can discard
// pointers smaller than this.
mem[i] = uintptr(int(gc.PageSize) + i + 1)
}
// Construct a random pointer mask
ptrs := make([]gc.PtrMask, nPages)
for i := range ptrs {
for j := range ptrs[i] {
ptrs[i][j] = uintptr(rnd.Uint64())
}
}
// Visit the pages in a random order
pageOrder := rnd.Perm(nPages)
// Create the scan buffer.
buf := make([]uintptr, gc.PageWords)
// Sweep from 0 marks to all marks. We'll use the same marks for each page
// because I don't think that predictability matters.
objBytes := uintptr(gc.SizeClassToSize[sizeClass])
nObj := gc.PageSize / objBytes
markOrder := rnd.Perm(int(nObj))
const steps = 11
for i := 0; i < steps; i++ {
frac := float64(i) / float64(steps-1)
// Set frac marks.
nMarks := int(float64(len(markOrder))*frac + 0.5)
var objMarks gc.ObjMask
for _, mark := range markOrder[:nMarks] {
objMarks[mark/goarch.PtrBits] |= 1 << (mark % goarch.PtrBits)
}
greyClusters := 0
for page := range ptrs {
greyClusters += countGreyClusters(sizeClass, &objMarks, &ptrs[page])
}
// Report MB/s of how much memory they're actually hitting. This assumes
// 64 byte cache lines (TODO: Should it assume 128 byte cache lines?)
// and expands each access to the whole cache line. This is useful for
// comparing against memory bandwidth.
//
// TODO: Add a benchmark that just measures single core memory bandwidth
// for comparison. (See runtime memcpy benchmarks.)
//
// TODO: Should there be a separate measure where we don't expand to
// cache lines?
avgBytes := int64(greyClusters) * int64(cpu.CacheLineSize) / int64(len(ptrs))
b.Run(fmt.Sprintf("pct=%d", int(100*frac)), func(b *testing.B) {
b.Run("impl=Reference", func(b *testing.B) {
b.SetBytes(avgBytes)
for i := range b.N {
page := pageOrder[i%len(pageOrder)]
scan.ScanSpanPackedReference(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page])
}
})
b.Run("impl=Go", func(b *testing.B) {
b.SetBytes(avgBytes)
for i := range b.N {
page := pageOrder[i%len(pageOrder)]
scan.ScanSpanPackedGo(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page])
}
})
if scan.HasFastScanSpanPacked() {
b.Run("impl=Platform", func(b *testing.B) {
b.SetBytes(avgBytes)
for i := range b.N {
page := pageOrder[i%len(pageOrder)]
scan.ScanSpanPacked(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page])
}
})
}
})
}
}
func countGreyClusters(sizeClass int, objMarks *gc.ObjMask, ptrMask *gc.PtrMask) int {
clusters := 0
lastCluster := -1
expandBy := uintptr(gc.SizeClassToSize[sizeClass]) / goarch.PtrSize
for word := range gc.PageWords {
objI := uintptr(word) / expandBy
if objMarks[objI/goarch.PtrBits]&(1<<(objI%goarch.PtrBits)) == 0 {
continue
}
if ptrMask[word/goarch.PtrBits]&(1<<(word%goarch.PtrBits)) == 0 {
continue
}
c := word * 8 / goarch.PtrBits
if c != lastCluster {
lastCluster = c
clusters++
}
}
return clusters
}
func BenchmarkScanMaxBandwidth(b *testing.B) {
// Measure the theoretical "maximum" bandwidth of scanning by reproducing
// the memory access pattern of a full page scan, but using memcpy as the
// kernel instead of scanning.
benchmarkCacheSizes(b, func(b *testing.B, heapPages int) {
mem, free := makeMem(b, heapPages)
defer free()
for i := range mem {
mem[i] = uintptr(int(gc.PageSize) + i + 1)
}
buf := make([]uintptr, gc.PageWords)
// Visit the pages in a random order
rnd := rand.New(rand.NewPCG(42, 42))
pageOrder := rnd.Perm(heapPages)
b.SetBytes(int64(gc.PageSize))
b.ResetTimer()
for i := range b.N {
page := pageOrder[i%len(pageOrder)]
copy(buf, mem[gc.PageWords*page:])
}
})
}
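As a worked example of the avgBytes accounting above (numbers hypothetical): if a given mark fraction greys 600 distinct 8-word clusters summed over 4 pages, the benchmark charges 600 * 64 / 4 = 9600 bytes per scanned page, so the reported throughput approximates the cache-line traffic the kernel actually touches rather than the full 8 KiB page.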

View File

@@ -82,14 +82,15 @@ package gc
// 8192 13 32768
const (
MinHeapAlign = 8
MaxSmallSize = 32768
SmallSizeDiv = 8
SmallSizeMax = 1024
LargeSizeDiv = 128
NumSizeClasses = 68
PageShift = 13
MaxObjsPerSpan = 1024
MinHeapAlign = 8
MaxSmallSize = 32768
SmallSizeDiv = 8
SmallSizeMax = 1024
LargeSizeDiv = 128
NumSizeClasses = 68
PageShift = 13
MaxObjsPerSpan = 1024
MaxSizeClassNPages = 10
)
var SizeClassToSize = [NumSizeClasses]uint16{0, 8, 16, 24, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 576, 640, 704, 768, 896, 1024, 1152, 1280, 1408, 1536, 1792, 2048, 2304, 2688, 3072, 3200, 3456, 4096, 4864, 5376, 6144, 6528, 6784, 6912, 8192, 9472, 9728, 10240, 10880, 12288, 13568, 14336, 16384, 18432, 19072, 20480, 21760, 24576, 27264, 28672, 32768}