internal/runtime/gc/scan: import scan kernel from gclab [green tea]
This change imports the AVX512 GC scanning kernel from CL 593938 into a new
package, internal/runtime/gc/scan. Credit to Austin Clements for most of this
work. I did some cleanup, added support for more size classes to the
expanders, and added more testing. I also restructured the code to make it
easier and clearer to add new scan kernels for new architectures.

For #73581.

Change-Id: I76bcbc889fa6cad73ba0084620fae084a5912e6b
Cq-Include-Trybots: luci.golang.try:gotip-linux-amd64_avx512,gotip-linux-amd64_avx512-greenteagc
Reviewed-on: https://go-review.googlesource.com/c/go/+/655280
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
Commit 889ab74169 (parent 182336bf05), committed by Gopher Robot.
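For orientation, here is a minimal sketch of how the exported entry points of the new package fit together, written the way an external test (package scan_test) would use them. The dispatcher function below is an assumption for illustration only; the runtime's actual kernel-selection logic is not part of this excerpt.

package scan_test

import (
	"internal/runtime/gc"
	"internal/runtime/gc/scan"
)

// expand is a hypothetical dispatcher: use the AVX512 kernel when the CPU
// supports it, otherwise fall back to the portable reference expander.
func expand(sizeClass int, objs *gc.ObjMask, ptrs *gc.PtrMask) {
	if scan.CanAVX512() {
		scan.ExpandAVX512(sizeClass, objs, ptrs)
		return
	}
	scan.ExpandReference(sizeClass, objs, ptrs)
}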
@@ -52,6 +52,7 @@ var runtimePkgs = []string{
"internal/runtime/cgroup",
"internal/runtime/exithook",
"internal/runtime/gc",
"internal/runtime/gc/scan",
"internal/runtime/maps",
"internal/runtime/math",
"internal/runtime/strconv",

@@ -100,6 +100,7 @@ var depsRules = `
< internal/runtime/maps
< internal/runtime/strconv
< internal/runtime/cgroup
< internal/runtime/gc/scan
< runtime
< sync/atomic
< internal/sync

@@ -797,6 +798,20 @@ var depsRules = `

FMT, testing < internal/cgrouptest;
C, CGO < internal/runtime/cgobench;

# Generate-only packages can have anything they want
container/heap,
encoding/binary,
fmt,
hash/maphash,
io,
log,
math/bits,
os,
reflect,
strings,
sync
< internal/runtime/gc/internal/gen;
`

// listStdPkgs returns the same list of packages as "go list std".
@@ -34,15 +34,19 @@ var X86 struct {
HasAVX512 bool // Virtual feature: F+CD+BW+DQ+VL
HasAVX512F bool
HasAVX512CD bool
HasAVX512BITALG bool
HasAVX512BW bool
HasAVX512DQ bool
HasAVX512VL bool
HasAVX512VPCLMULQDQ bool
HasAVX512VBMI bool
HasAVX512VBMI2 bool
HasBMI1 bool
HasBMI2 bool
HasERMS bool
HasFSRM bool
HasFMA bool
HasGFNI bool
HasOSXSAVE bool
HasPCLMULQDQ bool
HasPOPCNT bool

@@ -18,7 +18,7 @@ func xgetbv() (eax, edx uint32)
func getGOAMD64level() int32

const (
// ecx bits
// Bits returned in ECX for CPUID EAX=0x1 ECX=0x0
cpuid_SSE3 = 1 << 0
cpuid_PCLMULQDQ = 1 << 1
cpuid_SSSE3 = 1 << 9

@@ -30,7 +30,7 @@ const (
cpuid_OSXSAVE = 1 << 27
cpuid_AVX = 1 << 28

// ebx bits
// "Extended Feature Flag" bits returned in EBX for CPUID EAX=0x7 ECX=0x0
cpuid_BMI1 = 1 << 3
cpuid_AVX2 = 1 << 5
cpuid_BMI2 = 1 << 8

@@ -43,8 +43,12 @@ const (
cpuid_AVX512BW = 1 << 30
cpuid_AVX512VL = 1 << 31

// ecx bits
// "Extended Feature Flag" bits returned in ECX for CPUID EAX=0x7 ECX=0x0
cpuid_AVX512_VBMI = 1 << 1
cpuid_AVX512_VBMI2 = 1 << 6
cpuid_GFNI = 1 << 8
cpuid_AVX512VPCLMULQDQ = 1 << 10
cpuid_AVX512_BITALG = 1 << 12

// edx bits
cpuid_FSRM = 1 << 4

@@ -163,6 +167,10 @@ func doinit() {
X86.HasAVX512DQ = isSet(ebx7, cpuid_AVX512DQ)
X86.HasAVX512VL = isSet(ebx7, cpuid_AVX512VL)
X86.HasAVX512VPCLMULQDQ = isSet(ecx7, cpuid_AVX512VPCLMULQDQ)
X86.HasAVX512VBMI = isSet(ecx7, cpuid_AVX512_VBMI)
X86.HasAVX512VBMI2 = isSet(ecx7, cpuid_AVX512_VBMI2)
X86.HasGFNI = isSet(ecx7, cpuid_GFNI)
X86.HasAVX512BITALG = isSet(ecx7, cpuid_AVX512_BITALG)
}

X86.HasFSRM = isSet(edx7, cpuid_FSRM)
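The scan package gates its kernel on some combination of these flags (see scan.CanAVX512 in the tests further down). A sketch of what such a gate might look like from a client package of internal/cpu; the exact feature set the kernel requires is not visible in this diff, so both the function name and the flag combination below are assumptions.

// haveScanKernelFeatures is a hypothetical example of combining the new
// flags; the real gate lives in internal/runtime/gc/scan and may differ.
func haveScanKernelFeatures() bool {
	return cpu.X86.HasAVX512 && // virtual feature: F+CD+BW+DQ+VL
		cpu.X86.HasAVX512VBMI &&
		cpu.X86.HasAVX512VBMI2 &&
		cpu.X86.HasGFNI &&
		cpu.X86.HasAVX512BITALG
}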
11 src/internal/cpu/datacache_unsupported.go Normal file
@@ -0,0 +1,11 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !386 && !amd64

package cpu

func DataCacheSizes() []uintptr {
	return nil
}
121 src/internal/cpu/datacache_x86.go Normal file
@@ -0,0 +1,121 @@
|
||||
// Copyright 2025 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:build 386 || amd64
|
||||
|
||||
package cpu
|
||||
|
||||
// DataCacheSizes returns the size of each data cache from lowest
|
||||
// level in the hierarchy to highest.
|
||||
//
|
||||
// Unlike other parts of this package's public API, it is not safe
|
||||
// to reference early in runtime initialization because it allocates.
|
||||
// It's intended for testing only.
|
||||
func DataCacheSizes() []uintptr {
|
||||
maxFunctionInformation, ebx0, ecx0, edx0 := cpuid(0, 0)
|
||||
if maxFunctionInformation < 1 {
|
||||
return nil
|
||||
}
|
||||
|
||||
switch {
|
||||
// Check for "GenuineIntel"
|
||||
case ebx0 == 0x756E6547 && ecx0 == 0x6C65746E && edx0 == 0x49656E69:
|
||||
return getDataCacheSizesIntel(maxFunctionInformation)
|
||||
// Check for "AuthenticAMD"
|
||||
case ebx0 == 0x68747541 && ecx0 == 0x444D4163 && edx0 == 0x69746E65:
|
||||
return getDataCacheSizesAMD()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func extractBits(arg uint32, l int, r int) uint32 {
|
||||
if l > r {
|
||||
panic("bad bit range")
|
||||
}
|
||||
return (arg >> l) & ((1 << (r - l + 1)) - 1)
|
||||
}
|
||||
|
||||
func getDataCacheSizesIntel(maxID uint32) []uintptr {
|
||||
// Constants for cache types
|
||||
const (
|
||||
noCache = 0
|
||||
dataCache = 1
|
||||
instructionCache = 2
|
||||
unifiedCache = 3
|
||||
)
|
||||
if maxID < 4 {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Iterate through CPUID leaf 4 (deterministic cache parameters)
|
||||
var caches []uintptr
|
||||
for i := uint32(0); i < 0xFFFF; i++ {
|
||||
eax, ebx, ecx, _ := cpuid(4, i)
|
||||
|
||||
cacheType := eax & 0xF // EAX bits 4-0: Cache Type
|
||||
if cacheType == 0 {
|
||||
break
|
||||
}
|
||||
|
||||
// Report only data caches.
|
||||
if !(cacheType == dataCache || cacheType == unifiedCache) {
|
||||
continue
|
||||
}
|
||||
|
||||
// Guaranteed to always start counting from 1.
|
||||
level := (eax >> 5) & 0x7
|
||||
|
||||
lineSize := extractBits(ebx, 0, 11) + 1 // Bits 11-0: Line size in bytes - 1
|
||||
partitions := extractBits(ebx, 12, 21) + 1 // Bits 21-12: Physical line partitions - 1
|
||||
ways := extractBits(ebx, 22, 31) + 1 // Bits 31-22: Ways of associativity - 1
|
||||
sets := uint64(ecx) + 1 // Number of sets - 1
|
||||
size := uint64(ways*partitions*lineSize) * sets // Calculate cache size in bytes
|
||||
|
||||
caches = append(caches, uintptr(size))
|
||||
|
||||
// If we see more than one cache described per level, or they appear
|
||||
// out of order, crash.
|
||||
//
|
||||
// Going by the SDM, it's not clear whether this is actually possible,
|
||||
// so this code is purely defensive.
|
||||
if level != uint32(len(caches)) {
|
||||
panic("expected levels to be in order and for there to be one data/unified cache per level")
|
||||
}
|
||||
}
|
||||
return caches
|
||||
}
|
||||
|
||||
func getDataCacheSizesAMD() []uintptr {
|
||||
maxExtendedFunctionInformation, _, _, _ := cpuid(0x80000000, 0)
|
||||
if maxExtendedFunctionInformation < 0x80000006 {
|
||||
return nil
|
||||
}
|
||||
|
||||
var caches []uintptr
|
||||
|
||||
_, _, ecx5, _ := cpuid(0x80000005, 0)
|
||||
_, _, ecx6, edx6 := cpuid(0x80000006, 0)
|
||||
|
||||
// The size is reported in KB; convert it to bytes.
|
||||
l1dSize := uintptr(extractBits(ecx5, 24, 31) << 10)
|
||||
caches = append(caches, l1dSize)
|
||||
|
||||
// Check that L2 cache is present.
|
||||
if l2Assoc := extractBits(ecx6, 12, 15); l2Assoc == 0 {
|
||||
return caches
|
||||
}
|
||||
l2Size := uintptr(extractBits(ecx6, 16, 31) << 10)
|
||||
caches = append(caches, l2Size)
|
||||
|
||||
// Check that L3 cache is present.
|
||||
if l3Assoc := extractBits(edx6, 12, 15); l3Assoc == 0 {
|
||||
return caches
|
||||
}
|
||||
// Specifies the L3 cache size is within the following range:
|
||||
// (L3Size[31:18] * 512KB) <= L3 cache size < ((L3Size[31:18]+1) * 512KB).
|
||||
l3Size := uintptr(extractBits(edx6, 18, 31) * (512 << 10))
|
||||
caches = append(caches, l3Size)
|
||||
|
||||
return caches
|
||||
}
|
||||
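As a sanity check on the leaf-4 formula in getDataCacheSizesIntel above, here is a worked example with illustrative register values (not taken from any particular CPU): 8 ways, 1 partition, 64-byte lines, and 64 sets yield a 32 KiB data cache.

ways, partitions, lineSize := uint32(8), uint32(1), uint32(64)
sets := uint64(64)
size := uint64(ways*partitions*lineSize) * sets // 8 * 1 * 64 * 64 = 32768 bytes (32 KiB)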
26 src/internal/cpu/datacache_x86_test.go Normal file
@@ -0,0 +1,26 @@
|
||||
// Copyright 2025 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:build 386 || amd64
|
||||
|
||||
package cpu_test
|
||||
|
||||
import (
|
||||
"internal/cpu"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// Tests fetching data cache sizes. This test only checks that DataCacheSizes
|
||||
// won't explode. Otherwise it's just informational, and dumps the current
|
||||
// data cache sizes.
|
||||
func TestDataCacheSizes(t *testing.T) {
|
||||
// N.B. Don't try to check these values because we don't know what
|
||||
// kind of environment we're running in. We don't want this test to
|
||||
// fail on some random x86 chip that happens to not support the right
|
||||
// CPUID bits for some reason.
|
||||
caches := cpu.DataCacheSizes()
|
||||
for i, size := range caches {
|
||||
t.Logf("L%d: %d", i+1, size)
|
||||
}
|
||||
}
|
||||
@@ -34,6 +34,9 @@ const (
|
||||
// It is also the size of the machine's native word size (that is, 4 on 32-bit systems, 8 on 64-bit).
|
||||
const PtrSize = 4 << (^uintptr(0) >> 63)
|
||||
|
||||
// PtrBits is the bit width of a pointer.
|
||||
const PtrBits = PtrSize * 8
|
||||
|
||||
// ArchFamily is the architecture family (AMD64, ARM, ...)
|
||||
const ArchFamily ArchFamilyType = _ArchFamily
|
||||
|
||||
|
||||
537 src/internal/runtime/gc/internal/gen/gen.go Normal file
@@ -0,0 +1,537 @@
|
||||
// Copyright 2025 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package gen
|
||||
|
||||
import (
|
||||
"container/heap"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"hash/maphash"
|
||||
"io"
|
||||
"log"
|
||||
"os"
|
||||
"reflect"
|
||||
"strings"
|
||||
)
|
||||
|
||||
const logCompile = true
|
||||
|
||||
func fatalf(f string, args ...any) {
|
||||
panic(fmt.Sprintf(f, args...))
|
||||
}
|
||||
|
||||
type File struct {
|
||||
w io.Writer
|
||||
funcs []*Func
|
||||
consts []fileConst
|
||||
}
|
||||
|
||||
func NewFile(w io.Writer) *File {
|
||||
return &File{w: w}
|
||||
}
|
||||
|
||||
func (f *File) AddFunc(fn *Func) {
|
||||
f.funcs = append(f.funcs, fn)
|
||||
}
|
||||
|
||||
type fileConst struct {
|
||||
name string
|
||||
data any
|
||||
}
|
||||
|
||||
func (f *File) AddConst(name string, data any) {
|
||||
// TODO: It would be nice if this were unified with "const" ops, but the
|
||||
// reason I added this was for []*Func consts, which would take an overhaul
|
||||
// to represent in "const" ops.
|
||||
f.consts = append(f.consts, fileConst{name, data})
|
||||
}
|
||||
|
||||
type Func struct {
|
||||
name string
|
||||
nArgs int
|
||||
idGen int
|
||||
ops []*op
|
||||
}
|
||||
|
||||
func NewFunc(name string) *Func {
|
||||
fn := &Func{name: name}
|
||||
return fn
|
||||
}
|
||||
|
||||
// attach adds x to fn's op list. If x has any unattached arguments, this adds
|
||||
// those first (recursively).
|
||||
func (fn *Func) attach(x *op) {
|
||||
// Make sure the arguments are attached to the function.
|
||||
for _, arg := range x.args {
|
||||
argFn := arg.fn
|
||||
if argFn == nil {
|
||||
fn.attach(arg)
|
||||
} else if argFn != fn {
|
||||
panic("ops from different functions")
|
||||
}
|
||||
}
|
||||
|
||||
x.fn = fn
|
||||
x.id = fn.idGen
|
||||
fn.idGen++
|
||||
fn.ops = append(fn.ops, x)
|
||||
}
|
||||
|
||||
func Arg[W wrap[T], T Word](fn *Func) T {
|
||||
loc := locReg{cls: regClassGP, reg: fn.nArgs}
|
||||
fn.nArgs++
|
||||
var x W
|
||||
o := &op{op: "arg", kind: x.kind(), c: loc}
|
||||
fn.attach(o)
|
||||
return x.wrap(o)
|
||||
}
|
||||
|
||||
func Return(results ...Value) {
|
||||
args := make([]*op, len(results))
|
||||
for i, res := range results {
|
||||
args[i] = res.getOp()
|
||||
}
|
||||
var x void
|
||||
x.initOp(&op{op: "return", kind: voidKind, args: args})
|
||||
}
|
||||
|
||||
type op struct {
|
||||
op string
|
||||
kind *kind
|
||||
args []*op
|
||||
|
||||
id int
|
||||
fn *Func
|
||||
|
||||
// c depends on "op".
|
||||
//
|
||||
// arg locReg - The register containing the argument value
|
||||
// const any - The constant value
|
||||
// deref int - Byte offset from args[0]
|
||||
c any
|
||||
name string
|
||||
}
|
||||
|
||||
func (o *op) String() string {
|
||||
return fmt.Sprintf("v%02d", o.id)
|
||||
}
|
||||
|
||||
func imm(val any) *op {
|
||||
return &op{op: "imm", c: val}
|
||||
}
|
||||
|
||||
func (o *op) equalNoName(o2 *op) bool {
|
||||
if o.op != o2.op || o.c != o2.c || len(o.args) != len(o2.args) {
|
||||
return false
|
||||
}
|
||||
for i, arg := range o.args {
|
||||
if o2.args[i] != arg {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func (o *op) write(w io.Writer) {
|
||||
fmt.Fprintf(w, "v%02d = %s", o.id, o.op)
|
||||
for _, arg := range o.args {
|
||||
fmt.Fprintf(w, " v%02d", arg.id)
|
||||
}
|
||||
if o.c != nil {
|
||||
fmt.Fprintf(w, " %v", o.c)
|
||||
}
|
||||
if o.name != "" {
|
||||
fmt.Fprintf(w, " %q", o.name)
|
||||
}
|
||||
if o.kind != nil {
|
||||
fmt.Fprintf(w, " [%s]", o.kind.typ)
|
||||
}
|
||||
fmt.Fprintf(w, "\n")
|
||||
}
|
||||
|
||||
func (fn *Func) write(w io.Writer) {
|
||||
fmt.Fprintf(w, "FUNC %s\n", fn.name)
|
||||
for _, op := range fn.ops {
|
||||
op.write(w)
|
||||
}
|
||||
}
|
||||
|
||||
func (f *File) Compile() {
|
||||
// TODO: CSE constants across the whole file
|
||||
|
||||
fmt.Fprintf(f.w, `#include "go_asm.h"
|
||||
#include "textflag.h"
|
||||
|
||||
`)
|
||||
|
||||
for _, c := range f.consts {
|
||||
f.emitConst(c.name, c.data)
|
||||
}
|
||||
|
||||
trace := func(fn *Func, step string) {
|
||||
if !logCompile {
|
||||
return
|
||||
}
|
||||
log.Printf("## Compiling %s: %s", fn.name, step)
|
||||
fn.write(os.Stderr)
|
||||
}
|
||||
|
||||
for _, fn := range f.funcs {
|
||||
trace(fn, "initial")
|
||||
|
||||
for {
|
||||
if fn.cse() {
|
||||
trace(fn, "post cse")
|
||||
continue
|
||||
}
|
||||
if fn.deadcode() {
|
||||
trace(fn, "post deadcode")
|
||||
continue
|
||||
}
|
||||
break
|
||||
}
|
||||
fn.addLoads()
|
||||
trace(fn, "post addLoads")
|
||||
|
||||
// Assigning locations requires ops to be in dependency order.
|
||||
fn.schedule()
|
||||
trace(fn, "post schedule")
|
||||
|
||||
locs := fn.assignLocs()
|
||||
|
||||
fn.emit(f, locs)
|
||||
}
|
||||
}
|
||||
|
||||
// cse performs common subexpression elimination.
|
||||
func (fn *Func) cse() bool {
|
||||
// Compute structural hashes
|
||||
hashes := make(map[*op]uint64)
|
||||
var h maphash.Hash
|
||||
var bbuf [8]byte
|
||||
for _, op := range fn.ops {
|
||||
// We ignore the name for canonicalization.
|
||||
h.Reset()
|
||||
h.WriteString(op.op)
|
||||
// TODO: Ideally we would hash o1.c, but we don't have a good way to do that.
|
||||
for _, arg := range op.args {
|
||||
if _, ok := hashes[arg]; !ok {
|
||||
panic("ops not in dependency order")
|
||||
}
|
||||
binary.NativeEndian.PutUint64(bbuf[:], hashes[arg])
|
||||
h.Write(bbuf[:])
|
||||
}
|
||||
hashes[op] = h.Sum64()
|
||||
}
|
||||
|
||||
canon := make(map[uint64][]*op)
|
||||
lookup := func(o *op) *op {
|
||||
hash := hashes[o]
|
||||
for _, o2 := range canon[hash] {
|
||||
if o.equalNoName(o2) {
|
||||
return o2
|
||||
}
|
||||
}
|
||||
canon[hash] = append(canon[hash], o)
|
||||
return o
|
||||
}
|
||||
|
||||
// Canonicalize ops.
|
||||
dirty := false
|
||||
for _, op := range fn.ops {
|
||||
for i, arg := range op.args {
|
||||
newArg := lookup(arg)
|
||||
if arg != newArg {
|
||||
dirty = true
|
||||
op.args[i] = newArg
|
||||
}
|
||||
}
|
||||
}
|
||||
return dirty
|
||||
}
|
||||
|
||||
// deadcode eliminates unused ops.
|
||||
func (fn *Func) deadcode() bool {
|
||||
marks := make(map[*op]bool)
|
||||
var mark func(o *op)
|
||||
mark = func(o *op) {
|
||||
if marks[o] {
|
||||
return
|
||||
}
|
||||
marks[o] = true
|
||||
for _, arg := range o.args {
|
||||
mark(arg)
|
||||
}
|
||||
}
|
||||
// Mark operations that have a side-effect.
|
||||
for _, op := range fn.ops {
|
||||
switch op.op {
|
||||
case "return":
|
||||
mark(op)
|
||||
}
|
||||
}
|
||||
// Discard unmarked operations
|
||||
if len(marks) == len(fn.ops) {
|
||||
return false
|
||||
}
|
||||
newOps := make([]*op, 0, len(marks))
|
||||
for _, op := range fn.ops {
|
||||
if marks[op] {
|
||||
newOps = append(newOps, op)
|
||||
}
|
||||
}
|
||||
fn.ops = newOps
|
||||
return true
|
||||
}
|
||||
|
||||
// canMem is a map from operation to a bitmap of which arguments can use a
|
||||
// direct memory reference.
|
||||
var canMem = map[string]uint64{
|
||||
"VPERMB": 1 << 0,
|
||||
"VPERMI2B": 1 << 0,
|
||||
"VPERMT2B": 1 << 0,
|
||||
"VGF2P8AFFINEQB": 1 << 0,
|
||||
"VPORQ": 1 << 0,
|
||||
"VPSUBQ": 1 << 0,
|
||||
"VPSHUFBITQMB": 1 << 0,
|
||||
}
|
||||
|
||||
// addLoads inserts load ops for ops that can't take memory inputs directly.
|
||||
func (fn *Func) addLoads() {
|
||||
// A lot of operations can directly take memory locations. If there's only a
|
||||
// single reference to a deref operation, and the operation can do the deref
|
||||
// itself, eliminate the deref. If there's more than one reference, then we
|
||||
// leave the load so we can share the value in the register.
|
||||
nRefs := fn.opRefs()
|
||||
loads := make(map[*op]*op) // deref -> load
|
||||
for _, o := range fn.ops {
|
||||
canMask := canMem[o.op]
|
||||
for i, arg := range o.args {
|
||||
// TODO: Many AVX-512 operations that support memory operands also
|
||||
// support a ".BCST" suffix that performs a broadcasting memory
|
||||
// load. If the const can be broadcast and all uses support
|
||||
// broadcast load, it would be nice to use .BCST. I'm not sure if
|
||||
// that belongs in this pass or a different one.
|
||||
if arg.op == "deref" || arg.op == "const" {
|
||||
// These produce memory locations.
|
||||
if canMask&(1<<i) == 0 || nRefs[arg] > 1 {
|
||||
// This argument needs to be loaded into a register.
|
||||
load, ok := loads[arg]
|
||||
if !ok {
|
||||
load = makeLoad(arg)
|
||||
fn.attach(load)
|
||||
loads[arg] = load
|
||||
}
|
||||
o.args[i] = load
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (fn *Func) opRefs() map[*op]int {
|
||||
refs := make(map[*op]int)
|
||||
for _, o1 := range fn.ops {
|
||||
for _, arg := range o1.args {
|
||||
refs[arg]++
|
||||
}
|
||||
}
|
||||
return refs
|
||||
}
|
||||
|
||||
func makeLoad(deref *op) *op {
|
||||
var inst string
|
||||
switch deref.kind.reg {
|
||||
default:
|
||||
fatalf("don't know how to load %v", deref.kind.reg)
|
||||
case regClassGP:
|
||||
inst = "MOVQ"
|
||||
case regClassZ:
|
||||
inst = "VMOVDQU64"
|
||||
}
|
||||
// The load references deref rather than deref.args[0] because when we
|
||||
// assign locations, the deref op gets the memory location to load from,
|
||||
// while its argument has some other location (like a register). Also, the
|
||||
// offset to deref is attached to the deref op.
|
||||
return &op{op: inst, kind: deref.kind, args: []*op{deref}}
|
||||
}
|
||||
|
||||
type opHeap []*op
|
||||
|
||||
func (h opHeap) Len() int { return len(h) }
|
||||
func (h opHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
|
||||
func (h opHeap) Less(i, j int) bool {
|
||||
priority := func(o *op) int {
|
||||
if o.op == "deref" || o.op == "const" {
|
||||
// Input to memory load
|
||||
return 1
|
||||
}
|
||||
if len(o.args) > 0 && (o.args[0].op == "deref" || o.args[0].op == "const") {
|
||||
// Memory load
|
||||
return 2
|
||||
}
|
||||
return 100
|
||||
}
|
||||
if p1, p2 := priority(h[i]), priority(h[j]); p1 != p2 {
|
||||
return p1 < p2
|
||||
}
|
||||
return h[i].id < h[j].id
|
||||
}
|
||||
|
||||
func (h *opHeap) Push(x any) {
|
||||
*h = append(*h, x.(*op))
|
||||
}
|
||||
|
||||
func (h *opHeap) Pop() any {
|
||||
old := *h
|
||||
n := len(old)
|
||||
x := old[n-1]
|
||||
*h = old[0 : n-1]
|
||||
return x
|
||||
}
|
||||
|
||||
// schedule ensures fn's ops are in dependency order.
|
||||
func (fn *Func) schedule() {
|
||||
// TODO: This tends to generate a huge amount of register pressure, mostly
|
||||
// because it floats loads as early as possible and partly because it has no
|
||||
// concept of rematerialization and CSE can make rematerializable values
|
||||
// live for a very long time. In some sense it doesn't matter because we
|
||||
// don't run out of registers for anything we need.
|
||||
|
||||
missing := make(map[*op]int)
|
||||
uses := make(map[*op][]*op)
|
||||
var h opHeap
|
||||
for _, op := range fn.ops {
|
||||
if len(op.args) == 0 {
|
||||
h = append(h, op)
|
||||
} else {
|
||||
missing[op] = len(op.args)
|
||||
}
|
||||
for _, arg := range op.args {
|
||||
uses[arg] = append(uses[arg], op)
|
||||
}
|
||||
}
|
||||
heap.Init(&h)
|
||||
|
||||
newOps := make([]*op, 0, len(fn.ops))
|
||||
for len(h) > 0 {
|
||||
if false {
|
||||
log.Printf("schedule: %s", h)
|
||||
}
|
||||
top := h[0]
|
||||
newOps = append(newOps, top)
|
||||
heap.Pop(&h)
|
||||
for _, o := range uses[top] {
|
||||
missing[o]--
|
||||
if missing[o] == 0 {
|
||||
heap.Push(&h, o)
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(newOps) != len(fn.ops) {
|
||||
log.Print("schedule didn't schedule all ops")
|
||||
log.Print("before:")
|
||||
fn.write(os.Stderr)
|
||||
fn.ops = newOps
|
||||
log.Print("after:")
|
||||
fn.write(os.Stderr)
|
||||
log.Fatal("bad schedule")
|
||||
}
|
||||
|
||||
fn.ops = newOps
|
||||
}
|
||||
|
||||
func (fn *Func) emit(f *File, locs map[*op]loc) {
|
||||
w := f.w
|
||||
|
||||
// Emit constants first
|
||||
for _, o := range fn.ops {
|
||||
if o.op == "const" {
|
||||
name := locs[o].(locMem).name
|
||||
f.emitConst(name, o.c)
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Fprintf(w, "TEXT %s(SB), NOSPLIT, $0-0\n", fn.name)
|
||||
|
||||
// Emit body
|
||||
for _, o := range fn.ops {
|
||||
switch o.op {
|
||||
case "const", "arg", "return", "deref", "imm":
|
||||
// Does not produce code
|
||||
continue
|
||||
}
|
||||
switch o.op {
|
||||
case "addConst":
|
||||
fatalf("addConst not lowered")
|
||||
}
|
||||
|
||||
opName := o.op
|
||||
// A ".mask" suffix is used to distinguish AVX-512 ops that use the same
|
||||
// mnemonic for regular and masked mode.
|
||||
opName = strings.TrimSuffix(opName, ".mask")
|
||||
|
||||
fmt.Fprintf(w, "\t%s", opName)
|
||||
if o.op == "VGF2P8AFFINEQB" {
|
||||
// Hidden immediate, but always 0
|
||||
//
|
||||
// TODO: Replace this with an imm input.
|
||||
fmt.Fprintf(w, " $0,")
|
||||
}
|
||||
for i, arg := range o.args {
|
||||
if i == 0 {
|
||||
fmt.Fprintf(w, " ")
|
||||
} else {
|
||||
fmt.Fprintf(w, ", ")
|
||||
}
|
||||
if arg.op == "imm" {
|
||||
fmt.Fprintf(w, "$0x%x", arg.c)
|
||||
} else {
|
||||
fmt.Fprint(w, locs[arg].LocString())
|
||||
}
|
||||
}
|
||||
if _, ok := opRMW[o.op]; ok {
|
||||
// Read-modify-write instructions, so the output is already in the
|
||||
// arguments above.
|
||||
} else {
|
||||
fmt.Fprintf(w, ", %s", locs[o].LocString())
|
||||
}
|
||||
fmt.Fprintf(w, "\n")
|
||||
}
|
||||
fmt.Fprintf(w, "\tRET\n")
|
||||
fmt.Fprintf(w, "\n")
|
||||
}
|
||||
|
||||
func (f *File) emitConst(name string, data any) {
|
||||
switch data := data.(type) {
|
||||
case []*Func:
|
||||
fmt.Fprintf(f.w, "GLOBL %s(SB), RODATA, $%#x\n", name, len(data)*8)
|
||||
for i, fn := range data {
|
||||
fmt.Fprintf(f.w, "DATA %s+%#02x(SB)/8, ", name, 8*i)
|
||||
if fn == nil {
|
||||
fmt.Fprintf(f.w, "$0\n")
|
||||
} else {
|
||||
fmt.Fprintf(f.w, "$%s(SB)\n", fn.name)
|
||||
}
|
||||
}
|
||||
fmt.Fprintf(f.w, "\n")
|
||||
return
|
||||
}
|
||||
|
||||
// Assume it's a numeric slice or array
|
||||
rv := reflect.ValueOf(data)
|
||||
sz := int(rv.Type().Elem().Size())
|
||||
fmt.Fprintf(f.w, "GLOBL %s(SB), RODATA, $%#x\n", name, rv.Len()*sz)
|
||||
for wi := 0; wi < sz*rv.Len()/8; wi++ { // Iterate over words
|
||||
var word uint64
|
||||
for j := 0; j < 8/sz; j++ { // Iterate over elements in this word
|
||||
d := rv.Index(wi*8/sz + j).Uint()
|
||||
word |= d << (j * sz * 8)
|
||||
}
|
||||
fmt.Fprintf(f.w, "DATA %s+%#02x(SB)/8, $%#016x\n", name, 8*wi, word)
|
||||
}
|
||||
|
||||
fmt.Fprintf(f.w, "\n")
|
||||
}
|
||||
26 src/internal/runtime/gc/internal/gen/gp.go Normal file
@@ -0,0 +1,26 @@
|
||||
// Copyright 2025 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package gen
|
||||
|
||||
type Uint64 struct {
|
||||
valGP
|
||||
}
|
||||
|
||||
var kindUint64 = &kind{typ: "Uint64", reg: regClassGP}
|
||||
|
||||
func ConstUint64(c uint64, name string) (y Uint64) {
|
||||
y.initOp(&op{op: "const", kind: y.kind(), c: c, name: name})
|
||||
return y
|
||||
}
|
||||
|
||||
func (Uint64) kind() *kind {
|
||||
return kindUint64
|
||||
}
|
||||
|
||||
func (Uint64) wrap(x *op) Uint64 {
|
||||
var y Uint64
|
||||
y.initOp(x)
|
||||
return y
|
||||
}
|
||||
338 src/internal/runtime/gc/internal/gen/regalloc.go Normal file
@@ -0,0 +1,338 @@
|
||||
// Copyright 2025 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package gen
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
"math/bits"
|
||||
"strings"
|
||||
)
|
||||
|
||||
const traceRegAlloc = true
|
||||
|
||||
type regClass uint8
|
||||
|
||||
const (
|
||||
regClassFixed regClass = iota
|
||||
regClassGP
|
||||
regClassZ
|
||||
regClassK
|
||||
|
||||
numRegClasses
|
||||
|
||||
regClassNone = ^regClass(0)
|
||||
)
|
||||
|
||||
type locReg struct {
|
||||
cls regClass
|
||||
reg int
|
||||
}
|
||||
|
||||
func (l locReg) LocString() string {
|
||||
switch l.cls {
|
||||
case regClassFixed:
|
||||
return fixedRegs[l.reg]
|
||||
case regClassGP:
|
||||
return gpRegs[l.reg]
|
||||
case regClassZ:
|
||||
return fmt.Sprintf("Z%d", l.reg)
|
||||
case regClassK:
|
||||
return fmt.Sprintf("K%d", l.reg)
|
||||
}
|
||||
panic("bad register class")
|
||||
}
|
||||
|
||||
func (l locReg) Deref(off int) (loc, error) {
|
||||
return locMem{l, off, ""}, nil
|
||||
}
|
||||
|
||||
func (l locReg) Reg() (locReg, bool) {
|
||||
return l, true
|
||||
}
|
||||
|
||||
type locMem struct {
|
||||
base locReg
|
||||
off int
|
||||
name string
|
||||
}
|
||||
|
||||
func (l locMem) LocString() string {
|
||||
if l.base.cls == regClassFixed && l.base.reg == regSB && l.off == 0 {
|
||||
return l.name + "(SB)"
|
||||
}
|
||||
if l.name != "" {
|
||||
return fmt.Sprintf("%s+%d(%s)", l.name, l.off, l.base.LocString())
|
||||
}
|
||||
if l.off != 0 {
|
||||
return fmt.Sprintf("%d(%s)", l.off, l.base.LocString())
|
||||
}
|
||||
return "(" + l.base.LocString() + ")"
|
||||
}
|
||||
|
||||
func (l locMem) Deref(off int) (loc, error) {
|
||||
return nil, fmt.Errorf("cannot dereference already memory address %s", l.LocString())
|
||||
}
|
||||
|
||||
func (l locMem) Reg() (locReg, bool) {
|
||||
if l.base.cls == regClassFixed {
|
||||
return locReg{}, false
|
||||
}
|
||||
return l.base, true
|
||||
}
|
||||
|
||||
type loc interface {
|
||||
LocString() string // Return the assembly syntax for this location
|
||||
Deref(off int) (loc, error) // Treat this location as an address and return a location with the contents of memory at that address
|
||||
Reg() (locReg, bool) // Register used by this location
|
||||
}
|
||||
|
||||
var opRMW = map[string]int{
|
||||
"VPERMI2B": 2, // Overwrites third argument
|
||||
"VPERMI2B.Z": 3, // Overwrites fourth argument
|
||||
"VPERMI2B.mask": 3, // Overwrites fourth argument
|
||||
"VPERMT2B": 1, // Overwrites second argument TODO: Check this. Unused for now.
|
||||
"VPBROADCASTQ.mask": 2, // Overwrites last argument
|
||||
}
|
||||
|
||||
// TODO: Should we have a general rule that all ".mask" instructions overwrite
|
||||
// their last argument?
|
||||
|
||||
const (
|
||||
regSB = iota
|
||||
regFP
|
||||
)
|
||||
|
||||
var fixedRegs = []string{regSB: "SB", regFP: "FP"}
|
||||
var gpRegs = []string{"AX", "BX", "CX", "DI", "SI", "R8", "R9", "R10", "R11"} // ABI argument order
|
||||
|
||||
type regSet struct {
|
||||
inUse [numRegClasses]uint32
|
||||
}
|
||||
|
||||
func (s *regSet) used(o *op, l loc) {
|
||||
if l == nil {
|
||||
return
|
||||
}
|
||||
reg, ok := l.Reg()
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
if traceRegAlloc {
|
||||
log.Printf(" alloc %s @ v%02d", reg.LocString(), o.id)
|
||||
}
|
||||
if s.inUse[reg.cls]&(1<<reg.reg) != 0 {
|
||||
fatalf("register %s already used", reg.LocString())
|
||||
}
|
||||
s.inUse[reg.cls] |= 1 << reg.reg
|
||||
}
|
||||
|
||||
func (s *regSet) free(l loc) {
|
||||
if l == nil {
|
||||
return
|
||||
}
|
||||
reg, ok := l.Reg()
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
if traceRegAlloc {
|
||||
log.Printf(" free %s", reg.LocString())
|
||||
}
|
||||
if s.inUse[reg.cls]&(1<<reg.reg) == 0 {
|
||||
fatalf("register %s is not in use", reg.LocString())
|
||||
}
|
||||
s.inUse[reg.cls] &^= 1 << reg.reg
|
||||
}
|
||||
|
||||
func (fn *Func) assignLocs() map[*op]loc {
|
||||
// Remove static indicator on name, if any. We'll add it back.
|
||||
nameBase := strings.TrimSuffix(fn.name, "<>")
|
||||
|
||||
// Create map from op -> fn.ops index
|
||||
opIndexes := make(map[*op]int, len(fn.ops))
|
||||
for i, o := range fn.ops {
|
||||
opIndexes[o] = i
|
||||
}
|
||||
|
||||
// Read-modify-write operations share a location with one of their inputs.
|
||||
// Likewise, deref ops extend the lifetime of their input (but in a shared
|
||||
// way, unlike RMW ops).
|
||||
//
|
||||
// Compute a map from each op to the earliest "canonical" op whose live
|
||||
// range we'll use.
|
||||
canon := make(map[*op]*op)
|
||||
overwritten := make(map[*op]bool)
|
||||
for _, o := range fn.ops {
|
||||
// Check that this op doesn't use any overwritten inputs.
|
||||
for _, arg := range o.args {
|
||||
if overwritten[arg] {
|
||||
// TODO: The solution to this is to insert copy ops.
|
||||
fatalf("op %+v uses overwritten input %+v", o, arg)
|
||||
}
|
||||
}
|
||||
|
||||
// Record canonical op.
|
||||
rmw, ok := opRMW[o.op]
|
||||
if ok {
|
||||
canon[o] = canon[o.args[rmw]]
|
||||
// Record that the input is dead now and must not be referenced.
|
||||
overwritten[o.args[rmw]] = true
|
||||
} else if o.op == "deref" {
|
||||
canon[o] = canon[o.args[0]]
|
||||
} else {
|
||||
canon[o] = o
|
||||
}
|
||||
}
|
||||
|
||||
// Compute live ranges of each canonical op.
|
||||
//
|
||||
// First, find the last use of each op.
|
||||
lastUses := make(map[*op]*op) // Canonical creation op -> last use op
|
||||
for _, op := range fn.ops {
|
||||
for _, arg := range op.args {
|
||||
lastUses[canon[arg]] = op
|
||||
}
|
||||
}
|
||||
// Invert the last uses map to get a map from op to the (canonical) values
|
||||
// that die at that op.
|
||||
lastUseMap := make(map[*op][]*op) // op of last use -> (canonical) creation ops
|
||||
for def, lastUse := range lastUses {
|
||||
lastUseMap[lastUse] = append(lastUseMap[lastUse], def)
|
||||
}
|
||||
|
||||
// Prepare for assignments
|
||||
regUsed := make([]regSet, len(fn.ops)) // In-use registers at each op
|
||||
for i := range regUsed {
|
||||
// X15/Y15/Z15 is reserved by the Go ABI
|
||||
regUsed[i].inUse[regClassZ] |= 1 << 15
|
||||
// K0 is contextual (if used as an opmask, it means no mask). Too
|
||||
// complicated, so just ignore it.
|
||||
regUsed[i].inUse[regClassK] |= 1 << 0
|
||||
}
|
||||
locs := make(map[*op]loc)
|
||||
assign := func(o *op, l loc) {
|
||||
if have, ok := locs[o]; ok {
|
||||
fatalf("op %+v already assigned location %v (new %v)", o, have, l)
|
||||
return
|
||||
}
|
||||
if o == canon[o] {
|
||||
// Mark this location used over o's live range
|
||||
for i := opIndexes[o]; i < opIndexes[lastUses[o]]; i++ {
|
||||
regUsed[i].used(fn.ops[i], l)
|
||||
}
|
||||
}
|
||||
locs[o] = l
|
||||
}
|
||||
|
||||
// Assign fixed locations
|
||||
id := 0
|
||||
for _, o := range fn.ops {
|
||||
switch o.op {
|
||||
case "arg":
|
||||
if traceRegAlloc {
|
||||
log.Printf("fixed op %+v", o)
|
||||
}
|
||||
assign(o, o.c.(locReg))
|
||||
case "const":
|
||||
if traceRegAlloc {
|
||||
log.Printf("fixed op %+v", o)
|
||||
}
|
||||
name := o.name
|
||||
if name == "" {
|
||||
name = fmt.Sprintf("%s_%d<>", nameBase, id)
|
||||
id++
|
||||
} else if name[0] == '*' {
|
||||
name = nameBase + name[1:]
|
||||
}
|
||||
assign(o, locMem{locReg{cls: regClassFixed, reg: regSB}, 0, name})
|
||||
case "return":
|
||||
if traceRegAlloc {
|
||||
log.Printf("fixed op %+v", o)
|
||||
}
|
||||
assign(o, nil) // no location
|
||||
// TODO: argZ should start at 0.
|
||||
argGP, argZ := 0, 1
|
||||
for _, arg := range o.args {
|
||||
switch arg.kind.reg {
|
||||
default:
|
||||
fatalf("bad register class for return value")
|
||||
case regClassGP:
|
||||
assign(canon[arg], locReg{regClassGP, argGP})
|
||||
argGP++
|
||||
case regClassZ:
|
||||
assign(canon[arg], locReg{regClassZ, argZ})
|
||||
argZ++
|
||||
}
|
||||
}
|
||||
case "imm":
|
||||
assign(o, nil) // no location
|
||||
}
|
||||
}
|
||||
|
||||
// Assign locations.
|
||||
for _, o := range fn.ops {
|
||||
if traceRegAlloc {
|
||||
log.Printf("assign %+v", o)
|
||||
}
|
||||
|
||||
if _, ok := locs[o]; ok {
|
||||
// Already assigned a fixed location above.
|
||||
continue
|
||||
}
|
||||
|
||||
if o.op == "deref" {
|
||||
loc, err := locs[o.args[0]].Deref(o.c.(int))
|
||||
if err != nil {
|
||||
fatalf("%v", err)
|
||||
}
|
||||
// We don't "assign" here because we've already processed the
|
||||
// canonical op, which marked loc's register as in-use for the whole
|
||||
// live range.
|
||||
locs[o] = loc
|
||||
continue
|
||||
}
|
||||
|
||||
if canon[o] != o {
|
||||
// Copy the canonical op's location.
|
||||
locs[o] = locs[canon[o]]
|
||||
continue
|
||||
}
|
||||
// Below here we know that o is already a canonical op.
|
||||
|
||||
if _, ok := opRMW[o.op]; ok {
|
||||
fatalf("read-modify-write op not canonicalized")
|
||||
}
|
||||
|
||||
// Find a free register of the right class.
|
||||
cls := o.kind.reg
|
||||
var used uint32
|
||||
for i := opIndexes[o]; i < opIndexes[lastUses[o]]; i++ {
|
||||
used |= regUsed[i].inUse[cls]
|
||||
}
|
||||
|
||||
// Assign result location.
|
||||
num := bits.TrailingZeros32(^used)
|
||||
switch cls {
|
||||
default:
|
||||
fatalf("unknown reg class %v", cls)
|
||||
case regClassGP:
|
||||
if num >= len(gpRegs) {
|
||||
panic("out of GP regs")
|
||||
}
|
||||
case regClassZ:
|
||||
if num >= 32 {
|
||||
panic("out of Z regs")
|
||||
}
|
||||
case regClassK:
|
||||
if num >= 8 {
|
||||
panic("out of K regs")
|
||||
}
|
||||
}
|
||||
loc := locReg{cls, num}
|
||||
assign(o, loc)
|
||||
}
|
||||
|
||||
return locs
|
||||
}
|
||||
246 src/internal/runtime/gc/internal/gen/simd.go Normal file
@@ -0,0 +1,246 @@
|
||||
// Copyright 2025 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package gen
|
||||
|
||||
type Uint8x64 struct {
|
||||
valAny
|
||||
}
|
||||
|
||||
var kindUint8x64 = &kind{typ: "Uint8x64", reg: regClassZ}
|
||||
|
||||
func ConstUint8x64(c [64]uint8, name string) (y Uint8x64) {
|
||||
y.initOp(&op{op: "const", kind: y.kind(), c: c, name: name})
|
||||
return y
|
||||
}
|
||||
|
||||
func (Uint8x64) kind() *kind {
|
||||
return kindUint8x64
|
||||
}
|
||||
|
||||
func (Uint8x64) wrap(x *op) Uint8x64 {
|
||||
var y Uint8x64
|
||||
y.initOp(x)
|
||||
return y
|
||||
}
|
||||
|
||||
func (x Uint8x64) ToUint64x8() (z Uint64x8) {
|
||||
z.op = x.op
|
||||
return z
|
||||
}
|
||||
|
||||
func (x Uint8x64) Shuffle(shuf Uint8x64) (y Uint8x64) {
|
||||
if shuf.op.op == "const" {
|
||||
// TODO: There are often patterns we can take advantage of here. Sometimes
|
||||
// we can do a broadcast. Sometimes we can at least do a quadword
|
||||
// permutation instead of a full byte permutation.
|
||||
|
||||
// Range check the shuffle
|
||||
for i, inp := range shuf.op.c.([64]uint8) {
|
||||
// 0xff is a special "don't care" value
|
||||
if !(inp == 0xff || inp < 64) {
|
||||
fatalf("shuffle[%d] = %d out of range [0, %d) or 0xff", i, inp, 64)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
args := []*op{x.op, shuf.op}
|
||||
y.initOp(&op{op: "VPERMB", kind: y.kind(), args: args})
|
||||
return y
|
||||
}
|
||||
|
||||
func (x Uint8x64) ShuffleZeroed(shuf Uint8x64, mask Mask64) (y Uint8x64) {
|
||||
args := []*op{x.op, shuf.op, mask.op}
|
||||
y.initOp(&op{op: "VPERMB.Z", kind: y.kind(), args: args})
|
||||
return y
|
||||
}
|
||||
|
||||
func (x Uint8x64) ShuffleMasked(shuf Uint8x64, mask Mask64) (y Uint8x64) {
|
||||
args := []*op{x.op, shuf.op, mask.op}
|
||||
y.initOp(&op{op: "VPERMB.mask", kind: y.kind(), args: args})
|
||||
return y
|
||||
}
|
||||
|
||||
// TODO: The two-argument shuffle is a little weird. You almost want the
|
||||
// receiver to be the shuffle and the two arguments to be the two inputs, but
|
||||
// that's almost certainly *not* what you want for the single input shuffle.
|
||||
|
||||
func (x Uint8x64) Shuffle2(y Uint8x64, shuf Uint8x64) (z Uint8x64) {
|
||||
// Confusingly, the inputs are in the opposite order from what you'd expect.
|
||||
args := []*op{y.op, x.op, shuf.op}
|
||||
z.initOp(&op{op: "VPERMI2B", kind: z.kind(), args: args})
|
||||
return z
|
||||
}
|
||||
|
||||
func (x Uint8x64) Shuffle2Zeroed(y Uint8x64, shuf Uint8x64, mask Mask64) (z Uint8x64) {
|
||||
// Confusingly, the inputs are in the opposite order from what you'd expect.
|
||||
args := []*op{y.op, x.op, mask.op, shuf.op}
|
||||
z.initOp(&op{op: "VPERMI2B.Z", kind: z.kind(), args: args})
|
||||
return z
|
||||
}
|
||||
|
||||
func (x Uint8x64) Shuffle2Masked(y Uint8x64, shuf Uint8x64, mask Mask64) (z Uint8x64) {
|
||||
// Confusingly, the inputs are in the opposite order from what you'd expect.
|
||||
args := []*op{y.op, x.op, mask.op, shuf.op}
|
||||
z.initOp(&op{op: "VPERMI2B.mask", kind: z.kind(), args: args})
|
||||
return z
|
||||
}
|
||||
|
||||
type Uint64x8 struct {
|
||||
valAny
|
||||
}
|
||||
|
||||
var kindUint64x8 = &kind{typ: "Uint64x8", reg: regClassZ}
|
||||
|
||||
func ConstUint64x8(c [8]uint64, name string) (y Uint64x8) {
|
||||
// TODO: Sometimes these can be optimized into broadcast loads.
|
||||
y.initOp(&op{op: "const", kind: y.kind(), c: c, name: name})
|
||||
return y
|
||||
}
|
||||
|
||||
func BroadcastUint64x8Zeroed(src Uint64, mask Mask8) (z Uint64x8) {
|
||||
z.initOp(&op{op: "VPBROADCASTQ.Z", kind: z.kind(), args: []*op{src.op, mask.op}})
|
||||
return z
|
||||
}
|
||||
|
||||
func (x Uint64x8) BroadcastMasked(src Uint64, mask Mask8) (z Uint64x8) {
|
||||
z.initOp(&op{op: "VPBROADCASTQ.mask", kind: z.kind(), args: []*op{src.op, mask.op, x.op}})
|
||||
return z
|
||||
}
|
||||
|
||||
func (Uint64x8) kind() *kind {
|
||||
return kindUint64x8
|
||||
}
|
||||
|
||||
func (Uint64x8) wrap(x *op) Uint64x8 {
|
||||
var y Uint64x8
|
||||
y.initOp(x)
|
||||
return y
|
||||
}
|
||||
|
||||
func (x Uint64x8) Or(y Uint64x8) (z Uint64x8) {
|
||||
z.initOp(&op{op: "VPORQ", kind: z.kind(), args: []*op{y.op, x.op}})
|
||||
return z
|
||||
}
|
||||
|
||||
func (x Uint64x8) Sub(y Uint64x8) (z Uint64x8) {
|
||||
// Arguments are backwards
|
||||
z.initOp(&op{op: "VPSUBQ", kind: z.kind(), args: []*op{y.op, x.op}})
|
||||
return z
|
||||
}
|
||||
|
||||
func (x Uint64x8) ToUint8x64() (z Uint8x64) {
|
||||
z.op = x.op
|
||||
return z
|
||||
}
|
||||
|
||||
func (x Uint64x8) GF2P8Affine(y Uint8x64) (z Uint8x64) {
|
||||
// matrix, vector
|
||||
z.initOp(&op{op: "VGF2P8AFFINEQB", kind: z.kind(), args: []*op{x.op, y.op}})
|
||||
return z
|
||||
}
|
||||
|
||||
func (x Uint64x8) ShuffleBits(y Uint8x64) (z Mask64) {
|
||||
z.initOp(&op{op: "VPSHUFBITQMB", kind: z.kind(), args: []*op{y.op, x.op}})
|
||||
return z
|
||||
}
|
||||
|
||||
func (x Uint64x8) ShuffleBitsMasked(y Uint8x64, mask Mask64) (z Mask64) {
|
||||
// This is always zeroing if the mask is provided.
|
||||
z.initOp(&op{op: "VPSHUFBITQMB", kind: z.kind(), args: []*op{y.op, x.op, mask.op}})
|
||||
return z
|
||||
}
|
||||
|
||||
type Mask8 struct {
|
||||
valAny
|
||||
}
|
||||
|
||||
var kindMask8 = &kind{typ: "Mask8", reg: regClassK}
|
||||
|
||||
func ConstMask8(c uint8) (y Mask8) {
|
||||
var tmp Uint64
|
||||
tmp.initOp(&op{op: "MOVQ", kind: tmp.kind(), args: []*op{imm(c)}})
|
||||
y.initOp(&op{op: "KMOVB", kind: y.kind(), args: []*op{tmp.op}})
|
||||
return y
|
||||
}
|
||||
|
||||
func (Mask8) kind() *kind {
|
||||
return kindMask8
|
||||
}
|
||||
|
||||
func (Mask8) wrap(x *op) Mask8 {
|
||||
var y Mask8
|
||||
y.initOp(x)
|
||||
return y
|
||||
}
|
||||
|
||||
func (x Mask8) ToUint8() (z Uint64) {
|
||||
z.initOp(&op{op: "KMOVB", kind: z.kind(), args: []*op{x.op}})
|
||||
return z
|
||||
}
|
||||
|
||||
func (x Mask8) Or(y Mask8) (z Mask8) {
|
||||
z.initOp(&op{op: "KORQ", kind: z.kind(), args: []*op{y.op, x.op}})
|
||||
return z
|
||||
}
|
||||
|
||||
func (x Mask8) ShiftLeft(c uint8) (z Mask8) {
|
||||
if c == 0 {
|
||||
z = x
|
||||
} else {
|
||||
z.initOp(&op{op: "KSHIFTLB", kind: z.kind(), args: []*op{imm(c), x.op}})
|
||||
}
|
||||
return z
|
||||
}
|
||||
|
||||
type Mask64 struct {
|
||||
valAny
|
||||
}
|
||||
|
||||
var kindMask64 = &kind{typ: "Mask64", reg: regClassK}
|
||||
|
||||
func ConstMask64(c uint64) (y Mask64) {
|
||||
var tmp Uint64
|
||||
tmp.initOp(&op{op: "MOVQ", kind: tmp.kind(), args: []*op{imm(c)}})
|
||||
y.initOp(&op{op: "KMOVQ", kind: y.kind(), args: []*op{tmp.op}})
|
||||
return y
|
||||
}
|
||||
|
||||
func (Mask64) kind() *kind {
|
||||
return kindMask64
|
||||
}
|
||||
|
||||
func (Mask64) wrap(x *op) Mask64 {
|
||||
var y Mask64
|
||||
y.initOp(x)
|
||||
return y
|
||||
}
|
||||
|
||||
func (x Mask64) ToUint64() (z Uint64) {
|
||||
z.initOp(&op{op: "KMOVQ", kind: z.kind(), args: []*op{x.op}})
|
||||
return z
|
||||
}
|
||||
|
||||
func (x Mask64) Or(y Mask64) (z Mask64) {
|
||||
z.initOp(&op{op: "KORQ", kind: z.kind(), args: []*op{y.op, x.op}})
|
||||
return z
|
||||
}
|
||||
|
||||
func (x Mask64) ShiftLeft(c uint8) (z Mask64) {
|
||||
if c == 0 {
|
||||
z = x
|
||||
} else {
|
||||
z.initOp(&op{op: "KSHIFTLQ", kind: z.kind(), args: []*op{imm(c), x.op}})
|
||||
}
|
||||
return z
|
||||
}
|
||||
|
||||
func (x Mask64) ShiftRight(c uint8) (z Mask64) {
|
||||
if c == 0 {
|
||||
z = x
|
||||
} else {
|
||||
z.initOp(&op{op: "KSHIFTRQ", kind: z.kind(), args: []*op{imm(c), x.op}})
|
||||
}
|
||||
return z
|
||||
}
|
||||
137 src/internal/runtime/gc/internal/gen/val.go Normal file
@@ -0,0 +1,137 @@
|
||||
// Copyright 2025 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package gen
|
||||
|
||||
import "sync"
|
||||
|
||||
type Value interface {
|
||||
kind() *kind
|
||||
getOp() *op
|
||||
}
|
||||
|
||||
type Word interface {
|
||||
Value
|
||||
isWord()
|
||||
}
|
||||
|
||||
// wrap is an unfortunate necessity so that we can pass Value types around as
|
||||
// values (not pointers), but still have generic functions that can construct a
|
||||
// new Value. Ideally we would just have a method on Value to initialize its op,
|
||||
// but that needs to have a non-pointer receiver to satisfy the interface and
|
||||
// then it can't mutate the Value.
|
||||
type wrap[T Value] interface {
|
||||
Value
|
||||
wrap(x *op) T
|
||||
}
|
||||
|
||||
type kind struct {
|
||||
typ string
|
||||
reg regClass
|
||||
}
|
||||
|
||||
type void struct {
|
||||
valAny
|
||||
}
|
||||
|
||||
var voidKind = &kind{typ: "void", reg: regClassNone}
|
||||
|
||||
func (void) kind() *kind { return voidKind }
|
||||
|
||||
type Ptr[T Value] struct {
|
||||
valGP
|
||||
}
|
||||
|
||||
// Ptr is a Word
|
||||
var _ Word = Ptr[void]{}
|
||||
|
||||
var ptrKinds = sync.Map{} // *kind -> *kind
|
||||
|
||||
func (Ptr[T]) kind() *kind {
|
||||
var x T
|
||||
xk := x.kind()
|
||||
pk, ok := ptrKinds.Load(xk)
|
||||
if !ok {
|
||||
k := &kind{typ: "Ptr[" + x.kind().typ + "]", reg: regClassGP}
|
||||
pk, _ = ptrKinds.LoadOrStore(xk, k)
|
||||
}
|
||||
return pk.(*kind)
|
||||
}
|
||||
|
||||
func (Ptr[T]) wrap(x *op) Ptr[T] {
|
||||
var y Ptr[T]
|
||||
y.initOp(x)
|
||||
return y
|
||||
}
|
||||
|
||||
func (x Ptr[T]) AddConst(off int) (y Ptr[T]) {
|
||||
base := x.op
|
||||
for base.op == "addConst" {
|
||||
off += base.args[1].c.(int)
|
||||
base = base.args[0]
|
||||
}
|
||||
y.initOp(&op{op: "addConst", kind: y.kind(), args: []*op{base, imm(off)}})
|
||||
return y
|
||||
}
|
||||
|
||||
func Deref[W wrap[T], T Value](ptr Ptr[W]) T {
|
||||
var off int
|
||||
base := ptr.op
|
||||
for base.op == "addConst" {
|
||||
off += base.args[1].c.(int)
|
||||
base = base.args[0]
|
||||
}
|
||||
|
||||
var y W
|
||||
return y.wrap(&op{op: "deref", kind: y.kind(), args: []*op{base}, c: off})
|
||||
}
|
||||
|
||||
type Array[T Value] struct {
|
||||
valAny
|
||||
}
|
||||
|
||||
func ConstArray[T Value](vals []T, name string) (y Array[T]) {
|
||||
// TODO: This probably doesn't actually work because emitConst won't
|
||||
// understand vals.
|
||||
y.initOp(&op{op: "const", kind: y.kind(), c: vals, name: name})
|
||||
return y
|
||||
}
|
||||
|
||||
func (Array[T]) kind() *kind {
|
||||
// TODO: Cache this like Ptr.kind.
|
||||
var x T
|
||||
return &kind{typ: "Array[" + x.kind().typ + "]", reg: regClassNone}
|
||||
}
|
||||
|
||||
type valGP struct {
|
||||
valAny
|
||||
}
|
||||
|
||||
func (valGP) isWord() {}
|
||||
|
||||
type valAny struct {
|
||||
*op
|
||||
}
|
||||
|
||||
func (v *valAny) initOp(x *op) {
|
||||
if v.op != nil {
|
||||
panic("double init of val")
|
||||
}
|
||||
if x.kind == nil {
|
||||
panic("val missing kind")
|
||||
}
|
||||
v.op = x
|
||||
|
||||
// Figure out this value's function.
|
||||
for _, arg := range x.args {
|
||||
if fn := arg.fn; fn != nil {
|
||||
fn.attach(x)
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (v valAny) getOp() *op {
|
||||
return v.op
|
||||
}
|
||||
@@ -7,7 +7,8 @@ package gc
|
||||
import "internal/goarch"
|
||||
|
||||
const (
|
||||
ptrBits = 8 * goarch.PtrSize
|
||||
// PageWords is the number of pointer-words per page.
|
||||
PageWords = PageSize / goarch.PtrSize
|
||||
|
||||
// A malloc header is functionally a single type pointer, but
|
||||
// we need to use 8 here to ensure 8-byte alignment of allocations
|
||||
@@ -43,7 +44,7 @@ const (
|
||||
// would not be invariant to size-class rounding. Eschewing this property means a
|
||||
// more complex check or possibly storing additional state to determine whether a
|
||||
// span has malloc headers.
|
||||
MinSizeForMallocHeader = goarch.PtrSize * ptrBits
|
||||
MinSizeForMallocHeader = goarch.PtrSize * goarch.PtrBits
|
||||
|
||||
// PageSize is the increment in which spans are managed.
|
||||
PageSize = 1 << PageShift
|
||||
|
||||
@@ -52,7 +52,7 @@ func main() {
|
||||
fmt.Fprintln(&b, "// Code generated by mksizeclasses.go; DO NOT EDIT.")
|
||||
fmt.Fprintln(&b, "//go:generate go run mksizeclasses.go")
|
||||
fmt.Fprintln(&b)
|
||||
fmt.Fprintln(&b, "package runtime")
|
||||
fmt.Fprintln(&b, "package gc")
|
||||
classes := makeClasses()
|
||||
|
||||
printComment(&b, classes)
|
||||
@@ -287,6 +287,14 @@ func maxObjsPerSpan(classes []class) int {
|
||||
return most
|
||||
}
|
||||
|
||||
func maxNPages(classes []class) int {
|
||||
most := 0
|
||||
for _, c := range classes[1:] {
|
||||
most = max(most, c.npages)
|
||||
}
|
||||
return most
|
||||
}
|
||||
|
||||
func printClasses(w io.Writer, classes []class) {
|
||||
fmt.Fprintln(w, "const (")
|
||||
fmt.Fprintf(w, "MinHeapAlign = %d\n", minHeapAlign)
|
||||
@@ -297,6 +305,7 @@ func printClasses(w io.Writer, classes []class) {
|
||||
fmt.Fprintf(w, "NumSizeClasses = %d\n", len(classes))
|
||||
fmt.Fprintf(w, "PageShift = %d\n", pageShift)
|
||||
fmt.Fprintf(w, "MaxObjsPerSpan = %d\n", maxObjsPerSpan(classes))
|
||||
fmt.Fprintf(w, "MaxSizeClassNPages = %d\n", maxNPages(classes))
|
||||
fmt.Fprintln(w, ")")
|
||||
|
||||
fmt.Fprint(w, "var SizeClassToSize = [NumSizeClasses]uint16 {")
|
||||
|
||||
22 src/internal/runtime/gc/scan/expand_amd64.go Normal file
@@ -0,0 +1,22 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package scan

import "internal/runtime/gc"

// ExpandAVX512 expands each bit in packed into f consecutive bits in unpacked,
// where f is the size, in words, of objects in sizeClass.
//
// This is a testing entrypoint to the expanders used by scanSpanPacked*.
//
//go:noescape
func ExpandAVX512(sizeClass int, packed *gc.ObjMask, unpacked *gc.PtrMask)

// gcExpandersAVX512 holds the PCs of the expander functions. These cannot be
// called directly as they don't follow the Go ABI, but you can use this to
// check whether a given expander PC is 0.
//
// It is defined in assembly.
var gcExpandersAVX512 [len(gc.SizeClassToSize)]uintptr
2631 src/internal/runtime/gc/scan/expand_amd64.s Normal file
File diff suppressed because it is too large
19 src/internal/runtime/gc/scan/expand_amd64_test.go Normal file
@@ -0,0 +1,19 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build amd64

package scan_test

import (
	"internal/runtime/gc/scan"
	"testing"
)

func TestExpandAVX512(t *testing.T) {
	if !scan.CanAVX512() {
		t.Skip("no AVX512")
	}
	testExpand(t, scan.ExpandAVX512)
}
39 src/internal/runtime/gc/scan/expand_reference.go Normal file
@@ -0,0 +1,39 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package scan

import (
	"internal/goarch"
	"internal/runtime/gc"
)

// ExpandReference is a reference implementation of an expander function
// that translates object mark bits into a bitmap of one bit per word of
// marked object, assuming the object is of the provided size class.
func ExpandReference(sizeClass int, packed *gc.ObjMask, unpacked *gc.PtrMask) {
	// Look up the size and derive the number of objects in a span.
	// We're only concerned with small objects in single-page spans,
	// and gc.PtrMask enforces this by being statically sized to
	// accommodate only such spans.
	size := uintptr(gc.SizeClassToSize[sizeClass])
	nObj := uintptr(gc.SizeClassToNPages[sizeClass]) * gc.PageSize / size

	// f is the expansion factor. For example, if our objects are of size 48,
	// then each mark bit will translate into 6 (48/8 = 6) set bits in the
	// pointer bitmap.
	f := size / goarch.PtrSize
	for i := range nObj {
		// Check if the object is marked.
		if packed[i/goarch.PtrBits]&(uintptr(1)<<(i%goarch.PtrBits)) == 0 {
			continue
		}
		// Propagate that mark into the destination: f consecutive bits,
		// starting at the object's offset within the span.
		for j := range f {
			b := i*f + j // i*f is the start bit for the object, j indexes each word after it.
			unpacked[b/goarch.PtrBits] |= uintptr(1) << (b % goarch.PtrBits)
		}
	}
}
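A worked instance of the expansion above. The size class index here is an assumed placeholder for whichever class holds 48-byte objects; on a 64-bit system f = 48/8 = 6.

var objs gc.ObjMask
objs[0] = 1 << 2 // mark object index 2
var ptrs gc.PtrMask
scan.ExpandReference(sizeClass48, &objs, &ptrs) // sizeClass48 is hypothetical
// Bits 12..17 of the pointer mask are now set: ptrs[0] == 0x3f << 12.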
37 src/internal/runtime/gc/scan/expand_test.go Normal file
@@ -0,0 +1,37 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package scan_test

import (
	"internal/goarch"
	"internal/runtime/gc"
	"internal/runtime/gc/scan"
	"testing"
)

type expandFunc func(sizeClass int, packed *gc.ObjMask, unpacked *gc.PtrMask)

func testExpand(t *testing.T, expF expandFunc) {
	expR := scan.ExpandReference

	testObjs(t, func(t *testing.T, sizeClass int, objs *gc.ObjMask) {
		var want, got gc.PtrMask
		expR(sizeClass, objs, &want)
		expF(sizeClass, objs, &got)

		for i := range want {
			if got[i] != want[i] {
				t.Errorf("expansion differs from reference at bit %d", i*goarch.PtrSize)
				if goarch.PtrSize == 4 {
					t.Logf("got: %032b", got[i])
					t.Logf("want: %032b", want[i])
				} else {
					t.Logf("got: %064b", got[i])
					t.Logf("want: %064b", want[i])
				}
			}
		}
	})
}
35 src/internal/runtime/gc/scan/filter.go Normal file
@@ -0,0 +1,35 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package scan

import "unsafe"

// FilterNil packs non-nil (non-zero) values in bufp together
// at the beginning of bufp, returning the length of the
// packed buffer. It treats bufp as an array of size n.
//
// TODO(mknyszek): Add a faster SIMD-based implementation.
func FilterNil(bufp *uintptr, n int32) int32 {
	buf := unsafe.Slice(bufp, int(n))
	lo := 0
	hi := len(buf) - 1
	for lo < hi {
		for lo < hi && buf[hi] == 0 {
			hi--
		}
		for lo < hi && buf[lo] != 0 {
			lo++
		}
		if lo >= hi {
			break
		}
		buf[lo] = buf[hi]
		hi--
	}
	if hi >= 0 && buf[hi] == 0 {
		hi--
	}
	return int32(hi) + 1
}
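A small usage sketch of FilterNil, mirroring the tests below. The relative order of the surviving values is not guaranteed, which is why the tests compare multisets rather than slices.

buf := []uintptr{5, 0, 12, 0}
n := scan.FilterNil(&buf[0], int32(len(buf)))
// n == 2 and buf[:n] now contains the values 5 and 12, in some order.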
94 src/internal/runtime/gc/scan/filter_test.go Normal file
@@ -0,0 +1,94 @@
|
||||
// Copyright 2025 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package scan_test
|
||||
|
||||
import (
|
||||
"internal/runtime/gc/scan"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestFilterNil(t *testing.T) {
|
||||
t.Run("empty", func(t *testing.T) {
|
||||
testFilterNil(t, []uintptr{}, []uintptr{})
|
||||
})
|
||||
t.Run("one", func(t *testing.T) {
|
||||
testFilterNil(t, []uintptr{4}, []uintptr{4})
|
||||
})
|
||||
t.Run("elimOne", func(t *testing.T) {
|
||||
testFilterNil(t, []uintptr{0}, []uintptr{})
|
||||
})
|
||||
t.Run("oneElimBegin", func(t *testing.T) {
|
||||
testFilterNil(t, []uintptr{0, 4}, []uintptr{4})
|
||||
})
|
||||
t.Run("oneElimEnd", func(t *testing.T) {
|
||||
testFilterNil(t, []uintptr{4, 0}, []uintptr{4})
|
||||
})
|
||||
t.Run("oneElimMultiBegin", func(t *testing.T) {
|
||||
testFilterNil(t, []uintptr{0, 0, 0, 4}, []uintptr{4})
|
||||
})
|
||||
t.Run("oneElimMultiEnd", func(t *testing.T) {
|
||||
testFilterNil(t, []uintptr{4, 0, 0, 0}, []uintptr{4})
|
||||
})
|
||||
t.Run("oneElimMulti", func(t *testing.T) {
|
||||
testFilterNil(t, []uintptr{0, 0, 0, 4, 0}, []uintptr{4})
|
||||
})
|
||||
t.Run("two", func(t *testing.T) {
|
||||
testFilterNil(t, []uintptr{5, 12}, []uintptr{5, 12})
|
||||
})
|
||||
t.Run("twoElimBegin", func(t *testing.T) {
|
||||
testFilterNil(t, []uintptr{0, 5, 12}, []uintptr{5, 12})
|
||||
})
|
||||
t.Run("twoElimMid", func(t *testing.T) {
|
||||
testFilterNil(t, []uintptr{5, 0, 12}, []uintptr{5, 12})
|
||||
})
|
||||
t.Run("twoElimEnd", func(t *testing.T) {
|
||||
testFilterNil(t, []uintptr{5, 12, 0}, []uintptr{5, 12})
|
||||
})
|
||||
t.Run("twoElimMulti", func(t *testing.T) {
|
||||
testFilterNil(t, []uintptr{0, 5, 0, 12, 0}, []uintptr{5, 12})
|
||||
})
|
||||
t.Run("Multi", func(t *testing.T) {
|
||||
testFilterNil(t, []uintptr{1, 5, 5, 0, 0, 0, 12, 0, 121, 5, 0}, []uintptr{1, 5, 5, 12, 121, 5})
|
||||
})
|
||||
}
|
||||
|
||||
func testFilterNil(t *testing.T, buf, want []uintptr) {
|
||||
var bufp *uintptr
|
||||
if len(buf) != 0 {
|
||||
bufp = &buf[0]
|
||||
}
|
||||
n := scan.FilterNil(bufp, int32(len(buf)))
|
||||
if n > int32(len(buf)) {
|
||||
t.Errorf("bogus new length returned: %d > %d", n, len(buf))
|
||||
return
|
||||
}
|
||||
buf = buf[:n]
|
||||
if len(buf) != len(want) {
|
||||
t.Errorf("lengths differ: got %d, want %d", len(buf), len(want))
|
||||
}
|
||||
|
||||
wantMap := make(map[uintptr]int)
|
||||
gotMap := make(map[uintptr]int)
|
||||
for _, p := range want {
|
||||
wantMap[p]++
|
||||
}
|
||||
for _, p := range buf {
|
||||
gotMap[p]++
|
||||
}
|
||||
for p, nWant := range wantMap {
|
||||
if nGot, ok := gotMap[p]; !ok {
|
||||
t.Errorf("want %d, but missing from output", p)
|
||||
} else if nGot != nWant {
|
||||
t.Errorf("want %d copies of %d, but got %d", nWant, p, nGot)
|
||||
}
|
||||
}
|
||||
for p := range gotMap {
|
||||
if _, ok := wantMap[p]; !ok {
|
||||
t.Errorf("got %d, but didn't want it", p)
|
||||
}
|
||||
}
|
||||
t.Logf("got: %v", buf)
|
||||
t.Logf("want: %v", want)
|
||||
}
|
||||
16  src/internal/runtime/gc/scan/mem_nounix_test.go  Normal file
@@ -0,0 +1,16 @@
|
||||
// Copyright 2025 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:build !unix
|
||||
|
||||
package scan_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func makeMem(t testing.TB, nPages int) ([]uintptr, func()) {
|
||||
t.Skip("mmap unsupported")
|
||||
return nil, nil
|
||||
}
|
||||
25  src/internal/runtime/gc/scan/mem_unix_test.go  Normal file
@@ -0,0 +1,25 @@
|
||||
// Copyright 2025 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:build unix
|
||||
|
||||
package scan_test
|
||||
|
||||
import (
|
||||
"internal/runtime/gc"
|
||||
"syscall"
|
||||
"testing"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
func makeMem(t testing.TB, nPages int) ([]uintptr, func()) {
|
||||
mem, err := syscall.Mmap(-1, 0, int(gc.PageSize*nPages), syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_PRIVATE|syscall.MAP_ANON)
|
||||
if err != nil {
|
||||
t.Fatalf("mmap failed: %s", err)
|
||||
}
|
||||
free := func() {
|
||||
syscall.Munmap(mem)
|
||||
}
|
||||
return unsafe.Slice((*uintptr)(unsafe.Pointer(unsafe.SliceData(mem))), len(mem)/8), free
|
||||
}
|
||||
412  src/internal/runtime/gc/scan/mkasm.go  Normal file
@@ -0,0 +1,412 @@
|
||||
// Copyright 2025 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:build ignore
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"os"
|
||||
"slices"
|
||||
"strconv"
|
||||
|
||||
"internal/runtime/gc"
|
||||
"internal/runtime/gc/internal/gen"
|
||||
)
|
||||
|
||||
const header = "// Code generated by mkasm.go. DO NOT EDIT.\n\n"
|
||||
|
||||
func main() {
|
||||
generate("expand_amd64.s", genExpanders)
|
||||
}
|
||||
|
||||
func generate(fileName string, genFunc func(*gen.File)) {
|
||||
var buf bytes.Buffer
|
||||
tee := io.MultiWriter(&buf, os.Stdout)
|
||||
|
||||
file := gen.NewFile(tee)
|
||||
|
||||
genFunc(file)
|
||||
|
||||
fmt.Fprintf(tee, header)
|
||||
file.Compile()
|
||||
|
||||
f, err := os.Create(fileName)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
defer f.Close()
|
||||
_, err = f.Write(buf.Bytes())
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
func genExpanders(file *gen.File) {
|
||||
gcExpandersAVX512 := make([]*gen.Func, len(gc.SizeClassToSize))
|
||||
for sc, ob := range gc.SizeClassToSize {
|
||||
if gc.SizeClassToNPages[sc] != 1 {
|
||||
// These functions all produce a bitmap that covers exactly one
|
||||
// page.
|
||||
continue
|
||||
}
|
||||
if ob > gc.MinSizeForMallocHeader {
|
||||
// This size class is too big to have a packed pointer/scalar bitmap.
|
||||
break
|
||||
}
|
||||
|
||||
xf := int(ob) / 8
|
||||
log.Printf("size class %d bytes, expansion %dx", ob, xf)
|
||||
|
||||
fn := gen.NewFunc(fmt.Sprintf("expandAVX512_%d<>", xf))
|
||||
ptrObjBits := gen.Arg[gen.Ptr[gen.Uint8x64]](fn)
|
||||
|
||||
if xf == 1 {
|
||||
expandIdentity(ptrObjBits)
|
||||
} else {
|
||||
ok := gfExpander(xf, ptrObjBits)
|
||||
if !ok {
|
||||
log.Printf("failed to generate expander for size class %d", sc)
|
||||
}
|
||||
}
|
||||
file.AddFunc(fn)
|
||||
gcExpandersAVX512[sc] = fn
|
||||
}
|
||||
|
||||
// Generate table mapping size class to expander PC
|
||||
file.AddConst("·gcExpandersAVX512", gcExpandersAVX512)
|
||||
}
|
||||
|
||||
// mat8x8 is an 8x8 bit matrix.
|
||||
type mat8x8 struct {
|
||||
mat [8]uint8
|
||||
}
|
||||
|
||||
func matGroupToVec(mats *[8]mat8x8) [8]uint64 {
|
||||
var out [8]uint64
|
||||
for i, mat := range mats {
|
||||
for j, row := range mat.mat {
|
||||
// For some reason, Intel flips the rows.
|
||||
out[i] |= uint64(row) << ((7 - j) * 8)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// expandIdentity implements 1x expansion (that is, no expansion).
|
||||
func expandIdentity(ptrObjBits gen.Ptr[gen.Uint8x64]) {
|
||||
objBitsLo := gen.Deref(ptrObjBits)
|
||||
objBitsHi := gen.Deref(ptrObjBits.AddConst(64))
|
||||
gen.Return(objBitsLo, objBitsHi)
|
||||
}
|
||||
|
||||
// gfExpander produces a function that expands each bit in an input bitmap into
|
||||
// f consecutive bits in an output bitmap.
|
||||
//
|
||||
// The input is
|
||||
//
|
||||
// AX *[8]uint64 = A pointer to floor(1024/f) bits (f >= 2, so at most 512 bits)
|
||||
//
|
||||
// The output is
|
||||
//
|
||||
// Z1 [64]uint8 = The bottom 512 bits of the expanded bitmap
|
||||
// Z2 [64]uint8 = The top 512 bits of the expanded bitmap
|
||||
//
|
||||
// TODO(austin): This should be Z0/Z1.
|
||||
func gfExpander(f int, ptrObjBits gen.Ptr[gen.Uint8x64]) bool {
|
||||
// TODO(austin): For powers of 2 >= 8, we can use mask expansion ops to make this much simpler.
|
||||
|
||||
// TODO(austin): For f >= 8, I suspect there are better ways to do this.
|
||||
//
|
||||
// For example, we could use a mask expansion to get a full byte for each
|
||||
// input bit, and separately create the bytes that blend adjacent bits, then
|
||||
// shuffle those bytes together. Certainly for f >= 16 this makes sense
|
||||
// because each of those bytes will be used, possibly more than once.
|
||||
|
||||
objBits := gen.Deref(ptrObjBits)
|
||||
|
||||
type term struct {
|
||||
iByte, oByte int
|
||||
mat mat8x8
|
||||
}
|
||||
var terms []term
|
||||
|
||||
// Iterate over all output bytes and construct the 8x8 GF2 matrix to compute
|
||||
// the output byte from the appropriate input byte. Gather all of these into
|
||||
// "terms".
|
||||
for oByte := 0; oByte < 1024/8; oByte++ {
|
||||
var byteMat mat8x8
|
||||
iByte := -1
|
||||
for oBit := oByte * 8; oBit < oByte*8+8; oBit++ {
|
||||
iBit := oBit / f
|
||||
if iByte == -1 {
|
||||
iByte = iBit / 8
|
||||
} else if iByte != iBit/8 {
|
||||
log.Printf("output byte %d straddles input bytes %d and %d", oByte, iByte, iBit/8)
|
||||
return false
|
||||
}
|
||||
// One way to view this is that the i'th row of the matrix will be
|
||||
// ANDed with the input byte, and the parity of the result will set
|
||||
// the i'th bit in the output. We use a simple 1 bit mask, so the
|
||||
// parity is irrelevant beyond selecting out that one bit.
|
||||
byteMat.mat[oBit%8] = 1 << (iBit % 8)
|
||||
}
|
||||
terms = append(terms, term{iByte, oByte, byteMat})
|
||||
}
|
||||
|
||||
if false {
|
||||
// Print input byte -> output byte as a matrix
|
||||
maxIByte, maxOByte := 0, 0
|
||||
for _, term := range terms {
|
||||
maxIByte = max(maxIByte, term.iByte)
|
||||
maxOByte = max(maxOByte, term.oByte)
|
||||
}
|
||||
iToO := make([][]rune, maxIByte+1)
|
||||
for i := range iToO {
|
||||
iToO[i] = make([]rune, maxOByte+1)
|
||||
}
|
||||
matMap := make(map[mat8x8]int)
|
||||
for _, term := range terms {
|
||||
i, ok := matMap[term.mat]
|
||||
if !ok {
|
||||
i = len(matMap)
|
||||
matMap[term.mat] = i
|
||||
}
|
||||
iToO[term.iByte][term.oByte] = 'A' + rune(i)
|
||||
}
|
||||
for o := range maxOByte + 1 {
|
||||
fmt.Printf("%d", o)
|
||||
for i := range maxIByte + 1 {
|
||||
fmt.Printf(",")
|
||||
if mat := iToO[i][o]; mat != 0 {
|
||||
fmt.Printf("%c", mat)
|
||||
}
|
||||
}
|
||||
fmt.Println()
|
||||
}
|
||||
}
|
||||
|
||||
// In hardware, each (8 byte) matrix applies to 8 bytes of data in parallel,
|
||||
// and we get to operate on up to 8 matrixes in parallel (or 64 values). That is:
|
||||
//
|
||||
// abcdefgh ijklmnop qrstuvwx yzABCDEF GHIJKLMN OPQRSTUV WXYZ0123 456789_+
|
||||
// mat0 mat1 mat2 mat3 mat4 mat5 mat6 mat7
|
||||
|
||||
// Group the terms by matrix, but limit each group to 8 terms.
|
||||
const termsPerGroup = 8 // Number of terms we can multiply by the same matrix.
|
||||
const groupsPerSuperGroup = 8 // Number of matrixes we can fit in a vector.
|
||||
|
||||
matMap := make(map[mat8x8]int)
|
||||
allMats := make(map[mat8x8]bool)
|
||||
var termGroups [][]term
|
||||
for _, term := range terms {
|
||||
allMats[term.mat] = true
|
||||
|
||||
i, ok := matMap[term.mat]
|
||||
if ok && f > groupsPerSuperGroup {
|
||||
// The output is ultimately produced in two [64]uint8 registers.
|
||||
// Getting every byte in the right place of each of these requires a
|
||||
// final permutation that often requires more than one source.
|
||||
//
|
||||
// Up to 8x expansion, we can get a really nice grouping so we can use
|
||||
// the same 8 matrix vector several times, without producing
|
||||
// permutations that require more than two sources.
|
||||
//
|
||||
// Above 8x, however, we can't get nice matrixes anyway, so we
|
||||
// instead prefer reducing the complexity of the permutations we
|
||||
// need to produce the final outputs. To do this, avoid grouping
|
||||
// together terms that are split across the two registers.
|
||||
outRegister := termGroups[i][0].oByte / 64
|
||||
if term.oByte/64 != outRegister {
|
||||
ok = false
|
||||
}
|
||||
}
|
||||
if !ok {
|
||||
// Start a new term group.
|
||||
i = len(termGroups)
|
||||
matMap[term.mat] = i
|
||||
termGroups = append(termGroups, nil)
|
||||
}
|
||||
|
||||
termGroups[i] = append(termGroups[i], term)
|
||||
|
||||
if len(termGroups[i]) == termsPerGroup {
|
||||
// This term group is full.
|
||||
delete(matMap, term.mat)
|
||||
}
|
||||
}
|
||||
|
||||
for i, termGroup := range termGroups {
|
||||
log.Printf("term group %d:", i)
|
||||
for _, term := range termGroup {
|
||||
log.Printf(" %+v", term)
|
||||
}
|
||||
}
|
||||
|
||||
// We can do 8 matrix multiplies in parallel, which is 8 term groups. Pack
|
||||
// as many term groups as we can into each super-group to minimize the
|
||||
// number of matrix multiplies.
|
||||
//
|
||||
// Ideally, we use the same matrix in each super-group, which might mean
|
||||
// doing fewer than 8 multiplies at a time. That's fine because it never
|
||||
// increases the total number of matrix multiplies.
|
||||
//
|
||||
// TODO: Packing the matrixes less densely may let us use more broadcast
|
||||
// loads instead of general permutations, though. That replaces a load of
|
||||
// the permutation with a load of the matrix, but is probably still slightly
|
||||
// better.
|
||||
var sgSize, nSuperGroups int
|
||||
oneMatVec := f <= groupsPerSuperGroup
|
||||
if oneMatVec {
|
||||
// We can use the same matrix in each multiply by doing sgSize
|
||||
// multiplies at a time.
|
||||
sgSize = groupsPerSuperGroup / len(allMats) * len(allMats)
|
||||
nSuperGroups = (len(termGroups) + sgSize - 1) / sgSize
|
||||
} else {
|
||||
// We can't use the same matrix for each multiply. Just do as many at a
|
||||
// time as we can.
|
||||
//
|
||||
// TODO: This is going to produce several distinct matrixes, when we
|
||||
// probably only need two. Be smarter about how we create super-groups
|
||||
// in this case. Maybe we build up an array of super-groups and then the
|
||||
// loop below just turns them into ops?
|
||||
sgSize = 8
|
||||
nSuperGroups = (len(termGroups) + groupsPerSuperGroup - 1) / groupsPerSuperGroup
|
||||
}
|
||||
|
||||
// Construct each super-group.
|
||||
var matGroup [8]mat8x8
|
||||
var matMuls []gen.Uint8x64
|
||||
var perm [128]int
|
||||
for sgi := range nSuperGroups {
|
||||
var iperm [64]uint8
|
||||
for i := range iperm {
|
||||
iperm[i] = 0xff // "Don't care"
|
||||
}
|
||||
// Pick off sgSize term groups.
|
||||
superGroup := termGroups[:min(len(termGroups), sgSize)]
|
||||
termGroups = termGroups[len(superGroup):]
|
||||
// Build the matrix and permutations for this super-group.
|
||||
var thisMatGroup [8]mat8x8
|
||||
for i, termGroup := range superGroup {
|
||||
// All terms in this group have the same matrix. Pick one.
|
||||
thisMatGroup[i] = termGroup[0].mat
|
||||
for j, term := range termGroup {
|
||||
// Build the input permutation.
|
||||
iperm[i*termsPerGroup+j] = uint8(term.iByte)
|
||||
// Build the output permutation.
|
||||
perm[term.oByte] = sgi*groupsPerSuperGroup*termsPerGroup + i*termsPerGroup + j
|
||||
}
|
||||
}
|
||||
log.Printf("input permutation %d: %v", sgi, iperm)
|
||||
|
||||
// Check that we're not making more distinct matrixes than expected.
|
||||
if oneMatVec {
|
||||
if sgi == 0 {
|
||||
matGroup = thisMatGroup
|
||||
} else if matGroup != thisMatGroup {
|
||||
log.Printf("super-groups have different matrixes:\n%+v\n%+v", matGroup, thisMatGroup)
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// Emit matrix op.
|
||||
matConst := gen.ConstUint64x8(matGroupToVec(&thisMatGroup), fmt.Sprintf("*_mat%d<>", sgi))
|
||||
inOp := objBits.Shuffle(gen.ConstUint8x64(iperm, fmt.Sprintf("*_inShuf%d<>", sgi)))
|
||||
matMul := matConst.GF2P8Affine(inOp)
|
||||
matMuls = append(matMuls, matMul)
|
||||
}
|
||||
|
||||
log.Printf("output permutation: %v", perm)
|
||||
|
||||
outLo, ok := genShuffle("*_outShufLo", (*[64]int)(perm[:64]), matMuls...)
|
||||
if !ok {
|
||||
log.Printf("bad number of inputs to final shuffle: %d != 1, 2, or 4", len(matMuls))
|
||||
return false
|
||||
}
|
||||
outHi, ok := genShuffle("*_outShufHi", (*[64]int)(perm[64:]), matMuls...)
|
||||
if !ok {
|
||||
log.Printf("bad number of inputs to final shuffle: %d != 1, 2, or 4", len(matMuls))
|
||||
return false
|
||||
}
|
||||
gen.Return(outLo, outHi)
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
func genShuffle(name string, perm *[64]int, args ...gen.Uint8x64) (gen.Uint8x64, bool) {
|
||||
// Construct flattened permutation.
|
||||
var vperm [64]byte
|
||||
|
||||
// Get the inputs used by this permutation.
|
||||
var inputs []int
|
||||
for i, src := range perm {
|
||||
inputIdx := slices.Index(inputs, src/64)
|
||||
if inputIdx == -1 {
|
||||
inputIdx = len(inputs)
|
||||
inputs = append(inputs, src/64)
|
||||
}
|
||||
vperm[i] = byte(src%64 | (inputIdx << 6))
|
||||
}
|
||||
|
||||
// Emit instructions for easy cases.
|
||||
switch len(inputs) {
|
||||
case 1:
|
||||
constOp := gen.ConstUint8x64(vperm, name)
|
||||
return args[inputs[0]].Shuffle(constOp), true
|
||||
case 2:
|
||||
constOp := gen.ConstUint8x64(vperm, name)
|
||||
return args[inputs[0]].Shuffle2(args[inputs[1]], constOp), true
|
||||
}
|
||||
|
||||
// Harder case, we need to shuffle in from up to 2 more tables.
|
||||
//
|
||||
// Perform two shuffles. One shuffle will get its data from the first
|
||||
// two inputs, the other shuffle will get its data from the other one
|
||||
// or two inputs. All values that neither shuffle cares about will
|
||||
// be zeroed.
|
||||
var vperms [2][64]byte
|
||||
var masks [2]uint64
|
||||
for j, idx := range vperm {
|
||||
for i := range vperms {
|
||||
vperms[i][j] = 0xff // "Don't care"
|
||||
}
|
||||
if idx == 0xff {
|
||||
continue
|
||||
}
|
||||
vperms[idx/128][j] = idx % 128
|
||||
masks[idx/128] |= uint64(1) << j
|
||||
}
|
||||
|
||||
// Validate that the masks are fully disjoint.
|
||||
if masks[0]^masks[1] != ^uint64(0) {
|
||||
panic("bad shuffle!")
|
||||
}
|
||||
|
||||
// Generate constants.
|
||||
constOps := make([]gen.Uint8x64, len(vperms))
|
||||
for i, v := range vperms {
|
||||
constOps[i] = gen.ConstUint8x64(v, name+strconv.Itoa(i))
|
||||
}
|
||||
|
||||
// Generate shuffles.
|
||||
switch len(inputs) {
|
||||
case 3:
|
||||
r0 := args[inputs[0]].Shuffle2Zeroed(args[inputs[1]], constOps[0], gen.ConstMask64(masks[0]))
|
||||
r1 := args[inputs[2]].ShuffleZeroed(constOps[1], gen.ConstMask64(masks[1]))
|
||||
return r0.ToUint64x8().Or(r1.ToUint64x8()).ToUint8x64(), true
|
||||
case 4:
|
||||
r0 := args[inputs[0]].Shuffle2Zeroed(args[inputs[1]], constOps[0], gen.ConstMask64(masks[0]))
|
||||
r1 := args[inputs[2]].Shuffle2Zeroed(args[inputs[3]], constOps[1], gen.ConstMask64(masks[1]))
|
||||
return r0.ToUint64x8().Or(r1.ToUint64x8()).ToUint8x64(), true
|
||||
}
|
||||
|
||||
// Too many inputs. To support more, we'd need to separate tables much earlier.
|
||||
// Right now all the indices fit in a byte, but with >4 inputs they might not (>256 bytes).
|
||||
return args[0], false
|
||||
}
|
||||
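As a hedged aside on the matrix construction above (illustration only, not emitted by mkasm.go): the following scalar model shows how one 8x8 bit matrix maps an input byte to an output byte, which is what a single byte lane of the GF2P8Affine operation computes, using the same row/bit indexing as byteMat.

package main

import (
	"fmt"
	"math/bits"
)

// applyMat8x8 models one byte lane of the GF(2) affine transform:
// output bit i is the parity of (row i AND input byte). With the
// single-bit rows built above, the parity simply selects one input bit.
func applyMat8x8(mat [8]uint8, in uint8) (out uint8) {
	for i, row := range mat {
		out |= uint8(bits.OnesCount8(row&in)&1) << i
	}
	return out
}

func main() {
	// 2x expansion of the low nibble: output bit j selects input bit j/2.
	var mat [8]uint8
	for j := range mat {
		mat[j] = 1 << (j / 2)
	}
	fmt.Printf("%08b -> %08b\n", 0b00001011, applyMat8x8(mat, 0b00001011))
	// Prints 00001011 -> 11001111: each of the four low input bits is doubled.
}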
41  src/internal/runtime/gc/scan/scan_amd64.go  Normal file
@@ -0,0 +1,41 @@
|
||||
// Copyright 2025 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package scan
|
||||
|
||||
import (
|
||||
"internal/cpu"
|
||||
"internal/runtime/gc"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
func ScanSpanPacked(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
|
||||
if CanAVX512() {
|
||||
return ScanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask)
|
||||
}
|
||||
panic("not implemented")
|
||||
}
|
||||
|
||||
func HasFastScanSpanPacked() bool {
|
||||
return avx512ScanPackedReqsMet
|
||||
}
|
||||
|
||||
// -- AVX512 --
|
||||
|
||||
func CanAVX512() bool {
|
||||
return avx512ScanPackedReqsMet
|
||||
}
|
||||
|
||||
func ScanSpanPackedAVX512(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
|
||||
return FilterNil(bufp, scanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask))
|
||||
}
|
||||
|
||||
//go:noescape
|
||||
func scanSpanPackedAVX512(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32)
|
||||
|
||||
var avx512ScanPackedReqsMet = cpu.X86.HasAVX512VL &&
|
||||
cpu.X86.HasAVX512BW &&
|
||||
cpu.X86.HasGFNI &&
|
||||
cpu.X86.HasAVX512BITALG &&
|
||||
cpu.X86.HasAVX512VBMI
|
||||
103  src/internal/runtime/gc/scan/scan_amd64.s  Normal file
@@ -0,0 +1,103 @@
|
||||
// Copyright 2025 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "go_asm.h"
|
||||
#include "textflag.h"
|
||||
|
||||
// Test-only.
|
||||
TEXT ·ExpandAVX512(SB), NOSPLIT, $0-24
|
||||
MOVQ sizeClass+0(FP), CX
|
||||
MOVQ packed+8(FP), AX
|
||||
|
||||
// Call the expander for this size class
|
||||
LEAQ ·gcExpandersAVX512(SB), BX
|
||||
CALL (BX)(CX*8)
|
||||
|
||||
MOVQ unpacked+16(FP), DI // Expanded output bitmap pointer
|
||||
VMOVDQU64 Z1, 0(DI)
|
||||
VMOVDQU64 Z2, 64(DI)
|
||||
VZEROUPPER
|
||||
RET
|
||||
|
||||
TEXT ·scanSpanPackedAVX512(SB), NOSPLIT, $256-44
|
||||
// Z1+Z2 = Expand the grey object mask into a grey word mask
|
||||
MOVQ objMarks+16(FP), AX
|
||||
MOVQ sizeClass+24(FP), CX
|
||||
LEAQ ·gcExpandersAVX512(SB), BX
|
||||
CALL (BX)(CX*8)
|
||||
|
||||
// Z3+Z4 = Load the pointer mask
|
||||
MOVQ ptrMask+32(FP), AX
|
||||
VMOVDQU64 0(AX), Z3
|
||||
VMOVDQU64 64(AX), Z4
|
||||
|
||||
// Z1+Z2 = Combine the grey word mask with the pointer mask to get the scan mask
|
||||
VPANDQ Z1, Z3, Z1
|
||||
VPANDQ Z2, Z4, Z2
|
||||
|
||||
// Now each bit of Z1+Z2 represents one word of the span.
|
||||
// Thus, each byte covers 64 bytes of memory, which is also how
|
||||
// much we can fit in a Z register.
|
||||
//
|
||||
// We do a load/compress for each 64 byte frame.
|
||||
//
|
||||
// Z3+Z4 [128]uint8 = Number of memory words to scan in each 64 byte frame
|
||||
VPOPCNTB Z1, Z3 // Requires BITALG
|
||||
VPOPCNTB Z2, Z4
|
||||
|
||||
// Store the scan mask and word counts at 0(SP) and 128(SP).
|
||||
//
|
||||
// TODO: Is it better to read directly from the registers?
|
||||
VMOVDQU64 Z1, 0(SP)
|
||||
VMOVDQU64 Z2, 64(SP)
|
||||
VMOVDQU64 Z3, 128(SP)
|
||||
VMOVDQU64 Z4, 192(SP)
|
||||
|
||||
// SI = Current address in span
|
||||
MOVQ mem+0(FP), SI
|
||||
// DI = Scan buffer base
|
||||
MOVQ bufp+8(FP), DI
|
||||
// DX = Index in scan buffer, (DI)(DX*8) = Current position in scan buffer
|
||||
MOVQ $0, DX
|
||||
|
||||
// AX = address in scan mask, 128(AX) = address in popcount
|
||||
LEAQ 0(SP), AX
|
||||
|
||||
// Loop over the 64 byte frames in this span.
|
||||
// BX = 1 past the end of the scan mask
|
||||
LEAQ 128(SP), BX
|
||||
|
||||
// Align loop to a cache line so that performance is less sensitive
|
||||
// to how this function ends up laid out in memory. This is a hot
|
||||
// function in the GC, and this is a tight loop. We don't want
|
||||
// performance to waver wildly due to unrelated changes.
|
||||
PCALIGN $64
|
||||
loop:
|
||||
// CX = Fetch the mask of words to load from this frame.
|
||||
MOVBQZX 0(AX), CX
|
||||
// Skip empty frames.
|
||||
TESTQ CX, CX
|
||||
JZ skip
|
||||
|
||||
// Load the 64 byte frame.
|
||||
KMOVB CX, K1
|
||||
VMOVDQA64 0(SI), Z1
|
||||
|
||||
// Collect just the pointers from the greyed objects into the scan buffer,
|
||||
// i.e., copy the words selected by the mask from Z1 into contiguous memory.
|
||||
VPCOMPRESSQ Z1, K1, (DI)(DX*8)
|
||||
// Advance the scan buffer position by the number of pointers.
|
||||
MOVBQZX 128(AX), CX
|
||||
ADDQ CX, DX
|
||||
|
||||
skip:
|
||||
ADDQ $64, SI
|
||||
ADDQ $1, AX
|
||||
CMPQ AX, BX
|
||||
JB loop
|
||||
|
||||
end:
|
||||
MOVL DX, count+40(FP)
|
||||
VZEROUPPER
|
||||
RET
|
||||
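For readers not fluent in the SIMD above, here is a hedged pure-Go sketch (hypothetical helper, not part of this change) of what one loop iteration does per 64-byte frame: the KMOVB/VMOVDQA64/VPCOMPRESSQ sequence amounts to copying the masked words into the scan buffer and advancing the position by the frame's popcount.

// compressFrame copies the words of frame whose bit is set in mask into
// buf starting at index n and returns the new index. This mirrors
// VPCOMPRESSQ under a K mask, followed by adding POPCNT(mask) to the
// buffer position.
func compressFrame(frame *[8]uintptr, mask uint8, buf []uintptr, n int) int {
	for i := 0; i < 8; i++ {
		if mask&(1<<i) != 0 {
			buf[n] = frame[i]
			n++
		}
	}
	return n
}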
19  src/internal/runtime/gc/scan/scan_amd64_test.go  Normal file
@@ -0,0 +1,19 @@
|
||||
// Copyright 2025 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:build amd64
|
||||
|
||||
package scan_test
|
||||
|
||||
import (
|
||||
"internal/runtime/gc/scan"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestScanSpanPackedAVX512(t *testing.T) {
|
||||
if !scan.CanAVX512() {
|
||||
t.Skip("no AVX512")
|
||||
}
|
||||
testScanSpanPacked(t, scan.ScanSpanPackedAVX512)
|
||||
}
|
||||
23  src/internal/runtime/gc/scan/scan_generic.go  Normal file
@@ -0,0 +1,23 @@
|
||||
// Copyright 2025 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:build !amd64
|
||||
|
||||
package scan
|
||||
|
||||
import (
|
||||
"internal/runtime/gc"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
func HasFastScanSpanPacked() bool {
|
||||
// N.B. ScanSpanPackedGeneric isn't actually fast enough to serve as a general-purpose implementation.
|
||||
// The runtime's alternative of jumping between each object is still substantially better, even at
|
||||
// relatively high object densities.
|
||||
return false
|
||||
}
|
||||
|
||||
func ScanSpanPacked(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
|
||||
return ScanSpanPackedGo(mem, bufp, objMarks, sizeClass, ptrMask)
|
||||
}
|
||||
14  src/internal/runtime/gc/scan/scan_generic_test.go  Normal file
@@ -0,0 +1,14 @@
|
||||
// Copyright 2025 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package scan_test
|
||||
|
||||
import (
|
||||
"internal/runtime/gc/scan"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestScanSpanPackedGo(t *testing.T) {
|
||||
testScanSpanPacked(t, scan.ScanSpanPackedGo)
|
||||
}
|
||||
104  src/internal/runtime/gc/scan/scan_go.go  Normal file
@@ -0,0 +1,104 @@
|
||||
// Copyright 2025 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package scan
|
||||
|
||||
import (
|
||||
"internal/goarch"
|
||||
"internal/runtime/gc"
|
||||
"internal/runtime/sys"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
// ScanSpanPackedGo is an optimized pure Go implementation of ScanSpanPacked.
|
||||
func ScanSpanPackedGo(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
|
||||
buf := newUnsafeBuf(bufp)
|
||||
objBytes := uintptr(gc.SizeClassToSize[sizeClass])
|
||||
// TODO(austin): Trim objMarks to the number of objects in this size class?
|
||||
for markI, markWord := range objMarks {
|
||||
for range sys.OnesCount64(uint64(markWord)) {
|
||||
bitI := sys.TrailingZeros64(uint64(markWord))
|
||||
markWord &^= 1 << bitI
|
||||
|
||||
objIndex := markI*goarch.PtrBits + bitI
|
||||
|
||||
// objStartInSpan is the index of the word from mem where the
|
||||
// object starts. objEndInSpan points to the next object, i.e.
|
||||
// it's an exclusive upper bound.
|
||||
objStartInSpan := objBytes * uintptr(objIndex) / goarch.PtrSize
|
||||
objEndInSpan := objStartInSpan + objBytes/goarch.PtrSize
|
||||
|
||||
// TODO: Another way to do this would be to extract the pointer mask
|
||||
// for this object (it's at most 64 bits) and do a bit iteration
|
||||
// over that.
|
||||
|
||||
for wordI := objStartInSpan; wordI < objEndInSpan; wordI++ {
|
||||
val := *(*uintptr)(unsafe.Add(mem, wordI*goarch.PtrSize))
|
||||
// Check if we should enqueue this word.
|
||||
//
|
||||
// We load the word before the check because, even though this
|
||||
// can lead to loading much more than necessary, it's faster.
|
||||
// Most likely this is because it warms up the hardware
|
||||
// prefetcher much better, and gives us more time before we need
|
||||
// the value.
|
||||
//
|
||||
// We discard values that can't possibly be useful pointers
|
||||
// here, too, because this filters out a lot of words and does
|
||||
// so with as little processing as possible.
|
||||
//
|
||||
// TODO: This is close to, but not entirely branchless.
|
||||
isPtr := bool2int(ptrMask[wordI/goarch.PtrBits]&(1<<(wordI%goarch.PtrBits)) != 0)
|
||||
isNonNil := bool2int(val >= 4096)
|
||||
pred := isPtr&isNonNil != 0
|
||||
buf.addIf(val, pred)
|
||||
}
|
||||
}
|
||||
}
|
||||
// We don't know the true size of bufp, but we can at least catch obvious errors
|
||||
// in this function by making sure we didn't write more than gc.PageWords pointers
|
||||
// into the buffer.
|
||||
buf.check(gc.PageWords)
|
||||
return int32(buf.n)
|
||||
}
|
||||
|
||||
// unsafeBuf allows for appending to a buffer without bounds-checks or branches.
|
||||
type unsafeBuf[T any] struct {
|
||||
base *T
|
||||
n int
|
||||
}
|
||||
|
||||
func newUnsafeBuf[T any](base *T) unsafeBuf[T] {
|
||||
return unsafeBuf[T]{base, 0}
|
||||
}
|
||||
|
||||
// addIf appends a value to the buffer if the predicate is true.
|
||||
//
|
||||
// addIf speculatively writes to the next index of the buffer, so the caller
|
||||
// must be certain that such a write will still be in-bounds with respect
|
||||
// to the buffer's true capacity.
|
||||
func (b *unsafeBuf[T]) addIf(val T, pred bool) {
|
||||
*(*T)(unsafe.Add(unsafe.Pointer(b.base), b.n*int(unsafe.Sizeof(val)))) = val
|
||||
b.n += bool2int(pred)
|
||||
}
|
||||
|
||||
// check performs a bounds check on speculative writes into the buffer.
|
||||
// Calling this shortly after a series of addIf calls is important to
|
||||
// catch any misuse as fast as possible. Separating the bounds check from
|
||||
// the append is more efficient, but one check to cover several appends is
|
||||
// still efficient and much more memory safe.
|
||||
func (b unsafeBuf[T]) check(cap int) {
|
||||
// We fail even if b.n == cap because addIf speculatively writes one past b.n.
|
||||
if b.n >= cap {
|
||||
panic("unsafeBuf overflow")
|
||||
}
|
||||
}
|
||||
|
||||
func bool2int(x bool) int {
|
||||
// This particular pattern gets optimized by the compiler.
|
||||
var b int
|
||||
if x {
|
||||
b = 1
|
||||
}
|
||||
return b
|
||||
}
|
||||
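To make the branchless filtering above concrete, here is a hedged restatement of the per-word predicate as its own function (illustrative only; shouldEnqueue is not a real helper in this package, while bool2int is the one defined above).

// shouldEnqueue reports whether a word should be copied into the scan
// buffer: its pointer-mask bit must be set and the loaded value must not
// be nil or a tiny value that cannot be a heap pointer. Both conditions
// become 0/1 ints and are combined with AND, avoiding a second branch.
func shouldEnqueue(ptrMaskWord uintptr, bit uint, val uintptr) bool {
	isPtr := bool2int(ptrMaskWord&(1<<bit) != 0)
	isNonNil := bool2int(val >= 4096)
	return isPtr&isNonNil != 0
}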
40  src/internal/runtime/gc/scan/scan_reference.go  Normal file
@@ -0,0 +1,40 @@
|
||||
// Copyright 2025 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package scan
|
||||
|
||||
import (
|
||||
"internal/goarch"
|
||||
"internal/runtime/gc"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
// ScanSpanPackedReference is the reference implementation of ScanSpanPacked. It prioritizes clarity over performance.
//
// Concretely, ScanSpanPacked functions read pointers from mem, assumed to be gc.PageSize-aligned and gc.PageSize in size,
// and write them to bufp, which is large enough to guarantee that even if every pointer-word of mem is a pointer, it will fit.
// Therefore, bufp is always at least gc.PageSize in size.
|
||||
//
|
||||
// ScanSpanPacked is supposed to identify pointers by first filtering words by objMarks, where each bit of the mask
|
||||
// represents gc.SizeClassToSize[sizeClass] bytes of memory, and then filtering again by the bits in ptrMask.
|
||||
func ScanSpanPackedReference(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
|
||||
buf := unsafe.Slice(bufp, gc.PageWords)
|
||||
expandBy := uintptr(gc.SizeClassToSize[sizeClass]) / goarch.PtrSize
|
||||
for word := range gc.PageWords {
|
||||
objI := uintptr(word) / expandBy
|
||||
if objMarks[objI/goarch.PtrBits]&(1<<(objI%goarch.PtrBits)) == 0 {
|
||||
continue
|
||||
}
|
||||
if ptrMask[word/goarch.PtrBits]&(1<<(word%goarch.PtrBits)) == 0 {
|
||||
continue
|
||||
}
|
||||
ptr := *(*uintptr)(unsafe.Add(mem, word*goarch.PtrSize))
|
||||
if ptr == 0 {
|
||||
continue
|
||||
}
|
||||
buf[count] = ptr
|
||||
count++
|
||||
}
|
||||
return count
|
||||
}
|
||||
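A small hedged sketch (hypothetical helper, mirroring the word/bit indexing that ScanSpanPackedReference uses when reading the mask) of how a caller would set a mark bit in a gc.ObjMask:

// markObject sets the mark bit for object objIndex, using the same
// objIndex/goarch.PtrBits word split as the reference scan above.
func markObject(objs *gc.ObjMask, objIndex uintptr) {
	objs[objIndex/goarch.PtrBits] |= 1 << (objIndex % goarch.PtrBits)
}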
254  src/internal/runtime/gc/scan/scan_test.go  Normal file
@@ -0,0 +1,254 @@
|
||||
// Copyright 2025 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package scan_test
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"internal/cpu"
|
||||
"internal/goarch"
|
||||
"internal/runtime/gc"
|
||||
"internal/runtime/gc/scan"
|
||||
"math/bits"
|
||||
"math/rand/v2"
|
||||
"slices"
|
||||
"sync"
|
||||
"testing"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
type scanFunc func(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32)
|
||||
|
||||
func testScanSpanPacked(t *testing.T, scanF scanFunc) {
|
||||
scanR := scan.ScanSpanPackedReference
|
||||
|
||||
// Construct a fake memory
|
||||
mem, free := makeMem(t, 1)
|
||||
defer free()
|
||||
for i := range mem {
|
||||
// Use values > gc.PageSize because a scan function can discard
|
||||
// pointers smaller than this.
|
||||
mem[i] = uintptr(int(gc.PageSize) + i + 1)
|
||||
}
|
||||
|
||||
// Construct a random pointer mask
|
||||
rnd := rand.New(rand.NewPCG(42, 42))
|
||||
var ptrs gc.PtrMask
|
||||
for i := range ptrs {
|
||||
ptrs[i] = uintptr(rnd.Uint64())
|
||||
}
|
||||
|
||||
bufF := make([]uintptr, gc.PageWords)
|
||||
bufR := make([]uintptr, gc.PageWords)
|
||||
testObjs(t, func(t *testing.T, sizeClass int, objs *gc.ObjMask) {
|
||||
nF := scanF(unsafe.Pointer(&mem[0]), &bufF[0], objs, uintptr(sizeClass), &ptrs)
|
||||
nR := scanR(unsafe.Pointer(&mem[0]), &bufR[0], objs, uintptr(sizeClass), &ptrs)
|
||||
|
||||
if nR != nF {
|
||||
t.Errorf("want %d count, got %d", nR, nF)
|
||||
} else if !slices.Equal(bufF[:nF], bufR[:nR]) {
|
||||
t.Errorf("want scanned pointers %d, got %d", bufR[:nR], bufF[:nF])
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func testObjs(t *testing.T, f func(t *testing.T, sizeClass int, objMask *gc.ObjMask)) {
|
||||
for sizeClass := range gc.NumSizeClasses {
|
||||
if sizeClass == 0 {
|
||||
continue
|
||||
}
|
||||
size := uintptr(gc.SizeClassToSize[sizeClass])
|
||||
if size > gc.MinSizeForMallocHeader {
|
||||
break // Pointer/scalar metadata is not packed for larger sizes.
|
||||
}
|
||||
t.Run(fmt.Sprintf("size=%d", size), func(t *testing.T) {
|
||||
// Scan a few objects near i to test boundary conditions.
|
||||
const objMask = 0x101
|
||||
nObj := uintptr(gc.SizeClassToNPages[sizeClass]) * gc.PageSize / size
|
||||
for i := range nObj - uintptr(bits.Len(objMask)-1) {
|
||||
t.Run(fmt.Sprintf("objs=0x%x<<%d", objMask, i), func(t *testing.T) {
|
||||
var objs gc.ObjMask
|
||||
objs[i/goarch.PtrBits] = objMask << (i % goarch.PtrBits)
|
||||
f(t, sizeClass, &objs)
|
||||
})
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
var dataCacheSizes = sync.OnceValue(func() []uintptr {
|
||||
cs := cpu.DataCacheSizes()
|
||||
for i, c := range cs {
|
||||
fmt.Printf("# L%d cache: %d (%d Go pages)\n", i+1, c, c/gc.PageSize)
|
||||
}
|
||||
return cs
|
||||
})
|
||||
|
||||
func BenchmarkScanSpanPacked(b *testing.B) {
|
||||
benchmarkCacheSizes(b, benchmarkScanSpanPackedAllSizeClasses)
|
||||
}
|
||||
|
||||
func benchmarkCacheSizes(b *testing.B, fn func(b *testing.B, heapPages int)) {
|
||||
cacheSizes := dataCacheSizes()
|
||||
b.Run("cache=tiny/pages=1", func(b *testing.B) {
|
||||
fn(b, 1)
|
||||
})
|
||||
for i, cacheBytes := range cacheSizes {
|
||||
pages := int(cacheBytes*3/4) / gc.PageSize
|
||||
b.Run(fmt.Sprintf("cache=L%d/pages=%d", i+1, pages), func(b *testing.B) {
|
||||
fn(b, pages)
|
||||
})
|
||||
}
|
||||
ramPages := int(cacheSizes[len(cacheSizes)-1]*3/2) / gc.PageSize
|
||||
b.Run(fmt.Sprintf("cache=ram/pages=%d", ramPages), func(b *testing.B) {
|
||||
fn(b, ramPages)
|
||||
})
|
||||
}
|
||||
|
||||
func benchmarkScanSpanPackedAllSizeClasses(b *testing.B, nPages int) {
|
||||
for sc := range gc.NumSizeClasses {
|
||||
if sc == 0 {
|
||||
continue
|
||||
}
|
||||
if uintptr(gc.SizeClassToSize[sc]) > gc.MinSizeForMallocHeader {
|
||||
break
|
||||
}
|
||||
b.Run(fmt.Sprintf("sizeclass=%d", sc), func(b *testing.B) {
|
||||
benchmarkScanSpanPacked(b, nPages, sc)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func benchmarkScanSpanPacked(b *testing.B, nPages int, sizeClass int) {
|
||||
rnd := rand.New(rand.NewPCG(42, 42))
|
||||
|
||||
// Construct a fake memory
|
||||
mem, free := makeMem(b, nPages)
|
||||
defer free()
|
||||
for i := range mem {
|
||||
// Use values > gc.PageSize because a scan function can discard
|
||||
// pointers smaller than this.
|
||||
mem[i] = uintptr(int(gc.PageSize) + i + 1)
|
||||
}
|
||||
|
||||
// Construct a random pointer mask
|
||||
ptrs := make([]gc.PtrMask, nPages)
|
||||
for i := range ptrs {
|
||||
for j := range ptrs[i] {
|
||||
ptrs[i][j] = uintptr(rnd.Uint64())
|
||||
}
|
||||
}
|
||||
|
||||
// Visit the pages in a random order
|
||||
pageOrder := rnd.Perm(nPages)
|
||||
|
||||
// Create the scan buffer.
|
||||
buf := make([]uintptr, gc.PageWords)
|
||||
|
||||
// Sweep from 0 marks to all marks. We'll use the same marks for each page
|
||||
// because I don't think that predictability matters.
|
||||
objBytes := uintptr(gc.SizeClassToSize[sizeClass])
|
||||
nObj := gc.PageSize / objBytes
|
||||
markOrder := rnd.Perm(int(nObj))
|
||||
const steps = 11
|
||||
for i := 0; i < steps; i++ {
|
||||
frac := float64(i) / float64(steps-1)
|
||||
// Set frac marks.
|
||||
nMarks := int(float64(len(markOrder))*frac + 0.5)
|
||||
var objMarks gc.ObjMask
|
||||
for _, mark := range markOrder[:nMarks] {
|
||||
objMarks[mark/goarch.PtrBits] |= 1 << (mark % goarch.PtrBits)
|
||||
}
|
||||
greyClusters := 0
|
||||
for page := range ptrs {
|
||||
greyClusters += countGreyClusters(sizeClass, &objMarks, &ptrs[page])
|
||||
}
|
||||
|
||||
// Report MB/s of how much memory they're actually hitting. This assumes
|
||||
// 64 byte cache lines (TODO: Should it assume 128 byte cache lines?)
|
||||
// and expands each access to the whole cache line. This is useful for
|
||||
// comparing against memory bandwidth.
|
||||
//
|
||||
// TODO: Add a benchmark that just measures single core memory bandwidth
|
||||
// for comparison. (See runtime memcpy benchmarks.)
|
||||
//
|
||||
// TODO: Should there be a separate measure where we don't expand to
|
||||
// cache lines?
|
||||
avgBytes := int64(greyClusters) * int64(cpu.CacheLineSize) / int64(len(ptrs))
|
||||
|
||||
b.Run(fmt.Sprintf("pct=%d", int(100*frac)), func(b *testing.B) {
|
||||
b.Run("impl=Reference", func(b *testing.B) {
|
||||
b.SetBytes(avgBytes)
|
||||
for i := range b.N {
|
||||
page := pageOrder[i%len(pageOrder)]
|
||||
scan.ScanSpanPackedReference(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page])
|
||||
}
|
||||
})
|
||||
b.Run("impl=Go", func(b *testing.B) {
|
||||
b.SetBytes(avgBytes)
|
||||
for i := range b.N {
|
||||
page := pageOrder[i%len(pageOrder)]
|
||||
scan.ScanSpanPackedGo(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page])
|
||||
}
|
||||
})
|
||||
if scan.HasFastScanSpanPacked() {
|
||||
b.Run("impl=Platform", func(b *testing.B) {
|
||||
b.SetBytes(avgBytes)
|
||||
for i := range b.N {
|
||||
page := pageOrder[i%len(pageOrder)]
|
||||
scan.ScanSpanPacked(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page])
|
||||
}
|
||||
})
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func countGreyClusters(sizeClass int, objMarks *gc.ObjMask, ptrMask *gc.PtrMask) int {
|
||||
clusters := 0
|
||||
lastCluster := -1
|
||||
|
||||
expandBy := uintptr(gc.SizeClassToSize[sizeClass]) / goarch.PtrSize
|
||||
for word := range gc.PageWords {
|
||||
objI := uintptr(word) / expandBy
|
||||
if objMarks[objI/goarch.PtrBits]&(1<<(objI%goarch.PtrBits)) == 0 {
|
||||
continue
|
||||
}
|
||||
if ptrMask[word/goarch.PtrBits]&(1<<(word%goarch.PtrBits)) == 0 {
|
||||
continue
|
||||
}
|
||||
c := word * 8 / goarch.PtrBits
|
||||
if c != lastCluster {
|
||||
lastCluster = c
|
||||
clusters++
|
||||
}
|
||||
}
|
||||
return clusters
|
||||
}
|
||||
|
||||
func BenchmarkScanMaxBandwidth(b *testing.B) {
|
||||
// Measure the theoretical "maximum" bandwidth of scanning by reproducing
|
||||
// the memory access pattern of a full page scan, but using memcpy as the
|
||||
// kernel instead of scanning.
|
||||
benchmarkCacheSizes(b, func(b *testing.B, heapPages int) {
|
||||
mem, free := makeMem(b, heapPages)
|
||||
defer free()
|
||||
for i := range mem {
|
||||
mem[i] = uintptr(int(gc.PageSize) + i + 1)
|
||||
}
|
||||
buf := make([]uintptr, gc.PageWords)
|
||||
|
||||
// Visit the pages in a random order
|
||||
rnd := rand.New(rand.NewPCG(42, 42))
|
||||
pageOrder := rnd.Perm(heapPages)
|
||||
|
||||
b.SetBytes(int64(gc.PageSize))
|
||||
|
||||
b.ResetTimer()
|
||||
for i := range b.N {
|
||||
page := pageOrder[i%len(pageOrder)]
|
||||
copy(buf, mem[gc.PageWords*page:])
|
||||
}
|
||||
})
|
||||
}
|
||||
@@ -82,14 +82,15 @@ package gc
|
||||
// 8192 13 32768
|
||||
|
||||
const (
|
||||
MinHeapAlign = 8
|
||||
MaxSmallSize = 32768
|
||||
SmallSizeDiv = 8
|
||||
SmallSizeMax = 1024
|
||||
LargeSizeDiv = 128
|
||||
NumSizeClasses = 68
|
||||
PageShift = 13
|
||||
MaxObjsPerSpan = 1024
|
||||
MinHeapAlign = 8
|
||||
MaxSmallSize = 32768
|
||||
SmallSizeDiv = 8
|
||||
SmallSizeMax = 1024
|
||||
LargeSizeDiv = 128
|
||||
NumSizeClasses = 68
|
||||
PageShift = 13
|
||||
MaxObjsPerSpan = 1024
|
||||
MaxSizeClassNPages = 10
|
||||
)
|
||||
|
||||
var SizeClassToSize = [NumSizeClasses]uint16{0, 8, 16, 24, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 576, 640, 704, 768, 896, 1024, 1152, 1280, 1408, 1536, 1792, 2048, 2304, 2688, 3072, 3200, 3456, 4096, 4864, 5376, 6144, 6528, 6784, 6912, 8192, 9472, 9728, 10240, 10880, 12288, 13568, 14336, 16384, 18432, 19072, 20480, 21760, 24576, 27264, 28672, 32768}
|
||||
|
||||