cmd/compile: use generated loops instead of DUFFZERO on amd64

goarch: amd64
cpu: 12th Gen Intel(R) Core(TM) i7-12700
                        │     base      │                 exp                 │
                        │    sec/op     │   sec/op     vs base                │
MemclrKnownSize112-20      1.270n ± 14%   1.006n ± 0%  -20.72% (p=0.000 n=10)
MemclrKnownSize128-20      1.266n ±  0%   1.005n ± 0%  -20.58% (p=0.000 n=10)
MemclrKnownSize192-20      1.771n ±  0%   1.579n ± 1%  -10.84% (p=0.000 n=10)
MemclrKnownSize248-20      4.034n ±  0%   3.520n ± 0%  -12.75% (p=0.000 n=10)
MemclrKnownSize256-20      2.269n ±  0%   2.014n ± 0%  -11.26% (p=0.000 n=10)
MemclrKnownSize512-20      4.280n ±  0%   4.030n ± 0%   -5.84% (p=0.000 n=10)
MemclrKnownSize1024-20     8.309n ±  1%   8.057n ± 0%   -3.03% (p=0.000 n=10)

Change-Id: I8f1627e2a1e981ff351dc7178932b32a2627f765
Reviewed-on: https://go-review.googlesource.com/c/go/+/678937
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Keith Randall
2025-06-03 16:23:02 -07:00
parent c0ee2fd4e3
commit eb7f515c4d
8 changed files with 225 additions and 150 deletions
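
The MemclrKnownSize benchmarks clear a buffer whose length is a compile-time constant, so the compiler expands the clear inline instead of calling runtime.memclrNoHeapPointers; that inline expansion is exactly what this CL changes. A minimal sketch of such a benchmark (an assumed shape for illustration, with a made-up variable name, not the actual benchmark from Go's test suite):

package clear_test

import "testing"

var buf112 [112]byte

// The clear below has a compile-time-known size (112 bytes), so on
// amd64 it compiles to inline code: a DUFFZERO call before this CL,
// seven 16-byte MOVUPS X15 stores after it.
func BenchmarkMemclrKnownSize112(b *testing.B) {
	for i := 0; i < b.N; i++ {
		for j := range buf112 {
			buf112[j] = 0
		}
	}
}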


@@ -1007,26 +1007,103 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
ssagen.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64DUFFZERO:
case ssa.OpAMD64LoweredZero:
if s.ABI != obj.ABIInternal {
// zero X15 manually
opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
}
off := duffStart(v.AuxInt)
adj := duffAdj(v.AuxInt)
var p *obj.Prog
if adj != 0 {
p = s.Prog(x86.ALEAQ)
p.From.Type = obj.TYPE_MEM
p.From.Offset = adj
p.From.Reg = x86.REG_DI
p.To.Type = obj.TYPE_REG
p.To.Reg = x86.REG_DI
ptrReg := v.Args[0].Reg()
n := v.AuxInt
if n < 16 {
v.Fatalf("Zero too small %d", n)
}
p = s.Prog(obj.ADUFFZERO)
p.To.Type = obj.TYPE_ADDR
p.To.Sym = ir.Syms.Duffzero
p.To.Offset = off
zero16 := func(off int64) {
zero16(s, ptrReg, off)
}
// Generate zeroing instructions.
var off int64
for n >= 16 {
zero16(off)
off += 16
n -= 16
}
if n != 0 {
// use partially overlapped write.
// TODO: n <= 8, use smaller write?
zero16(off + n - 16)
}
case ssa.OpAMD64LoweredZeroLoop:
if s.ABI != obj.ABIInternal {
// zero X15 manually
opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
}
ptrReg := v.Args[0].Reg()
countReg := v.RegTmp()
n := v.AuxInt
loopSize := int64(64)
if n < 3*loopSize {
// - a loop count of 0 won't work.
// - a loop count of 1 is useless.
// - a loop count of 2 is a code size ~tie
// 4 instructions to implement the loop
// 4 instructions in the loop body
// vs
// 8 instructions in the straightline code
// Might as well use straightline code.
v.Fatalf("ZeroLoop size too small %d", n)
}
zero16 := func(off int64) {
zero16(s, ptrReg, off)
}
// Put iteration count in a register.
// MOVL $(n/loopSize), countReg
p := s.Prog(x86.AMOVL)
p.From.Type = obj.TYPE_CONST
p.From.Offset = n / loopSize
p.To.Type = obj.TYPE_REG
p.To.Reg = countReg
cntInit := p
// Zero loopSize bytes starting at ptrReg.
for i := range loopSize / 16 {
zero16(i * 16)
}
// ADDQ $loopSize, ptrReg
p = s.Prog(x86.AADDQ)
p.From.Type = obj.TYPE_CONST
p.From.Offset = loopSize
p.To.Type = obj.TYPE_REG
p.To.Reg = ptrReg
// DECL countReg
p = s.Prog(x86.ADECL)
p.To.Type = obj.TYPE_REG
p.To.Reg = countReg
// Jump to first instruction in loop if we're not done yet.
// JNE head
p = s.Prog(x86.AJNE)
p.To.Type = obj.TYPE_BRANCH
p.To.SetTarget(cntInit.Link)
// Multiples of the loop size are now done.
n %= loopSize
// Write any fractional portion.
var off int64
for n >= 16 {
zero16(off)
off += 16
n -= 16
}
if n != 0 {
// Use partially-overlapping write.
// TODO: n <= 8, use smaller write?
zero16(off + n - 16)
}
case ssa.OpAMD64DUFFCOPY:
p := s.Prog(obj.ADUFFCOPY)
p.To.Type = obj.TYPE_ADDR
@@ -1621,3 +1698,14 @@ func spillArgReg(pp *objw.Progs, p *obj.Prog, f *ssa.Func, t *types.Type, reg in
p.Pos = p.Pos.WithNotStmt()
return p
}
// zero 16 bytes at reg+off.
func zero16(s *ssagen.State, reg int16, off int64) {
// MOVUPS X15, off(ptrReg)
p := s.Prog(x86.AMOVUPS)
p.From.Type = obj.TYPE_REG
p.From.Reg = x86.REG_X15
p.To.Type = obj.TYPE_MEM
p.To.Reg = reg
p.To.Offset = off
}
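
Two notes on the hunk above. First, the local closures of the form zero16 := func(off int64) { zero16(s, ptrReg, off) } are not recursive: inside the initializer of a short variable declaration the new name is not yet in scope, so the inner call still resolves to the package-level zero16 helper added at the bottom of the file. Second, when the size is not a multiple of 16, the tail is handled by sliding the last 16-byte store backwards rather than emitting narrower writes. A sketch of the resulting store offsets for the straightline LoweredZero case (zeroPlan is an illustrative name, not compiler code):

package main

import "fmt"

// zeroPlan lists the offsets of the 16-byte MOVUPS X15 stores emitted
// for an n-byte LoweredZero (16 <= n < 192). If n is not a multiple of
// 16, the final store is moved back to off+n-16 so that it partially
// overlaps the previous store.
func zeroPlan(n int64) []int64 {
	var offs []int64
	var off int64
	for n >= 16 {
		offs = append(offs, off)
		off += 16
		n -= 16
	}
	if n != 0 {
		offs = append(offs, off+n-16) // partially overlapping write
	}
	return offs
}

func main() {
	fmt.Println(zeroPlan(112)) // [0 16 32 48 64 80 96]
	fmt.Println(zeroPlan(120)) // [0 16 32 48 64 80 96 104]; the last store overlaps the previous one by 8 bytes
}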


@@ -375,34 +375,17 @@
(MOVQstoreconst [makeValAndOff(0,int32(s-8))] destptr
(MOVQstoreconst [makeValAndOff(0,0)] destptr mem))
// Adjust zeros to be a multiple of 16 bytes.
(Zero [s] destptr mem) && s%16 != 0 && s > 16 =>
(Zero [s-s%16] (OffPtr <destptr.Type> destptr [s%16])
(MOVOstoreconst [makeValAndOff(0,0)] destptr mem))
// Zeroing up to 192 bytes uses straightline code.
(Zero [s] destptr mem) && s >= 16 && s < 192 => (LoweredZero [s] destptr mem)
(Zero [16] destptr mem) =>
(MOVOstoreconst [makeValAndOff(0,0)] destptr mem)
(Zero [32] destptr mem) =>
(MOVOstoreconst [makeValAndOff(0,16)] destptr
(MOVOstoreconst [makeValAndOff(0,0)] destptr mem))
(Zero [48] destptr mem) =>
(MOVOstoreconst [makeValAndOff(0,32)] destptr
(MOVOstoreconst [makeValAndOff(0,16)] destptr
(MOVOstoreconst [makeValAndOff(0,0)] destptr mem)))
(Zero [64] destptr mem) =>
(MOVOstoreconst [makeValAndOff(0,48)] destptr
(MOVOstoreconst [makeValAndOff(0,32)] destptr
(MOVOstoreconst [makeValAndOff(0,16)] destptr
(MOVOstoreconst [makeValAndOff(0,0)] destptr mem))))
// Medium zeroing uses a duff device.
(Zero [s] destptr mem)
&& s > 64 && s <= 1024 && s%16 == 0 =>
(DUFFZERO [s] destptr mem)
// Zeroing up to ~1KB uses a small loop.
(Zero [s] destptr mem) && s >= 192 && s <= repZeroThreshold => (LoweredZeroLoop [s] destptr mem)
// Large zeroing uses REP STOSQ.
(Zero [s] destptr mem)
&& s > 1024 && s%8 == 0 =>
(Zero [s] destptr mem) && s > repZeroThreshold && s%8 != 0 =>
(Zero [s-s%8] (OffPtr <destptr.Type> destptr [s%8])
(MOVOstoreconst [makeValAndOff(0,0)] destptr mem))
(Zero [s] destptr mem) && s > repZeroThreshold && s%8 == 0 =>
(REPSTOSQ destptr (MOVQconst [s/8]) (MOVQconst [0]) mem)
// Lowering constants
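
Taken together, the rules now dispatch purely on size class. A sketch of the resulting strategy table (sizes below 16 are handled by the earlier MOVQ/MOVO-store rules; repZeroThreshold is the constant 1408 added in a later hunk):

package main

import "fmt"

const repZeroThreshold = 1408 // mirrors the constant added to the rewrite package

// zeroStrategy names the lowering the rules above select for an s-byte Zero.
func zeroStrategy(s int64) string {
	switch {
	case s < 16:
		return "small-size MOVQ/MOVO store rules"
	case s < 192:
		return "LoweredZero: straightline MOVUPS stores"
	case s <= repZeroThreshold:
		return "LoweredZeroLoop: 64-byte loop plus straightline tail"
	case s%8 != 0:
		return "peel s%8 bytes with a MOVO store, then zero the remaining multiple-of-8 size"
	default:
		return "REPSTOSQ"
	}
}

func main() {
	for _, s := range []int64{128, 256, 1408, 4096} {
		fmt.Printf("%4d: %s\n", s, zeroStrategy(s))
	}
}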


@@ -889,15 +889,30 @@ func init() {
// auxint = # of bytes to zero
// returns mem
{
name: "DUFFZERO",
name: "LoweredZero",
aux: "Int64",
argLength: 2,
reg: regInfo{
inputs: []regMask{buildReg("DI")},
clobbers: buildReg("DI"),
inputs: []regMask{gp},
},
//faultOnNilArg0: true, // Note: removed for 73748. TODO: reenable at some point
unsafePoint: true, // FP maintenance around DUFFCOPY can be clobbered by interrupts
faultOnNilArg0: true,
},
// arg0 = pointer to start of memory to zero
// arg1 = mem
// auxint = # of bytes to zero
// returns mem
{
name: "LoweredZeroLoop",
aux: "Int64",
argLength: 2,
reg: regInfo{
inputs: []regMask{gp},
clobbersArg0: true,
},
clobberFlags: true,
faultOnNilArg0: true,
needIntTemp: true,
},
// arg0 = address of memory to zero


@@ -1051,7 +1051,8 @@ const (
OpAMD64MOVLstoreconstidx4
OpAMD64MOVQstoreconstidx1
OpAMD64MOVQstoreconstidx8
OpAMD64DUFFZERO
OpAMD64LoweredZero
OpAMD64LoweredZeroLoop
OpAMD64REPSTOSQ
OpAMD64CALLstatic
OpAMD64CALLtail
@@ -13873,15 +13874,28 @@ var opcodeTable = [...]opInfo{
},
},
{
name: "DUFFZERO",
auxType: auxInt64,
argLen: 2,
unsafePoint: true,
name: "LoweredZero",
auxType: auxInt64,
argLen: 2,
faultOnNilArg0: true,
reg: regInfo{
inputs: []inputInfo{
{0, 128}, // DI
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
},
clobbers: 128, // DI
},
},
{
name: "LoweredZeroLoop",
auxType: auxInt64,
argLen: 2,
clobberFlags: true,
needIntTemp: true,
faultOnNilArg0: true,
reg: regInfo{
inputs: []inputInfo{
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
},
clobbersArg0: true,
},
},
{


@@ -6,6 +6,7 @@ package ssa
import (
"cmd/compile/internal/types"
"fmt"
"testing"
)
@@ -218,10 +219,37 @@ func TestSpillMove2(t *testing.T) {
}
func TestClobbersArg0(t *testing.T) {
c := testConfig(t)
f := c.Fun("entry",
Bloc("entry",
Valu("mem", OpInitMem, types.TypeMem, 0, nil),
Valu("ptr", OpArg, c.config.Types.Int64.PtrTo(), 0, c.Temp(c.config.Types.Int64.PtrTo())),
Valu("dst", OpArg, c.config.Types.Int64.PtrTo().PtrTo(), 0, c.Temp(c.config.Types.Int64.PtrTo().PtrTo())),
Valu("zero", OpAMD64LoweredZeroLoop, types.TypeMem, 256, nil, "ptr", "mem"),
Valu("store", OpAMD64MOVQstore, types.TypeMem, 0, nil, "dst", "ptr", "zero"),
Exit("store")))
flagalloc(f.f)
regalloc(f.f)
checkFunc(f.f)
// LoweredZeroLoop clobbers its argument, so there must be a copy of "ptr" somewhere
// so we still have that value available at "store".
if n := numCopies(f.blocks["entry"]); n != 1 {
fmt.Printf("%s\n", f.f.String())
t.Errorf("got %d copies, want 1", n)
}
}
func numSpills(b *Block) int {
return numOps(b, OpStoreReg)
}
func numCopies(b *Block) int {
return numOps(b, OpCopy)
}
func numOps(b *Block, op Op) int {
n := 0
for _, v := range b.Values {
if v.Op == OpStoreReg {
if v.Op == op {
n++
}
}


@@ -29,6 +29,8 @@ type deadValueChoice bool
const (
leaveDeadValues deadValueChoice = false
removeDeadValues = true
repZeroThreshold = 1408 // size beyond which we use REP STOS for zeroing
)
// deadcode indicates whether rewrite should try to remove any values that become dead.


@@ -30025,19 +30025,49 @@ func rewriteValueAMD64_OpZero(v *Value) bool {
return true
}
// match: (Zero [s] destptr mem)
// cond: s%16 != 0 && s > 16
// result: (Zero [s-s%16] (OffPtr <destptr.Type> destptr [s%16]) (MOVOstoreconst [makeValAndOff(0,0)] destptr mem))
// cond: s >= 16 && s < 192
// result: (LoweredZero [s] destptr mem)
for {
s := auxIntToInt64(v.AuxInt)
destptr := v_0
mem := v_1
if !(s%16 != 0 && s > 16) {
if !(s >= 16 && s < 192) {
break
}
v.reset(OpAMD64LoweredZero)
v.AuxInt = int64ToAuxInt(s)
v.AddArg2(destptr, mem)
return true
}
// match: (Zero [s] destptr mem)
// cond: s >= 192 && s <= repZeroThreshold
// result: (LoweredZeroLoop [s] destptr mem)
for {
s := auxIntToInt64(v.AuxInt)
destptr := v_0
mem := v_1
if !(s >= 192 && s <= repZeroThreshold) {
break
}
v.reset(OpAMD64LoweredZeroLoop)
v.AuxInt = int64ToAuxInt(s)
v.AddArg2(destptr, mem)
return true
}
// match: (Zero [s] destptr mem)
// cond: s > repZeroThreshold && s%8 != 0
// result: (Zero [s-s%8] (OffPtr <destptr.Type> destptr [s%8]) (MOVOstoreconst [makeValAndOff(0,0)] destptr mem))
for {
s := auxIntToInt64(v.AuxInt)
destptr := v_0
mem := v_1
if !(s > repZeroThreshold && s%8 != 0) {
break
}
v.reset(OpZero)
v.AuxInt = int64ToAuxInt(s - s%16)
v.AuxInt = int64ToAuxInt(s - s%8)
v0 := b.NewValue0(v.Pos, OpOffPtr, destptr.Type)
v0.AuxInt = int64ToAuxInt(s % 16)
v0.AuxInt = int64ToAuxInt(s % 8)
v0.AddArg(destptr)
v1 := b.NewValue0(v.Pos, OpAMD64MOVOstoreconst, types.TypeMem)
v1.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 0))
@@ -30045,99 +30075,14 @@ func rewriteValueAMD64_OpZero(v *Value) bool {
v.AddArg2(v0, v1)
return true
}
// match: (Zero [16] destptr mem)
// result: (MOVOstoreconst [makeValAndOff(0,0)] destptr mem)
for {
if auxIntToInt64(v.AuxInt) != 16 {
break
}
destptr := v_0
mem := v_1
v.reset(OpAMD64MOVOstoreconst)
v.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 0))
v.AddArg2(destptr, mem)
return true
}
// match: (Zero [32] destptr mem)
// result: (MOVOstoreconst [makeValAndOff(0,16)] destptr (MOVOstoreconst [makeValAndOff(0,0)] destptr mem))
for {
if auxIntToInt64(v.AuxInt) != 32 {
break
}
destptr := v_0
mem := v_1
v.reset(OpAMD64MOVOstoreconst)
v.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 16))
v0 := b.NewValue0(v.Pos, OpAMD64MOVOstoreconst, types.TypeMem)
v0.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 0))
v0.AddArg2(destptr, mem)
v.AddArg2(destptr, v0)
return true
}
// match: (Zero [48] destptr mem)
// result: (MOVOstoreconst [makeValAndOff(0,32)] destptr (MOVOstoreconst [makeValAndOff(0,16)] destptr (MOVOstoreconst [makeValAndOff(0,0)] destptr mem)))
for {
if auxIntToInt64(v.AuxInt) != 48 {
break
}
destptr := v_0
mem := v_1
v.reset(OpAMD64MOVOstoreconst)
v.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 32))
v0 := b.NewValue0(v.Pos, OpAMD64MOVOstoreconst, types.TypeMem)
v0.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 16))
v1 := b.NewValue0(v.Pos, OpAMD64MOVOstoreconst, types.TypeMem)
v1.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 0))
v1.AddArg2(destptr, mem)
v0.AddArg2(destptr, v1)
v.AddArg2(destptr, v0)
return true
}
// match: (Zero [64] destptr mem)
// result: (MOVOstoreconst [makeValAndOff(0,48)] destptr (MOVOstoreconst [makeValAndOff(0,32)] destptr (MOVOstoreconst [makeValAndOff(0,16)] destptr (MOVOstoreconst [makeValAndOff(0,0)] destptr mem))))
for {
if auxIntToInt64(v.AuxInt) != 64 {
break
}
destptr := v_0
mem := v_1
v.reset(OpAMD64MOVOstoreconst)
v.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 48))
v0 := b.NewValue0(v.Pos, OpAMD64MOVOstoreconst, types.TypeMem)
v0.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 32))
v1 := b.NewValue0(v.Pos, OpAMD64MOVOstoreconst, types.TypeMem)
v1.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 16))
v2 := b.NewValue0(v.Pos, OpAMD64MOVOstoreconst, types.TypeMem)
v2.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 0))
v2.AddArg2(destptr, mem)
v1.AddArg2(destptr, v2)
v0.AddArg2(destptr, v1)
v.AddArg2(destptr, v0)
return true
}
// match: (Zero [s] destptr mem)
// cond: s > 64 && s <= 1024 && s%16 == 0
// result: (DUFFZERO [s] destptr mem)
for {
s := auxIntToInt64(v.AuxInt)
destptr := v_0
mem := v_1
if !(s > 64 && s <= 1024 && s%16 == 0) {
break
}
v.reset(OpAMD64DUFFZERO)
v.AuxInt = int64ToAuxInt(s)
v.AddArg2(destptr, mem)
return true
}
// match: (Zero [s] destptr mem)
// cond: s > 1024 && s%8 == 0
// cond: s > repZeroThreshold && s%8 == 0
// result: (REPSTOSQ destptr (MOVQconst [s/8]) (MOVQconst [0]) mem)
for {
s := auxIntToInt64(v.AuxInt)
destptr := v_0
mem := v_1
if !(s > 1024 && s%8 == 0) {
if !(s > repZeroThreshold && s%8 == 0) {
break
}
v.reset(OpAMD64REPSTOSQ)


@@ -17,31 +17,31 @@ type T struct {
func (t *T) f() {
// amd64:-".*runtime.memclrNoHeapPointers"
// amd64:"DUFFZERO"
// amd64:`MOVUPS\tX15,`
for i := range t.a {
t.a[i] = 0
}
// amd64:-".*runtime.memclrNoHeapPointers"
// amd64:"DUFFZERO"
// amd64:`MOVUPS\tX15,`
for i := range *t.a {
t.a[i] = 0
}
// amd64:-".*runtime.memclrNoHeapPointers"
// amd64:"DUFFZERO"
// amd64:`MOVUPS\tX15,`
for i := range t.a {
(*t.a)[i] = 0
}
// amd64:-".*runtime.memclrNoHeapPointers"
// amd64:"DUFFZERO"
// amd64:`MOVUPS\tX15,`
for i := range *t.a {
(*t.a)[i] = 0
}
// amd64:-".*runtime.memclrNoHeapPointers"
// amd64:"DUFFZERO"
// amd64:`MOVUPS\tX15,`
for i := range t.b {
t.b[i] = 0
}
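
With the new lowering, each fixed-size clear above compiles to a short run of MOVUPS X15 stores instead of a CALL to runtime.duffzero, which is what the updated check strings assert. A standalone way to see the new code (the file name and types here are hypothetical; -gcflags=-S is the standard flag for dumping assembly):

// zero80.go: build with go build -gcflags=-S zero80.go and look for
// five MOVUPS X15, off(reg) stores in f, with no duffzero call.
package main

type T struct {
	a *[10]int // the pointed-to array is 80 bytes on amd64
}

//go:noinline
func (t *T) f() {
	// The compiler recognizes this range-clear idiom and lowers it
	// to an 80-byte Zero op.
	for i := range t.a {
		t.a[i] = 0
	}
}

func main() {
	t := &T{a: new([10]int)}
	t.f()
}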