[dev.simd] cmd/compile: zero only low 128-bit of X15

Zeroing the upper part of X15 may make the CPU think it is
"dirty" and slow down SSE operations. For now, just don't zero
the upper part, and construct a zero value on the fly if we need
a 256- or 512-bit zero value. Maybe VZEROUPPER works better than
explicitly zeroing X15, but we need to evaluate.

Long term, we probably want to move more things from SSE to AVX.

This essentially undoes CL 698237 and CL 698238, except that it
keeps using X15 for 128-bit zeroing for SIMD.

Change-Id: I1564e6332c4c57f9721397c92c7c734c5497534c
Reviewed-on: https://go-review.googlesource.com/c/go/+/728240
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
This commit is contained in:
Cherry Mui
2025-12-08 12:14:24 -05:00
parent 144cf17d2c
commit f38e968aba
12 changed files with 25 additions and 102 deletions

View File

@@ -18,7 +18,6 @@ import (
"cmd/internal/obj"
"cmd/internal/obj/x86"
"internal/abi"
"internal/buildcfg"
)
// ssaMarkMoves marks any MOVXconst ops that need to avoid clobbering flags.
@@ -1718,7 +1717,15 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
case ssa.OpAMD64VZEROUPPER, ssa.OpAMD64VZEROALL:
s.Prog(v.Op.Asm())
case ssa.OpAMD64Zero128, ssa.OpAMD64Zero256, ssa.OpAMD64Zero512: // no code emitted
case ssa.OpAMD64Zero128: // no code emitted
case ssa.OpAMD64Zero256, ssa.OpAMD64Zero512:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v)
p.AddRestSourceReg(simdReg(v))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
case ssa.OpAMD64VMOVSSf2v, ssa.OpAMD64VMOVSDf2v:
// These are for initializing the least 32/64 bits of a SIMD register from a "float".
@@ -1871,34 +1878,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
// zeroX15 zeroes the X15 register.
func zeroX15(s *ssagen.State) {
if !buildcfg.Experiment.SIMD {
opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
return
}
vxorps := func(s *ssagen.State) {
p := s.Prog(x86.AVXORPS)
p.From.Type = obj.TYPE_REG
p.From.Reg = x86.REG_X15
p.AddRestSourceReg(x86.REG_X15)
p.To.Type = obj.TYPE_REG
p.To.Reg = x86.REG_X15
}
if buildcfg.GOAMD64 >= 3 {
vxorps(s)
return
}
// AVX may not be available, check before zeroing the high bits.
p := s.Prog(x86.ACMPB)
p.From.Type = obj.TYPE_MEM
p.From.Name = obj.NAME_EXTERN
p.From.Sym = ir.Syms.X86HasAVX
p.To.Type = obj.TYPE_CONST
p.To.Offset = 1
jmp := s.Prog(x86.AJNE)
jmp.To.Type = obj.TYPE_BRANCH
vxorps(s)
sse := opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
jmp.To.SetTarget(sse)
opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
}
// Example instruction: VRSQRTPS X1, X1

View File

@@ -214,6 +214,7 @@ func init() {
vloadk = regInfo{inputs: []regMask{gpspsb, mask, 0}, outputs: vonly}
vstorek = regInfo{inputs: []regMask{gpspsb, mask, v, 0}}
v01 = regInfo{inputs: nil, outputs: vonly}
v11 = regInfo{inputs: vonly, outputs: vonly} // used in resultInArg0 ops, arg0 must not be x15
v21 = regInfo{inputs: []regMask{v, vz}, outputs: vonly} // used in resultInArg0 ops, arg0 must not be x15
vk = regInfo{inputs: vzonly, outputs: maskonly}
@@ -232,6 +233,7 @@ func init() {
gpv = regInfo{inputs: []regMask{gp}, outputs: vonly}
v2flags = regInfo{inputs: []regMask{vz, vz}}
w01 = regInfo{inputs: nil, outputs: wonly}
w11 = regInfo{inputs: wonly, outputs: wonly} // used in resultInArg0 ops, arg0 must not be x15
w21 = regInfo{inputs: []regMask{wz, wz}, outputs: wonly}
wk = regInfo{inputs: wzonly, outputs: maskonly}
@@ -1398,12 +1400,15 @@ func init() {
{name: "VPMOVVec64x4ToM", argLength: 1, reg: vk, asm: "VPMOVQ2M"},
{name: "VPMOVVec64x8ToM", argLength: 1, reg: wk, asm: "VPMOVQ2M"},
// X15 is the zero register up to 128-bit. For larger values, we zero it on the fly.
{name: "Zero128", argLength: 0, reg: x15only, zeroWidth: true, fixedReg: true},
{name: "Zero256", argLength: 0, reg: x15only, zeroWidth: true, fixedReg: true},
{name: "Zero512", argLength: 0, reg: x15only, zeroWidth: true, fixedReg: true},
{name: "Zero256", argLength: 0, reg: v01, asm: "VPXOR"},
{name: "Zero512", argLength: 0, reg: w01, asm: "VPXORQ"},
// Move a 32/64 bit float to a 128-bit SIMD register.
{name: "VMOVSDf2v", argLength: 1, reg: fpv, asm: "VMOVSD"},
{name: "VMOVSSf2v", argLength: 1, reg: fpv, asm: "VMOVSS"},
{name: "VMOVQ", argLength: 1, reg: gpv, asm: "VMOVQ"},
{name: "VMOVD", argLength: 1, reg: gpv, asm: "VMOVD"},

View File

@@ -20365,24 +20365,22 @@ var opcodeTable = [...]opInfo{
},
},
{
name: "Zero256",
argLen: 0,
zeroWidth: true,
fixedReg: true,
name: "Zero256",
argLen: 0,
asm: x86.AVPXOR,
reg: regInfo{
outputs: []outputInfo{
{0, 2147483648}, // X15
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "Zero512",
argLen: 0,
zeroWidth: true,
fixedReg: true,
name: "Zero512",
argLen: 0,
asm: x86.AVPXORQ,
reg: regInfo{
outputs: []outputInfo{
{0, 2147483648}, // X15
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},

View File

@@ -1093,11 +1093,6 @@ needm:
// there's no need to handle that. Clear R14 so that there's
// a bad value in there, in case needm tries to use it.
XORPS X15, X15
#ifdef GOEXPERIMENT_simd
CMPB internalcpu·X86+const_offsetX86HasAVX(SB), $1
JNE 2(PC)
VXORPS X15, X15, X15
#endif
XORQ R14, R14
MOVQ $runtime·needAndBindM<ABIInternal>(SB), AX
CALL AX
@@ -1795,11 +1790,6 @@ TEXT ·sigpanic0(SB),NOSPLIT,$0-0
get_tls(R14)
MOVQ g(R14), R14
XORPS X15, X15
#ifdef GOEXPERIMENT_simd
CMPB internalcpu·X86+const_offsetX86HasAVX(SB), $1
JNE 2(PC)
VXORPS X15, X15, X15
#endif
JMP ·sigpanic<ABIInternal>(SB)
// gcWriteBarrier informs the GC about heap pointer writes.

View File

@@ -456,11 +456,6 @@ call:
// Back to Go world, set special registers.
// The g register (R14) is preserved in C.
XORPS X15, X15
#ifdef GOEXPERIMENT_simd
CMPB internalcpu·X86+const_offsetX86HasAVX(SB), $1
JNE 2(PC)
VXORPS X15, X15, X15
#endif
RET
// C->Go callback thunk that allows to call runtime·racesymbolize from C code.

View File

@@ -177,11 +177,6 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
get_tls(R12)
MOVQ g(R12), R14
PXOR X15, X15
#ifdef GOEXPERIMENT_simd
CMPB internalcpu·X86+const_offsetX86HasAVX(SB), $1
JNE 2(PC)
VXORPS X15, X15, X15
#endif
// Reserve space for spill slots.
NOP SP // disable vet stack checking

View File

@@ -228,11 +228,6 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
get_tls(R12)
MOVQ g(R12), R14
PXOR X15, X15
#ifdef GOEXPERIMENT_simd
CMPB internalcpu·X86+const_offsetX86HasAVX(SB), $1
JNE 2(PC)
VXORPS X15, X15, X15
#endif
// Reserve space for spill slots.
NOP SP // disable vet stack checking

View File

@@ -265,11 +265,6 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
get_tls(R12)
MOVQ g(R12), R14
PXOR X15, X15
#ifdef GOEXPERIMENT_simd
CMPB internalcpu·X86+const_offsetX86HasAVX(SB), $1
JNE 2(PC)
VXORPS X15, X15, X15
#endif
// Reserve space for spill slots.
NOP SP // disable vet stack checking
@@ -295,11 +290,6 @@ TEXT runtime·sigprofNonGoWrapper<>(SB),NOSPLIT|NOFRAME,$0
get_tls(R12)
MOVQ g(R12), R14
PXOR X15, X15
#ifdef GOEXPERIMENT_simd
CMPB internalcpu·X86+const_offsetX86HasAVX(SB), $1
JNE 2(PC)
VXORPS X15, X15, X15
#endif
// Reserve space for spill slots.
NOP SP // disable vet stack checking

View File

@@ -352,11 +352,6 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
get_tls(R12)
MOVQ g(R12), R14
PXOR X15, X15
#ifdef GOEXPERIMENT_simd
CMPB internalcpu·X86+const_offsetX86HasAVX(SB), $1
JNE 2(PC)
VXORPS X15, X15, X15
#endif
// Reserve space for spill slots.
NOP SP // disable vet stack checking
@@ -382,11 +377,6 @@ TEXT runtime·sigprofNonGoWrapper<>(SB),NOSPLIT|NOFRAME,$0
get_tls(R12)
MOVQ g(R12), R14
PXOR X15, X15
#ifdef GOEXPERIMENT_simd
CMPB internalcpu·X86+const_offsetX86HasAVX(SB), $1
JNE 2(PC)
VXORPS X15, X15, X15
#endif
// Reserve space for spill slots.
NOP SP // disable vet stack checking

View File

@@ -310,11 +310,6 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
get_tls(R12)
MOVQ g(R12), R14
PXOR X15, X15
#ifdef GOEXPERIMENT_simd
CMPB internalcpu·X86+const_offsetX86HasAVX(SB), $1
JNE 2(PC)
VXORPS X15, X15, X15
#endif
// Reserve space for spill slots.
NOP SP // disable vet stack checking

View File

@@ -64,11 +64,6 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
get_tls(R12)
MOVQ g(R12), R14
PXOR X15, X15
#ifdef GOEXPERIMENT_simd
CMPB internalcpu·X86+const_offsetX86HasAVX(SB), $1
JNE 2(PC)
VXORPS X15, X15, X15
#endif
// Reserve space for spill slots.
NOP SP // disable vet stack checking

View File

@@ -32,11 +32,6 @@ TEXT sigtramp<>(SB),NOSPLIT,$0-0
// R14 is cleared in case there's a non-zero value in there
// if called from a non-go thread.
XORPS X15, X15
#ifdef GOEXPERIMENT_simd
CMPB internalcpu·X86+const_offsetX86HasAVX(SB), $1
JNE 2(PC)
VXORPS X15, X15, X15
#endif
XORQ R14, R14
get_tls(AX)