[dev.simd] simd/archsimd: 128- and 256-bit FMA operations do not require AVX-512

Currently, all FMA operations are marked as requiring AVX512, even on smaller vector widths. This is happening because the narrower FMA operations are marked as extension "FMA" in the XED. Since this extension doesn't start with "AVX", we filter them out very early in the XED process. However, this is just a quirk of naming: the FMA feature depends on the AVX feature, so it is part of AVX, even if it doesn't say so on the tin. Fix this by accepting the FMA extension and adding FMA to the table of CPU features. We also tweak internal/cpu slightly so it correctly enforces that the logical FMA feature depends on both the FMA and AVX CPUID flags. This actually *deletes* a lot of generated code because we no longer need the AVX-512 encoding of these 128- and 256-bit operations. Change-Id: I744a18d0be888f536ac034fe88b110347622be7e Reviewed-on: https://go-review.googlesource.com/c/go/+/736160 Auto-Submit: Austin Clements <austin@google.com> Reviewed-by: Junyang Shao <shaojunyang@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
[dev.simd] simd/archsimd/_gen/simdgen: feature implications
2026-01-29 23:22:06 +03:00 · 2026-01-13 11:45:23 -08:00 · 2026-01-13 09:25:01 -08:00 · 2026-01-02 19:42:53 -05:00 · 2026-01-02 12:16:34 -08:00 · 2026-01-02 12:14:17 -08:00
137 changed files with 14972 additions and 5998 deletions
--- a/codereview.cfg
+++ b/codereview.cfg
@@ -1 +1,2 @@
-branch: master
+branch: dev.simd
+parent-branch: master
--- a/src/archive/tar/reader_test.go
+++ b/src/archive/tar/reader_test.go
@@ -787,7 +787,7 @@ type readBadSeeker struct{ io.ReadSeeker }

 func (rbs *readBadSeeker) Seek(int64, int) (int64, error) { return 0, fmt.Errorf("illegal seek") }

-// TestReadTruncation test the ending condition on various truncated files and
+// TestReadTruncation tests the ending condition on various truncated files and
 // that truncated files are still detected even if the underlying io.Reader
 // satisfies io.Seeker.
 func TestReadTruncation(t *testing.T) {
--- a/src/archive/tar/stat_unix.go
+++ b/src/archive/tar/stat_unix.go
@@ -19,7 +19,7 @@ func init() {
 	sysStat = statUnix
 }

-// userMap and groupMap caches UID and GID lookups for performance reasons.
+// userMap and groupMap cache UID and GID lookups for performance reasons.
 // The downside is that renaming uname or gname by the OS never takes effect.
 var userMap, groupMap sync.Map // map[int]string

--- a/src/archive/tar/strconv.go
+++ b/src/archive/tar/strconv.go
@@ -312,7 +312,7 @@ func formatPAXRecord(k, v string) (string, error) {
 //	"%d %s=%s\n" % (size, key, value)
 //
 // Keys and values should be UTF-8, but the number of bad writers out there
-// forces us to be a more liberal.
+// forces us to be more liberal.
 // Thus, we only reject all keys with NUL, and only reject NULs in values
 // for the PAX version of the USTAR string fields.
 // The key must not contain an '=' character.
--- a/src/bytes/bytes_test.go
+++ b/src/bytes/bytes_test.go
@@ -961,7 +961,7 @@ func TestSplit(t *testing.T) {
 		if tt.n < 0 {
 			b := sliceOfString(Split([]byte(tt.s), []byte(tt.sep)))
 			if !slices.Equal(result, b) {
-				t.Errorf("Split disagrees withSplitN(%q, %q, %d) = %v; want %v", tt.s, tt.sep, tt.n, b, a)
+				t.Errorf("Split disagrees with SplitN(%q, %q, %d) = %v; want %v", tt.s, tt.sep, tt.n, b, a)
 			}
 		}
 		if len(a) > 0 {
@@ -1023,7 +1023,7 @@ func TestSplitAfter(t *testing.T) {
 		if tt.n < 0 {
 			b := sliceOfString(SplitAfter([]byte(tt.s), []byte(tt.sep)))
 			if !slices.Equal(result, b) {
-				t.Errorf("SplitAfter disagrees withSplitAfterN(%q, %q, %d) = %v; want %v", tt.s, tt.sep, tt.n, b, a)
+				t.Errorf("SplitAfter disagrees with SplitAfterN(%q, %q, %d) = %v; want %v", tt.s, tt.sep, tt.n, b, a)
 			}
 		}
 	}
--- a/src/cmd/asm/internal/asm/endtoend_test.go
+++ b/src/cmd/asm/internal/asm/endtoend_test.go
@@ -199,6 +199,11 @@ Diff:
 	}
 	obj.Flushplist(ctxt, pList, nil)

+	if !ok {
+		// If we've encountered errors, the output is unlikely to be sane.
+		t.FailNow()
+	}
+
 	for p := top; p != nil; p = p.Link {
 		if p.As == obj.ATEXT {
 			text = p.From.Sym
@@ -486,16 +491,35 @@ func TestPPC64EndToEnd(t *testing.T) {
 	}
 }

-func TestRISCVEndToEnd(t *testing.T) {
-	testEndToEnd(t, "riscv64", "riscv64")
+func testRISCV64AllProfiles(t *testing.T, testFn func(t *testing.T)) {
+	t.Helper()
+
+	defer func(orig int) { buildcfg.GORISCV64 = orig }(buildcfg.GORISCV64)
+
+	for _, goriscv64 := range []int{20, 22, 23} {
+		t.Run(fmt.Sprintf("rva%vu64", goriscv64), func(t *testing.T) {
+			buildcfg.GORISCV64 = goriscv64
+			testFn(t)
+		})
+	}
 }

-func TestRISCVErrors(t *testing.T) {
-	testErrors(t, "riscv64", "riscv64error")
+func TestRISCV64EndToEnd(t *testing.T) {
+	testRISCV64AllProfiles(t, func(t *testing.T) {
+		testEndToEnd(t, "riscv64", "riscv64")
+	})
 }

-func TestRISCVValidation(t *testing.T) {
-	testErrors(t, "riscv64", "riscv64validation")
+func TestRISCV64Errors(t *testing.T) {
+	testRISCV64AllProfiles(t, func(t *testing.T) {
+		testErrors(t, "riscv64", "riscv64error")
+	})
+}
+
+func TestRISCV64Validation(t *testing.T) {
+	testRISCV64AllProfiles(t, func(t *testing.T) {
+		testErrors(t, "riscv64", "riscv64validation")
+	})
 }

 func TestS390XEndToEnd(t *testing.T) {
--- a/src/cmd/cgo/internal/test/issue76861.go
+++ b/src/cmd/cgo/internal/test/issue76861.go
@@ -0,0 +1,12 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build cgo
+
+package cgotest
+
+// Issue 76861: No runtime test needed, make sure package
+// cmd/cgo/internal/test/issue76861 compiles without error.
+
+import _ "cmd/cgo/internal/test/issue76861"
--- a/src/cmd/cgo/internal/test/issue76861/a.go
+++ b/src/cmd/cgo/internal/test/issue76861/a.go
@@ -0,0 +1,13 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package issue76861
+
+// #cgo CFLAGS: -Wall -Werror
+// void issue76861(void) {}
+import "C"
+
+func Issue76861() {
+	C.issue76861()
+}
--- a/src/cmd/cgo/out.go
+++ b/src/cmd/cgo/out.go
@@ -783,13 +783,13 @@ func (p *Package) writeOutputFunc(fgcc *os.File, n *Name) {
 	// We're trying to write a gcc struct that matches gc's layout.
 	// Use packed attribute to force no padding in this struct in case
 	// gcc has different packing requirements.
-	fmt.Fprintf(fgcc, "\t%s %v *_cgo_a = v;\n", ctype, p.packedAttribute())
-	if n.FuncType.Result != nil {
+	tr := n.FuncType.Result
+	if (n.Kind != "macro" && len(n.FuncType.Params) > 0) || tr != nil {
+		fmt.Fprintf(fgcc, "\t%s %v *_cgo_a = v;\n", ctype, p.packedAttribute())
+	}
+	if tr != nil {
 		// Save the stack top for use below.
 		fmt.Fprintf(fgcc, "\tchar *_cgo_stktop = _cgo_topofstack();\n")
-	}
-	tr := n.FuncType.Result
-	if tr != nil {
 		fmt.Fprintf(fgcc, "\t__typeof__(_cgo_a->r) _cgo_r;\n")
 	}
 	fmt.Fprintf(fgcc, "\t_cgo_tsan_acquire();\n")
@@ -819,7 +819,7 @@ func (p *Package) writeOutputFunc(fgcc *os.File, n *Name) {
 		fmt.Fprintf(fgcc, "\t_cgo_errno = errno;\n")
 	}
 	fmt.Fprintf(fgcc, "\t_cgo_tsan_release();\n")
-	if n.FuncType.Result != nil {
+	if tr != nil {
 		// The cgo call may have caused a stack copy (via a callback).
 		// Adjust the return value pointer appropriately.
 		fmt.Fprintf(fgcc, "\t_cgo_a = (void*)((char*)_cgo_a + (_cgo_topofstack() - _cgo_stktop));\n")
--- a/src/cmd/compile/internal/amd64/simdssa.go
+++ b/src/cmd/compile/internal/amd64/simdssa.go
@@ -1,4 +1,4 @@
-// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT.
+// Code generated by 'simdgen -o godefs -goroot $GOROOT -xedPath $XED_PATH go.yaml types.yaml categories.yaml'; DO NOT EDIT.

 package amd64

@@ -175,7 +175,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMOVSQD128_128,
 		ssa.OpAMD64VPMOVSQD128_256,
 		ssa.OpAMD64VPMOVSQD256,
+		ssa.OpAMD64VPMOVUSWB128_128,
+		ssa.OpAMD64VPMOVUSWB128_256,
 		ssa.OpAMD64VPMOVUSWB256,
+		ssa.OpAMD64VPMOVUSDB128_128,
+		ssa.OpAMD64VPMOVUSDB128_256,
+		ssa.OpAMD64VPMOVUSDB128_512,
+		ssa.OpAMD64VPMOVUSQB128_128,
+		ssa.OpAMD64VPMOVUSQB128_256,
+		ssa.OpAMD64VPMOVUSQB128_512,
 		ssa.OpAMD64VPMOVUSDW128_128,
 		ssa.OpAMD64VPMOVUSDW128_256,
 		ssa.OpAMD64VPMOVUSDW256,
@@ -242,12 +250,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPADDQ256,
 		ssa.OpAMD64VPADDQ512,
 		ssa.OpAMD64VHADDPS128,
-		ssa.OpAMD64VHADDPS256,
 		ssa.OpAMD64VHADDPD128,
-		ssa.OpAMD64VHADDPD256,
 		ssa.OpAMD64VPHADDW128,
-		ssa.OpAMD64VPHADDW256,
 		ssa.OpAMD64VPHADDD128,
+		ssa.OpAMD64VHADDPS256,
+		ssa.OpAMD64VHADDPD256,
+		ssa.OpAMD64VPHADDW256,
 		ssa.OpAMD64VPHADDD256,
 		ssa.OpAMD64VPHADDSW128,
 		ssa.OpAMD64VPHADDSW256,
@@ -512,12 +520,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPSUBQ256,
 		ssa.OpAMD64VPSUBQ512,
 		ssa.OpAMD64VHSUBPS128,
-		ssa.OpAMD64VHSUBPS256,
 		ssa.OpAMD64VHSUBPD128,
-		ssa.OpAMD64VHSUBPD256,
 		ssa.OpAMD64VPHSUBW128,
-		ssa.OpAMD64VPHSUBW256,
 		ssa.OpAMD64VPHSUBD128,
+		ssa.OpAMD64VHSUBPS256,
+		ssa.OpAMD64VHSUBPD256,
+		ssa.OpAMD64VPHSUBW256,
 		ssa.OpAMD64VPHSUBD256,
 		ssa.OpAMD64VPHSUBSW128,
 		ssa.OpAMD64VPHSUBSW256,
@@ -731,12 +739,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPRORVQMasked128,
 		ssa.OpAMD64VPRORVQMasked256,
 		ssa.OpAMD64VPRORVQMasked512,
-		ssa.OpAMD64VPACKSSDWMasked128,
 		ssa.OpAMD64VPACKSSDWMasked256,
 		ssa.OpAMD64VPACKSSDWMasked512,
-		ssa.OpAMD64VPACKUSDWMasked128,
+		ssa.OpAMD64VPACKSSDWMasked128,
 		ssa.OpAMD64VPACKUSDWMasked256,
 		ssa.OpAMD64VPACKUSDWMasked512,
+		ssa.OpAMD64VPACKUSDWMasked128,
 		ssa.OpAMD64VSCALEFPSMasked128,
 		ssa.OpAMD64VSCALEFPSMasked256,
 		ssa.OpAMD64VSCALEFPSMasked512,
@@ -1010,7 +1018,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMOVSQDMasked128_128,
 		ssa.OpAMD64VPMOVSQDMasked128_256,
 		ssa.OpAMD64VPMOVSQDMasked256,
+		ssa.OpAMD64VPMOVUSWBMasked128_128,
+		ssa.OpAMD64VPMOVUSWBMasked128_256,
 		ssa.OpAMD64VPMOVUSWBMasked256,
+		ssa.OpAMD64VPMOVUSDBMasked128_128,
+		ssa.OpAMD64VPMOVUSDBMasked128_256,
+		ssa.OpAMD64VPMOVUSDBMasked128_512,
+		ssa.OpAMD64VPMOVUSQBMasked128_128,
+		ssa.OpAMD64VPMOVUSQBMasked128_256,
+		ssa.OpAMD64VPMOVUSQBMasked128_512,
 		ssa.OpAMD64VPMOVUSDWMasked128_128,
 		ssa.OpAMD64VPMOVUSDWMasked128_256,
 		ssa.OpAMD64VPMOVUSDWMasked256,
@@ -1308,12 +1324,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPERMI2Q256,
 		ssa.OpAMD64VPERMI2PD512,
 		ssa.OpAMD64VPERMI2Q512,
-		ssa.OpAMD64VPDPBUSD128,
-		ssa.OpAMD64VPDPBUSD256,
-		ssa.OpAMD64VPDPBUSD512,
-		ssa.OpAMD64VPDPBUSDS128,
-		ssa.OpAMD64VPDPBUSDS256,
-		ssa.OpAMD64VPDPBUSDS512,
 		ssa.OpAMD64VFMADD213PS128,
 		ssa.OpAMD64VFMADD213PS256,
 		ssa.OpAMD64VFMADD213PS512,
@@ -1430,12 +1440,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMADDUBSWMasked128Merging,
 		ssa.OpAMD64VPMADDUBSWMasked256Merging,
 		ssa.OpAMD64VPMADDUBSWMasked512Merging,
-		ssa.OpAMD64VPDPBUSDMasked128,
-		ssa.OpAMD64VPDPBUSDMasked256,
-		ssa.OpAMD64VPDPBUSDMasked512,
-		ssa.OpAMD64VPDPBUSDSMasked128,
-		ssa.OpAMD64VPDPBUSDSMasked256,
-		ssa.OpAMD64VPDPBUSDSMasked512,
 		ssa.OpAMD64VGF2P8MULBMasked128Merging,
 		ssa.OpAMD64VGF2P8MULBMasked256Merging,
 		ssa.OpAMD64VGF2P8MULBMasked512Merging,
@@ -1559,12 +1563,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPRORVQMasked128Merging,
 		ssa.OpAMD64VPRORVQMasked256Merging,
 		ssa.OpAMD64VPRORVQMasked512Merging,
-		ssa.OpAMD64VPACKSSDWMasked128Merging,
 		ssa.OpAMD64VPACKSSDWMasked256Merging,
 		ssa.OpAMD64VPACKSSDWMasked512Merging,
-		ssa.OpAMD64VPACKUSDWMasked128Merging,
+		ssa.OpAMD64VPACKSSDWMasked128Merging,
 		ssa.OpAMD64VPACKUSDWMasked256Merging,
 		ssa.OpAMD64VPACKUSDWMasked512Merging,
+		ssa.OpAMD64VPACKUSDWMasked128Merging,
 		ssa.OpAMD64VSCALEFPSMasked128Merging,
 		ssa.OpAMD64VSCALEFPSMasked256Merging,
 		ssa.OpAMD64VSCALEFPSMasked512Merging,
@@ -1955,25 +1959,11 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPERMI2Q256load,
 		ssa.OpAMD64VPERMI2PD512load,
 		ssa.OpAMD64VPERMI2Q512load,
-		ssa.OpAMD64VPDPBUSD512load,
-		ssa.OpAMD64VPDPBUSDS512load,
-		ssa.OpAMD64VFMADD213PS128load,
-		ssa.OpAMD64VFMADD213PS256load,
 		ssa.OpAMD64VFMADD213PS512load,
-		ssa.OpAMD64VFMADD213PD128load,
-		ssa.OpAMD64VFMADD213PD256load,
 		ssa.OpAMD64VFMADD213PD512load,
-		ssa.OpAMD64VFMADDSUB213PS128load,
-		ssa.OpAMD64VFMADDSUB213PS256load,
 		ssa.OpAMD64VFMADDSUB213PS512load,
-		ssa.OpAMD64VFMADDSUB213PD128load,
-		ssa.OpAMD64VFMADDSUB213PD256load,
 		ssa.OpAMD64VFMADDSUB213PD512load,
-		ssa.OpAMD64VFMSUBADD213PS128load,
-		ssa.OpAMD64VFMSUBADD213PS256load,
 		ssa.OpAMD64VFMSUBADD213PS512load,
-		ssa.OpAMD64VFMSUBADD213PD128load,
-		ssa.OpAMD64VFMSUBADD213PD256load,
 		ssa.OpAMD64VFMSUBADD213PD512load,
 		ssa.OpAMD64VPSHLDVD128load,
 		ssa.OpAMD64VPSHLDVD256load,
@@ -2004,12 +1994,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPERMI2QMasked256load,
 		ssa.OpAMD64VPERMI2PDMasked512load,
 		ssa.OpAMD64VPERMI2QMasked512load,
-		ssa.OpAMD64VPDPBUSDMasked128load,
-		ssa.OpAMD64VPDPBUSDMasked256load,
-		ssa.OpAMD64VPDPBUSDMasked512load,
-		ssa.OpAMD64VPDPBUSDSMasked128load,
-		ssa.OpAMD64VPDPBUSDSMasked256load,
-		ssa.OpAMD64VPDPBUSDSMasked512load,
 		ssa.OpAMD64VFMADD213PSMasked128load,
 		ssa.OpAMD64VFMADD213PSMasked256load,
 		ssa.OpAMD64VFMADD213PSMasked512load,
@@ -2146,12 +2130,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPRORVQMasked128load,
 		ssa.OpAMD64VPRORVQMasked256load,
 		ssa.OpAMD64VPRORVQMasked512load,
-		ssa.OpAMD64VPACKSSDWMasked128load,
 		ssa.OpAMD64VPACKSSDWMasked256load,
 		ssa.OpAMD64VPACKSSDWMasked512load,
-		ssa.OpAMD64VPACKUSDWMasked128load,
+		ssa.OpAMD64VPACKSSDWMasked128load,
 		ssa.OpAMD64VPACKUSDWMasked256load,
 		ssa.OpAMD64VPACKUSDWMasked512load,
+		ssa.OpAMD64VPACKUSDWMasked128load,
 		ssa.OpAMD64VSCALEFPSMasked128load,
 		ssa.OpAMD64VSCALEFPSMasked256load,
 		ssa.OpAMD64VSCALEFPSMasked512load,
@@ -2638,7 +2622,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMOVSQDMasked128_128Merging,
 		ssa.OpAMD64VPMOVSQDMasked128_256Merging,
 		ssa.OpAMD64VPMOVSQDMasked256Merging,
+		ssa.OpAMD64VPMOVUSWBMasked128_128Merging,
+		ssa.OpAMD64VPMOVUSWBMasked128_256Merging,
 		ssa.OpAMD64VPMOVUSWBMasked256Merging,
+		ssa.OpAMD64VPMOVUSDBMasked128_128Merging,
+		ssa.OpAMD64VPMOVUSDBMasked128_256Merging,
+		ssa.OpAMD64VPMOVUSDBMasked128_512Merging,
+		ssa.OpAMD64VPMOVUSQBMasked128_128Merging,
+		ssa.OpAMD64VPMOVUSQBMasked128_256Merging,
+		ssa.OpAMD64VPMOVUSQBMasked128_512Merging,
 		ssa.OpAMD64VPMOVUSDWMasked128_128Merging,
 		ssa.OpAMD64VPMOVUSDWMasked128_256Merging,
 		ssa.OpAMD64VPMOVUSDWMasked256Merging,
@@ -3021,18 +3013,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMADDUBSWMasked128,
 		ssa.OpAMD64VPMADDUBSWMasked256,
 		ssa.OpAMD64VPMADDUBSWMasked512,
-		ssa.OpAMD64VPDPBUSDMasked128,
-		ssa.OpAMD64VPDPBUSDMasked128load,
-		ssa.OpAMD64VPDPBUSDMasked256,
-		ssa.OpAMD64VPDPBUSDMasked256load,
-		ssa.OpAMD64VPDPBUSDMasked512,
-		ssa.OpAMD64VPDPBUSDMasked512load,
-		ssa.OpAMD64VPDPBUSDSMasked128,
-		ssa.OpAMD64VPDPBUSDSMasked128load,
-		ssa.OpAMD64VPDPBUSDSMasked256,
-		ssa.OpAMD64VPDPBUSDSMasked256load,
-		ssa.OpAMD64VPDPBUSDSMasked512,
-		ssa.OpAMD64VPDPBUSDSMasked512load,
 		ssa.OpAMD64VEXPANDPSMasked128,
 		ssa.OpAMD64VEXPANDPSMasked256,
 		ssa.OpAMD64VEXPANDPSMasked512,
@@ -3415,12 +3395,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMOVSQBMasked128_128,
 		ssa.OpAMD64VPMOVSQBMasked128_256,
 		ssa.OpAMD64VPMOVSQBMasked128_512,
-		ssa.OpAMD64VPACKSSDWMasked128,
-		ssa.OpAMD64VPACKSSDWMasked128load,
 		ssa.OpAMD64VPACKSSDWMasked256,
 		ssa.OpAMD64VPACKSSDWMasked256load,
 		ssa.OpAMD64VPACKSSDWMasked512,
 		ssa.OpAMD64VPACKSSDWMasked512load,
+		ssa.OpAMD64VPACKSSDWMasked128,
+		ssa.OpAMD64VPACKSSDWMasked128load,
 		ssa.OpAMD64VPMOVSDWMasked128_128,
 		ssa.OpAMD64VPMOVSDWMasked128_256,
 		ssa.OpAMD64VPMOVSDWMasked256,
@@ -3430,13 +3410,21 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMOVSQDMasked128_128,
 		ssa.OpAMD64VPMOVSQDMasked128_256,
 		ssa.OpAMD64VPMOVSQDMasked256,
+		ssa.OpAMD64VPMOVUSWBMasked128_128,
+		ssa.OpAMD64VPMOVUSWBMasked128_256,
 		ssa.OpAMD64VPMOVUSWBMasked256,
-		ssa.OpAMD64VPACKUSDWMasked128,
-		ssa.OpAMD64VPACKUSDWMasked128load,
+		ssa.OpAMD64VPMOVUSDBMasked128_128,
+		ssa.OpAMD64VPMOVUSDBMasked128_256,
+		ssa.OpAMD64VPMOVUSDBMasked128_512,
+		ssa.OpAMD64VPMOVUSQBMasked128_128,
+		ssa.OpAMD64VPMOVUSQBMasked128_256,
+		ssa.OpAMD64VPMOVUSQBMasked128_512,
 		ssa.OpAMD64VPACKUSDWMasked256,
 		ssa.OpAMD64VPACKUSDWMasked256load,
 		ssa.OpAMD64VPACKUSDWMasked512,
 		ssa.OpAMD64VPACKUSDWMasked512load,
+		ssa.OpAMD64VPACKUSDWMasked128,
+		ssa.OpAMD64VPACKUSDWMasked128load,
 		ssa.OpAMD64VPMOVUSDWMasked128_128,
 		ssa.OpAMD64VPMOVUSDWMasked128_256,
 		ssa.OpAMD64VPMOVUSDWMasked256,
--- a/src/cmd/compile/internal/inline/inl.go
+++ b/src/cmd/compile/internal/inline/inl.go
@@ -516,6 +516,9 @@ opSwitch:
 						break opSwitch
 					case "panicrangestate":
 						cheap = true
+					case "deferrangefunc":
+						v.reason = "defer call in range func"
+						return true
 					}
 				}
 			}
--- a/src/cmd/compile/internal/ssa/_gen/AMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/AMD64.rules
@@ -1730,6 +1730,13 @@
 // Misc
 (IsZeroVec x) => (SETEQ (VPTEST x x))

+(IsNaNFloat32x4  x) => (VCMPPS128 [3] x x)
+(IsNaNFloat32x8  x) => (VCMPPS256 [3] x x)
+(IsNaNFloat32x16 x) => (VPMOVMToVec32x16 (VCMPPS512 [3] x x))
+(IsNaNFloat64x2  x) => (VCMPPD128 [3] x x)
+(IsNaNFloat64x4  x) => (VCMPPD256 [3] x x)
+(IsNaNFloat64x8  x) => (VPMOVMToVec64x8 (VCMPPD512 [3] x x))
+
 // SIMD vector K-masked loads and stores

 (LoadMasked64 <t> ptr mask mem) && t.Size() == 64 => (VPMASK64load512 ptr (VPMOVVec64x8ToM  <types.TypeMask> mask) mem)
@@ -1818,10 +1825,10 @@
 (EQ (VPTEST x:(VPANDN(128|256) j k) y) yes no) && x == y && x.Uses == 2 => (ULT (VPTEST k j) yes no) // AndNot has swapped its operand order
 (EQ (VPTEST x:(VPANDN(D|Q)512 j k) y) yes no) && x == y && x.Uses == 2 => (ULT (VPTEST k j) yes no) // AndNot has swapped its operand order

-// DotProductQuadruple optimizations
-(VPADDD128 (VPDPBUSD128 (Zero128 <t>) x y) z) => (VPDPBUSD128 <t> z x y)
-(VPADDD256 (VPDPBUSD256 (Zero256 <t>) x y) z) => (VPDPBUSD256 <t> z x y)
-(VPADDD512 (VPDPBUSD512 (Zero512 <t>) x y) z) => (VPDPBUSD512 <t> z x y)
-(VPADDD128 (VPDPBUSDS128 (Zero128 <t>) x y) z) => (VPDPBUSDS128 <t> z x y)
-(VPADDD256 (VPDPBUSDS256 (Zero256 <t>) x y) z) => (VPDPBUSDS256 <t> z x y)
-(VPADDD512 (VPDPBUSDS512 (Zero512 <t>) x y) z) => (VPDPBUSDS512 <t> z x y)
+// optimize x.IsNaN().Or(y.IsNaN())
+(VPOR128 (VCMPP(S|D)128 [3] x x) (VCMPP(S|D)128 [3] y y)) => (VCMPP(S|D)128 [3] x y)
+(VPOR256 (VCMPP(S|D)256 [3] x x) (VCMPP(S|D)256 [3] y y)) => (VCMPP(S|D)256 [3] x y)
+(VPORD512 (VPMOVMToVec32x16 (VCMPPS512 [3] x x)) (VPMOVMToVec32x16 (VCMPPS512 [3] y y))) =>
+	(VPMOVMToVec32x16 (VCMPPS512 [3] x y))
+(VPORD512 (VPMOVMToVec64x8  (VCMPPD512 [3] x x)) (VPMOVMToVec64x8  (VCMPPD512 [3] y y))) =>
+	(VPMOVMToVec64x8  (VCMPPD512 [3] x y))
--- a/src/cmd/compile/internal/ssa/_gen/genericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/genericOps.go
@@ -715,6 +715,14 @@ var genericOps = []opData{

 	// Returns true if arg0 is all zero.
 	{name: "IsZeroVec", argLength: 1},
+
+	// Returns a mask indicating whether arg0's elements are NaN.
+	{name: "IsNaNFloat32x4", argLength: 1},
+	{name: "IsNaNFloat32x8", argLength: 1},
+	{name: "IsNaNFloat32x16", argLength: 1},
+	{name: "IsNaNFloat64x2", argLength: 1},
+	{name: "IsNaNFloat64x4", argLength: 1},
+	{name: "IsNaNFloat64x8", argLength: 1},
 }

 //     kind          controls          successors   implicit exit
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@@ -1,4 +1,4 @@
-// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT.
+// Code generated by 'simdgen -o godefs -goroot $GOROOT -xedPath $XED_PATH go.yaml types.yaml categories.yaml'; DO NOT EDIT.

 (AESDecryptLastRoundUint8x16 ...) => (VAESDECLAST128 ...)
 (AESDecryptLastRoundUint8x32 ...) => (VAESDECLAST256 ...)
@@ -57,19 +57,19 @@
 (AddUint64x4 ...) => (VPADDQ256 ...)
 (AddUint64x8 ...) => (VPADDQ512 ...)
 (AddPairsFloat32x4 ...) => (VHADDPS128 ...)
-(AddPairsFloat32x8 ...) => (VHADDPS256 ...)
 (AddPairsFloat64x2 ...) => (VHADDPD128 ...)
-(AddPairsFloat64x4 ...) => (VHADDPD256 ...)
 (AddPairsInt16x8 ...) => (VPHADDW128 ...)
-(AddPairsInt16x16 ...) => (VPHADDW256 ...)
 (AddPairsInt32x4 ...) => (VPHADDD128 ...)
-(AddPairsInt32x8 ...) => (VPHADDD256 ...)
 (AddPairsUint16x8 ...) => (VPHADDW128 ...)
-(AddPairsUint16x16 ...) => (VPHADDW256 ...)
 (AddPairsUint32x4 ...) => (VPHADDD128 ...)
-(AddPairsUint32x8 ...) => (VPHADDD256 ...)
+(AddPairsGroupedFloat32x8 ...) => (VHADDPS256 ...)
+(AddPairsGroupedFloat64x4 ...) => (VHADDPD256 ...)
+(AddPairsGroupedInt16x16 ...) => (VPHADDW256 ...)
+(AddPairsGroupedInt32x8 ...) => (VPHADDD256 ...)
+(AddPairsGroupedUint16x16 ...) => (VPHADDW256 ...)
+(AddPairsGroupedUint32x8 ...) => (VPHADDD256 ...)
 (AddPairsSaturatedInt16x8 ...) => (VPHADDSW128 ...)
-(AddPairsSaturatedInt16x16 ...) => (VPHADDSW256 ...)
+(AddPairsSaturatedGroupedInt16x16 ...) => (VPHADDSW256 ...)
 (AddSaturatedInt8x16 ...) => (VPADDSB128 ...)
 (AddSaturatedInt8x32 ...) => (VPADDSB256 ...)
 (AddSaturatedInt8x64 ...) => (VPADDSB512 ...)
@@ -316,12 +316,6 @@
 (DotProductPairsSaturatedUint8x16 ...) => (VPMADDUBSW128 ...)
 (DotProductPairsSaturatedUint8x32 ...) => (VPMADDUBSW256 ...)
 (DotProductPairsSaturatedUint8x64 ...) => (VPMADDUBSW512 ...)
-(DotProductQuadrupleInt32x4 ...) => (VPDPBUSD128 ...)
-(DotProductQuadrupleInt32x8 ...) => (VPDPBUSD256 ...)
-(DotProductQuadrupleInt32x16 ...) => (VPDPBUSD512 ...)
-(DotProductQuadrupleSaturatedInt32x4 ...) => (VPDPBUSDS128 ...)
-(DotProductQuadrupleSaturatedInt32x8 ...) => (VPDPBUSDS256 ...)
-(DotProductQuadrupleSaturatedInt32x16 ...) => (VPDPBUSDS512 ...)
 (EqualFloat32x4 x y) => (VCMPPS128 [0] x y)
 (EqualFloat32x8 x y) => (VCMPPS256 [0] x y)
 (EqualFloat32x16 x y) => (VPMOVMToVec32x16 (VCMPPS512 [0] x y))
@@ -382,26 +376,26 @@
 (ExpandUint64x2 x mask) => (VPEXPANDQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
 (ExpandUint64x4 x mask) => (VPEXPANDQMasked256 x (VPMOVVec64x4ToM <types.TypeMask> mask))
 (ExpandUint64x8 x mask) => (VPEXPANDQMasked512 x (VPMOVVec64x8ToM <types.TypeMask> mask))
-(ExtendLo2ToInt64x2Int8x16 ...) => (VPMOVSXBQ128 ...)
-(ExtendLo2ToInt64x2Int16x8 ...) => (VPMOVSXWQ128 ...)
-(ExtendLo2ToInt64x2Int32x4 ...) => (VPMOVSXDQ128 ...)
-(ExtendLo2ToUint64x2Uint8x16 ...) => (VPMOVZXBQ128 ...)
-(ExtendLo2ToUint64x2Uint16x8 ...) => (VPMOVZXWQ128 ...)
-(ExtendLo2ToUint64x2Uint32x4 ...) => (VPMOVZXDQ128 ...)
-(ExtendLo4ToInt32x4Int8x16 ...) => (VPMOVSXBD128 ...)
-(ExtendLo4ToInt32x4Int16x8 ...) => (VPMOVSXWD128 ...)
-(ExtendLo4ToInt64x4Int8x16 ...) => (VPMOVSXBQ256 ...)
-(ExtendLo4ToInt64x4Int16x8 ...) => (VPMOVSXWQ256 ...)
-(ExtendLo4ToUint32x4Uint8x16 ...) => (VPMOVZXBD128 ...)
-(ExtendLo4ToUint32x4Uint16x8 ...) => (VPMOVZXWD128 ...)
-(ExtendLo4ToUint64x4Uint8x16 ...) => (VPMOVZXBQ256 ...)
-(ExtendLo4ToUint64x4Uint16x8 ...) => (VPMOVZXWQ256 ...)
-(ExtendLo8ToInt16x8Int8x16 ...) => (VPMOVSXBW128 ...)
-(ExtendLo8ToInt32x8Int8x16 ...) => (VPMOVSXBD256 ...)
-(ExtendLo8ToInt64x8Int8x16 ...) => (VPMOVSXBQ512 ...)
-(ExtendLo8ToUint16x8Uint8x16 ...) => (VPMOVZXBW128 ...)
-(ExtendLo8ToUint32x8Uint8x16 ...) => (VPMOVZXBD256 ...)
-(ExtendLo8ToUint64x8Uint8x16 ...) => (VPMOVZXBQ512 ...)
+(ExtendLo2ToInt64Int8x16 ...) => (VPMOVSXBQ128 ...)
+(ExtendLo2ToInt64Int16x8 ...) => (VPMOVSXWQ128 ...)
+(ExtendLo2ToInt64Int32x4 ...) => (VPMOVSXDQ128 ...)
+(ExtendLo2ToUint64Uint8x16 ...) => (VPMOVZXBQ128 ...)
+(ExtendLo2ToUint64Uint16x8 ...) => (VPMOVZXWQ128 ...)
+(ExtendLo2ToUint64Uint32x4 ...) => (VPMOVZXDQ128 ...)
+(ExtendLo4ToInt32Int8x16 ...) => (VPMOVSXBD128 ...)
+(ExtendLo4ToInt32Int16x8 ...) => (VPMOVSXWD128 ...)
+(ExtendLo4ToInt64Int8x16 ...) => (VPMOVSXBQ256 ...)
+(ExtendLo4ToInt64Int16x8 ...) => (VPMOVSXWQ256 ...)
+(ExtendLo4ToUint32Uint8x16 ...) => (VPMOVZXBD128 ...)
+(ExtendLo4ToUint32Uint16x8 ...) => (VPMOVZXWD128 ...)
+(ExtendLo4ToUint64Uint8x16 ...) => (VPMOVZXBQ256 ...)
+(ExtendLo4ToUint64Uint16x8 ...) => (VPMOVZXWQ256 ...)
+(ExtendLo8ToInt16Int8x16 ...) => (VPMOVSXBW128 ...)
+(ExtendLo8ToInt32Int8x16 ...) => (VPMOVSXBD256 ...)
+(ExtendLo8ToInt64Int8x16 ...) => (VPMOVSXBQ512 ...)
+(ExtendLo8ToUint16Uint8x16 ...) => (VPMOVZXBW128 ...)
+(ExtendLo8ToUint32Uint8x16 ...) => (VPMOVZXBD256 ...)
+(ExtendLo8ToUint64Uint8x16 ...) => (VPMOVZXBQ512 ...)
 (ExtendToInt16Int8x16 ...) => (VPMOVSXBW256 ...)
 (ExtendToInt16Int8x32 ...) => (VPMOVSXBW512 ...)
 (ExtendToInt32Int8x16 ...) => (VPMOVSXBD512 ...)
@@ -565,12 +559,6 @@
 (InterleaveLoGroupedUint32x16 ...) => (VPUNPCKLDQ512 ...)
 (InterleaveLoGroupedUint64x4 ...) => (VPUNPCKLQDQ256 ...)
 (InterleaveLoGroupedUint64x8 ...) => (VPUNPCKLQDQ512 ...)
-(IsNanFloat32x4 x y) => (VCMPPS128 [3] x y)
-(IsNanFloat32x8 x y) => (VCMPPS256 [3] x y)
-(IsNanFloat32x16 x y) => (VPMOVMToVec32x16 (VCMPPS512 [3] x y))
-(IsNanFloat64x2 x y) => (VCMPPD128 [3] x y)
-(IsNanFloat64x4 x y) => (VCMPPD256 [3] x y)
-(IsNanFloat64x8 x y) => (VPMOVMToVec64x8 (VCMPPD512 [3] x y))
 (LeadingZerosInt32x4 ...) => (VPLZCNTD128 ...)
 (LeadingZerosInt32x8 ...) => (VPLZCNTD256 ...)
 (LeadingZerosInt32x16 ...) => (VPLZCNTD512 ...)
@@ -914,29 +902,29 @@
 (SaturateToInt16Int64x4 ...) => (VPMOVSQW128_256 ...)
 (SaturateToInt16Int64x8 ...) => (VPMOVSQW128_512 ...)
 (SaturateToInt16ConcatInt32x4 ...) => (VPACKSSDW128 ...)
-(SaturateToInt16ConcatInt32x8 ...) => (VPACKSSDW256 ...)
-(SaturateToInt16ConcatInt32x16 ...) => (VPACKSSDW512 ...)
+(SaturateToInt16ConcatGroupedInt32x8 ...) => (VPACKSSDW256 ...)
+(SaturateToInt16ConcatGroupedInt32x16 ...) => (VPACKSSDW512 ...)
 (SaturateToInt32Int64x2 ...) => (VPMOVSQD128_128 ...)
 (SaturateToInt32Int64x4 ...) => (VPMOVSQD128_256 ...)
 (SaturateToInt32Int64x8 ...) => (VPMOVSQD256 ...)
-(SaturateToUint8Int16x8 ...) => (VPMOVSWB128_128 ...)
-(SaturateToUint8Int16x16 ...) => (VPMOVSWB128_256 ...)
-(SaturateToUint8Int32x4 ...) => (VPMOVSDB128_128 ...)
-(SaturateToUint8Int32x8 ...) => (VPMOVSDB128_256 ...)
-(SaturateToUint8Int32x16 ...) => (VPMOVSDB128_512 ...)
-(SaturateToUint8Int64x2 ...) => (VPMOVSQB128_128 ...)
-(SaturateToUint8Int64x4 ...) => (VPMOVSQB128_256 ...)
-(SaturateToUint8Int64x8 ...) => (VPMOVSQB128_512 ...)
+(SaturateToUint8Uint16x8 ...) => (VPMOVUSWB128_128 ...)
+(SaturateToUint8Uint16x16 ...) => (VPMOVUSWB128_256 ...)
 (SaturateToUint8Uint16x32 ...) => (VPMOVUSWB256 ...)
+(SaturateToUint8Uint32x4 ...) => (VPMOVUSDB128_128 ...)
+(SaturateToUint8Uint32x8 ...) => (VPMOVUSDB128_256 ...)
+(SaturateToUint8Uint32x16 ...) => (VPMOVUSDB128_512 ...)
+(SaturateToUint8Uint64x2 ...) => (VPMOVUSQB128_128 ...)
+(SaturateToUint8Uint64x4 ...) => (VPMOVUSQB128_256 ...)
+(SaturateToUint8Uint64x8 ...) => (VPMOVUSQB128_512 ...)
 (SaturateToUint16Uint32x4 ...) => (VPMOVUSDW128_128 ...)
 (SaturateToUint16Uint32x8 ...) => (VPMOVUSDW128_256 ...)
 (SaturateToUint16Uint32x16 ...) => (VPMOVUSDW256 ...)
 (SaturateToUint16Uint64x2 ...) => (VPMOVUSQW128_128 ...)
 (SaturateToUint16Uint64x4 ...) => (VPMOVUSQW128_256 ...)
 (SaturateToUint16Uint64x8 ...) => (VPMOVUSQW128_512 ...)
-(SaturateToUint16ConcatUint32x4 ...) => (VPACKUSDW128 ...)
-(SaturateToUint16ConcatUint32x8 ...) => (VPACKUSDW256 ...)
-(SaturateToUint16ConcatUint32x16 ...) => (VPACKUSDW512 ...)
+(SaturateToUint16ConcatInt32x4 ...) => (VPACKUSDW128 ...)
+(SaturateToUint16ConcatGroupedInt32x8 ...) => (VPACKUSDW256 ...)
+(SaturateToUint16ConcatGroupedInt32x16 ...) => (VPACKUSDW512 ...)
 (SaturateToUint32Uint64x2 ...) => (VPMOVUSQD128_128 ...)
 (SaturateToUint32Uint64x4 ...) => (VPMOVUSQD128_256 ...)
 (SaturateToUint32Uint64x8 ...) => (VPMOVUSQD256 ...)
@@ -1223,19 +1211,19 @@
 (SubUint64x4 ...) => (VPSUBQ256 ...)
 (SubUint64x8 ...) => (VPSUBQ512 ...)
 (SubPairsFloat32x4 ...) => (VHSUBPS128 ...)
-(SubPairsFloat32x8 ...) => (VHSUBPS256 ...)
 (SubPairsFloat64x2 ...) => (VHSUBPD128 ...)
-(SubPairsFloat64x4 ...) => (VHSUBPD256 ...)
 (SubPairsInt16x8 ...) => (VPHSUBW128 ...)
-(SubPairsInt16x16 ...) => (VPHSUBW256 ...)
 (SubPairsInt32x4 ...) => (VPHSUBD128 ...)
-(SubPairsInt32x8 ...) => (VPHSUBD256 ...)
 (SubPairsUint16x8 ...) => (VPHSUBW128 ...)
-(SubPairsUint16x16 ...) => (VPHSUBW256 ...)
 (SubPairsUint32x4 ...) => (VPHSUBD128 ...)
-(SubPairsUint32x8 ...) => (VPHSUBD256 ...)
+(SubPairsGroupedFloat32x8 ...) => (VHSUBPS256 ...)
+(SubPairsGroupedFloat64x4 ...) => (VHSUBPD256 ...)
+(SubPairsGroupedInt16x16 ...) => (VPHSUBW256 ...)
+(SubPairsGroupedInt32x8 ...) => (VPHSUBD256 ...)
+(SubPairsGroupedUint16x16 ...) => (VPHSUBW256 ...)
+(SubPairsGroupedUint32x8 ...) => (VPHSUBD256 ...)
 (SubPairsSaturatedInt16x8 ...) => (VPHSUBSW128 ...)
-(SubPairsSaturatedInt16x16 ...) => (VPHSUBSW256 ...)
+(SubPairsSaturatedGroupedInt16x16 ...) => (VPHSUBSW256 ...)
 (SubSaturatedInt8x16 ...) => (VPSUBSB128 ...)
 (SubSaturatedInt8x32 ...) => (VPSUBSB256 ...)
 (SubSaturatedInt8x64 ...) => (VPSUBSB512 ...)
@@ -1547,12 +1535,6 @@
 (VMOVDQU16Masked128 (VPMADDUBSW128 x y) mask) => (VPMADDUBSWMasked128 x y mask)
 (VMOVDQU16Masked256 (VPMADDUBSW256 x y) mask) => (VPMADDUBSWMasked256 x y mask)
 (VMOVDQU16Masked512 (VPMADDUBSW512 x y) mask) => (VPMADDUBSWMasked512 x y mask)
-(VMOVDQU32Masked128 (VPDPBUSD128 x y z) mask) => (VPDPBUSDMasked128 x y z mask)
-(VMOVDQU32Masked256 (VPDPBUSD256 x y z) mask) => (VPDPBUSDMasked256 x y z mask)
-(VMOVDQU32Masked512 (VPDPBUSD512 x y z) mask) => (VPDPBUSDMasked512 x y z mask)
-(VMOVDQU32Masked128 (VPDPBUSDS128 x y z) mask) => (VPDPBUSDSMasked128 x y z mask)
-(VMOVDQU32Masked256 (VPDPBUSDS256 x y z) mask) => (VPDPBUSDSMasked256 x y z mask)
-(VMOVDQU32Masked512 (VPDPBUSDS512 x y z) mask) => (VPDPBUSDSMasked512 x y z mask)
 (VMOVDQU8Masked128 (VPMOVSXBQ128 x) mask) => (VPMOVSXBQMasked128 x mask)
 (VMOVDQU16Masked128 (VPMOVSXWQ128 x) mask) => (VPMOVSXWQMasked128 x mask)
 (VMOVDQU32Masked128 (VPMOVSXDQ128 x) mask) => (VPMOVSXDQMasked128 x mask)
@@ -1775,9 +1757,9 @@
 (VMOVDQU64Masked128 (VPMOVSQB128_128 x) mask) => (VPMOVSQBMasked128_128 x mask)
 (VMOVDQU64Masked256 (VPMOVSQB128_256 x) mask) => (VPMOVSQBMasked128_256 x mask)
 (VMOVDQU64Masked512 (VPMOVSQB128_512 x) mask) => (VPMOVSQBMasked128_512 x mask)
-(VMOVDQU32Masked128 (VPACKSSDW128 x y) mask) => (VPACKSSDWMasked128 x y mask)
 (VMOVDQU32Masked256 (VPACKSSDW256 x y) mask) => (VPACKSSDWMasked256 x y mask)
 (VMOVDQU32Masked512 (VPACKSSDW512 x y) mask) => (VPACKSSDWMasked512 x y mask)
+(VMOVDQU32Masked128 (VPACKSSDW128 x y) mask) => (VPACKSSDWMasked128 x y mask)
 (VMOVDQU32Masked128 (VPMOVSDW128_128 x) mask) => (VPMOVSDWMasked128_128 x mask)
 (VMOVDQU32Masked256 (VPMOVSDW128_256 x) mask) => (VPMOVSDWMasked128_256 x mask)
 (VMOVDQU32Masked256 (VPMOVSDW256 x) mask) => (VPMOVSDWMasked256 x mask)
@@ -1787,10 +1769,18 @@
 (VMOVDQU64Masked128 (VPMOVSQD128_128 x) mask) => (VPMOVSQDMasked128_128 x mask)
 (VMOVDQU64Masked256 (VPMOVSQD128_256 x) mask) => (VPMOVSQDMasked128_256 x mask)
 (VMOVDQU64Masked256 (VPMOVSQD256 x) mask) => (VPMOVSQDMasked256 x mask)
+(VMOVDQU16Masked128 (VPMOVUSWB128_128 x) mask) => (VPMOVUSWBMasked128_128 x mask)
+(VMOVDQU16Masked256 (VPMOVUSWB128_256 x) mask) => (VPMOVUSWBMasked128_256 x mask)
 (VMOVDQU16Masked256 (VPMOVUSWB256 x) mask) => (VPMOVUSWBMasked256 x mask)
-(VMOVDQU32Masked128 (VPACKUSDW128 x y) mask) => (VPACKUSDWMasked128 x y mask)
+(VMOVDQU32Masked128 (VPMOVUSDB128_128 x) mask) => (VPMOVUSDBMasked128_128 x mask)
+(VMOVDQU32Masked256 (VPMOVUSDB128_256 x) mask) => (VPMOVUSDBMasked128_256 x mask)
+(VMOVDQU32Masked512 (VPMOVUSDB128_512 x) mask) => (VPMOVUSDBMasked128_512 x mask)
+(VMOVDQU64Masked128 (VPMOVUSQB128_128 x) mask) => (VPMOVUSQBMasked128_128 x mask)
+(VMOVDQU64Masked256 (VPMOVUSQB128_256 x) mask) => (VPMOVUSQBMasked128_256 x mask)
+(VMOVDQU64Masked512 (VPMOVUSQB128_512 x) mask) => (VPMOVUSQBMasked128_512 x mask)
 (VMOVDQU32Masked256 (VPACKUSDW256 x y) mask) => (VPACKUSDWMasked256 x y mask)
 (VMOVDQU32Masked512 (VPACKUSDW512 x y) mask) => (VPACKUSDWMasked512 x y mask)
+(VMOVDQU32Masked128 (VPACKUSDW128 x y) mask) => (VPACKUSDWMasked128 x y mask)
 (VMOVDQU32Masked128 (VPMOVUSDW128_128 x) mask) => (VPMOVUSDWMasked128_128 x mask)
 (VMOVDQU32Masked256 (VPMOVUSDW128_256 x) mask) => (VPMOVUSDWMasked128_256 x mask)
 (VMOVDQU32Masked256 (VPMOVUSDW256 x) mask) => (VPMOVUSDWMasked256 x mask)
@@ -2018,6 +2008,7 @@
 (VPBLENDMDMasked512 dst (VPMOVDW256 x) mask) => (VPMOVDWMasked256Merging dst x mask)
 (VPBLENDMDMasked512 dst (VPMOVSDB128_512 x) mask) => (VPMOVSDBMasked128_512Merging dst x mask)
 (VPBLENDMDMasked512 dst (VPMOVSDW256 x) mask) => (VPMOVSDWMasked256Merging dst x mask)
+(VPBLENDMDMasked512 dst (VPMOVUSDB128_512 x) mask) => (VPMOVUSDBMasked128_512Merging dst x mask)
 (VPBLENDMDMasked512 dst (VPMOVUSDW256 x) mask) => (VPMOVUSDWMasked256Merging dst x mask)
 (VPBLENDMDMasked512 dst (VPMULLD512 x y) mask) => (VPMULLDMasked512Merging dst x y mask)
 (VPBLENDMDMasked512 dst (VPOPCNTD512 x) mask) => (VPOPCNTDMasked512Merging dst x mask)
@@ -2071,6 +2062,7 @@
 (VPBLENDMQMasked512 dst (VPMOVSQB128_512 x) mask) => (VPMOVSQBMasked128_512Merging dst x mask)
 (VPBLENDMQMasked512 dst (VPMOVSQD256 x) mask) => (VPMOVSQDMasked256Merging dst x mask)
 (VPBLENDMQMasked512 dst (VPMOVSQW128_512 x) mask) => (VPMOVSQWMasked128_512Merging dst x mask)
+(VPBLENDMQMasked512 dst (VPMOVUSQB128_512 x) mask) => (VPMOVUSQBMasked128_512Merging dst x mask)
 (VPBLENDMQMasked512 dst (VPMOVUSQD256 x) mask) => (VPMOVUSQDMasked256Merging dst x mask)
 (VPBLENDMQMasked512 dst (VPMOVUSQW128_512 x) mask) => (VPMOVUSQWMasked128_512Merging dst x mask)
 (VPBLENDMQMasked512 dst (VPMULLQ512 x y) mask) => (VPMULLQMasked512Merging dst x y mask)
@@ -2235,9 +2227,12 @@
 (VPBLENDVB128 dst (VPMOVSXWQ128 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVSXWQMasked128Merging dst x (VPMOVVec16x8ToM <types.TypeMask> mask))
 (VPBLENDVB128 dst (VPMOVSXWQ256 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVSXWQMasked256Merging dst x (VPMOVVec16x8ToM <types.TypeMask> mask))
 (VPBLENDVB128 dst (VPMOVSXWQ512 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVSXWQMasked512Merging dst x (VPMOVVec16x8ToM <types.TypeMask> mask))
+(VPBLENDVB128 dst (VPMOVUSDB128_128 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSDBMasked128_128Merging dst x (VPMOVVec32x4ToM <types.TypeMask> mask))
 (VPBLENDVB128 dst (VPMOVUSDW128_128 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSDWMasked128_128Merging dst x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(VPBLENDVB128 dst (VPMOVUSQB128_128 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSQBMasked128_128Merging dst x (VPMOVVec64x2ToM <types.TypeMask> mask))
 (VPBLENDVB128 dst (VPMOVUSQD128_128 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSQDMasked128_128Merging dst x (VPMOVVec64x2ToM <types.TypeMask> mask))
 (VPBLENDVB128 dst (VPMOVUSQW128_128 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSQWMasked128_128Merging dst x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(VPBLENDVB128 dst (VPMOVUSWB128_128 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSWBMasked128_128Merging dst x (VPMOVVec16x8ToM <types.TypeMask> mask))
 (VPBLENDVB128 dst (VPMOVWB128_128 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVWBMasked128_128Merging dst x (VPMOVVec16x8ToM <types.TypeMask> mask))
 (VPBLENDVB128 dst (VPMOVZXBD128 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVZXBDMasked128Merging dst x (VPMOVVec8x16ToM <types.TypeMask> mask))
 (VPBLENDVB128 dst (VPMOVZXBD256 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVZXBDMasked256Merging dst x (VPMOVVec8x16ToM <types.TypeMask> mask))
@@ -2396,9 +2391,12 @@
 (VPBLENDVB256 dst (VPMOVSXBW512 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVSXBWMasked512Merging dst x (VPMOVVec8x32ToM <types.TypeMask> mask))
 (VPBLENDVB256 dst (VPMOVSXDQ512 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVSXDQMasked512Merging dst x (VPMOVVec32x8ToM <types.TypeMask> mask))
 (VPBLENDVB256 dst (VPMOVSXWD512 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVSXWDMasked512Merging dst x (VPMOVVec16x16ToM <types.TypeMask> mask))
+(VPBLENDVB256 dst (VPMOVUSDB128_256 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSDBMasked128_256Merging dst x (VPMOVVec32x8ToM <types.TypeMask> mask))
 (VPBLENDVB256 dst (VPMOVUSDW128_256 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSDWMasked128_256Merging dst x (VPMOVVec32x8ToM <types.TypeMask> mask))
+(VPBLENDVB256 dst (VPMOVUSQB128_256 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSQBMasked128_256Merging dst x (VPMOVVec64x4ToM <types.TypeMask> mask))
 (VPBLENDVB256 dst (VPMOVUSQD128_256 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSQDMasked128_256Merging dst x (VPMOVVec64x4ToM <types.TypeMask> mask))
 (VPBLENDVB256 dst (VPMOVUSQW128_256 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSQWMasked128_256Merging dst x (VPMOVVec64x4ToM <types.TypeMask> mask))
+(VPBLENDVB256 dst (VPMOVUSWB128_256 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSWBMasked128_256Merging dst x (VPMOVVec16x16ToM <types.TypeMask> mask))
 (VPBLENDVB256 dst (VPMOVWB128_256 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVWBMasked128_256Merging dst x (VPMOVVec16x16ToM <types.TypeMask> mask))
 (VPBLENDVB256 dst (VPMOVZXBW512 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVZXBWMasked512Merging dst x (VPMOVVec8x32ToM <types.TypeMask> mask))
 (VPBLENDVB256 dst (VPMOVZXDQ512 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVZXDQMasked512Merging dst x (VPMOVVec32x8ToM <types.TypeMask> mask))
@@ -2511,30 +2509,30 @@
 (VPANDNQMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPANDNQMasked128load {sym} [off] x ptr mask mem)
 (VPANDNQMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPANDNQMasked256load {sym} [off] x ptr mask mem)
 (VPANDNQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPANDNQMasked512load {sym} [off] x ptr mask mem)
-(VRNDSCALEPS128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPS128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VRNDSCALEPS256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPS256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VRNDSCALEPS512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPS512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VRNDSCALEPD128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPD128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VRNDSCALEPD256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPD256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VRNDSCALEPD512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPD512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VRNDSCALEPSMasked128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPSMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VRNDSCALEPSMasked256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPSMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VRNDSCALEPSMasked512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPSMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VRNDSCALEPDMasked128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VRNDSCALEPDMasked256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VRNDSCALEPDMasked512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VREDUCEPS128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPS128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VREDUCEPS256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPS256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VREDUCEPS512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPS512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VREDUCEPD128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPD128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VREDUCEPD256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPD256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VREDUCEPD512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPD512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VREDUCEPSMasked128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPSMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VREDUCEPSMasked256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPSMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VREDUCEPSMasked512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPSMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VREDUCEPDMasked128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VREDUCEPDMasked256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VREDUCEPDMasked512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VRNDSCALEPS128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPS128load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VRNDSCALEPS256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPS256load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VRNDSCALEPS512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPS512load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VRNDSCALEPD128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPD128load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VRNDSCALEPD256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPD256load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VRNDSCALEPD512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPD512load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VRNDSCALEPSMasked128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPSMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VRNDSCALEPSMasked256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPSMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VRNDSCALEPSMasked512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPSMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VRNDSCALEPDMasked128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPDMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VRNDSCALEPDMasked256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPDMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VRNDSCALEPDMasked512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPDMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VREDUCEPS128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPS128load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VREDUCEPS256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPS256load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VREDUCEPS512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPS512load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VREDUCEPD128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPD128load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VREDUCEPD256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPD256load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VREDUCEPD512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPD512load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VREDUCEPSMasked128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPSMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VREDUCEPSMasked256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPSMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VREDUCEPSMasked512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPSMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VREDUCEPDMasked128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VREDUCEPDMasked256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VREDUCEPDMasked512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
 (VPERMI2PS128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS128load {sym} [off] x y ptr mem)
 (VPERMI2D128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D128load {sym} [off] x y ptr mem)
 (VPERMI2PS256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS256load {sym} [off] x y ptr mem)
@@ -2655,54 +2653,46 @@
 (VDIVPDMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VDIVPDMasked128load {sym} [off] x ptr mask mem)
 (VDIVPDMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VDIVPDMasked256load {sym} [off] x ptr mask mem)
 (VDIVPDMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VDIVPDMasked512load {sym} [off] x ptr mask mem)
-(VPDPBUSD512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPDPBUSD512load {sym} [off] x y ptr mem)
-(VPDPBUSDMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPBUSDMasked128load {sym} [off] x y ptr mask mem)
-(VPDPBUSDMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPBUSDMasked256load {sym} [off] x y ptr mask mem)
-(VPDPBUSDMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPBUSDMasked512load {sym} [off] x y ptr mask mem)
-(VPDPBUSDS512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPDPBUSDS512load {sym} [off] x y ptr mem)
-(VPDPBUSDSMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPBUSDSMasked128load {sym} [off] x y ptr mask mem)
-(VPDPBUSDSMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPBUSDSMasked256load {sym} [off] x y ptr mask mem)
-(VPDPBUSDSMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPBUSDSMasked512load {sym} [off] x y ptr mask mem)
 (VPCMPEQD512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPCMPEQD512load {sym} [off] x ptr mem)
 (VPCMPEQQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPCMPEQQ512load {sym} [off] x ptr mem)
-(VCMPPS512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VCMPPS512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
-(VCMPPD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VCMPPD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
-(VCMPPSMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VCMPPSMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
-(VCMPPSMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VCMPPSMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
-(VCMPPSMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VCMPPSMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
-(VCMPPDMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VCMPPDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
-(VCMPPDMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VCMPPDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
-(VCMPPDMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VCMPPDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
-(VPCMPDMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
-(VPCMPDMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
-(VPCMPDMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
-(VPCMPQMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPQMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
-(VPCMPQMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPQMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
-(VPCMPQMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPQMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
-(VPCMPUDMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPUDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
-(VPCMPUDMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPUDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
-(VPCMPUDMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPUDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
-(VPCMPUQMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPUQMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
-(VPCMPUQMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPUQMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
-(VPCMPUQMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPUQMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
-(VGF2P8AFFINEQB128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEQB128load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
-(VGF2P8AFFINEQB256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEQB256load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
-(VGF2P8AFFINEQB512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEQB512load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
-(VGF2P8AFFINEINVQB128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEINVQB128load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
-(VGF2P8AFFINEINVQB256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEINVQB256load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
-(VGF2P8AFFINEINVQB512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEINVQB512load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
-(VGF2P8AFFINEINVQBMasked128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEINVQBMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
-(VGF2P8AFFINEINVQBMasked256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEINVQBMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
-(VGF2P8AFFINEINVQBMasked512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEINVQBMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
-(VGF2P8AFFINEQBMasked128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEQBMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
-(VGF2P8AFFINEQBMasked256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEQBMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
-(VGF2P8AFFINEQBMasked512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEQBMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
+(VCMPPS512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VCMPPS512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem)
+(VCMPPD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VCMPPD512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem)
+(VCMPPSMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VCMPPSMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem)
+(VCMPPSMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VCMPPSMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem)
+(VCMPPSMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VCMPPSMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem)
+(VCMPPDMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VCMPPDMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem)
+(VCMPPDMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VCMPPDMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem)
+(VCMPPDMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VCMPPDMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem)
+(VPCMPDMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPDMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem)
+(VPCMPDMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPDMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem)
+(VPCMPDMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPDMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem)
+(VPCMPQMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPQMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem)
+(VPCMPQMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPQMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem)
+(VPCMPQMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPQMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem)
+(VPCMPUDMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPUDMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem)
+(VPCMPUDMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPUDMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem)
+(VPCMPUDMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPUDMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem)
+(VPCMPUQMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPUQMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem)
+(VPCMPUQMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPUQMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem)
+(VPCMPUQMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPUQMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mask mem)
+(VGF2P8AFFINEQB128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEQB128load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mem)
+(VGF2P8AFFINEQB256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEQB256load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mem)
+(VGF2P8AFFINEQB512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEQB512load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mem)
+(VGF2P8AFFINEINVQB128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEINVQB128load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mem)
+(VGF2P8AFFINEINVQB256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEINVQB256load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mem)
+(VGF2P8AFFINEINVQB512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEINVQB512load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mem)
+(VGF2P8AFFINEINVQBMasked128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEINVQBMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mask mem)
+(VGF2P8AFFINEINVQBMasked256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEINVQBMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mask mem)
+(VGF2P8AFFINEINVQBMasked512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEINVQBMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mask mem)
+(VGF2P8AFFINEQBMasked128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEQBMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mask mem)
+(VGF2P8AFFINEQBMasked256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEQBMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mask mem)
+(VGF2P8AFFINEQBMasked512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEQBMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mask mem)
 (VPCMPGTD512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPCMPGTD512load {sym} [off] x ptr mem)
 (VPCMPGTQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPCMPGTQ512load {sym} [off] x ptr mem)
-(VPCMPUD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPCMPUD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
-(VPCMPUQ512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPCMPUQ512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
-(VPCMPD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPCMPD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
-(VPCMPQ512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPCMPQ512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
+(VPCMPUD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPCMPUD512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem)
+(VPCMPUQ512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPCMPUQ512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem)
+(VPCMPD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPCMPD512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem)
+(VPCMPQ512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPCMPQ512load {sym} [makeValAndOff(int32(uint8(c)),off)] x ptr mem)
 (VPUNPCKHDQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPUNPCKHDQ512load {sym} [off] x ptr mem)
 (VPUNPCKHQDQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPUNPCKHQDQ512load {sym} [off] x ptr mem)
 (VPUNPCKLDQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPUNPCKLDQ512load {sym} [off] x ptr mem)
@@ -2781,11 +2771,7 @@
 (VPMULLQ128 x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPMULLQ128load {sym} [off] x ptr mem)
 (VPMULLQ256 x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPMULLQ256load {sym} [off] x ptr mem)
 (VPMULLQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPMULLQ512load {sym} [off] x ptr mem)
-(VFMADD213PS128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VFMADD213PS128load {sym} [off] x y ptr mem)
-(VFMADD213PS256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VFMADD213PS256load {sym} [off] x y ptr mem)
 (VFMADD213PS512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VFMADD213PS512load {sym} [off] x y ptr mem)
-(VFMADD213PD128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VFMADD213PD128load {sym} [off] x y ptr mem)
-(VFMADD213PD256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VFMADD213PD256load {sym} [off] x y ptr mem)
 (VFMADD213PD512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VFMADD213PD512load {sym} [off] x y ptr mem)
 (VFMADD213PSMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VFMADD213PSMasked128load {sym} [off] x y ptr mask mem)
 (VFMADD213PSMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VFMADD213PSMasked256load {sym} [off] x y ptr mask mem)
@@ -2793,11 +2779,7 @@
 (VFMADD213PDMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VFMADD213PDMasked128load {sym} [off] x y ptr mask mem)
 (VFMADD213PDMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VFMADD213PDMasked256load {sym} [off] x y ptr mask mem)
 (VFMADD213PDMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VFMADD213PDMasked512load {sym} [off] x y ptr mask mem)
-(VFMADDSUB213PS128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VFMADDSUB213PS128load {sym} [off] x y ptr mem)
-(VFMADDSUB213PS256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VFMADDSUB213PS256load {sym} [off] x y ptr mem)
 (VFMADDSUB213PS512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VFMADDSUB213PS512load {sym} [off] x y ptr mem)
-(VFMADDSUB213PD128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VFMADDSUB213PD128load {sym} [off] x y ptr mem)
-(VFMADDSUB213PD256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VFMADDSUB213PD256load {sym} [off] x y ptr mem)
 (VFMADDSUB213PD512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VFMADDSUB213PD512load {sym} [off] x y ptr mem)
 (VFMADDSUB213PSMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VFMADDSUB213PSMasked128load {sym} [off] x y ptr mask mem)
 (VFMADDSUB213PSMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VFMADDSUB213PSMasked256load {sym} [off] x y ptr mask mem)
@@ -2817,11 +2799,7 @@
 (VPMULLQMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPMULLQMasked128load {sym} [off] x ptr mask mem)
 (VPMULLQMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPMULLQMasked256load {sym} [off] x ptr mask mem)
 (VPMULLQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPMULLQMasked512load {sym} [off] x ptr mask mem)
-(VFMSUBADD213PS128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VFMSUBADD213PS128load {sym} [off] x y ptr mem)
-(VFMSUBADD213PS256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VFMSUBADD213PS256load {sym} [off] x y ptr mem)
 (VFMSUBADD213PS512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VFMSUBADD213PS512load {sym} [off] x y ptr mem)
-(VFMSUBADD213PD128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VFMSUBADD213PD128load {sym} [off] x y ptr mem)
-(VFMSUBADD213PD256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VFMSUBADD213PD256load {sym} [off] x y ptr mem)
 (VFMSUBADD213PD512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VFMSUBADD213PD512load {sym} [off] x y ptr mem)
 (VFMSUBADD213PSMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VFMSUBADD213PSMasked128load {sym} [off] x y ptr mask mem)
 (VFMSUBADD213PSMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VFMSUBADD213PSMasked256load {sym} [off] x y ptr mask mem)
@@ -2883,30 +2861,30 @@
 (VRSQRT14PDMasked128 l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRSQRT14PDMasked128load {sym} [off] ptr mask mem)
 (VRSQRT14PDMasked256 l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRSQRT14PDMasked256load {sym} [off] ptr mask mem)
 (VRSQRT14PDMasked512 l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRSQRT14PDMasked512load {sym} [off] ptr mask mem)
-(VPROLD128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLD128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VPROLD256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLD256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VPROLD512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLD512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VPROLQ128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLQ128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VPROLQ256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLQ256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VPROLQ512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLQ512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VPROLDMasked128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPROLDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPROLDMasked256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPROLDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPROLDMasked512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPROLDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPROLQMasked128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPROLQMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPROLQMasked256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPROLQMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPROLQMasked512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPROLQMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPRORD128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPRORD128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VPRORD256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPRORD256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VPRORD512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPRORD512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VPRORQ128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPRORQ128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VPRORQ256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPRORQ256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VPRORQ512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPRORQ512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VPRORDMasked128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPRORDMasked256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPRORDMasked512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPRORQMasked128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORQMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPRORQMasked256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORQMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPRORQMasked512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORQMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPROLD128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLD128load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VPROLD256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLD256load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VPROLD512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLD512load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VPROLQ128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLQ128load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VPROLQ256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLQ256load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VPROLQ512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLQ512load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VPROLDMasked128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPROLDMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VPROLDMasked256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPROLDMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VPROLDMasked512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPROLDMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VPROLQMasked128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPROLQMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VPROLQMasked256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPROLQMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VPROLQMasked512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPROLQMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VPRORD128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPRORD128load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VPRORD256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPRORD256load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VPRORD512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPRORD512load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VPRORQ128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPRORQ128load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VPRORQ256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPRORQ256load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VPRORQ512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPRORQ512load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VPRORDMasked128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORDMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VPRORDMasked256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORDMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VPRORDMasked512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORDMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VPRORQMasked128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORQMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VPRORQMasked256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORQMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VPRORQMasked512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORQMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
 (VPROLVD128 x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLVD128load {sym} [off] x ptr mem)
 (VPROLVD256 x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLVD256load {sym} [off] x ptr mem)
 (VPROLVD512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLVD512load {sym} [off] x ptr mem)
@@ -2932,13 +2910,13 @@
 (VPRORVQMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORVQMasked256load {sym} [off] x ptr mask mem)
 (VPRORVQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORVQMasked512load {sym} [off] x ptr mask mem)
 (VPACKSSDW512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDW512load {sym} [off] x ptr mem)
-(VPACKSSDWMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked128load {sym} [off] x ptr mask mem)
 (VPACKSSDWMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked256load {sym} [off] x ptr mask mem)
 (VPACKSSDWMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked512load {sym} [off] x ptr mask mem)
+(VPACKSSDWMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked128load {sym} [off] x ptr mask mem)
 (VPACKUSDW512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPACKUSDW512load {sym} [off] x ptr mem)
-(VPACKUSDWMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKUSDWMasked128load {sym} [off] x ptr mask mem)
 (VPACKUSDWMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKUSDWMasked256load {sym} [off] x ptr mask mem)
 (VPACKUSDWMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKUSDWMasked512load {sym} [off] x ptr mask mem)
+(VPACKUSDWMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKUSDWMasked128load {sym} [off] x ptr mask mem)
 (VSCALEFPS128 x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSCALEFPS128load {sym} [off] x ptr mem)
 (VSCALEFPS256 x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSCALEFPS256load {sym} [off] x ptr mem)
 (VSCALEFPS512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSCALEFPS512load {sym} [off] x ptr mem)
@@ -2951,30 +2929,30 @@
 (VSCALEFPDMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VSCALEFPDMasked128load {sym} [off] x ptr mask mem)
 (VSCALEFPDMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VSCALEFPDMasked256load {sym} [off] x ptr mask mem)
 (VSCALEFPDMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VSCALEFPDMasked512load {sym} [off] x ptr mask mem)
-(VPSHLDD128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDD128load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
-(VPSHLDD256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDD256load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
-(VPSHLDD512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDD512load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
-(VPSHLDQ128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQ128load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
-(VPSHLDQ256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQ256load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
-(VPSHLDQ512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQ512load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
-(VPSHLDDMasked128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
-(VPSHLDDMasked256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
-(VPSHLDDMasked512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
-(VPSHLDQMasked128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
-(VPSHLDQMasked256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
-(VPSHLDQMasked512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
-(VPSHRDD128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDD128load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
-(VPSHRDD256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDD256load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
-(VPSHRDD512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDD512load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
-(VPSHRDQ128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQ128load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
-(VPSHRDQ256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQ256load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
-(VPSHRDQ512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQ512load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
-(VPSHRDDMasked128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
-(VPSHRDDMasked256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
-(VPSHRDDMasked512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
-(VPSHRDQMasked128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
-(VPSHRDQMasked256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
-(VPSHRDQMasked512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
+(VPSHLDD128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDD128load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mem)
+(VPSHLDD256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDD256load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mem)
+(VPSHLDD512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDD512load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mem)
+(VPSHLDQ128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQ128load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mem)
+(VPSHLDQ256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQ256load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mem)
+(VPSHLDQ512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQ512load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mem)
+(VPSHLDDMasked128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDDMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mask mem)
+(VPSHLDDMasked256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDDMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mask mem)
+(VPSHLDDMasked512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDDMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mask mem)
+(VPSHLDQMasked128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mask mem)
+(VPSHLDQMasked256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mask mem)
+(VPSHLDQMasked512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mask mem)
+(VPSHRDD128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDD128load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mem)
+(VPSHRDD256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDD256load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mem)
+(VPSHRDD512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDD512load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mem)
+(VPSHRDQ128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQ128load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mem)
+(VPSHRDQ256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQ256load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mem)
+(VPSHRDQ512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQ512load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mem)
+(VPSHRDDMasked128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDDMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mask mem)
+(VPSHRDDMasked256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDDMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mask mem)
+(VPSHRDDMasked512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDDMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mask mem)
+(VPSHRDQMasked128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mask mem)
+(VPSHRDQMasked256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mask mem)
+(VPSHRDQMasked512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mask mem)
 (VPSLLVD512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLVD512load {sym} [off] x ptr mem)
 (VPSLLVQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLVQ512load {sym} [off] x ptr mem)
 (VPSHLDVD128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDVD128load {sym} [off] x y ptr mem)
@@ -3059,41 +3037,41 @@
 (VPXORQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPXORQMasked512load {sym} [off] x ptr mask mem)
 (VPBLENDMDMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPBLENDMDMasked512load {sym} [off] x ptr mask mem)
 (VPBLENDMQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPBLENDMQMasked512load {sym} [off] x ptr mask mem)
-(VSHUFPS512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSHUFPS512load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
-(VSHUFPD512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSHUFPD512load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
-(VPSHUFD512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHUFD512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VPSHUFDMasked256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPSHUFDMasked512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPSHUFDMasked128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPSLLD512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLD512constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VPSLLQ512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLQ512constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VPSLLDMasked128const [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLDMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPSLLDMasked256const [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLDMasked256constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPSLLDMasked512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLDMasked512constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPSLLQMasked128const [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLQMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPSLLQMasked256const [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLQMasked256constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPSLLQMasked512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLQMasked512constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPSRLD512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSRLD512constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VPSRLQ512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSRLQ512constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VPSRAD512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSRAD512constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VPSRAQ128const [c]  l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSRAQ128constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VPSRAQ256const [c]  l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSRAQ256constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VPSRAQ512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSRAQ512constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VPSRLDMasked128const [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRLDMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPSRLDMasked256const [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRLDMasked256constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPSRLDMasked512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRLDMasked512constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPSRLQMasked128const [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRLQMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPSRLQMasked256const [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRLQMasked256constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPSRLQMasked512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRLQMasked512constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPSRADMasked128const [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRADMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPSRADMasked256const [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRADMasked256constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPSRADMasked512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRADMasked512constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPSRAQMasked128const [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRAQMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPSRAQMasked256const [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRAQMasked256constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPSRAQMasked512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRAQMasked512constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPTERNLOGD128 [c]  x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPTERNLOGD128load {sym} [makeValAndOff(int32(int8(c)),off)]  x y ptr mem)
-(VPTERNLOGD256 [c]  x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPTERNLOGD256load {sym} [makeValAndOff(int32(int8(c)),off)]  x y ptr mem)
-(VPTERNLOGD512 [c]  x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPTERNLOGD512load {sym} [makeValAndOff(int32(int8(c)),off)]  x y ptr mem)
-(VPTERNLOGQ128 [c]  x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPTERNLOGQ128load {sym} [makeValAndOff(int32(int8(c)),off)]  x y ptr mem)
-(VPTERNLOGQ256 [c]  x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPTERNLOGQ256load {sym} [makeValAndOff(int32(int8(c)),off)]  x y ptr mem)
-(VPTERNLOGQ512 [c]  x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPTERNLOGQ512load {sym} [makeValAndOff(int32(int8(c)),off)]  x y ptr mem)
+(VSHUFPS512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSHUFPS512load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mem)
+(VSHUFPD512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSHUFPD512load {sym} [makeValAndOff(int32(uint8(c)),off)]  x ptr mem)
+(VPSHUFD512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHUFD512load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VPSHUFDMasked256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked256load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VPSHUFDMasked512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked512load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VPSHUFDMasked128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked128load {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VPSLLD512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLD512constload {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VPSLLQ512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLQ512constload {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VPSLLDMasked128const [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLDMasked128constload {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VPSLLDMasked256const [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLDMasked256constload {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VPSLLDMasked512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLDMasked512constload {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VPSLLQMasked128const [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLQMasked128constload {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VPSLLQMasked256const [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLQMasked256constload {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VPSLLQMasked512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLQMasked512constload {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VPSRLD512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSRLD512constload {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VPSRLQ512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSRLQ512constload {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VPSRAD512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSRAD512constload {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VPSRAQ128const [c]  l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSRAQ128constload {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VPSRAQ256const [c]  l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSRAQ256constload {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VPSRAQ512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSRAQ512constload {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mem)
+(VPSRLDMasked128const [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRLDMasked128constload {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VPSRLDMasked256const [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRLDMasked256constload {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VPSRLDMasked512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRLDMasked512constload {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VPSRLQMasked128const [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRLQMasked128constload {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VPSRLQMasked256const [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRLQMasked256constload {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VPSRLQMasked512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRLQMasked512constload {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VPSRADMasked128const [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRADMasked128constload {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VPSRADMasked256const [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRADMasked256constload {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VPSRADMasked512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRADMasked512constload {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VPSRAQMasked128const [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRAQMasked128constload {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VPSRAQMasked256const [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRAQMasked256constload {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VPSRAQMasked512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRAQMasked512constload {sym} [makeValAndOff(int32(uint8(c)),off)]  ptr mask mem)
+(VPTERNLOGD128 [c]  x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPTERNLOGD128load {sym} [makeValAndOff(int32(uint8(c)),off)]  x y ptr mem)
+(VPTERNLOGD256 [c]  x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPTERNLOGD256load {sym} [makeValAndOff(int32(uint8(c)),off)]  x y ptr mem)
+(VPTERNLOGD512 [c]  x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPTERNLOGD512load {sym} [makeValAndOff(int32(uint8(c)),off)]  x y ptr mem)
+(VPTERNLOGQ128 [c]  x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPTERNLOGQ128load {sym} [makeValAndOff(int32(uint8(c)),off)]  x y ptr mem)
+(VPTERNLOGQ256 [c]  x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPTERNLOGQ256load {sym} [makeValAndOff(int32(uint8(c)),off)]  x y ptr mem)
+(VPTERNLOGQ512 [c]  x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPTERNLOGQ512load {sym} [makeValAndOff(int32(uint8(c)),off)]  x y ptr mem)
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
@@ -1,4 +1,4 @@
-// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT.
+// Code generated by 'simdgen -o godefs -goroot $GOROOT -xedPath $XED_PATH go.yaml types.yaml categories.yaml'; DO NOT EDIT.

 package main

@@ -172,38 +172,38 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VEXPANDPSMasked128", argLength: 2, reg: wkw, asm: "VEXPANDPS", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VEXPANDPSMasked256", argLength: 2, reg: wkw, asm: "VEXPANDPS", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VEXPANDPSMasked512", argLength: 2, reg: wkw, asm: "VEXPANDPS", commutative: false, typ: "Vec512", resultInArg0: false},
-		{name: "VFMADD213PD128", argLength: 3, reg: w31, asm: "VFMADD213PD", commutative: false, typ: "Vec128", resultInArg0: true},
-		{name: "VFMADD213PD256", argLength: 3, reg: w31, asm: "VFMADD213PD", commutative: false, typ: "Vec256", resultInArg0: true},
+		{name: "VFMADD213PD128", argLength: 3, reg: v31, asm: "VFMADD213PD", commutative: false, typ: "Vec128", resultInArg0: true},
+		{name: "VFMADD213PD256", argLength: 3, reg: v31, asm: "VFMADD213PD", commutative: false, typ: "Vec256", resultInArg0: true},
 		{name: "VFMADD213PD512", argLength: 3, reg: w31, asm: "VFMADD213PD", commutative: false, typ: "Vec512", resultInArg0: true},
 		{name: "VFMADD213PDMasked128", argLength: 4, reg: w3kw, asm: "VFMADD213PD", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VFMADD213PDMasked256", argLength: 4, reg: w3kw, asm: "VFMADD213PD", commutative: false, typ: "Vec256", resultInArg0: true},
 		{name: "VFMADD213PDMasked512", argLength: 4, reg: w3kw, asm: "VFMADD213PD", commutative: false, typ: "Vec512", resultInArg0: true},
-		{name: "VFMADD213PS128", argLength: 3, reg: w31, asm: "VFMADD213PS", commutative: false, typ: "Vec128", resultInArg0: true},
-		{name: "VFMADD213PS256", argLength: 3, reg: w31, asm: "VFMADD213PS", commutative: false, typ: "Vec256", resultInArg0: true},
+		{name: "VFMADD213PS128", argLength: 3, reg: v31, asm: "VFMADD213PS", commutative: false, typ: "Vec128", resultInArg0: true},
+		{name: "VFMADD213PS256", argLength: 3, reg: v31, asm: "VFMADD213PS", commutative: false, typ: "Vec256", resultInArg0: true},
 		{name: "VFMADD213PS512", argLength: 3, reg: w31, asm: "VFMADD213PS", commutative: false, typ: "Vec512", resultInArg0: true},
 		{name: "VFMADD213PSMasked128", argLength: 4, reg: w3kw, asm: "VFMADD213PS", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VFMADD213PSMasked256", argLength: 4, reg: w3kw, asm: "VFMADD213PS", commutative: false, typ: "Vec256", resultInArg0: true},
 		{name: "VFMADD213PSMasked512", argLength: 4, reg: w3kw, asm: "VFMADD213PS", commutative: false, typ: "Vec512", resultInArg0: true},
-		{name: "VFMADDSUB213PD128", argLength: 3, reg: w31, asm: "VFMADDSUB213PD", commutative: false, typ: "Vec128", resultInArg0: true},
-		{name: "VFMADDSUB213PD256", argLength: 3, reg: w31, asm: "VFMADDSUB213PD", commutative: false, typ: "Vec256", resultInArg0: true},
+		{name: "VFMADDSUB213PD128", argLength: 3, reg: v31, asm: "VFMADDSUB213PD", commutative: false, typ: "Vec128", resultInArg0: true},
+		{name: "VFMADDSUB213PD256", argLength: 3, reg: v31, asm: "VFMADDSUB213PD", commutative: false, typ: "Vec256", resultInArg0: true},
 		{name: "VFMADDSUB213PD512", argLength: 3, reg: w31, asm: "VFMADDSUB213PD", commutative: false, typ: "Vec512", resultInArg0: true},
 		{name: "VFMADDSUB213PDMasked128", argLength: 4, reg: w3kw, asm: "VFMADDSUB213PD", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VFMADDSUB213PDMasked256", argLength: 4, reg: w3kw, asm: "VFMADDSUB213PD", commutative: false, typ: "Vec256", resultInArg0: true},
 		{name: "VFMADDSUB213PDMasked512", argLength: 4, reg: w3kw, asm: "VFMADDSUB213PD", commutative: false, typ: "Vec512", resultInArg0: true},
-		{name: "VFMADDSUB213PS128", argLength: 3, reg: w31, asm: "VFMADDSUB213PS", commutative: false, typ: "Vec128", resultInArg0: true},
-		{name: "VFMADDSUB213PS256", argLength: 3, reg: w31, asm: "VFMADDSUB213PS", commutative: false, typ: "Vec256", resultInArg0: true},
+		{name: "VFMADDSUB213PS128", argLength: 3, reg: v31, asm: "VFMADDSUB213PS", commutative: false, typ: "Vec128", resultInArg0: true},
+		{name: "VFMADDSUB213PS256", argLength: 3, reg: v31, asm: "VFMADDSUB213PS", commutative: false, typ: "Vec256", resultInArg0: true},
 		{name: "VFMADDSUB213PS512", argLength: 3, reg: w31, asm: "VFMADDSUB213PS", commutative: false, typ: "Vec512", resultInArg0: true},
 		{name: "VFMADDSUB213PSMasked128", argLength: 4, reg: w3kw, asm: "VFMADDSUB213PS", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VFMADDSUB213PSMasked256", argLength: 4, reg: w3kw, asm: "VFMADDSUB213PS", commutative: false, typ: "Vec256", resultInArg0: true},
 		{name: "VFMADDSUB213PSMasked512", argLength: 4, reg: w3kw, asm: "VFMADDSUB213PS", commutative: false, typ: "Vec512", resultInArg0: true},
-		{name: "VFMSUBADD213PD128", argLength: 3, reg: w31, asm: "VFMSUBADD213PD", commutative: false, typ: "Vec128", resultInArg0: true},
-		{name: "VFMSUBADD213PD256", argLength: 3, reg: w31, asm: "VFMSUBADD213PD", commutative: false, typ: "Vec256", resultInArg0: true},
+		{name: "VFMSUBADD213PD128", argLength: 3, reg: v31, asm: "VFMSUBADD213PD", commutative: false, typ: "Vec128", resultInArg0: true},
+		{name: "VFMSUBADD213PD256", argLength: 3, reg: v31, asm: "VFMSUBADD213PD", commutative: false, typ: "Vec256", resultInArg0: true},
 		{name: "VFMSUBADD213PD512", argLength: 3, reg: w31, asm: "VFMSUBADD213PD", commutative: false, typ: "Vec512", resultInArg0: true},
 		{name: "VFMSUBADD213PDMasked128", argLength: 4, reg: w3kw, asm: "VFMSUBADD213PD", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VFMSUBADD213PDMasked256", argLength: 4, reg: w3kw, asm: "VFMSUBADD213PD", commutative: false, typ: "Vec256", resultInArg0: true},
 		{name: "VFMSUBADD213PDMasked512", argLength: 4, reg: w3kw, asm: "VFMSUBADD213PD", commutative: false, typ: "Vec512", resultInArg0: true},
-		{name: "VFMSUBADD213PS128", argLength: 3, reg: w31, asm: "VFMSUBADD213PS", commutative: false, typ: "Vec128", resultInArg0: true},
-		{name: "VFMSUBADD213PS256", argLength: 3, reg: w31, asm: "VFMSUBADD213PS", commutative: false, typ: "Vec256", resultInArg0: true},
+		{name: "VFMSUBADD213PS128", argLength: 3, reg: v31, asm: "VFMSUBADD213PS", commutative: false, typ: "Vec128", resultInArg0: true},
+		{name: "VFMSUBADD213PS256", argLength: 3, reg: v31, asm: "VFMSUBADD213PS", commutative: false, typ: "Vec256", resultInArg0: true},
 		{name: "VFMSUBADD213PS512", argLength: 3, reg: w31, asm: "VFMSUBADD213PS", commutative: false, typ: "Vec512", resultInArg0: true},
 		{name: "VFMSUBADD213PSMasked128", argLength: 4, reg: w3kw, asm: "VFMSUBADD213PS", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VFMSUBADD213PSMasked256", argLength: 4, reg: w3kw, asm: "VFMSUBADD213PS", commutative: false, typ: "Vec256", resultInArg0: true},
@@ -452,18 +452,6 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VPCOMPRESSWMasked128", argLength: 2, reg: wkw, asm: "VPCOMPRESSW", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPCOMPRESSWMasked256", argLength: 2, reg: wkw, asm: "VPCOMPRESSW", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPCOMPRESSWMasked512", argLength: 2, reg: wkw, asm: "VPCOMPRESSW", commutative: false, typ: "Vec512", resultInArg0: false},
-		{name: "VPDPBUSD128", argLength: 3, reg: v31, asm: "VPDPBUSD", commutative: false, typ: "Vec128", resultInArg0: true},
-		{name: "VPDPBUSD256", argLength: 3, reg: v31, asm: "VPDPBUSD", commutative: false, typ: "Vec256", resultInArg0: true},
-		{name: "VPDPBUSD512", argLength: 3, reg: w31, asm: "VPDPBUSD", commutative: false, typ: "Vec512", resultInArg0: true},
-		{name: "VPDPBUSDMasked128", argLength: 4, reg: w3kw, asm: "VPDPBUSD", commutative: false, typ: "Vec128", resultInArg0: true},
-		{name: "VPDPBUSDMasked256", argLength: 4, reg: w3kw, asm: "VPDPBUSD", commutative: false, typ: "Vec256", resultInArg0: true},
-		{name: "VPDPBUSDMasked512", argLength: 4, reg: w3kw, asm: "VPDPBUSD", commutative: false, typ: "Vec512", resultInArg0: true},
-		{name: "VPDPBUSDS128", argLength: 3, reg: v31, asm: "VPDPBUSDS", commutative: false, typ: "Vec128", resultInArg0: true},
-		{name: "VPDPBUSDS256", argLength: 3, reg: v31, asm: "VPDPBUSDS", commutative: false, typ: "Vec256", resultInArg0: true},
-		{name: "VPDPBUSDS512", argLength: 3, reg: w31, asm: "VPDPBUSDS", commutative: false, typ: "Vec512", resultInArg0: true},
-		{name: "VPDPBUSDSMasked128", argLength: 4, reg: w3kw, asm: "VPDPBUSDS", commutative: false, typ: "Vec128", resultInArg0: true},
-		{name: "VPDPBUSDSMasked256", argLength: 4, reg: w3kw, asm: "VPDPBUSDS", commutative: false, typ: "Vec256", resultInArg0: true},
-		{name: "VPDPBUSDSMasked512", argLength: 4, reg: w3kw, asm: "VPDPBUSDS", commutative: false, typ: "Vec512", resultInArg0: true},
 		{name: "VPDPWSSD128", argLength: 3, reg: v31, asm: "VPDPWSSD", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VPDPWSSD256", argLength: 3, reg: v31, asm: "VPDPWSSD", commutative: false, typ: "Vec256", resultInArg0: true},
 		{name: "VPDPWSSD512", argLength: 3, reg: w31, asm: "VPDPWSSD", commutative: false, typ: "Vec512", resultInArg0: true},
@@ -780,12 +768,24 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VPMOVSXWQMasked128", argLength: 2, reg: wkw, asm: "VPMOVSXWQ", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPMOVSXWQMasked256", argLength: 2, reg: wkw, asm: "VPMOVSXWQ", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPMOVSXWQMasked512", argLength: 2, reg: wkw, asm: "VPMOVSXWQ", commutative: false, typ: "Vec512", resultInArg0: false},
+		{name: "VPMOVUSDB128_128", argLength: 1, reg: w11, asm: "VPMOVUSDB", commutative: false, typ: "Vec128", resultInArg0: false},
+		{name: "VPMOVUSDB128_256", argLength: 1, reg: w11, asm: "VPMOVUSDB", commutative: false, typ: "Vec128", resultInArg0: false},
+		{name: "VPMOVUSDB128_512", argLength: 1, reg: w11, asm: "VPMOVUSDB", commutative: false, typ: "Vec128", resultInArg0: false},
+		{name: "VPMOVUSDBMasked128_128", argLength: 2, reg: wkw, asm: "VPMOVUSDB", commutative: false, typ: "Vec128", resultInArg0: false},
+		{name: "VPMOVUSDBMasked128_256", argLength: 2, reg: wkw, asm: "VPMOVUSDB", commutative: false, typ: "Vec128", resultInArg0: false},
+		{name: "VPMOVUSDBMasked128_512", argLength: 2, reg: wkw, asm: "VPMOVUSDB", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPMOVUSDW128_128", argLength: 1, reg: w11, asm: "VPMOVUSDW", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPMOVUSDW128_256", argLength: 1, reg: w11, asm: "VPMOVUSDW", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPMOVUSDW256", argLength: 1, reg: w11, asm: "VPMOVUSDW", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPMOVUSDWMasked128_128", argLength: 2, reg: wkw, asm: "VPMOVUSDW", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPMOVUSDWMasked128_256", argLength: 2, reg: wkw, asm: "VPMOVUSDW", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPMOVUSDWMasked256", argLength: 2, reg: wkw, asm: "VPMOVUSDW", commutative: false, typ: "Vec256", resultInArg0: false},
+		{name: "VPMOVUSQB128_128", argLength: 1, reg: w11, asm: "VPMOVUSQB", commutative: false, typ: "Vec128", resultInArg0: false},
+		{name: "VPMOVUSQB128_256", argLength: 1, reg: w11, asm: "VPMOVUSQB", commutative: false, typ: "Vec128", resultInArg0: false},
+		{name: "VPMOVUSQB128_512", argLength: 1, reg: w11, asm: "VPMOVUSQB", commutative: false, typ: "Vec128", resultInArg0: false},
+		{name: "VPMOVUSQBMasked128_128", argLength: 2, reg: wkw, asm: "VPMOVUSQB", commutative: false, typ: "Vec128", resultInArg0: false},
+		{name: "VPMOVUSQBMasked128_256", argLength: 2, reg: wkw, asm: "VPMOVUSQB", commutative: false, typ: "Vec128", resultInArg0: false},
+		{name: "VPMOVUSQBMasked128_512", argLength: 2, reg: wkw, asm: "VPMOVUSQB", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPMOVUSQD128_128", argLength: 1, reg: w11, asm: "VPMOVUSQD", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPMOVUSQD128_256", argLength: 1, reg: w11, asm: "VPMOVUSQD", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPMOVUSQD256", argLength: 1, reg: w11, asm: "VPMOVUSQD", commutative: false, typ: "Vec256", resultInArg0: false},
@@ -798,7 +798,11 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VPMOVUSQWMasked128_128", argLength: 2, reg: wkw, asm: "VPMOVUSQW", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPMOVUSQWMasked128_256", argLength: 2, reg: wkw, asm: "VPMOVUSQW", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPMOVUSQWMasked128_512", argLength: 2, reg: wkw, asm: "VPMOVUSQW", commutative: false, typ: "Vec128", resultInArg0: false},
+		{name: "VPMOVUSWB128_128", argLength: 1, reg: w11, asm: "VPMOVUSWB", commutative: false, typ: "Vec128", resultInArg0: false},
+		{name: "VPMOVUSWB128_256", argLength: 1, reg: w11, asm: "VPMOVUSWB", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPMOVUSWB256", argLength: 1, reg: w11, asm: "VPMOVUSWB", commutative: false, typ: "Vec256", resultInArg0: false},
+		{name: "VPMOVUSWBMasked128_128", argLength: 2, reg: wkw, asm: "VPMOVUSWB", commutative: false, typ: "Vec128", resultInArg0: false},
+		{name: "VPMOVUSWBMasked128_256", argLength: 2, reg: wkw, asm: "VPMOVUSWB", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPMOVUSWBMasked256", argLength: 2, reg: wkw, asm: "VPMOVUSWB", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPMOVWB128_128", argLength: 1, reg: w11, asm: "VPMOVWB", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPMOVWB128_256", argLength: 1, reg: w11, asm: "VPMOVWB", commutative: false, typ: "Vec128", resultInArg0: false},
@@ -1590,38 +1594,26 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VDIVPSMasked128load", argLength: 4, reg: w2kwload, asm: "VDIVPS", commutative: false, typ: "Vec128", aux: "SymOff", symEffect: "Read", resultInArg0: false},
 		{name: "VDIVPSMasked256load", argLength: 4, reg: w2kwload, asm: "VDIVPS", commutative: false, typ: "Vec256", aux: "SymOff", symEffect: "Read", resultInArg0: false},
 		{name: "VDIVPSMasked512load", argLength: 4, reg: w2kwload, asm: "VDIVPS", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: false},
-		{name: "VFMADD213PD128load", argLength: 4, reg: w31load, asm: "VFMADD213PD", commutative: false, typ: "Vec128", aux: "SymOff", symEffect: "Read", resultInArg0: true},
-		{name: "VFMADD213PD256load", argLength: 4, reg: w31load, asm: "VFMADD213PD", commutative: false, typ: "Vec256", aux: "SymOff", symEffect: "Read", resultInArg0: true},
 		{name: "VFMADD213PD512load", argLength: 4, reg: w31load, asm: "VFMADD213PD", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: true},
 		{name: "VFMADD213PDMasked128load", argLength: 5, reg: w3kwload, asm: "VFMADD213PD", commutative: false, typ: "Vec128", aux: "SymOff", symEffect: "Read", resultInArg0: true},
 		{name: "VFMADD213PDMasked256load", argLength: 5, reg: w3kwload, asm: "VFMADD213PD", commutative: false, typ: "Vec256", aux: "SymOff", symEffect: "Read", resultInArg0: true},
 		{name: "VFMADD213PDMasked512load", argLength: 5, reg: w3kwload, asm: "VFMADD213PD", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: true},
-		{name: "VFMADD213PS128load", argLength: 4, reg: w31load, asm: "VFMADD213PS", commutative: false, typ: "Vec128", aux: "SymOff", symEffect: "Read", resultInArg0: true},
-		{name: "VFMADD213PS256load", argLength: 4, reg: w31load, asm: "VFMADD213PS", commutative: false, typ: "Vec256", aux: "SymOff", symEffect: "Read", resultInArg0: true},
 		{name: "VFMADD213PS512load", argLength: 4, reg: w31load, asm: "VFMADD213PS", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: true},
 		{name: "VFMADD213PSMasked128load", argLength: 5, reg: w3kwload, asm: "VFMADD213PS", commutative: false, typ: "Vec128", aux: "SymOff", symEffect: "Read", resultInArg0: true},
 		{name: "VFMADD213PSMasked256load", argLength: 5, reg: w3kwload, asm: "VFMADD213PS", commutative: false, typ: "Vec256", aux: "SymOff", symEffect: "Read", resultInArg0: true},
 		{name: "VFMADD213PSMasked512load", argLength: 5, reg: w3kwload, asm: "VFMADD213PS", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: true},
-		{name: "VFMADDSUB213PD128load", argLength: 4, reg: w31load, asm: "VFMADDSUB213PD", commutative: false, typ: "Vec128", aux: "SymOff", symEffect: "Read", resultInArg0: true},
-		{name: "VFMADDSUB213PD256load", argLength: 4, reg: w31load, asm: "VFMADDSUB213PD", commutative: false, typ: "Vec256", aux: "SymOff", symEffect: "Read", resultInArg0: true},
 		{name: "VFMADDSUB213PD512load", argLength: 4, reg: w31load, asm: "VFMADDSUB213PD", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: true},
 		{name: "VFMADDSUB213PDMasked128load", argLength: 5, reg: w3kwload, asm: "VFMADDSUB213PD", commutative: false, typ: "Vec128", aux: "SymOff", symEffect: "Read", resultInArg0: true},
 		{name: "VFMADDSUB213PDMasked256load", argLength: 5, reg: w3kwload, asm: "VFMADDSUB213PD", commutative: false, typ: "Vec256", aux: "SymOff", symEffect: "Read", resultInArg0: true},
 		{name: "VFMADDSUB213PDMasked512load", argLength: 5, reg: w3kwload, asm: "VFMADDSUB213PD", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: true},
-		{name: "VFMADDSUB213PS128load", argLength: 4, reg: w31load, asm: "VFMADDSUB213PS", commutative: false, typ: "Vec128", aux: "SymOff", symEffect: "Read", resultInArg0: true},
-		{name: "VFMADDSUB213PS256load", argLength: 4, reg: w31load, asm: "VFMADDSUB213PS", commutative: false, typ: "Vec256", aux: "SymOff", symEffect: "Read", resultInArg0: true},
 		{name: "VFMADDSUB213PS512load", argLength: 4, reg: w31load, asm: "VFMADDSUB213PS", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: true},
 		{name: "VFMADDSUB213PSMasked128load", argLength: 5, reg: w3kwload, asm: "VFMADDSUB213PS", commutative: false, typ: "Vec128", aux: "SymOff", symEffect: "Read", resultInArg0: true},
 		{name: "VFMADDSUB213PSMasked256load", argLength: 5, reg: w3kwload, asm: "VFMADDSUB213PS", commutative: false, typ: "Vec256", aux: "SymOff", symEffect: "Read", resultInArg0: true},
 		{name: "VFMADDSUB213PSMasked512load", argLength: 5, reg: w3kwload, asm: "VFMADDSUB213PS", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: true},
-		{name: "VFMSUBADD213PD128load", argLength: 4, reg: w31load, asm: "VFMSUBADD213PD", commutative: false, typ: "Vec128", aux: "SymOff", symEffect: "Read", resultInArg0: true},
-		{name: "VFMSUBADD213PD256load", argLength: 4, reg: w31load, asm: "VFMSUBADD213PD", commutative: false, typ: "Vec256", aux: "SymOff", symEffect: "Read", resultInArg0: true},
 		{name: "VFMSUBADD213PD512load", argLength: 4, reg: w31load, asm: "VFMSUBADD213PD", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: true},
 		{name: "VFMSUBADD213PDMasked128load", argLength: 5, reg: w3kwload, asm: "VFMSUBADD213PD", commutative: false, typ: "Vec128", aux: "SymOff", symEffect: "Read", resultInArg0: true},
 		{name: "VFMSUBADD213PDMasked256load", argLength: 5, reg: w3kwload, asm: "VFMSUBADD213PD", commutative: false, typ: "Vec256", aux: "SymOff", symEffect: "Read", resultInArg0: true},
 		{name: "VFMSUBADD213PDMasked512load", argLength: 5, reg: w3kwload, asm: "VFMSUBADD213PD", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: true},
-		{name: "VFMSUBADD213PS128load", argLength: 4, reg: w31load, asm: "VFMSUBADD213PS", commutative: false, typ: "Vec128", aux: "SymOff", symEffect: "Read", resultInArg0: true},
-		{name: "VFMSUBADD213PS256load", argLength: 4, reg: w31load, asm: "VFMSUBADD213PS", commutative: false, typ: "Vec256", aux: "SymOff", symEffect: "Read", resultInArg0: true},
 		{name: "VFMSUBADD213PS512load", argLength: 4, reg: w31load, asm: "VFMSUBADD213PS", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: true},
 		{name: "VFMSUBADD213PSMasked128load", argLength: 5, reg: w3kwload, asm: "VFMSUBADD213PS", commutative: false, typ: "Vec128", aux: "SymOff", symEffect: "Read", resultInArg0: true},
 		{name: "VFMSUBADD213PSMasked256load", argLength: 5, reg: w3kwload, asm: "VFMSUBADD213PS", commutative: false, typ: "Vec256", aux: "SymOff", symEffect: "Read", resultInArg0: true},
@@ -1698,14 +1690,6 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VPCMPEQQ512load", argLength: 3, reg: w2kload, asm: "VPCMPEQQ", commutative: false, typ: "Mask", aux: "SymOff", symEffect: "Read", resultInArg0: false},
 		{name: "VPCMPGTD512load", argLength: 3, reg: w2kload, asm: "VPCMPGTD", commutative: false, typ: "Mask", aux: "SymOff", symEffect: "Read", resultInArg0: false},
 		{name: "VPCMPGTQ512load", argLength: 3, reg: w2kload, asm: "VPCMPGTQ", commutative: false, typ: "Mask", aux: "SymOff", symEffect: "Read", resultInArg0: false},
-		{name: "VPDPBUSD512load", argLength: 4, reg: w31load, asm: "VPDPBUSD", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: true},
-		{name: "VPDPBUSDMasked128load", argLength: 5, reg: w3kwload, asm: "VPDPBUSD", commutative: false, typ: "Vec128", aux: "SymOff", symEffect: "Read", resultInArg0: true},
-		{name: "VPDPBUSDMasked256load", argLength: 5, reg: w3kwload, asm: "VPDPBUSD", commutative: false, typ: "Vec256", aux: "SymOff", symEffect: "Read", resultInArg0: true},
-		{name: "VPDPBUSDMasked512load", argLength: 5, reg: w3kwload, asm: "VPDPBUSD", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: true},
-		{name: "VPDPBUSDS512load", argLength: 4, reg: w31load, asm: "VPDPBUSDS", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: true},
-		{name: "VPDPBUSDSMasked128load", argLength: 5, reg: w3kwload, asm: "VPDPBUSDS", commutative: false, typ: "Vec128", aux: "SymOff", symEffect: "Read", resultInArg0: true},
-		{name: "VPDPBUSDSMasked256load", argLength: 5, reg: w3kwload, asm: "VPDPBUSDS", commutative: false, typ: "Vec256", aux: "SymOff", symEffect: "Read", resultInArg0: true},
-		{name: "VPDPBUSDSMasked512load", argLength: 5, reg: w3kwload, asm: "VPDPBUSDS", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: true},
 		{name: "VPDPWSSD512load", argLength: 4, reg: w31load, asm: "VPDPWSSD", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: true},
 		{name: "VPDPWSSDMasked128load", argLength: 5, reg: w3kwload, asm: "VPDPWSSD", commutative: false, typ: "Vec128", aux: "SymOff", symEffect: "Read", resultInArg0: true},
 		{name: "VPDPWSSDMasked256load", argLength: 5, reg: w3kwload, asm: "VPDPWSSD", commutative: false, typ: "Vec256", aux: "SymOff", symEffect: "Read", resultInArg0: true},
@@ -2382,15 +2366,23 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VPMOVSXWQMasked128Merging", argLength: 3, reg: w2kw, asm: "VPMOVSXWQ", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VPMOVSXWQMasked256Merging", argLength: 3, reg: w2kw, asm: "VPMOVSXWQ", commutative: false, typ: "Vec256", resultInArg0: true},
 		{name: "VPMOVSXWQMasked512Merging", argLength: 3, reg: w2kw, asm: "VPMOVSXWQ", commutative: false, typ: "Vec512", resultInArg0: true},
+		{name: "VPMOVUSDBMasked128_128Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSDB", commutative: false, typ: "Vec128", resultInArg0: true},
+		{name: "VPMOVUSDBMasked128_256Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSDB", commutative: false, typ: "Vec128", resultInArg0: true},
+		{name: "VPMOVUSDBMasked128_512Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSDB", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VPMOVUSDWMasked128_128Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSDW", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VPMOVUSDWMasked128_256Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSDW", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VPMOVUSDWMasked256Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSDW", commutative: false, typ: "Vec256", resultInArg0: true},
+		{name: "VPMOVUSQBMasked128_128Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSQB", commutative: false, typ: "Vec128", resultInArg0: true},
+		{name: "VPMOVUSQBMasked128_256Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSQB", commutative: false, typ: "Vec128", resultInArg0: true},
+		{name: "VPMOVUSQBMasked128_512Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSQB", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VPMOVUSQDMasked128_128Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSQD", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VPMOVUSQDMasked128_256Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSQD", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VPMOVUSQDMasked256Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSQD", commutative: false, typ: "Vec256", resultInArg0: true},
 		{name: "VPMOVUSQWMasked128_128Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSQW", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VPMOVUSQWMasked128_256Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSQW", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VPMOVUSQWMasked128_512Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSQW", commutative: false, typ: "Vec128", resultInArg0: true},
+		{name: "VPMOVUSWBMasked128_128Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSWB", commutative: false, typ: "Vec128", resultInArg0: true},
+		{name: "VPMOVUSWBMasked128_256Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSWB", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VPMOVUSWBMasked256Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSWB", commutative: false, typ: "Vec256", resultInArg0: true},
 		{name: "VPMOVWBMasked128_128Merging", argLength: 3, reg: w2kw, asm: "VPMOVWB", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VPMOVWBMasked128_256Merging", argLength: 3, reg: w2kw, asm: "VPMOVWB", commutative: false, typ: "Vec128", resultInArg0: true},
--- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@@ -1,4 +1,4 @@
-// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT.
+// Code generated by 'simdgen -o godefs -goroot $GOROOT -xedPath $XED_PATH go.yaml types.yaml categories.yaml'; DO NOT EDIT.

 package main

@@ -48,19 +48,19 @@ func simdGenericOps() []opData {
 		{name: "AddInt64x4", argLength: 2, commutative: true},
 		{name: "AddInt64x8", argLength: 2, commutative: true},
 		{name: "AddPairsFloat32x4", argLength: 2, commutative: false},
-		{name: "AddPairsFloat32x8", argLength: 2, commutative: false},
 		{name: "AddPairsFloat64x2", argLength: 2, commutative: false},
-		{name: "AddPairsFloat64x4", argLength: 2, commutative: false},
+		{name: "AddPairsGroupedFloat32x8", argLength: 2, commutative: false},
+		{name: "AddPairsGroupedFloat64x4", argLength: 2, commutative: false},
+		{name: "AddPairsGroupedInt16x16", argLength: 2, commutative: false},
+		{name: "AddPairsGroupedInt32x8", argLength: 2, commutative: false},
+		{name: "AddPairsGroupedUint16x16", argLength: 2, commutative: false},
+		{name: "AddPairsGroupedUint32x8", argLength: 2, commutative: false},
 		{name: "AddPairsInt16x8", argLength: 2, commutative: false},
-		{name: "AddPairsInt16x16", argLength: 2, commutative: false},
 		{name: "AddPairsInt32x4", argLength: 2, commutative: false},
-		{name: "AddPairsInt32x8", argLength: 2, commutative: false},
+		{name: "AddPairsSaturatedGroupedInt16x16", argLength: 2, commutative: false},
 		{name: "AddPairsSaturatedInt16x8", argLength: 2, commutative: false},
-		{name: "AddPairsSaturatedInt16x16", argLength: 2, commutative: false},
 		{name: "AddPairsUint16x8", argLength: 2, commutative: false},
-		{name: "AddPairsUint16x16", argLength: 2, commutative: false},
 		{name: "AddPairsUint32x4", argLength: 2, commutative: false},
-		{name: "AddPairsUint32x8", argLength: 2, commutative: false},
 		{name: "AddSaturatedInt8x16", argLength: 2, commutative: true},
 		{name: "AddSaturatedInt8x32", argLength: 2, commutative: true},
 		{name: "AddSaturatedInt8x64", argLength: 2, commutative: true},
@@ -304,12 +304,6 @@ func simdGenericOps() []opData {
 		{name: "DotProductPairsSaturatedUint8x16", argLength: 2, commutative: false},
 		{name: "DotProductPairsSaturatedUint8x32", argLength: 2, commutative: false},
 		{name: "DotProductPairsSaturatedUint8x64", argLength: 2, commutative: false},
-		{name: "DotProductQuadrupleInt32x4", argLength: 3, commutative: false},
-		{name: "DotProductQuadrupleInt32x8", argLength: 3, commutative: false},
-		{name: "DotProductQuadrupleInt32x16", argLength: 3, commutative: false},
-		{name: "DotProductQuadrupleSaturatedInt32x4", argLength: 3, commutative: false},
-		{name: "DotProductQuadrupleSaturatedInt32x8", argLength: 3, commutative: false},
-		{name: "DotProductQuadrupleSaturatedInt32x16", argLength: 3, commutative: false},
 		{name: "EqualFloat32x4", argLength: 2, commutative: true},
 		{name: "EqualFloat32x8", argLength: 2, commutative: true},
 		{name: "EqualFloat32x16", argLength: 2, commutative: true},
@@ -370,26 +364,26 @@ func simdGenericOps() []opData {
 		{name: "ExpandUint64x2", argLength: 2, commutative: false},
 		{name: "ExpandUint64x4", argLength: 2, commutative: false},
 		{name: "ExpandUint64x8", argLength: 2, commutative: false},
-		{name: "ExtendLo2ToInt64x2Int8x16", argLength: 1, commutative: false},
-		{name: "ExtendLo2ToInt64x2Int16x8", argLength: 1, commutative: false},
-		{name: "ExtendLo2ToInt64x2Int32x4", argLength: 1, commutative: false},
-		{name: "ExtendLo2ToUint64x2Uint8x16", argLength: 1, commutative: false},
-		{name: "ExtendLo2ToUint64x2Uint16x8", argLength: 1, commutative: false},
-		{name: "ExtendLo2ToUint64x2Uint32x4", argLength: 1, commutative: false},
-		{name: "ExtendLo4ToInt32x4Int8x16", argLength: 1, commutative: false},
-		{name: "ExtendLo4ToInt32x4Int16x8", argLength: 1, commutative: false},
-		{name: "ExtendLo4ToInt64x4Int8x16", argLength: 1, commutative: false},
-		{name: "ExtendLo4ToInt64x4Int16x8", argLength: 1, commutative: false},
-		{name: "ExtendLo4ToUint32x4Uint8x16", argLength: 1, commutative: false},
-		{name: "ExtendLo4ToUint32x4Uint16x8", argLength: 1, commutative: false},
-		{name: "ExtendLo4ToUint64x4Uint8x16", argLength: 1, commutative: false},
-		{name: "ExtendLo4ToUint64x4Uint16x8", argLength: 1, commutative: false},
-		{name: "ExtendLo8ToInt16x8Int8x16", argLength: 1, commutative: false},
-		{name: "ExtendLo8ToInt32x8Int8x16", argLength: 1, commutative: false},
-		{name: "ExtendLo8ToInt64x8Int8x16", argLength: 1, commutative: false},
-		{name: "ExtendLo8ToUint16x8Uint8x16", argLength: 1, commutative: false},
-		{name: "ExtendLo8ToUint32x8Uint8x16", argLength: 1, commutative: false},
-		{name: "ExtendLo8ToUint64x8Uint8x16", argLength: 1, commutative: false},
+		{name: "ExtendLo2ToInt64Int8x16", argLength: 1, commutative: false},
+		{name: "ExtendLo2ToInt64Int16x8", argLength: 1, commutative: false},
+		{name: "ExtendLo2ToInt64Int32x4", argLength: 1, commutative: false},
+		{name: "ExtendLo2ToUint64Uint8x16", argLength: 1, commutative: false},
+		{name: "ExtendLo2ToUint64Uint16x8", argLength: 1, commutative: false},
+		{name: "ExtendLo2ToUint64Uint32x4", argLength: 1, commutative: false},
+		{name: "ExtendLo4ToInt32Int8x16", argLength: 1, commutative: false},
+		{name: "ExtendLo4ToInt32Int16x8", argLength: 1, commutative: false},
+		{name: "ExtendLo4ToInt64Int8x16", argLength: 1, commutative: false},
+		{name: "ExtendLo4ToInt64Int16x8", argLength: 1, commutative: false},
+		{name: "ExtendLo4ToUint32Uint8x16", argLength: 1, commutative: false},
+		{name: "ExtendLo4ToUint32Uint16x8", argLength: 1, commutative: false},
+		{name: "ExtendLo4ToUint64Uint8x16", argLength: 1, commutative: false},
+		{name: "ExtendLo4ToUint64Uint16x8", argLength: 1, commutative: false},
+		{name: "ExtendLo8ToInt16Int8x16", argLength: 1, commutative: false},
+		{name: "ExtendLo8ToInt32Int8x16", argLength: 1, commutative: false},
+		{name: "ExtendLo8ToInt64Int8x16", argLength: 1, commutative: false},
+		{name: "ExtendLo8ToUint16Uint8x16", argLength: 1, commutative: false},
+		{name: "ExtendLo8ToUint32Uint8x16", argLength: 1, commutative: false},
+		{name: "ExtendLo8ToUint64Uint8x16", argLength: 1, commutative: false},
 		{name: "ExtendToInt16Int8x16", argLength: 1, commutative: false},
 		{name: "ExtendToInt16Int8x32", argLength: 1, commutative: false},
 		{name: "ExtendToInt32Int8x16", argLength: 1, commutative: false},
@@ -525,12 +519,6 @@ func simdGenericOps() []opData {
 		{name: "InterleaveLoUint16x8", argLength: 2, commutative: false},
 		{name: "InterleaveLoUint32x4", argLength: 2, commutative: false},
 		{name: "InterleaveLoUint64x2", argLength: 2, commutative: false},
-		{name: "IsNanFloat32x4", argLength: 2, commutative: true},
-		{name: "IsNanFloat32x8", argLength: 2, commutative: true},
-		{name: "IsNanFloat32x16", argLength: 2, commutative: true},
-		{name: "IsNanFloat64x2", argLength: 2, commutative: true},
-		{name: "IsNanFloat64x4", argLength: 2, commutative: true},
-		{name: "IsNanFloat64x8", argLength: 2, commutative: true},
 		{name: "LeadingZerosInt32x4", argLength: 1, commutative: false},
 		{name: "LeadingZerosInt32x8", argLength: 1, commutative: false},
 		{name: "LeadingZerosInt32x16", argLength: 1, commutative: false},
@@ -830,9 +818,9 @@ func simdGenericOps() []opData {
 		{name: "SaturateToInt8Int64x2", argLength: 1, commutative: false},
 		{name: "SaturateToInt8Int64x4", argLength: 1, commutative: false},
 		{name: "SaturateToInt8Int64x8", argLength: 1, commutative: false},
+		{name: "SaturateToInt16ConcatGroupedInt32x8", argLength: 2, commutative: false},
+		{name: "SaturateToInt16ConcatGroupedInt32x16", argLength: 2, commutative: false},
 		{name: "SaturateToInt16ConcatInt32x4", argLength: 2, commutative: false},
-		{name: "SaturateToInt16ConcatInt32x8", argLength: 2, commutative: false},
-		{name: "SaturateToInt16ConcatInt32x16", argLength: 2, commutative: false},
 		{name: "SaturateToInt16Int32x4", argLength: 1, commutative: false},
 		{name: "SaturateToInt16Int32x8", argLength: 1, commutative: false},
 		{name: "SaturateToInt16Int32x16", argLength: 1, commutative: false},
@@ -842,18 +830,18 @@ func simdGenericOps() []opData {
 		{name: "SaturateToInt32Int64x2", argLength: 1, commutative: false},
 		{name: "SaturateToInt32Int64x4", argLength: 1, commutative: false},
 		{name: "SaturateToInt32Int64x8", argLength: 1, commutative: false},
-		{name: "SaturateToUint8Int16x8", argLength: 1, commutative: false},
-		{name: "SaturateToUint8Int16x16", argLength: 1, commutative: false},
-		{name: "SaturateToUint8Int32x4", argLength: 1, commutative: false},
-		{name: "SaturateToUint8Int32x8", argLength: 1, commutative: false},
-		{name: "SaturateToUint8Int32x16", argLength: 1, commutative: false},
-		{name: "SaturateToUint8Int64x2", argLength: 1, commutative: false},
-		{name: "SaturateToUint8Int64x4", argLength: 1, commutative: false},
-		{name: "SaturateToUint8Int64x8", argLength: 1, commutative: false},
+		{name: "SaturateToUint8Uint16x8", argLength: 1, commutative: false},
+		{name: "SaturateToUint8Uint16x16", argLength: 1, commutative: false},
 		{name: "SaturateToUint8Uint16x32", argLength: 1, commutative: false},
-		{name: "SaturateToUint16ConcatUint32x4", argLength: 2, commutative: false},
-		{name: "SaturateToUint16ConcatUint32x8", argLength: 2, commutative: false},
-		{name: "SaturateToUint16ConcatUint32x16", argLength: 2, commutative: false},
+		{name: "SaturateToUint8Uint32x4", argLength: 1, commutative: false},
+		{name: "SaturateToUint8Uint32x8", argLength: 1, commutative: false},
+		{name: "SaturateToUint8Uint32x16", argLength: 1, commutative: false},
+		{name: "SaturateToUint8Uint64x2", argLength: 1, commutative: false},
+		{name: "SaturateToUint8Uint64x4", argLength: 1, commutative: false},
+		{name: "SaturateToUint8Uint64x8", argLength: 1, commutative: false},
+		{name: "SaturateToUint16ConcatGroupedInt32x8", argLength: 2, commutative: false},
+		{name: "SaturateToUint16ConcatGroupedInt32x16", argLength: 2, commutative: false},
+		{name: "SaturateToUint16ConcatInt32x4", argLength: 2, commutative: false},
 		{name: "SaturateToUint16Uint32x4", argLength: 1, commutative: false},
 		{name: "SaturateToUint16Uint32x8", argLength: 1, commutative: false},
 		{name: "SaturateToUint16Uint32x16", argLength: 1, commutative: false},
@@ -1042,19 +1030,19 @@ func simdGenericOps() []opData {
 		{name: "SubInt64x4", argLength: 2, commutative: false},
 		{name: "SubInt64x8", argLength: 2, commutative: false},
 		{name: "SubPairsFloat32x4", argLength: 2, commutative: false},
-		{name: "SubPairsFloat32x8", argLength: 2, commutative: false},
 		{name: "SubPairsFloat64x2", argLength: 2, commutative: false},
-		{name: "SubPairsFloat64x4", argLength: 2, commutative: false},
+		{name: "SubPairsGroupedFloat32x8", argLength: 2, commutative: false},
+		{name: "SubPairsGroupedFloat64x4", argLength: 2, commutative: false},
+		{name: "SubPairsGroupedInt16x16", argLength: 2, commutative: false},
+		{name: "SubPairsGroupedInt32x8", argLength: 2, commutative: false},
+		{name: "SubPairsGroupedUint16x16", argLength: 2, commutative: false},
+		{name: "SubPairsGroupedUint32x8", argLength: 2, commutative: false},
 		{name: "SubPairsInt16x8", argLength: 2, commutative: false},
-		{name: "SubPairsInt16x16", argLength: 2, commutative: false},
 		{name: "SubPairsInt32x4", argLength: 2, commutative: false},
-		{name: "SubPairsInt32x8", argLength: 2, commutative: false},
+		{name: "SubPairsSaturatedGroupedInt16x16", argLength: 2, commutative: false},
 		{name: "SubPairsSaturatedInt16x8", argLength: 2, commutative: false},
-		{name: "SubPairsSaturatedInt16x16", argLength: 2, commutative: false},
 		{name: "SubPairsUint16x8", argLength: 2, commutative: false},
-		{name: "SubPairsUint16x16", argLength: 2, commutative: false},
 		{name: "SubPairsUint32x4", argLength: 2, commutative: false},
-		{name: "SubPairsUint32x8", argLength: 2, commutative: false},
 		{name: "SubSaturatedInt8x16", argLength: 2, commutative: false},
 		{name: "SubSaturatedInt8x32", argLength: 2, commutative: false},
 		{name: "SubSaturatedInt8x64", argLength: 2, commutative: false},
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
--- a/src/cmd/compile/internal/ssa/sccp.go
+++ b/src/cmd/compile/internal/ssa/sccp.go
@@ -507,6 +507,10 @@ func (t *worklist) propagate(block *Block) {
 				branchIdx = 1 - condLattice.val.AuxInt
 			} else {
 				branchIdx = condLattice.val.AuxInt
+				if branchIdx < 0 || branchIdx >= int64(len(block.Succs)) {
+					// branchIdx is not a valid index into block.Succs, so this path is unreachable; do nothing
+					break
+				}
 			}
 			t.edges = append(t.edges, block.Succs[branchIdx])
 		} else {
--- a/src/cmd/compile/internal/ssa/tern_helpers.go
+++ b/src/cmd/compile/internal/ssa/tern_helpers.go
@@ -1,4 +1,4 @@
-// Code generated by 'go run genfiles.go'; DO NOT EDIT.
+// Code generated by 'tmplgen'; DO NOT EDIT.

 package ssa

--- a/src/cmd/compile/internal/ssagen/intrinsics.go
+++ b/src/cmd/compile/internal/ssagen/intrinsics.go
@@ -1667,6 +1667,12 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
 		addF(simdPackage, "Uint16x16.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
 		addF(simdPackage, "Uint32x8.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
 		addF(simdPackage, "Uint64x4.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
+		addF(simdPackage, "Float32x4.IsNaN", opLen1(ssa.OpIsNaNFloat32x4, types.TypeVec128), sys.AMD64)
+		addF(simdPackage, "Float32x8.IsNaN", opLen1(ssa.OpIsNaNFloat32x8, types.TypeVec256), sys.AMD64)
+		addF(simdPackage, "Float32x16.IsNaN", opLen1(ssa.OpIsNaNFloat32x16, types.TypeVec512), sys.AMD64)
+		addF(simdPackage, "Float64x2.IsNaN", opLen1(ssa.OpIsNaNFloat64x2, types.TypeVec128), sys.AMD64)
+		addF(simdPackage, "Float64x4.IsNaN", opLen1(ssa.OpIsNaNFloat64x4, types.TypeVec256), sys.AMD64)
+		addF(simdPackage, "Float64x8.IsNaN", opLen1(ssa.OpIsNaNFloat64x8, types.TypeVec512), sys.AMD64)

 		// sfp4 is intrinsic-if-constant, but otherwise it's complicated enough to just implement in Go.
 		sfp4 := func(method string, hwop ssa.Op, vectype *types.Type) {
--- a/src/cmd/compile/internal/ssagen/simdintrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go
@@ -1,4 +1,4 @@
-// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT.
+// Code generated by 'simdgen -o godefs -goroot $GOROOT -xedPath $XED_PATH go.yaml types.yaml categories.yaml'; DO NOT EDIT.

 package ssagen

@@ -69,19 +69,19 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 	addF(simdPackage, "Uint64x4.Add", opLen2(ssa.OpAddUint64x4, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint64x8.Add", opLen2(ssa.OpAddUint64x8, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Float32x4.AddPairs", opLen2(ssa.OpAddPairsFloat32x4, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Float32x8.AddPairs", opLen2(ssa.OpAddPairsFloat32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Float64x2.AddPairs", opLen2(ssa.OpAddPairsFloat64x2, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Float64x4.AddPairs", opLen2(ssa.OpAddPairsFloat64x4, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int16x8.AddPairs", opLen2(ssa.OpAddPairsInt16x8, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int16x16.AddPairs", opLen2(ssa.OpAddPairsInt16x16, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int32x4.AddPairs", opLen2(ssa.OpAddPairsInt32x4, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int32x8.AddPairs", opLen2(ssa.OpAddPairsInt32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint16x8.AddPairs", opLen2(ssa.OpAddPairsUint16x8, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Uint16x16.AddPairs", opLen2(ssa.OpAddPairsUint16x16, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint32x4.AddPairs", opLen2(ssa.OpAddPairsUint32x4, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Uint32x8.AddPairs", opLen2(ssa.OpAddPairsUint32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Float32x8.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedFloat32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Float64x4.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedFloat64x4, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int16x16.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedInt16x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int32x8.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedInt32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint16x16.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedUint16x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint32x8.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedUint32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int16x8.AddPairsSaturated", opLen2(ssa.OpAddPairsSaturatedInt16x8, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int16x16.AddPairsSaturated", opLen2(ssa.OpAddPairsSaturatedInt16x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int16x16.AddPairsSaturatedGrouped", opLen2(ssa.OpAddPairsSaturatedGroupedInt16x16, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int8x16.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x16, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int8x32.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x32, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int8x64.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x64, types.TypeVec512), sys.AMD64)
@@ -328,12 +328,6 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 	addF(simdPackage, "Uint8x16.DotProductPairsSaturated", opLen2(ssa.OpDotProductPairsSaturatedUint8x16, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint8x32.DotProductPairsSaturated", opLen2(ssa.OpDotProductPairsSaturatedUint8x32, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint8x64.DotProductPairsSaturated", opLen2(ssa.OpDotProductPairsSaturatedUint8x64, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Int8x16.DotProductQuadruple", opLen3_31Zero3(ssa.OpDotProductQuadrupleInt32x4, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int8x32.DotProductQuadruple", opLen3_31Zero3(ssa.OpDotProductQuadrupleInt32x8, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Int8x64.DotProductQuadruple", opLen3_31Zero3(ssa.OpDotProductQuadrupleInt32x16, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Int8x16.DotProductQuadrupleSaturated", opLen3_31Zero3(ssa.OpDotProductQuadrupleSaturatedInt32x4, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int8x32.DotProductQuadrupleSaturated", opLen3_31Zero3(ssa.OpDotProductQuadrupleSaturatedInt32x8, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Int8x64.DotProductQuadrupleSaturated", opLen3_31Zero3(ssa.OpDotProductQuadrupleSaturatedInt32x16, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Int8x16.Equal", opLen2(ssa.OpEqualInt8x16, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int8x32.Equal", opLen2(ssa.OpEqualInt8x32, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int8x64.Equal", opLen2(ssa.OpEqualInt8x64, types.TypeVec512), sys.AMD64)
@@ -394,26 +388,26 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 	addF(simdPackage, "Uint64x2.Expand", opLen2(ssa.OpExpandUint64x2, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint64x4.Expand", opLen2(ssa.OpExpandUint64x4, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint64x8.Expand", opLen2(ssa.OpExpandUint64x8, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Int8x16.ExtendLo2ToInt64x2", opLen1(ssa.OpExtendLo2ToInt64x2Int8x16, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int16x8.ExtendLo2ToInt64x2", opLen1(ssa.OpExtendLo2ToInt64x2Int16x8, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int32x4.ExtendLo2ToInt64x2", opLen1(ssa.OpExtendLo2ToInt64x2Int32x4, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Uint8x16.ExtendLo2ToUint64x2", opLen1(ssa.OpExtendLo2ToUint64x2Uint8x16, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Uint16x8.ExtendLo2ToUint64x2", opLen1(ssa.OpExtendLo2ToUint64x2Uint16x8, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Uint32x4.ExtendLo2ToUint64x2", opLen1(ssa.OpExtendLo2ToUint64x2Uint32x4, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int8x16.ExtendLo4ToInt32x4", opLen1(ssa.OpExtendLo4ToInt32x4Int8x16, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int16x8.ExtendLo4ToInt32x4", opLen1(ssa.OpExtendLo4ToInt32x4Int16x8, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int8x16.ExtendLo4ToInt64x4", opLen1(ssa.OpExtendLo4ToInt64x4Int8x16, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Int16x8.ExtendLo4ToInt64x4", opLen1(ssa.OpExtendLo4ToInt64x4Int16x8, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Uint8x16.ExtendLo4ToUint32x4", opLen1(ssa.OpExtendLo4ToUint32x4Uint8x16, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Uint16x8.ExtendLo4ToUint32x4", opLen1(ssa.OpExtendLo4ToUint32x4Uint16x8, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Uint8x16.ExtendLo4ToUint64x4", opLen1(ssa.OpExtendLo4ToUint64x4Uint8x16, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Uint16x8.ExtendLo4ToUint64x4", opLen1(ssa.OpExtendLo4ToUint64x4Uint16x8, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Int8x16.ExtendLo8ToInt16x8", opLen1(ssa.OpExtendLo8ToInt16x8Int8x16, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int8x16.ExtendLo8ToInt32x8", opLen1(ssa.OpExtendLo8ToInt32x8Int8x16, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Int8x16.ExtendLo8ToInt64x8", opLen1(ssa.OpExtendLo8ToInt64x8Int8x16, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Uint8x16.ExtendLo8ToUint16x8", opLen1(ssa.OpExtendLo8ToUint16x8Uint8x16, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Uint8x16.ExtendLo8ToUint32x8", opLen1(ssa.OpExtendLo8ToUint32x8Uint8x16, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Uint8x16.ExtendLo8ToUint64x8", opLen1(ssa.OpExtendLo8ToUint64x8Uint8x16, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int8x16.ExtendLo2ToInt64", opLen1(ssa.OpExtendLo2ToInt64Int8x16, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int16x8.ExtendLo2ToInt64", opLen1(ssa.OpExtendLo2ToInt64Int16x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int32x4.ExtendLo2ToInt64", opLen1(ssa.OpExtendLo2ToInt64Int32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint8x16.ExtendLo2ToUint64", opLen1(ssa.OpExtendLo2ToUint64Uint8x16, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint16x8.ExtendLo2ToUint64", opLen1(ssa.OpExtendLo2ToUint64Uint16x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint32x4.ExtendLo2ToUint64", opLen1(ssa.OpExtendLo2ToUint64Uint32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int8x16.ExtendLo4ToInt32", opLen1(ssa.OpExtendLo4ToInt32Int8x16, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int16x8.ExtendLo4ToInt32", opLen1(ssa.OpExtendLo4ToInt32Int16x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int8x16.ExtendLo4ToInt64", opLen1(ssa.OpExtendLo4ToInt64Int8x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int16x8.ExtendLo4ToInt64", opLen1(ssa.OpExtendLo4ToInt64Int16x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint8x16.ExtendLo4ToUint32", opLen1(ssa.OpExtendLo4ToUint32Uint8x16, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint16x8.ExtendLo4ToUint32", opLen1(ssa.OpExtendLo4ToUint32Uint16x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint8x16.ExtendLo4ToUint64", opLen1(ssa.OpExtendLo4ToUint64Uint8x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint16x8.ExtendLo4ToUint64", opLen1(ssa.OpExtendLo4ToUint64Uint16x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int8x16.ExtendLo8ToInt16", opLen1(ssa.OpExtendLo8ToInt16Int8x16, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int8x16.ExtendLo8ToInt32", opLen1(ssa.OpExtendLo8ToInt32Int8x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int8x16.ExtendLo8ToInt64", opLen1(ssa.OpExtendLo8ToInt64Int8x16, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint8x16.ExtendLo8ToUint16", opLen1(ssa.OpExtendLo8ToUint16Uint8x16, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint8x16.ExtendLo8ToUint32", opLen1(ssa.OpExtendLo8ToUint32Uint8x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint8x16.ExtendLo8ToUint64", opLen1(ssa.OpExtendLo8ToUint64Uint8x16, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Int8x16.ExtendToInt16", opLen1(ssa.OpExtendToInt16Int8x16, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int8x32.ExtendToInt16", opLen1(ssa.OpExtendToInt16Int8x32, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Int8x16.ExtendToInt32", opLen1(ssa.OpExtendToInt32Int8x16, types.TypeVec512), sys.AMD64)
@@ -577,12 +571,6 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 	addF(simdPackage, "Uint32x16.InterleaveLoGrouped", opLen2(ssa.OpInterleaveLoGroupedUint32x16, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Uint64x4.InterleaveLoGrouped", opLen2(ssa.OpInterleaveLoGroupedUint64x4, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint64x8.InterleaveLoGrouped", opLen2(ssa.OpInterleaveLoGroupedUint64x8, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Float32x4.IsNan", opLen2(ssa.OpIsNanFloat32x4, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Float32x8.IsNan", opLen2(ssa.OpIsNanFloat32x8, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Float32x16.IsNan", opLen2(ssa.OpIsNanFloat32x16, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Float64x2.IsNan", opLen2(ssa.OpIsNanFloat64x2, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Float64x4.IsNan", opLen2(ssa.OpIsNanFloat64x4, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Float64x8.IsNan", opLen2(ssa.OpIsNanFloat64x8, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Int32x4.LeadingZeros", opLen1(ssa.OpLeadingZerosInt32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int32x8.LeadingZeros", opLen1(ssa.OpLeadingZerosInt32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int32x16.LeadingZeros", opLen1(ssa.OpLeadingZerosInt32x16, types.TypeVec512), sys.AMD64)
@@ -926,29 +914,29 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 	addF(simdPackage, "Int64x4.SaturateToInt16", opLen1(ssa.OpSaturateToInt16Int64x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int64x8.SaturateToInt16", opLen1(ssa.OpSaturateToInt16Int64x8, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int32x4.SaturateToInt16Concat", opLen2(ssa.OpSaturateToInt16ConcatInt32x4, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int32x8.SaturateToInt16Concat", opLen2(ssa.OpSaturateToInt16ConcatInt32x8, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Int32x16.SaturateToInt16Concat", opLen2(ssa.OpSaturateToInt16ConcatInt32x16, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int32x8.SaturateToInt16ConcatGrouped", opLen2(ssa.OpSaturateToInt16ConcatGroupedInt32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int32x16.SaturateToInt16ConcatGrouped", opLen2(ssa.OpSaturateToInt16ConcatGroupedInt32x16, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Int64x2.SaturateToInt32", opLen1(ssa.OpSaturateToInt32Int64x2, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int64x4.SaturateToInt32", opLen1(ssa.OpSaturateToInt32Int64x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int64x8.SaturateToInt32", opLen1(ssa.OpSaturateToInt32Int64x8, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Int16x8.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Int16x8, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int16x16.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Int16x16, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int32x4.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Int32x4, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int32x8.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Int32x8, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int32x16.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Int32x16, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int64x2.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Int64x2, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int64x4.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Int64x4, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int64x8.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Int64x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint16x8.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Uint16x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint16x16.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Uint16x16, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint16x32.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Uint16x32, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint32x4.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Uint32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint32x8.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Uint32x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint32x16.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Uint32x16, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint64x2.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Uint64x2, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint64x4.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Uint64x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint64x8.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Uint64x8, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint32x4.SaturateToUint16", opLen1(ssa.OpSaturateToUint16Uint32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint32x8.SaturateToUint16", opLen1(ssa.OpSaturateToUint16Uint32x8, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint32x16.SaturateToUint16", opLen1(ssa.OpSaturateToUint16Uint32x16, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint64x2.SaturateToUint16", opLen1(ssa.OpSaturateToUint16Uint64x2, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint64x4.SaturateToUint16", opLen1(ssa.OpSaturateToUint16Uint64x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint64x8.SaturateToUint16", opLen1(ssa.OpSaturateToUint16Uint64x8, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Uint32x4.SaturateToUint16Concat", opLen2(ssa.OpSaturateToUint16ConcatUint32x4, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Uint32x8.SaturateToUint16Concat", opLen2(ssa.OpSaturateToUint16ConcatUint32x8, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Uint32x16.SaturateToUint16Concat", opLen2(ssa.OpSaturateToUint16ConcatUint32x16, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int32x4.SaturateToUint16Concat", opLen2(ssa.OpSaturateToUint16ConcatInt32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int32x8.SaturateToUint16ConcatGrouped", opLen2(ssa.OpSaturateToUint16ConcatGroupedInt32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int32x16.SaturateToUint16ConcatGrouped", opLen2(ssa.OpSaturateToUint16ConcatGroupedInt32x16, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Uint64x2.SaturateToUint32", opLen1(ssa.OpSaturateToUint32Uint64x2, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint64x4.SaturateToUint32", opLen1(ssa.OpSaturateToUint32Uint64x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint64x8.SaturateToUint32", opLen1(ssa.OpSaturateToUint32Uint64x8, types.TypeVec256), sys.AMD64)
@@ -1199,19 +1187,19 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 	addF(simdPackage, "Uint64x4.Sub", opLen2(ssa.OpSubUint64x4, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint64x8.Sub", opLen2(ssa.OpSubUint64x8, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Float32x4.SubPairs", opLen2(ssa.OpSubPairsFloat32x4, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Float32x8.SubPairs", opLen2(ssa.OpSubPairsFloat32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Float64x2.SubPairs", opLen2(ssa.OpSubPairsFloat64x2, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Float64x4.SubPairs", opLen2(ssa.OpSubPairsFloat64x4, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int16x8.SubPairs", opLen2(ssa.OpSubPairsInt16x8, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int16x16.SubPairs", opLen2(ssa.OpSubPairsInt16x16, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int32x4.SubPairs", opLen2(ssa.OpSubPairsInt32x4, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int32x8.SubPairs", opLen2(ssa.OpSubPairsInt32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint16x8.SubPairs", opLen2(ssa.OpSubPairsUint16x8, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Uint16x16.SubPairs", opLen2(ssa.OpSubPairsUint16x16, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint32x4.SubPairs", opLen2(ssa.OpSubPairsUint32x4, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Uint32x8.SubPairs", opLen2(ssa.OpSubPairsUint32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Float32x8.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedFloat32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Float64x4.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedFloat64x4, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int16x16.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedInt16x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int32x8.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedInt32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint16x16.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedUint16x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint32x8.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedUint32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int16x8.SubPairsSaturated", opLen2(ssa.OpSubPairsSaturatedInt16x8, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int16x16.SubPairsSaturated", opLen2(ssa.OpSubPairsSaturatedInt16x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int16x16.SubPairsSaturatedGrouped", opLen2(ssa.OpSubPairsSaturatedGroupedInt16x16, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int8x16.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x16, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int8x32.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x32, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int8x64.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x64, types.TypeVec512), sys.AMD64)
--- a/src/cmd/compile/testdata/script/issue77033.txt
+++ b/src/cmd/compile/testdata/script/issue77033.txt
@@ -0,0 +1,40 @@
+go test -bench=Foo -cpuprofile=default.pgo
+go test -bench=Foo -pgo=default.pgo
+! stdout 'FAIL'
+
+-- main_test.go --
+package main
+
+import (
+	"testing"
+)
+
+var a int
+
+func save(x int) {
+	a = x
+}
+
+func foo() {
+	for i := range yield1 {
+		defer save(i)
+	}
+}
+
+func yield1(yield func(int) bool) {
+	yield(1)
+}
+
+func BenchmarkFoo(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		foo()
+	}
+	if a != 1 {
+		b.Fatalf("a = %d; want 1", a)
+	}
+}
+
+-- go.mod --
+module demo
+
+go 1.24
--- a/src/cmd/dist/test.go
+++ b/src/cmd/dist/test.go
@@ -748,7 +748,7 @@ func (t *tester) registerTests() {
 	if !strings.Contains(goexperiment, "jsonv2") {
 		t.registerTest("GOEXPERIMENT=jsonv2 go test encoding/json/...", &goTest{
 			variant: "jsonv2",
-			env:     []string{"GOEXPERIMENT=jsonv2"},
+			env:     []string{"GOEXPERIMENT=" + goexperiments("jsonv2")},
 			pkg:     "encoding/json/...",
 		})
 	}
@@ -757,7 +757,7 @@ func (t *tester) registerTests() {
 	if !strings.Contains(goexperiment, "runtimesecret") {
 		t.registerTest("GOEXPERIMENT=runtimesecret go test runtime/secret/...", &goTest{
 			variant: "runtimesecret",
-			env:     []string{"GOEXPERIMENT=runtimesecret"},
+			env:     []string{"GOEXPERIMENT=" + goexperiments("runtimesecret")},
 			pkg:     "runtime/secret/...",
 		})
 	}
@@ -766,7 +766,7 @@ func (t *tester) registerTests() {
 	if goarch == "amd64" && !strings.Contains(goexperiment, "simd") {
 		t.registerTest("GOEXPERIMENT=simd go test simd/archsimd/...", &goTest{
 			variant: "simd",
-			env:     []string{"GOEXPERIMENT=simd"},
+			env:     []string{"GOEXPERIMENT=" + goexperiments("simd")},
 			pkg:     "simd/archsimd/...",
 		})
 	}
@@ -1888,3 +1888,19 @@ func fipsVersions(short bool) []string {
 	}
 	return versions
 }
+
+// goexperiments returns the GOEXPERIMENT value to use when running a
+// test with the given experiments enabled.
+//
+// Experiments already present in the package-level goexperiment value
+// are preserved and come first; exps follow, comma-separated. With no
+// arguments, goexperiment is returned unchanged.
+func goexperiments(exps ...string) string {
+	if len(exps) == 0 {
+		return goexperiment
+	}
+	if goexperiment == "" {
+		return strings.Join(exps, ",")
+	}
+	return goexperiment + "," + strings.Join(exps, ",")
+}
--- a/src/cmd/go/alldocs.go
+++ b/src/cmd/go/alldocs.go
@@ -1954,7 +1954,7 @@
 //
 //	-o file
 //	    Save a copy of the test binary to the named file.
-//	    The test still runs (unless -c or -i is specified).
+//	    The test still runs (unless -c is specified).
 //	    If file ends in a slash or names an existing directory,
 //	    the test is written to pkg.test in that directory.
 //
--- a/src/cmd/go/internal/doc/pkgsite.go
+++ b/src/cmd/go/internal/doc/pkgsite.go
@@ -71,7 +71,7 @@ func doPkgsite(urlPath, fragment string) error {
 		env = append(env, "GOPROXY="+gomodcache+","+goproxy)
 	}

-	const version = "v0.0.0-20250714212547-01b046e81fe7"
+	const version = "v0.0.0-20251223195805-1a3bd3c788fe"
 	cmd := exec.Command(goCmd(), "run", "golang.org/x/pkgsite/cmd/internal/doc@"+version,
 		"-gorepo", buildCtx.GOROOT,
 		"-http", addr,
--- a/src/cmd/go/internal/modindex/scan.go
+++ b/src/cmd/go/internal/modindex/scan.go
@@ -112,10 +112,10 @@ func parseErrorToString(err error) string {
 		return ""
 	}
 	var p parseError
-	if e, ok := err.(scanner.ErrorList); ok {
-		p.ErrorList = &e
+	if errlist, ok := err.(scanner.ErrorList); ok {
+		p.ErrorList = &errlist
 	} else {
-		p.ErrorString = e.Error()
+		p.ErrorString = err.Error()
 	}
 	s, err := json.Marshal(p)
 	if err != nil {
--- a/src/cmd/go/internal/test/test.go
+++ b/src/cmd/go/internal/test/test.go
@@ -163,7 +163,7 @@ In addition to the build flags, the flags handled by 'go test' itself are:

 	-o file
 	    Save a copy of the test binary to the named file.
-	    The test still runs (unless -c or -i is specified).
+	    The test still runs (unless -c is specified).
 	    If file ends in a slash or names an existing directory,
 	    the test is written to pkg.test in that directory.

--- a/src/cmd/go/testdata/script/list_empty_importpath.txt
+++ b/src/cmd/go/testdata/script/list_empty_importpath.txt
@@ -1,15 +1,6 @@
 ! go list all
 ! stderr 'panic'
-[!GOOS:windows] [!GOOS:solaris] [!GOOS:freebsd] [!GOOS:openbsd] [!GOOS:netbsd] stderr 'invalid import path'
-# #73976: Allow 'no errors' on Windows, Solaris, and BSD until issue
-# is resolved to prevent flakes. 'no errors' is printed by
-# empty scanner.ErrorList errors so that's probably where the
-# message is coming from, though we don't know how.
-[GOOS:windows] stderr 'invalid import path|no errors'
-[GOOS:solaris] stderr 'invalid import path|no errors'
-[GOOS:freebsd] stderr 'invalid import path|no errors'
-[GOOS:openbsd] stderr 'invalid import path|no errors'
-[GOOS:netbsd] stderr 'invalid import path|no errors'
+stderr 'invalid import path'

 # go list produces a package for 'p' but not for ''
 go list -e all
--- a/src/cmd/internal/bootstrap_test/overlaydir_test.go
+++ b/src/cmd/internal/bootstrap_test/overlaydir_test.go
@@ -43,6 +43,9 @@ func overlayDir(dstRoot, srcRoot string) error {
 		dstPath := filepath.Join(dstRoot, suffix)

 		info, err := entry.Info()
+		if err != nil {
+			return err
+		}
 		perm := info.Mode() & os.ModePerm
 		if info.Mode()&os.ModeSymlink != 0 {
 			info, err = os.Stat(srcPath)
--- a/src/cmd/link/link_test.go
+++ b/src/cmd/link/link_test.go
@@ -869,6 +869,9 @@ func TestFuncAlignOption(t *testing.T) {
 			"_main.bar": false,
 			"_main.baz": false}
 		syms, err := f.Symbols()
+		if err != nil {
+			t.Errorf("failed to get symbols with err %v", err)
+		}
 		for _, s := range syms {
 			fn := s.Name
 			if _, ok := fname[fn]; !ok {
--- a/src/crypto/cipher/gcm_fips140v1.26_test.go
+++ b/src/crypto/cipher/gcm_fips140v1.26_test.go
@@ -18,10 +18,10 @@ import (
 	"testing"
 )

-func TestGCMNoncesFIPSV2(t *testing.T) {
+func TestGCMNoncesFIPSV126(t *testing.T) {
 	cryptotest.MustSupportFIPS140(t)
 	if !fips140.Enabled {
-		cmd := testenv.Command(t, testenv.Executable(t), "-test.run=^TestGCMNoncesFIPSV2$", "-test.v")
+		cmd := testenv.Command(t, testenv.Executable(t), "-test.run=^TestGCMNoncesFIPSV126$", "-test.v")
 		cmd.Env = append(cmd.Environ(), "GODEBUG=fips140=on")
 		out, err := cmd.CombinedOutput()
 		t.Logf("running with GODEBUG=fips140=on:\n%s", out)
--- a/src/crypto/hpke/aead_fips140v1.0.go
+++ b/src/crypto/hpke/aead_fips140v1.0.go
--- a/src/crypto/hpke/aead_fips140v1.26.go
+++ b/src/crypto/hpke/aead_fips140v1.26.go
--- a/src/crypto/internal/fips140only/fips140only_test.go
+++ b/src/crypto/internal/fips140only/fips140only_test.go
@@ -0,0 +1,408 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package fips140only_test
+
+import (
+	"crypto"
+	"crypto/aes"
+	"crypto/cipher"
+	"crypto/des"
+	"crypto/dsa"
+	"crypto/ecdh"
+	"crypto/ecdsa"
+	"crypto/ed25519"
+	"crypto/elliptic"
+	"crypto/hkdf"
+	"crypto/hmac"
+	"crypto/hpke"
+	"crypto/internal/cryptotest"
+	"crypto/internal/fips140"
+	"crypto/internal/fips140only"
+	"crypto/md5"
+	"crypto/mlkem"
+	"crypto/mlkem/mlkemtest"
+	"crypto/pbkdf2"
+	"crypto/rand"
+	"crypto/rc4"
+	"crypto/rsa"
+	"crypto/sha1"
+	"crypto/sha256"
+	_ "crypto/sha3"
+	_ "crypto/sha512"
+	"crypto/x509"
+	"encoding/pem"
+	"fmt"
+	"internal/godebug"
+	"internal/testenv"
+	"io"
+	"math/big"
+	"os"
+	"strings"
+	"testing"
+
+	"golang.org/x/crypto/chacha20poly1305"
+)
+
+// TestFIPS140Only reruns itself in a subprocess with GODEBUG=fips140=only when that
+// mode is not enforced; otherwise it runs testFIPS140Only per cryptocustomrand setting.
+func TestFIPS140Only(t *testing.T) {
+	cryptotest.MustSupportFIPS140(t)
+	if !fips140only.Enforced() {
+		child := testenv.Command(t, testenv.Executable(t), "-test.run=^TestFIPS140Only$", "-test.v")
+		child.Env = append(child.Environ(), "GODEBUG=fips140=only")
+		output, err := child.CombinedOutput()
+		t.Logf("running with GODEBUG=fips140=only:\n%s", output)
+		if err != nil {
+			t.Errorf("fips140=only subprocess failed: %v", err)
+		}
+		return
+	}
+	for _, setting := range []string{"cryptocustomrand=0", "cryptocustomrand=1"} {
+		t.Run(setting, func(t *testing.T) {
+			t.Setenv("GODEBUG", os.Getenv("GODEBUG")+","+setting)
+			testFIPS140Only(t)
+		})
+	}
+}
+
+func testFIPS140Only(t *testing.T) {
+	if !fips140only.Enforced() {
+		t.Fatal("FIPS 140-only mode not enforced")
+	}
+	t.Logf("GODEBUG=fips140=only enabled")
+	fips140.ResetServiceIndicator()
+
+	aesBlock, err := aes.NewCipher(make([]byte, 16))
+	if err != nil {
+		t.Fatal(err)
+	}
+	notAESBlock := blockWrap{aesBlock}
+	iv := make([]byte, aes.BlockSize)
+
+	cipher.NewCBCEncrypter(aesBlock, iv)
+	expectPanic(t, func() { cipher.NewCBCEncrypter(notAESBlock, iv) })
+	cipher.NewCBCDecrypter(aesBlock, iv)
+	expectPanic(t, func() { cipher.NewCBCDecrypter(notAESBlock, iv) })
+
+	expectPanic(t, func() { cipher.NewCFBEncrypter(aesBlock, iv) })
+	expectPanic(t, func() { cipher.NewCFBDecrypter(aesBlock, iv) })
+
+	cipher.NewCTR(aesBlock, iv)
+	expectPanic(t, func() { cipher.NewCTR(notAESBlock, iv) })
+
+	expectPanic(t, func() { cipher.NewOFB(aesBlock, iv) })
+
+	expectErr(t, errRet2(cipher.NewGCM(aesBlock)))
+	expectErr(t, errRet2(cipher.NewGCMWithNonceSize(aesBlock, 12)))
+	expectErr(t, errRet2(cipher.NewGCMWithTagSize(aesBlock, 12)))
+	expectNoErr(t, errRet2(cipher.NewGCMWithRandomNonce(aesBlock)))
+
+	expectErr(t, errRet2(des.NewCipher(make([]byte, 8))))
+	expectErr(t, errRet2(des.NewTripleDESCipher(make([]byte, 24))))
+
+	expectErr(t, errRet2(rc4.NewCipher(make([]byte, 16))))
+
+	expectErr(t, errRet2(chacha20poly1305.New(make([]byte, chacha20poly1305.KeySize))))
+	expectErr(t, errRet2(chacha20poly1305.NewX(make([]byte, chacha20poly1305.KeySize))))
+
+	expectPanic(t, func() { md5.New().Sum(nil) })
+	expectErr(t, errRet2(md5.New().Write(make([]byte, 16))))
+	expectPanic(t, func() { md5.Sum([]byte("foo")) })
+
+	expectPanic(t, func() { sha1.New().Sum(nil) })
+	expectErr(t, errRet2(sha1.New().Write(make([]byte, 16))))
+	expectPanic(t, func() { sha1.Sum([]byte("foo")) })
+
+	withApprovedHash(func(h crypto.Hash) { h.New().Sum(nil) })
+	withNonApprovedHash(func(h crypto.Hash) { expectPanic(t, func() { h.New().Sum(nil) }) })
+
+	expectErr(t, errRet2(pbkdf2.Key(sha256.New, "password", make([]byte, 16), 1, 10)))
+	expectErr(t, errRet2(pbkdf2.Key(sha256.New, "password", make([]byte, 10), 1, 14)))
+	withNonApprovedHash(func(h crypto.Hash) {
+		expectErr(t, errRet2(pbkdf2.Key(h.New, "password", make([]byte, 16), 1, 14)))
+	})
+	withApprovedHash(func(h crypto.Hash) {
+		expectNoErr(t, errRet2(pbkdf2.Key(h.New, "password", make([]byte, 16), 1, 14)))
+	})
+
+	expectPanic(t, func() { hmac.New(sha256.New, make([]byte, 10)) })
+	withNonApprovedHash(func(h crypto.Hash) {
+		expectPanic(t, func() { hmac.New(h.New, make([]byte, 16)) })
+	})
+	withApprovedHash(func(h crypto.Hash) { hmac.New(h.New, make([]byte, 16)) })
+
+	expectErr(t, errRet2(hkdf.Key(sha256.New, make([]byte, 10), nil, "", 16)))
+	withNonApprovedHash(func(h crypto.Hash) {
+		expectErr(t, errRet2(hkdf.Key(h.New, make([]byte, 16), nil, "", 16)))
+	})
+	withApprovedHash(func(h crypto.Hash) {
+		expectNoErr(t, errRet2(hkdf.Key(h.New, make([]byte, 16), nil, "", 16)))
+	})
+
+	expectErr(t, errRet2(hkdf.Extract(sha256.New, make([]byte, 10), nil)))
+	withNonApprovedHash(func(h crypto.Hash) {
+		expectErr(t, errRet2(hkdf.Extract(h.New, make([]byte, 16), nil)))
+	})
+	withApprovedHash(func(h crypto.Hash) {
+		expectNoErr(t, errRet2(hkdf.Extract(h.New, make([]byte, 16), nil)))
+	})
+
+	expectErr(t, errRet2(hkdf.Expand(sha256.New, make([]byte, 10), "", 16)))
+	withNonApprovedHash(func(h crypto.Hash) {
+		expectErr(t, errRet2(hkdf.Expand(h.New, make([]byte, 16), "", 16)))
+	})
+	withApprovedHash(func(h crypto.Hash) {
+		expectNoErr(t, errRet2(hkdf.Expand(h.New, make([]byte, 16), "", 16)))
+	})
+
+	expectErr(t, errRet2(rand.Prime(rand.Reader, 10)))
+
+	expectErr(t, dsa.GenerateParameters(&dsa.Parameters{}, rand.Reader, dsa.L1024N160))
+	expectErr(t, dsa.GenerateKey(&dsa.PrivateKey{}, rand.Reader))
+	expectErr(t, errRet3(dsa.Sign(rand.Reader, &dsa.PrivateKey{}, make([]byte, 16))))
+	expectPanic(t, func() {
+		dsa.Verify(&dsa.PublicKey{}, make([]byte, 16), big.NewInt(1), big.NewInt(1))
+	})
+
+	expectErr(t, errRet2(ecdh.X25519().GenerateKey(rand.Reader)))
+	expectErr(t, errRet2(ecdh.X25519().NewPrivateKey(make([]byte, 32))))
+	expectErr(t, errRet2(ecdh.X25519().NewPublicKey(make([]byte, 32))))
+	for _, curve := range []ecdh.Curve{ecdh.P256(), ecdh.P384(), ecdh.P521()} {
+		expectErrIfCustomRand(t, errRet2(curve.GenerateKey(readerWrap{rand.Reader})))
+		k, err := curve.GenerateKey(rand.Reader)
+		if err != nil {
+			t.Fatal(err)
+		}
+		expectNoErr(t, errRet2(curve.NewPrivateKey(k.Bytes())))
+		expectNoErr(t, errRet2(curve.NewPublicKey(k.PublicKey().Bytes())))
+	}
+
+	for _, curve := range []elliptic.Curve{elliptic.P256(), elliptic.P384(), elliptic.P521()} {
+		expectErrIfCustomRand(t, errRet2(ecdsa.GenerateKey(curve, readerWrap{rand.Reader})))
+		k, err := ecdsa.GenerateKey(curve, rand.Reader)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		expectErrIfCustomRand(t, errRet2(k.Sign(readerWrap{rand.Reader}, make([]byte, 32), nil)))
+		expectErrIfCustomRand(t, errRet2(ecdsa.SignASN1(readerWrap{rand.Reader}, k, make([]byte, 32))))
+		expectErrIfCustomRand(t, errRet3(ecdsa.Sign(readerWrap{rand.Reader}, k, make([]byte, 32))))
+		expectNoErr(t, errRet2(k.Sign(rand.Reader, make([]byte, 32), nil)))
+		expectNoErr(t, errRet2(ecdsa.SignASN1(rand.Reader, k, make([]byte, 32))))
+		expectNoErr(t, errRet3(ecdsa.Sign(rand.Reader, k, make([]byte, 32))))
+
+		withNonApprovedHash(func(h crypto.Hash) {
+			expectErr(t, errRet2(k.Sign(nil, make([]byte, h.Size()), h)))
+		})
+		withApprovedHash(func(h crypto.Hash) {
+			expectNoErr(t, errRet2(k.Sign(nil, make([]byte, h.Size()), h)))
+		})
+	}
+	customCurve := &elliptic.CurveParams{Name: "custom", P: big.NewInt(1)}
+	expectErr(t, errRet2(ecdsa.GenerateKey(customCurve, rand.Reader)))
+
+	_, ed25519Key, err := ed25519.GenerateKey(rand.Reader)
+	if err != nil {
+		t.Fatal(err)
+	}
+	expectNoErr(t, errRet2(ed25519Key.Sign(nil, make([]byte, 32), crypto.Hash(0))))
+	expectNoErr(t, errRet2(ed25519Key.Sign(nil, make([]byte, 64), crypto.SHA512)))
+	// ed25519ctx is not allowed (but ed25519ph with context is).
+	expectErr(t, errRet2(ed25519Key.Sign(nil, make([]byte, 32), &ed25519.Options{
+		Context: "test",
+	})))
+	expectNoErr(t, errRet2(ed25519Key.Sign(nil, make([]byte, 64), &ed25519.Options{
+		Hash: crypto.SHA512, Context: "test",
+	})))
+	expectNoErr(t, errRet2(ed25519Key.Sign(nil, make([]byte, 64), &ed25519.Options{
+		Hash: crypto.SHA512,
+	})))
+
+	expectErr(t, errRet2(rsa.GenerateMultiPrimeKey(rand.Reader, 3, 2048)))
+	expectErr(t, errRet2(rsa.GenerateKey(rand.Reader, 1024)))
+	expectErr(t, errRet2(rsa.GenerateKey(rand.Reader, 2049)))
+	expectErrIfCustomRand(t, errRet2(rsa.GenerateKey(readerWrap{rand.Reader}, 2048)))
+	rsaKey, err := rsa.GenerateKey(rand.Reader, 2048)
+	expectNoErr(t, err)
+
+	smallKey := parseKey(testingKey(`-----BEGIN RSA TESTING KEY-----
+MIICXQIBAAKBgQDMrln6XoAa3Rjts+kRi5obbP86qSf/562RcuDO+yMXeTLHfi4M
+8ubyhoFY+UKBCGBLmmTO7ikbvQgdipkT3xVkU8nM3XTW4sxrnw0X5QXsl4PGlMo0
+5UufxYyQxe7bbjuwFz2XnN6Jz4orpOfO0s36/KVHj9lZRl+REpr/Jy+nJQIDAQAB
+AoGAJ9WEwGO01cWSzOwXH2mGX/EKCQ4TsUuS7XwogU/B6BcXyVhmuPFq/ecsdDbq
+ePc62mvdU6JpELNsyWcIXKQtYsRgJHxNS+KJkCQIq6YeiAWRG0XL6q+qVj+HtT8a
+1Qrmul9ZBd23Y9wLF8pg/xWDQYvb8DPAb/xJ0e/KEBZcWU8CQQDXFCFCGpCfwyxY
+Cq8G/3B94D9UYwk5mK6jRIH5m8LbaX9bKKetf8+If8TWVgeuiRjjN4WEQ78lPoSg
+3Fsz2qs3AkEA85/JCudNUf2FnY+T6h1c/2SWekZiZ1NS4lCh/C7iYuAN3oa8zGkf
+gjjR5e0+Z8rUAcZkTukxyLLaNqy6rs9GgwJAVR6pXvEGhcQHe7yWso1LpvWl+q7L
+StkrXIBTdEb54j4pYhl/6wFnUB1I+I7JsYCeseYaWFM7hfDtKoCrM6V6FwJBANxh
+KmfmnJcSkw/YlaEuNrYAs+6gRNvbEBsRfba2Yqu2qlUl5Ruz7IDMDXPEjLMvU2DX
+ql2HrTU0NRlIXwdLESkCQQDGJ54H6WK1eE1YvtxCaLm28zmogcFlvc21pym+PpM1
+bXVL8iKLrG91IYQByUHZIn3WVAd2bfi4MfKagRt0ggd4
+-----END RSA TESTING KEY-----`))
+
+	expectNoErr(t, errRet2(rsaKey.Sign(rand.Reader, make([]byte, 32), crypto.SHA256)))
+	expectErr(t, errRet2(smallKey.Sign(rand.Reader, make([]byte, 32), crypto.SHA256)))
+	expectErr(t, errRet2(rsaKey.Sign(rand.Reader, make([]byte, 20), crypto.SHA1)))
+	// rand is always ignored for PKCS1v15 signing
+	expectNoErr(t, errRet2(rsaKey.Sign(readerWrap{rand.Reader}, make([]byte, 32), crypto.SHA256)))
+
+	sigPKCS1v15, err := rsa.SignPKCS1v15(rand.Reader, rsaKey, crypto.SHA256, make([]byte, 32))
+	expectNoErr(t, err)
+	expectErr(t, errRet2(rsa.SignPKCS1v15(rand.Reader, smallKey, crypto.SHA256, make([]byte, 32))))
+	expectErr(t, errRet2(rsa.SignPKCS1v15(rand.Reader, rsaKey, crypto.SHA1, make([]byte, 20))))
+	// rand is always ignored for PKCS1v15 signing
+	expectNoErr(t, errRet2(rsa.SignPKCS1v15(readerWrap{rand.Reader}, rsaKey, crypto.SHA256, make([]byte, 32))))
+
+	expectNoErr(t, rsa.VerifyPKCS1v15(&rsaKey.PublicKey, crypto.SHA256, make([]byte, 32), sigPKCS1v15))
+	expectErr(t, rsa.VerifyPKCS1v15(&smallKey.PublicKey, crypto.SHA256, make([]byte, 32), sigPKCS1v15))
+	expectErr(t, rsa.VerifyPKCS1v15(&rsaKey.PublicKey, crypto.SHA1, make([]byte, 20), sigPKCS1v15))
+
+	sigPSS, err := rsa.SignPSS(rand.Reader, rsaKey, crypto.SHA256, make([]byte, 32), nil)
+	expectNoErr(t, err)
+	expectErr(t, errRet2(rsa.SignPSS(rand.Reader, smallKey, crypto.SHA256, make([]byte, 32), nil)))
+	expectErr(t, errRet2(rsa.SignPSS(rand.Reader, rsaKey, crypto.SHA1, make([]byte, 20), nil)))
+	expectErr(t, errRet2(rsa.SignPSS(readerWrap{rand.Reader}, rsaKey, crypto.SHA256, make([]byte, 32), nil)))
+
+	expectNoErr(t, rsa.VerifyPSS(&rsaKey.PublicKey, crypto.SHA256, make([]byte, 32), sigPSS, nil))
+	expectErr(t, rsa.VerifyPSS(&smallKey.PublicKey, crypto.SHA256, make([]byte, 32), sigPSS, nil))
+	expectErr(t, rsa.VerifyPSS(&rsaKey.PublicKey, crypto.SHA1, make([]byte, 20), sigPSS, nil))
+
+	k, err := mlkem.GenerateKey768()
+	expectNoErr(t, err)
+	expectErr(t, errRet3(mlkemtest.Encapsulate768(k.EncapsulationKey(), make([]byte, 32))))
+	k1024, err := mlkem.GenerateKey1024()
+	expectNoErr(t, err)
+	expectErr(t, errRet3(mlkemtest.Encapsulate1024(k1024.EncapsulationKey(), make([]byte, 32))))
+
+	for _, kem := range []hpke.KEM{
+		hpke.DHKEM(ecdh.P256()),
+		hpke.DHKEM(ecdh.P384()),
+		hpke.DHKEM(ecdh.P521()),
+		hpke.MLKEM768(),
+		hpke.MLKEM1024(),
+		hpke.MLKEM768P256(),
+		hpke.MLKEM1024P384(),
+		hpke.MLKEM768X25519(), // allowed as hybrid
+	} {
+		t.Run(fmt.Sprintf("HKPE KEM %04x", kem.ID()), func(t *testing.T) {
+			k, err := kem.GenerateKey()
+			expectNoErr(t, err)
+			expectNoErr(t, errRet2(kem.DeriveKeyPair(make([]byte, 64))))
+			kb, err := k.Bytes()
+			expectNoErr(t, err)
+			expectNoErr(t, errRet2(kem.NewPrivateKey(kb)))
+			expectNoErr(t, errRet2(kem.NewPublicKey(k.PublicKey().Bytes())))
+			if fips140.Version() == "v1.0.0" {
+				t.Skip("FIPS 140-3 Module v1.0.0 does not provide HPKE GCM modes")
+			}
+			c, err := hpke.Seal(k.PublicKey(), hpke.HKDFSHA256(), hpke.AES128GCM(), nil, nil)
+			expectNoErr(t, err)
+			_, err = hpke.Open(k, hpke.HKDFSHA256(), hpke.AES128GCM(), nil, c)
+			expectNoErr(t, err)
+		})
+	}
+	expectErr(t, errRet2(hpke.DHKEM(ecdh.X25519()).GenerateKey()))
+	expectErr(t, errRet2(hpke.DHKEM(ecdh.X25519()).DeriveKeyPair(make([]byte, 64))))
+	expectErr(t, errRet2(hpke.DHKEM(ecdh.X25519()).NewPrivateKey(make([]byte, 32))))
+	expectErr(t, errRet2(hpke.DHKEM(ecdh.X25519()).NewPublicKey(make([]byte, 32))))
+	hpkeK, err := hpke.MLKEM768().GenerateKey()
+	expectNoErr(t, err)
+	expectErr(t, errRet2(hpke.Seal(hpkeK.PublicKey(), hpke.HKDFSHA256(), hpke.ChaCha20Poly1305(), nil, nil)))
+	expectErr(t, errRet2(hpke.Open(hpkeK, hpke.HKDFSHA256(), hpke.ChaCha20Poly1305(), nil, make([]byte, 2000))))
+
+	// fips140=only mode should prevent any operation that would make the FIPS
+	// 140-3 module set its service indicator to false.
+	if !fips140.ServiceIndicator() {
+		t.Errorf("service indicator not set")
+	}
+}
+
+type blockWrap struct { // blockWrap embeds a cipher.Block, so only that interface's methods are promoted
+	cipher.Block
+}
+
+type readerWrap struct { // readerWrap embeds an io.Reader; the tests wrap rand.Reader in it to pass a reader of a different concrete type
+	io.Reader
+}
+
+func withApprovedHash(f func(crypto.Hash)) { // calls f once for each hash listed below, in this order
+	f(crypto.SHA224)
+	f(crypto.SHA256)
+	f(crypto.SHA384)
+	f(crypto.SHA512)
+	f(crypto.SHA3_224)
+	f(crypto.SHA3_256)
+	f(crypto.SHA3_384)
+	f(crypto.SHA3_512)
+	f(crypto.SHA512_224)
+	f(crypto.SHA512_256)
+}
+
+func withNonApprovedHash(f func(crypto.Hash)) { // calls f with MD5, then with SHA-1
+	f(crypto.MD5)
+	f(crypto.SHA1)
+}
+
+func expectPanic(t *testing.T, f func()) { // runs f and reports a test error unless it panics with a string containing "FIPS 140-only"
+	t.Helper()
+	defer func() {
+		t.Helper()
+		if err := recover(); err == nil { // nothing was recovered: f returned without panicking
+			t.Errorf("expected panic")
+		} else {
+			if s, ok := err.(string); !ok || !strings.Contains(s, "FIPS 140-only") { // non-string panic values are reported as unexpected too
+				t.Errorf("unexpected panic: %v", err)
+			}
+		}
+	}()
+	f()
+}
+
+var cryptocustomrand = godebug.New("cryptocustomrand") // setting named "cryptocustomrand"; expectErrIfCustomRand compares its value with "1"
+
+func expectErr(t *testing.T, err error) { // requires a non-nil err whose message contains "FIPS 140-only"
+	t.Helper()
+	if err == nil {
+		t.Errorf("expected error")
+	} else if !strings.Contains(err.Error(), "FIPS 140-only") { // any other error text is reported as unexpected
+		t.Errorf("unexpected error: %v", err)
+	}
+}
+
+func expectNoErr(t *testing.T, err error) { // reports a test error (without stopping the test) if err is non-nil
+	t.Helper()
+	if err != nil {
+		t.Errorf("unexpected error: %v", err)
+	}
+}
+
+func expectErrIfCustomRand(t *testing.T, err error) { // applies expectErr when cryptocustomrand is "1", expectNoErr otherwise
+	t.Helper()
+	if cryptocustomrand.Value() == "1" {
+		expectErr(t, err)
+	} else {
+		expectNoErr(t, err)
+	}
+}
+
+func errRet2[T any](_ T, err error) error { // discards the first result of a (T, error) call so it can be passed directly to expectErr/expectNoErr
+	return err
+}
+
+func errRet3[T any](_, _ T, err error) error { // three-result variant: returns only err; both discarded results must have the same type T
+	return err
+}
+
+func testingKey(s string) string { return strings.ReplaceAll(s, "TESTING KEY", "PRIVATE KEY") } // rewrites every "TESTING KEY" in s to "PRIVATE KEY"
+
+func parseKey(s string) *rsa.PrivateKey { // decodes the first PEM block in s as a PKCS #1 RSA private key; panics if parsing fails
+	p, _ := pem.Decode([]byte(s)) // trailing input is ignored; NOTE(review): p is not nil-checked, so input without a PEM block would panic on p.Bytes below
+	k, err := x509.ParsePKCS1PrivateKey(p.Bytes)
+	if err != nil {
+		panic(err)
+	}
+	return k
+}
--- a/src/crypto/internal/fips140test/acvp_capabilities_fips140v1.26.json
+++ b/src/crypto/internal/fips140test/acvp_capabilities_fips140v1.26.json
--- a/src/crypto/internal/fips140test/acvp_fips140v1.26_test.go
+++ b/src/crypto/internal/fips140test/acvp_fips140v1.26_test.go
@@ -12,10 +12,10 @@ import (
 	"fmt"
 )

-//go:embed acvp_capabilities_fips140v2.0.json
+//go:embed acvp_capabilities_fips140v1.26.json
 var capabilitiesJson []byte

-var testConfigFile = "acvp_test_fips140v2.0.config.json"
+var testConfigFile = "acvp_test_fips140v1.26.config.json"

 func init() {
 	commands["ML-DSA-44/keyGen"] = cmdMlDsaKeyGenAft(mldsa.NewPrivateKey44)
--- a/src/crypto/internal/fips140test/acvp_test_fips140v1.26.config.json
+++ b/src/crypto/internal/fips140test/acvp_test_fips140v1.26.config.json
--- a/src/crypto/internal/fips140test/cast_fips140v1.0_test.go
+++ b/src/crypto/internal/fips140test/cast_fips140v1.0_test.go
@@ -6,4 +6,4 @@

 package fipstest

-func fips140v2Conditionals() {}
+func fips140v126Conditionals() {}
--- a/src/crypto/internal/fips140test/cast_fips140v1.26_test.go
+++ b/src/crypto/internal/fips140test/cast_fips140v1.26_test.go
@@ -8,7 +8,7 @@ package fipstest

 import "crypto/internal/fips140/mldsa"

-func fips140v2Conditionals() {
+func fips140v126Conditionals() {
 	// ML-DSA sign and verify PCT
 	kMLDSA := mldsa.GenerateKey44()
 	// ML-DSA-44
--- a/src/crypto/internal/fips140test/cast_test.go
+++ b/src/crypto/internal/fips140test/cast_test.go
@@ -115,7 +115,7 @@ func TestAllCASTs(t *testing.T) {

 // TestConditionals causes the conditional CASTs and PCTs to be invoked.
 func TestConditionals(t *testing.T) {
-	fips140v2Conditionals()
+	fips140v126Conditionals()
 	// ML-KEM PCT
 	kMLKEM, err := mlkem.GenerateKey768()
 	if err != nil {
--- a/src/crypto/internal/rand/rand_fips140v1.0.go
+++ b/src/crypto/internal/rand/rand_fips140v1.0.go
--- a/src/crypto/internal/rand/rand_fips140v1.26.go
+++ b/src/crypto/internal/rand/rand_fips140v1.26.go
--- a/src/crypto/tls/conn.go
+++ b/src/crypto/tls/conn.go
@@ -224,6 +224,9 @@ func (hc *halfConn) changeCipherSpec() error {
 	return nil
 }

+// setTrafficSecret sets the traffic secret for the given encryption level. setTrafficSecret
+// should not be called directly, but rather through the Conn setWriteTrafficSecret and
+// setReadTrafficSecret wrapper methods.
 func (hc *halfConn) setTrafficSecret(suite *cipherSuiteTLS13, level QUICEncryptionLevel, secret []byte) {
 	hc.trafficSecret = secret
 	hc.level = level
@@ -1339,9 +1342,6 @@ func (c *Conn) handleKeyUpdate(keyUpdate *keyUpdateMsg) error {
 		return c.in.setErrorLocked(c.sendAlert(alertInternalError))
 	}

-	newSecret := cipherSuite.nextTrafficSecret(c.in.trafficSecret)
-	c.in.setTrafficSecret(cipherSuite, QUICEncryptionLevelInitial, newSecret)
-
 	if keyUpdate.updateRequested {
 		c.out.Lock()
 		defer c.out.Unlock()
@@ -1359,7 +1359,12 @@ func (c *Conn) handleKeyUpdate(keyUpdate *keyUpdateMsg) error {
 		}

 		newSecret := cipherSuite.nextTrafficSecret(c.out.trafficSecret)
-		c.out.setTrafficSecret(cipherSuite, QUICEncryptionLevelInitial, newSecret)
+		c.setWriteTrafficSecret(cipherSuite, QUICEncryptionLevelInitial, newSecret)
+	}
+
+	newSecret := cipherSuite.nextTrafficSecret(c.in.trafficSecret)
+	if err := c.setReadTrafficSecret(cipherSuite, QUICEncryptionLevelInitial, newSecret); err != nil {
+		return err
 	}

 	return nil
@@ -1576,7 +1581,9 @@ func (c *Conn) handshakeContext(ctx context.Context) (ret error) {
 			// Provide the 1-RTT read secret now that the handshake is complete.
 			// The QUIC layer MUST NOT decrypt 1-RTT packets prior to completing
 			// the handshake (RFC 9001, Section 5.7).
-			c.quicSetReadSecret(QUICEncryptionLevelApplication, c.cipherSuite, c.in.trafficSecret)
+			if err := c.quicSetReadSecret(QUICEncryptionLevelApplication, c.cipherSuite, c.in.trafficSecret); err != nil {
+				return err
+			}
 		} else {
 			c.out.Lock()
 			a, ok := errors.AsType[alert](c.out.err)
@@ -1672,3 +1679,25 @@ func (c *Conn) VerifyHostname(host string) error {
 	}
 	return c.peerCertificates[0].VerifyHostname(host)
 }
+
+// setReadTrafficSecret sets the read traffic secret for the given encryption level. If
+// being called at the same time as setWriteTrafficSecret, the caller must ensure the call
+// to setWriteTrafficSecret happens first so any alerts are sent at the write level.
+func (c *Conn) setReadTrafficSecret(suite *cipherSuiteTLS13, level QUICEncryptionLevel, secret []byte) error { // returns an error, without installing the secret, if c.hand still holds data
+	// Ensure that there are no buffered handshake messages before changing the
+	// read keys, since that can cause messages to be parsed that were encrypted
+	// using old keys which are no longer appropriate.
+	if c.hand.Len() != 0 {
+		c.sendAlert(alertUnexpectedMessage) // sendAlert's own result is discarded; callers see the error returned below
+		return errors.New("tls: handshake buffer not empty before setting read traffic secret")
+	}
+	c.in.setTrafficSecret(suite, level, secret)
+	return nil
+}
+
+// setWriteTrafficSecret sets the write traffic secret for the given encryption level. If
+// being called at the same time as setReadTrafficSecret, the caller must ensure the call
+// to setWriteTrafficSecret happens first so any alerts are sent at the write level.
+func (c *Conn) setWriteTrafficSecret(suite *cipherSuiteTLS13, level QUICEncryptionLevel, secret []byte) {
+	c.out.setTrafficSecret(suite, level, secret) // plain delegation: the write side performs no handshake-buffer check and cannot fail
+}
--- a/src/crypto/tls/handshake_client_tls13.go
+++ b/src/crypto/tls/handshake_client_tls13.go
@@ -490,16 +490,17 @@ func (hs *clientHandshakeStateTLS13) establishHandshakeKeys() error {
 	handshakeSecret := earlySecret.HandshakeSecret(sharedKey)

 	clientSecret := handshakeSecret.ClientHandshakeTrafficSecret(hs.transcript)
-	c.out.setTrafficSecret(hs.suite, QUICEncryptionLevelHandshake, clientSecret)
+	c.setWriteTrafficSecret(hs.suite, QUICEncryptionLevelHandshake, clientSecret)
 	serverSecret := handshakeSecret.ServerHandshakeTrafficSecret(hs.transcript)
-	c.in.setTrafficSecret(hs.suite, QUICEncryptionLevelHandshake, serverSecret)
+	if err := c.setReadTrafficSecret(hs.suite, QUICEncryptionLevelHandshake, serverSecret); err != nil {
+		return err
+	}

 	if c.quic != nil {
-		if c.hand.Len() != 0 {
-			c.sendAlert(alertUnexpectedMessage)
-		}
 		c.quicSetWriteSecret(QUICEncryptionLevelHandshake, hs.suite.id, clientSecret)
-		c.quicSetReadSecret(QUICEncryptionLevelHandshake, hs.suite.id, serverSecret)
+		if err := c.quicSetReadSecret(QUICEncryptionLevelHandshake, hs.suite.id, serverSecret); err != nil {
+			return err
+		}
 	}

 	err = c.config.writeKeyLog(keyLogLabelClientHandshake, hs.hello.random, clientSecret)
@@ -710,7 +711,9 @@ func (hs *clientHandshakeStateTLS13) readServerFinished() error {

 	hs.trafficSecret = hs.masterSecret.ClientApplicationTrafficSecret(hs.transcript)
 	serverSecret := hs.masterSecret.ServerApplicationTrafficSecret(hs.transcript)
-	c.in.setTrafficSecret(hs.suite, QUICEncryptionLevelApplication, serverSecret)
+	if err := c.setReadTrafficSecret(hs.suite, QUICEncryptionLevelApplication, serverSecret); err != nil {
+		return err
+	}

 	err = c.config.writeKeyLog(keyLogLabelClientTraffic, hs.hello.random, hs.trafficSecret)
 	if err != nil {
@@ -813,16 +816,13 @@ func (hs *clientHandshakeStateTLS13) sendClientFinished() error {
 		return err
 	}

-	c.out.setTrafficSecret(hs.suite, QUICEncryptionLevelApplication, hs.trafficSecret)
+	c.setWriteTrafficSecret(hs.suite, QUICEncryptionLevelApplication, hs.trafficSecret)

 	if !c.config.SessionTicketsDisabled && c.config.ClientSessionCache != nil {
 		c.resumptionSecret = hs.masterSecret.ResumptionMasterSecret(hs.transcript)
 	}

 	if c.quic != nil {
-		if c.hand.Len() != 0 {
-			c.sendAlert(alertUnexpectedMessage)
-		}
 		c.quicSetWriteSecret(QUICEncryptionLevelApplication, hs.suite.id, hs.trafficSecret)
 	}

--- a/src/crypto/tls/handshake_server_tls13.go
+++ b/src/crypto/tls/handshake_server_tls13.go
@@ -410,7 +410,9 @@ func (hs *serverHandshakeStateTLS13) checkForResumption() error {
 				return err
 			}
 			earlyTrafficSecret := hs.earlySecret.ClientEarlyTrafficSecret(transcript)
-			c.quicSetReadSecret(QUICEncryptionLevelEarly, hs.suite.id, earlyTrafficSecret)
+			if err := c.quicSetReadSecret(QUICEncryptionLevelEarly, hs.suite.id, earlyTrafficSecret); err != nil {
+				return err
+			}
 		}

 		c.didResume = true
@@ -514,6 +516,14 @@ func (hs *serverHandshakeStateTLS13) sendDummyChangeCipherSpec() error {
 func (hs *serverHandshakeStateTLS13) doHelloRetryRequest(selectedGroup CurveID) (*keyShare, error) {
 	c := hs.c

+	// Make sure the client didn't send extra handshake messages alongside
+	// their initial client_hello. If they sent two client_hello messages,
+	// we will consume the second before they respond to the server_hello.
+	if c.hand.Len() != 0 {
+		c.sendAlert(alertUnexpectedMessage)
+		return nil, errors.New("tls: handshake buffer not empty before HelloRetryRequest")
+	}
+
 	// The first ClientHello gets double-hashed into the transcript upon a
 	// HelloRetryRequest. See RFC 8446, Section 4.4.1.
 	if err := transcriptMsg(hs.clientHello, hs.transcript); err != nil {
@@ -733,17 +743,18 @@ func (hs *serverHandshakeStateTLS13) sendServerParameters() error {
 	}
 	hs.handshakeSecret = earlySecret.HandshakeSecret(hs.sharedKey)

-	clientSecret := hs.handshakeSecret.ClientHandshakeTrafficSecret(hs.transcript)
-	c.in.setTrafficSecret(hs.suite, QUICEncryptionLevelHandshake, clientSecret)
 	serverSecret := hs.handshakeSecret.ServerHandshakeTrafficSecret(hs.transcript)
-	c.out.setTrafficSecret(hs.suite, QUICEncryptionLevelHandshake, serverSecret)
+	c.setWriteTrafficSecret(hs.suite, QUICEncryptionLevelHandshake, serverSecret)
+	clientSecret := hs.handshakeSecret.ClientHandshakeTrafficSecret(hs.transcript)
+	if err := c.setReadTrafficSecret(hs.suite, QUICEncryptionLevelHandshake, clientSecret); err != nil {
+		return err
+	}

 	if c.quic != nil {
-		if c.hand.Len() != 0 {
-			c.sendAlert(alertUnexpectedMessage)
-		}
 		c.quicSetWriteSecret(QUICEncryptionLevelHandshake, hs.suite.id, serverSecret)
-		c.quicSetReadSecret(QUICEncryptionLevelHandshake, hs.suite.id, clientSecret)
+		if err := c.quicSetReadSecret(QUICEncryptionLevelHandshake, hs.suite.id, clientSecret); err != nil {
+			return err
+		}
 	}

 	err := c.config.writeKeyLog(keyLogLabelClientHandshake, hs.clientHello.random, clientSecret)
@@ -887,13 +898,9 @@ func (hs *serverHandshakeStateTLS13) sendServerFinished() error {

 	hs.trafficSecret = hs.masterSecret.ClientApplicationTrafficSecret(hs.transcript)
 	serverSecret := hs.masterSecret.ServerApplicationTrafficSecret(hs.transcript)
-	c.out.setTrafficSecret(hs.suite, QUICEncryptionLevelApplication, serverSecret)
+	c.setWriteTrafficSecret(hs.suite, QUICEncryptionLevelApplication, serverSecret)

 	if c.quic != nil {
-		if c.hand.Len() != 0 {
-			// TODO: Handle this in setTrafficSecret?
-			c.sendAlert(alertUnexpectedMessage)
-		}
 		c.quicSetWriteSecret(QUICEncryptionLevelApplication, hs.suite.id, serverSecret)
 	}

@@ -1123,7 +1130,9 @@ func (hs *serverHandshakeStateTLS13) readClientFinished() error {
 		return errors.New("tls: invalid client finished hash")
 	}

-	c.in.setTrafficSecret(hs.suite, QUICEncryptionLevelApplication, hs.trafficSecret)
+	if err := c.setReadTrafficSecret(hs.suite, QUICEncryptionLevelApplication, hs.trafficSecret); err != nil {
+		return err
+	}

 	return nil
 }
--- a/src/crypto/tls/handshake_test.go
+++ b/src/crypto/tls/handshake_test.go
@@ -7,6 +7,7 @@ package tls
 import (
 	"bufio"
 	"bytes"
+	"context"
 	"crypto/ed25519"
 	"crypto/x509"
 	"encoding/hex"
@@ -638,3 +639,142 @@ var clientEd25519KeyPEM = testingKey(`
 -----BEGIN TESTING KEY-----
 MC4CAQAwBQYDK2VwBCIEINifzf07d9qx3d44e0FSbV4mC/xQxT644RRbpgNpin7I
 -----END TESTING KEY-----`)
+
+func TestServerHelloTrailingMessage(t *testing.T) {
+	// In TLS 1.3 the change cipher spec message is optional. If a CCS message
+	// is not sent, after reading the ServerHello, the read traffic secret is
+	// set, and all following messages must be encrypted. If the server sends
+	// additional unencrypted messages in a record with the ServerHello, the
+	// client must either fail or ignore the additional messages.
+
+	c, s := localPipe(t)
+	go func() { // fake server: reads the ClientHello, replies with one hand-built record, then closes
+		ctx := context.Background()
+		srv := Server(s, testConfig)
+		clientHello, _, err := srv.readClientHello(ctx)
+		if err != nil {
+			testFatal(t, err)
+		}
+
+		hs := serverHandshakeStateTLS13{
+			c:           srv,
+			ctx:         ctx,
+			clientHello: clientHello,
+		}
+		if err := hs.processClientHello(); err != nil {
+			testFatal(t, err)
+		}
+		if err := transcriptMsg(hs.clientHello, hs.transcript); err != nil {
+			testFatal(t, err)
+		}
+
+		record, err := concatHandshakeMessages(hs.hello, &encryptedExtensionsMsg{alpnProtocol: "h2"}) // ServerHello followed by EncryptedExtensions in a single record
+		if err != nil {
+			testFatal(t, err)
+		}
+
+		if _, err := s.Write(record); err != nil { // written to the pipe end directly rather than through srv
+			testFatal(t, err)
+		}
+		srv.Close()
+	}()
+
+	cli := Client(c, testConfig)
+	expectedErr := "tls: handshake buffer not empty before setting read traffic secret"
+	if err := cli.Handshake(); err == nil {
+		t.Fatal("expected error from incomplete handshake, got nil")
+	} else if err.Error() != expectedErr { // the error text must match exactly
+		t.Fatalf("expected error %q, got %q", expectedErr, err.Error())
+	}
+}
+
+func TestClientHelloTrailingMessage(t *testing.T) {
+	// Same as TestServerHelloTrailingMessage but for the client side.
+
+	c, s := localPipe(t)
+	go func() { // fake client: sends one hand-built record, then closes
+		cli := Client(c, testConfig)
+
+		hello, _, _, err := cli.makeClientHello()
+		if err != nil {
+			testFatal(t, err)
+		}
+
+		record, err := concatHandshakeMessages(hello, &certificateMsgTLS13{}) // ClientHello followed by a zero-value certificateMsgTLS13 in a single record
+		if err != nil {
+			testFatal(t, err)
+		}
+
+		if _, err := c.Write(record); err != nil { // written to the pipe end directly rather than through cli
+			testFatal(t, err)
+		}
+		cli.Close()
+	}()
+
+	srv := Server(s, testConfig)
+	expectedErr := "tls: handshake buffer not empty before setting read traffic secret"
+	if err := srv.Handshake(); err == nil {
+		t.Fatal("expected error from incomplete handshake, got nil")
+	} else if err.Error() != expectedErr { // the error text must match exactly
+		t.Fatalf("expected error %q, got %q", expectedErr, err.Error())
+	}
+}
+
+func TestDoubleClientHelloHRR(t *testing.T) {
+	// If a client sends two ClientHello messages in a single record, and the
+	// server sends a HRR after reading the first ClientHello, the server must
+	// either fail or ignore the trailing ClientHello.
+
+	c, s := localPipe(t)
+
+	go func() { // fake client: sends one hand-built record, then closes
+		cli := Client(c, testConfig)
+
+		hello, _, _, err := cli.makeClientHello()
+		if err != nil {
+			testFatal(t, err)
+		}
+		hello.keyShares = nil // presumably what steers the server into the HelloRetryRequest path — confirm
+
+		record, err := concatHandshakeMessages(hello, hello) // the same ClientHello twice in a single record
+		if err != nil {
+			testFatal(t, err)
+		}
+
+		if _, err := c.Write(record); err != nil { // written to the pipe end directly rather than through cli
+			testFatal(t, err)
+		}
+		cli.Close()
+	}()
+
+	srv := Server(s, testConfig)
+	expectedErr := "tls: handshake buffer not empty before HelloRetryRequest"
+	if err := srv.Handshake(); err == nil {
+		t.Fatal("expected error from incomplete handshake, got nil")
+	} else if err.Error() != expectedErr { // the error text must match exactly
+		t.Fatalf("expected error %q, got %q", expectedErr, err.Error())
+	}
+}
+
+// concatHandshakeMessages marshals and concatenates the given handshake
+// messages into a single record.
+func concatHandshakeMessages(msgs ...handshakeMessage) ([]byte, error) { // returns header plus payload, or the first marshal error
+	var marshalled []byte
+	for _, msg := range msgs {
+		data, err := msg.marshal()
+		if err != nil {
+			return nil, err
+		}
+		marshalled = append(marshalled, data...)
+	}
+	m := len(marshalled)
+	outBuf := make([]byte, recordHeaderLen)
+	outBuf[0] = byte(recordTypeHandshake) // header byte 0: record type
+	vers := VersionTLS12
+	outBuf[1] = byte(vers >> 8) // header bytes 1-2: version, high byte first (always VersionTLS12 here)
+	outBuf[2] = byte(vers)
+	outBuf[3] = byte(m >> 8) // header bytes 3-4: payload length, high byte first; NOTE(review): m above 0xffff would be silently truncated
+	outBuf[4] = byte(m)
+	outBuf = append(outBuf, marshalled...)
+	return outBuf, nil
+}
--- a/src/crypto/tls/quic.go
+++ b/src/crypto/tls/quic.go
@@ -402,13 +402,22 @@ func (c *Conn) quicReadHandshakeBytes(n int) error {
 	return nil
 }

-func (c *Conn) quicSetReadSecret(level QUICEncryptionLevel, suite uint16, secret []byte) {
+func (c *Conn) quicSetReadSecret(level QUICEncryptionLevel, suite uint16, secret []byte) error {
+	// Ensure that there are no buffered handshake messages before changing the
+	// read keys, since that can cause messages to be parsed that were encrypted
+	// using old keys which are no longer appropriate.
+	// TODO(roland): we should merge this check with the similar one in setReadTrafficSecret.
+	if c.hand.Len() != 0 {
+		c.sendAlert(alertUnexpectedMessage)
+		return errors.New("tls: handshake buffer not empty before setting read traffic secret")
+	}
 	c.quic.events = append(c.quic.events, QUICEvent{
 		Kind:  QUICSetReadSecret,
 		Level: level,
 		Suite: suite,
 		Data:  secret,
 	})
+	return nil
 }

 func (c *Conn) quicSetWriteSecret(level QUICEncryptionLevel, suite uint16, secret []byte) {
--- a/src/debug/pe/file.go
+++ b/src/debug/pe/file.go
@@ -379,7 +379,11 @@ func (f *File) ImportedSymbols() ([]string, error) {
 	}

 	// seek to the virtual address specified in the import data directory
-	d = d[idd.VirtualAddress-ds.VirtualAddress:]
+	seek := idd.VirtualAddress - ds.VirtualAddress
+	if seek >= uint32(len(d)) {
+		return nil, errors.New("optional header data directory virtual size doesn't fit within data seek")
+	}
+	d = d[seek:]

 	// start decoding the import directory
 	var ida []ImportDirectory
@@ -408,9 +412,16 @@ func (f *File) ImportedSymbols() ([]string, error) {
 		dt.dll, _ = getString(names, int(dt.Name-ds.VirtualAddress))
 		d, _ = ds.Data()
 		// seek to OriginalFirstThunk
-		d = d[dt.OriginalFirstThunk-ds.VirtualAddress:]
+		seek := dt.OriginalFirstThunk - ds.VirtualAddress
+		if seek >= uint32(len(d)) {
+			return nil, errors.New("import directory original first thunk doesn't fit within data seek")
+		}
+		d = d[seek:]
 		for len(d) > 0 {
 			if pe64 { // 64bit
+				if len(d) < 8 {
+					return nil, errors.New("thunk parsing needs at least 8-bytes")
+				}
 				va := binary.LittleEndian.Uint64(d[0:8])
 				d = d[8:]
 				if va == 0 {
@@ -423,6 +434,9 @@ func (f *File) ImportedSymbols() ([]string, error) {
 					all = append(all, fn+":"+dt.dll)
 				}
 			} else { // 32bit
+				if len(d) < 4 { // a 32-bit thunk is decoded from d[0:4] below, so exactly 4 remaining bytes are sufficient
+					return nil, errors.New("thunk parsing needs at least 4-bytes")
+				}
 				va := binary.LittleEndian.Uint32(d[0:4])
 				d = d[4:]
 				if va == 0 {
--- a/src/encoding/gob/doc.go
+++ b/src/encoding/gob/doc.go
@@ -153,16 +153,16 @@ are transmitted, even if all the elements are zero.

 Structs are sent as a sequence of (field number, field value) pairs. The field
 value is sent using the standard gob encoding for its type, recursively. If a
-field has the zero value for its type (except for arrays; see above), it is omitted
-from the transmission. The field number is defined by the type of the encoded
-struct: the first field of the encoded type is field 0, the second is field 1,
-etc. When encoding a value, the field numbers are delta encoded for efficiency
-and the fields are always sent in order of increasing field number; the deltas are
-therefore unsigned. The initialization for the delta encoding sets the field
-number to -1, so an unsigned integer field 0 with value 7 is transmitted as unsigned
-delta = 1, unsigned value = 7 or (01 07). Finally, after all the fields have been
-sent a terminating mark denotes the end of the struct. That mark is a delta=0
-value, which has representation (00).
+field has the zero value for its type (except for arrays; see above) or it's a
+pointer to a zero value, it is omitted from the transmission. The field number
+is defined by the type of the encoded struct: the first field of the encoded type
+is field 0, the second is field 1, etc. When encoding a value, the field numbers
+are delta encoded for efficiency and the fields are always sent in order of
+increasing field number; the deltas are therefore unsigned. The initialization
+for the delta encoding sets the field number to -1, so an unsigned integer field 0
+with value 7 is transmitted as unsigned delta = 1, unsigned value = 7 or (01 07).
+Finally, after all the fields have been sent a terminating mark denotes the end
+of the struct. That mark is a delta=0 value, which has representation (00).

 Interface types are not checked for compatibility; all interface types are
 treated, for transmission, as members of a single "interface" type, analogous to
--- a/src/errors/join.go
+++ b/src/errors/join.go
@@ -27,16 +27,6 @@ func Join(errs ...error) error {
 	if n == 0 {
 		return nil
 	}
-	if n == 1 {
-		for _, err := range errs {
-			if _, ok := err.(interface {
-				Unwrap() []error
-			}); ok {
-				return err
-			}
-		}
-	}
-
 	e := &joinError{
 		errs: make([]error, 0, n),
 	}
--- a/src/errors/join_test.go
+++ b/src/errors/join_test.go
@@ -25,6 +25,7 @@ func TestJoinReturnsNil(t *testing.T) {
 func TestJoin(t *testing.T) {
 	err1 := errors.New("err1")
 	err2 := errors.New("err2")
+	merr := multiErr{errors.New("err3")}
 	for _, test := range []struct {
 		errs []error
 		want []error
@@ -37,6 +38,9 @@ func TestJoin(t *testing.T) {
 	}, {
 		errs: []error{err1, nil, err2},
 		want: []error{err1, err2},
+	}, {
+		errs: []error{merr},
+		want: []error{merr},
 	}} {
 		got := errors.Join(test.errs...).(interface{ Unwrap() []error }).Unwrap()
 		if !reflect.DeepEqual(got, test.want) {
@@ -70,37 +74,3 @@ func TestJoinErrorMethod(t *testing.T) {
 		}
 	}
 }
-
-func BenchmarkJoin(b *testing.B) {
-	for _, bb := range []struct {
-		name string
-		errs []error
-	}{
-		{
-			name: "no error",
-		},
-		{
-			name: "single non-nil error",
-			errs: []error{errors.New("err")},
-		},
-		{
-			name: "multiple errors",
-			errs: []error{errors.New("err"), errors.New("newerr"), errors.New("newerr2")},
-		},
-		{
-			name: "unwrappable single error",
-			errs: []error{errors.Join(errors.New("err"))},
-		},
-		{
-			name: "nil first error",
-			errs: []error{nil, errors.New("newerr")},
-		},
-	} {
-		b.Run(bb.name, func(b *testing.B) {
-			b.ReportAllocs()
-			for i := 0; i < b.N; i++ {
-				_ = errors.Join(bb.errs...)
-			}
-		})
-	}
-}
--- a/src/go/doc/comment_test.go
+++ b/src/go/doc/comment_test.go
@@ -24,12 +24,12 @@ func TestComment(t *testing.T) {
 	pkg := New(pkgs["pkgdoc"], "testdata/pkgdoc", 0)

 	var (
-		input           = "[T] and [U] are types, and [T.M] is a method, but [V] is a broken link. [rand.Int] and [crand.Reader] are things. [G.M1] and [G.M2] are generic methods. [I.F] is an interface method and [I.V] is a broken link.\n"
-		wantHTML        = `<p><a href="#T">T</a> and <a href="#U">U</a> are types, and <a href="#T.M">T.M</a> is a method, but [V] is a broken link. <a href="/math/rand#Int">rand.Int</a> and <a href="/crypto/rand#Reader">crand.Reader</a> are things. <a href="#G.M1">G.M1</a> and <a href="#G.M2">G.M2</a> are generic methods. <a href="#I.F">I.F</a> is an interface method and [I.V] is a broken link.` + "\n"
-		wantOldHTML     = "<p>[T] and [U] are <i>types</i>, and [T.M] is a method, but [V] is a broken link. [rand.Int] and [crand.Reader] are things. [G.M1] and [G.M2] are generic methods. [I.F] is an interface method and [I.V] is a broken link.\n"
-		wantMarkdown    = "[T](#T) and [U](#U) are types, and [T.M](#T.M) is a method, but \\[V] is a broken link. [rand.Int](/math/rand#Int) and [crand.Reader](/crypto/rand#Reader) are things. [G.M1](#G.M1) and [G.M2](#G.M2) are generic methods. [I.F](#I.F) is an interface method and \\[I.V] is a broken link.\n"
-		wantText        = "T and U are types, and T.M is a method, but [V] is a broken link. rand.Int and\ncrand.Reader are things. G.M1 and G.M2 are generic methods. I.F is an interface\nmethod and [I.V] is a broken link.\n"
-		wantOldText     = "[T] and [U] are types, and [T.M] is a method, but [V] is a broken link.\n[rand.Int] and [crand.Reader] are things. [G.M1] and [G.M2] are generic methods.\n[I.F] is an interface method and [I.V] is a broken link.\n"
+		input           = "[T] and [U] are types, and [T.M] is a method, but [V] is a broken link. [rand.Int] and [crand.Reader] are things. [G.X] is a field, [G.M1] and [G.M2] are generic methods. [I.F] is an interface method and [I.V] is a broken link.\n"
+		wantHTML        = `<p><a href="#T">T</a> and <a href="#U">U</a> are types, and <a href="#T.M">T.M</a> is a method, but [V] is a broken link. <a href="/math/rand#Int">rand.Int</a> and <a href="/crypto/rand#Reader">crand.Reader</a> are things. <a href="#G.X">G.X</a> is a field, <a href="#G.M1">G.M1</a> and <a href="#G.M2">G.M2</a> are generic methods. <a href="#I.F">I.F</a> is an interface method and [I.V] is a broken link.` + "\n"
+		wantOldHTML     = "<p>[T] and [U] are <i>types</i>, and [T.M] is a method, but [V] is a broken link. [rand.Int] and [crand.Reader] are things. [G.X] is a field, [G.M1] and [G.M2] are generic methods. [I.F] is an interface method and [I.V] is a broken link.\n"
+		wantMarkdown    = "[T](#T) and [U](#U) are types, and [T.M](#T.M) is a method, but \\[V] is a broken link. [rand.Int](/math/rand#Int) and [crand.Reader](/crypto/rand#Reader) are things. [G.X](#G.X) is a field, [G.M1](#G.M1) and [G.M2](#G.M2) are generic methods. [I.F](#I.F) is an interface method and \\[I.V] is a broken link.\n"
+		wantText        = "T and U are types, and T.M is a method, but [V] is a broken link. rand.Int and\ncrand.Reader are things. G.X is a field, G.M1 and G.M2 are generic methods.\nI.F is an interface method and [I.V] is a broken link.\n"
+		wantOldText     = "[T] and [U] are types, and [T.M] is a method, but [V] is a broken link.\n[rand.Int] and [crand.Reader] are things. [G.X] is a field, [G.M1] and [G.M2]\nare generic methods. [I.F] is an interface method and [I.V] is a broken link.\n"
 		wantSynopsis    = "T and U are types, and T.M is a method, but [V] is a broken link."
 		wantOldSynopsis = "[T] and [U] are types, and [T.M] is a method, but [V] is a broken link."
 	)
--- a/src/go/doc/doc.go
+++ b/src/go/doc/doc.go
@@ -168,6 +168,7 @@ func (p *Package) collectTypes(types []*Type) {
 		p.collectFuncs(t.Funcs)
 		p.collectFuncs(t.Methods)
 		p.collectInterfaceMethods(t)
+		p.collectStructFields(t)
 	}
 }

@@ -212,6 +213,24 @@ func (p *Package) collectInterfaceMethods(t *Type) {
 	}
 }

+func (p *Package) collectStructFields(t *Type) { // marks "TypeName.FieldName" in p.syms for every named field found in t's struct type specs
+	for _, s := range t.Decl.Specs {
+		spec, ok := s.(*ast.TypeSpec)
+		if !ok {
+			continue
+		}
+		list, isStruct := fields(spec.Type)
+		if !isStruct { // only specs that the fields helper reports as structs contribute
+			continue
+		}
+		for _, field := range list {
+			for _, name := range field.Names { // a field with no Names (e.g. an embedded one) adds nothing
+				p.syms[t.Name+"."+name.Name] = true
+			}
+		}
+	}
+}
+
 // NewFromFiles computes documentation for a package.
 //
 // The package is specified by a list of *ast.Files and corresponding
--- a/src/go/doc/example.go
+++ b/src/go/doc/example.go
@@ -74,6 +74,9 @@ func Examples(testFiles ...*ast.File) []*Example {
 			if params := f.Type.Params; len(params.List) != 0 {
 				continue // function has params; not a valid example
 			}
+			if results := f.Type.Results; results != nil && len(results.List) != 0 {
+				continue // function has results; not a valid example
+			}
 			if f.Body == nil { // ast.File.Body nil dereference (see issue 28044)
 				continue
 			}
--- a/src/go/doc/example_test.go
+++ b/src/go/doc/example_test.go
@@ -228,6 +228,8 @@ func ExampleFunc1_foo()           {}
 func ExampleFunc1_foo_suffix()    {}
 func ExampleFunc1_foo_Suffix()    {} // matches Func1, instead of Func1_foo
 func Examplefunc1()               {} // invalid - cannot match unexported
+func ExampleFunc1_params(a int)   {} // invalid - has parameter
+func ExampleFunc1_results() int   {} // invalid - has results

 func ExampleType1_Func1()               {}
 func ExampleType1_Func1_()              {} // invalid - suffix must start with a lower-case letter
--- a/src/go/doc/testdata/pkgdoc/doc.go
+++ b/src/go/doc/testdata/pkgdoc/doc.go
@@ -18,7 +18,7 @@ func (T) M() {}
 var _ = rand.Int
 var _ = crand.Reader

-type G[T any] struct{ x T }
+type G[T any] struct{ X T }

 func (g G[T]) M1()  {}
 func (g *G[T]) M2() {}
--- a/src/internal/buildcfg/exp.go
+++ b/src/internal/buildcfg/exp.go
@@ -84,6 +84,7 @@ func ParseGOEXPERIMENT(goos, goarch, goexp string) (*ExperimentFlags, error) {
 		RegabiWrappers:        regabiSupported,
 		RegabiArgs:            regabiSupported,
 		Dwarf5:                dwarf5Supported,
+		SIMD:                  goarch == "amd64", // TODO: remove this (default to false) when dev.simd is merged
 		RandomizedHeapBase64:  true,
 		SizeSpecializedMalloc: true,
 		GreenTeaGC:            true,
--- a/src/internal/coverage/decodemeta/decodefile.go
+++ b/src/internal/coverage/decodemeta/decodefile.go
@@ -75,7 +75,7 @@ func (r *CoverageMetaFileReader) readFileHeader() error {
 	// Vet the version. If this is a meta-data file from the future,
 	// we won't be able to read it.
 	if r.hdr.Version > coverage.MetaFileVersion {
-		return fmt.Errorf("meta-data file withn unknown version %d (expected %d)", r.hdr.Version, coverage.MetaFileVersion)
+		return fmt.Errorf("meta-data file with an unknown version %d (expected %d)", r.hdr.Version, coverage.MetaFileVersion)
 	}

 	// Read package offsets for good measure
--- a/src/internal/cpu/cpu_x86.go
+++ b/src/internal/cpu/cpu_x86.go
@@ -136,12 +136,6 @@ func doinit() {
 	// e.g. setting the xsavedisable boot option on Windows 10.
 	X86.HasOSXSAVE = isSet(ecx1, cpuid_OSXSAVE)

-	// The FMA instruction set extension only has VEX prefixed instructions.
-	// VEX prefixed instructions require OSXSAVE to be enabled.
-	// See Intel 64 and IA-32 Architecture Software Developer’s Manual Volume 2
-	// Section 2.4 "AVX and SSE Instruction Exception Specification"
-	X86.HasFMA = isSet(ecx1, cpuid_FMA) && X86.HasOSXSAVE
-
 	osSupportsAVX := false
 	osSupportsAVX512 := false
 	// For XGETBV, OSXSAVE bit is required and sufficient.
@@ -159,6 +153,14 @@ func doinit() {

 	X86.HasAVX = isSet(ecx1, cpuid_AVX) && osSupportsAVX

+	// The FMA instruction set extension requires both the FMA and AVX flags.
+	//
+	// Furthermore, the FMA instructions are all VEX prefixed instructions.
+	// VEX prefixed instructions require OSXSAVE to be enabled.
+	// See Intel 64 and IA-32 Architecture Software Developer’s Manual Volume 2
+	// Section 2.4 "AVX and SSE Instruction Exception Specification"
+	X86.HasFMA = isSet(ecx1, cpuid_FMA) && X86.HasAVX && X86.HasOSXSAVE
+
 	if maxID < 7 {
 		osInit()
 		return
@@ -219,7 +221,7 @@ func doinit() {
 	if eax7 >= 1 {
 		eax71, _, _, _ := cpuid(7, 1)
 		if X86.HasAVX {
-			X86.HasAVXVNNI = isSet(4, eax71)
+			X86.HasAVXVNNI = isSet(eax71, cpuid_AVXVNNI)
 		}
 	}

--- a/src/net/rpc/server.go
+++ b/src/net/rpc/server.go
@@ -202,7 +202,7 @@ func NewServer() *Server {
 // DefaultServer is the default instance of [*Server].
 var DefaultServer = NewServer()

-// Is this type exported or a builtin?
+// isExportedOrBuiltinType reports whether t is an exported or builtin type.
 func isExportedOrBuiltinType(t reflect.Type) bool {
 	for t.Kind() == reflect.Pointer {
 		t = t.Elem()
--- a/src/os/exec/exec.go
+++ b/src/os/exec/exec.go
@@ -102,6 +102,7 @@ import (
 	"runtime"
 	"strconv"
 	"strings"
+	"sync/atomic"
 	"syscall"
 	"time"
 )
@@ -354,6 +355,9 @@ type Cmd struct {
 	// the work of resolving the extension, so Start doesn't need to do it again.
 	// This is only used on Windows.
 	cachedLookExtensions struct{ in, out string }
+
+	// startCalled records that Start was attempted, regardless of outcome.
+	startCalled atomic.Bool
 }

 // A ctxResult reports the result of watching the Context associated with a
@@ -635,7 +639,8 @@ func (c *Cmd) Run() error {
 func (c *Cmd) Start() error {
 	// Check for doubled Start calls before we defer failure cleanup. If the prior
 	// call to Start succeeded, we don't want to spuriously close its pipes.
-	if c.Process != nil {
+	// It is an error to call Start twice even if the first call did not create a process.
+	if c.startCalled.Swap(true) {
 		return errors.New("exec: already started")
 	}

@@ -647,6 +652,7 @@ func (c *Cmd) Start() error {
 		if !started {
 			closeDescriptors(c.parentIOPipes)
 			c.parentIOPipes = nil
+			c.goroutine = nil // aid GC, finalization of pipe fds
 		}
 	}()

--- a/src/os/exec/exec_test.go
+++ b/src/os/exec/exec_test.go
@@ -1839,3 +1839,29 @@ func TestAbsPathExec(t *testing.T) {
 		}
 	})
 }
+
+// TestStart_twice checks that a second call to Start is an error, regardless of the first call's outcome.
+func TestStart_twice(t *testing.T) {
+	testenv.MustHaveExec(t)
+
+	cmd := exec.Command("/bin/nonesuch")
+	for i, want := range []string{
+		cond(runtime.GOOS == "windows",
+			`exec: "/bin/nonesuch": executable file not found in %PATH%`,
+			"fork/exec /bin/nonesuch: no such file or directory"),
+		"exec: already started",
+	} {
+		err := cmd.Start()
+		if got := fmt.Sprint(err); got != want {
+			t.Errorf("Start call #%d return err %q, want %q", i+1, got, want)
+		}
+	}
+}
+
+func cond[T any](cond bool, t, f T) T {
+	if cond {
+		return t
+	} else {
+		return f
+	}
+}
--- a/src/reflect/value.go
+++ b/src/reflect/value.go
@@ -362,6 +362,7 @@ func (v Value) CanSet() bool {
 // type of the function's corresponding input parameter.
 // If v is a variadic function, Call creates the variadic slice parameter
 // itself, copying in the corresponding values.
+// It panics if the Value was obtained by accessing unexported struct fields.
 func (v Value) Call(in []Value) []Value {
 	v.mustBe(Func)
 	v.mustBeExported()
@@ -375,6 +376,7 @@ func (v Value) Call(in []Value) []Value {
 // It returns the output results as Values.
 // As in Go, each input argument must be assignable to the
 // type of the function's corresponding input parameter.
+// It panics if the Value was obtained by accessing unexported struct fields.
 func (v Value) CallSlice(in []Value) []Value {
 	v.mustBe(Func)
 	v.mustBeExported()
--- a/src/regexp/find_test.go
+++ b/src/regexp/find_test.go
@@ -159,23 +159,23 @@ func TestFind(t *testing.T) {
 	for _, test := range findTests {
 		re := MustCompile(test.pat)
 		if re.String() != test.pat {
-			t.Errorf("String() = `%s`; should be `%s`", re.String(), test.pat)
+			t.Errorf("re.String() = %q, want %q", re.String(), test.pat)
 		}
 		result := re.Find([]byte(test.text))
 		switch {
 		case len(test.matches) == 0 && len(result) == 0:
 			// ok
 		case test.matches == nil && result != nil:
-			t.Errorf("expected no match; got one: %s", test)
+			t.Errorf("got match %q, want none: %s", result, test)
 		case test.matches != nil && result == nil:
-			t.Errorf("expected match; got none: %s", test)
+			t.Errorf("got no match, want one: %s", test)
 		case test.matches != nil && result != nil:
-			expect := test.text[test.matches[0][0]:test.matches[0][1]]
+			want := test.text[test.matches[0][0]:test.matches[0][1]]
 			if len(result) != cap(result) {
-				t.Errorf("expected capacity %d got %d: %s", len(result), cap(result), test)
+				t.Errorf("got capacity %d, want %d: %s", cap(result), len(result), test)
 			}
-			if expect != string(result) {
-				t.Errorf("expected %q got %q: %s", expect, result, test)
+			if want != string(result) {
+				t.Errorf("got %q, want %q: %s", result, want, test)
 			}
 		}
 	}
@@ -188,16 +188,16 @@ func TestFindString(t *testing.T) {
 		case len(test.matches) == 0 && len(result) == 0:
 			// ok
 		case test.matches == nil && result != "":
-			t.Errorf("expected no match; got one: %s", test)
+			t.Errorf("got match %q, want none: %s", result, test)
 		case test.matches != nil && result == "":
 			// Tricky because an empty result has two meanings: no match or empty match.
 			if test.matches[0][0] != test.matches[0][1] {
-				t.Errorf("expected match; got none: %s", test)
+				t.Errorf("got no match, want one: %s", test)
 			}
 		case test.matches != nil && result != "":
-			expect := test.text[test.matches[0][0]:test.matches[0][1]]
-			if expect != result {
-				t.Errorf("expected %q got %q: %s", expect, result, test)
+			want := test.text[test.matches[0][0]:test.matches[0][1]]
+			if want != result {
+				t.Errorf("got %q, want %q: %s", result, want, test)
 			}
 		}
 	}
@@ -208,13 +208,13 @@ func testFindIndex(test *FindTest, result []int, t *testing.T) {
 	case len(test.matches) == 0 && len(result) == 0:
 		// ok
 	case test.matches == nil && result != nil:
-		t.Errorf("expected no match; got one: %s", test)
+		t.Errorf("got match %v, want none: %s", result, test)
 	case test.matches != nil && result == nil:
-		t.Errorf("expected match; got none: %s", test)
+		t.Errorf("got no match, want one: %s", test)
 	case test.matches != nil && result != nil:
-		expect := test.matches[0]
-		if expect[0] != result[0] || expect[1] != result[1] {
-			t.Errorf("expected %v got %v: %s", expect, result, test)
+		want := test.matches[0]
+		if want[0] != result[0] || want[1] != result[1] {
+			t.Errorf("got %v, want %v: %s", result, want, test)
 		}
 	}
 }
@@ -246,22 +246,22 @@ func TestFindAll(t *testing.T) {
 		case test.matches == nil && result == nil:
 			// ok
 		case test.matches == nil && result != nil:
-			t.Errorf("expected no match; got one: %s", test)
+			t.Errorf("got match %q, want none: %s", result, test)
 		case test.matches != nil && result == nil:
-			t.Fatalf("expected match; got none: %s", test)
+			t.Fatalf("got no match, want one: %s", test)
 		case test.matches != nil && result != nil:
 			if len(test.matches) != len(result) {
-				t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
+				t.Errorf("got %d matches, want %d: %s", len(result), len(test.matches), test)
 				continue
 			}
 			for k, e := range test.matches {
 				got := result[k]
 				if len(got) != cap(got) {
-					t.Errorf("match %d: expected capacity %d got %d: %s", k, len(got), cap(got), test)
+					t.Errorf("match %d: got capacity %d, want %d: %s", k, cap(got), len(got), test)
 				}
-				expect := test.text[e[0]:e[1]]
-				if expect != string(got) {
-					t.Errorf("match %d: expected %q got %q: %s", k, expect, got, test)
+				want := test.text[e[0]:e[1]]
+				if want != string(got) {
+					t.Errorf("match %d: got %q, want %q: %s", k, got, want, test)
 				}
 			}
 		}
@@ -275,18 +275,18 @@ func TestFindAllString(t *testing.T) {
 		case test.matches == nil && result == nil:
 			// ok
 		case test.matches == nil && result != nil:
-			t.Errorf("expected no match; got one: %s", test)
+			t.Errorf("got match %q, want none: %s", result, test)
 		case test.matches != nil && result == nil:
-			t.Errorf("expected match; got none: %s", test)
+			t.Errorf("got no match, want one: %s", test)
 		case test.matches != nil && result != nil:
 			if len(test.matches) != len(result) {
-				t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
+				t.Errorf("got %d matches, want %d: %s", len(result), len(test.matches), test)
 				continue
 			}
 			for k, e := range test.matches {
-				expect := test.text[e[0]:e[1]]
-				if expect != result[k] {
-					t.Errorf("expected %q got %q: %s", expect, result, test)
+				want := test.text[e[0]:e[1]]
+				if want != result[k] {
+					t.Errorf("got %q, want %q: %s", result[k], want, test)
 				}
 			}
 		}
@@ -298,17 +298,17 @@ func testFindAllIndex(test *FindTest, result [][]int, t *testing.T) {
 	case test.matches == nil && result == nil:
 		// ok
 	case test.matches == nil && result != nil:
-		t.Errorf("expected no match; got one: %s", test)
+		t.Errorf("got match %v, want none: %s", result, test)
 	case test.matches != nil && result == nil:
-		t.Errorf("expected match; got none: %s", test)
+		t.Errorf("got no match, want one: %s", test)
 	case test.matches != nil && result != nil:
 		if len(test.matches) != len(result) {
-			t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
+			t.Errorf("got %d matches, want %d: %s", len(result), len(test.matches), test)
 			return
 		}
 		for k, e := range test.matches {
 			if e[0] != result[k][0] || e[1] != result[k][1] {
-				t.Errorf("match %d: expected %v got %v: %s", k, e, result[k], test)
+				t.Errorf("match %d: got %v, want %v: %s", k, result[k], e, test)
 			}
 		}
 	}
@@ -330,24 +330,24 @@ func TestFindAllStringIndex(t *testing.T) {

 func testSubmatchBytes(test *FindTest, n int, submatches []int, result [][]byte, t *testing.T) {
 	if len(submatches) != len(result)*2 {
-		t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test)
+		t.Errorf("match %d: got %d submatches, want %d: %s", n, len(result), len(submatches)/2, test)
 		return
 	}
 	for k := 0; k < len(submatches); k += 2 {
 		if submatches[k] == -1 {
 			if result[k/2] != nil {
-				t.Errorf("match %d: expected nil got %q: %s", n, result, test)
+				t.Errorf("match %d: got %q, want nil: %s", n, result, test)
 			}
 			continue
 		}
 		got := result[k/2]
 		if len(got) != cap(got) {
-			t.Errorf("match %d: expected capacity %d got %d: %s", n, len(got), cap(got), test)
+			t.Errorf("match %d: got capacity %d, want %d: %s", n, cap(got), len(got), test)
 			return
 		}
-		expect := test.text[submatches[k]:submatches[k+1]]
-		if expect != string(got) {
-			t.Errorf("match %d: expected %q got %q: %s", n, expect, got, test)
+		want := test.text[submatches[k]:submatches[k+1]]
+		if want != string(got) {
+			t.Errorf("match %d: got %q, want %q: %s", n, got, want, test)
 			return
 		}
 	}
@@ -360,9 +360,9 @@ func TestFindSubmatch(t *testing.T) {
 		case test.matches == nil && result == nil:
 			// ok
 		case test.matches == nil && result != nil:
-			t.Errorf("expected no match; got one: %s", test)
+			t.Errorf("got match %q, want none: %s", result, test)
 		case test.matches != nil && result == nil:
-			t.Errorf("expected match; got none: %s", test)
+			t.Errorf("got no match, want one: %s", test)
 		case test.matches != nil && result != nil:
 			testSubmatchBytes(&test, 0, test.matches[0], result, t)
 		}
@@ -371,19 +371,19 @@ func TestFindSubmatch(t *testing.T) {

 func testSubmatchString(test *FindTest, n int, submatches []int, result []string, t *testing.T) {
 	if len(submatches) != len(result)*2 {
-		t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test)
+		t.Errorf("match %d: got %d submatches, want %d: %s", n, len(result), len(submatches)/2, test)
 		return
 	}
 	for k := 0; k < len(submatches); k += 2 {
 		if submatches[k] == -1 {
 			if result[k/2] != "" {
-				t.Errorf("match %d: expected nil got %q: %s", n, result, test)
+				t.Errorf("match %d: got %q, want empty string: %s", n, result, test)
 			}
 			continue
 		}
-		expect := test.text[submatches[k]:submatches[k+1]]
-		if expect != result[k/2] {
-			t.Errorf("match %d: expected %q got %q: %s", n, expect, result, test)
+		want := test.text[submatches[k]:submatches[k+1]]
+		if want != result[k/2] {
+			t.Errorf("match %d: got %q, want %q: %s", n, result[k/2], want, test)
 			return
 		}
 	}
@@ -396,23 +396,23 @@ func TestFindStringSubmatch(t *testing.T) {
 		case test.matches == nil && result == nil:
 			// ok
 		case test.matches == nil && result != nil:
-			t.Errorf("expected no match; got one: %s", test)
+			t.Errorf("got match %q, want none: %s", result, test)
 		case test.matches != nil && result == nil:
-			t.Errorf("expected match; got none: %s", test)
+			t.Errorf("got no match, want one: %s", test)
 		case test.matches != nil && result != nil:
 			testSubmatchString(&test, 0, test.matches[0], result, t)
 		}
 	}
 }

-func testSubmatchIndices(test *FindTest, n int, expect, result []int, t *testing.T) {
-	if len(expect) != len(result) {
-		t.Errorf("match %d: expected %d matches; got %d: %s", n, len(expect)/2, len(result)/2, test)
+func testSubmatchIndices(test *FindTest, n int, want, result []int, t *testing.T) {
+	if len(want) != len(result) {
+		t.Errorf("match %d: got %d matches, want %d: %s", n, len(result)/2, len(want)/2, test)
 		return
 	}
-	for k, e := range expect {
+	for k, e := range want {
 		if e != result[k] {
-			t.Errorf("match %d: submatch error: expected %v got %v: %s", n, expect, result, test)
+			t.Errorf("match %d: submatch error: got %v, want %v: %s", n, result, want, test)
 		}
 	}
 }
@@ -422,9 +422,9 @@ func testFindSubmatchIndex(test *FindTest, result []int, t *testing.T) {
 	case test.matches == nil && result == nil:
 		// ok
 	case test.matches == nil && result != nil:
-		t.Errorf("expected no match; got one: %s", test)
+		t.Errorf("got match %v, want none: %s", result, test)
 	case test.matches != nil && result == nil:
-		t.Errorf("expected match; got none: %s", test)
+		t.Errorf("got no match, want one: %s", test)
 	case test.matches != nil && result != nil:
 		testSubmatchIndices(test, 0, test.matches[0], result, t)
 	}
@@ -457,11 +457,11 @@ func TestFindAllSubmatch(t *testing.T) {
 		case test.matches == nil && result == nil:
 			// ok
 		case test.matches == nil && result != nil:
-			t.Errorf("expected no match; got one: %s", test)
+			t.Errorf("got match %q, want none: %s", result, test)
 		case test.matches != nil && result == nil:
-			t.Errorf("expected match; got none: %s", test)
+			t.Errorf("got no match, want one: %s", test)
 		case len(test.matches) != len(result):
-			t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
+			t.Errorf("got %d matches, want %d: %s", len(result), len(test.matches), test)
 		case test.matches != nil && result != nil:
 			for k, match := range test.matches {
 				testSubmatchBytes(&test, k, match, result[k], t)
@@ -477,11 +477,11 @@ func TestFindAllStringSubmatch(t *testing.T) {
 		case test.matches == nil && result == nil:
 			// ok
 		case test.matches == nil && result != nil:
-			t.Errorf("expected no match; got one: %s", test)
+			t.Errorf("got match %q, want none: %s", result, test)
 		case test.matches != nil && result == nil:
-			t.Errorf("expected match; got none: %s", test)
+			t.Errorf("got no match, want one: %s", test)
 		case len(test.matches) != len(result):
-			t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
+			t.Errorf("got %d matches, want %d: %s", len(result), len(test.matches), test)
 		case test.matches != nil && result != nil:
 			for k, match := range test.matches {
 				testSubmatchString(&test, k, match, result[k], t)
@@ -495,11 +495,11 @@ func testFindAllSubmatchIndex(test *FindTest, result [][]int, t *testing.T) {
 	case test.matches == nil && result == nil:
 		// ok
 	case test.matches == nil && result != nil:
-		t.Errorf("expected no match; got one: %s", test)
+		t.Errorf("got match %v, want none: %s", result, test)
 	case test.matches != nil && result == nil:
-		t.Errorf("expected match; got none: %s", test)
+		t.Errorf("got no match, want one: %s", test)
 	case len(test.matches) != len(result):
-		t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
+		t.Errorf("got %d matches, want %d: %s", len(result), len(test.matches), test)
 	case test.matches != nil && result != nil:
 		for k, match := range test.matches {
 			testSubmatchIndices(test, k, match, result[k], t)
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -1213,7 +1213,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
 	if goexperiment.RuntimeSecret && gp.secret > 0 {
 		// Mark any object allocated while in secret mode as secret.
 		// This ensures we zero it immediately when freeing it.
-		addSecret(x)
+		addSecret(x, size)
 	}

 	// Notify sanitizers, if enabled.
--- a/src/runtime/malloc_generated.go
+++ b/src/runtime/malloc_generated.go
@@ -156,7 +156,7 @@ func mallocgcSmallScanNoHeaderSC1(size uintptr, typ *_type, needzero bool) unsaf
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -321,7 +321,7 @@ func mallocgcSmallScanNoHeaderSC2(size uintptr, typ *_type, needzero bool) unsaf
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -486,7 +486,7 @@ func mallocgcSmallScanNoHeaderSC3(size uintptr, typ *_type, needzero bool) unsaf
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -651,7 +651,7 @@ func mallocgcSmallScanNoHeaderSC4(size uintptr, typ *_type, needzero bool) unsaf
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -816,7 +816,7 @@ func mallocgcSmallScanNoHeaderSC5(size uintptr, typ *_type, needzero bool) unsaf
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -981,7 +981,7 @@ func mallocgcSmallScanNoHeaderSC6(size uintptr, typ *_type, needzero bool) unsaf
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -1146,7 +1146,7 @@ func mallocgcSmallScanNoHeaderSC7(size uintptr, typ *_type, needzero bool) unsaf
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -1311,7 +1311,7 @@ func mallocgcSmallScanNoHeaderSC8(size uintptr, typ *_type, needzero bool) unsaf
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -1476,7 +1476,7 @@ func mallocgcSmallScanNoHeaderSC9(size uintptr, typ *_type, needzero bool) unsaf
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -1641,7 +1641,7 @@ func mallocgcSmallScanNoHeaderSC10(size uintptr, typ *_type, needzero bool) unsa
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -1806,7 +1806,7 @@ func mallocgcSmallScanNoHeaderSC11(size uintptr, typ *_type, needzero bool) unsa
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -1971,7 +1971,7 @@ func mallocgcSmallScanNoHeaderSC12(size uintptr, typ *_type, needzero bool) unsa
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -2136,7 +2136,7 @@ func mallocgcSmallScanNoHeaderSC13(size uintptr, typ *_type, needzero bool) unsa
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -2301,7 +2301,7 @@ func mallocgcSmallScanNoHeaderSC14(size uintptr, typ *_type, needzero bool) unsa
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -2466,7 +2466,7 @@ func mallocgcSmallScanNoHeaderSC15(size uintptr, typ *_type, needzero bool) unsa
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -2631,7 +2631,7 @@ func mallocgcSmallScanNoHeaderSC16(size uintptr, typ *_type, needzero bool) unsa
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -2796,7 +2796,7 @@ func mallocgcSmallScanNoHeaderSC17(size uintptr, typ *_type, needzero bool) unsa
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -2961,7 +2961,7 @@ func mallocgcSmallScanNoHeaderSC18(size uintptr, typ *_type, needzero bool) unsa
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -3126,7 +3126,7 @@ func mallocgcSmallScanNoHeaderSC19(size uintptr, typ *_type, needzero bool) unsa
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -3291,7 +3291,7 @@ func mallocgcSmallScanNoHeaderSC20(size uintptr, typ *_type, needzero bool) unsa
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -3456,7 +3456,7 @@ func mallocgcSmallScanNoHeaderSC21(size uintptr, typ *_type, needzero bool) unsa
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -3621,7 +3621,7 @@ func mallocgcSmallScanNoHeaderSC22(size uintptr, typ *_type, needzero bool) unsa
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -3786,7 +3786,7 @@ func mallocgcSmallScanNoHeaderSC23(size uintptr, typ *_type, needzero bool) unsa
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -3951,7 +3951,7 @@ func mallocgcSmallScanNoHeaderSC24(size uintptr, typ *_type, needzero bool) unsa
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -4116,7 +4116,7 @@ func mallocgcSmallScanNoHeaderSC25(size uintptr, typ *_type, needzero bool) unsa
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -4281,7 +4281,7 @@ func mallocgcSmallScanNoHeaderSC26(size uintptr, typ *_type, needzero bool) unsa
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -6686,7 +6686,7 @@ func mallocgcSmallNoScanSC2(size uintptr, typ *_type, needzero bool) unsafe.Poin
 			gp := getg()
 			if goexperiment.RuntimeSecret && gp.secret > 0 {

-				addSecret(x)
+				addSecret(x, size)
 			}

 			if valgrindenabled {
@@ -6757,7 +6757,7 @@ func mallocgcSmallNoScanSC2(size uintptr, typ *_type, needzero bool) unsafe.Poin
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -6822,7 +6822,7 @@ func mallocgcSmallNoScanSC3(size uintptr, typ *_type, needzero bool) unsafe.Poin
 			gp := getg()
 			if goexperiment.RuntimeSecret && gp.secret > 0 {

-				addSecret(x)
+				addSecret(x, size)
 			}

 			if valgrindenabled {
@@ -6893,7 +6893,7 @@ func mallocgcSmallNoScanSC3(size uintptr, typ *_type, needzero bool) unsafe.Poin
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -6958,7 +6958,7 @@ func mallocgcSmallNoScanSC4(size uintptr, typ *_type, needzero bool) unsafe.Poin
 			gp := getg()
 			if goexperiment.RuntimeSecret && gp.secret > 0 {

-				addSecret(x)
+				addSecret(x, size)
 			}

 			if valgrindenabled {
@@ -7029,7 +7029,7 @@ func mallocgcSmallNoScanSC4(size uintptr, typ *_type, needzero bool) unsafe.Poin
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -7094,7 +7094,7 @@ func mallocgcSmallNoScanSC5(size uintptr, typ *_type, needzero bool) unsafe.Poin
 			gp := getg()
 			if goexperiment.RuntimeSecret && gp.secret > 0 {

-				addSecret(x)
+				addSecret(x, size)
 			}

 			if valgrindenabled {
@@ -7165,7 +7165,7 @@ func mallocgcSmallNoScanSC5(size uintptr, typ *_type, needzero bool) unsafe.Poin
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -7230,7 +7230,7 @@ func mallocgcSmallNoScanSC6(size uintptr, typ *_type, needzero bool) unsafe.Poin
 			gp := getg()
 			if goexperiment.RuntimeSecret && gp.secret > 0 {

-				addSecret(x)
+				addSecret(x, size)
 			}

 			if valgrindenabled {
@@ -7301,7 +7301,7 @@ func mallocgcSmallNoScanSC6(size uintptr, typ *_type, needzero bool) unsafe.Poin
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -7366,7 +7366,7 @@ func mallocgcSmallNoScanSC7(size uintptr, typ *_type, needzero bool) unsafe.Poin
 			gp := getg()
 			if goexperiment.RuntimeSecret && gp.secret > 0 {

-				addSecret(x)
+				addSecret(x, size)
 			}

 			if valgrindenabled {
@@ -7437,7 +7437,7 @@ func mallocgcSmallNoScanSC7(size uintptr, typ *_type, needzero bool) unsafe.Poin
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -7502,7 +7502,7 @@ func mallocgcSmallNoScanSC8(size uintptr, typ *_type, needzero bool) unsafe.Poin
 			gp := getg()
 			if goexperiment.RuntimeSecret && gp.secret > 0 {

-				addSecret(x)
+				addSecret(x, size)
 			}

 			if valgrindenabled {
@@ -7573,7 +7573,7 @@ func mallocgcSmallNoScanSC8(size uintptr, typ *_type, needzero bool) unsafe.Poin
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -7638,7 +7638,7 @@ func mallocgcSmallNoScanSC9(size uintptr, typ *_type, needzero bool) unsafe.Poin
 			gp := getg()
 			if goexperiment.RuntimeSecret && gp.secret > 0 {

-				addSecret(x)
+				addSecret(x, size)
 			}

 			if valgrindenabled {
@@ -7709,7 +7709,7 @@ func mallocgcSmallNoScanSC9(size uintptr, typ *_type, needzero bool) unsafe.Poin
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -7774,7 +7774,7 @@ func mallocgcSmallNoScanSC10(size uintptr, typ *_type, needzero bool) unsafe.Poi
 			gp := getg()
 			if goexperiment.RuntimeSecret && gp.secret > 0 {

-				addSecret(x)
+				addSecret(x, size)
 			}

 			if valgrindenabled {
@@ -7845,7 +7845,7 @@ func mallocgcSmallNoScanSC10(size uintptr, typ *_type, needzero bool) unsafe.Poi
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -7910,7 +7910,7 @@ func mallocgcSmallNoScanSC11(size uintptr, typ *_type, needzero bool) unsafe.Poi
 			gp := getg()
 			if goexperiment.RuntimeSecret && gp.secret > 0 {

-				addSecret(x)
+				addSecret(x, size)
 			}

 			if valgrindenabled {
@@ -7981,7 +7981,7 @@ func mallocgcSmallNoScanSC11(size uintptr, typ *_type, needzero bool) unsafe.Poi
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -8046,7 +8046,7 @@ func mallocgcSmallNoScanSC12(size uintptr, typ *_type, needzero bool) unsafe.Poi
 			gp := getg()
 			if goexperiment.RuntimeSecret && gp.secret > 0 {

-				addSecret(x)
+				addSecret(x, size)
 			}

 			if valgrindenabled {
@@ -8117,7 +8117,7 @@ func mallocgcSmallNoScanSC12(size uintptr, typ *_type, needzero bool) unsafe.Poi
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -8182,7 +8182,7 @@ func mallocgcSmallNoScanSC13(size uintptr, typ *_type, needzero bool) unsafe.Poi
 			gp := getg()
 			if goexperiment.RuntimeSecret && gp.secret > 0 {

-				addSecret(x)
+				addSecret(x, size)
 			}

 			if valgrindenabled {
@@ -8253,7 +8253,7 @@ func mallocgcSmallNoScanSC13(size uintptr, typ *_type, needzero bool) unsafe.Poi
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -8318,7 +8318,7 @@ func mallocgcSmallNoScanSC14(size uintptr, typ *_type, needzero bool) unsafe.Poi
 			gp := getg()
 			if goexperiment.RuntimeSecret && gp.secret > 0 {

-				addSecret(x)
+				addSecret(x, size)
 			}

 			if valgrindenabled {
@@ -8389,7 +8389,7 @@ func mallocgcSmallNoScanSC14(size uintptr, typ *_type, needzero bool) unsafe.Poi
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -8454,7 +8454,7 @@ func mallocgcSmallNoScanSC15(size uintptr, typ *_type, needzero bool) unsafe.Poi
 			gp := getg()
 			if goexperiment.RuntimeSecret && gp.secret > 0 {

-				addSecret(x)
+				addSecret(x, size)
 			}

 			if valgrindenabled {
@@ -8525,7 +8525,7 @@ func mallocgcSmallNoScanSC15(size uintptr, typ *_type, needzero bool) unsafe.Poi
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -8590,7 +8590,7 @@ func mallocgcSmallNoScanSC16(size uintptr, typ *_type, needzero bool) unsafe.Poi
 			gp := getg()
 			if goexperiment.RuntimeSecret && gp.secret > 0 {

-				addSecret(x)
+				addSecret(x, size)
 			}

 			if valgrindenabled {
@@ -8661,7 +8661,7 @@ func mallocgcSmallNoScanSC16(size uintptr, typ *_type, needzero bool) unsafe.Poi
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -8726,7 +8726,7 @@ func mallocgcSmallNoScanSC17(size uintptr, typ *_type, needzero bool) unsafe.Poi
 			gp := getg()
 			if goexperiment.RuntimeSecret && gp.secret > 0 {

-				addSecret(x)
+				addSecret(x, size)
 			}

 			if valgrindenabled {
@@ -8797,7 +8797,7 @@ func mallocgcSmallNoScanSC17(size uintptr, typ *_type, needzero bool) unsafe.Poi
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -8862,7 +8862,7 @@ func mallocgcSmallNoScanSC18(size uintptr, typ *_type, needzero bool) unsafe.Poi
 			gp := getg()
 			if goexperiment.RuntimeSecret && gp.secret > 0 {

-				addSecret(x)
+				addSecret(x, size)
 			}

 			if valgrindenabled {
@@ -8933,7 +8933,7 @@ func mallocgcSmallNoScanSC18(size uintptr, typ *_type, needzero bool) unsafe.Poi
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -8998,7 +8998,7 @@ func mallocgcSmallNoScanSC19(size uintptr, typ *_type, needzero bool) unsafe.Poi
 			gp := getg()
 			if goexperiment.RuntimeSecret && gp.secret > 0 {

-				addSecret(x)
+				addSecret(x, size)
 			}

 			if valgrindenabled {
@@ -9069,7 +9069,7 @@ func mallocgcSmallNoScanSC19(size uintptr, typ *_type, needzero bool) unsafe.Poi
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -9134,7 +9134,7 @@ func mallocgcSmallNoScanSC20(size uintptr, typ *_type, needzero bool) unsafe.Poi
 			gp := getg()
 			if goexperiment.RuntimeSecret && gp.secret > 0 {

-				addSecret(x)
+				addSecret(x, size)
 			}

 			if valgrindenabled {
@@ -9205,7 +9205,7 @@ func mallocgcSmallNoScanSC20(size uintptr, typ *_type, needzero bool) unsafe.Poi
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -9270,7 +9270,7 @@ func mallocgcSmallNoScanSC21(size uintptr, typ *_type, needzero bool) unsafe.Poi
 			gp := getg()
 			if goexperiment.RuntimeSecret && gp.secret > 0 {

-				addSecret(x)
+				addSecret(x, size)
 			}

 			if valgrindenabled {
@@ -9341,7 +9341,7 @@ func mallocgcSmallNoScanSC21(size uintptr, typ *_type, needzero bool) unsafe.Poi
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -9406,7 +9406,7 @@ func mallocgcSmallNoScanSC22(size uintptr, typ *_type, needzero bool) unsafe.Poi
 			gp := getg()
 			if goexperiment.RuntimeSecret && gp.secret > 0 {

-				addSecret(x)
+				addSecret(x, size)
 			}

 			if valgrindenabled {
@@ -9477,7 +9477,7 @@ func mallocgcSmallNoScanSC22(size uintptr, typ *_type, needzero bool) unsafe.Poi
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -9542,7 +9542,7 @@ func mallocgcSmallNoScanSC23(size uintptr, typ *_type, needzero bool) unsafe.Poi
 			gp := getg()
 			if goexperiment.RuntimeSecret && gp.secret > 0 {

-				addSecret(x)
+				addSecret(x, size)
 			}

 			if valgrindenabled {
@@ -9613,7 +9613,7 @@ func mallocgcSmallNoScanSC23(size uintptr, typ *_type, needzero bool) unsafe.Poi
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -9678,7 +9678,7 @@ func mallocgcSmallNoScanSC24(size uintptr, typ *_type, needzero bool) unsafe.Poi
 			gp := getg()
 			if goexperiment.RuntimeSecret && gp.secret > 0 {

-				addSecret(x)
+				addSecret(x, size)
 			}

 			if valgrindenabled {
@@ -9749,7 +9749,7 @@ func mallocgcSmallNoScanSC24(size uintptr, typ *_type, needzero bool) unsafe.Poi
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -9814,7 +9814,7 @@ func mallocgcSmallNoScanSC25(size uintptr, typ *_type, needzero bool) unsafe.Poi
 			gp := getg()
 			if goexperiment.RuntimeSecret && gp.secret > 0 {

-				addSecret(x)
+				addSecret(x, size)
 			}

 			if valgrindenabled {
@@ -9885,7 +9885,7 @@ func mallocgcSmallNoScanSC25(size uintptr, typ *_type, needzero bool) unsafe.Poi
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
@@ -9950,7 +9950,7 @@ func mallocgcSmallNoScanSC26(size uintptr, typ *_type, needzero bool) unsafe.Poi
 			gp := getg()
 			if goexperiment.RuntimeSecret && gp.secret > 0 {

-				addSecret(x)
+				addSecret(x, size)
 			}

 			if valgrindenabled {
@@ -10021,7 +10021,7 @@ func mallocgcSmallNoScanSC26(size uintptr, typ *_type, needzero bool) unsafe.Poi
 	gp := getg()
 	if goexperiment.RuntimeSecret && gp.secret > 0 {

-		addSecret(x)
+		addSecret(x, size)
 	}

 	if valgrindenabled {
--- a/src/runtime/malloc_stubs.go
+++ b/src/runtime/malloc_stubs.go
@@ -101,7 +101,7 @@ func mallocStub(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
 		if goexperiment.RuntimeSecret && gp.secret > 0 {
 			// Mark any object allocated while in secret mode as secret.
 			// This ensures we zero it immediately when freeing it.
-			addSecret(x)
+			addSecret(x, size)
 		}
 	}

--- a/src/runtime/mcleanup_test.go
+++ b/src/runtime/mcleanup_test.go
@@ -331,9 +331,14 @@ func TestCleanupLost(t *testing.T) {
 	}
 	wg.Wait()
 	runtime.GC()
-	runtime.BlockUntilEmptyCleanupQueue(int64(10 * time.Second))
+	timeout := 10 * time.Second
+	empty := runtime.BlockUntilEmptyCleanupQueue(int64(timeout))
+	if !empty {
+		t.Errorf("failed to drain cleanup queue within %s", timeout)
+	}
+
 	if got := int(got.Load()); got != want {
-		t.Errorf("expected %d cleanups to be executed, got %d", got, want)
+		t.Errorf("%d cleanups executed, expected %d", got, want)
 	}
 }

--- a/src/runtime/metrics_cgo_test.go
+++ b/src/runtime/metrics_cgo_test.go
@@ -12,7 +12,7 @@ import (
 	"testing"
 )

-func TestNotInGoMetricCallback(t *testing.T) {
+func TestNotInGoMetric(t *testing.T) {
 	switch runtime.GOOS {
 	case "windows", "plan9":
 		t.Skip("unsupported on Windows and Plan9")
@@ -22,11 +22,22 @@ func TestNotInGoMetricCallback(t *testing.T) {
 		}
 	}

-	// This test is run in a subprocess to prevent other tests from polluting the metrics
-	// and because we need to make some cgo callbacks.
-	output := runTestProg(t, "testprogcgo", "NotInGoMetricCallback")
-	want := "OK\n"
-	if output != want {
-		t.Fatalf("output:\n%s\n\nwanted:\n%s", output, want)
+	run := func(t *testing.T, name string) {
+		// This test is run in a subprocess to prevent other tests from polluting the metrics
+		// and because we need to make some cgo callbacks.
+		output := runTestProg(t, "testprogcgo", name)
+		want := "OK\n"
+		if output != want {
+			t.Fatalf("output:\n%s\n\nwanted:\n%s", output, want)
+		}
 	}
+	t.Run("CgoCall", func(t *testing.T) {
+		run(t, "NotInGoMetricCgoCall")
+	})
+	t.Run("CgoCallback", func(t *testing.T) {
+		run(t, "NotInGoMetricCgoCallback")
+	})
+	t.Run("CgoCallAndCallback", func(t *testing.T) {
+		run(t, "NotInGoMetricCgoCallAndCallback")
+	})
 }
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -2745,6 +2745,14 @@ type specialPinCounter struct {
 	counter uintptr
 }

+// specialSecret tracks whether we need to zero an object immediately
+// upon freeing.
+type specialSecret struct {
+	_       sys.NotInHeap
+	special special // kind is _KindSpecialSecret (set by addSecret)
+	size    uintptr // number of bytes freeSpecial zeroes, starting at the pointer the special is attached to
+}
+
 // specialsIter helps iterate over specials lists.
 type specialsIter struct {
 	pprev **special
@@ -2775,6 +2783,12 @@ func (i *specialsIter) unlinkAndNext() *special {

 // freeSpecial performs any cleanup on special s and deallocates it.
 // s must already be unlinked from the specials list.
+// TODO(mknyszek): p and size together DO NOT represent a valid allocation.
+// size is the size of the allocation block in the span (mspan.elemsize), and p is
+// whatever pointer the special was attached to, which need not point to the
+// beginning of the block, though it may.
+// Consider passing the arguments differently to avoid giving the impression
+// that p and size together represent an address range.
 func freeSpecial(s *special, p unsafe.Pointer, size uintptr) {
 	switch s.kind {
 	case _KindSpecialFinalizer:
@@ -2828,7 +2842,19 @@ func freeSpecial(s *special, p unsafe.Pointer, size uintptr) {
 		mheap_.specialBubbleAlloc.free(unsafe.Pointer(st))
 		unlock(&mheap_.speciallock)
 	case _KindSpecialSecret:
-		memclrNoHeapPointers(p, size)
+		ss := (*specialSecret)(unsafe.Pointer(s))
+		// p is the actual byte location that the special was
+		// attached to, but the size argument is the span
+		// element size. If we were to zero out using the size
+		// argument, we'd trounce over adjacent memory in cases
+		// where the allocation contains a header. Hence, we use
+		// the user-visible size which we stash in the special itself.
+		//
+		// p always points to the beginning of the user-visible
+		// allocation since the only way to attach a secret special
+		// is via the allocation path. This isn't universal for
+		// tiny allocs, but we avoid them in mallocgc anyway.
+		memclrNoHeapPointers(p, ss.size)
 		lock(&mheap_.speciallock)
 		mheap_.specialSecretAlloc.free(unsafe.Pointer(s))
 		unlock(&mheap_.speciallock)
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -2455,8 +2455,16 @@ func needm(signal bool) {
 	// mp.curg is now a real goroutine.
 	casgstatus(mp.curg, _Gdeadextra, _Gsyscall)
 	sched.ngsys.Add(-1)
-	// N.B. We do not update nGsyscallNoP, because isExtraInC threads are not
-	// counted as real goroutines while they're in C.
+
+	// This is technically inaccurate, but we set isExtraInC to false above,
+	// and so we need to update addGSyscallNoP to keep the two pieces of state
+	// consistent (it's only updated when isExtraInC is false). More specifically,
+	// when we get to cgocallbackg and exitsyscall, we'll be looking for a P, and
+	// since isExtraInC is false, we will decrement this metric.
+	//
+	// The inaccuracy is thankfully transient: only until this thread can get a P.
+	// We're going into Go anyway, so it's okay to pretend we're a real goroutine now.
+	addGSyscallNoP(mp)

 	if !signal {
 		if trace.ok() {
@@ -5027,7 +5035,7 @@ func exitsyscallTryGetP(oldp *p) *p {
 	if oldp != nil {
 		if thread, ok := setBlockOnExitSyscall(oldp); ok {
 			thread.takeP()
-			addGSyscallNoP(thread.mp) // takeP does the opposite, but this is a net zero change.
+			decGSyscallNoP(getg().m) // We got a P for ourselves.
 			thread.resume()
 			return oldp
 		}
--- a/src/runtime/rt0_freebsd_arm64.s
+++ b/src/runtime/rt0_freebsd_arm64.s
@@ -4,9 +4,12 @@

 #include "textflag.h"

-// On FreeBSD argc/argv are passed in R0, not RSP
+// FreeBSD passes a pointer to the argument block in R0, not RSP,
+// so _rt0_arm64 cannot be used.
 TEXT _rt0_arm64_freebsd(SB),NOSPLIT,$0
-	JMP	_rt0_arm64(SB)
+	ADD	$8, R0, R1	// argv (use R0 while it's still the pointer)
+	MOVD	0(R0), R0	// argc
+	JMP	runtime·rt0_go(SB)

 // When building with -buildmode=c-shared, this symbol is called when the shared
 // library is loaded.
--- a/src/runtime/secret.go
+++ b/src/runtime/secret.go
@@ -55,15 +55,9 @@ func secret_eraseSecrets() {
 	// Don't put any code here: the stack frame's contents are gone!
 }

-// specialSecret tracks whether we need to zero an object immediately
-// upon freeing.
-type specialSecret struct {
-	special special
-}
-
 // addSecret records the fact that we need to zero p immediately
 // when it is freed.
-func addSecret(p unsafe.Pointer) {
+func addSecret(p unsafe.Pointer, size uintptr) {
 	// TODO(dmo): figure out the cost of these. These are mostly
 	// intended to catch allocations that happen via the runtime
 	// that the user has no control over and not big buffers that user
@@ -72,6 +66,7 @@ func addSecret(p unsafe.Pointer) {
 	lock(&mheap_.speciallock)
 	s := (*specialSecret)(mheap_.specialSecretAlloc.alloc())
 	s.special.kind = _KindSpecialSecret
+	s.size = size
 	unlock(&mheap_.speciallock)
 	addspecial(p, &s.special, false)
 }
--- a/src/runtime/secret/alloc_test.go
+++ b/src/runtime/secret/alloc_test.go
@@ -0,0 +1,39 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.runtimesecret && (arm64 || amd64) && linux
+
+package secret_test
+
+import (
+	"runtime"
+	"runtime/secret"
+	"testing"
+)
+
+// TestInterleavedAllocFrees alternates, inside secret.Do, between heap
+// objects that are kept alive beyond secret.Do and heap objects that do
+// not live past it.
+// The intent is for the clearing of one object (with the wrong size)
+// to clobber the type header of the next slot. If the GC sees a nil type
+// header when it expects to find one, it can throw.
+func TestInterleavedAllocFrees(t *testing.T) {
+	type T struct {
+		p *int
+		x [1024]byte
+	}
+	const rounds, perRound = 10, 100
+	for range rounds {
+		var kept []*T
+		secret.Do(func() {
+			for n := range perRound {
+				obj := &T{}
+				if n%2 != 0 {
+					// Odd objects are dropped right away.
+					continue
+				}
+				kept = append(kept, obj)
+			}
+		})
+		runtime.GC()
+		runtime.GC()
+		runtime.KeepAlive(kept)
+	}
+}
--- a/src/runtime/secret/doc.go
+++ b/src/runtime/secret/doc.go
@@ -0,0 +1,15 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.runtimesecret
+
+// Package secret contains helper functions for zeroing out memory
+// that is otherwise invisible to a user program in the service of
+// forward secrecy. See https://en.wikipedia.org/wiki/Forward_secrecy for
+// more information.
+//
+// This package (runtime/secret) is experimental,
+// and not subject to the Go 1 compatibility promise.
+// It only exists when building with the GOEXPERIMENT=runtimesecret environment variable set.
+package secret
--- a/src/runtime/secret/secret.go
+++ b/src/runtime/secret/secret.go
@@ -18,12 +18,23 @@ import (
 // entire call tree initiated by f.)
 //   - Any registers used by f are erased before Do returns.
 //   - Any stack used by f is erased before Do returns.
-//   - Any heap allocation done by f is erased as soon as the garbage
-//     collector realizes that it is no longer reachable.
+//   - Heap allocations done by f are erased as soon as the garbage
+//     collector realizes that all allocated values are no longer reachable.
 //   - Do works even if f panics or calls runtime.Goexit.  As part of
 //     that, any panic raised by f will appear as if it originates from
 //     Do itself.
 //
+// Users should be cautious of allocating inside Do.
+// Erasing heap memory after Do returns may increase garbage collector sweep times and
+// requires additional memory to keep track of allocations until they are to be erased.
+// These costs can compound when an allocation is done in the service of growing a value,
+// like appending to a slice or inserting into a map. In these cases, the entire new allocation is erased rather
+// than just the secret parts of it.
+//
+// To reduce lifetimes of allocations and avoid unexpected performance issues,
+// if a function invoked by Do needs to yield a result that shouldn't be erased,
+// it should do so by copying the result into an allocation created by the caller.
+//
 // Limitations:
 //   - Currently only supported on linux/amd64 and linux/arm64.  On unsupported
 //     platforms, Do will invoke f directly.
--- a/src/runtime/secret/secret_test.go
+++ b/src/runtime/secret/secret_test.go
@@ -19,6 +19,7 @@ import (
 	"testing"
 	"time"
 	"unsafe"
+	"weak"
 )

 type secretType int64
@@ -63,28 +64,33 @@ func heapSTiny() *secretType {
 // are freed.
 // See runtime/mheap.go:freeSpecial.
 func TestHeap(t *testing.T) {
-	var u uintptr
+	var addr uintptr
+	var p weak.Pointer[S]
 	Do(func() {
-		u = uintptr(unsafe.Pointer(heapS()))
+		sp := heapS()
+		addr = uintptr(unsafe.Pointer(sp))
+		p = weak.Make(sp)
 	})
-
-	runtime.GC()
+	waitCollected(t, p)

 	// Check that object got zeroed.
-	checkRangeForSecret(t, u, u+unsafe.Sizeof(S{}))
+	checkRangeForSecret(t, addr, addr+unsafe.Sizeof(S{}))
 	// Also check our stack, just because we can.
 	checkStackForSecret(t)
 }

 func TestHeapTiny(t *testing.T) {
-	var u uintptr
+	var addr uintptr
+	var p weak.Pointer[secretType]
 	Do(func() {
-		u = uintptr(unsafe.Pointer(heapSTiny()))
+		sp := heapSTiny()
+		addr = uintptr(unsafe.Pointer(sp))
+		p = weak.Make(sp)
 	})
-	runtime.GC()
+	waitCollected(t, p)

 	// Check that object got zeroed.
-	checkRangeForSecret(t, u, u+unsafe.Sizeof(secretType(0)))
+	checkRangeForSecret(t, addr, addr+unsafe.Sizeof(secretType(0)))
 	// Also check our stack, just because we can.
 	checkStackForSecret(t)
 }
@@ -240,6 +246,20 @@ func checkRangeForSecret(t *testing.T, lo, hi uintptr) {
 	}
 }

+// waitCollected runs garbage collections until the object referenced by
+// ptr has been collected, that is, until ptr.Value() returns nil. It then
+// logs how many GC cycles that took.
+//
+// If the object is still reachable after more than 20 cycles, the test is
+// failed and stopped immediately. It must therefore be called from the
+// goroutine running the test.
+func waitCollected[P any](t *testing.T, ptr weak.Pointer[P]) {
+	t.Helper()
+	i := 0
+	for ptr.Value() != nil {
+		runtime.GC()
+		i++
+		// 20 seems like a decent number of times to try
+		if i > 20 {
+			// Fatal, not Error: the loop has no other exit while the
+			// value is still reachable, so Error would spin forever.
+			t.Fatal("value was never collected")
+		}
+	}
+	t.Logf("number of cycles until collection: %d", i)
+}
+
 func TestRegisters(t *testing.T) {
 	Do(func() {
 		s := makeS()
--- a/src/runtime/secret_nosecret.go
+++ b/src/runtime/secret_nosecret.go
@@ -22,9 +22,7 @@ func secret_dec() {}
 //go:linkname secret_eraseSecrets runtime/secret.eraseSecrets
 func secret_eraseSecrets() {}

-func addSecret(p unsafe.Pointer) {}
-
-type specialSecret struct{}
+func addSecret(p unsafe.Pointer, size uintptr) {}

 //go:linkname secret_getStack runtime/secret.getStack
 func secret_getStack() (uintptr, uintptr) { return 0, 0 }
--- a/src/runtime/testdata/testprogcgo/notingo.go
+++ b/src/runtime/testdata/testprogcgo/notingo.go
@@ -12,6 +12,7 @@ package main
 #include <pthread.h>

 extern void Ready();
+extern void BlockForeverInGo();

 static _Atomic int spinning;
 static _Atomic int released;
@@ -40,6 +41,21 @@ static void Release() {
 	atomic_store(&spinning, 0);
 	atomic_store(&released, 1);
 }
+
+// enterGoThenWait is the pthread start routine used by
+// WaitInGoInNewCThread. It calls into Go through BlockForeverInGo.
+// arg is unused.
+static void* enterGoThenWait(void* arg __attribute__ ((unused))) {
+	BlockForeverInGo();
+	return NULL; // only reached if BlockForeverInGo returns
+}
+
+// WaitInGoInNewCThread starts a new C thread that runs enterGoThenWait.
+// The thread is never joined and the result of pthread_create is not
+// checked.
+static void WaitInGoInNewCThread() {
+	pthread_t tid;
+	pthread_create(&tid, NULL, enterGoThenWait, NULL);
+}
+
+// SpinForever increments the spinning counter (presumably the value
+// that Spinning() reports to the Go side) and then busy-loops without
+// ever returning.
+static void SpinForever() {
+	atomic_fetch_add(&spinning, 1);
+	while(1) {};
+}
 */
 import "C"

@@ -47,15 +63,62 @@ import (
 	"os"
 	"runtime"
 	"runtime/metrics"
+	"sync/atomic"
 )

 func init() {
-	register("NotInGoMetricCallback", NotInGoMetricCallback)
+	register("NotInGoMetricCgoCall", NotInGoMetricCgoCall)
+	register("NotInGoMetricCgoCallback", NotInGoMetricCgoCallback)
+	register("NotInGoMetricCgoCallAndCallback", NotInGoMetricCgoCallAndCallback)
 }

-func NotInGoMetricCallback() {
+// NotInGoMetricCgoCall double-checks that N goroutines blocked in a cgo
+// call are reported as N by the /sched/goroutines/not-in-go:goroutines
+// metric, both before and after a stop-the-world.
+//
+// It prints "OK" on success. On a bad reading it prints a diagnostic for
+// each mismatch and exits with status 2.
+func NotInGoMetricCgoCall() {
 	const N = 10
+
+	// Spin up N goroutines that will all wait in a cgo call.
+	for range N {
+		go func() {
+			C.SpinForever()
+		}()
+	}
+
+	// Make sure we're all blocked and spinning.
+	for C.Spinning() < N {
+	}
+
+	// Read not-in-go before taking the Ps back.
 	s := []metrics.Sample{{Name: "/sched/goroutines/not-in-go:goroutines"}}
+	failed := false
+	metrics.Read(s)
+	if n := s[0].Value.Uint64(); n != N {
+		println("pre-STW: expected", N, "not-in-go goroutines, found", n)
+		failed = true
+	}
+
+	// Do something that stops the world to take all the Ps back.
+	//
+	// This will force a re-accounting of some of the goroutines and
+	// re-checking not-in-go will help catch bugs.
+	runtime.ReadMemStats(&m)
+
+	// Read not-in-go.
+	metrics.Read(s)
+	if n := s[0].Value.Uint64(); n != N {
+		println("post-STW: expected", N, "not-in-go goroutines, found", n)
+		failed = true
+	}
+
+	// Fail if we get a bad reading.
+	if failed {
+		os.Exit(2)
+	}
+	println("OK")
+}
+
+// NotInGoMetricCgoCallback tests that threads that called into Go, then returned
+// to C with *no* Go on the stack, are *not* counted as not-in-go in the
+// runtime/metrics package.
+func NotInGoMetricCgoCallback() {
+	const N = 10

 	// Create N new C threads that have called into Go at least once.
 	for range N {
@@ -90,6 +153,7 @@ func NotInGoMetricCallback() {
 	}

 	// Read not-in-go.
+	s := []metrics.Sample{{Name: "/sched/goroutines/not-in-go:goroutines"}}
 	metrics.Read(s)
 	if n := s[0].Value.Uint64(); n != 0 {
 		println("expected 0 not-in-go goroutines, found", n)
@@ -105,3 +169,69 @@ var readyCh = make(chan bool)
 func Ready() {
 	readyCh <- true
 }
+
+// NotInGoMetricCgoCallAndCallback tests that threads that called into Go are not
+// keeping the count of not-in-go threads negative. Specifically, needm sets
+// isExtraInC to false, breaking some of the invariants behind the not-in-go
+// runtime/metrics metric, causing the underlying count to break if we don't
+// account for this. In go.dev/cl/726964 this amounts to nGsyscallNoP being negative.
+// Unfortunately the runtime/metrics package masks a negative nGsyscallNoP because
+// it can transiently go negative due to a race. Therefore, this test checks
+// the condition by making sure not-in-go is positive when we expect it to be.
+// That is, threads in a cgo callback are *not* cancelling out threads in a
+// regular cgo call.
+//
+// It prints "OK" on success. On a bad reading it prints a diagnostic for
+// each mismatch and exits with status 2.
+func NotInGoMetricCgoCallAndCallback() {
+	const N = 10
+
+	// Spin up some threads that will do a cgo callback and just wait in Go.
+	// These threads are the ones we're worried about having the incorrect
+	// accounting that skews the count later.
+	for range N {
+		C.WaitInGoInNewCThread()
+	}
+
+	// Spin up the same number of goroutines that will all wait in a cgo call.
+	for range N {
+		go func() {
+			C.SpinForever()
+		}()
+	}
+
+	// Make sure we're all blocked and spinning.
+	for C.Spinning() < N || blockedForever.Load() < N {
+	}
+
+	// Read not-in-go before taking the Ps back.
+	s := []metrics.Sample{{Name: "/sched/goroutines/not-in-go:goroutines"}}
+	failed := false
+	metrics.Read(s)
+	if n := s[0].Value.Uint64(); n != N {
+		println("pre-STW: expected", N, "not-in-go goroutines, found", n)
+		failed = true
+	}
+
+	// Do something that stops the world to take all the Ps back.
+	//
+	// This will force a re-accounting of some of the goroutines and
+	// re-checking not-in-go will help catch bugs.
+	runtime.ReadMemStats(&m)
+
+	// Read not-in-go.
+	metrics.Read(s)
+	if n := s[0].Value.Uint64(); n != N {
+		println("post-STW: expected", N, "not-in-go goroutines, found", n)
+		failed = true
+	}
+
+	// Fail if we get a bad reading.
+	if failed {
+		os.Exit(2)
+	}
+	println("OK")
+}
+
+// blockedForever counts calls to BlockForeverInGo.
+// NotInGoMetricCgoCallAndCallback polls it to learn when all of its
+// C threads have entered Go.
+var blockedForever atomic.Uint32
+
+// BlockForeverInGo is called from C (see enterGoThenWait in the cgo
+// preamble). It increments blockedForever to announce that the calling
+// thread is now running Go code, then blocks forever.
+//
+//export BlockForeverInGo
+func BlockForeverInGo() {
+	blockedForever.Add(1)
+	select {}
+}
--- a/src/runtime/trace.go
+++ b/src/runtime/trace.go
@@ -12,7 +12,7 @@
 //
 // ## Design
 //
-// The basic idea behind the the execution tracer is to have per-M buffers that
+// The basic idea behind the execution tracer is to have per-M buffers that
 // trace data may be written into. Each M maintains a write flag indicating whether
 // its trace buffer is currently in use.
 //
@@ -173,7 +173,7 @@
 // doesn't do this directly for performance reasons. The runtime implementation instead caches
 // a G on the M created for the C thread. On Linux this M is then cached in the thread's TLS,
 // and on other systems, the M is put on a global list on exit from Go. We need to do some
-// extra work to make sure that this is modeled correctly in the the tracer. For example,
+// extra work to make sure that this is modeled correctly in the tracer. For example,
 // a C thread exiting Go may leave a P hanging off of its M (whether that M is kept in TLS
 // or placed back on a list). In order to correctly model goroutine creation and destruction,
 // we must behave as if the P was at some point stolen by the runtime, if the C thread
--- a/src/runtime/tracebuf.go
+++ b/src/runtime/tracebuf.go
@@ -29,7 +29,7 @@ type traceWriter struct {
 	*traceBuf
 }

-// writer returns an a traceWriter that writes into the current M's stream.
+// writer returns a traceWriter that writes into the current M's stream.
 //
 // Once this is called, the caller must guard against stack growth until
 // end is called on it. Therefore, it's highly recommended to use this
--- a/src/simd/archsimd/_gen/simdgen/gen_simdTypes.go
+++ b/src/simd/archsimd/_gen/simdgen/gen_simdTypes.go
@@ -30,6 +30,13 @@ func (x simdType) ElemBits() int {
 	return x.Size / x.Lanes
 }

+// Article returns the indefinite article ("a" or "an") to put in front
+// of x.Name in generated doc comments: "an" for names starting with
+// "Int", "a" for everything else.
+func (x simdType) Article() string {
+	article := "a" // Float, Uint
+	if strings.HasPrefix(x.Name, "Int") {
+		article = "an"
+	}
+	return article
+}
+
 // LanesContainer returns the smallest int/uint bit size that is
 // large enough to hold one bit for each lane.  E.g., Mask32x4
 // is 4 lanes, and a uint8 is the smallest uint that has 4 bits.
@@ -135,7 +142,11 @@ type v{{.}} struct {
 {{end}}

 {{define "typeTmpl"}}
-// {{.Name}} is a {{.Size}}-bit SIMD vector of {{.Lanes}} {{.Base}}
+{{- if eq .Type "mask"}}
+// {{.Name}} is a mask for a SIMD vector of {{.Lanes}} {{.ElemBits}}-bit elements.
+{{- else}}
+// {{.Name}} is a {{.Size}}-bit SIMD vector of {{.Lanes}} {{.Base}}s.
+{{- end}}
 type {{.Name}} struct {
 {{.Fields}}
 }
@@ -151,6 +162,7 @@ type X86Features struct {}
 var X86 X86Features

 {{range .}}
+{{$f := .}}
 {{- if eq .Feature "AVX512"}}
 // {{.Feature}} returns whether the CPU supports the AVX512F+CD+BW+DQ+VL features.
 //
@@ -161,25 +173,33 @@ var X86 X86Features
 {{- else -}}
 // {{.Feature}} returns whether the CPU supports the {{.Feature}} feature.
 {{- end}}
+{{- if ne .ImpliesAll ""}}
+//
+// If it returns true, then the CPU also supports {{.ImpliesAll}}.
+{{- end}}
 //
 // {{.Feature}} is defined on all GOARCHes, but will only return true on
 // GOARCH {{.GoArch}}.
-func (X86Features) {{.Feature}}() bool {
-	return cpu.X86.Has{{.Feature}}
+func ({{.FeatureVar}}Features) {{.Feature}}() bool {
+{{- if .Virtual}}
+	return {{range $i, $dep := .Implies}}{{if $i}} && {{end}}cpu.{{$f.FeatureVar}}.Has{{$dep}}{{end}}
+{{- else}}
+	return cpu.{{.FeatureVar}}.Has{{.Feature}}
+{{- end}}
 }
 {{end}}
 `

 const simdLoadStoreTemplate = `
-// Len returns the number of elements in a {{.Name}}
+// Len returns the number of elements in {{.Article}} {{.Name}}.
 func (x {{.Name}}) Len() int { return {{.Lanes}} }

-// Load{{.Name}} loads a {{.Name}} from an array
+// Load{{.Name}} loads {{.Article}} {{.Name}} from an array.
 //
 //go:noescape
 func Load{{.Name}}(y *[{{.Lanes}}]{{.Base}}) {{.Name}}

-// Store stores a {{.Name}} to an array
+// Store stores {{.Article}} {{.Name}} to an array.
 //
 //go:noescape
 func (x {{.Name}}) Store(y *[{{.Lanes}}]{{.Base}})
@@ -204,16 +224,16 @@ func (x {{.Name}}) ToBits() uint{{.LanesContainer}}
 `

 const simdMaskedLoadStoreTemplate = `
-// LoadMasked{{.Name}} loads a {{.Name}} from an array,
-// at those elements enabled by mask
+// LoadMasked{{.Name}} loads {{.Article}} {{.Name}} from an array,
+// at those elements enabled by mask.
 //
 {{.MaskedLoadDoc}}
 //
 //go:noescape
 func LoadMasked{{.Name}}(y *[{{.Lanes}}]{{.Base}}, mask Mask{{.ElemBits}}x{{.Lanes}}) {{.Name}}

-// StoreMasked stores a {{.Name}} to an array,
-// at those elements enabled by mask
+// StoreMasked stores {{.Article}} {{.Name}} to an array,
+// at those elements enabled by mask.
 //
 {{.MaskedStoreDoc}}
 //
@@ -395,15 +415,15 @@ func ({{.Op1NameAndType "x"}}) {{.Go}}({{.ImmName}} uint8, {{.Op2NameAndType "y"
 {{end}}

 {{define "vectorConversion"}}
-// {{.Tdst.Name}} converts from {{.Tsrc.Name}} to {{.Tdst.Name}}
-func (from {{.Tsrc.Name}}) As{{.Tdst.Name}}() (to {{.Tdst.Name}})
+// As{{.Tdst.Name}} returns {{.Tdst.Article}} {{.Tdst.Name}} with the same bit representation as x.
+func (x {{.Tsrc.Name}}) As{{.Tdst.Name}}() {{.Tdst.Name}}
 {{end}}

 {{define "mask"}}
-// To{{.VectorCounterpart}} converts from {{.Name}} to {{.VectorCounterpart}}
+// To{{.VectorCounterpart}} converts from {{.Name}} to {{.VectorCounterpart}}.
 func (from {{.Name}}) To{{.VectorCounterpart}}() (to {{.VectorCounterpart}})

-// asMask converts from {{.VectorCounterpart}} to {{.Name}}
+// asMask converts from {{.VectorCounterpart}} to {{.Name}}.
 func (from {{.VectorCounterpart}}) asMask() (to {{.Name}})

 func (x {{.Name}}) And(y {{.Name}}) {{.Name}}
@@ -553,6 +573,65 @@ func writeSIMDTypes(typeMap simdTypeMap) *bytes.Buffer {
 	return buffer
 }

+// goarchFeatures describes the CPU features of one GOARCH, as registered
+// with registerFeatureInfo.
+type goarchFeatures struct {
+	// featureVar is the name of the exported feature-check variable for this
+	// architecture.
+	//
+	// The features template also uses it to form the receiver type name
+	// (featureVar + "Features") and the internal/cpu variable
+	// (cpu.<featureVar>) in the generated code.
+	featureVar string
+
+	// features records per-feature information.
+	features map[string]featureInfo
+}
+
+// featureInfo is the per-feature information stored in
+// goarchFeatures.features.
+type featureInfo struct {
+	// Implies is a list of other CPU features that are required for this
+	// feature. These are allowed to chain.
+	//
+	// For example, if the Frob feature lists "Baz", then if X.Frob() returns
+	// true, it must also be true that the CPU has feature Baz.
+	Implies []string
+
+	// Virtual means this feature is not represented directly in internal/cpu,
+	// but is instead the logical AND of the features in Implies.
+	//
+	// The generated check for a Virtual feature is the && of
+	// cpu.<featureVar>.Has<dep> over each dep in Implies.
+	Virtual bool
+}
+
+// goarchFeatureInfo maps from GOARCH to CPU feature to additional information
+// about that feature. Not all features need to be in this map.
+var goarchFeatureInfo = make(map[string]goarchFeatures)
+
+// registerFeatureInfo records features as the feature information for
+// goArch in goarchFeatureInfo, replacing any earlier entry for goArch.
+func registerFeatureInfo(goArch string, features goarchFeatures) {
+	goarchFeatureInfo[goArch] = features
+}
+
+// featureImplies returns a comment-ready English list ("A", "A and B",
+// "A, B, and C") of every CPU feature transitively implied by base on
+// goarch, excluding base itself. It returns "" if base implies nothing.
+//
+// Each feature is listed once, even if it is reachable through several
+// Implies chains, and a cycle in the Implies data cannot cause unbounded
+// recursion.
+func featureImplies(goarch string, base string) string {
+	// Compute the transitive closure of base.
+	features := goarchFeatureInfo[goarch].features
+	seen := make(map[string]bool)
+	var list []string
+	var visit func(f string)
+	visit = func(f string) {
+		if seen[f] {
+			// Already listed (diamond) or currently being expanded (cycle).
+			return
+		}
+		seen[f] = true
+		list = append(list, f)
+		for _, dep := range features[f].Implies {
+			visit(dep)
+		}
+	}
+	visit(base)
+	// Drop base
+	list = list[1:]
+	// Put in "nice" order: reversing the pre-order walk lists the
+	// deepest feature of an Implies chain first.
+	slices.Reverse(list)
+	// Combine into a comment-ready form
+	switch len(list) {
+	case 0:
+		return ""
+	case 1:
+		return list[0]
+	case 2:
+		return list[0] + " and " + list[1]
+	default:
+		list[len(list)-1] = "and " + list[len(list)-1]
+		return strings.Join(list, ", ")
+	}
+}
+
 func writeSIMDFeatures(ops []Operation) *bytes.Buffer {
 	// Gather all features
 	type featureKey struct {
@@ -568,13 +647,36 @@ func writeSIMDFeatures(ops []Operation) *bytes.Buffer {
 			featureSet[featureKey{op.GoArch, feature}] = struct{}{}
 		}
 	}
-	features := slices.SortedFunc(maps.Keys(featureSet), func(a, b featureKey) int {
+	featureKeys := slices.SortedFunc(maps.Keys(featureSet), func(a, b featureKey) int {
 		if c := cmp.Compare(a.GoArch, b.GoArch); c != 0 {
 			return c
 		}
 		return compareNatural(a.Feature, b.Feature)
 	})

+	// TODO: internal/cpu doesn't enforce these at all. You can even do
+	// GODEBUG=cpu.avx=off and it will happily turn off AVX without turning off
+	// AVX2. We need to push these dependencies into it somehow.
+	type feature struct {
+		featureKey
+		FeatureVar string
+		Virtual    bool
+		Implies    []string
+		ImpliesAll string
+	}
+	var features []feature
+	for _, k := range featureKeys {
+		featureVar := goarchFeatureInfo[k.GoArch].featureVar
+		fi := goarchFeatureInfo[k.GoArch].features[k.Feature]
+		features = append(features, feature{
+			featureKey: k,
+			FeatureVar: featureVar,
+			Virtual:    fi.Virtual,
+			Implies:    fi.Implies,
+			ImpliesAll: featureImplies(k.GoArch, k.Feature),
+		})
+	}
+
 	// If we ever have the same feature name on more than one GOARCH, we'll have
 	// to be more careful about this.
 	t := templateOf(simdFeaturesTemplate, "features")
--- a/src/simd/archsimd/_gen/simdgen/gen_simdrules.go
+++ b/src/simd/archsimd/_gen/simdgen/gen_simdrules.go
@@ -275,7 +275,7 @@ func writeSIMDRules(ops []Operation) *bytes.Buffer {
 						origArgs = after
 					}
 					immArg = "[c] "
-					immArgCombineOff = " [makeValAndOff(int32(int8(c)),off)] "
+					immArgCombineOff = " [makeValAndOff(int32(uint8(c)),off)] "
 				}
 				memOpData.ArgsLoadAddr = immArg + origArgs + fmt.Sprintf("l:(VMOVDQUload%d {sym} [off] ptr mem)", *lastVreg.Bits)
 				// Remove the last vreg from the arg and change it to "ptr".
--- a/src/simd/archsimd/_gen/simdgen/gen_simdssa.go
+++ b/src/simd/archsimd/_gen/simdgen/gen_simdssa.go
@@ -13,9 +13,7 @@ import (
 )

 var (
-	ssaTemplates = template.Must(template.New("simdSSA").Parse(`
-{{define "header"}}// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT.
-
+	ssaTemplates = template.Must(template.New("simdSSA").Parse(`{{define "header"}}` + generatedHeader + `
 package amd64

 import (
--- a/src/simd/archsimd/_gen/simdgen/godefs.go
+++ b/src/simd/archsimd/_gen/simdgen/godefs.go
@@ -135,6 +135,19 @@ func (o *Operation) DecodeUnified(v *unify.Value) error {

 	o.In = append(o.rawOperation.In, o.rawOperation.InVariant...)

+	// For down conversions, the high elements are zeroed if the result has more elements.
+	// TODO: we should encode this logic in the YAML file, instead of hardcoding it here.
+	if len(o.In) > 0 && len(o.Out) > 0 {
+		inLanes := o.In[0].Lanes
+		outLanes := o.Out[0].Lanes
+		if inLanes != nil && outLanes != nil && *inLanes < *outLanes {
+			if (strings.Contains(o.Go, "Saturate") || strings.Contains(o.Go, "Truncate")) &&
+				!strings.Contains(o.Go, "Concat") {
+				o.Documentation += "\n// Results are packed to low elements in the returned vector, its upper elements are zeroed."
+			}
+		}
+	}
+
 	return nil
 }

@@ -362,7 +375,7 @@ func compareNatural(s1, s2 string) int {
 	return strings.Compare(s1, s2)
 }

-const generatedHeader = `// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT.
+const generatedHeader = `// Code generated by 'simdgen -o godefs -goroot $GOROOT -xedPath $XED_PATH go.yaml types.yaml categories.yaml'; DO NOT EDIT.
 `

 func writeGoDefs(path string, cl unify.Closure) error {
--- a/src/simd/archsimd/_gen/simdgen/ops/AddSub/categories.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/AddSub/categories.yaml
@@ -17,21 +17,83 @@
    // NAME subtracts corresponding elements of two vectors with saturation.
 - go: AddPairs
  commutative: false
+  out:
+  - elemBits: 16|32
  documentation: !string |-
    // NAME horizontally adds adjacent pairs of elements.
-    // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+    // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
+- go: AddPairs
+  commutative: false
+  out:
+  - elemBits: 64
+  documentation: !string |-
+    // NAME horizontally adds adjacent pairs of elements.
+    // For x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1].
 - go: SubPairs
  commutative: false
+  out:
+  - elemBits: 16|32
  documentation: !string |-
    // NAME horizontally subtracts adjacent pairs of elements.
-    // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+    // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
+- go: SubPairs
+  commutative: false
+  out:
+  - elemBits: 64
+  documentation: !string |-
+    // NAME horizontally subtracts adjacent pairs of elements.
+    // For x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1].
 - go: AddPairsSaturated
  commutative: false
  documentation: !string |-
    // NAME horizontally adds adjacent pairs of elements with saturation.
-    // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+    // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
 - go: SubPairsSaturated
  commutative: false
  documentation: !string |-
    // NAME horizontally subtracts adjacent pairs of elements with saturation.
-    // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+    // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
+- go: AddPairsGrouped
+  commutative: false
+  out:
+  - elemBits: 16|32
+  documentation: !string |-
+    // NAME horizontally adds adjacent pairs of elements.
+    // With each 128-bit as a group:
+    // for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
+- go: AddPairsGrouped
+  commutative: false
+  out:
+  - elemBits: 64
+  documentation: !string |-
+    // NAME horizontally adds adjacent pairs of elements.
+    // With each 128-bit as a group:
+    // for x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1].
+- go: SubPairsGrouped
+  commutative: false
+  out:
+  - elemBits: 16|32
+  documentation: !string |-
+    // NAME horizontally subtracts adjacent pairs of elements.
+    // With each 128-bit as a group:
+    // for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
+- go: SubPairsGrouped
+  commutative: false
+  out:
+  - elemBits: 64
+  documentation: !string |-
+    // NAME horizontally subtracts adjacent pairs of elements.
+    // With each 128-bit as a group:
+    // for x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1].
+- go: AddPairsSaturatedGrouped
+  commutative: false
+  documentation: !string |-
+    // NAME horizontally adds adjacent pairs of elements with saturation.
+    // With each 128-bit as a group:
+    // for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
+- go: SubPairsSaturatedGrouped
+  commutative: false
+  documentation: !string |-
+    // NAME horizontally subtracts adjacent pairs of elements with saturation.
+    // With each 128-bit as a group:
+    // for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
--- a/src/simd/archsimd/_gen/simdgen/ops/AddSub/go.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/AddSub/go.yaml
@@ -53,25 +53,71 @@
  - *uint
 - go: AddPairs
  asm: "VPHADD[DW]"
-  in: *2any
-  out: *1any
+  in: &2any128
+  - &any128
+    go: $t
+    bits: 128
+  - *any128
+  out: &1any128
+  - *any128
 - go: SubPairs
  asm: "VPHSUB[DW]"
-  in: *2any
-  out: *1any
+  in: *2any128
+  out: *1any128
 - go: AddPairs
  asm: "VHADDP[SD]" # floats
-  in: *2any
-  out: *1any
+  in: *2any128
+  out: *1any128
 - go: SubPairs
  asm: "VHSUBP[SD]"  # floats
-  in: *2any
-  out: *1any
+  in: *2any128
+  out: *1any128
 - go: AddPairsSaturated
  asm: "VPHADDS[DW]"
-  in: *2int
-  out: *1int
+  in: &2int128
+  - &int128
+    go: $t
+    base: int
+    bits: 128
+  - *int128
+  out: &1int128
+  - *int128
 - go: SubPairsSaturated
  asm: "VPHSUBS[DW]"
-  in: *2int
-  out: *1int
+  in: *2int128
+  out: *1int128
+- go: AddPairsGrouped
+  asm: "VPHADD[DW]"
+  in: &2any256
+  - &any256
+    go: $t
+    bits: 256
+  - *any256
+  out: &1any256
+  - *any256
+- go: SubPairsGrouped
+  asm: "VPHSUB[DW]"
+  in: *2any256
+  out: *1any256
+- go: AddPairsGrouped
+  asm: "VHADDP[SD]" # floats
+  in: *2any256
+  out: *1any256
+- go: SubPairsGrouped
+  asm: "VHSUBP[SD]"  # floats
+  in: *2any256
+  out: *1any256
+- go: AddPairsSaturatedGrouped
+  asm: "VPHADDS[DW]"
+  in: &2int256
+  - &int256
+    go: $t
+    base: int
+    bits: 256
+  - *int256
+  out: &1int256
+  - *int256
+- go: SubPairsSaturatedGrouped
+  asm: "VPHSUBS[DW]"
+  in: *2int256
+  out: *1int256
--- a/src/simd/archsimd/_gen/simdgen/ops/Compares/categories.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/Compares/categories.yaml
@@ -10,34 +10,29 @@
  constImm: 0
  commutative: true
  documentation: !string |-
-    // NAME returns x equals y, elementwise.
+    // NAME returns a mask whose elements indicate whether x == y.
 - go: Less
  constImm: 1
  commutative: false
  documentation: !string |-
-    // NAME returns x less-than y, elementwise.
+    // NAME returns a mask whose elements indicate whether x < y.
 - go: LessEqual
  constImm: 2
  commutative: false
  documentation: !string |-
-    // NAME returns x less-than-or-equals y, elementwise.
- go: IsNan # For float only.
-  constImm: 3
-  commutative: true
-  documentation: !string |-
-    // NAME checks if elements are NaN. Use as x.IsNan(x).
+    // NAME returns a mask whose elements indicate whether x <= y.
 - go: NotEqual
  constImm: 4
  commutative: true
  documentation: !string |-
-    // NAME returns x not-equals y, elementwise.
+    // NAME returns a mask whose elements indicate whether x != y.
 - go: GreaterEqual
  constImm: 13
  commutative: false
  documentation: !string |-
-    // NAME returns x greater-than-or-equals y, elementwise.
+    // NAME returns a mask whose elements indicate whether x >= y.
 - go: Greater
  constImm: 14
  commutative: false
  documentation: !string |-
-    // NAME returns x greater-than y, elementwise.
+    // NAME returns a mask whose elements indicate whether x > y.
--- a/src/simd/archsimd/_gen/simdgen/ops/Compares/go.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/Compares/go.yaml
@@ -121,7 +121,7 @@
  - class: mask

 # Floats
- go: Equal|Greater|Less|LessEqual|GreaterEqual|NotEqual|IsNan
+- go: (Equal|Greater|Less|LessEqual|GreaterEqual|NotEqual)
  regexpTag: "compares"
  asm: "VCMPP[SD]"
  in:
@@ -135,7 +135,7 @@
  - go: $t
    overwriteBase: int
    overwriteClass: mask
- go: (Equal|Greater|Less|LessEqual|GreaterEqual|NotEqual|IsNan)
+- go: (Equal|Greater|Less|LessEqual|GreaterEqual|NotEqual)
  regexpTag: "compares"
  asm: "VCMPP[SD]"
  in:
--- a/src/simd/archsimd/_gen/simdgen/ops/Converts/categories.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/Converts/categories.yaml
@@ -44,124 +44,174 @@
    // NAME converts element values to float64.

 # Int <-> Int conversions
- go: "(Extend|Saturate|Truncate)?ToInt8"
+- go: "TruncateToInt8"
  commutative: false
  regexpTag: "convert"
  documentation: !string |-
-    // NAME converts element values to int8.
- go: "(Extend|Saturate|Truncate)?ToInt16(Concat)?"
+    // NAME truncates element values to int8.
+- go: "SaturateToInt8"
  commutative: false
  regexpTag: "convert"
  documentation: !string |-
-    // NAME converts element values to int16.
- go: "(Extend|Saturate|Truncate)?ToInt32"
+    // NAME converts element values to int8 with signed saturation.
+- go: "ExtendToInt16(Concat)?"
  commutative: false
  regexpTag: "convert"
  documentation: !string |-
-    // NAME converts element values to int32.
- go: "(Extend|Saturate|Truncate)?ToInt64"
+    // NAME sign-extends element values to int16.
+- go: "TruncateToInt16(Concat)?"
  commutative: false
  regexpTag: "convert"
  documentation: !string |-
-    // NAME converts element values to int64.
- go: "(Extend|Saturate|Truncate)?ToUint8"
+    // NAME truncates element values to int16.
+- go: "SaturateToInt16(Concat(Grouped)?)?"
  commutative: false
  regexpTag: "convert"
  documentation: !string |-
-    // NAME converts element values to uint8.
- go: "(Extend|Saturate|Truncate)?ToUint16(Concat)?"
+    // NAME converts element values to int16 with signed saturation.
+- go: "ExtendToInt32"
  commutative: false
  regexpTag: "convert"
  documentation: !string |-
-    // NAME converts element values to uint16.
- go: "(Extend|Saturate|Truncate)?ToUint32"
+    // NAME sign-extends element values to int32.
+- go: "TruncateToInt32"
+  commutative: false
+  regexpTag: "convert"
+  documentation: !string |-
+    // NAME truncates element values to int32.
+- go: "SaturateToInt32"
+  commutative: false
+  regexpTag: "convert"
+  documentation: !string |-
+    // NAME converts element values to int32 with signed saturation.
+- go: "ExtendToInt64"
+  commutative: false
+  regexpTag: "convert"
+  documentation: !string |-
+    // NAME sign-extends element values to int64.
+- go: "TruncateToUint8"
+  commutative: false
+  regexpTag: "convert"
+  documentation: !string |-
+    // NAME truncates element values to uint8.
+- go: "SaturateToUint8"
+  commutative: false
+  regexpTag: "convert"
+  documentation: !string |-
+    // NAME converts element values to uint8 with unsigned saturation.
+- go: "ExtendToUint16(Concat)?"
+  commutative: false
+  regexpTag: "convert"
+  documentation: !string |-
+    // NAME zero-extends element values to uint16.
+- go: "TruncateToUint16(Concat)?"
+  commutative: false
+  regexpTag: "convert"
+  documentation: !string |-
+    // NAME truncates element values to uint16.
+- go: "SaturateToUint16(Concat(Grouped)?)?"
+  commutative: false
+  regexpTag: "convert"
+  documentation: !string |-
+    // NAME converts element values to uint16 with unsigned saturation.
+- go: "ExtendToUint32"
  regexpTag: "convert"
  commutative: false
  documentation: !string |-
-    // NAME converts element values to uint32.
- go: "(Extend|Saturate|Truncate)?ToUint64"
+    // NAME zero-extends element values to uint32.
+- go: "TruncateToUint32"
  regexpTag: "convert"
  commutative: false
  documentation: !string |-
-    // NAME converts element values to uint64.
+    // NAME truncates element values to uint32.
+- go: "SaturateToUint32"
+  regexpTag: "convert"
+  commutative: false
+  documentation: !string |-
+    // NAME converts element values to uint32 with unsigned saturation.
+- go: "ExtendToUint64"
+  regexpTag: "convert"
+  commutative: false
+  documentation: !string |-
+    // NAME zero-extends element values to uint64.
 # low-part only Int <-> Int conversions
- go: ExtendLo8ToUint16x8
+- go: ExtendLo8ToUint16
  commutative: false
  documentation: !string |-
-    // NAME converts 8 lowest vector element values to uint16.
- go: ExtendLo8ToInt16x8
+    // NAME zero-extends 8 lowest vector element values to uint16.
+- go: ExtendLo8ToInt16
  commutative: false
  documentation: !string |-
-    // NAME converts 8 lowest vector element values to int16.
- go: ExtendLo4ToUint32x4
+    // NAME sign-extends 8 lowest vector element values to int16.
+- go: ExtendLo4ToUint32
  commutative: false
  documentation: !string |-
-    // NAME converts 4 lowest vector element values to uint32.
- go: ExtendLo4ToInt32x4
+    // NAME zero-extends 4 lowest vector element values to uint32.
+- go: ExtendLo4ToInt32
  commutative: false
  documentation: !string |-
-    // NAME converts 4 lowest vector element values to int32.
- go: ExtendLo2ToUint64x2
+    // NAME sign-extends 4 lowest vector element values to int32.
+- go: ExtendLo2ToUint64
  commutative: false
  documentation: !string |-
-    // NAME converts 2 lowest vector element values to uint64.
- go: ExtendLo2ToInt64x2
+    // NAME zero-extends 2 lowest vector element values to uint64.
+- go: ExtendLo2ToInt64
  commutative: false
  documentation: !string |-
-    // NAME converts 2 lowest vector element values to int64.
- go: ExtendLo2ToUint64x2
+    // NAME sign-extends 2 lowest vector element values to int64.
+- go: ExtendLo2ToUint64
  commutative: false
  documentation: !string |-
-    // NAME converts 2 lowest vector element values to uint64.
- go: ExtendLo4ToUint64x4
+    // NAME zero-extends 2 lowest vector element values to uint64.
+- go: ExtendLo4ToUint64
  commutative: false
  documentation: !string |-
-    // NAME converts 4 lowest vector element values to uint64.
- go: ExtendLo2ToInt64x2
+    // NAME zero-extends 4 lowest vector element values to uint64.
+- go: ExtendLo2ToInt64
  commutative: false
  documentation: !string |-
-    // NAME converts 2 lowest vector element values to int64.
- go: ExtendLo4ToInt64x4
+    // NAME sign-extends 2 lowest vector element values to int64.
+- go: ExtendLo4ToInt64
  commutative: false
  documentation: !string |-
-    // NAME converts 4 lowest vector element values to int64.
- go: ExtendLo4ToUint32x4
+    // NAME sign-extends 4 lowest vector element values to int64.
+- go: ExtendLo4ToUint32
  commutative: false
  documentation: !string |-
-    // NAME converts 4 lowest vector element values to uint32.
- go: ExtendLo8ToUint32x8
+    // NAME zero-extends 4 lowest vector element values to uint32.
+- go: ExtendLo8ToUint32
  commutative: false
  documentation: !string |-
-    // NAME converts 8 lowest vector element values to uint32.
- go: ExtendLo4ToInt32x4
+    // NAME zero-extends 8 lowest vector element values to uint32.
+- go: ExtendLo4ToInt32
  commutative: false
  documentation: !string |-
-    // NAME converts 4 lowest vector element values to int32.
- go: ExtendLo8ToInt32x8
+    // NAME sign-extends 4 lowest vector element values to int32.
+- go: ExtendLo8ToInt32
  commutative: false
  documentation: !string |-
-    // NAME converts 8 lowest vector element values to int32.
- go: ExtendLo2ToUint64x2
+    // NAME sign-extends 8 lowest vector element values to int32.
+- go: ExtendLo2ToUint64
  commutative: false
  documentation: !string |-
-    // NAME converts 2 lowest vector element values to uint64.
- go: ExtendLo4ToUint64x4
+    // NAME zero-extends 2 lowest vector element values to uint64.
+- go: ExtendLo4ToUint64
  commutative: false
  documentation: !string |-
-    // NAME converts 4 lowest vector element values to uint64.
- go: ExtendLo8ToUint64x8
+    // NAME zero-extends 4 lowest vector element values to uint64.
+- go: ExtendLo8ToUint64
  commutative: false
  documentation: !string |-
-    // NAME converts 8 lowest vector element values to uint64.
- go: ExtendLo2ToInt64x2
+    // NAME zero-extends 8 lowest vector element values to uint64.
+- go: ExtendLo2ToInt64
  commutative: false
  documentation: !string |-
-    // NAME converts 2 lowest vector element values to int64.
- go: ExtendLo4ToInt64x4
+    // NAME sign-extends 2 lowest vector element values to int64.
+- go: ExtendLo4ToInt64
  commutative: false
  documentation: !string |-
-    // NAME converts 4 lowest vector element values to int64.
- go: ExtendLo8ToInt64x8
+    // NAME sign-extends 4 lowest vector element values to int64.
+- go: ExtendLo8ToInt64
  commutative: false
  documentation: !string |-
-    // NAME converts 8 lowest vector element values to int64.
+    // NAME sign-extends 8 lowest vector element values to int64.
--- a/src/simd/archsimd/_gen/simdgen/ops/Converts/go.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/Converts/go.yaml
@@ -138,9 +138,6 @@
 # Widening integer conversions.
 # uint8 -> uint16
 - go: ExtendToUint16
-  addDoc: &zeroExtendDoc
-    !string |-
-    // The result vector's elements are zero-extended.
  regexpTag: "convert"
  asm: "VPMOVZXBW"
  in:
@@ -156,7 +153,6 @@
 - go: ExtendToUint16
  regexpTag: "convert"
  asm: "VPMOVZXBW"
-  addDoc: *zeroExtendDoc
  in:
  - &u8x32
    base: uint
@@ -171,9 +167,6 @@
 - go: ExtendToInt16
  regexpTag: "convert"
  asm: "VPMOVSXBW"
-  addDoc: &signExtendDoc
-    !string |-
-    // The result vector's elements are sign-extended.
  in:
  - &i8x16
    base: int
@@ -187,7 +180,6 @@
 - go: ExtendToInt16
  regexpTag: "convert"
  asm: "VPMOVSXBW"
-  addDoc: *signExtendDoc
  in:
  - &i8x32
    base: int
@@ -202,7 +194,6 @@
 - go: ExtendToUint32
  regexpTag: "convert"
  asm: "VPMOVZXWD"
-  addDoc: *zeroExtendDoc
  in:
  - &u16x8
    base: uint
@@ -216,7 +207,6 @@
 - go: ExtendToUint32
  regexpTag: "convert"
  asm: "VPMOVZXWD"
-  addDoc: *zeroExtendDoc
  in:
  - *u16x16
  out:
@@ -228,7 +218,6 @@
 - go: ExtendToInt32
  regexpTag: "convert"
  asm: "VPMOVSXWD"
-  addDoc: *signExtendDoc
  in:
  - &i16x8
    base: int
@@ -242,7 +231,6 @@
 - go: ExtendToInt32
  regexpTag: "convert"
  asm: "VPMOVSXWD"
-  addDoc: *signExtendDoc
  in:
  - *i16x16
  out:
@@ -254,7 +242,6 @@
 - go: ExtendToUint64
  regexpTag: "convert"
  asm: "VPMOVZXDQ"
-  addDoc: *zeroExtendDoc
  in:
  - &u32x4
    base: uint
@@ -268,7 +255,6 @@
 - go: ExtendToUint64
  regexpTag: "convert"
  asm: "VPMOVZXDQ"
-  addDoc: *zeroExtendDoc
  in:
  - *u32x8
  out:
@@ -280,7 +266,6 @@
 - go: ExtendToInt64
  regexpTag: "convert"
  asm: "VPMOVSXDQ"
-  addDoc: *signExtendDoc
  in:
  - &i32x4
    base: int
@@ -294,7 +279,6 @@
 - go: ExtendToInt64
  regexpTag: "convert"
  asm: "VPMOVSXDQ"
-  addDoc: *signExtendDoc
  in:
  - *i32x8
  out:
@@ -306,7 +290,6 @@
 - go: ExtendToUint64
  regexpTag: "convert"
  asm: "VPMOVZXWQ"
-  addDoc: *zeroExtendDoc
  in:
  - *u16x8
  out:
@@ -315,7 +298,6 @@
 - go: ExtendToInt64
  regexpTag: "convert"
  asm: "VPMOVSXWQ"
-  addDoc: *signExtendDoc
  in:
  - *i16x8
  out:
@@ -324,7 +306,6 @@
 - go: ExtendToUint32
  regexpTag: "convert"
  asm: "VPMOVZXBD"
-  addDoc: *zeroExtendDoc
  in:
  - *u8x16
  out:
@@ -333,7 +314,6 @@
 - go: ExtendToInt32
  regexpTag: "convert"
  asm: "VPMOVSXBD"
-  addDoc: *signExtendDoc
  in:
  - *i8x16
  out:
@@ -342,10 +322,6 @@
 - go: TruncateToInt8
  regexpTag: "convert"
  asm: "VPMOV[WDQ]B"
-  addDoc: &truncDocZeroUpper
-    !string |-
-    // Conversion is done with truncation on the vector elements.
-    // Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
  in:
    - base: int
  out:
@@ -354,7 +330,6 @@
 - go: TruncateToUint8
  regexpTag: "convert"
  asm: "VPMOV[WDQ]B"
-  addDoc: *truncDocZeroUpper
  in:
    - base: uint
  out:
@@ -363,9 +338,6 @@
 - go: TruncateToInt8
  regexpTag: "convert"
  asm: "VPMOV[WDQ]B"
-  addDoc: &truncDoc
-    !string |-
-    // Conversion is done with truncation on the vector elements.
  in:
    - base: int
  out:
@@ -374,7 +346,6 @@
 - go: TruncateToUint8
  regexpTag: "convert"
  asm: "VPMOV[WDQ]B"
-  addDoc: *truncDoc
  in:
    - base: uint
  out:
@@ -383,7 +354,6 @@
 - go: TruncateToInt16
  regexpTag: "convert"
  asm: "VPMOV[DQ]W"
-  addDoc: *truncDoc
  in:
    - base: int
  out:
@@ -391,7 +361,6 @@
 - go: TruncateToUint16
  regexpTag: "convert"
  asm: "VPMOV[DQ]W"
-  addDoc: *truncDoc
  in:
    - base: uint
  out:
@@ -399,7 +368,6 @@
 - go: TruncateToInt32
  regexpTag: "convert"
  asm: "VPMOVQD"
-  addDoc: *truncDoc
  in:
    - base: int
  out:
@@ -407,7 +375,6 @@
 - go: TruncateToUint32
  regexpTag: "convert"
  asm: "VPMOVQD"
-  addDoc: *truncDoc
  in:
    - base: uint
  out:
@@ -416,10 +383,6 @@
 - go: SaturateToInt8
  regexpTag: "convert"
  asm: "VPMOVS[WDQ]B"
-  addDoc: &satDocZeroUpper
-    !string |-
-    // Conversion is done with saturation on the vector elements.
-    // Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
  in:
    - base: int
  out:
@@ -427,19 +390,15 @@
      bits: 128
 - go: SaturateToUint8
  regexpTag: "convert"
-  asm: "VPMOVS[WDQ]B"
-  addDoc: *satDocZeroUpper
+  asm: "VPMOVUS[WDQ]B"
  in:
-    - base: int
+    - base: uint
  out:
-    - base: int
+    - base: uint
      bits: 128
 - go: SaturateToInt8
  regexpTag: "convert"
  asm: "VPMOVS[WDQ]B"
-  addDoc: &satDoc
-    !string |-
-    // Conversion is done with saturation on the vector elements.
  in:
    - base: int
  out:
@@ -448,7 +407,6 @@
 - go: SaturateToUint8
  regexpTag: "convert"
  asm: "VPMOVUS[WDQ]B"
-  addDoc: *satDoc
  in:
    - base: uint
  out:
@@ -457,7 +415,6 @@
 - go: SaturateToInt16
  regexpTag: "convert"
  asm: "VPMOVS[DQ]W"
-  addDoc: *satDoc
  in:
    - base: int
  out:
@@ -465,7 +422,6 @@
 - go: SaturateToUint16
  regexpTag: "convert"
  asm: "VPMOVUS[DQ]W"
-  addDoc: *satDoc
  in:
    - base: uint
  out:
@@ -473,7 +429,6 @@
 - go: SaturateToInt32
  regexpTag: "convert"
  asm: "VPMOVSQD"
-  addDoc: *satDoc
  in:
    - base: int
  out:
@@ -481,7 +436,6 @@
 - go: SaturateToUint32
  regexpTag: "convert"
  asm: "VPMOVUSQD"
-  addDoc: *satDoc
  in:
    - base: uint
  out:
@@ -492,67 +446,86 @@
  asm: "VPACKSSDW"
  addDoc: &satDocConcat
    !string |-
-    // With each 128-bit as a group:
-    // The converted group from the first input vector will be packed to the lower part of the result vector,
-    // the converted group from the second input vector will be packed to the upper part of the result vector.
-    // Conversion is done with saturation on the vector elements.
+    // The converted elements from x will be packed to the lower part of the result vector,
+    // the converted elements from y will be packed to the upper part of the result vector.
  in:
    - base: int
    - base: int
  out:
    - base: int
+      bits: 128
+- go: SaturateToInt16ConcatGrouped
+  regexpTag: "convert"
+  asm: "VPACKSSDW"
+  addDoc: &satDocConcatGrouped
+    !string |-
+    // With each 128-bit as a group:
+    // The converted elements from x will be packed to the lower part of the group in the result vector,
+    // the converted elements from y will be packed to the upper part of the group in the result vector.
+  in:
+    - base: int
+    - base: int
+  out:
+    - base: int
+      bits: 256|512
 - go: SaturateToUint16Concat
  regexpTag: "convert"
  asm: "VPACKUSDW"
  addDoc: *satDocConcat
  in:
-    - base: uint
-    - base: uint
+    - base: int
+    - base: int
  out:
    - base: uint
+      bits: 128
+- go: SaturateToUint16ConcatGrouped
+  regexpTag: "convert"
+  asm: "VPACKUSDW"
+  addDoc: *satDocConcatGrouped
+  in:
+    - base: int
+    - base: int
+  out:
+    - base: uint
+      bits: 256|512

 # low-part only conversions.
 # uint8->uint16
- go: ExtendLo8ToUint16x8
+- go: ExtendLo8ToUint16
  regexpTag: "convert"
  asm: "VPMOVZXBW"
-  addDoc: *zeroExtendDoc
  in:
  - *u8x16
  out:
  - *u16x8
 # int8->int16
- go: ExtendLo8ToInt16x8
+- go: ExtendLo8ToInt16
  regexpTag: "convert"
  asm: "VPMOVSXBW"
-  addDoc: *signExtendDoc
  in:
  - *i8x16
  out:
  - *i16x8
 # uint16->uint32
- go: ExtendLo4ToUint32x4
+- go: ExtendLo4ToUint32
  regexpTag: "convert"
  asm: "VPMOVZXWD"
-  addDoc: *zeroExtendDoc
  in:
  - *u16x8
  out:
  - *u32x4
 # int16->int32
- go: ExtendLo4ToInt32x4
+- go: ExtendLo4ToInt32
  regexpTag: "convert"
  asm: "VPMOVSXWD"
-  addDoc: *signExtendDoc
  in:
  - *i16x8
  out:
  - *i32x4
 # uint32 -> uint64
- go: ExtendLo2ToUint64x2
+- go: ExtendLo2ToUint64
  regexpTag: "convert"
  asm: "VPMOVZXDQ"
-  addDoc: *zeroExtendDoc
  in:
  - *u32x4
  out:
@@ -561,10 +534,9 @@
    elemBits: 64
    bits: 128
 # int32 -> int64
- go: ExtendLo2ToInt64x2
+- go: ExtendLo2ToInt64
  regexpTag: "convert"
  asm: "VPMOVSXDQ"
-  addDoc: *signExtendDoc
  in:
  - *i32x4
  out:
@@ -573,120 +545,106 @@
    elemBits: 64
    bits: 128
 # uint16 -> uint64
- go: ExtendLo2ToUint64x2
+- go: ExtendLo2ToUint64
  regexpTag: "convert"
  asm: "VPMOVZXWQ"
-  addDoc: *zeroExtendDoc
  in:
  - *u16x8
  out:
  - *u64x2
- go: ExtendLo4ToUint64x4
+- go: ExtendLo4ToUint64
  regexpTag: "convert"
  asm: "VPMOVZXWQ"
-  addDoc: *zeroExtendDoc
  in:
  - *u16x8
  out:
  - *u64x4
 # int16 -> int64
- go: ExtendLo2ToInt64x2
+- go: ExtendLo2ToInt64
  regexpTag: "convert"
  asm: "VPMOVSXWQ"
-  addDoc: *signExtendDoc
  in:
  - *i16x8
  out:
  - *i64x2
- go: ExtendLo4ToInt64x4
+- go: ExtendLo4ToInt64
  regexpTag: "convert"
  asm: "VPMOVSXWQ"
-  addDoc: *signExtendDoc
  in:
  - *i16x8
  out:
  - *i64x4
 # uint8 -> uint32
- go: ExtendLo4ToUint32x4
+- go: ExtendLo4ToUint32
  regexpTag: "convert"
  asm: "VPMOVZXBD"
-  addDoc: *zeroExtendDoc
  in:
  - *u8x16
  out:
  - *u32x4
- go: ExtendLo8ToUint32x8
+- go: ExtendLo8ToUint32
  regexpTag: "convert"
  asm: "VPMOVZXBD"
-  addDoc: *zeroExtendDoc
  in:
  - *u8x16
  out:
  - *u32x8
 # int8 -> int32
- go: ExtendLo4ToInt32x4
+- go: ExtendLo4ToInt32
  regexpTag: "convert"
  asm: "VPMOVSXBD"
-  addDoc: *signExtendDoc
  in:
  - *i8x16
  out:
  - *i32x4
- go: ExtendLo8ToInt32x8
+- go: ExtendLo8ToInt32
  regexpTag: "convert"
  asm: "VPMOVSXBD"
-  addDoc: *signExtendDoc
  in:
  - *i8x16
  out:
  - *i32x8
 # uint8 -> uint64
- go: ExtendLo2ToUint64x2
+- go: ExtendLo2ToUint64
  regexpTag: "convert"
  asm: "VPMOVZXBQ"
-  addDoc: *zeroExtendDoc
  in:
  - *u8x16
  out:
  - *u64x2
- go: ExtendLo4ToUint64x4
+- go: ExtendLo4ToUint64
  regexpTag: "convert"
  asm: "VPMOVZXBQ"
-  addDoc: *zeroExtendDoc
  in:
  - *u8x16
  out:
  - *u64x4
- go: ExtendLo8ToUint64x8
+- go: ExtendLo8ToUint64
  regexpTag: "convert"
  asm: "VPMOVZXBQ"
-  addDoc: *zeroExtendDoc
  in:
  - *u8x16
  out:
  - *u64x8
 # int8 -> int64
- go: ExtendLo2ToInt64x2
+- go: ExtendLo2ToInt64
  regexpTag: "convert"
  asm: "VPMOVSXBQ"
-  addDoc: *signExtendDoc
  in:
  - *i8x16
  out:
  - *i64x2
- go: ExtendLo4ToInt64x4
+- go: ExtendLo4ToInt64
  regexpTag: "convert"
  asm: "VPMOVSXBQ"
-  addDoc: *signExtendDoc
  in:
  - *i8x16
  out:
  - *i64x4
- go: ExtendLo8ToInt64x8
+- go: ExtendLo8ToInt64
  regexpTag: "convert"
  asm: "VPMOVSXBQ"
-  addDoc: *signExtendDoc
  in:
  - *i8x16
  out:
-  - *i64x8
+  - *i64x8
--- a/src/simd/archsimd/_gen/simdgen/ops/FPonlyArith/categories.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/FPonlyArith/categories.yaml
@@ -18,12 +18,13 @@
 - go: Scale
  commutative: false
  documentation: !string |-
-    // NAME multiplies elements by a power of 2.
+    // NAME multiplies each element of x by 2 raised to the power of the
+    // floor of the corresponding element in y.
 - go: RoundToEven
  commutative: false
  constImm: 0
  documentation: !string |-
-    // NAME rounds elements to the nearest integer.
+    // NAME rounds elements to the nearest integer, rounding ties to even.
 - go: RoundToEvenScaled
  commutative: false
  constImm: 0
--- a/src/simd/archsimd/_gen/simdgen/ops/IntOnlyArith/categories.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/IntOnlyArith/categories.yaml
@@ -12,8 +12,8 @@
  # Applies sign of second operand to first: sign(val, sign_src)
  commutative: false
  documentation: !string |-
-    // NAME returns the product of the first operand with -1, 0, or 1,
-    // whichever constant is nearest to the value of the second operand.
+    // NAME returns the product of x with -1, 0, or 1,
+    // whichever constant is nearest to the value of y.
  # Sign does not have masked version
 - go: OnesCount
  commutative: false
--- a/src/simd/archsimd/_gen/simdgen/ops/MLOps/categories.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/MLOps/categories.yaml
@@ -10,21 +10,10 @@
  documentation: !string |-
    // NAME multiplies the elements and add the pairs together with saturation,
    // yielding a vector of half as many elements with twice the input element size.
-# QuadDotProduct, i.e. VPDPBUSD(S) are operations with src/dst on the same register, we are not supporting this as of now.
 # - go: DotProductBroadcast
 #   commutative: true
 # #   documentation: !string |-
 #     // NAME multiplies all elements and broadcasts the sum.
- go: DotProductQuadruple
-  commutative: false
-  documentation: !string |-
-    // NAME performs dot products on groups of 4 elements of x and y.
-    // NAME(x, y).Add(z) will be optimized to the full form of the underlying instruction.
- go: DotProductQuadrupleSaturated
-  commutative: false
-  documentation: !string |-
-    // NAME multiplies performs dot products on groups of 4 elements of x and y.
-    // NAME(x, y).Add(z) will be optimized to the full form of the underlying instruction.
 - go: AddDotProductPairs
  commutative: false
  noTypes: "true"
--- a/src/simd/archsimd/_gen/simdgen/ops/MLOps/go.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/MLOps/go.yaml
@@ -33,33 +33,6 @@
 #     const: 127
 #   out:
 #   - *dpb_src
- go: DotProductQuadruple
-  asm: "VPDPBUSD"
-  operandOrder: "31Zero3" # switch operand 3 and 1, and make 3 always 0
-  in:
-  - &qdpa_acc
-    go: $t_acc
-    base: int
-    elemBits: 32
-  - &qdpa_src1
-    go: $t_src1
-    base: uint
-    overwriteElementBits: 8
-  - &qdpa_src2
-    go: $t_src2
-    base: int
-    overwriteElementBits: 8
-  out:
-  - *qdpa_acc
- go: DotProductQuadrupleSaturated
-  asm: "VPDPBUSDS"
-  operandOrder: "31Zero3" # switch operand 3 and 1, and make 3 always 0
-  in:
-  - *qdpa_acc
-  - *qdpa_src1
-  - *qdpa_src2
-  out:
-  - *qdpa_acc
 - go: AddDotProductPairs
  asm: "VPDPWSSD"
  in:
--- a/src/simd/archsimd/_gen/simdgen/ops/MinMax/categories.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/MinMax/categories.yaml
@@ -2,8 +2,8 @@
 - go: Max
  commutative: true
  documentation: !string |-
-    // NAME computes the maximum of corresponding elements.
+    // NAME computes the maximum of each pair of corresponding elements in x and y.
 - go: Min
  commutative: true
  documentation: !string |-
-    // NAME computes the minimum of corresponding elements.
+    // NAME computes the minimum of each pair of corresponding elements in x and y.
--- a/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/Moves/categories.yaml
@@ -31,17 +31,23 @@
  commutative: false
  documentation: !string |-
    // NAME performs a full permutation of vector x using indices:
-    // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+    //
+    //   result = {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+    //
 - go: Permute
  commutative: false
  documentation: !string |-
    // NAME performs a full permutation of vector x using indices:
-    // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+    //
+    //   result = {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+    //
 - go: ConcatPermute # ConcatPermute is only available on or after AVX512
  commutative: false
  documentation: !string |-
    // NAME performs a full permutation of vector x, y using indices:
-    // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+    //
+    //   result = {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+    //
    // where xy is the concatenation of x (lower half) and y (upper half).
    // Only the needed bits to represent xy's index are used in indices' elements.
 - go: Compress
@@ -236,12 +242,12 @@
 - go: ConcatShiftBytesRight
  commutative: false
  documentation: !string |-
-    // NAME concatenates x and y and shift it right by constant bytes.
+    // NAME concatenates x and y and shifts the result right by shift bytes.
    // The result vector will be the lower half of the concatenated vector.

 - go: ConcatShiftBytesRightGrouped
  commutative: false
  documentation: !string |-
-    // NAME concatenates x and y and shift it right by constant bytes.
+    // NAME concatenates x and y and shifts the result right by shift bytes.
    // The result vector will be the lower half of the concatenated vector.
    // This operation is performed grouped by each 16 byte.
--- a/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/Moves/go.yaml
@@ -227,7 +227,7 @@
 - go: Permute
  asm: "VPERMQ|VPERMPD"
  addDoc: !string |-
-    // The low 2 bits (values 0-3) of each element of indices is used
+    // The low 2 bits (values 0-3) of each element of indices are used.
  operandOrder: "21Type1"
  in:
  - &anyindices
@@ -244,7 +244,7 @@
 - go: Permute
  asm: "VPERM[WDQ]|VPERMP[SD]"
  addDoc: !string |-
-    // The low 3 bits (values 0-7) of each element of indices is used
+    // The low 3 bits (values 0-7) of each element of indices are used.
  operandOrder: "21Type1"
  in:
  - *anyindices
@@ -257,7 +257,7 @@
 - go: Permute
  asm: "VPERM[BWD]|VPERMPS"
  addDoc: !string |-
-    // The low 4 bits (values 0-15) of each element of indices is used
+    // The low 4 bits (values 0-15) of each element of indices are used.
  operandOrder: "21Type1"
  in:
  - *anyindices
@@ -270,7 +270,7 @@
 - go: Permute
  asm: "VPERM[BW]"
  addDoc: !string |-
-    // The low 5 bits (values 0-31) of each element of indices is used
+    // The low 5 bits (values 0-31) of each element of indices are used.
  operandOrder: "21Type1"
  in:
  - *anyindices
@@ -283,7 +283,7 @@
 - go: Permute
  asm: "VPERMB"
  addDoc: !string |-
-    // The low 6 bits (values 0-63) of each element of indices is used
+    // The low 6 bits (values 0-63) of each element of indices are used.
  operandOrder: "21Type1"
  in:
  - *anyindices
@@ -489,7 +489,9 @@
 - go: PermuteOrZeroGrouped
  asm: VPSHUFB
  addDoc: !string |-
-    // result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
+    //
+    //   result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
+    //
    // The lower four bits of each byte-sized index in indices select an element from its corresponding group in x,
    // unless the index's sign bit is set in which case zero is used instead.
    // Each group is of size 128-bit.
@@ -506,7 +508,9 @@
 - go: permuteScalars
  asm: VPSHUFD
  addDoc: !string |-
-    // result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
+    //
+    //   result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
+    //
    // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
  in:
  - *128any
@@ -520,7 +524,9 @@
 - go: permuteScalarsGrouped
  asm: VPSHUFD
  addDoc: !string |-
-    // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+    //
+    //   result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+    //
    // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
    // Each group is of size 128-bit.
  in:
@@ -535,7 +541,9 @@
 - go: permuteScalarsLo
  asm: VPSHUFLW
  addDoc: !string |-
-    // result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]}
+    //
+    //   result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]}
+    //
    // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
  in:
    - &128lanes8
@@ -573,7 +581,9 @@
 - go: permuteScalarsHi
  asm: VPSHUFHW
  addDoc: !string |-
-    // result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
+    //
+    //   result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
+    //
    // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
  in:
  - *128lanes8
@@ -1001,6 +1011,7 @@
  - *uint128
  - class: immediate
    immOffset: 0
+    name: shift
  out:
  - *uint128

@@ -1014,5 +1025,6 @@
  - *uint256512
  - class: immediate
    immOffset: 0
+    name: shift
  out:
  - *uint256512
--- a/src/simd/archsimd/_gen/simdgen/ops/Mul/categories.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/Mul/categories.yaml
@@ -7,7 +7,7 @@
  commutative: true
  documentation: !string |-
    // NAME multiplies even-indexed elements, widening the result.
-    // Result[i] = v1.Even[i] * v2.Even[i].
+    // Result[i] = v1[2*i] * v2[2*i].
 - go: MulHigh
  commutative: true
  documentation: !string |-
--- a/Show More
+++ b/Show More