simd/archsimd: correct documentation for pairwise operations

For AddPairs, SubPairs, and their Saturated variants, the documented
result element order is wrong. Corrected.

Also, for 256-bit vectors, this is a grouped operation (the pairwise
operation is applied within each 128-bit group independently), so
rename it with a Grouped suffix to make that clear.

Change-Id: Idfd0975cb4a332b2e28c898613861205d26f75b0
Reviewed-on: https://go-review.googlesource.com/c/go/+/732020
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
This commit is contained in:
Cherry Mui
2025-12-22 14:52:57 -05:00
parent 3d77a0b15e
commit c1efada1d2
10 changed files with 446 additions and 248 deletions

View File

@@ -250,12 +250,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPADDQ256,
ssa.OpAMD64VPADDQ512,
ssa.OpAMD64VHADDPS128,
ssa.OpAMD64VHADDPS256,
ssa.OpAMD64VHADDPD128,
ssa.OpAMD64VHADDPD256,
ssa.OpAMD64VPHADDW128,
ssa.OpAMD64VPHADDW256,
ssa.OpAMD64VPHADDD128,
ssa.OpAMD64VHADDPS256,
ssa.OpAMD64VHADDPD256,
ssa.OpAMD64VPHADDW256,
ssa.OpAMD64VPHADDD256,
ssa.OpAMD64VPHADDSW128,
ssa.OpAMD64VPHADDSW256,
@@ -520,12 +520,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPSUBQ256,
ssa.OpAMD64VPSUBQ512,
ssa.OpAMD64VHSUBPS128,
ssa.OpAMD64VHSUBPS256,
ssa.OpAMD64VHSUBPD128,
ssa.OpAMD64VHSUBPD256,
ssa.OpAMD64VPHSUBW128,
ssa.OpAMD64VPHSUBW256,
ssa.OpAMD64VPHSUBD128,
ssa.OpAMD64VHSUBPS256,
ssa.OpAMD64VHSUBPD256,
ssa.OpAMD64VPHSUBW256,
ssa.OpAMD64VPHSUBD256,
ssa.OpAMD64VPHSUBSW128,
ssa.OpAMD64VPHSUBSW256,

View File

@@ -57,19 +57,19 @@
(AddUint64x4 ...) => (VPADDQ256 ...)
(AddUint64x8 ...) => (VPADDQ512 ...)
(AddPairsFloat32x4 ...) => (VHADDPS128 ...)
(AddPairsFloat32x8 ...) => (VHADDPS256 ...)
(AddPairsFloat64x2 ...) => (VHADDPD128 ...)
(AddPairsFloat64x4 ...) => (VHADDPD256 ...)
(AddPairsInt16x8 ...) => (VPHADDW128 ...)
(AddPairsInt16x16 ...) => (VPHADDW256 ...)
(AddPairsInt32x4 ...) => (VPHADDD128 ...)
(AddPairsInt32x8 ...) => (VPHADDD256 ...)
(AddPairsUint16x8 ...) => (VPHADDW128 ...)
(AddPairsUint16x16 ...) => (VPHADDW256 ...)
(AddPairsUint32x4 ...) => (VPHADDD128 ...)
(AddPairsUint32x8 ...) => (VPHADDD256 ...)
(AddPairsGroupedFloat32x8 ...) => (VHADDPS256 ...)
(AddPairsGroupedFloat64x4 ...) => (VHADDPD256 ...)
(AddPairsGroupedInt16x16 ...) => (VPHADDW256 ...)
(AddPairsGroupedInt32x8 ...) => (VPHADDD256 ...)
(AddPairsGroupedUint16x16 ...) => (VPHADDW256 ...)
(AddPairsGroupedUint32x8 ...) => (VPHADDD256 ...)
(AddPairsSaturatedInt16x8 ...) => (VPHADDSW128 ...)
(AddPairsSaturatedInt16x16 ...) => (VPHADDSW256 ...)
(AddPairsSaturatedGroupedInt16x16 ...) => (VPHADDSW256 ...)
(AddSaturatedInt8x16 ...) => (VPADDSB128 ...)
(AddSaturatedInt8x32 ...) => (VPADDSB256 ...)
(AddSaturatedInt8x64 ...) => (VPADDSB512 ...)
@@ -1217,19 +1217,19 @@
(SubUint64x4 ...) => (VPSUBQ256 ...)
(SubUint64x8 ...) => (VPSUBQ512 ...)
(SubPairsFloat32x4 ...) => (VHSUBPS128 ...)
(SubPairsFloat32x8 ...) => (VHSUBPS256 ...)
(SubPairsFloat64x2 ...) => (VHSUBPD128 ...)
(SubPairsFloat64x4 ...) => (VHSUBPD256 ...)
(SubPairsInt16x8 ...) => (VPHSUBW128 ...)
(SubPairsInt16x16 ...) => (VPHSUBW256 ...)
(SubPairsInt32x4 ...) => (VPHSUBD128 ...)
(SubPairsInt32x8 ...) => (VPHSUBD256 ...)
(SubPairsUint16x8 ...) => (VPHSUBW128 ...)
(SubPairsUint16x16 ...) => (VPHSUBW256 ...)
(SubPairsUint32x4 ...) => (VPHSUBD128 ...)
(SubPairsUint32x8 ...) => (VPHSUBD256 ...)
(SubPairsGroupedFloat32x8 ...) => (VHSUBPS256 ...)
(SubPairsGroupedFloat64x4 ...) => (VHSUBPD256 ...)
(SubPairsGroupedInt16x16 ...) => (VPHSUBW256 ...)
(SubPairsGroupedInt32x8 ...) => (VPHSUBD256 ...)
(SubPairsGroupedUint16x16 ...) => (VPHSUBW256 ...)
(SubPairsGroupedUint32x8 ...) => (VPHSUBD256 ...)
(SubPairsSaturatedInt16x8 ...) => (VPHSUBSW128 ...)
(SubPairsSaturatedInt16x16 ...) => (VPHSUBSW256 ...)
(SubPairsSaturatedGroupedInt16x16 ...) => (VPHSUBSW256 ...)
(SubSaturatedInt8x16 ...) => (VPSUBSB128 ...)
(SubSaturatedInt8x32 ...) => (VPSUBSB256 ...)
(SubSaturatedInt8x64 ...) => (VPSUBSB512 ...)

View File

@@ -48,19 +48,19 @@ func simdGenericOps() []opData {
{name: "AddInt64x4", argLength: 2, commutative: true},
{name: "AddInt64x8", argLength: 2, commutative: true},
{name: "AddPairsFloat32x4", argLength: 2, commutative: false},
{name: "AddPairsFloat32x8", argLength: 2, commutative: false},
{name: "AddPairsFloat64x2", argLength: 2, commutative: false},
{name: "AddPairsFloat64x4", argLength: 2, commutative: false},
{name: "AddPairsGroupedFloat32x8", argLength: 2, commutative: false},
{name: "AddPairsGroupedFloat64x4", argLength: 2, commutative: false},
{name: "AddPairsGroupedInt16x16", argLength: 2, commutative: false},
{name: "AddPairsGroupedInt32x8", argLength: 2, commutative: false},
{name: "AddPairsGroupedUint16x16", argLength: 2, commutative: false},
{name: "AddPairsGroupedUint32x8", argLength: 2, commutative: false},
{name: "AddPairsInt16x8", argLength: 2, commutative: false},
{name: "AddPairsInt16x16", argLength: 2, commutative: false},
{name: "AddPairsInt32x4", argLength: 2, commutative: false},
{name: "AddPairsInt32x8", argLength: 2, commutative: false},
{name: "AddPairsSaturatedGroupedInt16x16", argLength: 2, commutative: false},
{name: "AddPairsSaturatedInt16x8", argLength: 2, commutative: false},
{name: "AddPairsSaturatedInt16x16", argLength: 2, commutative: false},
{name: "AddPairsUint16x8", argLength: 2, commutative: false},
{name: "AddPairsUint16x16", argLength: 2, commutative: false},
{name: "AddPairsUint32x4", argLength: 2, commutative: false},
{name: "AddPairsUint32x8", argLength: 2, commutative: false},
{name: "AddSaturatedInt8x16", argLength: 2, commutative: true},
{name: "AddSaturatedInt8x32", argLength: 2, commutative: true},
{name: "AddSaturatedInt8x64", argLength: 2, commutative: true},
@@ -1036,19 +1036,19 @@ func simdGenericOps() []opData {
{name: "SubInt64x4", argLength: 2, commutative: false},
{name: "SubInt64x8", argLength: 2, commutative: false},
{name: "SubPairsFloat32x4", argLength: 2, commutative: false},
{name: "SubPairsFloat32x8", argLength: 2, commutative: false},
{name: "SubPairsFloat64x2", argLength: 2, commutative: false},
{name: "SubPairsFloat64x4", argLength: 2, commutative: false},
{name: "SubPairsGroupedFloat32x8", argLength: 2, commutative: false},
{name: "SubPairsGroupedFloat64x4", argLength: 2, commutative: false},
{name: "SubPairsGroupedInt16x16", argLength: 2, commutative: false},
{name: "SubPairsGroupedInt32x8", argLength: 2, commutative: false},
{name: "SubPairsGroupedUint16x16", argLength: 2, commutative: false},
{name: "SubPairsGroupedUint32x8", argLength: 2, commutative: false},
{name: "SubPairsInt16x8", argLength: 2, commutative: false},
{name: "SubPairsInt16x16", argLength: 2, commutative: false},
{name: "SubPairsInt32x4", argLength: 2, commutative: false},
{name: "SubPairsInt32x8", argLength: 2, commutative: false},
{name: "SubPairsSaturatedGroupedInt16x16", argLength: 2, commutative: false},
{name: "SubPairsSaturatedInt16x8", argLength: 2, commutative: false},
{name: "SubPairsSaturatedInt16x16", argLength: 2, commutative: false},
{name: "SubPairsUint16x8", argLength: 2, commutative: false},
{name: "SubPairsUint16x16", argLength: 2, commutative: false},
{name: "SubPairsUint32x4", argLength: 2, commutative: false},
{name: "SubPairsUint32x8", argLength: 2, commutative: false},
{name: "SubSaturatedInt8x16", argLength: 2, commutative: false},
{name: "SubSaturatedInt8x32", argLength: 2, commutative: false},
{name: "SubSaturatedInt8x64", argLength: 2, commutative: false},

View File

@@ -6202,19 +6202,19 @@ const (
OpAddInt64x4
OpAddInt64x8
OpAddPairsFloat32x4
OpAddPairsFloat32x8
OpAddPairsFloat64x2
OpAddPairsFloat64x4
OpAddPairsGroupedFloat32x8
OpAddPairsGroupedFloat64x4
OpAddPairsGroupedInt16x16
OpAddPairsGroupedInt32x8
OpAddPairsGroupedUint16x16
OpAddPairsGroupedUint32x8
OpAddPairsInt16x8
OpAddPairsInt16x16
OpAddPairsInt32x4
OpAddPairsInt32x8
OpAddPairsSaturatedGroupedInt16x16
OpAddPairsSaturatedInt16x8
OpAddPairsSaturatedInt16x16
OpAddPairsUint16x8
OpAddPairsUint16x16
OpAddPairsUint32x4
OpAddPairsUint32x8
OpAddSaturatedInt8x16
OpAddSaturatedInt8x32
OpAddSaturatedInt8x64
@@ -7190,19 +7190,19 @@ const (
OpSubInt64x4
OpSubInt64x8
OpSubPairsFloat32x4
OpSubPairsFloat32x8
OpSubPairsFloat64x2
OpSubPairsFloat64x4
OpSubPairsGroupedFloat32x8
OpSubPairsGroupedFloat64x4
OpSubPairsGroupedInt16x16
OpSubPairsGroupedInt32x8
OpSubPairsGroupedUint16x16
OpSubPairsGroupedUint32x8
OpSubPairsInt16x8
OpSubPairsInt16x16
OpSubPairsInt32x4
OpSubPairsInt32x8
OpSubPairsSaturatedGroupedInt16x16
OpSubPairsSaturatedInt16x8
OpSubPairsSaturatedInt16x16
OpSubPairsUint16x8
OpSubPairsUint16x16
OpSubPairsUint32x4
OpSubPairsUint32x8
OpSubSaturatedInt8x16
OpSubSaturatedInt8x32
OpSubSaturatedInt8x64
@@ -89231,18 +89231,38 @@ var opcodeTable = [...]opInfo{
argLen: 2,
generic: true,
},
{
name: "AddPairsFloat32x8",
argLen: 2,
generic: true,
},
{
name: "AddPairsFloat64x2",
argLen: 2,
generic: true,
},
{
name: "AddPairsFloat64x4",
name: "AddPairsGroupedFloat32x8",
argLen: 2,
generic: true,
},
{
name: "AddPairsGroupedFloat64x4",
argLen: 2,
generic: true,
},
{
name: "AddPairsGroupedInt16x16",
argLen: 2,
generic: true,
},
{
name: "AddPairsGroupedInt32x8",
argLen: 2,
generic: true,
},
{
name: "AddPairsGroupedUint16x16",
argLen: 2,
generic: true,
},
{
name: "AddPairsGroupedUint32x8",
argLen: 2,
generic: true,
},
@@ -89251,18 +89271,13 @@ var opcodeTable = [...]opInfo{
argLen: 2,
generic: true,
},
{
name: "AddPairsInt16x16",
argLen: 2,
generic: true,
},
{
name: "AddPairsInt32x4",
argLen: 2,
generic: true,
},
{
name: "AddPairsInt32x8",
name: "AddPairsSaturatedGroupedInt16x16",
argLen: 2,
generic: true,
},
@@ -89271,31 +89286,16 @@ var opcodeTable = [...]opInfo{
argLen: 2,
generic: true,
},
{
name: "AddPairsSaturatedInt16x16",
argLen: 2,
generic: true,
},
{
name: "AddPairsUint16x8",
argLen: 2,
generic: true,
},
{
name: "AddPairsUint16x16",
argLen: 2,
generic: true,
},
{
name: "AddPairsUint32x4",
argLen: 2,
generic: true,
},
{
name: "AddPairsUint32x8",
argLen: 2,
generic: true,
},
{
name: "AddSaturatedInt8x16",
argLen: 2,
@@ -94393,18 +94393,38 @@ var opcodeTable = [...]opInfo{
argLen: 2,
generic: true,
},
{
name: "SubPairsFloat32x8",
argLen: 2,
generic: true,
},
{
name: "SubPairsFloat64x2",
argLen: 2,
generic: true,
},
{
name: "SubPairsFloat64x4",
name: "SubPairsGroupedFloat32x8",
argLen: 2,
generic: true,
},
{
name: "SubPairsGroupedFloat64x4",
argLen: 2,
generic: true,
},
{
name: "SubPairsGroupedInt16x16",
argLen: 2,
generic: true,
},
{
name: "SubPairsGroupedInt32x8",
argLen: 2,
generic: true,
},
{
name: "SubPairsGroupedUint16x16",
argLen: 2,
generic: true,
},
{
name: "SubPairsGroupedUint32x8",
argLen: 2,
generic: true,
},
@@ -94413,18 +94433,13 @@ var opcodeTable = [...]opInfo{
argLen: 2,
generic: true,
},
{
name: "SubPairsInt16x16",
argLen: 2,
generic: true,
},
{
name: "SubPairsInt32x4",
argLen: 2,
generic: true,
},
{
name: "SubPairsInt32x8",
name: "SubPairsSaturatedGroupedInt16x16",
argLen: 2,
generic: true,
},
@@ -94433,31 +94448,16 @@ var opcodeTable = [...]opInfo{
argLen: 2,
generic: true,
},
{
name: "SubPairsSaturatedInt16x16",
argLen: 2,
generic: true,
},
{
name: "SubPairsUint16x8",
argLen: 2,
generic: true,
},
{
name: "SubPairsUint16x16",
argLen: 2,
generic: true,
},
{
name: "SubPairsUint32x4",
argLen: 2,
generic: true,
},
{
name: "SubPairsUint32x8",
argLen: 2,
generic: true,
},
{
name: "SubSaturatedInt8x16",
argLen: 2,

View File

@@ -2113,45 +2113,45 @@ func rewriteValueAMD64(v *Value) bool {
case OpAddPairsFloat32x4:
v.Op = OpAMD64VHADDPS128
return true
case OpAddPairsFloat32x8:
v.Op = OpAMD64VHADDPS256
return true
case OpAddPairsFloat64x2:
v.Op = OpAMD64VHADDPD128
return true
case OpAddPairsFloat64x4:
case OpAddPairsGroupedFloat32x8:
v.Op = OpAMD64VHADDPS256
return true
case OpAddPairsGroupedFloat64x4:
v.Op = OpAMD64VHADDPD256
return true
case OpAddPairsInt16x16:
case OpAddPairsGroupedInt16x16:
v.Op = OpAMD64VPHADDW256
return true
case OpAddPairsGroupedInt32x8:
v.Op = OpAMD64VPHADDD256
return true
case OpAddPairsGroupedUint16x16:
v.Op = OpAMD64VPHADDW256
return true
case OpAddPairsGroupedUint32x8:
v.Op = OpAMD64VPHADDD256
return true
case OpAddPairsInt16x8:
v.Op = OpAMD64VPHADDW128
return true
case OpAddPairsInt32x4:
v.Op = OpAMD64VPHADDD128
return true
case OpAddPairsInt32x8:
v.Op = OpAMD64VPHADDD256
return true
case OpAddPairsSaturatedInt16x16:
case OpAddPairsSaturatedGroupedInt16x16:
v.Op = OpAMD64VPHADDSW256
return true
case OpAddPairsSaturatedInt16x8:
v.Op = OpAMD64VPHADDSW128
return true
case OpAddPairsUint16x16:
v.Op = OpAMD64VPHADDW256
return true
case OpAddPairsUint16x8:
v.Op = OpAMD64VPHADDW128
return true
case OpAddPairsUint32x4:
v.Op = OpAMD64VPHADDD128
return true
case OpAddPairsUint32x8:
v.Op = OpAMD64VPHADDD256
return true
case OpAddPtr:
v.Op = OpAMD64ADDQ
return true
@@ -5860,45 +5860,45 @@ func rewriteValueAMD64(v *Value) bool {
case OpSubPairsFloat32x4:
v.Op = OpAMD64VHSUBPS128
return true
case OpSubPairsFloat32x8:
v.Op = OpAMD64VHSUBPS256
return true
case OpSubPairsFloat64x2:
v.Op = OpAMD64VHSUBPD128
return true
case OpSubPairsFloat64x4:
case OpSubPairsGroupedFloat32x8:
v.Op = OpAMD64VHSUBPS256
return true
case OpSubPairsGroupedFloat64x4:
v.Op = OpAMD64VHSUBPD256
return true
case OpSubPairsInt16x16:
case OpSubPairsGroupedInt16x16:
v.Op = OpAMD64VPHSUBW256
return true
case OpSubPairsGroupedInt32x8:
v.Op = OpAMD64VPHSUBD256
return true
case OpSubPairsGroupedUint16x16:
v.Op = OpAMD64VPHSUBW256
return true
case OpSubPairsGroupedUint32x8:
v.Op = OpAMD64VPHSUBD256
return true
case OpSubPairsInt16x8:
v.Op = OpAMD64VPHSUBW128
return true
case OpSubPairsInt32x4:
v.Op = OpAMD64VPHSUBD128
return true
case OpSubPairsInt32x8:
v.Op = OpAMD64VPHSUBD256
return true
case OpSubPairsSaturatedInt16x16:
case OpSubPairsSaturatedGroupedInt16x16:
v.Op = OpAMD64VPHSUBSW256
return true
case OpSubPairsSaturatedInt16x8:
v.Op = OpAMD64VPHSUBSW128
return true
case OpSubPairsUint16x16:
v.Op = OpAMD64VPHSUBW256
return true
case OpSubPairsUint16x8:
v.Op = OpAMD64VPHSUBW128
return true
case OpSubPairsUint32x4:
v.Op = OpAMD64VPHSUBD128
return true
case OpSubPairsUint32x8:
v.Op = OpAMD64VPHSUBD256
return true
case OpSubPtr:
v.Op = OpAMD64SUBQ
return true

View File

@@ -69,19 +69,19 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Uint64x4.Add", opLen2(ssa.OpAddUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.Add", opLen2(ssa.OpAddUint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.AddPairs", opLen2(ssa.OpAddPairsFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.AddPairs", opLen2(ssa.OpAddPairsFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x2.AddPairs", opLen2(ssa.OpAddPairsFloat64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float64x4.AddPairs", opLen2(ssa.OpAddPairsFloat64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x8.AddPairs", opLen2(ssa.OpAddPairsInt16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.AddPairs", opLen2(ssa.OpAddPairsInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int32x4.AddPairs", opLen2(ssa.OpAddPairsInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int32x8.AddPairs", opLen2(ssa.OpAddPairsInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint16x8.AddPairs", opLen2(ssa.OpAddPairsUint16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint16x16.AddPairs", opLen2(ssa.OpAddPairsUint16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint32x4.AddPairs", opLen2(ssa.OpAddPairsUint32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint32x8.AddPairs", opLen2(ssa.OpAddPairsUint32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float32x8.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x4.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedFloat64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x16.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int32x8.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint16x16.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedUint16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint32x8.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedUint32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x8.AddPairsSaturated", opLen2(ssa.OpAddPairsSaturatedInt16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.AddPairsSaturated", opLen2(ssa.OpAddPairsSaturatedInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x16.AddPairsSaturatedGrouped", opLen2(ssa.OpAddPairsSaturatedGroupedInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x16.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x64, types.TypeVec512), sys.AMD64)
@@ -1193,19 +1193,19 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Uint64x4.Sub", opLen2(ssa.OpSubUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.Sub", opLen2(ssa.OpSubUint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.SubPairs", opLen2(ssa.OpSubPairsFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.SubPairs", opLen2(ssa.OpSubPairsFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x2.SubPairs", opLen2(ssa.OpSubPairsFloat64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float64x4.SubPairs", opLen2(ssa.OpSubPairsFloat64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x8.SubPairs", opLen2(ssa.OpSubPairsInt16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.SubPairs", opLen2(ssa.OpSubPairsInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int32x4.SubPairs", opLen2(ssa.OpSubPairsInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int32x8.SubPairs", opLen2(ssa.OpSubPairsInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint16x8.SubPairs", opLen2(ssa.OpSubPairsUint16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint16x16.SubPairs", opLen2(ssa.OpSubPairsUint16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint32x4.SubPairs", opLen2(ssa.OpSubPairsUint32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint32x8.SubPairs", opLen2(ssa.OpSubPairsUint32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float32x8.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x4.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedFloat64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x16.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int32x8.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint16x16.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedUint16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint32x8.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedUint32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x8.SubPairsSaturated", opLen2(ssa.OpSubPairsSaturatedInt16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.SubPairsSaturated", opLen2(ssa.OpSubPairsSaturatedInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x16.SubPairsSaturatedGrouped", opLen2(ssa.OpSubPairsSaturatedGroupedInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x16.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x64, types.TypeVec512), sys.AMD64)

View File

@@ -17,21 +17,83 @@
// NAME subtracts corresponding elements of two vectors with saturation.
- go: AddPairs
commutative: false
out:
- elemBits: 16|32
documentation: !string |-
// NAME horizontally adds adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
- go: AddPairs
commutative: false
out:
- elemBits: 64
documentation: !string |-
// NAME horizontally adds adjacent pairs of elements.
// For x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1].
- go: SubPairs
commutative: false
out:
- elemBits: 16|32
documentation: !string |-
// NAME horizontally subtracts adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
- go: SubPairs
commutative: false
out:
- elemBits: 64
documentation: !string |-
// NAME horizontally subtracts adjacent pairs of elements.
// For x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1].
- go: AddPairsSaturated
commutative: false
documentation: !string |-
// NAME horizontally adds adjacent pairs of elements with saturation.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
- go: SubPairsSaturated
commutative: false
documentation: !string |-
// NAME horizontally subtracts adjacent pairs of elements with saturation.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
- go: AddPairsGrouped
commutative: false
out:
- elemBits: 16|32
documentation: !string |-
// NAME horizontally adds adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
- go: AddPairsGrouped
commutative: false
out:
- elemBits: 64
documentation: !string |-
// NAME horizontally adds adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1].
- go: SubPairsGrouped
commutative: false
out:
- elemBits: 16|32
documentation: !string |-
// NAME horizontally subtracts adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
- go: SubPairsGrouped
commutative: false
out:
- elemBits: 64
documentation: !string |-
// NAME horizontally subtracts adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1].
- go: AddPairsSaturatedGrouped
commutative: false
documentation: !string |-
// NAME horizontally adds adjacent pairs of elements with saturation.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
- go: SubPairsSaturatedGrouped
commutative: false
documentation: !string |-
// NAME horizontally subtracts adjacent pairs of elements with saturation.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].

View File

@@ -53,25 +53,71 @@
- *uint
- go: AddPairs
asm: "VPHADD[DW]"
in: *2any
out: *1any
in: &2any128
- &any128
go: $t
bits: 128
- *any128
out: &1any128
- *any128
- go: SubPairs
asm: "VPHSUB[DW]"
in: *2any
out: *1any
in: *2any128
out: *1any128
- go: AddPairs
asm: "VHADDP[SD]" # floats
in: *2any
out: *1any
in: *2any128
out: *1any128
- go: SubPairs
asm: "VHSUBP[SD]" # floats
in: *2any
out: *1any
in: *2any128
out: *1any128
- go: AddPairsSaturated
asm: "VPHADDS[DW]"
in: *2int
out: *1int
in: &2int128
- &int128
go: $t
base: int
bits: 128
- *int128
out: &1int128
- *int128
- go: SubPairsSaturated
asm: "VPHSUBS[DW]"
in: *2int
out: *1int
in: *2int128
out: *1int128
- go: AddPairsGrouped
asm: "VPHADD[DW]"
in: &2any256
- &any256
go: $t
bits: 256
- *any256
out: &1any256
- *any256
- go: SubPairsGrouped
asm: "VPHSUB[DW]"
in: *2any256
out: *1any256
- go: AddPairsGrouped
asm: "VHADDP[SD]" # floats
in: *2any256
out: *1any256
- go: SubPairsGrouped
asm: "VHSUBP[SD]" # floats
in: *2any256
out: *1any256
- go: AddPairsSaturatedGrouped
asm: "VPHADDS[DW]"
in: &2int256
- &int256
go: $t
base: int
bits: 256
- *int256
out: &1int256
- *int256
- go: SubPairsSaturatedGrouped
asm: "VPHSUBS[DW]"
in: *2int256
out: *1int256

View File

@@ -13,6 +13,7 @@ import (
"simd/archsimd"
"slices"
"testing"
"unsafe"
)
func TestMain(m *testing.M) {
@@ -1228,3 +1229,70 @@ func TestClMul(t *testing.T) {
foo(y.CarrylessMultiply(0, 0, y), []uint64{5, 0})
}
// addPairsSlice is the scalar reference for a non-grouped pairwise add:
// the low half of the result holds the sums of adjacent pairs of a, and
// the high half holds the sums of adjacent pairs of b.
func addPairsSlice[T number](a, b []T) []T {
	half := len(a) / 2
	out := make([]T, len(a))
	for j := 0; j < half; j++ {
		lo := 2 * j
		out[j] = a[lo] + a[lo+1]
		out[half+j] = b[lo] + b[lo+1]
	}
	return out
}
// subPairsSlice is the scalar reference for a non-grouped pairwise
// subtract: the low half of the result holds the differences of adjacent
// pairs of a, and the high half holds those of b.
func subPairsSlice[T number](a, b []T) []T {
	half := len(a) / 2
	out := make([]T, len(a))
	for j := 0; j < half; j++ {
		lo := 2 * j
		out[j] = a[lo] - a[lo+1]
		out[half+j] = b[lo] - b[lo+1]
	}
	return out
}
// addPairsGroupedSlice is the scalar reference for a grouped pairwise
// add: the inputs are split into 128-bit groups and addPairsSlice is
// applied to each group independently, matching the documented
// "with each 128-bit as a group" semantics of the 256-bit operations.
func addPairsGroupedSlice[T number](a, b []T) []T {
	// A group is 128 bits wide. unsafe.Sizeof reports the element size
	// in BYTES, so the element count per group is 128 / (8 * size);
	// dividing 128 by the byte size alone would overstate the group
	// length 8x and make the loop below run zero times.
	group := int(128 / (8 * unsafe.Sizeof(a[0])))
	r := make([]T, 0, len(a))
	for i := range len(a) / group {
		r = append(r, addPairsSlice(a[i*group:(i+1)*group], b[i*group:(i+1)*group])...)
	}
	return r
}
// subPairsGroupedSlice is the scalar reference for a grouped pairwise
// subtract: the inputs are split into 128-bit groups and subPairsSlice
// is applied to each group independently, matching the documented
// "with each 128-bit as a group" semantics of the 256-bit operations.
func subPairsGroupedSlice[T number](a, b []T) []T {
	// A group is 128 bits wide. unsafe.Sizeof reports the element size
	// in BYTES, so the element count per group is 128 / (8 * size);
	// dividing 128 by the byte size alone would overstate the group
	// length 8x and make the loop below run zero times.
	group := int(128 / (8 * unsafe.Sizeof(a[0])))
	r := make([]T, 0, len(a))
	for i := range len(a) / group {
		r = append(r, subPairsSlice(a[i*group:(i+1)*group], b[i*group:(i+1)*group])...)
	}
	return r
}
// TestAddSubPairs checks the pairwise (horizontal) add/sub operations
// against the scalar reference helpers. The 128-bit AddPairs/SubPairs
// methods are compared with addPairsSlice/subPairsSlice; the 256-bit
// *Grouped variants are compared with the grouped helpers and only run
// when AVX2 is available.
func TestAddSubPairs(t *testing.T) {
	// 128-bit, non-grouped: result is [a pairs..., b pairs...].
	testInt16x8Binary(t, archsimd.Int16x8.AddPairs, addPairsSlice[int16])
	testInt16x8Binary(t, archsimd.Int16x8.SubPairs, subPairsSlice[int16])
	testUint16x8Binary(t, archsimd.Uint16x8.AddPairs, addPairsSlice[uint16])
	testUint16x8Binary(t, archsimd.Uint16x8.SubPairs, subPairsSlice[uint16])
	testInt32x4Binary(t, archsimd.Int32x4.AddPairs, addPairsSlice[int32])
	testInt32x4Binary(t, archsimd.Int32x4.SubPairs, subPairsSlice[int32])
	testUint32x4Binary(t, archsimd.Uint32x4.AddPairs, addPairsSlice[uint32])
	testUint32x4Binary(t, archsimd.Uint32x4.SubPairs, subPairsSlice[uint32])
	testFloat32x4Binary(t, archsimd.Float32x4.AddPairs, addPairsSlice[float32])
	testFloat32x4Binary(t, archsimd.Float32x4.SubPairs, subPairsSlice[float32])
	testFloat64x2Binary(t, archsimd.Float64x2.AddPairs, addPairsSlice[float64])
	testFloat64x2Binary(t, archsimd.Float64x2.SubPairs, subPairsSlice[float64])
	// Grouped versions: 256-bit vectors, pairwise op applied within each
	// 128-bit group. These map to AVX2 instructions, so guard on AVX2.
	if archsimd.X86.AVX2() {
		testInt16x16Binary(t, archsimd.Int16x16.AddPairsGrouped, addPairsGroupedSlice[int16])
		testInt16x16Binary(t, archsimd.Int16x16.SubPairsGrouped, subPairsGroupedSlice[int16])
		testUint16x16Binary(t, archsimd.Uint16x16.AddPairsGrouped, addPairsGroupedSlice[uint16])
		testUint16x16Binary(t, archsimd.Uint16x16.SubPairsGrouped, subPairsGroupedSlice[uint16])
		testInt32x8Binary(t, archsimd.Int32x8.AddPairsGrouped, addPairsGroupedSlice[int32])
		testInt32x8Binary(t, archsimd.Int32x8.SubPairsGrouped, subPairsGroupedSlice[int32])
		testUint32x8Binary(t, archsimd.Uint32x8.AddPairsGrouped, addPairsGroupedSlice[uint32])
		testUint32x8Binary(t, archsimd.Uint32x8.SubPairsGrouped, subPairsGroupedSlice[uint32])
		testFloat32x8Binary(t, archsimd.Float32x8.AddPairsGrouped, addPairsGroupedSlice[float32])
		testFloat32x8Binary(t, archsimd.Float32x8.SubPairsGrouped, subPairsGroupedSlice[float32])
		testFloat64x4Binary(t, archsimd.Float64x4.AddPairsGrouped, addPairsGroupedSlice[float64])
		testFloat64x4Binary(t, archsimd.Float64x4.SubPairsGrouped, subPairsGroupedSlice[float64])
	}
}

View File

@@ -349,90 +349,101 @@ func (x Uint64x8) Add(y Uint64x8) Uint64x8
/* AddPairs */
// AddPairs horizontally adds adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
// Asm: VHADDPS, CPU Feature: AVX
func (x Float32x4) AddPairs(y Float32x4) Float32x4
// AddPairs horizontally adds adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
//
// Asm: VHADDPS, CPU Feature: AVX
func (x Float32x8) AddPairs(y Float32x8) Float32x8
// AddPairs horizontally adds adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
// For x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1].
//
// Asm: VHADDPD, CPU Feature: AVX
func (x Float64x2) AddPairs(y Float64x2) Float64x2
// AddPairs horizontally adds adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
//
// Asm: VHADDPD, CPU Feature: AVX
func (x Float64x4) AddPairs(y Float64x4) Float64x4
// AddPairs horizontally adds adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
// Asm: VPHADDW, CPU Feature: AVX
func (x Int16x8) AddPairs(y Int16x8) Int16x8
// AddPairs horizontally adds adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
//
// Asm: VPHADDW, CPU Feature: AVX2
func (x Int16x16) AddPairs(y Int16x16) Int16x16
// AddPairs horizontally adds adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
// Asm: VPHADDD, CPU Feature: AVX
func (x Int32x4) AddPairs(y Int32x4) Int32x4
// AddPairs horizontally adds adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
//
// Asm: VPHADDD, CPU Feature: AVX2
func (x Int32x8) AddPairs(y Int32x8) Int32x8
// AddPairs horizontally adds adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
// Asm: VPHADDW, CPU Feature: AVX
func (x Uint16x8) AddPairs(y Uint16x8) Uint16x8
// AddPairs horizontally adds adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
//
// Asm: VPHADDW, CPU Feature: AVX2
func (x Uint16x16) AddPairs(y Uint16x16) Uint16x16
// AddPairs horizontally adds adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
// Asm: VPHADDD, CPU Feature: AVX
func (x Uint32x4) AddPairs(y Uint32x4) Uint32x4
// AddPairs horizontally adds adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
/* AddPairsGrouped */
// AddPairsGrouped horizontally adds adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
// Asm: VHADDPS, CPU Feature: AVX
func (x Float32x8) AddPairsGrouped(y Float32x8) Float32x8
// AddPairsGrouped horizontally adds adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1].
//
// Asm: VHADDPD, CPU Feature: AVX
func (x Float64x4) AddPairsGrouped(y Float64x4) Float64x4
// AddPairsGrouped horizontally adds adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
// Asm: VPHADDW, CPU Feature: AVX2
func (x Int16x16) AddPairsGrouped(y Int16x16) Int16x16
// AddPairsGrouped horizontally adds adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
// Asm: VPHADDD, CPU Feature: AVX2
func (x Int32x8) AddPairsGrouped(y Int32x8) Int32x8
// AddPairsGrouped horizontally adds adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
// Asm: VPHADDW, CPU Feature: AVX2
func (x Uint16x16) AddPairsGrouped(y Uint16x16) Uint16x16
// AddPairsGrouped horizontally adds adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
// Asm: VPHADDD, CPU Feature: AVX2
func (x Uint32x8) AddPairsGrouped(y Uint32x8) Uint32x8
/* AddPairsSaturated */
// AddPairsSaturated horizontally adds adjacent pairs of elements with saturation.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
// Asm: VPHADDSW, CPU Feature: AVX
func (x Int16x8) AddPairsSaturated(y Int16x8) Int16x8
/* AddPairsSaturatedGrouped */
// AddPairsSaturatedGrouped horizontally adds adjacent pairs of elements with saturation.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
// Asm: VPHADDSW, CPU Feature: AVX2
func (x Int16x16) AddPairsSaturatedGrouped(y Int16x16) Int16x16
/* AddSaturated */
/* SubPairs */
// SubPairs horizontally subtracts adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
// Asm: VHSUBPS, CPU Feature: AVX
func (x Float32x4) SubPairs(y Float32x4) Float32x4
// SubPairs horizontally subtracts adjacent pairs of elements.
// For x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1].
//
// Asm: VHSUBPD, CPU Feature: AVX
func (x Float64x2) SubPairs(y Float64x2) Float64x2
// SubPairs horizontally subtracts adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
// Asm: VPHSUBW, CPU Feature: AVX
func (x Int16x8) SubPairs(y Int16x8) Int16x8
// SubPairs horizontally subtracts adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
// Asm: VPHSUBD, CPU Feature: AVX
func (x Int32x4) SubPairs(y Int32x4) Int32x4
// SubPairs horizontally subtracts adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
// Asm: VPHSUBW, CPU Feature: AVX
func (x Uint16x8) SubPairs(y Uint16x8) Uint16x8
// SubPairs horizontally subtracts adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
// Asm: VPHSUBD, CPU Feature: AVX
func (x Uint32x4) SubPairs(y Uint32x4) Uint32x4
/* SubPairsGrouped */
// SubPairsGrouped horizontally subtracts adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
// Asm: VHSUBPS, CPU Feature: AVX
func (x Float32x8) SubPairsGrouped(y Float32x8) Float32x8
// SubPairsGrouped horizontally subtracts adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1].
//
// Asm: VHSUBPD, CPU Feature: AVX
func (x Float64x4) SubPairsGrouped(y Float64x4) Float64x4
// SubPairsGrouped horizontally subtracts adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
// Asm: VPHSUBW, CPU Feature: AVX2
func (x Int16x16) SubPairsGrouped(y Int16x16) Int16x16
// SubPairsGrouped horizontally subtracts adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
// Asm: VPHSUBD, CPU Feature: AVX2
func (x Int32x8) SubPairsGrouped(y Int32x8) Int32x8
// SubPairsGrouped horizontally subtracts adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
// Asm: VPHSUBW, CPU Feature: AVX2
func (x Uint16x16) SubPairsGrouped(y Uint16x16) Uint16x16
// SubPairsGrouped horizontally subtracts adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
// Asm: VPHSUBD, CPU Feature: AVX2
func (x Uint32x8) SubPairsGrouped(y Uint32x8) Uint32x8
/* SubPairsSaturated */
// SubPairsSaturated horizontally subtracts adjacent pairs of elements with saturation.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
// Asm: VPHSUBSW, CPU Feature: AVX
func (x Int16x8) SubPairsSaturated(y Int16x8) Int16x8
/* SubPairsSaturatedGrouped */
// SubPairsSaturatedGrouped horizontally subtracts adjacent pairs of elements with saturation.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
// Asm: VPHSUBSW, CPU Feature: AVX2
func (x Int16x16) SubPairsSaturatedGrouped(y Int16x16) Int16x16
/* SubSaturated */