simd/archsimd: correct documentation for pairwise operations

For AddPairs, SubPairs, and their Saturated variants, the documented
result element order is wrong. Corrected.

Also, for 256-bit vectors, this is a grouped operation (the pairwise
operation is applied within each 128-bit group independently), so
rename it with a Grouped suffix to make that clear.

Change-Id: Idfd0975cb4a332b2e28c898613861205d26f75b0
Reviewed-on: https://go-review.googlesource.com/c/go/+/732020
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
This commit is contained in:
Cherry Mui
2025-12-22 14:52:57 -05:00
parent 3d77a0b15e
commit c1efada1d2
10 changed files with 446 additions and 248 deletions

View File

@@ -250,12 +250,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPADDQ256,
ssa.OpAMD64VPADDQ512,
ssa.OpAMD64VHADDPS128,
ssa.OpAMD64VHADDPS256,
ssa.OpAMD64VHADDPD128,
ssa.OpAMD64VHADDPD256,
ssa.OpAMD64VPHADDW128,
ssa.OpAMD64VPHADDW256,
ssa.OpAMD64VPHADDD128,
ssa.OpAMD64VHADDPS256,
ssa.OpAMD64VHADDPD256,
ssa.OpAMD64VPHADDW256,
ssa.OpAMD64VPHADDD256,
ssa.OpAMD64VPHADDSW128,
ssa.OpAMD64VPHADDSW256,
@@ -520,12 +520,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPSUBQ256,
ssa.OpAMD64VPSUBQ512,
ssa.OpAMD64VHSUBPS128,
ssa.OpAMD64VHSUBPS256,
ssa.OpAMD64VHSUBPD128,
ssa.OpAMD64VHSUBPD256,
ssa.OpAMD64VPHSUBW128,
ssa.OpAMD64VPHSUBW256,
ssa.OpAMD64VPHSUBD128,
ssa.OpAMD64VHSUBPS256,
ssa.OpAMD64VHSUBPD256,
ssa.OpAMD64VPHSUBW256,
ssa.OpAMD64VPHSUBD256,
ssa.OpAMD64VPHSUBSW128,
ssa.OpAMD64VPHSUBSW256,

View File

@@ -57,19 +57,19 @@
(AddUint64x4 ...) => (VPADDQ256 ...)
(AddUint64x8 ...) => (VPADDQ512 ...)
(AddPairsFloat32x4 ...) => (VHADDPS128 ...)
(AddPairsFloat32x8 ...) => (VHADDPS256 ...)
(AddPairsFloat64x2 ...) => (VHADDPD128 ...)
(AddPairsFloat64x4 ...) => (VHADDPD256 ...)
(AddPairsInt16x8 ...) => (VPHADDW128 ...)
(AddPairsInt16x16 ...) => (VPHADDW256 ...)
(AddPairsInt32x4 ...) => (VPHADDD128 ...)
(AddPairsInt32x8 ...) => (VPHADDD256 ...)
(AddPairsUint16x8 ...) => (VPHADDW128 ...)
(AddPairsUint16x16 ...) => (VPHADDW256 ...)
(AddPairsUint32x4 ...) => (VPHADDD128 ...)
(AddPairsUint32x8 ...) => (VPHADDD256 ...)
(AddPairsGroupedFloat32x8 ...) => (VHADDPS256 ...)
(AddPairsGroupedFloat64x4 ...) => (VHADDPD256 ...)
(AddPairsGroupedInt16x16 ...) => (VPHADDW256 ...)
(AddPairsGroupedInt32x8 ...) => (VPHADDD256 ...)
(AddPairsGroupedUint16x16 ...) => (VPHADDW256 ...)
(AddPairsGroupedUint32x8 ...) => (VPHADDD256 ...)
(AddPairsSaturatedInt16x8 ...) => (VPHADDSW128 ...)
(AddPairsSaturatedInt16x16 ...) => (VPHADDSW256 ...)
(AddPairsSaturatedGroupedInt16x16 ...) => (VPHADDSW256 ...)
(AddSaturatedInt8x16 ...) => (VPADDSB128 ...)
(AddSaturatedInt8x32 ...) => (VPADDSB256 ...)
(AddSaturatedInt8x64 ...) => (VPADDSB512 ...)
@@ -1217,19 +1217,19 @@
(SubUint64x4 ...) => (VPSUBQ256 ...)
(SubUint64x8 ...) => (VPSUBQ512 ...)
(SubPairsFloat32x4 ...) => (VHSUBPS128 ...)
(SubPairsFloat32x8 ...) => (VHSUBPS256 ...)
(SubPairsFloat64x2 ...) => (VHSUBPD128 ...)
(SubPairsFloat64x4 ...) => (VHSUBPD256 ...)
(SubPairsInt16x8 ...) => (VPHSUBW128 ...)
(SubPairsInt16x16 ...) => (VPHSUBW256 ...)
(SubPairsInt32x4 ...) => (VPHSUBD128 ...)
(SubPairsInt32x8 ...) => (VPHSUBD256 ...)
(SubPairsUint16x8 ...) => (VPHSUBW128 ...)
(SubPairsUint16x16 ...) => (VPHSUBW256 ...)
(SubPairsUint32x4 ...) => (VPHSUBD128 ...)
(SubPairsUint32x8 ...) => (VPHSUBD256 ...)
(SubPairsGroupedFloat32x8 ...) => (VHSUBPS256 ...)
(SubPairsGroupedFloat64x4 ...) => (VHSUBPD256 ...)
(SubPairsGroupedInt16x16 ...) => (VPHSUBW256 ...)
(SubPairsGroupedInt32x8 ...) => (VPHSUBD256 ...)
(SubPairsGroupedUint16x16 ...) => (VPHSUBW256 ...)
(SubPairsGroupedUint32x8 ...) => (VPHSUBD256 ...)
(SubPairsSaturatedInt16x8 ...) => (VPHSUBSW128 ...)
(SubPairsSaturatedInt16x16 ...) => (VPHSUBSW256 ...)
(SubPairsSaturatedGroupedInt16x16 ...) => (VPHSUBSW256 ...)
(SubSaturatedInt8x16 ...) => (VPSUBSB128 ...)
(SubSaturatedInt8x32 ...) => (VPSUBSB256 ...)
(SubSaturatedInt8x64 ...) => (VPSUBSB512 ...)

View File

@@ -48,19 +48,19 @@ func simdGenericOps() []opData {
{name: "AddInt64x4", argLength: 2, commutative: true},
{name: "AddInt64x8", argLength: 2, commutative: true},
{name: "AddPairsFloat32x4", argLength: 2, commutative: false},
{name: "AddPairsFloat32x8", argLength: 2, commutative: false},
{name: "AddPairsFloat64x2", argLength: 2, commutative: false},
{name: "AddPairsFloat64x4", argLength: 2, commutative: false},
{name: "AddPairsGroupedFloat32x8", argLength: 2, commutative: false},
{name: "AddPairsGroupedFloat64x4", argLength: 2, commutative: false},
{name: "AddPairsGroupedInt16x16", argLength: 2, commutative: false},
{name: "AddPairsGroupedInt32x8", argLength: 2, commutative: false},
{name: "AddPairsGroupedUint16x16", argLength: 2, commutative: false},
{name: "AddPairsGroupedUint32x8", argLength: 2, commutative: false},
{name: "AddPairsInt16x8", argLength: 2, commutative: false},
{name: "AddPairsInt16x16", argLength: 2, commutative: false},
{name: "AddPairsInt32x4", argLength: 2, commutative: false},
{name: "AddPairsInt32x8", argLength: 2, commutative: false},
{name: "AddPairsSaturatedGroupedInt16x16", argLength: 2, commutative: false},
{name: "AddPairsSaturatedInt16x8", argLength: 2, commutative: false},
{name: "AddPairsSaturatedInt16x16", argLength: 2, commutative: false},
{name: "AddPairsUint16x8", argLength: 2, commutative: false},
{name: "AddPairsUint16x16", argLength: 2, commutative: false},
{name: "AddPairsUint32x4", argLength: 2, commutative: false},
{name: "AddPairsUint32x8", argLength: 2, commutative: false},
{name: "AddSaturatedInt8x16", argLength: 2, commutative: true},
{name: "AddSaturatedInt8x32", argLength: 2, commutative: true},
{name: "AddSaturatedInt8x64", argLength: 2, commutative: true},
@@ -1036,19 +1036,19 @@ func simdGenericOps() []opData {
{name: "SubInt64x4", argLength: 2, commutative: false},
{name: "SubInt64x8", argLength: 2, commutative: false},
{name: "SubPairsFloat32x4", argLength: 2, commutative: false},
{name: "SubPairsFloat32x8", argLength: 2, commutative: false},
{name: "SubPairsFloat64x2", argLength: 2, commutative: false},
{name: "SubPairsFloat64x4", argLength: 2, commutative: false},
{name: "SubPairsGroupedFloat32x8", argLength: 2, commutative: false},
{name: "SubPairsGroupedFloat64x4", argLength: 2, commutative: false},
{name: "SubPairsGroupedInt16x16", argLength: 2, commutative: false},
{name: "SubPairsGroupedInt32x8", argLength: 2, commutative: false},
{name: "SubPairsGroupedUint16x16", argLength: 2, commutative: false},
{name: "SubPairsGroupedUint32x8", argLength: 2, commutative: false},
{name: "SubPairsInt16x8", argLength: 2, commutative: false},
{name: "SubPairsInt16x16", argLength: 2, commutative: false},
{name: "SubPairsInt32x4", argLength: 2, commutative: false},
{name: "SubPairsInt32x8", argLength: 2, commutative: false},
{name: "SubPairsSaturatedGroupedInt16x16", argLength: 2, commutative: false},
{name: "SubPairsSaturatedInt16x8", argLength: 2, commutative: false},
{name: "SubPairsSaturatedInt16x16", argLength: 2, commutative: false},
{name: "SubPairsUint16x8", argLength: 2, commutative: false},
{name: "SubPairsUint16x16", argLength: 2, commutative: false},
{name: "SubPairsUint32x4", argLength: 2, commutative: false},
{name: "SubPairsUint32x8", argLength: 2, commutative: false},
{name: "SubSaturatedInt8x16", argLength: 2, commutative: false},
{name: "SubSaturatedInt8x32", argLength: 2, commutative: false},
{name: "SubSaturatedInt8x64", argLength: 2, commutative: false},

View File

@@ -6202,19 +6202,19 @@ const (
OpAddInt64x4
OpAddInt64x8
OpAddPairsFloat32x4
OpAddPairsFloat32x8
OpAddPairsFloat64x2
OpAddPairsFloat64x4
OpAddPairsGroupedFloat32x8
OpAddPairsGroupedFloat64x4
OpAddPairsGroupedInt16x16
OpAddPairsGroupedInt32x8
OpAddPairsGroupedUint16x16
OpAddPairsGroupedUint32x8
OpAddPairsInt16x8
OpAddPairsInt16x16
OpAddPairsInt32x4
OpAddPairsInt32x8
OpAddPairsSaturatedGroupedInt16x16
OpAddPairsSaturatedInt16x8
OpAddPairsSaturatedInt16x16
OpAddPairsUint16x8
OpAddPairsUint16x16
OpAddPairsUint32x4
OpAddPairsUint32x8
OpAddSaturatedInt8x16
OpAddSaturatedInt8x32
OpAddSaturatedInt8x64
@@ -7190,19 +7190,19 @@ const (
OpSubInt64x4
OpSubInt64x8
OpSubPairsFloat32x4
OpSubPairsFloat32x8
OpSubPairsFloat64x2
OpSubPairsFloat64x4
OpSubPairsGroupedFloat32x8
OpSubPairsGroupedFloat64x4
OpSubPairsGroupedInt16x16
OpSubPairsGroupedInt32x8
OpSubPairsGroupedUint16x16
OpSubPairsGroupedUint32x8
OpSubPairsInt16x8
OpSubPairsInt16x16
OpSubPairsInt32x4
OpSubPairsInt32x8
OpSubPairsSaturatedGroupedInt16x16
OpSubPairsSaturatedInt16x8
OpSubPairsSaturatedInt16x16
OpSubPairsUint16x8
OpSubPairsUint16x16
OpSubPairsUint32x4
OpSubPairsUint32x8
OpSubSaturatedInt8x16
OpSubSaturatedInt8x32
OpSubSaturatedInt8x64
@@ -89231,18 +89231,38 @@ var opcodeTable = [...]opInfo{
argLen: 2,
generic: true,
},
{
name: "AddPairsFloat32x8",
argLen: 2,
generic: true,
},
{
name: "AddPairsFloat64x2",
argLen: 2,
generic: true,
},
{
name: "AddPairsFloat64x4",
name: "AddPairsGroupedFloat32x8",
argLen: 2,
generic: true,
},
{
name: "AddPairsGroupedFloat64x4",
argLen: 2,
generic: true,
},
{
name: "AddPairsGroupedInt16x16",
argLen: 2,
generic: true,
},
{
name: "AddPairsGroupedInt32x8",
argLen: 2,
generic: true,
},
{
name: "AddPairsGroupedUint16x16",
argLen: 2,
generic: true,
},
{
name: "AddPairsGroupedUint32x8",
argLen: 2,
generic: true,
},
@@ -89251,18 +89271,13 @@ var opcodeTable = [...]opInfo{
argLen: 2,
generic: true,
},
{
name: "AddPairsInt16x16",
argLen: 2,
generic: true,
},
{
name: "AddPairsInt32x4",
argLen: 2,
generic: true,
},
{
name: "AddPairsInt32x8",
name: "AddPairsSaturatedGroupedInt16x16",
argLen: 2,
generic: true,
},
@@ -89271,31 +89286,16 @@ var opcodeTable = [...]opInfo{
argLen: 2,
generic: true,
},
{
name: "AddPairsSaturatedInt16x16",
argLen: 2,
generic: true,
},
{
name: "AddPairsUint16x8",
argLen: 2,
generic: true,
},
{
name: "AddPairsUint16x16",
argLen: 2,
generic: true,
},
{
name: "AddPairsUint32x4",
argLen: 2,
generic: true,
},
{
name: "AddPairsUint32x8",
argLen: 2,
generic: true,
},
{
name: "AddSaturatedInt8x16",
argLen: 2,
@@ -94393,18 +94393,38 @@ var opcodeTable = [...]opInfo{
argLen: 2,
generic: true,
},
{
name: "SubPairsFloat32x8",
argLen: 2,
generic: true,
},
{
name: "SubPairsFloat64x2",
argLen: 2,
generic: true,
},
{
name: "SubPairsFloat64x4",
name: "SubPairsGroupedFloat32x8",
argLen: 2,
generic: true,
},
{
name: "SubPairsGroupedFloat64x4",
argLen: 2,
generic: true,
},
{
name: "SubPairsGroupedInt16x16",
argLen: 2,
generic: true,
},
{
name: "SubPairsGroupedInt32x8",
argLen: 2,
generic: true,
},
{
name: "SubPairsGroupedUint16x16",
argLen: 2,
generic: true,
},
{
name: "SubPairsGroupedUint32x8",
argLen: 2,
generic: true,
},
@@ -94413,18 +94433,13 @@ var opcodeTable = [...]opInfo{
argLen: 2,
generic: true,
},
{
name: "SubPairsInt16x16",
argLen: 2,
generic: true,
},
{
name: "SubPairsInt32x4",
argLen: 2,
generic: true,
},
{
name: "SubPairsInt32x8",
name: "SubPairsSaturatedGroupedInt16x16",
argLen: 2,
generic: true,
},
@@ -94433,31 +94448,16 @@ var opcodeTable = [...]opInfo{
argLen: 2,
generic: true,
},
{
name: "SubPairsSaturatedInt16x16",
argLen: 2,
generic: true,
},
{
name: "SubPairsUint16x8",
argLen: 2,
generic: true,
},
{
name: "SubPairsUint16x16",
argLen: 2,
generic: true,
},
{
name: "SubPairsUint32x4",
argLen: 2,
generic: true,
},
{
name: "SubPairsUint32x8",
argLen: 2,
generic: true,
},
{
name: "SubSaturatedInt8x16",
argLen: 2,

View File

@@ -2113,45 +2113,45 @@ func rewriteValueAMD64(v *Value) bool {
case OpAddPairsFloat32x4:
v.Op = OpAMD64VHADDPS128
return true
case OpAddPairsFloat32x8:
v.Op = OpAMD64VHADDPS256
return true
case OpAddPairsFloat64x2:
v.Op = OpAMD64VHADDPD128
return true
case OpAddPairsFloat64x4:
case OpAddPairsGroupedFloat32x8:
v.Op = OpAMD64VHADDPS256
return true
case OpAddPairsGroupedFloat64x4:
v.Op = OpAMD64VHADDPD256
return true
case OpAddPairsInt16x16:
case OpAddPairsGroupedInt16x16:
v.Op = OpAMD64VPHADDW256
return true
case OpAddPairsGroupedInt32x8:
v.Op = OpAMD64VPHADDD256
return true
case OpAddPairsGroupedUint16x16:
v.Op = OpAMD64VPHADDW256
return true
case OpAddPairsGroupedUint32x8:
v.Op = OpAMD64VPHADDD256
return true
case OpAddPairsInt16x8:
v.Op = OpAMD64VPHADDW128
return true
case OpAddPairsInt32x4:
v.Op = OpAMD64VPHADDD128
return true
case OpAddPairsInt32x8:
v.Op = OpAMD64VPHADDD256
return true
case OpAddPairsSaturatedInt16x16:
case OpAddPairsSaturatedGroupedInt16x16:
v.Op = OpAMD64VPHADDSW256
return true
case OpAddPairsSaturatedInt16x8:
v.Op = OpAMD64VPHADDSW128
return true
case OpAddPairsUint16x16:
v.Op = OpAMD64VPHADDW256
return true
case OpAddPairsUint16x8:
v.Op = OpAMD64VPHADDW128
return true
case OpAddPairsUint32x4:
v.Op = OpAMD64VPHADDD128
return true
case OpAddPairsUint32x8:
v.Op = OpAMD64VPHADDD256
return true
case OpAddPtr:
v.Op = OpAMD64ADDQ
return true
@@ -5860,45 +5860,45 @@ func rewriteValueAMD64(v *Value) bool {
case OpSubPairsFloat32x4:
v.Op = OpAMD64VHSUBPS128
return true
case OpSubPairsFloat32x8:
v.Op = OpAMD64VHSUBPS256
return true
case OpSubPairsFloat64x2:
v.Op = OpAMD64VHSUBPD128
return true
case OpSubPairsFloat64x4:
case OpSubPairsGroupedFloat32x8:
v.Op = OpAMD64VHSUBPS256
return true
case OpSubPairsGroupedFloat64x4:
v.Op = OpAMD64VHSUBPD256
return true
case OpSubPairsInt16x16:
case OpSubPairsGroupedInt16x16:
v.Op = OpAMD64VPHSUBW256
return true
case OpSubPairsGroupedInt32x8:
v.Op = OpAMD64VPHSUBD256
return true
case OpSubPairsGroupedUint16x16:
v.Op = OpAMD64VPHSUBW256
return true
case OpSubPairsGroupedUint32x8:
v.Op = OpAMD64VPHSUBD256
return true
case OpSubPairsInt16x8:
v.Op = OpAMD64VPHSUBW128
return true
case OpSubPairsInt32x4:
v.Op = OpAMD64VPHSUBD128
return true
case OpSubPairsInt32x8:
v.Op = OpAMD64VPHSUBD256
return true
case OpSubPairsSaturatedInt16x16:
case OpSubPairsSaturatedGroupedInt16x16:
v.Op = OpAMD64VPHSUBSW256
return true
case OpSubPairsSaturatedInt16x8:
v.Op = OpAMD64VPHSUBSW128
return true
case OpSubPairsUint16x16:
v.Op = OpAMD64VPHSUBW256
return true
case OpSubPairsUint16x8:
v.Op = OpAMD64VPHSUBW128
return true
case OpSubPairsUint32x4:
v.Op = OpAMD64VPHSUBD128
return true
case OpSubPairsUint32x8:
v.Op = OpAMD64VPHSUBD256
return true
case OpSubPtr:
v.Op = OpAMD64SUBQ
return true

View File

@@ -69,19 +69,19 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Uint64x4.Add", opLen2(ssa.OpAddUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.Add", opLen2(ssa.OpAddUint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.AddPairs", opLen2(ssa.OpAddPairsFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.AddPairs", opLen2(ssa.OpAddPairsFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x2.AddPairs", opLen2(ssa.OpAddPairsFloat64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float64x4.AddPairs", opLen2(ssa.OpAddPairsFloat64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x8.AddPairs", opLen2(ssa.OpAddPairsInt16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.AddPairs", opLen2(ssa.OpAddPairsInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int32x4.AddPairs", opLen2(ssa.OpAddPairsInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int32x8.AddPairs", opLen2(ssa.OpAddPairsInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint16x8.AddPairs", opLen2(ssa.OpAddPairsUint16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint16x16.AddPairs", opLen2(ssa.OpAddPairsUint16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint32x4.AddPairs", opLen2(ssa.OpAddPairsUint32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint32x8.AddPairs", opLen2(ssa.OpAddPairsUint32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float32x8.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x4.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedFloat64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x16.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int32x8.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint16x16.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedUint16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint32x8.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedUint32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x8.AddPairsSaturated", opLen2(ssa.OpAddPairsSaturatedInt16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.AddPairsSaturated", opLen2(ssa.OpAddPairsSaturatedInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x16.AddPairsSaturatedGrouped", opLen2(ssa.OpAddPairsSaturatedGroupedInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x16.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x64, types.TypeVec512), sys.AMD64)
@@ -1193,19 +1193,19 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Uint64x4.Sub", opLen2(ssa.OpSubUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.Sub", opLen2(ssa.OpSubUint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.SubPairs", opLen2(ssa.OpSubPairsFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.SubPairs", opLen2(ssa.OpSubPairsFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x2.SubPairs", opLen2(ssa.OpSubPairsFloat64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float64x4.SubPairs", opLen2(ssa.OpSubPairsFloat64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x8.SubPairs", opLen2(ssa.OpSubPairsInt16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.SubPairs", opLen2(ssa.OpSubPairsInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int32x4.SubPairs", opLen2(ssa.OpSubPairsInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int32x8.SubPairs", opLen2(ssa.OpSubPairsInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint16x8.SubPairs", opLen2(ssa.OpSubPairsUint16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint16x16.SubPairs", opLen2(ssa.OpSubPairsUint16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint32x4.SubPairs", opLen2(ssa.OpSubPairsUint32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint32x8.SubPairs", opLen2(ssa.OpSubPairsUint32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float32x8.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x4.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedFloat64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x16.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int32x8.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint16x16.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedUint16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint32x8.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedUint32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x8.SubPairsSaturated", opLen2(ssa.OpSubPairsSaturatedInt16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.SubPairsSaturated", opLen2(ssa.OpSubPairsSaturatedInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x16.SubPairsSaturatedGrouped", opLen2(ssa.OpSubPairsSaturatedGroupedInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x16.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x64, types.TypeVec512), sys.AMD64)

View File

@@ -17,21 +17,83 @@
// NAME subtracts corresponding elements of two vectors with saturation.
- go: AddPairs
commutative: false
out:
- elemBits: 16|32
documentation: !string |-
// NAME horizontally adds adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
- go: AddPairs
commutative: false
out:
- elemBits: 64
documentation: !string |-
// NAME horizontally adds adjacent pairs of elements.
// For x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1].
- go: SubPairs
commutative: false
out:
- elemBits: 16|32
documentation: !string |-
// NAME horizontally subtracts adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
- go: SubPairs
commutative: false
out:
- elemBits: 64
documentation: !string |-
// NAME horizontally subtracts adjacent pairs of elements.
// For x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1].
- go: AddPairsSaturated
commutative: false
documentation: !string |-
// NAME horizontally adds adjacent pairs of elements with saturation.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
- go: SubPairsSaturated
commutative: false
documentation: !string |-
// NAME horizontally subtracts adjacent pairs of elements with saturation.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
- go: AddPairsGrouped
commutative: false
out:
- elemBits: 16|32
documentation: !string |-
// NAME horizontally adds adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
- go: AddPairsGrouped
commutative: false
out:
- elemBits: 64
documentation: !string |-
// NAME horizontally adds adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1].
- go: SubPairsGrouped
commutative: false
out:
- elemBits: 16|32
documentation: !string |-
// NAME horizontally subtracts adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
- go: SubPairsGrouped
commutative: false
out:
- elemBits: 64
documentation: !string |-
// NAME horizontally subtracts adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1].
- go: AddPairsSaturatedGrouped
commutative: false
documentation: !string |-
// NAME horizontally adds adjacent pairs of elements with saturation.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
- go: SubPairsSaturatedGrouped
commutative: false
documentation: !string |-
// NAME horizontally subtracts adjacent pairs of elements with saturation.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].

View File

@@ -53,25 +53,71 @@
- *uint
- go: AddPairs
asm: "VPHADD[DW]"
in: *2any
out: *1any
in: &2any128
- &any128
go: $t
bits: 128
- *any128
out: &1any128
- *any128
- go: SubPairs
asm: "VPHSUB[DW]"
in: *2any
out: *1any
in: *2any128
out: *1any128
- go: AddPairs
asm: "VHADDP[SD]" # floats
in: *2any
out: *1any
in: *2any128
out: *1any128
- go: SubPairs
asm: "VHSUBP[SD]" # floats
in: *2any
out: *1any
in: *2any128
out: *1any128
- go: AddPairsSaturated
asm: "VPHADDS[DW]"
in: *2int
out: *1int
in: &2int128
- &int128
go: $t
base: int
bits: 128
- *int128
out: &1int128
- *int128
- go: SubPairsSaturated
asm: "VPHSUBS[DW]"
in: *2int
out: *1int
in: *2int128
out: *1int128
- go: AddPairsGrouped
asm: "VPHADD[DW]"
in: &2any256
- &any256
go: $t
bits: 256
- *any256
out: &1any256
- *any256
- go: SubPairsGrouped
asm: "VPHSUB[DW]"
in: *2any256
out: *1any256
- go: AddPairsGrouped
asm: "VHADDP[SD]" # floats
in: *2any256
out: *1any256
- go: SubPairsGrouped
asm: "VHSUBP[SD]" # floats
in: *2any256
out: *1any256
- go: AddPairsSaturatedGrouped
asm: "VPHADDS[DW]"
in: &2int256
- &int256
go: $t
base: int
bits: 256
- *int256
out: &1int256
- *int256
- go: SubPairsSaturatedGrouped
asm: "VPHSUBS[DW]"
in: *2int256
out: *1int256

View File

@@ -13,6 +13,7 @@ import (
"simd/archsimd"
"slices"
"testing"
"unsafe"
)
func TestMain(m *testing.M) {
@@ -1228,3 +1229,70 @@ func TestClMul(t *testing.T) {
foo(y.CarrylessMultiply(0, 0, y), []uint64{5, 0})
}
// addPairsSlice is the scalar reference for a non-grouped pairwise add:
// the low half of the result holds the sums of adjacent pairs of a, and
// the high half holds the sums of adjacent pairs of b.
func addPairsSlice[T number](a, b []T) []T {
	half := len(a) / 2
	out := make([]T, len(a))
	for j := 0; j < half; j++ {
		lo := 2 * j
		out[j] = a[lo] + a[lo+1]
		out[half+j] = b[lo] + b[lo+1]
	}
	return out
}
// subPairsSlice is the scalar reference for a non-grouped pairwise
// subtract: the low half of the result holds the differences of adjacent
// pairs of a, and the high half holds those of b.
func subPairsSlice[T number](a, b []T) []T {
	half := len(a) / 2
	out := make([]T, len(a))
	for j := 0; j < half; j++ {
		lo := 2 * j
		out[j] = a[lo] - a[lo+1]
		out[half+j] = b[lo] - b[lo+1]
	}
	return out
}
// addPairsGroupedSlice is the scalar reference for a grouped pairwise
// add: the inputs are split into 128-bit groups and addPairsSlice is
// applied to each group independently, matching the documented
// "with each 128-bit as a group" semantics of the 256-bit operations.
func addPairsGroupedSlice[T number](a, b []T) []T {
	// A group is 128 bits wide. unsafe.Sizeof reports the element size
	// in BYTES, so the element count per group is 128 / (8 * size);
	// dividing 128 by the byte size alone would overstate the group
	// length 8x and make the loop below run zero times.
	group := int(128 / (8 * unsafe.Sizeof(a[0])))
	r := make([]T, 0, len(a))
	for i := range len(a) / group {
		r = append(r, addPairsSlice(a[i*group:(i+1)*group], b[i*group:(i+1)*group])...)
	}
	return r
}
// subPairsGroupedSlice is the scalar reference for a grouped pairwise
// subtract: the inputs are split into 128-bit groups and subPairsSlice
// is applied to each group independently, matching the documented
// "with each 128-bit as a group" semantics of the 256-bit operations.
func subPairsGroupedSlice[T number](a, b []T) []T {
	// A group is 128 bits wide. unsafe.Sizeof reports the element size
	// in BYTES, so the element count per group is 128 / (8 * size);
	// dividing 128 by the byte size alone would overstate the group
	// length 8x and make the loop below run zero times.
	group := int(128 / (8 * unsafe.Sizeof(a[0])))
	r := make([]T, 0, len(a))
	for i := range len(a) / group {
		r = append(r, subPairsSlice(a[i*group:(i+1)*group], b[i*group:(i+1)*group])...)
	}
	return r
}
// TestAddSubPairs checks the pairwise (horizontal) add/sub operations
// against the scalar reference helpers. The 128-bit AddPairs/SubPairs
// methods are compared with addPairsSlice/subPairsSlice; the 256-bit
// *Grouped variants are compared with the grouped helpers and only run
// when AVX2 is available.
func TestAddSubPairs(t *testing.T) {
	// 128-bit, non-grouped: result is [a pairs..., b pairs...].
	testInt16x8Binary(t, archsimd.Int16x8.AddPairs, addPairsSlice[int16])
	testInt16x8Binary(t, archsimd.Int16x8.SubPairs, subPairsSlice[int16])
	testUint16x8Binary(t, archsimd.Uint16x8.AddPairs, addPairsSlice[uint16])
	testUint16x8Binary(t, archsimd.Uint16x8.SubPairs, subPairsSlice[uint16])
	testInt32x4Binary(t, archsimd.Int32x4.AddPairs, addPairsSlice[int32])
	testInt32x4Binary(t, archsimd.Int32x4.SubPairs, subPairsSlice[int32])
	testUint32x4Binary(t, archsimd.Uint32x4.AddPairs, addPairsSlice[uint32])
	testUint32x4Binary(t, archsimd.Uint32x4.SubPairs, subPairsSlice[uint32])
	testFloat32x4Binary(t, archsimd.Float32x4.AddPairs, addPairsSlice[float32])
	testFloat32x4Binary(t, archsimd.Float32x4.SubPairs, subPairsSlice[float32])
	testFloat64x2Binary(t, archsimd.Float64x2.AddPairs, addPairsSlice[float64])
	testFloat64x2Binary(t, archsimd.Float64x2.SubPairs, subPairsSlice[float64])
	// Grouped versions: 256-bit vectors, pairwise op applied within each
	// 128-bit group. These map to AVX2 instructions, so guard on AVX2.
	if archsimd.X86.AVX2() {
		testInt16x16Binary(t, archsimd.Int16x16.AddPairsGrouped, addPairsGroupedSlice[int16])
		testInt16x16Binary(t, archsimd.Int16x16.SubPairsGrouped, subPairsGroupedSlice[int16])
		testUint16x16Binary(t, archsimd.Uint16x16.AddPairsGrouped, addPairsGroupedSlice[uint16])
		testUint16x16Binary(t, archsimd.Uint16x16.SubPairsGrouped, subPairsGroupedSlice[uint16])
		testInt32x8Binary(t, archsimd.Int32x8.AddPairsGrouped, addPairsGroupedSlice[int32])
		testInt32x8Binary(t, archsimd.Int32x8.SubPairsGrouped, subPairsGroupedSlice[int32])
		testUint32x8Binary(t, archsimd.Uint32x8.AddPairsGrouped, addPairsGroupedSlice[uint32])
		testUint32x8Binary(t, archsimd.Uint32x8.SubPairsGrouped, subPairsGroupedSlice[uint32])
		testFloat32x8Binary(t, archsimd.Float32x8.AddPairsGrouped, addPairsGroupedSlice[float32])
		testFloat32x8Binary(t, archsimd.Float32x8.SubPairsGrouped, subPairsGroupedSlice[float32])
		testFloat64x4Binary(t, archsimd.Float64x4.AddPairsGrouped, addPairsGroupedSlice[float64])
		testFloat64x4Binary(t, archsimd.Float64x4.SubPairsGrouped, subPairsGroupedSlice[float64])
	}
}

View File

@@ -349,90 +349,101 @@ func (x Uint64x8) Add(y Uint64x8) Uint64x8
/* AddPairs */
// AddPairs horizontally adds adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
// Asm: VHADDPS, CPU Feature: AVX
func (x Float32x4) AddPairs(y Float32x4) Float32x4
// AddPairs horizontally adds adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
//
// Asm: VHADDPS, CPU Feature: AVX
func (x Float32x8) AddPairs(y Float32x8) Float32x8
// AddPairs horizontally adds adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
// For x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1].
//
// Asm: VHADDPD, CPU Feature: AVX
func (x Float64x2) AddPairs(y Float64x2) Float64x2
// AddPairs horizontally adds adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
//
// Asm: VHADDPD, CPU Feature: AVX
func (x Float64x4) AddPairs(y Float64x4) Float64x4
// AddPairs horizontally adds adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
// Asm: VPHADDW, CPU Feature: AVX
func (x Int16x8) AddPairs(y Int16x8) Int16x8
// AddPairs horizontally adds adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
//
// Asm: VPHADDW, CPU Feature: AVX2
func (x Int16x16) AddPairs(y Int16x16) Int16x16
// AddPairs horizontally adds adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
// Asm: VPHADDD, CPU Feature: AVX
func (x Int32x4) AddPairs(y Int32x4) Int32x4
// AddPairs horizontally adds adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
//
// Asm: VPHADDD, CPU Feature: AVX2
func (x Int32x8) AddPairs(y Int32x8) Int32x8
// AddPairs horizontally adds adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
// Asm: VPHADDW, CPU Feature: AVX
func (x Uint16x8) AddPairs(y Uint16x8) Uint16x8
// AddPairs horizontally adds adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
//
// Asm: VPHADDW, CPU Feature: AVX2
func (x Uint16x16) AddPairs(y Uint16x16) Uint16x16
// AddPairs horizontally adds adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
// Asm: VPHADDD, CPU Feature: AVX
func (x Uint32x4) AddPairs(y Uint32x4) Uint32x4
// AddPairs horizontally adds adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
/* AddPairsGrouped */
// AddPairsGrouped horizontally adds adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
// Asm: VHADDPS, CPU Feature: AVX
func (x Float32x8) AddPairsGrouped(y Float32x8) Float32x8
// AddPairsGrouped horizontally adds adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1].
//
// Asm: VHADDPD, CPU Feature: AVX
func (x Float64x4) AddPairsGrouped(y Float64x4) Float64x4
// AddPairsGrouped horizontally adds adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
// Asm: VPHADDW, CPU Feature: AVX2
func (x Int16x16) AddPairsGrouped(y Int16x16) Int16x16
// AddPairsGrouped horizontally adds adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
// Asm: VPHADDD, CPU Feature: AVX2
func (x Int32x8) AddPairsGrouped(y Int32x8) Int32x8
// AddPairsGrouped horizontally adds adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
// Asm: VPHADDW, CPU Feature: AVX2
func (x Uint16x16) AddPairsGrouped(y Uint16x16) Uint16x16
// AddPairsGrouped horizontally adds adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
// Asm: VPHADDD, CPU Feature: AVX2
func (x Uint32x8) AddPairsGrouped(y Uint32x8) Uint32x8
/* AddPairsSaturated */
// AddPairsSaturated horizontally adds adjacent pairs of elements with saturation.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
// Asm: VPHADDSW, CPU Feature: AVX
func (x Int16x8) AddPairsSaturated(y Int16x8) Int16x8
/* AddPairsSaturatedGrouped */
// AddPairsSaturatedGrouped horizontally adds adjacent pairs of elements with saturation.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
// Asm: VPHADDSW, CPU Feature: AVX2
func (x Int16x16) AddPairsSaturatedGrouped(y Int16x16) Int16x16
/* AddSaturated */
/* SubPairs */
// SubPairs horizontally subtracts adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
// Asm: VHSUBPS, CPU Feature: AVX
func (x Float32x4) SubPairs(y Float32x4) Float32x4
// SubPairs horizontally subtracts adjacent pairs of elements.
// For x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1].
//
// Asm: VHSUBPD, CPU Feature: AVX
func (x Float64x2) SubPairs(y Float64x2) Float64x2
// SubPairs horizontally subtracts adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
// Asm: VPHSUBW, CPU Feature: AVX
func (x Int16x8) SubPairs(y Int16x8) Int16x8
// SubPairs horizontally subtracts adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
// Asm: VPHSUBD, CPU Feature: AVX
func (x Int32x4) SubPairs(y Int32x4) Int32x4
// SubPairs horizontally subtracts adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
// Asm: VPHSUBW, CPU Feature: AVX
func (x Uint16x8) SubPairs(y Uint16x8) Uint16x8
// SubPairs horizontally subtracts adjacent pairs of elements.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
// Asm: VPHSUBD, CPU Feature: AVX
func (x Uint32x4) SubPairs(y Uint32x4) Uint32x4
/* SubPairsGrouped */
// SubPairsGrouped horizontally subtracts adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
// Asm: VHSUBPS, CPU Feature: AVX
func (x Float32x8) SubPairsGrouped(y Float32x8) Float32x8
// SubPairsGrouped horizontally subtracts adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1].
//
// Asm: VHSUBPD, CPU Feature: AVX
func (x Float64x4) SubPairsGrouped(y Float64x4) Float64x4
// SubPairsGrouped horizontally subtracts adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
// Asm: VPHSUBW, CPU Feature: AVX2
func (x Int16x16) SubPairsGrouped(y Int16x16) Int16x16
// SubPairsGrouped horizontally subtracts adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
// Asm: VPHSUBD, CPU Feature: AVX2
func (x Int32x8) SubPairsGrouped(y Int32x8) Int32x8
// SubPairsGrouped horizontally subtracts adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
// Asm: VPHSUBW, CPU Feature: AVX2
func (x Uint16x16) SubPairsGrouped(y Uint16x16) Uint16x16
// SubPairsGrouped horizontally subtracts adjacent pairs of elements.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
// Asm: VPHSUBD, CPU Feature: AVX2
func (x Uint32x8) SubPairsGrouped(y Uint32x8) Uint32x8
/* SubPairsSaturated */
// SubPairsSaturated horizontally subtracts adjacent pairs of elements with saturation.
// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
// Asm: VPHSUBSW, CPU Feature: AVX
func (x Int16x8) SubPairsSaturated(y Int16x8) Int16x8
/* SubPairsSaturatedGrouped */
// SubPairsSaturatedGrouped horizontally subtracts adjacent pairs of elements with saturation.
// With each 128-bit as a group:
// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
// Asm: VPHSUBSW, CPU Feature: AVX2
func (x Int16x16) SubPairsSaturatedGrouped(y Int16x16) Int16x16
/* SubSaturated */