mirror of
https://github.com/golang/go.git
synced 2026-01-29 23:22:06 +03:00
[dev.simd] simd, cmd/compile: added simd methods for VSHUFP[DS]
These are package private, and will be hidden behind other methods in a following CL with a more general interface. Change-Id: Id090a5de06a0e2aed5cc60a11ff627c5e3b9c52d Reviewed-on: https://go-review.googlesource.com/c/go/+/698577 Reviewed-by: Junyang Shao <shaojunyang@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
@@ -1074,7 +1074,13 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
|
||||
ssa.OpAMD64VPSHRDD512,
|
||||
ssa.OpAMD64VPSHRDQ128,
|
||||
ssa.OpAMD64VPSHRDQ256,
|
||||
ssa.OpAMD64VPSHRDQ512:
|
||||
ssa.OpAMD64VPSHRDQ512,
|
||||
ssa.OpAMD64VSHUFPS128,
|
||||
ssa.OpAMD64VSHUFPD128,
|
||||
ssa.OpAMD64VSHUFPS256,
|
||||
ssa.OpAMD64VSHUFPS512,
|
||||
ssa.OpAMD64VSHUFPD256,
|
||||
ssa.OpAMD64VSHUFPD512:
|
||||
p = simdV21Imm8(s, v)
|
||||
|
||||
case ssa.OpAMD64VCMPPS512,
|
||||
@@ -1878,7 +1884,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
|
||||
ssa.OpAMD64VPSHRDD512load,
|
||||
ssa.OpAMD64VPSHRDQ128load,
|
||||
ssa.OpAMD64VPSHRDQ256load,
|
||||
ssa.OpAMD64VPSHRDQ512load:
|
||||
ssa.OpAMD64VPSHRDQ512load,
|
||||
ssa.OpAMD64VSHUFPS512load,
|
||||
ssa.OpAMD64VSHUFPD512load:
|
||||
p = simdV21loadImm8(s, v)
|
||||
|
||||
case ssa.OpAMD64VCMPPS512load,
|
||||
|
||||
@@ -1279,6 +1279,24 @@
|
||||
(blendMaskedInt16x32 x y mask) => (VPBLENDMWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
|
||||
(blendMaskedInt32x16 x y mask) => (VPBLENDMDMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
|
||||
(blendMaskedInt64x8 x y mask) => (VPBLENDMQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
|
||||
(concatSelectedConstantFloat32x4 ...) => (VSHUFPS128 ...)
|
||||
(concatSelectedConstantFloat64x2 ...) => (VSHUFPD128 ...)
|
||||
(concatSelectedConstantInt32x4 ...) => (VSHUFPS128 ...)
|
||||
(concatSelectedConstantInt64x2 ...) => (VSHUFPD128 ...)
|
||||
(concatSelectedConstantUint32x4 ...) => (VSHUFPS128 ...)
|
||||
(concatSelectedConstantUint64x2 ...) => (VSHUFPD128 ...)
|
||||
(concatSelectedConstantGroupedFloat32x8 ...) => (VSHUFPS256 ...)
|
||||
(concatSelectedConstantGroupedFloat32x16 ...) => (VSHUFPS512 ...)
|
||||
(concatSelectedConstantGroupedFloat64x4 ...) => (VSHUFPD256 ...)
|
||||
(concatSelectedConstantGroupedFloat64x8 ...) => (VSHUFPD512 ...)
|
||||
(concatSelectedConstantGroupedInt32x8 ...) => (VSHUFPS256 ...)
|
||||
(concatSelectedConstantGroupedInt32x16 ...) => (VSHUFPS512 ...)
|
||||
(concatSelectedConstantGroupedInt64x4 ...) => (VSHUFPD256 ...)
|
||||
(concatSelectedConstantGroupedInt64x8 ...) => (VSHUFPD512 ...)
|
||||
(concatSelectedConstantGroupedUint32x8 ...) => (VSHUFPS256 ...)
|
||||
(concatSelectedConstantGroupedUint32x16 ...) => (VSHUFPS512 ...)
|
||||
(concatSelectedConstantGroupedUint64x4 ...) => (VSHUFPD256 ...)
|
||||
(concatSelectedConstantGroupedUint64x8 ...) => (VSHUFPD512 ...)
|
||||
(moveMaskedFloat32x16 x mask) => (VMOVUPSMasked512 x (VPMOVVec32x16ToM <types.TypeMask> mask))
|
||||
(moveMaskedFloat64x8 x mask) => (VMOVUPDMasked512 x (VPMOVVec64x8ToM <types.TypeMask> mask))
|
||||
(moveMaskedInt8x64 x mask) => (VMOVDQU8Masked512 x (VPMOVVec8x64ToM <types.TypeMask> mask))
|
||||
@@ -1993,6 +2011,8 @@
|
||||
(VPXORQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPXORQMasked512load {sym} [off] x ptr mask mem)
|
||||
(VPBLENDMDMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPBLENDMDMasked512load {sym} [off] x ptr mask mem)
|
||||
(VPBLENDMQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPBLENDMQMasked512load {sym} [off] x ptr mask mem)
|
||||
(VSHUFPS512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSHUFPS512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
|
||||
(VSHUFPD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSHUFPD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
|
||||
(VPSLLD512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLD512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
|
||||
(VPSLLQ512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLQ512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
|
||||
(VPSLLDMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLDMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
|
||||
|
||||
@@ -1256,6 +1256,12 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
|
||||
{name: "VPSHRDQMasked128", argLength: 3, reg: w2kw, asm: "VPSHRDQ", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
|
||||
{name: "VPSHRDQMasked256", argLength: 3, reg: w2kw, asm: "VPSHRDQ", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
|
||||
{name: "VPSHRDQMasked512", argLength: 3, reg: w2kw, asm: "VPSHRDQ", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
|
||||
{name: "VSHUFPS128", argLength: 2, reg: v21, asm: "VSHUFPS", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
|
||||
{name: "VSHUFPD128", argLength: 2, reg: v21, asm: "VSHUFPD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
|
||||
{name: "VSHUFPS256", argLength: 2, reg: v21, asm: "VSHUFPS", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
|
||||
{name: "VSHUFPS512", argLength: 2, reg: w21, asm: "VSHUFPS", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
|
||||
{name: "VSHUFPD256", argLength: 2, reg: v21, asm: "VSHUFPD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
|
||||
{name: "VSHUFPD512", argLength: 2, reg: w21, asm: "VSHUFPD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
|
||||
{name: "VPSLLW128const", argLength: 1, reg: v11, asm: "VPSLLW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
|
||||
{name: "VPSLLW256const", argLength: 1, reg: v11, asm: "VPSLLW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
|
||||
{name: "VPSLLW512const", argLength: 1, reg: w11, asm: "VPSLLW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
|
||||
@@ -1834,6 +1840,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
|
||||
{name: "VPSHRDQMasked128load", argLength: 4, reg: w2kwload, asm: "VPSHRDQ", commutative: false, typ: "Vec128", aux: "SymValAndOff", symEffect: "Read", resultInArg0: false},
|
||||
{name: "VPSHRDQMasked256load", argLength: 4, reg: w2kwload, asm: "VPSHRDQ", commutative: false, typ: "Vec256", aux: "SymValAndOff", symEffect: "Read", resultInArg0: false},
|
||||
{name: "VPSHRDQMasked512load", argLength: 4, reg: w2kwload, asm: "VPSHRDQ", commutative: false, typ: "Vec512", aux: "SymValAndOff", symEffect: "Read", resultInArg0: false},
|
||||
{name: "VSHUFPS512load", argLength: 3, reg: w21load, asm: "VSHUFPS", commutative: false, typ: "Vec512", aux: "SymValAndOff", symEffect: "Read", resultInArg0: false},
|
||||
{name: "VSHUFPD512load", argLength: 3, reg: w21load, asm: "VSHUFPD", commutative: false, typ: "Vec512", aux: "SymValAndOff", symEffect: "Read", resultInArg0: false},
|
||||
{name: "VPSLLD512constload", argLength: 2, reg: w11load, asm: "VPSLLD", commutative: false, typ: "Vec512", aux: "SymValAndOff", symEffect: "Read", resultInArg0: false},
|
||||
{name: "VPSLLQ512constload", argLength: 2, reg: w11load, asm: "VPSLLQ", commutative: false, typ: "Vec512", aux: "SymValAndOff", symEffect: "Read", resultInArg0: false},
|
||||
{name: "VPSLLDMasked128constload", argLength: 3, reg: wkwload, asm: "VPSLLD", commutative: false, typ: "Vec128", aux: "SymValAndOff", symEffect: "Read", resultInArg0: false},
|
||||
|
||||
@@ -1257,5 +1257,23 @@ func simdGenericOps() []opData {
|
||||
{name: "TruncScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "UInt8"},
|
||||
{name: "TruncScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "UInt8"},
|
||||
{name: "TruncScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "UInt8"},
|
||||
{name: "concatSelectedConstantFloat32x4", argLength: 2, commutative: false, aux: "UInt8"},
|
||||
{name: "concatSelectedConstantFloat64x2", argLength: 2, commutative: false, aux: "UInt8"},
|
||||
{name: "concatSelectedConstantGroupedFloat32x8", argLength: 2, commutative: false, aux: "UInt8"},
|
||||
{name: "concatSelectedConstantGroupedFloat32x16", argLength: 2, commutative: false, aux: "UInt8"},
|
||||
{name: "concatSelectedConstantGroupedFloat64x4", argLength: 2, commutative: false, aux: "UInt8"},
|
||||
{name: "concatSelectedConstantGroupedFloat64x8", argLength: 2, commutative: false, aux: "UInt8"},
|
||||
{name: "concatSelectedConstantGroupedInt32x8", argLength: 2, commutative: false, aux: "UInt8"},
|
||||
{name: "concatSelectedConstantGroupedInt32x16", argLength: 2, commutative: false, aux: "UInt8"},
|
||||
{name: "concatSelectedConstantGroupedInt64x4", argLength: 2, commutative: false, aux: "UInt8"},
|
||||
{name: "concatSelectedConstantGroupedInt64x8", argLength: 2, commutative: false, aux: "UInt8"},
|
||||
{name: "concatSelectedConstantGroupedUint32x8", argLength: 2, commutative: false, aux: "UInt8"},
|
||||
{name: "concatSelectedConstantGroupedUint32x16", argLength: 2, commutative: false, aux: "UInt8"},
|
||||
{name: "concatSelectedConstantGroupedUint64x4", argLength: 2, commutative: false, aux: "UInt8"},
|
||||
{name: "concatSelectedConstantGroupedUint64x8", argLength: 2, commutative: false, aux: "UInt8"},
|
||||
{name: "concatSelectedConstantInt32x4", argLength: 2, commutative: false, aux: "UInt8"},
|
||||
{name: "concatSelectedConstantInt64x2", argLength: 2, commutative: false, aux: "UInt8"},
|
||||
{name: "concatSelectedConstantUint32x4", argLength: 2, commutative: false, aux: "UInt8"},
|
||||
{name: "concatSelectedConstantUint64x2", argLength: 2, commutative: false, aux: "UInt8"},
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2488,6 +2488,12 @@ const (
|
||||
OpAMD64VPSHRDQMasked128
|
||||
OpAMD64VPSHRDQMasked256
|
||||
OpAMD64VPSHRDQMasked512
|
||||
OpAMD64VSHUFPS128
|
||||
OpAMD64VSHUFPD128
|
||||
OpAMD64VSHUFPS256
|
||||
OpAMD64VSHUFPS512
|
||||
OpAMD64VSHUFPD256
|
||||
OpAMD64VSHUFPD512
|
||||
OpAMD64VPSLLW128const
|
||||
OpAMD64VPSLLW256const
|
||||
OpAMD64VPSLLW512const
|
||||
@@ -3066,6 +3072,8 @@ const (
|
||||
OpAMD64VPSHRDQMasked128load
|
||||
OpAMD64VPSHRDQMasked256load
|
||||
OpAMD64VPSHRDQMasked512load
|
||||
OpAMD64VSHUFPS512load
|
||||
OpAMD64VSHUFPD512load
|
||||
OpAMD64VPSLLD512constload
|
||||
OpAMD64VPSLLQ512constload
|
||||
OpAMD64VPSLLDMasked128constload
|
||||
@@ -6644,6 +6652,24 @@ const (
|
||||
OpTruncScaledResidueFloat64x2
|
||||
OpTruncScaledResidueFloat64x4
|
||||
OpTruncScaledResidueFloat64x8
|
||||
OpconcatSelectedConstantFloat32x4
|
||||
OpconcatSelectedConstantFloat64x2
|
||||
OpconcatSelectedConstantGroupedFloat32x8
|
||||
OpconcatSelectedConstantGroupedFloat32x16
|
||||
OpconcatSelectedConstantGroupedFloat64x4
|
||||
OpconcatSelectedConstantGroupedFloat64x8
|
||||
OpconcatSelectedConstantGroupedInt32x8
|
||||
OpconcatSelectedConstantGroupedInt32x16
|
||||
OpconcatSelectedConstantGroupedInt64x4
|
||||
OpconcatSelectedConstantGroupedInt64x8
|
||||
OpconcatSelectedConstantGroupedUint32x8
|
||||
OpconcatSelectedConstantGroupedUint32x16
|
||||
OpconcatSelectedConstantGroupedUint64x4
|
||||
OpconcatSelectedConstantGroupedUint64x8
|
||||
OpconcatSelectedConstantInt32x4
|
||||
OpconcatSelectedConstantInt64x2
|
||||
OpconcatSelectedConstantUint32x4
|
||||
OpconcatSelectedConstantUint64x2
|
||||
)
|
||||
|
||||
var opcodeTable = [...]opInfo{
|
||||
@@ -38308,6 +38334,96 @@ var opcodeTable = [...]opInfo{
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "VSHUFPS128",
|
||||
auxType: auxUInt8,
|
||||
argLen: 2,
|
||||
asm: x86.AVSHUFPS,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
|
||||
{1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
|
||||
},
|
||||
outputs: []outputInfo{
|
||||
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "VSHUFPD128",
|
||||
auxType: auxUInt8,
|
||||
argLen: 2,
|
||||
asm: x86.AVSHUFPD,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
|
||||
{1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
|
||||
},
|
||||
outputs: []outputInfo{
|
||||
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "VSHUFPS256",
|
||||
auxType: auxUInt8,
|
||||
argLen: 2,
|
||||
asm: x86.AVSHUFPS,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
|
||||
{1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
|
||||
},
|
||||
outputs: []outputInfo{
|
||||
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "VSHUFPS512",
|
||||
auxType: auxUInt8,
|
||||
argLen: 2,
|
||||
asm: x86.AVSHUFPS,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||
},
|
||||
outputs: []outputInfo{
|
||||
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "VSHUFPD256",
|
||||
auxType: auxUInt8,
|
||||
argLen: 2,
|
||||
asm: x86.AVSHUFPD,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
|
||||
{1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
|
||||
},
|
||||
outputs: []outputInfo{
|
||||
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "VSHUFPD512",
|
||||
auxType: auxUInt8,
|
||||
argLen: 2,
|
||||
asm: x86.AVSHUFPD,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||
},
|
||||
outputs: []outputInfo{
|
||||
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "VPSLLW128const",
|
||||
auxType: auxUInt8,
|
||||
@@ -47864,6 +47980,38 @@ var opcodeTable = [...]opInfo{
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "VSHUFPS512load",
|
||||
auxType: auxSymValAndOff,
|
||||
argLen: 3,
|
||||
symEffect: SymRead,
|
||||
asm: x86.AVSHUFPS,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{1, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
|
||||
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||
},
|
||||
outputs: []outputInfo{
|
||||
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "VSHUFPD512load",
|
||||
auxType: auxSymValAndOff,
|
||||
argLen: 3,
|
||||
symEffect: SymRead,
|
||||
asm: x86.AVSHUFPD,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{1, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
|
||||
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||
},
|
||||
outputs: []outputInfo{
|
||||
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "VPSLLD512constload",
|
||||
auxType: auxSymValAndOff,
|
||||
@@ -82560,6 +82708,114 @@ var opcodeTable = [...]opInfo{
|
||||
argLen: 1,
|
||||
generic: true,
|
||||
},
|
||||
{
|
||||
name: "concatSelectedConstantFloat32x4",
|
||||
auxType: auxUInt8,
|
||||
argLen: 2,
|
||||
generic: true,
|
||||
},
|
||||
{
|
||||
name: "concatSelectedConstantFloat64x2",
|
||||
auxType: auxUInt8,
|
||||
argLen: 2,
|
||||
generic: true,
|
||||
},
|
||||
{
|
||||
name: "concatSelectedConstantGroupedFloat32x8",
|
||||
auxType: auxUInt8,
|
||||
argLen: 2,
|
||||
generic: true,
|
||||
},
|
||||
{
|
||||
name: "concatSelectedConstantGroupedFloat32x16",
|
||||
auxType: auxUInt8,
|
||||
argLen: 2,
|
||||
generic: true,
|
||||
},
|
||||
{
|
||||
name: "concatSelectedConstantGroupedFloat64x4",
|
||||
auxType: auxUInt8,
|
||||
argLen: 2,
|
||||
generic: true,
|
||||
},
|
||||
{
|
||||
name: "concatSelectedConstantGroupedFloat64x8",
|
||||
auxType: auxUInt8,
|
||||
argLen: 2,
|
||||
generic: true,
|
||||
},
|
||||
{
|
||||
name: "concatSelectedConstantGroupedInt32x8",
|
||||
auxType: auxUInt8,
|
||||
argLen: 2,
|
||||
generic: true,
|
||||
},
|
||||
{
|
||||
name: "concatSelectedConstantGroupedInt32x16",
|
||||
auxType: auxUInt8,
|
||||
argLen: 2,
|
||||
generic: true,
|
||||
},
|
||||
{
|
||||
name: "concatSelectedConstantGroupedInt64x4",
|
||||
auxType: auxUInt8,
|
||||
argLen: 2,
|
||||
generic: true,
|
||||
},
|
||||
{
|
||||
name: "concatSelectedConstantGroupedInt64x8",
|
||||
auxType: auxUInt8,
|
||||
argLen: 2,
|
||||
generic: true,
|
||||
},
|
||||
{
|
||||
name: "concatSelectedConstantGroupedUint32x8",
|
||||
auxType: auxUInt8,
|
||||
argLen: 2,
|
||||
generic: true,
|
||||
},
|
||||
{
|
||||
name: "concatSelectedConstantGroupedUint32x16",
|
||||
auxType: auxUInt8,
|
||||
argLen: 2,
|
||||
generic: true,
|
||||
},
|
||||
{
|
||||
name: "concatSelectedConstantGroupedUint64x4",
|
||||
auxType: auxUInt8,
|
||||
argLen: 2,
|
||||
generic: true,
|
||||
},
|
||||
{
|
||||
name: "concatSelectedConstantGroupedUint64x8",
|
||||
auxType: auxUInt8,
|
||||
argLen: 2,
|
||||
generic: true,
|
||||
},
|
||||
{
|
||||
name: "concatSelectedConstantInt32x4",
|
||||
auxType: auxUInt8,
|
||||
argLen: 2,
|
||||
generic: true,
|
||||
},
|
||||
{
|
||||
name: "concatSelectedConstantInt64x2",
|
||||
auxType: auxUInt8,
|
||||
argLen: 2,
|
||||
generic: true,
|
||||
},
|
||||
{
|
||||
name: "concatSelectedConstantUint32x4",
|
||||
auxType: auxUInt8,
|
||||
argLen: 2,
|
||||
generic: true,
|
||||
},
|
||||
{
|
||||
name: "concatSelectedConstantUint64x2",
|
||||
auxType: auxUInt8,
|
||||
argLen: 2,
|
||||
generic: true,
|
||||
},
|
||||
}
|
||||
|
||||
func (o Op) Asm() obj.As { return opcodeTable[o].asm }
|
||||
|
||||
@@ -1715,6 +1715,10 @@ func rewriteValueAMD64(v *Value) bool {
|
||||
return rewriteValueAMD64_OpAMD64VSCALEFPSMasked256(v)
|
||||
case OpAMD64VSCALEFPSMasked512:
|
||||
return rewriteValueAMD64_OpAMD64VSCALEFPSMasked512(v)
|
||||
case OpAMD64VSHUFPD512:
|
||||
return rewriteValueAMD64_OpAMD64VSHUFPD512(v)
|
||||
case OpAMD64VSHUFPS512:
|
||||
return rewriteValueAMD64_OpAMD64VSHUFPS512(v)
|
||||
case OpAMD64VSQRTPD512:
|
||||
return rewriteValueAMD64_OpAMD64VSQRTPD512(v)
|
||||
case OpAMD64VSQRTPDMasked128:
|
||||
@@ -5992,6 +5996,60 @@ func rewriteValueAMD64(v *Value) bool {
|
||||
return rewriteValueAMD64_OpblendMaskedInt64x8(v)
|
||||
case OpblendMaskedInt8x64:
|
||||
return rewriteValueAMD64_OpblendMaskedInt8x64(v)
|
||||
case OpconcatSelectedConstantFloat32x4:
|
||||
v.Op = OpAMD64VSHUFPS128
|
||||
return true
|
||||
case OpconcatSelectedConstantFloat64x2:
|
||||
v.Op = OpAMD64VSHUFPD128
|
||||
return true
|
||||
case OpconcatSelectedConstantGroupedFloat32x16:
|
||||
v.Op = OpAMD64VSHUFPS512
|
||||
return true
|
||||
case OpconcatSelectedConstantGroupedFloat32x8:
|
||||
v.Op = OpAMD64VSHUFPS256
|
||||
return true
|
||||
case OpconcatSelectedConstantGroupedFloat64x4:
|
||||
v.Op = OpAMD64VSHUFPD256
|
||||
return true
|
||||
case OpconcatSelectedConstantGroupedFloat64x8:
|
||||
v.Op = OpAMD64VSHUFPD512
|
||||
return true
|
||||
case OpconcatSelectedConstantGroupedInt32x16:
|
||||
v.Op = OpAMD64VSHUFPS512
|
||||
return true
|
||||
case OpconcatSelectedConstantGroupedInt32x8:
|
||||
v.Op = OpAMD64VSHUFPS256
|
||||
return true
|
||||
case OpconcatSelectedConstantGroupedInt64x4:
|
||||
v.Op = OpAMD64VSHUFPD256
|
||||
return true
|
||||
case OpconcatSelectedConstantGroupedInt64x8:
|
||||
v.Op = OpAMD64VSHUFPD512
|
||||
return true
|
||||
case OpconcatSelectedConstantGroupedUint32x16:
|
||||
v.Op = OpAMD64VSHUFPS512
|
||||
return true
|
||||
case OpconcatSelectedConstantGroupedUint32x8:
|
||||
v.Op = OpAMD64VSHUFPS256
|
||||
return true
|
||||
case OpconcatSelectedConstantGroupedUint64x4:
|
||||
v.Op = OpAMD64VSHUFPD256
|
||||
return true
|
||||
case OpconcatSelectedConstantGroupedUint64x8:
|
||||
v.Op = OpAMD64VSHUFPD512
|
||||
return true
|
||||
case OpconcatSelectedConstantInt32x4:
|
||||
v.Op = OpAMD64VSHUFPS128
|
||||
return true
|
||||
case OpconcatSelectedConstantInt64x2:
|
||||
v.Op = OpAMD64VSHUFPD128
|
||||
return true
|
||||
case OpconcatSelectedConstantUint32x4:
|
||||
v.Op = OpAMD64VSHUFPS128
|
||||
return true
|
||||
case OpconcatSelectedConstantUint64x2:
|
||||
v.Op = OpAMD64VSHUFPD128
|
||||
return true
|
||||
case OpmoveMaskedFloat32x16:
|
||||
return rewriteValueAMD64_OpmoveMaskedFloat32x16(v)
|
||||
case OpmoveMaskedFloat64x8:
|
||||
@@ -47442,6 +47500,62 @@ func rewriteValueAMD64_OpAMD64VSCALEFPSMasked512(v *Value) bool {
|
||||
}
|
||||
return false
|
||||
}
|
||||
// rewriteValueAMD64_OpAMD64VSHUFPD512 tries to fold a VMOVDQUload512 that
// feeds the second argument of a VSHUFPD512 into the shuffle itself, turning
// v into a VSHUFPD512load. It returns true if v was rewritten and false if
// the pattern or its condition did not match.
func rewriteValueAMD64_OpAMD64VSHUFPD512(v *Value) bool {
|
||||
v_1 := v.Args[1]
|
||||
v_0 := v.Args[0]
|
||||
// match: (VSHUFPD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem))
|
||||
// cond: canMergeLoad(v, l) && clobber(l)
|
||||
// result: (VSHUFPD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
|
||||
// The loop body runs at most once: it either returns true or breaks out
// to the final "return false".
for {
|
||||
c := auxIntToUint8(v.AuxInt)
|
||||
x := v_0
|
||||
l := v_1
|
||||
// Only a 512-bit VMOVDQU load in the second argument qualifies.
if l.Op != OpAMD64VMOVDQUload512 {
|
||||
break
|
||||
}
|
||||
off := auxIntToInt32(l.AuxInt)
|
||||
sym := auxToSym(l.Aux)
|
||||
mem := l.Args[1]
|
||||
ptr := l.Args[0]
|
||||
// clobber(l) is evaluated only after canMergeLoad(v, l) has succeeded
// (short-circuit &&).
if !(canMergeLoad(v, l) && clobber(l)) {
|
||||
break
|
||||
}
|
||||
v.reset(OpAMD64VSHUFPD512load)
|
||||
// The immediate c and the load offset are combined by makeValAndOff;
// c is converted through int8 before being widened to int32.
v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
|
||||
v.Aux = symToAux(sym)
|
||||
// Arguments of the merged op: x, then the load's pointer and memory.
v.AddArg3(x, ptr, mem)
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
// rewriteValueAMD64_OpAMD64VSHUFPS512 tries to fold a VMOVDQUload512 that
// feeds the second argument of a VSHUFPS512 into the shuffle itself, turning
// v into a VSHUFPS512load. It returns true if v was rewritten and false if
// the pattern or its condition did not match.
func rewriteValueAMD64_OpAMD64VSHUFPS512(v *Value) bool {
|
||||
v_1 := v.Args[1]
|
||||
v_0 := v.Args[0]
|
||||
// match: (VSHUFPS512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem))
|
||||
// cond: canMergeLoad(v, l) && clobber(l)
|
||||
// result: (VSHUFPS512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
|
||||
// The loop body runs at most once: it either returns true or breaks out
// to the final "return false".
for {
|
||||
c := auxIntToUint8(v.AuxInt)
|
||||
x := v_0
|
||||
l := v_1
|
||||
// Only a 512-bit VMOVDQU load in the second argument qualifies.
if l.Op != OpAMD64VMOVDQUload512 {
|
||||
break
|
||||
}
|
||||
off := auxIntToInt32(l.AuxInt)
|
||||
sym := auxToSym(l.Aux)
|
||||
mem := l.Args[1]
|
||||
ptr := l.Args[0]
|
||||
// clobber(l) is evaluated only after canMergeLoad(v, l) has succeeded
// (short-circuit &&).
if !(canMergeLoad(v, l) && clobber(l)) {
|
||||
break
|
||||
}
|
||||
v.reset(OpAMD64VSHUFPS512load)
|
||||
// The immediate c and the load offset are combined by makeValAndOff;
// c is converted through int8 before being widened to int32.
v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
|
||||
v.Aux = symToAux(sym)
|
||||
// Arguments of the merged op: x, then the load's pointer and memory.
v.AddArg3(x, ptr, mem)
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
func rewriteValueAMD64_OpAMD64VSQRTPD512(v *Value) bool {
|
||||
v_0 := v.Args[0]
|
||||
// match: (VSQRTPD512 l:(VMOVDQUload512 {sym} [off] ptr mem))
|
||||
|
||||
@@ -1255,6 +1255,24 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
|
||||
addF(simdPackage, "Int16x32.blendMasked", opLen3(ssa.OpblendMaskedInt16x32, types.TypeVec512), sys.AMD64)
|
||||
addF(simdPackage, "Int32x16.blendMasked", opLen3(ssa.OpblendMaskedInt32x16, types.TypeVec512), sys.AMD64)
|
||||
addF(simdPackage, "Int64x8.blendMasked", opLen3(ssa.OpblendMaskedInt64x8, types.TypeVec512), sys.AMD64)
|
||||
addF(simdPackage, "Float32x4.concatSelectedConstant", opLen2Imm8(ssa.OpconcatSelectedConstantFloat32x4, types.TypeVec128, 0), sys.AMD64)
|
||||
addF(simdPackage, "Float64x2.concatSelectedConstant", opLen2Imm8(ssa.OpconcatSelectedConstantFloat64x2, types.TypeVec128, 0), sys.AMD64)
|
||||
addF(simdPackage, "Int32x4.concatSelectedConstant", opLen2Imm8(ssa.OpconcatSelectedConstantInt32x4, types.TypeVec128, 0), sys.AMD64)
|
||||
addF(simdPackage, "Int64x2.concatSelectedConstant", opLen2Imm8(ssa.OpconcatSelectedConstantInt64x2, types.TypeVec128, 0), sys.AMD64)
|
||||
addF(simdPackage, "Uint32x4.concatSelectedConstant", opLen2Imm8(ssa.OpconcatSelectedConstantUint32x4, types.TypeVec128, 0), sys.AMD64)
|
||||
addF(simdPackage, "Uint64x2.concatSelectedConstant", opLen2Imm8(ssa.OpconcatSelectedConstantUint64x2, types.TypeVec128, 0), sys.AMD64)
|
||||
addF(simdPackage, "Float32x8.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedFloat32x8, types.TypeVec256, 0), sys.AMD64)
|
||||
addF(simdPackage, "Float32x16.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedFloat32x16, types.TypeVec512, 0), sys.AMD64)
|
||||
addF(simdPackage, "Float64x4.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedFloat64x4, types.TypeVec256, 0), sys.AMD64)
|
||||
addF(simdPackage, "Float64x8.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedFloat64x8, types.TypeVec512, 0), sys.AMD64)
|
||||
addF(simdPackage, "Int32x8.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedInt32x8, types.TypeVec256, 0), sys.AMD64)
|
||||
addF(simdPackage, "Int32x16.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedInt32x16, types.TypeVec512, 0), sys.AMD64)
|
||||
addF(simdPackage, "Int64x4.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedInt64x4, types.TypeVec256, 0), sys.AMD64)
|
||||
addF(simdPackage, "Int64x8.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedInt64x8, types.TypeVec512, 0), sys.AMD64)
|
||||
addF(simdPackage, "Uint32x8.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint32x8, types.TypeVec256, 0), sys.AMD64)
|
||||
addF(simdPackage, "Uint32x16.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint32x16, types.TypeVec512, 0), sys.AMD64)
|
||||
addF(simdPackage, "Uint64x4.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint64x4, types.TypeVec256, 0), sys.AMD64)
|
||||
addF(simdPackage, "Uint64x8.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint64x8, types.TypeVec512, 0), sys.AMD64)
|
||||
addF(simdPackage, "Float32x16.moveMasked", opLen2(ssa.OpmoveMaskedFloat32x16, types.TypeVec512), sys.AMD64)
|
||||
addF(simdPackage, "Float64x8.moveMasked", opLen2(ssa.OpmoveMaskedFloat64x8, types.TypeVec512), sys.AMD64)
|
||||
addF(simdPackage, "Int8x64.moveMasked", opLen2(ssa.OpmoveMaskedInt8x64, types.TypeVec512), sys.AMD64)
|
||||
|
||||
@@ -687,6 +687,9 @@ var depsRules = `
|
||||
FMT, DEBUG, flag, runtime/trace, internal/sysinfo, math/rand
|
||||
< testing;
|
||||
|
||||
testing, math
|
||||
< simd/internal/test_helpers;
|
||||
|
||||
log/slog, testing
|
||||
< testing/slogtest;
|
||||
|
||||
|
||||
@@ -120,3 +120,58 @@
|
||||
documentation: !string |-
|
||||
// NAME interleaves the elements of the low half of each 128-bit subvector of x and y.
|
||||
|
||||
- go: concatSelectedConstant
|
||||
commutative: false
|
||||
out:
|
||||
- elemBits: 32
|
||||
documentation: !string |-
|
||||
// NAME concatenates selected elements from x and y into the lower and upper
|
||||
// halves of the output. The selection is chosen by the constant parameter h1h0l1l0
|
||||
// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
|
||||
// For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns
|
||||
// {2, 0, 5, 7} (don't forget that the binary constant is written big-endian).
|
||||
|
||||
- go: concatSelectedConstant
|
||||
commutative: false
|
||||
out:
|
||||
- elemBits: 64
|
||||
documentation: !string |-
|
||||
// NAME concatenates selected elements from x and y into the lower and upper
|
||||
// halves of the output. The selection is chosen by the constant parameter hilo
|
||||
// where hi and lo are each one bit specifying which 64-bit element to select
|
||||
// from y and x. For example {4,5}.concatSelectedConstant(0b10, {6,7})
|
||||
// returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1,
|
||||
// selecting from y, is 1, and selects 7.
|
||||
|
||||
- go: concatSelectedConstantGrouped
|
||||
commutative: false
|
||||
out:
|
||||
- elemBits: 32
|
||||
documentation: !string |-
|
||||
// NAME concatenates selected elements from 128-bit subvectors of x and y
|
||||
// into the lower and upper halves of corresponding subvectors of the output.
|
||||
// The selection is chosen by the constant parameter h1h0l1l0
|
||||
// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
|
||||
// For example,
|
||||
// {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15})
|
||||
// returns {2,0,5,7,10,8,13,15}
|
||||
// (don't forget that the binary constant is written big-endian).
|
||||
|
||||
- go: concatSelectedConstantGrouped
|
||||
commutative: false
|
||||
out:
|
||||
- elemBits: 64
|
||||
documentation: !string |-
|
||||
// NAME concatenates selected elements from 128-bit subvectors of x and y
|
||||
// into the lower and upper halves of corresponding subvectors of the output.
|
||||
// The selections are specified by the constant parameter hilos where each
|
||||
// hi and lo pair select 64-bit elements from the corresponding 128-bit
|
||||
// subvectors of x and y.
|
||||
//
|
||||
// For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11})
|
||||
// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least
|
||||
// 128-bits (4), then 1, selecting element 1 from y's least 128-bits (7),
|
||||
// then 1, selecting element 1 from x's upper 128 bits (9), then 1,
|
||||
// selecting element 1 from y's upper 128 bits (11).
|
||||
// This differs from the same method applied to a 32x8 vector, where
|
||||
// the 8-bit constant performs the same selection on both subvectors.
|
||||
@@ -564,3 +564,210 @@
|
||||
out:
|
||||
- *256Or512any
|
||||
|
||||
# These are all described separately to carry the name of the constant parameter
|
||||
|
||||
- go: concatSelectedConstant
|
||||
asm: VSHUFPS
|
||||
width: 32
|
||||
in:
|
||||
- &v
|
||||
go: $t
|
||||
class: vreg
|
||||
base: float
|
||||
bits: 128
|
||||
- *v
|
||||
- class: immediate
|
||||
immOffset: 0
|
||||
name: h1h0l1l0
|
||||
inVariant: []
|
||||
out:
|
||||
- *v
|
||||
|
||||
- go: concatSelectedConstant
|
||||
asm: VSHUFPS
|
||||
in:
|
||||
- &v
|
||||
go: $t
|
||||
class: vreg
|
||||
base: float
|
||||
bits: 128
|
||||
OverwriteBase: int
|
||||
- *v
|
||||
- class: immediate
|
||||
immOffset: 0
|
||||
name: h1h0l1l0
|
||||
inVariant: []
|
||||
out:
|
||||
- *v
|
||||
|
||||
- go: concatSelectedConstant
|
||||
asm: VSHUFPS
|
||||
in:
|
||||
- &v
|
||||
go: $t
|
||||
class: vreg
|
||||
base: float
|
||||
bits: 128
|
||||
OverwriteBase: uint
|
||||
- *v
|
||||
- class: immediate
|
||||
immOffset: 0
|
||||
name: h1h0l1l0
|
||||
inVariant: []
|
||||
out:
|
||||
- *v
|
||||
|
||||
|
||||
- go: concatSelectedConstantGrouped
|
||||
asm: VSHUFPS
|
||||
in:
|
||||
- &v
|
||||
go: $t
|
||||
class: vreg
|
||||
base: float
|
||||
bits: "256|512"
|
||||
- *v
|
||||
- class: immediate
|
||||
immOffset: 0
|
||||
name: h1h0l1l0
|
||||
inVariant: []
|
||||
out:
|
||||
- *v
|
||||
|
||||
- go: concatSelectedConstantGrouped
|
||||
asm: VSHUFPS
|
||||
in:
|
||||
- &v
|
||||
go: $t
|
||||
class: vreg
|
||||
base: float
|
||||
bits: "256|512"
|
||||
OverwriteBase: int
|
||||
- *v
|
||||
- class: immediate
|
||||
immOffset: 0
|
||||
name: h1h0l1l0
|
||||
inVariant: []
|
||||
out:
|
||||
- *v
|
||||
|
||||
- go: concatSelectedConstantGrouped
|
||||
asm: VSHUFPS
|
||||
in:
|
||||
- &v
|
||||
go: $t
|
||||
class: vreg
|
||||
base: float
|
||||
bits: "256|512"
|
||||
OverwriteBase: uint
|
||||
- *v
|
||||
- class: immediate
|
||||
immOffset: 0
|
||||
name: h1h0l1l0
|
||||
inVariant: []
|
||||
out:
|
||||
- *v
|
||||
|
||||
|
||||
# 64 bit versions
|
||||
|
||||
- go: concatSelectedConstant
|
||||
asm: VSHUFPD
|
||||
in:
|
||||
- &v
|
||||
go: $t
|
||||
class: vreg
|
||||
base: float
|
||||
bits: 128
|
||||
- *v
|
||||
- class: immediate
|
||||
immOffset: 0
|
||||
name: hilo
|
||||
inVariant: []
|
||||
out:
|
||||
- *v
|
||||
|
||||
- go: concatSelectedConstant
|
||||
asm: VSHUFPD
|
||||
in:
|
||||
- &v
|
||||
go: $t
|
||||
class: vreg
|
||||
base: float
|
||||
bits: 128
|
||||
OverwriteBase: int
|
||||
- *v
|
||||
- class: immediate
|
||||
immOffset: 0
|
||||
name: hilo
|
||||
inVariant: []
|
||||
out:
|
||||
- *v
|
||||
|
||||
- go: concatSelectedConstant
|
||||
asm: VSHUFPD
|
||||
in:
|
||||
- &v
|
||||
go: $t
|
||||
class: vreg
|
||||
base: float
|
||||
bits: 128
|
||||
OverwriteBase: uint
|
||||
- *v
|
||||
- class: immediate
|
||||
immOffset: 0
|
||||
name: hilo
|
||||
inVariant: []
|
||||
out:
|
||||
- *v
|
||||
|
||||
|
||||
- go: concatSelectedConstantGrouped
|
||||
asm: VSHUFPD
|
||||
in:
|
||||
- &v
|
||||
go: $t
|
||||
class: vreg
|
||||
base: float
|
||||
bits: "256|512"
|
||||
- *v
|
||||
- class: immediate
|
||||
immOffset: 0
|
||||
name: hilos
|
||||
inVariant: []
|
||||
out:
|
||||
- *v
|
||||
|
||||
- go: concatSelectedConstantGrouped
|
||||
asm: VSHUFPD
|
||||
in:
|
||||
- &v
|
||||
go: $t
|
||||
class: vreg
|
||||
base: float
|
||||
bits: "256|512"
|
||||
OverwriteBase: int
|
||||
- *v
|
||||
- class: immediate
|
||||
immOffset: 0
|
||||
name: hilos
|
||||
inVariant: []
|
||||
out:
|
||||
- *v
|
||||
|
||||
- go: concatSelectedConstantGrouped
|
||||
asm: VSHUFPD
|
||||
in:
|
||||
- &v
|
||||
go: $t
|
||||
class: vreg
|
||||
base: float
|
||||
bits: "256|512"
|
||||
OverwriteBase: uint
|
||||
- *v
|
||||
- class: immediate
|
||||
immOffset: 0
|
||||
name: hilos
|
||||
inVariant: []
|
||||
out:
|
||||
- *v
|
||||
|
||||
@@ -8,6 +8,7 @@ package simd_test
|
||||
|
||||
import (
|
||||
"math"
|
||||
"simd/internal/test_helpers"
|
||||
"testing"
|
||||
)
|
||||
|
||||
@@ -29,97 +30,12 @@ type number interface {
|
||||
|
||||
func checkSlices[T number](t *testing.T, got, want []T) bool {
|
||||
t.Helper()
|
||||
return checkSlicesLogInput[T](t, got, want, 0.0, nil)
|
||||
return test_helpers.CheckSlicesLogInput[T](t, got, want, 0.0, nil)
|
||||
}
|
||||
|
||||
// checkSlicesLogInput compares two slices for equality,
|
||||
// reporting a test error if there is a problem,
|
||||
// and also consumes the two slices so that a
|
||||
// test/benchmark won't be dead-code eliminated.
|
||||
func checkSlicesLogInput[T number](t *testing.T, got, want []T, flakiness float64, logInput func()) bool {
|
||||
t.Helper()
|
||||
var z T
|
||||
for i := range want {
|
||||
if got[i] != want[i] {
|
||||
var ia any = got[i]
|
||||
var ib any = want[i]
|
||||
switch x := ia.(type) {
|
||||
case float32:
|
||||
y := ib.(float32)
|
||||
if math.IsNaN(float64(x)) && math.IsNaN(float64(y)) {
|
||||
continue
|
||||
}
|
||||
if flakiness > 0 {
|
||||
if y == 0 {
|
||||
if math.Abs(float64(x)) < flakiness {
|
||||
continue
|
||||
}
|
||||
} else {
|
||||
if math.Abs(float64((x-y)/y)) < flakiness {
|
||||
continue
|
||||
}
|
||||
}
|
||||
}
|
||||
case float64:
|
||||
y := ib.(float64)
|
||||
if math.IsNaN(x) && math.IsNaN(y) {
|
||||
continue
|
||||
}
|
||||
if flakiness > 0 {
|
||||
if y == 0 {
|
||||
if math.Abs(x) < flakiness {
|
||||
continue
|
||||
}
|
||||
} else if math.Abs((x-y)/y) < flakiness {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
default:
|
||||
}
|
||||
|
||||
t.Logf("For %T vector elements:", z)
|
||||
t.Logf("got =%v", got)
|
||||
t.Logf("want=%v", want)
|
||||
if logInput != nil {
|
||||
logInput()
|
||||
}
|
||||
t.Errorf("at index %d, got=%v, want=%v", i, got[i], want[i])
|
||||
return false
|
||||
} else if got[i] == 0 { // for floating point, 0.0 == -0.0 but a bitwise check can see the difference
|
||||
var ia any = got[i]
|
||||
var ib any = want[i]
|
||||
switch x := ia.(type) {
|
||||
case float32:
|
||||
y := ib.(float32)
|
||||
if math.Float32bits(x) != math.Float32bits(y) {
|
||||
t.Logf("For %T vector elements:", z)
|
||||
t.Logf("got =%v", got)
|
||||
t.Logf("want=%v", want)
|
||||
if logInput != nil {
|
||||
logInput()
|
||||
}
|
||||
t.Errorf("at index %d, different signs of zero", i)
|
||||
return false
|
||||
}
|
||||
case float64:
|
||||
y := ib.(float64)
|
||||
if math.Float64bits(x) != math.Float64bits(y) {
|
||||
t.Logf("For %T vector elements:", z)
|
||||
t.Logf("got =%v", got)
|
||||
t.Logf("want=%v", want)
|
||||
if logInput != nil {
|
||||
logInput()
|
||||
}
|
||||
t.Errorf("at index %d, different signs of zero", i)
|
||||
return false
|
||||
}
|
||||
default:
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
return true
|
||||
return test_helpers.CheckSlicesLogInput[T](t, got, want, flakiness, logInput)
|
||||
}
|
||||
|
||||
// sliceOf returns a slice n T's, with each
|
||||
|
||||
123
src/simd/internal/test_helpers/checkslices.go
Normal file
123
src/simd/internal/test_helpers/checkslices.go
Normal file
@@ -0,0 +1,123 @@
|
||||
// Copyright 2025 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:build goexperiment.simd && amd64
|
||||
|
||||
package test_helpers
|
||||
|
||||
import (
|
||||
"math"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// signed is the constraint satisfied by every signed integer type.
type signed interface {
	~int | ~int8 | ~int16 | ~int32 | ~int64
}

// integer is the constraint satisfied by every signed or unsigned integer type.
type integer interface {
	~int | ~int8 | ~int16 | ~int32 | ~int64 | ~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 | ~uintptr
}

// float is the constraint satisfied by every floating-point type.
type float interface {
	~float32 | ~float64
}

// number is the constraint satisfied by every integer or floating-point type.
type number interface {
	~int | ~int8 | ~int16 | ~int32 | ~int64 | ~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 | ~uintptr | ~float32 | ~float64
}

// CheckSlices compares got against want with no tolerance and no input
// logging. It is shorthand for CheckSlicesLogInput(t, got, want, 0.0, nil)
// and returns that call's result.
func CheckSlices[T number](t *testing.T, got, want []T) bool {
	t.Helper()
	return CheckSlicesLogInput[T](t, got, want, 0.0, nil)
}

// CheckSlicesLogInput compares two slices for equality,
// reporting a test error if there is a problem,
// and also consumes the two slices so that a
// test/benchmark won't be dead-code eliminated.
//
// Only the first len(want) elements of got are examined. If got is shorter
// than want, that is reported as a test error rather than causing an
// index-out-of-range panic.
//
// For float32 and float64 elements:
//   - two NaNs are considered equal;
//   - if flakiness > 0, a mismatch is tolerated when the relative error
//     |(got-want)/want| (or |got| when want is zero) is below flakiness;
//   - equal zeros must also have the same sign bit.
//
// Elements whose dynamic type is a named type (rather than exactly float32 or
// float64) are compared with plain == only.
//
// On the first failure both slices are logged, logInput (if non-nil) is
// called so the caller can log the inputs, the error is recorded with
// t.Errorf, and false is returned. Otherwise the result is true.
func CheckSlicesLogInput[T number](t *testing.T, got, want []T, flakiness float64, logInput func()) bool {
	t.Helper()

	// fail emits the common diagnostics, records the error, and returns false
	// so that call sites can simply "return fail(...)".
	fail := func(format string, args ...any) bool {
		t.Helper()
		var z T
		t.Logf("For %T vector elements:", z)
		t.Logf("got =%v", got)
		t.Logf("want=%v", want)
		if logInput != nil {
			logInput()
		}
		t.Errorf(format, args...)
		return false
	}

	if len(got) < len(want) {
		return fail("got has %d elements, want at least %d", len(got), len(want))
	}

	for i := range want {
		g, w := got[i], want[i]
		if g != w {
			if tolerated(g, w, flakiness) {
				continue
			}
			return fail("at index %d, got=%v, want=%v", i, g, w)
		}
		// For floating point, 0.0 == -0.0 but a bitwise check can see the difference.
		if g == 0 && !sameZeroBits(g, w) {
			return fail("at index %d, different signs of zero", i)
		}
	}
	return true
}

// tolerated reports whether the unequal elements g and w should nonetheless
// be accepted as matching. Only float32 and float64 elements can be tolerated.
func tolerated[T number](g, w T, flakiness float64) bool {
	switch x := any(g).(type) {
	case float32:
		return closeEnough(x, any(w).(float32), flakiness)
	case float64:
		return closeEnough(x, any(w).(float64), flakiness)
	}
	return false
}

// closeEnough reports whether x and y are both NaN or, when flakiness > 0,
// within the given tolerance (relative to y, or absolute when y is zero).
// The difference is computed in the element type F before widening.
func closeEnough[F float](x, y F, flakiness float64) bool {
	if math.IsNaN(float64(x)) && math.IsNaN(float64(y)) {
		return true
	}
	if flakiness > 0 {
		if y == 0 {
			return math.Abs(float64(x)) < flakiness
		}
		return math.Abs(float64((x-y)/y)) < flakiness
	}
	return false
}

// sameZeroBits reports whether g and w, which already compare equal, also
// have identical bit patterns when they are float32 or float64 values.
// Other element types always report true.
func sameZeroBits[T number](g, w T) bool {
	switch x := any(g).(type) {
	case float32:
		return math.Float32bits(x) == math.Float32bits(any(w).(float32))
	case float64:
		return math.Float64bits(x) == math.Float64bits(any(w).(float64))
	}
	return true
}
|
||||
@@ -7369,6 +7369,277 @@ func (x Int32x16) blendMasked(y Int32x16, mask Mask32x16) Int32x16
|
||||
// Asm: VPBLENDMQ, CPU Feature: AVX512
|
||||
func (x Int64x8) blendMasked(y Int64x8, mask Mask64x8) Int64x8
|
||||
|
||||
/* concatSelectedConstant */
|
||||
|
||||
// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
|
||||
// halves of the output. The selection is chosen by the constant parameter h1h0l1l0
|
||||
// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
|
||||
// For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns
|
||||
// {2, 0, 5, 7} (don't forget that the binary constant is written big-endian).
|
||||
//
|
||||
// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||
//
|
||||
// Asm: VSHUFPS, CPU Feature: AVX
|
||||
func (x Float32x4) concatSelectedConstant(h1h0l1l0 uint8, y Float32x4) Float32x4
|
||||
|
||||
// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
|
||||
// halves of the output. The selection is chosen by the constant parameter hilo
|
||||
// where hi and lo are each one bit specifying which 64-bit element to select
|
||||
// from y and x. For example {4,5}.concatSelectedConstant(0b10, {6,7})
|
||||
// returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1,
|
||||
// selecting from y, is 1, and selects 7.
|
||||
//
|
||||
// hilo results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||
//
|
||||
// Asm: VSHUFPD, CPU Feature: AVX
|
||||
func (x Float64x2) concatSelectedConstant(hilo uint8, y Float64x2) Float64x2
|
||||
|
||||
// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
|
||||
// halves of the output. The selection is chosen by the constant parameter h1h0l1l0
|
||||
// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
|
||||
// For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns
|
||||
// {2, 0, 5, 7} (don't forget that the binary constant is written big-endian).
|
||||
//
|
||||
// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||
//
|
||||
// Asm: VSHUFPS, CPU Feature: AVX
|
||||
func (x Int32x4) concatSelectedConstant(h1h0l1l0 uint8, y Int32x4) Int32x4
|
||||
|
||||
// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
|
||||
// halves of the output. The selection is chosen by the constant parameter hilo
|
||||
// where hi and lo are each one bit specifying which 64-bit element to select
|
||||
// from y and x. For example {4,5}.concatSelectedConstant(0b10, {6,7})
|
||||
// returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1,
|
||||
// selecting from y, is 1, and selects 7.
|
||||
//
|
||||
// hilo results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||
//
|
||||
// Asm: VSHUFPD, CPU Feature: AVX
|
||||
func (x Int64x2) concatSelectedConstant(hilo uint8, y Int64x2) Int64x2
|
||||
|
||||
// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
|
||||
// halves of the output. The selection is chosen by the constant parameter h1h0l1l0
|
||||
// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
|
||||
// For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns
|
||||
// {2, 0, 5, 7} (don't forget that the binary constant is written big-endian).
|
||||
//
|
||||
// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||
//
|
||||
// Asm: VSHUFPS, CPU Feature: AVX
|
||||
func (x Uint32x4) concatSelectedConstant(h1h0l1l0 uint8, y Uint32x4) Uint32x4
|
||||
|
||||
// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
|
||||
// halves of the output. The selection is chosen by the constant parameter hilo
|
||||
// where hi and lo are each one bit specifying which 64-bit element to select
|
||||
// from y and x. For example {4,5}.concatSelectedConstant(0b10, {6,7})
|
||||
// returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1,
|
||||
// selecting from y, is 1, and selects 7.
|
||||
//
|
||||
// hilo results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||
//
|
||||
// Asm: VSHUFPD, CPU Feature: AVX
|
||||
func (x Uint64x2) concatSelectedConstant(hilo uint8, y Uint64x2) Uint64x2
|
||||
|
||||
/* concatSelectedConstantGrouped */
|
||||
|
||||
// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
|
||||
// into the lower and upper halves of corresponding subvectors of the output.
|
||||
// The selection is chosen by the constant parameter h1h0l1l0
|
||||
// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
|
||||
// For example,
|
||||
// {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15})
|
||||
// returns {2,0,5,7,10,8,13,15}
|
||||
// (don't forget that the binary constant is written big-endian).
|
||||
//
|
||||
// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||
//
|
||||
// Asm: VSHUFPS, CPU Feature: AVX
|
||||
func (x Float32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Float32x8) Float32x8
|
||||
|
||||
// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
|
||||
// into the lower and upper halves of corresponding subvectors of the output.
|
||||
// The selection is chosen by the constant parameter h1h0l1l0
|
||||
// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
|
||||
// For example,
|
||||
// {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15})
|
||||
// returns {2,0,5,7,10,8,13,15}
|
||||
// (don't forget that the binary constant is written big-endian).
|
||||
//
|
||||
// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||
//
|
||||
// Asm: VSHUFPS, CPU Feature: AVX512
|
||||
func (x Float32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Float32x16) Float32x16
|
||||
|
||||
// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
|
||||
// into the lower and upper halves of corresponding subvectors of the output.
|
||||
// The selections are specified by the constant parameter hilos where each
|
||||
// hi and lo pair select 64-bit elements from the corresponding 128-bit
|
||||
// subvectors of x and y.
|
||||
//
|
||||
// For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11})
|
||||
// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least
|
||||
// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
|
||||
// then 1, selecting element 1 from x's upper 128 bits (9), then 1,
|
||||
// selecting element 1 from y's upper 128 bits (11).
|
||||
// This differs from the same method applied to a 32x8 vector, where
|
||||
// the 8-bit constant performs the same selection on both subvectors.
|
||||
//
|
||||
// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||
//
|
||||
// Asm: VSHUFPD, CPU Feature: AVX
|
||||
func (x Float64x4) concatSelectedConstantGrouped(hilos uint8, y Float64x4) Float64x4
|
||||
|
||||
// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
|
||||
// into the lower and upper halves of corresponding subvectors of the output.
|
||||
// The selections are specified by the constant parameter hilos where each
|
||||
// hi and lo pair select 64-bit elements from the corresponding 128-bit
|
||||
// subvectors of x and y.
|
||||
//
|
||||
// For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11})
|
||||
// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least
|
||||
// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
|
||||
// then 1, selecting element 1 from x's upper 128 bits (9), then 1,
|
||||
// selecting element 1 from y's upper 128 bits (11).
|
||||
// This differs from the same method applied to a 32x8 vector, where
|
||||
// the 8-bit constant performs the same selection on both subvectors.
|
||||
//
|
||||
// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||
//
|
||||
// Asm: VSHUFPD, CPU Feature: AVX512
|
||||
func (x Float64x8) concatSelectedConstantGrouped(hilos uint8, y Float64x8) Float64x8
|
||||
|
||||
// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
|
||||
// into the lower and upper halves of corresponding subvectors of the output.
|
||||
// The selection is chosen by the constant parameter h1h0l1l0
|
||||
// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
|
||||
// For example,
|
||||
// {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15})
|
||||
// returns {2,0,5,7,10,8,13,15}
|
||||
// (don't forget that the binary constant is written big-endian).
|
||||
//
|
||||
// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||
//
|
||||
// Asm: VSHUFPS, CPU Feature: AVX
|
||||
func (x Int32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Int32x8) Int32x8
|
||||
|
||||
// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
|
||||
// into the lower and upper halves of corresponding subvectors of the output.
|
||||
// The selection is chosen by the constant parameter h1h0l1l0
|
||||
// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
|
||||
// For example,
|
||||
// {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15})
|
||||
// returns {2,0,5,7,10,8,13,15}
|
||||
// (don't forget that the binary constant is written big-endian).
|
||||
//
|
||||
// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||
//
|
||||
// Asm: VSHUFPS, CPU Feature: AVX512
|
||||
func (x Int32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Int32x16) Int32x16
|
||||
|
||||
// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
|
||||
// into the lower and upper halves of corresponding subvectors of the output.
|
||||
// The selections are specified by the constant parameter hilos where each
|
||||
// hi and lo pair select 64-bit elements from the corresponding 128-bit
|
||||
// subvectors of x and y.
|
||||
//
|
||||
// For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11})
|
||||
// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least
|
||||
// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
|
||||
// then 1, selecting element 1 from x's upper 128 bits (9), then 1,
|
||||
// selecting element 1 from y's upper 128 bits (11).
|
||||
// This differs from the same method applied to a 32x8 vector, where
|
||||
// the 8-bit constant performs the same selection on both subvectors.
|
||||
//
|
||||
// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||
//
|
||||
// Asm: VSHUFPD, CPU Feature: AVX
|
||||
func (x Int64x4) concatSelectedConstantGrouped(hilos uint8, y Int64x4) Int64x4
|
||||
|
||||
// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
|
||||
// into the lower and upper halves of corresponding subvectors of the output.
|
||||
// The selections are specified by the constant parameter hilos where each
|
||||
// hi and lo pair select 64-bit elements from the corresponding 128-bit
|
||||
// subvectors of x and y.
|
||||
//
|
||||
// For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11})
|
||||
// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least
|
||||
// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
|
||||
// then 1, selecting element 1 from x's upper 128 bits (9), then 1,
|
||||
// selecting element 1 from y's upper 128 bits (11).
|
||||
// This differs from the same method applied to a 32x8 vector, where
|
||||
// the 8-bit constant performs the same selection on both subvectors.
|
||||
//
|
||||
// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||
//
|
||||
// Asm: VSHUFPD, CPU Feature: AVX512
|
||||
func (x Int64x8) concatSelectedConstantGrouped(hilos uint8, y Int64x8) Int64x8
|
||||
|
||||
// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
|
||||
// into the lower and upper halves of corresponding subvectors of the output.
|
||||
// The selection is chosen by the constant parameter h1h0l1l0
|
||||
// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
|
||||
// For example,
|
||||
// {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15})
|
||||
// returns {2,0,5,7,10,8,13,15}
|
||||
// (don't forget that the binary constant is written big-endian).
|
||||
//
|
||||
// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||
//
|
||||
// Asm: VSHUFPS, CPU Feature: AVX
|
||||
func (x Uint32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Uint32x8) Uint32x8
|
||||
|
||||
// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
|
||||
// into the lower and upper halves of corresponding subvectors of the output.
|
||||
// The selection is chosen by the constant parameter h1h0l1l0
|
||||
// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
|
||||
// For example,
|
||||
// {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15})
|
||||
// returns {2,0,5,7,10,8,13,15}
|
||||
// (don't forget that the binary constant is written big-endian).
|
||||
//
|
||||
// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||
//
|
||||
// Asm: VSHUFPS, CPU Feature: AVX512
|
||||
func (x Uint32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Uint32x16) Uint32x16
|
||||
|
||||
// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
|
||||
// into the lower and upper halves of corresponding subvectors of the output.
|
||||
// The selections are specified by the constant parameter hilos where each
|
||||
// hi and lo pair select 64-bit elements from the corresponding 128-bit
|
||||
// subvectors of x and y.
|
||||
//
|
||||
// For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11})
|
||||
// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least
|
||||
// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
|
||||
// then 1, selecting element 1 from x's upper 128 bits (9), then 1,
|
||||
// selecting element 1 from y's upper 128 bits (11).
|
||||
// This differs from the same method applied to a 32x8 vector, where
|
||||
// the 8-bit constant performs the same selection on both subvectors.
|
||||
//
|
||||
// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||
//
|
||||
// Asm: VSHUFPD, CPU Feature: AVX
|
||||
func (x Uint64x4) concatSelectedConstantGrouped(hilos uint8, y Uint64x4) Uint64x4
|
||||
|
||||
// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
|
||||
// into the lower and upper halves of corresponding subvectors of the output.
|
||||
// The selections are specified by the constant parameter hilos where each
|
||||
// hi and lo pair select 64-bit elements from the corresponding 128-bit
|
||||
// subvectors of x and y.
|
||||
//
|
||||
// For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11})
|
||||
// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least
|
||||
// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
|
||||
// then 1, selecting element 1 from x's upper 128 bits (9), then 1,
|
||||
// selecting element 1 from y's upper 128 bits (11).
|
||||
// This differs from the same method applied to a 32x8 vector, where
|
||||
// the 8-bit constant performs the same selection on both subvectors.
|
||||
//
|
||||
// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||
//
|
||||
// Asm: VSHUFPD, CPU Feature: AVX512
|
||||
func (x Uint64x8) concatSelectedConstantGrouped(hilos uint8, y Uint64x8) Uint64x8
|
||||
|
||||
/* moveMasked */
|
||||
|
||||
// moveMasked blends a vector with zero, with the original value where the mask is true
|
||||
|
||||
48
src/simd/pkginternal_test.go
Normal file
48
src/simd/pkginternal_test.go
Normal file
@@ -0,0 +1,48 @@
|
||||
// Copyright 2025 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:build goexperiment.simd && amd64
|
||||
|
||||
package simd
|
||||
|
||||
import (
|
||||
"simd/internal/test_helpers"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestConcatSelectedConstant64(t *testing.T) {
|
||||
a := make([]int64, 2)
|
||||
x := LoadInt64x2Slice([]int64{4, 5})
|
||||
y := LoadInt64x2Slice([]int64{6, 7})
|
||||
z := x.concatSelectedConstant(0b10, y)
|
||||
z.StoreSlice(a)
|
||||
test_helpers.CheckSlices[int64](t, a, []int64{4, 7})
|
||||
}
|
||||
|
||||
func TestConcatSelectedConstantGrouped64(t *testing.T) {
|
||||
a := make([]float64, 4)
|
||||
x := LoadFloat64x4Slice([]float64{4, 5, 8, 9})
|
||||
y := LoadFloat64x4Slice([]float64{6, 7, 10, 11})
|
||||
z := x.concatSelectedConstantGrouped(0b_11_10, y)
|
||||
z.StoreSlice(a)
|
||||
test_helpers.CheckSlices[float64](t, a, []float64{4, 7, 9, 11})
|
||||
}
|
||||
|
||||
func TestConcatSelectedConstant32(t *testing.T) {
|
||||
a := make([]float32, 4)
|
||||
x := LoadFloat32x4Slice([]float32{4, 5, 8, 9})
|
||||
y := LoadFloat32x4Slice([]float32{6, 7, 10, 11})
|
||||
z := x.concatSelectedConstant(0b_11_01_10_00, y)
|
||||
z.StoreSlice(a)
|
||||
test_helpers.CheckSlices[float32](t, a, []float32{4, 8, 7, 11})
|
||||
}
|
||||
|
||||
func TestConcatSelectedConstantGrouped32(t *testing.T) {
|
||||
a := make([]uint32, 8)
|
||||
x := LoadUint32x8Slice([]uint32{0, 1, 2, 3, 8, 9, 10, 11})
|
||||
y := LoadUint32x8Slice([]uint32{4, 5, 6, 7, 12, 13, 14, 15})
|
||||
z := x.concatSelectedConstantGrouped(0b_11_01_00_10, y)
|
||||
z.StoreSlice(a)
|
||||
test_helpers.CheckSlices[uint32](t, a, []uint32{2, 0, 5, 7, 10, 8, 13, 15})
|
||||
}
|
||||
Reference in New Issue
Block a user