[dev.simd] simd: add carryless multiply

now with comments, and also a test.

choice of data types, method names, etc, are all up for comment.
It's NOT commutative, because of the immediate operand (unless we
swap the bits of the immediate).

Change-Id: I730a6938c6803d0b93544445db65eadc51783e42
Reviewed-on: https://go-review.googlesource.com/c/go/+/726963
Reviewed-by: Junyang Shao <shaojunyang@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
David Chase
2025-12-04 17:51:04 -05:00
parent f51ee08905
commit 3417b48b17
15 changed files with 302 additions and 7 deletions

View File

@@ -1232,6 +1232,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPSHRDQ128,
ssa.OpAMD64VPSHRDQ256,
ssa.OpAMD64VPSHRDQ512,
ssa.OpAMD64VPCLMULQDQ128,
ssa.OpAMD64VPCLMULQDQ256,
ssa.OpAMD64VPCLMULQDQ512,
ssa.OpAMD64VSHUFPS128,
ssa.OpAMD64VSHUFPD128,
ssa.OpAMD64VSHUFPS256,

View File

@@ -1333,6 +1333,9 @@
(blendMaskedInt16x32 x y mask) => (VPBLENDMWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
(blendMaskedInt32x16 x y mask) => (VPBLENDMDMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
(blendMaskedInt64x8 x y mask) => (VPBLENDMQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
(carrylessMultiplyUint64x2 ...) => (VPCLMULQDQ128 ...)
(carrylessMultiplyUint64x4 ...) => (VPCLMULQDQ256 ...)
(carrylessMultiplyUint64x8 ...) => (VPCLMULQDQ512 ...)
(concatSelectedConstantFloat32x4 ...) => (VSHUFPS128 ...)
(concatSelectedConstantFloat64x2 ...) => (VSHUFPD128 ...)
(concatSelectedConstantInt32x4 ...) => (VSHUFPS128 ...)

View File

@@ -1269,6 +1269,9 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPALIGNRMasked128", argLength: 3, reg: w2kw, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPALIGNRMasked256", argLength: 3, reg: w2kw, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPALIGNRMasked512", argLength: 3, reg: w2kw, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPCLMULQDQ128", argLength: 2, reg: v21, asm: "VPCLMULQDQ", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPCLMULQDQ256", argLength: 2, reg: w21, asm: "VPCLMULQDQ", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPCLMULQDQ512", argLength: 2, reg: w21, asm: "VPCLMULQDQ", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPCMPB512", argLength: 2, reg: w2k, asm: "VPCMPB", aux: "UInt8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPBMasked128", argLength: 3, reg: w2kk, asm: "VPCMPB", aux: "UInt8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPBMasked256", argLength: 3, reg: w2kk, asm: "VPCMPB", aux: "UInt8", commutative: false, typ: "Mask", resultInArg0: false},

View File

@@ -1301,6 +1301,9 @@ func simdGenericOps() []opData {
{name: "TruncScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "UInt8"},
{name: "TruncScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "TruncScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "carrylessMultiplyUint64x2", argLength: 2, commutative: false, aux: "UInt8"},
{name: "carrylessMultiplyUint64x4", argLength: 2, commutative: false, aux: "UInt8"},
{name: "carrylessMultiplyUint64x8", argLength: 2, commutative: false, aux: "UInt8"},
{name: "concatSelectedConstantFloat32x4", argLength: 2, commutative: false, aux: "UInt8"},
{name: "concatSelectedConstantFloat64x2", argLength: 2, commutative: false, aux: "UInt8"},
{name: "concatSelectedConstantGroupedFloat32x8", argLength: 2, commutative: false, aux: "UInt8"},

View File

@@ -2510,6 +2510,9 @@ const (
OpAMD64VPALIGNRMasked128
OpAMD64VPALIGNRMasked256
OpAMD64VPALIGNRMasked512
OpAMD64VPCLMULQDQ128
OpAMD64VPCLMULQDQ256
OpAMD64VPCLMULQDQ512
OpAMD64VPCMPB512
OpAMD64VPCMPBMasked128
OpAMD64VPCMPBMasked256
@@ -7448,6 +7451,9 @@ const (
OpTruncScaledResidueFloat64x2
OpTruncScaledResidueFloat64x4
OpTruncScaledResidueFloat64x8
OpcarrylessMultiplyUint64x2
OpcarrylessMultiplyUint64x4
OpcarrylessMultiplyUint64x8
OpconcatSelectedConstantFloat32x4
OpconcatSelectedConstantFloat64x2
OpconcatSelectedConstantGroupedFloat32x8
@@ -39211,6 +39217,51 @@ var opcodeTable = [...]opInfo{
},
},
},
{
	name:    "VPCLMULQDQ128",
	auxType: auxUInt8,
	argLen:  2,
	asm:     x86.AVPCLMULQDQ,
	reg: regInfo{
		// Both sources accept X0-X15, matching the symmetric input masks of
		// the 256- and 512-bit forms; only the destination excludes X15.
		inputs: []inputInfo{
			{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
			{1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
		},
		outputs: []outputInfo{
			{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
		},
	},
},
{
name: "VPCLMULQDQ256",
auxType: auxUInt8,
argLen: 2,
asm: x86.AVPCLMULQDQ,
reg: regInfo{
inputs: []inputInfo{
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPCLMULQDQ512",
auxType: auxUInt8,
argLen: 2,
asm: x86.AVPCLMULQDQ,
reg: regInfo{
inputs: []inputInfo{
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPCMPB512",
auxType: auxUInt8,
@@ -95848,6 +95899,24 @@ var opcodeTable = [...]opInfo{
argLen: 1,
generic: true,
},
{
name: "carrylessMultiplyUint64x2",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
name: "carrylessMultiplyUint64x4",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
name: "carrylessMultiplyUint64x8",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
{
name: "concatSelectedConstantFloat32x4",
auxType: auxUInt8,

View File

@@ -6307,6 +6307,15 @@ func rewriteValueAMD64(v *Value) bool {
return rewriteValueAMD64_OpblendMaskedInt64x8(v)
case OpblendMaskedInt8x64:
return rewriteValueAMD64_OpblendMaskedInt8x64(v)
case OpcarrylessMultiplyUint64x2:
v.Op = OpAMD64VPCLMULQDQ128
return true
case OpcarrylessMultiplyUint64x4:
v.Op = OpAMD64VPCLMULQDQ256
return true
case OpcarrylessMultiplyUint64x8:
v.Op = OpAMD64VPCLMULQDQ512
return true
case OpconcatSelectedConstantFloat32x4:
v.Op = OpAMD64VSHUFPS128
return true

View File

@@ -1309,6 +1309,9 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Int16x32.blendMasked", opLen3(ssa.OpblendMaskedInt16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x16.blendMasked", opLen3(ssa.OpblendMaskedInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x8.blendMasked", opLen3(ssa.OpblendMaskedInt64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint64x2.carrylessMultiply", opLen2Imm8(ssa.OpcarrylessMultiplyUint64x2, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint64x4.carrylessMultiply", opLen2Imm8(ssa.OpcarrylessMultiplyUint64x4, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint64x8.carrylessMultiply", opLen2Imm8(ssa.OpcarrylessMultiplyUint64x8, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Float32x4.concatSelectedConstant", opLen2Imm8(ssa.OpconcatSelectedConstantFloat32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Float64x2.concatSelectedConstant", opLen2Imm8(ssa.OpconcatSelectedConstantFloat64x2, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int32x4.concatSelectedConstant", opLen2Imm8(ssa.OpconcatSelectedConstantInt32x4, types.TypeVec128, 0), sys.AMD64)

View File

@@ -19,3 +19,5 @@
documentation: !string |-
// NAME computes element-wise GF(2^8) multiplication with
// reduction polynomial x^8 + x^4 + x^3 + x + 1.
- go: carrylessMultiply
commutative: false

View File

@@ -30,3 +30,63 @@
- *uint8
out:
- *uint8
- go: carrylessMultiply
documentation: !string |-
// NAME computes one of four possible Galois polynomial
// products of selected high and low halves of x and y,
// depending on the value of xyHiLo, returning the 128-bit
// product in the concatenated two elements of the result.
// Bit 0 selects the low (0) or high (1) element of x and
// bit 4 selects the low (0x00) or high (0x10) element of y.
asm: V?PCLMULQDQ
in:
- go: Uint64x2
- go: Uint64x2
- class: immediate
immOffset: 0
name: xyHiLo
out:
- go: Uint64x2
overwriteElementBits: 64
hideMaskMethods: true
- go: carrylessMultiply
documentation: !string |-
// NAME computes one of four possible Galois polynomial
// products of selected high and low halves of each of the two
// 128-bit lanes of x and y, depending on the value of xyHiLo,
// and returns the two 128-bit products in the result's lanes.
// Bit 0 selects the low (0) or high (1) elements of x's lanes and
// bit 4 selects the low (0x00) or high (0x10) elements of y's lanes.
asm: V?PCLMULQDQ
in:
- go: Uint64x4
- go: Uint64x4
- class: immediate
immOffset: 0
name: xyHiLo
out:
- go: Uint64x4
overwriteElementBits: 64
hideMaskMethods: true
- go: carrylessMultiply
documentation: !string |-
// NAME computes one of four possible Galois polynomial
// products of selected high and low halves of each of the four
// 128-bit lanes of x and y, depending on the value of xyHiLo,
// and returns the four 128-bit products in the result's lanes.
// Bit 0 selects the low (0) or high (1) elements of x's lanes and
// bit 4 selects the low (0x00) or high (0x10) elements of y's lanes.
asm: V?PCLMULQDQ
in:
- go: Uint64x8
- go: Uint64x8
- class: immediate
immOffset: 0
name: xyHiLo
out:
- go: Uint64x8
overwriteElementBits: 64
hideMaskMethods: true

View File

@@ -83,6 +83,9 @@ in: !repeat
- {class: vreg, go: Int64x4, base: "int", elemBits: 128, bits: 256, lanes: 4}
- {class: vreg, go: Uint64x4, base: "uint", elemBits: 128, bits: 256, lanes: 4}
# Special for carryless multiply
- {class: vreg, go: Uint64x8, base: "uint", elemBits: 128, bits: 512, lanes: 8}
# Special shapes just to make VAES(ENC|DEC)(LAST)?512 work.
# The elemBits field of these shapes are wrong, it would be overwritten by overwriteElemBits.
- {class: vreg, go: Int8x32, base: "int", elemBits: 128, bits: 512, lanes: 32}

View File

@@ -808,13 +808,14 @@ var cpuFeatureMap = map[cpuFeatureKey]string{
// the vector length suffix.
// AVX-512 extension features
{"AVX512EVEX", "AVX512_BITALG"}: "AVX512BITALG",
{"AVX512EVEX", "AVX512_GFNI"}: "AVX512GFNI",
{"AVX512EVEX", "AVX512_VBMI2"}: "AVX512VBMI2",
{"AVX512EVEX", "AVX512_VBMI"}: "AVX512VBMI",
{"AVX512EVEX", "AVX512_VNNI"}: "AVX512VNNI",
{"AVX512EVEX", "AVX512_VPOPCNTDQ"}: "AVX512VPOPCNTDQ",
{"AVX512EVEX", "AVX512_VAES"}: "AVX512VAES",
{"AVX512EVEX", "AVX512_BITALG"}: "AVX512BITALG",
{"AVX512EVEX", "AVX512_GFNI"}: "AVX512GFNI",
{"AVX512EVEX", "AVX512_VBMI2"}: "AVX512VBMI2",
{"AVX512EVEX", "AVX512_VBMI"}: "AVX512VBMI",
{"AVX512EVEX", "AVX512_VNNI"}: "AVX512VNNI",
{"AVX512EVEX", "AVX512_VPOPCNTDQ"}: "AVX512VPOPCNTDQ",
{"AVX512EVEX", "AVX512_VAES"}: "AVX512VAES",
{"AVX512EVEX", "AVX512_VPCLMULQDQ"}: "AVX512VPCLMULQDQ",
// AVX 10.2 (not yet supported)
{"AVX512EVEX", "AVX10_2_RC"}: "ignore",

View File

@@ -95,6 +95,14 @@ func (X86Features) AVX512VNNI() bool {
return cpu.X86.HasAVX512VNNI
}
// AVX512VPCLMULQDQ reports whether the CPU supports the AVX512VPCLMULQDQ feature.
//
// The method is defined on all GOARCHes, but it will only return true on
// GOARCH amd64.
func (X86Features) AVX512VPCLMULQDQ() bool {
	supported := cpu.X86.HasAVX512VPCLMULQDQ
	return supported
}
// AVX512VPOPCNTDQ returns whether the CPU supports the AVX512VPOPCNTDQ feature.
//
// AVX512VPOPCNTDQ is defined on all GOARCHes, but will only return true on

View File

@@ -1194,3 +1194,21 @@ func TestPermuteScalarsLoGrouped(t *testing.T) {
simd.LoadInt16x16Slice(x).PermuteScalarsLoGrouped(1, 2, 3, 0).StoreSlice(got)
checkSlices(t, got, want)
}
// TestClMul checks Uint64x2.CarrylessMultiply for all four combinations of
// the element selectors, and for a vector multiplied by itself.
func TestClMul(t *testing.T) {
	x := simd.LoadUint64x2Slice([]uint64{1, 5})
	y := simd.LoadUint64x2Slice([]uint64{3, 9})

	// expect stores product into a fresh slice and compares it with want.
	expect := func(product simd.Uint64x2, want []uint64) {
		got := make([]uint64, 2)
		product.StoreSlice(got)
		checkSlices[uint64](t, got, want)
	}

	// The selectors are kept as literal constants so the immediate is
	// known at compile time.

	// 1 * 3 = 3
	expect(x.CarrylessMultiply(0, 0, y), []uint64{3, 0})
	// 1 * 9 = 9
	expect(x.CarrylessMultiply(0, 1, y), []uint64{9, 0})
	// 0b101 * 0b11 = 0b1010 ^ 0b101 = 0b1111
	expect(x.CarrylessMultiply(1, 0, y), []uint64{15, 0})
	// 0b101 * 0b1001 = 0b101000 ^ 0b101 = 0b101101
	expect(x.CarrylessMultiply(1, 1, y), []uint64{45, 0})
	// 0b11 * 0b11 = 0b110 ^ 0b11 = 0b101
	expect(y.CarrylessMultiply(0, 0, y), []uint64{5, 0})
}

View File

@@ -52,6 +52,44 @@ func (x Int32x16) blendMasked(y Int32x16, mask Mask32x16) Int32x16
// Asm: VPBLENDMQ, CPU Feature: AVX512
func (x Int64x8) blendMasked(y Int64x8, mask Mask64x8) Int64x8
/* carrylessMultiply */
// carrylessMultiply computes one of four possible Galois polynomial
// products of selected high and low halves of x and y,
// depending on the value of xyHiLo, returning the 128-bit
// product in the concatenated two elements of the result.
// Bit 0 selects the low (0) or high (1) element of x and
// bit 4 selects the low (0x00) or high (0x10) element of y.
//
// xyHiLo results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPCLMULQDQ, CPU Feature: AVX
func (x Uint64x2) carrylessMultiply(xyHiLo uint8, y Uint64x2) Uint64x2
// carrylessMultiply computes one of four possible Galois polynomial
// products of selected high and low halves of each of the two
// 128-bit lanes of x and y, depending on the value of xyHiLo,
// and returns the two 128-bit products in the result's lanes.
// Bit 0 selects the low (0) or high (1) elements of x's lanes and
// bit 4 selects the low (0x00) or high (0x10) elements of y's lanes.
//
// xyHiLo results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ
func (x Uint64x4) carrylessMultiply(xyHiLo uint8, y Uint64x4) Uint64x4
// carrylessMultiply computes one of four possible Galois polynomial
// products of selected high and low halves of each of the four
// 128-bit lanes of x and y, depending on the value of xyHiLo,
// and returns the four 128-bit products in the result's lanes.
// Bit 0 selects the low (0) or high (1) elements of x's lanes and
// bit 4 selects the low (0x00) or high (0x10) elements of y's lanes.
//
// xyHiLo results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ
func (x Uint64x8) carrylessMultiply(xyHiLo uint8, y Uint64x8) Uint64x8
/* concatSelectedConstant */
// concatSelectedConstant concatenates selected elements from x and y into the lower and upper

View File

@@ -1266,3 +1266,75 @@ func (x Uint16x16) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x16 {
func (x Uint16x32) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x32 {
return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
}
// CarrylessMultiply returns the 128-bit carryless product of one element
// of x and one element of y. The product is held in the two elements of
// the result, concatenated.
//
// a chooses the element of x and b chooses the element of y: 0 selects
// the low element and 1 selects the high element. Only the low bit of a
// and of b is used.
//
// Carryless multiplication combines partial products with bitwise XOR
// rather than add-with-carry. For example, in base two:
//
//	11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
//
// The same operation models multiplication of polynomials with
// coefficients from GF(2): 11 * 11 stands for
// (x+1)*(x+1) = x**2 + (1^1)x + 1 = x**2 + 0x + 1 = x**2 + 1,
// which is written 101. (Here "+" adds polynomial terms, while
// coefficients "add" with XOR.)
//
// Constant values of a and b result in better performance; otherwise
// the intrinsic may translate into a jump table.
//
// Asm: VPCLMULQDQ, CPU Feature: AVX
func (x Uint64x2) CarrylessMultiply(a, b uint8, y Uint64x2) Uint64x2 {
	// Bit 0 of the immediate carries a's selector and bit 4 carries b's.
	// The two fields are disjoint, so OR-ing them equals adding them.
	return x.carrylessMultiply(a&1|(b&1)<<4, y)
}
// CarrylessMultiplyGrouped returns, for each of the two 128-bit lanes of
// x and y, the 128-bit carryless product of one element of x's lane and
// one element of y's lane. The two 128-bit products are returned in the
// result's lanes.
//
// a chooses the elements of x's lanes and b chooses the elements of y's
// lanes: 0 selects the low element and 1 selects the high element. Only
// the low bit of a and of b is used.
//
// Carryless multiplication combines partial products with bitwise XOR
// rather than add-with-carry. For example, in base two:
//
//	11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
//
// The same operation models multiplication of polynomials with
// coefficients from GF(2): 11 * 11 stands for
// (x+1)*(x+1) = x**2 + (1^1)x + 1 = x**2 + 0x + 1 = x**2 + 1,
// which is written 101. (Here "+" adds polynomial terms, while
// coefficients "add" with XOR.)
//
// Constant values of a and b result in better performance; otherwise
// the intrinsic may translate into a jump table.
//
// Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ
func (x Uint64x4) CarrylessMultiplyGrouped(a, b uint8, y Uint64x4) Uint64x4 {
	// Bit 0 of the immediate carries a's selector and bit 4 carries b's.
	// The two fields are disjoint, so OR-ing them equals adding them.
	return x.carrylessMultiply(a&1|(b&1)<<4, y)
}
// CarrylessMultiplyGrouped returns, for each of the four 128-bit lanes of
// x and y, the 128-bit carryless product of one element of x's lane and
// one element of y's lane. The four 128-bit products are returned in the
// result's lanes.
//
// a chooses the elements of x's lanes and b chooses the elements of y's
// lanes: 0 selects the low element and 1 selects the high element. Only
// the low bit of a and of b is used.
//
// Carryless multiplication combines partial products with bitwise XOR
// rather than add-with-carry. For example, in base two:
//
//	11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
//
// The same operation models multiplication of polynomials with
// coefficients from GF(2): 11 * 11 stands for
// (x+1)*(x+1) = x**2 + (1^1)x + 1 = x**2 + 0x + 1 = x**2 + 1,
// which is written 101. (Here "+" adds polynomial terms, while
// coefficients "add" with XOR.)
//
// Constant values of a and b result in better performance; otherwise
// the intrinsic may translate into a jump table.
//
// Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ
func (x Uint64x8) CarrylessMultiplyGrouped(a, b uint8, y Uint64x8) Uint64x8 {
	// Bit 0 of the immediate carries a's selector and bit 4 carries b's.
	// The two fields are disjoint, so OR-ing them equals adding them.
	return x.carrylessMultiply(a&1|(b&1)<<4, y)
}