mirror of
https://github.com/golang/go.git
synced 2026-01-29 07:02:05 +03:00
[dev.simd] simd: add carryless multiply
now with comments, and also a test. choice of data types, method names, etc, are all up for comment. It's NOT commutative, because of the immediate operand (unless we swap the bits of the immediate). Change-Id: I730a6938c6803d0b93544445db65eadc51783e42 Reviewed-on: https://go-review.googlesource.com/c/go/+/726963 Reviewed-by: Junyang Shao <shaojunyang@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
@@ -1232,6 +1232,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
|
||||
ssa.OpAMD64VPSHRDQ128,
|
||||
ssa.OpAMD64VPSHRDQ256,
|
||||
ssa.OpAMD64VPSHRDQ512,
|
||||
ssa.OpAMD64VPCLMULQDQ128,
|
||||
ssa.OpAMD64VPCLMULQDQ256,
|
||||
ssa.OpAMD64VPCLMULQDQ512,
|
||||
ssa.OpAMD64VSHUFPS128,
|
||||
ssa.OpAMD64VSHUFPD128,
|
||||
ssa.OpAMD64VSHUFPS256,
|
||||
|
||||
@@ -1333,6 +1333,9 @@
|
||||
(blendMaskedInt16x32 x y mask) => (VPBLENDMWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
|
||||
(blendMaskedInt32x16 x y mask) => (VPBLENDMDMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
|
||||
(blendMaskedInt64x8 x y mask) => (VPBLENDMQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
|
||||
(carrylessMultiplyUint64x2 ...) => (VPCLMULQDQ128 ...)
|
||||
(carrylessMultiplyUint64x4 ...) => (VPCLMULQDQ256 ...)
|
||||
(carrylessMultiplyUint64x8 ...) => (VPCLMULQDQ512 ...)
|
||||
(concatSelectedConstantFloat32x4 ...) => (VSHUFPS128 ...)
|
||||
(concatSelectedConstantFloat64x2 ...) => (VSHUFPD128 ...)
|
||||
(concatSelectedConstantInt32x4 ...) => (VSHUFPS128 ...)
|
||||
|
||||
@@ -1269,6 +1269,9 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
|
||||
{name: "VPALIGNRMasked128", argLength: 3, reg: w2kw, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
|
||||
{name: "VPALIGNRMasked256", argLength: 3, reg: w2kw, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
|
||||
{name: "VPALIGNRMasked512", argLength: 3, reg: w2kw, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
|
||||
{name: "VPCLMULQDQ128", argLength: 2, reg: v21, asm: "VPCLMULQDQ", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
|
||||
{name: "VPCLMULQDQ256", argLength: 2, reg: w21, asm: "VPCLMULQDQ", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
|
||||
{name: "VPCLMULQDQ512", argLength: 2, reg: w21, asm: "VPCLMULQDQ", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
|
||||
{name: "VPCMPB512", argLength: 2, reg: w2k, asm: "VPCMPB", aux: "UInt8", commutative: false, typ: "Mask", resultInArg0: false},
|
||||
{name: "VPCMPBMasked128", argLength: 3, reg: w2kk, asm: "VPCMPB", aux: "UInt8", commutative: false, typ: "Mask", resultInArg0: false},
|
||||
{name: "VPCMPBMasked256", argLength: 3, reg: w2kk, asm: "VPCMPB", aux: "UInt8", commutative: false, typ: "Mask", resultInArg0: false},
|
||||
|
||||
@@ -1301,6 +1301,9 @@ func simdGenericOps() []opData {
|
||||
{name: "TruncScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "UInt8"},
|
||||
{name: "TruncScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "UInt8"},
|
||||
{name: "TruncScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "UInt8"},
|
||||
{name: "carrylessMultiplyUint64x2", argLength: 2, commutative: false, aux: "UInt8"},
|
||||
{name: "carrylessMultiplyUint64x4", argLength: 2, commutative: false, aux: "UInt8"},
|
||||
{name: "carrylessMultiplyUint64x8", argLength: 2, commutative: false, aux: "UInt8"},
|
||||
{name: "concatSelectedConstantFloat32x4", argLength: 2, commutative: false, aux: "UInt8"},
|
||||
{name: "concatSelectedConstantFloat64x2", argLength: 2, commutative: false, aux: "UInt8"},
|
||||
{name: "concatSelectedConstantGroupedFloat32x8", argLength: 2, commutative: false, aux: "UInt8"},
|
||||
|
||||
@@ -2510,6 +2510,9 @@ const (
|
||||
OpAMD64VPALIGNRMasked128
|
||||
OpAMD64VPALIGNRMasked256
|
||||
OpAMD64VPALIGNRMasked512
|
||||
OpAMD64VPCLMULQDQ128
|
||||
OpAMD64VPCLMULQDQ256
|
||||
OpAMD64VPCLMULQDQ512
|
||||
OpAMD64VPCMPB512
|
||||
OpAMD64VPCMPBMasked128
|
||||
OpAMD64VPCMPBMasked256
|
||||
@@ -7448,6 +7451,9 @@ const (
|
||||
OpTruncScaledResidueFloat64x2
|
||||
OpTruncScaledResidueFloat64x4
|
||||
OpTruncScaledResidueFloat64x8
|
||||
OpcarrylessMultiplyUint64x2
|
||||
OpcarrylessMultiplyUint64x4
|
||||
OpcarrylessMultiplyUint64x8
|
||||
OpconcatSelectedConstantFloat32x4
|
||||
OpconcatSelectedConstantFloat64x2
|
||||
OpconcatSelectedConstantGroupedFloat32x8
|
||||
@@ -39211,6 +39217,51 @@ var opcodeTable = [...]opInfo{
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "VPCLMULQDQ128",
|
||||
auxType: auxUInt8,
|
||||
argLen: 2,
|
||||
asm: x86.AVPCLMULQDQ,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
|
||||
{1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
|
||||
},
|
||||
outputs: []outputInfo{
|
||||
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "VPCLMULQDQ256",
|
||||
auxType: auxUInt8,
|
||||
argLen: 2,
|
||||
asm: x86.AVPCLMULQDQ,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||
},
|
||||
outputs: []outputInfo{
|
||||
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "VPCLMULQDQ512",
|
||||
auxType: auxUInt8,
|
||||
argLen: 2,
|
||||
asm: x86.AVPCLMULQDQ,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||
},
|
||||
outputs: []outputInfo{
|
||||
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "VPCMPB512",
|
||||
auxType: auxUInt8,
|
||||
@@ -95848,6 +95899,24 @@ var opcodeTable = [...]opInfo{
|
||||
argLen: 1,
|
||||
generic: true,
|
||||
},
|
||||
{
|
||||
name: "carrylessMultiplyUint64x2",
|
||||
auxType: auxUInt8,
|
||||
argLen: 2,
|
||||
generic: true,
|
||||
},
|
||||
{
|
||||
name: "carrylessMultiplyUint64x4",
|
||||
auxType: auxUInt8,
|
||||
argLen: 2,
|
||||
generic: true,
|
||||
},
|
||||
{
|
||||
name: "carrylessMultiplyUint64x8",
|
||||
auxType: auxUInt8,
|
||||
argLen: 2,
|
||||
generic: true,
|
||||
},
|
||||
{
|
||||
name: "concatSelectedConstantFloat32x4",
|
||||
auxType: auxUInt8,
|
||||
|
||||
@@ -6307,6 +6307,15 @@ func rewriteValueAMD64(v *Value) bool {
|
||||
return rewriteValueAMD64_OpblendMaskedInt64x8(v)
|
||||
case OpblendMaskedInt8x64:
|
||||
return rewriteValueAMD64_OpblendMaskedInt8x64(v)
|
||||
case OpcarrylessMultiplyUint64x2:
|
||||
v.Op = OpAMD64VPCLMULQDQ128
|
||||
return true
|
||||
case OpcarrylessMultiplyUint64x4:
|
||||
v.Op = OpAMD64VPCLMULQDQ256
|
||||
return true
|
||||
case OpcarrylessMultiplyUint64x8:
|
||||
v.Op = OpAMD64VPCLMULQDQ512
|
||||
return true
|
||||
case OpconcatSelectedConstantFloat32x4:
|
||||
v.Op = OpAMD64VSHUFPS128
|
||||
return true
|
||||
|
||||
@@ -1309,6 +1309,9 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
|
||||
addF(simdPackage, "Int16x32.blendMasked", opLen3(ssa.OpblendMaskedInt16x32, types.TypeVec512), sys.AMD64)
|
||||
addF(simdPackage, "Int32x16.blendMasked", opLen3(ssa.OpblendMaskedInt32x16, types.TypeVec512), sys.AMD64)
|
||||
addF(simdPackage, "Int64x8.blendMasked", opLen3(ssa.OpblendMaskedInt64x8, types.TypeVec512), sys.AMD64)
|
||||
addF(simdPackage, "Uint64x2.carrylessMultiply", opLen2Imm8(ssa.OpcarrylessMultiplyUint64x2, types.TypeVec128, 0), sys.AMD64)
|
||||
addF(simdPackage, "Uint64x4.carrylessMultiply", opLen2Imm8(ssa.OpcarrylessMultiplyUint64x4, types.TypeVec256, 0), sys.AMD64)
|
||||
addF(simdPackage, "Uint64x8.carrylessMultiply", opLen2Imm8(ssa.OpcarrylessMultiplyUint64x8, types.TypeVec512, 0), sys.AMD64)
|
||||
addF(simdPackage, "Float32x4.concatSelectedConstant", opLen2Imm8(ssa.OpconcatSelectedConstantFloat32x4, types.TypeVec128, 0), sys.AMD64)
|
||||
addF(simdPackage, "Float64x2.concatSelectedConstant", opLen2Imm8(ssa.OpconcatSelectedConstantFloat64x2, types.TypeVec128, 0), sys.AMD64)
|
||||
addF(simdPackage, "Int32x4.concatSelectedConstant", opLen2Imm8(ssa.OpconcatSelectedConstantInt32x4, types.TypeVec128, 0), sys.AMD64)
|
||||
|
||||
@@ -19,3 +19,5 @@
|
||||
documentation: !string |-
|
||||
// NAME computes element-wise GF(2^8) multiplication with
|
||||
// reduction polynomial x^8 + x^4 + x^3 + x + 1.
|
||||
- go: carrylessMultiply
|
||||
commutative: false
|
||||
|
||||
@@ -30,3 +30,63 @@
|
||||
- *uint8
|
||||
out:
|
||||
- *uint8
|
||||
|
||||
- go: carrylessMultiply
|
||||
documentation: !string |-
|
||||
// NAME computes one of four possible Galois polynomial
|
||||
// products of selected high and low halves of x and y,
|
||||
// depending on the value of xyHiLo, returning the 128-bit
|
||||
// product in the concatenated two elements of the result.
|
||||
// Bit 0 selects the low (0) or high (1) element of x and
|
||||
// bit 4 selects the low (0x00) or high (0x10) element of y.
|
||||
asm: V?PCLMULQDQ
|
||||
in:
|
||||
- go: Uint64x2
|
||||
- go: Uint64x2
|
||||
- class: immediate
|
||||
immOffset: 0
|
||||
name: xyHiLo
|
||||
out:
|
||||
- go: Uint64x2
|
||||
overwriteElementBits: 64
|
||||
hideMaskMethods: true
|
||||
|
||||
- go: carrylessMultiply
|
||||
documentation: !string |-
|
||||
// NAME computes one of four possible Galois polynomial
|
||||
// products of selected high and low halves of each of the two
|
||||
// 128-bit lanes of x and y, depending on the value of xyHiLo,
|
||||
// and returns the two 128-bit products in the result's lanes.
|
||||
// Bit 0 selects the low (0) or high (1) elements of x's lanes and
|
||||
// bit 4 selects the low (0x00) or high (0x10) elements of y's lanes.
|
||||
asm: V?PCLMULQDQ
|
||||
in:
|
||||
- go: Uint64x4
|
||||
- go: Uint64x4
|
||||
- class: immediate
|
||||
immOffset: 0
|
||||
name: xyHiLo
|
||||
out:
|
||||
- go: Uint64x4
|
||||
overwriteElementBits: 64
|
||||
hideMaskMethods: true
|
||||
|
||||
- go: carrylessMultiply
|
||||
documentation: !string |-
|
||||
// NAME computes one of four possible Galois polynomial
|
||||
// products of selected high and low halves of each of the four
|
||||
// 128-bit lanes of x and y, depending on the value of xyHiLo,
|
||||
// and returns the four 128-bit products in the result's lanes.
|
||||
// Bit 0 selects the low (0) or high (1) elements of x's lanes and
|
||||
// bit 4 selects the low (0x00) or high (0x10) elements of y's lanes.
|
||||
asm: V?PCLMULQDQ
|
||||
in:
|
||||
- go: Uint64x8
|
||||
- go: Uint64x8
|
||||
- class: immediate
|
||||
immOffset: 0
|
||||
name: xyHiLo
|
||||
out:
|
||||
- go: Uint64x8
|
||||
overwriteElementBits: 64
|
||||
hideMaskMethods: true
|
||||
|
||||
@@ -83,6 +83,9 @@ in: !repeat
|
||||
- {class: vreg, go: Int64x4, base: "int", elemBits: 128, bits: 256, lanes: 4}
|
||||
- {class: vreg, go: Uint64x4, base: "uint", elemBits: 128, bits: 256, lanes: 4}
|
||||
|
||||
# Special for carryless multiply
|
||||
- {class: vreg, go: Uint64x8, base: "uint", elemBits: 128, bits: 512, lanes: 8}
|
||||
|
||||
# Special shapes just to make VAES(ENC|DEC)(LAST)?512 work.
|
||||
# The elemBits field of these shapes is wrong; it is overwritten by overwriteElementBits.
|
||||
- {class: vreg, go: Int8x32, base: "int", elemBits: 128, bits: 512, lanes: 32}
|
||||
|
||||
@@ -808,13 +808,14 @@ var cpuFeatureMap = map[cpuFeatureKey]string{
|
||||
// the vector length suffix.
|
||||
|
||||
// AVX-512 extension features
|
||||
{"AVX512EVEX", "AVX512_BITALG"}: "AVX512BITALG",
|
||||
{"AVX512EVEX", "AVX512_GFNI"}: "AVX512GFNI",
|
||||
{"AVX512EVEX", "AVX512_VBMI2"}: "AVX512VBMI2",
|
||||
{"AVX512EVEX", "AVX512_VBMI"}: "AVX512VBMI",
|
||||
{"AVX512EVEX", "AVX512_VNNI"}: "AVX512VNNI",
|
||||
{"AVX512EVEX", "AVX512_VPOPCNTDQ"}: "AVX512VPOPCNTDQ",
|
||||
{"AVX512EVEX", "AVX512_VAES"}: "AVX512VAES",
|
||||
{"AVX512EVEX", "AVX512_BITALG"}: "AVX512BITALG",
|
||||
{"AVX512EVEX", "AVX512_GFNI"}: "AVX512GFNI",
|
||||
{"AVX512EVEX", "AVX512_VBMI2"}: "AVX512VBMI2",
|
||||
{"AVX512EVEX", "AVX512_VBMI"}: "AVX512VBMI",
|
||||
{"AVX512EVEX", "AVX512_VNNI"}: "AVX512VNNI",
|
||||
{"AVX512EVEX", "AVX512_VPOPCNTDQ"}: "AVX512VPOPCNTDQ",
|
||||
{"AVX512EVEX", "AVX512_VAES"}: "AVX512VAES",
|
||||
{"AVX512EVEX", "AVX512_VPCLMULQDQ"}: "AVX512VPCLMULQDQ",
|
||||
|
||||
// AVX 10.2 (not yet supported)
|
||||
{"AVX512EVEX", "AVX10_2_RC"}: "ignore",
|
||||
|
||||
@@ -95,6 +95,14 @@ func (X86Features) AVX512VNNI() bool {
|
||||
return cpu.X86.HasAVX512VNNI
|
||||
}
|
||||
|
||||
// AVX512VPCLMULQDQ returns whether the CPU supports the AVX512VPCLMULQDQ feature.
|
||||
//
|
||||
// AVX512VPCLMULQDQ is defined on all GOARCHes, but will only return true on
|
||||
// GOARCH amd64.
|
||||
func (X86Features) AVX512VPCLMULQDQ() bool {
|
||||
// The answer is read directly from the HasAVX512VPCLMULQDQ flag of cpu.X86.
return cpu.X86.HasAVX512VPCLMULQDQ
|
||||
}
|
||||
|
||||
// AVX512VPOPCNTDQ returns whether the CPU supports the AVX512VPOPCNTDQ feature.
|
||||
//
|
||||
// AVX512VPOPCNTDQ is defined on all GOARCHes, but will only return true on
|
||||
|
||||
@@ -1194,3 +1194,21 @@ func TestPermuteScalarsLoGrouped(t *testing.T) {
|
||||
simd.LoadInt16x16Slice(x).PermuteScalarsLoGrouped(1, 2, 3, 0).StoreSlice(got)
|
||||
checkSlices(t, got, want)
|
||||
}
|
||||
|
||||
func TestClMul(t *testing.T) {
|
||||
var x = simd.LoadUint64x2Slice([]uint64{1, 5})
|
||||
var y = simd.LoadUint64x2Slice([]uint64{3, 9})
|
||||
|
||||
foo := func(v simd.Uint64x2, s []uint64) {
|
||||
r := make([]uint64, 2, 2)
|
||||
v.StoreSlice(r)
|
||||
checkSlices[uint64](t, r, s)
|
||||
}
|
||||
|
||||
foo(x.CarrylessMultiply(0, 0, y), []uint64{3, 0})
|
||||
foo(x.CarrylessMultiply(0, 1, y), []uint64{9, 0})
|
||||
foo(x.CarrylessMultiply(1, 0, y), []uint64{15, 0})
|
||||
foo(x.CarrylessMultiply(1, 1, y), []uint64{45, 0})
|
||||
foo(y.CarrylessMultiply(0, 0, y), []uint64{5, 0})
|
||||
|
||||
}
|
||||
|
||||
@@ -52,6 +52,44 @@ func (x Int32x16) blendMasked(y Int32x16, mask Mask32x16) Int32x16
|
||||
// Asm: VPBLENDMQ, CPU Feature: AVX512
|
||||
func (x Int64x8) blendMasked(y Int64x8, mask Mask64x8) Int64x8
|
||||
|
||||
/* carrylessMultiply */
|
||||
|
||||
// carrylessMultiply computes one of four possible Galois polynomial
|
||||
// products of selected high and low halves of x and y,
|
||||
// depending on the value of xyHiLo, returning the 128-bit
|
||||
// product in the concatenated two elements of the result.
|
||||
// Bit 0 selects the low (0) or high (1) element of x and
|
||||
// bit 4 selects the low (0x00) or high (0x10) element of y.
|
||||
//
|
||||
// xyHiLo results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||
//
|
||||
// Asm: VPCLMULQDQ, CPU Feature: AVX
|
||||
func (x Uint64x2) carrylessMultiply(xyHiLo uint8, y Uint64x2) Uint64x2
|
||||
|
||||
// carrylessMultiply computes one of four possible Galois polynomial
|
||||
// products of selected high and low halves of each of the two
|
||||
// 128-bit lanes of x and y, depending on the value of xyHiLo,
|
||||
// and returns the two 128-bit products in the result's lanes.
|
||||
// Bit 0 selects the low (0) or high (1) elements of x's lanes and
|
||||
// bit 4 selects the low (0x00) or high (0x10) elements of y's lanes.
|
||||
//
|
||||
// xyHiLo results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||
//
|
||||
// Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ
|
||||
func (x Uint64x4) carrylessMultiply(xyHiLo uint8, y Uint64x4) Uint64x4
|
||||
|
||||
// carrylessMultiply computes one of four possible Galois polynomial
|
||||
// products of selected high and low halves of each of the four
|
||||
// 128-bit lanes of x and y, depending on the value of xyHiLo,
|
||||
// and returns the four 128-bit products in the result's lanes.
|
||||
// Bit 0 selects the low (0) or high (1) elements of x's lanes and
|
||||
// bit 4 selects the low (0x00) or high (0x10) elements of y's lanes.
|
||||
//
|
||||
// xyHiLo results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||
//
|
||||
// Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ
|
||||
func (x Uint64x8) carrylessMultiply(xyHiLo uint8, y Uint64x8) Uint64x8
|
||||
|
||||
/* concatSelectedConstant */
|
||||
|
||||
// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
|
||||
|
||||
@@ -1266,3 +1266,75 @@ func (x Uint16x16) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x16 {
|
||||
func (x Uint16x32) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x32 {
|
||||
return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
|
||||
}
|
||||
|
||||
// CarrylessMultiply computes one of four possible carryless
|
||||
// multiplications of selected high and low halves of x and y,
|
||||
// depending on the values of a and b, returning the 128-bit
|
||||
// product in the concatenated two elements of the result.
|
||||
// a selects the low (0) or high (1) element of x and
|
||||
// b selects the low (0) or high (1) element of y.
// Only the least significant bit of a and of b is used.
|
||||
//
|
||||
// A carryless multiplication uses bitwise XOR instead of
|
||||
// add-with-carry, for example (in base two):
|
||||
// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
|
||||
//
|
||||
// This also models multiplication of polynomials with coefficients
|
||||
// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
|
||||
// x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds
|
||||
// polynomial terms, but coefficients "add" with XOR.)
|
||||
//
|
||||
// Constant values of a and b will result in better performance;
|
||||
// otherwise the intrinsic may translate into a jump table.
|
||||
//
|
||||
// Asm: VPCLMULQDQ, CPU Feature: AVX
|
||||
func (x Uint64x2) CarrylessMultiply(a, b uint8, y Uint64x2) Uint64x2 {
|
||||
// Pack the selectors into the immediate: a in bit 0, b in bit 4.
return x.carrylessMultiply(a&1+((b&1)<<4), y)
|
||||
}
|
||||
|
||||
// CarrylessMultiplyGrouped computes one of four possible carryless
|
||||
// multiplications of selected high and low halves of each of the two
|
||||
// 128-bit lanes of x and y, depending on the values of a and b,
|
||||
// and returns the two 128-bit products in the result's lanes.
|
||||
// a selects the low (0) or high (1) elements of x's lanes and
|
||||
// b selects the low (0) or high (1) elements of y's lanes.
// Only the least significant bit of a and of b is used.
|
||||
//
|
||||
// A carryless multiplication uses bitwise XOR instead of
|
||||
// add-with-carry, for example (in base two):
|
||||
// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
|
||||
//
|
||||
// This also models multiplication of polynomials with coefficients
|
||||
// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
|
||||
// x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds
|
||||
// polynomial terms, but coefficients "add" with XOR.)
|
||||
//
|
||||
// Constant values of a and b will result in better performance;
|
||||
// otherwise the intrinsic may translate into a jump table.
|
||||
//
|
||||
// Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ
|
||||
func (x Uint64x4) CarrylessMultiplyGrouped(a, b uint8, y Uint64x4) Uint64x4 {
|
||||
// Pack the selectors into the immediate: a in bit 0, b in bit 4.
return x.carrylessMultiply(a&1+((b&1)<<4), y)
|
||||
}
|
||||
|
||||
// CarrylessMultiplyGrouped computes one of four possible carryless
|
||||
// multiplications of selected high and low halves of each of the four
|
||||
// 128-bit lanes of x and y, depending on the values of a and b,
|
||||
// and returns the four 128-bit products in the result's lanes.
|
||||
// a selects the low (0) or high (1) elements of x's lanes and
|
||||
// b selects the low (0) or high (1) elements of y's lanes.
// Only the least significant bit of a and of b is used.
|
||||
//
|
||||
// A carryless multiplication uses bitwise XOR instead of
|
||||
// add-with-carry, for example (in base two):
|
||||
// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
|
||||
//
|
||||
// This also models multiplication of polynomials with coefficients
|
||||
// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
|
||||
// x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds
|
||||
// polynomial terms, but coefficients "add" with XOR.)
|
||||
//
|
||||
// Constant values of a and b will result in better performance;
|
||||
// otherwise the intrinsic may translate into a jump table.
|
||||
//
|
||||
// Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ
|
||||
func (x Uint64x8) CarrylessMultiplyGrouped(a, b uint8, y Uint64x8) Uint64x8 {
|
||||
// Pack the selectors into the immediate: a in bit 0, b in bit 4.
return x.carrylessMultiply(a&1+((b&1)<<4), y)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user