[dev.simd] simd: add carryless multiply

now with comments, and also a test. choice of data types, method names, etc, are all up for comment. It's NOT commutative, because of the immediate operand (unless we swap the bits of the immediate). Change-Id: I730a6938c6803d0b93544445db65eadc51783e42 Reviewed-on: https://go-review.googlesource.com/c/go/+/726963 Reviewed-by: Junyang Shao <shaojunyang@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2026-01-29 07:02:05 +03:00 · 2025-12-04 17:51:04 -05:00
parent f51ee08905
commit 3417b48b17
15 changed files with 302 additions and 7 deletions
--- a/src/cmd/compile/internal/amd64/simdssa.go
+++ b/src/cmd/compile/internal/amd64/simdssa.go
@@ -1232,6 +1232,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPSHRDQ128,
 		ssa.OpAMD64VPSHRDQ256,
 		ssa.OpAMD64VPSHRDQ512,
+		ssa.OpAMD64VPCLMULQDQ128,
+		ssa.OpAMD64VPCLMULQDQ256,
+		ssa.OpAMD64VPCLMULQDQ512,
 		ssa.OpAMD64VSHUFPS128,
 		ssa.OpAMD64VSHUFPD128,
 		ssa.OpAMD64VSHUFPS256,
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@@ -1333,6 +1333,9 @@
 (blendMaskedInt16x32 x y mask) => (VPBLENDMWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
 (blendMaskedInt32x16 x y mask) => (VPBLENDMDMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
 (blendMaskedInt64x8 x y mask) => (VPBLENDMQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
+(carrylessMultiplyUint64x2 ...) => (VPCLMULQDQ128 ...)
+(carrylessMultiplyUint64x4 ...) => (VPCLMULQDQ256 ...)
+(carrylessMultiplyUint64x8 ...) => (VPCLMULQDQ512 ...)
 (concatSelectedConstantFloat32x4 ...) => (VSHUFPS128 ...)
 (concatSelectedConstantFloat64x2 ...) => (VSHUFPD128 ...)
 (concatSelectedConstantInt32x4 ...) => (VSHUFPS128 ...)
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
@@ -1269,6 +1269,9 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VPALIGNRMasked128", argLength: 3, reg: w2kw, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPALIGNRMasked256", argLength: 3, reg: w2kw, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPALIGNRMasked512", argLength: 3, reg: w2kw, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
+		{name: "VPCLMULQDQ128", argLength: 2, reg: v21, asm: "VPCLMULQDQ", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
+		{name: "VPCLMULQDQ256", argLength: 2, reg: w21, asm: "VPCLMULQDQ", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
+		{name: "VPCLMULQDQ512", argLength: 2, reg: w21, asm: "VPCLMULQDQ", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPCMPB512", argLength: 2, reg: w2k, asm: "VPCMPB", aux: "UInt8", commutative: false, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPBMasked128", argLength: 3, reg: w2kk, asm: "VPCMPB", aux: "UInt8", commutative: false, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPBMasked256", argLength: 3, reg: w2kk, asm: "VPCMPB", aux: "UInt8", commutative: false, typ: "Mask", resultInArg0: false},
--- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@@ -1301,6 +1301,9 @@ func simdGenericOps() []opData {
 		{name: "TruncScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "UInt8"},
 		{name: "TruncScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "UInt8"},
 		{name: "TruncScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "UInt8"},
+		{name: "carrylessMultiplyUint64x2", argLength: 2, commutative: false, aux: "UInt8"},
+		{name: "carrylessMultiplyUint64x4", argLength: 2, commutative: false, aux: "UInt8"},
+		{name: "carrylessMultiplyUint64x8", argLength: 2, commutative: false, aux: "UInt8"},
 		{name: "concatSelectedConstantFloat32x4", argLength: 2, commutative: false, aux: "UInt8"},
 		{name: "concatSelectedConstantFloat64x2", argLength: 2, commutative: false, aux: "UInt8"},
 		{name: "concatSelectedConstantGroupedFloat32x8", argLength: 2, commutative: false, aux: "UInt8"},
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -2510,6 +2510,9 @@ const (
 	OpAMD64VPALIGNRMasked128
 	OpAMD64VPALIGNRMasked256
 	OpAMD64VPALIGNRMasked512
+	OpAMD64VPCLMULQDQ128
+	OpAMD64VPCLMULQDQ256
+	OpAMD64VPCLMULQDQ512
 	OpAMD64VPCMPB512
 	OpAMD64VPCMPBMasked128
 	OpAMD64VPCMPBMasked256
@@ -7448,6 +7451,9 @@ const (
 	OpTruncScaledResidueFloat64x2
 	OpTruncScaledResidueFloat64x4
 	OpTruncScaledResidueFloat64x8
+	OpcarrylessMultiplyUint64x2
+	OpcarrylessMultiplyUint64x4
+	OpcarrylessMultiplyUint64x8
 	OpconcatSelectedConstantFloat32x4
 	OpconcatSelectedConstantFloat64x2
 	OpconcatSelectedConstantGroupedFloat32x8
@@ -39211,6 +39217,51 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
+	{
+		name:    "VPCLMULQDQ128",
+		auxType: auxUInt8,
+		argLen:  2,
+		asm:     x86.AVPCLMULQDQ,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:    "VPCLMULQDQ256",
+		auxType: auxUInt8,
+		argLen:  2,
+		asm:     x86.AVPCLMULQDQ,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+				{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+			},
+			outputs: []outputInfo{
+				{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+			},
+		},
+	},
+	{
+		name:    "VPCLMULQDQ512",
+		auxType: auxUInt8,
+		argLen:  2,
+		asm:     x86.AVPCLMULQDQ,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+				{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+			},
+			outputs: []outputInfo{
+				{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+			},
+		},
+	},
 	{
 		name:    "VPCMPB512",
 		auxType: auxUInt8,
@@ -95848,6 +95899,24 @@ var opcodeTable = [...]opInfo{
 		argLen:  1,
 		generic: true,
 	},
+	{
+		name:    "carrylessMultiplyUint64x2",
+		auxType: auxUInt8,
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "carrylessMultiplyUint64x4",
+		auxType: auxUInt8,
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "carrylessMultiplyUint64x8",
+		auxType: auxUInt8,
+		argLen:  2,
+		generic: true,
+	},
 	{
 		name:    "concatSelectedConstantFloat32x4",
 		auxType: auxUInt8,
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -6307,6 +6307,15 @@ func rewriteValueAMD64(v *Value) bool {
 		return rewriteValueAMD64_OpblendMaskedInt64x8(v)
 	case OpblendMaskedInt8x64:
 		return rewriteValueAMD64_OpblendMaskedInt8x64(v)
+	case OpcarrylessMultiplyUint64x2:
+		v.Op = OpAMD64VPCLMULQDQ128
+		return true
+	case OpcarrylessMultiplyUint64x4:
+		v.Op = OpAMD64VPCLMULQDQ256
+		return true
+	case OpcarrylessMultiplyUint64x8:
+		v.Op = OpAMD64VPCLMULQDQ512
+		return true
 	case OpconcatSelectedConstantFloat32x4:
 		v.Op = OpAMD64VSHUFPS128
 		return true
--- a/src/cmd/compile/internal/ssagen/simdintrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go
@@ -1309,6 +1309,9 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 	addF(simdPackage, "Int16x32.blendMasked", opLen3(ssa.OpblendMaskedInt16x32, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Int32x16.blendMasked", opLen3(ssa.OpblendMaskedInt32x16, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Int64x8.blendMasked", opLen3(ssa.OpblendMaskedInt64x8, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint64x2.carrylessMultiply", opLen2Imm8(ssa.OpcarrylessMultiplyUint64x2, types.TypeVec128, 0), sys.AMD64)
+	addF(simdPackage, "Uint64x4.carrylessMultiply", opLen2Imm8(ssa.OpcarrylessMultiplyUint64x4, types.TypeVec256, 0), sys.AMD64)
+	addF(simdPackage, "Uint64x8.carrylessMultiply", opLen2Imm8(ssa.OpcarrylessMultiplyUint64x8, types.TypeVec512, 0), sys.AMD64)
 	addF(simdPackage, "Float32x4.concatSelectedConstant", opLen2Imm8(ssa.OpconcatSelectedConstantFloat32x4, types.TypeVec128, 0), sys.AMD64)
 	addF(simdPackage, "Float64x2.concatSelectedConstant", opLen2Imm8(ssa.OpconcatSelectedConstantFloat64x2, types.TypeVec128, 0), sys.AMD64)
 	addF(simdPackage, "Int32x4.concatSelectedConstant", opLen2Imm8(ssa.OpconcatSelectedConstantInt32x4, types.TypeVec128, 0), sys.AMD64)
--- a/src/simd/_gen/simdgen/ops/GaloisField/categories.yaml
+++ b/src/simd/_gen/simdgen/ops/GaloisField/categories.yaml
@@ -19,3 +19,5 @@
  documentation: !string |-
    // NAME computes element-wise GF(2^8) multiplication with
    // reduction polynomial x^8 + x^4 + x^3 + x + 1.
+- go: carrylessMultiply
+  commutative: false
--- a/src/simd/_gen/simdgen/ops/GaloisField/go.yaml
+++ b/src/simd/_gen/simdgen/ops/GaloisField/go.yaml
@@ -30,3 +30,63 @@
  - *uint8
  out:
  - *uint8
+
+- go: carrylessMultiply
+  documentation: !string |-
+    // NAME computes one of four possible Galois polynomial
+    // products of selected high and low halves of x and y,
+    // depending on the value of xyHiLo, returning the 128-bit
+    // product in the concatenated two elements of the result.
+    // Bit 0 selects the low (0) or high (1) element of x and
+    // bit 4 selects the low (0x00) or high (0x10) element of y.
+  asm: V?PCLMULQDQ
+  in:
+  - go: Uint64x2
+  - go: Uint64x2
+  - class: immediate
+    immOffset: 0
+    name: xyHiLo
+  out:
+  - go: Uint64x2
+    overwriteElementBits: 64
+  hideMaskMethods: true
+
+- go: carrylessMultiply
+  documentation: !string |-
+    // NAME computes one of four possible Galois polynomial
+    // products of selected high and low halves of each of the two
+    // 128-bit lanes of x and y, depending on the value of xyHiLo,
+    // and returns the two 128-bit products in the result's lanes.
+    // Bit 0 selects the low (0) or high (1) elements of x's lanes and
+    // bit 4 selects the low (0x00) or high (0x10) elements of y's lanes.
+  asm: V?PCLMULQDQ
+  in:
+  - go: Uint64x4
+  - go: Uint64x4
+  - class: immediate
+    immOffset: 0
+    name: xyHiLo
+  out:
+  - go: Uint64x4
+    overwriteElementBits: 64
+  hideMaskMethods: true
+
+- go: carrylessMultiply
+  documentation: !string |-
+    // NAME computes one of four possible Galois polynomial
+    // products of selected high and low halves of each of the four
+    // 128-bit lanes of x and y, depending on the value of xyHiLo,
+    // and returns the four 128-bit products in the result's lanes.
+    // Bit 0 selects the low (0) or high (1) elements of x's lanes and
+    // bit 4 selects the low (0x00) or high (0x10) elements of y's lanes.
+  asm: V?PCLMULQDQ
+  in:
+  - go: Uint64x8
+  - go: Uint64x8
+  - class: immediate
+    immOffset: 0
+    name: xyHiLo
+  out:
+  - go: Uint64x8
+    overwriteElementBits: 64
+  hideMaskMethods: true
--- a/src/simd/_gen/simdgen/types.yaml
+++ b/src/simd/_gen/simdgen/types.yaml
@@ -83,6 +83,9 @@ in: !repeat
  - {class: vreg, go: Int64x4,    base: "int",   elemBits: 128, bits: 256, lanes: 4}
  - {class: vreg, go: Uint64x4,   base: "uint",  elemBits: 128, bits: 256, lanes: 4}

+# Special for carryless multiply
+  - {class: vreg, go: Uint64x8,   base: "uint",  elemBits: 128, bits: 512, lanes: 8}
+
 # Special shapes just to make VAES(ENC|DEC)(LAST)?512 work.
 # The elemBits field of these shapes are wrong, it would be overwritten by overwriteElemBits.
  - {class: vreg, go: Int8x32,    base: "int",   elemBits: 128, bits: 512, lanes: 32}
--- a/src/simd/_gen/simdgen/xed.go
+++ b/src/simd/_gen/simdgen/xed.go
@@ -808,13 +808,14 @@ var cpuFeatureMap = map[cpuFeatureKey]string{
 	// the vector length suffix.

 	// AVX-512 extension features
-	{"AVX512EVEX", "AVX512_BITALG"}:    "AVX512BITALG",
-	{"AVX512EVEX", "AVX512_GFNI"}:      "AVX512GFNI",
-	{"AVX512EVEX", "AVX512_VBMI2"}:     "AVX512VBMI2",
-	{"AVX512EVEX", "AVX512_VBMI"}:      "AVX512VBMI",
-	{"AVX512EVEX", "AVX512_VNNI"}:      "AVX512VNNI",
-	{"AVX512EVEX", "AVX512_VPOPCNTDQ"}: "AVX512VPOPCNTDQ",
-	{"AVX512EVEX", "AVX512_VAES"}:      "AVX512VAES",
+	{"AVX512EVEX", "AVX512_BITALG"}:     "AVX512BITALG",
+	{"AVX512EVEX", "AVX512_GFNI"}:       "AVX512GFNI",
+	{"AVX512EVEX", "AVX512_VBMI2"}:      "AVX512VBMI2",
+	{"AVX512EVEX", "AVX512_VBMI"}:       "AVX512VBMI",
+	{"AVX512EVEX", "AVX512_VNNI"}:       "AVX512VNNI",
+	{"AVX512EVEX", "AVX512_VPOPCNTDQ"}:  "AVX512VPOPCNTDQ",
+	{"AVX512EVEX", "AVX512_VAES"}:       "AVX512VAES",
+	{"AVX512EVEX", "AVX512_VPCLMULQDQ"}: "AVX512VPCLMULQDQ",

 	// AVX 10.2 (not yet supported)
 	{"AVX512EVEX", "AVX10_2_RC"}: "ignore",
--- a/src/simd/cpu.go
+++ b/src/simd/cpu.go
@@ -95,6 +95,14 @@ func (X86Features) AVX512VNNI() bool {
 	return cpu.X86.HasAVX512VNNI
 }

+// AVX512VPCLMULQDQ returns whether the CPU supports the AVX512VPCLMULQDQ feature.
+//
+// AVX512VPCLMULQDQ is defined on all GOARCHes, but will only return true on
+// GOARCH amd64.
+func (X86Features) AVX512VPCLMULQDQ() bool {
+	return cpu.X86.HasAVX512VPCLMULQDQ
+}
+
 // AVX512VPOPCNTDQ returns whether the CPU supports the AVX512VPOPCNTDQ feature.
 //
 // AVX512VPOPCNTDQ is defined on all GOARCHes, but will only return true on
--- a/src/simd/internal/simd_test/simd_test.go
+++ b/src/simd/internal/simd_test/simd_test.go
@@ -1194,3 +1194,21 @@ func TestPermuteScalarsLoGrouped(t *testing.T) {
 	simd.LoadInt16x16Slice(x).PermuteScalarsLoGrouped(1, 2, 3, 0).StoreSlice(got)
 	checkSlices(t, got, want)
 }
+
+func TestClMul(t *testing.T) {
+	var x = simd.LoadUint64x2Slice([]uint64{1, 5})
+	var y = simd.LoadUint64x2Slice([]uint64{3, 9})
+
+	foo := func(v simd.Uint64x2, s []uint64) {
+		r := make([]uint64, 2, 2)
+		v.StoreSlice(r)
+		checkSlices[uint64](t, r, s)
+	}
+
+	foo(x.CarrylessMultiply(0, 0, y), []uint64{3, 0})
+	foo(x.CarrylessMultiply(0, 1, y), []uint64{9, 0})
+	foo(x.CarrylessMultiply(1, 0, y), []uint64{15, 0})
+	foo(x.CarrylessMultiply(1, 1, y), []uint64{45, 0})
+	foo(y.CarrylessMultiply(0, 0, y), []uint64{5, 0})
+
+}
--- a/src/simd/ops_internal_amd64.go
+++ b/src/simd/ops_internal_amd64.go
@@ -52,6 +52,44 @@ func (x Int32x16) blendMasked(y Int32x16, mask Mask32x16) Int32x16
 // Asm: VPBLENDMQ, CPU Feature: AVX512
 func (x Int64x8) blendMasked(y Int64x8, mask Mask64x8) Int64x8

+/* carrylessMultiply */
+
+// carrylessMultiply computes one of four possible Galois polynomial
+// products of selected high and low halves of x and y,
+// depending on the value of xyHiLo, returning the 128-bit
+// product in the concatenated two elements of the result.
+// Bit 0 selects the low (0) or high (1) element of x and
+// bit 4 selects the low (0x00) or high (0x10) element of y.
+//
+// xyHiLo results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPCLMULQDQ, CPU Feature: AVX
+func (x Uint64x2) carrylessMultiply(xyHiLo uint8, y Uint64x2) Uint64x2
+
+// carrylessMultiply computes one of four possible Galois polynomial
+// products of selected high and low halves of each of the two
+// 128-bit lanes of x and y, depending on the value of xyHiLo,
+// and returns the two 128-bit products in the result's lanes.
+// Bit 0 selects the low (0) or high (1) elements of x's lanes and
+// bit 4 selects the low (0x00) or high (0x10) elements of y's lanes.
+//
+// xyHiLo results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ
+func (x Uint64x4) carrylessMultiply(xyHiLo uint8, y Uint64x4) Uint64x4
+
+// carrylessMultiply computes one of four possible Galois polynomial
+// products of selected high and low halves of each of the four
+// 128-bit lanes of x and y, depending on the value of xyHiLo,
+// and returns the four 128-bit products in the result's lanes.
+// Bit 0 selects the low (0) or high (1) elements of x's lanes and
+// bit 4 selects the low (0x00) or high (0x10) elements of y's lanes.
+//
+// xyHiLo results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ
+func (x Uint64x8) carrylessMultiply(xyHiLo uint8, y Uint64x8) Uint64x8
+
 /* concatSelectedConstant */

 // concatSelectedConstant concatenates selected elements from x and y into the lower and upper
--- a/src/simd/shuffles_amd64.go
+++ b/src/simd/shuffles_amd64.go
@@ -1266,3 +1266,75 @@ func (x Uint16x16) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x16 {
 func (x Uint16x32) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x32 {
 	return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
 }
+
+// CarrylessMultiply computes one of four possible carryless
+// multiplications of selected high and low halves of x and y,
+// depending on the values of a and b, returning the 128-bit
+// product in the concatenated two elements of the result.
+// Only bit 0 of a and b is used: a selects the low (0) or high (1)
+// element of x and b selects the low (0) or high (1) element of y.
+//
+// A carryless multiplication uses bitwise XOR instead of
+// add-with-carry, for example (in base two):
+// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
+//
+// This also models multiplication of polynomials with coefficients
+// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
+// x**2 + 0x + 1 = x**2 + 1 modeled by 101.  (Note that "+" adds
+// polynomial terms, but coefficients "add" with XOR.)
+//
+// Constant values of a and b will result in better performance,
+// otherwise the intrinsic may translate into a jump table.
+//
+// Asm: VPCLMULQDQ, CPU Feature: AVX
+func (x Uint64x2) CarrylessMultiply(a, b uint8, y Uint64x2) Uint64x2 {
+	return x.carrylessMultiply(a&1+((b&1)<<4), y)
+}
+
+// CarrylessMultiplyGrouped computes one of four possible carryless
+// multiplications of selected high and low halves of each of the two
+// 128-bit lanes of x and y, depending on the values of a and b,
+// and returns the two 128-bit products in the result's lanes.
+// Only bit 0 of a and b is used: a selects the low (0) or high (1) elements
+// of x's lanes and b selects the low (0) or high (1) elements of y's lanes.
+//
+// A carryless multiplication uses bitwise XOR instead of
+// add-with-carry, for example (in base two):
+// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
+//
+// This also models multiplication of polynomials with coefficients
+// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
+// x**2 + 0x + 1 = x**2 + 1 modeled by 101.  (Note that "+" adds
+// polynomial terms, but coefficients "add" with XOR.)
+//
+// Constant values of a and b will result in better performance,
+// otherwise the intrinsic may translate into a jump table.
+//
+// Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ
+func (x Uint64x4) CarrylessMultiplyGrouped(a, b uint8, y Uint64x4) Uint64x4 {
+	return x.carrylessMultiply(a&1+((b&1)<<4), y)
+}
+
+// CarrylessMultiplyGrouped computes one of four possible carryless
+// multiplications of selected high and low halves of each of the four
+// 128-bit lanes of x and y, depending on the values of a and b,
+// and returns the four 128-bit products in the result's lanes.
+// Only bit 0 of a and b is used: a selects the low (0) or high (1) elements
+// of x's lanes and b selects the low (0) or high (1) elements of y's lanes.
+//
+// A carryless multiplication uses bitwise XOR instead of
+// add-with-carry, for example (in base two):
+// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
+//
+// This also models multiplication of polynomials with coefficients
+// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
+// x**2 + 0x + 1 = x**2 + 1 modeled by 101.  (Note that "+" adds
+// polynomial terms, but coefficients "add" with XOR.)
+//
+// Constant values of a and b will result in better performance,
+// otherwise the intrinsic may translate into a jump table.
+//
+// Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ
+func (x Uint64x8) CarrylessMultiplyGrouped(a, b uint8, y Uint64x8) Uint64x8 {
+	return x.carrylessMultiply(a&1+((b&1)<<4), y)
+}