Compare commits

...

2 Commits

Author SHA1 Message Date
Xiaolin Zhao
7f0f671951 math: optimize the floating-point pipeline on loong64
Use the FSEL instruction on loong64 to eliminate branches and reduce
pipeline interruptions.

On the Loongson 3A6000 CPU, the math/big Exp benchmarks improve by about 0.09% (geomean), as follows:
goos: linux
goarch: loong64
pkg: math/big
cpu: Loongson-3A6000-HV @ 2500.00MHz
        │  old.bench  │             new.bench              │
        │   sec/op    │   sec/op     vs base               │
Exp       7.748m ± 0%   7.740m ± 0%  -0.10% (p=0.001 n=10)
Exp2      7.747m ± 0%   7.741m ± 0%  -0.09% (p=0.002 n=10)
geomean   7.747m        7.740m       -0.09%

Change-Id: If62f2e81bf345c83a1fa9350ace131240cfa3b9b
Reviewed-on: https://go-review.googlesource.com/c/go/+/693458
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Meidan Li <limeidan@loongson.cn>
2026-01-28 17:01:37 -08:00
David Chase
985b0b3fe2 simd/archsimd: add missing cpufeature to generated mask/merge methods
Change-Id: I34678f4ef17fe1b8b7657a2c3d39685b4a5951f2
Reviewed-on: https://go-review.googlesource.com/c/go/+/739981
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
2026-01-28 13:41:19 -08:00
3 changed files with 132 additions and 12 deletions

View File

@@ -62,13 +62,9 @@ TEXT ·archExp(SB),$0-16
MOVD 8(R10), F3
MOVD 48(R10), F2
CMPGTD F0, F5, FCC0
BFPT add // x > 0
sub:
FMSUBD F3, F2, F0, F3 // Log2e*x - 0.5
JMP 2(PC)
add:
FMSUBD F3, F2, F0, F4 // Log2e*x - 0.5
FMADDD F3, F2, F0, F3 // Log2e*x + 0.5
FSEL FCC0, F3, F4, F3
FTINTRZVD F3, F4 // float64 -> int64
MOVV F4, R5 // R5 = int(k)
FFINTDV F4, F3 // int64 -> float64
@@ -162,13 +158,9 @@ TEXT ·archExp2(SB),$0-16
MOVD 0(R10), F10
MOVD 8(R10), F2
CMPGTD F0, F10, FCC0
BFPT add
sub:
SUBD F2, F0, F3 // x - 0.5
JMP 2(PC)
add:
SUBD F2, F0, F4 // x - 0.5
ADDD F2, F0, F3 // x + 0.5
FSEL FCC0, F3, F4, F3
FTINTRZVD F3, F4
MOVV F4, R5
FFINTDV F4, F3

View File

@@ -801,6 +801,8 @@ func pa{{.VType}}(s []{{.Etype}}) *[{{.Count}}]{{.Etype}} {
var avx2MaskedTemplate = shapedTemplateOf(avx2Shapes, "avx2 .Masked methods", `
// Masked returns x but with elements zeroed where mask is false.
//
// Emulated, CPU Feature: {{.CPUfeature}}
func (x {{.VType}}) Masked(mask Mask{{.WxC}}) {{.VType}} {
im := mask.ToInt{{.WxC}}()
{{- if eq .Base "Int" }}
@@ -811,6 +813,8 @@ func (x {{.VType}}) Masked(mask Mask{{.WxC}}) {{.VType}} {
}
// Merge returns x but with elements set to y where mask is false.
//
// Emulated, CPU Feature: {{.CPUfeature}}
func (x {{.VType}}) Merge(y {{.VType}}, mask Mask{{.WxC}}) {{.VType}} {
{{- if eq .BxC .WxC -}}
im := mask.ToInt{{.BxC}}()
@@ -830,6 +834,8 @@ func (x {{.VType}}) Merge(y {{.VType}}, mask Mask{{.WxC}}) {{.VType}} {
// TODO perhaps write these in ways that work better on AVX512
var avx512MaskedTemplate = shapedTemplateOf(avx512Shapes, "avx512 .Masked methods", `
// Masked returns x but with elements zeroed where mask is false.
//
// Emulated, CPU Feature: AVX512
func (x {{.VType}}) Masked(mask Mask{{.WxC}}) {{.VType}} {
im := mask.ToInt{{.WxC}}()
{{- if eq .Base "Int" }}
@@ -840,6 +846,8 @@ func (x {{.VType}}) Masked(mask Mask{{.WxC}}) {{.VType}} {
}
// Merge returns x but with elements set to y where mask is false.
//
// Emulated, CPU Feature: AVX512
func (x {{.VType}}) Merge(y {{.VType}}, mask Mask{{.WxC}}) {{.VType}} {
{{- if eq .Base "Int" }}
return y.blendMasked(x, mask)

View File

@@ -5,24 +5,32 @@
package archsimd
// Masked returns x but with elements zeroed where mask is false.
//
// Emulated, CPU Feature: AVX
func (x Int8x16) Masked(mask Mask8x16) Int8x16 {
im := mask.ToInt8x16()
return im.And(x)
}
// Merge returns x but with elements set to y where mask is false.
//
// Emulated, CPU Feature: AVX
func (x Int8x16) Merge(y Int8x16, mask Mask8x16) Int8x16 {
im := mask.ToInt8x16()
return y.blend(x, im)
}
// Masked returns x but with elements zeroed where mask is false.
//
// Emulated, CPU Feature: AVX
func (x Int16x8) Masked(mask Mask16x8) Int16x8 {
im := mask.ToInt16x8()
return im.And(x)
}
// Merge returns x but with elements set to y where mask is false.
//
// Emulated, CPU Feature: AVX
func (x Int16x8) Merge(y Int16x8, mask Mask16x8) Int16x8 {
im := mask.ToInt16x8().AsInt8x16()
ix := x.AsInt8x16()
@@ -31,12 +39,16 @@ func (x Int16x8) Merge(y Int16x8, mask Mask16x8) Int16x8 {
}
// Masked returns x but with elements zeroed where mask is false.
//
// Emulated, CPU Feature: AVX
func (x Int32x4) Masked(mask Mask32x4) Int32x4 {
im := mask.ToInt32x4()
return im.And(x)
}
// Merge returns x but with elements set to y where mask is false.
//
// Emulated, CPU Feature: AVX
func (x Int32x4) Merge(y Int32x4, mask Mask32x4) Int32x4 {
im := mask.ToInt32x4().AsInt8x16()
ix := x.AsInt8x16()
@@ -45,12 +57,16 @@ func (x Int32x4) Merge(y Int32x4, mask Mask32x4) Int32x4 {
}
// Masked returns x but with elements zeroed where mask is false.
//
// Emulated, CPU Feature: AVX
func (x Int64x2) Masked(mask Mask64x2) Int64x2 {
im := mask.ToInt64x2()
return im.And(x)
}
// Merge returns x but with elements set to y where mask is false.
//
// Emulated, CPU Feature: AVX
func (x Int64x2) Merge(y Int64x2, mask Mask64x2) Int64x2 {
im := mask.ToInt64x2().AsInt8x16()
ix := x.AsInt8x16()
@@ -59,12 +75,16 @@ func (x Int64x2) Merge(y Int64x2, mask Mask64x2) Int64x2 {
}
// Masked returns x but with elements zeroed where mask is false.
//
// Emulated, CPU Feature: AVX
func (x Uint8x16) Masked(mask Mask8x16) Uint8x16 {
im := mask.ToInt8x16()
return x.AsInt8x16().And(im).AsUint8x16()
}
// Merge returns x but with elements set to y where mask is false.
//
// Emulated, CPU Feature: AVX
func (x Uint8x16) Merge(y Uint8x16, mask Mask8x16) Uint8x16 {
im := mask.ToInt8x16()
ix := x.AsInt8x16()
@@ -73,12 +93,16 @@ func (x Uint8x16) Merge(y Uint8x16, mask Mask8x16) Uint8x16 {
}
// Masked returns x but with elements zeroed where mask is false.
//
// Emulated, CPU Feature: AVX
func (x Uint16x8) Masked(mask Mask16x8) Uint16x8 {
im := mask.ToInt16x8()
return x.AsInt16x8().And(im).AsUint16x8()
}
// Merge returns x but with elements set to y where mask is false.
//
// Emulated, CPU Feature: AVX
func (x Uint16x8) Merge(y Uint16x8, mask Mask16x8) Uint16x8 {
im := mask.ToInt16x8().AsInt8x16()
ix := x.AsInt8x16()
@@ -87,12 +111,16 @@ func (x Uint16x8) Merge(y Uint16x8, mask Mask16x8) Uint16x8 {
}
// Masked returns x but with elements zeroed where mask is false.
//
// Emulated, CPU Feature: AVX
func (x Uint32x4) Masked(mask Mask32x4) Uint32x4 {
im := mask.ToInt32x4()
return x.AsInt32x4().And(im).AsUint32x4()
}
// Merge returns x but with elements set to y where mask is false.
//
// Emulated, CPU Feature: AVX
func (x Uint32x4) Merge(y Uint32x4, mask Mask32x4) Uint32x4 {
im := mask.ToInt32x4().AsInt8x16()
ix := x.AsInt8x16()
@@ -101,12 +129,16 @@ func (x Uint32x4) Merge(y Uint32x4, mask Mask32x4) Uint32x4 {
}
// Masked returns x but with elements zeroed where mask is false.
//
// Emulated, CPU Feature: AVX
func (x Uint64x2) Masked(mask Mask64x2) Uint64x2 {
im := mask.ToInt64x2()
return x.AsInt64x2().And(im).AsUint64x2()
}
// Merge returns x but with elements set to y where mask is false.
//
// Emulated, CPU Feature: AVX
func (x Uint64x2) Merge(y Uint64x2, mask Mask64x2) Uint64x2 {
im := mask.ToInt64x2().AsInt8x16()
ix := x.AsInt8x16()
@@ -115,12 +147,16 @@ func (x Uint64x2) Merge(y Uint64x2, mask Mask64x2) Uint64x2 {
}
// Masked returns x but with elements zeroed where mask is false.
//
// Emulated, CPU Feature: AVX
func (x Float32x4) Masked(mask Mask32x4) Float32x4 {
im := mask.ToInt32x4()
return x.AsInt32x4().And(im).AsFloat32x4()
}
// Merge returns x but with elements set to y where mask is false.
//
// Emulated, CPU Feature: AVX
func (x Float32x4) Merge(y Float32x4, mask Mask32x4) Float32x4 {
im := mask.ToInt32x4().AsInt8x16()
ix := x.AsInt8x16()
@@ -129,12 +165,16 @@ func (x Float32x4) Merge(y Float32x4, mask Mask32x4) Float32x4 {
}
// Masked returns x but with elements zeroed where mask is false.
//
// Emulated, CPU Feature: AVX
func (x Float64x2) Masked(mask Mask64x2) Float64x2 {
im := mask.ToInt64x2()
return x.AsInt64x2().And(im).AsFloat64x2()
}
// Merge returns x but with elements set to y where mask is false.
//
// Emulated, CPU Feature: AVX
func (x Float64x2) Merge(y Float64x2, mask Mask64x2) Float64x2 {
im := mask.ToInt64x2().AsInt8x16()
ix := x.AsInt8x16()
@@ -143,24 +183,32 @@ func (x Float64x2) Merge(y Float64x2, mask Mask64x2) Float64x2 {
}
// Masked returns x but with elements zeroed where mask is false.
//
// Emulated, CPU Feature: AVX2
func (x Int8x32) Masked(mask Mask8x32) Int8x32 {
im := mask.ToInt8x32()
return im.And(x)
}
// Merge returns x but with elements set to y where mask is false.
//
// Emulated, CPU Feature: AVX2
func (x Int8x32) Merge(y Int8x32, mask Mask8x32) Int8x32 {
im := mask.ToInt8x32()
return y.blend(x, im)
}
// Masked returns x but with elements zeroed where mask is false.
//
// Emulated, CPU Feature: AVX2
func (x Int16x16) Masked(mask Mask16x16) Int16x16 {
im := mask.ToInt16x16()
return im.And(x)
}
// Merge returns x but with elements set to y where mask is false.
//
// Emulated, CPU Feature: AVX2
func (x Int16x16) Merge(y Int16x16, mask Mask16x16) Int16x16 {
im := mask.ToInt16x16().AsInt8x32()
ix := x.AsInt8x32()
@@ -169,12 +217,16 @@ func (x Int16x16) Merge(y Int16x16, mask Mask16x16) Int16x16 {
}
// Masked returns x but with elements zeroed where mask is false.
//
// Emulated, CPU Feature: AVX2
func (x Int32x8) Masked(mask Mask32x8) Int32x8 {
im := mask.ToInt32x8()
return im.And(x)
}
// Merge returns x but with elements set to y where mask is false.
//
// Emulated, CPU Feature: AVX2
func (x Int32x8) Merge(y Int32x8, mask Mask32x8) Int32x8 {
im := mask.ToInt32x8().AsInt8x32()
ix := x.AsInt8x32()
@@ -183,12 +235,16 @@ func (x Int32x8) Merge(y Int32x8, mask Mask32x8) Int32x8 {
}
// Masked returns x but with elements zeroed where mask is false.
//
// Emulated, CPU Feature: AVX2
func (x Int64x4) Masked(mask Mask64x4) Int64x4 {
im := mask.ToInt64x4()
return im.And(x)
}
// Merge returns x but with elements set to y where mask is false.
//
// Emulated, CPU Feature: AVX2
func (x Int64x4) Merge(y Int64x4, mask Mask64x4) Int64x4 {
im := mask.ToInt64x4().AsInt8x32()
ix := x.AsInt8x32()
@@ -197,12 +253,16 @@ func (x Int64x4) Merge(y Int64x4, mask Mask64x4) Int64x4 {
}
// Masked returns x but with elements zeroed where mask is false.
//
// Emulated, CPU Feature: AVX2
func (x Uint8x32) Masked(mask Mask8x32) Uint8x32 {
im := mask.ToInt8x32()
return x.AsInt8x32().And(im).AsUint8x32()
}
// Merge returns x but with elements set to y where mask is false.
//
// Emulated, CPU Feature: AVX2
func (x Uint8x32) Merge(y Uint8x32, mask Mask8x32) Uint8x32 {
im := mask.ToInt8x32()
ix := x.AsInt8x32()
@@ -211,12 +271,16 @@ func (x Uint8x32) Merge(y Uint8x32, mask Mask8x32) Uint8x32 {
}
// Masked returns x but with elements zeroed where mask is false.
//
// Emulated, CPU Feature: AVX2
func (x Uint16x16) Masked(mask Mask16x16) Uint16x16 {
im := mask.ToInt16x16()
return x.AsInt16x16().And(im).AsUint16x16()
}
// Merge returns x but with elements set to y where mask is false.
//
// Emulated, CPU Feature: AVX2
func (x Uint16x16) Merge(y Uint16x16, mask Mask16x16) Uint16x16 {
im := mask.ToInt16x16().AsInt8x32()
ix := x.AsInt8x32()
@@ -225,12 +289,16 @@ func (x Uint16x16) Merge(y Uint16x16, mask Mask16x16) Uint16x16 {
}
// Masked returns x but with elements zeroed where mask is false.
//
// Emulated, CPU Feature: AVX2
func (x Uint32x8) Masked(mask Mask32x8) Uint32x8 {
im := mask.ToInt32x8()
return x.AsInt32x8().And(im).AsUint32x8()
}
// Merge returns x but with elements set to y where mask is false.
//
// Emulated, CPU Feature: AVX2
func (x Uint32x8) Merge(y Uint32x8, mask Mask32x8) Uint32x8 {
im := mask.ToInt32x8().AsInt8x32()
ix := x.AsInt8x32()
@@ -239,12 +307,16 @@ func (x Uint32x8) Merge(y Uint32x8, mask Mask32x8) Uint32x8 {
}
// Masked returns x but with elements zeroed where mask is false.
//
// Emulated, CPU Feature: AVX2
func (x Uint64x4) Masked(mask Mask64x4) Uint64x4 {
im := mask.ToInt64x4()
return x.AsInt64x4().And(im).AsUint64x4()
}
// Merge returns x but with elements set to y where mask is false.
//
// Emulated, CPU Feature: AVX2
func (x Uint64x4) Merge(y Uint64x4, mask Mask64x4) Uint64x4 {
im := mask.ToInt64x4().AsInt8x32()
ix := x.AsInt8x32()
@@ -253,12 +325,16 @@ func (x Uint64x4) Merge(y Uint64x4, mask Mask64x4) Uint64x4 {
}
// Masked returns x but with elements zeroed where mask is false.
//
// Emulated, CPU Feature: AVX2
func (x Float32x8) Masked(mask Mask32x8) Float32x8 {
im := mask.ToInt32x8()
return x.AsInt32x8().And(im).AsFloat32x8()
}
// Merge returns x but with elements set to y where mask is false.
//
// Emulated, CPU Feature: AVX2
func (x Float32x8) Merge(y Float32x8, mask Mask32x8) Float32x8 {
im := mask.ToInt32x8().AsInt8x32()
ix := x.AsInt8x32()
@@ -267,12 +343,16 @@ func (x Float32x8) Merge(y Float32x8, mask Mask32x8) Float32x8 {
}
// Masked returns x but with elements zeroed where mask is false.
//
// Emulated, CPU Feature: AVX2
func (x Float64x4) Masked(mask Mask64x4) Float64x4 {
im := mask.ToInt64x4()
return x.AsInt64x4().And(im).AsFloat64x4()
}
// Merge returns x but with elements set to y where mask is false.
//
// Emulated, CPU Feature: AVX2
func (x Float64x4) Merge(y Float64x4, mask Mask64x4) Float64x4 {
im := mask.ToInt64x4().AsInt8x32()
ix := x.AsInt8x32()
@@ -281,56 +361,76 @@ func (x Float64x4) Merge(y Float64x4, mask Mask64x4) Float64x4 {
}
// Masked returns x but with elements zeroed where mask is false.
//
// Emulated, CPU Feature: AVX512
func (x Int8x64) Masked(mask Mask8x64) Int8x64 {
im := mask.ToInt8x64()
return im.And(x)
}
// Merge returns x but with elements set to y where mask is false.
//
// Emulated, CPU Feature: AVX512
func (x Int8x64) Merge(y Int8x64, mask Mask8x64) Int8x64 {
return y.blendMasked(x, mask)
}
// Masked returns x but with elements zeroed where mask is false.
//
// Emulated, CPU Feature: AVX512
func (x Int16x32) Masked(mask Mask16x32) Int16x32 {
im := mask.ToInt16x32()
return im.And(x)
}
// Merge returns x but with elements set to y where mask is false.
//
// Emulated, CPU Feature: AVX512
func (x Int16x32) Merge(y Int16x32, mask Mask16x32) Int16x32 {
return y.blendMasked(x, mask)
}
// Masked returns x but with elements zeroed where mask is false.
//
// Emulated, CPU Feature: AVX512
func (x Int32x16) Masked(mask Mask32x16) Int32x16 {
im := mask.ToInt32x16()
return im.And(x)
}
// Merge returns x but with elements set to y where mask is false.
//
// Emulated, CPU Feature: AVX512
func (x Int32x16) Merge(y Int32x16, mask Mask32x16) Int32x16 {
return y.blendMasked(x, mask)
}
// Masked returns x but with elements zeroed where mask is false.
//
// Emulated, CPU Feature: AVX512
func (x Int64x8) Masked(mask Mask64x8) Int64x8 {
im := mask.ToInt64x8()
return im.And(x)
}
// Merge returns x but with elements set to y where mask is false.
//
// Emulated, CPU Feature: AVX512
func (x Int64x8) Merge(y Int64x8, mask Mask64x8) Int64x8 {
return y.blendMasked(x, mask)
}
// Masked returns x but with elements zeroed where mask is false.
//
// Emulated, CPU Feature: AVX512
func (x Uint8x64) Masked(mask Mask8x64) Uint8x64 {
im := mask.ToInt8x64()
return x.AsInt8x64().And(im).AsUint8x64()
}
// Merge returns x but with elements set to y where mask is false.
//
// Emulated, CPU Feature: AVX512
func (x Uint8x64) Merge(y Uint8x64, mask Mask8x64) Uint8x64 {
ix := x.AsInt8x64()
iy := y.AsInt8x64()
@@ -338,12 +438,16 @@ func (x Uint8x64) Merge(y Uint8x64, mask Mask8x64) Uint8x64 {
}
// Masked returns x but with elements zeroed where mask is false.
//
// Emulated, CPU Feature: AVX512
func (x Uint16x32) Masked(mask Mask16x32) Uint16x32 {
im := mask.ToInt16x32()
return x.AsInt16x32().And(im).AsUint16x32()
}
// Merge returns x but with elements set to y where mask is false.
//
// Emulated, CPU Feature: AVX512
func (x Uint16x32) Merge(y Uint16x32, mask Mask16x32) Uint16x32 {
ix := x.AsInt16x32()
iy := y.AsInt16x32()
@@ -351,12 +455,16 @@ func (x Uint16x32) Merge(y Uint16x32, mask Mask16x32) Uint16x32 {
}
// Masked returns x but with elements zeroed where mask is false.
//
// Emulated, CPU Feature: AVX512
func (x Uint32x16) Masked(mask Mask32x16) Uint32x16 {
im := mask.ToInt32x16()
return x.AsInt32x16().And(im).AsUint32x16()
}
// Merge returns x but with elements set to y where mask is false.
//
// Emulated, CPU Feature: AVX512
func (x Uint32x16) Merge(y Uint32x16, mask Mask32x16) Uint32x16 {
ix := x.AsInt32x16()
iy := y.AsInt32x16()
@@ -364,12 +472,16 @@ func (x Uint32x16) Merge(y Uint32x16, mask Mask32x16) Uint32x16 {
}
// Masked returns x but with elements zeroed where mask is false.
//
// Emulated, CPU Feature: AVX512
func (x Uint64x8) Masked(mask Mask64x8) Uint64x8 {
im := mask.ToInt64x8()
return x.AsInt64x8().And(im).AsUint64x8()
}
// Merge returns x but with elements set to y where mask is false.
//
// Emulated, CPU Feature: AVX512
func (x Uint64x8) Merge(y Uint64x8, mask Mask64x8) Uint64x8 {
ix := x.AsInt64x8()
iy := y.AsInt64x8()
@@ -377,12 +489,16 @@ func (x Uint64x8) Merge(y Uint64x8, mask Mask64x8) Uint64x8 {
}
// Masked returns x but with elements zeroed where mask is false.
//
// Emulated, CPU Feature: AVX512
func (x Float32x16) Masked(mask Mask32x16) Float32x16 {
im := mask.ToInt32x16()
return x.AsInt32x16().And(im).AsFloat32x16()
}
// Merge returns x but with elements set to y where mask is false.
//
// Emulated, CPU Feature: AVX512
func (x Float32x16) Merge(y Float32x16, mask Mask32x16) Float32x16 {
ix := x.AsInt32x16()
iy := y.AsInt32x16()
@@ -390,12 +506,16 @@ func (x Float32x16) Merge(y Float32x16, mask Mask32x16) Float32x16 {
}
// Masked returns x but with elements zeroed where mask is false.
//
// Emulated, CPU Feature: AVX512
func (x Float64x8) Masked(mask Mask64x8) Float64x8 {
im := mask.ToInt64x8()
return x.AsInt64x8().And(im).AsFloat64x8()
}
// Merge returns x but with elements set to y where mask is false.
//
// Emulated, CPU Feature: AVX512
func (x Float64x8) Merge(y Float64x8, mask Mask64x8) Float64x8 {
ix := x.AsInt64x8()
iy := y.AsInt64x8()