From 7f0f67195194cb07122315d5ab563eb617dbe21a Mon Sep 17 00:00:00 2001 From: Xiaolin Zhao Date: Wed, 6 Aug 2025 11:34:12 +0800 Subject: [PATCH] math: optimize the floating-point pipeline on loong64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the FSEL instruction on loong64 to eliminate branches and reduce pipeline interruptions. On the Loongson 3A6000 CPU, this yields a 0.09% performance improvement (geomean), as follows: goos: linux goarch: loong64 pkg: math/big cpu: Loongson-3A6000-HV @ 2500.00MHz │ old.bench │ new.bench │ │ sec/op │ sec/op vs base │ Exp 7.748m ± 0% 7.740m ± 0% -0.10% (p=0.001 n=10) Exp2 7.747m ± 0% 7.741m ± 0% -0.09% (p=0.002 n=10) geomean 7.747m 7.740m -0.09% Change-Id: If62f2e81bf345c83a1fa9350ace131240cfa3b9b Reviewed-on: https://go-review.googlesource.com/c/go/+/693458 Reviewed-by: Dmitri Shuralyov Reviewed-by: Cherry Mui Reviewed-by: abner chenc LUCI-TryBot-Result: Go LUCI Reviewed-by: Meidan Li --- src/math/exp_loong64.s | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/src/math/exp_loong64.s b/src/math/exp_loong64.s index bf2823f8885..8420a2254e1 100644 --- a/src/math/exp_loong64.s +++ b/src/math/exp_loong64.s @@ -62,13 +62,9 @@ TEXT ·archExp(SB),$0-16 MOVD 8(R10), F3 MOVD 48(R10), F2 CMPGTD F0, F5, FCC0 - BFPT add // x > 0 -sub: - FMSUBD F3, F2, F0, F3 // Log2e*x - 0.5 - JMP 2(PC) -add: + FMSUBD F3, F2, F0, F4 // Log2e*x - 0.5 FMADDD F3, F2, F0, F3 // Log2e*x + 0.5 - + FSEL FCC0, F3, F4, F3 FTINTRZVD F3, F4 // float64 -> int64 MOVV F4, R5 // R5 = int(k) FFINTDV F4, F3 // int64 -> float64 @@ -162,13 +158,9 @@ TEXT ·archExp2(SB),$0-16 MOVD 0(R10), F10 MOVD 8(R10), F2 CMPGTD F0, F10, FCC0 - BFPT add -sub: - SUBD F2, F0, F3 // x - 0.5 - JMP 2(PC) -add: + SUBD F2, F0, F4 // x - 0.5 ADDD F2, F0, F3 // x + 0.5 - + FSEL FCC0, F3, F4, F3 FTINTRZVD F3, F4 MOVV F4, R5 FFINTDV F4, F3