crypto/subtle: add vector implementation of xorBytes for riscv64

On a Banana Pi F3: │ subtle.1 │ subtle.2 │ │ sec/op │ sec/op vs base │ ConstantTimeSelect-8 4.391n ± 1% 4.390n ± 0% ~ (p=0.500 n=10) ConstantTimeByteEq-8 3.763n ± 0% 3.764n ± 0% ~ (p=0.549 n=10) ConstantTimeEq-8 3.767n ± 1% 3.764n ± 0% -0.08% (p=0.002 n=10) ConstantTimeLessOrEq-8 3.136n ± 0% 3.138n ± 0% +0.06% (p=0.002 n=10) XORBytes/8Bytes-8 53.42n ± 0% 52.28n ± 0% -2.13% (p=0.000 n=10) XORBytes/128Bytes-8 64.79n ± 0% 64.12n ± 0% -1.03% (p=0.000 n=10) XORBytes/2048Bytes-8 479.3n ± 0% 322.0n ± 0% -32.84% (p=0.000 n=10) XORBytes/8192Bytes-8 8.897µ ± 26% 7.734µ ± 12% ~ (p=0.165 n=10) XORBytes/32768Bytes-8 39.17µ ± 17% 35.40µ ± 24% -9.63% (p=0.029 n=10) XORBytesAlignment/8Bytes0Offset-8 51.74n ± 0% 54.18n ± 0% +4.72% (p=0.000 n=10) XORBytesAlignment/8Bytes1Offset-8 51.51n ± 1% 53.52n ± 0% +3.92% (p=0.000 n=10) XORBytesAlignment/8Bytes2Offset-8 51.35n ± 1% 53.58n ± 0% +4.34% (p=0.000 n=10) XORBytesAlignment/8Bytes3Offset-8 50.86n ± 0% 53.56n ± 0% +5.31% (p=0.000 n=10) XORBytesAlignment/8Bytes4Offset-8 51.62n ± 0% 54.20n ± 0% +4.98% (p=0.000 n=10) XORBytesAlignment/8Bytes5Offset-8 51.42n ± 1% 53.48n ± 0% +4.02% (p=0.000 n=10) XORBytesAlignment/8Bytes6Offset-8 51.08n ± 1% 53.46n ± 0% +4.67% (p=0.000 n=10) XORBytesAlignment/8Bytes7Offset-8 50.83n ± 0% 53.54n ± 0% +5.33% (p=0.000 n=10) XORBytesAlignment/128Bytes0Offset-8 63.67n ± 0% 66.04n ± 0% +3.72% (p=0.000 n=10) XORBytesAlignment/128Bytes1Offset-8 114.40n ± 0% 67.42n ± 0% -41.07% (p=0.000 n=10) XORBytesAlignment/128Bytes2Offset-8 113.85n ± 0% 67.43n ± 0% -40.78% (p=0.000 n=10) XORBytesAlignment/128Bytes3Offset-8 114.60n ± 0% 67.31n ± 0% -41.27% (p=0.000 n=10) XORBytesAlignment/128Bytes4Offset-8 109.30n ± 0% 67.45n ± 0% -38.29% (p=0.000 n=10) XORBytesAlignment/128Bytes5Offset-8 110.70n ± 0% 67.32n ± 1% -39.19% (p=0.000 n=10) XORBytesAlignment/128Bytes6Offset-8 110.05n ± 0% 67.45n ± 1% -38.71% (p=0.000 n=10) XORBytesAlignment/128Bytes7Offset-8 110.60n ± 0% 67.43n ± 0% -39.04% (p=0.000 n=10) XORBytesAlignment/2048Bytes0Offset-8 478.4n ± 0% 335.6n ± 0% -29.85% (p=0.000 n=10) XORBytesAlignment/2048Bytes1Offset-8 529.7n ± 0% 349.3n ± 0% -34.05% (p=0.000 n=10) XORBytesAlignment/2048Bytes2Offset-8 529.3n ± 0% 349.8n ± 0% -33.91% (p=0.000 n=10) XORBytesAlignment/2048Bytes3Offset-8 529.8n ± 0% 349.5n ± 0% -34.02% (p=0.000 n=10) XORBytesAlignment/2048Bytes4Offset-8 524.7n ± 0% 349.6n ± 0% -33.38% (p=0.000 n=10) XORBytesAlignment/2048Bytes5Offset-8 525.9n ± 0% 349.6n ± 0% -33.52% (p=0.000 n=10) XORBytesAlignment/2048Bytes6Offset-8 525.1n ± 0% 349.8n ± 0% -33.39% (p=0.000 n=10) XORBytesAlignment/2048Bytes7Offset-8 526.0n ± 0% 349.8n ± 0% -33.51% (p=0.000 n=10) geomean 120.0n 96.92n -19.23% │ subtle.1 │ subtle.2 │ │ B/s │ B/s vs base │ XORBytes/8Bytes-8 142.8Mi ± 0% 145.9Mi ± 0% +2.19% (p=0.000 n=10) XORBytes/128Bytes-8 1.840Gi ± 0% 1.859Gi ± 0% +1.05% (p=0.000 n=10) XORBytes/2048Bytes-8 3.979Gi ± 0% 5.925Gi ± 0% +48.89% (p=0.000 n=10) XORBytes/8192Bytes-8 879.1Mi ± 35% 1010.2Mi ± 13% ~ (p=0.165 n=10) XORBytes/32768Bytes-8 797.9Mi ± 21% 882.8Mi ± 31% +10.64% (p=0.029 n=10) XORBytesAlignment/8Bytes0Offset-8 147.5Mi ± 0% 140.8Mi ± 0% -4.50% (p=0.000 n=10) XORBytesAlignment/8Bytes1Offset-8 148.1Mi ± 1% 142.5Mi ± 0% -3.77% (p=0.000 n=10) XORBytesAlignment/8Bytes2Offset-8 148.6Mi ± 1% 142.4Mi ± 0% -4.15% (p=0.000 n=10) XORBytesAlignment/8Bytes3Offset-8 150.0Mi ± 0% 142.4Mi ± 0% -5.04% (p=0.000 n=10) XORBytesAlignment/8Bytes4Offset-8 147.8Mi ± 0% 140.8Mi ± 0% -4.75% (p=0.000 n=10) XORBytesAlignment/8Bytes5Offset-8 148.4Mi ± 1% 142.6Mi ± 0% -3.87% (p=0.000 n=10) XORBytesAlignment/8Bytes6Offset-8 149.4Mi ± 1% 142.7Mi ± 0% -4.45% (p=0.000 n=10) XORBytesAlignment/8Bytes7Offset-8 150.1Mi ± 0% 142.5Mi ± 0% -5.05% (p=0.000 n=10) XORBytesAlignment/128Bytes0Offset-8 1.872Gi ± 0% 1.805Gi ± 0% -3.59% (p=0.000 n=10) XORBytesAlignment/128Bytes1Offset-8 1.042Gi ± 0% 1.768Gi ± 0% +69.65% (p=0.000 n=10) XORBytesAlignment/128Bytes2Offset-8 1.047Gi ± 0% 1.768Gi ± 0% +68.80% (p=0.000 n=10) XORBytesAlignment/128Bytes3Offset-8 1.040Gi ± 0% 1.771Gi ± 0% +70.27% (p=0.000 n=10) XORBytesAlignment/128Bytes4Offset-8 1.090Gi ± 0% 1.767Gi ± 0% +62.08% (p=0.000 n=10) XORBytesAlignment/128Bytes5Offset-8 1.077Gi ± 0% 1.771Gi ± 1% +64.41% (p=0.000 n=10) XORBytesAlignment/128Bytes6Offset-8 1.083Gi ± 0% 1.767Gi ± 1% +63.17% (p=0.000 n=10) XORBytesAlignment/128Bytes7Offset-8 1.078Gi ± 0% 1.768Gi ± 0% +64.07% (p=0.000 n=10) XORBytesAlignment/2048Bytes0Offset-8 3.987Gi ± 0% 5.684Gi ± 0% +42.55% (p=0.000 n=10) XORBytesAlignment/2048Bytes1Offset-8 3.601Gi ± 0% 5.459Gi ± 0% +51.61% (p=0.000 n=10) XORBytesAlignment/2048Bytes2Offset-8 3.604Gi ± 0% 5.453Gi ± 0% +51.31% (p=0.000 n=10) XORBytesAlignment/2048Bytes3Offset-8 3.600Gi ± 0% 5.457Gi ± 0% +51.56% (p=0.000 n=10) XORBytesAlignment/2048Bytes4Offset-8 3.635Gi ± 0% 5.456Gi ± 0% +50.10% (p=0.000 n=10) XORBytesAlignment/2048Bytes5Offset-8 3.627Gi ± 0% 5.455Gi ± 0% +50.39% (p=0.000 n=10) XORBytesAlignment/2048Bytes6Offset-8 3.632Gi ± 0% 5.454Gi ± 0% +50.14% (p=0.000 n=10) XORBytesAlignment/2048Bytes7Offset-8 3.626Gi ± 0% 5.453Gi ± 0% +50.39% (p=0.000 n=10) geomean 881.0Mi 1.097Gi +27.51% Change-Id: Id7f9d87fe1ea39aa91ea7d3fd1ba20737f0dda3c Reviewed-on: https://go-review.googlesource.com/c/go/+/649657 Reviewed-by: Julian Zhu <jz531210@gmail.com> Reviewed-by: Carlos Amedee <carlos@golang.org> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
2026-01-29 07:02:05 +03:00 · 2025-02-03 23:53:13 +11:00
parent cf0c42c2ca
commit f532f87a98
4 changed files with 54 additions and 3 deletions
--- a/src/crypto/internal/fips140/subtle/xor_asm.go
+++ b/src/crypto/internal/fips140/subtle/xor_asm.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-//go:build (amd64 || arm64 || ppc64 || ppc64le || riscv64) && !purego
+//go:build (amd64 || arm64 || ppc64 || ppc64le) && !purego

 package subtle

--- a/src/crypto/internal/fips140/subtle/xor_riscv64.go
+++ b/src/crypto/internal/fips140/subtle/xor_riscv64.go
@@ -0,0 +1,18 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build riscv64 && !purego
+
+package subtle
+
+import (
+	"crypto/internal/fips140deps/cpu"
+)
+
+//go:noescape
+func xorBytesRISCV64(dst, a, b *byte, n int, hasV bool)
+
+func xorBytes(dst, a, b *byte, n int) {
+	xorBytesRISCV64(dst, a, b, n, cpu.RISCV64HasV)
+}
--- a/src/crypto/internal/fips140/subtle/xor_riscv64.s
+++ b/src/crypto/internal/fips140/subtle/xor_riscv64.s
@@ -4,10 +4,12 @@

 //go:build !purego

+#include "asm_riscv64.h"
+#include "go_asm.h"
 #include "textflag.h"

-// func xorBytes(dst, a, b *byte, n int)
-TEXT ·xorBytes(SB), NOSPLIT|NOFRAME, $0
+// func xorBytesRISCV64(dst, a, b *byte, n int, hasV bool)
+TEXT ·xorBytesRISCV64(SB), NOSPLIT|NOFRAME, $0
 	MOV	dst+0(FP), X10
 	MOV	a+8(FP), X11
 	MOV	b+16(FP), X12
@@ -16,6 +18,35 @@ TEXT ·xorBytes(SB), NOSPLIT|NOFRAME, $0
 	MOV	$32, X15
 	BLT	X13, X15, loop4_check

+#ifndef hasV
+	MOVB	hasV+32(FP), X5
+	BEQZ	X5, xorbytes_scalar
+#endif
+
+	// Use vector if not 8 byte aligned.
+	OR	X10, X11, X5
+	AND	$7, X5
+	BNEZ	X5, vector_loop
+
+	// Use scalar if 8 byte aligned and <= 64 bytes.
+	SUB	$64, X12, X6
+	BLEZ	X6, loop64_check
+
+	PCALIGN	$16
+vector_loop:
+	VSETVLI X13, E8, M8, TU, MU, X15
+	VLE8V	(X11), V8
+	VLE8V	(X12), V16
+	VXORVV	V8, V16, V24
+	VSE8V	V24, (X10)
+	ADD	X15, X10
+	ADD	X15, X11
+	ADD	X15, X12
+	SUB	X15, X13
+	BNEZ	X13, vector_loop
+	RET
+
+xorbytes_scalar:
 	// Check alignment - if alignment differs we have to do one byte at a time.
 	AND	$7, X10, X5
 	AND	$7, X11, X6
--- a/src/crypto/internal/fips140deps/cpu/cpu.go
+++ b/src/crypto/internal/fips140deps/cpu/cpu.go
@@ -27,6 +27,8 @@ var (
 	LOONG64HasLSX  = cpu.Loong64.HasLSX
 	LOONG64HasLASX = cpu.Loong64.HasLASX

+	RISCV64HasV = cpu.RISCV64.HasV
+
 	S390XHasAES    = cpu.S390X.HasAES
 	S390XHasAESCBC = cpu.S390X.HasAESCBC
 	S390XHasAESCTR = cpu.S390X.HasAESCTR