From f532f87a9895afc60b6ab17969c67e33d1ed1564 Mon Sep 17 00:00:00 2001 From: Joel Sing Date: Mon, 3 Feb 2025 23:53:13 +1100 Subject: [PATCH] crypto/subtle: add vector implementation of xorBytes for riscv64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On a Banana Pi F3: │ subtle.1 │ subtle.2 │ │ sec/op │ sec/op vs base │ ConstantTimeSelect-8 4.391n ± 1% 4.390n ± 0% ~ (p=0.500 n=10) ConstantTimeByteEq-8 3.763n ± 0% 3.764n ± 0% ~ (p=0.549 n=10) ConstantTimeEq-8 3.767n ± 1% 3.764n ± 0% -0.08% (p=0.002 n=10) ConstantTimeLessOrEq-8 3.136n ± 0% 3.138n ± 0% +0.06% (p=0.002 n=10) XORBytes/8Bytes-8 53.42n ± 0% 52.28n ± 0% -2.13% (p=0.000 n=10) XORBytes/128Bytes-8 64.79n ± 0% 64.12n ± 0% -1.03% (p=0.000 n=10) XORBytes/2048Bytes-8 479.3n ± 0% 322.0n ± 0% -32.84% (p=0.000 n=10) XORBytes/8192Bytes-8 8.897µ ± 26% 7.734µ ± 12% ~ (p=0.165 n=10) XORBytes/32768Bytes-8 39.17µ ± 17% 35.40µ ± 24% -9.63% (p=0.029 n=10) XORBytesAlignment/8Bytes0Offset-8 51.74n ± 0% 54.18n ± 0% +4.72% (p=0.000 n=10) XORBytesAlignment/8Bytes1Offset-8 51.51n ± 1% 53.52n ± 0% +3.92% (p=0.000 n=10) XORBytesAlignment/8Bytes2Offset-8 51.35n ± 1% 53.58n ± 0% +4.34% (p=0.000 n=10) XORBytesAlignment/8Bytes3Offset-8 50.86n ± 0% 53.56n ± 0% +5.31% (p=0.000 n=10) XORBytesAlignment/8Bytes4Offset-8 51.62n ± 0% 54.20n ± 0% +4.98% (p=0.000 n=10) XORBytesAlignment/8Bytes5Offset-8 51.42n ± 1% 53.48n ± 0% +4.02% (p=0.000 n=10) XORBytesAlignment/8Bytes6Offset-8 51.08n ± 1% 53.46n ± 0% +4.67% (p=0.000 n=10) XORBytesAlignment/8Bytes7Offset-8 50.83n ± 0% 53.54n ± 0% +5.33% (p=0.000 n=10) XORBytesAlignment/128Bytes0Offset-8 63.67n ± 0% 66.04n ± 0% +3.72% (p=0.000 n=10) XORBytesAlignment/128Bytes1Offset-8 114.40n ± 0% 67.42n ± 0% -41.07% (p=0.000 n=10) XORBytesAlignment/128Bytes2Offset-8 113.85n ± 0% 67.43n ± 0% -40.78% (p=0.000 n=10) XORBytesAlignment/128Bytes3Offset-8 114.60n ± 0% 67.31n ± 0% -41.27% (p=0.000 n=10) XORBytesAlignment/128Bytes4Offset-8 109.30n ± 0% 67.45n ± 0% -38.29% (p=0.000 n=10) XORBytesAlignment/128Bytes5Offset-8 110.70n ± 0% 67.32n ± 1% -39.19% (p=0.000 n=10) XORBytesAlignment/128Bytes6Offset-8 110.05n ± 0% 67.45n ± 1% -38.71% (p=0.000 n=10) XORBytesAlignment/128Bytes7Offset-8 110.60n ± 0% 67.43n ± 0% -39.04% (p=0.000 n=10) XORBytesAlignment/2048Bytes0Offset-8 478.4n ± 0% 335.6n ± 0% -29.85% (p=0.000 n=10) XORBytesAlignment/2048Bytes1Offset-8 529.7n ± 0% 349.3n ± 0% -34.05% (p=0.000 n=10) XORBytesAlignment/2048Bytes2Offset-8 529.3n ± 0% 349.8n ± 0% -33.91% (p=0.000 n=10) XORBytesAlignment/2048Bytes3Offset-8 529.8n ± 0% 349.5n ± 0% -34.02% (p=0.000 n=10) XORBytesAlignment/2048Bytes4Offset-8 524.7n ± 0% 349.6n ± 0% -33.38% (p=0.000 n=10) XORBytesAlignment/2048Bytes5Offset-8 525.9n ± 0% 349.6n ± 0% -33.52% (p=0.000 n=10) XORBytesAlignment/2048Bytes6Offset-8 525.1n ± 0% 349.8n ± 0% -33.39% (p=0.000 n=10) XORBytesAlignment/2048Bytes7Offset-8 526.0n ± 0% 349.8n ± 0% -33.51% (p=0.000 n=10) geomean 120.0n 96.92n -19.23% │ subtle.1 │ subtle.2 │ │ B/s │ B/s vs base │ XORBytes/8Bytes-8 142.8Mi ± 0% 145.9Mi ± 0% +2.19% (p=0.000 n=10) XORBytes/128Bytes-8 1.840Gi ± 0% 1.859Gi ± 0% +1.05% (p=0.000 n=10) XORBytes/2048Bytes-8 3.979Gi ± 0% 5.925Gi ± 0% +48.89% (p=0.000 n=10) XORBytes/8192Bytes-8 879.1Mi ± 35% 1010.2Mi ± 13% ~ (p=0.165 n=10) XORBytes/32768Bytes-8 797.9Mi ± 21% 882.8Mi ± 31% +10.64% (p=0.029 n=10) XORBytesAlignment/8Bytes0Offset-8 147.5Mi ± 0% 140.8Mi ± 0% -4.50% (p=0.000 n=10) XORBytesAlignment/8Bytes1Offset-8 148.1Mi ± 1% 142.5Mi ± 0% -3.77% (p=0.000 n=10) XORBytesAlignment/8Bytes2Offset-8 148.6Mi ± 1% 142.4Mi ± 0% -4.15% (p=0.000 n=10) XORBytesAlignment/8Bytes3Offset-8 150.0Mi ± 0% 142.4Mi ± 0% -5.04% (p=0.000 n=10) XORBytesAlignment/8Bytes4Offset-8 147.8Mi ± 0% 140.8Mi ± 0% -4.75% (p=0.000 n=10) XORBytesAlignment/8Bytes5Offset-8 148.4Mi ± 1% 142.6Mi ± 0% -3.87% (p=0.000 n=10) XORBytesAlignment/8Bytes6Offset-8 149.4Mi ± 1% 142.7Mi ± 0% -4.45% (p=0.000 n=10) XORBytesAlignment/8Bytes7Offset-8 150.1Mi ± 0% 142.5Mi ± 0% -5.05% (p=0.000 n=10) XORBytesAlignment/128Bytes0Offset-8 1.872Gi ± 0% 1.805Gi ± 0% -3.59% (p=0.000 n=10) XORBytesAlignment/128Bytes1Offset-8 1.042Gi ± 0% 1.768Gi ± 0% +69.65% (p=0.000 n=10) XORBytesAlignment/128Bytes2Offset-8 1.047Gi ± 0% 1.768Gi ± 0% +68.80% (p=0.000 n=10) XORBytesAlignment/128Bytes3Offset-8 1.040Gi ± 0% 1.771Gi ± 0% +70.27% (p=0.000 n=10) XORBytesAlignment/128Bytes4Offset-8 1.090Gi ± 0% 1.767Gi ± 0% +62.08% (p=0.000 n=10) XORBytesAlignment/128Bytes5Offset-8 1.077Gi ± 0% 1.771Gi ± 1% +64.41% (p=0.000 n=10) XORBytesAlignment/128Bytes6Offset-8 1.083Gi ± 0% 1.767Gi ± 1% +63.17% (p=0.000 n=10) XORBytesAlignment/128Bytes7Offset-8 1.078Gi ± 0% 1.768Gi ± 0% +64.07% (p=0.000 n=10) XORBytesAlignment/2048Bytes0Offset-8 3.987Gi ± 0% 5.684Gi ± 0% +42.55% (p=0.000 n=10) XORBytesAlignment/2048Bytes1Offset-8 3.601Gi ± 0% 5.459Gi ± 0% +51.61% (p=0.000 n=10) XORBytesAlignment/2048Bytes2Offset-8 3.604Gi ± 0% 5.453Gi ± 0% +51.31% (p=0.000 n=10) XORBytesAlignment/2048Bytes3Offset-8 3.600Gi ± 0% 5.457Gi ± 0% +51.56% (p=0.000 n=10) XORBytesAlignment/2048Bytes4Offset-8 3.635Gi ± 0% 5.456Gi ± 0% +50.10% (p=0.000 n=10) XORBytesAlignment/2048Bytes5Offset-8 3.627Gi ± 0% 5.455Gi ± 0% +50.39% (p=0.000 n=10) XORBytesAlignment/2048Bytes6Offset-8 3.632Gi ± 0% 5.454Gi ± 0% +50.14% (p=0.000 n=10) XORBytesAlignment/2048Bytes7Offset-8 3.626Gi ± 0% 5.453Gi ± 0% +50.39% (p=0.000 n=10) geomean 881.0Mi 1.097Gi +27.51% Change-Id: Id7f9d87fe1ea39aa91ea7d3fd1ba20737f0dda3c Reviewed-on: https://go-review.googlesource.com/c/go/+/649657 Reviewed-by: Julian Zhu Reviewed-by: Carlos Amedee LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov --- src/crypto/internal/fips140/subtle/xor_asm.go | 2 +- .../internal/fips140/subtle/xor_riscv64.go | 18 ++++++++++ .../internal/fips140/subtle/xor_riscv64.s | 35 +++++++++++++++++-- src/crypto/internal/fips140deps/cpu/cpu.go | 2 ++ 4 files changed, 54 insertions(+), 3 deletions(-) create mode 100644 src/crypto/internal/fips140/subtle/xor_riscv64.go diff --git a/src/crypto/internal/fips140/subtle/xor_asm.go b/src/crypto/internal/fips140/subtle/xor_asm.go index bb85aefef40..d8dcb99ba4b 100644 --- a/src/crypto/internal/fips140/subtle/xor_asm.go +++ b/src/crypto/internal/fips140/subtle/xor_asm.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build (amd64 || arm64 || ppc64 || ppc64le || riscv64) && !purego +//go:build (amd64 || arm64 || ppc64 || ppc64le) && !purego package subtle diff --git a/src/crypto/internal/fips140/subtle/xor_riscv64.go b/src/crypto/internal/fips140/subtle/xor_riscv64.go new file mode 100644 index 00000000000..7bec9920d5e --- /dev/null +++ b/src/crypto/internal/fips140/subtle/xor_riscv64.go @@ -0,0 +1,18 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build riscv64 && !purego + +package subtle + +import ( + "crypto/internal/fips140deps/cpu" +) + +//go:noescape +func xorBytesRISCV64(dst, a, b *byte, n int, hasV bool) + +func xorBytes(dst, a, b *byte, n int) { + xorBytesRISCV64(dst, a, b, n, cpu.RISCV64HasV) +} diff --git a/src/crypto/internal/fips140/subtle/xor_riscv64.s b/src/crypto/internal/fips140/subtle/xor_riscv64.s index b5fa5dcef45..34331f94044 100644 --- a/src/crypto/internal/fips140/subtle/xor_riscv64.s +++ b/src/crypto/internal/fips140/subtle/xor_riscv64.s @@ -4,10 +4,12 @@ //go:build !purego +#include "asm_riscv64.h" +#include "go_asm.h" #include "textflag.h" -// func xorBytes(dst, a, b *byte, n int) -TEXT ·xorBytes(SB), NOSPLIT|NOFRAME, $0 +// func xorBytesRISCV64(dst, a, b *byte, n int, hasV bool) +TEXT ·xorBytesRISCV64(SB), NOSPLIT|NOFRAME, $0 MOV dst+0(FP), X10 MOV a+8(FP), X11 MOV b+16(FP), X12 @@ -16,6 +18,35 @@ TEXT ·xorBytes(SB), NOSPLIT|NOFRAME, $0 MOV $32, X15 BLT X13, X15, loop4_check +#ifndef hasV + MOVB hasV+32(FP), X5 + BEQZ X5, xorbytes_scalar +#endif + + // Use vector if not 8 byte aligned. + OR X10, X11, X5 + AND $7, X5 + BNEZ X5, vector_loop + + // Use scalar if 8 byte aligned and <= 64 bytes. + SUB $64, X12, X6 + BLEZ X6, loop64_check + + PCALIGN $16 +vector_loop: + VSETVLI X13, E8, M8, TU, MU, X15 + VLE8V (X11), V8 + VLE8V (X12), V16 + VXORVV V8, V16, V24 + VSE8V V24, (X10) + ADD X15, X10 + ADD X15, X11 + ADD X15, X12 + SUB X15, X13 + BNEZ X13, vector_loop + RET + +xorbytes_scalar: // Check alignment - if alignment differs we have to do one byte at a time. AND $7, X10, X5 AND $7, X11, X6 diff --git a/src/crypto/internal/fips140deps/cpu/cpu.go b/src/crypto/internal/fips140deps/cpu/cpu.go index 2dfcc1a4d4a..be75431a140 100644 --- a/src/crypto/internal/fips140deps/cpu/cpu.go +++ b/src/crypto/internal/fips140deps/cpu/cpu.go @@ -27,6 +27,8 @@ var ( LOONG64HasLSX = cpu.Loong64.HasLSX LOONG64HasLASX = cpu.Loong64.HasLASX + RISCV64HasV = cpu.RISCV64.HasV + S390XHasAES = cpu.S390X.HasAES S390XHasAESCBC = cpu.S390X.HasAESCBC S390XHasAESCTR = cpu.S390X.HasAESCTR