crypto/subtle: add vector implementation of xorBytes for riscv64

On a Banana Pi F3:

                                     │   subtle.1    │               subtle.2               │
                                     │    sec/op     │    sec/op     vs base                │
ConstantTimeSelect-8                    4.391n ±  1%   4.390n ±  0%        ~ (p=0.500 n=10)
ConstantTimeByteEq-8                    3.763n ±  0%   3.764n ±  0%        ~ (p=0.549 n=10)
ConstantTimeEq-8                        3.767n ±  1%   3.764n ±  0%   -0.08% (p=0.002 n=10)
ConstantTimeLessOrEq-8                  3.136n ±  0%   3.138n ±  0%   +0.06% (p=0.002 n=10)
XORBytes/8Bytes-8                       53.42n ±  0%   52.28n ±  0%   -2.13% (p=0.000 n=10)
XORBytes/128Bytes-8                     64.79n ±  0%   64.12n ±  0%   -1.03% (p=0.000 n=10)
XORBytes/2048Bytes-8                    479.3n ±  0%   322.0n ±  0%  -32.84% (p=0.000 n=10)
XORBytes/8192Bytes-8                    8.897µ ± 26%   7.734µ ± 12%        ~ (p=0.165 n=10)
XORBytes/32768Bytes-8                   39.17µ ± 17%   35.40µ ± 24%   -9.63% (p=0.029 n=10)
XORBytesAlignment/8Bytes0Offset-8       51.74n ±  0%   54.18n ±  0%   +4.72% (p=0.000 n=10)
XORBytesAlignment/8Bytes1Offset-8       51.51n ±  1%   53.52n ±  0%   +3.92% (p=0.000 n=10)
XORBytesAlignment/8Bytes2Offset-8       51.35n ±  1%   53.58n ±  0%   +4.34% (p=0.000 n=10)
XORBytesAlignment/8Bytes3Offset-8       50.86n ±  0%   53.56n ±  0%   +5.31% (p=0.000 n=10)
XORBytesAlignment/8Bytes4Offset-8       51.62n ±  0%   54.20n ±  0%   +4.98% (p=0.000 n=10)
XORBytesAlignment/8Bytes5Offset-8       51.42n ±  1%   53.48n ±  0%   +4.02% (p=0.000 n=10)
XORBytesAlignment/8Bytes6Offset-8       51.08n ±  1%   53.46n ±  0%   +4.67% (p=0.000 n=10)
XORBytesAlignment/8Bytes7Offset-8       50.83n ±  0%   53.54n ±  0%   +5.33% (p=0.000 n=10)
XORBytesAlignment/128Bytes0Offset-8     63.67n ±  0%   66.04n ±  0%   +3.72% (p=0.000 n=10)
XORBytesAlignment/128Bytes1Offset-8    114.40n ±  0%   67.42n ±  0%  -41.07% (p=0.000 n=10)
XORBytesAlignment/128Bytes2Offset-8    113.85n ±  0%   67.43n ±  0%  -40.78% (p=0.000 n=10)
XORBytesAlignment/128Bytes3Offset-8    114.60n ±  0%   67.31n ±  0%  -41.27% (p=0.000 n=10)
XORBytesAlignment/128Bytes4Offset-8    109.30n ±  0%   67.45n ±  0%  -38.29% (p=0.000 n=10)
XORBytesAlignment/128Bytes5Offset-8    110.70n ±  0%   67.32n ±  1%  -39.19% (p=0.000 n=10)
XORBytesAlignment/128Bytes6Offset-8    110.05n ±  0%   67.45n ±  1%  -38.71% (p=0.000 n=10)
XORBytesAlignment/128Bytes7Offset-8    110.60n ±  0%   67.43n ±  0%  -39.04% (p=0.000 n=10)
XORBytesAlignment/2048Bytes0Offset-8    478.4n ±  0%   335.6n ±  0%  -29.85% (p=0.000 n=10)
XORBytesAlignment/2048Bytes1Offset-8    529.7n ±  0%   349.3n ±  0%  -34.05% (p=0.000 n=10)
XORBytesAlignment/2048Bytes2Offset-8    529.3n ±  0%   349.8n ±  0%  -33.91% (p=0.000 n=10)
XORBytesAlignment/2048Bytes3Offset-8    529.8n ±  0%   349.5n ±  0%  -34.02% (p=0.000 n=10)
XORBytesAlignment/2048Bytes4Offset-8    524.7n ±  0%   349.6n ±  0%  -33.38% (p=0.000 n=10)
XORBytesAlignment/2048Bytes5Offset-8    525.9n ±  0%   349.6n ±  0%  -33.52% (p=0.000 n=10)
XORBytesAlignment/2048Bytes6Offset-8    525.1n ±  0%   349.8n ±  0%  -33.39% (p=0.000 n=10)
XORBytesAlignment/2048Bytes7Offset-8    526.0n ±  0%   349.8n ±  0%  -33.51% (p=0.000 n=10)
geomean                                 120.0n         96.92n        -19.23%

                                     │   subtle.1    │                subtle.2                │
                                     │      B/s      │      B/s        vs base                │
XORBytes/8Bytes-8                      142.8Mi ±  0%    145.9Mi ±  0%   +2.19% (p=0.000 n=10)
XORBytes/128Bytes-8                    1.840Gi ±  0%    1.859Gi ±  0%   +1.05% (p=0.000 n=10)
XORBytes/2048Bytes-8                   3.979Gi ±  0%    5.925Gi ±  0%  +48.89% (p=0.000 n=10)
XORBytes/8192Bytes-8                   879.1Mi ± 35%   1010.2Mi ± 13%        ~ (p=0.165 n=10)
XORBytes/32768Bytes-8                  797.9Mi ± 21%    882.8Mi ± 31%  +10.64% (p=0.029 n=10)
XORBytesAlignment/8Bytes0Offset-8      147.5Mi ±  0%    140.8Mi ±  0%   -4.50% (p=0.000 n=10)
XORBytesAlignment/8Bytes1Offset-8      148.1Mi ±  1%    142.5Mi ±  0%   -3.77% (p=0.000 n=10)
XORBytesAlignment/8Bytes2Offset-8      148.6Mi ±  1%    142.4Mi ±  0%   -4.15% (p=0.000 n=10)
XORBytesAlignment/8Bytes3Offset-8      150.0Mi ±  0%    142.4Mi ±  0%   -5.04% (p=0.000 n=10)
XORBytesAlignment/8Bytes4Offset-8      147.8Mi ±  0%    140.8Mi ±  0%   -4.75% (p=0.000 n=10)
XORBytesAlignment/8Bytes5Offset-8      148.4Mi ±  1%    142.6Mi ±  0%   -3.87% (p=0.000 n=10)
XORBytesAlignment/8Bytes6Offset-8      149.4Mi ±  1%    142.7Mi ±  0%   -4.45% (p=0.000 n=10)
XORBytesAlignment/8Bytes7Offset-8      150.1Mi ±  0%    142.5Mi ±  0%   -5.05% (p=0.000 n=10)
XORBytesAlignment/128Bytes0Offset-8    1.872Gi ±  0%    1.805Gi ±  0%   -3.59% (p=0.000 n=10)
XORBytesAlignment/128Bytes1Offset-8    1.042Gi ±  0%    1.768Gi ±  0%  +69.65% (p=0.000 n=10)
XORBytesAlignment/128Bytes2Offset-8    1.047Gi ±  0%    1.768Gi ±  0%  +68.80% (p=0.000 n=10)
XORBytesAlignment/128Bytes3Offset-8    1.040Gi ±  0%    1.771Gi ±  0%  +70.27% (p=0.000 n=10)
XORBytesAlignment/128Bytes4Offset-8    1.090Gi ±  0%    1.767Gi ±  0%  +62.08% (p=0.000 n=10)
XORBytesAlignment/128Bytes5Offset-8    1.077Gi ±  0%    1.771Gi ±  1%  +64.41% (p=0.000 n=10)
XORBytesAlignment/128Bytes6Offset-8    1.083Gi ±  0%    1.767Gi ±  1%  +63.17% (p=0.000 n=10)
XORBytesAlignment/128Bytes7Offset-8    1.078Gi ±  0%    1.768Gi ±  0%  +64.07% (p=0.000 n=10)
XORBytesAlignment/2048Bytes0Offset-8   3.987Gi ±  0%    5.684Gi ±  0%  +42.55% (p=0.000 n=10)
XORBytesAlignment/2048Bytes1Offset-8   3.601Gi ±  0%    5.459Gi ±  0%  +51.61% (p=0.000 n=10)
XORBytesAlignment/2048Bytes2Offset-8   3.604Gi ±  0%    5.453Gi ±  0%  +51.31% (p=0.000 n=10)
XORBytesAlignment/2048Bytes3Offset-8   3.600Gi ±  0%    5.457Gi ±  0%  +51.56% (p=0.000 n=10)
XORBytesAlignment/2048Bytes4Offset-8   3.635Gi ±  0%    5.456Gi ±  0%  +50.10% (p=0.000 n=10)
XORBytesAlignment/2048Bytes5Offset-8   3.627Gi ±  0%    5.455Gi ±  0%  +50.39% (p=0.000 n=10)
XORBytesAlignment/2048Bytes6Offset-8   3.632Gi ±  0%    5.454Gi ±  0%  +50.14% (p=0.000 n=10)
XORBytesAlignment/2048Bytes7Offset-8   3.626Gi ±  0%    5.453Gi ±  0%  +50.39% (p=0.000 n=10)
geomean                                881.0Mi          1.097Gi        +27.51%

Change-Id: Id7f9d87fe1ea39aa91ea7d3fd1ba20737f0dda3c
Reviewed-on: https://go-review.googlesource.com/c/go/+/649657
Reviewed-by: Julian Zhu <jz531210@gmail.com>
Reviewed-by: Carlos Amedee <carlos@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
This commit is contained in:
Joel Sing
2025-02-03 23:53:13 +11:00
parent cf0c42c2ca
commit f532f87a98
4 changed files with 54 additions and 3 deletions

View File

@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build (amd64 || arm64 || ppc64 || ppc64le || riscv64) && !purego
//go:build (amd64 || arm64 || ppc64 || ppc64le) && !purego
package subtle

View File

@@ -0,0 +1,18 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build riscv64 && !purego
package subtle
import (
"crypto/internal/fips140deps/cpu"
)
//go:noescape
func xorBytesRISCV64(dst, a, b *byte, n int, hasV bool)
func xorBytes(dst, a, b *byte, n int) {
xorBytesRISCV64(dst, a, b, n, cpu.RISCV64HasV)
}

View File

@@ -4,10 +4,12 @@
//go:build !purego
#include "asm_riscv64.h"
#include "go_asm.h"
#include "textflag.h"
// func xorBytes(dst, a, b *byte, n int)
TEXT ·xorBytes(SB), NOSPLIT|NOFRAME, $0
// func xorBytesRISCV64(dst, a, b *byte, n int, hasV bool)
TEXT ·xorBytesRISCV64(SB), NOSPLIT|NOFRAME, $0
MOV dst+0(FP), X10
MOV a+8(FP), X11
MOV b+16(FP), X12
@@ -16,6 +18,35 @@ TEXT ·xorBytes(SB), NOSPLIT|NOFRAME, $0
MOV $32, X15
BLT X13, X15, loop4_check
#ifndef hasV
MOVB hasV+32(FP), X5
BEQZ X5, xorbytes_scalar
#endif
// Use vector if not 8 byte aligned.
OR X10, X11, X5
AND $7, X5
BNEZ X5, vector_loop
// Use scalar if 8 byte aligned and <= 64 bytes.
SUB $64, X12, X6
BLEZ X6, loop64_check
PCALIGN $16
vector_loop:
VSETVLI X13, E8, M8, TU, MU, X15
VLE8V (X11), V8
VLE8V (X12), V16
VXORVV V8, V16, V24
VSE8V V24, (X10)
ADD X15, X10
ADD X15, X11
ADD X15, X12
SUB X15, X13
BNEZ X13, vector_loop
RET
xorbytes_scalar:
// Check alignment - if alignment differs we have to do one byte at a time.
AND $7, X10, X5
AND $7, X11, X6

View File

@@ -27,6 +27,8 @@ var (
LOONG64HasLSX = cpu.Loong64.HasLSX
LOONG64HasLASX = cpu.Loong64.HasLASX
RISCV64HasV = cpu.RISCV64.HasV
S390XHasAES = cpu.S390X.HasAES
S390XHasAESCBC = cpu.S390X.HasAESCBC
S390XHasAESCTR = cpu.S390X.HasAESCTR