mirror of
https://github.com/golang/go.git
synced 2026-02-06 19:05:05 +03:00
bytes, strings: replace asciiSet bitset with lookup table
Replace the 32-byte bitset implementation with a 256-byte lookup table for simpler and faster ASCII character membership testing. The bitset implementation used bit manipulation (shifts, masks, AND/OR) requiring multiple CPU operations per lookup. The lookup table uses direct array indexing with a single load and compare, reducing CPU overhead significantly. Using [256]bool instead of [256]byte allows the compiler to eliminate the comparison instruction entirely, as bool values are guaranteed to be either 0 or 1. The full 256-element array (rather than 128 elements) is used because it eliminates branches entirely. Testing shows [256]bool is 68% faster than [128]bool with an explicit bounds check (488µs vs 821µs) due to avoiding branch misprediction penalties in the hot path. Using [128]bool with bit masking (c&0x7f) eliminates bounds checks but still costs ~10% performance due to the AND operation. The 224-byte increase in memory usage is acceptable for modern systems, and the simpler implementation is easier to understand and maintain. Full benchmark results demonstrating ~1.5x improvements across all affected functions are available at: https://github.com/golang/go/issues/77194#issuecomment-3814095806 This supersedes CL 737920 with a simpler approach that improves performance for all architectures without requiring SIMD instructions. Updates #77194 Change-Id: I272ee6de05b963a8efc62e7e8838735fb0c4f41b Reviewed-on: https://go-review.googlesource.com/c/go/+/739982 Auto-Submit: Keith Randall <khr@golang.org> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Alan Donovan <adonovan@google.com> Reviewed-by: Keith Randall <khr@golang.org> Reviewed-by: Keith Randall <khr@google.com>
This commit is contained in:
committed by
Gopher Robot
parent
b691a2edc7
commit
e0414d74fe
@@ -246,7 +246,7 @@ func IndexAny(s []byte, chars string) int {
|
||||
}
|
||||
return IndexRune(s, r)
|
||||
}
|
||||
if len(s) > 8 {
|
||||
if shouldUseASCIISet(len(s)) {
|
||||
if as, isASCII := makeASCIISet(chars); isASCII {
|
||||
for i, c := range s {
|
||||
if as.contains(c) {
|
||||
@@ -301,7 +301,7 @@ func LastIndexAny(s []byte, chars string) int {
|
||||
// Avoid scanning all of s.
|
||||
return -1
|
||||
}
|
||||
if len(s) > 8 {
|
||||
if shouldUseASCIISet(len(s)) {
|
||||
if as, isASCII := makeASCIISet(chars); isASCII {
|
||||
for i := len(s) - 1; i >= 0; i-- {
|
||||
if as.contains(s[i]) {
|
||||
@@ -935,15 +935,22 @@ func lastIndexFunc(s []byte, f func(r rune) bool, truth bool) int {
|
||||
return -1
|
||||
}
|
||||
|
||||
// asciiSet is a 32-byte value, where each bit represents the presence of a
|
||||
// given ASCII character in the set. The 128-bits of the lower 16 bytes,
|
||||
// starting with the least-significant bit of the lowest word to the
|
||||
// most-significant bit of the highest word, map to the full range of all
|
||||
// 128 ASCII characters. The 128-bits of the upper 16 bytes will be zeroed,
|
||||
// ensuring that any non-ASCII character will be reported as not in the set.
|
||||
// This allocates a total of 32 bytes even though the upper half
|
||||
// is unused to avoid bounds checks in asciiSet.contains.
|
||||
type asciiSet [8]uint32
|
||||
// asciiSet is a 256-byte lookup table for fast ASCII character membership testing.
|
||||
// Each element corresponds to an ASCII character value, with true indicating the
|
||||
// character is in the set. Using bool instead of byte allows the compiler to
|
||||
// eliminate the comparison instruction, as bool values are guaranteed to be 0 or 1.
|
||||
//
|
||||
// The full 256-element table is used rather than a 128-element table to avoid
|
||||
// additional operations in the lookup path. Alternative approaches were tested:
|
||||
// - [128]bool with explicit bounds check (if c >= 128): introduces branches
|
||||
// that cause pipeline stalls, resulting in ~70% slower performance
|
||||
// - [128]bool with masking (c&0x7f): eliminates bounds checks but the AND
|
||||
// operation still costs ~10% performance compared to direct indexing
|
||||
//
|
||||
// The 256-element array allows direct indexing with no bounds checks, no branches,
|
||||
// and no masking operations, providing optimal performance. The additional 128 bytes
|
||||
// of memory is a worthwhile tradeoff for the simpler, faster code.
|
||||
type asciiSet [256]bool
|
||||
|
||||
// makeASCIISet creates a set of ASCII characters and reports whether all
|
||||
// characters in chars are ASCII.
|
||||
@@ -953,14 +960,24 @@ func makeASCIISet(chars string) (as asciiSet, ok bool) {
|
||||
if c >= utf8.RuneSelf {
|
||||
return as, false
|
||||
}
|
||||
as[c/32] |= 1 << (c % 32)
|
||||
as[c] = true
|
||||
}
|
||||
return as, true
|
||||
}
|
||||
|
||||
// contains reports whether c is inside the set.
|
||||
func (as *asciiSet) contains(c byte) bool {
|
||||
return (as[c/32] & (1 << (c % 32))) != 0
|
||||
return as[c]
|
||||
}
|
||||
|
||||
// shouldUseASCIISet returns whether to use the lookup table optimization.
|
||||
// The threshold of 8 bytes balances initialization cost against per-byte
|
||||
// search cost, performing well across all charset sizes.
|
||||
//
|
||||
// More complex heuristics (e.g., different thresholds per charset size)
|
||||
// add branching overhead that eats away any theoretical improvements.
|
||||
func shouldUseASCIISet(bufLen int) bool {
|
||||
return bufLen > 8
|
||||
}
|
||||
|
||||
// containsRune is a simplified version of strings.ContainsRune
|
||||
|
||||
@@ -193,7 +193,7 @@ func IndexAny(s, chars string) int {
|
||||
}
|
||||
return IndexRune(s, r)
|
||||
}
|
||||
if len(s) > 8 {
|
||||
if shouldUseASCIISet(len(s)) {
|
||||
if as, isASCII := makeASCIISet(chars); isASCII {
|
||||
for i := 0; i < len(s); i++ {
|
||||
if as.contains(s[i]) {
|
||||
@@ -229,7 +229,7 @@ func LastIndexAny(s, chars string) int {
|
||||
}
|
||||
return -1
|
||||
}
|
||||
if len(s) > 8 {
|
||||
if shouldUseASCIISet(len(s)) {
|
||||
if as, isASCII := makeASCIISet(chars); isASCII {
|
||||
for i := len(s) - 1; i >= 0; i-- {
|
||||
if as.contains(s[i]) {
|
||||
@@ -931,15 +931,22 @@ func lastIndexFunc(s string, f func(rune) bool, truth bool) int {
|
||||
return -1
|
||||
}
|
||||
|
||||
// asciiSet is a 32-byte value, where each bit represents the presence of a
|
||||
// given ASCII character in the set. The 128-bits of the lower 16 bytes,
|
||||
// starting with the least-significant bit of the lowest word to the
|
||||
// most-significant bit of the highest word, map to the full range of all
|
||||
// 128 ASCII characters. The 128-bits of the upper 16 bytes will be zeroed,
|
||||
// ensuring that any non-ASCII character will be reported as not in the set.
|
||||
// This allocates a total of 32 bytes even though the upper half
|
||||
// is unused to avoid bounds checks in asciiSet.contains.
|
||||
type asciiSet [8]uint32
|
||||
// asciiSet is a 256-byte lookup table for fast ASCII character membership testing.
|
||||
// Each element corresponds to an ASCII character value, with true indicating the
|
||||
// character is in the set. Using bool instead of byte allows the compiler to
|
||||
// eliminate the comparison instruction, as bool values are guaranteed to be 0 or 1.
|
||||
//
|
||||
// The full 256-element table is used rather than a 128-element table to avoid
|
||||
// additional operations in the lookup path. Alternative approaches were tested:
|
||||
// - [128]bool with explicit bounds check (if c >= 128): introduces branches
|
||||
// that cause pipeline stalls, resulting in ~70% slower performance
|
||||
// - [128]bool with masking (c&0x7f): eliminates bounds checks but the AND
|
||||
// operation still costs ~10% performance compared to direct indexing
|
||||
//
|
||||
// The 256-element array allows direct indexing with no bounds checks, no branches,
|
||||
// and no masking operations, providing optimal performance. The additional 128 bytes
|
||||
// of memory is a worthwhile tradeoff for the simpler, faster code.
|
||||
type asciiSet [256]bool
|
||||
|
||||
// makeASCIISet creates a set of ASCII characters and reports whether all
|
||||
// characters in chars are ASCII.
|
||||
@@ -949,14 +956,24 @@ func makeASCIISet(chars string) (as asciiSet, ok bool) {
|
||||
if c >= utf8.RuneSelf {
|
||||
return as, false
|
||||
}
|
||||
as[c/32] |= 1 << (c % 32)
|
||||
as[c] = true
|
||||
}
|
||||
return as, true
|
||||
}
|
||||
|
||||
// contains reports whether c is inside the set.
|
||||
func (as *asciiSet) contains(c byte) bool {
|
||||
return (as[c/32] & (1 << (c % 32))) != 0
|
||||
return as[c]
|
||||
}
|
||||
|
||||
// shouldUseASCIISet returns whether to use the lookup table optimization.
|
||||
// The threshold of 8 bytes balances initialization cost against per-byte
|
||||
// search cost, performing well across all charset sizes.
|
||||
//
|
||||
// More complex heuristics (e.g., different thresholds per charset size)
|
||||
// add branching overhead that eats away any theoretical improvements.
|
||||
func shouldUseASCIISet(bufLen int) bool {
|
||||
return bufLen > 8
|
||||
}
|
||||
|
||||
// Trim returns a slice of the string s with all leading and
|
||||
|
||||
Reference in New Issue
Block a user