glm4moelite: fix attention scale calculation (#13893)
Use the original key dimension (qkNopeHeadDim + qkRopeHeadDim = 256) for the attention scale instead of the MLA absorbed dimension (kvLoraRank + qkRopeHeadDim = 576). MLA absorption is a mathematically equivalent reorganization of the attention computation, so it should not change the effective attention scale; the scale should match training, which uses 1/sqrt(256). This improves tool calling and reduces model looping.
@@ -223,12 +223,7 @@ func New(c fs.Config) (model.Model, error) {
 	keyLength := int(c.Uint("attention.key_length"))
 	valueLength := int(c.Uint("attention.value_length"))
-	kvLoraRank := int(c.Uint("attention.kv_lora_rank"))
-	qkRopeHeadDim := int(c.Uint("rope.dimension_count"))
-
-	// For MLA absorption, the effective key dimension is kvLoraRank + qkRopeHeadDim
-	mlaKeyLength := kvLoraRank + qkRopeHeadDim
-	kqScale := 1.0 / math.Sqrt(float64(mlaKeyLength))
+	kqScale := 1.0 / math.Sqrt(float64(keyLength))
 
 	var pre []string
 	switch c.String("tokenizer.ggml.pre") {
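As a sanity check on the numbers in the message, here is a minimal standalone Go sketch (not part of the commit; the 256 and 576 are the sums quoted above) comparing the training scale with the scale the old code computed:

package main

import (
	"fmt"
	"math"
)

func main() {
	// Dimensions taken from the commit message:
	// original key dimension: qkNopeHeadDim + qkRopeHeadDim = 256
	// MLA absorbed dimension: kvLoraRank + qkRopeHeadDim = 576
	keyLength := 256
	mlaKeyLength := 576

	trainScale := 1.0 / math.Sqrt(float64(keyLength))  // what training used, restored by this commit
	oldScale := 1.0 / math.Sqrt(float64(mlaKeyLength)) // what the code computed before the fix

	fmt.Printf("1/sqrt(%d) = %.5f (training scale)\n", keyLength, trainScale)
	fmt.Printf("1/sqrt(%d) = %.5f (old scale, about %.0f%% too small)\n",
		mlaKeyLength, oldScale, (1-oldScale/trainScale)*100)
}

Scaling by 1/sqrt(576) instead of 1/sqrt(256) shrinks every attention logit to two-thirds of its trained value, flattening the attention distribution, which is consistent with the tool-calling and looping symptoms described above.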