glm4moelite: fix attention scale calculation (#13893)

Use the original key dimension (qkNopeHeadDim + qkRopeHeadDim = 256) for
the attention scale instead of the MLA absorbed dimension (kvLoraRank +
qkRopeHeadDim = 576).
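
For concreteness, a quick Go sketch comparing the two scales; only the sums 256 and 576 from this message are used, nothing else is assumed:

package main

import (
	"fmt"
	"math"
)

func main() {
	keyLength := 256    // qkNopeHeadDim + qkRopeHeadDim: the key dim used at training time
	mlaKeyLength := 576 // kvLoraRank + qkRopeHeadDim: the MLA absorbed dim

	// 1/sqrt(256) = 0.06250 (correct), 1/sqrt(576) = 0.04167 (previous, too small)
	fmt.Printf("1/sqrt(%d) = %.5f\n", keyLength, 1.0/math.Sqrt(float64(keyLength)))
	fmt.Printf("1/sqrt(%d) = %.5f\n", mlaKeyLength, 1.0/math.Sqrt(float64(mlaKeyLength)))
}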

MLA absorption is a mathematically equivalent reorganization of the
attention computation, so it should not change the effective attention
scale. The scale should match training, which uses 1/sqrt(256).
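
To see why, here is a toy numeric check (arbitrary small dimensions, not the model's real shapes): folding the key up-projection W into the query changes the dimension the dot product runs over, but not its value, so the softmax temperature should stay 1/sqrt of the original key dimension.

package main

import "fmt"

func main() {
	// W maps a 3-dim latent c up to a 2-dim key; values are arbitrary.
	W := [2][3]float64{{1, 2, 3}, {4, 5, 6}}
	q := [2]float64{0.5, -1.0}
	c := [3]float64{1.0, 0.0, 2.0}

	// Original form: materialize k = W c, then take q . k in key space.
	var k [2]float64
	for i := range W {
		for j := range W[i] {
			k[i] += W[i][j] * c[j]
		}
	}
	logit1 := q[0]*k[0] + q[1]*k[1]

	// Absorbed form: fold W into the query, qAbs = Wt q, then take
	// qAbs . c in the (higher-dimensional) latent space.
	var qAbs [3]float64
	for i := range W {
		for j := range W[i] {
			qAbs[j] += W[i][j] * q[i]
		}
	}
	logit2 := qAbs[0]*c[0] + qAbs[1]*c[1] + qAbs[2]*c[2]

	fmt.Println(logit1, logit2) // both -12.5: the logits are identical
}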

This improves tool calling and fixes model looping issues.
Author: Jeffrey Morgan (committed via GitHub)
Date:   2026-01-24 17:48:09 -08:00
Parent: 16750865d1
Commit: a1ca428c90

@@ -223,12 +223,7 @@ func New(c fs.Config) (model.Model, error) {
 	keyLength := int(c.Uint("attention.key_length"))
 	valueLength := int(c.Uint("attention.value_length"))
 	kvLoraRank := int(c.Uint("attention.kv_lora_rank"))
-	qkRopeHeadDim := int(c.Uint("rope.dimension_count"))
-	// For MLA absorption, the effective key dimension is kvLoraRank + qkRopeHeadDim
-	mlaKeyLength := kvLoraRank + qkRopeHeadDim
-	kqScale := 1.0 / math.Sqrt(float64(mlaKeyLength))
+	kqScale := 1.0 / math.Sqrt(float64(keyLength))
 	var pre []string
 	switch c.String("tokenizer.ggml.pre") {
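
For context, a minimal sketch of where a scale like kqScale enters attention; attend is a hypothetical helper for illustration, not this repository's attention code:

package main

import (
	"fmt"
	"math"
)

// attend computes softmax(q . k * scale) over a set of keys for one query.
func attend(q []float64, keys [][]float64, scale float64) []float64 {
	logits := make([]float64, len(keys))
	max := math.Inf(-1)
	for i, k := range keys {
		for j := range q {
			logits[i] += q[j] * k[j]
		}
		logits[i] *= scale // this is where kqScale is applied
		if logits[i] > max {
			max = logits[i]
		}
	}
	// Numerically stable softmax over the scaled logits.
	weights := make([]float64, len(logits))
	var sum float64
	for i, l := range logits {
		weights[i] = math.Exp(l - max)
		sum += weights[i]
	}
	for i := range weights {
		weights[i] /= sum
	}
	return weights
}

func main() {
	keyLength := 256
	scale := 1.0 / math.Sqrt(float64(keyLength)) // the corrected kqScale
	q, k1, k2 := make([]float64, keyLength), make([]float64, keyLength), make([]float64, keyLength)
	for i := 0; i < keyLength; i++ {
		q[i], k1[i], k2[i] = 1, 1, -1
	}
	// A smaller scale (e.g. 1/sqrt(576)) would flatten this distribution.
	fmt.Println(attend(q, [][]float64{k1, k2}, scale))
}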