From 16750865d11ecce48d45bb152c2793753b8781e6 Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan <jmorganca@gmail.com>
Date: Sat, 24 Jan 2026 16:33:54 -0800
Subject: [PATCH] glm4moelite: quantize more tensors to q8_0 and avoid double
 BOS token (#13891)

---
 model/models/glm4moelite/model.go | 2 +-
 server/quantization.go            | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/model/models/glm4moelite/model.go b/model/models/glm4moelite/model.go
index 2c55bbfb2..4dfbab173 100644
--- a/model/models/glm4moelite/model.go
+++ b/model/models/glm4moelite/model.go
@@ -246,7 +246,7 @@ func New(c fs.Config) (model.Model, error) {
 				Values: c.Strings("tokenizer.ggml.tokens"),
 				Types:  c.Ints("tokenizer.ggml.token_type"),
 				Merges: c.Strings("tokenizer.ggml.merges"),
-				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
+				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
 				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
 				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
 				EOS: append(
diff --git a/server/quantization.go b/server/quantization.go
index 6ecaf583c..edfd4b470 100644
--- a/server/quantization.go
+++ b/server/quantization.go
@@ -95,6 +95,13 @@ func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType
 			// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
 			newType = fsggml.TensorTypeQ8_0
 		}
+	} else if strings.Contains(name, "attn_k_b.weight") ||
+		strings.Contains(name, "attn_v_b.weight") ||
+		strings.Contains(name, "attn_kv_a_mqa.weight") ||
+		strings.Contains(name, "attn_q_a.weight") ||
+		strings.Contains(name, "attn_q_b.weight") {
+		// MLA tensors need higher precision to avoid quality degradation
+		newType = fsggml.TensorTypeQ8_0
 	} else if strings.Contains(name, "ffn_down") {
 		iLayer := qs.iFfnDown
 		n_layer := qs.nFfnDown