glm4moelite: quantize more tensors to q8_0 and avoid double BOS token (#13891)
@@ -246,7 +246,7 @@ func New(c fs.Config) (model.Model, error) {
 		Values: c.Strings("tokenizer.ggml.tokens"),
 		Types:  c.Ints("tokenizer.ggml.token_type"),
 		Merges: c.Strings("tokenizer.ggml.merges"),
-		AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
+		AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
 		BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
 		AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
 		EOS: append(
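The first hunk stops prepending a BOS token by default. Per the commit title, this avoids a doubled BOS: when the rendered chat template already contains the BOS token as text, a tokenizer that also prepends the BOS id starts every sequence with it twice. Below is a minimal, self-contained sketch of that failure mode; the toy encode function and bosID are illustrative stand-ins, not ollama's actual tokenizer API.

package main

import "fmt"

const bosID = 1 // hypothetical BOS token id, for illustration only

// encode is a toy tokenizer: "<BOS>" maps to bosID, every other word to a
// placeholder id, and addBOS controls whether bosID is also prepended.
func encode(words []string, addBOS bool) []int {
	var ids []int
	if addBOS {
		ids = append(ids, bosID)
	}
	for _, w := range words {
		if w == "<BOS>" {
			ids = append(ids, bosID)
		} else {
			ids = append(ids, 42)
		}
	}
	return ids
}

func main() {
	// The rendered chat template already starts with the BOS token as text.
	prompt := []string{"<BOS>", "hello"}
	fmt.Println(encode(prompt, true))  // [1 1 42] - doubled BOS
	fmt.Println(encode(prompt, false)) // [1 42]   - single BOS from the template
}

Note that only the fallback changes: a GGUF that explicitly sets tokenizer.ggml.add_bos_token still overrides the false default through c.Bool.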
@@ -95,6 +95,13 @@ func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType
 			// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
 			newType = fsggml.TensorTypeQ8_0
 		}
+	} else if strings.Contains(name, "attn_k_b.weight") ||
+		strings.Contains(name, "attn_v_b.weight") ||
+		strings.Contains(name, "attn_kv_a_mqa.weight") ||
+		strings.Contains(name, "attn_q_a.weight") ||
+		strings.Contains(name, "attn_q_b.weight") {
+		// MLA tensors need higher precision to avoid quality degradation
+		newType = fsggml.TensorTypeQ8_0
 	} else if strings.Contains(name, "ffn_down") {
 		iLayer := qs.iFfnDown
 		n_layer := qs.nFfnDown
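The second hunk reuses the pattern the function already applies to ffn_down: match tensors by name and override the requested quantization type. A self-contained sketch of that name-based selection follows; the tensorType values and pickType helper are hypothetical, standing in for ollama's fsggml types and the getTensorNewType plumbing.

package main

import (
	"fmt"
	"strings"
)

type tensorType string

const (
	typeQ4K  tensorType = "Q4_K"
	typeQ8_0 tensorType = "Q8_0"
)

// mlaNames lists the attention tensors the commit pins to Q8_0.
var mlaNames = []string{
	"attn_k_b.weight",
	"attn_v_b.weight",
	"attn_kv_a_mqa.weight",
	"attn_q_a.weight",
	"attn_q_b.weight",
}

// pickType returns the quantization type for one tensor: the requested base
// type, unless the name marks it as an MLA tensor needing higher precision.
func pickType(name string, base tensorType) tensorType {
	for _, n := range mlaNames {
		if strings.Contains(name, n) {
			return typeQ8_0
		}
	}
	return base
}

func main() {
	fmt.Println(pickType("blk.3.attn_q_b.weight", typeQ4K)) // Q8_0
	fmt.Println(pickType("blk.3.ffn_gate.weight", typeQ4K)) // Q4_K
}

As with the 8-expert case noted in the context comment, keeping the MLA (multi-head latent attention) projections at Q8_0 spends a little extra memory where low-bit quantization degrades quality the most.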