From 07d944fdfce3474bb69500df636c521bf3a8f9b9 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 19 Jan 2026 14:34:59 -0800 Subject: [PATCH] move tokenizer to separate package --- llm/server.go | 21 +- model/ignore_test.go | 410 ++++++++++++++++++ model/model.go | 5 +- model/models/bert/embed.go | 13 +- model/models/deepseek2/model.go | 7 +- model/models/deepseekocr/model.go | 7 +- model/models/gemma2/model.go | 7 +- model/models/gemma3/embed.go | 7 +- model/models/gemma3/model.go | 13 +- model/models/gemma3n/model.go | 7 +- model/models/glm4moelite/model.go | 7 +- model/models/gptoss/model.go | 7 +- model/models/lfm2/model.go | 9 +- model/models/llama/model.go | 15 +- model/models/llama4/model.go | 7 +- model/models/mistral3/model.go | 9 +- model/models/mllama/model.go | 7 +- model/models/nomicbert/model.go | 11 +- model/models/olmo3/model.go | 11 +- model/models/qwen2/model.go | 7 +- model/models/qwen25vl/model.go | 7 +- model/models/qwen3/embed.go | 7 +- model/models/qwen3/model.go | 7 +- model/models/qwen3vl/model.go | 7 +- runner/ollamarunner/runner.go | 15 +- sample/samplers.go | 14 +- sample/samplers_test.go | 10 +- {model => tokenizer}/bytepairencoding.go | 40 +- {model => tokenizer}/bytepairencoding_test.go | 8 +- {model => tokenizer}/sentencepiece.go | 20 +- {model => tokenizer}/sentencepiece_test.go | 6 +- .../testdata/gemma2/tokenizer.model | Bin .../testdata/llama3.2/encoder.json | 0 .../testdata/llama3.2/vocab.bpe | 0 .../testdata/war-and-peace.txt | 0 .../tokenizer.go | 4 +- {model => tokenizer}/vocabulary.go | 2 +- {model => tokenizer}/vocabulary_test.go | 2 +- {model => tokenizer}/wordpiece.go | 28 +- {model => tokenizer}/wordpiece_test.go | 4 +- x/model/bytepairencoding_test.go | 4 +- x/model/models/gemma3/model.go | 7 +- x/model/sentencepiece_test.go | 2 +- 43 files changed, 613 insertions(+), 168 deletions(-) create mode 100644 model/ignore_test.go rename {model => tokenizer}/bytepairencoding.go (83%) rename {model => tokenizer}/bytepairencoding_test.go (97%) rename {model => tokenizer}/sentencepiece.go (92%) rename {model => tokenizer}/sentencepiece_test.go (96%) rename {model => tokenizer}/testdata/gemma2/tokenizer.model (100%) rename {model => tokenizer}/testdata/llama3.2/encoder.json (100%) rename {model => tokenizer}/testdata/llama3.2/vocab.bpe (100%) rename {model => tokenizer}/testdata/war-and-peace.txt (100%) rename model/textprocessor.go => tokenizer/tokenizer.go (86%) rename {model => tokenizer}/vocabulary.go (99%) rename {model => tokenizer}/vocabulary_test.go (99%) rename {model => tokenizer}/wordpiece.go (83%) rename {model => tokenizer}/wordpiece_test.go (97%) diff --git a/llm/server.go b/llm/server.go index 8fedc8468..5fdf1fa06 100644 --- a/llm/server.go +++ b/llm/server.go @@ -34,6 +34,7 @@ import ( "github.com/ollama/ollama/logutil" "github.com/ollama/ollama/ml" "github.com/ollama/ollama/model" + "github.com/ollama/ollama/tokenizer" ) type filteredEnv []string @@ -115,7 +116,7 @@ type llamaServer struct { type ollamaServer struct { llmServer - textProcessor model.TextProcessor // textProcessor handles text encoding/decoding + tokenizer tokenizer.Tokenizer // tokenizer handles text encoding/decoding } // LoadModel will load a model from disk. The model must be in the GGML format. 
@@ -141,11 +142,11 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) { // NewLlamaServer will run a server for the given GPUs func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) { var llamaModel *llama.Model - var textProcessor model.TextProcessor + var tokenizer tokenizer.Tokenizer var err error if envconfig.NewEngine() || f.KV().OllamaEngineRequired() { if len(projectors) == 0 { - textProcessor, err = model.NewTextProcessor(modelPath) + tokenizer, err = model.NewTextProcessor(modelPath) } else { err = errors.New("split vision models aren't supported") } @@ -154,7 +155,7 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st slog.Debug("model not yet supported by Ollama engine, switching to compatibility mode", "model", modelPath, "error", err) } } - if textProcessor == nil { + if tokenizer == nil { llamaModel, err = llama.LoadModelFromFile(modelPath, llama.ModelParams{VocabOnly: true}) if err != nil { return nil, err @@ -210,7 +211,7 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st kvct := strings.ToLower(envconfig.KvCacheType()) - if textProcessor == nil { + if tokenizer == nil { flashAttention := ml.FlashAttentionAuto if faUserSet { if fa { @@ -260,7 +261,7 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st gpuLibs := ml.LibraryPaths(gpus) status := NewStatusWriter(os.Stderr) cmd, port, err := StartRunner( - textProcessor != nil, + tokenizer != nil, modelPath, gpuLibs, status, @@ -309,8 +310,8 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st } }() - if textProcessor != nil { - return &ollamaServer{llmServer: s, textProcessor: textProcessor}, nil + if tokenizer != nil { + return &ollamaServer{llmServer: s, tokenizer: tokenizer}, nil } else { return &llamaServer{llmServer: s, ggml: f}, nil } @@ -1772,7 +1773,7 @@ func (s *llamaServer) Tokenize(ctx context.Context, content string) ([]int, erro } func (s *ollamaServer) Tokenize(ctx context.Context, content string) ([]int, error) { - tokens, err := s.textProcessor.Encode(content, false) + tokens, err := s.tokenizer.Encode(content, false) if err != nil { return nil, err } @@ -1807,7 +1808,7 @@ func (s *ollamaServer) Detokenize(ctx context.Context, tokens []int) (string, er toks[i] = int32(t) } - content, err := s.textProcessor.Decode(toks) + content, err := s.tokenizer.Decode(toks) if err != nil { return "", err } diff --git a/model/ignore_test.go b/model/ignore_test.go new file mode 100644 index 000000000..dc5dc330e --- /dev/null +++ b/model/ignore_test.go @@ -0,0 +1,410 @@ +package model_test + +import ( + "cmp" + "encoding/binary" + "encoding/json" + "errors" + "flag" + "io" + "log/slog" + "os" + "path/filepath" + "runtime" + "slices" + "strconv" + "strings" + "testing" + + gocmp "github.com/google/go-cmp/cmp" + + "github.com/ollama/ollama/envconfig" + "github.com/ollama/ollama/logutil" + "github.com/ollama/ollama/ml" + "github.com/ollama/ollama/model" + "github.com/ollama/ollama/model/input" + _ "github.com/ollama/ollama/model/models" + "github.com/ollama/ollama/tokenizer" + typesmodel "github.com/ollama/ollama/types/model" +) + +var args struct { + model, prompt, image, output string + layers, threads int + vv bool +} + +func setup(tb testing.TB) model.Model { + tb.Helper() + if args.model == "" { + tb.Skip("model argument is required") + } + + path := 
args.model + if _, err := os.Stat(path); errors.Is(err, os.ErrNotExist) { + name := typesmodel.ParseName(path) + if !name.IsValid() { + tb.Fatalf("model path %q does not exist and is not a valid model name", path) + } + + models := envconfig.Models() + + f, err := os.Open(filepath.Join(models, "manifests", name.Filepath())) + if err != nil { + tb.Fatal(err) + } + defer f.Close() + + var manifest struct { + Layers []struct { + MediaType string `json:"mediaType"` + Digest string `json:"digest"` + } `json:"layers"` + } + + if err := json.NewDecoder(f).Decode(&manifest); err != nil { + tb.Fatal(err) + } + + for _, layer := range manifest.Layers { + if layer.MediaType == "application/vnd.ollama.image.model" { + path = filepath.Join(models, "blobs", strings.ReplaceAll(layer.Digest, ":", "-")) + break + } + } + } else if err != nil { + tb.Fatal(err) + } + + return setupFile(tb, path) +} + +func setupFile(tb testing.TB, path string) model.Model { + tb.Log("using file", path) + + if s := os.Getenv("OLLAMA_LIBRARY_PATH"); s != "" { + abspath, err := filepath.Abs(s) + if err != nil { + tb.Fatal(err) + } + + tb.Setenv("PATH", strings.Join([]string{abspath, os.Getenv("PATH")}, string(os.PathListSeparator))) + } + + m, err := model.New(path, ml.BackendParams{ + AllocMemory: true, + NumThreads: args.threads, + GPULayers: []ml.GPULayers{ + { + DeviceID: ml.DeviceID{ + ID: cmp.Or(os.Getenv("OLLAMA_DEVICE_ID"), "0"), + Library: cmp.Or(os.Getenv("OLLAMA_DEVICE_LIBRARY"), "Metal"), + }, + Layers: func() (layers []int) { + layers = make([]int, args.layers) + for i := range layers { + layers[i] = i + } + return layers + }(), + }, + }, + }) + if err != nil { + tb.Fatal(err) + } + + if err := m.Backend().Load(tb.Context(), func(float32) {}); err != nil { + tb.Fatal(err) + } + + return m +} + +func dump(tb testing.TB, ctx ml.Context, tt ml.Tensor) { + tt = tt.Contiguous(ctx) + ctx.Forward(tt).Compute(tt) + + tb.Log("shape", tt.Shape(), "dtype", tt.DType(), "nbytes", len(tt.Bytes())) + if args.vv { + tb.Log(ml.Dump(ctx, tt)) + } + + if args.output != "" { + f, err := os.Create(args.output) + if err != nil { + tb.Fatal(err) + } + defer f.Close() + + if err := binary.Write(f, binary.LittleEndian, tt.Bytes()); err != nil { + tb.Fatal(err) + } + } +} + +func TestMain(m *testing.M) { + flag.StringVar(&args.model, "model", "", "path to model") + flag.StringVar(&args.prompt, "prompt", "The capital of France is", "model prompt") + flag.StringVar(&args.image, "image", "", "model image") + flag.StringVar(&args.output, "output", "", "output file") + + flag.IntVar(&args.layers, "layers", 99, "num of gpu layers") + flag.IntVar(&args.threads, "threads", runtime.NumCPU(), "num of threads") + + flag.BoolVar(&args.vv, "vv", false, "enable verbose logging") + flag.Parse() + + slog.SetDefault(logutil.NewLogger(os.Stderr, envconfig.LogLevel())) + os.Exit(m.Run()) +} + +func TestTokenizer(t *testing.T) { + m := setup(t) + + processor, ok := m.(tokenizer.Tokenizer) + if !ok { + t.Fatal("not a text processor") + } + + cases := map[string][]int32{ + "kingfisher": {7037, 506, 13761}, + "prospect": {4892, 25594}, + } + + for wantText, wantIds := range cases { + t.Run(wantText, func(t *testing.T) { + encoded, err := processor.Encode(wantText, false) + if err != nil { + t.Fatal(err) + } + + t.Log("encoded", "ids", encoded) + if diff := gocmp.Diff(wantIds, encoded); diff != "" { + t.Error("-want +got:", diff) + } + + // if len(wantIds) < 10 { + // for _, wantId := range wantIds { + // decoded, err := processor.Decode([]int32{wantId}) + // if 
err != nil { + // t.Fatal(err) + // } + // + // t.Log("decoded", "id", wantId, "text", decoded) + // } + // } + + decoded, err := processor.Decode(wantIds) + if err != nil { + t.Fatal(err) + } + + if diff := gocmp.Diff(wantText, decoded); diff != "" { + t.Error("-want +got:", diff) + } + }) + } +} + +func TestDecodeText(t *testing.T) { + m := setup(t) + + maxBatch := 512 + if cache := m.Config().Cache; cache != nil { + cache.Init(m.Backend(), ml.DTypeF16, 1, 32<<10, maxBatch) + defer cache.Close() + } + + processor, ok := m.(tokenizer.Tokenizer) + if !ok { + t.Fatal("not a text processor") + } + + // prompt := "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nthe capital of france is<|im_end|>\n<|im_start|>assistant\n" + // prompt := `<|begin▁of▁sentence|>hellohello` + // prompt := "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nthe capital of france is<|im_end|>\n<|im_start|>assistant\n" + // t.Log("prompt", args.prompt) + // prompt := args.prompt + prompt := "[gMASK]<|user|>\nThe capital of France is<|assistant|>\n" + ids, err := processor.Encode(prompt, true) + if err != nil { + t.Fatal(err) + } + + positions := make([]int32, len(ids)) + for i := range ids { + positions[i] = int32(i) + } + + for i := 0; i < len(ids); i += maxBatch { + t.Run("batch/"+strconv.Itoa(i), func(t *testing.T) { + j := min(len(ids), i+maxBatch) + + slog.Info("", "ids", ids[i:j], "positions", positions[i:j]) + + ctx := m.Backend().NewContext() + defer ctx.Close() + + batch := input.Batch{ + Inputs: ctx.Input().FromInts(ids[i:j], j-i), + Positions: positions[i:j], + Sequences: slices.Repeat([]int{0}, j-i), + // Outputs: ctx.Input().FromInts([]int32{positions[j-1]}, 1), + } + + tt, err := model.Forward(ctx, m, batch) + if err != nil { + t.Fatal(err) + } + + dump(t, ctx, tt) + }) + } +} + +func TestEncodeImage(t *testing.T) { + m := setup(t) + + processor, ok := m.(model.MultimodalProcessor) + if !ok { + t.Fatal("model is not a MultimodalProcessor") + } + + if args.image == "" { + t.Fatal("image argument is required for multimodal models") + } + + f, err := os.Open(args.image) + if err != nil { + t.Fatal(err) + } + defer f.Close() + + bts, err := io.ReadAll(f) + if err != nil { + t.Fatal(err) + } + + ctx := m.Backend().NewContext() + defer ctx.Close() + + mm, err := processor.EncodeMultimodal(ctx, bts) + if err != nil { + t.Fatal(err) + } + + if len(mm) < 1 { + t.Fatal("no multimodal tensors returned") + } + + dump(t, ctx, mm[0].Tensor) +} + +func TestMultimodal(t *testing.T) { + m := setup(t) + + maxBatch := 2048 + if cache := m.Config().Cache; cache != nil { + cache.Init(m.Backend(), ml.DTypeF16, 1, 32<<10, maxBatch) + defer cache.Close() + } + + ctx := m.Backend().NewContext() + defer ctx.Close() + + textProcessor, ok := m.(tokenizer.Tokenizer) + if !ok { + t.Fatal("not a text processor") + } + + prompt := "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|image_pad|>transcribe the text in this image<|im_end|>\n<|im_start|>assistant\n" + ids, err := textProcessor.Encode(prompt, true) + if err != nil { + t.Fatal(err) + } + + imageProcessor, ok := m.(model.MultimodalProcessor) + if !ok { + t.Fatal("model is not a MultimodalProcessor") + } + + f, err := os.Open(args.image) + if err != nil { + t.Fatal(err) + } + defer f.Close() + + bts, err := io.ReadAll(f) + if err != nil { + t.Fatal(err) + } + + mm, err := imageProcessor.EncodeMultimodal(ctx, bts) + if err != nil { + t.Fatal(err) + } + + imagePad, err := 
textProcessor.Encode("<|image_pad|>", false) + if err != nil { + t.Fatal(err) + } else if len(imagePad) != 1 { + t.Fatalf("expected image pad to be a single token, got %d tokens", len(imagePad)) + } + + inputs := make([]*input.Input, len(ids)) + for i, id := range ids { + inputs[i] = &input.Input{Token: id} + if id == imagePad[0] { + inputs[i].Multimodal = mm + } + } + + t.Log("inputs", len(inputs)) + t.Log("inputs", func() (s []int) { + s = make([]int, len(inputs)) + for i, input := range inputs { + s[i] = int(input.Token) + } + return s + }()) + + inputs, err = imageProcessor.PostTokenize(inputs) + if err != nil { + t.Fatal(err) + } + + t.Log("post tokenize inputs", len(inputs)) + + tt, err := model.Forward(ctx, m, input.Batch{ + Inputs: ctx.Input().FromInts(slices.Collect(func(yield func(int32) bool) { + for _, input := range inputs { + if !yield(input.Token) { + return + } + } + }), len(inputs)), + Positions: slices.Collect(func(yield func(int32) bool) { + for i := range inputs { + if !yield(int32(i)) { + return + } + } + }), + Sequences: slices.Repeat([]int{0}, len(inputs)), + Multimodal: slices.Collect(func(yield func(input.MultimodalIndex) bool) { + for i := range inputs { + if mm := inputs[i].Multimodal; mm != nil { + if !yield(input.MultimodalIndex{Index: i, Multimodal: mm}) { + return + } + } + } + }), + }) + if err != nil { + t.Fatal(err) + } + + dump(t, ctx, tt) +} diff --git a/model/model.go b/model/model.go index b77d7175d..42fe7f25c 100644 --- a/model/model.go +++ b/model/model.go @@ -23,6 +23,7 @@ import ( _ "github.com/ollama/ollama/ml/backend" "github.com/ollama/ollama/ml/nn/pooling" "github.com/ollama/ollama/model/input" + "github.com/ollama/ollama/tokenizer" ) var ( @@ -133,7 +134,7 @@ func New(modelPath string, params ml.BackendParams) (Model, error) { return m, nil } -func NewTextProcessor(s string) (TextProcessor, error) { +func NewTextProcessor(s string) (tokenizer.Tokenizer, error) { r, err := os.Open(s) if err != nil { return nil, err @@ -150,7 +151,7 @@ func NewTextProcessor(s string) (TextProcessor, error) { return nil, err } - tp, ok := m.(TextProcessor) + tp, ok := m.(tokenizer.Tokenizer) if !ok { return nil, ErrUnsupportedTokenizer } diff --git a/model/models/bert/embed.go b/model/models/bert/embed.go index 79cb3a3c7..3bce8bc0b 100644 --- a/model/models/bert/embed.go +++ b/model/models/bert/embed.go @@ -10,11 +10,12 @@ import ( "github.com/ollama/ollama/ml/nn/pooling" "github.com/ollama/ollama/model" "github.com/ollama/ollama/model/input" + "github.com/ollama/ollama/tokenizer" ) type Model struct { model.Base - model.TextProcessor + tokenizer.Tokenizer TokenEmbedding *nn.Embedding `gguf:"token_embd"` TypeEmbedding *nn.Embedding `gguf:"token_types"` @@ -129,7 +130,7 @@ func (o Options) headDim() int { } func New(c fs.Config) (model.Model, error) { - vocab := &model.Vocabulary{ + vocab := &tokenizer.Vocabulary{ Values: c.Strings("tokenizer.ggml.tokens"), Scores: c.Floats("tokenizer.ggml.scores"), Types: c.Ints("tokenizer.ggml.token_type"), @@ -153,17 +154,17 @@ func New(c fs.Config) (model.Model, error) { }, } - var processor model.TextProcessor + var t tokenizer.Tokenizer switch c.String("tokenizer.ggml.model", "bert") { case "bert": - processor = model.NewWordPiece(vocab, true) + t = tokenizer.NewWordPiece(vocab, true) default: return nil, model.ErrUnsupportedTokenizer } return &Model{ - TextProcessor: processor, - Layers: make([]EncoderLayer, c.Uint("block_count")), + Tokenizer: t, + Layers: make([]EncoderLayer, c.Uint("block_count")), Options: Options{ 
hiddenSize: int(c.Uint("embedding_length")), numHeads: int(c.Uint("attention.head_count")), diff --git a/model/models/deepseek2/model.go b/model/models/deepseek2/model.go index 576076aab..0bf9e7da3 100644 --- a/model/models/deepseek2/model.go +++ b/model/models/deepseek2/model.go @@ -13,6 +13,7 @@ import ( "github.com/ollama/ollama/ml/nn/rope" "github.com/ollama/ollama/model" "github.com/ollama/ollama/model/input" + "github.com/ollama/ollama/tokenizer" ) type Options struct { @@ -222,7 +223,7 @@ func (t *Layer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tens type Model struct { model.Base - model.BytePairEncoding + tokenizer.Tokenizer TokenEmbedding *nn.Embedding `gguf:"token_embd"` Layers []Layer `gguf:"blk"` @@ -277,8 +278,8 @@ func New(c fs.Config) (model.Model, error) { } m := Model{ - BytePairEncoding: model.NewBytePairEncoding( - &model.Vocabulary{ + Tokenizer: tokenizer.NewBytePairEncoding( + &tokenizer.Vocabulary{ Values: c.Strings("tokenizer.ggml.tokens"), Types: c.Ints("tokenizer.ggml.token_type"), Merges: c.Strings("tokenizer.ggml.merges"), diff --git a/model/models/deepseekocr/model.go b/model/models/deepseekocr/model.go index 4fc069b69..9bfb5596a 100644 --- a/model/models/deepseekocr/model.go +++ b/model/models/deepseekocr/model.go @@ -10,11 +10,12 @@ import ( "github.com/ollama/ollama/ml/nn" "github.com/ollama/ollama/model" "github.com/ollama/ollama/model/input" + "github.com/ollama/ollama/tokenizer" ) type Model struct { model.Base - model.TextProcessor + tokenizer.Tokenizer Sam *samModel `gguf:"s"` Vision *visionModel `gguf:"v"` @@ -134,8 +135,8 @@ func init() { } m := Model{ - TextProcessor: model.NewBytePairEncoding( - &model.Vocabulary{ + Tokenizer: tokenizer.NewBytePairEncoding( + &tokenizer.Vocabulary{ Values: c.Strings("tokenizer.ggml.tokens"), Types: c.Ints("tokenizer.ggml.token_type"), Merges: c.Strings("tokenizer.ggml.merges"), diff --git a/model/models/gemma2/model.go b/model/models/gemma2/model.go index 7b0aa2f01..56ac69922 100644 --- a/model/models/gemma2/model.go +++ b/model/models/gemma2/model.go @@ -10,6 +10,7 @@ import ( "github.com/ollama/ollama/ml/nn/rope" "github.com/ollama/ollama/model" "github.com/ollama/ollama/model/input" + "github.com/ollama/ollama/tokenizer" ) type Options struct { @@ -27,7 +28,7 @@ func (o Options) applyRotaryPositionEmbeddings(ctx ml.Context, states, positions type Model struct { model.Base - model.SentencePiece + tokenizer.Tokenizer TokenEmbedding *nn.Embedding `gguf:"token_embd"` Layers []Layer `gguf:"blk"` @@ -43,8 +44,8 @@ const ( func New(c fs.Config) (model.Model, error) { m := Model{ - SentencePiece: model.NewSentencePiece( - &model.Vocabulary{ + Tokenizer: tokenizer.NewSentencePiece( + &tokenizer.Vocabulary{ Values: c.Strings("tokenizer.ggml.tokens"), Scores: c.Floats("tokenizer.ggml.scores"), Types: c.Ints("tokenizer.ggml.token_type"), diff --git a/model/models/gemma3/embed.go b/model/models/gemma3/embed.go index 9251111cf..6ad7f82cb 100644 --- a/model/models/gemma3/embed.go +++ b/model/models/gemma3/embed.go @@ -7,11 +7,12 @@ import ( "github.com/ollama/ollama/ml/nn/pooling" "github.com/ollama/ollama/model" "github.com/ollama/ollama/model/input" + "github.com/ollama/ollama/tokenizer" ) type embedModel struct { model.Base - model.SentencePiece + tokenizer.Tokenizer *TextModel poolingType pooling.Type @@ -31,8 +32,8 @@ func (m *embedModel) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, erro func newEmbedModel(c fs.Config) (model.Model, error) { m := &embedModel{ - SentencePiece: 
model.NewSentencePiece( - &model.Vocabulary{ + Tokenizer: tokenizer.NewSentencePiece( + &tokenizer.Vocabulary{ Values: c.Strings("tokenizer.ggml.tokens"), Scores: c.Floats("tokenizer.ggml.scores"), Types: c.Ints("tokenizer.ggml.token_type"), diff --git a/model/models/gemma3/model.go b/model/models/gemma3/model.go index e595f1863..4f5f0e404 100644 --- a/model/models/gemma3/model.go +++ b/model/models/gemma3/model.go @@ -12,11 +12,12 @@ import ( "github.com/ollama/ollama/ml/nn" "github.com/ollama/ollama/model" "github.com/ollama/ollama/model/input" + "github.com/ollama/ollama/tokenizer" ) type Model struct { model.Base - model.TextProcessor + tokenizer.Tokenizer *VisionModel `gguf:"v"` *TextModel @@ -54,7 +55,7 @@ func (p *MultiModalProjector) Forward(ctx ml.Context, visionOutputs ml.Tensor, i } func New(c fs.Config) (model.Model, error) { - vocabulary := model.Vocabulary{ + vocabulary := tokenizer.Vocabulary{ Values: c.Strings("tokenizer.ggml.tokens"), Scores: c.Floats("tokenizer.ggml.scores"), Types: c.Ints("tokenizer.ggml.token_type"), @@ -70,19 +71,19 @@ func New(c fs.Config) (model.Model, error) { ), } - var processor model.TextProcessor + var t tokenizer.Tokenizer switch c.String("tokenizer.ggml.model") { case "gpt2": - processor = model.NewBytePairEncoding(&vocabulary) + t = tokenizer.NewBytePairEncoding(&vocabulary) default: // Previous uploads of Gemma 3 on Ollama did not have token 106 // (i.e. "") so we need to add in case it's not already present vocabulary.EOS = append(vocabulary.EOS, int32(c.Uint("tokenizer.ggml.eot_token_id", 106))) - processor = model.NewSentencePiece(&vocabulary) + t = tokenizer.NewSentencePiece(&vocabulary) } m := Model{ - TextProcessor: processor, + Tokenizer: t, ImageProcessor: newImageProcessor(c), VisionModel: newVisionModel(c), TextModel: newTextModel(c), diff --git a/model/models/gemma3n/model.go b/model/models/gemma3n/model.go index e59e3193f..758745c5e 100644 --- a/model/models/gemma3n/model.go +++ b/model/models/gemma3n/model.go @@ -6,11 +6,12 @@ import ( "github.com/ollama/ollama/ml" "github.com/ollama/ollama/model" "github.com/ollama/ollama/model/input" + "github.com/ollama/ollama/tokenizer" ) type Model struct { model.Base - model.SentencePiece + tokenizer.Tokenizer *TextModel } @@ -23,8 +24,8 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { func New(c fs.Config) (model.Model, error) { m := Model{ TextModel: newTextModel(c), - SentencePiece: model.NewSentencePiece( - &model.Vocabulary{ + Tokenizer: tokenizer.NewSentencePiece( + &tokenizer.Vocabulary{ Values: c.Strings("tokenizer.ggml.tokens"), Scores: c.Floats("tokenizer.ggml.scores"), Types: c.Ints("tokenizer.ggml.token_type"), diff --git a/model/models/glm4moelite/model.go b/model/models/glm4moelite/model.go index 4d1e54aa6..d5d50ad82 100644 --- a/model/models/glm4moelite/model.go +++ b/model/models/glm4moelite/model.go @@ -10,6 +10,7 @@ import ( "github.com/ollama/ollama/ml/nn" "github.com/ollama/ollama/model" "github.com/ollama/ollama/model/input" + "github.com/ollama/ollama/tokenizer" ) var ErrOldModelFormat = errors.New("this model uses a weight format that is no longer supported; please re-download it") @@ -198,7 +199,7 @@ func (t *Layer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tens type Model struct { model.Base - model.BytePairEncoding + tokenizer.Tokenizer TokenEmbedding *nn.Embedding `gguf:"token_embd"` Layers []Layer `gguf:"blk"` @@ -236,8 +237,8 @@ func New(c fs.Config) (model.Model, error) { } m := Model{ - BytePairEncoding: 
model.NewBytePairEncoding( - &model.Vocabulary{ + Tokenizer: tokenizer.NewBytePairEncoding( + &tokenizer.Vocabulary{ Values: c.Strings("tokenizer.ggml.tokens"), Types: c.Ints("tokenizer.ggml.token_type"), Merges: c.Strings("tokenizer.ggml.merges"), diff --git a/model/models/gptoss/model.go b/model/models/gptoss/model.go index 9d1520bf3..2a3610a56 100644 --- a/model/models/gptoss/model.go +++ b/model/models/gptoss/model.go @@ -12,11 +12,12 @@ import ( "github.com/ollama/ollama/ml/nn/rope" "github.com/ollama/ollama/model" "github.com/ollama/ollama/model/input" + "github.com/ollama/ollama/tokenizer" ) type Transformer struct { model.Base - model.BytePairEncoding + tokenizer.Tokenizer TokenEmbedding *nn.Embedding `gguf:"token_embd"` TransformerBlocks []TransformerBlock `gguf:"blk"` @@ -196,8 +197,8 @@ func (mlp *MLPBlock) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *Optio func New(c fs.Config) (model.Model, error) { m := Transformer{ TransformerBlocks: make([]TransformerBlock, c.Uint("block_count")), - BytePairEncoding: model.NewBytePairEncoding( - &model.Vocabulary{ + Tokenizer: tokenizer.NewBytePairEncoding( + &tokenizer.Vocabulary{ Values: c.Strings("tokenizer.ggml.tokens"), Types: c.Ints("tokenizer.ggml.token_type"), Merges: c.Strings("tokenizer.ggml.merges"), diff --git a/model/models/lfm2/model.go b/model/models/lfm2/model.go index 8ebeda1b8..51e40d3c3 100644 --- a/model/models/lfm2/model.go +++ b/model/models/lfm2/model.go @@ -10,6 +10,7 @@ import ( "github.com/ollama/ollama/ml/nn/rope" "github.com/ollama/ollama/model" "github.com/ollama/ollama/model/input" + "github.com/ollama/ollama/tokenizer" ) type Options struct { @@ -59,7 +60,7 @@ func (o Options) applyRotaryPositionEmbeddings(ctx ml.Context, states, positions type Model struct { model.Base - model.TextProcessor + tokenizer.Tokenizer TokenEmbedding *nn.Embedding `gguf:"token_embd"` Layers []Layer `gguf:"blk"` @@ -78,7 +79,7 @@ func New(c fs.Config) (model.Model, error) { return nil, model.ErrUnsupportedTokenizer } - vocabulary := model.Vocabulary{ + vocabulary := tokenizer.Vocabulary{ Values: c.Strings("tokenizer.ggml.tokens"), Scores: c.Floats("tokenizer.ggml.scores"), Types: c.Ints("tokenizer.ggml.token_type"), @@ -104,8 +105,8 @@ func New(c fs.Config) (model.Model, error) { } m := Model{ - TextProcessor: model.NewBytePairEncoding(&vocabulary, pretokenizers...), - Layers: make([]Layer, c.Uint("block_count")), + Tokenizer: tokenizer.NewBytePairEncoding(&vocabulary, pretokenizers...), + Layers: make([]Layer, c.Uint("block_count")), Options: Options{ hiddenSize: int(c.Uint("embedding_length")), headDim: int(c.Uint("attention.key_length")), diff --git a/model/models/llama/model.go b/model/models/llama/model.go index 5ff4894e4..ad95c5c00 100644 --- a/model/models/llama/model.go +++ b/model/models/llama/model.go @@ -11,6 +11,7 @@ import ( "github.com/ollama/ollama/ml/nn/rope" "github.com/ollama/ollama/model" "github.com/ollama/ollama/model/input" + "github.com/ollama/ollama/tokenizer" ) type Options struct { @@ -25,7 +26,7 @@ func (o Options) applyRotaryPositionEmbeddings(ctx ml.Context, states, positions type Model struct { model.Base - model.TextProcessor + tokenizer.Tokenizer TokenEmbedding *nn.Embedding `gguf:"token_embd"` Layers []Layer `gguf:"blk"` @@ -41,8 +42,8 @@ func New(c fs.Config) (model.Model, error) { return nil, model.ErrUnsupportedModel } - var processor model.TextProcessor - vocabulary := model.Vocabulary{ + var processor tokenizer.Tokenizer + vocabulary := tokenizer.Vocabulary{ Values: 
c.Strings("tokenizer.ggml.tokens"), Scores: c.Floats("tokenizer.ggml.scores"), Types: c.Ints("tokenizer.ggml.token_type"), @@ -80,16 +81,16 @@ func New(c fs.Config) (model.Model, error) { "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", } } - processor = model.NewBytePairEncoding(&vocabulary, pretokenizers...) + processor = tokenizer.NewBytePairEncoding(&vocabulary, pretokenizers...) case "llama": - processor = model.NewSentencePiece(&vocabulary) + processor = tokenizer.NewSentencePiece(&vocabulary) default: return nil, model.ErrUnsupportedTokenizer } m := Model{ - TextProcessor: processor, - Layers: make([]Layer, c.Uint("block_count")), + Tokenizer: processor, + Layers: make([]Layer, c.Uint("block_count")), Options: Options{ hiddenSize: int(c.Uint("embedding_length")), numHeads: int(c.Uint("attention.head_count")), diff --git a/model/models/llama4/model.go b/model/models/llama4/model.go index 4a22bc4bb..c8373b7eb 100644 --- a/model/models/llama4/model.go +++ b/model/models/llama4/model.go @@ -11,11 +11,12 @@ import ( "github.com/ollama/ollama/ml/nn" "github.com/ollama/ollama/model" "github.com/ollama/ollama/model/input" + "github.com/ollama/ollama/tokenizer" ) type Model struct { model.Base - model.BytePairEncoding + tokenizer.Tokenizer ImageProcessor *VisionModel `gguf:"v"` @@ -33,8 +34,8 @@ func (p *Projector) Forward(ctx ml.Context, visionOutputs ml.Tensor) ml.Tensor { func New(c fs.Config) (model.Model, error) { m := Model{ - BytePairEncoding: model.NewBytePairEncoding( - &model.Vocabulary{ + Tokenizer: tokenizer.NewBytePairEncoding( + &tokenizer.Vocabulary{ Values: c.Strings("tokenizer.ggml.tokens"), Types: c.Ints("tokenizer.ggml.token_type"), Merges: c.Strings("tokenizer.ggml.merges"), diff --git a/model/models/mistral3/model.go b/model/models/mistral3/model.go index 8230dde39..8485d34ca 100644 --- a/model/models/mistral3/model.go +++ b/model/models/mistral3/model.go @@ -11,11 +11,12 @@ import ( "github.com/ollama/ollama/ml/nn" "github.com/ollama/ollama/model" "github.com/ollama/ollama/model/input" + "github.com/ollama/ollama/tokenizer" ) type Model struct { model.Base - model.BytePairEncoding + tokenizer.Tokenizer *TextModel *VisionModel `gguf:"v"` @@ -28,12 +29,12 @@ type Model struct { var _ model.MultimodalProcessor = (*Model)(nil) // Implement TextProcessor interface -var _ model.TextProcessor = (*Model)(nil) +var _ tokenizer.Tokenizer = (*Model)(nil) func New(c fs.Config) (model.Model, error) { m := &Model{ - BytePairEncoding: model.NewBytePairEncoding( - &model.Vocabulary{ + Tokenizer: tokenizer.NewBytePairEncoding( + &tokenizer.Vocabulary{ Values: c.Strings("tokenizer.ggml.tokens"), Types: c.Ints("tokenizer.ggml.token_type"), Merges: c.Strings("tokenizer.ggml.merges"), diff --git a/model/models/mllama/model.go b/model/models/mllama/model.go index 58fd5adcf..2d032467d 100644 --- a/model/models/mllama/model.go +++ b/model/models/mllama/model.go @@ -11,11 +11,12 @@ import ( "github.com/ollama/ollama/ml/nn" "github.com/ollama/ollama/model" "github.com/ollama/ollama/model/input" + "github.com/ollama/ollama/tokenizer" ) type Model struct { model.Base - model.BytePairEncoding + tokenizer.Tokenizer *VisionModel `gguf:"v"` *TextModel @@ -32,8 +33,8 @@ const ( func New(c fs.Config) (model.Model, error) { m := Model{ - BytePairEncoding: model.NewBytePairEncoding( - &model.Vocabulary{ + Tokenizer: tokenizer.NewBytePairEncoding( + &tokenizer.Vocabulary{ Values: c.Strings("tokenizer.ggml.tokens"), 
Types: c.Ints("tokenizer.ggml.token_type"), Merges: c.Strings("tokenizer.ggml.merges"), diff --git a/model/models/nomicbert/model.go b/model/models/nomicbert/model.go index 096d046a0..4a414f6d5 100644 --- a/model/models/nomicbert/model.go +++ b/model/models/nomicbert/model.go @@ -11,11 +11,12 @@ import ( "github.com/ollama/ollama/ml/nn/rope" "github.com/ollama/ollama/model" "github.com/ollama/ollama/model/input" + "github.com/ollama/ollama/tokenizer" ) type Model struct { model.Base - model.TextProcessor + tokenizer.Tokenizer TokenEmbedding *nn.Embedding `gguf:"token_embd"` TypeEmbedding *nn.Embedding `gguf:"token_types"` @@ -178,8 +179,8 @@ func New(c fs.Config) (model.Model, error) { numHeads := int(c.Uint("attention.head_count")) headDim := hiddenSize / numHeads - processor := model.NewWordPiece( - &model.Vocabulary{ + tokenizer := tokenizer.NewWordPiece( + &tokenizer.Vocabulary{ Values: c.Strings("tokenizer.ggml.tokens"), Scores: c.Floats("tokenizer.ggml.scores"), Types: c.Ints("tokenizer.ggml.token_type"), @@ -219,8 +220,8 @@ func New(c fs.Config) (model.Model, error) { } return &Model{ - TextProcessor: processor, - Layers: layers, + Tokenizer: tokenizer, + Layers: layers, Options: Options{ hiddenSize: hiddenSize, numHeads: numHeads, diff --git a/model/models/olmo3/model.go b/model/models/olmo3/model.go index 523c00e68..048869a40 100644 --- a/model/models/olmo3/model.go +++ b/model/models/olmo3/model.go @@ -11,6 +11,7 @@ import ( "github.com/ollama/ollama/ml/nn/rope" "github.com/ollama/ollama/model" "github.com/ollama/ollama/model/input" + "github.com/ollama/ollama/tokenizer" ) const ( @@ -33,7 +34,7 @@ type Options struct { type Model struct { model.Base - model.TextProcessor + tokenizer.Tokenizer TokenEmbedding *nn.Embedding `gguf:"token_embd"` Layers []Layer `gguf:"blk"` @@ -44,7 +45,7 @@ type Model struct { } func New(c fs.Config) (model.Model, error) { - vocabulary := model.Vocabulary{ + vocabulary := tokenizer.Vocabulary{ Values: c.Strings("tokenizer.ggml.tokens"), Scores: c.Floats("tokenizer.ggml.scores"), Types: c.Ints("tokenizer.ggml.token_type"), @@ -58,14 +59,14 @@ func New(c fs.Config) (model.Model, error) { ), } - processor := model.NewBytePairEncoding( + tokenizer := tokenizer.NewBytePairEncoding( &vocabulary, "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", ) m := Model{ - TextProcessor: processor, - Layers: make([]Layer, c.Uint("block_count")), + Tokenizer: tokenizer, + Layers: make([]Layer, c.Uint("block_count")), Options: Options{ hiddenSize: int(c.Uint("embedding_length")), numHeads: int(c.Uint("attention.head_count")), diff --git a/model/models/qwen2/model.go b/model/models/qwen2/model.go index 66f546ae6..17ed0d327 100644 --- a/model/models/qwen2/model.go +++ b/model/models/qwen2/model.go @@ -13,6 +13,7 @@ import ( "github.com/ollama/ollama/ml/nn/rope" "github.com/ollama/ollama/model" "github.com/ollama/ollama/model/input" + "github.com/ollama/ollama/tokenizer" ) type Options struct { @@ -92,7 +93,7 @@ func (d DecoderLayer) Forward(ctx ml.Context, hiddenStates, positions, outputs m type Model struct { model.Base - model.BytePairEncoding + tokenizer.Tokenizer TokenEmbedding *nn.Embedding `gguf:"token_embd"` Layers []DecoderLayer `gguf:"blk"` @@ -139,8 +140,8 @@ func New(c fs.Config) (model.Model, error) { } m := Model{ Layers: make([]DecoderLayer, c.Uint("block_count")), - BytePairEncoding: model.NewBytePairEncoding( - &model.Vocabulary{ + Tokenizer: tokenizer.NewBytePairEncoding( 
+ &tokenizer.Vocabulary{ Values: c.Strings("tokenizer.ggml.tokens"), Types: c.Ints("tokenizer.ggml.token_type"), Merges: c.Strings("tokenizer.ggml.merges"), diff --git a/model/models/qwen25vl/model.go b/model/models/qwen25vl/model.go index 81296a81b..68230707f 100644 --- a/model/models/qwen25vl/model.go +++ b/model/models/qwen25vl/model.go @@ -10,11 +10,12 @@ import ( "github.com/ollama/ollama/ml" "github.com/ollama/ollama/model" "github.com/ollama/ollama/model/input" + "github.com/ollama/ollama/tokenizer" ) type Model struct { model.Base - model.BytePairEncoding + tokenizer.Tokenizer *TextModel *VisionModel `gguf:"v"` @@ -27,8 +28,8 @@ var _ model.MultimodalProcessor = (*Model)(nil) func New(c fs.Config) (model.Model, error) { m := &Model{ - BytePairEncoding: model.NewBytePairEncoding( - &model.Vocabulary{ + Tokenizer: tokenizer.NewBytePairEncoding( + &tokenizer.Vocabulary{ Values: c.Strings("tokenizer.ggml.tokens"), Types: c.Ints("tokenizer.ggml.token_type"), Merges: c.Strings("tokenizer.ggml.merges"), diff --git a/model/models/qwen3/embed.go b/model/models/qwen3/embed.go index c03888d45..c10390ff7 100644 --- a/model/models/qwen3/embed.go +++ b/model/models/qwen3/embed.go @@ -7,11 +7,12 @@ import ( "github.com/ollama/ollama/ml/nn/pooling" "github.com/ollama/ollama/model" "github.com/ollama/ollama/model/input" + "github.com/ollama/ollama/tokenizer" ) type embedModel struct { model.Base - model.BytePairEncoding + tokenizer.Tokenizer *Model poolingType pooling.Type @@ -34,8 +35,8 @@ func newEmbed(c fs.Config) (model.Model, error) { layers[i].MLP = &dense{} } m := embedModel{ - BytePairEncoding: model.NewBytePairEncoding( - &model.Vocabulary{ + Tokenizer: tokenizer.NewBytePairEncoding( + &tokenizer.Vocabulary{ Values: c.Strings("tokenizer.ggml.tokens"), Types: c.Ints("tokenizer.ggml.token_type"), Merges: c.Strings("tokenizer.ggml.merges"), diff --git a/model/models/qwen3/model.go b/model/models/qwen3/model.go index d7747364e..2602be44e 100644 --- a/model/models/qwen3/model.go +++ b/model/models/qwen3/model.go @@ -12,6 +12,7 @@ import ( "github.com/ollama/ollama/ml/nn/rope" "github.com/ollama/ollama/model" "github.com/ollama/ollama/model/input" + "github.com/ollama/ollama/tokenizer" ) type Options struct { @@ -159,7 +160,7 @@ func (d *Layer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tens type Model struct { model.Base - model.BytePairEncoding + tokenizer.Tokenizer TokenEmbedding *nn.Embedding `gguf:"token_embd"` OutputNorm *nn.RMSNorm `gguf:"output_norm"` @@ -218,8 +219,8 @@ func New(c fs.Config) (model.Model, error) { } m := Model{ - BytePairEncoding: model.NewBytePairEncoding( - &model.Vocabulary{ + Tokenizer: tokenizer.NewBytePairEncoding( + &tokenizer.Vocabulary{ Values: c.Strings("tokenizer.ggml.tokens"), Types: c.Ints("tokenizer.ggml.token_type"), Merges: c.Strings("tokenizer.ggml.merges"), diff --git a/model/models/qwen3vl/model.go b/model/models/qwen3vl/model.go index cb1ce8d2c..740c548ff 100644 --- a/model/models/qwen3vl/model.go +++ b/model/models/qwen3vl/model.go @@ -10,11 +10,12 @@ import ( "github.com/ollama/ollama/ml" "github.com/ollama/ollama/model" "github.com/ollama/ollama/model/input" + "github.com/ollama/ollama/tokenizer" ) type Model struct { model.Base - model.TextProcessor + tokenizer.Tokenizer *TextModel *VisionModel `gguf:"v"` @@ -172,8 +173,8 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { func New(c fs.Config) (model.Model, error) { m := Model{ - TextProcessor: model.NewBytePairEncoding( - &model.Vocabulary{ + 
Tokenizer: tokenizer.NewBytePairEncoding( + &tokenizer.Vocabulary{ Values: c.Strings("tokenizer.ggml.tokens"), Types: c.Ints("tokenizer.ggml.token_type"), Merges: c.Strings("tokenizer.ggml.merges"), diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go index a756cba23..a9b45e776 100644 --- a/runner/ollamarunner/runner.go +++ b/runner/ollamarunner/runner.go @@ -37,6 +37,7 @@ import ( "github.com/ollama/ollama/model/input" "github.com/ollama/ollama/runner/common" "github.com/ollama/ollama/sample" + "github.com/ollama/ollama/tokenizer" _ "github.com/ollama/ollama/model/models" ) @@ -210,9 +211,9 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe } // calculateLogprobs converts raw logits to log probabilities and finds top K tokens -func calculateLogprobs(logits []float32, selectedToken int32, topK int, textProcessor model.TextProcessor) []llm.Logprob { +func calculateLogprobs(logits []float32, selectedToken int32, topK int, tokenizer tokenizer.Tokenizer) []llm.Logprob { decoder := func(tokenID int) string { - text, _ := textProcessor.Decode([]int32{int32(tokenID)}) + text, _ := tokenizer.Decode([]int32{int32(tokenID)}) return text } return common.CalculateLogprobs(logits, int(selectedToken), topK, decoder) @@ -242,7 +243,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]*input.Input, for i, part := range parts { // text - tokenize - tokens, err := s.model.(model.TextProcessor).Encode(part, i == 0) + tokens, err := s.model.(tokenizer.Tokenizer).Encode(part, i == 0) if err != nil { return nil, nil, nil, err } @@ -766,7 +767,7 @@ func (s *Server) computeBatch(activeBatch batchState) { nextBatchTokens[i].Token = token // if it's an end of sequence token, break - if s.model.(model.TextProcessor).Is(token, model.SpecialEOS) { + if s.model.(tokenizer.Tokenizer).Is(token, tokenizer.SpecialEOS) { // TODO (jmorganca): we should send this back // as it's important for the /api/generate context // seq.responses <- piece @@ -775,14 +776,14 @@ func (s *Server) computeBatch(activeBatch batchState) { continue } - piece, err := s.model.(model.TextProcessor).Decode([]int32{token}) + piece, err := s.model.(tokenizer.Tokenizer).Decode([]int32{token}) if err != nil { panic("failed to decode token") } // Calculate logprobs if requested (after EOS check to avoid logprobs for EOS tokens) if seq.logprobs { - logprobs := calculateLogprobs(logits, token, seq.topLogprobs, s.model.(model.TextProcessor)) + logprobs := calculateLogprobs(logits, token, seq.topLogprobs, s.model.(tokenizer.Tokenizer)) seq.pendingLogprobs = append(seq.pendingLogprobs, logprobs...) 
} @@ -873,7 +874,7 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) { var grammar *sample.GrammarSampler var err error if req.Grammar != "" { - grammar, err = sample.NewGrammarSampler(s.model.(model.TextProcessor), req.Grammar) + grammar, err = sample.NewGrammarSampler(s.model.(tokenizer.Tokenizer), req.Grammar) if err != nil { http.Error(w, "failed to load model vocabulary required for format", http.StatusInternalServerError) return diff --git a/sample/samplers.go b/sample/samplers.go index d395650d9..a8cf5def8 100644 --- a/sample/samplers.go +++ b/sample/samplers.go @@ -7,7 +7,7 @@ import ( "slices" "github.com/ollama/ollama/llama" - "github.com/ollama/ollama/model" + "github.com/ollama/ollama/tokenizer" ) // token represents information about a single token during sampling @@ -168,15 +168,15 @@ type GrammarSampler struct { grammar *llama.Grammar } -func NewGrammarSampler(model model.TextProcessor, grammarStr string) (*GrammarSampler, error) { - vocabIds := make([]uint32, len(model.Vocabulary().Values)) - pieces := make([]string, len(model.Vocabulary().Values)) - for i := range model.Vocabulary().Values { - pieces[i], _ = model.Decode([]int32{int32(i)}) +func NewGrammarSampler(tokenizer tokenizer.Tokenizer, grammarStr string) (*GrammarSampler, error) { + vocabIds := make([]uint32, len(tokenizer.Vocabulary().Values)) + pieces := make([]string, len(tokenizer.Vocabulary().Values)) + for i := range tokenizer.Vocabulary().Values { + pieces[i], _ = tokenizer.Decode([]int32{int32(i)}) vocabIds[i] = uint32(i) } - grammar := llama.NewGrammar(grammarStr, vocabIds, pieces, model.Vocabulary().EOS) + grammar := llama.NewGrammar(grammarStr, vocabIds, pieces, tokenizer.Vocabulary().EOS) if grammar == nil { return nil, errors.New("sample: failed to initialize grammar") } diff --git a/sample/samplers_test.go b/sample/samplers_test.go index eb10295d4..c9790fa8d 100644 --- a/sample/samplers_test.go +++ b/sample/samplers_test.go @@ -8,7 +8,7 @@ import ( "path/filepath" "testing" - "github.com/ollama/ollama/model" + "github.com/ollama/ollama/tokenizer" ) func TestWeighted(t *testing.T) { @@ -60,10 +60,10 @@ func TestWeighted(t *testing.T) { } } -func modelHelper(t testing.TB) model.BytePairEncoding { +func modelHelper(t testing.TB) tokenizer.Tokenizer { t.Helper() - f, err := os.Open(filepath.Join("..", "model", "testdata", "llama3.2", "encoder.json")) + f, err := os.Open(filepath.Join("..", "tokenizer", "testdata", "llama3.2", "encoder.json")) if err != nil { t.Fatal(err) } @@ -81,8 +81,8 @@ func modelHelper(t testing.TB) model.BytePairEncoding { merges := make([]string, 0, 1) // Only need vocab for Grammar Test - return model.NewBytePairEncoding( - &model.Vocabulary{ + return tokenizer.NewBytePairEncoding( + &tokenizer.Vocabulary{ Values: tokens, Types: make([]int32, len(vocab)), Merges: merges, diff --git a/model/bytepairencoding.go b/tokenizer/bytepairencoding.go similarity index 83% rename from model/bytepairencoding.go rename to tokenizer/bytepairencoding.go index 765331bf8..da6010758 100644 --- a/model/bytepairencoding.go +++ b/tokenizer/bytepairencoding.go @@ -1,8 +1,10 @@ -package model +package tokenizer import ( "cmp" + "fmt" "iter" + "log/slog" "slices" "strings" "github.com/dlclark/regexp2" "github.com/ollama/ollama/logutil" ) -type BytePairEncoding struct { +type bytePairEncoding struct { vocab *Vocabulary regexps []*regexp2.Regexp } -var _ TextProcessor = (*BytePairEncoding)(nil) +var _ Tokenizer = (*bytePairEncoding)(nil) -func NewBytePairEncoding(vocab *Vocabulary, 
pretokenizers ...string) BytePairEncoding { - if len(pretokenizers) == 0 { +func NewBytePairEncoding(vocab *Vocabulary, pretokenizer ...string) bytePairEncoding { + if len(pretokenizer) == 0 { // set default byte-level pretokenizer if none provided, e.g. - // https://github.com/huggingface/tokenizers/blob/main/tokenizers/src/pre_tokenizers/byte_level.rs#L44 - pretokenizers = []string{`'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`} + // https://github.com/huggingface/tokenizers/blob/main/tokenizers/src/pre_tokenizers/byte_level.rs#L44 + pretokenizer = []string{`'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`} } - return BytePairEncoding{ + return bytePairEncoding{ vocab: vocab, regexps: slices.Collect(func(yield func(*regexp2.Regexp) bool) { - for _, p := range pretokenizers { + for _, p := range pretokenizer { if !yield(regexp2.MustCompile(p, regexp2.RE2)) { return } @@ -37,15 +39,15 @@ func NewBytePairEncoding(vocab *Vocabulary, pretokenizers ...string) BytePairEnc } } -func (bpe BytePairEncoding) Vocabulary() *Vocabulary { +func (bpe bytePairEncoding) Vocabulary() *Vocabulary { return bpe.vocab } -func (bpe BytePairEncoding) Is(id int32, special Special) bool { +func (bpe bytePairEncoding) Is(id int32, special Special) bool { return bpe.vocab.Is(id, special) } -func (bpe *BytePairEncoding) split(s string) iter.Seq[string] { +func (bpe *bytePairEncoding) split(s string) iter.Seq[string] { parts := []string{s} for _, re := range bpe.regexps { parts = slices.Collect(func(yield func(string) bool) { @@ -96,7 +98,7 @@ type merge struct { runes []rune } -func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) { +func (bpe bytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) { fragments := []fragment{{value: s}} for _, special := range bpe.vocab.SpecialVocabulary() { // TODO: process special tokens concurrently @@ -243,7 +245,15 @@ func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) { return ids, nil } -func (bpe BytePairEncoding) Decode(ids []int32) (string, error) { +type lazyIdsString struct { + ids []int32 +} + +func (l lazyIdsString) LogValue() slog.Value { + return slog.AnyValue(fmt.Sprint(l.ids)) +} + +func (bpe bytePairEncoding) Decode(ids []int32) (string, error) { var sb strings.Builder for _, id := range ids { for _, r := range bpe.vocab.Decode(id) { @@ -267,6 +277,6 @@ func (bpe BytePairEncoding) Decode(ids []int32) (string, error) { } } - logutil.Trace("decoded", "string", sb.String(), "from", ids) + logutil.Trace("decoded", "string", sb.String(), "from", lazyIdsString{ids: ids}) return sb.String(), nil } diff --git a/model/bytepairencoding_test.go b/tokenizer/bytepairencoding_test.go similarity index 97% rename from model/bytepairencoding_test.go rename to tokenizer/bytepairencoding_test.go index 15cb56ca9..a94904836 100644 --- a/model/bytepairencoding_test.go +++ b/tokenizer/bytepairencoding_test.go @@ -1,4 +1,4 @@ -package model +package tokenizer import ( "bufio" @@ -14,10 +14,10 @@ import ( "github.com/google/go-cmp/cmp" ) -func llama(t testing.TB) BytePairEncoding { +func llama(t testing.TB) bytePairEncoding { t.Helper() - f, err := os.Open(filepath.Join("testdata", "llama3.2", "encoder.json")) + f, err := os.Open(filepath.FromSlash("testdata/llama3.2/encoder.json")) if err != nil { t.Fatal(err) } @@ -43,7 +43,7 @@ func llama(t testing.TB) bytePairEncoding { } } - f, err = os.Open(filepath.Join("testdata", "llama3.2", "vocab.bpe")) + f, err = 
os.Open(filepath.FromSlash("testdata/llama3.2/vocab.bpe")) if err != nil { t.Fatal(err) } diff --git a/model/sentencepiece.go b/tokenizer/sentencepiece.go similarity index 92% rename from model/sentencepiece.go rename to tokenizer/sentencepiece.go index 2c178ec0c..baf6383ba 100644 --- a/model/sentencepiece.go +++ b/tokenizer/sentencepiece.go @@ -1,4 +1,4 @@ -package model +package tokenizer import ( "container/heap" @@ -12,18 +12,18 @@ import ( const spmWhitespaceSep = "▁" -type SentencePiece struct { +type sentencePiece struct { maxTokenLen int vocab *Vocabulary } -var _ TextProcessor = (*SentencePiece)(nil) +var _ Tokenizer = (*sentencePiece)(nil) -func (spm SentencePiece) Vocabulary() *Vocabulary { +func (spm sentencePiece) Vocabulary() *Vocabulary { return spm.vocab } -func NewSentencePiece(vocab *Vocabulary) SentencePiece { +func NewSentencePiece(vocab *Vocabulary) sentencePiece { logutil.Trace("Tokens", "num tokens", len(vocab.Values), "vals", vocab.Values[:5], "scores", vocab.Scores[:5], "types", vocab.Types[:5]) counter := map[int]int{} @@ -42,17 +42,17 @@ func NewSentencePiece(vocab *Vocabulary) SentencePiece { "user defined", counter[TOKEN_TYPE_USER_DEFINED], "unused", counter[TOKEN_TYPE_UNUSED], "byte", counter[TOKEN_TYPE_BYTE], "max token len", maxTokenLen) - return SentencePiece{ + return sentencePiece{ maxTokenLen: maxTokenLen, vocab: vocab, } } -func (spm SentencePiece) Is(id int32, special Special) bool { +func (spm sentencePiece) Is(id int32, special Special) bool { return spm.vocab.Is(id, special) } -func (spm SentencePiece) Encode(s string, addSpecial bool) ([]int32, error) { +func (spm sentencePiece) Encode(s string, addSpecial bool) ([]int32, error) { fragments := []fragment{{value: s}} for _, special := range spm.vocab.SpecialVocabulary() { id := spm.vocab.Encode(special) @@ -218,13 +218,13 @@ func (q *queue) Pop() interface{} { return item } -func (spm SentencePiece) Decode(ids []int32) (string, error) { +func (spm sentencePiece) Decode(ids []int32) (string, error) { var sb strings.Builder for _, id := range ids { data := spm.vocab.Decode(id) data = strings.ReplaceAll(data, spmWhitespaceSep, " ") - // For tokenizers that use byte tokens like "<0xEA>" + // For tokenizers that use byte tokens like "<0xEA>" // convert them to the partial unicode character // so they are buffered correctly by the runner instead // of being sent back to the api as "<0xEA>" diff --git a/model/sentencepiece_test.go b/tokenizer/sentencepiece_test.go similarity index 96% rename from model/sentencepiece_test.go rename to tokenizer/sentencepiece_test.go index 8f4570c17..92b7c41e7 100644 --- a/model/sentencepiece_test.go +++ b/tokenizer/sentencepiece_test.go @@ -1,4 +1,4 @@ -package model +package tokenizer import ( "log/slog" @@ -12,10 +12,10 @@ import ( "github.com/ollama/ollama/convert/sentencepiece" ) -func loadSentencePieceVocab(t *testing.T) SentencePiece { +func loadSentencePieceVocab(t *testing.T) sentencePiece { t.Helper() - bts, err := os.ReadFile(filepath.Join("testdata", "gemma2", "tokenizer.model")) + bts, err := os.ReadFile(filepath.FromSlash("testdata/gemma2/tokenizer.model")) if err != nil { t.Fatal(err) } diff --git a/model/testdata/gemma2/tokenizer.model b/tokenizer/testdata/gemma2/tokenizer.model similarity index 100% rename from model/testdata/gemma2/tokenizer.model rename to tokenizer/testdata/gemma2/tokenizer.model diff --git a/model/testdata/llama3.2/encoder.json b/tokenizer/testdata/llama3.2/encoder.json similarity index 100% rename from 
model/testdata/llama3.2/encoder.json rename to tokenizer/testdata/llama3.2/encoder.json diff --git a/model/testdata/llama3.2/vocab.bpe b/tokenizer/testdata/llama3.2/vocab.bpe similarity index 100% rename from model/testdata/llama3.2/vocab.bpe rename to tokenizer/testdata/llama3.2/vocab.bpe diff --git a/model/testdata/war-and-peace.txt b/tokenizer/testdata/war-and-peace.txt similarity index 100% rename from model/testdata/war-and-peace.txt rename to tokenizer/testdata/war-and-peace.txt diff --git a/model/textprocessor.go b/tokenizer/tokenizer.go similarity index 86% rename from model/textprocessor.go rename to tokenizer/tokenizer.go index 4a36f2352..64b5c4102 100644 --- a/model/textprocessor.go +++ b/tokenizer/tokenizer.go @@ -1,4 +1,4 @@ -package model +package tokenizer const ( TOKEN_TYPE_NORMAL = iota + 1 @@ -9,7 +9,7 @@ const ( TOKEN_TYPE_BYTE ) -type TextProcessor interface { +type Tokenizer interface { Encode(s string, addSpecial bool) ([]int32, error) Decode([]int32) (string, error) Is(int32, Special) bool diff --git a/model/vocabulary.go b/tokenizer/vocabulary.go similarity index 99% rename from model/vocabulary.go rename to tokenizer/vocabulary.go index d977c4957..f5d71ef69 100644 --- a/model/vocabulary.go +++ b/tokenizer/vocabulary.go @@ -1,4 +1,4 @@ -package model +package tokenizer import ( "log/slog" diff --git a/model/vocabulary_test.go b/tokenizer/vocabulary_test.go similarity index 99% rename from model/vocabulary_test.go rename to tokenizer/vocabulary_test.go index ccfc39e69..72f73203a 100644 --- a/model/vocabulary_test.go +++ b/tokenizer/vocabulary_test.go @@ -1,4 +1,4 @@ -package model +package tokenizer import ( "testing" diff --git a/model/wordpiece.go b/tokenizer/wordpiece.go similarity index 83% rename from model/wordpiece.go rename to tokenizer/wordpiece.go index e552bce0d..df8642136 100644 --- a/model/wordpiece.go +++ b/tokenizer/wordpiece.go @@ -1,4 +1,4 @@ -package model +package tokenizer import ( "fmt" @@ -9,7 +9,7 @@ import ( "github.com/ollama/ollama/logutil" ) -type WordPiece struct { +type wordPiece struct { vocab *Vocabulary lowercase bool } @@ -32,8 +32,8 @@ var wordPieceReplacer = strings.NewReplacer( " 're", "'re", ) -// Decode implements TextProcessor. -func (wpm WordPiece) Decode(ids []int32) (string, error) { +// Decode implements Tokenizer. +func (wpm wordPiece) Decode(ids []int32) (string, error) { var sb strings.Builder for i, id := range ids { if id < 0 || int(id) >= len(wpm.vocab.Values) { @@ -56,7 +56,7 @@ func (wpm WordPiece) Decode(ids []int32) (string, error) { // words splits a string into words, treating CJK characters as separate words. // TODO: this is specifically for BERT and may need to be adjusted or refactored for other models. -func (wpm WordPiece) words(s string) iter.Seq[string] { +func (wpm wordPiece) words(s string) iter.Seq[string] { return func(yield func(string) bool) { runes := make([]rune, 0, len(s)*3) for _, r := range s { @@ -96,8 +96,8 @@ func (wpm WordPiece) words(s string) iter.Seq[string] { } } -// Encode implements TextProcessor. -func (wpm WordPiece) Encode(s string, addSpecial bool) ([]int32, error) { +// Encode implements Tokenizer. +func (wpm wordPiece) Encode(s string, addSpecial bool) ([]int32, error) { var ids []int32 // TODO: use [UNK] from config @@ -151,20 +151,20 @@ func (wpm WordPiece) Encode(s string, addSpecial bool) ([]int32, error) { return ids, nil } -// Is implements TextProcessor. -func (wpm WordPiece) Is(id int32, special Special) bool { +// Is implements Tokenizer. 
+func (wpm wordPiece) Is(id int32, special Special) bool { return wpm.vocab.Is(id, special) } -// Vocabulary implements TextProcessor. -func (wpm WordPiece) Vocabulary() *Vocabulary { +// Vocabulary implements Tokenizer. +func (wpm wordPiece) Vocabulary() *Vocabulary { return wpm.vocab } -var _ TextProcessor = (*WordPiece)(nil) +var _ Tokenizer = (*wordPiece)(nil) -func NewWordPiece(vocab *Vocabulary, lowercase bool) WordPiece { - return WordPiece{ +func NewWordPiece(vocab *Vocabulary, lowercase bool) wordPiece { + return wordPiece{ vocab: vocab, lowercase: lowercase, } diff --git a/model/wordpiece_test.go b/tokenizer/wordpiece_test.go similarity index 97% rename from model/wordpiece_test.go rename to tokenizer/wordpiece_test.go index c03bb17a7..509216637 100644 --- a/model/wordpiece_test.go +++ b/tokenizer/wordpiece_test.go @@ -1,4 +1,4 @@ -package model +package tokenizer import ( "slices" @@ -39,7 +39,7 @@ func TestWordPiece(t *testing.T) { } func TestWordPieceWords(t *testing.T) { - var wpm WordPiece + var wpm wordPiece basic := slices.Collect(wpm.words("Hey friend! How are you?!?")) if diff := cmp.Diff([]string{"Hey", "friend", "!", "How", "are", "you", "?", "!", "?"}, basic); diff != "" { diff --git a/x/model/bytepairencoding_test.go b/x/model/bytepairencoding_test.go index 2a7041284..0e497920c 100644 --- a/x/model/bytepairencoding_test.go +++ b/x/model/bytepairencoding_test.go @@ -17,7 +17,7 @@ import ( func llama(t testing.TB) BytePairEncoding { t.Helper() - f, err := os.Open(filepath.Join("..", "..", "model", "testdata", "llama3.2", "encoder.json")) + f, err := os.Open(filepath.Join("..", "..", "tokenizer", "testdata", "llama3.2", "encoder.json")) if err != nil { t.Fatal(err) } @@ -43,7 +43,7 @@ func llama(t testing.TB) BytePairEncoding { } } - f, err = os.Open(filepath.Join("..", "..", "model", "testdata", "llama3.2", "vocab.bpe")) + f, err = os.Open(filepath.Join("..", "..", "tokenizer", "testdata", "llama3.2", "vocab.bpe")) if err != nil { t.Fatal(err) } diff --git a/x/model/models/gemma3/model.go b/x/model/models/gemma3/model.go index 23f78f207..66e0f5124 100644 --- a/x/model/models/gemma3/model.go +++ b/x/model/models/gemma3/model.go @@ -9,6 +9,7 @@ import ( "slices" "github.com/ollama/ollama/fs" + "github.com/ollama/ollama/tokenizer" "github.com/ollama/ollama/x/kvcache" "github.com/ollama/ollama/x/ml" "github.com/ollama/ollama/x/ml/nn" @@ -18,7 +19,7 @@ import ( type Model struct { model.Base - model.SentencePiece + tokenizer.Tokenizer *VisionModel `gguf:"vision_tower.vision_model"` *TextModel `gguf:"language_model.model"` @@ -58,8 +59,8 @@ func (p *MultiModalProjector) Forward(ctx ml.Context, visionOutputs ml.Tensor, i func New(c fs.Config) (model.Model, error) { // slog.Info("XXX Config", "c", c) m := Model{ - SentencePiece: model.NewSentencePiece( - &model.Vocabulary{ + Tokenizer: tokenizer.NewSentencePiece( + &tokenizer.Vocabulary{ Values: c.Strings("tokenizer.ggml.tokens"), Scores: c.Floats("tokenizer.ggml.scores"), Types: c.Ints("tokenizer.ggml.token_type"), diff --git a/x/model/sentencepiece_test.go b/x/model/sentencepiece_test.go index 7ab158af7..65c97cab2 100644 --- a/x/model/sentencepiece_test.go +++ b/x/model/sentencepiece_test.go @@ -15,7 +15,7 @@ import ( func loadSentencePieceVocab(t *testing.T) SentencePiece { t.Helper() - bts, err := os.ReadFile(filepath.Join("..", "..", "model", "testdata", "gemma2", "tokenizer.model")) + bts, err := os.ReadFile(filepath.Join("..", "..", "tokenizer", "testdata", "gemma2", "tokenizer.model")) if err != nil { t.Fatal(err) }
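---

Usage sketch (not part of the patch, for orientation only): after this change, callers obtain the tokenizer.Tokenizer interface from model.NewTextProcessor, which stays in the model package but now returns the type from the new package. This mirrors how llm/server.go and runner/ollamarunner consume the interface above; the GGUF path below is a placeholder, and the blank import follows model/ignore_test.go.

	package main

	import (
		"fmt"
		"log"

		"github.com/ollama/ollama/model"
		_ "github.com/ollama/ollama/model/models" // registers model architectures, as in model/ignore_test.go
		"github.com/ollama/ollama/tokenizer"
	)

	func main() {
		// Open a GGUF model in vocab-only fashion and get its tokenizer.
		tok, err := model.NewTextProcessor("/path/to/model.gguf") // placeholder path
		if err != nil {
			log.Fatal(err)
		}

		// addSpecial=true applies the vocabulary's special-token (BOS/EOS) settings.
		ids, err := tok.Encode("The capital of France is", true)
		if err != nil {
			log.Fatal(err)
		}

		// Special-token checks now go through the tokenizer package.
		for _, id := range ids {
			if tok.Is(id, tokenizer.SpecialEOS) {
				break
			}
		}

		text, err := tok.Decode(ids)
		if err != nil {
			log.Fatal(err)
		}
		fmt.Println(ids, text)
	}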