diff --git a/envconfig/config.go b/envconfig/config.go
index 238e5e6e1..a7466a84a 100644
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -201,7 +201,7 @@ var (
 	// Enable the new Ollama engine
 	NewEngine = Bool("OLLAMA_NEW_ENGINE")
 	// ContextLength sets the default context length
-	ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
+	ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 0)
 	// Auth enables authentication between the Ollama client and server
 	UseAuth = Bool("OLLAMA_AUTH")
 	// Enable Vulkan backend
@@ -290,7 +290,7 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_ORIGINS":         {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"},
 		"OLLAMA_SCHED_SPREAD":    {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
 		"OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
-		"OLLAMA_CONTEXT_LENGTH":  {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4096)"},
+		"OLLAMA_CONTEXT_LENGTH":  {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4k/32k/256k based on VRAM)"},
 		"OLLAMA_NEW_ENGINE":      {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
 		"OLLAMA_REMOTES":         {"OLLAMA_REMOTES", Remotes(), "Allowed hosts for remote models (default \"ollama.com\")"},
diff --git a/envconfig/config_test.go b/envconfig/config_test.go
index ddd86a114..4e7ae9eba 100644
--- a/envconfig/config_test.go
+++ b/envconfig/config_test.go
@@ -282,7 +282,7 @@ func TestVar(t *testing.T) {
 
 func TestContextLength(t *testing.T) {
 	cases := map[string]uint{
-		"":     4096,
+		"":     0,
 		"2048": 2048,
 	}
 
diff --git a/server/routes.go b/server/routes.go
index e28eb7798..910b8e954 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -75,16 +75,12 @@ func experimentEnabled(name string) bool {
 
 var useClient2 = experimentEnabled("client2")
 
-// Low VRAM mode is based on the sum of total VRAM (not free) and triggers
-// reduced context length on some models
-var lowVRAMThreshold uint64 = 20 * format.GibiByte
-
 var mode string = gin.DebugMode
 
 type Server struct {
-	addr    net.Addr
-	sched   *Scheduler
-	lowVRAM bool
+	addr          net.Addr
+	sched         *Scheduler
+	defaultNumCtx int
 }
 
 func init() {
@@ -107,8 +103,12 @@ var (
 	errBadTemplate = errors.New("template error")
 )
 
-func modelOptions(model *Model, requestOpts map[string]any) (api.Options, error) {
+func (s *Server) modelOptions(model *Model, requestOpts map[string]any) (api.Options, error) {
 	opts := api.DefaultOptions()
+	if opts.NumCtx == 0 {
+		opts.NumCtx = s.defaultNumCtx
+	}
+
 	if err := opts.FromMap(model.Options); err != nil {
 		return api.Options{}, err
 	}
@@ -140,20 +140,11 @@ func (s *Server) scheduleRunner(ctx context.Context, name string, caps []model.C
 		return nil, nil, nil, fmt.Errorf("%s %w", name, err)
 	}
 
-	opts, err := modelOptions(model, requestOpts)
+	opts, err := s.modelOptions(model, requestOpts)
 	if err != nil {
 		return nil, nil, nil, err
 	}
 
-	// This model is much more capable with a larger context, so set that
-	// unless it would penalize performance too much
-	if !s.lowVRAM && slices.Contains([]string{
-		"gptoss", "gpt-oss",
-		"qwen3vl", "qwen3vlmoe",
-	}, model.Config.ModelFamily) {
-		opts.NumCtx = max(opts.NumCtx, 8192)
-	}
-
 	runnerCh, errCh := s.sched.GetRunner(ctx, model, opts, keepAlive)
 	var runner *runnerRef
 	select {
@@ -1720,10 +1711,18 @@ func Serve(ln net.Listener) error {
 	for _, gpu := range gpus {
 		totalVRAM += gpu.TotalMemory - envconfig.GpuOverhead()
 	}
-	if totalVRAM < lowVRAMThreshold {
-		s.lowVRAM = true
-		slog.Info("entering low vram mode", "total vram", format.HumanBytes2(totalVRAM), "threshold", format.HumanBytes2(lowVRAMThreshold))
+
+	// Set default context based on VRAM tier
+	// Use slightly lower thresholds (47/23 GiB vs. 48/24 GiB) to account for small differences in the exact value
+	switch {
+	case totalVRAM >= 47*format.GibiByte:
+		s.defaultNumCtx = 262144
+	case totalVRAM >= 23*format.GibiByte:
+		s.defaultNumCtx = 32768
+	default:
+		s.defaultNumCtx = 4096
 	}
+	slog.Info("vram-based default context", "total_vram", format.HumanBytes2(totalVRAM), "default_num_ctx", s.defaultNumCtx)
 
 	err = srvr.Serve(ln)
 	// If server is closed from the signal handler, wait for the ctx to be done
diff --git a/server/routes_debug_test.go b/server/routes_debug_test.go
index 6f9104c39..a9d14b8cd 100644
--- a/server/routes_debug_test.go
+++ b/server/routes_debug_test.go
@@ -15,6 +15,7 @@ import (
 )
 
 func TestGenerateDebugRenderOnly(t *testing.T) {
+	t.Setenv("OLLAMA_CONTEXT_LENGTH", "4096")
 	gin.SetMode(gin.TestMode)
 
 	mock := mockRunner{
@@ -208,6 +209,7 @@ func TestGenerateDebugRenderOnly(t *testing.T) {
 }
 
 func TestChatDebugRenderOnly(t *testing.T) {
+	t.Setenv("OLLAMA_CONTEXT_LENGTH", "4096")
 	gin.SetMode(gin.TestMode)
 
 	mock := mockRunner{
diff --git a/server/routes_generate_renderer_test.go b/server/routes_generate_renderer_test.go
index e6473e087..d1e6bb56d 100644
--- a/server/routes_generate_renderer_test.go
+++ b/server/routes_generate_renderer_test.go
@@ -20,6 +20,7 @@ import (
 // TestGenerateWithBuiltinRenderer tests that api/generate uses built-in renderers
 // when in chat-like flow (messages present, no suffix, no template)
 func TestGenerateWithBuiltinRenderer(t *testing.T) {
+	t.Setenv("OLLAMA_CONTEXT_LENGTH", "4096")
 	gin.SetMode(gin.TestMode)
 
 	mock := mockRunner{
@@ -204,6 +205,7 @@ func TestGenerateWithBuiltinRenderer(t *testing.T) {
 
 // TestGenerateWithDebugRenderOnly tests that debug_render_only works with built-in renderers
 func TestGenerateWithDebugRenderOnly(t *testing.T) {
+	t.Setenv("OLLAMA_CONTEXT_LENGTH", "4096")
 	gin.SetMode(gin.TestMode)
 
 	mock := mockRunner{
diff --git a/server/routes_generate_test.go b/server/routes_generate_test.go
index 26d40e3c8..7e21d80aa 100644
--- a/server/routes_generate_test.go
+++ b/server/routes_generate_test.go
@@ -162,6 +162,7 @@ func TestGenerateChatRemote(t *testing.T) {
 }
 
 func TestGenerateChat(t *testing.T) {
+	t.Setenv("OLLAMA_CONTEXT_LENGTH", "4096")
 	gin.SetMode(gin.TestMode)
 
 	mock := mockRunner{
@@ -878,6 +879,7 @@ func TestGenerateChat(t *testing.T) {
 }
 
 func TestGenerate(t *testing.T) {
+	t.Setenv("OLLAMA_CONTEXT_LENGTH", "4096")
 	gin.SetMode(gin.TestMode)
 
 	mock := mockRunner{
@@ -2355,6 +2357,7 @@ func TestGenerateWithImages(t *testing.T) {
 // TestImageGenerateStreamFalse tests that image generation respects stream=false
 // and returns a single JSON response instead of streaming ndjson.
 func TestImageGenerateStreamFalse(t *testing.T) {
+	t.Setenv("OLLAMA_CONTEXT_LENGTH", "4096")
 	gin.SetMode(gin.TestMode)
 
 	p := t.TempDir()
diff --git a/server/routes_options_test.go b/server/routes_options_test.go
new file mode 100644
index 000000000..9634e7e1a
--- /dev/null
+++ b/server/routes_options_test.go
@@ -0,0 +1,127 @@
+package server
+
+import (
+	"testing"
+)
+
+func TestModelOptionsNumCtxPriority(t *testing.T) {
+	tests := []struct {
+		name           string
+		envContextLen  string // empty means not set (uses 0 sentinel)
+		defaultNumCtx  int    // VRAM-based default
+		modelNumCtx    int    // 0 means not set in model
+		requestNumCtx  int    // 0 means not set in request
+		expectedNumCtx int
+	}{
+		{
+			name:           "vram default when nothing else set",
+			envContextLen:  "",
+			defaultNumCtx:  32768,
+			modelNumCtx:    0,
+			requestNumCtx:  0,
+			expectedNumCtx: 32768,
+		},
+		{
+			name:           "env var overrides vram default",
+			envContextLen:  "8192",
+			defaultNumCtx:  32768,
+			modelNumCtx:    0,
+			requestNumCtx:  0,
+			expectedNumCtx: 8192,
+		},
+		{
+			name:           "model overrides vram default",
+			envContextLen:  "",
+			defaultNumCtx:  32768,
+			modelNumCtx:    16384,
+			requestNumCtx:  0,
+			expectedNumCtx: 16384,
+		},
+		{
+			name:           "model overrides env var",
+			envContextLen:  "8192",
+			defaultNumCtx:  32768,
+			modelNumCtx:    16384,
+			requestNumCtx:  0,
+			expectedNumCtx: 16384,
+		},
+		{
+			name:           "request overrides everything",
+			envContextLen:  "8192",
+			defaultNumCtx:  32768,
+			modelNumCtx:    16384,
+			requestNumCtx:  4096,
+			expectedNumCtx: 4096,
+		},
+		{
+			name:           "request overrides vram default",
+			envContextLen:  "",
+			defaultNumCtx:  32768,
+			modelNumCtx:    0,
+			requestNumCtx:  4096,
+			expectedNumCtx: 4096,
+		},
+		{
+			name:           "request overrides model",
+			envContextLen:  "",
+			defaultNumCtx:  32768,
+			modelNumCtx:    16384,
+			requestNumCtx:  4096,
+			expectedNumCtx: 4096,
+		},
+		{
+			name:           "low vram tier default",
+			envContextLen:  "",
+			defaultNumCtx:  4096,
+			modelNumCtx:    0,
+			requestNumCtx:  0,
+			expectedNumCtx: 4096,
+		},
+		{
+			name:           "high vram tier default",
+			envContextLen:  "",
+			defaultNumCtx:  262144,
+			modelNumCtx:    0,
+			requestNumCtx:  0,
+			expectedNumCtx: 262144,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			// Set or clear environment variable
+			if tt.envContextLen != "" {
+				t.Setenv("OLLAMA_CONTEXT_LENGTH", tt.envContextLen)
+			}
+
+			// Create server with VRAM-based default
+			s := &Server{
+				defaultNumCtx: tt.defaultNumCtx,
+			}
+
+			// Create model options (use float64 as FromMap expects JSON-style numbers)
+			var modelOpts map[string]any
+			if tt.modelNumCtx != 0 {
+				modelOpts = map[string]any{"num_ctx": float64(tt.modelNumCtx)}
+			}
+			model := &Model{
+				Options: modelOpts,
+			}
+
+			// Create request options (use float64 as FromMap expects JSON-style numbers)
+			var requestOpts map[string]any
+			if tt.requestNumCtx != 0 {
+				requestOpts = map[string]any{"num_ctx": float64(tt.requestNumCtx)}
+			}
+
+			opts, err := s.modelOptions(model, requestOpts)
+			if err != nil {
+				t.Fatalf("modelOptions failed: %v", err)
+			}
+
+			if opts.NumCtx != tt.expectedNumCtx {
+				t.Errorf("NumCtx = %d, want %d", opts.NumCtx, tt.expectedNumCtx)
+			}
+		})
+	}
+}