From 5d1021603a4c783fd6bcd6fd89ca90d6095f40b5 Mon Sep 17 00:00:00 2001 From: Parth Sareen Date: Mon, 20 Apr 2026 17:42:29 -0700 Subject: [PATCH] server: apply format when think=false for gemma4 (#15678) --- server/routes.go | 5 +- server/routes_generate_test.go | 126 +++++++++++++++++++++++++++++++++ 2 files changed, 130 insertions(+), 1 deletion(-) diff --git a/server/routes.go b/server/routes.go index 07e8a274e..515929516 100644 --- a/server/routes.go +++ b/server/routes.go @@ -2408,7 +2408,10 @@ func (s *Server) ChatHandler(c *gin.Context) { // current approach uses the transition from parsed thinking content to // parsed non-thinking content as the signal to turn constraining on - if req.Format != nil && structuredOutputsState == structuredOutputsState_None && ((builtinParser != nil || thinkingState != nil) && slices.Contains(m.Capabilities(), model.CapabilityThinking)) { + // TODO(parthsareen): temporary fix for https://github.com/ollama/ollama/issues/15260. + // To revisit for other models and have a consistent pattern across models through parsers. + forceImmediate := m.Config.Parser == "gemma4" && req.Think != nil && !req.Think.Bool() + if req.Format != nil && structuredOutputsState == structuredOutputsState_None && !forceImmediate && ((builtinParser != nil || thinkingState != nil) && slices.Contains(m.Capabilities(), model.CapabilityThinking)) { currentFormat = nil } diff --git a/server/routes_generate_test.go b/server/routes_generate_test.go index 458be7ddd..80442c238 100644 --- a/server/routes_generate_test.go +++ b/server/routes_generate_test.go @@ -2108,6 +2108,132 @@ func TestChatWithPromptEndingInThinkTag(t *testing.T) { }) } +// TestChatFormatWithThinkFalse verifies that when a model uses a builtin +// parser that supports thinking (e.g. gemma4) and the request explicitly +// disables thinking (think=false), the format constraint is passed to the +// first and only completion call. Previously, format was deferred for all +// thinking-capable parsers and only re-applied after an end-of-thinking +// transition — a transition that never fires when thinking is off. See +// https://github.com/ollama/ollama/issues/15260. +func TestChatFormatWithThinkFalse(t *testing.T) { + gin.SetMode(gin.TestMode) + + mock := &mockRunner{ + CompletionResponse: llm.CompletionResponse{ + Done: true, + DoneReason: llm.DoneReasonStop, + PromptEvalCount: 1, + PromptEvalDuration: 1, + EvalCount: 1, + EvalDuration: 1, + }, + } + + s := &Server{ + sched: &Scheduler{ + pendingReqCh: make(chan *LlmRequest, 1), + finishedReqCh: make(chan *LlmRequest, 1), + expiredCh: make(chan *runnerRef, 1), + unloadedCh: make(chan any, 1), + loaded: make(map[string]*runnerRef), + newServerFn: newMockServer(mock), + getGpuFn: getGpuFn, + getSystemInfoFn: getSystemInfoFn, + waitForRecovery: 250 * time.Millisecond, + loadFn: func(req *LlmRequest, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { + time.Sleep(time.Millisecond) + req.successCh <- &runnerRef{llama: mock} + return false + }, + }, + } + + go s.sched.Run(t.Context()) + + _, digest := createBinFile(t, ggml.KV{ + "general.architecture": "llama", + "llama.block_count": uint32(1), + "llama.context_length": uint32(8192), + "llama.embedding_length": uint32(4096), + "llama.attention.head_count": uint32(32), + "llama.attention.head_count_kv": uint32(8), + "tokenizer.ggml.tokens": []string{""}, + "tokenizer.ggml.scores": []float32{0}, + "tokenizer.ggml.token_type": []int32{0}, + }, []*ggml.Tensor{ + {Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, + {Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, + {Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, + {Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, + {Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, + {Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, + {Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, + {Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, + {Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, + {Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, + {Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, + }) + + // Use the gemma4 builtin parser — it reports HasThinkingSupport=true, which + // adds CapabilityThinking to the model and previously triggered deferral of + // the format even when the user passed think=false. + w := createRequest(t, s.CreateHandler, api.CreateRequest{ + Model: "test-gemma4-parser", + Files: map[string]string{"file.gguf": digest}, + Parser: "gemma4", + Template: `{{- range .Messages }}{{ .Role }}: {{ .Content }}{{ end }}`, + Stream: &stream, + }) + if w.Code != http.StatusOK { + t.Fatalf("create: expected status 200, got %d: %s", w.Code, w.Body.String()) + } + + format := json.RawMessage(`{"type":"object","properties":{"answer":{"type":"string"}},"required":["answer"]}`) + + var ( + requestsMu sync.Mutex + requests []llm.CompletionRequest + ) + mock.CompletionFn = func(ctx context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error { + requestsMu.Lock() + requests = append(requests, r) + requestsMu.Unlock() + + fn(llm.CompletionResponse{ + Content: `{"answer":"42"}`, + Done: true, + DoneReason: llm.DoneReasonStop, + PromptEvalCount: 1, + PromptEvalDuration: 1, + EvalCount: 1, + EvalDuration: 1, + }) + return nil + } + + streamRequest := false + think := false + w = createRequest(t, s.ChatHandler, api.ChatRequest{ + Model: "test-gemma4-parser", + Messages: []api.Message{{Role: "user", Content: "Respond in JSON."}}, + Think: &api.ThinkValue{Value: think}, + Stream: &streamRequest, + Format: format, + }) + + if w.Code != http.StatusOK { + t.Fatalf("chat: expected status 200, got %d: %s", w.Code, w.Body.String()) + } + + if len(requests) != 1 { + t.Fatalf("expected a single completion call, got %d", len(requests)) + } + + if !bytes.Equal([]byte(format), []byte(requests[0].Format)) { + t.Errorf("expected first completion format to match the request format, got %q", string(requests[0].Format)) + } +} + func TestGenerateUnload(t *testing.T) { gin.SetMode(gin.TestMode)