mirror of
https://github.com/ollama/ollama
synced 2026-04-23 08:45:14 +00:00
server: apply format when think=false for gemma4 (#15678)
This commit is contained in:
parent
8e05d734b9
commit
5d1021603a
|
|
@ -2408,7 +2408,10 @@ func (s *Server) ChatHandler(c *gin.Context) {
|
|||
// current approach uses the transition from parsed thinking content to
|
||||
// parsed non-thinking content as the signal to turn constraining on
|
||||
|
||||
if req.Format != nil && structuredOutputsState == structuredOutputsState_None && ((builtinParser != nil || thinkingState != nil) && slices.Contains(m.Capabilities(), model.CapabilityThinking)) {
|
||||
// TODO(parthsareen): temporary fix for https://github.com/ollama/ollama/issues/15260.
|
||||
// To revisit for other models and have a consistent pattern across models through parsers.
|
||||
forceImmediate := m.Config.Parser == "gemma4" && req.Think != nil && !req.Think.Bool()
|
||||
if req.Format != nil && structuredOutputsState == structuredOutputsState_None && !forceImmediate && ((builtinParser != nil || thinkingState != nil) && slices.Contains(m.Capabilities(), model.CapabilityThinking)) {
|
||||
currentFormat = nil
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -2108,6 +2108,132 @@ func TestChatWithPromptEndingInThinkTag(t *testing.T) {
|
|||
})
|
||||
}
|
||||
|
||||
// TestChatFormatWithThinkFalse verifies that when a model uses a builtin
|
||||
// parser that supports thinking (e.g. gemma4) and the request explicitly
|
||||
// disables thinking (think=false), the format constraint is passed to the
|
||||
// first and only completion call. Previously, format was deferred for all
|
||||
// thinking-capable parsers and only re-applied after an end-of-thinking
|
||||
// transition — a transition that never fires when thinking is off. See
|
||||
// https://github.com/ollama/ollama/issues/15260.
|
||||
func TestChatFormatWithThinkFalse(t *testing.T) {
|
||||
gin.SetMode(gin.TestMode)
|
||||
|
||||
mock := &mockRunner{
|
||||
CompletionResponse: llm.CompletionResponse{
|
||||
Done: true,
|
||||
DoneReason: llm.DoneReasonStop,
|
||||
PromptEvalCount: 1,
|
||||
PromptEvalDuration: 1,
|
||||
EvalCount: 1,
|
||||
EvalDuration: 1,
|
||||
},
|
||||
}
|
||||
|
||||
s := &Server{
|
||||
sched: &Scheduler{
|
||||
pendingReqCh: make(chan *LlmRequest, 1),
|
||||
finishedReqCh: make(chan *LlmRequest, 1),
|
||||
expiredCh: make(chan *runnerRef, 1),
|
||||
unloadedCh: make(chan any, 1),
|
||||
loaded: make(map[string]*runnerRef),
|
||||
newServerFn: newMockServer(mock),
|
||||
getGpuFn: getGpuFn,
|
||||
getSystemInfoFn: getSystemInfoFn,
|
||||
waitForRecovery: 250 * time.Millisecond,
|
||||
loadFn: func(req *LlmRequest, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
|
||||
time.Sleep(time.Millisecond)
|
||||
req.successCh <- &runnerRef{llama: mock}
|
||||
return false
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
go s.sched.Run(t.Context())
|
||||
|
||||
_, digest := createBinFile(t, ggml.KV{
|
||||
"general.architecture": "llama",
|
||||
"llama.block_count": uint32(1),
|
||||
"llama.context_length": uint32(8192),
|
||||
"llama.embedding_length": uint32(4096),
|
||||
"llama.attention.head_count": uint32(32),
|
||||
"llama.attention.head_count_kv": uint32(8),
|
||||
"tokenizer.ggml.tokens": []string{""},
|
||||
"tokenizer.ggml.scores": []float32{0},
|
||||
"tokenizer.ggml.token_type": []int32{0},
|
||||
}, []*ggml.Tensor{
|
||||
{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
|
||||
})
|
||||
|
||||
// Use the gemma4 builtin parser — it reports HasThinkingSupport=true, which
|
||||
// adds CapabilityThinking to the model and previously triggered deferral of
|
||||
// the format even when the user passed think=false.
|
||||
w := createRequest(t, s.CreateHandler, api.CreateRequest{
|
||||
Model: "test-gemma4-parser",
|
||||
Files: map[string]string{"file.gguf": digest},
|
||||
Parser: "gemma4",
|
||||
Template: `{{- range .Messages }}{{ .Role }}: {{ .Content }}{{ end }}`,
|
||||
Stream: &stream,
|
||||
})
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("create: expected status 200, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
|
||||
format := json.RawMessage(`{"type":"object","properties":{"answer":{"type":"string"}},"required":["answer"]}`)
|
||||
|
||||
var (
|
||||
requestsMu sync.Mutex
|
||||
requests []llm.CompletionRequest
|
||||
)
|
||||
mock.CompletionFn = func(ctx context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error {
|
||||
requestsMu.Lock()
|
||||
requests = append(requests, r)
|
||||
requestsMu.Unlock()
|
||||
|
||||
fn(llm.CompletionResponse{
|
||||
Content: `{"answer":"42"}`,
|
||||
Done: true,
|
||||
DoneReason: llm.DoneReasonStop,
|
||||
PromptEvalCount: 1,
|
||||
PromptEvalDuration: 1,
|
||||
EvalCount: 1,
|
||||
EvalDuration: 1,
|
||||
})
|
||||
return nil
|
||||
}
|
||||
|
||||
streamRequest := false
|
||||
think := false
|
||||
w = createRequest(t, s.ChatHandler, api.ChatRequest{
|
||||
Model: "test-gemma4-parser",
|
||||
Messages: []api.Message{{Role: "user", Content: "Respond in JSON."}},
|
||||
Think: &api.ThinkValue{Value: think},
|
||||
Stream: &streamRequest,
|
||||
Format: format,
|
||||
})
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("chat: expected status 200, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
|
||||
if len(requests) != 1 {
|
||||
t.Fatalf("expected a single completion call, got %d", len(requests))
|
||||
}
|
||||
|
||||
if !bytes.Equal([]byte(format), []byte(requests[0].Format)) {
|
||||
t.Errorf("expected first completion format to match the request format, got %q", string(requests[0].Format))
|
||||
}
|
||||
}
|
||||
|
||||
func TestGenerateUnload(t *testing.T) {
|
||||
gin.SetMode(gin.TestMode)
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue